From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/x86_64/README | 74 ++ gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm | 168 ++++ gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h | 225 +++++ gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm | 474 ++++++++++ gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm | 140 +++ gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm | 170 ++++ gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm | 53 ++ gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm | 172 ++++ gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm | 176 ++++ gmp-6.3.0/mpn/x86_64/aors_err1_n.asm | 225 +++++ gmp-6.3.0/mpn/x86_64/aors_err2_n.asm | 172 ++++ gmp-6.3.0/mpn/x86_64/aors_err3_n.asm | 156 ++++ gmp-6.3.0/mpn/x86_64/aors_n.asm | 178 ++++ gmp-6.3.0/mpn/x86_64/aorsmul_1.asm | 190 ++++ gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm | 186 ++++ gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm | 238 +++++ gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm | 191 ++++ gmp-6.3.0/mpn/x86_64/atom/aors_n.asm | 128 +++ gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm | 194 ++++ gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm | 38 + gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm | 38 + gmp-6.3.0/mpn/x86_64/atom/com.asm | 37 + gmp-6.3.0/mpn/x86_64/atom/copyd.asm | 37 + gmp-6.3.0/mpn/x86_64/atom/copyi.asm | 37 + gmp-6.3.0/mpn/x86_64/atom/dive_1.asm | 37 + gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h | 222 +++++ gmp-6.3.0/mpn/x86_64/atom/lshift.asm | 123 +++ gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm | 127 +++ gmp-6.3.0/mpn/x86_64/atom/mul_1.asm | 147 +++ gmp-6.3.0/mpn/x86_64/atom/mul_2.asm | 190 ++++ gmp-6.3.0/mpn/x86_64/atom/popcount.asm | 35 + gmp-6.3.0/mpn/x86_64/atom/redc_1.asm | 579 ++++++++++++ gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm | 287 ++++++ gmp-6.3.0/mpn/x86_64/atom/rshift.asm | 121 +++ gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm | 242 +++++ gmp-6.3.0/mpn/x86_64/bd1/README | 11 + gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm | 235 +++++ gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm | 38 + gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm | 190 ++++ gmp-6.3.0/mpn/x86_64/bd1/com.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/copyd.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/copyi.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h | 265 ++++++ gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm | 206 +++++ gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm | 193 ++++ gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm | 195 ++++ gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm | 416 +++++++++ gmp-6.3.0/mpn/x86_64/bd1/popcount.asm | 191 ++++ gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm | 37 + gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm | 96 ++ gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm | 142 +++ gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h | 263 ++++++ gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm | 38 + gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm | 96 ++ gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm | 37 + gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h | 266 ++++++ gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm | 106 +++ gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm | 195 ++++ gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm | 159 ++++ gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm | 191 ++++ gmp-6.3.0/mpn/x86_64/bt1/copyd.asm | 91 ++ gmp-6.3.0/mpn/x86_64/bt1/copyi.asm | 94 ++ gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm | 119 +++ gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm | 37 + gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h | 230 +++++ gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm | 241 +++++ gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm | 486 ++++++++++ 
gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm | 507 +++++++++++ gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm | 565 ++++++++++++ gmp-6.3.0/mpn/x86_64/bt2/com.asm | 37 + gmp-6.3.0/mpn/x86_64/bt2/copyd.asm | 37 + gmp-6.3.0/mpn/x86_64/bt2/copyi.asm | 37 + gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm | 37 + gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm | 37 + gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h | 240 +++++ gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm | 183 ++++ gmp-6.3.0/mpn/x86_64/com.asm | 95 ++ gmp-6.3.0/mpn/x86_64/copyd.asm | 93 ++ gmp-6.3.0/mpn/x86_64/copyi.asm | 92 ++ gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm | 53 ++ gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm | 53 ++ gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm | 38 + gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm | 225 +++++ gmp-6.3.0/mpn/x86_64/core2/aors_n.asm | 150 ++++ gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm | 188 ++++ gmp-6.3.0/mpn/x86_64/core2/com.asm | 37 + gmp-6.3.0/mpn/x86_64/core2/copyd.asm | 37 + gmp-6.3.0/mpn/x86_64/core2/copyi.asm | 37 + gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm | 243 +++++ gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm | 93 ++ gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm | 137 +++ gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h | 222 +++++ gmp-6.3.0/mpn/x86_64/core2/hamdist.asm | 210 +++++ gmp-6.3.0/mpn/x86_64/core2/logops_n.asm | 285 ++++++ gmp-6.3.0/mpn/x86_64/core2/lshift.asm | 145 +++ gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm | 159 ++++ gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm | 975 ++++++++++++++++++++ gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm | 427 +++++++++ gmp-6.3.0/mpn/x86_64/core2/popcount.asm | 185 ++++ gmp-6.3.0/mpn/x86_64/core2/redc_1.asm | 430 +++++++++ gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm | 169 ++++ gmp-6.3.0/mpn/x86_64/core2/rshift.asm | 143 +++ gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm | 37 + gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm | 984 +++++++++++++++++++++ gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm | 47 + gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm | 47 + gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm | 158 ++++ gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm | 210 +++++ gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h | 246 ++++++ gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm | 195 ++++ gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm | 368 ++++++++ gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm | 395 +++++++++ gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm | 710 +++++++++++++++ gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm | 839 ++++++++++++++++++ gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm | 241 +++++ gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm | 38 + gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm | 261 ++++++ gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm | 201 +++++ gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm | 138 +++ gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h | 253 ++++++ gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm | 159 ++++ gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm | 176 ++++ gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm | 441 +++++++++ gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm | 422 +++++++++ gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm | 437 +++++++++ gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm | 506 +++++++++++ gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm | 200 +++++ gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm | 190 ++++ gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h | 238 +++++ gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm | 196 ++++ gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm | 182 ++++ gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm | 549 ++++++++++++ gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm | 224 +++++ 
gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm | 54 ++ gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm | 56 ++ gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm | 173 ++++ gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm | 215 +++++ gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm | 203 +++++ gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm | 212 +++++ gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm | 174 ++++ gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm | 200 +++++ gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h | 241 +++++ gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm | 199 +++++ gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm | 167 ++++ gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm | 407 +++++++++ gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm | 384 ++++++++ gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm | 546 ++++++++++++ gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm | 193 ++++ gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm | 484 ++++++++++ gmp-6.3.0/mpn/x86_64/darwin.m4 | 82 ++ gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm | 247 ++++++ gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm | 158 ++++ gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm | 200 +++++ gmp-6.3.0/mpn/x86_64/dive_1.asm | 158 ++++ gmp-6.3.0/mpn/x86_64/divrem_1.asm | 314 +++++++ gmp-6.3.0/mpn/x86_64/divrem_2.asm | 192 ++++ gmp-6.3.0/mpn/x86_64/dos64.m4 | 101 +++ gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm | 181 ++++ gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm | 178 ++++ gmp-6.3.0/mpn/x86_64/fastsse/README | 22 + gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm | 311 +++++++ gmp-6.3.0/mpn/x86_64/fastsse/com.asm | 175 ++++ gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm | 254 ++++++ gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm | 166 ++++ gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm | 300 +++++++ gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm | 185 ++++ gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm | 182 ++++ gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm | 173 ++++ gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm | 193 ++++ gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm | 183 ++++ gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm | 201 +++++ gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm | 204 +++++ gmp-6.3.0/mpn/x86_64/fat/addmul_2.c | 38 + gmp-6.3.0/mpn/x86_64/fat/fat.c | 473 ++++++++++ gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm | 209 +++++ gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h | 72 ++ gmp-6.3.0/mpn/x86_64/fat/mod_1.c | 32 + gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c | 32 + gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c | 32 + gmp-6.3.0/mpn/x86_64/fat/redc_1.c | 32 + gmp-6.3.0/mpn/x86_64/fat/redc_2.c | 32 + gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c | 32 + gmp-6.3.0/mpn/x86_64/gcd_11.asm | 114 +++ gmp-6.3.0/mpn/x86_64/gcd_22.asm | 163 ++++ gmp-6.3.0/mpn/x86_64/gmp-mparam.h | 217 +++++ gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm | 37 + gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm | 37 + gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h | 264 ++++++ gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm | 37 + gmp-6.3.0/mpn/x86_64/invert_limb.asm | 112 +++ gmp-6.3.0/mpn/x86_64/invert_limb_table.asm | 50 ++ gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm | 37 + gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm | 142 +++ gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h | 248 ++++++ gmp-6.3.0/mpn/x86_64/k10/hamdist.asm | 109 +++ 
gmp-6.3.0/mpn/x86_64/k10/lshift.asm | 37 + gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm | 37 + gmp-6.3.0/mpn/x86_64/k10/popcount.asm | 138 +++ gmp-6.3.0/mpn/x86_64/k10/rshift.asm | 37 + gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm | 37 + gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm | 153 ++++ gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm | 195 ++++ gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm | 217 +++++ gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm | 179 ++++ gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm | 249 ++++++ gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h | 237 +++++ gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm | 469 ++++++++++ gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm | 436 +++++++++ gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm | 559 ++++++++++++ gmp-6.3.0/mpn/x86_64/k8/redc_1.asm | 591 +++++++++++++ gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm | 807 +++++++++++++++++ gmp-6.3.0/mpn/x86_64/logops_n.asm | 260 ++++++ gmp-6.3.0/mpn/x86_64/lshift.asm | 172 ++++ gmp-6.3.0/mpn/x86_64/lshiftc.asm | 182 ++++ gmp-6.3.0/mpn/x86_64/lshsub_n.asm | 172 ++++ gmp-6.3.0/mpn/x86_64/missing-call.m4 | 53 ++ gmp-6.3.0/mpn/x86_64/missing-inline.m4 | 100 +++ gmp-6.3.0/mpn/x86_64/missing.asm | 130 +++ gmp-6.3.0/mpn/x86_64/mod_1_1.asm | 238 +++++ gmp-6.3.0/mpn/x86_64/mod_1_2.asm | 241 +++++ gmp-6.3.0/mpn/x86_64/mod_1_4.asm | 272 ++++++ gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm | 215 +++++ gmp-6.3.0/mpn/x86_64/mode1o.asm | 171 ++++ gmp-6.3.0/mpn/x86_64/mul_1.asm | 192 ++++ gmp-6.3.0/mpn/x86_64/mul_2.asm | 204 +++++ gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm | 157 ++++ gmp-6.3.0/mpn/x86_64/nano/copyd.asm | 37 + gmp-6.3.0/mpn/x86_64/nano/copyi.asm | 37 + gmp-6.3.0/mpn/x86_64/nano/dive_1.asm | 166 ++++ gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm | 37 + gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h | 243 +++++ gmp-6.3.0/mpn/x86_64/nano/popcount.asm | 35 + gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm | 196 ++++ gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm | 50 ++ gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm | 50 ++ gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm | 203 +++++ gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h | 257 ++++++ gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm | 167 ++++ gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm | 35 + gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm | 334 +++++++ gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm | 169 ++++ gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/popham.asm | 163 ++++ gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm | 189 ++++ gmp-6.3.0/mpn/x86_64/rshift.asm | 176 ++++ gmp-6.3.0/mpn/x86_64/sec_tabselect.asm | 176 ++++ gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm | 50 ++ gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm | 50 ++ gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h | 252 ++++++ gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm | 38 + gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm | 37 + 
gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm | 38 + gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h | 246 ++++++ gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm | 116 +++ gmp-6.3.0/mpn/x86_64/sublsh1_n.asm | 160 ++++ gmp-6.3.0/mpn/x86_64/x86_64-defs.m4 | 493 +++++++++++ gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm | 227 +++++ gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm | 165 ++++ gmp-6.3.0/mpn/x86_64/zen/com.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/copyd.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/copyi.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h | 280 ++++++ gmp-6.3.0/mpn/x86_64/zen/hamdist.asm | 38 + gmp-6.3.0/mpn/x86_64/zen/lshift.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/mul_1.asm | 161 ++++ gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm | 455 ++++++++++ gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm | 299 +++++++ gmp-6.3.0/mpn/x86_64/zen/popcount.asm | 38 + gmp-6.3.0/mpn/x86_64/zen/rshift.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm | 507 +++++++++++ gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm | 482 ++++++++++ gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm | 37 + gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h | 276 ++++++ gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h | 222 +++++ gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm | 208 +++++ gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm | 37 + gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm | 37 + 314 files changed, 53887 insertions(+) create mode 100644 gmp-6.3.0/mpn/x86_64/README create mode 100644 gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aors_err1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aors_err2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aors_err3_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/atom/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/popcount.asm create mode 100644 
gmp-6.3.0/mpn/x86_64/atom/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/README create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt2/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt2/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt2/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm create mode 100644 
gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/core2/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/logops_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm create mode 100644 
gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/darwin.m4 create mode 100644 gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/divrem_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/dos64.m4 create mode 100644 gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/README create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fat/addmul_2.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/fat.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/fat/mod_1.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/redc_1.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/redc_2.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c create mode 100644 gmp-6.3.0/mpn/x86_64/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/x86_64/invert_limb_table.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/k10/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/rshift.asm 
create mode 100644 gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/logops_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/lshsub_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/missing-call.m4 create mode 100644 gmp-6.3.0/mpn/x86_64/missing-inline.m4 create mode 100644 gmp-6.3.0/mpn/x86_64/missing.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mod_1_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mod_1_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mod_1_4.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mode1o.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/nano/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/nano/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/nano/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/nano/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/popham.asm create mode 100644 gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm create mode 100644 
gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/x86_64-defs.m4 create mode 100644 gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/zen/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm (limited to 'gmp-6.3.0/mpn/x86_64') diff --git a/gmp-6.3.0/mpn/x86_64/README b/gmp-6.3.0/mpn/x86_64/README new file mode 100644 index 0000000..9c8a586 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/README @@ -0,0 +1,74 @@ +Copyright 2003, 2004, 2006, 2008 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+			 AMD64 MPN SUBROUTINES
+
+
+This directory contains mpn functions for AMD64 chips. It is also useful
+for 64-bit Pentiums, and "Core 2".
+
+
+		  RELEVANT OPTIMIZATION ISSUES
+
+The Opteron and Athlon64 can sustain up to 3 instructions per cycle, but in
+practice that is only possible for integer instructions. But almost any
+three integer instructions can issue simultaneously, including any 3 ALU
+operations, including shifts. Up to two memory operations can issue each
+cycle.
+
+Scheduling typically requires that load-use instructions are split into
+separate load and use instructions. That requires more decode resources,
+and it is rarely a win. Opteron/Athlon64 have a deep out-of-order core.
+
+
+Optimizing for 64-bit Pentium4 is probably a waste of time, as the most
+critical instructions are very poorly implemented here. Perhaps we could
+save a cycle or two, but the most common loops now run at between 10 and 22
+cycles, so a saved cycle isn't too exciting.
+
+
+The new spin of the venerable P6 core, the "Core 2", is much better than the
+Pentium4 for the GMP loops. Its integer pipeline is somewhat similar to
+the Opteron/Athlon64 pipeline, except that the GMP favourites ADC/SBB and
+MUL are slower. Furthermore, an INC/DEC followed by ADC/SBB incurs a
+pipeline stall of around 10 cycles. The default mpn_add_n and mpn_sub_n
+code suffers badly from the stall. The code in the core2 subdirectory uses
+the almost forgotten instruction JRCXZ for loop control, and updates the
+induction variable using LEA.
+
+
+
+REFERENCES
+
+"System V Application Binary Interface AMD64 Architecture Processor
+Supplement", draft version 0.99, December 2007.
+http://www.x86-64.org/documentation/abi.pdf
diff --git a/gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm
new file mode 100644
index 0000000..d105da6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm
@@ -0,0 +1,168 @@
+dnl AMD64 mpn_addmul_1 for CPUs with mulx and adx.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013, 2022 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	      cycles/limb
+C AMD K8,K9	 -
+C AMD K10	 -
+C AMD bd1	 -
+C AMD bd2	 -
+C AMD bd3	 -
+C AMD bd4	 -
+C AMD zn1	 ?
+C AMD zn2	 ?
+C AMD zn3	 ?
+C AMD bt1 - +C AMD bt2 - +C Intel P4 - +C Intel CNR - +C Intel PNR - +C Intel NHM - +C Intel WSM - +C Intel SBR - +C Intel IBR - +C Intel HWL - +C Intel BWL ? +C Intel SKL ? +C Intel RKL ? +C Intel ALD 1.29 +C Intel atom - +C Intel SLM - +C Intel GLM - +C VIA nano - + +define(`rp', `%rdi') dnl rcx +define(`up', `%rsi') dnl rdx +define(`n_param', `%rdx') dnl r8 +define(`v0_param',`%rcx') dnl r9 + +define(`n', `%rcx') dnl +define(`v0', `%rdx') dnl + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_addmul_1) + mov (up), %r8 + + push %rbx + push %r12 + push %r13 + + mov %rdx, %rax + mov %rcx, v0 + mov %rax, n + + and $3, R8(%rax) + jz L(b0) + cmp $2, R8(%rax) + jl L(b1) + jz L(b2) + +L(b3): mulx( %r8, %r11, %r10) + mulx( 8,(up), %r13, %r12) + mulx( 16,(up), %rbx, %rax) + inc n + lea -8(up), up + lea -24(rp), rp + jmp L(lo3) + +L(b0): mulx( %r8, %r9, %r8) + mulx( 8,(up), %r11, %r10) + mulx( 16,(up), %r13, %r12) + lea -16(rp), rp + jmp L(lo0) + +L(b2): mulx( %r8, %r13, %r12) + mulx( 8,(up), %rbx, %rax) + lea -2(n), n + jrcxz L(n2) + mulx( 16,(up), %r9, %r8) + lea 16(up), up + jmp L(lo2) +L(n2): jmp L(wd2) + +L(b1): mulx( %r8, %rbx, %rax) + sub $1, n + jrcxz L(n1) + mulx( 8,(up), %r9, %r8) + mulx( 16,(up), %r11, %r10) + lea 8(up), up + lea -8(rp), rp + jmp L(lo1) +L(n1): add (rp), %rbx + adc %rcx, %rax + mov %rbx, (rp) + pop %r13 + pop %r12 + pop %rbx + ret + +L(top): mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) +L(lo2): adox( (rp), %r13) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) +L(lo1): adox( 8,(rp), %rbx) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) +L(lo0): adox( 16,(rp), %r9) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) +L(lo3): adox( 24,(rp), %r11) + lea 32(up), up + lea 32(rp), rp + lea -4(n), n + jrcxz L(end) + jmp L(top) + +L(end): adcx( %r10, %r13) + mov %r11, -8(rp) +L(wd2): adox( (rp), %r13) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + adcx( %rcx, %rax) + adox( %rcx, %rax) + mov %rbx, 8(rp) + pop %r13 + pop %r12 + pop %rbx + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h new file mode 100644 index 0000000..0bffc3d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h @@ -0,0 +1,225 @@ +/* Intel Alder Lake gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3700-4900 MHz Alder Lake */ +/* FFT tuning limit = 10,000,000 */ +/* Generated by tuneup.c, 2022-03-15, gcc 11.2 */ + +#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 23 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 34 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 30 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 23 + +#define DIV_1_VS_MUL_1_PERCENT 559 + +#define MUL_TOOM22_THRESHOLD 13 +#define MUL_TOOM33_THRESHOLD 97 +#define MUL_TOOM44_THRESHOLD 148 +#define MUL_TOOM6H_THRESHOLD 562 +#define MUL_TOOM8H_THRESHOLD 608 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 259 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 98 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 98 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 144 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 86 +#define SQR_TOOM4_THRESHOLD 582 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 753 + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define MUL_FFT_MODF_THRESHOLD 384 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 384, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 24, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 135,11}, { 79, 9}, { 319, 8}, \ + { 639, 9}, { 335, 8}, { 671,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543, 8}, { 1087, 9}, { 575,10}, \ + { 303, 9}, { 607,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351,12}, { 95,11}, \ + { 191,10}, { 383,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 575,11}, { 303,10}, \ + { 607, 9}, { 1215,11}, { 319,10}, { 671,11}, \ + { 351,10}, { 703,11}, { 367,10}, { 735, 9}, \ + { 1471, 8}, { 2943,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,10}, { 959,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087, 9}, { 2175,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,11}, \ + { 703,10}, { 1407,11}, { 735,10}, { 1471, 9}, \ + { 2943, 8}, { 5887,12}, { 383,11}, { 767,10}, \ + { 1535,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447,11}, { 895,10}, { 1791,12}, { 479,11}, \ + { 959,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \ + { 703,11}, { 1407,12}, { 735,11}, { 1471,10}, \ + { 2943,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,13}, { 447,12}, { 959,11}, { 1919,13}, \ + { 511,12}, { 1087,13}, { 
575,12}, { 1215,13}, \ + { 639,12}, { 1343,13}, { 703,12}, { 1471,11}, \ + { 2943,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,12}, { 1919,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 190 +#define MUL_FFT_THRESHOLD 2496 + +#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 344, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,11}, { 79, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303, 9}, \ + { 607,10}, { 319, 9}, { 639,12}, { 95,11}, \ + { 191,10}, { 383,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,10}, { 607,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,10}, { 959,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,12}, { 287,11}, { 575,10}, \ + { 1151,11}, { 607,10}, { 1215,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,11}, \ + { 703,10}, { 1407,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,12}, { 447,11}, \ + { 895,12}, { 479,11}, { 959,10}, { 1919,14}, \ + { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \ + { 607,11}, { 1215,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 671,11}, { 1343,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 831,13}, { 447,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1087,13}, \ + { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 166 +#define SQR_FFT_THRESHOLD 2240 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 56 +#define MULLO_MUL_N_THRESHOLD 4940 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 73 +#define SQRLO_SQR_THRESHOLD 4392 + +#define DC_DIV_QR_THRESHOLD 19 +#define DC_DIVAPPR_Q_THRESHOLD 139 +#define DC_BDIV_QR_THRESHOLD 62 +#define DC_BDIV_Q_THRESHOLD 126 + +#define INV_MULMOD_BNM1_THRESHOLD 24 +#define INV_NEWTON_THRESHOLD 108 +#define INV_APPR_THRESHOLD 108 + +#define BINV_NEWTON_THRESHOLD 208 +#define REDC_1_TO_REDC_2_THRESHOLD 36 +#define REDC_2_TO_REDC_N_THRESHOLD 53 + +#define MU_DIV_QR_THRESHOLD 855 +#define MU_DIVAPPR_Q_THRESHOLD 1120 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 807 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 1,11,70,702,2499 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 +#define SET_STR_DC_THRESHOLD 2150 +#define SET_STR_PRECOMPUTE_THRESHOLD 2943 + +#define 
FAC_DSC_THRESHOLD 298 +#define FAC_ODD_THRESHOLD 51 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 1 /* 2.38% faster than 3 */ +#define HGCD_THRESHOLD 142 +#define HGCD_APPR_THRESHOLD 159 +#define HGCD_REDUCE_THRESHOLD 2384 +#define GCD_DC_THRESHOLD 483 +#define GCDEXT_DC_THRESHOLD 492 +#define JACOBI_BASE_METHOD 1 /* 0.94% faster than 3 */ diff --git a/gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm new file mode 100644 index 0000000..9400fe5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm @@ -0,0 +1,474 @@ +dnl AMD64 mpn_mul_basecase. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2022 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 - +C AMD zn1 ? +C AMD zn2 ? +C AMD zn3 ? +C AMD bt1 - +C AMD bt2 - +C Intel P4 - +C Intel CNR - +C Intel PNR - +C Intel NHM - +C Intel WSM - +C Intel SBR - +C Intel IBR - +C Intel HWL - +C Intel BWL ? +C Intel SKL ? +C Intel RKL ? +C Intel ALD 1.29 +C Intel atom - +C Intel SLM - +C Intel GLM - +C VIA nano - + +C TODO +C * Do overlapped software pipelining. +C * Try shallower pipeline, which would result in using fewer registers. +C * There are false dependencies on CF/OF between iterations. Try breaking +C them to see if it helps. 
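The TODO note above about false CF/OF dependencies refers to the two flag-based
carry chains that the basecase loops in this file are built on. A minimal
sketch of one limb-step of that pattern is shown here in plain AT&T syntax
rather than the m4 wrappers used in the patch, with an arbitrary register
assignment and not taken from the GMP sources: mulx forms the 64x64->128-bit
product without writing any flags, adcx propagates the multiply carry through
CF only, and adox folds the rp[] word in through OF only. Both CF and OF are
still live when the loop branches back, so each iteration inherits flag state
from the previous one; that is the cross-iteration dependency the TODO refers to.

	# rp[0] += up[0] * v0, with v0 preloaded in %rdx as mulx requires
	mulx	(%rsi), %r9, %r8	# %r8:%r9 = up[0]*v0; mulx writes no flags
	adcx	%r10, %r9		# add the previous step's carry limb, using CF only
	adox	(%rdi), %r9		# add rp[0], using the independent OF chain
	mov	%r9, (%rdi)		# store the result limb
	mov	%r8, %r10		# high half becomes the next step's carry limb

In the loops below this step is unrolled four ways, and at the end of each row
both chains are folded into the final carry limb by adding the zeroed count
register through adcx and adox.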
+ +define(`rp', `%rdi') dnl rcx +define(`up', `%rsi') dnl rdx +define(`un_arg',`%rdx') dnl r8 +define(`vp_arg',`%rcx') dnl r9 +define(`vn_arg',`%r8') dnl stack + +define(`un', `%r14') +define(`vp', `%r15') +define(`vn', `%rbp') + +define(`n', `%rcx') +define(`v0', `%rdx') + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + cmp $2, un_arg + ja L(gen) + mov (vp_arg), %rdx + mulx( (up), %rax, %r9) + mov %rax, (rp) + je L(s2x) + + mov %r9, 8(rp) + ret + +L(s2x): mulx( 8,(up), %rax, %r10) + add %r9, %rax + adc $0, %r10 + cmp $2, R32(vn_arg) + je L(s22) + +L(s21): mov %rax, 8(rp) + mov %r10, 16(rp) + ret + +L(s22): mov 8(vp_arg), %rdx + mulx( (up), %r8, %r9) + add %r8, %rax + adc %r10, %r9 + mov %rax, 8(rp) + mulx( 8,(up), %rax, %r10) + adc $0, %r10 + adc %r9, %rax + mov %rax, 16(rp) + adc $0, %r10 + mov %r10, 24(rp) + ret + +L(gen): push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov un_arg, un + neg un + shl $3, un + mov vp_arg, vp + mov vn_arg, vn + + test $1, R8(un_arg) + mov (vp), %rdx + jz L(bx0) + +L(bx1): test $16, R8(un) + jnz L(b01) + +L(b11): lea 24(un), n + mulx( (up), %r11, %r10) + mulx( 8,(up), %r13, %r12) + mulx( 16,(up), %rbx, %rax) + lea 8(rp), rp + lea 24(up), up + jrcxz L(med3) +L(mtp3):mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(med3) + jmp L(mtp3) +L(med3):adcx( %r10, %r13) + mov %r11, -8(rp) + adcx( %r12, %rbx) + mov %r13, (rp) + adcx( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jz L(ret) +L(out3):lea 32(rp,un), rp + lea 24(up,un), up + lea 8(vp), vp + xor R32(%rdx), R32(%rdx) + mov (vp), %rdx + mulx( -24,(up), %r11, %r10) + mulx( -16,(up), %r13, %r12) + mulx( -8,(up), %rbx, %rax) + lea 24(un), n + adox( -8,(rp), %r11) + jrcxz L(ed3) +L(tp3): mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + adox( 16,(rp), %r9) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + adox( 24,(rp), %r11) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(ed3) + jmp L(tp3) +L(ed3): adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + adcx( %rcx, %rax) + adox( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jnz L(out3) + jmp L(ret) + + +L(b01): mulx( (up), %rbx, %rax) + lea 8(un), n + mulx( 8,(up), %r9, %r8) + mulx( 16,(up), %r11, %r10) + lea 8(up), up + lea -8(rp), rp + jmp L(ml1) +L(mtp1):mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) +L(ml1): mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(med1) + jmp L(mtp1) +L(med1):adcx( %r10, %r13) + mov %r11, -8(rp) + adcx( %r12, %rbx) + mov %r13, (rp) + adcx( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jz L(ret) +L(out1):lea 16(rp,un), rp + lea 8(up,un), up + lea 8(vp), vp + xor R32(%rdx), R32(%rdx) + mov (vp), %rdx + lea 8(un), n + mulx( -8,(up), %rbx, %rax) + mulx( (up), %r9, %r8) + mulx( 8,(up), %r11, %r10) + jmp 
L(lo1) +L(tp1): mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) +L(lo1): adox( 8,(rp), %rbx) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + adox( 16,(rp), %r9) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + adox( 24,(rp), %r11) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(ed1) + jmp L(tp1) +L(ed1): adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + adcx( %rcx, %rax) + adox( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jnz L(out1) + jmp L(ret) + + +L(bx0): test $16, R8(un) + jz L(b00) + +L(b10): mulx( (up), %r13, %r12) + mulx( 8,(up), %rbx, %rax) + lea 16(un), n + mulx( 16,(up), %r9, %r8) + lea 16(up), up + jmp L(ml2) +L(mtp2):mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) +L(ml2): mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(med2) + jmp L(mtp2) +L(med2):adcx( %r10, %r13) + mov %r11, -8(rp) + adcx( %r12, %rbx) + mov %r13, (rp) + adcx( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jz L(ret) +L(out2):lea 24(rp,un), rp + lea 16(up,un), up + lea 8(vp), vp + xor R32(%rdx), R32(%rdx) + mov (vp), %rdx + mulx( -16,(up), %r13, %r12) + mulx( -8,(up), %rbx, %rax) + lea 16(un), n + mulx( (up), %r9, %r8) + jmp L(lo2) +L(tp2): mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) +L(lo2): adox( (rp), %r13) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + adox( 16,(rp), %r9) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + adox( 24,(rp), %r11) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(ed2) + jmp L(tp2) +L(ed2): adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + adcx( %rcx, %rax) + adox( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jnz L(out2) + jmp L(ret) + + +L(b00): lea 32(un), n + mulx( (up), %r9, %r8) + mulx( 8,(up), %r11, %r10) + mulx( 16,(up), %r13, %r12) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, (rp) + lea 32(up), up + lea 16(rp), rp + jrcxz L(med0) +L(mtp0):mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(med0) + jmp L(mtp0) +L(med0):adcx( %r10, %r13) + mov %r11, -8(rp) + adcx( %r12, %rbx) + mov %r13, (rp) + adcx( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jz L(ret) +L(out0):lea 40(rp,un), rp + lea 32(up,un), up + lea 8(vp), vp + xor R32(%rdx), R32(%rdx) + mov (vp), %rdx + lea 32(un), n + mulx( -32,(up), %r9, %r8) + mulx( -24,(up), %r11, %r10) + mulx( -16,(up), %r13, %r12) + adox( -16,(rp), %r9) + mulx( -8,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, -16(rp) + adox( -8,(rp), %r11) + jrcxz L(ed0) +L(tp0): mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + mulx( 16,(up), %r13, %r12) + adcx( 
%rax, %r9) + mov %rbx, 8(rp) + adox( 16,(rp), %r9) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + adox( 24,(rp), %r11) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(ed0) + jmp L(tp0) +L(ed0): adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + adcx( %rcx, %rax) + adox( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jnz L(out0) + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm b/gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm new file mode 100644 index 0000000..d7d6b0d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm @@ -0,0 +1,140 @@ +dnl AMD64 mpn_submul_1 for CPUs with mulx and adx. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2022 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 - +C AMD zn1 ? +C AMD zn2 ? +C AMD zn3 2.0 +C AMD bt1 - +C AMD bt2 - +C Intel P4 - +C Intel CNR - +C Intel PNR - +C Intel NHM - +C Intel WSM - +C Intel SBR - +C Intel IBR - +C Intel HWL - +C Intel BWL ? +C Intel SKL ? 
+C Intel RKL 2.0 +C Intel ALD 1.53 +C Intel atom - +C Intel SLM - +C Intel GLM - +C VIA nano - + +define(`rp', `%rdi') dnl rcx +define(`up', `%rsi') dnl rdx +define(`n_param', `%rdx') dnl r8 +define(`v0_param',`%rcx') dnl r9 + +define(`n', `%rcx') dnl +define(`v0', `%rdx') dnl + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_submul_1) + mov n_param, %rax + mov v0_param, v0 + mov %rax, n + test $1, R8(n) + jz L(bx0) + +L(bx1): mulx( (up), %r9, %rax) + test $2, R8(n) + stc + jz L(b01) + +L(b11): lea 1(n), n + lea 16(up), up + lea 16(rp), rp + jmp L(lo3) + +L(b01): lea 3(n), n + jmp L(lo1) + +L(bx0): mulx( (up), %r9, %r8) + test $2, R8(n) + stc + jz L(b00) + +L(b10): lea 8(up), up + lea 8(rp), rp + lea 2(n), n + jmp L(lo2) + +L(b00): lea 24(up), up + lea 24(rp), rp + jmp L(lo0) + +L(top): lea 32(up), up + lea 32(rp), rp + mulx( -24,(up), %r9, %r8) + adox( %rax, %r9) +L(lo0): not %r9 + adcx( -24,(rp), %r9) + mov %r9, -24(rp) + mulx( -16,(up), %r9, %rax) + adox( %r8, %r9) +L(lo3): not %r9 + adcx( -16,(rp), %r9) + mov %r9, -16(rp) + mulx( -8,(up), %r9, %r8) + adox( %rax, %r9) +L(lo2): not %r9 + adcx( -8,(rp), %r9) + mov %r9, -8(rp) + mulx( (up), %r9, %rax) + adox( %r8, %r9) +L(lo1): not %r9 + adcx( (rp), %r9) + mov %r9, (rp) + lea -4(n), n + jrcxz L(end) + jmp L(top) + +L(end): adox( %rcx, %rax) + sbb $-1, %rax + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm new file mode 100644 index 0000000..6ee0872 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm @@ -0,0 +1,170 @@ +dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) +dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[] + +dnl Copyright 2003, 2005-2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 2 +C AMD K10 2 +C AMD bd1 ? +C AMD bobcat ? +C Intel P4 13 +C Intel core2 3.45 +C Intel NHM ? +C Intel SBR ? +C Intel atom ? +C VIA nano ? + + +C Sometimes speed degenerates, supposedly related to that some operand +C alignments cause cache conflicts. + +C The speed is limited by decoding/issue bandwidth. There are 22 instructions +C in the loop, which corresponds to ceil(22/3)/4 = 1.83 c/l. 
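For readers tracing the scy/acy bookkeeping below, the operation itself is easier to see in portable C. The following is an illustrative reference only, not GMP's code: it assumes 64-bit limbs without nails, the types are the usual gmp.h ones, and the function name is made up. mpn_addlsh1_n returns a carry in {0,1,2}; for mpn_rsblsh1_n the additions become subtractions and the return value is high minus borrow, read as a signed word.

#include <gmp.h>

/* Illustrative reference for mpn_addlsh1_n: {rp,n} = {up,n} + ({vp,n} << 1),
   returning the carry-out (0, 1 or 2).  64-bit limbs, no nails assumed. */
mp_limb_t
ref_addlsh1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  mp_limb_t shift_cy = 0, add_cy = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      mp_limb_t s = (vp[i] << 1) | shift_cy;  /* doubled vp limb */
      shift_cy = vp[i] >> 63;                 /* bit shifted out, used next limb */
      mp_limb_t r = up[i] + s;
      mp_limb_t c = r < s;                    /* carry from the main add */
      r += add_cy;
      c += r < add_cy;                        /* carry from the saved add carry */
      rp[i] = r;
      add_cy = c;
    }
  return shift_cy + add_cy;                   /* what the asm rebuilds from scy/acy */
}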
+ +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n', `%rcx') + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh1_n)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh1_n)') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbp + + mov (vp), %r8 + mov R32(n), R32(%rax) + lea (rp,n,8), rp + lea (up,n,8), up + lea (vp,n,8), vp + neg n + xor R32(%rbp), R32(%rbp) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): add %r8, %r8 + mov 8(vp,n,8), %r9 + adc %r9, %r9 + mov 16(vp,n,8), %r10 + adc %r10, %r10 + sbb R32(%rax), R32(%rax) C save scy + ADDSUB (up,n,8), %r8 + ADCSBB 8(up,n,8), %r9 + mov %r8, (rp,n,8) + mov %r9, 8(rp,n,8) + ADCSBB 16(up,n,8), %r10 + mov %r10, 16(rp,n,8) + sbb R32(%rbp), R32(%rbp) C save acy + add $3, n + jmp L(ent) + +L(b10): add %r8, %r8 + mov 8(vp,n,8), %r9 + adc %r9, %r9 + sbb R32(%rax), R32(%rax) C save scy + ADDSUB (up,n,8), %r8 + ADCSBB 8(up,n,8), %r9 + mov %r8, (rp,n,8) + mov %r9, 8(rp,n,8) + sbb R32(%rbp), R32(%rbp) C save acy + add $2, n + jmp L(ent) + +L(b01): add %r8, %r8 + sbb R32(%rax), R32(%rax) C save scy + ADDSUB (up,n,8), %r8 + mov %r8, (rp,n,8) + sbb R32(%rbp), R32(%rbp) C save acy + inc n +L(ent): jns L(end) + + ALIGN(16) +L(top): add R32(%rax), R32(%rax) C restore scy + + mov (vp,n,8), %r8 +L(b00): adc %r8, %r8 + mov 8(vp,n,8), %r9 + adc %r9, %r9 + mov 16(vp,n,8), %r10 + adc %r10, %r10 + mov 24(vp,n,8), %r11 + adc %r11, %r11 + + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + + ADCSBB (up,n,8), %r8 + nop C Hammer speedup! + ADCSBB 8(up,n,8), %r9 + mov %r8, (rp,n,8) + mov %r9, 8(rp,n,8) + ADCSBB 16(up,n,8), %r10 + ADCSBB 24(up,n,8), %r11 + mov %r10, 16(rp,n,8) + mov %r11, 24(rp,n,8) + + sbb R32(%rbp), R32(%rbp) C save acy + add $4, n + js L(top) + +L(end): +ifdef(`OPERATION_addlsh1_n',` + add R32(%rbp), R32(%rax) + neg R32(%rax)') +ifdef(`OPERATION_rsblsh1_n',` + sub R32(%rax), R32(%rbp) + movslq R32(%rbp), %rax') + + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm new file mode 100644 index 0000000..999e972 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm @@ -0,0 +1,53 @@ +dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2) +dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009-2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n',` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh2_n)') +ifdef(`OPERATION_rsblsh2_n',` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh2_n)') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm b/gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm new file mode 100644 index 0000000..de00154 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm @@ -0,0 +1,172 @@ +dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C) +dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[] + +dnl Copyright 2009-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +C cycles/limb +C AMD K8,K9 2.1 +C AMD K10 2.0 +C AMD bd1 ~2.7 +C AMD bd2 ~2.7 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen 2.0 +C AMD bt1 3.3 +C AMD bt2 3.0 +C Intel P4 ? +C Intel PNR 3.0 +C Intel NHM 2.75 +C Intel SBR 2.55 +C Intel IBR 2.49 +C Intel HWL 2.25 +C Intel BWL 1.89 +C Intel SKL 1.90 +C Intel atom 8.4 +C Intel SLM 4.0 +C VIA nano ? 
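The LSH/RSH/M parameters work because the bits carried over from the previous limb occupy exactly the low LSH bits that the left shift vacates, so "shift and merge" can be done by one lea with scale M = 2^LSH, as in the lea (%r8,%r10,M) style instructions below. A small self-contained check of that identity (plain C, values arbitrary):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const unsigned LSH = 2, RSH = 64 - LSH;
  uint64_t prev = 0xfedcba9876543210ULL, v = 0x0123456789abcdefULL;
  uint64_t high = prev >> RSH;                        /* < 1 << LSH */
  assert (high + (v << LSH) == ((v << LSH) | high));  /* the lea identity */
  return 0;
}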
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') + +define(M, eval(m4_lshift(1,LSH))) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %r12 + push %r13 + push %r14 + push %r15 + + mov (vp), %r8 + lea (,%r8,M), %r12 + shr $RSH, %r8 + + mov R32(n), R32(%rax) + lea (rp,n,8), rp + lea (up,n,8), up + lea (vp,n,8), vp + neg n + and $3, R8(%rax) + je L(b00) + cmp $2, R8(%rax) + jc L(b01) + je L(b10) + +L(b11): mov 8(vp,n,8), %r10 + lea (%r8,%r10,M), %r14 + shr $RSH, %r10 + mov 16(vp,n,8), %r11 + lea (%r10,%r11,M), %r15 + shr $RSH, %r11 + ADDSUB (up,n,8), %r12 + ADCSBB 8(up,n,8), %r14 + ADCSBB 16(up,n,8), %r15 + sbb R32(%rax), R32(%rax) C save carry for next + mov %r12, (rp,n,8) + mov %r14, 8(rp,n,8) + mov %r15, 16(rp,n,8) + add $3, n + js L(top) + jmp L(end) + +L(b01): mov %r8, %r11 + ADDSUB (up,n,8), %r12 + sbb R32(%rax), R32(%rax) C save carry for next + mov %r12, (rp,n,8) + add $1, n + js L(top) + jmp L(end) + +L(b10): mov 8(vp,n,8), %r11 + lea (%r8,%r11,M), %r15 + shr $RSH, %r11 + ADDSUB (up,n,8), %r12 + ADCSBB 8(up,n,8), %r15 + sbb R32(%rax), R32(%rax) C save carry for next + mov %r12, (rp,n,8) + mov %r15, 8(rp,n,8) + add $2, n + js L(top) + jmp L(end) + +L(b00): mov 8(vp,n,8), %r9 + mov 16(vp,n,8), %r10 + jmp L(e00) + + ALIGN(16) +L(top): mov 16(vp,n,8), %r10 + mov (vp,n,8), %r8 + mov 8(vp,n,8), %r9 + lea (%r11,%r8,M), %r12 + shr $RSH, %r8 +L(e00): lea (%r8,%r9,M), %r13 + shr $RSH, %r9 + mov 24(vp,n,8), %r11 + lea (%r9,%r10,M), %r14 + shr $RSH, %r10 + lea (%r10,%r11,M), %r15 + shr $RSH, %r11 + add R32(%rax), R32(%rax) C restore carry + ADCSBB (up,n,8), %r12 + ADCSBB 8(up,n,8), %r13 + ADCSBB 16(up,n,8), %r14 + ADCSBB 24(up,n,8), %r15 + mov %r12, (rp,n,8) + mov %r13, 8(rp,n,8) + mov %r14, 16(rp,n,8) + sbb R32(%rax), R32(%rax) C save carry for next + mov %r15, 24(rp,n,8) + add $4, n + js L(top) +L(end): + +ifelse(ADDSUB,add,` + sub R32(%r11), R32(%rax) + neg R32(%rax) +',` + add R32(%r11), R32(%rax) + movslq R32(%rax), %rax +') + pop %r15 + pop %r14 + pop %r13 + pop %r12 + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm new file mode 100644 index 0000000..5ca128f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm @@ -0,0 +1,176 @@ +dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U. + +dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
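The header formula R = V*2^k ± U covers both entry points; the subtract direction (mpn_rsblsh_n) is the less obvious one because the top word of the result can go negative. A rough C rendering of it, with the shift count as a run-time argument as in this file, is given below. Illustrative only, not GMP's code; 64-bit limbs without nails, 0 < cnt < 64, and the function name is made up.

#include <gmp.h>

/* Illustrative reference for mpn_rsblsh_n: {rp,n} = ({vp,n} << cnt) - {up,n}.
   The return value is the high word of the (n+1)-word result; it wraps to a
   huge value (negative when read as signed) if {vp,n} << cnt < {up,n}. */
mp_limb_t
ref_rsblsh_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, unsigned cnt)
{
  mp_limb_t high = 0, bw = 0;                /* shifted-out bits / borrow */
  for (mp_size_t i = 0; i < n; i++)
    {
      mp_limb_t s = (vp[i] << cnt) | high;
      high = vp[i] >> (64 - cnt);
      mp_limb_t t = s - up[i];
      mp_limb_t b = t > s;                   /* borrow from the main subtract */
      mp_limb_t r = t - bw;
      b += r > t;                            /* borrow from the saved borrow */
      rp[i] = r;
      bw = b;
    }
  return high - bw;
}

For mpn_addlsh_n the subtractions become additions and the returned word is high plus carry.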
+ +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 3.1 < 3.85 for lshift + add_n +C AMD K10 3.1 < 3.85 for lshift + add_n +C Intel P4 14.6 > 7.33 for lshift + add_n +C Intel core2 3.87 > 3.27 for lshift + add_n +C Intel NHM 4 > 3.75 for lshift + add_n +C Intel SBR (5.8) > 3.46 for lshift + add_n +C Intel atom (7.75) < 8.75 for lshift + add_n +C VIA nano 4.7 < 6.25 for lshift + add_n + +C This was written quickly and not optimized at all. Surely one could get +C closer to 3 c/l or perhaps even under 3 c/l. Ideas: +C 1) Use indexing to save the 3 LEA +C 2) Write reasonable feed-in code +C 3) Be more clever about register usage +C 4) Unroll more, handling CL negation, carry save/restore cost much now +C 5) Reschedule + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt', `%r8') + +ifdef(`OPERATION_addlsh_n',` + define(ADCSBB, `adc') + define(func, mpn_addlsh_n) +') +ifdef(`OPERATION_rsblsh_n',` + define(ADCSBB, `sbb') + define(func, mpn_rsblsh_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %r12 + push %r13 + push %r14 + push %rbp + push %rbx + + mov n, %rax + xor R32(%rbx), R32(%rbx) C clear carry save register + mov R32(%r8), R32(%rcx) C shift count + xor R32(%rbp), R32(%rbp) C limb carry + + mov R32(%rax), R32(%r11) + and $3, R32(%r11) + je L(4) + sub $1, R32(%r11) + +L(012): mov (vp), %r8 + mov %r8, %r12 + shl R8(%rcx), %r8 + or %rbp, %r8 + neg R8(%rcx) + mov %r12, %rbp + shr R8(%rcx), %rbp + neg R8(%rcx) + add R32(%rbx), R32(%rbx) + ADCSBB (up), %r8 + mov %r8, (rp) + sbb R32(%rbx), R32(%rbx) + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + sub $1, R32(%r11) + jnc L(012) + +L(4): sub $4, %rax + jc L(end) + + ALIGN(16) +L(top): mov (vp), %r8 + mov %r8, %r12 + mov 8(vp), %r9 + mov %r9, %r13 + mov 16(vp), %r10 + mov %r10, %r14 + mov 24(vp), %r11 + + shl R8(%rcx), %r8 + shl R8(%rcx), %r9 + shl R8(%rcx), %r10 + or %rbp, %r8 + mov %r11, %rbp + shl R8(%rcx), %r11 + + neg R8(%rcx) + + shr R8(%rcx), %r12 + shr R8(%rcx), %r13 + shr R8(%rcx), %r14 + shr R8(%rcx), %rbp C used next iteration + + or %r12, %r9 + or %r13, %r10 + or %r14, %r11 + + neg R8(%rcx) + + add R32(%rbx), R32(%rbx) C restore carry flag + + ADCSBB (up), %r8 + ADCSBB 8(up), %r9 + ADCSBB 16(up), %r10 + ADCSBB 24(up), %r11 + + mov %r8, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %r11, 24(rp) + + sbb R32(%rbx), R32(%rbx) C save carry flag + + lea 32(up), up + lea 32(vp), vp + lea 32(rp), rp + + sub $4, %rax + jnc L(top) + +L(end): add R32(%rbx), R32(%rbx) + ADCSBB $0, %rbp + mov %rbp, %rax + pop %rbx + pop %rbp + pop %r14 + pop %r13 + pop %r12 + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aors_err1_n.asm b/gmp-6.3.0/mpn/x86_64/aors_err1_n.asm new file mode 100644 index 0000000..54d0b3f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aors_err1_n.asm @@ -0,0 +1,225 @@ +dnl AMD64 mpn_add_err1_n, mpn_sub_err1_n + +dnl Contributed by David Harvey. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.75 (degenerates to 3 c/l for some alignments) +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel corei ? +C Intel atom ? +C VIA nano ? + + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`ep', `%rcx') +define(`yp', `%r8') +define(`n', `%r9') +define(`cy_param', `8(%rsp)') + +define(`el', `%rbx') +define(`eh', `%rbp') +define(`t0', `%r10') +define(`t1', `%r11') +define(`t2', `%r12') +define(`t3', `%r13') +define(`w0', `%r14') +define(`w1', `%r15') + +ifdef(`OPERATION_add_err1_n', ` + define(ADCSBB, adc) + define(func, mpn_add_err1_n)') +ifdef(`OPERATION_sub_err1_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_err1_n)') + +MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n) + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + mov cy_param, %rax + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + lea (up,n,8), up + lea (vp,n,8), vp + lea (rp,n,8), rp + + mov R32(n), R32(%r10) + and $3, R32(%r10) + jz L(0mod4) + cmp $2, R32(%r10) + jc L(1mod4) + jz L(2mod4) +L(3mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + xor R32(t0), R32(t0) + xor R32(t1), R32(t1) + lea -24(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + mov 8(up,n,8), w1 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc 16(yp), el + ADCSBB 8(vp,n,8), w1 + mov w1, 8(rp,n,8) + cmovc 8(yp), t0 + mov 16(up,n,8), w0 + ADCSBB 16(vp,n,8), w0 + mov w0, 16(rp,n,8) + cmovc (yp), t1 + setc %al C save carry + add t0, el + adc $0, eh + add t1, el + adc $0, eh + + add $3, n + jnz L(loop) + jmp L(end) + + ALIGN(16) +L(0mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + lea (yp,n,8), yp + neg n + jmp L(loop) + + ALIGN(16) +L(1mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + lea -8(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc (yp), el + setc %al C save carry + + add $1, n + jnz L(loop) + jmp L(end) + + ALIGN(16) +L(2mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + xor R32(t0), R32(t0) + lea -16(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + mov 8(up,n,8), w1 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc 8(yp), el + ADCSBB 8(vp,n,8), w1 + mov w1, 8(rp,n,8) + cmovc (yp), t0 + setc %al C save carry + add t0, el + adc $0, eh + + add $2, n + jnz L(loop) + jmp L(end) + + ALIGN(32) +L(loop): + shr $1, %al C restore carry + mov -8(yp), t0 + mov $0, R32(t3) + mov (up,n,8), w0 + mov 8(up,n,8), w1 + ADCSBB (vp,n,8), w0 + cmovnc t3, t0 + ADCSBB 8(vp,n,8), w1 + mov -16(yp), t1 + mov w0, (rp,n,8) + mov 16(up,n,8), w0 + mov w1, 8(rp,n,8) + cmovnc t3, t1 + mov -24(yp), t2 + ADCSBB 16(vp,n,8), w0 + cmovnc t3, t2 + mov 24(up,n,8), w1 + ADCSBB 24(vp,n,8), w1 + cmovc -32(yp), t3 + setc %al C save carry + add t0, el + 
adc $0, eh + add t1, el + adc $0, eh + add t2, el + adc $0, eh + mov w0, 16(rp,n,8) + add t3, el + lea -32(yp), yp + adc $0, eh + mov w1, 24(rp,n,8) + add $4, n + jnz L(loop) + +L(end): + mov el, (ep) + mov eh, 8(ep) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aors_err2_n.asm b/gmp-6.3.0/mpn/x86_64/aors_err2_n.asm new file mode 100644 index 0000000..ce5c2a4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aors_err2_n.asm @@ -0,0 +1,172 @@ +dnl AMD64 mpn_add_err2_n, mpn_sub_err2_n + +dnl Contributed by David Harvey. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.5 +C AMD K10 ? +C Intel P4 ? +C Intel core2 6.9 +C Intel corei ? +C Intel atom ? +C VIA nano ? 
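What the _err entry points add on top of a plain add/sub is easiest to see in C: alongside rp = up ± vp they sum, into a two-limb accumulator per y vector, the y limb taken in reverse order (yp[n-1-i]) at every position i whose limb operation produced a carry or borrow. mpn_add_err1_n, whose code ends just above, tracks one such vector; the _err2 and _err3 variants track two and three. The sketch below mirrors the err1 case and is illustrative only; the argument order follows the register defines, cy is the incoming carry, and the name is made up.

#include <gmp.h>

/* Illustrative reference for mpn_add_err1_n:
   {rp,n} = {up,n} + {vp,n} + cy, and {ep,2} = sum of yp[n-1-i] over all limb
   positions i that produced a carry out.  Returns the final carry. */
mp_limb_t
ref_add_err1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
                mp_ptr ep, mp_srcptr yp, mp_size_t n, mp_limb_t cy)
{
  mp_limb_t el = 0, eh = 0;                  /* two-limb error accumulator */
  for (mp_size_t i = 0; i < n; i++)
    {
      mp_limb_t s = up[i] + vp[i];
      mp_limb_t c = s < vp[i];
      s += cy;
      c += s < cy;
      rp[i] = s;
      cy = c;                                /* carry out of this position */
      if (cy)                                /* the cmovc in the asm */
        {
          mp_limb_t y = yp[n - 1 - i];
          el += y;
          eh += el < y;                      /* ripple into the high limb */
        }
    }
  ep[0] = el;
  ep[1] = eh;
  return cy;
}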
+ + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`ep', `%rcx') +define(`yp1', `%r8') +define(`yp2', `%r9') +define(`n_param', `8(%rsp)') +define(`cy_param', `16(%rsp)') + +define(`cy1', `%r14') +define(`cy2', `%rax') + +define(`n', `%r10') + +define(`w', `%rbx') +define(`e1l', `%rbp') +define(`e1h', `%r11') +define(`e2l', `%r12') +define(`e2h', `%r13') + + +ifdef(`OPERATION_add_err2_n', ` + define(ADCSBB, adc) + define(func, mpn_add_err2_n)') +ifdef(`OPERATION_sub_err2_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_err2_n)') + +MULFUNC_PROLOGUE(mpn_add_err2_n mpn_sub_err2_n) + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + mov cy_param, cy2 + mov n_param, n + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + xor R32(e1l), R32(e1l) + xor R32(e1h), R32(e1h) + xor R32(e2l), R32(e2l) + xor R32(e2h), R32(e2h) + + sub yp1, yp2 + + lea (rp,n,8), rp + lea (up,n,8), up + lea (vp,n,8), vp + + test $1, n + jnz L(odd) + + lea -8(yp1,n,8), yp1 + neg n + jmp L(top) + + ALIGN(16) +L(odd): + lea -16(yp1,n,8), yp1 + neg n + shr $1, cy2 + mov (up,n,8), w + ADCSBB (vp,n,8), w + cmovc 8(yp1), e1l + cmovc 8(yp1,yp2), e2l + mov w, (rp,n,8) + sbb cy2, cy2 + inc n + jz L(end) + + ALIGN(16) +L(top): + mov (up,n,8), w + shr $1, cy2 C restore carry + ADCSBB (vp,n,8), w + mov w, (rp,n,8) + sbb cy1, cy1 C generate mask, preserve CF + + mov 8(up,n,8), w + ADCSBB 8(vp,n,8), w + mov w, 8(rp,n,8) + sbb cy2, cy2 C generate mask, preserve CF + + mov (yp1), w C (e1h:e1l) += cy1 * yp1 limb + and cy1, w + add w, e1l + adc $0, e1h + + and (yp1,yp2), cy1 C (e2h:e2l) += cy1 * yp2 limb + add cy1, e2l + adc $0, e2h + + mov -8(yp1), w C (e1h:e1l) += cy2 * next yp1 limb + and cy2, w + add w, e1l + adc $0, e1h + + mov -8(yp1,yp2), w C (e2h:e2l) += cy2 * next yp2 limb + and cy2, w + add w, e2l + adc $0, e2h + + add $2, n + lea -16(yp1), yp1 + jnz L(top) +L(end): + + mov e1l, (ep) + mov e1h, 8(ep) + mov e2l, 16(ep) + mov e2h, 24(ep) + + and $1, %eax C return carry + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aors_err3_n.asm b/gmp-6.3.0/mpn/x86_64/aors_err3_n.asm new file mode 100644 index 0000000..bb6d0c5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aors_err3_n.asm @@ -0,0 +1,156 @@ +dnl AMD64 mpn_add_err3_n, mpn_sub_err3_n + +dnl Contributed by David Harvey. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 7.0 +C AMD K10 ? 
+C Intel P4 ? +C Intel core2 ? +C Intel corei ? +C Intel atom ? +C VIA nano ? + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`ep', `%rcx') +define(`yp1', `%r8') +define(`yp2', `%r9') +define(`yp3_param', `8(%rsp)') +define(`n_param', `16(%rsp)') +define(`cy_param', `24(%rsp)') + +define(`n', `%r10') +define(`yp3', `%rcx') +define(`t', `%rbx') + +define(`e1l', `%rbp') +define(`e1h', `%r11') +define(`e2l', `%r12') +define(`e2h', `%r13') +define(`e3l', `%r14') +define(`e3h', `%r15') + + + +ifdef(`OPERATION_add_err3_n', ` + define(ADCSBB, adc) + define(func, mpn_add_err3_n)') +ifdef(`OPERATION_sub_err3_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_err3_n)') + +MULFUNC_PROLOGUE(mpn_add_err3_n mpn_sub_err3_n) + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + mov cy_param, %rax + mov n_param, n + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + push ep + mov 64(%rsp), yp3 C load from yp3_param + + xor R32(e1l), R32(e1l) + xor R32(e1h), R32(e1h) + xor R32(e2l), R32(e2l) + xor R32(e2h), R32(e2h) + xor R32(e3l), R32(e3l) + xor R32(e3h), R32(e3h) + + sub yp1, yp2 + sub yp1, yp3 + + lea -8(yp1,n,8), yp1 + lea (rp,n,8), rp + lea (up,n,8), up + lea (vp,n,8), vp + neg n + + ALIGN(16) +L(top): + shr $1, %rax C restore carry + mov (up,n,8), %rax + ADCSBB (vp,n,8), %rax + mov %rax, (rp,n,8) + sbb %rax, %rax C save carry and generate mask + + mov (yp1), t + and %rax, t + add t, e1l + adc $0, e1h + + mov (yp1,yp2), t + and %rax, t + add t, e2l + adc $0, e2h + + mov (yp1,yp3), t + and %rax, t + add t, e3l + adc $0, e3h + + lea -8(yp1), yp1 + inc n + jnz L(top) + +L(end): + and $1, %eax + pop ep + + mov e1l, (ep) + mov e1h, 8(ep) + mov e2l, 16(ep) + mov e2h, 24(ep) + mov e3l, 32(ep) + mov e3h, 40(ep) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aors_n.asm b/gmp-6.3.0/mpn/x86_64/aors_n.asm new file mode 100644 index 0000000..d5a314a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aors_n.asm @@ -0,0 +1,178 @@ +dnl AMD64 mpn_add_n, mpn_sub_n + +dnl Copyright 2003-2005, 2007, 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1.5 +C AMD K10 1.5 +C AMD bd1 1.8 +C AMD bd2 1.74 +C AMD bd3 ? 
+C AMD bd4 1.78 +C AMD zen 1.5 +C AMD bt1 2.54 +C AMD bt2 2.15 +C Intel P4 11.5 +C Intel core2 4.9 +C Intel NHM 5.53 +C Intel SBR 1.59 +C Intel IBR 1.55 +C Intel HWL 1.44 +C Intel BWL 1.14 +C Intel SKL 1.21 +C Intel atom 4 +C Intel SLM 3 +C VIA nano 3.25 + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C INPUT PARAMETERS +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + mov R32(n), R32(%rax) + shr $2, n + and $3, R32(%rax) + bt $0, %r8 C cy flag <- carry parameter + jrcxz L(lt4) + + mov (up), %r8 + mov 8(up), %r9 + dec n + jmp L(mid) + +EPILOGUE() + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + mov R32(n), R32(%rax) + shr $2, n + and $3, R32(%rax) + jrcxz L(lt4) + + mov (up), %r8 + mov 8(up), %r9 + dec n + jmp L(mid) + +L(lt4): dec R32(%rax) + mov (up), %r8 + jnz L(2) + ADCSBB (vp), %r8 + mov %r8, (rp) + adc R32(%rax), R32(%rax) + FUNC_EXIT() + ret + +L(2): dec R32(%rax) + mov 8(up), %r9 + jnz L(3) + ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 + mov %r8, (rp) + mov %r9, 8(rp) + adc R32(%rax), R32(%rax) + FUNC_EXIT() + ret + +L(3): mov 16(up), %r10 + ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + mov %r8, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + setc R8(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(top): ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + ADCSBB 24(vp), %r11 + mov %r8, (rp) + lea 32(up), up + mov %r9, 8(rp) + mov %r10, 16(rp) + dec n + mov %r11, 24(rp) + lea 32(vp), vp + mov (up), %r8 + mov 8(up), %r9 + lea 32(rp), rp +L(mid): mov 16(up), %r10 + mov 24(up), %r11 + jnz L(top) + +L(end): lea 32(up), up + ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + ADCSBB 24(vp), %r11 + lea 32(vp), vp + mov %r8, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %r11, 24(rp) + lea 32(rp), rp + + inc R32(%rax) + dec R32(%rax) + jnz L(lt4) + adc R32(%rax), R32(%rax) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/aorsmul_1.asm new file mode 100644 index 0000000..dfe4dc4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aorsmul_1.asm @@ -0,0 +1,190 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.52 +C AMD K10 2.51 +C AMD bd1 4.43 +C AMD bd2 5.03 5.63 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen ? +C AMD bobcat 6.20 +C AMD jaguar 5.57 6.56 +C Intel P4 14.9 17.1 +C Intel core2 5.15 +C Intel NHM 4.93 +C Intel SBR 3.95 +C Intel IBR 3.75 +C Intel HWL 3.62 +C Intel BWL 2.53 +C Intel SKL 2.53 +C Intel atom 21.3 +C Intel SLM 9.0 +C VIA nano 5.0 + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * The loop is great, but the prologue and epilogue code was quickly written. +C Tune it! + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vl', `%rcx') C r9 + +define(`n', `%r11') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`vl', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it +IFDOS(` mov n, %rbx ') + mul vl +IFSTD(` mov %rbx, n ') + + and $3, R32(%rbx) + jz L(b0) + cmp $2, R32(%rbx) + jz L(b2) + jg L(b3) + +L(b1): dec n + jne L(gt1) + ADDSUB %rax, (rp) + jmp L(ret) +L(gt1): lea 8(up,n,8), up + lea -8(rp,n,8), rp + neg n + xor %r10, %r10 + xor R32(%rbx), R32(%rbx) + mov %rax, %r9 + mov (up,n,8), %rax + mov %rdx, %r8 + jmp L(L1) + +L(b0): lea (up,n,8), up + lea -16(rp,n,8), rp + neg n + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp L(L0) + +L(b3): lea -8(up,n,8), up + lea -24(rp,n,8), rp + neg n + mov %rax, %rbx + mov %rdx, %r10 + jmp L(L3) + +L(b2): lea -16(up,n,8), up + lea -32(rp,n,8), rp + neg n + xor %r8, %r8 + xor R32(%rbx), R32(%rbx) + mov %rax, %r10 + mov 24(up,n,8), %rax + mov %rdx, %r9 + jmp L(L2) + + ALIGN(16) +L(top): ADDSUB %r10, (rp,n,8) + adc %rax, %r9 + mov (up,n,8), %rax + adc %rdx, %r8 + mov $0, R32(%r10) +L(L1): mul vl + ADDSUB %r9, 8(rp,n,8) + adc %rax, %r8 + adc %rdx, %rbx +L(L0): mov 8(up,n,8), %rax + mul vl + ADDSUB %r8, 16(rp,n,8) + adc %rax, %rbx + adc %rdx, %r10 +L(L3): mov 16(up,n,8), %rax + mul vl + ADDSUB %rbx, 24(rp,n,8) + mov $0, R32(%r8) C zero + mov %r8, %rbx C zero + adc %rax, %r10 + mov 24(up,n,8), %rax + mov %r8, %r9 C zero + adc %rdx, %r9 +L(L2): mul vl + add $4, n + js L(top) + + ADDSUB %r10, (rp,n,8) + adc %rax, %r9 + adc %r8, %rdx + ADDSUB %r9, 8(rp,n,8) +L(ret): adc $0, %rdx + mov %rdx, %rax + + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm 
b/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm new file mode 100644 index 0000000..c1dcdc4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm @@ -0,0 +1,186 @@ +dnl AMD64 mpn_addmul_2 optimised for Intel Atom. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel PNR +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom 18.8 this +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %rax + + mov (vp), v0 + mov 8(vp), v1 + + mov n_param, n + mul v0 + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): mov %rax, w0 + mov (up), %rax + mov %rdx, w1 + xor R32(w2), R32(w2) + lea -8(rp), rp + jmp L(lo0) + +L(b10): mov %rax, w2 + mov (up), %rax + mov %rdx, w3 + xor R32(w0), R32(w0) + lea -16(up), up + lea -24(rp), rp + jmp L(lo2) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): mov %rax, w3 + mov %rdx, w0 + mov (up), %rax + xor R32(w1), R32(w1) + lea 8(up), up + dec n + jmp L(lo1) + +L(b11): mov %rax, w1 + mov (up), %rax + mov %rdx, w2 + xor R32(w3), R32(w3) + lea -8(up), up + lea -16(rp), rp + jmp L(lo3) + + ALIGN(16) +L(top): +L(lo1): mul v1 + add w3, (rp) + mov $0, R32(w2) + adc %rax, w0 + mov (up), %rax + adc %rdx, w1 + mul v0 + add %rax, w0 + mov (up), %rax + adc %rdx, w1 + adc $0, R32(w2) +L(lo0): mul v1 + add w0, 8(rp) + adc %rax, w1 + mov 8(up), %rax + mov $0, R32(w3) + adc %rdx, w2 + mul v0 + add %rax, w1 + mov 8(up), %rax + adc %rdx, w2 + adc $0, R32(w3) +L(lo3): mul v1 + add w1, 16(rp) + adc %rax, w2 + mov 16(up), %rax + mov $0, R32(w0) + adc %rdx, w3 + mul v0 + add %rax, w2 + mov 16(up), %rax + adc %rdx, w3 + adc $0, R32(w0) +L(lo2): mul v1 + add w2, 24(rp) + adc %rax, w3 + mov 24(up), %rax + adc %rdx, w0 + mov $0, R32(w1) + lea 32(rp), rp + mul v0 + lea 32(up), up + add %rax, w3 + adc %rdx, w0 + mov -8(up), %rax + adc $0, R32(w1) + sub $4, n + 
ja L(top) + +L(end): mul v1 + add w3, (rp) + adc %rax, w0 + adc %rdx, w1 + mov w0, 8(rp) + mov w1, %rax + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm new file mode 100644 index 0000000..f44de19 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm @@ -0,0 +1,238 @@ +dnl AMD64 mpn_addlsh1_n, mpn_rsblsh1_n optimised for Intel Atom. +dnl Used also for AMD bd1. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * This code is slightly large at 433 bytes. +C * sublsh1_n.asm and this file use the same basic pattern. + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 2.3 +C AMD bobcat ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 4.875 (4.75 is probably possible) +C VIA nano ? 
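Like aorrlsh1_n.asm earlier, this file keeps two independent carry chains alive (the doubling carry scy and the add/sub carry acy) by parking each flag in a register as a 0 / all-ones mask with sbb %r,%r and re-creating CF from it with add %r,%r. The small program below checks the C-level counterpart of that round trip; it is only an illustration of the idiom, with made-up names.

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  for (int cf = 0; cf <= 1; cf++)
    {
      uint64_t mask = -(uint64_t) cf;        /* sbb %rax,%rax: 0 or all ones */
      int restored = (mask + mask) < mask;   /* add %rax,%rax carries iff mask is all ones */
      assert (restored == cf);
    }
  return 0;
}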
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_addlsh1_n) + define(func_nc, mpn_addlsh1_nc)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsblsh1_n) + define(func_nc, mpn_rsblsh1_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbp + xor R32(%rbp), R32(%rbp) +L(ent): mov R32(n), R32(%rax) + and $3, R32(%rax) + jz L(b0) + cmp $2, R32(%rax) + jz L(b2) + jg L(b3) + +L(b1): mov (vp), %r8 + add %r8, %r8 + lea 8(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + ADCSBB (up), %r8 + mov %r8, (rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 8(up), up + lea 8(rp), rp + jmp L(b0) + +L(b2): mov (vp), %r8 + add %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + lea 16(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + ADCSBB (up), %r8 + mov %r8, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 16(up), up + lea 16(rp), rp + jmp L(b0) + +L(b3): mov (vp), %r8 + add %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + mov 16(vp), %r10 + adc %r10, %r10 + lea 24(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + ADCSBB (up), %r8 + mov %r8, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 24(up), up + lea 24(rp), rp + +L(b0): test $4, R8(n) + jz L(skp) + add R32(%rax), R32(%rax) C restore scy + mov (vp), %r8 + adc %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + mov 16(vp), %r10 + adc %r10, %r10 + mov 24(vp), %r11 + adc %r11, %r11 + lea 32(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + ADCSBB (up), %r8 + mov %r8, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + ADCSBB 24(up), %r11 + mov %r11, 24(rp) + lea 32(up), up + lea 32(rp), rp + sbb R32(%rbp), R32(%rbp) C save acy + +L(skp): cmp $8, n + jl L(rtn) + + push %r12 + push %r13 + push %r14 + push %rbx + lea -64(rp), rp + jmp L(x) + + ALIGN(16) +L(top): add R32(%rax), R32(%rax) C restore scy + lea 64(rp), rp + mov (vp), %r8 + adc %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + mov 16(vp), %r10 + adc %r10, %r10 + mov 24(vp), %r11 + adc %r11, %r11 + mov 32(vp), %r12 + adc %r12, %r12 + mov 40(vp), %r13 + adc %r13, %r13 + mov 48(vp), %r14 + adc %r14, %r14 + mov 56(vp), %rbx + adc %rbx, %rbx + lea 64(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + ADCSBB (up), %r8 + mov %r8, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + ADCSBB 24(up), %r11 + mov %r11, 24(rp) + ADCSBB 32(up), %r12 + mov %r12, 32(rp) + ADCSBB 40(up), %r13 + mov %r13, 40(rp) + ADCSBB 48(up), %r14 + mov %r14, 48(rp) + ADCSBB 56(up), %rbx + mov %rbx, 56(rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 64(up), up +L(x): sub $8, n + jge L(top) + +L(end): pop %rbx + pop %r14 + pop %r13 + pop %r12 +L(rtn): +ifdef(`OPERATION_addlsh1_n',` + add R32(%rbp), R32(%rax) + neg R32(%rax)') +ifdef(`OPERATION_rsblsh1_n',` + sub R32(%rax), R32(%rbp) + movslq R32(%rbp), %rax') + + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + 
push %rbp + neg %r8 C set CF + sbb R32(%rbp), R32(%rbp) C save acy + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm new file mode 100644 index 0000000..02fb29d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm @@ -0,0 +1,191 @@ +dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2) +dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] +dnl Optimised for Intel Atom. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 5.75 +C VIA nano ? + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') + +define(`LSH', 2) +define(`RSH', 62) +define(M, eval(m4_lshift(1,LSH))) + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_addlsh2_n) + define(func_nc, mpn_addlsh2_nc)') +ifdef(`OPERATION_rsblsh2_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsblsh2_n) + define(func_nc, mpn_rsblsh2_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov R32(n), R32(%rax) + and $3, R32(%rax) + jz L(b0) C we rely on rax = 0 at target + cmp $2, R32(%rax) + mov $0, R32(%rax) + jz L(b2) + jg L(b3) + +L(b1): mov (vp), %r9 + lea (%rax,%r9,M), %rbp + shr $RSH, %r9 + sub $1, n + lea -8(up), up + lea -8(rp), rp + jz L(cj1) + mov 8(vp), %r10 + lea (%r9,%r10,M), %r9 + shr $RSH, %r10 + mov 16(vp), %r11 + lea 24(vp), vp + mov (vp), %r8 + lea (%r10,%r11,M), %r10 + shr $RSH, %r11 + add R32(%rax), R32(%rax) + jmp L(L1) + +L(b2): lea -32(rp), rp + mov (vp), %r8 + lea -32(up), up + lea (%rax,%r8,M), %rbx + shr $RSH, %r8 + mov 8(vp), %r9 + sub $2, n + jle L(end) + jmp L(top) + +L(b3): lea -24(up), up + mov (vp), %r11 + lea -24(rp), rp + mov 8(vp), %r8 + lea (%rax,%r11,M), %r10 + shr $RSH, %r11 + lea 8(vp), vp + lea (%r11,%r8,M), %rbx + add $1, n + jmp L(L3) + +L(b0): lea -16(up), up + mov (vp), %r10 + lea (%rax,%r10,M), %r9 + shr $RSH, %r10 + mov 8(vp), %r11 + lea -16(rp), rp + mov 16(vp), %r8 + lea (%r10,%r11,M), %r10 + shr $RSH, %r11 + add R32(%rax), R32(%rax) + lea 16(vp), vp + jmp L(L0) + + ALIGN(16) +L(top): lea (%r8,%r9,M), %rbp + shr $RSH, %r9 + lea 32(up), up + 
mov 16(vp), %r10 + lea (%r9,%r10,M), %r9 + shr $RSH, %r10 + mov 24(vp), %r11 + lea 32(rp), rp + lea 32(vp), vp + mov (vp), %r8 + lea (%r10,%r11,M), %r10 + shr $RSH, %r11 + add R32(%rax), R32(%rax) + ADCSBB (up), %rbx + mov %rbx, (rp) +L(L1): ADCSBB 8(up), %rbp + mov %rbp, 8(rp) +L(L0): ADCSBB 16(up), %r9 + lea (%r11,%r8,M), %rbx + mov %r9, 16(rp) +L(L3): ADCSBB 24(up), %r10 + sbb R32(%rax), R32(%rax) +L(L2): shr $RSH, %r8 + mov 8(vp), %r9 + mov %r10, 24(rp) + sub $4, n + jg L(top) + +L(end): lea (%r8,%r9,M), %rbp + shr $RSH, %r9 + lea 32(up), up + lea 32(rp), rp + add R32(%rax), R32(%rax) + ADCSBB (up), %rbx + mov %rbx, (rp) +L(cj1): ADCSBB 8(up), %rbp + mov %rbp, 8(rp) + +ifdef(`OPERATION_addlsh2_n',` + mov R32(n), R32(%rax) C zero rax + adc %r9, %rax') +ifdef(`OPERATION_rsblsh2_n',` + sbb n, %r9 C subtract 0 + mov %r9, %rax') + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm b/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm new file mode 100644 index 0000000..83b8df9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm @@ -0,0 +1,128 @@ +dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Atom. + +dnl Copyright 2011, 2017 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Marco Bodrato. Ported to 64-bit by +dnl Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
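As in the generic aors_n.asm earlier in this directory, the entry points here compute {rp,n} = {up,n} ± {vp,n}, with the _nc variants taking an explicit carry or borrow in (the cy parameter). A plain C reference of the add direction is below; for the sub direction the adds become subtracts and the carries become borrows. Illustrative only, not GMP's code; the name is made up.

#include <gmp.h>

/* Illustrative reference for mpn_add_nc (mpn_add_n is the cy == 0 case):
   {rp,n} = {up,n} + {vp,n} + cy, returning the carry-out (0 or 1). */
mp_limb_t
ref_add_nc (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, mp_limb_t cy)
{
  for (mp_size_t i = 0; i < n; i++)
    {
      mp_limb_t s = up[i] + vp[i];
      mp_limb_t c = s < vp[i];
      s += cy;
      c += s < cy;
      rp[i] = s;
      cy = c;
    }
  return cy;
}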
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2 +C AMD K10 2 +C AMD bull 2.34\2.63 +C AMD pile 2.27\2.52 +C AMD steam +C AMD excavator +C AMD bobcat 2.79 +C AMD jaguar 2.78 +C Intel P4 11 +C Intel core2 7.5 +C Intel NHM 8.5 +C Intel SBR 2.11 +C Intel IBR 2.07 +C Intel HWL 1.75 +C Intel BWL 1.51 +C Intel SKL 1.52 +C Intel atom 3 +C Intel SLM 4 +C VIA nano + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func_n, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func_n, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + xor cy, cy C carry + +L(com): shr n C n >> 1 + jz L(1) C n == 1 + jc L(1m2) C n % 2 == 1 + +L(0m2): shr cy + mov (up), %r10 + lea 8(up), up + lea 8(vp), vp + lea -8(rp), rp + jmp L(mid) + +L(1): shr cy + mov (up), %r9 + jmp L(end) + +L(1m2): shr cy + mov (up), %r9 + + ALIGN(16) +L(top): ADCSBB (vp), %r9 + lea 16(up), up + mov -8(up), %r10 + lea 16(vp), vp + mov %r9, (rp) +L(mid): ADCSBB -8(vp), %r10 + lea 16(rp), rp + dec n + mov (up), %r9 + mov %r10, -8(rp) + jnz L(top) + +L(end): ADCSBB (vp), %r9 + mov $0, R32(%rax) + mov %r9, (rp) + adc R32(%rax), R32(%rax) + FUNC_EXIT() + ret +EPILOGUE() + +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), cy ') + jmp L(com) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm new file mode 100644 index 0000000..7cbc085 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm @@ -0,0 +1,194 @@ +dnl AMD64 mpn_addmul_1/mpn_submul_1 optimised for Intel Atom. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.5 +C AMD K10 4.5 +C AMD bull 4.73 +C AMD pile 4.60 4.80 +C AMD steam +C AMD excavator +C AMD bobcat 5.48 +C AMD jaguar 5.61 +C Intel P4 16.6 +C Intel core2 5.09 +C Intel NHM 4.79 +C Intel SBR 3.88 +C Intel IBR 3.65 +C Intel HWL 3.53 +C Intel BWL 2.75 +C Intel SKL 2.76 +C Intel atom 19.4 +C Intel SLM 8 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. 
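For reference, mpn_addmul_1 and mpn_submul_1 perform a single-limb multiply-accumulate across the operand: the double-limb product of each up limb with v0 is added to (or subtracted from) the corresponding rp limb, and the word that does not fit is returned. A C sketch using the compiler's 128-bit integer (a GCC/Clang extension) follows; it is illustrative only, not GMP's code, and shows the add direction.

#include <gmp.h>

/* Illustrative reference for mpn_addmul_1: {rp,n} += {up,n} * v0, returning
   the high word.  For mpn_submul_1 the product is subtracted and the
   returned word is the borrow. */
mp_limb_t
ref_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
  mp_limb_t cy = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (mp_limb_t) p;                 /* low word stays in rp */
      cy = (mp_limb_t) (p >> 64);            /* high word carries on */
    }
  return cy;
}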
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%rbx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + + mov (up), %rax + lea -8(up,n_param,8), up + lea -16(rp,n_param,8), rp + + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): test $2, R8(n_param) + jnz L(b10) + +L(b00): mov $1, R32(n) + sub n_param, n + mul v0 + mov %rax, %r11 + mov 8(up,n,8), %rax + mov %rdx, %r10 + mul v0 + mov %rax, %r8 + mov 16(up,n,8), %rax + jmp L(lo0) + +L(b10): mov $3, R32(n) + sub n_param, n + mul v0 + mov %rax, %r11 + mov -8(up,n,8), %rax + mov %rdx, %r10 + mul v0 + test n, n + jns L(cj2) + mov %rax, %r8 + mov (up,n,8), %rax + mov %rdx, %r9 + jmp L(lo2) + +L(bx1): test $2, R8(n_param) + jnz L(b11) + +L(b01): mov $2, R32(n) + sub n_param, n + mul v0 + test n, n + jns L(cj1) + mov %rax, %r8 + mov (up,n,8), %rax + mov %rdx, %r9 + mul v0 + mov %rax, %r11 + mov 8(up,n,8), %rax + mov %rdx, %r10 + jmp L(lo1) + +L(b11): xor R32(n), R32(n) + sub n_param, n + mul v0 + mov %rax, %r8 + mov 16(up,n,8), %rax + mov %rdx, %r9 + mul v0 + mov %rax, %r11 + mov 24(up,n,8), %rax + jmp L(lo3) + + ALIGN(16) +L(top): mul v0 + ADDSUB %r8, -16(rp,n,8) + mov %rax, %r8 + mov (up,n,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 +L(lo2): mul v0 + ADDSUB %r11, -8(rp,n,8) + mov %rax, %r11 + mov 8(up,n,8), %rax + adc %r10, %r8 + mov %rdx, %r10 + adc $0, %r9 +L(lo1): mul v0 + ADDSUB %r8, (rp,n,8) + mov %rax, %r8 + adc %r9, %r11 + mov 16(up,n,8), %rax + adc $0, %r10 +L(lo0): mov %rdx, %r9 + mul v0 + ADDSUB %r11, 8(rp,n,8) + mov %rax, %r11 + adc %r10, %r8 + mov 24(up,n,8), %rax + adc $0, %r9 +L(lo3): add $4, n + mov %rdx, %r10 + js L(top) + +L(end): mul v0 + ADDSUB %r8, -16(rp,n,8) + adc %r9, %r11 + adc $0, %r10 +L(cj2): ADDSUB %r11, -8(rp,n,8) + adc %r10, %rax + adc $0, %rdx +L(cj1): ADDSUB %rax, (rp,n,8) + mov $0, R32(%rax) + adc %rdx, %rax + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm b/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm new file mode 100644 index 0000000..fcb9a0f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_cnd_add_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_cnd_add_n) +include_mpn(`x86_64/coreisbr/cnd_add_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm b/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm new file mode 100644 index 0000000..9eee1c1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_cnd_sub_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_cnd_sub_n) +include_mpn(`x86_64/coreisbr/cnd_sub_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/com.asm b/gmp-6.3.0/mpn/x86_64/atom/com.asm new file mode 100644 index 0000000..6b6460f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com optimised for Intel Atom. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/copyd.asm b/gmp-6.3.0/mpn/x86_64/atom/copyd.asm new file mode 100644 index 0000000..e309279 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd optimised for Intel Atom. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/copyi.asm b/gmp-6.3.0/mpn/x86_64/atom/copyi.asm new file mode 100644 index 0000000..00ec3c2 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi optimised for Intel Atom. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm b/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm new file mode 100644 index 0000000..d9ba5fe --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_divexact_1) +include_mpn(`x86_64/nano/dive_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h new file mode 100644 index 0000000..2cd90f6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h @@ -0,0 +1,222 @@ +/* Intel Atom/64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#define SHLD_SLOW 1 +#define SHRD_SLOW 1 + +/* 1600 MHz Diamondville (Atom 330) */ +/* FFT tuning limit = 50,646,641 */ +/* Generated by tuneup.c, 2019-10-16, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 16 + +#define DIV_1_VS_MUL_1_PERCENT 201 + +#define MUL_TOOM22_THRESHOLD 12 +#define MUL_TOOM33_THRESHOLD 74 +#define MUL_TOOM44_THRESHOLD 106 +#define MUL_TOOM6H_THRESHOLD 155 +#define MUL_TOOM8H_THRESHOLD 212 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 77 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 72 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 58 + +#define SQR_BASECASE_THRESHOLD 5 +#define SQR_TOOM2_THRESHOLD 22 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 130 +#define SQR_TOOM6_THRESHOLD 159 +#define SQR_TOOM8_THRESHOLD 236 + +#define MULMID_TOOM42_THRESHOLD 16 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 9 + +#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 220, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \ + { 17, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \ + { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \ + { 25,10}, { 7, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 23, 8}, { 47, 9}, \ + { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \ + { 71, 9}, { 143, 8}, { 287,10}, { 79,11}, \ + { 47,10}, { 95, 9}, { 191,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207,11}, { 111,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319,11}, { 175,10}, { 351,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,11}, { 223,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 767,12}, { 223,11}, \ + { 447,13}, { 127,12}, { 255,11}, { 511,12}, \ + { 287,11}, { 575,12}, { 319,11}, { 639,12}, \ + { 351,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 575,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 767,13}, \ + { 447,14}, { 255,13}, { 511,12}, { 1023,13}, \ + { 575,12}, { 1151,13}, { 703,14}, { 383,13}, \ + { 831,12}, { 1663,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \ + { 1407,12}, { 2815,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1407,13}, { 2815,15}, { 767,14}, { 1791,16}, \ + { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,15}, { 1535,14}, { 16384,15}, \ + { 
32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 169 +#define MUL_FFT_THRESHOLD 2240 + +#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 184, 5}, { 11, 6}, { 13, 7}, { 7, 6}, \ + { 15, 7}, { 8, 6}, { 17, 7}, { 13, 8}, \ + { 7, 7}, { 17, 8}, { 9, 7}, { 19, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 23,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 63, 8}, \ + { 127, 7}, { 255,10}, { 39, 8}, { 159,10}, \ + { 47, 9}, { 95, 8}, { 191,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 7}, { 511,10}, \ + { 71, 9}, { 143, 8}, { 287, 7}, { 575, 9}, \ + { 159, 8}, { 319,11}, { 47,10}, { 95, 9}, \ + { 191, 8}, { 383,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319, 8}, \ + { 639,10}, { 175, 9}, { 351,11}, { 95,10}, \ + { 191, 9}, { 383,11}, { 111,10}, { 223, 9}, \ + { 447,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319, 9}, { 639,11}, { 175,10}, \ + { 351,12}, { 95,11}, { 191,10}, { 383, 9}, \ + { 767,11}, { 223,10}, { 447,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 287,10}, \ + { 575,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,12}, { 287,11}, { 575,12}, { 319,11}, \ + { 639,12}, { 351,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 447,14}, { 127,13}, { 255,12}, \ + { 575,13}, { 319,12}, { 703,13}, { 383,12}, \ + { 767,13}, { 447,14}, { 255,13}, { 511,12}, \ + { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,15}, { 255,14}, \ + { 511,13}, { 1151,14}, { 639,13}, { 1407,12}, \ + { 2815,14}, { 767,13}, { 1663,14}, { 895,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2047,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,15}, { 1535,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 172 +#define SQR_FFT_THRESHOLD 1728 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 4392 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 85 +#define SQRLO_SQR_THRESHOLD 3176 + +#define DC_DIV_QR_THRESHOLD 34 +#define DC_DIVAPPR_Q_THRESHOLD 119 +#define DC_BDIV_QR_THRESHOLD 31 +#define DC_BDIV_Q_THRESHOLD 76 + +#define INV_MULMOD_BNM1_THRESHOLD 22 +#define INV_NEWTON_THRESHOLD 149 +#define INV_APPR_THRESHOLD 123 + +#define BINV_NEWTON_THRESHOLD 179 +#define REDC_1_TO_REDC_2_THRESHOLD 24 +#define REDC_2_TO_REDC_N_THRESHOLD 39 + +#define MU_DIV_QR_THRESHOLD 807 +#define MU_DIVAPPR_Q_THRESHOLD 807 +#define MUPI_DIV_QR_THRESHOLD 77 +#define MU_BDIV_QR_THRESHOLD 748 +#define MU_BDIV_Q_THRESHOLD 807 + +#define POWM_SEC_TABLE 1,22,114,326,1486 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 30 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1565 + +#define FAC_DSC_THRESHOLD 960 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD2_DIV1_METHOD 3 /* 5.86% faster than 4 */ +#define HGCD_THRESHOLD 
88 +#define HGCD_APPR_THRESHOLD 88 +#define HGCD_REDUCE_THRESHOLD 1182 +#define GCD_DC_THRESHOLD 241 +#define GCDEXT_DC_THRESHOLD 192 +#define JACOBI_BASE_METHOD 3 /* 9.43% faster than 2 */ + +/* Tuneup completed successfully, took 193098 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/atom/lshift.asm b/gmp-6.3.0/mpn/x86_64/atom/lshift.asm new file mode 100644 index 0000000..1b37d5d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/lshift.asm @@ -0,0 +1,123 @@ +dnl AMD64 mpn_lshift -- mpn left shift, optimised for Atom. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 4.5 +C VIA nano ? + +C TODO +C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times +C larger. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + lea -8(up,n,8), up + lea -8(rp,n,8), rp + shr R32(n) + mov (up), %rax + jnc L(evn) + + mov %rax, %r11 + shl R8(%rcx), %r11 + neg R8(%rcx) + shr R8(%rcx), %rax + test n, n + jnz L(gt1) + mov %r11, (rp) + FUNC_EXIT() + ret + +L(gt1): mov -8(up), %r8 + mov %r8, %r10 + shr R8(%rcx), %r8 + jmp L(lo1) + +L(evn): mov %rax, %r10 + neg R8(%rcx) + shr R8(%rcx), %rax + mov -8(up), %r9 + mov %r9, %r11 + shr R8(%rcx), %r9 + neg R8(%rcx) + dec n + lea 8(rp), rp + lea -8(up), up + jz L(end) + + ALIGN(8) +L(top): shl R8(%rcx), %r10 + or %r10, %r9 + shl R8(%rcx), %r11 + neg R8(%rcx) + mov -8(up), %r8 + mov %r8, %r10 + mov %r9, -8(rp) + shr R8(%rcx), %r8 + lea -16(rp), rp +L(lo1): mov -16(up), %r9 + or %r11, %r8 + mov %r9, %r11 + shr R8(%rcx), %r9 + lea -16(up), up + neg R8(%rcx) + mov %r8, (rp) + dec n + jg L(top) + +L(end): shl R8(%rcx), %r10 + or %r10, %r9 + shl R8(%rcx), %r11 + mov %r9, -8(rp) + mov %r11, -16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm new file mode 100644 index 0000000..7385f8f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm @@ -0,0 +1,127 @@ +dnl AMD64 mpn_lshiftc -- mpn left shift with complement, optimised for Atom. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 5 +C VIA nano ? + +C TODO +C * Consider using 4-way unrolling. We reach 4.5 c/l, but the code is 2.5 +C times larger. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + lea -8(up,n,8), up + lea -8(rp,n,8), rp + shr R32(n) + mov (up), %rax + jnc L(evn) + + mov %rax, %r11 + shl R8(%rcx), %r11 + neg R8(%rcx) + shr R8(%rcx), %rax + test n, n + jnz L(gt1) + not %r11 + mov %r11, (rp) + FUNC_EXIT() + ret + +L(gt1): mov -8(up), %r8 + mov %r8, %r10 + shr R8(%rcx), %r8 + jmp L(lo1) + +L(evn): mov %rax, %r10 + neg R8(%rcx) + shr R8(%rcx), %rax + mov -8(up), %r9 + mov %r9, %r11 + shr R8(%rcx), %r9 + neg R8(%rcx) + lea 8(rp), rp + lea -8(up), up + jmp L(lo0) + +C ALIGN(16) +L(top): shl R8(%rcx), %r10 + or %r10, %r9 + shl R8(%rcx), %r11 + not %r9 + neg R8(%rcx) + mov -8(up), %r8 + lea -16(rp), rp + mov %r8, %r10 + shr R8(%rcx), %r8 + mov %r9, 8(rp) +L(lo1): or %r11, %r8 + mov -16(up), %r9 + mov %r9, %r11 + shr R8(%rcx), %r9 + lea -16(up), up + neg R8(%rcx) + not %r8 + mov %r8, (rp) +L(lo0): dec n + jg L(top) + +L(end): shl R8(%rcx), %r10 + or %r10, %r9 + not %r9 + shl R8(%rcx), %r11 + not %r11 + mov %r9, -8(rp) + mov %r11, -16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm b/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm new file mode 100644 index 0000000..a0dcf1e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm @@ -0,0 +1,147 @@ +dnl AMD64 mpn_mul_1 optimised for Intel Atom. + +dnl Copyright 2003-2005, 2007, 2008, 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 3.03 +C AMD K10 3.03 +C AMD bull 4.74 +C AMD pile 4.56 +C AMD steam +C AMD excavator +C AMD bobcat 5.56 6.04 +C AMD jaguar 5.55 5.84 +C Intel P4 13.05 +C Intel core2 4.03 +C Intel NHM 3.80 +C Intel SBR 2.75 +C Intel IBR 2.69 +C Intel HWL 2.50 +C Intel BWL 2.55 +C Intel SKL 2.57 +C Intel atom 17.3 +C Intel SLM 14.7 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1) + FUNC_ENTRY(4) + xor %r8, %r8 +L(com): mov (up), %rax + lea -16(up,n_param,8), up + lea -8(rp,n_param,8), rp + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): mov %r8, %r9 + test $2, R8(n_param) + jnz L(b10) + +L(b00): mov $2, R32(n) + sub n_param, n + jmp L(lo0) + +L(bx1): test $2, R8(n_param) + jnz L(b11) + +L(b01): mov $3, R32(n) + sub n_param, n + mul v0 + cmp $2, n + jnz L(lo1) + jmp L(cj1) + +L(b11): mov $1, R32(n) + sub n_param, n + jmp L(lo3) + +L(b10): xor R32(n), R32(n) + sub n_param, n + jmp L(lo2) + +L(top): mul v0 + mov %r9, -24(rp,n,8) +L(lo1): xor %r9d, %r9d + add %rax, %r8 + mov (up,n,8), %rax + adc %rdx, %r9 + mov %r8, -16(rp,n,8) +L(lo0): xor %r8d, %r8d + mul v0 + add %rax, %r9 + mov 8(up,n,8), %rax + adc %rdx, %r8 + mov %r9, -8(rp,n,8) +L(lo3): xor %r9d, %r9d + mul v0 + add %rax, %r8 + mov 16(up,n,8), %rax + adc %rdx, %r9 + mov %r8, (rp,n,8) +L(lo2): xor %r8d, %r8d + mul v0 + add %rax, %r9 + mov 24(up,n,8), %rax + adc %rdx, %r8 + add $4, n + js L(top) + +L(end): mul v0 + mov %r9, -8(rp) +L(cj1): add %rax, %r8 + mov $0, R32(%rax) + adc %rdx, %rax + mov %r8, (rp) + FUNC_EXIT() + ret +EPILOGUE() + +PROLOGUE(mpn_mul_1c) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(com) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm b/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm new file mode 100644 index 0000000..4bc22cd --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm @@ -0,0 +1,190 @@ +dnl AMD64 mpn_mul_2 optimised for Intel Atom. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C AMD K8,K9 5.78 +C AMD K10 5.78 +C AMD bull 9.10 +C AMD pile 9.17 +C AMD steam +C AMD excavator +C AMD bobcat 11.3 +C AMD jaguar 10.9 +C Intel P4 24.6 +C Intel core2 8.06 +C Intel NHM 7.65 +C Intel SBR 6.28 +C Intel IBR 6.10 +C Intel HWL 6.09 +C Intel BWL 4.73 +C Intel SKL 4.77 +C Intel atom 35.3 +C Intel SLM 25.6 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %rax + + mov (vp), v0 + mov 8(vp), v1 + + mov n_param, n + mul v0 + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): mov %rax, w0 + mov (up), %rax + mov %rdx, w1 + xor R32(w2), R32(w2) + lea -8(rp), rp + jmp L(lo0) + +L(b10): mov %rax, w2 + mov (up), %rax + mov %rdx, w3 + xor R32(w0), R32(w0) + lea -16(up), up + lea -24(rp), rp + jmp L(lo2) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): mov %rax, w3 + mov %rdx, w0 + mov (up), %rax + xor R32(w1), R32(w1) + lea 8(up), up + dec n + jmp L(lo1) + +L(b11): mov %rax, w1 + mov (up), %rax + mov %rdx, w2 + xor R32(w3), R32(w3) + lea -8(up), up + lea -16(rp), rp + jmp L(lo3) + + ALIGN(16) +L(top): +L(lo1): mul v1 + add %rax, w0 + mov (up), %rax + mov $0, R32(w2) + mov w3, (rp) + adc %rdx, w1 + mul v0 + add %rax, w0 + mov (up), %rax + adc %rdx, w1 + adc $0, R32(w2) +L(lo0): mul v1 + add %rax, w1 + mov 8(up), %rax + mov w0, 8(rp) + adc %rdx, w2 + mul v0 + add %rax, w1 + mov 8(up), %rax + adc %rdx, w2 + mov $0, R32(w3) + adc $0, R32(w3) +L(lo3): mul v1 + add %rax, w2 + mov 16(up), %rax + mov w1, 16(rp) + mov $0, R32(w0) + adc %rdx, w3 + mul v0 + add %rax, w2 + mov 16(up), %rax + adc %rdx, w3 +L(lo2): mov $0, R32(w1) + mov w2, 24(rp) + adc $0, R32(w0) + mul v1 + add %rax, w3 + mov 24(up), %rax + lea 32(up), up + adc %rdx, w0 + mul v0 + lea 32(rp), rp + add %rax, w3 + adc %rdx, w0 + mov -8(up), %rax + adc $0, R32(w1) + sub $4, n + ja L(top) + +L(end): mul v1 + mov w3, (rp) + add %rax, w0 + adc %rdx, w1 + mov w0, 8(rp) + mov w1, %rax + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/popcount.asm b/gmp-6.3.0/mpn/x86_64/atom/popcount.asm new file mode 100644 index 0000000..fb14dd3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/popcount.asm @@ -0,0 +1,35 @@ +dnl x86-64 mpn_popcount. + +dnl Copyright 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86/pentium4/sse2/popcount.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm b/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm new file mode 100644 index 0000000..62b9a84 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm @@ -0,0 +1,579 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Atom. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat 5.0 +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. +C * Make lead-in code for the inner loops be more similar. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
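+C For example, with the definition just below, I(-24(up),-24(up,i,8))
+C expands to the plain operand -24(up); redefining I as `$2' instead
+C selects the conservative indexed form -24(up,i,8) in the wind-down code.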
+define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') +define(`w0', `%rbp') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea (mp_param,n,8), mp + lea (up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(bx0) + +L(bx1): test $2, R8(n) + jz L(b3) + +L(b1): cmp $-1, R32(n) + jz L(n1) + +L(otp1):lea 1(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, %rbp + mov 8(mp,n,8), %rax + mov %rdx, %r9 + mul q0 + mov %rax, %rbx + mov 16(mp,n,8), %rax + mov %rdx, %r10 + mul q0 + add (up,n,8), %rbp + mov %rax, %rbp + adc %r9, %rbx + mov 24(mp,n,8), %rax + adc $0, %r10 + mov %rdx, %r9 + mul q0 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + mov %rax, %r11 + adc %r10, %rbp + mov 32(mp,n,8), %rax + adc $0, %r9 + imul u0inv, %rbx C next q limb + jmp L(e1) + + ALIGNx +L(tp1): mul q0 + add %rbp, -24(up,i,8) + mov %rax, %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + mul q0 + add %r11, -16(up,i,8) + mov %rax, %r11 + mov 8(mp,i,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 + mul q0 + add %rbp, -8(up,i,8) + mov %rax, %rbp + adc %r9, %r11 + mov 16(mp,i,8), %rax + adc $0, %r10 + mov %rdx, %r9 + mul q0 + add %r11, (up,i,8) + mov %rax, %r11 + adc %r10, %rbp + mov 24(mp,i,8), %rax + adc $0, %r9 +L(e1): add $4, i + mov %rdx, %r10 + js L(tp1) + +L(ed1): mul q0 + add %rbp, I(-24(up),-24(up,i,8)) + adc %r9, %r11 + adc $0, %r10 + add %r11, I(-16(up),-16(up,i,8)) + adc %r10, %rax + adc $0, %rdx + add %rax, I(-8(up),-8(up,i,8)) + adc $0, %rdx + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b3): cmp $-3, R32(n) + jz L(n3) + +L(otp3):lea 3(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, %rbp + mov 8(mp,n,8), %rax + mov %rdx, %r9 + mul q0 + mov %rax, %rbx + mov 16(mp,n,8), %rax + mov %rdx, %r10 + mul q0 + add (up,n,8), %rbp + mov %rax, %rbp + mov 24(mp,n,8), %rax + adc %r9, %rbx + mov %rdx, %r9 + adc $0, %r10 + mul q0 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + mov %rax, %r11 + mov 32(mp,n,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 + imul u0inv, %rbx C next q limb + jmp L(e3) + + ALIGNx +L(tp3): mul q0 + add %rbp, -24(up,i,8) + mov %rax, %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + mul q0 + add %r11, -16(up,i,8) + mov %rax, %r11 + mov 8(mp,i,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 +L(e3): mul q0 + add %rbp, -8(up,i,8) + mov %rax, %rbp + adc %r9, %r11 + mov 16(mp,i,8), %rax + adc $0, %r10 + mov %rdx, %r9 + mul q0 + add %r11, (up,i,8) + mov %rax, %r11 + adc %r10, %rbp + mov 24(mp,i,8), %rax + adc $0, %r9 + add $4, i + mov %rdx, %r10 + js L(tp3) + +L(ed3): mul q0 + add %rbp, I(-24(up),-24(up,i,8)) + adc %r9, %r11 + adc $0, %r10 + add %r11, I(-16(up),-16(up,i,8)) + adc %r10, %rax + adc $0, %rdx + add %rax, I(-8(up),-8(up,i,8)) + adc $0, %rdx + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C 
up++ + dec j + jnz L(otp3) +C jmp L(cj) + +L(cj): +IFSTD(` lea (up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea (up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(bx0): test $2, R8(n) + jnz L(b2) + +L(b0): cmp $-4, R32(n) + jz L(n4) + +L(otp0):lea 4(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, %r11 + mov 8(mp,n,8), %rax + mov %rdx, %r10 + mul q0 + mov %rax, %rbx + mov 16(mp,n,8), %rax + mov %rdx, %r9 + mul q0 + add (up,n,8), %r11 + mov %rax, %r11 + adc %r10, %rbx + mov 24(mp,n,8), %rax + adc $0, %r9 + mov %rdx, %r10 + mul q0 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + mov %rax, %rbp + mov 32(mp,n,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): mul q0 + add %rbp, -24(up,i,8) + mov %rax, %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 +L(e0): mul q0 + add %r11, -16(up,i,8) + mov %rax, %r11 + mov 8(mp,i,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 + mul q0 + add %rbp, -8(up,i,8) + mov %rax, %rbp + adc %r9, %r11 + mov 16(mp,i,8), %rax + adc $0, %r10 + mov %rdx, %r9 + mul q0 + add %r11, (up,i,8) + mov %rax, %r11 + adc %r10, %rbp + mov 24(mp,i,8), %rax + adc $0, %r9 + add $4, i + mov %rdx, %r10 + js L(tp0) + +L(ed0): mul q0 + add %rbp, I(-24(up),-24(up,i,8)) + adc %r9, %r11 + adc $0, %r10 + add %r11, I(-16(up),-16(up,i,8)) + adc %r10, %rax + adc $0, %rdx + add %rax, I(-8(up),-8(up,i,8)) + adc $0, %rdx + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + jmp L(cj) + +L(b2): cmp $-2, R32(n) + jz L(n2) + +L(otp2):lea 2(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, %r11 + mov 8(mp,n,8), %rax + mov %rdx, %r10 + mul q0 + mov %rax, %rbx + mov 16(mp,n,8), %rax + mov %rdx, %r9 + mul q0 + add (up,n,8), %r11 + mov %rax, %r11 + adc %r10, %rbx + mov 24(mp,n,8), %rax + adc $0, %r9 + mov %rdx, %r10 + mul q0 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + mov %rax, %rbp + mov 32(mp,n,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + imul u0inv, %rbx C next q limb + jmp L(e2) + + ALIGNx +L(tp2): mul q0 + add %rbp, -24(up,i,8) + mov %rax, %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + mul q0 + add %r11, -16(up,i,8) + mov %rax, %r11 + mov 8(mp,i,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 + mul q0 + add %rbp, -8(up,i,8) + mov %rax, %rbp + adc %r9, %r11 + mov 16(mp,i,8), %rax + adc $0, %r10 + mov %rdx, %r9 +L(e2): mul q0 + add %r11, (up,i,8) + mov %rax, %r11 + adc %r10, %rbp + mov 24(mp,i,8), %rax + adc $0, %r9 + add $4, i + mov %rdx, %r10 + js L(tp2) + +L(ed2): mul q0 + add %rbp, I(-24(up),-24(up,i,8)) + adc %r9, %r11 + adc $0, %r10 + add %r11, I(-16(up),-16(up,i,8)) + adc %r10, %rax + adc $0, %rdx + add %rax, I(-8(up),-8(up,i,8)) + adc $0, %rdx + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp2) + jmp L(cj) + +L(n1): mov (mp_param), %rax + mul q0 + add -8(up), %rax + adc (up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov -16(up), %rbp + mul 
q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov -8(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -16(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov (up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 8(up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -24(mp), %rax + mov -24(up), %r10 + mul q0 + add %rax, %r10 + mov -16(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov -16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + add %r11, %rbp + mov -8(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, -16(up) + add %r9, %r10 + adc $0, %r11 + mov %r10, -8(up) + mov %r11, -24(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + + mov -48(up), %rdx + mov -40(up), %rbx + xor R32(%rax), R32(%rax) + add %rbp, %rdx + adc %r10, %rbx + adc -8(up), %r11 + mov %rdx, (rp) + mov %rbx, 8(rp) + mov %r11, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n4): mov -32(mp), %rax + mul q0 + mov %rax, %r11 + mov -24(mp), %rax + mov %rdx, %r10 + mul q0 + mov %rax, %rbx + mov -16(mp), %rax + mov %rdx, %r9 + mul q0 + add -32(up), %r11 + mov %rax, %r11 + adc %r10, %rbx + mov -8(mp), %rax + adc $0, %r9 + mov %rdx, %r10 + mul q0 + add -24(up), %rbx + mov %rbx, -24(up) + adc %r9, %r11 + adc $0, %r10 + imul u0inv, %rbx C next q limb + add %r11, -16(up) + adc %r10, %rax + adc $0, %rdx + add %rax, -8(up) + adc $0, %rdx + mov %rdx, -32(up) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + dec j + lea 8(up), up C up++ + jnz L(n4) + jmp L(cj) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm new file mode 100644 index 0000000..6f5f638 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm @@ -0,0 +1,287 @@ +dnl x86-64 mpn_rsh1add_n/mpn_rsh1sub_n. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Schedule loop less. It is now almost surely overscheduled, resulting in +C large feed-in and wind-down code. + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? 
+C Intel NMH ? +C Intel SBR ? +C Intel atom 5.25 +C VIA nano ? + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n',`%rcx') + +ifdef(`OPERATION_rsh1add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_rsh1add_n) + define(func_nc, mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsh1sub_n) + define(func_nc, mpn_rsh1sub_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), %r15 + ADDSUB (vp), %r15 + sbb R32(%rbx), R32(%rbx) + xor R32(%rax), R32(%rax) + shr %r15 + adc R32(%rax), R32(%rax) C return value + + mov R32(n), R32(%rbp) + and $3, R32(%rbp) + jz L(b0) + cmp $2, R32(%rbp) + jae L(b23) + +L(b1): dec n + jnz L(gt1) + shl $63, %rbx + add %rbx, %r15 + mov %r15, (rp) + jmp L(cj1) +L(gt1): lea 24(up), up + lea 24(vp), vp + mov -16(up), %r9 + add R32(%rbx), R32(%rbx) + mov -8(up), %r10 + lea 24(rp), rp + mov (up), %r11 + ADCSBB -16(vp), %r9 + ADCSBB -8(vp), %r10 + mov %r15, %r12 + ADCSBB (vp), %r11 + mov %r9, %r13 + sbb R32(%rbx), R32(%rbx) + mov %r11, %r15 + mov %r10, %r14 + shl $63, %r11 + shl $63, %r10 + shl $63, %r9 + or %r9, %r12 + shr %r13 + mov 8(up), %r8 + shr %r14 + or %r10, %r13 + shr %r15 + or %r11, %r14 + sub $4, n + jz L(cj5) +L(gt5): mov 16(up), %r9 + add R32(%rbx), R32(%rbx) + mov 24(up), %r10 + ADCSBB 8(vp), %r8 + mov %r15, %rbp + mov 32(up), %r11 + jmp L(lo1) + +L(b23): jnz L(b3) + mov 8(up), %r8 + sub $2, n + jnz L(gt2) + add R32(%rbx), R32(%rbx) + ADCSBB 8(vp), %r8 + mov %r8, %r12 + jmp L(cj2) +L(gt2): mov 16(up), %r9 + add R32(%rbx), R32(%rbx) + mov 24(up), %r10 + ADCSBB 8(vp), %r8 + mov %r15, %rbp + mov 32(up), %r11 + ADCSBB 16(vp), %r9 + lea 32(up), up + ADCSBB 24(vp), %r10 + mov %r9, %r13 + ADCSBB 32(vp), %r11 + mov %r8, %r12 + jmp L(lo2) + +L(b3): lea 40(up), up + lea 8(vp), vp + mov %r15, %r14 + add R32(%rbx), R32(%rbx) + mov -32(up), %r11 + ADCSBB 0(vp), %r11 + lea 8(rp), rp + sbb R32(%rbx), R32(%rbx) + mov %r11, %r15 + shl $63, %r11 + mov -24(up), %r8 + shr %r15 + or %r11, %r14 + sub $3, n + jnz L(gt3) + add R32(%rbx), R32(%rbx) + ADCSBB 8(vp), %r8 + jmp L(cj3) +L(gt3): mov -16(up), %r9 + add R32(%rbx), R32(%rbx) + mov -8(up), %r10 + ADCSBB 8(vp), %r8 + mov %r15, %rbp + mov (up), %r11 + ADCSBB 16(vp), %r9 + ADCSBB 24(vp), %r10 + mov %r8, %r12 + jmp L(lo3) + +L(b0): lea 48(up), up + lea 16(vp), vp + add R32(%rbx), R32(%rbx) + mov -40(up), %r10 + lea 16(rp), rp + mov -32(up), %r11 + ADCSBB -8(vp), %r10 + mov %r15, %r13 + ADCSBB (vp), %r11 + sbb R32(%rbx), R32(%rbx) + mov %r11, %r15 + mov %r10, %r14 + shl $63, %r11 + shl $63, %r10 + mov -24(up), %r8 + shr %r14 + or %r10, %r13 + shr %r15 + or %r11, %r14 + sub $4, n + jnz L(gt4) + add R32(%rbx), R32(%rbx) + ADCSBB 8(vp), %r8 + jmp L(cj4) +L(gt4): mov -16(up), %r9 + add R32(%rbx), R32(%rbx) + mov -8(up), %r10 + ADCSBB 8(vp), %r8 + mov %r15, %rbp + mov (up), %r11 + ADCSBB 16(vp), %r9 + jmp L(lo0) + + ALIGN(8) +L(top): mov 16(up), %r9 + shr %r14 + or %r10, %r13 + shr %r15 + or %r11, %r14 + add R32(%rbx), R32(%rbx) + mov 24(up), %r10 + mov %rbp, (rp) + ADCSBB 8(vp), %r8 + mov %r15, %rbp + lea 32(rp), rp + mov 32(up), %r11 +L(lo1): ADCSBB 16(vp), %r9 + lea 32(up), up + mov %r12, -24(rp) +L(lo0): ADCSBB 24(vp), %r10 + mov %r8, %r12 + mov %r13, -16(rp) +L(lo3): ADCSBB 32(vp), %r11 + mov %r9, %r13 + mov 
%r14, -8(rp) +L(lo2): sbb R32(%rbx), R32(%rbx) + shl $63, %r8 + mov %r11, %r15 + shr %r12 + mov %r10, %r14 + shl $63, %r9 + lea 32(vp), vp + shl $63, %r10 + or %r8, %rbp + shl $63, %r11 + or %r9, %r12 + shr %r13 + mov 8(up), %r8 + sub $4, n + jg L(top) + +L(end): shr %r14 + or %r10, %r13 + shr %r15 + or %r11, %r14 + mov %rbp, (rp) + lea 32(rp), rp +L(cj5): add R32(%rbx), R32(%rbx) + ADCSBB 8(vp), %r8 + mov %r12, -24(rp) +L(cj4): mov %r13, -16(rp) +L(cj3): mov %r8, %r12 + mov %r14, -8(rp) +L(cj2): sbb R32(%rbx), R32(%rbx) + shl $63, %r8 + shr %r12 + or %r8, %r15 + shl $63, %rbx + add %rbx, %r12 + mov %r15, (rp) + mov %r12, 8(rp) +L(cj1): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/rshift.asm b/gmp-6.3.0/mpn/x86_64/atom/rshift.asm new file mode 100644 index 0000000..29c027d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/rshift.asm @@ -0,0 +1,121 @@ +dnl AMD64 mpn_rshift -- mpn right shift, optimised for Atom. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 4.5 +C VIA nano ? + +C TODO +C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times +C larger. 
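+C (Descriptive note, not from the original file:) the loop below is 2-way
+C unrolled; each pass builds two result limbs by OR-ing one source limb
+C shifted right by cnt with its neighbour shifted left by 64-cnt, toggling
+C the single count register between the two values with neg.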
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + shr R32(n) + mov (up), %rax + jnc L(evn) + + mov %rax, %r11 + shr R8(cnt), %r11 + neg R8(cnt) + shl R8(cnt), %rax + test n, n + jnz L(gt1) + mov %r11, (rp) + FUNC_EXIT() + ret + +L(gt1): mov 8(up), %r8 + mov %r8, %r10 + shl R8(cnt), %r8 + jmp L(lo1) + +L(evn): mov %rax, %r10 + neg R8(cnt) + shl R8(cnt), %rax + mov 8(up), %r9 + mov %r9, %r11 + shl R8(cnt), %r9 + neg R8(cnt) + dec n + lea -8(rp), rp + lea 8(up), up + jz L(end) + + ALIGN(8) +L(top): shr R8(cnt), %r10 + or %r10, %r9 + shr R8(cnt), %r11 + neg R8(cnt) + mov 8(up), %r8 + mov %r8, %r10 + mov %r9, 8(rp) + shl R8(cnt), %r8 + lea 16(rp), rp +L(lo1): mov 16(up), %r9 + or %r11, %r8 + mov %r9, %r11 + shl R8(cnt), %r9 + lea 16(up), up + neg R8(cnt) + mov %r8, (rp) + dec n + jg L(top) + +L(end): shr R8(cnt), %r10 + or %r10, %r9 + shr R8(cnt), %r11 + mov %r9, 8(rp) + mov %r11, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm new file mode 100644 index 0000000..1306acd --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm @@ -0,0 +1,242 @@ +dnl AMD64 mpn_sublsh1_n optimised for Intel Atom. +dnl Used also for AMD bd1. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * This code is slightly large at 501 bytes. +C * aorrlsh1_n.asm and this file use the same basic pattern. + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 2.3 +C AMD bobcat ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 5 (4.875 is probably possible) +C VIA nano ? 
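+C (Descriptive note, not from the original file:) two carry chains are
+C interleaved below: the carry out of doubling the v limbs (scy) and the
+C borrow of the subtraction (acy).  Each is parked as 0 or -1 in a register
+C with sbb and brought back into CF with add before its chain resumes; the
+C return value combines both at L(rtn).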
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sublsh1_n) + FUNC_ENTRY(4) + push %rbp + push %r15 + xor R32(%rbp), R32(%rbp) +L(ent): mov R32(n), R32(%rax) + and $3, R32(%rax) + jz L(b0) + cmp $2, R32(%rax) + jz L(b2) + jg L(b3) + +L(b1): mov (vp), %r8 + add %r8, %r8 + lea 8(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + mov (up), %r15 + sbb %r8, %r15 + mov %r15, (rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 8(up), up + lea 8(rp), rp + jmp L(b0) + +L(b2): mov (vp), %r8 + add %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + lea 16(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + mov (up), %r15 + sbb %r8, %r15 + mov %r15, (rp) + mov 8(up), %r15 + sbb %r9, %r15 + mov %r15, 8(rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 16(up), up + lea 16(rp), rp + jmp L(b0) + +L(b3): mov (vp), %r8 + add %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + mov 16(vp), %r10 + adc %r10, %r10 + lea 24(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + mov (up), %r15 + sbb %r8, %r15 + mov %r15, (rp) + mov 8(up), %r15 + sbb %r9, %r15 + mov %r15, 8(rp) + mov 16(up), %r15 + sbb %r10, %r15 + mov %r15, 16(rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 24(up), up + lea 24(rp), rp + +L(b0): test $4, R8(n) + jz L(skp) + add R32(%rax), R32(%rax) C restore scy + mov (vp), %r8 + adc %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + mov 16(vp), %r10 + adc %r10, %r10 + mov 24(vp), %r11 + adc %r11, %r11 + lea 32(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + mov (up), %r15 + sbb %r8, %r15 + mov %r15, (rp) + mov 8(up), %r15 + sbb %r9, %r15 + mov %r15, 8(rp) + mov 16(up), %r15 + sbb %r10, %r15 + mov %r15, 16(rp) + mov 24(up), %r15 + sbb %r11, %r15 + mov %r15, 24(rp) + lea 32(up), up + lea 32(rp), rp + sbb R32(%rbp), R32(%rbp) C save acy + +L(skp): cmp $8, n + jl L(rtn) + + push %r12 + push %r13 + push %r14 + push %rbx + lea -64(rp), rp + jmp L(x) + + ALIGN(16) +L(top): mov (vp), %r8 + add R32(%rax), R32(%rax) + lea 64(vp), vp + adc %r8, %r8 + mov -56(vp), %r9 + adc %r9, %r9 + mov -48(vp), %r10 + adc %r10, %r10 + mov -40(vp), %r11 + adc %r11, %r11 + mov -32(vp), %r12 + adc %r12, %r12 + mov -24(vp), %r13 + adc %r13, %r13 + mov -16(vp), %r14 + adc %r14, %r14 + mov -8(vp), %r15 + adc %r15, %r15 + sbb R32(%rax), R32(%rax) + add R32(%rbp), R32(%rbp) + mov (up), %rbp + lea 64(rp), rp + mov 8(up), %rbx + sbb %r8, %rbp + mov 32(up), %r8 + mov %rbp, (rp) + sbb %r9, %rbx + mov 16(up), %rbp + mov %rbx, 8(rp) + sbb %r10, %rbp + mov 24(up), %rbx + mov %rbp, 16(rp) + sbb %r11, %rbx + mov %rbx, 24(rp) + sbb %r12, %r8 + mov 40(up), %r9 + mov %r8, 32(rp) + sbb %r13, %r9 + mov 48(up), %rbp + mov %r9, 40(rp) + sbb %r14, %rbp + mov 56(up), %rbx + mov %rbp, 48(rp) + sbb %r15, %rbx + lea 64(up), up + mov %rbx, 56(rp) + sbb R32(%rbp), R32(%rbp) +L(x): sub $8, n + jge L(top) + +L(end): pop %rbx + pop %r14 + pop %r13 + pop %r12 +L(rtn): + add R32(%rbp), R32(%rax) + neg R32(%rax) + + pop %r15 + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() +PROLOGUE(mpn_sublsh1_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbp + push %r15 + neg %r8 C set CF + sbb R32(%rbp), R32(%rbp) C save acy + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd1/README b/gmp-6.3.0/mpn/x86_64/bd1/README new file mode 100644 index 0000000..ccd210e --- 
/dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/README @@ -0,0 +1,11 @@ +This directory contains code for AMD bulldozer including its piledriver update. + +We currently make limited use of SIMD instructions, both via the MPN_PATH and +via inclusion of x86_64/fastsse files. + +The bd1 cores share one SIMD/FPU pipeline for two integer units. This probably +means that an all-core GMP load (such as a HPC load) might run slower if there +is significant SIMD dependency. + +We should perhaps allow a special 'bd1nosimd' pseudo cpu-name excluding any +SIMD code. diff --git a/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm new file mode 100644 index 0000000..b54e91a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm @@ -0,0 +1,235 @@ +dnl AMD64 mpn_addmul_2 optimised for AMD Bulldozer. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 4.2 +C AMD bd2 4.4 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bt1 +C AMD bt2 +C Intel P4 +C Intel PNR +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rbx') +define(`v1', `%rbp') +define(`X0', `%r12') +define(`X1', `%r13') + +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + + mov (vp), v0 + mov 8(vp), v1 + + mov (up), %rax + mov $0, R32(w2) C abuse w2 + + lea (up,n_param,8), up + lea (rp,n_param,8), rp + sub n_param, w2 + mul v0 + + test $1, R8(w2) + jnz L(bx1) + +L(bx0): mov %rdx, X0 + mov %rax, X1 + test $2, R8(w2) + jnz L(b10) + +L(b00): lea (w2), n C un = 4, 8, 12, ... + mov (up,w2,8), %rax + mov (rp,w2,8), w3 + mul v1 + mov %rax, w0 + mov 8(up,w2,8), %rax + mov %rdx, w1 + jmp L(lo0) + +L(b10): lea 2(w2), n C un = 2, 6, 10, ... + mov (up,w2,8), %rax + mov (rp,w2,8), w1 + mul v1 + mov %rdx, w3 + mov %rax, w2 + mov -8(up,n,8), %rax + test n, n + jz L(end) + jmp L(top) + +L(bx1): mov %rax, X0 + mov %rdx, X1 + test $2, R8(w2) + jz L(b11) + +L(b01): lea 1(w2), n C un = 1, 5, 9, ... 
+ mov (up,w2,8), %rax + mul v1 + mov (rp,w2,8), w2 + mov %rdx, w0 + mov %rax, w3 + jmp L(lo1) + +L(b11): lea -1(w2), n C un = 3, 7, 11, ... + mov (up,w2,8), %rax + mul v1 + mov (rp,w2,8), w0 + mov %rax, w1 + mov 8(up,w2,8), %rax + mov %rdx, w2 + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo2): mul v0 + add w1, X1 + mov X1, -16(rp,n,8) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov -8(up,n,8), %rax + mul v1 + mov -8(rp,n,8), w1 + mov %rdx, w0 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo1): mov (up,n,8), %rax + mul v0 + add w2, X0 + mov X0, -8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + mov (up,n,8), %rax + adc $0, X0 + mov (rp,n,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + mov 8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(lo0): mul v0 + add w3, X1 + mov X1, (rp,n,8) + adc %rax, X0 + mov 8(up,n,8), %rax + mov %rdx, X1 + adc $0, X1 + mov 8(rp,n,8), w3 + mul v1 + add w3, w0 + adc %rax, w1 + mov 16(up,n,8), %rax + mov %rdx, w2 + adc $0, w2 +L(lo3): mul v0 + add w0, X0 + mov X0, 8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + adc $0, X0 + mov 16(up,n,8), %rax + mov 16(rp,n,8), w0 + mul v1 + mov %rdx, w3 + add w0, w1 + adc %rax, w2 + adc $0, w3 + mov 24(up,n,8), %rax + add $4, n + jnc L(top) + +L(end): mul v0 + add w1, X1 + mov X1, -16(rp) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov -8(up), %rax + mul v1 + mov -8(rp), w1 + add w1, w2 + adc %rax, w3 + adc $0, %rdx + add w2, X0 + adc $0, X1 + mov X0, -8(rp) + add w3, X1 + mov X1, (rp) + adc $0, %rdx + mov %rdx, %rax + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm new file mode 100644 index 0000000..c34a5fa --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_addlsh1_n and mpn_rsblsh1_n + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) +include_mpn(`x86_64/atom/aorrlsh1_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm new file mode 100644 index 0000000..5516c9d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +include_mpn(`x86_64/aorrlsh_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm new file mode 100644 index 0000000..143c42e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +include_mpn(`x86_64/coreihwl/aors_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm new file mode 100644 index 0000000..fc0d2fe --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm @@ -0,0 +1,190 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 3.30 3.58 +C AMD K10 3.09 +C AMD bull 4.47 4.72 +C AMD pile 4.66 +C AMD steam +C AMD excavator +C AMD bobcat 6.30 +C AMD jaguar 6.29 +C Intel P4 17.3 17.8 +C Intel core2 5.13 +C Intel NHM 4.85 +C Intel SBR 3.83 +C Intel IBR 3.75 +C Intel HWL 3.45 +C Intel BWL 2.56 +C Intel SKL 2.53 +C Intel atom 20.3 +C Intel SLM 9 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Try to make loop run closer to 4 c/l in Bulldozer and Piledriver. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%r11') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`v0', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it +IFDOS(` mov n, %rbx ') + mul v0 + +IFSTD(` mov %rbx, n ') + + and $3, R32(%rbx) + lea -16(rp,n,8), rp + jz L(b0) + cmp $2, R32(%rbx) + jb L(b1) + jz L(b2) + +L(b3): mov $0, R32(%r8) + mov %rax, %rbx + mov $0, R32(%r9) + mov 8(up), %rax + mov %rdx, %r10 + lea (up,n,8), up + not n + jmp L(L3) + +L(b0): mov $0, R32(%r10) + mov %rax, %r8 + mov %rdx, %rbx + mov 8(up), %rax + lea (up,n,8), up + neg n + jmp L(L0) + +L(b1): cmp $1, n + jz L(n1) + mov %rax, %r9 + mov 8(up), %rax + mov %rdx, %r8 + mov $0, R32(%rbx) + lea (up,n,8), up + neg n + inc n + jmp L(L1) + +L(b2): mov $0, R32(%rbx) + mov %rax, %r10 + mov %rdx, %r9 + mov 8(up), %rax + mov $0, R32(%r8) + lea (up,n,8), up + neg n + add $2, n + jns L(end) + + ALIGN(32) +L(top): mul v0 + ADDSUB %r10, (rp,n,8) + adc %rax, %r9 + mov (up,n,8), %rax + adc %rdx, %r8 +L(L1): mul v0 + mov $0, R32(%r10) + ADDSUB %r9, 8(rp,n,8) + adc %rax, %r8 + adc %rdx, %rbx + mov 8(up,n,8), %rax +L(L0): mul v0 + ADDSUB %r8, 16(rp,n,8) + mov $0, R32(%r8) + adc %rax, %rbx + mov $0, R32(%r9) + mov 16(up,n,8), %rax + adc %rdx, %r10 +L(L3): mul v0 + ADDSUB %rbx, 24(rp,n,8) + mov $0, R32(%rbx) + adc %rax, %r10 + adc %rdx, %r9 + mov 24(up,n,8), %rax + add $4, n + js L(top) + +L(end): mul v0 + ADDSUB %r10, (rp) + adc %r9, %rax + adc %r8, %rdx +L(n1): ADDSUB %rax, 8(rp) + adc $0, %rdx + mov %rdx, %rax + + pop %rbx 
+IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/bd1/com.asm b/gmp-6.3.0/mpn/x86_64/bd1/com.asm new file mode 100644 index 0000000..43f3561 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com optimised for AMD bd1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm b/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm new file mode 100644 index 0000000..675cdc3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd optimised for AMD bd1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm b/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm new file mode 100644 index 0000000..ceef036 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi optimised for AMD bd1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm new file mode 100644 index 0000000..4723093 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h new file mode 100644 index 0000000..210f382 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h @@ -0,0 +1,265 @@ +/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3600-3800 MHz Bulldozer Zambezi */ +/* FFT tuning limit = 464,627,200 */ +/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 31 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 27 + +#define DIV_1_VS_MUL_1_PERCENT 275 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 161 +#define MUL_TOOM6H_THRESHOLD 226 +#define MUL_TOOM8H_THRESHOLD 339 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 61 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 108 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 91 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 85 +#define SQR_TOOM4_THRESHOLD 234 +#define SQR_TOOM6_THRESHOLD 286 +#define SQR_TOOM8_THRESHOLD 466 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \ + { 28, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 99,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63, 7}, \ + { 1023, 8}, { 543, 9}, { 303,10}, { 167,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255,11}, { 143,10}, { 287,11}, { 159,12}, \ + { 95,11}, { 191,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 447,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,10}, { 2687,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 799,11}, \ + { 1599,12}, { 831,13}, { 447,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \ + { 1087,11}, { 2175,13}, { 
575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1599,13}, { 831,12}, { 1727,11}, { 3455,13}, \ + { 895,15}, { 255,14}, { 511,13}, { 1023,12}, \ + { 2047,13}, { 1087,12}, { 2175,13}, { 1215,12}, \ + { 2431,11}, { 4863,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,14}, { 895,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,12}, { 5631,13}, { 2943,12}, { 5887,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \ + { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4479,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \ + { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 251 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 364 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 364, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95, 7}, \ + { 1535, 8}, { 799, 7}, { 1599, 8}, { 831, 9}, \ + { 447,10}, { 239,11}, { 127,10}, { 255,11}, \ + { 143,10}, { 303,11}, { 159,12}, { 95,11}, \ + { 191,10}, { 383,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 303,12}, { 159,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,13}, \ + { 191,12}, { 383,11}, { 767,10}, { 1535,12}, \ + { 415,11}, { 831,12}, { 447,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \ + { 1087,10}, { 2175,12}, { 575,11}, { 1151,12}, \ + { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,12}, { 703,11}, { 1407,12}, \ + { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,11}, { 1599,12}, { 831,13}, { 447,12}, \ + { 895,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,11}, { 2175,13}, { 575,12}, \ + { 1151,11}, { 2303,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 767,12}, { 1599,11}, { 3199,13}, \ + { 831,12}, { 1727,11}, { 3455,13}, { 895,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 
2175,13}, { 1151,12}, { 2303,13}, \ + { 1215,12}, { 2431,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,11}, { 6911,14}, { 895,13}, { 1791,12}, \ + { 3583,13}, { 1919,12}, { 3839,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2943,12}, { 5887,11}, { 11775,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,13}, { 3583,14}, { 1919,13}, \ + { 3839,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4351,12}, { 8703,13}, { 4479,12}, { 8959,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2815,13}, { 5631,14}, { 2943,13}, \ + { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4351,13}, { 8703,14}, \ + { 4479,13}, { 8959,15}, { 2303,14}, { 4991,13}, \ + { 9983,15}, { 2559,14}, { 5119,15}, { 2815,14}, \ + { 5887,13}, { 11775,16}, { 1535,15}, { 3071,14}, \ + { 6143,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \ + { 4095,14}, { 8191,15}, { 4351,14}, { 8959,15}, \ + { 4863,14}, { 9983,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 275 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 23 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 167 +#define DC_BDIV_QR_THRESHOLD 48 +#define DC_BDIV_Q_THRESHOLD 93 + +#define INV_MULMOD_BNM1_THRESHOLD 38 +#define INV_NEWTON_THRESHOLD 197 +#define INV_APPR_THRESHOLD 179 + +#define BINV_NEWTON_THRESHOLD 230 +#define REDC_1_TO_REDC_2_THRESHOLD 32 +#define REDC_2_TO_REDC_N_THRESHOLD 55 + +#define MU_DIV_QR_THRESHOLD 1387 +#define MU_DIVAPPR_Q_THRESHOLD 1387 +#define MUPI_DIV_QR_THRESHOLD 92 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1334 + +#define POWM_SEC_TABLE 1,22,194,434,452 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 438 +#define SET_STR_PRECOMPUTE_THRESHOLD 1254 + +#define FAC_DSC_THRESHOLD 189 +#define FAC_ODD_THRESHOLD 26 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD2_DIV1_METHOD 3 /* 2.31% faster than 4 */ +#define HGCD_THRESHOLD 104 +#define HGCD_APPR_THRESHOLD 52 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 465 +#define GCDEXT_DC_THRESHOLD 283 +#define JACOBI_BASE_METHOD 4 /* 5.81% faster than 1 */ + +/* Tuneup completed successfully, took 554602 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm b/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm new file mode 100644 index 0000000..799cdda --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm @@ -0,0 +1,206 @@ +dnl AMD64 SSSE3/XOP mpn_hamdist -- hamming distance. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb good for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 1.51-2.0 y +C AMD bd2 1.50-1.9 y +C AMD bd3 ? +C AMD bd4 ? +C AMD zen n/a +C AMD bobcat n/a +C AMD jaguar n/a +C Intel P4 n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL n/a +C Intel BWL n/a +C Intel SKL n/a +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C TODO +C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we +C intend to support old systems. + +C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some +C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX. +C We fall back to the core2 code. +ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',` +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86_64/core2/hamdist.asm') +',` + +define(`up', `%rdi') +define(`vp', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_hamdist) + FUNC_ENTRY(3) + cmp $5, n + jl L(sma) + + lea L(cnsts)(%rip), %r9 + + xor R32(%r10), R32(%r10) + test $8, R8(vp) + jz L(ali) + mov (up), %r8 + xor (vp), %r8 + add $8, up + add $8, vp + dec n + popcnt %r8, %r10 +L(ali): + +ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)', + `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)') + movdqa OFF1`'(%r9), %xmm7 C nibble counts table + movdqa OFF2`'(%r9), %xmm6 C splat shift counts + movdqa OFF3`'(%r9), %xmm5 C masks + pxor %xmm4, %xmm4 + pxor %xmm8, %xmm8 C grand total count + + mov R32(n), R32(%rax) + and $6, R32(%rax) + lea -64(up,%rax,8), up + lea -64(vp,%rax,8), vp +ifdef(`PIC',` + movslq (%r9,%rax,2), %r11 + add %r9, %r11 + jmp *%r11 +',` + jmp *(%r9,%rax,4) +') + +L(0): add $64, up + add $64, vp + sub $2, n + + ALIGN(32) +L(top): lddqu (up), %xmm0 + pxor (vp), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm5, %xmm0 + pand %xmm5, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(6): lddqu 16(up), %xmm0 + pxor 16(vp), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm5, %xmm0 + pand %xmm5, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(4): lddqu 32(up), %xmm0 + pxor 32(vp), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, 
%xmm0, %xmm1 + pand %xmm5, %xmm0 + pand %xmm5, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe9,0x78,0xd3,0xc4 C vphaddubq %xmm4, %xmm0 + .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4 + paddb %xmm2, %xmm3 + paddb %xmm2, %xmm4 + paddq %xmm0, %xmm8 C sum to 2 x 64-bit counts +L(2): mov 48(up), %r8 + mov 56(up), %r9 + add $64, up + xor 48(vp), %r8 + xor 56(vp), %r9 + add $64, vp + popcnt %r8, %r8 + popcnt %r9, %r9 + add %r8, %r10 + add %r9, %r10 + sub $8, n + jg L(top) + + test $1, R8(n) + jz L(x) + mov (up), %r8 + xor (vp), %r8 + popcnt %r8, %r8 + add %r8, %r10 +L(x): .byte 0x8f,0xe9,0x78,0xd3,0xc4 C vphaddubq %xmm4, %xmm0 + paddq %xmm0, %xmm8 + pshufd $14, %xmm8, %xmm0 + paddq %xmm8, %xmm0 + movd %xmm0, %rax + add %r10, %rax + FUNC_EXIT() + ret + +L(sma): mov (up), %r8 + xor (vp), %r8 + popcnt %r8, %rax + dec n + jz L(ed) +L(tp): mov 8(up), %r8 + add $8, up + xor 8(vp), %r8 + add $8, vp + popcnt %r8, %r8 + add %r8, %rax + dec n + jnz L(tp) +L(ed): FUNC_EXIT() + ret +EPILOGUE() +DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + JMPENT( L(0), L(cnsts)) + JMPENT( L(2), L(cnsts)) + JMPENT( L(4), L(cnsts)) + JMPENT( L(6), L(cnsts)) + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 + .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 + .byte -4,-4,-4,-4,-4,-4,-4,-4 + .byte -4,-4,-4,-4,-4,-4,-4,-4 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) +') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm b/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm new file mode 100644 index 0000000..2fb097f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm @@ -0,0 +1,193 @@ +dnl AMD64 mpn_mul_1 optimised for AMD Bulldozer. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 3.65 +C AMD K10 3.30 3.68 +C AMD bull 4.04 4.29 +C AMD pile 4.33 +C AMD steam +C AMD excavator +C AMD bobcat 5.73 +C AMD jaguar 5.87 +C Intel P4 12.5 +C Intel core2 4.38 +C Intel NHM 4.28 +C Intel SBR 2.69 +C Intel IBR 2.55 +C Intel HWL 2.41 +C Intel BWL 2.49 +C Intel SKL 2.50 +C Intel atom 20.3 +C Intel SLM 7.8 +C VIA nano 4.25 + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Move loop code into feed-in blocks, to save insn for zeroing regs. 
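+C
+C Reference semantics (editorial sketch, not part of the original file):
+C mpn_mul_1 computes {rp,n} = {up,n} * v0 and returns the most significant
+C (carry) limb of the full product; mpn_mul_1c additionally adds the carry-in
+C argument.  A plain C equivalent, assuming 64-bit limbs and a compiler with
+C unsigned __int128, might look like the hypothetical mul_1_ref below.  The
+C assembly below implements the same loop, 4-way unrolled.
+C
+C   mp_limb_t mul_1_ref (mp_limb_t *rp, const mp_limb_t *up,
+C                        mp_size_t n, mp_limb_t v0)
+C   {
+C     mp_limb_t cy = 0;
+C     for (mp_size_t i = 0; i < n; i++)
+C       {
+C         unsigned __int128 p = (unsigned __int128) up[i] * v0 + cy;
+C         rp[i] = (mp_limb_t) p;          /* low limb of the product */
+C         cy = (mp_limb_t) (p >> 64);     /* high limb carries into the next step */
+C       }
+C     return cy;
+C   }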
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%rbx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`v0', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``rbx'') ') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1c) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it +IFDOS(` mov n, %r11 ') + mul v0 + +IFSTD(` add %r8, %rax ') +IFDOS(` add 64(%rsp), %rax ') C 40 + 3*8 (3 push insns) + adc $0, %rdx + jmp L(common) + +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mul_1) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it +IFDOS(` mov n, %r11 ') + mul v0 + +L(common): +IFSTD(` mov %r11, n ') + + and $3, R32(%r11) + lea -16(rp,n,8), rp + jz L(b0) + cmp $2, R32(%r11) + jb L(b1) + jz L(b2) + +L(b3): mov %rax, %r10 + mov %rdx, %r11 + mov 8(up), %rax + mul v0 + lea (up,n,8), up + not n + jmp L(L3) + +L(b0): mov %rax, %r9 + mov %rdx, %r10 + mov 8(up), %rax + lea (up,n,8), up + neg n + jmp L(L0) + +L(b1): mov %rax, %r8 + cmp $1, n + jz L(n1) + mov %rdx, %r9 + lea (up,n,8), up + neg n + mov %r8, 16(rp,n,8) + inc n + jmp L(L1) + +L(b2): mov %rax, %r11 + mov %rdx, %r8 + mov 8(up), %rax + lea (up,n,8), up + neg n + add $2, n + jns L(end) + + ALIGN(16) +L(top): mul v0 + mov %rdx, %r9 + add %rax, %r8 + adc $0, %r9 + mov %r8, 8(rp,n,8) + mov %r11, (rp,n,8) +L(L1): mov (up,n,8), %rax + mul v0 + add %rax, %r9 + mov %rdx, %r10 + mov 8(up,n,8), %rax + adc $0, %r10 +L(L0): mul v0 + add %rax, %r10 + mov %rdx, %r11 + mov 16(up,n,8), %rax + adc $0, %r11 + mul v0 + mov %r9, 16(rp,n,8) +L(L3): add %rax, %r11 + mov %r10, 24(rp,n,8) + mov %rdx, %r8 + adc $0, %r8 + add $4, n + mov -8(up,n,8), %rax + js L(top) + +L(end): mul v0 + add %rax, %r8 + adc $0, %rdx + mov %r11, (rp) +L(n1): mov %r8, 8(rp) + mov %rdx, %rax + + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm b/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm new file mode 100644 index 0000000..85fa7aa --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm @@ -0,0 +1,195 @@ +dnl AMD64 mpn_mul_2 optimised for AMD Bulldozer. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 6.78 +C AMD K10 6.78 +C AMD bd1 8.39 8.65 +C AMD bd2 8.47 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bt1 12.1 +C AMD bt2 11.5 +C Intel P4 24.0 +C Intel PNR 8.14 +C Intel NHM 7.78 +C Intel SBR 6.34 +C Intel IBR 6.15 +C Intel HWL 6.04 +C Intel BWL 4.33 +C Intel SKL 4.41 +C Intel atom 39.5 +C Intel SLM 27.8 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %rax + + mov (vp), v0 + mov 8(vp), v1 + + lea (up,n_param,8), up + lea (rp,n_param,8), rp + + mov n_param, n + mul v0 + neg n + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): mov %rax, w0 + mov %rdx, w1 + xor R32(w2), R32(w2) + mov (up,n,8), %rax + jmp L(lo0) + +L(b10): mov %rax, w2 + mov %rdx, w3 + mov (up,n,8), %rax + xor R32(w0), R32(w0) + mul v1 + add $-2, n + jmp L(lo2) + +L(bx1): test $2, R8(n) + jz L(b11) + +L(b01): mov %rax, w3 + mov %rdx, w0 + mov (up,n,8), %rax + mul v1 + xor R32(w1), R32(w1) + inc n + jmp L(lo1) + +L(b11): mov %rax, w1 + mov %rdx, w2 + mov (up,n,8), %rax + xor R32(w3), R32(w3) + dec n + jmp L(lo3) + + ALIGN(32) +L(top): mov -8(up,n,8), %rax + mul v1 + mov w2, -16(rp,n,8) +L(lo1): add %rax, w0 + mov w3, -8(rp,n,8) + adc %rdx, w1 + mov (up,n,8), %rax + mul v0 + mov $0, R32(w2) + add %rax, w0 + adc %rdx, w1 + adc $0, R32(w2) + mov (up,n,8), %rax +L(lo0): mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,n,8), %rax + mul v0 + add %rax, w1 + mov w0, (rp,n,8) + mov $0, R32(w3) + mov 8(up,n,8), %rax + adc %rdx, w2 + adc $0, R32(w3) +L(lo3): mul v1 + add %rax, w2 + mov 16(up,n,8), %rax + adc %rdx, w3 + mul v0 + add %rax, w2 + mov 16(up,n,8), %rax + mov $0, R32(w0) + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov w1, 8(rp,n,8) +L(lo2): add %rax, w3 + adc %rdx, w0 + mov 24(up,n,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + mov $0, R32(w1) + adc $0, R32(w1) + add $4, n + jnc L(top) + +L(end): mov -8(up), %rax + mul v1 + mov w2, -16(rp) + add %rax, w0 + mov w3, -8(rp) + adc %rdx, w1 + mov w0, (rp) + mov w1, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm new file mode 100644 index 0000000..e47ba58 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm @@ -0,0 +1,416 @@ +dnl AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 mul_2 mul_3 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull ~4.8 ~4.55 - ~4.3 +C AMD pile ~4.6 ~4.55 - ~4.55 +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Merge bull-specific mul_1, if it is not slower the TOOM22 range. +C Alternatively, we could tweak the present code (which was loopmixed for a +C different CPU). +C * Merge faster mul_2, such as the one in the same directory as this file. +C * Further micro-optimise. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`un', `%rbx') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`n', `%rbp') +define(`v0', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + mov un_param, un C free up rdx + neg un + + mov (up), %rax C shared for mul_1 and mul_2 + lea (up,un_param,8), up C point at operand end + lea (rp,un_param,8), rp C point at rp[un-1] + + mov (vp), v0 C shared for mul_1 and mul_2 + mul v0 C shared for mul_1 and mul_2 + + test $1, R8(vn) + jz L(do_mul_2) + +L(do_mul_1): + test $1, R8(un) + jnz L(m1x1) + +L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ... + mov %rdx, w1 + mov 8(up,un,8), %rax + test $2, R8(un) + jnz L(m110) + +L(m100):lea 2(un), n C un = 4, 8, 12, ... + jmp L(m1l0) + +L(m110):lea (un), n C un = 2, 6, 10, ... + jmp L(m1l2) + +L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ... + mov %rdx, w0 + test $2, R8(un) + jz L(m111) + +L(m101):lea 3(un), n C un = 1, 5, 9, ... + test n, n + js L(m1l1) + mov %rax, -8(rp) + mov %rdx, (rp) + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(m111):lea 1(un), n C un = 3, 7, 11, ... 
+ mov 8(up,un,8), %rax + jmp L(m1l3) + + ALIGN(16) +L(m1tp):mov %rdx, w0 + add %rax, w1 +L(m1l1):mov -16(up,n,8), %rax + adc $0, w0 + mul v0 + add %rax, w0 + mov w1, -24(rp,n,8) + mov -8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(m1l0):mul v0 + mov w0, -16(rp,n,8) + add %rax, w1 + mov %rdx, w0 + mov (up,n,8), %rax + adc $0, w0 +L(m1l3):mul v0 + mov w1, -8(rp,n,8) + mov %rdx, w1 + add %rax, w0 + mov 8(up,n,8), %rax + adc $0, w1 +L(m1l2):mul v0 + mov w0, (rp,n,8) + add $4, n + jnc L(m1tp) + +L(m1ed):add %rax, w1 + adc $0, %rdx + mov w1, I(-8(rp),-24(rp,n,8)) + mov %rdx, I((rp),-16(rp,n,8)) + + dec R32(vn) + jz L(ret2) + + lea 8(vp), vp + lea 8(rp), rp + push %r12 + push %r13 + push %r14 + jmp L(do_addmul) + +L(do_mul_2): +define(`v1', `%r14') + push %r12 + push %r13 + push %r14 + + mov 8(vp), v1 + + test $1, R8(un) + jnz L(m2b1) + +L(m2b0):lea (un), n + mov %rax, w2 C 0 + mov (up,un,8), %rax + mov %rdx, w1 C 1 + mul v1 + mov %rax, w0 C 1 + mov w2, (rp,un,8) C 0 + mov 8(up,un,8), %rax + mov %rdx, w2 C 2 + jmp L(m2l0) + +L(m2b1):lea 1(un), n + mov %rax, w0 C 1 + mov %rdx, w3 C 2 + mov (up,un,8), %rax + mul v1 + mov w0, (rp,un,8) C 1 + mov %rdx, w0 C 3 + mov %rax, w2 C 0 + mov 8(up,un,8), %rax + jmp L(m2l1) + + ALIGN(32) +L(m2tp):add %rax, w2 C 0 + mov (up,n,8), %rax + adc $0, w0 C 1 +L(m2l1):mul v0 + add %rax, w2 C 0 + mov (up,n,8), %rax + mov %rdx, w1 C 1 + adc $0, w1 C 1 + mul v1 + add w3, w2 C 0 + adc $0, w1 C 1 + add %rax, w0 C 1 + mov w2, (rp,n,8) C 0 + mov 8(up,n,8), %rax + mov %rdx, w2 C 2 + adc $0, w2 C 2 +L(m2l0):mul v0 + add %rax, w0 C 1 + mov %rdx, w3 C 2 + adc $0, w3 C 2 + add w1, w0 C 1 + adc $0, w3 C 2 + mov 8(up,n,8), %rax + mul v1 + add $2, n + mov w0, -8(rp,n,8) C 1 + mov %rdx, w0 C 3 + jnc L(m2tp) + +L(m2ed):add %rax, w2 + adc $0, %rdx + add w3, w2 + adc $0, %rdx + mov w2, I((rp),(rp,n,8)) + mov %rdx, I(8(rp),8(rp,n,8)) + + add $-2, R32(vn) + jz L(ret5) + + lea 16(vp), vp + lea 16(rp), rp + + +L(do_addmul): + push %r15 + push vn C save vn in new stack slot +define(`vn', `(%rsp)') +define(`X0', `%r14') +define(`X1', `%r15') +define(`v1', `%r8') + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + + mov (up,un,8), %rax + mul v0 + + test $1, R8(un) + jnz L(bx1) + +L(bx0): mov %rax, X1 + mov (up,un,8), %rax + mov %rdx, X0 + mul v1 + test $2, R8(un) + jnz L(b10) + +L(b00): lea (un), n C un = 4, 8, 12, ... + mov (rp,un,8), w3 + mov %rax, w0 + mov 8(up,un,8), %rax + mov %rdx, w1 + jmp L(lo0) + +L(b10): lea 2(un), n C un = 2, 6, 10, ... + mov (rp,un,8), w1 + mov %rdx, w3 + mov %rax, w2 + mov 8(up,un,8), %rax + jmp L(lo2) + +L(bx1): mov %rax, X0 + mov (up,un,8), %rax + mov %rdx, X1 + mul v1 + test $2, R8(un) + jz L(b11) + +L(b01): lea 1(un), n C un = 1, 5, 9, ... + mov (rp,un,8), w2 + mov %rdx, w0 + mov %rax, w3 + jmp L(lo1) + +L(b11): lea -1(un), n C un = 3, 7, 11, ... 
+ mov (rp,un,8), w0 + mov %rax, w1 + mov 8(up,un,8), %rax + mov %rdx, w2 + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo2): mul v0 + add w1, X1 + mov X1, -16(rp,n,8) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov -8(up,n,8), %rax + mul v1 + mov -8(rp,n,8), w1 + mov %rdx, w0 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo1): mov (up,n,8), %rax + mul v0 + add w2, X0 + mov X0, -8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + mov (up,n,8), %rax + adc $0, X0 + mov (rp,n,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + mov 8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(lo0): mul v0 + add w3, X1 + mov X1, (rp,n,8) + adc %rax, X0 + mov 8(up,n,8), %rax + mov %rdx, X1 + adc $0, X1 + mov 8(rp,n,8), w3 + mul v1 + add w3, w0 + adc %rax, w1 + mov 16(up,n,8), %rax + mov %rdx, w2 + adc $0, w2 +L(lo3): mul v0 + add w0, X0 + mov X0, 8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + adc $0, X0 + mov 16(up,n,8), %rax + mov 16(rp,n,8), w0 + mul v1 + mov %rdx, w3 + add w0, w1 + adc %rax, w2 + adc $0, w3 + mov 24(up,n,8), %rax + add $4, n + jnc L(top) + +L(end): mul v0 + add w1, X1 + mov X1, I(-16(rp),-16(rp,n,8)) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov I(-8(up),-8(up,n,8)), %rax + mul v1 + mov I(-8(rp),-8(rp,n,8)), w1 + add w1, w2 + adc %rax, w3 + adc $0, %rdx + add w2, X0 + adc $0, X1 + mov X0, I(-8(rp),-8(rp,n,8)) + add w3, X1 + mov X1, I((rp),(rp,n,8)) + adc $0, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + + addl $-2, vn + lea 16(vp), vp + lea 16(rp), rp + jnz L(outer) + + pop %rax C deallocate vn slot + pop %r15 +L(ret5):pop %r14 + pop %r13 + pop %r12 +L(ret2):pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm b/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm new file mode 100644 index 0000000..7b084f4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm @@ -0,0 +1,191 @@ +dnl AMD64 SSSE3/XOP mpn_popcount -- population count. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb good for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 1.27 y +C AMD bd2 1.24 y +C AMD bd3 ? +C AMD bd4 1.22 +C AMD zen n/a +C AMD bobcat n/a +C AMD jaguar n/a +C Intel P4 n/a +C Intel CNR n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL n/a +C Intel BWL n/a +C Intel SKL n/a +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C TODO +C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we +C intend to support old systems. 
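+C
+C The XOP loop below uses the classical 4-bit table method: each byte is split
+C into its low and high nibble (vpshlb with a -4 shift count plus the 0x0f
+C mask), vpperm looks both nibbles up in the 0,1,1,2,... count table kept in a
+C register, and the per-byte counts are accumulated with paddb and folded into
+C 64-bit totals with vphaddubq.  An editorial scalar sketch of the same table
+C idea for a single 64-bit limb (hypothetical popcount_ref, not part of GMP):
+C
+C   static const unsigned char nibble_cnt[16] =
+C     {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
+C
+C   unsigned popcount_ref (unsigned long x)
+C   {
+C     unsigned c = 0;
+C     for (int i = 0; i < 16; i++)        /* 16 nibbles per 64-bit limb */
+C       {
+C         c += nibble_cnt[x & 0xf];
+C         x >>= 4;
+C       }
+C     return c;
+C   }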
+ +C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some +C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX. +C We fall back to the core2 code. +ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',` +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86_64/core2/popcount.asm') +',` + +define(`up', `%rdi') +define(`n', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_popcount) + FUNC_ENTRY(3) + lea L(cnsts)(%rip), %r9 + +ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)', + `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)') + movdqa OFF1`'(%r9), %xmm7 C nibble counts table + movdqa OFF2`'(%r9), %xmm6 C splat shift counts + movdqa OFF3`'(%r9), %xmm9 C masks + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 C 0-reg + pxor %xmm8, %xmm8 C grand total count + + xor R32(%rdx), R32(%rdx) + + mov R32(n), R32(%rax) + and $7, R32(%rax) +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + +L(1): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up),%rdx + add $8, up + dec n + jnz L(top) + mov %rdx, %rax + FUNC_EXIT() + ret + +L(2): add $-48, up + jmp L(e2) + +L(3): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx + add $-40, up + jmp L(e2) + +L(4): add $-32, up + jmp L(e4) + +L(5): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx + add $-24, up + jmp L(e4) + +L(6): add $-16, up + jmp L(e6) + +L(7): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx + add $-8, up + jmp L(e6) + + ALIGN(32) +L(top): lddqu (up), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm9, %xmm0 + pand %xmm9, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1, %xmm7, %xmm7, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e6): lddqu 16(up), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm9, %xmm0 + pand %xmm9, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e4): lddqu 32(up), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm9, %xmm0 + pand %xmm9, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0, %xmm7, %xmm7, %xmm2 + .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5 + .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4 + paddb %xmm2, %xmm4 +L(e2): popcnt 48(up), %r8 + popcnt 56(up), %r9 + add $64, up + paddq %xmm5, %xmm8 C sum to 2 x 64-bit counts + add %r8, %rdx + add %r9, %rdx + sub $8, n + jg L(top) + + .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5 + paddq %xmm5, %xmm8 + pshufd $14, %xmm8, %xmm0 + paddq %xmm8, %xmm0 + movd %xmm0, %rax + add %rdx, %rax + FUNC_EXIT() + ret +EPILOGUE() +DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + JMPENT( L(top), L(cnsts)) + JMPENT( L(1), L(cnsts)) + JMPENT( L(2), L(cnsts)) + JMPENT( L(3), L(cnsts)) + JMPENT( L(4), L(cnsts)) + JMPENT( L(5), L(cnsts)) + JMPENT( L(6), L(cnsts)) + JMPENT( L(7), L(cnsts)) + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 + .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 + .byte -4,-4,-4,-4,-4,-4,-4,-4 + .byte -4,-4,-4,-4,-4,-4,-4,-4 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) +') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm new file mode 100644 index 0000000..e436034 
--- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm new file mode 100644 index 0000000..4ba673d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_sublsh1_n + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc) +include_mpn(`x86_64/atom/sublsh1_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm new file mode 100644 index 0000000..b167077 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm @@ -0,0 +1,96 @@ +dnl AMD64 mpn_gcd_11 optimised for AMD BD2, BD3, BT2. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 5.4 +C AMD bd2 3.72 +C AMD bd3 ? +C AMD bd4 4.12 +C AMD bt1 9.0 +C AMD bt2 3.97 +C AMD zn1 3.36 +C AMD zn2 3.33 +C Intel P4 ? +C Intel CNR ? +C Intel PNR ? +C Intel NHM ? +C Intel WSM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? + +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + mov v0, %rdx + sub u0, %rdx + jz L(end) + + ALIGN(16) +L(top): rep;bsf %rdx, %rcx C tzcnt! + mov u0, %rax + sub v0, u0 C u - v + cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shr R8(%rcx), u0 + mov v0, %rdx + sub u0, %rdx C v - u + jnz L(top) + +L(end): mov v0, %rax + C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm new file mode 100644 index 0000000..a4f30ea --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm @@ -0,0 +1,142 @@ +dnl AMD64 mpn_gcd_22. Assumes useless bsf, useless shrd, tzcnt, no shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 12.3 +C AMD K10 8.0 +C AMD bd1 10.0 +C AMD bd2 7.2 +C AMD bd3 ? +C AMD bd4 6.7 +C AMD bt1 13.6 +C AMD bt2 8.9 +C AMD zn1 5.7 +C AMD zn2 5.6 +C Intel P4 ? 
+C Intel CNR 9.7 +C Intel PNR 9.7 +C Intel NHM 9.4 +C Intel WSM 9.5 +C Intel SBR 10.3 +C Intel IBR ? +C Intel HWL 8.2 +C Intel BWL 7.4 +C Intel SKL 7.3 +C Intel atom 26.5 +C Intel SLM 17.4 +C Intel GLM 13.4 +C Intel GLM+ 12.4 +C VIA nano ? + + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + mov v0_param, v0 + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 + + rep;bsf t0, cnt C tzcnt! + mov u0, s0 + mov u1, s1 + + sub v0, u0 + sbb v1, u1 + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + +C Rightshift (u1,,u0) into (u1,,u0) +L(shr): shr R8(cnt), u0 + mov u1, t1 + shr R8(cnt), u1 + neg cnt + shl R8(cnt), t1 + or t1, u0 + + test v1, v1 + jnz L(top) + test u1, u1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 + rep;bsf t0, cnt C tzcnt! + mov u0, s0 + mov u1, s1 + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h new file mode 100644 index 0000000..61573ea --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h @@ -0,0 +1,263 @@ +/* AMD bd2 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 4000-4200 MHz Piledriver Vishera */ +/* FFT tuning limit = 464,626,631 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 23 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 34 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 293 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 152 +#define MUL_TOOM6H_THRESHOLD 230 +#define MUL_TOOM8H_THRESHOLD 309 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 103 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 142 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 200 +#define SQR_TOOM6_THRESHOLD 286 +#define SQR_TOOM8_THRESHOLD 430 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 372, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 39,10}, { 23, 9}, { 55,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79, 8}, \ + { 639, 9}, { 335,10}, { 175, 9}, { 351,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255,11}, \ + { 143,10}, { 287,11}, { 159,12}, { 95,11}, \ + { 191,13}, { 63,12}, { 127,11}, { 271,10}, \ + { 543,11}, { 287,12}, { 159,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,12}, { 319,11}, { 639,10}, { 1279,12}, \ + { 351,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,10}, { 1663,12}, { 447,14}, \ + { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,11}, { 1087,10}, { 2175,12}, { 575,11}, \ + { 1151,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,10}, { 2687,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,10}, { 4863,13}, { 639,12}, { 1343,11}, \ + { 2687,13}, { 703,12}, { 1407,11}, { 2815,14}, \ + { 383,13}, { 767,12}, { 1599,13}, { 831,12}, \ + { 1727,11}, { 3455,13}, { 895,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,12}, \ + { 2431,11}, { 4863,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1407,12}, { 2815,13}, { 1471,12}, \ + { 2943,11}, { 5887,14}, { 767,13}, { 1599,12}, \ + { 3199,13}, { 1727,12}, { 3455,14}, 
{ 895,13}, \ + { 1791,12}, { 3583,13}, { 1919,12}, { 3839,11}, \ + { 7679,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,12}, { 5631,13}, { 2943,12}, { 5887,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \ + { 1919,13}, { 3839,12}, { 7679,16}, { 511,15}, \ + { 1023,14}, { 2175,13}, { 4479,14}, { 2303,13}, \ + { 4607,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2815,13}, { 5631,14}, { 2943,13}, { 5887,12}, \ + { 11775,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \ + { 2047,14}, { 4479,13}, { 8959,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \ + { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,14}, \ + { 15871,17}, { 2047,16}, { 4095,15}, { 8959,16}, \ + { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 262 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 344, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63, 9}, \ + { 511,10}, { 271,11}, { 143,10}, { 303,11}, \ + { 159,12}, { 95,11}, { 191,13}, { 63,12}, \ + { 127,11}, { 287,10}, { 575,11}, { 303,12}, \ + { 159,11}, { 351,12}, { 191,11}, { 383,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,12}, { 287,11}, { 575,10}, \ + { 1151,11}, { 607,12}, { 319,11}, { 639,10}, \ + { 1279,12}, { 351,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,10}, { 2175,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,10}, \ + { 2687,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \ + { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \ + { 639,12}, { 1343,11}, { 2687,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1599,13}, \ + { 831,12}, { 1727,13}, { 895,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,12}, { 2431,11}, { 4863,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ + { 2815,13}, { 1471,12}, { 2943,11}, { 5887,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \ + { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \ + { 1407,13}, { 2943,12}, { 
5887,11}, { 11775,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \ + { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4479,14}, { 2303,13}, { 4607,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2815,13}, \ + { 5631,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4479,13}, { 8959,15}, { 2303,14}, { 4863,15}, \ + { 2815,14}, { 5887,13}, { 11775,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4351,14}, { 8959,15}, \ + { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 6911,16}, { 3583,15}, { 7679,14}, \ + { 15359,15}, { 7935,14}, { 15871,17}, { 2047,16}, \ + { 4095,15}, { 8959,16}, { 4607,15}, { 9983,14}, \ + { 19967,16}, { 5119,15}, { 10239,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 254 +#define SQR_FFT_THRESHOLD 2880 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 30 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 53 +#define SQRLO_SQR_THRESHOLD 5724 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 159 +#define DC_BDIV_QR_THRESHOLD 44 +#define DC_BDIV_Q_THRESHOLD 79 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 172 +#define INV_APPR_THRESHOLD 172 + +#define BINV_NEWTON_THRESHOLD 226 +#define REDC_1_TO_REDC_2_THRESHOLD 40 +#define REDC_2_TO_REDC_N_THRESHOLD 51 + +#define MU_DIV_QR_THRESHOLD 1308 +#define MU_DIVAPPR_Q_THRESHOLD 1258 +#define MUPI_DIV_QR_THRESHOLD 85 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1210 + +#define POWM_SEC_TABLE 3,16,129,523,1297 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 228 +#define SET_STR_PRECOMPUTE_THRESHOLD 1033 + +#define FAC_DSC_THRESHOLD 172 +#define FAC_ODD_THRESHOLD 28 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 8.54% faster than 3 */ +#define HGCD_THRESHOLD 108 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 393 +#define GCDEXT_DC_THRESHOLD 278 +#define JACOBI_BASE_METHOD 4 /* 13.69% faster than 1 */ + +/* Tuneup completed successfully, took 463931 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm new file mode 100644 index 0000000..ff0d27b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +include_mpn(`x86_64/zen/aorrlsh_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm new file mode 100644 index 0000000..4176b85 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm @@ -0,0 +1,96 @@ +dnl AMD64 mpn_gcd_11 optimised for AMD BD4, ZN1. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 3.73 +C AMD bt1 - +C AMD bt2 - +C AMD zn1 3.33 +C AMD zn2 3.48 +C Intel P4 - +C Intel CNR - +C Intel PNR - +C Intel NHM - +C Intel WSM - +C Intel SBR - +C Intel IBR - +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom - +C Intel SLM - +C Intel GLM - +C Intel GLM+ - +C VIA nano - + +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + mov u0, %rax + mov v0, %rdx + sub u0, %rdx C v - u + jz L(end) + + ALIGN(16) +L(top): rep;bsf %rdx, %rcx C tzcnt! + sub v0, u0 C u - v + cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shrx( %rcx, u0, %rax) + shrx( %rcx, u0, u0) + mov v0, %rdx + sub %rax, %rdx C v - u + jnz L(top) + +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm new file mode 100644 index 0000000..5dfd9e3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_22) +include_mpn(`x86_64/coreihwl/gcd_22.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h new file mode 100644 index 0000000..9d2038c --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h @@ -0,0 +1,266 @@ +/* AMD bd4 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3800-4200 MHz Excavator/Bristol Ridge */ +/* FFT tuning limit = 461,179,335 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 17 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 52 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 25 + +#define DIV_1_VS_MUL_1_PERCENT 298 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 53 +#define MUL_TOOM44_THRESHOLD 142 +#define MUL_TOOM6H_THRESHOLD 206 +#define MUL_TOOM8H_THRESHOLD 292 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 83 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 102 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 98 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 82 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 71 +#define SQR_TOOM4_THRESHOLD 202 +#define SQR_TOOM6_THRESHOLD 298 +#define SQR_TOOM8_THRESHOLD 466 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 14 + +#define MUL_FFT_MODF_THRESHOLD 316 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 316, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 99,10}, { 55,11}, { 31,10}, { 87,11}, \ + { 47,10}, { 95, 9}, { 191,10}, { 103,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 135, 9}, { 271, 5}, { 4351, 6}, { 2303, 7}, \ + { 1215, 8}, { 639,10}, { 175,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207, 9}, { 415,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415, 9}, \ + { 831,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 639,12}, { 351,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 543,11}, { 1087,12}, { 607,13}, \ + { 319,12}, { 671,11}, { 1343,10}, { 2687,12}, \ + { 703,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,13}, { 447,12}, { 895,11}, { 1791,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1087,13}, \ + { 575,12}, { 1151,11}, { 2303,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \ + { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,12}, { 2431,14}, { 639,13}, 
\ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1663,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1535,13}, { 3071,14}, \ + { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \ + { 3583,14}, { 1919,16}, { 511,15}, { 1023,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \ + { 7679,16}, { 1023,15}, { 2047,14}, { 4351,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3071,14}, { 6143,15}, { 3327,14}, \ + { 6911,15}, { 3839,14}, { 7679,17}, { 1023,16}, \ + { 2047,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,17}, { 2047,16}, { 4095,15}, { 8191,16}, \ + { 4607,15}, { 9983,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 253 +#define MUL_FFT_THRESHOLD 4224 + +#define SQR_FFT_MODF_THRESHOLD 300 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 300, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 63,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95, 9}, \ + { 191, 8}, { 383,10}, { 103,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \ + { 271, 8}, { 543,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175,11}, { 95,10}, \ + { 191, 9}, { 383, 5}, { 6399, 6}, { 3327, 7}, \ + { 1727, 6}, { 3455, 7}, { 1791,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415, 9}, { 831,13}, { 63,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,10}, { 895,11}, \ + { 479,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \ + { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1151,11}, { 2303,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1343,13}, \ + { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1023,12}, \ + { 2047,13}, { 1087,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,13}, { 1791,12}, 
{ 3583,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,14}, { 1791,13}, { 3583,14}, { 1919,16}, \ + { 511,15}, { 1023,14}, { 2303,13}, { 4607,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2815,13}, \ + { 5631,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3455,15}, { 1791,14}, { 3583,13}, { 7167,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4223,15}, { 2303,14}, { 4863,15}, { 2815,14}, \ + { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 3583,14}, { 7167,15}, \ + { 3839,14}, { 7679,17}, { 1023,16}, { 2047,15}, \ + { 4095,14}, { 8191,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4095,15}, { 8447,16}, { 4607,15}, { 9983,16}, \ + { 5119,15}, { 10239,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 273 +#define SQR_FFT_THRESHOLD 2752 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 43 +#define MULLO_MUL_N_THRESHOLD 8397 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 54 +#define SQRLO_SQR_THRESHOLD 5397 + +#define DC_DIV_QR_THRESHOLD 39 +#define DC_DIVAPPR_Q_THRESHOLD 165 +#define DC_BDIV_QR_THRESHOLD 39 +#define DC_BDIV_Q_THRESHOLD 76 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 177 +#define INV_APPR_THRESHOLD 155 + +#define BINV_NEWTON_THRESHOLD 230 +#define REDC_1_TO_REDC_2_THRESHOLD 28 +#define REDC_2_TO_REDC_N_THRESHOLD 43 + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 66 +#define MU_BDIV_QR_THRESHOLD 998 +#define MU_BDIV_Q_THRESHOLD 1142 + +#define POWM_SEC_TABLE 1,16,175,269,839,1420 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 552 +#define SET_STR_PRECOMPUTE_THRESHOLD 1038 + +#define FAC_DSC_THRESHOLD 151 +#define FAC_ODD_THRESHOLD 23 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 1 /* 8.11% faster than 3 */ +#define HGCD_THRESHOLD 87 +#define HGCD_APPR_THRESHOLD 96 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 327 +#define GCDEXT_DC_THRESHOLD 241 +#define JACOBI_BASE_METHOD 4 /* 21.40% faster than 1 */ + +/* Tuneup completed successfully, took 431056 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm new file mode 100644 index 0000000..a53bd52 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm @@ -0,0 +1,106 @@ +dnl x86_64 mpn_bdiv_dbm1. + +dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.25 +C AMD K10 2.25 +C Intel P4 12.5 +C Intel core2 4 +C Intel NHM 3.75 +C Intel SBR 3.6 +C Intel atom 20 +C VIA nano 4 + +C TODO +C * Optimise feed-in code. + +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`up', `%rsi') +define(`n_param', `%rdx') +define(`bd', `%rcx') +define(`cy', `%r8') + +define(`n', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_dbm1c) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + mov (up), %rax + mov n_param, n + mov R32(n_param), R32(%r11) + mul bd + lea (up,n,8), up + lea (qp,n,8), qp + neg n + and $3, R32(%r11) + jz L(lo0) + lea -4(n,%r11), n + cmp $2, R32(%r11) + jc L(lo1) + jz L(lo2) + jmp L(lo3) + + ALIGN(16) +L(top): mov (up,n,8), %rax + mul bd +L(lo0): sub %rax, %r8 + mov %r8, (qp,n,8) + sbb %rdx, %r8 + mov 8(up,n,8), %rax + mul bd +L(lo3): sub %rax, %r8 + mov %r8, 8(qp,n,8) + sbb %rdx, %r8 + mov 16(up,n,8), %rax + mul bd +L(lo2): sub %rax, %r8 + mov %r8, 16(qp,n,8) + sbb %rdx, %r8 + mov 24(up,n,8), %rax + mul bd +L(lo1): sub %rax, %r8 + mov %r8, 24(qp,n,8) + sbb %rdx, %r8 + add $4, n + jnz L(top) + + mov %r8, %rax + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm new file mode 100644 index 0000000..85538c9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm @@ -0,0 +1,195 @@ +dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. + +dnl Copyright 2001, 2002, 2004-2006, 2010-2012, 2017 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
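The routine below divides {up,n} exactly by a single limb in the 2-adic (Hensel) sense: for an odd divisor it builds d^-1 mod 2^64 from an 8-bit table seed (binvert_limb_table) refined by Newton steps of the form inv = 2*inv - inv*inv*d, then emits one quotient limb per iteration from the low product while propagating a borrow taken from the high product. A hedged C sketch of both steps follows; the (3*d)^2 seed and the use of unsigned __int128 are assumptions of this sketch, not what the assembly does, and it only covers the odd-divisor, exact-division case.

    #include <stdint.h>
    #include <stddef.h>

    /* Inverse of odd d modulo 2^64 by Newton iteration.  (3*d)^2 seeds
       about 5 correct low bits; each step doubles the precision. */
    static uint64_t binvert_64 (uint64_t d)
    {
      uint64_t inv = (3 * d) ^ 2;
      inv *= 2 - d * inv;          /* ~10 bits */
      inv *= 2 - d * inv;          /* ~20 bits */
      inv *= 2 - d * inv;          /* ~40 bits */
      inv *= 2 - d * inv;          /* 64 bits  */
      return inv;
    }

    /* {qp,n} = {up,n} / d, assuming d is odd and divides {up,n} exactly. */
    static void bdiv_q_1_sketch (uint64_t *qp, const uint64_t *up,
                                 size_t n, uint64_t d)
    {
      uint64_t di = binvert_64 (d), c = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint64_t u = up[i] - c;            /* apply carry limb */
          uint64_t borrow = up[i] < c;       /* did it wrap? */
          uint64_t q = u * di;               /* quotient limb mod 2^64 */
          qp[i] = q;
          c = (uint64_t) (((unsigned __int128) q * d) >> 64) + borrow;
        }
    }
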
+ +include(`../config.m4') + +C cycles/limb cycles/limb +C norm unorm +C AMD K8,K9 11 11 +C AMD K10 11 11 +C AMD bull 13.5 14 +C AMD pile 14 15 +C AMD steam +C AMD excavator +C AMD bobcat 14 14 +C AMD jaguar 14.5 15 +C Intel P4 33 33 +C Intel core2 13.5 13.25 +C Intel NHM 14 14 +C Intel SBR 8 8.25 +C Intel IBR 7.75 7.85 +C Intel HWL 8 8 +C Intel BWL 8 8 +C Intel SKL 8 8 +C Intel atom 34 36 +C Intel SLM 13.7 13.5 +C VIA nano 19.25 19.25 needs re-measuring + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`d', `%rcx') +define(`di', `%r8') C just mpn_pi1_bdiv_q_1 +define(`ncnt', `%r9') C just mpn_pi1_bdiv_q_1 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_q_1) + FUNC_ENTRY(4) + push %rbx + + mov %rcx, %rax + xor R32(%rcx), R32(%rcx) C ncnt count + mov %rdx, %r10 + + bt $0, R32(%rax) + jnc L(evn) C skip bsf unless divisor is even + +L(odd): mov %rax, %rbx + shr R32(%rax) + and $127, R32(%rax) C d/2, 7 bits + + LEA( binvert_limb_table, %rdx) + + movzbl (%rdx,%rax), R32(%rax) C inv 8 bits + + mov %rbx, %r11 C d without twos + + lea (%rax,%rax), R32(%rdx) C 2*inv + imul R32(%rax), R32(%rax) C inv*inv + imul R32(%rbx), R32(%rax) C inv*inv*d + sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits + + lea (%rdx,%rdx), R32(%rax) C 2*inv + imul R32(%rdx), R32(%rdx) C inv*inv + imul R32(%rbx), R32(%rdx) C inv*inv*d + sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits + + lea (%rax,%rax), %r8 C 2*inv + imul %rax, %rax C inv*inv + imul %rbx, %rax C inv*inv*d + sub %rax, %r8 C inv = 2*inv - inv*inv*d, 64 bits + + jmp L(pi1) + +L(evn): bsf %rax, %rcx + shr R8(%rcx), %rax + jmp L(odd) +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + push %rbx + + mov %rcx, %r11 C d + mov %rdx, %r10 C n + mov %r9, %rcx C ncnt + +L(pi1): mov (up), %rax C up[0] + + dec %r10 + jz L(one) + + lea 8(up,%r10,8), up C up end + lea (rp,%r10,8), rp C rp end + neg %r10 C -n + + test R32(%rcx), R32(%rcx) + jnz L(unorm) C branch if count != 0 + xor R32(%rbx), R32(%rbx) + jmp L(nent) + + ALIGN(8) +L(ntop):mul %r11 C carry limb in rdx 0 10 + mov -8(up,%r10,8), %rax C + sub %rbx, %rax C apply carry bit + setc R8(%rbx) C + sub %rdx, %rax C apply carry limb 5 + adc $0, R32(%rbx) C 6 +L(nent):imul %r8, %rax C 6 + mov %rax, (rp,%r10,8) C + inc %r10 C + jnz L(ntop) + + mov -8(up), %r9 C up high limb + jmp L(com) + +L(unorm): + mov (up,%r10,8), %r9 C up[1] + shr R8(%rcx), %rax C + neg R32(%rcx) + shl R8(%rcx), %r9 C + neg R32(%rcx) + or %r9, %rax + xor R32(%rbx), R32(%rbx) + jmp L(uent) + + ALIGN(8) +L(utop):mul %r11 C carry limb in rdx 0 10 + mov (up,%r10,8), %rax C + shl R8(%rcx), %rax C + neg R32(%rcx) + or %r9, %rax + sub %rbx, %rax C apply carry bit + setc R8(%rbx) C + sub %rdx, %rax C apply carry limb 5 + adc $0, R32(%rbx) C 6 +L(uent):imul %r8, %rax C 6 + mov (up,%r10,8), %r9 C + shr R8(%rcx), %r9 C + neg R32(%rcx) + mov %rax, (rp,%r10,8) C + inc %r10 C + jnz L(utop) + +L(com): mul %r11 C carry limb in rdx + sub %rbx, %r9 C apply carry bit + sub %rdx, %r9 C apply carry limb + imul %r8, %r9 + mov %r9, (rp) + pop %rbx + FUNC_EXIT() + ret + +L(one): shr R8(%rcx), %rax + imul %r8, %rax + mov %rax, (rp) + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm b/gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm new file mode 100644 index 0000000..9b6b5c7 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm @@ -0,0 +1,159 @@ +dnl AMD64 mpn_add_n, 
mpn_sub_n optimised for bobcat. + +dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1.77 +C AMD K10 1.76\1.82 +C AMD bd1 1.67\2.12 +C AMD bd2 1.62\1.82 +C AMD bd3 +C AMD bd4 1.55\2.2 +C AMD zen +C AMD bt1 2.54 +C AMD bt2 2 +C Intel P4 11 +C Intel PNR 4.76 +C Intel NHM 5.27 +C Intel SBR 2 +C Intel IBR 1.94 +C Intel HWL 1.63 +C Intel BWL 1.51 +C Intel SKL 1.51 +C Intel atom 3.56 +C Intel SLM 4 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C INPUT PARAMETERS +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + xor %r8, %r8 +L(ent): test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): shr $2, n + neg %r8 + mov $3, R32(%rax) + mov (up), %r10 + mov 8(up), %r11 + jmp L(lo0) + +L(b10): shr $2, n + neg %r8 + mov $1, R32(%rax) + mov (up), %r8 + mov 8(up), %r9 + jrcxz L(cj2) + jmp L(top) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): shr $2, n + neg %r8 + mov $0, R32(%rax) + mov (up), %r9 + jrcxz L(cj1) + mov 8(up), %r10 + jmp L(lo1) + + ALIGN(8) +L(b11): inc n + shr $2, n + neg %r8 + mov $2, R32(%rax) + mov (up), %r11 + jmp L(lo3) + + ALIGN(4) +L(top): mov 8(up,%rax,8), %r10 + ADCSBB -8(vp,%rax,8), %r8 + mov %r8, -8(rp,%rax,8) +L(lo1): mov 16(up,%rax,8), %r11 + ADCSBB (vp,%rax,8), %r9 + lea 4(%rax), %rax + mov %r9, -32(rp,%rax,8) +L(lo0): ADCSBB -24(vp,%rax,8), %r10 + mov %r10, -24(rp,%rax,8) +L(lo3): ADCSBB -16(vp,%rax,8), %r11 + dec n + mov -8(up,%rax,8), %r8 + mov %r11, -16(rp,%rax,8) +L(lo2): mov (up,%rax,8), %r9 + jnz L(top) + +L(cj2): ADCSBB -8(vp,%rax,8), %r8 + mov %r8, -8(rp,%rax,8) +L(cj1): ADCSBB (vp,%rax,8), %r9 + mov %r9, (rp,%rax,8) + + mov $0, R32(%rax) + adc $0, R32(%rax) + + FUNC_EXIT() + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) +EPILOGUE() diff --git 
a/gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm new file mode 100644 index 0000000..41e1d8a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm @@ -0,0 +1,191 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bt1/bt2. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012, 2018-2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.52 old measurement +C AMD K10 4.51 old measurement +C AMD bd1 4.66 old measurement +C AMD bd2 4.57 old measurement +C AMD bd3 ? +C AMD bd4 ? +C AMD zen ? +C AMD bt1 5.04 +C AMD bt2 5.07 +C Intel P4 16.8 18.6 old measurement +C Intel PNR 5.59 old measurement +C Intel NHM 5.39 old measurement +C Intel SBR 3.93 old measurement +C Intel IBR 3.59 old measurement +C Intel HWL 3.61 old measurement +C Intel BWL 2.76 old measurement +C Intel SKL 2.77 old measurement +C Intel atom 23 old measurement +C Intel SLM 8 old measurement +C Intel GLM ? +C VIA nano 5.63 old measurement + +C The ALIGNment here might look completely ad-hoc. They are not. 
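Functionally, mpn_addmul_1 adds {up,n} times the single limb v0 into {rp,n} and returns the final carry limb; mpn_submul_1 subtracts instead and returns the borrow. The unrolled loop below computes exactly that, four limbs per iteration. A minimal C sketch of the addmul case, assuming the compiler provides unsigned __int128 (illustrative only, not the GMP code):

    #include <stdint.h>
    #include <stddef.h>

    /* {rp,n} += {up,n} * v0; return the carry out of the top limb. */
    uint64_t addmul_1_sketch (uint64_t *rp, const uint64_t *up,
                              size_t n, uint64_t v0)
    {
      uint64_t carry = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p =
            (unsigned __int128) up[i] * v0 + rp[i] + carry;
          rp[i] = (uint64_t) p;         /* low limb of the accumulation */
          carry = (uint64_t) (p >> 64); /* high limb feeds the next step */
        }
      return carry;
    }
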
+ +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +C Standard parameters +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param', `%rdx') +define(`v0', `%rcx') +C Standard allocations +define(`n', `%rbx') +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +C DOS64 parameters +IFDOS(` define(`rp', `%rcx') ') dnl +IFDOS(` define(`up', `%rsi') ') dnl +IFDOS(` define(`n_param', `%r8') ') dnl +IFDOS(` define(`v0', `%r9') ') dnl +C DOS64 allocations +IFDOS(` define(`n', `%rbx') ') dnl +IFDOS(` define(`w0', `%r8') ') dnl +IFDOS(` define(`w1', `%rdi') ') dnl +IFDOS(` define(`w2', `%r10') ') dnl +IFDOS(` define(`w3', `%r11') ') dnl + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(func) +IFDOS(` push %rsi ') +IFDOS(` push %rdi ') +IFDOS(` mov %rdx, %rsi ') + + push %rbx + mov (up), %rax + + lea (rp,n_param,8), rp + lea (up,n_param,8), up + mov n_param, n + + test $1, R8(n_param) + jne L(bx1) + +L(bx0): mul v0 + neg n + mov %rax, w0 + mov %rdx, w1 + test $2, R8(n) + jne L(L2) + +L(b00): add $2, n + jmp L(L0) + + ALIGN(16) +L(bx1): mul v0 + test $2, R8(n) + je L(b01) + +L(b11): mov %rax, w2 + mov %rdx, w3 + neg n + inc n + jmp L(L3) + + ALIGN(16) +L(b01): sub $3, n + jc L(n1) + mov %rax, w2 + mov %rdx, w3 + neg n + + ALIGN(16) +L(top): mov -16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + ADDSUB w2, -24(rp,n,8) + adc w3, w0 + adc $0, w1 +L(L0): mov -8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + ADDSUB w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 +L(L3): mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + ADDSUB w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(L2): mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + ADDSUB w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + add $4, n + js L(top) + +L(end): xor R32(%rax), R32(%rax) + ADDSUB w2, -8(rp) + adc w3, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop %rsi ') + ret + + ALIGN(32) +L(n1): ADDSUB %rax, -8(rp) + mov $0, R32(%rax) + adc %rdx, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop %rsi ') + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/copyd.asm b/gmp-6.3.0/mpn/x86_64/bt1/copyd.asm new file mode 100644 index 0000000..877714e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/copyd.asm @@ -0,0 +1,91 @@ +dnl AMD64 mpn_copyd optimised for AMD bobcat. + +dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1 +C AMD K10 1-2 (alignment fluctuations) +C AMD bd1 ? +C AMD bobcat 1.5 +C Intel P4 2.8 +C Intel core2 1 +C Intel NHM 1-1.25 +C Intel SBR 1 +C Intel atom 2.87 +C VIA nano 2 + +C INPUT PARAMETERS +C rp rdi +C up rsi +C n rdx + +define(`rp',`%rdi') +define(`up',`%rsi') +define(`n',`%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_copyd) + FUNC_ENTRY(3) + sub $4, n + jl L(end) + ALIGN(16) +L(top): mov 24(up,n,8), %r8 + mov %r8, 24(rp,n,8) + mov 16(up,n,8), %r8 + mov %r8, 16(rp,n,8) + mov 8(up,n,8), %r8 + mov %r8, 8(rp,n,8) + mov (up,n,8), %r8 + mov %r8, (rp,n,8) +L(ent): sub $4, n + jge L(top) + +L(end): cmp $-4, R32(n) + jz L(ret) + mov 24(up,n,8), %r8 + mov %r8, 24(rp,n,8) + cmp $-3, R32(n) + jz L(ret) + mov 16(up,n,8), %r8 + mov %r8, 16(rp,n,8) + cmp $-2, R32(n) + jz L(ret) + mov 8(up,n,8), %r8 + mov %r8, 8(rp,n,8) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/copyi.asm b/gmp-6.3.0/mpn/x86_64/bt1/copyi.asm new file mode 100644 index 0000000..ee0f578 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/copyi.asm @@ -0,0 +1,94 @@ +dnl AMD64 mpn_copyi optimised for AMD bobcat. + +dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1 +C AMD K10 1-2 (alignment fluctuations) +C AMD bd1 ? 
+C AMD bobcat 1.5 +C Intel P4 2.8 +C Intel core2 1 +C Intel NHM 1-1.25 +C Intel SBR 1 +C Intel atom 2.87 +C VIA nano 2 + +C INPUT PARAMETERS +C rp rdi +C up rsi +C n rdx + +define(`rp',`%rdi') +define(`up',`%rsi') +define(`n',`%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_copyi) + FUNC_ENTRY(3) + lea -32(up,n,8), up + lea -32(rp,n,8), rp + neg n + add $4, n + jg L(end) + ALIGN(16) +L(top): mov (up,n,8), %r8 + mov %r8, (rp,n,8) + mov 8(up,n,8), %r8 + mov %r8, 8(rp,n,8) + mov 16(up,n,8), %r8 + mov %r8, 16(rp,n,8) + mov 24(up,n,8), %r8 + mov %r8, 24(rp,n,8) +L(ent): add $4, n + jle L(top) + +L(end): cmp $4, R32(n) + jz L(ret) + mov (up,n,8), %r8 + mov %r8, (rp,n,8) + cmp $3, R32(n) + jz L(ret) + mov 8(up,n,8), %r8 + mov %r8, 8(rp,n,8) + cmp $2, R32(n) + jz L(ret) + mov 16(up,n,8), %r8 + mov %r8, 16(rp,n,8) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm new file mode 100644 index 0000000..ef53392 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm @@ -0,0 +1,119 @@ +dnl AMD64 mpn_gcd_11 -- 1 x 1 gcd. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 5.4 +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR ? +C Intel PNR ? +C Intel NHM ? +C Intel WSM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? + + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. 
+ +deflit(MAXSHIFT, 8) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + +DEF_OBJECT(ctz_table,64) + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') +END_OBJECT(ctz_table) + +define(`u0', `%rdi') +define(`v0', `%rsi') + +define(`cnt', `%rcx') +define(`s0', `%rax') +define(`t0', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + LEA( ctz_table, %r10) + mov v0, t0 + sub u0, t0 + jz L(end) + + ALIGN(16) +L(top): mov u0, s0 + sub v0, u0 + cmovc t0, u0 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + and $MASK, R32(t0) + movzbl (%r10,t0), R32(cnt) + jz L(count_better) +L(shr): shr R8(cnt), u0 + mov v0, t0 + sub u0, t0 + jnz L(top) + +L(end): mov v0, %rax + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret + +L(count_better): + bsf u0, cnt + jmp L(shr) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm new file mode 100644 index 0000000..c9f221e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_22) +include_mpn(`x86_64/gcd_22.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h new file mode 100644 index 0000000..977a209 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h @@ -0,0 +1,230 @@ +/* AMD Bobcat gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. */ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 1600 MHz AMD Bobcat/Zacate */ +/* FFT tuning limit = 110,472,704 */ +/* Generated by tuneup.c, 2019-10-12, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 31 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 71 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +#define DIV_1_VS_MUL_1_PERCENT 270 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 66 +#define MUL_TOOM44_THRESHOLD 190 +#define MUL_TOOM6H_THRESHOLD 274 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 127 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 131 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 101 +#define SQR_TOOM4_THRESHOLD 278 +#define SQR_TOOM6_THRESHOLD 372 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 22 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 444, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83, 5}, { 1343, 4}, \ + { 2687, 5}, { 1407, 6}, { 735, 7}, { 415, 8}, \ + { 223,10}, { 79,11}, { 47,10}, { 103,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 167,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255,11}, { 143,10}, \ + { 287, 9}, { 575,11}, { 159,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 671,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 607,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 
1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,16}, { 1023,15}, { 2047,14}, \ + { 4479,15}, { 2303,14}, { 4991,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 183 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 380, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 25, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 63, 6}, { 1087, 7}, { 575, 8}, \ + { 303, 9}, { 159,10}, { 103,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 135,11}, \ + { 79,10}, { 159, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,11}, { 111,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,12}, { 223,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 607,13}, { 319,12}, { 703,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 895,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,15}, { 1791,14}, \ + { 3839,16}, { 1023,15}, { 2047,14}, { 4479,15}, \ + { 2303,14}, { 4863,15}, { 2559,14}, { 5247,15}, \ + { 2815,14}, { 5887,16}, { 1535,15}, { 3327,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 186 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 42 +#define MULLO_MUL_N_THRESHOLD 10950 +#define SQRLO_BASECASE_THRESHOLD 7 +#define SQRLO_DC_THRESHOLD 100 +#define SQRLO_SQR_THRESHOLD 7293 + +#define DC_DIV_QR_THRESHOLD 70 +#define DC_DIVAPPR_Q_THRESHOLD 204 +#define DC_BDIV_QR_THRESHOLD 59 +#define DC_BDIV_Q_THRESHOLD 148 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 246 +#define INV_APPR_THRESHOLD 236 + +#define 
BINV_NEWTON_THRESHOLD 252 +#define REDC_1_TO_REDC_2_THRESHOLD 67 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1589 +#define MU_DIVAPPR_Q_THRESHOLD 1589 +#define MUPI_DIV_QR_THRESHOLD 108 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 1,16,194,960,1603,1811,2499 + +#define GET_STR_DC_THRESHOLD 20 +#define GET_STR_PRECOMPUTE_THRESHOLD 34 +#define SET_STR_DC_THRESHOLD 345 +#define SET_STR_PRECOMPUTE_THRESHOLD 1787 + +#define FAC_DSC_THRESHOLD 781 +#define FAC_ODD_THRESHOLD 104 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 3 /* 3.20% faster than 5 */ +#define HGCD_THRESHOLD 110 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 474 +#define GCDEXT_DC_THRESHOLD 293 +#define JACOBI_BASE_METHOD 2 /* 9.38% faster than 1 */ + +/* Tuneup completed successfully, took 358881 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm b/gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm new file mode 100644 index 0000000..4394d6e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm @@ -0,0 +1,241 @@ +dnl AMD64 mpn_mul_1 optimised for AMD bt1/bt2. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.53 old measurement +C AMD K10 4.53 old measurement +C AMD bd1 4.56 old measurement +C AMD bd2 4.47 old measurement +C AMD bd3 ? +C AMD bd4 ? +C AMD zen ? +C AMD bt1 5.12 +C AMD bt2 5.17 +C Intel P4 12.6 old measurement +C Intel PNR 4.53 old measurement +C Intel NHM 4.36 old measurement +C Intel SBR 3.0 old measurement +C Intel IBR 2.55 old measurement +C Intel HWL 2.28 old measurement +C Intel BWL 2.36 old measurement +C Intel SKL 2.39 old measurement +C Intel atom 21.0 old measurement +C Intel SLM 9 old measurement +C Intel GLM ? +C VIA nano ? + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. 
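+C
+C As a plain-C reference for the intended effect (not the code below;
+C ref_mul_1c is an illustrative name, and 64-bit limbs are assumed),
+C mpn_mul_1 multiplies an n-limb operand by a single limb and returns the
+C carry-out limb, while mpn_mul_1c also adds a carry-in limb cy:
+C
+C   mp_limb_t ref_mul_1c (mp_ptr rp, mp_srcptr up, mp_size_t n,
+C                         mp_limb_t v0, mp_limb_t cy)
+C   {
+C     for (mp_size_t i = 0; i < n; i++)
+C       {
+C         unsigned __int128 p = (unsigned __int128) up[i] * v0 + cy;
+C         rp[i] = (mp_limb_t) p;
+C         cy = (mp_limb_t) (p >> 64);
+C       }
+C     return cy;                   /* mpn_mul_1 is the cy = 0 case */
+C   }
+C
+C The 4-way unrolled loops below compute the same thing with mul/adc chains.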
+ +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C Standard parameters +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param', `%rdx') +define(`v0', `%rcx') +define(`cy', `%r8') +C Standard allocations +define(`n', `%rbx') +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +C DOS64 parameters +IFDOS(` define(`rp', `%rcx') ') dnl +IFDOS(` define(`up', `%rsi') ') dnl +IFDOS(` define(`n_param', `%r8') ') dnl +IFDOS(` define(`v0', `%r9') ') dnl +IFDOS(` define(`cy', `56(%rsp)')') dnl +C DOS64 allocations +IFDOS(` define(`n', `%rbx') ') dnl +IFDOS(` define(`w0', `%r8') ') dnl +IFDOS(` define(`w1', `%rdi') ') dnl +IFDOS(` define(`w2', `%r10') ') dnl +IFDOS(` define(`w3', `%r11') ') dnl + + ALIGN(64) +PROLOGUE(mpn_mul_1) +IFDOS(` push %rsi ') +IFDOS(` push %rdi ') +IFDOS(` mov %rdx, %rsi ') + + push %rbx + mov (up), %rax + + lea (rp,n_param,8), rp + lea (up,n_param,8), up + mov n_param, n + + test $1, R8(n_param) + jne L(bx1) + +L(bx0): mul v0 + neg n + mov %rax, w0 + mov %rdx, w1 + test $2, R8(n) + jne L(L2) + +L(b00): add $2, n + jmp L(L0) + + ALIGN(16) +L(b11): mov %rax, w2 + mov %rdx, w3 + neg n + inc n + jmp L(L3) + + ALIGN(16) +L(bx1): mul v0 + test $2, R8(n) + jne L(b11) + +L(b01): sub $3, n + jc L(n1) + mov %rax, w2 + mov %rdx, w3 + neg n + + ALIGN(16) +L(top): mov -16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, -24(rp,n,8) + add w3, w0 + adc $0, w1 +L(L0): mov -8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 +L(L3): mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, -8(rp,n,8) + add w3, w0 + adc $0, w1 +L(L2): mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, (rp,n,8) + add w1, w2 + adc $0, w3 + add $4, n + js L(top) + +L(end): mov w2, -8(rp) + mov w3, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop %rsi ') + ret + + ALIGN(32) +L(n1): mov %rax, -8(rp) + mov %rdx, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop %rsi ') + ret +EPILOGUE() + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_mul_1c) +IFDOS(` push %rsi ') +IFDOS(` push %rdi ') +IFDOS(` mov %rdx, %rsi ') + mov cy, w2 + push %rbx + mov (up), %rax + + lea (rp,n_param,8), rp + lea (up,n_param,8), up + mov n_param, n + + test $1, R8(n_param) + jne L(cx1) + +L(cx0): mul v0 + neg n + mov %rax, w0 + mov %rdx, w1 + add w2, w0 + adc $0, w1 + test $2, R8(n) + jne L(L2) + +L(c00): add $2, n + jmp L(L0) + + ALIGN(16) +L(cx1): mul v0 + test $2, R8(n) + je L(c01) + +L(c11): neg n + inc n + add %rax, w2 + mov %rdx, w3 + adc $0, w3 + jmp L(L3) + +L(c01): cmp $1, n + jz L(m1) + neg n + add $3, n + add %rax, w2 + mov %rdx, w3 + adc $0, w3 + jmp L(top) + + ALIGN(32) +L(m1): add %rax, w2 + mov %rdx, %rax + mov w2, -8(rp) + adc $0, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop %rsi ') + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm new file mode 100644 index 0000000..e7d46bf --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm @@ -0,0 +1,486 @@ +dnl AMD64 mpn_mul_basecase optimised for AMD bobcat. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.5 +C AMD K10 4.5 +C AMD bd1 4.75 +C AMD bobcat 5 +C Intel P4 17.7 +C Intel core2 5.5 +C Intel NHM 5.43 +C Intel SBR 3.92 +C Intel atom 23 +C VIA nano 5.63 + +C This mul_basecase is based on mul_1 and addmul_1, since these both run at the +C multiply insn bandwidth, without any apparent loop branch exit pipeline +C replays experienced on K8. The structure is unusual: it falls into mul_1 in +C the same way for all n, then it splits into 4 different wind-down blocks and +C 4 separate addmul_1 loops. +C +C We have not tried using the same addmul_1 loops with a switch into feed-in +C code, as we do in other basecase implementations. Doing that could save +C substantial code volume, but would also probably add some overhead. + +C TODO +C * Tune un < 3 code. +C * Fix slowdown for un=vn=3 (67->71) compared to default code. +C * This is 1263 bytes, compared to 1099 bytes for default code. Consider +C combining addmul loops like that code. Tolerable slowdown? +C * Lots of space could be saved by replacing the "switch" code by gradual +C jumps out from mul_1 winddown code, perhaps with no added overhead. +C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding. + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C Standard parameters +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param', `%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') +C Standard allocations +define(`un', `%rbx') +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`n', `%rbp') +define(`v0', `%r9') + +C Temp macro for allowing control over indexing. +C Define to return $1 for more conservative ptr handling. 
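+
+C For reference, the overall algorithm described in the comments above
+C amounts to the following C (a sketch only; ref_mul_basecase is an
+C illustrative name, not a GMP function):
+C
+C   void ref_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
+C                          mp_srcptr vp, mp_size_t vn)  /* un >= vn >= 1 */
+C   {
+C     rp[un] = mpn_mul_1 (rp, up, un, vp[0]);          /* the mul_1 pass  */
+C     for (mp_size_t i = 1; i < vn; i++)               /* addmul_1 passes */
+C       rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
+C   }
+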
+define(`X',`$2') + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + mov (up), %rax + mov (vp), v0 + + cmp $2, un_param + ja L(ge3) + jz L(u2) + + mul v0 C u0 x v0 + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(u2): mul v0 C u0 x v0 + mov %rax, (rp) + mov 8(up), %rax + mov %rdx, w0 + mul v0 + add %rax, w0 + mov %rdx, w1 + adc $0, w1 + cmp $1, R32(vn) + jnz L(u2v2) + mov w0, 8(rp) + mov w1, 16(rp) + FUNC_EXIT() + ret + +L(u2v2):mov 8(vp), v0 + mov (up), %rax + mul v0 + add %rax, w0 + mov w0, 8(rp) + mov %rdx, %r8 C CAUTION: r8 realloc + adc $0, %r8 + mov 8(up), %rax + mul v0 + add w1, %r8 + adc $0, %rdx + add %r8, %rax + adc $0, %rdx + mov %rax, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + + +L(ge3): push %rbx + push %rbp + push %r12 + push %r13 + + lea 8(vp), vp + + lea -24(rp,un_param,8), rp + lea -24(up,un_param,8), up + xor R32(un), R32(un) + mov $2, R32(n) + sub un_param, un + sub un_param, n + + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(L3) + + ALIGN(16) +L(top): mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, -8(rp,n,8) + add w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, (rp,n,8) + add w1, w2 + adc $0, w3 +L(L3): mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, 8(rp,n,8) + add w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(top) + + mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 + +C Switch on n into right addmul_l loop + test n, n + jz L(r2) + cmp $2, R32(n) + ja L(r3) + jz L(r0) + jmp L(r1) + + +L(r3): mov w2, X(-8(rp,n,8),16(rp)) + mov w3, X((rp,n,8),24(rp)) + add $2, un + +C outer loop(3) +L(to3): dec vn + jz L(ret) + mov (vp), v0 + mov 8(up,un,8), %rax + lea 8(vp), vp + lea 8(rp), rp + mov un, n + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(al3) + + ALIGN(16) +L(ta3): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 +L(al3): mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta3) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + jmp L(to3) + + +L(r2): mov X(0(up,n,8),(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),-8(rp)) + add w3, w0 + adc $0, w1 + mov X(8(up,n,8),8(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, X((rp,n,8),(rp)) + add w1, w2 + adc $0, w3 + mov X(16(up,n,8),16(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(8(rp,n,8),8(rp)) + add w3, w0 + adc $0, w1 + mov w0, X(16(rp,n,8),16(rp)) + adc $0, w3 + mov w1, X(24(rp,n,8),24(rp)) + inc un + +C outer loop(2) +L(to2): dec vn + jz L(ret) + mov (vp), v0 + mov 16(up,un,8), %rax + lea 8(vp), vp + lea 8(rp), rp + mov un, n + mul v0 + mov %rax, w0 + mov %rdx, w1 + jmp L(al2) + + ALIGN(16) +L(ta2): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov 
%rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(al2): mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta2) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + jmp L(to2) + + +L(r1): mov X(0(up,n,8),8(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),(rp)) + add w3, w0 + adc $0, w1 + mov X(8(up,n,8),16(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, X((rp,n,8),8(rp)) + add w1, w2 + adc $0, w3 + mov w2, X(8(rp,n,8),16(rp)) + mov w3, X(16(rp,n,8),24(rp)) + add $4, un + +C outer loop(1) +L(to1): dec vn + jz L(ret) + mov (vp), v0 + mov -8(up,un,8), %rax + lea 8(vp), vp + lea 8(rp), rp + mov un, n + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(al1) + + ALIGN(16) +L(ta1): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 +L(al1): mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta1) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + jmp L(to1) + + +L(r0): mov X((up,n,8),16(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),8(rp)) + add w3, w0 + adc $0, w1 + mov w0, X((rp,n,8),16(rp)) + mov w1, X(8(rp,n,8),24(rp)) + add $3, un + +C outer loop(0) +L(to0): dec vn + jz L(ret) + mov (vp), v0 + mov (up,un,8), %rax + lea 8(vp), vp + lea 8(rp), rp + mov un, n + mul v0 + mov %rax, w0 + mov %rdx, w1 + jmp L(al0) + + ALIGN(16) +L(ta0): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(al0): mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta0) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + jmp L(to0) + + +L(ret): pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm b/gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm new file mode 100644 index 0000000..d55b1e5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm @@ -0,0 +1,507 @@ +dnl X86-64 mpn_redc_1 optimised for AMD bobcat. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat 5.0 +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') +define(`w0', `%rbp') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea (mp_param,n,8), mp + lea (up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(bx0) + +L(bx1): test $2, R8(n) + jz L(b3) + +L(b1): cmp $-1, R32(n) + jz L(n1) + +L(otp1):lea 1(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + mov 8(mp,n,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, w1 + add (up,n,8), w2 + adc w3, %rbx + adc $0, w1 + mov 16(mp,n,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + adc w1, w2 + adc $0, w3 + imul u0inv, %rbx C next q limb + jmp L(e1) + + ALIGNx +L(tp1): add w0, -16(up,i,8) + adc w1, w2 + adc $0, w3 + mov (mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 8(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add w0, (up,i,8) + adc w1, w2 + adc $0, w3 +L(e1): mov 16(mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 24(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add $4, i + js L(tp1) + +L(ed1): add w0, I(-16(up),-16(up,i,8)) + adc w1, w2 + adc $0, w3 + add w2, I(-8(up),-8(up,i,8)) + adc $0, w3 + mov w3, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b3): cmp $-3, R32(n) + jz L(n3) + +L(otp3):lea 3(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + mov 8(mp,n,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, w1 + add (up,n,8), w2 + adc w3, %rbx + adc $0, w1 + mov 16(mp,n,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + adc w1, w2 + adc $0, w3 + imul u0inv, %rbx C next q limb + jmp L(e3) + + ALIGNx +L(tp3): add w0, -16(up,i,8) + adc w1, w2 + adc $0, w3 +L(e3): mov (mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(up,i,8) + 
adc w3, w0 + adc $0, w1 + mov 8(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add w0, (up,i,8) + adc w1, w2 + adc $0, w3 + mov 16(mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 24(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add $4, i + js L(tp3) + +L(ed3): add w0, I(-16(up),-16(up,i,8)) + adc w1, w2 + adc $0, w3 + add w2, I(-8(up),-8(up,i,8)) + adc $0, w3 + mov w3, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp3) +C jmp L(cj) + +L(cj): +IFSTD(` lea (up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea (up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(bx0): test $2, R8(n) + jnz L(b2) + +L(b0): +L(otp0):lea (n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + mov 8(mp,n,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, w3 + add (up,n,8), w0 + adc w1, %rbx + adc $0, w3 + mov 16(mp,n,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + adc w3, w0 + adc $0, w1 + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): add w0, -16(up,i,8) + adc w1, w2 + adc $0, w3 + mov (mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 8(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add w0, (up,i,8) + adc w1, w2 + adc $0, w3 + mov 16(mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(up,i,8) + adc w3, w0 + adc $0, w1 +L(e0): mov 24(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add $4, i + js L(tp0) + +L(ed0): add w0, I(-16(up),-16(up,i,8)) + adc w1, w2 + adc $0, w3 + add w2, I(-8(up),-8(up,i,8)) + adc $0, w3 + mov w3, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + jmp L(cj) + +L(b2): cmp $-2, R32(n) + jz L(n2) + +L(otp2):lea 2(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + mov 8(mp,n,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, w3 + add (up,n,8), w0 + adc w1, %rbx + adc $0, w3 + mov 16(mp,n,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + adc w3, w0 + adc $0, w1 + imul u0inv, %rbx C next q limb + jmp L(e2) + + ALIGNx +L(tp2): add w0, -16(up,i,8) + adc w1, w2 + adc $0, w3 + mov (mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(up,i,8) + adc w3, w0 + adc $0, w1 +L(e2): mov 8(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add w0, (up,i,8) + adc w1, w2 + adc $0, w3 + mov 16(mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 24(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add $4, i + js L(tp2) + +L(ed2): add w0, I(-16(up),-16(up,i,8)) + adc w1, w2 + adc $0, w3 + add w2, I(-8(up),-8(up,i,8)) + adc $0, w3 + mov w3, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp2) + jmp L(cj) + +L(n1): mov (mp_param), %rax + mul q0 + add -8(up), %rax + adc (up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov 
-16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov -8(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -16(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov (up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 8(up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -24(mp), %rax + mov -24(up), %r10 + mul q0 + add %rax, %r10 + mov -16(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov -16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + add %r11, %rbp + mov -8(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, -16(up) + add %r9, %r10 + adc $0, %r11 + mov %r10, -8(up) + mov %r11, -24(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + + mov -48(up), %rdx + mov -40(up), %rbx + xor R32(%rax), R32(%rax) + add %rbp, %rdx + adc %r10, %rbx + adc -8(up), %r11 + mov %rdx, (rp) + mov %rbx, 8(rp) + mov %r11, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm new file mode 100644 index 0000000..0e417a1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm @@ -0,0 +1,565 @@ +dnl AMD64 mpn_sqr_basecase optimised for AMD bobcat. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.5 +C AMD K10 4.5 +C AMD bd1 4.75 +C AMD bobcat 5 +C Intel P4 17.7 +C Intel core2 5.5 +C Intel NHM 5.43 +C Intel SBR 3.92 +C Intel atom 23 +C VIA nano 5.63 + +C This sqr_basecase is based on mul_1 and addmul_1, since these both run at the +C multiply insn bandwidth, without any apparent loop branch exit pipeline +C replays experienced on K8. The structure is unusual: it falls into mul_1 in +C the same way for all n, then it splits into 4 different wind-down blocks and +C 4 separate addmul_1 loops. +C +C We have not tried using the same addmul_1 loops with a switch into feed-in +C code, as we do in other basecase implementations. Doing that could save +C substantial code volume, but would also probably add some overhead. + +C TODO +C * Tune un < 4 code. 
+C * Perhaps implement a larger final corner (it is now 2 x 1). +C * Lots of space could be saved by replacing the "switch" code by gradual +C jumps out from mul_1 winddown code, perhaps with no added overhead. +C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding. + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C Standard parameters +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param', `%rdx') +C Standard allocations +define(`un', `%rbx') +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') +define(`n', `%rbp') +define(`v0', `%rcx') + +C Temp macro for allowing control over indexing. +C Define to return $1 for more conservative ptr handling. +define(`X',`$2') +dnl define(`X',`$1') + + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + mov (up), %rax + + cmp $2, R32(un_param) + jae L(ge2) + + mul %rax + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(ge2): mov (up), v0 + jnz L(g2) + + mul %rax + mov %rax, (rp) + mov 8(up), %rax + mov %rdx, w0 + mul v0 + add %rax, w0 + mov %rdx, w1 + adc $0, w1 + mov 8(up), v0 + mov (up), %rax + mul v0 + add %rax, w0 + mov w0, 8(rp) + mov %rdx, w0 C CAUTION: r8 realloc + adc $0, w0 + mov 8(up), %rax + mul v0 + add w1, w0 + adc $0, %rdx + add w0, %rax + adc $0, %rdx + mov %rax, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + +L(g2): cmp $3, R32(un_param) + ja L(g3) + mul %rax + mov %rax, (rp) + mov %rdx, 8(rp) + mov 8(up), %rax + mul %rax + mov %rax, 16(rp) + mov %rdx, 24(rp) + mov 16(up), %rax + mul %rax + mov %rax, 32(rp) + mov %rdx, 40(rp) + + mov (up), v0 + mov 8(up), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov 16(up), %rax + mul v0 + xor R32(w2), R32(w2) + add %rax, w1 + adc %rdx, w2 + + mov 8(up), v0 + mov 16(up), %rax + mul v0 + xor R32(w3), R32(w3) + add %rax, w2 + adc %rdx, w3 + add w0, w0 + adc w1, w1 + adc w2, w2 + adc w3, w3 + mov $0, R32(v0) + adc v0, v0 + add w0, 8(rp) + adc w1, 16(rp) + adc w2, 24(rp) + adc w3, 32(rp) + adc v0, 40(rp) + FUNC_EXIT() + ret + +L(g3): push %rbx + push %rbp + + mov 8(up), %rax + lea -24(rp,un_param,8), rp + lea -24(up,un_param,8), up + neg un_param + push un_param C for sqr_diag_addlsh1 + lea (un_param), un + lea 3(un_param), n + + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(L3) + + ALIGN(16) +L(top): mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, -8(rp,n,8) + add w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, (rp,n,8) + add w1, w2 + adc $0, w3 +L(L3): mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, 8(rp,n,8) + add w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(top) + + mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 + + test n, n + jz L(r2) + cmp $2, R32(n) + ja L(r3) + jz L(r0) + + +L(r1): mov X((up,n,8),8(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),(rp)) + add w3, w0 + adc $0, w1 + mov X(8(up,n,8),16(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, X((rp,n,8),8(rp)) + add w1, w2 + adc $0, w3 + mov w2, X(8(rp,n,8),16(rp)) + mov w3, X(16(rp,n,8),24(rp)) + add $5, un + jmp L(to0) + +L(r2): mov X((up,n,8),(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),-8(rp)) + add w3, w0 + adc $0, w1 + mov X(8(up,n,8),8(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, X((rp,n,8),(rp)) + add w1, w2 + adc $0, w3 + mov X(16(up,n,8),16(up)), %rax + mul v0 + mov %rax, w0 
+ mov %rdx, w1 + mov w2, X(8(rp,n,8),8(rp)) + add w3, w0 + adc $0, w1 + mov w0, X(16(rp,n,8),16(rp)) + adc $0, w3 + mov w1, X(24(rp,n,8),24(rp)) + add $6, un + jmp L(to1) + +L(r3): mov w2, X(-8(rp,n,8),16(rp)) + mov w3, X((rp,n,8),24(rp)) + add $3, un + jmp L(to2) + +L(r0): mov X((up,n,8),16(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),8(rp)) + add w3, w0 + adc $0, w1 + mov w0, X((rp,n,8),16(rp)) + mov w1, X(8(rp,n,8),24(rp)) + add $4, un +C jmp L(to3) +C fall through into main loop + + +L(outer): + mov un, n + mov (up,un,8), v0 + mov 8(up,un,8), %rax + lea 8(rp), rp + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(al3) + + ALIGN(16) +L(ta3): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 +L(al3): mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta3) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + + +L(to2): mov un, n + cmp $-4, R32(un) + jnc L(end) + add $4, un + mov 8(up,n,8), v0 + mov 16(up,n,8), %rax + lea 8(rp), rp + mul v0 + mov %rax, w0 + mov %rdx, w1 + jmp L(al2) + + ALIGN(16) +L(ta2): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(al2): mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta2) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + + +L(to1): mov un, n + mov -16(up,un,8), v0 + mov -8(up,un,8), %rax + lea 8(rp), rp + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(al1) + + ALIGN(16) +L(ta1): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 +L(al1): mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta1) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + + +L(to0): mov un, n + mov -8(up,un,8), v0 + mov (up,un,8), %rax + lea 8(rp), rp + mul v0 + mov %rax, w0 + mov %rdx, w1 + jmp L(al0) + + ALIGN(16) +L(ta0): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(al0): mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta0) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, 
X((rp,n,8),24(rp)) + jmp L(outer) + + +L(end): mov X(8(up,un,8),(up)), v0 + mov X(16(up,un,8),8(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov X(24(up,un,8),16(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, X(24(rp,un,8),16(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(32(rp,un,8),24(rp)) + adc $0, w3 + mov X(16(up,un,8),8(up)), v0 + mov X(24(up,un,8),16(up)), %rax + mul v0 + add %rax, w3 + mov w3, X(40(rp,un,8),32(rp)) + adc $0, %rdx + mov %rdx, X(48(rp,un,8),40(rp)) + + +C sqr_diag_addlsh1 + + lea 16(up), up + lea 40(rp), rp + pop n + lea 2(n,n), n + + mov (up,n,4), %rax + mul %rax + xor R32(w2), R32(w2) + + mov 8(rp,n,8), w0 + mov %rax, (rp,n,8) + jmp L(lm) + + ALIGN(8) +L(tsd): add %rbx, w0 + adc %rax, w1 + mov w0, -8(rp,n,8) + mov 8(rp,n,8), w0 + mov w1, (rp,n,8) +L(lm): mov 16(rp,n,8), w1 + adc w0, w0 + adc w1, w1 + lea (%rdx,w2), %rbx + mov 8(up,n,4), %rax + setc R8(w2) + mul %rax + add $2, n + js L(tsd) + +L(esd): add %rbx, w0 + adc %rax, w1 + mov w0, X(-8(rp,n,8),-8(rp)) + mov w1, X((rp,n,8),(rp)) + adc w2, %rdx + mov %rdx, X(8(rp,n,8),8(rp)) + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt2/com.asm b/gmp-6.3.0/mpn/x86_64/bt2/com.asm new file mode 100644 index 0000000..87085ea --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt2/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bt2/copyd.asm b/gmp-6.3.0/mpn/x86_64/bt2/copyd.asm new file mode 100644 index 0000000..83c0618 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt2/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bt2/copyi.asm b/gmp-6.3.0/mpn/x86_64/bt2/copyi.asm new file mode 100644 index 0000000..148d0e5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt2/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm new file mode 100644 index 0000000..0ffb6ca --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/bd2/gcd_11.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm new file mode 100644 index 0000000..d693628 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_22) +include_mpn(`x86_64/bd2/gcd_22.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h new file mode 100644 index 0000000..3e26726 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h @@ -0,0 +1,240 @@ +/* AMD Jaguar gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 2050 MHz AMD Jaguar/Kabini */ +/* FFT tuning limit = 225,381,546 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 65 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 15 + +#define DIV_1_VS_MUL_1_PERCENT 267 + +#define MUL_TOOM22_THRESHOLD 25 +#define MUL_TOOM33_THRESHOLD 32 +#define MUL_TOOM44_THRESHOLD 93 +#define MUL_TOOM6H_THRESHOLD 366 +#define MUL_TOOM8H_THRESHOLD 537 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 63 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 172 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 63 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 67 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 91 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 220 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 434 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 348 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 348, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 55,11}, \ + { 31,10}, { 63, 6}, { 1087, 8}, { 303, 9}, \ + { 159,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 167,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,10}, { 319,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,11}, \ + { 223,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,12}, { 191,11}, { 415,12}, \ + { 223,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 639,12}, { 351,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,12}, { 479,14}, \ + { 127,13}, { 255,12}, { 543,11}, { 1087,12}, \ + { 607,13}, { 319,12}, { 703,13}, { 383,12}, \ + { 831,13}, { 447,12}, { 895,14}, { 255,13}, \ + { 511,12}, { 1023,13}, { 575,12}, { 1151,13}, \ + { 639,12}, { 1279,13}, { 703,14}, { 383,13}, \ + { 831,12}, { 1663,13}, { 895,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \ + { 639,13}, { 1343,12}, { 2687,14}, { 767,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,14}, { 1279,13}, \ + { 2687,15}, { 767,14}, { 1663,13}, { 3327,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,16}, { 
1023,15}, \ + { 2047,14}, { 4223,13}, { 8447,14}, { 4479,15}, \ + { 2303,14}, { 4863,15}, { 2559,14}, { 5247,15}, \ + { 2815,14}, { 5887,16}, { 1535,15}, { 3071,14}, \ + { 6271,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4095,14}, \ + { 8447,15}, { 4351,14}, { 8959,15}, { 4863,16}, \ + { 2559,15}, { 5887,14}, { 11775,16}, { 3071,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 201 +#define MUL_FFT_THRESHOLD 3200 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95, 6}, \ + { 1663, 7}, { 895, 9}, { 239, 8}, { 479,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 135, 9}, \ + { 271,11}, { 79, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \ + { 303, 9}, { 607,10}, { 319, 9}, { 639,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 639,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 895,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,14}, { 1279,13}, \ + { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \ + { 3199,14}, { 1663,13}, { 3455,16}, { 511,15}, \ + { 1023,14}, { 2175,13}, { 4479,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4479,15}, { 2303,14}, { 4991,15}, { 2815,14}, \ + { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4095,14}, { 8191,15}, \ + { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 208 +#define SQR_FFT_THRESHOLD 2880 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 63 +#define MULLO_MUL_N_THRESHOLD 6253 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 54 +#define SQRLO_SQR_THRESHOLD 5558 + +#define DC_DIV_QR_THRESHOLD 72 +#define DC_DIVAPPR_Q_THRESHOLD 195 +#define 
DC_BDIV_QR_THRESHOLD 50 +#define DC_BDIV_Q_THRESHOLD 90 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 195 +#define INV_APPR_THRESHOLD 197 + +#define BINV_NEWTON_THRESHOLD 230 +#define REDC_1_TO_REDC_2_THRESHOLD 67 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1334 +#define MU_DIVAPPR_Q_THRESHOLD 1334 +#define MUPI_DIV_QR_THRESHOLD 104 +#define MU_BDIV_QR_THRESHOLD 1017 +#define MU_BDIV_Q_THRESHOLD 1187 + +#define POWM_SEC_TABLE 1,16,194,712,779,2387 + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 +#define SET_STR_DC_THRESHOLD 216 +#define SET_STR_PRECOMPUTE_THRESHOLD 994 + +#define FAC_DSC_THRESHOLD 153 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 1 /* 9.38% faster than 3 */ +#define HGCD_THRESHOLD 77 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 440 +#define GCDEXT_DC_THRESHOLD 273 +#define JACOBI_BASE_METHOD 1 /* 7.74% faster than 4 */ + +/* Tuneup completed successfully, took 495910 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm b/gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm new file mode 100644 index 0000000..13a2ab3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm @@ -0,0 +1,183 @@ +dnl AMD64 mpn_cnd_add_n, mpn_cnd_sub_n + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2 +C AMD K10 2 +C AMD bd1 2.32 +C AMD bobcat 3 +C Intel P4 13 +C Intel core2 2.9 +C Intel NHM 2.8 +C Intel SBR 2.4 +C Intel atom 5.33 +C VIA nano 3 + +C NOTES +C * It might seem natural to use the cmov insn here, but since this function +C is supposed to have the exact same execution pattern for cnd true and +C false, and since cmov's documentation is not clear about whether it +C actually reads both source operands and writes the register for a false +C condition, we cannot use it. +C * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory +C to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could use +C ADCSBB-to-memory, again saving 1 insn/limb. +C * This runs optimally at decoder bandwidth on K10. It has not been tuned +C for any other processor. 
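The mask trick described in the notes above (neg/sbb builds an all-ones or all-zero word, which is then ANDed into each vp limb) keeps the instruction stream identical whether the condition is true or false. A rough portable-C sketch of the same computation, for illustration only; the limb typedef and function name here are assumptions, not GMP's code:

  typedef unsigned long mp_limb_t;   /* assumption: 64-bit limb */

  mp_limb_t
  cnd_add_n_ref (mp_limb_t cnd, mp_limb_t *rp, const mp_limb_t *up,
                 const mp_limb_t *vp, long n)
  {
    mp_limb_t mask = -(mp_limb_t) (cnd != 0);  /* all ones iff cnd nonzero */
    mp_limb_t cy = 0;
    for (long i = 0; i < n; i++)
      {
        mp_limb_t v = vp[i] & mask;   /* addend is conditionally zeroed */
        mp_limb_t s = up[i] + v;
        mp_limb_t c = s < v;          /* carry out of the limb add */
        mp_limb_t r = s + cy;
        cy = c + (r < cy);            /* fold in the incoming carry */
        rp[i] = r;
      }
    return cy;
  }

The loop body is branch-free and executes the same operations regardless of cnd, which is the property the assembly is written to preserve.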
+ +C INPUT PARAMETERS +define(`cnd', `%rdi') dnl rcx +define(`rp', `%rsi') dnl rdx +define(`up', `%rdx') dnl r8 +define(`vp', `%rcx') dnl r9 +define(`n', `%r8') dnl rsp+40 + +ifdef(`OPERATION_cnd_add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_cnd_add_n)') +ifdef(`OPERATION_cnd_sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_cnd_sub_n)') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), R32(%r8)') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + neg cnd + sbb cnd, cnd C make cnd mask + + lea (vp,n,8), vp + lea (up,n,8), up + lea (rp,n,8), rp + + mov R32(n), R32(%rax) + neg n + and $3, R32(%rax) + jz L(top) C carry-save reg rax = 0 in this arc + cmp $2, R32(%rax) + jc L(b1) + jz L(b2) + +L(b3): mov (vp,n,8), %r12 + mov 8(vp,n,8), %r13 + mov 16(vp,n,8), %r14 + and cnd, %r12 + mov (up,n,8), %r10 + and cnd, %r13 + mov 8(up,n,8), %rbx + and cnd, %r14 + mov 16(up,n,8), %rbp + ADDSUB %r12, %r10 + mov %r10, (rp,n,8) + ADCSBB %r13, %rbx + mov %rbx, 8(rp,n,8) + ADCSBB %r14, %rbp + mov %rbp, 16(rp,n,8) + sbb R32(%rax), R32(%rax) C save carry + add $3, n + js L(top) + jmp L(end) + +L(b2): mov (vp,n,8), %r12 + mov 8(vp,n,8), %r13 + mov (up,n,8), %r10 + and cnd, %r12 + mov 8(up,n,8), %rbx + and cnd, %r13 + ADDSUB %r12, %r10 + mov %r10, (rp,n,8) + ADCSBB %r13, %rbx + mov %rbx, 8(rp,n,8) + sbb R32(%rax), R32(%rax) C save carry + add $2, n + js L(top) + jmp L(end) + +L(b1): mov (vp,n,8), %r12 + mov (up,n,8), %r10 + and cnd, %r12 + ADDSUB %r12, %r10 + mov %r10, (rp,n,8) + sbb R32(%rax), R32(%rax) C save carry + add $1, n + jns L(end) + + ALIGN(16) +L(top): mov (vp,n,8), %r12 + mov 8(vp,n,8), %r13 + mov 16(vp,n,8), %r14 + mov 24(vp,n,8), %r11 + and cnd, %r12 + mov (up,n,8), %r10 + and cnd, %r13 + mov 8(up,n,8), %rbx + and cnd, %r14 + mov 16(up,n,8), %rbp + and cnd, %r11 + mov 24(up,n,8), %r9 + add R32(%rax), R32(%rax) C restore carry + ADCSBB %r12, %r10 + mov %r10, (rp,n,8) + ADCSBB %r13, %rbx + mov %rbx, 8(rp,n,8) + ADCSBB %r14, %rbp + mov %rbp, 16(rp,n,8) + ADCSBB %r11, %r9 + mov %r9, 24(rp,n,8) + sbb R32(%rax), R32(%rax) C save carry + add $4, n + js L(top) + +L(end): neg R32(%rax) + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/com.asm b/gmp-6.3.0/mpn/x86_64/com.asm new file mode 100644 index 0000000..006acaf --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/com.asm @@ -0,0 +1,95 @@ +dnl AMD64 mpn_com. + +dnl Copyright 2004-2006, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 1.25 +C AMD K10 1.25 +C Intel P4 2.78 +C Intel core2 1.1 +C Intel corei 1.5 +C Intel atom ? +C VIA nano 2 + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`n',`%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_com) + FUNC_ENTRY(3) + movq (up), %r8 + movl R32(%rdx), R32(%rax) + leaq (up,n,8), up + leaq (rp,n,8), rp + negq n + andl $3, R32(%rax) + je L(b00) + cmpl $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): notq %r8 + movq %r8, (rp,n,8) + decq n + jmp L(e11) +L(b10): addq $-2, n + jmp L(e10) + .byte 0x90,0x90,0x90,0x90,0x90,0x90 +L(b01): notq %r8 + movq %r8, (rp,n,8) + incq n + jz L(ret) + +L(oop): movq (up,n,8), %r8 +L(b00): movq 8(up,n,8), %r9 + notq %r8 + notq %r9 + movq %r8, (rp,n,8) + movq %r9, 8(rp,n,8) +L(e11): movq 16(up,n,8), %r8 +L(e10): movq 24(up,n,8), %r9 + notq %r8 + notq %r9 + movq %r8, 16(rp,n,8) + movq %r9, 24(rp,n,8) + addq $4, n + jnc L(oop) +L(ret): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/copyd.asm b/gmp-6.3.0/mpn/x86_64/copyd.asm new file mode 100644 index 0000000..a5e6e59 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/copyd.asm @@ -0,0 +1,93 @@ +dnl AMD64 mpn_copyd -- copy limb vector, decrementing. + +dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1 +C AMD K10 1 +C AMD bd1 1.36 +C AMD bobcat 1.71 +C Intel P4 2-3 +C Intel core2 1 +C Intel NHM 1 +C Intel SBR 1 +C Intel atom 2 +C VIA nano 2 + + +IFSTD(`define(`rp',`%rdi')') +IFSTD(`define(`up',`%rsi')') +IFSTD(`define(`n', `%rdx')') + +IFDOS(`define(`rp',`%rcx')') +IFDOS(`define(`up',`%rdx')') +IFDOS(`define(`n', `%r8')') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_copyd) + lea -8(up,n,8), up + lea (rp,n,8), rp + sub $4, n + jc L(end) + nop + +L(top): mov (up), %rax + mov -8(up), %r9 + lea -32(rp), rp + mov -16(up), %r10 + mov -24(up), %r11 + lea -32(up), up + mov %rax, 24(rp) + mov %r9, 16(rp) + sub $4, n + mov %r10, 8(rp) + mov %r11, (rp) + jnc L(top) + +L(end): shr R32(n) + jnc 1f + mov (up), %rax + mov %rax, -8(rp) + lea -8(rp), rp + lea -8(up), up +1: shr R32(n) + jnc 1f + mov (up), %rax + mov -8(up), %r9 + mov %rax, -8(rp) + mov %r9, -16(rp) +1: ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/copyi.asm b/gmp-6.3.0/mpn/x86_64/copyi.asm new file mode 100644 index 0000000..bafce7a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/copyi.asm @@ -0,0 +1,92 @@ +dnl AMD64 mpn_copyi -- copy limb vector, incrementing. + +dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
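For reference, the only difference between the two copy routines (mpn_copyd above, mpn_copyi below) is the traversal direction, which determines what kind of overlap each tolerates. A minimal C sketch for illustration, not GMP's code; the limb typedef is an assumption:

  typedef unsigned long mp_limb_t;   /* assumption: 64-bit limb */

  void
  copyd_ref (mp_limb_t *rp, const mp_limb_t *up, long n)
  {
    for (long i = n - 1; i >= 0; i--)
      rp[i] = up[i];            /* descending: each up[i] is read before a
                                   higher write can clobber it, so an
                                   overlap with rp >= up is safe */
  }

  void
  copyi_ref (mp_limb_t *rp, const mp_limb_t *up, long n)
  {
    for (long i = 0; i < n; i++)
      rp[i] = up[i];            /* ascending: safe when rp <= up */
  }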
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1 +C AMD K10 1 +C AMD bd1 1.36 +C AMD bobcat 1.71 +C Intel P4 2-3 +C Intel core2 1 +C Intel NHM 1 +C Intel SBR 1 +C Intel atom 2 +C VIA nano 2 + + +IFSTD(`define(`rp',`%rdi')') +IFSTD(`define(`up',`%rsi')') +IFSTD(`define(`n', `%rdx')') + +IFDOS(`define(`rp',`%rcx')') +IFDOS(`define(`up',`%rdx')') +IFDOS(`define(`n', `%r8')') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) + .byte 0,0,0,0,0,0 +PROLOGUE(mpn_copyi) + lea -8(rp), rp + sub $4, n + jc L(end) + +L(top): mov (up), %rax + mov 8(up), %r9 + lea 32(rp), rp + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + mov %rax, -24(rp) + mov %r9, -16(rp) + sub $4, n + mov %r10, -8(rp) + mov %r11, (rp) + jnc L(top) + +L(end): shr R32(n) + jnc 1f + mov (up), %rax + mov %rax, 8(rp) + lea 8(rp), rp + lea 8(up), up +1: shr R32(n) + jnc 1f + mov (up), %rax + mov 8(up), %r9 + mov %rax, 8(rp) + mov %r9, 16(rp) +1: ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm new file mode 100644 index 0000000..7066bb4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm @@ -0,0 +1,53 @@ +dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) +dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh1_n)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh1_n)') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm new file mode 100644 index 0000000..5065120 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm @@ -0,0 +1,53 @@ +dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2) +dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh2_n)') +ifdef(`OPERATION_rsblsh2_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh2_n)') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm new file mode 100644 index 0000000..57abf31 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm @@ -0,0 +1,38 @@ +dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/coreinhm/aorrlsh_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm new file mode 100644 index 0000000..3f875ae --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm @@ -0,0 +1,225 @@ +dnl Core 2 mpn_add_err1_n, mpn_sub_err1_n + +dnl Contributed by David Harvey. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 4.14 +C Intel corei ? +C Intel atom ? +C VIA nano ? + + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`ep', `%rcx') +define(`yp', `%r8') +define(`n', `%r9') +define(`cy_param', `8(%rsp)') + +define(`el', `%rbx') +define(`eh', `%rbp') +define(`t0', `%r10') +define(`t1', `%r11') +define(`t2', `%r12') +define(`t3', `%r13') +define(`w0', `%r14') +define(`w1', `%r15') + +ifdef(`OPERATION_add_err1_n', ` + define(ADCSBB, adc) + define(func, mpn_add_err1_n)') +ifdef(`OPERATION_sub_err1_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_err1_n)') + +MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n) + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + mov cy_param, %rax + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + lea (up,n,8), up + lea (vp,n,8), vp + lea (rp,n,8), rp + + mov R32(n), R32(%r10) + and $3, R32(%r10) + jz L(0mod4) + cmp $2, R32(%r10) + jc L(1mod4) + jz L(2mod4) +L(3mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + xor R32(t0), R32(t0) + xor R32(t1), R32(t1) + lea -24(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + mov 8(up,n,8), w1 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc 16(yp), el + ADCSBB 8(vp,n,8), w1 + mov w1, 8(rp,n,8) + cmovc 8(yp), t0 + mov 16(up,n,8), w0 + ADCSBB 16(vp,n,8), w0 + mov w0, 16(rp,n,8) + cmovc (yp), t1 + setc %al C save carry + add t0, el + adc $0, eh + add t1, el + adc $0, eh + + add $3, n + jnz L(loop) + jmp L(end) + + ALIGN(16) +L(0mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + lea (yp,n,8), yp + neg n + jmp L(loop) + + ALIGN(16) +L(1mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + lea -8(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc (yp), el + setc %al C save carry + + add $1, n + jnz L(loop) + jmp L(end) + + ALIGN(16) +L(2mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + xor R32(t0), R32(t0) + lea -16(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + mov 8(up,n,8), w1 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc 8(yp), el + ADCSBB 8(vp,n,8), w1 + mov w1, 8(rp,n,8) + cmovc (yp), t0 + setc %al C save carry + add t0, el + adc $0, eh + + add $2, n + jnz L(loop) + jmp L(end) + + ALIGN(32) +L(loop): + mov (up,n,8), w0 + shr $1, %al C restore carry + mov -8(yp), t0 + mov $0, R32(t3) + ADCSBB (vp,n,8), w0 + cmovnc t3, t0 + mov w0, (rp,n,8) + mov 8(up,n,8), w1 + mov 
16(up,n,8), w0 + ADCSBB 8(vp,n,8), w1 + mov -16(yp), t1 + cmovnc t3, t1 + mov -24(yp), t2 + mov w1, 8(rp,n,8) + ADCSBB 16(vp,n,8), w0 + cmovnc t3, t2 + mov 24(up,n,8), w1 + ADCSBB 24(vp,n,8), w1 + cmovc -32(yp), t3 + setc %al C save carry + add t0, el + adc $0, eh + add t1, el + adc $0, eh + add t2, el + adc $0, eh + lea -32(yp), yp + mov w0, 16(rp,n,8) + add t3, el + adc $0, eh + add $4, n + mov w1, -8(rp,n,8) + jnz L(loop) + +L(end): + mov el, (ep) + mov eh, 8(ep) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm new file mode 100644 index 0000000..f9e0039 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm @@ -0,0 +1,150 @@ +dnl Intel mpn_add_n/mpn_sub_n optimised for Conroe, Nehalem. + +dnl Copyright 2006, 2007, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2 +C AMD K10 1.93\2 +C AMD bull 1.62\2.1 +C AMD pile 1.6\1.7 +C AMD steam +C AMD excavator +C AMD bobcat 2.79 +C AMD jaguar 2.54 +C Intel P4 10 +C Intel core2 2 +C Intel NHM 2 +C Intel SBR 2 +C Intel IBR 1.95 +C Intel HWL 1.72 +C Intel BWL 1.54 +C Intel SKL 1.52 +C Intel atom 9 +C Intel SLM 6.5 +C VIA nano 3 + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + xor %r8, %r8 +L(start): + mov (up), %r10 + mov (vp), %r11 + + lea (up,n,8), up + lea (vp,n,8), vp + lea (rp,n,8), rp + mov R32(n), R32(%rax) + neg n + and $3, R32(%rax) + je L(b00) + add %rax, n C clear low rcx bits for jrcxz + cmp $2, R32(%rax) + jl L(b01) + je L(b10) + +L(b11): neg %r8 C set cy + jmp L(e11) + +L(b00): neg %r8 C set cy + mov %r10, %r8 + mov %r11, %r9 + lea 4(n), n + jmp L(e00) + + nop + nop + nop +L(b01): neg %r8 C set cy + jmp L(top) + +L(b10): neg %r8 C set cy + mov %r10, %r8 + mov %r11, %r9 + jmp L(e10) + +L(end): ADCSBB %r11, %r10 + mov %r10, -8(rp) + mov R32(%rcx), R32(%rax) C clear eax, ecx contains 0 + adc R32(%rax), R32(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(top): jrcxz L(end) + mov (up,n,8), %r8 + mov (vp,n,8), %r9 + lea 4(n), n + ADCSBB %r11, %r10 + mov %r10, -40(rp,n,8) +L(e00): mov -24(up,n,8), %r10 + mov -24(vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, -32(rp,n,8) +L(e11): mov -16(up,n,8), %r8 + mov -16(vp,n,8), %r9 + ADCSBB %r11, %r10 + mov %r10, -24(rp,n,8) +L(e10): mov -8(up,n,8), %r10 + mov -8(vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, -16(rp,n,8) + jmp L(top) +EPILOGUE() + +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(start) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm new file mode 100644 index 0000000..a7a5d6e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm @@ -0,0 +1,188 @@ +dnl x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2". + +dnl Copyright 2003-2005, 2007-2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
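For reference, the operation implemented by the addmul/submul code below is rp[] = rp[] +/- up[] * v0 over n limbs, returning the limb that falls off the top. A portable-C sketch of the addmul case, for illustration only and not GMP's code; the 128-bit type and the names are assumptions about the compiler, where the assembly instead uses mul with an add/adc chain:

  typedef unsigned long mp_limb_t;   /* assumption: 64-bit limb */

  mp_limb_t
  addmul_1_ref (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0)
  {
    mp_limb_t cy = 0;
    for (long i = 0; i < n; i++)
      {
        unsigned __int128 p = (unsigned __int128) up[i] * v0 + rp[i] + cy;
        rp[i] = (mp_limb_t) p;        /* low limb of the accumulation */
        cy = (mp_limb_t) (p >> 64);   /* high limb becomes the next carry */
      }
    return cy;
  }

The submul variant is the same loop with the product subtracted and the running borrow returned instead of the carry.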
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.52 +C AMD K10 4.01 +C AMD bull 4.98 +C AMD pile 4.83 +C AMD steam +C AMD excavator +C AMD bobcat 5.56 +C AMD jaguar 5.54 +C Intel P4 16.3 17.3 +C Intel core2 4.32 4.61 +C Intel NHM 5.08 +C Intel SBR 4.04 +C Intel IBR 3.95 +C Intel HWL 3.66 +C Intel BWL 2.87 +C Intel SKL 2.79 +C Intel atom 20.6 +C Intel SLM 7.6 +C VIA nano 5.25 + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`v0', `%rcx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') + define(`func_1c', `mpn_addmul_1c') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') + define(`func_1c', `mpn_submul_1c') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + + C For DOS, on the stack we have four saved registers, return address, + C space for four register arguments, and finally the carry input. + +IFDOS(` define(`carry_in', `72(%rsp)')') dnl +IFSTD(` define(`carry_in', `%r8')') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_1c) + FUNC_ENTRY(4) + push %rbx + push %rbp + lea (%rdx), %rbx + neg %rbx + + mov (up), %rax + mov (rp), %r10 + + lea -16(rp,%rdx,8), rp + lea (up,%rdx,8), up + mul %rcx + add carry_in, %rax + adc $0, %rdx + jmp L(start_nc) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + push %rbp + lea (%rdx), %rbx + neg %rbx + + mov (up), %rax + mov (rp), %r10 + + lea -16(rp,%rdx,8), rp + lea (up,%rdx,8), up + mul %rcx + +L(start_nc): + test $1, R8(%rbx) + jnz L(odd) + + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + mul %rcx + add $2, %rbx + jz L(n2) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + jmp L(mid) + + ALIGN(8) +L(odd): inc %rbx + jz L(n1) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + mul %rcx + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + jmp L(e) + + ALIGN(16) +L(top): mul %rcx + ADDSUB %r8, %r10 + lea (%rax), %r8 + mov (up,%rbx,8), %rax + adc %r9, %r11 + mov %r10, -8(rp,%rbx,8) + mov (rp,%rbx,8), %r10 + lea (%rdx), %r9 + adc $0, %rbp +L(mid): mul %rcx + ADDSUB %r11, %r10 + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + adc %rbp, %r8 + mov %r10, (rp,%rbx,8) + mov 8(rp,%rbx,8), %r10 + lea (%rdx), %rbp + adc $0, %r9 +L(e): add $2, %rbx + js L(top) + + mul %rcx + ADDSUB %r8, %r10 + adc %r9, %r11 + mov %r10, -8(rp) + adc %rbx, %rbp C rbx = 0 +L(n2): mov (rp), %r10 + ADDSUB %r11, %r10 + adc %rbp, %rax + mov %r10, (rp) + adc %rbx, %rdx C rbx = 0 +L(n1): mov 8(rp), %r10 + ADDSUB %rax, %r10 + mov %r10, 8(rp) + mov R32(%rbx), R32(%rax) C rbx = 0 + adc %rdx, %rax + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/com.asm b/gmp-6.3.0/mpn/x86_64/core2/com.asm new file mode 100644 index 0000000..d7d9f79 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/copyd.asm b/gmp-6.3.0/mpn/x86_64/core2/copyd.asm new file mode 100644 index 0000000..57ea0e5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/copyi.asm b/gmp-6.3.0/mpn/x86_64/core2/copyi.asm new file mode 100644 index 0000000..f0c7607 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm new file mode 100644 index 0000000..1b3f139 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm @@ -0,0 +1,243 @@ +dnl x86-64 mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 2004, 2005, 2007-2010, 2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C norm unorm frac +C AMD K8,K9 15 15 12 +C AMD K10 15 15 12 +C Intel P4 44 44 43 +C Intel core2 24 24 19.5 +C Intel corei 19 19 18 +C Intel atom 51 51 36 +C VIA nano 46 44 22.5 + +C mp_limb_t +C mpn_divrem_1 (mp_ptr qp, mp_size_t fn, +C mp_srcptr np, mp_size_t nn, mp_limb_t d) + +C mp_limb_t +C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn, +C mp_srcptr np, mp_size_t nn, mp_limb_t d, +C mp_limb_t dinv, int cnt) + +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`fn_param', `%rsi') +define(`up_param', `%rdx') +define(`un_param', `%rcx') +define(`d', `%r8') +define(`dinv', `%r9') C only for mpn_preinv_divrem_1 +C shift passed on stack C only for mpn_preinv_divrem_1 + +define(`cnt', `%rcx') +define(`up', `%rsi') +define(`fn', `%r12') +define(`un', `%rbx') + + +C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C cnt qp d dinv + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFSTD(`define(`CNTOFF', `40($1)')') +IFDOS(`define(`CNTOFF', `104($1)')') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_preinv_divrem_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + xor R32(%rax), R32(%rax) + push %r13 + push %r12 + push %rbp + push %rbx + + mov fn_param, fn + mov un_param, un + add fn_param, un_param + mov up_param, up + + lea -8(qp,un_param,8), qp + + mov CNTOFF(%rsp), R8(cnt) + shl R8(cnt), d + jmp L(ent) +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_divrem_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + xor R32(%rax), R32(%rax) + push %r13 + push %r12 + push %rbp + push %rbx + + mov fn_param, fn + mov un_param, un + add fn_param, un_param + mov up_param, up + je L(ret) + + lea -8(qp,un_param,8), qp + xor R32(%rbp), R32(%rbp) + +L(unnormalized): + test un, un + je L(44) + mov -8(up,un,8), %rax + cmp d, %rax + jae L(44) + mov %rbp, 
(qp) + mov %rax, %rbp + lea -8(qp), qp + je L(ret) + dec un +L(44): + bsr d, %rcx + not R32(%rcx) + sal R8(%rcx), d + sal R8(%rcx), %rbp + + push %rcx +IFSTD(` push %rdi ') +IFSTD(` push %rsi ') + push %r8 +IFSTD(` sub $8, %rsp ') +IFSTD(` mov d, %rdi ') +IFDOS(` sub $40, %rsp ') +IFDOS(` mov d, %rcx ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + pop %r8 +IFSTD(` pop %rsi ') +IFSTD(` pop %rdi ') + pop %rcx + + mov %rax, dinv + mov %rbp, %rax + test un, un + je L(frac) + +L(ent): mov -8(up,un,8), %rbp + shr R8(%rcx), %rax + shld R8(%rcx), %rbp, %rax + sub $2, un + js L(end) + + ALIGN(16) +L(top): lea 1(%rax), %r11 + mul dinv + mov (up,un,8), %r10 + shld R8(%rcx), %r10, %rbp + mov %rbp, %r13 + add %rax, %r13 + adc %r11, %rdx + mov %rdx, %r11 + imul d, %rdx + sub %rdx, %rbp + lea (d,%rbp), %rax + sub $8, qp + cmp %r13, %rbp + cmovc %rbp, %rax + adc $-1, %r11 + cmp d, %rax + jae L(ufx) +L(uok): dec un + mov %r11, 8(qp) + mov %r10, %rbp + jns L(top) + +L(end): lea 1(%rax), %r11 + sal R8(%rcx), %rbp + mul dinv + add %rbp, %rax + adc %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul d, %rdx + sub %rdx, %rbp + mov d, %rax + add %rbp, %rax + cmp %r11, %rbp + cmovc %rbp, %rax + adc $-1, %r13 + cmp d, %rax + jae L(efx) +L(eok): mov %r13, (qp) + sub $8, qp + jmp L(frac) + +L(ufx): sub d, %rax + inc %r11 + jmp L(uok) +L(efx): sub d, %rax + inc %r13 + jmp L(eok) + +L(frac):mov d, %rbp + neg %rbp + jmp L(fent) + + ALIGN(16) C K8-K10 P6-CNR P6-NHM P4 +L(ftop):mul dinv C 0,12 0,17 0,17 + add %r11, %rdx C 5 8 10 + mov %rax, %r11 C 4 8 3 + mov %rdx, %r13 C 6 9 11 + imul %rbp, %rdx C 6 9 11 + mov d, %rax C + add %rdx, %rax C 10 14 14 + cmp %r11, %rdx C 10 14 14 + cmovc %rdx, %rax C 11 15 15 + adc $-1, %r13 C + mov %r13, (qp) C + sub $8, qp C +L(fent):lea 1(%rax), %r11 C + dec fn C + jns L(ftop) C + + shr R8(%rcx), %rax +L(ret): pop %rbx + pop %rbp + pop %r12 + pop %r13 + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm new file mode 100644 index 0000000..b00451f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm @@ -0,0 +1,93 @@ +dnl AMD64 mpn_gcd_11 optimised for Intel CNR, PNR, SBR, IBR. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K8,K9 ? 
+C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 ? +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR 4.22 * +C Intel PNR 4.22 * +C Intel NHM 4.97 +C Intel WSM 5.17 +C Intel SBR 4.83 * +C Intel IBR 4.16 * +C Intel HWL 3.84 +C Intel BWL 3.76 +C Intel SKL 3.83 +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? + +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + jmp L(odd) + + ALIGN(16) +L(top): cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shr R8(%rcx), u0 +L(odd): mov v0, %rdx + sub u0, %rdx C v - u + bsf %rdx, %rcx + mov u0, %rax + sub v0, u0 C u - v + jnz L(top) + +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm new file mode 100644 index 0000000..b5aa73b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm @@ -0,0 +1,137 @@ +dnl AMD64 mpn_gcd_22. Assumes useful bsf, useful shrd, no tzcnt, no shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 ? +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR 8.7 +C Intel PNR 8.7 +C Intel NHM 9.2 +C Intel WSM 9.2 +C Intel SBR 9.1 +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? 
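Both the gcd_11 loop above and the two-limb gcd_22 below are built from the same binary-GCD step: with both operands odd, subtract, locate the trailing zeros of the difference with bsf, shift them out, and keep the smaller value. A single-limb C sketch of that step, for illustration only and not GMP's code; the typedef and the ctz builtin are assumptions about the compiler:

  typedef unsigned long mp_limb_t;   /* assumption: 64-bit limb */

  mp_limb_t
  gcd_11_ref (mp_limb_t u, mp_limb_t v)   /* u and v both odd */
  {
    while (u != v)
      {
        if (u < v)
          { mp_limb_t t = u; u = v; v = t; }   /* keep v = min(u,v) */
        u -= v;                                /* difference is even, nonzero */
        u >>= __builtin_ctzl (u);              /* like the bsf/shr pair above */
      }
    return u;
  }

gcd_22 applies the same idea to 128-bit operands, using sbb for the double-limb subtract and shrd to shift the pair right by the count.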
+ + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + mov v0_param, v0 + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 + + mov u0, s0 + mov u1, s1 + + bsf t0, cnt + + sub v0, u0 + sbb v1, u1 + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + shrd R8(cnt), u1, u0 + shr R8(cnt), u1 + + mov v1, t1 + or u1, t1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 + mov u0, s0 + mov u1, s1 + bsf t0, cnt + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h new file mode 100644 index 0000000..44f1494 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h @@ -0,0 +1,222 @@ +/* Core 2 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3000 MHz Penryn */ +/* FFT tuning limit = 116,220,984 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 16 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 26 + +#define DIV_1_VS_MUL_1_PERCENT 284 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 184 +#define MUL_TOOM6H_THRESHOLD 256 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 79 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 102 +#define SQR_TOOM4_THRESHOLD 160 +#define SQR_TOOM6_THRESHOLD 366 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 368 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 368, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 79,11}, { 47,10}, { 95,12}, { 31, 9}, \ + { 255,10}, { 135,11}, { 79,10}, { 159, 9}, \ + { 319,11}, { 95,10}, { 191, 9}, { 383,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,10}, { 415,13}, { 63,12}, \ + { 127,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 415,12}, { 223,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 575,13}, { 319,12}, { 703,13}, \ + { 383,12}, { 799,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,13}, { 703,14}, { 383,13}, { 831,12}, \ + { 1663,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \ + { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2559,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,16}, { 511,15}, { 1023,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \ + { 4991,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 
3327,14}, { 6911,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 176 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 308, 5}, { 17, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 79,11}, { 47,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255,11}, { 79,10}, \ + { 159, 6}, { 2559, 7}, { 1343, 6}, { 2687, 7}, \ + { 1407, 9}, { 383,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \ + { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319,11}, { 175,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,10}, { 415,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,12}, { 223,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 575,12}, { 319,11}, { 639,12}, \ + { 351,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 575,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 799,13}, \ + { 447,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1023,13}, { 575,12}, { 1151,13}, { 639,12}, \ + { 1279,13}, { 703,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,14}, { 767,13}, \ + { 1599,12}, { 3199,13}, { 1663,14}, { 895,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,16}, { 511,15}, { 1023,14}, { 2303,13}, \ + { 4607,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,13}, { 5887,12}, { 11775,15}, { 1535,14}, \ + { 3455,15}, { 1791,14}, { 3583,13}, { 7167,14}, \ + { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,13}, \ + { 11775,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 183 +#define SQR_FFT_THRESHOLD 3520 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 67 +#define MULLO_MUL_N_THRESHOLD 9174 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 11 +#define SQRLO_SQR_THRESHOLD 7035 + +#define DC_DIV_QR_THRESHOLD 53 +#define DC_DIVAPPR_Q_THRESHOLD 163 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 76 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 158 +#define INV_APPR_THRESHOLD 167 + +#define BINV_NEWTON_THRESHOLD 248 +#define REDC_1_TO_REDC_N_THRESHOLD 44 + +#define MU_DIV_QR_THRESHOLD 1187 +#define MU_DIVAPPR_Q_THRESHOLD 1210 +#define MUPI_DIV_QR_THRESHOLD 73 +#define MU_BDIV_QR_THRESHOLD 1017 +#define MU_BDIV_Q_THRESHOLD 1187 + +#define POWM_SEC_TABLE 1,64,105,579,1486 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 +#define SET_STR_DC_THRESHOLD 134 +#define SET_STR_PRECOMPUTE_THRESHOLD 1752 + +#define FAC_DSC_THRESHOLD 351 
+#define FAC_ODD_THRESHOLD 27 + +#define MATRIX22_STRASSEN_THRESHOLD 18 +#define HGCD2_DIV1_METHOD 3 /* 2.14% faster than 5 */ +#define HGCD_THRESHOLD 118 +#define HGCD_APPR_THRESHOLD 161 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 416 +#define GCDEXT_DC_THRESHOLD 351 +#define JACOBI_BASE_METHOD 4 /* 3.56% faster than 1 */ + +/* Tuneup completed successfully, took 132491 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm b/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm new file mode 100644 index 0000000..ded7b67 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm @@ -0,0 +1,210 @@ +dnl AMD64 SSSE3 mpn_hamdist -- hamming distance. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb good for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD zen ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 n/a +C Intel CNR 4.50 y +C Intel PNR 3.28 y +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C VIA nano ? + +C TODO +C * This was hand-written without too much thought about optimal insn +C selection; check to see of it can be improved. +C * Consider doing some instruction scheduling. 
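What the routine computes is simply the population count of up[] XOR vp[]; the SSSE3 code below gets there with pshufb nibble-table lookups and psadbw accumulation rather than a per-limb builtin. A portable-C sketch for reference, an illustration rather than GMP's code; the typedef and the popcount builtin are assumptions about the compiler:

  typedef unsigned long mp_limb_t;   /* assumption: 64-bit limb */

  unsigned long
  hamdist_ref (const mp_limb_t *up, const mp_limb_t *vp, long n)
  {
    unsigned long count = 0;
    for (long i = 0; i < n; i++)
      count += __builtin_popcountl (up[i] ^ vp[i]);   /* bits that differ */
    return count;
  }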
+ +define(`up', `%rdi') +define(`vp', `%rsi') +define(`n', `%rdx') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_hamdist) + lea L(cnsts)(%rip), %r9 + +ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)', + `define(`OFF1',64) define(`OFF2',80)') + movdqa OFF1`'(%r9), %xmm7 + movdqa OFF2`'(%r9), %xmm6 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm8, %xmm8 + + mov R32(n), R32(%rax) + and $7, R32(%rax) +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + +L(1): movq (up), %xmm1 + add $8, up + movq (vp), %xmm10 + add $8, vp + pxor %xmm10, %xmm1 + jmp L(e1) + +L(2): add $-48, up + add $-48, vp + jmp L(e2) + +L(3): movq (up), %xmm1 + add $-40, up + movq (vp), %xmm10 + add $-40, vp + pxor %xmm10, %xmm1 + jmp L(e3) + +L(4): add $-32, up + add $-32, vp + jmp L(e4) + +L(5): movq (up), %xmm1 + add $-24, up + movq (vp), %xmm10 + add $-24, vp + pxor %xmm10, %xmm1 + jmp L(e5) + +L(6): add $-16, up + add $-16, vp + jmp L(e6) + +L(7): movq (up), %xmm1 + add $-8, up + movq (vp), %xmm10 + add $-8, vp + pxor %xmm10, %xmm1 + jmp L(e7) + + ALIGN(32) +L(top): lddqu (up), %xmm1 + lddqu (vp), %xmm10 + pxor %xmm10, %xmm1 +L(e7): movdqa %xmm6, %xmm0 C copy mask register + movdqa %xmm7, %xmm2 C copy count register + movdqa %xmm7, %xmm3 C copy count register + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e6): lddqu 16(up), %xmm1 + lddqu 16(vp), %xmm10 + pxor %xmm10, %xmm1 +L(e5): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e4): lddqu 32(up), %xmm1 + lddqu 32(vp), %xmm10 + pxor %xmm10, %xmm1 +L(e3): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e2): lddqu 48(up), %xmm1 + add $64, up + lddqu 48(vp), %xmm10 + add $64, vp + pxor %xmm10, %xmm1 +L(e1): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + psadbw %xmm5, %xmm4 C sum to 8 x 16-bit counts + paddb %xmm2, %xmm3 + paddq %xmm4, %xmm8 C sum to 2 x 64-bit counts + movdqa %xmm3, %xmm4 + sub $8, n + jg L(top) + + psadbw %xmm5, %xmm4 + paddq %xmm4, %xmm8 + pshufd $14, %xmm8, %xmm0 + paddq %xmm8, %xmm0 + movd %xmm0, %rax + ret +EPILOGUE() +DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + JMPENT( L(top), L(cnsts)) + JMPENT( L(1), L(cnsts)) + JMPENT( L(2), L(cnsts)) + JMPENT( L(3), L(cnsts)) + JMPENT( L(4), L(cnsts)) + JMPENT( L(5), L(cnsts)) + JMPENT( L(6), L(cnsts)) + JMPENT( L(7), L(cnsts)) + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 + .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) diff --git a/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm b/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm new file mode 100644 index 0000000..5ff174c --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm @@ -0,0 +1,285 @@ +dnl AMD64 logops. + +dnl Copyright 2004-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l c/l c/l good +C var-1 var-2 var-3 for cpu? +C AMD K8,K9 +C AMD K10 1.52 1.75 1.75 n +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD bt1 2.67 ~2.79 ~2.79 = +C AMD bt2 2.15 2.65 2.65 n +C AMD zen 1.5 1.5 1.5 = +C Intel P4 +C Intel PNR 2.0 2.0 2.0 = +C Intel NHM 2.0 2.0 2.0 = +C Intel SBR 1.5 1.5 1.5 y +C Intel IBR 1.47 1.48 1.48 y +C Intel HWL 1.11 1.35 1.35 y +C Intel BWL 1.09 1.30 1.30 y +C Intel SKL 1.21 1.27 1.27 y +C Intel atom 3.31 3.57 3.57 y +C Intel SLM 3.0 3.0 3.0 = +C VIA nano + +ifdef(`OPERATION_and_n',` + define(`func',`mpn_and_n') + define(`VARIANT_1') + define(`LOGOP',`and')') +ifdef(`OPERATION_andn_n',` + define(`func',`mpn_andn_n') + define(`VARIANT_2') + define(`LOGOP',`and')') +ifdef(`OPERATION_nand_n',` + define(`func',`mpn_nand_n') + define(`VARIANT_3') + define(`LOGOP',`and')') +ifdef(`OPERATION_ior_n',` + define(`func',`mpn_ior_n') + define(`VARIANT_1') + define(`LOGOP',`or')') +ifdef(`OPERATION_iorn_n',` + define(`func',`mpn_iorn_n') + define(`VARIANT_2') + define(`LOGOP',`or')') +ifdef(`OPERATION_nior_n',` + define(`func',`mpn_nior_n') + define(`VARIANT_3') + define(`LOGOP',`or')') +ifdef(`OPERATION_xor_n',` + define(`func',`mpn_xor_n') + define(`VARIANT_1') + define(`LOGOP',`xor')') +ifdef(`OPERATION_xnor_n',` + define(`func',`mpn_xnor_n') + define(`VARIANT_2') + define(`LOGOP',`xor')') + +define(`addptr', `lea $1($2), $2') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n',`%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + +ifdef(`VARIANT_1',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + mov R32(%rcx), R32(%rax) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up), %r8 + mov %r8, (rp) + inc n + addptr( -8, up) + addptr( -8, vp) + addptr( -8, rp) + jmp L(e11) +L(b10): add $2, n + addptr( -16, up) + addptr( -16, vp) + addptr( -16, rp) + jmp L(e10) +L(b01): LOGOP (up), %r8 + mov %r8, (rp) + dec n + jz L(ret) + addptr( 8, up) + addptr( 8, vp) + addptr( 8, rp) + + ALIGN(16) +L(top): mov (vp), %r8 +L(b00): mov 8(vp), %r9 + LOGOP (up), %r8 + LOGOP 8(up), %r9 + mov %r8, (rp) + mov %r9, 8(rp) +L(e11): mov 16(vp), %r8 +L(e10): mov 24(vp), %r9 + addptr( 32, vp) + LOGOP 16(up), %r8 + LOGOP 24(up), %r9 + addptr( 32, up) + mov %r8, 16(rp) + mov %r9, 24(rp) + addptr( 32, rp) + sub $4, n + jnz L(top) + +L(ret): FUNC_EXIT() + ret 
+EPILOGUE() +') + +ifdef(`VARIANT_2',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + not %r8 + mov R32(%rcx), R32(%rax) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up), %r8 + mov %r8, (rp) + inc n + addptr( -8, up) + addptr( -8, vp) + addptr( -8, rp) + jmp L(e11) +L(b10): add $2, n + addptr( -16, up) + addptr( -16, vp) + addptr( -16, rp) + jmp L(e10) +L(b01): LOGOP (up), %r8 + mov %r8, (rp) + dec n + jz L(ret) + addptr( 8, up) + addptr( 8, vp) + addptr( 8, rp) + + ALIGN(16) +L(top): mov (vp), %r8 + not %r8 +L(b00): mov 8(vp), %r9 + not %r9 + LOGOP (up), %r8 + LOGOP 8(up), %r9 + mov %r8, (rp) + mov %r9, 8(rp) +L(e11): mov 16(vp), %r8 + not %r8 +L(e10): mov 24(vp), %r9 + not %r9 + addptr( 32, vp) + LOGOP 16(up), %r8 + LOGOP 24(up), %r9 + addptr( 32, up) + mov %r8, 16(rp) + mov %r9, 24(rp) + addptr( 32, rp) + sub $4, n + jnz L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') + +ifdef(`VARIANT_3',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + mov R32(%rcx), R32(%rax) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up), %r8 + not %r8 + mov %r8, (rp) + inc n + addptr( -8, up) + addptr( -8, vp) + addptr( -8, rp) + jmp L(e11) +L(b10): add $2, n + addptr( -16, up) + addptr( -16, vp) + addptr( -16, rp) + jmp L(e10) +L(b01): LOGOP (up), %r8 + not %r8 + mov %r8, (rp) + dec n + jz L(ret) + addptr( 8, up) + addptr( 8, vp) + addptr( 8, rp) + + ALIGN(16) +L(top): mov (vp), %r8 +L(b00): mov 8(vp), %r9 + LOGOP (up), %r8 + not %r8 + LOGOP 8(up), %r9 + not %r9 + mov %r8, (rp) + mov %r9, 8(rp) +L(e11): mov 16(vp), %r8 +L(e10): mov 24(vp), %r9 + addptr( 32, vp) + LOGOP 16(up), %r8 + not %r8 + LOGOP 24(up), %r9 + addptr( 32, up) + not %r9 + mov %r8, 16(rp) + mov %r9, 24(rp) + addptr( 32, rp) + sub $4, n + jnz L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') diff --git a/gmp-6.3.0/mpn/x86_64/core2/lshift.asm b/gmp-6.3.0/mpn/x86_64/core2/lshift.asm new file mode 100644 index 0000000..9016a71 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/lshift.asm @@ -0,0 +1,145 @@ +dnl x86-64 mpn_lshift optimised for Conroe/Penryn and Nehalem. + +dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
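+dnl Editorial note: a rough portable-C sketch of the routine's contract,
+dnl added for orientation only (not part of the GMP sources); 64-bit limbs
+dnl and 1 <= cnt <= 63 are assumed.
+dnl
+dnl   mp_limb_t lshift (mp_limb_t *rp, const mp_limb_t *up,
+dnl                     mp_size_t n, unsigned cnt)
+dnl   {
+dnl     mp_limb_t ret = up[n-1] >> (64 - cnt);  /* bits shifted out at the top */
+dnl     for (mp_size_t i = n - 1; i > 0; i--)
+dnl       rp[i] = (up[i] << cnt) | (up[i-1] >> (64 - cnt));
+dnl     rp[0] = up[0] << cnt;
+dnl     return ret;
+dnl   }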
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core2 1.32 +C Intel NHM 1.30 (drops to 2.5 for n > 256) +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + + xor R32(%rax), R32(%rax) + + test $1, R8(n) + jnz L(bx1) +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): lea -8(up,n,8), up + lea 16(rp,n,8), rp + mov (up), %r10 + mov -8(up), %r11 + shld R8(cnt), %r10, %rax + mov -16(up), %r8 + shr $2, n + jmp L(00) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): lea -16(up,n,8), up + lea 8(rp,n,8), rp + mov 8(up), %r9 + shld R8(cnt), %r9, %rax + shr $2, n + jz L(1) + mov (up), %r10 + mov -8(up), %r11 + jmp L(01) + +L(b10): lea -24(up,n,8), up + lea (rp,n,8), rp + mov 16(up), %r8 + mov 8(up), %r9 + shld R8(cnt), %r8, %rax + shr $2, n + jz L(2) + mov (up), %r10 + jmp L(10) + + ALIGN(16) +L(b11): lea -32(up,n,8), up + lea -8(rp,n,8), rp + mov 24(up), %r11 + mov 16(up), %r8 + mov 8(up), %r9 + shld R8(cnt), %r11, %rax + shr $2, n + jz L(end) + + ALIGN(16) +L(top): shld R8(cnt), %r8, %r11 + mov (up), %r10 + mov %r11, (rp) +L(10): shld R8(cnt), %r9, %r8 + mov -8(up), %r11 + mov %r8, -8(rp) +L(01): shld R8(cnt), %r10, %r9 + mov -16(up), %r8 + mov %r9, -16(rp) +L(00): shld R8(cnt), %r11, %r10 + mov -24(up), %r9 + add $-32, up + mov %r10, -24(rp) + add $-32, rp + dec n + jnz L(top) + +L(end): shld R8(cnt), %r8, %r11 + mov %r11, (rp) +L(2): shld R8(cnt), %r9, %r8 + mov %r8, -8(rp) +L(1): shl R8(cnt), %r9 + mov %r9, -16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm new file mode 100644 index 0000000..c428f13 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm @@ -0,0 +1,159 @@ +dnl x86-64 mpn_lshiftc optimised for Conroe/Penryn and Nehalem. + +dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
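+dnl Editorial note (not part of the GMP sources): mpn_lshiftc behaves like
+dnl mpn_lshift except that the stored limbs are one's-complemented, i.e.,
+dnl sketching with 64-bit limbs,
+dnl   rp[i] = ~((up[i] << cnt) | (up[i-1] >> (64 - cnt)));
+dnl the return value (the bits shifted out at the top) is not complemented,
+dnl as the code below shows.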
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core2 1.52 +C Intel NHM 1.78 (just 2.15 for n < 256) +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +C TODO +C * This runs poorly on Nehalem compared to plain lshift, in particular for +C n < 256. + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + + xor R32(%rax), R32(%rax) + + test $1, R8(n) + jnz L(bx1) +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): lea -8(up,n,8), up + lea 16(rp,n,8), rp + mov (up), %r10 + mov -8(up), %r11 + shld R8(cnt), %r10, %rax + mov -16(up), %r8 + shr $2, n + shld R8(cnt), %r11, %r10 + jmp L(00) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): lea -16(up,n,8), up + lea 8(rp,n,8), rp + mov 8(up), %r9 + shld R8(cnt), %r9, %rax + shr $2, n + jz L(1) + mov (up), %r10 + mov -8(up), %r11 + shld R8(cnt), %r10, %r9 + jmp L(01) + +L(b10): lea -24(up,n,8), up + lea (rp,n,8), rp + mov 16(up), %r8 + mov 8(up), %r9 + shld R8(cnt), %r8, %rax + shr $2, n + jz L(2) + mov (up), %r10 + shld R8(cnt), %r9, %r8 + jmp L(10) + + ALIGN(16) +L(b11): lea -32(up,n,8), up + lea -8(rp,n,8), rp + mov 24(up), %r11 + mov 16(up), %r8 + mov 8(up), %r9 + shld R8(cnt), %r11, %rax + shr $2, n + jz L(end) + + ALIGN(16) +L(top): shld R8(cnt), %r8, %r11 + mov (up), %r10 + not %r11 + shld R8(cnt), %r9, %r8 + mov %r11, (rp) +L(10): mov -8(up), %r11 + not %r8 + shld R8(cnt), %r10, %r9 + mov %r8, -8(rp) +L(01): mov -16(up), %r8 + not %r9 + shld R8(cnt), %r11, %r10 + mov %r9, -16(rp) +L(00): mov -24(up), %r9 + not %r10 + add $-32, up + mov %r10, -24(rp) + add $-32, rp + dec n + jnz L(top) + +L(end): shld R8(cnt), %r8, %r11 + not %r11 + mov %r11, (rp) +L(2): shld R8(cnt), %r9, %r8 + not %r8 + mov %r8, -8(rp) +L(1): shl R8(cnt), %r9 + not %r9 + mov %r9, -16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm new file mode 100644 index 0000000..d16be85 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm @@ -0,0 +1,975 @@ +dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere. +dnl It also seems good for Conroe/Wolfdale. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 mul_2 mul_3 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core 4.0 4.0 - 4.18-4.25 +C Intel NHM 3.75 3.8 - 4.06-4.2 +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C Code structure: +C +C +C m_1(0m4) m_1(1m4) m_1(2m4) m_1(3m4) +C | | | | +C m_2(0m4) | m_2(1m4) | m_2(2m4) | m_2(3m4) | +C | / | / | / | / +C | / | / | / | / +C | / | / | / | / +C \|/ |/_ \|/ |/_ \|/ |/_ \|/ |/_ +C _____ _____ _____ _____ +C / \ / \ / \ / \ +C \|/ | \|/ | \|/ | \|/ | +C am_2(0m4) | am_2(1m4) | am_2(2m4) | am_2(3m4) | +C \ /|\ \ /|\ \ /|\ \ /|\ +C \_____/ \_____/ \_____/ \_____/ + +C TODO +C * Tune. None done so far. +C * Currently 2687 bytes, making it smaller would be nice. +C * Implement some basecases, say for un < 4. +C * Try zeroing with xor in m2 loops. +C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication +C between loop header and wind-down code. +C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This save a byte. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +C Define this to $1 to use late loop index variable as zero, $2 to use an +C explicit $0. +define(`Z',`$1') + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param', `%rdx') +define(`vp_param', `%rcx') C FIXME reallocate vp to rcx but watch performance! +define(`vn_param', `%r8') + +define(`un', `%r9') +define(`vn', `(%rsp)') + +define(`v0', `%r10') +define(`v1', `%r11') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r12') +define(`i', `%r13') +define(`vp', `%r14') + +define(`X0', `%r8') +define(`X1', `%r15') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +define(`N', 85) +ifdef(`N',,`define(`N',0)') +define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + mov (up), %rax C shared for mul_1 and mul_2 + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + mov (vp_param), v0 C shared for mul_1 and mul_2 + + xor un, un + sub un_param, un C un = -un_param + + lea (up,un_param,8), up + lea (rp,un_param,8), rp + + mul v0 C shared for mul_1 and mul_2 + + test $1, R8(vn_param) + jz L(m2) + + lea 8(vp_param), vp C FIXME: delay until known needed + + test $1, R8(un) + jnz L(m1x1) + +L(m1x0):test $2, R8(un) + jnz L(m1s2) + +L(m1s0): + lea (un), i + mov %rax, (rp,un,8) + mov 8(up,un,8), %rax + mov %rdx, w0 C FIXME: Use lea? + lea L(do_am0)(%rip), %rbp + jmp L(m1e0) + +L(m1s2): + lea 2(un), i + mov %rax, (rp,un,8) + mov 8(up,un,8), %rax + mov %rdx, w0 C FIXME: Use lea? 
+ mul v0 + lea L(do_am2)(%rip), %rbp + test i, i + jnz L(m1e2) + add %rax, w0 + adc $0, %rdx + mov w0, I(-8(rp),8(rp,un,8)) + mov %rdx, I((rp),16(rp,un,8)) + jmp L(ret2) + +L(m1x1):test $2, R8(un) + jz L(m1s3) + +L(m1s1): + lea 1(un), i + mov %rax, (rp,un,8) + test i, i + jz L(1) + mov 8(up,un,8), %rax + mov %rdx, w1 C FIXME: Use lea? + lea L(do_am1)(%rip), %rbp + jmp L(m1e1) +L(1): mov %rdx, I((rp),8(rp,un,8)) + jmp L(ret2) + +L(m1s3): + lea -1(un), i + mov %rax, (rp,un,8) + mov 8(up,un,8), %rax + mov %rdx, w1 C FIXME: Use lea? + lea L(do_am3)(%rip), %rbp + jmp L(m1e3) + + ALIGNx +L(m1top): + mul v0 + mov w1, -16(rp,i,8) +L(m1e2):xor R32(w1), R32(w1) + add %rax, w0 + mov (up,i,8), %rax + adc %rdx, w1 + mov w0, -8(rp,i,8) +L(m1e1):xor R32(w0), R32(w0) + mul v0 + add %rax, w1 + mov 8(up,i,8), %rax + adc %rdx, w0 + mov w1, (rp,i,8) +L(m1e0):xor R32(w1), R32(w1) + mul v0 + add %rax, w0 + mov 16(up,i,8), %rax + adc %rdx, w1 + mov w0, 8(rp,i,8) +L(m1e3):xor R32(w0), R32(w0) + mul v0 + add %rax, w1 + mov 24(up,i,8), %rax + adc %rdx, w0 + add $4, i + js L(m1top) + + mul v0 + mov w1, I(-16(rp),-16(rp,i,8)) + add %rax, w0 + adc $0, %rdx + mov w0, I(-8(rp),-8(rp,i,8)) + mov %rdx, I((rp),(rp,i,8)) + + dec vn_param + jz L(ret2) + lea -8(rp), rp + jmp *%rbp + +L(m2): + mov 8(vp_param), v1 + lea 16(vp_param), vp C FIXME: delay until known needed + + test $1, R8(un) + jnz L(bx1) + +L(bx0): test $2, R8(un) + jnz L(b10) + +L(b00): lea (un), i + mov %rax, (rp,un,8) + mov %rdx, w1 C FIXME: Use lea? + mov (up,un,8), %rax + mov $0, R32(w2) + jmp L(m2e0) + +L(b10): lea -2(un), i + mov %rax, w2 C FIXME: Use lea? + mov (up,un,8), %rax + mov %rdx, w3 C FIXME: Use lea? + mov $0, R32(w0) + jmp L(m2e2) + +L(bx1): test $2, R8(un) + jz L(b11) + +L(b01): lea 1(un), i + mov %rax, (rp,un,8) + mov (up,un,8), %rax + mov %rdx, w0 C FIXME: Use lea? + mov $0, R32(w1) + jmp L(m2e1) + +L(b11): lea -1(un), i + mov %rax, w1 C FIXME: Use lea? + mov (up,un,8), %rax + mov %rdx, w2 C FIXME: Use lea? 
+ mov $0, R32(w3) + jmp L(m2e3) + + ALIGNx +L(m2top0): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) +L(m2e0):mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top0) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, I((rp),(rp,i,8)) + mov w1, I(8(rp),8(rp,i,8)) + + add $-2, vn_param + jz L(ret2) + +L(do_am0): + push %r15 + push vn_param + +L(olo0): + mov (vp), v0 + mov 8(vp), v1 + lea 16(vp), vp + lea 16(rp), rp + mov (up,un,8), %rax +C lea 0(un), i + mov un, i + mul v0 + mov %rax, X0 + mov (up,un,8), %rax + MOV( %rdx, X1, 2) + mul v1 + MOV( %rdx, w0, 4) + mov (rp,un,8), w2 + mov %rax, w3 + jmp L(lo0) + + ALIGNx +L(am2top0): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo0): mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top0) + + mul v1 + add w0, w1 + adc %rax, w2 + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add w2, X0 + mov X0, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + addl $-2, vn + jnz L(olo0) + +L(ret): pop %rax + pop %r15 +L(ret2):pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + + ALIGNx +L(m2top1): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) +L(m2e1):mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top1) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, 
I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, I((rp),(rp,i,8)) + mov w1, I(8(rp),8(rp,i,8)) + + add $-2, vn_param + jz L(ret2) + +L(do_am1): + push %r15 + push vn_param + +L(olo1): + mov (vp), v0 + mov 8(vp), v1 + lea 16(vp), vp + lea 16(rp), rp + mov (up,un,8), %rax + lea 1(un), i + mul v0 + mov %rax, X1 + MOV( %rdx, X0, 128) + mov (up,un,8), %rax + mov (rp,un,8), w1 + mul v1 + mov %rax, w2 + mov 8(up,un,8), %rax + MOV( %rdx, w3, 1) + jmp L(lo1) + + ALIGNx +L(am2top1): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 +L(lo1): mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top1) + + mul v1 + add w0, w1 + adc %rax, w2 + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add w2, X0 + mov X0, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + addl $-2, vn + jnz L(olo1) + + pop %rax + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + + ALIGNx +L(m2top2): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) +L(m2e2):mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top2) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, I((rp),(rp,i,8)) + mov w1, I(8(rp),8(rp,i,8)) + + add $-2, vn_param + jz L(ret2) + +L(do_am2): + push %r15 + push vn_param + +L(olo2): + mov (vp), v0 + mov 8(vp), v1 + lea 16(vp), vp + lea 16(rp), rp + mov (up,un,8), %rax + lea -2(un), i + mul v0 + mov %rax, X0 + MOV( %rdx, X1, 32) + mov (up,un,8), %rax + mov (rp,un,8), w0 + mul v1 + mov %rax, w1 + lea (%rdx), w2 + mov 8(up,un,8), %rax + jmp L(lo2) + + ALIGNx +L(am2top2): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov 
X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 +L(lo2): mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top2) + + mul v1 + add w0, w1 + adc %rax, w2 + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add w2, X0 + mov X0, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + addl $-2, vn + jnz L(olo2) + + pop %rax + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + + ALIGNx +L(m2top3): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax +L(m2e3):mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top3) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, I((rp),(rp,i,8)) + mov w1, I(8(rp),8(rp,i,8)) + + add $-2, vn_param + jz L(ret2) + +L(do_am3): + push %r15 + push vn_param + +L(olo3): + mov (vp), v0 + mov 8(vp), v1 + lea 16(vp), vp + lea 16(rp), rp + mov (up,un,8), %rax + lea -1(un), i + mul v0 + mov %rax, X1 + MOV( %rdx, X0, 8) + mov (up,un,8), %rax + mov (rp,un,8), w3 + mul v1 + mov %rax, w0 + MOV( %rdx, w1, 16) + mov 8(up,un,8), %rax + jmp L(lo3) + + ALIGNx +L(am2top3): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax +L(lo3): mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top3) + + mul v1 + add w0, w1 + adc %rax, w2 + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add w2, X0 + mov X0, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + addl $-2, vn + jnz L(olo3) + + pop %rax + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + 
pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm new file mode 100644 index 0000000..0f03d86 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm @@ -0,0 +1,427 @@ +dnl AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core 4.0 4.18-4.25 +C Intel NHM 3.75 4.06-4.2 +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Implement proper cor2, replacing current cor0. +C * Offset n by 2 in order to avoid the outer loop cmp. (And sqr_basecase?) +C * Micro-optimise. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
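+C Editorial note: a rough portable-C sketch of the contract, added for
+C orientation; not part of the GMP sources.  It produces the low n limbs of
+C {up,n} * {vp,n}; 64-bit limbs are assumed and unsigned __int128 is used
+C only for the illustration.
+C
+C   void mullo (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
+C               mp_size_t n)
+C   {
+C     for (mp_size_t i = 0; i < n; i++) rp[i] = 0;
+C     for (mp_size_t i = 0; i < n; i++) {
+C       mp_limb_t cy = 0;
+C       for (mp_size_t j = 0; j < n - i; j++) {  /* limbs >= n are discarded */
+C         unsigned __int128 t = (unsigned __int128) up[i] * vp[j] + rp[i+j] + cy;
+C         rp[i+j] = (mp_limb_t) t;
+C         cy = (mp_limb_t) (t >> 64);
+C       }
+C     }
+C   }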
+define(`I',`$1') + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n_param', `%rcx') + +define(`v0', `%r10') +define(`v1', `%r11') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r12') +define(`n', `%r9') +define(`i', `%r13') +define(`vp', `%r8') + +define(`X0', `%r14') +define(`X1', `%r15') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +define(`N', 85) +ifdef(`N',,`define(`N',0)') +define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + + mov (up), %rax + mov vp_param, vp + + cmp $4, n_param + jb L(small) + + mov (vp_param), v0 + push %rbx + lea (rp,n_param,8), rp C point rp at R[un] + push %rbp + lea (up,n_param,8), up C point up right after U's end + push %r12 + mov $0, R32(n) C FIXME + sub n_param, n + push %r13 + mul v0 + mov 8(vp), v1 + + test $1, R8(n_param) + jnz L(m2x1) + +L(m2x0):test $2, R8(n_param) + jnz L(m2b2) + +L(m2b0):lea (n), i + mov %rax, (rp,n,8) + mov %rdx, w1 + mov (up,n,8), %rax + xor R32(w2), R32(w2) + jmp L(m2e0) + +L(m2b2):lea -2(n), i + mov %rax, w2 + mov (up,n,8), %rax + mov %rdx, w3 + xor R32(w0), R32(w0) + jmp L(m2e2) + +L(m2x1):test $2, R8(n_param) + jnz L(m2b3) + +L(m2b1):lea 1(n), i + mov %rax, (rp,n,8) + mov (up,n,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + jmp L(m2e1) + +L(m2b3):lea -1(n), i + xor R32(w3), R32(w3) + mov %rax, w1 + mov %rdx, w2 + mov (up,n,8), %rax + jmp L(m2e3) + + ALIGNx +L(m2tp):mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) +L(m2e1):mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) +L(m2e0):mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax +L(m2e3):mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) +L(m2e2):mul v1 + mov $0, R32(w1) C FIXME: dead in last iteration + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 C FIXME: dead in last iteration + add $4, i + js L(m2tp) + +L(m2ed):imul v0, %rax + add w3, %rax + mov %rax, I(-8(rp),-8(rp,i,8)) + + add $2, n + lea 16(vp), vp + lea -16(up), up + cmp $-2, n + jge L(cor1) + + push %r14 + push %r15 + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + mov (up,n,8), %rax + mul v0 + test $1, R8(n) + jnz L(a1x1) + +L(a1x0):mov %rax, X1 + MOV( %rdx, X0, 8) + mov (up,n,8), %rax + mul v1 + test $2, R8(n) + jnz L(a110) + +L(a100):lea (n), i + mov (rp,n,8), w3 + mov %rax, w0 + MOV( %rdx, w1, 16) + jmp L(lo0) + +L(a110):lea 2(n), i + mov (rp,n,8), w1 + mov %rax, w2 + mov 8(up,n,8), %rax + MOV( %rdx, w3, 1) + jmp L(lo2) + +L(a1x1):mov %rax, X0 + MOV( %rdx, X1, 2) + mov (up,n,8), %rax + mul v1 + test $2, R8(n) + jz L(a111) + +L(a101):lea 1(n), i + MOV( %rdx, w0, 4) + mov (rp,n,8), w2 + mov %rax, w3 + jmp L(lo1) + +L(a111):lea -1(n), i + MOV( %rdx, w2, 64) + mov %rax, w1 + mov (rp,n,8), w0 + mov 8(up,n,8), %rax + jmp L(lo3) + + ALIGNx +L(top): mul v1 + add w0, w1 + adc %rax, w2 + mov -8(up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 +L(lo2): mul v0 + add w1, X1 + mov X1, -16(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + 
mov -8(up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov -8(rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo1): mov (up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, -8(rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov (up,i,8), %rax + mov (rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 +L(lo0): mov 8(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, (rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 8(rp,i,8), w3 + adc $0, X1 + mov 8(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 16(up,i,8), %rax + adc $0, w2 +L(lo3): mul v0 + add w0, X0 + mov X0, 8(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 16(up,i,8), %rax + mov 16(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(top) + +L(end): imul v1, %rax + add w0, w1 + adc %rax, w2 + mov I(-8(up),-8(up,i,8)), %rax + imul v0, %rax + add w1, X1 + mov X1, I(-16(rp),-16(rp,i,8)) + adc X0, %rax + mov I(-8(rp),-8(rp,i,8)), w1 + add w1, w2 + add w2, %rax + mov %rax, I(-8(rp),-8(rp,i,8)) + + add $2, n + lea 16(vp), vp + lea -16(up), up + cmp $-2, n + jl L(outer) + + pop %r15 + pop %r14 + + jnz L(cor0) + +L(cor1):mov (vp), v0 + mov 8(vp), v1 + mov -16(up), %rax + mul v0 C u0 x v2 + add -16(rp), %rax C FIXME: rp[0] still available in reg? + adc -8(rp), %rdx C FIXME: rp[1] still available in reg? + mov -8(up), %rbx + imul v0, %rbx + mov -16(up), %rcx + imul v1, %rcx + mov %rax, -16(rp) + add %rbx, %rcx + add %rdx, %rcx + mov %rcx, -8(rp) + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(cor0):mov (vp), %r11 + imul -8(up), %r11 + add %rax, %r11 + mov %r11, -8(rp) + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + ALIGN(16) +L(small): + cmp $2, n_param + jae L(gt1) +L(n1): imul (vp_param), %rax + mov %rax, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp_param), %r9 + mul %r9 + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp_param), %r9 + mul %r9 C u0 x v0 + mov %rax, (rp) + mov %rdx, %r10 + mov 8(up), %rax + mul %r9 C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r11 + mov (up), %rax + mul %r11 C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r11 C u1 x v1 + add %r11, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/popcount.asm b/gmp-6.3.0/mpn/x86_64/core2/popcount.asm new file mode 100644 index 0000000..3de69d8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/popcount.asm @@ -0,0 +1,185 @@ +dnl AMD64 SSSE3 mpn_popcount -- population count. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb good for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 1.79-1.91 n +C AMD bd2 1.73-1.85 n +C AMD bd3 ? +C AMD bd4 1.73-1.85 n +C AMD zen 1.47 n +C AMD bobcat 8.0 n +C AMD jaguar 4.78 n +C Intel P4 n/a +C Intel CNR 3.75 +C Intel PNR 2.61 y +C Intel NHM 2.03 n +C Intel SBR 1.87 n +C Intel IBR 1.52-1.58 n +C Intel HWL 1.52-1.58 n +C Intel BWL 1.52-1.58 n +C Intel SKL 1.51 n +C Intel atom 12.3 n +C Intel SLM 9.1 n +C VIA nano ? + +C TODO +C * This was hand-written without too much thought about optimal insn +C selection; check to see of it can be improved. +C * Consider doing some instruction scheduling. + +define(`up', `%rdi') +define(`n', `%rsi') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_popcount) + lea L(cnsts)(%rip), %r9 + +ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)', + `define(`OFF1',64) define(`OFF2',80)') + movdqa OFF1`'(%r9), %xmm7 + movdqa OFF2`'(%r9), %xmm6 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm8, %xmm8 + + mov R32(n), R32(%rax) + and $7, R32(%rax) +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + +L(1): movq (up), %xmm1 + add $8, up + jmp L(e1) + +L(2): add $-48, up + jmp L(e2) + +L(3): movq (up), %xmm1 + add $-40, up + jmp L(e3) + +L(4): add $-32, up + jmp L(e4) + +L(5): movq (up), %xmm1 + add $-24, up + jmp L(e5) + +L(6): add $-16, up + jmp L(e6) + +L(7): movq (up), %xmm1 + add $-8, up + jmp L(e7) + + ALIGN(32) +L(top): lddqu (up), %xmm1 +L(e7): movdqa %xmm6, %xmm0 C copy mask register + movdqa %xmm7, %xmm2 C copy count register + movdqa %xmm7, %xmm3 C copy count register + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e6): lddqu 16(up), %xmm1 +L(e5): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e4): lddqu 32(up), %xmm1 +L(e3): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e2): lddqu 48(up), %xmm1 + add $64, up +L(e1): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + psadbw %xmm5, %xmm4 C sum to 8 x 16-bit counts + paddb %xmm2, %xmm3 + paddq %xmm4, %xmm8 C sum to 2 x 64-bit counts + movdqa %xmm3, %xmm4 + sub $8, n + jg L(top) + + psadbw %xmm5, %xmm4 + paddq %xmm4, %xmm8 + pshufd $14, %xmm8, %xmm0 + paddq %xmm8, %xmm0 + movd %xmm0, %rax + ret +EPILOGUE() +DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + JMPENT( L(top), L(cnsts)) + JMPENT( L(1), L(cnsts)) + JMPENT( L(2), L(cnsts)) + JMPENT( L(3), L(cnsts)) + JMPENT( L(4), L(cnsts)) + JMPENT( L(5), L(cnsts)) + JMPENT( L(6), L(cnsts)) + JMPENT( L(7), L(cnsts)) + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 + .byte 
0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) diff --git a/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm b/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm new file mode 100644 index 0000000..8c296fd --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm @@ -0,0 +1,430 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Conroe and Wolfdale. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 ? +C Intel core 4.5 (fluctuating) +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. +C * Keep up[i] in registers for basecases (might require pushes). + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
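+C Editorial note: a rough sketch of the algorithm (Montgomery's REDC), added
+C for orientation; not part of the GMP sources.  B = 2^64 is the limb base,
+C U = {up,2n} the input, m = {mp,n} the modulus, and u0inv a precomputed
+C limb inverse chosen so that each step cancels the low limb of U.
+C
+C   for (i = 0; i < n; i++) {
+C     q = (up[i] * u0inv) mod B;          /* so up[i] + q*m[0] == 0 (mod B) */
+C     carry[i] = {up+i, n} += q * m;      /* addmul pass: clears limb i of U */
+C   }
+C   return mpn_add_n (rp, up + n, carry, n);  /* rp is U * B^-n reduced      */
+C                                             /* modulo m up to the returned */
+C                                             /* carry, which the caller     */
+C                                             /* handles                     */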
+define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C X q0' n X rp up u0i mp q0 i j + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea (mp_param,n,8), mp + lea -16(up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(b0) + +L(b1): cmp $-1, R32(n) + jz L(n1) + cmp $-3, R32(n) + jz L(n3) + + push rp + +L(otp1):lea 3(n), i + mov (mp,n,8), %rax + mul q0 + lea (%rax), %rbp + mov 8(mp,n,8), %rax + lea (%rdx), %r9 + mul q0 + lea (%rax), %r11 + mov 16(mp,n,8), %rax + mov 16(up,n,8), %r10 + lea (%rdx), %rdi + mul q0 + add %rbp, %r10 + lea (%rax), %rbp + mov 24(mp,n,8), %rax + adc %r9, %r11 + mov 24(up,n,8), %rbx + lea (%rdx), %r9 + adc $0, %rdi + mul q0 + add %r11, %rbx + lea (%rax), %r11 + mov 32(mp,n,8), %rax + adc %rdi, %rbp + mov %rbx, 24(up,n,8) + mov 32(up,n,8), %r10 + lea (%rdx), %rdi + adc $0, %r9 + imul u0inv, %rbx C next q limb + add $2, i + jns L(ed1) + + ALIGNx +L(tp1): mul q0 + add %rbp, %r10 + lea (%rax), %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %r10, -8(up,i,8) + mov (up,i,8), %r10 + lea (%rdx), %r9 + adc $0, %rdi + mul q0 + add %r11, %r10 + lea (%rax), %r11 + mov 8(mp,i,8), %rax + adc %rdi, %rbp + mov %r10, (up,i,8) + mov 8(up,i,8), %r10 + lea (%rdx), %rdi + adc $0, %r9 + add $2, i + js L(tp1) + +L(ed1): mul q0 + add %rbp, %r10 + adc %r9, %r11 + mov %r10, I(-8(up),-8(up,i,8)) + mov I((up),(up,i,8)), %r10 + adc $0, %rdi + add %r11, %r10 + adc %rdi, %rax + mov %r10, I((up),(up,i,8)) + mov I(8(up),8(up,i,8)), %r10 + adc $0, %rdx + add %rax, %r10 + mov %r10, I(8(up),8(up,i,8)) + adc $0, %rdx + mov %rdx, 16(up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b0): cmp $-2, R32(n) + jz L(n2) + cmp $-4, R32(n) + jz L(n4) + + push rp + +L(otp0):lea 4(n), i + mov (mp,n,8), %rax + mul q0 + lea (%rax), %r11 + mov 8(mp,n,8), %rax + lea (%rdx), %rdi + mul q0 + lea (%rax), %rbp + mov 16(mp,n,8), %rax + mov 16(up,n,8), %r10 + lea (%rdx), %r9 + mul q0 + add %r11, %r10 + lea (%rax), %r11 + mov 24(mp,n,8), %rax + adc %rdi, %rbp + mov 24(up,n,8), %rbx + lea (%rdx), %rdi + adc $0, %r9 + mul q0 + add %rbp, %rbx + lea (%rax), %rbp + mov 32(mp,n,8), %rax + adc %r9, %r11 + mov %rbx, 24(up,n,8) + mov 32(up,n,8), %r10 + lea (%rdx), %r9 + adc $0, %rdi + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): mul q0 + add %rbp, %r10 + lea (%rax), %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %r10, -8(up,i,8) + mov (up,i,8), %r10 + lea (%rdx), %r9 + adc $0, %rdi +L(e0): mul q0 + add %r11, %r10 + lea (%rax), %r11 + mov 8(mp,i,8), %rax + adc %rdi, %rbp + mov %r10, (up,i,8) + mov 8(up,i,8), %r10 + lea (%rdx), %rdi + adc $0, %r9 + add $2, i + js L(tp0) + +L(ed0): mul q0 + add %rbp, %r10 + adc %r9, %r11 + mov %r10, I(-8(up),-8(up,i,8)) + mov I((up),(up,i,8)), %r10 + adc $0, %rdi + add %r11, %r10 + adc %rdi, %rax + mov %r10, I((up),(up,i,8)) + mov I(8(up),8(up,i,8)), %r10 + adc $0, %rdx + add %rax, %r10 + mov %r10, I(8(up),8(up,i,8)) + adc $0, %rdx + mov %rdx, 16(up,n,8) C up[0] + 
mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + +L(cj): lea 16(up), up C FIXME + pop rp +L(add_n): +IFSTD(` lea (up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea (up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(n1): mov (mp_param), %rax + mul q0 + add 8(up), %rax + adc 16(up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov (up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov 8(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -16(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov 16(up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 24(up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -24(mp), %rax + mov -8(up), %r10 + mul q0 + add %rax, %r10 + mov -16(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov (up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + add %r11, %rbp + mov 8(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, (up) + add %r9, %r10 + adc $0, %r11 + mov %r10, 8(up) + mov %r11, -8(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + + mov -32(up), %rdx + mov -24(up), %rbx + xor R32(%rax), R32(%rax) + add %rbp, %rdx + adc %r10, %rbx + adc 8(up), %r11 + mov %rdx, (rp) + mov %rbx, 8(rp) + mov %r11, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n4): mov -32(mp), %rax + mul q0 + lea (%rax), %r11 + mov -24(mp), %rax + lea (%rdx), %r14 + mul q0 + lea (%rax), %rbp + mov -16(mp), %rax + mov -16(up), %r10 + lea (%rdx), %r9 + mul q0 + add %r11, %r10 + lea (%rax), %r11 + mov -8(mp), %rax + adc %r14, %rbp + mov -8(up), %rbx + lea (%rdx), %r14 + adc $0, %r9 + mul q0 + add %rbp, %rbx + adc %r9, %r11 + mov %rbx, -8(up) + mov (up), %r10 + adc $0, %r14 + imul u0inv, %rbx C next q limb + add %r11, %r10 + adc %r14, %rax + mov %r10, (up) + mov 8(up), %r10 + adc $0, %rdx + add %rax, %r10 + mov %r10, 8(up) + adc $0, %rdx + mov %rdx, -16(up) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(n4) + lea 16(up), up + jmp L(add_n) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm new file mode 100644 index 0000000..27eed37 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm @@ -0,0 +1,169 @@ +dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Conroe/Penryn. + +dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 3.05 +C Intel NHM 3.3 +C Intel SBR 2.5 +C Intel atom ? +C VIA nano ? + +C TODO +C * Loopmix to approach 2.5 c/l on NHM. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') + +ifdef(`OPERATION_rsh1add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_rsh1add_n) + define(func_nc, mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsh1sub_n) + define(func_nc, mpn_rsh1sub_nc)') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + + neg %r8 C set C flag from parameter + mov (up), %r8 + ADCSBB (vp), %r8 + jmp L(ent) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %r8 + ADDSUB (vp), %r8 +L(ent): sbb R32(%rbx), R32(%rbx) C save cy + mov %r8, %rax + and $1, R32(%rax) C return value + + lea (up,n,8), up + lea (vp,n,8), vp + lea (rp,n,8), rp + mov R32(n), R32(%rbp) + neg n + and $3, R32(%rbp) + jz L(b0) + cmp $2, R32(%rbp) + jae L(n1) + +L(b1): mov %r8, %rbp + inc n + js L(top) + jmp L(end) + +L(n1): jnz L(b3) + add R32(%rbx), R32(%rbx) C restore cy + mov 8(up,n,8), %r11 + ADCSBB 8(vp,n,8), %r11 + sbb R32(%rbx), R32(%rbx) C save cy + mov %r8, %r10 + add $-2, n + jmp L(2) + +L(b3): add R32(%rbx), R32(%rbx) C restore cy + mov 8(up,n,8), %r10 + mov 16(up,n,8), %r11 + ADCSBB 8(vp,n,8), %r10 + ADCSBB 16(vp,n,8), %r11 + sbb R32(%rbx), R32(%rbx) C save cy + mov %r8, %r9 + dec n + jmp L(3) + +L(b0): add R32(%rbx), R32(%rbx) C restore cy + mov 8(up,n,8), %r9 + mov 16(up,n,8), %r10 + mov 24(up,n,8), %r11 + ADCSBB 8(vp,n,8), %r9 + ADCSBB 16(vp,n,8), %r10 + ADCSBB 24(vp,n,8), %r11 + sbb R32(%rbx), R32(%rbx) C save cy + jmp L(4) + + ALIGN(16) + +L(top): add R32(%rbx), R32(%rbx) C restore cy + mov (up,n,8), %r8 + mov 8(up,n,8), %r9 + mov 16(up,n,8), %r10 + mov 24(up,n,8), %r11 + ADCSBB (vp,n,8), %r8 + ADCSBB 8(vp,n,8), %r9 + ADCSBB 16(vp,n,8), %r10 + ADCSBB 24(vp,n,8), %r11 + sbb R32(%rbx), R32(%rbx) C save cy + shrd $1, %r8, %rbp + mov %rbp, -8(rp,n,8) +L(4): shrd $1, %r9, %r8 + mov %r8, (rp,n,8) +L(3): shrd $1, %r10, %r9 + mov %r9, 8(rp,n,8) +L(2): shrd $1, %r11, %r10 + mov %r10, 16(rp,n,8) +L(1): add $4, n + mov %r11, %rbp + js L(top) + +L(end): shrd $1, %rbx, %rbp 
+ mov %rbp, -8(rp) + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/rshift.asm b/gmp-6.3.0/mpn/x86_64/core2/rshift.asm new file mode 100644 index 0000000..7578a53 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/rshift.asm @@ -0,0 +1,143 @@ +dnl x86-64 mpn_rshift optimised for Conroe/Penryn and Nehalem. + +dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core2 1.32 +C Intel NHM 1.30 (drops to 2.5 for n > 256) +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + + xor R32(%rax), R32(%rax) + + test $1, R8(n) + jnz L(bx1) +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): lea 8(up), up + lea -24(rp), rp + mov -8(up), %r10 + mov (up), %r11 + shrd R8(cnt), %r10, %rax + mov 8(up), %r8 + shr $2, n + jmp L(00) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): lea 16(up), up + lea -16(rp), rp + mov -16(up), %r9 + shrd R8(cnt), %r9, %rax + shr $2, n + jz L(1) + mov -8(up), %r10 + mov (up), %r11 + jmp L(01) + +L(b10): lea 24(up), up + lea -8(rp), rp + mov -24(up), %r8 + mov -16(up), %r9 + shrd R8(cnt), %r8, %rax + shr $2, n + jz L(2) + mov -8(up), %r10 + jmp L(10) + +L(b11): lea 32(up), up + mov -32(up), %r11 + mov -24(up), %r8 + mov -16(up), %r9 + shrd R8(cnt), %r11, %rax + shr $2, n + jz L(end) + + ALIGN(16) +L(top): shrd R8(cnt), %r8, %r11 + mov -8(up), %r10 + mov %r11, (rp) +L(10): shrd R8(cnt), %r9, %r8 + mov (up), %r11 + mov %r8, 8(rp) +L(01): shrd R8(cnt), %r10, %r9 + mov 8(up), %r8 + mov %r9, 16(rp) +L(00): shrd R8(cnt), %r11, %r10 + mov 16(up), %r9 + add $32, up + mov %r10, 24(rp) + add $32, rp + dec n + jnz L(top) + +L(end): shrd R8(cnt), %r8, %r11 + mov %r11, (rp) +L(2): shrd R8(cnt), %r9, %r8 + mov %r8, 8(rp) +L(1): shr R8(cnt), %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm new file mode 100644 index 0000000..e436034 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. 
+ +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm new file mode 100644 index 0000000..a112c1b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm @@ -0,0 +1,984 @@ +dnl X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere. +dnl It also seems good for Conroe/Wolfdale. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core 4.9 4.18-4.25 3.87 +C Intel NHM 3.8 4.06-4.2 3.5 +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. 
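The sec_tabselect wrapper above simply pulls in the shared fastsse implementation; the operation itself is a constant-time table lookup: copy entry number `which` out of `nents` entries of `n` limbs each, while reading every entry so that neither the branch pattern nor the memory access pattern depends on the secret index. A portable C model, assuming the standard mpn_sec_tabselect argument order (rp, tab, n, nents, which):

    #include <stdint.h>
    #include <stddef.h>

    static void
    sec_tabselect_model (uint64_t *rp, const uint64_t *tab,
                         size_t n, size_t nents, size_t which)
    {
      for (size_t i = 0; i < n; i++)
        rp[i] = 0;
      for (size_t k = 0; k < nents; k++)
        {
          uint64_t d = (uint64_t) k ^ (uint64_t) which;
          /* all-ones when k == which, zero otherwise, no data-dependent branch */
          uint64_t mask = ((d | (0 - d)) >> 63) - 1;
          for (size_t i = 0; i < n; i++)
            rp[i] |= tab[k * n + i] & mask;
        }
    }

The fastsse code included above applies the same masking with SSE registers, moving 16 bytes at a time instead of 8.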
+ +C Code structure: +C +C +C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4) +C | | | | +C | | | | +C | | | | +C \|/ \|/ \|/ \|/ +C ____________ ____________ +C / \ / \ +C \|/ \ \|/ \ +C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4) +C \ /|\ \ /|\ +C \____________/ \____________/ +C \ / +C \ / +C \ / +C tail(0m2) tail(1m2) +C \ / +C \ / +C sqr_diag_addlsh1 + +C TODO +C * Tune. None done so far. +C * Currently 2761 bytes, making it smaller would be nice. +C * Consider using a jumptab-based entry sequence. One might even use a mask- +C less sequence, if the table is large enough to support tuneup's needs. +C The code would be, using non-PIC code, +C lea tab(%rip),%rax; jmp *(n,%rax) +C or, +C lea tab(%rip),%rax; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx +C using PIC code. The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,.. +C with the last four entries repeated a safe number of times. +C * Consider expanding feed-in code in order to avoid zeroing registers. +C * Zero consistently with xor. +C * Check if using "lea (reg),reg" should be done in more places; we have some +C explicit "mov %rax,reg" now. +C * Try zeroing with xor in m2 loops. +C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication +C between loop header and wind-down code. +C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This save a byte. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +C Define this to $1 to use late loop index variable as zero, $2 to use an +C explicit $0. +define(`Z',`$1') + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param', `%rdx') + +define(`n', `%r8') + +define(`v0', `%r10') +define(`v1', `%r11') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r9') +define(`i', `%r13') + +define(`X0', `%r12') +define(`X1', `%r14') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +define(`N', 85) +ifdef(`N',,`define(`N',0)') +define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + cmp $4, n_param + jl L(small) + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + mov (up), v0 + mov 8(up), %rax + mov %rax, v1 + + mov $1, R32(n) + sub n_param, n C n = -n_param+1 + push n + + lea (up,n_param,8), up + lea (rp,n_param,8), rp + + mul v0 + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + mov %rax, (rp,n,8) + jnz L(b10) + +L(b00): lea (n), i C n = 5, 9, ... + mov %rdx, w1 C FIXME: Use lea? + xor R32(w2), R32(w2) + jmp L(m2e0) + +L(b10): lea 2(n), i C n = 7, 11, ... + mov 8(up,n,8), %rax + mov %rdx, w3 C FIXME: Use lea? + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + jmp L(m2e2) + +L(bx1): test $2, R8(n) + mov %rax, (rp,n,8) + jz L(b11) + +L(b01): lea 1(n), i C n = 6, 10, ... + mov %rdx, w0 C FIXME: Use lea? + xor R32(w1), R32(w1) + jmp L(m2e1) + +L(b11): lea -1(n), i C n = 4, 8, 12, ... + mov %rdx, w2 C FIXME: Use lea? 
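The code-structure diagram above reflects the usual basecase-squaring split: the mul_2/addmul_2 passes accumulate the off-diagonal products u[i]*u[j] with i < j once, and the closing sqr_diag_addlsh1 pass doubles that triangle and adds the diagonal squares u[i]^2. A schematic C version of the same decomposition (limb-by-limb rather than two v-limbs at a time, and using the GCC/Clang unsigned __int128 extension) is:

    #include <stdint.h>
    #include <stddef.h>

    typedef unsigned __int128 u128;

    /* rp[0..2n-1] = up[]^2, n >= 1 */
    static void
    sqr_basecase_model (uint64_t *rp, const uint64_t *up, size_t n)
    {
      for (size_t i = 0; i < 2 * n; i++)
        rp[i] = 0;

      /* 1. off-diagonal triangle: sum of up[i]*up[j] for i < j */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t cy = 0;
          for (size_t j = i + 1; j < n; j++)
            {
              u128 t = (u128) up[i] * up[j] + rp[i + j] + cy;
              rp[i + j] = (uint64_t) t;
              cy = (uint64_t) (t >> 64);
            }
          rp[i + n] = cy;
        }

      /* 2. double the triangle (the "addlsh1" part) */
      uint64_t sh = 0;
      for (size_t i = 0; i < 2 * n; i++)
        {
          uint64_t t = rp[i];
          rp[i] = (t << 1) | sh;
          sh = t >> 63;
        }

      /* 3. add the diagonal squares up[i]^2 at positions 2i, 2i+1 */
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          u128 sq = (u128) up[i] * up[i];
          u128 s = (u128) rp[2 * i] + (uint64_t) sq + cy;
          rp[2 * i] = (uint64_t) s;
          s = (u128) rp[2 * i + 1] + (uint64_t) (sq >> 64) + (uint64_t) (s >> 64);
          rp[2 * i + 1] = (uint64_t) s;
          cy = (uint64_t) (s >> 64);
        }
    }

The assembly fuses steps 2 and 3 into the single sqr_diag_addlsh1 loop and walks the triangle two limbs of v at a time, which is where the 0m4/1m4/2m4/3m4 entry points in the diagram come from.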
+ xor R32(w3), R32(w3) + jmp L(m2e3) + + + ALIGNx +L(m2top1): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 +L(m2e1):mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top1) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add w0, %rax + adc w1, %rdx + mov %rax, I((rp),(rp,i,8)) + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n C decrease |n| + jmp L(am2o3) + + ALIGNx +L(m2top3): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 +L(m2e3):mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top3) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add w0, %rax + adc w1, %rdx + mov %rax, I((rp),(rp,i,8)) + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n C decrease |n| + cmp $-1, n + jz L(cor1) C jumps iff entry n = 4 + +L(am2o1): + mov -8(up,n,8), v0 + mov (up,n,8), %rax + mov %rax, v1 + lea 1(n), i + mul v0 + mov %rax, X1 + MOV( %rdx, X0, 128) + mov (rp,n,8), w1 + xor R32(w2), R32(w2) + mov 8(up,n,8), %rax + xor R32(w3), R32(w3) + jmp L(lo1) + + ALIGNx +L(am2top1): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 +L(lo1): mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top1) + + mul v1 + add w0, w1 + adc w2, %rax + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add X0, %rax + mov %rax, I((rp),(rp,i,8)) 
+ adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n + +L(am2o3): + mov -8(up,n,8), v0 + mov (up,n,8), %rax + mov %rax, v1 + lea -1(n), i + mul v0 + mov %rax, X1 + MOV( %rdx, X0, 8) + mov (rp,n,8), w3 + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + mov 8(up,n,8), %rax + jmp L(lo3) + + ALIGNx +L(am2top3): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax +L(lo3): mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top3) + + mul v1 + add w0, w1 + adc w2, %rax + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add X0, %rax + mov %rax, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n + cmp $-1, n + jnz L(am2o1) + +L(cor1):pop n + mov %rdx, w3 + mov -16(up), v0 + mov -8(up), %rax + mul v0 + add w3, %rax + adc $0, %rdx + mov %rax, -8(rp) + mov %rdx, (rp) + jmp L(sqr_diag_addlsh1) + + ALIGNx +L(m2top2): +L(m2e2):mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top2) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add w0, %rax + adc w1, %rdx + mov %rax, I((rp),(rp,i,8)) + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n C decrease |n| + jmp L(am2o0) + + ALIGNx +L(m2top0): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 +L(m2e0):mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top0) + 
+ mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add w0, %rax + adc w1, %rdx + mov %rax, I((rp),(rp,i,8)) + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n C decrease |n| + cmp $-2, n + jz L(cor2) C jumps iff entry n = 5 + +L(am2o2): + mov -8(up,n,8), v0 + mov (up,n,8), %rax + mov %rax, v1 + lea -2(n), i + mul v0 + mov %rax, X0 + MOV( %rdx, X1, 32) + mov (rp,n,8), w0 + xor R32(w1), R32(w1) + xor R32(w2), R32(w2) + mov 8(up,n,8), %rax + jmp L(lo2) + + ALIGNx +L(am2top2): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 +L(lo2): mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top2) + + mul v1 + add w0, w1 + adc w2, %rax + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add X0, %rax + mov %rax, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n + +L(am2o0): + mov -8(up,n,8), v0 + mov (up,n,8), %rax + mov %rax, v1 + lea 0(n), i + mul v0 + mov %rax, X0 + MOV( %rdx, X1, 2) + xor R32(w0), R32(w0) + mov (rp,n,8), w2 + xor R32(w3), R32(w3) + jmp L(lo0) + + ALIGNx +L(am2top0): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo0): mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top0) + + mul v1 + add w0, w1 + adc w2, %rax + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add X0, %rax + mov %rax, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n + cmp $-2, n + jnz L(am2o2) + +L(cor2):pop n + mov -24(up), v0 + mov %rax, w2 + mov %rdx, w0 + mov -16(up), %rax + mov %rax, v1 + mul v0 + mov %rax, X0 + MOV( %rdx, X1, 32) + mov -8(up), %rax + mul v0 + add w2, X0 + mov X0, -16(rp) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov -8(up), %rax + adc $0, X0 + mul v1 + add w0, X1 + adc $0, X0 + mov X1, -8(rp) + add X0, %rax + mov 
%rax, (rp) + adc $0, %rdx + mov %rdx, 8(rp) + lea 8(rp), rp + +L(sqr_diag_addlsh1): + mov -8(up,n,8), %rax + shl n + xor R32(%rbx), R32(%rbx) + mul %rax + mov 8(rp,n,8), %r11 + lea (%rdx), %r10 + mov 16(rp,n,8), %r9 + add %r11, %r11 + jmp L(dm) + + ALIGNx +L(dtop):mul %rax + add %r11, %r10 + mov 8(rp,n,8), %r11 + mov %r10, -8(rp,n,8) + adc %r9, %rax + lea (%rdx,%rbx), %r10 + mov 16(rp,n,8), %r9 + adc %r11, %r11 +L(dm): mov %rax, (rp,n,8) + mov (up,n,4), %rax + adc %r9, %r9 + setc R8(%rbx) + add $2, n + js L(dtop) + + mul %rax + add %r11, %r10 + mov %r10, -8(rp) + adc %r9, %rax + lea (%rdx,%rbx), %r10 + mov %rax, (rp) + adc $0, %r10 + mov %r10, 8(rp) + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + ALIGN(16) +L(small): + mov (up), %rax + cmp $2, n_param + jae L(gt1) +L(n1): + mul %rax + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(gt1): jne L(gt2) +L(n2): mov %rax, %r8 + mul %rax + mov 8(up), %r11 + mov %rax, (rp) + mov %r11, %rax + mov %rdx, %r9 + mul %rax + mov %rax, %r10 + mov %r11, %rax + mov %rdx, %r11 + mul %r8 + xor %r8, %r8 + add %rax, %r9 + adc %rdx, %r10 + adc %r8, %r11 + add %rax, %r9 + mov %r9, 8(rp) + adc %rdx, %r10 + mov %r10, 16(rp) + adc %r8, %r11 + mov %r11, 24(rp) + FUNC_EXIT() + ret + +L(gt2): +L(n3): mov %rax, %r10 + mul %rax + mov 8(up), %r11 + mov %rax, (rp) + mov %r11, %rax + mov %rdx, 8(rp) + mul %rax + mov 16(up), %rcx + mov %rax, 16(rp) + mov %rcx, %rax + mov %rdx, 24(rp) + mul %rax + mov %rax, 32(rp) + mov %rdx, 40(rp) + + mov %r11, %rax + mul %r10 + mov %rax, %r8 + mov %rcx, %rax + mov %rdx, %r9 + mul %r10 + xor %r10, %r10 + add %rax, %r9 + mov %r11, %rax + mov %r10, %r11 + adc %rdx, %r10 + + mul %rcx + add %rax, %r10 + adc %r11, %rdx + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %rdx, %rdx + adc %r11, %r11 + add %r8, 8(rp) + adc %r9, 16(rp) + adc %r10, 24(rp) + adc %rdx, 32(rp) + adc %r11, 40(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm new file mode 100644 index 0000000..46488fc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm @@ -0,0 +1,47 @@ +dnl AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +define(LSH, 1) +define(RSH, 63) + +define(ADDSUB, sub) +define(ADCSBB, sbb) +define(func, mpn_sublsh1_n) + +MULFUNC_PROLOGUE(mpn_sublsh1_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/core2/sublshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm new file mode 100644 index 0000000..f3b1e28 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm @@ -0,0 +1,47 @@ +dnl AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +define(ADDSUB, sub) +define(ADCSBB, sbb) +define(func, mpn_sublsh2_n) + +MULFUNC_PROLOGUE(mpn_sublsh2_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/core2/sublshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm new file mode 100644 index 0000000..272700d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm @@ -0,0 +1,158 @@ +dnl AMD64 mpn_sublshC_n -- rp[] = up[] - (vp[] << C), optimised for Core 2 and +dnl Core iN. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C AMD K8,K9 4.25 +C AMD K10 ? +C Intel P4 ? 
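sublsh1_n and sublsh2_n above are thin instantiations of the shared sublshC_n body, fixing LSH/RSH to 1/63 and 2/62 respectively. The operation, as the sublshC_n header states, is rp[] = up[] - (vp[] << C) over n limbs; the return value is the borrow out of the top limb plus the C bits of vp[n-1] that the shift pushes past position n, so for C = 1 it lies in 0..2. A portable C model for C = 1, with an illustrative (non-GMP) function name:

    #include <stdint.h>
    #include <stddef.h>

    /* rp[0..n-1] = up[] - (vp[] << 1); returns borrow plus the shifted-out bit. */
    static uint64_t
    sublsh1_n_model (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
                     size_t n)
    {
      uint64_t sh = 0;       /* bit of vp carried up from the previous limb */
      uint64_t bw = 0;       /* running borrow */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t v2 = (vp[i] << 1) | sh;   /* limb i of vp << 1 */
          sh = vp[i] >> 63;
          uint64_t d = up[i] - v2;
          uint64_t b1 = d > up[i];
          uint64_t r = d - bw;
          uint64_t b2 = r > d;
          rp[i] = r;
          bw = b1 + b2;                      /* b1 and b2 cannot both be set */
        }
      return bw + sh;
    }

The assembly carries the shifted limb across iterations with shrd chains and reconstructs the same return value at L(end).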
+C Intel core2 3 +C Intel NHM 3.1 +C Intel SBR 2.47 +C Intel atom ? +C VIA nano ? + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n', `%rcx') + +ASM_START() + TEXT + ALIGN(8) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + push %r12 + + mov R32(%rcx), R32(%rax) + lea 24(up,n,8), up + lea 24(vp,n,8), vp + lea 24(rp,n,8), rp + neg n + + xor R32(%r11), R32(%r11) + + mov -24(vp,n,8), %r8 C do first limb early + shrd $RSH, %r8, %r11 + + and $3, R32(%rax) + je L(b0) + cmp $2, R32(%rax) + jc L(b1) + je L(b2) + +L(b3): mov -16(vp,n,8), %r9 + shrd $RSH, %r9, %r8 + mov -8(vp,n,8), %r10 + shrd $RSH, %r10, %r9 + mov -24(up,n,8), %r12 + ADDSUB %r11, %r12 + mov %r12, -24(rp,n,8) + mov -16(up,n,8), %r12 + ADCSBB %r8, %r12 + mov %r12, -16(rp,n,8) + mov -8(up,n,8), %r12 + ADCSBB %r9, %r12 + mov %r12, -8(rp,n,8) + mov %r10, %r11 + sbb R32(%rax), R32(%rax) C save cy + add $3, n + js L(top) + jmp L(end) + +L(b1): mov -24(up,n,8), %r12 + ADDSUB %r11, %r12 + mov %r12, -24(rp,n,8) + mov %r8, %r11 + sbb R32(%rax), R32(%rax) C save cy + inc n + js L(top) + jmp L(end) + +L(b2): mov -16(vp,n,8), %r9 + shrd $RSH, %r9, %r8 + mov -24(up,n,8), %r12 + ADDSUB %r11, %r12 + mov %r12, -24(rp,n,8) + mov -16(up,n,8), %r12 + ADCSBB %r8, %r12 + mov %r12, -16(rp,n,8) + mov %r9, %r11 + sbb R32(%rax), R32(%rax) C save cy + add $2, n + js L(top) + jmp L(end) + + ALIGN(16) +L(top): mov -24(vp,n,8), %r8 + shrd $RSH, %r8, %r11 +L(b0): mov -16(vp,n,8), %r9 + shrd $RSH, %r9, %r8 + mov -8(vp,n,8), %r10 + shrd $RSH, %r10, %r9 + mov (vp,n,8), %rbx + shrd $RSH, %rbx, %r10 + + add R32(%rax), R32(%rax) C restore cy + + mov -24(up,n,8), %r12 + ADCSBB %r11, %r12 + mov %r12, -24(rp,n,8) + + mov -16(up,n,8), %r12 + ADCSBB %r8, %r12 + mov %r12, -16(rp,n,8) + + mov -8(up,n,8), %r12 + ADCSBB %r9, %r12 + mov %r12, -8(rp,n,8) + + mov (up,n,8), %r12 + ADCSBB %r10, %r12 + mov %r12, (rp,n,8) + + mov %rbx, %r11 + sbb R32(%rax), R32(%rax) C save cy + + add $4, n + js L(top) + +L(end): shr $RSH, %r11 + pop %r12 + pop %rbx + sub R32(%r11), R32(%rax) + neg R32(%rax) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm new file mode 100644 index 0000000..8d3a44a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm @@ -0,0 +1,210 @@ +dnl AMD64 mpn_addmul_1 optimised for Intel Broadwell. + +dnl Copyright 2015, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 n/a +C AMD bd2 n/a +C AMD bd3 n/a +C AMD bd4 ? +C AMD zen1 ? +C AMD zen2 ? +C AMD zen3 1.5 +C AMD bt1 n/a +C AMD bt2 n/a +C Intel P4 n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL n/a +C Intel BWL 1.67 1.74 +C Intel SKL 1.63 1.71 +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Put an initial mulx before switching, targeting some free registers. +C * Tune feed-in code. +C * Trim nop execution after L(f2). +C * For DOS64, fix nop execution. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl IFDOS(` define(`up', ``%rsi'') ') dnl +dnl IFDOS(` define(`rp', ``%rcx'') ') dnl +dnl IFDOS(` define(`vl', ``%r9'') ') dnl +dnl IFDOS(` define(`r9', ``rdi'') ') dnl +dnl IFDOS(` define(`n', ``%r8'') ') dnl +dnl IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_addmul_1) + FUNC_ENTRY(4) + + mov v0_param, %r10 + mov n_param, n + mov R32(n_param), R32(%r8) + shr $3, n + and $7, R32(%r8) C clear OF, CF as side-effect + mov %r10, %rdx + lea L(tab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%r8,4), %r8 + lea (%r8, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%r8,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f1), L(tab)) + JMPENT( L(f2), L(tab)) + JMPENT( L(f3), L(tab)) + JMPENT( L(f4), L(tab)) + JMPENT( L(f5), L(tab)) + JMPENT( L(f6), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +L(f0): mulx( (up), %r10, %r8) + lea -8(up), up + lea -8(rp), rp + lea -1(n), n + jmp L(b0) + +L(f3): mulx( (up), %r9, %rax) + lea 16(up), up + lea -48(rp), rp + jmp L(b3) + +L(f4): mulx( (up), %r10, %r8) + lea 24(up), up + lea -40(rp), rp + jmp L(b4) + +L(f5): mulx( (up), %r9, %rax) + lea 32(up), up + lea -32(rp), rp + jmp L(b5) + +L(f6): mulx( (up), %r10, %r8) + lea 40(up), up + lea -24(rp), rp + jmp L(b6) + +L(f1): mulx( (up), %r9, %rax) + jrcxz L(1) + jmp L(b1) +L(1): add (rp), %r9 + mov %r9, (rp) + adc %rcx, %rax C relies on rcx = 0 + FUNC_EXIT() + ret + +L(end): adox( (rp), %r9) + mov %r9, (rp) + adox( %rcx, %rax) C relies on rcx = 0 + adc %rcx, %rax C relies on rcx = 0 + FUNC_EXIT() + ret + +ifdef(`PIC', +` nop;nop;nop;nop', +` nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop') + +L(f2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) + + ALIGN(32) +L(top): adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) + jrcxz L(end) +L(b1): mulx( 8,(up), %r10, %r8) + adox( (rp), %r9) + lea -1(n), n + mov %r9, (rp) + adcx( %rax, %r10) +L(b0): mulx( 16,(up), %r9, %rax) + adcx( %r8, %r9) + adox( 8,(rp), %r10) + mov %r10, 8(rp) +L(b7): mulx( 24,(up), %r10, %r8) + lea 64(up), up + adcx( %rax, %r10) + adox( 16,(rp), %r9) + mov %r9, 16(rp) +L(b6): mulx( -32,(up), %r9, %rax) + adox( 24,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 24(rp) +L(b5): mulx( -24,(up), %r10, %r8) + adcx( %rax, %r10) + adox( 32,(rp), %r9) + mov %r9, 32(rp) +L(b4): mulx( -16,(up), %r9, %rax) + adox( 40,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 40(rp) +L(b3): adox( 48,(rp), %r9) + mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + jmp L(top) + +L(f7): mulx( (up), %r9, %rax) + lea -16(up), up + lea -16(rp), 
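Functionally, mpn_addmul_1 adds the product of an n-limb operand and a single limb into rp[] and returns the carry limb; the Broadwell code above gets its throughput from mulx plus the adcx/adox pair, which lets two independent carry chains live in the flags at once. As a plain C model (unsigned __int128 is the GCC/Clang extension):

    #include <stdint.h>
    #include <stddef.h>

    /* rp[0..n-1] += up[] * v0; returns the carry out of the top limb. */
    static uint64_t
    addmul_1_model (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
    {
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
          rp[i] = (uint64_t) t;
          cy = (uint64_t) (t >> 64);
        }
      return cy;
    }

In the assembly one carry chain (adox) folds the existing rp[] limbs into the sums while the other (adcx) links the high halves of successive mulx products, and jrcxz ends the loop without disturbing either flag.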
rp + jmp L(b7) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h new file mode 100644 index 0000000..91c91b5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h @@ -0,0 +1,246 @@ +/* Broadwell gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. */ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3400-3800 MHz Intel Xeon E3-1285Lv4 Broadwell */ +/* FFT tuning limit = 467,964,472 */ +/* Generated by tuneup.c, 2019-10-17, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 24 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 455 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 202 +#define MUL_TOOM6H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 406 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 141 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 152 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 151 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 198 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 34 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 426 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 46 + +#define MULMOD_BNM1_THRESHOLD 16 +#define SQRMOD_BNM1_THRESHOLD 18 + +#define MUL_FFT_MODF_THRESHOLD 460 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \ + { 28, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39, 8}, \ + { 79,10}, { 23, 9}, { 55,11}, { 15,10}, \ + { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \ + { 47, 9}, { 99,10}, { 55,11}, { 
31,10}, \ + { 87,11}, { 47,10}, { 103,12}, { 31,11}, \ + { 63,10}, { 135,11}, { 79,10}, { 167,11}, \ + { 95,10}, { 199,11}, { 111,12}, { 63, 8}, \ + { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,12}, { 95,11}, { 191,10}, { 383,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,14}, { 127,13}, \ + { 255,12}, { 607,13}, { 319,12}, { 735,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,13}, { 639,12}, { 1279,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \ + { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \ + { 2687,13}, { 1407,14}, { 767,13}, { 1535,12}, \ + { 3071,13}, { 1599,12}, { 3199,13}, { 1663,14}, \ + { 895,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \ + { 3199,14}, { 1663,13}, { 3455,12}, { 6911,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2815,13}, { 5631,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \ + { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5247,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4351,14}, \ + { 8703,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,17}, { 2047,16}, { 4095,15}, \ + { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 219 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 400 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 400, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 28, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127,11}, { 79,10}, \ + { 159,11}, { 95,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319,12}, \ + { 95, 8}, { 1599, 9}, { 831,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,10}, { 735,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 671,12}, { 351,11}, { 735,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 
799,13}, \ + { 447,12}, { 959,13}, { 511,12}, { 1023,13}, \ + { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \ + { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2047,13}, { 4095,14}, { 2175,13}, \ + { 4351,14}, { 2303,13}, { 4607,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4351,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \ + { 2047,16}, { 4095,15}, { 8447,16}, { 4607,15}, \ + { 9471,14}, { 18943,15}, { 9983,14}, { 19967,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 215 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 80 +#define MULLO_MUL_N_THRESHOLD 11025 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 109 +#define SQRLO_SQR_THRESHOLD 7293 + +#define DC_DIV_QR_THRESHOLD 54 +#define DC_DIVAPPR_Q_THRESHOLD 183 +#define DC_BDIV_QR_THRESHOLD 86 +#define DC_BDIV_Q_THRESHOLD 160 + +#define INV_MULMOD_BNM1_THRESHOLD 58 +#define INV_NEWTON_THRESHOLD 171 +#define INV_APPR_THRESHOLD 171 + +#define BINV_NEWTON_THRESHOLD 292 +#define REDC_1_TO_REDC_2_THRESHOLD 33 +#define REDC_2_TO_REDC_N_THRESHOLD 63 + +#define MU_DIV_QR_THRESHOLD 1589 +#define MU_DIVAPPR_Q_THRESHOLD 1589 +#define MUPI_DIV_QR_THRESHOLD 67 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1866 + +#define POWM_SEC_TABLE 2,10,191,494,712,1378 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 644 +#define SET_STR_PRECOMPUTE_THRESHOLD 1658 + +#define FAC_DSC_THRESHOLD 562 +#define FAC_ODD_THRESHOLD 48 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD2_DIV1_METHOD 5 /* 0.38% faster than 3 */ +#define HGCD_THRESHOLD 73 +#define HGCD_APPR_THRESHOLD 67 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 630 +#define GCDEXT_DC_THRESHOLD 365 +#define JACOBI_BASE_METHOD 1 /* 29.65% faster than 4 */ + +/* Tuneup completed successfully, took 239050 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm new file mode 100644 index 0000000..b7fae2f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm @@ -0,0 +1,195 @@ +dnl AMD64 mpn_mul_1 optimised for Intel Broadwell. + +dnl Copyright 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
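The tuned header above only records numbers; GMP's generic C code selects algorithms by comparing operand sizes against them. With the values above, a balanced multiply stays in the assembly basecase below 26 limbs, switches to Karatsuba (Toom-22) at 26, to Toom-33 at 73, and so on up to the FFT range at 5760 limbs; the MUL_FFT_TABLE3 entries are (size, k) pairs from which the transform parameter k is then picked. A simplified sketch of that dispatch follows; the real selection has the Toom-44/6h/8h levels and unbalanced cases in between, and the names below are placeholders, not GMP internals.

    #include <stddef.h>

    enum mul_algo { ALGO_BASECASE, ALGO_TOOM22, ALGO_TOOM33, ALGO_FFT };

    static enum mul_algo
    choose_mul_algo (size_t n)        /* n = limb count of balanced operands */
    {
      if (n < 26)   return ALGO_BASECASE;   /* MUL_TOOM22_THRESHOLD */
      if (n < 73)   return ALGO_TOOM22;     /* MUL_TOOM33_THRESHOLD */
      if (n < 5760) return ALGO_TOOM33;     /* intermediate Toom levels elided */
      return ALGO_FFT;                      /* MUL_FFT_THRESHOLD */
    }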
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bull - +C AMD pile - +C AMD steam - +C AMD excavator - +C AMD bobcat - +C AMD jaguar - +C Intel P4 - +C Intel core2 - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL 1.70 +C Intel BWL 1.51 +C Intel SKL 1.52 +C Intel atom - +C Intel SLM - +C VIA nano - + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Put an initial mulx before switching, targeting some free registers. +C * Tune feed-in code. +C * Trim nop execution after L(f2). +C * Port to DOS64, not forgetting nop execution. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rcx') + +dnl ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl IFDOS(` define(`up', ``%rsi'') ') dnl +dnl IFDOS(` define(`rp', ``%rcx'') ') dnl +dnl IFDOS(` define(`vl', ``%r9'') ') dnl +dnl IFDOS(` define(`r9', ``rdi'') ') dnl +dnl IFDOS(` define(`n', ``%r8'') ') dnl +dnl IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_1) + + mov v0_param, %r10 + mov n_param, n + mov R32(n_param), R32(%r8) + shr $3, n + and $7, R32(%r8) C clear OF, CF as side-effect + mov %r10, %rdx + lea L(tab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%r8,4), %r8 + lea (%r8, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%r8,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f1), L(tab)) + JMPENT( L(f2), L(tab)) + JMPENT( L(f3), L(tab)) + JMPENT( L(f4), L(tab)) + JMPENT( L(f5), L(tab)) + JMPENT( L(f6), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +L(f0): mulx( (up), %r10, %r8) + lea 56(up), up + lea -8(rp), rp + jmp L(b0) + +L(f3): mulx( (up), %r9, %rax) + lea 16(up), up + lea 16(rp), rp + inc n + jmp L(b3) + +L(f4): mulx( (up), %r10, %r8) + lea 24(up), up + lea 24(rp), rp + inc n + jmp L(b4) + +L(f5): mulx( (up), %r9, %rax) + lea 32(up), up + lea 32(rp), rp + inc n + jmp L(b5) + +L(f6): mulx( (up), %r10, %r8) + lea 40(up), up + lea 40(rp), rp + inc n + jmp L(b6) + +L(f7): mulx( (up), %r9, %rax) + lea 48(up), up + lea 48(rp), rp + inc n + jmp L(b7) + +L(f1): mulx( (up), %r9, %rax) + test n, n + jnz L(b1) +L(1): mov %r9, (rp) + ret + +L(f2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) + test n, n + jz L(end) + + ALIGN(32) +L(top): mov %r10, -8(rp) + adc %r8, %r9 +L(b1): mulx( 8,(up), %r10, %r8) + adc %rax, %r10 + lea 64(up), up + mov %r9, (rp) +L(b0): mov 
%r10, 8(rp) + mulx( -48,(up), %r9, %rax) + lea 64(rp), rp + adc %r8, %r9 +L(b7): mulx( -40,(up), %r10, %r8) + mov %r9, -48(rp) + adc %rax, %r10 +L(b6): mov %r10, -40(rp) + mulx( -32,(up), %r9, %rax) + adc %r8, %r9 +L(b5): mulx( -24,(up), %r10, %r8) + mov %r9, -32(rp) + adc %rax, %r10 +L(b4): mulx( -16,(up), %r9, %rax) + mov %r10, -24(rp) + adc %r8, %r9 +L(b3): mulx( -8,(up), %r10, %r8) + adc %rax, %r10 + mov %r9, -16(rp) + dec n + mulx( (up), %r9, %rax) + jnz L(top) + +L(end): mov %r10, -8(rp) + adc %r8, %r9 + mov %r9, (rp) + adc %rcx, %rax + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm new file mode 100644 index 0000000..7ca5a9b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm @@ -0,0 +1,368 @@ +dnl AMD64 mpn_mul_basecase optimised for Intel Broadwell. + +dnl Copyright 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 addmul_1 +C AMD K8,K9 n/a n/a +C AMD K10 n/a n/a +C AMD bd1 n/a n/a +C AMD bd2 n/a n/a +C AMD bd3 n/a n/a +C AMD bd4 ? ? +C AMD zen ? ? +C AMD bt1 n/a n/a +C AMD bt2 n/a n/a +C Intel P4 n/a n/a +C Intel PNR n/a n/a +C Intel NHM n/a n/a +C Intel SBR n/a n/a +C Intel IBR n/a n/a +C Intel HWL 1.68 n/a +C Intel BWL 1.51 1.67-1.74 +C Intel SKL 1.52 1.63-1.71 +C Intel atom n/a n/a +C Intel SLM n/a n/a +C VIA nano n/a n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Do overlapped software pipelining. +C * When changing this, make sure the code which falls into the inner loops +C does not execute too many no-ops (for both PIC and non-PIC). 
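Both the Broadwell mul_1 just completed and the mul_basecase code that follows reach their 8-way unrolled loops through a small jump table: shr $3 turns n into the number of full 8-limb blocks, while the low three bits of n index L(tab) to pick one of the eight feed-in points L(f0)..L(f7). The same split written in straightforward C peels the residue first instead of jumping into the middle of the unrolled body, which is exactly the extra work the tables avoid; the sketch below is a functional model only, with u128 standing for the GCC/Clang 128-bit extension.

    #include <stdint.h>
    #include <stddef.h>

    /* rp[0..n-1] = up[] * v0; returns the high limb. */
    static uint64_t
    mul_1_unrolled_model (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
    {
      uint64_t cy = 0;
      size_t i = 0;
      for (size_t r = n & 7; i < r; i++)         /* n mod 8 leading limbs */
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v0 + cy;
          rp[i] = (uint64_t) t;
          cy = (uint64_t) (t >> 64);
        }
      for (; i < n; i += 8)                      /* then 8 limbs per iteration */
        for (size_t j = 0; j < 8; j++)
          {
            unsigned __int128 t = (unsigned __int128) up[i + j] * v0 + cy;
            rp[i + j] = (uint64_t) t;
            cy = (uint64_t) (t >> 64);
          }
      return cy;
    }

Entering the unrolled body at a residue-dependent offset, as the assembly does, removes the separate peel loop and keeps every limb inside the software-pipelined code.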
+ +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp_param',`%rcx') +define(`vn', `%r8') + +define(`n', `%rcx') +define(`n_save', `%rbp') +define(`vp', `%r14') +define(`unneg', `%rbx') +define(`v0', `%rdx') +define(`jaddr', `%rax') + +define(`w0', `%r12') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + cmp $2, un_param + ja L(gen) + mov (vp_param), %rdx + mulx( (up), %rax, %r9) C 0 1 + je L(s2x) + +L(s11): mov %rax, (rp) + mov %r9, 8(rp) + FUNC_EXIT() + ret + +L(s2x): cmp $2, vn + mulx( 8,(up), %r8, %r10) C 1 2 + je L(s22) + +L(s21): add %r8, %r9 + adc $0, %r10 + mov %rax, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + FUNC_EXIT() + ret + +L(s22): add %r8, %r9 C 1 + adc $0, %r10 C 2 + mov 8(vp_param), %rdx + mov %rax, (rp) + mulx( (up), %r8, %r11) C 1 2 + mulx( 8,(up), %rax, %rdx) C 2 3 + add %r11, %rax C 2 + adc $0, %rdx C 3 + add %r8, %r9 C 1 + adc %rax, %r10 C 2 + adc $0, %rdx C 3 + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + + ALIGN(16) +L(gen): + push %rbx + push %rbp + push %r12 + push %r14 + + mov vp_param, vp + lea 1(un_param), unneg + mov un_param, n_save + mov R32(un_param), R32(%rax) + and $-8, unneg + shr $3, n_save C loop count + neg unneg + and $7, R32(%rax) C clear CF for adc as side-effect + C note that rax lives very long + mov n_save, n + mov (vp), v0 + lea 8(vp), vp + + lea L(mtab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %r11 + lea (%r11, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%rax,8) +') + +L(mf0): mulx( (up), w2, w3) + lea 56(up), up + lea -8(rp), rp + jmp L(mb0) + +L(mf3): mulx( (up), w0, w1) + lea 16(up), up + lea 16(rp), rp + inc n + jmp L(mb3) + +L(mf4): mulx( (up), w2, w3) + lea 24(up), up + lea 24(rp), rp + inc n + jmp L(mb4) + +L(mf5): mulx( (up), w0, w1) + lea 32(up), up + lea 32(rp), rp + inc n + jmp L(mb5) + +L(mf6): mulx( (up), w2, w3) + lea 40(up), up + lea 40(rp), rp + inc n + jmp L(mb6) + +L(mf7): mulx( (up), w0, w1) + lea 48(up), up + lea 48(rp), rp + inc n + jmp L(mb7) + +L(mf1): mulx( (up), w0, w1) + jmp L(mb1) + +L(mf2): mulx( (up), w2, w3) + lea 8(up), up + lea 8(rp), rp + mulx( (up), w0, w1) + + ALIGN(16) +L(m1top): + mov w2, -8(rp) + adc w3, w0 +L(mb1): mulx( 8,(up), w2, w3) + adc w1, w2 + lea 64(up), up + mov w0, (rp) +L(mb0): mov w2, 8(rp) + mulx( -48,(up), w0, w1) + lea 64(rp), rp + adc w3, w0 +L(mb7): mulx( -40,(up), w2, w3) + mov w0, -48(rp) + adc w1, w2 +L(mb6): mov w2, -40(rp) + mulx( -32,(up), w0, w1) + adc w3, w0 +L(mb5): mulx( -24,(up), w2, w3) + mov w0, -32(rp) + adc w1, w2 +L(mb4): mulx( -16,(up), w0, w1) + mov w2, -24(rp) + adc w3, w0 +L(mb3): mulx( -8,(up), w2, w3) + adc w1, w2 + mov w0, -16(rp) + dec n + mulx( (up), w0, w1) + jnz L(m1top) + +L(m1end): + mov w2, -8(rp) + adc w3, w0 + mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + + dec vn + jz L(done) + + lea L(atab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %rax + lea (%rax, %r10), jaddr +',` + mov (%r10,%rax,8), jaddr +') + +L(outer): + lea (up,unneg,8), up + mov n_save, n + mov (vp), v0 + lea 8(vp), vp + jmp *jaddr + +L(f0): mulx( 8,(up), w2, w3) + lea 8(rp,unneg,8), rp + lea -1(n), n + jmp L(b0) + +L(f3): mulx( -16,(up), w0, w1) + lea -56(rp,unneg,8), rp + jmp L(b3) + +L(f4): mulx( -24,(up), w2, w3) + lea -56(rp,unneg,8), rp + jmp L(b4) + +L(f5): mulx( -32,(up), w0, w1) + lea -56(rp,unneg,8), rp + jmp L(b5) + 
+L(f6): mulx( -40,(up), w2, w3) + lea -56(rp,unneg,8), rp + jmp L(b6) + +L(f7): mulx( 16,(up), w0, w1) + lea 8(rp,unneg,8), rp + jmp L(b7) + +L(f1): mulx( (up), w0, w1) + lea 8(rp,unneg,8), rp + jmp L(b1) + +L(am1end): + adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 + mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + + dec vn C clear OF as side-effect + jnz L(outer) +L(done): + pop %r14 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(f2): mulx( -8,(up), w2, w3) + lea 8(rp,unneg,8), rp + mulx( (up), w0, w1) + + ALIGN(16) +L(am1top): + adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(am1end) +L(b1): mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea -1(n), n + mov w0, (rp) + adcx( w1, w2) +L(b0): mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) +L(b7): mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) +L(b6): mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) +L(b5): mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) +L(b4): mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) +L(b3): adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(am1top) + + JUMPTABSECT + ALIGN(8) +L(mtab):JMPENT( L(mf0), L(mtab)) + JMPENT( L(mf1), L(mtab)) + JMPENT( L(mf2), L(mtab)) + JMPENT( L(mf3), L(mtab)) + JMPENT( L(mf4), L(mtab)) + JMPENT( L(mf5), L(mtab)) + JMPENT( L(mf6), L(mtab)) + JMPENT( L(mf7), L(mtab)) +L(atab):JMPENT( L(f0), L(atab)) + JMPENT( L(f1), L(atab)) + JMPENT( L(f2), L(atab)) + JMPENT( L(f3), L(atab)) + JMPENT( L(f4), L(atab)) + JMPENT( L(f5), L(atab)) + JMPENT( L(f6), L(atab)) + JMPENT( L(f7), L(atab)) + TEXT +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm new file mode 100644 index 0000000..5cdb209 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm @@ -0,0 +1,395 @@ +dnl X64-64 mpn_mullo_basecase optimised for Intel Broadwell. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. 
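The mul_basecase routine above is the schoolbook product in two phases: the first row rp[0..un] = up[] * vp[0] runs through the mul_1-style code (L(mtab), L(m1top)), and each remaining row adds up[] * vp[j] into the partial product at offset j through the addmul_1-style code (L(atab), L(am1top)), with the outer loop stepping vp one limb at a time. A compact, self-contained C model of that outer structure, using illustrative helper names:

    #include <stdint.h>
    #include <stddef.h>

    typedef unsigned __int128 u128;

    static uint64_t
    mul_1_row (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v)
    {
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          u128 t = (u128) up[i] * v + cy;
          rp[i] = (uint64_t) t;
          cy = (uint64_t) (t >> 64);
        }
      return cy;
    }

    static uint64_t
    addmul_1_row (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v)
    {
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          u128 t = (u128) up[i] * v + rp[i] + cy;
          rp[i] = (uint64_t) t;
          cy = (uint64_t) (t >> 64);
        }
      return cy;
    }

    /* rp[0..un+vn-1] = up[] * vp[]; requires un >= vn >= 1 as in GMP. */
    static void
    mul_basecase_model (uint64_t *rp, const uint64_t *up, size_t un,
                        const uint64_t *vp, size_t vn)
    {
      rp[un] = mul_1_row (rp, up, un, vp[0]);               /* first row  */
      for (size_t j = 1; j < vn; j++)
        rp[un + j] = addmul_1_row (rp + j, up, un, vp[j]);  /* later rows */
    }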
+ +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r11') +define(`jmpreg',`%rbx') +define(`nn', `%rbp') + +C TODO +C * Suppress more rp[] rewrites in corner. +C * Rearrange feed-in jumps for short branch forms. +C * Perhaps roll out the heavy artillery and 8-way unroll outer loop. Since +C feed-in code implodes, the blow-up will not be more than perhaps 4x. +C * Micro-optimise critical lead-in code block around L(ent). +C * Write n < 4 code specifically for Broadwell (current code is for Haswell). + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + cmp $4, R32(n) + jae L(big) + + mov vp_param, vp + mov (up), %rdx + + cmp $2, R32(n) + jae L(gt1) +L(n1): imul (vp), %rdx + mov %rdx, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp), %r9 + mulx( %r9, %rax, %rdx) + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp), %r9 + mulx( %r9, %rax, %r10) C u0 x v0 + mov %rax, (rp) + mov 8(up), %rdx + mulx( %r9, %rax, %rdx) C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r8 + mov (up), %rdx + mulx( %r8, %rax, %rdx) C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r8 C u1 x v1 + add %r8, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret + + ALIGN(16) +L(big): push %r14 + push %r12 + push %rbx + push %rbp + mov -8(vp_param,n,8), %r14 C FIXME Put at absolute end + imul (up), %r14 C FIXME Put at absolute end + lea -3(n), R32(nn) + lea 8(vp_param), vp + mov (vp_param), %rdx + + mov R32(n), R32(%rax) + shr $3, R32(n) + and $7, R32(%rax) C clear OF, CF as side-effect + lea L(mtab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %rax + lea (%rax, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%rax,8) +') + +L(mf0): mulx( (up), %r10, %r8) + lea 56(up), up + lea -8(rp), rp + lea L(f7)(%rip), jmpreg + jmp L(mb0) + +L(mf3): mulx( (up), %r9, %rax) + lea 16(up), up + lea 16(rp), rp + jrcxz L(mc) + inc R32(n) + lea L(f2)(%rip), jmpreg + jmp L(mb3) + +L(mc): mulx( -8,(up), %r10, %r8) + add %rax, %r10 + mov %r9, -16(rp) + mulx( (up), %r9, %rax) + mov %r10, -8(rp) + adc %r8, %r9 + mov %r9, (rp) + jmp L(c2) + +L(mf4): mulx( (up), %r10, %r8) + lea 24(up), up + lea 24(rp), rp + inc R32(n) + lea L(f3)(%rip), jmpreg + jmp L(mb4) + +L(mf5): mulx( (up), %r9, %rax) + lea 32(up), up + lea 32(rp), rp + inc R32(n) + lea L(f4)(%rip), jmpreg + jmp L(mb5) + +L(mf6): mulx( (up), %r10, %r8) + lea 40(up), up + lea 40(rp), rp + inc R32(n) + lea L(f5)(%rip), jmpreg + jmp L(mb6) + +L(mf7): mulx( (up), %r9, %rax) + lea 48(up), up + lea 48(rp), rp + lea L(f6)(%rip), jmpreg + jmp L(mb7) + +L(mf1): mulx( (up), %r9, %rax) + lea L(f0)(%rip), jmpreg + jmp L(mb1) + +L(mf2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + lea L(f1)(%rip), jmpreg + mulx( (up), %r9, %rax) + +C FIXME ugly fallthrough FIXME + ALIGN(32) +L(mtop):mov %r10, -8(rp) + adc %r8, %r9 +L(mb1): mulx( 8,(up), %r10, %r8) + adc %rax, %r10 + lea 64(up), up + mov %r9, (rp) +L(mb0): mov %r10, 8(rp) + mulx( -48,(up), %r9, %rax) + lea 64(rp), rp + adc %r8, %r9 +L(mb7): mulx( -40,(up), %r10, %r8) + mov %r9, -48(rp) + adc %rax, %r10 +L(mb6): mov %r10, -40(rp) + mulx( -32,(up), %r9, %rax) + adc %r8, %r9 +L(mb5): mulx( -24,(up), %r10, %r8) + mov %r9, -32(rp) + adc 
%rax, %r10 +L(mb4): mulx( -16,(up), %r9, %rax) + mov %r10, -24(rp) + adc %r8, %r9 +L(mb3): mulx( -8,(up), %r10, %r8) + adc %rax, %r10 + mov %r9, -16(rp) + dec R32(n) + mulx( (up), %r9, %rax) + jnz L(mtop) + +L(mend):mov %r10, -8(rp) + adc %r8, %r9 + mov %r9, (rp) + adc %rcx, %rax + + lea 8(,nn,8), %r12 + neg %r12 + shr $3, R32(nn) + jmp L(ent) + +L(f0): mulx( (up), %r10, %r8) + lea -8(up), up + lea -8(rp), rp + lea L(f7)(%rip), jmpreg + jmp L(b0) + +L(f1): mulx( (up), %r9, %rax) + lea -1(nn), R32(nn) + lea L(f0)(%rip), jmpreg + jmp L(b1) + +L(end): adox( (rp), %r9) + mov %r9, (rp) + adox( %rcx, %rax) C relies on rcx = 0 + adc %rcx, %rax C FIXME suppress, use adc below; reqs ent path edits + lea 8(%r12), %r12 +L(ent): mulx( 8,(up), %r10, %r8) C r8 unused (use imul?) + add %rax, %r14 + add %r10, %r14 C h + lea (up,%r12), up C reset up + lea 8(rp,%r12), rp C reset rp + mov (vp), %rdx + lea 8(vp), vp + or R32(nn), R32(n) C copy count, clear CF,OF (n = 0 prior) + jmp *jmpreg + +L(f7): mulx( (up), %r9, %rax) + lea -16(up), up + lea -16(rp), rp + lea L(f6)(%rip), jmpreg + jmp L(b7) + +L(f2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) + lea L(f1)(%rip), jmpreg + +C FIXME ugly fallthrough FIXME + ALIGN(32) +L(top): adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) + jrcxz L(end) +L(b1): mulx( 8,(up), %r10, %r8) + adox( (rp), %r9) + lea -1(n), R32(n) + mov %r9, (rp) + adcx( %rax, %r10) +L(b0): mulx( 16,(up), %r9, %rax) + adcx( %r8, %r9) + adox( 8,(rp), %r10) + mov %r10, 8(rp) +L(b7): mulx( 24,(up), %r10, %r8) + lea 64(up), up + adcx( %rax, %r10) + adox( 16,(rp), %r9) + mov %r9, 16(rp) +L(b6): mulx( -32,(up), %r9, %rax) + adox( 24,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 24(rp) +L(b5): mulx( -24,(up), %r10, %r8) + adcx( %rax, %r10) + adox( 32,(rp), %r9) + mov %r9, 32(rp) +L(b4): mulx( -16,(up), %r9, %rax) + adox( 40,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 40(rp) +L(b3): adox( 48,(rp), %r9) + mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + jmp L(top) + +L(f6): mulx( (up), %r10, %r8) + lea 40(up), up + lea -24(rp), rp + lea L(f5)(%rip), jmpreg + jmp L(b6) + +L(f5): mulx( (up), %r9, %rax) + lea 32(up), up + lea -32(rp), rp + lea L(f4)(%rip), jmpreg + jmp L(b5) + +L(f4): mulx( (up), %r10, %r8) + lea 24(up), up + lea -40(rp), rp + lea L(f3)(%rip), jmpreg + jmp L(b4) + +L(f3): mulx( (up), %r9, %rax) + lea 16(up), up + lea -48(rp), rp + jrcxz L(cor) + lea L(f2)(%rip), jmpreg + jmp L(b3) + +L(cor): adox( 48,(rp), %r9) + mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) C FIXME suppress + adox( (rp), %r9) + mov %r9, (rp) C FIXME suppress + adox( %rcx, %rax) +L(c2): + mulx( 8,(up), %r10, %r8) + adc %rax, %r14 + add %r10, %r14 + mov (vp), %rdx + test R32(%rcx), R32(%rcx) + mulx( -16,(up), %r10, %r8) + mulx( -8,(up), %r9, %rax) + adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) + adox( (rp), %r9) + adox( %rcx, %rax) + adc %rcx, %rax + mulx( (up), %r10, %r8) + add %rax, %r14 + add %r10, %r14 + mov 8(vp), %rdx + mulx( -16,(up), %rcx, %rax) + add %r9, %rcx + mov %rcx, (rp) + adc $0, %rax + mulx( -8,(up), %r10, %r8) + add %rax, %r14 + add %r10, %r14 + mov %r14, 8(rp) + pop %rbp + pop %rbx + pop %r12 + pop %r14 + FUNC_EXIT() + ret +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(mtab):JMPENT( L(mf7), L(mtab)) + JMPENT( L(mf0), L(mtab)) + JMPENT( L(mf1), L(mtab)) + JMPENT( L(mf2), L(mtab)) + 
JMPENT( L(mf3), L(mtab)) + JMPENT( L(mf4), L(mtab)) + JMPENT( L(mf5), L(mtab)) + JMPENT( L(mf6), L(mtab)) diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm new file mode 100644 index 0000000..ff35124 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm @@ -0,0 +1,710 @@ +dnl AMD64 mpn_sbpi1_bdiv_r optimised for Intel Broadwell. + +dnl Copyright 2015, 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 addmul_1 +C AMD K8,K9 n/a n/a +C AMD K10 n/a n/a +C AMD bd1 n/a n/a +C AMD bd2 n/a n/a +C AMD bd3 n/a n/a +C AMD bd4 ? ? +C AMD zn1 ? ? +C AMD zn2 ? ? +C AMD zn3 ? ? +C AMD bt1 n/a n/a +C AMD bt2 n/a n/a +C Intel P4 n/a n/a +C Intel PNR n/a n/a +C Intel NHM n/a n/a +C Intel SBR n/a n/a +C Intel IBR n/a n/a +C Intel HWL 1.68 n/a +C Intel BWL 1.51 1.67-1.74 +C Intel SKL 1.52 1.63-1.71 +C Intel atom n/a n/a +C Intel SLM n/a n/a +C VIA nano n/a n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Do overlapped software pipelining. +C * Reduce register use, i.e., by combining n_neg and n_save. +C * Supporess initial store through up, it's always a zero. +C * Streamline up and dp setup. +C * When changing this, make sure the code which falls into the inner loops +C does not execute too many no-ops (for both PIC and non-PIC). 
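For orientation before the register assignments below, here is a hedged C-level sketch of the schoolbook right-to-left (Hensel/2-adic) reduction this file implements. The limb type and the helper addmul routine are illustrative stand-ins rather than GMP's real internals, and the return value only approximates the carry bit the assembly accumulates in %r13; the one assumption carried over from the code is that dinv satisfies dinv * dp[0] == -1 (mod 2^64), so adding q*D clears the low limb of U at every step.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;                      /* stand-in for mp_limb_t */

/* {up,n} += {dp,n} * q; returns the carry-out limb. */
static limb addmul_limb(limb *up, const limb *dp, size_t n, limb q)
{
    limb cy = 0;
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128) dp[i] * q + up[i] + cy;
        up[i] = (limb) t;
        cy = (limb) (t >> 64);
    }
    return cy;
}

/* Reduce {up,un} by the odd divisor {dp,dn}.  Afterwards the dn high limbs
   of up hold the 2-adic remainder (return convention is approximate). */
static limb bdiv_r_sketch(limb *up, size_t un, const limb *dp, size_t dn,
                          limb dinv)
{
    limb carry_bit = 0;
    for (size_t i = 0; i + dn < un; i++) {
        limb q  = up[i] * dinv;                   /* q*dp[0] == -up[i] mod 2^64 */
        limb hi = addmul_limb(up + i, dp, dn, q); /* zeroes up[i] */
        unsigned __int128 s = (unsigned __int128) up[i + dn] + hi + carry_bit;
        up[i + dn] = (limb) s;
        carry_bit  = (limb) (s >> 64);
    }
    return carry_bit;
}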
+ +dnl mp_limb_t +dnl mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un, +dnl mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) + +define(`up', `%rdi') +define(`un', `%rsi') +define(`dp_param',`%rdx') +define(`dn_param',`%rcx') +define(`dinv', `%r8') + +define(`n', `%rcx') +define(`n_save', `%rbp') +define(`dp', `%r14') +define(`n_neg', `%rbx') +define(`q', `%rdx') +define(`jaddr', `%rax') + +define(`w0', `%r12') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ifdef(`MAX_SPECIAL',,` +define(`MAX_SPECIAL', 8)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sbpi1_bdiv_r) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + + lea L(atab)(%rip), %r10 + + cmp $MAX_SPECIAL, dn_param + jbe L(sma) + +ifelse(MAX_SPECIAL,8,,` +forloop(i,eval(MAX_SPECIAL+1),9,`L(i): +')') + +L(gen): push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + + sub dn_param, un C outer loop count + + lea -8(,dn_param,8), n_neg + neg n_neg + mov dn_param, n_save + mov R32(dn_param), R32(%rax) + shr $3, n_save C loop count + and $7, R32(%rax) C clear CF and OF as side-effect + +ifdef(`PIC', +` movslq (%r10,%rax,4), %rax + lea (%rax,%r10), jaddr +',` + mov (%r10,%rax,8), jaddr +') + mov (up), q + imul dinv, q + jmp L(outer) + +L(f0): mulx( (dp), w2, w3) + lea -1(n), n + mulx( 8,(dp), w0, w1) + lea -8(dp), dp + adcx( w3, w0) + adox( (up), w2) + lea -8(up), up + jmp L(b0x) + +L(f3): mulx( (dp), w0, w1) + mulx( 8,(dp), w2, w3) + adox( (up), w0) + lea -48(up), up + lea 16(dp), dp + jmp L(b3x) + +L(f4): mulx( (dp), w2, w3) + mulx( 8,(dp), w0, w1) + lea 24(dp), dp + adox( (up), w2) + lea -40(up), up + adcx( w3, w0) + jmp L(b4x) + +L(f5): mulx( (dp), w0, w1) + mulx( 8,(dp), w2, w3) + lea 32(dp), dp + adcx( w1, w2) + adox( (up), w0) + lea -32(up), up + jmp L(b5x) + +L(f6): mulx( (dp), w2, w3) + mulx( 8,(dp), w0, w1) + lea 40(dp), dp + adox( (up), w2) + lea -24(up), up + adcx( w3, w0) + jmp L(b6x) + +L(f7): mulx( (dp), w0, w1) + mulx( 8,(dp), w2, w3) + lea 48(dp), dp + adcx( w1, w2) + adox( (up), w0) + lea -16(up), up + jmp L(b7x) + +L(f1): mulx( (dp), w0, w1) + mulx( 8,(dp), w2, w3) + adox( (up), w0) + lea -1(n), n + jmp L(b1x) + +L(f2): mulx( (dp), w2, w3) + mulx( 8,(dp), w0, w1) + lea 8(dp), dp + adox( (up), w2) + lea 8(up), up + adcx( w3, w0) + jmp L(b2x) + +L(end): adox( (up), w0) + adox( %rcx, w1) C relies on rcx = 0 + mov w0, (up) + adc %rcx, w1 C relies on rcx = 0 + mov 8(up,n_neg), q C Compute next quotient early... 
+ mulx( dinv, q, %r12) C ...(unused in last iteration) + bt $0, R32(%r13) + adc w1, 8(up) + setc R8(%r13) + dec un C clear OF as side-effect + jz L(done) + + lea (dp,n_neg), dp C reset dp to D[]'s beginning + lea 8(up,n_neg), up C point up to U[]'s current beginning +L(outer): + mov n_save, n + test %eax, %eax C clear CF and OF + jmp *jaddr + + ALIGN(16) +L(top): adox( -8,(up), w2) + adcx( w3, w0) + mov w2, -8(up) + jrcxz L(end) +L(b2x): mulx( 8,(dp), w2, w3) + adox( (up), w0) + lea -1(n), n + mov w0, (up) +L(b1x): adcx( w1, w2) + mulx( 16,(dp), w0, w1) + adcx( w3, w0) + adox( 8,(up), w2) + mov w2, 8(up) +L(b0x): mulx( 24,(dp), w2, w3) + lea 64(dp), dp + adcx( w1, w2) + adox( 16,(up), w0) + mov w0, 16(up) +L(b7x): mulx( -32,(dp), w0, w1) + adox( 24,(up), w2) + adcx( w3, w0) + mov w2, 24(up) +L(b6x): mulx( -24,(dp), w2, w3) + adcx( w1, w2) + adox( 32,(up), w0) + mov w0, 32(up) +L(b5x): mulx( -16,(dp), w0, w1) + adox( 40,(up), w2) + adcx( w3, w0) + mov w2, 40(up) +L(b4x): adox( 48,(up), w0) + mulx( -8,(dp), w2, w3) + mov w0, 48(up) +L(b3x): lea 64(up), up + adcx( w1, w2) + mulx( (dp), w0, w1) + jmp L(top) + +L(done):mov %r13, %rax + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(sma): +ifdef(`PIC', +` movslq 28(%r10,dn_param,4), %rax + lea (%rax,%r10), jaddr +',` + mov 56(%r10,dn_param,8), jaddr +') + jmp *jaddr + +L(1): mov (dp_param), %r10 + xor R32(%rax), R32(%rax) + mov (up), %rdx + dec un + mov %rdx, %r9 +L(o1): mulx( dinv, %rdx, %r11) C next quotient + lea 8(up), up + mulx( %r10, %rcx, %rdx) C 0 1 + add %r9, %rcx C 0 + adc %rax, %rdx C 1 + add (up), %rdx C 1 + setc R8(%rax) C 2 + mov %rdx, %r9 C 1 + dec un + jnz L(o1) + mov %r9, (up) + + FUNC_EXIT() + ret + +ifdef(`VER',,`define(`VER',1)') +L(2): push %r12 + push %r14 + + mov dp_param, dp C free up rdx + sub dn_param, un C loop count + mov (up), q + imul dinv, q + +ifelse(VER,0,` + xor R32(%rax), R32(%rax) +L(o2): test %eax, %eax C clear CF and OF + mulx( (dp), w2, w3) C 0 1 + mulx( 8,(dp), %rdx, w1) C 1 2 + add (up), w2 C 0 + adc 8(up), %rdx C 1 + adc $0, w1 C 2 cannot carry further + add w3, %rdx C 1 + mov %rdx, 8(up) C 1 + adc $0, w1 C 2 + imul dinv, q C + bt $0, R32(%rax) + adc 16(up), w1 C 2 + mov w1, 16(up) + setc R8(%rax) + lea 8(up), up + dec un + jnz L(o2) +') +ifelse(VER,1,` + push %rbx + push %r13 + xor R32(%r13), R32(%r13) + mov (up), %rax + mov 8(up), %rbx +L(o2): xor R32(%rcx), R32(%rcx) + mulx( (dp), w2, w3) C 0 1 + mulx( 8,(dp), %rdx, w1) C 1 2 + adox( %rax, w2) C 0 + adcx( w3, %rdx) C 1 + adox( %rbx, %rdx) C 1 + adox( %rcx, w1) C 2 cannot carry further + mov %rdx, %rax C 1 + adc %rcx, w1 C 2 + imul dinv, q C + bt $0, R32(%r13) + adc 16(up), w1 C 2 + mov w1, %rbx + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o2) + + mov %rax, (up) + mov %rbx, 8(up) + mov %r13, %rax + pop %r13 + pop %rbx +') +ifelse(VER,2,` + xor R32(%rax), R32(%rax) + mov (up), %r10 + mov 8(up), %r9 +L(o2): mulx( (dp), %r12, %r11) + mulx( 8,(dp), %rdx, %rcx) + add %r11, %rdx C 1 + adc $0, %rcx C 2 + add %r10, %r12 C 0 add just to produce carry + adc %r9, %rdx C 1 + mov %rdx, %r10 C 1 + mulx( dinv, %rdx, %r12) C next quotient + adc %rax, %rcx C 2 + setc R8(%rax) C 3 + mov 16(up), %r9 C 2 + add %rcx, %r9 C 2 + adc $0, R32(%rax) C 3 + lea 8(up), up + dec un + jnz L(o2) + + mov %r10, (up) + mov %r9, 8(up) +') +ifelse(VER,3,` + xor R32(%rax), R32(%rax) + mov (up), %r10 + mov 8(up), %r9 +L(o2): mulx( (dp), %r12, %r11) + add %r10, %r12 C 0 add just to produce carry + mulx( 8,(dp), %rdx, %rcx) + adc %r11, %rdx C 1 + adc $0, 
%rcx C 2 + add %r9, %rdx C 1 + mov %rdx, %r10 C 1 + mulx( dinv, %rdx, %r12) C next quotient + adc %rax, %rcx C 2 + setc R8(%rax) C 3 + mov 16(up), %r9 C 2 + add %rcx, %r9 C 2 + adc $0, R32(%rax) C 3 + lea 8(up), up + dec un + jnz L(o2) + + mov %r10, (up) + mov %r9, 8(up) +') + pop %r14 + pop %r12 + FUNC_EXIT() + ret + +ifelse(eval(MAX_SPECIAL>=3),1,` +L(3): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + sub dn_param, un C outer loop count + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o3): xor R32(%rcx), R32(%rcx) C clear rcx, CF, and OF + mulx( (dp), w0, w1) C 0 1 + adox( %rax, w0) C 0 + mulx( 8,(dp), %rax, w3) C 1 2 + adcx( w1, %rax) C 1 + adox( %rbx, %rax) C 1 + mulx( 16,(dp), %rbx, w1) C 2 3 + mov dinv, q C 1 + mulx( %rax, q, w0) + adcx( w3, %rbx) C 2 + adox( 16,(up), %rbx) C 2 + adox( %rcx, w1) C 3 + adc $0, w1 C 3 + bt $0, R32(%r13) + adc w1, 24(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o3) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=4),1,` +L(4): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + sub dn_param, un C outer loop count + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o4): xor R32(%rcx), R32(%rcx) + mulx( (dp), w2, w3) + adox( %rax, w2) + mulx( 8,(dp), %rax, w1) + adcx( w3, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w3) + adcx( w1, %rbx) + mulx( 24,(dp), w0, w1) + mov dinv, q + mulx( %rax, q, w2) + adox( 16,(up), %rbx) + adcx( w3, w0) + adox( 24,(up), w0) + adox( %rcx, w1) + mov w0, 24(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 32(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o4) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=5),1,` +L(5): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + sub dn_param, un C outer loop count + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o5): xor R32(%rcx), R32(%rcx) + mulx( (dp), w0, w1) + adox( %rax, w0) + mulx( 8,(dp), %rax, w3) + adcx( w1, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w1) + adcx( w3, %rbx) + adox( 16,(up), %rbx) + mulx( 24,(dp), w2, w3) + adcx( w1, w2) + mulx( 32,(dp), w0, w1) + adox( 24,(up), w2) + adcx( w3, w0) + mov dinv, q + mulx( %rax, q, w3) + mov w2, 24(up) + adox( 32,(up), w0) + adox( %rcx, w1) + mov w0, 32(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 40(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o5) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=6),1,` +L(6): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + sub dn_param, un C outer loop count + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o6): xor R32(%rcx), R32(%rcx) + mulx( (dp), w2, w3) + adox( %rax, w2) + mulx( 8,(dp), %rax, w1) + adcx( w3, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w3) + adcx( w1, %rbx) + mulx( 24,(dp), w0, w1) + adox( 16,(up), %rbx) + adcx( w3, w0) + adox( 24,(up), w0) + mulx( 32,(dp), w2, w3) + mov w0, 24(up) + adcx( w1, w2) + mulx( 40,(dp), w0, w1) + adox( 32,(up), w2) + adcx( w3, w0) + mov dinv, q + mulx( %rax, q, w3) + mov w2, 32(up) + adox( 40,(up), w0) + adox( %rcx, w1) + mov w0, 40(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 48(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o6) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=7),1,` +L(7): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp + xor %r13, %r13 + sub dn_param, un + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul 
dinv, q +L(o7): xor R32(%rcx), R32(%rcx) + mulx( (dp), w0, w1) + adox( %rax, w0) + mulx( 8,(dp), %rax, w3) + adcx( w1, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w1) + adcx( w3, %rbx) + mulx( 24,(dp), w2, w3) + adcx( w1, w2) + adox( 16,(up), %rbx) + mulx( 32,(dp), w0, w1) + adox( 24,(up), w2) + adcx( w3, w0) + mov w2, 24(up) + adox( 32,(up), w0) + mulx( 40,(dp), w2, w3) + mov w0, 32(up) + adcx( w1, w2) + mulx( 48,(dp), w0, w1) + adox( 40,(up), w2) + adcx( w3, w0) + mov w2, 40(up) + mov %rax, q + mulx( dinv, q, w2) + adox( 48,(up), w0) + adox( %rcx, w1) + mov w0, 48(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 56(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o7) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=8),1,` +L(8): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp + xor %r13, %r13 + sub dn_param, un + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o8): xor R32(%rcx), R32(%rcx) + mulx( (dp), w2, w3) + adox( %rax, w2) + mulx( 8,(dp), %rax, w1) + adcx( w3, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w3) + adcx( w1, %rbx) + mulx( 24,(dp), w0, w1) + adox( 16,(up), %rbx) + adcx( w3, w0) + mulx( 32,(dp), w2, w3) + adcx( w1, w2) + adox( 24,(up), w0) + mov w0, 24(up) + mulx( 40,(dp), w0, w1) + adox( 32,(up), w2) + adcx( w3, w0) + mov w2, 32(up) + adox( 40,(up), w0) + mulx( 48,(dp), w2, w3) + mov w0, 40(up) + adcx( w1, w2) + mulx( 56,(dp), w0, w1) + adox( 48,(up), w2) + adcx( w3, w0) + mov dinv, q + mulx( %rax, q, w3) + mov w2, 48(up) + adox( 56,(up), w0) + adox( %rcx, w1) + mov w0, 56(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 64(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o8) + jmp L(esma) +') + +L(esma):mov %rax, (up) + mov %rbx, 8(up) + mov %r13, %rax + pop %r14 + pop %r13 + pop %r12 + pop %rbx + FUNC_EXIT() + ret + + + JUMPTABSECT + ALIGN(8) +L(atab):JMPENT( L(f0), L(atab)) + JMPENT( L(f1), L(atab)) + JMPENT( L(f2), L(atab)) + JMPENT( L(f3), L(atab)) + JMPENT( L(f4), L(atab)) + JMPENT( L(f5), L(atab)) + JMPENT( L(f6), L(atab)) + JMPENT( L(f7), L(atab)) + JMPENT( L(1), L(atab)) + JMPENT( L(2), L(atab)) + JMPENT( L(3), L(atab)) + JMPENT( L(4), L(atab)) + JMPENT( L(5), L(atab)) + JMPENT( L(6), L(atab)) + JMPENT( L(7), L(atab)) + JMPENT( L(8), L(atab)) + TEXT +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm new file mode 100644 index 0000000..e81b01b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm @@ -0,0 +1,839 @@ +dnl AMD64 mpn_sqr_basecase optimised for Intel Broadwell. + +dnl Copyright 2015, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 addmul_1 +C AMD K8,K9 n/a n/a +C AMD K10 n/a n/a +C AMD bd1 n/a n/a +C AMD bd2 n/a n/a +C AMD bd3 n/a n/a +C AMD bd4 ? ? +C AMD zen ? ? +C AMD bt1 n/a n/a +C AMD bt2 n/a n/a +C Intel P4 n/a n/a +C Intel PNR n/a n/a +C Intel NHM n/a n/a +C Intel SBR n/a n/a +C Intel IBR n/a n/a +C Intel HWL 1.68 n/a +C Intel BWL 1.51 1.67-1.74 +C Intel SKL 1.52 1.63-1.71 +C Intel atom n/a n/a +C Intel SLM n/a n/a +C VIA nano n/a n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * We have 8 addmul_1 loops which fall into each other. The idea is to save +C on switching code, since a circularly updated computed goto target will +C hardly allow correct branch prediction. On 2nd thought, we now might make +C each of the 8 loop branches be poorly predicted since they will be +C executed fewer times for each time. With just one addmul_1 loop, the loop +C count will change only once each 8th time. +C * Do overlapped software pipelining. +C * Perhaps load in shrx/sarx, eliminating separate load insn. +C * Schedule add+stored in small n code. +C * Try swapping adox and adcx insn, making mulx have more time to run. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') + +define(`n', `%rcx') +define(`un_save', `%rbx') +define(`u0', `%rdx') + +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + cmp $2, un_param + jae L(gt1) + + mov (up), %rdx + mulx( %rdx, %rax, %rdx) + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(gt1): jne L(gt2) + + mov (up), %rdx + mov 8(up), %rcx + mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2 + mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1 + mov %rcx, %rdx + mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3 + add %r9, %r9 C W 1 + adc %r10, %r10 C W 2 + adc $0, %rdx C W 3 + add %r9, %r8 C W 1 + adc %r11, %r10 C W 2 + adc $0, %rdx C W 3 + mov %rax, (rp) + mov %r8, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + +L(gt2): cmp $4, un_param + jae L(gt3) + + push %rbx + mov (up), %rdx + mulx( 8,(up), w2, w3) + mulx( 16,(up), w0, w1) + add w3, w0 + mov 8(up), %rdx + mulx( 16,(up), %rax, w3) + adc %rax, w1 + adc $0, w3 + test R32(%rbx), R32(%rbx) + mov (up), %rdx + mulx( %rdx, %rbx, %rcx) + mov %rbx, (rp) + mov 8(up), %rdx + mulx( %rdx, %rax, %rbx) + mov 16(up), %rdx + mulx( %rdx, %rsi, %rdx) + adcx( w2, w2) + adcx( w0, w0) + adcx( w1, w1) + adcx( w3, w3) + adox( w2, %rcx) + adox( w0, %rax) + adox( w1, %rbx) + adox( w3, %rsi) + mov $0, R32(%r8) + adox( %r8, %rdx) + adcx( %r8, %rdx) + mov %rcx, 8(rp) + mov %rax, 16(rp) + mov %rbx, 24(rp) + mov %rsi, 32(rp) + mov %rdx, 40(rp) + pop %rbx + FUNC_EXIT() + ret + +L(gt3): push %rbx + + lea -3(un_param), R32(un_save) + lea 5(un_param), R32(n) + mov R32(un_param), R32(%rax) + and $-8, R32(un_save) + shr $3, R32(n) C count for mul_1 loop + neg un_save C 8*count and offert for addmul_1 loops + and $7, R32(%rax) C clear CF for adc as side-effect + + mov (up), u0 + + lea L(mtab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %r8 + lea (%r8, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%rax,8) +') + +L(mf0): mulx( u0, w0, 
w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + lea 64(up), up + add w1, w2 + jmp L(mb0) + +L(mf3): mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mov w2, (rp) + mulx( 8,(up), w0, w1) + lea 24(up), up + lea 24(rp), rp + add w3, w0 + jmp L(mb3) + +L(mf4): mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) + lea 32(up), up + lea 32(rp), rp + add w1, w2 + jmp L(mb4) + +L(mf5): mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) + lea 40(up), up + lea 40(rp), rp + add w3, w0 + jmp L(mb5) + +L(mf6): mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) + lea 48(up), up + lea 48(rp), rp + add w1, w2 + jmp L(mb6) + +L(mf7): mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) + lea 56(up), up + lea 56(rp), rp + add w3, w0 + jmp L(mb7) + +L(mf1): mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) + lea 8(up), up + lea 8(rp), rp + add w3, w0 + jmp L(mb1) + +L(mf2): mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) + lea 16(up), up + lea 16(rp), rp + dec R32(n) + add w1, w2 + mulx( (up), w0, w1) + + ALIGN(16) +L(top): mov w2, -8(rp) + adc w3, w0 +L(mb1): mulx( 8,(up), w2, w3) + adc w1, w2 + lea 64(up), up +L(mb0): mov w0, (rp) + mov w2, 8(rp) + mulx( -48,(up), w0, w1) + lea 64(rp), rp + adc w3, w0 +L(mb7): mulx( -40,(up), w2, w3) + mov w0, -48(rp) + adc w1, w2 +L(mb6): mov w2, -40(rp) + mulx( -32,(up), w0, w1) + adc w3, w0 +L(mb5): mulx( -24,(up), w2, w3) + mov w0, -32(rp) + adc w1, w2 +L(mb4): mulx( -16,(up), w0, w1) + mov w2, -24(rp) + adc w3, w0 +L(mb3): mulx( -8,(up), w2, w3) + adc w1, w2 + mov w0, -16(rp) + dec R32(n) + mulx( (up), w0, w1) + jnz L(top) + +L(end): mov w2, -8(rp) + adc w3, w0 +C mov w0, (rp) +C adc %rcx, w1 +C mov w1, 8(rp) + + lea L(atab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %r11 + lea (%r11, %r10), %r11 +',` + mov (%r10,%rax,8), %r11 +') + mov $63, R32(%rax) + jmp *%r11 + +L(ed0): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f7): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea -64(up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov (up), w1 C up[-1] + mov 8(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + jmp L(b7) + + ALIGN(16) +L(tp0): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed0) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) +L(b0): mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp0) + +L(ed1): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f0): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea -64(up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -8(up), w3 C up[-1] + mov (up), u0 C up[0] + shrx( %rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" 
arg in C code + adcx( w3, w0) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + jmp L(b0) + + ALIGN(16) +L(tp1): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed1) +L(b1): mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp1) + +L(ed2): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f1): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea 8(un_save), un_save + lea -56(rp,un_save,8), rp + mov -16(up), w1 C up[-1] + mov -8(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + adcx( w1, w2) C FIXME: crossjump? + mulx( (up), w0, w1) + adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jmp L(b1) + + ALIGN(16) +L(tp2): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed2) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) +L(b2): adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp2) + +L(ed3): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f2): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + or R32(un_save), R32(n) + jz L(cor3) + lea -56(rp,un_save,8), rp + mov -24(up), w3 C up[-1] + mov -16(up), u0 C up[0] + shrx( %rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" arg in C code + adcx( w3, w0) + jmp L(b2) + + ALIGN(16) +L(tp3): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed3) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) +L(b3): mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp3) + +L(ed4): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f3): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -32(up), w1 C up[-1] + mov 
-24(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + adcx( w1, w2) + jmp L(b3) + + ALIGN(16) +L(tp4): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed4) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) +L(b4): mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp4) + +L(ed5): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f4): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -40(up), w3 C up[-1] + mov -32(up), u0 C up[0] + shrx( %rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" arg in C code + adcx( w3, w0) + jmp L(b4) + + ALIGN(16) +L(tp5): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed5) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) +L(b5): mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp5) + +L(ed6): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f5): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -48(up), w1 C up[-1] + mov -40(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + adcx( w1, w2) + jmp L(b5) + + ALIGN(16) +L(tp6): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed6) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up +L(b6): adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp6) + +L(ed7): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f6): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -56(up), w3 C up[-1] + mov -48(up), u0 C up[0] + shrx( 
%rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" arg in C code + adcx( w3, w0) + mulx( -40,(up), w2, w3) + jmp L(b6) + + ALIGN(16) +L(tp7): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed7) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) +L(b7): adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp7) + +L(cor3):lea -64(rp), rp + mov -24(up), w3 C up[-1] + mov -16(up), u0 C up[0] + shrx( %rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" arg in C code + adcx( w3, w0) + adox( 56,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 56(rp) + adcx( w1, w2) + mulx( (up), %rbx, w1) + adox( 64,(rp), w2) + adcx( w3, %rbx) + mov w2, 64(rp) + adox( 72,(rp), %rbx) + adox( %rcx, w1) C relies on rcx = 0 + adc %rcx, w1 C relies on rcx = 0 + mov w1, 80(rp) C FIXME +C wd2 + mov -16(up), w1 C up[-1] + mov -8(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + adcx( w1, w2) + mulx( (up), w0, %rax) + adox( %rbx, w2) + adcx( w3, w0) + mov w2, 72(rp) + adox( 80,(rp), w0) + adox( %rcx, %rax) C relies on rcx = 0 + mov w0, 80(rp) + adc %rcx, %rax C relies on rcx = 0 +C wd1 + mov -8(up), w3 C up[-1] + mov (up), u0 C up[0] + sar $63, w3 + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + adcx( w3, w0) + adox( %rax, w0) + mov w0, 88(rp) + adcx( %rcx, w1) + adox( %rcx, w1) + mov w1, 96(rp) + + pop %rbx + FUNC_EXIT() + ret + + JUMPTABSECT + ALIGN(8) +L(mtab):JMPENT( L(mf7), L(mtab)) + JMPENT( L(mf0), L(mtab)) + JMPENT( L(mf1), L(mtab)) + JMPENT( L(mf2), L(mtab)) + JMPENT( L(mf3), L(mtab)) + JMPENT( L(mf4), L(mtab)) + JMPENT( L(mf5), L(mtab)) + JMPENT( L(mf6), L(mtab)) +L(atab):JMPENT( L(f6), L(atab)) + JMPENT( L(f7), L(atab)) + JMPENT( L(f0), L(atab)) + JMPENT( L(f1), L(atab)) + JMPENT( L(f2), L(atab)) + JMPENT( L(f3), L(atab)) + JMPENT( L(f4), L(atab)) + JMPENT( L(f5), L(atab)) + TEXT +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm new file mode 100644 index 0000000..9d1c405 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm @@ -0,0 +1,241 @@ +dnl AMD64 mpn_addmul_2 optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bull n/a +C AMD pile n/a +C AMD steam n/a +C AMD excavator ? +C AMD bobcat n/a +C AMD jaguar n/a +C Intel P4 n/a +C Intel core n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL 2.15 +C Intel BWL 2.33 +C Intel SKL 2.22 +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param',`%rdx') +define(`vp', `%rcx') + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') +define(`X0', `%r12') +define(`X1', `%r13') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + + mov (vp), v0 + mov 8(vp), v1 + + mov n_param, n + shr $2, n + + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): mov (rp), X0 + mov 8(rp), X1 + test $2, R8(n_param) + jnz L(b10) + +L(b00): mov (up), %rdx + lea 16(up), up + mulx( v0, %rax, w1) + add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + mov X0, (rp) + add %rax, X1 + adc $0, w2 + mov -8(up), %rdx + lea 16(rp), rp + jmp L(lo0) + +L(b10): mov (up), %rdx + inc n + mulx( v0, %rax, w1) + add %rax, X0 + adc $0, w1 + mulx( v1, %rax, w2) + mov X0, (rp) + mov 16(rp), X0 + add %rax, X1 + adc $0, w2 + xor w0, w0 + jmp L(lo2) + +L(bx1): mov (rp), X1 + mov 8(rp), X0 + test $2, R8(n_param) + jnz L(b11) + +L(b01): mov (up), %rdx + mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + mov 8(up), %rdx + mov X1, (rp) + mov 16(rp), X1 + mulx( v0, %rax, w1) + lea 24(rp), rp + lea 24(up), up + jmp L(lo1) + +L(b11): mov (up), %rdx + inc n + mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + mov X1, (rp) + mov 8(up), %rdx + mulx( v0, %rax, w1) + lea 8(rp), rp + lea 8(up), up + jmp L(lo3) + + ALIGN(16) +L(top): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 + add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + lea 32(rp), rp + add w1, X1 + mov -16(up), %rdx + mov X1, -24(rp) + adc $0, w3 + add w2, X0 + mov -8(rp), X1 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo1): add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + add w3, X0 + mov X0, -16(rp) + adc $0, w1 + add %rax, X1 + adc $0, w2 + add w0, X1 + mov -8(up), %rdx + adc $0, w2 +L(lo0): mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mov (rp), X0 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + add w1, X1 + mov X1, -8(rp) + adc $0, w3 + mov (up), %rdx + add w2, X0 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo3): add %rax, X0 + adc $0, w1 + mulx( v1, %rax, w2) + add w3, X0 + mov 8(rp), X1 + mov X0, (rp) + mov 16(rp), X0 + adc $0, w1 + add %rax, X1 + adc $0, w2 +L(lo2): mov 8(up), %rdx + lea 32(up), up + dec n + jnz L(top) + +L(end): mulx( v0, 
%rax, w3) + add w0, X1 + adc $0, w2 + add %rax, X1 + adc $0, w3 + mulx( v1, %rdx, %rax) + add w1, X1 + mov X1, 8(rp) + adc $0, w3 + add w2, %rdx + adc $0, %rax + add w3, %rdx + mov %rdx, 16(rp) + adc $0, %rax + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm new file mode 100644 index 0000000..ff0d27b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +include_mpn(`x86_64/zen/aorrlsh_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm new file mode 100644 index 0000000..fc99627 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm @@ -0,0 +1,261 @@ +dnl AMD64 mpn_add_n, mpn_sub_n + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
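This file provides mpn_add_n and mpn_sub_n (plus the carry-in _nc entry points) via an 8-way unrolled loop dispatched through a jump table. As a hedged reference for the contract those entry points satisfy, here is a plain C sketch; the names and types are illustrative stand-ins, not GMP's generic code.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* {rp,n} = {up,n} + {vp,n} + cy_in; returns the carry out (0 or 1).
   The subtraction variant is identical with borrows in place of carries. */
static limb add_n_sketch(limb *rp, const limb *up, const limb *vp,
                         size_t n, limb cy_in)
{
    limb cy = cy_in;                 /* must be 0 or 1 */
    for (size_t i = 0; i < n; i++) {
        limb s  = up[i] + cy;
        limb c1 = s < cy;            /* wrap on adding the carry-in */
        limb r  = s + vp[i];
        limb c2 = r < vp[i];         /* wrap on adding vp[i] */
        rp[i] = r;
        cy = c1 | c2;                /* at most one of the two adds wraps */
    }
    return cy;
}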
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 1.5 with fluctuations +C AMD bd2 1.5 with fluctuations +C AMD bd3 +C AMD bd4 1.6 +C AMD zen +C AMD bt1 +C AMD bt2 +C Intel P4 +C Intel PNR +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL 1.21 +C Intel BWL 1.04 +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C INPUT PARAMETERS +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + + mov R32(n), R32(%rax) + shr $3, n + and $7, R32(%rax) + + lea L(tab)(%rip), %r9 + neg %r8 C set carry +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + lea (%r9,%rax), %rax C lea not add to preserve carry + jmp *%rax +',` + jmp *(%r9,%rax,8) +') +EPILOGUE() + + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + + mov R32(n), R32(%rax) + shr $3, n + and $7, R32(%rax) C clear cy as side-effect + + lea L(tab)(%rip), %r9 +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + lea (%r9,%rax), %rax C lea not add to preserve carry + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + +L(0): mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + jmp L(e0) + +L(4): mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + lea -32(up), up + lea -32(vp), vp + lea -32(rp), rp + inc n + jmp L(e4) + +L(5): mov (up), %r11 + mov 8(up), %r8 + mov 16(up), %r9 + ADCSBB (vp), %r11 + lea -24(up), up + lea -24(vp), vp + lea -24(rp), rp + inc n + jmp L(e5) + +L(6): mov (up), %r10 + ADCSBB (vp), %r10 + mov 8(up), %r11 + lea -16(up), up + lea -16(vp), vp + lea -16(rp), rp + inc n + jmp L(e6) + +L(7): mov (up), %r9 + mov 8(up), %r10 + ADCSBB (vp), %r9 + ADCSBB 8(vp), %r10 + lea -8(up), up + lea -8(vp), vp + lea -8(rp), rp + inc n + jmp L(e7) + + ALIGN(16) +L(top): +L(e3): mov %r9, 40(rp) +L(e2): mov %r10, 48(rp) +L(e1): mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + mov %r11, 56(rp) + lea 64(rp), rp +L(e0): mov 16(up), %r10 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + mov %r8, (rp) +L(e7): mov 24(up), %r11 + mov %r9, 8(rp) +L(e6): mov 32(up), %r8 + mov 40(up), %r9 + ADCSBB 24(vp), %r11 + mov %r10, 16(rp) +L(e5): ADCSBB 32(vp), %r8 + mov %r11, 24(rp) +L(e4): mov 48(up), %r10 + mov 56(up), %r11 + mov %r8, 32(rp) + lea 64(up), up + ADCSBB 40(vp), %r9 + ADCSBB 48(vp), %r10 + ADCSBB 56(vp), %r11 + lea 64(vp), vp + dec n + jnz L(top) + +L(end): mov %r9, 40(rp) + mov %r10, 48(rp) + mov %r11, 56(rp) + mov R32(n), R32(%rax) + adc R32(n), R32(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(3): mov (up), %r9 + mov 8(up), %r10 + mov 16(up), %r11 + ADCSBB (vp), %r9 + ADCSBB 8(vp), %r10 + ADCSBB 16(vp), %r11 + jrcxz L(x3) + lea 24(up), up + lea 24(vp), vp + lea -40(rp), rp + jmp L(e3) +L(x3): mov %r9, (rp) + mov %r10, 8(rp) + mov %r11, 16(rp) + mov R32(n), R32(%rax) + adc R32(n), R32(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(1): mov (up), %r11 + ADCSBB (vp), %r11 + jrcxz L(x1) + lea 8(up), up + lea 8(vp), vp + lea -56(rp), rp + jmp L(e1) +L(x1): mov %r11, (rp) + mov R32(n), R32(%rax) 
+ adc R32(n), R32(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(2): mov (up), %r10 + mov 8(up), %r11 + ADCSBB (vp), %r10 + ADCSBB 8(vp), %r11 + jrcxz L(x2) + lea 16(up), up + lea 16(vp), vp + lea -48(rp), rp + jmp L(e2) +L(x2): mov %r10, (rp) + mov %r11, 8(rp) + mov R32(n), R32(%rax) + adc R32(n), R32(%rax) + FUNC_EXIT() + ret +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(4), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm new file mode 100644 index 0000000..3f43afa --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm @@ -0,0 +1,201 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bull - +C AMD pile - +C AMD steam - +C AMD excavator - +C AMD bobcat - +C AMD jaguar - +C Intel P4 - +C Intel core2 - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL 2.32 +C Intel BWL 2.04 +C Intel SKL 1.95 +C Intel atom - +C Intel SLM - +C VIA nano - + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Handle small n separately, for lower overhead. 
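As a hedged reminder of the contract behind these two entry points (the sketch is illustrative C, not GMP's generic code): mpn_addmul_1 adds {up,n} times a single limb v0 into {rp,n} and returns the high limb that does not fit, while mpn_submul_1 subtracts the same product and returns the borrow limb.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* {rp,n} += {up,n} * v0; returns the carry-out ("high") limb. */
static limb addmul_1_sketch(limb *rp, const limb *up, size_t n, limb v0)
{
    limb cy = 0;
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
        rp[i] = (limb) t;
        cy = (limb) (t >> 64);
    }
    return cy;
}

/* {rp,n} -= {up,n} * v0; returns the borrow-out limb. */
static limb submul_1_sketch(limb *rp, const limb *up, size_t n, limb v0)
{
    limb bw = 0;
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128) up[i] * v0;
        limb lo = (limb) p, hi = (limb) (p >> 64);
        limb t  = rp[i] - lo;
        limb b1 = t > rp[i];         /* borrow from subtracting lo */
        limb r  = t - bw;
        limb b2 = r > t;             /* borrow from subtracting the carried borrow */
        rp[i] = r;
        bw = hi + b1 + b2;           /* total borrow fits in one limb */
    }
    return bw;
}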
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rbp') +define(`v0', `%rdx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`ADCSBB', `adc') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`ADCSBB', `sbb') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + + mov n_param, n + mov v0_param, v0 + + test $1, R8(n) + jnz L(bx1) + +L(bx0): shr $2, n + jc L(b10) + +L(b00): mulx( (up), %r13, %r12) + mulx( 8,(up), %rbx, %rax) + add %r12, %rbx + adc $0, %rax + mov (rp), %r12 + mov 8(rp), %rcx + mulx( 16,(up), %r9, %r8) + lea -16(rp), rp + lea 16(up), up + ADDSUB %r13, %r12 + jmp L(lo0) + +L(bx1): shr $2, n + jc L(b11) + +L(b01): mulx( (up), %r11, %r10) + jnz L(gt1) +L(n1): ADDSUB %r11, (rp) + mov $0, R32(%rax) + adc %r10, %rax + jmp L(ret) + +L(gt1): mulx( 8,(up), %r13, %r12) + mulx( 16,(up), %rbx, %rax) + lea 24(up), up + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov (rp), %r10 + mov 8(rp), %r12 + mov 16(rp), %rcx + lea -8(rp), rp + ADDSUB %r11, %r10 + jmp L(lo1) + +L(b11): mulx( (up), %rbx, %rax) + mov (rp), %rcx + mulx( 8,(up), %r9, %r8) + lea 8(up), up + lea -24(rp), rp + inc n C adjust n + ADDSUB %rbx, %rcx + jmp L(lo3) + +L(b10): mulx( (up), %r9, %r8) + mulx( 8,(up), %r11, %r10) + lea -32(rp), rp + mov $0, R32(%rax) + clc C clear cf + jz L(end) C depends on old shift + + ALIGN(16) +L(top): adc %rax, %r9 + lea 32(rp), rp + adc %r8, %r11 + mulx( 16,(up), %r13, %r12) + mov (rp), %r8 + mulx( 24,(up), %rbx, %rax) + lea 32(up), up + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov 8(rp), %r10 + mov 16(rp), %r12 + ADDSUB %r9, %r8 + mov 24(rp), %rcx + mov %r8, (rp) + ADCSBB %r11, %r10 +L(lo1): mulx( (up), %r9, %r8) + mov %r10, 8(rp) + ADCSBB %r13, %r12 +L(lo0): mov %r12, 16(rp) + ADCSBB %rbx, %rcx +L(lo3): mulx( 8,(up), %r11, %r10) + mov %rcx, 24(rp) + dec n + jnz L(top) + +L(end): adc %rax, %r9 + adc %r8, %r11 + mov 32(rp), %r8 + mov %r10, %rax + adc $0, %rax + mov 40(rp), %r10 + ADDSUB %r9, %r8 + mov %r8, 32(rp) + ADCSBB %r11, %r10 + mov %r10, 40(rp) + adc $0, %rax + +L(ret): pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm new file mode 100644 index 0000000..b5863b6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm @@ -0,0 +1,138 @@ +dnl AMD64 mpn_gcd_22. Assumes useless bsf, useless shrd, useful tzcnt, shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 6.7 +C AMD bt1 - +C AMD bt2 - +C AMD zn1 5.4 +C AMD zn2 5.5 +C Intel P4 - +C Intel CNR - +C Intel PNR - +C Intel NHM - +C Intel WSM - +C Intel SBR - +C Intel IBR - +C Intel HWL 7.1 +C Intel BWL 5.5 +C Intel SKL 5.6 +C Intel atom - +C Intel SLM - +C Intel GLM - +C Intel GLM+ - +C VIA nano - + + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') +define(`cnt', `%rax') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 + + rep;bsf t0, cnt C tzcnt! + + mov u0, s0 + sub v0, u0 + mov u1, s1 + sbb v1, u1 + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + xor R32(t0), R32(t0) + sub cnt, t0 + shlx( t0, u1, s1) + shrx( cnt, u0, u0) + shrx( cnt, u1, u1) + or s1, u0 + + test v1, v1 + jnz L(top) + test u1, u1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 + mov u0, s0 + mov u1, s1 + rep;bsf t0, cnt C tzcnt! + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): mov v0, %rax + C mov v1, %rdx +L(ret): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h new file mode 100644 index 0000000..c11aeec --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h @@ -0,0 +1,253 @@ +/* Haswell gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3600-4000 MHz Intel Xeon E3-1271v3 Haswell */ +/* FFT tuning limit = 467,964,359 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 26 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 9 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 25 + +#define DIV_1_VS_MUL_1_PERCENT 427 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 74 +#define MUL_TOOM44_THRESHOLD 195 +#define MUL_TOOM6H_THRESHOLD 276 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 120 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 139 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 128 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 170 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 315 +#define SQR_TOOM6_THRESHOLD 414 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 42 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 376, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 79,11}, { 47,10}, { 95,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 135,11}, \ + { 79,10}, { 167,11}, { 95,10}, { 191, 9}, \ + { 383,11}, { 111,12}, { 63, 8}, { 1023,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,10}, { 735,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,11}, { 447,10}, \ + { 895,11}, { 479,13}, { 127,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,12}, { 351,11}, { 735,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 447,11}, { 895,12}, { 479,11}, \ + { 959,14}, { 127,12}, { 543,11}, { 1087,12}, \ + { 607,11}, { 1215,10}, { 2431,12}, { 671,11}, \ + { 1343,12}, { 703,11}, { 1407,12}, { 735,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,13}, \ + { 511,12}, { 1087,11}, { 2175,13}, { 575,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1727,13}, { 959,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1727,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 
2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4351,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4863,16}, \ + { 2559,15}, { 5887,14}, { 11775,16}, { 3071,15}, \ + { 6911,16}, { 3583,15}, { 7679,14}, { 15359,15}, \ + { 7935,17}, { 2047,16}, { 4095,15}, { 8447,16}, \ + { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 238 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 368 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 368, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \ + { 319,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303, 9}, \ + { 607,11}, { 159,10}, { 319, 6}, { 5631, 7}, \ + { 2943, 6}, { 5887, 8}, { 1535,11}, { 207,10}, \ + { 415,11}, { 223,10}, { 447,11}, { 239,10}, \ + { 479,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,11}, { 319,10}, { 639,11}, \ + { 335,10}, { 671,11}, { 351,10}, { 703,11}, \ + { 367,10}, { 735,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,11}, { 447,10}, { 895,11}, \ + { 479,13}, { 127,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,12}, { 287,11}, { 607,10}, \ + { 1215,11}, { 671,12}, { 351,11}, { 735,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,12}, { 479,11}, { 959,14}, \ + { 127,12}, { 511,11}, { 1023,12}, { 543,11}, \ + { 1087,12}, { 607,11}, { 1215,12}, { 735,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,13}, \ + { 511,12}, { 1087,13}, { 575,12}, { 1151,13}, \ + { 639,12}, { 1279,13}, { 703,12}, { 1407,11}, \ + { 2815,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1727,11}, { 3455,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \ + { 2687,13}, { 1407,12}, { 2815,13}, { 1471,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,14}, \ + { 895,13}, { 1791,12}, { 3583,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4351,15}, { 2303,14}, { 4863,15}, { 
2815,14}, \ + { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \ + { 2047,16}, { 4095,15}, { 8191,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 237 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 68 +#define MULLO_MUL_N_THRESHOLD 8967 +#define SQRLO_BASECASE_THRESHOLD 11 +#define SQRLO_DC_THRESHOLD 80 +#define SQRLO_SQR_THRESHOLD 6481 + +#define DC_DIV_QR_THRESHOLD 58 +#define DC_DIVAPPR_Q_THRESHOLD 182 +#define DC_BDIV_QR_THRESHOLD 60 +#define DC_BDIV_Q_THRESHOLD 123 + +#define INV_MULMOD_BNM1_THRESHOLD 38 +#define INV_NEWTON_THRESHOLD 179 +#define INV_APPR_THRESHOLD 182 + +#define BINV_NEWTON_THRESHOLD 230 +#define REDC_1_TO_REDC_2_THRESHOLD 48 +#define REDC_2_TO_REDC_N_THRESHOLD 63 + +#define MU_DIV_QR_THRESHOLD 1470 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 82 +#define MU_BDIV_QR_THRESHOLD 1334 +#define MU_BDIV_Q_THRESHOLD 1506 + +#define POWM_SEC_TABLE 1,22,194,473,1297,2698 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 1391 +#define SET_STR_PRECOMPUTE_THRESHOLD 2654 + +#define FAC_DSC_THRESHOLD 562 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD2_DIV1_METHOD 5 /* 3.49% faster than 3 */ +#define HGCD_THRESHOLD 96 +#define HGCD_APPR_THRESHOLD 92 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 501 +#define GCDEXT_DC_THRESHOLD 365 +#define JACOBI_BASE_METHOD 1 /* 23.87% faster than 4 */ + +/* Tuneup completed successfully, took 238360 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm new file mode 100644 index 0000000..5e649e8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm @@ -0,0 +1,159 @@ +dnl AMD64 mpn_mul_1 using mulx optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
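+dnl
+dnl  In functional terms, mpn_mul_1 computes {rp,n} = {up,n} * v0 and returns
+dnl  the most significant limb of the (n+1)-limb product.  A minimal portable
+dnl  sketch of the same operation, using GMP's umul_ppmm double-limb multiply
+dnl  macro (cy, hi, lo being local limb variables):
+dnl
+dnl	cy = 0;
+dnl	for (i = 0; i < n; i++)
+dnl	  {
+dnl	    umul_ppmm (hi, lo, up[i], v0);
+dnl	    lo += cy;
+dnl	    cy = hi + (lo < cy);
+dnl	    rp[i] = lo;
+dnl	  }
+dnl	return cy;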
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bull - +C AMD pile - +C AMD steam - +C AMD excavator - +C AMD bobcat - +C AMD jaguar - +C Intel P4 - +C Intel core2 - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL 1.59 +C Intel BWL 1.76 +C Intel SKL 1.54 +C Intel atom - +C Intel SLM - +C VIA nano - + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rbp') +define(`v0', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_1) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + + mov n_param, n + shr $2, n + + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): test $2, R8(n_param) + mov v0_param, v0 + jnz L(b10) + +L(b00): mulx( (up), %r9, %r8) + mulx( 8,(up), %r11, %r10) + mulx( 16,(up), %rcx, %r12) + lea -32(rp), rp + jmp L(lo0) + +L(b10): mulx( (up), %rcx, %r12) + mulx( 8,(up), %rbx, %rax) + lea -16(rp), rp + test n, n + jz L(cj2) + mulx( 16,(up), %r9, %r8) + lea 16(up), up + jmp L(lo2) + +L(bx1): test $2, R8(n_param) + mov v0_param, v0 + jnz L(b11) + +L(b01): mulx( (up), %rbx, %rax) + lea -24(rp), rp + test n, n + jz L(cj1) + mulx( 8,(up), %r9, %r8) + lea 8(up), up + jmp L(lo1) + +L(b11): mulx( (up), %r11, %r10) + mulx( 8,(up), %rcx, %r12) + mulx( 16,(up), %rbx, %rax) + lea -8(rp), rp + test n, n + jz L(cj3) + lea 24(up), up + jmp L(lo3) + + ALIGN(32) +L(top): lea 32(rp), rp + mov %r9, (rp) + adc %r8, %r11 +L(lo3): mulx( (up), %r9, %r8) + mov %r11, 8(rp) + adc %r10, %rcx +L(lo2): mov %rcx, 16(rp) + adc %r12, %rbx +L(lo1): mulx( 8,(up), %r11, %r10) + adc %rax, %r9 + mulx( 16,(up), %rcx, %r12) + mov %rbx, 24(rp) +L(lo0): mulx( 24,(up), %rbx, %rax) + lea 32(up), up + dec n + jnz L(top) + +L(end): lea 32(rp), rp + mov %r9, (rp) + adc %r8, %r11 +L(cj3): mov %r11, 8(rp) + adc %r10, %rcx +L(cj2): mov %rcx, 16(rp) + adc %r12, %rbx +L(cj1): mov %rbx, 24(rp) + adc $0, %rax + + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm new file mode 100644 index 0000000..f1f044f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm @@ -0,0 +1,176 @@ +dnl AMD64 mpn_mul_2 optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bull - +C AMD pile - +C AMD steam - +C AMD excavator - +C AMD bobcat - +C AMD jaguar - +C Intel P4 - +C Intel core - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL 3.74 +C Intel BWL 4.21 +C Intel SKL 4.20 +C Intel atom - +C Intel SLM - +C VIA nano - + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Move test and jcc together, for insn fusion. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param',`%rdx') +define(`vp', `%rcx') + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (vp), v0 + mov 8(vp), v1 + + lea 3(n_param), n + shr $2, n + + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): xor w0, w0 + test $2, R8(n_param) + mov (up), %rdx + mulx( v0, w2, w1) + jz L(lo0) + +L(b10): lea -16(rp), rp + lea -16(up), up + jmp L(lo2) + +L(bx1): xor w2, w2 + test $2, R8(n_param) + mov (up), %rdx + mulx( v0, w0, w3) + jnz L(b11) + +L(b01): lea -24(rp), rp + lea 8(up), up + jmp L(lo1) + +L(b11): lea -8(rp), rp + lea -8(up), up + jmp L(lo3) + + ALIGN(16) +L(top): mulx( v1, %rax, w0) + add %rax, w2 C 0 + mov (up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 C 1 + add %rax, w2 C 0 + adc $0, w1 C 1 + add w3, w2 C 0 +L(lo0): mov w2, (rp) C 0 + adc $0, w1 C 1 + mulx( v1, %rax, w2) + add %rax, w0 C 1 + mov 8(up), %rdx + adc $0, w2 C 2 + mulx( v0, %rax, w3) + add %rax, w0 C 1 + adc $0, w3 C 2 + add w1, w0 C 1 +L(lo3): mov w0, 8(rp) C 1 + adc $0, w3 C 2 + mulx( v1, %rax, w0) + add %rax, w2 C 2 + mov 16(up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 C 3 + add %rax, w2 C 2 + adc $0, w1 C 3 + add w3, w2 C 2 +L(lo2): mov w2, 16(rp) C 2 + adc $0, w1 C 3 + mulx( v1, %rax, w2) + add %rax, w0 C 3 + mov 24(up), %rdx + adc $0, w2 C 4 + mulx( v0, %rax, w3) + add %rax, w0 C 3 + adc $0, w3 C 4 + add w1, w0 C 3 + lea 32(up), up +L(lo1): mov w0, 24(rp) C 3 + adc $0, w3 C 4 + dec n + lea 32(rp), rp + jnz L(top) + +L(end): mulx( v1, %rdx, %rax) + add %rdx, w2 + adc $0, %rax + add w3, w2 + mov w2, (rp) + adc $0, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm new file mode 100644 index 0000000..b2656c8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm @@ -0,0 +1,441 @@ +dnl AMD64 mpn_mul_basecase optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 mul_2 mul_3 addmul_2 +C AMD K8,K9 n/a n/a - n/a +C AMD K10 n/a n/a - n/a +C AMD bull n/a n/a - n/a +C AMD pile n/a n/a - n/a +C AMD steam ? ? - ? +C AMD bobcat n/a n/a - n/a +C AMD jaguar ? ? - ? +C Intel P4 n/a n/a - n/a +C Intel core n/a n/a - n/a +C Intel NHM n/a n/a - n/a +C Intel SBR n/a n/a - n/a +C Intel IBR n/a n/a - n/a +C Intel HWL 1.77 1.86 - 2.15 +C Intel BWL ? ? - ? +C Intel atom n/a n/a - n/a +C VIA nano n/a n/a - n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Adjoin a mul_3. +C * Further micro-optimise. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`un', `%rbx') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`n', `%rbp') +define(`v0', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + mov un_param, un C free up rdx + neg un + + mov un_param, n C FIXME: share + sar $2, n C FIXME: share + + test $1, R8(vn) + jz L(do_mul_2) + +define(`w4', `%r9') +define(`w5', `%r14') + + mov (vp), %rdx + +L(do_mul_1): + test $1, R8(un) + jnz L(m1x1) + +L(m1x0):test $2, R8(un) + jnz L(m110) + +L(m100): + mulx( (up), w5, w2) + mulx( 8,(up), w1, w3) + lea -24(rp), rp + jmp L(m1l0) + +L(m110): + mulx( (up), w3, w4) + mulx( 8,(up), w1, w5) + lea -8(rp), rp + test n, n + jz L(cj2) + mulx( 16,(up), w0, w2) + lea 16(up), up + jmp L(m1l2) + +L(m1x1):test $2, R8(un) + jz L(m111) + +L(m101): + mulx( (up), w4, w5) + lea -16(rp), rp + test n, n + jz L(cj1) + mulx( 8,(up), w0, w2) + lea 8(up), up + jmp L(m1l1) + +L(m111): + mulx( (up), w2, w3) + mulx( 8,(up), w0, w4) + mulx( 16,(up), w1, w5) + lea 24(up), up + test n, n + jnz L(gt3) + add w0, w3 + jmp L(cj3) +L(gt3): add w0, w3 + jmp L(m1l3) + + ALIGN(32) +L(m1tp):lea 32(rp), rp +L(m1l3):mov w2, (rp) + mulx( (up), w0, w2) +L(m1l2):mov w3, 8(rp) + adc w1, w4 +L(m1l1):adc w0, w5 + mov w4, 16(rp) + mulx( 8,(up), w1, w3) +L(m1l0):mov w5, 24(rp) + mulx( 16,(up), w0, w4) + adc w1, w2 + mulx( 24,(up), w1, w5) + adc w0, w3 + lea 32(up), up + dec n + jnz L(m1tp) + +L(m1ed):lea 32(rp), rp +L(cj3): mov w2, (rp) +L(cj2): mov w3, 8(rp) + adc w1, w4 +L(cj1): mov w4, 16(rp) + adc $0, w5 + mov w5, 24(rp) + + dec R32(vn) + jz L(ret5) + + lea 8(vp), vp + lea 32(rp), rp +C push %r12 +C push %r13 +C push %r14 + jmp L(do_addmul) + +L(do_mul_2): +define(`v1', `%r14') +C push %r12 +C push %r13 +C push %r14 + + mov (vp), v0 + mov 8(vp), v1 + + lea (un), n + sar $2, n + + test $1, R8(un) + jnz L(m2x1) + +L(m2x0):xor w0, w0 + test $2, 
R8(un) + mov (up), %rdx + mulx( v0, w2, w1) + jz L(m2l0) + +L(m210):lea -16(rp), rp + lea -16(up), up + jmp L(m2l2) + +L(m2x1):xor w2, w2 + test $2, R8(un) + mov (up), %rdx + mulx( v0, w0, w3) + jz L(m211) + +L(m201):lea -24(rp), rp + lea 8(up), up + jmp L(m2l1) + +L(m211):lea -8(rp), rp + lea -8(up), up + jmp L(m2l3) + + ALIGN(16) +L(m2tp):mulx( v1, %rax, w0) + add %rax, w2 + mov (up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 + add %rax, w2 + adc $0, w1 + add w3, w2 +L(m2l0):mov w2, (rp) + adc $0, w1 + mulx( v1, %rax, w2) + add %rax, w0 + mov 8(up), %rdx + adc $0, w2 + mulx( v0, %rax, w3) + add %rax, w0 + adc $0, w3 + add w1, w0 +L(m2l3):mov w0, 8(rp) + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, w2 + mov 16(up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 + add %rax, w2 + adc $0, w1 + add w3, w2 +L(m2l2):mov w2, 16(rp) + adc $0, w1 + mulx( v1, %rax, w2) + add %rax, w0 + mov 24(up), %rdx + adc $0, w2 + mulx( v0, %rax, w3) + add %rax, w0 + adc $0, w3 + add w1, w0 + lea 32(up), up +L(m2l1):mov w0, 24(rp) + adc $0, w3 + inc n + lea 32(rp), rp + jnz L(m2tp) + +L(m2ed):mulx( v1, %rdx, %rax) + add %rdx, w2 + adc $0, %rax + add w3, w2 + mov w2, (rp) + adc $0, %rax + mov %rax, 8(rp) + + add $-2, R32(vn) + jz L(ret5) + lea 16(vp), vp + lea 16(rp), rp + + +L(do_addmul): + push %r15 + push vn C save vn in new stack slot +define(`vn', `(%rsp)') +define(`X0', `%r14') +define(`X1', `%r15') +define(`v1', `%r8') + + lea (rp,un,8), rp + lea (up,un,8), up + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + + lea 2(un), n + sar $2, n + + mov (up), %rdx + test $1, R8(un) + jnz L(bx1) + +L(bx0): mov (rp), X0 + mov 8(rp), X1 + mulx( v0, %rax, w1) + add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + mov X0, (rp) + add %rax, X1 + adc $0, w2 + mov 8(up), %rdx + test $2, R8(un) + jnz L(b10) + +L(b00): lea 16(up), up + lea 16(rp), rp + jmp L(lo0) + +L(b10): mov 16(rp), X0 + lea 32(up), up + mulx( v0, %rax, w3) + jmp L(lo2) + +L(bx1): mov (rp), X1 + mov 8(rp), X0 + mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + mov 8(up), %rdx + mov X1, (rp) + mulx( v0, %rax, w1) + test $2, R8(un) + jz L(b11) + +L(b01): mov 16(rp), X1 + lea 24(rp), rp + lea 24(up), up + jmp L(lo1) + +L(b11): lea 8(rp), rp + lea 8(up), up + jmp L(lo3) + + ALIGN(16) +L(top): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 +L(lo2): add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + lea 32(rp), rp + add w1, X1 + mov -16(up), %rdx + mov X1, -24(rp) + adc $0, w3 + add w2, X0 + mov -8(rp), X1 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo1): add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + add w3, X0 + mov X0, -16(rp) + adc $0, w1 + add %rax, X1 + adc $0, w2 + add w0, X1 + mov -8(up), %rdx + adc $0, w2 +L(lo0): mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mov (rp), X0 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + add w1, X1 + mov X1, -8(rp) + adc $0, w3 + mov (up), %rdx + add w2, X0 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo3): add %rax, X0 + adc $0, w1 + mulx( v1, %rax, w2) + add w3, X0 + mov 8(rp), X1 + mov X0, (rp) + mov 16(rp), X0 + adc $0, w1 + add %rax, X1 + adc $0, w2 + mov 8(up), %rdx + lea 32(up), up + inc n + jnz L(top) + +L(end): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 + add %rax, X1 + adc $0, w3 + mulx( v1, %rdx, %rax) + add w1, X1 + mov X1, 8(rp) + adc $0, w3 + add w2, %rdx + adc $0, %rax + add w3, %rdx + mov %rdx, 16(rp) + adc $0, %rax + mov %rax, 24(rp) + + addl $-2, vn + lea 16(vp), vp + lea -16(up,un,8), up + lea 32(rp,un,8), rp + jnz L(outer) + + pop %rax C 
deallocate vn slot + pop %r15 +L(ret5):pop %r14 +L(ret4):pop %r13 +L(ret3):pop %r12 +L(ret2):pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm new file mode 100644 index 0000000..e65559b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm @@ -0,0 +1,422 @@ +dnl AMD64 mpn_mullo_basecase optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 +C AMD K8,K9 n/a n/a +C AMD K10 n/a n/a +C AMD bull n/a n/a +C AMD pile n/a n/a +C AMD steam ? ? +C AMD bobcat n/a n/a +C AMD jaguar ? ? +C Intel P4 n/a n/a +C Intel core n/a n/a +C Intel NHM n/a n/a +C Intel SBR n/a n/a +C Intel IBR n/a n/a +C Intel HWL 1.86 2.15 +C Intel BWL ? ? +C Intel atom n/a n/a +C VIA nano n/a n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Implement proper cor2, replacing current cor0. +C * Micro-optimise. 
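+
+C In functional terms, mpn_mullo_basecase computes the low n limbs of the
+C 2n-limb product {up,n} * {vp,n}.  A plain (but slower) equivalent would be
+C to form the full product, e.g. with mpn_mul_n into 2n limbs of scratch, and
+C keep only the low half.  The code below avoids computing the high half,
+C using a mul_2 feed-in, an addmul_2 main loop, and corner code for the last
+C one or two limbs.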
+ +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r8') +define(`X0', `%r14') +define(`X1', `%r15') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`i', `%rbp') +define(`v0', `%r9') +define(`v1', `%rbx') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + + mov vp_param, vp + mov (up), %rdx + + cmp $4, n + jb L(small) + + push %rbx + push %rbp + push %r12 + push %r13 + + mov (vp), v0 + mov 8(vp), v1 + + lea 2(n), i + shr $2, i + neg n + add $2, n + + push up C put entry `up' on stack + + test $1, R8(n) + jnz L(m2x1) + +L(m2x0):mulx( v0, w0, w3) + xor R32(w2), R32(w2) + test $2, R8(n) + jz L(m2b2) + +L(m2b0):lea -8(rp), rp + lea -8(up), up + jmp L(m2e0) + +L(m2b2):lea -24(rp), rp + lea 8(up), up + jmp L(m2e2) + +L(m2x1):mulx( v0, w2, w1) + xor R32(w0), R32(w0) + test $2, R8(n) + jnz L(m2b3) + +L(m2b1):jmp L(m2e1) + +L(m2b3):lea -16(rp), rp + lea -16(up), up + jmp L(m2e3) + + ALIGN(16) +L(m2tp):mulx( v1, %rax, w0) + add %rax, w2 + mov (up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 + add %rax, w2 + adc $0, w1 + add w3, w2 +L(m2e1):mov w2, (rp) + adc $0, w1 + mulx( v1, %rax, w2) + add %rax, w0 + mov 8(up), %rdx + adc $0, w2 + mulx( v0, %rax, w3) + add %rax, w0 + adc $0, w3 + add w1, w0 +L(m2e0):mov w0, 8(rp) + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, w2 + mov 16(up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 + add %rax, w2 + adc $0, w1 + add w3, w2 +L(m2e3):mov w2, 16(rp) + adc $0, w1 + mulx( v1, %rax, w2) + add %rax, w0 + mov 24(up), %rdx + adc $0, w2 + mulx( v0, %rax, w3) + add %rax, w0 + adc $0, w3 + add w1, w0 + lea 32(up), up +L(m2e2):mov w0, 24(rp) + adc $0, w3 + dec i + lea 32(rp), rp + jnz L(m2tp) + +L(m2ed):mulx( v1, %rax, w0) + add %rax, w2 + mov (up), %rdx + mulx( v0, %rax, w1) + add w2, %rax + add w3, %rax + mov %rax, (rp) + + mov (%rsp), up C restore `up' to beginning + lea 16(vp), vp + lea 8(rp,n,8), rp C put back rp to old rp + 2 + add $2, n + jge L(cor1) + + push %r14 + push %r15 + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + + lea (n), i + sar $2, i + + mov (up), %rdx + test $1, R8(n) + jnz L(bx1) + +L(bx0): mov (rp), X1 + mov 8(rp), X0 + mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + mov 8(up), %rdx + mov X1, (rp) + mulx( v0, %rax, w1) + test $2, R8(n) + jz L(b2) + +L(b0): lea 8(rp), rp + lea 8(up), up + jmp L(lo0) + +L(b2): mov 16(rp), X1 + lea 24(rp), rp + lea 24(up), up + jmp L(lo2) + +L(bx1): mov (rp), X0 + mov 8(rp), X1 + mulx( v0, %rax, w1) + add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + mov X0, (rp) + add %rax, X1 + adc $0, w2 + mov 8(up), %rdx + test $2, R8(n) + jnz L(b3) + +L(b1): lea 16(up), up + lea 16(rp), rp + jmp L(lo1) + +L(b3): mov 16(rp), X0 + lea 32(up), up + mulx( v0, %rax, w3) + inc i + jz L(cj3) + jmp L(lo3) + + ALIGN(16) +L(top): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 +L(lo3): add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + lea 32(rp), rp + add w1, X1 + mov -16(up), %rdx + mov X1, -24(rp) + adc $0, w3 + add w2, X0 + mov -8(rp), X1 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo2): add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + add w3, X0 + mov X0, -16(rp) + adc $0, w1 + add %rax, X1 + adc $0, w2 + add w0, X1 + mov -8(up), %rdx + adc $0, w2 +L(lo1): mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mov (rp), X0 + mulx( v1, 
%rax, w0) + add %rax, X0 + adc $0, w0 + add w1, X1 + mov X1, -8(rp) + adc $0, w3 + mov (up), %rdx + add w2, X0 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo0): add %rax, X0 + adc $0, w1 + mulx( v1, %rax, w2) + add w3, X0 + mov 8(rp), X1 + mov X0, (rp) + mov 16(rp), X0 + adc $0, w1 + add %rax, X1 + adc $0, w2 + mov 8(up), %rdx + lea 32(up), up + inc i + jnz L(top) + +L(end): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 +L(cj3): add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + add w1, X1 + mov -16(up), %rdx + mov X1, 8(rp) + adc $0, w3 + add w2, X0 + mulx( v0, %rax, w1) + add X0, %rax + add w3, %rax + mov %rax, 16(rp) + + mov 16(%rsp), up C restore `up' to beginning + lea 16(vp), vp + lea 24(rp,n,8), rp C put back rp to old rp + 2 + add $2, n + jl L(outer) + + pop %r15 + pop %r14 + + jnz L(cor0) + +L(cor1):mov (vp), v0 + mov 8(vp), v1 + mov (up), %rdx + mulx( v0, %r12, %rbp) C u0 x v2 + add (rp), %r12 C FIXME: rp[0] still available in reg? + adc %rax, %rbp + mov 8(up), %r10 + imul v0, %r10 + imul v1, %rdx + mov %r12, (rp) + add %r10, %rdx + add %rbp, %rdx + mov %rdx, 8(rp) + pop %rax C deallocate `up' copy + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(cor0):mov (vp), %r11 + imul (up), %r11 + add %rax, %r11 + mov %r11, (rp) + pop %rax C deallocate `up' copy + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + ALIGN(16) +L(small): + cmp $2, n + jae L(gt1) +L(n1): imul (vp), %rdx + mov %rdx, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp), %r9 + mulx( %r9, %rax, %rdx) + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp), %r9 + mulx( %r9, %rax, %r10) C u0 x v0 + mov %rax, (rp) + mov 8(up), %rdx + mulx( %r9, %rax, %rdx) C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r11 + mov (up), %rdx + mulx( %r11, %rax, %rdx) C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r11 C u1 x v1 + add %r11, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm new file mode 100644 index 0000000..b1d6c0a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm @@ -0,0 +1,437 @@ +dnl AMD64 mpn_redc_1 optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bull n/a +C AMD pile n/a +C AMD steam ? +C AMD bobcat n/a +C AMD jaguar ? +C Intel P4 n/a +C Intel core n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL 2.32 +C Intel BWL ? +C Intel atom n/a +C VIA nano n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Micro-optimise. +C * Consider inlining mpn_add_n. Tests indicate that this saves just 1-2 +C cycles, though. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv_param', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%rdi') +define(`u0inv', `(%rsp)') C stack + +ABI_SUPPORT(DOS64) C FIXME: needs verification +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + push rp + mov mp_param, mp C note that rp and mp shares register + mov (up), %rdx + + neg n + push %r8 C put u0inv on stack + imul u0inv_param, %rdx C first iteration q0 + mov n, j C outer loop induction var + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jz L(o0b) + + cmp $-2, R32(n) + jnz L(o2) + +C Special code for n = 2 since general code cannot handle it + mov 8(%rsp), %rbx C rp + lea 16(%rsp), %rsp C deallocate two slots + mulx( (mp), %r9, %r12) + mulx( 8,(mp), %r11, %r10) + add %r12, %r11 + adc $0, %r10 + add (up), %r9 C = 0 + adc 8(up), %r11 C r11 = up[1] + adc $0, %r10 C -> up[0] + mov %r11, %rdx + imul u0inv_param, %rdx + mulx( (mp), %r13, %r12) + mulx( 8,(mp), %r14, %r15) + xor R32(%rax), R32(%rax) + add %r12, %r14 + adc $0, %r15 + add %r11, %r13 C = 0 + adc 16(up), %r14 C rp[2] + adc $0, %r15 C -> up[1] + add %r14, %r10 + adc 24(up), %r15 + mov %r10, (%rbx) + mov %r15, 8(%rbx) + setc R8(%rax) + jmp L(ret) + +L(o2): lea 2(n), i C inner loop induction var + mulx( (mp), %r9, %r8) + mulx( 8,(mp), %r11, %r10) + sar $2, i + add %r8, %r11 + jmp L(lo2) + + ALIGN(16) +L(tp2): adc %rax, %r9 + lea 32(up), up + adc %r8, %r11 +L(lo2): mulx( 16,(mp), %r13, %r12) + mov (up), %r8 + mulx( 24,(mp), %rbx, %rax) + lea 32(mp), mp + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov 8(up), %r10 + mov 16(up), %r12 + add %r9, %r8 + mov 24(up), %rbp + mov %r8, (up) + adc %r11, %r10 + mulx( (mp), %r9, %r8) + mov %r10, 8(up) + adc %r13, %r12 + mov %r12, 16(up) + adc %rbx, %rbp + mulx( 8,(mp), %r11, %r10) + mov %rbp, 24(up) + inc i + jnz L(tp2) + +L(ed2): mov 56(up,n,8), %rdx C next iteration up[0] + lea 16(mp,n,8), mp C mp = (last starting mp) + adc %rax, %r9 + adc %r8, %r11 + mov 32(up), %r8 + adc $0, %r10 + imul u0inv, %rdx C next iteration q0 + mov 40(up), %rax + add %r9, %r8 + mov %r8, 32(up) + adc %r11, %rax + mov %rax, 40(up) + lea 56(up,n,8), up C up = (last starting up) + 1 + adc $0, %r10 + mov %r10, -8(up) + inc j + jnz L(o2) + + jmp L(cj) + + +L(bx1): test $2, R8(n) + jz L(o3a) + +L(o1a): cmp $-1, R32(n) + jnz L(o1b) + +C Special code for n = 1 since general code cannot handle it + mov 8(%rsp), %rbx C rp + lea 16(%rsp), %rsp C deallocate two slots + mulx( (mp), %r11, %r10) + add (up), %r11 + adc 8(up), %r10 + mov 
%r10, (%rbx) + mov $0, R32(%rax) + setc R8(%rax) + jmp L(ret) + +L(o1b): lea 24(mp), mp +L(o1): lea 1(n), i C inner loop induction var + mulx( -24,(mp), %r11, %r10) + mulx( -16,(mp), %r13, %r12) + mulx( -8,(mp), %rbx, %rax) + sar $2, i + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov (up), %r10 + mov 8(up), %r12 + mov 16(up), %rbp + add %r11, %r10 + jmp L(lo1) + + ALIGN(16) +L(tp1): adc %rax, %r9 + lea 32(up), up + adc %r8, %r11 + mulx( 16,(mp), %r13, %r12) + mov -8(up), %r8 + mulx( 24,(mp), %rbx, %rax) + lea 32(mp), mp + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov (up), %r10 + mov 8(up), %r12 + add %r9, %r8 + mov 16(up), %rbp + mov %r8, -8(up) + adc %r11, %r10 +L(lo1): mulx( (mp), %r9, %r8) + mov %r10, (up) + adc %r13, %r12 + mov %r12, 8(up) + adc %rbx, %rbp + mulx( 8,(mp), %r11, %r10) + mov %rbp, 16(up) + inc i + jnz L(tp1) + +L(ed1): mov 48(up,n,8), %rdx C next iteration up[0] + lea 40(mp,n,8), mp C mp = (last starting mp) + adc %rax, %r9 + adc %r8, %r11 + mov 24(up), %r8 + adc $0, %r10 + imul u0inv, %rdx C next iteration q0 + mov 32(up), %rax + add %r9, %r8 + mov %r8, 24(up) + adc %r11, %rax + mov %rax, 32(up) + lea 48(up,n,8), up C up = (last starting up) + 1 + adc $0, %r10 + mov %r10, -8(up) + inc j + jnz L(o1) + + jmp L(cj) + +L(o3a): cmp $-3, R32(n) + jnz L(o3b) + +C Special code for n = 3 since general code cannot handle it +L(n3): mulx( (mp), %rbx, %rax) + mulx( 8,(mp), %r9, %r14) + add (up), %rbx + mulx( 16,(mp), %r11, %r10) + adc %rax, %r9 C W 1 + adc %r14, %r11 C W 2 + mov 8(up), %r14 + mov u0inv_param, %rdx + adc $0, %r10 C W 3 + mov 16(up), %rax + add %r9, %r14 C W 1 + mov %r14, 8(up) + mulx( %r14, %rdx, %r13) C next iteration q0 + adc %r11, %rax C W 2 + mov %rax, 16(up) + adc $0, %r10 C W 3 + mov %r10, (up) + lea 8(up), up C up = (last starting up) + 1 + inc j + jnz L(n3) + + jmp L(cj) + +L(o3b): lea 8(mp), mp +L(o3): lea 4(n), i C inner loop induction var + mulx( -8,(mp), %rbx, %rax) + mulx( (mp), %r9, %r8) + mov (up), %rbp + mulx( 8,(mp), %r11, %r10) + sar $2, i + add %rbx, %rbp + nop + adc %rax, %r9 + jmp L(lo3) + + ALIGN(16) +L(tp3): adc %rax, %r9 + lea 32(up), up +L(lo3): adc %r8, %r11 + mulx( 16,(mp), %r13, %r12) + mov 8(up), %r8 + mulx( 24,(mp), %rbx, %rax) + lea 32(mp), mp + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov 16(up), %r10 + mov 24(up), %r12 + add %r9, %r8 + mov 32(up), %rbp + mov %r8, 8(up) + adc %r11, %r10 + mulx( (mp), %r9, %r8) + mov %r10, 16(up) + adc %r13, %r12 + mov %r12, 24(up) + adc %rbx, %rbp + mulx( 8,(mp), %r11, %r10) + mov %rbp, 32(up) + inc i + jnz L(tp3) + +L(ed3): mov 64(up,n,8), %rdx C next iteration up[0] + lea 24(mp,n,8), mp C mp = (last starting mp) + adc %rax, %r9 + adc %r8, %r11 + mov 40(up), %r8 + adc $0, %r10 + imul u0inv, %rdx C next iteration q0 + mov 48(up), %rax + add %r9, %r8 + mov %r8, 40(up) + adc %r11, %rax + mov %rax, 48(up) + lea 64(up,n,8), up C up = (last starting up) + 1 + adc $0, %r10 + mov %r10, -8(up) + inc j + jnz L(o3) + + jmp L(cj) + +L(o0b): lea 16(mp), mp +L(o0): mov n, i C inner loop induction var + mulx( -16,(mp), %r13, %r12) + mulx( -8,(mp), %rbx, %rax) + sar $2, i + add %r12, %rbx + adc $0, %rax + mov (up), %r12 + mov 8(up), %rbp + mulx( (mp), %r9, %r8) + add %r13, %r12 + jmp L(lo0) + + ALIGN(16) +L(tp0): adc %rax, %r9 + lea 32(up), up + adc %r8, %r11 + mulx( 16,(mp), %r13, %r12) + mov -16(up), %r8 + mulx( 24,(mp), %rbx, %rax) + lea 32(mp), mp + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov -8(up), %r10 + mov (up), %r12 + add %r9, %r8 + mov 8(up), %rbp + mov %r8, -16(up) + adc 
%r11, %r10 + mulx( (mp), %r9, %r8) + mov %r10, -8(up) + adc %r13, %r12 + mov %r12, (up) +L(lo0): adc %rbx, %rbp + mulx( 8,(mp), %r11, %r10) + mov %rbp, 8(up) + inc i + jnz L(tp0) + +L(ed0): mov 40(up,n,8), %rdx C next iteration up[0] + lea 32(mp,n,8), mp C mp = (last starting mp) + adc %rax, %r9 + adc %r8, %r11 + mov 16(up), %r8 + adc $0, %r10 + imul u0inv, %rdx C next iteration q0 + mov 24(up), %rax + add %r9, %r8 + mov %r8, 16(up) + adc %r11, %rax + mov %rax, 24(up) + lea 40(up,n,8), up C up = (last starting up) + 1 + adc $0, %r10 + mov %r10, -8(up) + inc j + jnz L(o0) + +L(cj): +IFSTD(` mov 8(%rsp), %rdi C param 1: rp + lea 16-8(%rsp), %rsp C deallocate 2, add back for alignment + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` mov up, %rdx C param 2: up + lea (up,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov 8(%rsp), %rcx C param 1: rp + lea 16-32-8(%rsp), %rsp') C deallocate 2, allocate shadow, align + + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) + +IFSTD(` lea 8(%rsp), %rsp ') +IFDOS(` lea 32+8(%rsp), %rsp') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm new file mode 100644 index 0000000..641cdf3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm @@ -0,0 +1,506 @@ +dnl AMD64 mpn_sqr_basecase optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1 +C AMD K8,K9 n/a n/a n/a +C AMD K10 n/a n/a n/a +C AMD bull n/a n/a n/a +C AMD pile n/a n/a n/a +C AMD steam ? ? ? +C AMD bobcat n/a n/a n/a +C AMD jaguar ? ? ? +C Intel P4 n/a n/a n/a +C Intel core n/a n/a n/a +C Intel NHM n/a n/a n/a +C Intel SBR n/a n/a n/a +C Intel IBR n/a n/a n/a +C Intel HWL 1.86 2.15 ~2.5 +C Intel BWL ? ? ? +C Intel atom n/a n/a n/a +C VIA nano n/a n/a n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund, except +C that the sqr_diag_addlsh1 loop was manually written. + +C TODO +C * Replace current unoptimised sqr_diag_addlsh1 loop; 1.75 c/l might be +C possible. +C * Consider splitting outer loop into 2, one for n = 1 (mod 2) and one for +C n = 0 (mod 2). 
These loops could fall into specific "corner" code. +C * Consider splitting outer loop into 4. +C * Streamline pointer updates. +C * Perhaps suppress a few more xor insns in feed-in code. +C * Make sure we write no dead registers in feed-in code. +C * We might use 32-bit size ops, since n >= 2^32 is non-terminating. Watch +C out for negative sizes being zero-extended, though. +C * Provide straight-line code for n = 4; then look for simplifications in +C main code. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') + + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + cmp $2, un_param + jae L(gt1) + + mov (up), %rdx + mulx( %rdx, %rax, %rdx) + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(gt1): jne L(gt2) + + mov (up), %rdx + mov 8(up), %rcx + mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2 + mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1 + mov %rcx, %rdx + mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3 + add %r9, %r9 C W 1 + adc %r10, %r10 C W 2 + adc $0, %rdx C W 3 + add %r9, %r8 C W 1 + adc %r11, %r10 C W 2 + adc $0, %rdx C W 3 + mov %rax, (rp) + mov %r8, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + +L(gt2): cmp $4, un_param + jae L(gt3) +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%r10') +define(`w2', `%r11') + + mov (up), v0 + mov 8(up), %rdx + mov %rdx, v1 + mulx( v0, w2, %rax) + mov 16(up), %rdx + mulx( v0, w0, %rcx) + mov w2, %r8 + add %rax, w0 + adc $0, %rcx + mulx( v1, %rdx, %rax) + add %rcx, %rdx + mov %rdx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + xor R32(%rcx), R32(%rcx) + mov (up), %rdx + mulx( %rdx, %rax, w2) + mov %rax, (rp) + add %r8, %r8 + adc w0, w0 + setc R8(%rcx) + mov 8(up), %rdx + mulx( %rdx, %rax, %rdx) + add w2, %r8 + adc %rax, w0 + mov %r8, 8(rp) + mov w0, 16(rp) + mov 24(rp), %r8 + mov 32(rp), w0 + lea (%rdx,%rcx), w2 + adc %r8, %r8 + adc w0, w0 + setc R8(%rcx) + mov 16(up), %rdx + mulx( %rdx, %rax, %rdx) + add w2, %r8 + adc %rax, w0 + mov %r8, 24(rp) + mov w0, 32(rp) + adc %rcx, %rdx + mov %rdx, 40(rp) + FUNC_EXIT() + ret + +L(gt3): + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%rbx') +define(`w3', `%rbp') +define(`un', `%r12') +define(`n', `%rcx') + +define(`X0', `%r13') +define(`X1', `%r14') + +L(do_mul_2): + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + mov $0, R32(un) + sub un_param, un C free up rdx + push un + mov (up), v0 + mov 8(up), %rdx + lea 2(un), n + sar $2, n C FIXME: suppress, change loop? 
+ inc un C decrement |un| + mov %rdx, v1 + + test $1, R8(un) + jnz L(mx1) + +L(mx0): mulx( v0, w2, w1) + mov 16(up), %rdx + mov w2, 8(rp) + xor w2, w2 + mulx( v0, w0, w3) + test $2, R8(un) + jz L(m00) + +L(m10): lea -8(rp), rp + lea -8(up), up + jmp L(mlo2) + +L(m00): lea 8(up), up + lea 8(rp), rp + jmp L(mlo0) + +L(mx1): mulx( v0, w0, w3) + mov 16(up), %rdx + mov w0, 8(rp) + xor w0, w0 + mulx( v0, w2, w1) + test $2, R8(un) + jz L(mlo3) + +L(m01): lea 16(rp), rp + lea 16(up), up + jmp L(mlo1) + + ALIGN(32) +L(mtop):mulx( v1, %rax, w0) + add %rax, w2 C 0 + mov (up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 C 1 + add %rax, w2 C 0 +L(mlo1):adc $0, w1 C 1 + add w3, w2 C 0 + mov w2, (rp) C 0 + adc $0, w1 C 1 + mulx( v1, %rax, w2) + add %rax, w0 C 1 + mov 8(up), %rdx + adc $0, w2 C 2 + mulx( v0, %rax, w3) + add %rax, w0 C 1 + adc $0, w3 C 2 +L(mlo0):add w1, w0 C 1 + mov w0, 8(rp) C 1 + adc $0, w3 C 2 + mulx( v1, %rax, w0) + add %rax, w2 C 2 + mov 16(up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 C 3 + add %rax, w2 C 2 + adc $0, w1 C 3 +L(mlo3):add w3, w2 C 2 + mov w2, 16(rp) C 2 + adc $0, w1 C 3 + mulx( v1, %rax, w2) + add %rax, w0 C 3 + mov 24(up), %rdx + adc $0, w2 C 4 + mulx( v0, %rax, w3) + add %rax, w0 C 3 + adc $0, w3 C 4 +L(mlo2):add w1, w0 C 3 + lea 32(up), up + mov w0, 24(rp) C 3 + adc $0, w3 C 4 + inc n + lea 32(rp), rp + jnz L(mtop) + +L(mend):mulx( v1, %rdx, %rax) + add %rdx, w2 + adc $0, %rax + add w3, w2 + mov w2, (rp) + adc $0, %rax + mov %rax, 8(rp) + + lea 16(up), up + lea -16(rp), rp + +L(do_addmul_2): +L(outer): + lea (up,un,8), up C put back up to 2 positions above last time + lea 48(rp,un,8), rp C put back rp to 4 positions above last time + + mov -8(up), v0 C shared between addmul_2 and corner + + add $2, un C decrease |un| + cmp $-2, un + jge L(corner) + + mov (up), v1 + + lea 1(un), n + sar $2, n C FIXME: suppress, change loop? 
+ + mov v1, %rdx + test $1, R8(un) + jnz L(bx1) + +L(bx0): mov (rp), X0 + mov 8(rp), X1 + mulx( v0, %rax, w1) + add %rax, X0 + adc $0, w1 + mov X0, (rp) + xor w2, w2 + test $2, R8(un) + jnz L(b10) + +L(b00): mov 8(up), %rdx + lea 16(rp), rp + lea 16(up), up + jmp L(lo0) + +L(b10): mov 8(up), %rdx + mov 16(rp), X0 + lea 32(up), up + inc n + mulx( v0, %rax, w3) + jz L(ex) + jmp L(lo2) + +L(bx1): mov (rp), X1 + mov 8(rp), X0 + mulx( v0, %rax, w3) + mov 8(up), %rdx + add %rax, X1 + adc $0, w3 + xor w0, w0 + mov X1, (rp) + mulx( v0, %rax, w1) + test $2, R8(un) + jz L(b11) + +L(b01): mov 16(rp), X1 + lea 24(rp), rp + lea 24(up), up + jmp L(lo1) + +L(b11): lea 8(rp), rp + lea 8(up), up + jmp L(lo3) + + ALIGN(32) +L(top): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 +L(lo2): add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + lea 32(rp), rp + add w1, X1 + mov -16(up), %rdx + mov X1, -24(rp) + adc $0, w3 + add w2, X0 + mov -8(rp), X1 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo1): add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + add w3, X0 + mov X0, -16(rp) + adc $0, w1 + add %rax, X1 + adc $0, w2 + add w0, X1 + mov -8(up), %rdx + adc $0, w2 +L(lo0): mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mov (rp), X0 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + add w1, X1 + mov X1, -8(rp) + adc $0, w3 + mov (up), %rdx + add w2, X0 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo3): add %rax, X0 + adc $0, w1 + mulx( v1, %rax, w2) + add w3, X0 + mov 8(rp), X1 + mov X0, (rp) + mov 16(rp), X0 + adc $0, w1 + add %rax, X1 + adc $0, w2 + mov 8(up), %rdx + lea 32(up), up + inc n + jnz L(top) + +L(end): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 +L(ex): add %rax, X1 + adc $0, w3 + mulx( v1, %rdx, %rax) + add w1, X1 + mov X1, 8(rp) + adc $0, w3 + add w2, %rdx + adc $0, %rax + add %rdx, w3 + mov w3, 16(rp) + adc $0, %rax + mov %rax, 24(rp) + + jmp L(outer) C loop until a small corner remains + +L(corner): + pop un + mov (up), %rdx + jg L(small_corner) + + mov %rdx, v1 + mov (rp), X0 + mov %rax, X1 C Tricky rax reuse of last iteration + mulx( v0, %rax, w1) + add %rax, X0 + adc $0, w1 + mov X0, (rp) + mov 8(up), %rdx + mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mulx( v1, %rdx, %rax) + add w1, X1 + mov X1, 8(rp) + adc $0, w3 + add w3, %rdx + mov %rdx, 16(rp) + adc $0, %rax + mov %rax, 24(rp) + lea 32(rp), rp + lea 16(up), up + jmp L(com) + +L(small_corner): + mulx( v0, X1, w3) + add %rax, X1 C Tricky rax reuse of last iteration + adc $0, w3 + mov X1, (rp) + mov w3, 8(rp) + lea 16(rp), rp + lea 8(up), up + +L(com): + +L(sqr_diag_addlsh1): + lea 8(up,un,8), up C put back up at its very beginning + lea (rp,un,8), rp + lea (rp,un,8), rp C put back rp at its very beginning + inc un + + mov -8(up), %rdx + xor R32(%rbx), R32(%rbx) C clear CF as side effect + mulx( %rdx, %rax, %r10) + mov %rax, 8(rp) + mov 16(rp), %r8 + mov 24(rp), %r9 + jmp L(dm) + + ALIGN(16) +L(dtop):mov 32(rp), %r8 + mov 40(rp), %r9 + lea 16(rp), rp + lea (%rdx,%rbx), %r10 +L(dm): adc %r8, %r8 + adc %r9, %r9 + setc R8(%rbx) + mov (up), %rdx + lea 8(up), up + mulx( %rdx, %rax, %rdx) + add %r10, %r8 + adc %rax, %r9 + mov %r8, 16(rp) + mov %r9, 24(rp) + inc un + jnz L(dtop) + +L(dend):adc %rbx, %rdx + mov %rdx, 32(rp) + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm new file mode 100644 index 0000000..eed64e7 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm @@ -0,0 
+1,200 @@ +dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k) +dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[] +dnl Optimised for Nehalem. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 4.75 +C Intel P4 ? +C Intel core2 2.8-3 +C Intel NHM 2.8 +C Intel SBR 3.55 +C Intel atom ? +C VIA nano ? + +C The inner-loop probably runs close to optimally on Nehalem (using 4-way +C unrolling). The rest of the code is quite crude, and could perhaps be made +C both smaller and faster. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt', `%r8') +define(`cy', `%r9') C for _nc variant + +ifdef(`OPERATION_addlsh_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(IFRSB, ) + define(func_n, mpn_addlsh_n) + define(func_nc, mpn_addlsh_nc)') +ifdef(`OPERATION_rsblsh_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(IFRSB, `$1') + define(func_n, mpn_rsblsh_n) + define(func_nc, mpn_rsblsh_nc)') + +C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with +C refmpn_rsblsh_nc +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func_n) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C cnt + push %rbx + xor R32(%rbx), R32(%rbx) C clear CF save register +L(ent): push %rbp + mov R32(n), R32(%rbp) + mov n, %rax + + mov R32(cnt), R32(%rcx) + neg R32(%rcx) + + lea -8(up,%rax,8), up + lea -8(vp,%rax,8), vp + lea -40(rp,%rax,8), rp + neg %rax + + and $3, R32(%rbp) + jz L(b0) + cmp $2, R32(%rbp) + jc L(b1) + jz L(b2) + +L(b3): xor R32(%r9), R32(%r9) + mov 8(vp,%rax,8), %r10 + mov 16(vp,%rax,8), %r11 + shrd %cl, %r10, %r9 + shrd %cl, %r11, %r10 + add R32(%rbx), R32(%rbx) + ADCSBB 8(up,%rax,8), %r9 + mov 24(vp,%rax,8), %r8 + ADCSBB 16(up,%rax,8), %r10 + sbb R32(%rbx), R32(%rbx) + add $3, %rax + jmp L(lo3) + +L(b0): mov 8(vp,%rax,8), %r9 + xor R32(%r8), R32(%r8) + shrd %cl, %r9, %r8 + mov 16(vp,%rax,8), %r10 + mov 24(vp,%rax,8), %r11 + shrd %cl, %r10, %r9 + shrd %cl, %r11, %r10 + add R32(%rbx), R32(%rbx) + ADCSBB 8(up,%rax,8), %r8 + mov %r8, 40(rp,%rax,8) C offset 40 + ADCSBB 16(up,%rax,8), %r9 + mov 32(vp,%rax,8), %r8 + ADCSBB 24(up,%rax,8), %r10 + sbb R32(%rbx), R32(%rbx) + add $4, %rax + jmp L(lo0) + +L(b1): mov 8(vp,%rax,8), %r8 + add $1, %rax + jz L(1) + mov 8(vp,%rax,8), %r9 
+ xor R32(%rbp), R32(%rbp) + jmp L(lo1) +L(1): xor R32(%r11), R32(%r11) + jmp L(wd1) + +L(b2): xor %r10, %r10 + mov 8(vp,%rax,8), %r11 + shrd %cl, %r11, %r10 + add R32(%rbx), R32(%rbx) + mov 16(vp,%rax,8), %r8 + ADCSBB 8(up,%rax,8), %r10 + sbb R32(%rbx), R32(%rbx) + add $2, %rax + jz L(end) + + ALIGN(16) +L(top): mov 8(vp,%rax,8), %r9 + mov %r11, %rbp +L(lo2): mov %r10, 24(rp,%rax,8) C offset 24 +L(lo1): shrd %cl, %r8, %rbp + shrd %cl, %r9, %r8 + mov 16(vp,%rax,8), %r10 + mov 24(vp,%rax,8), %r11 + shrd %cl, %r10, %r9 + shrd %cl, %r11, %r10 + add R32(%rbx), R32(%rbx) + ADCSBB (up,%rax,8), %rbp + ADCSBB 8(up,%rax,8), %r8 + mov %r8, 40(rp,%rax,8) C offset 40 + ADCSBB 16(up,%rax,8), %r9 + mov 32(vp,%rax,8), %r8 + ADCSBB 24(up,%rax,8), %r10 + sbb R32(%rbx), R32(%rbx) + add $4, %rax + mov %rbp, (rp,%rax,8) C offset 32 +L(lo0): +L(lo3): mov %r9, 16(rp,%rax,8) C offset 48 + jnz L(top) + +L(end): mov %r10, 24(rp,%rax,8) +L(wd1): shrd %cl, %r8, %r11 + add R32(%rbx), R32(%rbx) + ADCSBB (up,%rax,8), %r11 + mov %r11, 32(rp,%rax,8) C offset 32 + adc R32(%rax), R32(%rax) C rax is zero after loop + shr R8(%rcx), %r8 + ADDSUB %r8, %rax +IFRSB( neg %rax) + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C cnt +IFDOS(` mov 64(%rsp), %r9 ') C cy + push %rbx + neg cy + sbb R32(%rbx), R32(%rbx) C initialise CF save register + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm new file mode 100644 index 0000000..1be829f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm @@ -0,0 +1,190 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nehalem. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.0 +C AMD K10 4.0 +C AMD bull 5.0 +C AMD pile 4.84 5.39 +C AMD steam +C AMD excavator +C AMD bobcat 5.56 +C AMD jaguar 5.30 +C Intel P4 15.7 17.2 +C Intel core2 5.15 +C Intel NHM 4.56 +C Intel SBR 3.44 +C Intel HWL 3.03 +C Intel BWL 2.77 +C Intel SKL 2.76 +C Intel atom 21 +C Intel SLM 11 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C N.B.: Be careful if editing, making sure the loop alignment padding does not +C become large, as we currently fall into it. 
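+
+C In functional terms, mpn_addmul_1 computes {rp,n} += {up,n} * v0 and
+C returns the final carry limb; mpn_submul_1 is the analogous subtraction,
+C returning the borrow.  A minimal portable sketch of the addmul case, using
+C GMP's umul_ppmm double-limb multiply macro (cy, hi, lo are local limbs):
+C
+C	cy = 0;
+C	for (i = 0; i < n; i++)
+C	  {
+C	    umul_ppmm (hi, lo, up[i], v0);
+C	    lo += cy;
+C	    hi += (lo < cy);
+C	    rp[i] += lo;
+C	    cy = hi + (rp[i] < lo);
+C	  }
+C	return cy;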
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%rbx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + + mov (up), %rax + lea -8(up,n_param,8), up + mov (rp), %r8 + lea -8(rp,n_param,8), rp + + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): test $2, R8(n_param) + jnz L(b10) + +L(b00): mov $3, R32(n) + sub n_param, n + mul v0 + mov $0, R32(%r11) + mov %r8, %r10 + ADDSUB %rax, %r10 + mov -8(up,n,8), %rax + adc %rdx, %r11 + jmp L(lo0) + +L(b10): mov $1, R32(n) + sub n_param, n + mul v0 + mov %r8, %r10 + mov $0, R32(%r11) + ADDSUB %rax, %r10 + mov 8(up,n,8), %rax + adc %rdx, %r11 + jmp L(lo2) + +L(bx1): test $2, R8(n_param) + jz L(b01) + +L(b11): mov $2, R32(n) + sub n_param, n + mul v0 + ADDSUB %rax, %r8 + mov $0, R32(%r9) + mov (up,n,8), %rax + adc %rdx, %r9 + jmp L(lo3) + +L(b01): mov $0, R32(n) + sub n_param, n + xor %r11, %r11 + add $4, n + jc L(end) + + ALIGN(32) +L(top): mul v0 + ADDSUB %rax, %r8 + mov $0, R32(%r9) + mov -16(up,n,8), %rax + adc %rdx, %r9 +L(lo1): mul v0 + ADDSUB %r11, %r8 + mov $0, R32(%r11) + mov -16(rp,n,8), %r10 + adc $0, %r9 + ADDSUB %rax, %r10 + mov -8(up,n,8), %rax + adc %rdx, %r11 + mov %r8, -24(rp,n,8) + ADDSUB %r9, %r10 + adc $0, %r11 +L(lo0): mov -8(rp,n,8), %r8 + mul v0 + ADDSUB %rax, %r8 + mov $0, R32(%r9) + mov (up,n,8), %rax + adc %rdx, %r9 + mov %r10, -16(rp,n,8) + ADDSUB %r11, %r8 + adc $0, %r9 +L(lo3): mul v0 + mov (rp,n,8), %r10 + mov $0, R32(%r11) + ADDSUB %rax, %r10 + mov 8(up,n,8), %rax + adc %rdx, %r11 + mov %r8, -8(rp,n,8) + ADDSUB %r9, %r10 + adc $0, %r11 +L(lo2): mov 8(rp,n,8), %r8 + mov %r10, (rp,n,8) + add $4, n + jnc L(top) + +L(end): mul v0 + ADDSUB %rax, %r8 + mov $0, R32(%rax) + adc %rdx, %rax + ADDSUB %r11, %r8 + adc $0, %rax + mov %r8, (rp) + + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h new file mode 100644 index 0000000..f56c128 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h @@ -0,0 +1,238 @@ +/* Nehalem gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 2933-3200 MHz Intel Xeon X3470 Nehalem */ +/* FFT tuning limit = 468,424,931 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 10 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 17 + +#define DIV_1_VS_MUL_1_PERCENT 301 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 59 +#define MUL_TOOM44_THRESHOLD 169 +#define MUL_TOOM6H_THRESHOLD 230 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 104 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 101 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 147 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 98 +#define SQR_TOOM4_THRESHOLD 250 +#define SQR_TOOM6_THRESHOLD 351 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 28 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 372, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31, 8}, { 511,10}, \ + { 135,11}, { 79,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 319,12}, { 95,11}, { 191,10}, { 383,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,11}, { 319,10}, { 639,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 639,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 543,11}, { 1087,12}, { 607,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1279,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,12}, \ + { 2431,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,12}, { 2815,13}, { 1471,14}, { 767,13}, \ + { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1663,13}, { 3455,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 
1535,14}, { 3455,15}, { 1791,14}, { 3839,16}, \ + { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,17}, { 1023,16}, \ + { 2047,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,17}, { 2047,16}, { 4607,15}, \ + { 9983,16}, { 5631,15}, { 11775,17}, { 3071,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 204 +#define MUL_FFT_THRESHOLD 4224 + +#define SQR_FFT_MODF_THRESHOLD 336 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 336, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 135,11}, \ + { 79, 9}, { 319, 6}, { 2687, 7}, { 1407, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \ + { 319,12}, { 95,11}, { 191,10}, { 383,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,11}, { 319,10}, { 639,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,12}, \ + { 287,11}, { 607,12}, { 319,11}, { 671,12}, \ + { 351,11}, { 703,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,12}, { 447,11}, \ + { 895,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \ + { 959,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1343,13}, \ + { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,12}, { 2431,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ + { 2815,13}, { 1471,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3071,14}, { 1663,13}, \ + { 3455,14}, { 1919,16}, { 511,15}, { 1023,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,15}, { 1791,14}, \ + { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6655,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 218 +#define SQR_FFT_THRESHOLD 3520 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 49 +#define MULLO_MUL_N_THRESHOLD 8397 +#define 
SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 11 +#define SQRLO_SQR_THRESHOLD 7035 + +#define DC_DIV_QR_THRESHOLD 47 +#define DC_DIVAPPR_Q_THRESHOLD 151 +#define DC_BDIV_QR_THRESHOLD 40 +#define DC_BDIV_Q_THRESHOLD 30 + +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 199 +#define INV_APPR_THRESHOLD 157 + +#define BINV_NEWTON_THRESHOLD 254 +#define REDC_1_TO_REDC_N_THRESHOLD 48 + +#define MU_DIV_QR_THRESHOLD 1334 +#define MU_DIVAPPR_Q_THRESHOLD 1334 +#define MUPI_DIV_QR_THRESHOLD 83 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1308 + +#define POWM_SEC_TABLE 1,64,66,452,1486 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 141 +#define SET_STR_PRECOMPUTE_THRESHOLD 1023 + +#define FAC_DSC_THRESHOLD 182 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 5 /* 2.91% faster than 3 */ +#define HGCD_THRESHOLD 116 +#define HGCD_APPR_THRESHOLD 164 +#define HGCD_REDUCE_THRESHOLD 2205 +#define GCD_DC_THRESHOLD 321 +#define GCDEXT_DC_THRESHOLD 358 +#define JACOBI_BASE_METHOD 4 /* 0.12% faster than 1 */ + +/* Tuneup completed successfully, took 452116 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm new file mode 100644 index 0000000..a5a63e4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm @@ -0,0 +1,196 @@ +dnl AMD64 mpn_hamdist -- hamming distance. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 3.26 +C AMD bd1 4.2 +C AMD bd2 4.2 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen 1.15 +C AMD bobcat 7.29 +C AMD jaguar 2.53 +C Intel P4 n/a +C Intel core2 n/a +C Intel NHM 2.03 +C Intel SBR 1.66 +C Intel IBR 1.62 +C Intel HWL 1.50 +C Intel BWL 1.50 +C Intel SKL 1.50 +C Intel atom n/a +C Intel SLM 2.55 +C VIA nano n/a + +C TODO +C * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later +C Intel hardware. Perhaps mix such a loop with popcnt instructions. +C * The random placement of the L0, L1, L2, etc blocks are due to branch +C shortening. More work could be done there. +C * Combine the accumulators rax and rcx into one register to save some +C bookkeeping and a push/pop pair. Unfortunately this cause a slight +C slowdown for at leat NHM and SBR. 
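+C
+C The function computes the Hamming distance of two n-limb operands, i.e.
+C the total population count of {up,n} XOR {vp,n}.  A plain C sketch
+C (illustrative only, assuming 64-bit limbs and the GCC/Clang builtin):
+C
+C	mp_bitcnt_t hamdist (const mp_limb_t *up, const mp_limb_t *vp,
+C	                     mp_size_t n)
+C	{
+C	  mp_bitcnt_t cnt = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    cnt += (mp_bitcnt_t) __builtin_popcountll (up[i] ^ vp[i]);
+C	  return cnt;
+C	}
+C
+C The code below keeps two independent accumulators (rax and rcx) to hide
+C the popcnt latency, and enters the 4-way unrolled loop through a jump
+C table indexed by n mod 4.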
+ +define(`up', `%rdi') +define(`vp', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`sum', `lea ($1,$2), $2') +define(`sum', `add $1, $2') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_hamdist) + FUNC_ENTRY(3) + push %rbx + push %rbp + + mov (up), %r10 + xor (vp), %r10 + + mov R32(n), R32(%r8) + and $3, R32(%r8) + + xor R32(%rcx), R32(%rcx) + .byte 0xf3,0x49,0x0f,0xb8,0xc2 C popcnt %r10,%rax + + lea L(tab)(%rip), %r9 +ifdef(`PIC',` + movslq (%r9,%r8,4), %r8 + add %r9, %r8 + jmp *%r8 +',` + jmp *(%r9,%r8,8) +') + +L(3): mov 8(up), %r10 + mov 16(up), %r11 + xor 8(vp), %r10 + xor 16(vp), %r11 + xor R32(%rbp), R32(%rbp) + sub $4, n + jle L(x3) + mov 24(up), %r8 + mov 32(up), %r9 + add $24, up + add $24, vp + jmp L(e3) + +L(0): mov 8(up), %r9 + xor 8(vp), %r9 + mov 16(up), %r10 + mov 24(up), %r11 + xor R32(%rbx), R32(%rbx) + xor 16(vp), %r10 + xor 24(vp), %r11 + add $32, up + add $32, vp + sub $4, n + jle L(x4) + + ALIGN(16) +L(top): +L(e0): .byte 0xf3,0x49,0x0f,0xb8,0xe9 C popcnt %r9,%rbp + mov (up), %r8 + mov 8(up), %r9 + sum( %rbx, %rax) +L(e3): .byte 0xf3,0x49,0x0f,0xb8,0xda C popcnt %r10,%rbx + xor (vp), %r8 + xor 8(vp), %r9 + sum( %rbp, %rcx) +L(e2): .byte 0xf3,0x49,0x0f,0xb8,0xeb C popcnt %r11,%rbp + mov 16(up), %r10 + mov 24(up), %r11 + add $32, up + sum( %rbx, %rax) +L(e1): .byte 0xf3,0x49,0x0f,0xb8,0xd8 C popcnt %r8,%rbx + xor 16(vp), %r10 + xor 24(vp), %r11 + add $32, vp + sum( %rbp, %rcx) + sub $4, n + jg L(top) + +L(x4): .byte 0xf3,0x49,0x0f,0xb8,0xe9 C popcnt %r9,%rbp + sum( %rbx, %rax) +L(x3): .byte 0xf3,0x49,0x0f,0xb8,0xda C popcnt %r10,%rbx + sum( %rbp, %rcx) + .byte 0xf3,0x49,0x0f,0xb8,0xeb C popcnt %r11,%rbp + sum( %rbx, %rax) + sum( %rbp, %rcx) +L(x2): add %rcx, %rax +L(x1): pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(2): mov 8(up), %r11 + xor 8(vp), %r11 + sub $2, n + jle L(n2) + mov 16(up), %r8 + mov 24(up), %r9 + xor R32(%rbx), R32(%rbx) + xor 16(vp), %r8 + xor 24(vp), %r9 + add $16, up + add $16, vp + jmp L(e2) +L(n2): .byte 0xf3,0x49,0x0f,0xb8,0xcb C popcnt %r11,%rcx + jmp L(x2) + +L(1): dec n + jle L(x1) + mov 8(up), %r8 + mov 16(up), %r9 + xor 8(vp), %r8 + xor 16(vp), %r9 + xor R32(%rbp), R32(%rbp) + mov 24(up), %r10 + mov 32(up), %r11 + add $40, up + add $8, vp + jmp L(e1) + +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm new file mode 100644 index 0000000..0a3c867 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm @@ -0,0 +1,182 @@ +dnl AMD64 mpn_popcount -- population count. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 1.39 +C AMD bd1 4 +C AMD bd2 4 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen 0.72 +C AMD bobcat 5.78 +C AMD jaguar 1.27 +C Intel P4 n/a +C Intel core2 n/a +C Intel NHM 1.04 +C Intel SBR 1.02 +C Intel IBR 1.0 +C Intel HWL 1.0 +C Intel BWL 1.0 +C Intel SKL 1.0 +C Intel atom n/a +C Intel SLM 1.34 +C VIA nano n/a + +C TODO +C * We could approach 0.5 c/l for AMD Zen with more unrolling. That would +C not cause any additional feed-in overhead as we already use a jump table. +C * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later +C Intel hardware. Perhaps mix such a loop with popcnt instructions. +C * The random placement of the L0, L1, L2, etc blocks are due to branch +C shortening. + +define(`up', `%rdi') +define(`n', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_popcount) + FUNC_ENTRY(2) + + mov R32(n), R32(%r8) + and $7, R32(%r8) + + .byte 0xf3,0x48,0x0f,0xb8,0x07 C popcnt (up), %rax + xor R32(%rcx), R32(%rcx) + + lea L(tab)(%rip), %r9 +ifdef(`PIC',` + movslq (%r9,%r8,4), %r8 + add %r9, %r8 + jmp *%r8 +',` + jmp *(%r9,%r8,8) +') + +L(3): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 C popcnt 8(up), %r10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 C popcnt 16(up), %r11 + add $24, up + sub $8, n + jg L(e34) + add %r10, %rax + add %r11, %rax +L(s1): FUNC_EXIT() + ret + +L(1): sub $8, n + jle L(s1) + .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 C popcnt 8(up), %r8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 C popcnt 16(up), %r9 + add $8, up + jmp L(e12) + +L(7): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 C popcnt 0x8(%rdi),%r10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 C popcnt 0x10(%rdi),%r11 + add $-8, up + jmp L(e07) + +L(0): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx + .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11 + jmp L(e07) + +L(4): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx + .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11 + add $32, up + sub $8, n + jle L(x4) + + ALIGN(16) +L(top): +L(e34): .byte 0xf3,0x4c,0x0f,0xb8,0x07 C popcnt (%rdi),%r8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%r9 + add %r10, %rcx + add %r11, %rax +L(e12): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11 + add %r8, %rcx + add %r9, %rax +L(e07): .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x20 C popcnt 0x20(%rdi),%r8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x28 C popcnt 0x28(%rdi),%r9 + add %r10, %rcx + add %r11, %rax +L(e56): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x30 C popcnt 0x30(%rdi),%r10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x38 C popcnt 0x38(%rdi),%r11 + add $64, up + add %r8, %rcx + add %r9, %rax + sub $8, n + jg L(top) + +L(x4): add %r10, %rcx + add %r11, %rax +L(x2): add %rcx, %rax + + FUNC_EXIT() + ret + +L(2): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx + sub $8, n + jle L(x2) + .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 C popcnt 0x10(%rdi),%r8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 C popcnt 0x18(%rdi),%r9 + add $16, up + jmp L(e12) + +L(5): .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 C popcnt 0x8(%rdi),%r8 + 
.byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 C popcnt 0x10(%rdi),%r9 + add $-24, up + jmp L(e56) + +L(6): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx + .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 C popcnt 0x10(%rdi),%r8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 C popcnt 0x18(%rdi),%r9 + add $-16, up + jmp L(e56) +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(4), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm new file mode 100644 index 0000000..fc71c1b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm @@ -0,0 +1,549 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Nehalem and Westmere. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
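+C
+C The operation is a Montgomery reduction: given a 2n-limb value {up,2n},
+C an n-limb odd modulus {mp,n}, and u0inv = -1/mp[0] mod 2^64, it computes
+C {up,2n} * 2^(-64n) mod {mp,n} into {rp,n}, returning the carry out of the
+C final addition, on which the caller performs a conditional subtraction of
+C the modulus.  An illustrative sketch of the same computation in terms of
+C public mpn calls (not the code below):
+C
+C	mp_limb_t redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp,
+C	                  mp_size_t n, mp_limb_t u0inv)
+C	{
+C	  for (mp_size_t j = 0; j < n; j++)
+C	    {
+C	      mp_limb_t q = up[0] * u0inv;		   /* mod 2^64 */
+C	      mp_limb_t cy = mpn_addmul_1 (up, mp, n, q);  /* zeroes up[0] */
+C	      up[0] = cy;				   /* park the carry */
+C	      up++;
+C	    }
+C	  return mpn_add_n (rp, up, up - n, n);	/* high half + parked carries */
+C	}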
+define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea (mp_param,n,8), mp + lea (up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(bx0) + +L(bx1): test $2, R8(n) + jz L(b3) + +L(b1): cmp $-1, R32(n) + jz L(n1) + +L(otp1):lea 3(n), i + mov (mp,n,8), %rax + mov (up,n,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov 8(mp,n,8), %rax + adc %rdx, %r9 + mul q0 + mov $0, R32(%r11) + mov 8(up,n,8), %rbx + add %rax, %rbx + mov 16(mp,n,8), %rax + adc %rdx, %r11 + add %r9, %rbx + adc $0, %r11 + mov 16(up,n,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov 24(mp,n,8), %rax + adc %rdx, %r9 + mov %rbx, 8(up,n,8) + imul u0inv, %rbx C next q limb + jmp L(e1) + + ALIGNx +L(tp1): mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov -16(mp,i,8), %rax + adc %rdx, %r9 + mul q0 + add %r11, %rbp + mov $0, R32(%r11) + mov -16(up,i,8), %r10 + adc $0, %r9 + add %rax, %r10 + mov -8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -24(up,i,8) + add %r9, %r10 + adc $0, %r11 + mov -8(up,i,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov (mp,i,8), %rax + adc %rdx, %r9 + mov %r10, -16(up,i,8) +L(e1): add %r11, %rbp + adc $0, %r9 + mul q0 + mov (up,i,8), %r10 + mov $0, R32(%r11) + add %rax, %r10 + mov 8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -8(up,i,8) + add %r9, %r10 + adc $0, %r11 + mov 8(up,i,8), %rbp + mov %r10, (up,i,8) + add $4, i + jnc L(tp1) + +L(ed1): mul q0 + add %rax, %rbp + adc $0, %rdx + add %r11, %rbp + adc $0, %rdx + mov %rbp, I(-8(up),-24(up,i,8)) + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b3): cmp $-3, R32(n) + jz L(n3) + +L(otp3):lea 5(n), i + mov (mp,n,8), %rax + mov (up,n,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov 8(mp,n,8), %rax + adc %rdx, %r9 + mul q0 + mov 8(up,n,8), %rbx + mov $0, R32(%r11) + add %rax, %rbx + mov 16(mp,n,8), %rax + adc %rdx, %r11 + add %r9, %rbx + adc $0, %r11 + mov 16(up,n,8), %rbp + mov %rbx, 8(up,n,8) + imul u0inv, %rbx C next q limb +C jmp L(tp3) + + ALIGNx +L(tp3): mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov -16(mp,i,8), %rax + adc %rdx, %r9 + mul q0 + add %r11, %rbp + mov $0, R32(%r11) + mov -16(up,i,8), %r10 + adc $0, %r9 + add %rax, %r10 + mov -8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -24(up,i,8) + add %r9, %r10 + adc $0, %r11 + mov -8(up,i,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov (mp,i,8), %rax + adc %rdx, %r9 + mov %r10, -16(up,i,8) + add %r11, %rbp + adc $0, %r9 + mul q0 + mov (up,i,8), %r10 + mov $0, R32(%r11) + add %rax, %r10 + mov 8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -8(up,i,8) + add %r9, %r10 + adc $0, %r11 + mov 8(up,i,8), %rbp + mov %r10, (up,i,8) + add $4, i + jnc L(tp3) + +L(ed3): mul q0 + add %rax, %rbp + adc $0, %rdx + add %r11, %rbp + adc $0, %rdx + mov %rbp, I(-8(up),-24(up,i,8)) + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + 
jnz L(otp3) +C jmp L(cj) + +L(cj): +IFSTD(` lea (up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea (up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(bx0): test $2, R8(n) + jnz L(b2) + +L(b0): +L(otp0):lea 2(n), i + mov (mp,n,8), %rax + mul q0 + mov $0, R32(%r11) + mov (up,n,8), %r10 + add %rax, %r10 + mov 8(mp,n,8), %rax + adc %rdx, %r11 + mov 8(up,n,8), %rbx + mul q0 + add %rax, %rbx + mov $0, R32(%r9) + mov 16(mp,n,8), %rax + adc %rdx, %r9 + add %r11, %rbx + adc $0, %r9 + mul q0 + mov 16(up,n,8), %r10 + mov $0, R32(%r11) + add %rax, %r10 + mov 24(mp,n,8), %rax + adc %rdx, %r11 + mov %rbx, 8(up,n,8) + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov -16(mp,i,8), %rax + adc %rdx, %r9 + mul q0 + add %r11, %rbp + mov $0, R32(%r11) + mov -16(up,i,8), %r10 + adc $0, %r9 + add %rax, %r10 + mov -8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -24(up,i,8) + add %r9, %r10 + adc $0, %r11 + mov -8(up,i,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov (mp,i,8), %rax + adc %rdx, %r9 + mov %r10, -16(up,i,8) + add %r11, %rbp + adc $0, %r9 + mul q0 + mov (up,i,8), %r10 + mov $0, R32(%r11) + add %rax, %r10 + mov 8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -8(up,i,8) +L(e0): add %r9, %r10 + adc $0, %r11 + mov 8(up,i,8), %rbp + mov %r10, (up,i,8) + add $4, i + jnc L(tp0) + +L(ed0): mul q0 + add %rax, %rbp + adc $0, %rdx + add %r11, %rbp + adc $0, %rdx + mov %rbp, I(-8(up),-24(up,i,8)) + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + jmp L(cj) + +L(b2): cmp $-2, R32(n) + jz L(n2) + +L(otp2):lea 4(n), i + mov (mp,n,8), %rax + mul q0 + mov (up,n,8), %r10 + mov $0, R32(%r11) + add %rax, %r10 + mov 8(mp,n,8), %rax + adc %rdx, %r11 + mov 8(up,n,8), %rbx + mul q0 + add %rax, %rbx + mov $0, R32(%r9) + mov 16(mp,n,8), %rax + adc %rdx, %r9 + mul q0 + add %r11, %rbx + mov $0, R32(%r11) + mov 16(up,n,8), %r10 + adc $0, %r9 + add %rax, %r10 + mov 24(mp,n,8), %rax + adc %rdx, %r11 + mov %rbx, 8(up,n,8) + imul u0inv, %rbx C next q limb + jmp L(e2) + + ALIGNx +L(tp2): mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov -16(mp,i,8), %rax + adc %rdx, %r9 + mul q0 + add %r11, %rbp + mov $0, R32(%r11) + mov -16(up,i,8), %r10 + adc $0, %r9 + add %rax, %r10 + mov -8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -24(up,i,8) +L(e2): add %r9, %r10 + adc $0, %r11 + mov -8(up,i,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov (mp,i,8), %rax + adc %rdx, %r9 + mov %r10, -16(up,i,8) + add %r11, %rbp + adc $0, %r9 + mul q0 + mov (up,i,8), %r10 + mov $0, R32(%r11) + add %rax, %r10 + mov 8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -8(up,i,8) + add %r9, %r10 + adc $0, %r11 + mov 8(up,i,8), %rbp + mov %r10, (up,i,8) + add $4, i + jnc L(tp2) + +L(ed2): mul q0 + add %rax, %rbp + adc $0, %rdx + add %r11, %rbp + adc $0, %rdx + mov %rbp, I(-8(up),-24(up,i,8)) + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp2) + jmp L(cj) + +L(n1): mov (mp_param), %rax + mul q0 + add -8(up), %rax + adc (up), %rdx + mov %rdx, (rp) + mov $0, 
R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov -16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov -8(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -16(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov (up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 8(up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -24(mp), %rax + mov -24(up), %r10 + mul q0 + add %rax, %r10 + mov -16(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov -16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + add %r11, %rbp + mov -8(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, -16(up) + add %r9, %r10 + adc $0, %r11 + mov %r10, -8(up) + mov %r11, -24(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + + mov -48(up), %rdx + mov -40(up), %rbx + xor R32(%rax), R32(%rax) + add %rbp, %rdx + adc %r10, %rbx + adc -8(up), %r11 + mov %rdx, (rp) + mov %rbx, 8(rp) + mov %r11, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm new file mode 100644 index 0000000..e436034 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm new file mode 100644 index 0000000..21f0bf4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm @@ -0,0 +1,224 @@ +dnl AMD64 mpn_addmul_2 optimised for Intel Sandy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR 2.93 this +C Intel IBR 2.66 this +C Intel HWL 2.5 2.15 +C Intel BWL +C Intel atom +C VIA nano + +C This code is the result of running a code generation and optimisation tool +C suite written by David Harvey and Torbjorn Granlund. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rbx') +define(`v1', `%rbp') +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') +define(`X0', `%r12') +define(`X1', `%r13') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + + mov (vp), v0 + mov 8(vp), v1 + + mov (up), %rax + + mov n_param, n + neg n + + lea (up,n_param,8), up + lea 8(rp,n_param,8), rp + mul v0 + + test $1, R8(n) + jnz L(bx1) + +L(bx0): mov -8(rp,n,8), X0 + mov %rdx, w1 + add %rax, X0 + adc $0, w1 + mov (up,n,8), %rax + xor w0, w0 + xor w3, w3 + test $2, R8(n) + jnz L(b10) + +L(b00): nop C this nop make loop go faster on SBR! 
+ mul v1 + mov (rp,n,8), X1 + jmp L(lo0) + +L(b10): lea -2(n), n + jmp L(lo2) + +L(bx1): mov -8(rp,n,8), X1 + mov %rdx, w3 + add %rax, X1 + adc $0, w3 + mov (up,n,8), %rax + xor w1, w1 + xor w2, w2 + test $2, R8(n) + jz L(b11) + +L(b01): mov (rp,n,8), X0 + inc n + jmp L(lo1) + +L(b11): dec n + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo1): mul v1 + mov %rdx, w0 C 1 + add %rax, X0 C 0 + adc $0, w0 C 1 + add w1, X1 C 3 + adc $0, w3 C 0 + add w2, X0 C 0 + adc $0, w0 C 1 + mov (up,n,8), %rax + mul v0 + add %rax, X0 C 0 + mov %rdx, w1 C 1 + adc $0, w1 C 1 + mov (up,n,8), %rax + mul v1 + mov X1, -16(rp,n,8) C 3 + mov (rp,n,8), X1 C 1 + add w3, X0 C 0 + adc $0, w1 C 1 +L(lo0): mov %rdx, w2 C 2 + mov X0, -8(rp,n,8) C 0 + add %rax, X1 C 1 + adc $0, w2 C 2 + mov 8(up,n,8), %rax + add w0, X1 C 1 + adc $0, w2 C 2 + mul v0 + add %rax, X1 C 1 + mov %rdx, w3 C 2 + adc $0, w3 C 2 + mov 8(up,n,8), %rax +L(lo3): mul v1 + add w1, X1 C 1 + mov 8(rp,n,8), X0 C 2 + adc $0, w3 C 2 + mov %rdx, w0 C 3 + add %rax, X0 C 2 + adc $0, w0 C 3 + mov 16(up,n,8), %rax + mul v0 + add w2, X0 C 2 + mov X1, (rp,n,8) C 1 + mov %rdx, w1 C 3 + adc $0, w0 C 3 + add %rax, X0 C 2 + adc $0, w1 C 3 + mov 16(up,n,8), %rax + add w3, X0 C 2 + adc $0, w1 C 3 +L(lo2): mul v1 + mov 16(rp,n,8), X1 C 3 + add %rax, X1 C 3 + mov %rdx, w2 C 4 + adc $0, w2 C 4 + mov 24(up,n,8), %rax + mov X0, 8(rp,n,8) C 2 + mul v0 + add w0, X1 C 3 + mov %rdx, w3 C 4 + adc $0, w2 C 4 + add %rax, X1 C 3 + mov 24(up,n,8), %rax + mov 24(rp,n,8), X0 C 0 useless but harmless final read + adc $0, w3 C 4 + add $4, n + jnc L(top) + +L(end): mul v1 + add w1, X1 + adc $0, w3 + add w2, %rax + adc $0, %rdx + mov X1, I(-16(rp),-16(rp,n,8)) + add w3, %rax + adc $0, %rdx + mov %rax, I(-8(rp),-8(rp,n,8)) + mov %rdx, %rax + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm new file mode 100644 index 0000000..2319a80 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm @@ -0,0 +1,54 @@ +dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) +dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
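+
+dnl  For orientation (illustrative C only, assuming 64-bit limbs and
+dnl  unsigned __int128; not part of GMP): mpn_addlsh1_n computes
+dnl  {rp,n} = {up,n} + 2*{vp,n} and returns the carry out, roughly
+dnl
+dnl	cy = 0;
+dnl	for (i = 0; i < n; i++)
+dnl	  {
+dnl	    unsigned __int128 t = (unsigned __int128) up[i]
+dnl	                          + 2 * (unsigned __int128) vp[i] + cy;
+dnl	    rp[i] = (mp_limb_t) t;
+dnl	    cy = (mp_limb_t) (t >> 64);		/* 0, 1 or 2 */
+dnl	  }
+dnl	return cy;
+dnl
+dnl  mpn_rsblsh1_n analogously forms {rp,n} = 2*{vp,n} - {up,n}.  This
+dnl  wrapper only fixes the shift count (LSH = 1, RSH = 63) and the
+dnl  entry-point names, then includes the shared coreisbr aorrlshC_n.asm
+dnl  body.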
+ +include(`../config.m4') + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_addlsh1_n) + define(func_nc, mpn_addlsh1_nc)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsblsh1_n) + define(func_nc, mpn_rsblsh1_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) +include_mpn(`x86_64/coreisbr/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm new file mode 100644 index 0000000..3b7bb22 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm @@ -0,0 +1,56 @@ +dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2) +dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_addlsh2_n) + define(func_nc, mpn_addlsh2_nc)') +ifdef(`OPERATION_rsblsh2_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsblsh2_n) + define(func_nc, mpn_rsblsh2_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C mpn_rsblsh2_nc removed below, its idea of carry-in is inconsistent with +C refmpn_rsblsh2_nc +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n) +include_mpn(`x86_64/coreisbr/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm new file mode 100644 index 0000000..23ace41 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm @@ -0,0 +1,173 @@ +dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C) +dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[] + +dnl Copyright 2009-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 3.25 +C Intel NHM 4 +C Intel SBR 2 C (or 1.95 when L(top)'s alignment = 16 (mod 32)) +C Intel atom ? +C VIA nano ? + +C This code probably runs close to optimally on Sandy Bridge (using 4-way +C unrolling). It also runs reasonably well on Core 2, but it runs poorly on +C all other processors, including Nehalem. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbp + mov cy, %rax + neg %rax C set msb on carry + xor R32(%rbp), R32(%rbp) C limb carry + mov (vp), %r8 + shrd $RSH, %r8, %rbp + mov R32(n), R32(%r9) + and $3, R32(%r9) + je L(b00) + cmp $2, R32(%r9) + jc L(b01) + je L(b10) + jmp L(b11) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbp + xor R32(%rbp), R32(%rbp) C limb carry + mov (vp), %r8 + shrd $RSH, %r8, %rbp + mov R32(n), R32(%rax) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): mov 8(vp), %r9 + shrd $RSH, %r9, %r8 + mov 16(vp), %r10 + shrd $RSH, %r10, %r9 + add R32(%rax), R32(%rax) C init carry flag + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + mov %rbp, (rp) + mov %r8, 8(rp) + mov %r9, 16(rp) + mov %r10, %rbp + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + sbb R32(%rax), R32(%rax) C save carry flag + sub $3, n + ja L(top) + jmp L(end) + +L(b01): add R32(%rax), R32(%rax) C init carry flag + ADCSBB (up), %rbp + mov %rbp, (rp) + mov %r8, %rbp + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + sbb R32(%rax), R32(%rax) C save carry flag + sub $1, n + ja L(top) + jmp L(end) + +L(b10): mov 8(vp), %r9 + shrd $RSH, %r9, %r8 + add R32(%rax), R32(%rax) C init carry flag + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + mov %rbp, (rp) + mov %r8, 8(rp) + mov %r9, %rbp + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + sbb R32(%rax), R32(%rax) C save carry flag + sub $2, n + ja L(top) + jmp L(end) + + ALIGN(16) +L(top): mov (vp), %r8 + shrd $RSH, %r8, %rbp +L(b00): mov 8(vp), %r9 + shrd $RSH, %r9, %r8 + mov 16(vp), %r10 + shrd $RSH, %r10, %r9 + mov 24(vp), %r11 + shrd $RSH, %r11, %r10 + lea 32(vp), vp + add R32(%rax), R32(%rax) C restore carry flag + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + lea 32(up), up + mov %rbp, (rp) + mov %r8, 8(rp) + mov %r9, 16(rp) + mov %r10, 24(rp) + mov %r11, %rbp + lea 32(rp), rp + sbb R32(%rax), R32(%rax) C save carry flag + sub $4, n + jnz L(top) + +L(end): shr $RSH, %rbp + add R32(%rax), R32(%rax) C restore carry flag + ADCSBB $0, %rbp + mov %rbp, %rax + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm new file mode 100644 index 0000000..db8ee68 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm @@ -0,0 +1,215 @@ +dnl AMD64 mpn_addlsh_n -- 
rp[] = up[] + (vp[] << k) +dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[] +dnl Optimised for Sandy Bridge. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 5.25 +C Intel P4 ? +C Intel core2 3.1 +C Intel NHM 3.95 +C Intel SBR 2.75 +C Intel atom ? +C VIA nano ? + +C The inner-loop probably runs close to optimally on Sandy Bridge (using 4-way +C unrolling). The rest of the code is quite crude, and could perhaps be made +C both smaller and faster. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt', `%r8') +define(`cy', `%r9') C for _nc variant + +ifdef(`OPERATION_addlsh_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(IFRSB, ) + define(func_n, mpn_addlsh_n) + define(func_nc, mpn_addlsh_nc)') +ifdef(`OPERATION_rsblsh_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(IFRSB, `$1') + define(func_n, mpn_rsblsh_n) + define(func_nc, mpn_rsblsh_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with +C refmpn_rsblsh_nc +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func_n) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C cnt + push %rbx + xor R32(%rbx), R32(%rbx) C clear CF save register +L(ent): push %rbp + mov R32(n), R32(%rbp) + mov n, %rax + mov R32(cnt), R32(%rcx) + neg R32(%rcx) + and $3, R32(%rbp) + jz L(b0) + lea -32(vp,%rbp,8), vp + lea -32(up,%rbp,8), up + lea -32(rp,%rbp,8), rp + cmp $2, R32(%rbp) + jc L(b1) + jz L(b2) + +L(b3): xor %r8, %r8 + mov 8(vp), %r9 + mov 16(vp), %r10 + shrd R8(%rcx), %r9, %r8 + shrd R8(%rcx), %r10, %r9 + mov 24(vp), %r11 + shrd R8(%rcx), %r11, %r10 + sub $3, %rax + jz L(3) + add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + lea 32(up), up + jmp L(lo3) +L(3): add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + jmp L(wd3) + +L(b0): mov (vp), %r8 + mov 8(vp), %r9 + xor R32(%rbp), R32(%rbp) + jmp L(lo0) + +L(b1): xor %r10, %r10 + mov 24(vp), %r11 + shrd R8(%rcx), %r11, %r10 + sub $1, %rax + jz L(1) + add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB 24(up), %r10 + lea 32(up), up + mov (vp), %r8 + jmp L(lo1) +L(1): add R32(%rbx), R32(%rbx) + ADCSBB 24(up), %r10 + jmp L(wd1) + +L(b2): xor 
%r9, %r9 + mov 16(vp), %r10 + shrd R8(%rcx), %r10, %r9 + mov 24(vp), %r11 + shrd R8(%rcx), %r11, %r10 + sub $2, %rax + jz L(2) + add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + lea 32(up), up + jmp L(lo2) +L(2): add R32(%rbx), R32(%rbx) + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + jmp L(wd2) + + ALIGN(32) C 16-byte alignment is not enough! +L(top): shrd R8(%rcx), %r11, %r10 + add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + mov %rbp, (rp) + lea 32(up), up +L(lo3): mov %r8, 8(rp) +L(lo2): mov %r9, 16(rp) + mov (vp), %r8 +L(lo1): mov %r10, 24(rp) + mov 8(vp), %r9 + mov %r11, %rbp + lea 32(rp), rp + sbb R32(%rbx), R32(%rbx) +L(lo0): shrd R8(%rcx), %r8, %rbp + mov 16(vp), %r10 + shrd R8(%rcx), %r9, %r8 + shrd R8(%rcx), %r10, %r9 + mov 24(vp), %r11 + sub $4, %rax + jg L(top) + + shrd R8(%rcx), %r11, %r10 + add R32(%rbx), R32(%rbx) + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + mov %rbp, (rp) +L(wd3): mov %r8, 8(rp) +L(wd2): mov %r9, 16(rp) +L(wd1): mov %r10, 24(rp) + adc R32(%rax), R32(%rax) C rax is zero after loop + shr R8(%rcx), %r11 + ADDSUB %r11, %rax +IFRSB( neg %rax) + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C cnt +IFDOS(` mov 64(%rsp), %r9 ') C cy + push %rbx + neg cy + sbb R32(%rbx), R32(%rbx) C initialise CF save register + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm new file mode 100644 index 0000000..61fee3e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm @@ -0,0 +1,203 @@ +dnl AMD64 mpn_add_n, mpn_sub_n optimised for Sandy bridge, Ivy bridge, and +dnl Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1.75\2.52 +C AMD K10 1.5 +C AMD bd1 1.69\2.25 +C AMD bd2 1.65 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen 1.5 +C AMD bt1 2.67 +C AMD bt2 2.16 +C Intel P4 11.54 +C Intel PNR 5 +C Intel NHM 5.5 +C Intel SBR 1.54 +C Intel IBR 1.5 +C Intel HWL 1.32 +C Intel BWL 1.07 +C Intel SKL 1.21 +C Intel atom 4.3 +C Intel SLM 3 +C VIA nano ? + +C The loop of this code was manually written. It runs close to optimally on +C Intel SBR, IBR, and HWL far as we know, except for the fluctuation problems. 
+C It also runs slightly faster on average on AMD bd1 and bd2. +C +C No micro-optimisation has been done. +C +C N.B.! The loop alignment padding insns are executed. If editing the code, +C make sure the padding does not become excessive. It is now a 4-byte nop. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + xor %r8, %r8 + +L(ent): mov R32(n), R32(%rax) + shr $2, n + + test $1, R8(%rax) + jnz L(bx1) + +L(bx0): test $2, R8(%rax) + jnz L(b10) + +L(b00): neg %r8 + mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + ADCSBB 16(vp), %r10 + ADCSBB 24(vp), %r11 + lea 32(vp), vp + lea -16(rp), rp + jmp L(lo0) + +L(b10): neg %r8 + mov (up), %r10 + mov 8(up), %r11 + ADCSBB 0(vp), %r10 + ADCSBB 8(vp), %r11 + jrcxz L(e2) + mov 16(up), %r8 + mov 24(up), %r9 + lea 16(up), up + ADCSBB 16(vp), %r8 + ADCSBB 24(vp), %r9 + lea 16(vp), vp +C lea (rp), rp + jmp L(lo2) + +L(e2): mov %r10, (rp) + mov %r11, 8(rp) + setc R8(%rax) + FUNC_EXIT() + ret + +L(bx1): test $2, R8(%rax) + jnz L(b11) + +L(b01): neg %r8 + mov (up), %r11 + ADCSBB (vp), %r11 + jrcxz L(e1) + mov 8(up), %r8 + mov 16(up), %r9 + lea 8(up), up + lea -8(rp), rp + ADCSBB 8(vp), %r8 + ADCSBB 16(vp), %r9 + lea 8(vp), vp + jmp L(lo1) + +L(e1): mov %r11, (rp) + setc R8(%rax) + FUNC_EXIT() + ret + +L(b11): neg %r8 + mov (up), %r9 + ADCSBB (vp), %r9 + mov 8(up), %r10 + mov 16(up), %r11 + lea 24(up), up + ADCSBB 8(vp), %r10 + ADCSBB 16(vp), %r11 + lea 24(vp), vp + mov %r9, (rp) + lea 8(rp), rp + jrcxz L(end) + + ALIGN(32) +L(top): mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 +L(lo2): mov %r10, (rp) +L(lo1): mov %r11, 8(rp) + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + ADCSBB 16(vp), %r10 + ADCSBB 24(vp), %r11 + lea 32(vp), vp +L(lo0): mov %r8, 16(rp) +L(lo3): mov %r9, 24(rp) + lea 32(rp), rp + dec n + jnz L(top) + +L(end): mov R32(n), R32(%rax) C zero rax + mov %r10, (rp) + mov %r11, 8(rp) + setc R8(%rax) + FUNC_EXIT() + ret +EPILOGUE() + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm new file mode 100644 index 0000000..b4c1572 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm @@ -0,0 +1,212 @@ +dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Sandy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.27 +C AMD K10 4.27 4.54 +C AMD bull 4.76 +C AMD pile 4.55 +C AMD steam +C AMD excavator +C AMD bobcat 5.30 +C AMD jaguar 5.28 +C Intel P4 16.2 17.1 +C Intel core2 5.26 +C Intel NHM 5.09 +C Intel SBR 3.21 +C Intel IBR 2.96 +C Intel HWL 2.81 +C Intel BWL 2.76 +C Intel SKL 2.76 +C Intel atom 21.5 +C Intel SLM 9.5 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjörn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%rbx') + +define(`I',`$1') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +IFDOS(` define(`up', ``%rsi'')') dnl +IFDOS(` define(`rp', ``%rcx'')') dnl +IFDOS(` define(`v0', ``%r9'')') dnl +IFDOS(` define(`r9', ``rdi'')') dnl +IFDOS(` define(`n_param',``%r8'')') dnl + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) + +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax + push %rbx + lea (up,n_param,8), up + lea (rp,n_param,8), rp + + test $1, R8(n_param) + jnz L(b13) + +L(b02): xor R32(%r11), R32(%r11) + test $2, R8(n_param) + jnz L(b2) + +L(b0): mov $1, R32(n) + sub n_param, n + mul v0 + mov %rdx, %r9 + mov -8(rp,n,8), %r8 + jmp L(e0) + + ALIGN(16) +L(b2): mov $-1, n + sub n_param, n + mul v0 + mov 8(rp,n,8), %r8 + mov %rdx, %r9 + jmp L(e2) + + ALIGN(16) +L(b13): xor R32(%r9), R32(%r9) + test $2, R8(n_param) + jnz L(b3) + +L(b1): mov $2, R32(n) + sub n_param, n + jns L(1) + mul v0 + mov -16(rp,n,8), %r10 + mov %rdx, %r11 + jmp L(e1) + + ALIGN(16) +L(b3): xor R32(n), R32(n) + sub n_param, n + mul v0 + mov (rp,n,8), %r10 + jmp L(e3) + + ALIGN(32) +L(top): mul v0 + mov -16(rp,n,8), %r10 + ADDSUB %r11, %r8 + mov %rdx, %r11 + adc $0, %r9 + mov %r8, -24(rp,n,8) +L(e1): ADDSUB %rax, %r10 + mov -8(up,n,8), %rax + adc $0, %r11 + mul v0 + ADDSUB %r9, %r10 + mov %rdx, %r9 + mov -8(rp,n,8), %r8 + adc $0, %r11 + mov %r10, -16(rp,n,8) +L(e0): ADDSUB %rax, %r8 + adc $0, %r9 + mov (up,n,8), %rax + mul v0 + mov (rp,n,8), %r10 + ADDSUB %r11, %r8 + mov %r8, -8(rp,n,8) + adc $0, %r9 +L(e3): mov %rdx, %r11 + ADDSUB %rax, %r10 + mov 8(up,n,8), %rax + adc $0, %r11 + mul v0 + mov 8(rp,n,8), %r8 + ADDSUB %r9, %r10 + mov %rdx, %r9 + mov %r10, (rp,n,8) + adc $0, %r11 +L(e2): ADDSUB %rax, %r8 + adc $0, %r9 + mov 16(up,n,8), %rax + add $4, n + jnc L(top) + +L(end): mul v0 + mov I(-8(rp),-16(rp,n,8)), %r10 + ADDSUB %r11, %r8 + mov %rdx, %r11 + adc $0, %r9 + mov %r8, I(-16(rp),-24(rp,n,8)) + ADDSUB %rax, %r10 + adc $0, %r11 + 
ADDSUB %r9, %r10 + adc $0, %r11 + mov %r10, I(-8(rp),-16(rp,n,8)) + mov %r11, %rax + + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret + + ALIGN(16) +L(1): mul v0 + ADDSUB %rax, -8(rp) + mov %rdx, %rax + adc $0, %rax + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm new file mode 100644 index 0000000..43abcc8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm @@ -0,0 +1,174 @@ +dnl AMD64 mpn_cnd_add_n. + +dnl Copyright 2011-2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel PNR 3.0 +C Intel NHM 3.75 +C Intel SBR 1.93 +C Intel IBR 1.89 +C Intel HWL 1.78 +C Intel BWL 1.50 +C Intel SKL 1.50 +C Intel atom +C Intel SLM 4.0 +C VIA nano + +C NOTES +C * It might seem natural to use the cmov insn here, but since this function +C is supposed to have the exact same execution pattern for cnd true and +C false, and since cmov's documentation is not clear about whether it +C actually reads both source operands and writes the register for a false +C condition, we cannot use it. 
+ +C INPUT PARAMETERS +define(`cnd_arg', `%rdi') dnl rcx +define(`rp', `%rsi') dnl rdx +define(`up', `%rdx') dnl r8 +define(`vp', `%rcx') dnl r9 +define(`n', `%r8') dnl rsp+40 + +define(`cnd', `%rbx') + +define(ADDSUB, add) +define(ADCSBB, adc) +define(func, mpn_cnd_add_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_cnd_add_n) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), R32(%r8)') + push %rbx + + neg cnd_arg + sbb cnd, cnd C make cnd mask + + test $1, R8(n) + jz L(x0) +L(x1): test $2, R8(n) + jz L(b1) + +L(b3): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + and cnd, %rdi + and cnd, %r9 + and cnd, %r10 + ADDSUB (up), %rdi + mov %rdi, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + sub $3, n + jnz L(top) + jmp L(end) + +L(x0): xor R32(%rax), R32(%rax) + test $2, R8(n) + jz L(top) + +L(b2): mov (vp), %rdi + mov 8(vp), %r9 + and cnd, %rdi + and cnd, %r9 + ADDSUB (up), %rdi + mov %rdi, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + sub $2, n + jnz L(top) + jmp L(end) + +L(b1): mov (vp), %rdi + and cnd, %rdi + ADDSUB (up), %rdi + mov %rdi, (rp) + sbb R32(%rax), R32(%rax) C save carry + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + dec n + jz L(end) + + ALIGN(16) +L(top): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + mov 24(vp), %r11 + lea 32(vp), vp + and cnd, %rdi + and cnd, %r9 + and cnd, %r10 + and cnd, %r11 + add R32(%rax), R32(%rax) C restore carry + ADCSBB (up), %rdi + mov %rdi, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + ADCSBB 24(up), %r11 + lea 32(up), up + mov %r11, 24(rp) + lea 32(rp), rp + sbb R32(%rax), R32(%rax) C save carry + sub $4, n + jnz L(top) + +L(end): neg R32(%rax) + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm new file mode 100644 index 0000000..f55492b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm @@ -0,0 +1,200 @@ +dnl AMD64 mpn_cnd_add_n, mpn_cnd_sub_n + +dnl Copyright 2011-2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel PNR 3.0 +C Intel NHM 2.75 +C Intel SBR 2.15 +C Intel IBR 1.96 +C Intel HWL 2.0 +C Intel BWL 1.65 +C Intel SKL 1.65 +C Intel atom +C Intel SLM 4.5 +C VIA nano + +C NOTES +C * It might seem natural to use the cmov insn here, but since this function +C is supposed to have the exact same execution pattern for cnd true and +C false, and since cmov's documentation is not clear about whether it +C actually reads both source operands and writes the register for a false +C condition, we cannot use it. +C * Given that we have a dedicated cnd_add_n, it might look strange that this +C file provides cnd_add_n and not just cnd_sub_n. But that's harmless, and +C this file's generality might come in handy for some pipeline. + +C INPUT PARAMETERS +define(`cnd_arg', `%rdi') dnl rcx +define(`rp', `%rsi') dnl rdx +define(`up', `%rdx') dnl r8 +define(`vp', `%rcx') dnl r9 +define(`n', `%r8') dnl rsp+40 + +define(`cnd', `%rbx') + +ifdef(`OPERATION_cnd_add_n',` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_cnd_add_n)') +ifdef(`OPERATION_cnd_sub_n',` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_cnd_sub_n)') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), R32(%r8)') + push %rbx + push %rbp + push %r12 + push %r13 + + neg cnd_arg + sbb cnd, cnd C make cnd mask + + test $1, R8(n) + jz L(x0) +L(x1): test $2, R8(n) + jz L(b1) + +L(b3): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + and cnd, %rdi + mov (up), %r12 + and cnd, %r9 + mov 8(up), %r13 + and cnd, %r10 + mov 16(up), %rbp + ADDSUB %rdi, %r12 + mov %r12, (rp) + ADCSBB %r9, %r13 + mov %r13, 8(rp) + ADCSBB %r10, %rbp + mov %rbp, 16(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + sub $3, n + jnz L(top) + jmp L(end) + +L(x0): xor R32(%rax), R32(%rax) + test $2, R8(n) + jz L(top) + +L(b2): mov (vp), %rdi + mov 8(vp), %r9 + mov (up), %r12 + and cnd, %rdi + mov 8(up), %r13 + and cnd, %r9 + ADDSUB %rdi, %r12 + mov %r12, (rp) + ADCSBB %r9, %r13 + mov %r13, 8(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + sub $2, n + jnz L(top) + jmp L(end) + +L(b1): mov (vp), %rdi + mov (up), %r12 + and cnd, %rdi + ADDSUB %rdi, %r12 + mov %r12, (rp) + sbb R32(%rax), R32(%rax) C save carry + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + dec n + jz L(end) + + ALIGN(16) +L(top): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + mov 24(vp), %r11 + lea 32(vp), vp + and cnd, %rdi + mov (up), %r12 + and cnd, %r9 + mov 8(up), %r13 + and cnd, %r10 + mov 16(up), %rbp + and cnd, %r11 + add R32(%rax), R32(%rax) C restore carry + mov 24(up), %rax + lea 32(up), up + ADCSBB %rdi, %r12 + mov %r12, (rp) + ADCSBB %r9, %r13 + mov %r13, 8(rp) + ADCSBB %r10, %rbp + mov %rbp, 16(rp) + ADCSBB %r11, %rax + mov %rax, 24(rp) + lea 32(rp), rp + sbb R32(%rax), R32(%rax) C save carry + sub $4, n + jnz L(top) + +L(end): neg R32(%rax) + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm new file mode 100644 index 0000000..d9f371f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_divrem_1 + +dnl Copyright 2013 
Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_divrem_1 mpn_preinv_divrem_1) +include_mpn(`x86_64/divrem_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm new file mode 100644 index 0000000..4723093 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h new file mode 100644 index 0000000..36f4512 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h @@ -0,0 +1,241 @@ +/* Sandy Bridge gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3400-3800 MHz Intel Xeon E3-1270 Sandy Bridge */ +/* FFT tuning limit = 468,152,320 */ +/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 30 + +#define DIV_1_VS_MUL_1_PERCENT 298 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 154 +#define MUL_TOOM6H_THRESHOLD 254 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 148 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 93 +#define SQR_TOOM4_THRESHOLD 248 +#define SQR_TOOM6_THRESHOLD 342 +#define SQR_TOOM8_THRESHOLD 462 + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 396 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 396, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 79,11}, { 47,10}, \ + { 95,12}, { 31,11}, { 63,10}, { 135,11}, \ + { 79,10}, { 159, 9}, { 319,10}, { 167,11}, \ + { 95, 7}, { 1535, 8}, { 831,10}, { 223, 9}, \ + { 447,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,10}, { 895,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 607,12}, { 319,11}, { 639,12}, \ + { 351,11}, { 703,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 543,11}, \ + { 1087,12}, { 607,13}, { 319,12}, { 735,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,13}, { 639,12}, 
{ 1279,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ + { 2815,13}, { 1471,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1919,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3071,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1919,16}, { 511,15}, \ + { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,13}, { 5887,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4095,15}, { 8191,16}, { 4607,15}, { 9983,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 219 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 336 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 336, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 6}, \ + { 4351, 7}, { 2303, 8}, { 1215,12}, { 95,11}, \ + { 191,10}, { 383,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,10}, { 607,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,12}, \ + { 223,11}, { 447,10}, { 895,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 607,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \ + { 1279,13}, { 703,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ + { 2815,13}, { 1471,14}, { 767,13}, { 1599,12}, \ + { 3199,13}, { 1663,14}, { 895,13}, { 1791,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \ + { 1407,13}, { 2815,15}, { 767,14}, { 1535,13}, \ + { 3199,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1791,16}, { 511,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, 
{ 6911,15}, { 3839,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 210 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 62 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 66 +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 172 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 92 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 170 +#define INV_APPR_THRESHOLD 167 + +#define BINV_NEWTON_THRESHOLD 228 +#define REDC_1_TO_REDC_2_THRESHOLD 36 +#define REDC_2_TO_REDC_N_THRESHOLD 55 + +#define MU_DIV_QR_THRESHOLD 1387 +#define MU_DIVAPPR_Q_THRESHOLD 1387 +#define MUPI_DIV_QR_THRESHOLD 77 +#define MU_BDIV_QR_THRESHOLD 1187 +#define MU_BDIV_Q_THRESHOLD 1442 + +#define POWM_SEC_TABLE 1,16,191,452,1297 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 1160 +#define SET_STR_PRECOMPUTE_THRESHOLD 2043 + +#define FAC_DSC_THRESHOLD 426 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD2_DIV1_METHOD 5 /* 0.74% faster than 3 */ +#define HGCD_THRESHOLD 96 +#define HGCD_APPR_THRESHOLD 60 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 465 +#define GCDEXT_DC_THRESHOLD 345 +#define JACOBI_BASE_METHOD 1 /* 32.22% faster than 4 */ + +/* Tuneup completed successfully, took 276198 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm new file mode 100644 index 0000000..a1cbc31 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for Intel Sandy Bridge. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86_64/fastsse/lshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm new file mode 100644 index 0000000..ac90edb --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshiftc optimised for Intel Sandy Bridge. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshiftc) +include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm new file mode 100644 index 0000000..a43a117 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm @@ -0,0 +1,199 @@ +dnl X86-64 mpn_mul_1 optimised for Intel Sandy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013, 2017 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD excavator +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core2 +C Intel NHM +C Intel SBR 2.49 +C Intel IBR 2.32 +C Intel HWL 2.44 +C Intel BWL 2.43 +C Intel SKL 2.47 +C Intel atom +C Intel SLM +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up_param',`%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 +define(`cin', `%r8') C stack + +define(`up', `%rsi') C same as rp_param +define(`n', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFDOS(` define(`rp', `%rcx')') +IFDOS(` define(`up_param',`%rdx')') +IFDOS(` define(`n_param', `%r8')') +IFDOS(` define(`v0', `%r9')') +IFDOS(` define(`cin', `48(%rsp)')') + +IFDOS(` define(`up', `%rsi')') +IFDOS(` define(`n', `%r8')') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1) +IFDOS(` push %rsi ') + mov (up_param), %rax +IFSTD(` mov n_param, n ') + lea (up_param,n_param,8), up + lea -8(rp,n_param,8), rp + neg n + mul v0 + + test $1, R8(n) + jz L(x0) +L(x1): mov %rax, %r11 + mov %rdx, %r10 + test $2, R8(n) + jnz L(01) + +L(11): mov 8(up,n,8), %rax + dec n + jmp L(L3) + +L(01): inc n + jnz L(L1) + mov %rax, (rp) + mov %rdx, %rax +IFDOS(` pop %rsi ') + ret + +L(x0): mov %rax, %r10 + mov %rdx, %r11 + mov 8(up,n,8), %rax + test $2, R8(n) + jz L(L0) + +L(10): add $-2, n + jmp L(L2) + + ALIGN(8) +L(top): mov %rdx, %r10 + add %rax, %r11 +L(L1): mov 0(up,n,8), %rax + adc $0, %r10 + mul v0 + add %rax, %r10 + mov %r11, 0(rp,n,8) + mov 8(up,n,8), %rax + mov %rdx, %r11 +L(L0c): adc $0, %r11 +L(L0): mul v0 + mov %r10, 8(rp,n,8) + add %rax, %r11 + mov %rdx, %r10 +L(L3c): mov 16(up,n,8), %rax + adc $0, %r10 +L(L3): mul v0 + mov %r11, 16(rp,n,8) + mov %rdx, %r11 + add %rax, %r10 +L(L2c): mov 24(up,n,8), %rax + adc $0, %r11 +L(L2): mul v0 + mov %r10, 24(rp,n,8) + add $4, n + jnc L(top) + +L(end): add %rax, %r11 + mov %rdx, %rax + adc $0, %rax + mov %r11, (rp) + +IFDOS(` pop %rsi ') + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mul_1c) +IFDOS(` push %rsi ') + mov (up_param), %rax +IFSTD(` mov n_param, n ') + lea (up_param,n_param,8), up + lea -8(rp,n_param,8), rp + neg n + mul v0 + + test $1, R8(n) + jz L(x0c) +L(x1c): mov %rax, %r11 + mov %rdx, %r10 + test $2, R8(n) + jnz L(01c) + +L(11c): add cin, %r11 + dec n + jmp L(L3c) + +L(01c): add cin, %r11 + inc n + jnz L(L1) + mov %r11, (rp) + mov %rdx, %rax + adc $0, %rax +IFDOS(` pop %rsi ') + ret + +L(x0c): mov %rax, %r10 + mov %rdx, %r11 + test $2, R8(n) + jz L(00c) + +L(10c): add $-2, n + add cin, %r10 + jmp L(L2c) + +L(00c): add cin, %r10 + mov 8(up,n,8), %rax + jmp L(L0c) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm new file mode 100644 index 0000000..781534d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm @@ -0,0 +1,167 @@ +dnl AMD64 mpn_mul_2 optimised for Intel Sandy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C AMD K8,K9 8.03 +C AMD K10 8.03 +C AMD bull 9.19 +C AMD pile 9.16 +C AMD steam +C AMD excavator +C AMD bobcat 10.6 +C AMD jaguar 11.0 +C Intel P4 26.0 +C Intel core2 8.73 +C Intel NHM 8.55 +C Intel SBR 5.15 +C Intel IBR 4.57 +C Intel HWL 4.08 +C Intel BWL 4.10 +C Intel SKL 4.14 +C Intel atom 39.5 +C Intel SLM 26.3 +C VIA nano + +C This code is the result of running a code generation and optimisation tool +C suite written by David Harvey and Torbjorn Granlund. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rbx') +define(`v1', `%rbp') + +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (vp), v0 + mov 8(vp), v1 + + mov (up), %rax + lea (up,n_param,8), up + lea (rp,n_param,8), rp + + test $1, R8(n_param) + jnz L(b1) + +L(b0): mov $0, R32(n) + sub n_param, n + xor w0, w0 + mul v0 + mov %rax, w2 + mov %rdx, w1 + mov (up,n,8), %rax + jmp L(lo0) + +L(b1): mov $1, R32(n) + sub n_param, n + xor w2, w2 + mul v0 + mov %rax, w0 + mov %rdx, w3 + mov -8(up,n,8), %rax + mul v1 + jmp L(lo1) + + ALIGN(32) +L(top): mul v0 + add %rax, w0 C 1 + mov %rdx, w3 C 2 + adc $0, w3 C 2 + mov -8(up,n,8), %rax + mul v1 + add w1, w0 C 1 + adc $0, w3 C 2 +L(lo1): add %rax, w2 C 2 + mov w0, -8(rp,n,8) C 1 + mov %rdx, w0 C 3 + adc $0, w0 C 3 + mov (up,n,8), %rax + mul v0 + add %rax, w2 C 2 + mov %rdx, w1 C 3 + adc $0, w1 C 3 + add w3, w2 C 2 + mov (up,n,8), %rax + adc $0, w1 C 1 +L(lo0): mul v1 + mov w2, (rp,n,8) C 2 + add %rax, w0 C 3 + mov %rdx, w2 C 4 + mov 8(up,n,8), %rax + adc $0, w2 C 4 + add $2, n + jnc L(top) + +L(end): mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 + mov I(-8(up),-8(up,n,8)), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, I(-8(rp),-8(rp,n,8)) + adc $0, %rdx + add w3, w2 + mov w2, I((rp),(rp,n,8)) + adc $0, %rdx + mov %rdx, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm new file mode 100644 index 0000000..35fd1cc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm @@ -0,0 +1,407 @@ +dnl AMD64 mpn_mul_basecase optimised for Intel Sandy bridge and Ivy bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 mul_2 mul_3 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR 2.5 2.5 - 2.95 +C Intel IBR 2.4 2.3 - 2.68 +C Intel HWL 2.35 2.0 - 2.5 +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Fix the addmul_2 fluctuation affecting SBR. +C * Improve feed-in code, avoiding zeroing of many registers and dummy adds in +C the loops at the expense of code size. +C * Adjoin a mul_3, avoiding slow mul_1 for odd vn. +C * Consider replacing the 2-way mul_2 code with 4-way code, for a very slight +C speedup. +C * Further micro-optimise. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`un', `%rbx') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`n', `%rbp') +define(`v0', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + mov un_param, un C free up rdx + neg un + + mov (up), %rax C shared for mul_1 and mul_2 + lea (up,un_param,8), up C point at operand end + lea (rp,un_param,8), rp C point at rp[un-1] + + mov (vp), v0 C shared for mul_1 and mul_2 + mul v0 C shared for mul_1 and mul_2 + + test $1, R8(vn) + jz L(do_mul_2) + +L(do_mul_1): + test $1, R8(un) + jnz L(m1x1) + +L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ... + mov %rdx, w1 + mov 8(up,un,8), %rax + test $2, R8(un) + jnz L(m110) + +L(m100):lea 2(un), n C un = 4, 8, 12, ... + jmp L(m1l0) + +L(m110):lea (un), n C un = 2, 6, 10, ... + jmp L(m1l2) + +L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ... + mov %rdx, w0 + test $2, R8(un) + jz L(m111) + +L(m101):lea 3(un), n C un = 1, 5, 9, ... + test n, n + js L(m1l1) + mov %rax, -8(rp) + mov %rdx, (rp) + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(m111):lea 1(un), n C un = 3, 7, 11, ... 
+ mov 8(up,un,8), %rax + jmp L(m1l3) + + ALIGN(16) C FIXME +L(m1tp):mov %rdx, w0 + add %rax, w1 +L(m1l1):mov -16(up,n,8), %rax + adc $0, w0 + mul v0 + add %rax, w0 + mov w1, -24(rp,n,8) + mov -8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(m1l0):mul v0 + mov w0, -16(rp,n,8) + add %rax, w1 + mov %rdx, w0 + mov (up,n,8), %rax + adc $0, w0 +L(m1l3):mul v0 + mov w1, -8(rp,n,8) + mov %rdx, w1 + add %rax, w0 + mov 8(up,n,8), %rax + adc $0, w1 +L(m1l2):mul v0 + mov w0, (rp,n,8) + add $4, n + jnc L(m1tp) + +L(m1ed):add %rax, w1 + adc $0, %rdx + mov w1, I(-8(rp),-24(rp,n,8)) + mov %rdx, I((rp),-16(rp,n,8)) + + dec R32(vn) + jz L(ret2) + + lea 8(vp), vp + lea 8(rp), rp + push %r12 + push %r13 + push %r14 + jmp L(do_addmul) + +L(do_mul_2): +define(`v1', `%r14') + push %r12 + push %r13 + push %r14 + + mov 8(vp), v1 + + test $1, R8(un) + jnz L(m2b1) + +L(m2b0):lea (un), n + xor w0, w0 + mov %rax, w2 + mov %rdx, w1 + jmp L(m2l0) + +L(m2b1):lea 1(un), n + xor w1, w1 + xor w2, w2 + mov %rax, w0 + mov %rdx, w3 + jmp L(m2l1) + + ALIGN(32) +L(m2tp):mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 +L(m2l1):mov -8(up,n,8), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, -8(rp,n,8) + mov %rdx, w0 + adc $0, w0 + mov (up,n,8), %rax + mul v0 + add %rax, w2 + mov %rdx, w1 + adc $0, w1 + add w3, w2 +L(m2l0):mov (up,n,8), %rax + adc $0, w1 + mul v1 + mov w2, (rp,n,8) + add %rax, w0 + mov %rdx, w2 + mov 8(up,n,8), %rax + adc $0, w2 + add $2, n + jnc L(m2tp) + +L(m2ed):mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 + mov I(-8(up),-8(up,n,8)), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, I(-8(rp),-8(rp,n,8)) + adc $0, %rdx + add w3, w2 + mov w2, I((rp),(rp,n,8)) + adc $0, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + add $-2, R32(vn) + jz L(ret5) + lea 16(vp), vp + lea 16(rp), rp + + +L(do_addmul): + push %r15 + push vn C save vn in new stack slot +define(`vn', `(%rsp)') +define(`X0', `%r14') +define(`X1', `%r15') +define(`v1', `%r8') + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + mov (up,un,8), %rax + mul v0 + test $1, R8(un) + jnz L(a1x1) + +L(a1x0):mov (rp,un,8), X0 + xor w0, w0 + mov %rdx, w1 + test $2, R8(un) + jnz L(a110) + +L(a100):lea 2(un), n C un = 4, 8, 12, ... + add %rax, X0 + adc $0, w1 + mov (up,un,8), %rax + mul v1 + mov 8(rp,un,8), X1 + jmp L(lo0) + +L(a110):lea (un), n C un = 2, 6, 10, ... + xor w3, w3 + jmp L(lo2) + +L(a1x1):mov (rp,un,8), X1 + xor w2, w2 + xor w1, w1 + test $2, R8(un) + jz L(a111) + +L(a101):lea 3(un), n C un = 1, 5, 9, ... + mov %rdx, w3 + add %rax, X1 + mov (up,un,8), %rax + mov 8(rp,un,8), X0 + adc $0, w3 + jmp L(top) + +L(a111):lea 1(un), n C un = 3, 7, 11, ... 
+ jmp L(lo3) + + ALIGN(32) +L(top): mul v1 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + add w1, X1 + adc $0, w3 + add w2, X0 + adc $0, w0 + mov -16(up,n,8), %rax + mul v0 + add %rax, X0 + mov %rdx, w1 + adc $0, w1 + mov -16(up,n,8), %rax + mul v1 + mov X1, -24(rp,n,8) + mov -8(rp,n,8), X1 + add w3, X0 + adc $0, w1 +L(lo0): mov %rdx, w2 + mov X0, -16(rp,n,8) + add %rax, X1 + adc $0, w2 + mov -8(up,n,8), %rax + add w0, X1 + adc $0, w2 + mul v0 +L(lo3): add %rax, X1 + mov %rdx, w3 + adc $0, w3 + mov -8(up,n,8), %rax + mul v1 + add w1, X1 + mov (rp,n,8), X0 + adc $0, w3 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + mov (up,n,8), %rax + mul v0 + add w2, X0 + mov X1, -8(rp,n,8) + mov %rdx, w1 + adc $0, w0 +L(lo2): add %rax, X0 + adc $0, w1 + mov (up,n,8), %rax + add w3, X0 + adc $0, w1 + mul v1 + mov 8(rp,n,8), X1 + add %rax, X1 + mov %rdx, w2 + adc $0, w2 + mov 8(up,n,8), %rax + mov X0, (rp,n,8) + mul v0 + add w0, X1 + mov %rdx, w3 + adc $0, w2 + add %rax, X1 + mov 8(up,n,8), %rax + mov 16(rp,n,8), X0 C useless but harmless in final iter + adc $0, w3 + add $4, n + jnc L(top) + +L(end): mul v1 + add w1, X1 + adc $0, w3 + add w2, %rax + adc $0, %rdx + mov X1, I(-8(rp),-24(rp,n,8)) + add w3, %rax + adc $0, %rdx + mov %rax, I((rp),-16(rp,n,8)) + mov %rdx, I(8(rp),-8(rp,n,8)) + + addl $-2, vn + lea 16(vp), vp + lea 16(rp), rp + jnz L(outer) + + pop %rax C deallocate vn slot + pop %r15 +L(ret5):pop %r14 + pop %r13 + pop %r12 +L(ret2):pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm new file mode 100644 index 0000000..a41a8ac --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm @@ -0,0 +1,384 @@ +dnl AMD64 mpn_mullo_basecase optimised for Intel Sandy bridge and Ivy bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR 2.5 2.95 +C Intel IBR 2.3 2.68 +C Intel HWL 2.0 2.5 +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Implement proper cor2, replacing current cor0. +C * Offset n by 2 in order to avoid the outer loop cmp. 
(And sqr_basecase?) +C * Micro-optimise. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r8') +define(`X0', `%r14') +define(`X1', `%r15') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`i', `%rbp') +define(`v0', `%r9') +define(`v1', `%rbx') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + + mov (up), %rax + mov vp_param, vp + + cmp $4, n + jb L(small) + + mov (vp_param), v0 + push %rbx + lea (rp,n,8), rp C point rp at R[un] + push %rbp + lea (up,n,8), up C point up right after U's end + push %r12 + neg n + push %r13 + mul v0 + mov 8(vp), v1 + + test $1, R8(n) + jnz L(m2b1) + +L(m2b0):lea (n), i + xor w0, w0 + mov %rax, w2 + mov %rdx, w1 + jmp L(m2l0) + +L(m2b1):lea 1(n), i + xor w1, w1 + xor w2, w2 + mov %rax, w0 + mov %rdx, w3 + jmp L(m2l1) + + ALIGN(32) +L(m2tp):mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 +L(m2l1):mov -8(up,i,8), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, -8(rp,i,8) + mov %rdx, w0 + adc $0, w0 + mov (up,i,8), %rax + mul v0 + add %rax, w2 + mov %rdx, w1 + adc $0, w1 + add w3, w2 +L(m2l0):mov (up,i,8), %rax + adc $0, w1 + mul v1 + mov w2, (rp,i,8) + add %rax, w0 + mov %rdx, w2 C FIXME: dead in last iteration + mov 8(up,i,8), %rax + adc $0, w2 C FIXME: dead in last iteration + add $2, i + jnc L(m2tp) + +L(m2ed):imul v0, %rax + add w0, %rax + add w1, %rax + mov %rax, I(-8(rp),-8(rp,i,8)) + + add $2, n + lea 16(vp), vp + lea -16(up), up + cmp $-2, n + jge L(cor1) + + push %r14 + push %r15 + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + mov (up,n,8), %rax + mul v0 + test $1, R8(n) + jnz L(a1x1) + +L(a1x0):mov (rp,n,8), X1 + xor w2, w2 + xor w1, w1 + test $2, R8(n) + jnz L(a110) + +L(a100):lea 1(n), i + jmp L(lo0) + +L(a110):lea 3(n), i + mov %rdx, w3 + add %rax, X1 + mov (up,n,8), %rax + mov 8(rp,n,8), X0 + adc $0, w3 + jmp L(lo2) + +L(a1x1):mov (rp,n,8), X0 + xor w0, w0 + mov %rdx, w1 + test $2, R8(n) + jz L(a111) + +L(a101):lea 2(n), i + add %rax, X0 + adc $0, w1 + mov (up,n,8), %rax + mul v1 + mov 8(rp,n,8), X1 + jmp L(lo1) + +L(a111):lea (n), i + xor w3, w3 + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo2): mul v1 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + add w1, X1 + adc $0, w3 + add w2, X0 + adc $0, w0 + mov -16(up,i,8), %rax + mul v0 + add %rax, X0 + mov %rdx, w1 + adc $0, w1 + mov -16(up,i,8), %rax + mul v1 + mov X1, -24(rp,i,8) + mov -8(rp,i,8), X1 + add w3, X0 + adc $0, w1 +L(lo1): mov %rdx, w2 + mov X0, -16(rp,i,8) + add %rax, X1 + adc $0, w2 + mov -8(up,i,8), %rax + add w0, X1 + adc $0, w2 + mul v0 +L(lo0): add %rax, X1 + mov %rdx, w3 + adc $0, w3 + mov -8(up,i,8), %rax + mul v1 + add w1, X1 + mov (rp,i,8), X0 + adc $0, w3 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + mov (up,i,8), %rax + mul v0 + add w2, X0 + mov X1, -8(rp,i,8) + mov %rdx, w1 + adc $0, w0 +L(lo3): add %rax, X0 + adc $0, w1 + mov (up,i,8), %rax + add w3, X0 + adc $0, w1 + mul v1 + mov 8(rp,i,8), X1 + add %rax, X1 + mov %rdx, w2 + adc $0, w2 + mov 8(up,i,8), %rax + mov X0, (rp,i,8) + mul v0 + add w0, X1 + mov %rdx, w3 + adc $0, w2 + add %rax, X1 + mov 8(up,i,8), %rax + mov 16(rp,i,8), X0 + adc $0, w3 + add $4, i + jnc L(top) + +L(end): imul v1, %rax + add %rax, X0 + add w1, X1 + adc $0, w3 + add w2, X0 + 
mov I(-8(up),-16(up,i,8)), %rax + imul v0, %rax + add X0, %rax + mov X1, I(-16(rp),-24(rp,i,8)) + add w3, %rax + mov %rax, I(-8(rp),-16(rp,i,8)) + + add $2, n + lea 16(vp), vp + lea -16(up), up + cmp $-2, n + jl L(outer) + + pop %r15 + pop %r14 + + jnz L(cor0) + +L(cor1):mov (vp), v0 + mov 8(vp), v1 + mov -16(up), %rax + mul v0 C u0 x v2 + add -16(rp), %rax C FIXME: rp[0] still available in reg? + adc -8(rp), %rdx C FIXME: rp[1] still available in reg? + mov -8(up), %r10 + imul v0, %r10 + mov -16(up), %r11 + imul v1, %r11 + mov %rax, -16(rp) + add %r10, %r11 + add %rdx, %r11 + mov %r11, -8(rp) + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(cor0):mov (vp), %r11 + imul -8(up), %r11 + add %rax, %r11 + mov %r11, -8(rp) + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + ALIGN(16) +L(small): + cmp $2, n + jae L(gt1) +L(n1): imul (vp_param), %rax + mov %rax, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp_param), %r9 + mul %r9 + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp_param), %r9 + mul %r9 C u0 x v0 + mov %rax, (rp) + mov %rdx, %r10 + mov 8(up), %rax + mul %r9 C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r11 + mov (up), %rax + mul %r11 C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r11 C u1 x v1 + add %r11, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm new file mode 100644 index 0000000..f0dbe07 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm @@ -0,0 +1,546 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR 3.24 +C Intel IBR 3.04 +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. 
+ +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea 8(mp_param,n,8), mp + lea 8(up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(bx0) + +L(bx1): test $2, R8(n) + jz L(b3) + +L(b1): cmp $-1, R32(n) + jz L(n1) + +L(otp1):lea 1(n), i + mov -8(mp,n,8), %rax + mul q0 + mov -8(up,n,8), %r10 + mov %rdx, %r11 + add %rax, %r10 + mov (mp,n,8), %rax + adc $0, %r11 + mul q0 + mov %rdx, %r9 + mov (up,n,8), %rbx + add %rax, %rbx + adc $0, %r9 + mov (mp,i,8), %rax + mul q0 + mov (up,i,8), %r10 + add %r11, %rbx + mov %rbx, -8(up,i,8) C next low remainder limb + adc $0, %r9 + imul u0inv, %rbx C next q limb + jmp L(e1) + + ALIGNx +L(tp1): mul q0 + mov -16(up,i,8), %r10 + add %r11, %rbp + mov %rdx, %r11 + adc $0, %r9 + mov %rbp, -24(up,i,8) + add %rax, %r10 + mov -8(mp,i,8), %rax + adc $0, %r11 + mul q0 + add %r9, %r10 + mov %rdx, %r9 + mov -8(up,i,8), %rbp + adc $0, %r11 + mov %r10, -16(up,i,8) + add %rax, %rbp + adc $0, %r9 + mov (mp,i,8), %rax + mul q0 + mov (up,i,8), %r10 + add %r11, %rbp + mov %rbp, -8(up,i,8) + adc $0, %r9 +L(e1): mov %rdx, %r11 + add %rax, %r10 + mov 8(mp,i,8), %rax + adc $0, %r11 + mul q0 + mov 8(up,i,8), %rbp + add %r9, %r10 + mov %rdx, %r9 + mov %r10, (up,i,8) + adc $0, %r11 + add %rax, %rbp + adc $0, %r9 + mov 16(mp,i,8), %rax + add $4, i + jnc L(tp1) + +L(ed1): mul q0 + mov I(-16(up),-16(up,i,8)), %r10 + add %r11, %rbp + adc $0, %r9 + mov %rbp, I(-24(up),-24(up,i,8)) + add %rax, %r10 + adc $0, %rdx + add %r9, %r10 + adc $0, %rdx + mov %r10, I(-16(up),-16(up,i,8)) + mov %rdx, -8(up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b3): cmp $-3, R32(n) + jz L(n3) + +L(otp3):lea 3(n), i + mov -8(mp,n,8), %rax + mul q0 + mov -8(up,n,8), %r10 + mov %rdx, %r11 + add %rax, %r10 + mov (mp,n,8), %rax + adc $0, %r11 + mul q0 + mov (up,n,8), %rbx + mov %rdx, %r9 + add %rax, %rbx + adc $0, %r9 + mov 8(mp,n,8), %rax + mul q0 + mov 8(up,n,8), %r10 + add %r11, %rbx + mov %rdx, %r11 + adc $0, %r9 + mov %rbx, (up,n,8) + imul u0inv, %rbx C next q limb + jmp L(e3) + + ALIGNx +L(tp3): mul q0 + mov -16(up,i,8), %r10 + add %r11, %rbp + mov %rdx, %r11 + adc $0, %r9 + mov %rbp, -24(up,i,8) +L(e3): add %rax, %r10 + mov -8(mp,i,8), %rax + adc $0, %r11 + mul q0 + add %r9, %r10 + mov %rdx, %r9 + mov -8(up,i,8), %rbp + adc $0, %r11 + mov %r10, -16(up,i,8) + add %rax, %rbp + adc $0, %r9 + mov (mp,i,8), %rax + mul q0 + mov (up,i,8), %r10 + add %r11, %rbp + mov %rbp, -8(up,i,8) + adc $0, %r9 + mov %rdx, %r11 + add %rax, %r10 + mov 8(mp,i,8), %rax + adc $0, %r11 + mul q0 + mov 8(up,i,8), %rbp + add %r9, %r10 + mov %rdx, %r9 + mov %r10, (up,i,8) + adc $0, %r11 + add %rax, %rbp + adc $0, %r9 + mov 
16(mp,i,8), %rax + add $4, i + jnc L(tp3) + +L(ed3): mul q0 + mov I(-16(up),-16(up,i,8)), %r10 + add %r11, %rbp + adc $0, %r9 + mov %rbp, I(-24(up),-24(up,i,8)) + add %rax, %r10 + adc $0, %rdx + add %r9, %r10 + adc $0, %rdx + mov %r10, I(-16(up),-16(up,i,8)) + mov %rdx, -8(up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp3) +C jmp L(cj) + +L(cj): +IFSTD(` lea -8(up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea -8(up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(bx0): test $2, R8(n) + jnz L(b2) + +L(b0): +L(otp0):lea (n), i + mov -8(mp,n,8), %rax + mul q0 + mov %rdx, %r9 + mov -8(up,n,8), %rbp + add %rax, %rbp + adc $0, %r9 + mov (mp,n,8), %rax + mul q0 + mov (up,n,8), %rbx + mov %rdx, %r11 + add %rax, %rbx + mov 8(mp,n,8), %rax + adc $0, %r11 + mul q0 + mov 8(up,n,8), %rbp + add %r9, %rbx + mov %rdx, %r9 + mov %rbx, (up,n,8) + adc $0, %r11 + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): mul q0 + mov -16(up,i,8), %r10 + add %r11, %rbp + mov %rdx, %r11 + adc $0, %r9 + mov %rbp, -24(up,i,8) + add %rax, %r10 + mov -8(mp,i,8), %rax + adc $0, %r11 + mul q0 + add %r9, %r10 + mov %rdx, %r9 + mov -8(up,i,8), %rbp + adc $0, %r11 + mov %r10, -16(up,i,8) + add %rax, %rbp + adc $0, %r9 + mov (mp,i,8), %rax + mul q0 + mov (up,i,8), %r10 + add %r11, %rbp + mov %rbp, -8(up,i,8) + adc $0, %r9 + mov %rdx, %r11 + add %rax, %r10 + mov 8(mp,i,8), %rax + adc $0, %r11 + mul q0 + mov 8(up,i,8), %rbp + add %r9, %r10 + mov %rdx, %r9 + mov %r10, (up,i,8) + adc $0, %r11 +L(e0): add %rax, %rbp + adc $0, %r9 + mov 16(mp,i,8), %rax + add $4, i + jnc L(tp0) + +L(ed0): mul q0 + mov I(-16(up),-16(up,i,8)), %r10 + add %r11, %rbp + adc $0, %r9 + mov %rbp, I(-24(up),-24(up,i,8)) + add %rax, %r10 + adc $0, %rdx + add %r9, %r10 + adc $0, %rdx + mov %r10, I(-16(up),-16(up,i,8)) + mov %rdx, -8(up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + jmp L(cj) + +L(b2): cmp $-2, R32(n) + jz L(n2) + +L(otp2):lea 2(n), i + mov -8(mp,n,8), %rax + mul q0 + mov -8(up,n,8), %rbp + mov %rdx, %r9 + add %rax, %rbp + adc $0, %r9 + mov (mp,n,8), %rax + mul q0 + mov (up,n,8), %rbx + mov %rdx, %r11 + add %rax, %rbx + mov 8(mp,n,8), %rax + adc $0, %r11 + mul q0 + add %r9, %rbx + mov %rdx, %r9 + mov 8(up,n,8), %rbp + adc $0, %r11 + mov %rbx, (up,n,8) + imul u0inv, %rbx C next q limb + jmp L(e2) + + ALIGNx +L(tp2): mul q0 + mov -16(up,i,8), %r10 + add %r11, %rbp + mov %rdx, %r11 + adc $0, %r9 + mov %rbp, -24(up,i,8) + add %rax, %r10 + mov -8(mp,i,8), %rax + adc $0, %r11 + mul q0 + add %r9, %r10 + mov %rdx, %r9 + mov -8(up,i,8), %rbp + adc $0, %r11 + mov %r10, -16(up,i,8) +L(e2): add %rax, %rbp + adc $0, %r9 + mov (mp,i,8), %rax + mul q0 + mov (up,i,8), %r10 + add %r11, %rbp + mov %rbp, -8(up,i,8) + adc $0, %r9 + mov %rdx, %r11 + add %rax, %r10 + mov 8(mp,i,8), %rax + adc $0, %r11 + mul q0 + mov 8(up,i,8), %rbp + add %r9, %r10 + mov %rdx, %r9 + mov %r10, (up,i,8) + adc $0, %r11 + add %rax, %rbp + adc $0, %r9 + mov 16(mp,i,8), %rax + add $4, i + jnc L(tp2) + +L(ed2): mul q0 + mov I(-16(up),-16(up,i,8)), %r10 + add %r11, 
%rbp + adc $0, %r9 + mov %rbp, I(-24(up),-24(up,i,8)) + add %rax, %r10 + adc $0, %rdx + add %r9, %r10 + adc $0, %rdx + mov %r10, I(-16(up),-16(up,i,8)) + mov %rdx, -8(up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp2) + jmp L(cj) + +L(n1): mov (mp_param), %rax + mul q0 + add -16(up), %rax + adc -8(up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov -24(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -16(mp), %rax + mov -16(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -24(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -16(mp), %rax + mov -8(up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc (up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -32(mp), %rax + mov -32(up), %r10 + mul q0 + add %rax, %r10 + mov -24(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov -24(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -16(mp), %rax + add %r11, %rbp + mov -16(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, -24(up) + add %r9, %r10 + adc $0, %r11 + mov %r10, -16(up) + mov %r11, -32(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + jmp L(cj) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm new file mode 100644 index 0000000..fd2eaea --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm @@ -0,0 +1,193 @@ +dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Sandy Bridge. + +dnl Copyright 2003, 2005, 2009-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 4.25 +C Intel P4 21.5 +C Intel core2 3.2 +C Intel NHM 3.87 +C Intel SBR 2.05 +C Intel atom ? 
+C VIA nano 44.9 + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') + +ifdef(`OPERATION_rsh1add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_rsh1add_n) + define(func_nc, mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsh1sub_n) + define(func_nc, mpn_rsh1sub_nc)') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + + neg %r8 C set C flag from parameter + mov (up), %rbp + ADCSBB (vp), %rbp + + jmp L(ent) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %rbp + ADDSUB (vp), %rbp +L(ent): + sbb R32(%rbx), R32(%rbx) C save cy + mov R32(%rbp), R32(%rax) + and $1, R32(%rax) C return value + + mov R32(n), R32(%r11) + and $3, R32(%r11) + + cmp $1, R32(%r11) + je L(do) C jump if n = 1 5 9 ... + +L(n1): cmp $2, R32(%r11) + jne L(n2) C jump unless n = 2 6 10 ... + add R32(%rbx), R32(%rbx) C restore cy + mov 8(up), %r10 + ADCSBB 8(vp), %r10 + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + sbb R32(%rbx), R32(%rbx) C save cy + + shrd $1, %r10, %rbp + mov %rbp, -8(rp) + jmp L(cj1) + +L(n2): cmp $3, R32(%r11) + jne L(n3) C jump unless n = 3 7 11 ... + add R32(%rbx), R32(%rbx) C restore cy + mov 8(up), %r9 + mov 16(up), %r10 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + sbb R32(%rbx), R32(%rbx) C save cy + + shrd $1, %r9, %rbp + mov %rbp, -16(rp) + jmp L(cj2) + +L(n3): dec n C come here for n = 4 8 12 ... + add R32(%rbx), R32(%rbx) C restore cy + mov 8(up), %r8 + mov 16(up), %r9 + ADCSBB 8(vp), %r8 + ADCSBB 16(vp), %r9 + mov 24(up), %r10 + ADCSBB 24(vp), %r10 + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + sbb R32(%rbx), R32(%rbx) C save cy + + shrd $1, %r8, %rbp + mov %rbp, -24(rp) + shrd $1, %r9, %r8 + mov %r8, -16(rp) +L(cj2): shrd $1, %r10, %r9 + mov %r9, -8(rp) +L(cj1): mov %r10, %rbp + +L(do): + shr $2, n C 4 + je L(end) C 2 + ALIGN(16) +L(top): add R32(%rbx), R32(%rbx) C restore cy + + mov 8(up), %r8 + mov 16(up), %r9 + ADCSBB 8(vp), %r8 + ADCSBB 16(vp), %r9 + mov 24(up), %r10 + mov 32(up), %r11 + ADCSBB 24(vp), %r10 + ADCSBB 32(vp), %r11 + + lea 32(up), up + lea 32(vp), vp + + sbb R32(%rbx), R32(%rbx) C save cy + + shrd $1, %r8, %rbp + mov %rbp, (rp) + shrd $1, %r9, %r8 + mov %r8, 8(rp) + shrd $1, %r10, %r9 + mov %r9, 16(rp) + shrd $1, %r11, %r10 + mov %r10, 24(rp) + + dec n + mov %r11, %rbp + lea 32(rp), rp + jne L(top) + +L(end): shrd $1, %rbx, %rbp + mov %rbp, (rp) + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm new file mode 100644 index 0000000..4c1c0d4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_rshift optimised for Intel Sandy Bridge. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86_64/fastsse/rshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm new file mode 100644 index 0000000..e436034 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm new file mode 100644 index 0000000..46a3612 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm @@ -0,0 +1,484 @@ +dnl AMD64 mpn_sqr_basecase optimised for Intel Sandy bridge and Ivy bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1 +C AMD K8,K9 ? ? ? +C AMD K10 ? ? ? +C AMD bull ? ? ? +C AMD pile ? ? ? +C AMD steam ? ? ? +C AMD bobcat ? ? ? +C AMD jaguar ? ? ? +C Intel P4 ? ? ? +C Intel core ? ? ? +C Intel NHM ? ? ? +C Intel SBR 2.57 2.93 3.0 +C Intel IBR 2.35 2.66 3.0 +C Intel HWL 2.02 2.5 2.5 +C Intel BWL ? ? ? +C Intel atom ? ? ? +C VIA nano ? ? ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund, except +C that the sqr_diag_addlsh1 loop was manually written. + +C TODO +C * Replace current unoptimised sqr_diag_addlsh1 loop, 2.5 c/l should be easy. +C * Streamline pointer updates. +C * Perhaps suppress a few more xor insns in feed-in code. +C * Make sure we write no dead registers in feed-in code. +C * We might use 32-bit size ops, since n >= 2^32 is non-terminating. Watch +C out for negative sizes being zero-extended, though. +C * The straight-line code for n <= 3 comes from the K8 code, and might be +C quite sub-optimal here. Write specific code, and add code for n = 4. +C * The mul_2 loop has a 10 insn common sequence in the loop start and the +C wind-down code. Try re-rolling it. +C * This file has been the subject to just basic micro-optimisation. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
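As context for the squaring routine that follows: a minimal C sketch of the decomposition the file describes above — accumulate the off-diagonal products u_i*u_j (i < j), double that triangle, then add the diagonal squares (the role of the sqr_diag_addlsh1 pass). This is an illustrative sketch only, not GMP code and not part of the patch; the function name sqr_basecase_ref, the limb/dlimb typedefs, 64-bit limbs, and the use of the compiler extension unsigned __int128 are all assumptions made here for clarity.

#include <stdint.h>
#include <string.h>

typedef uint64_t limb;                 /* assumed 64-bit limb */
typedef unsigned __int128 dlimb;       /* double-limb, GCC/Clang extension */

/* U^2 = 2 * sum_{i<j} u_i*u_j*B^(i+j) + sum_i u_i^2*B^(2i), with B = 2^64.
   rp must have room for 2*n limbs; n >= 1.  Sketch only. */
static void sqr_basecase_ref(limb *rp, const limb *up, int n)
{
  memset(rp, 0, 2 * (size_t) n * sizeof(limb));

  /* Off-diagonal triangle: rp += u_i * u_j * B^(i+j) for all i < j. */
  for (int i = 0; i < n - 1; i++)
    {
      limb cy = 0;
      for (int j = i + 1; j < n; j++)
        {
          dlimb t = (dlimb) up[i] * up[j] + rp[i + j] + cy;
          rp[i + j] = (limb) t;
          cy = (limb) (t >> 64);
        }
      rp[i + n] = cy;                  /* this position is still zero here */
    }

  /* Double the triangle: shift the 2*n-limb area left by one bit. */
  limb cy = 0;
  for (int k = 0; k < 2 * n; k++)
    {
      limb next = rp[k] >> 63;
      rp[k] = (rp[k] << 1) | cy;
      cy = next;                       /* final carry-out is 0: U^2 fits */
    }

  /* Add the diagonal squares u_i^2 at position 2*i. */
  cy = 0;
  for (int i = 0; i < n; i++)
    {
      dlimb t = (dlimb) up[i] * up[i] + rp[2 * i] + cy;
      rp[2 * i] = (limb) t;
      dlimb t2 = (dlimb) rp[2 * i + 1] + (limb) (t >> 64);
      rp[2 * i + 1] = (limb) t2;
      cy = (limb) (t2 >> 64);
    }
}

In the assembly itself these phases are not run as separate sweeps: the triangle is built with fused mul_2/addmul_2 passes and the doubling is folded into the diagonal pass, which is where the feed-in and wind-down complexity noted in the TODO list above comes from.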
+define(`I',`$1') + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') + + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + cmp $2, un_param + jae L(gt1) + + mov (up), %rax + mul %rax + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(gt1): jne L(gt2) + + mov (up), %rax + mov %rax, %r8 + mul %rax + mov 8(up), %r11 + mov %rax, (rp) + mov %r11, %rax + mov %rdx, %r9 + mul %rax + mov %rax, %r10 + mov %r11, %rax + mov %rdx, %r11 + mul %r8 + xor %r8, %r8 + add %rax, %r9 + adc %rdx, %r10 + adc %r8, %r11 + add %rax, %r9 + mov %r9, 8(rp) + adc %rdx, %r10 + mov %r10, 16(rp) + adc %r8, %r11 + mov %r11, 24(rp) + FUNC_EXIT() + ret + +L(gt2): cmp $4, un_param + jae L(gt3) +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%r10') +define(`w2', `%r11') + + mov (up), %rax + mov %rax, %r10 + mul %rax + mov 8(up), %r11 + mov %rax, (rp) + mov %r11, %rax + mov %rdx, 8(rp) + mul %rax + mov 16(up), %rcx + mov %rax, 16(rp) + mov %rcx, %rax + mov %rdx, 24(rp) + mul %rax + mov %rax, 32(rp) + mov %rdx, 40(rp) + + mov %r11, %rax + mul %r10 + mov %rax, %r8 + mov %rcx, %rax + mov %rdx, %r9 + mul %r10 + xor %r10, %r10 + add %rax, %r9 + mov %r11, %rax + mov %r10, %r11 + adc %rdx, %r10 + + mul %rcx + add %rax, %r10 + adc %r11, %rdx + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %rdx, %rdx + adc %r11, %r11 + add %r8, 8(rp) + adc %r9, 16(rp) + adc %r10, 24(rp) + adc %rdx, 32(rp) + adc %r11, 40(rp) + FUNC_EXIT() + ret + +L(gt3): + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%rbx') +define(`w3', `%rbp') +define(`un', `%r12') +define(`n', `%rcx') + +define(`X0', `%r13') +define(`X1', `%r14') + +L(do_mul_2): + mov (up), v0 + push %rbx + lea (rp,un_param,8), rp C point rp at R[un] + mov 8(up), %rax + push %rbp + lea (up,un_param,8), up C point up right after U's end + mov %rax, v1 + push %r12 + mov $1, R32(un) C free up rdx + push %r13 + sub un_param, un + push %r14 + push un + mul v0 + mov %rax, (rp,un,8) + mov 8(up,un,8), %rax + test $1, R8(un) + jnz L(m2b1) + +L(m2b0):lea 2(un), n + xor R32(w1), R32(w1) C FIXME + xor R32(w2), R32(w2) C FIXME + mov %rdx, w0 + jmp L(m2l0) + +L(m2b1):lea 1(un), n + xor R32(w3), R32(w3) C FIXME + xor R32(w0), R32(w0) C FIXME + mov %rdx, w2 + jmp L(m2l1) + + ALIGN(32) +L(m2tp): +L(m2l0):mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 + mov -8(up,n,8), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, -8(rp,n,8) + mov %rdx, w0 + adc $0, w0 + mov (up,n,8), %rax +L(m2l1):mul v0 + add %rax, w2 + mov %rdx, w1 + adc $0, w1 + add w3, w2 + mov (up,n,8), %rax + adc $0, w1 + mul v1 + mov w2, (rp,n,8) + add %rax, w0 + mov %rdx, w2 + mov 8(up,n,8), %rax + adc $0, w2 + add $2, n + jnc L(m2tp) + +L(m2ed):mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 + mov I(-8(up),-8(up,n,8)), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, I(-8(rp),-8(rp,n,8)) + adc $0, %rdx + add w3, w2 + mov w2, I((rp),(rp,n,8)) + adc $0, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + add $2, un C decrease |un| + +L(do_addmul_2): +L(outer): + lea 16(rp), rp + cmp $-2, R32(un) C jump if un C {-1,0} FIXME jump if un C {-2,1} + jge L(corner) C FIXME: move to before the lea above + + mov -8(up,un,8), v0 + mov (up,un,8), %rax + mov %rax, v1 + mul v0 + test $1, R8(un) + jnz L(a1x1) + +L(a1x0):mov (rp,un,8), X0 + xor w0, w0 + mov 8(rp,un,8), X1 + add %rax, X0 + mov %rdx, w1 + adc $0, w1 + xor w2, w2 + mov X0, (rp,un,8) + mov 8(up,un,8), %rax + test 
$2, R8(un) + jnz L(a110) + +L(a100):lea 2(un), n C un = 4, 8, 12, ... + jmp L(lo0) + +L(a110):lea (un), n C un = 2, 6, 10, ... + jmp L(lo2) + +L(a1x1):mov (rp,un,8), X1 + xor w2, w2 + mov 8(rp,un,8), X0 + add %rax, X1 + mov %rdx, w3 + adc $0, w3 + xor w0, w0 + mov 8(up,un,8), %rax + test $2, R8(un) + jz L(a111) + +L(a101):lea 3(un), n C un = 1, 5, 9, ... + jmp L(lo1) + +L(a111):lea 1(un), n C un = 3, 7, 11, ... + jmp L(lo3) + + ALIGN(32) +L(top): mul v1 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + add w1, X1 + adc $0, w3 + add w2, X0 + adc $0, w0 + mov -16(up,n,8), %rax +L(lo1): mul v0 + add %rax, X0 + mov %rdx, w1 + adc $0, w1 + mov -16(up,n,8), %rax + mul v1 + mov X1, -24(rp,n,8) + mov -8(rp,n,8), X1 + add w3, X0 + adc $0, w1 + mov %rdx, w2 + mov X0, -16(rp,n,8) + add %rax, X1 + adc $0, w2 + mov -8(up,n,8), %rax + add w0, X1 + adc $0, w2 +L(lo0): mul v0 + add %rax, X1 + mov %rdx, w3 + adc $0, w3 + mov -8(up,n,8), %rax + mul v1 + add w1, X1 + mov (rp,n,8), X0 + adc $0, w3 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + mov (up,n,8), %rax +L(lo3): mul v0 + add w2, X0 + mov X1, -8(rp,n,8) + mov %rdx, w1 + adc $0, w0 + add %rax, X0 + adc $0, w1 + mov (up,n,8), %rax + add w3, X0 + adc $0, w1 + mul v1 + mov 8(rp,n,8), X1 + add %rax, X1 + mov %rdx, w2 + adc $0, w2 + mov 8(up,n,8), %rax + mov X0, (rp,n,8) +L(lo2): mul v0 + add w0, X1 + mov %rdx, w3 + adc $0, w2 + add %rax, X1 + mov 8(up,n,8), %rax + mov 16(rp,n,8), X0 + adc $0, w3 + add $4, n + jnc L(top) + +L(end): mul v1 + add w1, X1 + adc $0, w3 + add w2, %rax + adc $0, %rdx + mov X1, I(-8(rp),-24(rp,n,8)) + add w3, %rax + adc $0, %rdx + mov %rax, I((rp),-16(rp,n,8)) + mov %rdx, I(8(rp),-8(rp,n,8)) + + add $2, un C decrease |un| + jmp L(outer) C loop until a small corner remains + +L(corner): + pop n + jg L(small_corner) + + lea 8(rp), rp + mov -24(up), v0 + mov -16(up), %rax + mov %rax, v1 + mul v0 + mov -24(rp), X0 + mov -16(rp), X1 + add %rax, X0 + mov %rdx, w1 + adc $0, w1 + xor w2, w2 + mov X0, -24(rp) + mov -8(up), %rax + mul v0 + add $0, X1 + mov %rdx, w3 + adc $0, w2 + add %rax, X1 + mov -8(up), %rax + adc $0, w3 + mul v1 + add w1, X1 + adc $0, w3 + add w2, %rax + adc $0, %rdx + mov X1, -16(rp) + jmp L(com) + +L(small_corner): + mov -8(rp), w3 + mov -16(up), v0 + mov -8(up), %rax + mul v0 +L(com): add w3, %rax + adc $0, %rdx + mov %rax, -8(rp) + mov %rdx, (rp) + +L(sqr_diag_addlsh1): + mov -8(up,n,8), %rax + shl n + mul %rax + mov %rax, (rp,n,8) + + xor R32(%rbx), R32(%rbx) + mov 8(rp,n,8), %r8 + mov 16(rp,n,8), %r9 + jmp L(dm) + + ALIGN(32) +L(dtop):add %r8, %r10 + adc %r9, %rax + mov 8(rp,n,8), %r8 + mov 16(rp,n,8), %r9 + mov %r10, -8(rp,n,8) + mov %rax, (rp,n,8) +L(dm): adc %r8, %r8 + adc %r9, %r9 + mov (up,n,4), %rax + lea (%rdx,%rbx), %r10 + setc R8(%rbx) + mul %rax + add $2, n + js L(dtop) + +L(dend):add %r8, %r10 + adc %r9, %rax + mov %r10, I(-8(rp),-8(rp,n,8)) + mov %rax, I((rp),(rp,n,8)) + adc %rbx, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/darwin.m4 b/gmp-6.3.0/mpn/x86_64/darwin.m4 new file mode 100644 index 0000000..7771476 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/darwin.m4 @@ -0,0 +1,82 @@ +divert(-1) +dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +define(`DARWIN') + +define(`LEA',`dnl +ifdef(`PIC', + `lea $1(%rip), $2' +, + `movabs `$'$1, $2') +') + +dnl Usage: CALL(funcname) +dnl +dnl Simply override the definition in x86_64-defs.m4. + +define(`CALL',`call GSYM_PREFIX`'$1') +define(`TCALL',`jmp GSYM_PREFIX`'$1') + + +dnl Usage: JUMPTABSECT +dnl +dnl CAUTION: Do not put anything sensible here, like RODATA. That works with +dnl some Darwin tool chains, but silently breaks with other. (Note that +dnl putting jump tables in the text segment is a really poor idea for many PC +dnl processors, since they cannot cache the same thing in both L1D and L2I.) + +define(`JUMPTABSECT', `.text') + + +dnl Usage: JMPENT(targlabel,tablabel) + +define(`JMPENT',`dnl +ifdef(`PIC', + `.set $1_tmp, $1-$2 + .long $1_tmp' +, + `.quad $1' +)') + +dnl Target ABI macros. For Darwin we override IFELF (and leave default for +dnl IFDOS and IFSTD). + +define(`IFELF', `') + + +dnl Usage: PROTECT(symbol) +dnl +dnl Used for private GMP symbols that should never be overridden by users. +dnl This can save reloc entries and improve shlib sharing as well as +dnl application startup times + +define(`PROTECT', `.private_extern $1') + + +divert`'dnl diff --git a/gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm b/gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm new file mode 100644 index 0000000..b3d45e2 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm @@ -0,0 +1,247 @@ +dnl x86-64 mpn_div_qr_1n_pi1 +dnl -- Divide an mpn number by a normalized single-limb number, +dnl using a single-limb inverse. + +dnl Contributed to the GNU project by Niels Möller + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l +C AMD K8,K9 13 +C AMD K10 13 +C AMD bull 16.5 +C AMD pile 15 +C AMD steam ? +C AMD bobcat 16 +C AMD jaguar ? +C Intel P4 47 poor +C Intel core 19.25 +C Intel NHM 18 +C Intel SBR 15 poor +C Intel IBR 13 +C Intel HWL 11.7 +C Intel BWL ? +C Intel atom 52 very poor +C VIA nano 19 + + +C INPUT Parameters +define(`QP', `%rdi') +define(`UP', `%rsi') +define(`UN_INPUT', `%rdx') +define(`U1', `%rcx') C Also in %rax +define(`D', `%r8') +define(`DINV', `%r9') + +C Invariants +define(`B2', `%rbp') +define(`B2md', `%rbx') + +C Variables +define(`UN', `%r8') C Overlaps D input +define(`T', `%r10') +define(`U0', `%r11') +define(`U2', `%r12') +define(`Q0', `%r13') +define(`Q1', `%r14') +define(`Q2', `%r15') + +ABI_SUPPORT(STD64) + + ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_div_qr_1n_pi1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + dec UN_INPUT + jnz L(first) + + C Just a single 2/1 division. + C T, U0 are allocated in scratch registers + lea 1(U1), T + mov U1, %rax + mul DINV + mov (UP), U0 + add U0, %rax + adc T, %rdx + mov %rdx, T + imul D, %rdx + sub %rdx, U0 + cmp U0, %rax + lea (U0, D), %rax + cmovnc U0, %rax + sbb $0, T + cmp D, %rax + jc L(single_div_done) + sub D, %rax + add $1, T +L(single_div_done): + mov T, (QP) + FUNC_EXIT() + ret +L(first): + C FIXME: Could delay some of these until we enter the loop. + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + push %rbp + + mov D, B2 + imul DINV, B2 + neg B2 + mov B2, B2md + sub D, B2md + + C D not needed until final reduction + push D + mov UN_INPUT, UN C Clobbers D + + mov DINV, %rax + mul U1 + mov %rax, Q0 + add U1, %rdx + mov %rdx, T + + mov B2, %rax + mul U1 + mov -8(UP, UN, 8), U0 + mov (UP, UN, 8), U1 + mov T, (QP, UN, 8) + add %rax, U0 + adc %rdx, U1 + sbb U2, U2 + dec UN + mov U1, %rax + jz L(final) + + ALIGN(16) + + C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles. 
+ C At entry, %rax holds an extra copy of U1 +L(loop): + C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2 + C Remains to add in B (U1 + c) + mov DINV, Q1 + mov U2, Q2 + and U2, Q1 + neg Q2 + mul DINV + add %rdx, Q1 + adc $0, Q2 + add Q0, Q1 + mov %rax, Q0 + mov B2, %rax + lea (B2md, U0), T + adc $0, Q2 + + C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u + mul U1 + and B2, U2 + add U2, U0 + cmovnc U0, T + + C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c + adc U1, Q1 + mov -8(UP, UN, 8), U0 + adc Q2, 8(QP, UN, 8) + jc L(q_incr) +L(q_incr_done): + add %rax, U0 + mov T, %rax + adc %rdx, %rax + mov Q1, (QP, UN, 8) + sbb U2, U2 + dec UN + mov %rax, U1 + jnz L(loop) + +L(final): + pop D + + mov U2, Q1 + and D, U2 + sub U2, %rax + neg Q1 + + mov %rax, U1 + sub D, %rax + cmovc U1, %rax + sbb $-1, Q1 + + lea 1(%rax), T + mul DINV + add U0, %rax + adc T, %rdx + mov %rdx, T + imul D, %rdx + sub %rdx, U0 + cmp U0, %rax + lea (U0, D), %rax + cmovnc U0, %rax + sbb $0, T + cmp D, %rax + jc L(div_done) + sub D, %rax + add $1, T +L(div_done): + add T, Q0 + mov Q0, (QP) + adc Q1, 8(QP) + jnc L(done) +L(final_q_incr): + addq $1, 16(QP) + lea 8(QP), QP + jc L(final_q_incr) + +L(done): + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(q_incr): + C U1 is not live, so use it for indexing + lea 16(QP, UN, 8), U1 +L(q_incr_loop): + addq $1, (U1) + jnc L(q_incr_done) + lea 8(U1), U1 + jmp L(q_incr_loop) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm b/gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm new file mode 100644 index 0000000..5e59a0a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm @@ -0,0 +1,158 @@ +dnl x86-64 mpn_div_qr_2n_pi1 +dnl -- Divide an mpn number by a normalized 2-limb number, +dnl using a single-limb inverse. + +dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`rp', `%rsi') +define(`up_param', `%rdx') +define(`un', `%rcx') +define(`d1', `%r8') +define(`d0', `%r9') +define(`di_param', `8(%rsp)') + +define(`di', `%r10') +define(`up', `%r11') +define(`u2', `%rbx') +define(`u1', `%r12') +define(`t1', `%r13') +define(`t0', `%r14') +define(`md1', `%r15') + +C TODO +C * Store qh in the same stack slot as di_param, instead of pushing +C it. (we could put it in register %rbp, but then we would need to +C save and restore that instead, which doesn't seem like a win). 
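As context for the pi1 division routines above and below: a minimal C sketch of the 2/1 division step with a precomputed single-limb inverse, the primitive these loops are built around (cf. Möller and Granlund, "Improved division by invariant integers"). This sketch is not GMP code and not part of the patch; the function names, the limb/dlimb typedefs, 64-bit limbs, and the use of unsigned __int128 are assumptions made here for illustration. It assumes a normalized divisor d (top bit set) and u1 < d.

#include <stdint.h>

typedef uint64_t limb;                 /* assumed 64-bit limb */
typedef unsigned __int128 dlimb;       /* double-limb, GCC/Clang extension */

/* v = floor((B^2 - 1) / d) - B, the single-limb "pi1" inverse of d. */
static limb invert_limb_ref(limb d)
{
  return (limb) ((((dlimb) ~(limb) 0 << 64) | ~(limb) 0) / d
                 - ((dlimb) 1 << 64));
}

/* Divide u1*B + u0 (with u1 < d, d normalized) by d using the inverse v;
   return the quotient limb and store the remainder in *r.  Sketch only. */
static limb udiv_qr_2by1_ref(limb *r, limb u1, limb u0, limb d, limb v)
{
  dlimb q = (dlimb) v * u1 + (((dlimb) u1 << 64) | u0);  /* cannot overflow */
  limb q1 = (limb) (q >> 64) + 1;
  limb q0 = (limb) q;
  limb rem = u0 - q1 * d;              /* mod B */

  if (rem > q0)                        /* first correction */
    {
      q1--;
      rem += d;
    }
  if (rem >= d)                        /* second correction, unlikely */
    {
      q1++;
      rem -= d;
    }
  *r = rem;
  return q1;
}

The assembly performs the same corrections branch-free (cmovnc/sbb sequences visible above), so the per-limb cost does not depend on the data; the 2-limb-divisor routine that follows applies the analogous 3/2 step, udiv_qr_3by2, inside its loop.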
+ +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_div_qr_2n_pi1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') +IFDOS(`define(`di_param', `72(%rsp)')') + mov di_param, di + mov up_param, up + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + + mov -16(up, un, 8), u1 + mov -8(up, un, 8), u2 + + mov u1, t0 + mov u2, t1 + sub d0, t0 + sbb d1, t1 + cmovnc t0, u1 + cmovnc t1, u2 + C push qh which is !carry + sbb %rax, %rax + inc %rax + push %rax + lea -2(un), un + mov d1, md1 + neg md1 + + jmp L(next) + + ALIGN(16) +L(loop): + C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di) + C Based on the optimized divrem_2.asm code. + + mov di, %rax + mul u2 + mov u1, t0 + add %rax, t0 C q0 in t0 + adc u2, %rdx + mov %rdx, t1 C q in t1 + imul md1, %rdx + mov d0, %rax + lea (%rdx, u1), u2 + mul t1 + mov (up, un, 8), u1 + sub d0, u1 + sbb d1, u2 + sub %rax, u1 + sbb %rdx, u2 + xor R32(%rax), R32(%rax) + xor R32(%rdx), R32(%rdx) + cmp t0, u2 + cmovnc d0, %rax + cmovnc d1, %rdx + adc $0, t1 + nop + add %rax, u1 + adc %rdx, u2 + cmp d1, u2 + jae L(fix) +L(bck): + mov t1, (qp, un, 8) +L(next): + sub $1, un + jnc L(loop) +L(end): + mov u2, 8(rp) + mov u1, (rp) + + C qh on stack + pop %rax + + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(fix): C Unlikely update. u2 >= d1 + seta %dl + cmp d0, u1 + setae %al + orb %dl, %al C "orb" form to placate Sun tools + je L(bck) + inc t1 + sub d0, u1 + sbb d1, u2 + jmp L(bck) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm b/gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm new file mode 100644 index 0000000..85af96f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm @@ -0,0 +1,200 @@ +dnl x86-64 mpn_div_qr_2u_pi1 +dnl -- Divide an mpn number by an unnormalized 2-limb number, +dnl using a single-limb inverse and shifting the dividend on the fly. + +dnl Copyright 2007, 2008, 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`rp', `%rsi') +define(`up_param', `%rdx') +define(`un_param', `%rcx') dnl %rcx needed for shift count +define(`d1', `%r8') +define(`d0', `%r9') +define(`shift_param', `FRAME+8(%rsp)') +define(`di_param', `FRAME+16(%rsp)') + +define(`di', `%r10') +define(`up', `%r11') +define(`un', `%rbp') +define(`u2', `%rbx') +define(`u1', `%r12') +define(`u0', `%rsi') dnl Same as rp, which is saved and restored. 
+define(`t1', `%r13') +define(`t0', `%r14') +define(`md1', `%r15') + +ASM_START() + TEXT + ALIGN(16) +deflit(`FRAME', 0) +PROLOGUE(mpn_div_qr_2u_pi1) + mov di_param, di + mov up_param, up + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + push %rbp + push rp +deflit(`FRAME', 56) + lea -2(un_param), un + mov d1, md1 + neg md1 + + C int parameter, 32 bits only + movl shift_param, R32(%rcx) + + C FIXME: Different code for SHLD_SLOW + + xor R32(u2), R32(u2) + mov 8(up, un, 8), u1 + shld %cl, u1, u2 + C Remains to read (up, un, 8) and shift u1, u0 + C udiv_qr_3by2 (qh,u2,u1,u2,u1,n0, d1,d0,di) + mov di, %rax + mul u2 + mov (up, un, 8), u0 + shld %cl, u0, u1 + mov u1, t0 + add %rax, t0 C q0 in t0 + adc u2, %rdx + mov %rdx, t1 C q in t1 + imul md1, %rdx + mov d0, %rax + lea (%rdx, u1), u2 + mul t1 + mov u0, u1 + shl %cl, u1 + sub d0, u1 + sbb d1, u2 + sub %rax, u1 + sbb %rdx, u2 + xor R32(%rax), R32(%rax) + xor R32(%rdx), R32(%rdx) + cmp t0, u2 + cmovnc d0, %rax + cmovnc d1, %rdx + adc $0, t1 + nop + add %rax, u1 + adc %rdx, u2 + cmp d1, u2 + jae L(fix_qh) +L(bck_qh): + push t1 C push qh on stack + + jmp L(next) + + ALIGN(16) +L(loop): + C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di) + C Based on the optimized divrem_2.asm code. + + mov di, %rax + mul u2 + mov (up, un, 8), u0 + xor R32(t1), R32(t1) + shld %cl, u0, t1 + or t1, u1 + mov u1, t0 + add %rax, t0 C q0 in t0 + adc u2, %rdx + mov %rdx, t1 C q in t1 + imul md1, %rdx + mov d0, %rax + lea (%rdx, u1), u2 + mul t1 + mov u0, u1 + shl %cl, u1 + sub d0, u1 + sbb d1, u2 + sub %rax, u1 + sbb %rdx, u2 + xor R32(%rax), R32(%rax) + xor R32(%rdx), R32(%rdx) + cmp t0, u2 + cmovnc d0, %rax + cmovnc d1, %rdx + adc $0, t1 + nop + add %rax, u1 + adc %rdx, u2 + cmp d1, u2 + jae L(fix) +L(bck): + mov t1, (qp, un, 8) +L(next): + sub $1, un + jnc L(loop) +L(end): + C qh on stack + pop %rax + pop rp + shrd %cl, u2, u1 + shr %cl, u2 + mov u2, 8(rp) + mov u1, (rp) + + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + ret + +L(fix): C Unlikely update. u2 >= d1 + seta %dl + cmp d0, u1 + setae %al + orb %dl, %al C "orb" form to placate Sun tools + je L(bck) + inc t1 + sub d0, u1 + sbb d1, u2 + jmp L(bck) + +C Duplicated, just jumping back to a different address. +L(fix_qh): C Unlikely update. u2 >= d1 + seta %dl + cmp d0, u1 + setae %al + orb %dl, %al C "orb" form to placate Sun tools + je L(bck_qh) + inc t1 + sub d0, u1 + sbb d1, u2 + jmp L(bck_qh) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/dive_1.asm b/gmp-6.3.0/mpn/x86_64/dive_1.asm new file mode 100644 index 0000000..988bdab --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/dive_1.asm @@ -0,0 +1,158 @@ +dnl AMD64 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 10 +C AMD K10 10 +C Intel P4 33 +C Intel core2 13.25 +C Intel corei 14 +C Intel atom 42 +C VIA nano 43 + +C A quick adoption of the 32-bit K7 code. + + +C INPUT PARAMETERS +C rp rdi +C up rsi +C n rdx +C divisor rcx + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_divexact_1) + FUNC_ENTRY(4) + push %rbx + + mov %rcx, %rax + xor R32(%rcx), R32(%rcx) C shift count + mov %rdx, %r8 + + bt $0, R32(%rax) + jnc L(evn) C skip bsfq unless divisor is even + +L(odd): mov %rax, %rbx + shr R32(%rax) + and $127, R32(%rax) C d/2, 7 bits + + LEA( binvert_limb_table, %rdx) + + movzbl (%rdx,%rax), R32(%rax) C inv 8 bits + + mov %rbx, %r11 C d without twos + + lea (%rax,%rax), R32(%rdx) C 2*inv + imul R32(%rax), R32(%rax) C inv*inv + imul R32(%rbx), R32(%rax) C inv*inv*d + sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits + + lea (%rdx,%rdx), R32(%rax) C 2*inv + imul R32(%rdx), R32(%rdx) C inv*inv + imul R32(%rbx), R32(%rdx) C inv*inv*d + sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits + + lea (%rax,%rax), %r10 C 2*inv + imul %rax, %rax C inv*inv + imul %rbx, %rax C inv*inv*d + sub %rax, %r10 C inv = 2*inv - inv*inv*d, 64 bits + + lea (%rsi,%r8,8), %rsi C up end + lea -8(%rdi,%r8,8), %rdi C rp end + neg %r8 C -n + + mov (%rsi,%r8,8), %rax C up[0] + + inc %r8 + jz L(one) + + mov (%rsi,%r8,8), %rdx C up[1] + + shrd R8(%rcx), %rdx, %rax + + xor R32(%rbx), R32(%rbx) + jmp L(ent) + +L(evn): bsf %rax, %rcx + shr R8(%rcx), %rax + jmp L(odd) + + ALIGN(8) +L(top): + C rax q + C rbx carry bit, 0 or 1 + C rcx shift + C rdx + C rsi up end + C rdi rp end + C r8 counter, limbs, negative + C r10 d^(-1) mod 2^64 + C r11 d, shifted down + + mul %r11 C carry limb in rdx 0 10 + mov -8(%rsi,%r8,8), %rax C + mov (%rsi,%r8,8), %r9 C + shrd R8(%rcx), %r9, %rax C + nop C + sub %rbx, %rax C apply carry bit + setc %bl C + sub %rdx, %rax C apply carry limb 5 + adc $0, %rbx C 6 +L(ent): imul %r10, %rax C 6 + mov %rax, (%rdi,%r8,8) C + inc %r8 C + jnz L(top) + + mul %r11 C carry limb in rdx + mov -8(%rsi), %rax C up high limb + shr R8(%rcx), %rax + sub %rbx, %rax C apply carry bit + sub %rdx, %rax C apply carry limb + imul %r10, %rax + mov %rax, (%rdi) + pop %rbx + FUNC_EXIT() + ret + +L(one): shr R8(%rcx), %rax + imul %r10, %rax + mov %rax, (%rdi) + pop %rbx + FUNC_EXIT() + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/divrem_1.asm new file mode 100644 index 0000000..d4d61ad --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/divrem_1.asm @@ -0,0 +1,314 @@ +dnl x86-64 mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 2004, 2005, 2007-2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C norm unorm frac +C AMD K8,K9 13 13 12 +C AMD K10 13 13 12 +C Intel P4 43 44 43 +C Intel core2 24.5 24.5 19.5 +C Intel corei 20.5 19.5 18 +C Intel atom 43 46 36 +C VIA nano 25.5 25.5 24 + +C mp_limb_t +C mpn_divrem_1 (mp_ptr qp, mp_size_t fn, +C mp_srcptr np, mp_size_t nn, mp_limb_t d) + +C mp_limb_t +C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn, +C mp_srcptr np, mp_size_t nn, mp_limb_t d, +C mp_limb_t dinv, int cnt) + +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`fn_param', `%rsi') +define(`up_param', `%rdx') +define(`un_param', `%rcx') +define(`d', `%r8') +define(`dinv', `%r9') C only for mpn_preinv_divrem_1 +C shift passed on stack C only for mpn_preinv_divrem_1 + +define(`cnt', `%rcx') +define(`up', `%rsi') +define(`fn', `%r12') +define(`un', `%rbx') + + +C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C cnt qp d dinv + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFSTD(`define(`CNTOFF', `40($1)')') +IFDOS(`define(`CNTOFF', `104($1)')') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_preinv_divrem_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + xor R32(%rax), R32(%rax) + push %r13 + push %r12 + push %rbp + push %rbx + + mov fn_param, fn + mov un_param, un + add fn_param, un_param + mov up_param, up + + lea -8(qp,un_param,8), qp + + test d, d + js L(nent) + + mov CNTOFF(%rsp), R8(cnt) + shl R8(cnt), d + jmp L(uent) +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_divrem_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + xor R32(%rax), R32(%rax) + push %r13 + push %r12 + push %rbp + push %rbx + + mov fn_param, fn + mov un_param, un + add fn_param, un_param + mov up_param, up + je L(ret) + + lea -8(qp,un_param,8), qp + xor R32(%rbp), R32(%rbp) + + test d, d + jns L(unnormalized) + +L(normalized): + test un, un + je L(8) C un == 0 + mov -8(up,un,8), %rbp + dec un + mov %rbp, %rax + sub d, %rbp + cmovc %rax, %rbp + sbb R32(%rax), R32(%rax) + inc R32(%rax) + mov %rax, (qp) + lea -8(qp), qp +L(8): +IFSTD(` push %rdi ') +IFSTD(` push %rsi ') + push %r8 +IFSTD(` mov d, %rdi ') +IFDOS(` sub $32, %rsp ') +IFDOS(` mov d, %rcx ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFDOS(` add $32, %rsp ') + pop %r8 +IFSTD(` pop %rsi ') +IFSTD(` pop %rdi ') + + mov %rax, dinv + mov %rbp, %rax + jmp L(nent) + + ALIGN(16) +L(ntop):mov (up,un,8), %r10 C K8-K10 P6-CNR P6-NHM P4 + mul dinv C 0,13 0,20 0,18 0,45 + add %r10, %rax C 4 8 3 12 + adc %rbp, %rdx C 5 9 10 13 + mov %rax, %rbp C 5 9 4 13 + mov %rdx, %r13 C 6 11 12 23 + imul d, %rdx C 6 11 11 23 + sub %rdx, %r10 C 10 16 14 33 + mov d, %rax C + add %r10, %rax C 11 17 15 34 + cmp %rbp, %r10 C 11 17 15 34 + cmovc %r10, %rax C 12 18 16 35 + adc $-1, %r13 C + cmp d, %rax C + jae L(nfx) C +L(nok): mov %r13, (qp) C + sub $8, qp C +L(nent):lea 1(%rax), %rbp C + dec un C + jns L(ntop) C + + xor 
R32(%rcx), R32(%rcx) + jmp L(frac) + +L(nfx): sub d, %rax + inc %r13 + jmp L(nok) + +L(unnormalized): + test un, un + je L(44) + mov -8(up,un,8), %rax + cmp d, %rax + jae L(44) + mov %rbp, (qp) + mov %rax, %rbp + lea -8(qp), qp + je L(ret) + dec un +L(44): + bsr d, %rcx + not R32(%rcx) + shl R8(%rcx), d + shl R8(%rcx), %rbp + + push %rcx +IFSTD(` push %rdi ') +IFSTD(` push %rsi ') + push %r8 +IFSTD(` sub $8, %rsp ') +IFSTD(` mov d, %rdi ') +IFDOS(` sub $40, %rsp ') +IFDOS(` mov d, %rcx ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + pop %r8 +IFSTD(` pop %rsi ') +IFSTD(` pop %rdi ') + pop %rcx + + mov %rax, dinv + mov %rbp, %rax + test un, un + je L(frac) + +L(uent):dec un + mov (up,un,8), %rbp + neg R32(%rcx) + shr R8(%rcx), %rbp + neg R32(%rcx) + or %rbp, %rax + jmp L(ent) + + ALIGN(16) +L(utop):mov (up,un,8), %r10 + shl R8(%rcx), %rbp + neg R32(%rcx) + shr R8(%rcx), %r10 + neg R32(%rcx) + or %r10, %rbp + mul dinv + add %rbp, %rax + adc %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul d, %rdx + sub %rdx, %rbp + mov d, %rax + add %rbp, %rax + cmp %r11, %rbp + cmovc %rbp, %rax + adc $-1, %r13 + cmp d, %rax + jae L(ufx) +L(uok): mov %r13, (qp) + sub $8, qp +L(ent): mov (up,un,8), %rbp + dec un + lea 1(%rax), %r11 + jns L(utop) + +L(uend):shl R8(%rcx), %rbp + mul dinv + add %rbp, %rax + adc %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul d, %rdx + sub %rdx, %rbp + mov d, %rax + add %rbp, %rax + cmp %r11, %rbp + cmovc %rbp, %rax + adc $-1, %r13 + cmp d, %rax + jae L(efx) +L(eok): mov %r13, (qp) + sub $8, qp + jmp L(frac) + +L(ufx): sub d, %rax + inc %r13 + jmp L(uok) +L(efx): sub d, %rax + inc %r13 + jmp L(eok) + +L(frac):mov d, %rbp + neg %rbp + jmp L(fent) + + ALIGN(16) C K8-K10 P6-CNR P6-NHM P4 +L(ftop):mul dinv C 0,12 0,17 0,17 + add %r11, %rdx C 5 8 10 + mov %rax, %r11 C 4 8 3 + mov %rdx, %r13 C 6 9 11 + imul %rbp, %rdx C 6 9 11 + mov d, %rax C + add %rdx, %rax C 10 14 14 + cmp %r11, %rdx C 10 14 14 + cmovc %rdx, %rax C 11 15 15 + adc $-1, %r13 C + mov %r13, (qp) C + sub $8, qp C +L(fent):lea 1(%rax), %r11 C + dec fn C + jns L(ftop) C + + shr R8(%rcx), %rax +L(ret): pop %rbx + pop %rbp + pop %r12 + pop %r13 + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/divrem_2.asm b/gmp-6.3.0/mpn/x86_64/divrem_2.asm new file mode 100644 index 0000000..20811cc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/divrem_2.asm @@ -0,0 +1,192 @@ +dnl x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. + +dnl Copyright 2007, 2008, 2010, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb best +C AMD K8,K9 18 +C AMD K10 18 +C AMD bull +C AMD pile +C AMD bobcat +C AMD jaguar +C Intel P4 68 +C Intel core 34 +C Intel NHM 30.25 +C Intel SBR 21.3 +C Intel IBR 21.4 +C Intel HWL 20.6 +C Intel BWL +C Intel atom 73 +C VIA nano 33 + + +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`fn', `%rsi') +define(`up_param', `%rdx') +define(`un_param', `%rcx') +define(`dp', `%r8') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_divrem_2) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %r15 + push %r14 + push %r13 + push %r12 + lea -24(%rdx,%rcx,8), %r12 C r12 = &up[un-1] + mov %rsi, %r13 + push %rbp + mov %rdi, %rbp + push %rbx + mov 8(%r8), %r11 C d1 + mov 16(%r12), %rbx + mov (%r8), %r8 C d0 + mov 8(%r12), %r10 + + xor R32(%r15), R32(%r15) + cmp %rbx, %r11 + ja L(2) + setb %dl + cmp %r10, %r8 + setbe %al + orb %al, %dl C "orb" form to placate Sun tools + je L(2) + inc R32(%r15) + sub %r8, %r10 + sbb %r11, %rbx +L(2): + lea -3(%rcx,%r13), %r14 C un + fn - 3 + test %r14, %r14 + js L(end) + + push %r8 + push %r10 + push %r11 +IFSTD(` mov %r11, %rdi ') +IFDOS(` mov %r11, %rcx ') +IFDOS(` sub $32, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFDOS(` add $32, %rsp ') + pop %r11 + pop %r10 + pop %r8 + + mov %r11, %rdx + mov %rax, %rdi + imul %rax, %rdx + mov %rdx, %r9 + mul %r8 + xor R32(%rcx), R32(%rcx) + add %r8, %r9 + adc $-1, %rcx + add %rdx, %r9 + adc $0, %rcx + js 2f +1: dec %rdi + sub %r11, %r9 + sbb $0, %rcx + jns 1b +2: + + lea (%rbp,%r14,8), %rbp + mov %r11, %rsi + neg %rsi C -d1 + +C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C n2 un -d1 dinv qp d0 q0 d1 up fn msl + + ALIGN(16) +L(top): mov %rdi, %rax C di ncp + mul %rbx C 0, 17 + mov %r10, %rcx C + add %rax, %rcx C 4 + adc %rbx, %rdx C 5 + mov %rdx, %r9 C q 6 + imul %rsi, %rdx C 6 + mov %r8, %rax C ncp + lea (%rdx, %r10), %rbx C n1 -= ... 10 + xor R32(%r10), R32(%r10) C + mul %r9 C 7 + cmp %r14, %r13 C + jg L(19) C + mov (%r12), %r10 C + sub $8, %r12 C +L(19): sub %r8, %r10 C ncp + sbb %r11, %rbx C 11 + sub %rax, %r10 C 11 + sbb %rdx, %rbx C 12 + xor R32(%rax), R32(%rax) C + xor R32(%rdx), R32(%rdx) C + cmp %rcx, %rbx C 13 + cmovnc %r8, %rax C 14 + cmovnc %r11, %rdx C 14 + adc $0, %r9 C adjust q 14 + nop + add %rax, %r10 C 15 + adc %rdx, %rbx C 16 + cmp %r11, %rbx C + jae L(fix) C +L(bck): mov %r9, (%rbp) C + sub $8, %rbp C + dec %r14 + jns L(top) + +L(end): mov %r10, 8(%r12) + mov %rbx, 16(%r12) + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + mov %r15, %rax + pop %r15 + FUNC_EXIT() + ret + +L(fix): seta %dl + cmp %r8, %r10 + setae %al + orb %dl, %al C "orb" form to placate Sun tools + je L(bck) + inc %r9 + sub %r8, %r10 + sbb %r11, %rbx + jmp L(bck) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/dos64.m4 b/gmp-6.3.0/mpn/x86_64/dos64.m4 new file mode 100644 index 0000000..0da1b36 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/dos64.m4 @@ -0,0 +1,101 @@ +divert(-1) +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +define(`HOST_DOS64') + + +dnl On DOS64 we always generate position-independent-code +dnl + +define(`PIC') + + +define(`LEA',` + lea $1(%rip), $2 +') + + +dnl Usage: CALL(funcname) +dnl +dnl Simply override the definition in x86_64-defs.m4. + +define(`CALL',`call GSYM_PREFIX`'$1') +define(`TCALL',`jmp GSYM_PREFIX`'$1') + + +dnl Usage: JUMPTABSECT + +define(`JUMPTABSECT', `RODATA') + + +dnl Usage: JMPENT(targlabel,tablabel) + +define(`JMPENT', `.long $1-$2') + + +dnl Usage: FUNC_ENTRY(nregparmas) +dnl Usage: FUNC_EXIT() + +dnl FUNC_ENTRY and FUNC_EXIT provide an easy path for adoption of standard +dnl ABI assembly to the DOS64 ABI. + +define(`FUNC_ENTRY', + `push %rdi + push %rsi + mov %rcx, %rdi +ifelse(eval($1>=2),1,`dnl + mov %rdx, %rsi +ifelse(eval($1>=3),1,`dnl + mov %r8, %rdx +ifelse(eval($1>=4),1,`dnl + mov %r9, %rcx +')')')') + +define(`FUNC_EXIT', + `pop %rsi + pop %rdi') + + +dnl Target ABI macros. For DOS64 we override the defaults. + +define(`IFDOS', `$1') +define(`IFSTD', `') +define(`IFELF', `') + + +dnl Usage: PROTECT(symbol) +dnl +dnl Used for private GMP symbols that should never be overridden by users. +dnl This can save reloc entries and improve shlib sharing as well as +dnl application startup times + +define(`PROTECT', `') + + +divert`'dnl diff --git a/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm b/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm new file mode 100644 index 0000000..21ab210 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm @@ -0,0 +1,181 @@ +dnl AMD64 mpn_copyd optimised for CPUs with fast AVX. + +dnl Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 n/a +C AMD bd2 4.87 4.87 N +C AMD bd3 ? ? +C AMD bd4 0.53 ? +C AMD zn1 0.51 ? +C AMD zn2 0.25 ? Y +C AMD zn3 0.25 ? Y +C AMD bt1 n/a +C AMD bt2 n/a +C Intel P4 n/a +C Intel CNR n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel WSM n/a +C Intel SBR 0.50 0.91 N +C Intel IBR 0.50 0.65 N +C Intel HWL 0.25 0.30 Y +C Intel BWL 0.28 0.37 Y +C Intel SKL 0.27 ? Y +C Intel atom n/a +C Intel SLM n/a +C Intel GLM n/a +C VIA nano n/a + +C We try to do as many 32-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. For the bulk copying, we +C write using aligned 32-byte operations, but we read with both aligned and +C unaligned 32-byte operations. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl define(`vmovdqu', vlddqu) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_copyd) + FUNC_ENTRY(3) + + lea -32(rp,n,8), rp + lea -32(up,n,8), up + + cmp $7, n C basecase needed for correctness + jbe L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(a2) C jump if rp aligned + mov 24(up), %rax + lea -8(up), up + mov %rax, 24(rp) + lea -8(rp), rp + dec n +L(a2): test $16, R8(rp) C is rp 32-byte aligned? + jz L(a3) C jump if rp aligned + vmovdqu 16(up), %xmm0 + lea -16(up), up + vmovdqa %xmm0, 16(rp) + lea -16(rp), rp + sub $2, n +L(a3): sub $16, n + jc L(sma) + + ALIGN(16) +L(top): vmovdqu (up), %ymm0 + vmovdqu -32(up), %ymm1 + vmovdqu -64(up), %ymm2 + vmovdqu -96(up), %ymm3 + lea -128(up), up + vmovdqa %ymm0, (rp) + vmovdqa %ymm1, -32(rp) + vmovdqa %ymm2, -64(rp) + vmovdqa %ymm3, -96(rp) + lea -128(rp), rp +L(ali): sub $16, n + jnc L(top) + +L(sma): test $8, R8(n) + jz 1f + vmovdqu (up), %ymm0 + vmovdqu -32(up), %ymm1 + lea -64(up), up + vmovdqa %ymm0, (rp) + vmovdqa %ymm1, -32(rp) + lea -64(rp), rp +1: + test $4, R8(n) + jz 1f + vmovdqu (up), %ymm0 + lea -32(up), up + vmovdqa %ymm0, (rp) + lea -32(rp), rp +1: + test $2, R8(n) + jz 1f + vmovdqu 16(up), %xmm0 + lea -16(up), up + vmovdqa %xmm0, 16(rp) + lea -16(rp), rp +1: + test $1, R8(n) + jz 1f + mov 24(up), %r8 + mov %r8, 24(rp) +1: + FUNC_EXIT() + ret + + ALIGN(16) +L(bc): test $4, R8(n) + jz 1f + mov 24(up), %rax + mov 16(up), %rcx + mov 8(up), %r8 + mov (up), %r9 + lea -32(up), up + mov %rax, 24(rp) + mov %rcx, 16(rp) + mov %r8, 8(rp) + mov %r9, (rp) + lea -32(rp), rp +1: + test $2, R8(n) + jz 1f + mov 24(up), %rax + mov 16(up), %rcx + lea -16(up), up + mov %rax, 24(rp) + mov %rcx, 16(rp) + lea -16(rp), rp +1: + test $1, R8(n) + jz 1f + mov 24(up), %rax + mov %rax, 24(rp) +1: + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm b/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm new file mode 100644 index 0000000..03c2440 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm @@ -0,0 +1,178 @@ +dnl AMD64 mpn_copyi optimised for CPUs with fast AVX. + +dnl Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 n/a +C AMD bd2 4.87 4.87 N +C AMD bd3 ? ? +C AMD bd4 0.53 ? +C AMD zn1 0.51 ? +C AMD zn2 0.25 ? Y +C AMD zn3 0.25 ? Y +C AMD bt1 n/a +C AMD bt2 n/a +C Intel P4 n/a +C Intel CNR n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel WSM n/a +C Intel SBR 0.50 0.91 N +C Intel IBR 0.50 0.65 N +C Intel HWL 0.25 0.30 Y +C Intel BWL 0.28 0.37 Y +C Intel SKL 0.27 ? Y +C Intel atom n/a +C Intel SLM n/a +C Intel GLM n/a +C VIA nano n/a + +C We try to do as many 32-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. For the bulk copying, we +C write using aligned 32-byte operations, but we read with both aligned and +C unaligned 32-byte operations. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl define(`vmovdqu', vlddqu) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_copyi) + FUNC_ENTRY(3) + + cmp $7, n + jbe L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(a2) C jump if rp aligned + mov (up), %rax + lea 8(up), up + mov %rax, (rp) + lea 8(rp), rp + dec n +L(a2): test $16, R8(rp) C is rp 32-byte aligned? 
+ jz L(a3) C jump if rp aligned + vmovdqu (up), %xmm0 + lea 16(up), up + vmovdqa %xmm0, (rp) + lea 16(rp), rp + sub $2, n +L(a3): sub $16, n + jc L(sma) + + ALIGN(16) +L(top): vmovdqu (up), %ymm0 + vmovdqu 32(up), %ymm1 + vmovdqu 64(up), %ymm2 + vmovdqu 96(up), %ymm3 + lea 128(up), up + vmovdqa %ymm0, (rp) + vmovdqa %ymm1, 32(rp) + vmovdqa %ymm2, 64(rp) + vmovdqa %ymm3, 96(rp) + lea 128(rp), rp +L(ali): sub $16, n + jnc L(top) + +L(sma): test $8, R8(n) + jz 1f + vmovdqu (up), %ymm0 + vmovdqu 32(up), %ymm1 + lea 64(up), up + vmovdqa %ymm0, (rp) + vmovdqa %ymm1, 32(rp) + lea 64(rp), rp +1: + test $4, R8(n) + jz 1f + vmovdqu (up), %ymm0 + lea 32(up), up + vmovdqa %ymm0, (rp) + lea 32(rp), rp +1: + test $2, R8(n) + jz 1f + vmovdqu (up), %xmm0 + lea 16(up), up + vmovdqa %xmm0, (rp) + lea 16(rp), rp +1: +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) +1: + FUNC_EXIT() + ret + + ALIGN(16) +L(bc): test $4, R8(n) + jz 1f + mov (up), %rax + mov 8(up), %rcx + mov 16(up), %r8 + mov 24(up), %r9 + lea 32(up), up + mov %rax, (rp) + mov %rcx, 8(rp) + mov %r8, 16(rp) + mov %r9, 24(rp) + lea 32(rp), rp +1: + test $2, R8(n) + jz 1f + mov (up), %rax + mov 8(up), %rcx + lea 16(up), up + mov %rax, (rp) + mov %rcx, 8(rp) + lea 16(rp), rp +1: + test $1, R8(n) + jz 1f + mov (up), %rax + mov %rax, (rp) +1: + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/README b/gmp-6.3.0/mpn/x86_64/fastsse/README new file mode 100644 index 0000000..5538b2d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/README @@ -0,0 +1,22 @@ +This directory contains code for x86-64 processors with fast +implementations of SSE operations, hence the name "fastsse". + +Current processors that might benefit from this code are: + + AMD K10 + AMD Bulldozer/Piledriver/Steamroller/Excavator + Intel Nocona + Intel Nehalem/Westmere + Intel Sandybridge/Ivybridge + Intel Haswell/Broadwell + VIA Nano + +Current processors that do not benefit from this code are: + + AMD K8 + AMD Bobcat + Intel Atom + +Intel Conroe/Penryn is a border case; its handling of non-aligned +128-bit memory operands is poor. VIA Nano also have poor handling of +non-aligned operands. diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm b/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm new file mode 100644 index 0000000..69027bc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm @@ -0,0 +1,311 @@ +dnl AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3. + +dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 2.0 illop 1.0/1.0 N +C AMD K10 0.85 illop Y/N +C AMD bd1 1.39 ? 1.45 Y/N +C AMD bd2 0.8-1.4 0.7-1.4 Y +C AMD bd3 +C AMD bd4 +C AMD bobcat 1.97 ? 8.17 1.5/1.5 N +C AMD jaguar 1.02 1.02 0.91/0.91 N +C Intel P4 2.26 illop Y/N +C Intel core 0.58 0.87 opt/0.74 Y +C Intel NHM 0.64 1.14 opt/bad Y +C Intel SBR 0.51 0.65 opt/opt Y +C Intel IBR 0.50 0.64 opt/0.57 Y +C Intel HWL 0.51 0.58 opt/opt Y +C Intel BWL 0.52 0.64 opt/opt Y +C Intel SKL 0.51 0.63 opt/opt Y +C Intel atom 1.16 1.70 opt/opt Y +C Intel SLM 1.02 1.52 N +C VIA nano 1.09 1.10 opt/opt Y + +C We use only 16-byte operations, except for unaligned top-most and bottom-most +C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That +C instruction is better adapted to mpn_copyd's needs, we need to contort the +C code to use it here. +C +C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken +C from the x86_64 default code. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +C There are three instructions for loading an aligned 128-bit quantity. We use +C movaps, since it has the shortest coding. +define(`movdqa', ``movaps'') + +ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_com) + FUNC_ENTRY(3) + + cmp $COM_SSE_THRESHOLD, n + jbe L(bc) + + pcmpeqb %xmm5, %xmm5 C set to 111...111 + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(rp_aligned) C jump if rp aligned + + mov (up), %r8 + lea 8(up), up + not %r8 + mov %r8, (rp) + lea 8(rp), rp + dec n + +L(rp_aligned): + test $8, R8(up) + jnz L(uent) + +ifelse(eval(COM_SSE_THRESHOLD >= 8),1, +` sub $8, n', +` jmp L(am)') + + ALIGN(16) +L(atop):movdqa 0(up), %xmm0 + movdqa 16(up), %xmm1 + movdqa 32(up), %xmm2 + movdqa 48(up), %xmm3 + lea 64(up), up + pxor %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm5, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + lea 64(rp), rp +L(am): sub $8, n + jnc L(atop) + + test $4, R8(n) + jz 1f + movdqa (up), %xmm0 + movdqa 16(up), %xmm1 + lea 32(up), up + pxor %xmm5, %xmm0 + pxor %xmm5, %xmm1 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + lea 32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa (up), %xmm0 + lea 16(up), up + pxor %xmm5, %xmm0 + movdqa %xmm0, (rp) + lea 16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + not %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +L(uent): +C Code handling up - rp = 8 (mod 16) + +C FIXME: The code below only handles overlap if it is close to complete, or +C quite separate: up-rp < 5 or up-up > 15 limbs + lea -40(up), %rax C 40 = 5 * GMP_LIMB_BYTES + sub rp, %rax + cmp $80, %rax C 80 = (15-5) * GMP_LIMB_BYTES + jbe L(bc) C deflect to plain loop + + sub $16, n + jc L(uend) + + movdqa 120(up), %xmm3 + + sub $16, n + jmp L(um) + + ALIGN(16) +L(utop):movdqa 120(up), %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm0, -128(rp) + sub $16, n +L(um): movdqa 104(up), %xmm2 + palignr($8, %xmm2, %xmm3) + movdqa 88(up), %xmm1 + pxor %xmm5, %xmm3 + movdqa %xmm3, 112(rp) + palignr($8, %xmm1, %xmm2) + movdqa 72(up), %xmm0 + pxor %xmm5, %xmm2 + movdqa %xmm2, 96(rp) + palignr($8, %xmm0, %xmm1) + movdqa 56(up), %xmm3 + pxor 
%xmm5, %xmm1 + movdqa %xmm1, 80(rp) + palignr($8, %xmm3, %xmm0) + movdqa 40(up), %xmm2 + pxor %xmm5, %xmm0 + movdqa %xmm0, 64(rp) + palignr($8, %xmm2, %xmm3) + movdqa 24(up), %xmm1 + pxor %xmm5, %xmm3 + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movdqa 8(up), %xmm0 + pxor %xmm5, %xmm2 + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movdqa -8(up), %xmm3 + pxor %xmm5, %xmm1 + movdqa %xmm1, 16(rp) + palignr($8, %xmm3, %xmm0) + lea 128(up), up + lea 128(rp), rp + jnc L(utop) + + pxor %xmm5, %xmm0 + movdqa %xmm0, -128(rp) + +L(uend):test $8, R8(n) + jz 1f + movdqa 56(up), %xmm3 + movdqa 40(up), %xmm2 + palignr($8, %xmm2, %xmm3) + movdqa 24(up), %xmm1 + pxor %xmm5, %xmm3 + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movdqa 8(up), %xmm0 + pxor %xmm5, %xmm2 + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movdqa -8(up), %xmm3 + pxor %xmm5, %xmm1 + movdqa %xmm1, 16(rp) + palignr($8, %xmm3, %xmm0) + lea 64(up), up + pxor %xmm5, %xmm0 + movdqa %xmm0, (rp) + lea 64(rp), rp + +1: test $4, R8(n) + jz 1f + movdqa 24(up), %xmm1 + movdqa 8(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movdqa -8(up), %xmm3 + pxor %xmm5, %xmm1 + movdqa %xmm1, 16(rp) + palignr($8, %xmm3, %xmm0) + lea 32(up), up + pxor %xmm5, %xmm0 + movdqa %xmm0, (rp) + lea 32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa 8(up), %xmm0 + movdqa -8(up), %xmm3 + palignr($8, %xmm3, %xmm0) + lea 16(up), up + pxor %xmm5, %xmm0 + movdqa %xmm0, (rp) + lea 16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + not %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +C Basecase code. Needed for good small operands speed, not for +C correctness as the above code is currently written. + +L(bc): lea -8(rp), rp + sub $4, R32(n) + jc L(end) + +ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1, +` ALIGN(16)') +L(top): mov (up), %r8 + mov 8(up), %r9 + lea 32(rp), rp + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + not %r8 + not %r9 + not %r10 + not %r11 + mov %r8, -24(rp) + mov %r9, -16(rp) +ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1, +` sub $4, R32(n)') + mov %r10, -8(rp) + mov %r11, (rp) +ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1, +` jnc L(top)') + +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + not %r8 + mov %r8, 8(rp) + lea 8(rp), rp + lea 8(up), up +1: test $2, R8(n) + jz 1f + mov (up), %r8 + mov 8(up), %r9 + not %r8 + not %r9 + mov %r8, 8(rp) + mov %r9, 16(rp) +1: FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/com.asm b/gmp-6.3.0/mpn/x86_64/fastsse/com.asm new file mode 100644 index 0000000..c867222 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/com.asm @@ -0,0 +1,175 @@ +dnl AMD64 mpn_com optimised for CPUs with fast SSE. + +dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation, +dnl Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 2.0 2.0 N +C AMD K10 0.85 1.3 Y/N +C AMD bull 1.40 1.40 Y +C AMD pile 0.9-1.4 0.9-1.4 Y +C AMD steam +C AMD excavator +C AMD bobcat 3.1 3.1 N +C AMD jaguar 0.91 0.91 opt/opt Y +C Intel P4 2.28 illop Y +C Intel core2 1.02 1.02 N +C Intel NHM 0.53 0.68 Y +C Intel SBR 0.51 0.75 opt/0.65 Y/N +C Intel IBR 0.50 0.57 opt/opt Y +C Intel HWL 0.51 0.64 opt/0.58 Y +C Intel BWL 0.61 0.65 0.57/opt Y +C Intel atom 3.68 3.68 N +C Intel SLM 1.09 1.35 N +C VIA nano 1.17 5.09 Y/N + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. We can always write using +C aligned 16-byte operations, we read with both aligned and unaligned 16-byte +C operations. + +C Instead of having separate loops for reading aligned and unaligned, we read +C using MOVDQU. This seems to work great except for core2; there performance +C doubles when reading using MOVDQA (for aligned source). It is unclear how to +C best handle the unaligned case there. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_com) + FUNC_ENTRY(3) + +IFDOS(` add $-56, %rsp ') +IFDOS(` movdqa %xmm6, (%rsp) ') +IFDOS(` movdqa %xmm7, 16(%rsp) ') + + pcmpeqb %xmm7, %xmm7 C set to 111...111 + + test $8, R8(rp) C is rp 16-byte aligned? 
+ jz L(ali) C jump if rp aligned + mov (up), %rax + lea 8(up), up + not %rax + mov %rax, (rp) + lea 8(rp), rp + dec n + + sub $14, n + jc L(sma) + + ALIGN(16) +L(top): movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + movdqu 32(up), %xmm2 + movdqu 48(up), %xmm3 + movdqu 64(up), %xmm4 + movdqu 80(up), %xmm5 + movdqu 96(up), %xmm6 + lea 112(up), up + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm7, %xmm3 + pxor %xmm7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm7, %xmm6 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + movdqa %xmm4, 64(rp) + movdqa %xmm5, 80(rp) + movdqa %xmm6, 96(rp) + lea 112(rp), rp +L(ali): sub $14, n + jnc L(top) + +L(sma): add $14, n + test $8, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + movdqu 32(up), %xmm2 + movdqu 48(up), %xmm3 + lea 64(up), up + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + lea 64(rp), rp +1: + test $4, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + lea 32(up), up + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + lea 32(rp), rp +1: + test $2, R8(n) + jz 1f + movdqu (up), %xmm0 + lea 16(up), up + pxor %xmm7, %xmm0 + movdqa %xmm0, (rp) + lea 16(rp), rp +1: + test $1, R8(n) + jz 1f + mov (up), %rax + not %rax + mov %rax, (rp) +1: +L(don): +IFDOS(` movdqa (%rsp), %xmm6 ') +IFDOS(` movdqa 16(%rsp), %xmm7 ') +IFDOS(` add $56, %rsp ') + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm new file mode 100644 index 0000000..fac6f8a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm @@ -0,0 +1,254 @@ +dnl AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3. + +dnl Copyright 2012, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? 
+C AMD K8,K9 2.0 illop 1.0/1.0 N +C AMD K10 0.85 illop Y/N +C AMD bull 0.70 0.70 Y +C AMD pile 0.68 0.68 Y +C AMD steam +C AMD excavator +C AMD bobcat 1.97 8.24 1.5/1.5 N +C AMD jaguar 0.77 0.89 0.65/opt N/Y +C Intel P4 2.26 illop Y/N +C Intel core 0.52 0.80 opt/opt Y +C Intel NHM 0.52 0.64 opt/opt Y +C Intel SBR 0.51 0.51 opt/opt Y +C Intel IBR 0.50 0.50 opt/opt Y +C Intel HWL 0.50 0.51 opt/opt Y +C Intel BWL 0.55 0.55 opt/opt Y +C Intel atom 1.16 1.66 opt/opt Y +C Intel SLM 1.02 1.04 opt/opt Y +C VIA nano 1.08 1.06 opt/opt Y + +C We use only 16-byte operations, except for unaligned top-most and bottom-most +C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). +C +C For operands of < COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop, +C taken from the x86_64 default code. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +C There are three instructions for loading an aligned 128-bit quantity. We use +C movaps, since it has the shortest coding. +define(`movdqa', ``movaps'') + +ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_copyd) + FUNC_ENTRY(3) + + lea -8(up,n,8), up + lea -8(rp,n,8), rp + + cmp $COPYD_SSE_THRESHOLD, n + jbe L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jnz L(rp_aligned) C jump if rp aligned + + mov (up), %rax C copy one limb + mov %rax, (rp) + lea -8(up), up + lea -8(rp), rp + dec n + +L(rp_aligned): + test $8, R8(up) + jz L(uent) + +ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1, +` sub $8, n', +` jmp L(am)') + + ALIGN(16) +L(atop):movdqa -8(up), %xmm0 + movdqa -24(up), %xmm1 + movdqa -40(up), %xmm2 + movdqa -56(up), %xmm3 + lea -64(up), up + movdqa %xmm0, -8(rp) + movdqa %xmm1, -24(rp) + movdqa %xmm2, -40(rp) + movdqa %xmm3, -56(rp) + lea -64(rp), rp +L(am): sub $8, n + jnc L(atop) + + test $4, R8(n) + jz 1f + movdqa -8(up), %xmm0 + movdqa -24(up), %xmm1 + lea -32(up), up + movdqa %xmm0, -8(rp) + movdqa %xmm1, -24(rp) + lea -32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa -8(up), %xmm0 + lea -16(up), up + movdqa %xmm0, -8(rp) + lea -16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +L(uent):sub $16, n + movdqa (up), %xmm0 + jc L(uend) + + ALIGN(16) +L(utop):sub $16, n + movdqa -16(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -8(rp) + movdqa -32(up), %xmm2 + palignr($8, %xmm2, %xmm1) + movdqa %xmm1, -24(rp) + movdqa -48(up), %xmm3 + palignr($8, %xmm3, %xmm2) + movdqa %xmm2, -40(rp) + movdqa -64(up), %xmm0 + palignr($8, %xmm0, %xmm3) + movdqa %xmm3, -56(rp) + movdqa -80(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -72(rp) + movdqa -96(up), %xmm2 + palignr($8, %xmm2, %xmm1) + movdqa %xmm1, -88(rp) + movdqa -112(up), %xmm3 + palignr($8, %xmm3, %xmm2) + movdqa %xmm2, -104(rp) + movdqa -128(up), %xmm0 + palignr($8, %xmm0, %xmm3) + movdqa %xmm3, -120(rp) + lea -128(up), up + lea -128(rp), rp + jnc L(utop) + +L(uend):test $8, R8(n) + jz 1f + movdqa -16(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -8(rp) + movdqa -32(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, -24(rp) + movdqa -48(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -40(rp) + movdqa -64(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, -56(rp) + lea -64(up), up + lea -64(rp), rp + +1: test $4, R8(n) + jz 1f + movdqa -16(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -8(rp) + movdqa -32(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, -24(rp) + lea -32(up), up + lea -32(rp), 
rp + +1: test $2, R8(n) + jz 1f + movdqa -16(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -8(rp) + lea -16(up), up + lea -16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +C Basecase code. Needed for good small operands speed, not for +C correctness as the above code is currently written. + +L(bc): sub $4, R32(n) + jc L(end) + + ALIGN(16) +L(top): mov (up), %r8 + mov -8(up), %r9 + lea -32(rp), rp + mov -16(up), %r10 + mov -24(up), %r11 + lea -32(up), up + mov %r8, 32(rp) + mov %r9, 24(rp) +ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1, +` sub $4, R32(n)') + mov %r10, 16(rp) + mov %r11, 8(rp) +ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1, +` jnc L(top)') + +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + lea -8(rp), rp + lea -8(up), up +1: test $2, R8(n) + jz 1f + mov (up), %r8 + mov -8(up), %r9 + mov %r8, (rp) + mov %r9, -8(rp) +1: FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm new file mode 100644 index 0000000..b3c4706 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm @@ -0,0 +1,166 @@ +dnl AMD64 mpn_copyd optimised for CPUs with fast SSE. + +dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation, +dnl Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 +C AMD K10 0.85 1.64 Y/N +C AMD bull 1.4 1.4 Y +C AMD pile 0.68 0.98 Y/N +C AMD steam +C AMD excavator +C AMD bobcat +C AMD jaguar 0.65 1.02 opt/0.93 Y/N +C Intel P4 2.3 2.3 Y +C Intel core 1.0 1.0 0.52/0.80 N +C Intel NHM 0.5 0.67 Y +C Intel SBR 0.51 0.75 opt/0.54 Y/N +C Intel IBR 0.50 0.57 opt/0.50 Y +C Intel HWL 0.50 0.57 opt/0.51 Y +C Intel BWL 0.55 0.62 opt/0.55 Y +C Intel atom +C Intel SLM 1.02 1.27 opt/1.04 Y/N +C VIA nano 1.16 5.16 Y/N + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. We can always write using +C aligned 16-byte operations, we read with both aligned and unaligned 16-byte +C operations. + +C Instead of having separate loops for reading aligned and unaligned, we read +C using MOVDQU. This seems to work great except for core2; there performance +C doubles when reading using MOVDQA (for aligned source). It is unclear how to +C best handle the unaligned case there. 
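+
+C For reference, the net effect of this routine is the same as the following
+C portable C loop (an illustrative sketch only, assuming 64-bit limbs and the
+C usual mpn_copyd overlap rule rp >= up; the name ref_copyd is made up for
+C this comment and is not a GMP entry point):
+C
+C	void
+C	ref_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
+C	{
+C	  mp_size_t i;
+C	  for (i = n - 1; i >= 0; i--)	/* top limb first, hence "copyd" */
+C	    rp[i] = up[i];
+C	}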
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl define(`movdqu', lddqu) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_copyd) + FUNC_ENTRY(3) + + test n, n + jz L(don) + + lea -16(rp,n,8), rp + lea -16(up,n,8), up + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(ali) C jump if rp aligned + mov 8(up), %rax + lea -8(up), up + mov %rax, 8(rp) + lea -8(rp), rp + dec n + +L(ali): sub $16, n + jc L(sma) + +IFDOS(` add $-56, %rsp ') +IFDOS(` movdqa %xmm6, (%rsp) ') +IFDOS(` movdqa %xmm7, 16(%rsp) ') + + ALIGN(16) +L(top): movdqu (up), %xmm0 + movdqu -16(up), %xmm1 + movdqu -32(up), %xmm2 + movdqu -48(up), %xmm3 + movdqu -64(up), %xmm4 + movdqu -80(up), %xmm5 + movdqu -96(up), %xmm6 + movdqu -112(up), %xmm7 + lea -128(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, -16(rp) + movdqa %xmm2, -32(rp) + movdqa %xmm3, -48(rp) + movdqa %xmm4, -64(rp) + movdqa %xmm5, -80(rp) + movdqa %xmm6, -96(rp) + movdqa %xmm7, -112(rp) + lea -128(rp), rp + sub $16, n + jnc L(top) + +IFDOS(` movdqa (%rsp), %xmm6 ') +IFDOS(` movdqa 16(%rsp), %xmm7 ') +IFDOS(` add $56, %rsp ') + +L(sma): test $8, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu -16(up), %xmm1 + movdqu -32(up), %xmm2 + movdqu -48(up), %xmm3 + lea -64(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, -16(rp) + movdqa %xmm2, -32(rp) + movdqa %xmm3, -48(rp) + lea -64(rp), rp +1: + test $4, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu -16(up), %xmm1 + lea -32(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, -16(rp) + lea -32(rp), rp +1: + test $2, R8(n) + jz 1f + movdqu (up), %xmm0 + lea -16(up), up + movdqa %xmm0, (rp) + lea -16(rp), rp +1: + test $1, R8(n) + jz 1f + mov 8(up), %r8 + mov %r8, 8(rp) +1: +L(don): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm new file mode 100644 index 0000000..9876a47 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm @@ -0,0 +1,300 @@ +dnl AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3. + +dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 2.0 illop 1.0/1.0 N +C AMD K10 0.85 illop Y/N +C AMD bd1 0.70 0.66 Y +C AMD bd2 0.68 0.66 Y +C AMD bd3 ? ? +C AMD bd4 ? ? 
+C AMD bt1 1.97 8.16 1.5/1.5 N +C AMD bt2 0.77 0.93 0.65/opt N/Y +C AMD zn1 ? ? +C AMD zn2 ? ? +C Intel P4 2.26 illop Y/N +C Intel CNR 0.52 0.64 opt/opt Y +C Intel NHM 0.52 0.71 0.50/0.67 N +C Intel SBR 0.51 0.54 opt/0.51 Y +C Intel IBR 0.50 0.54 opt/opt Y +C Intel HWL 0.50 0.51 opt/opt Y +C Intel BWL 0.55 0.55 opt/opt Y +C Intel atom 1.16 1.61 opt/opt Y +C Intel SLM 1.02 1.07 opt/opt Y +C VIA nano 1.09 1.08 opt/opt Y + +C We use only 16-byte operations, except for unaligned top-most and bottom-most +C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That +C instruction is better adapted to mpn_copyd's needs, we need to contort the +C code to use it here. +C +C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop, +C taken from the x86_64 default code. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +C There are three instructions for loading an aligned 128-bit quantity. We use +C movaps, since it has the shortest coding. +dnl define(`movdqa', ``movaps'') + +ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_copyi) + FUNC_ENTRY(3) + + cmp $COPYI_SSE_THRESHOLD, n + jbe L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(rp_aligned) C jump if rp aligned + + movsq C copy one limb + dec n + +L(rp_aligned): + test $8, R8(up) + jnz L(uent) + +ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1, +` sub $8, n', +` jmp L(am)') + + ALIGN(16) +L(atop):movdqa 0(up), %xmm0 + movdqa 16(up), %xmm1 + movdqa 32(up), %xmm2 + movdqa 48(up), %xmm3 + lea 64(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + lea 64(rp), rp +L(am): sub $8, n + jnc L(atop) + + test $4, R8(n) + jz 1f + movdqa (up), %xmm0 + movdqa 16(up), %xmm1 + lea 32(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + lea 32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa (up), %xmm0 + lea 16(up), up + movdqa %xmm0, (rp) + lea 16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +L(uent): +C Code handling up - rp = 8 (mod 16) + + cmp $16, n + jc L(ued0) + +IFDOS(` add $-56, %rsp ') +IFDOS(` movdqa %xmm6, (%rsp) ') +IFDOS(` movdqa %xmm7, 16(%rsp) ') +IFDOS(` movdqa %xmm8, 32(%rsp) ') + + movaps 120(up), %xmm7 + movaps 104(up), %xmm6 + movaps 88(up), %xmm5 + movaps 72(up), %xmm4 + movaps 56(up), %xmm3 + movaps 40(up), %xmm2 + lea 128(up), up + sub $32, n + jc L(ued1) + + ALIGN(16) +L(utop):movaps -104(up), %xmm1 + sub $16, n + movaps -120(up), %xmm0 + palignr($8, %xmm6, %xmm7) + movaps -136(up), %xmm8 + movdqa %xmm7, 112(rp) + palignr($8, %xmm5, %xmm6) + movaps 120(up), %xmm7 + movdqa %xmm6, 96(rp) + palignr($8, %xmm4, %xmm5) + movaps 104(up), %xmm6 + movdqa %xmm5, 80(rp) + palignr($8, %xmm3, %xmm4) + movaps 88(up), %xmm5 + movdqa %xmm4, 64(rp) + palignr($8, %xmm2, %xmm3) + movaps 72(up), %xmm4 + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movaps 56(up), %xmm3 + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movaps 40(up), %xmm2 + movdqa %xmm1, 16(rp) + palignr($8, %xmm8, %xmm0) + lea 128(up), up + movdqa %xmm0, (rp) + lea 128(rp), rp + jnc L(utop) + +L(ued1):movaps -104(up), %xmm1 + movaps -120(up), %xmm0 + movaps -136(up), %xmm8 + palignr($8, %xmm6, %xmm7) + movdqa %xmm7, 112(rp) + palignr($8, %xmm5, %xmm6) + movdqa %xmm6, 96(rp) + palignr($8, %xmm4, %xmm5) + movdqa %xmm5, 80(rp) + palignr($8, %xmm3, %xmm4) + movdqa %xmm4, 64(rp) + palignr($8, %xmm2, %xmm3) + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, 
%xmm2) + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, 16(rp) + palignr($8, %xmm8, %xmm0) + movdqa %xmm0, (rp) + lea 128(rp), rp + +IFDOS(` movdqa (%rsp), %xmm6 ') +IFDOS(` movdqa 16(%rsp), %xmm7 ') +IFDOS(` movdqa 32(%rsp), %xmm8 ') +IFDOS(` add $56, %rsp ') + +L(ued0):test $8, R8(n) + jz 1f + movaps 56(up), %xmm3 + movaps 40(up), %xmm2 + movaps 24(up), %xmm1 + movaps 8(up), %xmm0 + movaps -8(up), %xmm4 + palignr($8, %xmm2, %xmm3) + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, 16(rp) + palignr($8, %xmm4, %xmm0) + lea 64(up), up + movdqa %xmm0, (rp) + lea 64(rp), rp + +1: test $4, R8(n) + jz 1f + movaps 24(up), %xmm1 + movaps 8(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movaps -8(up), %xmm3 + movdqa %xmm1, 16(rp) + palignr($8, %xmm3, %xmm0) + lea 32(up), up + movdqa %xmm0, (rp) + lea 32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa 8(up), %xmm0 + movdqa -8(up), %xmm3 + palignr($8, %xmm3, %xmm0) + lea 16(up), up + movdqa %xmm0, (rp) + lea 16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +C Basecase code. Needed for good small operands speed, not for +C correctness as the above code is currently written. + +L(bc): lea -8(rp), rp + sub $4, R32(n) + jc L(end) + + ALIGN(16) +L(top): mov (up), %r8 + mov 8(up), %r9 + lea 32(rp), rp + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + mov %r8, -24(rp) + mov %r9, -16(rp) +ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1, +` sub $4, R32(n)') + mov %r10, -8(rp) + mov %r11, (rp) +ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1, +` jnc L(top)') + +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, 8(rp) + lea 8(rp), rp + lea 8(up), up +1: test $2, R8(n) + jz 1f + mov (up), %r8 + mov 8(up), %r9 + mov %r8, 8(rp) + mov %r9, 16(rp) +1: FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm new file mode 100644 index 0000000..97f7865 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm @@ -0,0 +1,185 @@ +dnl AMD64 mpn_copyi optimised for CPUs with fast SSE. + +dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation, +dnl Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 +C AMD K10 0.85 1.64 Y/N +C AMD bull 1.4 1.4 N +C AMD pile 0.77 0.93 N +C AMD steam ? ? +C AMD excavator ? ? 
+C AMD bobcat +C AMD jaguar 0.65 1.02 opt/0.93 Y/N +C Intel P4 2.3 2.3 Y +C Intel core 1.0 1.0 0.52/0.64 N +C Intel NHM 0.5 0.67 Y +C Intel SBR 0.51 0.75 opt/0.54 Y/N +C Intel IBR 0.50 0.57 opt/0.54 Y +C Intel HWL 0.50 0.57 opt/0.51 Y +C Intel BWL 0.55 0.62 opt/0.55 Y +C Intel atom +C Intel SLM 1.02 1.27 opt/1.07 Y/N +C VIA nano 1.16 5.16 Y/N + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. We can always write using +C aligned 16-byte operations, we read with both aligned and unaligned 16-byte +C operations. + +C Instead of having separate loops for reading aligned and unaligned, we read +C using MOVDQU. This seems to work great except for core2; there performance +C doubles when reading using MOVDQA (for aligned source). It is unclear how to +C best handle the unaligned case there. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl define(`movdqu', lddqu) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_copyi) + FUNC_ENTRY(3) + + cmp $3, n C NB: bc code below assumes this limit + jc L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(ali) C jump if rp aligned + movsq C copy single limb + dec n + +L(ali): sub $16, n + jc L(sma) + +IFDOS(` add $-56, %rsp ') +IFDOS(` movdqa %xmm6, (%rsp) ') +IFDOS(` movdqa %xmm7, 16(%rsp) ') + + ALIGN(16) +L(top): movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + movdqu 32(up), %xmm2 + movdqu 48(up), %xmm3 + movdqu 64(up), %xmm4 + movdqu 80(up), %xmm5 + movdqu 96(up), %xmm6 + movdqu 112(up), %xmm7 + lea 128(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + movdqa %xmm4, 64(rp) + movdqa %xmm5, 80(rp) + movdqa %xmm6, 96(rp) + movdqa %xmm7, 112(rp) + lea 128(rp), rp + sub $16, n + jnc L(top) + +IFDOS(` movdqa (%rsp), %xmm6 ') +IFDOS(` movdqa 16(%rsp), %xmm7 ') +IFDOS(` add $56, %rsp ') + +L(sma): test $8, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + movdqu 32(up), %xmm2 + movdqu 48(up), %xmm3 + lea 64(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + lea 64(rp), rp +1: + test $4, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + lea 32(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + lea 32(rp), rp +1: + test $2, R8(n) + jz 1f + movdqu (up), %xmm0 + lea 16(up), up + movdqa %xmm0, (rp) + lea 16(rp), rp + ALIGN(16) +1: +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) +1: + FUNC_EXIT() + ret + +C Basecase code. Needed for good small operands speed, not for correctness as +C the above code is currently written. The commented-out lines need to be +C reinstated if this code is to be used for n > 3, and then the post loop +C offsets need fixing. + +L(bc): sub $2, n + jc L(end) + ALIGN(16) +1: mov (up), %rax + mov 8(up), %rcx +dnl lea 16(up), up + mov %rax, (rp) + mov %rcx, 8(rp) +dnl lea 16(rp), rp +dnl sub $2, n +dnl jnc 1b + + test $1, R8(n) + jz L(ret) + mov 16(up), %rax + mov %rax, 16(rp) +L(ret): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm new file mode 100644 index 0000000..a05e850 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm @@ -0,0 +1,182 @@ +dnl AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2010-2012 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 3 3 2.35 no, use shl/shr +C AMD K10 1.5-1.8 1.5-1.8 1.33 yes +C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes +C AMD bobcat 3.17 3.17 yes, bad for n < 20 +C Intel P4 4.67 4.67 2.7 no, slow movdqu +C Intel core2 2.15 2.15 1.25 no, use shld/shrd +C Intel NHM 1.66 1.66 1.25 no, use shld/shrd +C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6 +C Intel atom 11.7 11.7 4.5 no +C VIA nano 5.7 5.95 2.0 no, slow movdqu + +C We try to do as many aligned 16-byte operations as possible. The top-most +C and bottom-most writes might need 8-byte operations. +C +C This variant rely on fast load movdqu, and uses it even for aligned operands, +C in order to avoid the need for two separate loops. +C +C TODO +C * Could 2-limb wind-down code be simplified? +C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts +C for other affected CPUs. 
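+
+C For reference, the function computes the same result as this portable C
+C sketch (illustrative only, assuming 64-bit limbs and 1 <= cnt <= 63; the
+C name ref_lshift is invented for this comment and is not a GMP entry point):
+C
+C	mp_limb_t
+C	ref_lshift (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
+C	{
+C	  mp_limb_t retval = ap[n - 1] >> (64 - cnt); /* bits shifted out at the top */
+C	  mp_size_t i;
+C	  for (i = n - 1; i > 0; i--)	/* high-to-low, so rp >= ap overlap is fine */
+C	    rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
+C	  rp[0] = ap[0] << cnt;
+C	  return retval;
+C	}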
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov -8(ap,n,8), %rax + shr R8(%rcx), %rax + + cmp $3, n + jle L(bc) + + lea (rp,n,8), R32(%rcx) + test $8, R8(%rcx) + jz L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq -8(ap,n,8), %xmm0 + movq -16(ap,n,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, -8(rp,n,8) + dec n + +L(rp_aligned): + lea 1(n), %r8d + + and $6, R32(%r8) + jz L(ba0) + cmp $4, R32(%r8) + jz L(ba4) + jc L(ba2) +L(ba6): add $-4, n + jmp L(i56) +L(ba0): add $-6, n + jmp L(i70) +L(ba4): add $-2, n + jmp L(i34) +L(ba2): add $-8, n + jle L(end) + + ALIGN(16) +L(top): movdqu 40(ap,n,8), %xmm1 + movdqu 48(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, 48(rp,n,8) +L(i70): + movdqu 24(ap,n,8), %xmm1 + movdqu 32(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, 32(rp,n,8) +L(i56): + movdqu 8(ap,n,8), %xmm1 + movdqu 16(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, 16(rp,n,8) +L(i34): + movdqu -8(ap,n,8), %xmm1 + movdqu (ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, (rp,n,8) + sub $8, n + jg L(top) + +L(end): test $1, R8(n) + jnz L(end8) + + movdqu (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret + +C Basecase + ALIGN(16) +L(bc): dec R32(n) + jz L(end8) + + movq (ap,n,8), %xmm1 + movq -8(ap,n,8), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, (rp,n,8) + sub $2, R32(n) + jl L(end8) + movq 8(ap), %xmm1 + movq (ap), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, 8(rp) + +L(end8):movq (ap), %xmm0 + psllq %xmm4, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm new file mode 100644 index 0000000..6a17b93 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm @@ -0,0 +1,173 @@ +dnl AMD64 mpn_lshift optimised for CPUs with fast SSE. + +dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund. + +dnl Copyright 2010-2012, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb good +C 16-byte aligned 16-byte unaligned for cpu? +C AMD K8,K9 ? ? +C AMD K10 1.68 (1.45) 1.75 (1.49) Y +C AMD bd1 1.82 (1.75) 1.82 (1.75) Y +C AMD bobcat 4 4 +C Intel P4 3 (2.7) 3 (2.7) Y +C Intel core2 2.05 (1.67) 2.55 (1.75) +C Intel NHM 2.05 (1.75) 2.09 (2) +C Intel SBR 1.5 (1.3125) 1.5 (1.4375) Y +C Intel atom ? ? +C VIA nano 2.25 (2) 2.5 (2) Y + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. + +C There are two inner-loops, one for when rp = ap (mod 16) and one when this is +C not true. The aligned case reads 16+8 bytes, the unaligned case reads +C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented. + +C This is not yet great code: +C (1) The unaligned case makes many reads. +C (2) We should do some unrolling, at least 2-way. +C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on +C Nano. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov -8(ap,n,8), %rax + shr R8(%rcx), %rax + + cmp $2, n + jle L(le2) + + lea (rp,n,8), R32(%rcx) + test $8, R8(%rcx) + je L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq -8(ap,n,8), %xmm0 + movq -16(ap,n,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, -8(rp,n,8) + dec n + +L(rp_aligned): + lea (ap,n,8), R32(%rcx) + test $8, R8(%rcx) + je L(aent) + jmp L(uent) +C ***************************************************************************** + +C Handle the case when ap != rp (mod 16). + + ALIGN(16) +L(utop):movdqa -8(ap,n,8), %xmm0 + movq (ap,n,8), %xmm1 + punpcklqdq 8(ap,n,8), %xmm1 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (rp,n,8) +L(uent):sub $2, n + ja L(utop) + + jne L(end8) + + movq (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + punpcklqdq 8(ap), %xmm1 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret +C ***************************************************************************** + +C Handle the case when ap = rp (mod 16). 
+ + ALIGN(16) +L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2] + movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3] + punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3] + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, (rp,n,8) +L(aent): + sub $2, n + ja L(atop) + jne L(end8) + + movdqa (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret +C ***************************************************************************** + + ALIGN(16) +L(le2): jne L(end8) + + movq 8(ap), %xmm0 + movq (ap), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, 8(rp) + +L(end8):movq (ap), %xmm0 + psllq %xmm4, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm new file mode 100644 index 0000000..8250910 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm @@ -0,0 +1,193 @@ +dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 3 3 ? no, use shl/shr +C AMD K10 1.8-2.0 1.8-2.0 ? yes +C AMD bd1 1.9 1.9 ? yes +C AMD bobcat 3.67 3.67 yes, bad for n < 20 +C Intel P4 4.75 4.75 ? no, slow movdqu +C Intel core2 2.27 2.27 ? no, use shld/shrd +C Intel NHM 2.15 2.15 ? no, use shld/shrd +C Intel SBR 1.45 1.45 ? yes, bad for n = 4-6 +C Intel atom 12.9 12.9 ? no +C VIA nano 6.18 6.44 ? no, slow movdqu + +C We try to do as many aligned 16-byte operations as possible. The top-most +C and bottom-most writes might need 8-byte operations. +C +C This variant rely on fast load movdqu, and uses it even for aligned operands, +C in order to avoid the need for two separate loops. +C +C TODO +C * Could 2-limb wind-down code be simplified? +C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts +C for other affected CPUs. 
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov -8(ap,n,8), %rax + shr R8(%rcx), %rax + + pcmpeqb %xmm3, %xmm3 C set to 111...111 + + cmp $3, n + jle L(bc) + + lea (rp,n,8), R32(%rcx) + test $8, R8(%rcx) + jz L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq -8(ap,n,8), %xmm0 + movq -16(ap,n,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, -8(rp,n,8) + dec n + +L(rp_aligned): + lea 1(n), %r8d + + and $6, R32(%r8) + jz L(ba0) + cmp $4, R32(%r8) + jz L(ba4) + jc L(ba2) +L(ba6): add $-4, n + jmp L(i56) +L(ba0): add $-6, n + jmp L(i70) +L(ba4): add $-2, n + jmp L(i34) +L(ba2): add $-8, n + jle L(end) + + ALIGN(16) +L(top): movdqu 40(ap,n,8), %xmm1 + movdqu 48(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, 48(rp,n,8) +L(i70): + movdqu 24(ap,n,8), %xmm1 + movdqu 32(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, 32(rp,n,8) +L(i56): + movdqu 8(ap,n,8), %xmm1 + movdqu 16(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, 16(rp,n,8) +L(i34): + movdqu -8(ap,n,8), %xmm1 + movdqu (ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, (rp,n,8) + sub $8, n + jg L(top) + +L(end): test $1, R8(n) + jnz L(end8) + + movdqu (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret + +C Basecase + ALIGN(16) +L(bc): dec R32(n) + jz L(end8) + + movq (ap,n,8), %xmm1 + movq -8(ap,n,8), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, (rp,n,8) + sub $2, R32(n) + jl L(end8) + movq 8(ap), %xmm1 + movq (ap), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, 8(rp) + +L(end8):movq (ap), %xmm0 + psllq %xmm4, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm new file mode 100644 index 0000000..a616075 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm @@ -0,0 +1,183 @@ +dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE. + +dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund. + +dnl Copyright 2010-2012, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb good +C 16-byte aligned 16-byte unaligned for cpu? +C AMD K8,K9 ? ? +C AMD K10 1.85 (1.635) 1.9 (1.67) Y +C AMD bd1 1.82 (1.75) 1.82 (1.75) Y +C AMD bobcat 4.5 4.5 +C Intel P4 3.6 (3.125) 3.6 (3.125) Y +C Intel core2 2.05 (1.67) 2.55 (1.75) +C Intel NHM 2.05 (1.875) 2.6 (2.25) +C Intel SBR 1.55 (1.44) 2 (1.57) Y +C Intel atom ? ? +C VIA nano 2.5 (2.5) 2.5 (2.5) Y + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. We always write using +C 16-byte operations, we read with both 8-byte and 16-byte operations. + +C There are two inner-loops, one for when rp = ap (mod 16) and one when this is +C not true. The aligned case reads 16+8 bytes, the unaligned case reads +C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented. + +C This is not yet great code: +C (1) The unaligned case makes too many reads. +C (2) We should do some unrolling, at least 2-way. +C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on +C Nano. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov -8(ap,n,8), %rax + shr R8(%rcx), %rax + + pcmpeqb %xmm2, %xmm2 C set to 111...111 + + cmp $2, n + jle L(le2) + + lea (rp,n,8), R32(%rcx) + test $8, R8(%rcx) + je L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq -8(ap,n,8), %xmm0 + movq -16(ap,n,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movq %xmm0, -8(rp,n,8) + dec n + +L(rp_aligned): + lea (ap,n,8), R32(%rcx) + test $8, R8(%rcx) + je L(aent) + jmp L(uent) +C ***************************************************************************** + +C Handle the case when ap != rp (mod 16). + + ALIGN(16) +L(utop):movq (ap,n,8), %xmm1 + punpcklqdq 8(ap,n,8), %xmm1 + movdqa -8(ap,n,8), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, (rp,n,8) +L(uent):sub $2, n + ja L(utop) + + jne L(end8) + + movq (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + punpcklqdq 8(ap), %xmm1 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret +C ***************************************************************************** + +C Handle the case when ap = rp (mod 16). 
+ + ALIGN(16) +L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2] + movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3] + punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3] + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, (rp,n,8) +L(aent):sub $2, n + ja L(atop) + + jne L(end8) + + movdqa (ap), %xmm0 + pxor %xmm1, %xmm1 + punpcklqdq %xmm0, %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret +C ***************************************************************************** + + ALIGN(16) +L(le2): jne L(end8) + + movq 8(ap), %xmm0 + movq (ap), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movq %xmm0, 8(rp) + +L(end8):movq (ap), %xmm0 + psllq %xmm4, %xmm0 + pxor %xmm2, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm b/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm new file mode 100644 index 0000000..1e270b1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm @@ -0,0 +1,201 @@ +dnl AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 3 3 2.35 no, use shl/shr +C AMD K10 1.5-1.8 1.5-1.8 1.33 yes +C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes +C AMD bobcat 3.17 3.17 yes, bad for n < 20 +C Intel P4 4.67 4.67 2.7 no, slow movdqu +C Intel core2 2.15 2.15 1.25 no, use shld/shrd +C Intel NHM 1.66 1.66 1.25 no, use shld/shrd +C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6 +C Intel atom 11.7 11.7 4.5 no +C VIA nano 5.7 5.95 2.0 no, slow movdqu + +C We try to do as many aligned 16-byte operations as possible. The top-most +C and bottom-most writes might need 8-byte operations. +C +C This variant rely on fast load movdqu, and uses it even for aligned operands, +C in order to avoid the need for two separate loops. +C +C TODO +C * Could 2-limb wind-down code be simplified? +C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts +C for other affected CPUs. 
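+
+C For reference, the function computes the same result as this portable C
+C sketch (illustrative only, assuming 64-bit limbs and 1 <= cnt <= 63; the
+C name ref_rshift is invented for this comment and is not a GMP entry point):
+C
+C	mp_limb_t
+C	ref_rshift (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
+C	{
+C	  mp_limb_t retval = ap[0] << (64 - cnt); /* bits shifted out at the bottom */
+C	  mp_size_t i;
+C	  for (i = 0; i < n - 1; i++)	/* low-to-high, so rp <= ap overlap is fine */
+C	    rp[i] = (ap[i] >> cnt) | (ap[i + 1] << (64 - cnt));
+C	  rp[n - 1] = ap[n - 1] >> cnt;
+C	  return retval;
+C	}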
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov (ap), %rax + shl R8(%rcx), %rax + + cmp $3, n + jle L(bc) + + test $8, R8(rp) + jz L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq (ap), %xmm0 + movq 8(ap), %xmm1 + psrlq %xmm4, %xmm0 + psllq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, (rp) + lea 8(ap), ap + lea 8(rp), rp + dec n + +L(rp_aligned): + lea 1(n), %r8d + lea (ap,n,8), ap + lea (rp,n,8), rp + neg n + + and $6, R32(%r8) + jz L(bu0) + cmp $4, R32(%r8) + jz L(bu4) + jc L(bu2) +L(bu6): add $4, n + jmp L(i56) +L(bu0): add $6, n + jmp L(i70) +L(bu4): add $2, n + jmp L(i34) +L(bu2): add $8, n + jge L(end) + + ALIGN(16) +L(top): movdqu -64(ap,n,8), %xmm1 + movdqu -56(ap,n,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -64(rp,n,8) +L(i70): + movdqu -48(ap,n,8), %xmm1 + movdqu -40(ap,n,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -48(rp,n,8) +L(i56): + movdqu -32(ap,n,8), %xmm1 + movdqu -24(ap,n,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -32(rp,n,8) +L(i34): + movdqu -16(ap,n,8), %xmm1 + movdqu -8(ap,n,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -16(rp,n,8) + add $8, n + jl L(top) + +L(end): test $1, R8(n) + jnz L(e1) + + movdqu -16(ap), %xmm1 + movq -8(ap), %xmm0 + psrlq %xmm4, %xmm1 + psllq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, -16(rp) + FUNC_EXIT() + ret + +L(e1): movq -8(ap), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, -8(rp) + FUNC_EXIT() + ret + +C Basecase + ALIGN(16) +L(bc): dec R32(n) + jnz 1f + movq (ap), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret + +1: movq (ap), %xmm1 + movq 8(ap), %xmm0 + psrlq %xmm4, %xmm1 + psllq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, (rp) + dec R32(n) + jnz 1f + movq 8(ap), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, 8(rp) + FUNC_EXIT() + ret + +1: movq 8(ap), %xmm1 + movq 16(ap), %xmm0 + psrlq %xmm4, %xmm1 + psllq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, 8(rp) + movq 16(ap), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm new file mode 100644 index 0000000..e7b7feb --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm @@ -0,0 +1,204 @@ +dnl AMD64 SSE mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cycles/limb +C ali,evn n unal,evn n other cases +C AMD K8,K9 1.65 1.65 1.8 +C AMD K10 0.78 0.78 0.85 +C AMD bd1 0.80 0.91 1.25 +C AMD bobcat 2.15 2.15 2.37 +C Intel P4 2.5 2.5 2.95 +C Intel core2 1.17 1.25 1.25 +C Intel NHM 0.87 0.90 0.90 +C Intel SBR 0.63 0.79 0.77 +C Intel atom 4.3 4.3 4.3 slower than plain code +C VIA nano 1.4 5.1 3.14 too alignment dependent + +C NOTES +C * We only honour the least significant 32 bits of the `which' and `nents' +C arguments to allow efficient code using just SSE2. We would need to +C either use the SSE4_1 pcmpeqq, or find some other SSE2 sequence. +C * We use movd for copying between xmm and plain registers, since old gas +C rejects movq. But gas assembles movd as movq when given a 64-bit greg. + +define(`rp', `%rdi') +define(`tp', `%rsi') +define(`n', `%rdx') +define(`nents', `%rcx') +define(`which', `%r8') + +define(`i', `%r10') +define(`j', `%r9') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C nents n rp tab which j i temp * * * * + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sec_tabselect) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + +IFDOS(` add $-88, %rsp ') +IFDOS(` movdqu %xmm6, (%rsp) ') +IFDOS(` movdqu %xmm7, 16(%rsp) ') +IFDOS(` movdqu %xmm8, 32(%rsp) ') +IFDOS(` movdqu %xmm9, 48(%rsp) ') + + movd which, %xmm8 + pshufd $0, %xmm8, %xmm8 C 4 `which' copies + mov $1, R32(%rax) + movd %rax, %xmm9 + pshufd $0, %xmm9, %xmm9 C 4 copies of 1 + + mov n, j + add $-8, j + js L(outer_end) + +L(outer_top): + mov nents, i + mov tp, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + ALIGN(16) +L(top): movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movdqu 0(tp), %xmm2 + movdqu 16(tp), %xmm3 + pand %xmm0, %xmm2 + pand %xmm0, %xmm3 + por %xmm2, %xmm4 + por %xmm3, %xmm5 + movdqu 32(tp), %xmm2 + movdqu 48(tp), %xmm3 + pand %xmm0, %xmm2 + pand %xmm0, %xmm3 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + lea (tp,n,8), tp + add $-1, i + jne L(top) + + movdqu %xmm4, 0(rp) + movdqu %xmm5, 16(rp) + movdqu %xmm6, 32(rp) + movdqu %xmm7, 48(rp) + + lea 64(%r11), tp + lea 64(rp), rp + add $-8, j + jns L(outer_top) +L(outer_end): + + test $4, R8(n) + je L(b0xx) +L(b1xx):mov nents, i + mov tp, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + ALIGN(16) +L(tp4): movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movdqu 0(tp), %xmm2 + movdqu 16(tp), %xmm3 + pand %xmm0, %xmm2 + pand %xmm0, %xmm3 + por %xmm2, %xmm4 + por %xmm3, %xmm5 + lea (tp,n,8), tp + add $-1, i + jne L(tp4) + movdqu %xmm4, 0(rp) + movdqu %xmm5, 16(rp) + lea 32(%r11), tp + lea 32(rp), rp + +L(b0xx):test $2, R8(n) + je L(b00x) +L(b01x):mov nents, i + mov tp, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + ALIGN(16) +L(tp2): movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movdqu 0(tp), %xmm2 + pand %xmm0, %xmm2 + por %xmm2, %xmm4 + lea (tp,n,8), tp + add $-1, i + jne L(tp2) + movdqu %xmm4, 0(rp) + lea 16(%r11), tp + 
lea 16(rp), rp + +L(b00x):test $1, R8(n) + je L(b000) +L(b001):mov nents, i + mov tp, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + ALIGN(16) +L(tp1): movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movq 0(tp), %xmm2 + pand %xmm0, %xmm2 + por %xmm2, %xmm4 + lea (tp,n,8), tp + add $-1, i + jne L(tp1) + movq %xmm4, 0(rp) + +L(b000): +IFDOS(` movdqu (%rsp), %xmm6 ') +IFDOS(` movdqu 16(%rsp), %xmm7 ') +IFDOS(` movdqu 32(%rsp), %xmm8 ') +IFDOS(` movdqu 48(%rsp), %xmm9 ') +IFDOS(` add $88, %rsp ') + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fat/addmul_2.c b/gmp-6.3.0/mpn/x86_64/fat/addmul_2.c new file mode 100644 index 0000000..e0d7358 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/addmul_2.c @@ -0,0 +1,38 @@ +/* Fat binary fallback mpn_addmul_2. + +Copyright 2016 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +mp_limb_t +mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, const mp_limb_t vp[2]) +{ + rp[n] = mpn_addmul_1 (rp, up, n, vp[0]); + return mpn_addmul_1 (rp + 1, up, n, vp[1]); +} diff --git a/gmp-6.3.0/mpn/x86_64/fat/fat.c b/gmp-6.3.0/mpn/x86_64/fat/fat.c new file mode 100644 index 0000000..cc35afa --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/fat.c @@ -0,0 +1,473 @@ +/* x86_64 fat binary initializers. + + Contributed to the GNU project by Kevin Ryde (original x86_32 code) and + Torbjorn Granlund (port to x86_64) + + THE FUNCTIONS AND VARIABLES IN THIS FILE ARE FOR INTERNAL USE ONLY. + THEY'RE ALMOST CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR + COMPLETELY IN FUTURE GNU MP RELEASES. + +Copyright 2003, 2004, 2009, 2011-2015, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include /* for printf */ +#include /* for getenv */ +#include + +#include "gmp-impl.h" + +/* Change this to "#define TRACE(x) x" for some traces. */ +#define TRACE(x) + + +/* fat_entry.asm */ +long __gmpn_cpuid (char [12], int); + + +#if WANT_FAKE_CPUID +/* The "name"s in the table are values for the GMP_CPU_TYPE environment + variable. Anything can be used, but for now it's the canonical cpu types + as per config.guess/config.sub. */ + +#define __gmpn_cpuid fake_cpuid + +#define MAKE_FMS(family, model) \ + ((((family) & 0xf) << 8) + (((family) & 0xff0) << 20) \ + + (((model) & 0xf) << 4) + (((model) & 0xf0) << 12)) + +static struct { + const char *name; + const char *vendor; + unsigned fms; +} fake_cpuid_table[] = { + { "core2", "GenuineIntel", MAKE_FMS (6, 0xf) }, + { "nehalem", "GenuineIntel", MAKE_FMS (6, 0x1a) }, + { "nhm", "GenuineIntel", MAKE_FMS (6, 0x1a) }, + { "atom", "GenuineIntel", MAKE_FMS (6, 0x1c) }, + { "westmere", "GenuineIntel", MAKE_FMS (6, 0x25) }, + { "wsm", "GenuineIntel", MAKE_FMS (6, 0x25) }, + { "sandybridge","GenuineIntel", MAKE_FMS (6, 0x2a) }, + { "sbr", "GenuineIntel", MAKE_FMS (6, 0x2a) }, + { "silvermont", "GenuineIntel", MAKE_FMS (6, 0x37) }, + { "slm", "GenuineIntel", MAKE_FMS (6, 0x37) }, + { "haswell", "GenuineIntel", MAKE_FMS (6, 0x3c) }, + { "hwl", "GenuineIntel", MAKE_FMS (6, 0x3c) }, + { "broadwell", "GenuineIntel", MAKE_FMS (6, 0x3d) }, + { "bwl", "GenuineIntel", MAKE_FMS (6, 0x3d) }, + { "skylake", "GenuineIntel", MAKE_FMS (6, 0x5e) }, + { "sky", "GenuineIntel", MAKE_FMS (6, 0x5e) }, + { "pentium4", "GenuineIntel", MAKE_FMS (15, 3) }, + + { "k8", "AuthenticAMD", MAKE_FMS (15, 0) }, + { "k10", "AuthenticAMD", MAKE_FMS (16, 0) }, + { "bobcat", "AuthenticAMD", MAKE_FMS (20, 1) }, + { "bulldozer", "AuthenticAMD", MAKE_FMS (21, 1) }, + { "piledriver", "AuthenticAMD", MAKE_FMS (21, 2) }, + { "steamroller","AuthenticAMD", MAKE_FMS (21, 0x30) }, + { "excavator", "AuthenticAMD", MAKE_FMS (21, 0x60) }, + { "jaguar", "AuthenticAMD", MAKE_FMS (22, 1) }, + { "zen", "AuthenticAMD", MAKE_FMS (23, 1) }, + + { "nano", "CentaurHauls", MAKE_FMS (6, 15) }, +}; + +static int +fake_cpuid_lookup (void) +{ + char *s; + int i; + + s = getenv ("GMP_CPU_TYPE"); + if (s == NULL) + { + printf ("Need GMP_CPU_TYPE environment variable for fake cpuid\n"); + abort (); + } + + for (i = 0; i < numberof (fake_cpuid_table); i++) + if (strcmp (s, fake_cpuid_table[i].name) == 0) + return i; + + printf ("GMP_CPU_TYPE=%s unknown\n", s); + abort (); +} + +static long +fake_cpuid (char dst[12], unsigned int id) +{ + int i = fake_cpuid_lookup(); + + switch (id) { + case 0: + memcpy (dst, fake_cpuid_table[i].vendor, 12); + return 0; + case 1: + return fake_cpuid_table[i].fms; + case 7: + dst[0] = 0xff; /* BMI1, AVX2, etc */ + dst[1] = 0xff; /* BMI2, etc */ + return 0; + case 0x80000001: + dst[4 + 29 / 8] = (1 << (29 % 8)); /* "long" mode */ + return 0; + default: + printf ("fake_cpuid(): oops, unknown id %d\n", id); + abort (); + } +} +#endif + + +typedef DECL_preinv_divrem_1 ((*preinv_divrem_1_t)); +typedef DECL_preinv_mod_1 ((*preinv_mod_1_t)); + +struct cpuvec_t __gmpn_cpuvec = { + __MPN(add_n_init), + __MPN(addlsh1_n_init), + __MPN(addlsh2_n_init), + __MPN(addmul_1_init), + __MPN(addmul_2_init), + __MPN(bdiv_dbm1c_init), + __MPN(cnd_add_n_init), + __MPN(cnd_sub_n_init), + __MPN(com_init), + __MPN(copyd_init), + __MPN(copyi_init), + __MPN(divexact_1_init), + __MPN(divrem_1_init), + __MPN(gcd_11_init), + __MPN(lshift_init), + __MPN(lshiftc_init), + __MPN(mod_1_init), + 
__MPN(mod_1_1p_init), + __MPN(mod_1_1p_cps_init), + __MPN(mod_1s_2p_init), + __MPN(mod_1s_2p_cps_init), + __MPN(mod_1s_4p_init), + __MPN(mod_1s_4p_cps_init), + __MPN(mod_34lsub1_init), + __MPN(modexact_1c_odd_init), + __MPN(mul_1_init), + __MPN(mul_basecase_init), + __MPN(mullo_basecase_init), + __MPN(preinv_divrem_1_init), + __MPN(preinv_mod_1_init), + __MPN(redc_1_init), + __MPN(redc_2_init), + __MPN(rshift_init), + __MPN(sqr_basecase_init), + __MPN(sub_n_init), + __MPN(sublsh1_n_init), + __MPN(submul_1_init), + 0 +}; + +int __gmpn_cpuvec_initialized = 0; + +/* The following setups start with generic x86, then overwrite with + specifics for a chip, and higher versions of that chip. + + The arrangement of the setups here will normally be the same as the $path + selections in configure.in for the respective chips. + + This code is reentrant and thread safe. We always calculate the same + decided_cpuvec, so if two copies of the code are running it doesn't + matter which completes first, both write the same to __gmpn_cpuvec. + + We need to go via decided_cpuvec because if one thread has completed + __gmpn_cpuvec then it may be making use of the threshold values in that + vector. If another thread is still running __gmpn_cpuvec_init then we + don't want it to write different values to those fields since some of the + asm routines only operate correctly up to their own defined threshold, + not an arbitrary value. */ + +static int +gmp_workaround_skylake_cpuid_bug () +{ + char feature_string[49]; + char processor_name_string[49]; + static const char *bad_cpus[] = {" G44", " G45", " G39" /* , "6600" */ }; + int i; + + /* Example strings: */ + /* "Intel(R) Pentium(R) CPU G4400 @ 3.30GHz" */ + /* "Intel(R) Core(TM) i5-6600K CPU @ 3.50GHz" */ + /* ^ ^ ^ */ + /* 0x80000002 0x80000003 0x80000004 */ + /* We match out just the 0x80000003 part here. */ + + /* In their infinitive wisdom, Intel decided to use one register order for + the vendor string, and another for the processor name string. We shuffle + things about here, rather than write a new variant of our assembly cpuid. + */ + + unsigned int eax, ebx, ecx, edx; + eax = __gmpn_cpuid (feature_string, 0x80000003); + ebx = ((unsigned int *)feature_string)[0]; + edx = ((unsigned int *)feature_string)[1]; + ecx = ((unsigned int *)feature_string)[2]; + + ((unsigned int *) (processor_name_string))[0] = eax; + ((unsigned int *) (processor_name_string))[1] = ebx; + ((unsigned int *) (processor_name_string))[2] = ecx; + ((unsigned int *) (processor_name_string))[3] = edx; + + processor_name_string[16] = 0; + + for (i = 0; i < sizeof (bad_cpus) / sizeof (char *); i++) + { + if (strstr (processor_name_string, bad_cpus[i]) != 0) + return 1; + } + return 0; +} + +enum {BMI2_BIT = 8}; + +void +__gmpn_cpuvec_init (void) +{ + struct cpuvec_t decided_cpuvec; + char vendor_string[13]; + char dummy_string[12]; + long fms; + int family, model; + + TRACE (printf ("__gmpn_cpuvec_init:\n")); + + memset (&decided_cpuvec, '\0', sizeof (decided_cpuvec)); + + CPUVEC_SETUP_x86_64; + CPUVEC_SETUP_fat; + + __gmpn_cpuid (vendor_string, 0); + vendor_string[12] = 0; + + fms = __gmpn_cpuid (dummy_string, 1); + family = ((fms >> 8) & 0xf) + ((fms >> 20) & 0xff); + model = ((fms >> 4) & 0xf) + ((fms >> 12) & 0xf0); + + /* Check extended feature flags */ + __gmpn_cpuid (dummy_string, 0x80000001); + if ((dummy_string[4 + 29 / 8] & (1 << (29 % 8))) == 0) + abort (); /* longmode-capable-bit turned off! 
*/ + + /*********************************************************/ + /*** WARNING: keep this list in sync with config.guess ***/ + /*********************************************************/ + if (strcmp (vendor_string, "GenuineIntel") == 0) + { + switch (family) + { + case 6: + switch (model) + { + case 0x0f: /* Conroe Merom Kentsfield Allendale */ + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: /* PNR Wolfdale Yorkfield */ + case 0x18: + case 0x19: + case 0x1d: /* PNR Dunnington */ + CPUVEC_SETUP_core2; + break; + + case 0x1c: /* Atom Silverthorne */ + case 0x26: /* Atom Lincroft */ + case 0x27: /* Atom Saltwell? */ + case 0x36: /* Atom Cedarview/Saltwell */ + CPUVEC_SETUP_atom; + break; + + case 0x1a: /* NHM Gainestown */ + case 0x1b: + case 0x1e: /* NHM Lynnfield/Jasper */ + case 0x1f: + case 0x20: + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: /* WSM Clarkdale/Arrandale */ + case 0x28: + case 0x29: + case 0x2b: + case 0x2c: /* WSM Gulftown */ + case 0x2e: /* NHM Beckton */ + case 0x2f: /* WSM Eagleton */ + CPUVEC_SETUP_core2; + CPUVEC_SETUP_coreinhm; + break; + + case 0x37: /* Silvermont */ + case 0x4a: /* Silvermont */ + case 0x4c: /* Airmont */ + case 0x4d: /* Silvermont/Avoton */ + case 0x5a: /* Silvermont */ + CPUVEC_SETUP_atom; + CPUVEC_SETUP_silvermont; + break; + + case 0x5c: /* Goldmont */ + case 0x5f: /* Goldmont */ + case 0x7a: /* Goldmont Plus */ + CPUVEC_SETUP_atom; + CPUVEC_SETUP_silvermont; + CPUVEC_SETUP_goldmont; + break; + + case 0x2a: /* SB */ + case 0x2d: /* SBC-EP */ + case 0x3a: /* IBR */ + case 0x3e: /* IBR Ivytown */ + CPUVEC_SETUP_core2; + CPUVEC_SETUP_coreinhm; + CPUVEC_SETUP_coreisbr; + break; + case 0x3c: /* Haswell client */ + case 0x3f: /* Haswell server */ + case 0x45: /* Haswell ULT */ + case 0x46: /* Crystal Well */ + CPUVEC_SETUP_core2; + CPUVEC_SETUP_coreinhm; + CPUVEC_SETUP_coreisbr; + /* Some Haswells lack BMI2. Let them appear as Sandybridges for + now. 
*/ + __gmpn_cpuid (dummy_string, 7); + if ((dummy_string[0 + BMI2_BIT / 8] & (1 << (BMI2_BIT % 8))) == 0) + break; + CPUVEC_SETUP_coreihwl; + break; + case 0x3d: /* Broadwell */ + case 0x47: /* Broadwell */ + case 0x4f: /* Broadwell server */ + case 0x56: /* Broadwell microserver */ + CPUVEC_SETUP_core2; + CPUVEC_SETUP_coreinhm; + CPUVEC_SETUP_coreisbr; + if ((dummy_string[0 + BMI2_BIT / 8] & (1 << (BMI2_BIT % 8))) == 0) + break; + CPUVEC_SETUP_coreihwl; + CPUVEC_SETUP_coreibwl; + break; + case 0x4e: /* Skylake client */ + case 0x55: /* Skylake server */ + case 0x5e: /* Skylake */ + case 0x8e: /* Kabylake */ + case 0x9e: /* Kabylake */ + CPUVEC_SETUP_core2; + CPUVEC_SETUP_coreinhm; + CPUVEC_SETUP_coreisbr; + if ((dummy_string[0 + BMI2_BIT / 8] & (1 << (BMI2_BIT % 8))) == 0) + break; + if (gmp_workaround_skylake_cpuid_bug ()) + break; + CPUVEC_SETUP_coreihwl; + CPUVEC_SETUP_coreibwl; + CPUVEC_SETUP_skylake; + break; + } + break; + + case 15: + CPUVEC_SETUP_pentium4; + break; + } + } + else if (strcmp (vendor_string, "AuthenticAMD") == 0) + { + switch (family) + { + case 0x0f: /* k8 */ + case 0x11: /* "fam 11h", mix of k8 and k10 */ + case 0x13: + CPUVEC_SETUP_k8; + break; + + case 0x10: /* k10 */ + case 0x12: /* k10 (llano) */ + CPUVEC_SETUP_k8; + CPUVEC_SETUP_k10; + break; + + case 0x14: /* bobcat */ + CPUVEC_SETUP_k8; + CPUVEC_SETUP_k10; + CPUVEC_SETUP_bt1; + break; + + case 0x16: /* jaguar */ + CPUVEC_SETUP_k8; + CPUVEC_SETUP_k10; + CPUVEC_SETUP_bt1; + CPUVEC_SETUP_bt2; + break; + + case 0x15: /* bulldozer, piledriver, steamroller, excavator */ + CPUVEC_SETUP_k8; + CPUVEC_SETUP_k10; + CPUVEC_SETUP_bd1; + break; + + case 0x17: /* zen */ + case 0x19: /* zen3 */ + CPUVEC_SETUP_zen; + break; + } + } + else if (strcmp (vendor_string, "CentaurHauls") == 0) + { + switch (family) + { + case 6: + if (model >= 15) + CPUVEC_SETUP_nano; + break; + } + } + + /* There's no x86 generic mpn_preinv_divrem_1 or mpn_preinv_mod_1. + Instead default to the plain versions from whichever CPU we detected. + The function arguments are compatible, no need for any glue code. */ + if (decided_cpuvec.preinv_divrem_1 == NULL) + decided_cpuvec.preinv_divrem_1 =(preinv_divrem_1_t)decided_cpuvec.divrem_1; + if (decided_cpuvec.preinv_mod_1 == NULL) + decided_cpuvec.preinv_mod_1 =(preinv_mod_1_t) decided_cpuvec.mod_1; + + ASSERT_CPUVEC (decided_cpuvec); + CPUVEC_INSTALL (decided_cpuvec); + + /* Set this once the threshold fields are ready. + Use volatile to prevent it getting moved. */ + *((volatile int *) &__gmpn_cpuvec_initialized) = 1; +} diff --git a/gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm b/gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm new file mode 100644 index 0000000..5f244ac --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm @@ -0,0 +1,209 @@ +dnl x86 fat binary entrypoints. + +dnl Contributed to the GNU project by Kevin Ryde (original x86_32 code) and +dnl Torbjorn Granlund (port to x86_64) + +dnl Copyright 2003, 2009, 2011-2014, 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +dnl Forcibly disable profiling. +dnl +dnl The entrypoints and inits are small enough not to worry about, the real +dnl routines arrived at will have any profiling. Also, the way the code +dnl here ends with a jump means we won't work properly with the +dnl "instrument" profiling scheme anyway. + +define(`WANT_PROFILING',no) + + +dnl We define PRETEND_PIC as a helper symbol, the use it for suppressing +dnl normal, fast call code, since that triggers problems on Darwin, OpenBSD +dnl and some versions of GNU/Linux. This will go away when symbol hiding is +dnl finished. + +ifdef(`DARWIN', +`define(`PRETEND_PIC')') +ifdef(`OPENBSD', +`define(`PRETEND_PIC')') +ifdef(`LINUX', +`define(`PRETEND_PIC')') +ifdef(`PIC', +`define(`PRETEND_PIC')') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + + TEXT + +dnl Usage: FAT_ENTRY(name, offset) +dnl +dnl Emit a fat binary entrypoint function of the given name. This is the +dnl normal entry for applications, eg. __gmpn_add_n. +dnl +dnl The code simply jumps through the function pointer in __gmpn_cpuvec at +dnl the given "offset" (in bytes). +dnl +dnl For non-PIC, the jumps are 5 bytes each, aligning them to 8 should be +dnl fine for all x86s. +dnl +dnl For ELF/DARWIN PIC, the jumps are 20 bytes each, and are best aligned to +dnl 16 to ensure at least the first two instructions don't cross a cache line +dnl boundary. +dnl +dnl For DOS64, the jumps are 6 bytes. The same form works also for GNU/Linux +dnl (at least with certain assembler/linkers) but FreeBSD 8.2 crashes. Not +dnl tested on Darwin, Slowaris, NetBSD, etc. +dnl +dnl Note the extra `' ahead of PROLOGUE obscures it from the HAVE_NATIVE +dnl grepping in configure, stopping that code trying to eval something with +dnl $1 in it. + +define(FAT_ENTRY, +m4_assert_numargs(2) +`ifdef(`HOST_DOS64', +` ALIGN(8) +`'PROLOGUE($1) + jmp *$2+GSYM_PREFIX`'__gmpn_cpuvec(%rip) +EPILOGUE() +', +` ALIGN(ifdef(`PIC',16,8)) +`'PROLOGUE($1) +ifdef(`PRETEND_PIC', +` LEA( GSYM_PREFIX`'__gmpn_cpuvec, %rax) + jmp *$2(%rax) +',`dnl non-PIC + jmp *GSYM_PREFIX`'__gmpn_cpuvec+$2 +') +EPILOGUE() +')') + + +dnl FAT_ENTRY for each CPUVEC_FUNCS_LIST +dnl + +define(`CPUVEC_offset',0) +foreach(i, +`FAT_ENTRY(MPN(i),CPUVEC_offset) +define(`CPUVEC_offset',eval(CPUVEC_offset + 8))', +CPUVEC_FUNCS_LIST) + + +dnl Usage: FAT_INIT(name, offset) +dnl +dnl Emit a fat binary initializer function of the given name. These +dnl functions are the initial values for the pointers in __gmpn_cpuvec. +dnl +dnl The code simply calls __gmpn_cpuvec_init, and then jumps back through +dnl the __gmpn_cpuvec pointer, at the given "offset" (in bytes). +dnl __gmpn_cpuvec_init will have stored the address of the selected +dnl implementation there. +dnl +dnl Only one of these routines will be executed, and only once, since after +dnl that all the __gmpn_cpuvec pointers go to real routines. So there's no +dnl need for anything special here, just something small and simple. 
To +dnl keep code size down, "fat_init" is a shared bit of code, arrived at +dnl with the offset in %al. %al is used since the movb instruction is 2 +dnl bytes where %eax would be 4. +dnl +dnl Note having `PROLOGUE in FAT_INIT obscures that PROLOGUE from the +dnl HAVE_NATIVE grepping in configure, preventing that code trying to eval +dnl something with $1 in it. +dnl +dnl We need to preserve parameter registers over the __gmpn_cpuvec_init call + +define(FAT_INIT, +m4_assert_numargs(2) +`PROLOGUE($1) + mov $`'$2, %al + jmp L(fat_init) +EPILOGUE() +') + +dnl FAT_INIT for each CPUVEC_FUNCS_LIST +dnl + +define(`CPUVEC_offset',0) +foreach(i, +`FAT_INIT(MPN(i`'_init),CPUVEC_offset) +define(`CPUVEC_offset',eval(CPUVEC_offset + 1))', +CPUVEC_FUNCS_LIST) + +L(fat_init): + C al __gmpn_cpuvec byte offset + + movzbl %al, %eax +IFSTD(` push %rdi ') +IFSTD(` push %rsi ') + push %rdx + push %rcx + push %r8 + push %r9 + push %rax +IFDOS(` sub $32, %rsp ') + CALL( __gmpn_cpuvec_init) +IFDOS(` add $32, %rsp ') + pop %rax + pop %r9 + pop %r8 + pop %rcx + pop %rdx +IFSTD(` pop %rsi ') +IFSTD(` pop %rdi ') +ifdef(`PRETEND_PIC',` + LEA( GSYM_PREFIX`'__gmpn_cpuvec, %r10) + jmp *(%r10,%rax,8) +',`dnl non-PIC + jmp *GSYM_PREFIX`'__gmpn_cpuvec(,%rax,8) +') + + +C long __gmpn_cpuid (char dst[12], int id); +C +C This is called only 3 times, so just something simple and compact is fine. +C +C The rcx/ecx zeroing here is needed for the BMI2 check. + +define(`rp', `%rdi') +define(`idx', `%rsi') + +PROLOGUE(__gmpn_cpuid) + FUNC_ENTRY(2) + mov %rbx, %r8 + mov R32(idx), R32(%rax) + xor %ecx, %ecx + cpuid + mov %ebx, (rp) + mov %edx, 4(rp) + mov %ecx, 8(rp) + mov %r8, %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h new file mode 100644 index 0000000..005c893 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h @@ -0,0 +1,72 @@ +/* Fat binary x86_64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2003, 2009, 2011 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + + +/* mpn_divexact_1 is faster than mpn_divrem_1 at all sizes. The only time + this might not be true currently is for actual 80386 and 80486 chips, + where mpn/x86/dive_1.asm might be slower than mpn/x86/divrem_1.asm, but + that's not worth worrying about. 
*/ +#define DIVEXACT_1_THRESHOLD 0 + +/* Only some of the x86s have an mpn_preinv_divrem_1, but we set + USE_PREINV_DIVREM_1 so that all callers use it, and then let the + __gmpn_cpuvec pointer go to plain mpn_divrem_1 if there's not an actual + preinv. */ +#define USE_PREINV_DIVREM_1 1 + +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +/* mpn_sqr_basecase is faster than mpn_mul_basecase at all sizes, no need + for mpn_sqr to call the latter. */ +#define SQR_BASECASE_THRESHOLD 0 + +/* Sensible fallbacks for these, when not taken from a cpu-specific + gmp-mparam.h. */ +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 130 +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 200 + +/* These are values more or less in the middle of what the typical x86 chips + come out as. For a fat binary it's necessary to have values for these, + since the defaults for MUL_FFT_TABLE and SQR_FFT_TABLE otherwise come out + as non-constant array initializers. FIXME: Perhaps these should be done + in the cpuvec structure like other thresholds. */ +#define MUL_FFT_TABLE { 464, 928, 1920, 3584, 10240, 40960, 0 } +#define MUL_FFT_MODF_THRESHOLD 400 +#define MUL_FFT_THRESHOLD 2000 + +#define SQR_FFT_TABLE { 528, 1184, 1920, 4608, 14336, 40960, 0 } +#define SQR_FFT_MODF_THRESHOLD 500 +#define SQR_FFT_THRESHOLD 3000 diff --git a/gmp-6.3.0/mpn/x86_64/fat/mod_1.c b/gmp-6.3.0/mpn/x86_64/fat/mod_1.c new file mode 100644 index 0000000..4f149cc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/mod_1.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_mod_1. + +Copyright 2003, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/mod_1.c" diff --git a/gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c b/gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c new file mode 100644 index 0000000..d9eb471 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_mul_basecase. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/mul_basecase.c" diff --git a/gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c b/gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c new file mode 100644 index 0000000..7f86be6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_mullo_basecase. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/mullo_basecase.c" diff --git a/gmp-6.3.0/mpn/x86_64/fat/redc_1.c b/gmp-6.3.0/mpn/x86_64/fat/redc_1.c new file mode 100644 index 0000000..0025403 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/redc_1.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_redc_1. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/redc_1.c" diff --git a/gmp-6.3.0/mpn/x86_64/fat/redc_2.c b/gmp-6.3.0/mpn/x86_64/fat/redc_2.c new file mode 100644 index 0000000..1932d58 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/redc_2.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_redc_2. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/redc_2.c" diff --git a/gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c b/gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c new file mode 100644 index 0000000..d1c5dcd --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_sqr_basecase. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/sqr_basecase.c" diff --git a/gmp-6.3.0/mpn/x86_64/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/gcd_11.asm new file mode 100644 index 0000000..f9b3bcc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/gcd_11.asm @@ -0,0 +1,114 @@ +dnl AMD64 mpn_gcd_11 -- 1 x 1 gcd. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 5.5 +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 7.1 +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR ? +C Intel PNR ? +C Intel NHM ? +C Intel WSM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom 9.1 +C Intel SLM 6.9 +C Intel GLM 6.0 +C Intel GLM+ 5.8 +C VIA nano ? + + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. + +deflit(MAXSHIFT, 7) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + +DEF_OBJECT(ctz_table,64) + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') +END_OBJECT(ctz_table) + +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + LEA( ctz_table, %r8) + jmp L(ent) + + ALIGN(16) +L(top): cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) +L(mid): and $MASK, R32(%rdx) + movzbl (%r8,%rdx), R32(%rcx) + jz L(shift_alot) + shr R8(%rcx), u0 +L(ent): mov u0, %rax + mov v0, %rdx + sub u0, %rdx + sub v0, u0 + jnz L(top) + +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret + +L(shift_alot): + shr $MAXSHIFT, u0 + mov u0, %rdx + jmp L(mid) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/gcd_22.asm new file mode 100644 index 0000000..78f985f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/gcd_22.asm @@ -0,0 +1,163 @@ +dnl AMD64 mpn_gcd_22. Assumes useless bsf, useless shrd, no tzcnt, no shlx. +dnl We actually use tzcnt here, when table cannot count bits, as tzcnt always +dnl works for our use, and helps a lot for certain CPUs. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 8.9 +C AMD K10 8.8 +C AMD bd1 9.7 +C AMD bd2 7.8 +C AMD bd3 ? +C AMD bd4 7.4 +C AMD bt1 9.2 +C AMD bt2 9.1 +C AMD zn1 7.5 +C AMD zn2 7.5 +C Intel P4 ? +C Intel CNR 10.5 +C Intel PNR 10.5 +C Intel NHM 9.7 +C Intel WSM 9.7 +C Intel SBR 10.7 +C Intel IBR ? +C Intel HWL 9.5 +C Intel BWL 8.7 +C Intel SKL 8.6 +C Intel atom 18.9 +C Intel SLM 14.0 +C Intel GLM 9.8 +C Intel GLM+ 8.8 +C VIA nano ? + + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. 
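The same table-driven zero stripping is used by mpn_gcd_11 above and, per 128-bit step, by mpn_gcd_22 below. In plain C the single-limb case looks roughly like the sketch that follows; it is illustrative only, assumes both inputs are odd and nonzero, and uses __builtin_ctzll where the assembly uses the ctz_table lookup or tzcnt.

#include <stdint.h>

/* Illustrative binary gcd of two odd limbs, mirroring the
   subtract-and-strip loop in the assembly.  */
static uint64_t
ref_gcd_11 (uint64_t u, uint64_t v)
{
  while (u != v)
    {
      if (u < v)
        {
          uint64_t t = u;  u = v;  v = t;   /* keep u >= v; v stays odd */
        }
      u -= v;                               /* now even and nonzero */
      u >>= __builtin_ctzll (u);            /* asm: ctz_table or tzcnt */
    }
  return u;
}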
+ +deflit(MAXSHIFT, 8) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + +DEF_OBJECT(ctz_table,64) + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') +END_OBJECT(ctz_table) + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%rcx') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + mov v0_param, v0 + + LEA( ctz_table, %r10) + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 + + mov u0, s0 + mov u1, s1 + + sub v0, u0 + sbb v1, u1 + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + and $MASK, R32(t0) + movzbl (%r10,t0), R32(cnt) + jz L(count_better) +C Rightshift (u1,,u0) into (u1,,u0) +L(shr): shr R8(cnt), u0 + mov u1, t1 + shr R8(cnt), u1 + neg cnt + shl R8(cnt), t1 + or t1, u0 + + test v1, v1 + jnz L(top) + test u1, u1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(count_better): + rep;bsf u0, cnt C tzcnt! + jmp L(shr) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 + mov u0, s0 + mov u1, s1 + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/gmp-mparam.h new file mode 100644 index 0000000..db94fb7 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/gmp-mparam.h @@ -0,0 +1,217 @@ +/* AMD K8-K10 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2010, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 15 + +#define MUL_TOOM22_THRESHOLD 27 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 234 +#define MUL_TOOM6H_THRESHOLD 418 +#define MUL_TOOM8H_THRESHOLD 466 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 160 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 175 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 36 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 327 +#define SQR_TOOM6_THRESHOLD 446 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define POWM_SEC_TABLE 2,67,322,991 + +#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 570, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 25, 8}, { 13, 7}, { 29, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 21, 7}, { 43, 8}, \ + { 23, 7}, { 47, 8}, { 25, 7}, { 51, 8}, \ + { 29, 9}, { 15, 8}, { 37, 9}, { 19, 8}, \ + { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \ + { 55,10}, { 15, 9}, { 43,10}, { 23, 9}, \ + { 55,10}, { 31, 9}, { 63, 5}, { 1023, 4}, \ + { 2431, 5}, { 1279, 6}, { 671, 7}, { 367, 8}, \ + { 189, 9}, { 95, 8}, { 195, 9}, { 111,11}, \ + { 31, 9}, { 131,10}, { 71, 9}, { 155,10}, \ + { 79, 9}, { 159,10}, { 87,11}, { 47,10}, \ + { 111,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 167,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 143,10}, { 287,11}, { 159,10}, \ + { 319,11}, { 175,12}, { 95,11}, { 207,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 335,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,13}, { 127,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \ + { 351,11}, { 703,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,12}, { 447,14}, \ + { 127,13}, { 255,12}, { 543,11}, { 1087,12}, \ + { 607,11}, { 1215,13}, { 319,12}, { 671,11}, \ + { 1343,12}, { 735,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 799,11}, { 1599,12}, { 831,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1599,13}, \ + { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,14}, { 639,13}, { 1471,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,14}, { 1279,13}, { 2687,14}, { 1407,15}, \ + { 767,14}, { 1535,13}, { 3071,14}, { 1791,16}, \ + { 511,15}, { 1023,14}, { 2431,15}, { 1279,14}, \ + { 2815,15}, { 1535,14}, { 3199,15}, { 1791,14}, \ + { 3583,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, 
{4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 185 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 460 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 27, 7}, { 14, 6}, \ + { 29, 7}, { 15, 6}, { 31, 7}, { 29, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 21, 7}, { 43, 8}, \ + { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ + { 51, 9}, { 27, 8}, { 55,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 43,10}, { 23, 9}, \ + { 55,11}, { 15,10}, { 31, 9}, { 71,10}, \ + { 39, 9}, { 83,10}, { 47, 6}, { 767, 4}, \ + { 3263, 5}, { 1727, 4}, { 3455, 5}, { 1791, 6}, \ + { 927, 7}, { 479, 6}, { 959, 7}, { 511, 8}, \ + { 271, 9}, { 147,10}, { 87,11}, { 47,10}, \ + { 95,12}, { 31,11}, { 63,10}, { 135,11}, \ + { 79,10}, { 167,11}, { 95,10}, { 191,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 255,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,12}, { 95,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399,11}, { 207,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \ + { 1279,11}, { 671,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 447,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \ + { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \ + { 831,13}, { 447,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1087,13}, { 575,12}, { 1215,13}, \ + { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 767,12}, { 1599,13}, { 831,12}, \ + { 1663,13}, { 895,12}, { 1791,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1215,14}, { 639,13}, { 1471,14}, { 767,13}, \ + { 1663,14}, { 895,13}, { 1855,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2303,14}, \ + { 1279,13}, { 2559,14}, { 1407,15}, { 767,14}, \ + { 1535,13}, { 3071,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2303,15}, { 1279,14}, { 2687,15}, \ + { 1535,14}, { 3199,15}, { 1791,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 203 +#define SQR_FFT_THRESHOLD 5248 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 35 +#define MULLO_MUL_N_THRESHOLD 15604 + +#define DC_DIV_QR_THRESHOLD 56 +#define DC_DIVAPPR_Q_THRESHOLD 220 +#define DC_BDIV_QR_THRESHOLD 52 +#define DC_BDIV_Q_THRESHOLD 152 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 226 +#define INV_APPR_THRESHOLD 214 + +#define BINV_NEWTON_THRESHOLD 327 +#define REDC_1_TO_REDC_2_THRESHOLD 4 +#define REDC_2_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1895 +#define MU_DIVAPPR_Q_THRESHOLD 1895 +#define MUPI_DIV_QR_THRESHOLD 106 +#define MU_BDIV_QR_THRESHOLD 1589 +#define MU_BDIV_Q_THRESHOLD 1718 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD_THRESHOLD 125 +#define HGCD_APPR_THRESHOLD 173 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 555 +#define GCDEXT_DC_THRESHOLD 478 
+#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 28 +#define SET_STR_DC_THRESHOLD 248 +#define SET_STR_PRECOMPUTE_THRESHOLD 1648 + +#define FAC_DSC_THRESHOLD 1075 +#define FAC_ODD_THRESHOLD 0 /* always */ diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm new file mode 100644 index 0000000..06c5d5d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addlsh_n, mpn_rsblsh_n, optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +include_mpn(`x86_64/k8/aorrlsh_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm b/gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm new file mode 100644 index 0000000..1818f9f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +include_mpn(`x86_64/coreihwl/aors_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm new file mode 100644 index 0000000..9c5f631 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) +include_mpn(`x86_64/bd1/aorsmul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h new file mode 100644 index 0000000..531521d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h @@ -0,0 +1,264 @@ +/* Intel Goldmont gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 2200 MHz Intel Atom C3758 Goldmont/Denverton */ +/* FFT tuning limit = 468,030,122 */ +/* Generated by tuneup.c, 2019-10-12, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 5 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 38 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 17 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 19 + +#define DIV_1_VS_MUL_1_PERCENT 301 + +#define MUL_TOOM22_THRESHOLD 23 +#define MUL_TOOM33_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 178 +#define MUL_TOOM6H_THRESHOLD 258 +#define MUL_TOOM8H_THRESHOLD 357 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 131 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 121 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 178 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 113 +#define SQR_TOOM4_THRESHOLD 290 +#define SQR_TOOM6_THRESHOLD 351 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define MUL_FFT_MODF_THRESHOLD 440 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 440, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 24, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,10}, \ + { 191,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 367,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 639,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 543,11}, { 1087,12}, { 607,13}, { 319,12}, \ + { 671,11}, { 1343,12}, { 703,11}, { 1407,12}, \ + { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,13}, { 447,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1023,11}, { 2047,12}, { 1087,13}, \ + { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \ + { 639,12}, { 1279,11}, { 2559,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1727,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2175,13}, { 1151,12}, { 2303,13}, \ + { 1215,12}, { 2431,11}, { 4863,14}, { 639,13}, \ + { 1279,12}, { 2559,13}, { 1343,12}, { 2687,13}, \ + { 1407,12}, { 2815,13}, { 1471,12}, { 2943,11}, \ + { 5887,14}, { 
767,13}, { 1535,12}, { 3071,13}, \ + { 1727,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2943,12}, { 5887,15}, { 767,14}, { 1535,13}, \ + { 3071,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1791,13}, { 3583,14}, { 1919,13}, { 3839,16}, \ + { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,12}, { 15359,14}, { 3967,16}, \ + { 1023,15}, { 2047,14}, { 4351,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3071,14}, { 6143,15}, { 3327,14}, \ + { 6911,15}, { 3839,14}, { 7679,13}, { 15359,17}, \ + { 1023,16}, { 2047,15}, { 4351,14}, { 8703,15}, \ + { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 6911,16}, { 3583,15}, { 7679,14}, \ + { 15359,15}, { 7935,17}, { 2047,16}, { 4095,15}, \ + { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \ + { 5119,15}, { 10239,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 261 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 380, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \ + { 575,10}, { 303, 9}, { 607,10}, { 319, 9}, \ + { 639,12}, { 95,11}, { 191,10}, { 383,11}, \ + { 207,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 415,12}, { 223,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,10}, { 1407,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 607,13}, { 319,12}, { 703,11}, { 1407,12}, \ + { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,13}, { 447,12}, { 895,11}, { 1791,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,11}, { 2303,12}, { 1215,13}, { 639,12}, \ + { 1279,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \ + { 895,12}, { 1791,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \ + { 2175,13}, { 1151,12}, { 2303,13}, { 1215,12}, \ + { 2431,14}, { 639,13}, { 1279,12}, { 2559,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,12}, { 2943,11}, { 5887,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,13}, { 1791,12}, { 3583,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \ + { 5887,15}, { 767,14}, { 
1535,13}, { 3199,14}, \ + { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \ + { 3583,14}, { 1919,16}, { 511,15}, { 1023,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3583,13}, { 7167,14}, { 3839,13}, { 7679,12}, \ + { 15359,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2559,14}, { 5119,15}, \ + { 2815,14}, { 5887,13}, { 11775,16}, { 1535,15}, \ + { 3071,14}, { 6143,15}, { 3327,14}, { 6911,15}, \ + { 3583,14}, { 7167,15}, { 3839,14}, { 7679,13}, \ + { 15359,17}, { 1023,16}, { 2047,15}, { 4095,14}, \ + { 8191,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \ + { 2047,16}, { 4095,15}, { 8447,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 259 +#define SQR_FFT_THRESHOLD 3520 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 62 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 13 +#define SQRLO_SQR_THRESHOLD 7035 + +#define DC_DIV_QR_THRESHOLD 51 +#define DC_DIVAPPR_Q_THRESHOLD 183 +#define DC_BDIV_QR_THRESHOLD 47 +#define DC_BDIV_Q_THRESHOLD 88 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 226 +#define INV_APPR_THRESHOLD 204 + +#define BINV_NEWTON_THRESHOLD 264 +#define REDC_1_TO_REDC_2_THRESHOLD 28 +#define REDC_2_TO_REDC_N_THRESHOLD 54 + +#define MU_DIV_QR_THRESHOLD 1589 +#define MU_DIVAPPR_Q_THRESHOLD 1620 +#define MUPI_DIV_QR_THRESHOLD 83 +#define MU_BDIV_QR_THRESHOLD 1334 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 1,16,194,642 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1042 + +#define FAC_DSC_THRESHOLD 218 +#define FAC_ODD_THRESHOLD 25 + +#define MATRIX22_STRASSEN_THRESHOLD 21 +#define HGCD2_DIV1_METHOD 1 /* 6.58% faster than 3 */ +#define HGCD_THRESHOLD 136 +#define HGCD_APPR_THRESHOLD 168 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 416 +#define GCDEXT_DC_THRESHOLD 393 +#define JACOBI_BASE_METHOD 4 /* 1.17% faster than 3 */ + +/* Tuneup completed successfully, took 800192 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm b/gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm new file mode 100644 index 0000000..ed1ec54 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_1 optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c) +include_mpn(`x86_64/coreisbr/mul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm b/gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm new file mode 100644 index 0000000..1192635 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_redc_1) +include_mpn(`x86_64/k8/redc_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/invert_limb.asm b/gmp-6.3.0/mpn/x86_64/invert_limb.asm new file mode 100644 index 0000000..b375ad3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/invert_limb.asm @@ -0,0 +1,112 @@ +dnl AMD64 mpn_invert_limb -- Invert a normalized limb. + +dnl Contributed to the GNU project by Torbjorn Granlund and Niels Möller. + +dnl Copyright 2004, 2007-2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb (approx) div +C AMD K8,K9 48 71 +C AMD K10 48 77 +C Intel P4 135 161 +C Intel core2 69 116 +C Intel corei 55 89 +C Intel atom 129 191 +C VIA nano 79 157 + +C rax rcx rdx rdi rsi r8 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +PROTECT(`mpn_invert_limb_table') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_invert_limb) C Kn C2 Ci + FUNC_ENTRY(1) + mov %rdi, %rax C 0 0 0 + shr $55, %rax C 1 1 1 +ifdef(`DARWIN',` + lea mpn_invert_limb_table(%rip), %r8 + add $-512, %r8 +',` + lea -512+mpn_invert_limb_table(%rip), %r8 +') + movzwl (%r8,%rax,2), R32(%rcx) C %rcx = v0 + + C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1 + mov %rdi, %rsi C 0 0 0 + mov R32(%rcx), R32(%rax) C 4 5 5 + imul R32(%rcx), R32(%rcx) C 4 5 5 + shr $24, %rsi C 1 1 1 + inc %rsi C %rsi = d40 + imul %rsi, %rcx C 8 10 8 + shr $40, %rcx C 12 15 11 + sal $11, R32(%rax) C 5 6 6 + dec R32(%rax) + sub R32(%rcx), R32(%rax) C %rax = v1 + + C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47) + mov $0x1000000000000000, %rcx + imul %rax, %rsi C 14 17 13 + sub %rsi, %rcx + imul %rax, %rcx + sal $13, %rax + shr $47, %rcx + add %rax, %rcx C %rcx = v2 + + C v3 = (v2 << 31) + (v2 * (2^96 - v2 * d63 + ((v2 >> 1) & mask)) >> 65) + mov %rdi, %rsi C 0 0 0 + shr %rsi C d/2 + sbb %rax, %rax C -d0 = -(d mod 2) + sub %rax, %rsi C d63 = ceil(d/2) + imul %rcx, %rsi C v2 * d63 + and %rcx, %rax C v2 * d0 + shr %rax C (v2>>1) * d0 + sub %rsi, %rax C (v2>>1) * d0 - v2 * d63 + mul %rcx + sal $31, %rcx + shr %rdx + add %rdx, %rcx C %rcx = v3 + + mov %rdi, %rax + mul %rcx + add %rdi, %rax + mov %rcx, %rax + adc %rdi, %rdx + sub %rdx, %rax + + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/invert_limb_table.asm b/gmp-6.3.0/mpn/x86_64/invert_limb_table.asm new file mode 100644 index 0000000..739d59e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/invert_limb_table.asm @@ -0,0 +1,50 @@ +dnl Table used for mpn_invert_limb + +dnl Contributed to the GNU project by Torbjorn Granlund and Niels Möller. + +dnl Copyright 2004, 2007-2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
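Illustrative note: the mpn_invert_limb routine above starts from a table lookup indexed by the high bits of d and refines it through the v1, v2, v3 steps spelled out in its comments. What it ultimately computes is the fixed-point reciprocal used by Granlund-Moller division: for a normalized divisor d (bit 63 set), v = floor((B^2 - 1)/d) - B with B = 2^64. A minimal C reference model of that value, assuming a compiler with unsigned __int128; the helper name is invented, not part of GMP:

#include <stdint.h>

/* Reference model only: the table-plus-Newton pipeline above produces the
   same value as this direct computation.  d must be normalized (bit 63 set). */
static uint64_t
invert_limb_ref (uint64_t d)
{
  unsigned __int128 b2m1 = ~(unsigned __int128) 0;            /* B^2 - 1, B = 2^64 */
  return (uint64_t) (b2m1 / d - ((unsigned __int128) 1 << 64));
}

The assembly avoids the 128-by-64 division entirely; the table in invert_limb_table.asm below supplies the first few bits and each refinement step roughly doubles the accuracy.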
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +PROTECT(`mpn_invert_limb_table') + +ASM_START() +C Table entry X contains floor (0x7fd00 / (0x100 + X)) + + RODATA + ALIGN(2) + GLOBL mpn_invert_limb_table +mpn_invert_limb_table: +forloop(i,256,512-1,dnl +` .value eval(0x7fd00/i) +')dnl +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm new file mode 100644 index 0000000..4723093 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm new file mode 100644 index 0000000..f58b4cc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm @@ -0,0 +1,142 @@ +dnl AMD64 mpn_gcd_22. Assumes useful bsf, useless shrd, no tzcnt, no shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 ? +C AMD K10 7.4 +C AMD bd1 9.9 +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 ? +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR ? +C Intel PNR ? +C Intel NHM 9.2 +C Intel WSM 9.0 +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? 
+C VIA nano ? + + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + mov v0_param, v0 + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 + + mov u0, s0 + mov u1, s1 + + bsf t0, cnt + + sub v0, u0 + sbb v1, u1 + +L(bck): cmovc t0, u0 C u = |u - v| + cmovnc u1, t1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + shr R8(cnt), u0 + mov t1, u1 + shr R8(cnt), u1 + neg cnt + shl R8(cnt), t1 + or t1, u0 + + test u1, u1 + jnz L(top) + test v1, v1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 + mov u0, s0 + mov u1, s1 + bsf t0, cnt + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h new file mode 100644 index 0000000..349bace --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h @@ -0,0 +1,248 @@ +/* AMD K10 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#if 0 +#undef mpn_sublsh_n +#define mpn_sublsh_n(rp,up,vp,n,c) \ + (((rp) == (up)) ? 
mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c)) \ + : MPN(mpn_sublsh_n)(rp,up,vp,n,c)) +#endif + +/* 3200-3600 MHz K10 Thuban */ +/* FFT tuning limit = 427,161,280 */ +/* Generated by tuneup.c, 2019-10-22, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 17 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 15 + +#define DIV_1_VS_MUL_1_PERCENT 324 + +#define MUL_TOOM22_THRESHOLD 27 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 232 +#define MUL_TOOM6H_THRESHOLD 363 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 155 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 160 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 142 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 280 +#define SQR_TOOM6_THRESHOLD 446 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 34 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 530 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 530, 5}, { 24, 6}, { 13, 5}, { 27, 6}, \ + { 27, 7}, { 14, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 36, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \ + { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ + { 51, 9}, { 27, 8}, { 55,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 35, 8}, { 71, 9}, \ + { 39, 8}, { 81, 9}, { 43,10}, { 23, 9}, \ + { 55,11}, { 15,10}, { 31, 9}, { 71,10}, \ + { 39, 9}, { 87,10}, { 47, 9}, { 99,10}, \ + { 55,11}, { 31,10}, { 87,11}, { 47,10}, \ + { 111,12}, { 31,11}, { 63,10}, { 143,11}, \ + { 79,10}, { 167,11}, { 95,10}, { 191,11}, \ + { 111,12}, { 63,11}, { 143,10}, { 287,11}, \ + { 159,12}, { 95,11}, { 207,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 575,10}, \ + { 1151,11}, { 607,12}, { 319,11}, { 671,12}, \ + { 351,11}, { 703,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,12}, { 447,14}, \ + { 127,13}, { 255,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \ + { 831,13}, { 447,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1087,13}, { 575,12}, { 1215,13}, \ + { 639,12}, { 1343,13}, { 703,12}, { 1471,14}, \ + { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \ + { 1663,13}, { 959,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,14}, { 639,13}, { 1471,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,14}, { 1279,13}, { 2559,14}, { 1407,15}, \ + { 767,14}, { 1791,16}, { 511,15}, { 1023,14}, \ + { 2431,15}, { 1279,14}, { 2943,15}, { 1535,14}, \ + { 3199,15}, { 
1791,14}, { 3583,16}, { 1023,15}, \ + { 2047,14}, { 4223,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5247,15}, { 2815,16}, { 1535,15}, \ + { 3071,14}, { 6271,15}, { 3327,14}, { 6911,15}, \ + { 3583,17}, { 1023,16}, { 2047,15}, { 4351,14}, \ + { 8959,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7167,17}, { 2047,16}, { 4095,15}, { 8959,16}, \ + { 4607,15}, { 9983,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 6143,15}, { 12543,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 207 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 476 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 29, 7}, { 28, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 21, 7}, { 43, 8}, \ + { 23, 7}, { 47, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ + { 49, 9}, { 27, 8}, { 55,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 43,10}, { 23, 9}, \ + { 55,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 83,10}, { 47, 9}, { 95,10}, \ + { 55,11}, { 31,10}, { 79,11}, { 47,10}, \ + { 103,12}, { 31,11}, { 63,10}, { 135,11}, \ + { 79,10}, { 167,11}, { 111,12}, { 63,11}, \ + { 127,10}, { 255,11}, { 143,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 319,12}, { 95,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,12}, \ + { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \ + { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \ + { 383,11}, { 799,12}, { 415,11}, { 831,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 671,11}, { 1343,12}, { 703,11}, \ + { 1407,12}, { 735,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,11}, { 1663,13}, { 447,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1087,13}, \ + { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1727,13}, { 895,12}, \ + { 1791,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1471,14}, { 767,13}, { 1727,14}, { 895,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,14}, { 1279,13}, { 2559,14}, \ + { 1407,15}, { 767,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2303,15}, { 1279,14}, { 2815,15}, \ + { 1535,14}, { 3199,15}, { 1791,16}, { 1023,15}, \ + { 2047,14}, { 4223,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5247,15}, { 2815,16}, { 1535,15}, \ + { 3071,14}, { 6271,15}, { 3327,14}, { 6911,17}, \ + { 1023,16}, { 2047,15}, { 4351,14}, { 8959,15}, \ + { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 6911,16}, { 3583,15}, { 7679,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9983,16}, { 5119,15}, { 10495,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 6143,15}, { 12287,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 224 +#define 
SQR_FFT_THRESHOLD 5568 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 61 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 10950 + +#define DC_DIV_QR_THRESHOLD 54 +#define DC_DIVAPPR_Q_THRESHOLD 238 +#define DC_BDIV_QR_THRESHOLD 54 +#define DC_BDIV_Q_THRESHOLD 42 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 252 +#define INV_APPR_THRESHOLD 230 + +#define BINV_NEWTON_THRESHOLD 327 +#define REDC_1_TO_REDC_2_THRESHOLD 25 +#define REDC_2_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 1620 +#define MU_DIVAPPR_Q_THRESHOLD 1620 +#define MUPI_DIV_QR_THRESHOLD 104 +#define MU_BDIV_QR_THRESHOLD 1528 +#define MU_BDIV_Q_THRESHOLD 1652 + +#define POWM_SEC_TABLE 1,22,321,473,2144 + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 248 +#define SET_STR_PRECOMPUTE_THRESHOLD 1304 + +#define FAC_DSC_THRESHOLD 470 +#define FAC_ODD_THRESHOLD 25 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 5 /* 8.38% faster than 4 */ +#define HGCD_THRESHOLD 115 +#define HGCD_APPR_THRESHOLD 146 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 535 +#define GCDEXT_DC_THRESHOLD 460 +#define JACOBI_BASE_METHOD 1 /* 0.90% faster than 4 */ + +/* Tuneup completed successfully, took 448763 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/k10/hamdist.asm b/gmp-6.3.0/mpn/x86_64/k10/hamdist.asm new file mode 100644 index 0000000..f70494a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/hamdist.asm @@ -0,0 +1,109 @@ +dnl AMD64 mpn_hamdist -- hamming distance. + +dnl Copyright 2008, 2010-2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
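Illustrative note: the thresholds in the K10 gmp-mparam.h above steer algorithm selection by operand size. As a rough, self-contained sketch (the function below is hypothetical and returns a label instead of multiplying; the real dispatch in mpn_mul also handles unbalanced operands and the intermediate Toom variants), the balanced-multiplication choice implied by MUL_TOOM22_THRESHOLD = 27, MUL_TOOM33_THRESHOLD = 81, MUL_TOOM44_THRESHOLD = 232 and MUL_FFT_THRESHOLD = 7552 looks like:

#include <stddef.h>

/* Hypothetical illustration of how the size thresholds above are consumed;
   the constants are the K10 values from the header, hard-coded for clarity. */
static const char *
mul_algorithm_for_size (size_t n)      /* n = limbs per operand, balanced case */
{
  if (n < 27)    return "basecase (schoolbook)";       /* < MUL_TOOM22_THRESHOLD */
  if (n < 81)    return "Toom-22 (Karatsuba)";         /* < MUL_TOOM33_THRESHOLD */
  if (n < 232)   return "Toom-33";                     /* < MUL_TOOM44_THRESHOLD */
  if (n < 7552)  return "Toom-44 and higher Toom";     /* < MUL_FFT_THRESHOLD */
  return "FFT (Schoenhage-Strassen)";
}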
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 2.0 = +C AMD bd1 ~4.4 = +C AMD bd2 ~4.4 = +C AMD bd3 +C AMD bd4 +C AMD bobcat 7.55 = +C AMD jaguar 2.52 - +C Intel P4 - +C Intel core2 - +C Intel NHM 2.03 + +C Intel SBR 2.01 + +C Intel IBR 1.96 + +C Intel HWL 1.64 = +C Intel BWL 1.56 - +C Intel SKL 1.52 = +C Intel atom +C Intel SLM 3.0 - +C VIA nano + +define(`ap', `%rdi') +define(`bp', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_hamdist) + FUNC_ENTRY(3) + mov (ap), %r8 + xor (bp), %r8 + + lea (ap,n,8), ap C point at A operand end + lea (bp,n,8), bp C point at B operand end + neg n + + test $1, R8(n) + jz L(2) + +L(1): .byte 0xf3,0x49,0x0f,0xb8,0xc0 C popcnt %r8, %rax + xor R32(%r10), R32(%r10) + inc n + js L(top) + FUNC_EXIT() + ret + + ALIGN(16) +L(2): mov 8(ap,n,8), %r9 + .byte 0xf3,0x49,0x0f,0xb8,0xc0 C popcnt %r8, %rax + xor 8(bp,n,8), %r9 + .byte 0xf3,0x4d,0x0f,0xb8,0xd1 C popcnt %r9, %r10 + add $2, n + js L(top) + lea (%r10, %rax), %rax + FUNC_EXIT() + ret + + ALIGN(16) +L(top): mov (ap,n,8), %r8 + lea (%r10, %rax), %rax + mov 8(ap,n,8), %r9 + xor (bp,n,8), %r8 + xor 8(bp,n,8), %r9 + .byte 0xf3,0x49,0x0f,0xb8,0xc8 C popcnt %r8, %rcx + lea (%rcx, %rax), %rax + .byte 0xf3,0x4d,0x0f,0xb8,0xd1 C popcnt %r9, %r10 + add $2, n + js L(top) + + lea (%r10, %rax), %rax + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k10/lshift.asm b/gmp-6.3.0/mpn/x86_64/k10/lshift.asm new file mode 100644 index 0000000..cadf9b9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/lshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for AMD K10. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86_64/fastsse/lshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm new file mode 100644 index 0000000..48a92e5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshiftc optimised for AMD K10. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
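Illustrative note: the mpn_hamdist loop above XORs corresponding limbs of the two operands and accumulates POPCNT results two limbs per iteration. Functionally it is just the following C model, assuming 64-bit limbs and a compiler providing __builtin_popcountll; the helper name is invented:

#include <stdint.h>
#include <stddef.h>

/* Reference model: Hamming distance of {ap,n} and {bp,n}. */
static uint64_t
hamdist_ref (const uint64_t *ap, const uint64_t *bp, size_t n)
{
  uint64_t count = 0;
  for (size_t i = 0; i < n; i++)
    count += (uint64_t) __builtin_popcountll (ap[i] ^ bp[i]);
  return count;
}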
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshiftc) +include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/k10/popcount.asm b/gmp-6.3.0/mpn/x86_64/k10/popcount.asm new file mode 100644 index 0000000..3814aea --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/popcount.asm @@ -0,0 +1,138 @@ +dnl AMD64 mpn_popcount -- population count. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 1.125 +C Intel P4 n/a +C Intel core2 n/a +C Intel corei 1.25 +C Intel atom n/a +C VIA nano n/a + +C * The zero-offset of popcount is misassembled to the offset-less form, which +C is one byte shorter and therefore will mess up the switching code. +C * The outdated gas used in FreeBSD and NetBSD cannot handle the POPCNT insn, +C which is the main reason for our usage of '.byte'. + +C TODO +C * Improve switching code, the current code sucks. 
+ +define(`up', `%rdi') +define(`n', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_popcount) + FUNC_ENTRY(2) + +ifelse(1,1,` + lea (up,n,8), up + +C mov R32(n), R32(%rcx) +C neg R32(%rcx) + imul $-1, R32(n), R32(%rcx) + and $8-1, R32(%rcx) + + neg n + + mov R32(%rcx), R32(%rax) + neg %rax + lea (up,%rax,8),up + + xor R32(%rax), R32(%rax) + + lea (%rcx,%rcx,4), %rcx + + lea L(top)(%rip), %rdx + lea (%rdx,%rcx,2), %rdx + jmp *%rdx +',` + lea (up,n,8), up + + mov R32(n), R32(%rcx) + neg R32(%rcx) + and $8-1, R32(%rcx) + + neg n + + mov R32(%rcx), R32(%rax) + shl $3, R32(%rax) + sub %rax, up + + xor R32(%rax), R32(%rax) + +C add R32(%rcx), R32(%rcx) C 2x +C lea (%rcx,%rcx,4), %rcx C 10x + imul $10, R32(%rcx) + + lea L(top)(%rip), %rdx + add %rcx, %rdx + jmp *%rdx +') + + ALIGN(32) +L(top): +C 0 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x00 C popcnt 0(up,n,8), %r8 + add %r8, %rax +C 7 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x08 C popcnt 8(up,n,8), %r9 + add %r9, %rax +C 6 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x10 C popcnt 16(up,n,8), %r8 + add %r8, %rax +C 5 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x18 C popcnt 24(up,n,8), %r9 + add %r9, %rax +C 4 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x20 C popcnt 32(up,n,8), %r8 + add %r8, %rax +C 3 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x28 C popcnt 40(up,n,8), %r9 + add %r9, %rax +C 2 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x30 C popcnt 48(up,n,8), %r8 + add %r8, %rax +C 1 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x38 C popcnt 56(up,n,8), %r9 + add %r9, %rax + + add $8, n + js L(top) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k10/rshift.asm b/gmp-6.3.0/mpn/x86_64/k10/rshift.asm new file mode 100644 index 0000000..249051a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/rshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_rshift optimised for AMD K10. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86_64/fastsse/rshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm new file mode 100644 index 0000000..e436034 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
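Illustrative note: the mpn_popcount routine above unrolls by eight limbs and uses a computed jump to enter the unrolled body at the position matching n mod 8 (the "switching code" its comments apologize for). In C the same remainder-first idea can be written as a Duff's-device style switch; this is a structural sketch only, not the exact index arithmetic of the assembly:

#include <stdint.h>
#include <stddef.h>

/* Structural sketch of the n mod 8 entry dispatch; fall-through is intentional. */
static uint64_t
popcount_ref (const uint64_t *up, size_t n)
{
  uint64_t sum = 0;
  size_t i = 0;
  if (n == 0)
    return 0;
  switch (n % 8)
    {
      do {
    case 0: sum += (uint64_t) __builtin_popcountll (up[i++]);
    case 7: sum += (uint64_t) __builtin_popcountll (up[i++]);
    case 6: sum += (uint64_t) __builtin_popcountll (up[i++]);
    case 5: sum += (uint64_t) __builtin_popcountll (up[i++]);
    case 4: sum += (uint64_t) __builtin_popcountll (up[i++]);
    case 3: sum += (uint64_t) __builtin_popcountll (up[i++]);
    case 2: sum += (uint64_t) __builtin_popcountll (up[i++]);
    case 1: sum += (uint64_t) __builtin_popcountll (up[i++]);
      } while (i < n);
    }
  return sum;
}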
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm b/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm new file mode 100644 index 0000000..3e1898b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm @@ -0,0 +1,153 @@ +dnl AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63. + +dnl Copyright 2008, 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.167 +C AMD K10 2.167 +C Intel P4 12.0 +C Intel core2 4.0 +C Intel corei ? +C Intel atom ? +C VIA nano ? + +C TODO +C * Perhaps handle various n mod 3 sizes better. The code now is too large. 
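Illustrative note: as its heading says, mpn_addaddmul_1msb0 computes R = A*u + B*v with both multipliers below 2^63; that restriction is exactly what lets each step add two 64x64-bit products plus a carry without overflowing 128 bits. A C reference model, assuming 64-bit limbs and unsigned __int128 (the carry-limb return convention is inferred from the code below and should be treated as an assumption):

#include <stdint.h>
#include <stddef.h>

/* Reference model: {rp,n} = {ap,n}*u + {bp,n}*v, returning the carry limb.
   u, v < 2^63 guarantees the 128-bit accumulator never overflows. */
static uint64_t
addaddmul_1msb0_ref (uint64_t *rp, const uint64_t *ap, const uint64_t *bp,
                     size_t n, uint64_t u, uint64_t v)
{
  uint64_t carry = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) ap[i] * u
                          + (unsigned __int128) bp[i] * v + carry;
      rp[i] = (uint64_t) t;
      carry = (uint64_t) (t >> 64);
    }
  return carry;
}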
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`bp_param', `%rdx') +define(`n', `%rcx') +define(`u0', `%r8') +define(`v0', `%r9') + + +define(`bp', `%rbp') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_addaddmul_1msb0) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + push %rbp + + lea (ap,n,8), ap + lea (bp_param,n,8), bp + lea (rp,n,8), rp + neg n + + mov (ap,n,8), %rax + mul %r8 + mov %rax, %r11 + mov (bp,n,8), %rax + mov %rdx, %r10 + add $3, n + jns L(end) + + push %r13 + + ALIGN(16) +L(top): mul %r9 + add %rax, %r11 + mov -16(ap,n,8), %rax + adc %rdx, %r10 + mov %r11, -24(rp,n,8) + mul %r8 + add %rax, %r10 + mov -16(bp,n,8), %rax + mov $0, R32(%r13) + adc %rdx, %r13 + mul %r9 + add %rax, %r10 + mov -8(ap,n,8), %rax + adc %rdx, %r13 + mov %r10, -16(rp,n,8) + mul %r8 + add %rax, %r13 + mov -8(bp,n,8), %rax + mov $0, R32(%r11) + adc %rdx, %r11 + mul %r9 + add %rax, %r13 + adc %rdx, %r11 + mov (ap,n,8), %rax + mul %r8 + add %rax, %r11 + mov %r13, -8(rp,n,8) + mov (bp,n,8), %rax + mov $0, R32(%r10) + adc %rdx, %r10 + add $3, n + js L(top) + + pop %r13 + +L(end): mul %r9 + add %rax, %r11 + adc %rdx, %r10 + cmp $1, R32(n) + ja L(two) + mov -16(ap,n,8), %rax + mov %r11, -24(rp,n,8) + mov %r10, %r11 + jz L(one) + +L(nul): mul %r8 + add %rax, %r10 + mov -16(bp), %rax + mov $0, R32(%r11) + adc %rdx, %r11 + mul %r9 + add %rax, %r10 + mov -8(ap), %rax + adc %rdx, %r11 + mov %r10, -16(rp) +L(one): mul %r8 + add %rax, %r11 + mov -8(bp), %rax + mov $0, R32(%r10) + adc %rdx, %r10 + mul %r9 + add %rax, %r11 + adc %rdx, %r10 + +L(two): mov %r11, -8(rp) + mov %r10, %rax +L(ret): pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm new file mode 100644 index 0000000..78bcba1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm @@ -0,0 +1,195 @@ +dnl AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and +dnl add the result to a third limb vector. + +dnl Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cfg cycles/limb am1+am1 +C AMD K8,K9 2.375 +C AMD K10 2.375 +C AMD bull 5.2 <- 4.6-4.75 bad +C AMD pile 4.96 <- 4.6-4.75 bad +C AMD steam ? +C AMD excavator ? 
+C AMD bobcat 5.75 5.0 bad +C AMD jaguar 5.9 5.2-5.4 bad +C Intel P4 15-16 +C Intel core2 4.5 4.25-4.5 bad +C Intel NHM 4.33 4.55 bad +C Intel SBR 3.4 2.93 3.24 bad +C Intel IBR 3.35 2.6 2.95 bad +C Intel HWL 3.3 2.15 2.3 bad +C Intel BWL 2.33 2.33 1.65 bad +C Intel SKL 2.37 2.21 1.64 bad +C Intel atom 20 18.7 +C Intel SLM 8 8.5 +C VIA nano 4.4 + +C This code is the result of running a code generation and optimization tool +C suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Tune feed-in and wind-down code. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param',`%rdx') +define(`vp', `%rcx') + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + mov n_param, n + push %rbx + push %rbp + + mov 0(vp), v0 + mov 8(vp), v1 + + mov R32(n_param), R32(%rbx) + mov (up), %rax + lea -8(up,n_param,8), up + lea -8(rp,n_param,8), rp + mul v0 + neg n + and $3, R32(%rbx) + jz L(b0) + cmp $2, R32(%rbx) + jc L(b1) + jz L(b2) + +L(b3): mov %rax, w1 + mov %rdx, w2 + xor R32(w3), R32(w3) + mov 8(up,n,8), %rax + dec n + jmp L(lo3) + +L(b2): mov %rax, w2 + mov 8(up,n,8), %rax + mov %rdx, w3 + xor R32(w0), R32(w0) + add $-2, n + jmp L(lo2) + +L(b1): mov %rax, w3 + mov 8(up,n,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + inc n + jmp L(lo1) + +L(b0): mov $0, R32(w3) + mov %rax, w0 + mov 8(up,n,8), %rax + mov %rdx, w1 + xor R32(w2), R32(w2) + jmp L(lo0) + + ALIGN(32) +L(top): mov $0, R32(w1) + mul v0 + add %rax, w3 + mov (up,n,8), %rax + adc %rdx, w0 + adc $0, R32(w1) +L(lo1): mul v1 + add w3, (rp,n,8) + mov $0, R32(w3) + adc %rax, w0 + mov $0, R32(w2) + mov 8(up,n,8), %rax + adc %rdx, w1 + mul v0 + add %rax, w0 + mov 8(up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) +L(lo0): mul v1 + add w0, 8(rp,n,8) + adc %rax, w1 + adc %rdx, w2 + mov 16(up,n,8), %rax + mul v0 + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 16(up,n,8), %rax +L(lo3): mul v1 + add w1, 16(rp,n,8) + adc %rax, w2 + adc %rdx, w3 + xor R32(w0), R32(w0) + mov 24(up,n,8), %rax + mul v0 + add %rax, w2 + mov 24(up,n,8), %rax + adc %rdx, w3 + adc $0, R32(w0) +L(lo2): mul v1 + add w2, 24(rp,n,8) + adc %rax, w3 + adc %rdx, w0 + mov 32(up,n,8), %rax + add $4, n + js L(top) + +L(end): xor R32(w1), R32(w1) + mul v0 + add %rax, w3 + mov (up), %rax + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add w3, (rp) + adc %rax, w0 + adc %rdx, w1 + mov w0, 8(rp) + mov w1, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm new file mode 100644 index 0000000..ff3a184 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm @@ -0,0 +1,217 @@ +dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U. + +dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
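Illustrative note: mpn_addmul_2 above multiplies the n-limb operand by the two-limb multiplier {v0,v1} and folds the product into the destination; the "am1+am1" column in its cycle table compares it against doing the same work as two chained mpn_addmul_1 passes. A hedged C model of that two-pass reading follows (64-bit limbs assumed; the handling of the top limbs, rp[n] written and the limb above it returned, is inferred from the code above and should be checked against gmp-impl.h rather than taken as definitive):

#include <stdint.h>
#include <stddef.h>

/* One pass of addmul_1: {rp,n} += {up,n} * v, returning the carry limb. */
static uint64_t
addmul_1_ref (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v)
{
  uint64_t carry = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v + rp[i] + carry;
      rp[i] = (uint64_t) t;
      carry = (uint64_t) (t >> 64);
    }
  return carry;
}

/* Two-pass model of addmul_2: rp[0..n] updated, top limb returned. */
static uint64_t
addmul_2_ref (uint64_t *rp, const uint64_t *up, size_t n, const uint64_t vp[2])
{
  rp[n] = addmul_1_ref (rp, up, n, vp[0]);       /* add up*v0 at offset 0 */
  return addmul_1_ref (rp + 1, up, n, vp[1]);    /* add up*v1 one limb up */
}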
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.87 < 3.85 for lshift + add_n +C AMD K10 2.75 < 3.85 for lshift + add_n +C Intel P4 22 > 7.33 for lshift + add_n +C Intel core2 4.1 > 3.27 for lshift + add_n +C Intel NHM 4.4 > 3.75 for lshift + add_n +C Intel SBR 3.17 < 3.46 for lshift + add_n +C Intel atom ? ? 8.75 for lshift + add_n +C VIA nano 4.7 < 6.25 for lshift + add_n + +C TODO +C * Can we propagate carry into rdx instead of using a special carry register? +C That could save enough insns to get to 10 cycles/iteration. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n_param', `%rcx') +define(`cnt', `%r8') + +define(`vp', `%r12') +define(`n', `%rbp') + +ifdef(`OPERATION_addlsh_n',` + define(ADDSUB, `add') + define(ADCSBB, `adc') + define(func, mpn_addlsh_n) +') +ifdef(`OPERATION_rsblsh_n',` + define(ADDSUB, `sub') + define(ADCSBB, `sbb') + define(func, mpn_rsblsh_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %r12 + push %rbp + push %rbx + + mov (vp_param), %rax C load first V limb early + + mov $0, R32(n) + sub n_param, n + + lea -16(up,n_param,8), up + lea -16(rp,n_param,8), rp + lea 16(vp_param,n_param,8), vp + + mov n_param, %r9 + + mov %r8, %rcx + mov $1, R32(%r8) + shl R8(%rcx), %r8 + + mul %r8 C initial multiply + + and $3, R32(%r9) + jz L(b0) + cmp $2, R32(%r9) + jc L(b1) + jz L(b2) + +L(b3): mov %rax, %r11 + ADDSUB 16(up,n,8), %r11 + mov -8(vp,n,8), %rax + sbb R32(%rcx), R32(%rcx) + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov (vp,n,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $3, n + jnz L(lo3) + jmp L(cj3) + +L(b2): mov %rax, %rbx + mov -8(vp,n,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $2, n + jz L(cj2) + mov %rdx, %r10 + mov -16(vp,n,8), %rax + mul %r8 + or %rax, %r10 + xor R32(%rcx), R32(%rcx) C clear carry register + jmp L(lo2) + +L(b1): mov %rax, %r9 + mov %rdx, %r10 + add $1, n + jnz L(gt1) + ADDSUB 8(up,n,8), %r9 + jmp L(cj1) +L(gt1): mov -16(vp,n,8), %rax + mul %r8 + or %rax, %r10 + mov %rdx, %r11 + mov -8(vp,n,8), %rax + mul %r8 + or %rax, %r11 + ADDSUB 8(up,n,8), %r9 + ADCSBB 16(up,n,8), %r10 + ADCSBB 24(up,n,8), %r11 + mov (vp,n,8), %rax + sbb R32(%rcx), R32(%rcx) + jmp L(lo1) + +L(b0): mov %rax, %r10 + mov %rdx, %r11 + mov -8(vp,n,8), %rax + mul %r8 + or %rax, %r11 + ADDSUB 16(up,n,8), %r10 + ADCSBB 24(up,n,8), %r11 + mov (vp,n,8), %rax + sbb R32(%rcx), R32(%rcx) + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov 8(vp,n,8), %rax + add $4, n + jz L(end) + + ALIGN(8) +L(top): mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(rp,n,8) +L(lo3): mov %rdx, %r10 + mov -16(vp,n,8), %rax + mul %r8 + or %rax, %r10 + mov %r11, -8(rp,n,8) +L(lo2): mov %rdx, %r11 + mov -8(vp,n,8), %rax + mul %r8 + or %rax, %r11 + add R32(%rcx), R32(%rcx) + ADCSBB (up,n,8), %rbx + ADCSBB 8(up,n,8), %r9 + ADCSBB 16(up,n,8), %r10 + ADCSBB 24(up,n,8), %r11 + mov (vp,n,8), %rax + 
sbb R32(%rcx), R32(%rcx) + mov %rbx, (rp,n,8) +L(lo1): mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov %r9, 8(rp,n,8) +L(lo0): mov 8(vp,n,8), %rax + add $4, n + jnz L(top) + +L(end): mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(rp,n,8) +L(cj3): mov %r11, -8(rp,n,8) +L(cj2): add R32(%rcx), R32(%rcx) + ADCSBB (up,n,8), %rbx + ADCSBB 8(up,n,8), %r9 + mov %rbx, (rp,n,8) +L(cj1): mov %r9, 8(rp,n,8) + mov %rdx, %rax + ADCSBB $0, %rax + pop %rbx + pop %rbp + pop %r12 + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm new file mode 100644 index 0000000..1172b0d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm @@ -0,0 +1,179 @@ +dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor, +dnl returning quotient only. + +dnl Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012, 2017 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
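The quotient produced here is the Hensel (2-adic) one: q such that q*d == u mod B^n with B = 2^64, which is the exact quotient whenever d divides u. The entry code below builds a 64-bit inverse of the odd part of d from an 8-bit binvert_limb_table entry and three Newton steps, each step doubling the number of correct low bits. A minimal C sketch of that inverse computation (the function name and the tab[] parameter are hypothetical; tab[] stands in for binvert_limb_table):

#include <stdint.h>

/* tab[(d >> 1) & 127] is assumed to hold an 8-bit inverse of the odd limb d. */
static uint64_t
binvert_limb_sketch (uint64_t d, const unsigned char tab[128])
{
  uint64_t inv = tab[(d >> 1) & 127];  /*  8 correct low bits */
  inv = 2 * inv - inv * inv * d;       /* 16 correct low bits */
  inv = 2 * inv - inv * inv * d;       /* 32 correct low bits */
  inv = 2 * inv - inv * inv * d;       /* 64 correct low bits */
  return inv;                          /* d * inv == 1 (mod 2^64) */
}

Each step is the 2-adic Newton iteration inv' = 2*inv - inv*inv*d: if d*inv = 1 + e*2^k then d*inv' = 1 - e^2*2^(2k), so the number of correct low bits doubles.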
+ +include(`../config.m4') + +C cycles/limb +C norm/unorm +C AMD K8,K9 10 + +C AMD K10 10 + +C AMD bull 13.7 - +C AMD pile 13.7 + +C AMD steam +C AMD excavator +C AMD bobcat 15 - +C AMD jaguar 16 - +C Intel P4 33 = +C Intel core2 13.25 = +C Intel NHM 14 = +C Intel SBR 8.5 - +C Intel IBR 8.5 - +C Intel HWL 8 = +C Intel BWL 8 = +C Intel SKL 8 = +C Intel atom 42 -- +C Intel SLM 20.4 -- +C VIA nano + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`d', `%rcx') +define(`di', `%r8') C just mpn_pi1_bdiv_q_1 +define(`ncnt', `%r9') C just mpn_pi1_bdiv_q_1 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_q_1) + FUNC_ENTRY(4) + push %rbx + + mov %rcx, %rax + xor R32(%rcx), R32(%rcx) C ncnt count + mov %rdx, %r10 + + bt $0, R32(%rax) + jnc L(evn) C skip bsf unless divisor is even + +L(odd): mov %rax, %rbx + shr R32(%rax) + and $127, R32(%rax) C d/2, 7 bits + + LEA( binvert_limb_table, %rdx) + + movzbl (%rdx,%rax), R32(%rax) C inv 8 bits + + mov %rbx, %r11 C d without twos + + lea (%rax,%rax), R32(%rdx) C 2*inv + imul R32(%rax), R32(%rax) C inv*inv + imul R32(%rbx), R32(%rax) C inv*inv*d + sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits + + lea (%rdx,%rdx), R32(%rax) C 2*inv + imul R32(%rdx), R32(%rdx) C inv*inv + imul R32(%rbx), R32(%rdx) C inv*inv*d + sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits + + lea (%rax,%rax), %r8 C 2*inv + imul %rax, %rax C inv*inv + imul %rbx, %rax C inv*inv*d + sub %rax, %r8 C inv = 2*inv - inv*inv*d, 64 bits + + jmp L(pi1) + +L(evn): bsf %rax, %rcx + shr R8(%rcx), %rax + jmp L(odd) +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + push %rbx + + mov %rcx, %r11 C d + mov %rdx, %r10 C n + mov %r9, %rcx C ncnt + +L(pi1): mov (up), %rax C up[0] + + dec %r10 + jz L(one) + + mov 8(up), %rdx C up[1] + lea (up,%r10,8), up C up end + lea (rp,%r10,8), rp C rp end + neg %r10 C -n + + shrd R8(%rcx), %rdx, %rax + + xor R32(%rbx), R32(%rbx) + jmp L(ent) + + ALIGN(8) +L(top): + C rax q + C rbx carry bit, 0 or 1 + C rcx ncnt + C rdx + C r10 counter, limbs, negative + C r11 d + + mul %r11 C carry limb in rdx + mov (up,%r10,8), %rax + mov 8(up,%r10,8), %r9 + shrd R8(%rcx), %r9, %rax + nop + sub %rbx, %rax C apply carry bit + setc R8(%rbx) + sub %rdx, %rax C apply carry limb + adc $0, R32(%rbx) +L(ent): imul %r8, %rax + mov %rax, (rp,%r10,8) + inc %r10 + jnz L(top) + + mul %r11 C carry limb in rdx + mov (up), %rax C up high limb + shr R8(%rcx), %rax + sub %rbx, %rax C apply carry bit + sub %rdx, %rax C apply carry limb + imul %r8, %rax + mov %rax, (rp) + pop %rbx + FUNC_EXIT() + ret + +L(one): shr R8(%rcx), %rax + imul %r8, %rax + mov %rax, (rp) + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm b/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm new file mode 100644 index 0000000..86de08c --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm @@ -0,0 +1,249 @@ +dnl x86-64 mpn_div_qr_1n_pi1 +dnl -- Divide an mpn number by a normalized single-limb number, +dnl using a single-limb inverse. + +dnl Contributed to the GNU project by Niels Möller + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l +C AMD K8,K9 11 +C AMD K10 11 +C AMD bull 16 +C AMD pile 14.25 +C AMD steam ? +C AMD bobcat 16 +C AMD jaguar ? +C Intel P4 47.5 poor +C Intel core 28.5 very poor +C Intel NHM 29 very poor +C Intel SBR 16 poor +C Intel IBR 13.5 +C Intel HWL 12 +C Intel BWL ? +C Intel atom 53 very poor +C VIA nano 19 + + +C INPUT Parameters +define(`QP', `%rdi') +define(`UP', `%rsi') +define(`UN_INPUT', `%rdx') +define(`U1', `%rcx') C Also in %rax +define(`D', `%r8') +define(`DINV', `%r9') + +C Invariants +define(`B2', `%rbp') +define(`B2md', `%rbx') + +C Variables +define(`UN', `%r8') C Overlaps D input +define(`T', `%r10') +define(`U0', `%r11') +define(`U2', `%r12') +define(`Q0', `%r13') +define(`Q1', `%r14') +define(`Q2', `%r15') + +ABI_SUPPORT(STD64) + + ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_div_qr_1n_pi1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + dec UN_INPUT + jnz L(first) + + C Just a single 2/1 division. + C T, U0 are allocated in scratch registers + lea 1(U1), T + mov U1, %rax + mul DINV + mov (UP), U0 + add U0, %rax + adc T, %rdx + mov %rdx, T + imul D, %rdx + sub %rdx, U0 + cmp U0, %rax + lea (U0, D), %rax + cmovnc U0, %rax + sbb $0, T + cmp D, %rax + jc L(single_div_done) + sub D, %rax + add $1, T +L(single_div_done): + mov T, (QP) + FUNC_EXIT() + ret +L(first): + C FIXME: Could delay some of these until we enter the loop. + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + push %rbp + + mov D, B2 + imul DINV, B2 + neg B2 + mov B2, B2md + sub D, B2md + + C D not needed until final reduction + push D + mov UN_INPUT, UN C Clobbers D + + mov DINV, %rax + mul U1 + mov %rax, Q0 + add U1, %rdx + mov %rdx, T + + mov B2, %rax + mul U1 + mov -8(UP, UN, 8), U0 + mov (UP, UN, 8), U1 + mov T, (QP, UN, 8) + add %rax, U0 + adc %rdx, U1 + sbb U2, U2 + dec UN + mov U1, %rax + jz L(final) + mov $0, R32(Q1) + + ALIGN(16) + + C Loop is 28 instructions, 30 K8/K10 decoder slots, should run + C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1 + C is zero, and carry holds an extra copy of U2. 
+L(loop): + C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2 + C Remains to add in B (U1 + c) + cmovc DINV, Q1 + mov U2, Q2 + neg Q2 + mul DINV + add %rdx, Q1 + adc $0, Q2 + add Q0, Q1 + mov %rax, Q0 + mov B2, %rax + lea (B2md, U0), T + adc $0, Q2 + + C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u + mul U1 + and B2, U2 + add U2, U0 + cmovnc U0, T + + C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c + adc U1, Q1 + mov -8(UP, UN, 8), U0 + adc Q2, 8(QP, UN, 8) + jc L(q_incr) +L(q_incr_done): + add %rax, U0 + mov T, %rax + adc %rdx, %rax + mov Q1, (QP, UN, 8) + mov $0, R32(Q1) + sbb U2, U2 + dec UN + mov %rax, U1 + jnz L(loop) + +L(final): + pop D + + mov U2, Q1 + and D, U2 + sub U2, %rax + neg Q1 + + mov %rax, U1 + sub D, %rax + cmovc U1, %rax + sbb $-1, Q1 + + lea 1(%rax), T + mul DINV + add U0, %rax + adc T, %rdx + mov %rdx, T + imul D, %rdx + sub %rdx, U0 + cmp U0, %rax + lea (U0, D), %rax + cmovnc U0, %rax + sbb $0, T + cmp D, %rax + jc L(div_done) + sub D, %rax + add $1, T +L(div_done): + add T, Q0 + mov Q0, (QP) + adc Q1, 8(QP) + jnc L(done) +L(final_q_incr): + addq $1, 16(QP) + lea 8(QP), QP + jc L(final_q_incr) + +L(done): + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(q_incr): + C U1 is not live, so use it for indexing + lea 16(QP, UN, 8), U1 +L(q_incr_loop): + addq $1, (U1) + jnc L(q_incr_done) + lea 8(U1), U1 + jmp L(q_incr_loop) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h new file mode 100644 index 0000000..d87cc3b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h @@ -0,0 +1,237 @@ +/* AMD K8 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#if 0 +#undef mpn_sublsh_n +#define mpn_sublsh_n(rp,up,vp,n,c) \ + (((rp) == (up)) ? 
mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c)) \ + : MPN(mpn_sublsh_n)(rp,up,vp,n,c)) +#endif + +/* 2500 MHz K8 Brisbane */ +/* FFT tuning limit = 115,768,433 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 35 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 16 + +#define DIV_1_VS_MUL_1_PERCENT 309 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 232 +#define MUL_TOOM6H_THRESHOLD 324 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 160 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 226 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 34 +#define SQR_TOOM3_THRESHOLD 114 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 430 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define MUL_FFT_MODF_THRESHOLD 654 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 654, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 27, 7}, { 14, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 44, 8}, { 23, 7}, { 47, 8}, \ + { 25, 7}, { 51, 8}, { 31, 7}, { 63, 8}, \ + { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ + { 53, 9}, { 27, 8}, { 57, 9}, { 31, 8}, \ + { 67, 9}, { 35, 8}, { 71, 9}, { 39, 8}, \ + { 81, 9}, { 43,10}, { 23, 9}, { 55, 8}, \ + { 111,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 87,10}, { 47, 9}, { 99,10}, { 55, 9}, \ + { 111,11}, { 31,10}, { 63, 9}, { 131,10}, \ + { 71, 9}, { 147,10}, { 87,11}, { 47,10}, \ + { 111,11}, { 63,10}, { 143,11}, { 79,10}, \ + { 167,11}, { 95,10}, { 199,11}, { 111,12}, \ + { 63,11}, { 143,10}, { 287,11}, { 159,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 415,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 543,11}, \ + { 1087,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \ + { 1279,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1215,14}, \ + { 639,13}, { 1471,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1855,15}, { 511,14}, { 1023,13}, \ + { 2047,14}, { 1151,13}, { 2367,14}, { 1407,15}, \ + { 767,14}, { 1791,16}, { 511,15}, { 1023,14}, \ + { 2303,15}, { 1279,14}, { 
2687,15}, { 1535,14}, \ + { 3199,15}, { 1791,16}, { 1023,15}, { 2047,14}, \ + { 4223,15}, { 2303,14}, { 4735,15}, { 2559,16}, \ + { 1535,15}, { 3071,14}, { 6271,15}, { 3327,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 183 +#define MUL_FFT_THRESHOLD 11520 + +#define SQR_FFT_MODF_THRESHOLD 540 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 540, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 16, 6}, { 33, 7}, { 33, 8}, \ + { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \ + { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \ + { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ + { 51, 9}, { 27, 8}, { 55, 9}, { 31, 8}, \ + { 65, 9}, { 35, 8}, { 71, 9}, { 43,10}, \ + { 23, 9}, { 55,10}, { 31, 9}, { 71,10}, \ + { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \ + { 55, 9}, { 111,11}, { 31,10}, { 63, 9}, \ + { 127,10}, { 87,11}, { 47,10}, { 111,12}, \ + { 31,11}, { 63,10}, { 143,11}, { 79,10}, \ + { 167,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 127, 9}, { 511,11}, { 143,10}, \ + { 287, 9}, { 575,11}, { 159,12}, { 95,11}, \ + { 191,10}, { 383, 9}, { 767,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,10}, { 511, 9}, \ + { 1023,11}, { 271,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 575,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,12}, { 223,11}, { 447,13}, \ + { 127,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \ + { 671,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,14}, { 127,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \ + { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \ + { 959,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \ + { 959,14}, { 511,13}, { 1215,14}, { 639,13}, \ + { 1471,14}, { 767,13}, { 1663,14}, { 895,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2111,14}, \ + { 1151,13}, { 2303,14}, { 1407,15}, { 767,14}, \ + { 1791,16}, { 511,15}, { 1023,14}, { 2303,15}, \ + { 1279,14}, { 2687,15}, { 1535,14}, { 3199,15}, \ + { 1791,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2559,16}, { 1535,15}, \ + { 3071,14}, { 6271,15}, { 3327,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 202 +#define SQR_FFT_THRESHOLD 7296 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 61 +#define MULLO_MUL_N_THRESHOLD 22239 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 14281 + +#define DC_DIV_QR_THRESHOLD 47 +#define DC_DIVAPPR_Q_THRESHOLD 266 +#define DC_BDIV_QR_THRESHOLD 38 +#define DC_BDIV_Q_THRESHOLD 104 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 252 +#define INV_APPR_THRESHOLD 250 + +#define BINV_NEWTON_THRESHOLD 258 +#define REDC_1_TO_REDC_2_THRESHOLD 35 +#define REDC_2_TO_REDC_N_THRESHOLD 79 + +#define 
MU_DIV_QR_THRESHOLD 2089 +#define MU_DIVAPPR_Q_THRESHOLD 1895 +#define MUPI_DIV_QR_THRESHOLD 99 +#define MU_BDIV_QR_THRESHOLD 1787 +#define MU_BDIV_Q_THRESHOLD 1895 + +#define POWM_SEC_TABLE 1,16,194,960,2825 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 26 +#define SET_STR_DC_THRESHOLD 248 +#define SET_STR_PRECOMPUTE_THRESHOLD 1747 + +#define FAC_DSC_THRESHOLD 1240 +#define FAC_ODD_THRESHOLD 27 + +#define MATRIX22_STRASSEN_THRESHOLD 21 +#define HGCD2_DIV1_METHOD 3 /* 4.10% faster than 5 */ +#define HGCD_THRESHOLD 141 +#define HGCD_APPR_THRESHOLD 181 +#define HGCD_REDUCE_THRESHOLD 4633 +#define GCD_DC_THRESHOLD 622 +#define GCDEXT_DC_THRESHOLD 496 +#define JACOBI_BASE_METHOD 1 /* 0.97% faster than 3 */ + +/* Tuneup completed successfully, took 131832 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm new file mode 100644 index 0000000..ca2efb9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm @@ -0,0 +1,469 @@ +dnl AMD64 mpn_mul_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund and David Harvey. + +dnl Copyright 2008, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.375 +C AMD K10 2.375 +C Intel P4 15-16 +C Intel core2 4.45 +C Intel corei 4.35 +C Intel atom ? +C VIA nano 4.5 + +C The inner loops of this code are the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Use fewer registers. (how??? I can't see it -- david) +C * Avoid some "mov $0,r" and instead use "xor r,r". +C * Can the top of each L(addmul_outer_n) prologue be folded into the +C mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the +C case where vn = 1 or 2; is it worth it? 
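The loops below are schoolbook multiplication: a mul_1 pass (vn odd) or mul_2 pass (vn even) seeds the result, and addmul_2 passes then fold in the remaining v limbs two at a time, four u limbs per unrolled iteration held in the window registers w0..w3. For reference, the value being computed is {rp, un+vn} = {up, un} * {vp, vn}; a plain C sketch of the same schoolbook recurrence (not GMP code, and it assumes a compiler providing unsigned __int128 for the 64x64->128 multiply):

#include <stdint.h>

typedef unsigned __int128 u128;

/* Schoolbook reference: {rp, un+vn} = {up, un} * {vp, vn}, un >= vn >= 1. */
static void
mul_basecase_ref (uint64_t *rp, const uint64_t *up, long un,
                  const uint64_t *vp, long vn)
{
  for (long i = 0; i < un + vn; i++)
    rp[i] = 0;
  for (long j = 0; j < vn; j++)        /* one row per v limb */
    {
      uint64_t cy = 0;
      for (long i = 0; i < un; i++)    /* rp[i+j] += up[i]*vp[j], with carry */
        {
          u128 t = (u128) up[i] * vp[j] + rp[i + j] + cy;
          rp[i + j] = (uint64_t) t;
          cy = (uint64_t) (t >> 64);
        }
      rp[un + j] = cy;                 /* top limb of this row */
    }
}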
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`v0', `%r12') +define(`v1', `%r9') + +define(`w0', `%rbx') +define(`w1', `%r15') +define(`w2', `%rbp') +define(`w3', `%r10') + +define(`n', `%r11') +define(`outer_addr', `%r14') +define(`un', `%r13') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + xor R32(un), R32(un) + mov (up), %rax + mov (vp), v0 + + sub un_param, un C rdx used by mul + mov un, n + mov R32(un_param), R32(w0) + + lea (rp,un_param,8), rp + lea (up,un_param,8), up + + mul v0 + + test $1, R8(vn) + jz L(mul_2) + +C =========================================================== +C mul_1 for vp[0] if vn is odd + +L(mul_1): + and $3, R32(w0) + jz L(mul_1_prologue_0) + cmp $2, R32(w0) + jc L(mul_1_prologue_1) + jz L(mul_1_prologue_2) + +L(mul_1_prologue_3): + add $-1, n + lea L(addmul_outer_3)(%rip), outer_addr + mov %rax, w3 + mov %rdx, w0 + jmp L(mul_1_entry_3) + +L(mul_1_prologue_0): + mov %rax, w2 + mov %rdx, w3 C note: already w0 == 0 + lea L(addmul_outer_0)(%rip), outer_addr + jmp L(mul_1_entry_0) + +L(mul_1_prologue_1): + cmp $-1, un + jne 2f + mov %rax, -8(rp) + mov %rdx, (rp) + jmp L(ret) +2: add $1, n + lea L(addmul_outer_1)(%rip), outer_addr + mov %rax, w1 + mov %rdx, w2 + xor R32(w3), R32(w3) + mov (up,n,8), %rax + jmp L(mul_1_entry_1) + +L(mul_1_prologue_2): + add $-2, n + lea L(addmul_outer_2)(%rip), outer_addr + mov %rax, w0 + mov %rdx, w1 + mov 24(up,n,8), %rax + xor R32(w2), R32(w2) + xor R32(w3), R32(w3) + jmp L(mul_1_entry_2) + + + C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments + + ALIGN(16) +L(mul_1_top): + mov w0, -16(rp,n,8) + add %rax, w1 + mov (up,n,8), %rax + adc %rdx, w2 +L(mul_1_entry_1): + xor R32(w0), R32(w0) + mul v0 + mov w1, -8(rp,n,8) + add %rax, w2 + adc %rdx, w3 +L(mul_1_entry_0): + mov 8(up,n,8), %rax + mul v0 + mov w2, (rp,n,8) + add %rax, w3 + adc %rdx, w0 +L(mul_1_entry_3): + mov 16(up,n,8), %rax + mul v0 + mov w3, 8(rp,n,8) + xor R32(w2), R32(w2) C zero + mov w2, w3 C zero + add %rax, w0 + mov 24(up,n,8), %rax + mov w2, w1 C zero + adc %rdx, w1 +L(mul_1_entry_2): + mul v0 + add $4, n + js L(mul_1_top) + + mov w0, -16(rp) + add %rax, w1 + mov w1, -8(rp) + adc %rdx, w2 + mov w2, (rp) + + add $-1, vn C vn -= 1 + jz L(ret) + + mov 8(vp), v0 + mov 16(vp), v1 + + lea 8(vp), vp C vp += 1 + lea 8(rp), rp C rp += 1 + + jmp *outer_addr + +C =========================================================== +C mul_2 for vp[0], vp[1] if vn is even + + ALIGN(16) +L(mul_2): + mov 8(vp), v1 + + and $3, R32(w0) + jz L(mul_2_prologue_0) + cmp $2, R32(w0) + jz L(mul_2_prologue_2) + jc L(mul_2_prologue_1) + +L(mul_2_prologue_3): + lea L(addmul_outer_3)(%rip), outer_addr + add $2, n + mov %rax, -16(rp,n,8) + mov %rdx, w2 + xor R32(w3), R32(w3) + xor R32(w0), R32(w0) + mov -16(up,n,8), %rax + jmp L(mul_2_entry_3) + + ALIGN(16) +L(mul_2_prologue_0): + add $3, n + mov %rax, w0 + mov %rdx, w1 + xor R32(w2), R32(w2) + mov -24(up,n,8), %rax + lea L(addmul_outer_0)(%rip), outer_addr + jmp L(mul_2_entry_0) + + ALIGN(16) +L(mul_2_prologue_1): + mov %rax, w3 + mov %rdx, w0 + xor R32(w1), R32(w1) + lea L(addmul_outer_1)(%rip), outer_addr + jmp L(mul_2_entry_1) + + ALIGN(16) +L(mul_2_prologue_2): + add $1, n + lea L(addmul_outer_2)(%rip), outer_addr + mov $0, R32(w0) + mov $0, R32(w1) + mov 
%rax, w2 + mov -8(up,n,8), %rax + mov %rdx, w3 + jmp L(mul_2_entry_2) + + C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments + + ALIGN(16) +L(mul_2_top): + mov -32(up,n,8), %rax + mul v1 + add %rax, w0 + adc %rdx, w1 + mov -24(up,n,8), %rax + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) +L(mul_2_entry_0): + mul v1 + add %rax, w1 + mov w0, -24(rp,n,8) + adc %rdx, w2 + mov -16(up,n,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov -16(up,n,8), %rax + adc $0, R32(w3) + mov $0, R32(w0) + mov w1, -16(rp,n,8) +L(mul_2_entry_3): + mul v1 + add %rax, w2 + mov -8(up,n,8), %rax + adc %rdx, w3 + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov -8(up,n,8), %rax + adc %rdx, w3 + adc R32(w1), R32(w0) C adc $0, w0 +L(mul_2_entry_2): + mul v1 + add %rax, w3 + mov w2, -8(rp,n,8) + adc %rdx, w0 + mov (up,n,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) +L(mul_2_entry_1): + add $4, n + mov w3, -32(rp,n,8) + js L(mul_2_top) + + mov -32(up,n,8), %rax C FIXME: n is constant + mul v1 + add %rax, w0 + mov w0, (rp) + adc %rdx, w1 + mov w1, 8(rp) + + add $-2, vn C vn -= 2 + jz L(ret) + + mov 16(vp), v0 + mov 24(vp), v1 + + lea 16(vp), vp C vp += 2 + lea 16(rp), rp C rp += 2 + + jmp *outer_addr + + +C =========================================================== +C addmul_2 for remaining vp's + + C in the following prologues, we reuse un to store the + C adjusted value of n that is reloaded on each iteration + +L(addmul_outer_0): + add $3, un + lea 0(%rip), outer_addr + + mov un, n + mov -24(up,un,8), %rax + mul v0 + mov %rax, w0 + mov -24(up,un,8), %rax + mov %rdx, w1 + xor R32(w2), R32(w2) + jmp L(addmul_entry_0) + +L(addmul_outer_1): + mov un, n + mov (up,un,8), %rax + mul v0 + mov %rax, w3 + mov (up,un,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + jmp L(addmul_entry_1) + +L(addmul_outer_2): + add $1, un + lea 0(%rip), outer_addr + + mov un, n + mov -8(up,un,8), %rax + mul v0 + xor R32(w0), R32(w0) + mov %rax, w2 + xor R32(w1), R32(w1) + mov %rdx, w3 + mov -8(up,un,8), %rax + jmp L(addmul_entry_2) + +L(addmul_outer_3): + add $2, un + lea 0(%rip), outer_addr + + mov un, n + mov -16(up,un,8), %rax + xor R32(w3), R32(w3) + mul v0 + mov %rax, w1 + mov -16(up,un,8), %rax + mov %rdx, w2 + jmp L(addmul_entry_3) + + C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments + + ALIGN(16) +L(addmul_top): + add w3, -32(rp,n,8) + adc %rax, w0 + mov -24(up,n,8), %rax + adc %rdx, w1 + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,n,8), %rax + adc %rdx, w1 + adc R32(w2), R32(w2) C adc $0, w2 +L(addmul_entry_0): + mul v1 + xor R32(w3), R32(w3) + add w0, -24(rp,n,8) + adc %rax, w1 + mov -16(up,n,8), %rax + adc %rdx, w2 + mul v0 + add %rax, w1 + mov -16(up,n,8), %rax + adc %rdx, w2 + adc $0, R32(w3) +L(addmul_entry_3): + mul v1 + add w1, -16(rp,n,8) + adc %rax, w2 + mov -8(up,n,8), %rax + adc %rdx, w3 + mul v0 + xor R32(w0), R32(w0) + add %rax, w2 + adc %rdx, w3 + mov $0, R32(w1) + mov -8(up,n,8), %rax + adc R32(w1), R32(w0) C adc $0, w0 +L(addmul_entry_2): + mul v1 + add w2, -8(rp,n,8) + adc %rax, w3 + adc %rdx, w0 + mov (up,n,8), %rax + mul v0 + add %rax, w3 + mov (up,n,8), %rax + adc %rdx, w0 + adc $0, R32(w1) +L(addmul_entry_1): + mul v1 + add $4, n + js L(addmul_top) + + add w3, -8(rp) + adc %rax, w0 + mov w0, (rp) + adc %rdx, w1 + mov w1, 8(rp) + + add $-2, vn C vn -= 2 + jz L(ret) + + lea 16(rp), rp C rp += 2 + lea 16(vp), vp C vp += 2 + + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + 
+ ALIGN(16) +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm new file mode 100644 index 0000000..fa00f42 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm @@ -0,0 +1,436 @@ +dnl AMD64 mpn_mullo_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C NOTES +C * There is a major stupidity in that we call mpn_mul_1 initially, for a +C large trip count. Instead, we should start with mul_2 for any operand +C size congruence class. +C * Stop iterating addmul_2 earlier, falling into straight-line triangle code +C for the last 2-3 iterations. +C * Perhaps implement n=4 special code. +C * The reload of the outer loop jump address hurts branch prediction. +C * The addmul_2 loop ends with an MUL whose high part is not used upon loop +C exit. 
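The structure mirrors mul_basecase (a mul_1 or mul_2 pass, then addmul_2 passes), but every pass is truncated at column n-1, and the last product of each pass is formed with a plain imul since only its low half is needed. A C sketch of the value computed, {rp, n} = low n limbs of {up, n} * {vp, n} (a reference only, assuming unsigned __int128; not GMP code):

#include <stdint.h>

typedef unsigned __int128 u128;

/* Low-half reference: {rp, n} = ({up, n} * {vp, n}) mod B^n, B = 2^64. */
static void
mullo_basecase_ref (uint64_t *rp, const uint64_t *up,
                    const uint64_t *vp, long n)
{
  for (long i = 0; i < n; i++)
    rp[i] = 0;
  for (long j = 0; j < n; j++)
    {
      uint64_t cy = 0;
      for (long i = 0; i + j < n; i++)   /* stop at column n-1 */
        {
          u128 t = (u128) up[i] * vp[j] + rp[i + j] + cy;
          rp[i + j] = (uint64_t) t;
          cy = (uint64_t) (t >> 64);
        }
      /* the carry out of column n-1 is discarded */
    }
}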
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r11') +define(`outer_addr', `%r8') +define(`j', `%r9') +define(`v0', `%r13') +define(`v1', `%r14') +define(`w0', `%rbx') +define(`w1', `%r15') +define(`w2', `%rbp') +define(`w3', `%r10') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + cmp $4, n + jge L(gen) + mov (up), %rax C u0 + mov (vp_param), %r8 C v0 + + lea L(tab)(%rip), %r9 +ifdef(`PIC', +` movslq (%r9,%rcx,4), %r10 + add %r10, %r9 + jmp *%r9 +',` + jmp *(%r9,n,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(tab), L(tab)) C not allowed + JMPENT( L(1), L(tab)) C 1 + JMPENT( L(2), L(tab)) C 2 + JMPENT( L(3), L(tab)) C 3 +dnl JMPENT( L(0m4), L(tab)) C 4 +dnl JMPENT( L(1m4), L(tab)) C 5 +dnl JMPENT( L(2m4), L(tab)) C 6 +dnl JMPENT( L(3m4), L(tab)) C 7 +dnl JMPENT( L(0m4), L(tab)) C 8 +dnl JMPENT( L(1m4), L(tab)) C 9 +dnl JMPENT( L(2m4), L(tab)) C 10 +dnl JMPENT( L(3m4), L(tab)) C 11 + TEXT + +L(1): imul %r8, %rax + mov %rax, (rp) + FUNC_EXIT() + ret + +L(2): mov 8(vp_param), %r11 + imul %rax, %r11 C u0 x v1 + mul %r8 C u0 x v0 + mov %rax, (rp) + imul 8(up), %r8 C u1 x v0 + lea (%r11, %rdx), %rax + add %r8, %rax + mov %rax, 8(rp) + FUNC_EXIT() + ret + +L(3): mov 8(vp_param), %r9 C v1 + mov 16(vp_param), %r11 + mul %r8 C u0 x v0 -> + mov %rax, (rp) C r0 + mov (up), %rax C u0 + mov %rdx, %rcx C r1 + mul %r9 C u0 x v1 -> + imul 8(up), %r9 C u1 x v1 -> r2 + mov 16(up), %r10 + imul %r8, %r10 C u2 x v0 -> r2 + add %rax, %rcx + adc %rdx, %r9 + add %r10, %r9 + mov 8(up), %rax C u1 + mul %r8 C u1 x v0 -> + add %rax, %rcx + adc %rdx, %r9 + mov %r11, %rax + imul (up), %rax C u0 x v2 -> r2 + add %rax, %r9 + mov %rcx, 8(rp) + mov %r9, 16(rp) + FUNC_EXIT() + ret + +L(0m4): +L(1m4): +L(2m4): +L(3m4): +L(gen): push %rbx + push %rbp + push %r13 + push %r14 + push %r15 + + mov (up), %rax + mov (vp_param), v0 + mov vp_param, vp + + lea (rp,n,8), rp + lea (up,n,8), up + neg n + + mul v0 + + test $1, R8(n) + jz L(mul_2) + +L(mul_1): + lea -8(rp), rp + lea -8(up), up + test $2, R8(n) + jnz L(mul_1_prologue_3) + +L(mul_1_prologue_2): C n = 7, 11, 15, ... + lea -1(n), j + lea L(addmul_outer_1)(%rip), outer_addr + mov %rax, w0 + mov %rdx, w1 + xor R32(w2), R32(w2) + xor R32(w3), R32(w3) + mov 16(up,n,8), %rax + jmp L(mul_1_entry_2) + +L(mul_1_prologue_3): C n = 5, 9, 13, ... 
+ lea 1(n), j + lea L(addmul_outer_3)(%rip), outer_addr + mov %rax, w2 + mov %rdx, w3 + xor R32(w0), R32(w0) + jmp L(mul_1_entry_0) + + ALIGN(16) +L(mul_1_top): + mov w0, -16(rp,j,8) + add %rax, w1 + mov (up,j,8), %rax + adc %rdx, w2 + xor R32(w0), R32(w0) + mul v0 + mov w1, -8(rp,j,8) + add %rax, w2 + adc %rdx, w3 +L(mul_1_entry_0): + mov 8(up,j,8), %rax + mul v0 + mov w2, (rp,j,8) + add %rax, w3 + adc %rdx, w0 + mov 16(up,j,8), %rax + mul v0 + mov w3, 8(rp,j,8) + xor R32(w2), R32(w2) C zero + mov w2, w3 C zero + add %rax, w0 + mov 24(up,j,8), %rax + mov w2, w1 C zero + adc %rdx, w1 +L(mul_1_entry_2): + mul v0 + add $4, j + js L(mul_1_top) + + mov w0, -16(rp) + add %rax, w1 + mov w1, -8(rp) + adc %rdx, w2 + + imul (up), v0 + add v0, w2 + mov w2, (rp) + + add $1, n + jz L(ret) + + mov 8(vp), v0 + mov 16(vp), v1 + + lea 16(up), up + lea 8(vp), vp + lea 24(rp), rp + + jmp *outer_addr + + +L(mul_2): + mov 8(vp), v1 + test $2, R8(n) + jz L(mul_2_prologue_3) + + ALIGN(16) +L(mul_2_prologue_1): + lea 0(n), j + mov %rax, w3 + mov %rdx, w0 + xor R32(w1), R32(w1) + mov (up,n,8), %rax + lea L(addmul_outer_3)(%rip), outer_addr + jmp L(mul_2_entry_1) + + ALIGN(16) +L(mul_2_prologue_3): + lea 2(n), j + mov $0, R32(w3) + mov %rax, w1 + mov (up,n,8), %rax + mov %rdx, w2 + lea L(addmul_outer_1)(%rip), outer_addr + jmp L(mul_2_entry_3) + + ALIGN(16) +L(mul_2_top): + mov -32(up,j,8), %rax + mul v1 + add %rax, w0 + adc %rdx, w1 + mov -24(up,j,8), %rax + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,j,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 + add %rax, w1 + mov w0, -24(rp,j,8) + adc %rdx, w2 + mov -16(up,j,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov -16(up,j,8), %rax + adc $0, R32(w3) +L(mul_2_entry_3): + mov $0, R32(w0) + mov w1, -16(rp,j,8) + mul v1 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + adc R32(w1), R32(w0) + mul v1 + add %rax, w3 + mov w2, -8(rp,j,8) + adc %rdx, w0 + mov (up,j,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) +L(mul_2_entry_1): + add $4, j + mov w3, -32(rp,j,8) + js L(mul_2_top) + + imul -16(up), v1 + add v1, w0 + imul -8(up), v0 + add v0, w0 + mov w0, -8(rp) + + add $2, n + jz L(ret) + + mov 16(vp), v0 + mov 24(vp), v1 + + lea 16(vp), vp + lea 16(rp), rp + + jmp *outer_addr + + +L(addmul_outer_1): + lea -2(n), j + mov -16(up,n,8), %rax + mul v0 + mov %rax, w3 + mov -16(up,n,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + lea L(addmul_outer_3)(%rip), outer_addr + jmp L(addmul_entry_1) + +L(addmul_outer_3): + lea 0(n), j + mov -16(up,n,8), %rax + xor R32(w3), R32(w3) + mul v0 + mov %rax, w1 + mov -16(up,n,8), %rax + mov %rdx, w2 + lea L(addmul_outer_1)(%rip), outer_addr + jmp L(addmul_entry_3) + + ALIGN(16) +L(addmul_top): + add w3, -32(rp,j,8) + adc %rax, w0 + mov -24(up,j,8), %rax + adc %rdx, w1 + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,j,8), %rax + adc %rdx, w1 + adc R32(w2), R32(w2) + mul v1 + xor R32(w3), R32(w3) + add w0, -24(rp,j,8) + adc %rax, w1 + mov -16(up,j,8), %rax + adc %rdx, w2 + mul v0 + add %rax, w1 + mov -16(up,j,8), %rax + adc %rdx, w2 + adc $0, R32(w3) +L(addmul_entry_3): + mul v1 + add w1, -16(rp,j,8) + adc %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + mul v0 + xor R32(w0), R32(w0) + add %rax, w2 + adc %rdx, w3 + mov $0, R32(w1) + mov -8(up,j,8), %rax + adc R32(w1), R32(w0) + mul v1 + add w2, -8(rp,j,8) + adc %rax, w3 + adc %rdx, w0 + mov (up,j,8), %rax + mul v0 + add %rax, w3 + mov 
(up,j,8), %rax + adc %rdx, w0 + adc $0, R32(w1) +L(addmul_entry_1): + mul v1 + add $4, j + js L(addmul_top) + + add w3, -32(rp) + adc %rax, w0 + + imul -24(up), v0 + add v0, w0 + add w0, -24(rp) + + add $2, n + jns L(ret) + + lea 16(vp), vp + + mov (vp), v0 + mov 8(vp), v1 + + lea -16(up), up + + jmp *outer_addr + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm new file mode 100644 index 0000000..86f1414 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm @@ -0,0 +1,559 @@ +dnl AMD64 mpn_mulmid_basecase + +dnl Contributed by David Harvey. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb +C K8,K9: 2.375 (2.5 when un - vn is "small") +C K10: ? +C P4: ? +C P6-15: ? 
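This routine computes a middle product: with un >= vn >= 1 and row length n = un - vn + 1, the result {rp, n+2} accumulates only the n complete anti-diagonals of {up, un} * {vp, vn}, i.e. the terms up[i]*vp[j] with vn-1 <= i+j <= un-1, plus two guard limbs at the top for the accumulated carries. A rough C reference of that accumulation, assuming this conventional middle-product layout (a sketch, not GMP's generic code):

#include <stdint.h>

typedef unsigned __int128 u128;

/* Middle-product sketch: n = un - vn + 1 diagonal sums plus 2 guard limbs. */
static void
mulmid_basecase_ref (uint64_t *rp, const uint64_t *up, long un,
                     const uint64_t *vp, long vn)
{
  long n = un - vn + 1;
  for (long k = 0; k < n + 2; k++)
    rp[k] = 0;
  for (long j = 0; j < vn; j++)
    {
      uint64_t cy = 0;
      for (long d = 0; d < n; d++)     /* diagonal i + j = d + vn - 1 */
        {
          u128 t = (u128) up[d + vn - 1 - j] * vp[j] + rp[d] + cy;
          rp[d] = (uint64_t) t;
          cy = (uint64_t) (t >> 64);
        }
      u128 t = (u128) rp[n] + cy;      /* fold this row's carry into the */
      rp[n] = (uint64_t) t;            /* two guard limbs */
      rp[n + 1] += (uint64_t) (t >> 64);
    }
}

When un - vn is small the code takes the L(diagonal) path, which walks these anti-diagonals directly instead of running mul/addmul rows.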
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp_param',`%rcx') +define(`vn', `%r8') + +define(`v0', `%r12') +define(`v1', `%r9') + +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') + +define(`n', `%r11') +define(`outer_addr', `%r14') +define(`un', `%r13') +define(`vp', `%r15') + +define(`vp_inner', `%r10') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mulmid_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov vp_param, vp + + C use un for row length (= un_param - vn + 1) + lea 1(un_param), un + sub vn, un + + lea (rp,un,8), rp + + cmp $4, un C TODO: needs tuning + jc L(diagonal) + + lea (up,un_param,8), up + + test $1, vn + jz L(mul_2) + +C =========================================================== +C mul_1 for vp[0] if vn is odd + +L(mul_1): + mov R32(un), R32(w0) + + neg un + mov (up,un,8), %rax + mov (vp), v0 + mul v0 + + and $-4, un C round down to multiple of 4 + mov un, n + + and $3, R32(w0) + jz L(mul_1_prologue_0) + cmp $2, R32(w0) + jc L(mul_1_prologue_1) + jz L(mul_1_prologue_2) + +L(mul_1_prologue_3): + mov %rax, w3 + mov %rdx, w0 + lea L(addmul_prologue_3)(%rip), outer_addr + jmp L(mul_1_entry_3) + + ALIGN(16) +L(mul_1_prologue_0): + mov %rax, w2 + mov %rdx, w3 C note already w0 == 0 + lea L(addmul_prologue_0)(%rip), outer_addr + jmp L(mul_1_entry_0) + + ALIGN(16) +L(mul_1_prologue_1): + add $4, n + mov %rax, w1 + mov %rdx, w2 + mov $0, R32(w3) + mov (up,n,8), %rax + lea L(addmul_prologue_1)(%rip), outer_addr + jmp L(mul_1_entry_1) + + ALIGN(16) +L(mul_1_prologue_2): + mov %rax, w0 + mov %rdx, w1 + mov 24(up,n,8), %rax + mov $0, R32(w2) + mov $0, R32(w3) + lea L(addmul_prologue_2)(%rip), outer_addr + jmp L(mul_1_entry_2) + + + C this loop is 10 c/loop = 2.5 c/l on K8 + + ALIGN(16) +L(mul_1_top): + mov w0, -16(rp,n,8) + add %rax, w1 + mov (up,n,8), %rax + adc %rdx, w2 +L(mul_1_entry_1): + mov $0, R32(w0) + mul v0 + mov w1, -8(rp,n,8) + add %rax, w2 + adc %rdx, w3 +L(mul_1_entry_0): + mov 8(up,n,8), %rax + mul v0 + mov w2, (rp,n,8) + add %rax, w3 + adc %rdx, w0 +L(mul_1_entry_3): + mov 16(up,n,8), %rax + mul v0 + mov w3, 8(rp,n,8) + mov $0, R32(w2) C zero + mov w2, w3 C zero + add %rax, w0 + mov 24(up,n,8), %rax + mov w2, w1 C zero + adc %rdx, w1 +L(mul_1_entry_2): + mul v0 + add $4, n + js L(mul_1_top) + + mov w0, -16(rp) + add %rax, w1 + mov w1, -8(rp) + mov w2, 8(rp) C zero last limb of output + adc %rdx, w2 + mov w2, (rp) + + dec vn + jz L(ret) + + lea -8(up), up + lea 8(vp), vp + + mov un, n + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + +C =========================================================== +C mul_2 for vp[0], vp[1] if vn is even + + ALIGN(16) +L(mul_2): + mov R32(un), R32(w0) + + neg un + mov -8(up,un,8), %rax + mov (vp), v0 + mov 8(vp), v1 + mul v1 + + and $-4, un C round down to multiple of 4 + mov un, n + + and $3, R32(w0) + jz L(mul_2_prologue_0) + cmp $2, R32(w0) + jc L(mul_2_prologue_1) + jz L(mul_2_prologue_2) + +L(mul_2_prologue_3): + mov %rax, w1 + mov %rdx, w2 + lea L(addmul_prologue_3)(%rip), outer_addr + jmp L(mul_2_entry_3) + + ALIGN(16) +L(mul_2_prologue_0): + mov %rax, w0 + mov %rdx, w1 + lea L(addmul_prologue_0)(%rip), outer_addr + jmp L(mul_2_entry_0) + + ALIGN(16) +L(mul_2_prologue_1): + mov %rax, w3 + mov %rdx, w0 + mov $0, R32(w1) + lea L(addmul_prologue_1)(%rip), outer_addr + jmp L(mul_2_entry_1) + + ALIGN(16) 
+L(mul_2_prologue_2): + mov %rax, w2 + mov %rdx, w3 + mov $0, R32(w0) + mov 16(up,n,8), %rax + lea L(addmul_prologue_2)(%rip), outer_addr + jmp L(mul_2_entry_2) + + + C this loop is 18 c/loop = 2.25 c/l on K8 + + ALIGN(16) +L(mul_2_top): + mov -8(up,n,8), %rax + mul v1 + add %rax, w0 + adc %rdx, w1 +L(mul_2_entry_0): + mov $0, R32(w2) + mov (up,n,8), %rax + mul v0 + add %rax, w0 + mov (up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 + add %rax, w1 + mov w0, (rp,n,8) + adc %rdx, w2 +L(mul_2_entry_3): + mov 8(up,n,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov $0, R32(w0) + adc $0, R32(w3) + mov 8(up,n,8), %rax + mov w1, 8(rp,n,8) + mul v1 + add %rax, w2 + mov 16(up,n,8), %rax + adc %rdx, w3 +L(mul_2_entry_2): + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov 16(up,n,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + add %rax, w3 + mov w2, 16(rp,n,8) + adc %rdx, w0 +L(mul_2_entry_1): + mov 24(up,n,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) + add $4, n + mov w3, -8(rp,n,8) + jnz L(mul_2_top) + + mov w0, (rp) + mov w1, 8(rp) + + sub $2, vn + jz L(ret) + + lea 16(vp), vp + lea -16(up), up + + mov un, n + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + +C =========================================================== +C addmul_2 for remaining vp's + + ALIGN(16) +L(addmul_prologue_0): + mov -8(up,n,8), %rax + mul v1 + mov %rax, w1 + mov %rdx, w2 + mov $0, R32(w3) + jmp L(addmul_entry_0) + + ALIGN(16) +L(addmul_prologue_1): + mov 16(up,n,8), %rax + mul v1 + mov %rax, w0 + mov %rdx, w1 + mov $0, R32(w2) + mov 24(up,n,8), %rax + jmp L(addmul_entry_1) + + ALIGN(16) +L(addmul_prologue_2): + mov 8(up,n,8), %rax + mul v1 + mov %rax, w3 + mov %rdx, w0 + mov $0, R32(w1) + jmp L(addmul_entry_2) + + ALIGN(16) +L(addmul_prologue_3): + mov (up,n,8), %rax + mul v1 + mov %rax, w2 + mov %rdx, w3 + mov $0, R32(w0) + mov $0, R32(w1) + jmp L(addmul_entry_3) + + C this loop is 19 c/loop = 2.375 c/l on K8 + + ALIGN(16) +L(addmul_top): + mov $0, R32(w3) + add %rax, w0 + mov -8(up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 + add w0, -8(rp,n,8) + adc %rax, w1 + adc %rdx, w2 +L(addmul_entry_0): + mov (up,n,8), %rax + mul v0 + add %rax, w1 + mov (up,n,8), %rax + adc %rdx, w2 + adc $0, R32(w3) + mul v1 + add w1, (rp,n,8) + mov $0, R32(w1) + adc %rax, w2 + mov $0, R32(w0) + adc %rdx, w3 +L(addmul_entry_3): + mov 8(up,n,8), %rax + mul v0 + add %rax, w2 + mov 8(up,n,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + add w2, 8(rp,n,8) + adc %rax, w3 + adc %rdx, w0 +L(addmul_entry_2): + mov 16(up,n,8), %rax + mul v0 + add %rax, w3 + mov 16(up,n,8), %rax + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add w3, 16(rp,n,8) + nop C don't ask... 
+ adc %rax, w0 + mov $0, R32(w2) + mov 24(up,n,8), %rax + adc %rdx, w1 +L(addmul_entry_1): + mul v0 + add $4, n + jnz L(addmul_top) + + add %rax, w0 + adc %rdx, w1 + adc $0, R32(w2) + + add w0, -8(rp) + adc w1, (rp) + adc w2, 8(rp) + + sub $2, vn + jz L(ret) + + lea 16(vp), vp + lea -16(up), up + + mov un, n + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + +C =========================================================== +C accumulate along diagonals if un - vn is small + + ALIGN(16) +L(diagonal): + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + xor R32(w2), R32(w2) + + neg un + + mov R32(vn), %eax + and $3, %eax + jz L(diag_prologue_0) + cmp $2, %eax + jc L(diag_prologue_1) + jz L(diag_prologue_2) + +L(diag_prologue_3): + lea -8(vp), vp + mov vp, vp_inner + add $1, vn + mov vn, n + lea L(diag_entry_3)(%rip), outer_addr + jmp L(diag_entry_3) + +L(diag_prologue_0): + mov vp, vp_inner + mov vn, n + lea 0(%rip), outer_addr + mov -8(up,n,8), %rax + jmp L(diag_entry_0) + +L(diag_prologue_1): + lea 8(vp), vp + mov vp, vp_inner + add $3, vn + mov vn, n + lea 0(%rip), outer_addr + mov -8(vp_inner), %rax + jmp L(diag_entry_1) + +L(diag_prologue_2): + lea -16(vp), vp + mov vp, vp_inner + add $2, vn + mov vn, n + lea 0(%rip), outer_addr + mov 16(vp_inner), %rax + jmp L(diag_entry_2) + + + C this loop is 10 c/loop = 2.5 c/l on K8 + + ALIGN(16) +L(diag_top): + add %rax, w0 + adc %rdx, w1 + mov -8(up,n,8), %rax + adc $0, w2 +L(diag_entry_0): + mulq (vp_inner) + add %rax, w0 + adc %rdx, w1 + adc $0, w2 +L(diag_entry_3): + mov -16(up,n,8), %rax + mulq 8(vp_inner) + add %rax, w0 + mov 16(vp_inner), %rax + adc %rdx, w1 + adc $0, w2 +L(diag_entry_2): + mulq -24(up,n,8) + add %rax, w0 + mov 24(vp_inner), %rax + adc %rdx, w1 + lea 32(vp_inner), vp_inner + adc $0, w2 +L(diag_entry_1): + mulq -32(up,n,8) + sub $4, n + jnz L(diag_top) + + add %rax, w0 + adc %rdx, w1 + adc $0, w2 + + mov w0, (rp,un,8) + + inc un + jz L(diag_end) + + mov vn, n + mov vp, vp_inner + + lea 8(up), up + mov w1, w0 + mov w2, w1 + xor R32(w2), R32(w2) + + jmp *outer_addr + +L(diag_end): + mov w1, (rp) + mov w2, 8(rp) + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm b/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm new file mode 100644 index 0000000..9327b21 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm @@ -0,0 +1,591 @@ +dnl X86-64 mpn_redc_1 optimised for AMD K8-K10. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2004, 2008, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Micro-optimise, none performed thus far. +C * This looks different from other current redc_1.asm variants. Consider +C adapting this to the mainstream style. +C * Is this code really faster than more approaches which compute q0 later? +C Is the use of a jump jump table faster? Or is the edge of this due to the +C inlined add_n code? +C * Put initial m[0] x q0 computation in header. +C * Put basecases at the file's end, single them out before the pushes. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r11') +define(`nneg', `%r12') +define(`mp', `%r13') +define(`q0', `%rbp') +define(`vp', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbp + mov (up), q0 C up[0] + push %rbx + imul u0inv, q0 C first q0, for all execution paths + push %r12 + push %r13 + push %r14 + push %r15 + + mov n, nneg + neg nneg + lea (mp_param,n,8), mp C mp += n + lea -16(up,n,8), up C up += n + + mov R32(n), R32(%rax) + and $3, R32(%rax) + lea 4(%rax), %r9 + cmp $4, R32(n) + cmovg %r9, %rax + lea L(tab)(%rip), %r9 +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(0m4), L(tab)) + JMPENT( L(1m4), L(tab)) + JMPENT( L(2m4), L(tab)) + JMPENT( L(3m4), L(tab)) + TEXT + + ALIGN(16) +L(1): mov (mp_param), %rax + mul q0 + add 8(up), %rax + adc 16(up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + + + ALIGN(16) +L(2): mov (mp_param), %rax + mul q0 + xor R32(%r14), R32(%r14) + mov %rax, %r10 + mov -8(mp), %rax + mov %rdx, %r9 + mul q0 + add (up), %r10 + adc %rax, %r9 + adc %rdx, %r14 + add 8(up), %r9 + adc $0, %r14 + mov %r9, q0 + imul u0inv, q0 + mov -16(mp), %rax + mul q0 + xor R32(%rbx), R32(%rbx) + mov %rax, %r10 + mov -8(mp), %rax + mov %rdx, %r11 + mul q0 + add %r9, %r10 + adc %rax, %r11 + adc %rdx, %rbx + add 16(up), %r11 + adc $0, %rbx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 24(up), %rbx + mov %r14, (rp) + mov %rbx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + +L(3): mov (mp_param), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 + mov -16(mp), %rax + mul q0 + xor R32(%r9), R32(%r9) + xor R32(%r14), R32(%r14) + add -8(up), %rbx + adc %rax, %r10 + mov -8(mp), %rax + adc %rdx, %r9 + mul q0 + add (up), %r10 + mov %r10, (up) + adc %rax, %r9 + adc %rdx, %r14 + mov %r10, q0 + imul u0inv, q0 + add %r9, 8(up) + adc $0, %r14 + mov %r14, -8(up) + + mov -24(mp), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 + mov -16(mp), %rax + mul q0 + xor R32(%r9), R32(%r9) + xor R32(%r14), R32(%r14) + add (up), %rbx + adc %rax, %r10 + 
mov -8(mp), %rax + adc %rdx, %r9 + mul q0 + add 8(up), %r10 + mov %r10, 8(up) + adc %rax, %r9 + adc %rdx, %r14 + mov %r10, q0 + imul u0inv, q0 + add %r9, 16(up) + adc $0, %r14 + mov %r14, (up) + + mov -24(mp), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 + mov -16(mp), %rax + mul q0 + xor R32(%r9), R32(%r9) + xor R32(%r14), R32(%r14) + add 8(up), %rbx + adc %rax, %r10 + mov -8(mp), %rax + adc %rdx, %r9 + mul q0 + add 16(up), %r10 + adc %rax, %r9 + adc %rdx, %r14 + add 24(up), %r9 + adc $0, %r14 + + xor R32(%rax), R32(%rax) + add -8(up), %r10 + adc (up), %r9 + adc 32(up), %r14 + mov %r10, (rp) + mov %r9, 8(rp) + mov %r14, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + + ALIGN(16) +L(2m4): +L(lo2): mov (mp,nneg,8), %rax + mul q0 + xor R32(%r14), R32(%r14) + xor R32(%rbx), R32(%rbx) + mov %rax, %r10 + mov 8(mp,nneg,8), %rax + mov 24(up,nneg,8), %r15 + mov %rdx, %r9 + mul q0 + add 16(up,nneg,8), %r10 + adc %rax, %r9 + mov 16(mp,nneg,8), %rax + adc %rdx, %r14 + mul q0 + mov $0, R32(%r10) C xor? + lea 2(nneg), i + add %r9, %r15 + imul u0inv, %r15 + jmp L(e2) + + ALIGN(16) +L(li2): add %r10, (up,i,8) + adc %rax, %r9 + mov (mp,i,8), %rax + adc %rdx, %r14 + xor R32(%r10), R32(%r10) + mul q0 +L(e2): add %r9, 8(up,i,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(mp,i,8), %rax + mul q0 + add %r14, 16(up,i,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(mp,i,8), %rax + mul q0 + add %rbx, 24(up,i,8) + mov $0, R32(%r14) C zero + mov %r14, %rbx C zero + adc %rax, %r10 + mov 24(mp,i,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + mul q0 + add $4, i + js L(li2) + +L(le2): add %r10, (up) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(up) + adc $0, %rdx + mov %rdx, 16(up,nneg,8) C up[0] + add $8, up + mov %r15, q0 + dec n + jnz L(lo2) + + mov nneg, n + sar $2, n + lea 32(up,nneg,8), up + lea (up,nneg,8), vp + + mov -16(up), %r8 + mov -8(up), %r9 + add -16(vp), %r8 + adc -8(vp), %r9 + mov %r8, (rp) + mov %r9, 8(rp) + lea 16(rp), rp + jmp L(addx) + + + ALIGN(16) +L(1m4): +L(lo1): mov (mp,nneg,8), %rax + xor %r9, %r9 + xor R32(%rbx), R32(%rbx) + mul q0 + mov %rax, %r9 + mov 8(mp,nneg,8), %rax + mov 24(up,nneg,8), %r15 + mov %rdx, %r14 + mov $0, R32(%r10) C xor? 
+ mul q0 + add 16(up,nneg,8), %r9 + adc %rax, %r14 + adc %rdx, %rbx + mov 16(mp,nneg,8), %rax + mul q0 + lea 1(nneg), i + add %r14, %r15 + imul u0inv, %r15 + jmp L(e1) + + ALIGN(16) +L(li1): add %r10, (up,i,8) + adc %rax, %r9 + mov (mp,i,8), %rax + adc %rdx, %r14 + xor R32(%r10), R32(%r10) + mul q0 + add %r9, 8(up,i,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(mp,i,8), %rax + mul q0 +L(e1): add %r14, 16(up,i,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(mp,i,8), %rax + mul q0 + add %rbx, 24(up,i,8) + mov $0, R32(%r14) C zero + mov %r14, %rbx C zero + adc %rax, %r10 + mov 24(mp,i,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + mul q0 + add $4, i + js L(li1) + +L(le1): add %r10, (up) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(up) + adc $0, %rdx + mov %rdx, 16(up,nneg,8) C up[0] + add $8, up + mov %r15, q0 + dec n + jnz L(lo1) + + mov nneg, n + sar $2, n + lea 24(up,nneg,8), up + lea (up,nneg,8), vp + + mov -8(up), %r8 + add -8(vp), %r8 + mov %r8, (rp) + lea 8(rp), rp + jmp L(addx) + + + ALIGN(16) +L(0): +L(0m4): +L(lo0): mov (mp,nneg,8), %rax + mov nneg, i + mul q0 + xor R32(%r10), R32(%r10) + mov %rax, %r14 + mov %rdx, %rbx + mov 8(mp,nneg,8), %rax + mov 24(up,nneg,8), %r15 + mul q0 + add 16(up,nneg,8), %r14 + adc %rax, %rbx + adc %rdx, %r10 + add %rbx, %r15 + imul u0inv, %r15 + jmp L(e0) + + ALIGN(16) +L(li0): add %r10, (up,i,8) + adc %rax, %r9 + mov (mp,i,8), %rax + adc %rdx, %r14 + xor R32(%r10), R32(%r10) + mul q0 + add %r9, 8(up,i,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(mp,i,8), %rax + mul q0 + add %r14, 16(up,i,8) + adc %rax, %rbx + adc %rdx, %r10 +L(e0): mov 16(mp,i,8), %rax + mul q0 + add %rbx, 24(up,i,8) + mov $0, R32(%r14) C zero + mov %r14, %rbx C zero + adc %rax, %r10 + mov 24(mp,i,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + mul q0 + add $4, i + js L(li0) + +L(le0): add %r10, (up) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(up) + adc $0, %rdx + mov %rdx, 16(up,nneg,8) C up[0] + add $8, up + mov %r15, q0 + dec n + jnz L(lo0) + + mov nneg, n + sar $2, n + clc + lea 16(up,nneg,8), up + lea (up,nneg,8), vp + jmp L(addy) + + + ALIGN(16) +L(3m4): +L(lo3): mov (mp,nneg,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 + mov 8(mp,nneg,8), %rax + mov 24(up,nneg,8), %r15 + mul q0 + add 16(up,nneg,8), %rbx C result is zero, might carry + mov $0, R32(%rbx) C zero + mov %rbx, %r14 C zero + adc %rax, %r10 + mov 16(mp,nneg,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + add %r10, %r15 + mul q0 + lea 3(nneg), i + imul u0inv, %r15 +C jmp L(li3) + + ALIGN(16) +L(li3): add %r10, (up,i,8) + adc %rax, %r9 + mov (mp,i,8), %rax + adc %rdx, %r14 + xor R32(%r10), R32(%r10) + mul q0 + add %r9, 8(up,i,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(mp,i,8), %rax + mul q0 + add %r14, 16(up,i,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(mp,i,8), %rax + mul q0 + add %rbx, 24(up,i,8) + mov $0, R32(%r14) C zero + mov %r14, %rbx C zero + adc %rax, %r10 + mov 24(mp,i,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + mul q0 + add $4, i + js L(li3) + +L(le3): add %r10, (up) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(up) + adc $0, %rdx + mov %rdx, 16(up,nneg,8) C up[0] + mov %r15, q0 + lea 8(up), up + dec n + jnz L(lo3) + + +C ==== Addition code ==== + mov nneg, n + sar $2, n + lea 40(up,nneg,8), up + lea (up,nneg,8), vp + + mov -24(up), %r8 + mov -16(up), %r9 + mov -8(up), %r10 + add -24(vp), %r8 + adc -16(vp), %r9 + adc -8(vp), %r10 + mov %r8, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + lea 24(rp), rp + +L(addx):inc n + jz L(ad3) + +L(addy):mov (up), %r8 + mov 8(up), %r9 + inc n + jmp L(mid) 
+ +C ALIGN(16) +L(al3): adc (vp), %r8 + adc 8(vp), %r9 + adc 16(vp), %r10 + adc 24(vp), %r11 + mov %r8, (rp) + lea 32(up), up + mov %r9, 8(rp) + mov %r10, 16(rp) + inc n + mov %r11, 24(rp) + lea 32(vp), vp + mov (up), %r8 + mov 8(up), %r9 + lea 32(rp), rp +L(mid): mov 16(up), %r10 + mov 24(up), %r11 + jnz L(al3) + +L(ae3): adc (vp), %r8 + adc 8(vp), %r9 + adc 16(vp), %r10 + adc 24(vp), %r11 + mov %r8, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %r11, 24(rp) + +L(ad3): mov R32(n), R32(%rax) C zero + adc R32(%rax), R32(%rax) + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm new file mode 100644 index 0000000..60cf945 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm @@ -0,0 +1,807 @@ +dnl AMD64 mpn_sqr_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C NOTES +C * There is a major stupidity in that we call mpn_mul_1 initially, for a +C large trip count. Instead, we should follow the generic/sqr_basecase.c +C code which uses addmul_2s from the start, conditionally leaving a 1x1 +C multiply to the end. (In assembly code, one would stop invoking +C addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.) +C * Another stupidity is in the sqr_diag_addlsh1 code. It does not need to +C save/restore carry, instead it can propagate into the high product word. +C * Align more labels, should shave off a few cycles. +C * We can safely use 32-bit size operations, since operands with (2^32) +C limbs will lead to non-termination in practice. +C * The jump table could probably be optimized, at least for non-pic. +C * The special code for n <= 4 was quickly written. It is probably too +C large and unnecessarily slow. +C * Consider combining small cases code so that the n=k-1 code jumps into the +C middle of the n=k code. +C * Avoid saving registers for small cases code. 
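+
+C The notes above refer to the classical three-pass structure of a squaring
+C basecase: form the off-diagonal products u[i]*u[j] (i < j), double that
+C triangle, then add the diagonal squares u[i]^2.  As a rough illustration
+C only -- this is not GMP's generic/sqr_basecase.c, the helper names are
+C made up, and it assumes a compiler with unsigned __int128 -- the idea in
+C portable C is:
+C
+C   #include <stdint.h>
+C   #include <stddef.h>
+C   typedef uint64_t limb;
+C
+C   /* rp[0..n-1] += {up,n} * v, return the carry limb */
+C   static limb addmul_1 (limb *rp, const limb *up, size_t n, limb v)
+C   {
+C     limb cy = 0;
+C     for (size_t i = 0; i < n; i++) {
+C       unsigned __int128 t = (unsigned __int128) up[i] * v + rp[i] + cy;
+C       rp[i] = (limb) t;  cy = (limb) (t >> 64);
+C     }
+C     return cy;
+C   }
+C
+C   /* rp must have room for 2n limbs */
+C   static void sqr_sketch (limb *rp, const limb *up, size_t n)
+C   {
+C     size_t i;
+C     for (i = 0; i < 2*n; i++) rp[i] = 0;
+C     for (i = 0; i + 1 < n; i++)              /* off-diagonal triangle */
+C       rp[i+n] = addmul_1 (rp + 2*i + 1, up + i + 1, n - 1 - i, up[i]);
+C     limb cy = 0;                             /* double it */
+C     for (i = 0; i < 2*n; i++) {
+C       limb hi = rp[i] >> 63;
+C       rp[i] = (rp[i] << 1) | cy;  cy = hi;
+C     }
+C     cy = 0;                                  /* add the diagonal squares */
+C     for (i = 0; i < n; i++) {
+C       unsigned __int128 sq = (unsigned __int128) up[i] * up[i];
+C       unsigned __int128 t = (unsigned __int128) rp[2*i] + (limb) sq + cy;
+C       rp[2*i] = (limb) t;
+C       t = (unsigned __int128) rp[2*i+1] + (limb) (sq >> 64) + (limb) (t >> 64);
+C       rp[2*i+1] = (limb) t;  cy = (limb) (t >> 64);
+C     }
+C   }
+C
+C The assembly below fuses these passes: mul_1/addmul_2-style loops build
+C the off-diagonal triangle, and the sqr_diag_addlsh1 section does the
+C doubling and the diagonal additions in a single sweep.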
+C * Needed variables: +C n r11 input size +C i r8 work left, initially n +C j r9 inner loop count +C r15 unused +C v0 r13 +C v1 r14 +C rp rdi +C up rsi +C w0 rbx +C w1 rcx +C w2 rbp +C w3 r10 +C tp r12 +C lo rax +C hi rdx +C rsp + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param', `%rdx') + +define(`n', `%r11') +define(`tp', `%r12') +define(`i', `%r8') +define(`j', `%r9') +define(`v0', `%r13') +define(`v1', `%r14') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + mov R32(n_param), R32(%rcx) + mov R32(n_param), R32(n) C free original n register (rdx) + + add $-40, %rsp + + and $3, R32(%rcx) + cmp $4, R32(n_param) + lea 4(%rcx), %r8 + + mov %rbx, 32(%rsp) + mov %rbp, 24(%rsp) + mov %r12, 16(%rsp) + mov %r13, 8(%rsp) + mov %r14, (%rsp) + + cmovg %r8, %rcx + + lea L(tab)(%rip), %rax +ifdef(`PIC', +` movslq (%rax,%rcx,4), %r10 + add %r10, %rax + jmp *%rax +',` + jmp *(%rax,%rcx,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(4), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(0m4), L(tab)) + JMPENT( L(1m4), L(tab)) + JMPENT( L(2m4), L(tab)) + JMPENT( L(3m4), L(tab)) + TEXT + +L(1): mov (up), %rax + mul %rax + add $40, %rsp + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(2): mov (up), %rax + mov %rax, %r8 + mul %rax + mov 8(up), %r11 + mov %rax, (rp) + mov %r11, %rax + mov %rdx, %r9 + mul %rax + add $40, %rsp + mov %rax, %r10 + mov %r11, %rax + mov %rdx, %r11 + mul %r8 + xor %r8, %r8 + add %rax, %r9 + adc %rdx, %r10 + adc %r8, %r11 + add %rax, %r9 + mov %r9, 8(rp) + adc %rdx, %r10 + mov %r10, 16(rp) + adc %r8, %r11 + mov %r11, 24(rp) + FUNC_EXIT() + ret + +L(3): mov (up), %rax + mov %rax, %r10 + mul %rax + mov 8(up), %r11 + mov %rax, (rp) + mov %r11, %rax + mov %rdx, 8(rp) + mul %rax + mov 16(up), %rcx + mov %rax, 16(rp) + mov %rcx, %rax + mov %rdx, 24(rp) + mul %rax + mov %rax, 32(rp) + mov %rdx, 40(rp) + + mov %r11, %rax + mul %r10 + mov %rax, %r8 + mov %rcx, %rax + mov %rdx, %r9 + mul %r10 + xor %r10, %r10 + add %rax, %r9 + mov %r11, %rax + mov %r10, %r11 + adc %rdx, %r10 + + mul %rcx + add $40, %rsp + add %rax, %r10 + adc %r11, %rdx + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %rdx, %rdx + adc %r11, %r11 + add %r8, 8(rp) + adc %r9, 16(rp) + adc %r10, 24(rp) + adc %rdx, 32(rp) + adc %r11, 40(rp) + FUNC_EXIT() + ret + +L(4): mov (up), %rax + mov %rax, %r11 + mul %rax + mov 8(up), %rbx + mov %rax, (rp) + mov %rbx, %rax + mov %rdx, 8(rp) + mul %rax + mov %rax, 16(rp) + mov %rdx, 24(rp) + mov 16(up), %rax + mul %rax + mov %rax, 32(rp) + mov %rdx, 40(rp) + mov 24(up), %rax + mul %rax + mov %rax, 48(rp) + mov %rbx, %rax + mov %rdx, 56(rp) + + mul %r11 + add $32, %rsp + mov %rax, %r8 + mov %rdx, %r9 + mov 16(up), %rax + mul %r11 + xor %r10, %r10 + add %rax, %r9 + adc %rdx, %r10 + mov 24(up), %rax + mul %r11 + xor %r11, %r11 + add %rax, %r10 + adc %rdx, %r11 + mov 16(up), %rax + mul %rbx + xor %rcx, %rcx + add %rax, %r10 + adc %rdx, %r11 + adc $0, %rcx + mov 24(up), %rax + mul %rbx + pop %rbx + add %rax, %r11 + adc %rdx, %rcx + mov 16(up), %rdx + mov 24(up), %rax + mul %rdx + add %rax, %rcx + adc $0, %rdx + + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %r11, %r11 + adc %rcx, %rcx + mov $0, R32(%rax) + adc %rdx, %rdx + + adc %rax, %rax + add %r8, 8(rp) + adc %r9, 16(rp) + adc %r10, 24(rp) + adc %r11, 32(rp) + adc %rcx, 40(rp) + adc %rdx, 
48(rp) + adc %rax, 56(rp) + FUNC_EXIT() + ret + + +L(0m4): + lea -16(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 + mov 8(up), %rax + lea (up,n,8), up C point up at end of input operand + + lea -4(n), i +C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1]) + xor R32(j), R32(j) + sub n, j + + mul v0 + xor R32(w2), R32(w2) + mov %rax, w0 + mov 16(up,j,8), %rax + mov %rdx, w3 + jmp L(L3) + + ALIGN(16) +L(mul_1_m3_top): + add %rax, w2 + mov w3, (tp,j,8) + mov (up,j,8), %rax + adc %rdx, w1 + xor R32(w0), R32(w0) + mul v0 + xor R32(w3), R32(w3) + mov w2, 8(tp,j,8) + add %rax, w1 + adc %rdx, w0 + mov 8(up,j,8), %rax + mov w1, 16(tp,j,8) + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov 16(up,j,8), %rax + adc %rdx, w3 +L(L3): xor R32(w1), R32(w1) + mul v0 + add %rax, w3 + mov 24(up,j,8), %rax + adc %rdx, w2 + mov w0, 24(tp,j,8) + mul v0 + add $4, j + js L(mul_1_m3_top) + + add %rax, w2 + mov w3, (tp) + adc %rdx, w1 + mov w2, 8(tp) + mov w1, 16(tp) + + lea eval(2*8)(tp), tp C tp += 2 + lea -8(up), up + jmp L(dowhile) + + +L(1m4): + lea 8(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 C u0 + mov 8(up), %rax C u1 + lea 8(up,n,8), up C point up at end of input operand + + lea -3(n), i +C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1) + lea -3(n), j + neg j + + mov %rax, v1 C u1 + mul v0 C u0 * u1 + mov %rdx, w1 + xor R32(w2), R32(w2) + mov %rax, 8(rp) + jmp L(m0) + + ALIGN(16) +L(mul_2_m0_top): + mul v1 + add %rax, w0 + adc %rdx, w1 + mov -24(up,j,8), %rax + mov $0, R32(w2) + mul v0 + add %rax, w0 + mov -24(up,j,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 C v1 * u0 + add %rax, w1 + mov w0, -24(tp,j,8) + adc %rdx, w2 +L(m0): mov -16(up,j,8), %rax C u2, u6 ... + mul v0 C u0 * u2 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov -16(up,j,8), %rax + adc $0, R32(w3) + mov $0, R32(w0) + mov w1, -16(tp,j,8) + mul v1 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + add %rax, w3 + mov w2, -8(tp,j,8) + adc %rdx, w0 +L(m2x): mov (up,j,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) + add $4, j + mov -32(up,j,8), %rax + mov w3, -32(tp,j,8) + js L(mul_2_m0_top) + + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, -8(tp) + mov w1, (tp) + + lea -16(up), up + lea eval(3*8-24)(tp), tp C tp += 3 + jmp L(dowhile_end) + + +L(2m4): + lea -16(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 + mov 8(up), %rax + lea (up,n,8), up C point up at end of input operand + + lea -4(n), i +C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i]) + lea -2(n), j + neg j + + mul v0 + mov %rax, w2 + mov (up,j,8), %rax + mov %rdx, w1 + jmp L(L1) + + ALIGN(16) +L(mul_1_m1_top): + add %rax, w2 + mov w3, (tp,j,8) + mov (up,j,8), %rax + adc %rdx, w1 +L(L1): xor R32(w0), R32(w0) + mul v0 + xor R32(w3), R32(w3) + mov w2, 8(tp,j,8) + add %rax, w1 + adc %rdx, w0 + mov 8(up,j,8), %rax + mov w1, 16(tp,j,8) + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov 16(up,j,8), %rax + adc %rdx, w3 + xor R32(w1), R32(w1) + mul v0 + add %rax, w3 + mov 24(up,j,8), %rax + adc %rdx, w2 + mov w0, 24(tp,j,8) + mul v0 + add $4, j + js L(mul_1_m1_top) + + add %rax, w2 + mov w3, (tp) + adc %rdx, w1 + mov w2, 8(tp) + mov w1, 16(tp) + + lea eval(2*8)(tp), tp C tp += 2 + lea -8(up), up + jmp L(dowhile_mid) + + +L(3m4): + lea 8(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 C u0 + mov 8(up), %rax C u1 + lea 8(up,n,8), up C point up at end of input 
operand + + lea -5(n), i +C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i) + lea -1(n), j + neg j + + mov %rax, v1 C u1 + mul v0 C u0 * u1 + mov %rdx, w3 + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + mov %rax, 8(rp) + jmp L(m2) + + ALIGN(16) +L(mul_2_m2_top): + mul v1 + add %rax, w0 + adc %rdx, w1 + mov -24(up,j,8), %rax + mov $0, R32(w2) + mul v0 + add %rax, w0 + mov -24(up,j,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 C v1 * u0 + add %rax, w1 + mov w0, -24(tp,j,8) + adc %rdx, w2 + mov -16(up,j,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov -16(up,j,8), %rax + adc $0, R32(w3) + mov $0, R32(w0) + mov w1, -16(tp,j,8) + mul v1 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + add %rax, w3 + mov w2, -8(tp,j,8) + adc %rdx, w0 +L(m2): mov (up,j,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) + add $4, j + mov -32(up,j,8), %rax + mov w3, -32(tp,j,8) + js L(mul_2_m2_top) + + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, -8(tp) + mov w1, (tp) + + lea -16(up), up + jmp L(dowhile_mid) + +L(dowhile): +C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i) + lea 4(i), j + neg j + + mov 16(up,j,8), v0 + mov 24(up,j,8), v1 + mov 24(up,j,8), %rax + mul v0 + xor R32(w3), R32(w3) + add %rax, 24(tp,j,8) + adc %rdx, w3 + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + jmp L(am2) + + ALIGN(16) +L(addmul_2_m2_top): + add w3, (tp,j,8) + adc %rax, w0 + mov 8(up,j,8), %rax + adc %rdx, w1 + mov $0, R32(w2) + mul v0 + add %rax, w0 + mov 8(up,j,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 C v1 * u0 + add w0, 8(tp,j,8) + adc %rax, w1 + adc %rdx, w2 + mov 16(up,j,8), %rax + mov $0, R32(w3) + mul v0 C v0 * u1 + add %rax, w1 + mov 16(up,j,8), %rax + adc %rdx, w2 + adc $0, R32(w3) + mul v1 C v1 * u1 + add w1, 16(tp,j,8) + adc %rax, w2 + mov 24(up,j,8), %rax + adc %rdx, w3 + mul v0 + mov $0, R32(w0) + add %rax, w2 + adc %rdx, w3 + mov $0, R32(w1) + mov 24(up,j,8), %rax + adc $0, R32(w0) + mul v1 + add w2, 24(tp,j,8) + adc %rax, w3 + adc %rdx, w0 +L(am2): mov 32(up,j,8), %rax + mul v0 + add %rax, w3 + mov 32(up,j,8), %rax + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add $4, j + js L(addmul_2_m2_top) + + add w3, (tp) + adc %rax, w0 + adc %rdx, w1 + mov w0, 8(tp) + mov w1, 16(tp) + + lea eval(2*8)(tp), tp C tp += 2 + + add $-2, R32(i) C i -= 2 + +L(dowhile_mid): +C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i) + lea 2(i), j + neg j + + mov (up,j,8), v0 + mov 8(up,j,8), v1 + mov 8(up,j,8), %rax + mul v0 + xor R32(w1), R32(w1) + add %rax, 8(tp,j,8) + adc %rdx, w1 + xor R32(w2), R32(w2) + jmp L(20) + + ALIGN(16) +L(addmul_2_m0_top): + add w3, (tp,j,8) + adc %rax, w0 + mov 8(up,j,8), %rax + adc %rdx, w1 + mov $0, R32(w2) + mul v0 + add %rax, w0 + mov 8(up,j,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 C v1 * u0 + add w0, 8(tp,j,8) + adc %rax, w1 + adc %rdx, w2 +L(20): mov 16(up,j,8), %rax + mov $0, R32(w3) + mul v0 C v0 * u1 + add %rax, w1 + mov 16(up,j,8), %rax + adc %rdx, w2 + adc $0, R32(w3) + mul v1 C v1 * u1 + add w1, 16(tp,j,8) + adc %rax, w2 + mov 24(up,j,8), %rax + adc %rdx, w3 + mul v0 + mov $0, R32(w0) + add %rax, w2 + adc %rdx, w3 + mov $0, R32(w1) + mov 24(up,j,8), %rax + adc $0, R32(w0) + mul v1 + add w2, 24(tp,j,8) + adc %rax, w3 + adc %rdx, w0 + mov 32(up,j,8), %rax + mul v0 + add %rax, w3 + mov 32(up,j,8), %rax + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add $4, j + js L(addmul_2_m0_top) + + add w3, (tp) + adc %rax, 
w0 + adc %rdx, w1 + mov w0, 8(tp) + mov w1, 16(tp) + + lea eval(2*8)(tp), tp C tp += 2 +L(dowhile_end): + + add $-2, R32(i) C i -= 2 + jne L(dowhile) + +C Function mpn_addmul_2s_2 + mov -16(up), v0 + mov -8(up), v1 + mov -8(up), %rax + mul v0 + xor R32(w3), R32(w3) + add %rax, -8(tp) + adc %rdx, w3 + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + mov (up), %rax + mul v0 + add %rax, w3 + mov (up), %rax + adc %rdx, w0 + mul v1 + add w3, (tp) + adc %rax, w0 + adc %rdx, w1 + mov w0, 8(tp) + mov w1, 16(tp) + +C Function mpn_sqr_diag_addlsh1 + lea -4(n,n), j + + mov 8(rp), %r11 + lea -8(up), up + lea (rp,j,8), rp + neg j + mov (up,j,4), %rax + mul %rax + test $2, R8(j) + jnz L(odd) + +L(evn): add %r11, %r11 + sbb R32(%rbx), R32(%rbx) C save CF + add %rdx, %r11 + mov %rax, (rp,j,8) + jmp L(d0) + +L(odd): add %r11, %r11 + sbb R32(%rbp), R32(%rbp) C save CF + add %rdx, %r11 + mov %rax, (rp,j,8) + lea -2(j), j + jmp L(d1) + + ALIGN(16) +L(top): mov (up,j,4), %rax + mul %rax + add R32(%rbp), R32(%rbp) C restore carry + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, (rp,j,8) +L(d0): mov %r11, 8(rp,j,8) + mov 16(rp,j,8), %r10 + adc %r10, %r10 + mov 24(rp,j,8), %r11 + adc %r11, %r11 + nop + sbb R32(%rbp), R32(%rbp) C save CF + mov 8(up,j,4), %rax + mul %rax + add R32(%rbx), R32(%rbx) C restore carry + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, 16(rp,j,8) +L(d1): mov %r11, 24(rp,j,8) + mov 32(rp,j,8), %r10 + adc %r10, %r10 + mov 40(rp,j,8), %r11 + adc %r11, %r11 + sbb R32(%rbx), R32(%rbx) C save CF + add $4, j + js L(top) + + mov (up), %rax + mul %rax + add R32(%rbp), R32(%rbp) C restore carry + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, (rp) + mov %r11, 8(rp) + mov 16(rp), %r10 + adc %r10, %r10 + sbb R32(%rbp), R32(%rbp) C save CF + neg R32(%rbp) + mov 8(up), %rax + mul %rax + add R32(%rbx), R32(%rbx) C restore carry + adc %rax, %r10 + adc %rbp, %rdx + mov %r10, 16(rp) + mov %rdx, 24(rp) + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/logops_n.asm b/gmp-6.3.0/mpn/x86_64/logops_n.asm new file mode 100644 index 0000000..e25854d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/logops_n.asm @@ -0,0 +1,260 @@ +dnl AMD64 logops. + +dnl Copyright 2004-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l c/l c/l good +C var-1 var-2 var-3 for cpu? 
+C AMD K8,K9 1.5 1.5 1.5 y +C AMD K10 1.5 1.5 1.5 y +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD bt1 2.67 ~2.79 ~2.67 +C AMD bt2 2.0 2.28 2.28 y +C AMD zen 1.5 1.5 1.5 = +C Intel P4 2.8 3.35 3.6 +C Intel PNR 2.0 2.0 2.0 = +C Intel NHM 2.0 2.0 2.0 = +C Intel SBR 1.5 1.75 1.75 n +C Intel IBR 1.48 1.71 1.72 n +C Intel HWL 1.5 1.5 1.5 n +C Intel BWL 1.5 1.5 1.5 n +C Intel SKL 1.5 1.5 1.5 n +C Intel atom 3.82 3.82 3.82 n +C Intel SLM 3.0 3.0 3.0 = +C VIA nano 3.25 + +ifdef(`OPERATION_and_n',` + define(`func',`mpn_and_n') + define(`VARIANT_1') + define(`LOGOP',`and')') +ifdef(`OPERATION_andn_n',` + define(`func',`mpn_andn_n') + define(`VARIANT_2') + define(`LOGOP',`and')') +ifdef(`OPERATION_nand_n',` + define(`func',`mpn_nand_n') + define(`VARIANT_3') + define(`LOGOP',`and')') +ifdef(`OPERATION_ior_n',` + define(`func',`mpn_ior_n') + define(`VARIANT_1') + define(`LOGOP',`or')') +ifdef(`OPERATION_iorn_n',` + define(`func',`mpn_iorn_n') + define(`VARIANT_2') + define(`LOGOP',`or')') +ifdef(`OPERATION_nior_n',` + define(`func',`mpn_nior_n') + define(`VARIANT_3') + define(`LOGOP',`or')') +ifdef(`OPERATION_xor_n',` + define(`func',`mpn_xor_n') + define(`VARIANT_1') + define(`LOGOP',`xor')') +ifdef(`OPERATION_xnor_n',` + define(`func',`mpn_xnor_n') + define(`VARIANT_2') + define(`LOGOP',`xor')') + + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n',`%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + +ifdef(`VARIANT_1',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + mov R32(%rcx), R32(%rax) + lea (vp,n,8), vp + lea (up,n,8), up + lea (rp,n,8), rp + neg n + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up,n,8), %r8 + mov %r8, (rp,n,8) + dec n + jmp L(e11) +L(b10): add $-2, n + jmp L(e10) +L(b01): LOGOP (up,n,8), %r8 + mov %r8, (rp,n,8) + inc n + jz L(ret) + +L(top): mov (vp,n,8), %r8 +L(b00): mov 8(vp,n,8), %r9 + LOGOP (up,n,8), %r8 + LOGOP 8(up,n,8), %r9 + nop C K8/K9/K10 concession + mov %r8, (rp,n,8) + mov %r9, 8(rp,n,8) +L(e11): mov 16(vp,n,8), %r8 +L(e10): mov 24(vp,n,8), %r9 + LOGOP 16(up,n,8), %r8 + LOGOP 24(up,n,8), %r9 + mov %r8, 16(rp,n,8) + mov %r9, 24(rp,n,8) + add $4, n + jnc L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') + +ifdef(`VARIANT_2',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + not %r8 + mov R32(%rcx), R32(%rax) + lea (vp,n,8), vp + lea (up,n,8), up + lea (rp,n,8), rp + neg n + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up,n,8), %r8 + mov %r8, (rp,n,8) + dec n + jmp L(e11) +L(b10): add $-2, n + jmp L(e10) + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +L(b01): LOGOP (up,n,8), %r8 + mov %r8, (rp,n,8) + inc n + jz L(ret) + +L(top): mov (vp,n,8), %r8 + not %r8 +L(b00): mov 8(vp,n,8), %r9 + not %r9 + LOGOP (up,n,8), %r8 + LOGOP 8(up,n,8), %r9 + mov %r8, (rp,n,8) + mov %r9, 8(rp,n,8) +L(e11): mov 16(vp,n,8), %r8 + not %r8 +L(e10): mov 24(vp,n,8), %r9 + not %r9 + LOGOP 16(up,n,8), %r8 + LOGOP 24(up,n,8), %r9 + mov %r8, 16(rp,n,8) + mov %r9, 24(rp,n,8) + add $4, n + jnc L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') + +ifdef(`VARIANT_3',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + mov R32(%rcx), R32(%rax) + lea (vp,n,8), vp + lea (up,n,8), up + lea (rp,n,8), rp + neg n + and $3, R32(%rax) + je L(b00) + cmp $2, 
R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up,n,8), %r8 + not %r8 + mov %r8, (rp,n,8) + dec n + jmp L(e11) +L(b10): add $-2, n + jmp L(e10) + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +L(b01): LOGOP (up,n,8), %r8 + not %r8 + mov %r8, (rp,n,8) + inc n + jz L(ret) + +L(top): mov (vp,n,8), %r8 +L(b00): mov 8(vp,n,8), %r9 + LOGOP (up,n,8), %r8 + not %r8 + LOGOP 8(up,n,8), %r9 + not %r9 + mov %r8, (rp,n,8) + mov %r9, 8(rp,n,8) +L(e11): mov 16(vp,n,8), %r8 +L(e10): mov 24(vp,n,8), %r9 + LOGOP 16(up,n,8), %r8 + not %r8 + LOGOP 24(up,n,8), %r9 + not %r9 + mov %r8, 16(rp,n,8) + mov %r9, 24(rp,n,8) + add $4, n + jnc L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') diff --git a/gmp-6.3.0/mpn/x86_64/lshift.asm b/gmp-6.3.0/mpn/x86_64/lshift.asm new file mode 100644 index 0000000..fff3152 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/lshift.asm @@ -0,0 +1,172 @@ +dnl AMD64 mpn_lshift -- mpn left shift. + +dnl Copyright 2003, 2005, 2007, 2009, 2011, 2012, 2018 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cnt=1 +C AMD K8,K9 2.375 1.375 +C AMD K10 2.375 1.375 +C Intel P4 8 10.5 +C Intel core2 2.11 4.28 +C Intel corei ? ? +C Intel atom 5.75 3.5 +C VIA nano 3.5 2.25 + + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + neg R32(%rcx) C put rsh count in cl + mov -8(up,n,8), %rax + shr R8(%rcx), %rax C function return value + + neg R32(%rcx) C put lsh count in cl + lea 1(n), R32(%r8) + and $3, R32(%r8) + je L(rlx) C jump for n = 3, 7, 11, ... + + dec R32(%r8) + jne L(1) +C n = 4, 8, 12, ... + mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + shr R8(%rcx), %r8 + or %r8, %r10 + mov %r10, -8(rp,n,8) + dec n + jmp L(rll) + +L(1): dec R32(%r8) + je L(1x) C jump for n = 1, 5, 9, 13, ... +C n = 2, 6, 10, 16, ... 
+ mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + shr R8(%rcx), %r8 + or %r8, %r10 + mov %r10, -8(rp,n,8) + dec n + neg R32(%rcx) C put lsh count in cl +L(1x): + cmp $1, n + je L(ast) + mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + mov -16(up,n,8), %r11 + shl R8(%rcx), %r11 + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + mov -24(up,n,8), %r9 + shr R8(%rcx), %r8 + or %r8, %r10 + shr R8(%rcx), %r9 + or %r9, %r11 + mov %r10, -8(rp,n,8) + mov %r11, -16(rp,n,8) + sub $2, n + +L(rll): neg R32(%rcx) C put lsh count in cl +L(rlx): mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + mov -16(up,n,8), %r11 + shl R8(%rcx), %r11 + + sub $4, n C 4 + jb L(end) C 2 + ALIGN(16) +L(top): + C finish stuff from lsh block + neg R32(%rcx) C put rsh count in cl + mov 16(up,n,8), %r8 + mov 8(up,n,8), %r9 + shr R8(%rcx), %r8 + or %r8, %r10 + shr R8(%rcx), %r9 + or %r9, %r11 + mov %r10, 24(rp,n,8) + mov %r11, 16(rp,n,8) + C start two new rsh + mov 0(up,n,8), %r8 + mov -8(up,n,8), %r9 + shr R8(%rcx), %r8 + shr R8(%rcx), %r9 + + C finish stuff from rsh block + neg R32(%rcx) C put lsh count in cl + mov 8(up,n,8), %r10 + mov 0(up,n,8), %r11 + shl R8(%rcx), %r10 + or %r10, %r8 + shl R8(%rcx), %r11 + or %r11, %r9 + mov %r8, 8(rp,n,8) + mov %r9, 0(rp,n,8) + C start two new lsh + mov -8(up,n,8), %r10 + mov -16(up,n,8), %r11 + shl R8(%rcx), %r10 + shl R8(%rcx), %r11 + + sub $4, n + jae L(top) C 2 +L(end): + neg R32(%rcx) C put rsh count in cl + mov 8(up), %r8 + shr R8(%rcx), %r8 + or %r8, %r10 + mov (up), %r9 + shr R8(%rcx), %r9 + or %r9, %r11 + mov %r10, 16(rp) + mov %r11, 8(rp) + + neg R32(%rcx) C put lsh count in cl +L(ast): mov (up), %r10 + shl R8(%rcx), %r10 + mov %r10, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/lshiftc.asm new file mode 100644 index 0000000..c4ba04a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/lshiftc.asm @@ -0,0 +1,182 @@ +dnl AMD64 mpn_lshiftc -- mpn left shift with complement. + +dnl Copyright 2003, 2005, 2006, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 2.75 +C AMD K10 2.75 +C Intel P4 ? +C Intel core2 ? +C Intel corei ? +C Intel atom ? 
+C VIA nano 3.75 + + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + neg R32(%rcx) C put rsh count in cl + mov -8(up,n,8), %rax + shr R8(%rcx), %rax C function return value + + neg R32(%rcx) C put lsh count in cl + lea 1(n), R32(%r8) + and $3, R32(%r8) + je L(rlx) C jump for n = 3, 7, 11, ... + + dec R32(%r8) + jne L(1) +C n = 4, 8, 12, ... + mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + shr R8(%rcx), %r8 + or %r8, %r10 + not %r10 + mov %r10, -8(rp,n,8) + dec n + jmp L(rll) + +L(1): dec R32(%r8) + je L(1x) C jump for n = 1, 5, 9, 13, ... +C n = 2, 6, 10, 16, ... + mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + shr R8(%rcx), %r8 + or %r8, %r10 + not %r10 + mov %r10, -8(rp,n,8) + dec n + neg R32(%rcx) C put lsh count in cl +L(1x): + cmp $1, n + je L(ast) + mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + mov -16(up,n,8), %r11 + shl R8(%rcx), %r11 + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + mov -24(up,n,8), %r9 + shr R8(%rcx), %r8 + or %r8, %r10 + shr R8(%rcx), %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, -8(rp,n,8) + mov %r11, -16(rp,n,8) + sub $2, n + +L(rll): neg R32(%rcx) C put lsh count in cl +L(rlx): mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + mov -16(up,n,8), %r11 + shl R8(%rcx), %r11 + + sub $4, n C 4 + jb L(end) C 2 + ALIGN(16) +L(top): + C finish stuff from lsh block + neg R32(%rcx) C put rsh count in cl + mov 16(up,n,8), %r8 + mov 8(up,n,8), %r9 + shr R8(%rcx), %r8 + or %r8, %r10 + shr R8(%rcx), %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, 24(rp,n,8) + mov %r11, 16(rp,n,8) + C start two new rsh + mov 0(up,n,8), %r8 + mov -8(up,n,8), %r9 + shr R8(%rcx), %r8 + shr R8(%rcx), %r9 + + C finish stuff from rsh block + neg R32(%rcx) C put lsh count in cl + mov 8(up,n,8), %r10 + mov 0(up,n,8), %r11 + shl R8(%rcx), %r10 + or %r10, %r8 + shl R8(%rcx), %r11 + or %r11, %r9 + not %r8 + not %r9 + mov %r8, 8(rp,n,8) + mov %r9, 0(rp,n,8) + C start two new lsh + mov -8(up,n,8), %r10 + mov -16(up,n,8), %r11 + shl R8(%rcx), %r10 + shl R8(%rcx), %r11 + + sub $4, n + jae L(top) C 2 +L(end): + neg R32(%rcx) C put rsh count in cl + mov 8(up), %r8 + shr R8(%rcx), %r8 + or %r8, %r10 + mov (up), %r9 + shr R8(%rcx), %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, 16(rp) + mov %r11, 8(rp) + + neg R32(%rcx) C put lsh count in cl +L(ast): mov (up), %r10 + shl R8(%rcx), %r10 + not %r10 + mov %r10, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/lshsub_n.asm b/gmp-6.3.0/mpn/x86_64/lshsub_n.asm new file mode 100644 index 0000000..4d428c0 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/lshsub_n.asm @@ -0,0 +1,172 @@ +dnl AMD64 mpn_lshsub_n. R = 2^k(U - V). + +dnl Copyright 2006, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l) +C AMD K10 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l) +C Intel P4 16.5 +C Intel core2 4.35 +C Intel corei ? +C Intel atom ? +C VIA nano ? + +C This was written quickly and not optimized at all, but it runs very well on +C K8. But perhaps one could get under 3 c/l. Ideas: +C 1) Use indexing to save the 3 LEA +C 2) Write reasonable feed-in code +C 3) Be more clever about register usage +C 4) Unroll more, handling CL negation, carry save/restore cost much now +C 5) Reschedule + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt', `%r8') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshsub_n) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + push %r12 + push %r13 + push %r14 + push %r15 + push %rbx + + mov n, %rax + xor R32(%rbx), R32(%rbx) C clear carry save register + mov R32(%r8), R32(%rcx) C shift count + xor R32(%r15), R32(%r15) C limb carry + + mov R32(%rax), R32(%r11) + and $3, R32(%r11) + je L(4) + sub $1, R32(%r11) + +L(oopette): + add R32(%rbx), R32(%rbx) C restore carry flag + mov 0(up), %r8 + lea 8(up), up + sbb 0(vp), %r8 + mov %r8, %r12 + sbb R32(%rbx), R32(%rbx) C save carry flag + shl R8(%rcx), %r8 + or %r15, %r8 + mov %r12, %r15 + lea 8(vp), vp + neg R8(%rcx) + shr R8(%rcx), %r15 + neg R8(%rcx) + mov %r8, 0(rp) + lea 8(rp), rp + sub $1, R32(%r11) + jnc L(oopette) + +L(4): + sub $4, %rax + jc L(end) + + ALIGN(16) +L(oop): + add R32(%rbx), R32(%rbx) C restore carry flag + + mov 0(up), %r8 + mov 8(up), %r9 + mov 16(up), %r10 + mov 24(up), %r11 + + lea 32(up), up + + sbb 0(vp), %r8 + mov %r8, %r12 + sbb 8(vp), %r9 + mov %r9, %r13 + sbb 16(vp), %r10 + mov %r10, %r14 + sbb 24(vp), %r11 + + sbb R32(%rbx), R32(%rbx) C save carry flag + + shl R8(%rcx), %r8 + shl R8(%rcx), %r9 + shl R8(%rcx), %r10 + or %r15, %r8 + mov %r11, %r15 + shl R8(%rcx), %r11 + + lea 32(vp), vp + + neg R8(%rcx) + + shr R8(%rcx), %r12 + shr R8(%rcx), %r13 + shr R8(%rcx), %r14 + shr R8(%rcx), %r15 C used next loop + + or %r12, %r9 + or %r13, %r10 + or %r14, %r11 + + neg R8(%rcx) + + mov %r8, 0(rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %r11, 24(rp) + + lea 32(rp), rp + + sub $4, %rax + jnc L(oop) +L(end): + neg R32(%rbx) + shl R8(%rcx), %rbx + adc %r15, %rbx + mov %rbx, %rax + pop %rbx + pop %r15 + pop %r14 + pop %r13 + pop %r12 + + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/missing-call.m4 b/gmp-6.3.0/mpn/x86_64/missing-call.m4 new file mode 100644 index 0000000..c024f0e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/missing-call.m4 @@ -0,0 +1,53 @@ +dnl AMD64 MULX/ADX simulation support, function call version. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +define(`adox',` + push $1 + push $2 + call __gmp_adox + pop $2 +') + +define(`adcx',` + push $1 + push $2 + call __gmp_adcx + pop $2 +') + +define(`mulx',` + push $1 + call __gmp_mulx + pop $2 + pop $3 +') diff --git a/gmp-6.3.0/mpn/x86_64/missing-inline.m4 b/gmp-6.3.0/mpn/x86_64/missing-inline.m4 new file mode 100644 index 0000000..bd1df13 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/missing-inline.m4 @@ -0,0 +1,100 @@ +dnl AMD64 MULX/ADX simulation support, inline version. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
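+
+dnl The macros in this file, like the call-based versions in missing-call.m4
+dnl above, expand mulx/adcx/adox into plain x86-64 code with the same effect,
+dnl so sources written for the BMI2/ADX extensions can still be assembled and
+dnl run where those instructions are unavailable.  Roughly, the instructions
+dnl being simulated compute the following (illustrative C only, not part of
+dnl GMP; assumes a compiler with unsigned __int128):
+dnl
+dnl   #include <stdint.h>
+dnl
+dnl   /* mulx hi,lo,src:  hi:lo = src * %rdx, without touching any flags */
+dnl   static uint64_t mulx64 (uint64_t a, uint64_t b, uint64_t *hi)
+dnl   {
+dnl     unsigned __int128 p = (unsigned __int128) a * b;
+dnl     *hi = (uint64_t) (p >> 64);
+dnl     return (uint64_t) p;
+dnl   }
+dnl
+dnl   /* adcx/adox: 64-bit add with carry in and out through CF resp. OF
+dnl      only, so two independent carry chains can be interleaved.  In C,
+dnl      each chain simply keeps its own carry variable: */
+dnl   static uint64_t addc64 (uint64_t a, uint64_t b, unsigned *carry)
+dnl   {
+dnl     unsigned __int128 s = (unsigned __int128) a + b + *carry;
+dnl     *carry = (unsigned) (s >> 64);
+dnl     return (uint64_t) s;
+dnl   }
+dnl
+dnl The simulations below reproduce that behaviour by editing the saved
+dnl flags image on the stack, so the "other" carry bit survives each step.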
+ + +define(`adox',` + push $2 + push %rcx + push %rbx + push %rax + mov $1, %rcx + pushfq + pushfq +C copy 0(%rsp):11 to 0(%rsp):0 + mov (%rsp), %rbx + shr %rbx + bt $`'10, %rbx + adc %rbx, %rbx + mov %rbx, (%rsp) +C put manipulated flags into eflags, execute a plain adc + popfq + adc %rcx, 32(%rsp) +C copy CF to 0(%rsp):11 + mov (%rsp), %rbx + sbb R32(%rax), R32(%rax) + and $`'0x800, R32(%rax) + and $`'0xfffffffffffff7ff, %rbx + or %rax, %rbx + mov %rbx, (%rsp) +C put manipulated flags into eflags + popfq + pop %rax + pop %rbx + pop %rcx + pop $2 +') + +define(`adcx',` + push $2 + push %rcx + push %rbx + push %rax + mov $1, %rcx + pushfq + adc %rcx, 32(%rsp) + mov (%rsp), %rbx + sbb R32(%rax), R32(%rax) + and $`'0xfffffffffffffffe, %rbx + sub %rax, %rbx + mov %rbx, (%rsp) + popfq + pop %rax + pop %rbx + pop %rcx + pop $2 +') + +define(`mulx',` + lea -16(%rsp), %rsp + push %rax + push %rdx + pushfq C preserve all flags + mov $1, %rax + mul %rdx + mov %rax, 24(%rsp) + mov %rdx, 32(%rsp) + popfq C restore eflags + pop %rdx + pop %rax + pop $2 + pop $3 +') diff --git a/gmp-6.3.0/mpn/x86_64/missing.asm b/gmp-6.3.0/mpn/x86_64/missing.asm new file mode 100644 index 0000000..9b65c89 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/missing.asm @@ -0,0 +1,130 @@ + + dnl AMD64 MULX/ADX simulation support. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ASM_START() + +C Fake the MULX instruction +C +C Accept the single explicit parameter on the stack, return the two result +C words on the stack. This calling convention means that we need to move the +C return address up. +C +PROLOGUE(__gmp_mulx) + lea -8(%rsp), %rsp + push %rax + push %rdx + pushfq C preserve all flags + mov 32(%rsp), %rax C move retaddr... + mov %rax, 24(%rsp) C ...up the stack + mov 40(%rsp), %rax C input parameter + mul %rdx + mov %rax, 32(%rsp) + mov %rdx, 40(%rsp) + popfq C restore eflags + pop %rdx + pop %rax + ret +EPILOGUE() +PROTECT(__gmp_mulx) + + +C Fake the ADOX instruction +C +C Accept the two parameters on the stack, return the result word on the stack. +C This calling convention means that we need to move the return address down. +C +PROLOGUE(__gmp_adox) + push %rcx + push %rbx + push %rax + mov 32(%rsp), %rcx C src2 + mov 24(%rsp), %rax C move retaddr... 
+ mov %rax, 32(%rsp) C ...down the stack + pushfq +C copy 0(%rsp):11 to 0(%rsp):0 + mov (%rsp), %rbx + shr %rbx + bt $10, %rbx + adc %rbx, %rbx + push %rbx +C put manipulated flags into eflags, execute a plain adc + popfq + adc %rcx, 48(%rsp) +C copy CF to 0(%rsp):11 + pop %rbx + sbb R32(%rax), R32(%rax) + and $0x800, R32(%rax) + and $0xfffffffffffff7ff, %rbx + or %rax, %rbx + push %rbx +C put manipulated flags into eflags + popfq + pop %rax + pop %rbx + pop %rcx + lea 8(%rsp), %rsp + ret +EPILOGUE() +PROTECT(__gmp_adox) + + +C Fake the ADCX instruction +C +C Accept the two parameters on the stack, return the result word on the stack. +C This calling convention means that we need to move the return address down. +C +PROLOGUE(__gmp_adcx) + push %rcx + push %rbx + push %rax + mov 32(%rsp), %rcx C src2 + mov 24(%rsp), %rax C move retaddr... + mov %rax, 32(%rsp) C ...down the stack + pushfq + adc %rcx, 48(%rsp) + pop %rbx + sbb R32(%rax), R32(%rax) + and $`'0xfffffffffffffffe, %rbx + sub %rax, %rbx + push %rbx + popfq + pop %rax + pop %rbx + pop %rcx + lea 8(%rsp), %rsp + ret +EPILOGUE() +PROTECT(__gmp_adcx) diff --git a/gmp-6.3.0/mpn/x86_64/mod_1_1.asm b/gmp-6.3.0/mpn/x86_64/mod_1_1.asm new file mode 100644 index 0000000..255305f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mod_1_1.asm @@ -0,0 +1,238 @@ +dnl AMD64 mpn_mod_1_1p + +dnl Contributed to the GNU project by Torbjörn Granlund and Niels Möller. + +dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 6 +C AMD K10 6 +C Intel P4 26 +C Intel core2 12.5 +C Intel NHM 11.3 +C Intel SBR 8.4 (slowdown, old code took 8.0) +C Intel atom 26 +C VIA nano 13 + +define(`B2mb', `%r10') +define(`B2modb', `%r11') +define(`ap', `%rdi') +define(`n', `%rsi') +define(`pre', `%r8') +define(`b', `%rbx') + +define(`r0', `%rbp') C r1 kept in %rax +define(`r2', `%rcx') C kept negated. Also used as shift count +define(`t0', `%r9') + +C mp_limb_t +C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4]) +C %rdi %rsi %rdx %rcx +C The pre array contains bi, cnt, B1modb, B2modb +C Note: This implementation needs B1modb only when cnt > 0 + +C The iteration is almost as follows, +C +C r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2mod) B + u +C +C where r2 is a single bit represented as a mask. But to make sure that the +C result fits in two limbs and a bit, carry from the addition +C +C r_0 + r_2 B2mod +C +C is handled specially. 
On carry, we subtract b to cancel the carry, +C and we use instead the value +C +C r_0 + B2mb (mod B) +C +C This addition can be issued early since it doesn't depend on r2, and it is +C the source of the cmov in the loop. +C +C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_1_1p) + FUNC_ENTRY(4) + push %rbp + push %rbx + mov %rdx, b + mov %rcx, pre + + mov -8(ap, n, 8), %rax + cmp $3, n + jnc L(first) + mov -16(ap, n, 8), r0 + jmp L(reduce_two) + +L(first): + C First iteration, no r2 + mov 24(pre), B2modb + mul B2modb + mov -24(ap, n, 8), r0 + add %rax, r0 + mov -16(ap, n, 8), %rax + adc %rdx, %rax + sbb r2, r2 + sub $4, n + jc L(reduce_three) + + mov B2modb, B2mb + sub b, B2mb + + ALIGN(16) +L(top): and B2modb, r2 + lea (B2mb, r0), t0 + mul B2modb + add r0, r2 + mov (ap, n, 8), r0 + cmovc t0, r2 + add %rax, r0 + mov r2, %rax + adc %rdx, %rax + sbb r2, r2 + sub $1, n + jnc L(top) + +L(reduce_three): + C Eliminate r2 + and b, r2 + sub r2, %rax + +L(reduce_two): + mov 8(pre), R32(%rcx) + test R32(%rcx), R32(%rcx) + jz L(normalized) + + C Unnormalized, use B1modb to reduce to size < B (b+1) + mulq 16(pre) + xor t0, t0 + add %rax, r0 + adc %rdx, t0 + mov t0, %rax + + C Left-shift to normalize +ifdef(`SHLD_SLOW',` + shl R8(%rcx), %rax + mov r0, t0 + neg R32(%rcx) + shr R8(%rcx), t0 + or t0, %rax + neg R32(%rcx) +',` + shld R8(%rcx), r0, %rax +') + shl R8(%rcx), r0 + jmp L(udiv) + +L(normalized): + mov %rax, t0 + sub b, t0 + cmovnc t0, %rax + +L(udiv): + lea 1(%rax), t0 + mulq (pre) + add r0, %rax + adc t0, %rdx + imul b, %rdx + sub %rdx, r0 + cmp r0, %rax + lea (b, r0), %rax + cmovnc r0, %rax + cmp b, %rax + jnc L(fix) +L(ok): shr R8(%rcx), %rax + + pop %rbx + pop %rbp + FUNC_EXIT() + ret +L(fix): sub b, %rax + jmp L(ok) +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mod_1_1p_cps) + FUNC_ENTRY(2) + push %rbp + bsr %rsi, %rcx + push %rbx + mov %rdi, %rbx + push %r12 + xor $63, R32(%rcx) + mov %rsi, %r12 + mov R32(%rcx), R32(%rbp) + sal R8(%rcx), %r12 +IFSTD(` mov %r12, %rdi ') C pass parameter +IFDOS(` mov %r12, %rcx ') C pass parameter +IFDOS(` sub $32, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFDOS(` add $32, %rsp ') + neg %r12 + mov %r12, %r8 + mov %rax, (%rbx) C store bi + mov %rbp, 8(%rbx) C store cnt + imul %rax, %r12 + mov %r12, 24(%rbx) C store B2modb + mov R32(%rbp), R32(%rcx) + test R32(%rcx), R32(%rcx) + jz L(z) + + mov $1, R32(%rdx) +ifdef(`SHLD_SLOW',` + C Destroys %rax, unlike shld. Otherwise, we could do B1modb + C before B2modb, and get rid of the move %r12, %r8 above. + + shl R8(%rcx), %rdx + neg R32(%rcx) + shr R8(%rcx), %rax + or %rax, %rdx + neg R32(%rcx) +',` + shld R8(%rcx), %rax, %rdx +') + imul %rdx, %r8 + shr R8(%rcx), %r8 + mov %r8, 16(%rbx) C store B1modb +L(z): + pop %r12 + pop %rbx + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/mod_1_2.asm b/gmp-6.3.0/mpn/x86_64/mod_1_2.asm new file mode 100644 index 0000000..40fcaeb --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mod_1_2.asm @@ -0,0 +1,241 @@ +dnl AMD64 mpn_mod_1s_2p + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4 +C AMD K10 4 +C Intel P4 19 +C Intel core2 8 +C Intel NHM 6.5 +C Intel SBR 4.5 +C Intel atom 28 +C VIA nano 8 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_1s_2p) + FUNC_ENTRY(4) + push %r14 + test $1, R8(%rsi) + mov %rdx, %r14 + push %r13 + mov %rcx, %r13 + push %r12 + push %rbp + push %rbx + mov 16(%rcx), %r10 + mov 24(%rcx), %rbx + mov 32(%rcx), %rbp + je L(b0) + dec %rsi + je L(one) + mov -8(%rdi,%rsi,8), %rax + mul %r10 + mov %rax, %r9 + mov %rdx, %r8 + mov (%rdi,%rsi,8), %rax + add -16(%rdi,%rsi,8), %r9 + adc $0, %r8 + mul %rbx + add %rax, %r9 + adc %rdx, %r8 + jmp L(11) + +L(b0): mov -8(%rdi,%rsi,8), %r8 + mov -16(%rdi,%rsi,8), %r9 + +L(11): sub $4, %rsi + jb L(ed2) + lea 40(%rdi,%rsi,8), %rdi + mov -40(%rdi), %r11 + mov -32(%rdi), %rax + jmp L(m0) + + ALIGN(16) +L(top): mov -24(%rdi), %r9 + add %rax, %r11 + mov -16(%rdi), %rax + adc %rdx, %r12 + mul %r10 + add %rax, %r9 + mov %r11, %rax + mov %rdx, %r8 + adc $0, %r8 + mul %rbx + add %rax, %r9 + mov %r12, %rax + adc %rdx, %r8 + mul %rbp + sub $2, %rsi + jb L(ed1) + mov -40(%rdi), %r11 + add %rax, %r9 + mov -32(%rdi), %rax + adc %rdx, %r8 +L(m0): mul %r10 + add %rax, %r11 + mov %r9, %rax + mov %rdx, %r12 + adc $0, %r12 + mul %rbx + add %rax, %r11 + lea -32(%rdi), %rdi C ap -= 4 + mov %r8, %rax + adc %rdx, %r12 + mul %rbp + sub $2, %rsi + jae L(top) + +L(ed0): mov %r11, %r9 + mov %r12, %r8 +L(ed1): add %rax, %r9 + adc %rdx, %r8 +L(ed2): mov 8(%r13), R32(%rdi) C cnt + mov %r8, %rax + mov %r9, %r8 + mul %r10 + add %rax, %r8 + adc $0, %rdx +L(1): xor R32(%rcx), R32(%rcx) + mov %r8, %r9 + sub R32(%rdi), R32(%rcx) + shr R8(%rcx), %r9 + mov R32(%rdi), R32(%rcx) + sal R8(%rcx), %rdx + or %rdx, %r9 + sal R8(%rcx), %r8 + mov %r9, %rax + mulq (%r13) + mov %rax, %rsi + inc %r9 + add %r8, %rsi + adc %r9, %rdx + imul %r14, %rdx + sub %rdx, %r8 + lea (%r8,%r14), %rax + cmp %r8, %rsi + cmovc %rax, %r8 + mov %r8, %rax + sub %r14, %rax + cmovc %r8, %rax + mov R32(%rdi), R32(%rcx) + shr R8(%rcx), %rax + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + FUNC_EXIT() + ret +L(one): + mov (%rdi), %r8 + mov 8(%rcx), R32(%rdi) + xor %rdx, %rdx + jmp L(1) +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mod_1s_2p_cps) + FUNC_ENTRY(2) + push %rbp + bsr %rsi, %rcx + push %rbx + mov %rdi, %rbx + push %r12 + xor $63, R32(%rcx) + mov %rsi, %r12 + mov R32(%rcx), R32(%rbp) C preserve cnt over call + sal R8(%rcx), %r12 C b << cnt +IFSTD(` mov %r12, %rdi ') C pass parameter +IFDOS(` mov %r12, %rcx 
') C pass parameter +IFDOS(` sub $32, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFDOS(` add $32, %rsp ') + mov %r12, %r8 + mov %rax, %r11 + mov %rax, (%rbx) C store bi + mov %rbp, 8(%rbx) C store cnt + neg %r8 + mov R32(%rbp), R32(%rcx) + mov $1, R32(%rsi) +ifdef(`SHLD_SLOW',` + shl R8(%rcx), %rsi + neg R32(%rcx) + mov %rax, %rbp + shr R8(%rcx), %rax + or %rax, %rsi + mov %rbp, %rax + neg R32(%rcx) +',` + shld R8(%rcx), %rax, %rsi C FIXME: Slow on Atom and Nano +') + imul %r8, %rsi + mul %rsi + + add %rsi, %rdx + shr R8(%rcx), %rsi + mov %rsi, 16(%rbx) C store B1modb + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr R8(%rcx), %rsi + mov %rsi, 24(%rbx) C store B2modb + + not %rdx + imul %r12, %rdx + add %rdx, %r12 + cmp %rdx, %rax + cmovnc %rdx, %r12 + + shr R8(%rcx), %r12 + mov %r12, 32(%rbx) C store B3modb + + pop %r12 + pop %rbx + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/mod_1_4.asm b/gmp-6.3.0/mpn/x86_64/mod_1_4.asm new file mode 100644 index 0000000..6cf304c --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mod_1_4.asm @@ -0,0 +1,272 @@ +dnl AMD64 mpn_mod_1s_4p + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
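+
+dnl Like mpn_mod_1_1p and mpn_mod_1s_2p above, mpn_mod_1s_4p folds several
+dnl limbs per iteration using precomputed powers of B = 2^64 reduced modulo
+dnl the (suitably normalized) divisor; the matching _cps function stores
+dnl those B^k mod b values in the pre array next to the inverse and the
+dnl shift count.  The underlying identity is just
+dnl   sum(up[i] * B^i)  ==  sum(up[i] * (B^i mod b))   (mod b).
+dnl A conceptual C sketch of that idea follows (illustrative only, with
+dnl made-up names and assuming unsigned __int128; the real code keeps a
+dnl multi-limb residue, never divides inside the loop, and finishes with a
+dnl single mpn_invert_limb-based division):
+dnl
+dnl   #include <stdint.h>
+dnl   #include <stddef.h>
+dnl
+dnl   static uint64_t mod_1_sketch (const uint64_t *up, size_t n, uint64_t b)
+dnl   {                                   /* requires b > 0 */
+dnl     uint64_t bpow = 1 % b;            /* B^0 mod b */
+dnl     unsigned __int128 r = 0;
+dnl     for (size_t i = 0; i < n; i++)
+dnl       {
+dnl         r = (r + (unsigned __int128) up[i] * bpow) % b;
+dnl         bpow = (uint64_t) (((unsigned __int128) bpow << 64) % b);
+dnl       }
+dnl     return (uint64_t) r;
+dnl   }
+dnl
+dnl Precomputing the handful of B^k mod b constants once per divisor is what
+dnl makes processing four limbs per loop iteration below worthwhile.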
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 3 +C AMD K10 3 +C Intel P4 15.5 +C Intel core2 5 +C Intel corei 4 +C Intel atom 23 +C VIA nano 4.75 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_1s_4p) + FUNC_ENTRY(4) + push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + mov %rdx, %r15 + mov %rcx, %r14 + mov 16(%rcx), %r11 C B1modb + mov 24(%rcx), %rbx C B2modb + mov 32(%rcx), %rbp C B3modb + mov 40(%rcx), %r13 C B4modb + mov 48(%rcx), %r12 C B5modb + xor R32(%r8), R32(%r8) + mov R32(%rsi), R32(%rdx) + and $3, R32(%rdx) + je L(b0) + cmp $2, R32(%rdx) + jc L(b1) + je L(b2) + +L(b3): lea -24(%rdi,%rsi,8), %rdi + mov 8(%rdi), %rax + mul %r11 + mov (%rdi), %r9 + add %rax, %r9 + adc %rdx, %r8 + mov 16(%rdi), %rax + mul %rbx + jmp L(m0) + + ALIGN(8) +L(b0): lea -32(%rdi,%rsi,8), %rdi + mov 8(%rdi), %rax + mul %r11 + mov (%rdi), %r9 + add %rax, %r9 + adc %rdx, %r8 + mov 16(%rdi), %rax + mul %rbx + add %rax, %r9 + adc %rdx, %r8 + mov 24(%rdi), %rax + mul %rbp + jmp L(m0) + + ALIGN(8) +L(b1): lea -8(%rdi,%rsi,8), %rdi + mov (%rdi), %r9 + jmp L(m1) + + ALIGN(8) +L(b2): lea -16(%rdi,%rsi,8), %rdi + mov 8(%rdi), %r8 + mov (%rdi), %r9 + jmp L(m1) + + ALIGN(16) +L(top): mov -24(%rdi), %rax + mov -32(%rdi), %r10 + mul %r11 C up[1] * B1modb + add %rax, %r10 + mov -16(%rdi), %rax + mov $0, R32(%rcx) + adc %rdx, %rcx + mul %rbx C up[2] * B2modb + add %rax, %r10 + mov -8(%rdi), %rax + adc %rdx, %rcx + sub $32, %rdi + mul %rbp C up[3] * B3modb + add %rax, %r10 + mov %r13, %rax + adc %rdx, %rcx + mul %r9 C rl * B4modb + add %rax, %r10 + mov %r12, %rax + adc %rdx, %rcx + mul %r8 C rh * B5modb + mov %r10, %r9 + mov %rcx, %r8 +L(m0): add %rax, %r9 + adc %rdx, %r8 +L(m1): sub $4, %rsi + ja L(top) + +L(end): mov 8(%r14), R32(%rsi) + mov %r8, %rax + mul %r11 + mov %rax, %r8 + add %r9, %r8 + adc $0, %rdx + xor R32(%rcx), R32(%rcx) + sub R32(%rsi), R32(%rcx) + mov %r8, %rdi + shr R8(%rcx), %rdi + mov R32(%rsi), R32(%rcx) + sal R8(%rcx), %rdx + or %rdx, %rdi + mov %rdi, %rax + mulq (%r14) + mov %r15, %rbx + mov %rax, %r9 + sal R8(%rcx), %r8 + inc %rdi + add %r8, %r9 + adc %rdi, %rdx + imul %rbx, %rdx + sub %rdx, %r8 + lea (%r8,%rbx), %rax + cmp %r8, %r9 + cmovc %rax, %r8 + mov %r8, %rax + sub %rbx, %rax + cmovc %r8, %rax + shr R8(%rcx), %rax + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mod_1s_4p_cps) + FUNC_ENTRY(2) + push %rbp + bsr %rsi, %rcx + push %rbx + mov %rdi, %rbx + push %r12 + xor $63, R32(%rcx) + mov %rsi, %r12 + mov R32(%rcx), R32(%rbp) C preserve cnt over call + sal R8(%rcx), %r12 C b << cnt +IFSTD(` mov %r12, %rdi ') C pass parameter +IFDOS(` mov %r12, %rcx ') C pass parameter +IFDOS(` sub $32, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFDOS(` add $32, %rsp ') + mov %r12, %r8 + mov %rax, %r11 + mov %rax, (%rbx) C store bi + mov %rbp, 8(%rbx) C store cnt + neg %r8 + mov R32(%rbp), R32(%rcx) + mov $1, R32(%rsi) +ifdef(`SHLD_SLOW',` + shl R8(%rcx), %rsi + neg R32(%rcx) + mov %rax, %rbp + shr R8(%rcx), %rax + or %rax, %rsi + mov %rbp, %rax + neg R32(%rcx) +',` + shld R8(%rcx), %rax, %rsi C FIXME: Slow on Atom and Nano +') + imul %r8, %rsi + mul %rsi + + add %rsi, %rdx + shr R8(%rcx), %rsi + mov %rsi, 16(%rbx) C store B1modb + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr R8(%rcx), %rsi + mov %rsi, 24(%rbx) C store B2modb + + not %rdx + 
imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr R8(%rcx), %rsi + mov %rsi, 32(%rbx) C store B3modb + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr R8(%rcx), %rsi + mov %rsi, 40(%rbx) C store B4modb + + not %rdx + imul %r12, %rdx + add %rdx, %r12 + cmp %rdx, %rax + cmovnc %rdx, %r12 + + shr R8(%rcx), %r12 + mov %r12, 48(%rbx) C store B5modb + + pop %r12 + pop %rbx + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm new file mode 100644 index 0000000..75421a6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm @@ -0,0 +1,215 @@ +dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1. + +dnl Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 0.67 0.583 is possible with zero-reg instead of $0, 4-way +C AMD K10 0.67 this seems hard to beat +C AMD bd1 1 +C AMD bd2 1 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen 0.62 +C AMD bobcat 1.07 +C AMD jaguar 1 +C Intel P4 7.35 terrible, use old code +C Intel core2 1.25 1+epsilon with huge unrolling +C Intel NHM 1.15 this seems hard to beat +C Intel SBR 0.93 +C Intel IBR 0.93 +C Intel HWL 0.82 +C Intel BWL 0.64 +C Intel SKY 0.60 +C Intel atom 2.5 +C Intel SLM 1.59 +C VIA nano 1.25 this seems hard to beat + +C INPUT PARAMETERS +define(`ap', %rdi) +define(`n', %rsi) + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) + +C TODO +C * Review feed-in and wind-down code. 
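+
+C The modulus is m = 2^48 - 1, i.e. 2^(3*64/4) - 1.  Since 2^64 == 2^16 and
+C 2^192 == 1 (mod m), limb i carries weight 2^(16*(i mod 3)); that is why the
+C main loop folds the limbs in groups of three into three accumulators, and
+C why the wind-down code recombines their pieces with shifts of 0, 16 and 32.
+C As an illustration only (made-up name, assumes unsigned __int128; note it
+C returns a fully reduced remainder, whereas the routine below returns just a
+C value congruent to {ap,n} mod m and leaves final reduction to the caller):
+C
+C   #include <stdint.h>
+C   #include <stddef.h>
+C
+C   static uint64_t mod_2pow48m1 (const uint64_t *up, size_t n)
+C   {
+C     const uint64_t m = 0x0000FFFFFFFFFFFF;     /* 2^48 - 1 */
+C     unsigned __int128 acc = 0;
+C     for (size_t i = 0; i < n; i++)
+C       acc = (acc + ((unsigned __int128) up[i] << (16 * (i % 3)))) % m;
+C     return (uint64_t) acc;
+C   }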
+ +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mod_34lsub1) + FUNC_ENTRY(2) + + mov $0x0000FFFFFFFFFFFF, %r11 + + mov (ap), %rax + + cmp $2, %rsi + ja L(gt2) + + jb L(one) + + mov 8(ap), %rsi + mov %rax, %rdx + shr $48, %rax C src[0] low + + and %r11, %rdx C src[0] high + add %rdx, %rax + mov R32(%rsi), R32(%rdx) + + shr $32, %rsi C src[1] high + add %rsi, %rax + + shl $16, %rdx C src[1] low + add %rdx, %rax +L(one): FUNC_EXIT() + ret + + +C Don't change this, the wind-down code is not able to handle greater values +define(UNROLL,3) + +L(gt2): mov 8(ap), %rcx + mov 16(ap), %rdx + xor %r9, %r9 + add $24, ap + sub $eval(UNROLL*3+3), %rsi + jc L(end) + ALIGN(16) +L(top): + add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 +forloop(i,1,UNROLL-1,`dnl + add eval(i*24)(ap), %rax + adc eval(i*24+8)(ap), %rcx + adc eval(i*24+16)(ap), %rdx + adc $0, %r9 +')dnl + add $eval(UNROLL*24), ap + sub $eval(UNROLL*3), %rsi + jnc L(top) + +L(end): + lea L(tab)(%rip), %r8 +ifdef(`PIC', +` movslq 36(%r8,%rsi,4), %r10 + add %r10, %r8 + jmp *%r8 +',` + jmp *72(%r8,%rsi,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(4), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) + JMPENT( L(8), L(tab)) + TEXT + +L(6): add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +L(3): add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + jmp L(cj1) + +L(7): add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +L(4): add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +L(1): add (ap), %rax + adc $0, %rcx + jmp L(cj2) + +L(8): add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +L(5): add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +L(2): add (ap), %rax + adc 8(ap), %rcx + +L(cj2): adc $0, %rdx +L(cj1): adc $0, %r9 +L(0): add %r9, %rax + adc $0, %rcx + adc $0, %rdx + adc $0, %rax + + mov %rax, %rdi C 0mod3 + shr $48, %rax C 0mod3 high + + and %r11, %rdi C 0mod3 low + mov R32(%rcx), R32(%r10) C 1mod3 + + shr $32, %rcx C 1mod3 high + + add %rdi, %rax C apply 0mod3 low + movzwl %dx, R32(%rdi) C 2mod3 + shl $16, %r10 C 1mod3 low + + add %rcx, %rax C apply 1mod3 high + shr $16, %rdx C 2mod3 high + + add %r10, %rax C apply 1mod3 low + shl $32, %rdi C 2mod3 low + + add %rdx, %rax C apply 2mod3 high + add %rdi, %rax C apply 2mod3 low + + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/mode1o.asm b/gmp-6.3.0/mpn/x86_64/mode1o.asm new file mode 100644 index 0000000..2cd2b08 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mode1o.asm @@ -0,0 +1,171 @@ +dnl AMD64 mpn_modexact_1_odd -- Hensel norm remainder. + +dnl Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 10 +C AMD K10 10 +C Intel P4 33 +C Intel core2 13 +C Intel corei 14.5 +C Intel atom 35 +C VIA nano ? + + +C The dependent chain in the main loop is +C +C cycles +C sub %rdx, %rax 1 +C imul %r9, %rax 4 +C mul %r8 5 +C ---- +C total 10 +C +C The mov load from src seems to need to be scheduled back before the jz to +C achieve this speed, out-of-order execution apparently can't completely hide +C the latency otherwise. +C +C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it +C for the first iteration (where there's no cbit). +C +C The code alignment used (32-byte) for the loop also seems necessary. Without +C that the non-PIC case has adc crossing the 0x60 offset, apparently making it +C run at 11 cycles instead of 10. + + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_modexact_1_odd) + FUNC_ENTRY(3) + mov $0, R32(%rcx) +IFDOS(` jmp L(ent) ') + +PROLOGUE(mpn_modexact_1c_odd) + FUNC_ENTRY(4) +L(ent): + C rdi src + C rsi size + C rdx divisor + C rcx carry + + mov %rdx, %r8 C d + shr R32(%rdx) C d/2 + + LEA( binvert_limb_table, %r9) + + and $127, R32(%rdx) + mov %rcx, %r10 C initial carry + + movzbl (%r9,%rdx), R32(%rdx) C inv 8 bits + + mov (%rdi), %rax C src[0] + lea (%rdi,%rsi,8), %r11 C src end + mov %r8, %rdi C d, made available to imull + + lea (%rdx,%rdx), R32(%rcx) C 2*inv + imul R32(%rdx), R32(%rdx) C inv*inv + + neg %rsi C -size + + imul R32(%rdi), R32(%rdx) C inv*inv*d + + sub R32(%rdx), R32(%rcx) C inv = 2*inv - inv*inv*d, 16 bits + + lea (%rcx,%rcx), R32(%rdx) C 2*inv + imul R32(%rcx), R32(%rcx) C inv*inv + + imul R32(%rdi), R32(%rcx) C inv*inv*d + + sub R32(%rcx), R32(%rdx) C inv = 2*inv - inv*inv*d, 32 bits + xor R32(%rcx), R32(%rcx) C initial cbit + + lea (%rdx,%rdx), %r9 C 2*inv + imul %rdx, %rdx C inv*inv + + imul %r8, %rdx C inv*inv*d + + sub %rdx, %r9 C inv = 2*inv - inv*inv*d, 64 bits + mov %r10, %rdx C initial climb + + ASSERT(e,` C d*inv == 1 mod 2^64 + mov %r8, %r10 + imul %r9, %r10 + cmp $1, %r10') + + inc %rsi + jz L(one) + + + ALIGN(16) +L(top): + C rax l = src[i]-cbit + C rcx new cbit, 0 or 1 + C rdx climb, high of last product + C rsi counter, limbs, negative + C rdi + C r8 divisor + C r9 inverse + C r11 src end ptr + + sub %rdx, %rax C l = src[i]-cbit - climb + + adc $0, %rcx C more cbit + imul %r9, %rax C q = l * inverse + + mul %r8 C climb = high (q * d) + + mov (%r11,%rsi,8), %rax C src[i+1] + sub %rcx, %rax C next l = src[i+1] - cbit + setc R8(%rcx) C new cbit + + inc %rsi + jnz L(top) + + +L(one): + sub %rdx, %rax C l = src[i]-cbit - climb + + adc $0, %rcx C more cbit + imul %r9, %rax C q = l * inverse + + mul %r8 C climb = high (q * d) + + lea (%rcx,%rdx), %rax C climb+cbit + FUNC_EXIT() + ret + +EPILOGUE(mpn_modexact_1c_odd) +EPILOGUE(mpn_modexact_1_odd) diff --git a/gmp-6.3.0/mpn/x86_64/mul_1.asm b/gmp-6.3.0/mpn/x86_64/mul_1.asm new file mode 100644 index 0000000..e1ba89b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mul_1.asm @@ -0,0 +1,192 @@ +dnl AMD64 mpn_mul_1. 
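Before going on to mpn_mul_1, here is a compact C model of what the
mpn_modexact_1_odd loop above does.  It is illustrative only (the names are
invented and unsigned __int128 is a GCC/Clang extension): binv64() builds the
inverse of the odd divisor modulo 2^64 by Newton doubling, the same
inv = 2*inv - inv*inv*d steps as the assembly (which seeds from
binvert_limb_table and therefore needs one doubling fewer), and the scan then
uses q = l * dinv so that q*d agrees with l in its low limb; only the high
limb of q*d (the "climb") and the borrow bit have to be carried into the next
limb.  The final climb + cbit is zero exactly when d divides the input (with
a zero initial carry), which is the property the divisibility code built on
this routine relies on.  mpn_divexact_1 (nano/dive_1.asm later in this patch)
uses the same inverse but stores the q values as quotient limbs.

#include <stdint.h>

/* Inverse of an odd d modulo 2^64.  (3*d) XOR 2 is the classic seed giving
   5 correct low bits; each Newton step doubles the number of correct bits. */
static uint64_t
binv64 (uint64_t d)
{
  uint64_t x = (d * 3) ^ 2;     /*  5 bits */
  x *= 2 - d * x;               /* 10 bits */
  x *= 2 - d * x;               /* 20 bits */
  x *= 2 - d * x;               /* 40 bits */
  x *= 2 - d * x;               /* 80 bits, i.e. all 64 */
  return x;
}

/* Hensel-style remainder scan; zero iff d divides {up,n}. */
static uint64_t
modexact_1_odd_model (const uint64_t *up, long n, uint64_t d)
{
  uint64_t dinv = binv64 (d);
  uint64_t climb = 0, cbit = 0;

  for (long i = 0; i < n; i++)
    {
      uint64_t t = up[i] - cbit;
      uint64_t b1 = up[i] < cbit;
      uint64_t l = t - climb;
      uint64_t b2 = t < climb;
      uint64_t q = l * dinv;    /* q*d has low limb l, by choice of dinv */
      climb = (uint64_t) (((unsigned __int128) q * d) >> 64);
      cbit = b1 + b2;
    }
  return climb + cbit;
}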
+ +dnl Copyright 2003-2005, 2007, 2008, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.54 +C AMD K10 2.54 +C AMD bull 4.98 +C AMD pile 4.80 +C AMD steam +C AMD excavator +C AMD bobcat 5.37 +C AMD jaguar 6.16 +C Intel P4 12.6 +C Intel core2 4.05 +C Intel NHM 4.0 +C Intel SBR 2.91 +C Intel IBR 2.73 +C Intel HWL 2.44 +C Intel BWL 2.39 +C Intel SKL 2.44 +C Intel atom 19.8 +C Intel SLM 9.0 +C VIA nano 4.25 + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * The loop is great, but the prologue and epilogue code was quickly written. +C Tune it! + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vl', `%rcx') C r9 + +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`vl', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1c) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + push %rbx +IFSTD(` mov %r8, %r10') +IFDOS(` mov 64(%rsp), %r10') C 40 + 3*8 (3 push insns) + jmp L(common) +EPILOGUE() + +PROLOGUE(mpn_mul_1) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + push %rbx + xor %r10, %r10 +L(common): + mov (up), %rax C read first u limb early +IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it +IFDOS(` mov n, %rbx ') + mul vl +IFSTD(` mov %rbx, n ') + + add %r10, %rax + adc $0, %rdx + + and $3, R32(%rbx) + jz L(b0) + cmp $2, R32(%rbx) + jz L(b2) + jg L(b3) + +L(b1): dec n + jne L(gt1) + mov %rax, (rp) + jmp L(ret) +L(gt1): lea 8(up,n,8), up + lea -8(rp,n,8), rp + neg n + xor %r10, %r10 + xor R32(%rbx), R32(%rbx) + mov %rax, %r9 + mov (up,n,8), %rax + mov %rdx, %r8 + jmp L(L1) + +L(b0): lea (up,n,8), up + lea -16(rp,n,8), rp + neg n + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp L(L0) + +L(b3): lea -8(up,n,8), up + lea -24(rp,n,8), rp + neg n + mov %rax, %rbx + mov %rdx, %r10 + jmp L(L3) + +L(b2): lea -16(up,n,8), up + lea -32(rp,n,8), rp + neg n + xor %r8, %r8 + xor R32(%rbx), R32(%rbx) + mov %rax, %r10 + mov 24(up,n,8), %rax + mov %rdx, %r9 + jmp L(L2) + + ALIGN(16) +L(top): mov %r10, (rp,n,8) + add %rax, %r9 + mov (up,n,8), %rax + adc %rdx, %r8 + mov $0, R32(%r10) 
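+C The four blocks below (L(L1), L(L0), L(L3), L(L2)) each multiply one
+C source limb by vl, fold the product into the rotating carry registers
+C %r9, %r8, %rbx and %r10, and store one completed result limb, so every
+C pass through the loop retires four limbs.  The L(b0)..L(b3) feed-in
+C paths above enter at whichever block matches n mod 4.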
+L(L1): mul vl + mov %r9, 8(rp,n,8) + add %rax, %r8 + adc %rdx, %rbx +L(L0): mov 8(up,n,8), %rax + mul vl + mov %r8, 16(rp,n,8) + add %rax, %rbx + adc %rdx, %r10 +L(L3): mov 16(up,n,8), %rax + mul vl + mov %rbx, 24(rp,n,8) + mov $0, R32(%r8) C zero + mov %r8, %rbx C zero + add %rax, %r10 + mov 24(up,n,8), %rax + mov %r8, %r9 C zero + adc %rdx, %r9 +L(L2): mul vl + add $4, n + js L(top) + + mov %r10, (rp,n,8) + add %rax, %r9 + adc %r8, %rdx + mov %r9, 8(rp,n,8) + add %r8, %rdx +L(ret): mov %rdx, %rax + + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/mul_2.asm b/gmp-6.3.0/mpn/x86_64/mul_2.asm new file mode 100644 index 0000000..d64313b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mul_2.asm @@ -0,0 +1,204 @@ +dnl AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and +dnl store the result in a third limb vector. + +dnl Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.53 +C AMD K10 4.53 +C AMD bull 9.76 10.37 +C AMD pile 9.22 +C AMD steam +C AMD excavator +C AMD bobcat 11.3 +C AMD jaguar 11.9 +C Intel P4 25.0 +C Intel core2 8.05 +C Intel NHM 7.72 +C Intel SBR 6.33 +C Intel IBR 6.15 +C Intel HWL 6.00 +C Intel BWL 4.44 +C Intel SKL 4.54 +C Intel atom 39.0 +C Intel SLM 24.0 +C VIA nano + +C This code is the result of running a code generation and optimization tool +C suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Work on feed-in and wind-down code. +C * Convert "mov $0" to "xor". +C * Adjust initial lea to save some bytes. +C * Perhaps adjust n from n_param&3 value? +C * Replace with 2.25 c/l sequence. 
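What mpn_mul_2 must deliver is easy to state in C: the product {up,n} * {vp,2}
has n+2 limbs, the low n+1 of which are stored at rp while the most
significant limb is returned.  The model below is illustrative only (the name
and the use of unsigned __int128 are ours, not GMP's); it keeps a two-limb
pending window c0/c1, which is what the w0..w3 register rotation in the
unrolled loop implements four limbs at a time.

#include <stdint.h>

uint64_t
mul_2_ref (uint64_t *rp, const uint64_t *up, long n, const uint64_t vp[2])
{
  uint64_t c0 = 0, c1 = 0;          /* product limbs pending for i and i+1 */

  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t0 = (unsigned __int128) up[i] * vp[0] + c0;
      unsigned __int128 t1 = (unsigned __int128) up[i] * vp[1] + c1
                             + (uint64_t) (t0 >> 64);
      rp[i] = (uint64_t) t0;
      c0 = (uint64_t) t1;
      c1 = (uint64_t) (t1 >> 64);
    }
  rp[n] = c0;
  return c1;                        /* most significant limb of the product */
}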
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param',`%rdx') +define(`vp', `%rcx') + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (vp), v0 + mov 8(vp), v1 + + mov (up), %rax + + mov n_param, n + neg n + lea -8(up,n_param,8), up + lea -8(rp,n_param,8), rp + + and $3, R32(n_param) + jz L(m2p0) + cmp $2, R32(n_param) + jc L(m2p1) + jz L(m2p2) +L(m2p3): + mul v0 + xor R32(w3), R32(w3) + mov %rax, w1 + mov %rdx, w2 + mov 8(up,n,8), %rax + add $-1, n + mul v1 + add %rax, w2 + jmp L(m23) +L(m2p0): + mul v0 + xor R32(w2), R32(w2) + mov %rax, w0 + mov %rdx, w1 + jmp L(m20) +L(m2p1): + mul v0 + xor R32(w3), R32(w3) + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + add $1, n + jmp L(m2top) +L(m2p2): + mul v0 + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + mov %rax, w2 + mov %rdx, w3 + mov 8(up,n,8), %rax + add $-2, n + jmp L(m22) + + + ALIGN(32) +L(m2top): + add %rax, w3 + adc %rdx, w0 + mov 0(up,n,8), %rax + adc $0, R32(w1) + mov $0, R32(w2) + mul v1 + add %rax, w0 + mov w3, 0(rp,n,8) + adc %rdx, w1 + mov 8(up,n,8), %rax + mul v0 + add %rax, w0 + adc %rdx, w1 + adc $0, R32(w2) +L(m20): mov 8(up,n,8), %rax + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 16(up,n,8), %rax + mov $0, R32(w3) + mul v0 + add %rax, w1 + mov 16(up,n,8), %rax + adc %rdx, w2 + adc $0, R32(w3) + mul v1 + add %rax, w2 + mov w0, 8(rp,n,8) +L(m23): adc %rdx, w3 + mov 24(up,n,8), %rax + mul v0 + mov $0, R32(w0) + add %rax, w2 + adc %rdx, w3 + mov w1, 16(rp,n,8) + mov 24(up,n,8), %rax + mov $0, R32(w1) + adc $0, R32(w0) +L(m22): mul v1 + add %rax, w3 + mov w2, 24(rp,n,8) + adc %rdx, w0 + mov 32(up,n,8), %rax + mul v0 + add $4, n + js L(m2top) + + + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) + mov (up), %rax + mul v1 + mov w3, (rp) + add %rax, w0 + adc %rdx, w1 + mov w0, 8(rp) + mov w1, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm new file mode 100644 index 0000000..9ceb611 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm @@ -0,0 +1,157 @@ +dnl AMD64 mpn_addmul_1 for CPUs with mulx and adx. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
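The point of this variant is the mulx/adcx/adox instruction extensions: mulx
forms a full 64x64->128 product without touching the flags, while adcx and
adox propagate carries through CF and OF respectively, so two independent
carry chains can run interleaved.  In the loop further down, one chain (adcx)
links the partial-product halves and the other (adox) adds them into the
limbs already at rp.  Functionally the routine is still plain addmul_1; an
illustrative C reference (not the GMP code; unsigned __int128 is a compiler
extension) is:

#include <stdint.h>

uint64_t
addmul_1_ref (uint64_t *rp, const uint64_t *up, long n, uint64_t v0)
{
  uint64_t carry = 0;               /* carry limb between positions */

  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + carry;
      rp[i] = (uint64_t) t;
      carry = (uint64_t) (t >> 64);
    }
  return carry;                     /* carry out of the top limb */
}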
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 - +C AMD zen ? +C AMD bt1 - +C AMD bt2 - +C Intel P4 - +C Intel PNR - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL - +C Intel BWL ? +C Intel SKL ? +C Intel atom - +C Intel SLM - +C VIA nano - + +define(`rp', `%rdi') dnl rcx +define(`up', `%rsi') dnl rdx +define(`n_param', `%rdx') dnl r8 +define(`v0_param',`%rcx') dnl r9 + +define(`n', `%rcx') dnl +define(`v0', `%rdx') dnl + +C Testing mechanism for running this on older AMD64 processors +ifelse(FAKE_MULXADX,1,` + include(CONFIG_TOP_SRCDIR`/mpn/x86_64/missing-call.m4') +',` + define(`adox', ``adox' $1, $2') + define(`adcx', ``adcx' $1, $2') + define(`mulx', ``mulx' $1, $2, $3') +') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_addmul_1) + mov (up), %r8 + + push %rbx + push %r12 + push %r13 + + lea (up,n_param,8), up + lea -16(rp,n_param,8), rp + mov R32(n_param), R32(%rax) + xchg v0_param, v0 C FIXME: is this insn fast? + + neg n + + and $3, R8(%rax) + jz L(b0) + cmp $2, R8(%rax) + jl L(b1) + jz L(b2) + +L(b3): mulx( (up,n,8), %r11, %r10) + mulx( 8(up,n,8), %r13, %r12) + mulx( 16(up,n,8), %rbx, %rax) + dec n + jmp L(lo3) + +L(b0): mulx( (up,n,8), %r9, %r8) + mulx( 8(up,n,8), %r11, %r10) + mulx( 16(up,n,8), %r13, %r12) + jmp L(lo0) + +L(b2): mulx( (up,n,8), %r13, %r12) + mulx( 8(up,n,8), %rbx, %rax) + lea 2(n), n + jrcxz L(wd2) +L(gt2): mulx( (up,n,8), %r9, %r8) + jmp L(lo2) + +L(b1): and R8(%rax), R8(%rax) + mulx( (up,n,8), %rbx, %rax) + lea 1(n), n + jrcxz L(wd1) + mulx( (up,n,8), %r9, %r8) + mulx( 8(up,n,8), %r11, %r10) + jmp L(lo1) + +L(end): adcx( %r10, %r13) + mov %r11, -8(rp) +L(wd2): adox( (rp), %r13) + adcx( %r12, %rbx) + mov %r13, (rp) +L(wd1): adox( 8(rp), %rbx) + adcx( %rcx, %rax) + adox( %rcx, %rax) + mov %rbx, 8(rp) + pop %r13 + pop %r12 + pop %rbx + ret + +L(top): jrcxz L(end) + mulx( (up,n,8), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp,n,8) +L(lo2): adox( (rp,n,8), %r13) + mulx( 8(up,n,8), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp,n,8) +L(lo1): adox( 8(rp,n,8), %rbx) + mulx( 16(up,n,8), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp,n,8) +L(lo0): adox( 16(rp,n,8), %r9) + mulx( 24(up,n,8), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp,n,8) +L(lo3): adox( 24(rp,n,8), %r11) + lea 4(n), n + jmp L(top) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/nano/copyd.asm b/gmp-6.3.0/mpn/x86_64/nano/copyd.asm new file mode 100644 index 0000000..f0dc54a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/nano/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd optimised for Intel Sandy Bridge. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/nano/copyi.asm b/gmp-6.3.0/mpn/x86_64/nano/copyi.asm new file mode 100644 index 0000000..9c26e00 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/nano/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi optimised for Intel Sandy Bridge. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/nano/dive_1.asm b/gmp-6.3.0/mpn/x86_64/nano/dive_1.asm new file mode 100644 index 0000000..e9a0763 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/nano/dive_1.asm @@ -0,0 +1,166 @@ +dnl AMD64 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2001, 2002, 2004-2006, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C norm unorm +C AMD K8,K9 11 11 +C AMD K10 11 11 +C Intel P4 ? 
+C Intel core2 13.5 13.25 +C Intel corei 14.25 +C Intel atom 34 36 +C VIA nano 19.25 19.25 + + +C INPUT PARAMETERS +C rp rdi +C up rsi +C n rdx +C divisor rcx + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_divexact_1) + FUNC_ENTRY(4) + push %rbx + + mov %rcx, %rax + xor R32(%rcx), R32(%rcx) C shift count + mov %rdx, %r8 + + bt $0, R32(%rax) + jc L(odd) C skip bsfq unless divisor is even + bsf %rax, %rcx + shr R8(%rcx), %rax +L(odd): mov %rax, %rbx + shr R32(%rax) + and $127, R32(%rax) C d/2, 7 bits + + LEA( binvert_limb_table, %rdx) + + movzbl (%rdx,%rax), R32(%rax) C inv 8 bits + + mov %rbx, %r11 C d without twos + + lea (%rax,%rax), R32(%rdx) C 2*inv + imul R32(%rax), R32(%rax) C inv*inv + imul R32(%rbx), R32(%rax) C inv*inv*d + sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits + + lea (%rdx,%rdx), R32(%rax) C 2*inv + imul R32(%rdx), R32(%rdx) C inv*inv + imul R32(%rbx), R32(%rdx) C inv*inv*d + sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits + + lea (%rax,%rax), %r10 C 2*inv + imul %rax, %rax C inv*inv + imul %rbx, %rax C inv*inv*d + sub %rax, %r10 C inv = 2*inv - inv*inv*d, 64 bits + + lea (%rsi,%r8,8), %rsi C up end + lea -8(%rdi,%r8,8), %rdi C rp end + neg %r8 C -n + + mov (%rsi,%r8,8), %rax C up[0] + + inc %r8 + jz L(one) + + test R32(%rcx), R32(%rcx) + jnz L(unorm) C branch if count != 0 + xor R32(%rbx), R32(%rbx) + jmp L(nent) + + ALIGN(8) +L(ntop):mul %r11 C carry limb in rdx 0 10 + mov -8(%rsi,%r8,8), %rax C + sub %rbx, %rax C apply carry bit + setc %bl C + sub %rdx, %rax C apply carry limb 5 + adc $0, %rbx C 6 +L(nent):imul %r10, %rax C 6 + mov %rax, (%rdi,%r8,8) C + inc %r8 C + jnz L(ntop) + + mov -8(%rsi), %r9 C up high limb + jmp L(com) + +L(unorm): + mov (%rsi,%r8,8), %r9 C up[1] + shr R8(%rcx), %rax C + neg R32(%rcx) + shl R8(%rcx), %r9 C + neg R32(%rcx) + or %r9, %rax + xor R32(%rbx), R32(%rbx) + jmp L(uent) + + ALIGN(8) +L(utop):mul %r11 C carry limb in rdx 0 10 + mov (%rsi,%r8,8), %rax C + shl R8(%rcx), %rax C + neg R32(%rcx) + or %r9, %rax + sub %rbx, %rax C apply carry bit + setc %bl C + sub %rdx, %rax C apply carry limb 5 + adc $0, %rbx C 6 +L(uent):imul %r10, %rax C 6 + mov (%rsi,%r8,8), %r9 C + shr R8(%rcx), %r9 C + neg R32(%rcx) + mov %rax, (%rdi,%r8,8) C + inc %r8 C + jnz L(utop) + +L(com): mul %r11 C carry limb in rdx + sub %rbx, %r9 C apply carry bit + sub %rdx, %r9 C apply carry limb + imul %r10, %r9 + mov %r9, (%rdi) + pop %rbx + FUNC_EXIT() + ret + +L(one): shr R8(%rcx), %rax + imul %r10, %rax + mov %rax, (%rdi) + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm new file mode 100644 index 0000000..4723093 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h new file mode 100644 index 0000000..fde69db --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h @@ -0,0 +1,243 @@ +/* VIA Nano gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#define SHLD_SLOW 1 +#define SHRD_SLOW 1 + +/* 1600 MHz Nano 2xxx */ +/* FFT tuning limit = 25000000 */ +/* Generated by tuneup.c, 2014-03-12, gcc 4.2 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define MUL_TOOM22_THRESHOLD 27 +#define MUL_TOOM33_THRESHOLD 38 +#define MUL_TOOM44_THRESHOLD 324 +#define MUL_TOOM6H_THRESHOLD 450 +#define MUL_TOOM8H_THRESHOLD 632 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 207 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 211 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 219 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 315 + +#define SQR_BASECASE_THRESHOLD 10 +#define SQR_TOOM2_THRESHOLD 52 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 387 +#define SQR_TOOM6_THRESHOLD 662 +#define SQR_TOOM8_THRESHOLD 781 + +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 376, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 43,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 79,11}, { 47,10}, { 103,12}, \ + { 31,11}, { 63,10}, { 143,11}, { 79,10}, \ + { 159, 9}, { 319,10}, { 175,11}, { 95, 9}, \ + { 383, 8}, { 767,10}, { 207,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255,11}, { 143, 9}, \ + { 575, 8}, { 1151,10}, { 303,11}, { 159,10}, \ + { 319, 9}, { 639, 8}, { 1279,10}, { 335,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831, 8}, { 1663,10}, \ + { 447,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511, 9}, { 1023,11}, { 271,10}, { 543, 9}, \ + { 1087,10}, { 575, 9}, { 1215,12}, { 159,11}, \ + { 319,10}, { 639, 9}, { 1279,11}, { 335,10}, \ + { 671, 9}, { 1343,11}, { 351,10}, { 703, 9}, \ + { 1407,12}, { 191,11}, { 383,10}, { 767, 9}, \ + { 1535,10}, { 831, 9}, { 1663,12}, { 223,11}, \ + { 447,10}, { 895,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \ + { 287,11}, { 575,10}, { 1151,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 639,10}, { 1279,11}, \ + { 671,10}, { 1343,12}, { 351,11}, { 703,10}, \ + { 1407,13}, { 191,12}, { 383,11}, { 767,10}, \ + { 1535,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447,11}, { 895,10}, { 1791,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \ + { 1087,12}, { 575,11}, { 1151,12}, { 607,11}, \ + { 1215,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \ + { 1663,13}, { 447,12}, { 895,11}, { 1791,13}, \ + { 511,12}, { 1023,11}, { 2047,12}, { 1087,13}, \ + { 575,12}, { 1151,11}, { 2303,12}, { 1215,13}, \ + { 639,12}, { 
1279,11}, { 2559,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 895,12}, \ + { 1791,13}, { 959,14}, { 511,13}, { 1023,12}, \ + { 2047,13}, { 1087,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,14}, { 639,13}, { 1279,12}, \ + { 2559,13}, { 1407,12}, { 2815,13}, { 1471,14}, \ + { 767,13}, { 1535,12}, { 3071,13}, { 1663,14}, \ + { 895,13}, { 1791,12}, { 3583,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2047,12}, { 4095,13}, \ + { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \ + { 2431,14}, { 1279,13}, { 2559,12}, { 5119,14}, \ + { 1407,13}, { 2815,12}, { 5631,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 224 +#define MUL_FFT_THRESHOLD 3520 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 63, 9}, { 127,10}, { 71, 9}, \ + { 143,10}, { 79,11}, { 47,10}, { 95, 9}, \ + { 191,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135, 7}, { 1087, 9}, \ + { 287,11}, { 79, 9}, { 319, 8}, { 639,10}, \ + { 167,11}, { 95,10}, { 191, 9}, { 383, 8}, \ + { 767,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511, 8}, { 1023,10}, { 271, 9}, \ + { 543, 8}, { 1087,11}, { 143, 9}, { 575, 8}, \ + { 1151,10}, { 303, 9}, { 639, 8}, { 1279,10}, \ + { 335, 9}, { 671,10}, { 351, 9}, { 703,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511, 9}, { 1023,11}, \ + { 271,10}, { 543, 9}, { 1087,10}, { 575, 9}, \ + { 1151,11}, { 303,10}, { 607, 9}, { 1215,12}, \ + { 159,11}, { 319,10}, { 639, 9}, { 1279,10}, \ + { 671, 9}, { 1343,11}, { 351,10}, { 703, 9}, \ + { 1407,12}, { 191,11}, { 383,10}, { 767, 9}, \ + { 1535,11}, { 415,10}, { 831, 9}, { 1663,12}, \ + { 223,11}, { 447,10}, { 959,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,11}, { 575,10}, { 1215,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,10}, { 1343,12}, \ + { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \ + { 383,11}, { 767,10}, { 1535,12}, { 415,11}, \ + { 831,10}, { 1663,12}, { 447,11}, { 895,10}, \ + { 1791,12}, { 479,11}, { 959,14}, { 127,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,11}, { 1215,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,13}, \ + { 511,12}, { 1023,11}, { 2047,12}, { 1087,13}, \ + { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1407,11}, { 2815,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 895,12}, \ + { 1791,13}, { 959,14}, { 511,13}, { 1023,12}, \ + { 2047,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 639,13}, { 1279,12}, { 2559,13}, { 1407,12}, \ + { 2815,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1663,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2047,12}, \ + { 4095,13}, { 2175,14}, { 1151,13}, { 2303,12}, \ + { 4607,14}, { 1279,13}, { 
2559,14}, { 1407,13}, \ + { 2815,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 230 +#define SQR_FFT_THRESHOLD 2496 + +#define MULLO_BASECASE_THRESHOLD 13 +#define MULLO_DC_THRESHOLD 38 +#define MULLO_MUL_N_THRESHOLD 6633 + +#define DC_DIV_QR_THRESHOLD 56 +#define DC_DIVAPPR_Q_THRESHOLD 173 +#define DC_BDIV_QR_THRESHOLD 55 +#define DC_BDIV_Q_THRESHOLD 96 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 202 +#define INV_APPR_THRESHOLD 166 + +#define BINV_NEWTON_THRESHOLD 246 +#define REDC_1_TO_REDC_2_THRESHOLD 7 +#define REDC_2_TO_REDC_N_THRESHOLD 85 + +#define MU_DIV_QR_THRESHOLD 1499 +#define MU_DIVAPPR_Q_THRESHOLD 1652 +#define MUPI_DIV_QR_THRESHOLD 83 +#define MU_BDIV_QR_THRESHOLD 1210 +#define MU_BDIV_Q_THRESHOLD 1499 + +#define POWM_SEC_TABLE 1,28,129,642,2387 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 127 +#define HGCD_APPR_THRESHOLD 214 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 487 +#define GCDEXT_DC_THRESHOLD 505 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 802 +#define SET_STR_PRECOMPUTE_THRESHOLD 2042 + +#define FAC_DSC_THRESHOLD 1737 +#define FAC_ODD_THRESHOLD 44 diff --git a/gmp-6.3.0/mpn/x86_64/nano/popcount.asm b/gmp-6.3.0/mpn/x86_64/nano/popcount.asm new file mode 100644 index 0000000..fb14dd3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/nano/popcount.asm @@ -0,0 +1,35 @@ +dnl x86-64 mpn_popcount. + +dnl Copyright 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86/pentium4/sse2/popcount.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm new file mode 100644 index 0000000..7ae6a1a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addmul_2 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_2) +include_mpn(`x86_64/bd1/addmul_2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm new file mode 100644 index 0000000..8e6ee1b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm @@ -0,0 +1,196 @@ +dnl x86-64 mpn_add_n/mpn_sub_n optimized for Pentium 4. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 2.8 +C AMD K10 2.8 +C Intel P4 4 +C Intel core2 3.6-5 (fluctuating) +C Intel corei ? +C Intel atom ? +C VIA nano ? + + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ifdef(`OPERATION_add_n', ` + define(ADDSUB, add) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADDSUB, sub) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +ASM_START() + TEXT +PROLOGUE(func) + FUNC_ENTRY(4) + xor %r8, %r8 +IFDOS(` jmp L(ent) ') +EPILOGUE() +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +L(ent): push %rbx + push %r12 + + mov (vp), %r9 + + mov R32(n), R32(%rax) + and $3, R32(%rax) + jne L(n00) C n = 0, 4, 8, ... + mov R32(%r8), R32(%rbx) + mov (up), %r8 + mov 8(up), %r10 + ADDSUB %r9, %r8 + mov 8(vp), %r9 + setc R8(%rax) + lea -16(rp), rp + jmp L(L00) + +L(n00): cmp $2, R32(%rax) + jnc L(n01) C n = 1, 5, 9, ... 
+ mov (up), %r11 + mov R32(%r8), R32(%rax) + xor R32(%rbx), R32(%rbx) + dec n + jnz L(gt1) + ADDSUB %r9, %r11 + setc R8(%rbx) + ADDSUB %rax, %r11 + adc $0, R32(%rbx) + mov %r11, (rp) + jmp L(ret) +L(gt1): mov 8(up), %r8 + ADDSUB %r9, %r11 + mov 8(vp), %r9 + setc R8(%rbx) + lea -8(rp), rp + lea 8(up), up + lea 8(vp), vp + jmp L(L01) + +L(n01): jne L(n10) C n = 2, 6, 10, ... + mov (up), %r12 + mov R32(%r8), R32(%rbx) + mov 8(up), %r11 + ADDSUB %r9, %r12 + mov 8(vp), %r9 + setc R8(%rax) + lea -32(rp), rp + lea 16(up), up + lea 16(vp), vp + jmp L(L10) + +L(n10): mov (up), %r10 C n = 3, 7, 11, ... + mov R32(%r8), R32(%rax) + xor R32(%rbx), R32(%rbx) + mov 8(up), %r12 + ADDSUB %r9, %r10 + mov 8(vp), %r9 + setc R8(%rbx) + lea -24(rp), rp + lea -8(up), up + lea -8(vp), vp + jmp L(L11) + +L(c0): mov $1, R8(%rbx) + jmp L(rc0) +L(c1): mov $1, R8(%rax) + jmp L(rc1) +L(c2): mov $1, R8(%rbx) + jmp L(rc2) +L(c3): mov $1, R8(%rax) + jmp L(rc3) + + ALIGN(16) +L(top): mov (up), %r8 C not on critical path + ADDSUB %r9, %r11 C not on critical path + mov (vp), %r9 C not on critical path + setc R8(%rbx) C save carry out + mov %r12, (rp) +L(L01): ADDSUB %rax, %r11 C apply previous carry out + jc L(c0) C jump if ripple +L(rc0): mov 8(up), %r10 + ADDSUB %r9, %r8 + mov 8(vp), %r9 + setc R8(%rax) + mov %r11, 8(rp) +L(L00): ADDSUB %rbx, %r8 + jc L(c1) +L(rc1): mov 16(up), %r12 + ADDSUB %r9, %r10 + mov 16(vp), %r9 + setc R8(%rbx) + mov %r8, 16(rp) +L(L11): ADDSUB %rax, %r10 + jc L(c2) +L(rc2): mov 24(up), %r11 + ADDSUB %r9, %r12 + lea 32(up), up + mov 24(vp), %r9 + lea 32(vp), vp + setc R8(%rax) + mov %r10, 24(rp) +L(L10): ADDSUB %rbx, %r12 + jc L(c3) +L(rc3): lea 32(rp), rp + sub $4, n + ja L(top) + +L(end): ADDSUB %r9, %r11 + setc R8(%rbx) + mov %r12, (rp) + ADDSUB %rax, %r11 + jnc L(1) + mov $1, R8(%rbx) +L(1): mov %r11, 8(rp) + +L(ret): mov R32(%rbx), R32(%rax) + pop %r12 + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm new file mode 100644 index 0000000..66937d3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm @@ -0,0 +1,50 @@ +dnl AMD64 mpn_addlsh1_n, mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1), +dnl optimised for Pentium 4. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
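The mpn_add_n/mpn_sub_n loop above (pentium4/aors_n.asm) looks odd because it
never chains adc: carries are captured with setc, applied with a separate
add, and the rare extra ripple is handled by the out-of-line L(c0)..L(c3)
fix-ups.  That structure exists because flag-chained adc/sbb were notoriously
expensive on the Pentium 4 pipeline, as the "not on critical path"
annotations hint.  The computation itself is ordinary limb-wise addition (or
subtraction) with carry; a plain C reference for the add case (illustrative
only, names are ours):

#include <stdint.h>

uint64_t
add_n_ref (uint64_t *rp, const uint64_t *up, const uint64_t *vp, long n)
{
  uint64_t cy = 0;                  /* carry between limbs, 0 or 1 */

  for (long i = 0; i < n; i++)
    {
      uint64_t s = up[i] + cy;
      uint64_t c1 = s < cy;         /* carry from adding the incoming carry */
      uint64_t r = s + vp[i];
      uint64_t c2 = r < vp[i];      /* carry from adding vp[i] */
      rp[i] = r;
      cy = c1 | c2;                 /* the two carries cannot both be set */
    }
  return cy;
}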
+ +include(`../config.m4') + +define(LSH, 1) +define(RSH, 31) C 31, not 63, since we use 32-bit ops + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(func, mpn_addlsh1_n)') +ifdef(`OPERATION_sublsh1_n', ` + define(ADDSUB, sub) + define(func, mpn_sublsh1_n)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) +include_mpn(`x86_64/pentium4/aorslshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm new file mode 100644 index 0000000..001f0ac --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm @@ -0,0 +1,50 @@ +dnl AMD64 mpn_addlsh2_n, mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2), +dnl optimised for Pentium 4. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 30) C 30, not 62, since we use 32-bit ops + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(func, mpn_addlsh2_n)') +ifdef(`OPERATION_sublsh2_n', ` + define(ADDSUB, sub) + define(func, mpn_sublsh2_n)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n) +include_mpn(`x86_64/pentium4/aorslshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm new file mode 100644 index 0000000..d03c6a3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm @@ -0,0 +1,203 @@ +dnl AMD64 mpn_addlshC_n, mpn_sublshC_n -- rp[] = up[] +- (vp[] << C), where +dnl C is 1, 2, 3. Optimized for Pentium 4. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C AMD K8,K9 3.8 +C AMD K10 3.8 +C Intel P4 5.8 +C Intel core2 4.75 +C Intel corei 4.75 +C Intel atom ? +C VIA nano 4.75 + + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n', `%rcx') + +define(M, eval(m4_lshift(1,LSH))) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + push %r12 + push %rbp + + mov (vp), %r9 + shl $LSH, %r9 + mov 4(vp), R32(%rbp) + + xor R32(%rbx), R32(%rbx) + + mov R32(n), R32(%rax) + and $3, R32(%rax) + jne L(n00) C n = 0, 4, 8, ... + + mov (up), %r8 + mov 8(up), %r10 + shr $RSH, R32(%rbp) + ADDSUB %r9, %r8 + mov 8(vp), %r9 + lea (%rbp,%r9,M), %r9 + setc R8(%rax) + mov 12(vp), R32(%rbp) + lea -16(rp), rp + jmp L(L00) + +L(n00): cmp $2, R32(%rax) + jnc L(n01) C n = 1, 5, 9, ... + mov (up), %r11 + lea -8(rp), rp + shr $RSH, R32(%rbp) + ADDSUB %r9, %r11 + setc R8(%rbx) + dec n + jz L(1) C jump for n = 1 + mov 8(up), %r8 + mov 8(vp), %r9 + lea (%rbp,%r9,M), %r9 + mov 12(vp), R32(%rbp) + lea 8(up), up + lea 8(vp), vp + jmp L(L01) + +L(n01): jne L(n10) C n = 2, 6, 10, ... + mov (up), %r12 + mov 8(up), %r11 + shr $RSH, R32(%rbp) + ADDSUB %r9, %r12 + mov 8(vp), %r9 + lea (%rbp,%r9,M), %r9 + setc R8(%rax) + mov 12(vp), R32(%rbp) + lea 16(up), up + lea 16(vp), vp + jmp L(L10) + +L(n10): mov (up), %r10 + mov 8(up), %r12 + shr $RSH, R32(%rbp) + ADDSUB %r9, %r10 + mov 8(vp), %r9 + lea (%rbp,%r9,M), %r9 + setc R8(%rbx) + mov 12(vp), R32(%rbp) + lea -24(rp), rp + lea -8(up), up + lea -8(vp), vp + jmp L(L11) + +L(c0): mov $1, R8(%rbx) + jmp L(rc0) +L(c1): mov $1, R8(%rax) + jmp L(rc1) +L(c2): mov $1, R8(%rbx) + jmp L(rc2) + + ALIGN(16) +L(top): mov (up), %r8 C not on critical path + shr $RSH, R32(%rbp) + ADDSUB %r9, %r11 C not on critical path + mov (vp), %r9 + lea (%rbp,%r9,M), %r9 + setc R8(%rbx) C save carry out + mov 4(vp), R32(%rbp) + mov %r12, (rp) + ADDSUB %rax, %r11 C apply previous carry out + jc L(c0) C jump if ripple +L(rc0): +L(L01): mov 8(up), %r10 + shr $RSH, R32(%rbp) + ADDSUB %r9, %r8 + mov 8(vp), %r9 + lea (%rbp,%r9,M), %r9 + setc R8(%rax) + mov 12(vp), R32(%rbp) + mov %r11, 8(rp) + ADDSUB %rbx, %r8 + jc L(c1) +L(rc1): +L(L00): mov 16(up), %r12 + shr $RSH, R32(%rbp) + ADDSUB %r9, %r10 + mov 16(vp), %r9 + lea (%rbp,%r9,M), %r9 + setc R8(%rbx) + mov 20(vp), R32(%rbp) + mov %r8, 16(rp) + ADDSUB %rax, %r10 + jc L(c2) +L(rc2): +L(L11): mov 24(up), %r11 + shr $RSH, R32(%rbp) + ADDSUB %r9, %r12 + mov 24(vp), %r9 + lea (%rbp,%r9,M), %r9 + lea 32(up), up + lea 32(vp), vp + setc R8(%rax) + mov -4(vp), R32(%rbp) + mov %r10, 24(rp) + ADDSUB %rbx, %r12 + jc L(c3) +L(rc3): lea 32(rp), rp +L(L10): sub $4, n + ja L(top) + +L(end): + shr $RSH, R32(%rbp) + ADDSUB %r9, %r11 + setc R8(%rbx) + mov %r12, (rp) + ADDSUB %rax, %r11 + jnc L(1) + mov $1, R8(%rbx) +L(1): mov %r11, 8(rp) + lea (%rbx,%rbp), R32(%rax) + pop %rbp + pop %r12 + pop %rbx + FUNC_EXIT() + ret +L(c3): mov $1, R8(%rax) + jmp L(rc3) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm 
b/gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm new file mode 100644 index 0000000..e5dbb34 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) +include_mpn(`x86_64/bd1/aorsmul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h new file mode 100644 index 0000000..9c79310 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h @@ -0,0 +1,257 @@ +/* Pentium 4-64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* These routines exists for all x86_64 chips, but they are slower on Pentium4 + than separate add/sub and shift. Make sure they are not really used. 
*/ +#undef HAVE_NATIVE_mpn_rsblsh1_n +#undef HAVE_NATIVE_mpn_rsblsh2_n +#undef HAVE_NATIVE_mpn_addlsh_n +#undef HAVE_NATIVE_mpn_rsblsh_n + +/* 3400 MHz Pentium4 Nocona / 1024 Kibyte L2 cache */ +/* FFT tuning limit = 107,095,964 */ +/* Generated by tuneup.c, 2019-11-09, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 32 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 12 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +#define DIV_1_VS_MUL_1_PERCENT 228 + +#define MUL_TOOM22_THRESHOLD 12 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 130 +#define MUL_TOOM6H_THRESHOLD 173 +#define MUL_TOOM8H_THRESHOLD 430 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 112 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 113 +#define SQR_TOOM4_THRESHOLD 202 +#define SQR_TOOM6_THRESHOLD 238 +#define SQR_TOOM8_THRESHOLD 430 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 11 + +#define MUL_FFT_MODF_THRESHOLD 236 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 236, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 10, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \ + { 9, 7}, { 21, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 23, 8}, { 47, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 51,11}, { 15,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 83,10}, \ + { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255,10}, { 71, 9}, \ + { 143, 8}, { 287,10}, { 79,11}, { 47,10}, \ + { 95, 9}, { 191,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,11}, \ + { 79,10}, { 159, 9}, { 319,10}, { 175,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 223,12}, \ + { 63,11}, { 127,10}, { 255,11}, { 143,10}, \ + { 287,11}, { 159,10}, { 319,11}, { 175,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 223,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 287,10}, { 575,12}, { 159,11}, { 351,12}, \ + { 191,11}, { 383,12}, { 223,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 511,12}, { 287,11}, \ + { 575,10}, { 1151,12}, { 351,13}, { 191,12}, \ + { 415,11}, { 831,10}, { 1663,12}, { 447,14}, \ + { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,11}, { 1087,10}, { 2175,12}, { 575,11}, \ + { 1151,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,11}, { 1663,13}, \ + { 447,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,11}, { 2175,13}, { 575,12}, \ + { 1151,11}, { 2303,12}, { 1215,11}, { 2431,10}, \ + { 4863,13}, { 639,12}, { 1279,11}, { 2559,12}, \ + { 1343,13}, { 703,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,15}, { 255,14}, \ + { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \ + { 2175,13}, { 1151,12}, { 2303,13}, { 
1215,12}, \ + { 2431,11}, { 4863,14}, { 639,13}, { 1279,12}, \ + { 2559,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ + { 2815,13}, { 1471,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1791,12}, { 3583,13}, { 1919,12}, \ + { 3839,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1791,13}, { 3583,14}, \ + { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4351,14}, { 2303,13}, { 4607,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3199,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4351,15}, { 2303,14}, { 4863,15}, { 2815,14}, \ + { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 229 +#define MUL_FFT_THRESHOLD 2752 + +#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 240, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 13, 8}, \ + { 7, 7}, { 17, 8}, { 9, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 9}, { 7, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27,10}, { 7, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 47,11}, { 15,10}, \ + { 31, 9}, { 63,10}, { 39, 9}, { 79,10}, \ + { 55,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 71, 9}, { 143, 8}, { 287,10}, \ + { 79,11}, { 47,10}, { 95, 9}, { 191,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207, 9}, { 415,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 255,11}, \ + { 143,10}, { 287,11}, { 159,10}, { 319,11}, \ + { 175,10}, { 351,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,10}, { 415,11}, { 223,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 287,10}, { 575,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 767,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 511,12}, { 287,11}, { 575,10}, \ + { 1151,12}, { 319,11}, { 639,12}, { 351,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 447,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 671,11}, { 1343,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,13}, { 447,14}, \ + { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \ + { 1087,13}, { 575,12}, { 1151,11}, { 2303,12}, \ + { 1215,11}, { 2431,10}, { 4863,13}, { 639,12}, \ + { 1279,11}, { 2559,12}, { 1343,11}, { 2687,14}, \ + { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \ + { 1663,15}, { 255,14}, { 511,13}, { 1023,12}, \ + { 2047,13}, { 1087,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,12}, { 2431,11}, { 4863,14}, \ + { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \ + { 2687,13}, { 1407,12}, { 2815,13}, { 1471,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1791,12}, \ + { 3583,13}, { 1919,12}, { 3839,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2303,12}, \ + { 4607,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \ + { 1663,13}, { 3327,14}, { 1791,13}, { 3583,14}, \ + { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4351,14}, { 2303,13}, { 4607,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, 
{ 2815,13}, \ + { 5631,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3327,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4351,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3071,14}, { 6143,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 235 +#define SQR_FFT_THRESHOLD 2368 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 45 +#define MULLO_MUL_N_THRESHOLD 5397 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 46 +#define SQRLO_SQR_THRESHOLD 4658 + +#define DC_DIV_QR_THRESHOLD 36 +#define DC_DIVAPPR_Q_THRESHOLD 95 +#define DC_BDIV_QR_THRESHOLD 35 +#define DC_BDIV_Q_THRESHOLD 47 + +#define INV_MULMOD_BNM1_THRESHOLD 22 +#define INV_NEWTON_THRESHOLD 178 +#define INV_APPR_THRESHOLD 116 + +#define BINV_NEWTON_THRESHOLD 206 +#define REDC_1_TO_REDC_2_THRESHOLD 24 +#define REDC_2_TO_REDC_N_THRESHOLD 50 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 97 +#define MU_BDIV_QR_THRESHOLD 762 +#define MU_BDIV_Q_THRESHOLD 942 + +#define POWM_SEC_TABLE 7,34,114,523,1486 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1659 + +#define FAC_DSC_THRESHOLD 969 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 29 +#define HGCD2_DIV1_METHOD 3 /* 2.03% faster than 5 */ +#define HGCD_THRESHOLD 92 +#define HGCD_APPR_THRESHOLD 95 +#define HGCD_REDUCE_THRESHOLD 1815 +#define GCD_DC_THRESHOLD 195 +#define GCDEXT_DC_THRESHOLD 233 +#define JACOBI_BASE_METHOD 4 /* 17.06% faster than 1 */ + +/* Tuneup completed successfully, took 297016 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm b/gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm new file mode 100644 index 0000000..4037be4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for Pentium 4. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86_64/fastsse/lshift.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm new file mode 100644 index 0000000..52856c1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshiftc optimised for Pentium 4. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshiftc) +include_mpn(`x86_64/fastsse/lshiftc.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm new file mode 100644 index 0000000..f34b3f0 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm @@ -0,0 +1,167 @@ +dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1. + +dnl Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 1.0 +C AMD K10 1.12 +C Intel P4 3.25 +C Intel core2 1.5 +C Intel corei 1.5 +C Intel atom 2.5 +C VIA nano 1.75 + + +C INPUT PARAMETERS +define(`ap', %rdi) +define(`n', %rsi) + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) + +C TODO +C * Review feed-in and wind-down code. In particular, try to avoid adc and +C sbb to placate Pentium4. 
+C * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling, +C without the dual loop exits. + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mod_34lsub1) + FUNC_ENTRY(2) + + mov $0x0000FFFFFFFFFFFF, %r11 + + sub $2, %rsi + ja L(gt2) + + mov (ap), %rax + nop + jb L(1) + + mov 8(ap), %rsi + mov %rax, %rdx + shr $48, %rax C src[0] low + + and %r11, %rdx C src[0] high + add %rdx, %rax + mov R32(%rsi), R32(%rdx) + + shr $32, %rsi C src[1] high + add %rsi, %rax + + shl $16, %rdx C src[1] low + add %rdx, %rax + +L(1): FUNC_EXIT() + ret + + + ALIGN(16) +L(gt2): xor R32(%rax), R32(%rax) + xor R32(%rcx), R32(%rcx) + xor R32(%rdx), R32(%rdx) + xor %r8, %r8 + xor %r9, %r9 + xor %r10, %r10 + +L(top): add (ap), %rax + adc $0, %r10 + add 8(ap), %rcx + adc $0, %r8 + add 16(ap), %rdx + adc $0, %r9 + + sub $3, %rsi + jng L(end) + + add 24(ap), %rax + adc $0, %r10 + add 32(ap), %rcx + adc $0, %r8 + add 40(ap), %rdx + lea 48(ap), ap + adc $0, %r9 + + sub $3, %rsi + jg L(top) + + + add $-24, ap +L(end): add %r9, %rax + adc %r10, %rcx + adc %r8, %rdx + + inc %rsi + mov $0x1, R32(%r10) + js L(combine) + + mov $0x10000, R32(%r10) + adc 24(ap), %rax + dec %rsi + js L(combine) + + adc 32(ap), %rcx + mov $0x100000000, %r10 + +L(combine): + sbb %rsi, %rsi C carry + mov %rax, %rdi C 0mod3 + shr $48, %rax C 0mod3 high + + and %r10, %rsi C carry masked + and %r11, %rdi C 0mod3 low + mov R32(%rcx), R32(%r10) C 1mod3 + + add %rsi, %rax C apply carry + shr $32, %rcx C 1mod3 high + + add %rdi, %rax C apply 0mod3 low + movzwl %dx, R32(%rdi) C 2mod3 + shl $16, %r10 C 1mod3 low + + add %rcx, %rax C apply 1mod3 high + shr $16, %rdx C 2mod3 high + + add %r10, %rax C apply 1mod3 low + shl $32, %rdi C 2mod3 low + + add %rdx, %rax C apply 2mod3 high + add %rdi, %rax C apply 2mod3 low + + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm new file mode 100644 index 0000000..70de670 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_1 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c) +include_mpn(`x86_64/bd1/mul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm new file mode 100644 index 0000000..a0f7302 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_2 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_2) +include_mpn(`x86_64/bd1/mul_2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm new file mode 100644 index 0000000..fb16029 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_basecase optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_basecase) +include_mpn(`x86_64/core2/mul_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm new file mode 100644 index 0000000..b9e08a8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mullo_basecase optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mullo_basecase) +include_mpn(`x86_64/core2/mullo_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm b/gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm new file mode 100644 index 0000000..7014b39 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm @@ -0,0 +1,35 @@ +dnl x86-64 mpn_popcount optimized for Pentium 4. + +dnl Copyright 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86/pentium4/sse2/popcount.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm b/gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm new file mode 100644 index 0000000..00e380d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_redc_1) +include_mpn(`x86_64/bt1/redc_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm new file mode 100644 index 0000000..5528ce4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm @@ -0,0 +1,334 @@ +dnl x86-64 mpn_rsh1add_n/mpn_rsh1sub_n optimized for Pentium 4. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 4.13 +C AMD K10 4.13 +C Intel P4 5.70 +C Intel core2 4.75 +C Intel corei 5 +C Intel atom 8.75 +C VIA nano 5.25 + +C TODO +C * Try to make this smaller, 746 bytes seem excessive for this 2nd class +C function. Less sw pipelining would help, and since we now probably +C pipeline somewhat too deeply, it might not affect performance too much. +C * A separate small-n loop might speed things as well as make things smaller. +C That loop should be selected before pushing registers. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ifdef(`OPERATION_rsh1add_n', ` + define(ADDSUB, add) + define(func, mpn_rsh1add_n) + define(func_nc, mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(ADDSUB, sub) + define(func, mpn_rsh1sub_n) + define(func_nc, mpn_rsh1sub_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc) + +ASM_START() + TEXT +PROLOGUE(func) + FUNC_ENTRY(4) + xor %r8, %r8 +IFDOS(` jmp L(ent) ') +EPILOGUE() +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +L(ent): push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + mov (vp), %r9 + mov (up), %r15 + + mov R32(n), R32(%rax) + and $3, R32(%rax) + jne L(n00) + + mov R32(%r8), R32(%rbx) C n = 0, 4, 8, ... 
+ mov 8(up), %r10 + ADDSUB %r9, %r15 + mov 8(vp), %r9 + setc R8(%rax) + ADDSUB %rbx, %r15 C return bit + jnc 1f + mov $1, R8(%rax) +1: mov 16(up), %r12 + ADDSUB %r9, %r10 + mov 16(vp), %r9 + setc R8(%rbx) + mov %r15, %r13 + ADDSUB %rax, %r10 + jnc 1f + mov $1, R8(%rbx) +1: mov 24(up), %r11 + ADDSUB %r9, %r12 + lea 32(up), up + mov 24(vp), %r9 + lea 32(vp), vp + setc R8(%rax) + mov %r10, %r14 + shl $63, %r10 + shr %r13 + jmp L(L00) + +L(n00): cmp $2, R32(%rax) + jnc L(n01) + xor R32(%rbx), R32(%rbx) C n = 1, 5, 9, ... + lea -24(rp), rp + mov R32(%r8), R32(%rax) + dec n + jnz L(gt1) + ADDSUB %r9, %r15 + setc R8(%rbx) + ADDSUB %rax, %r15 + jnc 1f + mov $1, R8(%rbx) +1: mov %r15, %r14 + shl $63, %rbx + shr %r14 + jmp L(cj1) +L(gt1): mov 8(up), %r8 + ADDSUB %r9, %r15 + mov 8(vp), %r9 + setc R8(%rbx) + ADDSUB %rax, %r15 + jnc 1f + mov $1, R8(%rbx) +1: mov 16(up), %r10 + ADDSUB %r9, %r8 + mov 16(vp), %r9 + setc R8(%rax) + mov %r15, %r14 + ADDSUB %rbx, %r8 + jnc 1f + mov $1, R8(%rax) +1: mov 24(up), %r12 + ADDSUB %r9, %r10 + mov 24(vp), %r9 + setc R8(%rbx) + mov %r8, %r13 + shl $63, %r8 + shr %r14 + lea 8(up), up + lea 8(vp), vp + jmp L(L01) + +L(n01): jne L(n10) + lea -16(rp), rp C n = 2, 6, 10, ... + mov R32(%r8), R32(%rbx) + mov 8(up), %r11 + ADDSUB %r9, %r15 + mov 8(vp), %r9 + setc R8(%rax) + ADDSUB %rbx, %r15 + jnc 1f + mov $1, R8(%rax) +1: sub $2, n + jnz L(gt2) + ADDSUB %r9, %r11 + setc R8(%rbx) + mov %r15, %r13 + ADDSUB %rax, %r11 + jnc 1f + mov $1, R8(%rbx) +1: mov %r11, %r14 + shl $63, %r11 + shr %r13 + jmp L(cj2) +L(gt2): mov 16(up), %r8 + ADDSUB %r9, %r11 + mov 16(vp), %r9 + setc R8(%rbx) + mov %r15, %r13 + ADDSUB %rax, %r11 + jnc 1f + mov $1, R8(%rbx) +1: mov 24(up), %r10 + ADDSUB %r9, %r8 + mov 24(vp), %r9 + setc R8(%rax) + mov %r11, %r14 + shl $63, %r11 + shr %r13 + lea 16(up), up + lea 16(vp), vp + jmp L(L10) + +L(n10): xor R32(%rbx), R32(%rbx) C n = 3, 7, 11, ... 
+ lea -8(rp), rp + mov R32(%r8), R32(%rax) + mov 8(up), %r12 + ADDSUB %r9, %r15 + mov 8(vp), %r9 + setc R8(%rbx) + ADDSUB %rax, %r15 + jnc 1f + mov $1, R8(%rbx) +1: mov 16(up), %r11 + ADDSUB %r9, %r12 + mov 16(vp), %r9 + setc R8(%rax) + mov %r15, %r14 + ADDSUB %rbx, %r12 + jnc 1f + mov $1, R8(%rax) +1: sub $3, n + jnz L(gt3) + ADDSUB %r9, %r11 + setc R8(%rbx) + mov %r12, %r13 + shl $63, %r12 + shr %r14 + jmp L(cj3) +L(gt3): mov 24(up), %r8 + ADDSUB %r9, %r11 + mov 24(vp), %r9 + setc R8(%rbx) + mov %r12, %r13 + shl $63, %r12 + shr %r14 + lea 24(up), up + lea 24(vp), vp + jmp L(L11) + +L(c0): mov $1, R8(%rbx) + jmp L(rc0) +L(c1): mov $1, R8(%rax) + jmp L(rc1) +L(c2): mov $1, R8(%rbx) + jmp L(rc2) + + ALIGN(16) +L(top): mov (up), %r8 C not on critical path + or %r13, %r10 + ADDSUB %r9, %r11 C not on critical path + mov (vp), %r9 C not on critical path + setc R8(%rbx) C save carry out + mov %r12, %r13 C new for later + shl $63, %r12 C shift new right + shr %r14 C shift old left + mov %r10, (rp) +L(L11): ADDSUB %rax, %r11 C apply previous carry out + jc L(c0) C jump if ripple +L(rc0): mov 8(up), %r10 + or %r14, %r12 + ADDSUB %r9, %r8 + mov 8(vp), %r9 + setc R8(%rax) + mov %r11, %r14 + shl $63, %r11 + shr %r13 + mov %r12, 8(rp) +L(L10): ADDSUB %rbx, %r8 + jc L(c1) +L(rc1): mov 16(up), %r12 + or %r13, %r11 + ADDSUB %r9, %r10 + mov 16(vp), %r9 + setc R8(%rbx) + mov %r8, %r13 + shl $63, %r8 + shr %r14 + mov %r11, 16(rp) +L(L01): ADDSUB %rax, %r10 + jc L(c2) +L(rc2): mov 24(up), %r11 + or %r14, %r8 + ADDSUB %r9, %r12 + lea 32(up), up + mov 24(vp), %r9 + lea 32(vp), vp + setc R8(%rax) + mov %r10, %r14 + shl $63, %r10 + shr %r13 + mov %r8, 24(rp) + lea 32(rp), rp +L(L00): ADDSUB %rbx, %r12 + jc L(c3) +L(rc3): sub $4, n + ja L(top) + +L(end): or %r13, %r10 + ADDSUB %r9, %r11 + setc R8(%rbx) + mov %r12, %r13 + shl $63, %r12 + shr %r14 + mov %r10, (rp) +L(cj3): ADDSUB %rax, %r11 + jnc 1f + mov $1, R8(%rbx) +1: or %r14, %r12 + mov %r11, %r14 + shl $63, %r11 + shr %r13 + mov %r12, 8(rp) +L(cj2): or %r13, %r11 + shl $63, %rbx + shr %r14 + mov %r11, 16(rp) +L(cj1): or %r14, %rbx + mov %rbx, 24(rp) + + mov R32(%r15), R32(%rax) + and $1, R32(%rax) + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + FUNC_EXIT() + ret +L(c3): mov $1, R8(%rax) + jmp L(rc3) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm b/gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm new file mode 100644 index 0000000..b7c1ee2 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm @@ -0,0 +1,169 @@ +dnl x86-64 mpn_rshift optimized for Pentium 4. + +dnl Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 2.5 +C AMD K10 ? +C Intel P4 3.29 +C Intel core2 2.1 (fluctuates, presumably cache related) +C Intel corei ? +C Intel atom 14.3 +C VIA nano ? + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`n',`%rdx') +define(`cnt',`%cl') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + mov (up), %rax + movd R32(%rcx), %mm4 + neg R32(%rcx) C put lsh count in cl + and $63, R32(%rcx) + movd R32(%rcx), %mm5 + + lea -8(up,n,8), up + lea -8(rp,n,8), rp + lea 1(n), R32(%r8) + neg n + + shl R8(%rcx), %rax C function return value + + and $3, R32(%r8) + je L(rol) C jump for n = 3, 7, 11, ... + + dec R32(%r8) + jne L(1) +C n = 4, 8, 12, ... + movq 8(up,n,8), %mm2 + psrlq %mm4, %mm2 + movq 16(up,n,8), %mm0 + psllq %mm5, %mm0 + por %mm0, %mm2 + movq %mm2, 8(rp,n,8) + inc n + jmp L(rol) + +L(1): dec R32(%r8) + je L(1x) C jump for n = 1, 5, 9, 13, ... +C n = 2, 6, 10, 16, ... + movq 8(up,n,8), %mm2 + psrlq %mm4, %mm2 + movq 16(up,n,8), %mm0 + psllq %mm5, %mm0 + por %mm0, %mm2 + movq %mm2, 8(rp,n,8) + inc n +L(1x): + cmp $-1, n + je L(ast) + movq 8(up,n,8), %mm2 + psrlq %mm4, %mm2 + movq 16(up,n,8), %mm3 + psrlq %mm4, %mm3 + movq 16(up,n,8), %mm0 + movq 24(up,n,8), %mm1 + psllq %mm5, %mm0 + por %mm0, %mm2 + psllq %mm5, %mm1 + por %mm1, %mm3 + movq %mm2, 8(rp,n,8) + movq %mm3, 16(rp,n,8) + add $2, n + +L(rol): movq 8(up,n,8), %mm2 + psrlq %mm4, %mm2 + movq 16(up,n,8), %mm3 + psrlq %mm4, %mm3 + + add $4, n C 4 + jb L(end) C 2 + ALIGN(32) +L(top): + C finish stuff from lsh block + movq -16(up,n,8), %mm0 + movq -8(up,n,8), %mm1 + psllq %mm5, %mm0 + por %mm0, %mm2 + psllq %mm5, %mm1 + movq (up,n,8), %mm0 + por %mm1, %mm3 + movq 8(up,n,8), %mm1 + movq %mm2, -24(rp,n,8) + movq %mm3, -16(rp,n,8) + C start two new rsh + psllq %mm5, %mm0 + psllq %mm5, %mm1 + + C finish stuff from rsh block + movq -8(up,n,8), %mm2 + movq (up,n,8), %mm3 + psrlq %mm4, %mm2 + por %mm2, %mm0 + psrlq %mm4, %mm3 + movq 8(up,n,8), %mm2 + por %mm3, %mm1 + movq 16(up,n,8), %mm3 + movq %mm0, -8(rp,n,8) + movq %mm1, (rp,n,8) + C start two new lsh + add $4, n + psrlq %mm4, %mm2 + psrlq %mm4, %mm3 + + jae L(top) C 2 +L(end): + movq -8(up), %mm0 + psllq %mm5, %mm0 + por %mm0, %mm2 + movq (up), %mm1 + psllq %mm5, %mm1 + por %mm1, %mm3 + movq %mm2, -16(rp) + movq %mm3, -8(rp) + +L(ast): movq (up), %mm2 + psrlq %mm4, %mm2 + movq %mm2, (rp) + emms + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm new file mode 100644 index 0000000..e436034 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm new file mode 100644 index 0000000..9725287 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sqr_basecase optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sqr_basecase) +include_mpn(`x86_64/core2/sqr_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/popham.asm b/gmp-6.3.0/mpn/x86_64/popham.asm new file mode 100644 index 0000000..3a29b2e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/popham.asm @@ -0,0 +1,163 @@ +dnl AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance. + +dnl Copyright 2004, 2005, 2007, 2010-2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + + +C popcount hamdist +C cycles/limb cycles/limb +C AMD K8,K9 6 7 +C AMD K10 6 7 +C Intel P4 12 14.3 +C Intel core2 7 8 +C Intel corei ? 7.3 +C Intel atom 16.5 17.5 +C VIA nano 8.75 10.4 + +C TODO +C * Tune. It should be possible to reach 5 c/l for popcount and 6 c/l for +C hamdist for K8/K9. + + +ifdef(`OPERATION_popcount',` + define(`func',`mpn_popcount') + define(`up', `%rdi') + define(`n', `%rsi') + define(`h55555555', `%r10') + define(`h33333333', `%r11') + define(`h0f0f0f0f', `%rcx') + define(`h01010101', `%rdx') + define(`POP', `$1') + define(`HAM', `dnl') +') +ifdef(`OPERATION_hamdist',` + define(`func',`mpn_hamdist') + define(`up', `%rdi') + define(`vp', `%rsi') + define(`n', `%rdx') + define(`h55555555', `%r10') + define(`h33333333', `%r11') + define(`h0f0f0f0f', `%rcx') + define(`h01010101', `%r12') + define(`POP', `dnl') + define(`HAM', `$1') +') + + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) + POP(` FUNC_ENTRY(2) ') + HAM(` FUNC_ENTRY(3) ') + push %rbx + mov $0x5555555555555555, h55555555 + push %rbp + mov $0x3333333333333333, h33333333 + HAM(` push %r12 ') + lea (up,n,8), up + mov $0x0f0f0f0f0f0f0f0f, h0f0f0f0f + HAM(` lea (vp,n,8), vp ') + neg n + mov $0x0101010101010101, h01010101 + xor R32(%rax), R32(%rax) + test $1, R8(n) + jz L(top) + + mov (up,n,8), %r8 + HAM(` xor (vp,n,8), %r8 ') + + mov %r8, %r9 + shr %r8 + and h55555555, %r8 + sub %r8, %r9 + + mov %r9, %r8 + shr $2, %r9 + and h33333333, %r8 + and h33333333, %r9 + add %r8, %r9 C 16 4-bit fields (0..4) + + dec n + jmp L(mid) + + ALIGN(16) +L(top): mov (up,n,8), %r8 + mov 8(up,n,8), %rbx + HAM(` xor (vp,n,8), %r8 ') + HAM(` xor 8(vp,n,8), %rbx ') + + mov %r8, %r9 + mov %rbx, %rbp + shr %r8 + shr %rbx + and h55555555, %r8 + and h55555555, %rbx + sub %r8, %r9 + sub %rbx, %rbp + + mov %r9, %r8 + mov %rbp, %rbx + shr $2, %r9 + shr $2, %rbp + and h33333333, %r8 + and h33333333, %r9 + and h33333333, %rbx + and h33333333, %rbp + add %r8, %r9 C 16 4-bit fields (0..4) + add %rbx, %rbp C 16 4-bit fields (0..4) + + add %rbp, %r9 C 16 4-bit fields (0..8) +L(mid): mov %r9, %r8 + shr $4, %r9 + and h0f0f0f0f, %r8 + and h0f0f0f0f, %r9 + add %r8, %r9 C 8 8-bit fields (0..16) + + imul h01010101, %r9 C sum the 8 fields in high 8 bits + shr $56, %r9 + + add %r9, %rax C add to total + add $2, n + jnc L(top) + +L(end): + HAM(` pop %r12 ') + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm new file mode 100644 index 0000000..a3e9cc5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm @@ -0,0 +1,189 @@ +dnl AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1 +dnl AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1 + +dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.14 (mpn_add_n + mpn_rshift need 4.125) +C AMD K10 2.14 (mpn_add_n + mpn_rshift need 4.125) +C Intel P4 12.75 +C Intel core2 3.75 +C Intel NMH 4.4 +C Intel SBR ? +C Intel atom ? +C VIA nano 3.25 + +C TODO +C * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n',` %rcx') + +ifdef(`OPERATION_rsh1add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_rsh1add_n) + define(func_nc, mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsh1sub_n) + define(func_nc, mpn_rsh1sub_nc)') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + + xor R32(%rax), R32(%rax) + neg %r8 C set C flag from parameter + mov (up), %rbx + ADCSBB (vp), %rbx + jmp L(ent) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + + xor R32(%rax), R32(%rax) + mov (up), %rbx + ADDSUB (vp), %rbx +L(ent): + rcr %rbx C rotate, save acy + adc R32(%rax), R32(%rax) C return value + + mov R32(n), R32(%r11) + and $3, R32(%r11) + + cmp $1, R32(%r11) + je L(do) C jump if n = 1 5 9 ... + +L(n1): cmp $2, R32(%r11) + jne L(n2) C jump unless n = 2 6 10 ... + add %rbx, %rbx C rotate carry limb, restore acy + mov 8(up), %r10 + ADCSBB 8(vp), %r10 + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + rcr %r10 + rcr %rbx + mov %rbx, -8(rp) + jmp L(cj1) + +L(n2): cmp $3, R32(%r11) + jne L(n3) C jump unless n = 3 7 11 ... + add %rbx, %rbx C rotate carry limb, restore acy + mov 8(up), %r9 + mov 16(up), %r10 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + rcr %r10 + rcr %r9 + rcr %rbx + mov %rbx, -16(rp) + jmp L(cj2) + +L(n3): dec n C come here for n = 4 8 12 ... 
+ add %rbx, %rbx C rotate carry limb, restore acy + mov 8(up), %r8 + mov 16(up), %r9 + ADCSBB 8(vp), %r8 + ADCSBB 16(vp), %r9 + mov 24(up), %r10 + ADCSBB 24(vp), %r10 + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + rcr %r10 + rcr %r9 + rcr %r8 + rcr %rbx + mov %rbx, -24(rp) + mov %r8, -16(rp) +L(cj2): mov %r9, -8(rp) +L(cj1): mov %r10, %rbx + +L(do): + shr $2, n C 4 + je L(end) C 2 + ALIGN(16) +L(top): add %rbx, %rbx C rotate carry limb, restore acy + + mov 8(up), %r8 + mov 16(up), %r9 + ADCSBB 8(vp), %r8 + ADCSBB 16(vp), %r9 + mov 24(up), %r10 + mov 32(up), %r11 + ADCSBB 24(vp), %r10 + ADCSBB 32(vp), %r11 + + lea 32(up), up + lea 32(vp), vp + + rcr %r11 C rotate, save acy + rcr %r10 + rcr %r9 + rcr %r8 + + rcr %rbx + mov %rbx, (rp) + mov %r8, 8(rp) + mov %r9, 16(rp) + mov %r10, 24(rp) + mov %r11, %rbx + + lea 32(rp), rp + dec n + jne L(top) + +L(end): mov %rbx, (rp) + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/rshift.asm b/gmp-6.3.0/mpn/x86_64/rshift.asm new file mode 100644 index 0000000..3f344f1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/rshift.asm @@ -0,0 +1,176 @@ +dnl AMD64 mpn_rshift -- mpn right shift. + +dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 2.375 +C AMD K10 2.375 +C Intel P4 8 +C Intel core2 2.11 +C Intel corei ? +C Intel atom 5.75 +C VIA nano 3.5 + + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + neg R32(%rcx) C put rsh count in cl + mov (up), %rax + shl R8(%rcx), %rax C function return value + neg R32(%rcx) C put lsh count in cl + + lea 1(n), R32(%r8) + + lea -8(up,n,8), up + lea -8(rp,n,8), rp + neg n + + and $3, R32(%r8) + je L(rlx) C jump for n = 3, 7, 11, ... + + dec R32(%r8) + jne L(1) +C n = 4, 8, 12, ... + mov 8(up,n,8), %r10 + shr R8(%rcx), %r10 + neg R32(%rcx) C put rsh count in cl + mov 16(up,n,8), %r8 + shl R8(%rcx), %r8 + or %r8, %r10 + mov %r10, 8(rp,n,8) + inc n + jmp L(rll) + +L(1): dec R32(%r8) + je L(1x) C jump for n = 1, 5, 9, 13, ... +C n = 2, 6, 10, 16, ... 
+ mov 8(up,n,8), %r10 + shr R8(%rcx), %r10 + neg R32(%rcx) C put rsh count in cl + mov 16(up,n,8), %r8 + shl R8(%rcx), %r8 + or %r8, %r10 + mov %r10, 8(rp,n,8) + inc n + neg R32(%rcx) C put lsh count in cl +L(1x): + cmp $-1, n + je L(ast) + mov 8(up,n,8), %r10 + shr R8(%rcx), %r10 + mov 16(up,n,8), %r11 + shr R8(%rcx), %r11 + neg R32(%rcx) C put rsh count in cl + mov 16(up,n,8), %r8 + mov 24(up,n,8), %r9 + shl R8(%rcx), %r8 + or %r8, %r10 + shl R8(%rcx), %r9 + or %r9, %r11 + mov %r10, 8(rp,n,8) + mov %r11, 16(rp,n,8) + add $2, n + +L(rll): neg R32(%rcx) C put lsh count in cl +L(rlx): mov 8(up,n,8), %r10 + shr R8(%rcx), %r10 + mov 16(up,n,8), %r11 + shr R8(%rcx), %r11 + + add $4, n C 4 + jb L(end) C 2 + ALIGN(16) +L(top): + C finish stuff from lsh block + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + mov -8(up,n,8), %r9 + shl R8(%rcx), %r8 + or %r8, %r10 + shl R8(%rcx), %r9 + or %r9, %r11 + mov %r10, -24(rp,n,8) + mov %r11, -16(rp,n,8) + C start two new rsh + mov (up,n,8), %r8 + mov 8(up,n,8), %r9 + shl R8(%rcx), %r8 + shl R8(%rcx), %r9 + + C finish stuff from rsh block + neg R32(%rcx) C put lsh count in cl + mov -8(up,n,8), %r10 + mov 0(up,n,8), %r11 + shr R8(%rcx), %r10 + or %r10, %r8 + shr R8(%rcx), %r11 + or %r11, %r9 + mov %r8, -8(rp,n,8) + mov %r9, 0(rp,n,8) + C start two new lsh + mov 8(up,n,8), %r10 + mov 16(up,n,8), %r11 + shr R8(%rcx), %r10 + shr R8(%rcx), %r11 + + add $4, n + jae L(top) C 2 +L(end): + neg R32(%rcx) C put rsh count in cl + mov -8(up), %r8 + shl R8(%rcx), %r8 + or %r8, %r10 + mov (up), %r9 + shl R8(%rcx), %r9 + or %r9, %r11 + mov %r10, -16(rp) + mov %r11, -8(rp) + + neg R32(%rcx) C put lsh count in cl +L(ast): mov (up), %r10 + shr R8(%rcx), %r10 + mov %r10, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/sec_tabselect.asm new file mode 100644 index 0000000..e8aed26 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/sec_tabselect.asm @@ -0,0 +1,176 @@ +dnl AMD64 mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb good for cpu +C AMD K8,K9 1.5 Y +C AMD K10 1.4 +C AMD bd1 2.64 +C AMD bobcat 2.15 Y +C Intel P4 4 +C Intel core2 1.38 +C Intel NHM 1.75 +C Intel SBR 1.25 +C Intel atom 2.5 Y +C VIA nano 1.75 Y + +C NOTES +C * This has not been tuned for any specific processor. Its speed should not +C be too bad, though. 
+C * Using SSE2/AVX2 could result in many-fold speedup. +C * WORKS FOR n mod 4 = 0 ONLY! + +C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which) +define(`rp', `%rdi') +define(`tp', `%rsi') +define(`n', `%rdx') +define(`nents', `%rcx') +define(`which', `%r8') + +define(`i', `%rbp') +define(`j', `%r9') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C nents n rp tab i which j * * * * * * + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sec_tabselect) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov n, j + add $-4, j + js L(outer_end) + +L(outer_top): + mov nents, i + push tp + xor R32(%r12), R32(%r12) + xor R32(%r13), R32(%r13) + xor R32(%r14), R32(%r14) + xor R32(%r15), R32(%r15) + mov which, %rbx + + ALIGN(16) +L(top): sub $1, %rbx + sbb %rax, %rax + mov 0(tp), %r10 + mov 8(tp), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r12 + or %r11, %r13 + mov 16(tp), %r10 + mov 24(tp), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r14 + or %r11, %r15 + lea (tp,n,8), tp + add $-1, i + jne L(top) + + mov %r12, 0(rp) + mov %r13, 8(rp) + mov %r14, 16(rp) + mov %r15, 24(rp) + pop tp + lea 32(tp), tp + lea 32(rp), rp + add $-4, j + jns L(outer_top) +L(outer_end): + + test $2, R8(n) + jz L(b0x) +L(b1x): mov nents, i + push tp + xor R32(%r12), R32(%r12) + xor R32(%r13), R32(%r13) + mov which, %rbx + ALIGN(16) +L(tp2): sub $1, %rbx + sbb %rax, %rax + mov 0(tp), %r10 + mov 8(tp), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r12 + or %r11, %r13 + lea (tp,n,8), tp + add $-1, i + jne L(tp2) + mov %r12, 0(rp) + mov %r13, 8(rp) + pop tp + lea 16(tp), tp + lea 16(rp), rp + +L(b0x): test $1, R8(n) + jz L(b00) +L(b01): mov nents, i + xor R32(%r12), R32(%r12) + mov which, %rbx + ALIGN(16) +L(tp1): sub $1, %rbx + sbb %rax, %rax + mov 0(tp), %r10 + and %rax, %r10 + or %r10, %r12 + lea (tp,n,8), tp + add $-1, i + jne L(tp1) + mov %r12, 0(rp) + +L(b00): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm new file mode 100644 index 0000000..98c26cf --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm @@ -0,0 +1,50 @@ +dnl X86-64 mpn_addlsh1_n/mpn_rsblsh1_n optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh1_n)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh1_n)') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm new file mode 100644 index 0000000..2a83217 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm @@ -0,0 +1,50 @@ +dnl X86-64 mpn_addlsh2_n/mpn_rsblsh2_n optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh2_n)') +ifdef(`OPERATION_rsblsh2_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh2_n)') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm b/gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm new file mode 100644 index 0000000..dce3d75 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +include_mpn(`x86_64/coreisbr/aors_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm new file mode 100644 index 0000000..ead0d76 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addmul_1/mpn_submul_1 optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) +include_mpn(`x86_64/core2/aorsmul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h new file mode 100644 index 0000000..f8cb0f4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h @@ -0,0 +1,252 @@ +/* Intel Silvermont gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 2400 MHz Intel Atom C2758 Silvermont/Rangeley */ +/* FFT tuning limit = 468153400 */ +/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 55 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define DIV_1_VS_MUL_1_PERCENT 168 + +#define MUL_TOOM22_THRESHOLD 19 +#define MUL_TOOM33_THRESHOLD 66 +#define MUL_TOOM44_THRESHOLD 152 +#define MUL_TOOM6H_THRESHOLD 222 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 88 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 232 +#define SQR_TOOM6_THRESHOLD 286 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 24 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 340, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \ + { 23, 7}, { 17, 8}, { 9, 7}, { 21, 8}, \ + { 11, 7}, { 23, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 135,11}, { 79, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207, 9}, { 415,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703, 9}, { 1407,12}, { 191,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 575,10}, \ + { 1151,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,10}, { 1407,13}, { 191,12}, { 415,11}, \ + { 831,10}, { 1663,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 543,11}, { 1087,10}, { 2175,12}, \ + { 575,11}, { 1151,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \ + { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \ + { 639,12}, { 1279,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1215,12}, { 2431,11}, { 4863,14}, { 639,13}, \ + { 1407,12}, { 2815,13}, { 1471,12}, { 2943,11}, \ + { 5887,14}, { 767,13}, { 1663,14}, { 895,13}, \ + { 1919,15}, { 
511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2559,14}, { 1407,13}, { 2943,12}, { 5887,15}, \ + { 767,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \ + { 11775,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \ + { 2047,14}, { 4223,15}, { 2303,14}, { 4863,15}, \ + { 2815,14}, { 5887,13}, { 11775,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4607,15}, { 9215,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 225 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 17, 8}, \ + { 9, 7}, { 21, 8}, { 11, 7}, { 23, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 79,11}, { 47,10}, \ + { 95,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 255, 8}, { 511, 9}, { 271, 8}, { 543,11}, \ + { 79,10}, { 159, 9}, { 319, 8}, { 639,10}, \ + { 175,11}, { 95,10}, { 191, 9}, { 383,10}, \ + { 207, 9}, { 415,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,12}, \ + { 223,11}, { 479,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 575,12}, { 319,11}, { 639,12}, \ + { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 479,13}, { 255,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1279,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 831,12}, { 1663,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,12}, { 2943,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2943,15}, \ + { 767,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1791,13}, { 3583,16}, { 511,15}, { 1023,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3071,14}, { 6143,15}, { 3327,14}, \ + { 6911,15}, { 3839,14}, { 7679,17}, { 
1023,16}, \ + { 2047,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,17}, { 2047,16}, { 4607,15}, \ + { 9983,16}, { 5631,15}, { 11775,17}, { 3071,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 232 +#define SQR_FFT_THRESHOLD 2752 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 55 +#define MULLO_MUL_N_THRESHOLD 6633 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 5397 + +#define DC_DIV_QR_THRESHOLD 33 +#define DC_DIVAPPR_Q_THRESHOLD 222 +#define DC_BDIV_QR_THRESHOLD 31 +#define DC_BDIV_Q_THRESHOLD 147 + +#define INV_MULMOD_BNM1_THRESHOLD 37 +#define INV_NEWTON_THRESHOLD 222 +#define INV_APPR_THRESHOLD 222 + +#define BINV_NEWTON_THRESHOLD 212 +#define REDC_1_TO_REDC_2_THRESHOLD 55 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 81 +#define MU_BDIV_QR_THRESHOLD 942 +#define MU_BDIV_Q_THRESHOLD 1043 + +#define POWM_SEC_TABLE 1,34,102,588,1730 + +#define GET_STR_DC_THRESHOLD 17 +#define GET_STR_PRECOMPUTE_THRESHOLD 30 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1659 + +#define FAC_DSC_THRESHOLD 351 +#define FAC_ODD_THRESHOLD 27 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD2_DIV1_METHOD 3 /* 3.06% faster than 1 */ +#define HGCD_THRESHOLD 120 +#define HGCD_APPR_THRESHOLD 153 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 416 +#define GCDEXT_DC_THRESHOLD 309 +#define JACOBI_BASE_METHOD 1 /* 2.28% faster than 3 */ + +/* Tuneup completed successfully, took 938046 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm b/gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm new file mode 100644 index 0000000..848ed01 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm @@ -0,0 +1,38 @@ +dnl x86-64 mpn_hamdist. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
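Editor's note: the wrapper below simply maps mpn_hamdist onto the coreinhm (Nehalem) code. As a reading aid, here is a minimal C reference of what mpn_hamdist computes, namely the population count of the XOR of two n-limb operands. ref_hamdist and the GCC builtin are illustrative only, not GMP's tuned code.

    #include <gmp.h>

    /* Number of bit positions in which {up,n} and {vp,n} differ. */
    static mp_bitcnt_t
    ref_hamdist (const mp_limb_t *up, const mp_limb_t *vp, mp_size_t n)
    {
      mp_bitcnt_t cnt = 0;
      for (mp_size_t i = 0; i < n; i++)
        cnt += (mp_bitcnt_t) __builtin_popcountll (up[i] ^ vp[i]);
      return cnt;
    }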
+ + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86_64/coreinhm/hamdist.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm b/gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm new file mode 100644 index 0000000..acd3180 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86_64/fastsse/lshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm new file mode 100644 index 0000000..3a68bb5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshiftc optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshiftc) +include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm b/gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm new file mode 100644 index 0000000..c1e1c94 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_1 optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c) +include_mpn(`x86_64/bd1/mul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm new file mode 100644 index 0000000..6228c48 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_basecase optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_basecase) +include_mpn(`x86_64/k8/mul_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm new file mode 100644 index 0000000..0244f8a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mullo_basecase optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mullo_basecase) +include_mpn(`x86_64/k8/mullo_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm b/gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm new file mode 100644 index 0000000..73eb7b5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm @@ -0,0 +1,38 @@ +dnl x86-64 mpn_popcount. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86_64/coreinhm/popcount.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm b/gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm new file mode 100644 index 0000000..b84371c --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_rshift optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86_64/fastsse/rshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm new file mode 100644 index 0000000..afccf93 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sqr_basecase optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sqr_basecase) +include_mpn(`x86_64/k8/sqr_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h new file mode 100644 index 0000000..a899ea1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h @@ -0,0 +1,246 @@ +/* Skylake gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3600-4000 MHz Intel Xeon E3-1270v5 Skylake */ +/* FFT tuning limit = 465,990,371 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 32 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 41 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +#define DIV_1_VS_MUL_1_PERCENT 473 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 208 +#define MUL_TOOM6H_THRESHOLD 300 +#define MUL_TOOM8H_THRESHOLD 406 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 151 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 426 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 46 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 404, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 28, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 39, 8}, { 79, 9}, { 43,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 99,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 167,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 199,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,11}, { 143,10}, { 287, 9}, \ + { 575,11}, { 159,12}, { 95,11}, { 191,10}, \ + { 383,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 671,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 543,11}, { 1087,12}, { 607,13}, { 319,12}, \ + { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \ + { 831,13}, { 447,12}, { 959,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1151,13}, { 639,12}, \ + { 1343,13}, { 703,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1727,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1407,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1663,14}, \ + { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 
3455,12}, { 6911,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2175,13}, { 4351,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4223,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4351,14}, { 8703,15}, \ + { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 6911,16}, { 3583,15}, { 7679,14}, \ + { 15359,15}, { 7935,17}, { 2047,16}, { 4095,15}, \ + { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 227 +#define MUL_FFT_THRESHOLD 6272 + +#define SQR_FFT_MODF_THRESHOLD 400 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 400, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 29, 7}, { 15, 6}, { 31, 7}, { 28, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,10}, { 383,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,10}, { 735,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 607,12}, { 319,11}, { 671,12}, \ + { 351,11}, { 735,12}, { 383,11}, { 799,12}, \ + { 415,11}, { 831,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,13}, { 511,12}, { 1023,13}, \ + { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1727,13}, { 895,12}, \ + { 1791,13}, { 959,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1151,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1407,14}, { 767,13}, { 1599,12}, \ + { 3199,13}, { 1663,14}, { 895,13}, { 1791,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \ + { 1407,15}, { 767,14}, { 1535,13}, { 3199,14}, \ + { 1663,13}, { 3455,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,13}, { 5887,15}, { 1535,14}, { 3455,15}, \ + { 1791,14}, { 3839,16}, { 1023,15}, { 2047,14}, \ + { 4223,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5119,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3071,14}, { 6143,15}, { 3327,14}, { 6911,15}, \ + { 3839,17}, { 1023,16}, { 2047,15}, { 4863,16}, \ + { 2559,15}, { 5887,14}, { 11775,16}, { 3071,15}, \ + { 6911,16}, { 3583,15}, { 7679,14}, { 15359,17}, \ + { 2047,16}, { 4095,15}, { 8191,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 205 +#define 
SQR_FFT_THRESHOLD 4224 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 79 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 109 +#define SQRLO_SQR_THRESHOLD 8207 + +#define DC_DIV_QR_THRESHOLD 55 +#define DC_DIVAPPR_Q_THRESHOLD 179 +#define DC_BDIV_QR_THRESHOLD 82 +#define DC_BDIV_Q_THRESHOLD 166 + +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 170 +#define INV_APPR_THRESHOLD 171 + +#define BINV_NEWTON_THRESHOLD 294 +#define REDC_1_TO_REDC_2_THRESHOLD 33 +#define REDC_2_TO_REDC_N_THRESHOLD 59 + +#define MU_DIV_QR_THRESHOLD 1528 +#define MU_DIVAPPR_Q_THRESHOLD 1589 +#define MUPI_DIV_QR_THRESHOLD 62 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1597 + +#define POWM_SEC_TABLE 2,8,191,452,904 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 898 +#define SET_STR_PRECOMPUTE_THRESHOLD 1670 + +#define FAC_DSC_THRESHOLD 474 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD2_DIV1_METHOD 5 /* 3.85% faster than 3 */ +#define HGCD_THRESHOLD 64 +#define HGCD_APPR_THRESHOLD 60 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 618 +#define GCDEXT_DC_THRESHOLD 321 +#define JACOBI_BASE_METHOD 1 /* 12.01% faster than 4 */ + +/* Tuneup completed successfully, took 213784 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm b/gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm new file mode 100644 index 0000000..f486125 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm @@ -0,0 +1,116 @@ +dnl AMD64 mpn_sqr_diag_addlsh1 + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.5 +C AMD K10 2.5 +C AMD bull 3.6 +C AMD pile 3.6 +C AMD steam ? +C AMD bobcat 4 +C AMD jaguar ? +C Intel P4 11.5 +C Intel core 4 +C Intel NHM 3.6 +C Intel SBR 3.15 +C Intel IBR 3.0 +C Intel HWL 2.6 +C Intel BWL ? +C Intel atom 14 +C VIA nano 3.5 + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
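Editor's note: concretely, the comment above refers to the helper defined on the next line. Illustrative expansions, with operands taken from the wind-down code near the end of this file:

    define(`I',`$1')  =>  I(-8(rp),-8(rp,n,8))  expands to  -8(rp)
    define(`I',`$2')  =>  I(-8(rp),-8(rp,n,8))  expands to  -8(rp,n,8)

The second, fully indexed form is the conservative fallback the comment describes.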
+define(`I',`$1') + +define(`rp', `%rdi') +define(`tp', `%rsi') +define(`up_arg', `%rdx') +define(`n', `%rcx') + +define(`up', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_sqr_diag_addlsh1) + FUNC_ENTRY(4) + push %rbx + + dec n + shl n + + mov (up_arg), %rax + + lea (rp,n,8), rp + lea (tp,n,8), tp + lea (up_arg,n,4), up + neg n + + mul %rax + mov %rax, (rp,n,8) + + xor R32(%rbx), R32(%rbx) + jmp L(mid) + + ALIGN(16) +L(top): add %r10, %r8 + adc %rax, %r9 + mov %r8, -8(rp,n,8) + mov %r9, (rp,n,8) +L(mid): mov 8(up,n,4), %rax + mov (tp,n,8), %r8 + mov 8(tp,n,8), %r9 + adc %r8, %r8 + adc %r9, %r9 + lea (%rdx,%rbx), %r10 + setc R8(%rbx) + mul %rax + add $2, n + js L(top) + +L(end): add %r10, %r8 + adc %rax, %r9 + mov %r8, I(-8(rp),-8(rp,n,8)) + mov %r9, I((rp),(rp,n,8)) + adc %rbx, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/sublsh1_n.asm new file mode 100644 index 0000000..c6d829f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/sublsh1_n.asm @@ -0,0 +1,160 @@ +dnl AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1) + +dnl Copyright 2003, 2005-2007, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 2.2 +C AMD K10 2.2 +C Intel P4 12.75 +C Intel core2 3.45 +C Intel corei ? +C Intel atom ? +C VIA nano 3.25 + +C Sometimes speed degenerates, supposedly related to that some operand +C alignments cause cache conflicts. + +C The speed is limited by decoding/issue bandwidth. There are 26 instructions +C in the loop, which corresponds to 26/3/4 = 2.167 c/l. 
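Editor's note: the 26/3/4 figure above is presumably 26 instructions issued three per cycle across a four-limb unrolled loop, i.e. about 2.17 cycles per limb. For reference, a plain C sketch of the operation this file implements, rp[] = up[] - (vp[] << 1) with the total borrow returned. ref_sublsh1_n is an illustrative name and assumes nails-free 64-bit limbs; it is not the GMP entry point.

    #include <gmp.h>

    static mp_limb_t
    ref_sublsh1_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                   mp_size_t n)
    {
      mp_limb_t shift_in = 0, brw = 0;  /* bit carried between limbs; running borrow */
      for (mp_size_t i = 0; i < n; i++)
        {
          mp_limb_t twice = (vp[i] << 1) | shift_in;  /* low limb of vp[i]*2 */
          shift_in = vp[i] >> (GMP_LIMB_BITS - 1);    /* bit shifted out the top */
          mp_limb_t d = up[i] - twice;
          mp_limb_t b = d > up[i];                    /* borrow from the subtract */
          rp[i] = d - brw;
          b += rp[i] > d;                             /* borrow from the carried-in borrow */
          brw = b;
        }
      return brw + shift_in;                          /* total borrow: 0, 1 or 2 */
    }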
+ +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sublsh1_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (vp), %r8 + mov R32(n), R32(%rax) + lea (rp,n,8), rp + lea (up,n,8), up + lea (vp,n,8), vp + neg n + xor R32(%rbp), R32(%rbp) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): add %r8, %r8 + mov 8(vp,n,8), %r9 + adc %r9, %r9 + mov 16(vp,n,8), %r10 + adc %r10, %r10 + sbb R32(%rax), R32(%rax) C save scy + mov (up,n,8), %rbp + mov 8(up,n,8), %rbx + sub %r8, %rbp + sbb %r9, %rbx + mov %rbp, (rp,n,8) + mov %rbx, 8(rp,n,8) + mov 16(up,n,8), %rbp + sbb %r10, %rbp + mov %rbp, 16(rp,n,8) + sbb R32(%rbp), R32(%rbp) C save acy + add $3, n + jmp L(ent) + +L(b10): add %r8, %r8 + mov 8(vp,n,8), %r9 + adc %r9, %r9 + sbb R32(%rax), R32(%rax) C save scy + mov (up,n,8), %rbp + mov 8(up,n,8), %rbx + sub %r8, %rbp + sbb %r9, %rbx + mov %rbp, (rp,n,8) + mov %rbx, 8(rp,n,8) + sbb R32(%rbp), R32(%rbp) C save acy + add $2, n + jmp L(ent) + +L(b01): add %r8, %r8 + sbb R32(%rax), R32(%rax) C save scy + mov (up,n,8), %rbp + sub %r8, %rbp + mov %rbp, (rp,n,8) + sbb R32(%rbp), R32(%rbp) C save acy + inc n +L(ent): jns L(end) + + ALIGN(16) +L(top): add R32(%rax), R32(%rax) C restore scy + + mov (vp,n,8), %r8 +L(b00): adc %r8, %r8 + mov 8(vp,n,8), %r9 + adc %r9, %r9 + mov 16(vp,n,8), %r10 + adc %r10, %r10 + mov 24(vp,n,8), %r11 + adc %r11, %r11 + + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + + mov (up,n,8), %rbp + mov 8(up,n,8), %rbx + sbb %r8, %rbp + sbb %r9, %rbx + mov %rbp, (rp,n,8) + mov %rbx, 8(rp,n,8) + mov 16(up,n,8), %rbp + mov 24(up,n,8), %rbx + sbb %r10, %rbp + sbb %r11, %rbx + mov %rbp, 16(rp,n,8) + mov %rbx, 24(rp,n,8) + + sbb R32(%rbp), R32(%rbp) C save acy + add $4, n + js L(top) + +L(end): add R32(%rbp), R32(%rax) + neg R32(%rax) + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/x86_64-defs.m4 b/gmp-6.3.0/mpn/x86_64/x86_64-defs.m4 new file mode 100644 index 0000000..4e08f2a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/x86_64-defs.m4 @@ -0,0 +1,493 @@ +divert(-1) + +dnl m4 macros for amd64 assembler. + +dnl Copyright 1999-2005, 2008, 2009, 2011-2013, 2017 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +dnl Usage: CPUVEC_FUNCS_LIST +dnl +dnl A list of the functions from gmp-impl.h x86 struct cpuvec_t, in the +dnl order they appear in that structure. + +define(CPUVEC_FUNCS_LIST, +``add_n', +`addlsh1_n', +`addlsh2_n', +`addmul_1', +`addmul_2', +`bdiv_dbm1c', +`cnd_add_n', +`cnd_sub_n', +`com', +`copyd', +`copyi', +`divexact_1', +`divrem_1', +`gcd_11', +`lshift', +`lshiftc', +`mod_1', +`mod_1_1p', +`mod_1_1p_cps', +`mod_1s_2p', +`mod_1s_2p_cps', +`mod_1s_4p', +`mod_1s_4p_cps', +`mod_34lsub1', +`modexact_1c_odd', +`mul_1', +`mul_basecase', +`mullo_basecase', +`preinv_divrem_1', +`preinv_mod_1', +`redc_1', +`redc_2', +`rshift', +`sqr_basecase', +`sub_n', +`sublsh1_n', +`submul_1'') + + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo) +dnl +dnl In the amd64 code we use explicit TEXT and ALIGN() calls in the code, +dnl since different alignments are wanted in various circumstances. So for +dnl instance, +dnl +dnl TEXT +dnl ALIGN(16) +dnl PROLOGUE(mpn_add_n) +dnl ... +dnl EPILOGUE() + +define(`PROLOGUE_cpu', +m4_assert_numargs(1) +` GLOBL $1 + TYPE($1,`function') + COFF_TYPE($1) +$1: +') + + +dnl Usage: COFF_TYPE(GSYM_PREFIX`'foo) +dnl +dnl Emit COFF style ".def ... .endef" type information for a function, when +dnl supported. The argument should include any GSYM_PREFIX. +dnl +dnl See autoconf macro GMP_ASM_COFF_TYPE for HAVE_COFF_TYPE. + +define(COFF_TYPE, +m4_assert_numargs(1) +m4_assert_defined(`HAVE_COFF_TYPE') +`ifelse(HAVE_COFF_TYPE,yes, + `.def $1 + .scl 2 + .type 32 + .endef')') + + +dnl Usage: ASSERT([cond][,instructions]) +dnl +dnl If WANT_ASSERT is 1, output the given instructions and expect the given +dnl flags condition to then be satisfied. For example, +dnl +dnl ASSERT(ne, `cmpq %rax, %rbx') +dnl +dnl The instructions can be omitted to just assert a flags condition with +dnl no extra calculation. For example, +dnl +dnl ASSERT(nc) +dnl +dnl When `instructions' is not empty, a pushfq/popfq is added for +dnl convenience to preserve the flags, but the instructions themselves must +dnl preserve any registers that matter. +dnl +dnl The condition can be omitted to just output the given instructions when +dnl assertion checking is wanted. In this case the pushf/popf is omitted. +dnl For example, +dnl +dnl ASSERT(, `movq %rax, VAR_KEEPVAL') + +define(ASSERT, +m4_assert_numargs_range(1,2) +m4_assert_defined(`WANT_ASSERT') +`ifelse(WANT_ASSERT,1, +`ifelse(`$1',, +` $2', +`ifelse(`$2',,, +` pushfq') + $2 + `j$1' L(ASSERT_ok`'ASSERT_counter) + ud2 C assertion failed +L(ASSERT_ok`'ASSERT_counter): +ifelse(`$2',,,` popfq') +define(`ASSERT_counter',incr(ASSERT_counter))')')') + +define(ASSERT_counter,1) + +dnl LEA - load effective address +dnl +dnl FIXME: We should never create a GOT entry and therefore use the simpler 2nd +dnl variant always. We need to understand what happens for not-yet-hidden +dnl symbols first. 
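Editor's note: for orientation, the LEA helper defined just below expands one of two ways depending on PIC. With a placeholder symbol and scratch register (both hypothetical, chosen only for illustration):

    LEA(some_symbol, %r10)
        PIC:      mov  some_symbol@GOTPCREL(%rip), %r10
        non-PIC:  lea  some_symbol(%rip), %r10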
+dnl +define(`LEA',`dnl +ifdef(`PIC', + `mov $1@GOTPCREL(%rip), $2' +, + `lea $1(%rip), $2') +') + + +define(`DEF_OBJECT', +m4_assert_numargs_range(2,3) +` ifelse($#,3,`$3',`RODATA') + ALIGN($2) +$1: +') + +define(`END_OBJECT', +m4_assert_numargs(1) +` SIZE(`$1',.-`$1')') + + +define(`R32', + `ifelse($1,`%rax',`%eax', + $1,`%rbx',`%ebx', + $1,`%rcx',`%ecx', + $1,`%rdx',`%edx', + $1,`%rsi',`%esi', + $1,`%rdi',`%edi', + $1,`%rbp',`%ebp', + $1,`%r8',`%r8d', + $1,`%r9',`%r9d', + $1,`%r10',`%r10d', + $1,`%r11',`%r11d', + $1,`%r12',`%r12d', + $1,`%r13',`%r13d', + $1,`%r14',`%r14d', + $1,`%r15',`%r15d')') +define(`R8', + `ifelse($1,`%rax',`%al', + $1,`%rbx',`%bl', + $1,`%rcx',`%cl', + $1,`%rdx',`%dl', + $1,`%rsi',`%sil', + $1,`%rdi',`%dil', + $1,`%rbp',`%bpl', + $1,`%r8',`%r8b', + $1,`%r9',`%r9b', + $1,`%r10',`%r10b', + $1,`%r11',`%r11b', + $1,`%r12',`%r12b', + $1,`%r13',`%r13b', + $1,`%r14',`%r14b', + $1,`%r15',`%r15b')') + + +dnl Usage: CALL(funcname) +dnl + +define(`CALL',`dnl +ifdef(`PIC', + `call GSYM_PREFIX`'$1@PLT' +, + `call GSYM_PREFIX`'$1' +)') + +define(`TCALL',`dnl +ifdef(`PIC', + `jmp GSYM_PREFIX`'$1@PLT' +, + `jmp GSYM_PREFIX`'$1' +)') + + +define(`JUMPTABSECT', `.section .data.rel.ro.local,"a",@progbits') + + +dnl Usage: JMPENT(targlabel,tablabel) + +define(`JMPENT',`dnl +ifdef(`PIC', + `.long $1-$2'dnl +, + `.quad $1'dnl +)') + + +dnl These macros are defined just for DOS64, where they provide calling +dnl sequence glue code. + +define(`FUNC_ENTRY',`') +define(`FUNC_EXIT',`') + + +dnl Target ABI macros. + +define(`IFDOS', `') +define(`IFSTD', `$1') +define(`IFELF', `$1') + + +dnl Usage: PROTECT(symbol) +dnl +dnl Used for private GMP symbols that should never be overridden by users. +dnl This can save reloc entries and improve shlib sharing as well as +dnl application startup times + +define(`PROTECT', `.hidden $1') + + +dnl Usage: x86_lookup(target, key,value, key,value, ...) +dnl +dnl Look for `target' among the `key' parameters. +dnl +dnl x86_lookup expands to the corresponding `value', or generates an error +dnl if `target' isn't found. + +define(x86_lookup, +m4_assert_numargs_range(1,999) +`ifelse(eval($#<3),1, +`m4_error(`unrecognised part of x86 instruction: $1 +')', +`ifelse(`$1',`$2', `$3', +`x86_lookup(`$1',shift(shift(shift($@))))')')') + + +dnl Usage: x86_opcode_regxmm(reg) +dnl +dnl Validate the given xmm register, and return its number, 0 to 7. + +define(x86_opcode_regxmm, +m4_assert_numargs(1) +`x86_lookup(`$1',x86_opcode_regxmm_list)') + +define(x86_opcode_regxmm_list, +``%xmm0',0, +`%xmm1',1, +`%xmm2',2, +`%xmm3',3, +`%xmm4',4, +`%xmm5',5, +`%xmm6',6, +`%xmm7',7, +`%xmm8',8, +`%xmm9',9, +`%xmm10',10, +`%xmm11',11, +`%xmm12',12, +`%xmm13',13, +`%xmm14',14, +`%xmm15',15') + +dnl Usage: palignr($imm,%srcreg,%dstreg) +dnl +dnl Emit a palignr instruction, using a .byte sequence, since obsolete but +dnl still distributed versions of gas don't know SSSE3 instructions. + +define(`palignr', +m4_assert_numargs(3) +`.byte 0x66,dnl +ifelse(eval(x86_opcode_regxmm($3) >= 8 || x86_opcode_regxmm($2) >= 8),1, + `eval(0x40+x86_opcode_regxmm($3)/8*4+x86_opcode_regxmm($2)/8),')dnl +0x0f,0x3a,0x0f,dnl +eval(0xc0+x86_opcode_regxmm($3)%8*8+x86_opcode_regxmm($2)%8),dnl +substr($1,1)') + + +dnl Usage +dnl +dnl regnum(op) raw operand index (so slightly misnamed) +dnl regnumh(op) high bit of register operand nimber +dnl ix(op) 0 for reg operand, 1 for plain pointer operand. 
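Editor's note: a few worked values, read off the oplist table defined just below (illustrative only):

    regnum(%rdx)   = 2     regnumh(%rdx)   = 0     ix(%rdx)   = 0
    regnum(%r9)    = 9     regnumh(%r9)    = 1     ix(%r9)    = 0
    regnum((%rsi)) = 22    regnumh((%rsi)) = 0     ix((%rsi)) = 1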
+dnl + +define(`regnum',`x86_lookup(`$1',oplist)') +define(`regnumh',`eval(regnum($1)/8 & 1)') +define(`ix',`eval(regnum($1)/16)') +define(`oplist', +``%rax', 0, `%rcx', 1, `%rdx', 2, `%rbx', 3, + `%rsp', 4, `%rbp', 5, `%rsi', 6, `%rdi', 7, + `%r8', 8, `%r9', 9, `%r10', 10, `%r11', 11, + `%r12', 12, `%r13', 13, `%r14', 14, `%r15', 15, + `(%rax)',16, `(%rcx)',17, `(%rdx)',18, `(%rbx)',19, + `(%rsp)',20, `(%rbp)',21, `(%rsi)',22, `(%rdi)',23, + `(%r8)', 24, `(%r9)', 25, `(%r10)',26, `(%r11)',27, + `(%r12)',28, `(%r13)',29, `(%r14)',30, `(%r15)',31') + +dnl Usage (by mulx, shlx, shrx) +dnl +dnl reg1,reg2,reg3,opc1,opc2 +dnl +dnl or +dnl +dnl (reg1),reg2,reg3,opc1,opc2 +dnl +dnl where reg1 is any register but rsp,rbp,r12,r13, or +dnl +dnl or +dnl +dnl off,(reg1),reg2,reg3,opc1,opc2 +dnl +dnl where reg1 is any register but rsp,r12. +dnl +dnl The exceptions are due to special coding needed for some registers; rsp +dnl and r12 need an extra byte 0x24 at the end while rbp and r13 lack the +dnl offset-less form. +dnl +dnl Other addressing forms are not handled. Invalid forms are not properly +dnl detected. Offsets that don't fit one byte are not handled correctly. + +define(`c4_helper',`dnl +.byte 0xc4`'dnl +ifelse(`$#',5,`dnl +,eval(0xe2^32*regnumh($1)^128*regnumh($3))`'dnl +,eval(0x$4-8*regnum($2))`'dnl +,0x$5`'dnl +,eval(0xc0+(7 & regnum($1))+8*(7 & regnum($3))-0xc0*ix($1))`'dnl +',`$#',6,`dnl +,eval(0xe2^32*regnumh($2)^128*regnumh($4))`'dnl +,eval(0x$5-8*regnum($3))`'dnl +,0x$6`'dnl +,eval(0x40+(7 & regnum($2))+8*(7 & regnum($4)))`'dnl +,eval(($1 + 256) % 256)`'dnl +')') + + +dnl Usage +dnl +dnl mulx(reg1,reg2,reg3) +dnl +dnl or +dnl +dnl mulx((reg1),reg2,reg3) +dnl +dnl where reg1 is any register but rsp,rbp,r12,r13, or +dnl +dnl mulx(off,(reg1),reg2,reg3) +dnl +dnl where reg1 is any register but rsp,r12. + +define(`mulx',`dnl +ifelse(`$#',3,`dnl +c4_helper($1,$2,$3,fb,f6)',`dnl format 1,2 +c4_helper($1,$2,$3,$4,fb,f6)'dnl format 3 +)') + + +dnl Usage +dnl +dnl shlx(reg1,reg2,reg3) +dnl shrx(reg1,reg2,reg3) +dnl +dnl or +dnl +dnl shlx(reg1,(reg2),reg3) +dnl shrx(reg1,(reg2),reg3) +dnl +dnl where reg2 is any register but rsp,rbp,r12,r13, or +dnl +dnl shlx(reg1,off,(reg2),reg3) +dnl shrx(reg1,off,(reg2),reg3) +dnl +dnl where reg2 is any register but rsp,r12. + +define(`shlx',`dnl +ifelse(`$#',3,`dnl +c4_helper($2,$1,$3,f9,f7)',`dnl format 1,2 +c4_helper($1,$3,$2,$4,f9,f7)'dnl format 3 +)') + +define(`shrx',`dnl +ifelse(`$#',3,`dnl +c4_helper($2,$1,$3,fb,f7)',`dnl format 1,2 +c4_helper($1,$3,$2,$4,fb,f7)'dnl format 3 +)') + +define(`sarx',`dnl +ifelse(`$#',3,`dnl +c4_helper($2,$1,$3,fa,f7)',`dnl format 1,2 +c4_helper($1,$3,$2,$4,fa,f7)'dnl format 3 +)') + + +dnl Usage +dnl +dnl adcx(reg1,reg2) +dnl adox(reg1,reg2) +dnl +dnl or +dnl +dnl adcx((reg1),reg2) +dnl adox((reg1),reg2) +dnl +dnl where reg1 is any register but rsp,rbp,r12,r13, or +dnl +dnl adcx(off,(reg1),reg2) +dnl adox(off,(reg1),reg2) +dnl +dnl where reg1 is any register but rsp,r12. +dnl +dnl The exceptions are due to special coding needed for some registers; rsp +dnl and r12 need an extra byte 0x24 at the end while rbp and r13 lack the +dnl offset-less form. +dnl +dnl Other addressing forms are not handled. Invalid forms are not properly +dnl detected. Offsets that don't fit one byte are not handled correctly. 
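Editor's note on the emitters defined below: ADCX adds with carry using only the carry flag (CF), and ADOX does the same using only the overflow flag (OF), which is what lets multiply-accumulate loops keep two independent carry chains in flight. Going by the AT&T-style source,destination order used throughout this file, usage would look like

    adcx( %r8, %r9)      C %r9 += %r8 + CF, updating CF only (illustrative)
    adox( (%rsi), %r11)  C %r11 += *(%rsi) + OF, updating OF only (illustrative)

This is an editorial gloss, not part of the original source; the exact operand roles should be checked against the byte encodings that follow.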
+ +define(`adx_helper',`dnl +,eval(0x48+regnumh($1)+4*regnumh($2))`'dnl +,0x0f`'dnl +,0x38`'dnl +,0xf6`'dnl +') + +define(`adx',`dnl +ifelse(`$#',2,`dnl +adx_helper($1,$2)dnl +,eval(0xc0+(7 & regnum($1))+8*(7 & regnum($2))-0xc0*ix($1))`'dnl +',`$#',3,`dnl +adx_helper($2,$3)dnl +,eval(0x40+(7 & regnum($2))+8*(7 & regnum($3)))`'dnl +,eval(($1 + 256) % 256)`'dnl +')') + +define(`adcx',`dnl +.byte 0x66`'dnl +adx($@)') + +define(`adox',`dnl +.byte 0xf3`'dnl +adx($@)') + +divert`'dnl diff --git a/gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm new file mode 100644 index 0000000..803fa30 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addlsh1_n, mpn_addlsh1_nc, mpn_rsblsh1_n, mpn_rsblsh1_nc. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) +include_mpn(`x86_64/atom/aorrlsh1_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm new file mode 100644 index 0000000..417dd0a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm @@ -0,0 +1,227 @@ +dnl AMD64 mpn_addlsh_n, mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
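Editor's note: the file that begins here provides mpn_addlsh_n, which computes {rp,n} = {up,n} + ({vp,n} << cnt), and mpn_rsblsh_n, which computes ({vp,n} << cnt) - {up,n}. Below is a minimal C sketch of the add flavour, assuming nails-free 64-bit limbs and 0 < cnt < GMP_LIMB_BITS; ref_addlsh_n is an illustrative name, not the GMP entry point, and the return-value convention for the rsb flavour is not shown.

    #include <gmp.h>

    static mp_limb_t
    ref_addlsh_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                  mp_size_t n, unsigned cnt)
    {
      mp_limb_t shift_in = 0, cy = 0;
      for (mp_size_t i = 0; i < n; i++)
        {
          mp_limb_t sh = (vp[i] << cnt) | shift_in;   /* low limb of vp[i]<<cnt */
          shift_in = vp[i] >> (GMP_LIMB_BITS - cnt);  /* bits pushed out the top */
          mp_limb_t s = up[i] + sh;
          mp_limb_t c = s < up[i];                    /* carry from the add */
          rp[i] = s + cy;
          c += rp[i] < s;                             /* carry from the carried-in cy */
          cy = c;
        }
      return cy + shift_in;                           /* high (carry) limb */
    }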
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 n/a +C AMD bd2 n/a +C AMD bd3 n/a +C AMD bd4 2.31 +C AMD zn1 1.69 +C AMD zn2 1.55 +C AMD zn3 1.36 +C AMD bt1 n/a +C AMD bt2 n/a +C Intel P4 n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL 2.08 +C Intel BWL 1.78 +C Intel SKL 1.78 +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C TODO +C * Perhaps avoid using jrcxz by using dec n + jnz. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt', `%r8') + +define(`tnc', `%r9') + +ifdef(`OPERATION_addlsh_n',` + define(ADCSBB, `adc') + define(func, mpn_addlsh_n) +') +ifdef(`OPERATION_rsblsh_n',` + define(ADCSBB, `sbb') + define(func, mpn_rsblsh_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + mov (vp), %r10 + + mov R32(n), R32(%rax) + shr $3, n + xor R32(tnc), R32(tnc) + sub cnt, tnc + and $7, R32(%rax) + + lea L(tab)(%rip), %r11 +ifdef(`PIC',` + movslq (%r11,%rax,4), %rax + add %r11, %rax + jmp *%rax +',` + jmp *(%r11,%rax,8) +') + +L(0): lea 32(up), up + lea 32(vp), vp + lea 32(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e0) + +L(7): mov %r10, %r11 + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e7) + +L(6): lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e6) + +L(5): mov %r10, %r11 + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e5) + +L(end): ADCSBB 24(up), %rax + mov %rax, -40(rp) + shrx( tnc, %r11, %rax) + ADCSBB n, %rax + FUNC_EXIT() + ret + + ALIGN(32) +L(top): jrcxz L(end) + mov -32(vp), %r10 + ADCSBB 24(up), %rax + lea 64(up), up + shrx( tnc, %r11, %r11) + mov %rax, -40(rp) +L(e0): dec n + shlx( cnt, %r10, %rax) + lea (%r11,%rax), %rax + mov -24(vp), %r11 + ADCSBB -32(up), %rax + shrx( tnc, %r10, %r10) + mov %rax, -32(rp) +L(e7): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + mov -16(vp), %r10 + ADCSBB -24(up), %rax + shrx( tnc, %r11, %r11) + mov %rax, -24(rp) +L(e6): shlx( cnt, %r10, %rax) + lea (%r11,%rax), %rax + mov -8(vp), %r11 + ADCSBB -16(up), %rax + shrx( tnc, %r10, %r10) + mov %rax, -16(rp) +L(e5): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + mov (vp), %r10 + ADCSBB -8(up), %rax + shrx( tnc, %r11, %r11) + mov %rax, -8(rp) +L(e4): shlx( cnt, %r10, %rax) + lea (%r11,%rax), %rax + mov 8(vp), %r11 + ADCSBB (up), %rax + shrx( tnc, %r10, %r10) + mov %rax, (rp) +L(e3): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + mov 16(vp), %r10 + ADCSBB 8(up), %rax + shrx( tnc, %r11, %r11) + mov %rax, 8(rp) +L(e2): shlx( cnt, %r10, %rax) + lea (%r11,%rax), %rax + mov 24(vp), %r11 + ADCSBB 16(up), %rax + lea 64(vp), vp + shrx( tnc, %r10, %r10) + mov %rax, 16(rp) + lea 64(rp), rp +L(e1): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + jmp L(top) + +L(4): xor R32(%r11), R32(%r11) + jmp L(e4) + +L(3): mov %r10, %r11 + lea -8(up), up + lea -8(vp), vp + lea -8(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e3) + +L(2): lea -16(up), up + lea -16(vp), vp + lea -16(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e2) + +L(1): mov %r10, %r11 + lea -24(up), up + lea 40(vp), vp + lea 40(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e1) +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(4), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), 
L(tab)) + JMPENT( L(7), L(tab)) diff --git a/gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm new file mode 100644 index 0000000..89795e3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm @@ -0,0 +1,165 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 for CPUs with mulx. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 4.3 +C AMD zen 2 +C AMD bt1 - +C AMD bt2 - +C Intel P4 - +C Intel PNR - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom - +C Intel SLM - +C VIA nano - + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rdx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`ADCSBB', `adc') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`ADCSBB', `sbb') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (up), %r8 + + push %rbx + push %r12 + push %r13 + + lea (up,n_param,8), up + lea -32(rp,n_param,8), rp + mov R32(n_param), R32(%rax) + xchg v0_param, v0 C FIXME: is this insn fast? 
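C Editorial note (added in review, not from the upstream file): mulx takes one
C multiplicand implicitly from %rdx, so the limb multiplier, which arrives in
C %rcx under the SysV ABI per the defines above, is swapped into %rdx here
C while the size argument moves into %rcx to serve as the loop counter.  The
C counter is then negated below so the loop can count upward toward zero while
C addressing the operands from their ends with (up,n,8)-style operands.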
+ + neg n + + and $3, R8(%rax) + jz L(b0) + cmp $2, R8(%rax) + jz L(b2) + jg L(b3) + +L(b1): mulx( %r8, %rbx, %rax) + sub $-1, n + jz L(wd1) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + test R32(%rax), R32(%rax) C clear cy + jmp L(lo1) + +L(b0): mulx( %r8, %r9, %r8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + xor R32(%rax), R32(%rax) + jmp L(lo0) + +L(b3): mulx( %r8, %r11, %r10) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + sub $-3, n + jz L(wd3) + test R32(%rax), R32(%rax) C clear cy + jmp L(lo3) + +L(b2): mulx( %r8, %r13, %r12) + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax + add %r12, %rbx + adc $0, %rax + sub $-2, n + jz L(wd2) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + test R32(%rax), R32(%rax) C clear cy + jmp L(lo2) + +L(top): ADDSUB %r9, (rp,n,8) +L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + ADCSBB %r11, 8(rp,n,8) +L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + ADCSBB %r13, 16(rp,n,8) +L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + ADCSBB %rbx, 24(rp,n,8) + adc %rax, %r9 +L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax C rax = carry limb + add $4, n + js L(top) + +L(end): ADDSUB %r9, (rp) +L(wd3): ADCSBB %r11, 8(rp) +L(wd2): ADCSBB %r13, 16(rp) +L(wd1): ADCSBB %rbx, 24(rp) + adc n, %rax + pop %r13 + pop %r12 + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/zen/com.asm b/gmp-6.3.0/mpn/x86_64/zen/com.asm new file mode 100644 index 0000000..b34f841 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/copyd.asm b/gmp-6.3.0/mpn/x86_64/zen/copyd.asm new file mode 100644 index 0000000..63ed237 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/copyi.asm b/gmp-6.3.0/mpn/x86_64/zen/copyi.asm new file mode 100644 index 0000000..1aafaaa --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm new file mode 100644 index 0000000..0ffb6ca --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/bd2/gcd_11.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm new file mode 100644 index 0000000..5dfd9e3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_22) +include_mpn(`x86_64/coreihwl/gcd_22.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h new file mode 100644 index 0000000..05a12b3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h @@ -0,0 +1,280 @@ +/* AMD Zen gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. */ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3700-4300 MHz Pinnacle Ridge */ +/* FFT tuning limit = 468,514,360 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 32 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 338 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 107 +#define MUL_TOOM44_THRESHOLD 190 +#define MUL_TOOM6H_THRESHOLD 230 +#define MUL_TOOM8H_THRESHOLD 272 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 106 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 117 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 114 +#define SQR_TOOM4_THRESHOLD 422 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 540 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 540, 5}, { 22, 6}, { 12, 5}, { 25, 6}, \ + { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 43, 9}, { 11, 8}, { 29, 9}, \ + { 15, 8}, { 35, 9}, { 19, 8}, { 43, 9}, \ + { 23, 8}, { 49, 9}, { 27,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 43,10}, { 23, 9}, \ + { 55,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \ + { 55,11}, { 31,10}, { 79,11}, { 47,10}, \ + { 103,12}, { 31,11}, { 63,10}, { 135,11}, \ + { 79,10}, { 167,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 159,12}, { 95,11}, { 191,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \ + { 1343,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \ + { 1279,11}, { 671,10}, { 1343, 9}, { 2687,12}, \ + { 351,11}, { 703,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \ + { 639,11}, { 
1279,12}, { 671,11}, { 1343,10}, \ + { 2687,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 927,11}, \ + { 1855,12}, { 959,11}, { 1919,10}, { 3839,13}, \ + { 511,12}, { 1087,11}, { 2175,13}, { 575,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1343,11}, \ + { 2687,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1599,13}, { 831,12}, { 1727,11}, \ + { 3455,13}, { 895,12}, { 1855,13}, { 959,12}, \ + { 1919,11}, { 3839,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,12}, { 2943,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,14}, { 895,13}, { 1855,12}, { 3711,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2687,13}, \ + { 5375,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \ + { 7679,16}, { 1023,15}, { 2047,14}, { 4479,15}, \ + { 2303,14}, { 4991,15}, { 2559,14}, { 5247,15}, \ + { 2815,14}, { 5887,16}, { 1535,15}, { 3327,14}, \ + { 6911,15}, { 3839,14}, { 7679,17}, { 1023,16}, \ + { 2047,15}, { 4095,14}, { 8191,15}, { 4351,14}, \ + { 8959,15}, { 4863,16}, { 2559,15}, { 5375,14}, \ + { 11007,15}, { 5887,14}, { 11775,16}, { 3071,15}, \ + { 6911,16}, { 3583,15}, { 7167,14}, { 14335,15}, \ + { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9215,14}, { 18431,15}, { 9727,14}, { 19455,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 11007,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 271 +#define MUL_FFT_THRESHOLD 6272 + +#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 404, 5}, { 13, 4}, { 27, 5}, { 21, 6}, \ + { 11, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 14, 5}, { 29, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 25, 8}, \ + { 13, 7}, { 29, 8}, { 15, 7}, { 33, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 29, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 43,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,11}, { 159,12}, \ + { 95,11}, { 191,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \ + { 1343,11}, { 351,10}, { 703,11}, { 367,10}, \ + { 735,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 399,10}, { 799,11}, { 415,10}, { 831,12}, \ + { 223,11}, { 447,10}, { 895,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \ + { 1279,11}, { 671,10}, { 1343,12}, { 351,11}, \ + { 703,10}, { 1407,11}, { 735,10}, { 1471,13}, \ + { 191,12}, { 383,11}, { 767,10}, { 1535,11}, \ + { 799,12}, { 
415,11}, { 831,10}, { 1663,12}, \ + { 447,11}, { 895,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,11}, { 1215,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,12}, { 703,11}, { 1407,12}, { 735,11}, \ + { 1471,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,13}, { 703,12}, \ + { 1471,11}, { 2943,14}, { 383,13}, { 767,12}, \ + { 1599,13}, { 831,12}, { 1727,11}, { 3455,13}, \ + { 895,12}, { 1855,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,12}, { 2943,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,14}, { 895,13}, { 1855,12}, { 3711,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \ + { 511,15}, { 1023,14}, { 2047,13}, { 4095,14}, \ + { 2175,13}, { 4479,12}, { 8959,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \ + { 11775,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,14}, { 3967,16}, \ + { 1023,15}, { 2047,14}, { 4479,15}, { 2303,14}, \ + { 4991,15}, { 2559,14}, { 5247,15}, { 2815,14}, \ + { 5887,13}, { 11775,16}, { 1535,15}, { 3071,14}, \ + { 6143,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4095,14}, \ + { 8191,15}, { 4351,14}, { 8959,15}, { 4863,14}, \ + { 9727,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 6911,16}, { 3583,15}, { 7167,14}, \ + { 14335,15}, { 7679,14}, { 15359,15}, { 7935,14}, \ + { 15871,17}, { 2047,16}, { 4095,15}, { 8959,16}, \ + { 4607,15}, { 9215,14}, { 18431,15}, { 9727,14}, \ + { 19455,15}, { 9983,14}, { 19967,16}, { 5119,15}, \ + { 10239,16}, { 5631,15}, { 11775,17}, { 3071,16}, \ + { 6655,15}, { 13311,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 302 +#define SQR_FFT_THRESHOLD 4224 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 69 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 12 +#define SQRLO_DC_THRESHOLD 82 +#define SQRLO_SQR_THRESHOLD 8207 + +#define DC_DIV_QR_THRESHOLD 76 +#define DC_DIVAPPR_Q_THRESHOLD 232 +#define DC_BDIV_QR_THRESHOLD 76 +#define DC_BDIV_Q_THRESHOLD 104 + +#define INV_MULMOD_BNM1_THRESHOLD 37 +#define INV_NEWTON_THRESHOLD 274 +#define INV_APPR_THRESHOLD 230 + +#define BINV_NEWTON_THRESHOLD 372 +#define REDC_1_TO_REDC_N_THRESHOLD 68 + +#define MU_DIV_QR_THRESHOLD 1499 +#define MU_DIVAPPR_Q_THRESHOLD 1718 +#define MUPI_DIV_QR_THRESHOLD 108 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1787 + +#define POWM_SEC_TABLE 3,22,81,494 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 486 +#define SET_STR_PRECOMPUTE_THRESHOLD 1264 + +#define FAC_DSC_THRESHOLD 187 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 23 +#define HGCD2_DIV1_METHOD 1 /* 9.20% faster than 3 */ +#define HGCD_THRESHOLD 109 +#define HGCD_APPR_THRESHOLD 104 +#define 
HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 566 +#define GCDEXT_DC_THRESHOLD 382 +#define JACOBI_BASE_METHOD 1 /* 15.55% faster than 3 */ + +/* Tuneup completed successfully, took 281243 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/zen/hamdist.asm b/gmp-6.3.0/mpn/x86_64/zen/hamdist.asm new file mode 100644 index 0000000..48dcf61 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/hamdist.asm @@ -0,0 +1,38 @@ +dnl AMD64 mpn_hamdist -- hamming distance. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86_64/coreinhm/hamdist.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/lshift.asm b/gmp-6.3.0/mpn/x86_64/zen/lshift.asm new file mode 100644 index 0000000..4dce319 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/lshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86_64/fastsse/lshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm new file mode 100644 index 0000000..d52b194 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshiftc optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshiftc) +include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/mul_1.asm b/gmp-6.3.0/mpn/x86_64/zen/mul_1.asm new file mode 100644 index 0000000..6a083ac --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/mul_1.asm @@ -0,0 +1,161 @@ +dnl AMD64 mpn_mul_1 for CPUs with mulx. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 4.4 +C AMD zen 2 +C AMD bobcat - +C AMD jaguar - +C Intel P4 - +C Intel PNR - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom - +C Intel SLM - +C VIA nano - + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1c) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) +EPILOGUE() + ALIGN(16) +PROLOGUE(mpn_mul_1) + FUNC_ENTRY(4) + xor R32(%r8), R32(%r8) C carry-in limb +L(ent): mov (up), %r9 + + push %rbx + push %r12 + push %r13 + + lea (up,n_param,8), up + lea -32(rp,n_param,8), rp + mov R32(n_param), R32(%rax) + xchg v0_param, v0 C FIXME: is this insn fast? 
+ + neg n + + and $3, R8(%rax) + jz L(b0) + cmp $2, R8(%rax) + jz L(b2) + jg L(b3) + +L(b1): mov %r8, %r12 + mulx( %r9, %rbx, %rax) + sub $-1, n + jz L(wd1) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + add %r12, %rbx + jmp L(lo1) + +L(b3): mulx( %r9, %r11, %r10) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax + sub $-3, n + jz L(wd3) + add %r8, %r11 + jmp L(lo3) + +L(b2): mov %r8, %r10 C carry-in limb + mulx( %r9, %r13, %r12) + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax + sub $-2, n + jz L(wd2) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + add %r10, %r13 + jmp L(lo2) + +L(b0): mov %r8, %rax C carry-in limb + mulx( %r9, %r9, %r8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + add %rax, %r9 + jmp L(lo0) + +L(top): jrcxz L(end) + adc %r8, %r11 + mov %r9, (rp,n,8) +L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r10, %r13 + mov %r11, 8(rp,n,8) +L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r12, %rbx + mov %r13, 16(rp,n,8) +L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rax, %r9 + mov %rbx, 24(rp,n,8) +L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + lea 4(n), n + jmp L(top) + +L(end): mov %r9, (rp) +L(wd3): adc %r8, %r11 + mov %r11, 8(rp) +L(wd2): adc %r10, %r13 + mov %r13, 16(rp) +L(wd1): adc %r12, %rbx + adc $0, %rax + mov %rbx, 24(rp) + + pop %r13 + pop %r12 + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm new file mode 100644 index 0000000..affa3b6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm @@ -0,0 +1,455 @@ +dnl AMD64 mpn_mul_basecase optimised for AMD Zen. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Try 2x unrolling instead of current 4x, at least for mul_1. Else consider +C shallower sw pipelining of mul_1/addmul_1 loops, allowing 4 or 6 instead +C of 8 product registers. +C * Split up mul_1 into 4 loops in order to fall into the addmul_1 loops +C without branch tree. +C * Improve the overlapped software pipelining. 
The mulx in the osp block now +C suffers from write/read conflicts, in particular the 1 mod 4 case. Also, +C mul_1 could osp into addmul_1. +C * Let vn_param be vn to save a copy. +C * Re-allocate to benefit more from 32-bit encoding. +C * Poor performance for e.g. n = 12,16. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param', `%rdx') +define(`vp_param', `%rcx') +define(`vn_param', `%r8') + +define(`un', `%r14') +define(`vp', `%rbp') +define(`v0', `%rdx') +define(`n', `%rcx') +define(`vn', `%r15') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + cmp $2, un_param + ja L(gen) + mov (vp_param), %rdx + mulx( (up), %rax, %r9) C 0 1 + je L(s2x) + +L(s11): mov %rax, (rp) + mov %r9, 8(rp) + FUNC_EXIT() + ret + +L(s2x): cmp $2, vn_param + mulx( 8,(up), %r8, %r10) C 1 2 + je L(s22) + +L(s21): add %r8, %r9 + adc $0, %r10 + mov %rax, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + FUNC_EXIT() + ret + +L(s22): add %r8, %r9 C 1 + adc $0, %r10 C 2 + mov 8(vp_param), %rdx + mov %rax, (rp) + mulx( (up), %r8, %r11) C 1 2 + mulx( 8,(up), %rax, %rdx) C 2 3 + add %r11, %rax C 2 + adc $0, %rdx C 3 + add %r8, %r9 C 1 + adc %rax, %r10 C 2 + adc $0, %rdx C 3 + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + + +L(gen): push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + mov un_param, un + mov vp_param, vp + mov vn_param, vn + + mov (up), %r9 + mov (vp), v0 + + lea (up,un,8), up + lea -32(rp,un,8), rp + + neg un + mov un, n + test $1, R8(un) + jz L(mx0) +L(mx1): test $2, R8(un) + jz L(mb3) + +L(mb1): mulx( %r9, %rbx, %rax) + inc n + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10 + jmp L(mlo1) + +L(mb3): mulx( %r9, %r11, %r10) + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax + sub $-3, n + jz L(mwd3) + test R32(%rdx), R32(%rdx) + jmp L(mlo3) + +L(mx0): test $2, R8(un) + jz L(mb0) + +L(mb2): mulx( %r9, %r13, %r12) + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax + lea 2(n), n + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8 + jmp L(mlo2) + +L(mb0): mulx( %r9, %r9, %r8) + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(up,un,8), %r11, %r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(up,un,8), %r13, %r12 + jmp L(mlo0) + +L(mtop):jrcxz L(mend) + adc %r8, %r11 + mov %r9, (rp,n,8) +L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r10, %r13 + mov %r11, 8(rp,n,8) +L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r12, %rbx + mov %r13, 16(rp,n,8) +L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rax, %r9 + mov %rbx, 24(rp,n,8) +L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + lea 4(n), n + jmp L(mtop) + +L(mend):mov %r9, (rp) + adc %r8, %r11 +L(mwd3):mov %r11, 8(rp) + adc %r10, %r13 + mov %r13, 16(rp) + adc %r12, %rbx + adc $0, %rax + mov %rbx, 24(rp) + mov %rax, 32(rp) + add $8, vp + dec vn + jz L(end) + +C The rest of the file are 4 osp loops around addmul_1 + + test $1, R8(un) + jnz L(0x1) + +L(0x0): test $2, R8(un) + jnz L(oloop2_entry) + +L(oloop0_entry): + C initial feed-in block + mov (vp), %rdx + add $8, vp + mov un, n + add $8, rp + .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx 
(up,un,8), %r9, %r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(up,un,8), %r11, %r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x18 C mulx 24(up,un,8), %rbx, %rax + add %r8, %r11 + jmp L(lo0) + +L(oloop0): + C overlapped software pipelining block + mov (vp), %rdx C new + add $8, vp + add %r9, (rp) C prev + .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%r9,%r8 + adc %r11, 8(rp) C prev + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 0x8(%rsi,%r14,8),%r11,%r10 + adc %r13, 16(rp) C prev + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 0x10(%rsi,%r14,8),%r13,%r12 + adc %rbx, 24(rp) C prev + mov un, n + adc $0, %rax C prev + mov %rax, 32(rp) C prev + add $8, rp + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%rbx,%rax + add %r8, %r11 C new + jmp L(lo0) + + ALIGN(16) +L(tp0): add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 +L(lo0): adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(tp0) + + dec vn + jne L(oloop0) + + jmp L(final_wind_down) + +L(oloop2_entry): + mov (vp), %rdx + add $8, vp + lea 2(un), n + add $8, rp + .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax + add %r12, %rbx + adc $0, %rax + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + add %r13, 16(rp,n,8) + jmp L(lo2) + +L(oloop2): + mov (vp), %rdx + add $8, vp + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (up,un,8), %r13, %r12 + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax + lea 2(un), n + add $8, rp + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8 + add %r12, %rbx + adc $0, %rax + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%r11,%r10 + add %r13, 16(rp,n,8) + jmp L(lo2) + + ALIGN(16) +L(tp2): add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) +L(lo2): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(tp2) + + dec vn + jne L(oloop2) + + jmp L(final_wind_down) + +L(0x1): test $2, R8(un) + jz L(oloop3_entry) + +L(oloop1_entry): + mov (vp), %rdx + add $8, vp + lea 1(un), n + add $8, rp + .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (up,un,8), %rbx, %rax + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + add %rbx, 24(rp,n,8) + jmp L(lo1) + +L(oloop1): + mov (vp), %rdx + add $8, vp + add %r9, (rp) + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8 + adc %r11, 8(rp) + .byte 
0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10 + adc %r13, 16(rp) + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%r13,%r12 + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (up,un,8), %rbx, %rax + lea 1(un), n + add $8, rp + add %rbx, 24(rp,n,8) + jmp L(lo1) + + ALIGN(16) +L(tp1): add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) +L(lo1): adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(tp1) + + dec vn + jne L(oloop1) + + jmp L(final_wind_down) + +L(oloop3_entry): + mov (vp), %rdx + add $8, vp + lea 3(un), n + add $8, rp + .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + test n, n + jz L(wd3) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + add %r11, 8(rp,n,8) + jmp L(lo3) + +L(oloop3): + mov (vp), %rdx + add $8, vp + add %r9, (rp) + adc %r11, 8(rp) + .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10 + adc %r13, 16(rp) + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12 + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax + lea 3(un), n + add $8, rp + add %r10, %r13 + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r12, %rbx + adc $0, %rax + add %r11, 8(rp,n,8) + jmp L(lo3) + + ALIGN(16) +L(tp3): add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) +L(lo3): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(tp3) + + dec vn + jne L(oloop3) + +L(final_wind_down): + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + +L(end): pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(3): mov (vp), %rdx + add $8, vp + add $8, rp + .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax +L(wd3): adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + dec vn + jne L(3) + jmp L(end) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm new file mode 100644 index 0000000..2ae729a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm @@ -0,0 +1,299 @@ +dnl X64-64 mpn_mullo_basecase optimised for AMD Zen. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2017 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r11') +define(`nn', `%rbp') + +C TODO +C * Rearrange feed-in jumps for short branch forms. +C * Roll out the heavy artillery and 4-way unroll outer loop. Since feed-in +C code implodes, the blow-up will not be more than perhaps 2.5x. +C * Micro-optimise critical lead-in code blocks. +C * Clean up register use, e.g. r15 vs vp, disuse of nn, etc. +C * Write n < 4 code specifically for Zen (current code is for Haswell). + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + cmp $4, R32(n) + jae L(big) + + mov vp_param, vp + mov (up), %rdx + + cmp $2, R32(n) + jae L(gt1) +L(n1): imul (vp), %rdx + mov %rdx, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp), %r9 + mulx( %r9, %rax, %rdx) + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp), %r9 + mulx( %r9, %rax, %r10) C u0 x v0 + mov %rax, (rp) + mov 8(up), %rdx + mulx( %r9, %rax, %rdx) C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r8 + mov (up), %rdx + mulx( %r8, %rax, %rdx) C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r8 C u1 x v1 + add %r8, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret + + ALIGN(16) +L(big): push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + mov (up), %r9 + lea -8(up,n,8), up + lea -40(rp,n,8), rp + + mov $4, R32(%r14) + sub n, %r14 + mov -8(vp_param,n,8), %rbp + imul %r9, %rbp + lea 8(vp_param), %r15 + mov (vp_param), %rdx + + test $1, R8(%r14) + jnz L(mx0) +L(mx1): test $2, R8(%r14) + jz L(mb3) + +L(mb1): mulx( %r9, %rbx, %rax) + lea -2(%r14), n + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r9,%r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r11,%r10 + jmp L(mlo1) + +L(mb3): mulx( %r9, %r11, %r10) + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r13,%r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%rbx,%rax + lea (%r14), n + jrcxz L(x) 
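C Editorial note (added in review, not from the upstream file): jrcxz branches
C when the counter in %rcx is zero, and it only has a short 8-bit
C displacement, so the zero case bounces through the nearby L(x) stub to reach
C the distant L(mcor) corner code, while the non-zero case falls into the jmp
C to L(mlo3).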
+ jmp L(mlo3) +L(x): jmp L(mcor) + +L(mb2): mulx( %r9, %r13, %r12) + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%rbx,%rax + lea -1(%r14), n + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r9,%r8 + jmp L(mlo2) + +L(mx0): test $2, R8(%r14) + jz L(mb2) + +L(mb0): mulx( %r9, %r9, %r8) + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r11,%r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r13,%r12 + lea -3(%r14), n + jmp L(mlo0) + + ALIGN(16) +L(mtop):jrcxz L(mend) + adc %r8, %r11 + mov %r9, (rp,n,8) +L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r10, %r13 + mov %r11, 8(rp,n,8) +L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r12, %rbx + mov %r13, 16(rp,n,8) +L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rax, %r9 + mov %rbx, 24(rp,n,8) +L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + lea 4(n), n + jmp L(mtop) + +L(mend):mov %r9, (rp) + adc %r8, %r11 + mov %r11, 8(rp) + adc %r10, %r13 + mov %r13, 16(rp) + adc %r12, %rbx + mov %rbx, 24(rp) + +L(outer): + mulx( (up), %r10, %r8) C FIXME r8 unused (use imul?) + adc %rax, %rbp + add %r10, %rbp + mov (%r15), %rdx + add $8, %r15 + mov -24(up,%r14,8), %r8 + lea -8(up), up + + test $1, R8(%r14) + jz L(x0) +L(x1): test $2, R8(%r14) + jnz L(b3) + +L(b1): mulx( %r8, %rbx, %rax) + lea -1(%r14), n + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (%rsi,%rcx,8),%r9,%r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 0x8(%rsi,%rcx,8),%r11,%r10 + jmp L(lo1) + +L(x0): test $2, R8(%r14) + jz L(b2) + +L(b0): mulx( %r8, %r9, %r8) + lea -2(%r14), n + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r11,%r10 + .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (%rsi,%r14,8),%r13,%r12 + jmp L(lo0) + +L(b3): mulx( %r8, %r11, %r10) + lea 1(%r14), n + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r13,%r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%rbx,%rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + jrcxz L(cor) + jmp L(lo3) + +L(cor): add 8(rp), %r11 + mov 16(rp), %r10 + mov 24(rp), %r12 +L(mcor):mov %r11, 8(rp) + adc %r10, %r13 + adc %r12, %rbx + mulx( (up), %r10, %r8) C FIXME r8 unused (use imul?) + adc %rax, %rbp + add %r10, %rbp + mov (%r15), %rdx + mov -24(up), %r8 + mulx( %r8, %r9, %r12) + mulx( -16,(up), %r14, %rax) + add %r12, %r14 + adc $0, %rax + adc %r9, %r13 + mov %r13, 16(rp) + adc %r14, %rbx + mulx( -8,(up), %r10, %r8) C FIXME r8 unused (use imul?) + adc %rax, %rbp + add %r10, %rbp + mov 8(%r15), %rdx + mulx( -24,(up), %r14, %rax) + add %r14, %rbx + mov %rbx, 24(rp) + mulx( -16,(up), %r10, %r8) C FIXME r8 unused (use imul?) 
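C Editorial note (added in review, not from the upstream file): in this final
C wind-down only the low halves of the remaining cross products can reach the
C n-limb low product being formed, so the high-half outputs of these mulx
C instructions (%r8 above) are dead; that is what the FIXME comments mean by
C suggesting a plain imul instead.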
+ adc %rax, %rbp + add %r10, %rbp + mov %rbp, 32(rp) + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(b2): mulx( %r8, %r13, %r12) + lea (%r14), n + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%rbx,%rax + add %r12, %rbx + adc $0, %rax + .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%r9,%r8 + jmp L(lo2) + + ALIGN(16) +L(top): add %r9, (rp,n,8) +L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) +L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) +L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 +L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + js L(top) + + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + inc %r14 + jmp L(outer) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/zen/popcount.asm b/gmp-6.3.0/mpn/x86_64/zen/popcount.asm new file mode 100644 index 0000000..be1613b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/popcount.asm @@ -0,0 +1,38 @@ +dnl AMD64 mpn_popcount -- population count. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86_64/coreinhm/popcount.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/rshift.asm b/gmp-6.3.0/mpn/x86_64/zen/rshift.asm new file mode 100644 index 0000000..0196870 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/rshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_rshift optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86_64/fastsse/rshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm new file mode 100644 index 0000000..0c24de5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm @@ -0,0 +1,507 @@ +dnl AMD64 mpn_sbpi1_bdiv_r optimised for AMD Zen + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(`up', `%rdi') +define(`un_param', `%rsi') +define(`dp_param', `%rdx') +define(`dn_param', `%rcx') +define(`dinv', `%r8') + +define(`i', `%rcx') +define(`dn', `%r14') + +define(`dp', `%rsi') +define(`un', `%r15') + +C TODO +C * The o1...o8 loops for special dn counts were naively hand-optimised by +C folding the generic loops. They can probably be tuned. The speculative +C quotient limb generation might not be in the optimal spot. +C * Perhaps avoid late-in-loop jumps, e.g., lo0. +C * Improve regalloc wrt dn_param/dn and un_param/un to save some moves. 
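C Editorial note (added in review, not from the upstream file): each trip
C through the outer loops below performs one Hensel (2-adic) reduction step:
C a quotient limb q is derived from the current low limb of U and the
C precomputed inverse dinv, q*D is accumulated onto U so that the low limb
C cancels, and the window then advances one limb; after un-dn such steps the
C high limbs of U hold the reduced value and the running carry kept in %rbp is
C returned at L(ret).  A rough C model of a single step, assuming 64-bit
C limbs, a compiler with unsigned __int128, illustrative names, and the sign
C convention implied by the add-based update in this file
C (dinv * dp[0] == -1 mod 2^64):
C
C   #include <stdint.h>
C   #include <stddef.h>
C
C   static uint64_t                  /* up[0..dn-1] += q*dp, return carry limb */
C   addmul_1_model (uint64_t *up, const uint64_t *dp, ptrdiff_t dn, uint64_t q)
C   {
C     uint64_t cy = 0;
C     for (ptrdiff_t i = 0; i < dn; i++)
C       {
C         unsigned __int128 t = (unsigned __int128) q * dp[i] + up[i] + cy;
C         up[i] = (uint64_t) t;
C         cy = (uint64_t) (t >> 64);
C       }
C     return cy;
C   }
C
C   static uint64_t                  /* one reduction step on {up,dn} */
C   bdiv_r_step (uint64_t *up, const uint64_t *dp, ptrdiff_t dn, uint64_t dinv)
C   {
C     uint64_t q = up[0] * dinv;             /* q*dp[0] == -up[0] (mod 2^64) */
C     return addmul_1_model (up, dp, dn, q); /* so up[0] becomes zero        */
C   }
C
C The unrolled code below fuses this step with the speculative computation of
C the next quotient limb instead of performing them one after the other.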
+ +C ABI_SUPPORT(DOS64) +C ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sbpi1_bdiv_r) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), dinv ') + push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + sub dn_param, un_param C outer loop count + mov dn_param, dn C FIXME: Suppress by reg re-alloc + push dinv C keep dinv on stack + mov un_param, un C FIXME: Suppress by reg re-alloc + xor R32(%rbp), R32(%rbp) + + lea (dp_param,dn_param,8), dp + + mov (up), %rdx + imul dinv, %rdx C first quotient limb + + neg dn + lea -32(up,dn_param,8), up + + test $1, R8(dn_param) + jnz L(cx1) + +L(cx0): test $2, R8(dn_param) + jnz L(b2) + + +C ============================================================================= +L(b0): cmp $-4, dn + jnz L(gt4) + +L(o4): mulx( -32,(dp), %r9, %r14) + mulx( -24,(dp), %r11, %r10) + mulx( -16,(dp), %r13, %r12) + mulx( -8,(dp), %rbx, %rax) + add %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add (up), %r9 + adc 8(up), %r11 + mov %r8, %rdx C dinv + mov %r11, 8(up) + mulx( %r11, %rdx, %r12) C next quotient + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o4) + jmp L(ret) + +L(gt4): cmp $-8, dn + jnz L(out0) + +L(o8): mulx( -64,(dp), %r9, %r14) + mulx( -56,(dp), %rcx, %r10) + mulx( -48,(dp), %r13, %r12) + mulx( -40,(dp), %rbx, %rax) + add %r14, %rcx + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add -32(up), %r9 + mulx( -32,(dp), %r9, %r14) + adc -24(up), %rcx + mov %rcx, -24(up) + mulx( -24,(dp), %r11, %r10) + adc %r13, -16(up) + mulx( -16,(dp), %r13, %r12) + adc %rbx, -8(up) + adc %rax, %r9 + mulx( -8,(dp), %rbx, %rax) + adc %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov %r8, %rdx C dinv + mulx( %rcx, %rdx, %r12) C next quotient + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o8) + jmp L(ret) + +L(out0):mov dn, i + .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (dp,dn,8),%r9,%r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(dp,dn,8),%r11,%r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(dp,dn,8),%r13,%r12 + clc + jmp L(lo0) + + ALIGN(16) +L(top0):add %r9, (up,i,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8 + adc %r11, 8(up,i,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10 + adc %r13, 16(up,i,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12 + adc %rbx, 24(up,i,8) + adc %rax, %r9 +L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, i + js L(top0) + + mov (%rsp), %rdx C dinv + .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(%rdi,%r14,8),%rdx,%r12 + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(out0) + jmp L(ret) + +L(cx1): test $2, R8(dn_param) + jnz L(b3) + +C ============================================================================= +L(b1): cmp $-1, dn + jnz L(gt1) + + mov 24(up), %r9 +L(o1): mulx( -8,(dp), %rbx, %rdx) + add %r9, %rbx + adc %rbp, %rdx + add 32(up), %rdx + setc R8(%rbp) + mov %rdx, %r9 + mulx( %r8, %rdx, %r12) C next quotient + lea 8(up), up + dec un + jne L(o1) + mov %r9, 24(up) + jmp L(ret) + +L(gt1): cmp $-5, dn + jnz L(out1) + +L(o5): mulx( -40,(dp), 
%rbx, %rax) + mulx( -32,(dp), %r9, %r14) + mulx( -24,(dp), %r11, %r10) + mulx( -16,(dp), %r13, %r12) + add -8(up), %rbx + adc %rax, %r9 + mulx( -8,(dp), %rbx, %rax) + adc %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add (up), %r9 + mov %r9, (up) + mov %r8, %rdx C dinv + mulx( %r9, %rdx, %r12) C next quotient + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o5) + jmp L(ret) + +L(out1):lea 1(dn), i + .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (dp,dn,8),%rbx,%rax + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(dp,dn,8),%r9,%r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(dp,dn,8),%r11,%r10 + clc + jmp L(lo1) + + ALIGN(16) +L(top1):add %r9, (up,i,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8 + adc %r11, 8(up,i,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10 + adc %r13, 16(up,i,8) +L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12 + adc %rbx, 24(up,i,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, i + js L(top1) + + mov (%rsp), %rdx C dinv + .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12 + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(out1) + jmp L(ret) + +C ============================================================================= +L(b2): cmp $-2, dn + jnz L(gt2) + + mov 16(up), %r10 + mov 24(up), %r9 +L(o2): mulx( -16,(dp), %r13, %r12) + mulx( -8,(dp), %rbx, %rax) + add %r12, %rbx + adc $0, %rax + add %r10, %r13 C 0 add just to produce carry + mov %r9, %r10 C 1 + adc %rbx, %r10 C 1 + mov %r8, %rdx + mulx( %r10, %rdx, %r12) C next quotient + adc %rbp, %rax C 2 + setc R8(%rbp) C 3 + mov 32(up), %r9 C 2 + add %rax, %r9 C 2 + adc $0, R32(%rbp) C 3 + lea 8(up), up + dec un + jne L(o2) + mov %r10, 16(up) + mov %r9, 24(up) + jmp L(ret) + +L(gt2): cmp $-6, dn + jnz L(out2) + +L(o6): mulx( -48,(dp), %r13, %r12) + mulx( -40,(dp), %rcx, %rax) + add %r12, %rcx + adc $0, %rax + mulx( -32,(dp), %r9, %r14) + mulx( -24,(dp), %r11, %r10) + add -16(up), %r13 + mulx( -16,(dp), %r13, %r12) + adc -8(up), %rcx + mov %rcx, -8(up) + adc %rax, %r9 + mulx( -8,(dp), %rbx, %rax) + adc %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov %r8, %rdx C dinv + mulx( %rcx, %rdx, %r12) C next quotient + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o6) + jmp L(ret) + +L(out2):lea 2(dn), i + .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (dp,dn,8),%r13,%r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(dp,dn,8),%rbx,%rax + add %r12, %rbx + adc $0, %rax + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(dp,dn,8),%r9,%r8 + jmp L(lo2) + + ALIGN(16) +L(top2):add %r9, (up,i,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8 + adc %r11, 8(up,i,8) +L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10 + adc %r13, 16(up,i,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12 + adc %rbx, 24(up,i,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, 
%rax + add $4, i + js L(top2) + + mov (%rsp), %rdx C dinv + .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12 + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(out2) + jmp L(ret) + +C ============================================================================= +L(b3): cmp $-3, dn + jnz L(gt3) + + mov 8(up), %r14 + mov 16(up), %r9 + mov 24(up), %rcx +L(o3): mulx( -24,(dp), %r11, %r10) + mulx( -16,(dp), %r13, %r12) + mulx( -8,(dp), %rbx, %rax) + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add %r14, %r11 + mov %r9, %r14 + adc %r13, %r14 + mov %rcx, %r9 + mov %r8, %rdx C dinv + mulx( %r14, %rdx, %r12) C next quotient + adc %rbx, %r9 + adc %rbp, %rax + setc R8(%rbp) + mov 32(up), %rcx + add %rax, %rcx + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o3) + mov %r14, 8(up) + mov %r9, 16(up) + mov %rcx, 24(up) + jmp L(ret) + +L(gt3): cmp $-7, dn + jnz L(out3) + +L(o7): mulx( -56,(dp), %r11, %r10) + mulx( -48,(dp), %rcx, %r12) + mulx( -40,(dp), %rbx, %rax) + add %r10, %rcx + adc %r12, %rbx + adc $0, %rax + mulx( -32,(dp), %r9, %r14) + add -24(up), %r11 + mulx( -24,(dp), %r11, %r10) + adc -16(up), %rcx + mov %rcx, -16(up) + mulx( -16,(dp), %r13, %r12) + adc %rbx, -8(up) + adc %rax, %r9 + mulx( -8,(dp), %rbx, %rax) + adc %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov %r8, %rdx C dinv + mulx( %rcx, %rdx, %r12) C next quotient + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o7) + jmp L(ret) + +L(out3):lea 3(dn), i + .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (dp,dn,8),%r11,%r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(dp,dn,8),%r13,%r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(dp,dn,8),%rbx,%rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + jmp L(lo3) + + ALIGN(16) +L(top3):add %r9, (up,i,8) +L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8 + adc %r11, 8(up,i,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10 + adc %r13, 16(up,i,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12 + adc %rbx, 24(up,i,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, i + js L(top3) + + mov (%rsp), %rdx C dinv + .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12 + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(out3) + +L(ret): mov %rbp, %rax + pop %rsi C dummy dealloc + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm new file mode 100644 index 0000000..a7c6127 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm @@ -0,0 +1,482 @@ +dnl AMD64 mpn_sqr_basecase optimised for AMD Zen. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Do overlapped software pipelining. This should close the remaining gap to +C mul_basecase. +C +C * Update un just once in the outer loop. +C +C * Perhaps keep un and n pre-multiplied by 8, thus suppressing ",8" from +C loads and stores. At least in some cases, the non-scaled form is faster. +C +C * Optimise xit3 code, e.g., using shrx and sarx like in the main loop. +C +C * The mul_1 feed-in code has gotten little attention and could probably be +C improved. Perhaps even expand it to 4 separate loops to allow straight +C fall-through into the 4 addmul_1 loops. +C +C * Clean up ad-hoc scratch register usage in the addmul_1 feed-in code blocks. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') + +define(`un', `%rbp') +define(`n', `%rcx') + +C these are used just for the small op code +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + cmp $2, R32(un_param) + jae L(gt1) + + mov (up), %rdx + mulx( %rdx, %rax, %rdx) + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(gt1): jne L(gt2) + + mov (up), %rdx + mov 8(up), %rcx + mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2 + mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1 + mov %rcx, %rdx + mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3 + add %r9, %r9 C W 1 + adc %r10, %r10 C W 2 + adc $0, %rdx C W 3 + add %r9, %r8 C W 1 + adc %r11, %r10 C W 2 + adc $0, %rdx C W 3 + mov %rax, (rp) + mov %r8, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + +L(gt2): cmp $4, R32(un_param) + jae L(gt3) + + push %rbx + mov (up), %rdx + mulx( 8,(up), w2, w3) + mulx( 16,(up), w0, w1) + add w3, w0 + mov 8(up), %rdx + mulx( 16,(up), %rax, w3) + adc %rax, w1 + adc $0, w3 + test R32(%rbx), R32(%rbx) + mov (up), %rdx + mulx( %rdx, %rbx, %rcx) + mov %rbx, (rp) + mov 8(up), %rdx + mulx( %rdx, %rax, %rbx) + mov 16(up), %rdx + mulx( %rdx, %rsi, %rdx) + adcx( w2, w2) + adcx( w0, w0) + adcx( w1, w1) + adcx( w3, w3) + adox( w2, %rcx) + adox( w0, %rax) + adox( w1, %rbx) + adox( w3, %rsi) + mov $0, R32(%r8) + adox( %r8, %rdx) + adcx( %r8, %rdx) + mov %rcx, 8(rp) + mov %rax, 16(rp) + mov %rbx, 24(rp) + mov %rsi, 32(rp) + mov %rdx, 40(rp) + pop %rbx + FUNC_EXIT() + ret + +L(gt3): push %r15 +C push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + mov R32(un_param), R32(un) + + mov (up), %rdx C up[0] + mov 8(up), %r9 C up[1] + + mulx( %rdx, %rax, %r15) C up[0]^2 + mov %rax, (rp) + shl %rdx + + 
lea (up,un,8), up + lea -32(rp,un,8), rp + + neg un + lea 4(un), n + and $-4, n + + test $1, R8(un) + jnz L(mx0) +L(mx1): test $2, R8(un) + jz L(mb3) + +L(mb1): mulx( %r9, %rbx, %rax) + .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x10 C mulx 16(up,un,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x18 C mulx 24(up,un,8), %r11, %r10 + add %r15, %rbx + jmp L(mlo1) + +L(mb3): mulx( %r9, %r11, %r10) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x10 C mulx 16(up,un,8), %r13, %r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x18 C mulx 24(up,un,8), %rbx, %rax + add %r15, %r11 + jrcxz L(n4) + jmp L(mlo3) +L(n4): mov %r11, 8(rp) + adc %r10, %r13 + adc %r12, %rbx + jmp L(m) + +L(mx0): test $2, R8(un) + jnz L(mb0) + +L(mb2): mulx( %r9, %r13, %r12) + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10 C mulx 16(up,un,8), %rbx, %rax + .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x18 C mulx 24(up,un,8), %r9, %r8 + add %r15, %r13 + jmp L(mlo2) + +L(mb0): mulx( %r9, %r9, %r8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x10 C mulx 16(up,un,8), %r11, %r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x18 C mulx 24(up,un,8), %r13, %r12 + add %r15, %r9 + jmp L(mlo0) + + ALIGN(16) +L(mtop):jrcxz L(mend) + adc %r8, %r11 + mov %r9, (rp,n,8) +L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r10, %r13 + mov %r11, 8(rp,n,8) +L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r12, %rbx + mov %r13, 16(rp,n,8) +L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rax, %r9 + mov %rbx, 24(rp,n,8) +L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + lea 4(n), n + jmp L(mtop) + +L(mend):mov %r9, (rp) + adc %r8, %r11 + mov %r11, 8(rp) + adc %r10, %r13 + mov %r13, 16(rp) + adc %r12, %rbx + adc $0, %rax + mov %rbx, 24(rp) + mov %rax, 32(rp) + + lea 2(un), un + + mov $63, R32(%r15) C keep at 63 for shrx/sarx. 
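+
+C Each remaining addmul_1 pass is entered through one of four feed-in blocks
+C selected on un mod 4 (L(f0)..L(f3) below); %r15 = 63 is the shrx/sarx shift
+C count used in those blocks to form the "u0" multiplier and the "ci" top-bit
+C correction noted in the per-block comments.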
+ test $1, R8(un) + jz L(x0) +L(x1): test $2, R8(un) + jz L(f3) + jmp L(f1) +L(x0): test $2, R8(un) + jz L(f0) +C jmp L(f2) + +L(f2): mov -8(up,un,8), %rdx C up[0] + lea 2(un), n + lea 8(rp), rp + .byte 0xc4,0x62,0x82,0xf7,0x5c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r11 + .byte 0xc4,0x62,0x83,0xf7,0x6c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r13 + and %rdx, %r11 C "ci" in C code + mulx( %rdx, %rax, %r10) C up[0]^2 + lea (%r13,%rdx,2), %rdx C "u0" arg in C code + add %rax, %r11 + + .byte 0xc4,0x62,0x93,0xf6,0x24,0xee C mulx (up,un,8), %r13, %r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x08 C mulx 8(up,un,8), %rbx, %rax + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + jmp L(b2) + + ALIGN(16) +L(top2):add %r9, (rp,n,8) +L(b2): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(top2) + + inc un + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + +L(f1): mov -8(up,un,8), %rdx C up[0] + lea 1(un), n + lea 8(rp), rp + .byte 0xc4,0x62,0x82,0xf7,0x6c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r13 + .byte 0xc4,0xe2,0x83,0xf7,0x5c,0xee,0xf0 C shrx %r15, -16(up,un,8), %rbx + and %rdx, %r13 C "ci" in C code + mulx( %rdx, %rax, %r12) C up[0]^2 + lea (%rbx,%rdx,2), %rdx C "u0" arg in C code + add %rax, %r13 + + .byte 0xc4,0xe2,0xe3,0xf6,0x04,0xee C mulx (up,un,8), %rbx, %rax + adc %r12, %rbx + adc $0, %rax + .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x08 C mulx 8(up,un,8), %r9, %r8 + jmp L(b1) + + ALIGN(16) +L(top1):add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) +L(b1): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(top1) + + inc un + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + +L(f0): mov -8(up,un,8), %rdx C up[0] + lea (un), n + lea 8(rp), rp + .byte 0xc4,0xe2,0x82,0xf7,0x5c,0xee,0xf0 C sarx %r15, -16(up,un,8), %rbx + .byte 0xc4,0x62,0x83,0xf7,0x4c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r9 + and %rdx, %rbx C "ci" in C code + mulx( %rdx, %r10, %rax) C up[0]^2 + lea (%r9,%rdx,2), %rdx C "u0" arg in C code + add %r10, %rbx + adc $0, %rax C "cin" in C code + + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,un,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x08 C mulx 8(up,un,8), %r11, %r10 + jmp L(b0) + + ALIGN(16) +L(top0):add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) +L(b0): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(top0) + + inc un + add %r9, (rp) + adc %r11, 8(rp) + adc 
%r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + +L(f3): mov -8(up,un,8), %rdx C up[0] + lea 3(un), n + lea 8(rp), rp + .byte 0xc4,0x62,0x82,0xf7,0x4c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r9 + .byte 0xc4,0x62,0x83,0xf7,0x5c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r11 + and %rdx, %r9 C "ci" in C code + mulx( %rdx, %rax, %r8) C up[0]^2 + lea (%r11,%rdx,2), %rdx C "u0" arg in C code + add %rax, %r9 + + .byte 0xc4,0x62,0xa3,0xf6,0x14,0xee C mulx (%rsi,%rbp,8),%r11,%r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x08 C mulx 0x8(%rsi,%rbp,8),%r13,%r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10 C mulx 0x10(%rsi,%rbp,8),%rbx,%rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + jrcxz L(xit3) + jmp L(top3) C FIXME perhaps fall through + + ALIGN(16) +L(top3):add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(top3) + + inc un + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + jmp L(f2) + + +L(xit3):add %r9, (rp) + adc %r11, 8(rp) + adc 16(rp), %r13 + adc 24(rp), %rbx +L(m): adc $0, %rax + mov %rax, 32(rp) + mov -24(up), %rdx C FIXME: CSE + mov -32(up), %r9 C FIXME: CSE + sar $63, %r9 + and %rdx, %r9 + add %r13, %r9 + mulx( %rdx, %rax, %r10) + mov -16(up), %r8 C FIXME: CSE + adc $0, %r10 + add %rax, %r9 + adc $0, %r10 + mov %r9, 16(rp) + mov -32(up), %rax + shl %rax + adc %rdx, %rdx + mulx( %r8, %r13, %r12) + mulx( -8,(up), %r11, %rax) C FIXME: CSE + add %r10, %r13 + adc %r12, %r11 + adc $0, %rax + add %rbx, %r13 + mov %r13, 24(rp) + adc 32(rp), %r11 + adc $0, %rax + mov -16(up), %rdx C FIXME: CSE + mov -8(up), %r8 C FIXME: CSE + mov -24(up), %r9 + sar $63, %r9 + and %rdx, %r9 + add %r11, %r9 + mulx( %rdx, %rbp, %r10) + adc $0, %r10 + add %rbp, %r9 + adc $0, %r10 + mov %r9, 32(rp) + mov -24(up), %rbp + shl %rbp + adc %rdx, %rdx + mulx( %r8, %rbx, %rbp) + add %r10, %rbx + adc $0, %rbp + adc %rbx, %rax + mov %rax, 40(rp) + adc $0, %rbp + mov -8(up), %rdx C FIXME: CSE + mov -16(up), %r9 C FIXME: CSE + sar $63, %r9 + and %rdx, %r9 + add %rbp, %r9 + mulx( %rdx, %rbp, %r10) + adc $0, %r10 + add %rbp, %r9 + adc $0, %r10 + mov %r9, 48(rp) + mov %r10, 56(rp) + + pop %rbx + pop %rbp + pop %r12 + pop %r13 +C pop %r14 + pop %r15 + + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm new file mode 100644 index 0000000..00f6dc9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sublsh1_n, mpn_sublsh1_nc. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc) +include_mpn(`x86_64/atom/sublsh1_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h new file mode 100644 index 0000000..3748c5f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h @@ -0,0 +1,276 @@ +/* AMD Zen2 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3600-4400 MHz Matisse */ +/* FFT tuning limit = 703,392,483 */ +/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 27 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 1 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 13 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 385 + +#define MUL_TOOM22_THRESHOLD 19 +#define MUL_TOOM33_THRESHOLD 125 +#define MUL_TOOM44_THRESHOLD 196 +#define MUL_TOOM6H_THRESHOLD 276 +#define MUL_TOOM8H_THRESHOLD 369 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 132 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 185 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 315 +#define SQR_TOOM6_THRESHOLD 446 +#define SQR_TOOM8_THRESHOLD 527 + +#define MULMID_TOOM42_THRESHOLD 38 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 20 + +#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 436, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,12}, { 95,11}, { 191,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 671,10}, \ + { 1343,12}, { 351,11}, { 703,10}, { 1407,11}, \ + { 735,13}, { 191,12}, { 383,11}, { 767,10}, \ + { 1535,11}, { 799,12}, { 415,11}, { 831,10}, \ + { 1663,12}, { 447,11}, { 895,12}, { 479,14}, \ + { 127,13}, { 255,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,11}, \ + { 1215,10}, { 2431,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 671,11}, { 1343,10}, { 2687,12}, \ + { 703,11}, { 1471,10}, { 2943,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 959,11}, \ + { 1919,10}, { 3839,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \ + { 703,12}, { 1471,11}, { 2943,14}, { 383,13}, \ + { 767,12}, { 1599,11}, { 3199,13}, { 831,12}, \ + { 1727,13}, { 895,12}, { 1791,13}, { 959,12}, \ + { 
1919,11}, { 3839,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,12}, { 2943,11}, \ + { 5887,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1727,12}, { 3455,14}, { 895,13}, { 1919,12}, \ + { 3839,11}, { 7679,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \ + { 5887,15}, { 767,14}, { 1535,13}, { 3199,14}, \ + { 1663,13}, { 3455,12}, { 6911,14}, { 1919,13}, \ + { 3839,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4479,12}, { 8959,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,15}, { 1791,14}, { 3839,13}, \ + { 7679,14}, { 3967,16}, { 1023,15}, { 2047,14}, \ + { 4479,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,13}, \ + { 15359,17}, { 1023,16}, { 2047,15}, { 4351,14}, \ + { 8959,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,15}, { 7935,17}, { 2047,16}, \ + { 4095,15}, { 8959,16}, { 4607,15}, { 9983,14}, \ + { 19967,16}, { 5631,15}, { 11775,17}, { 3071,16}, \ + { 7679,15}, { 15871,18}, { 2047,17}, { 4095,16}, \ + { 9727,15}, { 19967,17}, { 5119,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 275 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 396, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,12}, { 95,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \ + { 1343,11}, { 351,10}, { 703,11}, { 367,10}, \ + { 735,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \ + { 1279,11}, { 671,10}, { 1343,12}, { 351,11}, \ + { 703,10}, { 1407,11}, { 735,10}, { 1471,12}, \ + { 383,11}, { 767,10}, { 1535,11}, { 799,12}, \ + { 415,11}, { 831,10}, { 1663,12}, { 447,11}, \ + { 895,12}, { 479,11}, { 959,14}, { 127,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,10}, { 2687,12}, { 703,11}, \ + { 1407,12}, { 735,11}, { 1471,10}, { 2943,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 799,11}, \ + { 1599,12}, { 831,11}, { 1663,13}, { 447,12}, \ + { 959,11}, { 1919,10}, { 3839,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \ + { 703,12}, { 1471,11}, { 2943,14}, { 383,13}, \ + { 767,12}, { 1599,13}, { 831,12}, { 1727,11}, \ + { 3455,13}, { 959,12}, { 1919,11}, { 
3839,14}, \ + { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,11}, { 4863,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1471,12}, \ + { 2943,11}, { 5887,14}, { 767,13}, { 1599,12}, \ + { 3199,13}, { 1727,12}, { 3455,14}, { 895,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \ + { 5887,15}, { 767,14}, { 1535,13}, { 3199,14}, \ + { 1663,13}, { 3455,12}, { 6911,14}, { 1919,13}, \ + { 3839,12}, { 7679,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,14}, { 3967,16}, { 1023,15}, \ + { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5247,15}, { 2815,14}, { 5887,13}, \ + { 11775,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4095,14}, { 8191,15}, \ + { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 7679,15}, \ + { 15359,18}, { 2047,17}, { 4095,16}, { 9727,15}, \ + { 19967,17}, { 5119,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 282 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 57 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 43 +#define DC_DIVAPPR_Q_THRESHOLD 154 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 93 + +#define INV_MULMOD_BNM1_THRESHOLD 36 +#define INV_NEWTON_THRESHOLD 141 +#define INV_APPR_THRESHOLD 149 + +#define BINV_NEWTON_THRESHOLD 264 +#define REDC_1_TO_REDC_N_THRESHOLD 47 + +#define MU_DIV_QR_THRESHOLD 1470 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 47 +#define MU_BDIV_QR_THRESHOLD 1187 +#define MU_BDIV_Q_THRESHOLD 1589 + +#define POWM_SEC_TABLE 3,22,194,579 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 195 +#define SET_STR_PRECOMPUTE_THRESHOLD 1752 + +#define FAC_DSC_THRESHOLD 345 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 24 +#define HGCD2_DIV1_METHOD 1 /* 11.29% faster than 3 */ +#define HGCD_THRESHOLD 89 +#define HGCD_APPR_THRESHOLD 96 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 465 +#define GCDEXT_DC_THRESHOLD 233 +#define JACOBI_BASE_METHOD 1 /* 25.56% faster than 4 */ + +/* Tuneup completed successfully, took 294200 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm new file mode 100644 index 0000000..7c1ecd0 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addmul_1. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1) +include_mpn(`x86_64/coreibwl/addmul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h new file mode 100644 index 0000000..ffba1c5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h @@ -0,0 +1,222 @@ +/* AMD Zen3 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3800-4700 MHz Vermeer */ +/* FFT tuning limit = 10,000,000 */ +/* Generated by tuneup.c, 2021-01-01, gcc 9.3 */ + +#define MOD_1_NORM_THRESHOLD 64 +#define MOD_1_UNNORM_THRESHOLD 85 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 35 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 9 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 15 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 18 + +#define DIV_1_VS_MUL_1_PERCENT 398 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 89 +#define MUL_TOOM44_THRESHOLD 130 +#define MUL_TOOM6H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 418 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 87 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 94 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 109 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 414 +#define SQR_TOOM8_THRESHOLD 592 + +#define MULMID_TOOM42_THRESHOLD 38 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 332 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 332, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 35, 8}, \ + { 73, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 79,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319, 8}, { 639,10}, { 175, 9}, \ + { 351,11}, { 95,10}, { 191, 9}, { 383,10}, \ + { 207,11}, { 111,10}, { 223,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 303, 9}, { 607,11}, { 159,10}, { 319, 9}, \ + { 639,11}, { 175,10}, { 351,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,11}, \ + { 223,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 415,12}, { 223,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,12}, \ + { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \ + { 319,11}, { 639,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,10}, { 1663,12}, { 447,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 671,11}, { 1343,10}, { 2687,12}, \ + { 703,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 895,11}, \ + { 1791,12}, { 959,11}, { 1919,10}, { 3839,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1151,11}, { 2303,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1279, 8}, { 24063,10}, { 6399,11}, \ + { 3327,13}, { 895,12}, { 1791,13}, { 959,12}, \ + { 
1919,11}, { 3839,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,12}, { 2431,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 186 +#define MUL_FFT_THRESHOLD 3264 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 43, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 43,10}, \ + { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 303, 9}, { 607,11}, { 159,10}, \ + { 319, 9}, { 639,12}, { 95,11}, { 191,10}, \ + { 383,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,12}, { 223,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 607,11}, \ + { 1215,13}, { 319,12}, { 671,11}, { 1343,12}, \ + { 735,13}, { 383,12}, { 799,11}, { 1599,10}, \ + { 3199,12}, { 831,13}, { 447,12}, { 895,11}, \ + { 1791,12}, { 959,11}, { 1919,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \ + { 1343,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1599,13}, { 831,12}, { 1663,13}, \ + { 895,12}, { 1791,13}, { 959,12}, { 1919,15}, \ + { 255,14}, { 511,13}, { 1023, 9}, { 17919,10}, \ + { 9727,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 161 +#define SQR_FFT_THRESHOLD 2624 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 31 +#define MULLO_MUL_N_THRESHOLD 6440 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 129 +#define SQRLO_SQR_THRESHOLD 5103 + +#define DC_DIV_QR_THRESHOLD 19 +#define DC_DIVAPPR_Q_THRESHOLD 123 +#define DC_BDIV_QR_THRESHOLD 79 +#define DC_BDIV_Q_THRESHOLD 154 + +#define INV_MULMOD_BNM1_THRESHOLD 42 +#define INV_NEWTON_THRESHOLD 107 +#define INV_APPR_THRESHOLD 107 + +#define BINV_NEWTON_THRESHOLD 312 +#define REDC_1_TO_REDC_N_THRESHOLD 77 + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1258 +#define MUPI_DIV_QR_THRESHOLD 30 +#define MU_BDIV_QR_THRESHOLD 1120 +#define MU_BDIV_Q_THRESHOLD 1394 + +#define POWM_SEC_TABLE 6,19,203,579,2245 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 115 +#define SET_STR_PRECOMPUTE_THRESHOLD 1941 + +#define FAC_DSC_THRESHOLD 182 +#define FAC_ODD_THRESHOLD 44 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD2_DIV1_METHOD 1 /* 13.04% faster than 3 */ +#define HGCD_THRESHOLD 65 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2121 
+#define GCD_DC_THRESHOLD 293 +#define GCDEXT_DC_THRESHOLD 186 +#define JACOBI_BASE_METHOD 1 /* 12.79% faster than 3 */ diff --git a/gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm b/gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm new file mode 100644 index 0000000..6f1e286 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm @@ -0,0 +1,208 @@ +dnl AMD64 mpn_mul_1 optimised for Intel Broadwell. + +dnl Copyright 2015, 2017, 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 n/a +C AMD bd2 n/a +C AMD bd3 n/a +C AMD bd4 ? +C AMD zn1 ? +C AMD zn2 1.6 +C AMD zn3 1.5 +C AMD bt1 n/a +C AMD bt2 n/a +C Intel P4 n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel WSM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL n/a +C Intel BWL ? +C Intel SKL ? +C Intel atom n/a +C Intel SLM n/a +C Intel GLM n/a +C VIA nano n/a + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Put an initial mulx before switching, targeting some free registers. +C * Tune feed-in code. 
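+
+C Functional sketch (illustrative C, not GMP's generic code): both entry
+C points compute an n-limb by 1-limb product with an optional carry-in,
+C
+C	mp_limb_t
+C	mul_1c_sketch (mp_ptr rp, mp_srcptr up, mp_size_t n,
+C	               mp_limb_t v0, mp_limb_t ci)
+C	{
+C	  mp_limb_t hi, lo;
+C	  mp_size_t i;
+C	  for (i = 0; i < n; i++)
+C	    {
+C	      umul_ppmm (hi, lo, up[i], v0);    /* double-limb product */
+C	      lo += ci;
+C	      ci = hi + (lo < ci);              /* carry into next limb */
+C	      rp[i] = lo;
+C	    }
+C	  return ci;
+C	}
+C
+C mpn_mul_1 is the ci = 0 case.  The code below dispatches on n mod 8 into
+C an 8-way unrolled mulx/adcx loop.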
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 +define(`ci', `%r8') C stack + +define(`n', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl IFDOS(` define(`up', ``%rsi'') ') dnl +dnl IFDOS(` define(`rp', ``%rcx'') ') dnl +dnl IFDOS(` define(`vl', ``%r9'') ') dnl +dnl IFDOS(` define(`r9', ``rdi'') ') dnl +dnl IFDOS(` define(`n', ``%r8'') ') dnl +dnl IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_1c) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r11 ') +IFSTD(` mov %r8, %r11 ') + jmp L(com) +EPILOGUE() + +PROLOGUE(mpn_mul_1) + FUNC_ENTRY(4) + xor R32(%r11), R32(%r11) +L(com): + mov v0_param, %r10 + mov n_param, n + mov R32(n_param), R32(%rax) + shr $3, n + and $7, R32(%rax) C clear OF, CF as side-effect + mov %r10, %rdx + lea L(tab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %rax + lea (%rax, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%rax,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f1), L(tab)) + JMPENT( L(f2), L(tab)) + JMPENT( L(f3), L(tab)) + JMPENT( L(f4), L(tab)) + JMPENT( L(f5), L(tab)) + JMPENT( L(f6), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +L(f0): mulx( (up), %r10, %r8) + lea -8(up), up + lea -8(rp), rp + lea -1(n), n + adc %r11, %r10 + jmp L(b0) + +L(f3): mulx( (up), %r9, %rax) + lea 16(up), up + lea -48(rp), rp + adc %r11, %r9 + jmp L(b3) + +L(f4): mulx( (up), %r10, %r8) + lea 24(up), up + lea -40(rp), rp + adc %r11, %r10 + jmp L(b4) + +L(f5): mulx( (up), %r9, %rax) + lea 32(up), up + lea -32(rp), rp + adc %r11, %r9 + jmp L(b5) + +L(f6): mulx( (up), %r10, %r8) + lea 40(up), up + lea -24(rp), rp + adc %r11, %r10 + jmp L(b6) + +L(f1): mulx( (up), %r9, %rax) + adc %r11, %r9 + jrcxz L(end) + jmp L(b1) + +L(end): mov %r9, (rp) + adc %rcx, %rax C relies on rcx = 0 + FUNC_EXIT() + ret + +L(f2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) + adc %r11, %r10 + + ALIGN(32) +L(top): adcx( %r8, %r9) + mov %r10, -8(rp) + jrcxz L(end) +L(b1): mulx( 8,(up), %r10, %r8) + lea -1(n), n + mov %r9, (rp) + adcx( %rax, %r10) +L(b0): mulx( 16,(up), %r9, %rax) + adcx( %r8, %r9) + mov %r10, 8(rp) +L(b7): mulx( 24,(up), %r10, %r8) + lea 64(up), up + adcx( %rax, %r10) + mov %r9, 16(rp) +L(b6): mulx( -32,(up), %r9, %rax) + adcx( %r8, %r9) + mov %r10, 24(rp) +L(b5): mulx( -24,(up), %r10, %r8) + adcx( %rax, %r10) + mov %r9, 32(rp) +L(b4): mulx( -16,(up), %r9, %rax) + adcx( %r8, %r9) + mov %r10, 40(rp) +L(b3): mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + jmp L(top) + +L(f7): mulx( (up), %r9, %rax) + lea -16(up), up + lea -16(rp), rp + adc %r11, %r9 + jmp L(b7) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm new file mode 100644 index 0000000..f8c1b60 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_basecase. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_basecase) +include_mpn(`x86_64/coreibwl/mul_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm new file mode 100644 index 0000000..c9c3487 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sbpi1_bdiv_r. + +dnl Copyright 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sbpi1_bdiv_r) +include_mpn(`x86_64/coreibwl/sbpi1_bdiv_r.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm new file mode 100644 index 0000000..9c4f65d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sqr_basecase. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sqr_basecase) +include_mpn(`x86_64/coreibwl/sqr_basecase.asm') -- cgit v1.2.3