path: root/gmp-6.3.0/mpn/x86_64
author    Duncan Wilkie <antigravityd@gmail.com>  2023-11-18 06:11:09 -0600
committer Duncan Wilkie <antigravityd@gmail.com>  2023-11-18 06:11:09 -0600
commit    11da511c784eca003deb90c23570f0873954e0de (patch)
tree      e14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/x86_64
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64')
-rw-r--r--gmp-6.3.0/mpn/x86_64/README74
-rw-r--r--gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm168
-rw-r--r--gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h225
-rw-r--r--gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm474
-rw-r--r--gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm140
-rw-r--r--gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm170
-rw-r--r--gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm53
-rw-r--r--gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm172
-rw-r--r--gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm176
-rw-r--r--gmp-6.3.0/mpn/x86_64/aors_err1_n.asm225
-rw-r--r--gmp-6.3.0/mpn/x86_64/aors_err2_n.asm172
-rw-r--r--gmp-6.3.0/mpn/x86_64/aors_err3_n.asm156
-rw-r--r--gmp-6.3.0/mpn/x86_64/aors_n.asm178
-rw-r--r--gmp-6.3.0/mpn/x86_64/aorsmul_1.asm190
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm186
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm238
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm191
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/aors_n.asm128
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm194
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm38
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm38
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/com.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/copyd.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/copyi.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/dive_1.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h222
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/lshift.asm123
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm127
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/mul_1.asm147
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/mul_2.asm190
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/popcount.asm35
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/redc_1.asm579
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm287
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/rshift.asm121
-rw-r--r--gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm242
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/README11
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm235
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm38
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm190
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/com.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/copyd.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/copyi.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h265
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm206
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm193
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm195
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm416
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/popcount.asm191
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm96
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm142
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h263
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm38
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm96
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h266
-rw-r--r--gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm106
-rw-r--r--gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm195
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm159
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm191
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/copyd.asm91
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/copyi.asm94
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm119
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h230
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm241
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm486
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm507
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm565
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt2/com.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt2/copyd.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt2/copyi.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h240
-rw-r--r--gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm183
-rw-r--r--gmp-6.3.0/mpn/x86_64/com.asm95
-rw-r--r--gmp-6.3.0/mpn/x86_64/copyd.asm93
-rw-r--r--gmp-6.3.0/mpn/x86_64/copyi.asm92
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm53
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm53
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm38
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm225
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/aors_n.asm150
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm188
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/com.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/copyd.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/copyi.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm243
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm93
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm137
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h222
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/hamdist.asm210
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/logops_n.asm285
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/lshift.asm145
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm159
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm975
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm427
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/popcount.asm185
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/redc_1.asm430
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm169
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/rshift.asm143
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm984
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm47
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm47
-rw-r--r--gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm158
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm210
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h246
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm195
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm368
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm395
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm710
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm839
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm241
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm38
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm261
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm201
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm138
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h253
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm159
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm176
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm441
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm422
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm437
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm506
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm200
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm190
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h238
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm196
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm182
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm549
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm224
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm54
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm56
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm173
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm215
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm203
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm212
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm174
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm200
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h241
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm199
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm167
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm407
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm384
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm546
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm193
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm484
-rw-r--r--gmp-6.3.0/mpn/x86_64/darwin.m482
-rw-r--r--gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm247
-rw-r--r--gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm158
-rw-r--r--gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm200
-rw-r--r--gmp-6.3.0/mpn/x86_64/dive_1.asm158
-rw-r--r--gmp-6.3.0/mpn/x86_64/divrem_1.asm314
-rw-r--r--gmp-6.3.0/mpn/x86_64/divrem_2.asm192
-rw-r--r--gmp-6.3.0/mpn/x86_64/dos64.m4101
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm181
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm178
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/README22
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm311
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/com.asm175
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm254
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm166
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm300
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm185
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm182
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm173
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm193
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm183
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm201
-rw-r--r--gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm204
-rw-r--r--gmp-6.3.0/mpn/x86_64/fat/addmul_2.c38
-rw-r--r--gmp-6.3.0/mpn/x86_64/fat/fat.c473
-rw-r--r--gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm209
-rw-r--r--gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h72
-rw-r--r--gmp-6.3.0/mpn/x86_64/fat/mod_1.c32
-rw-r--r--gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c32
-rw-r--r--gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c32
-rw-r--r--gmp-6.3.0/mpn/x86_64/fat/redc_1.c32
-rw-r--r--gmp-6.3.0/mpn/x86_64/fat/redc_2.c32
-rw-r--r--gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c32
-rw-r--r--gmp-6.3.0/mpn/x86_64/gcd_11.asm114
-rw-r--r--gmp-6.3.0/mpn/x86_64/gcd_22.asm163
-rw-r--r--gmp-6.3.0/mpn/x86_64/gmp-mparam.h217
-rw-r--r--gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h264
-rw-r--r--gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/invert_limb.asm112
-rw-r--r--gmp-6.3.0/mpn/x86_64/invert_limb_table.asm50
-rw-r--r--gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm142
-rw-r--r--gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h248
-rw-r--r--gmp-6.3.0/mpn/x86_64/k10/hamdist.asm109
-rw-r--r--gmp-6.3.0/mpn/x86_64/k10/lshift.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/k10/popcount.asm138
-rw-r--r--gmp-6.3.0/mpn/x86_64/k10/rshift.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm153
-rw-r--r--gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm195
-rw-r--r--gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm217
-rw-r--r--gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm179
-rw-r--r--gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm249
-rw-r--r--gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h237
-rw-r--r--gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm469
-rw-r--r--gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm436
-rw-r--r--gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm559
-rw-r--r--gmp-6.3.0/mpn/x86_64/k8/redc_1.asm591
-rw-r--r--gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm807
-rw-r--r--gmp-6.3.0/mpn/x86_64/logops_n.asm260
-rw-r--r--gmp-6.3.0/mpn/x86_64/lshift.asm172
-rw-r--r--gmp-6.3.0/mpn/x86_64/lshiftc.asm182
-rw-r--r--gmp-6.3.0/mpn/x86_64/lshsub_n.asm172
-rw-r--r--gmp-6.3.0/mpn/x86_64/missing-call.m453
-rw-r--r--gmp-6.3.0/mpn/x86_64/missing-inline.m4100
-rw-r--r--gmp-6.3.0/mpn/x86_64/missing.asm130
-rw-r--r--gmp-6.3.0/mpn/x86_64/mod_1_1.asm238
-rw-r--r--gmp-6.3.0/mpn/x86_64/mod_1_2.asm241
-rw-r--r--gmp-6.3.0/mpn/x86_64/mod_1_4.asm272
-rw-r--r--gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm215
-rw-r--r--gmp-6.3.0/mpn/x86_64/mode1o.asm171
-rw-r--r--gmp-6.3.0/mpn/x86_64/mul_1.asm192
-rw-r--r--gmp-6.3.0/mpn/x86_64/mul_2.asm204
-rw-r--r--gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm157
-rw-r--r--gmp-6.3.0/mpn/x86_64/nano/copyd.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/nano/copyi.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/nano/dive_1.asm166
-rw-r--r--gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h243
-rw-r--r--gmp-6.3.0/mpn/x86_64/nano/popcount.asm35
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm196
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm50
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm50
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm203
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h257
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm167
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm35
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm334
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm169
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/popham.asm163
-rw-r--r--gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm189
-rw-r--r--gmp-6.3.0/mpn/x86_64/rshift.asm176
-rw-r--r--gmp-6.3.0/mpn/x86_64/sec_tabselect.asm176
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm50
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm50
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h252
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm38
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm38
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h246
-rw-r--r--gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm116
-rw-r--r--gmp-6.3.0/mpn/x86_64/sublsh1_n.asm160
-rw-r--r--gmp-6.3.0/mpn/x86_64/x86_64-defs.m4493
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm227
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm165
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/com.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/copyd.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/copyi.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h280
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/hamdist.asm38
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/lshift.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/mul_1.asm161
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm455
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm299
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/popcount.asm38
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/rshift.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm507
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm482
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h276
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h222
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm208
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm37
314 files changed, 53887 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/README b/gmp-6.3.0/mpn/x86_64/README
new file mode 100644
index 0000000..9c8a586
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/README
@@ -0,0 +1,74 @@
+Copyright 2003, 2004, 2006, 2008 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+ AMD64 MPN SUBROUTINES
+
+
+This directory contains mpn functions for AMD64 chips. It is also useful
+for 64-bit Pentiums and "Core 2".
+
+
+ RELEVANT OPTIMIZATION ISSUES
+
+The Opteron and Athlon64 can sustain up to 3 instructions per cycle, but in
+practice that is only possible for integer instructions. Almost any
+combination of three integer instructions can issue simultaneously,
+including any 3 ALU operations (shifts included). Up to two memory
+operations can issue each cycle.
+
+Scheduling typically requires that load-use instructions are split into
+separate load and use instructions. That requires more decode resources,
+and it is rarely a win. The Opteron/Athlon64 have a deep out-of-order core.
+
+
+Optimizing for 64-bit Pentium4 is probably a waste of time, as the most
+critical instructions are very poorly implemented here. Perhaps we could
+save a cycle or two, but the most common loops now run at between 10 and 22
+cycles, so a saved cycle isn't too exciting.
+
+
+The new spin of the venerable P6 core, the "Core 2", is much better than the
+Pentium4 for the GMP loops. Its integer pipeline is somewhat similar to
+the Opteron/Athlon64 pipeline, except that the GMP favourites ADC/SBB and
+MUL are slower. Furthermore, an INC/DEC followed by ADC/SBB incurs a
+pipeline stall of around 10 cycles. The default mpn_add_n and mpn_sub_n
+code suffers badly from the stall. The code in the core2 subdirectory uses
+the almost forgotten instruction JRCXZ for loop control, and updates the
+induction variable using LEA.
+
+
+
+REFERENCES
+
+"System V Application Binary Interface AMD64 Architecture Processor
+Supplement", draft version 0.99, December 2007.
+http://www.x86-64.org/documentation/abi.pdf
diff --git a/gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm
new file mode 100644
index 0000000..d105da6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm
@@ -0,0 +1,168 @@
+dnl AMD64 mpn_addmul_1 for CPUs with mulx and adx.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013, 2022 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bd1 -
+C AMD bd2 -
+C AMD bd3 -
+C AMD bd4 -
+C AMD zn1 ?
+C AMD zn2 ?
+C AMD zn3 ?
+C AMD bt1 -
+C AMD bt2 -
+C Intel P4 -
+C Intel CNR -
+C Intel PNR -
+C Intel NHM -
+C Intel WSM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL -
+C Intel BWL ?
+C Intel SKL ?
+C Intel RKL ?
+C Intel ALD 1.29
+C Intel atom -
+C Intel SLM -
+C Intel GLM -
+C VIA nano -
+
+define(`rp', `%rdi') dnl rcx
+define(`up', `%rsi') dnl rdx
+define(`n_param', `%rdx') dnl r8
+define(`v0_param',`%rcx') dnl r9
+
+define(`n', `%rcx') dnl
+define(`v0', `%rdx') dnl
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_addmul_1)
+ mov (up), %r8
+
+ push %rbx
+ push %r12
+ push %r13
+
+ mov %rdx, %rax
+ mov %rcx, v0
+ mov %rax, n
+
+ and $3, R8(%rax)
+ jz L(b0)
+ cmp $2, R8(%rax)
+ jl L(b1)
+ jz L(b2)
+
+L(b3): mulx( %r8, %r11, %r10)
+ mulx( 8,(up), %r13, %r12)
+ mulx( 16,(up), %rbx, %rax)
+ inc n
+ lea -8(up), up
+ lea -24(rp), rp
+ jmp L(lo3)
+
+L(b0): mulx( %r8, %r9, %r8)
+ mulx( 8,(up), %r11, %r10)
+ mulx( 16,(up), %r13, %r12)
+ lea -16(rp), rp
+ jmp L(lo0)
+
+L(b2): mulx( %r8, %r13, %r12)
+ mulx( 8,(up), %rbx, %rax)
+ lea -2(n), n
+ jrcxz L(n2)
+ mulx( 16,(up), %r9, %r8)
+ lea 16(up), up
+ jmp L(lo2)
+L(n2): jmp L(wd2)
+
+L(b1): mulx( %r8, %rbx, %rax)
+ sub $1, n
+ jrcxz L(n1)
+ mulx( 8,(up), %r9, %r8)
+ mulx( 16,(up), %r11, %r10)
+ lea 8(up), up
+ lea -8(rp), rp
+ jmp L(lo1)
+L(n1): add (rp), %rbx
+ adc %rcx, %rax
+ mov %rbx, (rp)
+ pop %r13
+ pop %r12
+ pop %rbx
+ ret
+
+L(top): mulx( (up), %r9, %r8)
+ adcx( %r10, %r13)
+ mov %r11, -8(rp)
+L(lo2): adox( (rp), %r13)
+ mulx( 8,(up), %r11, %r10)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+L(lo1): adox( 8,(rp), %rbx)
+ mulx( 16,(up), %r13, %r12)
+ adcx( %rax, %r9)
+ mov %rbx, 8(rp)
+L(lo0): adox( 16,(rp), %r9)
+ mulx( 24,(up), %rbx, %rax)
+ adcx( %r8, %r11)
+ mov %r9, 16(rp)
+L(lo3): adox( 24,(rp), %r11)
+ lea 32(up), up
+ lea 32(rp), rp
+ lea -4(n), n
+ jrcxz L(end)
+ jmp L(top)
+
+L(end): adcx( %r10, %r13)
+ mov %r11, -8(rp)
+L(wd2): adox( (rp), %r13)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ adox( 8,(rp), %rbx)
+ adcx( %rcx, %rax)
+ adox( %rcx, %rax)
+ mov %rbx, 8(rp)
+ pop %r13
+ pop %r12
+ pop %rbx
+ ret
+EPILOGUE()
+ASM_END()
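
For orientation, the operation implemented above is {rp, n} += {up, n} * v0, with the final carry limb returned in %rax. The following is a minimal portable C sketch of that semantics, illustrative only and not GMP's implementation; it assumes 64-bit limbs and a compiler providing unsigned __int128:

#include <stdint.h>

/* Reference for what mpn_addmul_1 computes: {rp, n} += {up, n} * v0,
   returning the carry-out limb.  Illustrative sketch only. */
uint64_t
ref_addmul_1 (uint64_t *rp, const uint64_t *up, long n, uint64_t v0)
{
  uint64_t carry = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0;
      p += rp[i];                      /* accumulate into the destination */
      p += carry;                      /* propagate the previous carry */
      rp[i] = (uint64_t) p;            /* low limb of the 128-bit sum */
      carry = (uint64_t) (p >> 64);    /* high limb carries to the next step */
    }
  return carry;
}

The assembly reaches about 1.29 c/l on Alder Lake (per the table above) by spreading the same work over the independent CF (adcx) and OF (adox) carry chains, four limbs per iteration.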
diff --git a/gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h
new file mode 100644
index 0000000..0bffc3d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h
@@ -0,0 +1,225 @@
+/* Intel Alder Lake gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2022 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should disable lib inclusion. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3700-4900 MHz Alder Lake */
+/* FFT tuning limit = 10,000,000 */
+/* Generated by tuneup.c, 2022-03-15, gcc 11.2 */
+
+#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 23
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 34
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 30
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 23
+
+#define DIV_1_VS_MUL_1_PERCENT 559
+
+#define MUL_TOOM22_THRESHOLD 13
+#define MUL_TOOM33_THRESHOLD 97
+#define MUL_TOOM44_THRESHOLD 148
+#define MUL_TOOM6H_THRESHOLD 562
+#define MUL_TOOM8H_THRESHOLD 608
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 259
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 98
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 98
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 144
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 86
+#define SQR_TOOM4_THRESHOLD 582
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 753
+
+#define MULMID_TOOM42_THRESHOLD 40
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 16
+
+#define MUL_FFT_MODF_THRESHOLD 384 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 384, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 24, 7}, { 24, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
+ { 49, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
+ { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \
+ { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
+ { 511,10}, { 135,11}, { 79, 9}, { 319, 8}, \
+ { 639, 9}, { 335, 8}, { 671,11}, { 95,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543, 8}, { 1087, 9}, { 575,10}, \
+ { 303, 9}, { 607,10}, { 319, 9}, { 639,10}, \
+ { 335, 9}, { 671,10}, { 351,12}, { 95,11}, \
+ { 191,10}, { 383,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543, 9}, \
+ { 1087,11}, { 287,10}, { 575,11}, { 303,10}, \
+ { 607, 9}, { 1215,11}, { 319,10}, { 671,11}, \
+ { 351,10}, { 703,11}, { 367,10}, { 735, 9}, \
+ { 1471, 8}, { 2943,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,12}, { 223,11}, \
+ { 447,10}, { 895,11}, { 479,10}, { 959,13}, \
+ { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
+ { 543,10}, { 1087, 9}, { 2175,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
+ { 639,10}, { 1279,11}, { 671,12}, { 351,11}, \
+ { 703,10}, { 1407,11}, { 735,10}, { 1471, 9}, \
+ { 2943, 8}, { 5887,12}, { 383,11}, { 767,10}, \
+ { 1535,12}, { 415,11}, { 831,10}, { 1663,12}, \
+ { 447,11}, { 895,10}, { 1791,12}, { 479,11}, \
+ { 959,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \
+ { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \
+ { 703,11}, { 1407,12}, { 735,11}, { 1471,10}, \
+ { 2943,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 831,13}, { 447,12}, { 959,11}, { 1919,13}, \
+ { 511,12}, { 1087,13}, { 575,12}, { 1215,13}, \
+ { 639,12}, { 1343,13}, { 703,12}, { 1471,11}, \
+ { 2943,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,12}, { 1919,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 190
+#define MUL_FFT_THRESHOLD 2496
+
+#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 344, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
+ { 511,11}, { 79, 9}, { 319,11}, { 95,10}, \
+ { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303, 9}, \
+ { 607,10}, { 319, 9}, { 639,12}, { 95,11}, \
+ { 191,10}, { 383,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,10}, { 607,11}, \
+ { 319,10}, { 639,11}, { 335,10}, { 671,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,12}, { 223,11}, \
+ { 447,10}, { 895,11}, { 479,10}, { 959,13}, \
+ { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
+ { 543,10}, { 1087,12}, { 287,11}, { 575,10}, \
+ { 1151,11}, { 607,10}, { 1215,12}, { 319,11}, \
+ { 639,10}, { 1279,11}, { 671,12}, { 351,11}, \
+ { 703,10}, { 1407,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 415,11}, { 831,12}, { 447,11}, \
+ { 895,12}, { 479,11}, { 959,10}, { 1919,14}, \
+ { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \
+ { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \
+ { 607,11}, { 1215,13}, { 319,12}, { 639,11}, \
+ { 1279,12}, { 671,11}, { 1343,12}, { 703,11}, \
+ { 1407,13}, { 383,12}, { 831,13}, { 447,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1087,13}, \
+ { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \
+ { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1663,13}, { 959,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 166
+#define SQR_FFT_THRESHOLD 2240
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 56
+#define MULLO_MUL_N_THRESHOLD 4940
+#define SQRLO_BASECASE_THRESHOLD 10
+#define SQRLO_DC_THRESHOLD 73
+#define SQRLO_SQR_THRESHOLD 4392
+
+#define DC_DIV_QR_THRESHOLD 19
+#define DC_DIVAPPR_Q_THRESHOLD 139
+#define DC_BDIV_QR_THRESHOLD 62
+#define DC_BDIV_Q_THRESHOLD 126
+
+#define INV_MULMOD_BNM1_THRESHOLD 24
+#define INV_NEWTON_THRESHOLD 108
+#define INV_APPR_THRESHOLD 108
+
+#define BINV_NEWTON_THRESHOLD 208
+#define REDC_1_TO_REDC_2_THRESHOLD 36
+#define REDC_2_TO_REDC_N_THRESHOLD 53
+
+#define MU_DIV_QR_THRESHOLD 855
+#define MU_DIVAPPR_Q_THRESHOLD 1120
+#define MUPI_DIV_QR_THRESHOLD 0 /* always */
+#define MU_BDIV_QR_THRESHOLD 807
+#define MU_BDIV_Q_THRESHOLD 1470
+
+#define POWM_SEC_TABLE 1,11,70,702,2499
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 17
+#define SET_STR_DC_THRESHOLD 2150
+#define SET_STR_PRECOMPUTE_THRESHOLD 2943
+
+#define FAC_DSC_THRESHOLD 298
+#define FAC_ODD_THRESHOLD 51
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD2_DIV1_METHOD 1 /* 2.38% faster than 3 */
+#define HGCD_THRESHOLD 142
+#define HGCD_APPR_THRESHOLD 159
+#define HGCD_REDUCE_THRESHOLD 2384
+#define GCD_DC_THRESHOLD 483
+#define GCDEXT_DC_THRESHOLD 492
+#define JACOBI_BASE_METHOD 1 /* 0.94% faster than 3 */
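
The constants above are crossover points consulted by GMP's generic multiplication and division code. As a minimal sketch of how such a threshold is typically used, the fragment below gates the choice between schoolbook and Toom-2.2 multiplication on MUL_TOOM22_THRESHOLD; the helper names mul_basecase and toom22_mul are illustrative stand-ins, not GMP's internal entry points:

#include <stdint.h>

#define MUL_TOOM22_THRESHOLD 13   /* value tuned above for Alder Lake */

/* Stand-in declarations for the two multiplication strategies. */
extern void mul_basecase (uint64_t *rp, const uint64_t *up, long un,
                          const uint64_t *vp, long vn);
extern void toom22_mul (uint64_t *rp, const uint64_t *up,
                        const uint64_t *vp, long n, uint64_t *scratch);

/* Pick an algorithm for an n x n limb product based on the tuned cutoff. */
static void
mul_n (uint64_t *rp, const uint64_t *up, const uint64_t *vp, long n,
       uint64_t *scratch)
{
  if (n < MUL_TOOM22_THRESHOLD)
    mul_basecase (rp, up, n, vp, n);      /* O(n^2) schoolbook below the cutoff */
  else
    toom22_mul (rp, up, vp, n, scratch);  /* Karatsuba-style split above it */
}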
diff --git a/gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm
new file mode 100644
index 0000000..9400fe5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm
@@ -0,0 +1,474 @@
+dnl AMD64 mpn_mul_basecase.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013, 2022 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bd1 -
+C AMD bd2 -
+C AMD bd3 -
+C AMD bd4 -
+C AMD zn1 ?
+C AMD zn2 ?
+C AMD zn3 ?
+C AMD bt1 -
+C AMD bt2 -
+C Intel P4 -
+C Intel CNR -
+C Intel PNR -
+C Intel NHM -
+C Intel WSM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL -
+C Intel BWL ?
+C Intel SKL ?
+C Intel RKL ?
+C Intel ALD 1.29
+C Intel atom -
+C Intel SLM -
+C Intel GLM -
+C VIA nano -
+
+C TODO
+C * Do overlapped software pipelining.
+C * Try shallower pipeline, which would result in using fewer registers.
+C * There are false dependencies on CF/OF between iterations. Try breaking
+C them to see if it helps.
+
+define(`rp', `%rdi') dnl rcx
+define(`up', `%rsi') dnl rdx
+define(`un_arg',`%rdx') dnl r8
+define(`vp_arg',`%rcx') dnl r9
+define(`vn_arg',`%r8') dnl stack
+
+define(`un', `%r14')
+define(`vp', `%r15')
+define(`vn', `%rbp')
+
+define(`n', `%rcx')
+define(`v0', `%rdx')
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ cmp $2, un_arg
+ ja L(gen)
+ mov (vp_arg), %rdx
+ mulx( (up), %rax, %r9)
+ mov %rax, (rp)
+ je L(s2x)
+
+ mov %r9, 8(rp)
+ ret
+
+L(s2x): mulx( 8,(up), %rax, %r10)
+ add %r9, %rax
+ adc $0, %r10
+ cmp $2, R32(vn_arg)
+ je L(s22)
+
+L(s21): mov %rax, 8(rp)
+ mov %r10, 16(rp)
+ ret
+
+L(s22): mov 8(vp_arg), %rdx
+ mulx( (up), %r8, %r9)
+ add %r8, %rax
+ adc %r10, %r9
+ mov %rax, 8(rp)
+ mulx( 8,(up), %rax, %r10)
+ adc $0, %r10
+ adc %r9, %rax
+ mov %rax, 16(rp)
+ adc $0, %r10
+ mov %r10, 24(rp)
+ ret
+
+L(gen): push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov un_arg, un
+ neg un
+ shl $3, un
+ mov vp_arg, vp
+ mov vn_arg, vn
+
+ test $1, R8(un_arg)
+ mov (vp), %rdx
+ jz L(bx0)
+
+L(bx1): test $16, R8(un)
+ jnz L(b01)
+
+L(b11): lea 24(un), n
+ mulx( (up), %r11, %r10)
+ mulx( 8,(up), %r13, %r12)
+ mulx( 16,(up), %rbx, %rax)
+ lea 8(rp), rp
+ lea 24(up), up
+ jrcxz L(med3)
+L(mtp3):mulx( (up), %r9, %r8)
+ adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ mulx( 8,(up), %r11, %r10)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ mulx( 16,(up), %r13, %r12)
+ adcx( %rax, %r9)
+ mov %rbx, 8(rp)
+ mulx( 24,(up), %rbx, %rax)
+ adcx( %r8, %r11)
+ mov %r9, 16(rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ lea 32(n), n
+ jrcxz L(med3)
+ jmp L(mtp3)
+L(med3):adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ adcx( %rcx, %rax)
+ mov %rbx, 8(rp)
+ mov %rax, 16(rp)
+ dec vn
+ jz L(ret)
+L(out3):lea 32(rp,un), rp
+ lea 24(up,un), up
+ lea 8(vp), vp
+ xor R32(%rdx), R32(%rdx)
+ mov (vp), %rdx
+ mulx( -24,(up), %r11, %r10)
+ mulx( -16,(up), %r13, %r12)
+ mulx( -8,(up), %rbx, %rax)
+ lea 24(un), n
+ adox( -8,(rp), %r11)
+ jrcxz L(ed3)
+L(tp3): mulx( (up), %r9, %r8)
+ adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ adox( (rp), %r13)
+ mulx( 8,(up), %r11, %r10)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ adox( 8,(rp), %rbx)
+ mulx( 16,(up), %r13, %r12)
+ adcx( %rax, %r9)
+ mov %rbx, 8(rp)
+ adox( 16,(rp), %r9)
+ mulx( 24,(up), %rbx, %rax)
+ adcx( %r8, %r11)
+ mov %r9, 16(rp)
+ adox( 24,(rp), %r11)
+ lea 32(up), up
+ lea 32(rp), rp
+ lea 32(n), n
+ jrcxz L(ed3)
+ jmp L(tp3)
+L(ed3): adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ adox( (rp), %r13)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ adox( 8,(rp), %rbx)
+ adcx( %rcx, %rax)
+ adox( %rcx, %rax)
+ mov %rbx, 8(rp)
+ mov %rax, 16(rp)
+ dec vn
+ jnz L(out3)
+ jmp L(ret)
+
+
+L(b01): mulx( (up), %rbx, %rax)
+ lea 8(un), n
+ mulx( 8,(up), %r9, %r8)
+ mulx( 16,(up), %r11, %r10)
+ lea 8(up), up
+ lea -8(rp), rp
+ jmp L(ml1)
+L(mtp1):mulx( (up), %r9, %r8)
+ adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ mulx( 8,(up), %r11, %r10)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+L(ml1): mulx( 16,(up), %r13, %r12)
+ adcx( %rax, %r9)
+ mov %rbx, 8(rp)
+ mulx( 24,(up), %rbx, %rax)
+ adcx( %r8, %r11)
+ mov %r9, 16(rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ lea 32(n), n
+ jrcxz L(med1)
+ jmp L(mtp1)
+L(med1):adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ adcx( %rcx, %rax)
+ mov %rbx, 8(rp)
+ mov %rax, 16(rp)
+ dec vn
+ jz L(ret)
+L(out1):lea 16(rp,un), rp
+ lea 8(up,un), up
+ lea 8(vp), vp
+ xor R32(%rdx), R32(%rdx)
+ mov (vp), %rdx
+ lea 8(un), n
+ mulx( -8,(up), %rbx, %rax)
+ mulx( (up), %r9, %r8)
+ mulx( 8,(up), %r11, %r10)
+ jmp L(lo1)
+L(tp1): mulx( (up), %r9, %r8)
+ adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ adox( (rp), %r13)
+ mulx( 8,(up), %r11, %r10)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+L(lo1): adox( 8,(rp), %rbx)
+ mulx( 16,(up), %r13, %r12)
+ adcx( %rax, %r9)
+ mov %rbx, 8(rp)
+ adox( 16,(rp), %r9)
+ mulx( 24,(up), %rbx, %rax)
+ adcx( %r8, %r11)
+ mov %r9, 16(rp)
+ adox( 24,(rp), %r11)
+ lea 32(up), up
+ lea 32(rp), rp
+ lea 32(n), n
+ jrcxz L(ed1)
+ jmp L(tp1)
+L(ed1): adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ adox( (rp), %r13)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ adox( 8,(rp), %rbx)
+ adcx( %rcx, %rax)
+ adox( %rcx, %rax)
+ mov %rbx, 8(rp)
+ mov %rax, 16(rp)
+ dec vn
+ jnz L(out1)
+ jmp L(ret)
+
+
+L(bx0): test $16, R8(un)
+ jz L(b00)
+
+L(b10): mulx( (up), %r13, %r12)
+ mulx( 8,(up), %rbx, %rax)
+ lea 16(un), n
+ mulx( 16,(up), %r9, %r8)
+ lea 16(up), up
+ jmp L(ml2)
+L(mtp2):mulx( (up), %r9, %r8)
+ adcx( %r10, %r13)
+ mov %r11, -8(rp)
+L(ml2): mulx( 8,(up), %r11, %r10)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ mulx( 16,(up), %r13, %r12)
+ adcx( %rax, %r9)
+ mov %rbx, 8(rp)
+ mulx( 24,(up), %rbx, %rax)
+ adcx( %r8, %r11)
+ mov %r9, 16(rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ lea 32(n), n
+ jrcxz L(med2)
+ jmp L(mtp2)
+L(med2):adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ adcx( %rcx, %rax)
+ mov %rbx, 8(rp)
+ mov %rax, 16(rp)
+ dec vn
+ jz L(ret)
+L(out2):lea 24(rp,un), rp
+ lea 16(up,un), up
+ lea 8(vp), vp
+ xor R32(%rdx), R32(%rdx)
+ mov (vp), %rdx
+ mulx( -16,(up), %r13, %r12)
+ mulx( -8,(up), %rbx, %rax)
+ lea 16(un), n
+ mulx( (up), %r9, %r8)
+ jmp L(lo2)
+L(tp2): mulx( (up), %r9, %r8)
+ adcx( %r10, %r13)
+ mov %r11, -8(rp)
+L(lo2): adox( (rp), %r13)
+ mulx( 8,(up), %r11, %r10)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ adox( 8,(rp), %rbx)
+ mulx( 16,(up), %r13, %r12)
+ adcx( %rax, %r9)
+ mov %rbx, 8(rp)
+ adox( 16,(rp), %r9)
+ mulx( 24,(up), %rbx, %rax)
+ adcx( %r8, %r11)
+ mov %r9, 16(rp)
+ adox( 24,(rp), %r11)
+ lea 32(up), up
+ lea 32(rp), rp
+ lea 32(n), n
+ jrcxz L(ed2)
+ jmp L(tp2)
+L(ed2): adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ adox( (rp), %r13)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ adox( 8,(rp), %rbx)
+ adcx( %rcx, %rax)
+ adox( %rcx, %rax)
+ mov %rbx, 8(rp)
+ mov %rax, 16(rp)
+ dec vn
+ jnz L(out2)
+ jmp L(ret)
+
+
+L(b00): lea 32(un), n
+ mulx( (up), %r9, %r8)
+ mulx( 8,(up), %r11, %r10)
+ mulx( 16,(up), %r13, %r12)
+ mulx( 24,(up), %rbx, %rax)
+ adcx( %r8, %r11)
+ mov %r9, (rp)
+ lea 32(up), up
+ lea 16(rp), rp
+ jrcxz L(med0)
+L(mtp0):mulx( (up), %r9, %r8)
+ adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ mulx( 8,(up), %r11, %r10)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ mulx( 16,(up), %r13, %r12)
+ adcx( %rax, %r9)
+ mov %rbx, 8(rp)
+ mulx( 24,(up), %rbx, %rax)
+ adcx( %r8, %r11)
+ mov %r9, 16(rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ lea 32(n), n
+ jrcxz L(med0)
+ jmp L(mtp0)
+L(med0):adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ adcx( %rcx, %rax)
+ mov %rbx, 8(rp)
+ mov %rax, 16(rp)
+ dec vn
+ jz L(ret)
+L(out0):lea 40(rp,un), rp
+ lea 32(up,un), up
+ lea 8(vp), vp
+ xor R32(%rdx), R32(%rdx)
+ mov (vp), %rdx
+ lea 32(un), n
+ mulx( -32,(up), %r9, %r8)
+ mulx( -24,(up), %r11, %r10)
+ mulx( -16,(up), %r13, %r12)
+ adox( -16,(rp), %r9)
+ mulx( -8,(up), %rbx, %rax)
+ adcx( %r8, %r11)
+ mov %r9, -16(rp)
+ adox( -8,(rp), %r11)
+ jrcxz L(ed0)
+L(tp0): mulx( (up), %r9, %r8)
+ adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ adox( (rp), %r13)
+ mulx( 8,(up), %r11, %r10)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ adox( 8,(rp), %rbx)
+ mulx( 16,(up), %r13, %r12)
+ adcx( %rax, %r9)
+ mov %rbx, 8(rp)
+ adox( 16,(rp), %r9)
+ mulx( 24,(up), %rbx, %rax)
+ adcx( %r8, %r11)
+ mov %r9, 16(rp)
+ adox( 24,(rp), %r11)
+ lea 32(up), up
+ lea 32(rp), rp
+ lea 32(n), n
+ jrcxz L(ed0)
+ jmp L(tp0)
+L(ed0): adcx( %r10, %r13)
+ mov %r11, -8(rp)
+ adox( (rp), %r13)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+ adox( 8,(rp), %rbx)
+ adcx( %rcx, %rax)
+ adox( %rcx, %rax)
+ mov %rbx, 8(rp)
+ mov %rax, 16(rp)
+ dec vn
+ jnz L(out0)
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+EPILOGUE()
+ASM_END()
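
The routine above forms the full (un + vn)-limb product of {up, un} and {vp, vn} by the schoolbook method, writing the first row directly and folding each further limb of vp in with an addmul-style pass, unrolled four ways over the mulx/adcx/adox chains. A compact portable C reference of the same result, illustrative only (it simply clears rp and accumulates every row), assuming 64-bit limbs and unsigned __int128:

#include <stdint.h>

/* Reference for what mpn_mul_basecase computes: the un+vn limb product
   of {up, un} and {vp, vn}, stored at {rp, un+vn}.  Illustrative sketch
   only; GMP additionally assumes un >= vn >= 1. */
static void
ref_mul_basecase (uint64_t *rp, const uint64_t *up, long un,
                  const uint64_t *vp, long vn)
{
  for (long i = 0; i < un + vn; i++)
    rp[i] = 0;
  for (long j = 0; j < vn; j++)            /* one schoolbook row per vp limb */
    {
      uint64_t v0 = vp[j];
      uint64_t carry = 0;
      for (long i = 0; i < un; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v0;
          p += rp[i + j];
          p += carry;
          rp[i + j] = (uint64_t) p;
          carry = (uint64_t) (p >> 64);
        }
      rp[un + j] = carry;                  /* carry-out becomes the next limb */
    }
}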
diff --git a/gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm b/gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm
new file mode 100644
index 0000000..d7d6b0d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm
@@ -0,0 +1,140 @@
+dnl AMD64 mpn_submul_1 for CPUs with mulx and adx.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2022 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bd1 -
+C AMD bd2 -
+C AMD bd3 -
+C AMD bd4 -
+C AMD zn1 ?
+C AMD zn2 ?
+C AMD zn3 2.0
+C AMD bt1 -
+C AMD bt2 -
+C Intel P4 -
+C Intel CNR -
+C Intel PNR -
+C Intel NHM -
+C Intel WSM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL -
+C Intel BWL ?
+C Intel SKL ?
+C Intel RKL 2.0
+C Intel ALD 1.53
+C Intel atom -
+C Intel SLM -
+C Intel GLM -
+C VIA nano -
+
+define(`rp', `%rdi') dnl rcx
+define(`up', `%rsi') dnl rdx
+define(`n_param', `%rdx') dnl r8
+define(`v0_param',`%rcx') dnl r9
+
+define(`n', `%rcx') dnl
+define(`v0', `%rdx') dnl
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_submul_1)
+ mov n_param, %rax
+ mov v0_param, v0
+ mov %rax, n
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): mulx( (up), %r9, %rax)
+ test $2, R8(n)
+ stc
+ jz L(b01)
+
+L(b11): lea 1(n), n
+ lea 16(up), up
+ lea 16(rp), rp
+ jmp L(lo3)
+
+L(b01): lea 3(n), n
+ jmp L(lo1)
+
+L(bx0): mulx( (up), %r9, %r8)
+ test $2, R8(n)
+ stc
+ jz L(b00)
+
+L(b10): lea 8(up), up
+ lea 8(rp), rp
+ lea 2(n), n
+ jmp L(lo2)
+
+L(b00): lea 24(up), up
+ lea 24(rp), rp
+ jmp L(lo0)
+
+L(top): lea 32(up), up
+ lea 32(rp), rp
+ mulx( -24,(up), %r9, %r8)
+ adox( %rax, %r9)
+L(lo0): not %r9
+ adcx( -24,(rp), %r9)
+ mov %r9, -24(rp)
+ mulx( -16,(up), %r9, %rax)
+ adox( %r8, %r9)
+L(lo3): not %r9
+ adcx( -16,(rp), %r9)
+ mov %r9, -16(rp)
+ mulx( -8,(up), %r9, %r8)
+ adox( %rax, %r9)
+L(lo2): not %r9
+ adcx( -8,(rp), %r9)
+ mov %r9, -8(rp)
+ mulx( (up), %r9, %rax)
+ adox( %r8, %r9)
+L(lo1): not %r9
+ adcx( (rp), %r9)
+ mov %r9, (rp)
+ lea -4(n), n
+ jrcxz L(end)
+ jmp L(top)
+
+L(end): adox( %rcx, %rax)
+ sbb $-1, %rax
+ ret
+EPILOGUE()
+ASM_END()
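
Semantically the loop above computes {rp, n} -= {up, n} * v0 and returns the borrow limb; the `not %r9` steps complement each partial product so the subtraction can ride the same adcx/adox addition chains, with the final `sbb $-1, %rax` producing the returned borrow. A portable C reference of the operation, illustrative only, assuming 64-bit limbs and unsigned __int128:

#include <stdint.h>

/* Reference for what mpn_submul_1 computes: {rp, n} -= {up, n} * v0,
   returning the borrow-out limb.  Illustrative sketch only. */
uint64_t
ref_submul_1 (uint64_t *rp, const uint64_t *up, long n, uint64_t v0)
{
  uint64_t borrow = 0;
  for (long i = 0; i < n; i++)
    {
      /* Fold the incoming borrow into the 128-bit product up front. */
      unsigned __int128 p = (unsigned __int128) up[i] * v0 + borrow;
      uint64_t plo = (uint64_t) p;
      uint64_t phi = (uint64_t) (p >> 64);
      uint64_t r = rp[i] - plo;
      borrow = phi + (r > rp[i]);    /* extra 1 when the low limb borrowed */
      rp[i] = r;
    }
  return borrow;
}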
diff --git a/gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm
new file mode 100644
index 0000000..6ee0872
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm
@@ -0,0 +1,170 @@
+dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl Copyright 2003, 2005-2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 2
+C AMD K10 2
+C AMD bd1 ?
+C AMD bobcat ?
+C Intel P4 13
+C Intel core2 3.45
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom ?
+C VIA nano ?
+
+
+C Sometimes speed degenerates, supposedly because some operand alignments
+C cause cache conflicts.
+
+C The speed is limited by decoding/issue bandwidth. There are 22 instructions
+C in the loop, which corresponds to 22/(3*4) = 1.83 c/l.
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n', `%rcx')
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_addlsh1_n)')
+ifdef(`OPERATION_rsblsh1_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_rsblsh1_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %rbp
+
+ mov (vp), %r8
+ mov R32(n), R32(%rax)
+ lea (rp,n,8), rp
+ lea (up,n,8), up
+ lea (vp,n,8), vp
+ neg n
+ xor R32(%rbp), R32(%rbp)
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): add %r8, %r8
+ mov 8(vp,n,8), %r9
+ adc %r9, %r9
+ mov 16(vp,n,8), %r10
+ adc %r10, %r10
+ sbb R32(%rax), R32(%rax) C save scy
+ ADDSUB (up,n,8), %r8
+ ADCSBB 8(up,n,8), %r9
+ mov %r8, (rp,n,8)
+ mov %r9, 8(rp,n,8)
+ ADCSBB 16(up,n,8), %r10
+ mov %r10, 16(rp,n,8)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ add $3, n
+ jmp L(ent)
+
+L(b10): add %r8, %r8
+ mov 8(vp,n,8), %r9
+ adc %r9, %r9
+ sbb R32(%rax), R32(%rax) C save scy
+ ADDSUB (up,n,8), %r8
+ ADCSBB 8(up,n,8), %r9
+ mov %r8, (rp,n,8)
+ mov %r9, 8(rp,n,8)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ add $2, n
+ jmp L(ent)
+
+L(b01): add %r8, %r8
+ sbb R32(%rax), R32(%rax) C save scy
+ ADDSUB (up,n,8), %r8
+ mov %r8, (rp,n,8)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ inc n
+L(ent): jns L(end)
+
+ ALIGN(16)
+L(top): add R32(%rax), R32(%rax) C restore scy
+
+ mov (vp,n,8), %r8
+L(b00): adc %r8, %r8
+ mov 8(vp,n,8), %r9
+ adc %r9, %r9
+ mov 16(vp,n,8), %r10
+ adc %r10, %r10
+ mov 24(vp,n,8), %r11
+ adc %r11, %r11
+
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+
+ ADCSBB (up,n,8), %r8
+ nop C Hammer speedup!
+ ADCSBB 8(up,n,8), %r9
+ mov %r8, (rp,n,8)
+ mov %r9, 8(rp,n,8)
+ ADCSBB 16(up,n,8), %r10
+ ADCSBB 24(up,n,8), %r11
+ mov %r10, 16(rp,n,8)
+ mov %r11, 24(rp,n,8)
+
+ sbb R32(%rbp), R32(%rbp) C save acy
+ add $4, n
+ js L(top)
+
+L(end):
+ifdef(`OPERATION_addlsh1_n',`
+ add R32(%rbp), R32(%rax)
+ neg R32(%rax)')
+ifdef(`OPERATION_rsblsh1_n',`
+ sub R32(%rax), R32(%rbp)
+ movslq R32(%rbp), %rax')
+
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
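
Both entry points generated from this file share the loop above: mpn_addlsh1_n computes {rp, n} = {up, n} + 2*{vp, n} and mpn_rsblsh1_n computes {rp, n} = 2*{vp, n} - {up, n}, keeping the shift carry in %rax and the add/subtract carry in %rbp between iterations. A portable C sketch of the addlsh1 case, illustrative only, assuming 64-bit limbs and unsigned __int128:

#include <stdint.h>

/* Reference for what mpn_addlsh1_n computes:
   {rp, n} = {up, n} + ({vp, n} << 1), returning the carry-out (0, 1 or 2).
   Illustrative sketch only. */
uint64_t
ref_addlsh1_n (uint64_t *rp, const uint64_t *up, const uint64_t *vp, long n)
{
  uint64_t shift_carry = 0;   /* bit shifted out of the previous vp limb */
  uint64_t add_carry = 0;     /* carry from the addition chain */
  for (long i = 0; i < n; i++)
    {
      uint64_t twice = (vp[i] << 1) | shift_carry;
      shift_carry = vp[i] >> 63;
      unsigned __int128 s = (unsigned __int128) up[i] + twice + add_carry;
      rp[i] = (uint64_t) s;
      add_carry = (uint64_t) (s >> 64);
    }
  return shift_carry + add_carry;   /* the two carries the asm saves in %rax/%rbp */
}

The rsblsh1 variant differs only in that the ADDSUB/ADCSBB macros expand to sub/sbb and the carry combination at L(end) yields a signed borrow instead.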
diff --git a/gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm
new file mode 100644
index 0000000..999e972
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm
@@ -0,0 +1,53 @@
+dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009-2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n',`
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_addlsh2_n)')
+ifdef(`OPERATION_rsblsh2_n',`
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_rsblsh2_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm b/gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm
new file mode 100644
index 0000000..de00154
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm
@@ -0,0 +1,172 @@
+dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
+dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
+
+dnl Copyright 2009-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+C cycles/limb
+C AMD K8,K9 2.1
+C AMD K10 2.0
+C AMD bd1 ~2.7
+C AMD bd2 ~2.7
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 2.0
+C AMD bt1 3.3
+C AMD bt2 3.0
+C Intel P4 ?
+C Intel PNR 3.0
+C Intel NHM 2.75
+C Intel SBR 2.55
+C Intel IBR 2.49
+C Intel HWL 2.25
+C Intel BWL 1.89
+C Intel SKL 1.90
+C Intel atom 8.4
+C Intel SLM 4.0
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+
+define(M, eval(m4_lshift(1,LSH)))
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (vp), %r8
+ lea (,%r8,M), %r12
+ shr $RSH, %r8
+
+ mov R32(n), R32(%rax)
+ lea (rp,n,8), rp
+ lea (up,n,8), up
+ lea (vp,n,8), vp
+ neg n
+ and $3, R8(%rax)
+ je L(b00)
+ cmp $2, R8(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): mov 8(vp,n,8), %r10
+ lea (%r8,%r10,M), %r14
+ shr $RSH, %r10
+ mov 16(vp,n,8), %r11
+ lea (%r10,%r11,M), %r15
+ shr $RSH, %r11
+ ADDSUB (up,n,8), %r12
+ ADCSBB 8(up,n,8), %r14
+ ADCSBB 16(up,n,8), %r15
+ sbb R32(%rax), R32(%rax) C save carry for next
+ mov %r12, (rp,n,8)
+ mov %r14, 8(rp,n,8)
+ mov %r15, 16(rp,n,8)
+ add $3, n
+ js L(top)
+ jmp L(end)
+
+L(b01): mov %r8, %r11
+ ADDSUB (up,n,8), %r12
+ sbb R32(%rax), R32(%rax) C save carry for next
+ mov %r12, (rp,n,8)
+ add $1, n
+ js L(top)
+ jmp L(end)
+
+L(b10): mov 8(vp,n,8), %r11
+ lea (%r8,%r11,M), %r15
+ shr $RSH, %r11
+ ADDSUB (up,n,8), %r12
+ ADCSBB 8(up,n,8), %r15
+ sbb R32(%rax), R32(%rax) C save carry for next
+ mov %r12, (rp,n,8)
+ mov %r15, 8(rp,n,8)
+ add $2, n
+ js L(top)
+ jmp L(end)
+
+L(b00): mov 8(vp,n,8), %r9
+ mov 16(vp,n,8), %r10
+ jmp L(e00)
+
+ ALIGN(16)
+L(top): mov 16(vp,n,8), %r10
+ mov (vp,n,8), %r8
+ mov 8(vp,n,8), %r9
+ lea (%r11,%r8,M), %r12
+ shr $RSH, %r8
+L(e00): lea (%r8,%r9,M), %r13
+ shr $RSH, %r9
+ mov 24(vp,n,8), %r11
+ lea (%r9,%r10,M), %r14
+ shr $RSH, %r10
+ lea (%r10,%r11,M), %r15
+ shr $RSH, %r11
+ add R32(%rax), R32(%rax) C restore carry
+ ADCSBB (up,n,8), %r12
+ ADCSBB 8(up,n,8), %r13
+ ADCSBB 16(up,n,8), %r14
+ ADCSBB 24(up,n,8), %r15
+ mov %r12, (rp,n,8)
+ mov %r13, 8(rp,n,8)
+ mov %r14, 16(rp,n,8)
+ sbb R32(%rax), R32(%rax) C save carry for next
+ mov %r15, 24(rp,n,8)
+ add $4, n
+ js L(top)
+L(end):
+
+ifelse(ADDSUB,add,`
+ sub R32(%r11), R32(%rax)
+ neg R32(%rax)
+',`
+ add R32(%r11), R32(%rax)
+ movslq R32(%rax), %rax
+')
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm
new file mode 100644
index 0000000..5ca128f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm
@@ -0,0 +1,176 @@
+dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U.
+
+dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 3.1 < 3.85 for lshift + add_n
+C AMD K10 3.1 < 3.85 for lshift + add_n
+C Intel P4 14.6 > 7.33 for lshift + add_n
+C Intel core2 3.87 > 3.27 for lshift + add_n
+C Intel NHM 4 > 3.75 for lshift + add_n
+C Intel SBR (5.8) > 3.46 for lshift + add_n
+C Intel atom (7.75) < 8.75 for lshift + add_n
+C VIA nano 4.7 < 6.25 for lshift + add_n
+
+C This was written quickly and not optimized at all. Surely one could get
+C closer to 3 c/l or perhaps even under 3 c/l. Ideas:
+C 1) Use indexing to save the 3 LEA
+C 2) Write reasonable feed-in code
+C 3) Be more clever about register usage
+C 4) Unroll more; the CL negation and the carry save/restore cost a lot now
+C 5) Reschedule
+
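+C  The operation, for reference: mpn_addlsh_n computes {rp,n} = {up,n} +
+C  ({vp,n} << k) and mpn_rsblsh_n computes {rp,n} = ({vp,n} << k) - {up,n}.
+C  A plain C sketch of the add variant, assuming 64-bit limbs, no nails and
+C  0 < k < 64 (ref_addlsh_n is an illustrative name, not the routine picked
+C  at build time):
+C
+C	mp_limb_t
+C	ref_addlsh_n (mp_limb_t *rp, const mp_limb_t *up,
+C	              const mp_limb_t *vp, mp_size_t n, unsigned k)
+C	{
+C	  mp_limb_t cy = 0, hi = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      mp_limb_t sh = (vp[i] << k) | hi;   /* low part of vp[i]*2^k  */
+C	      hi = vp[i] >> (64 - k);             /* bits shifted out       */
+C	      mp_limb_t s = up[i] + sh;
+C	      rp[i] = s + cy;
+C	      cy = (s < sh) + (rp[i] < s);        /* at most one term is 1  */
+C	    }
+C	  return hi + cy;                         /* top limb: hi < 2^k     */
+C	}
+C
+C  The rsblsh variant propagates borrows instead and returns hi minus the
+C  final borrow, which is what the epilogue below computes.
+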
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnt', `%r8')
+
+ifdef(`OPERATION_addlsh_n',`
+ define(ADCSBB, `adc')
+ define(func, mpn_addlsh_n)
+')
+ifdef(`OPERATION_rsblsh_n',`
+ define(ADCSBB, `sbb')
+ define(func, mpn_rsblsh_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %r12
+ push %r13
+ push %r14
+ push %rbp
+ push %rbx
+
+ mov n, %rax
+ xor R32(%rbx), R32(%rbx) C clear carry save register
+ mov R32(%r8), R32(%rcx) C shift count
+ xor R32(%rbp), R32(%rbp) C limb carry
+
+ mov R32(%rax), R32(%r11)
+ and $3, R32(%r11)
+ je L(4)
+ sub $1, R32(%r11)
+
+L(012): mov (vp), %r8
+ mov %r8, %r12
+ shl R8(%rcx), %r8
+ or %rbp, %r8
+ neg R8(%rcx)
+ mov %r12, %rbp
+ shr R8(%rcx), %rbp
+ neg R8(%rcx)
+ add R32(%rbx), R32(%rbx)
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ sbb R32(%rbx), R32(%rbx)
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ sub $1, R32(%r11)
+ jnc L(012)
+
+L(4): sub $4, %rax
+ jc L(end)
+
+ ALIGN(16)
+L(top): mov (vp), %r8
+ mov %r8, %r12
+ mov 8(vp), %r9
+ mov %r9, %r13
+ mov 16(vp), %r10
+ mov %r10, %r14
+ mov 24(vp), %r11
+
+ shl R8(%rcx), %r8
+ shl R8(%rcx), %r9
+ shl R8(%rcx), %r10
+ or %rbp, %r8
+ mov %r11, %rbp
+ shl R8(%rcx), %r11
+
+ neg R8(%rcx)
+
+ shr R8(%rcx), %r12
+ shr R8(%rcx), %r13
+ shr R8(%rcx), %r14
+ shr R8(%rcx), %rbp C used next iteration
+
+ or %r12, %r9
+ or %r13, %r10
+ or %r14, %r11
+
+ neg R8(%rcx)
+
+ add R32(%rbx), R32(%rbx) C restore carry flag
+
+ ADCSBB (up), %r8
+ ADCSBB 8(up), %r9
+ ADCSBB 16(up), %r10
+ ADCSBB 24(up), %r11
+
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ mov %r11, 24(rp)
+
+ sbb R32(%rbx), R32(%rbx) C save carry flag
+
+ lea 32(up), up
+ lea 32(vp), vp
+ lea 32(rp), rp
+
+ sub $4, %rax
+ jnc L(top)
+
+L(end): add R32(%rbx), R32(%rbx)
+ ADCSBB $0, %rbp
+ mov %rbp, %rax
+ pop %rbx
+ pop %rbp
+ pop %r14
+ pop %r13
+ pop %r12
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/aors_err1_n.asm b/gmp-6.3.0/mpn/x86_64/aors_err1_n.asm
new file mode 100644
index 0000000..54d0b3f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/aors_err1_n.asm
@@ -0,0 +1,225 @@
+dnl AMD64 mpn_add_err1_n, mpn_sub_err1_n
+
+dnl Contributed by David Harvey.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.75 (degenerates to 3 c/l for some alignments)
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel corei ?
+C Intel atom ?
+C VIA nano ?
+
+
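+C  Semantics, as a hedged C sketch pieced together from the loop below
+C  (ref_add_err1_n and e0/e1 are illustrative names; assumes 64-bit limbs,
+C  no nails, cy in {0,1}):
+C
+C	mp_limb_t
+C	ref_add_err1_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
+C	                mp_limb_t *ep, const mp_limb_t *yp, mp_size_t n,
+C	                mp_limb_t cy)
+C	{
+C	  mp_limb_t e0 = 0, e1 = 0;               /* 2-limb error accumulator */
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      mp_limb_t s = up[i] + vp[i];
+C	      mp_limb_t c1 = s < up[i];
+C	      rp[i] = s + cy;
+C	      cy = c1 + (rp[i] < s);
+C	      if (cy)                             /* carry out of limb i      */
+C	        {
+C	          e0 += yp[n - 1 - i];            /* yp is walked in reverse  */
+C	          e1 += e0 < yp[n - 1 - i];
+C	        }
+C	    }
+C	  ep[0] = e0;  ep[1] = e1;
+C	  return cy;
+C	}
+C
+C  mpn_sub_err1_n is the same with borrows in place of carries.
+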
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`ep', `%rcx')
+define(`yp', `%r8')
+define(`n', `%r9')
+define(`cy_param', `8(%rsp)')
+
+define(`el', `%rbx')
+define(`eh', `%rbp')
+define(`t0', `%r10')
+define(`t1', `%r11')
+define(`t2', `%r12')
+define(`t3', `%r13')
+define(`w0', `%r14')
+define(`w1', `%r15')
+
+ifdef(`OPERATION_add_err1_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_err1_n)')
+ifdef(`OPERATION_sub_err1_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_err1_n)')
+
+MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n)
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ mov cy_param, %rax
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ lea (up,n,8), up
+ lea (vp,n,8), vp
+ lea (rp,n,8), rp
+
+ mov R32(n), R32(%r10)
+ and $3, R32(%r10)
+ jz L(0mod4)
+ cmp $2, R32(%r10)
+ jc L(1mod4)
+ jz L(2mod4)
+L(3mod4):
+ xor R32(el), R32(el)
+ xor R32(eh), R32(eh)
+ xor R32(t0), R32(t0)
+ xor R32(t1), R32(t1)
+ lea -24(yp,n,8), yp
+ neg n
+
+ shr $1, %al C restore carry
+ mov (up,n,8), w0
+ mov 8(up,n,8), w1
+ ADCSBB (vp,n,8), w0
+ mov w0, (rp,n,8)
+ cmovc 16(yp), el
+ ADCSBB 8(vp,n,8), w1
+ mov w1, 8(rp,n,8)
+ cmovc 8(yp), t0
+ mov 16(up,n,8), w0
+ ADCSBB 16(vp,n,8), w0
+ mov w0, 16(rp,n,8)
+ cmovc (yp), t1
+ setc %al C save carry
+ add t0, el
+ adc $0, eh
+ add t1, el
+ adc $0, eh
+
+ add $3, n
+ jnz L(loop)
+ jmp L(end)
+
+ ALIGN(16)
+L(0mod4):
+ xor R32(el), R32(el)
+ xor R32(eh), R32(eh)
+ lea (yp,n,8), yp
+ neg n
+ jmp L(loop)
+
+ ALIGN(16)
+L(1mod4):
+ xor R32(el), R32(el)
+ xor R32(eh), R32(eh)
+ lea -8(yp,n,8), yp
+ neg n
+
+ shr $1, %al C restore carry
+ mov (up,n,8), w0
+ ADCSBB (vp,n,8), w0
+ mov w0, (rp,n,8)
+ cmovc (yp), el
+ setc %al C save carry
+
+ add $1, n
+ jnz L(loop)
+ jmp L(end)
+
+ ALIGN(16)
+L(2mod4):
+ xor R32(el), R32(el)
+ xor R32(eh), R32(eh)
+ xor R32(t0), R32(t0)
+ lea -16(yp,n,8), yp
+ neg n
+
+ shr $1, %al C restore carry
+ mov (up,n,8), w0
+ mov 8(up,n,8), w1
+ ADCSBB (vp,n,8), w0
+ mov w0, (rp,n,8)
+ cmovc 8(yp), el
+ ADCSBB 8(vp,n,8), w1
+ mov w1, 8(rp,n,8)
+ cmovc (yp), t0
+ setc %al C save carry
+ add t0, el
+ adc $0, eh
+
+ add $2, n
+ jnz L(loop)
+ jmp L(end)
+
+ ALIGN(32)
+L(loop):
+ shr $1, %al C restore carry
+ mov -8(yp), t0
+ mov $0, R32(t3)
+ mov (up,n,8), w0
+ mov 8(up,n,8), w1
+ ADCSBB (vp,n,8), w0
+ cmovnc t3, t0
+ ADCSBB 8(vp,n,8), w1
+ mov -16(yp), t1
+ mov w0, (rp,n,8)
+ mov 16(up,n,8), w0
+ mov w1, 8(rp,n,8)
+ cmovnc t3, t1
+ mov -24(yp), t2
+ ADCSBB 16(vp,n,8), w0
+ cmovnc t3, t2
+ mov 24(up,n,8), w1
+ ADCSBB 24(vp,n,8), w1
+ cmovc -32(yp), t3
+ setc %al C save carry
+ add t0, el
+ adc $0, eh
+ add t1, el
+ adc $0, eh
+ add t2, el
+ adc $0, eh
+ mov w0, 16(rp,n,8)
+ add t3, el
+ lea -32(yp), yp
+ adc $0, eh
+ mov w1, 24(rp,n,8)
+ add $4, n
+ jnz L(loop)
+
+L(end):
+ mov el, (ep)
+ mov eh, 8(ep)
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/aors_err2_n.asm b/gmp-6.3.0/mpn/x86_64/aors_err2_n.asm
new file mode 100644
index 0000000..ce5c2a4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/aors_err2_n.asm
@@ -0,0 +1,172 @@
+dnl AMD64 mpn_add_err2_n, mpn_sub_err2_n
+
+dnl Contributed by David Harvey.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.5
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 6.9
+C Intel corei ?
+C Intel atom ?
+C VIA nano ?
+
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`ep', `%rcx')
+define(`yp1', `%r8')
+define(`yp2', `%r9')
+define(`n_param', `8(%rsp)')
+define(`cy_param', `16(%rsp)')
+
+define(`cy1', `%r14')
+define(`cy2', `%rax')
+
+define(`n', `%r10')
+
+define(`w', `%rbx')
+define(`e1l', `%rbp')
+define(`e1h', `%r11')
+define(`e2l', `%r12')
+define(`e2h', `%r13')
+
+
+ifdef(`OPERATION_add_err2_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_err2_n)')
+ifdef(`OPERATION_sub_err2_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_err2_n)')
+
+MULFUNC_PROLOGUE(mpn_add_err2_n mpn_sub_err2_n)
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ mov cy_param, cy2
+ mov n_param, n
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ xor R32(e1l), R32(e1l)
+ xor R32(e1h), R32(e1h)
+ xor R32(e2l), R32(e2l)
+ xor R32(e2h), R32(e2h)
+
+ sub yp1, yp2
+
+ lea (rp,n,8), rp
+ lea (up,n,8), up
+ lea (vp,n,8), vp
+
+ test $1, n
+ jnz L(odd)
+
+ lea -8(yp1,n,8), yp1
+ neg n
+ jmp L(top)
+
+ ALIGN(16)
+L(odd):
+ lea -16(yp1,n,8), yp1
+ neg n
+ shr $1, cy2
+ mov (up,n,8), w
+ ADCSBB (vp,n,8), w
+ cmovc 8(yp1), e1l
+ cmovc 8(yp1,yp2), e2l
+ mov w, (rp,n,8)
+ sbb cy2, cy2
+ inc n
+ jz L(end)
+
+ ALIGN(16)
+L(top):
+ mov (up,n,8), w
+ shr $1, cy2 C restore carry
+ ADCSBB (vp,n,8), w
+ mov w, (rp,n,8)
+ sbb cy1, cy1 C generate mask, preserve CF
+
+ mov 8(up,n,8), w
+ ADCSBB 8(vp,n,8), w
+ mov w, 8(rp,n,8)
+ sbb cy2, cy2 C generate mask, preserve CF
+
+ mov (yp1), w C (e1h:e1l) += cy1 * yp1 limb
+ and cy1, w
+ add w, e1l
+ adc $0, e1h
+
+ and (yp1,yp2), cy1 C (e2h:e2l) += cy1 * yp2 limb
+ add cy1, e2l
+ adc $0, e2h
+
+ mov -8(yp1), w C (e1h:e1l) += cy2 * next yp1 limb
+ and cy2, w
+ add w, e1l
+ adc $0, e1h
+
+ mov -8(yp1,yp2), w C (e2h:e2l) += cy2 * next yp2 limb
+ and cy2, w
+ add w, e2l
+ adc $0, e2h
+
+ add $2, n
+ lea -16(yp1), yp1
+ jnz L(top)
+L(end):
+
+ mov e1l, (ep)
+ mov e1h, 8(ep)
+ mov e2l, 16(ep)
+ mov e2h, 24(ep)
+
+ and $1, %eax C return carry
+
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/aors_err3_n.asm b/gmp-6.3.0/mpn/x86_64/aors_err3_n.asm
new file mode 100644
index 0000000..bb6d0c5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/aors_err3_n.asm
@@ -0,0 +1,156 @@
+dnl AMD64 mpn_add_err3_n, mpn_sub_err3_n
+
+dnl Contributed by David Harvey.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 7.0
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel corei ?
+C Intel atom ?
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`ep', `%rcx')
+define(`yp1', `%r8')
+define(`yp2', `%r9')
+define(`yp3_param', `8(%rsp)')
+define(`n_param', `16(%rsp)')
+define(`cy_param', `24(%rsp)')
+
+define(`n', `%r10')
+define(`yp3', `%rcx')
+define(`t', `%rbx')
+
+define(`e1l', `%rbp')
+define(`e1h', `%r11')
+define(`e2l', `%r12')
+define(`e2h', `%r13')
+define(`e3l', `%r14')
+define(`e3h', `%r15')
+
+
+
+ifdef(`OPERATION_add_err3_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_err3_n)')
+ifdef(`OPERATION_sub_err3_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_err3_n)')
+
+MULFUNC_PROLOGUE(mpn_add_err3_n mpn_sub_err3_n)
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ mov cy_param, %rax
+ mov n_param, n
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ push ep
+ mov 64(%rsp), yp3 C load from yp3_param
+
+ xor R32(e1l), R32(e1l)
+ xor R32(e1h), R32(e1h)
+ xor R32(e2l), R32(e2l)
+ xor R32(e2h), R32(e2h)
+ xor R32(e3l), R32(e3l)
+ xor R32(e3h), R32(e3h)
+
+ sub yp1, yp2
+ sub yp1, yp3
+
+ lea -8(yp1,n,8), yp1
+ lea (rp,n,8), rp
+ lea (up,n,8), up
+ lea (vp,n,8), vp
+ neg n
+
+ ALIGN(16)
+L(top):
+ shr $1, %rax C restore carry
+ mov (up,n,8), %rax
+ ADCSBB (vp,n,8), %rax
+ mov %rax, (rp,n,8)
+ sbb %rax, %rax C save carry and generate mask
+
+ mov (yp1), t
+ and %rax, t
+ add t, e1l
+ adc $0, e1h
+
+ mov (yp1,yp2), t
+ and %rax, t
+ add t, e2l
+ adc $0, e2h
+
+ mov (yp1,yp3), t
+ and %rax, t
+ add t, e3l
+ adc $0, e3h
+
+ lea -8(yp1), yp1
+ inc n
+ jnz L(top)
+
+L(end):
+ and $1, %eax
+ pop ep
+
+ mov e1l, (ep)
+ mov e1h, 8(ep)
+ mov e2l, 16(ep)
+ mov e2h, 24(ep)
+ mov e3l, 32(ep)
+ mov e3h, 40(ep)
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/aors_n.asm b/gmp-6.3.0/mpn/x86_64/aors_n.asm
new file mode 100644
index 0000000..d5a314a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/aors_n.asm
@@ -0,0 +1,178 @@
+dnl AMD64 mpn_add_n, mpn_sub_n
+
+dnl Copyright 2003-2005, 2007, 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1.5
+C AMD K10 1.5
+C AMD bd1 1.8
+C AMD bd2 1.74
+C AMD bd3 ?
+C AMD bd4 1.78
+C AMD zen 1.5
+C AMD bt1 2.54
+C AMD bt2 2.15
+C Intel P4 11.5
+C Intel core2 4.9
+C Intel NHM 5.53
+C Intel SBR 1.59
+C Intel IBR 1.55
+C Intel HWL 1.44
+C Intel BWL 1.14
+C Intel SKL 1.21
+C Intel atom 4
+C Intel SLM 3
+C VIA nano 3.25
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
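+C  For reference, the operation in plain C (a sketch assuming 64-bit limbs
+C  and no nails; the _nc entry points additionally take an incoming carry):
+C
+C	mp_limb_t
+C	ref_add_n (mp_limb_t *rp, const mp_limb_t *up,
+C	           const mp_limb_t *vp, mp_size_t n)
+C	{
+C	  mp_limb_t cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      mp_limb_t s = up[i] + vp[i];
+C	      mp_limb_t c1 = s < up[i];
+C	      rp[i] = s + cy;
+C	      cy = c1 + (rp[i] < s);              /* carry out, 0 or 1 */
+C	    }
+C	  return cy;
+C	}
+C
+C  mpn_sub_n is identical with borrows; the code below gets the same effect
+C  from a single adc/sbb chain, unrolled four limbs per iteration.
+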
+C INPUT PARAMETERS
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`vp', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ mov R32(n), R32(%rax)
+ shr $2, n
+ and $3, R32(%rax)
+ bt $0, %r8 C cy flag <- carry parameter
+ jrcxz L(lt4)
+
+ mov (up), %r8
+ mov 8(up), %r9
+ dec n
+ jmp L(mid)
+
+EPILOGUE()
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ mov R32(n), R32(%rax)
+ shr $2, n
+ and $3, R32(%rax)
+ jrcxz L(lt4)
+
+ mov (up), %r8
+ mov 8(up), %r9
+ dec n
+ jmp L(mid)
+
+L(lt4): dec R32(%rax)
+ mov (up), %r8
+ jnz L(2)
+ ADCSBB (vp), %r8
+ mov %r8, (rp)
+ adc R32(%rax), R32(%rax)
+ FUNC_EXIT()
+ ret
+
+L(2): dec R32(%rax)
+ mov 8(up), %r9
+ jnz L(3)
+ ADCSBB (vp), %r8
+ ADCSBB 8(vp), %r9
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+ adc R32(%rax), R32(%rax)
+ FUNC_EXIT()
+ ret
+
+L(3): mov 16(up), %r10
+ ADCSBB (vp), %r8
+ ADCSBB 8(vp), %r9
+ ADCSBB 16(vp), %r10
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ setc R8(%rax)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(top): ADCSBB (vp), %r8
+ ADCSBB 8(vp), %r9
+ ADCSBB 16(vp), %r10
+ ADCSBB 24(vp), %r11
+ mov %r8, (rp)
+ lea 32(up), up
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ dec n
+ mov %r11, 24(rp)
+ lea 32(vp), vp
+ mov (up), %r8
+ mov 8(up), %r9
+ lea 32(rp), rp
+L(mid): mov 16(up), %r10
+ mov 24(up), %r11
+ jnz L(top)
+
+L(end): lea 32(up), up
+ ADCSBB (vp), %r8
+ ADCSBB 8(vp), %r9
+ ADCSBB 16(vp), %r10
+ ADCSBB 24(vp), %r11
+ lea 32(vp), vp
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ mov %r11, 24(rp)
+ lea 32(rp), rp
+
+ inc R32(%rax)
+ dec R32(%rax)
+ jnz L(lt4)
+ adc R32(%rax), R32(%rax)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/aorsmul_1.asm
new file mode 100644
index 0000000..dfe4dc4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/aorsmul_1.asm
@@ -0,0 +1,190 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.52
+C AMD K10 2.51
+C AMD bd1 4.43
+C AMD bd2 5.03 5.63
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen ?
+C AMD bobcat 6.20
+C AMD jaguar 5.57 6.56
+C Intel P4 14.9 17.1
+C Intel core2 5.15
+C Intel NHM 4.93
+C Intel SBR 3.95
+C Intel IBR 3.75
+C Intel HWL 3.62
+C Intel BWL 2.53
+C Intel SKL 2.53
+C Intel atom 21.3
+C Intel SLM 9.0
+C VIA nano 5.0
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * The loop is great, but the prologue and epilogue code was quickly written.
+C Tune it!
+
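+C  For reference, what mpn_addmul_1 computes, as a C sketch (assumes 64-bit
+C  limbs, no nails, and a compiler providing unsigned __int128; ref_addmul_1
+C  is an illustrative name, not a GMP entry point):
+C
+C	mp_limb_t
+C	ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
+C	              mp_limb_t vl)
+C	{
+C	  mp_limb_t cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 p = (unsigned __int128) up[i] * vl;
+C	      mp_limb_t lo = (mp_limb_t) p, hi = (mp_limb_t) (p >> 64);
+C	      mp_limb_t t = rp[i] + lo;
+C	      mp_limb_t c1 = t < lo;
+C	      rp[i] = t + cy;
+C	      cy = hi + c1 + (rp[i] < t);         /* cannot overflow a limb */
+C	    }
+C	  return cy;                              /* limb to add at rp[n]   */
+C	}
+C
+C  mpn_submul_1 subtracts the products instead and returns the borrow limb.
+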
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vl', `%rcx') C r9
+
+define(`n', `%r11')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`vl', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ mov (up), %rax C read first u limb early
+ push %rbx
+IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %rbx ')
+ mul vl
+IFSTD(` mov %rbx, n ')
+
+ and $3, R32(%rbx)
+ jz L(b0)
+ cmp $2, R32(%rbx)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): dec n
+ jne L(gt1)
+ ADDSUB %rax, (rp)
+ jmp L(ret)
+L(gt1): lea 8(up,n,8), up
+ lea -8(rp,n,8), rp
+ neg n
+ xor %r10, %r10
+ xor R32(%rbx), R32(%rbx)
+ mov %rax, %r9
+ mov (up,n,8), %rax
+ mov %rdx, %r8
+ jmp L(L1)
+
+L(b0): lea (up,n,8), up
+ lea -16(rp,n,8), rp
+ neg n
+ xor %r10, %r10
+ mov %rax, %r8
+ mov %rdx, %rbx
+ jmp L(L0)
+
+L(b3): lea -8(up,n,8), up
+ lea -24(rp,n,8), rp
+ neg n
+ mov %rax, %rbx
+ mov %rdx, %r10
+ jmp L(L3)
+
+L(b2): lea -16(up,n,8), up
+ lea -32(rp,n,8), rp
+ neg n
+ xor %r8, %r8
+ xor R32(%rbx), R32(%rbx)
+ mov %rax, %r10
+ mov 24(up,n,8), %rax
+ mov %rdx, %r9
+ jmp L(L2)
+
+ ALIGN(16)
+L(top): ADDSUB %r10, (rp,n,8)
+ adc %rax, %r9
+ mov (up,n,8), %rax
+ adc %rdx, %r8
+ mov $0, R32(%r10)
+L(L1): mul vl
+ ADDSUB %r9, 8(rp,n,8)
+ adc %rax, %r8
+ adc %rdx, %rbx
+L(L0): mov 8(up,n,8), %rax
+ mul vl
+ ADDSUB %r8, 16(rp,n,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+L(L3): mov 16(up,n,8), %rax
+ mul vl
+ ADDSUB %rbx, 24(rp,n,8)
+ mov $0, R32(%r8) C zero
+ mov %r8, %rbx C zero
+ adc %rax, %r10
+ mov 24(up,n,8), %rax
+ mov %r8, %r9 C zero
+ adc %rdx, %r9
+L(L2): mul vl
+ add $4, n
+ js L(top)
+
+ ADDSUB %r10, (rp,n,8)
+ adc %rax, %r9
+ adc %r8, %rdx
+ ADDSUB %r9, 8(rp,n,8)
+L(ret): adc $0, %rdx
+ mov %rdx, %rax
+
+ pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm
new file mode 100644
index 0000000..c1dcdc4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm
@@ -0,0 +1,186 @@
+dnl AMD64 mpn_addmul_2 optimised for Intel Atom.
+
+dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel PNR
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom 18.8 this
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
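+C  What the routine computes, as read from the loop below (a hedged sketch;
+C  ref_addmul_2 is an illustrative name): with the two-limb multiplier
+C  {vp,2} it forms {rp,n} + {up,n} * {vp,2}, adding the low n limbs into
+C  rp[0..n-1], storing limb n at rp[n] (the old rp[n] is not read), and
+C  returning limb n+1.  Expressed with the documented mpn_addmul_1:
+C
+C	mp_limb_t
+C	ref_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C	{
+C	  rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);     /* rp[0..n-1] += up*v0 */
+C	  return mpn_addmul_1 (rp + 1, up, n, vp[1]);  /* rp[1..n]   += up*v1 */
+C	}
+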
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_addmul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (up), %rax
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov n_param, n
+ mul v0
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): mov %rax, w0
+ mov (up), %rax
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ lea -8(rp), rp
+ jmp L(lo0)
+
+L(b10): mov %rax, w2
+ mov (up), %rax
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ lea -16(up), up
+ lea -24(rp), rp
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): mov %rax, w3
+ mov %rdx, w0
+ mov (up), %rax
+ xor R32(w1), R32(w1)
+ lea 8(up), up
+ dec n
+ jmp L(lo1)
+
+L(b11): mov %rax, w1
+ mov (up), %rax
+ mov %rdx, w2
+ xor R32(w3), R32(w3)
+ lea -8(up), up
+ lea -16(rp), rp
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top):
+L(lo1): mul v1
+ add w3, (rp)
+ mov $0, R32(w2)
+ adc %rax, w0
+ mov (up), %rax
+ adc %rdx, w1
+ mul v0
+ add %rax, w0
+ mov (up), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+L(lo0): mul v1
+ add w0, 8(rp)
+ adc %rax, w1
+ mov 8(up), %rax
+ mov $0, R32(w3)
+ adc %rdx, w2
+ mul v0
+ add %rax, w1
+ mov 8(up), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+L(lo3): mul v1
+ add w1, 16(rp)
+ adc %rax, w2
+ mov 16(up), %rax
+ mov $0, R32(w0)
+ adc %rdx, w3
+ mul v0
+ add %rax, w2
+ mov 16(up), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+L(lo2): mul v1
+ add w2, 24(rp)
+ adc %rax, w3
+ mov 24(up), %rax
+ adc %rdx, w0
+ mov $0, R32(w1)
+ lea 32(rp), rp
+ mul v0
+ lea 32(up), up
+ add %rax, w3
+ adc %rdx, w0
+ mov -8(up), %rax
+ adc $0, R32(w1)
+ sub $4, n
+ ja L(top)
+
+L(end): mul v1
+ add w3, (rp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(rp)
+ mov w1, %rax
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm
new file mode 100644
index 0000000..f44de19
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm
@@ -0,0 +1,238 @@
+dnl AMD64 mpn_addlsh1_n, mpn_rsblsh1_n optimised for Intel Atom.
+dnl Used also for AMD bd1.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * This code is slightly large at 433 bytes.
+C * sublsh1_n.asm and this file use the same basic pattern.
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 2.3
+C AMD bobcat ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 4.875 (4.75 is probably possible)
+C VIA nano ?
+
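+C  This routine juggles two carry chains: the doubling of the vp limbs
+C  ("scy") and the add/subtract against up ("acy").  Only one of them can
+C  live in the CPU carry flag at a time, so each is parked in a register
+C  and reloaded with the usual x86 idiom (shown generically; below, %rax
+C  holds scy and %rbp holds acy):
+C
+C	sbb	R32(%rax), R32(%rax)	C rax <- -CF  (0 or -1): "save"
+C	...				C flag-clobbering work
+C	add	R32(%rax), R32(%rax)	C CF <- 1 iff rax was -1: "restore"
+C
+C  This is what the "save scy/acy" and "restore scy/acy" comments mark.
+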
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_addlsh1_n)
+ define(func_nc, mpn_addlsh1_nc)')
+ifdef(`OPERATION_rsblsh1_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsblsh1_n)
+ define(func_nc, mpn_rsblsh1_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbp
+ xor R32(%rbp), R32(%rbp)
+L(ent): mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jz L(b0)
+ cmp $2, R32(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mov (vp), %r8
+ add %r8, %r8
+ lea 8(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 8(up), up
+ lea 8(rp), rp
+ jmp L(b0)
+
+L(b2): mov (vp), %r8
+ add %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ lea 16(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 16(up), up
+ lea 16(rp), rp
+ jmp L(b0)
+
+L(b3): mov (vp), %r8
+ add %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ lea 24(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 24(up), up
+ lea 24(rp), rp
+
+L(b0): test $4, R8(n)
+ jz L(skp)
+ add R32(%rax), R32(%rax) C restore scy
+ mov (vp), %r8
+ adc %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ mov 24(vp), %r11
+ adc %r11, %r11
+ lea 32(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ ADCSBB 24(up), %r11
+ mov %r11, 24(rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ sbb R32(%rbp), R32(%rbp) C save acy
+
+L(skp): cmp $8, n
+ jl L(rtn)
+
+ push %r12
+ push %r13
+ push %r14
+ push %rbx
+ lea -64(rp), rp
+ jmp L(x)
+
+ ALIGN(16)
+L(top): add R32(%rax), R32(%rax) C restore scy
+ lea 64(rp), rp
+ mov (vp), %r8
+ adc %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ mov 24(vp), %r11
+ adc %r11, %r11
+ mov 32(vp), %r12
+ adc %r12, %r12
+ mov 40(vp), %r13
+ adc %r13, %r13
+ mov 48(vp), %r14
+ adc %r14, %r14
+ mov 56(vp), %rbx
+ adc %rbx, %rbx
+ lea 64(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ ADCSBB 24(up), %r11
+ mov %r11, 24(rp)
+ ADCSBB 32(up), %r12
+ mov %r12, 32(rp)
+ ADCSBB 40(up), %r13
+ mov %r13, 40(rp)
+ ADCSBB 48(up), %r14
+ mov %r14, 48(rp)
+ ADCSBB 56(up), %rbx
+ mov %rbx, 56(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 64(up), up
+L(x): sub $8, n
+ jge L(top)
+
+L(end): pop %rbx
+ pop %r14
+ pop %r13
+ pop %r12
+L(rtn):
+ifdef(`OPERATION_addlsh1_n',`
+ add R32(%rbp), R32(%rax)
+ neg R32(%rax)')
+ifdef(`OPERATION_rsblsh1_n',`
+ sub R32(%rax), R32(%rbp)
+ movslq R32(%rbp), %rax')
+
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbp
+ neg %r8 C set CF
+ sbb R32(%rbp), R32(%rbp) C save acy
+ jmp L(ent)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm
new file mode 100644
index 0000000..02fb29d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm
@@ -0,0 +1,191 @@
+dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+dnl Optimised for Intel Atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 5.75
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+
+define(`LSH', 2)
+define(`RSH', 62)
+define(M, eval(m4_lshift(1,LSH)))
+
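+C  The left shift is folded into the addressing: with M = 2^LSH, the
+C  "lea (hi,reg,M)" instructions below form (vp[i] << LSH) + hi in one
+C  step, and "shr $RSH" (RSH = 64 - LSH) extracts the bits shifted out of
+C  the limb so they can be folded into the next one.  Per limb, in C
+C  (assuming 64-bit limbs):
+C
+C	t  = (v << LSH) + hi;	/* hi < 2^LSH and the low LSH bits of the   */
+C	hi =  v >> RSH;		/* shifted value are 0, so this never carries */
+C
+C  where t then enters the adc/sbb chain against up[].
+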
+ifdef(`OPERATION_addlsh2_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_addlsh2_n)
+ define(func_nc, mpn_addlsh2_nc)')
+ifdef(`OPERATION_rsblsh2_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsblsh2_n)
+ define(func_nc, mpn_rsblsh2_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jz L(b0) C we rely on rax = 0 at target
+ cmp $2, R32(%rax)
+ mov $0, R32(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mov (vp), %r9
+ lea (%rax,%r9,M), %rbp
+ shr $RSH, %r9
+ sub $1, n
+ lea -8(up), up
+ lea -8(rp), rp
+ jz L(cj1)
+ mov 8(vp), %r10
+ lea (%r9,%r10,M), %r9
+ shr $RSH, %r10
+ mov 16(vp), %r11
+ lea 24(vp), vp
+ mov (vp), %r8
+ lea (%r10,%r11,M), %r10
+ shr $RSH, %r11
+ add R32(%rax), R32(%rax)
+ jmp L(L1)
+
+L(b2): lea -32(rp), rp
+ mov (vp), %r8
+ lea -32(up), up
+ lea (%rax,%r8,M), %rbx
+ shr $RSH, %r8
+ mov 8(vp), %r9
+ sub $2, n
+ jle L(end)
+ jmp L(top)
+
+L(b3): lea -24(up), up
+ mov (vp), %r11
+ lea -24(rp), rp
+ mov 8(vp), %r8
+ lea (%rax,%r11,M), %r10
+ shr $RSH, %r11
+ lea 8(vp), vp
+ lea (%r11,%r8,M), %rbx
+ add $1, n
+ jmp L(L3)
+
+L(b0): lea -16(up), up
+ mov (vp), %r10
+ lea (%rax,%r10,M), %r9
+ shr $RSH, %r10
+ mov 8(vp), %r11
+ lea -16(rp), rp
+ mov 16(vp), %r8
+ lea (%r10,%r11,M), %r10
+ shr $RSH, %r11
+ add R32(%rax), R32(%rax)
+ lea 16(vp), vp
+ jmp L(L0)
+
+ ALIGN(16)
+L(top): lea (%r8,%r9,M), %rbp
+ shr $RSH, %r9
+ lea 32(up), up
+ mov 16(vp), %r10
+ lea (%r9,%r10,M), %r9
+ shr $RSH, %r10
+ mov 24(vp), %r11
+ lea 32(rp), rp
+ lea 32(vp), vp
+ mov (vp), %r8
+ lea (%r10,%r11,M), %r10
+ shr $RSH, %r11
+ add R32(%rax), R32(%rax)
+ ADCSBB (up), %rbx
+ mov %rbx, (rp)
+L(L1): ADCSBB 8(up), %rbp
+ mov %rbp, 8(rp)
+L(L0): ADCSBB 16(up), %r9
+ lea (%r11,%r8,M), %rbx
+ mov %r9, 16(rp)
+L(L3): ADCSBB 24(up), %r10
+ sbb R32(%rax), R32(%rax)
+L(L2): shr $RSH, %r8
+ mov 8(vp), %r9
+ mov %r10, 24(rp)
+ sub $4, n
+ jg L(top)
+
+L(end): lea (%r8,%r9,M), %rbp
+ shr $RSH, %r9
+ lea 32(up), up
+ lea 32(rp), rp
+ add R32(%rax), R32(%rax)
+ ADCSBB (up), %rbx
+ mov %rbx, (rp)
+L(cj1): ADCSBB 8(up), %rbp
+ mov %rbp, 8(rp)
+
+ifdef(`OPERATION_addlsh2_n',`
+ mov R32(n), R32(%rax) C zero rax
+ adc %r9, %rax')
+ifdef(`OPERATION_rsblsh2_n',`
+ sbb n, %r9 C subtract 0
+ mov %r9, %rax')
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm b/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm
new file mode 100644
index 0000000..83b8df9
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm
@@ -0,0 +1,128 @@
+dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Atom.
+
+dnl Copyright 2011, 2017 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Marco Bodrato. Ported to 64-bit by
+dnl Torbjörn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2
+C AMD K10 2
+C AMD bull 2.34\2.63
+C AMD pile 2.27\2.52
+C AMD steam
+C AMD excavator
+C AMD bobcat 2.79
+C AMD jaguar 2.78
+C Intel P4 11
+C Intel core2 7.5
+C Intel NHM 8.5
+C Intel SBR 2.11
+C Intel IBR 2.07
+C Intel HWL 1.75
+C Intel BWL 1.51
+C Intel SKL 1.52
+C Intel atom 3
+C Intel SLM 4
+C VIA nano
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`vp', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func_n, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func_n, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ xor cy, cy C carry
+
+L(com): shr n C n >> 1
+ jz L(1) C n == 1
+ jc L(1m2) C n % 2 == 1
+
+L(0m2): shr cy
+ mov (up), %r10
+ lea 8(up), up
+ lea 8(vp), vp
+ lea -8(rp), rp
+ jmp L(mid)
+
+L(1): shr cy
+ mov (up), %r9
+ jmp L(end)
+
+L(1m2): shr cy
+ mov (up), %r9
+
+ ALIGN(16)
+L(top): ADCSBB (vp), %r9
+ lea 16(up), up
+ mov -8(up), %r10
+ lea 16(vp), vp
+ mov %r9, (rp)
+L(mid): ADCSBB -8(vp), %r10
+ lea 16(rp), rp
+ dec n
+ mov (up), %r9
+ mov %r10, -8(rp)
+ jnz L(top)
+
+L(end): ADCSBB (vp), %r9
+ mov $0, R32(%rax)
+ mov %r9, (rp)
+ adc R32(%rax), R32(%rax)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), cy ')
+ jmp L(com)
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm
new file mode 100644
index 0000000..7cbc085
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm
@@ -0,0 +1,194 @@
+dnl AMD64 mpn_addmul_1/mpn_submul_1 optimised for Intel Atom.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.5
+C AMD K10 4.5
+C AMD bull 4.73
+C AMD pile 4.60 4.80
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.48
+C AMD jaguar 5.61
+C Intel P4 16.6
+C Intel core2 5.09
+C Intel NHM 4.79
+C Intel SBR 3.88
+C Intel IBR 3.65
+C Intel HWL 3.53
+C Intel BWL 2.75
+C Intel SKL 2.76
+C Intel atom 19.4
+C Intel SLM 8
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%rbx')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %rbx
+
+ mov (up), %rax
+ lea -8(up,n_param,8), up
+ lea -16(rp,n_param,8), rp
+
+ test $1, R8(n_param)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n_param)
+ jnz L(b10)
+
+L(b00): mov $1, R32(n)
+ sub n_param, n
+ mul v0
+ mov %rax, %r11
+ mov 8(up,n,8), %rax
+ mov %rdx, %r10
+ mul v0
+ mov %rax, %r8
+ mov 16(up,n,8), %rax
+ jmp L(lo0)
+
+L(b10): mov $3, R32(n)
+ sub n_param, n
+ mul v0
+ mov %rax, %r11
+ mov -8(up,n,8), %rax
+ mov %rdx, %r10
+ mul v0
+ test n, n
+ jns L(cj2)
+ mov %rax, %r8
+ mov (up,n,8), %rax
+ mov %rdx, %r9
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n_param)
+ jnz L(b11)
+
+L(b01): mov $2, R32(n)
+ sub n_param, n
+ mul v0
+ test n, n
+ jns L(cj1)
+ mov %rax, %r8
+ mov (up,n,8), %rax
+ mov %rdx, %r9
+ mul v0
+ mov %rax, %r11
+ mov 8(up,n,8), %rax
+ mov %rdx, %r10
+ jmp L(lo1)
+
+L(b11): xor R32(n), R32(n)
+ sub n_param, n
+ mul v0
+ mov %rax, %r8
+ mov 16(up,n,8), %rax
+ mov %rdx, %r9
+ mul v0
+ mov %rax, %r11
+ mov 24(up,n,8), %rax
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top): mul v0
+ ADDSUB %r8, -16(rp,n,8)
+ mov %rax, %r8
+ mov (up,n,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+L(lo2): mul v0
+ ADDSUB %r11, -8(rp,n,8)
+ mov %rax, %r11
+ mov 8(up,n,8), %rax
+ adc %r10, %r8
+ mov %rdx, %r10
+ adc $0, %r9
+L(lo1): mul v0
+ ADDSUB %r8, (rp,n,8)
+ mov %rax, %r8
+ adc %r9, %r11
+ mov 16(up,n,8), %rax
+ adc $0, %r10
+L(lo0): mov %rdx, %r9
+ mul v0
+ ADDSUB %r11, 8(rp,n,8)
+ mov %rax, %r11
+ adc %r10, %r8
+ mov 24(up,n,8), %rax
+ adc $0, %r9
+L(lo3): add $4, n
+ mov %rdx, %r10
+ js L(top)
+
+L(end): mul v0
+ ADDSUB %r8, -16(rp,n,8)
+ adc %r9, %r11
+ adc $0, %r10
+L(cj2): ADDSUB %r11, -8(rp,n,8)
+ adc %r10, %rax
+ adc $0, %rdx
+L(cj1): ADDSUB %rax, (rp,n,8)
+ mov $0, R32(%rax)
+ adc %rdx, %rax
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm b/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm
new file mode 100644
index 0000000..fcb9a0f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm
@@ -0,0 +1,38 @@
+dnl X86-64 mpn_cnd_add_n.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n)
+include_mpn(`x86_64/coreisbr/cnd_add_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm b/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm
new file mode 100644
index 0000000..9eee1c1
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm
@@ -0,0 +1,38 @@
+dnl X86-64 mpn_cnd_sub_n.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_cnd_sub_n)
+include_mpn(`x86_64/coreisbr/cnd_sub_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/com.asm b/gmp-6.3.0/mpn/x86_64/atom/com.asm
new file mode 100644
index 0000000..6b6460f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/com.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_com optimised for Intel Atom.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/copyd.asm b/gmp-6.3.0/mpn/x86_64/atom/copyd.asm
new file mode 100644
index 0000000..e309279
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/copyd.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyd optimised for Intel Atom.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/copyi.asm b/gmp-6.3.0/mpn/x86_64/atom/copyi.asm
new file mode 100644
index 0000000..00ec3c2
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/copyi.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyi optimised for Intel Atom.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm b/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm
new file mode 100644
index 0000000..d9ba5fe
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_divexact_1)
+include_mpn(`x86_64/nano/dive_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h
new file mode 100644
index 0000000..2cd90f6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* Intel Atom/64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#define SHLD_SLOW 1
+#define SHRD_SLOW 1
+
+/* 1600 MHz Diamondville (Atom 330) */
+/* FFT tuning limit = 50,646,641 */
+/* Generated by tuneup.c, 2019-10-16, gcc 8.3 */
+
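+/* These constants are operand sizes, in limbs, at which GMP's mpn layer
+   switches algorithms on this CPU.  As a rough illustration of how such a
+   cutoff is consulted (a sketch, not the exact library source):
+
+	if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+	  mpn_mul_basecase (rp, up, n, vp, n);
+	else
+	  mpn_toom22_mul (rp, up, n, vp, n, scratch);
+
+   Larger sizes go through the later Toom and FFT cutoffs in the same way.  */
+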
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 16
+
+#define DIV_1_VS_MUL_1_PERCENT 201
+
+#define MUL_TOOM22_THRESHOLD 12
+#define MUL_TOOM33_THRESHOLD 74
+#define MUL_TOOM44_THRESHOLD 106
+#define MUL_TOOM6H_THRESHOLD 155
+#define MUL_TOOM8H_THRESHOLD 212
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 77
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 72
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 58
+
+#define SQR_BASECASE_THRESHOLD 5
+#define SQR_TOOM2_THRESHOLD 22
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 130
+#define SQR_TOOM6_THRESHOLD 159
+#define SQR_TOOM8_THRESHOLD 236
+
+#define MULMID_TOOM42_THRESHOLD 16
+
+#define MULMOD_BNM1_THRESHOLD 9
+#define SQRMOD_BNM1_THRESHOLD 9
+
+#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 220, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
+ { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \
+ { 17, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \
+ { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \
+ { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \
+ { 25,10}, { 7, 9}, { 15, 8}, { 33, 9}, \
+ { 19, 8}, { 39, 9}, { 23, 8}, { 47, 9}, \
+ { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \
+ { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \
+ { 39, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \
+ { 71, 9}, { 143, 8}, { 287,10}, { 79,11}, \
+ { 47,10}, { 95, 9}, { 191,12}, { 31,11}, \
+ { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \
+ { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \
+ { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \
+ { 191, 9}, { 383,10}, { 207,11}, { 111,10}, \
+ { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \
+ { 159,10}, { 319,11}, { 175,10}, { 351,12}, \
+ { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
+ { 415,11}, { 223,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 287,10}, { 575,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 767,12}, { 223,11}, \
+ { 447,13}, { 127,12}, { 255,11}, { 511,12}, \
+ { 287,11}, { 575,12}, { 319,11}, { 639,12}, \
+ { 351,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 575,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 767,13}, \
+ { 447,14}, { 255,13}, { 511,12}, { 1023,13}, \
+ { 575,12}, { 1151,13}, { 703,14}, { 383,13}, \
+ { 831,12}, { 1663,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \
+ { 1407,12}, { 2815,14}, { 767,13}, { 1663,14}, \
+ { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1407,13}, { 2815,15}, { 767,14}, { 1791,16}, \
+ { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,15}, { 1535,14}, { 16384,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 169
+#define MUL_FFT_THRESHOLD 2240
+
+#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 184, 5}, { 11, 6}, { 13, 7}, { 7, 6}, \
+ { 15, 7}, { 8, 6}, { 17, 7}, { 13, 8}, \
+ { 7, 7}, { 17, 8}, { 9, 7}, { 19, 8}, \
+ { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \
+ { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \
+ { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \
+ { 23,10}, { 15, 9}, { 39,10}, { 23, 9}, \
+ { 47,11}, { 15,10}, { 31, 9}, { 63, 8}, \
+ { 127, 7}, { 255,10}, { 39, 8}, { 159,10}, \
+ { 47, 9}, { 95, 8}, { 191,11}, { 31,10}, \
+ { 63, 9}, { 127, 8}, { 255, 7}, { 511,10}, \
+ { 71, 9}, { 143, 8}, { 287, 7}, { 575, 9}, \
+ { 159, 8}, { 319,11}, { 47,10}, { 95, 9}, \
+ { 191, 8}, { 383,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \
+ { 287, 8}, { 575,10}, { 159, 9}, { 319, 8}, \
+ { 639,10}, { 175, 9}, { 351,11}, { 95,10}, \
+ { 191, 9}, { 383,11}, { 111,10}, { 223, 9}, \
+ { 447,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \
+ { 159,10}, { 319, 9}, { 639,11}, { 175,10}, \
+ { 351,12}, { 95,11}, { 191,10}, { 383, 9}, \
+ { 767,11}, { 223,10}, { 447,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 287,10}, \
+ { 575,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 351,12}, { 191,11}, { 383,10}, { 767,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 511,12}, { 287,11}, { 575,12}, { 319,11}, \
+ { 639,12}, { 351,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 447,14}, { 127,13}, { 255,12}, \
+ { 575,13}, { 319,12}, { 703,13}, { 383,12}, \
+ { 767,13}, { 447,14}, { 255,13}, { 511,12}, \
+ { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \
+ { 383,13}, { 831,12}, { 1663,15}, { 255,14}, \
+ { 511,13}, { 1151,14}, { 639,13}, { 1407,12}, \
+ { 2815,14}, { 767,13}, { 1663,14}, { 895,13}, \
+ { 1791,15}, { 511,14}, { 1023,13}, { 2047,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1407,13}, \
+ { 2815,15}, { 767,14}, { 1791,16}, { 511,15}, \
+ { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
+ { 2943,15}, { 1535,14}, { 16384,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 172
+#define SQR_FFT_THRESHOLD 1728
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 33
+#define MULLO_MUL_N_THRESHOLD 4392
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 85
+#define SQRLO_SQR_THRESHOLD 3176
+
+#define DC_DIV_QR_THRESHOLD 34
+#define DC_DIVAPPR_Q_THRESHOLD 119
+#define DC_BDIV_QR_THRESHOLD 31
+#define DC_BDIV_Q_THRESHOLD 76
+
+#define INV_MULMOD_BNM1_THRESHOLD 22
+#define INV_NEWTON_THRESHOLD 149
+#define INV_APPR_THRESHOLD 123
+
+#define BINV_NEWTON_THRESHOLD 179
+#define REDC_1_TO_REDC_2_THRESHOLD 24
+#define REDC_2_TO_REDC_N_THRESHOLD 39
+
+#define MU_DIV_QR_THRESHOLD 807
+#define MU_DIVAPPR_Q_THRESHOLD 807
+#define MUPI_DIV_QR_THRESHOLD 77
+#define MU_BDIV_QR_THRESHOLD 748
+#define MU_BDIV_Q_THRESHOLD 807
+
+#define POWM_SEC_TABLE 1,22,114,326,1486
+
+#define GET_STR_DC_THRESHOLD 16
+#define GET_STR_PRECOMPUTE_THRESHOLD 30
+#define SET_STR_DC_THRESHOLD 381
+#define SET_STR_PRECOMPUTE_THRESHOLD 1565
+
+#define FAC_DSC_THRESHOLD 960
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD2_DIV1_METHOD 3 /* 5.86% faster than 4 */
+#define HGCD_THRESHOLD 88
+#define HGCD_APPR_THRESHOLD 88
+#define HGCD_REDUCE_THRESHOLD 1182
+#define GCD_DC_THRESHOLD 241
+#define GCDEXT_DC_THRESHOLD 192
+#define JACOBI_BASE_METHOD 3 /* 9.43% faster than 2 */
+
+/* Tuneup completed successfully, took 193098 seconds */
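The thresholds above drive GMP's algorithm selection for this CPU. As a rough sketch of how
such a parameter is consulted (the real dispatch lives in mpn/generic/mul.c and also handles
unbalanced operands and scratch management; mul_n_sketch is an illustrative name, while
BELOW_THRESHOLD, mpn_mul_basecase and mpn_toom22_mul are GMP internals from gmp-impl.h):

/* Sketch only: a tuned threshold gating the basecase/Toom-2.2 choice
   for an n x n limb multiply.  Not GMP code. */
static void
mul_n_sketch (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, mp_ptr scratch)
{
  if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
    mpn_mul_basecase (rp, ap, n, bp, n);          /* schoolbook, O(n^2) */
  else
    mpn_toom22_mul (rp, ap, n, bp, n, scratch);   /* Karatsuba-style split */
}

With the value 12 tuned above, a 1600 MHz Atom 330 leaves the schoolbook loop already at
12 limbs (768 bits).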
diff --git a/gmp-6.3.0/mpn/x86_64/atom/lshift.asm b/gmp-6.3.0/mpn/x86_64/atom/lshift.asm
new file mode 100644
index 0000000..1b37d5d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/lshift.asm
@@ -0,0 +1,123 @@
+dnl AMD64 mpn_lshift -- mpn left shift, optimised for Atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 4.5
+C VIA nano ?
+
+C TODO
+C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times
+C larger.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshift)
+ FUNC_ENTRY(4)
+ lea -8(up,n,8), up
+ lea -8(rp,n,8), rp
+ shr R32(n)
+ mov (up), %rax
+ jnc L(evn)
+
+ mov %rax, %r11
+ shl R8(%rcx), %r11
+ neg R8(%rcx)
+ shr R8(%rcx), %rax
+ test n, n
+ jnz L(gt1)
+ mov %r11, (rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): mov -8(up), %r8
+ mov %r8, %r10
+ shr R8(%rcx), %r8
+ jmp L(lo1)
+
+L(evn): mov %rax, %r10
+ neg R8(%rcx)
+ shr R8(%rcx), %rax
+ mov -8(up), %r9
+ mov %r9, %r11
+ shr R8(%rcx), %r9
+ neg R8(%rcx)
+ dec n
+ lea 8(rp), rp
+ lea -8(up), up
+ jz L(end)
+
+ ALIGN(8)
+L(top): shl R8(%rcx), %r10
+ or %r10, %r9
+ shl R8(%rcx), %r11
+ neg R8(%rcx)
+ mov -8(up), %r8
+ mov %r8, %r10
+ mov %r9, -8(rp)
+ shr R8(%rcx), %r8
+ lea -16(rp), rp
+L(lo1): mov -16(up), %r9
+ or %r11, %r8
+ mov %r9, %r11
+ shr R8(%rcx), %r9
+ lea -16(up), up
+ neg R8(%rcx)
+ mov %r8, (rp)
+ dec n
+ jg L(top)
+
+L(end): shl R8(%rcx), %r10
+ or %r10, %r9
+ shl R8(%rcx), %r11
+ mov %r9, -8(rp)
+ mov %r11, -16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
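For reference, the scheduled loop above implements the following semantics, written here as a
plain C sketch (lshift_ref is an illustrative name; assumes 1 <= cnt < GMP_NUMB_BITS and a
nail-free build):

/* mpn_lshift semantics: shift {up,n} left by cnt bits, store at {rp,n},
   return the bits shifted out of the top limb.  Runs high-to-low so the
   usual rp >= up overlap is safe.  Sketch only, not the optimised code. */
mp_limb_t
lshift_ref (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
{
  unsigned tnc = GMP_NUMB_BITS - cnt;
  mp_limb_t high = up[n - 1];
  mp_limb_t retval = high >> tnc;
  for (mp_size_t i = n - 1; i > 0; i--)
    {
      mp_limb_t low = up[i - 1];
      rp[i] = (high << cnt) | (low >> tnc);
      high = low;
    }
  rp[0] = high << cnt;
  return retval;
}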
diff --git a/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm
new file mode 100644
index 0000000..7385f8f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm
@@ -0,0 +1,127 @@
+dnl AMD64 mpn_lshiftc -- mpn left shift with complement, optimised for Atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 5
+C VIA nano ?
+
+C TODO
+C * Consider using 4-way unrolling. We reach 4.5 c/l, but the code is 2.5
+C times larger.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshiftc)
+ FUNC_ENTRY(4)
+ lea -8(up,n,8), up
+ lea -8(rp,n,8), rp
+ shr R32(n)
+ mov (up), %rax
+ jnc L(evn)
+
+ mov %rax, %r11
+ shl R8(%rcx), %r11
+ neg R8(%rcx)
+ shr R8(%rcx), %rax
+ test n, n
+ jnz L(gt1)
+ not %r11
+ mov %r11, (rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): mov -8(up), %r8
+ mov %r8, %r10
+ shr R8(%rcx), %r8
+ jmp L(lo1)
+
+L(evn): mov %rax, %r10
+ neg R8(%rcx)
+ shr R8(%rcx), %rax
+ mov -8(up), %r9
+ mov %r9, %r11
+ shr R8(%rcx), %r9
+ neg R8(%rcx)
+ lea 8(rp), rp
+ lea -8(up), up
+ jmp L(lo0)
+
+C ALIGN(16)
+L(top): shl R8(%rcx), %r10
+ or %r10, %r9
+ shl R8(%rcx), %r11
+ not %r9
+ neg R8(%rcx)
+ mov -8(up), %r8
+ lea -16(rp), rp
+ mov %r8, %r10
+ shr R8(%rcx), %r8
+ mov %r9, 8(rp)
+L(lo1): or %r11, %r8
+ mov -16(up), %r9
+ mov %r9, %r11
+ shr R8(%rcx), %r9
+ lea -16(up), up
+ neg R8(%rcx)
+ not %r8
+ mov %r8, (rp)
+L(lo0): dec n
+ jg L(top)
+
+L(end): shl R8(%rcx), %r10
+ or %r10, %r9
+ not %r9
+ shl R8(%rcx), %r11
+ not %r11
+ mov %r9, -8(rp)
+ mov %r11, -16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm b/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm
new file mode 100644
index 0000000..a0dcf1e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm
@@ -0,0 +1,147 @@
+dnl AMD64 mpn_mul_1 optimised for Intel Atom.
+
+dnl Copyright 2003-2005, 2007, 2008, 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 3.03
+C AMD K10 3.03
+C AMD bull 4.74
+C AMD pile 4.56
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.56 6.04
+C AMD jaguar 5.55 5.84
+C Intel P4 13.05
+C Intel core2 4.03
+C Intel NHM 3.80
+C Intel SBR 2.75
+C Intel IBR 2.69
+C Intel HWL 2.50
+C Intel BWL 2.55
+C Intel SKL 2.57
+C Intel atom 17.3
+C Intel SLM 14.7
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+ FUNC_ENTRY(4)
+ xor %r8, %r8
+L(com): mov (up), %rax
+ lea -16(up,n_param,8), up
+ lea -8(rp,n_param,8), rp
+ test $1, R8(n_param)
+ jnz L(bx1)
+
+L(bx0): mov %r8, %r9
+ test $2, R8(n_param)
+ jnz L(b10)
+
+L(b00): mov $2, R32(n)
+ sub n_param, n
+ jmp L(lo0)
+
+L(bx1): test $2, R8(n_param)
+ jnz L(b11)
+
+L(b01): mov $3, R32(n)
+ sub n_param, n
+ mul v0
+ cmp $2, n
+ jnz L(lo1)
+ jmp L(cj1)
+
+L(b11): mov $1, R32(n)
+ sub n_param, n
+ jmp L(lo3)
+
+L(b10): xor R32(n), R32(n)
+ sub n_param, n
+ jmp L(lo2)
+
+L(top): mul v0
+ mov %r9, -24(rp,n,8)
+L(lo1): xor %r9d, %r9d
+ add %rax, %r8
+ mov (up,n,8), %rax
+ adc %rdx, %r9
+ mov %r8, -16(rp,n,8)
+L(lo0): xor %r8d, %r8d
+ mul v0
+ add %rax, %r9
+ mov 8(up,n,8), %rax
+ adc %rdx, %r8
+ mov %r9, -8(rp,n,8)
+L(lo3): xor %r9d, %r9d
+ mul v0
+ add %rax, %r8
+ mov 16(up,n,8), %rax
+ adc %rdx, %r9
+ mov %r8, (rp,n,8)
+L(lo2): xor %r8d, %r8d
+ mul v0
+ add %rax, %r9
+ mov 24(up,n,8), %rax
+ adc %rdx, %r8
+ add $4, n
+ js L(top)
+
+L(end): mul v0
+ mov %r9, -8(rp)
+L(cj1): add %rax, %r8
+ mov $0, R32(%rax)
+ adc %rdx, %rax
+ mov %r8, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+
+PROLOGUE(mpn_mul_1c)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ jmp L(com)
+EPILOGUE()
+ASM_END()
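The two entry points above are mpn_mul_1 and mpn_mul_1c; the latter takes an extra carry-in
limb (fetched from the stack under the DOS64 ABI). A hedged C sketch of the semantics,
assuming 64-bit limbs and using unsigned __int128 for the 64x64->128 products (mul_1c_ref is
an illustrative name):

/* mpn_mul_1c semantics: {rp,n} = {up,n} * v0 + cy, return the carry-out
   limb.  mpn_mul_1 is the cy == 0 case.  Sketch only. */
mp_limb_t
mul_1c_ref (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0, mp_limb_t cy)
{
  for (mp_size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0 + cy;
      rp[i] = (mp_limb_t) p;
      cy = (mp_limb_t) (p >> 64);
    }
  return cy;
}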
diff --git a/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm b/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm
new file mode 100644
index 0000000..4bc22cd
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm
@@ -0,0 +1,190 @@
+dnl AMD64 mpn_mul_2 optimised for Intel Atom.
+
+dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C AMD K8,K9 5.78
+C AMD K10 5.78
+C AMD bull 9.10
+C AMD pile 9.17
+C AMD steam
+C AMD excavator
+C AMD bobcat 11.3
+C AMD jaguar 10.9
+C Intel P4 24.6
+C Intel core2 8.06
+C Intel NHM 7.65
+C Intel SBR 6.28
+C Intel IBR 6.10
+C Intel HWL 6.09
+C Intel BWL 4.73
+C Intel SKL 4.77
+C Intel atom 35.3
+C Intel SLM 25.6
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (up), %rax
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov n_param, n
+ mul v0
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): mov %rax, w0
+ mov (up), %rax
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ lea -8(rp), rp
+ jmp L(lo0)
+
+L(b10): mov %rax, w2
+ mov (up), %rax
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ lea -16(up), up
+ lea -24(rp), rp
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): mov %rax, w3
+ mov %rdx, w0
+ mov (up), %rax
+ xor R32(w1), R32(w1)
+ lea 8(up), up
+ dec n
+ jmp L(lo1)
+
+L(b11): mov %rax, w1
+ mov (up), %rax
+ mov %rdx, w2
+ xor R32(w3), R32(w3)
+ lea -8(up), up
+ lea -16(rp), rp
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top):
+L(lo1): mul v1
+ add %rax, w0
+ mov (up), %rax
+ mov $0, R32(w2)
+ mov w3, (rp)
+ adc %rdx, w1
+ mul v0
+ add %rax, w0
+ mov (up), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+L(lo0): mul v1
+ add %rax, w1
+ mov 8(up), %rax
+ mov w0, 8(rp)
+ adc %rdx, w2
+ mul v0
+ add %rax, w1
+ mov 8(up), %rax
+ adc %rdx, w2
+ mov $0, R32(w3)
+ adc $0, R32(w3)
+L(lo3): mul v1
+ add %rax, w2
+ mov 16(up), %rax
+ mov w1, 16(rp)
+ mov $0, R32(w0)
+ adc %rdx, w3
+ mul v0
+ add %rax, w2
+ mov 16(up), %rax
+ adc %rdx, w3
+L(lo2): mov $0, R32(w1)
+ mov w2, 24(rp)
+ adc $0, R32(w0)
+ mul v1
+ add %rax, w3
+ mov 24(up), %rax
+ lea 32(up), up
+ adc %rdx, w0
+ mul v0
+ lea 32(rp), rp
+ add %rax, w3
+ adc %rdx, w0
+ mov -8(up), %rax
+ adc $0, R32(w1)
+ sub $4, n
+ ja L(top)
+
+L(end): mul v1
+ mov w3, (rp)
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, 8(rp)
+ mov w1, %rax
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
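mpn_mul_2 multiplies {up,n} by the two-limb value {vp,2}, stores the low n+1 limbs at rp and
returns the top limb; keeping both per-v-limb recurrences live in one loop is what the
w0..w3 rotation above is for. Its result is equivalent to a mul_1 pass followed by an
addmul_1 pass (mul_2_ref is an illustrative name):

/* Sketch: mpn_mul_2 produces the same result as one mul_1 pass and one
   addmul_1 pass.  rp must have room for n+1 limbs; the limb above that
   is returned. */
mp_limb_t
mul_2_ref (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
{
  rp[n] = mpn_mul_1 (rp, up, n, vp[0]);        /* {rp,n} = up * v0, carry -> rp[n] */
  return mpn_addmul_1 (rp + 1, up, n, vp[1]);  /* {rp+1,n} += up * v1, return carry */
}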
diff --git a/gmp-6.3.0/mpn/x86_64/atom/popcount.asm b/gmp-6.3.0/mpn/x86_64/atom/popcount.asm
new file mode 100644
index 0000000..fb14dd3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/popcount.asm
@@ -0,0 +1,35 @@
+dnl x86-64 mpn_popcount.
+
+dnl Copyright 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm b/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm
new file mode 100644
index 0000000..62b9a84
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm
@@ -0,0 +1,579 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Atom.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat 5.0
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single out the basecases before the pushes.
+C * Make lead-in code for the inner loops be more similar.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+define(`w0', `%rbp')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea (mp_param,n,8), mp
+ lea (up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): test $2, R8(n)
+ jz L(b3)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+
+L(otp1):lea 1(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, %rbp
+ mov 8(mp,n,8), %rax
+ mov %rdx, %r9
+ mul q0
+ mov %rax, %rbx
+ mov 16(mp,n,8), %rax
+ mov %rdx, %r10
+ mul q0
+ add (up,n,8), %rbp
+ mov %rax, %rbp
+ adc %r9, %rbx
+ mov 24(mp,n,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+ mul q0
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 32(mp,n,8), %rax
+ adc $0, %r9
+ imul u0inv, %rbx C next q limb
+ jmp L(e1)
+
+ ALIGNx
+L(tp1): mul q0
+ add %rbp, -24(up,i,8)
+ mov %rax, %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+ mul q0
+ add %r11, -16(up,i,8)
+ mov %rax, %r11
+ mov 8(mp,i,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+ mul q0
+ add %rbp, -8(up,i,8)
+ mov %rax, %rbp
+ adc %r9, %r11
+ mov 16(mp,i,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+ mul q0
+ add %r11, (up,i,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 24(mp,i,8), %rax
+ adc $0, %r9
+L(e1): add $4, i
+ mov %rdx, %r10
+ js L(tp1)
+
+L(ed1): mul q0
+ add %rbp, I(-24(up),-24(up,i,8))
+ adc %r9, %r11
+ adc $0, %r10
+ add %r11, I(-16(up),-16(up,i,8))
+ adc %r10, %rax
+ adc $0, %rdx
+ add %rax, I(-8(up),-8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b3): cmp $-3, R32(n)
+ jz L(n3)
+
+L(otp3):lea 3(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, %rbp
+ mov 8(mp,n,8), %rax
+ mov %rdx, %r9
+ mul q0
+ mov %rax, %rbx
+ mov 16(mp,n,8), %rax
+ mov %rdx, %r10
+ mul q0
+ add (up,n,8), %rbp
+ mov %rax, %rbp
+ mov 24(mp,n,8), %rax
+ adc %r9, %rbx
+ mov %rdx, %r9
+ adc $0, %r10
+ mul q0
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ mov %rax, %r11
+ mov 32(mp,n,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+ imul u0inv, %rbx C next q limb
+ jmp L(e3)
+
+ ALIGNx
+L(tp3): mul q0
+ add %rbp, -24(up,i,8)
+ mov %rax, %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+ mul q0
+ add %r11, -16(up,i,8)
+ mov %rax, %r11
+ mov 8(mp,i,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+L(e3): mul q0
+ add %rbp, -8(up,i,8)
+ mov %rax, %rbp
+ adc %r9, %r11
+ mov 16(mp,i,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+ mul q0
+ add %r11, (up,i,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 24(mp,i,8), %rax
+ adc $0, %r9
+ add $4, i
+ mov %rdx, %r10
+ js L(tp3)
+
+L(ed3): mul q0
+ add %rbp, I(-24(up),-24(up,i,8))
+ adc %r9, %r11
+ adc $0, %r10
+ add %r11, I(-16(up),-16(up,i,8))
+ adc %r10, %rax
+ adc $0, %rdx
+ add %rax, I(-8(up),-8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp3)
+C jmp L(cj)
+
+L(cj):
+IFSTD(` lea (up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea (up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+IFSTD(` sub $8, %rsp ')
+IFDOS(` sub $40, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_add_n)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(bx0): test $2, R8(n)
+ jnz L(b2)
+
+L(b0): cmp $-4, R32(n)
+ jz L(n4)
+
+L(otp0):lea 4(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, %r11
+ mov 8(mp,n,8), %rax
+ mov %rdx, %r10
+ mul q0
+ mov %rax, %rbx
+ mov 16(mp,n,8), %rax
+ mov %rdx, %r9
+ mul q0
+ add (up,n,8), %r11
+ mov %rax, %r11
+ adc %r10, %rbx
+ mov 24(mp,n,8), %rax
+ adc $0, %r9
+ mov %rdx, %r10
+ mul q0
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ mov %rax, %rbp
+ mov 32(mp,n,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+ imul u0inv, %rbx C next q limb
+ jmp L(e0)
+
+ ALIGNx
+L(tp0): mul q0
+ add %rbp, -24(up,i,8)
+ mov %rax, %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+L(e0): mul q0
+ add %r11, -16(up,i,8)
+ mov %rax, %r11
+ mov 8(mp,i,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+ mul q0
+ add %rbp, -8(up,i,8)
+ mov %rax, %rbp
+ adc %r9, %r11
+ mov 16(mp,i,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+ mul q0
+ add %r11, (up,i,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 24(mp,i,8), %rax
+ adc $0, %r9
+ add $4, i
+ mov %rdx, %r10
+ js L(tp0)
+
+L(ed0): mul q0
+ add %rbp, I(-24(up),-24(up,i,8))
+ adc %r9, %r11
+ adc $0, %r10
+ add %r11, I(-16(up),-16(up,i,8))
+ adc %r10, %rax
+ adc $0, %rdx
+ add %rax, I(-8(up),-8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp0)
+ jmp L(cj)
+
+L(b2): cmp $-2, R32(n)
+ jz L(n2)
+
+L(otp2):lea 2(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, %r11
+ mov 8(mp,n,8), %rax
+ mov %rdx, %r10
+ mul q0
+ mov %rax, %rbx
+ mov 16(mp,n,8), %rax
+ mov %rdx, %r9
+ mul q0
+ add (up,n,8), %r11
+ mov %rax, %r11
+ adc %r10, %rbx
+ mov 24(mp,n,8), %rax
+ adc $0, %r9
+ mov %rdx, %r10
+ mul q0
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ mov %rax, %rbp
+ mov 32(mp,n,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+ imul u0inv, %rbx C next q limb
+ jmp L(e2)
+
+ ALIGNx
+L(tp2): mul q0
+ add %rbp, -24(up,i,8)
+ mov %rax, %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+ mul q0
+ add %r11, -16(up,i,8)
+ mov %rax, %r11
+ mov 8(mp,i,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+ mul q0
+ add %rbp, -8(up,i,8)
+ mov %rax, %rbp
+ adc %r9, %r11
+ mov 16(mp,i,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+L(e2): mul q0
+ add %r11, (up,i,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 24(mp,i,8), %rax
+ adc $0, %r9
+ add $4, i
+ mov %rdx, %r10
+ js L(tp2)
+
+L(ed2): mul q0
+ add %rbp, I(-24(up),-24(up,i,8))
+ adc %r9, %r11
+ adc $0, %r10
+ add %r11, I(-16(up),-16(up,i,8))
+ adc %r10, %rax
+ adc $0, %rdx
+ add %rax, I(-8(up),-8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp2)
+ jmp L(cj)
+
+L(n1): mov (mp_param), %rax
+ mul q0
+ add -8(up), %rax
+ adc (up), %rdx
+ mov %rdx, (rp)
+ mov $0, R32(%rax)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+L(n2): mov (mp_param), %rax
+ mov -16(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov -8(up), %r10
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, q0
+ imul u0inv, q0 C next q0
+ mov -16(mp), %rax
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov (up), %r14
+ mul q0
+ add %rax, %r14
+ adc $0, %rdx
+ add %r9, %r14
+ adc $0, %rdx
+ xor R32(%rax), R32(%rax)
+ add %r11, %r14
+ adc 8(up), %rdx
+ mov %r14, (rp)
+ mov %rdx, 8(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+ ALIGNx
+L(n3): mov -24(mp), %rax
+ mov -24(up), %r10
+ mul q0
+ add %rax, %r10
+ mov -16(mp), %rax
+ mov %rdx, %r11
+ adc $0, %r11
+ mov -16(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ add %r11, %rbp
+ mov -8(up), %r10
+ adc $0, %r9
+ mul q0
+ mov %rbp, q0
+ imul u0inv, q0 C next q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ mov %rbp, -16(up)
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, -8(up)
+ mov %r11, -24(up) C up[0]
+ lea 8(up), up C up++
+ dec j
+ jnz L(n3)
+
+ mov -48(up), %rdx
+ mov -40(up), %rbx
+ xor R32(%rax), R32(%rax)
+ add %rbp, %rdx
+ adc %r10, %rbx
+ adc -8(up), %r11
+ mov %rdx, (rp)
+ mov %rbx, 8(rp)
+ mov %r11, 16(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+L(n4): mov -32(mp), %rax
+ mul q0
+ mov %rax, %r11
+ mov -24(mp), %rax
+ mov %rdx, %r10
+ mul q0
+ mov %rax, %rbx
+ mov -16(mp), %rax
+ mov %rdx, %r9
+ mul q0
+ add -32(up), %r11
+ mov %rax, %r11
+ adc %r10, %rbx
+ mov -8(mp), %rax
+ adc $0, %r9
+ mov %rdx, %r10
+ mul q0
+ add -24(up), %rbx
+ mov %rbx, -24(up)
+ adc %r9, %r11
+ adc $0, %r10
+ imul u0inv, %rbx C next q limb
+ add %r11, -16(up)
+ adc %r10, %rax
+ adc $0, %rdx
+ add %rax, -8(up)
+ adc $0, %rdx
+ mov %rdx, -32(up) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ dec j
+ lea 8(up), up C up++
+ jnz L(n4)
+ jmp L(cj)
+EPILOGUE()
+ASM_END()
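The structure above, per outer iteration q0 = up[0] * u0inv followed by an addmul-style pass
over the modulus and a final CALL to mpn_add_n, is one-limb Montgomery reduction (REDC). A
sketch in the spirit of mpn/generic/redc_1.c, assuming a nail-free build (redc_1_ref is an
illustrative name):

/* One-limb-at-a-time Montgomery reduction.  {up,2n} is reduced modulo
   {mp,n}; invm = -1/mp[0] mod B, B = 2^GMP_NUMB_BITS.  Each pass zeroes
   up[0] by adding q * {mp,n}, parking the carry; the two halves are then
   folded with mpn_add_n.  The returned carry tells the caller whether a
   final conditional subtraction of the modulus is needed. */
mp_limb_t
redc_1_ref (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
{
  for (mp_size_t j = 0; j < n; j++)
    {
      mp_limb_t q = up[0] * invm;                  /* mod B */
      mp_limb_t cy = mpn_addmul_1 (up, mp, n, q);  /* makes up[0] == 0 */
      up[0] = cy;                                  /* park the carry */
      up++;
    }
  return mpn_add_n (rp, up, up - n, n);            /* fold the two halves */
}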
diff --git a/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm
new file mode 100644
index 0000000..6f5f638
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm
@@ -0,0 +1,287 @@
+dnl x86-64 mpn_rsh1add_n/mpn_rsh1sub_n.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Schedule loop less. It is now almost surely overscheduled, resulting in
+C large feed-in and wind-down code.
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM	     ?
+C Intel SBR ?
+C Intel atom 5.25
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n',`%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_rsh1add_n)
+ define(func_nc, mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsh1sub_n)
+ define(func_nc, mpn_rsh1sub_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), %r15
+ ADDSUB (vp), %r15
+ sbb R32(%rbx), R32(%rbx)
+ xor R32(%rax), R32(%rax)
+ shr %r15
+ adc R32(%rax), R32(%rax) C return value
+
+ mov R32(n), R32(%rbp)
+ and $3, R32(%rbp)
+ jz L(b0)
+ cmp $2, R32(%rbp)
+ jae L(b23)
+
+L(b1): dec n
+ jnz L(gt1)
+ shl $63, %rbx
+ add %rbx, %r15
+ mov %r15, (rp)
+ jmp L(cj1)
+L(gt1): lea 24(up), up
+ lea 24(vp), vp
+ mov -16(up), %r9
+ add R32(%rbx), R32(%rbx)
+ mov -8(up), %r10
+ lea 24(rp), rp
+ mov (up), %r11
+ ADCSBB -16(vp), %r9
+ ADCSBB -8(vp), %r10
+ mov %r15, %r12
+ ADCSBB (vp), %r11
+ mov %r9, %r13
+ sbb R32(%rbx), R32(%rbx)
+ mov %r11, %r15
+ mov %r10, %r14
+ shl $63, %r11
+ shl $63, %r10
+ shl $63, %r9
+ or %r9, %r12
+ shr %r13
+ mov 8(up), %r8
+ shr %r14
+ or %r10, %r13
+ shr %r15
+ or %r11, %r14
+ sub $4, n
+ jz L(cj5)
+L(gt5): mov 16(up), %r9
+ add R32(%rbx), R32(%rbx)
+ mov 24(up), %r10
+ ADCSBB 8(vp), %r8
+ mov %r15, %rbp
+ mov 32(up), %r11
+ jmp L(lo1)
+
+L(b23): jnz L(b3)
+ mov 8(up), %r8
+ sub $2, n
+ jnz L(gt2)
+ add R32(%rbx), R32(%rbx)
+ ADCSBB 8(vp), %r8
+ mov %r8, %r12
+ jmp L(cj2)
+L(gt2): mov 16(up), %r9
+ add R32(%rbx), R32(%rbx)
+ mov 24(up), %r10
+ ADCSBB 8(vp), %r8
+ mov %r15, %rbp
+ mov 32(up), %r11
+ ADCSBB 16(vp), %r9
+ lea 32(up), up
+ ADCSBB 24(vp), %r10
+ mov %r9, %r13
+ ADCSBB 32(vp), %r11
+ mov %r8, %r12
+ jmp L(lo2)
+
+L(b3): lea 40(up), up
+ lea 8(vp), vp
+ mov %r15, %r14
+ add R32(%rbx), R32(%rbx)
+ mov -32(up), %r11
+ ADCSBB 0(vp), %r11
+ lea 8(rp), rp
+ sbb R32(%rbx), R32(%rbx)
+ mov %r11, %r15
+ shl $63, %r11
+ mov -24(up), %r8
+ shr %r15
+ or %r11, %r14
+ sub $3, n
+ jnz L(gt3)
+ add R32(%rbx), R32(%rbx)
+ ADCSBB 8(vp), %r8
+ jmp L(cj3)
+L(gt3): mov -16(up), %r9
+ add R32(%rbx), R32(%rbx)
+ mov -8(up), %r10
+ ADCSBB 8(vp), %r8
+ mov %r15, %rbp
+ mov (up), %r11
+ ADCSBB 16(vp), %r9
+ ADCSBB 24(vp), %r10
+ mov %r8, %r12
+ jmp L(lo3)
+
+L(b0): lea 48(up), up
+ lea 16(vp), vp
+ add R32(%rbx), R32(%rbx)
+ mov -40(up), %r10
+ lea 16(rp), rp
+ mov -32(up), %r11
+ ADCSBB -8(vp), %r10
+ mov %r15, %r13
+ ADCSBB (vp), %r11
+ sbb R32(%rbx), R32(%rbx)
+ mov %r11, %r15
+ mov %r10, %r14
+ shl $63, %r11
+ shl $63, %r10
+ mov -24(up), %r8
+ shr %r14
+ or %r10, %r13
+ shr %r15
+ or %r11, %r14
+ sub $4, n
+ jnz L(gt4)
+ add R32(%rbx), R32(%rbx)
+ ADCSBB 8(vp), %r8
+ jmp L(cj4)
+L(gt4): mov -16(up), %r9
+ add R32(%rbx), R32(%rbx)
+ mov -8(up), %r10
+ ADCSBB 8(vp), %r8
+ mov %r15, %rbp
+ mov (up), %r11
+ ADCSBB 16(vp), %r9
+ jmp L(lo0)
+
+ ALIGN(8)
+L(top): mov 16(up), %r9
+ shr %r14
+ or %r10, %r13
+ shr %r15
+ or %r11, %r14
+ add R32(%rbx), R32(%rbx)
+ mov 24(up), %r10
+ mov %rbp, (rp)
+ ADCSBB 8(vp), %r8
+ mov %r15, %rbp
+ lea 32(rp), rp
+ mov 32(up), %r11
+L(lo1): ADCSBB 16(vp), %r9
+ lea 32(up), up
+ mov %r12, -24(rp)
+L(lo0): ADCSBB 24(vp), %r10
+ mov %r8, %r12
+ mov %r13, -16(rp)
+L(lo3): ADCSBB 32(vp), %r11
+ mov %r9, %r13
+ mov %r14, -8(rp)
+L(lo2): sbb R32(%rbx), R32(%rbx)
+ shl $63, %r8
+ mov %r11, %r15
+ shr %r12
+ mov %r10, %r14
+ shl $63, %r9
+ lea 32(vp), vp
+ shl $63, %r10
+ or %r8, %rbp
+ shl $63, %r11
+ or %r9, %r12
+ shr %r13
+ mov 8(up), %r8
+ sub $4, n
+ jg L(top)
+
+L(end): shr %r14
+ or %r10, %r13
+ shr %r15
+ or %r11, %r14
+ mov %rbp, (rp)
+ lea 32(rp), rp
+L(cj5): add R32(%rbx), R32(%rbx)
+ ADCSBB 8(vp), %r8
+ mov %r12, -24(rp)
+L(cj4): mov %r13, -16(rp)
+L(cj3): mov %r8, %r12
+ mov %r14, -8(rp)
+L(cj2): sbb R32(%rbx), R32(%rbx)
+ shl $63, %r8
+ shr %r12
+ or %r8, %r15
+ shl $63, %rbx
+ add %rbx, %r12
+ mov %r15, (rp)
+ mov %r12, 8(rp)
+L(cj1): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
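mpn_rsh1add_n and mpn_rsh1sub_n compute ({up,n} ± {vp,n}) >> 1, folding the add/sub carry
into the top bit and returning the bit shifted out at the bottom. A two-pass C sketch of the
add flavour (rsh1add_n_ref and its scratch argument are illustrative; the code above fuses
everything into one pass):

/* Sketch of mpn_rsh1add_n: rp = (up + vp) >> 1 with the addition's carry
   entering the top bit; returns the bit shifted out.  Uses scratch tp. */
mp_limb_t
rsh1add_n_ref (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t cy = mpn_add_n (tp, up, vp, n);      /* tp = up + vp, carry cy */
  mp_limb_t low = tp[0] & 1;                     /* bit that falls off */
  mpn_rshift (rp, tp, n, 1);
  rp[n - 1] |= cy << (GMP_NUMB_BITS - 1);        /* carry becomes top bit */
  return low;
}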
diff --git a/gmp-6.3.0/mpn/x86_64/atom/rshift.asm b/gmp-6.3.0/mpn/x86_64/atom/rshift.asm
new file mode 100644
index 0000000..29c027d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/rshift.asm
@@ -0,0 +1,121 @@
+dnl AMD64 mpn_rshift -- mpn right shift, optimised for Atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 4.5
+C VIA nano ?
+
+C TODO
+C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times
+C larger.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_rshift)
+ FUNC_ENTRY(4)
+ shr R32(n)
+ mov (up), %rax
+ jnc L(evn)
+
+ mov %rax, %r11
+ shr R8(cnt), %r11
+ neg R8(cnt)
+ shl R8(cnt), %rax
+ test n, n
+ jnz L(gt1)
+ mov %r11, (rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): mov 8(up), %r8
+ mov %r8, %r10
+ shl R8(cnt), %r8
+ jmp L(lo1)
+
+L(evn): mov %rax, %r10
+ neg R8(cnt)
+ shl R8(cnt), %rax
+ mov 8(up), %r9
+ mov %r9, %r11
+ shl R8(cnt), %r9
+ neg R8(cnt)
+ dec n
+ lea -8(rp), rp
+ lea 8(up), up
+ jz L(end)
+
+ ALIGN(8)
+L(top): shr R8(cnt), %r10
+ or %r10, %r9
+ shr R8(cnt), %r11
+ neg R8(cnt)
+ mov 8(up), %r8
+ mov %r8, %r10
+ mov %r9, 8(rp)
+ shl R8(cnt), %r8
+ lea 16(rp), rp
+L(lo1): mov 16(up), %r9
+ or %r11, %r8
+ mov %r9, %r11
+ shl R8(cnt), %r9
+ lea 16(up), up
+ neg R8(cnt)
+ mov %r8, (rp)
+ dec n
+ jg L(top)
+
+L(end): shr R8(cnt), %r10
+ or %r10, %r9
+ shr R8(cnt), %r11
+ mov %r9, 8(rp)
+ mov %r11, 16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm
new file mode 100644
index 0000000..1306acd
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm
@@ -0,0 +1,242 @@
+dnl AMD64 mpn_sublsh1_n optimised for Intel Atom.
+dnl Used also for AMD bd1.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * This code is slightly large at 501 bytes.
+C * aorrlsh1_n.asm and this file use the same basic pattern.
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 2.3
+C AMD bobcat ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 5 (4.875 is probably possible)
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sublsh1_n)
+ FUNC_ENTRY(4)
+ push %rbp
+ push %r15
+ xor R32(%rbp), R32(%rbp)
+L(ent): mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jz L(b0)
+ cmp $2, R32(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mov (vp), %r8
+ add %r8, %r8
+ lea 8(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ mov (up), %r15
+ sbb %r8, %r15
+ mov %r15, (rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 8(up), up
+ lea 8(rp), rp
+ jmp L(b0)
+
+L(b2): mov (vp), %r8
+ add %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ lea 16(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ mov (up), %r15
+ sbb %r8, %r15
+ mov %r15, (rp)
+ mov 8(up), %r15
+ sbb %r9, %r15
+ mov %r15, 8(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 16(up), up
+ lea 16(rp), rp
+ jmp L(b0)
+
+L(b3): mov (vp), %r8
+ add %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ lea 24(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ mov (up), %r15
+ sbb %r8, %r15
+ mov %r15, (rp)
+ mov 8(up), %r15
+ sbb %r9, %r15
+ mov %r15, 8(rp)
+ mov 16(up), %r15
+ sbb %r10, %r15
+ mov %r15, 16(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 24(up), up
+ lea 24(rp), rp
+
+L(b0): test $4, R8(n)
+ jz L(skp)
+ add R32(%rax), R32(%rax) C restore scy
+ mov (vp), %r8
+ adc %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ mov 24(vp), %r11
+ adc %r11, %r11
+ lea 32(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ mov (up), %r15
+ sbb %r8, %r15
+ mov %r15, (rp)
+ mov 8(up), %r15
+ sbb %r9, %r15
+ mov %r15, 8(rp)
+ mov 16(up), %r15
+ sbb %r10, %r15
+ mov %r15, 16(rp)
+ mov 24(up), %r15
+ sbb %r11, %r15
+ mov %r15, 24(rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ sbb R32(%rbp), R32(%rbp) C save acy
+
+L(skp): cmp $8, n
+ jl L(rtn)
+
+ push %r12
+ push %r13
+ push %r14
+ push %rbx
+ lea -64(rp), rp
+ jmp L(x)
+
+ ALIGN(16)
+L(top): mov (vp), %r8
+ add R32(%rax), R32(%rax)
+ lea 64(vp), vp
+ adc %r8, %r8
+ mov -56(vp), %r9
+ adc %r9, %r9
+ mov -48(vp), %r10
+ adc %r10, %r10
+ mov -40(vp), %r11
+ adc %r11, %r11
+ mov -32(vp), %r12
+ adc %r12, %r12
+ mov -24(vp), %r13
+ adc %r13, %r13
+ mov -16(vp), %r14
+ adc %r14, %r14
+ mov -8(vp), %r15
+ adc %r15, %r15
+ sbb R32(%rax), R32(%rax)
+ add R32(%rbp), R32(%rbp)
+ mov (up), %rbp
+ lea 64(rp), rp
+ mov 8(up), %rbx
+ sbb %r8, %rbp
+ mov 32(up), %r8
+ mov %rbp, (rp)
+ sbb %r9, %rbx
+ mov 16(up), %rbp
+ mov %rbx, 8(rp)
+ sbb %r10, %rbp
+ mov 24(up), %rbx
+ mov %rbp, 16(rp)
+ sbb %r11, %rbx
+ mov %rbx, 24(rp)
+ sbb %r12, %r8
+ mov 40(up), %r9
+ mov %r8, 32(rp)
+ sbb %r13, %r9
+ mov 48(up), %rbp
+ mov %r9, 40(rp)
+ sbb %r14, %rbp
+ mov 56(up), %rbx
+ mov %rbp, 48(rp)
+ sbb %r15, %rbx
+ lea 64(up), up
+ mov %rbx, 56(rp)
+ sbb R32(%rbp), R32(%rbp)
+L(x): sub $8, n
+ jge L(top)
+
+L(end): pop %rbx
+ pop %r14
+ pop %r13
+ pop %r12
+L(rtn):
+ add R32(%rbp), R32(%rax)
+ neg R32(%rax)
+
+ pop %r15
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+PROLOGUE(mpn_sublsh1_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbp
+ push %r15
+ neg %r8 C set CF
+ sbb R32(%rbp), R32(%rbp) C save acy
+ jmp L(ent)
+EPILOGUE()
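mpn_sublsh1_n computes {rp,n} = {up,n} - 2*{vp,n}; the shift carry (kept negated in %rax) and
the subtraction borrow (in %rbp) are combined at L(rtn) into a return value of 0..2. An
equivalent two-pass sketch (sublsh1_n_ref and its scratch argument are illustrative):

/* Sketch of mpn_sublsh1_n: rp = up - 2*vp, return the total borrow (0..2).
   The assembly interleaves the doubling and the subtraction instead. */
mp_limb_t
sublsh1_n_ref (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t shifted_out = mpn_lshift (tp, vp, n, 1);  /* tp = 2*vp */
  mp_limb_t borrow = mpn_sub_n (rp, up, tp, n);       /* rp = up - tp */
  return shifted_out + borrow;
}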
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/README b/gmp-6.3.0/mpn/x86_64/bd1/README
new file mode 100644
index 0000000..ccd210e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/README
@@ -0,0 +1,11 @@
+This directory contains code for AMD Bulldozer, including its Piledriver update.
+
+We currently make limited use of SIMD instructions, both via the MPN_PATH and
+via inclusion of x86_64/fastsse files.
+
+The bd1 cores share one SIMD/FPU pipeline between two integer units. This probably
+means that an all-core GMP load (such as an HPC load) might run slower if there
+is significant SIMD dependency.
+
+We should perhaps allow a special 'bd1nosimd' pseudo cpu-name excluding any
+SIMD code.
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm
new file mode 100644
index 0000000..b54e91a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm
@@ -0,0 +1,235 @@
+dnl AMD64 mpn_addmul_2 optimised for AMD Bulldozer.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1 4.2
+C AMD bd2 4.4
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bt1
+C AMD bt2
+C Intel P4
+C Intel PNR
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`n', `%rcx')
+define(`v0', `%rbx')
+define(`v1', `%rbp')
+define(`X0', `%r12')
+define(`X1', `%r13')
+
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_addmul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov (up), %rax
+ mov $0, R32(w2) C abuse w2
+
+ lea (up,n_param,8), up
+ lea (rp,n_param,8), rp
+ sub n_param, w2
+ mul v0
+
+ test $1, R8(w2)
+ jnz L(bx1)
+
+L(bx0): mov %rdx, X0
+ mov %rax, X1
+ test $2, R8(w2)
+ jnz L(b10)
+
+L(b00): lea (w2), n C un = 4, 8, 12, ...
+ mov (up,w2,8), %rax
+ mov (rp,w2,8), w3
+ mul v1
+ mov %rax, w0
+ mov 8(up,w2,8), %rax
+ mov %rdx, w1
+ jmp L(lo0)
+
+L(b10): lea 2(w2), n C un = 2, 6, 10, ...
+ mov (up,w2,8), %rax
+ mov (rp,w2,8), w1
+ mul v1
+ mov %rdx, w3
+ mov %rax, w2
+ mov -8(up,n,8), %rax
+ test n, n
+ jz L(end)
+ jmp L(top)
+
+L(bx1): mov %rax, X0
+ mov %rdx, X1
+ test $2, R8(w2)
+ jz L(b11)
+
+L(b01): lea 1(w2), n C un = 1, 5, 9, ...
+ mov (up,w2,8), %rax
+ mul v1
+ mov (rp,w2,8), w2
+ mov %rdx, w0
+ mov %rax, w3
+ jmp L(lo1)
+
+L(b11): lea -1(w2), n C un = 3, 7, 11, ...
+ mov (up,w2,8), %rax
+ mul v1
+ mov (rp,w2,8), w0
+ mov %rax, w1
+ mov 8(up,w2,8), %rax
+ mov %rdx, w2
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top):
+L(lo2): mul v0
+ add w1, X1
+ mov X1, -16(rp,n,8)
+ mov %rdx, X1
+ adc %rax, X0
+ adc $0, X1
+ mov -8(up,n,8), %rax
+ mul v1
+ mov -8(rp,n,8), w1
+ mov %rdx, w0
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+L(lo1): mov (up,n,8), %rax
+ mul v0
+ add w2, X0
+ mov X0, -8(rp,n,8)
+ mov %rdx, X0
+ adc %rax, X1
+ mov (up,n,8), %rax
+ adc $0, X0
+ mov (rp,n,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ mov 8(up,n,8), %rax
+ mov %rdx, w1
+ adc $0, w1
+L(lo0): mul v0
+ add w3, X1
+ mov X1, (rp,n,8)
+ adc %rax, X0
+ mov 8(up,n,8), %rax
+ mov %rdx, X1
+ adc $0, X1
+ mov 8(rp,n,8), w3
+ mul v1
+ add w3, w0
+ adc %rax, w1
+ mov 16(up,n,8), %rax
+ mov %rdx, w2
+ adc $0, w2
+L(lo3): mul v0
+ add w0, X0
+ mov X0, 8(rp,n,8)
+ mov %rdx, X0
+ adc %rax, X1
+ adc $0, X0
+ mov 16(up,n,8), %rax
+ mov 16(rp,n,8), w0
+ mul v1
+ mov %rdx, w3
+ add w0, w1
+ adc %rax, w2
+ adc $0, w3
+ mov 24(up,n,8), %rax
+ add $4, n
+ jnc L(top)
+
+L(end): mul v0
+ add w1, X1
+ mov X1, -16(rp)
+ mov %rdx, X1
+ adc %rax, X0
+ adc $0, X1
+ mov -8(up), %rax
+ mul v1
+ mov -8(rp), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, %rdx
+ add w2, X0
+ adc $0, X1
+ mov X0, -8(rp)
+ add w3, X1
+ mov X1, (rp)
+ adc $0, %rdx
+ mov %rdx, %rax
+
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
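mpn_addmul_2 adds {up,n} * {vp,2} to {rp,n}, writes the low n+1 limbs at rp and returns the
top limb; interleaving the v0 and v1 recurrences in one loop is what gives the 4.2 c/l figure
above. The result is the same as two chained mpn_addmul_1 passes (addmul_2_ref is an
illustrative name):

/* Sketch: mpn_addmul_2 is result-equivalent to two addmul_1 passes.
   rp must have room for n+1 limbs; the limb above that is returned. */
mp_limb_t
addmul_2_ref (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
{
  rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);      /* {rp,n} += up * v0, carry -> rp[n] */
  return mpn_addmul_1 (rp + 1, up, n, vp[1]);   /* {rp+1,n} += up * v1, return carry */
}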
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm
new file mode 100644
index 0000000..c34a5fa
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_addlsh1_n and mpn_rsblsh1_n
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+include_mpn(`x86_64/atom/aorrlsh1_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm
new file mode 100644
index 0000000..5516c9d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm
@@ -0,0 +1,38 @@
+dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+include_mpn(`x86_64/aorrlsh_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm
new file mode 100644
index 0000000..143c42e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_add_n, mpn_sub_n, optimised for AMD bd1.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+include_mpn(`x86_64/coreihwl/aors_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm
new file mode 100644
index 0000000..fc0d2fe
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm
@@ -0,0 +1,190 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 3.30 3.58
+C AMD K10 3.09
+C AMD bull 4.47 4.72
+C AMD pile 4.66
+C AMD steam
+C AMD excavator
+C AMD bobcat 6.30
+C AMD jaguar 6.29
+C Intel P4 17.3 17.8
+C Intel core2 5.13
+C Intel NHM 4.85
+C Intel SBR 3.83
+C Intel IBR 3.75
+C Intel HWL 3.45
+C Intel BWL 2.56
+C Intel SKL 2.53
+C Intel atom 20.3
+C Intel SLM 9
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Try to make loop run closer to 4 c/l in Bulldozer and Piledriver.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%r11')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`v0', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ mov (up), %rax C read first u limb early
+ push %rbx
+IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %rbx ')
+ mul v0
+
+IFSTD(` mov %rbx, n ')
+
+ and $3, R32(%rbx)
+ lea -16(rp,n,8), rp
+ jz L(b0)
+ cmp $2, R32(%rbx)
+ jb L(b1)
+ jz L(b2)
+
+L(b3): mov $0, R32(%r8)
+ mov %rax, %rbx
+ mov $0, R32(%r9)
+ mov 8(up), %rax
+ mov %rdx, %r10
+ lea (up,n,8), up
+ not n
+ jmp L(L3)
+
+L(b0): mov $0, R32(%r10)
+ mov %rax, %r8
+ mov %rdx, %rbx
+ mov 8(up), %rax
+ lea (up,n,8), up
+ neg n
+ jmp L(L0)
+
+L(b1): cmp $1, n
+ jz L(n1)
+ mov %rax, %r9
+ mov 8(up), %rax
+ mov %rdx, %r8
+ mov $0, R32(%rbx)
+ lea (up,n,8), up
+ neg n
+ inc n
+ jmp L(L1)
+
+L(b2): mov $0, R32(%rbx)
+ mov %rax, %r10
+ mov %rdx, %r9
+ mov 8(up), %rax
+ mov $0, R32(%r8)
+ lea (up,n,8), up
+ neg n
+ add $2, n
+ jns L(end)
+
+ ALIGN(32)
+L(top): mul v0
+ ADDSUB %r10, (rp,n,8)
+ adc %rax, %r9
+ mov (up,n,8), %rax
+ adc %rdx, %r8
+L(L1): mul v0
+ mov $0, R32(%r10)
+ ADDSUB %r9, 8(rp,n,8)
+ adc %rax, %r8
+ adc %rdx, %rbx
+ mov 8(up,n,8), %rax
+L(L0): mul v0
+ ADDSUB %r8, 16(rp,n,8)
+ mov $0, R32(%r8)
+ adc %rax, %rbx
+ mov $0, R32(%r9)
+ mov 16(up,n,8), %rax
+ adc %rdx, %r10
+L(L3): mul v0
+ ADDSUB %rbx, 24(rp,n,8)
+ mov $0, R32(%rbx)
+ adc %rax, %r10
+ adc %rdx, %r9
+ mov 24(up,n,8), %rax
+ add $4, n
+ js L(top)
+
+L(end): mul v0
+ ADDSUB %r10, (rp)
+ adc %r9, %rax
+ adc %r8, %rdx
+L(n1): ADDSUB %rax, 8(rp)
+ adc $0, %rdx
+ mov %rdx, %rax
+
+ pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
+ ret
+EPILOGUE()
+ASM_END()
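
Both entry points above share one loop: ADDSUB expands to `add' for mpn_addmul_1 and to `sub' for mpn_submul_1.  Semantically, mpn_addmul_1 adds {up,n} * v0 into {rp,n} and returns the final carry limb, while mpn_submul_1 subtracts the product and returns the borrow.  A minimal C sketch of the addmul case (an illustrative reference, not the GMP implementation; ref_addmul_1 is a hypothetical name; assumes 64-bit limbs and the GCC/Clang unsigned __int128 extension):

typedef unsigned long long limb_t;

limb_t ref_addmul_1 (limb_t *rp, const limb_t *up, long n, limb_t v0)
{
  limb_t carry = 0;
  for (long i = 0; i < n; i++)
    {
      /* 64x64->128 product plus the old rp limb plus the running carry;
         the sum cannot overflow 128 bits.  */
      unsigned __int128 p = (unsigned __int128) up[i] * v0 + rp[i] + carry;
      rp[i] = (limb_t) p;
      carry = (limb_t) (p >> 64);
    }
  return carry;
}
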
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/com.asm b/gmp-6.3.0/mpn/x86_64/bd1/com.asm
new file mode 100644
index 0000000..43f3561
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/com.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_com optimised for AMD bd1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm b/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm
new file mode 100644
index 0000000..675cdc3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyd optimised for AMD bd1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm b/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm
new file mode 100644
index 0000000..ceef036
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyi optimised for AMD bd1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm
new file mode 100644
index 0000000..4723093
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_11.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/core2/gcd_11.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h
new file mode 100644
index 0000000..210f382
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h
@@ -0,0 +1,265 @@
+/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3600-3800 MHz Bulldozer Zambezi */
+/* FFT tuning limit = 464,627,200 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 31
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 2
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 27
+
+#define DIV_1_VS_MUL_1_PERCENT 275
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 57
+#define MUL_TOOM44_THRESHOLD 161
+#define MUL_TOOM6H_THRESHOLD 226
+#define MUL_TOOM8H_THRESHOLD 339
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 61
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 108
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 91
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 85
+#define SQR_TOOM4_THRESHOLD 234
+#define SQR_TOOM6_THRESHOLD 286
+#define SQR_TOOM8_THRESHOLD 466
+
+#define MULMID_TOOM42_THRESHOLD 20
+
+#define MULMOD_BNM1_THRESHOLD 12
+#define SQRMOD_BNM1_THRESHOLD 15
+
+#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \
+ { 28, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \
+ { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 99,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 103,12}, { 31,11}, { 63, 7}, \
+ { 1023, 8}, { 543, 9}, { 303,10}, { 167,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255,11}, { 143,10}, { 287,11}, { 159,12}, \
+ { 95,11}, { 191,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 351,12}, { 191,11}, { 383,10}, { 767,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
+ { 639,10}, { 1279,11}, { 671,12}, { 351,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,12}, { 447,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \
+ { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
+ { 1343,10}, { 2687,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 767,11}, { 1535,12}, { 799,11}, \
+ { 1599,12}, { 831,13}, { 447,12}, { 895,14}, \
+ { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \
+ { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \
+ { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
+ { 1599,13}, { 831,12}, { 1727,11}, { 3455,13}, \
+ { 895,15}, { 255,14}, { 511,13}, { 1023,12}, \
+ { 2047,13}, { 1087,12}, { 2175,13}, { 1215,12}, \
+ { 2431,11}, { 4863,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \
+ { 3455,14}, { 895,13}, { 1919,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2815,12}, { 5631,13}, { 2943,12}, { 5887,15}, \
+ { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \
+ { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \
+ { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \
+ { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \
+ { 4479,15}, { 2303,14}, { 4863,15}, { 2559,14}, \
+ { 5247,15}, { 2815,14}, { 5887,13}, { 11775,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \
+ { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \
+ { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \
+ { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 251
+#define MUL_FFT_THRESHOLD 4544
+
+#define SQR_FFT_MODF_THRESHOLD 364 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 364, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 135,11}, { 79,10}, { 159,11}, { 95, 7}, \
+ { 1535, 8}, { 799, 7}, { 1599, 8}, { 831, 9}, \
+ { 447,10}, { 239,11}, { 127,10}, { 255,11}, \
+ { 143,10}, { 303,11}, { 159,12}, { 95,11}, \
+ { 191,10}, { 383,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 303,12}, { 159,11}, \
+ { 351,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,12}, { 223,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
+ { 639,10}, { 1279,11}, { 671,12}, { 351,13}, \
+ { 191,12}, { 383,11}, { 767,10}, { 1535,12}, \
+ { 415,11}, { 831,12}, { 447,14}, { 127,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
+ { 1087,10}, { 2175,12}, { 575,11}, { 1151,12}, \
+ { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \
+ { 671,11}, { 1343,12}, { 703,11}, { 1407,12}, \
+ { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 799,11}, { 1599,12}, { 831,13}, { 447,12}, \
+ { 895,14}, { 255,13}, { 511,12}, { 1023,11}, \
+ { 2047,12}, { 1087,11}, { 2175,13}, { 575,12}, \
+ { 1151,11}, { 2303,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \
+ { 383,13}, { 767,12}, { 1599,11}, { 3199,13}, \
+ { 831,12}, { 1727,11}, { 3455,13}, { 895,15}, \
+ { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \
+ { 1087,12}, { 2175,13}, { 1151,12}, { 2303,13}, \
+ { 1215,12}, { 2431,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \
+ { 3455,11}, { 6911,14}, { 895,13}, { 1791,12}, \
+ { 3583,13}, { 1919,12}, { 3839,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2943,12}, { 5887,11}, { 11775,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \
+ { 6911,14}, { 1791,13}, { 3583,14}, { 1919,13}, \
+ { 3839,16}, { 511,15}, { 1023,14}, { 2175,13}, \
+ { 4351,12}, { 8703,13}, { 4479,12}, { 8959,14}, \
+ { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2815,13}, { 5631,14}, { 2943,13}, \
+ { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \
+ { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \
+ { 1023,15}, { 2047,14}, { 4351,13}, { 8703,14}, \
+ { 4479,13}, { 8959,15}, { 2303,14}, { 4991,13}, \
+ { 9983,15}, { 2559,14}, { 5119,15}, { 2815,14}, \
+ { 5887,13}, { 11775,16}, { 1535,15}, { 3071,14}, \
+ { 6143,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \
+ { 4095,14}, { 8191,15}, { 4351,14}, { 8959,15}, \
+ { 4863,14}, { 9983,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \
+ { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \
+ { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \
+ { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 275
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 23
+#define MULLO_MUL_N_THRESHOLD 8907
+#define SQRLO_BASECASE_THRESHOLD 9
+#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD 6440
+
+#define DC_DIV_QR_THRESHOLD 52
+#define DC_DIVAPPR_Q_THRESHOLD 167
+#define DC_BDIV_QR_THRESHOLD 48
+#define DC_BDIV_Q_THRESHOLD 93
+
+#define INV_MULMOD_BNM1_THRESHOLD 38
+#define INV_NEWTON_THRESHOLD 197
+#define INV_APPR_THRESHOLD 179
+
+#define BINV_NEWTON_THRESHOLD 230
+#define REDC_1_TO_REDC_2_THRESHOLD 32
+#define REDC_2_TO_REDC_N_THRESHOLD 55
+
+#define MU_DIV_QR_THRESHOLD 1387
+#define MU_DIVAPPR_Q_THRESHOLD 1387
+#define MUPI_DIV_QR_THRESHOLD 92
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1334
+
+#define POWM_SEC_TABLE 1,22,194,434,452
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 438
+#define SET_STR_PRECOMPUTE_THRESHOLD 1254
+
+#define FAC_DSC_THRESHOLD 189
+#define FAC_ODD_THRESHOLD 26
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD2_DIV1_METHOD 3 /* 2.31% faster than 4 */
+#define HGCD_THRESHOLD 104
+#define HGCD_APPR_THRESHOLD 52
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 465
+#define GCDEXT_DC_THRESHOLD 283
+#define JACOBI_BASE_METHOD 4 /* 5.81% faster than 1 */
+
+/* Tuneup completed successfully, took 554602 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm b/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm
new file mode 100644
index 0000000..799cdda
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm
@@ -0,0 +1,206 @@
+dnl AMD64 SSSE3/XOP mpn_hamdist -- hamming distance.
+
+dnl Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb good for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 1.51-2.0 y
+C AMD bd2 1.50-1.9 y
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen n/a
+C AMD bobcat n/a
+C AMD jaguar n/a
+C Intel P4 n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL n/a
+C Intel BWL n/a
+C Intel SKL n/a
+C Intel atom n/a
+C Intel SLM n/a
+C VIA nano n/a
+
+C TODO
+C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
+C intend to support old systems.
+
+C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some
+C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
+C We fall back to the core2 code.
+ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86_64/core2/hamdist.asm')
+',`
+
+define(`up', `%rdi')
+define(`vp', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_hamdist)
+ FUNC_ENTRY(3)
+ cmp $5, n
+ jl L(sma)
+
+ lea L(cnsts)(%rip), %r9
+
+ xor R32(%r10), R32(%r10)
+ test $8, R8(vp)
+ jz L(ali)
+ mov (up), %r8
+ xor (vp), %r8
+ add $8, up
+ add $8, vp
+ dec n
+ popcnt %r8, %r10
+L(ali):
+
+ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)',
+ `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)')
+ movdqa OFF1`'(%r9), %xmm7 C nibble counts table
+ movdqa OFF2`'(%r9), %xmm6 C splat shift counts
+ movdqa OFF3`'(%r9), %xmm5 C masks
+ pxor %xmm4, %xmm4
+ pxor %xmm8, %xmm8 C grand total count
+
+ mov R32(n), R32(%rax)
+ and $6, R32(%rax)
+ lea -64(up,%rax,8), up
+ lea -64(vp,%rax,8), vp
+ifdef(`PIC',`
+ movslq (%r9,%rax,2), %r11
+ add %r9, %r11
+ jmp *%r11
+',`
+ jmp *(%r9,%rax,4)
+')
+
+L(0): add $64, up
+ add $64, vp
+ sub $2, n
+
+ ALIGN(32)
+L(top): lddqu (up), %xmm0
+ pxor (vp), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm5, %xmm0
+ pand %xmm5, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(6): lddqu 16(up), %xmm0
+ pxor 16(vp), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm5, %xmm0
+ pand %xmm5, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(4): lddqu 32(up), %xmm0
+ pxor 32(vp), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm5, %xmm0
+ pand %xmm5, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe9,0x78,0xd3,0xc4 C vphaddubq %xmm4, %xmm0
+ .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4
+ paddb %xmm2, %xmm3
+ paddb %xmm2, %xmm4
+ paddq %xmm0, %xmm8 C sum to 2 x 64-bit counts
+L(2): mov 48(up), %r8
+ mov 56(up), %r9
+ add $64, up
+ xor 48(vp), %r8
+ xor 56(vp), %r9
+ add $64, vp
+ popcnt %r8, %r8
+ popcnt %r9, %r9
+ add %r8, %r10
+ add %r9, %r10
+ sub $8, n
+ jg L(top)
+
+ test $1, R8(n)
+ jz L(x)
+ mov (up), %r8
+ xor (vp), %r8
+ popcnt %r8, %r8
+ add %r8, %r10
+L(x): .byte 0x8f,0xe9,0x78,0xd3,0xc4 C vphaddubq %xmm4, %xmm0
+ paddq %xmm0, %xmm8
+ pshufd $14, %xmm8, %xmm0
+ paddq %xmm8, %xmm0
+ movd %xmm0, %rax
+ add %r10, %rax
+ FUNC_EXIT()
+ ret
+
+L(sma): mov (up), %r8
+ xor (vp), %r8
+ popcnt %r8, %rax
+ dec n
+ jz L(ed)
+L(tp): mov 8(up), %r8
+ add $8, up
+ xor 8(vp), %r8
+ add $8, vp
+ popcnt %r8, %r8
+ add %r8, %rax
+ dec n
+ jnz L(tp)
+L(ed): FUNC_EXIT()
+ ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+ JMPENT( L(0), L(cnsts))
+ JMPENT( L(2), L(cnsts))
+ JMPENT( L(4), L(cnsts))
+ JMPENT( L(6), L(cnsts))
+ .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+ .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+ .byte -4,-4,-4,-4,-4,-4,-4,-4
+ .byte -4,-4,-4,-4,-4,-4,-4,-4
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
+')
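
Semantically, mpn_hamdist returns the number of bit positions in which {up,n} and {vp,n} differ, i.e. the population count of the limb-wise XOR; the XOP code above accumulates that sum sixteen bytes at a time and mops up stray limbs with popcnt.  A plain C sketch (illustrative only, not the GMP implementation; ref_hamdist is a hypothetical name; __builtin_popcountll is a GCC/Clang builtin standing in for the popcnt instruction):

unsigned long ref_hamdist (const unsigned long long *up,
                           const unsigned long long *vp, long n)
{
  unsigned long count = 0;
  for (long i = 0; i < n; i++)
    count += __builtin_popcountll (up[i] ^ vp[i]);  /* per-limb Hamming distance */
  return count;
}
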
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm b/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm
new file mode 100644
index 0000000..2fb097f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm
@@ -0,0 +1,193 @@
+dnl AMD64 mpn_mul_1 optimised for AMD Bulldozer.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 3.65
+C AMD K10 3.30 3.68
+C AMD bull 4.04 4.29
+C AMD pile 4.33
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.73
+C AMD jaguar 5.87
+C Intel P4 12.5
+C Intel core2 4.38
+C Intel NHM 4.28
+C Intel SBR 2.69
+C Intel IBR 2.55
+C Intel HWL 2.41
+C Intel BWL 2.49
+C Intel SKL 2.50
+C Intel atom 20.3
+C Intel SLM 7.8
+C VIA nano 4.25
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Move loop code into feed-in blocks, to save insn for zeroing regs.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%rbx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`v0', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``rbx'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ mov (up), %rax C read first u limb early
+ push %rbx
+IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %r11 ')
+ mul v0
+
+IFSTD(` add %r8, %rax ')
+IFDOS(` add 64(%rsp), %rax ') C 40 + 3*8 (3 push insns)
+ adc $0, %rdx
+ jmp L(common)
+
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ mov (up), %rax C read first u limb early
+ push %rbx
+IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %r11 ')
+ mul v0
+
+L(common):
+IFSTD(` mov %r11, n ')
+
+ and $3, R32(%r11)
+ lea -16(rp,n,8), rp
+ jz L(b0)
+ cmp $2, R32(%r11)
+ jb L(b1)
+ jz L(b2)
+
+L(b3): mov %rax, %r10
+ mov %rdx, %r11
+ mov 8(up), %rax
+ mul v0
+ lea (up,n,8), up
+ not n
+ jmp L(L3)
+
+L(b0): mov %rax, %r9
+ mov %rdx, %r10
+ mov 8(up), %rax
+ lea (up,n,8), up
+ neg n
+ jmp L(L0)
+
+L(b1): mov %rax, %r8
+ cmp $1, n
+ jz L(n1)
+ mov %rdx, %r9
+ lea (up,n,8), up
+ neg n
+ mov %r8, 16(rp,n,8)
+ inc n
+ jmp L(L1)
+
+L(b2): mov %rax, %r11
+ mov %rdx, %r8
+ mov 8(up), %rax
+ lea (up,n,8), up
+ neg n
+ add $2, n
+ jns L(end)
+
+ ALIGN(16)
+L(top): mul v0
+ mov %rdx, %r9
+ add %rax, %r8
+ adc $0, %r9
+ mov %r8, 8(rp,n,8)
+ mov %r11, (rp,n,8)
+L(L1): mov (up,n,8), %rax
+ mul v0
+ add %rax, %r9
+ mov %rdx, %r10
+ mov 8(up,n,8), %rax
+ adc $0, %r10
+L(L0): mul v0
+ add %rax, %r10
+ mov %rdx, %r11
+ mov 16(up,n,8), %rax
+ adc $0, %r11
+ mul v0
+ mov %r9, 16(rp,n,8)
+L(L3): add %rax, %r11
+ mov %r10, 24(rp,n,8)
+ mov %rdx, %r8
+ adc $0, %r8
+ add $4, n
+ mov -8(up,n,8), %rax
+ js L(top)
+
+L(end): mul v0
+ add %rax, %r8
+ adc $0, %rdx
+ mov %r11, (rp)
+L(n1): mov %r8, 8(rp)
+ mov %rdx, %rax
+
+ pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm b/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm
new file mode 100644
index 0000000..85fa7aa
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm
@@ -0,0 +1,195 @@
+dnl AMD64 mpn_mul_2 optimised for AMD Bulldozer.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 6.78
+C AMD K10 6.78
+C AMD bd1 8.39 8.65
+C AMD bd2 8.47
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bt1 12.1
+C AMD bt2 11.5
+C Intel P4 24.0
+C Intel PNR 8.14
+C Intel NHM 7.78
+C Intel SBR 6.34
+C Intel IBR 6.15
+C Intel HWL 6.04
+C Intel BWL 4.33
+C Intel SKL 4.41
+C Intel atom 39.5
+C Intel SLM 27.8
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (up), %rax
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ lea (up,n_param,8), up
+ lea (rp,n_param,8), rp
+
+ mov n_param, n
+ mul v0
+ neg n
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): mov %rax, w0
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ mov (up,n,8), %rax
+ jmp L(lo0)
+
+L(b10): mov %rax, w2
+ mov %rdx, w3
+ mov (up,n,8), %rax
+ xor R32(w0), R32(w0)
+ mul v1
+ add $-2, n
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n)
+ jz L(b11)
+
+L(b01): mov %rax, w3
+ mov %rdx, w0
+ mov (up,n,8), %rax
+ mul v1
+ xor R32(w1), R32(w1)
+ inc n
+ jmp L(lo1)
+
+L(b11): mov %rax, w1
+ mov %rdx, w2
+ mov (up,n,8), %rax
+ xor R32(w3), R32(w3)
+ dec n
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top): mov -8(up,n,8), %rax
+ mul v1
+ mov w2, -16(rp,n,8)
+L(lo1): add %rax, w0
+ mov w3, -8(rp,n,8)
+ adc %rdx, w1
+ mov (up,n,8), %rax
+ mul v0
+ mov $0, R32(w2)
+ add %rax, w0
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mov (up,n,8), %rax
+L(lo0): mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,n,8), %rax
+ mul v0
+ add %rax, w1
+ mov w0, (rp,n,8)
+ mov $0, R32(w3)
+ mov 8(up,n,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+L(lo3): mul v1
+ add %rax, w2
+ mov 16(up,n,8), %rax
+ adc %rdx, w3
+ mul v0
+ add %rax, w2
+ mov 16(up,n,8), %rax
+ mov $0, R32(w0)
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov w1, 8(rp,n,8)
+L(lo2): add %rax, w3
+ adc %rdx, w0
+ mov 24(up,n,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ mov $0, R32(w1)
+ adc $0, R32(w1)
+ add $4, n
+ jnc L(top)
+
+L(end): mov -8(up), %rax
+ mul v1
+ mov w2, -16(rp)
+ add %rax, w0
+ mov w3, -8(rp)
+ adc %rdx, w1
+ mov w0, (rp)
+ mov w1, %rax
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
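
Judging from the wind-down code above, mpn_mul_2 stores the low n+1 limbs of {up,n} * {vp,2} at rp and returns the most significant limb.  A two-pass C sketch of that convention (an illustrative reference, not the GMP implementation; ref_mul_2 is a hypothetical name; assumes 64-bit limbs and unsigned __int128):

typedef unsigned long long limb_t;

limb_t ref_mul_2 (limb_t *rp, const limb_t *up, long n, const limb_t *vp)
{
  unsigned __int128 p;
  limb_t carry = 0;
  for (long i = 0; i < n; i++)          /* rp[0..n] = {up,n} * vp[0] */
    {
      p = (unsigned __int128) up[i] * vp[0] + carry;
      rp[i] = (limb_t) p;
      carry = (limb_t) (p >> 64);
    }
  rp[n] = carry;
  carry = 0;
  for (long i = 0; i < n; i++)          /* rp[1..n] += {up,n} * vp[1] */
    {
      p = (unsigned __int128) up[i] * vp[1] + rp[i + 1] + carry;
      rp[i + 1] = (limb_t) p;
      carry = (limb_t) (p >> 64);
    }
  return carry;                         /* most significant limb of the product */
}

The assembly produces the same result in a single pass by interleaving the v0 and v1 products inside its 4-way unrolled loop.
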
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm
new file mode 100644
index 0000000..e47ba58
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm
@@ -0,0 +1,416 @@
+dnl AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 mul_2 mul_3 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull ~4.8 ~4.55 - ~4.3
+C AMD pile ~4.6 ~4.55 - ~4.55
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Merge bull-specific mul_1, if it is not slower in the TOOM22 range.
+C Alternatively, we could tweak the present code (which was loopmixed for a
+C different CPU).
+C * Merge faster mul_2, such as the one in the same directory as this file.
+C * Further micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp', `%rcx')
+define(`vn', `%r8')
+
+define(`un', `%rbx')
+
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`n', `%rbp')
+define(`v0', `%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %rbx
+ push %rbp
+ mov un_param, un C free up rdx
+ neg un
+
+ mov (up), %rax C shared for mul_1 and mul_2
+ lea (up,un_param,8), up C point at operand end
+ lea (rp,un_param,8), rp C point at rp[un-1]
+
+ mov (vp), v0 C shared for mul_1 and mul_2
+ mul v0 C shared for mul_1 and mul_2
+
+ test $1, R8(vn)
+ jz L(do_mul_2)
+
+L(do_mul_1):
+ test $1, R8(un)
+ jnz L(m1x1)
+
+L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ...
+ mov %rdx, w1
+ mov 8(up,un,8), %rax
+ test $2, R8(un)
+ jnz L(m110)
+
+L(m100):lea 2(un), n C un = 4, 8, 12, ...
+ jmp L(m1l0)
+
+L(m110):lea (un), n C un = 2, 6, 10, ...
+ jmp L(m1l2)
+
+L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ...
+ mov %rdx, w0
+ test $2, R8(un)
+ jz L(m111)
+
+L(m101):lea 3(un), n C un = 1, 5, 9, ...
+ test n, n
+ js L(m1l1)
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(m111):lea 1(un), n C un = 3, 7, 11, ...
+ mov 8(up,un,8), %rax
+ jmp L(m1l3)
+
+ ALIGN(16)
+L(m1tp):mov %rdx, w0
+ add %rax, w1
+L(m1l1):mov -16(up,n,8), %rax
+ adc $0, w0
+ mul v0
+ add %rax, w0
+ mov w1, -24(rp,n,8)
+ mov -8(up,n,8), %rax
+ mov %rdx, w1
+ adc $0, w1
+L(m1l0):mul v0
+ mov w0, -16(rp,n,8)
+ add %rax, w1
+ mov %rdx, w0
+ mov (up,n,8), %rax
+ adc $0, w0
+L(m1l3):mul v0
+ mov w1, -8(rp,n,8)
+ mov %rdx, w1
+ add %rax, w0
+ mov 8(up,n,8), %rax
+ adc $0, w1
+L(m1l2):mul v0
+ mov w0, (rp,n,8)
+ add $4, n
+ jnc L(m1tp)
+
+L(m1ed):add %rax, w1
+ adc $0, %rdx
+ mov w1, I(-8(rp),-24(rp,n,8))
+ mov %rdx, I((rp),-16(rp,n,8))
+
+ dec R32(vn)
+ jz L(ret2)
+
+ lea 8(vp), vp
+ lea 8(rp), rp
+ push %r12
+ push %r13
+ push %r14
+ jmp L(do_addmul)
+
+L(do_mul_2):
+define(`v1', `%r14')
+ push %r12
+ push %r13
+ push %r14
+
+ mov 8(vp), v1
+
+ test $1, R8(un)
+ jnz L(m2b1)
+
+L(m2b0):lea (un), n
+ mov %rax, w2 C 0
+ mov (up,un,8), %rax
+ mov %rdx, w1 C 1
+ mul v1
+ mov %rax, w0 C 1
+ mov w2, (rp,un,8) C 0
+ mov 8(up,un,8), %rax
+ mov %rdx, w2 C 2
+ jmp L(m2l0)
+
+L(m2b1):lea 1(un), n
+ mov %rax, w0 C 1
+ mov %rdx, w3 C 2
+ mov (up,un,8), %rax
+ mul v1
+ mov w0, (rp,un,8) C 1
+ mov %rdx, w0 C 3
+ mov %rax, w2 C 0
+ mov 8(up,un,8), %rax
+ jmp L(m2l1)
+
+ ALIGN(32)
+L(m2tp):add %rax, w2 C 0
+ mov (up,n,8), %rax
+ adc $0, w0 C 1
+L(m2l1):mul v0
+ add %rax, w2 C 0
+ mov (up,n,8), %rax
+ mov %rdx, w1 C 1
+ adc $0, w1 C 1
+ mul v1
+ add w3, w2 C 0
+ adc $0, w1 C 1
+ add %rax, w0 C 1
+ mov w2, (rp,n,8) C 0
+ mov 8(up,n,8), %rax
+ mov %rdx, w2 C 2
+ adc $0, w2 C 2
+L(m2l0):mul v0
+ add %rax, w0 C 1
+ mov %rdx, w3 C 2
+ adc $0, w3 C 2
+ add w1, w0 C 1
+ adc $0, w3 C 2
+ mov 8(up,n,8), %rax
+ mul v1
+ add $2, n
+ mov w0, -8(rp,n,8) C 1
+ mov %rdx, w0 C 3
+ jnc L(m2tp)
+
+L(m2ed):add %rax, w2
+ adc $0, %rdx
+ add w3, w2
+ adc $0, %rdx
+ mov w2, I((rp),(rp,n,8))
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+ add $-2, R32(vn)
+ jz L(ret5)
+
+ lea 16(vp), vp
+ lea 16(rp), rp
+
+
+L(do_addmul):
+ push %r15
+ push vn C save vn in new stack slot
+define(`vn', `(%rsp)')
+define(`X0', `%r14')
+define(`X1', `%r15')
+define(`v1', `%r8')
+
+L(outer):
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov (up,un,8), %rax
+ mul v0
+
+ test $1, R8(un)
+ jnz L(bx1)
+
+L(bx0): mov %rax, X1
+ mov (up,un,8), %rax
+ mov %rdx, X0
+ mul v1
+ test $2, R8(un)
+ jnz L(b10)
+
+L(b00): lea (un), n C un = 4, 8, 12, ...
+ mov (rp,un,8), w3
+ mov %rax, w0
+ mov 8(up,un,8), %rax
+ mov %rdx, w1
+ jmp L(lo0)
+
+L(b10): lea 2(un), n C un = 2, 6, 10, ...
+ mov (rp,un,8), w1
+ mov %rdx, w3
+ mov %rax, w2
+ mov 8(up,un,8), %rax
+ jmp L(lo2)
+
+L(bx1): mov %rax, X0
+ mov (up,un,8), %rax
+ mov %rdx, X1
+ mul v1
+ test $2, R8(un)
+ jz L(b11)
+
+L(b01): lea 1(un), n C un = 1, 5, 9, ...
+ mov (rp,un,8), w2
+ mov %rdx, w0
+ mov %rax, w3
+ jmp L(lo1)
+
+L(b11): lea -1(un), n C un = 3, 7, 11, ...
+ mov (rp,un,8), w0
+ mov %rax, w1
+ mov 8(up,un,8), %rax
+ mov %rdx, w2
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top):
+L(lo2): mul v0
+ add w1, X1
+ mov X1, -16(rp,n,8)
+ mov %rdx, X1
+ adc %rax, X0
+ adc $0, X1
+ mov -8(up,n,8), %rax
+ mul v1
+ mov -8(rp,n,8), w1
+ mov %rdx, w0
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+L(lo1): mov (up,n,8), %rax
+ mul v0
+ add w2, X0
+ mov X0, -8(rp,n,8)
+ mov %rdx, X0
+ adc %rax, X1
+ mov (up,n,8), %rax
+ adc $0, X0
+ mov (rp,n,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ mov 8(up,n,8), %rax
+ mov %rdx, w1
+ adc $0, w1
+L(lo0): mul v0
+ add w3, X1
+ mov X1, (rp,n,8)
+ adc %rax, X0
+ mov 8(up,n,8), %rax
+ mov %rdx, X1
+ adc $0, X1
+ mov 8(rp,n,8), w3
+ mul v1
+ add w3, w0
+ adc %rax, w1
+ mov 16(up,n,8), %rax
+ mov %rdx, w2
+ adc $0, w2
+L(lo3): mul v0
+ add w0, X0
+ mov X0, 8(rp,n,8)
+ mov %rdx, X0
+ adc %rax, X1
+ adc $0, X0
+ mov 16(up,n,8), %rax
+ mov 16(rp,n,8), w0
+ mul v1
+ mov %rdx, w3
+ add w0, w1
+ adc %rax, w2
+ adc $0, w3
+ mov 24(up,n,8), %rax
+ add $4, n
+ jnc L(top)
+
+L(end): mul v0
+ add w1, X1
+ mov X1, I(-16(rp),-16(rp,n,8))
+ mov %rdx, X1
+ adc %rax, X0
+ adc $0, X1
+ mov I(-8(up),-8(up,n,8)), %rax
+ mul v1
+ mov I(-8(rp),-8(rp,n,8)), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, %rdx
+ add w2, X0
+ adc $0, X1
+ mov X0, I(-8(rp),-8(rp,n,8))
+ add w3, X1
+ mov X1, I((rp),(rp,n,8))
+ adc $0, %rdx
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+
+ addl $-2, vn
+ lea 16(vp), vp
+ lea 16(rp), rp
+ jnz L(outer)
+
+ pop %rax C deallocate vn slot
+ pop %r15
+L(ret5):pop %r14
+ pop %r13
+ pop %r12
+L(ret2):pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
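
The feed-in paths and the addmul_2 outer loop above are an optimised form of schoolbook multiplication: one pass over {up,un} per vp limb, with the first one or two passes handled by mul_1/mul_2 and the remaining limbs taken two at a time.  A plain reference (an illustrative sketch, not the GMP code; addmul_limb and ref_mul_basecase are hypothetical names; assumes 64-bit limbs and unsigned __int128):

typedef unsigned long long limb_t;

static limb_t addmul_limb (limb_t *rp, const limb_t *up, long n, limb_t v)
{
  limb_t carry = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v + rp[i] + carry;
      rp[i] = (limb_t) p;
      carry = (limb_t) (p >> 64);
    }
  return carry;
}

void ref_mul_basecase (limb_t *rp, const limb_t *up, long un,
                       const limb_t *vp, long vn)       /* un >= vn >= 1 */
{
  for (long i = 0; i < un + vn; i++)    /* clear the result area first */
    rp[i] = 0;
  for (long j = 0; j < vn; j++)         /* one addmul pass per v limb */
    rp[un + j] = addmul_limb (rp + j, up, un, vp[j]);
}
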
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm b/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm
new file mode 100644
index 0000000..7b084f4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm
@@ -0,0 +1,191 @@
+dnl AMD64 SSSE3/XOP mpn_popcount -- population count.
+
+dnl Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb good for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 1.27 y
+C AMD bd2 1.24 y
+C AMD bd3 ?
+C AMD bd4 1.22
+C AMD zen n/a
+C AMD bobcat n/a
+C AMD jaguar n/a
+C Intel P4 n/a
+C Intel CNR n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL n/a
+C Intel BWL n/a
+C Intel SKL n/a
+C Intel atom n/a
+C Intel SLM n/a
+C VIA nano n/a
+
+C TODO
+C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
+C intend to support old systems.
+
+C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some
+C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
+C We fall back to the core2 code.
+ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86_64/core2/popcount.asm')
+',`
+
+define(`up', `%rdi')
+define(`n', `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_popcount)
+ FUNC_ENTRY(3)
+ lea L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
+ `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')
+ movdqa OFF1`'(%r9), %xmm7 C nibble counts table
+ movdqa OFF2`'(%r9), %xmm6 C splat shift counts
+ movdqa OFF3`'(%r9), %xmm9 C masks
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5 C 0-reg
+ pxor %xmm8, %xmm8 C grand total count
+
+ xor R32(%rdx), R32(%rdx)
+
+ mov R32(n), R32(%rax)
+ and $7, R32(%rax)
+ifdef(`PIC',`
+ movslq (%r9,%rax,4), %rax
+ add %r9, %rax
+ jmp *%rax
+',`
+ jmp *(%r9,%rax,8)
+')
+
+L(1): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up),%rdx
+ add $8, up
+ dec n
+ jnz L(top)
+ mov %rdx, %rax
+ FUNC_EXIT()
+ ret
+
+L(2): add $-48, up
+ jmp L(e2)
+
+L(3): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx
+ add $-40, up
+ jmp L(e2)
+
+L(4): add $-32, up
+ jmp L(e4)
+
+L(5): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx
+ add $-24, up
+ jmp L(e4)
+
+L(6): add $-16, up
+ jmp L(e6)
+
+L(7): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx
+ add $-8, up
+ jmp L(e6)
+
+ ALIGN(32)
+L(top): lddqu (up), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm9, %xmm0
+ pand %xmm9, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1, %xmm7, %xmm7, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e6): lddqu 16(up), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm9, %xmm0
+ pand %xmm9, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e4): lddqu 32(up), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm9, %xmm0
+ pand %xmm9, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0, %xmm7, %xmm7, %xmm2
+ .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5
+ .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4
+ paddb %xmm2, %xmm4
+L(e2): popcnt 48(up), %r8
+ popcnt 56(up), %r9
+ add $64, up
+ paddq %xmm5, %xmm8 C sum to 2 x 64-bit counts
+ add %r8, %rdx
+ add %r9, %rdx
+ sub $8, n
+ jg L(top)
+
+ .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5
+ paddq %xmm5, %xmm8
+ pshufd $14, %xmm8, %xmm0
+ paddq %xmm8, %xmm0
+ movd %xmm0, %rax
+ add %rdx, %rax
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+ JMPENT( L(top), L(cnsts))
+ JMPENT( L(1), L(cnsts))
+ JMPENT( L(2), L(cnsts))
+ JMPENT( L(3), L(cnsts))
+ JMPENT( L(4), L(cnsts))
+ JMPENT( L(5), L(cnsts))
+ JMPENT( L(6), L(cnsts))
+ JMPENT( L(7), L(cnsts))
+ .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+ .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+ .byte -4,-4,-4,-4,-4,-4,-4,-4
+ .byte -4,-4,-4,-4,-4,-4,-4,-4
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
+')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sec_tabselect.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm
new file mode 100644
index 0000000..4ba673d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_sublsh1_n
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc)
+include_mpn(`x86_64/atom/sublsh1_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm
new file mode 100644
index 0000000..b167077
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm
@@ -0,0 +1,96 @@
+dnl AMD64 mpn_gcd_11 optimised for AMD BD2, BD3, BT2.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit (approx)
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 5.4
+C AMD bd2 3.72
+C AMD bd3 ?
+C AMD bd4 4.12
+C AMD bt1 9.0
+C AMD bt2 3.97
+C AMD zn1 3.36
+C AMD zn2 3.33
+C Intel P4 ?
+C Intel CNR ?
+C Intel PNR ?
+C Intel NHM ?
+C Intel WSM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom ?
+C Intel SLM ?
+C Intel GLM ?
+C Intel GLM+ ?
+C VIA nano ?
+
+define(`u0', `%rdi')
+define(`v0', `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+ FUNC_ENTRY(2)
+ mov v0, %rdx
+ sub u0, %rdx
+ jz L(end)
+
+ ALIGN(16)
+L(top): rep;bsf %rdx, %rcx C tzcnt!
+ mov u0, %rax
+ sub v0, u0 C u - v
+ cmovc %rdx, u0 C u = |u - v|
+ cmovc %rax, v0 C v = min(u,v)
+ shr R8(%rcx), u0
+ mov v0, %rdx
+ sub u0, %rdx C v - u
+ jnz L(top)
+
+L(end): mov v0, %rax
+ C rax = result
+ C rdx = 0 for the benefit of internal gcd_22 call
+ FUNC_EXIT()
+ ret
+EPILOGUE()
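
The loop above is a right-shift binary GCD: each step replaces u by |u - v| with its trailing zero bits stripped and v by min(u,v), the tzcnt/bsf supplying the shift count.  In C the same algorithm reads as follows (an illustrative sketch, not the GMP code; ref_gcd_11 is a hypothetical name; both operands are assumed odd, the usual precondition for this routine; __builtin_ctzll stands in for tzcnt):

unsigned long long ref_gcd_11 (unsigned long long u, unsigned long long v)
{
  while (u != v)
    {
      unsigned long long d = u > v ? u - v : v - u;  /* |u - v|, even and non-zero */
      if (v > u)
        v = u;                                       /* v = min(u,v), stays odd */
      u = d >> __builtin_ctzll (d);                  /* make u odd again */
    }
  return v;                                          /* gcd of the two inputs */
}
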
diff --git a/gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm
new file mode 100644
index 0000000..a4f30ea
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm
@@ -0,0 +1,142 @@
+dnl AMD64 mpn_gcd_22. Assumes useless bsf, useless shrd, tzcnt, no shlx.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit
+C AMD K8,K9 12.3
+C AMD K10 8.0
+C AMD bd1 10.0
+C AMD bd2 7.2
+C AMD bd3 ?
+C AMD bd4 6.7
+C AMD bt1 13.6
+C AMD bt2 8.9
+C AMD zn1 5.7
+C AMD zn2 5.6
+C Intel P4 ?
+C Intel CNR 9.7
+C Intel PNR 9.7
+C Intel NHM 9.4
+C Intel WSM 9.5
+C Intel SBR 10.3
+C Intel IBR ?
+C Intel HWL 8.2
+C Intel BWL 7.4
+C Intel SKL 7.3
+C Intel atom 26.5
+C Intel SLM 17.4
+C Intel GLM 13.4
+C Intel GLM+ 12.4
+C VIA nano ?
+
+
+define(`u1', `%rdi')
+define(`u0', `%rsi')
+define(`v1', `%rdx')
+define(`v0_param', `%rcx')
+
+define(`v0', `%rax')
+define(`cnt', `%rcx')
+
+define(`s0', `%r8')
+define(`s1', `%r9')
+define(`t0', `%r10')
+define(`t1', `%r11')
+
+dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_22)
+ FUNC_ENTRY(4)
+ mov v0_param, v0
+
+ ALIGN(16)
+L(top): mov v0, t0
+ sub u0, t0
+ jz L(lowz) C jump when low limb result = 0
+ mov v1, t1
+ sbb u1, t1
+
+ rep;bsf t0, cnt C tzcnt!
+ mov u0, s0
+ mov u1, s1
+
+ sub v0, u0
+ sbb v1, u1
+
+L(bck): cmovc t0, u0 C u = |u - v|
+ cmovc t1, u1 C u = |u - v|
+ cmovc s0, v0 C v = min(u,v)
+ cmovc s1, v1 C v = min(u,v)
+
+C Rightshift (u1,,u0) into (u1,,u0)
+L(shr): shr R8(cnt), u0
+ mov u1, t1
+ shr R8(cnt), u1
+ neg cnt
+ shl R8(cnt), t1
+ or t1, u0
+
+ test v1, v1
+ jnz L(top)
+ test u1, u1
+ jnz L(top)
+
+L(gcd_11):
+ mov v0, %rdi
+C mov u0, %rsi
+ TCALL( mpn_gcd_11)
+
+L(lowz):C We come here when v0 - u0 = 0
+ C 1. If v1 - u1 = 0, then gcd is u = v.
+ C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+ mov v1, t0
+ sub u1, t0
+ je L(end)
+
+ xor t1, t1
+ rep;bsf t0, cnt C tzcnt!
+ mov u0, s0
+ mov u1, s1
+ mov u1, u0
+ xor u1, u1
+ sub v1, u0
+ jmp L(bck)
+
+L(end): C mov v0, %rax
+ C mov v1, %rdx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
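
This is the same right-shift binary GCD as gcd_11, carried out on two-limb operands: the lowz path handles a zero low-limb difference, and once both high limbs reach zero the code tail-calls mpn_gcd_11.  A compact C sketch using unsigned __int128 as a stand-in for the limb pair (illustrative only, not the GMP code; ref_gcd_22 and ctz128 are hypothetical names; both operands assumed odd):

typedef unsigned __int128 u128;

static int ctz128 (u128 x)              /* trailing zeros of a non-zero value */
{
  unsigned long long lo = (unsigned long long) x;
  if (lo != 0)
    return __builtin_ctzll (lo);
  return 64 + __builtin_ctzll ((unsigned long long) (x >> 64));
}

u128 ref_gcd_22 (u128 u, u128 v)
{
  while (u != v)
    {
      u128 d = u > v ? u - v : v - u;   /* |u - v|, even and non-zero */
      if (v > u)
        v = u;                          /* v = min(u,v) */
      u = d >> ctz128 (d);              /* strip the factors of two */
    }
  return v;
}
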
diff --git a/gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h
new file mode 100644
index 0000000..61573ea
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h
@@ -0,0 +1,263 @@
+/* AMD bd2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 4000-4200 MHz Piledriver Vishera */
+/* FFT tuning limit = 464,626,631 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 23
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 34
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 2
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 22
+
+#define DIV_1_VS_MUL_1_PERCENT 293
+
+#define MUL_TOOM22_THRESHOLD 16
+#define MUL_TOOM33_THRESHOLD 57
+#define MUL_TOOM44_THRESHOLD 152
+#define MUL_TOOM6H_THRESHOLD 230
+#define MUL_TOOM8H_THRESHOLD 309
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 103
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 142
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 20
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 200
+#define SQR_TOOM6_THRESHOLD 286
+#define SQR_TOOM8_THRESHOLD 430
+
+#define MULMID_TOOM42_THRESHOLD 20
+
+#define MULMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 13
+
+#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 372, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
+ { 63, 9}, { 39,10}, { 23, 9}, { 55,11}, \
+ { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \
+ { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 135,11}, { 79, 8}, \
+ { 639, 9}, { 335,10}, { 175, 9}, { 351,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 255,11}, \
+ { 143,10}, { 287,11}, { 159,12}, { 95,11}, \
+ { 191,13}, { 63,12}, { 127,11}, { 271,10}, \
+ { 543,11}, { 287,12}, { 159,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \
+ { 575,12}, { 319,11}, { 639,10}, { 1279,12}, \
+ { 351,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,10}, { 1663,12}, { 447,14}, \
+ { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \
+ { 543,11}, { 1087,10}, { 2175,12}, { 575,11}, \
+ { 1151,13}, { 319,12}, { 639,11}, { 1279,12}, \
+ { 671,11}, { 1343,10}, { 2687,12}, { 703,11}, \
+ { 1407,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 895,14}, { 255,13}, { 511,12}, \
+ { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
+ { 2431,10}, { 4863,13}, { 639,12}, { 1343,11}, \
+ { 2687,13}, { 703,12}, { 1407,11}, { 2815,14}, \
+ { 383,13}, { 767,12}, { 1599,13}, { 831,12}, \
+ { 1727,11}, { 3455,13}, { 895,15}, { 255,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1215,12}, \
+ { 2431,11}, { 4863,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1407,12}, { 2815,13}, { 1471,12}, \
+ { 2943,11}, { 5887,14}, { 767,13}, { 1599,12}, \
+ { 3199,13}, { 1727,12}, { 3455,14}, { 895,13}, \
+ { 1791,12}, { 3583,13}, { 1919,12}, { 3839,11}, \
+ { 7679,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2815,12}, { 5631,13}, { 2943,12}, { 5887,15}, \
+ { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \
+ { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \
+ { 1919,13}, { 3839,12}, { 7679,16}, { 511,15}, \
+ { 1023,14}, { 2175,13}, { 4479,14}, { 2303,13}, \
+ { 4607,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
+ { 2815,13}, { 5631,14}, { 2943,13}, { 5887,12}, \
+ { 11775,15}, { 1535,14}, { 3455,13}, { 6911,15}, \
+ { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \
+ { 2047,14}, { 4479,13}, { 8959,15}, { 2303,14}, \
+ { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \
+ { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,15}, { 7935,14}, \
+ { 15871,17}, { 2047,16}, { 4095,15}, { 8959,16}, \
+ { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \
+ { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 262
+#define MUL_FFT_THRESHOLD 4544
+
+#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 344, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
+ { 159,11}, { 95,10}, { 191,12}, { 63, 9}, \
+ { 511,10}, { 271,11}, { 143,10}, { 303,11}, \
+ { 159,12}, { 95,11}, { 191,13}, { 63,12}, \
+ { 127,11}, { 287,10}, { 575,11}, { 303,12}, \
+ { 159,11}, { 351,12}, { 191,11}, { 383,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,12}, { 287,11}, { 575,10}, \
+ { 1151,11}, { 607,12}, { 319,11}, { 639,10}, \
+ { 1279,12}, { 351,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,10}, { 2175,12}, \
+ { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 671,11}, { 1343,10}, \
+ { 2687,12}, { 703,11}, { 1407,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \
+ { 831,11}, { 1663,13}, { 447,12}, { 895,14}, \
+ { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \
+ { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \
+ { 639,12}, { 1343,11}, { 2687,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1599,13}, \
+ { 831,12}, { 1727,13}, { 895,15}, { 255,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1151,12}, \
+ { 2303,13}, { 1215,12}, { 2431,11}, { 4863,14}, \
+ { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \
+ { 2815,13}, { 1471,12}, { 2943,11}, { 5887,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \
+ { 3455,14}, { 895,13}, { 1791,12}, { 3583,13}, \
+ { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \
+ { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
+ { 1407,13}, { 2943,12}, { 5887,11}, { 11775,15}, \
+ { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \
+ { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \
+ { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \
+ { 2175,13}, { 4479,14}, { 2303,13}, { 4607,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2815,13}, \
+ { 5631,14}, { 2943,13}, { 5887,12}, { 11775,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \
+ { 4479,13}, { 8959,15}, { 2303,14}, { 4863,15}, \
+ { 2815,14}, { 5887,13}, { 11775,16}, { 1535,15}, \
+ { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \
+ { 1023,16}, { 2047,15}, { 4351,14}, { 8959,15}, \
+ { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \
+ { 3071,15}, { 6911,16}, { 3583,15}, { 7679,14}, \
+ { 15359,15}, { 7935,14}, { 15871,17}, { 2047,16}, \
+ { 4095,15}, { 8959,16}, { 4607,15}, { 9983,14}, \
+ { 19967,16}, { 5119,15}, { 10239,16}, { 5631,15}, \
+ { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 254
+#define SQR_FFT_THRESHOLD 2880
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 30
+#define MULLO_MUL_N_THRESHOLD 8907
+#define SQRLO_BASECASE_THRESHOLD 8
+#define SQRLO_DC_THRESHOLD 53
+#define SQRLO_SQR_THRESHOLD 5724
+
+#define DC_DIV_QR_THRESHOLD 52
+#define DC_DIVAPPR_Q_THRESHOLD 159
+#define DC_BDIV_QR_THRESHOLD 44
+#define DC_BDIV_Q_THRESHOLD 79
+
+#define INV_MULMOD_BNM1_THRESHOLD 30
+#define INV_NEWTON_THRESHOLD 172
+#define INV_APPR_THRESHOLD 172
+
+#define BINV_NEWTON_THRESHOLD 226
+#define REDC_1_TO_REDC_2_THRESHOLD 40
+#define REDC_2_TO_REDC_N_THRESHOLD 51
+
+#define MU_DIV_QR_THRESHOLD 1308
+#define MU_DIVAPPR_Q_THRESHOLD 1258
+#define MUPI_DIV_QR_THRESHOLD 85
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1210
+
+#define POWM_SEC_TABLE 3,16,129,523,1297
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 228
+#define SET_STR_PRECOMPUTE_THRESHOLD 1033
+
+#define FAC_DSC_THRESHOLD 172
+#define FAC_ODD_THRESHOLD 28
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD2_DIV1_METHOD 1 /* 8.54% faster than 3 */
+#define HGCD_THRESHOLD 108
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 393
+#define GCDEXT_DC_THRESHOLD 278
+#define JACOBI_BASE_METHOD 4 /* 13.69% faster than 1 */
+
+/* Tuneup completed successfully, took 463931 seconds */
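Editor's note: the tuned constants above are consumed by GMP's dispatch code, which compares an operand size against each cutoff to choose an algorithm. The sketch below is illustrative only (the function and return strings are hypothetical, not GMP's internal API); it shows roughly how a single cutoff such as MUL_TOOM22_THRESHOLD gates the switch from schoolbook multiplication to Toom-2/Karatsuba.

/* Hedged sketch: how a tuned cutoff is typically used.  Not GMP source. */
#include <stdio.h>

#define MUL_TOOM22_THRESHOLD 16   /* the bd2-tuned value from the table above */

static const char *mul_algorithm (long n_limbs)
{
  /* Below the cutoff the O(n^2) basecase wins; at or above it, Toom-2
     (Karatsuba) takes over.  Real GMP chains several such cutoffs. */
  return (n_limbs < MUL_TOOM22_THRESHOLD) ? "basecase" : "toom22";
}

int main (void)
{
  printf ("n=10 limbs -> %s\n", mul_algorithm (10));
  printf ("n=40 limbs -> %s\n", mul_algorithm (40));
  return 0;
}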
diff --git a/gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm
new file mode 100644
index 0000000..ff0d27b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm
@@ -0,0 +1,38 @@
+dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+include_mpn(`x86_64/zen/aorrlsh_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm
new file mode 100644
index 0000000..4176b85
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm
@@ -0,0 +1,96 @@
+dnl AMD64 mpn_gcd_11 optimised for AMD BD4, ZN1.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit (approx)
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bd1 -
+C AMD bd2 -
+C AMD bd3 -
+C AMD bd4 3.73
+C AMD bt1 -
+C AMD bt2 -
+C AMD zn1 3.33
+C AMD zn2 3.48
+C Intel P4 -
+C Intel CNR -
+C Intel PNR -
+C Intel NHM -
+C Intel WSM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom -
+C Intel SLM -
+C Intel GLM -
+C Intel GLM+ -
+C VIA nano -
+
+define(`u0', `%rdi')
+define(`v0', `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+ FUNC_ENTRY(2)
+ mov u0, %rax
+ mov v0, %rdx
+ sub u0, %rdx C v - u
+ jz L(end)
+
+ ALIGN(16)
+L(top): rep;bsf %rdx, %rcx C tzcnt!
+ sub v0, u0 C u - v
+ cmovc %rdx, u0 C u = |u - v|
+ cmovc %rax, v0 C v = min(u,v)
+ shrx( %rcx, u0, %rax)
+ shrx( %rcx, u0, u0)
+ mov v0, %rdx
+ sub %rax, %rdx C v - u
+ jnz L(top)
+
+L(end): C rax = result
+ C rdx = 0 for the benefit of internal gcd_22 call
+ FUNC_EXIT()
+ ret
+EPILOGUE()
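Editor's note: the loop above is a right-shift binary GCD on two odd limbs: it forms |u - v|, keeps the smaller operand, and strips trailing zeros using the tzcnt obtained via rep;bsf. A minimal C sketch of the same recurrence, assuming GCC/Clang builtins for the trailing-zero count (illustrative, not GMP source):

#include <stdint.h>

/* Both operands are assumed odd and nonzero, the documented precondition
   of mpn_gcd_11. */
static uint64_t gcd_11_sketch (uint64_t u, uint64_t v)
{
  uint64_t t = v - u;
  while (t != 0)
    {
      int cnt = __builtin_ctzll (t);   /* the rep;bsf / tzcnt in the loop */
      uint64_t old_u = u;
      u -= v;                          /* may wrap; fixed up below */
      if (old_u < v)
        {
          u = t;                       /* u = |old_u - v| */
          v = old_u;                   /* v = min(old_u, v) */
        }
      u >>= cnt;                       /* strip trailing zeros; u is odd again */
      t = v - u;
    }
  return v;                            /* u == v here; the asm returns it in rax */
}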
diff --git a/gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm
new file mode 100644
index 0000000..5dfd9e3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_22.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_22)
+include_mpn(`x86_64/coreihwl/gcd_22.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h
new file mode 100644
index 0000000..9d2038c
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h
@@ -0,0 +1,266 @@
+/* AMD bd4 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3800-4200 MHz Excavator/Bristol Ridge */
+/* FFT tuning limit = 461,179,335 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 17
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 52
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 25
+
+#define DIV_1_VS_MUL_1_PERCENT 298
+
+#define MUL_TOOM22_THRESHOLD 16
+#define MUL_TOOM33_THRESHOLD 53
+#define MUL_TOOM44_THRESHOLD 142
+#define MUL_TOOM6H_THRESHOLD 206
+#define MUL_TOOM8H_THRESHOLD 292
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 83
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 102
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 98
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 82
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 20
+#define SQR_TOOM3_THRESHOLD 71
+#define SQR_TOOM4_THRESHOLD 202
+#define SQR_TOOM6_THRESHOLD 298
+#define SQR_TOOM8_THRESHOLD 466
+
+#define MULMID_TOOM42_THRESHOLD 20
+
+#define MULMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 14
+
+#define MUL_FFT_MODF_THRESHOLD 316 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 316, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 99,10}, { 55,11}, { 31,10}, { 87,11}, \
+ { 47,10}, { 95, 9}, { 191,10}, { 103,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \
+ { 135, 9}, { 271, 5}, { 4351, 6}, { 2303, 7}, \
+ { 1215, 8}, { 639,10}, { 175,11}, { 95,10}, \
+ { 191, 9}, { 383,10}, { 207, 9}, { 415,11}, \
+ { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \
+ { 319, 9}, { 639,11}, { 175,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,10}, { 415, 9}, \
+ { 831,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 351,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,12}, { 223,11}, \
+ { 447,10}, { 895,11}, { 479,13}, { 127,12}, \
+ { 255,11}, { 543,12}, { 287,11}, { 607,12}, \
+ { 319,11}, { 639,12}, { 351,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,11}, { 895,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 543,11}, { 1087,12}, { 607,13}, \
+ { 319,12}, { 671,11}, { 1343,10}, { 2687,12}, \
+ { 703,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 831,13}, { 447,12}, { 895,11}, { 1791,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1087,13}, \
+ { 575,12}, { 1151,11}, { 2303,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \
+ { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1151,12}, \
+ { 2303,13}, { 1215,12}, { 2431,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \
+ { 1471,14}, { 767,13}, { 1535,12}, { 3071,13}, \
+ { 1663,14}, { 895,13}, { 1791,12}, { 3583,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2815,15}, { 767,14}, { 1535,13}, { 3071,14}, \
+ { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \
+ { 3583,14}, { 1919,16}, { 511,15}, { 1023,14}, \
+ { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \
+ { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \
+ { 7679,16}, { 1023,15}, { 2047,14}, { 4351,15}, \
+ { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \
+ { 1535,15}, { 3071,14}, { 6143,15}, { 3327,14}, \
+ { 6911,15}, { 3839,14}, { 7679,17}, { 1023,16}, \
+ { 2047,15}, { 4863,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \
+ { 7679,17}, { 2047,16}, { 4095,15}, { 8191,16}, \
+ { 4607,15}, { 9983,16}, { 5631,15}, { 11775,17}, \
+ { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 253
+#define MUL_FFT_THRESHOLD 4224
+
+#define SQR_FFT_MODF_THRESHOLD 300 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 300, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 63,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95, 9}, \
+ { 191, 8}, { 383,10}, { 103,12}, { 31,11}, \
+ { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \
+ { 271, 8}, { 543,11}, { 79,10}, { 159, 9}, \
+ { 319, 8}, { 639,10}, { 175,11}, { 95,10}, \
+ { 191, 9}, { 383, 5}, { 6399, 6}, { 3327, 7}, \
+ { 1727, 6}, { 3455, 7}, { 1791,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \
+ { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \
+ { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
+ { 415, 9}, { 831,13}, { 63,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,10}, { 607,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,10}, { 895,11}, \
+ { 479,12}, { 255,11}, { 511,10}, { 1023,11}, \
+ { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,12}, { 319,11}, { 639,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,11}, { 895,12}, \
+ { 479,13}, { 255,12}, { 511,11}, { 1023,12}, \
+ { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \
+ { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \
+ { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1087,13}, { 575,12}, { 1151,11}, { 2303,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1343,13}, \
+ { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \
+ { 959,15}, { 255,14}, { 511,13}, { 1023,12}, \
+ { 2047,13}, { 1087,12}, { 2175,13}, { 1151,12}, \
+ { 2303,13}, { 1215,12}, { 2431,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \
+ { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1663,14}, { 895,13}, { 1791,12}, { 3583,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \
+ { 3455,14}, { 1791,13}, { 3583,14}, { 1919,16}, \
+ { 511,15}, { 1023,14}, { 2303,13}, { 4607,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2815,13}, \
+ { 5631,14}, { 2943,13}, { 5887,15}, { 1535,14}, \
+ { 3455,15}, { 1791,14}, { 3583,13}, { 7167,14}, \
+ { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \
+ { 4223,15}, { 2303,14}, { 4863,15}, { 2815,14}, \
+ { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \
+ { 3327,14}, { 6911,15}, { 3583,14}, { 7167,15}, \
+ { 3839,14}, { 7679,17}, { 1023,16}, { 2047,15}, \
+ { 4095,14}, { 8191,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \
+ { 4095,15}, { 8447,16}, { 4607,15}, { 9983,16}, \
+ { 5119,15}, { 10239,16}, { 5631,15}, { 11775,17}, \
+ { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 273
+#define SQR_FFT_THRESHOLD 2752
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 43
+#define MULLO_MUL_N_THRESHOLD 8397
+#define SQRLO_BASECASE_THRESHOLD 6
+#define SQRLO_DC_THRESHOLD 54
+#define SQRLO_SQR_THRESHOLD 5397
+
+#define DC_DIV_QR_THRESHOLD 39
+#define DC_DIVAPPR_Q_THRESHOLD 165
+#define DC_BDIV_QR_THRESHOLD 39
+#define DC_BDIV_Q_THRESHOLD 76
+
+#define INV_MULMOD_BNM1_THRESHOLD 30
+#define INV_NEWTON_THRESHOLD 177
+#define INV_APPR_THRESHOLD 155
+
+#define BINV_NEWTON_THRESHOLD 230
+#define REDC_1_TO_REDC_2_THRESHOLD 28
+#define REDC_2_TO_REDC_N_THRESHOLD 43
+
+#define MU_DIV_QR_THRESHOLD 1142
+#define MU_DIVAPPR_Q_THRESHOLD 1142
+#define MUPI_DIV_QR_THRESHOLD 66
+#define MU_BDIV_QR_THRESHOLD 998
+#define MU_BDIV_Q_THRESHOLD 1142
+
+#define POWM_SEC_TABLE 1,16,175,269,839,1420
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 19
+#define SET_STR_DC_THRESHOLD 552
+#define SET_STR_PRECOMPUTE_THRESHOLD 1038
+
+#define FAC_DSC_THRESHOLD 151
+#define FAC_ODD_THRESHOLD 23
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD2_DIV1_METHOD 1 /* 8.11% faster than 3 */
+#define HGCD_THRESHOLD 87
+#define HGCD_APPR_THRESHOLD 96
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 327
+#define GCDEXT_DC_THRESHOLD 241
+#define JACOBI_BASE_METHOD 4 /* 21.40% faster than 1 */
+
+/* Tuneup completed successfully, took 431056 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm
new file mode 100644
index 0000000..a53bd52
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm
@@ -0,0 +1,106 @@
+dnl x86_64 mpn_bdiv_dbm1c.
+
+dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.25
+C AMD K10 2.25
+C Intel P4 12.5
+C Intel core2 4
+C Intel NHM 3.75
+C Intel SBR 3.6
+C Intel atom 20
+C VIA nano 4
+
+C TODO
+C * Optimise feed-in code.
+
+C INPUT PARAMETERS
+define(`qp', `%rdi')
+define(`up', `%rsi')
+define(`n_param', `%rdx')
+define(`bd', `%rcx')
+define(`cy', `%r8')
+
+define(`n', `%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_dbm1c)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ mov (up), %rax
+ mov n_param, n
+ mov R32(n_param), R32(%r11)
+ mul bd
+ lea (up,n,8), up
+ lea (qp,n,8), qp
+ neg n
+ and $3, R32(%r11)
+ jz L(lo0)
+ lea -4(n,%r11), n
+ cmp $2, R32(%r11)
+ jc L(lo1)
+ jz L(lo2)
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top): mov (up,n,8), %rax
+ mul bd
+L(lo0): sub %rax, %r8
+ mov %r8, (qp,n,8)
+ sbb %rdx, %r8
+ mov 8(up,n,8), %rax
+ mul bd
+L(lo3): sub %rax, %r8
+ mov %r8, 8(qp,n,8)
+ sbb %rdx, %r8
+ mov 16(up,n,8), %rax
+ mul bd
+L(lo2): sub %rax, %r8
+ mov %r8, 16(qp,n,8)
+ sbb %rdx, %r8
+ mov 24(up,n,8), %rax
+ mul bd
+L(lo1): sub %rax, %r8
+ mov %r8, 24(qp,n,8)
+ sbb %rdx, %r8
+ add $4, n
+ jnz L(top)
+
+ mov %r8, %rax
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm
new file mode 100644
index 0000000..85538c9
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm
@@ -0,0 +1,195 @@
+dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
+
+dnl Copyright 2001, 2002, 2004-2006, 2010-2012, 2017 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb
+C norm unorm
+C AMD K8,K9 11 11
+C AMD K10 11 11
+C AMD bull 13.5 14
+C AMD pile 14 15
+C AMD steam
+C AMD excavator
+C AMD bobcat 14 14
+C AMD jaguar 14.5 15
+C Intel P4 33 33
+C Intel core2 13.5 13.25
+C Intel NHM 14 14
+C Intel SBR 8 8.25
+C Intel IBR 7.75 7.85
+C Intel HWL 8 8
+C Intel BWL 8 8
+C Intel SKL 8 8
+C Intel atom 34 36
+C Intel SLM 13.7 13.5
+C VIA nano 19.25 19.25 needs re-measuring
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`d', `%rcx')
+define(`di', `%r8') C just mpn_pi1_bdiv_q_1
+define(`ncnt', `%r9') C just mpn_pi1_bdiv_q_1
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+ FUNC_ENTRY(4)
+ push %rbx
+
+ mov %rcx, %rax
+ xor R32(%rcx), R32(%rcx) C ncnt count
+ mov %rdx, %r10
+
+ bt $0, R32(%rax)
+ jnc L(evn) C skip bsf unless divisor is even
+
+L(odd): mov %rax, %rbx
+ shr R32(%rax)
+ and $127, R32(%rax) C d/2, 7 bits
+
+ LEA( binvert_limb_table, %rdx)
+
+ movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
+
+ mov %rbx, %r11 C d without twos
+
+ lea (%rax,%rax), R32(%rdx) C 2*inv
+ imul R32(%rax), R32(%rax) C inv*inv
+ imul R32(%rbx), R32(%rax) C inv*inv*d
+ sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits
+
+ lea (%rdx,%rdx), R32(%rax) C 2*inv
+ imul R32(%rdx), R32(%rdx) C inv*inv
+ imul R32(%rbx), R32(%rdx) C inv*inv*d
+ sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits
+
+ lea (%rax,%rax), %r8 C 2*inv
+ imul %rax, %rax C inv*inv
+ imul %rbx, %rax C inv*inv*d
+ sub %rax, %r8 C inv = 2*inv - inv*inv*d, 64 bits
+
+ jmp L(pi1)
+
+L(evn): bsf %rax, %rcx
+ shr R8(%rcx), %rax
+ jmp L(odd)
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+ push %rbx
+
+ mov %rcx, %r11 C d
+ mov %rdx, %r10 C n
+ mov %r9, %rcx C ncnt
+
+L(pi1): mov (up), %rax C up[0]
+
+ dec %r10
+ jz L(one)
+
+ lea 8(up,%r10,8), up C up end
+ lea (rp,%r10,8), rp C rp end
+ neg %r10 C -n
+
+ test R32(%rcx), R32(%rcx)
+ jnz L(unorm) C branch if count != 0
+ xor R32(%rbx), R32(%rbx)
+ jmp L(nent)
+
+ ALIGN(8)
+L(ntop):mul %r11 C carry limb in rdx 0 10
+ mov -8(up,%r10,8), %rax C
+ sub %rbx, %rax C apply carry bit
+ setc R8(%rbx) C
+ sub %rdx, %rax C apply carry limb 5
+ adc $0, R32(%rbx) C 6
+L(nent):imul %r8, %rax C 6
+ mov %rax, (rp,%r10,8) C
+ inc %r10 C
+ jnz L(ntop)
+
+ mov -8(up), %r9 C up high limb
+ jmp L(com)
+
+L(unorm):
+ mov (up,%r10,8), %r9 C up[1]
+ shr R8(%rcx), %rax C
+ neg R32(%rcx)
+ shl R8(%rcx), %r9 C
+ neg R32(%rcx)
+ or %r9, %rax
+ xor R32(%rbx), R32(%rbx)
+ jmp L(uent)
+
+ ALIGN(8)
+L(utop):mul %r11 C carry limb in rdx 0 10
+ mov (up,%r10,8), %rax C
+ shl R8(%rcx), %rax C
+ neg R32(%rcx)
+ or %r9, %rax
+ sub %rbx, %rax C apply carry bit
+ setc R8(%rbx) C
+ sub %rdx, %rax C apply carry limb 5
+ adc $0, R32(%rbx) C 6
+L(uent):imul %r8, %rax C 6
+ mov (up,%r10,8), %r9 C
+ shr R8(%rcx), %r9 C
+ neg R32(%rcx)
+ mov %rax, (rp,%r10,8) C
+ inc %r10 C
+ jnz L(utop)
+
+L(com): mul %r11 C carry limb in rdx
+ sub %rbx, %r9 C apply carry bit
+ sub %rdx, %r9 C apply carry limb
+ imul %r8, %r9
+ mov %r9, (rp)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(one): shr R8(%rcx), %rax
+ imul %r8, %rax
+ mov %rax, (rp)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
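Editor's note: the L(odd) block above builds the 64-bit inverse of the odd divisor modulo 2^64 by Newton/Hensel lifting; each step "inv = 2*inv - inv*inv*d" doubles the number of correct low bits. The assembly seeds the iteration from the 8-bit binvert_limb_table; the sketch below instead starts from the 3-bit-correct seed inv = d and performs two extra lifting steps, giving the same result (illustrative, not GMP source):

#include <stdint.h>

static uint64_t binvert_sketch (uint64_t d)   /* d must be odd */
{
  uint64_t inv = d;        /* d*d == 1 (mod 8), so d is its own 3-bit inverse */
  inv *= 2 - d * inv;      /*  6 correct bits */
  inv *= 2 - d * inv;      /* 12 */
  inv *= 2 - d * inv;      /* 24 */
  inv *= 2 - d * inv;      /* 48 */
  inv *= 2 - d * inv;      /* 96 >= 64, so the full limb is now correct */
  return inv;              /* d * inv == 1 (mod 2^64) */
}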
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm b/gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm
new file mode 100644
index 0000000..9b6b5c7
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm
@@ -0,0 +1,159 @@
+dnl AMD64 mpn_add_n, mpn_sub_n optimised for bobcat.
+
+dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1.77
+C AMD K10 1.76\1.82
+C AMD bd1 1.67\2.12
+C AMD bd2 1.62\1.82
+C AMD bd3
+C AMD bd4 1.55\2.2
+C AMD zen
+C AMD bt1 2.54
+C AMD bt2 2
+C Intel P4 11
+C Intel PNR 4.76
+C Intel NHM 5.27
+C Intel SBR 2
+C Intel IBR 1.94
+C Intel HWL 1.63
+C Intel BWL 1.51
+C Intel SKL 1.51
+C Intel atom 3.56
+C Intel SLM 4
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`vp', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ xor %r8, %r8
+L(ent): test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): shr $2, n
+ neg %r8
+ mov $3, R32(%rax)
+ mov (up), %r10
+ mov 8(up), %r11
+ jmp L(lo0)
+
+L(b10): shr $2, n
+ neg %r8
+ mov $1, R32(%rax)
+ mov (up), %r8
+ mov 8(up), %r9
+ jrcxz L(cj2)
+ jmp L(top)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): shr $2, n
+ neg %r8
+ mov $0, R32(%rax)
+ mov (up), %r9
+ jrcxz L(cj1)
+ mov 8(up), %r10
+ jmp L(lo1)
+
+ ALIGN(8)
+L(b11): inc n
+ shr $2, n
+ neg %r8
+ mov $2, R32(%rax)
+ mov (up), %r11
+ jmp L(lo3)
+
+ ALIGN(4)
+L(top): mov 8(up,%rax,8), %r10
+ ADCSBB -8(vp,%rax,8), %r8
+ mov %r8, -8(rp,%rax,8)
+L(lo1): mov 16(up,%rax,8), %r11
+ ADCSBB (vp,%rax,8), %r9
+ lea 4(%rax), %rax
+ mov %r9, -32(rp,%rax,8)
+L(lo0): ADCSBB -24(vp,%rax,8), %r10
+ mov %r10, -24(rp,%rax,8)
+L(lo3): ADCSBB -16(vp,%rax,8), %r11
+ dec n
+ mov -8(up,%rax,8), %r8
+ mov %r11, -16(rp,%rax,8)
+L(lo2): mov (up,%rax,8), %r9
+ jnz L(top)
+
+L(cj2): ADCSBB -8(vp,%rax,8), %r8
+ mov %r8, -8(rp,%rax,8)
+L(cj1): ADCSBB (vp,%rax,8), %r9
+ mov %r9, (rp,%rax,8)
+
+ mov $0, R32(%rax)
+ adc $0, R32(%rax)
+
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ jmp L(ent)
+EPILOGUE()
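Editor's note: the unrolled loop above computes the limb-wise sum (or difference, for mpn_sub_n) of two n-limb operands while propagating a single carry (borrow) bit, and returns the bit that falls out of the top limb; mpn_add_nc and mpn_sub_nc simply let the caller supply the incoming bit. A plain C sketch of the addition case (illustrative, not GMP source):

#include <stdint.h>
#include <stddef.h>

static uint64_t add_n_sketch (uint64_t *rp, const uint64_t *up,
                              const uint64_t *vp, size_t n, uint64_t cy)
{
  /* cy is 0 for mpn_add_n; mpn_add_nc passes the caller's carry-in. */
  for (size_t i = 0; i < n; i++)
    {
      uint64_t s = up[i] + cy;
      uint64_t c1 = s < cy;        /* carry from adding the old carry bit */
      uint64_t r = s + vp[i];
      uint64_t c2 = r < s;         /* carry from adding vp[i] */
      rp[i] = r;
      cy = c1 | c2;                /* at most one of the two can be set */
    }
  return cy;                       /* carry out of the top limb */
}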
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm
new file mode 100644
index 0000000..41e1d8a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm
@@ -0,0 +1,191 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bt1/bt2.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012, 2018-2019 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.52 old measurement
+C AMD K10 4.51 old measurement
+C AMD bd1 4.66 old measurement
+C AMD bd2 4.57 old measurement
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen ?
+C AMD bt1 5.04
+C AMD bt2 5.07
+C Intel P4 16.8 18.6 old measurement
+C Intel PNR 5.59 old measurement
+C Intel NHM 5.39 old measurement
+C Intel SBR 3.93 old measurement
+C Intel IBR 3.59 old measurement
+C Intel HWL 3.61 old measurement
+C Intel BWL 2.76 old measurement
+C Intel SKL 2.77 old measurement
+C Intel atom 23 old measurement
+C Intel SLM 8 old measurement
+C Intel GLM ?
+C VIA nano 5.63 old measurement
+
+C The ALIGNments here might look completely ad-hoc. They are not.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+C Standard parameters
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param', `%rdx')
+define(`v0', `%rcx')
+C Standard allocations
+define(`n', `%rbx')
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+C DOS64 parameters
+IFDOS(` define(`rp', `%rcx') ') dnl
+IFDOS(` define(`up', `%rsi') ') dnl
+IFDOS(` define(`n_param', `%r8') ') dnl
+IFDOS(` define(`v0', `%r9') ') dnl
+C DOS64 allocations
+IFDOS(` define(`n', `%rbx') ') dnl
+IFDOS(` define(`w0', `%r8') ') dnl
+IFDOS(` define(`w1', `%rdi') ') dnl
+IFDOS(` define(`w2', `%r10') ') dnl
+IFDOS(` define(`w3', `%r11') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(func)
+IFDOS(` push %rsi ')
+IFDOS(` push %rdi ')
+IFDOS(` mov %rdx, %rsi ')
+
+ push %rbx
+ mov (up), %rax
+
+ lea (rp,n_param,8), rp
+ lea (up,n_param,8), up
+ mov n_param, n
+
+ test $1, R8(n_param)
+ jne L(bx1)
+
+L(bx0): mul v0
+ neg n
+ mov %rax, w0
+ mov %rdx, w1
+ test $2, R8(n)
+ jne L(L2)
+
+L(b00): add $2, n
+ jmp L(L0)
+
+ ALIGN(16)
+L(bx1): mul v0
+ test $2, R8(n)
+ je L(b01)
+
+L(b11): mov %rax, w2
+ mov %rdx, w3
+ neg n
+ inc n
+ jmp L(L3)
+
+ ALIGN(16)
+L(b01): sub $3, n
+ jc L(n1)
+ mov %rax, w2
+ mov %rdx, w3
+ neg n
+
+ ALIGN(16)
+L(top): mov -16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ ADDSUB w2, -24(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+L(L0): mov -8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ ADDSUB w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+L(L3): mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ ADDSUB w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+L(L2): mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ ADDSUB w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ add $4, n
+ js L(top)
+
+L(end): xor R32(%rax), R32(%rax)
+ ADDSUB w2, -8(rp)
+ adc w3, %rax
+ pop %rbx
+IFDOS(` pop %rdi ')
+IFDOS(` pop %rsi ')
+ ret
+
+ ALIGN(32)
+L(n1): ADDSUB %rax, -8(rp)
+ mov $0, R32(%rax)
+ adc %rdx, %rax
+ pop %rbx
+IFDOS(` pop %rdi ')
+IFDOS(` pop %rsi ')
+ ret
+EPILOGUE()
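Editor's note: mpn_addmul_1 multiplies the n-limb operand {up,n} by the single limb v0, adds the product into {rp,n}, and returns the limb that carries out of the top; mpn_submul_1 is the same with the addition replaced by subtraction (a borrow limb is returned). A C sketch of the addmul case using a 128-bit intermediate (illustrative, not GMP source):

#include <stdint.h>
#include <stddef.h>

static uint64_t addmul_1_sketch (uint64_t *rp, const uint64_t *up,
                                 size_t n, uint64_t v0)
{
  uint64_t cy = 0;                 /* carry limb chained between iterations */
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (uint64_t) t;        /* low half goes back into rp */
      cy = (uint64_t) (t >> 64);   /* high half becomes the next carry limb */
    }
  return cy;
}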
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/copyd.asm b/gmp-6.3.0/mpn/x86_64/bt1/copyd.asm
new file mode 100644
index 0000000..877714e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/copyd.asm
@@ -0,0 +1,91 @@
+dnl AMD64 mpn_copyd optimised for AMD bobcat.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1
+C AMD K10 1-2 (alignment fluctuations)
+C AMD bd1 ?
+C AMD bobcat 1.5
+C Intel P4 2.8
+C Intel core2 1
+C Intel NHM 1-1.25
+C Intel SBR 1
+C Intel atom 2.87
+C VIA nano 2
+
+C INPUT PARAMETERS
+C rp rdi
+C up rsi
+C n rdx
+
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_copyd)
+ FUNC_ENTRY(3)
+ sub $4, n
+ jl L(end)
+ ALIGN(16)
+L(top): mov 24(up,n,8), %r8
+ mov %r8, 24(rp,n,8)
+ mov 16(up,n,8), %r8
+ mov %r8, 16(rp,n,8)
+ mov 8(up,n,8), %r8
+ mov %r8, 8(rp,n,8)
+ mov (up,n,8), %r8
+ mov %r8, (rp,n,8)
+L(ent): sub $4, n
+ jge L(top)
+
+L(end): cmp $-4, R32(n)
+ jz L(ret)
+ mov 24(up,n,8), %r8
+ mov %r8, 24(rp,n,8)
+ cmp $-3, R32(n)
+ jz L(ret)
+ mov 16(up,n,8), %r8
+ mov %r8, 16(rp,n,8)
+ cmp $-2, R32(n)
+ jz L(ret)
+ mov 8(up,n,8), %r8
+ mov %r8, 8(rp,n,8)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/copyi.asm b/gmp-6.3.0/mpn/x86_64/bt1/copyi.asm
new file mode 100644
index 0000000..ee0f578
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/copyi.asm
@@ -0,0 +1,94 @@
+dnl AMD64 mpn_copyi optimised for AMD bobcat.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1
+C AMD K10 1-2 (alignment fluctuations)
+C AMD bd1 ?
+C AMD bobcat 1.5
+C Intel P4 2.8
+C Intel core2 1
+C Intel NHM 1-1.25
+C Intel SBR 1
+C Intel atom 2.87
+C VIA nano 2
+
+C INPUT PARAMETERS
+C rp rdi
+C up rsi
+C n rdx
+
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_copyi)
+ FUNC_ENTRY(3)
+ lea -32(up,n,8), up
+ lea -32(rp,n,8), rp
+ neg n
+ add $4, n
+ jg L(end)
+ ALIGN(16)
+L(top): mov (up,n,8), %r8
+ mov %r8, (rp,n,8)
+ mov 8(up,n,8), %r8
+ mov %r8, 8(rp,n,8)
+ mov 16(up,n,8), %r8
+ mov %r8, 16(rp,n,8)
+ mov 24(up,n,8), %r8
+ mov %r8, 24(rp,n,8)
+L(ent): add $4, n
+ jle L(top)
+
+L(end): cmp $4, R32(n)
+ jz L(ret)
+ mov (up,n,8), %r8
+ mov %r8, (rp,n,8)
+ cmp $3, R32(n)
+ jz L(ret)
+ mov 8(up,n,8), %r8
+ mov %r8, 8(rp,n,8)
+ cmp $2, R32(n)
+ jz L(ret)
+ mov 16(up,n,8), %r8
+ mov %r8, 16(rp,n,8)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
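Editor's note: mpn_copyi walks the limbs in increasing address order while mpn_copyd (previous file) walks them in decreasing order; the direction is what keeps each routine correct for its permitted kind of overlap (destination at or below the source for copyi, at or above it for copyd), and the 4-way unrolling above only affects speed. A trivial C sketch of the incrementing copy (illustrative, not GMP source):

#include <stdint.h>
#include <stddef.h>

static void copyi_sketch (uint64_t *rp, const uint64_t *up, size_t n)
{
  for (size_t i = 0; i < n; i++)   /* increasing order: each limb is read
                                      before it can be overwritten when
                                      rp <= up */
    rp[i] = up[i];
}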
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm
new file mode 100644
index 0000000..ef53392
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm
@@ -0,0 +1,119 @@
+dnl AMD64 mpn_gcd_11 -- 1 x 1 gcd.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 ?
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD bt1 5.4
+C AMD bt2 ?
+C AMD zn1 ?
+C AMD zn2 ?
+C Intel P4 ?
+C Intel CNR ?
+C Intel PNR ?
+C Intel NHM ?
+C Intel WSM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom ?
+C Intel SLM ?
+C Intel GLM ?
+C Intel GLM+ ?
+C VIA nano ?
+
+
+C ctz_table[n] is the number of trailing zeros of n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 8)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+ .byte MAXSHIFT
+forloop(i,1,MASK,
+` .byte m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u0', `%rdi')
+define(`v0', `%rsi')
+
+define(`cnt', `%rcx')
+define(`s0', `%rax')
+define(`t0', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+ FUNC_ENTRY(2)
+ LEA( ctz_table, %r10)
+ mov v0, t0
+ sub u0, t0
+ jz L(end)
+
+ ALIGN(16)
+L(top): mov u0, s0
+ sub v0, u0
+ cmovc t0, u0 C u = |u - v|
+ cmovc s0, v0 C v = min(u,v)
+ and $MASK, R32(t0)
+ movzbl (%r10,t0), R32(cnt)
+ jz L(count_better)
+L(shr): shr R8(cnt), u0
+ mov v0, t0
+ sub u0, t0
+ jnz L(top)
+
+L(end): mov v0, %rax
+ C rdx = 0 for the benefit of internal gcd_22 call
+ FUNC_EXIT()
+ ret
+
+L(count_better):
+ bsf u0, cnt
+ jmp L(shr)
+EPILOGUE()
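Editor's note: the gcd loop above replaces a full trailing-zero count with a 256-entry byte table indexed by the low 8 bits of the difference, falling back to bsf only when that byte is zero (L(count_better)). A C sketch of the same table-driven count, with the table built via GCC/Clang builtins (illustrative, not GMP source; the contents match the forloop-generated ctz_table above):

#include <stdint.h>

#define MAXSHIFT 8
#define MASK ((1 << MAXSHIFT) - 1)

static unsigned char ctz_table[MASK + 1];

static void ctz_table_init (void)
{
  ctz_table[0] = MAXSHIFT;                    /* sentinel for an all-zero byte */
  for (int i = 1; i <= MASK; i++)
    ctz_table[i] = (unsigned char) __builtin_ctz (i);
}

static int ctz_limb_sketch (uint64_t x)       /* x != 0 */
{
  int c = ctz_table[x & MASK];
  return c < MAXSHIFT ? c : __builtin_ctzll (x);   /* slow path: low byte zero */
}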
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm
new file mode 100644
index 0000000..c9f221e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_22.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_22)
+include_mpn(`x86_64/gcd_22.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h
new file mode 100644
index 0000000..977a209
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h
@@ -0,0 +1,230 @@
+/* AMD Bobcat gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should also disable inclusion of
+   the library versions. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 1600 MHz AMD Bobcat/Zacate */
+/* FFT tuning limit = 110,472,704 */
+/* Generated by tuneup.c, 2019-10-12, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 31
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 71
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
+
+#define DIV_1_VS_MUL_1_PERCENT 270
+
+#define MUL_TOOM22_THRESHOLD 24
+#define MUL_TOOM33_THRESHOLD 66
+#define MUL_TOOM44_THRESHOLD 190
+#define MUL_TOOM6H_THRESHOLD 274
+#define MUL_TOOM8H_THRESHOLD 381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 127
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 131
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 101
+#define SQR_TOOM4_THRESHOLD 278
+#define SQR_TOOM6_THRESHOLD 372
+#define SQR_TOOM8_THRESHOLD 478
+
+#define MULMID_TOOM42_THRESHOLD 22
+
+#define MULMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 13
+
+#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 444, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \
+ { 71,10}, { 39, 9}, { 83, 5}, { 1343, 4}, \
+ { 2687, 5}, { 1407, 6}, { 735, 7}, { 415, 8}, \
+ { 223,10}, { 79,11}, { 47,10}, { 103,12}, \
+ { 31,11}, { 63,10}, { 135,11}, { 79,10}, \
+ { 167,11}, { 95,10}, { 191,11}, { 111,12}, \
+ { 63,11}, { 127,10}, { 255,11}, { 143,10}, \
+ { 287, 9}, { 575,11}, { 159,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,12}, { 223,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
+ { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 607,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 831,13}, \
+ { 447,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 959,15}, \
+ { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
+ { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \
+ { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \
+ { 1791,14}, { 3839,16}, { 1023,15}, { 2047,14}, \
+ { 4479,15}, { 2303,14}, { 4991,15}, { 2559,14}, \
+ { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3327,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 183
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 380, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 25, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
+ { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 63, 6}, { 1087, 7}, { 575, 8}, \
+ { 303, 9}, { 159,10}, { 103,12}, { 31,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 135,11}, \
+ { 79,10}, { 159, 9}, { 319,11}, { 95,10}, \
+ { 191, 9}, { 383,11}, { 111,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
+ { 383,11}, { 207,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 335,10}, { 671,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,12}, { 223,11}, { 447,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 607,13}, { 319,12}, { 703,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 895,14}, \
+ { 255,13}, { 511,12}, { 1023,13}, { 703,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 895,15}, \
+ { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
+ { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \
+ { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,15}, { 1535,14}, { 3455,15}, { 1791,14}, \
+ { 3839,16}, { 1023,15}, { 2047,14}, { 4479,15}, \
+ { 2303,14}, { 4863,15}, { 2559,14}, { 5247,15}, \
+ { 2815,14}, { 5887,16}, { 1535,15}, { 3327,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 186
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 42
+#define MULLO_MUL_N_THRESHOLD 10950
+#define SQRLO_BASECASE_THRESHOLD 7
+#define SQRLO_DC_THRESHOLD 100
+#define SQRLO_SQR_THRESHOLD 7293
+
+#define DC_DIV_QR_THRESHOLD 70
+#define DC_DIVAPPR_Q_THRESHOLD 204
+#define DC_BDIV_QR_THRESHOLD 59
+#define DC_BDIV_Q_THRESHOLD 148
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 246
+#define INV_APPR_THRESHOLD 236
+
+#define BINV_NEWTON_THRESHOLD 252
+#define REDC_1_TO_REDC_2_THRESHOLD 67
+#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
+
+#define MU_DIV_QR_THRESHOLD 1589
+#define MU_DIVAPPR_Q_THRESHOLD 1589
+#define MUPI_DIV_QR_THRESHOLD 108
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1470
+
+#define POWM_SEC_TABLE 1,16,194,960,1603,1811,2499
+
+#define GET_STR_DC_THRESHOLD 20
+#define GET_STR_PRECOMPUTE_THRESHOLD 34
+#define SET_STR_DC_THRESHOLD 345
+#define SET_STR_PRECOMPUTE_THRESHOLD 1787
+
+#define FAC_DSC_THRESHOLD 781
+#define FAC_ODD_THRESHOLD 104
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD2_DIV1_METHOD 3 /* 3.20% faster than 5 */
+#define HGCD_THRESHOLD 110
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 474
+#define GCDEXT_DC_THRESHOLD 293
+#define JACOBI_BASE_METHOD 2 /* 9.38% faster than 1 */
+
+/* Tuneup completed successfully, took 358881 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm b/gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm
new file mode 100644
index 0000000..4394d6e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm
@@ -0,0 +1,241 @@
+dnl AMD64 mpn_mul_1 optimised for AMD bt1/bt2.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012, 2019 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.53 old measurement
+C AMD K10 4.53 old measurement
+C AMD bd1 4.56 old measurement
+C AMD bd2 4.47 old measurement
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen ?
+C AMD bt1 5.12
+C AMD bt2 5.17
+C Intel P4 12.6 old measurement
+C Intel PNR 4.53 old measurement
+C Intel NHM 4.36 old measurement
+C Intel SBR 3.0 old measurement
+C Intel IBR 2.55 old measurement
+C Intel HWL 2.28 old measurement
+C Intel BWL 2.36 old measurement
+C Intel SKL 2.39 old measurement
+C Intel atom 21.0 old measurement
+C Intel SLM 9 old measurement
+C Intel GLM ?
+C VIA nano ?
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
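For reference, here is a portable C sketch of what mpn_mul_1 and mpn_mul_1c compute (a semantics sketch only, not this file's code; the names ref_mul_1c and limb_t are illustrative, and it assumes 64-bit limbs with a 128-bit intermediate type):

  /* rp[0..n-1] = low limbs of up[0..n-1] * v0 plus a running carry; returns
     the carry out.  mpn_mul_1 is the cy = 0 case, mpn_mul_1c takes an
     explicit carry-in. */
  typedef unsigned long long limb_t;

  limb_t ref_mul_1c (limb_t *rp, const limb_t *up, long n, limb_t v0, limb_t cy)
  {
    for (long i = 0; i < n; i++)
      {
        unsigned __int128 t = (unsigned __int128) up[i] * v0 + cy;
        rp[i] = (limb_t) t;          /* low limb of the partial product */
        cy = (limb_t) (t >> 64);     /* high limb becomes the next carry */
      }
    return cy;                       /* carry out (the asm returns it in %rax) */
  }
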
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C Standard parameters
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param', `%rdx')
+define(`v0', `%rcx')
+define(`cy', `%r8')
+C Standard allocations
+define(`n', `%rbx')
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+C DOS64 parameters
+IFDOS(` define(`rp', `%rcx') ') dnl
+IFDOS(` define(`up', `%rsi') ') dnl
+IFDOS(` define(`n_param', `%r8') ') dnl
+IFDOS(` define(`v0', `%r9') ') dnl
+IFDOS(` define(`cy', `56(%rsp)')') dnl
+C DOS64 allocations
+IFDOS(` define(`n', `%rbx') ') dnl
+IFDOS(` define(`w0', `%r8') ') dnl
+IFDOS(` define(`w1', `%rdi') ') dnl
+IFDOS(` define(`w2', `%r10') ') dnl
+IFDOS(` define(`w3', `%r11') ') dnl
+
+ ALIGN(64)
+PROLOGUE(mpn_mul_1)
+IFDOS(` push %rsi ')
+IFDOS(` push %rdi ')
+IFDOS(` mov %rdx, %rsi ')
+
+ push %rbx
+ mov (up), %rax
+
+ lea (rp,n_param,8), rp
+ lea (up,n_param,8), up
+ mov n_param, n
+
+ test $1, R8(n_param)
+ jne L(bx1)
+
+L(bx0): mul v0
+ neg n
+ mov %rax, w0
+ mov %rdx, w1
+ test $2, R8(n)
+ jne L(L2)
+
+L(b00): add $2, n
+ jmp L(L0)
+
+ ALIGN(16)
+L(b11): mov %rax, w2
+ mov %rdx, w3
+ neg n
+ inc n
+ jmp L(L3)
+
+ ALIGN(16)
+L(bx1): mul v0
+ test $2, R8(n)
+ jne L(b11)
+
+L(b01): sub $3, n
+ jc L(n1)
+ mov %rax, w2
+ mov %rdx, w3
+ neg n
+
+ ALIGN(16)
+L(top): mov -16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, -24(rp,n,8)
+ add w3, w0
+ adc $0, w1
+L(L0): mov -8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+L(L3): mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, -8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+L(L2): mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, (rp,n,8)
+ add w1, w2
+ adc $0, w3
+ add $4, n
+ js L(top)
+
+L(end): mov w2, -8(rp)
+ mov w3, %rax
+ pop %rbx
+IFDOS(` pop %rdi ')
+IFDOS(` pop %rsi ')
+ ret
+
+ ALIGN(32)
+L(n1): mov %rax, -8(rp)
+ mov %rdx, %rax
+ pop %rbx
+IFDOS(` pop %rdi ')
+IFDOS(` pop %rsi ')
+ ret
+EPILOGUE()
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_mul_1c)
+IFDOS(` push %rsi ')
+IFDOS(` push %rdi ')
+IFDOS(` mov %rdx, %rsi ')
+ mov cy, w2
+ push %rbx
+ mov (up), %rax
+
+ lea (rp,n_param,8), rp
+ lea (up,n_param,8), up
+ mov n_param, n
+
+ test $1, R8(n_param)
+ jne L(cx1)
+
+L(cx0): mul v0
+ neg n
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, w0
+ adc $0, w1
+ test $2, R8(n)
+ jne L(L2)
+
+L(c00): add $2, n
+ jmp L(L0)
+
+ ALIGN(16)
+L(cx1): mul v0
+ test $2, R8(n)
+ je L(c01)
+
+L(c11): neg n
+ inc n
+ add %rax, w2
+ mov %rdx, w3
+ adc $0, w3
+ jmp L(L3)
+
+L(c01): cmp $1, n
+ jz L(m1)
+ neg n
+ add $3, n
+ add %rax, w2
+ mov %rdx, w3
+ adc $0, w3
+ jmp L(top)
+
+ ALIGN(32)
+L(m1): add %rax, w2
+ mov %rdx, %rax
+ mov w2, -8(rp)
+ adc $0, %rax
+ pop %rbx
+IFDOS(` pop %rdi ')
+IFDOS(` pop %rsi ')
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm
new file mode 100644
index 0000000..e7d46bf
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm
@@ -0,0 +1,486 @@
+dnl AMD64 mpn_mul_basecase optimised for AMD bobcat.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.5
+C AMD K10 4.5
+C AMD bd1 4.75
+C AMD bobcat 5
+C Intel P4 17.7
+C Intel core2 5.5
+C Intel NHM 5.43
+C Intel SBR 3.92
+C Intel atom 23
+C VIA nano 5.63
+
+C This mul_basecase is based on mul_1 and addmul_1, since these both run at the
+C multiply insn bandwidth, without any apparent loop branch exit pipeline
+C replays experienced on K8. The structure is unusual: it falls into mul_1 in
+C the same way for all n, then it splits into 4 different wind-down blocks and
+C 4 separate addmul_1 loops.
+C
+C We have not tried using the same addmul_1 loops with a switch into feed-in
+C code, as we do in other basecase implementations. Doing that could save
+C substantial code volume, but would also probably add some overhead.
+
+C TODO
+C * Tune un < 3 code.
+C * Fix slowdown for un=vn=3 (67->71) compared to default code.
+C * This is 1263 bytes, compared to 1099 bytes for default code. Consider
+C combining addmul loops like that code. Tolerable slowdown?
+C * Lots of space could be saved by replacing the "switch" code by gradual
+C   jumps out from mul_1 wind-down code, perhaps with no added overhead.
+C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding.
+
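To make the structure described above concrete, here is a hedged C sketch of schoolbook basecase multiplication (an illustration, not this file's code; ref_mul_basecase and limb_t are illustrative names, 64-bit limbs are assumed, and the real routine avoids the initial zeroing by making the first pass a plain mul_1 pass as described above):

  /* rp[0..un+vn-1] = up[0..un-1] * vp[0..vn-1], one column pass per v limb. */
  typedef unsigned long long limb_t;

  void ref_mul_basecase (limb_t *rp, const limb_t *up, long un,
                         const limb_t *vp, long vn)
  {
    for (long i = 0; i < un + vn; i++)
      rp[i] = 0;
    for (long j = 0; j < vn; j++)            /* addmul_1-style pass for vp[j] */
      {
        limb_t cy = 0;
        for (long i = 0; i < un; i++)
          {
            unsigned __int128 t = (unsigned __int128) up[i] * vp[j] + rp[i + j] + cy;
            rp[i + j] = (limb_t) t;
            cy = (limb_t) (t >> 64);
          }
        rp[un + j] = cy;                     /* carry limb closes the column */
      }
  }
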
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C Standard parameters
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+define(`vp', `%rcx')
+define(`vn', `%r8')
+C Standard allocations
+define(`un', `%rbx')
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`n', `%rbp')
+define(`v0', `%r9')
+
+C Temp macro for allowing control over indexing.
+C Define to return $1 for more conservative ptr handling.
+define(`X',`$2')
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ mov (up), %rax
+ mov (vp), v0
+
+ cmp $2, un_param
+ ja L(ge3)
+ jz L(u2)
+
+ mul v0 C u0 x v0
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(u2): mul v0 C u0 x v0
+ mov %rax, (rp)
+ mov 8(up), %rax
+ mov %rdx, w0
+ mul v0
+ add %rax, w0
+ mov %rdx, w1
+ adc $0, w1
+ cmp $1, R32(vn)
+ jnz L(u2v2)
+ mov w0, 8(rp)
+ mov w1, 16(rp)
+ FUNC_EXIT()
+ ret
+
+L(u2v2):mov 8(vp), v0
+ mov (up), %rax
+ mul v0
+ add %rax, w0
+ mov w0, 8(rp)
+ mov %rdx, %r8 C CAUTION: r8 realloc
+ adc $0, %r8
+ mov 8(up), %rax
+ mul v0
+ add w1, %r8
+ adc $0, %rdx
+ add %r8, %rax
+ adc $0, %rdx
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+
+L(ge3): push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ lea 8(vp), vp
+
+ lea -24(rp,un_param,8), rp
+ lea -24(up,un_param,8), up
+ xor R32(un), R32(un)
+ mov $2, R32(n)
+ sub un_param, un
+ sub un_param, n
+
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(L3)
+
+ ALIGN(16)
+L(top): mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, -8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, (rp,n,8)
+ add w1, w2
+ adc $0, w3
+L(L3): mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, 8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(top)
+
+ mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+
+C Switch on n into the right addmul_1 loop
+ test n, n
+ jz L(r2)
+ cmp $2, R32(n)
+ ja L(r3)
+ jz L(r0)
+ jmp L(r1)
+
+
+L(r3): mov w2, X(-8(rp,n,8),16(rp))
+ mov w3, X((rp,n,8),24(rp))
+ add $2, un
+
+C outer loop(3)
+L(to3): dec vn
+ jz L(ret)
+ mov (vp), v0
+ mov 8(up,un,8), %rax
+ lea 8(vp), vp
+ lea 8(rp), rp
+ mov un, n
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(al3)
+
+ ALIGN(16)
+L(ta3): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+L(al3): mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta3)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+ jmp L(to3)
+
+
+L(r2): mov X(0(up,n,8),(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),-8(rp))
+ add w3, w0
+ adc $0, w1
+ mov X(8(up,n,8),8(up)), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, X((rp,n,8),(rp))
+ add w1, w2
+ adc $0, w3
+ mov X(16(up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(8(rp,n,8),8(rp))
+ add w3, w0
+ adc $0, w1
+ mov w0, X(16(rp,n,8),16(rp))
+ adc $0, w3
+ mov w1, X(24(rp,n,8),24(rp))
+ inc un
+
+C outer loop(2)
+L(to2): dec vn
+ jz L(ret)
+ mov (vp), v0
+ mov 16(up,un,8), %rax
+ lea 8(vp), vp
+ lea 8(rp), rp
+ mov un, n
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ jmp L(al2)
+
+ ALIGN(16)
+L(ta2): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+L(al2): mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta2)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+ jmp L(to2)
+
+
+L(r1): mov X(0(up,n,8),8(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),(rp))
+ add w3, w0
+ adc $0, w1
+ mov X(8(up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, X((rp,n,8),8(rp))
+ add w1, w2
+ adc $0, w3
+ mov w2, X(8(rp,n,8),16(rp))
+ mov w3, X(16(rp,n,8),24(rp))
+ add $4, un
+
+C outer loop(1)
+L(to1): dec vn
+ jz L(ret)
+ mov (vp), v0
+ mov -8(up,un,8), %rax
+ lea 8(vp), vp
+ lea 8(rp), rp
+ mov un, n
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(al1)
+
+ ALIGN(16)
+L(ta1): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+L(al1): mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta1)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+ jmp L(to1)
+
+
+L(r0): mov X((up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),8(rp))
+ add w3, w0
+ adc $0, w1
+ mov w0, X((rp,n,8),16(rp))
+ mov w1, X(8(rp,n,8),24(rp))
+ add $3, un
+
+C outer loop(0)
+L(to0): dec vn
+ jz L(ret)
+ mov (vp), v0
+ mov (up,un,8), %rax
+ lea 8(vp), vp
+ lea 8(rp), rp
+ mov un, n
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ jmp L(al0)
+
+ ALIGN(16)
+L(ta0): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+L(al0): mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta0)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+ jmp L(to0)
+
+
+L(ret): pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm b/gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm
new file mode 100644
index 0000000..d55b1e5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm
@@ -0,0 +1,507 @@
+dnl X86-64 mpn_redc_1 optimised for AMD bobcat.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat 5.0
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single out the small-n basecases before the register pushes.
+
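For orientation, here is a portable sketch of the word-serial Montgomery reduction (REDC) idea this routine implements (an illustration with an assumed interface and 64-bit limbs, not this file's code; ref_redc_1 and limb_t are illustrative names).  Like the wind-down here appears to do, it defers each pass's carry limb into the freed low slot and folds all of them in with one final addition, leaving the conditional subtraction of the modulus to the caller:

  /* up[] holds 2*n limbs; u0inv is -1/mp[0] mod 2^64. */
  typedef unsigned long long limb_t;

  limb_t ref_redc_1 (limb_t *rp, limb_t *up, const limb_t *mp, long n, limb_t u0inv)
  {
    for (long i = 0; i < n; i++)
      {
        limb_t q = up[i] * u0inv;            /* chosen so the low limb cancels */
        limb_t cy = 0;
        for (long j = 0; j < n; j++)
          {
            unsigned __int128 t = (unsigned __int128) q * mp[j] + up[i + j] + cy;
            up[i + j] = (limb_t) t;
            cy = (limb_t) (t >> 64);
          }
        up[i] = cy;                          /* park this pass's carry in the freed slot */
      }
    limb_t cy = 0;                           /* fold the parked carries into the high half */
    for (long i = 0; i < n; i++)
      {
        unsigned __int128 t = (unsigned __int128) up[n + i] + up[i] + cy;
        rp[i] = (limb_t) t;
        cy = (limb_t) (t >> 64);
      }
    return cy;                               /* caller handles the final conditional subtract */
  }
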
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+define(`w0', `%rbp')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea (mp_param,n,8), mp
+ lea (up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): test $2, R8(n)
+ jz L(b3)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+
+L(otp1):lea 1(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, w1
+ add (up,n,8), w2
+ adc w3, %rbx
+ adc $0, w1
+ mov 16(mp,n,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ adc w1, w2
+ adc $0, w3
+ imul u0inv, %rbx C next q limb
+ jmp L(e1)
+
+ ALIGNx
+L(tp1): add w0, -16(up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov (mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (up,i,8)
+ adc w1, w2
+ adc $0, w3
+L(e1): mov 16(mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, i
+ js L(tp1)
+
+L(ed1): add w0, I(-16(up),-16(up,i,8))
+ adc w1, w2
+ adc $0, w3
+ add w2, I(-8(up),-8(up,i,8))
+ adc $0, w3
+ mov w3, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b3): cmp $-3, R32(n)
+ jz L(n3)
+
+L(otp3):lea 3(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, w1
+ add (up,n,8), w2
+ adc w3, %rbx
+ adc $0, w1
+ mov 16(mp,n,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ adc w1, w2
+ adc $0, w3
+ imul u0inv, %rbx C next q limb
+ jmp L(e3)
+
+ ALIGNx
+L(tp3): add w0, -16(up,i,8)
+ adc w1, w2
+ adc $0, w3
+L(e3): mov (mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, i
+ js L(tp3)
+
+L(ed3): add w0, I(-16(up),-16(up,i,8))
+ adc w1, w2
+ adc $0, w3
+ add w2, I(-8(up),-8(up,i,8))
+ adc $0, w3
+ mov w3, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp3)
+C jmp L(cj)
+
+L(cj):
+IFSTD(` lea (up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea (up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+IFSTD(` sub $8, %rsp ')
+IFDOS(` sub $40, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_add_n)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(bx0): test $2, R8(n)
+ jnz L(b2)
+
+L(b0):
+L(otp0):lea (n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, w3
+ add (up,n,8), w0
+ adc w1, %rbx
+ adc $0, w3
+ mov 16(mp,n,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ adc w3, w0
+ adc $0, w1
+ imul u0inv, %rbx C next q limb
+ jmp L(e0)
+
+ ALIGNx
+L(tp0): add w0, -16(up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov (mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+L(e0): mov 24(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, i
+ js L(tp0)
+
+L(ed0): add w0, I(-16(up),-16(up,i,8))
+ adc w1, w2
+ adc $0, w3
+ add w2, I(-8(up),-8(up,i,8))
+ adc $0, w3
+ mov w3, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp0)
+ jmp L(cj)
+
+L(b2): cmp $-2, R32(n)
+ jz L(n2)
+
+L(otp2):lea 2(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, w3
+ add (up,n,8), w0
+ adc w1, %rbx
+ adc $0, w3
+ mov 16(mp,n,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ adc w3, w0
+ adc $0, w1
+ imul u0inv, %rbx C next q limb
+ jmp L(e2)
+
+ ALIGNx
+L(tp2): add w0, -16(up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov (mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+L(e2): mov 8(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, i
+ js L(tp2)
+
+L(ed2): add w0, I(-16(up),-16(up,i,8))
+ adc w1, w2
+ adc $0, w3
+ add w2, I(-8(up),-8(up,i,8))
+ adc $0, w3
+ mov w3, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp2)
+ jmp L(cj)
+
+L(n1): mov (mp_param), %rax
+ mul q0
+ add -8(up), %rax
+ adc (up), %rdx
+ mov %rdx, (rp)
+ mov $0, R32(%rax)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+L(n2): mov (mp_param), %rax
+ mov -16(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov -8(up), %r10
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, q0
+ imul u0inv, q0 C next q0
+ mov -16(mp), %rax
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov (up), %r14
+ mul q0
+ add %rax, %r14
+ adc $0, %rdx
+ add %r9, %r14
+ adc $0, %rdx
+ xor R32(%rax), R32(%rax)
+ add %r11, %r14
+ adc 8(up), %rdx
+ mov %r14, (rp)
+ mov %rdx, 8(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+ ALIGNx
+L(n3): mov -24(mp), %rax
+ mov -24(up), %r10
+ mul q0
+ add %rax, %r10
+ mov -16(mp), %rax
+ mov %rdx, %r11
+ adc $0, %r11
+ mov -16(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ add %r11, %rbp
+ mov -8(up), %r10
+ adc $0, %r9
+ mul q0
+ mov %rbp, q0
+ imul u0inv, q0 C next q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ mov %rbp, -16(up)
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, -8(up)
+ mov %r11, -24(up) C up[0]
+ lea 8(up), up C up++
+ dec j
+ jnz L(n3)
+
+ mov -48(up), %rdx
+ mov -40(up), %rbx
+ xor R32(%rax), R32(%rax)
+ add %rbp, %rdx
+ adc %r10, %rbx
+ adc -8(up), %r11
+ mov %rdx, (rp)
+ mov %rbx, 8(rp)
+ mov %r11, 16(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm
new file mode 100644
index 0000000..0e417a1
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm
@@ -0,0 +1,565 @@
+dnl AMD64 mpn_sqr_basecase optimised for AMD bobcat.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.5
+C AMD K10 4.5
+C AMD bd1 4.75
+C AMD bobcat 5
+C Intel P4 17.7
+C Intel core2 5.5
+C Intel NHM 5.43
+C Intel SBR 3.92
+C Intel atom 23
+C VIA nano 5.63
+
+C This sqr_basecase is based on mul_1 and addmul_1, since these both run at the
+C multiply insn bandwidth, without any apparent loop branch exit pipeline
+C replays experienced on K8. The structure is unusual: it falls into mul_1 in
+C the same way for all n, then it splits into 4 different wind-down blocks and
+C 4 separate addmul_1 loops.
+C
+C We have not tried using the same addmul_1 loops with a switch into feed-in
+C code, as we do in other basecase implementations. Doing that could save
+C substantial code volume, but would also probably add some overhead.
+
+C TODO
+C * Tune un < 4 code.
+C * Perhaps implement a larger final corner (it is now 2 x 1).
+C * Lots of space could be saved by replacing the "switch" code by gradual
+C   jumps out from mul_1 wind-down code, perhaps with no added overhead.
+C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding.
+
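To illustrate the overall split (an illustration, not this file's code; ref_sqr_basecase and limb_t are illustrative names, 64-bit limbs assumed): the cross products up[i]*up[j] for i<j are accumulated first, and a final stage adds the doubled cross part to the diagonal squares, which is the job of the sqr_diag_addlsh1 wind-down at the end of this file:

  typedef unsigned long long limb_t;

  void ref_sqr_basecase (limb_t *rp, const limb_t *up, long n)
  {
    for (long i = 0; i < 2 * n; i++)
      rp[i] = 0;
    for (long i = 0; i < n - 1; i++)         /* cross products, upper triangle */
      {
        limb_t cy = 0;
        for (long j = i + 1; j < n; j++)
          {
            unsigned __int128 t = (unsigned __int128) up[i] * up[j] + rp[i + j] + cy;
            rp[i + j] = (limb_t) t;
            cy = (limb_t) (t >> 64);
          }
        rp[i + n] = cy;
      }
    limb_t cy = 0;                           /* rp = 2*cross + diagonal squares */
    for (long i = 0; i < n; i++)
      {
        unsigned __int128 sq = (unsigned __int128) up[i] * up[i];
        unsigned __int128 lo = ((unsigned __int128) rp[2*i] << 1) + (limb_t) sq + cy;
        unsigned __int128 hi = ((unsigned __int128) rp[2*i + 1] << 1)
                               + (limb_t) (sq >> 64) + (limb_t) (lo >> 64);
        rp[2*i]     = (limb_t) lo;
        rp[2*i + 1] = (limb_t) hi;
        cy = (limb_t) (hi >> 64);
      }
  }
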
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C Standard parameters
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+C Standard allocations
+define(`un', `%rbx')
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+define(`n', `%rbp')
+define(`v0', `%rcx')
+
+C Temp macro for allowing control over indexing.
+C Define to return $1 for more conservative ptr handling.
+define(`X',`$2')
+dnl define(`X',`$1')
+
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+
+ mov (up), %rax
+
+ cmp $2, R32(un_param)
+ jae L(ge2)
+
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(ge2): mov (up), v0
+ jnz L(g2)
+
+ mul %rax
+ mov %rax, (rp)
+ mov 8(up), %rax
+ mov %rdx, w0
+ mul v0
+ add %rax, w0
+ mov %rdx, w1
+ adc $0, w1
+ mov 8(up), v0
+ mov (up), %rax
+ mul v0
+ add %rax, w0
+ mov w0, 8(rp)
+ mov %rdx, w0 C CAUTION: r8 realloc
+ adc $0, w0
+ mov 8(up), %rax
+ mul v0
+ add w1, w0
+ adc $0, %rdx
+ add w0, %rax
+ adc $0, %rdx
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(g2): cmp $3, R32(un_param)
+ ja L(g3)
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ mov 8(up), %rax
+ mul %rax
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ mov 16(up), %rax
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+
+ mov (up), v0
+ mov 8(up), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov 16(up), %rax
+ mul v0
+ xor R32(w2), R32(w2)
+ add %rax, w1
+ adc %rdx, w2
+
+ mov 8(up), v0
+ mov 16(up), %rax
+ mul v0
+ xor R32(w3), R32(w3)
+ add %rax, w2
+ adc %rdx, w3
+ add w0, w0
+ adc w1, w1
+ adc w2, w2
+ adc w3, w3
+ mov $0, R32(v0)
+ adc v0, v0
+ add w0, 8(rp)
+ adc w1, 16(rp)
+ adc w2, 24(rp)
+ adc w3, 32(rp)
+ adc v0, 40(rp)
+ FUNC_EXIT()
+ ret
+
+L(g3): push %rbx
+ push %rbp
+
+ mov 8(up), %rax
+ lea -24(rp,un_param,8), rp
+ lea -24(up,un_param,8), up
+ neg un_param
+ push un_param C for sqr_diag_addlsh1
+ lea (un_param), un
+ lea 3(un_param), n
+
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(L3)
+
+ ALIGN(16)
+L(top): mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, -8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, (rp,n,8)
+ add w1, w2
+ adc $0, w3
+L(L3): mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, 8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(top)
+
+ mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+
+ test n, n
+ jz L(r2)
+ cmp $2, R32(n)
+ ja L(r3)
+ jz L(r0)
+
+
+L(r1): mov X((up,n,8),8(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),(rp))
+ add w3, w0
+ adc $0, w1
+ mov X(8(up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, X((rp,n,8),8(rp))
+ add w1, w2
+ adc $0, w3
+ mov w2, X(8(rp,n,8),16(rp))
+ mov w3, X(16(rp,n,8),24(rp))
+ add $5, un
+ jmp L(to0)
+
+L(r2): mov X((up,n,8),(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),-8(rp))
+ add w3, w0
+ adc $0, w1
+ mov X(8(up,n,8),8(up)), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, X((rp,n,8),(rp))
+ add w1, w2
+ adc $0, w3
+ mov X(16(up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(8(rp,n,8),8(rp))
+ add w3, w0
+ adc $0, w1
+ mov w0, X(16(rp,n,8),16(rp))
+ adc $0, w3
+ mov w1, X(24(rp,n,8),24(rp))
+ add $6, un
+ jmp L(to1)
+
+L(r3): mov w2, X(-8(rp,n,8),16(rp))
+ mov w3, X((rp,n,8),24(rp))
+ add $3, un
+ jmp L(to2)
+
+L(r0): mov X((up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),8(rp))
+ add w3, w0
+ adc $0, w1
+ mov w0, X((rp,n,8),16(rp))
+ mov w1, X(8(rp,n,8),24(rp))
+ add $4, un
+C jmp L(to3)
+C fall through into main loop
+
+
+L(outer):
+ mov un, n
+ mov (up,un,8), v0
+ mov 8(up,un,8), %rax
+ lea 8(rp), rp
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(al3)
+
+ ALIGN(16)
+L(ta3): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+L(al3): mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta3)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+
+
+L(to2): mov un, n
+ cmp $-4, R32(un)
+ jnc L(end)
+ add $4, un
+ mov 8(up,n,8), v0
+ mov 16(up,n,8), %rax
+ lea 8(rp), rp
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ jmp L(al2)
+
+ ALIGN(16)
+L(ta2): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+L(al2): mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta2)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+
+
+L(to1): mov un, n
+ mov -16(up,un,8), v0
+ mov -8(up,un,8), %rax
+ lea 8(rp), rp
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(al1)
+
+ ALIGN(16)
+L(ta1): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+L(al1): mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta1)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+
+
+L(to0): mov un, n
+ mov -8(up,un,8), v0
+ mov (up,un,8), %rax
+ lea 8(rp), rp
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ jmp L(al0)
+
+ ALIGN(16)
+L(ta0): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+L(al0): mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta0)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+ jmp L(outer)
+
+
+L(end): mov X(8(up,un,8),(up)), v0
+ mov X(16(up,un,8),8(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov X(24(up,un,8),16(up)), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, X(24(rp,un,8),16(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(32(rp,un,8),24(rp))
+ adc $0, w3
+ mov X(16(up,un,8),8(up)), v0
+ mov X(24(up,un,8),16(up)), %rax
+ mul v0
+ add %rax, w3
+ mov w3, X(40(rp,un,8),32(rp))
+ adc $0, %rdx
+ mov %rdx, X(48(rp,un,8),40(rp))
+
+
+C sqr_diag_addlsh1
+
+ lea 16(up), up
+ lea 40(rp), rp
+ pop n
+ lea 2(n,n), n
+
+ mov (up,n,4), %rax
+ mul %rax
+ xor R32(w2), R32(w2)
+
+ mov 8(rp,n,8), w0
+ mov %rax, (rp,n,8)
+ jmp L(lm)
+
+ ALIGN(8)
+L(tsd): add %rbx, w0
+ adc %rax, w1
+ mov w0, -8(rp,n,8)
+ mov 8(rp,n,8), w0
+ mov w1, (rp,n,8)
+L(lm): mov 16(rp,n,8), w1
+ adc w0, w0
+ adc w1, w1
+ lea (%rdx,w2), %rbx
+ mov 8(up,n,4), %rax
+ setc R8(w2)
+ mul %rax
+ add $2, n
+ js L(tsd)
+
+L(esd): add %rbx, w0
+ adc %rax, w1
+ mov w0, X(-8(rp,n,8),-8(rp))
+ mov w1, X((rp,n,8),(rp))
+ adc w2, %rdx
+ mov %rdx, X(8(rp,n,8),8(rp))
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bt2/com.asm b/gmp-6.3.0/mpn/x86_64/bt2/com.asm
new file mode 100644
index 0000000..87085ea
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt2/com.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_com.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bt2/copyd.asm b/gmp-6.3.0/mpn/x86_64/bt2/copyd.asm
new file mode 100644
index 0000000..83c0618
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt2/copyd.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyd.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bt2/copyi.asm b/gmp-6.3.0/mpn/x86_64/bt2/copyi.asm
new file mode 100644
index 0000000..148d0e5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt2/copyi.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyi.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm
new file mode 100644
index 0000000..0ffb6ca
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_11.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/bd2/gcd_11.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm
new file mode 100644
index 0000000..d693628
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_22.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_22)
+include_mpn(`x86_64/bd2/gcd_22.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h
new file mode 100644
index 0000000..3e26726
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h
@@ -0,0 +1,240 @@
+/* AMD Jaguar gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should disable lib inclusion. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 2050 MHz AMD Jaguar/Kabini */
+/* FFT tuning limit = 225,381,546 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 65
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 4
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 15
+
+#define DIV_1_VS_MUL_1_PERCENT 267
+
+#define MUL_TOOM22_THRESHOLD 25
+#define MUL_TOOM33_THRESHOLD 32
+#define MUL_TOOM44_THRESHOLD 93
+#define MUL_TOOM6H_THRESHOLD 366
+#define MUL_TOOM8H_THRESHOLD 537
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 63
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 172
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 63
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 67
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 91
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 20
+#define SQR_TOOM3_THRESHOLD 97
+#define SQR_TOOM4_THRESHOLD 220
+#define SQR_TOOM6_THRESHOLD 318
+#define SQR_TOOM8_THRESHOLD 434
+
+#define MULMID_TOOM42_THRESHOLD 20
+
+#define MULMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 13
+
+#define MUL_FFT_MODF_THRESHOLD 348 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 348, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 79,10}, { 55,11}, \
+ { 31,10}, { 63, 6}, { 1087, 8}, { 303, 9}, \
+ { 159,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \
+ { 319,10}, { 167,11}, { 95,10}, { 191, 9}, \
+ { 383,10}, { 207, 9}, { 415,11}, { 111,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \
+ { 303,11}, { 159,10}, { 319,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,10}, { 415,11}, \
+ { 223,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 351,12}, { 191,11}, { 415,12}, \
+ { 223,11}, { 479,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
+ { 639,12}, { 351,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 415,11}, { 831,12}, { 479,14}, \
+ { 127,13}, { 255,12}, { 543,11}, { 1087,12}, \
+ { 607,13}, { 319,12}, { 703,13}, { 383,12}, \
+ { 831,13}, { 447,12}, { 895,14}, { 255,13}, \
+ { 511,12}, { 1023,13}, { 575,12}, { 1151,13}, \
+ { 639,12}, { 1279,13}, { 703,14}, { 383,13}, \
+ { 831,12}, { 1663,13}, { 895,15}, { 255,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \
+ { 639,13}, { 1343,12}, { 2687,14}, { 767,13}, \
+ { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,14}, { 1279,13}, \
+ { 2687,15}, { 767,14}, { 1663,13}, { 3327,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \
+ { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \
+ { 2047,14}, { 4223,13}, { 8447,14}, { 4479,15}, \
+ { 2303,14}, { 4863,15}, { 2559,14}, { 5247,15}, \
+ { 2815,14}, { 5887,16}, { 1535,15}, { 3071,14}, \
+ { 6271,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,17}, { 1023,16}, { 2047,15}, { 4095,14}, \
+ { 8447,15}, { 4351,14}, { 8959,15}, { 4863,16}, \
+ { 2559,15}, { 5887,14}, { 11775,16}, { 3071,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 201
+#define MUL_FFT_THRESHOLD 3200
+
+#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 12, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95, 6}, \
+ { 1663, 7}, { 895, 9}, { 239, 8}, { 479,10}, \
+ { 127, 9}, { 255, 8}, { 511,10}, { 135, 9}, \
+ { 271,11}, { 79, 9}, { 319,11}, { 95,10}, \
+ { 191, 9}, { 383,10}, { 207,11}, { 111,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \
+ { 303, 9}, { 607,10}, { 319, 9}, { 639,12}, \
+ { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
+ { 415,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,10}, { 607,11}, { 319,10}, \
+ { 639,11}, { 351,10}, { 703,11}, { 367,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
+ { 223,11}, { 479,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
+ { 639,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 607,13}, \
+ { 319,12}, { 735,13}, { 383,12}, { 831,13}, \
+ { 447,12}, { 895,14}, { 255,13}, { 511,12}, \
+ { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 895,15}, \
+ { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
+ { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \
+ { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,14}, { 1279,13}, \
+ { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \
+ { 3199,14}, { 1663,13}, { 3455,16}, { 511,15}, \
+ { 1023,14}, { 2175,13}, { 4479,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \
+ { 4479,15}, { 2303,14}, { 4991,15}, { 2815,14}, \
+ { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \
+ { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \
+ { 1023,16}, { 2047,15}, { 4095,14}, { 8191,15}, \
+ { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 208
+#define SQR_FFT_THRESHOLD 2880
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 63
+#define MULLO_MUL_N_THRESHOLD 6253
+#define SQRLO_BASECASE_THRESHOLD 8
+#define SQRLO_DC_THRESHOLD 54
+#define SQRLO_SQR_THRESHOLD 5558
+
+#define DC_DIV_QR_THRESHOLD 72
+#define DC_DIVAPPR_Q_THRESHOLD 195
+#define DC_BDIV_QR_THRESHOLD 50
+#define DC_BDIV_Q_THRESHOLD 90
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 195
+#define INV_APPR_THRESHOLD 197
+
+#define BINV_NEWTON_THRESHOLD 230
+#define REDC_1_TO_REDC_2_THRESHOLD 67
+#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
+
+#define MU_DIV_QR_THRESHOLD 1334
+#define MU_DIVAPPR_Q_THRESHOLD 1334
+#define MUPI_DIV_QR_THRESHOLD 104
+#define MU_BDIV_QR_THRESHOLD 1017
+#define MU_BDIV_Q_THRESHOLD 1187
+
+#define POWM_SEC_TABLE 1,16,194,712,779,2387
+
+#define GET_STR_DC_THRESHOLD 15
+#define GET_STR_PRECOMPUTE_THRESHOLD 29
+#define SET_STR_DC_THRESHOLD 216
+#define SET_STR_PRECOMPUTE_THRESHOLD 994
+
+#define FAC_DSC_THRESHOLD 153
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD2_DIV1_METHOD 1 /* 9.38% faster than 3 */
+#define HGCD_THRESHOLD 77
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 440
+#define GCDEXT_DC_THRESHOLD 273
+#define JACOBI_BASE_METHOD 1 /* 7.74% faster than 4 */
+
+/* Tuneup completed successfully, took 495910 seconds */
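As a rough illustration of how thresholds like the ones above are meant to be read (this is not GMP's dispatch code; choose_mul_algo is a hypothetical name and the cutoffs are the limb counts measured above for this CPU):

  /* Each threshold marks the operand size, in limbs, at which the next
     algorithm starts to win on this CPU. */
  enum mul_algo { MUL_BASECASE, MUL_TOOM, MUL_FFT };

  static enum mul_algo choose_mul_algo (long n)
  {
    if (n < 25)          /* MUL_TOOM22_THRESHOLD above */
      return MUL_BASECASE;
    if (n < 3200)        /* MUL_FFT_THRESHOLD above */
      return MUL_TOOM;   /* further thresholds pick toom33, toom44, ... */
    return MUL_FFT;
  }
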
diff --git a/gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm b/gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm
new file mode 100644
index 0000000..13a2ab3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm
@@ -0,0 +1,183 @@
+dnl AMD64 mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2
+C AMD K10 2
+C AMD bd1 2.32
+C AMD bobcat 3
+C Intel P4 13
+C Intel core2 2.9
+C Intel NHM 2.8
+C Intel SBR 2.4
+C Intel atom 5.33
+C VIA nano 3
+
+C NOTES
+C * It might seem natural to use the cmov insn here, but since this function
+C is supposed to have the exact same execution pattern for cnd true and
+C false, and since cmov's documentation is not clear about whether it
+C actually reads both source operands and writes the register for a false
+C condition, we cannot use it.
+C * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory
+C to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could use
+C ADCSBB-to-memory, again saving 1 insn/limb.
+C * This runs optimally at decoder bandwidth on K10. It has not been tuned
+C for any other processor.
+
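The masking trick these notes describe can be sketched in portable C as follows (an illustration, not this file's code; ref_cnd_add_n and limb_t are illustrative names, 64-bit limbs assumed).  A strictly constant-time C version would also need to build the mask without the != comparison, which the neg/sbb sequence below does branch-free:

  typedef unsigned long long limb_t;

  limb_t ref_cnd_add_n (limb_t cnd, limb_t *rp, const limb_t *up,
                        const limb_t *vp, long n)
  {
    limb_t mask = -(limb_t) (cnd != 0);      /* all ones if cnd, else zero */
    limb_t cy = 0;
    for (long i = 0; i < n; i++)
      {
        unsigned __int128 t = (unsigned __int128) up[i] + (vp[i] & mask) + cy;
        rp[i] = (limb_t) t;                  /* same loads and stores either way */
        cy = (limb_t) (t >> 64);
      }
    return cy;
  }
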
+C INPUT PARAMETERS
+define(`cnd', `%rdi') dnl rcx
+define(`rp', `%rsi') dnl rdx
+define(`up', `%rdx') dnl r8
+define(`vp', `%rcx') dnl r9
+define(`n', `%r8') dnl rsp+40
+
+ifdef(`OPERATION_cnd_add_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), R32(%r8)')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ neg cnd
+ sbb cnd, cnd C make cnd mask
+
+ lea (vp,n,8), vp
+ lea (up,n,8), up
+ lea (rp,n,8), rp
+
+ mov R32(n), R32(%rax)
+ neg n
+ and $3, R32(%rax)
+ jz L(top) C carry-save reg rax = 0 in this arc
+ cmp $2, R32(%rax)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): mov (vp,n,8), %r12
+ mov 8(vp,n,8), %r13
+ mov 16(vp,n,8), %r14
+ and cnd, %r12
+ mov (up,n,8), %r10
+ and cnd, %r13
+ mov 8(up,n,8), %rbx
+ and cnd, %r14
+ mov 16(up,n,8), %rbp
+ ADDSUB %r12, %r10
+ mov %r10, (rp,n,8)
+ ADCSBB %r13, %rbx
+ mov %rbx, 8(rp,n,8)
+ ADCSBB %r14, %rbp
+ mov %rbp, 16(rp,n,8)
+ sbb R32(%rax), R32(%rax) C save carry
+ add $3, n
+ js L(top)
+ jmp L(end)
+
+L(b2): mov (vp,n,8), %r12
+ mov 8(vp,n,8), %r13
+ mov (up,n,8), %r10
+ and cnd, %r12
+ mov 8(up,n,8), %rbx
+ and cnd, %r13
+ ADDSUB %r12, %r10
+ mov %r10, (rp,n,8)
+ ADCSBB %r13, %rbx
+ mov %rbx, 8(rp,n,8)
+ sbb R32(%rax), R32(%rax) C save carry
+ add $2, n
+ js L(top)
+ jmp L(end)
+
+L(b1): mov (vp,n,8), %r12
+ mov (up,n,8), %r10
+ and cnd, %r12
+ ADDSUB %r12, %r10
+ mov %r10, (rp,n,8)
+ sbb R32(%rax), R32(%rax) C save carry
+ add $1, n
+ jns L(end)
+
+ ALIGN(16)
+L(top): mov (vp,n,8), %r12
+ mov 8(vp,n,8), %r13
+ mov 16(vp,n,8), %r14
+ mov 24(vp,n,8), %r11
+ and cnd, %r12
+ mov (up,n,8), %r10
+ and cnd, %r13
+ mov 8(up,n,8), %rbx
+ and cnd, %r14
+ mov 16(up,n,8), %rbp
+ and cnd, %r11
+ mov 24(up,n,8), %r9
+ add R32(%rax), R32(%rax) C restore carry
+ ADCSBB %r12, %r10
+ mov %r10, (rp,n,8)
+ ADCSBB %r13, %rbx
+ mov %rbx, 8(rp,n,8)
+ ADCSBB %r14, %rbp
+ mov %rbp, 16(rp,n,8)
+ ADCSBB %r11, %r9
+ mov %r9, 24(rp,n,8)
+ sbb R32(%rax), R32(%rax) C save carry
+ add $4, n
+ js L(top)
+
+L(end): neg R32(%rax)
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/com.asm b/gmp-6.3.0/mpn/x86_64/com.asm
new file mode 100644
index 0000000..006acaf
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/com.asm
@@ -0,0 +1,95 @@
+dnl AMD64 mpn_com.
+
+dnl Copyright 2004-2006, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 1.25
+C AMD K10 1.25
+C Intel P4 2.78
+C Intel core2 1.1
+C Intel corei 1.5
+C Intel atom ?
+C VIA nano 2
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_com)
+ FUNC_ENTRY(3)
+ movq (up), %r8
+ movl R32(%rdx), R32(%rax)
+ leaq (up,n,8), up
+ leaq (rp,n,8), rp
+ negq n
+ andl $3, R32(%rax)
+ je L(b00)
+ cmpl $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): notq %r8
+ movq %r8, (rp,n,8)
+ decq n
+ jmp L(e11)
+L(b10): addq $-2, n
+ jmp L(e10)
+ .byte 0x90,0x90,0x90,0x90,0x90,0x90
+L(b01): notq %r8
+ movq %r8, (rp,n,8)
+ incq n
+ jz L(ret)
+
+L(oop): movq (up,n,8), %r8
+L(b00): movq 8(up,n,8), %r9
+ notq %r8
+ notq %r9
+ movq %r8, (rp,n,8)
+ movq %r9, 8(rp,n,8)
+L(e11): movq 16(up,n,8), %r8
+L(e10): movq 24(up,n,8), %r9
+ notq %r8
+ notq %r9
+ movq %r8, 16(rp,n,8)
+ movq %r9, 24(rp,n,8)
+ addq $4, n
+ jnc L(oop)
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
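
Stripped of the four-way unrolling and the n mod 4 entry points, mpn_com is a one's-complement copy. An illustrative C equivalent, assuming 64-bit limbs:

#include <stdint.h>
#include <stddef.h>

void
com_ref (uint64_t *rp, const uint64_t *up, size_t n)
{
  for (size_t i = 0; i < n; i++)
    rp[i] = ~up[i];                /* complement each limb */
}
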
diff --git a/gmp-6.3.0/mpn/x86_64/copyd.asm b/gmp-6.3.0/mpn/x86_64/copyd.asm
new file mode 100644
index 0000000..a5e6e59
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/copyd.asm
@@ -0,0 +1,93 @@
+dnl AMD64 mpn_copyd -- copy limb vector, decrementing.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1
+C AMD K10 1
+C AMD bd1 1.36
+C AMD bobcat 1.71
+C Intel P4 2-3
+C Intel core2 1
+C Intel NHM 1
+C Intel SBR 1
+C Intel atom 2
+C VIA nano 2
+
+
+IFSTD(`define(`rp',`%rdi')')
+IFSTD(`define(`up',`%rsi')')
+IFSTD(`define(`n', `%rdx')')
+
+IFDOS(`define(`rp',`%rcx')')
+IFDOS(`define(`up',`%rdx')')
+IFDOS(`define(`n', `%r8')')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_copyd)
+ lea -8(up,n,8), up
+ lea (rp,n,8), rp
+ sub $4, n
+ jc L(end)
+ nop
+
+L(top): mov (up), %rax
+ mov -8(up), %r9
+ lea -32(rp), rp
+ mov -16(up), %r10
+ mov -24(up), %r11
+ lea -32(up), up
+ mov %rax, 24(rp)
+ mov %r9, 16(rp)
+ sub $4, n
+ mov %r10, 8(rp)
+ mov %r11, (rp)
+ jnc L(top)
+
+L(end): shr R32(n)
+ jnc 1f
+ mov (up), %rax
+ mov %rax, -8(rp)
+ lea -8(rp), rp
+ lea -8(up), up
+1: shr R32(n)
+ jnc 1f
+ mov (up), %rax
+ mov -8(up), %r9
+ mov %rax, -8(rp)
+ mov %r9, -16(rp)
+1: ret
+EPILOGUE()
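
The "d" in copyd stands for decrementing: limbs are moved from the top of the vector downwards, which keeps the copy correct when the destination overlaps the source at higher addresses. A C equivalent, assuming 64-bit limbs:

#include <stdint.h>
#include <stddef.h>

void
copyd_ref (uint64_t *rp, const uint64_t *up, size_t n)
{
  for (size_t i = n; i-- > 0; )    /* highest limb first */
    rp[i] = up[i];
}
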
diff --git a/gmp-6.3.0/mpn/x86_64/copyi.asm b/gmp-6.3.0/mpn/x86_64/copyi.asm
new file mode 100644
index 0000000..bafce7a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/copyi.asm
@@ -0,0 +1,92 @@
+dnl AMD64 mpn_copyi -- copy limb vector, incrementing.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1
+C AMD K10 1
+C AMD bd1 1.36
+C AMD bobcat 1.71
+C Intel P4 2-3
+C Intel core2 1
+C Intel NHM 1
+C Intel SBR 1
+C Intel atom 2
+C VIA nano 2
+
+
+IFSTD(`define(`rp',`%rdi')')
+IFSTD(`define(`up',`%rsi')')
+IFSTD(`define(`n', `%rdx')')
+
+IFDOS(`define(`rp',`%rcx')')
+IFDOS(`define(`up',`%rdx')')
+IFDOS(`define(`n', `%r8')')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+ .byte 0,0,0,0,0,0
+PROLOGUE(mpn_copyi)
+ lea -8(rp), rp
+ sub $4, n
+ jc L(end)
+
+L(top): mov (up), %rax
+ mov 8(up), %r9
+ lea 32(rp), rp
+ mov 16(up), %r10
+ mov 24(up), %r11
+ lea 32(up), up
+ mov %rax, -24(rp)
+ mov %r9, -16(rp)
+ sub $4, n
+ mov %r10, -8(rp)
+ mov %r11, (rp)
+ jnc L(top)
+
+L(end): shr R32(n)
+ jnc 1f
+ mov (up), %rax
+ mov %rax, 8(rp)
+ lea 8(rp), rp
+ lea 8(up), up
+1: shr R32(n)
+ jnc 1f
+ mov (up), %rax
+ mov 8(up), %r9
+ mov %rax, 8(rp)
+ mov %r9, 16(rp)
+1: ret
+EPILOGUE()
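
copyi is the incrementing counterpart: limbs move from the bottom up, the safe direction when the destination overlaps the source at lower addresses. The corresponding C loop, again assuming 64-bit limbs:

#include <stdint.h>
#include <stddef.h>

void
copyi_ref (uint64_t *rp, const uint64_t *up, size_t n)
{
  for (size_t i = 0; i < n; i++)   /* lowest limb first */
    rp[i] = up[i];
}
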
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm
new file mode 100644
index 0000000..7066bb4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm
@@ -0,0 +1,53 @@
+dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 63)
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_addlsh1_n)')
+ifdef(`OPERATION_rsblsh1_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_rsblsh1_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/aorrlshC_n.asm')
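
As the header lines say, mpn_addlsh1_n forms rp[] = up[] + (vp[] << 1) and mpn_rsblsh1_n forms rp[] = (vp[] << 1) - up[]; the LSH/RSH defines parameterise the shared aorrlshC_n.asm body. A rough C model of the add case, assuming 64-bit limbs and a compiler providing unsigned __int128 for the double-width intermediate:

#include <stdint.h>
#include <stddef.h>

uint64_t
addlsh1_n_ref (uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n)
{
  uint64_t cy = 0;                                   /* 0, 1 or 2 */
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i]
                            + ((unsigned __int128) vp[i] << 1) + cy;
      rp[i] = (uint64_t) t;
      cy = (uint64_t) (t >> 64);                     /* carry into the next limb */
    }
  return cy;
}
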
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm
new file mode 100644
index 0000000..5065120
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm
@@ -0,0 +1,53 @@
+dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_addlsh2_n)')
+ifdef(`OPERATION_rsblsh2_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_rsblsh2_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm
new file mode 100644
index 0000000..57abf31
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm
@@ -0,0 +1,38 @@
+dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V*2^k +- U.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/coreinhm/aorrlsh_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm
new file mode 100644
index 0000000..3f875ae
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm
@@ -0,0 +1,225 @@
+dnl Core 2 mpn_add_err1_n, mpn_sub_err1_n
+
+dnl Contributed by David Harvey.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 4.14
+C Intel corei ?
+C Intel atom ?
+C VIA nano ?
+
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`ep', `%rcx')
+define(`yp', `%r8')
+define(`n', `%r9')
+define(`cy_param', `8(%rsp)')
+
+define(`el', `%rbx')
+define(`eh', `%rbp')
+define(`t0', `%r10')
+define(`t1', `%r11')
+define(`t2', `%r12')
+define(`t3', `%r13')
+define(`w0', `%r14')
+define(`w1', `%r15')
+
+ifdef(`OPERATION_add_err1_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_err1_n)')
+ifdef(`OPERATION_sub_err1_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_err1_n)')
+
+MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n)
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ mov cy_param, %rax
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ lea (up,n,8), up
+ lea (vp,n,8), vp
+ lea (rp,n,8), rp
+
+ mov R32(n), R32(%r10)
+ and $3, R32(%r10)
+ jz L(0mod4)
+ cmp $2, R32(%r10)
+ jc L(1mod4)
+ jz L(2mod4)
+L(3mod4):
+ xor R32(el), R32(el)
+ xor R32(eh), R32(eh)
+ xor R32(t0), R32(t0)
+ xor R32(t1), R32(t1)
+ lea -24(yp,n,8), yp
+ neg n
+
+ shr $1, %al C restore carry
+ mov (up,n,8), w0
+ mov 8(up,n,8), w1
+ ADCSBB (vp,n,8), w0
+ mov w0, (rp,n,8)
+ cmovc 16(yp), el
+ ADCSBB 8(vp,n,8), w1
+ mov w1, 8(rp,n,8)
+ cmovc 8(yp), t0
+ mov 16(up,n,8), w0
+ ADCSBB 16(vp,n,8), w0
+ mov w0, 16(rp,n,8)
+ cmovc (yp), t1
+ setc %al C save carry
+ add t0, el
+ adc $0, eh
+ add t1, el
+ adc $0, eh
+
+ add $3, n
+ jnz L(loop)
+ jmp L(end)
+
+ ALIGN(16)
+L(0mod4):
+ xor R32(el), R32(el)
+ xor R32(eh), R32(eh)
+ lea (yp,n,8), yp
+ neg n
+ jmp L(loop)
+
+ ALIGN(16)
+L(1mod4):
+ xor R32(el), R32(el)
+ xor R32(eh), R32(eh)
+ lea -8(yp,n,8), yp
+ neg n
+
+ shr $1, %al C restore carry
+ mov (up,n,8), w0
+ ADCSBB (vp,n,8), w0
+ mov w0, (rp,n,8)
+ cmovc (yp), el
+ setc %al C save carry
+
+ add $1, n
+ jnz L(loop)
+ jmp L(end)
+
+ ALIGN(16)
+L(2mod4):
+ xor R32(el), R32(el)
+ xor R32(eh), R32(eh)
+ xor R32(t0), R32(t0)
+ lea -16(yp,n,8), yp
+ neg n
+
+ shr $1, %al C restore carry
+ mov (up,n,8), w0
+ mov 8(up,n,8), w1
+ ADCSBB (vp,n,8), w0
+ mov w0, (rp,n,8)
+ cmovc 8(yp), el
+ ADCSBB 8(vp,n,8), w1
+ mov w1, 8(rp,n,8)
+ cmovc (yp), t0
+ setc %al C save carry
+ add t0, el
+ adc $0, eh
+
+ add $2, n
+ jnz L(loop)
+ jmp L(end)
+
+ ALIGN(32)
+L(loop):
+ mov (up,n,8), w0
+ shr $1, %al C restore carry
+ mov -8(yp), t0
+ mov $0, R32(t3)
+ ADCSBB (vp,n,8), w0
+ cmovnc t3, t0
+ mov w0, (rp,n,8)
+ mov 8(up,n,8), w1
+ mov 16(up,n,8), w0
+ ADCSBB 8(vp,n,8), w1
+ mov -16(yp), t1
+ cmovnc t3, t1
+ mov -24(yp), t2
+ mov w1, 8(rp,n,8)
+ ADCSBB 16(vp,n,8), w0
+ cmovnc t3, t2
+ mov 24(up,n,8), w1
+ ADCSBB 24(vp,n,8), w1
+ cmovc -32(yp), t3
+ setc %al C save carry
+ add t0, el
+ adc $0, eh
+ add t1, el
+ adc $0, eh
+ add t2, el
+ adc $0, eh
+ lea -32(yp), yp
+ mov w0, 16(rp,n,8)
+ add t3, el
+ adc $0, eh
+ add $4, n
+ mov w1, -8(rp,n,8)
+ jnz L(loop)
+
+L(end):
+ mov el, (ep)
+ mov eh, 8(ep)
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+EPILOGUE()
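
The cmovc instructions load a word from yp only when the corresponding limb addition carried, and those words accumulate into the two-limb pair (eh:el) stored to ep[0..1]; yp is walked from the top down while up/vp/rp advance from the bottom up. A rough C model of the add variant under those conventions, assuming 64-bit limbs:

#include <stdint.h>
#include <stddef.h>

uint64_t
add_err1_n_ref (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
                uint64_t *ep, const uint64_t *yp, size_t n, uint64_t cy)
{
  uint64_t el = 0, eh = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t s = up[i] + vp[i];
      uint64_t c1 = s < up[i];
      uint64_t r = s + cy;
      uint64_t c2 = r < s;
      rp[i] = r;
      cy = c1 | c2;                        /* carry out of limb i */
      uint64_t z = (-cy) & yp[n - 1 - i];  /* yp word, or 0 when no carry */
      el += z;
      eh += el < z;                        /* propagate into the high limb */
    }
  ep[0] = el;
  ep[1] = eh;
  return cy;
}
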
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm
new file mode 100644
index 0000000..f9e0039
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm
@@ -0,0 +1,150 @@
+dnl Intel mpn_add_n/mpn_sub_n optimised for Conroe, Nehalem.
+
+dnl Copyright 2006, 2007, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2
+C AMD K10 1.93\2
+C AMD bull 1.62\2.1
+C AMD pile 1.6\1.7
+C AMD steam
+C AMD excavator
+C AMD bobcat 2.79
+C AMD jaguar 2.54
+C Intel P4 10
+C Intel core2 2
+C Intel NHM 2
+C Intel SBR 2
+C Intel IBR 1.95
+C Intel HWL 1.72
+C Intel BWL 1.54
+C Intel SKL 1.52
+C Intel atom 9
+C Intel SLM 6.5
+C VIA nano 3
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ xor %r8, %r8
+L(start):
+ mov (up), %r10
+ mov (vp), %r11
+
+ lea (up,n,8), up
+ lea (vp,n,8), vp
+ lea (rp,n,8), rp
+ mov R32(n), R32(%rax)
+ neg n
+ and $3, R32(%rax)
+ je L(b00)
+ add %rax, n C clear low rcx bits for jrcxz
+ cmp $2, R32(%rax)
+ jl L(b01)
+ je L(b10)
+
+L(b11): neg %r8 C set cy
+ jmp L(e11)
+
+L(b00): neg %r8 C set cy
+ mov %r10, %r8
+ mov %r11, %r9
+ lea 4(n), n
+ jmp L(e00)
+
+ nop
+ nop
+ nop
+L(b01): neg %r8 C set cy
+ jmp L(top)
+
+L(b10): neg %r8 C set cy
+ mov %r10, %r8
+ mov %r11, %r9
+ jmp L(e10)
+
+L(end): ADCSBB %r11, %r10
+ mov %r10, -8(rp)
+ mov R32(%rcx), R32(%rax) C clear eax, ecx contains 0
+ adc R32(%rax), R32(%rax)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(top): jrcxz L(end)
+ mov (up,n,8), %r8
+ mov (vp,n,8), %r9
+ lea 4(n), n
+ ADCSBB %r11, %r10
+ mov %r10, -40(rp,n,8)
+L(e00): mov -24(up,n,8), %r10
+ mov -24(vp,n,8), %r11
+ ADCSBB %r9, %r8
+ mov %r8, -32(rp,n,8)
+L(e11): mov -16(up,n,8), %r8
+ mov -16(vp,n,8), %r9
+ ADCSBB %r11, %r10
+ mov %r10, -24(rp,n,8)
+L(e10): mov -8(up,n,8), %r10
+ mov -8(vp,n,8), %r11
+ ADCSBB %r9, %r8
+ mov %r8, -16(rp,n,8)
+ jmp L(top)
+EPILOGUE()
+
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ jmp L(start)
+EPILOGUE()
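
This file provides the public mpn_add_n/mpn_sub_n (plus the _nc carry-in variants) for Conroe and Nehalem; the loop is software-pipelined and ends via jrcxz, so the only loop-control work is the pointer bump. A small example of calling the documented entry point through gmp.h:

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mp_limb_t a[4] = { (mp_limb_t) -1, (mp_limb_t) -1,
                     (mp_limb_t) -1, (mp_limb_t) -1 };
  mp_limb_t b[4] = { 1, 0, 0, 0 };
  mp_limb_t r[4];
  mp_limb_t cy = mpn_add_n (r, a, b, 4);   /* r[] = a[] + b[], cy = carry out */
  printf ("carry %lu, r[0] %lu, r[3] %lu\n",
          (unsigned long) cy, (unsigned long) r[0], (unsigned long) r[3]);
  return 0;
}
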
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm
new file mode 100644
index 0000000..a7a5d6e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm
@@ -0,0 +1,188 @@
+dnl x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".
+
+dnl Copyright 2003-2005, 2007-2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.52
+C AMD K10 4.01
+C AMD bull 4.98
+C AMD pile 4.83
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.56
+C AMD jaguar 5.54
+C Intel P4 16.3 17.3
+C Intel core2 4.32 4.61
+C Intel NHM 5.08
+C Intel SBR 4.04
+C Intel IBR 3.95
+C Intel HWL 3.66
+C Intel BWL 2.87
+C Intel SKL 2.79
+C Intel atom 20.6
+C Intel SLM 7.6
+C VIA nano 5.25
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`v0', `%rcx')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+ define(`func_1c', `mpn_addmul_1c')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+ define(`func_1c', `mpn_submul_1c')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ C For DOS, on the stack we have four saved registers, return address,
+ C space for four register arguments, and finally the carry input.
+
+IFDOS(` define(`carry_in', `72(%rsp)')') dnl
+IFSTD(` define(`carry_in', `%r8')') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_1c)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ lea (%rdx), %rbx
+ neg %rbx
+
+ mov (up), %rax
+ mov (rp), %r10
+
+ lea -16(rp,%rdx,8), rp
+ lea (up,%rdx,8), up
+ mul %rcx
+ add carry_in, %rax
+ adc $0, %rdx
+ jmp L(start_nc)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ lea (%rdx), %rbx
+ neg %rbx
+
+ mov (up), %rax
+ mov (rp), %r10
+
+ lea -16(rp,%rdx,8), rp
+ lea (up,%rdx,8), up
+ mul %rcx
+
+L(start_nc):
+ test $1, R8(%rbx)
+ jnz L(odd)
+
+ lea (%rax), %r11
+ mov 8(up,%rbx,8), %rax
+ lea (%rdx), %rbp
+ mul %rcx
+ add $2, %rbx
+ jz L(n2)
+
+ lea (%rax), %r8
+ mov (up,%rbx,8), %rax
+ lea (%rdx), %r9
+ jmp L(mid)
+
+ ALIGN(8)
+L(odd): inc %rbx
+ jz L(n1)
+
+ lea (%rax), %r8
+ mov (up,%rbx,8), %rax
+ lea (%rdx), %r9
+ mul %rcx
+ lea (%rax), %r11
+ mov 8(up,%rbx,8), %rax
+ lea (%rdx), %rbp
+ jmp L(e)
+
+ ALIGN(16)
+L(top): mul %rcx
+ ADDSUB %r8, %r10
+ lea (%rax), %r8
+ mov (up,%rbx,8), %rax
+ adc %r9, %r11
+ mov %r10, -8(rp,%rbx,8)
+ mov (rp,%rbx,8), %r10
+ lea (%rdx), %r9
+ adc $0, %rbp
+L(mid): mul %rcx
+ ADDSUB %r11, %r10
+ lea (%rax), %r11
+ mov 8(up,%rbx,8), %rax
+ adc %rbp, %r8
+ mov %r10, (rp,%rbx,8)
+ mov 8(rp,%rbx,8), %r10
+ lea (%rdx), %rbp
+ adc $0, %r9
+L(e): add $2, %rbx
+ js L(top)
+
+ mul %rcx
+ ADDSUB %r8, %r10
+ adc %r9, %r11
+ mov %r10, -8(rp)
+ adc %rbx, %rbp C rbx = 0
+L(n2): mov (rp), %r10
+ ADDSUB %r11, %r10
+ adc %rbp, %rax
+ mov %r10, (rp)
+ adc %rbx, %rdx C rbx = 0
+L(n1): mov 8(rp), %r10
+ ADDSUB %rax, %r10
+ mov %r10, 8(rp)
+ mov R32(%rbx), R32(%rax) C rbx = 0
+ adc %rdx, %rax
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
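
mpn_addmul_1 adds the n-limb operand up[] times the single limb v0 into rp[] and returns the limb that spills out of the top; mpn_submul_1 subtracts the product instead. The 64x64->128 bit mul instruction at the heart of the loop corresponds to an unsigned __int128 product in C, so a reference version (assuming 64-bit limbs and a compiler providing __int128) is:

#include <stdint.h>
#include <stddef.h>

uint64_t
addmul_1_ref (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
{
  uint64_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (uint64_t) t;                /* low half of the product sum */
      cy = (uint64_t) (t >> 64);           /* high half becomes the next carry */
    }
  return cy;
}
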
diff --git a/gmp-6.3.0/mpn/x86_64/core2/com.asm b/gmp-6.3.0/mpn/x86_64/core2/com.asm
new file mode 100644
index 0000000..d7d9f79
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/com.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_com.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/copyd.asm b/gmp-6.3.0/mpn/x86_64/core2/copyd.asm
new file mode 100644
index 0000000..57ea0e5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/copyd.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyd.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/copyi.asm b/gmp-6.3.0/mpn/x86_64/core2/copyi.asm
new file mode 100644
index 0000000..f0c7607
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/copyi.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyi.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm
new file mode 100644
index 0000000..1b3f139
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm
@@ -0,0 +1,243 @@
+dnl x86-64 mpn_divrem_1 -- mpn by limb division.
+
+dnl Copyright 2004, 2005, 2007-2010, 2012, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C norm unorm frac
+C AMD K8,K9 15 15 12
+C AMD K10 15 15 12
+C Intel P4 44 44 43
+C Intel core2 24 24 19.5
+C Intel corei 19 19 18
+C Intel atom 51 51 36
+C VIA nano 46 44 22.5
+
+C mp_limb_t
+C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
+C mp_srcptr np, mp_size_t nn, mp_limb_t d)
+
+C mp_limb_t
+C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
+C mp_srcptr np, mp_size_t nn, mp_limb_t d,
+C mp_limb_t dinv, int cnt)
+
+C INPUT PARAMETERS
+define(`qp', `%rdi')
+define(`fn_param', `%rsi')
+define(`up_param', `%rdx')
+define(`un_param', `%rcx')
+define(`d', `%r8')
+define(`dinv', `%r9') C only for mpn_preinv_divrem_1
+C shift passed on stack C only for mpn_preinv_divrem_1
+
+define(`cnt', `%rcx')
+define(`up', `%rsi')
+define(`fn', `%r12')
+define(`un', `%rbx')
+
+
+C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
+C cnt qp d dinv
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFSTD(`define(`CNTOFF', `40($1)')')
+IFDOS(`define(`CNTOFF', `104($1)')')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_preinv_divrem_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+ xor R32(%rax), R32(%rax)
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov fn_param, fn
+ mov un_param, un
+ add fn_param, un_param
+ mov up_param, up
+
+ lea -8(qp,un_param,8), qp
+
+ mov CNTOFF(%rsp), R8(cnt)
+ shl R8(cnt), d
+ jmp L(ent)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ xor R32(%rax), R32(%rax)
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov fn_param, fn
+ mov un_param, un
+ add fn_param, un_param
+ mov up_param, up
+ je L(ret)
+
+ lea -8(qp,un_param,8), qp
+ xor R32(%rbp), R32(%rbp)
+
+L(unnormalized):
+ test un, un
+ je L(44)
+ mov -8(up,un,8), %rax
+ cmp d, %rax
+ jae L(44)
+ mov %rbp, (qp)
+ mov %rax, %rbp
+ lea -8(qp), qp
+ je L(ret)
+ dec un
+L(44):
+ bsr d, %rcx
+ not R32(%rcx)
+ sal R8(%rcx), d
+ sal R8(%rcx), %rbp
+
+ push %rcx
+IFSTD(` push %rdi ')
+IFSTD(` push %rsi ')
+ push %r8
+IFSTD(` sub $8, %rsp ')
+IFSTD(` mov d, %rdi ')
+IFDOS(` sub $40, %rsp ')
+IFDOS(` mov d, %rcx ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_invert_limb)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+ pop %r8
+IFSTD(` pop %rsi ')
+IFSTD(` pop %rdi ')
+ pop %rcx
+
+ mov %rax, dinv
+ mov %rbp, %rax
+ test un, un
+ je L(frac)
+
+L(ent): mov -8(up,un,8), %rbp
+ shr R8(%rcx), %rax
+ shld R8(%rcx), %rbp, %rax
+ sub $2, un
+ js L(end)
+
+ ALIGN(16)
+L(top): lea 1(%rax), %r11
+ mul dinv
+ mov (up,un,8), %r10
+ shld R8(%rcx), %r10, %rbp
+ mov %rbp, %r13
+ add %rax, %r13
+ adc %r11, %rdx
+ mov %rdx, %r11
+ imul d, %rdx
+ sub %rdx, %rbp
+ lea (d,%rbp), %rax
+ sub $8, qp
+ cmp %r13, %rbp
+ cmovc %rbp, %rax
+ adc $-1, %r11
+ cmp d, %rax
+ jae L(ufx)
+L(uok): dec un
+ mov %r11, 8(qp)
+ mov %r10, %rbp
+ jns L(top)
+
+L(end): lea 1(%rax), %r11
+ sal R8(%rcx), %rbp
+ mul dinv
+ add %rbp, %rax
+ adc %r11, %rdx
+ mov %rax, %r11
+ mov %rdx, %r13
+ imul d, %rdx
+ sub %rdx, %rbp
+ mov d, %rax
+ add %rbp, %rax
+ cmp %r11, %rbp
+ cmovc %rbp, %rax
+ adc $-1, %r13
+ cmp d, %rax
+ jae L(efx)
+L(eok): mov %r13, (qp)
+ sub $8, qp
+ jmp L(frac)
+
+L(ufx): sub d, %rax
+ inc %r11
+ jmp L(uok)
+L(efx): sub d, %rax
+ inc %r13
+ jmp L(eok)
+
+L(frac):mov d, %rbp
+ neg %rbp
+ jmp L(fent)
+
+ ALIGN(16) C K8-K10 P6-CNR P6-NHM P4
+L(ftop):mul dinv C 0,12 0,17 0,17
+ add %r11, %rdx C 5 8 10
+ mov %rax, %r11 C 4 8 3
+ mov %rdx, %r13 C 6 9 11
+ imul %rbp, %rdx C 6 9 11
+ mov d, %rax C
+ add %rdx, %rax C 10 14 14
+ cmp %r11, %rdx C 10 14 14
+ cmovc %rdx, %rax C 11 15 15
+ adc $-1, %r13 C
+ mov %r13, (qp) C
+ sub $8, qp C
+L(fent):lea 1(%rax), %r11 C
+ dec fn C
+ jns L(ftop) C
+
+ shr R8(%rcx), %rax
+L(ret): pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ FUNC_EXIT()
+ ret
+EPILOGUE()
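
The prototypes quoted in the comments near the top describe the interface: {np,nn} is divided by d, the quotient goes to qp (plus fn low fractional limbs), and the remainder is returned. A small example of calling mpn_divrem_1 through gmp.h; the comment's arithmetic assumes 64-bit limbs:

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  /* n = 9*B^2 + 7*B + 5 with B = 2^64, least significant limb first */
  mp_limb_t np[3] = { 5, 7, 9 };
  mp_limb_t qp[3];
  mp_limb_t r = mpn_divrem_1 (qp, 0, np, 3, (mp_limb_t) 10);  /* divide by 10 */
  printf ("remainder %lu, high quotient limb %lu\n",
          (unsigned long) r, (unsigned long) qp[2]);
  return 0;
}
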
diff --git a/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm
new file mode 100644
index 0000000..b00451f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm
@@ -0,0 +1,93 @@
+dnl AMD64 mpn_gcd_11 optimised for Intel CNR, PNR, SBR, IBR.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit (approx)
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 ?
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD bt1 ?
+C AMD bt2 ?
+C AMD zn1 ?
+C AMD zn2 ?
+C Intel P4 ?
+C Intel CNR 4.22 *
+C Intel PNR 4.22 *
+C Intel NHM 4.97
+C Intel WSM 5.17
+C Intel SBR 4.83 *
+C Intel IBR 4.16 *
+C Intel HWL 3.84
+C Intel BWL 3.76
+C Intel SKL 3.83
+C Intel atom ?
+C Intel SLM ?
+C Intel GLM ?
+C Intel GLM+ ?
+C VIA nano ?
+
+define(`u0', `%rdi')
+define(`v0', `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+ FUNC_ENTRY(2)
+ jmp L(odd)
+
+ ALIGN(16)
+L(top): cmovc %rdx, u0 C u = |u - v|
+ cmovc %rax, v0 C v = min(u,v)
+ shr R8(%rcx), u0
+L(odd): mov v0, %rdx
+ sub u0, %rdx C v - u
+ bsf %rdx, %rcx
+ mov u0, %rax
+ sub v0, u0 C u - v
+ jnz L(top)
+
+L(end): C rax = result
+ C rdx = 0 for the benefit of internal gcd_22 call
+ FUNC_EXIT()
+ ret
+EPILOGUE()
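
mpn_gcd_11 assumes both operands are odd (callers strip common factors of two first), so each |u - v| is even and bsf tells how many twos to discard before the next round. The same subtract-and-shift step in C, assuming 64-bit operands and using __builtin_ctzll in place of bsf:

#include <stdint.h>

uint64_t
gcd_11_ref (uint64_t u, uint64_t v)         /* both arguments odd, nonzero */
{
  while (u != v)
    {
      uint64_t min = u < v ? u : v;
      uint64_t d = u < v ? v - u : u - v;   /* |u - v|, even and nonzero */
      u = d >> __builtin_ctzll (d);         /* strip the factors of two */
      v = min;                              /* v = min(u, v) */
    }
  return u;
}
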
diff --git a/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm
new file mode 100644
index 0000000..b5aa73b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm
@@ -0,0 +1,137 @@
+dnl AMD64 mpn_gcd_22. Assumes useful bsf, useful shrd, no tzcnt, no shlx.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 ?
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD bt1 ?
+C AMD bt2 ?
+C AMD zn1 ?
+C AMD zn2 ?
+C Intel P4 ?
+C Intel CNR 8.7
+C Intel PNR 8.7
+C Intel NHM 9.2
+C Intel WSM 9.2
+C Intel SBR 9.1
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom ?
+C Intel SLM ?
+C Intel GLM ?
+C Intel GLM+ ?
+C VIA nano ?
+
+
+define(`u1', `%rdi')
+define(`u0', `%rsi')
+define(`v1', `%rdx')
+define(`v0_param', `%rcx')
+
+define(`v0', `%rax')
+define(`cnt', `%rcx')
+
+define(`s0', `%r8')
+define(`s1', `%r9')
+define(`t0', `%r10')
+define(`t1', `%r11')
+
+dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_22)
+ FUNC_ENTRY(4)
+ mov v0_param, v0
+
+ ALIGN(16)
+L(top): mov v0, t0
+ sub u0, t0
+ jz L(lowz) C jump when low limb result = 0
+ mov v1, t1
+ sbb u1, t1
+
+ mov u0, s0
+ mov u1, s1
+
+ bsf t0, cnt
+
+ sub v0, u0
+ sbb v1, u1
+
+L(bck): cmovc t0, u0 C u = |u - v|
+ cmovc t1, u1 C u = |u - v|
+ cmovc s0, v0 C v = min(u,v)
+ cmovc s1, v1 C v = min(u,v)
+
+ shrd R8(cnt), u1, u0
+ shr R8(cnt), u1
+
+ mov v1, t1
+ or u1, t1
+ jnz L(top)
+
+L(gcd_11):
+ mov v0, %rdi
+C mov u0, %rsi
+ TCALL( mpn_gcd_11)
+
+L(lowz):C We come here when v0 - u0 = 0
+ C 1. If v1 - u1 = 0, then gcd is u = v.
+ C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+ mov v1, t0
+ sub u1, t0
+ je L(end)
+
+ xor t1, t1
+ mov u0, s0
+ mov u1, s1
+ bsf t0, cnt
+ mov u1, u0
+ xor u1, u1
+ sub v1, u0
+ jmp L(bck)
+
+L(end): C mov v0, %rax
+ C mov v1, %rdx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
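
The double-limb version keeps {u1,u0} and {v1,v0} as 128-bit quantities, uses shrd to shift the pair right by the bsf count, and tail-calls mpn_gcd_11 once the high limbs are gone. A self-contained C model of the arithmetic (not of the exact control flow, which special-cases a zero low difference and the hand-off to gcd_11), assuming odd operands and a compiler with unsigned __int128:

#include <stdint.h>

typedef unsigned __int128 u128;

u128
gcd_22_ref (u128 u, u128 v)                 /* both arguments odd */
{
  while (u != v)
    {
      u128 min = u < v ? u : v;
      u128 d = u < v ? v - u : u - v;       /* |u - v|, even, nonzero */
      unsigned k = (uint64_t) d != 0
                   ? (unsigned) __builtin_ctzll ((uint64_t) d)
                   : 64 + (unsigned) __builtin_ctzll ((uint64_t) (d >> 64));
      u = d >> k;                           /* strip the factors of two */
      v = min;
    }
  return u;
}
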
diff --git a/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h
new file mode 100644
index 0000000..44f1494
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* Core 2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3000 MHz Penryn */
+/* FFT tuning limit = 116,220,984 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 3
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 16
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 26
+
+#define DIV_1_VS_MUL_1_PERCENT 284
+
+#define MUL_TOOM22_THRESHOLD 24
+#define MUL_TOOM33_THRESHOLD 65
+#define MUL_TOOM44_THRESHOLD 184
+#define MUL_TOOM6H_THRESHOLD 256
+#define MUL_TOOM8H_THRESHOLD 381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 79
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 28
+#define SQR_TOOM3_THRESHOLD 102
+#define SQR_TOOM4_THRESHOLD 160
+#define SQR_TOOM6_THRESHOLD 366
+#define SQR_TOOM8_THRESHOLD 478
+
+#define MULMID_TOOM42_THRESHOLD 32
+
+#define MULMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 368 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 368, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 83,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 79,11}, { 47,10}, { 95,12}, { 31, 9}, \
+ { 255,10}, { 135,11}, { 79,10}, { 159, 9}, \
+ { 319,11}, { 95,10}, { 191, 9}, { 383,11}, \
+ { 111,12}, { 63,11}, { 127,10}, { 271,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
+ { 383,11}, { 207,10}, { 415,13}, { 63,12}, \
+ { 127,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 319,10}, { 639,11}, { 351,12}, \
+ { 191,11}, { 415,12}, { 223,11}, { 479,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 639,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 575,13}, { 319,12}, { 703,13}, \
+ { 383,12}, { 799,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \
+ { 1151,13}, { 703,14}, { 383,13}, { 831,12}, \
+ { 1663,13}, { 959,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \
+ { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2559,14}, { 1407,13}, \
+ { 2815,15}, { 767,14}, { 1663,13}, { 3455,12}, \
+ { 6911,14}, { 1791,16}, { 511,15}, { 1023,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \
+ { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \
+ { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \
+ { 4991,15}, { 2815,14}, { 5887,13}, { 11775,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 176
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 308, 5}, { 17, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \
+ { 31,10}, { 79,11}, { 47,12}, { 31,11}, \
+ { 63,10}, { 127, 9}, { 255,11}, { 79,10}, \
+ { 159, 6}, { 2559, 7}, { 1343, 6}, { 2687, 7}, \
+ { 1407, 9}, { 383,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \
+ { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319,11}, { 175,12}, { 95,11}, { 191,10}, \
+ { 383,11}, { 207,10}, { 415,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 351,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,12}, { 223,11}, \
+ { 479,13}, { 127,12}, { 255,11}, { 543,12}, \
+ { 287,11}, { 575,12}, { 319,11}, { 639,12}, \
+ { 351,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 575,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 799,13}, \
+ { 447,12}, { 895,14}, { 255,13}, { 511,12}, \
+ { 1023,13}, { 575,12}, { 1151,13}, { 639,12}, \
+ { 1279,13}, { 703,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 959,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1407,14}, { 767,13}, \
+ { 1599,12}, { 3199,13}, { 1663,14}, { 895,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \
+ { 6911,16}, { 511,15}, { 1023,14}, { 2303,13}, \
+ { 4607,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
+ { 2943,13}, { 5887,12}, { 11775,15}, { 1535,14}, \
+ { 3455,15}, { 1791,14}, { 3583,13}, { 7167,14}, \
+ { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \
+ { 2303,14}, { 4863,15}, { 2815,14}, { 5887,13}, \
+ { 11775,16}, { 1535,15}, { 3071,14}, { 6143,15}, \
+ { 3327,14}, { 6911,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 183
+#define SQR_FFT_THRESHOLD 3520
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 67
+#define MULLO_MUL_N_THRESHOLD 9174
+#define SQRLO_BASECASE_THRESHOLD 10
+#define SQRLO_DC_THRESHOLD 11
+#define SQRLO_SQR_THRESHOLD 7035
+
+#define DC_DIV_QR_THRESHOLD 53
+#define DC_DIVAPPR_Q_THRESHOLD 163
+#define DC_BDIV_QR_THRESHOLD 46
+#define DC_BDIV_Q_THRESHOLD 76
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 158
+#define INV_APPR_THRESHOLD 167
+
+#define BINV_NEWTON_THRESHOLD 248
+#define REDC_1_TO_REDC_N_THRESHOLD 44
+
+#define MU_DIV_QR_THRESHOLD 1187
+#define MU_DIVAPPR_Q_THRESHOLD 1210
+#define MUPI_DIV_QR_THRESHOLD 73
+#define MU_BDIV_QR_THRESHOLD 1017
+#define MU_BDIV_Q_THRESHOLD 1187
+
+#define POWM_SEC_TABLE 1,64,105,579,1486
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 17
+#define SET_STR_DC_THRESHOLD 134
+#define SET_STR_PRECOMPUTE_THRESHOLD 1752
+
+#define FAC_DSC_THRESHOLD 351
+#define FAC_ODD_THRESHOLD 27
+
+#define MATRIX22_STRASSEN_THRESHOLD 18
+#define HGCD2_DIV1_METHOD 3 /* 2.14% faster than 5 */
+#define HGCD_THRESHOLD 118
+#define HGCD_APPR_THRESHOLD 161
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 416
+#define GCDEXT_DC_THRESHOLD 351
+#define JACOBI_BASE_METHOD 4 /* 3.56% faster than 1 */
+
+/* Tuneup completed successfully, took 132491 seconds */
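
Each _THRESHOLD above is a limb-count crossover measured by tuneup on this particular core; configure installs the matching gmp-mparam.h so the generic code can pick an algorithm by comparing the operand size against a compile-time constant. The dispatch pattern looks roughly like the sketch below, where mul_basecase and mul_toom22 are hypothetical stand-ins rather than the real GMP internals:

#include <stdio.h>

#define MUL_TOOM22_THRESHOLD 24       /* tuned crossover from the table above */

static void mul_basecase (long n) { printf ("schoolbook, n=%ld\n", n); }
static void mul_toom22   (long n) { printf ("toom22/Karatsuba, n=%ld\n", n); }

static void
mul_dispatch (long n)
{
  if (n < MUL_TOOM22_THRESHOLD)       /* small products: O(n^2) wins */
    mul_basecase (n);
  else                                /* larger: split recursively */
    mul_toom22 (n);
}

int
main (void)
{
  mul_dispatch (10);
  mul_dispatch (100);
  return 0;
}
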
diff --git a/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm b/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm
new file mode 100644
index 0000000..ded7b67
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm
@@ -0,0 +1,210 @@
+dnl AMD64 SSSE3 mpn_hamdist -- hamming distance.
+
+dnl Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb good for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 ?
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 n/a
+C Intel CNR 4.50 y
+C Intel PNR 3.28 y
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom ?
+C Intel SLM ?
+C VIA nano ?
+
+C TODO
+C * This was hand-written without too much thought about optimal insn
+C selection; check to see of it can be improved.
+C * Consider doing some instruction scheduling.
+
+define(`up', `%rdi')
+define(`vp', `%rsi')
+define(`n', `%rdx')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_hamdist)
+ lea L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)',
+ `define(`OFF1',64) define(`OFF2',80)')
+ movdqa OFF1`'(%r9), %xmm7
+ movdqa OFF2`'(%r9), %xmm6
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm8, %xmm8
+
+ mov R32(n), R32(%rax)
+ and $7, R32(%rax)
+ifdef(`PIC',`
+ movslq (%r9,%rax,4), %rax
+ add %r9, %rax
+ jmp *%rax
+',`
+ jmp *(%r9,%rax,8)
+')
+
+L(1): movq (up), %xmm1
+ add $8, up
+ movq (vp), %xmm10
+ add $8, vp
+ pxor %xmm10, %xmm1
+ jmp L(e1)
+
+L(2): add $-48, up
+ add $-48, vp
+ jmp L(e2)
+
+L(3): movq (up), %xmm1
+ add $-40, up
+ movq (vp), %xmm10
+ add $-40, vp
+ pxor %xmm10, %xmm1
+ jmp L(e3)
+
+L(4): add $-32, up
+ add $-32, vp
+ jmp L(e4)
+
+L(5): movq (up), %xmm1
+ add $-24, up
+ movq (vp), %xmm10
+ add $-24, vp
+ pxor %xmm10, %xmm1
+ jmp L(e5)
+
+L(6): add $-16, up
+ add $-16, vp
+ jmp L(e6)
+
+L(7): movq (up), %xmm1
+ add $-8, up
+ movq (vp), %xmm10
+ add $-8, vp
+ pxor %xmm10, %xmm1
+ jmp L(e7)
+
+ ALIGN(32)
+L(top): lddqu (up), %xmm1
+ lddqu (vp), %xmm10
+ pxor %xmm10, %xmm1
+L(e7): movdqa %xmm6, %xmm0 C copy mask register
+ movdqa %xmm7, %xmm2 C copy count register
+ movdqa %xmm7, %xmm3 C copy count register
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e6): lddqu 16(up), %xmm1
+ lddqu 16(vp), %xmm10
+ pxor %xmm10, %xmm1
+L(e5): movdqa %xmm6, %xmm0
+ movdqa %xmm7, %xmm2
+ movdqa %xmm7, %xmm3
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e4): lddqu 32(up), %xmm1
+ lddqu 32(vp), %xmm10
+ pxor %xmm10, %xmm1
+L(e3): movdqa %xmm6, %xmm0
+ movdqa %xmm7, %xmm2
+ movdqa %xmm7, %xmm3
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e2): lddqu 48(up), %xmm1
+ add $64, up
+ lddqu 48(vp), %xmm10
+ add $64, vp
+ pxor %xmm10, %xmm1
+L(e1): movdqa %xmm6, %xmm0
+ movdqa %xmm7, %xmm2
+ movdqa %xmm7, %xmm3
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ psadbw %xmm5, %xmm4 C sum to 8 x 16-bit counts
+ paddb %xmm2, %xmm3
+ paddq %xmm4, %xmm8 C sum to 2 x 64-bit counts
+ movdqa %xmm3, %xmm4
+ sub $8, n
+ jg L(top)
+
+ psadbw %xmm5, %xmm4
+ paddq %xmm4, %xmm8
+ pshufd $14, %xmm8, %xmm0
+ paddq %xmm8, %xmm0
+ movd %xmm0, %rax
+ ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+ JMPENT( L(top), L(cnsts))
+ JMPENT( L(1), L(cnsts))
+ JMPENT( L(2), L(cnsts))
+ JMPENT( L(3), L(cnsts))
+ JMPENT( L(4), L(cnsts))
+ JMPENT( L(5), L(cnsts))
+ JMPENT( L(6), L(cnsts))
+ JMPENT( L(7), L(cnsts))
+ .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+ .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
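
All of the SSSE3 machinery above (the pshufb nibble tables, the psadbw reductions and the eight-way entry switch) computes one simple quantity: the population count of up[] XOR vp[]. A scalar C reference, assuming 64-bit limbs and GCC/Clang's __builtin_popcountll:

#include <stdint.h>
#include <stddef.h>

uint64_t
hamdist_ref (const uint64_t *up, const uint64_t *vp, size_t n)
{
  uint64_t cnt = 0;
  for (size_t i = 0; i < n; i++)
    cnt += (uint64_t) __builtin_popcountll (up[i] ^ vp[i]);  /* bits that differ */
  return cnt;
}
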
diff --git a/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm b/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm
new file mode 100644
index 0000000..5ff174c
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm
@@ -0,0 +1,285 @@
+dnl AMD64 logops.
+
+dnl Copyright 2004-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C c/l c/l c/l good
+C var-1 var-2 var-3 for cpu?
+C AMD K8,K9
+C AMD K10 1.52 1.75 1.75 n
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD bt1 2.67 ~2.79 ~2.79 =
+C AMD bt2 2.15 2.65 2.65 n
+C AMD zen 1.5 1.5 1.5 =
+C Intel P4
+C Intel PNR 2.0 2.0 2.0 =
+C Intel NHM 2.0 2.0 2.0 =
+C Intel SBR 1.5 1.5 1.5 y
+C Intel IBR 1.47 1.48 1.48 y
+C Intel HWL 1.11 1.35 1.35 y
+C Intel BWL 1.09 1.30 1.30 y
+C Intel SKL 1.21 1.27 1.27 y
+C Intel atom 3.31 3.57 3.57 y
+C Intel SLM 3.0 3.0 3.0 =
+C VIA nano
+
+ifdef(`OPERATION_and_n',`
+ define(`func',`mpn_and_n')
+ define(`VARIANT_1')
+ define(`LOGOP',`and')')
+ifdef(`OPERATION_andn_n',`
+ define(`func',`mpn_andn_n')
+ define(`VARIANT_2')
+ define(`LOGOP',`and')')
+ifdef(`OPERATION_nand_n',`
+ define(`func',`mpn_nand_n')
+ define(`VARIANT_3')
+ define(`LOGOP',`and')')
+ifdef(`OPERATION_ior_n',`
+ define(`func',`mpn_ior_n')
+ define(`VARIANT_1')
+ define(`LOGOP',`or')')
+ifdef(`OPERATION_iorn_n',`
+ define(`func',`mpn_iorn_n')
+ define(`VARIANT_2')
+ define(`LOGOP',`or')')
+ifdef(`OPERATION_nior_n',`
+ define(`func',`mpn_nior_n')
+ define(`VARIANT_3')
+ define(`LOGOP',`or')')
+ifdef(`OPERATION_xor_n',`
+ define(`func',`mpn_xor_n')
+ define(`VARIANT_1')
+ define(`LOGOP',`xor')')
+ifdef(`OPERATION_xnor_n',`
+ define(`func',`mpn_xnor_n')
+ define(`VARIANT_2')
+ define(`LOGOP',`xor')')
+
+define(`addptr', `lea $1($2), $2')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n',`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+
+ifdef(`VARIANT_1',`
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ mov (vp), %r8
+ mov R32(%rcx), R32(%rax)
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): LOGOP (up), %r8
+ mov %r8, (rp)
+ inc n
+ addptr( -8, up)
+ addptr( -8, vp)
+ addptr( -8, rp)
+ jmp L(e11)
+L(b10): add $2, n
+ addptr( -16, up)
+ addptr( -16, vp)
+ addptr( -16, rp)
+ jmp L(e10)
+L(b01): LOGOP (up), %r8
+ mov %r8, (rp)
+ dec n
+ jz L(ret)
+ addptr( 8, up)
+ addptr( 8, vp)
+ addptr( 8, rp)
+
+ ALIGN(16)
+L(top): mov (vp), %r8
+L(b00): mov 8(vp), %r9
+ LOGOP (up), %r8
+ LOGOP 8(up), %r9
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+L(e11): mov 16(vp), %r8
+L(e10): mov 24(vp), %r9
+ addptr( 32, vp)
+ LOGOP 16(up), %r8
+ LOGOP 24(up), %r9
+ addptr( 32, up)
+ mov %r8, 16(rp)
+ mov %r9, 24(rp)
+ addptr( 32, rp)
+ sub $4, n
+ jnz L(top)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
+')
+
+ifdef(`VARIANT_2',`
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ mov (vp), %r8
+ not %r8
+ mov R32(%rcx), R32(%rax)
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): LOGOP (up), %r8
+ mov %r8, (rp)
+ inc n
+ addptr( -8, up)
+ addptr( -8, vp)
+ addptr( -8, rp)
+ jmp L(e11)
+L(b10): add $2, n
+ addptr( -16, up)
+ addptr( -16, vp)
+ addptr( -16, rp)
+ jmp L(e10)
+L(b01): LOGOP (up), %r8
+ mov %r8, (rp)
+ dec n
+ jz L(ret)
+ addptr( 8, up)
+ addptr( 8, vp)
+ addptr( 8, rp)
+
+ ALIGN(16)
+L(top): mov (vp), %r8
+ not %r8
+L(b00): mov 8(vp), %r9
+ not %r9
+ LOGOP (up), %r8
+ LOGOP 8(up), %r9
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+L(e11): mov 16(vp), %r8
+ not %r8
+L(e10): mov 24(vp), %r9
+ not %r9
+ addptr( 32, vp)
+ LOGOP 16(up), %r8
+ LOGOP 24(up), %r9
+ addptr( 32, up)
+ mov %r8, 16(rp)
+ mov %r9, 24(rp)
+ addptr( 32, rp)
+ sub $4, n
+ jnz L(top)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
+')
+
+ifdef(`VARIANT_3',`
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ mov (vp), %r8
+ mov R32(%rcx), R32(%rax)
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): LOGOP (up), %r8
+ not %r8
+ mov %r8, (rp)
+ inc n
+ addptr( -8, up)
+ addptr( -8, vp)
+ addptr( -8, rp)
+ jmp L(e11)
+L(b10): add $2, n
+ addptr( -16, up)
+ addptr( -16, vp)
+ addptr( -16, rp)
+ jmp L(e10)
+L(b01): LOGOP (up), %r8
+ not %r8
+ mov %r8, (rp)
+ dec n
+ jz L(ret)
+ addptr( 8, up)
+ addptr( 8, vp)
+ addptr( 8, rp)
+
+ ALIGN(16)
+L(top): mov (vp), %r8
+L(b00): mov 8(vp), %r9
+ LOGOP (up), %r8
+ not %r8
+ LOGOP 8(up), %r9
+ not %r9
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+L(e11): mov 16(vp), %r8
+L(e10): mov 24(vp), %r9
+ addptr( 32, vp)
+ LOGOP 16(up), %r8
+ not %r8
+ LOGOP 24(up), %r9
+ addptr( 32, up)
+ not %r9
+ mov %r8, 16(rp)
+ mov %r9, 24(rp)
+ addptr( 32, rp)
+ sub $4, n
+ jnz L(top)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
+')
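For readers decoding the m4 dispatch in this file: VARIANT_1 applies LOGOP directly, VARIANT_2 complements the v operand before the operation (andn, iorn, xnor), and VARIANT_3 complements the result (nand, nior). Below is a minimal C reference sketch of two of the eight entry points, assuming the usual gmp.h limb types; the helper names ref_* are hypothetical, and the assembly above simply performs the same per-limb work four limbs per iteration with the pointers bumped by 32 bytes.

/* Reference sketch only -- not part of this file or of GMP's build machinery. */
#include <gmp.h>

static void
ref_andn_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  for (mp_size_t i = 0; i < n; i++)   /* VARIANT_2 with LOGOP = and */
    rp[i] = up[i] & ~vp[i];
}

static void
ref_nand_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  for (mp_size_t i = 0; i < n; i++)   /* VARIANT_3 with LOGOP = and */
    rp[i] = ~(up[i] & vp[i]);
}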
diff --git a/gmp-6.3.0/mpn/x86_64/core2/lshift.asm b/gmp-6.3.0/mpn/x86_64/core2/lshift.asm
new file mode 100644
index 0000000..9016a71
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/lshift.asm
@@ -0,0 +1,145 @@
+dnl x86-64 mpn_lshift optimised for Conroe/Penryn and Nehalem.
+
+dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core2 1.32
+C Intel NHM 1.30 (drops to 2.5 for n > 256)
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshift)
+ FUNC_ENTRY(4)
+
+ xor R32(%rax), R32(%rax)
+
+ test $1, R8(n)
+ jnz L(bx1)
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): lea -8(up,n,8), up
+ lea 16(rp,n,8), rp
+ mov (up), %r10
+ mov -8(up), %r11
+ shld R8(cnt), %r10, %rax
+ mov -16(up), %r8
+ shr $2, n
+ jmp L(00)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): lea -16(up,n,8), up
+ lea 8(rp,n,8), rp
+ mov 8(up), %r9
+ shld R8(cnt), %r9, %rax
+ shr $2, n
+ jz L(1)
+ mov (up), %r10
+ mov -8(up), %r11
+ jmp L(01)
+
+L(b10): lea -24(up,n,8), up
+ lea (rp,n,8), rp
+ mov 16(up), %r8
+ mov 8(up), %r9
+ shld R8(cnt), %r8, %rax
+ shr $2, n
+ jz L(2)
+ mov (up), %r10
+ jmp L(10)
+
+ ALIGN(16)
+L(b11): lea -32(up,n,8), up
+ lea -8(rp,n,8), rp
+ mov 24(up), %r11
+ mov 16(up), %r8
+ mov 8(up), %r9
+ shld R8(cnt), %r11, %rax
+ shr $2, n
+ jz L(end)
+
+ ALIGN(16)
+L(top): shld R8(cnt), %r8, %r11
+ mov (up), %r10
+ mov %r11, (rp)
+L(10): shld R8(cnt), %r9, %r8
+ mov -8(up), %r11
+ mov %r8, -8(rp)
+L(01): shld R8(cnt), %r10, %r9
+ mov -16(up), %r8
+ mov %r9, -16(rp)
+L(00): shld R8(cnt), %r11, %r10
+ mov -24(up), %r9
+ add $-32, up
+ mov %r10, -24(rp)
+ add $-32, rp
+ dec n
+ jnz L(top)
+
+L(end): shld R8(cnt), %r8, %r11
+ mov %r11, (rp)
+L(2): shld R8(cnt), %r9, %r8
+ mov %r8, -8(rp)
+L(1): shl R8(cnt), %r9
+ mov %r9, -16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
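What the routine above computes, as a hedged C sketch (64-bit limbs assumed, 1 <= cnt <= 63, ref_lshift being a hypothetical name): shift {up,n} left by cnt bits into {rp,n} and return the bits shifted out of the top limb. The assembly walks from the most significant limb downward, which is what keeps overlapping operands with rp at or above up safe.

/* Reference sketch only, not GMP's implementation. */
#include <gmp.h>

mp_limb_t
ref_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
{
  mp_limb_t retval = up[n - 1] >> (64 - cnt);   /* bits pushed out the top */
  for (mp_size_t i = n - 1; i > 0; i--)
    rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
  rp[0] = up[0] << cnt;
  return retval;
}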
diff --git a/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm
new file mode 100644
index 0000000..c428f13
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm
@@ -0,0 +1,159 @@
+dnl x86-64 mpn_lshiftc optimised for Conroe/Penryn and Nehalem.
+
+dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core2 1.52
+C Intel NHM 1.78 (just 2.15 for n < 256)
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+C TODO
+C * This runs poorly on Nehalem compared to plain lshift, in particular for
+C n < 256.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshiftc)
+ FUNC_ENTRY(4)
+
+ xor R32(%rax), R32(%rax)
+
+ test $1, R8(n)
+ jnz L(bx1)
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): lea -8(up,n,8), up
+ lea 16(rp,n,8), rp
+ mov (up), %r10
+ mov -8(up), %r11
+ shld R8(cnt), %r10, %rax
+ mov -16(up), %r8
+ shr $2, n
+ shld R8(cnt), %r11, %r10
+ jmp L(00)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): lea -16(up,n,8), up
+ lea 8(rp,n,8), rp
+ mov 8(up), %r9
+ shld R8(cnt), %r9, %rax
+ shr $2, n
+ jz L(1)
+ mov (up), %r10
+ mov -8(up), %r11
+ shld R8(cnt), %r10, %r9
+ jmp L(01)
+
+L(b10): lea -24(up,n,8), up
+ lea (rp,n,8), rp
+ mov 16(up), %r8
+ mov 8(up), %r9
+ shld R8(cnt), %r8, %rax
+ shr $2, n
+ jz L(2)
+ mov (up), %r10
+ shld R8(cnt), %r9, %r8
+ jmp L(10)
+
+ ALIGN(16)
+L(b11): lea -32(up,n,8), up
+ lea -8(rp,n,8), rp
+ mov 24(up), %r11
+ mov 16(up), %r8
+ mov 8(up), %r9
+ shld R8(cnt), %r11, %rax
+ shr $2, n
+ jz L(end)
+
+ ALIGN(16)
+L(top): shld R8(cnt), %r8, %r11
+ mov (up), %r10
+ not %r11
+ shld R8(cnt), %r9, %r8
+ mov %r11, (rp)
+L(10): mov -8(up), %r11
+ not %r8
+ shld R8(cnt), %r10, %r9
+ mov %r8, -8(rp)
+L(01): mov -16(up), %r8
+ not %r9
+ shld R8(cnt), %r11, %r10
+ mov %r9, -16(rp)
+L(00): mov -24(up), %r9
+ not %r10
+ add $-32, up
+ mov %r10, -24(rp)
+ add $-32, rp
+ dec n
+ jnz L(top)
+
+L(end): shld R8(cnt), %r8, %r11
+ not %r11
+ mov %r11, (rp)
+L(2): shld R8(cnt), %r9, %r8
+ not %r8
+ mov %r8, -8(rp)
+L(1): shl R8(cnt), %r9
+ not %r9
+ mov %r9, -16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm
new file mode 100644
index 0000000..d16be85
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm
@@ -0,0 +1,975 @@
+dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere.
+dnl It also seems good for Conroe/Wolfdale.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 mul_2 mul_3 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core 4.0 4.0 - 4.18-4.25
+C Intel NHM 3.75 3.8 - 4.06-4.2
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C Code structure:
+C
+C
+C m_1(0m4) m_1(1m4) m_1(2m4) m_1(3m4)
+C | | | |
+C m_2(0m4) | m_2(1m4) | m_2(2m4) | m_2(3m4) |
+C | / | / | / | /
+C | / | / | / | /
+C | / | / | / | /
+C \|/ |/_ \|/ |/_ \|/ |/_ \|/ |/_
+C _____ _____ _____ _____
+C / \ / \ / \ / \
+C \|/ | \|/ | \|/ | \|/ |
+C am_2(0m4) | am_2(1m4) | am_2(2m4) | am_2(3m4) |
+C \ /|\ \ /|\ \ /|\ \ /|\
+C \_____/ \_____/ \_____/ \_____/
+
+C TODO
+C * Tune. None done so far.
+C * Currently 2687 bytes, making it smaller would be nice.
+C * Implement some basecases, say for un < 4.
+C * Try zeroing with xor in m2 loops.
+C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
+C between loop header and wind-down code.
+C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This saves a byte.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+C Define this to $1 to use late loop index variable as zero, $2 to use an
+C explicit $0.
+define(`Z',`$1')
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+define(`vp_param', `%rcx') C FIXME reallocate vp to rcx but watch performance!
+define(`vn_param', `%r8')
+
+define(`un', `%r9')
+define(`vn', `(%rsp)')
+
+define(`v0', `%r10')
+define(`v1', `%r11')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r12')
+define(`i', `%r13')
+define(`vp', `%r14')
+
+define(`X0', `%r8')
+define(`X1', `%r15')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ mov (up), %rax C shared for mul_1 and mul_2
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ mov (vp_param), v0 C shared for mul_1 and mul_2
+
+ xor un, un
+ sub un_param, un C un = -un_param
+
+ lea (up,un_param,8), up
+ lea (rp,un_param,8), rp
+
+ mul v0 C shared for mul_1 and mul_2
+
+ test $1, R8(vn_param)
+ jz L(m2)
+
+ lea 8(vp_param), vp C FIXME: delay until known needed
+
+ test $1, R8(un)
+ jnz L(m1x1)
+
+L(m1x0):test $2, R8(un)
+ jnz L(m1s2)
+
+L(m1s0):
+ lea (un), i
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ mov %rdx, w0 C FIXME: Use lea?
+ lea L(do_am0)(%rip), %rbp
+ jmp L(m1e0)
+
+L(m1s2):
+ lea 2(un), i
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ mov %rdx, w0 C FIXME: Use lea?
+ mul v0
+ lea L(do_am2)(%rip), %rbp
+ test i, i
+ jnz L(m1e2)
+ add %rax, w0
+ adc $0, %rdx
+ mov w0, I(-8(rp),8(rp,un,8))
+ mov %rdx, I((rp),16(rp,un,8))
+ jmp L(ret2)
+
+L(m1x1):test $2, R8(un)
+ jz L(m1s3)
+
+L(m1s1):
+ lea 1(un), i
+ mov %rax, (rp,un,8)
+ test i, i
+ jz L(1)
+ mov 8(up,un,8), %rax
+ mov %rdx, w1 C FIXME: Use lea?
+ lea L(do_am1)(%rip), %rbp
+ jmp L(m1e1)
+L(1): mov %rdx, I((rp),8(rp,un,8))
+ jmp L(ret2)
+
+L(m1s3):
+ lea -1(un), i
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ mov %rdx, w1 C FIXME: Use lea?
+ lea L(do_am3)(%rip), %rbp
+ jmp L(m1e3)
+
+ ALIGNx
+L(m1top):
+ mul v0
+ mov w1, -16(rp,i,8)
+L(m1e2):xor R32(w1), R32(w1)
+ add %rax, w0
+ mov (up,i,8), %rax
+ adc %rdx, w1
+ mov w0, -8(rp,i,8)
+L(m1e1):xor R32(w0), R32(w0)
+ mul v0
+ add %rax, w1
+ mov 8(up,i,8), %rax
+ adc %rdx, w0
+ mov w1, (rp,i,8)
+L(m1e0):xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w0
+ mov 16(up,i,8), %rax
+ adc %rdx, w1
+ mov w0, 8(rp,i,8)
+L(m1e3):xor R32(w0), R32(w0)
+ mul v0
+ add %rax, w1
+ mov 24(up,i,8), %rax
+ adc %rdx, w0
+ add $4, i
+ js L(m1top)
+
+ mul v0
+ mov w1, I(-16(rp),-16(rp,i,8))
+ add %rax, w0
+ adc $0, %rdx
+ mov w0, I(-8(rp),-8(rp,i,8))
+ mov %rdx, I((rp),(rp,i,8))
+
+ dec vn_param
+ jz L(ret2)
+ lea -8(rp), rp
+ jmp *%rbp
+
+L(m2):
+ mov 8(vp_param), v1
+ lea 16(vp_param), vp C FIXME: delay until known needed
+
+ test $1, R8(un)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(un)
+ jnz L(b10)
+
+L(b00): lea (un), i
+ mov %rax, (rp,un,8)
+ mov %rdx, w1 C FIXME: Use lea?
+ mov (up,un,8), %rax
+ mov $0, R32(w2)
+ jmp L(m2e0)
+
+L(b10): lea -2(un), i
+ mov %rax, w2 C FIXME: Use lea?
+ mov (up,un,8), %rax
+ mov %rdx, w3 C FIXME: Use lea?
+ mov $0, R32(w0)
+ jmp L(m2e2)
+
+L(bx1): test $2, R8(un)
+ jz L(b11)
+
+L(b01): lea 1(un), i
+ mov %rax, (rp,un,8)
+ mov (up,un,8), %rax
+ mov %rdx, w0 C FIXME: Use lea?
+ mov $0, R32(w1)
+ jmp L(m2e1)
+
+L(b11): lea -1(un), i
+ mov %rax, w1 C FIXME: Use lea?
+ mov (up,un,8), %rax
+ mov %rdx, w2 C FIXME: Use lea?
+ mov $0, R32(w3)
+ jmp L(m2e3)
+
+ ALIGNx
+L(m2top0):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+L(m2e0):mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top0)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, I((rp),(rp,i,8))
+ mov w1, I(8(rp),8(rp,i,8))
+
+ add $-2, vn_param
+ jz L(ret2)
+
+L(do_am0):
+ push %r15
+ push vn_param
+
+L(olo0):
+ mov (vp), v0
+ mov 8(vp), v1
+ lea 16(vp), vp
+ lea 16(rp), rp
+ mov (up,un,8), %rax
+C lea 0(un), i
+ mov un, i
+ mul v0
+ mov %rax, X0
+ mov (up,un,8), %rax
+ MOV( %rdx, X1, 2)
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,un,8), w2
+ mov %rax, w3
+ jmp L(lo0)
+
+ ALIGNx
+L(am2top0):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+L(lo0): mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top0)
+
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add w2, X0
+ mov X0, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ addl $-2, vn
+ jnz L(olo0)
+
+L(ret): pop %rax
+ pop %r15
+L(ret2):pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+
+ ALIGNx
+L(m2top1):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(m2e1):mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top1)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, I((rp),(rp,i,8))
+ mov w1, I(8(rp),8(rp,i,8))
+
+ add $-2, vn_param
+ jz L(ret2)
+
+L(do_am1):
+ push %r15
+ push vn_param
+
+L(olo1):
+ mov (vp), v0
+ mov 8(vp), v1
+ lea 16(vp), vp
+ lea 16(rp), rp
+ mov (up,un,8), %rax
+ lea 1(un), i
+ mul v0
+ mov %rax, X1
+ MOV( %rdx, X0, 128)
+ mov (up,un,8), %rax
+ mov (rp,un,8), w1
+ mul v1
+ mov %rax, w2
+ mov 8(up,un,8), %rax
+ MOV( %rdx, w3, 1)
+ jmp L(lo1)
+
+ ALIGNx
+L(am2top1):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+L(lo1): mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top1)
+
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add w2, X0
+ mov X0, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ addl $-2, vn
+ jnz L(olo1)
+
+ pop %rax
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+
+ ALIGNx
+L(m2top2):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+L(m2e2):mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top2)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, I((rp),(rp,i,8))
+ mov w1, I(8(rp),8(rp,i,8))
+
+ add $-2, vn_param
+ jz L(ret2)
+
+L(do_am2):
+ push %r15
+ push vn_param
+
+L(olo2):
+ mov (vp), v0
+ mov 8(vp), v1
+ lea 16(vp), vp
+ lea 16(rp), rp
+ mov (up,un,8), %rax
+ lea -2(un), i
+ mul v0
+ mov %rax, X0
+ MOV( %rdx, X1, 32)
+ mov (up,un,8), %rax
+ mov (rp,un,8), w0
+ mul v1
+ mov %rax, w1
+ lea (%rdx), w2
+ mov 8(up,un,8), %rax
+ jmp L(lo2)
+
+ ALIGNx
+L(am2top2):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+L(lo2): mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top2)
+
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add w2, X0
+ mov X0, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ addl $-2, vn
+ jnz L(olo2)
+
+ pop %rax
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+
+ ALIGNx
+L(m2top3):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+L(m2e3):mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top3)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, I((rp),(rp,i,8))
+ mov w1, I(8(rp),8(rp,i,8))
+
+ add $-2, vn_param
+ jz L(ret2)
+
+L(do_am3):
+ push %r15
+ push vn_param
+
+L(olo3):
+ mov (vp), v0
+ mov 8(vp), v1
+ lea 16(vp), vp
+ lea 16(rp), rp
+ mov (up,un,8), %rax
+ lea -1(un), i
+ mul v0
+ mov %rax, X1
+ MOV( %rdx, X0, 8)
+ mov (up,un,8), %rax
+ mov (rp,un,8), w3
+ mul v1
+ mov %rax, w0
+ MOV( %rdx, w1, 16)
+ mov 8(up,un,8), %rax
+ jmp L(lo3)
+
+ ALIGNx
+L(am2top3):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+L(lo3): mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top3)
+
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add w2, X0
+ mov X0, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ addl $-2, vn
+ jnz L(olo3)
+
+ pop %rax
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm
new file mode 100644
index 0000000..0f03d86
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm
@@ -0,0 +1,427 @@
+dnl AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_2 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core 4.0 4.18-4.25
+C Intel NHM 3.75 4.06-4.2
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Implement proper cor2, replacing current cor0.
+C * Offset n by 2 in order to avoid the outer loop cmp. (And sqr_basecase?)
+C * Micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n_param', `%rcx')
+
+define(`v0', `%r10')
+define(`v1', `%r11')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r12')
+define(`n', `%r9')
+define(`i', `%r13')
+define(`vp', `%r8')
+
+define(`X0', `%r14')
+define(`X1', `%r15')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+
+ mov (up), %rax
+ mov vp_param, vp
+
+ cmp $4, n_param
+ jb L(small)
+
+ mov (vp_param), v0
+ push %rbx
+ lea (rp,n_param,8), rp C point rp at R[un]
+ push %rbp
+ lea (up,n_param,8), up C point up right after U's end
+ push %r12
+ mov $0, R32(n) C FIXME
+ sub n_param, n
+ push %r13
+ mul v0
+ mov 8(vp), v1
+
+ test $1, R8(n_param)
+ jnz L(m2x1)
+
+L(m2x0):test $2, R8(n_param)
+ jnz L(m2b2)
+
+L(m2b0):lea (n), i
+ mov %rax, (rp,n,8)
+ mov %rdx, w1
+ mov (up,n,8), %rax
+ xor R32(w2), R32(w2)
+ jmp L(m2e0)
+
+L(m2b2):lea -2(n), i
+ mov %rax, w2
+ mov (up,n,8), %rax
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ jmp L(m2e2)
+
+L(m2x1):test $2, R8(n_param)
+ jnz L(m2b3)
+
+L(m2b1):lea 1(n), i
+ mov %rax, (rp,n,8)
+ mov (up,n,8), %rax
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ jmp L(m2e1)
+
+L(m2b3):lea -1(n), i
+ xor R32(w3), R32(w3)
+ mov %rax, w1
+ mov %rdx, w2
+ mov (up,n,8), %rax
+ jmp L(m2e3)
+
+ ALIGNx
+L(m2tp):mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(m2e1):mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+L(m2e0):mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+L(m2e3):mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+L(m2e2):mul v1
+ mov $0, R32(w1) C FIXME: dead in last iteration
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0 C FIXME: dead in last iteration
+ add $4, i
+ js L(m2tp)
+
+L(m2ed):imul v0, %rax
+ add w3, %rax
+ mov %rax, I(-8(rp),-8(rp,i,8))
+
+ add $2, n
+ lea 16(vp), vp
+ lea -16(up), up
+ cmp $-2, n
+ jge L(cor1)
+
+ push %r14
+ push %r15
+
+L(outer):
+ mov (vp), v0
+ mov 8(vp), v1
+ mov (up,n,8), %rax
+ mul v0
+ test $1, R8(n)
+ jnz L(a1x1)
+
+L(a1x0):mov %rax, X1
+ MOV( %rdx, X0, 8)
+ mov (up,n,8), %rax
+ mul v1
+ test $2, R8(n)
+ jnz L(a110)
+
+L(a100):lea (n), i
+ mov (rp,n,8), w3
+ mov %rax, w0
+ MOV( %rdx, w1, 16)
+ jmp L(lo0)
+
+L(a110):lea 2(n), i
+ mov (rp,n,8), w1
+ mov %rax, w2
+ mov 8(up,n,8), %rax
+ MOV( %rdx, w3, 1)
+ jmp L(lo2)
+
+L(a1x1):mov %rax, X0
+ MOV( %rdx, X1, 2)
+ mov (up,n,8), %rax
+ mul v1
+ test $2, R8(n)
+ jz L(a111)
+
+L(a101):lea 1(n), i
+ MOV( %rdx, w0, 4)
+ mov (rp,n,8), w2
+ mov %rax, w3
+ jmp L(lo1)
+
+L(a111):lea -1(n), i
+ MOV( %rdx, w2, 64)
+ mov %rax, w1
+ mov (rp,n,8), w0
+ mov 8(up,n,8), %rax
+ jmp L(lo3)
+
+ ALIGNx
+L(top): mul v1
+ add w0, w1
+ adc %rax, w2
+ mov -8(up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+L(lo2): mul v0
+ add w1, X1
+ mov X1, -16(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov -8(up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov -8(rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+L(lo1): mov (up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, -8(rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov (up,i,8), %rax
+ mov (rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+L(lo0): mov 8(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, (rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 8(rp,i,8), w3
+ adc $0, X1
+ mov 8(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 16(up,i,8), %rax
+ adc $0, w2
+L(lo3): mul v0
+ add w0, X0
+ mov X0, 8(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 16(up,i,8), %rax
+ mov 16(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(top)
+
+L(end): imul v1, %rax
+ add w0, w1
+ adc %rax, w2
+ mov I(-8(up),-8(up,i,8)), %rax
+ imul v0, %rax
+ add w1, X1
+ mov X1, I(-16(rp),-16(rp,i,8))
+ adc X0, %rax
+ mov I(-8(rp),-8(rp,i,8)), w1
+ add w1, w2
+ add w2, %rax
+ mov %rax, I(-8(rp),-8(rp,i,8))
+
+ add $2, n
+ lea 16(vp), vp
+ lea -16(up), up
+ cmp $-2, n
+ jl L(outer)
+
+ pop %r15
+ pop %r14
+
+ jnz L(cor0)
+
+L(cor1):mov (vp), v0
+ mov 8(vp), v1
+ mov -16(up), %rax
+ mul v0 C u0 x v2
+ add -16(rp), %rax C FIXME: rp[0] still available in reg?
+ adc -8(rp), %rdx C FIXME: rp[1] still available in reg?
+ mov -8(up), %rbx
+ imul v0, %rbx
+ mov -16(up), %rcx
+ imul v1, %rcx
+ mov %rax, -16(rp)
+ add %rbx, %rcx
+ add %rdx, %rcx
+ mov %rcx, -8(rp)
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(cor0):mov (vp), %r11
+ imul -8(up), %r11
+ add %rax, %r11
+ mov %r11, -8(rp)
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(small):
+ cmp $2, n_param
+ jae L(gt1)
+L(n1): imul (vp_param), %rax
+ mov %rax, (rp)
+ FUNC_EXIT()
+ ret
+L(gt1): ja L(gt2)
+L(n2): mov (vp_param), %r9
+ mul %r9
+ mov %rax, (rp)
+ mov 8(up), %rax
+ imul %r9, %rax
+ add %rax, %rdx
+ mov 8(vp), %r9
+ mov (up), %rcx
+ imul %r9, %rcx
+ add %rcx, %rdx
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+L(gt2):
+L(n3): mov (vp_param), %r9
+ mul %r9 C u0 x v0
+ mov %rax, (rp)
+ mov %rdx, %r10
+ mov 8(up), %rax
+ mul %r9 C u1 x v0
+ imul 16(up), %r9 C u2 x v0
+ add %rax, %r10
+ adc %rdx, %r9
+ mov 8(vp), %r11
+ mov (up), %rax
+ mul %r11 C u0 x v1
+ add %rax, %r10
+ adc %rdx, %r9
+ imul 8(up), %r11 C u1 x v1
+ add %r11, %r9
+ mov %r10, 8(rp)
+ mov 16(vp), %r10
+ mov (up), %rax
+ imul %rax, %r10 C u0 x v2
+ add %r10, %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
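mullo only needs the low half of the product, which is why the outer loop can stop two columns early and the last limbs are finished with the imul-based cor0/cor1 blocks. A naive sketch of the contract, using public mpn calls and a hypothetical helper name, purely for illustration:

/* Reference sketch: rp receives the low n limbs of {up,n} * {vp,n}. */
#include <gmp.h>

void
ref_mullo (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  mp_limb_t tmp[2 * n];            /* C99 VLA, illustration only */
  mpn_mul_n (tmp, up, vp, n);      /* full 2n-limb product */
  mpn_copyi (rp, tmp, n);          /* keep only the low half */
}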
diff --git a/gmp-6.3.0/mpn/x86_64/core2/popcount.asm b/gmp-6.3.0/mpn/x86_64/core2/popcount.asm
new file mode 100644
index 0000000..3de69d8
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/popcount.asm
@@ -0,0 +1,185 @@
+dnl AMD64 SSSE3 mpn_popcount -- population count.
+
+dnl Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb good for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 1.79-1.91 n
+C AMD bd2 1.73-1.85 n
+C AMD bd3 ?
+C AMD bd4 1.73-1.85 n
+C AMD zen 1.47 n
+C AMD bobcat 8.0 n
+C AMD jaguar 4.78 n
+C Intel P4 n/a
+C Intel CNR 3.75
+C Intel PNR 2.61 y
+C Intel NHM 2.03 n
+C Intel SBR 1.87 n
+C Intel IBR 1.52-1.58 n
+C Intel HWL 1.52-1.58 n
+C Intel BWL 1.52-1.58 n
+C Intel SKL 1.51 n
+C Intel atom 12.3 n
+C Intel SLM 9.1 n
+C VIA nano ?
+
+C TODO
+C * This was hand-written without too much thought about optimal insn
+C   selection; check to see if it can be improved.
+C * Consider doing some instruction scheduling.
+
+define(`up', `%rdi')
+define(`n', `%rsi')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_popcount)
+ lea L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)',
+ `define(`OFF1',64) define(`OFF2',80)')
+ movdqa OFF1`'(%r9), %xmm7
+ movdqa OFF2`'(%r9), %xmm6
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm8, %xmm8
+
+ mov R32(n), R32(%rax)
+ and $7, R32(%rax)
+ifdef(`PIC',`
+ movslq (%r9,%rax,4), %rax
+ add %r9, %rax
+ jmp *%rax
+',`
+ jmp *(%r9,%rax,8)
+')
+
+L(1): movq (up), %xmm1
+ add $8, up
+ jmp L(e1)
+
+L(2): add $-48, up
+ jmp L(e2)
+
+L(3): movq (up), %xmm1
+ add $-40, up
+ jmp L(e3)
+
+L(4): add $-32, up
+ jmp L(e4)
+
+L(5): movq (up), %xmm1
+ add $-24, up
+ jmp L(e5)
+
+L(6): add $-16, up
+ jmp L(e6)
+
+L(7): movq (up), %xmm1
+ add $-8, up
+ jmp L(e7)
+
+ ALIGN(32)
+L(top): lddqu (up), %xmm1
+L(e7): movdqa %xmm6, %xmm0 C copy mask register
+ movdqa %xmm7, %xmm2 C copy count register
+ movdqa %xmm7, %xmm3 C copy count register
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e6): lddqu 16(up), %xmm1
+L(e5): movdqa %xmm6, %xmm0
+ movdqa %xmm7, %xmm2
+ movdqa %xmm7, %xmm3
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e4): lddqu 32(up), %xmm1
+L(e3): movdqa %xmm6, %xmm0
+ movdqa %xmm7, %xmm2
+ movdqa %xmm7, %xmm3
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e2): lddqu 48(up), %xmm1
+ add $64, up
+L(e1): movdqa %xmm6, %xmm0
+ movdqa %xmm7, %xmm2
+ movdqa %xmm7, %xmm3
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ psadbw %xmm5, %xmm4 C sum to 8 x 16-bit counts
+ paddb %xmm2, %xmm3
+ paddq %xmm4, %xmm8 C sum to 2 x 64-bit counts
+ movdqa %xmm3, %xmm4
+ sub $8, n
+ jg L(top)
+
+ psadbw %xmm5, %xmm4
+ paddq %xmm4, %xmm8
+ pshufd $14, %xmm8, %xmm0
+ paddq %xmm8, %xmm0
+ movd %xmm0, %rax
+ ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+ JMPENT( L(top), L(cnsts))
+ JMPENT( L(1), L(cnsts))
+ JMPENT( L(2), L(cnsts))
+ JMPENT( L(3), L(cnsts))
+ JMPENT( L(4), L(cnsts))
+ JMPENT( L(5), L(cnsts))
+ JMPENT( L(6), L(cnsts))
+ JMPENT( L(7), L(cnsts))
+ .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+ .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
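In the constant block above, the first two .byte rows are a per-nibble population-count table for pshufb and the two rows of 0x0f mask out nibbles; each input byte is split into its low and high nibble, both counts are looked up in parallel, and psadbw periodically folds the per-byte counters into 64-bit accumulators. The same table-lookup idea spelled out in scalar C, as an illustrative sketch only (function name hypothetical):

static const unsigned char nibble_popcount[16] =
  { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };   /* same bytes as L(cnsts) above */

unsigned
ref_popcount_limb (unsigned long long x)    /* one 64-bit limb */
{
  unsigned sum = 0;
  for (int i = 0; i < 16; i++, x >>= 4)     /* 16 nibbles per limb */
    sum += nibble_popcount[x & 0xf];
  return sum;
}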
diff --git a/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm b/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm
new file mode 100644
index 0000000..8c296fd
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm
@@ -0,0 +1,430 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Conroe and Wolfdale.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core 4.5 (fluctuating)
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single basecases out before the pushes.
+C * Keep up[i] in registers for basecases (might require pushes).
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+C X q0' n X rp up u0i mp q0 i j
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea (mp_param,n,8), mp
+ lea -16(up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(b0)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+ cmp $-3, R32(n)
+ jz L(n3)
+
+ push rp
+
+L(otp1):lea 3(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ lea (%rax), %rbp
+ mov 8(mp,n,8), %rax
+ lea (%rdx), %r9
+ mul q0
+ lea (%rax), %r11
+ mov 16(mp,n,8), %rax
+ mov 16(up,n,8), %r10
+ lea (%rdx), %rdi
+ mul q0
+ add %rbp, %r10
+ lea (%rax), %rbp
+ mov 24(mp,n,8), %rax
+ adc %r9, %r11
+ mov 24(up,n,8), %rbx
+ lea (%rdx), %r9
+ adc $0, %rdi
+ mul q0
+ add %r11, %rbx
+ lea (%rax), %r11
+ mov 32(mp,n,8), %rax
+ adc %rdi, %rbp
+ mov %rbx, 24(up,n,8)
+ mov 32(up,n,8), %r10
+ lea (%rdx), %rdi
+ adc $0, %r9
+ imul u0inv, %rbx C next q limb
+ add $2, i
+ jns L(ed1)
+
+ ALIGNx
+L(tp1): mul q0
+ add %rbp, %r10
+ lea (%rax), %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %r10, -8(up,i,8)
+ mov (up,i,8), %r10
+ lea (%rdx), %r9
+ adc $0, %rdi
+ mul q0
+ add %r11, %r10
+ lea (%rax), %r11
+ mov 8(mp,i,8), %rax
+ adc %rdi, %rbp
+ mov %r10, (up,i,8)
+ mov 8(up,i,8), %r10
+ lea (%rdx), %rdi
+ adc $0, %r9
+ add $2, i
+ js L(tp1)
+
+L(ed1): mul q0
+ add %rbp, %r10
+ adc %r9, %r11
+ mov %r10, I(-8(up),-8(up,i,8))
+ mov I((up),(up,i,8)), %r10
+ adc $0, %rdi
+ add %r11, %r10
+ adc %rdi, %rax
+ mov %r10, I((up),(up,i,8))
+ mov I(8(up),8(up,i,8)), %r10
+ adc $0, %rdx
+ add %rax, %r10
+ mov %r10, I(8(up),8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, 16(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b0): cmp $-2, R32(n)
+ jz L(n2)
+ cmp $-4, R32(n)
+ jz L(n4)
+
+ push rp
+
+L(otp0):lea 4(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ lea (%rax), %r11
+ mov 8(mp,n,8), %rax
+ lea (%rdx), %rdi
+ mul q0
+ lea (%rax), %rbp
+ mov 16(mp,n,8), %rax
+ mov 16(up,n,8), %r10
+ lea (%rdx), %r9
+ mul q0
+ add %r11, %r10
+ lea (%rax), %r11
+ mov 24(mp,n,8), %rax
+ adc %rdi, %rbp
+ mov 24(up,n,8), %rbx
+ lea (%rdx), %rdi
+ adc $0, %r9
+ mul q0
+ add %rbp, %rbx
+ lea (%rax), %rbp
+ mov 32(mp,n,8), %rax
+ adc %r9, %r11
+ mov %rbx, 24(up,n,8)
+ mov 32(up,n,8), %r10
+ lea (%rdx), %r9
+ adc $0, %rdi
+ imul u0inv, %rbx C next q limb
+ jmp L(e0)
+
+ ALIGNx
+L(tp0): mul q0
+ add %rbp, %r10
+ lea (%rax), %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %r10, -8(up,i,8)
+ mov (up,i,8), %r10
+ lea (%rdx), %r9
+ adc $0, %rdi
+L(e0): mul q0
+ add %r11, %r10
+ lea (%rax), %r11
+ mov 8(mp,i,8), %rax
+ adc %rdi, %rbp
+ mov %r10, (up,i,8)
+ mov 8(up,i,8), %r10
+ lea (%rdx), %rdi
+ adc $0, %r9
+ add $2, i
+ js L(tp0)
+
+L(ed0): mul q0
+ add %rbp, %r10
+ adc %r9, %r11
+ mov %r10, I(-8(up),-8(up,i,8))
+ mov I((up),(up,i,8)), %r10
+ adc $0, %rdi
+ add %r11, %r10
+ adc %rdi, %rax
+ mov %r10, I((up),(up,i,8))
+ mov I(8(up),8(up,i,8)), %r10
+ adc $0, %rdx
+ add %rax, %r10
+ mov %r10, I(8(up),8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, 16(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp0)
+
+L(cj): lea 16(up), up C FIXME
+ pop rp
+L(add_n):
+IFSTD(` lea (up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea (up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+IFSTD(` sub $8, %rsp ')
+IFDOS(` sub $40, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_add_n)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(n1): mov (mp_param), %rax
+ mul q0
+ add 8(up), %rax
+ adc 16(up), %rdx
+ mov %rdx, (rp)
+ mov $0, R32(%rax)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+L(n2): mov (mp_param), %rax
+ mov (up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov 8(up), %r10
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, q0
+ imul u0inv, q0 C next q0
+ mov -16(mp), %rax
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov 16(up), %r14
+ mul q0
+ add %rax, %r14
+ adc $0, %rdx
+ add %r9, %r14
+ adc $0, %rdx
+ xor R32(%rax), R32(%rax)
+ add %r11, %r14
+ adc 24(up), %rdx
+ mov %r14, (rp)
+ mov %rdx, 8(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+ ALIGNx
+L(n3): mov -24(mp), %rax
+ mov -8(up), %r10
+ mul q0
+ add %rax, %r10
+ mov -16(mp), %rax
+ mov %rdx, %r11
+ adc $0, %r11
+ mov (up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ add %r11, %rbp
+ mov 8(up), %r10
+ adc $0, %r9
+ mul q0
+ mov %rbp, q0
+ imul u0inv, q0 C next q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ mov %rbp, (up)
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, 8(up)
+ mov %r11, -8(up) C up[0]
+ lea 8(up), up C up++
+ dec j
+ jnz L(n3)
+
+ mov -32(up), %rdx
+ mov -24(up), %rbx
+ xor R32(%rax), R32(%rax)
+ add %rbp, %rdx
+ adc %r10, %rbx
+ adc 8(up), %r11
+ mov %rdx, (rp)
+ mov %rbx, 8(rp)
+ mov %r11, 16(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+ ALIGNx
+L(n4): mov -32(mp), %rax
+ mul q0
+ lea (%rax), %r11
+ mov -24(mp), %rax
+ lea (%rdx), %r14
+ mul q0
+ lea (%rax), %rbp
+ mov -16(mp), %rax
+ mov -16(up), %r10
+ lea (%rdx), %r9
+ mul q0
+ add %r11, %r10
+ lea (%rax), %r11
+ mov -8(mp), %rax
+ adc %r14, %rbp
+ mov -8(up), %rbx
+ lea (%rdx), %r14
+ adc $0, %r9
+ mul q0
+ add %rbp, %rbx
+ adc %r9, %r11
+ mov %rbx, -8(up)
+ mov (up), %r10
+ adc $0, %r14
+ imul u0inv, %rbx C next q limb
+ add %r11, %r10
+ adc %r14, %rax
+ mov %r10, (up)
+ mov 8(up), %r10
+ adc $0, %rdx
+ add %rax, %r10
+ mov %r10, 8(up)
+ adc $0, %rdx
+ mov %rdx, -16(up) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(n4)
+ lea 16(up), up
+ jmp L(add_n)
+EPILOGUE()
+ASM_END()
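The loop structure above follows word-by-word Montgomery reduction: each round computes q = up[0] * u0inv mod 2^64 (with u0inv evidently -1/mp[0] mod 2^64, so the low limb cancels), adds q*m so that the lowest live limb becomes zero, parks the addmul carry in that freed limb, and finally the two halves are folded with mpn_add_n, hence the call arrangement around L(add_n). A hedged C model of that algorithm follows; it is a sketch with a hypothetical name, not a statement of GMP's internal API contract.

#include <gmp.h>

mp_limb_t
ref_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t u0inv)
{
  for (mp_size_t j = 0; j < n; j++)
    {
      mp_limb_t q  = up[0] * u0inv;                /* chosen so up[0] + q*mp[0] == 0 mod 2^64 */
      mp_limb_t cy = mpn_addmul_1 (up, mp, n, q);  /* up[0] is now zero */
      up[0] = cy;                                  /* park the carry in the freed limb */
      up++;
    }
  return mpn_add_n (rp, up, up - n, n);  /* fold halves; caller subtracts m on carry out */
}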
diff --git a/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm
new file mode 100644
index 0000000..27eed37
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm
@@ -0,0 +1,169 @@
+dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Conroe/Penryn.
+
+dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 3.05
+C Intel NHM 3.3
+C Intel SBR 2.5
+C Intel atom ?
+C VIA nano ?
+
+C TODO
+C * Loopmix to approach 2.5 c/l on NHM.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_rsh1add_n)
+ define(func_nc, mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsh1sub_n)
+ define(func_nc, mpn_rsh1sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+
+ neg %r8 C set C flag from parameter
+ mov (up), %r8
+ ADCSBB (vp), %r8
+ jmp L(ent)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (up), %r8
+ ADDSUB (vp), %r8
+L(ent): sbb R32(%rbx), R32(%rbx) C save cy
+ mov %r8, %rax
+ and $1, R32(%rax) C return value
+
+ lea (up,n,8), up
+ lea (vp,n,8), vp
+ lea (rp,n,8), rp
+ mov R32(n), R32(%rbp)
+ neg n
+ and $3, R32(%rbp)
+ jz L(b0)
+ cmp $2, R32(%rbp)
+ jae L(n1)
+
+L(b1): mov %r8, %rbp
+ inc n
+ js L(top)
+ jmp L(end)
+
+L(n1): jnz L(b3)
+ add R32(%rbx), R32(%rbx) C restore cy
+ mov 8(up,n,8), %r11
+ ADCSBB 8(vp,n,8), %r11
+ sbb R32(%rbx), R32(%rbx) C save cy
+ mov %r8, %r10
+ add $-2, n
+ jmp L(2)
+
+L(b3): add R32(%rbx), R32(%rbx) C restore cy
+ mov 8(up,n,8), %r10
+ mov 16(up,n,8), %r11
+ ADCSBB 8(vp,n,8), %r10
+ ADCSBB 16(vp,n,8), %r11
+ sbb R32(%rbx), R32(%rbx) C save cy
+ mov %r8, %r9
+ dec n
+ jmp L(3)
+
+L(b0): add R32(%rbx), R32(%rbx) C restore cy
+ mov 8(up,n,8), %r9
+ mov 16(up,n,8), %r10
+ mov 24(up,n,8), %r11
+ ADCSBB 8(vp,n,8), %r9
+ ADCSBB 16(vp,n,8), %r10
+ ADCSBB 24(vp,n,8), %r11
+ sbb R32(%rbx), R32(%rbx) C save cy
+ jmp L(4)
+
+ ALIGN(16)
+
+L(top): add R32(%rbx), R32(%rbx) C restore cy
+ mov (up,n,8), %r8
+ mov 8(up,n,8), %r9
+ mov 16(up,n,8), %r10
+ mov 24(up,n,8), %r11
+ ADCSBB (vp,n,8), %r8
+ ADCSBB 8(vp,n,8), %r9
+ ADCSBB 16(vp,n,8), %r10
+ ADCSBB 24(vp,n,8), %r11
+ sbb R32(%rbx), R32(%rbx) C save cy
+ shrd $1, %r8, %rbp
+ mov %rbp, -8(rp,n,8)
+L(4): shrd $1, %r9, %r8
+ mov %r8, (rp,n,8)
+L(3): shrd $1, %r10, %r9
+ mov %r9, 8(rp,n,8)
+L(2): shrd $1, %r11, %r10
+ mov %r10, 16(rp,n,8)
+L(1): add $4, n
+ mov %r11, %rbp
+ js L(top)
+
+L(end): shrd $1, %rbx, %rbp
+ mov %rbp, -8(rp)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
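The rsh1 primitives compute rp = (up +/- vp) >> 1 and return the bit shifted out at the low end; the carry (or borrow) out of the top of the add/subtract becomes the new most significant bit, which is why the code keeps re-saving it in %rbx around every unrolled block. A sketch of the add flavour in C, assuming 64-bit limbs, the public mpn entry points, and a hypothetical helper name:

#include <gmp.h>

mp_limb_t
ref_rsh1add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  mp_limb_t cy = mpn_add_n (rp, up, vp, n);   /* rp = up + vp, carry out of the top */
  mp_limb_t lowbit = rp[0] & 1;               /* the bit about to be shifted out */
  mpn_rshift (rp, rp, n, 1);                  /* halve in place */
  rp[n - 1] |= cy << 63;                      /* carry becomes the new top bit */
  return lowbit;
}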
diff --git a/gmp-6.3.0/mpn/x86_64/core2/rshift.asm b/gmp-6.3.0/mpn/x86_64/core2/rshift.asm
new file mode 100644
index 0000000..7578a53
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/rshift.asm
@@ -0,0 +1,143 @@
+dnl x86-64 mpn_rshift optimised for Conroe/Penryn and Nehalem.
+
+dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core2 1.32
+C Intel NHM 1.30 (drops to 2.5 for n > 256)
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_rshift)
+ FUNC_ENTRY(4)
+
+ xor R32(%rax), R32(%rax)
+
+ test $1, R8(n)
+ jnz L(bx1)
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): lea 8(up), up
+ lea -24(rp), rp
+ mov -8(up), %r10
+ mov (up), %r11
+ shrd R8(cnt), %r10, %rax
+ mov 8(up), %r8
+ shr $2, n
+ jmp L(00)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): lea 16(up), up
+ lea -16(rp), rp
+ mov -16(up), %r9
+ shrd R8(cnt), %r9, %rax
+ shr $2, n
+ jz L(1)
+ mov -8(up), %r10
+ mov (up), %r11
+ jmp L(01)
+
+L(b10): lea 24(up), up
+ lea -8(rp), rp
+ mov -24(up), %r8
+ mov -16(up), %r9
+ shrd R8(cnt), %r8, %rax
+ shr $2, n
+ jz L(2)
+ mov -8(up), %r10
+ jmp L(10)
+
+L(b11): lea 32(up), up
+ mov -32(up), %r11
+ mov -24(up), %r8
+ mov -16(up), %r9
+ shrd R8(cnt), %r11, %rax
+ shr $2, n
+ jz L(end)
+
+ ALIGN(16)
+L(top): shrd R8(cnt), %r8, %r11
+ mov -8(up), %r10
+ mov %r11, (rp)
+L(10): shrd R8(cnt), %r9, %r8
+ mov (up), %r11
+ mov %r8, 8(rp)
+L(01): shrd R8(cnt), %r10, %r9
+ mov 8(up), %r8
+ mov %r9, 16(rp)
+L(00): shrd R8(cnt), %r11, %r10
+ mov 16(up), %r9
+ add $32, up
+ mov %r10, 24(rp)
+ add $32, rp
+ dec n
+ jnz L(top)
+
+L(end): shrd R8(cnt), %r8, %r11
+ mov %r11, (rp)
+L(2): shrd R8(cnt), %r9, %r8
+ mov %r8, 8(rp)
+L(1): shr R8(cnt), %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sec_tabselect.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm
new file mode 100644
index 0000000..a112c1b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm
@@ -0,0 +1,984 @@
+dnl X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere.
+dnl It also seems good for Conroe/Wolfdale.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core 4.9 4.18-4.25 3.87
+C Intel NHM 3.8 4.06-4.2 3.5
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C Code structure:
+C
+C
+C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4)
+C | | | |
+C | | | |
+C | | | |
+C \|/ \|/ \|/ \|/
+C ____________ ____________
+C / \ / \
+C \|/ \ \|/ \
+C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4)
+C \ /|\ \ /|\
+C \____________/ \____________/
+C \ /
+C \ /
+C \ /
+C tail(0m2) tail(1m2)
+C \ /
+C \ /
+C sqr_diag_addlsh1
+
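+C That is, the mul_2 and addmul_2 passes accumulate the off-diagonal
+C products u[i]*u[j] (i < j), and sqr_diag_addlsh1 completes the square as
+C   u^2 = sum(u[i]^2 * B^(2i)) + 2 * sum(u[i]*u[j] * B^(i+j), i < j),
+C i.e. it adds the diagonal squares to twice the accumulated triangle.
+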
+C TODO
+C * Tune. None done so far.
+C * Currently 2761 bytes; making it smaller would be nice.
+C * Consider using a jumptab-based entry sequence. One might even use a mask-
+C less sequence, if the table is large enough to support tuneup's needs.
+C The code would be, using non-PIC code,
+C lea tab(%rip),%rax; jmp *(n,%rax)
+C or,
+C lea tab(%rip),%rax; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx
+C using PIC code. The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,..
+C with the last four entries repeated a safe number of times.
+C * Consider expanding feed-in code in order to avoid zeroing registers.
+C * Zero consistently with xor.
+C * Check if using "lea (reg),reg" should be done in more places; we have some
+C explicit "mov %rax,reg" now.
+C * Try zeroing with xor in m2 loops.
+C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
+C between loop header and wind-down code.
+C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This saves a byte.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+C Define this to $1 to use the late loop index variable as zero, or to $2
+C to use an explicit $0.
+define(`Z',`$1')
+
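+C With the definitions above, both I and Z expand to their first argument;
+C redefining either one as `$2' switches every use site to its second,
+C more explicit operand.
+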
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param', `%rdx')
+
+define(`n', `%r8')
+
+define(`v0', `%r10')
+define(`v1', `%r11')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r9')
+define(`i', `%r13')
+
+define(`X0', `%r12')
+define(`X1', `%r14')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
+
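+C MOV(src,dst,mask) emits a plain mov when the mask bit is clear in the
+C tuning constant N, and an equivalent "lea (src), dst" when it is set, so
+C individual move sites can be flipped without editing each instruction.
+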
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+
+ cmp $4, n_param
+ jl L(small)
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ mov (up), v0
+ mov 8(up), %rax
+ mov %rax, v1
+
+ mov $1, R32(n)
+ sub n_param, n C n = -n_param+1
+ push n
+
+ lea (up,n_param,8), up
+ lea (rp,n_param,8), rp
+
+ mul v0
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ mov %rax, (rp,n,8)
+ jnz L(b10)
+
+L(b00): lea (n), i C n = 5, 9, ...
+ mov %rdx, w1 C FIXME: Use lea?
+ xor R32(w2), R32(w2)
+ jmp L(m2e0)
+
+L(b10): lea 2(n), i C n = 7, 11, ...
+ mov 8(up,n,8), %rax
+ mov %rdx, w3 C FIXME: Use lea?
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ jmp L(m2e2)
+
+L(bx1): test $2, R8(n)
+ mov %rax, (rp,n,8)
+ jz L(b11)
+
+L(b01): lea 1(n), i C n = 6, 10, ...
+ mov %rdx, w0 C FIXME: Use lea?
+ xor R32(w1), R32(w1)
+ jmp L(m2e1)
+
+L(b11): lea -1(n), i C n = 4, 8, 12, ...
+ mov %rdx, w2 C FIXME: Use lea?
+ xor R32(w3), R32(w3)
+ jmp L(m2e3)
+
+
+ ALIGNx
+L(m2top1):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+L(m2e1):mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top1)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w0, %rax
+ adc w1, %rdx
+ mov %rax, I((rp),(rp,i,8))
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n C decrease |n|
+ jmp L(am2o3)
+
+ ALIGNx
+L(m2top3):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+L(m2e3):mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top3)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w0, %rax
+ adc w1, %rdx
+ mov %rax, I((rp),(rp,i,8))
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n C decrease |n|
+ cmp $-1, n
+ jz L(cor1) C jumps iff entry n = 4
+
+L(am2o1):
+ mov -8(up,n,8), v0
+ mov (up,n,8), %rax
+ mov %rax, v1
+ lea 1(n), i
+ mul v0
+ mov %rax, X1
+ MOV( %rdx, X0, 128)
+ mov (rp,n,8), w1
+ xor R32(w2), R32(w2)
+ mov 8(up,n,8), %rax
+ xor R32(w3), R32(w3)
+ jmp L(lo1)
+
+ ALIGNx
+L(am2top1):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+L(lo1): mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top1)
+
+ mul v1
+ add w0, w1
+ adc w2, %rax
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add X0, %rax
+ mov %rax, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n
+
+L(am2o3):
+ mov -8(up,n,8), v0
+ mov (up,n,8), %rax
+ mov %rax, v1
+ lea -1(n), i
+ mul v0
+ mov %rax, X1
+ MOV( %rdx, X0, 8)
+ mov (rp,n,8), w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ mov 8(up,n,8), %rax
+ jmp L(lo3)
+
+ ALIGNx
+L(am2top3):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+L(lo3): mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top3)
+
+ mul v1
+ add w0, w1
+ adc w2, %rax
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add X0, %rax
+ mov %rax, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n
+ cmp $-1, n
+ jnz L(am2o1)
+
+L(cor1):pop n
+ mov %rdx, w3
+ mov -16(up), v0
+ mov -8(up), %rax
+ mul v0
+ add w3, %rax
+ adc $0, %rdx
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+ jmp L(sqr_diag_addlsh1)
+
+ ALIGNx
+L(m2top2):
+L(m2e2):mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top2)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w0, %rax
+ adc w1, %rdx
+ mov %rax, I((rp),(rp,i,8))
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n C decrease |n|
+ jmp L(am2o0)
+
+ ALIGNx
+L(m2top0):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+L(m2e0):mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top0)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w0, %rax
+ adc w1, %rdx
+ mov %rax, I((rp),(rp,i,8))
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n C decrease |n|
+ cmp $-2, n
+ jz L(cor2) C jumps iff entry n = 5
+
+L(am2o2):
+ mov -8(up,n,8), v0
+ mov (up,n,8), %rax
+ mov %rax, v1
+ lea -2(n), i
+ mul v0
+ mov %rax, X0
+ MOV( %rdx, X1, 32)
+ mov (rp,n,8), w0
+ xor R32(w1), R32(w1)
+ xor R32(w2), R32(w2)
+ mov 8(up,n,8), %rax
+ jmp L(lo2)
+
+ ALIGNx
+L(am2top2):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+L(lo2): mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top2)
+
+ mul v1
+ add w0, w1
+ adc w2, %rax
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add X0, %rax
+ mov %rax, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n
+
+L(am2o0):
+ mov -8(up,n,8), v0
+ mov (up,n,8), %rax
+ mov %rax, v1
+ lea 0(n), i
+ mul v0
+ mov %rax, X0
+ MOV( %rdx, X1, 2)
+ xor R32(w0), R32(w0)
+ mov (rp,n,8), w2
+ xor R32(w3), R32(w3)
+ jmp L(lo0)
+
+ ALIGNx
+L(am2top0):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+L(lo0): mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top0)
+
+ mul v1
+ add w0, w1
+ adc w2, %rax
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add X0, %rax
+ mov %rax, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n
+ cmp $-2, n
+ jnz L(am2o2)
+
+L(cor2):pop n
+ mov -24(up), v0
+ mov %rax, w2
+ mov %rdx, w0
+ mov -16(up), %rax
+ mov %rax, v1
+ mul v0
+ mov %rax, X0
+ MOV( %rdx, X1, 32)
+ mov -8(up), %rax
+ mul v0
+ add w2, X0
+ mov X0, -16(rp)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov -8(up), %rax
+ adc $0, X0
+ mul v1
+ add w0, X1
+ adc $0, X0
+ mov X1, -8(rp)
+ add X0, %rax
+ mov %rax, (rp)
+ adc $0, %rdx
+ mov %rdx, 8(rp)
+ lea 8(rp), rp
+
+L(sqr_diag_addlsh1):
+ mov -8(up,n,8), %rax
+ shl n
+ xor R32(%rbx), R32(%rbx)
+ mul %rax
+ mov 8(rp,n,8), %r11
+ lea (%rdx), %r10
+ mov 16(rp,n,8), %r9
+ add %r11, %r11
+ jmp L(dm)
+
+ ALIGNx
+L(dtop):mul %rax
+ add %r11, %r10
+ mov 8(rp,n,8), %r11
+ mov %r10, -8(rp,n,8)
+ adc %r9, %rax
+ lea (%rdx,%rbx), %r10
+ mov 16(rp,n,8), %r9
+ adc %r11, %r11
+L(dm): mov %rax, (rp,n,8)
+ mov (up,n,4), %rax
+ adc %r9, %r9
+ setc R8(%rbx)
+ add $2, n
+ js L(dtop)
+
+ mul %rax
+ add %r11, %r10
+ mov %r10, -8(rp)
+ adc %r9, %rax
+ lea (%rdx,%rbx), %r10
+ mov %rax, (rp)
+ adc $0, %r10
+ mov %r10, 8(rp)
+
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(small):
+ mov (up), %rax
+ cmp $2, n_param
+ jae L(gt1)
+L(n1):
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
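+C n = 2: (u1*B + u0)^2 = u1^2*B^2 + 2*u1*u0*B + u0^2, so the single cross
+C product u1*u0 is added in twice below.
+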
+L(gt1): jne L(gt2)
+L(n2): mov %rax, %r8
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, %r9
+ mul %rax
+ mov %rax, %r10
+ mov %r11, %rax
+ mov %rdx, %r11
+ mul %r8
+ xor %r8, %r8
+ add %rax, %r9
+ adc %rdx, %r10
+ adc %r8, %r11
+ add %rax, %r9
+ mov %r9, 8(rp)
+ adc %rdx, %r10
+ mov %r10, 16(rp)
+ adc %r8, %r11
+ mov %r11, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt2):
+L(n3): mov %rax, %r10
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, 8(rp)
+ mul %rax
+ mov 16(up), %rcx
+ mov %rax, 16(rp)
+ mov %rcx, %rax
+ mov %rdx, 24(rp)
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+
+ mov %r11, %rax
+ mul %r10
+ mov %rax, %r8
+ mov %rcx, %rax
+ mov %rdx, %r9
+ mul %r10
+ xor %r10, %r10
+ add %rax, %r9
+ mov %r11, %rax
+ mov %r10, %r11
+ adc %rdx, %r10
+
+ mul %rcx
+ add %rax, %r10
+ adc %r11, %rdx
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %rdx, %rdx
+ adc %r11, %r11
+ add %r8, 8(rp)
+ adc %r9, 16(rp)
+ adc %r10, 24(rp)
+ adc %rdx, 32(rp)
+ adc %r11, 40(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm
new file mode 100644
index 0000000..46488fc
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm
@@ -0,0 +1,47 @@
+dnl AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 63)
+
+define(ADDSUB, sub)
+define(ADCSBB, sbb)
+define(func, mpn_sublsh1_n)
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm
new file mode 100644
index 0000000..f3b1e28
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm
@@ -0,0 +1,47 @@
+dnl AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+define(ADDSUB, sub)
+define(ADCSBB, sbb)
+define(func, mpn_sublsh2_n)
+
+MULFUNC_PROLOGUE(mpn_sublsh2_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm
new file mode 100644
index 0000000..272700d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm
@@ -0,0 +1,158 @@
+dnl AMD64 mpn_sublshC_n -- rp[] = up[] - (vp[] << C), optimised for Core 2 and
+dnl Core iN.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+C cycles/limb
+C AMD K8,K9 4.25
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 3
+C Intel NHM 3.1
+C Intel SBR 2.47
+C Intel atom ?
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n', `%rcx')
+
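+C The shifted operand (vp[] << LSH) is formed on the fly with shrd: since
+C RSH = 64-LSH, "shrd $RSH, hi, lo" leaves (hi << LSH) | (lo >> RSH) in lo,
+C i.e. the shifted limb together with the bits carried in from the limb
+C below.
+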
+ASM_START()
+ TEXT
+ ALIGN(8)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %r12
+
+ mov R32(%rcx), R32(%rax)
+ lea 24(up,n,8), up
+ lea 24(vp,n,8), vp
+ lea 24(rp,n,8), rp
+ neg n
+
+ xor R32(%r11), R32(%r11)
+
+ mov -24(vp,n,8), %r8 C do first limb early
+ shrd $RSH, %r8, %r11
+
+ and $3, R32(%rax)
+ je L(b0)
+ cmp $2, R32(%rax)
+ jc L(b1)
+ je L(b2)
+
+L(b3): mov -16(vp,n,8), %r9
+ shrd $RSH, %r9, %r8
+ mov -8(vp,n,8), %r10
+ shrd $RSH, %r10, %r9
+ mov -24(up,n,8), %r12
+ ADDSUB %r11, %r12
+ mov %r12, -24(rp,n,8)
+ mov -16(up,n,8), %r12
+ ADCSBB %r8, %r12
+ mov %r12, -16(rp,n,8)
+ mov -8(up,n,8), %r12
+ ADCSBB %r9, %r12
+ mov %r12, -8(rp,n,8)
+ mov %r10, %r11
+ sbb R32(%rax), R32(%rax) C save cy
+ add $3, n
+ js L(top)
+ jmp L(end)
+
+L(b1): mov -24(up,n,8), %r12
+ ADDSUB %r11, %r12
+ mov %r12, -24(rp,n,8)
+ mov %r8, %r11
+ sbb R32(%rax), R32(%rax) C save cy
+ inc n
+ js L(top)
+ jmp L(end)
+
+L(b2): mov -16(vp,n,8), %r9
+ shrd $RSH, %r9, %r8
+ mov -24(up,n,8), %r12
+ ADDSUB %r11, %r12
+ mov %r12, -24(rp,n,8)
+ mov -16(up,n,8), %r12
+ ADCSBB %r8, %r12
+ mov %r12, -16(rp,n,8)
+ mov %r9, %r11
+ sbb R32(%rax), R32(%rax) C save cy
+ add $2, n
+ js L(top)
+ jmp L(end)
+
+ ALIGN(16)
+L(top): mov -24(vp,n,8), %r8
+ shrd $RSH, %r8, %r11
+L(b0): mov -16(vp,n,8), %r9
+ shrd $RSH, %r9, %r8
+ mov -8(vp,n,8), %r10
+ shrd $RSH, %r10, %r9
+ mov (vp,n,8), %rbx
+ shrd $RSH, %rbx, %r10
+
+ add R32(%rax), R32(%rax) C restore cy
+
+ mov -24(up,n,8), %r12
+ ADCSBB %r11, %r12
+ mov %r12, -24(rp,n,8)
+
+ mov -16(up,n,8), %r12
+ ADCSBB %r8, %r12
+ mov %r12, -16(rp,n,8)
+
+ mov -8(up,n,8), %r12
+ ADCSBB %r9, %r12
+ mov %r12, -8(rp,n,8)
+
+ mov (up,n,8), %r12
+ ADCSBB %r10, %r12
+ mov %r12, (rp,n,8)
+
+ mov %rbx, %r11
+ sbb R32(%rax), R32(%rax) C save cy
+
+ add $4, n
+ js L(top)
+
+L(end): shr $RSH, %r11
+ pop %r12
+ pop %rbx
+ sub R32(%r11), R32(%rax)
+ neg R32(%rax)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm
new file mode 100644
index 0000000..8d3a44a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm
@@ -0,0 +1,210 @@
+dnl AMD64 mpn_addmul_1 optimised for Intel Broadwell.
+
+dnl Copyright 2015, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 n/a
+C AMD bd2 n/a
+C AMD bd3 n/a
+C AMD bd4 ?
+C AMD zen1 ?
+C AMD zen2 ?
+C AMD zen3 1.5
+C AMD bt1 n/a
+C AMD bt2 n/a
+C Intel P4 n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL n/a
+C Intel BWL 1.67 1.74
+C Intel SKL 1.63 1.71
+C Intel atom n/a
+C Intel SLM n/a
+C VIA nano n/a
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Put an initial mulx before switching, targeting some free registers.
+C * Tune feed-in code.
+C * Trim nop execution after L(f2).
+C * For DOS64, fix nop execution.
+
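+C The loop relies on the ADX extension: mulx leaves the flags untouched,
+C while adcx and adox maintain two independent carry chains (CF and OF),
+C one for combining the partial products and one for adding into rp[].
+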
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl IFDOS(` define(`up', ``%rsi'') ') dnl
+dnl IFDOS(` define(`rp', ``%rcx'') ') dnl
+dnl IFDOS(` define(`vl', ``%r9'') ') dnl
+dnl IFDOS(` define(`r9', ``rdi'') ') dnl
+dnl IFDOS(` define(`n', ``%r8'') ') dnl
+dnl IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_addmul_1)
+ FUNC_ENTRY(4)
+
+ mov v0_param, %r10
+ mov n_param, n
+ mov R32(n_param), R32(%r8)
+ shr $3, n
+ and $7, R32(%r8) C clear OF, CF as side-effect
+ mov %r10, %rdx
+ lea L(tab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%r8,4), %r8
+ lea (%r8, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%r8,8)
+')
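+C In PIC builds the table holds 32-bit offsets relative to L(tab), hence
+C the movslq/lea pair above; in non-PIC builds it holds absolute 8-byte
+C code pointers.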
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(f0), L(tab))
+ JMPENT( L(f1), L(tab))
+ JMPENT( L(f2), L(tab))
+ JMPENT( L(f3), L(tab))
+ JMPENT( L(f4), L(tab))
+ JMPENT( L(f5), L(tab))
+ JMPENT( L(f6), L(tab))
+ JMPENT( L(f7), L(tab))
+ TEXT
+
+L(f0): mulx( (up), %r10, %r8)
+ lea -8(up), up
+ lea -8(rp), rp
+ lea -1(n), n
+ jmp L(b0)
+
+L(f3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea -48(rp), rp
+ jmp L(b3)
+
+L(f4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea -40(rp), rp
+ jmp L(b4)
+
+L(f5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea -32(rp), rp
+ jmp L(b5)
+
+L(f6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea -24(rp), rp
+ jmp L(b6)
+
+L(f1): mulx( (up), %r9, %rax)
+ jrcxz L(1)
+ jmp L(b1)
+L(1): add (rp), %r9
+ mov %r9, (rp)
+ adc %rcx, %rax C relies on rcx = 0
+ FUNC_EXIT()
+ ret
+
+L(end): adox( (rp), %r9)
+ mov %r9, (rp)
+ adox( %rcx, %rax) C relies on rcx = 0
+ adc %rcx, %rax C relies on rcx = 0
+ FUNC_EXIT()
+ ret
+
+ifdef(`PIC',
+` nop;nop;nop;nop',
+` nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop')
+
+L(f2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), %r9, %rax)
+
+ ALIGN(32)
+L(top): adox( -8,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, -8(rp)
+ jrcxz L(end)
+L(b1): mulx( 8,(up), %r10, %r8)
+ adox( (rp), %r9)
+ lea -1(n), n
+ mov %r9, (rp)
+ adcx( %rax, %r10)
+L(b0): mulx( 16,(up), %r9, %rax)
+ adcx( %r8, %r9)
+ adox( 8,(rp), %r10)
+ mov %r10, 8(rp)
+L(b7): mulx( 24,(up), %r10, %r8)
+ lea 64(up), up
+ adcx( %rax, %r10)
+ adox( 16,(rp), %r9)
+ mov %r9, 16(rp)
+L(b6): mulx( -32,(up), %r9, %rax)
+ adox( 24,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, 24(rp)
+L(b5): mulx( -24,(up), %r10, %r8)
+ adcx( %rax, %r10)
+ adox( 32,(rp), %r9)
+ mov %r9, 32(rp)
+L(b4): mulx( -16,(up), %r9, %rax)
+ adox( 40,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, 40(rp)
+L(b3): adox( 48,(rp), %r9)
+ mulx( -8,(up), %r10, %r8)
+ mov %r9, 48(rp)
+ lea 64(rp), rp
+ adcx( %rax, %r10)
+ mulx( (up), %r9, %rax)
+ jmp L(top)
+
+L(f7): mulx( (up), %r9, %rax)
+ lea -16(up), up
+ lea -16(rp), rp
+ jmp L(b7)
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h
new file mode 100644
index 0000000..91c91b5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h
@@ -0,0 +1,246 @@
+/* Broadwell gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should disable lib inclusion. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3400-3800 MHz Intel Xeon E3-1285Lv4 Broadwell */
+/* FFT tuning limit = 467,964,472 */
+/* Generated by tuneup.c, 2019-10-17, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 24
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 22
+
+#define DIV_1_VS_MUL_1_PERCENT 455
+
+#define MUL_TOOM22_THRESHOLD 26
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 202
+#define MUL_TOOM6H_THRESHOLD 303
+#define MUL_TOOM8H_THRESHOLD 406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 141
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 152
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 151
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 198
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 34
+#define SQR_TOOM3_THRESHOLD 117
+#define SQR_TOOM4_THRESHOLD 336
+#define SQR_TOOM6_THRESHOLD 426
+#define SQR_TOOM8_THRESHOLD 547
+
+#define MULMID_TOOM42_THRESHOLD 46
+
+#define MULMOD_BNM1_THRESHOLD 16
+#define SQRMOD_BNM1_THRESHOLD 18
+
+#define MUL_FFT_MODF_THRESHOLD 460 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \
+ { 28, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 49, 9}, { 27,10}, { 15, 9}, { 39, 8}, \
+ { 79,10}, { 23, 9}, { 55,11}, { 15,10}, \
+ { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \
+ { 47, 9}, { 99,10}, { 55,11}, { 31,10}, \
+ { 87,11}, { 47,10}, { 103,12}, { 31,11}, \
+ { 63,10}, { 135,11}, { 79,10}, { 167,11}, \
+ { 95,10}, { 199,11}, { 111,12}, { 63, 8}, \
+ { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,12}, { 95,11}, { 191,10}, { 383,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,11}, { 367,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,11}, { 447,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,14}, { 127,13}, \
+ { 255,12}, { 607,13}, { 319,12}, { 735,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \
+ { 1151,13}, { 639,12}, { 1279,13}, { 703,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 959,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \
+ { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \
+ { 2687,13}, { 1407,14}, { 767,13}, { 1535,12}, \
+ { 3071,13}, { 1599,12}, { 3199,13}, { 1663,14}, \
+ { 895,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \
+ { 3199,14}, { 1663,13}, { 3455,12}, { 6911,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \
+ { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2815,13}, { 5631,14}, { 2943,13}, \
+ { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \
+ { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \
+ { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \
+ { 2559,14}, { 5247,15}, { 2815,14}, { 5887,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,17}, { 1023,16}, { 2047,15}, { 4351,14}, \
+ { 8703,15}, { 4863,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \
+ { 7679,14}, { 15359,17}, { 2047,16}, { 4095,15}, \
+ { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 219
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 400 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 400, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \
+ { 28, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 127,11}, { 79,10}, \
+ { 159,11}, { 95,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
+ { 575,10}, { 303,11}, { 159,10}, { 319,12}, \
+ { 95, 8}, { 1599, 9}, { 831,11}, { 223,10}, \
+ { 447,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,11}, { 367,10}, { 735,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 543,12}, { 287,11}, { 607,12}, \
+ { 319,11}, { 671,12}, { 351,11}, { 735,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 607,13}, \
+ { 319,12}, { 735,13}, { 383,12}, { 799,13}, \
+ { 447,12}, { 959,13}, { 511,12}, { 1023,13}, \
+ { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \
+ { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \
+ { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \
+ { 3455,12}, { 6911,14}, { 1791,16}, { 511,15}, \
+ { 1023,14}, { 2047,13}, { 4095,14}, { 2175,13}, \
+ { 4351,14}, { 2303,13}, { 4607,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \
+ { 4351,15}, { 2303,14}, { 4863,15}, { 2559,14}, \
+ { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \
+ { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \
+ { 2047,16}, { 4095,15}, { 8447,16}, { 4607,15}, \
+ { 9471,14}, { 18943,15}, { 9983,14}, { 19967,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 215
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 80
+#define MULLO_MUL_N_THRESHOLD 11025
+#define SQRLO_BASECASE_THRESHOLD 9
+#define SQRLO_DC_THRESHOLD 109
+#define SQRLO_SQR_THRESHOLD 7293
+
+#define DC_DIV_QR_THRESHOLD 54
+#define DC_DIVAPPR_Q_THRESHOLD 183
+#define DC_BDIV_QR_THRESHOLD 86
+#define DC_BDIV_Q_THRESHOLD 160
+
+#define INV_MULMOD_BNM1_THRESHOLD 58
+#define INV_NEWTON_THRESHOLD 171
+#define INV_APPR_THRESHOLD 171
+
+#define BINV_NEWTON_THRESHOLD 292
+#define REDC_1_TO_REDC_2_THRESHOLD 33
+#define REDC_2_TO_REDC_N_THRESHOLD 63
+
+#define MU_DIV_QR_THRESHOLD 1589
+#define MU_DIVAPPR_Q_THRESHOLD 1589
+#define MUPI_DIV_QR_THRESHOLD 67
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1866
+
+#define POWM_SEC_TABLE 2,10,191,494,712,1378
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 644
+#define SET_STR_PRECOMPUTE_THRESHOLD 1658
+
+#define FAC_DSC_THRESHOLD 562
+#define FAC_ODD_THRESHOLD 48
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD2_DIV1_METHOD 5 /* 0.38% faster than 3 */
+#define HGCD_THRESHOLD 73
+#define HGCD_APPR_THRESHOLD 67
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 630
+#define GCDEXT_DC_THRESHOLD 365
+#define JACOBI_BASE_METHOD 1 /* 29.65% faster than 4 */
+
+/* Tuneup completed successfully, took 239050 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm
new file mode 100644
index 0000000..b7fae2f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm
@@ -0,0 +1,195 @@
+dnl AMD64 mpn_mul_1 optimised for Intel Broadwell.
+
+dnl Copyright 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bull -
+C AMD pile -
+C AMD steam -
+C AMD excavator -
+C AMD bobcat -
+C AMD jaguar -
+C Intel P4 -
+C Intel core2 -
+C Intel NHM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL 1.70
+C Intel BWL 1.51
+C Intel SKL 1.52
+C Intel atom -
+C Intel SLM -
+C VIA nano -
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Put an initial mulx before switching, targeting some free registers.
+C * Tune feed-in code.
+C * Trim nop execution after L(f2).
+C * Port to DOS64, not forgetting nop execution.
+
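+C Unlike the Broadwell addmul_1 loop, a single carry chain suffices here:
+C mulx does not modify the flags, so one adc chain can run uninterrupted
+C across all the multiplies.
+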
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rcx')
+
+dnl ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl IFDOS(` define(`up', ``%rsi'') ') dnl
+dnl IFDOS(` define(`rp', ``%rcx'') ') dnl
+dnl IFDOS(` define(`vl', ``%r9'') ') dnl
+dnl IFDOS(` define(`r9', ``rdi'') ') dnl
+dnl IFDOS(` define(`n', ``%r8'') ') dnl
+dnl IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_1)
+
+ mov v0_param, %r10
+ mov n_param, n
+ mov R32(n_param), R32(%r8)
+ shr $3, n
+ and $7, R32(%r8) C clear OF, CF as side-effect
+ mov %r10, %rdx
+ lea L(tab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%r8,4), %r8
+ lea (%r8, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%r8,8)
+')
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(f0), L(tab))
+ JMPENT( L(f1), L(tab))
+ JMPENT( L(f2), L(tab))
+ JMPENT( L(f3), L(tab))
+ JMPENT( L(f4), L(tab))
+ JMPENT( L(f5), L(tab))
+ JMPENT( L(f6), L(tab))
+ JMPENT( L(f7), L(tab))
+ TEXT
+
+L(f0): mulx( (up), %r10, %r8)
+ lea 56(up), up
+ lea -8(rp), rp
+ jmp L(b0)
+
+L(f3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea 16(rp), rp
+ inc n
+ jmp L(b3)
+
+L(f4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea 24(rp), rp
+ inc n
+ jmp L(b4)
+
+L(f5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea 32(rp), rp
+ inc n
+ jmp L(b5)
+
+L(f6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea 40(rp), rp
+ inc n
+ jmp L(b6)
+
+L(f7): mulx( (up), %r9, %rax)
+ lea 48(up), up
+ lea 48(rp), rp
+ inc n
+ jmp L(b7)
+
+L(f1): mulx( (up), %r9, %rax)
+ test n, n
+ jnz L(b1)
+L(1): mov %r9, (rp)
+ ret
+
+L(f2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), %r9, %rax)
+ test n, n
+ jz L(end)
+
+ ALIGN(32)
+L(top): mov %r10, -8(rp)
+ adc %r8, %r9
+L(b1): mulx( 8,(up), %r10, %r8)
+ adc %rax, %r10
+ lea 64(up), up
+ mov %r9, (rp)
+L(b0): mov %r10, 8(rp)
+ mulx( -48,(up), %r9, %rax)
+ lea 64(rp), rp
+ adc %r8, %r9
+L(b7): mulx( -40,(up), %r10, %r8)
+ mov %r9, -48(rp)
+ adc %rax, %r10
+L(b6): mov %r10, -40(rp)
+ mulx( -32,(up), %r9, %rax)
+ adc %r8, %r9
+L(b5): mulx( -24,(up), %r10, %r8)
+ mov %r9, -32(rp)
+ adc %rax, %r10
+L(b4): mulx( -16,(up), %r9, %rax)
+ mov %r10, -24(rp)
+ adc %r8, %r9
+L(b3): mulx( -8,(up), %r10, %r8)
+ adc %rax, %r10
+ mov %r9, -16(rp)
+ dec n
+ mulx( (up), %r9, %rax)
+ jnz L(top)
+
+L(end): mov %r10, -8(rp)
+ adc %r8, %r9
+ mov %r9, (rp)
+ adc %rcx, %rax
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm
new file mode 100644
index 0000000..7ca5a9b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm
@@ -0,0 +1,368 @@
+dnl AMD64 mpn_mul_basecase optimised for Intel Broadwell.
+
+dnl Copyright 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 addmul_1
+C AMD K8,K9 n/a n/a
+C AMD K10 n/a n/a
+C AMD bd1 n/a n/a
+C AMD bd2 n/a n/a
+C AMD bd3 n/a n/a
+C AMD bd4 ? ?
+C AMD zen ? ?
+C AMD bt1 n/a n/a
+C AMD bt2 n/a n/a
+C Intel P4 n/a n/a
+C Intel PNR n/a n/a
+C Intel NHM n/a n/a
+C Intel SBR n/a n/a
+C Intel IBR n/a n/a
+C Intel HWL 1.68 n/a
+C Intel BWL 1.51 1.67-1.74
+C Intel SKL 1.52 1.63-1.71
+C Intel atom n/a n/a
+C Intel SLM n/a n/a
+C VIA nano n/a n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Do overlapped software pipelining.
+C * When changing this, make sure the code which falls into the inner loops
+C does not execute too many no-ops (for both PIC and non-PIC).
+
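+C The first limb of vp is handled with a mul_1-style loop (entered via
+C L(mtab)); each remaining limb of vp is folded in with an addmul_1-style
+C loop (entered via L(atab)) from L(outer).
+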
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp_param',`%rcx')
+define(`vn', `%r8')
+
+define(`n', `%rcx')
+define(`n_save', `%rbp')
+define(`vp', `%r14')
+define(`unneg', `%rbx')
+define(`v0', `%rdx')
+define(`jaddr', `%rax')
+
+define(`w0', `%r12')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ cmp $2, un_param
+ ja L(gen)
+ mov (vp_param), %rdx
+ mulx( (up), %rax, %r9) C 0 1
+ je L(s2x)
+
+L(s11): mov %rax, (rp)
+ mov %r9, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(s2x): cmp $2, vn
+ mulx( 8,(up), %r8, %r10) C 1 2
+ je L(s22)
+
+L(s21): add %r8, %r9
+ adc $0, %r10
+ mov %rax, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ FUNC_EXIT()
+ ret
+
+L(s22): add %r8, %r9 C 1
+ adc $0, %r10 C 2
+ mov 8(vp_param), %rdx
+ mov %rax, (rp)
+ mulx( (up), %r8, %r11) C 1 2
+ mulx( 8,(up), %rax, %rdx) C 2 3
+ add %r11, %rax C 2
+ adc $0, %rdx C 3
+ add %r8, %r9 C 1
+ adc %rax, %r10 C 2
+ adc $0, %rdx C 3
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(gen):
+ push %rbx
+ push %rbp
+ push %r12
+ push %r14
+
+ mov vp_param, vp
+ lea 1(un_param), unneg
+ mov un_param, n_save
+ mov R32(un_param), R32(%rax)
+ and $-8, unneg
+ shr $3, n_save C loop count
+ neg unneg
+ and $7, R32(%rax) C clear CF for adc as side-effect
+ C note that rax lives very long
+ mov n_save, n
+ mov (vp), v0
+ lea 8(vp), vp
+
+ lea L(mtab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %r11
+ lea (%r11, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%rax,8)
+')
+
+L(mf0): mulx( (up), w2, w3)
+ lea 56(up), up
+ lea -8(rp), rp
+ jmp L(mb0)
+
+L(mf3): mulx( (up), w0, w1)
+ lea 16(up), up
+ lea 16(rp), rp
+ inc n
+ jmp L(mb3)
+
+L(mf4): mulx( (up), w2, w3)
+ lea 24(up), up
+ lea 24(rp), rp
+ inc n
+ jmp L(mb4)
+
+L(mf5): mulx( (up), w0, w1)
+ lea 32(up), up
+ lea 32(rp), rp
+ inc n
+ jmp L(mb5)
+
+L(mf6): mulx( (up), w2, w3)
+ lea 40(up), up
+ lea 40(rp), rp
+ inc n
+ jmp L(mb6)
+
+L(mf7): mulx( (up), w0, w1)
+ lea 48(up), up
+ lea 48(rp), rp
+ inc n
+ jmp L(mb7)
+
+L(mf1): mulx( (up), w0, w1)
+ jmp L(mb1)
+
+L(mf2): mulx( (up), w2, w3)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), w0, w1)
+
+ ALIGN(16)
+L(m1top):
+ mov w2, -8(rp)
+ adc w3, w0
+L(mb1): mulx( 8,(up), w2, w3)
+ adc w1, w2
+ lea 64(up), up
+ mov w0, (rp)
+L(mb0): mov w2, 8(rp)
+ mulx( -48,(up), w0, w1)
+ lea 64(rp), rp
+ adc w3, w0
+L(mb7): mulx( -40,(up), w2, w3)
+ mov w0, -48(rp)
+ adc w1, w2
+L(mb6): mov w2, -40(rp)
+ mulx( -32,(up), w0, w1)
+ adc w3, w0
+L(mb5): mulx( -24,(up), w2, w3)
+ mov w0, -32(rp)
+ adc w1, w2
+L(mb4): mulx( -16,(up), w0, w1)
+ mov w2, -24(rp)
+ adc w3, w0
+L(mb3): mulx( -8,(up), w2, w3)
+ adc w1, w2
+ mov w0, -16(rp)
+ dec n
+ mulx( (up), w0, w1)
+ jnz L(m1top)
+
+L(m1end):
+ mov w2, -8(rp)
+ adc w3, w0
+ mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+
+ dec vn
+ jz L(done)
+
+ lea L(atab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax, %r10), jaddr
+',`
+ mov (%r10,%rax,8), jaddr
+')
+
+L(outer):
+ lea (up,unneg,8), up
+ mov n_save, n
+ mov (vp), v0
+ lea 8(vp), vp
+ jmp *jaddr
+
+L(f0): mulx( 8,(up), w2, w3)
+ lea 8(rp,unneg,8), rp
+ lea -1(n), n
+ jmp L(b0)
+
+L(f3): mulx( -16,(up), w0, w1)
+ lea -56(rp,unneg,8), rp
+ jmp L(b3)
+
+L(f4): mulx( -24,(up), w2, w3)
+ lea -56(rp,unneg,8), rp
+ jmp L(b4)
+
+L(f5): mulx( -32,(up), w0, w1)
+ lea -56(rp,unneg,8), rp
+ jmp L(b5)
+
+L(f6): mulx( -40,(up), w2, w3)
+ lea -56(rp,unneg,8), rp
+ jmp L(b6)
+
+L(f7): mulx( 16,(up), w0, w1)
+ lea 8(rp,unneg,8), rp
+ jmp L(b7)
+
+L(f1): mulx( (up), w0, w1)
+ lea 8(rp,unneg,8), rp
+ jmp L(b1)
+
+L(am1end):
+ adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+ mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+
+ dec vn C clear OF as side-effect
+ jnz L(outer)
+L(done):
+ pop %r14
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(f2): mulx( -8,(up), w2, w3)
+ lea 8(rp,unneg,8), rp
+ mulx( (up), w0, w1)
+
+ ALIGN(16)
+L(am1top):
+ adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(am1end)
+L(b1): mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea -1(n), n
+ mov w0, (rp)
+ adcx( w1, w2)
+L(b0): mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+L(b7): mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+L(b6): mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+L(b5): mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+L(b4): mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+L(b3): adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(am1top)
+
+ JUMPTABSECT
+ ALIGN(8)
+L(mtab):JMPENT( L(mf0), L(mtab))
+ JMPENT( L(mf1), L(mtab))
+ JMPENT( L(mf2), L(mtab))
+ JMPENT( L(mf3), L(mtab))
+ JMPENT( L(mf4), L(mtab))
+ JMPENT( L(mf5), L(mtab))
+ JMPENT( L(mf6), L(mtab))
+ JMPENT( L(mf7), L(mtab))
+L(atab):JMPENT( L(f0), L(atab))
+ JMPENT( L(f1), L(atab))
+ JMPENT( L(f2), L(atab))
+ JMPENT( L(f3), L(atab))
+ JMPENT( L(f4), L(atab))
+ JMPENT( L(f5), L(atab))
+ JMPENT( L(f6), L(atab))
+ JMPENT( L(f7), L(atab))
+ TEXT
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm
new file mode 100644
index 0000000..5cdb209
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm
@@ -0,0 +1,395 @@
+dnl X86-64 mpn_mullo_basecase optimised for Intel Broadwell.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`vp', `%r11')
+define(`jmpreg',`%rbx')
+define(`nn', `%rbp')
+
+C TODO
+C * Suppress more rp[] rewrites in corner.
+C * Rearrange feed-in jumps for short branch forms.
+C * Perhaps roll out the heavy artillery and 8-way unroll outer loop. Since
+C feed-in code implodes, the blow-up will not be more than perhaps 4x.
+C * Micro-optimise critical lead-in code block around L(ent).
+C * Write n < 4 code specifically for Broadwell (current code is for Haswell).
+
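+C mpn_mullo_basecase returns only the n least significant limbs of the
+C product {up,n} * {vp,n}; the contributions to the most significant of
+C those limbs are accumulated in %r14 and stored last.
+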
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+ cmp $4, R32(n)
+ jae L(big)
+
+ mov vp_param, vp
+ mov (up), %rdx
+
+ cmp $2, R32(n)
+ jae L(gt1)
+L(n1): imul (vp), %rdx
+ mov %rdx, (rp)
+ FUNC_EXIT()
+ ret
+L(gt1): ja L(gt2)
+L(n2): mov (vp), %r9
+ mulx( %r9, %rax, %rdx)
+ mov %rax, (rp)
+ mov 8(up), %rax
+ imul %r9, %rax
+ add %rax, %rdx
+ mov 8(vp), %r9
+ mov (up), %rcx
+ imul %r9, %rcx
+ add %rcx, %rdx
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+L(gt2):
+L(n3): mov (vp), %r9
+ mulx( %r9, %rax, %r10) C u0 x v0
+ mov %rax, (rp)
+ mov 8(up), %rdx
+ mulx( %r9, %rax, %rdx) C u1 x v0
+ imul 16(up), %r9 C u2 x v0
+ add %rax, %r10
+ adc %rdx, %r9
+ mov 8(vp), %r8
+ mov (up), %rdx
+ mulx( %r8, %rax, %rdx) C u0 x v1
+ add %rax, %r10
+ adc %rdx, %r9
+ imul 8(up), %r8 C u1 x v1
+ add %r8, %r9
+ mov %r10, 8(rp)
+ mov 16(vp), %r10
+ mov (up), %rax
+ imul %rax, %r10 C u0 x v2
+ add %r10, %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(big): push %r14
+ push %r12
+ push %rbx
+ push %rbp
+ mov -8(vp_param,n,8), %r14 C FIXME Put at absolute end
+ imul (up), %r14 C FIXME Put at absolute end
+ lea -3(n), R32(nn)
+ lea 8(vp_param), vp
+ mov (vp_param), %rdx
+
+ mov R32(n), R32(%rax)
+ shr $3, R32(n)
+ and $7, R32(%rax) C clear OF, CF as side-effect
+ lea L(mtab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%rax,8)
+')
+
+L(mf0): mulx( (up), %r10, %r8)
+ lea 56(up), up
+ lea -8(rp), rp
+ lea L(f7)(%rip), jmpreg
+ jmp L(mb0)
+
+L(mf3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea 16(rp), rp
+ jrcxz L(mc)
+ inc R32(n)
+ lea L(f2)(%rip), jmpreg
+ jmp L(mb3)
+
+L(mc): mulx( -8,(up), %r10, %r8)
+ add %rax, %r10
+ mov %r9, -16(rp)
+ mulx( (up), %r9, %rax)
+ mov %r10, -8(rp)
+ adc %r8, %r9
+ mov %r9, (rp)
+ jmp L(c2)
+
+L(mf4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea 24(rp), rp
+ inc R32(n)
+ lea L(f3)(%rip), jmpreg
+ jmp L(mb4)
+
+L(mf5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea 32(rp), rp
+ inc R32(n)
+ lea L(f4)(%rip), jmpreg
+ jmp L(mb5)
+
+L(mf6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea 40(rp), rp
+ inc R32(n)
+ lea L(f5)(%rip), jmpreg
+ jmp L(mb6)
+
+L(mf7): mulx( (up), %r9, %rax)
+ lea 48(up), up
+ lea 48(rp), rp
+ lea L(f6)(%rip), jmpreg
+ jmp L(mb7)
+
+L(mf1): mulx( (up), %r9, %rax)
+ lea L(f0)(%rip), jmpreg
+ jmp L(mb1)
+
+L(mf2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ lea L(f1)(%rip), jmpreg
+ mulx( (up), %r9, %rax)
+
+C FIXME ugly fallthrough FIXME
+ ALIGN(32)
+L(mtop):mov %r10, -8(rp)
+ adc %r8, %r9
+L(mb1): mulx( 8,(up), %r10, %r8)
+ adc %rax, %r10
+ lea 64(up), up
+ mov %r9, (rp)
+L(mb0): mov %r10, 8(rp)
+ mulx( -48,(up), %r9, %rax)
+ lea 64(rp), rp
+ adc %r8, %r9
+L(mb7): mulx( -40,(up), %r10, %r8)
+ mov %r9, -48(rp)
+ adc %rax, %r10
+L(mb6): mov %r10, -40(rp)
+ mulx( -32,(up), %r9, %rax)
+ adc %r8, %r9
+L(mb5): mulx( -24,(up), %r10, %r8)
+ mov %r9, -32(rp)
+ adc %rax, %r10
+L(mb4): mulx( -16,(up), %r9, %rax)
+ mov %r10, -24(rp)
+ adc %r8, %r9
+L(mb3): mulx( -8,(up), %r10, %r8)
+ adc %rax, %r10
+ mov %r9, -16(rp)
+ dec R32(n)
+ mulx( (up), %r9, %rax)
+ jnz L(mtop)
+
+L(mend):mov %r10, -8(rp)
+ adc %r8, %r9
+ mov %r9, (rp)
+ adc %rcx, %rax
+
+ lea 8(,nn,8), %r12
+ neg %r12
+ shr $3, R32(nn)
+ jmp L(ent)
+
+L(f0): mulx( (up), %r10, %r8)
+ lea -8(up), up
+ lea -8(rp), rp
+ lea L(f7)(%rip), jmpreg
+ jmp L(b0)
+
+L(f1): mulx( (up), %r9, %rax)
+ lea -1(nn), R32(nn)
+ lea L(f0)(%rip), jmpreg
+ jmp L(b1)
+
+L(end): adox( (rp), %r9)
+ mov %r9, (rp)
+ adox( %rcx, %rax) C relies on rcx = 0
+ adc %rcx, %rax C FIXME suppress, use adc below; reqs ent path edits
+ lea 8(%r12), %r12
+L(ent): mulx( 8,(up), %r10, %r8) C r8 unused (use imul?)
+ add %rax, %r14
+ add %r10, %r14 C h
+ lea (up,%r12), up C reset up
+ lea 8(rp,%r12), rp C reset rp
+ mov (vp), %rdx
+ lea 8(vp), vp
+ or R32(nn), R32(n) C copy count, clear CF,OF (n = 0 prior)
+ jmp *jmpreg
+
+L(f7): mulx( (up), %r9, %rax)
+ lea -16(up), up
+ lea -16(rp), rp
+ lea L(f6)(%rip), jmpreg
+ jmp L(b7)
+
+L(f2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), %r9, %rax)
+ lea L(f1)(%rip), jmpreg
+
+C FIXME ugly fallthrough FIXME
+ ALIGN(32)
+L(top): adox( -8,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, -8(rp)
+ jrcxz L(end)
+L(b1): mulx( 8,(up), %r10, %r8)
+ adox( (rp), %r9)
+ lea -1(n), R32(n)
+ mov %r9, (rp)
+ adcx( %rax, %r10)
+L(b0): mulx( 16,(up), %r9, %rax)
+ adcx( %r8, %r9)
+ adox( 8,(rp), %r10)
+ mov %r10, 8(rp)
+L(b7): mulx( 24,(up), %r10, %r8)
+ lea 64(up), up
+ adcx( %rax, %r10)
+ adox( 16,(rp), %r9)
+ mov %r9, 16(rp)
+L(b6): mulx( -32,(up), %r9, %rax)
+ adox( 24,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, 24(rp)
+L(b5): mulx( -24,(up), %r10, %r8)
+ adcx( %rax, %r10)
+ adox( 32,(rp), %r9)
+ mov %r9, 32(rp)
+L(b4): mulx( -16,(up), %r9, %rax)
+ adox( 40,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, 40(rp)
+L(b3): adox( 48,(rp), %r9)
+ mulx( -8,(up), %r10, %r8)
+ mov %r9, 48(rp)
+ lea 64(rp), rp
+ adcx( %rax, %r10)
+ mulx( (up), %r9, %rax)
+ jmp L(top)
+
+L(f6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea -24(rp), rp
+ lea L(f5)(%rip), jmpreg
+ jmp L(b6)
+
+L(f5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea -32(rp), rp
+ lea L(f4)(%rip), jmpreg
+ jmp L(b5)
+
+L(f4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea -40(rp), rp
+ lea L(f3)(%rip), jmpreg
+ jmp L(b4)
+
+L(f3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea -48(rp), rp
+ jrcxz L(cor)
+ lea L(f2)(%rip), jmpreg
+ jmp L(b3)
+
+L(cor): adox( 48,(rp), %r9)
+ mulx( -8,(up), %r10, %r8)
+ mov %r9, 48(rp)
+ lea 64(rp), rp
+ adcx( %rax, %r10)
+ mulx( (up), %r9, %rax)
+ adox( -8,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, -8(rp) C FIXME suppress
+ adox( (rp), %r9)
+ mov %r9, (rp) C FIXME suppress
+ adox( %rcx, %rax)
+L(c2):
+ mulx( 8,(up), %r10, %r8)
+ adc %rax, %r14
+ add %r10, %r14
+ mov (vp), %rdx
+ test R32(%rcx), R32(%rcx)
+ mulx( -16,(up), %r10, %r8)
+ mulx( -8,(up), %r9, %rax)
+ adox( -8,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, -8(rp)
+ adox( (rp), %r9)
+ adox( %rcx, %rax)
+ adc %rcx, %rax
+ mulx( (up), %r10, %r8)
+ add %rax, %r14
+ add %r10, %r14
+ mov 8(vp), %rdx
+ mulx( -16,(up), %rcx, %rax)
+ add %r9, %rcx
+ mov %rcx, (rp)
+ adc $0, %rax
+ mulx( -8,(up), %r10, %r8)
+ add %rax, %r14
+ add %r10, %r14
+ mov %r14, 8(rp)
+ pop %rbp
+ pop %rbx
+ pop %r12
+ pop %r14
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(mtab):JMPENT( L(mf7), L(mtab))
+ JMPENT( L(mf0), L(mtab))
+ JMPENT( L(mf1), L(mtab))
+ JMPENT( L(mf2), L(mtab))
+ JMPENT( L(mf3), L(mtab))
+ JMPENT( L(mf4), L(mtab))
+ JMPENT( L(mf5), L(mtab))
+ JMPENT( L(mf6), L(mtab))
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
new file mode 100644
index 0000000..ff35124
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
@@ -0,0 +1,710 @@
+dnl AMD64 mpn_sbpi1_bdiv_r optimised for Intel Broadwell.
+
+dnl Copyright 2015, 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 addmul_1
+C AMD K8,K9 n/a n/a
+C AMD K10 n/a n/a
+C AMD bd1 n/a n/a
+C AMD bd2 n/a n/a
+C AMD bd3 n/a n/a
+C AMD bd4 ? ?
+C AMD zn1 ? ?
+C AMD zn2 ? ?
+C AMD zn3 ? ?
+C AMD bt1 n/a n/a
+C AMD bt2 n/a n/a
+C Intel P4 n/a n/a
+C Intel PNR n/a n/a
+C Intel NHM n/a n/a
+C Intel SBR n/a n/a
+C Intel IBR n/a n/a
+C Intel HWL 1.68 n/a
+C Intel BWL 1.51 1.67-1.74
+C Intel SKL 1.52 1.63-1.71
+C Intel atom n/a n/a
+C Intel SLM n/a n/a
+C VIA nano n/a n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Do overlapped software pipelining.
+C * Reduce register use, e.g., by combining n_neg and n_save.
+C * Suppress initial store through up, it's always a zero.
+C * Streamline up and dp setup.
+C * When changing this, make sure the code which falls into the inner loops
+C does not execute too many no-ops (for both PIC and non-PIC).
+
+dnl mp_limb_t
+dnl mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un,
+dnl mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+
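+C The dinv argument is a precomputed 2-adic (mod B) inverse related to the
+C low, odd divisor limb; GMP produces such inverses with binvert_limb.  As an
+C illustration only (not taken from this file, and ignoring whichever sign
+C convention the asm applies), a plain Newton iteration computes the inverse
+C of an odd 64-bit d, doubling the number of correct low bits per step:
+C
+C	#include <stdint.h>
+C
+C	uint64_t binvert64 (uint64_t d)	/* d odd; returns x, d*x == 1 mod 2^64 */
+C	{
+C	  uint64_t x = d;		/* 3 low bits correct: d*d == 1 mod 8 */
+C	  x *= 2 - d * x;		/*  6 bits */
+C	  x *= 2 - d * x;		/* 12 bits */
+C	  x *= 2 - d * x;		/* 24 bits */
+C	  x *= 2 - d * x;		/* 48 bits */
+C	  x *= 2 - d * x;		/* 64+ bits */
+C	  return x;
+C	}
+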
+define(`up', `%rdi')
+define(`un', `%rsi')
+define(`dp_param',`%rdx')
+define(`dn_param',`%rcx')
+define(`dinv', `%r8')
+
+define(`n', `%rcx')
+define(`n_save', `%rbp')
+define(`dp', `%r14')
+define(`n_neg', `%rbx')
+define(`q', `%rdx')
+define(`jaddr', `%rax')
+
+define(`w0', `%r12')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ifdef(`MAX_SPECIAL',,`
+define(`MAX_SPECIAL', 8)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+
+ lea L(atab)(%rip), %r10
+
+ cmp $MAX_SPECIAL, dn_param
+ jbe L(sma)
+
+ifelse(MAX_SPECIAL,8,,`
+forloop(i,eval(MAX_SPECIAL+1),9,`L(i):
+')')
+
+L(gen): push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+
+ sub dn_param, un C outer loop count
+
+ lea -8(,dn_param,8), n_neg
+ neg n_neg
+ mov dn_param, n_save
+ mov R32(dn_param), R32(%rax)
+ shr $3, n_save C loop count
+ and $7, R32(%rax) C clear CF and OF as side-effect
+
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax,%r10), jaddr
+',`
+ mov (%r10,%rax,8), jaddr
+')
+ mov (up), q
+ imul dinv, q
+ jmp L(outer)
+
+L(f0): mulx( (dp), w2, w3)
+ lea -1(n), n
+ mulx( 8,(dp), w0, w1)
+ lea -8(dp), dp
+ adcx( w3, w0)
+ adox( (up), w2)
+ lea -8(up), up
+ jmp L(b0x)
+
+L(f3): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -48(up), up
+ lea 16(dp), dp
+ jmp L(b3x)
+
+L(f4): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 24(dp), dp
+ adox( (up), w2)
+ lea -40(up), up
+ adcx( w3, w0)
+ jmp L(b4x)
+
+L(f5): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ lea 32(dp), dp
+ adcx( w1, w2)
+ adox( (up), w0)
+ lea -32(up), up
+ jmp L(b5x)
+
+L(f6): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 40(dp), dp
+ adox( (up), w2)
+ lea -24(up), up
+ adcx( w3, w0)
+ jmp L(b6x)
+
+L(f7): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ lea 48(dp), dp
+ adcx( w1, w2)
+ adox( (up), w0)
+ lea -16(up), up
+ jmp L(b7x)
+
+L(f1): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -1(n), n
+ jmp L(b1x)
+
+L(f2): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 8(dp), dp
+ adox( (up), w2)
+ lea 8(up), up
+ adcx( w3, w0)
+ jmp L(b2x)
+
+L(end): adox( (up), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+ mov w0, (up)
+ adc %rcx, w1 C relies on rcx = 0
+ mov 8(up,n_neg), q C Compute next quotient early...
+ mulx( dinv, q, %r12) C ...(unused in last iteration)
+ bt $0, R32(%r13)
+ adc w1, 8(up)
+ setc R8(%r13)
+ dec un C clear OF as side-effect
+ jz L(done)
+
+ lea (dp,n_neg), dp C reset dp to D[]'s beginning
+ lea 8(up,n_neg), up C point up to U[]'s current beginning
+L(outer):
+ mov n_save, n
+ test %eax, %eax C clear CF and OF
+ jmp *jaddr
+
+ ALIGN(16)
+L(top): adox( -8,(up), w2)
+ adcx( w3, w0)
+ mov w2, -8(up)
+ jrcxz L(end)
+L(b2x): mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -1(n), n
+ mov w0, (up)
+L(b1x): adcx( w1, w2)
+ mulx( 16,(dp), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(up), w2)
+ mov w2, 8(up)
+L(b0x): mulx( 24,(dp), w2, w3)
+ lea 64(dp), dp
+ adcx( w1, w2)
+ adox( 16,(up), w0)
+ mov w0, 16(up)
+L(b7x): mulx( -32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov w2, 24(up)
+L(b6x): mulx( -24,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(up), w0)
+ mov w0, 32(up)
+L(b5x): mulx( -16,(dp), w0, w1)
+ adox( 40,(up), w2)
+ adcx( w3, w0)
+ mov w2, 40(up)
+L(b4x): adox( 48,(up), w0)
+ mulx( -8,(dp), w2, w3)
+ mov w0, 48(up)
+L(b3x): lea 64(up), up
+ adcx( w1, w2)
+ mulx( (dp), w0, w1)
+ jmp L(top)
+
+L(done):mov %r13, %rax
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(sma):
+ifdef(`PIC',
+` movslq 28(%r10,dn_param,4), %rax
+ lea (%rax,%r10), jaddr
+',`
+ mov 56(%r10,dn_param,8), jaddr
+')
+ jmp *jaddr
+
+L(1): mov (dp_param), %r10
+ xor R32(%rax), R32(%rax)
+ mov (up), %rdx
+ dec un
+ mov %rdx, %r9
+L(o1): mulx( dinv, %rdx, %r11) C next quotient
+ lea 8(up), up
+ mulx( %r10, %rcx, %rdx) C 0 1
+ add %r9, %rcx C 0
+ adc %rax, %rdx C 1
+ add (up), %rdx C 1
+ setc R8(%rax) C 2
+ mov %rdx, %r9 C 1
+ dec un
+ jnz L(o1)
+ mov %r9, (up)
+
+ FUNC_EXIT()
+ ret
+
+ifdef(`VER',,`define(`VER',1)')
+L(2): push %r12
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ sub dn_param, un C loop count
+ mov (up), q
+ imul dinv, q
+
+ifelse(VER,0,`
+ xor R32(%rax), R32(%rax)
+L(o2): test %eax, %eax C clear CF and OF
+ mulx( (dp), w2, w3) C 0 1
+ mulx( 8,(dp), %rdx, w1) C 1 2
+ add (up), w2 C 0
+ adc 8(up), %rdx C 1
+ adc $0, w1 C 2 cannot carry further
+ add w3, %rdx C 1
+ mov %rdx, 8(up) C 1
+ adc $0, w1 C 2
+ imul dinv, q C
+ bt $0, R32(%rax)
+ adc 16(up), w1 C 2
+ mov w1, 16(up)
+ setc R8(%rax)
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+')
+ifelse(VER,1,`
+ push %rbx
+ push %r13
+ xor R32(%r13), R32(%r13)
+ mov (up), %rax
+ mov 8(up), %rbx
+L(o2): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3) C 0 1
+ mulx( 8,(dp), %rdx, w1) C 1 2
+ adox( %rax, w2) C 0
+ adcx( w3, %rdx) C 1
+ adox( %rbx, %rdx) C 1
+ adox( %rcx, w1) C 2 cannot carry further
+ mov %rdx, %rax C 1
+ adc %rcx, w1 C 2
+ imul dinv, q C
+ bt $0, R32(%r13)
+ adc 16(up), w1 C 2
+ mov w1, %rbx
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+
+ mov %rax, (up)
+ mov %rbx, 8(up)
+ mov %r13, %rax
+ pop %r13
+ pop %rbx
+')
+ifelse(VER,2,`
+ xor R32(%rax), R32(%rax)
+ mov (up), %r10
+ mov 8(up), %r9
+L(o2): mulx( (dp), %r12, %r11)
+ mulx( 8,(dp), %rdx, %rcx)
+ add %r11, %rdx C 1
+ adc $0, %rcx C 2
+ add %r10, %r12 C 0 add just to produce carry
+ adc %r9, %rdx C 1
+ mov %rdx, %r10 C 1
+ mulx( dinv, %rdx, %r12) C next quotient
+ adc %rax, %rcx C 2
+ setc R8(%rax) C 3
+ mov 16(up), %r9 C 2
+ add %rcx, %r9 C 2
+ adc $0, R32(%rax) C 3
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+
+ mov %r10, (up)
+ mov %r9, 8(up)
+')
+ifelse(VER,3,`
+ xor R32(%rax), R32(%rax)
+ mov (up), %r10
+ mov 8(up), %r9
+L(o2): mulx( (dp), %r12, %r11)
+ add %r10, %r12 C 0 add just to produce carry
+ mulx( 8,(dp), %rdx, %rcx)
+ adc %r11, %rdx C 1
+ adc $0, %rcx C 2
+ add %r9, %rdx C 1
+ mov %rdx, %r10 C 1
+ mulx( dinv, %rdx, %r12) C next quotient
+ adc %rax, %rcx C 2
+ setc R8(%rax) C 3
+ mov 16(up), %r9 C 2
+ add %rcx, %r9 C 2
+ adc $0, R32(%rax) C 3
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+
+ mov %r10, (up)
+ mov %r9, 8(up)
+')
+ pop %r14
+ pop %r12
+ FUNC_EXIT()
+ ret
+
+ifelse(eval(MAX_SPECIAL>=3),1,`
+L(3): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o3): xor R32(%rcx), R32(%rcx) C clear rcx, CF, and OF
+ mulx( (dp), w0, w1) C 0 1
+ adox( %rax, w0) C 0
+ mulx( 8,(dp), %rax, w3) C 1 2
+ adcx( w1, %rax) C 1
+ adox( %rbx, %rax) C 1
+ mulx( 16,(dp), %rbx, w1) C 2 3
+ mov dinv, q C 1
+ mulx( %rax, q, w0)
+ adcx( w3, %rbx) C 2
+ adox( 16,(up), %rbx) C 2
+ adox( %rcx, w1) C 3
+ adc $0, w1 C 3
+ bt $0, R32(%r13)
+ adc w1, 24(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o3)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=4),1,`
+L(4): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o4): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3)
+ adox( %rax, w2)
+ mulx( 8,(dp), %rax, w1)
+ adcx( w3, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w3)
+ adcx( w1, %rbx)
+ mulx( 24,(dp), w0, w1)
+ mov dinv, q
+ mulx( %rax, q, w2)
+ adox( 16,(up), %rbx)
+ adcx( w3, w0)
+ adox( 24,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 24(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 32(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o4)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=5),1,`
+L(5): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o5): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w0, w1)
+ adox( %rax, w0)
+ mulx( 8,(dp), %rax, w3)
+ adcx( w1, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w1)
+ adcx( w3, %rbx)
+ adox( 16,(up), %rbx)
+ mulx( 24,(dp), w2, w3)
+ adcx( w1, w2)
+ mulx( 32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov dinv, q
+ mulx( %rax, q, w3)
+ mov w2, 24(up)
+ adox( 32,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 32(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 40(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o5)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=6),1,`
+L(6): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o6): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3)
+ adox( %rax, w2)
+ mulx( 8,(dp), %rax, w1)
+ adcx( w3, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w3)
+ adcx( w1, %rbx)
+ mulx( 24,(dp), w0, w1)
+ adox( 16,(up), %rbx)
+ adcx( w3, w0)
+ adox( 24,(up), w0)
+ mulx( 32,(dp), w2, w3)
+ mov w0, 24(up)
+ adcx( w1, w2)
+ mulx( 40,(dp), w0, w1)
+ adox( 32,(up), w2)
+ adcx( w3, w0)
+ mov dinv, q
+ mulx( %rax, q, w3)
+ mov w2, 32(up)
+ adox( 40,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 40(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 48(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o6)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=7),1,`
+L(7): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp
+ xor %r13, %r13
+ sub dn_param, un
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o7): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w0, w1)
+ adox( %rax, w0)
+ mulx( 8,(dp), %rax, w3)
+ adcx( w1, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w1)
+ adcx( w3, %rbx)
+ mulx( 24,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 16,(up), %rbx)
+ mulx( 32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov w2, 24(up)
+ adox( 32,(up), w0)
+ mulx( 40,(dp), w2, w3)
+ mov w0, 32(up)
+ adcx( w1, w2)
+ mulx( 48,(dp), w0, w1)
+ adox( 40,(up), w2)
+ adcx( w3, w0)
+ mov w2, 40(up)
+ mov %rax, q
+ mulx( dinv, q, w2)
+ adox( 48,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 48(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 56(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o7)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=8),1,`
+L(8): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp
+ xor %r13, %r13
+ sub dn_param, un
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o8): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3)
+ adox( %rax, w2)
+ mulx( 8,(dp), %rax, w1)
+ adcx( w3, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w3)
+ adcx( w1, %rbx)
+ mulx( 24,(dp), w0, w1)
+ adox( 16,(up), %rbx)
+ adcx( w3, w0)
+ mulx( 32,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 24,(up), w0)
+ mov w0, 24(up)
+ mulx( 40,(dp), w0, w1)
+ adox( 32,(up), w2)
+ adcx( w3, w0)
+ mov w2, 32(up)
+ adox( 40,(up), w0)
+ mulx( 48,(dp), w2, w3)
+ mov w0, 40(up)
+ adcx( w1, w2)
+ mulx( 56,(dp), w0, w1)
+ adox( 48,(up), w2)
+ adcx( w3, w0)
+ mov dinv, q
+ mulx( %rax, q, w3)
+ mov w2, 48(up)
+ adox( 56,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 56(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 64(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o8)
+ jmp L(esma)
+')
+
+L(esma):mov %rax, (up)
+ mov %rbx, 8(up)
+ mov %r13, %rax
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+
+ JUMPTABSECT
+ ALIGN(8)
+L(atab):JMPENT( L(f0), L(atab))
+ JMPENT( L(f1), L(atab))
+ JMPENT( L(f2), L(atab))
+ JMPENT( L(f3), L(atab))
+ JMPENT( L(f4), L(atab))
+ JMPENT( L(f5), L(atab))
+ JMPENT( L(f6), L(atab))
+ JMPENT( L(f7), L(atab))
+ JMPENT( L(1), L(atab))
+ JMPENT( L(2), L(atab))
+ JMPENT( L(3), L(atab))
+ JMPENT( L(4), L(atab))
+ JMPENT( L(5), L(atab))
+ JMPENT( L(6), L(atab))
+ JMPENT( L(7), L(atab))
+ JMPENT( L(8), L(atab))
+ TEXT
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm
new file mode 100644
index 0000000..e81b01b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm
@@ -0,0 +1,839 @@
+dnl AMD64 mpn_sqr_basecase optimised for Intel Broadwell.
+
+dnl Copyright 2015, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 addmul_1
+C AMD K8,K9 n/a n/a
+C AMD K10 n/a n/a
+C AMD bd1 n/a n/a
+C AMD bd2 n/a n/a
+C AMD bd3 n/a n/a
+C AMD bd4 ? ?
+C AMD zen ? ?
+C AMD bt1 n/a n/a
+C AMD bt2 n/a n/a
+C Intel P4 n/a n/a
+C Intel PNR n/a n/a
+C Intel NHM n/a n/a
+C Intel SBR n/a n/a
+C Intel IBR n/a n/a
+C Intel HWL 1.68 n/a
+C Intel BWL 1.51 1.67-1.74
+C Intel SKL 1.52 1.63-1.71
+C Intel atom n/a n/a
+C Intel SLM n/a n/a
+C VIA nano n/a n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * We have 8 addmul_1 loops which fall into each other. The idea is to save
+C on switching code, since a circularly updated computed goto target will
+C   hardly allow correct branch prediction.  On second thought, this might
+C   instead leave each of the 8 loop branches poorly predicted, since each of
+C   them now runs less often.  With just one addmul_1 loop, the loop count
+C   would change only once every 8th outer pass.
+C * Do overlapped software pipelining.
+C * Perhaps load in shrx/sarx, eliminating separate load insn.
+C * Schedule add+stored in small n code.
+C * Try swapping adox and adcx insn, making mulx have more time to run.
+
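+C For orientation, a plain C sketch of what basecase squaring computes (an
+C illustration, not the schedule used below): each cross product up[i]*up[j]
+C with i < j is accumulated once, that sum is doubled, and the diagonal
+C squares up[i]^2 are added in at positions 2*i.
+C
+C	#include <stdint.h>
+C	#include <string.h>
+C
+C	/* rp[0..2n-1] = {up,n}^2, assuming 64-bit limbs and n >= 1 */
+C	void sqr_basecase_ref (uint64_t *rp, const uint64_t *up, int n)
+C	{
+C	  memset (rp, 0, 2 * n * sizeof (uint64_t));
+C	  for (int i = 0; i < n; i++)		/* cross products, once each */
+C	    {
+C	      uint64_t cy = 0;
+C	      for (int j = i + 1; j < n; j++)
+C	        {
+C	          unsigned __int128 t = (unsigned __int128) up[i] * up[j]
+C	                                + rp[i + j] + cy;
+C	          rp[i + j] = (uint64_t) t;
+C	          cy = (uint64_t) (t >> 64);
+C	        }
+C	      rp[i + n] = cy;
+C	    }
+C	  uint64_t b = 0;			/* double the cross part */
+C	  for (int k = 0; k < 2 * n; k++)
+C	    {
+C	      uint64_t hi = rp[k] >> 63;
+C	      rp[k] = (rp[k] << 1) | b;
+C	      b = hi;
+C	    }
+C	  uint64_t cy = 0;			/* add the diagonal squares */
+C	  for (int i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 t = (unsigned __int128) up[i] * up[i]
+C	                            + rp[2 * i] + cy;
+C	      rp[2 * i] = (uint64_t) t;
+C	      unsigned __int128 t2 = (unsigned __int128) rp[2 * i + 1]
+C	                             + (uint64_t) (t >> 64);
+C	      rp[2 * i + 1] = (uint64_t) t2;
+C	      cy = (uint64_t) (t2 >> 64);
+C	    }
+C	}
+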
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+
+define(`n', `%rcx')
+define(`un_save', `%rbx')
+define(`u0', `%rdx')
+
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+
+ cmp $2, un_param
+ jae L(gt1)
+
+ mov (up), %rdx
+ mulx( %rdx, %rax, %rdx)
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): jne L(gt2)
+
+ mov (up), %rdx
+ mov 8(up), %rcx
+ mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2
+ mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1
+ mov %rcx, %rdx
+ mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3
+ add %r9, %r9 C W 1
+ adc %r10, %r10 C W 2
+ adc $0, %rdx C W 3
+ add %r9, %r8 C W 1
+ adc %r11, %r10 C W 2
+ adc $0, %rdx C W 3
+ mov %rax, (rp)
+ mov %r8, 8(rp)
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt2): cmp $4, un_param
+ jae L(gt3)
+
+ push %rbx
+ mov (up), %rdx
+ mulx( 8,(up), w2, w3)
+ mulx( 16,(up), w0, w1)
+ add w3, w0
+ mov 8(up), %rdx
+ mulx( 16,(up), %rax, w3)
+ adc %rax, w1
+ adc $0, w3
+ test R32(%rbx), R32(%rbx)
+ mov (up), %rdx
+ mulx( %rdx, %rbx, %rcx)
+ mov %rbx, (rp)
+ mov 8(up), %rdx
+ mulx( %rdx, %rax, %rbx)
+ mov 16(up), %rdx
+ mulx( %rdx, %rsi, %rdx)
+ adcx( w2, w2)
+ adcx( w0, w0)
+ adcx( w1, w1)
+ adcx( w3, w3)
+ adox( w2, %rcx)
+ adox( w0, %rax)
+ adox( w1, %rbx)
+ adox( w3, %rsi)
+ mov $0, R32(%r8)
+ adox( %r8, %rdx)
+ adcx( %r8, %rdx)
+ mov %rcx, 8(rp)
+ mov %rax, 16(rp)
+ mov %rbx, 24(rp)
+ mov %rsi, 32(rp)
+ mov %rdx, 40(rp)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(gt3): push %rbx
+
+ lea -3(un_param), R32(un_save)
+ lea 5(un_param), R32(n)
+ mov R32(un_param), R32(%rax)
+ and $-8, R32(un_save)
+ shr $3, R32(n) C count for mul_1 loop
+	neg	un_save			C 8*count and offset for addmul_1 loops
+ and $7, R32(%rax) C clear CF for adc as side-effect
+
+ mov (up), u0
+
+ lea L(mtab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %r8
+ lea (%r8, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%rax,8)
+')
+
+L(mf0): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ lea 64(up), up
+ add w1, w2
+ jmp L(mb0)
+
+L(mf3): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mov w2, (rp)
+ mulx( 8,(up), w0, w1)
+ lea 24(up), up
+ lea 24(rp), rp
+ add w3, w0
+ jmp L(mb3)
+
+L(mf4): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ add w1, w2
+ jmp L(mb4)
+
+L(mf5): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
+ lea 40(up), up
+ lea 40(rp), rp
+ add w3, w0
+ jmp L(mb5)
+
+L(mf6): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
+ lea 48(up), up
+ lea 48(rp), rp
+ add w1, w2
+ jmp L(mb6)
+
+L(mf7): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
+ lea 56(up), up
+ lea 56(rp), rp
+ add w3, w0
+ jmp L(mb7)
+
+L(mf1): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
+ lea 8(up), up
+ lea 8(rp), rp
+ add w3, w0
+ jmp L(mb1)
+
+L(mf2): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
+ lea 16(up), up
+ lea 16(rp), rp
+ dec R32(n)
+ add w1, w2
+ mulx( (up), w0, w1)
+
+ ALIGN(16)
+L(top): mov w2, -8(rp)
+ adc w3, w0
+L(mb1): mulx( 8,(up), w2, w3)
+ adc w1, w2
+ lea 64(up), up
+L(mb0): mov w0, (rp)
+ mov w2, 8(rp)
+ mulx( -48,(up), w0, w1)
+ lea 64(rp), rp
+ adc w3, w0
+L(mb7): mulx( -40,(up), w2, w3)
+ mov w0, -48(rp)
+ adc w1, w2
+L(mb6): mov w2, -40(rp)
+ mulx( -32,(up), w0, w1)
+ adc w3, w0
+L(mb5): mulx( -24,(up), w2, w3)
+ mov w0, -32(rp)
+ adc w1, w2
+L(mb4): mulx( -16,(up), w0, w1)
+ mov w2, -24(rp)
+ adc w3, w0
+L(mb3): mulx( -8,(up), w2, w3)
+ adc w1, w2
+ mov w0, -16(rp)
+ dec R32(n)
+ mulx( (up), w0, w1)
+ jnz L(top)
+
+L(end): mov w2, -8(rp)
+ adc w3, w0
+C mov w0, (rp)
+C adc %rcx, w1
+C mov w1, 8(rp)
+
+ lea L(atab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %r11
+ lea (%r11, %r10), %r11
+',`
+ mov (%r10,%rax,8), %r11
+')
+ mov $63, R32(%rax)
+ jmp *%r11
+
+L(ed0): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f7): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea -64(up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov (up), w1 C up[-1]
+ mov 8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ jmp L(b7)
+
+ ALIGN(16)
+L(tp0): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed0)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+L(b0): mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp0)
+
+L(ed1): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f0): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea -64(up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -8(up), w3 C up[-1]
+ mov (up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ jmp L(b0)
+
+ ALIGN(16)
+L(tp1): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed1)
+L(b1): mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp1)
+
+L(ed2): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f1): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea 8(un_save), un_save
+ lea -56(rp,un_save,8), rp
+ mov -16(up), w1 C up[-1]
+ mov -8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2) C FIXME: crossjump?
+ mulx( (up), w0, w1)
+ adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jmp L(b1)
+
+ ALIGN(16)
+L(tp2): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed2)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+L(b2): adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp2)
+
+L(ed3): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f2): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ or R32(un_save), R32(n)
+ jz L(cor3)
+ lea -56(rp,un_save,8), rp
+ mov -24(up), w3 C up[-1]
+ mov -16(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ jmp L(b2)
+
+ ALIGN(16)
+L(tp3): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed3)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+L(b3): mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp3)
+
+L(ed4): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f3): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -32(up), w1 C up[-1]
+ mov -24(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2)
+ jmp L(b3)
+
+ ALIGN(16)
+L(tp4): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed4)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+L(b4): mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp4)
+
+L(ed5): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f4): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -40(up), w3 C up[-1]
+ mov -32(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ jmp L(b4)
+
+ ALIGN(16)
+L(tp5): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed5)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+L(b5): mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp5)
+
+L(ed6): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f5): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -48(up), w1 C up[-1]
+ mov -40(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2)
+ jmp L(b5)
+
+ ALIGN(16)
+L(tp6): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed6)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+L(b6): adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp6)
+
+L(ed7): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f6): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -56(up), w3 C up[-1]
+ mov -48(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ mulx( -40,(up), w2, w3)
+ jmp L(b6)
+
+ ALIGN(16)
+L(tp7): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed7)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+L(b7): adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp7)
+
+L(cor3):lea -64(rp), rp
+ mov -24(up), w3 C up[-1]
+ mov -16(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ adox( 56,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 56(rp)
+ adcx( w1, w2)
+ mulx( (up), %rbx, w1)
+ adox( 64,(rp), w2)
+ adcx( w3, %rbx)
+ mov w2, 64(rp)
+ adox( 72,(rp), %rbx)
+ adox( %rcx, w1) C relies on rcx = 0
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 80(rp) C FIXME
+C wd2
+ mov -16(up), w1 C up[-1]
+ mov -8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2)
+ mulx( (up), w0, %rax)
+ adox( %rbx, w2)
+ adcx( w3, w0)
+ mov w2, 72(rp)
+ adox( 80,(rp), w0)
+ adox( %rcx, %rax) C relies on rcx = 0
+ mov w0, 80(rp)
+ adc %rcx, %rax C relies on rcx = 0
+C wd1
+ mov -8(up), w3 C up[-1]
+ mov (up), u0 C up[0]
+ sar $63, w3
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ adcx( w3, w0)
+ adox( %rax, w0)
+ mov w0, 88(rp)
+ adcx( %rcx, w1)
+ adox( %rcx, w1)
+ mov w1, 96(rp)
+
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+ JUMPTABSECT
+ ALIGN(8)
+L(mtab):JMPENT( L(mf7), L(mtab))
+ JMPENT( L(mf0), L(mtab))
+ JMPENT( L(mf1), L(mtab))
+ JMPENT( L(mf2), L(mtab))
+ JMPENT( L(mf3), L(mtab))
+ JMPENT( L(mf4), L(mtab))
+ JMPENT( L(mf5), L(mtab))
+ JMPENT( L(mf6), L(mtab))
+L(atab):JMPENT( L(f6), L(atab))
+ JMPENT( L(f7), L(atab))
+ JMPENT( L(f0), L(atab))
+ JMPENT( L(f1), L(atab))
+ JMPENT( L(f2), L(atab))
+ JMPENT( L(f3), L(atab))
+ JMPENT( L(f4), L(atab))
+ JMPENT( L(f5), L(atab))
+ TEXT
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm
new file mode 100644
index 0000000..9d1c405
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm
@@ -0,0 +1,241 @@
+dnl AMD64 mpn_addmul_2 optimised for Intel Haswell.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bull n/a
+C AMD pile n/a
+C AMD steam n/a
+C AMD excavator ?
+C AMD bobcat n/a
+C AMD jaguar n/a
+C Intel P4 n/a
+C Intel core n/a
+C Intel NHM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL 2.15
+C Intel BWL 2.33
+C Intel SKL 2.22
+C Intel atom n/a
+C Intel SLM n/a
+C VIA nano n/a
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param',`%rdx')
+define(`vp', `%rcx')
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+define(`X0', `%r12')
+define(`X1', `%r13')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_addmul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov n_param, n
+ shr $2, n
+
+ test $1, R8(n_param)
+ jnz L(bx1)
+
+L(bx0): mov (rp), X0
+ mov 8(rp), X1
+ test $2, R8(n_param)
+ jnz L(b10)
+
+L(b00): mov (up), %rdx
+ lea 16(up), up
+ mulx( v0, %rax, w1)
+ add %rax, X0
+ mulx( v1, %rax, w2)
+ adc $0, w1
+ mov X0, (rp)
+ add %rax, X1
+ adc $0, w2
+ mov -8(up), %rdx
+ lea 16(rp), rp
+ jmp L(lo0)
+
+L(b10): mov (up), %rdx
+ inc n
+ mulx( v0, %rax, w1)
+ add %rax, X0
+ adc $0, w1
+ mulx( v1, %rax, w2)
+ mov X0, (rp)
+ mov 16(rp), X0
+ add %rax, X1
+ adc $0, w2
+ xor w0, w0
+ jmp L(lo2)
+
+L(bx1): mov (rp), X1
+ mov 8(rp), X0
+ test $2, R8(n_param)
+ jnz L(b11)
+
+L(b01): mov (up), %rdx
+ mulx( v0, %rax, w3)
+ add %rax, X1
+ adc $0, w3
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ adc $0, w0
+ mov 8(up), %rdx
+ mov X1, (rp)
+ mov 16(rp), X1
+ mulx( v0, %rax, w1)
+ lea 24(rp), rp
+ lea 24(up), up
+ jmp L(lo1)
+
+L(b11): mov (up), %rdx
+ inc n
+ mulx( v0, %rax, w3)
+ add %rax, X1
+ adc $0, w3
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ adc $0, w0
+ mov X1, (rp)
+ mov 8(up), %rdx
+ mulx( v0, %rax, w1)
+ lea 8(rp), rp
+ lea 8(up), up
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top): mulx( v0, %rax, w3)
+ add w0, X1
+ adc $0, w2
+ add %rax, X1
+ adc $0, w3
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ adc $0, w0
+ lea 32(rp), rp
+ add w1, X1
+ mov -16(up), %rdx
+ mov X1, -24(rp)
+ adc $0, w3
+ add w2, X0
+ mov -8(rp), X1
+ mulx( v0, %rax, w1)
+ adc $0, w0
+L(lo1): add %rax, X0
+ mulx( v1, %rax, w2)
+ adc $0, w1
+ add w3, X0
+ mov X0, -16(rp)
+ adc $0, w1
+ add %rax, X1
+ adc $0, w2
+ add w0, X1
+ mov -8(up), %rdx
+ adc $0, w2
+L(lo0): mulx( v0, %rax, w3)
+ add %rax, X1
+ adc $0, w3
+ mov (rp), X0
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ adc $0, w0
+ add w1, X1
+ mov X1, -8(rp)
+ adc $0, w3
+ mov (up), %rdx
+ add w2, X0
+ mulx( v0, %rax, w1)
+ adc $0, w0
+L(lo3): add %rax, X0
+ adc $0, w1
+ mulx( v1, %rax, w2)
+ add w3, X0
+ mov 8(rp), X1
+ mov X0, (rp)
+ mov 16(rp), X0
+ adc $0, w1
+ add %rax, X1
+ adc $0, w2
+L(lo2): mov 8(up), %rdx
+ lea 32(up), up
+ dec n
+ jnz L(top)
+
+L(end): mulx( v0, %rax, w3)
+ add w0, X1
+ adc $0, w2
+ add %rax, X1
+ adc $0, w3
+ mulx( v1, %rdx, %rax)
+ add w1, X1
+ mov X1, 8(rp)
+ adc $0, w3
+ add w2, %rdx
+ adc $0, %rax
+ add w3, %rdx
+ mov %rdx, 16(rp)
+ adc $0, %rax
+
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm
new file mode 100644
index 0000000..ff0d27b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm
@@ -0,0 +1,38 @@
+dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+include_mpn(`x86_64/zen/aorrlsh_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm
new file mode 100644
index 0000000..fc99627
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm
@@ -0,0 +1,261 @@
+dnl AMD64 mpn_add_n, mpn_sub_n
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1 1.5 with fluctuations
+C AMD bd2 1.5 with fluctuations
+C AMD bd3
+C AMD bd4 1.6
+C AMD zen
+C AMD bt1
+C AMD bt2
+C Intel P4
+C Intel PNR
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL 1.21
+C Intel BWL 1.04
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`vp', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
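+C For reference, the semantics selected above, as an illustrative plain C
+C sketch assuming 64-bit limbs (the _nc entry points additionally take an
+C incoming carry/borrow in cy):
+C
+C	#include <stdint.h>
+C
+C	/* {rp,n} = {up,n} + {vp,n}, returning the carry-out limb (0 or 1);
+C	   mpn_sub_n subtracts and returns the borrow instead */
+C	uint64_t add_n_ref (uint64_t *rp, const uint64_t *up,
+C	                    const uint64_t *vp, long n)
+C	{
+C	  uint64_t cy = 0;
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      uint64_t s = up[i] + cy;
+C	      cy = s < cy;		/* carry from adding cy */
+C	      s += vp[i];
+C	      cy += s < vp[i];		/* carry from adding vp[i] */
+C	      rp[i] = s;
+C	    }
+C	  return cy;
+C	}
+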
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+
+ mov R32(n), R32(%rax)
+ shr $3, n
+ and $7, R32(%rax)
+
+ lea L(tab)(%rip), %r9
+ neg %r8 C set carry
+ifdef(`PIC',`
+ movslq (%r9,%rax,4), %rax
+ lea (%r9,%rax), %rax C lea not add to preserve carry
+ jmp *%rax
+',`
+ jmp *(%r9,%rax,8)
+')
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+
+ mov R32(n), R32(%rax)
+ shr $3, n
+ and $7, R32(%rax) C clear cy as side-effect
+
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',`
+ movslq (%r9,%rax,4), %rax
+ lea (%r9,%rax), %rax C lea not add to preserve carry
+ jmp *%rax
+',`
+ jmp *(%r9,%rax,8)
+')
+
+L(0): mov (up), %r8
+ mov 8(up), %r9
+ ADCSBB (vp), %r8
+ jmp L(e0)
+
+L(4): mov (up), %r8
+ mov 8(up), %r9
+ ADCSBB (vp), %r8
+ lea -32(up), up
+ lea -32(vp), vp
+ lea -32(rp), rp
+ inc n
+ jmp L(e4)
+
+L(5): mov (up), %r11
+ mov 8(up), %r8
+ mov 16(up), %r9
+ ADCSBB (vp), %r11
+ lea -24(up), up
+ lea -24(vp), vp
+ lea -24(rp), rp
+ inc n
+ jmp L(e5)
+
+L(6): mov (up), %r10
+ ADCSBB (vp), %r10
+ mov 8(up), %r11
+ lea -16(up), up
+ lea -16(vp), vp
+ lea -16(rp), rp
+ inc n
+ jmp L(e6)
+
+L(7): mov (up), %r9
+ mov 8(up), %r10
+ ADCSBB (vp), %r9
+ ADCSBB 8(vp), %r10
+ lea -8(up), up
+ lea -8(vp), vp
+ lea -8(rp), rp
+ inc n
+ jmp L(e7)
+
+ ALIGN(16)
+L(top):
+L(e3): mov %r9, 40(rp)
+L(e2): mov %r10, 48(rp)
+L(e1): mov (up), %r8
+ mov 8(up), %r9
+ ADCSBB (vp), %r8
+ mov %r11, 56(rp)
+ lea 64(rp), rp
+L(e0): mov 16(up), %r10
+ ADCSBB 8(vp), %r9
+ ADCSBB 16(vp), %r10
+ mov %r8, (rp)
+L(e7): mov 24(up), %r11
+ mov %r9, 8(rp)
+L(e6): mov 32(up), %r8
+ mov 40(up), %r9
+ ADCSBB 24(vp), %r11
+ mov %r10, 16(rp)
+L(e5): ADCSBB 32(vp), %r8
+ mov %r11, 24(rp)
+L(e4): mov 48(up), %r10
+ mov 56(up), %r11
+ mov %r8, 32(rp)
+ lea 64(up), up
+ ADCSBB 40(vp), %r9
+ ADCSBB 48(vp), %r10
+ ADCSBB 56(vp), %r11
+ lea 64(vp), vp
+ dec n
+ jnz L(top)
+
+L(end): mov %r9, 40(rp)
+ mov %r10, 48(rp)
+ mov %r11, 56(rp)
+ mov R32(n), R32(%rax)
+ adc R32(n), R32(%rax)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(3): mov (up), %r9
+ mov 8(up), %r10
+ mov 16(up), %r11
+ ADCSBB (vp), %r9
+ ADCSBB 8(vp), %r10
+ ADCSBB 16(vp), %r11
+ jrcxz L(x3)
+ lea 24(up), up
+ lea 24(vp), vp
+ lea -40(rp), rp
+ jmp L(e3)
+L(x3): mov %r9, (rp)
+ mov %r10, 8(rp)
+ mov %r11, 16(rp)
+ mov R32(n), R32(%rax)
+ adc R32(n), R32(%rax)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(1): mov (up), %r11
+ ADCSBB (vp), %r11
+ jrcxz L(x1)
+ lea 8(up), up
+ lea 8(vp), vp
+ lea -56(rp), rp
+ jmp L(e1)
+L(x1): mov %r11, (rp)
+ mov R32(n), R32(%rax)
+ adc R32(n), R32(%rax)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(2): mov (up), %r10
+ mov 8(up), %r11
+ ADCSBB (vp), %r10
+ ADCSBB 8(vp), %r11
+ jrcxz L(x2)
+ lea 16(up), up
+ lea 16(vp), vp
+ lea -48(rp), rp
+ jmp L(e2)
+L(x2): mov %r10, (rp)
+ mov %r11, 8(rp)
+ mov R32(n), R32(%rax)
+ adc R32(n), R32(%rax)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(0), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
+ JMPENT( L(4), L(tab))
+ JMPENT( L(5), L(tab))
+ JMPENT( L(6), L(tab))
+ JMPENT( L(7), L(tab))
diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm
new file mode 100644
index 0000000..3f43afa
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm
@@ -0,0 +1,201 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Haswell.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bull -
+C AMD pile -
+C AMD steam -
+C AMD excavator -
+C AMD bobcat -
+C AMD jaguar -
+C Intel P4 -
+C Intel core2 -
+C Intel NHM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL 2.32
+C Intel BWL 2.04
+C Intel SKL 1.95
+C Intel atom -
+C Intel SLM -
+C VIA nano -
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Handle small n separately, for lower overhead.
+
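+C For reference, the function semantics implemented here, as an illustrative
+C plain C sketch assuming 64-bit limbs (ADDSUB/ADCSBB above select the
+C submul_1 variant, which subtracts and returns a borrow instead):
+C
+C	#include <stdint.h>
+C
+C	/* {rp,n} += {up,n} * v0, returning the carry limb */
+C	uint64_t addmul_1_ref (uint64_t *rp, const uint64_t *up,
+C	                       long n, uint64_t v0)
+C	{
+C	  uint64_t cy = 0;
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
+C	      rp[i] = (uint64_t) t;
+C	      cy = (uint64_t) (t >> 64);
+C	    }
+C	  return cy;
+C	}
+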
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rbp')
+define(`v0', `%rdx')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`ADCSBB', `adc')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`ADCSBB', `sbb')
+ define(`func', `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ mov n_param, n
+ mov v0_param, v0
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): shr $2, n
+ jc L(b10)
+
+L(b00): mulx( (up), %r13, %r12)
+ mulx( 8,(up), %rbx, %rax)
+ add %r12, %rbx
+ adc $0, %rax
+ mov (rp), %r12
+ mov 8(rp), %rcx
+ mulx( 16,(up), %r9, %r8)
+ lea -16(rp), rp
+ lea 16(up), up
+ ADDSUB %r13, %r12
+ jmp L(lo0)
+
+L(bx1): shr $2, n
+ jc L(b11)
+
+L(b01): mulx( (up), %r11, %r10)
+ jnz L(gt1)
+L(n1): ADDSUB %r11, (rp)
+ mov $0, R32(%rax)
+ adc %r10, %rax
+ jmp L(ret)
+
+L(gt1): mulx( 8,(up), %r13, %r12)
+ mulx( 16,(up), %rbx, %rax)
+ lea 24(up), up
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov (rp), %r10
+ mov 8(rp), %r12
+ mov 16(rp), %rcx
+ lea -8(rp), rp
+ ADDSUB %r11, %r10
+ jmp L(lo1)
+
+L(b11): mulx( (up), %rbx, %rax)
+ mov (rp), %rcx
+ mulx( 8,(up), %r9, %r8)
+ lea 8(up), up
+ lea -24(rp), rp
+ inc n C adjust n
+ ADDSUB %rbx, %rcx
+ jmp L(lo3)
+
+L(b10): mulx( (up), %r9, %r8)
+ mulx( 8,(up), %r11, %r10)
+ lea -32(rp), rp
+ mov $0, R32(%rax)
+ clc C clear cf
+ jz L(end) C depends on old shift
+
+ ALIGN(16)
+L(top): adc %rax, %r9
+ lea 32(rp), rp
+ adc %r8, %r11
+ mulx( 16,(up), %r13, %r12)
+ mov (rp), %r8
+ mulx( 24,(up), %rbx, %rax)
+ lea 32(up), up
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov 8(rp), %r10
+ mov 16(rp), %r12
+ ADDSUB %r9, %r8
+ mov 24(rp), %rcx
+ mov %r8, (rp)
+ ADCSBB %r11, %r10
+L(lo1): mulx( (up), %r9, %r8)
+ mov %r10, 8(rp)
+ ADCSBB %r13, %r12
+L(lo0): mov %r12, 16(rp)
+ ADCSBB %rbx, %rcx
+L(lo3): mulx( 8,(up), %r11, %r10)
+ mov %rcx, 24(rp)
+ dec n
+ jnz L(top)
+
+L(end): adc %rax, %r9
+ adc %r8, %r11
+ mov 32(rp), %r8
+ mov %r10, %rax
+ adc $0, %rax
+ mov 40(rp), %r10
+ ADDSUB %r9, %r8
+ mov %r8, 32(rp)
+ ADCSBB %r11, %r10
+ mov %r10, 40(rp)
+ adc $0, %rax
+
+L(ret): pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm
new file mode 100644
index 0000000..b5863b6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm
@@ -0,0 +1,138 @@
+dnl AMD64 mpn_gcd_22. Assumes useless bsf, useless shrd, useful tzcnt, shlx.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bd1 -
+C AMD bd2 -
+C AMD bd3 -
+C AMD bd4 6.7
+C AMD bt1 -
+C AMD bt2 -
+C AMD zn1 5.4
+C AMD zn2 5.5
+C Intel P4 -
+C Intel CNR -
+C Intel PNR -
+C Intel NHM -
+C Intel WSM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL 7.1
+C Intel BWL 5.5
+C Intel SKL 5.6
+C Intel atom -
+C Intel SLM -
+C Intel GLM -
+C Intel GLM+ -
+C VIA nano -
+
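+C The loop below runs a binary gcd on a pair of two-limb (128-bit) operands,
+C with both assumed odd: each step replaces u by |u - v| with its trailing
+C zero bits stripped (tzcnt) and v by min(u,v).  A minimal sketch of the same
+C recurrence, using unsigned __int128 purely to keep it short (the asm works
+C on the limb pairs directly):
+C
+C	/* gcd of two odd, nonzero 128-bit values */
+C	unsigned __int128 gcd_22_ref (unsigned __int128 u, unsigned __int128 v)
+C	{
+C	  while (u != v)
+C	    {
+C	      unsigned __int128 hi = u > v ? u : v;
+C	      unsigned __int128 lo = u > v ? v : u;
+C	      unsigned __int128 t = hi - lo;	/* even, since u and v are odd */
+C	      do t >>= 1; while ((t & 1) == 0);	/* strip trailing zeros */
+C	      u = t;				/* odd again */
+C	      v = lo;
+C	    }
+C	  return u;
+C	}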
+
+define(`u1', `%rdi')
+define(`u0', `%rsi')
+define(`v1', `%rdx')
+define(`v0', `%rcx')
+
+define(`s0', `%r8')
+define(`s1', `%r9')
+define(`t0', `%r10')
+define(`t1', `%r11')
+define(`cnt', `%rax')
+
+dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_22)
+ FUNC_ENTRY(4)
+
+ ALIGN(16)
+L(top): mov v0, t0
+ sub u0, t0
+ jz L(lowz) C jump when low limb result = 0
+ mov v1, t1
+ sbb u1, t1
+
+ rep;bsf t0, cnt C tzcnt!
+
+ mov u0, s0
+ sub v0, u0
+ mov u1, s1
+ sbb v1, u1
+
+L(bck): cmovc t0, u0 C u = |u - v|
+ cmovc t1, u1 C u = |u - v|
+ cmovc s0, v0 C v = min(u,v)
+ cmovc s1, v1 C v = min(u,v)
+
+ xor R32(t0), R32(t0)
+ sub cnt, t0
+ shlx( t0, u1, s1)
+ shrx( cnt, u0, u0)
+ shrx( cnt, u1, u1)
+ or s1, u0
+
+ test v1, v1
+ jnz L(top)
+ test u1, u1
+ jnz L(top)
+
+L(gcd_11):
+ mov v0, %rdi
+C mov u0, %rsi
+ TCALL( mpn_gcd_11)
+
+L(lowz):C We come here when v0 - u0 = 0
+ C 1. If v1 - u1 = 0, then gcd is u = v.
+ C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+ mov v1, t0
+ sub u1, t0
+ je L(end)
+
+ xor t1, t1
+ mov u0, s0
+ mov u1, s1
+ rep;bsf t0, cnt C tzcnt!
+ mov u1, u0
+ xor u1, u1
+ sub v1, u0
+ jmp L(bck)
+
+L(end): mov v0, %rax
+ C mov v1, %rdx
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h
new file mode 100644
index 0000000..c11aeec
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h
@@ -0,0 +1,253 @@
+/* Haswell gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3600-4000 MHz Intel Xeon E3-1271v3 Haswell */
+/* FFT tuning limit = 467,964,359 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 26
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 9
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 25
+
+#define DIV_1_VS_MUL_1_PERCENT 427
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 74
+#define MUL_TOOM44_THRESHOLD 195
+#define MUL_TOOM6H_THRESHOLD 276
+#define MUL_TOOM8H_THRESHOLD 381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 120
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 139
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 128
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 170
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 117
+#define SQR_TOOM4_THRESHOLD 315
+#define SQR_TOOM6_THRESHOLD 414
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 42
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 376, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \
+ { 83,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 79,11}, { 47,10}, { 95,12}, { 31,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 135,11}, \
+ { 79,10}, { 167,11}, { 95,10}, { 191, 9}, \
+ { 383,11}, { 111,12}, { 63, 8}, { 1023,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
+ { 383,11}, { 207,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,11}, { 367,10}, { 735,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,11}, { 447,10}, \
+ { 895,11}, { 479,13}, { 127,11}, { 543,10}, \
+ { 1087,12}, { 287,11}, { 607,10}, { 1215,12}, \
+ { 319,11}, { 671,12}, { 351,11}, { 735,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,12}, { 447,11}, { 895,12}, { 479,11}, \
+ { 959,14}, { 127,12}, { 543,11}, { 1087,12}, \
+ { 607,11}, { 1215,10}, { 2431,12}, { 671,11}, \
+ { 1343,12}, { 703,11}, { 1407,12}, { 735,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 959,13}, \
+ { 511,12}, { 1087,11}, { 2175,13}, { 575,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1343,13}, \
+ { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1727,13}, { 959,12}, \
+ { 1919,14}, { 511,13}, { 1023,12}, { 2047,13}, \
+ { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \
+ { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1727,14}, { 895,13}, { 1791,12}, { 3583,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \
+ { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \
+ { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \
+ { 1023,15}, { 2047,14}, { 4351,15}, { 2303,14}, \
+ { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,17}, { 1023,16}, { 2047,15}, { 4863,16}, \
+ { 2559,15}, { 5887,14}, { 11775,16}, { 3071,15}, \
+ { 6911,16}, { 3583,15}, { 7679,14}, { 15359,15}, \
+ { 7935,17}, { 2047,16}, { 4095,15}, { 8447,16}, \
+ { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \
+ { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 238
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 368 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 368, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \
+ { 319,11}, { 95,10}, { 191,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303, 9}, \
+ { 607,11}, { 159,10}, { 319, 6}, { 5631, 7}, \
+ { 2943, 6}, { 5887, 8}, { 1535,11}, { 207,10}, \
+ { 415,11}, { 223,10}, { 447,11}, { 239,10}, \
+ { 479,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,11}, { 319,10}, { 639,11}, \
+ { 335,10}, { 671,11}, { 351,10}, { 703,11}, \
+ { 367,10}, { 735,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,11}, { 447,10}, { 895,11}, \
+ { 479,13}, { 127,11}, { 511,10}, { 1023,11}, \
+ { 543,10}, { 1087,12}, { 287,11}, { 607,10}, \
+ { 1215,11}, { 671,12}, { 351,11}, { 735,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,11}, { 895,12}, { 479,11}, { 959,14}, \
+ { 127,12}, { 511,11}, { 1023,12}, { 543,11}, \
+ { 1087,12}, { 607,11}, { 1215,12}, { 735,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 959,13}, \
+ { 511,12}, { 1087,13}, { 575,12}, { 1151,13}, \
+ { 639,12}, { 1279,13}, { 703,12}, { 1407,11}, \
+ { 2815,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1727,11}, { 3455,13}, { 959,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \
+ { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \
+ { 2687,13}, { 1407,12}, { 2815,13}, { 1471,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1727,14}, \
+ { 895,13}, { 1791,12}, { 3583,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \
+ { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \
+ { 1919,16}, { 511,15}, { 1023,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \
+ { 4351,15}, { 2303,14}, { 4863,15}, { 2815,14}, \
+ { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \
+ { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \
+ { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \
+ { 2047,16}, { 4095,15}, { 8191,16}, { 4607,15}, \
+ { 9983,14}, { 19967,16}, { 5631,15}, { 11775,17}, \
+ { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 237
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 68
+#define MULLO_MUL_N_THRESHOLD 8967
+#define SQRLO_BASECASE_THRESHOLD 11
+#define SQRLO_DC_THRESHOLD 80
+#define SQRLO_SQR_THRESHOLD 6481
+
+#define DC_DIV_QR_THRESHOLD 58
+#define DC_DIVAPPR_Q_THRESHOLD 182
+#define DC_BDIV_QR_THRESHOLD 60
+#define DC_BDIV_Q_THRESHOLD 123
+
+#define INV_MULMOD_BNM1_THRESHOLD 38
+#define INV_NEWTON_THRESHOLD 179
+#define INV_APPR_THRESHOLD 182
+
+#define BINV_NEWTON_THRESHOLD 230
+#define REDC_1_TO_REDC_2_THRESHOLD 48
+#define REDC_2_TO_REDC_N_THRESHOLD 63
+
+#define MU_DIV_QR_THRESHOLD 1470
+#define MU_DIVAPPR_Q_THRESHOLD 1528
+#define MUPI_DIV_QR_THRESHOLD 82
+#define MU_BDIV_QR_THRESHOLD 1334
+#define MU_BDIV_Q_THRESHOLD 1506
+
+#define POWM_SEC_TABLE 1,22,194,473,1297,2698
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 19
+#define SET_STR_DC_THRESHOLD 1391
+#define SET_STR_PRECOMPUTE_THRESHOLD 2654
+
+#define FAC_DSC_THRESHOLD 562
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD2_DIV1_METHOD 5 /* 3.49% faster than 3 */
+#define HGCD_THRESHOLD 96
+#define HGCD_APPR_THRESHOLD 92
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 501
+#define GCDEXT_DC_THRESHOLD 365
+#define JACOBI_BASE_METHOD 1 /* 23.87% faster than 4 */
+
+/* Tuneup completed successfully, took 238360 seconds */
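These constants are consumed at run time to choose between algorithm variants; for example, a multiplication routine typically compares the operand size in limbs against MUL_TOOM22_THRESHOLD and MUL_TOOM33_THRESHOLD before picking a code path. The snippet below only illustrates that dispatch pattern; the MY_-prefixed copies and pick_mul_algo are local inventions to keep it standalone, and the real selection logic has more levels and handles unbalanced operands.

    #include <stddef.h>

    #define MY_MUL_TOOM22_THRESHOLD 20   /* the Haswell value tuned above */
    #define MY_MUL_TOOM33_THRESHOLD 74

    enum mul_algo { MUL_BASECASE, MUL_TOOM22, MUL_TOOM33_OR_ABOVE };

    /* Which algorithm a balanced n-limb multiply would pick, going by the
       thresholds above -- illustrative of how the values are consulted.  */
    static enum mul_algo pick_mul_algo(size_t n)
    {
      if (n < MY_MUL_TOOM22_THRESHOLD)
        return MUL_BASECASE;          /* quadratic schoolbook code        */
      if (n < MY_MUL_TOOM33_THRESHOLD)
        return MUL_TOOM22;            /* 2-way Toom (Karatsuba)           */
      return MUL_TOOM33_OR_ABOVE;     /* 3-way Toom and larger splittings */
    }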
diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm
new file mode 100644
index 0000000..5e649e8
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm
@@ -0,0 +1,159 @@
+dnl AMD64 mpn_mul_1 using mulx optimised for Intel Haswell.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bull -
+C AMD pile -
+C AMD steam -
+C AMD excavator -
+C AMD bobcat -
+C AMD jaguar -
+C Intel P4 -
+C Intel core2 -
+C Intel NHM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL 1.59
+C Intel BWL 1.76
+C Intel SKL 1.54
+C Intel atom -
+C Intel SLM -
+C VIA nano -
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rbp')
+define(`v0', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_1)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ push %r12
+
+ mov n_param, n
+ shr $2, n
+
+ test $1, R8(n_param)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n_param)
+ mov v0_param, v0
+ jnz L(b10)
+
+L(b00): mulx( (up), %r9, %r8)
+ mulx( 8,(up), %r11, %r10)
+ mulx( 16,(up), %rcx, %r12)
+ lea -32(rp), rp
+ jmp L(lo0)
+
+L(b10): mulx( (up), %rcx, %r12)
+ mulx( 8,(up), %rbx, %rax)
+ lea -16(rp), rp
+ test n, n
+ jz L(cj2)
+ mulx( 16,(up), %r9, %r8)
+ lea 16(up), up
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n_param)
+ mov v0_param, v0
+ jnz L(b11)
+
+L(b01): mulx( (up), %rbx, %rax)
+ lea -24(rp), rp
+ test n, n
+ jz L(cj1)
+ mulx( 8,(up), %r9, %r8)
+ lea 8(up), up
+ jmp L(lo1)
+
+L(b11): mulx( (up), %r11, %r10)
+ mulx( 8,(up), %rcx, %r12)
+ mulx( 16,(up), %rbx, %rax)
+ lea -8(rp), rp
+ test n, n
+ jz L(cj3)
+ lea 24(up), up
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top): lea 32(rp), rp
+ mov %r9, (rp)
+ adc %r8, %r11
+L(lo3): mulx( (up), %r9, %r8)
+ mov %r11, 8(rp)
+ adc %r10, %rcx
+L(lo2): mov %rcx, 16(rp)
+ adc %r12, %rbx
+L(lo1): mulx( 8,(up), %r11, %r10)
+ adc %rax, %r9
+ mulx( 16,(up), %rcx, %r12)
+ mov %rbx, 24(rp)
+L(lo0): mulx( 24,(up), %rbx, %rax)
+ lea 32(up), up
+ dec n
+ jnz L(top)
+
+L(end): lea 32(rp), rp
+ mov %r9, (rp)
+ adc %r8, %r11
+L(cj3): mov %r11, 8(rp)
+ adc %r10, %rcx
+L(cj2): mov %rcx, 16(rp)
+ adc %r12, %rbx
+L(cj1): mov %rbx, 24(rp)
+ adc $0, %rax
+
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ASM_END()
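For reference, the operation the routine above implements is rp[0..n-1] = low limbs of {up,n} * v0, with the final carry limb returned. A minimal portable sketch, assuming 64-bit limbs and a compiler with unsigned __int128; the ref_mul_1 name is made up, and this is the plain one-limb-per-iteration loop, not the 4-way mulx loop above.

    #include <stdint.h>
    #include <stddef.h>

    static uint64_t ref_mul_1(uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
    {
      uint64_t cy = 0;                 /* carry limb between positions */
      for (size_t i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128) up[i] * v0 + cy;
        rp[i] = (uint64_t) p;          /* low 64 bits of the column     */
        cy    = (uint64_t) (p >> 64);  /* high 64 bits carry upwards    */
      }
      return cy;
    }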
diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm
new file mode 100644
index 0000000..f1f044f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm
@@ -0,0 +1,176 @@
+dnl AMD64 mpn_mul_2 optimised for Intel Haswell.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bull -
+C AMD pile -
+C AMD steam -
+C AMD excavator -
+C AMD bobcat -
+C AMD jaguar -
+C Intel P4 -
+C Intel core -
+C Intel NHM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL 3.74
+C Intel BWL 4.21
+C Intel SKL 4.20
+C Intel atom -
+C Intel SLM -
+C VIA nano -
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Move test and jcc together, for insn fusion.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param',`%rdx')
+define(`vp', `%rcx')
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ lea 3(n_param), n
+ shr $2, n
+
+ test $1, R8(n_param)
+ jnz L(bx1)
+
+L(bx0): xor w0, w0
+ test $2, R8(n_param)
+ mov (up), %rdx
+ mulx( v0, w2, w1)
+ jz L(lo0)
+
+L(b10): lea -16(rp), rp
+ lea -16(up), up
+ jmp L(lo2)
+
+L(bx1): xor w2, w2
+ test $2, R8(n_param)
+ mov (up), %rdx
+ mulx( v0, w0, w3)
+ jnz L(b11)
+
+L(b01): lea -24(rp), rp
+ lea 8(up), up
+ jmp L(lo1)
+
+L(b11): lea -8(rp), rp
+ lea -8(up), up
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top): mulx( v1, %rax, w0)
+ add %rax, w2 C 0
+ mov (up), %rdx
+ mulx( v0, %rax, w1)
+ adc $0, w0 C 1
+ add %rax, w2 C 0
+ adc $0, w1 C 1
+ add w3, w2 C 0
+L(lo0): mov w2, (rp) C 0
+ adc $0, w1 C 1
+ mulx( v1, %rax, w2)
+ add %rax, w0 C 1
+ mov 8(up), %rdx
+ adc $0, w2 C 2
+ mulx( v0, %rax, w3)
+ add %rax, w0 C 1
+ adc $0, w3 C 2
+ add w1, w0 C 1
+L(lo3): mov w0, 8(rp) C 1
+ adc $0, w3 C 2
+ mulx( v1, %rax, w0)
+ add %rax, w2 C 2
+ mov 16(up), %rdx
+ mulx( v0, %rax, w1)
+ adc $0, w0 C 3
+ add %rax, w2 C 2
+ adc $0, w1 C 3
+ add w3, w2 C 2
+L(lo2): mov w2, 16(rp) C 2
+ adc $0, w1 C 3
+ mulx( v1, %rax, w2)
+ add %rax, w0 C 3
+ mov 24(up), %rdx
+ adc $0, w2 C 4
+ mulx( v0, %rax, w3)
+ add %rax, w0 C 3
+ adc $0, w3 C 4
+ add w1, w0 C 3
+ lea 32(up), up
+L(lo1): mov w0, 24(rp) C 3
+ adc $0, w3 C 4
+ dec n
+ lea 32(rp), rp
+ jnz L(top)
+
+L(end): mulx( v1, %rdx, %rax)
+ add %rdx, w2
+ adc $0, %rax
+ add w3, w2
+ mov w2, (rp)
+ adc $0, %rax
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
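mul_2 processes two multiplier limbs per pass over up[], cutting loop overhead relative to two mul_1 passes. Functionally, as I read the contract its callers rely on, {up,n} is multiplied by the two-limb value {vp[0],vp[1]}, the low n+1 limbs of the product are stored at rp, and the top limb is returned. A hedged reference sketch (hypothetical ref_mul_2 name, 64-bit limbs, unsigned __int128 assumed):

    #include <stdint.h>
    #include <stddef.h>

    static uint64_t ref_mul_2(uint64_t *rp, const uint64_t *up, size_t n,
                              const uint64_t *vp)
    {
      /* first pass: rp[0..n] = {up,n} * vp[0] */
      unsigned __int128 acc = 0;
      for (size_t i = 0; i < n; i++) {
        acc += (unsigned __int128) up[i] * vp[0];
        rp[i] = (uint64_t) acc;
        acc >>= 64;
      }
      rp[n] = (uint64_t) acc;

      /* second pass: rp[1..n] += {up,n} * vp[1]; the carry out is returned */
      acc = 0;
      for (size_t i = 0; i < n; i++) {
        acc += (unsigned __int128) up[i] * vp[1] + rp[i + 1];
        rp[i + 1] = (uint64_t) acc;
        acc >>= 64;
      }
      return (uint64_t) acc;
    }

The tuned loop above interleaves the two passes so that both products for a given up[i] are issued back to back.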
diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm
new file mode 100644
index 0000000..b2656c8
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm
@@ -0,0 +1,441 @@
+dnl AMD64 mpn_mul_basecase optimised for Intel Haswell.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 mul_2 mul_3 addmul_2
+C AMD K8,K9 n/a n/a - n/a
+C AMD K10 n/a n/a - n/a
+C AMD bull n/a n/a - n/a
+C AMD pile n/a n/a - n/a
+C AMD steam ? ? - ?
+C AMD bobcat n/a n/a - n/a
+C AMD jaguar ? ? - ?
+C Intel P4 n/a n/a - n/a
+C Intel core n/a n/a - n/a
+C Intel NHM n/a n/a - n/a
+C Intel SBR n/a n/a - n/a
+C Intel IBR n/a n/a - n/a
+C Intel HWL 1.77 1.86 - 2.15
+C Intel BWL ? ? - ?
+C Intel atom n/a n/a - n/a
+C VIA nano n/a n/a - n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Adjoin a mul_3.
+C * Further micro-optimise.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp', `%rcx')
+define(`vn', `%r8')
+
+define(`un', `%rbx')
+
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`n', `%rbp')
+define(`v0', `%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ mov un_param, un C free up rdx
+ neg un
+
+ mov un_param, n C FIXME: share
+ sar $2, n C FIXME: share
+
+ test $1, R8(vn)
+ jz L(do_mul_2)
+
+define(`w4', `%r9')
+define(`w5', `%r14')
+
+ mov (vp), %rdx
+
+L(do_mul_1):
+ test $1, R8(un)
+ jnz L(m1x1)
+
+L(m1x0):test $2, R8(un)
+ jnz L(m110)
+
+L(m100):
+ mulx( (up), w5, w2)
+ mulx( 8,(up), w1, w3)
+ lea -24(rp), rp
+ jmp L(m1l0)
+
+L(m110):
+ mulx( (up), w3, w4)
+ mulx( 8,(up), w1, w5)
+ lea -8(rp), rp
+ test n, n
+ jz L(cj2)
+ mulx( 16,(up), w0, w2)
+ lea 16(up), up
+ jmp L(m1l2)
+
+L(m1x1):test $2, R8(un)
+ jz L(m111)
+
+L(m101):
+ mulx( (up), w4, w5)
+ lea -16(rp), rp
+ test n, n
+ jz L(cj1)
+ mulx( 8,(up), w0, w2)
+ lea 8(up), up
+ jmp L(m1l1)
+
+L(m111):
+ mulx( (up), w2, w3)
+ mulx( 8,(up), w0, w4)
+ mulx( 16,(up), w1, w5)
+ lea 24(up), up
+ test n, n
+ jnz L(gt3)
+ add w0, w3
+ jmp L(cj3)
+L(gt3): add w0, w3
+ jmp L(m1l3)
+
+ ALIGN(32)
+L(m1tp):lea 32(rp), rp
+L(m1l3):mov w2, (rp)
+ mulx( (up), w0, w2)
+L(m1l2):mov w3, 8(rp)
+ adc w1, w4
+L(m1l1):adc w0, w5
+ mov w4, 16(rp)
+ mulx( 8,(up), w1, w3)
+L(m1l0):mov w5, 24(rp)
+ mulx( 16,(up), w0, w4)
+ adc w1, w2
+ mulx( 24,(up), w1, w5)
+ adc w0, w3
+ lea 32(up), up
+ dec n
+ jnz L(m1tp)
+
+L(m1ed):lea 32(rp), rp
+L(cj3): mov w2, (rp)
+L(cj2): mov w3, 8(rp)
+ adc w1, w4
+L(cj1): mov w4, 16(rp)
+ adc $0, w5
+ mov w5, 24(rp)
+
+ dec R32(vn)
+ jz L(ret5)
+
+ lea 8(vp), vp
+ lea 32(rp), rp
+C push %r12
+C push %r13
+C push %r14
+ jmp L(do_addmul)
+
+L(do_mul_2):
+define(`v1', `%r14')
+C push %r12
+C push %r13
+C push %r14
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ lea (un), n
+ sar $2, n
+
+ test $1, R8(un)
+ jnz L(m2x1)
+
+L(m2x0):xor w0, w0
+ test $2, R8(un)
+ mov (up), %rdx
+ mulx( v0, w2, w1)
+ jz L(m2l0)
+
+L(m210):lea -16(rp), rp
+ lea -16(up), up
+ jmp L(m2l2)
+
+L(m2x1):xor w2, w2
+ test $2, R8(un)
+ mov (up), %rdx
+ mulx( v0, w0, w3)
+ jz L(m211)
+
+L(m201):lea -24(rp), rp
+ lea 8(up), up
+ jmp L(m2l1)
+
+L(m211):lea -8(rp), rp
+ lea -8(up), up
+ jmp L(m2l3)
+
+ ALIGN(16)
+L(m2tp):mulx( v1, %rax, w0)
+ add %rax, w2
+ mov (up), %rdx
+ mulx( v0, %rax, w1)
+ adc $0, w0
+ add %rax, w2
+ adc $0, w1
+ add w3, w2
+L(m2l0):mov w2, (rp)
+ adc $0, w1
+ mulx( v1, %rax, w2)
+ add %rax, w0
+ mov 8(up), %rdx
+ adc $0, w2
+ mulx( v0, %rax, w3)
+ add %rax, w0
+ adc $0, w3
+ add w1, w0
+L(m2l3):mov w0, 8(rp)
+ adc $0, w3
+ mulx( v1, %rax, w0)
+ add %rax, w2
+ mov 16(up), %rdx
+ mulx( v0, %rax, w1)
+ adc $0, w0
+ add %rax, w2
+ adc $0, w1
+ add w3, w2
+L(m2l2):mov w2, 16(rp)
+ adc $0, w1
+ mulx( v1, %rax, w2)
+ add %rax, w0
+ mov 24(up), %rdx
+ adc $0, w2
+ mulx( v0, %rax, w3)
+ add %rax, w0
+ adc $0, w3
+ add w1, w0
+ lea 32(up), up
+L(m2l1):mov w0, 24(rp)
+ adc $0, w3
+ inc n
+ lea 32(rp), rp
+ jnz L(m2tp)
+
+L(m2ed):mulx( v1, %rdx, %rax)
+ add %rdx, w2
+ adc $0, %rax
+ add w3, w2
+ mov w2, (rp)
+ adc $0, %rax
+ mov %rax, 8(rp)
+
+ add $-2, R32(vn)
+ jz L(ret5)
+ lea 16(vp), vp
+ lea 16(rp), rp
+
+
+L(do_addmul):
+ push %r15
+ push vn C save vn in new stack slot
+define(`vn', `(%rsp)')
+define(`X0', `%r14')
+define(`X1', `%r15')
+define(`v1', `%r8')
+
+ lea (rp,un,8), rp
+ lea (up,un,8), up
+
+L(outer):
+ mov (vp), v0
+ mov 8(vp), v1
+
+ lea 2(un), n
+ sar $2, n
+
+ mov (up), %rdx
+ test $1, R8(un)
+ jnz L(bx1)
+
+L(bx0): mov (rp), X0
+ mov 8(rp), X1
+ mulx( v0, %rax, w1)
+ add %rax, X0
+ mulx( v1, %rax, w2)
+ adc $0, w1
+ mov X0, (rp)
+ add %rax, X1
+ adc $0, w2
+ mov 8(up), %rdx
+ test $2, R8(un)
+ jnz L(b10)
+
+L(b00): lea 16(up), up
+ lea 16(rp), rp
+ jmp L(lo0)
+
+L(b10): mov 16(rp), X0
+ lea 32(up), up
+ mulx( v0, %rax, w3)
+ jmp L(lo2)
+
+L(bx1): mov (rp), X1
+ mov 8(rp), X0
+ mulx( v0, %rax, w3)
+ add %rax, X1
+ adc $0, w3
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ adc $0, w0
+ mov 8(up), %rdx
+ mov X1, (rp)
+ mulx( v0, %rax, w1)
+ test $2, R8(un)
+ jz L(b11)
+
+L(b01): mov 16(rp), X1
+ lea 24(rp), rp
+ lea 24(up), up
+ jmp L(lo1)
+
+L(b11): lea 8(rp), rp
+ lea 8(up), up
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top): mulx( v0, %rax, w3)
+ add w0, X1
+ adc $0, w2
+L(lo2): add %rax, X1
+ adc $0, w3
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ adc $0, w0
+ lea 32(rp), rp
+ add w1, X1
+ mov -16(up), %rdx
+ mov X1, -24(rp)
+ adc $0, w3
+ add w2, X0
+ mov -8(rp), X1
+ mulx( v0, %rax, w1)
+ adc $0, w0
+L(lo1): add %rax, X0
+ mulx( v1, %rax, w2)
+ adc $0, w1
+ add w3, X0
+ mov X0, -16(rp)
+ adc $0, w1
+ add %rax, X1
+ adc $0, w2
+ add w0, X1
+ mov -8(up), %rdx
+ adc $0, w2
+L(lo0): mulx( v0, %rax, w3)
+ add %rax, X1
+ adc $0, w3
+ mov (rp), X0
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ adc $0, w0
+ add w1, X1
+ mov X1, -8(rp)
+ adc $0, w3
+ mov (up), %rdx
+ add w2, X0
+ mulx( v0, %rax, w1)
+ adc $0, w0
+L(lo3): add %rax, X0
+ adc $0, w1
+ mulx( v1, %rax, w2)
+ add w3, X0
+ mov 8(rp), X1
+ mov X0, (rp)
+ mov 16(rp), X0
+ adc $0, w1
+ add %rax, X1
+ adc $0, w2
+ mov 8(up), %rdx
+ lea 32(up), up
+ inc n
+ jnz L(top)
+
+L(end): mulx( v0, %rax, w3)
+ add w0, X1
+ adc $0, w2
+ add %rax, X1
+ adc $0, w3
+ mulx( v1, %rdx, %rax)
+ add w1, X1
+ mov X1, 8(rp)
+ adc $0, w3
+ add w2, %rdx
+ adc $0, %rax
+ add w3, %rdx
+ mov %rdx, 16(rp)
+ adc $0, %rax
+ mov %rax, 24(rp)
+
+ addl $-2, vn
+ lea 16(vp), vp
+ lea -16(up,un,8), up
+ lea 32(rp,un,8), rp
+ jnz L(outer)
+
+ pop %rax C deallocate vn slot
+ pop %r15
+L(ret5):pop %r14
+L(ret4):pop %r13
+L(ret3):pop %r12
+L(ret2):pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
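The structure of the routine above is the usual basecase scheme: one multiply pass to initialise the result (mul_1 or mul_2 depending on whether vn is odd), then addmul_2 passes that each fold two more multiplier limbs into the partial product. Below is a plain O(un*vn) reference sketch of the same result, one multiplier limb per row; the name and the single-limb rows are simplifications, with 64-bit limbs and unsigned __int128 assumed.

    #include <stdint.h>
    #include <stddef.h>

    /* {rp, un+vn} = {up, un} * {vp, vn}, schoolbook, assuming un >= vn >= 1. */
    static void ref_mul_basecase(uint64_t *rp, const uint64_t *up, size_t un,
                                 const uint64_t *vp, size_t vn)
    {
      /* row 0 is stored rather than added, so rp needs no clearing first */
      unsigned __int128 acc = 0;
      for (size_t i = 0; i < un; i++) {
        acc += (unsigned __int128) up[i] * vp[0];
        rp[i] = (uint64_t) acc;
        acc >>= 64;
      }
      rp[un] = (uint64_t) acc;

      /* every further row j adds up[] * vp[j], shifted j limbs up */
      for (size_t j = 1; j < vn; j++) {
        acc = 0;
        for (size_t i = 0; i < un; i++) {
          acc += (unsigned __int128) up[i] * vp[j] + rp[i + j];
          rp[i + j] = (uint64_t) acc;
          acc >>= 64;
        }
        rp[un + j] = (uint64_t) acc;
      }
    }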
diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm
new file mode 100644
index 0000000..e65559b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm
@@ -0,0 +1,422 @@
+dnl AMD64 mpn_mullo_basecase optimised for Intel Haswell.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_2 addmul_2
+C AMD K8,K9 n/a n/a
+C AMD K10 n/a n/a
+C AMD bull n/a n/a
+C AMD pile n/a n/a
+C AMD steam ? ?
+C AMD bobcat n/a n/a
+C AMD jaguar ? ?
+C Intel P4 n/a n/a
+C Intel core n/a n/a
+C Intel NHM n/a n/a
+C Intel SBR n/a n/a
+C Intel IBR n/a n/a
+C Intel HWL 1.86 2.15
+C Intel BWL ? ?
+C Intel atom n/a n/a
+C VIA nano n/a n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Implement proper cor2, replacing current cor0.
+C * Micro-optimise.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`vp', `%r8')
+define(`X0', `%r14')
+define(`X1', `%r15')
+
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`i', `%rbp')
+define(`v0', `%r9')
+define(`v1', `%rbx')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+
+ mov vp_param, vp
+ mov (up), %rdx
+
+ cmp $4, n
+ jb L(small)
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ lea 2(n), i
+ shr $2, i
+ neg n
+ add $2, n
+
+ push up C put entry `up' on stack
+
+ test $1, R8(n)
+ jnz L(m2x1)
+
+L(m2x0):mulx( v0, w0, w3)
+ xor R32(w2), R32(w2)
+ test $2, R8(n)
+ jz L(m2b2)
+
+L(m2b0):lea -8(rp), rp
+ lea -8(up), up
+ jmp L(m2e0)
+
+L(m2b2):lea -24(rp), rp
+ lea 8(up), up
+ jmp L(m2e2)
+
+L(m2x1):mulx( v0, w2, w1)
+ xor R32(w0), R32(w0)
+ test $2, R8(n)
+ jnz L(m2b3)
+
+L(m2b1):jmp L(m2e1)
+
+L(m2b3):lea -16(rp), rp
+ lea -16(up), up
+ jmp L(m2e3)
+
+ ALIGN(16)
+L(m2tp):mulx( v1, %rax, w0)
+ add %rax, w2
+ mov (up), %rdx
+ mulx( v0, %rax, w1)
+ adc $0, w0
+ add %rax, w2
+ adc $0, w1
+ add w3, w2
+L(m2e1):mov w2, (rp)
+ adc $0, w1
+ mulx( v1, %rax, w2)
+ add %rax, w0
+ mov 8(up), %rdx
+ adc $0, w2
+ mulx( v0, %rax, w3)
+ add %rax, w0
+ adc $0, w3
+ add w1, w0
+L(m2e0):mov w0, 8(rp)
+ adc $0, w3
+ mulx( v1, %rax, w0)
+ add %rax, w2
+ mov 16(up), %rdx
+ mulx( v0, %rax, w1)
+ adc $0, w0
+ add %rax, w2
+ adc $0, w1
+ add w3, w2
+L(m2e3):mov w2, 16(rp)
+ adc $0, w1
+ mulx( v1, %rax, w2)
+ add %rax, w0
+ mov 24(up), %rdx
+ adc $0, w2
+ mulx( v0, %rax, w3)
+ add %rax, w0
+ adc $0, w3
+ add w1, w0
+ lea 32(up), up
+L(m2e2):mov w0, 24(rp)
+ adc $0, w3
+ dec i
+ lea 32(rp), rp
+ jnz L(m2tp)
+
+L(m2ed):mulx( v1, %rax, w0)
+ add %rax, w2
+ mov (up), %rdx
+ mulx( v0, %rax, w1)
+ add w2, %rax
+ add w3, %rax
+ mov %rax, (rp)
+
+ mov (%rsp), up C restore `up' to beginning
+ lea 16(vp), vp
+ lea 8(rp,n,8), rp C put back rp to old rp + 2
+ add $2, n
+ jge L(cor1)
+
+ push %r14
+ push %r15
+
+L(outer):
+ mov (vp), v0
+ mov 8(vp), v1
+
+ lea (n), i
+ sar $2, i
+
+ mov (up), %rdx
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): mov (rp), X1
+ mov 8(rp), X0
+ mulx( v0, %rax, w3)
+ add %rax, X1
+ adc $0, w3
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ adc $0, w0
+ mov 8(up), %rdx
+ mov X1, (rp)
+ mulx( v0, %rax, w1)
+ test $2, R8(n)
+ jz L(b2)
+
+L(b0): lea 8(rp), rp
+ lea 8(up), up
+ jmp L(lo0)
+
+L(b2): mov 16(rp), X1
+ lea 24(rp), rp
+ lea 24(up), up
+ jmp L(lo2)
+
+L(bx1): mov (rp), X0
+ mov 8(rp), X1
+ mulx( v0, %rax, w1)
+ add %rax, X0
+ mulx( v1, %rax, w2)
+ adc $0, w1
+ mov X0, (rp)
+ add %rax, X1
+ adc $0, w2
+ mov 8(up), %rdx
+ test $2, R8(n)
+ jnz L(b3)
+
+L(b1): lea 16(up), up
+ lea 16(rp), rp
+ jmp L(lo1)
+
+L(b3): mov 16(rp), X0
+ lea 32(up), up
+ mulx( v0, %rax, w3)
+ inc i
+ jz L(cj3)
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top): mulx( v0, %rax, w3)
+ add w0, X1
+ adc $0, w2
+L(lo3): add %rax, X1
+ adc $0, w3
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ adc $0, w0
+ lea 32(rp), rp
+ add w1, X1
+ mov -16(up), %rdx
+ mov X1, -24(rp)
+ adc $0, w3
+ add w2, X0
+ mov -8(rp), X1
+ mulx( v0, %rax, w1)
+ adc $0, w0
+L(lo2): add %rax, X0
+ mulx( v1, %rax, w2)
+ adc $0, w1
+ add w3, X0
+ mov X0, -16(rp)
+ adc $0, w1
+ add %rax, X1
+ adc $0, w2
+ add w0, X1
+ mov -8(up), %rdx
+ adc $0, w2
+L(lo1): mulx( v0, %rax, w3)
+ add %rax, X1
+ adc $0, w3
+ mov (rp), X0
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ adc $0, w0
+ add w1, X1
+ mov X1, -8(rp)
+ adc $0, w3
+ mov (up), %rdx
+ add w2, X0
+ mulx( v0, %rax, w1)
+ adc $0, w0
+L(lo0): add %rax, X0
+ adc $0, w1
+ mulx( v1, %rax, w2)
+ add w3, X0
+ mov 8(rp), X1
+ mov X0, (rp)
+ mov 16(rp), X0
+ adc $0, w1
+ add %rax, X1
+ adc $0, w2
+ mov 8(up), %rdx
+ lea 32(up), up
+ inc i
+ jnz L(top)
+
+L(end): mulx( v0, %rax, w3)
+ add w0, X1
+ adc $0, w2
+L(cj3): add %rax, X1
+ adc $0, w3
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ add w1, X1
+ mov -16(up), %rdx
+ mov X1, 8(rp)
+ adc $0, w3
+ add w2, X0
+ mulx( v0, %rax, w1)
+ add X0, %rax
+ add w3, %rax
+ mov %rax, 16(rp)
+
+ mov 16(%rsp), up C restore `up' to beginning
+ lea 16(vp), vp
+ lea 24(rp,n,8), rp C put back rp to old rp + 2
+ add $2, n
+ jl L(outer)
+
+ pop %r15
+ pop %r14
+
+ jnz L(cor0)
+
+L(cor1):mov (vp), v0
+ mov 8(vp), v1
+ mov (up), %rdx
+ mulx( v0, %r12, %rbp) C u0 x v2
+ add (rp), %r12 C FIXME: rp[0] still available in reg?
+ adc %rax, %rbp
+ mov 8(up), %r10
+ imul v0, %r10
+ imul v1, %rdx
+ mov %r12, (rp)
+ add %r10, %rdx
+ add %rbp, %rdx
+ mov %rdx, 8(rp)
+ pop %rax C deallocate `up' copy
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(cor0):mov (vp), %r11
+ imul (up), %r11
+ add %rax, %r11
+ mov %r11, (rp)
+ pop %rax C deallocate `up' copy
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(small):
+ cmp $2, n
+ jae L(gt1)
+L(n1): imul (vp), %rdx
+ mov %rdx, (rp)
+ FUNC_EXIT()
+ ret
+L(gt1): ja L(gt2)
+L(n2): mov (vp), %r9
+ mulx( %r9, %rax, %rdx)
+ mov %rax, (rp)
+ mov 8(up), %rax
+ imul %r9, %rax
+ add %rax, %rdx
+ mov 8(vp), %r9
+ mov (up), %rcx
+ imul %r9, %rcx
+ add %rcx, %rdx
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+L(gt2):
+L(n3): mov (vp), %r9
+ mulx( %r9, %rax, %r10) C u0 x v0
+ mov %rax, (rp)
+ mov 8(up), %rdx
+ mulx( %r9, %rax, %rdx) C u1 x v0
+ imul 16(up), %r9 C u2 x v0
+ add %rax, %r10
+ adc %rdx, %r9
+ mov 8(vp), %r11
+ mov (up), %rdx
+ mulx( %r11, %rax, %rdx) C u0 x v1
+ add %rax, %r10
+ adc %rdx, %r9
+ imul 8(up), %r11 C u1 x v1
+ add %r11, %r9
+ mov %r10, 8(rp)
+ mov 16(vp), %r10
+ mov (up), %rax
+ imul %rax, %r10 C u0 x v2
+ add %r10, %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
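mullo only needs the low n limbs of the product, which is why the rows above get progressively shorter and the corner code near the end can use plain imul (low-half) products: everything at weight 2^(64n) or higher is never formed. A reference sketch of the computed result (hypothetical ref_mullo_basecase name, 64-bit limbs, unsigned __int128 assumed):

    #include <stdint.h>
    #include <stddef.h>

    /* {rp, n} = low n limbs of {up, n} * {vp, n}. */
    static void ref_mullo_basecase(uint64_t *rp, const uint64_t *up,
                                   const uint64_t *vp, size_t n)
    {
      for (size_t i = 0; i < n; i++)
        rp[i] = 0;
      for (size_t j = 0; j < n; j++) {
        unsigned __int128 acc = 0;
        /* row j only needs up[0 .. n-1-j]; higher limbs would land at
           weight >= 2^(64n) and be discarded anyway                   */
        for (size_t i = 0; i + j < n; i++) {
          acc += (unsigned __int128) up[i] * vp[j] + rp[i + j];
          rp[i + j] = (uint64_t) acc;
          acc >>= 64;
        }
        /* the carry out of rp[n-1] is deliberately dropped */
      }
    }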
diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm
new file mode 100644
index 0000000..b1d6c0a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm
@@ -0,0 +1,437 @@
+dnl AMD64 mpn_redc_1 optimised for Intel Haswell.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bull n/a
+C AMD pile n/a
+C AMD steam ?
+C AMD bobcat n/a
+C AMD jaguar ?
+C Intel P4 n/a
+C Intel core n/a
+C Intel NHM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL 2.32
+C Intel BWL ?
+C Intel atom n/a
+C VIA nano n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Micro-optimise.
+C * Consider inlining mpn_add_n. Tests indicate that this saves just 1-2
+C cycles, though.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv_param', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%rdi')
+define(`u0inv', `(%rsp)') C stack
+
+ABI_SUPPORT(DOS64) C FIXME: needs verification
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ push rp
+ mov mp_param, mp C note that rp and mp share a register
+ mov (up), %rdx
+
+ neg n
+ push %r8 C put u0inv on stack
+ imul u0inv_param, %rdx C first iteration q0
+ mov n, j C outer loop induction var
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ jz L(o0b)
+
+ cmp $-2, R32(n)
+ jnz L(o2)
+
+C Special code for n = 2 since general code cannot handle it
+ mov 8(%rsp), %rbx C rp
+ lea 16(%rsp), %rsp C deallocate two slots
+ mulx( (mp), %r9, %r12)
+ mulx( 8,(mp), %r11, %r10)
+ add %r12, %r11
+ adc $0, %r10
+ add (up), %r9 C = 0
+ adc 8(up), %r11 C r11 = up[1]
+ adc $0, %r10 C -> up[0]
+ mov %r11, %rdx
+ imul u0inv_param, %rdx
+ mulx( (mp), %r13, %r12)
+ mulx( 8,(mp), %r14, %r15)
+ xor R32(%rax), R32(%rax)
+ add %r12, %r14
+ adc $0, %r15
+ add %r11, %r13 C = 0
+ adc 16(up), %r14 C rp[2]
+ adc $0, %r15 C -> up[1]
+ add %r14, %r10
+ adc 24(up), %r15
+ mov %r10, (%rbx)
+ mov %r15, 8(%rbx)
+ setc R8(%rax)
+ jmp L(ret)
+
+L(o2): lea 2(n), i C inner loop induction var
+ mulx( (mp), %r9, %r8)
+ mulx( 8,(mp), %r11, %r10)
+ sar $2, i
+ add %r8, %r11
+ jmp L(lo2)
+
+ ALIGN(16)
+L(tp2): adc %rax, %r9
+ lea 32(up), up
+ adc %r8, %r11
+L(lo2): mulx( 16,(mp), %r13, %r12)
+ mov (up), %r8
+ mulx( 24,(mp), %rbx, %rax)
+ lea 32(mp), mp
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov 8(up), %r10
+ mov 16(up), %r12
+ add %r9, %r8
+ mov 24(up), %rbp
+ mov %r8, (up)
+ adc %r11, %r10
+ mulx( (mp), %r9, %r8)
+ mov %r10, 8(up)
+ adc %r13, %r12
+ mov %r12, 16(up)
+ adc %rbx, %rbp
+ mulx( 8,(mp), %r11, %r10)
+ mov %rbp, 24(up)
+ inc i
+ jnz L(tp2)
+
+L(ed2): mov 56(up,n,8), %rdx C next iteration up[0]
+ lea 16(mp,n,8), mp C mp = (last starting mp)
+ adc %rax, %r9
+ adc %r8, %r11
+ mov 32(up), %r8
+ adc $0, %r10
+ imul u0inv, %rdx C next iteration q0
+ mov 40(up), %rax
+ add %r9, %r8
+ mov %r8, 32(up)
+ adc %r11, %rax
+ mov %rax, 40(up)
+ lea 56(up,n,8), up C up = (last starting up) + 1
+ adc $0, %r10
+ mov %r10, -8(up)
+ inc j
+ jnz L(o2)
+
+ jmp L(cj)
+
+
+L(bx1): test $2, R8(n)
+ jz L(o3a)
+
+L(o1a): cmp $-1, R32(n)
+ jnz L(o1b)
+
+C Special code for n = 1 since general code cannot handle it
+ mov 8(%rsp), %rbx C rp
+ lea 16(%rsp), %rsp C deallocate two slots
+ mulx( (mp), %r11, %r10)
+ add (up), %r11
+ adc 8(up), %r10
+ mov %r10, (%rbx)
+ mov $0, R32(%rax)
+ setc R8(%rax)
+ jmp L(ret)
+
+L(o1b): lea 24(mp), mp
+L(o1): lea 1(n), i C inner loop induction var
+ mulx( -24,(mp), %r11, %r10)
+ mulx( -16,(mp), %r13, %r12)
+ mulx( -8,(mp), %rbx, %rax)
+ sar $2, i
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov (up), %r10
+ mov 8(up), %r12
+ mov 16(up), %rbp
+ add %r11, %r10
+ jmp L(lo1)
+
+ ALIGN(16)
+L(tp1): adc %rax, %r9
+ lea 32(up), up
+ adc %r8, %r11
+ mulx( 16,(mp), %r13, %r12)
+ mov -8(up), %r8
+ mulx( 24,(mp), %rbx, %rax)
+ lea 32(mp), mp
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov (up), %r10
+ mov 8(up), %r12
+ add %r9, %r8
+ mov 16(up), %rbp
+ mov %r8, -8(up)
+ adc %r11, %r10
+L(lo1): mulx( (mp), %r9, %r8)
+ mov %r10, (up)
+ adc %r13, %r12
+ mov %r12, 8(up)
+ adc %rbx, %rbp
+ mulx( 8,(mp), %r11, %r10)
+ mov %rbp, 16(up)
+ inc i
+ jnz L(tp1)
+
+L(ed1): mov 48(up,n,8), %rdx C next iteration up[0]
+ lea 40(mp,n,8), mp C mp = (last starting mp)
+ adc %rax, %r9
+ adc %r8, %r11
+ mov 24(up), %r8
+ adc $0, %r10
+ imul u0inv, %rdx C next iteration q0
+ mov 32(up), %rax
+ add %r9, %r8
+ mov %r8, 24(up)
+ adc %r11, %rax
+ mov %rax, 32(up)
+ lea 48(up,n,8), up C up = (last starting up) + 1
+ adc $0, %r10
+ mov %r10, -8(up)
+ inc j
+ jnz L(o1)
+
+ jmp L(cj)
+
+L(o3a): cmp $-3, R32(n)
+ jnz L(o3b)
+
+C Special code for n = 3 since general code cannot handle it
+L(n3): mulx( (mp), %rbx, %rax)
+ mulx( 8,(mp), %r9, %r14)
+ add (up), %rbx
+ mulx( 16,(mp), %r11, %r10)
+ adc %rax, %r9 C W 1
+ adc %r14, %r11 C W 2
+ mov 8(up), %r14
+ mov u0inv_param, %rdx
+ adc $0, %r10 C W 3
+ mov 16(up), %rax
+ add %r9, %r14 C W 1
+ mov %r14, 8(up)
+ mulx( %r14, %rdx, %r13) C next iteration q0
+ adc %r11, %rax C W 2
+ mov %rax, 16(up)
+ adc $0, %r10 C W 3
+ mov %r10, (up)
+ lea 8(up), up C up = (last starting up) + 1
+ inc j
+ jnz L(n3)
+
+ jmp L(cj)
+
+L(o3b): lea 8(mp), mp
+L(o3): lea 4(n), i C inner loop induction var
+ mulx( -8,(mp), %rbx, %rax)
+ mulx( (mp), %r9, %r8)
+ mov (up), %rbp
+ mulx( 8,(mp), %r11, %r10)
+ sar $2, i
+ add %rbx, %rbp
+ nop
+ adc %rax, %r9
+ jmp L(lo3)
+
+ ALIGN(16)
+L(tp3): adc %rax, %r9
+ lea 32(up), up
+L(lo3): adc %r8, %r11
+ mulx( 16,(mp), %r13, %r12)
+ mov 8(up), %r8
+ mulx( 24,(mp), %rbx, %rax)
+ lea 32(mp), mp
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov 16(up), %r10
+ mov 24(up), %r12
+ add %r9, %r8
+ mov 32(up), %rbp
+ mov %r8, 8(up)
+ adc %r11, %r10
+ mulx( (mp), %r9, %r8)
+ mov %r10, 16(up)
+ adc %r13, %r12
+ mov %r12, 24(up)
+ adc %rbx, %rbp
+ mulx( 8,(mp), %r11, %r10)
+ mov %rbp, 32(up)
+ inc i
+ jnz L(tp3)
+
+L(ed3): mov 64(up,n,8), %rdx C next iteration up[0]
+ lea 24(mp,n,8), mp C mp = (last starting mp)
+ adc %rax, %r9
+ adc %r8, %r11
+ mov 40(up), %r8
+ adc $0, %r10
+ imul u0inv, %rdx C next iteration q0
+ mov 48(up), %rax
+ add %r9, %r8
+ mov %r8, 40(up)
+ adc %r11, %rax
+ mov %rax, 48(up)
+ lea 64(up,n,8), up C up = (last starting up) + 1
+ adc $0, %r10
+ mov %r10, -8(up)
+ inc j
+ jnz L(o3)
+
+ jmp L(cj)
+
+L(o0b): lea 16(mp), mp
+L(o0): mov n, i C inner loop induction var
+ mulx( -16,(mp), %r13, %r12)
+ mulx( -8,(mp), %rbx, %rax)
+ sar $2, i
+ add %r12, %rbx
+ adc $0, %rax
+ mov (up), %r12
+ mov 8(up), %rbp
+ mulx( (mp), %r9, %r8)
+ add %r13, %r12
+ jmp L(lo0)
+
+ ALIGN(16)
+L(tp0): adc %rax, %r9
+ lea 32(up), up
+ adc %r8, %r11
+ mulx( 16,(mp), %r13, %r12)
+ mov -16(up), %r8
+ mulx( 24,(mp), %rbx, %rax)
+ lea 32(mp), mp
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov -8(up), %r10
+ mov (up), %r12
+ add %r9, %r8
+ mov 8(up), %rbp
+ mov %r8, -16(up)
+ adc %r11, %r10
+ mulx( (mp), %r9, %r8)
+ mov %r10, -8(up)
+ adc %r13, %r12
+ mov %r12, (up)
+L(lo0): adc %rbx, %rbp
+ mulx( 8,(mp), %r11, %r10)
+ mov %rbp, 8(up)
+ inc i
+ jnz L(tp0)
+
+L(ed0): mov 40(up,n,8), %rdx C next iteration up[0]
+ lea 32(mp,n,8), mp C mp = (last starting mp)
+ adc %rax, %r9
+ adc %r8, %r11
+ mov 16(up), %r8
+ adc $0, %r10
+ imul u0inv, %rdx C next iteration q0
+ mov 24(up), %rax
+ add %r9, %r8
+ mov %r8, 16(up)
+ adc %r11, %rax
+ mov %rax, 24(up)
+ lea 40(up,n,8), up C up = (last starting up) + 1
+ adc $0, %r10
+ mov %r10, -8(up)
+ inc j
+ jnz L(o0)
+
+L(cj):
+IFSTD(` mov 8(%rsp), %rdi C param 1: rp
+ lea 16-8(%rsp), %rsp C deallocate 2, add back for alignment
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` mov up, %rdx C param 2: up
+ lea (up,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov 8(%rsp), %rcx C param 1: rp
+ lea 16-32-8(%rsp), %rsp') C deallocate 2, allocate shadow, align
+
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_add_n)
+
+IFSTD(` lea 8(%rsp), %rsp ')
+IFDOS(` lea 32+8(%rsp), %rsp')
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
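The routine performs limb-by-limb Montgomery reduction: in each of n steps a quotient q = up[0] * u0inv (mod 2^64) is chosen so that adding q times the modulus clears the low limb, the freed slot is used to stash the carry, and the window slides up one limb; the mpn_add_n call at L(cj) then folds the stashed carries into the top half. A rough C sketch of that computation, modelled on the portable formulation rather than the unrolled code above; u0inv is assumed to be -mp[0]^(-1) mod 2^64 per my reading, and the caller still has to handle the returned carry, typically with a conditional subtract of the modulus.

    #include <stdint.h>
    #include <stddef.h>

    /* Reduce {up, 2n} by n limbs; result in {rp, n}, carry returned. */
    static uint64_t ref_redc_1(uint64_t *rp, uint64_t *up, const uint64_t *mp,
                               size_t n, uint64_t u0inv)
    {
      for (size_t j = 0; j < n; j++) {
        uint64_t q = up[0] * u0inv;          /* makes up[0] + q*mp[0] == 0 mod 2^64 */
        unsigned __int128 acc = 0;
        for (size_t i = 0; i < n; i++) {     /* up[0..n-1] += q * mp[0..n-1] */
          acc += (unsigned __int128) mp[i] * q + up[i];
          up[i] = (uint64_t) acc;
          acc >>= 64;
        }
        up[0] = (uint64_t) acc;              /* low limb is now 0: stash the carry */
        up++;                                /* slide the window one limb up */
      }
      uint64_t cy = 0;                       /* fold the n stashed carries into the top half */
      for (size_t i = 0; i < n; i++) {
        unsigned __int128 s = (unsigned __int128) up[i] + (up - n)[i] + cy;
        rp[i] = (uint64_t) s;
        cy = (uint64_t) (s >> 64);
      }
      return cy;
    }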
diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm
new file mode 100644
index 0000000..641cdf3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm
@@ -0,0 +1,506 @@
+dnl AMD64 mpn_sqr_basecase optimised for Intel Haswell.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1
+C AMD K8,K9 n/a n/a n/a
+C AMD K10 n/a n/a n/a
+C AMD bull n/a n/a n/a
+C AMD pile n/a n/a n/a
+C AMD steam ? ? ?
+C AMD bobcat n/a n/a n/a
+C AMD jaguar ? ? ?
+C Intel P4 n/a n/a n/a
+C Intel core n/a n/a n/a
+C Intel NHM n/a n/a n/a
+C Intel SBR n/a n/a n/a
+C Intel IBR n/a n/a n/a
+C Intel HWL 1.86 2.15 ~2.5
+C Intel BWL ? ? ?
+C Intel atom n/a n/a n/a
+C VIA nano n/a n/a n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
+C that the sqr_diag_addlsh1 loop was manually written.
+
+C TODO
+C * Replace current unoptimised sqr_diag_addlsh1 loop; 1.75 c/l might be
+C possible.
+C * Consider splitting outer loop into 2, one for n = 1 (mod 2) and one for
+C n = 0 (mod 2). These loops could fall into specific "corner" code.
+C * Consider splitting outer loop into 4.
+C * Streamline pointer updates.
+C * Perhaps suppress a few more xor insns in feed-in code.
+C * Make sure we write no dead registers in feed-in code.
+C * We might use 32-bit size ops, since n >= 2^32 is non-terminating. Watch
+C out for negative sizes being zero-extended, though.
+C * Provide straight-line code for n = 4; then look for simplifications in
+C main code.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+
+ cmp $2, un_param
+ jae L(gt1)
+
+ mov (up), %rdx
+ mulx( %rdx, %rax, %rdx)
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): jne L(gt2)
+
+ mov (up), %rdx
+ mov 8(up), %rcx
+ mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2
+ mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1
+ mov %rcx, %rdx
+ mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3
+ add %r9, %r9 C W 1
+ adc %r10, %r10 C W 2
+ adc $0, %rdx C W 3
+ add %r9, %r8 C W 1
+ adc %r11, %r10 C W 2
+ adc $0, %rdx C W 3
+ mov %rax, (rp)
+ mov %r8, 8(rp)
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt2): cmp $4, un_param
+ jae L(gt3)
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%r10')
+define(`w2', `%r11')
+
+ mov (up), v0
+ mov 8(up), %rdx
+ mov %rdx, v1
+ mulx( v0, w2, %rax)
+ mov 16(up), %rdx
+ mulx( v0, w0, %rcx)
+ mov w2, %r8
+ add %rax, w0
+ adc $0, %rcx
+ mulx( v1, %rdx, %rax)
+ add %rcx, %rdx
+ mov %rdx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+ xor R32(%rcx), R32(%rcx)
+ mov (up), %rdx
+ mulx( %rdx, %rax, w2)
+ mov %rax, (rp)
+ add %r8, %r8
+ adc w0, w0
+ setc R8(%rcx)
+ mov 8(up), %rdx
+ mulx( %rdx, %rax, %rdx)
+ add w2, %r8
+ adc %rax, w0
+ mov %r8, 8(rp)
+ mov w0, 16(rp)
+ mov 24(rp), %r8
+ mov 32(rp), w0
+ lea (%rdx,%rcx), w2
+ adc %r8, %r8
+ adc w0, w0
+ setc R8(%rcx)
+ mov 16(up), %rdx
+ mulx( %rdx, %rax, %rdx)
+ add w2, %r8
+ adc %rax, w0
+ mov %r8, 24(rp)
+ mov w0, 32(rp)
+ adc %rcx, %rdx
+ mov %rdx, 40(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt3):
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%rbx')
+define(`w3', `%rbp')
+define(`un', `%r12')
+define(`n', `%rcx')
+
+define(`X0', `%r13')
+define(`X1', `%r14')
+
+L(do_mul_2):
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ mov $0, R32(un)
+ sub un_param, un C free up rdx
+ push un
+ mov (up), v0
+ mov 8(up), %rdx
+ lea 2(un), n
+ sar $2, n C FIXME: suppress, change loop?
+ inc un C decrement |un|
+ mov %rdx, v1
+
+ test $1, R8(un)
+ jnz L(mx1)
+
+L(mx0): mulx( v0, w2, w1)
+ mov 16(up), %rdx
+ mov w2, 8(rp)
+ xor w2, w2
+ mulx( v0, w0, w3)
+ test $2, R8(un)
+ jz L(m00)
+
+L(m10): lea -8(rp), rp
+ lea -8(up), up
+ jmp L(mlo2)
+
+L(m00): lea 8(up), up
+ lea 8(rp), rp
+ jmp L(mlo0)
+
+L(mx1): mulx( v0, w0, w3)
+ mov 16(up), %rdx
+ mov w0, 8(rp)
+ xor w0, w0
+ mulx( v0, w2, w1)
+ test $2, R8(un)
+ jz L(mlo3)
+
+L(m01): lea 16(rp), rp
+ lea 16(up), up
+ jmp L(mlo1)
+
+ ALIGN(32)
+L(mtop):mulx( v1, %rax, w0)
+ add %rax, w2 C 0
+ mov (up), %rdx
+ mulx( v0, %rax, w1)
+ adc $0, w0 C 1
+ add %rax, w2 C 0
+L(mlo1):adc $0, w1 C 1
+ add w3, w2 C 0
+ mov w2, (rp) C 0
+ adc $0, w1 C 1
+ mulx( v1, %rax, w2)
+ add %rax, w0 C 1
+ mov 8(up), %rdx
+ adc $0, w2 C 2
+ mulx( v0, %rax, w3)
+ add %rax, w0 C 1
+ adc $0, w3 C 2
+L(mlo0):add w1, w0 C 1
+ mov w0, 8(rp) C 1
+ adc $0, w3 C 2
+ mulx( v1, %rax, w0)
+ add %rax, w2 C 2
+ mov 16(up), %rdx
+ mulx( v0, %rax, w1)
+ adc $0, w0 C 3
+ add %rax, w2 C 2
+ adc $0, w1 C 3
+L(mlo3):add w3, w2 C 2
+ mov w2, 16(rp) C 2
+ adc $0, w1 C 3
+ mulx( v1, %rax, w2)
+ add %rax, w0 C 3
+ mov 24(up), %rdx
+ adc $0, w2 C 4
+ mulx( v0, %rax, w3)
+ add %rax, w0 C 3
+ adc $0, w3 C 4
+L(mlo2):add w1, w0 C 3
+ lea 32(up), up
+ mov w0, 24(rp) C 3
+ adc $0, w3 C 4
+ inc n
+ lea 32(rp), rp
+ jnz L(mtop)
+
+L(mend):mulx( v1, %rdx, %rax)
+ add %rdx, w2
+ adc $0, %rax
+ add w3, w2
+ mov w2, (rp)
+ adc $0, %rax
+ mov %rax, 8(rp)
+
+ lea 16(up), up
+ lea -16(rp), rp
+
+L(do_addmul_2):
+L(outer):
+ lea (up,un,8), up C put back up to 2 positions above last time
+ lea 48(rp,un,8), rp C put back rp to 4 positions above last time
+
+ mov -8(up), v0 C shared between addmul_2 and corner
+
+ add $2, un C decrease |un|
+ cmp $-2, un
+ jge L(corner)
+
+ mov (up), v1
+
+ lea 1(un), n
+ sar $2, n C FIXME: suppress, change loop?
+
+ mov v1, %rdx
+ test $1, R8(un)
+ jnz L(bx1)
+
+L(bx0): mov (rp), X0
+ mov 8(rp), X1
+ mulx( v0, %rax, w1)
+ add %rax, X0
+ adc $0, w1
+ mov X0, (rp)
+ xor w2, w2
+ test $2, R8(un)
+ jnz L(b10)
+
+L(b00): mov 8(up), %rdx
+ lea 16(rp), rp
+ lea 16(up), up
+ jmp L(lo0)
+
+L(b10): mov 8(up), %rdx
+ mov 16(rp), X0
+ lea 32(up), up
+ inc n
+ mulx( v0, %rax, w3)
+ jz L(ex)
+ jmp L(lo2)
+
+L(bx1): mov (rp), X1
+ mov 8(rp), X0
+ mulx( v0, %rax, w3)
+ mov 8(up), %rdx
+ add %rax, X1
+ adc $0, w3
+ xor w0, w0
+ mov X1, (rp)
+ mulx( v0, %rax, w1)
+ test $2, R8(un)
+ jz L(b11)
+
+L(b01): mov 16(rp), X1
+ lea 24(rp), rp
+ lea 24(up), up
+ jmp L(lo1)
+
+L(b11): lea 8(rp), rp
+ lea 8(up), up
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top): mulx( v0, %rax, w3)
+ add w0, X1
+ adc $0, w2
+L(lo2): add %rax, X1
+ adc $0, w3
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ adc $0, w0
+ lea 32(rp), rp
+ add w1, X1
+ mov -16(up), %rdx
+ mov X1, -24(rp)
+ adc $0, w3
+ add w2, X0
+ mov -8(rp), X1
+ mulx( v0, %rax, w1)
+ adc $0, w0
+L(lo1): add %rax, X0
+ mulx( v1, %rax, w2)
+ adc $0, w1
+ add w3, X0
+ mov X0, -16(rp)
+ adc $0, w1
+ add %rax, X1
+ adc $0, w2
+ add w0, X1
+ mov -8(up), %rdx
+ adc $0, w2
+L(lo0): mulx( v0, %rax, w3)
+ add %rax, X1
+ adc $0, w3
+ mov (rp), X0
+ mulx( v1, %rax, w0)
+ add %rax, X0
+ adc $0, w0
+ add w1, X1
+ mov X1, -8(rp)
+ adc $0, w3
+ mov (up), %rdx
+ add w2, X0
+ mulx( v0, %rax, w1)
+ adc $0, w0
+L(lo3): add %rax, X0
+ adc $0, w1
+ mulx( v1, %rax, w2)
+ add w3, X0
+ mov 8(rp), X1
+ mov X0, (rp)
+ mov 16(rp), X0
+ adc $0, w1
+ add %rax, X1
+ adc $0, w2
+ mov 8(up), %rdx
+ lea 32(up), up
+ inc n
+ jnz L(top)
+
+L(end): mulx( v0, %rax, w3)
+ add w0, X1
+ adc $0, w2
+L(ex): add %rax, X1
+ adc $0, w3
+ mulx( v1, %rdx, %rax)
+ add w1, X1
+ mov X1, 8(rp)
+ adc $0, w3
+ add w2, %rdx
+ adc $0, %rax
+ add %rdx, w3
+ mov w3, 16(rp)
+ adc $0, %rax
+ mov %rax, 24(rp)
+
+ jmp L(outer) C loop until a small corner remains
+
+L(corner):
+ pop un
+ mov (up), %rdx
+ jg L(small_corner)
+
+ mov %rdx, v1
+ mov (rp), X0
+ mov %rax, X1 C Tricky rax reuse of last iteration
+ mulx( v0, %rax, w1)
+ add %rax, X0
+ adc $0, w1
+ mov X0, (rp)
+ mov 8(up), %rdx
+ mulx( v0, %rax, w3)
+ add %rax, X1
+ adc $0, w3
+ mulx( v1, %rdx, %rax)
+ add w1, X1
+ mov X1, 8(rp)
+ adc $0, w3
+ add w3, %rdx
+ mov %rdx, 16(rp)
+ adc $0, %rax
+ mov %rax, 24(rp)
+ lea 32(rp), rp
+ lea 16(up), up
+ jmp L(com)
+
+L(small_corner):
+ mulx( v0, X1, w3)
+ add %rax, X1 C Tricky rax reuse of last iteration
+ adc $0, w3
+ mov X1, (rp)
+ mov w3, 8(rp)
+ lea 16(rp), rp
+ lea 8(up), up
+
+L(com):
+
+L(sqr_diag_addlsh1):
+ lea 8(up,un,8), up C put back up at its very beginning
+ lea (rp,un,8), rp
+ lea (rp,un,8), rp C put back rp at its very beginning
+ inc un
+
+ mov -8(up), %rdx
+ xor R32(%rbx), R32(%rbx) C clear CF as side effect
+ mulx( %rdx, %rax, %r10)
+ mov %rax, 8(rp)
+ mov 16(rp), %r8
+ mov 24(rp), %r9
+ jmp L(dm)
+
+ ALIGN(16)
+L(dtop):mov 32(rp), %r8
+ mov 40(rp), %r9
+ lea 16(rp), rp
+ lea (%rdx,%rbx), %r10
+L(dm): adc %r8, %r8
+ adc %r9, %r9
+ setc R8(%rbx)
+ mov (up), %rdx
+ lea 8(up), up
+ mulx( %rdx, %rax, %rdx)
+ add %r10, %r8
+ adc %rax, %r9
+ mov %r8, 16(rp)
+ mov %r9, 24(rp)
+ inc un
+ jnz L(dtop)
+
+L(dend):adc %rbx, %rdx
+ mov %rdx, 32(rp)
+
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
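Squaring exploits symmetry: only the cross products up[i]*up[j] with i < j are formed (the mul_2/addmul_2 passes above), and the sqr_diag_addlsh1 loop then doubles that triangle while adding in the diagonal squares up[i]^2. A straightforward reference sketch of the same result, with the doubling and the diagonal done as separate passes (hypothetical ref_sqr_basecase name, 64-bit limbs, unsigned __int128 assumed):

    #include <stdint.h>
    #include <stddef.h>

    /* {rp, 2n} = {up, n}^2, n >= 1. */
    static void ref_sqr_basecase(uint64_t *rp, const uint64_t *up, size_t n)
    {
      for (size_t i = 0; i < 2 * n; i++)
        rp[i] = 0;

      /* triangle: add up[i]*up[j] for all i < j at weight 2^(64*(i+j)) */
      for (size_t i = 0; i + 1 < n; i++) {
        unsigned __int128 acc = 0;
        for (size_t j = i + 1; j < n; j++) {
          acc += (unsigned __int128) up[i] * up[j] + rp[i + j];
          rp[i + j] = (uint64_t) acc;
          acc >>= 64;
        }
        rp[n + i] = (uint64_t) acc;          /* row carry, one limb above the row */
      }

      /* double the triangle; 2*(triangle) < (full square) < 2^(128n),
         so nothing is shifted out of the top limb                      */
      uint64_t sh = 0;
      for (size_t i = 0; i < 2 * n; i++) {
        uint64_t t = rp[i];
        rp[i] = (t << 1) | sh;
        sh = t >> 63;
      }

      /* add the diagonal squares up[i]^2 at weight 2^(128*i) */
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++) {
        unsigned __int128 sq = (unsigned __int128) up[i] * up[i];
        unsigned __int128 lo = (unsigned __int128) rp[2 * i] + (uint64_t) sq + cy;
        rp[2 * i] = (uint64_t) lo;
        unsigned __int128 hi = (unsigned __int128) rp[2 * i + 1]
                             + (uint64_t) (sq >> 64) + (uint64_t) (lo >> 64);
        rp[2 * i + 1] = (uint64_t) hi;
        cy = (uint64_t) (hi >> 64);
      }
      /* cy ends up 0: the square fits in exactly 2n limbs */
    }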
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm
new file mode 100644
index 0000000..eed64e7
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm
@@ -0,0 +1,200 @@
+dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
+dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
+dnl Optimised for Nehalem.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 4.75
+C Intel P4 ?
+C Intel core2 2.8-3
+C Intel NHM 2.8
+C Intel SBR 3.55
+C Intel atom ?
+C VIA nano ?
+
+C The inner loop probably runs close to optimally on Nehalem (using 4-way
+C unrolling). The rest of the code is quite crude and could perhaps be made
+C both smaller and faster.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnt', `%r8')
+define(`cy', `%r9') C for _nc variant
+
+ifdef(`OPERATION_addlsh_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(IFRSB, )
+ define(func_n, mpn_addlsh_n)
+ define(func_nc, mpn_addlsh_nc)')
+ifdef(`OPERATION_rsblsh_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(IFRSB, `$1')
+ define(func_n, mpn_rsblsh_n)
+ define(func_nc, mpn_rsblsh_nc)')
+
+C mpn_rsblsh_nc is left out of MULFUNC_PROLOGUE below; its idea of carry-in
+C is inconsistent with refmpn_rsblsh_nc.
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C cnt
+ push %rbx
+ xor R32(%rbx), R32(%rbx) C clear CF save register
+L(ent): push %rbp
+ mov R32(n), R32(%rbp)
+ mov n, %rax
+
+ mov R32(cnt), R32(%rcx)
+ neg R32(%rcx)
+
+ lea -8(up,%rax,8), up
+ lea -8(vp,%rax,8), vp
+ lea -40(rp,%rax,8), rp
+ neg %rax
+
+ and $3, R32(%rbp)
+ jz L(b0)
+ cmp $2, R32(%rbp)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): xor R32(%r9), R32(%r9)
+ mov 8(vp,%rax,8), %r10
+ mov 16(vp,%rax,8), %r11
+ shrd %cl, %r10, %r9
+ shrd %cl, %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ ADCSBB 8(up,%rax,8), %r9
+ mov 24(vp,%rax,8), %r8
+ ADCSBB 16(up,%rax,8), %r10
+ sbb R32(%rbx), R32(%rbx)
+ add $3, %rax
+ jmp L(lo3)
+
+L(b0): mov 8(vp,%rax,8), %r9
+ xor R32(%r8), R32(%r8)
+ shrd %cl, %r9, %r8
+ mov 16(vp,%rax,8), %r10
+ mov 24(vp,%rax,8), %r11
+ shrd %cl, %r10, %r9
+ shrd %cl, %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ ADCSBB 8(up,%rax,8), %r8
+ mov %r8, 40(rp,%rax,8) C offset 40
+ ADCSBB 16(up,%rax,8), %r9
+ mov 32(vp,%rax,8), %r8
+ ADCSBB 24(up,%rax,8), %r10
+ sbb R32(%rbx), R32(%rbx)
+ add $4, %rax
+ jmp L(lo0)
+
+L(b1): mov 8(vp,%rax,8), %r8
+ add $1, %rax
+ jz L(1)
+ mov 8(vp,%rax,8), %r9
+ xor R32(%rbp), R32(%rbp)
+ jmp L(lo1)
+L(1): xor R32(%r11), R32(%r11)
+ jmp L(wd1)
+
+L(b2): xor %r10, %r10
+ mov 8(vp,%rax,8), %r11
+ shrd %cl, %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ mov 16(vp,%rax,8), %r8
+ ADCSBB 8(up,%rax,8), %r10
+ sbb R32(%rbx), R32(%rbx)
+ add $2, %rax
+ jz L(end)
+
+ ALIGN(16)
+L(top): mov 8(vp,%rax,8), %r9
+ mov %r11, %rbp
+L(lo2): mov %r10, 24(rp,%rax,8) C offset 24
+L(lo1): shrd %cl, %r8, %rbp
+ shrd %cl, %r9, %r8
+ mov 16(vp,%rax,8), %r10
+ mov 24(vp,%rax,8), %r11
+ shrd %cl, %r10, %r9
+ shrd %cl, %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ ADCSBB (up,%rax,8), %rbp
+ ADCSBB 8(up,%rax,8), %r8
+ mov %r8, 40(rp,%rax,8) C offset 40
+ ADCSBB 16(up,%rax,8), %r9
+ mov 32(vp,%rax,8), %r8
+ ADCSBB 24(up,%rax,8), %r10
+ sbb R32(%rbx), R32(%rbx)
+ add $4, %rax
+ mov %rbp, (rp,%rax,8) C offset 32
+L(lo0):
+L(lo3): mov %r9, 16(rp,%rax,8) C offset 48
+ jnz L(top)
+
+L(end): mov %r10, 24(rp,%rax,8)
+L(wd1): shrd %cl, %r8, %r11
+ add R32(%rbx), R32(%rbx)
+ ADCSBB (up,%rax,8), %r11
+ mov %r11, 32(rp,%rax,8) C offset 32
+ adc R32(%rax), R32(%rax) C rax is zero after loop
+ shr R8(%rcx), %r8
+ ADDSUB %r8, %rax
+IFRSB( neg %rax)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C cnt
+IFDOS(` mov 64(%rsp), %r9 ') C cy
+ push %rbx
+ neg cy
+ sbb R32(%rbx), R32(%rbx) C initialise CF save register
+ jmp L(ent)
+EPILOGUE()
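Functionally, the two entry points compute rp[] = up[] + (vp[] << cnt) and rp[] = (vp[] << cnt) - up[], returning the bits shifted out of the top limb combined with the final carry or borrow. A sketch of the addlsh case only, assuming 0 < cnt < 64, 64-bit limbs and unsigned __int128; the ref_addlsh_n name and the exact form of the return value follow my reading of the interface.

    #include <stdint.h>
    #include <stddef.h>

    /* {rp,n} = {up,n} + ({vp,n} << cnt), 0 < cnt < 64.
       Return value: bits shifted out of vp[n-1] plus the addition carry. */
    static uint64_t ref_addlsh_n(uint64_t *rp, const uint64_t *up,
                                 const uint64_t *vp, size_t n, unsigned cnt)
    {
      uint64_t in = 0, cy = 0;
      for (size_t i = 0; i < n; i++) {
        uint64_t s = (vp[i] << cnt) | in;   /* limb i of vp << cnt       */
        in = vp[i] >> (64 - cnt);           /* bits moving to limb i + 1 */
        unsigned __int128 t = (unsigned __int128) up[i] + s + cy;
        rp[i] = (uint64_t) t;
        cy = (uint64_t) (t >> 64);
      }
      return in + cy;                       /* in < 2^63, so no overflow */
    }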
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm
new file mode 100644
index 0000000..1be829f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm
@@ -0,0 +1,190 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nehalem.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.0
+C AMD K10 4.0
+C AMD bull 5.0
+C AMD pile 4.84 5.39
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.56
+C AMD jaguar 5.30
+C Intel P4 15.7 17.2
+C Intel core2 5.15
+C Intel NHM 4.56
+C Intel SBR 3.44
+C Intel HWL 3.03
+C Intel BWL 2.77
+C Intel SKL 2.76
+C Intel atom 21
+C Intel SLM 11
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C N.B.: Be careful when editing: make sure the loop alignment padding does not
+C become large, as we currently fall into it.
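+
+C For orientation, the operation performed is roughly (C-level sketch):
+C
+C   cy = 0;
+C   for (i = 0; i < n; i++)
+C     {
+C       umul_ppmm (hi, lo, up[i], v0);    /* 64x64 -> 128 bit product */
+C       lo += cy;  hi += (lo < cy);       /* propagate previous carry */
+C       rp[i] += lo;                      /* or rp[i] -= lo for submul_1 */
+C       cy = hi + (carry resp. borrow out of that add/sub);
+C     }
+C   return cy;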
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%rbx')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %rbx
+
+ mov (up), %rax
+ lea -8(up,n_param,8), up
+ mov (rp), %r8
+ lea -8(rp,n_param,8), rp
+
+ test $1, R8(n_param)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n_param)
+ jnz L(b10)
+
+L(b00): mov $3, R32(n)
+ sub n_param, n
+ mul v0
+ mov $0, R32(%r11)
+ mov %r8, %r10
+ ADDSUB %rax, %r10
+ mov -8(up,n,8), %rax
+ adc %rdx, %r11
+ jmp L(lo0)
+
+L(b10): mov $1, R32(n)
+ sub n_param, n
+ mul v0
+ mov %r8, %r10
+ mov $0, R32(%r11)
+ ADDSUB %rax, %r10
+ mov 8(up,n,8), %rax
+ adc %rdx, %r11
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n_param)
+ jz L(b01)
+
+L(b11): mov $2, R32(n)
+ sub n_param, n
+ mul v0
+ ADDSUB %rax, %r8
+ mov $0, R32(%r9)
+ mov (up,n,8), %rax
+ adc %rdx, %r9
+ jmp L(lo3)
+
+L(b01): mov $0, R32(n)
+ sub n_param, n
+ xor %r11, %r11
+ add $4, n
+ jc L(end)
+
+ ALIGN(32)
+L(top): mul v0
+ ADDSUB %rax, %r8
+ mov $0, R32(%r9)
+ mov -16(up,n,8), %rax
+ adc %rdx, %r9
+L(lo1): mul v0
+ ADDSUB %r11, %r8
+ mov $0, R32(%r11)
+ mov -16(rp,n,8), %r10
+ adc $0, %r9
+ ADDSUB %rax, %r10
+ mov -8(up,n,8), %rax
+ adc %rdx, %r11
+ mov %r8, -24(rp,n,8)
+ ADDSUB %r9, %r10
+ adc $0, %r11
+L(lo0): mov -8(rp,n,8), %r8
+ mul v0
+ ADDSUB %rax, %r8
+ mov $0, R32(%r9)
+ mov (up,n,8), %rax
+ adc %rdx, %r9
+ mov %r10, -16(rp,n,8)
+ ADDSUB %r11, %r8
+ adc $0, %r9
+L(lo3): mul v0
+ mov (rp,n,8), %r10
+ mov $0, R32(%r11)
+ ADDSUB %rax, %r10
+ mov 8(up,n,8), %rax
+ adc %rdx, %r11
+ mov %r8, -8(rp,n,8)
+ ADDSUB %r9, %r10
+ adc $0, %r11
+L(lo2): mov 8(rp,n,8), %r8
+ mov %r10, (rp,n,8)
+ add $4, n
+ jnc L(top)
+
+L(end): mul v0
+ ADDSUB %rax, %r8
+ mov $0, R32(%rax)
+ adc %rdx, %rax
+ ADDSUB %r11, %r8
+ adc $0, %rax
+ mov %r8, (rp)
+
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h
new file mode 100644
index 0000000..f56c128
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h
@@ -0,0 +1,238 @@
+/* Nehalem gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 2933-3200 MHz Intel Xeon X3470 Nehalem */
+/* FFT tuning limit = 468,424,931 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 10
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 17
+
+#define DIV_1_VS_MUL_1_PERCENT 301
+
+#define MUL_TOOM22_THRESHOLD 18
+#define MUL_TOOM33_THRESHOLD 59
+#define MUL_TOOM44_THRESHOLD 169
+#define MUL_TOOM6H_THRESHOLD 230
+#define MUL_TOOM8H_THRESHOLD 333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 104
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 101
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 147
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 28
+#define SQR_TOOM3_THRESHOLD 98
+#define SQR_TOOM4_THRESHOLD 250
+#define SQR_TOOM6_THRESHOLD 351
+#define SQR_TOOM8_THRESHOLD 478
+
+#define MULMID_TOOM42_THRESHOLD 28
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 13
+
+#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 372, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31, 8}, { 511,10}, \
+ { 135,11}, { 79,10}, { 159, 9}, { 319,11}, \
+ { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \
+ { 319,12}, { 95,11}, { 191,10}, { 383,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,11}, { 319,10}, { 639,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,12}, { 223,11}, \
+ { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
+ { 639,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,11}, { 895,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 543,11}, { 1087,12}, { 607,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 831,13}, \
+ { 447,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1087,13}, { 575,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1279,13}, { 703,12}, { 1407,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 959,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1215,12}, \
+ { 2431,14}, { 639,13}, { 1343,12}, { 2687,13}, \
+ { 1407,12}, { 2815,13}, { 1471,14}, { 767,13}, \
+ { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2815,15}, { 767,14}, { 1663,13}, { 3455,14}, \
+ { 1919,16}, { 511,15}, { 1023,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \
+ { 1535,14}, { 3455,15}, { 1791,14}, { 3839,16}, \
+ { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \
+ { 4863,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3327,14}, { 6911,15}, { 3839,17}, { 1023,16}, \
+ { 2047,15}, { 4863,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \
+ { 7679,14}, { 15359,17}, { 2047,16}, { 4607,15}, \
+ { 9983,16}, { 5631,15}, { 11775,17}, { 3071,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 204
+#define MUL_FFT_THRESHOLD 4224
+
+#define SQR_FFT_MODF_THRESHOLD 336 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 336, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511,10}, { 135,11}, \
+ { 79, 9}, { 319, 6}, { 2687, 7}, { 1407, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \
+ { 319,12}, { 95,11}, { 191,10}, { 383,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,11}, { 319,10}, { 639,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,12}, { 223,11}, \
+ { 447,10}, { 895,11}, { 479,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,11}, { 543,12}, \
+ { 287,11}, { 607,12}, { 319,11}, { 671,12}, \
+ { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 415,11}, { 831,12}, { 447,11}, \
+ { 895,12}, { 479,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \
+ { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \
+ { 959,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1343,13}, \
+ { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,12}, { 2431,14}, \
+ { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \
+ { 2815,13}, { 1471,14}, { 767,13}, { 1663,14}, \
+ { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1535,13}, { 3071,14}, { 1663,13}, \
+ { 3455,14}, { 1919,16}, { 511,15}, { 1023,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,15}, { 1535,14}, { 3455,15}, { 1791,14}, \
+ { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \
+ { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,17}, \
+ { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6655,16}, \
+ { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \
+ { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \
+ { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 218
+#define SQR_FFT_THRESHOLD 3520
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 49
+#define MULLO_MUL_N_THRESHOLD 8397
+#define SQRLO_BASECASE_THRESHOLD 10
+#define SQRLO_DC_THRESHOLD 11
+#define SQRLO_SQR_THRESHOLD 7035
+
+#define DC_DIV_QR_THRESHOLD 47
+#define DC_DIVAPPR_Q_THRESHOLD 151
+#define DC_BDIV_QR_THRESHOLD 40
+#define DC_BDIV_Q_THRESHOLD 30
+
+#define INV_MULMOD_BNM1_THRESHOLD 34
+#define INV_NEWTON_THRESHOLD 199
+#define INV_APPR_THRESHOLD 157
+
+#define BINV_NEWTON_THRESHOLD 254
+#define REDC_1_TO_REDC_N_THRESHOLD 48
+
+#define MU_DIV_QR_THRESHOLD 1334
+#define MU_DIVAPPR_Q_THRESHOLD 1334
+#define MUPI_DIV_QR_THRESHOLD 83
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1308
+
+#define POWM_SEC_TABLE 1,64,66,452,1486
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 18
+#define SET_STR_DC_THRESHOLD 141
+#define SET_STR_PRECOMPUTE_THRESHOLD 1023
+
+#define FAC_DSC_THRESHOLD 182
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD2_DIV1_METHOD 5 /* 2.91% faster than 3 */
+#define HGCD_THRESHOLD 116
+#define HGCD_APPR_THRESHOLD 164
+#define HGCD_REDUCE_THRESHOLD 2205
+#define GCD_DC_THRESHOLD 321
+#define GCDEXT_DC_THRESHOLD 358
+#define JACOBI_BASE_METHOD 4 /* 0.12% faster than 1 */
+
+/* Tuneup completed successfully, took 452116 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm
new file mode 100644
index 0000000..a5a63e4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm
@@ -0,0 +1,196 @@
+dnl AMD64 mpn_hamdist -- hamming distance.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 3.26
+C AMD bd1 4.2
+C AMD bd2 4.2
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 1.15
+C AMD bobcat 7.29
+C AMD jaguar 2.53
+C Intel P4 n/a
+C Intel core2 n/a
+C Intel NHM 2.03
+C Intel SBR 1.66
+C Intel IBR 1.62
+C Intel HWL 1.50
+C Intel BWL 1.50
+C Intel SKL 1.50
+C Intel atom n/a
+C Intel SLM 2.55
+C VIA nano n/a
+
+C TODO
+C * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
+C Intel hardware. Perhaps mix such a loop with popcnt instructions.
+C * The random placement of the L0, L1, L2, etc blocks is due to branch
+C   shortening. More work could be done there.
+C * Combine the accumulators rax and rcx into one register to save some
+C   bookkeeping and a push/pop pair. Unfortunately this causes a slight
+C   slowdown for at least NHM and SBR.
+
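+C For orientation, the operation is simply (C-level sketch):
+C
+C   cnt = 0;
+C   for (i = 0; i < n; i++)
+C     cnt += popcount (up[i] ^ vp[i]);
+C   return cnt;
+C
+C The code below folds the xors into a 4-way unrolled popcnt loop and keeps
+C two accumulators (rax and rcx), presumably to break the dependency chain
+C through the popcnt destination registers.
+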
+define(`up', `%rdi')
+define(`vp', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`sum', `lea ($1,$2), $2')
+define(`sum', `add $1, $2')
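+C The second define overrides the first (m4 semantics), so sum(a,b) expands
+C to `add a, b'; the lea form is presumably kept as an alternative to try.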
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_hamdist)
+ FUNC_ENTRY(3)
+ push %rbx
+ push %rbp
+
+ mov (up), %r10
+ xor (vp), %r10
+
+ mov R32(n), R32(%r8)
+ and $3, R32(%r8)
+
+ xor R32(%rcx), R32(%rcx)
+ .byte 0xf3,0x49,0x0f,0xb8,0xc2 C popcnt %r10,%rax
+
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',`
+ movslq (%r9,%r8,4), %r8
+ add %r9, %r8
+ jmp *%r8
+',`
+ jmp *(%r9,%r8,8)
+')
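+C In PIC builds the JMPENT entries at L(tab) are 32-bit offsets relative to
+C the table itself, hence the movslq/add above; otherwise they are absolute
+C 8-byte pointers and we jump through the table directly.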
+
+L(3): mov 8(up), %r10
+ mov 16(up), %r11
+ xor 8(vp), %r10
+ xor 16(vp), %r11
+ xor R32(%rbp), R32(%rbp)
+ sub $4, n
+ jle L(x3)
+ mov 24(up), %r8
+ mov 32(up), %r9
+ add $24, up
+ add $24, vp
+ jmp L(e3)
+
+L(0): mov 8(up), %r9
+ xor 8(vp), %r9
+ mov 16(up), %r10
+ mov 24(up), %r11
+ xor R32(%rbx), R32(%rbx)
+ xor 16(vp), %r10
+ xor 24(vp), %r11
+ add $32, up
+ add $32, vp
+ sub $4, n
+ jle L(x4)
+
+ ALIGN(16)
+L(top):
+L(e0): .byte 0xf3,0x49,0x0f,0xb8,0xe9 C popcnt %r9,%rbp
+ mov (up), %r8
+ mov 8(up), %r9
+ sum( %rbx, %rax)
+L(e3): .byte 0xf3,0x49,0x0f,0xb8,0xda C popcnt %r10,%rbx
+ xor (vp), %r8
+ xor 8(vp), %r9
+ sum( %rbp, %rcx)
+L(e2): .byte 0xf3,0x49,0x0f,0xb8,0xeb C popcnt %r11,%rbp
+ mov 16(up), %r10
+ mov 24(up), %r11
+ add $32, up
+ sum( %rbx, %rax)
+L(e1): .byte 0xf3,0x49,0x0f,0xb8,0xd8 C popcnt %r8,%rbx
+ xor 16(vp), %r10
+ xor 24(vp), %r11
+ add $32, vp
+ sum( %rbp, %rcx)
+ sub $4, n
+ jg L(top)
+
+L(x4): .byte 0xf3,0x49,0x0f,0xb8,0xe9 C popcnt %r9,%rbp
+ sum( %rbx, %rax)
+L(x3): .byte 0xf3,0x49,0x0f,0xb8,0xda C popcnt %r10,%rbx
+ sum( %rbp, %rcx)
+ .byte 0xf3,0x49,0x0f,0xb8,0xeb C popcnt %r11,%rbp
+ sum( %rbx, %rax)
+ sum( %rbp, %rcx)
+L(x2): add %rcx, %rax
+L(x1): pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(2): mov 8(up), %r11
+ xor 8(vp), %r11
+ sub $2, n
+ jle L(n2)
+ mov 16(up), %r8
+ mov 24(up), %r9
+ xor R32(%rbx), R32(%rbx)
+ xor 16(vp), %r8
+ xor 24(vp), %r9
+ add $16, up
+ add $16, vp
+ jmp L(e2)
+L(n2): .byte 0xf3,0x49,0x0f,0xb8,0xcb C popcnt %r11,%rcx
+ jmp L(x2)
+
+L(1): dec n
+ jle L(x1)
+ mov 8(up), %r8
+ mov 16(up), %r9
+ xor 8(vp), %r8
+ xor 16(vp), %r9
+ xor R32(%rbp), R32(%rbp)
+ mov 24(up), %r10
+ mov 32(up), %r11
+ add $40, up
+ add $8, vp
+ jmp L(e1)
+
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(0), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm
new file mode 100644
index 0000000..0a3c867
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm
@@ -0,0 +1,182 @@
+dnl AMD64 mpn_popcount -- population count.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 1.39
+C AMD bd1 4
+C AMD bd2 4
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 0.72
+C AMD bobcat 5.78
+C AMD jaguar 1.27
+C Intel P4 n/a
+C Intel core2 n/a
+C Intel NHM 1.04
+C Intel SBR 1.02
+C Intel IBR 1.0
+C Intel HWL 1.0
+C Intel BWL 1.0
+C Intel SKL 1.0
+C Intel atom n/a
+C Intel SLM 1.34
+C VIA nano n/a
+
+C TODO
+C * We could approach 0.5 c/l for AMD Zen with more unrolling. That would
+C not cause any additional feed-in overhead as we already use a jump table.
+C * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
+C Intel hardware. Perhaps mix such a loop with popcnt instructions.
+C * The random placement of the L0, L1, L2, etc blocks is due to branch
+C   shortening.
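+
+C For orientation, the operation is simply (C-level sketch):
+C
+C   cnt = 0;
+C   for (i = 0; i < n; i++)
+C     cnt += popcount (up[i]);
+C   return cnt;
+C
+C The popcnt instructions below are emitted as raw .byte sequences, presumably
+C so that the file assembles even with assemblers lacking the mnemonic; each
+C line carries the intended instruction as a comment.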
+
+define(`up', `%rdi')
+define(`n', `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_popcount)
+ FUNC_ENTRY(2)
+
+ mov R32(n), R32(%r8)
+ and $7, R32(%r8)
+
+ .byte 0xf3,0x48,0x0f,0xb8,0x07 C popcnt (up), %rax
+ xor R32(%rcx), R32(%rcx)
+
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',`
+ movslq (%r9,%r8,4), %r8
+ add %r9, %r8
+ jmp *%r8
+',`
+ jmp *(%r9,%r8,8)
+')
+
+L(3): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 C popcnt 8(up), %r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 C popcnt 16(up), %r11
+ add $24, up
+ sub $8, n
+ jg L(e34)
+ add %r10, %rax
+ add %r11, %rax
+L(s1): FUNC_EXIT()
+ ret
+
+L(1): sub $8, n
+ jle L(s1)
+ .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 C popcnt 8(up), %r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 C popcnt 16(up), %r9
+ add $8, up
+ jmp L(e12)
+
+L(7): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 C popcnt 0x8(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 C popcnt 0x10(%rdi),%r11
+ add $-8, up
+ jmp L(e07)
+
+L(0): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx
+ .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11
+ jmp L(e07)
+
+L(4): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx
+ .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11
+ add $32, up
+ sub $8, n
+ jle L(x4)
+
+ ALIGN(16)
+L(top):
+L(e34): .byte 0xf3,0x4c,0x0f,0xb8,0x07 C popcnt (%rdi),%r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%r9
+ add %r10, %rcx
+ add %r11, %rax
+L(e12): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11
+ add %r8, %rcx
+ add %r9, %rax
+L(e07): .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x20 C popcnt 0x20(%rdi),%r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x28 C popcnt 0x28(%rdi),%r9
+ add %r10, %rcx
+ add %r11, %rax
+L(e56): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x30 C popcnt 0x30(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x38 C popcnt 0x38(%rdi),%r11
+ add $64, up
+ add %r8, %rcx
+ add %r9, %rax
+ sub $8, n
+ jg L(top)
+
+L(x4): add %r10, %rcx
+ add %r11, %rax
+L(x2): add %rcx, %rax
+
+ FUNC_EXIT()
+ ret
+
+L(2): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx
+ sub $8, n
+ jle L(x2)
+ .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 C popcnt 0x10(%rdi),%r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 C popcnt 0x18(%rdi),%r9
+ add $16, up
+ jmp L(e12)
+
+L(5): .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 C popcnt 0x8(%rdi),%r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 C popcnt 0x10(%rdi),%r9
+ add $-24, up
+ jmp L(e56)
+
+L(6): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx
+ .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 C popcnt 0x10(%rdi),%r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 C popcnt 0x18(%rdi),%r9
+ add $-16, up
+ jmp L(e56)
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(0), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
+ JMPENT( L(4), L(tab))
+ JMPENT( L(5), L(tab))
+ JMPENT( L(6), L(tab))
+ JMPENT( L(7), L(tab))
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm
new file mode 100644
index 0000000..fc71c1b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm
@@ -0,0 +1,549 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Nehalem and Westmere.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single out the basecases before the pushes.
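+
+C For orientation, this follows the usual Montgomery reduction, roughly
+C (C-level sketch of the generic algorithm):
+C
+C   for (j = n; j > 0; j--)
+C     {
+C       q0 = up[0] * u0inv;                        /* mod 2^64          */
+C       cy = addmul of {mp, n} * q0 into {up, n};  /* forces up[0] to 0 */
+C       up[0] = cy;                                /* stash carry limb  */
+C       up++;
+C     }
+C   return mpn_add_n (rp, up, up - n, n);          /* add stashed carries */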
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
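+C With this definition I(x,y) expands to x; changing it to `$2' makes every
+C I(x,y) use the second, fully indexed operand instead (e.g. the
+C I(-8(up),-24(up,i,8)) stores in the wind-down code below).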
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea (mp_param,n,8), mp
+ lea (up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): test $2, R8(n)
+ jz L(b3)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+
+L(otp1):lea 3(n), i
+ mov (mp,n,8), %rax
+ mov (up,n,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov 8(mp,n,8), %rax
+ adc %rdx, %r9
+ mul q0
+ mov $0, R32(%r11)
+ mov 8(up,n,8), %rbx
+ add %rax, %rbx
+ mov 16(mp,n,8), %rax
+ adc %rdx, %r11
+ add %r9, %rbx
+ adc $0, %r11
+ mov 16(up,n,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov 24(mp,n,8), %rax
+ adc %rdx, %r9
+ mov %rbx, 8(up,n,8)
+ imul u0inv, %rbx C next q limb
+ jmp L(e1)
+
+ ALIGNx
+L(tp1): mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov -16(mp,i,8), %rax
+ adc %rdx, %r9
+ mul q0
+ add %r11, %rbp
+ mov $0, R32(%r11)
+ mov -16(up,i,8), %r10
+ adc $0, %r9
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -24(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov -8(up,i,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov (mp,i,8), %rax
+ adc %rdx, %r9
+ mov %r10, -16(up,i,8)
+L(e1): add %r11, %rbp
+ adc $0, %r9
+ mul q0
+ mov (up,i,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -8(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov 8(up,i,8), %rbp
+ mov %r10, (up,i,8)
+ add $4, i
+ jnc L(tp1)
+
+L(ed1): mul q0
+ add %rax, %rbp
+ adc $0, %rdx
+ add %r11, %rbp
+ adc $0, %rdx
+ mov %rbp, I(-8(up),-24(up,i,8))
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b3): cmp $-3, R32(n)
+ jz L(n3)
+
+L(otp3):lea 5(n), i
+ mov (mp,n,8), %rax
+ mov (up,n,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov 8(mp,n,8), %rax
+ adc %rdx, %r9
+ mul q0
+ mov 8(up,n,8), %rbx
+ mov $0, R32(%r11)
+ add %rax, %rbx
+ mov 16(mp,n,8), %rax
+ adc %rdx, %r11
+ add %r9, %rbx
+ adc $0, %r11
+ mov 16(up,n,8), %rbp
+ mov %rbx, 8(up,n,8)
+ imul u0inv, %rbx C next q limb
+C jmp L(tp3)
+
+ ALIGNx
+L(tp3): mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov -16(mp,i,8), %rax
+ adc %rdx, %r9
+ mul q0
+ add %r11, %rbp
+ mov $0, R32(%r11)
+ mov -16(up,i,8), %r10
+ adc $0, %r9
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -24(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov -8(up,i,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov (mp,i,8), %rax
+ adc %rdx, %r9
+ mov %r10, -16(up,i,8)
+ add %r11, %rbp
+ adc $0, %r9
+ mul q0
+ mov (up,i,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -8(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov 8(up,i,8), %rbp
+ mov %r10, (up,i,8)
+ add $4, i
+ jnc L(tp3)
+
+L(ed3): mul q0
+ add %rax, %rbp
+ adc $0, %rdx
+ add %r11, %rbp
+ adc $0, %rdx
+ mov %rbp, I(-8(up),-24(up,i,8))
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp3)
+C jmp L(cj)
+
+L(cj):
+IFSTD(` lea (up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea (up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+IFSTD(` sub $8, %rsp ')
+IFDOS(` sub $40, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_add_n)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(bx0): test $2, R8(n)
+ jnz L(b2)
+
+L(b0):
+L(otp0):lea 2(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov $0, R32(%r11)
+ mov (up,n,8), %r10
+ add %rax, %r10
+ mov 8(mp,n,8), %rax
+ adc %rdx, %r11
+ mov 8(up,n,8), %rbx
+ mul q0
+ add %rax, %rbx
+ mov $0, R32(%r9)
+ mov 16(mp,n,8), %rax
+ adc %rdx, %r9
+ add %r11, %rbx
+ adc $0, %r9
+ mul q0
+ mov 16(up,n,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 24(mp,n,8), %rax
+ adc %rdx, %r11
+ mov %rbx, 8(up,n,8)
+ imul u0inv, %rbx C next q limb
+ jmp L(e0)
+
+ ALIGNx
+L(tp0): mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov -16(mp,i,8), %rax
+ adc %rdx, %r9
+ mul q0
+ add %r11, %rbp
+ mov $0, R32(%r11)
+ mov -16(up,i,8), %r10
+ adc $0, %r9
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -24(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov -8(up,i,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov (mp,i,8), %rax
+ adc %rdx, %r9
+ mov %r10, -16(up,i,8)
+ add %r11, %rbp
+ adc $0, %r9
+ mul q0
+ mov (up,i,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -8(up,i,8)
+L(e0): add %r9, %r10
+ adc $0, %r11
+ mov 8(up,i,8), %rbp
+ mov %r10, (up,i,8)
+ add $4, i
+ jnc L(tp0)
+
+L(ed0): mul q0
+ add %rax, %rbp
+ adc $0, %rdx
+ add %r11, %rbp
+ adc $0, %rdx
+ mov %rbp, I(-8(up),-24(up,i,8))
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp0)
+ jmp L(cj)
+
+L(b2): cmp $-2, R32(n)
+ jz L(n2)
+
+L(otp2):lea 4(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov (up,n,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 8(mp,n,8), %rax
+ adc %rdx, %r11
+ mov 8(up,n,8), %rbx
+ mul q0
+ add %rax, %rbx
+ mov $0, R32(%r9)
+ mov 16(mp,n,8), %rax
+ adc %rdx, %r9
+ mul q0
+ add %r11, %rbx
+ mov $0, R32(%r11)
+ mov 16(up,n,8), %r10
+ adc $0, %r9
+ add %rax, %r10
+ mov 24(mp,n,8), %rax
+ adc %rdx, %r11
+ mov %rbx, 8(up,n,8)
+ imul u0inv, %rbx C next q limb
+ jmp L(e2)
+
+ ALIGNx
+L(tp2): mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov -16(mp,i,8), %rax
+ adc %rdx, %r9
+ mul q0
+ add %r11, %rbp
+ mov $0, R32(%r11)
+ mov -16(up,i,8), %r10
+ adc $0, %r9
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -24(up,i,8)
+L(e2): add %r9, %r10
+ adc $0, %r11
+ mov -8(up,i,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov (mp,i,8), %rax
+ adc %rdx, %r9
+ mov %r10, -16(up,i,8)
+ add %r11, %rbp
+ adc $0, %r9
+ mul q0
+ mov (up,i,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -8(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov 8(up,i,8), %rbp
+ mov %r10, (up,i,8)
+ add $4, i
+ jnc L(tp2)
+
+L(ed2): mul q0
+ add %rax, %rbp
+ adc $0, %rdx
+ add %r11, %rbp
+ adc $0, %rdx
+ mov %rbp, I(-8(up),-24(up,i,8))
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp2)
+ jmp L(cj)
+
+L(n1): mov (mp_param), %rax
+ mul q0
+ add -8(up), %rax
+ adc (up), %rdx
+ mov %rdx, (rp)
+ mov $0, R32(%rax)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+L(n2): mov (mp_param), %rax
+ mov -16(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov -8(up), %r10
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, q0
+ imul u0inv, q0 C next q0
+ mov -16(mp), %rax
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov (up), %r14
+ mul q0
+ add %rax, %r14
+ adc $0, %rdx
+ add %r9, %r14
+ adc $0, %rdx
+ xor R32(%rax), R32(%rax)
+ add %r11, %r14
+ adc 8(up), %rdx
+ mov %r14, (rp)
+ mov %rdx, 8(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+ ALIGNx
+L(n3): mov -24(mp), %rax
+ mov -24(up), %r10
+ mul q0
+ add %rax, %r10
+ mov -16(mp), %rax
+ mov %rdx, %r11
+ adc $0, %r11
+ mov -16(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ add %r11, %rbp
+ mov -8(up), %r10
+ adc $0, %r9
+ mul q0
+ mov %rbp, q0
+ imul u0inv, q0 C next q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ mov %rbp, -16(up)
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, -8(up)
+ mov %r11, -24(up) C up[0]
+ lea 8(up), up C up++
+ dec j
+ jnz L(n3)
+
+ mov -48(up), %rdx
+ mov -40(up), %rbx
+ xor R32(%rax), R32(%rax)
+ add %rbp, %rdx
+ adc %r10, %rbx
+ adc -8(up), %r11
+ mov %rdx, (rp)
+ mov %rbx, 8(rp)
+ mov %r11, 16(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sec_tabselect.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
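+C This is a thin wrapper pulling in the SSE implementation below.  Roughly,
+C mpn_sec_tabselect (rp, tab, n, nents, which) copies the which'th n-limb
+C entry of tab to rp while touching every entry, so the memory access pattern
+C does not depend on which.
+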
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm
new file mode 100644
index 0000000..21f0bf4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm
@@ -0,0 +1,224 @@
+dnl AMD64 mpn_addmul_2 optimised for Intel Sandy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR 2.93 this
+C Intel IBR 2.66 this
+C Intel HWL 2.5 2.15
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C This code is the result of running a code generation and optimisation tool
+C suite written by David Harvey and Torbjorn Granlund.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`n', `%rcx')
+define(`v0', `%rbx')
+define(`v1', `%rbp')
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+define(`X0', `%r12')
+define(`X1', `%r13')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_addmul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov (up), %rax
+
+ mov n_param, n
+ neg n
+
+ lea (up,n_param,8), up
+ lea 8(rp,n_param,8), rp
+ mul v0
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): mov -8(rp,n,8), X0
+ mov %rdx, w1
+ add %rax, X0
+ adc $0, w1
+ mov (up,n,8), %rax
+ xor w0, w0
+ xor w3, w3
+ test $2, R8(n)
+ jnz L(b10)
+
+L(b00): nop C this nop makes the loop go faster on SBR!
+ mul v1
+ mov (rp,n,8), X1
+ jmp L(lo0)
+
+L(b10): lea -2(n), n
+ jmp L(lo2)
+
+L(bx1): mov -8(rp,n,8), X1
+ mov %rdx, w3
+ add %rax, X1
+ adc $0, w3
+ mov (up,n,8), %rax
+ xor w1, w1
+ xor w2, w2
+ test $2, R8(n)
+ jz L(b11)
+
+L(b01): mov (rp,n,8), X0
+ inc n
+ jmp L(lo1)
+
+L(b11): dec n
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top):
+L(lo1): mul v1
+ mov %rdx, w0 C 1
+ add %rax, X0 C 0
+ adc $0, w0 C 1
+ add w1, X1 C 3
+ adc $0, w3 C 0
+ add w2, X0 C 0
+ adc $0, w0 C 1
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, X0 C 0
+ mov %rdx, w1 C 1
+ adc $0, w1 C 1
+ mov (up,n,8), %rax
+ mul v1
+ mov X1, -16(rp,n,8) C 3
+ mov (rp,n,8), X1 C 1
+ add w3, X0 C 0
+ adc $0, w1 C 1
+L(lo0): mov %rdx, w2 C 2
+ mov X0, -8(rp,n,8) C 0
+ add %rax, X1 C 1
+ adc $0, w2 C 2
+ mov 8(up,n,8), %rax
+ add w0, X1 C 1
+ adc $0, w2 C 2
+ mul v0
+ add %rax, X1 C 1
+ mov %rdx, w3 C 2
+ adc $0, w3 C 2
+ mov 8(up,n,8), %rax
+L(lo3): mul v1
+ add w1, X1 C 1
+ mov 8(rp,n,8), X0 C 2
+ adc $0, w3 C 2
+ mov %rdx, w0 C 3
+ add %rax, X0 C 2
+ adc $0, w0 C 3
+ mov 16(up,n,8), %rax
+ mul v0
+ add w2, X0 C 2
+ mov X1, (rp,n,8) C 1
+ mov %rdx, w1 C 3
+ adc $0, w0 C 3
+ add %rax, X0 C 2
+ adc $0, w1 C 3
+ mov 16(up,n,8), %rax
+ add w3, X0 C 2
+ adc $0, w1 C 3
+L(lo2): mul v1
+ mov 16(rp,n,8), X1 C 3
+ add %rax, X1 C 3
+ mov %rdx, w2 C 4
+ adc $0, w2 C 4
+ mov 24(up,n,8), %rax
+ mov X0, 8(rp,n,8) C 2
+ mul v0
+ add w0, X1 C 3
+ mov %rdx, w3 C 4
+ adc $0, w2 C 4
+ add %rax, X1 C 3
+ mov 24(up,n,8), %rax
+ mov 24(rp,n,8), X0 C 0 useless but harmless final read
+ adc $0, w3 C 4
+ add $4, n
+ jnc L(top)
+
+L(end): mul v1
+ add w1, X1
+ adc $0, w3
+ add w2, %rax
+ adc $0, %rdx
+ mov X1, I(-16(rp),-16(rp,n,8))
+ add w3, %rax
+ adc $0, %rdx
+ mov %rax, I(-8(rp),-8(rp,n,8))
+ mov %rdx, %rax
+
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm
new file mode 100644
index 0000000..2319a80
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm
@@ -0,0 +1,54 @@
+dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 63)
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_addlsh1_n)
+ define(func_nc, mpn_addlsh1_nc)')
+ifdef(`OPERATION_rsblsh1_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsblsh1_n)
+ define(func_nc, mpn_rsblsh1_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+include_mpn(`x86_64/coreisbr/aorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm
new file mode 100644
index 0000000..3b7bb22
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm
@@ -0,0 +1,56 @@
+dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_addlsh2_n)
+ define(func_nc, mpn_addlsh2_nc)')
+ifdef(`OPERATION_rsblsh2_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsblsh2_n)
+ define(func_nc, mpn_rsblsh2_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C mpn_rsblsh2_nc removed below; its idea of carry-in is inconsistent with
+C refmpn_rsblsh2_nc.
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n)
+include_mpn(`x86_64/coreisbr/aorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm
new file mode 100644
index 0000000..23ace41
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm
@@ -0,0 +1,173 @@
+dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
+dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
+
+dnl Copyright 2009-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 3.25
+C Intel NHM 4
+C Intel SBR 2 C (or 1.95 when L(top)'s alignment = 16 (mod 32))
+C Intel atom ?
+C VIA nano ?
+
+C This code probably runs close to optimally on Sandy Bridge (using 4-way
+C unrolling). It also runs reasonably well on Core 2, but it runs poorly on
+C all other processors, including Nehalem.
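+
+C For orientation, per limb the code computes roughly (C-level sketch, with
+C LSH and RSH = 64-LSH supplied by the including file):
+C
+C   s = (vp[i] << LSH) | (vp[i-1] >> RSH);    /* vp[-1] taken as 0 */
+C   rp[i] = up[i] + s + carry;                /* addlshC variants  */
+C   rp[i] = s - up[i] - borrow;               /* rsblshC variants  */
+C
+C returning (vp[n-1] >> RSH) plus the final carry resp. minus the final
+C borrow.  Between ADCSBB groups the carry flag is parked in %rax with
+C sbb %eax,%eax (0 or -1) and restored with add %eax,%eax.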
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbp
+ mov cy, %rax
+ neg %rax C set msb on carry
+ xor R32(%rbp), R32(%rbp) C limb carry
+ mov (vp), %r8
+ shrd $RSH, %r8, %rbp
+ mov R32(n), R32(%r9)
+ and $3, R32(%r9)
+ je L(b00)
+ cmp $2, R32(%r9)
+ jc L(b01)
+ je L(b10)
+ jmp L(b11)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbp
+ xor R32(%rbp), R32(%rbp) C limb carry
+ mov (vp), %r8
+ shrd $RSH, %r8, %rbp
+ mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): mov 8(vp), %r9
+ shrd $RSH, %r9, %r8
+ mov 16(vp), %r10
+ shrd $RSH, %r10, %r9
+ add R32(%rax), R32(%rax) C init carry flag
+ ADCSBB (up), %rbp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ mov %rbp, (rp)
+ mov %r8, 8(rp)
+ mov %r9, 16(rp)
+ mov %r10, %rbp
+ lea 24(up), up
+ lea 24(vp), vp
+ lea 24(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry flag
+ sub $3, n
+ ja L(top)
+ jmp L(end)
+
+L(b01): add R32(%rax), R32(%rax) C init carry flag
+ ADCSBB (up), %rbp
+ mov %rbp, (rp)
+ mov %r8, %rbp
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry flag
+ sub $1, n
+ ja L(top)
+ jmp L(end)
+
+L(b10): mov 8(vp), %r9
+ shrd $RSH, %r9, %r8
+ add R32(%rax), R32(%rax) C init carry flag
+ ADCSBB (up), %rbp
+ ADCSBB 8(up), %r8
+ mov %rbp, (rp)
+ mov %r8, 8(rp)
+ mov %r9, %rbp
+ lea 16(up), up
+ lea 16(vp), vp
+ lea 16(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry flag
+ sub $2, n
+ ja L(top)
+ jmp L(end)
+
+ ALIGN(16)
+L(top): mov (vp), %r8
+ shrd $RSH, %r8, %rbp
+L(b00): mov 8(vp), %r9
+ shrd $RSH, %r9, %r8
+ mov 16(vp), %r10
+ shrd $RSH, %r10, %r9
+ mov 24(vp), %r11
+ shrd $RSH, %r11, %r10
+ lea 32(vp), vp
+ add R32(%rax), R32(%rax) C restore carry flag
+ ADCSBB (up), %rbp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ lea 32(up), up
+ mov %rbp, (rp)
+ mov %r8, 8(rp)
+ mov %r9, 16(rp)
+ mov %r10, 24(rp)
+ mov %r11, %rbp
+ lea 32(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry flag
+ sub $4, n
+ jnz L(top)
+
+L(end): shr $RSH, %rbp
+ add R32(%rax), R32(%rax) C restore carry flag
+ ADCSBB $0, %rbp
+ mov %rbp, %rax
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm
new file mode 100644
index 0000000..db8ee68
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm
@@ -0,0 +1,215 @@
+dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
+dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
+dnl Optimised for Sandy Bridge.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 5.25
+C Intel P4 ?
+C Intel core2 3.1
+C Intel NHM 3.95
+C Intel SBR 2.75
+C Intel atom ?
+C VIA nano ?
+
+C The inner-loop probably runs close to optimally on Sandy Bridge (using 4-way
+C unrolling). The rest of the code is quite crude, and could perhaps be made
+C both smaller and faster.
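+
+C For orientation: rp[] = up[] + (vp[] << cnt) for addlsh_n, or
+C (vp[] << cnt) - up[] for rsblsh_n, with 1 <= cnt <= 63.  The return value
+C combines the bits shifted out of vp[n-1], i.e. vp[n-1] >> (64-cnt), with
+C the final carry resp. borrow (negated for rsblsh, see IFRSB).  The _nc
+C entry point takes a carry-in and primes the CF save register %rbx with
+C neg cy / sbb %ebx,%ebx before joining the common code at L(ent).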
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnt', `%r8')
+define(`cy', `%r9') C for _nc variant
+
+ifdef(`OPERATION_addlsh_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(IFRSB, )
+ define(func_n, mpn_addlsh_n)
+ define(func_nc, mpn_addlsh_nc)')
+ifdef(`OPERATION_rsblsh_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(IFRSB, `$1')
+ define(func_n, mpn_rsblsh_n)
+ define(func_nc, mpn_rsblsh_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C mpn_rsblsh_nc removed below; its idea of carry-in is inconsistent with
+C refmpn_rsblsh_nc.
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C cnt
+ push %rbx
+ xor R32(%rbx), R32(%rbx) C clear CF save register
+L(ent): push %rbp
+ mov R32(n), R32(%rbp)
+ mov n, %rax
+ mov R32(cnt), R32(%rcx)
+ neg R32(%rcx)
+ and $3, R32(%rbp)
+ jz L(b0)
+ lea -32(vp,%rbp,8), vp
+ lea -32(up,%rbp,8), up
+ lea -32(rp,%rbp,8), rp
+ cmp $2, R32(%rbp)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): xor %r8, %r8
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ shrd R8(%rcx), %r9, %r8
+ shrd R8(%rcx), %r10, %r9
+ mov 24(vp), %r11
+ shrd R8(%rcx), %r11, %r10
+ sub $3, %rax
+ jz L(3)
+ add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ lea 32(up), up
+ jmp L(lo3)
+L(3): add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ jmp L(wd3)
+
+L(b0): mov (vp), %r8
+ mov 8(vp), %r9
+ xor R32(%rbp), R32(%rbp)
+ jmp L(lo0)
+
+L(b1): xor %r10, %r10
+ mov 24(vp), %r11
+ shrd R8(%rcx), %r11, %r10
+ sub $1, %rax
+ jz L(1)
+ add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB 24(up), %r10
+ lea 32(up), up
+ mov (vp), %r8
+ jmp L(lo1)
+L(1): add R32(%rbx), R32(%rbx)
+ ADCSBB 24(up), %r10
+ jmp L(wd1)
+
+L(b2): xor %r9, %r9
+ mov 16(vp), %r10
+ shrd R8(%rcx), %r10, %r9
+ mov 24(vp), %r11
+ shrd R8(%rcx), %r11, %r10
+ sub $2, %rax
+ jz L(2)
+ add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ lea 32(up), up
+ jmp L(lo2)
+L(2): add R32(%rbx), R32(%rbx)
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ jmp L(wd2)
+
+ ALIGN(32) C 16-byte alignment is not enough!
+L(top): shrd R8(%rcx), %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB (up), %rbp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ mov %rbp, (rp)
+ lea 32(up), up
+L(lo3): mov %r8, 8(rp)
+L(lo2): mov %r9, 16(rp)
+ mov (vp), %r8
+L(lo1): mov %r10, 24(rp)
+ mov 8(vp), %r9
+ mov %r11, %rbp
+ lea 32(rp), rp
+ sbb R32(%rbx), R32(%rbx)
+L(lo0): shrd R8(%rcx), %r8, %rbp
+ mov 16(vp), %r10
+ shrd R8(%rcx), %r9, %r8
+ shrd R8(%rcx), %r10, %r9
+ mov 24(vp), %r11
+ sub $4, %rax
+ jg L(top)
+
+ shrd R8(%rcx), %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ ADCSBB (up), %rbp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ mov %rbp, (rp)
+L(wd3): mov %r8, 8(rp)
+L(wd2): mov %r9, 16(rp)
+L(wd1): mov %r10, 24(rp)
+ adc R32(%rax), R32(%rax) C rax is zero after loop
+ shr R8(%rcx), %r11
+ ADDSUB %r11, %rax
+IFRSB( neg %rax)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C cnt
+IFDOS(` mov 64(%rsp), %r9 ') C cy
+ push %rbx
+ neg cy
+ sbb R32(%rbx), R32(%rbx) C initialise CF save register
+ jmp L(ent)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm
new file mode 100644
index 0000000..61fee3e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm
@@ -0,0 +1,203 @@
+dnl AMD64 mpn_add_n, mpn_sub_n optimised for Sandy bridge, Ivy bridge, and
+dnl Haswell.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1.75\2.52
+C AMD K10 1.5
+C AMD bd1 1.69\2.25
+C AMD bd2 1.65
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 1.5
+C AMD bt1 2.67
+C AMD bt2 2.16
+C Intel P4 11.54
+C Intel PNR 5
+C Intel NHM 5.5
+C Intel SBR 1.54
+C Intel IBR 1.5
+C Intel HWL 1.32
+C Intel BWL 1.07
+C Intel SKL 1.21
+C Intel atom 4.3
+C Intel SLM 3
+C VIA nano ?
+
+C The loop of this code was manually written. As far as we know it runs close
+C to optimally on Intel SBR, IBR, and HWL, except for the fluctuation problems.
+C It also runs slightly faster on average on AMD bd1 and bd2.
+C
+C No micro-optimisation has been done.
+C
+C N.B.! The loop alignment padding insns are executed. If editing the code,
+C make sure the padding does not become excessive. It is now a 4-byte nop.
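+
+C For orientation: rp[i] = up[i] + vp[i] (resp. - vp[i]) with a carry chain,
+C returning the final carry/borrow (0 or 1).  n mod 4 selects one of the
+C b00/b01/b10/b11 feed-in paths and n/4 counts the unrolled iterations;
+C jrcxz catches the case where that count is already zero.  Each path starts
+C with neg %r8, turning the carry-in limb (0 for plain add_n/sub_n) into CF.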
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`vp', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ xor %r8, %r8
+
+L(ent): mov R32(n), R32(%rax)
+ shr $2, n
+
+ test $1, R8(%rax)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(%rax)
+ jnz L(b10)
+
+L(b00): neg %r8
+ mov (up), %r8
+ mov 8(up), %r9
+ ADCSBB (vp), %r8
+ ADCSBB 8(vp), %r9
+ mov 16(up), %r10
+ mov 24(up), %r11
+ lea 32(up), up
+ ADCSBB 16(vp), %r10
+ ADCSBB 24(vp), %r11
+ lea 32(vp), vp
+ lea -16(rp), rp
+ jmp L(lo0)
+
+L(b10): neg %r8
+ mov (up), %r10
+ mov 8(up), %r11
+ ADCSBB 0(vp), %r10
+ ADCSBB 8(vp), %r11
+ jrcxz L(e2)
+ mov 16(up), %r8
+ mov 24(up), %r9
+ lea 16(up), up
+ ADCSBB 16(vp), %r8
+ ADCSBB 24(vp), %r9
+ lea 16(vp), vp
+C lea (rp), rp
+ jmp L(lo2)
+
+L(e2): mov %r10, (rp)
+ mov %r11, 8(rp)
+ setc R8(%rax)
+ FUNC_EXIT()
+ ret
+
+L(bx1): test $2, R8(%rax)
+ jnz L(b11)
+
+L(b01): neg %r8
+ mov (up), %r11
+ ADCSBB (vp), %r11
+ jrcxz L(e1)
+ mov 8(up), %r8
+ mov 16(up), %r9
+ lea 8(up), up
+ lea -8(rp), rp
+ ADCSBB 8(vp), %r8
+ ADCSBB 16(vp), %r9
+ lea 8(vp), vp
+ jmp L(lo1)
+
+L(e1): mov %r11, (rp)
+ setc R8(%rax)
+ FUNC_EXIT()
+ ret
+
+L(b11): neg %r8
+ mov (up), %r9
+ ADCSBB (vp), %r9
+ mov 8(up), %r10
+ mov 16(up), %r11
+ lea 24(up), up
+ ADCSBB 8(vp), %r10
+ ADCSBB 16(vp), %r11
+ lea 24(vp), vp
+ mov %r9, (rp)
+ lea 8(rp), rp
+ jrcxz L(end)
+
+ ALIGN(32)
+L(top): mov (up), %r8
+ mov 8(up), %r9
+ ADCSBB (vp), %r8
+ ADCSBB 8(vp), %r9
+L(lo2): mov %r10, (rp)
+L(lo1): mov %r11, 8(rp)
+ mov 16(up), %r10
+ mov 24(up), %r11
+ lea 32(up), up
+ ADCSBB 16(vp), %r10
+ ADCSBB 24(vp), %r11
+ lea 32(vp), vp
+L(lo0): mov %r8, 16(rp)
+L(lo3): mov %r9, 24(rp)
+ lea 32(rp), rp
+ dec n
+ jnz L(top)
+
+L(end): mov R32(n), R32(%rax) C zero rax
+ mov %r10, (rp)
+ mov %r11, 8(rp)
+ setc R8(%rax)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ jmp L(ent)
+EPILOGUE()
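
For readers following the carry chain above: mpn_add_n and mpn_sub_n compute a full limb-vector add or subtract and return the carry (or borrow) out, with the _nc entry points accepting a carry-in in %r8. Below is a minimal portable C sketch of the add flavour, not the GMP implementation; it ignores the 4-way unrolling, the jrcxz-based feed-in and the ADCSBB macro, and the name ref_add_n is made up for illustration (it assumes full 64-bit limbs, no nails).

#include <gmp.h>   /* mp_limb_t, mp_ptr, mp_srcptr, mp_size_t */

/* rp[] = up[] + vp[], n >= 1, returns the carry out.  mpn_sub_n is the same
   loop with subtraction and a borrow; mpn_add_nc/mpn_sub_nc start cy from
   the caller's carry-in instead of 0. */
static mp_limb_t
ref_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  mp_limb_t cy = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      mp_limb_t u = up[i], s = u + vp[i];
      mp_limb_t c1 = s < u;        /* carry from u + v */
      mp_limb_t r = s + cy;
      mp_limb_t c2 = r < s;        /* carry from adding the incoming carry */
      rp[i] = r;
      cy = c1 | c2;                /* at most one of c1, c2 can be set */
    }
  return cy;
}
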
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm
new file mode 100644
index 0000000..b4c1572
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm
@@ -0,0 +1,212 @@
+dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Sandy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.27
+C AMD K10 4.27 4.54
+C AMD bull 4.76
+C AMD pile 4.55
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.30
+C AMD jaguar 5.28
+C Intel P4 16.2 17.1
+C Intel core2 5.26
+C Intel NHM 5.09
+C Intel SBR 3.21
+C Intel IBR 2.96
+C Intel HWL 2.81
+C Intel BWL 2.76
+C Intel SKL 2.76
+C Intel atom 21.5
+C Intel SLM 9.5
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjörn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%rbx')
+
+define(`I',`$1')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+IFDOS(` define(`up', ``%rsi'')') dnl
+IFDOS(` define(`rp', ``%rcx'')') dnl
+IFDOS(` define(`v0', ``%r9'')') dnl
+IFDOS(` define(`r9', ``rdi'')') dnl
+IFDOS(` define(`n_param',``%r8'')') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ mov (up), %rax
+ push %rbx
+ lea (up,n_param,8), up
+ lea (rp,n_param,8), rp
+
+ test $1, R8(n_param)
+ jnz L(b13)
+
+L(b02): xor R32(%r11), R32(%r11)
+ test $2, R8(n_param)
+ jnz L(b2)
+
+L(b0): mov $1, R32(n)
+ sub n_param, n
+ mul v0
+ mov %rdx, %r9
+ mov -8(rp,n,8), %r8
+ jmp L(e0)
+
+ ALIGN(16)
+L(b2): mov $-1, n
+ sub n_param, n
+ mul v0
+ mov 8(rp,n,8), %r8
+ mov %rdx, %r9
+ jmp L(e2)
+
+ ALIGN(16)
+L(b13): xor R32(%r9), R32(%r9)
+ test $2, R8(n_param)
+ jnz L(b3)
+
+L(b1): mov $2, R32(n)
+ sub n_param, n
+ jns L(1)
+ mul v0
+ mov -16(rp,n,8), %r10
+ mov %rdx, %r11
+ jmp L(e1)
+
+ ALIGN(16)
+L(b3): xor R32(n), R32(n)
+ sub n_param, n
+ mul v0
+ mov (rp,n,8), %r10
+ jmp L(e3)
+
+ ALIGN(32)
+L(top): mul v0
+ mov -16(rp,n,8), %r10
+ ADDSUB %r11, %r8
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %r8, -24(rp,n,8)
+L(e1): ADDSUB %rax, %r10
+ mov -8(up,n,8), %rax
+ adc $0, %r11
+ mul v0
+ ADDSUB %r9, %r10
+ mov %rdx, %r9
+ mov -8(rp,n,8), %r8
+ adc $0, %r11
+ mov %r10, -16(rp,n,8)
+L(e0): ADDSUB %rax, %r8
+ adc $0, %r9
+ mov (up,n,8), %rax
+ mul v0
+ mov (rp,n,8), %r10
+ ADDSUB %r11, %r8
+ mov %r8, -8(rp,n,8)
+ adc $0, %r9
+L(e3): mov %rdx, %r11
+ ADDSUB %rax, %r10
+ mov 8(up,n,8), %rax
+ adc $0, %r11
+ mul v0
+ mov 8(rp,n,8), %r8
+ ADDSUB %r9, %r10
+ mov %rdx, %r9
+ mov %r10, (rp,n,8)
+ adc $0, %r11
+L(e2): ADDSUB %rax, %r8
+ adc $0, %r9
+ mov 16(up,n,8), %rax
+ add $4, n
+ jnc L(top)
+
+L(end): mul v0
+ mov I(-8(rp),-16(rp,n,8)), %r10
+ ADDSUB %r11, %r8
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %r8, I(-16(rp),-24(rp,n,8))
+ ADDSUB %rax, %r10
+ adc $0, %r11
+ ADDSUB %r9, %r10
+ adc $0, %r11
+ mov %r10, I(-8(rp),-16(rp,n,8))
+ mov %r11, %rax
+
+ pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
+ ret
+
+ ALIGN(16)
+L(1): mul v0
+ ADDSUB %rax, -8(rp)
+ mov %rdx, %rax
+ adc $0, %rax
+ pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
+ ret
+EPILOGUE()
+ASM_END()
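
As a reference point for the interleaved loop above: mpn_addmul_1 adds the product of an n-limb operand and a single limb into rp and returns the carry-out limb, while mpn_submul_1 subtracts it and returns the borrow. A minimal C sketch of the add flavour follows; it assumes 64-bit limbs and the GCC/Clang unsigned __int128 extension, and ref_addmul_1 is an illustrative name, not GMP's code.

#include <gmp.h>

/* rp[0..n-1] += up[0..n-1] * v0, return the high limb that does not fit.
   The assembly gets the 128-bit product from mul's rdx:rax pair instead of
   using a double-width C type. */
static mp_limb_t
ref_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
  mp_limb_t cy = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (mp_limb_t) t;
      cy = (mp_limb_t) (t >> 64);
    }
  return cy;
}
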
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm
new file mode 100644
index 0000000..43abcc8
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm
@@ -0,0 +1,174 @@
+dnl AMD64 mpn_cnd_add_n.
+
+dnl Copyright 2011-2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel PNR 3.0
+C Intel NHM 3.75
+C Intel SBR 1.93
+C Intel IBR 1.89
+C Intel HWL 1.78
+C Intel BWL 1.50
+C Intel SKL 1.50
+C Intel atom
+C Intel SLM 4.0
+C VIA nano
+
+C NOTES
+C * It might seem natural to use the cmov insn here, but since this function
+C is supposed to have the exact same execution pattern for cnd true and
+C false, and since cmov's documentation is not clear about whether it
+C actually reads both source operands and writes the register for a false
+C condition, we cannot use it.
+
+C INPUT PARAMETERS
+define(`cnd_arg', `%rdi') dnl rcx
+define(`rp', `%rsi') dnl rdx
+define(`up', `%rdx') dnl r8
+define(`vp', `%rcx') dnl r9
+define(`n', `%r8') dnl rsp+40
+
+define(`cnd', `%rbx')
+
+define(ADDSUB, add)
+define(ADCSBB, adc)
+define(func, mpn_cnd_add_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_cnd_add_n)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), R32(%r8)')
+ push %rbx
+
+ neg cnd_arg
+ sbb cnd, cnd C make cnd mask
+
+ test $1, R8(n)
+ jz L(x0)
+L(x1): test $2, R8(n)
+ jz L(b1)
+
+L(b3): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ and cnd, %rdi
+ and cnd, %r9
+ and cnd, %r10
+ ADDSUB (up), %rdi
+ mov %rdi, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 24(up), up
+ lea 24(vp), vp
+ lea 24(rp), rp
+ sub $3, n
+ jnz L(top)
+ jmp L(end)
+
+L(x0): xor R32(%rax), R32(%rax)
+ test $2, R8(n)
+ jz L(top)
+
+L(b2): mov (vp), %rdi
+ mov 8(vp), %r9
+ and cnd, %rdi
+ and cnd, %r9
+ ADDSUB (up), %rdi
+ mov %rdi, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 16(up), up
+ lea 16(vp), vp
+ lea 16(rp), rp
+ sub $2, n
+ jnz L(top)
+ jmp L(end)
+
+L(b1): mov (vp), %rdi
+ and cnd, %rdi
+ ADDSUB (up), %rdi
+ mov %rdi, (rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ dec n
+ jz L(end)
+
+ ALIGN(16)
+L(top): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ mov 24(vp), %r11
+ lea 32(vp), vp
+ and cnd, %rdi
+ and cnd, %r9
+ and cnd, %r10
+ and cnd, %r11
+ add R32(%rax), R32(%rax) C restore carry
+ ADCSBB (up), %rdi
+ mov %rdi, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ ADCSBB 24(up), %r11
+ lea 32(up), up
+ mov %r11, 24(rp)
+ lea 32(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry
+ sub $4, n
+ jnz L(top)
+
+L(end): neg R32(%rax)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
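
The NOTES above explain why cmov is avoided: the point of mpn_cnd_add_n is that the instruction stream and memory access pattern are identical whether or not the addition happens, so the condition is expanded into an all-zero or all-one mask (the neg/sbb pair) and ANDed into the vp limbs. A hedged C sketch of that idea, assuming 64-bit limbs; ref_cnd_add_n is an illustrative name, and a real side-channel-hardened version would also have to check that the compiler keeps the generated code branch-free.

#include <gmp.h>

/* If cnd is nonzero, rp[] = up[] + vp[] and the carry out is returned;
   if cnd is zero, rp[] = up[] and 0 is returned.  The same limb operations
   are performed either way, mirroring the mask trick in the assembly. */
static mp_limb_t
ref_cnd_add_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  mp_limb_t mask = -(mp_limb_t) (cnd != 0);   /* all ones or all zeros */
  mp_limb_t cy = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      mp_limb_t u = up[i], v = vp[i] & mask;  /* v becomes 0 when cnd is false */
      mp_limb_t s = u + v;
      mp_limb_t c1 = s < u;
      mp_limb_t r = s + cy;
      mp_limb_t c2 = r < s;
      rp[i] = r;
      cy = c1 | c2;
    }
  return cy;
}
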
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm
new file mode 100644
index 0000000..f55492b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm
@@ -0,0 +1,200 @@
+dnl AMD64 mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl Copyright 2011-2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel PNR 3.0
+C Intel NHM 2.75
+C Intel SBR 2.15
+C Intel IBR 1.96
+C Intel HWL 2.0
+C Intel BWL 1.65
+C Intel SKL 1.65
+C Intel atom
+C Intel SLM 4.5
+C VIA nano
+
+C NOTES
+C * It might seem natural to use the cmov insn here, but since this function
+C is supposed to have the exact same execution pattern for cnd true and
+C false, and since cmov's documentation is not clear about whether it
+C actually reads both source operands and writes the register for a false
+C condition, we cannot use it.
+C * Given that we have a dedicated cnd_add_n, it might look strange that this
+C file provides cnd_add_n and not just cnd_sub_n. But that's harmless, and
+C this file's generality might come in handy for some pipeline.
+
+C INPUT PARAMETERS
+define(`cnd_arg', `%rdi') dnl rcx
+define(`rp', `%rsi') dnl rdx
+define(`up', `%rdx') dnl r8
+define(`vp', `%rcx') dnl r9
+define(`n', `%r8') dnl rsp+40
+
+define(`cnd', `%rbx')
+
+ifdef(`OPERATION_cnd_add_n',`
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n',`
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), R32(%r8)')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ neg cnd_arg
+ sbb cnd, cnd C make cnd mask
+
+ test $1, R8(n)
+ jz L(x0)
+L(x1): test $2, R8(n)
+ jz L(b1)
+
+L(b3): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ and cnd, %rdi
+ mov (up), %r12
+ and cnd, %r9
+ mov 8(up), %r13
+ and cnd, %r10
+ mov 16(up), %rbp
+ ADDSUB %rdi, %r12
+ mov %r12, (rp)
+ ADCSBB %r9, %r13
+ mov %r13, 8(rp)
+ ADCSBB %r10, %rbp
+ mov %rbp, 16(rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 24(up), up
+ lea 24(vp), vp
+ lea 24(rp), rp
+ sub $3, n
+ jnz L(top)
+ jmp L(end)
+
+L(x0): xor R32(%rax), R32(%rax)
+ test $2, R8(n)
+ jz L(top)
+
+L(b2): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov (up), %r12
+ and cnd, %rdi
+ mov 8(up), %r13
+ and cnd, %r9
+ ADDSUB %rdi, %r12
+ mov %r12, (rp)
+ ADCSBB %r9, %r13
+ mov %r13, 8(rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 16(up), up
+ lea 16(vp), vp
+ lea 16(rp), rp
+ sub $2, n
+ jnz L(top)
+ jmp L(end)
+
+L(b1): mov (vp), %rdi
+ mov (up), %r12
+ and cnd, %rdi
+ ADDSUB %rdi, %r12
+ mov %r12, (rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ dec n
+ jz L(end)
+
+ ALIGN(16)
+L(top): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ mov 24(vp), %r11
+ lea 32(vp), vp
+ and cnd, %rdi
+ mov (up), %r12
+ and cnd, %r9
+ mov 8(up), %r13
+ and cnd, %r10
+ mov 16(up), %rbp
+ and cnd, %r11
+ add R32(%rax), R32(%rax) C restore carry
+ mov 24(up), %rax
+ lea 32(up), up
+ ADCSBB %rdi, %r12
+ mov %r12, (rp)
+ ADCSBB %r9, %r13
+ mov %r13, 8(rp)
+ ADCSBB %r10, %rbp
+ mov %rbp, 16(rp)
+ ADCSBB %r11, %rax
+ mov %rax, 24(rp)
+ lea 32(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry
+ sub $4, n
+ jnz L(top)
+
+L(end): neg R32(%rax)
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm
new file mode 100644
index 0000000..d9f371f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_divrem_1
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_divrem_1 mpn_preinv_divrem_1)
+include_mpn(`x86_64/divrem_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm
new file mode 100644
index 0000000..4723093
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_11.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/core2/gcd_11.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h
new file mode 100644
index 0000000..36f4512
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h
@@ -0,0 +1,241 @@
+/* Sandy Bridge gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3400-3800 MHz Intel Xeon E3-1270 Sandy Bridge */
+/* FFT tuning limit = 468,152,320 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 30
+
+#define DIV_1_VS_MUL_1_PERCENT 298
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 65
+#define MUL_TOOM44_THRESHOLD 154
+#define MUL_TOOM6H_THRESHOLD 254
+#define MUL_TOOM8H_THRESHOLD 333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 148
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 28
+#define SQR_TOOM3_THRESHOLD 93
+#define SQR_TOOM4_THRESHOLD 248
+#define SQR_TOOM6_THRESHOLD 342
+#define SQR_TOOM8_THRESHOLD 462
+
+#define MULMID_TOOM42_THRESHOLD 36
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 15
+
+#define MUL_FFT_MODF_THRESHOLD 396 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 396, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
+ { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 79,11}, { 47,10}, \
+ { 95,12}, { 31,11}, { 63,10}, { 135,11}, \
+ { 79,10}, { 159, 9}, { 319,10}, { 167,11}, \
+ { 95, 7}, { 1535, 8}, { 831,10}, { 223, 9}, \
+ { 447,11}, { 127,10}, { 255, 9}, { 511,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
+ { 383,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 351,10}, { 703,11}, { 367,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,10}, { 895,11}, \
+ { 479,13}, { 127,12}, { 255,11}, { 543,12}, \
+ { 287,11}, { 607,12}, { 319,11}, { 639,12}, \
+ { 351,11}, { 703,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,11}, { 895,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 543,11}, \
+ { 1087,12}, { 607,13}, { 319,12}, { 735,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1215,13}, { 639,12}, { 1279,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \
+ { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \
+ { 2815,13}, { 1471,14}, { 767,13}, { 1663,14}, \
+ { 895,13}, { 1919,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1535,13}, { 3071,14}, { 1663,13}, \
+ { 3455,12}, { 6911,14}, { 1919,16}, { 511,15}, \
+ { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
+ { 2943,13}, { 5887,15}, { 1535,14}, { 3455,13}, \
+ { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \
+ { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \
+ { 4863,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \
+ { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \
+ { 4095,15}, { 8191,16}, { 4607,15}, { 9983,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 219
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 336 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 336, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 6}, \
+ { 4351, 7}, { 2303, 8}, { 1215,12}, { 95,11}, \
+ { 191,10}, { 383,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,10}, { 607,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
+ { 671,11}, { 351,10}, { 703,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,10}, { 831,12}, \
+ { 223,11}, { 447,10}, { 895,11}, { 479,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,11}, { 895,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,12}, { 607,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 831,13}, \
+ { 447,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \
+ { 1279,13}, { 703,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1663,13}, { 959,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \
+ { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \
+ { 2815,13}, { 1471,14}, { 767,13}, { 1599,12}, \
+ { 3199,13}, { 1663,14}, { 895,13}, { 1791,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
+ { 1407,13}, { 2815,15}, { 767,14}, { 1535,13}, \
+ { 3199,14}, { 1663,13}, { 3455,12}, { 6911,14}, \
+ { 1791,16}, { 511,15}, { 1023,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \
+ { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,17}, \
+ { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \
+ { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \
+ { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 210
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 62
+#define MULLO_MUL_N_THRESHOLD 8907
+#define SQRLO_BASECASE_THRESHOLD 9
+#define SQRLO_DC_THRESHOLD 66
+#define SQRLO_SQR_THRESHOLD 6440
+
+#define DC_DIV_QR_THRESHOLD 52
+#define DC_DIVAPPR_Q_THRESHOLD 172
+#define DC_BDIV_QR_THRESHOLD 46
+#define DC_BDIV_Q_THRESHOLD 92
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 170
+#define INV_APPR_THRESHOLD 167
+
+#define BINV_NEWTON_THRESHOLD 228
+#define REDC_1_TO_REDC_2_THRESHOLD 36
+#define REDC_2_TO_REDC_N_THRESHOLD 55
+
+#define MU_DIV_QR_THRESHOLD 1387
+#define MU_DIVAPPR_Q_THRESHOLD 1387
+#define MUPI_DIV_QR_THRESHOLD 77
+#define MU_BDIV_QR_THRESHOLD 1187
+#define MU_BDIV_Q_THRESHOLD 1442
+
+#define POWM_SEC_TABLE 1,16,191,452,1297
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 21
+#define SET_STR_DC_THRESHOLD 1160
+#define SET_STR_PRECOMPUTE_THRESHOLD 2043
+
+#define FAC_DSC_THRESHOLD 426
+#define FAC_ODD_THRESHOLD 24
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD2_DIV1_METHOD 5 /* 0.74% faster than 3 */
+#define HGCD_THRESHOLD 96
+#define HGCD_APPR_THRESHOLD 60
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 465
+#define GCDEXT_DC_THRESHOLD 345
+#define JACOBI_BASE_METHOD 1 /* 32.22% faster than 4 */
+
+/* Tuneup completed successfully, took 276198 seconds */
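
The constants above are generated by GMP's tuneup program and record, in limbs, the operand sizes at which one algorithm overtakes the next on this particular CPU. As a rough illustration of how the multiplication thresholds partition sizes for balanced operands (the real dispatch in mpn_mul also handles unbalanced operands and uses MUL_FFT_TABLE3 to pick FFT parameters), here is a sketch under the assumption that these macros are in scope, e.g. via gmp-impl.h; the function name is made up for illustration.

/* Which multiplication range an n-limb balanced product falls into,
   according to the thresholds defined in this header; illustrative only. */
static const char *
mul_algorithm_for_size (mp_size_t n)
{
  if (n < MUL_TOOM22_THRESHOLD) return "basecase (schoolbook)";
  if (n < MUL_TOOM33_THRESHOLD) return "Toom-22 (Karatsuba)";
  if (n < MUL_TOOM44_THRESHOLD) return "Toom-33";
  if (n < MUL_TOOM6H_THRESHOLD) return "Toom-44";
  if (n < MUL_TOOM8H_THRESHOLD) return "Toom-6.5";
  if (n < MUL_FFT_THRESHOLD)    return "Toom-8.5";
  return "FFT multiplication";
}
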
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm
new file mode 100644
index 0000000..a1cbc31
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshift optimised for Intel Sandy Bridge.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm
new file mode 100644
index 0000000..ac90edb
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshiftc optimised for Intel Sandy Bridge.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm
new file mode 100644
index 0000000..a43a117
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm
@@ -0,0 +1,199 @@
+dnl X86-64 mpn_mul_1 optimised for Intel Sandy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013, 2017 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD excavator
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core2
+C Intel NHM
+C Intel SBR 2.49
+C Intel IBR 2.32
+C Intel HWL 2.44
+C Intel BWL 2.43
+C Intel SKL 2.47
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up_param',`%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+define(`cin', `%r8') C stack
+
+define(`up', `%rsi') C same as rp_param
+define(`n', `%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFDOS(` define(`rp', `%rcx')')
+IFDOS(` define(`up_param',`%rdx')')
+IFDOS(` define(`n_param', `%r8')')
+IFDOS(` define(`v0', `%r9')')
+IFDOS(` define(`cin', `48(%rsp)')')
+
+IFDOS(` define(`up', `%rsi')')
+IFDOS(` define(`n', `%r8')')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+IFDOS(` push %rsi ')
+ mov (up_param), %rax
+IFSTD(` mov n_param, n ')
+ lea (up_param,n_param,8), up
+ lea -8(rp,n_param,8), rp
+ neg n
+ mul v0
+
+ test $1, R8(n)
+ jz L(x0)
+L(x1): mov %rax, %r11
+ mov %rdx, %r10
+ test $2, R8(n)
+ jnz L(01)
+
+L(11): mov 8(up,n,8), %rax
+ dec n
+ jmp L(L3)
+
+L(01): inc n
+ jnz L(L1)
+ mov %rax, (rp)
+ mov %rdx, %rax
+IFDOS(` pop %rsi ')
+ ret
+
+L(x0): mov %rax, %r10
+ mov %rdx, %r11
+ mov 8(up,n,8), %rax
+ test $2, R8(n)
+ jz L(L0)
+
+L(10): add $-2, n
+ jmp L(L2)
+
+ ALIGN(8)
+L(top): mov %rdx, %r10
+ add %rax, %r11
+L(L1): mov 0(up,n,8), %rax
+ adc $0, %r10
+ mul v0
+ add %rax, %r10
+ mov %r11, 0(rp,n,8)
+ mov 8(up,n,8), %rax
+ mov %rdx, %r11
+L(L0c): adc $0, %r11
+L(L0): mul v0
+ mov %r10, 8(rp,n,8)
+ add %rax, %r11
+ mov %rdx, %r10
+L(L3c): mov 16(up,n,8), %rax
+ adc $0, %r10
+L(L3): mul v0
+ mov %r11, 16(rp,n,8)
+ mov %rdx, %r11
+ add %rax, %r10
+L(L2c): mov 24(up,n,8), %rax
+ adc $0, %r11
+L(L2): mul v0
+ mov %r10, 24(rp,n,8)
+ add $4, n
+ jnc L(top)
+
+L(end): add %rax, %r11
+ mov %rdx, %rax
+ adc $0, %rax
+ mov %r11, (rp)
+
+IFDOS(` pop %rsi ')
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+IFDOS(` push %rsi ')
+ mov (up_param), %rax
+IFSTD(` mov n_param, n ')
+ lea (up_param,n_param,8), up
+ lea -8(rp,n_param,8), rp
+ neg n
+ mul v0
+
+ test $1, R8(n)
+ jz L(x0c)
+L(x1c): mov %rax, %r11
+ mov %rdx, %r10
+ test $2, R8(n)
+ jnz L(01c)
+
+L(11c): add cin, %r11
+ dec n
+ jmp L(L3c)
+
+L(01c): add cin, %r11
+ inc n
+ jnz L(L1)
+ mov %r11, (rp)
+ mov %rdx, %rax
+ adc $0, %rax
+IFDOS(` pop %rsi ')
+ ret
+
+L(x0c): mov %rax, %r10
+ mov %rdx, %r11
+ test $2, R8(n)
+ jz L(00c)
+
+L(10c): add $-2, n
+ add cin, %r10
+ jmp L(L2c)
+
+L(00c): add cin, %r10
+ mov 8(up,n,8), %rax
+ jmp L(L0c)
+EPILOGUE()
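
For reference, mpn_mul_1 writes the n low product limbs of {up,n} times the single limb v0 and returns the high limb; mpn_mul_1c is the same with a carry-in limb, which is what the second entry point above handles via cin. A minimal C sketch assuming 64-bit limbs and unsigned __int128; ref_mul_1c is an illustrative name, not GMP's code.

#include <gmp.h>

/* rp[0..n-1] = up[0..n-1] * v0 + cin, return the high limb.
   mpn_mul_1 is the cin == 0 case. */
static mp_limb_t
ref_mul_1c (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0, mp_limb_t cin)
{
  mp_limb_t cy = cin;
  for (mp_size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + cy;
      rp[i] = (mp_limb_t) t;
      cy = (mp_limb_t) (t >> 64);
    }
  return cy;
}
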
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm
new file mode 100644
index 0000000..781534d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm
@@ -0,0 +1,167 @@
+dnl AMD64 mpn_mul_2 optimised for Intel Sandy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C AMD K8,K9 8.03
+C AMD K10 8.03
+C AMD bull 9.19
+C AMD pile 9.16
+C AMD steam
+C AMD excavator
+C AMD bobcat 10.6
+C AMD jaguar 11.0
+C Intel P4 26.0
+C Intel core2 8.73
+C Intel NHM 8.55
+C Intel SBR 5.15
+C Intel IBR 4.57
+C Intel HWL 4.08
+C Intel BWL 4.10
+C Intel SKL 4.14
+C Intel atom 39.5
+C Intel SLM 26.3
+C VIA nano
+
+C This code is the result of running a code generation and optimisation tool
+C suite written by David Harvey and Torbjorn Granlund.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`n', `%rcx')
+define(`v0', `%rbx')
+define(`v1', `%rbp')
+
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov (up), %rax
+ lea (up,n_param,8), up
+ lea (rp,n_param,8), rp
+
+ test $1, R8(n_param)
+ jnz L(b1)
+
+L(b0): mov $0, R32(n)
+ sub n_param, n
+ xor w0, w0
+ mul v0
+ mov %rax, w2
+ mov %rdx, w1
+ mov (up,n,8), %rax
+ jmp L(lo0)
+
+L(b1): mov $1, R32(n)
+ sub n_param, n
+ xor w2, w2
+ mul v0
+ mov %rax, w0
+ mov %rdx, w3
+ mov -8(up,n,8), %rax
+ mul v1
+ jmp L(lo1)
+
+ ALIGN(32)
+L(top): mul v0
+ add %rax, w0 C 1
+ mov %rdx, w3 C 2
+ adc $0, w3 C 2
+ mov -8(up,n,8), %rax
+ mul v1
+ add w1, w0 C 1
+ adc $0, w3 C 2
+L(lo1): add %rax, w2 C 2
+ mov w0, -8(rp,n,8) C 1
+ mov %rdx, w0 C 3
+ adc $0, w0 C 3
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, w2 C 2
+ mov %rdx, w1 C 3
+ adc $0, w1 C 3
+ add w3, w2 C 2
+ mov (up,n,8), %rax
+ adc $0, w1 C 1
+L(lo0): mul v1
+ mov w2, (rp,n,8) C 2
+ add %rax, w0 C 3
+ mov %rdx, w2 C 4
+ mov 8(up,n,8), %rax
+ adc $0, w2 C 4
+ add $2, n
+ jnc L(top)
+
+L(end): mul v0
+ add %rax, w0
+ mov %rdx, w3
+ adc $0, w3
+ mov I(-8(up),-8(up,n,8)), %rax
+ mul v1
+ add w1, w0
+ adc $0, w3
+ add %rax, w2
+ mov w0, I(-8(rp),-8(rp,n,8))
+ adc $0, %rdx
+ add w3, w2
+ mov w2, I((rp),(rp,n,8))
+ adc $0, %rdx
+ mov %rdx, %rax
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
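
mpn_mul_2 multiplies {up,n} by the two limbs at vp, stores the n+1 low product limbs at rp and returns the most significant limb. In terms of the illustrative helpers from the earlier sketches (ref_mul_1c and ref_addmul_1, both made-up names), it decomposes into one mul_1 row plus one shifted addmul_1 row; the assembly instead interleaves both v-limb multiplications in a single 2-way unrolled loop.

/* Illustrative decomposition of mpn_mul_2; not the GMP implementation. */
static mp_limb_t
ref_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
{
  rp[n] = ref_mul_1c (rp, up, n, vp[0], 0);   /* rp[0..n]  = up * v0            */
  return ref_addmul_1 (rp + 1, up, n, vp[1]); /* rp[1..n] += up * v1; the carry
                                                 out is the limb at position n+1 */
}
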
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm
new file mode 100644
index 0000000..35fd1cc
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm
@@ -0,0 +1,407 @@
+dnl AMD64 mpn_mul_basecase optimised for Intel Sandy Bridge and Ivy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 mul_2 mul_3 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR 2.5 2.5 - 2.95
+C Intel IBR 2.4 2.3 - 2.68
+C Intel HWL 2.35 2.0 - 2.5
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Fix the addmul_2 fluctuation affecting SBR.
+C * Improve feed-in code, avoiding zeroing of many registers and dummy adds in
+C the loops at the expense of code size.
+C * Adjoin a mul_3, avoiding slow mul_1 for odd vn.
+C * Consider replacing the 2-way mul_2 code with 4-way code, for a very slight
+C speedup.
+C * Further micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp', `%rcx')
+define(`vn', `%r8')
+
+define(`un', `%rbx')
+
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`n', `%rbp')
+define(`v0', `%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %rbx
+ push %rbp
+ mov un_param, un C free up rdx
+ neg un
+
+ mov (up), %rax C shared for mul_1 and mul_2
+ lea (up,un_param,8), up C point at operand end
+ lea (rp,un_param,8), rp C point at rp[un-1]
+
+ mov (vp), v0 C shared for mul_1 and mul_2
+ mul v0 C shared for mul_1 and mul_2
+
+ test $1, R8(vn)
+ jz L(do_mul_2)
+
+L(do_mul_1):
+ test $1, R8(un)
+ jnz L(m1x1)
+
+L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ...
+ mov %rdx, w1
+ mov 8(up,un,8), %rax
+ test $2, R8(un)
+ jnz L(m110)
+
+L(m100):lea 2(un), n C un = 4, 8, 12, ...
+ jmp L(m1l0)
+
+L(m110):lea (un), n C un = 2, 6, 10, ...
+ jmp L(m1l2)
+
+L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ...
+ mov %rdx, w0
+ test $2, R8(un)
+ jz L(m111)
+
+L(m101):lea 3(un), n C un = 1, 5, 9, ...
+ test n, n
+ js L(m1l1)
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(m111):lea 1(un), n C un = 3, 7, 11, ...
+ mov 8(up,un,8), %rax
+ jmp L(m1l3)
+
+ ALIGN(16) C FIXME
+L(m1tp):mov %rdx, w0
+ add %rax, w1
+L(m1l1):mov -16(up,n,8), %rax
+ adc $0, w0
+ mul v0
+ add %rax, w0
+ mov w1, -24(rp,n,8)
+ mov -8(up,n,8), %rax
+ mov %rdx, w1
+ adc $0, w1
+L(m1l0):mul v0
+ mov w0, -16(rp,n,8)
+ add %rax, w1
+ mov %rdx, w0
+ mov (up,n,8), %rax
+ adc $0, w0
+L(m1l3):mul v0
+ mov w1, -8(rp,n,8)
+ mov %rdx, w1
+ add %rax, w0
+ mov 8(up,n,8), %rax
+ adc $0, w1
+L(m1l2):mul v0
+ mov w0, (rp,n,8)
+ add $4, n
+ jnc L(m1tp)
+
+L(m1ed):add %rax, w1
+ adc $0, %rdx
+ mov w1, I(-8(rp),-24(rp,n,8))
+ mov %rdx, I((rp),-16(rp,n,8))
+
+ dec R32(vn)
+ jz L(ret2)
+
+ lea 8(vp), vp
+ lea 8(rp), rp
+ push %r12
+ push %r13
+ push %r14
+ jmp L(do_addmul)
+
+L(do_mul_2):
+define(`v1', `%r14')
+ push %r12
+ push %r13
+ push %r14
+
+ mov 8(vp), v1
+
+ test $1, R8(un)
+ jnz L(m2b1)
+
+L(m2b0):lea (un), n
+ xor w0, w0
+ mov %rax, w2
+ mov %rdx, w1
+ jmp L(m2l0)
+
+L(m2b1):lea 1(un), n
+ xor w1, w1
+ xor w2, w2
+ mov %rax, w0
+ mov %rdx, w3
+ jmp L(m2l1)
+
+ ALIGN(32)
+L(m2tp):mul v0
+ add %rax, w0
+ mov %rdx, w3
+ adc $0, w3
+L(m2l1):mov -8(up,n,8), %rax
+ mul v1
+ add w1, w0
+ adc $0, w3
+ add %rax, w2
+ mov w0, -8(rp,n,8)
+ mov %rdx, w0
+ adc $0, w0
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, w2
+ mov %rdx, w1
+ adc $0, w1
+ add w3, w2
+L(m2l0):mov (up,n,8), %rax
+ adc $0, w1
+ mul v1
+ mov w2, (rp,n,8)
+ add %rax, w0
+ mov %rdx, w2
+ mov 8(up,n,8), %rax
+ adc $0, w2
+ add $2, n
+ jnc L(m2tp)
+
+L(m2ed):mul v0
+ add %rax, w0
+ mov %rdx, w3
+ adc $0, w3
+ mov I(-8(up),-8(up,n,8)), %rax
+ mul v1
+ add w1, w0
+ adc $0, w3
+ add %rax, w2
+ mov w0, I(-8(rp),-8(rp,n,8))
+ adc $0, %rdx
+ add w3, w2
+ mov w2, I((rp),(rp,n,8))
+ adc $0, %rdx
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+ add $-2, R32(vn)
+ jz L(ret5)
+ lea 16(vp), vp
+ lea 16(rp), rp
+
+
+L(do_addmul):
+ push %r15
+ push vn C save vn in new stack slot
+define(`vn', `(%rsp)')
+define(`X0', `%r14')
+define(`X1', `%r15')
+define(`v1', `%r8')
+
+L(outer):
+ mov (vp), v0
+ mov 8(vp), v1
+ mov (up,un,8), %rax
+ mul v0
+ test $1, R8(un)
+ jnz L(a1x1)
+
+L(a1x0):mov (rp,un,8), X0
+ xor w0, w0
+ mov %rdx, w1
+ test $2, R8(un)
+ jnz L(a110)
+
+L(a100):lea 2(un), n C un = 4, 8, 12, ...
+ add %rax, X0
+ adc $0, w1
+ mov (up,un,8), %rax
+ mul v1
+ mov 8(rp,un,8), X1
+ jmp L(lo0)
+
+L(a110):lea (un), n C un = 2, 6, 10, ...
+ xor w3, w3
+ jmp L(lo2)
+
+L(a1x1):mov (rp,un,8), X1
+ xor w2, w2
+ xor w1, w1
+ test $2, R8(un)
+ jz L(a111)
+
+L(a101):lea 3(un), n C un = 1, 5, 9, ...
+ mov %rdx, w3
+ add %rax, X1
+ mov (up,un,8), %rax
+ mov 8(rp,un,8), X0
+ adc $0, w3
+ jmp L(top)
+
+L(a111):lea 1(un), n C un = 3, 7, 11, ...
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top): mul v1
+ mov %rdx, w0
+ add %rax, X0
+ adc $0, w0
+ add w1, X1
+ adc $0, w3
+ add w2, X0
+ adc $0, w0
+ mov -16(up,n,8), %rax
+ mul v0
+ add %rax, X0
+ mov %rdx, w1
+ adc $0, w1
+ mov -16(up,n,8), %rax
+ mul v1
+ mov X1, -24(rp,n,8)
+ mov -8(rp,n,8), X1
+ add w3, X0
+ adc $0, w1
+L(lo0): mov %rdx, w2
+ mov X0, -16(rp,n,8)
+ add %rax, X1
+ adc $0, w2
+ mov -8(up,n,8), %rax
+ add w0, X1
+ adc $0, w2
+ mul v0
+L(lo3): add %rax, X1
+ mov %rdx, w3
+ adc $0, w3
+ mov -8(up,n,8), %rax
+ mul v1
+ add w1, X1
+ mov (rp,n,8), X0
+ adc $0, w3
+ mov %rdx, w0
+ add %rax, X0
+ adc $0, w0
+ mov (up,n,8), %rax
+ mul v0
+ add w2, X0
+ mov X1, -8(rp,n,8)
+ mov %rdx, w1
+ adc $0, w0
+L(lo2): add %rax, X0
+ adc $0, w1
+ mov (up,n,8), %rax
+ add w3, X0
+ adc $0, w1
+ mul v1
+ mov 8(rp,n,8), X1
+ add %rax, X1
+ mov %rdx, w2
+ adc $0, w2
+ mov 8(up,n,8), %rax
+ mov X0, (rp,n,8)
+ mul v0
+ add w0, X1
+ mov %rdx, w3
+ adc $0, w2
+ add %rax, X1
+ mov 8(up,n,8), %rax
+ mov 16(rp,n,8), X0 C useless but harmless in final iter
+ adc $0, w3
+ add $4, n
+ jnc L(top)
+
+L(end): mul v1
+ add w1, X1
+ adc $0, w3
+ add w2, %rax
+ adc $0, %rdx
+ mov X1, I(-8(rp),-24(rp,n,8))
+ add w3, %rax
+ adc $0, %rdx
+ mov %rax, I((rp),-16(rp,n,8))
+ mov %rdx, I(8(rp),-8(rp,n,8))
+
+ addl $-2, vn
+ lea 16(vp), vp
+ lea 16(rp), rp
+ jnz L(outer)
+
+ pop %rax C deallocate vn slot
+ pop %r15
+L(ret5):pop %r14
+ pop %r13
+ pop %r12
+L(ret2):pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
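
mpn_mul_basecase computes the full un+vn limb product {rp,un+vn} = {up,un} x {vp,vn}, with un >= vn >= 1, by the schoolbook method: one multiply row for the first v limb, then one add-multiply row per remaining v limb. The code above consumes two v limbs per pass (the mul_2 feed-in and the addmul_2 outer loop), which is why an odd vn is peeled off with the mul_1 path. A sketch in terms of the earlier illustrative helpers ref_mul_1c and ref_addmul_1 (made-up names, not GMP's internals):

/* Schoolbook reference for mpn_mul_basecase; illustrative only. */
static void
ref_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
                  mp_srcptr vp, mp_size_t vn)
{
  rp[un] = ref_mul_1c (rp, up, un, vp[0], 0);          /* first row        */
  for (mp_size_t j = 1; j < vn; j++)                   /* remaining rows   */
    rp[un + j] = ref_addmul_1 (rp + j, up, un, vp[j]);
}
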
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm
new file mode 100644
index 0000000..a41a8ac
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm
@@ -0,0 +1,384 @@
+dnl AMD64 mpn_mullo_basecase optimised for Intel Sandy Bridge and Ivy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_2 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR 2.5 2.95
+C Intel IBR 2.3 2.68
+C Intel HWL 2.0 2.5
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Implement proper cor2, replacing current cor0.
+C * Offset n by 2 in order to avoid the outer loop cmp. (And sqr_basecase?)
+C * Micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`vp', `%r8')
+define(`X0', `%r14')
+define(`X1', `%r15')
+
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`i', `%rbp')
+define(`v0', `%r9')
+define(`v1', `%rbx')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+
+ mov (up), %rax
+ mov vp_param, vp
+
+ cmp $4, n
+ jb L(small)
+
+ mov (vp_param), v0
+ push %rbx
+ lea (rp,n,8), rp C point rp at R[un]
+ push %rbp
+ lea (up,n,8), up C point up right after U's end
+ push %r12
+ neg n
+ push %r13
+ mul v0
+ mov 8(vp), v1
+
+ test $1, R8(n)
+ jnz L(m2b1)
+
+L(m2b0):lea (n), i
+ xor w0, w0
+ mov %rax, w2
+ mov %rdx, w1
+ jmp L(m2l0)
+
+L(m2b1):lea 1(n), i
+ xor w1, w1
+ xor w2, w2
+ mov %rax, w0
+ mov %rdx, w3
+ jmp L(m2l1)
+
+ ALIGN(32)
+L(m2tp):mul v0
+ add %rax, w0
+ mov %rdx, w3
+ adc $0, w3
+L(m2l1):mov -8(up,i,8), %rax
+ mul v1
+ add w1, w0
+ adc $0, w3
+ add %rax, w2
+ mov w0, -8(rp,i,8)
+ mov %rdx, w0
+ adc $0, w0
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov %rdx, w1
+ adc $0, w1
+ add w3, w2
+L(m2l0):mov (up,i,8), %rax
+ adc $0, w1
+ mul v1
+ mov w2, (rp,i,8)
+ add %rax, w0
+ mov %rdx, w2 C FIXME: dead in last iteration
+ mov 8(up,i,8), %rax
+ adc $0, w2 C FIXME: dead in last iteration
+ add $2, i
+ jnc L(m2tp)
+
+L(m2ed):imul v0, %rax
+ add w0, %rax
+ add w1, %rax
+ mov %rax, I(-8(rp),-8(rp,i,8))
+
+ add $2, n
+ lea 16(vp), vp
+ lea -16(up), up
+ cmp $-2, n
+ jge L(cor1)
+
+ push %r14
+ push %r15
+
+L(outer):
+ mov (vp), v0
+ mov 8(vp), v1
+ mov (up,n,8), %rax
+ mul v0
+ test $1, R8(n)
+ jnz L(a1x1)
+
+L(a1x0):mov (rp,n,8), X1
+ xor w2, w2
+ xor w1, w1
+ test $2, R8(n)
+ jnz L(a110)
+
+L(a100):lea 1(n), i
+ jmp L(lo0)
+
+L(a110):lea 3(n), i
+ mov %rdx, w3
+ add %rax, X1
+ mov (up,n,8), %rax
+ mov 8(rp,n,8), X0
+ adc $0, w3
+ jmp L(lo2)
+
+L(a1x1):mov (rp,n,8), X0
+ xor w0, w0
+ mov %rdx, w1
+ test $2, R8(n)
+ jz L(a111)
+
+L(a101):lea 2(n), i
+ add %rax, X0
+ adc $0, w1
+ mov (up,n,8), %rax
+ mul v1
+ mov 8(rp,n,8), X1
+ jmp L(lo1)
+
+L(a111):lea (n), i
+ xor w3, w3
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top):
+L(lo2): mul v1
+ mov %rdx, w0
+ add %rax, X0
+ adc $0, w0
+ add w1, X1
+ adc $0, w3
+ add w2, X0
+ adc $0, w0
+ mov -16(up,i,8), %rax
+ mul v0
+ add %rax, X0
+ mov %rdx, w1
+ adc $0, w1
+ mov -16(up,i,8), %rax
+ mul v1
+ mov X1, -24(rp,i,8)
+ mov -8(rp,i,8), X1
+ add w3, X0
+ adc $0, w1
+L(lo1): mov %rdx, w2
+ mov X0, -16(rp,i,8)
+ add %rax, X1
+ adc $0, w2
+ mov -8(up,i,8), %rax
+ add w0, X1
+ adc $0, w2
+ mul v0
+L(lo0): add %rax, X1
+ mov %rdx, w3
+ adc $0, w3
+ mov -8(up,i,8), %rax
+ mul v1
+ add w1, X1
+ mov (rp,i,8), X0
+ adc $0, w3
+ mov %rdx, w0
+ add %rax, X0
+ adc $0, w0
+ mov (up,i,8), %rax
+ mul v0
+ add w2, X0
+ mov X1, -8(rp,i,8)
+ mov %rdx, w1
+ adc $0, w0
+L(lo3): add %rax, X0
+ adc $0, w1
+ mov (up,i,8), %rax
+ add w3, X0
+ adc $0, w1
+ mul v1
+ mov 8(rp,i,8), X1
+ add %rax, X1
+ mov %rdx, w2
+ adc $0, w2
+ mov 8(up,i,8), %rax
+ mov X0, (rp,i,8)
+ mul v0
+ add w0, X1
+ mov %rdx, w3
+ adc $0, w2
+ add %rax, X1
+ mov 8(up,i,8), %rax
+ mov 16(rp,i,8), X0
+ adc $0, w3
+ add $4, i
+ jnc L(top)
+
+L(end): imul v1, %rax
+ add %rax, X0
+ add w1, X1
+ adc $0, w3
+ add w2, X0
+ mov I(-8(up),-16(up,i,8)), %rax
+ imul v0, %rax
+ add X0, %rax
+ mov X1, I(-16(rp),-24(rp,i,8))
+ add w3, %rax
+ mov %rax, I(-8(rp),-16(rp,i,8))
+
+ add $2, n
+ lea 16(vp), vp
+ lea -16(up), up
+ cmp $-2, n
+ jl L(outer)
+
+ pop %r15
+ pop %r14
+
+ jnz L(cor0)
+
+L(cor1):mov (vp), v0
+ mov 8(vp), v1
+ mov -16(up), %rax
+ mul v0 C u0 x v2
+ add -16(rp), %rax C FIXME: rp[0] still available in reg?
+ adc -8(rp), %rdx C FIXME: rp[1] still available in reg?
+ mov -8(up), %r10
+ imul v0, %r10
+ mov -16(up), %r11
+ imul v1, %r11
+ mov %rax, -16(rp)
+ add %r10, %r11
+ add %rdx, %r11
+ mov %r11, -8(rp)
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(cor0):mov (vp), %r11
+ imul -8(up), %r11
+ add %rax, %r11
+ mov %r11, -8(rp)
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(small):
+ cmp $2, n
+ jae L(gt1)
+L(n1): imul (vp_param), %rax
+ mov %rax, (rp)
+ FUNC_EXIT()
+ ret
+L(gt1): ja L(gt2)
+L(n2): mov (vp_param), %r9
+ mul %r9
+ mov %rax, (rp)
+ mov 8(up), %rax
+ imul %r9, %rax
+ add %rax, %rdx
+ mov 8(vp), %r9
+ mov (up), %rcx
+ imul %r9, %rcx
+ add %rcx, %rdx
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+L(gt2):
+L(n3): mov (vp_param), %r9
+ mul %r9 C u0 x v0
+ mov %rax, (rp)
+ mov %rdx, %r10
+ mov 8(up), %rax
+ mul %r9 C u1 x v0
+ imul 16(up), %r9 C u2 x v0
+ add %rax, %r10
+ adc %rdx, %r9
+ mov 8(vp), %r11
+ mov (up), %rax
+ mul %r11 C u0 x v1
+ add %rax, %r10
+ adc %rdx, %r9
+ imul 8(up), %r11 C u1 x v1
+ add %r11, %r9
+ mov %r10, 8(rp)
+ mov 16(vp), %r10
+ mov (up), %rax
+ imul %rax, %r10 C u0 x v2
+ add %r10, %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
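
mpn_mullo_basecase produces only the n low limbs of the n-by-n product and discards every carry out of limb position n-1, which is why the final partial rows above can use imul (low half only) instead of widening multiplies. A minimal C sketch assuming 64-bit limbs and unsigned __int128; ref_mullo_basecase is an illustrative name, not GMP's code.

#include <gmp.h>

/* rp[0..n-1] = low n limbs of {up,n} * {vp,n}; the high half is never formed. */
static void
ref_mullo_basecase (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  for (mp_size_t i = 0; i < n; i++)
    rp[i] = 0;
  for (mp_size_t j = 0; j < n; j++)
    {
      mp_limb_t cy = 0;
      for (mp_size_t i = 0; i + j < n; i++)   /* only columns below n matter */
        {
          unsigned __int128 t =
            (unsigned __int128) up[i] * vp[j] + rp[i + j] + cy;
          rp[i + j] = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 64);
        }
    }
}
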
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm
new file mode 100644
index 0000000..f0dbe07
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm
@@ -0,0 +1,546 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR 3.24
+C Intel IBR 3.04
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single basecases out before the pushes.
+
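
mpn_redc_1 performs one-limb Montgomery reduction (REDC): given {up,2n}, the modulus {mp,n} and u0inv = -1/mp[0] mod B (B = 2^64 here), it computes {rp,n} congruent to {up,2n} * B^-n modulo {mp,n} and returns a carry that the caller still has to fold back, typically by a conditional subtraction of the modulus. The sketch below mirrors the structure of GMP's portable C version and reuses the illustrative ref_addmul_1 and ref_add_n helpers from the earlier sketches; it is not this assembly, which fuses the addmul rows into the 4-way unrolled loops that follow and ends with the mpn_add_n call at L(cj).

/* One-limb Montgomery reduction; illustrative only. */
static mp_limb_t
ref_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t u0inv)
{
  for (mp_size_t j = 0; j < n; j++)
    {
      mp_limb_t q = up[0] * u0inv;          /* chosen so up[0] + q*mp[0] == 0 mod B */
      mp_limb_t cy = ref_addmul_1 (up, mp, n, q);
      up[0] = cy;                           /* low limb is now zero; park the carry */
      up++;                                 /* next iteration kills the next limb   */
    }
  return ref_add_n (rp, up, up - n, n);     /* fold parked carries into the high half */
}
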
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea 8(mp_param,n,8), mp
+ lea 8(up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): test $2, R8(n)
+ jz L(b3)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+
+L(otp1):lea 1(n), i
+ mov -8(mp,n,8), %rax
+ mul q0
+ mov -8(up,n,8), %r10
+ mov %rdx, %r11
+ add %rax, %r10
+ mov (mp,n,8), %rax
+ adc $0, %r11
+ mul q0
+ mov %rdx, %r9
+ mov (up,n,8), %rbx
+ add %rax, %rbx
+ adc $0, %r9
+ mov (mp,i,8), %rax
+ mul q0
+ mov (up,i,8), %r10
+ add %r11, %rbx
+ mov %rbx, -8(up,i,8) C next low remainder limb
+ adc $0, %r9
+ imul u0inv, %rbx C next q limb
+ jmp L(e1)
+
+ ALIGNx
+L(tp1): mul q0
+ mov -16(up,i,8), %r10
+ add %r11, %rbp
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %rbp, -24(up,i,8)
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ add %r9, %r10
+ mov %rdx, %r9
+ mov -8(up,i,8), %rbp
+ adc $0, %r11
+ mov %r10, -16(up,i,8)
+ add %rax, %rbp
+ adc $0, %r9
+ mov (mp,i,8), %rax
+ mul q0
+ mov (up,i,8), %r10
+ add %r11, %rbp
+ mov %rbp, -8(up,i,8)
+ adc $0, %r9
+L(e1): mov %rdx, %r11
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ mov 8(up,i,8), %rbp
+ add %r9, %r10
+ mov %rdx, %r9
+ mov %r10, (up,i,8)
+ adc $0, %r11
+ add %rax, %rbp
+ adc $0, %r9
+ mov 16(mp,i,8), %rax
+ add $4, i
+ jnc L(tp1)
+
+L(ed1): mul q0
+ mov I(-16(up),-16(up,i,8)), %r10
+ add %r11, %rbp
+ adc $0, %r9
+ mov %rbp, I(-24(up),-24(up,i,8))
+ add %rax, %r10
+ adc $0, %rdx
+ add %r9, %r10
+ adc $0, %rdx
+ mov %r10, I(-16(up),-16(up,i,8))
+ mov %rdx, -8(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b3): cmp $-3, R32(n)
+ jz L(n3)
+
+L(otp3):lea 3(n), i
+ mov -8(mp,n,8), %rax
+ mul q0
+ mov -8(up,n,8), %r10
+ mov %rdx, %r11
+ add %rax, %r10
+ mov (mp,n,8), %rax
+ adc $0, %r11
+ mul q0
+ mov (up,n,8), %rbx
+ mov %rdx, %r9
+ add %rax, %rbx
+ adc $0, %r9
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov 8(up,n,8), %r10
+ add %r11, %rbx
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %rbx, (up,n,8)
+ imul u0inv, %rbx C next q limb
+ jmp L(e3)
+
+ ALIGNx
+L(tp3): mul q0
+ mov -16(up,i,8), %r10
+ add %r11, %rbp
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %rbp, -24(up,i,8)
+L(e3): add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ add %r9, %r10
+ mov %rdx, %r9
+ mov -8(up,i,8), %rbp
+ adc $0, %r11
+ mov %r10, -16(up,i,8)
+ add %rax, %rbp
+ adc $0, %r9
+ mov (mp,i,8), %rax
+ mul q0
+ mov (up,i,8), %r10
+ add %r11, %rbp
+ mov %rbp, -8(up,i,8)
+ adc $0, %r9
+ mov %rdx, %r11
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ mov 8(up,i,8), %rbp
+ add %r9, %r10
+ mov %rdx, %r9
+ mov %r10, (up,i,8)
+ adc $0, %r11
+ add %rax, %rbp
+ adc $0, %r9
+ mov 16(mp,i,8), %rax
+ add $4, i
+ jnc L(tp3)
+
+L(ed3): mul q0
+ mov I(-16(up),-16(up,i,8)), %r10
+ add %r11, %rbp
+ adc $0, %r9
+ mov %rbp, I(-24(up),-24(up,i,8))
+ add %rax, %r10
+ adc $0, %rdx
+ add %r9, %r10
+ adc $0, %rdx
+ mov %r10, I(-16(up),-16(up,i,8))
+ mov %rdx, -8(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp3)
+C jmp L(cj)
+
+L(cj):
+IFSTD(` lea -8(up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea -8(up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+IFSTD(` sub $8, %rsp ')
+IFDOS(` sub $40, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_add_n)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(bx0): test $2, R8(n)
+ jnz L(b2)
+
+L(b0):
+L(otp0):lea (n), i
+ mov -8(mp,n,8), %rax
+ mul q0
+ mov %rdx, %r9
+ mov -8(up,n,8), %rbp
+ add %rax, %rbp
+ adc $0, %r9
+ mov (mp,n,8), %rax
+ mul q0
+ mov (up,n,8), %rbx
+ mov %rdx, %r11
+ add %rax, %rbx
+ mov 8(mp,n,8), %rax
+ adc $0, %r11
+ mul q0
+ mov 8(up,n,8), %rbp
+ add %r9, %rbx
+ mov %rdx, %r9
+ mov %rbx, (up,n,8)
+ adc $0, %r11
+ imul u0inv, %rbx C next q limb
+ jmp L(e0)
+
+ ALIGNx
+L(tp0): mul q0
+ mov -16(up,i,8), %r10
+ add %r11, %rbp
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %rbp, -24(up,i,8)
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ add %r9, %r10
+ mov %rdx, %r9
+ mov -8(up,i,8), %rbp
+ adc $0, %r11
+ mov %r10, -16(up,i,8)
+ add %rax, %rbp
+ adc $0, %r9
+ mov (mp,i,8), %rax
+ mul q0
+ mov (up,i,8), %r10
+ add %r11, %rbp
+ mov %rbp, -8(up,i,8)
+ adc $0, %r9
+ mov %rdx, %r11
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ mov 8(up,i,8), %rbp
+ add %r9, %r10
+ mov %rdx, %r9
+ mov %r10, (up,i,8)
+ adc $0, %r11
+L(e0): add %rax, %rbp
+ adc $0, %r9
+ mov 16(mp,i,8), %rax
+ add $4, i
+ jnc L(tp0)
+
+L(ed0): mul q0
+ mov I(-16(up),-16(up,i,8)), %r10
+ add %r11, %rbp
+ adc $0, %r9
+ mov %rbp, I(-24(up),-24(up,i,8))
+ add %rax, %r10
+ adc $0, %rdx
+ add %r9, %r10
+ adc $0, %rdx
+ mov %r10, I(-16(up),-16(up,i,8))
+ mov %rdx, -8(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp0)
+ jmp L(cj)
+
+L(b2): cmp $-2, R32(n)
+ jz L(n2)
+
+L(otp2):lea 2(n), i
+ mov -8(mp,n,8), %rax
+ mul q0
+ mov -8(up,n,8), %rbp
+ mov %rdx, %r9
+ add %rax, %rbp
+ adc $0, %r9
+ mov (mp,n,8), %rax
+ mul q0
+ mov (up,n,8), %rbx
+ mov %rdx, %r11
+ add %rax, %rbx
+ mov 8(mp,n,8), %rax
+ adc $0, %r11
+ mul q0
+ add %r9, %rbx
+ mov %rdx, %r9
+ mov 8(up,n,8), %rbp
+ adc $0, %r11
+ mov %rbx, (up,n,8)
+ imul u0inv, %rbx C next q limb
+ jmp L(e2)
+
+ ALIGNx
+L(tp2): mul q0
+ mov -16(up,i,8), %r10
+ add %r11, %rbp
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %rbp, -24(up,i,8)
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ add %r9, %r10
+ mov %rdx, %r9
+ mov -8(up,i,8), %rbp
+ adc $0, %r11
+ mov %r10, -16(up,i,8)
+L(e2): add %rax, %rbp
+ adc $0, %r9
+ mov (mp,i,8), %rax
+ mul q0
+ mov (up,i,8), %r10
+ add %r11, %rbp
+ mov %rbp, -8(up,i,8)
+ adc $0, %r9
+ mov %rdx, %r11
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ mov 8(up,i,8), %rbp
+ add %r9, %r10
+ mov %rdx, %r9
+ mov %r10, (up,i,8)
+ adc $0, %r11
+ add %rax, %rbp
+ adc $0, %r9
+ mov 16(mp,i,8), %rax
+ add $4, i
+ jnc L(tp2)
+
+L(ed2): mul q0
+ mov I(-16(up),-16(up,i,8)), %r10
+ add %r11, %rbp
+ adc $0, %r9
+ mov %rbp, I(-24(up),-24(up,i,8))
+ add %rax, %r10
+ adc $0, %rdx
+ add %r9, %r10
+ adc $0, %rdx
+ mov %r10, I(-16(up),-16(up,i,8))
+ mov %rdx, -8(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp2)
+ jmp L(cj)
+
+L(n1): mov (mp_param), %rax
+ mul q0
+ add -16(up), %rax
+ adc -8(up), %rdx
+ mov %rdx, (rp)
+ mov $0, R32(%rax)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+L(n2): mov (mp_param), %rax
+ mov -24(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -16(mp), %rax
+ mov -16(up), %r10
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, q0
+ imul u0inv, q0 C next q0
+ mov -24(mp), %rax
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -16(mp), %rax
+ mov -8(up), %r14
+ mul q0
+ add %rax, %r14
+ adc $0, %rdx
+ add %r9, %r14
+ adc $0, %rdx
+ xor R32(%rax), R32(%rax)
+ add %r11, %r14
+ adc (up), %rdx
+ mov %r14, (rp)
+ mov %rdx, 8(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+ ALIGNx
+L(n3): mov -32(mp), %rax
+ mov -32(up), %r10
+ mul q0
+ add %rax, %r10
+ mov -24(mp), %rax
+ mov %rdx, %r11
+ adc $0, %r11
+ mov -24(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -16(mp), %rax
+ add %r11, %rbp
+ mov -16(up), %r10
+ adc $0, %r9
+ mul q0
+ mov %rbp, q0
+ imul u0inv, q0 C next q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ mov %rbp, -24(up)
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, -16(up)
+ mov %r11, -32(up) C up[0]
+ lea 8(up), up C up++
+ dec j
+ jnz L(n3)
+ jmp L(cj)
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm
new file mode 100644
index 0000000..fd2eaea
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm
@@ -0,0 +1,193 @@
+dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Sandy Bridge.
+
+dnl Copyright 2003, 2005, 2009-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 4.25
+C Intel P4 21.5
+C Intel core2 3.2
+C Intel NHM 3.87
+C Intel SBR 2.05
+C Intel atom ?
+C VIA nano 44.9
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_rsh1add_n)
+ define(func_nc, mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsh1sub_n)
+ define(func_nc, mpn_rsh1sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
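Functionally, both operations halve a sum or difference: rp = (up ± vp) >> 1, returning the bit shifted out. A minimal plain-C model of the add variant, using only public mpn calls (the helper name is hypothetical; the sub variant is analogous, with mpn_sub_n and the borrow folded in the same way):

```c
#include <gmp.h>

/* Hypothetical model of mpn_rsh1add_n. */
static mp_limb_t
rsh1add_model (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  mp_limb_t cy  = mpn_add_n (rp, up, vp, n);    /* full sum, cy = carry out */
  mp_limb_t lsb = rp[0] & 1;                    /* the bit that falls off */
  mpn_rshift (rp, rp, n, 1);                    /* halve */
  rp[n - 1] |= cy << (GMP_NUMB_BITS - 1);       /* fold the carry back in */
  return lsb;
}
```

The asm fuses these steps: shrd merges the shift with the limb-to-limb move, and the carry/borrow is parked in %rbx between blocks.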
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+
+ neg %r8 C set C flag from parameter
+ mov (up), %rbp
+ ADCSBB (vp), %rbp
+
+ jmp L(ent)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (up), %rbp
+ ADDSUB (vp), %rbp
+L(ent):
+ sbb R32(%rbx), R32(%rbx) C save cy
+ mov R32(%rbp), R32(%rax)
+ and $1, R32(%rax) C return value
+
+ mov R32(n), R32(%r11)
+ and $3, R32(%r11)
+
+ cmp $1, R32(%r11)
+ je L(do) C jump if n = 1 5 9 ...
+
+L(n1): cmp $2, R32(%r11)
+ jne L(n2) C jump unless n = 2 6 10 ...
+ add R32(%rbx), R32(%rbx) C restore cy
+ mov 8(up), %r10
+ ADCSBB 8(vp), %r10
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ sbb R32(%rbx), R32(%rbx) C save cy
+
+ shrd $1, %r10, %rbp
+ mov %rbp, -8(rp)
+ jmp L(cj1)
+
+L(n2): cmp $3, R32(%r11)
+ jne L(n3) C jump unless n = 3 7 11 ...
+ add R32(%rbx), R32(%rbx) C restore cy
+ mov 8(up), %r9
+ mov 16(up), %r10
+ ADCSBB 8(vp), %r9
+ ADCSBB 16(vp), %r10
+ lea 16(up), up
+ lea 16(vp), vp
+ lea 16(rp), rp
+ sbb R32(%rbx), R32(%rbx) C save cy
+
+ shrd $1, %r9, %rbp
+ mov %rbp, -16(rp)
+ jmp L(cj2)
+
+L(n3): dec n C come here for n = 4 8 12 ...
+ add R32(%rbx), R32(%rbx) C restore cy
+ mov 8(up), %r8
+ mov 16(up), %r9
+ ADCSBB 8(vp), %r8
+ ADCSBB 16(vp), %r9
+ mov 24(up), %r10
+ ADCSBB 24(vp), %r10
+ lea 24(up), up
+ lea 24(vp), vp
+ lea 24(rp), rp
+ sbb R32(%rbx), R32(%rbx) C save cy
+
+ shrd $1, %r8, %rbp
+ mov %rbp, -24(rp)
+ shrd $1, %r9, %r8
+ mov %r8, -16(rp)
+L(cj2): shrd $1, %r10, %r9
+ mov %r9, -8(rp)
+L(cj1): mov %r10, %rbp
+
+L(do):
+ shr $2, n C 4
+ je L(end) C 2
+ ALIGN(16)
+L(top): add R32(%rbx), R32(%rbx) C restore cy
+
+ mov 8(up), %r8
+ mov 16(up), %r9
+ ADCSBB 8(vp), %r8
+ ADCSBB 16(vp), %r9
+ mov 24(up), %r10
+ mov 32(up), %r11
+ ADCSBB 24(vp), %r10
+ ADCSBB 32(vp), %r11
+
+ lea 32(up), up
+ lea 32(vp), vp
+
+ sbb R32(%rbx), R32(%rbx) C save cy
+
+ shrd $1, %r8, %rbp
+ mov %rbp, (rp)
+ shrd $1, %r9, %r8
+ mov %r8, 8(rp)
+ shrd $1, %r10, %r9
+ mov %r9, 16(rp)
+ shrd $1, %r11, %r10
+ mov %r10, 24(rp)
+
+ dec n
+ mov %r11, %rbp
+ lea 32(rp), rp
+ jne L(top)
+
+L(end): shrd $1, %rbx, %rbp
+ mov %rbp, (rp)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm
new file mode 100644
index 0000000..4c1c0d4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_rshift optimised for Intel Sandy Bridge.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sec_tabselect.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm
new file mode 100644
index 0000000..46a3612
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm
@@ -0,0 +1,484 @@
+dnl AMD64 mpn_sqr_basecase optimised for Intel Sandy bridge and Ivy bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1
+C AMD K8,K9 ? ? ?
+C AMD K10 ? ? ?
+C AMD bull ? ? ?
+C AMD pile ? ? ?
+C AMD steam ? ? ?
+C AMD bobcat ? ? ?
+C AMD jaguar ? ? ?
+C Intel P4 ? ? ?
+C Intel core ? ? ?
+C Intel NHM ? ? ?
+C Intel SBR 2.57 2.93 3.0
+C Intel IBR 2.35 2.66 3.0
+C Intel HWL 2.02 2.5 2.5
+C Intel BWL ? ? ?
+C Intel atom ? ? ?
+C VIA nano ? ? ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
+C that the sqr_diag_addlsh1 loop was manually written.
+
+C TODO
+C * Replace current unoptimised sqr_diag_addlsh1 loop, 2.5 c/l should be easy.
+C * Streamline pointer updates.
+C * Perhaps suppress a few more xor insns in feed-in code.
+C * Make sure we write no dead registers in feed-in code.
+C * We might use 32-bit size ops, since n >= 2^32 is non-terminating. Watch
+C out for negative sizes being zero-extended, though.
+C * The straight-line code for n <= 3 comes from the K8 code, and might be
+C quite sub-optimal here. Write specific code, and add code for n = 4.
+C * The mul_2 loop has a 10 insn common sequence in the loop start and the
+C wind-down code. Try re-rolling it.
+C * This file has been subject to only basic micro-optimisation.
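The mul_2/addmul_2/sqr_diag_addlsh1 split above follows the classic basecase-squaring decomposition: build the off-diagonal triangle, double it, then add the diagonal squares. A hedged plain-C sketch of that decomposition using public mpn calls only (helper name hypothetical; assumes n >= 2 and 2n limbs of space at rp; this shows the shape of the computation, not this file's two-limb kernels):

```c
#include <gmp.h>

static void
sqr_basecase_model (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
  mp_size_t i;

  /* off-diagonal triangle: sum of u[i]*u[j] for i < j, at limb offset 1 */
  rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
  for (i = 1; i < n - 1; i++)
    rp[n + i] = mpn_addmul_1 (rp + 2 * i + 1, up + i + 1, n - 1 - i, up[i]);

  /* double it: the "lsh1" part of sqr_diag_addlsh1 */
  rp[2 * n - 1] = mpn_lshift (rp + 1, rp + 1, 2 * n - 2, 1);

  /* add the diagonal squares u[i]^2 at limb position 2i: the "diag" part */
  rp[0] = 0;
  for (i = 0; i < n; i++)
    {
      mp_limb_t sq[2];
      mpn_sqr (sq, up + i, 1);
      mpn_add (rp + 2 * i, rp + 2 * i, 2 * n - 2 * i, sq, 2);
    }
}
```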
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+
+ cmp $2, un_param
+ jae L(gt1)
+
+ mov (up), %rax
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): jne L(gt2)
+
+ mov (up), %rax
+ mov %rax, %r8
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, %r9
+ mul %rax
+ mov %rax, %r10
+ mov %r11, %rax
+ mov %rdx, %r11
+ mul %r8
+ xor %r8, %r8
+ add %rax, %r9
+ adc %rdx, %r10
+ adc %r8, %r11
+ add %rax, %r9
+ mov %r9, 8(rp)
+ adc %rdx, %r10
+ mov %r10, 16(rp)
+ adc %r8, %r11
+ mov %r11, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt2): cmp $4, un_param
+ jae L(gt3)
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%r10')
+define(`w2', `%r11')
+
+ mov (up), %rax
+ mov %rax, %r10
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, 8(rp)
+ mul %rax
+ mov 16(up), %rcx
+ mov %rax, 16(rp)
+ mov %rcx, %rax
+ mov %rdx, 24(rp)
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+
+ mov %r11, %rax
+ mul %r10
+ mov %rax, %r8
+ mov %rcx, %rax
+ mov %rdx, %r9
+ mul %r10
+ xor %r10, %r10
+ add %rax, %r9
+ mov %r11, %rax
+ mov %r10, %r11
+ adc %rdx, %r10
+
+ mul %rcx
+ add %rax, %r10
+ adc %r11, %rdx
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %rdx, %rdx
+ adc %r11, %r11
+ add %r8, 8(rp)
+ adc %r9, 16(rp)
+ adc %r10, 24(rp)
+ adc %rdx, 32(rp)
+ adc %r11, 40(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt3):
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%rbx')
+define(`w3', `%rbp')
+define(`un', `%r12')
+define(`n', `%rcx')
+
+define(`X0', `%r13')
+define(`X1', `%r14')
+
+L(do_mul_2):
+ mov (up), v0
+ push %rbx
+ lea (rp,un_param,8), rp C point rp at R[un]
+ mov 8(up), %rax
+ push %rbp
+ lea (up,un_param,8), up C point up right after U's end
+ mov %rax, v1
+ push %r12
+ mov $1, R32(un) C free up rdx
+ push %r13
+ sub un_param, un
+ push %r14
+ push un
+ mul v0
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ test $1, R8(un)
+ jnz L(m2b1)
+
+L(m2b0):lea 2(un), n
+ xor R32(w1), R32(w1) C FIXME
+ xor R32(w2), R32(w2) C FIXME
+ mov %rdx, w0
+ jmp L(m2l0)
+
+L(m2b1):lea 1(un), n
+ xor R32(w3), R32(w3) C FIXME
+ xor R32(w0), R32(w0) C FIXME
+ mov %rdx, w2
+ jmp L(m2l1)
+
+ ALIGN(32)
+L(m2tp):
+L(m2l0):mul v0
+ add %rax, w0
+ mov %rdx, w3
+ adc $0, w3
+ mov -8(up,n,8), %rax
+ mul v1
+ add w1, w0
+ adc $0, w3
+ add %rax, w2
+ mov w0, -8(rp,n,8)
+ mov %rdx, w0
+ adc $0, w0
+ mov (up,n,8), %rax
+L(m2l1):mul v0
+ add %rax, w2
+ mov %rdx, w1
+ adc $0, w1
+ add w3, w2
+ mov (up,n,8), %rax
+ adc $0, w1
+ mul v1
+ mov w2, (rp,n,8)
+ add %rax, w0
+ mov %rdx, w2
+ mov 8(up,n,8), %rax
+ adc $0, w2
+ add $2, n
+ jnc L(m2tp)
+
+L(m2ed):mul v0
+ add %rax, w0
+ mov %rdx, w3
+ adc $0, w3
+ mov I(-8(up),-8(up,n,8)), %rax
+ mul v1
+ add w1, w0
+ adc $0, w3
+ add %rax, w2
+ mov w0, I(-8(rp),-8(rp,n,8))
+ adc $0, %rdx
+ add w3, w2
+ mov w2, I((rp),(rp,n,8))
+ adc $0, %rdx
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+ add $2, un C decrease |un|
+
+L(do_addmul_2):
+L(outer):
+ lea 16(rp), rp
+ cmp $-2, R32(un) C jump if un C {-1,0} FIXME jump if un C {-2,1}
+ jge L(corner) C FIXME: move to before the lea above
+
+ mov -8(up,un,8), v0
+ mov (up,un,8), %rax
+ mov %rax, v1
+ mul v0
+ test $1, R8(un)
+ jnz L(a1x1)
+
+L(a1x0):mov (rp,un,8), X0
+ xor w0, w0
+ mov 8(rp,un,8), X1
+ add %rax, X0
+ mov %rdx, w1
+ adc $0, w1
+ xor w2, w2
+ mov X0, (rp,un,8)
+ mov 8(up,un,8), %rax
+ test $2, R8(un)
+ jnz L(a110)
+
+L(a100):lea 2(un), n C un = 4, 8, 12, ...
+ jmp L(lo0)
+
+L(a110):lea (un), n C un = 2, 6, 10, ...
+ jmp L(lo2)
+
+L(a1x1):mov (rp,un,8), X1
+ xor w2, w2
+ mov 8(rp,un,8), X0
+ add %rax, X1
+ mov %rdx, w3
+ adc $0, w3
+ xor w0, w0
+ mov 8(up,un,8), %rax
+ test $2, R8(un)
+ jz L(a111)
+
+L(a101):lea 3(un), n C un = 1, 5, 9, ...
+ jmp L(lo1)
+
+L(a111):lea 1(un), n C un = 3, 7, 11, ...
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top): mul v1
+ mov %rdx, w0
+ add %rax, X0
+ adc $0, w0
+ add w1, X1
+ adc $0, w3
+ add w2, X0
+ adc $0, w0
+ mov -16(up,n,8), %rax
+L(lo1): mul v0
+ add %rax, X0
+ mov %rdx, w1
+ adc $0, w1
+ mov -16(up,n,8), %rax
+ mul v1
+ mov X1, -24(rp,n,8)
+ mov -8(rp,n,8), X1
+ add w3, X0
+ adc $0, w1
+ mov %rdx, w2
+ mov X0, -16(rp,n,8)
+ add %rax, X1
+ adc $0, w2
+ mov -8(up,n,8), %rax
+ add w0, X1
+ adc $0, w2
+L(lo0): mul v0
+ add %rax, X1
+ mov %rdx, w3
+ adc $0, w3
+ mov -8(up,n,8), %rax
+ mul v1
+ add w1, X1
+ mov (rp,n,8), X0
+ adc $0, w3
+ mov %rdx, w0
+ add %rax, X0
+ adc $0, w0
+ mov (up,n,8), %rax
+L(lo3): mul v0
+ add w2, X0
+ mov X1, -8(rp,n,8)
+ mov %rdx, w1
+ adc $0, w0
+ add %rax, X0
+ adc $0, w1
+ mov (up,n,8), %rax
+ add w3, X0
+ adc $0, w1
+ mul v1
+ mov 8(rp,n,8), X1
+ add %rax, X1
+ mov %rdx, w2
+ adc $0, w2
+ mov 8(up,n,8), %rax
+ mov X0, (rp,n,8)
+L(lo2): mul v0
+ add w0, X1
+ mov %rdx, w3
+ adc $0, w2
+ add %rax, X1
+ mov 8(up,n,8), %rax
+ mov 16(rp,n,8), X0
+ adc $0, w3
+ add $4, n
+ jnc L(top)
+
+L(end): mul v1
+ add w1, X1
+ adc $0, w3
+ add w2, %rax
+ adc $0, %rdx
+ mov X1, I(-8(rp),-24(rp,n,8))
+ add w3, %rax
+ adc $0, %rdx
+ mov %rax, I((rp),-16(rp,n,8))
+ mov %rdx, I(8(rp),-8(rp,n,8))
+
+ add $2, un C decrease |un|
+ jmp L(outer) C loop until a small corner remains
+
+L(corner):
+ pop n
+ jg L(small_corner)
+
+ lea 8(rp), rp
+ mov -24(up), v0
+ mov -16(up), %rax
+ mov %rax, v1
+ mul v0
+ mov -24(rp), X0
+ mov -16(rp), X1
+ add %rax, X0
+ mov %rdx, w1
+ adc $0, w1
+ xor w2, w2
+ mov X0, -24(rp)
+ mov -8(up), %rax
+ mul v0
+ add $0, X1
+ mov %rdx, w3
+ adc $0, w2
+ add %rax, X1
+ mov -8(up), %rax
+ adc $0, w3
+ mul v1
+ add w1, X1
+ adc $0, w3
+ add w2, %rax
+ adc $0, %rdx
+ mov X1, -16(rp)
+ jmp L(com)
+
+L(small_corner):
+ mov -8(rp), w3
+ mov -16(up), v0
+ mov -8(up), %rax
+ mul v0
+L(com): add w3, %rax
+ adc $0, %rdx
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+
+L(sqr_diag_addlsh1):
+ mov -8(up,n,8), %rax
+ shl n
+ mul %rax
+ mov %rax, (rp,n,8)
+
+ xor R32(%rbx), R32(%rbx)
+ mov 8(rp,n,8), %r8
+ mov 16(rp,n,8), %r9
+ jmp L(dm)
+
+ ALIGN(32)
+L(dtop):add %r8, %r10
+ adc %r9, %rax
+ mov 8(rp,n,8), %r8
+ mov 16(rp,n,8), %r9
+ mov %r10, -8(rp,n,8)
+ mov %rax, (rp,n,8)
+L(dm): adc %r8, %r8
+ adc %r9, %r9
+ mov (up,n,4), %rax
+ lea (%rdx,%rbx), %r10
+ setc R8(%rbx)
+ mul %rax
+ add $2, n
+ js L(dtop)
+
+L(dend):add %r8, %r10
+ adc %r9, %rax
+ mov %r10, I(-8(rp),-8(rp,n,8))
+ mov %rax, I((rp),(rp,n,8))
+ adc %rbx, %rdx
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/darwin.m4 b/gmp-6.3.0/mpn/x86_64/darwin.m4
new file mode 100644
index 0000000..7771476
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/darwin.m4
@@ -0,0 +1,82 @@
+divert(-1)
+dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+define(`DARWIN')
+
+define(`LEA',`dnl
+ifdef(`PIC',
+ `lea $1(%rip), $2'
+,
+ `movabs `$'$1, $2')
+')
+
+dnl Usage: CALL(funcname)
+dnl
+dnl Simply override the definition in x86_64-defs.m4.
+
+define(`CALL',`call GSYM_PREFIX`'$1')
+define(`TCALL',`jmp GSYM_PREFIX`'$1')
+
+
+dnl Usage: JUMPTABSECT
+dnl
+dnl CAUTION: Do not put anything sensible here, like RODATA. That works with
+dnl some Darwin tool chains, but silently breaks with other. (Note that
+dnl putting jump tables in the text segment is a really poor idea for many PC
+dnl processors, since they cannot cache the same thing in both L1D and L2I.)
+
+define(`JUMPTABSECT', `.text')
+
+
+dnl Usage: JMPENT(targlabel,tablabel)
+
+define(`JMPENT',`dnl
+ifdef(`PIC',
+ `.set $1_tmp, $1-$2
+ .long $1_tmp'
+,
+ `.quad $1'
+)')
+
+dnl Target ABI macros. For Darwin we override IFELF (and leave default for
+dnl IFDOS and IFSTD).
+
+define(`IFELF', `')
+
+
+dnl Usage: PROTECT(symbol)
+dnl
+dnl Used for private GMP symbols that should never be overridden by users.
+dnl This can save reloc entries and improve shlib sharing as well as
+dnl application startup times
+
+define(`PROTECT', `.private_extern $1')
+
+
+divert`'dnl
diff --git a/gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm b/gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm
new file mode 100644
index 0000000..b3d45e2
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm
@@ -0,0 +1,247 @@
+dnl x86-64 mpn_div_qr_1n_pi1
+dnl -- Divide an mpn number by a normalized single-limb number,
+dnl using a single-limb inverse.
+
+dnl Contributed to the GNU project by Niels Möller
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C c/l
+C AMD K8,K9 13
+C AMD K10 13
+C AMD bull 16.5
+C AMD pile 15
+C AMD steam ?
+C AMD bobcat 16
+C AMD jaguar ?
+C Intel P4 47 poor
+C Intel core 19.25
+C Intel NHM 18
+C Intel SBR 15 poor
+C Intel IBR 13
+C Intel HWL 11.7
+C Intel BWL ?
+C Intel atom 52 very poor
+C VIA nano 19
+
+
+C INPUT Parameters
+define(`QP', `%rdi')
+define(`UP', `%rsi')
+define(`UN_INPUT', `%rdx')
+define(`U1', `%rcx') C Also in %rax
+define(`D', `%r8')
+define(`DINV', `%r9')
+
+C Invariants
+define(`B2', `%rbp')
+define(`B2md', `%rbx')
+
+C Variables
+define(`UN', `%r8') C Overlaps D input
+define(`T', `%r10')
+define(`U0', `%r11')
+define(`U2', `%r12')
+define(`Q0', `%r13')
+define(`Q1', `%r14')
+define(`Q2', `%r15')
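The loop below is built around the standard 2/1 division step with a precomputed reciprocal (Möller–Granlund). A sketch of that single step in C, assuming 64-bit limbs and the GCC/Clang __int128 extension; the helper name is illustrative, not part of GMP's API:

```c
#include <stdint.h>

/* One 2/1 step: divide u1:u0 by a normalized d (top bit set), with u1 < d
   and dinv = floor((B^2 - 1) / d) - B, B = 2^64.  Returns the quotient,
   stores the remainder.  (u1 < d also keeps u1 + 1 from wrapping.) */
static uint64_t
div_2by1_preinv (uint64_t *r, uint64_t u1, uint64_t u0,
                 uint64_t d, uint64_t dinv)
{
  unsigned __int128 p = (unsigned __int128) dinv * u1
                        + ((unsigned __int128) (u1 + 1) << 64) + u0;
  uint64_t q   = (uint64_t) (p >> 64);          /* quotient candidate */
  uint64_t rem = u0 - q * d;                    /* computed mod 2^64 */
  if (rem > (uint64_t) p) { q--; rem += d; }    /* likely correction */
  if (rem >= d)           { q++; rem -= d; }    /* rare correction */
  *r = rem;
  return q;
}
```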
+
+ABI_SUPPORT(STD64)
+
+ ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_div_qr_1n_pi1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+ dec UN_INPUT
+ jnz L(first)
+
+ C Just a single 2/1 division.
+ C T, U0 are allocated in scratch registers
+ lea 1(U1), T
+ mov U1, %rax
+ mul DINV
+ mov (UP), U0
+ add U0, %rax
+ adc T, %rdx
+ mov %rdx, T
+ imul D, %rdx
+ sub %rdx, U0
+ cmp U0, %rax
+ lea (U0, D), %rax
+ cmovnc U0, %rax
+ sbb $0, T
+ cmp D, %rax
+ jc L(single_div_done)
+ sub D, %rax
+ add $1, T
+L(single_div_done):
+ mov T, (QP)
+ FUNC_EXIT()
+ ret
+L(first):
+ C FIXME: Could delay some of these until we enter the loop.
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbx
+ push %rbp
+
+ mov D, B2
+ imul DINV, B2
+ neg B2
+ mov B2, B2md
+ sub D, B2md
+
+ C D not needed until final reduction
+ push D
+ mov UN_INPUT, UN C Clobbers D
+
+ mov DINV, %rax
+ mul U1
+ mov %rax, Q0
+ add U1, %rdx
+ mov %rdx, T
+
+ mov B2, %rax
+ mul U1
+ mov -8(UP, UN, 8), U0
+ mov (UP, UN, 8), U1
+ mov T, (QP, UN, 8)
+ add %rax, U0
+ adc %rdx, U1
+ sbb U2, U2
+ dec UN
+ mov U1, %rax
+ jz L(final)
+
+ ALIGN(16)
+
+ C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
+ C At entry, %rax holds an extra copy of U1
+L(loop):
+ C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
+ C Remains to add in B (U1 + c)
+ mov DINV, Q1
+ mov U2, Q2
+ and U2, Q1
+ neg Q2
+ mul DINV
+ add %rdx, Q1
+ adc $0, Q2
+ add Q0, Q1
+ mov %rax, Q0
+ mov B2, %rax
+ lea (B2md, U0), T
+ adc $0, Q2
+
+ C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
+ mul U1
+ and B2, U2
+ add U2, U0
+ cmovnc U0, T
+
+ C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
+ adc U1, Q1
+ mov -8(UP, UN, 8), U0
+ adc Q2, 8(QP, UN, 8)
+ jc L(q_incr)
+L(q_incr_done):
+ add %rax, U0
+ mov T, %rax
+ adc %rdx, %rax
+ mov Q1, (QP, UN, 8)
+ sbb U2, U2
+ dec UN
+ mov %rax, U1
+ jnz L(loop)
+
+L(final):
+ pop D
+
+ mov U2, Q1
+ and D, U2
+ sub U2, %rax
+ neg Q1
+
+ mov %rax, U1
+ sub D, %rax
+ cmovc U1, %rax
+ sbb $-1, Q1
+
+ lea 1(%rax), T
+ mul DINV
+ add U0, %rax
+ adc T, %rdx
+ mov %rdx, T
+ imul D, %rdx
+ sub %rdx, U0
+ cmp U0, %rax
+ lea (U0, D), %rax
+ cmovnc U0, %rax
+ sbb $0, T
+ cmp D, %rax
+ jc L(div_done)
+ sub D, %rax
+ add $1, T
+L(div_done):
+ add T, Q0
+ mov Q0, (QP)
+ adc Q1, 8(QP)
+ jnc L(done)
+L(final_q_incr):
+ addq $1, 16(QP)
+ lea 8(QP), QP
+ jc L(final_q_incr)
+
+L(done):
+ pop %rbp
+ pop %rbx
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+ FUNC_EXIT()
+ ret
+
+L(q_incr):
+ C U1 is not live, so use it for indexing
+ lea 16(QP, UN, 8), U1
+L(q_incr_loop):
+ addq $1, (U1)
+ jnc L(q_incr_done)
+ lea 8(U1), U1
+ jmp L(q_incr_loop)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm b/gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm
new file mode 100644
index 0000000..5e59a0a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm
@@ -0,0 +1,158 @@
+dnl x86-64 mpn_div_qr_2n_pi1
+dnl -- Divide an mpn number by a normalized 2-limb number,
+dnl using a single-limb inverse.
+
+dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C c/l
+C INPUT PARAMETERS
+define(`qp', `%rdi')
+define(`rp', `%rsi')
+define(`up_param', `%rdx')
+define(`un', `%rcx')
+define(`d1', `%r8')
+define(`d0', `%r9')
+define(`di_param', `8(%rsp)')
+
+define(`di', `%r10')
+define(`up', `%r11')
+define(`u2', `%rbx')
+define(`u1', `%r12')
+define(`t1', `%r13')
+define(`t0', `%r14')
+define(`md1', `%r15')
+
+C TODO
+C * Store qh in the same stack slot as di_param, instead of pushing
+C it. (we could put it in register %rbp, but then we would need to
+C save and restore that instead, which doesn't seem like a win).
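The loop below is organised around the udiv_qr_3by2 primitive named in its comments. A hedged C sketch of that 3/2 step, assuming 64-bit limbs, the __int128 extension, and di being the adjusted 3/2 reciprocal of the normalized two-limb divisor <d1,d0> (helper and type names are illustrative, not GMP API):

```c
#include <stdint.h>

typedef uint64_t limb_t;
typedef unsigned __int128 dlimb_t;

/* One 3/2 step: requires <u2,u1,u0> < <d1,d0> * 2^64, d1 with its top bit
   set, and di approximately floor((2^192 - 1) / (d1*2^64 + d0)) - 2^64.
   Returns the quotient limb, stores the two remainder limbs. */
static limb_t
udiv_qr_3by2_model (limb_t *r1, limb_t *r0,
                    limb_t u2, limb_t u1, limb_t u0,
                    limb_t d1, limb_t d0, limb_t di)
{
  dlimb_t d  = ((dlimb_t) d1 << 64) | d0;
  dlimb_t qq = (dlimb_t) di * u2 + (((dlimb_t) u2 << 64) | u1);
  limb_t  q  = (limb_t) (qq >> 64);
  limb_t  ql = (limb_t) qq;

  /* remainder candidate, computed mod 2^128 */
  dlimb_t r = (((dlimb_t) (u1 - q * d1) << 64) | u0) - (dlimb_t) q * d0 - d;
  q++;
  if ((limb_t) (r >> 64) >= ql)   /* likely correction */
    {
      q--;
      r += d;
    }
  if (r >= d)                     /* unlikely correction */
    {
      q++;
      r -= d;
    }
  *r1 = (limb_t) (r >> 64);
  *r0 = (limb_t) r;
  return q;
}
```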
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_div_qr_2n_pi1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+IFDOS(`define(`di_param', `72(%rsp)')')
+ mov di_param, di
+ mov up_param, up
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbx
+
+ mov -16(up, un, 8), u1
+ mov -8(up, un, 8), u2
+
+ mov u1, t0
+ mov u2, t1
+ sub d0, t0
+ sbb d1, t1
+ cmovnc t0, u1
+ cmovnc t1, u2
+ C push qh which is !carry
+ sbb %rax, %rax
+ inc %rax
+ push %rax
+ lea -2(un), un
+ mov d1, md1
+ neg md1
+
+ jmp L(next)
+
+ ALIGN(16)
+L(loop):
+ C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di)
+ C Based on the optimized divrem_2.asm code.
+
+ mov di, %rax
+ mul u2
+ mov u1, t0
+ add %rax, t0 C q0 in t0
+ adc u2, %rdx
+ mov %rdx, t1 C q in t1
+ imul md1, %rdx
+ mov d0, %rax
+ lea (%rdx, u1), u2
+ mul t1
+ mov (up, un, 8), u1
+ sub d0, u1
+ sbb d1, u2
+ sub %rax, u1
+ sbb %rdx, u2
+ xor R32(%rax), R32(%rax)
+ xor R32(%rdx), R32(%rdx)
+ cmp t0, u2
+ cmovnc d0, %rax
+ cmovnc d1, %rdx
+ adc $0, t1
+ nop
+ add %rax, u1
+ adc %rdx, u2
+ cmp d1, u2
+ jae L(fix)
+L(bck):
+ mov t1, (qp, un, 8)
+L(next):
+ sub $1, un
+ jnc L(loop)
+L(end):
+ mov u2, 8(rp)
+ mov u1, (rp)
+
+ C qh on stack
+ pop %rax
+
+ pop %rbx
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+ FUNC_EXIT()
+ ret
+
+L(fix): C Unlikely update. u2 >= d1
+ seta %dl
+ cmp d0, u1
+ setae %al
+ orb %dl, %al C "orb" form to placate Sun tools
+ je L(bck)
+ inc t1
+ sub d0, u1
+ sbb d1, u2
+ jmp L(bck)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm b/gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm
new file mode 100644
index 0000000..85af96f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm
@@ -0,0 +1,200 @@
+dnl x86-64 mpn_div_qr_2u_pi1
+dnl -- Divide an mpn number by an unnormalized 2-limb number,
+dnl using a single-limb inverse and shifting the dividend on the fly.
+
+dnl Copyright 2007, 2008, 2010, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C c/l
+C INPUT PARAMETERS
+define(`qp', `%rdi')
+define(`rp', `%rsi')
+define(`up_param', `%rdx')
+define(`un_param', `%rcx') dnl %rcx needed for shift count
+define(`d1', `%r8')
+define(`d0', `%r9')
+define(`shift_param', `FRAME+8(%rsp)')
+define(`di_param', `FRAME+16(%rsp)')
+
+define(`di', `%r10')
+define(`up', `%r11')
+define(`un', `%rbp')
+define(`u2', `%rbx')
+define(`u1', `%r12')
+define(`u0', `%rsi') dnl Same as rp, which is saved and restored.
+define(`t1', `%r13')
+define(`t0', `%r14')
+define(`md1', `%r15')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+deflit(`FRAME', 0)
+PROLOGUE(mpn_div_qr_2u_pi1)
+ mov di_param, di
+ mov up_param, up
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbx
+ push %rbp
+ push rp
+deflit(`FRAME', 56)
+ lea -2(un_param), un
+ mov d1, md1
+ neg md1
+
+ C int parameter, 32 bits only
+ movl shift_param, R32(%rcx)
+
+ C FIXME: Different code for SHLD_SLOW
+
+ xor R32(u2), R32(u2)
+ mov 8(up, un, 8), u1
+ shld %cl, u1, u2
+ C Remains to read (up, un, 8) and shift u1, u0
+ C udiv_qr_3by2 (qh,u2,u1,u2,u1,n0, d1,d0,di)
+ mov di, %rax
+ mul u2
+ mov (up, un, 8), u0
+ shld %cl, u0, u1
+ mov u1, t0
+ add %rax, t0 C q0 in t0
+ adc u2, %rdx
+ mov %rdx, t1 C q in t1
+ imul md1, %rdx
+ mov d0, %rax
+ lea (%rdx, u1), u2
+ mul t1
+ mov u0, u1
+ shl %cl, u1
+ sub d0, u1
+ sbb d1, u2
+ sub %rax, u1
+ sbb %rdx, u2
+ xor R32(%rax), R32(%rax)
+ xor R32(%rdx), R32(%rdx)
+ cmp t0, u2
+ cmovnc d0, %rax
+ cmovnc d1, %rdx
+ adc $0, t1
+ nop
+ add %rax, u1
+ adc %rdx, u2
+ cmp d1, u2
+ jae L(fix_qh)
+L(bck_qh):
+ push t1 C push qh on stack
+
+ jmp L(next)
+
+ ALIGN(16)
+L(loop):
+ C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di)
+ C Based on the optimized divrem_2.asm code.
+
+ mov di, %rax
+ mul u2
+ mov (up, un, 8), u0
+ xor R32(t1), R32(t1)
+ shld %cl, u0, t1
+ or t1, u1
+ mov u1, t0
+ add %rax, t0 C q0 in t0
+ adc u2, %rdx
+ mov %rdx, t1 C q in t1
+ imul md1, %rdx
+ mov d0, %rax
+ lea (%rdx, u1), u2
+ mul t1
+ mov u0, u1
+ shl %cl, u1
+ sub d0, u1
+ sbb d1, u2
+ sub %rax, u1
+ sbb %rdx, u2
+ xor R32(%rax), R32(%rax)
+ xor R32(%rdx), R32(%rdx)
+ cmp t0, u2
+ cmovnc d0, %rax
+ cmovnc d1, %rdx
+ adc $0, t1
+ nop
+ add %rax, u1
+ adc %rdx, u2
+ cmp d1, u2
+ jae L(fix)
+L(bck):
+ mov t1, (qp, un, 8)
+L(next):
+ sub $1, un
+ jnc L(loop)
+L(end):
+ C qh on stack
+ pop %rax
+ pop rp
+ shrd %cl, u2, u1
+ shr %cl, u2
+ mov u2, 8(rp)
+ mov u1, (rp)
+
+ pop %rbp
+ pop %rbx
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+ ret
+
+L(fix): C Unlikely update. u2 >= d1
+ seta %dl
+ cmp d0, u1
+ setae %al
+ orb %dl, %al C "orb" form to placate Sun tools
+ je L(bck)
+ inc t1
+ sub d0, u1
+ sbb d1, u2
+ jmp L(bck)
+
+C Duplicated, just jumping back to a different address.
+L(fix_qh): C Unlikely update. u2 >= d1
+ seta %dl
+ cmp d0, u1
+ setae %al
+ orb %dl, %al C "orb" form to placate Sun tools
+ je L(bck_qh)
+ inc t1
+ sub d0, u1
+ sbb d1, u2
+ jmp L(bck_qh)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/dive_1.asm b/gmp-6.3.0/mpn/x86_64/dive_1.asm
new file mode 100644
index 0000000..988bdab
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/dive_1.asm
@@ -0,0 +1,158 @@
+dnl AMD64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 10
+C AMD K10 10
+C Intel P4 33
+C Intel core2 13.25
+C Intel corei 14
+C Intel atom 42
+C VIA nano 43
+
+C A quick adaptation of the 32-bit K7 code.
+
+
+C INPUT PARAMETERS
+C rp rdi
+C up rsi
+C n rdx
+C divisor rcx
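The body below first builds the divisor's inverse modulo 2^64 by Newton iteration (the "inv = 2*inv - inv*inv*d" steps in its comments), then multiplies each partial remainder by that inverse. A plain-C model of the inverse computation, assuming 64-bit limbs; the asm seeds from an 8-bit table (binvert_limb_table) rather than the 3-bit identity used here:

```c
#include <stdint.h>

/* Newton iteration for the inverse of an odd d modulo 2^64.  For odd d,
   d*d == 1 (mod 8), so d is its own inverse to 3 bits; each step doubles
   the number of correct low bits.  Illustrative helper. */
static uint64_t
binvert_model (uint64_t d)          /* d must be odd */
{
  uint64_t inv = d;                 /* correct to 3 bits */
  inv = 2 * inv - inv * inv * d;    /* 6 bits */
  inv = 2 * inv - inv * inv * d;    /* 12 bits */
  inv = 2 * inv - inv * inv * d;    /* 24 bits */
  inv = 2 * inv - inv * inv * d;    /* 48 bits */
  inv = 2 * inv - inv * inv * d;    /* 96 >= 64 bits */
  return inv;                       /* inv * d == 1 (mod 2^64) */
}
```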
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+ FUNC_ENTRY(4)
+ push %rbx
+
+ mov %rcx, %rax
+ xor R32(%rcx), R32(%rcx) C shift count
+ mov %rdx, %r8
+
+ bt $0, R32(%rax)
+ jnc L(evn) C skip bsfq unless divisor is even
+
+L(odd): mov %rax, %rbx
+ shr R32(%rax)
+ and $127, R32(%rax) C d/2, 7 bits
+
+ LEA( binvert_limb_table, %rdx)
+
+ movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
+
+ mov %rbx, %r11 C d without twos
+
+ lea (%rax,%rax), R32(%rdx) C 2*inv
+ imul R32(%rax), R32(%rax) C inv*inv
+ imul R32(%rbx), R32(%rax) C inv*inv*d
+ sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits
+
+ lea (%rdx,%rdx), R32(%rax) C 2*inv
+ imul R32(%rdx), R32(%rdx) C inv*inv
+ imul R32(%rbx), R32(%rdx) C inv*inv*d
+ sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits
+
+ lea (%rax,%rax), %r10 C 2*inv
+ imul %rax, %rax C inv*inv
+ imul %rbx, %rax C inv*inv*d
+ sub %rax, %r10 C inv = 2*inv - inv*inv*d, 64 bits
+
+ lea (%rsi,%r8,8), %rsi C up end
+ lea -8(%rdi,%r8,8), %rdi C rp end
+ neg %r8 C -n
+
+ mov (%rsi,%r8,8), %rax C up[0]
+
+ inc %r8
+ jz L(one)
+
+ mov (%rsi,%r8,8), %rdx C up[1]
+
+ shrd R8(%rcx), %rdx, %rax
+
+ xor R32(%rbx), R32(%rbx)
+ jmp L(ent)
+
+L(evn): bsf %rax, %rcx
+ shr R8(%rcx), %rax
+ jmp L(odd)
+
+ ALIGN(8)
+L(top):
+ C rax q
+ C rbx carry bit, 0 or 1
+ C rcx shift
+ C rdx
+ C rsi up end
+ C rdi rp end
+ C r8 counter, limbs, negative
+ C r10 d^(-1) mod 2^64
+ C r11 d, shifted down
+
+ mul %r11 C carry limb in rdx 0 10
+ mov -8(%rsi,%r8,8), %rax C
+ mov (%rsi,%r8,8), %r9 C
+ shrd R8(%rcx), %r9, %rax C
+ nop C
+ sub %rbx, %rax C apply carry bit
+ setc %bl C
+ sub %rdx, %rax C apply carry limb 5
+ adc $0, %rbx C 6
+L(ent): imul %r10, %rax C 6
+ mov %rax, (%rdi,%r8,8) C
+ inc %r8 C
+ jnz L(top)
+
+ mul %r11 C carry limb in rdx
+ mov -8(%rsi), %rax C up high limb
+ shr R8(%rcx), %rax
+ sub %rbx, %rax C apply carry bit
+ sub %rdx, %rax C apply carry limb
+ imul %r10, %rax
+ mov %rax, (%rdi)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(one): shr R8(%rcx), %rax
+ imul %r10, %rax
+ mov %rax, (%rdi)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/divrem_1.asm
new file mode 100644
index 0000000..d4d61ad
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/divrem_1.asm
@@ -0,0 +1,314 @@
+dnl x86-64 mpn_divrem_1 -- mpn by limb division.
+
+dnl Copyright 2004, 2005, 2007-2012, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C norm unorm frac
+C AMD K8,K9 13 13 12
+C AMD K10 13 13 12
+C Intel P4 43 44 43
+C Intel core2 24.5 24.5 19.5
+C Intel corei 20.5 19.5 18
+C Intel atom 43 46 36
+C VIA nano 25.5 25.5 24
+
+C mp_limb_t
+C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
+C mp_srcptr np, mp_size_t nn, mp_limb_t d)
+
+C mp_limb_t
+C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
+C mp_srcptr np, mp_size_t nn, mp_limb_t d,
+C mp_limb_t dinv, int cnt)
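A plain-C model of the division these entry points perform may help: high-to-low 2/1 steps for the integer limbs, then fn more steps with zero limbs appended for the fraction part, with the final remainder returned. The sketch assumes 64-bit limbs and uses __int128 division in place of the invert_limb-based steps the asm uses; the helper name is hypothetical:

```c
#include <stdint.h>

typedef uint64_t limb_t;

static limb_t
divrem_1_model (limb_t *qp, long fn, const limb_t *up, long un, limb_t d)
{
  limb_t r = 0;
  long i;
  for (i = un - 1; i >= 0; i--)       /* integer part, high limbs first */
    {
      unsigned __int128 n = ((unsigned __int128) r << 64) | up[i];
      qp[fn + i] = (limb_t) (n / d);
      r = (limb_t) (n % d);
    }
  for (i = fn - 1; i >= 0; i--)       /* fraction part: append zero limbs */
    {
      unsigned __int128 n = (unsigned __int128) r << 64;
      qp[i] = (limb_t) (n / d);
      r = (limb_t) (n % d);
    }
  return r;
}
```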
+
+C INPUT PARAMETERS
+define(`qp', `%rdi')
+define(`fn_param', `%rsi')
+define(`up_param', `%rdx')
+define(`un_param', `%rcx')
+define(`d', `%r8')
+define(`dinv', `%r9') C only for mpn_preinv_divrem_1
+C shift passed on stack C only for mpn_preinv_divrem_1
+
+define(`cnt', `%rcx')
+define(`up', `%rsi')
+define(`fn', `%r12')
+define(`un', `%rbx')
+
+
+C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
+C cnt qp d dinv
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFSTD(`define(`CNTOFF', `40($1)')')
+IFDOS(`define(`CNTOFF', `104($1)')')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_preinv_divrem_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+ xor R32(%rax), R32(%rax)
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov fn_param, fn
+ mov un_param, un
+ add fn_param, un_param
+ mov up_param, up
+
+ lea -8(qp,un_param,8), qp
+
+ test d, d
+ js L(nent)
+
+ mov CNTOFF(%rsp), R8(cnt)
+ shl R8(cnt), d
+ jmp L(uent)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ xor R32(%rax), R32(%rax)
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov fn_param, fn
+ mov un_param, un
+ add fn_param, un_param
+ mov up_param, up
+ je L(ret)
+
+ lea -8(qp,un_param,8), qp
+ xor R32(%rbp), R32(%rbp)
+
+ test d, d
+ jns L(unnormalized)
+
+L(normalized):
+ test un, un
+ je L(8) C un == 0
+ mov -8(up,un,8), %rbp
+ dec un
+ mov %rbp, %rax
+ sub d, %rbp
+ cmovc %rax, %rbp
+ sbb R32(%rax), R32(%rax)
+ inc R32(%rax)
+ mov %rax, (qp)
+ lea -8(qp), qp
+L(8):
+IFSTD(` push %rdi ')
+IFSTD(` push %rsi ')
+ push %r8
+IFSTD(` mov d, %rdi ')
+IFDOS(` sub $32, %rsp ')
+IFDOS(` mov d, %rcx ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_invert_limb)
+IFDOS(` add $32, %rsp ')
+ pop %r8
+IFSTD(` pop %rsi ')
+IFSTD(` pop %rdi ')
+
+ mov %rax, dinv
+ mov %rbp, %rax
+ jmp L(nent)
+
+ ALIGN(16)
+L(ntop):mov (up,un,8), %r10 C K8-K10 P6-CNR P6-NHM P4
+ mul dinv C 0,13 0,20 0,18 0,45
+ add %r10, %rax C 4 8 3 12
+ adc %rbp, %rdx C 5 9 10 13
+ mov %rax, %rbp C 5 9 4 13
+ mov %rdx, %r13 C 6 11 12 23
+ imul d, %rdx C 6 11 11 23
+ sub %rdx, %r10 C 10 16 14 33
+ mov d, %rax C
+ add %r10, %rax C 11 17 15 34
+ cmp %rbp, %r10 C 11 17 15 34
+ cmovc %r10, %rax C 12 18 16 35
+ adc $-1, %r13 C
+ cmp d, %rax C
+ jae L(nfx) C
+L(nok): mov %r13, (qp) C
+ sub $8, qp C
+L(nent):lea 1(%rax), %rbp C
+ dec un C
+ jns L(ntop) C
+
+ xor R32(%rcx), R32(%rcx)
+ jmp L(frac)
+
+L(nfx): sub d, %rax
+ inc %r13
+ jmp L(nok)
+
+L(unnormalized):
+ test un, un
+ je L(44)
+ mov -8(up,un,8), %rax
+ cmp d, %rax
+ jae L(44)
+ mov %rbp, (qp)
+ mov %rax, %rbp
+ lea -8(qp), qp
+ je L(ret)
+ dec un
+L(44):
+ bsr d, %rcx
+ not R32(%rcx)
+ shl R8(%rcx), d
+ shl R8(%rcx), %rbp
+
+ push %rcx
+IFSTD(` push %rdi ')
+IFSTD(` push %rsi ')
+ push %r8
+IFSTD(` sub $8, %rsp ')
+IFSTD(` mov d, %rdi ')
+IFDOS(` sub $40, %rsp ')
+IFDOS(` mov d, %rcx ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_invert_limb)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+ pop %r8
+IFSTD(` pop %rsi ')
+IFSTD(` pop %rdi ')
+ pop %rcx
+
+ mov %rax, dinv
+ mov %rbp, %rax
+ test un, un
+ je L(frac)
+
+L(uent):dec un
+ mov (up,un,8), %rbp
+ neg R32(%rcx)
+ shr R8(%rcx), %rbp
+ neg R32(%rcx)
+ or %rbp, %rax
+ jmp L(ent)
+
+ ALIGN(16)
+L(utop):mov (up,un,8), %r10
+ shl R8(%rcx), %rbp
+ neg R32(%rcx)
+ shr R8(%rcx), %r10
+ neg R32(%rcx)
+ or %r10, %rbp
+ mul dinv
+ add %rbp, %rax
+ adc %r11, %rdx
+ mov %rax, %r11
+ mov %rdx, %r13
+ imul d, %rdx
+ sub %rdx, %rbp
+ mov d, %rax
+ add %rbp, %rax
+ cmp %r11, %rbp
+ cmovc %rbp, %rax
+ adc $-1, %r13
+ cmp d, %rax
+ jae L(ufx)
+L(uok): mov %r13, (qp)
+ sub $8, qp
+L(ent): mov (up,un,8), %rbp
+ dec un
+ lea 1(%rax), %r11
+ jns L(utop)
+
+L(uend):shl R8(%rcx), %rbp
+ mul dinv
+ add %rbp, %rax
+ adc %r11, %rdx
+ mov %rax, %r11
+ mov %rdx, %r13
+ imul d, %rdx
+ sub %rdx, %rbp
+ mov d, %rax
+ add %rbp, %rax
+ cmp %r11, %rbp
+ cmovc %rbp, %rax
+ adc $-1, %r13
+ cmp d, %rax
+ jae L(efx)
+L(eok): mov %r13, (qp)
+ sub $8, qp
+ jmp L(frac)
+
+L(ufx): sub d, %rax
+ inc %r13
+ jmp L(uok)
+L(efx): sub d, %rax
+ inc %r13
+ jmp L(eok)
+
+L(frac):mov d, %rbp
+ neg %rbp
+ jmp L(fent)
+
+ ALIGN(16) C K8-K10 P6-CNR P6-NHM P4
+L(ftop):mul dinv C 0,12 0,17 0,17
+ add %r11, %rdx C 5 8 10
+ mov %rax, %r11 C 4 8 3
+ mov %rdx, %r13 C 6 9 11
+ imul %rbp, %rdx C 6 9 11
+ mov d, %rax C
+ add %rdx, %rax C 10 14 14
+ cmp %r11, %rdx C 10 14 14
+ cmovc %rdx, %rax C 11 15 15
+ adc $-1, %r13 C
+ mov %r13, (qp) C
+ sub $8, qp C
+L(fent):lea 1(%rax), %r11 C
+ dec fn C
+ jns L(ftop) C
+
+ shr R8(%rcx), %rax
+L(ret): pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/divrem_2.asm b/gmp-6.3.0/mpn/x86_64/divrem_2.asm
new file mode 100644
index 0000000..20811cc
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/divrem_2.asm
@@ -0,0 +1,192 @@
+dnl x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2007, 2008, 2010, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb best
+C AMD K8,K9 18
+C AMD K10 18
+C AMD bull
+C AMD pile
+C AMD bobcat
+C AMD jaguar
+C Intel P4 68
+C Intel core 34
+C Intel NHM 30.25
+C Intel SBR 21.3
+C Intel IBR 21.4
+C Intel HWL 20.6
+C Intel BWL
+C Intel atom 73
+C VIA nano 33
+
+
+C INPUT PARAMETERS
+define(`qp', `%rdi')
+define(`fn', `%rsi')
+define(`up_param', `%rdx')
+define(`un_param', `%rcx')
+define(`dp', `%r8')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_divrem_2)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ lea -24(%rdx,%rcx,8), %r12 C r12 = &up[un-1]
+ mov %rsi, %r13
+ push %rbp
+ mov %rdi, %rbp
+ push %rbx
+ mov 8(%r8), %r11 C d1
+ mov 16(%r12), %rbx
+ mov (%r8), %r8 C d0
+ mov 8(%r12), %r10
+
+ xor R32(%r15), R32(%r15)
+ cmp %rbx, %r11
+ ja L(2)
+ setb %dl
+ cmp %r10, %r8
+ setbe %al
+ orb %al, %dl C "orb" form to placate Sun tools
+ je L(2)
+ inc R32(%r15)
+ sub %r8, %r10
+ sbb %r11, %rbx
+L(2):
+ lea -3(%rcx,%r13), %r14 C un + fn - 3
+ test %r14, %r14
+ js L(end)
+
+ push %r8
+ push %r10
+ push %r11
+IFSTD(` mov %r11, %rdi ')
+IFDOS(` mov %r11, %rcx ')
+IFDOS(` sub $32, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_invert_limb)
+IFDOS(` add $32, %rsp ')
+ pop %r11
+ pop %r10
+ pop %r8
+
+ mov %r11, %rdx
+ mov %rax, %rdi
+ imul %rax, %rdx
+ mov %rdx, %r9
+ mul %r8
+ xor R32(%rcx), R32(%rcx)
+ add %r8, %r9
+ adc $-1, %rcx
+ add %rdx, %r9
+ adc $0, %rcx
+ js 2f
+1: dec %rdi
+ sub %r11, %r9
+ sbb $0, %rcx
+ jns 1b
+2:
+
+ lea (%rbp,%r14,8), %rbp
+ mov %r11, %rsi
+ neg %rsi C -d1
+
+C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
+C n2 un -d1 dinv qp d0 q0 d1 up fn msl
+
+ ALIGN(16)
+L(top): mov %rdi, %rax C di ncp
+ mul %rbx C 0, 17
+ mov %r10, %rcx C
+ add %rax, %rcx C 4
+ adc %rbx, %rdx C 5
+ mov %rdx, %r9 C q 6
+ imul %rsi, %rdx C 6
+ mov %r8, %rax C ncp
+ lea (%rdx, %r10), %rbx C n1 -= ... 10
+ xor R32(%r10), R32(%r10) C
+ mul %r9 C 7
+ cmp %r14, %r13 C
+ jg L(19) C
+ mov (%r12), %r10 C
+ sub $8, %r12 C
+L(19): sub %r8, %r10 C ncp
+ sbb %r11, %rbx C 11
+ sub %rax, %r10 C 11
+ sbb %rdx, %rbx C 12
+ xor R32(%rax), R32(%rax) C
+ xor R32(%rdx), R32(%rdx) C
+ cmp %rcx, %rbx C 13
+ cmovnc %r8, %rax C 14
+ cmovnc %r11, %rdx C 14
+ adc $0, %r9 C adjust q 14
+ nop
+ add %rax, %r10 C 15
+ adc %rdx, %rbx C 16
+ cmp %r11, %rbx C
+ jae L(fix) C
+L(bck): mov %r9, (%rbp) C
+ sub $8, %rbp C
+ dec %r14
+ jns L(top)
+
+L(end): mov %r10, 8(%r12)
+ mov %rbx, 16(%r12)
+ pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ pop %r14
+ mov %r15, %rax
+ pop %r15
+ FUNC_EXIT()
+ ret
+
+L(fix): seta %dl
+ cmp %r8, %r10
+ setae %al
+ orb %dl, %al C "orb" form to placate Sun tools
+ je L(bck)
+ inc %r9
+ sub %r8, %r10
+ sbb %r11, %rbx
+ jmp L(bck)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/dos64.m4 b/gmp-6.3.0/mpn/x86_64/dos64.m4
new file mode 100644
index 0000000..0da1b36
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/dos64.m4
@@ -0,0 +1,101 @@
+divert(-1)
+dnl Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+define(`HOST_DOS64')
+
+
+dnl On DOS64 we always generate position-independent code.
+dnl
+
+define(`PIC')
+
+
+define(`LEA',`
+ lea $1(%rip), $2
+')
+
+
+dnl Usage: CALL(funcname)
+dnl
+dnl Simply override the definition in x86_64-defs.m4.
+
+define(`CALL',`call GSYM_PREFIX`'$1')
+define(`TCALL',`jmp GSYM_PREFIX`'$1')
+
+
+dnl Usage: JUMPTABSECT
+
+define(`JUMPTABSECT', `RODATA')
+
+
+dnl Usage: JMPENT(targlabel,tablabel)
+
+define(`JMPENT', `.long $1-$2')
+
+
+dnl Usage: FUNC_ENTRY(nregparams)
+dnl Usage: FUNC_EXIT()
+
+dnl FUNC_ENTRY and FUNC_EXIT provide an easy path for adapting standard
+dnl ABI assembly to the DOS64 ABI.
+
+define(`FUNC_ENTRY',
+ `push %rdi
+ push %rsi
+ mov %rcx, %rdi
+ifelse(eval($1>=2),1,`dnl
+ mov %rdx, %rsi
+ifelse(eval($1>=3),1,`dnl
+ mov %r8, %rdx
+ifelse(eval($1>=4),1,`dnl
+ mov %r9, %rcx
+')')')')
+
+define(`FUNC_EXIT',
+ `pop %rsi
+ pop %rdi')
+
+
+dnl Target ABI macros. For DOS64 we override the defaults.
+
+define(`IFDOS', `$1')
+define(`IFSTD', `')
+define(`IFELF', `')
+
+
+dnl Usage: PROTECT(symbol)
+dnl
+dnl Used for private GMP symbols that should never be overridden by users.
+dnl This can save reloc entries and improve shlib sharing as well as
+dnl application startup times
+
+define(`PROTECT', `')
+
+
+divert`'dnl
diff --git a/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm b/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm
new file mode 100644
index 0000000..21ab210
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm
@@ -0,0 +1,181 @@
+dnl AMD64 mpn_copyd optimised for CPUs with fast AVX.
+
+dnl Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 n/a
+C AMD bd2 4.87 4.87 N
+C AMD bd3 ? ?
+C AMD bd4 0.53 ?
+C AMD zn1 0.51 ?
+C AMD zn2 0.25 ? Y
+C AMD zn3 0.25 ? Y
+C AMD bt1 n/a
+C AMD bt2 n/a
+C Intel P4 n/a
+C Intel CNR n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel WSM n/a
+C Intel SBR 0.50 0.91 N
+C Intel IBR 0.50 0.65 N
+C Intel HWL 0.25 0.30 Y
+C Intel BWL 0.28 0.37 Y
+C Intel SKL 0.27 ? Y
+C Intel atom n/a
+C Intel SLM n/a
+C Intel GLM n/a
+C VIA nano n/a
+
+C We try to do as many 32-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. For the bulk copying, we
+C write using aligned 32-byte operations, but we read with both aligned and
+C unaligned 32-byte operations.
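+C
+C As a reference, a sketch (in C-like pseudocode) of what mpn_copyd computes:
+C copy n limbs from up[] to rp[], proceeding from the highest limb downwards,
+C
+C   for (i = n; i-- > 0;)
+C     rp[i] = up[i];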
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`vmovdqu', vlddqu)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_copyd)
+ FUNC_ENTRY(3)
+
+ lea -32(rp,n,8), rp
+ lea -32(up,n,8), up
+
+ cmp $7, n C basecase needed for correctness
+ jbe L(bc)
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(a2) C jump if rp aligned
+ mov 24(up), %rax
+ lea -8(up), up
+ mov %rax, 24(rp)
+ lea -8(rp), rp
+ dec n
+L(a2): test $16, R8(rp) C is rp 32-byte aligned?
+ jz L(a3) C jump if rp aligned
+ vmovdqu 16(up), %xmm0
+ lea -16(up), up
+ vmovdqa %xmm0, 16(rp)
+ lea -16(rp), rp
+ sub $2, n
+L(a3): sub $16, n
+ jc L(sma)
+
+ ALIGN(16)
+L(top): vmovdqu (up), %ymm0
+ vmovdqu -32(up), %ymm1
+ vmovdqu -64(up), %ymm2
+ vmovdqu -96(up), %ymm3
+ lea -128(up), up
+ vmovdqa %ymm0, (rp)
+ vmovdqa %ymm1, -32(rp)
+ vmovdqa %ymm2, -64(rp)
+ vmovdqa %ymm3, -96(rp)
+ lea -128(rp), rp
+L(ali): sub $16, n
+ jnc L(top)
+
+L(sma): test $8, R8(n)
+ jz 1f
+ vmovdqu (up), %ymm0
+ vmovdqu -32(up), %ymm1
+ lea -64(up), up
+ vmovdqa %ymm0, (rp)
+ vmovdqa %ymm1, -32(rp)
+ lea -64(rp), rp
+1:
+ test $4, R8(n)
+ jz 1f
+ vmovdqu (up), %ymm0
+ lea -32(up), up
+ vmovdqa %ymm0, (rp)
+ lea -32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ vmovdqu 16(up), %xmm0
+ lea -16(up), up
+ vmovdqa %xmm0, 16(rp)
+ lea -16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov 24(up), %r8
+ mov %r8, 24(rp)
+1:
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(bc): test $4, R8(n)
+ jz 1f
+ mov 24(up), %rax
+ mov 16(up), %rcx
+ mov 8(up), %r8
+ mov (up), %r9
+ lea -32(up), up
+ mov %rax, 24(rp)
+ mov %rcx, 16(rp)
+ mov %r8, 8(rp)
+ mov %r9, (rp)
+ lea -32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ mov 24(up), %rax
+ mov 16(up), %rcx
+ lea -16(up), up
+ mov %rax, 24(rp)
+ mov %rcx, 16(rp)
+ lea -16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov 24(up), %rax
+ mov %rax, 24(rp)
+1:
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm b/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm
new file mode 100644
index 0000000..03c2440
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm
@@ -0,0 +1,178 @@
+dnl AMD64 mpn_copyi optimised for CPUs with fast AVX.
+
+dnl Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 n/a
+C AMD bd2 4.87 4.87 N
+C AMD bd3 ? ?
+C AMD bd4 0.53 ?
+C AMD zn1 0.51 ?
+C AMD zn2 0.25 ? Y
+C AMD zn3 0.25 ? Y
+C AMD bt1 n/a
+C AMD bt2 n/a
+C Intel P4 n/a
+C Intel CNR n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel WSM n/a
+C Intel SBR 0.50 0.91 N
+C Intel IBR 0.50 0.65 N
+C Intel HWL 0.25 0.30 Y
+C Intel BWL 0.28 0.37 Y
+C Intel SKL 0.27 ? Y
+C Intel atom n/a
+C Intel SLM n/a
+C Intel GLM n/a
+C VIA nano n/a
+
+C We try to do as many 32-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. For the bulk copying, we
+C write using aligned 32-byte operations, but we read with both aligned and
+C unaligned 32-byte operations.
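+C
+C As a reference, a sketch (in C-like pseudocode) of what mpn_copyi computes:
+C copy n limbs from up[] to rp[], proceeding from the lowest limb upwards,
+C
+C   for (i = 0; i < n; i++)
+C     rp[i] = up[i];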
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`vmovdqu', vlddqu)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_copyi)
+ FUNC_ENTRY(3)
+
+ cmp $7, n
+ jbe L(bc)
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(a2) C jump if rp aligned
+ mov (up), %rax
+ lea 8(up), up
+ mov %rax, (rp)
+ lea 8(rp), rp
+ dec n
+L(a2): test $16, R8(rp) C is rp 32-byte aligned?
+ jz L(a3) C jump if rp aligned
+ vmovdqu (up), %xmm0
+ lea 16(up), up
+ vmovdqa %xmm0, (rp)
+ lea 16(rp), rp
+ sub $2, n
+L(a3): sub $16, n
+ jc L(sma)
+
+ ALIGN(16)
+L(top): vmovdqu (up), %ymm0
+ vmovdqu 32(up), %ymm1
+ vmovdqu 64(up), %ymm2
+ vmovdqu 96(up), %ymm3
+ lea 128(up), up
+ vmovdqa %ymm0, (rp)
+ vmovdqa %ymm1, 32(rp)
+ vmovdqa %ymm2, 64(rp)
+ vmovdqa %ymm3, 96(rp)
+ lea 128(rp), rp
+L(ali): sub $16, n
+ jnc L(top)
+
+L(sma): test $8, R8(n)
+ jz 1f
+ vmovdqu (up), %ymm0
+ vmovdqu 32(up), %ymm1
+ lea 64(up), up
+ vmovdqa %ymm0, (rp)
+ vmovdqa %ymm1, 32(rp)
+ lea 64(rp), rp
+1:
+ test $4, R8(n)
+ jz 1f
+ vmovdqu (up), %ymm0
+ lea 32(up), up
+ vmovdqa %ymm0, (rp)
+ lea 32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ vmovdqu (up), %xmm0
+ lea 16(up), up
+ vmovdqa %xmm0, (rp)
+ lea 16(rp), rp
+1:
+L(end): test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+1:
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(bc): test $4, R8(n)
+ jz 1f
+ mov (up), %rax
+ mov 8(up), %rcx
+ mov 16(up), %r8
+ mov 24(up), %r9
+ lea 32(up), up
+ mov %rax, (rp)
+ mov %rcx, 8(rp)
+ mov %r8, 16(rp)
+ mov %r9, 24(rp)
+ lea 32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ mov (up), %rax
+ mov 8(up), %rcx
+ lea 16(up), up
+ mov %rax, (rp)
+ mov %rcx, 8(rp)
+ lea 16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov (up), %rax
+ mov %rax, (rp)
+1:
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/README b/gmp-6.3.0/mpn/x86_64/fastsse/README
new file mode 100644
index 0000000..5538b2d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/README
@@ -0,0 +1,22 @@
+This directory contains code for x86-64 processors with fast
+implementations of SSE operations, hence the name "fastsse".
+
+Current processors that might benefit from this code are:
+
+ AMD K10
+ AMD Bulldozer/Piledriver/Steamroller/Excavator
+ Intel Nocona
+ Intel Nehalem/Westmere
+ Intel Sandybridge/Ivybridge
+ Intel Haswell/Broadwell
+ VIA Nano
+
+Current processors that do not benefit from this code are:
+
+ AMD K8
+ AMD Bobcat
+ Intel Atom
+
+Intel Conroe/Penryn is a border case; its handling of non-aligned
+128-bit memory operands is poor. VIA Nano also has poor handling of
+non-aligned operands.
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm b/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm
new file mode 100644
index 0000000..69027bc
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm
@@ -0,0 +1,311 @@
+dnl AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.
+
+dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 2.0 illop 1.0/1.0 N
+C AMD K10 0.85 illop Y/N
+C AMD bd1 1.39 ? 1.45 Y/N
+C AMD bd2 0.8-1.4 0.7-1.4 Y
+C AMD bd3
+C AMD bd4
+C AMD bobcat 1.97 ? 8.17 1.5/1.5 N
+C AMD jaguar 1.02 1.02 0.91/0.91 N
+C Intel P4 2.26 illop Y/N
+C Intel core 0.58 0.87 opt/0.74 Y
+C Intel NHM 0.64 1.14 opt/bad Y
+C Intel SBR 0.51 0.65 opt/opt Y
+C Intel IBR 0.50 0.64 opt/0.57 Y
+C Intel HWL 0.51 0.58 opt/opt Y
+C Intel BWL 0.52 0.64 opt/opt Y
+C Intel SKL 0.51 0.63 opt/opt Y
+C Intel atom 1.16 1.70 opt/opt Y
+C Intel SLM 1.02 1.52 N
+C VIA nano 1.09 1.10 opt/opt Y
+
+C We use only 16-byte operations, except for unaligned top-most and bottom-most
+C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That
+C instruction is better adapted to mpn_copyd's needs; we need to contort the
+C code to use it here.
+C
+C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken
+C from the x86_64 default code.
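+C
+C Sketch of the palignr trick used below (rp 16-byte aligned, up offset by 8
+C bytes): two consecutive aligned 16-byte loads are merged so that the high
+C 8 bytes of the lower load and the low 8 bytes of the higher load form the
+C misaligned 16 bytes actually wanted; that value is complemented with pxor
+C against an all-ones register and stored with an aligned 16-byte write.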
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+C There are three instructions for loading an aligned 128-bit quantity. We use
+C movaps, since it has the shortest coding.
+define(`movdqa', ``movaps'')
+
+ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_com)
+ FUNC_ENTRY(3)
+
+ cmp $COM_SSE_THRESHOLD, n
+ jbe L(bc)
+
+ pcmpeqb %xmm5, %xmm5 C set to 111...111
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(rp_aligned) C jump if rp aligned
+
+ mov (up), %r8
+ lea 8(up), up
+ not %r8
+ mov %r8, (rp)
+ lea 8(rp), rp
+ dec n
+
+L(rp_aligned):
+ test $8, R8(up)
+ jnz L(uent)
+
+ifelse(eval(COM_SSE_THRESHOLD >= 8),1,
+` sub $8, n',
+` jmp L(am)')
+
+ ALIGN(16)
+L(atop):movdqa 0(up), %xmm0
+ movdqa 16(up), %xmm1
+ movdqa 32(up), %xmm2
+ movdqa 48(up), %xmm3
+ lea 64(up), up
+ pxor %xmm5, %xmm0
+ pxor %xmm5, %xmm1
+ pxor %xmm5, %xmm2
+ pxor %xmm5, %xmm3
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ lea 64(rp), rp
+L(am): sub $8, n
+ jnc L(atop)
+
+ test $4, R8(n)
+ jz 1f
+ movdqa (up), %xmm0
+ movdqa 16(up), %xmm1
+ lea 32(up), up
+ pxor %xmm5, %xmm0
+ pxor %xmm5, %xmm1
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ lea 32(rp), rp
+
+1: test $2, R8(n)
+ jz 1f
+ movdqa (up), %xmm0
+ lea 16(up), up
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+
+1: test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ not %r8
+ mov %r8, (rp)
+
+1: FUNC_EXIT()
+ ret
+
+L(uent):
+C Code handling up - rp = 8 (mod 16)
+
+C FIXME: The code below only handles overlap if it is close to complete, or
+C quite separate: up-rp < 5 or up-rp > 15 limbs
+ lea -40(up), %rax C 40 = 5 * GMP_LIMB_BYTES
+ sub rp, %rax
+ cmp $80, %rax C 80 = (15-5) * GMP_LIMB_BYTES
+ jbe L(bc) C deflect to plain loop
+
+ sub $16, n
+ jc L(uend)
+
+ movdqa 120(up), %xmm3
+
+ sub $16, n
+ jmp L(um)
+
+ ALIGN(16)
+L(utop):movdqa 120(up), %xmm3
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, -128(rp)
+ sub $16, n
+L(um): movdqa 104(up), %xmm2
+ palignr($8, %xmm2, %xmm3)
+ movdqa 88(up), %xmm1
+ pxor %xmm5, %xmm3
+ movdqa %xmm3, 112(rp)
+ palignr($8, %xmm1, %xmm2)
+ movdqa 72(up), %xmm0
+ pxor %xmm5, %xmm2
+ movdqa %xmm2, 96(rp)
+ palignr($8, %xmm0, %xmm1)
+ movdqa 56(up), %xmm3
+ pxor %xmm5, %xmm1
+ movdqa %xmm1, 80(rp)
+ palignr($8, %xmm3, %xmm0)
+ movdqa 40(up), %xmm2
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, 64(rp)
+ palignr($8, %xmm2, %xmm3)
+ movdqa 24(up), %xmm1
+ pxor %xmm5, %xmm3
+ movdqa %xmm3, 48(rp)
+ palignr($8, %xmm1, %xmm2)
+ movdqa 8(up), %xmm0
+ pxor %xmm5, %xmm2
+ movdqa %xmm2, 32(rp)
+ palignr($8, %xmm0, %xmm1)
+ movdqa -8(up), %xmm3
+ pxor %xmm5, %xmm1
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm3, %xmm0)
+ lea 128(up), up
+ lea 128(rp), rp
+ jnc L(utop)
+
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, -128(rp)
+
+L(uend):test $8, R8(n)
+ jz 1f
+ movdqa 56(up), %xmm3
+ movdqa 40(up), %xmm2
+ palignr($8, %xmm2, %xmm3)
+ movdqa 24(up), %xmm1
+ pxor %xmm5, %xmm3
+ movdqa %xmm3, 48(rp)
+ palignr($8, %xmm1, %xmm2)
+ movdqa 8(up), %xmm0
+ pxor %xmm5, %xmm2
+ movdqa %xmm2, 32(rp)
+ palignr($8, %xmm0, %xmm1)
+ movdqa -8(up), %xmm3
+ pxor %xmm5, %xmm1
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm3, %xmm0)
+ lea 64(up), up
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, (rp)
+ lea 64(rp), rp
+
+1: test $4, R8(n)
+ jz 1f
+ movdqa 24(up), %xmm1
+ movdqa 8(up), %xmm0
+ palignr($8, %xmm0, %xmm1)
+ movdqa -8(up), %xmm3
+ pxor %xmm5, %xmm1
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm3, %xmm0)
+ lea 32(up), up
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, (rp)
+ lea 32(rp), rp
+
+1: test $2, R8(n)
+ jz 1f
+ movdqa 8(up), %xmm0
+ movdqa -8(up), %xmm3
+ palignr($8, %xmm3, %xmm0)
+ lea 16(up), up
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+
+1: test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ not %r8
+ mov %r8, (rp)
+
+1: FUNC_EXIT()
+ ret
+
+C Basecase code. Needed for good small operands speed, not for
+C correctness as the above code is currently written.
+
+L(bc): lea -8(rp), rp
+ sub $4, R32(n)
+ jc L(end)
+
+ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
+` ALIGN(16)')
+L(top): mov (up), %r8
+ mov 8(up), %r9
+ lea 32(rp), rp
+ mov 16(up), %r10
+ mov 24(up), %r11
+ lea 32(up), up
+ not %r8
+ not %r9
+ not %r10
+ not %r11
+ mov %r8, -24(rp)
+ mov %r9, -16(rp)
+ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
+` sub $4, R32(n)')
+ mov %r10, -8(rp)
+ mov %r11, (rp)
+ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
+` jnc L(top)')
+
+L(end): test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ not %r8
+ mov %r8, 8(rp)
+ lea 8(rp), rp
+ lea 8(up), up
+1: test $2, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov 8(up), %r9
+ not %r8
+ not %r9
+ mov %r8, 8(rp)
+ mov %r9, 16(rp)
+1: FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/com.asm b/gmp-6.3.0/mpn/x86_64/fastsse/com.asm
new file mode 100644
index 0000000..c867222
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/com.asm
@@ -0,0 +1,175 @@
+dnl AMD64 mpn_com optimised for CPUs with fast SSE.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
+dnl Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 2.0 2.0 N
+C AMD K10 0.85 1.3 Y/N
+C AMD bull 1.40 1.40 Y
+C AMD pile 0.9-1.4 0.9-1.4 Y
+C AMD steam
+C AMD excavator
+C AMD bobcat 3.1 3.1 N
+C AMD jaguar 0.91 0.91 opt/opt Y
+C Intel P4 2.28 illop Y
+C Intel core2 1.02 1.02 N
+C Intel NHM 0.53 0.68 Y
+C Intel SBR 0.51 0.75 opt/0.65 Y/N
+C Intel IBR 0.50 0.57 opt/opt Y
+C Intel HWL 0.51 0.64 opt/0.58 Y
+C Intel BWL 0.61 0.65 0.57/opt Y
+C Intel atom 3.68 3.68 N
+C Intel SLM 1.09 1.35 N
+C VIA nano 1.17 5.09 Y/N
+
+C We try to do as many 16-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. We can always write using
+C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
+C operations.
+
+C Instead of having separate loops for reading aligned and unaligned, we read
+C using MOVDQU. This seems to work great except for core2; there performance
+C doubles when reading using MOVDQA (for aligned source). It is unclear how to
+C best handle the unaligned case there.
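+C
+C Reference semantics, as a sketch: complement n limbs,
+C
+C   for (i = 0; i < n; i++)
+C     rp[i] = ~up[i];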
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_com)
+ FUNC_ENTRY(3)
+
+IFDOS(` add $-56, %rsp ')
+IFDOS(` movdqa %xmm6, (%rsp) ')
+IFDOS(` movdqa %xmm7, 16(%rsp) ')
+
+ pcmpeqb %xmm7, %xmm7 C set to 111...111
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(ali) C jump if rp aligned
+ mov (up), %rax
+ lea 8(up), up
+ not %rax
+ mov %rax, (rp)
+ lea 8(rp), rp
+ dec n
+
+ sub $14, n
+ jc L(sma)
+
+ ALIGN(16)
+L(top): movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ movdqu 32(up), %xmm2
+ movdqu 48(up), %xmm3
+ movdqu 64(up), %xmm4
+ movdqu 80(up), %xmm5
+ movdqu 96(up), %xmm6
+ lea 112(up), up
+ pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm1
+ pxor %xmm7, %xmm2
+ pxor %xmm7, %xmm3
+ pxor %xmm7, %xmm4
+ pxor %xmm7, %xmm5
+ pxor %xmm7, %xmm6
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ movdqa %xmm4, 64(rp)
+ movdqa %xmm5, 80(rp)
+ movdqa %xmm6, 96(rp)
+ lea 112(rp), rp
+L(ali): sub $14, n
+ jnc L(top)
+
+L(sma): add $14, n
+ test $8, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ movdqu 32(up), %xmm2
+ movdqu 48(up), %xmm3
+ lea 64(up), up
+ pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm1
+ pxor %xmm7, %xmm2
+ pxor %xmm7, %xmm3
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ lea 64(rp), rp
+1:
+ test $4, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ lea 32(up), up
+ pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm1
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ lea 32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ lea 16(up), up
+ pxor %xmm7, %xmm0
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov (up), %rax
+ not %rax
+ mov %rax, (rp)
+1:
+L(don):
+IFDOS(` movdqa (%rsp), %xmm6 ')
+IFDOS(` movdqa 16(%rsp), %xmm7 ')
+IFDOS(` add $56, %rsp ')
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm
new file mode 100644
index 0000000..fac6f8a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm
@@ -0,0 +1,254 @@
+dnl AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3.
+
+dnl Copyright 2012, 2015 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 2.0 illop 1.0/1.0 N
+C AMD K10 0.85 illop Y/N
+C AMD bull 0.70 0.70 Y
+C AMD pile 0.68 0.68 Y
+C AMD steam
+C AMD excavator
+C AMD bobcat 1.97 8.24 1.5/1.5 N
+C AMD jaguar 0.77 0.89 0.65/opt N/Y
+C Intel P4 2.26 illop Y/N
+C Intel core 0.52 0.80 opt/opt Y
+C Intel NHM 0.52 0.64 opt/opt Y
+C Intel SBR 0.51 0.51 opt/opt Y
+C Intel IBR 0.50 0.50 opt/opt Y
+C Intel HWL 0.50 0.51 opt/opt Y
+C Intel BWL 0.55 0.55 opt/opt Y
+C Intel atom 1.16 1.66 opt/opt Y
+C Intel SLM 1.02 1.04 opt/opt Y
+C VIA nano 1.08 1.06 opt/opt Y
+
+C We use only 16-byte operations, except for unaligned top-most and bottom-most
+C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).
+C
+C For operands of < COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
+C taken from the x86_64 default code.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+C There are three instructions for loading an aligned 128-bit quantity. We use
+C movaps, since it has the shortest coding.
+define(`movdqa', ``movaps'')
+
+ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_copyd)
+ FUNC_ENTRY(3)
+
+ lea -8(up,n,8), up
+ lea -8(rp,n,8), rp
+
+ cmp $COPYD_SSE_THRESHOLD, n
+ jbe L(bc)
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jnz L(rp_aligned) C jump if rp aligned
+
+ mov (up), %rax C copy one limb
+ mov %rax, (rp)
+ lea -8(up), up
+ lea -8(rp), rp
+ dec n
+
+L(rp_aligned):
+ test $8, R8(up)
+ jz L(uent)
+
+ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
+` sub $8, n',
+` jmp L(am)')
+
+ ALIGN(16)
+L(atop):movdqa -8(up), %xmm0
+ movdqa -24(up), %xmm1
+ movdqa -40(up), %xmm2
+ movdqa -56(up), %xmm3
+ lea -64(up), up
+ movdqa %xmm0, -8(rp)
+ movdqa %xmm1, -24(rp)
+ movdqa %xmm2, -40(rp)
+ movdqa %xmm3, -56(rp)
+ lea -64(rp), rp
+L(am): sub $8, n
+ jnc L(atop)
+
+ test $4, R8(n)
+ jz 1f
+ movdqa -8(up), %xmm0
+ movdqa -24(up), %xmm1
+ lea -32(up), up
+ movdqa %xmm0, -8(rp)
+ movdqa %xmm1, -24(rp)
+ lea -32(rp), rp
+
+1: test $2, R8(n)
+ jz 1f
+ movdqa -8(up), %xmm0
+ lea -16(up), up
+ movdqa %xmm0, -8(rp)
+ lea -16(rp), rp
+
+1: test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+
+1: FUNC_EXIT()
+ ret
+
+L(uent):sub $16, n
+ movdqa (up), %xmm0
+ jc L(uend)
+
+ ALIGN(16)
+L(utop):sub $16, n
+ movdqa -16(up), %xmm1
+ palignr($8, %xmm1, %xmm0)
+ movdqa %xmm0, -8(rp)
+ movdqa -32(up), %xmm2
+ palignr($8, %xmm2, %xmm1)
+ movdqa %xmm1, -24(rp)
+ movdqa -48(up), %xmm3
+ palignr($8, %xmm3, %xmm2)
+ movdqa %xmm2, -40(rp)
+ movdqa -64(up), %xmm0
+ palignr($8, %xmm0, %xmm3)
+ movdqa %xmm3, -56(rp)
+ movdqa -80(up), %xmm1
+ palignr($8, %xmm1, %xmm0)
+ movdqa %xmm0, -72(rp)
+ movdqa -96(up), %xmm2
+ palignr($8, %xmm2, %xmm1)
+ movdqa %xmm1, -88(rp)
+ movdqa -112(up), %xmm3
+ palignr($8, %xmm3, %xmm2)
+ movdqa %xmm2, -104(rp)
+ movdqa -128(up), %xmm0
+ palignr($8, %xmm0, %xmm3)
+ movdqa %xmm3, -120(rp)
+ lea -128(up), up
+ lea -128(rp), rp
+ jnc L(utop)
+
+L(uend):test $8, R8(n)
+ jz 1f
+ movdqa -16(up), %xmm1
+ palignr($8, %xmm1, %xmm0)
+ movdqa %xmm0, -8(rp)
+ movdqa -32(up), %xmm0
+ palignr($8, %xmm0, %xmm1)
+ movdqa %xmm1, -24(rp)
+ movdqa -48(up), %xmm1
+ palignr($8, %xmm1, %xmm0)
+ movdqa %xmm0, -40(rp)
+ movdqa -64(up), %xmm0
+ palignr($8, %xmm0, %xmm1)
+ movdqa %xmm1, -56(rp)
+ lea -64(up), up
+ lea -64(rp), rp
+
+1: test $4, R8(n)
+ jz 1f
+ movdqa -16(up), %xmm1
+ palignr($8, %xmm1, %xmm0)
+ movdqa %xmm0, -8(rp)
+ movdqa -32(up), %xmm0
+ palignr($8, %xmm0, %xmm1)
+ movdqa %xmm1, -24(rp)
+ lea -32(up), up
+ lea -32(rp), rp
+
+1: test $2, R8(n)
+ jz 1f
+ movdqa -16(up), %xmm1
+ palignr($8, %xmm1, %xmm0)
+ movdqa %xmm0, -8(rp)
+ lea -16(up), up
+ lea -16(rp), rp
+
+1: test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+
+1: FUNC_EXIT()
+ ret
+
+C Basecase code. Needed for good small operands speed, not for
+C correctness as the above code is currently written.
+
+L(bc): sub $4, R32(n)
+ jc L(end)
+
+ ALIGN(16)
+L(top): mov (up), %r8
+ mov -8(up), %r9
+ lea -32(rp), rp
+ mov -16(up), %r10
+ mov -24(up), %r11
+ lea -32(up), up
+ mov %r8, 32(rp)
+ mov %r9, 24(rp)
+ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
+` sub $4, R32(n)')
+ mov %r10, 16(rp)
+ mov %r11, 8(rp)
+ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
+` jnc L(top)')
+
+L(end): test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+ lea -8(rp), rp
+ lea -8(up), up
+1: test $2, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov -8(up), %r9
+ mov %r8, (rp)
+ mov %r9, -8(rp)
+1: FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm
new file mode 100644
index 0000000..b3c4706
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm
@@ -0,0 +1,166 @@
+dnl AMD64 mpn_copyd optimised for CPUs with fast SSE.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
+dnl Inc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9
+C AMD K10 0.85 1.64 Y/N
+C AMD bull 1.4 1.4 Y
+C AMD pile 0.68 0.98 Y/N
+C AMD steam
+C AMD excavator
+C AMD bobcat
+C AMD jaguar 0.65 1.02 opt/0.93 Y/N
+C Intel P4 2.3 2.3 Y
+C Intel core 1.0 1.0 0.52/0.80 N
+C Intel NHM 0.5 0.67 Y
+C Intel SBR 0.51 0.75 opt/0.54 Y/N
+C Intel IBR 0.50 0.57 opt/0.50 Y
+C Intel HWL 0.50 0.57 opt/0.51 Y
+C Intel BWL 0.55 0.62 opt/0.55 Y
+C Intel atom
+C Intel SLM 1.02 1.27 opt/1.04 Y/N
+C VIA nano 1.16 5.16 Y/N
+
+C We try to do as many 16-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. We can always write using
+C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
+C operations.
+
+C Instead of having separate loops for reading aligned and unaligned, we read
+C using MOVDQU. This seems to work great except for core2; there performance
+C doubles when reading using MOVDQA (for aligned source). It is unclear how to
+C best handle the unaligned case there.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`movdqu', lddqu)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_copyd)
+ FUNC_ENTRY(3)
+
+ test n, n
+ jz L(don)
+
+ lea -16(rp,n,8), rp
+ lea -16(up,n,8), up
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(ali) C jump if rp aligned
+ mov 8(up), %rax
+ lea -8(up), up
+ mov %rax, 8(rp)
+ lea -8(rp), rp
+ dec n
+
+L(ali): sub $16, n
+ jc L(sma)
+
+IFDOS(` add $-56, %rsp ')
+IFDOS(` movdqa %xmm6, (%rsp) ')
+IFDOS(` movdqa %xmm7, 16(%rsp) ')
+
+ ALIGN(16)
+L(top): movdqu (up), %xmm0
+ movdqu -16(up), %xmm1
+ movdqu -32(up), %xmm2
+ movdqu -48(up), %xmm3
+ movdqu -64(up), %xmm4
+ movdqu -80(up), %xmm5
+ movdqu -96(up), %xmm6
+ movdqu -112(up), %xmm7
+ lea -128(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, -16(rp)
+ movdqa %xmm2, -32(rp)
+ movdqa %xmm3, -48(rp)
+ movdqa %xmm4, -64(rp)
+ movdqa %xmm5, -80(rp)
+ movdqa %xmm6, -96(rp)
+ movdqa %xmm7, -112(rp)
+ lea -128(rp), rp
+ sub $16, n
+ jnc L(top)
+
+IFDOS(` movdqa (%rsp), %xmm6 ')
+IFDOS(` movdqa 16(%rsp), %xmm7 ')
+IFDOS(` add $56, %rsp ')
+
+L(sma): test $8, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu -16(up), %xmm1
+ movdqu -32(up), %xmm2
+ movdqu -48(up), %xmm3
+ lea -64(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, -16(rp)
+ movdqa %xmm2, -32(rp)
+ movdqa %xmm3, -48(rp)
+ lea -64(rp), rp
+1:
+ test $4, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu -16(up), %xmm1
+ lea -32(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, -16(rp)
+ lea -32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ lea -16(up), up
+ movdqa %xmm0, (rp)
+ lea -16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov 8(up), %r8
+ mov %r8, 8(rp)
+1:
+L(don): FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm
new file mode 100644
index 0000000..9876a47
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm
@@ -0,0 +1,300 @@
+dnl AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.
+
+dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 2.0 illop 1.0/1.0 N
+C AMD K10 0.85 illop Y/N
+C AMD bd1 0.70 0.66 Y
+C AMD bd2 0.68 0.66 Y
+C AMD bd3 ? ?
+C AMD bd4 ? ?
+C AMD bt1 1.97 8.16 1.5/1.5 N
+C AMD bt2 0.77 0.93 0.65/opt N/Y
+C AMD zn1 ? ?
+C AMD zn2 ? ?
+C Intel P4 2.26 illop Y/N
+C Intel CNR 0.52 0.64 opt/opt Y
+C Intel NHM 0.52 0.71 0.50/0.67 N
+C Intel SBR 0.51 0.54 opt/0.51 Y
+C Intel IBR 0.50 0.54 opt/opt Y
+C Intel HWL 0.50 0.51 opt/opt Y
+C Intel BWL 0.55 0.55 opt/opt Y
+C Intel atom 1.16 1.61 opt/opt Y
+C Intel SLM 1.02 1.07 opt/opt Y
+C VIA nano 1.09 1.08 opt/opt Y
+
+C We use only 16-byte operations, except for unaligned top-most and bottom-most
+C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That
+C instruction is better adapted to mpn_copyd's needs; we need to contort the
+C code to use it here.
+C
+C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
+C taken from the x86_64 default code.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+C There are three instructions for loading an aligned 128-bit quantity. We use
+C movaps, since it has the shortest coding.
+dnl define(`movdqa', ``movaps'')
+
+ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_copyi)
+ FUNC_ENTRY(3)
+
+ cmp $COPYI_SSE_THRESHOLD, n
+ jbe L(bc)
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(rp_aligned) C jump if rp aligned
+
+ movsq C copy one limb
+ dec n
+
+L(rp_aligned):
+ test $8, R8(up)
+ jnz L(uent)
+
+ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+` sub $8, n',
+` jmp L(am)')
+
+ ALIGN(16)
+L(atop):movdqa 0(up), %xmm0
+ movdqa 16(up), %xmm1
+ movdqa 32(up), %xmm2
+ movdqa 48(up), %xmm3
+ lea 64(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ lea 64(rp), rp
+L(am): sub $8, n
+ jnc L(atop)
+
+ test $4, R8(n)
+ jz 1f
+ movdqa (up), %xmm0
+ movdqa 16(up), %xmm1
+ lea 32(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ lea 32(rp), rp
+
+1: test $2, R8(n)
+ jz 1f
+ movdqa (up), %xmm0
+ lea 16(up), up
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+
+1: test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+
+1: FUNC_EXIT()
+ ret
+
+L(uent):
+C Code handling up - rp = 8 (mod 16)
+
+ cmp $16, n
+ jc L(ued0)
+
+IFDOS(` add $-56, %rsp ')
+IFDOS(` movdqa %xmm6, (%rsp) ')
+IFDOS(` movdqa %xmm7, 16(%rsp) ')
+IFDOS(` movdqa %xmm8, 32(%rsp) ')
+
+ movaps 120(up), %xmm7
+ movaps 104(up), %xmm6
+ movaps 88(up), %xmm5
+ movaps 72(up), %xmm4
+ movaps 56(up), %xmm3
+ movaps 40(up), %xmm2
+ lea 128(up), up
+ sub $32, n
+ jc L(ued1)
+
+ ALIGN(16)
+L(utop):movaps -104(up), %xmm1
+ sub $16, n
+ movaps -120(up), %xmm0
+ palignr($8, %xmm6, %xmm7)
+ movaps -136(up), %xmm8
+ movdqa %xmm7, 112(rp)
+ palignr($8, %xmm5, %xmm6)
+ movaps 120(up), %xmm7
+ movdqa %xmm6, 96(rp)
+ palignr($8, %xmm4, %xmm5)
+ movaps 104(up), %xmm6
+ movdqa %xmm5, 80(rp)
+ palignr($8, %xmm3, %xmm4)
+ movaps 88(up), %xmm5
+ movdqa %xmm4, 64(rp)
+ palignr($8, %xmm2, %xmm3)
+ movaps 72(up), %xmm4
+ movdqa %xmm3, 48(rp)
+ palignr($8, %xmm1, %xmm2)
+ movaps 56(up), %xmm3
+ movdqa %xmm2, 32(rp)
+ palignr($8, %xmm0, %xmm1)
+ movaps 40(up), %xmm2
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm8, %xmm0)
+ lea 128(up), up
+ movdqa %xmm0, (rp)
+ lea 128(rp), rp
+ jnc L(utop)
+
+L(ued1):movaps -104(up), %xmm1
+ movaps -120(up), %xmm0
+ movaps -136(up), %xmm8
+ palignr($8, %xmm6, %xmm7)
+ movdqa %xmm7, 112(rp)
+ palignr($8, %xmm5, %xmm6)
+ movdqa %xmm6, 96(rp)
+ palignr($8, %xmm4, %xmm5)
+ movdqa %xmm5, 80(rp)
+ palignr($8, %xmm3, %xmm4)
+ movdqa %xmm4, 64(rp)
+ palignr($8, %xmm2, %xmm3)
+ movdqa %xmm3, 48(rp)
+ palignr($8, %xmm1, %xmm2)
+ movdqa %xmm2, 32(rp)
+ palignr($8, %xmm0, %xmm1)
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm8, %xmm0)
+ movdqa %xmm0, (rp)
+ lea 128(rp), rp
+
+IFDOS(` movdqa (%rsp), %xmm6 ')
+IFDOS(` movdqa 16(%rsp), %xmm7 ')
+IFDOS(` movdqa 32(%rsp), %xmm8 ')
+IFDOS(` add $56, %rsp ')
+
+L(ued0):test $8, R8(n)
+ jz 1f
+ movaps 56(up), %xmm3
+ movaps 40(up), %xmm2
+ movaps 24(up), %xmm1
+ movaps 8(up), %xmm0
+ movaps -8(up), %xmm4
+ palignr($8, %xmm2, %xmm3)
+ movdqa %xmm3, 48(rp)
+ palignr($8, %xmm1, %xmm2)
+ movdqa %xmm2, 32(rp)
+ palignr($8, %xmm0, %xmm1)
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm4, %xmm0)
+ lea 64(up), up
+ movdqa %xmm0, (rp)
+ lea 64(rp), rp
+
+1: test $4, R8(n)
+ jz 1f
+ movaps 24(up), %xmm1
+ movaps 8(up), %xmm0
+ palignr($8, %xmm0, %xmm1)
+ movaps -8(up), %xmm3
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm3, %xmm0)
+ lea 32(up), up
+ movdqa %xmm0, (rp)
+ lea 32(rp), rp
+
+1: test $2, R8(n)
+ jz 1f
+ movdqa 8(up), %xmm0
+ movdqa -8(up), %xmm3
+ palignr($8, %xmm3, %xmm0)
+ lea 16(up), up
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+
+1: test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+
+1: FUNC_EXIT()
+ ret
+
+C Basecase code. Needed for good small operands speed, not for
+C correctness as the above code is currently written.
+
+L(bc): lea -8(rp), rp
+ sub $4, R32(n)
+ jc L(end)
+
+ ALIGN(16)
+L(top): mov (up), %r8
+ mov 8(up), %r9
+ lea 32(rp), rp
+ mov 16(up), %r10
+ mov 24(up), %r11
+ lea 32(up), up
+ mov %r8, -24(rp)
+ mov %r9, -16(rp)
+ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+` sub $4, R32(n)')
+ mov %r10, -8(rp)
+ mov %r11, (rp)
+ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+` jnc L(top)')
+
+L(end): test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, 8(rp)
+ lea 8(rp), rp
+ lea 8(up), up
+1: test $2, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov 8(up), %r9
+ mov %r8, 8(rp)
+ mov %r9, 16(rp)
+1: FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm
new file mode 100644
index 0000000..97f7865
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm
@@ -0,0 +1,185 @@
+dnl AMD64 mpn_copyi optimised for CPUs with fast SSE.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
+dnl Inc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9
+C AMD K10 0.85 1.64 Y/N
+C AMD bull 1.4 1.4 N
+C AMD pile 0.77 0.93 N
+C AMD steam ? ?
+C AMD excavator ? ?
+C AMD bobcat
+C AMD jaguar 0.65 1.02 opt/0.93 Y/N
+C Intel P4 2.3 2.3 Y
+C Intel core 1.0 1.0 0.52/0.64 N
+C Intel NHM 0.5 0.67 Y
+C Intel SBR 0.51 0.75 opt/0.54 Y/N
+C Intel IBR 0.50 0.57 opt/0.54 Y
+C Intel HWL 0.50 0.57 opt/0.51 Y
+C Intel BWL 0.55 0.62 opt/0.55 Y
+C Intel atom
+C Intel SLM 1.02 1.27 opt/1.07 Y/N
+C VIA nano 1.16 5.16 Y/N
+
+C We try to do as many 16-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. We can always write using
+C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
+C operations.
+
+C Instead of having separate loops for reading aligned and unaligned, we read
+C using MOVDQU. This seems to work great except for core2; there performance
+C doubles when reading using MOVDQA (for aligned source). It is unclear how to
+C best handle the unaligned case there.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`movdqu', lddqu)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_copyi)
+ FUNC_ENTRY(3)
+
+ cmp $3, n C NB: bc code below assumes this limit
+ jc L(bc)
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(ali) C jump if rp aligned
+ movsq C copy single limb
+ dec n
+
+L(ali): sub $16, n
+ jc L(sma)
+
+IFDOS(` add $-56, %rsp ')
+IFDOS(` movdqa %xmm6, (%rsp) ')
+IFDOS(` movdqa %xmm7, 16(%rsp) ')
+
+ ALIGN(16)
+L(top): movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ movdqu 32(up), %xmm2
+ movdqu 48(up), %xmm3
+ movdqu 64(up), %xmm4
+ movdqu 80(up), %xmm5
+ movdqu 96(up), %xmm6
+ movdqu 112(up), %xmm7
+ lea 128(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ movdqa %xmm4, 64(rp)
+ movdqa %xmm5, 80(rp)
+ movdqa %xmm6, 96(rp)
+ movdqa %xmm7, 112(rp)
+ lea 128(rp), rp
+ sub $16, n
+ jnc L(top)
+
+IFDOS(` movdqa (%rsp), %xmm6 ')
+IFDOS(` movdqa 16(%rsp), %xmm7 ')
+IFDOS(` add $56, %rsp ')
+
+L(sma): test $8, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ movdqu 32(up), %xmm2
+ movdqu 48(up), %xmm3
+ lea 64(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ lea 64(rp), rp
+1:
+ test $4, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ lea 32(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ lea 32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ lea 16(up), up
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+ ALIGN(16)
+1:
+L(end): test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+1:
+ FUNC_EXIT()
+ ret
+
+C Basecase code. Needed for good small operands speed, not for correctness as
+C the above code is currently written. The commented-out lines need to be
+C reinstated if this code is to be used for n > 3, and then the post-loop
+C offsets need fixing.
+
+L(bc): sub $2, n
+ jc L(end)
+ ALIGN(16)
+1: mov (up), %rax
+ mov 8(up), %rcx
+dnl lea 16(up), up
+ mov %rax, (rp)
+ mov %rcx, 8(rp)
+dnl lea 16(rp), rp
+dnl sub $2, n
+dnl jnc 1b
+
+ test $1, R8(n)
+ jz L(ret)
+ mov 16(up), %rax
+ mov %rax, 16(rp)
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm
new file mode 100644
index 0000000..a05e850
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm
@@ -0,0 +1,182 @@
+dnl AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 3 3 2.35 no, use shl/shr
+C AMD K10 1.5-1.8 1.5-1.8 1.33 yes
+C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes
+C AMD bobcat 3.17 3.17 yes, bad for n < 20
+C Intel P4 4.67 4.67 2.7 no, slow movdqu
+C Intel core2 2.15 2.15 1.25 no, use shld/shrd
+C Intel NHM 1.66 1.66 1.25 no, use shld/shrd
+C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6
+C Intel atom 11.7 11.7 4.5 no
+C VIA nano 5.7 5.95 2.0 no, slow movdqu
+
+C We try to do as many aligned 16-byte operations as possible. The top-most
+C and bottom-most writes might need 8-byte operations.
+C
+C This variant relies on the fast movdqu load, and uses it even for aligned operands,
+C in order to avoid the need for two separate loops.
+C
+C TODO
+C * Could 2-limb wind-down code be simplified?
+C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
+C for other affected CPUs.
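+C
+C Reference semantics, as a sketch (assuming 64-bit limbs and 0 < cnt < 64):
+C
+C   retval = up[n-1] >> (64 - cnt);
+C   for (i = n-1; i > 0; i--)
+C     rp[i] = (up[i] << cnt) | (up[i-1] >> (64 - cnt));
+C   rp[0] = up[0] << cnt;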
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_lshift)
+ FUNC_ENTRY(4)
+ movd R32(%rcx), %xmm4
+ mov $64, R32(%rax)
+ sub R32(%rcx), R32(%rax)
+ movd R32(%rax), %xmm5
+
+ neg R32(%rcx)
+ mov -8(ap,n,8), %rax
+ shr R8(%rcx), %rax
+
+ cmp $3, n
+ jle L(bc)
+
+ lea (rp,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ jz L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+ movq -8(ap,n,8), %xmm0
+ movq -16(ap,n,8), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movq %xmm0, -8(rp,n,8)
+ dec n
+
+L(rp_aligned):
+ lea 1(n), %r8d
+
+ and $6, R32(%r8)
+ jz L(ba0)
+ cmp $4, R32(%r8)
+ jz L(ba4)
+ jc L(ba2)
+L(ba6): add $-4, n
+ jmp L(i56)
+L(ba0): add $-6, n
+ jmp L(i70)
+L(ba4): add $-2, n
+ jmp L(i34)
+L(ba2): add $-8, n
+ jle L(end)
+
+ ALIGN(16)
+L(top): movdqu 40(ap,n,8), %xmm1
+ movdqu 48(ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, 48(rp,n,8)
+L(i70):
+ movdqu 24(ap,n,8), %xmm1
+ movdqu 32(ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, 32(rp,n,8)
+L(i56):
+ movdqu 8(ap,n,8), %xmm1
+ movdqu 16(ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, 16(rp,n,8)
+L(i34):
+ movdqu -8(ap,n,8), %xmm1
+ movdqu (ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, (rp,n,8)
+ sub $8, n
+ jg L(top)
+
+L(end): test $1, R8(n)
+ jnz L(end8)
+
+ movdqu (ap), %xmm1
+ pxor %xmm0, %xmm0
+ punpcklqdq %xmm1, %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movdqa %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+
+C Basecase
+ ALIGN(16)
+L(bc): dec R32(n)
+ jz L(end8)
+
+ movq (ap,n,8), %xmm1
+ movq -8(ap,n,8), %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movq %xmm0, (rp,n,8)
+ sub $2, R32(n)
+ jl L(end8)
+ movq 8(ap), %xmm1
+ movq (ap), %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movq %xmm0, 8(rp)
+
+L(end8):movq (ap), %xmm0
+ psllq %xmm4, %xmm0
+ movq %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm
new file mode 100644
index 0000000..6a17b93
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm
@@ -0,0 +1,173 @@
+dnl AMD64 mpn_lshift optimised for CPUs with fast SSE.
+
+dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
+
+dnl Copyright 2010-2012, 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb cycles/limb good
+C 16-byte aligned 16-byte unaligned for cpu?
+C AMD K8,K9 ? ?
+C AMD K10 1.68 (1.45) 1.75 (1.49) Y
+C AMD bd1 1.82 (1.75) 1.82 (1.75) Y
+C AMD bobcat 4 4
+C Intel P4 3 (2.7) 3 (2.7) Y
+C Intel core2 2.05 (1.67) 2.55 (1.75)
+C Intel NHM 2.05 (1.75) 2.09 (2)
+C Intel SBR 1.5 (1.3125) 1.5 (1.4375) Y
+C Intel atom ? ?
+C VIA nano 2.25 (2) 2.5 (2) Y
+
+C We try to do as many 16-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations.
+
+C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
+C not true. The aligned case reads 16+8 bytes, the unaligned case reads
+C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
+
+C This is not yet great code:
+C (1) The unaligned case makes many reads.
+C (2) We should do some unrolling, at least 2-way.
+C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
+C Nano.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_lshift)
+ FUNC_ENTRY(4)
+ movd R32(%rcx), %xmm4
+ mov $64, R32(%rax)
+ sub R32(%rcx), R32(%rax)
+ movd R32(%rax), %xmm5
+
+ neg R32(%rcx)
+ mov -8(ap,n,8), %rax
+ shr R8(%rcx), %rax
+
+ cmp $2, n
+ jle L(le2)
+
+ lea (rp,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ je L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+ movq -8(ap,n,8), %xmm0
+ movq -16(ap,n,8), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movq %xmm0, -8(rp,n,8)
+ dec n
+
+L(rp_aligned):
+ lea (ap,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ je L(aent)
+ jmp L(uent)
+C *****************************************************************************
+
+C Handle the case when ap != rp (mod 16).
+
+ ALIGN(16)
+L(utop):movdqa -8(ap,n,8), %xmm0
+ movq (ap,n,8), %xmm1
+ punpcklqdq 8(ap,n,8), %xmm1
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movdqa %xmm0, (rp,n,8)
+L(uent):sub $2, n
+ ja L(utop)
+
+ jne L(end8)
+
+ movq (ap), %xmm1
+ pxor %xmm0, %xmm0
+ punpcklqdq %xmm1, %xmm0
+ punpcklqdq 8(ap), %xmm1
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movdqa %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+C *****************************************************************************
+
+C Handle the case when ap = rp (mod 16).
+
+ ALIGN(16)
+L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2]
+ movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3]
+ punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3]
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, (rp,n,8)
+L(aent):
+ sub $2, n
+ ja L(atop)
+ jne L(end8)
+
+ movdqa (ap), %xmm1
+ pxor %xmm0, %xmm0
+ punpcklqdq %xmm1, %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movdqa %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+C *****************************************************************************
+
+ ALIGN(16)
+L(le2): jne L(end8)
+
+ movq 8(ap), %xmm0
+ movq (ap), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movq %xmm0, 8(rp)
+
+L(end8):movq (ap), %xmm0
+ psllq %xmm4, %xmm0
+ movq %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm
new file mode 100644
index 0000000..8250910
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm
@@ -0,0 +1,193 @@
+dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 3 3 ? no, use shl/shr
+C AMD K10 1.8-2.0 1.8-2.0 ? yes
+C AMD bd1 1.9 1.9 ? yes
+C AMD bobcat 3.67 3.67 yes, bad for n < 20
+C Intel P4 4.75 4.75 ? no, slow movdqu
+C Intel core2 2.27 2.27 ? no, use shld/shrd
+C Intel NHM 2.15 2.15 ? no, use shld/shrd
+C Intel SBR 1.45 1.45 ? yes, bad for n = 4-6
+C Intel atom 12.9 12.9 ? no
+C VIA nano 6.18 6.44 ? no, slow movdqu
+
+C We try to do as many aligned 16-byte operations as possible. The top-most
+C and bottom-most writes might need 8-byte operations.
+C
+C This variant relies on the fast movdqu load, and uses it even for aligned operands,
+C in order to avoid the need for two separate loops.
+C
+C TODO
+C * Could 2-limb wind-down code be simplified?
+C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
+C for other affected CPUs.
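+C
+C Reference semantics, as a sketch (assuming 64-bit limbs and 0 < cnt < 64):
+C mpn_lshiftc shifts like mpn_lshift but stores the one's complement,
+C
+C   retval = up[n-1] >> (64 - cnt);
+C   for (i = n-1; i > 0; i--)
+C     rp[i] = ~((up[i] << cnt) | (up[i-1] >> (64 - cnt)));
+C   rp[0] = ~(up[0] << cnt);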
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_lshiftc)
+ FUNC_ENTRY(4)
+ movd R32(%rcx), %xmm4
+ mov $64, R32(%rax)
+ sub R32(%rcx), R32(%rax)
+ movd R32(%rax), %xmm5
+
+ neg R32(%rcx)
+ mov -8(ap,n,8), %rax
+ shr R8(%rcx), %rax
+
+ pcmpeqb %xmm3, %xmm3 C set to 111...111
+
+ cmp $3, n
+ jle L(bc)
+
+ lea (rp,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ jz L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+ movq -8(ap,n,8), %xmm0
+ movq -16(ap,n,8), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movq %xmm0, -8(rp,n,8)
+ dec n
+
+L(rp_aligned):
+ lea 1(n), %r8d
+
+ and $6, R32(%r8)
+ jz L(ba0)
+ cmp $4, R32(%r8)
+ jz L(ba4)
+ jc L(ba2)
+L(ba6): add $-4, n
+ jmp L(i56)
+L(ba0): add $-6, n
+ jmp L(i70)
+L(ba4): add $-2, n
+ jmp L(i34)
+L(ba2): add $-8, n
+ jle L(end)
+
+ ALIGN(16)
+L(top): movdqu 40(ap,n,8), %xmm1
+ movdqu 48(ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movdqa %xmm0, 48(rp,n,8)
+L(i70):
+ movdqu 24(ap,n,8), %xmm1
+ movdqu 32(ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movdqa %xmm0, 32(rp,n,8)
+L(i56):
+ movdqu 8(ap,n,8), %xmm1
+ movdqu 16(ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movdqa %xmm0, 16(rp,n,8)
+L(i34):
+ movdqu -8(ap,n,8), %xmm1
+ movdqu (ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movdqa %xmm0, (rp,n,8)
+ sub $8, n
+ jg L(top)
+
+L(end): test $1, R8(n)
+ jnz L(end8)
+
+ movdqu (ap), %xmm1
+ pxor %xmm0, %xmm0
+ punpcklqdq %xmm1, %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movdqa %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+
+C Basecase
+ ALIGN(16)
+L(bc): dec R32(n)
+ jz L(end8)
+
+ movq (ap,n,8), %xmm1
+ movq -8(ap,n,8), %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movq %xmm0, (rp,n,8)
+ sub $2, R32(n)
+ jl L(end8)
+ movq 8(ap), %xmm1
+ movq (ap), %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movq %xmm0, 8(rp)
+
+L(end8):movq (ap), %xmm0
+ psllq %xmm4, %xmm0
+ pxor %xmm3, %xmm0
+ movq %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm
new file mode 100644
index 0000000..a616075
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm
@@ -0,0 +1,183 @@
+dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
+
+dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
+
+dnl Copyright 2010-2012, 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                cycles/limb        cycles/limb        good
+C                16-byte aligned    16-byte unaligned  for cpu?
+C AMD K8,K9      ?                  ?
+C AMD K10        1.85 (1.635)       1.9  (1.67)        Y
+C AMD bd1        1.82 (1.75)        1.82 (1.75)        Y
+C AMD bobcat     4.5                4.5
+C Intel P4       3.6  (3.125)       3.6  (3.125)       Y
+C Intel core2    2.05 (1.67)        2.55 (1.75)
+C Intel NHM      2.05 (1.875)       2.6  (2.25)
+C Intel SBR      1.55 (1.44)        2    (1.57)        Y
+C Intel atom     ?                  ?
+C VIA nano       2.5  (2.5)         2.5  (2.5)         Y
+
+C We try to do as many 16-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.  We always write using
+C 16-byte operations; we read with both 8-byte and 16-byte operations.
+
+C There are two inner loops, one for when rp = ap (mod 16) and one for when it
+C is not.  The aligned case reads 16+8 bytes; the unaligned case reads
+C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
+
+C This is not yet great code:
+C (1) The unaligned case makes too many reads.
+C (2) We should do some unrolling, at least 2-way.
+C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
+C Nano.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshiftc)
+ FUNC_ENTRY(4)
+ movd R32(%rcx), %xmm4
+ mov $64, R32(%rax)
+ sub R32(%rcx), R32(%rax)
+ movd R32(%rax), %xmm5
+
+ neg R32(%rcx)
+ mov -8(ap,n,8), %rax
+ shr R8(%rcx), %rax
+
+ pcmpeqb %xmm2, %xmm2 C set to 111...111
+
+ cmp $2, n
+ jle L(le2)
+
+ lea (rp,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ je L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+ movq -8(ap,n,8), %xmm0
+ movq -16(ap,n,8), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ movq %xmm0, -8(rp,n,8)
+ dec n
+
+L(rp_aligned):
+ lea (ap,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ je L(aent)
+ jmp L(uent)
+C *****************************************************************************
+
+C Handle the case when ap != rp (mod 16).
+
+ ALIGN(16)
+L(utop):movq (ap,n,8), %xmm1
+ punpcklqdq 8(ap,n,8), %xmm1
+ movdqa -8(ap,n,8), %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ movdqa %xmm0, (rp,n,8)
+L(uent):sub $2, n
+ ja L(utop)
+
+ jne L(end8)
+
+ movq (ap), %xmm1
+ pxor %xmm0, %xmm0
+ punpcklqdq %xmm1, %xmm0
+ punpcklqdq 8(ap), %xmm1
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ movdqa %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+C *****************************************************************************
+
+C Handle the case when ap = rp (mod 16).
+
+ ALIGN(16)
+L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2]
+ movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3]
+ punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3]
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ movdqa %xmm0, (rp,n,8)
+L(aent):sub $2, n
+ ja L(atop)
+
+ jne L(end8)
+
+ movdqa (ap), %xmm0
+ pxor %xmm1, %xmm1
+ punpcklqdq %xmm0, %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ movdqa %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+C *****************************************************************************
+
+ ALIGN(16)
+L(le2): jne L(end8)
+
+ movq 8(ap), %xmm0
+ movq (ap), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ movq %xmm0, 8(rp)
+
+L(end8):movq (ap), %xmm0
+ psllq %xmm4, %xmm0
+ pxor %xmm2, %xmm0
+ movq %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
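
The loop selection described in the header comment above reduces to an alignment congruence test. A minimal sketch under the usual 8-byte-limb assumption (editor's illustration, not GMP code): making the high end of rp 16-byte aligned does not change whether ap and rp agree mod 16, so the movdqa-based aligned inner loop applies exactly when the two operands are congruent mod 16.

#include <stdint.h>

/* Nonzero when the aligned inner loop can be used: the high ends of
   ap and rp have the same alignment mod 16.  */
static int
same_alignment_mod_16 (const void *ap, const void *rp, long n)
{
  uintptr_t a = (uintptr_t) ap + 8 * (uintptr_t) n;
  uintptr_t r = (uintptr_t) rp + 8 * (uintptr_t) n;
  return ((a ^ r) & 8) == 0;
}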
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm b/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm
new file mode 100644
index 0000000..1e270b1
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm
@@ -0,0 +1,201 @@
+dnl AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                cycles/limb   cycles/limb   cycles/limb   good
+C                aligned       unaligned     best seen     for cpu?
+C AMD K8,K9      3             3             2.35          no, use shl/shr
+C AMD K10        1.5-1.8       1.5-1.8       1.33          yes
+C AMD bd1        1.7-1.9       1.7-1.9       1.33          yes
+C AMD bobcat     3.17          3.17                        yes, bad for n < 20
+C Intel P4       4.67          4.67          2.7           no, slow movdqu
+C Intel core2    2.15          2.15          1.25          no, use shld/shrd
+C Intel NHM      1.66          1.66          1.25          no, use shld/shrd
+C Intel SBR      1.3           1.3           1.25          yes, bad for n = 4-6
+C Intel atom     11.7          11.7          4.5           no
+C VIA nano       5.7           5.95          2.0           no, slow movdqu
+
+C We try to do as many aligned 16-byte operations as possible. The top-most
+C and bottom-most writes might need 8-byte operations.
+C
+C This variant relies on fast movdqu loads, and uses them even for aligned
+C operands, in order to avoid the need for two separate loops.
+C
+C TODO
+C * Could 2-limb wind-down code be simplified?
+C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
+C for other affected CPUs.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_rshift)
+ FUNC_ENTRY(4)
+ movd R32(%rcx), %xmm4
+ mov $64, R32(%rax)
+ sub R32(%rcx), R32(%rax)
+ movd R32(%rax), %xmm5
+
+ neg R32(%rcx)
+ mov (ap), %rax
+ shl R8(%rcx), %rax
+
+ cmp $3, n
+ jle L(bc)
+
+ test $8, R8(rp)
+ jz L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+ movq (ap), %xmm0
+ movq 8(ap), %xmm1
+ psrlq %xmm4, %xmm0
+ psllq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movq %xmm0, (rp)
+ lea 8(ap), ap
+ lea 8(rp), rp
+ dec n
+
+L(rp_aligned):
+ lea 1(n), %r8d
+ lea (ap,n,8), ap
+ lea (rp,n,8), rp
+ neg n
+
+ and $6, R32(%r8)
+ jz L(bu0)
+ cmp $4, R32(%r8)
+ jz L(bu4)
+ jc L(bu2)
+L(bu6): add $4, n
+ jmp L(i56)
+L(bu0): add $6, n
+ jmp L(i70)
+L(bu4): add $2, n
+ jmp L(i34)
+L(bu2): add $8, n
+ jge L(end)
+
+ ALIGN(16)
+L(top): movdqu -64(ap,n,8), %xmm1
+ movdqu -56(ap,n,8), %xmm0
+ psllq %xmm5, %xmm0
+ psrlq %xmm4, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, -64(rp,n,8)
+L(i70):
+ movdqu -48(ap,n,8), %xmm1
+ movdqu -40(ap,n,8), %xmm0
+ psllq %xmm5, %xmm0
+ psrlq %xmm4, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, -48(rp,n,8)
+L(i56):
+ movdqu -32(ap,n,8), %xmm1
+ movdqu -24(ap,n,8), %xmm0
+ psllq %xmm5, %xmm0
+ psrlq %xmm4, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, -32(rp,n,8)
+L(i34):
+ movdqu -16(ap,n,8), %xmm1
+ movdqu -8(ap,n,8), %xmm0
+ psllq %xmm5, %xmm0
+ psrlq %xmm4, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, -16(rp,n,8)
+ add $8, n
+ jl L(top)
+
+L(end): test $1, R8(n)
+ jnz L(e1)
+
+ movdqu -16(ap), %xmm1
+ movq -8(ap), %xmm0
+ psrlq %xmm4, %xmm1
+ psllq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movdqa %xmm0, -16(rp)
+ FUNC_EXIT()
+ ret
+
+L(e1): movq -8(ap), %xmm0
+ psrlq %xmm4, %xmm0
+ movq %xmm0, -8(rp)
+ FUNC_EXIT()
+ ret
+
+C Basecase
+ ALIGN(16)
+L(bc): dec R32(n)
+ jnz 1f
+ movq (ap), %xmm0
+ psrlq %xmm4, %xmm0
+ movq %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+
+1: movq (ap), %xmm1
+ movq 8(ap), %xmm0
+ psrlq %xmm4, %xmm1
+ psllq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movq %xmm0, (rp)
+ dec R32(n)
+ jnz 1f
+ movq 8(ap), %xmm0
+ psrlq %xmm4, %xmm0
+ movq %xmm0, 8(rp)
+ FUNC_EXIT()
+ ret
+
+1: movq 8(ap), %xmm1
+ movq 16(ap), %xmm0
+ psrlq %xmm4, %xmm1
+ psllq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movq %xmm0, 8(rp)
+ movq 16(ap), %xmm0
+ psrlq %xmm4, %xmm0
+ movq %xmm0, 16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
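
As with the lshiftc variant earlier, a reference C sketch of the semantics may help (editor's illustration, not part of the sources): mpn_rshift shifts {ap,n} right by cnt bits and returns the bits shifted out below rp[0], which the code above computes into %rax up front. Assumes 1 <= cnt <= 63 and n >= 1.

#include "gmp-impl.h"

mp_limb_t
ref_rshift (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned cnt)
{
  mp_limb_t retval = ap[0] << (GMP_LIMB_BITS - cnt);  /* bits shifted out at the bottom */
  mp_size_t i;
  for (i = 0; i < n - 1; i++)
    rp[i] = (ap[i] >> cnt) | (ap[i + 1] << (GMP_LIMB_BITS - cnt));
  rp[n - 1] = ap[n - 1] >> cnt;
  return retval;
}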
diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm
new file mode 100644
index 0000000..e7b7feb
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm
@@ -0,0 +1,204 @@
+dnl AMD64 SSE mpn_sec_tabselect.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                cycles/limb   cycles/limb   cycles/limb
+C                ali,evn n     unal,evn n    other cases
+C AMD K8,K9      1.65          1.65          1.8
+C AMD K10        0.78          0.78          0.85
+C AMD bd1        0.80          0.91          1.25
+C AMD bobcat     2.15          2.15          2.37
+C Intel P4       2.5           2.5           2.95
+C Intel core2    1.17          1.25          1.25
+C Intel NHM      0.87          0.90          0.90
+C Intel SBR      0.63          0.79          0.77
+C Intel atom     4.3           4.3           4.3           slower than plain code
+C VIA nano       1.4           5.1           3.14          too alignment dependent
+
+C NOTES
+C * We only honour the least significant 32 bits of the `which' and `nents'
+C   arguments, to allow efficient code using just SSE2.  Honouring all 64 bits
+C   would require either the SSE4_1 pcmpeqq, or some other SSE2 sequence.
+C * We use movd for copying between xmm and plain registers, since old gas
+C rejects movq. But gas assembles movd as movq when given a 64-bit greg.
+
+define(`rp', `%rdi')
+define(`tp', `%rsi')
+define(`n', `%rdx')
+define(`nents', `%rcx')
+define(`which', `%r8')
+
+define(`i', `%r10')
+define(`j', `%r9')
+
+C      rax  rbx  rcx    rdx  rdi  rsi  rbp  r8     r9  r10  r11   r12  r13  r14  r15
+C                nents  n    rp   tab       which  j   i    temp  *    *    *    *
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sec_tabselect)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+IFDOS(` add $-88, %rsp ')
+IFDOS(` movdqu %xmm6, (%rsp) ')
+IFDOS(` movdqu %xmm7, 16(%rsp) ')
+IFDOS(` movdqu %xmm8, 32(%rsp) ')
+IFDOS(` movdqu %xmm9, 48(%rsp) ')
+
+ movd which, %xmm8
+ pshufd $0, %xmm8, %xmm8 C 4 `which' copies
+ mov $1, R32(%rax)
+ movd %rax, %xmm9
+ pshufd $0, %xmm9, %xmm9 C 4 copies of 1
+
+ mov n, j
+ add $-8, j
+ js L(outer_end)
+
+L(outer_top):
+ mov nents, i
+ mov tp, %r11
+ pxor %xmm1, %xmm1
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ ALIGN(16)
+L(top): movdqa %xmm8, %xmm0
+ pcmpeqd %xmm1, %xmm0
+ paddd %xmm9, %xmm1
+ movdqu 0(tp), %xmm2
+ movdqu 16(tp), %xmm3
+ pand %xmm0, %xmm2
+ pand %xmm0, %xmm3
+ por %xmm2, %xmm4
+ por %xmm3, %xmm5
+ movdqu 32(tp), %xmm2
+ movdqu 48(tp), %xmm3
+ pand %xmm0, %xmm2
+ pand %xmm0, %xmm3
+ por %xmm2, %xmm6
+ por %xmm3, %xmm7
+ lea (tp,n,8), tp
+ add $-1, i
+ jne L(top)
+
+ movdqu %xmm4, 0(rp)
+ movdqu %xmm5, 16(rp)
+ movdqu %xmm6, 32(rp)
+ movdqu %xmm7, 48(rp)
+
+ lea 64(%r11), tp
+ lea 64(rp), rp
+ add $-8, j
+ jns L(outer_top)
+L(outer_end):
+
+ test $4, R8(n)
+ je L(b0xx)
+L(b1xx):mov nents, i
+ mov tp, %r11
+ pxor %xmm1, %xmm1
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ ALIGN(16)
+L(tp4): movdqa %xmm8, %xmm0
+ pcmpeqd %xmm1, %xmm0
+ paddd %xmm9, %xmm1
+ movdqu 0(tp), %xmm2
+ movdqu 16(tp), %xmm3
+ pand %xmm0, %xmm2
+ pand %xmm0, %xmm3
+ por %xmm2, %xmm4
+ por %xmm3, %xmm5
+ lea (tp,n,8), tp
+ add $-1, i
+ jne L(tp4)
+ movdqu %xmm4, 0(rp)
+ movdqu %xmm5, 16(rp)
+ lea 32(%r11), tp
+ lea 32(rp), rp
+
+L(b0xx):test $2, R8(n)
+ je L(b00x)
+L(b01x):mov nents, i
+ mov tp, %r11
+ pxor %xmm1, %xmm1
+ pxor %xmm4, %xmm4
+ ALIGN(16)
+L(tp2): movdqa %xmm8, %xmm0
+ pcmpeqd %xmm1, %xmm0
+ paddd %xmm9, %xmm1
+ movdqu 0(tp), %xmm2
+ pand %xmm0, %xmm2
+ por %xmm2, %xmm4
+ lea (tp,n,8), tp
+ add $-1, i
+ jne L(tp2)
+ movdqu %xmm4, 0(rp)
+ lea 16(%r11), tp
+ lea 16(rp), rp
+
+L(b00x):test $1, R8(n)
+ je L(b000)
+L(b001):mov nents, i
+ mov tp, %r11
+ pxor %xmm1, %xmm1
+ pxor %xmm4, %xmm4
+ ALIGN(16)
+L(tp1): movdqa %xmm8, %xmm0
+ pcmpeqd %xmm1, %xmm0
+ paddd %xmm9, %xmm1
+ movq 0(tp), %xmm2
+ pand %xmm0, %xmm2
+ por %xmm2, %xmm4
+ lea (tp,n,8), tp
+ add $-1, i
+ jne L(tp1)
+ movq %xmm4, 0(rp)
+
+L(b000):
+IFDOS(` movdqu (%rsp), %xmm6 ')
+IFDOS(` movdqu 16(%rsp), %xmm7 ')
+IFDOS(` movdqu 32(%rsp), %xmm8 ')
+IFDOS(` movdqu 48(%rsp), %xmm9 ')
+IFDOS(` add $88, %rsp ')
+ FUNC_EXIT()
+ ret
+EPILOGUE()
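
The selection loop above is a constant-access-pattern table lookup: every entry is read, and an all-ones/all-zeros mask keeps only the wanted one. A scalar C sketch of the same idea (editor's illustration; the asm builds the masks four 32-bit words at a time with pcmpeqd, which is why only the low 32 bits of `which' and `nents' are honoured):

#include "gmp-impl.h"

void
ref_sec_tabselect (mp_ptr rp, mp_srcptr tab, mp_size_t n, mp_size_t nents,
                   mp_size_t which)
{
  mp_size_t i, j;
  for (i = 0; i < n; i++)
    rp[i] = 0;
  for (j = 0; j < nents; j++)
    {
      mp_limb_t mask = -(mp_limb_t) (j == which);  /* all ones iff selected */
      for (i = 0; i < n; i++)
        rp[i] |= tab[j * n + i] & mask;            /* every entry is touched */
    }
}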
diff --git a/gmp-6.3.0/mpn/x86_64/fat/addmul_2.c b/gmp-6.3.0/mpn/x86_64/fat/addmul_2.c
new file mode 100644
index 0000000..e0d7358
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fat/addmul_2.c
@@ -0,0 +1,38 @@
+/* Fat binary fallback mpn_addmul_2.
+
+Copyright 2016 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, const mp_limb_t vp[2])
+{
+ rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);
+ return mpn_addmul_1 (rp + 1, up, n, vp[1]);
+}
diff --git a/gmp-6.3.0/mpn/x86_64/fat/fat.c b/gmp-6.3.0/mpn/x86_64/fat/fat.c
new file mode 100644
index 0000000..cc35afa
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fat/fat.c
@@ -0,0 +1,473 @@
+/* x86_64 fat binary initializers.
+
+ Contributed to the GNU project by Kevin Ryde (original x86_32 code) and
+ Torbjorn Granlund (port to x86_64)
+
+ THE FUNCTIONS AND VARIABLES IN THIS FILE ARE FOR INTERNAL USE ONLY.
+ THEY'RE ALMOST CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR
+ COMPLETELY IN FUTURE GNU MP RELEASES.
+
+Copyright 2003, 2004, 2009, 2011-2015, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include <stdio.h> /* for printf */
+#include <stdlib.h> /* for getenv */
+#include <string.h>
+
+#include "gmp-impl.h"
+
+/* Change this to "#define TRACE(x) x" for some traces. */
+#define TRACE(x)
+
+
+/* fat_entry.asm */
+long __gmpn_cpuid (char [12], int);
+
+
+#if WANT_FAKE_CPUID
+/* The "name"s in the table are values for the GMP_CPU_TYPE environment
+   variable.  Anything can be used, but for now they are the canonical cpu
+   types as per config.guess/config.sub. */
+
+#define __gmpn_cpuid fake_cpuid
+
+#define MAKE_FMS(family, model) \
+ ((((family) & 0xf) << 8) + (((family) & 0xff0) << 20) \
+ + (((model) & 0xf) << 4) + (((model) & 0xf0) << 12))
+
+static struct {
+ const char *name;
+ const char *vendor;
+ unsigned fms;
+} fake_cpuid_table[] = {
+ { "core2", "GenuineIntel", MAKE_FMS (6, 0xf) },
+ { "nehalem", "GenuineIntel", MAKE_FMS (6, 0x1a) },
+ { "nhm", "GenuineIntel", MAKE_FMS (6, 0x1a) },
+ { "atom", "GenuineIntel", MAKE_FMS (6, 0x1c) },
+ { "westmere", "GenuineIntel", MAKE_FMS (6, 0x25) },
+ { "wsm", "GenuineIntel", MAKE_FMS (6, 0x25) },
+ { "sandybridge","GenuineIntel", MAKE_FMS (6, 0x2a) },
+ { "sbr", "GenuineIntel", MAKE_FMS (6, 0x2a) },
+ { "silvermont", "GenuineIntel", MAKE_FMS (6, 0x37) },
+ { "slm", "GenuineIntel", MAKE_FMS (6, 0x37) },
+ { "haswell", "GenuineIntel", MAKE_FMS (6, 0x3c) },
+ { "hwl", "GenuineIntel", MAKE_FMS (6, 0x3c) },
+ { "broadwell", "GenuineIntel", MAKE_FMS (6, 0x3d) },
+ { "bwl", "GenuineIntel", MAKE_FMS (6, 0x3d) },
+ { "skylake", "GenuineIntel", MAKE_FMS (6, 0x5e) },
+ { "sky", "GenuineIntel", MAKE_FMS (6, 0x5e) },
+ { "pentium4", "GenuineIntel", MAKE_FMS (15, 3) },
+
+ { "k8", "AuthenticAMD", MAKE_FMS (15, 0) },
+ { "k10", "AuthenticAMD", MAKE_FMS (16, 0) },
+ { "bobcat", "AuthenticAMD", MAKE_FMS (20, 1) },
+ { "bulldozer", "AuthenticAMD", MAKE_FMS (21, 1) },
+ { "piledriver", "AuthenticAMD", MAKE_FMS (21, 2) },
+ { "steamroller","AuthenticAMD", MAKE_FMS (21, 0x30) },
+ { "excavator", "AuthenticAMD", MAKE_FMS (21, 0x60) },
+ { "jaguar", "AuthenticAMD", MAKE_FMS (22, 1) },
+ { "zen", "AuthenticAMD", MAKE_FMS (23, 1) },
+
+ { "nano", "CentaurHauls", MAKE_FMS (6, 15) },
+};
+
+static int
+fake_cpuid_lookup (void)
+{
+ char *s;
+ int i;
+
+ s = getenv ("GMP_CPU_TYPE");
+ if (s == NULL)
+ {
+ printf ("Need GMP_CPU_TYPE environment variable for fake cpuid\n");
+ abort ();
+ }
+
+ for (i = 0; i < numberof (fake_cpuid_table); i++)
+ if (strcmp (s, fake_cpuid_table[i].name) == 0)
+ return i;
+
+ printf ("GMP_CPU_TYPE=%s unknown\n", s);
+ abort ();
+}
+
+static long
+fake_cpuid (char dst[12], unsigned int id)
+{
+ int i = fake_cpuid_lookup();
+
+ switch (id) {
+ case 0:
+ memcpy (dst, fake_cpuid_table[i].vendor, 12);
+ return 0;
+ case 1:
+ return fake_cpuid_table[i].fms;
+ case 7:
+ dst[0] = 0xff; /* BMI1, AVX2, etc */
+ dst[1] = 0xff; /* BMI2, etc */
+ return 0;
+ case 0x80000001:
+ dst[4 + 29 / 8] = (1 << (29 % 8)); /* "long" mode */
+ return 0;
+ default:
+ printf ("fake_cpuid(): oops, unknown id %d\n", id);
+ abort ();
+ }
+}
+#endif
+
+
+typedef DECL_preinv_divrem_1 ((*preinv_divrem_1_t));
+typedef DECL_preinv_mod_1 ((*preinv_mod_1_t));
+
+struct cpuvec_t __gmpn_cpuvec = {
+ __MPN(add_n_init),
+ __MPN(addlsh1_n_init),
+ __MPN(addlsh2_n_init),
+ __MPN(addmul_1_init),
+ __MPN(addmul_2_init),
+ __MPN(bdiv_dbm1c_init),
+ __MPN(cnd_add_n_init),
+ __MPN(cnd_sub_n_init),
+ __MPN(com_init),
+ __MPN(copyd_init),
+ __MPN(copyi_init),
+ __MPN(divexact_1_init),
+ __MPN(divrem_1_init),
+ __MPN(gcd_11_init),
+ __MPN(lshift_init),
+ __MPN(lshiftc_init),
+ __MPN(mod_1_init),
+ __MPN(mod_1_1p_init),
+ __MPN(mod_1_1p_cps_init),
+ __MPN(mod_1s_2p_init),
+ __MPN(mod_1s_2p_cps_init),
+ __MPN(mod_1s_4p_init),
+ __MPN(mod_1s_4p_cps_init),
+ __MPN(mod_34lsub1_init),
+ __MPN(modexact_1c_odd_init),
+ __MPN(mul_1_init),
+ __MPN(mul_basecase_init),
+ __MPN(mullo_basecase_init),
+ __MPN(preinv_divrem_1_init),
+ __MPN(preinv_mod_1_init),
+ __MPN(redc_1_init),
+ __MPN(redc_2_init),
+ __MPN(rshift_init),
+ __MPN(sqr_basecase_init),
+ __MPN(sub_n_init),
+ __MPN(sublsh1_n_init),
+ __MPN(submul_1_init),
+ 0
+};
+
+int __gmpn_cpuvec_initialized = 0;
+
+/* The following setups start with generic x86, then overwrite with
+ specifics for a chip, and higher versions of that chip.
+
+ The arrangement of the setups here will normally be the same as the $path
+ selections in configure.in for the respective chips.
+
+ This code is reentrant and thread safe. We always calculate the same
+ decided_cpuvec, so if two copies of the code are running it doesn't
+   matter which completes first; both write the same values to __gmpn_cpuvec.
+
+ We need to go via decided_cpuvec because if one thread has completed
+ __gmpn_cpuvec then it may be making use of the threshold values in that
+ vector. If another thread is still running __gmpn_cpuvec_init then we
+ don't want it to write different values to those fields since some of the
+ asm routines only operate correctly up to their own defined threshold,
+ not an arbitrary value. */
+
+static int
+gmp_workaround_skylake_cpuid_bug ()
+{
+ char feature_string[49];
+ char processor_name_string[49];
+ static const char *bad_cpus[] = {" G44", " G45", " G39" /* , "6600" */ };
+ int i;
+
+ /* Example strings: */
+ /* "Intel(R) Pentium(R) CPU G4400 @ 3.30GHz" */
+ /* "Intel(R) Core(TM) i5-6600K CPU @ 3.50GHz" */
+ /* ^ ^ ^ */
+ /* 0x80000002 0x80000003 0x80000004 */
+ /* We match out just the 0x80000003 part here. */
+
+  /* In their infinite wisdom, Intel decided to use one register order for
+ the vendor string, and another for the processor name string. We shuffle
+ things about here, rather than write a new variant of our assembly cpuid.
+ */
+
+ unsigned int eax, ebx, ecx, edx;
+ eax = __gmpn_cpuid (feature_string, 0x80000003);
+ ebx = ((unsigned int *)feature_string)[0];
+ edx = ((unsigned int *)feature_string)[1];
+ ecx = ((unsigned int *)feature_string)[2];
+
+ ((unsigned int *) (processor_name_string))[0] = eax;
+ ((unsigned int *) (processor_name_string))[1] = ebx;
+ ((unsigned int *) (processor_name_string))[2] = ecx;
+ ((unsigned int *) (processor_name_string))[3] = edx;
+
+ processor_name_string[16] = 0;
+
+ for (i = 0; i < sizeof (bad_cpus) / sizeof (char *); i++)
+ {
+ if (strstr (processor_name_string, bad_cpus[i]) != 0)
+ return 1;
+ }
+ return 0;
+}
+
+enum {BMI2_BIT = 8};
+
+void
+__gmpn_cpuvec_init (void)
+{
+ struct cpuvec_t decided_cpuvec;
+ char vendor_string[13];
+ char dummy_string[12];
+ long fms;
+ int family, model;
+
+ TRACE (printf ("__gmpn_cpuvec_init:\n"));
+
+ memset (&decided_cpuvec, '\0', sizeof (decided_cpuvec));
+
+ CPUVEC_SETUP_x86_64;
+ CPUVEC_SETUP_fat;
+
+ __gmpn_cpuid (vendor_string, 0);
+ vendor_string[12] = 0;
+
+ fms = __gmpn_cpuid (dummy_string, 1);
+ family = ((fms >> 8) & 0xf) + ((fms >> 20) & 0xff);
+ model = ((fms >> 4) & 0xf) + ((fms >> 12) & 0xf0);
+
+ /* Check extended feature flags */
+ __gmpn_cpuid (dummy_string, 0x80000001);
+ if ((dummy_string[4 + 29 / 8] & (1 << (29 % 8))) == 0)
+ abort (); /* longmode-capable-bit turned off! */
+
+ /*********************************************************/
+ /*** WARNING: keep this list in sync with config.guess ***/
+ /*********************************************************/
+ if (strcmp (vendor_string, "GenuineIntel") == 0)
+ {
+ switch (family)
+ {
+ case 6:
+ switch (model)
+ {
+ case 0x0f: /* Conroe Merom Kentsfield Allendale */
+ case 0x10:
+ case 0x11:
+ case 0x12:
+ case 0x13:
+ case 0x14:
+ case 0x15:
+ case 0x16:
+ case 0x17: /* PNR Wolfdale Yorkfield */
+ case 0x18:
+ case 0x19:
+ case 0x1d: /* PNR Dunnington */
+ CPUVEC_SETUP_core2;
+ break;
+
+ case 0x1c: /* Atom Silverthorne */
+ case 0x26: /* Atom Lincroft */
+ case 0x27: /* Atom Saltwell? */
+ case 0x36: /* Atom Cedarview/Saltwell */
+ CPUVEC_SETUP_atom;
+ break;
+
+ case 0x1a: /* NHM Gainestown */
+ case 0x1b:
+ case 0x1e: /* NHM Lynnfield/Jasper */
+ case 0x1f:
+ case 0x20:
+ case 0x21:
+ case 0x22:
+ case 0x23:
+ case 0x24:
+ case 0x25: /* WSM Clarkdale/Arrandale */
+ case 0x28:
+ case 0x29:
+ case 0x2b:
+ case 0x2c: /* WSM Gulftown */
+ case 0x2e: /* NHM Beckton */
+ case 0x2f: /* WSM Eagleton */
+ CPUVEC_SETUP_core2;
+ CPUVEC_SETUP_coreinhm;
+ break;
+
+ case 0x37: /* Silvermont */
+ case 0x4a: /* Silvermont */
+ case 0x4c: /* Airmont */
+ case 0x4d: /* Silvermont/Avoton */
+ case 0x5a: /* Silvermont */
+ CPUVEC_SETUP_atom;
+ CPUVEC_SETUP_silvermont;
+ break;
+
+ case 0x5c: /* Goldmont */
+ case 0x5f: /* Goldmont */
+ case 0x7a: /* Goldmont Plus */
+ CPUVEC_SETUP_atom;
+ CPUVEC_SETUP_silvermont;
+ CPUVEC_SETUP_goldmont;
+ break;
+
+ case 0x2a: /* SB */
+ case 0x2d: /* SBC-EP */
+ case 0x3a: /* IBR */
+ case 0x3e: /* IBR Ivytown */
+ CPUVEC_SETUP_core2;
+ CPUVEC_SETUP_coreinhm;
+ CPUVEC_SETUP_coreisbr;
+ break;
+ case 0x3c: /* Haswell client */
+ case 0x3f: /* Haswell server */
+ case 0x45: /* Haswell ULT */
+ case 0x46: /* Crystal Well */
+ CPUVEC_SETUP_core2;
+ CPUVEC_SETUP_coreinhm;
+ CPUVEC_SETUP_coreisbr;
+ /* Some Haswells lack BMI2. Let them appear as Sandybridges for
+ now. */
+ __gmpn_cpuid (dummy_string, 7);
+ if ((dummy_string[0 + BMI2_BIT / 8] & (1 << (BMI2_BIT % 8))) == 0)
+ break;
+ CPUVEC_SETUP_coreihwl;
+ break;
+ case 0x3d: /* Broadwell */
+ case 0x47: /* Broadwell */
+ case 0x4f: /* Broadwell server */
+ case 0x56: /* Broadwell microserver */
+ CPUVEC_SETUP_core2;
+ CPUVEC_SETUP_coreinhm;
+ CPUVEC_SETUP_coreisbr;
+              __gmpn_cpuid (dummy_string, 7);
+              if ((dummy_string[0 + BMI2_BIT / 8] & (1 << (BMI2_BIT % 8))) == 0)
+ break;
+ CPUVEC_SETUP_coreihwl;
+ CPUVEC_SETUP_coreibwl;
+ break;
+ case 0x4e: /* Skylake client */
+ case 0x55: /* Skylake server */
+ case 0x5e: /* Skylake */
+ case 0x8e: /* Kabylake */
+ case 0x9e: /* Kabylake */
+ CPUVEC_SETUP_core2;
+ CPUVEC_SETUP_coreinhm;
+ CPUVEC_SETUP_coreisbr;
+              __gmpn_cpuid (dummy_string, 7);
+              if ((dummy_string[0 + BMI2_BIT / 8] & (1 << (BMI2_BIT % 8))) == 0)
+ break;
+ if (gmp_workaround_skylake_cpuid_bug ())
+ break;
+ CPUVEC_SETUP_coreihwl;
+ CPUVEC_SETUP_coreibwl;
+ CPUVEC_SETUP_skylake;
+ break;
+ }
+ break;
+
+ case 15:
+ CPUVEC_SETUP_pentium4;
+ break;
+ }
+ }
+ else if (strcmp (vendor_string, "AuthenticAMD") == 0)
+ {
+ switch (family)
+ {
+ case 0x0f: /* k8 */
+ case 0x11: /* "fam 11h", mix of k8 and k10 */
+ case 0x13:
+ CPUVEC_SETUP_k8;
+ break;
+
+ case 0x10: /* k10 */
+ case 0x12: /* k10 (llano) */
+ CPUVEC_SETUP_k8;
+ CPUVEC_SETUP_k10;
+ break;
+
+ case 0x14: /* bobcat */
+ CPUVEC_SETUP_k8;
+ CPUVEC_SETUP_k10;
+ CPUVEC_SETUP_bt1;
+ break;
+
+ case 0x16: /* jaguar */
+ CPUVEC_SETUP_k8;
+ CPUVEC_SETUP_k10;
+ CPUVEC_SETUP_bt1;
+ CPUVEC_SETUP_bt2;
+ break;
+
+ case 0x15: /* bulldozer, piledriver, steamroller, excavator */
+ CPUVEC_SETUP_k8;
+ CPUVEC_SETUP_k10;
+ CPUVEC_SETUP_bd1;
+ break;
+
+ case 0x17: /* zen */
+ case 0x19: /* zen3 */
+ CPUVEC_SETUP_zen;
+ break;
+ }
+ }
+ else if (strcmp (vendor_string, "CentaurHauls") == 0)
+ {
+ switch (family)
+ {
+ case 6:
+ if (model >= 15)
+ CPUVEC_SETUP_nano;
+ break;
+ }
+ }
+
+ /* There's no x86 generic mpn_preinv_divrem_1 or mpn_preinv_mod_1.
+ Instead default to the plain versions from whichever CPU we detected.
+ The function arguments are compatible, no need for any glue code. */
+ if (decided_cpuvec.preinv_divrem_1 == NULL)
+ decided_cpuvec.preinv_divrem_1 =(preinv_divrem_1_t)decided_cpuvec.divrem_1;
+ if (decided_cpuvec.preinv_mod_1 == NULL)
+ decided_cpuvec.preinv_mod_1 =(preinv_mod_1_t) decided_cpuvec.mod_1;
+
+ ASSERT_CPUVEC (decided_cpuvec);
+ CPUVEC_INSTALL (decided_cpuvec);
+
+ /* Set this once the threshold fields are ready.
+ Use volatile to prevent it getting moved. */
+ *((volatile int *) &__gmpn_cpuvec_initialized) = 1;
+}
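
As a quick sanity check on the family/model plumbing above (an editor's aside, with assumed example values): the fms word built by MAKE_FMS in the fake-cpuid table uses the same extended family/model layout that __gmpn_cpuvec_init decodes, so the two round-trip. For instance, for a Sandy Bridge entry:

#include <assert.h>

static void
check_fms_roundtrip (void)
{
  long fms = ((6 & 0xf) << 8) + ((6 & 0xff0) << 20)          /* MAKE_FMS (6, 0x2a) */
             + ((0x2a & 0xf) << 4) + ((0x2a & 0xf0) << 12);  /* = 0x206a0 */
  int family = ((fms >> 8) & 0xf) + ((fms >> 20) & 0xff);
  int model = ((fms >> 4) & 0xf) + ((fms >> 12) & 0xf0);
  assert (family == 6 && model == 0x2a);
}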
diff --git a/gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm b/gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm
new file mode 100644
index 0000000..5f244ac
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm
@@ -0,0 +1,209 @@
+dnl x86 fat binary entrypoints.
+
+dnl Contributed to the GNU project by Kevin Ryde (original x86_32 code) and
+dnl Torbjorn Granlund (port to x86_64)
+
+dnl Copyright 2003, 2009, 2011-2014, 2016 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+dnl Forcibly disable profiling.
+dnl
+dnl The entrypoints and inits are small enough not to worry about; the real
+dnl routines arrived at will have any profiling.  Also, the way the code
+dnl here ends with a jump means we won't work properly with the
+dnl "instrument" profiling scheme anyway.
+
+define(`WANT_PROFILING',no)
+
+
+dnl We define PRETEND_PIC as a helper symbol, then use it for suppressing the
+dnl normal, fast call code, since that triggers problems on Darwin, OpenBSD
+dnl and some versions of GNU/Linux. This will go away when symbol hiding is
+dnl finished.
+
+ifdef(`DARWIN',
+`define(`PRETEND_PIC')')
+ifdef(`OPENBSD',
+`define(`PRETEND_PIC')')
+ifdef(`LINUX',
+`define(`PRETEND_PIC')')
+ifdef(`PIC',
+`define(`PRETEND_PIC')')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ TEXT
+
+dnl Usage: FAT_ENTRY(name, offset)
+dnl
+dnl Emit a fat binary entrypoint function of the given name. This is the
+dnl normal entry for applications, eg. __gmpn_add_n.
+dnl
+dnl The code simply jumps through the function pointer in __gmpn_cpuvec at
+dnl the given "offset" (in bytes).
+dnl
+dnl For non-PIC, the jumps are 5 bytes each, aligning them to 8 should be
+dnl fine for all x86s.
+dnl
+dnl For ELF/DARWIN PIC, the jumps are 20 bytes each, and are best aligned to
+dnl 16 to ensure at least the first two instructions don't cross a cache line
+dnl boundary.
+dnl
+dnl For DOS64, the jumps are 6 bytes. The same form works also for GNU/Linux
+dnl (at least with certain assembler/linkers) but FreeBSD 8.2 crashes. Not
+dnl tested on Darwin, Slowaris, NetBSD, etc.
+dnl
+dnl Note the extra `' ahead of PROLOGUE obscures it from the HAVE_NATIVE
+dnl grepping in configure, stopping that code trying to eval something with
+dnl $1 in it.
+
+define(FAT_ENTRY,
+m4_assert_numargs(2)
+`ifdef(`HOST_DOS64',
+` ALIGN(8)
+`'PROLOGUE($1)
+ jmp *$2+GSYM_PREFIX`'__gmpn_cpuvec(%rip)
+EPILOGUE()
+',
+` ALIGN(ifdef(`PIC',16,8))
+`'PROLOGUE($1)
+ifdef(`PRETEND_PIC',
+` LEA( GSYM_PREFIX`'__gmpn_cpuvec, %rax)
+ jmp *$2(%rax)
+',`dnl non-PIC
+ jmp *GSYM_PREFIX`'__gmpn_cpuvec+$2
+')
+EPILOGUE()
+')')
+
+
+dnl FAT_ENTRY for each CPUVEC_FUNCS_LIST
+dnl
+
+define(`CPUVEC_offset',0)
+foreach(i,
+`FAT_ENTRY(MPN(i),CPUVEC_offset)
+define(`CPUVEC_offset',eval(CPUVEC_offset + 8))',
+CPUVEC_FUNCS_LIST)
+
+
+dnl Usage: FAT_INIT(name, offset)
+dnl
+dnl Emit a fat binary initializer function of the given name. These
+dnl functions are the initial values for the pointers in __gmpn_cpuvec.
+dnl
+dnl The code simply calls __gmpn_cpuvec_init, and then jumps back through
+dnl the __gmpn_cpuvec pointer, at the given "offset" (in bytes).
+dnl __gmpn_cpuvec_init will have stored the address of the selected
+dnl implementation there.
+dnl
+dnl Only one of these routines will be executed, and only once, since after
+dnl that all the __gmpn_cpuvec pointers go to real routines. So there's no
+dnl need for anything special here, just something small and simple. To
+dnl keep code size down, "fat_init" is a shared bit of code, arrived at
+dnl with the offset in %al. %al is used since the movb instruction is 2
+dnl bytes where %eax would be 4.
+dnl
+dnl Note having `PROLOGUE in FAT_INIT obscures that PROLOGUE from the
+dnl HAVE_NATIVE grepping in configure, preventing that code trying to eval
+dnl something with $1 in it.
+dnl
+dnl We need to preserve parameter registers over the __gmpn_cpuvec_init call
+
+define(FAT_INIT,
+m4_assert_numargs(2)
+`PROLOGUE($1)
+ mov $`'$2, %al
+ jmp L(fat_init)
+EPILOGUE()
+')
+
+dnl FAT_INIT for each CPUVEC_FUNCS_LIST
+dnl
+
+define(`CPUVEC_offset',0)
+foreach(i,
+`FAT_INIT(MPN(i`'_init),CPUVEC_offset)
+define(`CPUVEC_offset',eval(CPUVEC_offset + 1))',
+CPUVEC_FUNCS_LIST)
+
+L(fat_init):
+ C al __gmpn_cpuvec byte offset
+
+ movzbl %al, %eax
+IFSTD(` push %rdi ')
+IFSTD(` push %rsi ')
+ push %rdx
+ push %rcx
+ push %r8
+ push %r9
+ push %rax
+IFDOS(` sub $32, %rsp ')
+ CALL( __gmpn_cpuvec_init)
+IFDOS(` add $32, %rsp ')
+ pop %rax
+ pop %r9
+ pop %r8
+ pop %rcx
+ pop %rdx
+IFSTD(` pop %rsi ')
+IFSTD(` pop %rdi ')
+ifdef(`PRETEND_PIC',`
+ LEA( GSYM_PREFIX`'__gmpn_cpuvec, %r10)
+ jmp *(%r10,%rax,8)
+',`dnl non-PIC
+ jmp *GSYM_PREFIX`'__gmpn_cpuvec(,%rax,8)
+')
+
+
+C long __gmpn_cpuid (char dst[12], int id);
+C
+C This is called only 3 times, so just something simple and compact is fine.
+C
+C The rcx/ecx zeroing here is needed for the BMI2 check.
+
+define(`rp', `%rdi')
+define(`idx', `%rsi')
+
+PROLOGUE(__gmpn_cpuid)
+ FUNC_ENTRY(2)
+ mov %rbx, %r8
+ mov R32(idx), R32(%rax)
+ xor %ecx, %ecx
+ cpuid
+ mov %ebx, (rp)
+ mov %edx, 4(rp)
+ mov %ecx, 8(rp)
+ mov %r8, %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
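
In C terms, the FAT_ENTRY/FAT_INIT machinery above is just lazy dispatch through the __gmpn_cpuvec table of function pointers set up in fat.c. A rough sketch of what one generated entrypoint amounts to (editor's illustration; fat_mpn_add_n is a made-up name, the real entrypoints are emitted by the m4 loop above):

#include "gmp-impl.h"

/* The add_n slot initially holds __MPN(add_n_init), which runs
   __gmpn_cpuvec_init once and then re-dispatches through the table, so
   every later call lands directly on the routine chosen for this CPU.  */
mp_limb_t
fat_mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  return __gmpn_cpuvec.add_n (rp, up, vp, n);
}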
diff --git a/gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h
new file mode 100644
index 0000000..005c893
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h
@@ -0,0 +1,72 @@
+/* Fat binary x86_64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2003, 2009, 2011 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+
+/* mpn_divexact_1 is faster than mpn_divrem_1 at all sizes. The only time
+ this might not be true currently is for actual 80386 and 80486 chips,
+ where mpn/x86/dive_1.asm might be slower than mpn/x86/divrem_1.asm, but
+ that's not worth worrying about. */
+#define DIVEXACT_1_THRESHOLD 0
+
+/* Only some of the x86s have an mpn_preinv_divrem_1, but we set
+ USE_PREINV_DIVREM_1 so that all callers use it, and then let the
+ __gmpn_cpuvec pointer go to plain mpn_divrem_1 if there's not an actual
+ preinv. */
+#define USE_PREINV_DIVREM_1 1
+
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
+
+/* mpn_sqr_basecase is faster than mpn_mul_basecase at all sizes, no need
+ for mpn_sqr to call the latter. */
+#define SQR_BASECASE_THRESHOLD 0
+
+/* Sensible fallbacks for these, when not taken from a cpu-specific
+ gmp-mparam.h. */
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 130
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 200
+
+/* These are values more or less in the middle of what the typical x86 chips
+ come out as. For a fat binary it's necessary to have values for these,
+ since the defaults for MUL_FFT_TABLE and SQR_FFT_TABLE otherwise come out
+ as non-constant array initializers. FIXME: Perhaps these should be done
+ in the cpuvec structure like other thresholds. */
+#define MUL_FFT_TABLE { 464, 928, 1920, 3584, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 400
+#define MUL_FFT_THRESHOLD 2000
+
+#define SQR_FFT_TABLE { 528, 1184, 1920, 4608, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD 500
+#define SQR_FFT_THRESHOLD 3000
diff --git a/gmp-6.3.0/mpn/x86_64/fat/mod_1.c b/gmp-6.3.0/mpn/x86_64/fat/mod_1.c
new file mode 100644
index 0000000..4f149cc
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fat/mod_1.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_mod_1.
+
+Copyright 2003, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/mod_1.c"
diff --git a/gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c b/gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c
new file mode 100644
index 0000000..d9eb471
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_mul_basecase.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/mul_basecase.c"
diff --git a/gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c b/gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c
new file mode 100644
index 0000000..7f86be6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_mullo_basecase.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/mullo_basecase.c"
diff --git a/gmp-6.3.0/mpn/x86_64/fat/redc_1.c b/gmp-6.3.0/mpn/x86_64/fat/redc_1.c
new file mode 100644
index 0000000..0025403
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fat/redc_1.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_redc_1.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/redc_1.c"
diff --git a/gmp-6.3.0/mpn/x86_64/fat/redc_2.c b/gmp-6.3.0/mpn/x86_64/fat/redc_2.c
new file mode 100644
index 0000000..1932d58
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fat/redc_2.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_redc_2.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/redc_2.c"
diff --git a/gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c b/gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c
new file mode 100644
index 0000000..d1c5dcd
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_sqr_basecase.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "mpn/generic/sqr_basecase.c"
diff --git a/gmp-6.3.0/mpn/x86_64/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/gcd_11.asm
new file mode 100644
index 0000000..f9b3bcc
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/gcd_11.asm
@@ -0,0 +1,114 @@
+dnl AMD64 mpn_gcd_11 -- 1 x 1 gcd.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit
+C AMD K8,K9 5.5
+C AMD K10 ?
+C AMD bd1 ?
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD bt1 7.1
+C AMD bt2 ?
+C AMD zn1 ?
+C AMD zn2 ?
+C Intel P4 ?
+C Intel CNR ?
+C Intel PNR ?
+C Intel NHM ?
+C Intel WSM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom 9.1
+C Intel SLM 6.9
+C Intel GLM 6.0
+C Intel GLM+ 5.8
+C VIA nano ?
+
+
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 7)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+ .byte MAXSHIFT
+forloop(i,1,MASK,
+` .byte m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u0', `%rdi')
+define(`v0', `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+ FUNC_ENTRY(2)
+ LEA( ctz_table, %r8)
+ jmp L(ent)
+
+ ALIGN(16)
+L(top): cmovc %rdx, u0 C u = |u - v|
+ cmovc %rax, v0 C v = min(u,v)
+L(mid): and $MASK, R32(%rdx)
+ movzbl (%r8,%rdx), R32(%rcx)
+ jz L(shift_alot)
+ shr R8(%rcx), u0
+L(ent): mov u0, %rax
+ mov v0, %rdx
+ sub u0, %rdx
+ sub v0, u0
+ jnz L(top)
+
+L(end): C rax = result
+ C rdx = 0 for the benefit of internal gcd_22 call
+ FUNC_EXIT()
+ ret
+
+L(shift_alot):
+ shr $MAXSHIFT, u0
+ mov u0, %rdx
+ jmp L(mid)
+EPILOGUE()
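
The loop above is the classic odd-operand binary gcd, with ctz_table (plus the MAXSHIFT escape) standing in for a trailing-zero count. A plain C sketch of the same algorithm (editor's illustration; like mpn_gcd_11 it assumes both inputs are odd and nonzero):

#include "gmp-impl.h"

mp_limb_t
ref_gcd_11 (mp_limb_t u, mp_limb_t v)
{
  while (u != v)
    {
      mp_limb_t d = u > v ? u - v : v - u;  /* |u - v|, always even here */
      mp_limb_t m = u > v ? v : u;          /* min (u, v), always odd */
      do
        d >>= 1;            /* the asm shifts by ctz_table[d & MASK] instead */
      while ((d & 1) == 0);
      u = d;
      v = m;
    }
  return u;
}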
diff --git a/gmp-6.3.0/mpn/x86_64/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/gcd_22.asm
new file mode 100644
index 0000000..78f985f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/gcd_22.asm
@@ -0,0 +1,163 @@
+dnl AMD64 mpn_gcd_22.  Assumes useless bsf, useless shrd, no tzcnt, no shlx.
+dnl We actually use tzcnt here, when the table cannot count the bits, as tzcnt
+dnl always works for our use, and helps a lot for certain CPUs.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit
+C AMD K8,K9 8.9
+C AMD K10 8.8
+C AMD bd1 9.7
+C AMD bd2 7.8
+C AMD bd3 ?
+C AMD bd4 7.4
+C AMD bt1 9.2
+C AMD bt2 9.1
+C AMD zn1 7.5
+C AMD zn2 7.5
+C Intel P4 ?
+C Intel CNR 10.5
+C Intel PNR 10.5
+C Intel NHM 9.7
+C Intel WSM 9.7
+C Intel SBR 10.7
+C Intel IBR ?
+C Intel HWL 9.5
+C Intel BWL 8.7
+C Intel SKL 8.6
+C Intel atom 18.9
+C Intel SLM 14.0
+C Intel GLM 9.8
+C Intel GLM+ 8.8
+C VIA nano ?
+
+
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 8)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+ .byte MAXSHIFT
+forloop(i,1,MASK,
+` .byte m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u1', `%rdi')
+define(`u0', `%rsi')
+define(`v1', `%rdx')
+define(`v0_param', `%rcx')
+
+define(`v0', `%rax')
+define(`cnt', `%rcx')
+
+define(`s0', `%r8')
+define(`s1', `%r9')
+define(`t0', `%rcx')
+define(`t1', `%r11')
+
+dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_22)
+ FUNC_ENTRY(4)
+ mov v0_param, v0
+
+ LEA( ctz_table, %r10)
+
+ ALIGN(16)
+L(top): mov v0, t0
+ sub u0, t0
+ jz L(lowz) C jump when low limb result = 0
+ mov v1, t1
+ sbb u1, t1
+
+ mov u0, s0
+ mov u1, s1
+
+ sub v0, u0
+ sbb v1, u1
+
+L(bck): cmovc t0, u0 C u = |u - v|
+ cmovc t1, u1 C u = |u - v|
+ cmovc s0, v0 C v = min(u,v)
+ cmovc s1, v1 C v = min(u,v)
+
+ and $MASK, R32(t0)
+ movzbl (%r10,t0), R32(cnt)
+ jz L(count_better)
+C Rightshift (u1,,u0) into (u1,,u0)
+L(shr): shr R8(cnt), u0
+ mov u1, t1
+ shr R8(cnt), u1
+ neg cnt
+ shl R8(cnt), t1
+ or t1, u0
+
+ test v1, v1
+ jnz L(top)
+ test u1, u1
+ jnz L(top)
+
+L(gcd_11):
+ mov v0, %rdi
+C mov u0, %rsi
+ TCALL( mpn_gcd_11)
+
+L(count_better):
+ rep;bsf u0, cnt C tzcnt!
+ jmp L(shr)
+
+L(lowz):C We come here when v0 - u0 = 0
+ C 1. If v1 - u1 = 0, then gcd is u = v.
+ C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+ mov v1, t0
+ sub u1, t0
+ je L(end)
+
+ xor t1, t1
+ mov u0, s0
+ mov u1, s1
+ mov u1, u0
+ xor u1, u1
+ sub v1, u0
+ jmp L(bck)
+
+L(end): C mov v0, %rax
+ C mov v1, %rdx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
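
The L(shr) sequence above is a two-limb right shift. A minimal C sketch of that one step (editor's illustration; cnt is between 1 and 63 here, since the low limb being shifted is a nonzero difference of two odd limbs and therefore has at least one trailing zero):

#include "gmp-impl.h"

/* Shift the 2-limb value (u1:u0) right by cnt bits; the asm obtains the
   64-cnt amount by negating cnt and letting shl use only the low 6 bits.  */
static void
rshift2 (mp_limb_t *u1, mp_limb_t *u0, unsigned cnt)
{
  *u0 = (*u0 >> cnt) | (*u1 << (GMP_LIMB_BITS - cnt));
  *u1 >>= cnt;
}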
diff --git a/gmp-6.3.0/mpn/x86_64/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/gmp-mparam.h
new file mode 100644
index 0000000..db94fb7
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/gmp-mparam.h
@@ -0,0 +1,217 @@
+/* AMD K8-K10 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2010, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 15
+
+#define MUL_TOOM22_THRESHOLD 27
+#define MUL_TOOM33_THRESHOLD 81
+#define MUL_TOOM44_THRESHOLD 234
+#define MUL_TOOM6H_THRESHOLD 418
+#define MUL_TOOM8H_THRESHOLD 466
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 160
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 175
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 36
+#define SQR_TOOM3_THRESHOLD 117
+#define SQR_TOOM4_THRESHOLD 327
+#define SQR_TOOM6_THRESHOLD 446
+#define SQR_TOOM8_THRESHOLD 547
+
+#define MULMID_TOOM42_THRESHOLD 36
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define POWM_SEC_TABLE 2,67,322,991
+
+#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 570, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 25, 8}, { 13, 7}, { 29, 8}, \
+ { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 21, 7}, { 43, 8}, \
+ { 23, 7}, { 47, 8}, { 25, 7}, { 51, 8}, \
+ { 29, 9}, { 15, 8}, { 37, 9}, { 19, 8}, \
+ { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \
+ { 55,10}, { 15, 9}, { 43,10}, { 23, 9}, \
+ { 55,10}, { 31, 9}, { 63, 5}, { 1023, 4}, \
+ { 2431, 5}, { 1279, 6}, { 671, 7}, { 367, 8}, \
+ { 189, 9}, { 95, 8}, { 195, 9}, { 111,11}, \
+ { 31, 9}, { 131,10}, { 71, 9}, { 155,10}, \
+ { 79, 9}, { 159,10}, { 87,11}, { 47,10}, \
+ { 111,11}, { 63,10}, { 135,11}, { 79,10}, \
+ { 167,11}, { 95,10}, { 191,11}, { 111,12}, \
+ { 63,11}, { 143,10}, { 287,11}, { 159,10}, \
+ { 319,11}, { 175,12}, { 95,11}, { 207,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 543,11}, \
+ { 287,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 335,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
+ { 223,13}, { 127,12}, { 255,11}, { 543,12}, \
+ { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
+ { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \
+ { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 415,11}, { 831,12}, { 447,14}, \
+ { 127,13}, { 255,12}, { 543,11}, { 1087,12}, \
+ { 607,11}, { 1215,13}, { 319,12}, { 671,11}, \
+ { 1343,12}, { 735,13}, { 383,12}, { 767,11}, \
+ { 1535,12}, { 799,11}, { 1599,12}, { 831,13}, \
+ { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1599,13}, \
+ { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \
+ { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \
+ { 2175,13}, { 1215,14}, { 639,13}, { 1471,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2431,14}, { 1279,13}, { 2687,14}, { 1407,15}, \
+ { 767,14}, { 1535,13}, { 3071,14}, { 1791,16}, \
+ { 511,15}, { 1023,14}, { 2431,15}, { 1279,14}, \
+ { 2815,15}, { 1535,14}, { 3199,15}, { 1791,14}, \
+ { 3583,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 185
+#define MUL_FFT_THRESHOLD 7552
+
+#define SQR_FFT_MODF_THRESHOLD 460 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 27, 7}, { 14, 6}, \
+ { 29, 7}, { 15, 6}, { 31, 7}, { 29, 8}, \
+ { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 21, 7}, { 43, 8}, \
+ { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
+ { 51, 9}, { 27, 8}, { 55,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 43,10}, { 23, 9}, \
+ { 55,11}, { 15,10}, { 31, 9}, { 71,10}, \
+ { 39, 9}, { 83,10}, { 47, 6}, { 767, 4}, \
+ { 3263, 5}, { 1727, 4}, { 3455, 5}, { 1791, 6}, \
+ { 927, 7}, { 479, 6}, { 959, 7}, { 511, 8}, \
+ { 271, 9}, { 147,10}, { 87,11}, { 47,10}, \
+ { 95,12}, { 31,11}, { 63,10}, { 135,11}, \
+ { 79,10}, { 167,11}, { 95,10}, { 191,11}, \
+ { 111,12}, { 63,11}, { 127,10}, { 255,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,12}, { 95,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 399,11}, { 207,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 335,10}, { 671,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,11}, { 447,13}, \
+ { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \
+ { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \
+ { 1279,11}, { 671,12}, { 351,11}, { 703,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,12}, { 447,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \
+ { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \
+ { 831,13}, { 447,12}, { 959,14}, { 255,13}, \
+ { 511,12}, { 1087,13}, { 575,12}, { 1215,13}, \
+ { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \
+ { 383,13}, { 767,12}, { 1599,13}, { 831,12}, \
+ { 1663,13}, { 895,12}, { 1791,13}, { 959,15}, \
+ { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
+ { 1215,14}, { 639,13}, { 1471,14}, { 767,13}, \
+ { 1663,14}, { 895,13}, { 1855,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2303,14}, \
+ { 1279,13}, { 2559,14}, { 1407,15}, { 767,14}, \
+ { 1535,13}, { 3071,14}, { 1791,16}, { 511,15}, \
+ { 1023,14}, { 2303,15}, { 1279,14}, { 2687,15}, \
+ { 1535,14}, { 3199,15}, { 1791,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 203
+#define SQR_FFT_THRESHOLD 5248
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 35
+#define MULLO_MUL_N_THRESHOLD 15604
+
+#define DC_DIV_QR_THRESHOLD 56
+#define DC_DIVAPPR_Q_THRESHOLD 220
+#define DC_BDIV_QR_THRESHOLD 52
+#define DC_BDIV_Q_THRESHOLD 152
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 226
+#define INV_APPR_THRESHOLD 214
+
+#define BINV_NEWTON_THRESHOLD 327
+#define REDC_1_TO_REDC_2_THRESHOLD 4
+#define REDC_2_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 1895
+#define MU_DIVAPPR_Q_THRESHOLD 1895
+#define MUPI_DIV_QR_THRESHOLD 106
+#define MU_BDIV_QR_THRESHOLD 1589
+#define MU_BDIV_Q_THRESHOLD 1718
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD_THRESHOLD 125
+#define HGCD_APPR_THRESHOLD 173
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 555
+#define GCDEXT_DC_THRESHOLD 478
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 28
+#define SET_STR_DC_THRESHOLD 248
+#define SET_STR_PRECOMPUTE_THRESHOLD 1648
+
+#define FAC_DSC_THRESHOLD 1075
+#define FAC_ODD_THRESHOLD 0 /* always */
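
The *_THRESHOLD values above are operand sizes, in limbs, at which the next algorithm in line starts to win on this CPU, as measured by GMP's tuneup program. A minimal sketch of how such a cutoff cascade is typically consumed; the names and constants below are local to the example, not GMP's real dispatch code, which lives in the generic mpn sources:

  #include <stddef.h>

  /* Hypothetical stand-ins for two of the cutoffs tuned above. */
  #define EX_MUL_TOOM22_THRESHOLD 27
  #define EX_MUL_TOOM33_THRESHOLD 81

  typedef enum { MUL_BASECASE, MUL_TOOM22, MUL_TOOM33 } mul_algo;

  /* Pick a multiplication algorithm for two n-limb operands. */
  static mul_algo
  choose_mul_algo (size_t n)
  {
    if (n < EX_MUL_TOOM22_THRESHOLD)
      return MUL_BASECASE;          /* schoolbook below the first cutoff */
    if (n < EX_MUL_TOOM33_THRESHOLD)
      return MUL_TOOM22;            /* Karatsuba range */
    return MUL_TOOM33;              /* and so on, up the Toom/FFT ladder */
  }

A per-CPU header like this one only re-tunes the numbers; the selection logic itself stays the same across machines.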
diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm
new file mode 100644
index 0000000..06c5d5d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_addlsh_n, mpn_rsblsh_n, optimised for Intel Goldmont.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+include_mpn(`x86_64/k8/aorrlsh_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm b/gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm
new file mode 100644
index 0000000..1818f9f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Goldmont.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+include_mpn(`x86_64/coreihwl/aors_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm
new file mode 100644
index 0000000..9c5f631
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Goldmont.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+include_mpn(`x86_64/bd1/aorsmul_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h
new file mode 100644
index 0000000..531521d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h
@@ -0,0 +1,264 @@
+/* Intel Goldmont gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 2200 MHz Intel Atom C3758 Goldmont/Denverton */
+/* FFT tuning limit = 468,030,122 */
+/* Generated by tuneup.c, 2019-10-12, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 38
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 3
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 17
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 19
+
+#define DIV_1_VS_MUL_1_PERCENT 301
+
+#define MUL_TOOM22_THRESHOLD 23
+#define MUL_TOOM33_THRESHOLD 65
+#define MUL_TOOM44_THRESHOLD 178
+#define MUL_TOOM6H_THRESHOLD 258
+#define MUL_TOOM8H_THRESHOLD 357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 131
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 121
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 178
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 113
+#define SQR_TOOM4_THRESHOLD 290
+#define SQR_TOOM6_THRESHOLD 351
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 36
+
+#define MULMOD_BNM1_THRESHOLD 14
+#define SQRMOD_BNM1_THRESHOLD 16
+
+#define MUL_FFT_MODF_THRESHOLD 440 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 440, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 24, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 103,12}, { 31,11}, { 63,10}, \
+ { 135,11}, { 79,10}, { 159,11}, { 95,10}, \
+ { 191,11}, { 111,12}, { 63,11}, { 127,10}, \
+ { 255,11}, { 143,10}, { 287, 9}, { 575,11}, \
+ { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
+ { 383, 9}, { 767,11}, { 207,10}, { 415,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 367,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,12}, { 223,11}, { 479,13}, { 127,12}, \
+ { 255,11}, { 543,12}, { 287,11}, { 607,12}, \
+ { 319,11}, { 639,12}, { 351,11}, { 703,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,12}, { 479,14}, { 127,13}, { 255,12}, \
+ { 543,11}, { 1087,12}, { 607,13}, { 319,12}, \
+ { 671,11}, { 1343,12}, { 703,11}, { 1407,12}, \
+ { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 831,13}, { 447,12}, { 959,14}, { 255,13}, \
+ { 511,12}, { 1023,11}, { 2047,12}, { 1087,13}, \
+ { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \
+ { 639,12}, { 1279,11}, { 2559,12}, { 1343,13}, \
+ { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1727,13}, { 959,15}, \
+ { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \
+ { 1087,12}, { 2175,13}, { 1151,12}, { 2303,13}, \
+ { 1215,12}, { 2431,11}, { 4863,14}, { 639,13}, \
+ { 1279,12}, { 2559,13}, { 1343,12}, { 2687,13}, \
+ { 1407,12}, { 2815,13}, { 1471,12}, { 2943,11}, \
+ { 5887,14}, { 767,13}, { 1535,12}, { 3071,13}, \
+ { 1727,14}, { 895,13}, { 1791,12}, { 3583,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2943,12}, { 5887,15}, { 767,14}, { 1535,13}, \
+ { 3071,14}, { 1663,13}, { 3455,12}, { 6911,14}, \
+ { 1791,13}, { 3583,14}, { 1919,13}, { 3839,16}, \
+ { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,13}, { 7679,12}, { 15359,14}, { 3967,16}, \
+ { 1023,15}, { 2047,14}, { 4351,15}, { 2303,14}, \
+ { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \
+ { 1535,15}, { 3071,14}, { 6143,15}, { 3327,14}, \
+ { 6911,15}, { 3839,14}, { 7679,13}, { 15359,17}, \
+ { 1023,16}, { 2047,15}, { 4351,14}, { 8703,15}, \
+ { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \
+ { 3071,15}, { 6911,16}, { 3583,15}, { 7679,14}, \
+ { 15359,15}, { 7935,17}, { 2047,16}, { 4095,15}, \
+ { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \
+ { 5119,15}, { 10239,16}, { 5631,15}, { 11775,17}, \
+ { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 261
+#define MUL_FFT_THRESHOLD 4544
+
+#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 380, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \
+ { 575,10}, { 303, 9}, { 607,10}, { 319, 9}, \
+ { 639,12}, { 95,11}, { 191,10}, { 383,11}, \
+ { 207,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,10}, { 607,11}, { 319,10}, \
+ { 639,11}, { 351,10}, { 703,11}, { 367,12}, \
+ { 191,11}, { 415,12}, { 223,11}, { 479,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 639,12}, { 351,11}, \
+ { 703,10}, { 1407,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 479,14}, { 127,13}, { 255,12}, \
+ { 607,13}, { 319,12}, { 703,11}, { 1407,12}, \
+ { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 799,13}, { 447,12}, { 895,11}, { 1791,14}, \
+ { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \
+ { 1151,11}, { 2303,12}, { 1215,13}, { 639,12}, \
+ { 1279,13}, { 703,12}, { 1407,14}, { 383,13}, \
+ { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \
+ { 895,12}, { 1791,13}, { 959,15}, { 255,14}, \
+ { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \
+ { 2175,13}, { 1151,12}, { 2303,13}, { 1215,12}, \
+ { 2431,14}, { 639,13}, { 1279,12}, { 2559,13}, \
+ { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \
+ { 1471,12}, { 2943,11}, { 5887,14}, { 767,13}, \
+ { 1535,12}, { 3071,13}, { 1599,12}, { 3199,13}, \
+ { 1663,14}, { 895,13}, { 1791,12}, { 3583,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \
+ { 5887,15}, { 767,14}, { 1535,13}, { 3199,14}, \
+ { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \
+ { 3583,14}, { 1919,16}, { 511,15}, { 1023,14}, \
+ { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3583,13}, { 7167,14}, { 3839,13}, { 7679,12}, \
+ { 15359,16}, { 1023,15}, { 2047,14}, { 4223,15}, \
+ { 2303,14}, { 4863,15}, { 2559,14}, { 5119,15}, \
+ { 2815,14}, { 5887,13}, { 11775,16}, { 1535,15}, \
+ { 3071,14}, { 6143,15}, { 3327,14}, { 6911,15}, \
+ { 3583,14}, { 7167,15}, { 3839,14}, { 7679,13}, \
+ { 15359,17}, { 1023,16}, { 2047,15}, { 4095,14}, \
+ { 8191,15}, { 4863,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \
+ { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \
+ { 2047,16}, { 4095,15}, { 8447,16}, { 4607,15}, \
+ { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 259
+#define SQR_FFT_THRESHOLD 3520
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 62
+#define MULLO_MUL_N_THRESHOLD 8907
+#define SQRLO_BASECASE_THRESHOLD 10
+#define SQRLO_DC_THRESHOLD 13
+#define SQRLO_SQR_THRESHOLD 7035
+
+#define DC_DIV_QR_THRESHOLD 51
+#define DC_DIVAPPR_Q_THRESHOLD 183
+#define DC_BDIV_QR_THRESHOLD 47
+#define DC_BDIV_Q_THRESHOLD 88
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 226
+#define INV_APPR_THRESHOLD 204
+
+#define BINV_NEWTON_THRESHOLD 264
+#define REDC_1_TO_REDC_2_THRESHOLD 28
+#define REDC_2_TO_REDC_N_THRESHOLD 54
+
+#define MU_DIV_QR_THRESHOLD 1589
+#define MU_DIVAPPR_Q_THRESHOLD 1620
+#define MUPI_DIV_QR_THRESHOLD 83
+#define MU_BDIV_QR_THRESHOLD 1334
+#define MU_BDIV_Q_THRESHOLD 1470
+
+#define POWM_SEC_TABLE 1,16,194,642
+
+#define GET_STR_DC_THRESHOLD 10
+#define GET_STR_PRECOMPUTE_THRESHOLD 17
+#define SET_STR_DC_THRESHOLD 381
+#define SET_STR_PRECOMPUTE_THRESHOLD 1042
+
+#define FAC_DSC_THRESHOLD 218
+#define FAC_ODD_THRESHOLD 25
+
+#define MATRIX22_STRASSEN_THRESHOLD 21
+#define HGCD2_DIV1_METHOD 1 /* 6.58% faster than 3 */
+#define HGCD_THRESHOLD 136
+#define HGCD_APPR_THRESHOLD 168
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 416
+#define GCDEXT_DC_THRESHOLD 393
+#define JACOBI_BASE_METHOD 4 /* 1.17% faster than 3 */
+
+/* Tuneup completed successfully, took 800192 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm b/gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm
new file mode 100644
index 0000000..ed1ec54
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_mul_1 optimised for Intel Goldmont.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c)
+include_mpn(`x86_64/coreisbr/mul_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm b/gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm
new file mode 100644
index 0000000..1192635
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Goldmont.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_redc_1)
+include_mpn(`x86_64/k8/redc_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/invert_limb.asm b/gmp-6.3.0/mpn/x86_64/invert_limb.asm
new file mode 100644
index 0000000..b375ad3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/invert_limb.asm
@@ -0,0 +1,112 @@
+dnl AMD64 mpn_invert_limb -- Invert a normalized limb.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
+
+dnl Copyright 2004, 2007-2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb (approx) div
+C AMD K8,K9 48 71
+C AMD K10 48 77
+C Intel P4 135 161
+C Intel core2 69 116
+C Intel corei 55 89
+C Intel atom 129 191
+C VIA nano 79 157
+
+C rax rcx rdx rdi rsi r8
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+PROTECT(`mpn_invert_limb_table')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_invert_limb) C Kn C2 Ci
+ FUNC_ENTRY(1)
+ mov %rdi, %rax C 0 0 0
+ shr $55, %rax C 1 1 1
+ifdef(`DARWIN',`
+ lea mpn_invert_limb_table(%rip), %r8
+ add $-512, %r8
+',`
+ lea -512+mpn_invert_limb_table(%rip), %r8
+')
+ movzwl (%r8,%rax,2), R32(%rcx) C %rcx = v0
+
+ C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
+ mov %rdi, %rsi C 0 0 0
+ mov R32(%rcx), R32(%rax) C 4 5 5
+ imul R32(%rcx), R32(%rcx) C 4 5 5
+ shr $24, %rsi C 1 1 1
+ inc %rsi C %rsi = d40
+ imul %rsi, %rcx C 8 10 8
+ shr $40, %rcx C 12 15 11
+ sal $11, R32(%rax) C 5 6 6
+ dec R32(%rax)
+ sub R32(%rcx), R32(%rax) C %rax = v1
+
+ C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
+ mov $0x1000000000000000, %rcx
+ imul %rax, %rsi C 14 17 13
+ sub %rsi, %rcx
+ imul %rax, %rcx
+ sal $13, %rax
+ shr $47, %rcx
+ add %rax, %rcx C %rcx = v2
+
+ C v3 = (v2 << 31) + (v2 * (2^96 - v2 * d63 + ((v2 >> 1) & mask)) >> 65)
+ mov %rdi, %rsi C 0 0 0
+ shr %rsi C d/2
+ sbb %rax, %rax C -d0 = -(d mod 2)
+ sub %rax, %rsi C d63 = ceil(d/2)
+ imul %rcx, %rsi C v2 * d63
+ and %rcx, %rax C v2 * d0
+ shr %rax C (v2>>1) * d0
+ sub %rsi, %rax C (v2>>1) * d0 - v2 * d63
+ mul %rcx
+ sal $31, %rcx
+ shr %rdx
+ add %rdx, %rcx C %rcx = v3
+
+ mov %rdi, %rax
+ mul %rcx
+ add %rdi, %rax
+ mov %rcx, %rax
+ adc %rdi, %rdx
+ sub %rdx, %rax
+
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ASM_END()
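
The commented v0 -> v1 -> v2 -> v3 steps above are the Newton-style refinement from Möller/Granlund's division-by-invariant-integers work: start from the 11-bit table approximation and widen it until the full reciprocal, intended to be floor((2^128 - 1)/d) - 2^64 for a normalized d. A plain-C sketch of the same steps, using unsigned __int128 where the assembly uses widening multiplies; unverified, for illustration only:

  #include <stdint.h>

  /* floor(0x7fd00 / i) for i = 256..511, as generated by
     invert_limb_table.asm below. */
  static uint16_t tab[256];

  static void
  tab_init (void)
  {
    for (int i = 256; i < 512; i++)
      tab[i - 256] = (uint16_t) (0x7fd00 / i);
  }

  /* Intended to return floor((2^128 - 1)/d) - 2^64 for d with bit 63 set,
     following the v0..v3 refinement commented in the assembly. */
  static uint64_t
  invert_limb_sketch (uint64_t d)
  {
    uint64_t d40  = (d >> 24) + 1;                 /* ceil(d / 2^24) */
    uint64_t d63  = (d >> 1) + (d & 1);            /* ceil(d / 2) */
    uint64_t mask = -(d & 1);                      /* all-ones iff d is odd */

    uint64_t v0 = tab[(d >> 55) - 256];
    uint64_t v1 = (v0 << 11) - ((v0 * v0 * d40) >> 40) - 1;
    uint64_t v2 = (v1 << 13) + ((v1 * ((1ULL << 60) - v1 * d40)) >> 47);

    unsigned __int128 e = ((unsigned __int128) 1 << 96)
                          - (unsigned __int128) v2 * d63
                          + ((v2 >> 1) & mask);
    uint64_t v3 = (v2 << 31) + (uint64_t) ((v2 * e) >> 65);

    /* Final adjustment, as in the last six instructions:
       v3 - floor(d * (2^64 + v3 + 1) / 2^64), computed mod 2^64. */
    unsigned __int128 p = (unsigned __int128) d * v3 + d;
    return v3 - ((uint64_t) (p >> 64) + d);
  }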
diff --git a/gmp-6.3.0/mpn/x86_64/invert_limb_table.asm b/gmp-6.3.0/mpn/x86_64/invert_limb_table.asm
new file mode 100644
index 0000000..739d59e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/invert_limb_table.asm
@@ -0,0 +1,50 @@
+dnl Table used for mpn_invert_limb
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
+
+dnl Copyright 2004, 2007-2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+PROTECT(`mpn_invert_limb_table')
+
+ASM_START()
+C Table entry X contains floor (0x7fd00 / (0x100 + X))
+
+ RODATA
+ ALIGN(2)
+ GLOBL mpn_invert_limb_table
+mpn_invert_limb_table:
+forloop(i,256,512-1,dnl
+` .value eval(0x7fd00/i)
+')dnl
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm
new file mode 100644
index 0000000..4723093
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_11.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/core2/gcd_11.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm
new file mode 100644
index 0000000..f58b4cc
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm
@@ -0,0 +1,142 @@
+dnl AMD64 mpn_gcd_22. Assumes useful bsf, useless shrd, no tzcnt, no shlx.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit
+C AMD K8,K9 ?
+C AMD K10 7.4
+C AMD bd1 9.9
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD bt1 ?
+C AMD bt2 ?
+C AMD zn1 ?
+C AMD zn2 ?
+C Intel P4 ?
+C Intel CNR ?
+C Intel PNR ?
+C Intel NHM 9.2
+C Intel WSM 9.0
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom ?
+C Intel SLM ?
+C Intel GLM ?
+C Intel GLM+ ?
+C VIA nano ?
+
+
+define(`u1', `%rdi')
+define(`u0', `%rsi')
+define(`v1', `%rdx')
+define(`v0_param', `%rcx')
+
+define(`v0', `%rax')
+define(`cnt', `%rcx')
+
+define(`s0', `%r8')
+define(`s1', `%r9')
+define(`t0', `%r10')
+define(`t1', `%r11')
+
+dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_22)
+ FUNC_ENTRY(4)
+ mov v0_param, v0
+
+ ALIGN(16)
+L(top): mov v0, t0
+ sub u0, t0
+ jz L(lowz) C jump when low limb result = 0
+ mov v1, t1
+ sbb u1, t1
+
+ mov u0, s0
+ mov u1, s1
+
+ bsf t0, cnt
+
+ sub v0, u0
+ sbb v1, u1
+
+L(bck): cmovc t0, u0 C u = |u - v|
+ cmovnc u1, t1 C u = |u - v|
+ cmovc s0, v0 C v = min(u,v)
+ cmovc s1, v1 C v = min(u,v)
+
+ shr R8(cnt), u0
+ mov t1, u1
+ shr R8(cnt), u1
+ neg cnt
+ shl R8(cnt), t1
+ or t1, u0
+
+ test u1, u1
+ jnz L(top)
+ test v1, v1
+ jnz L(top)
+
+L(gcd_11):
+ mov v0, %rdi
+C mov u0, %rsi
+ TCALL( mpn_gcd_11)
+
+L(lowz):C We come here when v0 - u0 = 0
+ C 1. If v1 - u1 = 0, then gcd is u = v.
+ C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+ mov v1, t0
+ sub u1, t0
+ je L(end)
+
+ xor t1, t1
+ mov u0, s0
+ mov u1, s1
+ bsf t0, cnt
+ mov u1, u0
+ xor u1, u1
+ sub v1, u0
+ jmp L(bck)
+
+L(end): C mov v0, %rax
+ C mov v1, %rdx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
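
The loop above performs one right-shift binary GCD step per iteration on two-limb operands: t = |u - v|, v = min(u, v), and u becomes t with its trailing zeros stripped (the bsf plus the shr/shl pair do the double-limb shift), until both values fit in a single limb and mpn_gcd_11 takes over. A compact C model of the same recurrence over unsigned __int128; it simply iterates to the end instead of dropping to a one-limb routine, and it assumes both inputs are odd and nonzero, as the callers arrange:

  #include <stdint.h>

  typedef unsigned __int128 u128;

  static int
  ctz128 (u128 x)                       /* x != 0 */
  {
    uint64_t lo = (uint64_t) x;
    return lo ? __builtin_ctzll (lo)
              : 64 + __builtin_ctzll ((uint64_t) (x >> 64));
  }

  static u128
  gcd_22_model (u128 u, u128 v)         /* u, v odd and nonzero */
  {
    while (u != v)
      {
        u128 t = u > v ? u - v : v - u; /* |u - v|: even, nonzero */
        v = u < v ? u : v;              /* min(u, v), still odd */
        u = t >> ctz128 (t);            /* strip trailing zeros, u odd again */
      }
    return u;
  }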
diff --git a/gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h
new file mode 100644
index 0000000..349bace
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h
@@ -0,0 +1,248 @@
+/* AMD K10 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#if 0
+#undef mpn_sublsh_n
+#define mpn_sublsh_n(rp,up,vp,n,c) \
+ (((rp) == (up)) ? mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c)) \
+ : MPN(mpn_sublsh_n)(rp,up,vp,n,c))
+#endif
+
+/* 3200-3600 MHz K10 Thuban */
+/* FFT tuning limit = 427,161,280 */
+/* Generated by tuneup.c, 2019-10-22, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 17
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 15
+
+#define DIV_1_VS_MUL_1_PERCENT 324
+
+#define MUL_TOOM22_THRESHOLD 27
+#define MUL_TOOM33_THRESHOLD 81
+#define MUL_TOOM44_THRESHOLD 232
+#define MUL_TOOM6H_THRESHOLD 363
+#define MUL_TOOM8H_THRESHOLD 478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 155
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 160
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 142
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 117
+#define SQR_TOOM4_THRESHOLD 280
+#define SQR_TOOM6_THRESHOLD 446
+#define SQR_TOOM8_THRESHOLD 547
+
+#define MULMID_TOOM42_THRESHOLD 34
+
+#define MULMOD_BNM1_THRESHOLD 15
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 530 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 530, 5}, { 24, 6}, { 13, 5}, { 27, 6}, \
+ { 27, 7}, { 14, 6}, { 29, 7}, { 15, 6}, \
+ { 31, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 36, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \
+ { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
+ { 51, 9}, { 27, 8}, { 55,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 35, 8}, { 71, 9}, \
+ { 39, 8}, { 81, 9}, { 43,10}, { 23, 9}, \
+ { 55,11}, { 15,10}, { 31, 9}, { 71,10}, \
+ { 39, 9}, { 87,10}, { 47, 9}, { 99,10}, \
+ { 55,11}, { 31,10}, { 87,11}, { 47,10}, \
+ { 111,12}, { 31,11}, { 63,10}, { 143,11}, \
+ { 79,10}, { 167,11}, { 95,10}, { 191,11}, \
+ { 111,12}, { 63,11}, { 143,10}, { 287,11}, \
+ { 159,12}, { 95,11}, { 207,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,12}, { 223,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 543,12}, { 287,11}, { 575,10}, \
+ { 1151,11}, { 607,12}, { 319,11}, { 671,12}, \
+ { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 415,11}, { 831,12}, { 447,14}, \
+ { 127,13}, { 255,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \
+ { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \
+ { 831,13}, { 447,12}, { 959,14}, { 255,13}, \
+ { 511,12}, { 1087,13}, { 575,12}, { 1215,13}, \
+ { 639,12}, { 1343,13}, { 703,12}, { 1471,14}, \
+ { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \
+ { 1663,13}, { 959,14}, { 511,13}, { 1087,12}, \
+ { 2175,13}, { 1215,14}, { 639,13}, { 1471,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2431,14}, { 1279,13}, { 2559,14}, { 1407,15}, \
+ { 767,14}, { 1791,16}, { 511,15}, { 1023,14}, \
+ { 2431,15}, { 1279,14}, { 2943,15}, { 1535,14}, \
+ { 3199,15}, { 1791,14}, { 3583,16}, { 1023,15}, \
+ { 2047,14}, { 4223,15}, { 2303,14}, { 4863,15}, \
+ { 2559,14}, { 5247,15}, { 2815,16}, { 1535,15}, \
+ { 3071,14}, { 6271,15}, { 3327,14}, { 6911,15}, \
+ { 3583,17}, { 1023,16}, { 2047,15}, { 4351,14}, \
+ { 8959,15}, { 4863,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \
+ { 7167,17}, { 2047,16}, { 4095,15}, { 8959,16}, \
+ { 4607,15}, { 9983,16}, { 5631,15}, { 11775,17}, \
+ { 3071,16}, { 6143,15}, { 12543,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 207
+#define MUL_FFT_THRESHOLD 7552
+
+#define SQR_FFT_MODF_THRESHOLD 476 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 29, 7}, { 28, 8}, \
+ { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 21, 7}, { 43, 8}, \
+ { 23, 7}, { 47, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
+ { 49, 9}, { 27, 8}, { 55,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 43,10}, { 23, 9}, \
+ { 55,11}, { 15,10}, { 31, 9}, { 67,10}, \
+ { 39, 9}, { 83,10}, { 47, 9}, { 95,10}, \
+ { 55,11}, { 31,10}, { 79,11}, { 47,10}, \
+ { 103,12}, { 31,11}, { 63,10}, { 135,11}, \
+ { 79,10}, { 167,11}, { 111,12}, { 63,11}, \
+ { 127,10}, { 255,11}, { 143,10}, { 287, 9}, \
+ { 575,11}, { 159,10}, { 319,12}, { 95,11}, \
+ { 191,10}, { 383, 9}, { 767,10}, { 399,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
+ { 671,11}, { 351,10}, { 703,11}, { 367,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,11}, { 543,12}, \
+ { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
+ { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \
+ { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \
+ { 383,11}, { 799,12}, { 415,11}, { 831,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \
+ { 1151,12}, { 607,13}, { 319,12}, { 639,11}, \
+ { 1279,12}, { 671,11}, { 1343,12}, { 703,11}, \
+ { 1407,12}, { 735,13}, { 383,12}, { 767,11}, \
+ { 1535,12}, { 831,11}, { 1663,13}, { 447,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1087,13}, \
+ { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \
+ { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1727,13}, { 895,12}, \
+ { 1791,13}, { 959,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
+ { 1471,14}, { 767,13}, { 1727,14}, { 895,13}, \
+ { 1791,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2303,14}, { 1279,13}, { 2559,14}, \
+ { 1407,15}, { 767,14}, { 1791,16}, { 511,15}, \
+ { 1023,14}, { 2303,15}, { 1279,14}, { 2815,15}, \
+ { 1535,14}, { 3199,15}, { 1791,16}, { 1023,15}, \
+ { 2047,14}, { 4223,15}, { 2303,14}, { 4863,15}, \
+ { 2559,14}, { 5247,15}, { 2815,16}, { 1535,15}, \
+ { 3071,14}, { 6271,15}, { 3327,14}, { 6911,17}, \
+ { 1023,16}, { 2047,15}, { 4351,14}, { 8959,15}, \
+ { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \
+ { 3071,15}, { 6911,16}, { 3583,15}, { 7679,17}, \
+ { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \
+ { 9983,16}, { 5119,15}, { 10495,16}, { 5631,15}, \
+ { 11775,17}, { 3071,16}, { 6143,15}, { 12287,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 224
+#define SQR_FFT_THRESHOLD 5568
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 61
+#define MULLO_MUL_N_THRESHOLD 14281
+#define SQRLO_BASECASE_THRESHOLD 9
+#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD 10950
+
+#define DC_DIV_QR_THRESHOLD 54
+#define DC_DIVAPPR_Q_THRESHOLD 238
+#define DC_BDIV_QR_THRESHOLD 54
+#define DC_BDIV_Q_THRESHOLD 42
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 252
+#define INV_APPR_THRESHOLD 230
+
+#define BINV_NEWTON_THRESHOLD 327
+#define REDC_1_TO_REDC_2_THRESHOLD 25
+#define REDC_2_TO_REDC_N_THRESHOLD 67
+
+#define MU_DIV_QR_THRESHOLD 1620
+#define MU_DIVAPPR_Q_THRESHOLD 1620
+#define MUPI_DIV_QR_THRESHOLD 104
+#define MU_BDIV_QR_THRESHOLD 1528
+#define MU_BDIV_Q_THRESHOLD 1652
+
+#define POWM_SEC_TABLE 1,22,321,473,2144
+
+#define GET_STR_DC_THRESHOLD 15
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_DC_THRESHOLD 248
+#define SET_STR_PRECOMPUTE_THRESHOLD 1304
+
+#define FAC_DSC_THRESHOLD 470
+#define FAC_ODD_THRESHOLD 25
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD2_DIV1_METHOD 5 /* 8.38% faster than 4 */
+#define HGCD_THRESHOLD 115
+#define HGCD_APPR_THRESHOLD 146
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 535
+#define GCDEXT_DC_THRESHOLD 460
+#define JACOBI_BASE_METHOD 1 /* 0.90% faster than 4 */
+
+/* Tuneup completed successfully, took 448763 seconds */
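
The #if 0 block near the top of this header leans on the identity a - (b << c) = a - b*2^c: when the destination aliases the first operand, mpn_sublsh_n can be expressed as mpn_submul_1 with the multiplier CNST_LIMB(1) << c. A limb-level sketch of that submul-with-a-power-of-two step, with local names and no claim to match the library's code:

  #include <stdint.h>
  #include <stddef.h>

  /* rp[0..n-1] -= vp[0..n-1] * 2^c  (0 <= c < 64), returning the borrow
     limb; the same effect as submul_1 with multiplier 2^c. */
  static uint64_t
  sublsh_n_sketch (uint64_t *rp, const uint64_t *vp, size_t n, unsigned c)
  {
    uint64_t m = (uint64_t) 1 << c;
    uint64_t borrow = 0;
    for (size_t i = 0; i < n; i++)
      {
        unsigned __int128 t = (unsigned __int128) vp[i] * m + borrow;
        uint64_t lo = (uint64_t) t;
        borrow = (uint64_t) (t >> 64);
        borrow += rp[i] < lo;           /* borrow out of the limb subtract */
        rp[i] -= lo;
      }
    return borrow;
  }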
diff --git a/gmp-6.3.0/mpn/x86_64/k10/hamdist.asm b/gmp-6.3.0/mpn/x86_64/k10/hamdist.asm
new file mode 100644
index 0000000..f70494a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k10/hamdist.asm
@@ -0,0 +1,109 @@
+dnl AMD64 mpn_hamdist -- hamming distance.
+
+dnl Copyright 2008, 2010-2012, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 2.0 =
+C AMD bd1 ~4.4 =
+C AMD bd2 ~4.4 =
+C AMD bd3
+C AMD bd4
+C AMD bobcat 7.55 =
+C AMD jaguar 2.52 -
+C Intel P4 -
+C Intel core2 -
+C Intel NHM 2.03 +
+C Intel SBR 2.01 +
+C Intel IBR 1.96 +
+C Intel HWL 1.64 =
+C Intel BWL 1.56 -
+C Intel SKL 1.52 =
+C Intel atom
+C Intel SLM 3.0 -
+C VIA nano
+
+define(`ap', `%rdi')
+define(`bp', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_hamdist)
+ FUNC_ENTRY(3)
+ mov (ap), %r8
+ xor (bp), %r8
+
+ lea (ap,n,8), ap C point at A operand end
+ lea (bp,n,8), bp C point at B operand end
+ neg n
+
+ test $1, R8(n)
+ jz L(2)
+
+L(1): .byte 0xf3,0x49,0x0f,0xb8,0xc0 C popcnt %r8, %rax
+ xor R32(%r10), R32(%r10)
+ inc n
+ js L(top)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(2): mov 8(ap,n,8), %r9
+ .byte 0xf3,0x49,0x0f,0xb8,0xc0 C popcnt %r8, %rax
+ xor 8(bp,n,8), %r9
+ .byte 0xf3,0x4d,0x0f,0xb8,0xd1 C popcnt %r9, %r10
+ add $2, n
+ js L(top)
+ lea (%r10, %rax), %rax
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(top): mov (ap,n,8), %r8
+ lea (%r10, %rax), %rax
+ mov 8(ap,n,8), %r9
+ xor (bp,n,8), %r8
+ xor 8(bp,n,8), %r9
+ .byte 0xf3,0x49,0x0f,0xb8,0xc8 C popcnt %r8, %rcx
+ lea (%rcx, %rax), %rax
+ .byte 0xf3,0x4d,0x0f,0xb8,0xd1 C popcnt %r9, %r10
+ add $2, n
+ js L(top)
+
+ lea (%r10, %rax), %rax
+ FUNC_EXIT()
+ ret
+EPILOGUE()
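
Behind the two-way unrolling and the .byte-encoded popcnt, the function computes the Hamming distance of two n-limb operands: the total population count of the XOR of corresponding limbs. A direct C restatement, illustrative only:

  #include <stdint.h>
  #include <stddef.h>

  static uint64_t
  hamdist_ref (const uint64_t *ap, const uint64_t *bp, size_t n)
  {
    uint64_t count = 0;
    for (size_t i = 0; i < n; i++)
      count += (uint64_t) __builtin_popcountll (ap[i] ^ bp[i]);
    return count;
  }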
diff --git a/gmp-6.3.0/mpn/x86_64/k10/lshift.asm b/gmp-6.3.0/mpn/x86_64/k10/lshift.asm
new file mode 100644
index 0000000..cadf9b9
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k10/lshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshift optimised for AMD K10.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm
new file mode 100644
index 0000000..48a92e5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshiftc optimised for AMD K10.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/k10/popcount.asm b/gmp-6.3.0/mpn/x86_64/k10/popcount.asm
new file mode 100644
index 0000000..3814aea
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k10/popcount.asm
@@ -0,0 +1,138 @@
+dnl AMD64 mpn_popcount -- population count.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 1.125
+C Intel P4 n/a
+C Intel core2 n/a
+C Intel corei 1.25
+C Intel atom n/a
+C VIA nano n/a
+
+C * The zero-offset of popcount is misassembled to the offset-less form, which
+C is one byte shorter and therefore will mess up the switching code.
+C * The outdated gas used in FreeBSD and NetBSD cannot handle the POPCNT insn,
+C which is the main reason for our usage of '.byte'.
+
+C TODO
+C * Improve switching code, the current code sucks.
+
+define(`up', `%rdi')
+define(`n', `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_popcount)
+ FUNC_ENTRY(2)
+
+ifelse(1,1,`
+ lea (up,n,8), up
+
+C mov R32(n), R32(%rcx)
+C neg R32(%rcx)
+ imul $-1, R32(n), R32(%rcx)
+ and $8-1, R32(%rcx)
+
+ neg n
+
+ mov R32(%rcx), R32(%rax)
+ neg %rax
+ lea (up,%rax,8),up
+
+ xor R32(%rax), R32(%rax)
+
+ lea (%rcx,%rcx,4), %rcx
+
+ lea L(top)(%rip), %rdx
+ lea (%rdx,%rcx,2), %rdx
+ jmp *%rdx
+',`
+ lea (up,n,8), up
+
+ mov R32(n), R32(%rcx)
+ neg R32(%rcx)
+ and $8-1, R32(%rcx)
+
+ neg n
+
+ mov R32(%rcx), R32(%rax)
+ shl $3, R32(%rax)
+ sub %rax, up
+
+ xor R32(%rax), R32(%rax)
+
+C add R32(%rcx), R32(%rcx) C 2x
+C lea (%rcx,%rcx,4), %rcx C 10x
+ imul $10, R32(%rcx)
+
+ lea L(top)(%rip), %rdx
+ add %rcx, %rdx
+ jmp *%rdx
+')
+
+ ALIGN(32)
+L(top):
+C 0 = n mod 8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x00 C popcnt 0(up,n,8), %r8
+ add %r8, %rax
+C 7 = n mod 8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x08 C popcnt 8(up,n,8), %r9
+ add %r9, %rax
+C 6 = n mod 8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x10 C popcnt 16(up,n,8), %r8
+ add %r8, %rax
+C 5 = n mod 8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x18 C popcnt 24(up,n,8), %r9
+ add %r9, %rax
+C 4 = n mod 8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x20 C popcnt 32(up,n,8), %r8
+ add %r8, %rax
+C 3 = n mod 8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x28 C popcnt 40(up,n,8), %r9
+ add %r9, %rax
+C 2 = n mod 8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x30 C popcnt 48(up,n,8), %r8
+ add %r8, %rax
+C 1 = n mod 8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x38 C popcnt 56(up,n,8), %r9
+ add %r9, %rax
+
+ add $8, n
+ js L(top)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/k10/rshift.asm b/gmp-6.3.0/mpn/x86_64/k10/rshift.asm
new file mode 100644
index 0000000..249051a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k10/rshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_rshift optimised for AMD K10.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sec_tabselect.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm b/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm
new file mode 100644
index 0000000..3e1898b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm
@@ -0,0 +1,153 @@
+dnl AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63.
+
+dnl Copyright 2008, 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.167
+C AMD K10 2.167
+C Intel P4 12.0
+C Intel core2 4.0
+C Intel corei ?
+C Intel atom ?
+C VIA nano ?
+
+C TODO
+C * Perhaps handle various n mod 3 sizes better. The code now is too large.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`bp_param', `%rdx')
+define(`n', `%rcx')
+define(`u0', `%r8')
+define(`v0', `%r9')
+
+
+define(`bp', `%rbp')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_addaddmul_1msb0)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+ push %rbp
+
+ lea (ap,n,8), ap
+ lea (bp_param,n,8), bp
+ lea (rp,n,8), rp
+ neg n
+
+ mov (ap,n,8), %rax
+ mul %r8
+ mov %rax, %r11
+ mov (bp,n,8), %rax
+ mov %rdx, %r10
+ add $3, n
+ jns L(end)
+
+ push %r13
+
+ ALIGN(16)
+L(top): mul %r9
+ add %rax, %r11
+ mov -16(ap,n,8), %rax
+ adc %rdx, %r10
+ mov %r11, -24(rp,n,8)
+ mul %r8
+ add %rax, %r10
+ mov -16(bp,n,8), %rax
+ mov $0, R32(%r13)
+ adc %rdx, %r13
+ mul %r9
+ add %rax, %r10
+ mov -8(ap,n,8), %rax
+ adc %rdx, %r13
+ mov %r10, -16(rp,n,8)
+ mul %r8
+ add %rax, %r13
+ mov -8(bp,n,8), %rax
+ mov $0, R32(%r11)
+ adc %rdx, %r11
+ mul %r9
+ add %rax, %r13
+ adc %rdx, %r11
+ mov (ap,n,8), %rax
+ mul %r8
+ add %rax, %r11
+ mov %r13, -8(rp,n,8)
+ mov (bp,n,8), %rax
+ mov $0, R32(%r10)
+ adc %rdx, %r10
+ add $3, n
+ js L(top)
+
+ pop %r13
+
+L(end): mul %r9
+ add %rax, %r11
+ adc %rdx, %r10
+ cmp $1, R32(n)
+ ja L(two)
+ mov -16(ap,n,8), %rax
+ mov %r11, -24(rp,n,8)
+ mov %r10, %r11
+ jz L(one)
+
+L(nul): mul %r8
+ add %rax, %r10
+ mov -16(bp), %rax
+ mov $0, R32(%r11)
+ adc %rdx, %r11
+ mul %r9
+ add %rax, %r10
+ mov -8(ap), %rax
+ adc %rdx, %r11
+ mov %r10, -16(rp)
+L(one): mul %r8
+ add %rax, %r11
+ mov -8(bp), %rax
+ mov $0, R32(%r10)
+ adc %rdx, %r10
+ mul %r9
+ add %rax, %r11
+ adc %rdx, %r10
+
+L(two): mov %r11, -8(rp)
+ mov %r10, %rax
+L(ret): pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
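
For orientation, a minimal C model of what mpn_addaddmul_1msb0 computes (an illustrative sketch, not the GMP code; mp_limb_t and the ref_ helper are local to this note and assume 64-bit limbs with unsigned __int128 available): rp[0..n-1] receives ap*u + bp*v and the carry limb is returned. The u,v < 2^63 restriction guarantees that ap[i]*u + bp[i]*v plus an incoming carry always fits in two limbs.

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Reference model only: rp[] = ap[]*u + bp[]*v, u,v < 2^63, n >= 1.
   Returns the carry (high) limb.  The real routine is the asm above. */
static mp_limb_t
ref_addaddmul_1msb0 (mp_limb_t *rp, const mp_limb_t *ap, const mp_limb_t *bp,
                     long n, mp_limb_t u, mp_limb_t v)
{
  unsigned __int128 carry = 0;
  for (long i = 0; i < n; i++)
    {
      /* u,v < 2^63 guarantees this sum fits in 128 bits */
      unsigned __int128 t = (unsigned __int128) ap[i] * u
                          + (unsigned __int128) bp[i] * v + carry;
      rp[i] = (mp_limb_t) t;
      carry = t >> 64;
    }
  return (mp_limb_t) carry;
}
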
diff --git a/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm
new file mode 100644
index 0000000..78bcba1
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm
@@ -0,0 +1,195 @@
+dnl AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
+dnl add the result to a third limb vector.
+
+dnl Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cfg cycles/limb am1+am1
+C AMD K8,K9 2.375
+C AMD K10 2.375
+C AMD bull 5.2 <- 4.6-4.75 bad
+C AMD pile 4.96 <- 4.6-4.75 bad
+C AMD steam ?
+C AMD excavator ?
+C AMD bobcat 5.75 5.0 bad
+C AMD jaguar 5.9 5.2-5.4 bad
+C Intel P4 15-16
+C Intel core2 4.5 4.25-4.5 bad
+C Intel NHM 4.33 4.55 bad
+C Intel SBR 3.4 2.93 3.24 bad
+C Intel IBR 3.35 2.6 2.95 bad
+C Intel HWL 3.3 2.15 2.3 bad
+C Intel BWL 2.33 2.33 1.65 bad
+C Intel SKL 2.37 2.21 1.64 bad
+C Intel atom 20 18.7
+C Intel SLM 8 8.5
+C VIA nano 4.4
+
+C This code is the result of running a code generation and optimization tool
+C suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Tune feed-in and wind-down code.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param',`%rdx')
+define(`vp', `%rcx')
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_addmul_2)
+ FUNC_ENTRY(4)
+ mov n_param, n
+ push %rbx
+ push %rbp
+
+ mov 0(vp), v0
+ mov 8(vp), v1
+
+ mov R32(n_param), R32(%rbx)
+ mov (up), %rax
+ lea -8(up,n_param,8), up
+ lea -8(rp,n_param,8), rp
+ mul v0
+ neg n
+ and $3, R32(%rbx)
+ jz L(b0)
+ cmp $2, R32(%rbx)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): mov %rax, w1
+ mov %rdx, w2
+ xor R32(w3), R32(w3)
+ mov 8(up,n,8), %rax
+ dec n
+ jmp L(lo3)
+
+L(b2): mov %rax, w2
+ mov 8(up,n,8), %rax
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ add $-2, n
+ jmp L(lo2)
+
+L(b1): mov %rax, w3
+ mov 8(up,n,8), %rax
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ inc n
+ jmp L(lo1)
+
+L(b0): mov $0, R32(w3)
+ mov %rax, w0
+ mov 8(up,n,8), %rax
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ jmp L(lo0)
+
+ ALIGN(32)
+L(top): mov $0, R32(w1)
+ mul v0
+ add %rax, w3
+ mov (up,n,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(lo1): mul v1
+ add w3, (rp,n,8)
+ mov $0, R32(w3)
+ adc %rax, w0
+ mov $0, R32(w2)
+ mov 8(up,n,8), %rax
+ adc %rdx, w1
+ mul v0
+ add %rax, w0
+ mov 8(up,n,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+L(lo0): mul v1
+ add w0, 8(rp,n,8)
+ adc %rax, w1
+ adc %rdx, w2
+ mov 16(up,n,8), %rax
+ mul v0
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 16(up,n,8), %rax
+L(lo3): mul v1
+ add w1, 16(rp,n,8)
+ adc %rax, w2
+ adc %rdx, w3
+ xor R32(w0), R32(w0)
+ mov 24(up,n,8), %rax
+ mul v0
+ add %rax, w2
+ mov 24(up,n,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+L(lo2): mul v1
+ add w2, 24(rp,n,8)
+ adc %rax, w3
+ adc %rdx, w0
+ mov 32(up,n,8), %rax
+ add $4, n
+ js L(top)
+
+L(end): xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w3
+ mov (up), %rax
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w3, (rp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(rp)
+ mov w1, %rax
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
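
A C model of the recurrence the loop above implements (a sketch with a local mp_limb_t typedef, not the GMP source): two carry limbs ride along so each up[i] is multiplied by both v limbs before the pair advances. As coded above, the low n limbs are added into rp in place, limb n of the result is stored to rp[n], and the topmost limb is returned.

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Reference model of the addmul_2 recurrence: add {up,n} * {vp[0],vp[1]}
   to {rp,n}; limb n of the result is written to rp[n], limb n+1 returned. */
static mp_limb_t
ref_addmul_2 (mp_limb_t *rp, const mp_limb_t *up, long n, const mp_limb_t vp[2])
{
  mp_limb_t c0 = 0, c1 = 0;      /* pending carries into limbs i and i+1 */
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * vp[0] + rp[i] + c0;
      unsigned __int128 s = (unsigned __int128) up[i] * vp[1]
                          + (mp_limb_t) (t >> 64) + c1;
      rp[i] = (mp_limb_t) t;
      c0 = (mp_limb_t) s;           /* added at limb i+1 next round */
      c1 = (mp_limb_t) (s >> 64);   /* added at limb i+2 next round */
    }
  rp[n] = c0;
  return c1;
}
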
diff --git a/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm
new file mode 100644
index 0000000..ff3a184
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm
@@ -0,0 +1,217 @@
+dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U.
+
+dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.87 < 3.85 for lshift + add_n
+C AMD K10 2.75 < 3.85 for lshift + add_n
+C Intel P4 22 > 7.33 for lshift + add_n
+C Intel core2 4.1 > 3.27 for lshift + add_n
+C Intel NHM 4.4 > 3.75 for lshift + add_n
+C Intel SBR 3.17 < 3.46 for lshift + add_n
+C Intel atom ? ? 8.75 for lshift + add_n
+C VIA nano 4.7 < 6.25 for lshift + add_n
+
+C TODO
+C * Can we propagate carry into rdx instead of using a special carry register?
+C That could save enough insns to get to 10 cycles/iteration.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n_param', `%rcx')
+define(`cnt', `%r8')
+
+define(`vp', `%r12')
+define(`n', `%rbp')
+
+ifdef(`OPERATION_addlsh_n',`
+ define(ADDSUB, `add')
+ define(ADCSBB, `adc')
+ define(func, mpn_addlsh_n)
+')
+ifdef(`OPERATION_rsblsh_n',`
+ define(ADDSUB, `sub')
+ define(ADCSBB, `sbb')
+ define(func, mpn_rsblsh_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov (vp_param), %rax C load first V limb early
+
+ mov $0, R32(n)
+ sub n_param, n
+
+ lea -16(up,n_param,8), up
+ lea -16(rp,n_param,8), rp
+ lea 16(vp_param,n_param,8), vp
+
+ mov n_param, %r9
+
+ mov %r8, %rcx
+ mov $1, R32(%r8)
+ shl R8(%rcx), %r8
+
+ mul %r8 C initial multiply
+
+ and $3, R32(%r9)
+ jz L(b0)
+ cmp $2, R32(%r9)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): mov %rax, %r11
+ ADDSUB 16(up,n,8), %r11
+ mov -8(vp,n,8), %rax
+ sbb R32(%rcx), R32(%rcx)
+ mov %rdx, %rbx
+ mul %r8
+ or %rax, %rbx
+ mov (vp,n,8), %rax
+ mov %rdx, %r9
+ mul %r8
+ or %rax, %r9
+ add $3, n
+ jnz L(lo3)
+ jmp L(cj3)
+
+L(b2): mov %rax, %rbx
+ mov -8(vp,n,8), %rax
+ mov %rdx, %r9
+ mul %r8
+ or %rax, %r9
+ add $2, n
+ jz L(cj2)
+ mov %rdx, %r10
+ mov -16(vp,n,8), %rax
+ mul %r8
+ or %rax, %r10
+ xor R32(%rcx), R32(%rcx) C clear carry register
+ jmp L(lo2)
+
+L(b1): mov %rax, %r9
+ mov %rdx, %r10
+ add $1, n
+ jnz L(gt1)
+ ADDSUB 8(up,n,8), %r9
+ jmp L(cj1)
+L(gt1): mov -16(vp,n,8), %rax
+ mul %r8
+ or %rax, %r10
+ mov %rdx, %r11
+ mov -8(vp,n,8), %rax
+ mul %r8
+ or %rax, %r11
+ ADDSUB 8(up,n,8), %r9
+ ADCSBB 16(up,n,8), %r10
+ ADCSBB 24(up,n,8), %r11
+ mov (vp,n,8), %rax
+ sbb R32(%rcx), R32(%rcx)
+ jmp L(lo1)
+
+L(b0): mov %rax, %r10
+ mov %rdx, %r11
+ mov -8(vp,n,8), %rax
+ mul %r8
+ or %rax, %r11
+ ADDSUB 16(up,n,8), %r10
+ ADCSBB 24(up,n,8), %r11
+ mov (vp,n,8), %rax
+ sbb R32(%rcx), R32(%rcx)
+ mov %rdx, %rbx
+ mul %r8
+ or %rax, %rbx
+ mov 8(vp,n,8), %rax
+ add $4, n
+ jz L(end)
+
+ ALIGN(8)
+L(top): mov %rdx, %r9
+ mul %r8
+ or %rax, %r9
+ mov %r10, -16(rp,n,8)
+L(lo3): mov %rdx, %r10
+ mov -16(vp,n,8), %rax
+ mul %r8
+ or %rax, %r10
+ mov %r11, -8(rp,n,8)
+L(lo2): mov %rdx, %r11
+ mov -8(vp,n,8), %rax
+ mul %r8
+ or %rax, %r11
+ add R32(%rcx), R32(%rcx)
+ ADCSBB (up,n,8), %rbx
+ ADCSBB 8(up,n,8), %r9
+ ADCSBB 16(up,n,8), %r10
+ ADCSBB 24(up,n,8), %r11
+ mov (vp,n,8), %rax
+ sbb R32(%rcx), R32(%rcx)
+ mov %rbx, (rp,n,8)
+L(lo1): mov %rdx, %rbx
+ mul %r8
+ or %rax, %rbx
+ mov %r9, 8(rp,n,8)
+L(lo0): mov 8(vp,n,8), %rax
+ add $4, n
+ jnz L(top)
+
+L(end): mov %rdx, %r9
+ mul %r8
+ or %rax, %r9
+ mov %r10, -16(rp,n,8)
+L(cj3): mov %r11, -8(rp,n,8)
+L(cj2): add R32(%rcx), R32(%rcx)
+ ADCSBB (up,n,8), %rbx
+ ADCSBB 8(up,n,8), %r9
+ mov %rbx, (rp,n,8)
+L(cj1): mov %r9, 8(rp,n,8)
+ mov %rdx, %rax
+ ADCSBB $0, %rax
+ pop %rbx
+ pop %rbp
+ pop %r12
+ FUNC_EXIT()
+ ret
+EPILOGUE()
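
Functionally, mpn_addlsh_n computes rp[] = up[] + (vp[] << cnt) and mpn_rsblsh_n computes rp[] = (vp[] << cnt) - up[], with the bits that fall above limb n-1 folded into the return value. The assembly forms 2^cnt and does the shifting with mul; the sketch below (local ref_ helper, assuming 0 < cnt < 64, add variant only) uses plain shifts, which is arithmetically equivalent.

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Reference model: rp[] = up[] + (vp[] << cnt), 0 < cnt < 64, n >= 1.
   Returns the bits carried/shifted out above limb n-1, a value in [0, 2^cnt].
   mpn_rsblsh_n is the analogous (vp[] << cnt) - up[] variant. */
static mp_limb_t
ref_addlsh_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
              long n, unsigned cnt)
{
  mp_limb_t shift_in = 0, cy = 0;
  for (long i = 0; i < n; i++)
    {
      mp_limb_t lsh = (vp[i] << cnt) | shift_in;  /* low limb of vp[i]*2^cnt */
      shift_in = vp[i] >> (64 - cnt);             /* bits pushed up one limb */
      mp_limb_t s = up[i] + lsh;
      mp_limb_t c = s < lsh;                      /* carry out of the add */
      rp[i] = s + cy;
      cy = c + (rp[i] < s);                       /* at most one carry is set */
    }
  return shift_in + cy;
}
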
diff --git a/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm
new file mode 100644
index 0000000..1172b0d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm
@@ -0,0 +1,179 @@
+dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor,
+dnl returning quotient only.
+
+dnl Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012, 2017 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm/unorm
+C AMD K8,K9 10 +
+C AMD K10 10 +
+C AMD bull 13.7 -
+C AMD pile 13.7 +
+C AMD steam
+C AMD excavator
+C AMD bobcat 15 -
+C AMD jaguar 16 -
+C Intel P4 33 =
+C Intel core2 13.25 =
+C Intel NHM 14 =
+C Intel SBR 8.5 -
+C Intel IBR 8.5 -
+C Intel HWL 8 =
+C Intel BWL 8 =
+C Intel SKL 8 =
+C Intel atom 42 --
+C Intel SLM 20.4 --
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`d', `%rcx')
+define(`di', `%r8') C just mpn_pi1_bdiv_q_1
+define(`ncnt', `%r9') C just mpn_pi1_bdiv_q_1
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+ FUNC_ENTRY(4)
+ push %rbx
+
+ mov %rcx, %rax
+ xor R32(%rcx), R32(%rcx) C ncnt count
+ mov %rdx, %r10
+
+ bt $0, R32(%rax)
+ jnc L(evn) C skip bsf unless divisor is even
+
+L(odd): mov %rax, %rbx
+ shr R32(%rax)
+ and $127, R32(%rax) C d/2, 7 bits
+
+ LEA( binvert_limb_table, %rdx)
+
+ movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
+
+ mov %rbx, %r11 C d without twos
+
+ lea (%rax,%rax), R32(%rdx) C 2*inv
+ imul R32(%rax), R32(%rax) C inv*inv
+ imul R32(%rbx), R32(%rax) C inv*inv*d
+ sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits
+
+ lea (%rdx,%rdx), R32(%rax) C 2*inv
+ imul R32(%rdx), R32(%rdx) C inv*inv
+ imul R32(%rbx), R32(%rdx) C inv*inv*d
+ sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits
+
+ lea (%rax,%rax), %r8 C 2*inv
+ imul %rax, %rax C inv*inv
+ imul %rbx, %rax C inv*inv*d
+ sub %rax, %r8 C inv = 2*inv - inv*inv*d, 64 bits
+
+ jmp L(pi1)
+
+L(evn): bsf %rax, %rcx
+ shr R8(%rcx), %rax
+ jmp L(odd)
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+ push %rbx
+
+ mov %rcx, %r11 C d
+ mov %rdx, %r10 C n
+ mov %r9, %rcx C ncnt
+
+L(pi1): mov (up), %rax C up[0]
+
+ dec %r10
+ jz L(one)
+
+ mov 8(up), %rdx C up[1]
+ lea (up,%r10,8), up C up end
+ lea (rp,%r10,8), rp C rp end
+ neg %r10 C -n
+
+ shrd R8(%rcx), %rdx, %rax
+
+ xor R32(%rbx), R32(%rbx)
+ jmp L(ent)
+
+ ALIGN(8)
+L(top):
+ C rax q
+ C rbx carry bit, 0 or 1
+ C rcx ncnt
+ C rdx
+ C r10 counter, limbs, negative
+ C r11 d
+
+ mul %r11 C carry limb in rdx
+ mov (up,%r10,8), %rax
+ mov 8(up,%r10,8), %r9
+ shrd R8(%rcx), %r9, %rax
+ nop
+ sub %rbx, %rax C apply carry bit
+ setc R8(%rbx)
+ sub %rdx, %rax C apply carry limb
+ adc $0, R32(%rbx)
+L(ent): imul %r8, %rax
+ mov %rax, (rp,%r10,8)
+ inc %r10
+ jnz L(top)
+
+ mul %r11 C carry limb in rdx
+ mov (up), %rax C up high limb
+ shr R8(%rcx), %rax
+ sub %rbx, %rax C apply carry bit
+ sub %rdx, %rax C apply carry limb
+ imul %r8, %rax
+ mov %rax, (rp)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(one): shr R8(%rcx), %rax
+ imul %r8, %rax
+ mov %rax, (rp)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
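
The block above that builds %r8 is a Newton iteration for the divisor's inverse modulo 2^64: an 8-bit inverse comes from binvert_limb_table, and each inv = 2*inv - inv*inv*d step doubles the number of correct low bits (8 -> 16 -> 32 -> 64). A self-contained C sketch of the same idea, seeded instead from the fact that an odd d is its own inverse mod 8 (so it needs five doublings rather than three):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Sketch of the inverse computation: each step inv = 2*inv - inv*inv*d
   doubles the number of correct low bits of a modular inverse of odd d. */
static mp_limb_t
ref_binvert_limb (mp_limb_t d)          /* d must be odd */
{
  mp_limb_t inv = d;                    /* correct to 3 bits (d*d == 1 mod 8) */
  inv = 2*inv - inv*inv*d;              /*  6 bits */
  inv = 2*inv - inv*inv*d;              /* 12 bits */
  inv = 2*inv - inv*inv*d;              /* 24 bits */
  inv = 2*inv - inv*inv*d;              /* 48 bits */
  inv = 2*inv - inv*inv*d;              /* 64 bits: inv*d == 1 (mod 2^64) */
  return inv;
}

With the inverse in hand, the main loop above produces each quotient limb as (current low limb) * inv mod 2^64 (the imul) and subtracts the corresponding multiple of d from the remaining dividend, which is the usual Hensel / exact-division recurrence.
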
diff --git a/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm b/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm
new file mode 100644
index 0000000..86de08c
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm
@@ -0,0 +1,249 @@
+dnl x86-64 mpn_div_qr_1n_pi1
+dnl -- Divide an mpn number by a normalized single-limb number,
+dnl using a single-limb inverse.
+
+dnl Contributed to the GNU project by Niels Möller
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C c/l
+C AMD K8,K9 11
+C AMD K10 11
+C AMD bull 16
+C AMD pile 14.25
+C AMD steam ?
+C AMD bobcat 16
+C AMD jaguar ?
+C Intel P4 47.5 poor
+C Intel core 28.5 very poor
+C Intel NHM 29 very poor
+C Intel SBR 16 poor
+C Intel IBR 13.5
+C Intel HWL 12
+C Intel BWL ?
+C Intel atom 53 very poor
+C VIA nano 19
+
+
+C INPUT Parameters
+define(`QP', `%rdi')
+define(`UP', `%rsi')
+define(`UN_INPUT', `%rdx')
+define(`U1', `%rcx') C Also in %rax
+define(`D', `%r8')
+define(`DINV', `%r9')
+
+C Invariants
+define(`B2', `%rbp')
+define(`B2md', `%rbx')
+
+C Variables
+define(`UN', `%r8') C Overlaps D input
+define(`T', `%r10')
+define(`U0', `%r11')
+define(`U2', `%r12')
+define(`Q0', `%r13')
+define(`Q1', `%r14')
+define(`Q2', `%r15')
+
+ABI_SUPPORT(STD64)
+
+ ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_div_qr_1n_pi1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+ dec UN_INPUT
+ jnz L(first)
+
+ C Just a single 2/1 division.
+ C T, U0 are allocated in scratch registers
+ lea 1(U1), T
+ mov U1, %rax
+ mul DINV
+ mov (UP), U0
+ add U0, %rax
+ adc T, %rdx
+ mov %rdx, T
+ imul D, %rdx
+ sub %rdx, U0
+ cmp U0, %rax
+ lea (U0, D), %rax
+ cmovnc U0, %rax
+ sbb $0, T
+ cmp D, %rax
+ jc L(single_div_done)
+ sub D, %rax
+ add $1, T
+L(single_div_done):
+ mov T, (QP)
+ FUNC_EXIT()
+ ret
+L(first):
+ C FIXME: Could delay some of these until we enter the loop.
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbx
+ push %rbp
+
+ mov D, B2
+ imul DINV, B2
+ neg B2
+ mov B2, B2md
+ sub D, B2md
+
+ C D not needed until final reduction
+ push D
+ mov UN_INPUT, UN C Clobbers D
+
+ mov DINV, %rax
+ mul U1
+ mov %rax, Q0
+ add U1, %rdx
+ mov %rdx, T
+
+ mov B2, %rax
+ mul U1
+ mov -8(UP, UN, 8), U0
+ mov (UP, UN, 8), U1
+ mov T, (QP, UN, 8)
+ add %rax, U0
+ adc %rdx, U1
+ sbb U2, U2
+ dec UN
+ mov U1, %rax
+ jz L(final)
+ mov $0, R32(Q1)
+
+ ALIGN(16)
+
+ C Loop is 28 instructions, 30 K8/K10 decoder slots, should run
+ C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1
+ C is zero, and carry holds an extra copy of U2.
+L(loop):
+ C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
+ C Remains to add in B (U1 + c)
+ cmovc DINV, Q1
+ mov U2, Q2
+ neg Q2
+ mul DINV
+ add %rdx, Q1
+ adc $0, Q2
+ add Q0, Q1
+ mov %rax, Q0
+ mov B2, %rax
+ lea (B2md, U0), T
+ adc $0, Q2
+
+ C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
+ mul U1
+ and B2, U2
+ add U2, U0
+ cmovnc U0, T
+
+ C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
+ adc U1, Q1
+ mov -8(UP, UN, 8), U0
+ adc Q2, 8(QP, UN, 8)
+ jc L(q_incr)
+L(q_incr_done):
+ add %rax, U0
+ mov T, %rax
+ adc %rdx, %rax
+ mov Q1, (QP, UN, 8)
+ mov $0, R32(Q1)
+ sbb U2, U2
+ dec UN
+ mov %rax, U1
+ jnz L(loop)
+
+L(final):
+ pop D
+
+ mov U2, Q1
+ and D, U2
+ sub U2, %rax
+ neg Q1
+
+ mov %rax, U1
+ sub D, %rax
+ cmovc U1, %rax
+ sbb $-1, Q1
+
+ lea 1(%rax), T
+ mul DINV
+ add U0, %rax
+ adc T, %rdx
+ mov %rdx, T
+ imul D, %rdx
+ sub %rdx, U0
+ cmp U0, %rax
+ lea (U0, D), %rax
+ cmovnc U0, %rax
+ sbb $0, T
+ cmp D, %rax
+ jc L(div_done)
+ sub D, %rax
+ add $1, T
+L(div_done):
+ add T, Q0
+ mov Q0, (QP)
+ adc Q1, 8(QP)
+ jnc L(done)
+L(final_q_incr):
+ addq $1, 16(QP)
+ lea 8(QP), QP
+ jc L(final_q_incr)
+
+L(done):
+ pop %rbp
+ pop %rbx
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+ FUNC_EXIT()
+ ret
+
+L(q_incr):
+ C U1 is not live, so use it for indexing
+ lea 16(QP, UN, 8), U1
+L(q_incr_loop):
+ addq $1, (U1)
+ jnc L(q_incr_done)
+ lea 8(U1), U1
+ jmp L(q_incr_loop)
+EPILOGUE()
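
The single 2/1 division at the top of mpn_div_qr_1n_pi1 is the standard division-by-invariant-integer step: multiply the high limb by the precomputed inverse, add (u1+1, u0), derive a candidate remainder, then correct by at most one step down and one step up. A C sketch of that step (illustrative only; assumes d normalized with its top bit set, u1 < d, and dinv = floor((B^2-1)/d) - B with B = 2^64):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Sketch of a 2/1 division with a precomputed inverse, as in the
   single-limb path above.  Returns the quotient limb, stores the remainder. */
static mp_limb_t
ref_div_2by1_pi1 (mp_limb_t *rem, mp_limb_t u1, mp_limb_t u0,
                  mp_limb_t d, mp_limb_t dinv)
{
  unsigned __int128 p = (unsigned __int128) dinv * u1;
  mp_limb_t q0 = (mp_limb_t) p + u0;
  mp_limb_t q1 = (mp_limb_t) (p >> 64) + u1 + 1 + (q0 < (mp_limb_t) p);
  mp_limb_t r  = u0 - q1 * d;           /* mod 2^64 */
  if (r > q0)                           /* first (common) adjustment */
    {
      q1--;
      r += d;
    }
  if (r >= d)                           /* second (rare) adjustment */
    {
      q1++;
      r -= d;
    }
  *rem = r;
  return q1;
}

The main loop amortizes this further: it keeps the partial remainder as (U2, U1, U0) and folds in the B2 invariant (roughly B^2 mod d) each iteration, deferring the full correction to the final step.
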
diff --git a/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h
new file mode 100644
index 0000000..d87cc3b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h
@@ -0,0 +1,237 @@
+/* AMD K8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#if 0
+#undef mpn_sublsh_n
+#define mpn_sublsh_n(rp,up,vp,n,c) \
+ (((rp) == (up)) ? mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c)) \
+ : MPN(mpn_sublsh_n)(rp,up,vp,n,c))
+#endif
+
+/* 2500 MHz K8 Brisbane */
+/* FFT tuning limit = 115,768,433 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 35
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 16
+
+#define DIV_1_VS_MUL_1_PERCENT 309
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 81
+#define MUL_TOOM44_THRESHOLD 232
+#define MUL_TOOM6H_THRESHOLD 324
+#define MUL_TOOM8H_THRESHOLD 478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 160
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 226
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 34
+#define SQR_TOOM3_THRESHOLD 114
+#define SQR_TOOM4_THRESHOLD 336
+#define SQR_TOOM6_THRESHOLD 430
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 36
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 19
+
+#define MUL_FFT_MODF_THRESHOLD 654 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 654, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 27, 7}, { 14, 6}, { 29, 7}, { 15, 6}, \
+ { 31, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 7}, { 44, 8}, { 23, 7}, { 47, 8}, \
+ { 25, 7}, { 51, 8}, { 31, 7}, { 63, 8}, \
+ { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
+ { 53, 9}, { 27, 8}, { 57, 9}, { 31, 8}, \
+ { 67, 9}, { 35, 8}, { 71, 9}, { 39, 8}, \
+ { 81, 9}, { 43,10}, { 23, 9}, { 55, 8}, \
+ { 111,10}, { 31, 9}, { 71,10}, { 39, 9}, \
+ { 87,10}, { 47, 9}, { 99,10}, { 55, 9}, \
+ { 111,11}, { 31,10}, { 63, 9}, { 131,10}, \
+ { 71, 9}, { 147,10}, { 87,11}, { 47,10}, \
+ { 111,11}, { 63,10}, { 143,11}, { 79,10}, \
+ { 167,11}, { 95,10}, { 199,11}, { 111,12}, \
+ { 63,11}, { 143,10}, { 287,11}, { 159,12}, \
+ { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
+ { 415,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
+ { 671,11}, { 351,12}, { 191,11}, { 415,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,11}, { 895,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 543,11}, \
+ { 1087,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 735,13}, { 383,12}, { 831,13}, \
+ { 447,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \
+ { 1279,13}, { 703,12}, { 1407,14}, { 383,13}, \
+ { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \
+ { 959,15}, { 255,14}, { 511,13}, { 1215,14}, \
+ { 639,13}, { 1471,14}, { 767,13}, { 1663,14}, \
+ { 895,13}, { 1855,15}, { 511,14}, { 1023,13}, \
+ { 2047,14}, { 1151,13}, { 2367,14}, { 1407,15}, \
+ { 767,14}, { 1791,16}, { 511,15}, { 1023,14}, \
+ { 2303,15}, { 1279,14}, { 2687,15}, { 1535,14}, \
+ { 3199,15}, { 1791,16}, { 1023,15}, { 2047,14}, \
+ { 4223,15}, { 2303,14}, { 4735,15}, { 2559,16}, \
+ { 1535,15}, { 3071,14}, { 6271,15}, { 3327,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 183
+#define MUL_FFT_THRESHOLD 11520
+
+#define SQR_FFT_MODF_THRESHOLD 540 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 540, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \
+ { 31, 7}, { 16, 6}, { 33, 7}, { 33, 8}, \
+ { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \
+ { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \
+ { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
+ { 51, 9}, { 27, 8}, { 55, 9}, { 31, 8}, \
+ { 65, 9}, { 35, 8}, { 71, 9}, { 43,10}, \
+ { 23, 9}, { 55,10}, { 31, 9}, { 71,10}, \
+ { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \
+ { 55, 9}, { 111,11}, { 31,10}, { 63, 9}, \
+ { 127,10}, { 87,11}, { 47,10}, { 111,12}, \
+ { 31,11}, { 63,10}, { 143,11}, { 79,10}, \
+ { 167,11}, { 95,10}, { 191,11}, { 111,12}, \
+ { 63,11}, { 127, 9}, { 511,11}, { 143,10}, \
+ { 287, 9}, { 575,11}, { 159,12}, { 95,11}, \
+ { 191,10}, { 383, 9}, { 767,11}, { 207,10}, \
+ { 415,13}, { 63,12}, { 127,10}, { 511, 9}, \
+ { 1023,11}, { 271,10}, { 543, 9}, { 1087,11}, \
+ { 287,10}, { 575,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,12}, { 223,11}, { 447,13}, \
+ { 127,11}, { 511,10}, { 1023,11}, { 543,10}, \
+ { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \
+ { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,11}, { 895,14}, { 127,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \
+ { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \
+ { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \
+ { 959,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \
+ { 959,14}, { 511,13}, { 1215,14}, { 639,13}, \
+ { 1471,14}, { 767,13}, { 1663,14}, { 895,13}, \
+ { 1791,15}, { 511,14}, { 1023,13}, { 2111,14}, \
+ { 1151,13}, { 2303,14}, { 1407,15}, { 767,14}, \
+ { 1791,16}, { 511,15}, { 1023,14}, { 2303,15}, \
+ { 1279,14}, { 2687,15}, { 1535,14}, { 3199,15}, \
+ { 1791,16}, { 1023,15}, { 2047,14}, { 4223,15}, \
+ { 2303,14}, { 4863,15}, { 2559,16}, { 1535,15}, \
+ { 3071,14}, { 6271,15}, { 3327,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 202
+#define SQR_FFT_THRESHOLD 7296
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 61
+#define MULLO_MUL_N_THRESHOLD 22239
+#define SQRLO_BASECASE_THRESHOLD 8
+#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD 14281
+
+#define DC_DIV_QR_THRESHOLD 47
+#define DC_DIVAPPR_Q_THRESHOLD 266
+#define DC_BDIV_QR_THRESHOLD 38
+#define DC_BDIV_Q_THRESHOLD 104
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 252
+#define INV_APPR_THRESHOLD 250
+
+#define BINV_NEWTON_THRESHOLD 258
+#define REDC_1_TO_REDC_2_THRESHOLD 35
+#define REDC_2_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 2089
+#define MU_DIVAPPR_Q_THRESHOLD 1895
+#define MUPI_DIV_QR_THRESHOLD 99
+#define MU_BDIV_QR_THRESHOLD 1787
+#define MU_BDIV_Q_THRESHOLD 1895
+
+#define POWM_SEC_TABLE 1,16,194,960,2825
+
+#define GET_STR_DC_THRESHOLD 16
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 248
+#define SET_STR_PRECOMPUTE_THRESHOLD 1747
+
+#define FAC_DSC_THRESHOLD 1240
+#define FAC_ODD_THRESHOLD 27
+
+#define MATRIX22_STRASSEN_THRESHOLD 21
+#define HGCD2_DIV1_METHOD 3 /* 4.10% faster than 5 */
+#define HGCD_THRESHOLD 141
+#define HGCD_APPR_THRESHOLD 181
+#define HGCD_REDUCE_THRESHOLD 4633
+#define GCD_DC_THRESHOLD 622
+#define GCDEXT_DC_THRESHOLD 496
+#define JACOBI_BASE_METHOD 1 /* 0.97% faster than 3 */
+
+/* Tuneup completed successfully, took 131832 seconds */
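
These constants feed GMP's algorithm selection: below a threshold the simpler method wins on this CPU, above it the next one takes over. A simplified sketch of the pattern (the real dispatch lives in mpn_mul_n/mpn_mul inside GMP; gmp-impl.h provides BELOW_THRESHOLD and the internal entry points, and scratch is assumed to be at least mpn_toom22_mul_itch(n,n) limbs):

/* Simplified, within-tree sketch of how a tuned threshold gates algorithm
   choice; not the actual GMP selection code. */
#include "gmp-impl.h"

static void
mul_n_sketch (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, mp_ptr scratch)
{
  if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
    mpn_mul_basecase (rp, ap, n, bp, n);        /* schoolbook, best below 28 limbs here */
  else if (BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))
    mpn_toom22_mul (rp, ap, n, bp, n, scratch); /* Karatsuba range */
  else
    mpn_mul_n (rp, ap, bp, n);                  /* let GMP pick among the larger algorithms */
}
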
diff --git a/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm
new file mode 100644
index 0000000..ca2efb9
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm
@@ -0,0 +1,469 @@
+dnl AMD64 mpn_mul_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and David Harvey.
+
+dnl Copyright 2008, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.375
+C AMD K10 2.375
+C Intel P4 15-16
+C Intel core2 4.45
+C Intel corei 4.35
+C Intel atom ?
+C VIA nano 4.5
+
+C The inner loops of this code are the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Use fewer registers. (how??? I can't see it -- david)
+C * Avoid some "mov $0,r" and instead use "xor r,r".
+C * Can the top of each L(addmul_outer_n) prologue be folded into the
+C mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the
+C case where vn = 1 or 2; is it worth it?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp', `%rcx')
+define(`vn', `%r8')
+
+define(`v0', `%r12')
+define(`v1', `%r9')
+
+define(`w0', `%rbx')
+define(`w1', `%r15')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+define(`n', `%r11')
+define(`outer_addr', `%r14')
+define(`un', `%r13')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ xor R32(un), R32(un)
+ mov (up), %rax
+ mov (vp), v0
+
+ sub un_param, un C rdx used by mul
+ mov un, n
+ mov R32(un_param), R32(w0)
+
+ lea (rp,un_param,8), rp
+ lea (up,un_param,8), up
+
+ mul v0
+
+ test $1, R8(vn)
+ jz L(mul_2)
+
+C ===========================================================
+C mul_1 for vp[0] if vn is odd
+
+L(mul_1):
+ and $3, R32(w0)
+ jz L(mul_1_prologue_0)
+ cmp $2, R32(w0)
+ jc L(mul_1_prologue_1)
+ jz L(mul_1_prologue_2)
+
+L(mul_1_prologue_3):
+ add $-1, n
+ lea L(addmul_outer_3)(%rip), outer_addr
+ mov %rax, w3
+ mov %rdx, w0
+ jmp L(mul_1_entry_3)
+
+L(mul_1_prologue_0):
+ mov %rax, w2
+ mov %rdx, w3 C note: already w0 == 0
+ lea L(addmul_outer_0)(%rip), outer_addr
+ jmp L(mul_1_entry_0)
+
+L(mul_1_prologue_1):
+ cmp $-1, un
+ jne 2f
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+ jmp L(ret)
+2: add $1, n
+ lea L(addmul_outer_1)(%rip), outer_addr
+ mov %rax, w1
+ mov %rdx, w2
+ xor R32(w3), R32(w3)
+ mov (up,n,8), %rax
+ jmp L(mul_1_entry_1)
+
+L(mul_1_prologue_2):
+ add $-2, n
+ lea L(addmul_outer_2)(%rip), outer_addr
+ mov %rax, w0
+ mov %rdx, w1
+ mov 24(up,n,8), %rax
+ xor R32(w2), R32(w2)
+ xor R32(w3), R32(w3)
+ jmp L(mul_1_entry_2)
+
+
+ C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments
+
+ ALIGN(16)
+L(mul_1_top):
+ mov w0, -16(rp,n,8)
+ add %rax, w1
+ mov (up,n,8), %rax
+ adc %rdx, w2
+L(mul_1_entry_1):
+ xor R32(w0), R32(w0)
+ mul v0
+ mov w1, -8(rp,n,8)
+ add %rax, w2
+ adc %rdx, w3
+L(mul_1_entry_0):
+ mov 8(up,n,8), %rax
+ mul v0
+ mov w2, (rp,n,8)
+ add %rax, w3
+ adc %rdx, w0
+L(mul_1_entry_3):
+ mov 16(up,n,8), %rax
+ mul v0
+ mov w3, 8(rp,n,8)
+ xor R32(w2), R32(w2) C zero
+ mov w2, w3 C zero
+ add %rax, w0
+ mov 24(up,n,8), %rax
+ mov w2, w1 C zero
+ adc %rdx, w1
+L(mul_1_entry_2):
+ mul v0
+ add $4, n
+ js L(mul_1_top)
+
+ mov w0, -16(rp)
+ add %rax, w1
+ mov w1, -8(rp)
+ adc %rdx, w2
+ mov w2, (rp)
+
+ add $-1, vn C vn -= 1
+ jz L(ret)
+
+ mov 8(vp), v0
+ mov 16(vp), v1
+
+ lea 8(vp), vp C vp += 1
+ lea 8(rp), rp C rp += 1
+
+ jmp *outer_addr
+
+C ===========================================================
+C mul_2 for vp[0], vp[1] if vn is even
+
+ ALIGN(16)
+L(mul_2):
+ mov 8(vp), v1
+
+ and $3, R32(w0)
+ jz L(mul_2_prologue_0)
+ cmp $2, R32(w0)
+ jz L(mul_2_prologue_2)
+ jc L(mul_2_prologue_1)
+
+L(mul_2_prologue_3):
+ lea L(addmul_outer_3)(%rip), outer_addr
+ add $2, n
+ mov %rax, -16(rp,n,8)
+ mov %rdx, w2
+ xor R32(w3), R32(w3)
+ xor R32(w0), R32(w0)
+ mov -16(up,n,8), %rax
+ jmp L(mul_2_entry_3)
+
+ ALIGN(16)
+L(mul_2_prologue_0):
+ add $3, n
+ mov %rax, w0
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ mov -24(up,n,8), %rax
+ lea L(addmul_outer_0)(%rip), outer_addr
+ jmp L(mul_2_entry_0)
+
+ ALIGN(16)
+L(mul_2_prologue_1):
+ mov %rax, w3
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ lea L(addmul_outer_1)(%rip), outer_addr
+ jmp L(mul_2_entry_1)
+
+ ALIGN(16)
+L(mul_2_prologue_2):
+ add $1, n
+ lea L(addmul_outer_2)(%rip), outer_addr
+ mov $0, R32(w0)
+ mov $0, R32(w1)
+ mov %rax, w2
+ mov -8(up,n,8), %rax
+ mov %rdx, w3
+ jmp L(mul_2_entry_2)
+
+ C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments
+
+ ALIGN(16)
+L(mul_2_top):
+ mov -32(up,n,8), %rax
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov -24(up,n,8), %rax
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,n,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+L(mul_2_entry_0):
+ mul v1
+ add %rax, w1
+ mov w0, -24(rp,n,8)
+ adc %rdx, w2
+ mov -16(up,n,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ mov -16(up,n,8), %rax
+ adc $0, R32(w3)
+ mov $0, R32(w0)
+ mov w1, -16(rp,n,8)
+L(mul_2_entry_3):
+ mul v1
+ add %rax, w2
+ mov -8(up,n,8), %rax
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mul v0
+ add %rax, w2
+ mov -8(up,n,8), %rax
+ adc %rdx, w3
+ adc R32(w1), R32(w0) C adc $0, w0
+L(mul_2_entry_2):
+ mul v1
+ add %rax, w3
+ mov w2, -8(rp,n,8)
+ adc %rdx, w0
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(mul_2_entry_1):
+ add $4, n
+ mov w3, -32(rp,n,8)
+ js L(mul_2_top)
+
+ mov -32(up,n,8), %rax C FIXME: n is constant
+ mul v1
+ add %rax, w0
+ mov w0, (rp)
+ adc %rdx, w1
+ mov w1, 8(rp)
+
+ add $-2, vn C vn -= 2
+ jz L(ret)
+
+ mov 16(vp), v0
+ mov 24(vp), v1
+
+ lea 16(vp), vp C vp += 2
+ lea 16(rp), rp C rp += 2
+
+ jmp *outer_addr
+
+
+C ===========================================================
+C addmul_2 for remaining vp's
+
+ C in the following prologues, we reuse un to store the
+ C adjusted value of n that is reloaded on each iteration
+
+L(addmul_outer_0):
+ add $3, un
+ lea 0(%rip), outer_addr
+
+ mov un, n
+ mov -24(up,un,8), %rax
+ mul v0
+ mov %rax, w0
+ mov -24(up,un,8), %rax
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ jmp L(addmul_entry_0)
+
+L(addmul_outer_1):
+ mov un, n
+ mov (up,un,8), %rax
+ mul v0
+ mov %rax, w3
+ mov (up,un,8), %rax
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ jmp L(addmul_entry_1)
+
+L(addmul_outer_2):
+ add $1, un
+ lea 0(%rip), outer_addr
+
+ mov un, n
+ mov -8(up,un,8), %rax
+ mul v0
+ xor R32(w0), R32(w0)
+ mov %rax, w2
+ xor R32(w1), R32(w1)
+ mov %rdx, w3
+ mov -8(up,un,8), %rax
+ jmp L(addmul_entry_2)
+
+L(addmul_outer_3):
+ add $2, un
+ lea 0(%rip), outer_addr
+
+ mov un, n
+ mov -16(up,un,8), %rax
+ xor R32(w3), R32(w3)
+ mul v0
+ mov %rax, w1
+ mov -16(up,un,8), %rax
+ mov %rdx, w2
+ jmp L(addmul_entry_3)
+
+ C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments
+
+ ALIGN(16)
+L(addmul_top):
+ add w3, -32(rp,n,8)
+ adc %rax, w0
+ mov -24(up,n,8), %rax
+ adc %rdx, w1
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,n,8), %rax
+ adc %rdx, w1
+ adc R32(w2), R32(w2) C adc $0, w2
+L(addmul_entry_0):
+ mul v1
+ xor R32(w3), R32(w3)
+ add w0, -24(rp,n,8)
+ adc %rax, w1
+ mov -16(up,n,8), %rax
+ adc %rdx, w2
+ mul v0
+ add %rax, w1
+ mov -16(up,n,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+L(addmul_entry_3):
+ mul v1
+ add w1, -16(rp,n,8)
+ adc %rax, w2
+ mov -8(up,n,8), %rax
+ adc %rdx, w3
+ mul v0
+ xor R32(w0), R32(w0)
+ add %rax, w2
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mov -8(up,n,8), %rax
+ adc R32(w1), R32(w0) C adc $0, w0
+L(addmul_entry_2):
+ mul v1
+ add w2, -8(rp,n,8)
+ adc %rax, w3
+ adc %rdx, w0
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, w3
+ mov (up,n,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(addmul_entry_1):
+ mul v1
+ add $4, n
+ js L(addmul_top)
+
+ add w3, -8(rp)
+ adc %rax, w0
+ mov w0, (rp)
+ adc %rdx, w1
+ mov w1, 8(rp)
+
+ add $-2, vn C vn -= 2
+ jz L(ret)
+
+ lea 16(rp), rp C rp += 2
+ lea 16(vp), vp C vp += 2
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ jmp *outer_addr
+
+ ALIGN(16)
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+EPILOGUE()
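
The arithmetic performed by mpn_mul_basecase is plain schoolbook multiplication, {rp, un+vn} = {up, un} * {vp, vn}; the assembly earns its 2.375 c/l by handling two vp limbs per outer pass (the mul_2/addmul_2 paths) and unrolling the inner loops four ways. A reference model in C (illustrative only, local typedef, 64-bit limbs; it zeroes rp first, which the real routine avoids by making the first pass a store):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Reference model: {rp, un+vn} = {up, un} * {vp, vn}, un >= vn >= 1. */
static void
ref_mul_basecase (mp_limb_t *rp, const mp_limb_t *up, long un,
                  const mp_limb_t *vp, long vn)
{
  for (long i = 0; i < un + vn; i++)
    rp[i] = 0;
  for (long j = 0; j < vn; j++)
    {
      unsigned __int128 carry = 0;
      for (long i = 0; i < un; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * vp[j]
                              + rp[i + j] + carry;
          rp[i + j] = (mp_limb_t) t;
          carry = t >> 64;
        }
      rp[un + j] = (mp_limb_t) carry;
    }
}
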
diff --git a/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm
new file mode 100644
index 0000000..fa00f42
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm
@@ -0,0 +1,436 @@
+dnl AMD64 mpn_mullo_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C NOTES
+C * There is a major stupidity in that we call mpn_mul_1 initially, for a
+C large trip count. Instead, we should start with mul_2 for any operand
+C size congruence class.
+C * Stop iterating addmul_2 earlier, falling into straight-line triangle code
+C for the last 2-3 iterations.
+C * Perhaps implement n=4 special code.
+C * The reload of the outer loop jump address hurts branch prediction.
+C * The addmul_2 loop ends with an MUL whose high part is not used upon loop
+C exit.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`vp', `%r11')
+define(`outer_addr', `%r8')
+define(`j', `%r9')
+define(`v0', `%r13')
+define(`v1', `%r14')
+define(`w0', `%rbx')
+define(`w1', `%r15')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+ cmp $4, n
+ jge L(gen)
+ mov (up), %rax C u0
+ mov (vp_param), %r8 C v0
+
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',
+` movslq (%r9,%rcx,4), %r10
+ add %r10, %r9
+ jmp *%r9
+',`
+ jmp *(%r9,n,8)
+')
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(tab), L(tab)) C not allowed
+ JMPENT( L(1), L(tab)) C 1
+ JMPENT( L(2), L(tab)) C 2
+ JMPENT( L(3), L(tab)) C 3
+dnl JMPENT( L(0m4), L(tab)) C 4
+dnl JMPENT( L(1m4), L(tab)) C 5
+dnl JMPENT( L(2m4), L(tab)) C 6
+dnl JMPENT( L(3m4), L(tab)) C 7
+dnl JMPENT( L(0m4), L(tab)) C 8
+dnl JMPENT( L(1m4), L(tab)) C 9
+dnl JMPENT( L(2m4), L(tab)) C 10
+dnl JMPENT( L(3m4), L(tab)) C 11
+ TEXT
+
+L(1): imul %r8, %rax
+ mov %rax, (rp)
+ FUNC_EXIT()
+ ret
+
+L(2): mov 8(vp_param), %r11
+ imul %rax, %r11 C u0 x v1
+ mul %r8 C u0 x v0
+ mov %rax, (rp)
+ imul 8(up), %r8 C u1 x v0
+ lea (%r11, %rdx), %rax
+ add %r8, %rax
+ mov %rax, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(3): mov 8(vp_param), %r9 C v1
+ mov 16(vp_param), %r11
+ mul %r8 C u0 x v0 -> <r1,r0>
+ mov %rax, (rp) C r0
+ mov (up), %rax C u0
+ mov %rdx, %rcx C r1
+ mul %r9 C u0 x v1 -> <r2,r1>
+ imul 8(up), %r9 C u1 x v1 -> r2
+ mov 16(up), %r10
+ imul %r8, %r10 C u2 x v0 -> r2
+ add %rax, %rcx
+ adc %rdx, %r9
+ add %r10, %r9
+ mov 8(up), %rax C u1
+ mul %r8 C u1 x v0 -> <r2,r1>
+ add %rax, %rcx
+ adc %rdx, %r9
+ mov %r11, %rax
+ imul (up), %rax C u0 x v2 -> r2
+ add %rax, %r9
+ mov %rcx, 8(rp)
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+
+L(0m4):
+L(1m4):
+L(2m4):
+L(3m4):
+L(gen): push %rbx
+ push %rbp
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), %rax
+ mov (vp_param), v0
+ mov vp_param, vp
+
+ lea (rp,n,8), rp
+ lea (up,n,8), up
+ neg n
+
+ mul v0
+
+ test $1, R8(n)
+ jz L(mul_2)
+
+L(mul_1):
+ lea -8(rp), rp
+ lea -8(up), up
+ test $2, R8(n)
+ jnz L(mul_1_prologue_3)
+
+L(mul_1_prologue_2): C n = 7, 11, 15, ...
+ lea -1(n), j
+ lea L(addmul_outer_1)(%rip), outer_addr
+ mov %rax, w0
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ xor R32(w3), R32(w3)
+ mov 16(up,n,8), %rax
+ jmp L(mul_1_entry_2)
+
+L(mul_1_prologue_3): C n = 5, 9, 13, ...
+ lea 1(n), j
+ lea L(addmul_outer_3)(%rip), outer_addr
+ mov %rax, w2
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ jmp L(mul_1_entry_0)
+
+ ALIGN(16)
+L(mul_1_top):
+ mov w0, -16(rp,j,8)
+ add %rax, w1
+ mov (up,j,8), %rax
+ adc %rdx, w2
+ xor R32(w0), R32(w0)
+ mul v0
+ mov w1, -8(rp,j,8)
+ add %rax, w2
+ adc %rdx, w3
+L(mul_1_entry_0):
+ mov 8(up,j,8), %rax
+ mul v0
+ mov w2, (rp,j,8)
+ add %rax, w3
+ adc %rdx, w0
+ mov 16(up,j,8), %rax
+ mul v0
+ mov w3, 8(rp,j,8)
+ xor R32(w2), R32(w2) C zero
+ mov w2, w3 C zero
+ add %rax, w0
+ mov 24(up,j,8), %rax
+ mov w2, w1 C zero
+ adc %rdx, w1
+L(mul_1_entry_2):
+ mul v0
+ add $4, j
+ js L(mul_1_top)
+
+ mov w0, -16(rp)
+ add %rax, w1
+ mov w1, -8(rp)
+ adc %rdx, w2
+
+ imul (up), v0
+ add v0, w2
+ mov w2, (rp)
+
+ add $1, n
+ jz L(ret)
+
+ mov 8(vp), v0
+ mov 16(vp), v1
+
+ lea 16(up), up
+ lea 8(vp), vp
+ lea 24(rp), rp
+
+ jmp *outer_addr
+
+
+L(mul_2):
+ mov 8(vp), v1
+ test $2, R8(n)
+ jz L(mul_2_prologue_3)
+
+ ALIGN(16)
+L(mul_2_prologue_1):
+ lea 0(n), j
+ mov %rax, w3
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ mov (up,n,8), %rax
+ lea L(addmul_outer_3)(%rip), outer_addr
+ jmp L(mul_2_entry_1)
+
+ ALIGN(16)
+L(mul_2_prologue_3):
+ lea 2(n), j
+ mov $0, R32(w3)
+ mov %rax, w1
+ mov (up,n,8), %rax
+ mov %rdx, w2
+ lea L(addmul_outer_1)(%rip), outer_addr
+ jmp L(mul_2_entry_3)
+
+ ALIGN(16)
+L(mul_2_top):
+ mov -32(up,j,8), %rax
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov -24(up,j,8), %rax
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ mov w0, -24(rp,j,8)
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ adc $0, R32(w3)
+L(mul_2_entry_3):
+ mov $0, R32(w0)
+ mov w1, -16(rp,j,8)
+ mul v1
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mul v0
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ adc R32(w1), R32(w0)
+ mul v1
+ add %rax, w3
+ mov w2, -8(rp,j,8)
+ adc %rdx, w0
+ mov (up,j,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(mul_2_entry_1):
+ add $4, j
+ mov w3, -32(rp,j,8)
+ js L(mul_2_top)
+
+ imul -16(up), v1
+ add v1, w0
+ imul -8(up), v0
+ add v0, w0
+ mov w0, -8(rp)
+
+ add $2, n
+ jz L(ret)
+
+ mov 16(vp), v0
+ mov 24(vp), v1
+
+ lea 16(vp), vp
+ lea 16(rp), rp
+
+ jmp *outer_addr
+
+
+L(addmul_outer_1):
+ lea -2(n), j
+ mov -16(up,n,8), %rax
+ mul v0
+ mov %rax, w3
+ mov -16(up,n,8), %rax
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ lea L(addmul_outer_3)(%rip), outer_addr
+ jmp L(addmul_entry_1)
+
+L(addmul_outer_3):
+ lea 0(n), j
+ mov -16(up,n,8), %rax
+ xor R32(w3), R32(w3)
+ mul v0
+ mov %rax, w1
+ mov -16(up,n,8), %rax
+ mov %rdx, w2
+ lea L(addmul_outer_1)(%rip), outer_addr
+ jmp L(addmul_entry_3)
+
+ ALIGN(16)
+L(addmul_top):
+ add w3, -32(rp,j,8)
+ adc %rax, w0
+ mov -24(up,j,8), %rax
+ adc %rdx, w1
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,j,8), %rax
+ adc %rdx, w1
+ adc R32(w2), R32(w2)
+ mul v1
+ xor R32(w3), R32(w3)
+ add w0, -24(rp,j,8)
+ adc %rax, w1
+ mov -16(up,j,8), %rax
+ adc %rdx, w2
+ mul v0
+ add %rax, w1
+ mov -16(up,j,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+L(addmul_entry_3):
+ mul v1
+ add w1, -16(rp,j,8)
+ adc %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ mul v0
+ xor R32(w0), R32(w0)
+ add %rax, w2
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mov -8(up,j,8), %rax
+ adc R32(w1), R32(w0)
+ mul v1
+ add w2, -8(rp,j,8)
+ adc %rax, w3
+ adc %rdx, w0
+ mov (up,j,8), %rax
+ mul v0
+ add %rax, w3
+ mov (up,j,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(addmul_entry_1):
+ mul v1
+ add $4, j
+ js L(addmul_top)
+
+ add w3, -32(rp)
+ adc %rax, w0
+
+ imul -24(up), v0
+ add v0, w0
+ add w0, -24(rp)
+
+ add $2, n
+ jns L(ret)
+
+ lea 16(vp), vp
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ lea -16(up), up
+
+ jmp *outer_addr
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
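
mpn_mullo_basecase produces only the low n limbs of {up,n} * {vp,n}, so every contribution that would land at or above limb n can be dropped; that is why each outer pass above can finish with plain imul instructions instead of widening multiplies. A C model of the truncated product (sketch, local typedef; not the GMP code):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Reference model: rp[0..n-1] = low n limbs of {up,n} * {vp,n}. */
static void
ref_mullo_basecase (mp_limb_t *rp, const mp_limb_t *up,
                    const mp_limb_t *vp, long n)
{
  for (long i = 0; i < n; i++)
    rp[i] = 0;
  for (long j = 0; j < n; j++)
    {
      unsigned __int128 carry = 0;
      for (long i = 0; i + j < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * vp[j]
                              + rp[i + j] + carry;
          rp[i + j] = (mp_limb_t) t;
          carry = t >> 64;
        }
      /* carry out of limb n-1 is discarded: it only affects limbs >= n */
    }
}
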
diff --git a/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm
new file mode 100644
index 0000000..86f1414
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm
@@ -0,0 +1,559 @@
+dnl AMD64 mpn_mulmid_basecase
+
+dnl Contributed by David Harvey.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C K8,K9: 2.375 (2.5 when un - vn is "small")
+C K10: ?
+C P4: ?
+C P6-15: ?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp_param',`%rcx')
+define(`vn', `%r8')
+
+define(`v0', `%r12')
+define(`v1', `%r9')
+
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+define(`n', `%r11')
+define(`outer_addr', `%r14')
+define(`un', `%r13')
+define(`vp', `%r15')
+
+define(`vp_inner', `%r10')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mulmid_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov vp_param, vp
+
+ C use un for row length (= un_param - vn + 1)
+ lea 1(un_param), un
+ sub vn, un
+
+ lea (rp,un,8), rp
+
+ cmp $4, un C TODO: needs tuning
+ jc L(diagonal)
+
+ lea (up,un_param,8), up
+
+ test $1, vn
+ jz L(mul_2)
+
+C ===========================================================
+C mul_1 for vp[0] if vn is odd
+
+L(mul_1):
+ mov R32(un), R32(w0)
+
+ neg un
+ mov (up,un,8), %rax
+ mov (vp), v0
+ mul v0
+
+ and $-4, un C round down to multiple of 4
+ mov un, n
+
+ and $3, R32(w0)
+ jz L(mul_1_prologue_0)
+ cmp $2, R32(w0)
+ jc L(mul_1_prologue_1)
+ jz L(mul_1_prologue_2)
+
+L(mul_1_prologue_3):
+ mov %rax, w3
+ mov %rdx, w0
+ lea L(addmul_prologue_3)(%rip), outer_addr
+ jmp L(mul_1_entry_3)
+
+ ALIGN(16)
+L(mul_1_prologue_0):
+ mov %rax, w2
+ mov %rdx, w3 C note already w0 == 0
+ lea L(addmul_prologue_0)(%rip), outer_addr
+ jmp L(mul_1_entry_0)
+
+ ALIGN(16)
+L(mul_1_prologue_1):
+ add $4, n
+ mov %rax, w1
+ mov %rdx, w2
+ mov $0, R32(w3)
+ mov (up,n,8), %rax
+ lea L(addmul_prologue_1)(%rip), outer_addr
+ jmp L(mul_1_entry_1)
+
+ ALIGN(16)
+L(mul_1_prologue_2):
+ mov %rax, w0
+ mov %rdx, w1
+ mov 24(up,n,8), %rax
+ mov $0, R32(w2)
+ mov $0, R32(w3)
+ lea L(addmul_prologue_2)(%rip), outer_addr
+ jmp L(mul_1_entry_2)
+
+
+ C this loop is 10 c/loop = 2.5 c/l on K8
+
+ ALIGN(16)
+L(mul_1_top):
+ mov w0, -16(rp,n,8)
+ add %rax, w1
+ mov (up,n,8), %rax
+ adc %rdx, w2
+L(mul_1_entry_1):
+ mov $0, R32(w0)
+ mul v0
+ mov w1, -8(rp,n,8)
+ add %rax, w2
+ adc %rdx, w3
+L(mul_1_entry_0):
+ mov 8(up,n,8), %rax
+ mul v0
+ mov w2, (rp,n,8)
+ add %rax, w3
+ adc %rdx, w0
+L(mul_1_entry_3):
+ mov 16(up,n,8), %rax
+ mul v0
+ mov w3, 8(rp,n,8)
+ mov $0, R32(w2) C zero
+ mov w2, w3 C zero
+ add %rax, w0
+ mov 24(up,n,8), %rax
+ mov w2, w1 C zero
+ adc %rdx, w1
+L(mul_1_entry_2):
+ mul v0
+ add $4, n
+ js L(mul_1_top)
+
+ mov w0, -16(rp)
+ add %rax, w1
+ mov w1, -8(rp)
+ mov w2, 8(rp) C zero last limb of output
+ adc %rdx, w2
+ mov w2, (rp)
+
+ dec vn
+ jz L(ret)
+
+ lea -8(up), up
+ lea 8(vp), vp
+
+ mov un, n
+ mov (vp), v0
+ mov 8(vp), v1
+
+ jmp *outer_addr
+
+C ===========================================================
+C mul_2 for vp[0], vp[1] if vn is even
+
+ ALIGN(16)
+L(mul_2):
+ mov R32(un), R32(w0)
+
+ neg un
+ mov -8(up,un,8), %rax
+ mov (vp), v0
+ mov 8(vp), v1
+ mul v1
+
+ and $-4, un C round down to multiple of 4
+ mov un, n
+
+ and $3, R32(w0)
+ jz L(mul_2_prologue_0)
+ cmp $2, R32(w0)
+ jc L(mul_2_prologue_1)
+ jz L(mul_2_prologue_2)
+
+L(mul_2_prologue_3):
+ mov %rax, w1
+ mov %rdx, w2
+ lea L(addmul_prologue_3)(%rip), outer_addr
+ jmp L(mul_2_entry_3)
+
+ ALIGN(16)
+L(mul_2_prologue_0):
+ mov %rax, w0
+ mov %rdx, w1
+ lea L(addmul_prologue_0)(%rip), outer_addr
+ jmp L(mul_2_entry_0)
+
+ ALIGN(16)
+L(mul_2_prologue_1):
+ mov %rax, w3
+ mov %rdx, w0
+ mov $0, R32(w1)
+ lea L(addmul_prologue_1)(%rip), outer_addr
+ jmp L(mul_2_entry_1)
+
+ ALIGN(16)
+L(mul_2_prologue_2):
+ mov %rax, w2
+ mov %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,n,8), %rax
+ lea L(addmul_prologue_2)(%rip), outer_addr
+ jmp L(mul_2_entry_2)
+
+
+ C this loop is 18 c/loop = 2.25 c/l on K8
+
+ ALIGN(16)
+L(mul_2_top):
+ mov -8(up,n,8), %rax
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+L(mul_2_entry_0):
+ mov $0, R32(w2)
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, w0
+ mov (up,n,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ mov w0, (rp,n,8)
+ adc %rdx, w2
+L(mul_2_entry_3):
+ mov 8(up,n,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ mov $0, R32(w0)
+ adc $0, R32(w3)
+ mov 8(up,n,8), %rax
+ mov w1, 8(rp,n,8)
+ mul v1
+ add %rax, w2
+ mov 16(up,n,8), %rax
+ adc %rdx, w3
+L(mul_2_entry_2):
+ mov $0, R32(w1)
+ mul v0
+ add %rax, w2
+ mov 16(up,n,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ add %rax, w3
+ mov w2, 16(rp,n,8)
+ adc %rdx, w0
+L(mul_2_entry_1):
+ mov 24(up,n,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+ add $4, n
+ mov w3, -8(rp,n,8)
+ jnz L(mul_2_top)
+
+ mov w0, (rp)
+ mov w1, 8(rp)
+
+ sub $2, vn
+ jz L(ret)
+
+ lea 16(vp), vp
+ lea -16(up), up
+
+ mov un, n
+ mov (vp), v0
+ mov 8(vp), v1
+
+ jmp *outer_addr
+
+C ===========================================================
+C addmul_2 for remaining vp's
+
+ ALIGN(16)
+L(addmul_prologue_0):
+ mov -8(up,n,8), %rax
+ mul v1
+ mov %rax, w1
+ mov %rdx, w2
+ mov $0, R32(w3)
+ jmp L(addmul_entry_0)
+
+ ALIGN(16)
+L(addmul_prologue_1):
+ mov 16(up,n,8), %rax
+ mul v1
+ mov %rax, w0
+ mov %rdx, w1
+ mov $0, R32(w2)
+ mov 24(up,n,8), %rax
+ jmp L(addmul_entry_1)
+
+ ALIGN(16)
+L(addmul_prologue_2):
+ mov 8(up,n,8), %rax
+ mul v1
+ mov %rax, w3
+ mov %rdx, w0
+ mov $0, R32(w1)
+ jmp L(addmul_entry_2)
+
+ ALIGN(16)
+L(addmul_prologue_3):
+ mov (up,n,8), %rax
+ mul v1
+ mov %rax, w2
+ mov %rdx, w3
+ mov $0, R32(w0)
+ mov $0, R32(w1)
+ jmp L(addmul_entry_3)
+
+ C this loop is 19 c/loop = 2.375 c/l on K8
+
+ ALIGN(16)
+L(addmul_top):
+ mov $0, R32(w3)
+ add %rax, w0
+ mov -8(up,n,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1
+ add w0, -8(rp,n,8)
+ adc %rax, w1
+ adc %rdx, w2
+L(addmul_entry_0):
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, w1
+ mov (up,n,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mul v1
+ add w1, (rp,n,8)
+ mov $0, R32(w1)
+ adc %rax, w2
+ mov $0, R32(w0)
+ adc %rdx, w3
+L(addmul_entry_3):
+ mov 8(up,n,8), %rax
+ mul v0
+ add %rax, w2
+ mov 8(up,n,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ add w2, 8(rp,n,8)
+ adc %rax, w3
+ adc %rdx, w0
+L(addmul_entry_2):
+ mov 16(up,n,8), %rax
+ mul v0
+ add %rax, w3
+ mov 16(up,n,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add w3, 16(rp,n,8)
+ nop C don't ask...
+ adc %rax, w0
+ mov $0, R32(w2)
+ mov 24(up,n,8), %rax
+ adc %rdx, w1
+L(addmul_entry_1):
+ mul v0
+ add $4, n
+ jnz L(addmul_top)
+
+ add %rax, w0
+ adc %rdx, w1
+ adc $0, R32(w2)
+
+ add w0, -8(rp)
+ adc w1, (rp)
+ adc w2, 8(rp)
+
+ sub $2, vn
+ jz L(ret)
+
+ lea 16(vp), vp
+ lea -16(up), up
+
+ mov un, n
+ mov (vp), v0
+ mov 8(vp), v1
+
+ jmp *outer_addr
+
+C ===========================================================
+C accumulate along diagonals if un - vn is small
+
+ ALIGN(16)
+L(diagonal):
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ xor R32(w2), R32(w2)
+
+ neg un
+
+ mov R32(vn), %eax
+ and $3, %eax
+ jz L(diag_prologue_0)
+ cmp $2, %eax
+ jc L(diag_prologue_1)
+ jz L(diag_prologue_2)
+
+L(diag_prologue_3):
+ lea -8(vp), vp
+ mov vp, vp_inner
+ add $1, vn
+ mov vn, n
+ lea L(diag_entry_3)(%rip), outer_addr
+ jmp L(diag_entry_3)
+
+L(diag_prologue_0):
+ mov vp, vp_inner
+ mov vn, n
+ lea 0(%rip), outer_addr
+ mov -8(up,n,8), %rax
+ jmp L(diag_entry_0)
+
+L(diag_prologue_1):
+ lea 8(vp), vp
+ mov vp, vp_inner
+ add $3, vn
+ mov vn, n
+ lea 0(%rip), outer_addr
+ mov -8(vp_inner), %rax
+ jmp L(diag_entry_1)
+
+L(diag_prologue_2):
+ lea -16(vp), vp
+ mov vp, vp_inner
+ add $2, vn
+ mov vn, n
+ lea 0(%rip), outer_addr
+ mov 16(vp_inner), %rax
+ jmp L(diag_entry_2)
+
+
+ C this loop is 10 c/loop = 2.5 c/l on K8
+
+ ALIGN(16)
+L(diag_top):
+ add %rax, w0
+ adc %rdx, w1
+ mov -8(up,n,8), %rax
+ adc $0, w2
+L(diag_entry_0):
+ mulq (vp_inner)
+ add %rax, w0
+ adc %rdx, w1
+ adc $0, w2
+L(diag_entry_3):
+ mov -16(up,n,8), %rax
+ mulq 8(vp_inner)
+ add %rax, w0
+ mov 16(vp_inner), %rax
+ adc %rdx, w1
+ adc $0, w2
+L(diag_entry_2):
+ mulq -24(up,n,8)
+ add %rax, w0
+ mov 24(vp_inner), %rax
+ adc %rdx, w1
+ lea 32(vp_inner), vp_inner
+ adc $0, w2
+L(diag_entry_1):
+ mulq -32(up,n,8)
+ sub $4, n
+ jnz L(diag_top)
+
+ add %rax, w0
+ adc %rdx, w1
+ adc $0, w2
+
+ mov w0, (rp,un,8)
+
+ inc un
+ jz L(diag_end)
+
+ mov vn, n
+ mov vp, vp_inner
+
+ lea 8(up), up
+ mov w1, w0
+ mov w2, w1
+ xor R32(w2), R32(w2)
+
+ jmp *outer_addr
+
+L(diag_end):
+ mov w1, (rp)
+ mov w2, 8(rp)
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
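
mpn_mulmid_basecase computes the middle product: only the partial products up[i]*vp[j] with vn-1 <= i+j <= un-1 are kept, shifted down by B^(vn-1), giving un-vn+3 result limbs (the row length un-vn+1 set up above, plus two limbs for the carries out of the top diagonal). A slow but straightforward C model (sketch, local typedef; not the GMP code):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Reference model of the middle product, un >= vn >= 1, rp has un-vn+3 limbs. */
static void
ref_mulmid_basecase (mp_limb_t *rp, const mp_limb_t *up, long un,
                     const mp_limb_t *vp, long vn)
{
  long rn = un - vn + 3;
  for (long k = 0; k < rn; k++)
    rp[k] = 0;
  for (long i = 0; i < un; i++)
    for (long j = 0; j < vn; j++)
      {
        if (i + j < vn - 1 || i + j > un - 1)
          continue;                          /* outside the middle band */
        unsigned __int128 t = (unsigned __int128) up[i] * vp[j];
        long k = i + j - (vn - 1);           /* destination position in rp */
        unsigned __int128 s = (unsigned __int128) rp[k] + (mp_limb_t) t;
        rp[k] = (mp_limb_t) s;
        mp_limb_t c = (mp_limb_t) (s >> 64) + (mp_limb_t) (t >> 64);
        for (long p = k + 1; c != 0 && p < rn; p++)
          {
            mp_limb_t r = rp[p] + c;         /* propagate the carry upward */
            c = r < c;
            rp[p] = r;
          }
      }
}
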
diff --git a/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm b/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm
new file mode 100644
index 0000000..9327b21
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm
@@ -0,0 +1,591 @@
+dnl X86-64 mpn_redc_1 optimised for AMD K8-K10.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2004, 2008, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * This looks different from other current redc_1.asm variants. Consider
+C adapting this to the mainstream style.
+C  * Is this code really faster than other approaches which compute q0 later?
+C    Is the use of a jump table faster?  Or is the edge of this due to the
+C inlined add_n code?
+C * Put initial m[0] x q0 computation in header.
+C * Put basecases at the file's end, single them out before the pushes.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r11')
+define(`nneg', `%r12')
+define(`mp', `%r13')
+define(`q0', `%rbp')
+define(`vp', `%rdx')
+
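+C An illustrative C sketch of the word-by-word Montgomery reduction (REDC)
+C this routine performs.  It is a sketch only, not the code below, which
+C inlines the q0 computation and the final addition into unrolled loops.
+C Assumptions: 64-bit limbs, unsigned __int128, up[] holding 2n limbs, mp[]
+C the odd modulus, u0inv = -1/mp[0] mod 2^64; the helper name and signature
+C are made up for the sketch.  The caller subtracts mp once more when the
+C returned carry (or a final comparison) requires it.
+C
+C     uint64_t redc_1_sketch (uint64_t *rp, uint64_t *up,
+C                             const uint64_t *mp, size_t n, uint64_t u0inv)
+C     {
+C       for (size_t i = 0; i < n; i++)
+C         {
+C           uint64_t q = up[i] * u0inv;       /* chosen so that limb i cancels */
+C           unsigned __int128 cy = 0;
+C           for (size_t j = 0; j < n; j++)
+C             {
+C               cy += (unsigned __int128) q * mp[j] + up[i + j];
+C               up[i + j] = (uint64_t) cy;
+C               cy >>= 64;
+C             }
+C           up[i] = (uint64_t) cy;            /* park the carry in the freed slot */
+C         }
+C       unsigned __int128 cy = 0;
+C       for (size_t i = 0; i < n; i++)        /* add parked carries to high half */
+C         {
+C           cy += (unsigned __int128) up[n + i] + up[i];
+C           rp[i] = (uint64_t) cy;
+C           cy >>= 64;
+C         }
+C       return (uint64_t) cy;
+C     }
+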
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbp
+ mov (up), q0 C up[0]
+ push %rbx
+ imul u0inv, q0 C first q0, for all execution paths
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov n, nneg
+ neg nneg
+ lea (mp_param,n,8), mp C mp += n
+ lea -16(up,n,8), up C up += n
+
+ mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ lea 4(%rax), %r9
+ cmp $4, R32(n)
+ cmovg %r9, %rax
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',`
+ movslq (%r9,%rax,4), %rax
+ add %r9, %rax
+ jmp *%rax
+',`
+ jmp *(%r9,%rax,8)
+')
+
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(0), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
+ JMPENT( L(0m4), L(tab))
+ JMPENT( L(1m4), L(tab))
+ JMPENT( L(2m4), L(tab))
+ JMPENT( L(3m4), L(tab))
+ TEXT
+
+ ALIGN(16)
+L(1): mov (mp_param), %rax
+ mul q0
+ add 8(up), %rax
+ adc 16(up), %rdx
+ mov %rdx, (rp)
+ mov $0, R32(%rax)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+
+ ALIGN(16)
+L(2): mov (mp_param), %rax
+ mul q0
+ xor R32(%r14), R32(%r14)
+ mov %rax, %r10
+ mov -8(mp), %rax
+ mov %rdx, %r9
+ mul q0
+ add (up), %r10
+ adc %rax, %r9
+ adc %rdx, %r14
+ add 8(up), %r9
+ adc $0, %r14
+ mov %r9, q0
+ imul u0inv, q0
+ mov -16(mp), %rax
+ mul q0
+ xor R32(%rbx), R32(%rbx)
+ mov %rax, %r10
+ mov -8(mp), %rax
+ mov %rdx, %r11
+ mul q0
+ add %r9, %r10
+ adc %rax, %r11
+ adc %rdx, %rbx
+ add 16(up), %r11
+ adc $0, %rbx
+ xor R32(%rax), R32(%rax)
+ add %r11, %r14
+ adc 24(up), %rbx
+ mov %r14, (rp)
+ mov %rbx, 8(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+
+L(3): mov (mp_param), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, %r10
+ mov -16(mp), %rax
+ mul q0
+ xor R32(%r9), R32(%r9)
+ xor R32(%r14), R32(%r14)
+ add -8(up), %rbx
+ adc %rax, %r10
+ mov -8(mp), %rax
+ adc %rdx, %r9
+ mul q0
+ add (up), %r10
+ mov %r10, (up)
+ adc %rax, %r9
+ adc %rdx, %r14
+ mov %r10, q0
+ imul u0inv, q0
+ add %r9, 8(up)
+ adc $0, %r14
+ mov %r14, -8(up)
+
+ mov -24(mp), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, %r10
+ mov -16(mp), %rax
+ mul q0
+ xor R32(%r9), R32(%r9)
+ xor R32(%r14), R32(%r14)
+ add (up), %rbx
+ adc %rax, %r10
+ mov -8(mp), %rax
+ adc %rdx, %r9
+ mul q0
+ add 8(up), %r10
+ mov %r10, 8(up)
+ adc %rax, %r9
+ adc %rdx, %r14
+ mov %r10, q0
+ imul u0inv, q0
+ add %r9, 16(up)
+ adc $0, %r14
+ mov %r14, (up)
+
+ mov -24(mp), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, %r10
+ mov -16(mp), %rax
+ mul q0
+ xor R32(%r9), R32(%r9)
+ xor R32(%r14), R32(%r14)
+ add 8(up), %rbx
+ adc %rax, %r10
+ mov -8(mp), %rax
+ adc %rdx, %r9
+ mul q0
+ add 16(up), %r10
+ adc %rax, %r9
+ adc %rdx, %r14
+ add 24(up), %r9
+ adc $0, %r14
+
+ xor R32(%rax), R32(%rax)
+ add -8(up), %r10
+ adc (up), %r9
+ adc 32(up), %r14
+ mov %r10, (rp)
+ mov %r9, 8(rp)
+ mov %r14, 16(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+
+ ALIGN(16)
+L(2m4):
+L(lo2): mov (mp,nneg,8), %rax
+ mul q0
+ xor R32(%r14), R32(%r14)
+ xor R32(%rbx), R32(%rbx)
+ mov %rax, %r10
+ mov 8(mp,nneg,8), %rax
+ mov 24(up,nneg,8), %r15
+ mov %rdx, %r9
+ mul q0
+ add 16(up,nneg,8), %r10
+ adc %rax, %r9
+ mov 16(mp,nneg,8), %rax
+ adc %rdx, %r14
+ mul q0
+ mov $0, R32(%r10) C xor?
+ lea 2(nneg), i
+ add %r9, %r15
+ imul u0inv, %r15
+ jmp L(e2)
+
+ ALIGN(16)
+L(li2): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor R32(%r10), R32(%r10)
+ mul q0
+L(e2): add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul q0
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+ mov 16(mp,i,8), %rax
+ mul q0
+ add %rbx, 24(up,i,8)
+ mov $0, R32(%r14) C zero
+ mov %r14, %rbx C zero
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ mov %r14, %r9 C zero
+ adc %rdx, %r9
+ mul q0
+ add $4, i
+ js L(li2)
+
+L(le2): add %r10, (up)
+ adc %rax, %r9
+ adc %r14, %rdx
+ add %r9, 8(up)
+ adc $0, %rdx
+ mov %rdx, 16(up,nneg,8) C up[0]
+ add $8, up
+ mov %r15, q0
+ dec n
+ jnz L(lo2)
+
+ mov nneg, n
+ sar $2, n
+ lea 32(up,nneg,8), up
+ lea (up,nneg,8), vp
+
+ mov -16(up), %r8
+ mov -8(up), %r9
+ add -16(vp), %r8
+ adc -8(vp), %r9
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+ lea 16(rp), rp
+ jmp L(addx)
+
+
+ ALIGN(16)
+L(1m4):
+L(lo1): mov (mp,nneg,8), %rax
+ xor %r9, %r9
+ xor R32(%rbx), R32(%rbx)
+ mul q0
+ mov %rax, %r9
+ mov 8(mp,nneg,8), %rax
+ mov 24(up,nneg,8), %r15
+ mov %rdx, %r14
+ mov $0, R32(%r10) C xor?
+ mul q0
+ add 16(up,nneg,8), %r9
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 16(mp,nneg,8), %rax
+ mul q0
+ lea 1(nneg), i
+ add %r14, %r15
+ imul u0inv, %r15
+ jmp L(e1)
+
+ ALIGN(16)
+L(li1): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor R32(%r10), R32(%r10)
+ mul q0
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul q0
+L(e1): add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+ mov 16(mp,i,8), %rax
+ mul q0
+ add %rbx, 24(up,i,8)
+ mov $0, R32(%r14) C zero
+ mov %r14, %rbx C zero
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ mov %r14, %r9 C zero
+ adc %rdx, %r9
+ mul q0
+ add $4, i
+ js L(li1)
+
+L(le1): add %r10, (up)
+ adc %rax, %r9
+ adc %r14, %rdx
+ add %r9, 8(up)
+ adc $0, %rdx
+ mov %rdx, 16(up,nneg,8) C up[0]
+ add $8, up
+ mov %r15, q0
+ dec n
+ jnz L(lo1)
+
+ mov nneg, n
+ sar $2, n
+ lea 24(up,nneg,8), up
+ lea (up,nneg,8), vp
+
+ mov -8(up), %r8
+ add -8(vp), %r8
+ mov %r8, (rp)
+ lea 8(rp), rp
+ jmp L(addx)
+
+
+ ALIGN(16)
+L(0):
+L(0m4):
+L(lo0): mov (mp,nneg,8), %rax
+ mov nneg, i
+ mul q0
+ xor R32(%r10), R32(%r10)
+ mov %rax, %r14
+ mov %rdx, %rbx
+ mov 8(mp,nneg,8), %rax
+ mov 24(up,nneg,8), %r15
+ mul q0
+ add 16(up,nneg,8), %r14
+ adc %rax, %rbx
+ adc %rdx, %r10
+ add %rbx, %r15
+ imul u0inv, %r15
+ jmp L(e0)
+
+ ALIGN(16)
+L(li0): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor R32(%r10), R32(%r10)
+ mul q0
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul q0
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+L(e0): mov 16(mp,i,8), %rax
+ mul q0
+ add %rbx, 24(up,i,8)
+ mov $0, R32(%r14) C zero
+ mov %r14, %rbx C zero
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ mov %r14, %r9 C zero
+ adc %rdx, %r9
+ mul q0
+ add $4, i
+ js L(li0)
+
+L(le0): add %r10, (up)
+ adc %rax, %r9
+ adc %r14, %rdx
+ add %r9, 8(up)
+ adc $0, %rdx
+ mov %rdx, 16(up,nneg,8) C up[0]
+ add $8, up
+ mov %r15, q0
+ dec n
+ jnz L(lo0)
+
+ mov nneg, n
+ sar $2, n
+ clc
+ lea 16(up,nneg,8), up
+ lea (up,nneg,8), vp
+ jmp L(addy)
+
+
+ ALIGN(16)
+L(3m4):
+L(lo3): mov (mp,nneg,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, %r10
+ mov 8(mp,nneg,8), %rax
+ mov 24(up,nneg,8), %r15
+ mul q0
+ add 16(up,nneg,8), %rbx C result is zero, might carry
+ mov $0, R32(%rbx) C zero
+ mov %rbx, %r14 C zero
+ adc %rax, %r10
+ mov 16(mp,nneg,8), %rax
+ mov %r14, %r9 C zero
+ adc %rdx, %r9
+ add %r10, %r15
+ mul q0
+ lea 3(nneg), i
+ imul u0inv, %r15
+C jmp L(li3)
+
+ ALIGN(16)
+L(li3): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor R32(%r10), R32(%r10)
+ mul q0
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul q0
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+ mov 16(mp,i,8), %rax
+ mul q0
+ add %rbx, 24(up,i,8)
+ mov $0, R32(%r14) C zero
+ mov %r14, %rbx C zero
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ mov %r14, %r9 C zero
+ adc %rdx, %r9
+ mul q0
+ add $4, i
+ js L(li3)
+
+L(le3): add %r10, (up)
+ adc %rax, %r9
+ adc %r14, %rdx
+ add %r9, 8(up)
+ adc $0, %rdx
+ mov %rdx, 16(up,nneg,8) C up[0]
+ mov %r15, q0
+ lea 8(up), up
+ dec n
+ jnz L(lo3)
+
+
+C ==== Addition code ====
+ mov nneg, n
+ sar $2, n
+ lea 40(up,nneg,8), up
+ lea (up,nneg,8), vp
+
+ mov -24(up), %r8
+ mov -16(up), %r9
+ mov -8(up), %r10
+ add -24(vp), %r8
+ adc -16(vp), %r9
+ adc -8(vp), %r10
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ lea 24(rp), rp
+
+L(addx):inc n
+ jz L(ad3)
+
+L(addy):mov (up), %r8
+ mov 8(up), %r9
+ inc n
+ jmp L(mid)
+
+C ALIGN(16)
+L(al3): adc (vp), %r8
+ adc 8(vp), %r9
+ adc 16(vp), %r10
+ adc 24(vp), %r11
+ mov %r8, (rp)
+ lea 32(up), up
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ inc n
+ mov %r11, 24(rp)
+ lea 32(vp), vp
+ mov (up), %r8
+ mov 8(up), %r9
+ lea 32(rp), rp
+L(mid): mov 16(up), %r10
+ mov 24(up), %r11
+ jnz L(al3)
+
+L(ae3): adc (vp), %r8
+ adc 8(vp), %r9
+ adc 16(vp), %r10
+ adc 24(vp), %r11
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ mov %r11, 24(rp)
+
+L(ad3): mov R32(n), R32(%rax) C zero
+ adc R32(%rax), R32(%rax)
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm
new file mode 100644
index 0000000..60cf945
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm
@@ -0,0 +1,807 @@
+dnl AMD64 mpn_sqr_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C NOTES
+C * There is a major stupidity in that we call mpn_mul_1 initially, for a
+C large trip count. Instead, we should follow the generic/sqr_basecase.c
+C code which uses addmul_2s from the start, conditionally leaving a 1x1
+C multiply to the end. (In assembly code, one would stop invoking
+C    addmul_2s loops when perhaps a 3x2 or a 2x2 remains.)
+C * Another stupidity is in the sqr_diag_addlsh1 code. It does not need to
+C save/restore carry, instead it can propagate into the high product word.
+C * Align more labels, should shave off a few cycles.
+C * We can safely use 32-bit size operations, since operands with (2^32)
+C limbs will lead to non-termination in practice.
+C * The jump table could probably be optimized, at least for non-pic.
+C * The special code for n <= 4 was quickly written. It is probably too
+C large and unnecessarily slow.
+C * Consider combining small cases code so that the n=k-1 code jumps into the
+C middle of the n=k code.
+C * Avoid saving registers for small cases code.
+C * Needed variables:
+C n r11 input size
+C i r8 work left, initially n
+C j r9 inner loop count
+C r15 unused
+C v0 r13
+C v1 r14
+C rp rdi
+C up rsi
+C w0 rbx
+C w1 rcx
+C w2 rbp
+C w3 r10
+C tp r12
+C lo rax
+C hi rdx
+C rsp
+
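+C An illustrative C sketch of the overall scheme (not the unrolled code
+C below): the off-diagonal products u_i*u_j, i < j, are formed once, then
+C doubled and combined with the diagonal squares, which is the job of the
+C mpn_sqr_diag_addlsh1 loop near the end of this file.  Assumptions: 64-bit
+C limbs, unsigned __int128, C99; the helper name is made up for the sketch.
+C
+C     void sqr_basecase_sketch (uint64_t *rp, const uint64_t *up, size_t n)
+C     {
+C       uint64_t t[2 * n];
+C       memset (t, 0, sizeof t);
+C       for (size_t i = 0; i + 1 < n; i++)    /* off-diagonal half product */
+C         {
+C           unsigned __int128 cy = 0;
+C           for (size_t j = i + 1; j < n; j++)
+C             {
+C               cy += (unsigned __int128) up[i] * up[j] + t[i + j];
+C               t[i + j] = (uint64_t) cy;
+C               cy >>= 64;
+C             }
+C           t[i + n] = (uint64_t) cy;
+C         }
+C       unsigned __int128 cy = 0;             /* rp = 2*t + sum up[i]^2 B^(2i) */
+C       for (size_t k = 0; k < 2 * n; k++)
+C         {
+C           unsigned __int128 d = (unsigned __int128) up[k / 2] * up[k / 2];
+C           cy += ((unsigned __int128) t[k] << 1)
+C                 + (uint64_t) (k & 1 ? d >> 64 : d);
+C           rp[k] = (uint64_t) cy;
+C           cy >>= 64;
+C         }
+C     }
+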
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param', `%rdx')
+
+define(`n', `%r11')
+define(`tp', `%r12')
+define(`i', `%r8')
+define(`j', `%r9')
+define(`v0', `%r13')
+define(`v1', `%r14')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+ mov R32(n_param), R32(%rcx)
+ mov R32(n_param), R32(n) C free original n register (rdx)
+
+ add $-40, %rsp
+
+ and $3, R32(%rcx)
+ cmp $4, R32(n_param)
+ lea 4(%rcx), %r8
+
+ mov %rbx, 32(%rsp)
+ mov %rbp, 24(%rsp)
+ mov %r12, 16(%rsp)
+ mov %r13, 8(%rsp)
+ mov %r14, (%rsp)
+
+ cmovg %r8, %rcx
+
+ lea L(tab)(%rip), %rax
+ifdef(`PIC',
+` movslq (%rax,%rcx,4), %r10
+ add %r10, %rax
+ jmp *%rax
+',`
+ jmp *(%rax,%rcx,8)
+')
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(4), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
+ JMPENT( L(0m4), L(tab))
+ JMPENT( L(1m4), L(tab))
+ JMPENT( L(2m4), L(tab))
+ JMPENT( L(3m4), L(tab))
+ TEXT
+
+L(1): mov (up), %rax
+ mul %rax
+ add $40, %rsp
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(2): mov (up), %rax
+ mov %rax, %r8
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, %r9
+ mul %rax
+ add $40, %rsp
+ mov %rax, %r10
+ mov %r11, %rax
+ mov %rdx, %r11
+ mul %r8
+ xor %r8, %r8
+ add %rax, %r9
+ adc %rdx, %r10
+ adc %r8, %r11
+ add %rax, %r9
+ mov %r9, 8(rp)
+ adc %rdx, %r10
+ mov %r10, 16(rp)
+ adc %r8, %r11
+ mov %r11, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(3): mov (up), %rax
+ mov %rax, %r10
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, 8(rp)
+ mul %rax
+ mov 16(up), %rcx
+ mov %rax, 16(rp)
+ mov %rcx, %rax
+ mov %rdx, 24(rp)
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+
+ mov %r11, %rax
+ mul %r10
+ mov %rax, %r8
+ mov %rcx, %rax
+ mov %rdx, %r9
+ mul %r10
+ xor %r10, %r10
+ add %rax, %r9
+ mov %r11, %rax
+ mov %r10, %r11
+ adc %rdx, %r10
+
+ mul %rcx
+ add $40, %rsp
+ add %rax, %r10
+ adc %r11, %rdx
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %rdx, %rdx
+ adc %r11, %r11
+ add %r8, 8(rp)
+ adc %r9, 16(rp)
+ adc %r10, 24(rp)
+ adc %rdx, 32(rp)
+ adc %r11, 40(rp)
+ FUNC_EXIT()
+ ret
+
+L(4): mov (up), %rax
+ mov %rax, %r11
+ mul %rax
+ mov 8(up), %rbx
+ mov %rax, (rp)
+ mov %rbx, %rax
+ mov %rdx, 8(rp)
+ mul %rax
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ mov 16(up), %rax
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+ mov 24(up), %rax
+ mul %rax
+ mov %rax, 48(rp)
+ mov %rbx, %rax
+ mov %rdx, 56(rp)
+
+ mul %r11
+ add $32, %rsp
+ mov %rax, %r8
+ mov %rdx, %r9
+ mov 16(up), %rax
+ mul %r11
+ xor %r10, %r10
+ add %rax, %r9
+ adc %rdx, %r10
+ mov 24(up), %rax
+ mul %r11
+ xor %r11, %r11
+ add %rax, %r10
+ adc %rdx, %r11
+ mov 16(up), %rax
+ mul %rbx
+ xor %rcx, %rcx
+ add %rax, %r10
+ adc %rdx, %r11
+ adc $0, %rcx
+ mov 24(up), %rax
+ mul %rbx
+ pop %rbx
+ add %rax, %r11
+ adc %rdx, %rcx
+ mov 16(up), %rdx
+ mov 24(up), %rax
+ mul %rdx
+ add %rax, %rcx
+ adc $0, %rdx
+
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %r11, %r11
+ adc %rcx, %rcx
+ mov $0, R32(%rax)
+ adc %rdx, %rdx
+
+ adc %rax, %rax
+ add %r8, 8(rp)
+ adc %r9, 16(rp)
+ adc %r10, 24(rp)
+ adc %r11, 32(rp)
+ adc %rcx, 40(rp)
+ adc %rdx, 48(rp)
+ adc %rax, 56(rp)
+ FUNC_EXIT()
+ ret
+
+
+L(0m4):
+ lea -16(rp,n,8), tp C point tp in middle of result operand
+ mov (up), v0
+ mov 8(up), %rax
+ lea (up,n,8), up C point up at end of input operand
+
+ lea -4(n), i
+C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
+ xor R32(j), R32(j)
+ sub n, j
+
+ mul v0
+ xor R32(w2), R32(w2)
+ mov %rax, w0
+ mov 16(up,j,8), %rax
+ mov %rdx, w3
+ jmp L(L3)
+
+ ALIGN(16)
+L(mul_1_m3_top):
+ add %rax, w2
+ mov w3, (tp,j,8)
+ mov (up,j,8), %rax
+ adc %rdx, w1
+ xor R32(w0), R32(w0)
+ mul v0
+ xor R32(w3), R32(w3)
+ mov w2, 8(tp,j,8)
+ add %rax, w1
+ adc %rdx, w0
+ mov 8(up,j,8), %rax
+ mov w1, 16(tp,j,8)
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov 16(up,j,8), %rax
+ adc %rdx, w3
+L(L3): xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w3
+ mov 24(up,j,8), %rax
+ adc %rdx, w2
+ mov w0, 24(tp,j,8)
+ mul v0
+ add $4, j
+ js L(mul_1_m3_top)
+
+ add %rax, w2
+ mov w3, (tp)
+ adc %rdx, w1
+ mov w2, 8(tp)
+ mov w1, 16(tp)
+
+ lea eval(2*8)(tp), tp C tp += 2
+ lea -8(up), up
+ jmp L(dowhile)
+
+
+L(1m4):
+ lea 8(rp,n,8), tp C point tp in middle of result operand
+ mov (up), v0 C u0
+ mov 8(up), %rax C u1
+ lea 8(up,n,8), up C point up at end of input operand
+
+ lea -3(n), i
+C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
+ lea -3(n), j
+ neg j
+
+ mov %rax, v1 C u1
+ mul v0 C u0 * u1
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ mov %rax, 8(rp)
+ jmp L(m0)
+
+ ALIGN(16)
+L(mul_2_m0_top):
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov -24(up,j,8), %rax
+ mov $0, R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1 C v1 * u0
+ add %rax, w1
+ mov w0, -24(tp,j,8)
+ adc %rdx, w2
+L(m0): mov -16(up,j,8), %rax C u2, u6 ...
+ mul v0 C u0 * u2
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ adc $0, R32(w3)
+ mov $0, R32(w0)
+ mov w1, -16(tp,j,8)
+ mul v1
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mul v0
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ add %rax, w3
+ mov w2, -8(tp,j,8)
+ adc %rdx, w0
+L(m2x): mov (up,j,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+ add $4, j
+ mov -32(up,j,8), %rax
+ mov w3, -32(tp,j,8)
+ js L(mul_2_m0_top)
+
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, -8(tp)
+ mov w1, (tp)
+
+ lea -16(up), up
+ lea eval(3*8-24)(tp), tp C tp += 3
+ jmp L(dowhile_end)
+
+
+L(2m4):
+ lea -16(rp,n,8), tp C point tp in middle of result operand
+ mov (up), v0
+ mov 8(up), %rax
+ lea (up,n,8), up C point up at end of input operand
+
+ lea -4(n), i
+C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
+ lea -2(n), j
+ neg j
+
+ mul v0
+ mov %rax, w2
+ mov (up,j,8), %rax
+ mov %rdx, w1
+ jmp L(L1)
+
+ ALIGN(16)
+L(mul_1_m1_top):
+ add %rax, w2
+ mov w3, (tp,j,8)
+ mov (up,j,8), %rax
+ adc %rdx, w1
+L(L1): xor R32(w0), R32(w0)
+ mul v0
+ xor R32(w3), R32(w3)
+ mov w2, 8(tp,j,8)
+ add %rax, w1
+ adc %rdx, w0
+ mov 8(up,j,8), %rax
+ mov w1, 16(tp,j,8)
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov 16(up,j,8), %rax
+ adc %rdx, w3
+ xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w3
+ mov 24(up,j,8), %rax
+ adc %rdx, w2
+ mov w0, 24(tp,j,8)
+ mul v0
+ add $4, j
+ js L(mul_1_m1_top)
+
+ add %rax, w2
+ mov w3, (tp)
+ adc %rdx, w1
+ mov w2, 8(tp)
+ mov w1, 16(tp)
+
+ lea eval(2*8)(tp), tp C tp += 2
+ lea -8(up), up
+ jmp L(dowhile_mid)
+
+
+L(3m4):
+ lea 8(rp,n,8), tp C point tp in middle of result operand
+ mov (up), v0 C u0
+ mov 8(up), %rax C u1
+ lea 8(up,n,8), up C point up at end of input operand
+
+ lea -5(n), i
+C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
+ lea -1(n), j
+ neg j
+
+ mov %rax, v1 C u1
+ mul v0 C u0 * u1
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ mov %rax, 8(rp)
+ jmp L(m2)
+
+ ALIGN(16)
+L(mul_2_m2_top):
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov -24(up,j,8), %rax
+ mov $0, R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1 C v1 * u0
+ add %rax, w1
+ mov w0, -24(tp,j,8)
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ adc $0, R32(w3)
+ mov $0, R32(w0)
+ mov w1, -16(tp,j,8)
+ mul v1
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mul v0
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ add %rax, w3
+ mov w2, -8(tp,j,8)
+ adc %rdx, w0
+L(m2): mov (up,j,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+ add $4, j
+ mov -32(up,j,8), %rax
+ mov w3, -32(tp,j,8)
+ js L(mul_2_m2_top)
+
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, -8(tp)
+ mov w1, (tp)
+
+ lea -16(up), up
+ jmp L(dowhile_mid)
+
+L(dowhile):
+C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
+ lea 4(i), j
+ neg j
+
+ mov 16(up,j,8), v0
+ mov 24(up,j,8), v1
+ mov 24(up,j,8), %rax
+ mul v0
+ xor R32(w3), R32(w3)
+ add %rax, 24(tp,j,8)
+ adc %rdx, w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ jmp L(am2)
+
+ ALIGN(16)
+L(addmul_2_m2_top):
+ add w3, (tp,j,8)
+ adc %rax, w0
+ mov 8(up,j,8), %rax
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mul v0
+ add %rax, w0
+ mov 8(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1 C v1 * u0
+ add w0, 8(tp,j,8)
+ adc %rax, w1
+ adc %rdx, w2
+ mov 16(up,j,8), %rax
+ mov $0, R32(w3)
+ mul v0 C v0 * u1
+ add %rax, w1
+ mov 16(up,j,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mul v1 C v1 * u1
+ add w1, 16(tp,j,8)
+ adc %rax, w2
+ mov 24(up,j,8), %rax
+ adc %rdx, w3
+ mul v0
+ mov $0, R32(w0)
+ add %rax, w2
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mov 24(up,j,8), %rax
+ adc $0, R32(w0)
+ mul v1
+ add w2, 24(tp,j,8)
+ adc %rax, w3
+ adc %rdx, w0
+L(am2): mov 32(up,j,8), %rax
+ mul v0
+ add %rax, w3
+ mov 32(up,j,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add $4, j
+ js L(addmul_2_m2_top)
+
+ add w3, (tp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(tp)
+ mov w1, 16(tp)
+
+ lea eval(2*8)(tp), tp C tp += 2
+
+ add $-2, R32(i) C i -= 2
+
+L(dowhile_mid):
+C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
+ lea 2(i), j
+ neg j
+
+ mov (up,j,8), v0
+ mov 8(up,j,8), v1
+ mov 8(up,j,8), %rax
+ mul v0
+ xor R32(w1), R32(w1)
+ add %rax, 8(tp,j,8)
+ adc %rdx, w1
+ xor R32(w2), R32(w2)
+ jmp L(20)
+
+ ALIGN(16)
+L(addmul_2_m0_top):
+ add w3, (tp,j,8)
+ adc %rax, w0
+ mov 8(up,j,8), %rax
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mul v0
+ add %rax, w0
+ mov 8(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1 C v1 * u0
+ add w0, 8(tp,j,8)
+ adc %rax, w1
+ adc %rdx, w2
+L(20): mov 16(up,j,8), %rax
+ mov $0, R32(w3)
+ mul v0 C v0 * u1
+ add %rax, w1
+ mov 16(up,j,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mul v1 C v1 * u1
+ add w1, 16(tp,j,8)
+ adc %rax, w2
+ mov 24(up,j,8), %rax
+ adc %rdx, w3
+ mul v0
+ mov $0, R32(w0)
+ add %rax, w2
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mov 24(up,j,8), %rax
+ adc $0, R32(w0)
+ mul v1
+ add w2, 24(tp,j,8)
+ adc %rax, w3
+ adc %rdx, w0
+ mov 32(up,j,8), %rax
+ mul v0
+ add %rax, w3
+ mov 32(up,j,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add $4, j
+ js L(addmul_2_m0_top)
+
+ add w3, (tp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(tp)
+ mov w1, 16(tp)
+
+ lea eval(2*8)(tp), tp C tp += 2
+L(dowhile_end):
+
+ add $-2, R32(i) C i -= 2
+ jne L(dowhile)
+
+C Function mpn_addmul_2s_2
+ mov -16(up), v0
+ mov -8(up), v1
+ mov -8(up), %rax
+ mul v0
+ xor R32(w3), R32(w3)
+ add %rax, -8(tp)
+ adc %rdx, w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ mov (up), %rax
+ mul v0
+ add %rax, w3
+ mov (up), %rax
+ adc %rdx, w0
+ mul v1
+ add w3, (tp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(tp)
+ mov w1, 16(tp)
+
+C Function mpn_sqr_diag_addlsh1
+ lea -4(n,n), j
+
+ mov 8(rp), %r11
+ lea -8(up), up
+ lea (rp,j,8), rp
+ neg j
+ mov (up,j,4), %rax
+ mul %rax
+ test $2, R8(j)
+ jnz L(odd)
+
+L(evn): add %r11, %r11
+ sbb R32(%rbx), R32(%rbx) C save CF
+ add %rdx, %r11
+ mov %rax, (rp,j,8)
+ jmp L(d0)
+
+L(odd): add %r11, %r11
+ sbb R32(%rbp), R32(%rbp) C save CF
+ add %rdx, %r11
+ mov %rax, (rp,j,8)
+ lea -2(j), j
+ jmp L(d1)
+
+ ALIGN(16)
+L(top): mov (up,j,4), %rax
+ mul %rax
+ add R32(%rbp), R32(%rbp) C restore carry
+ adc %rax, %r10
+ adc %rdx, %r11
+ mov %r10, (rp,j,8)
+L(d0): mov %r11, 8(rp,j,8)
+ mov 16(rp,j,8), %r10
+ adc %r10, %r10
+ mov 24(rp,j,8), %r11
+ adc %r11, %r11
+ nop
+ sbb R32(%rbp), R32(%rbp) C save CF
+ mov 8(up,j,4), %rax
+ mul %rax
+ add R32(%rbx), R32(%rbx) C restore carry
+ adc %rax, %r10
+ adc %rdx, %r11
+ mov %r10, 16(rp,j,8)
+L(d1): mov %r11, 24(rp,j,8)
+ mov 32(rp,j,8), %r10
+ adc %r10, %r10
+ mov 40(rp,j,8), %r11
+ adc %r11, %r11
+ sbb R32(%rbx), R32(%rbx) C save CF
+ add $4, j
+ js L(top)
+
+ mov (up), %rax
+ mul %rax
+ add R32(%rbp), R32(%rbp) C restore carry
+ adc %rax, %r10
+ adc %rdx, %r11
+ mov %r10, (rp)
+ mov %r11, 8(rp)
+ mov 16(rp), %r10
+ adc %r10, %r10
+ sbb R32(%rbp), R32(%rbp) C save CF
+ neg R32(%rbp)
+ mov 8(up), %rax
+ mul %rax
+ add R32(%rbx), R32(%rbx) C restore carry
+ adc %rax, %r10
+ adc %rbp, %rdx
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/logops_n.asm b/gmp-6.3.0/mpn/x86_64/logops_n.asm
new file mode 100644
index 0000000..e25854d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/logops_n.asm
@@ -0,0 +1,260 @@
+dnl AMD64 logops.
+
+dnl Copyright 2004-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C c/l c/l c/l good
+C var-1 var-2 var-3 for cpu?
+C AMD K8,K9 1.5 1.5 1.5 y
+C AMD K10 1.5 1.5 1.5 y
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD bt1 2.67 ~2.79 ~2.67
+C AMD bt2 2.0 2.28 2.28 y
+C AMD zen 1.5 1.5 1.5 =
+C Intel P4 2.8 3.35 3.6
+C Intel PNR 2.0 2.0 2.0 =
+C Intel NHM 2.0 2.0 2.0 =
+C Intel SBR 1.5 1.75 1.75 n
+C Intel IBR 1.48 1.71 1.72 n
+C Intel HWL 1.5 1.5 1.5 n
+C Intel BWL 1.5 1.5 1.5 n
+C Intel SKL 1.5 1.5 1.5 n
+C Intel atom 3.82 3.82 3.82 n
+C Intel SLM 3.0 3.0 3.0 =
+C VIA nano 3.25
+
+ifdef(`OPERATION_and_n',`
+ define(`func',`mpn_and_n')
+ define(`VARIANT_1')
+ define(`LOGOP',`and')')
+ifdef(`OPERATION_andn_n',`
+ define(`func',`mpn_andn_n')
+ define(`VARIANT_2')
+ define(`LOGOP',`and')')
+ifdef(`OPERATION_nand_n',`
+ define(`func',`mpn_nand_n')
+ define(`VARIANT_3')
+ define(`LOGOP',`and')')
+ifdef(`OPERATION_ior_n',`
+ define(`func',`mpn_ior_n')
+ define(`VARIANT_1')
+ define(`LOGOP',`or')')
+ifdef(`OPERATION_iorn_n',`
+ define(`func',`mpn_iorn_n')
+ define(`VARIANT_2')
+ define(`LOGOP',`or')')
+ifdef(`OPERATION_nior_n',`
+ define(`func',`mpn_nior_n')
+ define(`VARIANT_3')
+ define(`LOGOP',`or')')
+ifdef(`OPERATION_xor_n',`
+ define(`func',`mpn_xor_n')
+ define(`VARIANT_1')
+ define(`LOGOP',`xor')')
+ifdef(`OPERATION_xnor_n',`
+ define(`func',`mpn_xnor_n')
+ define(`VARIANT_2')
+ define(`LOGOP',`xor')')
+
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
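+C The eight entry points differ only in where a complement is applied.  As a
+C plain reference (64-bit limbs assumed), each variant computes one of
+C
+C     for (i = 0; i < n; i++)  rp[i] =   up[i] OP  vp[i];     /* VARIANT_1 */
+C     for (i = 0; i < n; i++)  rp[i] =   up[i] OP ~vp[i];     /* VARIANT_2 */
+C     for (i = 0; i < n; i++)  rp[i] = ~(up[i] OP  vp[i]);    /* VARIANT_3 */
+C
+C where OP is &, | or ^; mpn_xnor_n reuses VARIANT_2 since up ^ ~vp equals
+C ~(up ^ vp).
+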
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n',`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+
+ifdef(`VARIANT_1',`
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ mov (vp), %r8
+ mov R32(%rcx), R32(%rax)
+ lea (vp,n,8), vp
+ lea (up,n,8), up
+ lea (rp,n,8), rp
+ neg n
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): LOGOP (up,n,8), %r8
+ mov %r8, (rp,n,8)
+ dec n
+ jmp L(e11)
+L(b10): add $-2, n
+ jmp L(e10)
+L(b01): LOGOP (up,n,8), %r8
+ mov %r8, (rp,n,8)
+ inc n
+ jz L(ret)
+
+L(top): mov (vp,n,8), %r8
+L(b00): mov 8(vp,n,8), %r9
+ LOGOP (up,n,8), %r8
+ LOGOP 8(up,n,8), %r9
+ nop C K8/K9/K10 concession
+ mov %r8, (rp,n,8)
+ mov %r9, 8(rp,n,8)
+L(e11): mov 16(vp,n,8), %r8
+L(e10): mov 24(vp,n,8), %r9
+ LOGOP 16(up,n,8), %r8
+ LOGOP 24(up,n,8), %r9
+ mov %r8, 16(rp,n,8)
+ mov %r9, 24(rp,n,8)
+ add $4, n
+ jnc L(top)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
+')
+
+ifdef(`VARIANT_2',`
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ mov (vp), %r8
+ not %r8
+ mov R32(%rcx), R32(%rax)
+ lea (vp,n,8), vp
+ lea (up,n,8), up
+ lea (rp,n,8), rp
+ neg n
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): LOGOP (up,n,8), %r8
+ mov %r8, (rp,n,8)
+ dec n
+ jmp L(e11)
+L(b10): add $-2, n
+ jmp L(e10)
+ .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90
+L(b01): LOGOP (up,n,8), %r8
+ mov %r8, (rp,n,8)
+ inc n
+ jz L(ret)
+
+L(top): mov (vp,n,8), %r8
+ not %r8
+L(b00): mov 8(vp,n,8), %r9
+ not %r9
+ LOGOP (up,n,8), %r8
+ LOGOP 8(up,n,8), %r9
+ mov %r8, (rp,n,8)
+ mov %r9, 8(rp,n,8)
+L(e11): mov 16(vp,n,8), %r8
+ not %r8
+L(e10): mov 24(vp,n,8), %r9
+ not %r9
+ LOGOP 16(up,n,8), %r8
+ LOGOP 24(up,n,8), %r9
+ mov %r8, 16(rp,n,8)
+ mov %r9, 24(rp,n,8)
+ add $4, n
+ jnc L(top)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
+')
+
+ifdef(`VARIANT_3',`
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ mov (vp), %r8
+ mov R32(%rcx), R32(%rax)
+ lea (vp,n,8), vp
+ lea (up,n,8), up
+ lea (rp,n,8), rp
+ neg n
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): LOGOP (up,n,8), %r8
+ not %r8
+ mov %r8, (rp,n,8)
+ dec n
+ jmp L(e11)
+L(b10): add $-2, n
+ jmp L(e10)
+ .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90
+L(b01): LOGOP (up,n,8), %r8
+ not %r8
+ mov %r8, (rp,n,8)
+ inc n
+ jz L(ret)
+
+L(top): mov (vp,n,8), %r8
+L(b00): mov 8(vp,n,8), %r9
+ LOGOP (up,n,8), %r8
+ not %r8
+ LOGOP 8(up,n,8), %r9
+ not %r9
+ mov %r8, (rp,n,8)
+ mov %r9, 8(rp,n,8)
+L(e11): mov 16(vp,n,8), %r8
+L(e10): mov 24(vp,n,8), %r9
+ LOGOP 16(up,n,8), %r8
+ not %r8
+ LOGOP 24(up,n,8), %r9
+ not %r9
+ mov %r8, 16(rp,n,8)
+ mov %r9, 24(rp,n,8)
+ add $4, n
+ jnc L(top)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
+')
diff --git a/gmp-6.3.0/mpn/x86_64/lshift.asm b/gmp-6.3.0/mpn/x86_64/lshift.asm
new file mode 100644
index 0000000..fff3152
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/lshift.asm
@@ -0,0 +1,172 @@
+dnl AMD64 mpn_lshift -- mpn left shift.
+
+dnl Copyright 2003, 2005, 2007, 2009, 2011, 2012, 2018 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb cycles/limb cnt=1
+C AMD K8,K9 2.375 1.375
+C AMD K10 2.375 1.375
+C Intel P4 8 10.5
+C Intel core2 2.11 4.28
+C Intel corei ? ?
+C Intel atom 5.75 3.5
+C VIA nano 3.5 2.25
+
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
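+C A reference sketch of the semantics (illustrative only, not this routine's
+C code): with 64-bit limbs and 0 < cnt < 64, the n-limb operand is shifted
+C left by cnt bits, working from the most significant limb down, and the
+C bits shifted out of the top limb are the return value.  The helper name is
+C made up for the sketch.
+C
+C     uint64_t lshift_sketch (uint64_t *rp, const uint64_t *up,
+C                             size_t n, unsigned cnt)
+C     {
+C       uint64_t ret = up[n - 1] >> (64 - cnt);
+C       for (size_t i = n - 1; i > 0; i--)
+C         rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
+C       rp[0] = up[0] << cnt;
+C       return ret;
+C     }
+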
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_lshift)
+ FUNC_ENTRY(4)
+ neg R32(%rcx) C put rsh count in cl
+ mov -8(up,n,8), %rax
+ shr R8(%rcx), %rax C function return value
+
+ neg R32(%rcx) C put lsh count in cl
+ lea 1(n), R32(%r8)
+ and $3, R32(%r8)
+ je L(rlx) C jump for n = 3, 7, 11, ...
+
+ dec R32(%r8)
+ jne L(1)
+C n = 4, 8, 12, ...
+ mov -8(up,n,8), %r10
+ shl R8(%rcx), %r10
+ neg R32(%rcx) C put rsh count in cl
+ mov -16(up,n,8), %r8
+ shr R8(%rcx), %r8
+ or %r8, %r10
+ mov %r10, -8(rp,n,8)
+ dec n
+ jmp L(rll)
+
+L(1): dec R32(%r8)
+ je L(1x) C jump for n = 1, 5, 9, 13, ...
+C n = 2, 6, 10, 14, ...
+ mov -8(up,n,8), %r10
+ shl R8(%rcx), %r10
+ neg R32(%rcx) C put rsh count in cl
+ mov -16(up,n,8), %r8
+ shr R8(%rcx), %r8
+ or %r8, %r10
+ mov %r10, -8(rp,n,8)
+ dec n
+ neg R32(%rcx) C put lsh count in cl
+L(1x):
+ cmp $1, n
+ je L(ast)
+ mov -8(up,n,8), %r10
+ shl R8(%rcx), %r10
+ mov -16(up,n,8), %r11
+ shl R8(%rcx), %r11
+ neg R32(%rcx) C put rsh count in cl
+ mov -16(up,n,8), %r8
+ mov -24(up,n,8), %r9
+ shr R8(%rcx), %r8
+ or %r8, %r10
+ shr R8(%rcx), %r9
+ or %r9, %r11
+ mov %r10, -8(rp,n,8)
+ mov %r11, -16(rp,n,8)
+ sub $2, n
+
+L(rll): neg R32(%rcx) C put lsh count in cl
+L(rlx): mov -8(up,n,8), %r10
+ shl R8(%rcx), %r10
+ mov -16(up,n,8), %r11
+ shl R8(%rcx), %r11
+
+ sub $4, n C 4
+ jb L(end) C 2
+ ALIGN(16)
+L(top):
+ C finish stuff from lsh block
+ neg R32(%rcx) C put rsh count in cl
+ mov 16(up,n,8), %r8
+ mov 8(up,n,8), %r9
+ shr R8(%rcx), %r8
+ or %r8, %r10
+ shr R8(%rcx), %r9
+ or %r9, %r11
+ mov %r10, 24(rp,n,8)
+ mov %r11, 16(rp,n,8)
+ C start two new rsh
+ mov 0(up,n,8), %r8
+ mov -8(up,n,8), %r9
+ shr R8(%rcx), %r8
+ shr R8(%rcx), %r9
+
+ C finish stuff from rsh block
+ neg R32(%rcx) C put lsh count in cl
+ mov 8(up,n,8), %r10
+ mov 0(up,n,8), %r11
+ shl R8(%rcx), %r10
+ or %r10, %r8
+ shl R8(%rcx), %r11
+ or %r11, %r9
+ mov %r8, 8(rp,n,8)
+ mov %r9, 0(rp,n,8)
+ C start two new lsh
+ mov -8(up,n,8), %r10
+ mov -16(up,n,8), %r11
+ shl R8(%rcx), %r10
+ shl R8(%rcx), %r11
+
+ sub $4, n
+ jae L(top) C 2
+L(end):
+ neg R32(%rcx) C put rsh count in cl
+ mov 8(up), %r8
+ shr R8(%rcx), %r8
+ or %r8, %r10
+ mov (up), %r9
+ shr R8(%rcx), %r9
+ or %r9, %r11
+ mov %r10, 16(rp)
+ mov %r11, 8(rp)
+
+ neg R32(%rcx) C put lsh count in cl
+L(ast): mov (up), %r10
+ shl R8(%rcx), %r10
+ mov %r10, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/lshiftc.asm
new file mode 100644
index 0000000..c4ba04a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/lshiftc.asm
@@ -0,0 +1,182 @@
+dnl AMD64 mpn_lshiftc -- mpn left shift with complement.
+
+dnl Copyright 2003, 2005, 2006, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 2.75
+C AMD K10 2.75
+C Intel P4 ?
+C Intel core2 ?
+C Intel corei ?
+C Intel atom ?
+C VIA nano 3.75
+
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
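+C This is mpn_lshift (see lshift.asm) with every stored limb complemented on
+C the way out; the return value (the bits shifted out of the top limb) is
+C not complemented.
+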
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_lshiftc)
+ FUNC_ENTRY(4)
+ neg R32(%rcx) C put rsh count in cl
+ mov -8(up,n,8), %rax
+ shr R8(%rcx), %rax C function return value
+
+ neg R32(%rcx) C put lsh count in cl
+ lea 1(n), R32(%r8)
+ and $3, R32(%r8)
+ je L(rlx) C jump for n = 3, 7, 11, ...
+
+ dec R32(%r8)
+ jne L(1)
+C n = 4, 8, 12, ...
+ mov -8(up,n,8), %r10
+ shl R8(%rcx), %r10
+ neg R32(%rcx) C put rsh count in cl
+ mov -16(up,n,8), %r8
+ shr R8(%rcx), %r8
+ or %r8, %r10
+ not %r10
+ mov %r10, -8(rp,n,8)
+ dec n
+ jmp L(rll)
+
+L(1): dec R32(%r8)
+ je L(1x) C jump for n = 1, 5, 9, 13, ...
+C n = 2, 6, 10, 14, ...
+ mov -8(up,n,8), %r10
+ shl R8(%rcx), %r10
+ neg R32(%rcx) C put rsh count in cl
+ mov -16(up,n,8), %r8
+ shr R8(%rcx), %r8
+ or %r8, %r10
+ not %r10
+ mov %r10, -8(rp,n,8)
+ dec n
+ neg R32(%rcx) C put lsh count in cl
+L(1x):
+ cmp $1, n
+ je L(ast)
+ mov -8(up,n,8), %r10
+ shl R8(%rcx), %r10
+ mov -16(up,n,8), %r11
+ shl R8(%rcx), %r11
+ neg R32(%rcx) C put rsh count in cl
+ mov -16(up,n,8), %r8
+ mov -24(up,n,8), %r9
+ shr R8(%rcx), %r8
+ or %r8, %r10
+ shr R8(%rcx), %r9
+ or %r9, %r11
+ not %r10
+ not %r11
+ mov %r10, -8(rp,n,8)
+ mov %r11, -16(rp,n,8)
+ sub $2, n
+
+L(rll): neg R32(%rcx) C put lsh count in cl
+L(rlx): mov -8(up,n,8), %r10
+ shl R8(%rcx), %r10
+ mov -16(up,n,8), %r11
+ shl R8(%rcx), %r11
+
+ sub $4, n C 4
+ jb L(end) C 2
+ ALIGN(16)
+L(top):
+ C finish stuff from lsh block
+ neg R32(%rcx) C put rsh count in cl
+ mov 16(up,n,8), %r8
+ mov 8(up,n,8), %r9
+ shr R8(%rcx), %r8
+ or %r8, %r10
+ shr R8(%rcx), %r9
+ or %r9, %r11
+ not %r10
+ not %r11
+ mov %r10, 24(rp,n,8)
+ mov %r11, 16(rp,n,8)
+ C start two new rsh
+ mov 0(up,n,8), %r8
+ mov -8(up,n,8), %r9
+ shr R8(%rcx), %r8
+ shr R8(%rcx), %r9
+
+ C finish stuff from rsh block
+ neg R32(%rcx) C put lsh count in cl
+ mov 8(up,n,8), %r10
+ mov 0(up,n,8), %r11
+ shl R8(%rcx), %r10
+ or %r10, %r8
+ shl R8(%rcx), %r11
+ or %r11, %r9
+ not %r8
+ not %r9
+ mov %r8, 8(rp,n,8)
+ mov %r9, 0(rp,n,8)
+ C start two new lsh
+ mov -8(up,n,8), %r10
+ mov -16(up,n,8), %r11
+ shl R8(%rcx), %r10
+ shl R8(%rcx), %r11
+
+ sub $4, n
+ jae L(top) C 2
+L(end):
+ neg R32(%rcx) C put rsh count in cl
+ mov 8(up), %r8
+ shr R8(%rcx), %r8
+ or %r8, %r10
+ mov (up), %r9
+ shr R8(%rcx), %r9
+ or %r9, %r11
+ not %r10
+ not %r11
+ mov %r10, 16(rp)
+ mov %r11, 8(rp)
+
+ neg R32(%rcx) C put lsh count in cl
+L(ast): mov (up), %r10
+ shl R8(%rcx), %r10
+ not %r10
+ mov %r10, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/lshsub_n.asm b/gmp-6.3.0/mpn/x86_64/lshsub_n.asm
new file mode 100644
index 0000000..4d428c0
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/lshsub_n.asm
@@ -0,0 +1,172 @@
+dnl AMD64 mpn_lshsub_n. R = 2^k(U - V).
+
+dnl Copyright 2006, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l)
+C AMD K10 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l)
+C Intel P4 16.5
+C Intel core2 4.35
+C Intel corei ?
+C Intel atom ?
+C VIA nano ?
+
+C This was written quickly and not optimized at all, but it runs very well on
+C K8. But perhaps one could get under 3 c/l. Ideas:
+C 1) Use indexing to save the 3 LEA
+C 2) Write reasonable feed-in code
+C 3) Be more clever about register usage
+C 4) Unroll more, handling CL negation, carry save/restore cost much now
+C 5) Reschedule
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnt', `%r8')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshsub_n)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ push %rbx
+
+ mov n, %rax
+ xor R32(%rbx), R32(%rbx) C clear carry save register
+ mov R32(%r8), R32(%rcx) C shift count
+ xor R32(%r15), R32(%r15) C limb carry
+
+ mov R32(%rax), R32(%r11)
+ and $3, R32(%r11)
+ je L(4)
+ sub $1, R32(%r11)
+
+L(oopette):
+ add R32(%rbx), R32(%rbx) C restore carry flag
+ mov 0(up), %r8
+ lea 8(up), up
+ sbb 0(vp), %r8
+ mov %r8, %r12
+ sbb R32(%rbx), R32(%rbx) C save carry flag
+ shl R8(%rcx), %r8
+ or %r15, %r8
+ mov %r12, %r15
+ lea 8(vp), vp
+ neg R8(%rcx)
+ shr R8(%rcx), %r15
+ neg R8(%rcx)
+ mov %r8, 0(rp)
+ lea 8(rp), rp
+ sub $1, R32(%r11)
+ jnc L(oopette)
+
+L(4):
+ sub $4, %rax
+ jc L(end)
+
+ ALIGN(16)
+L(oop):
+ add R32(%rbx), R32(%rbx) C restore carry flag
+
+ mov 0(up), %r8
+ mov 8(up), %r9
+ mov 16(up), %r10
+ mov 24(up), %r11
+
+ lea 32(up), up
+
+ sbb 0(vp), %r8
+ mov %r8, %r12
+ sbb 8(vp), %r9
+ mov %r9, %r13
+ sbb 16(vp), %r10
+ mov %r10, %r14
+ sbb 24(vp), %r11
+
+ sbb R32(%rbx), R32(%rbx) C save carry flag
+
+ shl R8(%rcx), %r8
+ shl R8(%rcx), %r9
+ shl R8(%rcx), %r10
+ or %r15, %r8
+ mov %r11, %r15
+ shl R8(%rcx), %r11
+
+ lea 32(vp), vp
+
+ neg R8(%rcx)
+
+ shr R8(%rcx), %r12
+ shr R8(%rcx), %r13
+ shr R8(%rcx), %r14
+ shr R8(%rcx), %r15 C used next loop
+
+ or %r12, %r9
+ or %r13, %r10
+ or %r14, %r11
+
+ neg R8(%rcx)
+
+ mov %r8, 0(rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ mov %r11, 24(rp)
+
+ lea 32(rp), rp
+
+ sub $4, %rax
+ jnc L(oop)
+L(end):
+ neg R32(%rbx)
+ shl R8(%rcx), %rbx
+ adc %r15, %rbx
+ mov %rbx, %rax
+ pop %rbx
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/missing-call.m4 b/gmp-6.3.0/mpn/x86_64/missing-call.m4
new file mode 100644
index 0000000..c024f0e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/missing-call.m4
@@ -0,0 +1,53 @@
+dnl AMD64 MULX/ADX simulation support, function call version.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+define(`adox',`
+ push $1
+ push $2
+ call __gmp_adox
+ pop $2
+')
+
+define(`adcx',`
+ push $1
+ push $2
+ call __gmp_adcx
+ pop $2
+')
+
+define(`mulx',`
+ push $1
+ call __gmp_mulx
+ pop $2
+ pop $3
+')
diff --git a/gmp-6.3.0/mpn/x86_64/missing-inline.m4 b/gmp-6.3.0/mpn/x86_64/missing-inline.m4
new file mode 100644
index 0000000..bd1df13
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/missing-inline.m4
@@ -0,0 +1,100 @@
+dnl AMD64 MULX/ADX simulation support, inline version.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+define(`adox',`
+ push $2
+ push %rcx
+ push %rbx
+ push %rax
+ mov $1, %rcx
+ pushfq
+ pushfq
+C copy 0(%rsp):11 to 0(%rsp):0
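+C (bit 11 of the saved flags is OF and bit 0 is CF; the shr/bt/adc sequence
+C below moves OF into the CF position, so the plain adc after the popfq uses
+C OF as its carry-in, as adox would)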
+ mov (%rsp), %rbx
+ shr %rbx
+ bt $`'10, %rbx
+ adc %rbx, %rbx
+ mov %rbx, (%rsp)
+C put manipulated flags into eflags, execute a plain adc
+ popfq
+ adc %rcx, 32(%rsp)
+C copy CF to 0(%rsp):11
+ mov (%rsp), %rbx
+ sbb R32(%rax), R32(%rax)
+ and $`'0x800, R32(%rax)
+ and $`'0xfffffffffffff7ff, %rbx
+ or %rax, %rbx
+ mov %rbx, (%rsp)
+C put manipulated flags into eflags
+ popfq
+ pop %rax
+ pop %rbx
+ pop %rcx
+ pop $2
+')
+
+define(`adcx',`
+ push $2
+ push %rcx
+ push %rbx
+ push %rax
+ mov $1, %rcx
+ pushfq
+ adc %rcx, 32(%rsp)
+ mov (%rsp), %rbx
+ sbb R32(%rax), R32(%rax)
+ and $`'0xfffffffffffffffe, %rbx
+ sub %rax, %rbx
+ mov %rbx, (%rsp)
+ popfq
+ pop %rax
+ pop %rbx
+ pop %rcx
+ pop $2
+')
+
+define(`mulx',`
+ lea -16(%rsp), %rsp
+ push %rax
+ push %rdx
+ pushfq C preserve all flags
+ mov $1, %rax
+ mul %rdx
+ mov %rax, 24(%rsp)
+ mov %rdx, 32(%rsp)
+ popfq C restore eflags
+ pop %rdx
+ pop %rax
+ pop $2
+ pop $3
+')
diff --git a/gmp-6.3.0/mpn/x86_64/missing.asm b/gmp-6.3.0/mpn/x86_64/missing.asm
new file mode 100644
index 0000000..9b65c89
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/missing.asm
@@ -0,0 +1,130 @@
+dnl AMD64 MULX/ADX simulation support.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ASM_START()
+
+C Fake the MULX instruction
+C
+C Accept the single explicit parameter on the stack, return the two result
+C words on the stack. This calling convention means that we need to move the
+C return address up.
+C
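+C As with the real instruction, the implicit multiplicand is %rdx: the code
+C multiplies the pushed operand by %rdx and leaves the low and then the high
+C product word where the caller's two pops expect them (see missing-call.m4).
+C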
+PROLOGUE(__gmp_mulx)
+ lea -8(%rsp), %rsp
+ push %rax
+ push %rdx
+ pushfq C preserve all flags
+ mov 32(%rsp), %rax C move retaddr...
+ mov %rax, 24(%rsp) C ...up the stack
+ mov 40(%rsp), %rax C input parameter
+ mul %rdx
+ mov %rax, 32(%rsp)
+ mov %rdx, 40(%rsp)
+ popfq C restore eflags
+ pop %rdx
+ pop %rax
+ ret
+EPILOGUE()
+PROTECT(__gmp_mulx)
+
+
+C Fake the ADOX instruction
+C
+C Accept the two parameters on the stack, return the result word on the stack.
+C This calling convention means that we need to move the return address down.
+C
+PROLOGUE(__gmp_adox)
+ push %rcx
+ push %rbx
+ push %rax
+ mov 32(%rsp), %rcx C src2
+ mov 24(%rsp), %rax C move retaddr...
+ mov %rax, 32(%rsp) C ...down the stack
+ pushfq
+C copy 0(%rsp):11 to 0(%rsp):0
+ mov (%rsp), %rbx
+ shr %rbx
+ bt $10, %rbx
+ adc %rbx, %rbx
+ push %rbx
+C put manipulated flags into eflags, execute a plain adc
+ popfq
+ adc %rcx, 48(%rsp)
+C copy CF to 0(%rsp):11
+ pop %rbx
+ sbb R32(%rax), R32(%rax)
+ and $0x800, R32(%rax)
+ and $0xfffffffffffff7ff, %rbx
+ or %rax, %rbx
+ push %rbx
+C put manipulated flags into eflags
+ popfq
+ pop %rax
+ pop %rbx
+ pop %rcx
+ lea 8(%rsp), %rsp
+ ret
+EPILOGUE()
+PROTECT(__gmp_adox)
+
+
+C Fake the ADCX instruction
+C
+C Accept the two parameters on the stack, return the result word on the stack.
+C This calling convention means that we need to move the return address down.
+C
+PROLOGUE(__gmp_adcx)
+ push %rcx
+ push %rbx
+ push %rax
+ mov 32(%rsp), %rcx C src2
+ mov 24(%rsp), %rax C move retaddr...
+ mov %rax, 32(%rsp) C ...down the stack
+ pushfq
+ adc %rcx, 48(%rsp)
+ pop %rbx
+ sbb R32(%rax), R32(%rax)
+ and $`'0xfffffffffffffffe, %rbx
+ sub %rax, %rbx
+ push %rbx
+ popfq
+ pop %rax
+ pop %rbx
+ pop %rcx
+ lea 8(%rsp), %rsp
+ ret
+EPILOGUE()
+PROTECT(__gmp_adcx)
diff --git a/gmp-6.3.0/mpn/x86_64/mod_1_1.asm b/gmp-6.3.0/mpn/x86_64/mod_1_1.asm
new file mode 100644
index 0000000..255305f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/mod_1_1.asm
@@ -0,0 +1,238 @@
+dnl AMD64 mpn_mod_1_1p
+
+dnl Contributed to the GNU project by Torbjörn Granlund and Niels Möller.
+
+dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 6
+C AMD K10 6
+C Intel P4 26
+C Intel core2 12.5
+C Intel NHM 11.3
+C Intel SBR 8.4 (slowdown, old code took 8.0)
+C Intel atom 26
+C VIA nano 13
+
+define(`B2mb', `%r10')
+define(`B2modb', `%r11')
+define(`ap', `%rdi')
+define(`n', `%rsi')
+define(`pre', `%r8')
+define(`b', `%rbx')
+
+define(`r0', `%rbp') C r1 kept in %rax
+define(`r2', `%rcx') C kept negated. Also used as shift count
+define(`t0', `%r9')
+
+C mp_limb_t
+C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
+C %rdi %rsi %rdx %rcx
+C The pre array contains bi, cnt, B1modb, B2modb
+C Note: This implementation needs B1modb only when cnt > 0
+
+C The iteration is almost as follows,
+C
+C   r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2modb) B + u
+C
+C where r2 is a single bit represented as a mask. But to make sure that the
+C result fits in two limbs and a bit, carry from the addition
+C
+C   r_0 + r_2 B2modb
+C
+C is handled specially. On carry, we subtract b to cancel the carry,
+C and we use instead the value
+C
+C r_0 + B2mb (mod B)
+C
+C This addition can be issued early since it doesn't depend on r2, and it is
+C the source of the cmov in the loop.
+C
+C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
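+C
+C As a plain correctness reference (illustrative only, 64-bit limbs; the
+C helper name is made up), the value returned is simply the n-limb operand
+C reduced mod b; the precomputed table merely replaces the per-limb division
+C by multiplications:
+C
+C     uint64_t mod_1_ref (const uint64_t *ap, size_t n, uint64_t b)
+C     {
+C       unsigned __int128 r = 0;
+C       for (size_t i = n; i-- > 0; )     /* most significant limb first */
+C         r = ((r << 64) | ap[i]) % b;
+C       return (uint64_t) r;
+C     }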
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_1_1p)
+ FUNC_ENTRY(4)
+ push %rbp
+ push %rbx
+ mov %rdx, b
+ mov %rcx, pre
+
+ mov -8(ap, n, 8), %rax
+ cmp $3, n
+ jnc L(first)
+ mov -16(ap, n, 8), r0
+ jmp L(reduce_two)
+
+L(first):
+ C First iteration, no r2
+ mov 24(pre), B2modb
+ mul B2modb
+ mov -24(ap, n, 8), r0
+ add %rax, r0
+ mov -16(ap, n, 8), %rax
+ adc %rdx, %rax
+ sbb r2, r2
+ sub $4, n
+ jc L(reduce_three)
+
+ mov B2modb, B2mb
+ sub b, B2mb
+
+ ALIGN(16)
+L(top): and B2modb, r2
+ lea (B2mb, r0), t0
+ mul B2modb
+ add r0, r2
+ mov (ap, n, 8), r0
+ cmovc t0, r2
+ add %rax, r0
+ mov r2, %rax
+ adc %rdx, %rax
+ sbb r2, r2
+ sub $1, n
+ jnc L(top)
+
+L(reduce_three):
+ C Eliminate r2
+ and b, r2
+ sub r2, %rax
+
+L(reduce_two):
+ mov 8(pre), R32(%rcx)
+ test R32(%rcx), R32(%rcx)
+ jz L(normalized)
+
+ C Unnormalized, use B1modb to reduce to size < B (b+1)
+ mulq 16(pre)
+ xor t0, t0
+ add %rax, r0
+ adc %rdx, t0
+ mov t0, %rax
+
+ C Left-shift to normalize
+ifdef(`SHLD_SLOW',`
+ shl R8(%rcx), %rax
+ mov r0, t0
+ neg R32(%rcx)
+ shr R8(%rcx), t0
+ or t0, %rax
+ neg R32(%rcx)
+',`
+ shld R8(%rcx), r0, %rax
+')
+ shl R8(%rcx), r0
+ jmp L(udiv)
+
+L(normalized):
+ mov %rax, t0
+ sub b, t0
+ cmovnc t0, %rax
+
+L(udiv):
+ lea 1(%rax), t0
+ mulq (pre)
+ add r0, %rax
+ adc t0, %rdx
+ imul b, %rdx
+ sub %rdx, r0
+ cmp r0, %rax
+ lea (b, r0), %rax
+ cmovnc r0, %rax
+ cmp b, %rax
+ jnc L(fix)
+L(ok): shr R8(%rcx), %rax
+
+ pop %rbx
+ pop %rbp
+ FUNC_EXIT()
+ ret
+L(fix): sub b, %rax
+ jmp L(ok)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1_1p_cps)
+ FUNC_ENTRY(2)
+ push %rbp
+ bsr %rsi, %rcx
+ push %rbx
+ mov %rdi, %rbx
+ push %r12
+ xor $63, R32(%rcx)
+ mov %rsi, %r12
+ mov R32(%rcx), R32(%rbp)
+ sal R8(%rcx), %r12
+IFSTD(` mov %r12, %rdi ') C pass parameter
+IFDOS(` mov %r12, %rcx ') C pass parameter
+IFDOS(` sub $32, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_invert_limb)
+IFDOS(` add $32, %rsp ')
+ neg %r12
+ mov %r12, %r8
+ mov %rax, (%rbx) C store bi
+ mov %rbp, 8(%rbx) C store cnt
+ imul %rax, %r12
+ mov %r12, 24(%rbx) C store B2modb
+ mov R32(%rbp), R32(%rcx)
+ test R32(%rcx), R32(%rcx)
+ jz L(z)
+
+ mov $1, R32(%rdx)
+ifdef(`SHLD_SLOW',`
+ C Destroys %rax, unlike shld. Otherwise, we could do B1modb
+ C before B2modb, and get rid of the move %r12, %r8 above.
+
+ shl R8(%rcx), %rdx
+ neg R32(%rcx)
+ shr R8(%rcx), %rax
+ or %rax, %rdx
+ neg R32(%rcx)
+',`
+ shld R8(%rcx), %rax, %rdx
+')
+ imul %rdx, %r8
+ shr R8(%rcx), %r8
+ mov %r8, 16(%rbx) C store B1modb
+L(z):
+ pop %r12
+ pop %rbx
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/mod_1_2.asm b/gmp-6.3.0/mpn/x86_64/mod_1_2.asm
new file mode 100644
index 0000000..40fcaeb
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/mod_1_2.asm
@@ -0,0 +1,241 @@
+dnl AMD64 mpn_mod_1s_2p
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4
+C AMD K10 4
+C Intel P4 19
+C Intel core2 8
+C Intel NHM 6.5
+C Intel SBR 4.5
+C Intel atom 28
+C VIA nano 8
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_1s_2p)
+ FUNC_ENTRY(4)
+ push %r14
+ test $1, R8(%rsi)
+ mov %rdx, %r14
+ push %r13
+ mov %rcx, %r13
+ push %r12
+ push %rbp
+ push %rbx
+ mov 16(%rcx), %r10
+ mov 24(%rcx), %rbx
+ mov 32(%rcx), %rbp
+ je L(b0)
+ dec %rsi
+ je L(one)
+ mov -8(%rdi,%rsi,8), %rax
+ mul %r10
+ mov %rax, %r9
+ mov %rdx, %r8
+ mov (%rdi,%rsi,8), %rax
+ add -16(%rdi,%rsi,8), %r9
+ adc $0, %r8
+ mul %rbx
+ add %rax, %r9
+ adc %rdx, %r8
+ jmp L(11)
+
+L(b0): mov -8(%rdi,%rsi,8), %r8
+ mov -16(%rdi,%rsi,8), %r9
+
+L(11): sub $4, %rsi
+ jb L(ed2)
+ lea 40(%rdi,%rsi,8), %rdi
+ mov -40(%rdi), %r11
+ mov -32(%rdi), %rax
+ jmp L(m0)
+
+ ALIGN(16)
+L(top): mov -24(%rdi), %r9
+ add %rax, %r11
+ mov -16(%rdi), %rax
+ adc %rdx, %r12
+ mul %r10
+ add %rax, %r9
+ mov %r11, %rax
+ mov %rdx, %r8
+ adc $0, %r8
+ mul %rbx
+ add %rax, %r9
+ mov %r12, %rax
+ adc %rdx, %r8
+ mul %rbp
+ sub $2, %rsi
+ jb L(ed1)
+ mov -40(%rdi), %r11
+ add %rax, %r9
+ mov -32(%rdi), %rax
+ adc %rdx, %r8
+L(m0): mul %r10
+ add %rax, %r11
+ mov %r9, %rax
+ mov %rdx, %r12
+ adc $0, %r12
+ mul %rbx
+ add %rax, %r11
+ lea -32(%rdi), %rdi C ap -= 4
+ mov %r8, %rax
+ adc %rdx, %r12
+ mul %rbp
+ sub $2, %rsi
+ jae L(top)
+
+L(ed0): mov %r11, %r9
+ mov %r12, %r8
+L(ed1): add %rax, %r9
+ adc %rdx, %r8
+L(ed2): mov 8(%r13), R32(%rdi) C cnt
+ mov %r8, %rax
+ mov %r9, %r8
+ mul %r10
+ add %rax, %r8
+ adc $0, %rdx
+L(1): xor R32(%rcx), R32(%rcx)
+ mov %r8, %r9
+ sub R32(%rdi), R32(%rcx)
+ shr R8(%rcx), %r9
+ mov R32(%rdi), R32(%rcx)
+ sal R8(%rcx), %rdx
+ or %rdx, %r9
+ sal R8(%rcx), %r8
+ mov %r9, %rax
+ mulq (%r13)
+ mov %rax, %rsi
+ inc %r9
+ add %r8, %rsi
+ adc %r9, %rdx
+ imul %r14, %rdx
+ sub %rdx, %r8
+ lea (%r8,%r14), %rax
+ cmp %r8, %rsi
+ cmovc %rax, %r8
+ mov %r8, %rax
+ sub %r14, %rax
+ cmovc %r8, %rax
+ mov R32(%rdi), R32(%rcx)
+ shr R8(%rcx), %rax
+ pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ pop %r14
+ FUNC_EXIT()
+ ret
+L(one):
+ mov (%rdi), %r8
+ mov 8(%rcx), R32(%rdi)
+ xor %rdx, %rdx
+ jmp L(1)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1s_2p_cps)
+ FUNC_ENTRY(2)
+ push %rbp
+ bsr %rsi, %rcx
+ push %rbx
+ mov %rdi, %rbx
+ push %r12
+ xor $63, R32(%rcx)
+ mov %rsi, %r12
+ mov R32(%rcx), R32(%rbp) C preserve cnt over call
+ sal R8(%rcx), %r12 C b << cnt
+IFSTD(` mov %r12, %rdi ') C pass parameter
+IFDOS(` mov %r12, %rcx ') C pass parameter
+IFDOS(` sub $32, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_invert_limb)
+IFDOS(` add $32, %rsp ')
+ mov %r12, %r8
+ mov %rax, %r11
+ mov %rax, (%rbx) C store bi
+ mov %rbp, 8(%rbx) C store cnt
+ neg %r8
+ mov R32(%rbp), R32(%rcx)
+ mov $1, R32(%rsi)
+ifdef(`SHLD_SLOW',`
+ shl R8(%rcx), %rsi
+ neg R32(%rcx)
+ mov %rax, %rbp
+ shr R8(%rcx), %rax
+ or %rax, %rsi
+ mov %rbp, %rax
+ neg R32(%rcx)
+',`
+ shld R8(%rcx), %rax, %rsi C FIXME: Slow on Atom and Nano
+')
+ imul %r8, %rsi
+ mul %rsi
+
+ add %rsi, %rdx
+ shr R8(%rcx), %rsi
+ mov %rsi, 16(%rbx) C store B1modb
+
+ not %rdx
+ imul %r12, %rdx
+ lea (%rdx,%r12), %rsi
+ cmp %rdx, %rax
+ cmovnc %rdx, %rsi
+ mov %r11, %rax
+ mul %rsi
+
+ add %rsi, %rdx
+ shr R8(%rcx), %rsi
+ mov %rsi, 24(%rbx) C store B2modb
+
+ not %rdx
+ imul %r12, %rdx
+ add %rdx, %r12
+ cmp %rdx, %rax
+ cmovnc %rdx, %r12
+
+ shr R8(%rcx), %r12
+ mov %r12, 32(%rbx) C store B3modb
+
+ pop %r12
+ pop %rbx
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
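The table written above holds, besides the inverse bi and the shift count cnt, values B1modb..B3modb that are congruent to B^1..B^3 modulo b (they may be only partially reduced, and the assembly computes them division-free via mpn_invert_limb). As a hedged illustration of the table contents only, a naive division-based version could look like this (names hypothetical):

typedef unsigned long long limb;

/* t[k] ~ B^k mod b for k = 1..kmax, with B = 2^64 (illustration only) */
void ref_powers_of_B_mod_b (limb *t, limb b, int kmax)
{
  unsigned __int128 B_mod_b = ((unsigned __int128) 1 << 64) % b;
  unsigned __int128 p = 1;
  for (int k = 1; k <= kmax; k++)
    {
      p = (p * B_mod_b) % b;    /* p = B^k mod b */
      t[k] = (limb) p;
    }
}

These congruences are what let mpn_mod_1s_2p above fold the input two limbs at a time with plain multiplies instead of a per-limb division.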
diff --git a/gmp-6.3.0/mpn/x86_64/mod_1_4.asm b/gmp-6.3.0/mpn/x86_64/mod_1_4.asm
new file mode 100644
index 0000000..6cf304c
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/mod_1_4.asm
@@ -0,0 +1,272 @@
+dnl AMD64 mpn_mod_1s_4p
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 3
+C AMD K10 3
+C Intel P4 15.5
+C Intel core2 5
+C Intel corei 4
+C Intel atom 23
+C VIA nano 4.75
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p)
+ FUNC_ENTRY(4)
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov %rdx, %r15
+ mov %rcx, %r14
+ mov 16(%rcx), %r11 C B1modb
+ mov 24(%rcx), %rbx C B2modb
+ mov 32(%rcx), %rbp C B3modb
+ mov 40(%rcx), %r13 C B4modb
+ mov 48(%rcx), %r12 C B5modb
+ xor R32(%r8), R32(%r8)
+ mov R32(%rsi), R32(%rdx)
+ and $3, R32(%rdx)
+ je L(b0)
+ cmp $2, R32(%rdx)
+ jc L(b1)
+ je L(b2)
+
+L(b3): lea -24(%rdi,%rsi,8), %rdi
+ mov 8(%rdi), %rax
+ mul %r11
+ mov (%rdi), %r9
+ add %rax, %r9
+ adc %rdx, %r8
+ mov 16(%rdi), %rax
+ mul %rbx
+ jmp L(m0)
+
+ ALIGN(8)
+L(b0): lea -32(%rdi,%rsi,8), %rdi
+ mov 8(%rdi), %rax
+ mul %r11
+ mov (%rdi), %r9
+ add %rax, %r9
+ adc %rdx, %r8
+ mov 16(%rdi), %rax
+ mul %rbx
+ add %rax, %r9
+ adc %rdx, %r8
+ mov 24(%rdi), %rax
+ mul %rbp
+ jmp L(m0)
+
+ ALIGN(8)
+L(b1): lea -8(%rdi,%rsi,8), %rdi
+ mov (%rdi), %r9
+ jmp L(m1)
+
+ ALIGN(8)
+L(b2): lea -16(%rdi,%rsi,8), %rdi
+ mov 8(%rdi), %r8
+ mov (%rdi), %r9
+ jmp L(m1)
+
+ ALIGN(16)
+L(top): mov -24(%rdi), %rax
+ mov -32(%rdi), %r10
+ mul %r11 C up[1] * B1modb
+ add %rax, %r10
+ mov -16(%rdi), %rax
+ mov $0, R32(%rcx)
+ adc %rdx, %rcx
+ mul %rbx C up[2] * B2modb
+ add %rax, %r10
+ mov -8(%rdi), %rax
+ adc %rdx, %rcx
+ sub $32, %rdi
+ mul %rbp C up[3] * B3modb
+ add %rax, %r10
+ mov %r13, %rax
+ adc %rdx, %rcx
+ mul %r9 C rl * B4modb
+ add %rax, %r10
+ mov %r12, %rax
+ adc %rdx, %rcx
+ mul %r8 C rh * B5modb
+ mov %r10, %r9
+ mov %rcx, %r8
+L(m0): add %rax, %r9
+ adc %rdx, %r8
+L(m1): sub $4, %rsi
+ ja L(top)
+
+L(end): mov 8(%r14), R32(%rsi)
+ mov %r8, %rax
+ mul %r11
+ mov %rax, %r8
+ add %r9, %r8
+ adc $0, %rdx
+ xor R32(%rcx), R32(%rcx)
+ sub R32(%rsi), R32(%rcx)
+ mov %r8, %rdi
+ shr R8(%rcx), %rdi
+ mov R32(%rsi), R32(%rcx)
+ sal R8(%rcx), %rdx
+ or %rdx, %rdi
+ mov %rdi, %rax
+ mulq (%r14)
+ mov %r15, %rbx
+ mov %rax, %r9
+ sal R8(%rcx), %r8
+ inc %rdi
+ add %r8, %r9
+ adc %rdi, %rdx
+ imul %rbx, %rdx
+ sub %rdx, %r8
+ lea (%r8,%rbx), %rax
+ cmp %r8, %r9
+ cmovc %rax, %r8
+ mov %r8, %rax
+ sub %rbx, %rax
+ cmovc %r8, %rax
+ shr R8(%rcx), %rax
+ pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p_cps)
+ FUNC_ENTRY(2)
+ push %rbp
+ bsr %rsi, %rcx
+ push %rbx
+ mov %rdi, %rbx
+ push %r12
+ xor $63, R32(%rcx)
+ mov %rsi, %r12
+ mov R32(%rcx), R32(%rbp) C preserve cnt over call
+ sal R8(%rcx), %r12 C b << cnt
+IFSTD(` mov %r12, %rdi ') C pass parameter
+IFDOS(` mov %r12, %rcx ') C pass parameter
+IFDOS(` sub $32, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_invert_limb)
+IFDOS(` add $32, %rsp ')
+ mov %r12, %r8
+ mov %rax, %r11
+ mov %rax, (%rbx) C store bi
+ mov %rbp, 8(%rbx) C store cnt
+ neg %r8
+ mov R32(%rbp), R32(%rcx)
+ mov $1, R32(%rsi)
+ifdef(`SHLD_SLOW',`
+ shl R8(%rcx), %rsi
+ neg R32(%rcx)
+ mov %rax, %rbp
+ shr R8(%rcx), %rax
+ or %rax, %rsi
+ mov %rbp, %rax
+ neg R32(%rcx)
+',`
+ shld R8(%rcx), %rax, %rsi C FIXME: Slow on Atom and Nano
+')
+ imul %r8, %rsi
+ mul %rsi
+
+ add %rsi, %rdx
+ shr R8(%rcx), %rsi
+ mov %rsi, 16(%rbx) C store B1modb
+
+ not %rdx
+ imul %r12, %rdx
+ lea (%rdx,%r12), %rsi
+ cmp %rdx, %rax
+ cmovnc %rdx, %rsi
+ mov %r11, %rax
+ mul %rsi
+
+ add %rsi, %rdx
+ shr R8(%rcx), %rsi
+ mov %rsi, 24(%rbx) C store B2modb
+
+ not %rdx
+ imul %r12, %rdx
+ lea (%rdx,%r12), %rsi
+ cmp %rdx, %rax
+ cmovnc %rdx, %rsi
+ mov %r11, %rax
+ mul %rsi
+
+ add %rsi, %rdx
+ shr R8(%rcx), %rsi
+ mov %rsi, 32(%rbx) C store B3modb
+
+ not %rdx
+ imul %r12, %rdx
+ lea (%rdx,%r12), %rsi
+ cmp %rdx, %rax
+ cmovnc %rdx, %rsi
+ mov %r11, %rax
+ mul %rsi
+
+ add %rsi, %rdx
+ shr R8(%rcx), %rsi
+ mov %rsi, 40(%rbx) C store B4modb
+
+ not %rdx
+ imul %r12, %rdx
+ add %rdx, %r12
+ cmp %rdx, %rax
+ cmovnc %rdx, %r12
+
+ shr R8(%rcx), %r12
+ mov %r12, 48(%rbx) C store B5modb
+
+ pop %r12
+ pop %rbx
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm
new file mode 100644
index 0000000..75421a6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm
@@ -0,0 +1,215 @@
+dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
+
+dnl Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 0.67 0.583 is possible with zero-reg instead of $0, 4-way
+C AMD K10 0.67 this seems hard to beat
+C AMD bd1 1
+C AMD bd2 1
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 0.62
+C AMD bobcat 1.07
+C AMD jaguar 1
+C Intel P4 7.35 terrible, use old code
+C Intel core2 1.25 1+epsilon with huge unrolling
+C Intel NHM 1.15 this seems hard to beat
+C Intel SBR 0.93
+C Intel IBR 0.93
+C Intel HWL 0.82
+C Intel BWL 0.64
+C Intel SKY 0.60
+C Intel atom 2.5
+C Intel SLM 1.59
+C VIA nano 1.25 this seems hard to beat
+
+C INPUT PARAMETERS
+define(`ap', %rdi)
+define(`n', %rsi)
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
+
+C TODO
+C * Review feed-in and wind-down code.
+
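Since 2^48 == 1 (mod 2^48-1) and hence 2^64 == 2^16, limb i of the input contributes up[i] * 2^(16*(i mod 3)), which is what the three staggered accumulators and the wind-down folding below implement. A naive C reference (illustration only; the assembly returns a value congruent to this modulo 2^48-1, computed without any division):

typedef unsigned long long limb;

limb ref_mod_34lsub1 (const limb *up, long n)
{
  const unsigned __int128 m = ((unsigned __int128) 1 << 48) - 1;
  unsigned __int128 r = 0;
  for (long i = 0; i < n; i++)
    /* 2^(64*i) == 2^(16*(i mod 3))  (mod 2^48-1) */
    r += ((unsigned __int128) up[i] << (16 * (i % 3))) % m;
  return (limb) (r % m);
}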
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)
+ FUNC_ENTRY(2)
+
+ mov $0x0000FFFFFFFFFFFF, %r11
+
+ mov (ap), %rax
+
+ cmp $2, %rsi
+ ja L(gt2)
+
+ jb L(one)
+
+ mov 8(ap), %rsi
+ mov %rax, %rdx
+ shr $48, %rax C src[0] low
+
+ and %r11, %rdx C src[0] high
+ add %rdx, %rax
+ mov R32(%rsi), R32(%rdx)
+
+ shr $32, %rsi C src[1] high
+ add %rsi, %rax
+
+ shl $16, %rdx C src[1] low
+ add %rdx, %rax
+L(one): FUNC_EXIT()
+ ret
+
+
+C Don't change this, the wind-down code is not able to handle greater values
+define(UNROLL,3)
+
+L(gt2): mov 8(ap), %rcx
+ mov 16(ap), %rdx
+ xor %r9, %r9
+ add $24, ap
+ sub $eval(UNROLL*3+3), %rsi
+ jc L(end)
+ ALIGN(16)
+L(top):
+ add (ap), %rax
+ adc 8(ap), %rcx
+ adc 16(ap), %rdx
+ adc $0, %r9
+forloop(i,1,UNROLL-1,`dnl
+ add eval(i*24)(ap), %rax
+ adc eval(i*24+8)(ap), %rcx
+ adc eval(i*24+16)(ap), %rdx
+ adc $0, %r9
+')dnl
+ add $eval(UNROLL*24), ap
+ sub $eval(UNROLL*3), %rsi
+ jnc L(top)
+
+L(end):
+ lea L(tab)(%rip), %r8
+ifdef(`PIC',
+` movslq 36(%r8,%rsi,4), %r10
+ add %r10, %r8
+ jmp *%r8
+',`
+ jmp *72(%r8,%rsi,8)
+')
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(0), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
+ JMPENT( L(4), L(tab))
+ JMPENT( L(5), L(tab))
+ JMPENT( L(6), L(tab))
+ JMPENT( L(7), L(tab))
+ JMPENT( L(8), L(tab))
+ TEXT
+
+L(6): add (ap), %rax
+ adc 8(ap), %rcx
+ adc 16(ap), %rdx
+ adc $0, %r9
+ add $24, ap
+L(3): add (ap), %rax
+ adc 8(ap), %rcx
+ adc 16(ap), %rdx
+ jmp L(cj1)
+
+L(7): add (ap), %rax
+ adc 8(ap), %rcx
+ adc 16(ap), %rdx
+ adc $0, %r9
+ add $24, ap
+L(4): add (ap), %rax
+ adc 8(ap), %rcx
+ adc 16(ap), %rdx
+ adc $0, %r9
+ add $24, ap
+L(1): add (ap), %rax
+ adc $0, %rcx
+ jmp L(cj2)
+
+L(8): add (ap), %rax
+ adc 8(ap), %rcx
+ adc 16(ap), %rdx
+ adc $0, %r9
+ add $24, ap
+L(5): add (ap), %rax
+ adc 8(ap), %rcx
+ adc 16(ap), %rdx
+ adc $0, %r9
+ add $24, ap
+L(2): add (ap), %rax
+ adc 8(ap), %rcx
+
+L(cj2): adc $0, %rdx
+L(cj1): adc $0, %r9
+L(0): add %r9, %rax
+ adc $0, %rcx
+ adc $0, %rdx
+ adc $0, %rax
+
+ mov %rax, %rdi C 0mod3
+ shr $48, %rax C 0mod3 high
+
+ and %r11, %rdi C 0mod3 low
+ mov R32(%rcx), R32(%r10) C 1mod3
+
+ shr $32, %rcx C 1mod3 high
+
+ add %rdi, %rax C apply 0mod3 low
+ movzwl %dx, R32(%rdi) C 2mod3
+ shl $16, %r10 C 1mod3 low
+
+ add %rcx, %rax C apply 1mod3 high
+ shr $16, %rdx C 2mod3 high
+
+ add %r10, %rax C apply 1mod3 low
+ shl $32, %rdi C 2mod3 low
+
+ add %rdx, %rax C apply 2mod3 high
+ add %rdi, %rax C apply 2mod3 low
+
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/mode1o.asm b/gmp-6.3.0/mpn/x86_64/mode1o.asm
new file mode 100644
index 0000000..2cd2b08
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/mode1o.asm
@@ -0,0 +1,171 @@
+dnl AMD64 mpn_modexact_1_odd -- Hensel norm remainder.
+
+dnl Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 10
+C AMD K10 10
+C Intel P4 33
+C Intel core2 13
+C Intel corei 14.5
+C Intel atom 35
+C VIA nano ?
+
+
+C The dependent chain in the main loop is
+C
+C cycles
+C sub %rdx, %rax 1
+C imul %r9, %rax 4
+C mul %r8 5
+C ----
+C total 10
+C
+C The mov load from src seems to need to be scheduled back before the jz to
+C achieve this speed, out-of-order execution apparently can't completely hide
+C the latency otherwise.
+C
+C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it
+C for the first iteration (where there's no cbit).
+C
+C The code alignment used (32-byte) for the loop also seems necessary. Without
+C that the non-PIC case has adc crossing the 0x60 offset, apparently making it
+C run at 11 cycles instead of 10.
+
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_modexact_1_odd)
+ FUNC_ENTRY(3)
+ mov $0, R32(%rcx)
+IFDOS(` jmp L(ent) ')
+
+PROLOGUE(mpn_modexact_1c_odd)
+ FUNC_ENTRY(4)
+L(ent):
+ C rdi src
+ C rsi size
+ C rdx divisor
+ C rcx carry
+
+ mov %rdx, %r8 C d
+ shr R32(%rdx) C d/2
+
+ LEA( binvert_limb_table, %r9)
+
+ and $127, R32(%rdx)
+ mov %rcx, %r10 C initial carry
+
+ movzbl (%r9,%rdx), R32(%rdx) C inv 8 bits
+
+ mov (%rdi), %rax C src[0]
+ lea (%rdi,%rsi,8), %r11 C src end
+ mov %r8, %rdi C d, made available to imull
+
+ lea (%rdx,%rdx), R32(%rcx) C 2*inv
+ imul R32(%rdx), R32(%rdx) C inv*inv
+
+ neg %rsi C -size
+
+ imul R32(%rdi), R32(%rdx) C inv*inv*d
+
+ sub R32(%rdx), R32(%rcx) C inv = 2*inv - inv*inv*d, 16 bits
+
+ lea (%rcx,%rcx), R32(%rdx) C 2*inv
+ imul R32(%rcx), R32(%rcx) C inv*inv
+
+ imul R32(%rdi), R32(%rcx) C inv*inv*d
+
+ sub R32(%rcx), R32(%rdx) C inv = 2*inv - inv*inv*d, 32 bits
+ xor R32(%rcx), R32(%rcx) C initial cbit
+
+ lea (%rdx,%rdx), %r9 C 2*inv
+ imul %rdx, %rdx C inv*inv
+
+ imul %r8, %rdx C inv*inv*d
+
+ sub %rdx, %r9 C inv = 2*inv - inv*inv*d, 64 bits
+ mov %r10, %rdx C initial climb
+
+ ASSERT(e,` C d*inv == 1 mod 2^64
+ mov %r8, %r10
+ imul %r9, %r10
+ cmp $1, %r10')
+
+ inc %rsi
+ jz L(one)
+
+
+ ALIGN(16)
+L(top):
+ C rax l = src[i]-cbit
+ C rcx new cbit, 0 or 1
+ C rdx climb, high of last product
+ C rsi counter, limbs, negative
+ C rdi
+ C r8 divisor
+ C r9 inverse
+ C r11 src end ptr
+
+ sub %rdx, %rax C l = src[i]-cbit - climb
+
+ adc $0, %rcx C more cbit
+ imul %r9, %rax C q = l * inverse
+
+ mul %r8 C climb = high (q * d)
+
+ mov (%r11,%rsi,8), %rax C src[i+1]
+ sub %rcx, %rax C next l = src[i+1] - cbit
+ setc R8(%rcx) C new cbit
+
+ inc %rsi
+ jnz L(top)
+
+
+L(one):
+ sub %rdx, %rax C l = src[i]-cbit - climb
+
+ adc $0, %rcx C more cbit
+ imul %r9, %rax C q = l * inverse
+
+ mul %r8 C climb = high (q * d)
+
+ lea (%rcx,%rdx), %rax C climb+cbit
+ FUNC_EXIT()
+ ret
+
+EPILOGUE(mpn_modexact_1c_odd)
+EPILOGUE(mpn_modexact_1_odd)
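The repeated "inv = 2*inv - inv*inv*d" steps above are a Newton (Hensel) iteration for the inverse of d modulo 2^64: each step doubles the number of correct low bits. A self-contained C sketch of the same idea; the 5-bit seed (3*d)^2 is a standard trick, whereas the assembly seeds 8 bits from binvert_limb_table and therefore needs one iteration fewer:

typedef unsigned long long limb;

limb binvert_ref (limb d)               /* d must be odd */
{
  limb inv = (3 * d) ^ 2;               /* correct to 5 low bits */
  inv = 2 * inv - inv * inv * d;        /* 10 bits */
  inv = 2 * inv - inv * inv * d;        /* 20 bits */
  inv = 2 * inv - inv * inv * d;        /* 40 bits */
  inv = 2 * inv - inv * inv * d;        /* >= 64 bits */
  return inv;                           /* inv * d == 1 (mod 2^64) */
}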
diff --git a/gmp-6.3.0/mpn/x86_64/mul_1.asm b/gmp-6.3.0/mpn/x86_64/mul_1.asm
new file mode 100644
index 0000000..e1ba89b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/mul_1.asm
@@ -0,0 +1,192 @@
+dnl AMD64 mpn_mul_1.
+
+dnl Copyright 2003-2005, 2007, 2008, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.54
+C AMD K10 2.54
+C AMD bull 4.98
+C AMD pile 4.80
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.37
+C AMD jaguar 6.16
+C Intel P4 12.6
+C Intel core2 4.05
+C Intel NHM 4.0
+C Intel SBR 2.91
+C Intel IBR 2.73
+C Intel HWL 2.44
+C Intel BWL 2.39
+C Intel SKL 2.44
+C Intel atom 19.8
+C Intel SLM 9.0
+C VIA nano 4.25
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * The loop is great, but the prologue and epilogue code was quickly written.
+C Tune it!
+
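mpn_mul_1 computes rp[] = up[] * vl and returns the carry-out limb; mpn_mul_1c is the same but takes an initial carry. A naive C reference for the mul_1c contract (illustration only, using unsigned __int128 rather than the register scheme below):

typedef unsigned long long limb;

limb ref_mul_1c (limb *rp, const limb *up, long n, limb vl, limb cin)
{
  unsigned __int128 cy = cin;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * vl + cy;
      rp[i] = (limb) t;         /* low limb of this column */
      cy = t >> 64;             /* carry into the next column */
    }
  return (limb) cy;             /* carry-out limb */
}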
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vl', `%rcx') C r9
+
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`vl', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+ push %rbx
+IFSTD(` mov %r8, %r10')
+IFDOS(` mov 64(%rsp), %r10') C 40 + 3*8 (3 push insns)
+ jmp L(common)
+EPILOGUE()
+
+PROLOGUE(mpn_mul_1)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ push %rbx
+ xor %r10, %r10
+L(common):
+ mov (up), %rax C read first u limb early
+IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %rbx ')
+ mul vl
+IFSTD(` mov %rbx, n ')
+
+ add %r10, %rax
+ adc $0, %rdx
+
+ and $3, R32(%rbx)
+ jz L(b0)
+ cmp $2, R32(%rbx)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): dec n
+ jne L(gt1)
+ mov %rax, (rp)
+ jmp L(ret)
+L(gt1): lea 8(up,n,8), up
+ lea -8(rp,n,8), rp
+ neg n
+ xor %r10, %r10
+ xor R32(%rbx), R32(%rbx)
+ mov %rax, %r9
+ mov (up,n,8), %rax
+ mov %rdx, %r8
+ jmp L(L1)
+
+L(b0): lea (up,n,8), up
+ lea -16(rp,n,8), rp
+ neg n
+ xor %r10, %r10
+ mov %rax, %r8
+ mov %rdx, %rbx
+ jmp L(L0)
+
+L(b3): lea -8(up,n,8), up
+ lea -24(rp,n,8), rp
+ neg n
+ mov %rax, %rbx
+ mov %rdx, %r10
+ jmp L(L3)
+
+L(b2): lea -16(up,n,8), up
+ lea -32(rp,n,8), rp
+ neg n
+ xor %r8, %r8
+ xor R32(%rbx), R32(%rbx)
+ mov %rax, %r10
+ mov 24(up,n,8), %rax
+ mov %rdx, %r9
+ jmp L(L2)
+
+ ALIGN(16)
+L(top): mov %r10, (rp,n,8)
+ add %rax, %r9
+ mov (up,n,8), %rax
+ adc %rdx, %r8
+ mov $0, R32(%r10)
+L(L1): mul vl
+ mov %r9, 8(rp,n,8)
+ add %rax, %r8
+ adc %rdx, %rbx
+L(L0): mov 8(up,n,8), %rax
+ mul vl
+ mov %r8, 16(rp,n,8)
+ add %rax, %rbx
+ adc %rdx, %r10
+L(L3): mov 16(up,n,8), %rax
+ mul vl
+ mov %rbx, 24(rp,n,8)
+ mov $0, R32(%r8) C zero
+ mov %r8, %rbx C zero
+ add %rax, %r10
+ mov 24(up,n,8), %rax
+ mov %r8, %r9 C zero
+ adc %rdx, %r9
+L(L2): mul vl
+ add $4, n
+ js L(top)
+
+ mov %r10, (rp,n,8)
+ add %rax, %r9
+ adc %r8, %rdx
+ mov %r9, 8(rp,n,8)
+ add %r8, %rdx
+L(ret): mov %rdx, %rax
+
+ pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/mul_2.asm b/gmp-6.3.0/mpn/x86_64/mul_2.asm
new file mode 100644
index 0000000..d64313b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/mul_2.asm
@@ -0,0 +1,204 @@
+dnl AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
+dnl store the result in a third limb vector.
+
+dnl Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.53
+C AMD K10 4.53
+C AMD bull 9.76 10.37
+C AMD pile 9.22
+C AMD steam
+C AMD excavator
+C AMD bobcat 11.3
+C AMD jaguar 11.9
+C Intel P4 25.0
+C Intel core2 8.05
+C Intel NHM 7.72
+C Intel SBR 6.33
+C Intel IBR 6.15
+C Intel HWL 6.00
+C Intel BWL 4.44
+C Intel SKL 4.54
+C Intel atom 39.0
+C Intel SLM 24.0
+C VIA nano
+
+C This code is the result of running a code generation and optimization tool
+C suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Work on feed-in and wind-down code.
+C * Convert "mov $0" to "xor".
+C * Adjust initial lea to save some bytes.
+C * Perhaps adjust n from n_param&3 value?
+C * Replace with 2.25 c/l sequence.
+
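mpn_mul_2 multiplies {up,n} by the two limbs {vp,2}; reading the wind-down code, it stores the n+1 low limbs of the product at rp and returns the most significant limb. A C reference that keeps the pending high part in two limbs, loosely mirroring the w0..w3 registers (illustration only):

typedef unsigned long long limb;
typedef unsigned __int128 dlimb;

limb ref_mul_2 (limb *rp, const limb *up, long n, const limb *vp)
{
  limb lo = 0, hi = 0;          /* pending limbs at positions i and i+1 */
  for (long i = 0; i < n; i++)
    {
      dlimb t = (dlimb) up[i] * vp[0] + lo;
      rp[i] = (limb) t;         /* position i is now final */
      dlimb u = (dlimb) up[i] * vp[1] + hi + (limb) (t >> 64);
      lo = (limb) u;
      hi = (limb) (u >> 64);
    }
  rp[n] = lo;                   /* limb n of the product */
  return hi;                    /* limb n+1, the most significant limb */
}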
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param',`%rdx')
+define(`vp', `%rcx')
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov (up), %rax
+
+ mov n_param, n
+ neg n
+ lea -8(up,n_param,8), up
+ lea -8(rp,n_param,8), rp
+
+ and $3, R32(n_param)
+ jz L(m2p0)
+ cmp $2, R32(n_param)
+ jc L(m2p1)
+ jz L(m2p2)
+L(m2p3):
+ mul v0
+ xor R32(w3), R32(w3)
+ mov %rax, w1
+ mov %rdx, w2
+ mov 8(up,n,8), %rax
+ add $-1, n
+ mul v1
+ add %rax, w2
+ jmp L(m23)
+L(m2p0):
+ mul v0
+ xor R32(w2), R32(w2)
+ mov %rax, w0
+ mov %rdx, w1
+ jmp L(m20)
+L(m2p1):
+ mul v0
+ xor R32(w3), R32(w3)
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ add $1, n
+ jmp L(m2top)
+L(m2p2):
+ mul v0
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ mov %rax, w2
+ mov %rdx, w3
+ mov 8(up,n,8), %rax
+ add $-2, n
+ jmp L(m22)
+
+
+ ALIGN(32)
+L(m2top):
+ add %rax, w3
+ adc %rdx, w0
+ mov 0(up,n,8), %rax
+ adc $0, R32(w1)
+ mov $0, R32(w2)
+ mul v1
+ add %rax, w0
+ mov w3, 0(rp,n,8)
+ adc %rdx, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ add %rax, w0
+ adc %rdx, w1
+ adc $0, R32(w2)
+L(m20): mov 8(up,n,8), %rax
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 16(up,n,8), %rax
+ mov $0, R32(w3)
+ mul v0
+ add %rax, w1
+ mov 16(up,n,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mul v1
+ add %rax, w2
+ mov w0, 8(rp,n,8)
+L(m23): adc %rdx, w3
+ mov 24(up,n,8), %rax
+ mul v0
+ mov $0, R32(w0)
+ add %rax, w2
+ adc %rdx, w3
+ mov w1, 16(rp,n,8)
+ mov 24(up,n,8), %rax
+ mov $0, R32(w1)
+ adc $0, R32(w0)
+L(m22): mul v1
+ add %rax, w3
+ mov w2, 24(rp,n,8)
+ adc %rdx, w0
+ mov 32(up,n,8), %rax
+ mul v0
+ add $4, n
+ js L(m2top)
+
+
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mov (up), %rax
+ mul v1
+ mov w3, (rp)
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, 8(rp)
+ mov w1, %rax
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm
new file mode 100644
index 0000000..9ceb611
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm
@@ -0,0 +1,157 @@
+dnl AMD64 mpn_addmul_1 for CPUs with mulx and adx.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bd1 -
+C AMD bd2 -
+C AMD bd3 -
+C AMD bd4 -
+C AMD zen ?
+C AMD bt1 -
+C AMD bt2 -
+C Intel P4 -
+C Intel PNR -
+C Intel NHM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL -
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom -
+C Intel SLM -
+C VIA nano -
+
+define(`rp', `%rdi') dnl rcx
+define(`up', `%rsi') dnl rdx
+define(`n_param', `%rdx') dnl r8
+define(`v0_param',`%rcx') dnl r9
+
+define(`n', `%rcx') dnl
+define(`v0', `%rdx') dnl
+
+C Testing mechanism for running this on older AMD64 processors
+ifelse(FAKE_MULXADX,1,`
+ include(CONFIG_TOP_SRCDIR`/mpn/x86_64/missing-call.m4')
+',`
+ define(`adox', ``adox' $1, $2')
+ define(`adcx', ``adcx' $1, $2')
+ define(`mulx', ``mulx' $1, $2, $3')
+')
+
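mpn_addmul_1 adds up[] * v0 into rp[] and returns the carry-out limb; the point of this variant is that mulx/adcx/adox allow two independent carry chains (CF and OF), so the product additions and the rp[] additions can be interleaved without flag conflicts. A plain C reference for the contract only (illustration, not the carry-chain scheme):

typedef unsigned long long limb;

limb ref_addmul_1 (limb *rp, const limb *up, long n, limb v0)
{
  unsigned __int128 cy = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (limb) t;
      cy = t >> 64;
    }
  return (limb) cy;
}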
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_addmul_1)
+ mov (up), %r8
+
+ push %rbx
+ push %r12
+ push %r13
+
+ lea (up,n_param,8), up
+ lea -16(rp,n_param,8), rp
+ mov R32(n_param), R32(%rax)
+ xchg v0_param, v0 C FIXME: is this insn fast?
+
+ neg n
+
+ and $3, R8(%rax)
+ jz L(b0)
+ cmp $2, R8(%rax)
+ jl L(b1)
+ jz L(b2)
+
+L(b3): mulx( (up,n,8), %r11, %r10)
+ mulx( 8(up,n,8), %r13, %r12)
+ mulx( 16(up,n,8), %rbx, %rax)
+ dec n
+ jmp L(lo3)
+
+L(b0): mulx( (up,n,8), %r9, %r8)
+ mulx( 8(up,n,8), %r11, %r10)
+ mulx( 16(up,n,8), %r13, %r12)
+ jmp L(lo0)
+
+L(b2): mulx( (up,n,8), %r13, %r12)
+ mulx( 8(up,n,8), %rbx, %rax)
+ lea 2(n), n
+ jrcxz L(wd2)
+L(gt2): mulx( (up,n,8), %r9, %r8)
+ jmp L(lo2)
+
+L(b1): and R8(%rax), R8(%rax)
+ mulx( (up,n,8), %rbx, %rax)
+ lea 1(n), n
+ jrcxz L(wd1)
+ mulx( (up,n,8), %r9, %r8)
+ mulx( 8(up,n,8), %r11, %r10)
+ jmp L(lo1)
+
+L(end): adcx( %r10, %r13)
+ mov %r11, -8(rp)
+L(wd2): adox( (rp), %r13)
+ adcx( %r12, %rbx)
+ mov %r13, (rp)
+L(wd1): adox( 8(rp), %rbx)
+ adcx( %rcx, %rax)
+ adox( %rcx, %rax)
+ mov %rbx, 8(rp)
+ pop %r13
+ pop %r12
+ pop %rbx
+ ret
+
+L(top): jrcxz L(end)
+ mulx( (up,n,8), %r9, %r8)
+ adcx( %r10, %r13)
+ mov %r11, -8(rp,n,8)
+L(lo2): adox( (rp,n,8), %r13)
+ mulx( 8(up,n,8), %r11, %r10)
+ adcx( %r12, %rbx)
+ mov %r13, (rp,n,8)
+L(lo1): adox( 8(rp,n,8), %rbx)
+ mulx( 16(up,n,8), %r13, %r12)
+ adcx( %rax, %r9)
+ mov %rbx, 8(rp,n,8)
+L(lo0): adox( 16(rp,n,8), %r9)
+ mulx( 24(up,n,8), %rbx, %rax)
+ adcx( %r8, %r11)
+ mov %r9, 16(rp,n,8)
+L(lo3): adox( 24(rp,n,8), %r11)
+ lea 4(n), n
+ jmp L(top)
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/nano/copyd.asm b/gmp-6.3.0/mpn/x86_64/nano/copyd.asm
new file mode 100644
index 0000000..f0dc54a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/nano/copyd.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyd for VIA Nano, using the Intel Sandy Bridge (palignr) code.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/nano/copyi.asm b/gmp-6.3.0/mpn/x86_64/nano/copyi.asm
new file mode 100644
index 0000000..9c26e00
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/nano/copyi.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyi for VIA Nano, using the Intel Sandy Bridge (palignr) code.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/nano/dive_1.asm b/gmp-6.3.0/mpn/x86_64/nano/dive_1.asm
new file mode 100644
index 0000000..e9a0763
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/nano/dive_1.asm
@@ -0,0 +1,166 @@
+dnl AMD64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2004-2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C norm unorm
+C AMD K8,K9 11 11
+C AMD K10 11 11
+C Intel P4 ?
+C Intel core2 13.5 13.25
+C Intel corei 14.25
+C Intel atom 34 36
+C VIA nano 19.25 19.25
+
+
+C INPUT PARAMETERS
+C rp rdi
+C up rsi
+C n rdx
+C divisor rcx
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+ FUNC_ENTRY(4)
+ push %rbx
+
+ mov %rcx, %rax
+ xor R32(%rcx), R32(%rcx) C shift count
+ mov %rdx, %r8
+
+ bt $0, R32(%rax)
+ jc L(odd) C skip bsfq unless divisor is even
+ bsf %rax, %rcx
+ shr R8(%rcx), %rax
+L(odd): mov %rax, %rbx
+ shr R32(%rax)
+ and $127, R32(%rax) C d/2, 7 bits
+
+ LEA( binvert_limb_table, %rdx)
+
+ movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
+
+ mov %rbx, %r11 C d without twos
+
+ lea (%rax,%rax), R32(%rdx) C 2*inv
+ imul R32(%rax), R32(%rax) C inv*inv
+ imul R32(%rbx), R32(%rax) C inv*inv*d
+ sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits
+
+ lea (%rdx,%rdx), R32(%rax) C 2*inv
+ imul R32(%rdx), R32(%rdx) C inv*inv
+ imul R32(%rbx), R32(%rdx) C inv*inv*d
+ sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits
+
+ lea (%rax,%rax), %r10 C 2*inv
+ imul %rax, %rax C inv*inv
+ imul %rbx, %rax C inv*inv*d
+ sub %rax, %r10 C inv = 2*inv - inv*inv*d, 64 bits
+
+ lea (%rsi,%r8,8), %rsi C up end
+ lea -8(%rdi,%r8,8), %rdi C rp end
+ neg %r8 C -n
+
+ mov (%rsi,%r8,8), %rax C up[0]
+
+ inc %r8
+ jz L(one)
+
+ test R32(%rcx), R32(%rcx)
+ jnz L(unorm) C branch if count != 0
+ xor R32(%rbx), R32(%rbx)
+ jmp L(nent)
+
+ ALIGN(8)
+L(ntop):mul %r11 C carry limb in rdx 0 10
+ mov -8(%rsi,%r8,8), %rax C
+ sub %rbx, %rax C apply carry bit
+ setc %bl C
+ sub %rdx, %rax C apply carry limb 5
+ adc $0, %rbx C 6
+L(nent):imul %r10, %rax C 6
+ mov %rax, (%rdi,%r8,8) C
+ inc %r8 C
+ jnz L(ntop)
+
+ mov -8(%rsi), %r9 C up high limb
+ jmp L(com)
+
+L(unorm):
+ mov (%rsi,%r8,8), %r9 C up[1]
+ shr R8(%rcx), %rax C
+ neg R32(%rcx)
+ shl R8(%rcx), %r9 C
+ neg R32(%rcx)
+ or %r9, %rax
+ xor R32(%rbx), R32(%rbx)
+ jmp L(uent)
+
+ ALIGN(8)
+L(utop):mul %r11 C carry limb in rdx 0 10
+ mov (%rsi,%r8,8), %rax C
+ shl R8(%rcx), %rax C
+ neg R32(%rcx)
+ or %r9, %rax
+ sub %rbx, %rax C apply carry bit
+ setc %bl C
+ sub %rdx, %rax C apply carry limb 5
+ adc $0, %rbx C 6
+L(uent):imul %r10, %rax C 6
+ mov (%rsi,%r8,8), %r9 C
+ shr R8(%rcx), %r9 C
+ neg R32(%rcx)
+ mov %rax, (%rdi,%r8,8) C
+ inc %r8 C
+ jnz L(utop)
+
+L(com): mul %r11 C carry limb in rdx
+ sub %rbx, %r9 C apply carry bit
+ sub %rdx, %r9 C apply carry limb
+ imul %r10, %r9
+ mov %r9, (%rdi)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(one): shr R8(%rcx), %rax
+ imul %r10, %rax
+ mov %rax, (%rdi)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
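The loop above is Jebelean's exact-division method: assuming d divides {up,n} exactly, each quotient limb is (source limb minus the running borrow) times the inverse of d modulo 2^64, and the high half of q*d plus the borrow bit is carried into the next limb. A C sketch for an odd divisor; dinv is assumed to satisfy d*dinv == 1 (mod 2^64), and for even d the code above first shifts out the divisor's trailing zero bits:

typedef unsigned long long limb;

void ref_divexact_1_odd (limb *qp, const limb *up, long n, limb d, limb dinv)
{
  limb c = 0;                           /* borrow: carry limb plus carry bit */
  for (long i = 0; i < n; i++)
    {
      limb cbit = up[i] < c;            /* borrow out of the subtraction */
      limb l = up[i] - c;
      limb q = l * dinv;                /* quotient limb, exact mod 2^64 */
      qp[i] = q;
      c = (limb) (((unsigned __int128) q * d) >> 64) + cbit;
    }
}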
diff --git a/gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm
new file mode 100644
index 0000000..4723093
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_11.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/core2/gcd_11.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h
new file mode 100644
index 0000000..fde69db
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h
@@ -0,0 +1,243 @@
+/* VIA Nano gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#define SHLD_SLOW 1
+#define SHRD_SLOW 1
+
+/* 1600 MHz Nano 2xxx */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-12, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 22
+
+#define MUL_TOOM22_THRESHOLD 27
+#define MUL_TOOM33_THRESHOLD 38
+#define MUL_TOOM44_THRESHOLD 324
+#define MUL_TOOM6H_THRESHOLD 450
+#define MUL_TOOM8H_THRESHOLD 632
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 207
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 211
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 219
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 315
+
+#define SQR_BASECASE_THRESHOLD 10
+#define SQR_TOOM2_THRESHOLD 52
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 387
+#define SQR_TOOM6_THRESHOLD 662
+#define SQR_TOOM8_THRESHOLD 781
+
+#define MULMID_TOOM42_THRESHOLD 32
+
+#define MULMOD_BNM1_THRESHOLD 14
+#define SQRMOD_BNM1_THRESHOLD 15
+
+#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 376, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 49, 9}, { 27,10}, { 15, 9}, { 43,10}, \
+ { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,10}, { 79,11}, { 47,10}, { 103,12}, \
+ { 31,11}, { 63,10}, { 143,11}, { 79,10}, \
+ { 159, 9}, { 319,10}, { 175,11}, { 95, 9}, \
+ { 383, 8}, { 767,10}, { 207,11}, { 111,12}, \
+ { 63,11}, { 127,10}, { 255,11}, { 143, 9}, \
+ { 575, 8}, { 1151,10}, { 303,11}, { 159,10}, \
+ { 319, 9}, { 639, 8}, { 1279,10}, { 335,12}, \
+ { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \
+ { 207,10}, { 415, 9}, { 831, 8}, { 1663,10}, \
+ { 447,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511, 9}, { 1023,11}, { 271,10}, { 543, 9}, \
+ { 1087,10}, { 575, 9}, { 1215,12}, { 159,11}, \
+ { 319,10}, { 639, 9}, { 1279,11}, { 335,10}, \
+ { 671, 9}, { 1343,11}, { 351,10}, { 703, 9}, \
+ { 1407,12}, { 191,11}, { 383,10}, { 767, 9}, \
+ { 1535,10}, { 831, 9}, { 1663,12}, { 223,11}, \
+ { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \
+ { 287,11}, { 575,10}, { 1151,11}, { 607,10}, \
+ { 1215,12}, { 319,11}, { 639,10}, { 1279,11}, \
+ { 671,10}, { 1343,12}, { 351,11}, { 703,10}, \
+ { 1407,13}, { 191,12}, { 383,11}, { 767,10}, \
+ { 1535,12}, { 415,11}, { 831,10}, { 1663,12}, \
+ { 447,11}, { 895,10}, { 1791,14}, { 127,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
+ { 1087,12}, { 575,11}, { 1151,12}, { 607,11}, \
+ { 1215,13}, { 319,12}, { 639,11}, { 1279,12}, \
+ { 671,11}, { 1343,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \
+ { 1663,13}, { 447,12}, { 895,11}, { 1791,13}, \
+ { 511,12}, { 1023,11}, { 2047,12}, { 1087,13}, \
+ { 575,12}, { 1151,11}, { 2303,12}, { 1215,13}, \
+ { 639,12}, { 1279,11}, { 2559,12}, { 1343,13}, \
+ { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1663,13}, { 895,12}, \
+ { 1791,13}, { 959,14}, { 511,13}, { 1023,12}, \
+ { 2047,13}, { 1087,12}, { 2175,13}, { 1151,12}, \
+ { 2303,13}, { 1215,14}, { 639,13}, { 1279,12}, \
+ { 2559,13}, { 1407,12}, { 2815,13}, { 1471,14}, \
+ { 767,13}, { 1535,12}, { 3071,13}, { 1663,14}, \
+ { 895,13}, { 1791,12}, { 3583,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2047,12}, { 4095,13}, \
+ { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \
+ { 2431,14}, { 1279,13}, { 2559,12}, { 5119,14}, \
+ { 1407,13}, { 2815,12}, { 5631,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 224
+#define MUL_FFT_THRESHOLD 3520
+
+#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 63, 9}, { 127,10}, { 71, 9}, \
+ { 143,10}, { 79,11}, { 47,10}, { 95, 9}, \
+ { 191,10}, { 103,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 135, 7}, { 1087, 9}, \
+ { 287,11}, { 79, 9}, { 319, 8}, { 639,10}, \
+ { 167,11}, { 95,10}, { 191, 9}, { 383, 8}, \
+ { 767,11}, { 111,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511, 8}, { 1023,10}, { 271, 9}, \
+ { 543, 8}, { 1087,11}, { 143, 9}, { 575, 8}, \
+ { 1151,10}, { 303, 9}, { 639, 8}, { 1279,10}, \
+ { 335, 9}, { 671,10}, { 351, 9}, { 703,12}, \
+ { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \
+ { 207,10}, { 415, 9}, { 831,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511, 9}, { 1023,11}, \
+ { 271,10}, { 543, 9}, { 1087,10}, { 575, 9}, \
+ { 1151,11}, { 303,10}, { 607, 9}, { 1215,12}, \
+ { 159,11}, { 319,10}, { 639, 9}, { 1279,10}, \
+ { 671, 9}, { 1343,11}, { 351,10}, { 703, 9}, \
+ { 1407,12}, { 191,11}, { 383,10}, { 767, 9}, \
+ { 1535,11}, { 415,10}, { 831, 9}, { 1663,12}, \
+ { 223,11}, { 447,10}, { 959,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \
+ { 1087,11}, { 575,10}, { 1215,12}, { 319,11}, \
+ { 639,10}, { 1279,11}, { 671,10}, { 1343,12}, \
+ { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \
+ { 383,11}, { 767,10}, { 1535,12}, { 415,11}, \
+ { 831,10}, { 1663,12}, { 447,11}, { 895,10}, \
+ { 1791,12}, { 479,11}, { 959,14}, { 127,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,11}, { 1215,13}, \
+ { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
+ { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 895,11}, { 1791,12}, { 959,13}, \
+ { 511,12}, { 1023,11}, { 2047,12}, { 1087,13}, \
+ { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \
+ { 703,12}, { 1407,11}, { 2815,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1663,13}, { 895,12}, \
+ { 1791,13}, { 959,14}, { 511,13}, { 1023,12}, \
+ { 2047,13}, { 1087,12}, { 2175,13}, { 1215,14}, \
+ { 639,13}, { 1279,12}, { 2559,13}, { 1407,12}, \
+ { 2815,14}, { 767,13}, { 1535,12}, { 3071,13}, \
+ { 1663,14}, { 895,13}, { 1791,12}, { 3583,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2047,12}, \
+ { 4095,13}, { 2175,14}, { 1151,13}, { 2303,12}, \
+ { 4607,14}, { 1279,13}, { 2559,14}, { 1407,13}, \
+ { 2815,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 230
+#define SQR_FFT_THRESHOLD 2496
+
+#define MULLO_BASECASE_THRESHOLD 13
+#define MULLO_DC_THRESHOLD 38
+#define MULLO_MUL_N_THRESHOLD 6633
+
+#define DC_DIV_QR_THRESHOLD 56
+#define DC_DIVAPPR_Q_THRESHOLD 173
+#define DC_BDIV_QR_THRESHOLD 55
+#define DC_BDIV_Q_THRESHOLD 96
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 202
+#define INV_APPR_THRESHOLD 166
+
+#define BINV_NEWTON_THRESHOLD 246
+#define REDC_1_TO_REDC_2_THRESHOLD 7
+#define REDC_2_TO_REDC_N_THRESHOLD 85
+
+#define MU_DIV_QR_THRESHOLD 1499
+#define MU_DIVAPPR_Q_THRESHOLD 1652
+#define MUPI_DIV_QR_THRESHOLD 83
+#define MU_BDIV_QR_THRESHOLD 1210
+#define MU_BDIV_Q_THRESHOLD 1499
+
+#define POWM_SEC_TABLE 1,28,129,642,2387
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 127
+#define HGCD_APPR_THRESHOLD 214
+#define HGCD_REDUCE_THRESHOLD 2479
+#define GCD_DC_THRESHOLD 487
+#define GCDEXT_DC_THRESHOLD 505
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_DC_THRESHOLD 802
+#define SET_STR_PRECOMPUTE_THRESHOLD 2042
+
+#define FAC_DSC_THRESHOLD 1737
+#define FAC_ODD_THRESHOLD 44
diff --git a/gmp-6.3.0/mpn/x86_64/nano/popcount.asm b/gmp-6.3.0/mpn/x86_64/nano/popcount.asm
new file mode 100644
index 0000000..fb14dd3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/nano/popcount.asm
@@ -0,0 +1,35 @@
+dnl x86-64 mpn_popcount.
+
+dnl Copyright 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm
new file mode 100644
index 0000000..7ae6a1a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_addmul_2 optimised for Intel Nocona.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_2)
+include_mpn(`x86_64/bd1/addmul_2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm
new file mode 100644
index 0000000..8e6ee1b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm
@@ -0,0 +1,196 @@
+dnl x86-64 mpn_add_n/mpn_sub_n optimized for Pentium 4.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 2.8
+C AMD K10 2.8
+C Intel P4 4
+C Intel core2 3.6-5 (fluctuating)
+C Intel corei ?
+C Intel atom ?
+C VIA nano ?
+
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+ifdef(`OPERATION_add_n', `
+ define(ADDSUB, add)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADDSUB, sub)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+ASM_START()
+ TEXT
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ xor %r8, %r8
+IFDOS(` jmp L(ent) ')
+EPILOGUE()
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+L(ent): push %rbx
+ push %r12
+
+ mov (vp), %r9
+
+ mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jne L(n00) C n = 0, 4, 8, ...
+ mov R32(%r8), R32(%rbx)
+ mov (up), %r8
+ mov 8(up), %r10
+ ADDSUB %r9, %r8
+ mov 8(vp), %r9
+ setc R8(%rax)
+ lea -16(rp), rp
+ jmp L(L00)
+
+L(n00): cmp $2, R32(%rax)
+ jnc L(n01) C n = 1, 5, 9, ...
+ mov (up), %r11
+ mov R32(%r8), R32(%rax)
+ xor R32(%rbx), R32(%rbx)
+ dec n
+ jnz L(gt1)
+ ADDSUB %r9, %r11
+ setc R8(%rbx)
+ ADDSUB %rax, %r11
+ adc $0, R32(%rbx)
+ mov %r11, (rp)
+ jmp L(ret)
+L(gt1): mov 8(up), %r8
+ ADDSUB %r9, %r11
+ mov 8(vp), %r9
+ setc R8(%rbx)
+ lea -8(rp), rp
+ lea 8(up), up
+ lea 8(vp), vp
+ jmp L(L01)
+
+L(n01): jne L(n10) C n = 2, 6, 10, ...
+ mov (up), %r12
+ mov R32(%r8), R32(%rbx)
+ mov 8(up), %r11
+ ADDSUB %r9, %r12
+ mov 8(vp), %r9
+ setc R8(%rax)
+ lea -32(rp), rp
+ lea 16(up), up
+ lea 16(vp), vp
+ jmp L(L10)
+
+L(n10): mov (up), %r10 C n = 3, 7, 11, ...
+ mov R32(%r8), R32(%rax)
+ xor R32(%rbx), R32(%rbx)
+ mov 8(up), %r12
+ ADDSUB %r9, %r10
+ mov 8(vp), %r9
+ setc R8(%rbx)
+ lea -24(rp), rp
+ lea -8(up), up
+ lea -8(vp), vp
+ jmp L(L11)
+
+L(c0): mov $1, R8(%rbx)
+ jmp L(rc0)
+L(c1): mov $1, R8(%rax)
+ jmp L(rc1)
+L(c2): mov $1, R8(%rbx)
+ jmp L(rc2)
+L(c3): mov $1, R8(%rax)
+ jmp L(rc3)
+
+ ALIGN(16)
+L(top): mov (up), %r8 C not on critical path
+ ADDSUB %r9, %r11 C not on critical path
+ mov (vp), %r9 C not on critical path
+ setc R8(%rbx) C save carry out
+ mov %r12, (rp)
+L(L01): ADDSUB %rax, %r11 C apply previous carry out
+ jc L(c0) C jump if ripple
+L(rc0): mov 8(up), %r10
+ ADDSUB %r9, %r8
+ mov 8(vp), %r9
+ setc R8(%rax)
+ mov %r11, 8(rp)
+L(L00): ADDSUB %rbx, %r8
+ jc L(c1)
+L(rc1): mov 16(up), %r12
+ ADDSUB %r9, %r10
+ mov 16(vp), %r9
+ setc R8(%rbx)
+ mov %r8, 16(rp)
+L(L11): ADDSUB %rax, %r10
+ jc L(c2)
+L(rc2): mov 24(up), %r11
+ ADDSUB %r9, %r12
+ lea 32(up), up
+ mov 24(vp), %r9
+ lea 32(vp), vp
+ setc R8(%rax)
+ mov %r10, 24(rp)
+L(L10): ADDSUB %rbx, %r12
+ jc L(c3)
+L(rc3): lea 32(rp), rp
+ sub $4, n
+ ja L(top)
+
+L(end): ADDSUB %r9, %r11
+ setc R8(%rbx)
+ mov %r12, (rp)
+ ADDSUB %rax, %r11
+ jnc L(1)
+ mov $1, R8(%rbx)
+L(1): mov %r11, 8(rp)
+
+L(ret): mov R32(%rbx), R32(%rax)
+ pop %r12
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
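adc is slow on the Pentium 4, so the loop above avoids keeping the carry in the flags across iterations: each limb addition is split into two flag-free steps, the carry out of each step is captured with setc, and the saved carry is applied to the next limb, with the L(c0)..L(c3) stubs handling the rare case where applying a saved carry itself carries. The same two-step idea in C, for the add case (illustration only):

typedef unsigned long long limb;

limb ref_add_n (limb *rp, const limb *up, const limb *vp, long n)
{
  limb cy = 0;
  for (long i = 0; i < n; i++)
    {
      limb s  = up[i] + vp[i];
      limb c1 = s < up[i];      /* carry out of up[i] + vp[i] */
      limb r  = s + cy;
      limb c2 = r < s;          /* carry out of applying the saved carry */
      rp[i] = r;
      cy = c1 | c2;             /* at most one of c1, c2 can be set */
    }
  return cy;
}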
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm
new file mode 100644
index 0000000..66937d3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm
@@ -0,0 +1,50 @@
+dnl AMD64 mpn_addlsh1_n, mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1),
+dnl optimised for Pentium 4.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 31) C 31, not 63, since we use 32-bit ops
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(ADDSUB, add)
+ define(func, mpn_addlsh1_n)')
+ifdef(`OPERATION_sublsh1_n', `
+ define(ADDSUB, sub)
+ define(func, mpn_sublsh1_n)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+include_mpn(`x86_64/pentium4/aorslshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm
new file mode 100644
index 0000000..001f0ac
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm
@@ -0,0 +1,50 @@
+dnl AMD64 mpn_addlsh2_n, mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2),
+dnl optimised for Pentium 4.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 30) C 30, not 62, since we use 32-bit ops
+
+ifdef(`OPERATION_addlsh2_n', `
+ define(ADDSUB, add)
+ define(func, mpn_addlsh2_n)')
+ifdef(`OPERATION_sublsh2_n', `
+ define(ADDSUB, sub)
+ define(func, mpn_sublsh2_n)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n)
+include_mpn(`x86_64/pentium4/aorslshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm
new file mode 100644
index 0000000..d03c6a3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm
@@ -0,0 +1,203 @@
+dnl AMD64 mpn_addlshC_n, mpn_sublshC_n -- rp[] = up[] +- (vp[] << C), where
+dnl C is 1, 2, 3. Optimized for Pentium 4.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+C cycles/limb
+C AMD K8,K9 3.8
+C AMD K10 3.8
+C Intel P4 5.8
+C Intel core2 4.75
+C Intel corei 4.75
+C Intel atom ?
+C VIA nano 4.75
+
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n', `%rcx')
+
+define(M, eval(m4_lshift(1,LSH)))
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %r12
+ push %rbp
+
+ mov (vp), %r9
+ shl $LSH, %r9
+ mov 4(vp), R32(%rbp)
+
+ xor R32(%rbx), R32(%rbx)
+
+ mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jne L(n00) C n = 0, 4, 8, ...
+
+ mov (up), %r8
+ mov 8(up), %r10
+ shr $RSH, R32(%rbp)
+ ADDSUB %r9, %r8
+ mov 8(vp), %r9
+ lea (%rbp,%r9,M), %r9
+ setc R8(%rax)
+ mov 12(vp), R32(%rbp)
+ lea -16(rp), rp
+ jmp L(L00)
+
+L(n00): cmp $2, R32(%rax)
+ jnc L(n01) C n = 1, 5, 9, ...
+ mov (up), %r11
+ lea -8(rp), rp
+ shr $RSH, R32(%rbp)
+ ADDSUB %r9, %r11
+ setc R8(%rbx)
+ dec n
+ jz L(1) C jump for n = 1
+ mov 8(up), %r8
+ mov 8(vp), %r9
+ lea (%rbp,%r9,M), %r9
+ mov 12(vp), R32(%rbp)
+ lea 8(up), up
+ lea 8(vp), vp
+ jmp L(L01)
+
+L(n01): jne L(n10) C n = 2, 6, 10, ...
+ mov (up), %r12
+ mov 8(up), %r11
+ shr $RSH, R32(%rbp)
+ ADDSUB %r9, %r12
+ mov 8(vp), %r9
+ lea (%rbp,%r9,M), %r9
+ setc R8(%rax)
+ mov 12(vp), R32(%rbp)
+ lea 16(up), up
+ lea 16(vp), vp
+ jmp L(L10)
+
+L(n10): mov (up), %r10
+ mov 8(up), %r12
+ shr $RSH, R32(%rbp)
+ ADDSUB %r9, %r10
+ mov 8(vp), %r9
+ lea (%rbp,%r9,M), %r9
+ setc R8(%rbx)
+ mov 12(vp), R32(%rbp)
+ lea -24(rp), rp
+ lea -8(up), up
+ lea -8(vp), vp
+ jmp L(L11)
+
+L(c0): mov $1, R8(%rbx)
+ jmp L(rc0)
+L(c1): mov $1, R8(%rax)
+ jmp L(rc1)
+L(c2): mov $1, R8(%rbx)
+ jmp L(rc2)
+
+ ALIGN(16)
+L(top): mov (up), %r8 C not on critical path
+ shr $RSH, R32(%rbp)
+ ADDSUB %r9, %r11 C not on critical path
+ mov (vp), %r9
+ lea (%rbp,%r9,M), %r9
+ setc R8(%rbx) C save carry out
+ mov 4(vp), R32(%rbp)
+ mov %r12, (rp)
+ ADDSUB %rax, %r11 C apply previous carry out
+ jc L(c0) C jump if ripple
+L(rc0):
+L(L01): mov 8(up), %r10
+ shr $RSH, R32(%rbp)
+ ADDSUB %r9, %r8
+ mov 8(vp), %r9
+ lea (%rbp,%r9,M), %r9
+ setc R8(%rax)
+ mov 12(vp), R32(%rbp)
+ mov %r11, 8(rp)
+ ADDSUB %rbx, %r8
+ jc L(c1)
+L(rc1):
+L(L00): mov 16(up), %r12
+ shr $RSH, R32(%rbp)
+ ADDSUB %r9, %r10
+ mov 16(vp), %r9
+ lea (%rbp,%r9,M), %r9
+ setc R8(%rbx)
+ mov 20(vp), R32(%rbp)
+ mov %r8, 16(rp)
+ ADDSUB %rax, %r10
+ jc L(c2)
+L(rc2):
+L(L11): mov 24(up), %r11
+ shr $RSH, R32(%rbp)
+ ADDSUB %r9, %r12
+ mov 24(vp), %r9
+ lea (%rbp,%r9,M), %r9
+ lea 32(up), up
+ lea 32(vp), vp
+ setc R8(%rax)
+ mov -4(vp), R32(%rbp)
+ mov %r10, 24(rp)
+ ADDSUB %rbx, %r12
+ jc L(c3)
+L(rc3): lea 32(rp), rp
+L(L10): sub $4, n
+ ja L(top)
+
+L(end):
+ shr $RSH, R32(%rbp)
+ ADDSUB %r9, %r11
+ setc R8(%rbx)
+ mov %r12, (rp)
+ ADDSUB %rax, %r11
+ jnc L(1)
+ mov $1, R8(%rbx)
+L(1): mov %r11, 8(rp)
+ lea (%rbx,%rbp), R32(%rax)
+ pop %rbp
+ pop %r12
+ pop %rbx
+ FUNC_EXIT()
+ ret
+L(c3): mov $1, R8(%rax)
+ jmp L(rc3)
+EPILOGUE()
+ASM_END()
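
The common body above builds the shifted addend one limb at a time: the bits shifted out of vp[i-1] are extracted with a 32-bit shr by RSH = 32-LSH and recombined with vp[i]*2^LSH through the lea, and the function value is that leftover top piece plus the final carry (the lea (%rbx,%rbp) before the return). A rough C equivalent of the add variant, rp[] = up[] + (vp[] << LSH), ignoring the software pipelining and using hypothetical names:

#include <stdint.h>
#include <stddef.h>

/* Sketch of rp[] = up[] + (vp[] << lsh) for a small shift 1 <= lsh <= 62.
   The shifted operand for limb i combines vp[i] with the bits shifted out
   of vp[i-1].  Illustrative helper, not GMP's mpn API. */
static uint64_t addlshC_sketch(uint64_t *rp, const uint64_t *up,
                               const uint64_t *vp, size_t n, unsigned lsh)
{
    uint64_t cy = 0, shifted_out = 0;        /* carry and bits from vp[i-1] */
    for (size_t i = 0; i < n; i++) {
        uint64_t s = (vp[i] << lsh) | shifted_out;
        shifted_out = vp[i] >> (64 - lsh);   /* spills into the next limb */
        uint64_t r  = up[i] + s;
        uint64_t c1 = r < s;                 /* carry of the main add */
        r += cy;
        cy = c1 | (r < cy);                  /* at most one can be set */
        rp[i] = r;
    }
    return cy + shifted_out;                 /* top bits plus final carry */
}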
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm
new file mode 100644
index 0000000..e5dbb34
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nocona.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+include_mpn(`x86_64/bd1/aorsmul_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h
new file mode 100644
index 0000000..9c79310
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h
@@ -0,0 +1,257 @@
+/* Pentium 4-64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* These routines exist for all x86_64 chips, but they are slower on Pentium4
+ than separate add/sub and shift. Make sure they are not actually used. */
+#undef HAVE_NATIVE_mpn_rsblsh1_n
+#undef HAVE_NATIVE_mpn_rsblsh2_n
+#undef HAVE_NATIVE_mpn_addlsh_n
+#undef HAVE_NATIVE_mpn_rsblsh_n
+
+/* 3400 MHz Pentium4 Nocona / 1024 Kibyte L2 cache */
+/* FFT tuning limit = 107,095,964 */
+/* Generated by tuneup.c, 2019-11-09, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 32
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 2
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 12
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
+
+#define DIV_1_VS_MUL_1_PERCENT 228
+
+#define MUL_TOOM22_THRESHOLD 12
+#define MUL_TOOM33_THRESHOLD 81
+#define MUL_TOOM44_THRESHOLD 130
+#define MUL_TOOM6H_THRESHOLD 173
+#define MUL_TOOM8H_THRESHOLD 430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 112
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 18
+#define SQR_TOOM3_THRESHOLD 113
+#define SQR_TOOM4_THRESHOLD 202
+#define SQR_TOOM6_THRESHOLD 238
+#define SQR_TOOM8_THRESHOLD 430
+
+#define MULMID_TOOM42_THRESHOLD 20
+
+#define MULMOD_BNM1_THRESHOLD 9
+#define SQRMOD_BNM1_THRESHOLD 11
+
+#define MUL_FFT_MODF_THRESHOLD 236 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 236, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
+ { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \
+ { 19, 7}, { 10, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \
+ { 9, 7}, { 21, 8}, { 11, 7}, { 23, 8}, \
+ { 13, 9}, { 7, 8}, { 15, 7}, { 31, 8}, \
+ { 21, 9}, { 11, 8}, { 27,10}, { 7, 9}, \
+ { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \
+ { 23, 8}, { 47, 9}, { 27,10}, { 15, 9}, \
+ { 39,10}, { 23, 9}, { 51,11}, { 15,10}, \
+ { 31, 9}, { 67,10}, { 39, 9}, { 83,10}, \
+ { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \
+ { 63, 9}, { 127, 8}, { 255,10}, { 71, 9}, \
+ { 143, 8}, { 287,10}, { 79,11}, { 47,10}, \
+ { 95, 9}, { 191,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 143, 9}, { 287,11}, \
+ { 79,10}, { 159, 9}, { 319,10}, { 175,11}, \
+ { 95,10}, { 191, 9}, { 383,10}, { 223,12}, \
+ { 63,11}, { 127,10}, { 255,11}, { 143,10}, \
+ { 287,11}, { 159,10}, { 319,11}, { 175,12}, \
+ { 95,11}, { 191,10}, { 383,11}, { 223,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 287,10}, { 575,12}, { 159,11}, { 351,12}, \
+ { 191,11}, { 383,12}, { 223,11}, { 447,13}, \
+ { 127,12}, { 255,11}, { 511,12}, { 287,11}, \
+ { 575,10}, { 1151,12}, { 351,13}, { 191,12}, \
+ { 415,11}, { 831,10}, { 1663,12}, { 447,14}, \
+ { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \
+ { 543,11}, { 1087,10}, { 2175,12}, { 575,11}, \
+ { 1151,13}, { 319,12}, { 639,11}, { 1279,12}, \
+ { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 831,11}, { 1663,13}, \
+ { 447,14}, { 255,13}, { 511,12}, { 1023,11}, \
+ { 2047,12}, { 1087,11}, { 2175,13}, { 575,12}, \
+ { 1151,11}, { 2303,12}, { 1215,11}, { 2431,10}, \
+ { 4863,13}, { 639,12}, { 1279,11}, { 2559,12}, \
+ { 1343,13}, { 703,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1663,15}, { 255,14}, \
+ { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \
+ { 2175,13}, { 1151,12}, { 2303,13}, { 1215,12}, \
+ { 2431,11}, { 4863,14}, { 639,13}, { 1279,12}, \
+ { 2559,13}, { 1343,12}, { 2687,13}, { 1407,12}, \
+ { 2815,13}, { 1471,14}, { 767,13}, { 1663,14}, \
+ { 895,13}, { 1791,12}, { 3583,13}, { 1919,12}, \
+ { 3839,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2815,15}, { 767,14}, { 1791,13}, { 3583,14}, \
+ { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \
+ { 2175,13}, { 4351,14}, { 2303,13}, { 4607,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,15}, { 1535,14}, { 3199,15}, { 1791,14}, \
+ { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \
+ { 4351,15}, { 2303,14}, { 4863,15}, { 2815,14}, \
+ { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 229
+#define MUL_FFT_THRESHOLD 2752
+
+#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 240, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
+ { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \
+ { 23, 7}, { 12, 6}, { 25, 7}, { 13, 8}, \
+ { 7, 7}, { 17, 8}, { 9, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 9}, { 7, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27,10}, { 7, 9}, { 15, 8}, { 33, 9}, \
+ { 19, 8}, { 39, 9}, { 27,10}, { 15, 9}, \
+ { 39,10}, { 23, 9}, { 47,11}, { 15,10}, \
+ { 31, 9}, { 63,10}, { 39, 9}, { 79,10}, \
+ { 55,11}, { 31,10}, { 63, 9}, { 127, 8}, \
+ { 255,10}, { 71, 9}, { 143, 8}, { 287,10}, \
+ { 79,11}, { 47,10}, { 95, 9}, { 191,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \
+ { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \
+ { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \
+ { 191, 9}, { 383,10}, { 207, 9}, { 415,10}, \
+ { 223,12}, { 63,11}, { 127,10}, { 255,11}, \
+ { 143,10}, { 287,11}, { 159,10}, { 319,11}, \
+ { 175,10}, { 351,12}, { 95,11}, { 191,10}, \
+ { 383,11}, { 207,10}, { 415,11}, { 223,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 287,10}, { 575,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 351,12}, { 191,11}, { 383,10}, \
+ { 767,12}, { 223,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 511,12}, { 287,11}, { 575,10}, \
+ { 1151,12}, { 319,11}, { 639,12}, { 351,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,12}, { 447,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,13}, { 319,12}, { 639,11}, \
+ { 1279,12}, { 671,11}, { 1343,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 831,13}, { 447,14}, \
+ { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \
+ { 1087,13}, { 575,12}, { 1151,11}, { 2303,12}, \
+ { 1215,11}, { 2431,10}, { 4863,13}, { 639,12}, \
+ { 1279,11}, { 2559,12}, { 1343,11}, { 2687,14}, \
+ { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \
+ { 1663,15}, { 255,14}, { 511,13}, { 1023,12}, \
+ { 2047,13}, { 1087,12}, { 2175,13}, { 1151,12}, \
+ { 2303,13}, { 1215,12}, { 2431,11}, { 4863,14}, \
+ { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \
+ { 2687,13}, { 1407,12}, { 2815,13}, { 1471,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1791,12}, \
+ { 3583,13}, { 1919,12}, { 3839,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2303,12}, \
+ { 4607,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \
+ { 1663,13}, { 3327,14}, { 1791,13}, { 3583,14}, \
+ { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \
+ { 2175,13}, { 4351,14}, { 2303,13}, { 4607,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2815,13}, \
+ { 5631,14}, { 2943,13}, { 5887,15}, { 1535,14}, \
+ { 3327,15}, { 1791,14}, { 3839,13}, { 7679,16}, \
+ { 1023,15}, { 2047,14}, { 4351,15}, { 2303,14}, \
+ { 4863,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3071,14}, { 6143,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 235
+#define SQR_FFT_THRESHOLD 2368
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 45
+#define MULLO_MUL_N_THRESHOLD 5397
+#define SQRLO_BASECASE_THRESHOLD 6
+#define SQRLO_DC_THRESHOLD 46
+#define SQRLO_SQR_THRESHOLD 4658
+
+#define DC_DIV_QR_THRESHOLD 36
+#define DC_DIVAPPR_Q_THRESHOLD 95
+#define DC_BDIV_QR_THRESHOLD 35
+#define DC_BDIV_Q_THRESHOLD 47
+
+#define INV_MULMOD_BNM1_THRESHOLD 22
+#define INV_NEWTON_THRESHOLD 178
+#define INV_APPR_THRESHOLD 116
+
+#define BINV_NEWTON_THRESHOLD 206
+#define REDC_1_TO_REDC_2_THRESHOLD 24
+#define REDC_2_TO_REDC_N_THRESHOLD 50
+
+#define MU_DIV_QR_THRESHOLD 979
+#define MU_DIVAPPR_Q_THRESHOLD 979
+#define MUPI_DIV_QR_THRESHOLD 97
+#define MU_BDIV_QR_THRESHOLD 762
+#define MU_BDIV_Q_THRESHOLD 942
+
+#define POWM_SEC_TABLE 7,34,114,523,1486
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 25
+#define SET_STR_DC_THRESHOLD 381
+#define SET_STR_PRECOMPUTE_THRESHOLD 1659
+
+#define FAC_DSC_THRESHOLD 969
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 29
+#define HGCD2_DIV1_METHOD 3 /* 2.03% faster than 5 */
+#define HGCD_THRESHOLD 92
+#define HGCD_APPR_THRESHOLD 95
+#define HGCD_REDUCE_THRESHOLD 1815
+#define GCD_DC_THRESHOLD 195
+#define GCDEXT_DC_THRESHOLD 233
+#define JACOBI_BASE_METHOD 4 /* 17.06% faster than 1 */
+
+/* Tuneup completed successfully, took 297016 seconds */
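
The values above are limb-count cutoffs (bit counts for the FFT entries) at which GMP's C layer switches algorithms on this particular chip. As a purely illustrative sketch of how such a threshold is consumed, with placeholder function names that are not GMP internals:

#include <stdint.h>
#include <stddef.h>

#define MUL_TOOM22_THRESHOLD 12   /* limbs; tuned value from the table above */

/* Placeholder prototypes standing in for the real mpn routines. */
void mul_basecase(uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n);
void mul_toom22(uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n);

/* Size-gated selection: below the threshold the O(n^2) schoolbook routine
   wins; at or above it the Karatsuba-style split pays off. */
void mul_n_sketch(uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n)
{
    if (n < MUL_TOOM22_THRESHOLD)
        mul_basecase(rp, up, vp, n);
    else
        mul_toom22(rp, up, vp, n);
}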
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm b/gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm
new file mode 100644
index 0000000..4037be4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshift optimised for Pentium 4.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm
new file mode 100644
index 0000000..52856c1
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshiftc optimised for Pentium 4.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm
new file mode 100644
index 0000000..f34b3f0
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm
@@ -0,0 +1,167 @@
+dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
+
+dnl Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 1.0
+C AMD K10 1.12
+C Intel P4 3.25
+C Intel core2 1.5
+C Intel corei 1.5
+C Intel atom 2.5
+C VIA nano 1.75
+
+
+C INPUT PARAMETERS
+define(`ap', %rdi)
+define(`n', %rsi)
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
+
+C TODO
+C * Review feed-in and wind-down code. In particular, try to avoid adc and
+C sbb to placate Pentium4.
+C * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
+C without the dual loop exits.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)
+ FUNC_ENTRY(2)
+
+ mov $0x0000FFFFFFFFFFFF, %r11
+
+ sub $2, %rsi
+ ja L(gt2)
+
+ mov (ap), %rax
+ nop
+ jb L(1)
+
+ mov 8(ap), %rsi
+ mov %rax, %rdx
+ shr $48, %rax C src[0] high
+
+ and %r11, %rdx C src[0] low
+ add %rdx, %rax
+ mov R32(%rsi), R32(%rdx)
+
+ shr $32, %rsi C src[1] high
+ add %rsi, %rax
+
+ shl $16, %rdx C src[1] low
+ add %rdx, %rax
+
+L(1): FUNC_EXIT()
+ ret
+
+
+ ALIGN(16)
+L(gt2): xor R32(%rax), R32(%rax)
+ xor R32(%rcx), R32(%rcx)
+ xor R32(%rdx), R32(%rdx)
+ xor %r8, %r8
+ xor %r9, %r9
+ xor %r10, %r10
+
+L(top): add (ap), %rax
+ adc $0, %r10
+ add 8(ap), %rcx
+ adc $0, %r8
+ add 16(ap), %rdx
+ adc $0, %r9
+
+ sub $3, %rsi
+ jng L(end)
+
+ add 24(ap), %rax
+ adc $0, %r10
+ add 32(ap), %rcx
+ adc $0, %r8
+ add 40(ap), %rdx
+ lea 48(ap), ap
+ adc $0, %r9
+
+ sub $3, %rsi
+ jg L(top)
+
+
+ add $-24, ap
+L(end): add %r9, %rax
+ adc %r10, %rcx
+ adc %r8, %rdx
+
+ inc %rsi
+ mov $0x1, R32(%r10)
+ js L(combine)
+
+ mov $0x10000, R32(%r10)
+ adc 24(ap), %rax
+ dec %rsi
+ js L(combine)
+
+ adc 32(ap), %rcx
+ mov $0x100000000, %r10
+
+L(combine):
+ sbb %rsi, %rsi C carry
+ mov %rax, %rdi C 0mod3
+ shr $48, %rax C 0mod3 high
+
+ and %r10, %rsi C carry masked
+ and %r11, %rdi C 0mod3 low
+ mov R32(%rcx), R32(%r10) C 1mod3
+
+ add %rsi, %rax C apply carry
+ shr $32, %rcx C 1mod3 high
+
+ add %rdi, %rax C apply 0mod3 low
+ movzwl %dx, R32(%rdi) C 2mod3
+ shl $16, %r10 C 1mod3 low
+
+ add %rcx, %rax C apply 1mod3 high
+ shr $16, %rdx C 2mod3 high
+
+ add %r10, %rax C apply 1mod3 low
+ shl $32, %rdi C 2mod3 low
+
+ add %rdx, %rax C apply 2mod3 high
+ add %rdi, %rax C apply 2mod3 low
+
+ FUNC_EXIT()
+ ret
+EPILOGUE()
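
mpn_mod_34lsub1 exploits 2^48 == 1 (mod 2^48-1): the operand can be folded by summing its 48-bit slices, and since 64 = 48 + 16 the slice boundary advances by 16 bits per limb, which is roughly what the three rotating accumulators and the 16/32/48-bit shifts above implement. A plain, unoptimised C sketch of the folding idea; like the asm it returns a value merely congruent to the input, not a fully reduced remainder:

#include <stdint.h>
#include <stddef.h>

static uint64_t mod_2_48_minus_1(const uint64_t *ap, size_t n)
{
    const uint64_t M48 = 0x0000FFFFFFFFFFFFull;   /* low 48 bits */
    uint64_t acc = 0;
    unsigned bitpos = 0;           /* weight of limb i is 2^bitpos mod 2^48-1 */

    for (size_t i = 0; i < n; i++) {
        /* split ap[i]*2^bitpos at the 48-bit boundary; the part at or above
           2^48 folds down with weight 1 because 2^48 == 1 (mod 2^48-1) */
        uint64_t lo = (ap[i] & (M48 >> bitpos)) << bitpos;
        uint64_t hi = ap[i] >> (48 - bitpos);
        acc += lo + hi;
        acc = (acc & M48) + (acc >> 48);   /* keep the accumulator small */
        bitpos = (bitpos + 16) % 48;       /* 64 == 16 (mod 48) */
    }
    return acc;
}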
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm
new file mode 100644
index 0000000..70de670
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_mul_1 optimised for Intel Nocona.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c)
+include_mpn(`x86_64/bd1/mul_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm
new file mode 100644
index 0000000..a0f7302
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_mul_2 optimised for Intel Nocona.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_2)
+include_mpn(`x86_64/bd1/mul_2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm
new file mode 100644
index 0000000..fb16029
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_mul_basecase optimised for Intel Nocona.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_basecase)
+include_mpn(`x86_64/core2/mul_basecase.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm
new file mode 100644
index 0000000..b9e08a8
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_mullo_basecase optimised for Intel Nocona.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mullo_basecase)
+include_mpn(`x86_64/core2/mullo_basecase.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm b/gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm
new file mode 100644
index 0000000..7014b39
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm
@@ -0,0 +1,35 @@
+dnl x86-64 mpn_popcount optimized for Pentium 4.
+
+dnl Copyright 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm b/gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm
new file mode 100644
index 0000000..00e380d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Nocona.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_redc_1)
+include_mpn(`x86_64/bt1/redc_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm
new file mode 100644
index 0000000..5528ce4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm
@@ -0,0 +1,334 @@
+dnl x86-64 mpn_rsh1add_n/mpn_rsh1sub_n optimized for Pentium 4.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 4.13
+C AMD K10 4.13
+C Intel P4 5.70
+C Intel core2 4.75
+C Intel corei 5
+C Intel atom 8.75
+C VIA nano 5.25
+
+C TODO
+C * Try to make this smaller, 746 bytes seem excessive for this 2nd class
+C function. Less sw pipelining would help, and since we now probably
+C pipeline somewhat too deeply, it might not affect performance too much.
+C * A separate small-n loop might speed things as well as make things smaller.
+C That loop should be selected before pushing registers.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+ifdef(`OPERATION_rsh1add_n', `
+ define(ADDSUB, add)
+ define(func, mpn_rsh1add_n)
+ define(func_nc, mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(ADDSUB, sub)
+ define(func, mpn_rsh1sub_n)
+ define(func_nc, mpn_rsh1sub_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+
+ASM_START()
+ TEXT
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ xor %r8, %r8
+IFDOS(` jmp L(ent) ')
+EPILOGUE()
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+L(ent): push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (vp), %r9
+ mov (up), %r15
+
+ mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jne L(n00)
+
+ mov R32(%r8), R32(%rbx) C n = 0, 4, 8, ...
+ mov 8(up), %r10
+ ADDSUB %r9, %r15
+ mov 8(vp), %r9
+ setc R8(%rax)
+ ADDSUB %rbx, %r15 C return bit
+ jnc 1f
+ mov $1, R8(%rax)
+1: mov 16(up), %r12
+ ADDSUB %r9, %r10
+ mov 16(vp), %r9
+ setc R8(%rbx)
+ mov %r15, %r13
+ ADDSUB %rax, %r10
+ jnc 1f
+ mov $1, R8(%rbx)
+1: mov 24(up), %r11
+ ADDSUB %r9, %r12
+ lea 32(up), up
+ mov 24(vp), %r9
+ lea 32(vp), vp
+ setc R8(%rax)
+ mov %r10, %r14
+ shl $63, %r10
+ shr %r13
+ jmp L(L00)
+
+L(n00): cmp $2, R32(%rax)
+ jnc L(n01)
+ xor R32(%rbx), R32(%rbx) C n = 1, 5, 9, ...
+ lea -24(rp), rp
+ mov R32(%r8), R32(%rax)
+ dec n
+ jnz L(gt1)
+ ADDSUB %r9, %r15
+ setc R8(%rbx)
+ ADDSUB %rax, %r15
+ jnc 1f
+ mov $1, R8(%rbx)
+1: mov %r15, %r14
+ shl $63, %rbx
+ shr %r14
+ jmp L(cj1)
+L(gt1): mov 8(up), %r8
+ ADDSUB %r9, %r15
+ mov 8(vp), %r9
+ setc R8(%rbx)
+ ADDSUB %rax, %r15
+ jnc 1f
+ mov $1, R8(%rbx)
+1: mov 16(up), %r10
+ ADDSUB %r9, %r8
+ mov 16(vp), %r9
+ setc R8(%rax)
+ mov %r15, %r14
+ ADDSUB %rbx, %r8
+ jnc 1f
+ mov $1, R8(%rax)
+1: mov 24(up), %r12
+ ADDSUB %r9, %r10
+ mov 24(vp), %r9
+ setc R8(%rbx)
+ mov %r8, %r13
+ shl $63, %r8
+ shr %r14
+ lea 8(up), up
+ lea 8(vp), vp
+ jmp L(L01)
+
+L(n01): jne L(n10)
+ lea -16(rp), rp C n = 2, 6, 10, ...
+ mov R32(%r8), R32(%rbx)
+ mov 8(up), %r11
+ ADDSUB %r9, %r15
+ mov 8(vp), %r9
+ setc R8(%rax)
+ ADDSUB %rbx, %r15
+ jnc 1f
+ mov $1, R8(%rax)
+1: sub $2, n
+ jnz L(gt2)
+ ADDSUB %r9, %r11
+ setc R8(%rbx)
+ mov %r15, %r13
+ ADDSUB %rax, %r11
+ jnc 1f
+ mov $1, R8(%rbx)
+1: mov %r11, %r14
+ shl $63, %r11
+ shr %r13
+ jmp L(cj2)
+L(gt2): mov 16(up), %r8
+ ADDSUB %r9, %r11
+ mov 16(vp), %r9
+ setc R8(%rbx)
+ mov %r15, %r13
+ ADDSUB %rax, %r11
+ jnc 1f
+ mov $1, R8(%rbx)
+1: mov 24(up), %r10
+ ADDSUB %r9, %r8
+ mov 24(vp), %r9
+ setc R8(%rax)
+ mov %r11, %r14
+ shl $63, %r11
+ shr %r13
+ lea 16(up), up
+ lea 16(vp), vp
+ jmp L(L10)
+
+L(n10): xor R32(%rbx), R32(%rbx) C n = 3, 7, 11, ...
+ lea -8(rp), rp
+ mov R32(%r8), R32(%rax)
+ mov 8(up), %r12
+ ADDSUB %r9, %r15
+ mov 8(vp), %r9
+ setc R8(%rbx)
+ ADDSUB %rax, %r15
+ jnc 1f
+ mov $1, R8(%rbx)
+1: mov 16(up), %r11
+ ADDSUB %r9, %r12
+ mov 16(vp), %r9
+ setc R8(%rax)
+ mov %r15, %r14
+ ADDSUB %rbx, %r12
+ jnc 1f
+ mov $1, R8(%rax)
+1: sub $3, n
+ jnz L(gt3)
+ ADDSUB %r9, %r11
+ setc R8(%rbx)
+ mov %r12, %r13
+ shl $63, %r12
+ shr %r14
+ jmp L(cj3)
+L(gt3): mov 24(up), %r8
+ ADDSUB %r9, %r11
+ mov 24(vp), %r9
+ setc R8(%rbx)
+ mov %r12, %r13
+ shl $63, %r12
+ shr %r14
+ lea 24(up), up
+ lea 24(vp), vp
+ jmp L(L11)
+
+L(c0): mov $1, R8(%rbx)
+ jmp L(rc0)
+L(c1): mov $1, R8(%rax)
+ jmp L(rc1)
+L(c2): mov $1, R8(%rbx)
+ jmp L(rc2)
+
+ ALIGN(16)
+L(top): mov (up), %r8 C not on critical path
+ or %r13, %r10
+ ADDSUB %r9, %r11 C not on critical path
+ mov (vp), %r9 C not on critical path
+ setc R8(%rbx) C save carry out
+ mov %r12, %r13 C new for later
+ shl $63, %r12 C shift new right
+ shr %r14 C shift old left
+ mov %r10, (rp)
+L(L11): ADDSUB %rax, %r11 C apply previous carry out
+ jc L(c0) C jump if ripple
+L(rc0): mov 8(up), %r10
+ or %r14, %r12
+ ADDSUB %r9, %r8
+ mov 8(vp), %r9
+ setc R8(%rax)
+ mov %r11, %r14
+ shl $63, %r11
+ shr %r13
+ mov %r12, 8(rp)
+L(L10): ADDSUB %rbx, %r8
+ jc L(c1)
+L(rc1): mov 16(up), %r12
+ or %r13, %r11
+ ADDSUB %r9, %r10
+ mov 16(vp), %r9
+ setc R8(%rbx)
+ mov %r8, %r13
+ shl $63, %r8
+ shr %r14
+ mov %r11, 16(rp)
+L(L01): ADDSUB %rax, %r10
+ jc L(c2)
+L(rc2): mov 24(up), %r11
+ or %r14, %r8
+ ADDSUB %r9, %r12
+ lea 32(up), up
+ mov 24(vp), %r9
+ lea 32(vp), vp
+ setc R8(%rax)
+ mov %r10, %r14
+ shl $63, %r10
+ shr %r13
+ mov %r8, 24(rp)
+ lea 32(rp), rp
+L(L00): ADDSUB %rbx, %r12
+ jc L(c3)
+L(rc3): sub $4, n
+ ja L(top)
+
+L(end): or %r13, %r10
+ ADDSUB %r9, %r11
+ setc R8(%rbx)
+ mov %r12, %r13
+ shl $63, %r12
+ shr %r14
+ mov %r10, (rp)
+L(cj3): ADDSUB %rax, %r11
+ jnc 1f
+ mov $1, R8(%rbx)
+1: or %r14, %r12
+ mov %r11, %r14
+ shl $63, %r11
+ shr %r13
+ mov %r12, 8(rp)
+L(cj2): or %r13, %r11
+ shl $63, %rbx
+ shr %r14
+ mov %r11, 16(rp)
+L(cj1): or %r14, %rbx
+ mov %rbx, 24(rp)
+
+ mov R32(%r15), R32(%rax)
+ and $1, R32(%rax)
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ FUNC_EXIT()
+ ret
+L(c3): mov $1, R8(%rax)
+ jmp L(rc3)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm b/gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm
new file mode 100644
index 0000000..b7c1ee2
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm
@@ -0,0 +1,169 @@
+dnl x86-64 mpn_rshift optimized for Pentium 4.
+
+dnl Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 2.5
+C AMD K10 ?
+C Intel P4 3.29
+C Intel core2 2.1 (fluctuates, presumably cache related)
+C Intel corei ?
+C Intel atom 14.3
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+define(`cnt',`%cl')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_rshift)
+ FUNC_ENTRY(4)
+ mov (up), %rax
+ movd R32(%rcx), %mm4
+ neg R32(%rcx) C put lsh count in cl
+ and $63, R32(%rcx)
+ movd R32(%rcx), %mm5
+
+ lea -8(up,n,8), up
+ lea -8(rp,n,8), rp
+ lea 1(n), R32(%r8)
+ neg n
+
+ shl R8(%rcx), %rax C function return value
+
+ and $3, R32(%r8)
+ je L(rol) C jump for n = 3, 7, 11, ...
+
+ dec R32(%r8)
+ jne L(1)
+C n = 4, 8, 12, ...
+ movq 8(up,n,8), %mm2
+ psrlq %mm4, %mm2
+ movq 16(up,n,8), %mm0
+ psllq %mm5, %mm0
+ por %mm0, %mm2
+ movq %mm2, 8(rp,n,8)
+ inc n
+ jmp L(rol)
+
+L(1): dec R32(%r8)
+ je L(1x) C jump for n = 1, 5, 9, 13, ...
+C n = 2, 6, 10, 14, ...
+ movq 8(up,n,8), %mm2
+ psrlq %mm4, %mm2
+ movq 16(up,n,8), %mm0
+ psllq %mm5, %mm0
+ por %mm0, %mm2
+ movq %mm2, 8(rp,n,8)
+ inc n
+L(1x):
+ cmp $-1, n
+ je L(ast)
+ movq 8(up,n,8), %mm2
+ psrlq %mm4, %mm2
+ movq 16(up,n,8), %mm3
+ psrlq %mm4, %mm3
+ movq 16(up,n,8), %mm0
+ movq 24(up,n,8), %mm1
+ psllq %mm5, %mm0
+ por %mm0, %mm2
+ psllq %mm5, %mm1
+ por %mm1, %mm3
+ movq %mm2, 8(rp,n,8)
+ movq %mm3, 16(rp,n,8)
+ add $2, n
+
+L(rol): movq 8(up,n,8), %mm2
+ psrlq %mm4, %mm2
+ movq 16(up,n,8), %mm3
+ psrlq %mm4, %mm3
+
+ add $4, n C 4
+ jb L(end) C 2
+ ALIGN(32)
+L(top):
+ C finish stuff from lsh block
+ movq -16(up,n,8), %mm0
+ movq -8(up,n,8), %mm1
+ psllq %mm5, %mm0
+ por %mm0, %mm2
+ psllq %mm5, %mm1
+ movq (up,n,8), %mm0
+ por %mm1, %mm3
+ movq 8(up,n,8), %mm1
+ movq %mm2, -24(rp,n,8)
+ movq %mm3, -16(rp,n,8)
+ C start two new rsh
+ psllq %mm5, %mm0
+ psllq %mm5, %mm1
+
+ C finish stuff from rsh block
+ movq -8(up,n,8), %mm2
+ movq (up,n,8), %mm3
+ psrlq %mm4, %mm2
+ por %mm2, %mm0
+ psrlq %mm4, %mm3
+ movq 8(up,n,8), %mm2
+ por %mm3, %mm1
+ movq 16(up,n,8), %mm3
+ movq %mm0, -8(rp,n,8)
+ movq %mm1, (rp,n,8)
+ C start two new lsh
+ add $4, n
+ psrlq %mm4, %mm2
+ psrlq %mm4, %mm3
+
+ jae L(top) C 2
+L(end):
+ movq -8(up), %mm0
+ psllq %mm5, %mm0
+ por %mm0, %mm2
+ movq (up), %mm1
+ psllq %mm5, %mm1
+ por %mm1, %mm3
+ movq %mm2, -16(rp)
+ movq %mm3, -8(rp)
+
+L(ast): movq (up), %mm2
+ psrlq %mm4, %mm2
+ movq %mm2, (rp)
+ emms
+ FUNC_EXIT()
+ ret
+EPILOGUE()
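
Each result limb combines its source limb shifted right by cnt with the next limb shifted left by 64-cnt, and the bits falling off the bottom limb are returned in the top of the function value (the shl on %rax near the entry); the MMX pairing above simply produces two limbs per step. A scalar sketch of the contract, assuming 1 <= cnt <= 63 and rp == up or disjoint buffers:

#include <stdint.h>
#include <stddef.h>

/* Sketch of the mpn_rshift contract: rp[] = up[] >> cnt, returning the bits
   shifted out of up[0] in the most significant end of the return value.
   Straightforward illustration; the asm above does the same stitching with
   psrlq/psllq pairs. */
static uint64_t rshift_sketch(uint64_t *rp, const uint64_t *up,
                              size_t n, unsigned cnt)
{
    uint64_t ret = up[0] << (64 - cnt);              /* bits off the bottom */
    for (size_t i = 0; i < n - 1; i++)
        rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
    rp[n - 1] = up[n - 1] >> cnt;
    return ret;
}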
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sec_tabselect.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm
new file mode 100644
index 0000000..9725287
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sqr_basecase optimised for Intel Nocona.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sqr_basecase)
+include_mpn(`x86_64/core2/sqr_basecase.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/popham.asm b/gmp-6.3.0/mpn/x86_64/popham.asm
new file mode 100644
index 0000000..3a29b2e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/popham.asm
@@ -0,0 +1,163 @@
+dnl AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
+
+dnl Copyright 2004, 2005, 2007, 2010-2012, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C popcount hamdist
+C cycles/limb cycles/limb
+C AMD K8,K9 6 7
+C AMD K10 6 7
+C Intel P4 12 14.3
+C Intel core2 7 8
+C Intel corei ? 7.3
+C Intel atom 16.5 17.5
+C VIA nano 8.75 10.4
+
+C TODO
+C * Tune. It should be possible to reach 5 c/l for popcount and 6 c/l for
+C hamdist for K8/K9.
+
+
+ifdef(`OPERATION_popcount',`
+ define(`func',`mpn_popcount')
+ define(`up', `%rdi')
+ define(`n', `%rsi')
+ define(`h55555555', `%r10')
+ define(`h33333333', `%r11')
+ define(`h0f0f0f0f', `%rcx')
+ define(`h01010101', `%rdx')
+ define(`POP', `$1')
+ define(`HAM', `dnl')
+')
+ifdef(`OPERATION_hamdist',`
+ define(`func',`mpn_hamdist')
+ define(`up', `%rdi')
+ define(`vp', `%rsi')
+ define(`n', `%rdx')
+ define(`h55555555', `%r10')
+ define(`h33333333', `%r11')
+ define(`h0f0f0f0f', `%rcx')
+ define(`h01010101', `%r12')
+ define(`POP', `dnl')
+ define(`HAM', `$1')
+')
+
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ POP(` FUNC_ENTRY(2) ')
+ HAM(` FUNC_ENTRY(3) ')
+ push %rbx
+ mov $0x5555555555555555, h55555555
+ push %rbp
+ mov $0x3333333333333333, h33333333
+ HAM(` push %r12 ')
+ lea (up,n,8), up
+ mov $0x0f0f0f0f0f0f0f0f, h0f0f0f0f
+ HAM(` lea (vp,n,8), vp ')
+ neg n
+ mov $0x0101010101010101, h01010101
+ xor R32(%rax), R32(%rax)
+ test $1, R8(n)
+ jz L(top)
+
+ mov (up,n,8), %r8
+ HAM(` xor (vp,n,8), %r8 ')
+
+ mov %r8, %r9
+ shr %r8
+ and h55555555, %r8
+ sub %r8, %r9
+
+ mov %r9, %r8
+ shr $2, %r9
+ and h33333333, %r8
+ and h33333333, %r9
+ add %r8, %r9 C 16 4-bit fields (0..4)
+
+ dec n
+ jmp L(mid)
+
+ ALIGN(16)
+L(top): mov (up,n,8), %r8
+ mov 8(up,n,8), %rbx
+ HAM(` xor (vp,n,8), %r8 ')
+ HAM(` xor 8(vp,n,8), %rbx ')
+
+ mov %r8, %r9
+ mov %rbx, %rbp
+ shr %r8
+ shr %rbx
+ and h55555555, %r8
+ and h55555555, %rbx
+ sub %r8, %r9
+ sub %rbx, %rbp
+
+ mov %r9, %r8
+ mov %rbp, %rbx
+ shr $2, %r9
+ shr $2, %rbp
+ and h33333333, %r8
+ and h33333333, %r9
+ and h33333333, %rbx
+ and h33333333, %rbp
+ add %r8, %r9 C 16 4-bit fields (0..4)
+ add %rbx, %rbp C 16 4-bit fields (0..4)
+
+ add %rbp, %r9 C 16 4-bit fields (0..8)
+L(mid): mov %r9, %r8
+ shr $4, %r9
+ and h0f0f0f0f, %r8
+ and h0f0f0f0f, %r9
+ add %r8, %r9 C 8 8-bit fields (0..16)
+
+ imul h01010101, %r9 C sum the 8 fields in high 8 bits
+ shr $56, %r9
+
+ add %r9, %rax C add to total
+ add $2, n
+ jnc L(top)
+
+L(end):
+ HAM(` pop %r12 ')
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
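
The loop is the classic SWAR reduction: 1-bit fields are folded into 2-bit, 4-bit and then 8-bit sums using the 0x55..., 0x33... and 0x0f... masks, and the 0x0101... multiply adds the eight byte sums into the top byte (the shr $56). For hamdist the same fold is applied to up[i] ^ vp[i]. A minimal per-word C version of the same technique (the asm additionally sums two limbs' 4-bit fields before the 8-bit fold):

#include <stdint.h>
#include <stddef.h>

/* SWAR popcount of one 64-bit word, mirroring the mask-and-fold steps above. */
static unsigned popcount64(uint64_t x)
{
    x -= (x >> 1) & 0x5555555555555555ull;                 /* 2-bit sums */
    x  = (x & 0x3333333333333333ull)
       + ((x >> 2) & 0x3333333333333333ull);               /* 4-bit sums */
    x  = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0full;           /* 8-bit sums */
    return (unsigned)((x * 0x0101010101010101ull) >> 56);  /* add the bytes */
}

/* Hamming distance is the same fold applied to the XOR of the operands. */
static uint64_t hamdist_sketch(const uint64_t *up, const uint64_t *vp, size_t n)
{
    uint64_t cnt = 0;
    for (size_t i = 0; i < n; i++)
        cnt += popcount64(up[i] ^ vp[i]);
    return cnt;
}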
diff --git a/gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm
new file mode 100644
index 0000000..a3e9cc5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm
@@ -0,0 +1,189 @@
+dnl AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1
+dnl AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1
+
+dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.14 (mpn_add_n + mpn_rshift need 4.125)
+C AMD K10 2.14 (mpn_add_n + mpn_rshift need 4.125)
+C Intel P4 12.75
+C Intel core2 3.75
+C Intel NMH 4.4
+C Intel SBR ?
+C Intel atom ?
+C VIA nano 3.25
+
+C TODO
+C * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_rsh1add_n)
+ define(func_nc, mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsh1sub_n)
+ define(func_nc, mpn_rsh1sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+
+ xor R32(%rax), R32(%rax)
+ neg %r8 C set C flag from parameter
+ mov (up), %rbx
+ ADCSBB (vp), %rbx
+ jmp L(ent)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbx
+
+ xor R32(%rax), R32(%rax)
+ mov (up), %rbx
+ ADDSUB (vp), %rbx
+L(ent):
+ rcr %rbx C rotate, save acy
+ adc R32(%rax), R32(%rax) C return value
+
+ mov R32(n), R32(%r11)
+ and $3, R32(%r11)
+
+ cmp $1, R32(%r11)
+ je L(do) C jump if n = 1 5 9 ...
+
+L(n1): cmp $2, R32(%r11)
+ jne L(n2) C jump unless n = 2 6 10 ...
+ add %rbx, %rbx C rotate carry limb, restore acy
+ mov 8(up), %r10
+ ADCSBB 8(vp), %r10
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ rcr %r10
+ rcr %rbx
+ mov %rbx, -8(rp)
+ jmp L(cj1)
+
+L(n2): cmp $3, R32(%r11)
+ jne L(n3) C jump unless n = 3 7 11 ...
+ add %rbx, %rbx C rotate carry limb, restore acy
+ mov 8(up), %r9
+ mov 16(up), %r10
+ ADCSBB 8(vp), %r9
+ ADCSBB 16(vp), %r10
+ lea 16(up), up
+ lea 16(vp), vp
+ lea 16(rp), rp
+ rcr %r10
+ rcr %r9
+ rcr %rbx
+ mov %rbx, -16(rp)
+ jmp L(cj2)
+
+L(n3): dec n C come here for n = 4 8 12 ...
+ add %rbx, %rbx C rotate carry limb, restore acy
+ mov 8(up), %r8
+ mov 16(up), %r9
+ ADCSBB 8(vp), %r8
+ ADCSBB 16(vp), %r9
+ mov 24(up), %r10
+ ADCSBB 24(vp), %r10
+ lea 24(up), up
+ lea 24(vp), vp
+ lea 24(rp), rp
+ rcr %r10
+ rcr %r9
+ rcr %r8
+ rcr %rbx
+ mov %rbx, -24(rp)
+ mov %r8, -16(rp)
+L(cj2): mov %r9, -8(rp)
+L(cj1): mov %r10, %rbx
+
+L(do):
+ shr $2, n C 4
+ je L(end) C 2
+ ALIGN(16)
+L(top): add %rbx, %rbx C rotate carry limb, restore acy
+
+ mov 8(up), %r8
+ mov 16(up), %r9
+ ADCSBB 8(vp), %r8
+ ADCSBB 16(vp), %r9
+ mov 24(up), %r10
+ mov 32(up), %r11
+ ADCSBB 24(vp), %r10
+ ADCSBB 32(vp), %r11
+
+ lea 32(up), up
+ lea 32(vp), vp
+
+ rcr %r11 C rotate, save acy
+ rcr %r10
+ rcr %r9
+ rcr %r8
+
+ rcr %rbx
+ mov %rbx, (rp)
+ mov %r8, 8(rp)
+ mov %r9, 16(rp)
+ mov %r10, 24(rp)
+ mov %r11, %rbx
+
+ lea 32(rp), rp
+ dec n
+ jne L(top)
+
+L(end): mov %rbx, (rp)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
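
For reference, the operation implemented above can be modeled in plain C as follows (a two-pass sketch for the add variant, assuming 64-bit limbs with uint64_t in place of mp_limb_t; the sub variant is the same with borrows, and the asm fuses both passes into a single one using rcr). The names below are illustrative, not GMP API:

    #include <stdint.h>
    #include <stddef.h>

    /* rp[] = (up[] + vp[]) >> 1 for n limbs; returns the bit shifted out,
       i.e. the least significant bit of the full sum. */
    uint64_t rsh1add_n_ref(uint64_t *rp, const uint64_t *up,
                           const uint64_t *vp, size_t n)
    {
        uint64_t *s = rp;          /* reuse rp as scratch for the raw sum */
        uint64_t cy = 0;           /* carry between limb additions */
        for (size_t i = 0; i < n; i++) {
            uint64_t u = up[i], t = u + vp[i] + cy;
            cy = (t < u) || (cy && t == u);
            s[i] = t;
        }
        uint64_t retval = s[0] & 1;          /* the bit that falls off the end */
        for (size_t i = 0; i < n - 1; i++)
            rp[i] = (s[i] >> 1) | (s[i + 1] << 63);
        rp[n - 1] = (s[n - 1] >> 1) | (cy << 63);
        return retval;
    }
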
diff --git a/gmp-6.3.0/mpn/x86_64/rshift.asm b/gmp-6.3.0/mpn/x86_64/rshift.asm
new file mode 100644
index 0000000..3f344f1
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/rshift.asm
@@ -0,0 +1,176 @@
+dnl AMD64 mpn_rshift -- mpn right shift.
+
+dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 2.375
+C AMD K10 2.375
+C Intel P4 8
+C Intel core2 2.11
+C Intel corei ?
+C Intel atom 5.75
+C VIA nano 3.5
+
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_rshift)
+ FUNC_ENTRY(4)
+ neg R32(%rcx) C put lsh count in cl
+ mov (up), %rax
+ shl R8(%rcx), %rax C function return value
+ neg R32(%rcx) C put rsh count in cl
+
+ lea 1(n), R32(%r8)
+
+ lea -8(up,n,8), up
+ lea -8(rp,n,8), rp
+ neg n
+
+ and $3, R32(%r8)
+ je L(rlx) C jump for n = 3, 7, 11, ...
+
+ dec R32(%r8)
+ jne L(1)
+C n = 4, 8, 12, ...
+ mov 8(up,n,8), %r10
+ shr R8(%rcx), %r10
+ neg R32(%rcx) C put lsh count in cl
+ mov 16(up,n,8), %r8
+ shl R8(%rcx), %r8
+ or %r8, %r10
+ mov %r10, 8(rp,n,8)
+ inc n
+ jmp L(rll)
+
+L(1): dec R32(%r8)
+ je L(1x) C jump for n = 1, 5, 9, 13, ...
+C n = 2, 6, 10, 14, ...
+ mov 8(up,n,8), %r10
+ shr R8(%rcx), %r10
+ neg R32(%rcx) C put lsh count in cl
+ mov 16(up,n,8), %r8
+ shl R8(%rcx), %r8
+ or %r8, %r10
+ mov %r10, 8(rp,n,8)
+ inc n
+ neg R32(%rcx) C put rsh count in cl
+L(1x):
+ cmp $-1, n
+ je L(ast)
+ mov 8(up,n,8), %r10
+ shr R8(%rcx), %r10
+ mov 16(up,n,8), %r11
+ shr R8(%rcx), %r11
+ neg R32(%rcx) C put lsh count in cl
+ mov 16(up,n,8), %r8
+ mov 24(up,n,8), %r9
+ shl R8(%rcx), %r8
+ or %r8, %r10
+ shl R8(%rcx), %r9
+ or %r9, %r11
+ mov %r10, 8(rp,n,8)
+ mov %r11, 16(rp,n,8)
+ add $2, n
+
+L(rll): neg R32(%rcx) C put rsh count in cl
+L(rlx): mov 8(up,n,8), %r10
+ shr R8(%rcx), %r10
+ mov 16(up,n,8), %r11
+ shr R8(%rcx), %r11
+
+ add $4, n C 4
+ jb L(end) C 2
+ ALIGN(16)
+L(top):
+ C finish stuff from rsh block
+ neg R32(%rcx) C put lsh count in cl
+ mov -16(up,n,8), %r8
+ mov -8(up,n,8), %r9
+ shl R8(%rcx), %r8
+ or %r8, %r10
+ shl R8(%rcx), %r9
+ or %r9, %r11
+ mov %r10, -24(rp,n,8)
+ mov %r11, -16(rp,n,8)
+ C start two new lsh
+ mov (up,n,8), %r8
+ mov 8(up,n,8), %r9
+ shl R8(%rcx), %r8
+ shl R8(%rcx), %r9
+
+ C finish stuff from lsh block
+ neg R32(%rcx) C put rsh count in cl
+ mov -8(up,n,8), %r10
+ mov 0(up,n,8), %r11
+ shr R8(%rcx), %r10
+ or %r10, %r8
+ shr R8(%rcx), %r11
+ or %r11, %r9
+ mov %r8, -8(rp,n,8)
+ mov %r9, 0(rp,n,8)
+ C start two new rsh
+ mov 8(up,n,8), %r10
+ mov 16(up,n,8), %r11
+ shr R8(%rcx), %r10
+ shr R8(%rcx), %r11
+
+ add $4, n
+ jae L(top) C 2
+L(end):
+ neg R32(%rcx) C put lsh count in cl
+ mov -8(up), %r8
+ shl R8(%rcx), %r8
+ or %r8, %r10
+ mov (up), %r9
+ shl R8(%rcx), %r9
+ or %r9, %r11
+ mov %r10, -16(rp)
+ mov %r11, -8(rp)
+
+ neg R32(%rcx) C put rsh count in cl
+L(ast): mov (up), %r10
+ shr R8(%rcx), %r10
+ mov %r10, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
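
The routine above produces rp[i] = (up[i] >> cnt) | (up[i+1] << (64-cnt)) and returns the low bits of up[0] that fall off the end, left-aligned in a limb. A minimal C model (assuming 64-bit limbs and 1 <= cnt <= 63, with uint64_t in place of mp_limb_t; a sketch, not GMP's own code):

    #include <stdint.h>
    #include <stddef.h>

    uint64_t rshift_ref(uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
    {
        uint64_t retval = up[0] << (64 - cnt);   /* discarded low bits, top-aligned */
        for (size_t i = 0; i < n - 1; i++)
            rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
        rp[n - 1] = up[n - 1] >> cnt;            /* zero bits enter at the top */
        return retval;
    }
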
diff --git a/gmp-6.3.0/mpn/x86_64/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/sec_tabselect.asm
new file mode 100644
index 0000000..e8aed26
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/sec_tabselect.asm
@@ -0,0 +1,176 @@
+dnl AMD64 mpn_sec_tabselect.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb good for cpu
+C AMD K8,K9 1.5 Y
+C AMD K10 1.4
+C AMD bd1 2.64
+C AMD bobcat 2.15 Y
+C Intel P4 4
+C Intel core2 1.38
+C Intel NHM 1.75
+C Intel SBR 1.25
+C Intel atom 2.5 Y
+C VIA nano 1.75 Y
+
+C NOTES
+C * This has not been tuned for any specific processor. Its speed should not
+C be too bad, though.
+C * Using SSE2/AVX2 could result in many-fold speedup.
+C * WORKS FOR n mod 4 = 0 ONLY!
+
+C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `%rdi')
+define(`tp', `%rsi')
+define(`n', `%rdx')
+define(`nents', `%rcx')
+define(`which', `%r8')
+
+define(`i', `%rbp')
+define(`j', `%r9')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+C nents n rp tab i which j * * * * * *
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sec_tabselect)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov n, j
+ add $-4, j
+ js L(outer_end)
+
+L(outer_top):
+ mov nents, i
+ push tp
+ xor R32(%r12), R32(%r12)
+ xor R32(%r13), R32(%r13)
+ xor R32(%r14), R32(%r14)
+ xor R32(%r15), R32(%r15)
+ mov which, %rbx
+
+ ALIGN(16)
+L(top): sub $1, %rbx
+ sbb %rax, %rax
+ mov 0(tp), %r10
+ mov 8(tp), %r11
+ and %rax, %r10
+ and %rax, %r11
+ or %r10, %r12
+ or %r11, %r13
+ mov 16(tp), %r10
+ mov 24(tp), %r11
+ and %rax, %r10
+ and %rax, %r11
+ or %r10, %r14
+ or %r11, %r15
+ lea (tp,n,8), tp
+ add $-1, i
+ jne L(top)
+
+ mov %r12, 0(rp)
+ mov %r13, 8(rp)
+ mov %r14, 16(rp)
+ mov %r15, 24(rp)
+ pop tp
+ lea 32(tp), tp
+ lea 32(rp), rp
+ add $-4, j
+ jns L(outer_top)
+L(outer_end):
+
+ test $2, R8(n)
+ jz L(b0x)
+L(b1x): mov nents, i
+ push tp
+ xor R32(%r12), R32(%r12)
+ xor R32(%r13), R32(%r13)
+ mov which, %rbx
+ ALIGN(16)
+L(tp2): sub $1, %rbx
+ sbb %rax, %rax
+ mov 0(tp), %r10
+ mov 8(tp), %r11
+ and %rax, %r10
+ and %rax, %r11
+ or %r10, %r12
+ or %r11, %r13
+ lea (tp,n,8), tp
+ add $-1, i
+ jne L(tp2)
+ mov %r12, 0(rp)
+ mov %r13, 8(rp)
+ pop tp
+ lea 16(tp), tp
+ lea 16(rp), rp
+
+L(b0x): test $1, R8(n)
+ jz L(b00)
+L(b01): mov nents, i
+ xor R32(%r12), R32(%r12)
+ mov which, %rbx
+ ALIGN(16)
+L(tp1): sub $1, %rbx
+ sbb %rax, %rax
+ mov 0(tp), %r10
+ and %rax, %r10
+ or %r10, %r12
+ lea (tp,n,8), tp
+ add $-1, i
+ jne L(tp1)
+ mov %r12, 0(rp)
+
+L(b00): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
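
As the NOTES say, the point of this routine is side-channel resistance: every one of the nents table entries is read, and the wanted entry is kept via an all-ones/all-zero mask rather than a branch. A C model of the semantics (uint64_t and size_t standing in for mp_limb_t and mp_size_t; note that the comparison below may still compile to a branch, whereas the sub/sbb sequence in the asm is what actually guarantees branch-free masking):

    #include <stdint.h>
    #include <stddef.h>

    /* Copy entry `which` of a table of nents entries, n limbs each,
       into rp, reading every entry regardless of `which`. */
    void sec_tabselect_ref(uint64_t *rp, const uint64_t *tp,
                           size_t n, size_t nents, size_t which)
    {
        for (size_t j = 0; j < n; j++)
            rp[j] = 0;
        for (size_t k = 0; k < nents; k++) {
            uint64_t mask = (uint64_t)0 - (uint64_t)(k == which);  /* ~0 or 0 */
            for (size_t j = 0; j < n; j++)
                rp[j] |= tp[k * n + j] & mask;
        }
    }
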
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm
new file mode 100644
index 0000000..98c26cf
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm
@@ -0,0 +1,50 @@
+dnl X86-64 mpn_addlsh1_n/mpn_rsblsh1_n optimised for Intel Silvermont.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 63)
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_addlsh1_n)')
+ifdef(`OPERATION_rsblsh1_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_rsblsh1_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm
new file mode 100644
index 0000000..2a83217
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm
@@ -0,0 +1,50 @@
+dnl X86-64 mpn_addlsh2_n/mpn_rsblsh2_n optimised for Intel Silvermont.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_addlsh2_n)')
+ifdef(`OPERATION_rsblsh2_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_rsblsh2_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm b/gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm
new file mode 100644
index 0000000..dce3d75
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Silvermont.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+include_mpn(`x86_64/coreisbr/aors_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm
new file mode 100644
index 0000000..ead0d76
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_addmul_1/mpn_submul_1 optimised for Intel Silvermont.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+include_mpn(`x86_64/core2/aorsmul_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h
new file mode 100644
index 0000000..f8cb0f4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h
@@ -0,0 +1,252 @@
+/* Intel Silvermont gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should disable lib inclusion. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 2400 MHz Intel Atom C2758 Silvermont/Rangeley */
+/* FFT tuning limit = 468153400 */
+/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 55
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+
+#define DIV_1_VS_MUL_1_PERCENT 168
+
+#define MUL_TOOM22_THRESHOLD 19
+#define MUL_TOOM33_THRESHOLD 66
+#define MUL_TOOM44_THRESHOLD 152
+#define MUL_TOOM6H_THRESHOLD 222
+#define MUL_TOOM8H_THRESHOLD 333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 88
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 97
+#define SQR_TOOM4_THRESHOLD 232
+#define SQR_TOOM6_THRESHOLD 286
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 24
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 15
+
+#define MUL_FFT_MODF_THRESHOLD 340 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 340, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \
+ { 23, 7}, { 17, 8}, { 9, 7}, { 21, 8}, \
+ { 11, 7}, { 23, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \
+ { 135,11}, { 79, 9}, { 319,11}, { 95,10}, \
+ { 191, 9}, { 383,10}, { 207, 9}, { 415,11}, \
+ { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271,11}, { 143,10}, { 287, 9}, \
+ { 575,10}, { 303,11}, { 159,10}, { 319,12}, \
+ { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
+ { 415,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 319,10}, { 639,11}, { 351,10}, \
+ { 703, 9}, { 1407,12}, { 191,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 479,13}, { 127,12}, \
+ { 255,11}, { 543,12}, { 287,11}, { 575,10}, \
+ { 1151,12}, { 319,11}, { 639,12}, { 351,11}, \
+ { 703,10}, { 1407,13}, { 191,12}, { 415,11}, \
+ { 831,10}, { 1663,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 543,11}, { 1087,10}, { 2175,12}, \
+ { 575,11}, { 1151,13}, { 319,12}, { 639,11}, \
+ { 1279,12}, { 703,11}, { 1407,13}, { 383,12}, \
+ { 831,11}, { 1663,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \
+ { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \
+ { 639,12}, { 1279,13}, { 703,12}, { 1407,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 959,15}, \
+ { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
+ { 1215,12}, { 2431,11}, { 4863,14}, { 639,13}, \
+ { 1407,12}, { 2815,13}, { 1471,12}, { 2943,11}, \
+ { 5887,14}, { 767,13}, { 1663,14}, { 895,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2559,14}, { 1407,13}, { 2943,12}, { 5887,15}, \
+ { 767,14}, { 1663,13}, { 3455,12}, { 6911,14}, \
+ { 1919,16}, { 511,15}, { 1023,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \
+ { 11775,15}, { 1535,14}, { 3455,13}, { 6911,15}, \
+ { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \
+ { 2047,14}, { 4223,15}, { 2303,14}, { 4863,15}, \
+ { 2815,14}, { 5887,13}, { 11775,16}, { 1535,15}, \
+ { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \
+ { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \
+ { 4607,15}, { 9215,16}, { 5631,15}, { 11775,17}, \
+ { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 225
+#define MUL_FFT_THRESHOLD 3712
+
+#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 17, 8}, \
+ { 9, 7}, { 21, 8}, { 11, 7}, { 23, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \
+ { 63,10}, { 39, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 79,11}, { 47,10}, \
+ { 95,12}, { 31,11}, { 63,10}, { 127, 9}, \
+ { 255, 8}, { 511, 9}, { 271, 8}, { 543,11}, \
+ { 79,10}, { 159, 9}, { 319, 8}, { 639,10}, \
+ { 175,11}, { 95,10}, { 191, 9}, { 383,10}, \
+ { 207, 9}, { 415,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \
+ { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \
+ { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
+ { 415,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 351,10}, { 703,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,10}, { 831,12}, \
+ { 223,11}, { 479,12}, { 255,11}, { 543,12}, \
+ { 287,11}, { 575,12}, { 319,11}, { 639,12}, \
+ { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 479,13}, { 255,12}, { 543,11}, { 1087,10}, \
+ { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 639,11}, { 1279,12}, { 703,11}, \
+ { 1407,13}, { 383,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 895,14}, { 255,13}, { 511,12}, \
+ { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1279,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 831,12}, { 1663,13}, \
+ { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \
+ { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \
+ { 1471,12}, { 2943,14}, { 767,13}, { 1663,14}, \
+ { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2943,15}, \
+ { 767,14}, { 1663,13}, { 3455,12}, { 6911,14}, \
+ { 1791,13}, { 3583,16}, { 511,15}, { 1023,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \
+ { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \
+ { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \
+ { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \
+ { 1535,15}, { 3071,14}, { 6143,15}, { 3327,14}, \
+ { 6911,15}, { 3839,14}, { 7679,17}, { 1023,16}, \
+ { 2047,15}, { 4863,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \
+ { 7679,14}, { 15359,17}, { 2047,16}, { 4607,15}, \
+ { 9983,16}, { 5631,15}, { 11775,17}, { 3071,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 232
+#define SQR_FFT_THRESHOLD 2752
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 55
+#define MULLO_MUL_N_THRESHOLD 6633
+#define SQRLO_BASECASE_THRESHOLD 9
+#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD 5397
+
+#define DC_DIV_QR_THRESHOLD 33
+#define DC_DIVAPPR_Q_THRESHOLD 222
+#define DC_BDIV_QR_THRESHOLD 31
+#define DC_BDIV_Q_THRESHOLD 147
+
+#define INV_MULMOD_BNM1_THRESHOLD 37
+#define INV_NEWTON_THRESHOLD 222
+#define INV_APPR_THRESHOLD 222
+
+#define BINV_NEWTON_THRESHOLD 212
+#define REDC_1_TO_REDC_2_THRESHOLD 55
+#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
+
+#define MU_DIV_QR_THRESHOLD 1142
+#define MU_DIVAPPR_Q_THRESHOLD 1142
+#define MUPI_DIV_QR_THRESHOLD 81
+#define MU_BDIV_QR_THRESHOLD 942
+#define MU_BDIV_Q_THRESHOLD 1043
+
+#define POWM_SEC_TABLE 1,34,102,588,1730
+
+#define GET_STR_DC_THRESHOLD 17
+#define GET_STR_PRECOMPUTE_THRESHOLD 30
+#define SET_STR_DC_THRESHOLD 381
+#define SET_STR_PRECOMPUTE_THRESHOLD 1659
+
+#define FAC_DSC_THRESHOLD 351
+#define FAC_ODD_THRESHOLD 27
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD2_DIV1_METHOD 3 /* 3.06% faster than 1 */
+#define HGCD_THRESHOLD 120
+#define HGCD_APPR_THRESHOLD 153
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 416
+#define GCDEXT_DC_THRESHOLD 309
+#define JACOBI_BASE_METHOD 1 /* 2.28% faster than 3 */
+
+/* Tuneup completed successfully, took 938046 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm b/gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm
new file mode 100644
index 0000000..848ed01
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm
@@ -0,0 +1,38 @@
+dnl x86-64 mpn_hamdist.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86_64/coreinhm/hamdist.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm b/gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm
new file mode 100644
index 0000000..acd3180
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshift optimised for Intel Silvermont.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm
new file mode 100644
index 0000000..3a68bb5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshiftc optimised for Intel Silvermont.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm b/gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm
new file mode 100644
index 0000000..c1e1c94
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_mul_1 optimised for Intel Silvermont.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c)
+include_mpn(`x86_64/bd1/mul_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm
new file mode 100644
index 0000000..6228c48
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_mul_basecase optimised for Intel Silvermont.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_basecase)
+include_mpn(`x86_64/k8/mul_basecase.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm
new file mode 100644
index 0000000..0244f8a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_mullo_basecase optimised for Intel Silvermont.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mullo_basecase)
+include_mpn(`x86_64/k8/mullo_basecase.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm b/gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm
new file mode 100644
index 0000000..73eb7b5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm
@@ -0,0 +1,38 @@
+dnl x86-64 mpn_popcount.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86_64/coreinhm/popcount.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm b/gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm
new file mode 100644
index 0000000..b84371c
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_rshift optimised for Intel Silvermont.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm
new file mode 100644
index 0000000..afccf93
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sqr_basecase optimised for Intel Silvermont.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sqr_basecase)
+include_mpn(`x86_64/k8/sqr_basecase.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h
new file mode 100644
index 0000000..a899ea1
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h
@@ -0,0 +1,246 @@
+/* Skylake gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should disable lib inclusion. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3600-4000 MHz Intel Xeon E3-1270v5 Skylake */
+/* FFT tuning limit = 465,990,371 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 32
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 41
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
+
+#define DIV_1_VS_MUL_1_PERCENT 473
+
+#define MUL_TOOM22_THRESHOLD 26
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 208
+#define MUL_TOOM6H_THRESHOLD 300
+#define MUL_TOOM8H_THRESHOLD 406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 151
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 117
+#define SQR_TOOM4_THRESHOLD 336
+#define SQR_TOOM6_THRESHOLD 426
+#define SQR_TOOM8_THRESHOLD 547
+
+#define MULMID_TOOM42_THRESHOLD 46
+
+#define MULMOD_BNM1_THRESHOLD 15
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 404, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 28, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \
+ { 15, 9}, { 39, 8}, { 79, 9}, { 43,10}, \
+ { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \
+ { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 99,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 103,12}, { 31,11}, { 63,10}, \
+ { 135,11}, { 79,10}, { 167,11}, { 95,10}, \
+ { 191, 9}, { 383,10}, { 199,11}, { 111,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543,11}, { 143,10}, { 287, 9}, \
+ { 575,11}, { 159,12}, { 95,11}, { 191,10}, \
+ { 383,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,12}, { 223,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 543,12}, { 287,11}, { 607,12}, \
+ { 319,11}, { 671,12}, { 351,11}, { 703,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,12}, { 479,14}, { 127,13}, { 255,12}, \
+ { 543,11}, { 1087,12}, { 607,13}, { 319,12}, \
+ { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \
+ { 831,13}, { 447,12}, { 959,13}, { 511,12}, \
+ { 1087,13}, { 575,12}, { 1151,13}, { 639,12}, \
+ { 1343,13}, { 703,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1727,13}, { 959,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \
+ { 639,13}, { 1343,12}, { 2687,13}, { 1407,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1663,14}, \
+ { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \
+ { 3455,12}, { 6911,14}, { 1791,16}, { 511,15}, \
+ { 1023,14}, { 2175,13}, { 4351,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \
+ { 4223,15}, { 2303,14}, { 4863,15}, { 2559,14}, \
+ { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \
+ { 1023,16}, { 2047,15}, { 4351,14}, { 8703,15}, \
+ { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \
+ { 3071,15}, { 6911,16}, { 3583,15}, { 7679,14}, \
+ { 15359,15}, { 7935,17}, { 2047,16}, { 4095,15}, \
+ { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 227
+#define MUL_FFT_THRESHOLD 6272
+
+#define SQR_FFT_MODF_THRESHOLD 400 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 400, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 29, 7}, { 15, 6}, { 31, 7}, { 28, 8}, \
+ { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \
+ { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 79,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319,11}, { 95,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319,12}, { 95,10}, { 383,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,11}, { 367,10}, { 735,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,12}, { 223,11}, \
+ { 479,13}, { 127,12}, { 255,11}, { 543,12}, \
+ { 287,11}, { 607,12}, { 319,11}, { 671,12}, \
+ { 351,11}, { 735,12}, { 383,11}, { 799,12}, \
+ { 415,11}, { 831,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 607,13}, \
+ { 319,12}, { 735,13}, { 383,12}, { 831,13}, \
+ { 447,12}, { 959,13}, { 511,12}, { 1023,13}, \
+ { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \
+ { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1727,13}, { 895,12}, \
+ { 1791,13}, { 959,14}, { 511,13}, { 1087,12}, \
+ { 2175,13}, { 1151,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1407,14}, { 767,13}, { 1599,12}, \
+ { 3199,13}, { 1663,14}, { 895,13}, { 1791,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
+ { 1407,15}, { 767,14}, { 1535,13}, { 3199,14}, \
+ { 1663,13}, { 3455,14}, { 1791,16}, { 511,15}, \
+ { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
+ { 2943,13}, { 5887,15}, { 1535,14}, { 3455,15}, \
+ { 1791,14}, { 3839,16}, { 1023,15}, { 2047,14}, \
+ { 4223,15}, { 2303,14}, { 4863,15}, { 2559,14}, \
+ { 5119,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3071,14}, { 6143,15}, { 3327,14}, { 6911,15}, \
+ { 3839,17}, { 1023,16}, { 2047,15}, { 4863,16}, \
+ { 2559,15}, { 5887,14}, { 11775,16}, { 3071,15}, \
+ { 6911,16}, { 3583,15}, { 7679,14}, { 15359,17}, \
+ { 2047,16}, { 4095,15}, { 8191,16}, { 4607,15}, \
+ { 9983,14}, { 19967,16}, { 5631,15}, { 11775,17}, \
+ { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 205
+#define SQR_FFT_THRESHOLD 4224
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 79
+#define MULLO_MUL_N_THRESHOLD 11278
+#define SQRLO_BASECASE_THRESHOLD 10
+#define SQRLO_DC_THRESHOLD 109
+#define SQRLO_SQR_THRESHOLD 8207
+
+#define DC_DIV_QR_THRESHOLD 55
+#define DC_DIVAPPR_Q_THRESHOLD 179
+#define DC_BDIV_QR_THRESHOLD 82
+#define DC_BDIV_Q_THRESHOLD 166
+
+#define INV_MULMOD_BNM1_THRESHOLD 50
+#define INV_NEWTON_THRESHOLD 170
+#define INV_APPR_THRESHOLD 171
+
+#define BINV_NEWTON_THRESHOLD 294
+#define REDC_1_TO_REDC_2_THRESHOLD 33
+#define REDC_2_TO_REDC_N_THRESHOLD 59
+
+#define MU_DIV_QR_THRESHOLD 1528
+#define MU_DIVAPPR_Q_THRESHOLD 1589
+#define MUPI_DIV_QR_THRESHOLD 62
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1597
+
+#define POWM_SEC_TABLE 2,8,191,452,904
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 19
+#define SET_STR_DC_THRESHOLD 898
+#define SET_STR_PRECOMPUTE_THRESHOLD 1670
+
+#define FAC_DSC_THRESHOLD 474
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD2_DIV1_METHOD 5 /* 3.85% faster than 3 */
+#define HGCD_THRESHOLD 64
+#define HGCD_APPR_THRESHOLD 60
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 618
+#define GCDEXT_DC_THRESHOLD 321
+#define JACOBI_BASE_METHOD 1 /* 12.01% faster than 4 */
+
+/* Tuneup completed successfully, took 213784 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm b/gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000..f486125
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm
@@ -0,0 +1,116 @@
+dnl AMD64 mpn_sqr_diag_addlsh1
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.5
+C AMD K10 2.5
+C AMD bull 3.6
+C AMD pile 3.6
+C AMD steam ?
+C AMD bobcat 4
+C AMD jaguar ?
+C Intel P4 11.5
+C Intel core 4
+C Intel NHM 3.6
+C Intel SBR 3.15
+C Intel IBR 3.0
+C Intel HWL 2.6
+C Intel BWL ?
+C Intel atom 14
+C VIA nano 3.5
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi')
+define(`tp', `%rsi')
+define(`up_arg', `%rdx')
+define(`n', `%rcx')
+
+define(`up', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_sqr_diag_addlsh1)
+ FUNC_ENTRY(4)
+ push %rbx
+
+ dec n
+ shl n
+
+ mov (up_arg), %rax
+
+ lea (rp,n,8), rp
+ lea (tp,n,8), tp
+ lea (up_arg,n,4), up
+ neg n
+
+ mul %rax
+ mov %rax, (rp,n,8)
+
+ xor R32(%rbx), R32(%rbx)
+ jmp L(mid)
+
+ ALIGN(16)
+L(top): add %r10, %r8
+ adc %rax, %r9
+ mov %r8, -8(rp,n,8)
+ mov %r9, (rp,n,8)
+L(mid): mov 8(up,n,4), %rax
+ mov (tp,n,8), %r8
+ mov 8(tp,n,8), %r9
+ adc %r8, %r8
+ adc %r9, %r9
+ lea (%rdx,%rbx), %r10
+ setc R8(%rbx)
+ mul %rax
+ add $2, n
+ js L(top)
+
+L(end): add %r10, %r8
+ adc %rax, %r9
+ mov %r8, I(-8(rp),-8(rp,n,8))
+ mov %r9, I((rp),(rp,n,8))
+ adc %rbx, %rdx
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
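
As read from the code above, the routine folds the doubled cross products held in {tp, 2n-2} together with the diagonal squares up[i]^2, i.e. {rp, 2n} = 2*{tp, 2n-2}*B + sum over i of up[i]^2 * B^(2i), with B = 2^64. A C model (n >= 2, using the gcc/clang __int128 extension for the 64x64 squares, uint64_t in place of mp_limb_t; a sketch, not GMP's own code):

    #include <stdint.h>
    #include <stddef.h>

    typedef unsigned __int128 u128;

    void sqr_diag_addlsh1_ref(uint64_t *rp, const uint64_t *tp,
                              const uint64_t *up, size_t n)
    {
        /* Lay down 2*{tp,2n-2}, shifted up by one limb position. */
        uint64_t cy = 0;
        rp[0] = 0;
        for (size_t i = 0; i < 2 * n - 2; i++) {
            rp[i + 1] = (tp[i] << 1) | cy;
            cy = tp[i] >> 63;
        }
        rp[2 * n - 1] = cy;

        /* Add the diagonal squares up[i]^2 at limb positions 2i, 2i+1. */
        uint64_t carry = 0;
        for (size_t i = 0; i < n; i++) {
            u128 sq = (u128)up[i] * up[i];
            u128 lo = (u128)rp[2 * i] + (uint64_t)sq + carry;
            rp[2 * i] = (uint64_t)lo;
            u128 hi = (u128)rp[2 * i + 1] + (uint64_t)(sq >> 64)
                    + (uint64_t)(lo >> 64);
            rp[2 * i + 1] = (uint64_t)hi;
            carry = (uint64_t)(hi >> 64);   /* zero after the last limb pair */
        }
    }
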
diff --git a/gmp-6.3.0/mpn/x86_64/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/sublsh1_n.asm
new file mode 100644
index 0000000..c6d829f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/sublsh1_n.asm
@@ -0,0 +1,160 @@
+dnl AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
+
+dnl Copyright 2003, 2005-2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 2.2
+C AMD K10 2.2
+C Intel P4 12.75
+C Intel core2 3.45
+C Intel corei ?
+C Intel atom ?
+C VIA nano 3.25
+
+C Sometimes speed degenerates, supposedly because some operand alignments
+C cause cache conflicts.
+
+C The speed is limited by decoding/issue bandwidth. There are 26 instructions
+C in the loop, which corresponds to 26/3/4 = 2.167 c/l.
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sublsh1_n)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (vp), %r8
+ mov R32(n), R32(%rax)
+ lea (rp,n,8), rp
+ lea (up,n,8), up
+ lea (vp,n,8), vp
+ neg n
+ xor R32(%rbp), R32(%rbp)
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): add %r8, %r8
+ mov 8(vp,n,8), %r9
+ adc %r9, %r9
+ mov 16(vp,n,8), %r10
+ adc %r10, %r10
+ sbb R32(%rax), R32(%rax) C save scy
+ mov (up,n,8), %rbp
+ mov 8(up,n,8), %rbx
+ sub %r8, %rbp
+ sbb %r9, %rbx
+ mov %rbp, (rp,n,8)
+ mov %rbx, 8(rp,n,8)
+ mov 16(up,n,8), %rbp
+ sbb %r10, %rbp
+ mov %rbp, 16(rp,n,8)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ add $3, n
+ jmp L(ent)
+
+L(b10): add %r8, %r8
+ mov 8(vp,n,8), %r9
+ adc %r9, %r9
+ sbb R32(%rax), R32(%rax) C save scy
+ mov (up,n,8), %rbp
+ mov 8(up,n,8), %rbx
+ sub %r8, %rbp
+ sbb %r9, %rbx
+ mov %rbp, (rp,n,8)
+ mov %rbx, 8(rp,n,8)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ add $2, n
+ jmp L(ent)
+
+L(b01): add %r8, %r8
+ sbb R32(%rax), R32(%rax) C save scy
+ mov (up,n,8), %rbp
+ sub %r8, %rbp
+ mov %rbp, (rp,n,8)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ inc n
+L(ent): jns L(end)
+
+ ALIGN(16)
+L(top): add R32(%rax), R32(%rax) C restore scy
+
+ mov (vp,n,8), %r8
+L(b00): adc %r8, %r8
+ mov 8(vp,n,8), %r9
+ adc %r9, %r9
+ mov 16(vp,n,8), %r10
+ adc %r10, %r10
+ mov 24(vp,n,8), %r11
+ adc %r11, %r11
+
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+
+ mov (up,n,8), %rbp
+ mov 8(up,n,8), %rbx
+ sbb %r8, %rbp
+ sbb %r9, %rbx
+ mov %rbp, (rp,n,8)
+ mov %rbx, 8(rp,n,8)
+ mov 16(up,n,8), %rbp
+ mov 24(up,n,8), %rbx
+ sbb %r10, %rbp
+ sbb %r11, %rbx
+ mov %rbp, 16(rp,n,8)
+ mov %rbx, 24(rp,n,8)
+
+ sbb R32(%rbp), R32(%rbp) C save acy
+ add $4, n
+ js L(top)
+
+L(end): add R32(%rbp), R32(%rax)
+ neg R32(%rax)
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
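
The operation is {rp,n} = {up,n} - 2*{vp,n} modulo B^n, and the return value is the amount borrowed from limb position n: 0, 1 or 2, since both the bit shifted out of vp's top limb and the final subtraction borrow contribute. A C model (uint64_t in place of mp_limb_t; a sketch, not GMP's own code):

    #include <stdint.h>
    #include <stddef.h>

    uint64_t sublsh1_n_ref(uint64_t *rp, const uint64_t *up,
                           const uint64_t *vp, size_t n)
    {
        uint64_t scy = 0;   /* bit shifted out of the previous vp limb */
        uint64_t bw = 0;    /* running borrow of the subtraction */
        for (size_t i = 0; i < n; i++) {
            uint64_t s = (vp[i] << 1) | scy;          /* this limb of vp<<1 */
            scy = vp[i] >> 63;
            uint64_t d = up[i] - s - bw;
            bw = (up[i] < s) || (bw && up[i] == s);
            rp[i] = d;
        }
        return scy + bw;
    }
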
diff --git a/gmp-6.3.0/mpn/x86_64/x86_64-defs.m4 b/gmp-6.3.0/mpn/x86_64/x86_64-defs.m4
new file mode 100644
index 0000000..4e08f2a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/x86_64-defs.m4
@@ -0,0 +1,493 @@
+divert(-1)
+
+dnl m4 macros for amd64 assembler.
+
+dnl Copyright 1999-2005, 2008, 2009, 2011-2013, 2017 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl Usage: CPUVEC_FUNCS_LIST
+dnl
+dnl A list of the functions from gmp-impl.h x86 struct cpuvec_t, in the
+dnl order they appear in that structure.
+
+define(CPUVEC_FUNCS_LIST,
+``add_n',
+`addlsh1_n',
+`addlsh2_n',
+`addmul_1',
+`addmul_2',
+`bdiv_dbm1c',
+`cnd_add_n',
+`cnd_sub_n',
+`com',
+`copyd',
+`copyi',
+`divexact_1',
+`divrem_1',
+`gcd_11',
+`lshift',
+`lshiftc',
+`mod_1',
+`mod_1_1p',
+`mod_1_1p_cps',
+`mod_1s_2p',
+`mod_1s_2p_cps',
+`mod_1s_4p',
+`mod_1s_4p_cps',
+`mod_34lsub1',
+`modexact_1c_odd',
+`mul_1',
+`mul_basecase',
+`mullo_basecase',
+`preinv_divrem_1',
+`preinv_mod_1',
+`redc_1',
+`redc_2',
+`rshift',
+`sqr_basecase',
+`sub_n',
+`sublsh1_n',
+`submul_1'')
+
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  In the amd64 code we use explicit TEXT and ALIGN() calls,
+dnl since different alignments are wanted in various circumstances. So for
+dnl instance,
+dnl
+dnl TEXT
+dnl ALIGN(16)
+dnl PROLOGUE(mpn_add_n)
+dnl ...
+dnl EPILOGUE()
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+` GLOBL $1
+ TYPE($1,`function')
+ COFF_TYPE($1)
+$1:
+')
+
+
+dnl Usage: COFF_TYPE(GSYM_PREFIX`'foo)
+dnl
+dnl Emit COFF style ".def ... .endef" type information for a function, when
+dnl supported. The argument should include any GSYM_PREFIX.
+dnl
+dnl See autoconf macro GMP_ASM_COFF_TYPE for HAVE_COFF_TYPE.
+
+define(COFF_TYPE,
+m4_assert_numargs(1)
+m4_assert_defined(`HAVE_COFF_TYPE')
+`ifelse(HAVE_COFF_TYPE,yes,
+ `.def $1
+ .scl 2
+ .type 32
+ .endef')')
+
+
+dnl Usage: ASSERT([cond][,instructions])
+dnl
+dnl If WANT_ASSERT is 1, output the given instructions and expect the given
+dnl flags condition to then be satisfied. For example,
+dnl
+dnl ASSERT(ne, `cmpq %rax, %rbx')
+dnl
+dnl The instructions can be omitted to just assert a flags condition with
+dnl no extra calculation. For example,
+dnl
+dnl ASSERT(nc)
+dnl
+dnl When `instructions' is not empty, a pushfq/popfq is added for
+dnl convenience to preserve the flags, but the instructions themselves must
+dnl preserve any registers that matter.
+dnl
+dnl The condition can be omitted to just output the given instructions when
+dnl assertion checking is wanted. In this case the pushf/popf is omitted.
+dnl For example,
+dnl
+dnl ASSERT(, `movq %rax, VAR_KEEPVAL')
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+m4_assert_defined(`WANT_ASSERT')
+`ifelse(WANT_ASSERT,1,
+`ifelse(`$1',,
+` $2',
+`ifelse(`$2',,,
+` pushfq')
+ $2
+ `j$1' L(ASSERT_ok`'ASSERT_counter)
+ ud2 C assertion failed
+L(ASSERT_ok`'ASSERT_counter):
+ifelse(`$2',,,` popfq')
+define(`ASSERT_counter',incr(ASSERT_counter))')')')
+
+define(ASSERT_counter,1)
+
+dnl LEA - load effective address
+dnl
+dnl  FIXME: We should never need a GOT entry here and could therefore always
+dnl  use the simpler 2nd variant.  We first need to understand what happens
+dnl  for not-yet-hidden symbols.
+dnl
+define(`LEA',`dnl
+ifdef(`PIC',
+ `mov $1@GOTPCREL(%rip), $2'
+,
+ `lea $1(%rip), $2')
+')
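+
+dnl For example (editor's illustration; `foo_table' is a hypothetical symbol),
+dnl
+dnl	LEA(	foo_table, %rax)
+dnl
+dnl expands to `mov foo_table@GOTPCREL(%rip), %rax' under PIC and to
+dnl `lea foo_table(%rip), %rax' otherwise.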
+
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(2,3)
+` ifelse($#,3,`$3',`RODATA')
+ ALIGN($2)
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+` SIZE(`$1',.-`$1')')
+
+
+define(`R32',
+ `ifelse($1,`%rax',`%eax',
+ $1,`%rbx',`%ebx',
+ $1,`%rcx',`%ecx',
+ $1,`%rdx',`%edx',
+ $1,`%rsi',`%esi',
+ $1,`%rdi',`%edi',
+ $1,`%rbp',`%ebp',
+ $1,`%r8',`%r8d',
+ $1,`%r9',`%r9d',
+ $1,`%r10',`%r10d',
+ $1,`%r11',`%r11d',
+ $1,`%r12',`%r12d',
+ $1,`%r13',`%r13d',
+ $1,`%r14',`%r14d',
+ $1,`%r15',`%r15d')')
+define(`R8',
+ `ifelse($1,`%rax',`%al',
+ $1,`%rbx',`%bl',
+ $1,`%rcx',`%cl',
+ $1,`%rdx',`%dl',
+ $1,`%rsi',`%sil',
+ $1,`%rdi',`%dil',
+ $1,`%rbp',`%bpl',
+ $1,`%r8',`%r8b',
+ $1,`%r9',`%r9b',
+ $1,`%r10',`%r10b',
+ $1,`%r11',`%r11b',
+ $1,`%r12',`%r12b',
+ $1,`%r13',`%r13b',
+ $1,`%r14',`%r14b',
+ $1,`%r15',`%r15b')')
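+
+dnl For instance (editor's illustration, read off the tables above), R32(%r10)
+dnl expands to %r10d and R8(%rax) expands to %al.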
+
+
+dnl Usage: CALL(funcname)
+dnl
+
+define(`CALL',`dnl
+ifdef(`PIC',
+ `call GSYM_PREFIX`'$1@PLT'
+,
+ `call GSYM_PREFIX`'$1'
+)')
+
+define(`TCALL',`dnl
+ifdef(`PIC',
+ `jmp GSYM_PREFIX`'$1@PLT'
+,
+ `jmp GSYM_PREFIX`'$1'
+)')
+
+
+define(`JUMPTABSECT', `.section .data.rel.ro.local,"a",@progbits')
+
+
+dnl Usage: JMPENT(targlabel,tablabel)
+
+define(`JMPENT',`dnl
+ifdef(`PIC',
+ `.long $1-$2'dnl
+,
+ `.quad $1'dnl
+)')
+
+
+dnl These macros provide calling sequence glue code only for DOS64; for the
+dnl other ABIs they expand to nothing, as defined here.
+
+define(`FUNC_ENTRY',`')
+define(`FUNC_EXIT',`')
+
+
+dnl Target ABI macros.
+
+define(`IFDOS', `')
+define(`IFSTD', `$1')
+define(`IFELF', `$1')
+
+
+dnl Usage: PROTECT(symbol)
+dnl
+dnl Used for private GMP symbols that should never be overridden by users.
+dnl This can save reloc entries and improve shlib sharing as well as
+dnl application startup times.
+
+define(`PROTECT', `.hidden $1')
+
+
+dnl Usage: x86_lookup(target, key,value, key,value, ...)
+dnl
+dnl Look for `target' among the `key' parameters.
+dnl
+dnl x86_lookup expands to the corresponding `value', or generates an error
+dnl if `target' isn't found.
+
+define(x86_lookup,
+m4_assert_numargs_range(1,999)
+`ifelse(eval($#<3),1,
+`m4_error(`unrecognised part of x86 instruction: $1
+')',
+`ifelse(`$1',`$2', `$3',
+`x86_lookup(`$1',shift(shift(shift($@))))')')')
+
+
+dnl Usage: x86_opcode_regxmm(reg)
+dnl
+dnl Validate the given xmm register, and return its number, 0 to 15.
+
+define(x86_opcode_regxmm,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_regxmm_list)')
+
+define(x86_opcode_regxmm_list,
+``%xmm0',0,
+`%xmm1',1,
+`%xmm2',2,
+`%xmm3',3,
+`%xmm4',4,
+`%xmm5',5,
+`%xmm6',6,
+`%xmm7',7,
+`%xmm8',8,
+`%xmm9',9,
+`%xmm10',10,
+`%xmm11',11,
+`%xmm12',12,
+`%xmm13',13,
+`%xmm14',14,
+`%xmm15',15')
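+
+dnl For instance (editor's illustration), x86_opcode_regxmm(`%xmm11') expands
+dnl to 11, while an operand not in the list triggers the x86_lookup error.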
+
+dnl Usage: palignr($imm,%srcreg,%dstreg)
+dnl
+dnl Emit a palignr instruction, using a .byte sequence, since obsolete but
+dnl still distributed versions of gas don't know SSSE3 instructions.
+
+define(`palignr',
+m4_assert_numargs(3)
+`.byte 0x66,dnl
+ifelse(eval(x86_opcode_regxmm($3) >= 8 || x86_opcode_regxmm($2) >= 8),1,
+ `eval(0x40+x86_opcode_regxmm($3)/8*4+x86_opcode_regxmm($2)/8),')dnl
+0x0f,0x3a,0x0f,dnl
+eval(0xc0+x86_opcode_regxmm($3)%8*8+x86_opcode_regxmm($2)%8),dnl
+substr($1,1)')
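+
+dnl For instance (editor's illustration), palignr($13, %xmm1, %xmm2) should
+dnl assemble to the bytes 66 0f 3a 0f d1 0d, i.e. `palignr $13, %xmm1, %xmm2'
+dnl (the eval'ed modrm byte is emitted in decimal, 209).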
+
+
+dnl Usage
+dnl
+dnl regnum(op) raw operand index (so slightly misnamed)
+dnl  regnumh(op)  high bit of register operand number
+dnl ix(op) 0 for reg operand, 1 for plain pointer operand.
+dnl
+
+define(`regnum',`x86_lookup(`$1',oplist)')
+define(`regnumh',`eval(regnum($1)/8 & 1)')
+define(`ix',`eval(regnum($1)/16)')
+define(`oplist',
+``%rax', 0, `%rcx', 1, `%rdx', 2, `%rbx', 3,
+ `%rsp', 4, `%rbp', 5, `%rsi', 6, `%rdi', 7,
+ `%r8', 8, `%r9', 9, `%r10', 10, `%r11', 11,
+ `%r12', 12, `%r13', 13, `%r14', 14, `%r15', 15,
+ `(%rax)',16, `(%rcx)',17, `(%rdx)',18, `(%rbx)',19,
+ `(%rsp)',20, `(%rbp)',21, `(%rsi)',22, `(%rdi)',23,
+ `(%r8)', 24, `(%r9)', 25, `(%r10)',26, `(%r11)',27,
+ `(%r12)',28, `(%r13)',29, `(%r14)',30, `(%r15)',31')
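+
+dnl For instance (editor's illustration, from the oplist table): regnum(`%rdx')
+dnl is 2 with regnumh 0 and ix 0, while regnum(`(%r11)') is 27 with regnumh 1
+dnl and ix 1.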
+
+dnl Usage (by mulx, shlx, shrx)
+dnl
+dnl reg1,reg2,reg3,opc1,opc2
+dnl
+dnl or
+dnl
+dnl (reg1),reg2,reg3,opc1,opc2
+dnl
+dnl where reg1 is any register but rsp,rbp,r12,r13, or
+dnl
+dnl off,(reg1),reg2,reg3,opc1,opc2
+dnl
+dnl where reg1 is any register but rsp,r12.
+dnl
+dnl The exceptions are due to special coding needed for some registers; rsp
+dnl and r12 need an extra byte 0x24 at the end while rbp and r13 lack the
+dnl offset-less form.
+dnl
+dnl Other addressing forms are not handled. Invalid forms are not properly
+dnl detected. Offsets that don't fit one byte are not handled correctly.
+
+define(`c4_helper',`dnl
+.byte 0xc4`'dnl
+ifelse(`$#',5,`dnl
+,eval(0xe2^32*regnumh($1)^128*regnumh($3))`'dnl
+,eval(0x$4-8*regnum($2))`'dnl
+,0x$5`'dnl
+,eval(0xc0+(7 & regnum($1))+8*(7 & regnum($3))-0xc0*ix($1))`'dnl
+',`$#',6,`dnl
+,eval(0xe2^32*regnumh($2)^128*regnumh($4))`'dnl
+,eval(0x$5-8*regnum($3))`'dnl
+,0x$6`'dnl
+,eval(0x40+(7 & regnum($2))+8*(7 & regnum($4)))`'dnl
+,eval(($1 + 256) % 256)`'dnl
+')')
+
+
+dnl Usage
+dnl
+dnl mulx(reg1,reg2,reg3)
+dnl
+dnl or
+dnl
+dnl mulx((reg1),reg2,reg3)
+dnl
+dnl where reg1 is any register but rsp,rbp,r12,r13, or
+dnl
+dnl mulx(off,(reg1),reg2,reg3)
+dnl
+dnl where reg1 is any register but rsp,r12.
+
+define(`mulx',`dnl
+ifelse(`$#',3,`dnl
+c4_helper($1,$2,$3,fb,f6)',`dnl format 1,2
+c4_helper($1,$2,$3,$4,fb,f6)'dnl format 3
+)')
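+
+dnl For instance (editor's illustration), mulx( %r9, %rbx, %rax) multiplies
+dnl %rdx by %r9, putting the low limb in %rbx and the high limb in %rax; it
+dnl should assemble to the bytes c4 c2 e3 f6 c1, i.e. `mulx %r9, %rbx, %rax'.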
+
+
+dnl Usage
+dnl
+dnl shlx(reg1,reg2,reg3)
+dnl shrx(reg1,reg2,reg3)
+dnl
+dnl or
+dnl
+dnl shlx(reg1,(reg2),reg3)
+dnl shrx(reg1,(reg2),reg3)
+dnl
+dnl where reg2 is any register but rsp,rbp,r12,r13, or
+dnl
+dnl shlx(reg1,off,(reg2),reg3)
+dnl shrx(reg1,off,(reg2),reg3)
+dnl
+dnl where reg2 is any register but rsp,r12.
+
+define(`shlx',`dnl
+ifelse(`$#',3,`dnl
+c4_helper($2,$1,$3,f9,f7)',`dnl format 1,2
+c4_helper($1,$3,$2,$4,f9,f7)'dnl format 3
+)')
+
+define(`shrx',`dnl
+ifelse(`$#',3,`dnl
+c4_helper($2,$1,$3,fb,f7)',`dnl format 1,2
+c4_helper($1,$3,$2,$4,fb,f7)'dnl format 3
+)')
+
+define(`sarx',`dnl
+ifelse(`$#',3,`dnl
+c4_helper($2,$1,$3,fa,f7)',`dnl format 1,2
+c4_helper($1,$3,$2,$4,fa,f7)'dnl format 3
+)')
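+
+dnl For instance (editor's illustration), shlx( %rcx, %r10, %rax) computes
+dnl %rax = %r10 << (%rcx & 63) and should assemble to the bytes c4 c2 f1 f7 c2,
+dnl i.e. `shlx %rcx, %r10, %rax'.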
+
+
+dnl Usage
+dnl
+dnl adcx(reg1,reg2)
+dnl adox(reg1,reg2)
+dnl
+dnl or
+dnl
+dnl adcx((reg1),reg2)
+dnl adox((reg1),reg2)
+dnl
+dnl where reg1 is any register but rsp,rbp,r12,r13, or
+dnl
+dnl adcx(off,(reg1),reg2)
+dnl adox(off,(reg1),reg2)
+dnl
+dnl where reg1 is any register but rsp,r12.
+dnl
+dnl The exceptions are due to special coding needed for some registers; rsp
+dnl and r12 need an extra byte 0x24 at the end while rbp and r13 lack the
+dnl offset-less form.
+dnl
+dnl Other addressing forms are not handled. Invalid forms are not properly
+dnl detected. Offsets that don't fit one byte are not handled correctly.
+
+define(`adx_helper',`dnl
+,eval(0x48+regnumh($1)+4*regnumh($2))`'dnl
+,0x0f`'dnl
+,0x38`'dnl
+,0xf6`'dnl
+')
+
+define(`adx',`dnl
+ifelse(`$#',2,`dnl
+adx_helper($1,$2)dnl
+,eval(0xc0+(7 & regnum($1))+8*(7 & regnum($2))-0xc0*ix($1))`'dnl
+',`$#',3,`dnl
+adx_helper($2,$3)dnl
+,eval(0x40+(7 & regnum($2))+8*(7 & regnum($3)))`'dnl
+,eval(($1 + 256) % 256)`'dnl
+')')
+
+define(`adcx',`dnl
+.byte 0x66`'dnl
+adx($@)')
+
+define(`adox',`dnl
+.byte 0xf3`'dnl
+adx($@)')
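+
+dnl For instance (editor's illustration), adcx( %r9, %rax) adds %r9 into %rax
+dnl using only the carry flag and should assemble to 66 49 0f 38 f6 c1, i.e.
+dnl `adcx %r9, %rax'; adox( %r9, %rax) emits the same encoding with an f3
+dnl prefix and uses the overflow flag instead.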
+
+divert`'dnl
diff --git a/gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm
new file mode 100644
index 0000000..803fa30
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_addlsh1_n, mpn_addlsh1_nc, mpn_rsblsh1_n, mpn_rsblsh1_nc.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+include_mpn(`x86_64/atom/aorrlsh1_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm
new file mode 100644
index 0000000..417dd0a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm
@@ -0,0 +1,227 @@
+dnl AMD64 mpn_addlsh_n, mpn_rsblsh_n.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 n/a
+C AMD bd2 n/a
+C AMD bd3 n/a
+C AMD bd4 2.31
+C AMD zn1 1.69
+C AMD zn2 1.55
+C AMD zn3 1.36
+C AMD bt1 n/a
+C AMD bt2 n/a
+C Intel P4 n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL 2.08
+C Intel BWL 1.78
+C Intel SKL 1.78
+C Intel atom n/a
+C Intel SLM n/a
+C VIA nano n/a
+
+C TODO
+C * Perhaps avoid using jrcxz by using dec n + jnz.
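+
+C Editor's note (not from the GMP sources): this routine computes
+C {rp,n} = {up,n} + ({vp,n} << cnt) for mpn_addlsh_n and
+C {rp,n} = ({vp,n} << cnt) - {up,n} for mpn_rsblsh_n, returning the most
+C significant limb of the n+1-limb result.  The jump table below dispatches
+C on n mod 8 into an 8-way unrolled shlx/shrx loop, with tnc = -cnt serving
+C as the complementary shift count for shrx (only the low 6 bits matter).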
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnt', `%r8')
+
+define(`tnc', `%r9')
+
+ifdef(`OPERATION_addlsh_n',`
+ define(ADCSBB, `adc')
+ define(func, mpn_addlsh_n)
+')
+ifdef(`OPERATION_rsblsh_n',`
+ define(ADCSBB, `sbb')
+ define(func, mpn_rsblsh_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ mov (vp), %r10
+
+ mov R32(n), R32(%rax)
+ shr $3, n
+ xor R32(tnc), R32(tnc)
+ sub cnt, tnc
+ and $7, R32(%rax)
+
+ lea L(tab)(%rip), %r11
+ifdef(`PIC',`
+ movslq (%r11,%rax,4), %rax
+ add %r11, %rax
+ jmp *%rax
+',`
+ jmp *(%r11,%rax,8)
+')
+
+L(0): lea 32(up), up
+ lea 32(vp), vp
+ lea 32(rp), rp
+ xor R32(%r11), R32(%r11)
+ jmp L(e0)
+
+L(7): mov %r10, %r11
+ lea 24(up), up
+ lea 24(vp), vp
+ lea 24(rp), rp
+ xor R32(%r10), R32(%r10)
+ jmp L(e7)
+
+L(6): lea 16(up), up
+ lea 16(vp), vp
+ lea 16(rp), rp
+ xor R32(%r11), R32(%r11)
+ jmp L(e6)
+
+L(5): mov %r10, %r11
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ xor R32(%r10), R32(%r10)
+ jmp L(e5)
+
+L(end): ADCSBB 24(up), %rax
+ mov %rax, -40(rp)
+ shrx( tnc, %r11, %rax)
+ ADCSBB n, %rax
+ FUNC_EXIT()
+ ret
+
+ ALIGN(32)
+L(top): jrcxz L(end)
+ mov -32(vp), %r10
+ ADCSBB 24(up), %rax
+ lea 64(up), up
+ shrx( tnc, %r11, %r11)
+ mov %rax, -40(rp)
+L(e0): dec n
+ shlx( cnt, %r10, %rax)
+ lea (%r11,%rax), %rax
+ mov -24(vp), %r11
+ ADCSBB -32(up), %rax
+ shrx( tnc, %r10, %r10)
+ mov %rax, -32(rp)
+L(e7): shlx( cnt, %r11, %rax)
+ lea (%r10,%rax), %rax
+ mov -16(vp), %r10
+ ADCSBB -24(up), %rax
+ shrx( tnc, %r11, %r11)
+ mov %rax, -24(rp)
+L(e6): shlx( cnt, %r10, %rax)
+ lea (%r11,%rax), %rax
+ mov -8(vp), %r11
+ ADCSBB -16(up), %rax
+ shrx( tnc, %r10, %r10)
+ mov %rax, -16(rp)
+L(e5): shlx( cnt, %r11, %rax)
+ lea (%r10,%rax), %rax
+ mov (vp), %r10
+ ADCSBB -8(up), %rax
+ shrx( tnc, %r11, %r11)
+ mov %rax, -8(rp)
+L(e4): shlx( cnt, %r10, %rax)
+ lea (%r11,%rax), %rax
+ mov 8(vp), %r11
+ ADCSBB (up), %rax
+ shrx( tnc, %r10, %r10)
+ mov %rax, (rp)
+L(e3): shlx( cnt, %r11, %rax)
+ lea (%r10,%rax), %rax
+ mov 16(vp), %r10
+ ADCSBB 8(up), %rax
+ shrx( tnc, %r11, %r11)
+ mov %rax, 8(rp)
+L(e2): shlx( cnt, %r10, %rax)
+ lea (%r11,%rax), %rax
+ mov 24(vp), %r11
+ ADCSBB 16(up), %rax
+ lea 64(vp), vp
+ shrx( tnc, %r10, %r10)
+ mov %rax, 16(rp)
+ lea 64(rp), rp
+L(e1): shlx( cnt, %r11, %rax)
+ lea (%r10,%rax), %rax
+ jmp L(top)
+
+L(4): xor R32(%r11), R32(%r11)
+ jmp L(e4)
+
+L(3): mov %r10, %r11
+ lea -8(up), up
+ lea -8(vp), vp
+ lea -8(rp), rp
+ xor R32(%r10), R32(%r10)
+ jmp L(e3)
+
+L(2): lea -16(up), up
+ lea -16(vp), vp
+ lea -16(rp), rp
+ xor R32(%r11), R32(%r11)
+ jmp L(e2)
+
+L(1): mov %r10, %r11
+ lea -24(up), up
+ lea 40(vp), vp
+ lea 40(rp), rp
+ xor R32(%r10), R32(%r10)
+ jmp L(e1)
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(0), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
+ JMPENT( L(4), L(tab))
+ JMPENT( L(5), L(tab))
+ JMPENT( L(6), L(tab))
+ JMPENT( L(7), L(tab))
diff --git a/gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm
new file mode 100644
index 0000000..89795e3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm
@@ -0,0 +1,165 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1 for CPUs with mulx.
+
+dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bd1 -
+C AMD bd2 -
+C AMD bd3 -
+C AMD bd4 4.3
+C AMD zen 2
+C AMD bt1 -
+C AMD bt2 -
+C Intel P4 -
+C Intel PNR -
+C Intel NHM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom -
+C Intel SLM -
+C VIA nano -
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rcx')
+define(`v0', `%rdx')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`ADCSBB', `adc')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`ADCSBB', `sbb')
+ define(`func', `mpn_submul_1')
+')
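+
+C Editor's note (not from the GMP sources): this computes
+C {rp,n} += {up,n} * v0 (addmul_1) or {rp,n} -= {up,n} * v0 (submul_1) and
+C returns the carry/borrow limb.  Entry code dispatches on n mod 4 into a
+C 4-way unrolled mulx/adc loop; the indexed mulx forms are hand-assembled as
+C .byte sequences (the intended instruction is given in each trailing
+C comment), presumably because the mulx macro in x86_64-defs.m4 only handles
+C plain register and (reg) operands.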
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ mov (up), %r8
+
+ push %rbx
+ push %r12
+ push %r13
+
+ lea (up,n_param,8), up
+ lea -32(rp,n_param,8), rp
+ mov R32(n_param), R32(%rax)
+ xchg v0_param, v0 C FIXME: is this insn fast?
+
+ neg n
+
+ and $3, R8(%rax)
+ jz L(b0)
+ cmp $2, R8(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mulx( %r8, %rbx, %rax)
+ sub $-1, n
+ jz L(wd1)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ test R32(%rax), R32(%rax) C clear cy
+ jmp L(lo1)
+
+L(b0): mulx( %r8, %r9, %r8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ xor R32(%rax), R32(%rax)
+ jmp L(lo0)
+
+L(b3): mulx( %r8, %r11, %r10)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ sub $-3, n
+ jz L(wd3)
+ test R32(%rax), R32(%rax) C clear cy
+ jmp L(lo3)
+
+L(b2): mulx( %r8, %r13, %r12)
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax
+ add %r12, %rbx
+ adc $0, %rax
+ sub $-2, n
+ jz L(wd2)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ test R32(%rax), R32(%rax) C clear cy
+ jmp L(lo2)
+
+L(top): ADDSUB %r9, (rp,n,8)
+L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ ADCSBB %r11, 8(rp,n,8)
+L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ ADCSBB %r13, 16(rp,n,8)
+L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ ADCSBB %rbx, 24(rp,n,8)
+ adc %rax, %r9
+L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax C rax = carry limb
+ add $4, n
+ js L(top)
+
+L(end): ADDSUB %r9, (rp)
+L(wd3): ADCSBB %r11, 8(rp)
+L(wd2): ADCSBB %r13, 16(rp)
+L(wd1): ADCSBB %rbx, 24(rp)
+ adc n, %rax
+ pop %r13
+ pop %r12
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/zen/com.asm b/gmp-6.3.0/mpn/x86_64/zen/com.asm
new file mode 100644
index 0000000..b34f841
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/com.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_com optimised for AMD Zen.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/copyd.asm b/gmp-6.3.0/mpn/x86_64/zen/copyd.asm
new file mode 100644
index 0000000..63ed237
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/copyd.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyd optimised for AMD Zen.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/copyi.asm b/gmp-6.3.0/mpn/x86_64/zen/copyi.asm
new file mode 100644
index 0000000..1aafaaa
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/copyi.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyi optimised for AMD Zen.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm
new file mode 100644
index 0000000..0ffb6ca
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_11.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/bd2/gcd_11.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm
new file mode 100644
index 0000000..5dfd9e3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_22.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_22)
+include_mpn(`x86_64/coreihwl/gcd_22.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h
new file mode 100644
index 0000000..05a12b3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h
@@ -0,0 +1,280 @@
+/* AMD Zen gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should disable lib inclusion. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3700-4300 MHz Pinnacle Ridge */
+/* FFT tuning limit = 468,514,360 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
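+
+/* Editor's note (not part of GMP): the thresholds below are, roughly,
+   operand sizes in limbs at which the generic mpn code switches algorithm,
+   e.g. multiplication uses the basecase below MUL_TOOM22_THRESHOLD limbs and
+   the toom22 (Karatsuba) route from there up to the next threshold; the
+   values were measured on this CPU by the tuneup program.  */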
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 32
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 22
+
+#define DIV_1_VS_MUL_1_PERCENT 338
+
+#define MUL_TOOM22_THRESHOLD 16
+#define MUL_TOOM33_THRESHOLD 107
+#define MUL_TOOM44_THRESHOLD 190
+#define MUL_TOOM6H_THRESHOLD 230
+#define MUL_TOOM8H_THRESHOLD 272
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 106
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 117
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 114
+#define SQR_TOOM4_THRESHOLD 422
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 40
+
+#define MULMOD_BNM1_THRESHOLD 12
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 540 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 540, 5}, { 22, 6}, { 12, 5}, { 25, 6}, \
+ { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 7}, { 43, 9}, { 11, 8}, { 29, 9}, \
+ { 15, 8}, { 35, 9}, { 19, 8}, { 43, 9}, \
+ { 23, 8}, { 49, 9}, { 27,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 43,10}, { 23, 9}, \
+ { 55,11}, { 15,10}, { 31, 9}, { 67,10}, \
+ { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \
+ { 55,11}, { 31,10}, { 79,11}, { 47,10}, \
+ { 103,12}, { 31,11}, { 63,10}, { 135,11}, \
+ { 79,10}, { 167,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 159,12}, { 95,11}, { 191,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \
+ { 1343,11}, { 351,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,12}, { 223,11}, \
+ { 447,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \
+ { 1279,11}, { 671,10}, { 1343, 9}, { 2687,12}, \
+ { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \
+ { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 671,11}, { 1343,10}, \
+ { 2687,12}, { 703,11}, { 1407,13}, { 383,12}, \
+ { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 895,11}, { 1791,12}, { 927,11}, \
+ { 1855,12}, { 959,11}, { 1919,10}, { 3839,13}, \
+ { 511,12}, { 1087,11}, { 2175,13}, { 575,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1343,11}, \
+ { 2687,13}, { 703,12}, { 1407,14}, { 383,13}, \
+ { 767,12}, { 1599,13}, { 831,12}, { 1727,11}, \
+ { 3455,13}, { 895,12}, { 1855,13}, { 959,12}, \
+ { 1919,11}, { 3839,14}, { 511,13}, { 1087,12}, \
+ { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,12}, { 2943,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \
+ { 3455,14}, { 895,13}, { 1855,12}, { 3711,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \
+ { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2687,13}, \
+ { 5375,14}, { 2943,13}, { 5887,15}, { 1535,14}, \
+ { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \
+ { 7679,16}, { 1023,15}, { 2047,14}, { 4479,15}, \
+ { 2303,14}, { 4991,15}, { 2559,14}, { 5247,15}, \
+ { 2815,14}, { 5887,16}, { 1535,15}, { 3327,14}, \
+ { 6911,15}, { 3839,14}, { 7679,17}, { 1023,16}, \
+ { 2047,15}, { 4095,14}, { 8191,15}, { 4351,14}, \
+ { 8959,15}, { 4863,16}, { 2559,15}, { 5375,14}, \
+ { 11007,15}, { 5887,14}, { 11775,16}, { 3071,15}, \
+ { 6911,16}, { 3583,15}, { 7167,14}, { 14335,15}, \
+ { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \
+ { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \
+ { 9215,14}, { 18431,15}, { 9727,14}, { 19455,15}, \
+ { 9983,14}, { 19967,16}, { 5119,15}, { 11007,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 271
+#define MUL_FFT_THRESHOLD 6272
+
+#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 404, 5}, { 13, 4}, { 27, 5}, { 21, 6}, \
+ { 11, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 14, 5}, { 29, 6}, { 29, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 25, 8}, \
+ { 13, 7}, { 29, 8}, { 15, 7}, { 33, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 29, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \
+ { 15, 9}, { 31, 8}, { 63, 9}, { 43,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 135,11}, { 79,10}, { 159,11}, { 95,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
+ { 143,10}, { 287, 9}, { 575,11}, { 159,12}, \
+ { 95,11}, { 191,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \
+ { 1343,11}, { 351,10}, { 703,11}, { 367,10}, \
+ { 735,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 399,10}, { 799,11}, { 415,10}, { 831,12}, \
+ { 223,11}, { 447,10}, { 895,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \
+ { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \
+ { 1279,11}, { 671,10}, { 1343,12}, { 351,11}, \
+ { 703,10}, { 1407,11}, { 735,10}, { 1471,13}, \
+ { 191,12}, { 383,11}, { 767,10}, { 1535,11}, \
+ { 799,12}, { 415,11}, { 831,10}, { 1663,12}, \
+ { 447,11}, { 895,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,11}, { 1215,13}, \
+ { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
+ { 1343,12}, { 703,11}, { 1407,12}, { 735,11}, \
+ { 1471,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \
+ { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1343,13}, { 703,12}, \
+ { 1471,11}, { 2943,14}, { 383,13}, { 767,12}, \
+ { 1599,13}, { 831,12}, { 1727,11}, { 3455,13}, \
+ { 895,12}, { 1855,13}, { 959,15}, { 255,14}, \
+ { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \
+ { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,12}, { 2943,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \
+ { 3455,14}, { 895,13}, { 1855,12}, { 3711,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \
+ { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \
+ { 511,15}, { 1023,14}, { 2047,13}, { 4095,14}, \
+ { 2175,13}, { 4479,12}, { 8959,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \
+ { 11775,15}, { 1535,14}, { 3455,13}, { 6911,15}, \
+ { 1791,14}, { 3839,13}, { 7679,14}, { 3967,16}, \
+ { 1023,15}, { 2047,14}, { 4479,15}, { 2303,14}, \
+ { 4991,15}, { 2559,14}, { 5247,15}, { 2815,14}, \
+ { 5887,13}, { 11775,16}, { 1535,15}, { 3071,14}, \
+ { 6143,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,17}, { 1023,16}, { 2047,15}, { 4095,14}, \
+ { 8191,15}, { 4351,14}, { 8959,15}, { 4863,14}, \
+ { 9727,16}, { 2559,15}, { 5887,14}, { 11775,16}, \
+ { 3071,15}, { 6911,16}, { 3583,15}, { 7167,14}, \
+ { 14335,15}, { 7679,14}, { 15359,15}, { 7935,14}, \
+ { 15871,17}, { 2047,16}, { 4095,15}, { 8959,16}, \
+ { 4607,15}, { 9215,14}, { 18431,15}, { 9727,14}, \
+ { 19455,15}, { 9983,14}, { 19967,16}, { 5119,15}, \
+ { 10239,16}, { 5631,15}, { 11775,17}, { 3071,16}, \
+ { 6655,15}, { 13311,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 302
+#define SQR_FFT_THRESHOLD 4224
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 69
+#define MULLO_MUL_N_THRESHOLD 11278
+#define SQRLO_BASECASE_THRESHOLD 12
+#define SQRLO_DC_THRESHOLD 82
+#define SQRLO_SQR_THRESHOLD 8207
+
+#define DC_DIV_QR_THRESHOLD 76
+#define DC_DIVAPPR_Q_THRESHOLD 232
+#define DC_BDIV_QR_THRESHOLD 76
+#define DC_BDIV_Q_THRESHOLD 104
+
+#define INV_MULMOD_BNM1_THRESHOLD 37
+#define INV_NEWTON_THRESHOLD 274
+#define INV_APPR_THRESHOLD 230
+
+#define BINV_NEWTON_THRESHOLD 372
+#define REDC_1_TO_REDC_N_THRESHOLD 68
+
+#define MU_DIV_QR_THRESHOLD 1499
+#define MU_DIVAPPR_Q_THRESHOLD 1718
+#define MUPI_DIV_QR_THRESHOLD 108
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1787
+
+#define POWM_SEC_TABLE 3,22,81,494
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 486
+#define SET_STR_PRECOMPUTE_THRESHOLD 1264
+
+#define FAC_DSC_THRESHOLD 187
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 23
+#define HGCD2_DIV1_METHOD 1 /* 9.20% faster than 3 */
+#define HGCD_THRESHOLD 109
+#define HGCD_APPR_THRESHOLD 104
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 566
+#define GCDEXT_DC_THRESHOLD 382
+#define JACOBI_BASE_METHOD 1 /* 15.55% faster than 3 */
+
+/* Tuneup completed successfully, took 281243 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/zen/hamdist.asm b/gmp-6.3.0/mpn/x86_64/zen/hamdist.asm
new file mode 100644
index 0000000..48dcf61
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/hamdist.asm
@@ -0,0 +1,38 @@
+dnl AMD64 mpn_hamdist -- hamming distance.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86_64/coreinhm/hamdist.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/lshift.asm b/gmp-6.3.0/mpn/x86_64/zen/lshift.asm
new file mode 100644
index 0000000..4dce319
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/lshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshift optimised for AMD Zen.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm
new file mode 100644
index 0000000..d52b194
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshiftc optimised for AMD Zen.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/mul_1.asm b/gmp-6.3.0/mpn/x86_64/zen/mul_1.asm
new file mode 100644
index 0000000..6a083ac
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/mul_1.asm
@@ -0,0 +1,161 @@
+dnl AMD64 mpn_mul_1 for CPUs with mulx.
+
+dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bd1 -
+C AMD bd2 -
+C AMD bd3 -
+C AMD bd4 4.4
+C AMD zen 2
+C AMD bobcat -
+C AMD jaguar -
+C Intel P4 -
+C Intel PNR -
+C Intel NHM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom -
+C Intel SLM -
+C VIA nano -
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rcx')
+define(`v0', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
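+
+C Editor's note (not from the GMP sources): mpn_mul_1 computes
+C {rp,n} = {up,n} * v0 and returns the high limb; mpn_mul_1c additionally
+C takes a carry-in limb (the 5th argument, in %r8) that is added into the
+C product.  Both entry points share the n mod 4 dispatch and the 4-way
+C unrolled mulx loop below.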
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ jmp L(ent)
+EPILOGUE()
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+ FUNC_ENTRY(4)
+ xor R32(%r8), R32(%r8) C carry-in limb
+L(ent): mov (up), %r9
+
+ push %rbx
+ push %r12
+ push %r13
+
+ lea (up,n_param,8), up
+ lea -32(rp,n_param,8), rp
+ mov R32(n_param), R32(%rax)
+ xchg v0_param, v0 C FIXME: is this insn fast?
+
+ neg n
+
+ and $3, R8(%rax)
+ jz L(b0)
+ cmp $2, R8(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mov %r8, %r12
+ mulx( %r9, %rbx, %rax)
+ sub $-1, n
+ jz L(wd1)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ add %r12, %rbx
+ jmp L(lo1)
+
+L(b3): mulx( %r9, %r11, %r10)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax
+ sub $-3, n
+ jz L(wd3)
+ add %r8, %r11
+ jmp L(lo3)
+
+L(b2): mov %r8, %r10 C carry-in limb
+ mulx( %r9, %r13, %r12)
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax
+ sub $-2, n
+ jz L(wd2)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ add %r10, %r13
+ jmp L(lo2)
+
+L(b0): mov %r8, %rax C carry-in limb
+ mulx( %r9, %r9, %r8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ add %rax, %r9
+ jmp L(lo0)
+
+L(top): jrcxz L(end)
+ adc %r8, %r11
+ mov %r9, (rp,n,8)
+L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r10, %r13
+ mov %r11, 8(rp,n,8)
+L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r12, %rbx
+ mov %r13, 16(rp,n,8)
+L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rax, %r9
+ mov %rbx, 24(rp,n,8)
+L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ lea 4(n), n
+ jmp L(top)
+
+L(end): mov %r9, (rp)
+L(wd3): adc %r8, %r11
+ mov %r11, 8(rp)
+L(wd2): adc %r10, %r13
+ mov %r13, 16(rp)
+L(wd1): adc %r12, %rbx
+ adc $0, %rax
+ mov %rbx, 24(rp)
+
+ pop %r13
+ pop %r12
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm
new file mode 100644
index 0000000..affa3b6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm
@@ -0,0 +1,455 @@
+dnl AMD64 mpn_mul_basecase optimised for AMD Zen.
+
+dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Try 2x unrolling instead of current 4x, at least for mul_1. Else consider
+C shallower sw pipelining of mul_1/addmul_1 loops, allowing 4 or 6 instead
+C of 8 product registers.
+C * Split up mul_1 into 4 loops in order to fall into the addmul_1 loops
+C without branch tree.
+C * Improve the overlapped software pipelining. The mulx in the osp block now
+C   suffers from write/read conflicts, in particular in the 1 mod 4 case.  Also,
+C mul_1 could osp into addmul_1.
+C * Let vn_param be vn to save a copy.
+C * Re-allocate to benefit more from 32-bit encoding.
+C * Poor performance for e.g. n = 12,16.
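+
+C Editor's note (not from the GMP sources): after the small (un <= 2) special
+C cases, the code runs one mul_1-style pass over {up,un} with the first v
+C limb, then vn-1 addmul_1-style passes for the remaining v limbs, each
+C dispatched on un mod 4 into a 4-way unrolled mulx loop; the "osp" loops
+C overlap one pass's wind-down with the next pass's feed-in, as the comments
+C below note.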
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+define(`vp_param', `%rcx')
+define(`vn_param', `%r8')
+
+define(`un', `%r14')
+define(`vp', `%rbp')
+define(`v0', `%rdx')
+define(`n', `%rcx')
+define(`vn', `%r15')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ cmp $2, un_param
+ ja L(gen)
+ mov (vp_param), %rdx
+ mulx( (up), %rax, %r9) C 0 1
+ je L(s2x)
+
+L(s11): mov %rax, (rp)
+ mov %r9, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(s2x): cmp $2, vn_param
+ mulx( 8,(up), %r8, %r10) C 1 2
+ je L(s22)
+
+L(s21): add %r8, %r9
+ adc $0, %r10
+ mov %rax, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ FUNC_EXIT()
+ ret
+
+L(s22): add %r8, %r9 C 1
+ adc $0, %r10 C 2
+ mov 8(vp_param), %rdx
+ mov %rax, (rp)
+ mulx( (up), %r8, %r11) C 1 2
+ mulx( 8,(up), %rax, %rdx) C 2 3
+ add %r11, %rax C 2
+ adc $0, %rdx C 3
+ add %r8, %r9 C 1
+ adc %rax, %r10 C 2
+ adc $0, %rdx C 3
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+
+L(gen): push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov un_param, un
+ mov vp_param, vp
+ mov vn_param, vn
+
+ mov (up), %r9
+ mov (vp), v0
+
+ lea (up,un,8), up
+ lea -32(rp,un,8), rp
+
+ neg un
+ mov un, n
+ test $1, R8(un)
+ jz L(mx0)
+L(mx1): test $2, R8(un)
+ jz L(mb3)
+
+L(mb1): mulx( %r9, %rbx, %rax)
+ inc n
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10
+ jmp L(mlo1)
+
+L(mb3): mulx( %r9, %r11, %r10)
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax
+ sub $-3, n
+ jz L(mwd3)
+ test R32(%rdx), R32(%rdx)
+ jmp L(mlo3)
+
+L(mx0): test $2, R8(un)
+ jz L(mb0)
+
+L(mb2): mulx( %r9, %r13, %r12)
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax
+ lea 2(n), n
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8
+ jmp L(mlo2)
+
+L(mb0): mulx( %r9, %r9, %r8)
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(up,un,8), %r11, %r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(up,un,8), %r13, %r12
+ jmp L(mlo0)
+
+L(mtop):jrcxz L(mend)
+ adc %r8, %r11
+ mov %r9, (rp,n,8)
+L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r10, %r13
+ mov %r11, 8(rp,n,8)
+L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r12, %rbx
+ mov %r13, 16(rp,n,8)
+L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rax, %r9
+ mov %rbx, 24(rp,n,8)
+L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ lea 4(n), n
+ jmp L(mtop)
+
+L(mend):mov %r9, (rp)
+ adc %r8, %r11
+L(mwd3):mov %r11, 8(rp)
+ adc %r10, %r13
+ mov %r13, 16(rp)
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %rbx, 24(rp)
+ mov %rax, 32(rp)
+ add $8, vp
+ dec vn
+ jz L(end)
+
+C The rest of the file is 4 osp (overlapped software pipelining) loops around addmul_1
+
+ test $1, R8(un)
+ jnz L(0x1)
+
+L(0x0): test $2, R8(un)
+ jnz L(oloop2_entry)
+
+L(oloop0_entry):
+ C initial feed-in block
+ mov (vp), %rdx
+ add $8, vp
+ mov un, n
+ add $8, rp
+ .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (up,un,8), %r9, %r8
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(up,un,8), %r11, %r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(up,un,8), %r13, %r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x18 C mulx 24(up,un,8), %rbx, %rax
+ add %r8, %r11
+ jmp L(lo0)
+
+L(oloop0):
+ C overlapped software pipelining block
+ mov (vp), %rdx C new
+ add $8, vp
+ add %r9, (rp) C prev
+ .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%r9,%r8
+ adc %r11, 8(rp) C prev
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 0x8(%rsi,%r14,8),%r11,%r10
+ adc %r13, 16(rp) C prev
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 0x10(%rsi,%r14,8),%r13,%r12
+ adc %rbx, 24(rp) C prev
+ mov un, n
+ adc $0, %rax C prev
+ mov %rax, 32(rp) C prev
+ add $8, rp
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%rbx,%rax
+ add %r8, %r11 C new
+ jmp L(lo0)
+
+ ALIGN(16)
+L(tp0): add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+L(lo0): adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(tp0)
+
+ dec vn
+ jne L(oloop0)
+
+ jmp L(final_wind_down)
+
+L(oloop2_entry):
+ mov (vp), %rdx
+ add $8, vp
+ lea 2(un), n
+ add $8, rp
+ .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (up,un,8), %r13, %r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax
+ add %r12, %rbx
+ adc $0, %rax
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ add %r13, 16(rp,n,8)
+ jmp L(lo2)
+
+L(oloop2):
+ mov (vp), %rdx
+ add $8, vp
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (up,un,8), %r13, %r12
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax
+ lea 2(un), n
+ add $8, rp
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8
+ add %r12, %rbx
+ adc $0, %rax
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%r11,%r10
+ add %r13, 16(rp,n,8)
+ jmp L(lo2)
+
+ ALIGN(16)
+L(tp2): add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+L(lo2): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(tp2)
+
+ dec vn
+ jne L(oloop2)
+
+ jmp L(final_wind_down)
+
+L(0x1): test $2, R8(un)
+ jz L(oloop3_entry)
+
+L(oloop1_entry):
+ mov (vp), %rdx
+ add $8, vp
+ lea 1(un), n
+ add $8, rp
+ .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (up,un,8), %rbx, %rax
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ add %rbx, 24(rp,n,8)
+ jmp L(lo1)
+
+L(oloop1):
+ mov (vp), %rdx
+ add $8, vp
+ add %r9, (rp)
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8
+ adc %r11, 8(rp)
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10
+ adc %r13, 16(rp)
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%r13,%r12
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+ .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (up,un,8), %rbx, %rax
+ lea 1(un), n
+ add $8, rp
+ add %rbx, 24(rp,n,8)
+ jmp L(lo1)
+
+ ALIGN(16)
+L(tp1): add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+L(lo1): adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(tp1)
+
+ dec vn
+ jne L(oloop1)
+
+ jmp L(final_wind_down)
+
+L(oloop3_entry):
+ mov (vp), %rdx
+ add $8, vp
+ lea 3(un), n
+ add $8, rp
+ .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ test n, n
+ jz L(wd3)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ add %r11, 8(rp,n,8)
+ jmp L(lo3)
+
+L(oloop3):
+ mov (vp), %rdx
+ add $8, vp
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10
+ adc %r13, 16(rp)
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax
+ lea 3(un), n
+ add $8, rp
+ add %r10, %r13
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r12, %rbx
+ adc $0, %rax
+ add %r11, 8(rp,n,8)
+ jmp L(lo3)
+
+ ALIGN(16)
+L(tp3): add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+L(lo3): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(tp3)
+
+ dec vn
+ jne L(oloop3)
+
+L(final_wind_down):
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+
+L(end): pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+ FUNC_EXIT()
+ ret
+
+L(3): mov (vp), %rdx
+ add $8, vp
+ add $8, rp
+ .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+L(wd3): adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+ dec vn
+ jne L(3)
+ jmp L(end)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm
new file mode 100644
index 0000000..2ae729a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm
@@ -0,0 +1,299 @@
+dnl X86-64 mpn_mullo_basecase optimised for AMD Zen.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`vp', `%r11')
+define(`nn', `%rbp')
+
+C TODO
+C * Rearrange feed-in jumps for short branch forms.
+C * Roll out the heavy artillery and 4-way unroll outer loop. Since feed-in
+C code implodes, the blow-up will not be more than perhaps 2.5x.
+C * Micro-optimise critical lead-in code blocks.
+C * Clean up register use, e.g. r15 vs vp, disuse of nn, etc.
+C * Write n < 4 code specifically for Zen (current code is for Haswell).
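+C
+C Semantics reminder (a hedged summary, not project documentation):
+C mpn_mullo_basecase(rp, up, vp, n) stores only the low n limbs of the
+C 2n-limb product {up,n} * {vp,n}, roughly equivalent to
+C
+C	mpn_mul_n (tp, up, vp, n);	C full 2n-limb product into scratch tp
+C	MPN_COPY (rp, tp, n);		C keep the low half only
+C
+C which is why each successive outer pass below works on one fewer u limb.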
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+ cmp $4, R32(n)
+ jae L(big)
+
+ mov vp_param, vp
+ mov (up), %rdx
+
+ cmp $2, R32(n)
+ jae L(gt1)
+L(n1): imul (vp), %rdx
+ mov %rdx, (rp)
+ FUNC_EXIT()
+ ret
+L(gt1): ja L(gt2)
+L(n2): mov (vp), %r9
+ mulx( %r9, %rax, %rdx)
+ mov %rax, (rp)
+ mov 8(up), %rax
+ imul %r9, %rax
+ add %rax, %rdx
+ mov 8(vp), %r9
+ mov (up), %rcx
+ imul %r9, %rcx
+ add %rcx, %rdx
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+L(gt2):
+L(n3): mov (vp), %r9
+ mulx( %r9, %rax, %r10) C u0 x v0
+ mov %rax, (rp)
+ mov 8(up), %rdx
+ mulx( %r9, %rax, %rdx) C u1 x v0
+ imul 16(up), %r9 C u2 x v0
+ add %rax, %r10
+ adc %rdx, %r9
+ mov 8(vp), %r8
+ mov (up), %rdx
+ mulx( %r8, %rax, %rdx) C u0 x v1
+ add %rax, %r10
+ adc %rdx, %r9
+ imul 8(up), %r8 C u1 x v1
+ add %r8, %r9
+ mov %r10, 8(rp)
+ mov 16(vp), %r10
+ mov (up), %rax
+ imul %rax, %r10 C u0 x v2
+ add %r10, %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(big): push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov (up), %r9
+ lea -8(up,n,8), up
+ lea -40(rp,n,8), rp
+
+ mov $4, R32(%r14)
+ sub n, %r14
+ mov -8(vp_param,n,8), %rbp
+ imul %r9, %rbp
+ lea 8(vp_param), %r15
+ mov (vp_param), %rdx
+
+ test $1, R8(%r14)
+ jnz L(mx0)
+L(mx1): test $2, R8(%r14)
+ jz L(mb3)
+
+L(mb1): mulx( %r9, %rbx, %rax)
+ lea -2(%r14), n
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r9,%r8
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r11,%r10
+ jmp L(mlo1)
+
+L(mb3): mulx( %r9, %r11, %r10)
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r13,%r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%rbx,%rax
+ lea (%r14), n
+ jrcxz L(x)
+ jmp L(mlo3)
+L(x): jmp L(mcor)
+
+L(mb2): mulx( %r9, %r13, %r12)
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%rbx,%rax
+ lea -1(%r14), n
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r9,%r8
+ jmp L(mlo2)
+
+L(mx0): test $2, R8(%r14)
+ jz L(mb2)
+
+L(mb0): mulx( %r9, %r9, %r8)
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r11,%r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r13,%r12
+ lea -3(%r14), n
+ jmp L(mlo0)
+
+ ALIGN(16)
+L(mtop):jrcxz L(mend)
+ adc %r8, %r11
+ mov %r9, (rp,n,8)
+L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r10, %r13
+ mov %r11, 8(rp,n,8)
+L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r12, %rbx
+ mov %r13, 16(rp,n,8)
+L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rax, %r9
+ mov %rbx, 24(rp,n,8)
+L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ lea 4(n), n
+ jmp L(mtop)
+
+L(mend):mov %r9, (rp)
+ adc %r8, %r11
+ mov %r11, 8(rp)
+ adc %r10, %r13
+ mov %r13, 16(rp)
+ adc %r12, %rbx
+ mov %rbx, 24(rp)
+
+L(outer):
+ mulx( (up), %r10, %r8) C FIXME r8 unused (use imul?)
+ adc %rax, %rbp
+ add %r10, %rbp
+ mov (%r15), %rdx
+ add $8, %r15
+ mov -24(up,%r14,8), %r8
+ lea -8(up), up
+
+ test $1, R8(%r14)
+ jz L(x0)
+L(x1): test $2, R8(%r14)
+ jnz L(b3)
+
+L(b1): mulx( %r8, %rbx, %rax)
+ lea -1(%r14), n
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (%rsi,%rcx,8),%r9,%r8
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 0x8(%rsi,%rcx,8),%r11,%r10
+ jmp L(lo1)
+
+L(x0): test $2, R8(%r14)
+ jz L(b2)
+
+L(b0): mulx( %r8, %r9, %r8)
+ lea -2(%r14), n
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r11,%r10
+ .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (%rsi,%r14,8),%r13,%r12
+ jmp L(lo0)
+
+L(b3): mulx( %r8, %r11, %r10)
+ lea 1(%r14), n
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r13,%r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%rbx,%rax
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ jrcxz L(cor)
+ jmp L(lo3)
+
+L(cor): add 8(rp), %r11
+ mov 16(rp), %r10
+ mov 24(rp), %r12
+L(mcor):mov %r11, 8(rp)
+ adc %r10, %r13
+ adc %r12, %rbx
+ mulx( (up), %r10, %r8) C FIXME r8 unused (use imul?)
+ adc %rax, %rbp
+ add %r10, %rbp
+ mov (%r15), %rdx
+ mov -24(up), %r8
+ mulx( %r8, %r9, %r12)
+ mulx( -16,(up), %r14, %rax)
+ add %r12, %r14
+ adc $0, %rax
+ adc %r9, %r13
+ mov %r13, 16(rp)
+ adc %r14, %rbx
+ mulx( -8,(up), %r10, %r8) C FIXME r8 unused (use imul?)
+ adc %rax, %rbp
+ add %r10, %rbp
+ mov 8(%r15), %rdx
+ mulx( -24,(up), %r14, %rax)
+ add %r14, %rbx
+ mov %rbx, 24(rp)
+ mulx( -16,(up), %r10, %r8) C FIXME r8 unused (use imul?)
+ adc %rax, %rbp
+ add %r10, %rbp
+ mov %rbp, 32(rp)
+ pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+ FUNC_EXIT()
+ ret
+
+L(b2): mulx( %r8, %r13, %r12)
+ lea (%r14), n
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%rbx,%rax
+ add %r12, %rbx
+ adc $0, %rax
+ .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%r9,%r8
+ jmp L(lo2)
+
+ ALIGN(16)
+L(top): add %r9, (rp,n,8)
+L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ js L(top)
+
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ inc %r14
+ jmp L(outer)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/zen/popcount.asm b/gmp-6.3.0/mpn/x86_64/zen/popcount.asm
new file mode 100644
index 0000000..be1613b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/popcount.asm
@@ -0,0 +1,38 @@
+dnl AMD64 mpn_popcount -- population count.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86_64/coreinhm/popcount.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/rshift.asm b/gmp-6.3.0/mpn/x86_64/zen/rshift.asm
new file mode 100644
index 0000000..0196870
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/rshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_rshift optimised for AMD Zen.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm
new file mode 100644
index 0000000..0c24de5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm
@@ -0,0 +1,507 @@
+dnl AMD64 mpn_sbpi1_bdiv_r optimised for AMD Zen
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(`up', `%rdi')
+define(`un_param', `%rsi')
+define(`dp_param', `%rdx')
+define(`dn_param', `%rcx')
+define(`dinv', `%r8')
+
+define(`i', `%rcx')
+define(`dn', `%r14')
+
+define(`dp', `%rsi')
+define(`un', `%r15')
+
+C TODO
+C * The o1...o8 loops for special dn counts were naively hand-optimised by
+C folding the generic loops. They can probably be tuned. The speculative
+C quotient limb generation might not be in the optimal spot.
+C * Perhaps avoid late-in-loop jumps, e.g., lo0.
+C * Improve regalloc wrt dn_param/dn and un_param/un to save some moves.
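+C
+C Control-flow note (a reader's summary, not project text): dn mod 4 selects
+C one of the b0/b1/b2/b3 entry chains; within each chain, dn = 4 or 8
+C (resp. 1/5, 2/6, 3/7) gets a dedicated folded loop o4, o8 (resp. o1, o5,
+C o2, o6, o3, o7), while larger dn falls through to the generic 4-way
+C unrolled loops out0..out3.  The outer loop runs un_param - dn_param times,
+C generating one speculative quotient limb per pass (the quotient itself is
+C not stored).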
+
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), dinv ')
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ sub dn_param, un_param C outer loop count
+ mov dn_param, dn C FIXME: Suppress by reg re-alloc
+ push dinv C keep dinv on stack
+ mov un_param, un C FIXME: Suppress by reg re-alloc
+ xor R32(%rbp), R32(%rbp)
+
+ lea (dp_param,dn_param,8), dp
+
+ mov (up), %rdx
+ imul dinv, %rdx C first quotient limb
+
+ neg dn
+ lea -32(up,dn_param,8), up
+
+ test $1, R8(dn_param)
+ jnz L(cx1)
+
+L(cx0): test $2, R8(dn_param)
+ jnz L(b2)
+
+
+C =============================================================================
+L(b0): cmp $-4, dn
+ jnz L(gt4)
+
+L(o4): mulx( -32,(dp), %r9, %r14)
+ mulx( -24,(dp), %r11, %r10)
+ mulx( -16,(dp), %r13, %r12)
+ mulx( -8,(dp), %rbx, %rax)
+ add %r14, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add (up), %r9
+ adc 8(up), %r11
+ mov %r8, %rdx C dinv
+ mov %r11, 8(up)
+ mulx( %r11, %rdx, %r12) C next quotient
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o4)
+ jmp L(ret)
+
+L(gt4): cmp $-8, dn
+ jnz L(out0)
+
+L(o8): mulx( -64,(dp), %r9, %r14)
+ mulx( -56,(dp), %rcx, %r10)
+ mulx( -48,(dp), %r13, %r12)
+ mulx( -40,(dp), %rbx, %rax)
+ add %r14, %rcx
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add -32(up), %r9
+ mulx( -32,(dp), %r9, %r14)
+ adc -24(up), %rcx
+ mov %rcx, -24(up)
+ mulx( -24,(dp), %r11, %r10)
+ adc %r13, -16(up)
+ mulx( -16,(dp), %r13, %r12)
+ adc %rbx, -8(up)
+ adc %rax, %r9
+ mulx( -8,(dp), %rbx, %rax)
+ adc %r14, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %r8, %rdx C dinv
+ mulx( %rcx, %rdx, %r12) C next quotient
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o8)
+ jmp L(ret)
+
+L(out0):mov dn, i
+ .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (dp,dn,8),%r9,%r8
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(dp,dn,8),%r11,%r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(dp,dn,8),%r13,%r12
+ clc
+ jmp L(lo0)
+
+ ALIGN(16)
+L(top0):add %r9, (up,i,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8
+ adc %r11, 8(up,i,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10
+ adc %r13, 16(up,i,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12
+ adc %rbx, 24(up,i,8)
+ adc %rax, %r9
+L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, i
+ js L(top0)
+
+ mov (%rsp), %rdx C dinv
+ .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(%rdi,%r14,8),%rdx,%r12
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(out0)
+ jmp L(ret)
+
+L(cx1): test $2, R8(dn_param)
+ jnz L(b3)
+
+C =============================================================================
+L(b1): cmp $-1, dn
+ jnz L(gt1)
+
+ mov 24(up), %r9
+L(o1): mulx( -8,(dp), %rbx, %rdx)
+ add %r9, %rbx
+ adc %rbp, %rdx
+ add 32(up), %rdx
+ setc R8(%rbp)
+ mov %rdx, %r9
+ mulx( %r8, %rdx, %r12) C next quotient
+ lea 8(up), up
+ dec un
+ jne L(o1)
+ mov %r9, 24(up)
+ jmp L(ret)
+
+L(gt1): cmp $-5, dn
+ jnz L(out1)
+
+L(o5): mulx( -40,(dp), %rbx, %rax)
+ mulx( -32,(dp), %r9, %r14)
+ mulx( -24,(dp), %r11, %r10)
+ mulx( -16,(dp), %r13, %r12)
+ add -8(up), %rbx
+ adc %rax, %r9
+ mulx( -8,(dp), %rbx, %rax)
+ adc %r14, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add (up), %r9
+ mov %r9, (up)
+ mov %r8, %rdx C dinv
+ mulx( %r9, %rdx, %r12) C next quotient
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o5)
+ jmp L(ret)
+
+L(out1):lea 1(dn), i
+ .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (dp,dn,8),%rbx,%rax
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(dp,dn,8),%r9,%r8
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(dp,dn,8),%r11,%r10
+ clc
+ jmp L(lo1)
+
+ ALIGN(16)
+L(top1):add %r9, (up,i,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8
+ adc %r11, 8(up,i,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10
+ adc %r13, 16(up,i,8)
+L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12
+ adc %rbx, 24(up,i,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, i
+ js L(top1)
+
+ mov (%rsp), %rdx C dinv
+ .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(out1)
+ jmp L(ret)
+
+C =============================================================================
+L(b2): cmp $-2, dn
+ jnz L(gt2)
+
+ mov 16(up), %r10
+ mov 24(up), %r9
+L(o2): mulx( -16,(dp), %r13, %r12)
+ mulx( -8,(dp), %rbx, %rax)
+ add %r12, %rbx
+ adc $0, %rax
+ add %r10, %r13 C 0 add just to produce carry
+ mov %r9, %r10 C 1
+ adc %rbx, %r10 C 1
+ mov %r8, %rdx
+ mulx( %r10, %rdx, %r12) C next quotient
+ adc %rbp, %rax C 2
+ setc R8(%rbp) C 3
+ mov 32(up), %r9 C 2
+ add %rax, %r9 C 2
+ adc $0, R32(%rbp) C 3
+ lea 8(up), up
+ dec un
+ jne L(o2)
+ mov %r10, 16(up)
+ mov %r9, 24(up)
+ jmp L(ret)
+
+L(gt2): cmp $-6, dn
+ jnz L(out2)
+
+L(o6): mulx( -48,(dp), %r13, %r12)
+ mulx( -40,(dp), %rcx, %rax)
+ add %r12, %rcx
+ adc $0, %rax
+ mulx( -32,(dp), %r9, %r14)
+ mulx( -24,(dp), %r11, %r10)
+ add -16(up), %r13
+ mulx( -16,(dp), %r13, %r12)
+ adc -8(up), %rcx
+ mov %rcx, -8(up)
+ adc %rax, %r9
+ mulx( -8,(dp), %rbx, %rax)
+ adc %r14, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %r8, %rdx C dinv
+ mulx( %rcx, %rdx, %r12) C next quotient
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o6)
+ jmp L(ret)
+
+L(out2):lea 2(dn), i
+ .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (dp,dn,8),%r13,%r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(dp,dn,8),%rbx,%rax
+ add %r12, %rbx
+ adc $0, %rax
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(dp,dn,8),%r9,%r8
+ jmp L(lo2)
+
+ ALIGN(16)
+L(top2):add %r9, (up,i,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8
+ adc %r11, 8(up,i,8)
+L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10
+ adc %r13, 16(up,i,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12
+ adc %rbx, 24(up,i,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, i
+ js L(top2)
+
+ mov (%rsp), %rdx C dinv
+ .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(out2)
+ jmp L(ret)
+
+C =============================================================================
+L(b3): cmp $-3, dn
+ jnz L(gt3)
+
+ mov 8(up), %r14
+ mov 16(up), %r9
+ mov 24(up), %rcx
+L(o3): mulx( -24,(dp), %r11, %r10)
+ mulx( -16,(dp), %r13, %r12)
+ mulx( -8,(dp), %rbx, %rax)
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add %r14, %r11
+ mov %r9, %r14
+ adc %r13, %r14
+ mov %rcx, %r9
+ mov %r8, %rdx C dinv
+ mulx( %r14, %rdx, %r12) C next quotient
+ adc %rbx, %r9
+ adc %rbp, %rax
+ setc R8(%rbp)
+ mov 32(up), %rcx
+ add %rax, %rcx
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o3)
+ mov %r14, 8(up)
+ mov %r9, 16(up)
+ mov %rcx, 24(up)
+ jmp L(ret)
+
+L(gt3): cmp $-7, dn
+ jnz L(out3)
+
+L(o7): mulx( -56,(dp), %r11, %r10)
+ mulx( -48,(dp), %rcx, %r12)
+ mulx( -40,(dp), %rbx, %rax)
+ add %r10, %rcx
+ adc %r12, %rbx
+ adc $0, %rax
+ mulx( -32,(dp), %r9, %r14)
+ add -24(up), %r11
+ mulx( -24,(dp), %r11, %r10)
+ adc -16(up), %rcx
+ mov %rcx, -16(up)
+ mulx( -16,(dp), %r13, %r12)
+ adc %rbx, -8(up)
+ adc %rax, %r9
+ mulx( -8,(dp), %rbx, %rax)
+ adc %r14, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %r8, %rdx C dinv
+ mulx( %rcx, %rdx, %r12) C next quotient
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o7)
+ jmp L(ret)
+
+L(out3):lea 3(dn), i
+ .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (dp,dn,8),%r11,%r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(dp,dn,8),%r13,%r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(dp,dn,8),%rbx,%rax
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top3):add %r9, (up,i,8)
+L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8
+ adc %r11, 8(up,i,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10
+ adc %r13, 16(up,i,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12
+ adc %rbx, 24(up,i,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, i
+ js L(top3)
+
+ mov (%rsp), %rdx C dinv
+ .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(out3)
+
+L(ret): mov %rbp, %rax
+ pop %rsi C dummy dealloc
+ pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm
new file mode 100644
index 0000000..a7c6127
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm
@@ -0,0 +1,482 @@
+dnl AMD64 mpn_sqr_basecase optimised for AMD Zen.
+
+dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Do overlapped software pipelining. This should close the remaining gap to
+C mul_basecase.
+C
+C * Update un just once in the outer loop.
+C
+C * Perhaps keep un and n pre-multiplied by 8, thus suppressing ",8" from
+C loads and stores. At least in some cases, the non-scaled form is faster.
+C
+C * Optimise xit3 code, e.g., using shrx and sarx like in the main loop.
+C
+C * The mul_1 feed-in code has gotten little attention and could probably be
+C improved. Perhaps even expand it to 4 separate loops to allow straight
+C fall-through into the 4 addmul_1 loops.
+C
+C * Clean up ad-hoc scratch register usage in the addmul_1 feed-in code blocks.
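+C
+C The identity behind the main loops (a reader's note, not project text):
+C
+C	(sum_i u_i*B^i)^2 = sum_i u_i^2*B^(2i) + 2*sum_{i<j} u_i*u_j*B^(i+j)
+C
+C The factor of 2 is folded into the multiplier: the addmul-style passes
+C multiply by roughly 2*up[k] (built with shrx/sarx from the neighbouring
+C limb -- see the "u0" and "ci" comments below), so only the diagonal
+C squares u_k^2 need separate handling.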
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+
+define(`un', `%rbp')
+define(`n', `%rcx')
+
+C these are used just for the small op code
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+
+ cmp $2, R32(un_param)
+ jae L(gt1)
+
+ mov (up), %rdx
+ mulx( %rdx, %rax, %rdx)
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): jne L(gt2)
+
+ mov (up), %rdx
+ mov 8(up), %rcx
+ mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2
+ mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1
+ mov %rcx, %rdx
+ mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3
+ add %r9, %r9 C W 1
+ adc %r10, %r10 C W 2
+ adc $0, %rdx C W 3
+ add %r9, %r8 C W 1
+ adc %r11, %r10 C W 2
+ adc $0, %rdx C W 3
+ mov %rax, (rp)
+ mov %r8, 8(rp)
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt2): cmp $4, R32(un_param)
+ jae L(gt3)
+
+ push %rbx
+ mov (up), %rdx
+ mulx( 8,(up), w2, w3)
+ mulx( 16,(up), w0, w1)
+ add w3, w0
+ mov 8(up), %rdx
+ mulx( 16,(up), %rax, w3)
+ adc %rax, w1
+ adc $0, w3
+ test R32(%rbx), R32(%rbx)
+ mov (up), %rdx
+ mulx( %rdx, %rbx, %rcx)
+ mov %rbx, (rp)
+ mov 8(up), %rdx
+ mulx( %rdx, %rax, %rbx)
+ mov 16(up), %rdx
+ mulx( %rdx, %rsi, %rdx)
+ adcx( w2, w2)
+ adcx( w0, w0)
+ adcx( w1, w1)
+ adcx( w3, w3)
+ adox( w2, %rcx)
+ adox( w0, %rax)
+ adox( w1, %rbx)
+ adox( w3, %rsi)
+ mov $0, R32(%r8)
+ adox( %r8, %rdx)
+ adcx( %r8, %rdx)
+ mov %rcx, 8(rp)
+ mov %rax, 16(rp)
+ mov %rbx, 24(rp)
+ mov %rsi, 32(rp)
+ mov %rdx, 40(rp)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(gt3): push %r15
+C push %r14
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+ mov R32(un_param), R32(un)
+
+ mov (up), %rdx C up[0]
+ mov 8(up), %r9 C up[1]
+
+ mulx( %rdx, %rax, %r15) C up[0]^2
+ mov %rax, (rp)
+ shl %rdx
+
+ lea (up,un,8), up
+ lea -32(rp,un,8), rp
+
+ neg un
+ lea 4(un), n
+ and $-4, n
+
+ test $1, R8(un)
+ jnz L(mx0)
+L(mx1): test $2, R8(un)
+ jz L(mb3)
+
+L(mb1): mulx( %r9, %rbx, %rax)
+ .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x10 C mulx 16(up,un,8), %r9, %r8
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x18 C mulx 24(up,un,8), %r11, %r10
+ add %r15, %rbx
+ jmp L(mlo1)
+
+L(mb3): mulx( %r9, %r11, %r10)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x10 C mulx 16(up,un,8), %r13, %r12
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x18 C mulx 24(up,un,8), %rbx, %rax
+ add %r15, %r11
+ jrcxz L(n4)
+ jmp L(mlo3)
+L(n4): mov %r11, 8(rp)
+ adc %r10, %r13
+ adc %r12, %rbx
+ jmp L(m)
+
+L(mx0): test $2, R8(un)
+ jnz L(mb0)
+
+L(mb2): mulx( %r9, %r13, %r12)
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10 C mulx 16(up,un,8), %rbx, %rax
+ .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x18 C mulx 24(up,un,8), %r9, %r8
+ add %r15, %r13
+ jmp L(mlo2)
+
+L(mb0): mulx( %r9, %r9, %r8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x10 C mulx 16(up,un,8), %r11, %r10
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x18 C mulx 24(up,un,8), %r13, %r12
+ add %r15, %r9
+ jmp L(mlo0)
+
+ ALIGN(16)
+L(mtop):jrcxz L(mend)
+ adc %r8, %r11
+ mov %r9, (rp,n,8)
+L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r10, %r13
+ mov %r11, 8(rp,n,8)
+L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r12, %rbx
+ mov %r13, 16(rp,n,8)
+L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rax, %r9
+ mov %rbx, 24(rp,n,8)
+L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ lea 4(n), n
+ jmp L(mtop)
+
+L(mend):mov %r9, (rp)
+ adc %r8, %r11
+ mov %r11, 8(rp)
+ adc %r10, %r13
+ mov %r13, 16(rp)
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %rbx, 24(rp)
+ mov %rax, 32(rp)
+
+ lea 2(un), un
+
+ mov $63, R32(%r15) C keep at 63 for shrx/sarx.
+ test $1, R8(un)
+ jz L(x0)
+L(x1): test $2, R8(un)
+ jz L(f3)
+ jmp L(f1)
+L(x0): test $2, R8(un)
+ jz L(f0)
+C jmp L(f2)
+
+L(f2): mov -8(up,un,8), %rdx C up[0]
+ lea 2(un), n
+ lea 8(rp), rp
+ .byte 0xc4,0x62,0x82,0xf7,0x5c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r11
+ .byte 0xc4,0x62,0x83,0xf7,0x6c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r13
+ and %rdx, %r11 C "ci" in C code
+ mulx( %rdx, %rax, %r10) C up[0]^2
+ lea (%r13,%rdx,2), %rdx C "u0" arg in C code
+ add %rax, %r11
+
+ .byte 0xc4,0x62,0x93,0xf6,0x24,0xee C mulx (up,un,8), %r13, %r12
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x08 C mulx 8(up,un,8), %rbx, %rax
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ jmp L(b2)
+
+ ALIGN(16)
+L(top2):add %r9, (rp,n,8)
+L(b2): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(top2)
+
+ inc un
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+
+L(f1): mov -8(up,un,8), %rdx C up[0]
+ lea 1(un), n
+ lea 8(rp), rp
+ .byte 0xc4,0x62,0x82,0xf7,0x6c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r13
+ .byte 0xc4,0xe2,0x83,0xf7,0x5c,0xee,0xf0 C shrx %r15, -16(up,un,8), %rbx
+ and %rdx, %r13 C "ci" in C code
+ mulx( %rdx, %rax, %r12) C up[0]^2
+ lea (%rbx,%rdx,2), %rdx C "u0" arg in C code
+ add %rax, %r13
+
+ .byte 0xc4,0xe2,0xe3,0xf6,0x04,0xee C mulx (up,un,8), %rbx, %rax
+ adc %r12, %rbx
+ adc $0, %rax
+ .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x08 C mulx 8(up,un,8), %r9, %r8
+ jmp L(b1)
+
+ ALIGN(16)
+L(top1):add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+L(b1): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(top1)
+
+ inc un
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+
+L(f0): mov -8(up,un,8), %rdx C up[0]
+ lea (un), n
+ lea 8(rp), rp
+ .byte 0xc4,0xe2,0x82,0xf7,0x5c,0xee,0xf0 C sarx %r15, -16(up,un,8), %rbx
+ .byte 0xc4,0x62,0x83,0xf7,0x4c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r9
+ and %rdx, %rbx C "ci" in C code
+ mulx( %rdx, %r10, %rax) C up[0]^2
+ lea (%r9,%rdx,2), %rdx C "u0" arg in C code
+ add %r10, %rbx
+ adc $0, %rax C "cin" in C code
+
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,un,8), %r9, %r8
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x08 C mulx 8(up,un,8), %r11, %r10
+ jmp L(b0)
+
+ ALIGN(16)
+L(top0):add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+L(b0): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(top0)
+
+ inc un
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+
+L(f3): mov -8(up,un,8), %rdx C up[0]
+ lea 3(un), n
+ lea 8(rp), rp
+ .byte 0xc4,0x62,0x82,0xf7,0x4c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r9
+ .byte 0xc4,0x62,0x83,0xf7,0x5c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r11
+ and %rdx, %r9 C "ci" in C code
+ mulx( %rdx, %rax, %r8) C up[0]^2
+ lea (%r11,%rdx,2), %rdx C "u0" arg in C code
+ add %rax, %r9
+
+ .byte 0xc4,0x62,0xa3,0xf6,0x14,0xee C mulx (%rsi,%rbp,8),%r11,%r10
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x08 C mulx 0x8(%rsi,%rbp,8),%r13,%r12
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10 C mulx 0x10(%rsi,%rbp,8),%rbx,%rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ jrcxz L(xit3)
+ jmp L(top3) C FIXME perhaps fall through
+
+ ALIGN(16)
+L(top3):add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(top3)
+
+ inc un
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+ jmp L(f2)
+
+
+L(xit3):add %r9, (rp)
+ adc %r11, 8(rp)
+ adc 16(rp), %r13
+ adc 24(rp), %rbx
+L(m): adc $0, %rax
+ mov %rax, 32(rp)
+ mov -24(up), %rdx C FIXME: CSE
+ mov -32(up), %r9 C FIXME: CSE
+ sar $63, %r9
+ and %rdx, %r9
+ add %r13, %r9
+ mulx( %rdx, %rax, %r10)
+ mov -16(up), %r8 C FIXME: CSE
+ adc $0, %r10
+ add %rax, %r9
+ adc $0, %r10
+ mov %r9, 16(rp)
+ mov -32(up), %rax
+ shl %rax
+ adc %rdx, %rdx
+ mulx( %r8, %r13, %r12)
+ mulx( -8,(up), %r11, %rax) C FIXME: CSE
+ add %r10, %r13
+ adc %r12, %r11
+ adc $0, %rax
+ add %rbx, %r13
+ mov %r13, 24(rp)
+ adc 32(rp), %r11
+ adc $0, %rax
+ mov -16(up), %rdx C FIXME: CSE
+ mov -8(up), %r8 C FIXME: CSE
+ mov -24(up), %r9
+ sar $63, %r9
+ and %rdx, %r9
+ add %r11, %r9
+ mulx( %rdx, %rbp, %r10)
+ adc $0, %r10
+ add %rbp, %r9
+ adc $0, %r10
+ mov %r9, 32(rp)
+ mov -24(up), %rbp
+ shl %rbp
+ adc %rdx, %rdx
+ mulx( %r8, %rbx, %rbp)
+ add %r10, %rbx
+ adc $0, %rbp
+ adc %rbx, %rax
+ mov %rax, 40(rp)
+ adc $0, %rbp
+ mov -8(up), %rdx C FIXME: CSE
+ mov -16(up), %r9 C FIXME: CSE
+ sar $63, %r9
+ and %rdx, %r9
+ add %rbp, %r9
+ mulx( %rdx, %rbp, %r10)
+ adc $0, %r10
+ add %rbp, %r9
+ adc $0, %r10
+ mov %r9, 48(rp)
+ mov %r10, 56(rp)
+
+ pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+C pop %r14
+ pop %r15
+
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm
new file mode 100644
index 0000000..00f6dc9
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sublsh1_n, mpn_sublsh1_nc.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc)
+include_mpn(`x86_64/atom/sublsh1_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h
new file mode 100644
index 0000000..3748c5f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h
@@ -0,0 +1,276 @@
+/* AMD Zen2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should disable lib inclusion. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3600-4400 MHz Matisse */
+/* FFT tuning limit = 703,392,483 */
+/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 27
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 1
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 13
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 22
+
+#define DIV_1_VS_MUL_1_PERCENT 385
+
+#define MUL_TOOM22_THRESHOLD 19
+#define MUL_TOOM33_THRESHOLD 125
+#define MUL_TOOM44_THRESHOLD 196
+#define MUL_TOOM6H_THRESHOLD 276
+#define MUL_TOOM8H_THRESHOLD 369
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 132
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 185
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 117
+#define SQR_TOOM4_THRESHOLD 315
+#define SQR_TOOM6_THRESHOLD 446
+#define SQR_TOOM8_THRESHOLD 527
+
+#define MULMID_TOOM42_THRESHOLD 38
+
+#define MULMOD_BNM1_THRESHOLD 14
+#define SQRMOD_BNM1_THRESHOLD 20
+
+#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 436, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \
+ { 15, 9}, { 31, 8}, { 63, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 135,11}, { 79,10}, { 159,11}, { 95,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \
+ { 159,12}, { 95,11}, { 191,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 575,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
+ { 671,11}, { 351,10}, { 703,11}, { 367,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,10}, { 1215,12}, \
+ { 319,11}, { 639,10}, { 1279,11}, { 671,10}, \
+ { 1343,12}, { 351,11}, { 703,10}, { 1407,11}, \
+ { 735,13}, { 191,12}, { 383,11}, { 767,10}, \
+ { 1535,11}, { 799,12}, { 415,11}, { 831,10}, \
+ { 1663,12}, { 447,11}, { 895,12}, { 479,14}, \
+ { 127,13}, { 255,12}, { 543,11}, { 1087,10}, \
+ { 2175,12}, { 575,11}, { 1151,12}, { 607,11}, \
+ { 1215,10}, { 2431,13}, { 319,12}, { 639,11}, \
+ { 1279,12}, { 671,11}, { 1343,10}, { 2687,12}, \
+ { 703,11}, { 1471,10}, { 2943,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \
+ { 831,11}, { 1663,13}, { 447,12}, { 959,11}, \
+ { 1919,10}, { 3839,14}, { 255,13}, { 511,12}, \
+ { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \
+ { 703,12}, { 1471,11}, { 2943,14}, { 383,13}, \
+ { 767,12}, { 1599,11}, { 3199,13}, { 831,12}, \
+ { 1727,13}, { 895,12}, { 1791,13}, { 959,12}, \
+ { 1919,11}, { 3839,14}, { 511,13}, { 1087,12}, \
+ { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,12}, { 2943,11}, \
+ { 5887,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1727,12}, { 3455,14}, { 895,13}, { 1919,12}, \
+ { 3839,11}, { 7679,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \
+ { 5887,15}, { 767,14}, { 1535,13}, { 3199,14}, \
+ { 1663,13}, { 3455,12}, { 6911,14}, { 1919,13}, \
+ { 3839,16}, { 511,15}, { 1023,14}, { 2175,13}, \
+ { 4479,12}, { 8959,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \
+ { 1535,14}, { 3455,15}, { 1791,14}, { 3839,13}, \
+ { 7679,14}, { 3967,16}, { 1023,15}, { 2047,14}, \
+ { 4479,15}, { 2303,14}, { 4863,15}, { 2559,14}, \
+ { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3327,14}, { 6911,15}, { 3839,14}, { 7679,13}, \
+ { 15359,17}, { 1023,16}, { 2047,15}, { 4351,14}, \
+ { 8959,15}, { 4863,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \
+ { 7679,14}, { 15359,15}, { 7935,17}, { 2047,16}, \
+ { 4095,15}, { 8959,16}, { 4607,15}, { 9983,14}, \
+ { 19967,16}, { 5631,15}, { 11775,17}, { 3071,16}, \
+ { 7679,15}, { 15871,18}, { 2047,17}, { 4095,16}, \
+ { 9727,15}, { 19967,17}, { 5119,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 275
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 396, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 135,11}, { 79,10}, { 159,11}, { 95,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,12}, { 95,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \
+ { 1343,11}, { 351,10}, { 703,11}, { 367,10}, \
+ { 735,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \
+ { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \
+ { 1279,11}, { 671,10}, { 1343,12}, { 351,11}, \
+ { 703,10}, { 1407,11}, { 735,10}, { 1471,12}, \
+ { 383,11}, { 767,10}, { 1535,11}, { 799,12}, \
+ { 415,11}, { 831,10}, { 1663,12}, { 447,11}, \
+ { 895,12}, { 479,11}, { 959,14}, { 127,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \
+ { 2175,12}, { 575,11}, { 1151,12}, { 607,11}, \
+ { 1215,10}, { 2431,12}, { 639,11}, { 1279,12}, \
+ { 671,11}, { 1343,10}, { 2687,12}, { 703,11}, \
+ { 1407,12}, { 735,11}, { 1471,10}, { 2943,13}, \
+ { 383,12}, { 767,11}, { 1535,12}, { 799,11}, \
+ { 1599,12}, { 831,11}, { 1663,13}, { 447,12}, \
+ { 959,11}, { 1919,10}, { 3839,13}, { 511,12}, \
+ { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \
+ { 703,12}, { 1471,11}, { 2943,14}, { 383,13}, \
+ { 767,12}, { 1599,13}, { 831,12}, { 1727,11}, \
+ { 3455,13}, { 959,12}, { 1919,11}, { 3839,14}, \
+ { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \
+ { 2175,13}, { 1215,12}, { 2431,11}, { 4863,14}, \
+ { 639,13}, { 1343,12}, { 2687,13}, { 1471,12}, \
+ { 2943,11}, { 5887,14}, { 767,13}, { 1599,12}, \
+ { 3199,13}, { 1727,12}, { 3455,14}, { 895,13}, \
+ { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \
+ { 5887,15}, { 767,14}, { 1535,13}, { 3199,14}, \
+ { 1663,13}, { 3455,12}, { 6911,14}, { 1919,13}, \
+ { 3839,12}, { 7679,16}, { 511,15}, { 1023,14}, \
+ { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,13}, { 7679,14}, { 3967,16}, { 1023,15}, \
+ { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \
+ { 2559,14}, { 5247,15}, { 2815,14}, { 5887,13}, \
+ { 11775,16}, { 1535,15}, { 3071,14}, { 6143,15}, \
+ { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \
+ { 1023,16}, { 2047,15}, { 4095,14}, { 8191,15}, \
+ { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \
+ { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \
+ { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 7679,15}, \
+ { 15359,18}, { 2047,17}, { 4095,16}, { 9727,15}, \
+ { 19967,17}, { 5119,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 282
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 57
+#define MULLO_MUL_N_THRESHOLD 8907
+#define SQRLO_BASECASE_THRESHOLD 8
+#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD 6440
+
+#define DC_DIV_QR_THRESHOLD 43
+#define DC_DIVAPPR_Q_THRESHOLD 154
+#define DC_BDIV_QR_THRESHOLD 46
+#define DC_BDIV_Q_THRESHOLD 93
+
+#define INV_MULMOD_BNM1_THRESHOLD 36
+#define INV_NEWTON_THRESHOLD 141
+#define INV_APPR_THRESHOLD 149
+
+#define BINV_NEWTON_THRESHOLD 264
+#define REDC_1_TO_REDC_N_THRESHOLD 47
+
+#define MU_DIV_QR_THRESHOLD 1470
+#define MU_DIVAPPR_Q_THRESHOLD 1528
+#define MUPI_DIV_QR_THRESHOLD 47
+#define MU_BDIV_QR_THRESHOLD 1187
+#define MU_BDIV_Q_THRESHOLD 1589
+
+#define POWM_SEC_TABLE 3,22,194,579
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 19
+#define SET_STR_DC_THRESHOLD 195
+#define SET_STR_PRECOMPUTE_THRESHOLD 1752
+
+#define FAC_DSC_THRESHOLD 345
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 24
+#define HGCD2_DIV1_METHOD 1 /* 11.29% faster than 3 */
+#define HGCD_THRESHOLD 89
+#define HGCD_APPR_THRESHOLD 96
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 465
+#define GCDEXT_DC_THRESHOLD 233
+#define JACOBI_BASE_METHOD 1 /* 25.56% faster than 4 */
+
+/* Tuneup completed successfully, took 294200 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm
new file mode 100644
index 0000000..7c1ecd0
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_addmul_1.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1)
+include_mpn(`x86_64/coreibwl/addmul_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h
new file mode 100644
index 0000000..ffba1c5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* AMD Zen3 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2021 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should disable lib inclusion. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3800-4700 MHz Vermeer */
+/* FFT tuning limit = 10,000,000 */
+/* Generated by tuneup.c, 2021-01-01, gcc 9.3 */
+
+#define MOD_1_NORM_THRESHOLD 64
+#define MOD_1_UNNORM_THRESHOLD 85
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 35
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 9
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 15
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 18
+
+#define DIV_1_VS_MUL_1_PERCENT 398
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 89
+#define MUL_TOOM44_THRESHOLD 130
+#define MUL_TOOM6H_THRESHOLD 303
+#define MUL_TOOM8H_THRESHOLD 418
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 87
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 94
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 109
+#define SQR_TOOM4_THRESHOLD 336
+#define SQR_TOOM6_THRESHOLD 414
+#define SQR_TOOM8_THRESHOLD 592
+
+#define MULMID_TOOM42_THRESHOLD 38
+
+#define MULMOD_BNM1_THRESHOLD 14
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 332 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 332, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 31, 8}, { 63, 9}, { 35, 8}, \
+ { 73, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \
+ { 79,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319, 8}, { 639,10}, { 175, 9}, \
+ { 351,11}, { 95,10}, { 191, 9}, { 383,10}, \
+ { 207,11}, { 111,10}, { 223,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \
+ { 303, 9}, { 607,11}, { 159,10}, { 319, 9}, \
+ { 639,11}, { 175,10}, { 351,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,10}, { 415,11}, \
+ { 223,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,10}, { 607,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 415,12}, { 223,11}, { 479,13}, \
+ { 127,12}, { 255,11}, { 543,10}, { 1087,12}, \
+ { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \
+ { 319,11}, { 639,12}, { 351,11}, { 703,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,10}, { 1663,12}, { 447,11}, { 895,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \
+ { 1151,12}, { 607,13}, { 319,12}, { 639,11}, \
+ { 1279,12}, { 671,11}, { 1343,10}, { 2687,12}, \
+ { 703,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 831,11}, { 1663,13}, { 447,12}, { 895,11}, \
+ { 1791,12}, { 959,11}, { 1919,10}, { 3839,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1151,11}, { 2303,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1279, 8}, { 24063,10}, { 6399,11}, \
+ { 3327,13}, { 895,12}, { 1791,13}, { 959,12}, \
+ { 1919,11}, { 3839,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,12}, { 2431,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 186
+#define MUL_FFT_THRESHOLD 3264
+
+#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 43, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 31, 8}, { 63, 9}, { 43,10}, \
+ { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271,11}, { 143,10}, { 287, 9}, \
+ { 575,10}, { 303, 9}, { 607,11}, { 159,10}, \
+ { 319, 9}, { 639,12}, { 95,11}, { 191,10}, \
+ { 383,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,10}, { 607,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 335,10}, { 671,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,12}, { 223,11}, { 447,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 607,11}, \
+ { 1215,13}, { 319,12}, { 671,11}, { 1343,12}, \
+ { 735,13}, { 383,12}, { 799,11}, { 1599,10}, \
+ { 3199,12}, { 831,13}, { 447,12}, { 895,11}, \
+ { 1791,12}, { 959,11}, { 1919,13}, { 511,12}, \
+ { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \
+ { 1343,13}, { 703,12}, { 1407,14}, { 383,13}, \
+ { 767,12}, { 1599,13}, { 831,12}, { 1663,13}, \
+ { 895,12}, { 1791,13}, { 959,12}, { 1919,15}, \
+ { 255,14}, { 511,13}, { 1023, 9}, { 17919,10}, \
+ { 9727,12}, { 4096,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 161
+#define SQR_FFT_THRESHOLD 2624
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 31
+#define MULLO_MUL_N_THRESHOLD 6440
+#define SQRLO_BASECASE_THRESHOLD 9
+#define SQRLO_DC_THRESHOLD 129
+#define SQRLO_SQR_THRESHOLD 5103
+
+#define DC_DIV_QR_THRESHOLD 19
+#define DC_DIVAPPR_Q_THRESHOLD 123
+#define DC_BDIV_QR_THRESHOLD 79
+#define DC_BDIV_Q_THRESHOLD 154
+
+#define INV_MULMOD_BNM1_THRESHOLD 42
+#define INV_NEWTON_THRESHOLD 107
+#define INV_APPR_THRESHOLD 107
+
+#define BINV_NEWTON_THRESHOLD 312
+#define REDC_1_TO_REDC_N_THRESHOLD 77
+
+#define MU_DIV_QR_THRESHOLD 1142
+#define MU_DIVAPPR_Q_THRESHOLD 1258
+#define MUPI_DIV_QR_THRESHOLD 30
+#define MU_BDIV_QR_THRESHOLD 1120
+#define MU_BDIV_Q_THRESHOLD 1394
+
+#define POWM_SEC_TABLE 6,19,203,579,2245
+
+#define GET_STR_DC_THRESHOLD 10
+#define GET_STR_PRECOMPUTE_THRESHOLD 18
+#define SET_STR_DC_THRESHOLD 115
+#define SET_STR_PRECOMPUTE_THRESHOLD 1941
+
+#define FAC_DSC_THRESHOLD 182
+#define FAC_ODD_THRESHOLD 44
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD2_DIV1_METHOD 1 /* 13.04% faster than 3 */
+#define HGCD_THRESHOLD 65
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 293
+#define GCDEXT_DC_THRESHOLD 186
+#define JACOBI_BASE_METHOD 1 /* 12.79% faster than 3 */
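The *_THRESHOLD values above are tuned crossover sizes, in 64-bit limbs, at which GMP's generic layer switches algorithms; they are generated by tuneup.c rather than hand-picked. A hedged, self-contained illustration of how such thresholds are read (the dispatch helper below is hypothetical; only the threshold values are taken from this header):

    #include <stdio.h>

    /* Illustration only -- not GMP's internal mpn_mul dispatch code. */
    #define MUL_TOOM22_THRESHOLD   20
    #define MUL_TOOM33_THRESHOLD   89
    #define MUL_FFT_THRESHOLD    3264

    static const char *mul_algorithm (long n)   /* n = operand size in limbs */
    {
      if (n < MUL_TOOM22_THRESHOLD) return "basecase (schoolbook)";
      if (n < MUL_TOOM33_THRESHOLD) return "Toom-2 (Karatsuba)";
      if (n < MUL_FFT_THRESHOLD)    return "Toom-3 and higher Toom variants";
      return "FFT range";
    }

    int main (void)
    {
      long sizes[] = { 8, 40, 500, 5000 };
      for (int i = 0; i < 4; i++)
        printf ("%ld limbs -> %s\n", sizes[i], mul_algorithm (sizes[i]));
      return 0;
    }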
diff --git a/gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm b/gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm
new file mode 100644
index 0000000..6f1e286
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm
@@ -0,0 +1,208 @@
+dnl AMD64 mpn_mul_1 optimised for Intel Broadwell.
+
+dnl Copyright 2015, 2017, 2020 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 n/a
+C AMD bd2 n/a
+C AMD bd3 n/a
+C AMD bd4 ?
+C AMD zn1 ?
+C AMD zn2 1.6
+C AMD zn3 1.5
+C AMD bt1 n/a
+C AMD bt2 n/a
+C Intel P4 n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel WSM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL n/a
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom n/a
+C Intel SLM n/a
+C Intel GLM n/a
+C VIA nano n/a
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Put an initial mulx before switching, targeting some free registers.
+C * Tune feed-in code.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+define(`ci', `%r8') C stack
+
+define(`n', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl IFDOS(` define(`up', ``%rsi'') ') dnl
+dnl IFDOS(` define(`rp', ``%rcx'') ') dnl
+dnl IFDOS(` define(`vl', ``%r9'') ') dnl
+dnl IFDOS(` define(`r9', ``rdi'') ') dnl
+dnl IFDOS(` define(`n', ``%r8'') ') dnl
+dnl IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_1c)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r11 ')
+IFSTD(` mov %r8, %r11 ')
+ jmp L(com)
+EPILOGUE()
+
+PROLOGUE(mpn_mul_1)
+ FUNC_ENTRY(4)
+ xor R32(%r11), R32(%r11)
+L(com):
+ mov v0_param, %r10
+ mov n_param, n
+ mov R32(n_param), R32(%rax)
+ shr $3, n
+ and $7, R32(%rax) C clear OF, CF as side-effect
+ mov %r10, %rdx
+ lea L(tab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%rax,8)
+')
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(f0), L(tab))
+ JMPENT( L(f1), L(tab))
+ JMPENT( L(f2), L(tab))
+ JMPENT( L(f3), L(tab))
+ JMPENT( L(f4), L(tab))
+ JMPENT( L(f5), L(tab))
+ JMPENT( L(f6), L(tab))
+ JMPENT( L(f7), L(tab))
+ TEXT
+
+L(f0): mulx( (up), %r10, %r8)
+ lea -8(up), up
+ lea -8(rp), rp
+ lea -1(n), n
+ adc %r11, %r10
+ jmp L(b0)
+
+L(f3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea -48(rp), rp
+ adc %r11, %r9
+ jmp L(b3)
+
+L(f4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea -40(rp), rp
+ adc %r11, %r10
+ jmp L(b4)
+
+L(f5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea -32(rp), rp
+ adc %r11, %r9
+ jmp L(b5)
+
+L(f6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea -24(rp), rp
+ adc %r11, %r10
+ jmp L(b6)
+
+L(f1): mulx( (up), %r9, %rax)
+ adc %r11, %r9
+ jrcxz L(end)
+ jmp L(b1)
+
+L(end): mov %r9, (rp)
+ adc %rcx, %rax C relies on rcx = 0
+ FUNC_EXIT()
+ ret
+
+L(f2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), %r9, %rax)
+ adc %r11, %r10
+
+ ALIGN(32)
+L(top): adcx( %r8, %r9)
+ mov %r10, -8(rp)
+ jrcxz L(end)
+L(b1): mulx( 8,(up), %r10, %r8)
+ lea -1(n), n
+ mov %r9, (rp)
+ adcx( %rax, %r10)
+L(b0): mulx( 16,(up), %r9, %rax)
+ adcx( %r8, %r9)
+ mov %r10, 8(rp)
+L(b7): mulx( 24,(up), %r10, %r8)
+ lea 64(up), up
+ adcx( %rax, %r10)
+ mov %r9, 16(rp)
+L(b6): mulx( -32,(up), %r9, %rax)
+ adcx( %r8, %r9)
+ mov %r10, 24(rp)
+L(b5): mulx( -24,(up), %r10, %r8)
+ adcx( %rax, %r10)
+ mov %r9, 32(rp)
+L(b4): mulx( -16,(up), %r9, %rax)
+ adcx( %r8, %r9)
+ mov %r10, 40(rp)
+L(b3): mulx( -8,(up), %r10, %r8)
+ mov %r9, 48(rp)
+ lea 64(rp), rp
+ adcx( %rax, %r10)
+ mulx( (up), %r9, %rax)
+ jmp L(top)
+
+L(f7): mulx( (up), %r9, %rax)
+ lea -16(up), up
+ lea -16(rp), rp
+ adc %r11, %r9
+ jmp L(b7)
+EPILOGUE()
+ASM_END()
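The routine above is an 8-way unrolled mulx/adcx loop: the entry code splits n into n/8 and n mod 8, dispatches through the L(tab) jump table to one of the feed-in points L(f0)..L(f7), and mpn_mul_1c enters with a caller-supplied carry in %r11 where mpn_mul_1 starts from zero. What it computes, as a hedged C reference sketch (standard mpn semantics, 64-bit limbs assumed; ref_mul_1c is a stand-in name, not the tuned asm):

    /* rp[0..n-1] = low limbs of up[0..n-1] * v0 plus the carry-in ci;
       the final high limb is returned.  mpn_mul_1 is the ci = 0 case. */
    typedef unsigned long long limb_t;

    limb_t ref_mul_1c (limb_t *rp, const limb_t *up, long n, limb_t v0, limb_t ci)
    {
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v0 + ci;
          rp[i] = (limb_t) t;          /* low 64 bits stored to the result */
          ci    = (limb_t) (t >> 64);  /* high 64 bits carried to the next limb */
        }
      return ci;
    }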
diff --git a/gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm
new file mode 100644
index 0000000..f8c1b60
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_mul_basecase.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_basecase)
+include_mpn(`x86_64/coreibwl/mul_basecase.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm
new file mode 100644
index 0000000..c9c3487
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sbpi1_bdiv_r.
+
+dnl Copyright 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sbpi1_bdiv_r)
+include_mpn(`x86_64/coreibwl/sbpi1_bdiv_r.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm
new file mode 100644
index 0000000..9c4f65d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sqr_basecase.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sqr_basecase)
+include_mpn(`x86_64/coreibwl/sqr_basecase.asm')