From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/tune/Makefile | 957 +++++++++ gmp-6.3.0/tune/Makefile.am | 187 ++ gmp-6.3.0/tune/Makefile.in | 957 +++++++++ gmp-6.3.0/tune/README | 501 +++++ gmp-6.3.0/tune/alpha.asm | 59 + gmp-6.3.0/tune/common.c | 2945 +++++++++++++++++++++++++++ gmp-6.3.0/tune/div_qr_1_tune.c | 50 + gmp-6.3.0/tune/div_qr_1n_pi1_1.c | 38 + gmp-6.3.0/tune/div_qr_1n_pi1_2.c | 38 + gmp-6.3.0/tune/div_qr_1n_pi1_3.c | 38 + gmp-6.3.0/tune/div_qr_1n_pi1_4.c | 38 + gmp-6.3.0/tune/divrem1div.c | 41 + gmp-6.3.0/tune/divrem1inv.c | 41 + gmp-6.3.0/tune/divrem2div.c | 40 + gmp-6.3.0/tune/divrem2inv.c | 40 + gmp-6.3.0/tune/freq.c | 893 +++++++++ gmp-6.3.0/tune/gcdext_double.c | 38 + gmp-6.3.0/tune/gcdext_single.c | 38 + gmp-6.3.0/tune/gcdextod.c | 39 + gmp-6.3.0/tune/gcdextos.c | 39 + gmp-6.3.0/tune/hgcd2-1.c | 39 + gmp-6.3.0/tune/hgcd2-2.c | 39 + gmp-6.3.0/tune/hgcd2-3.c | 39 + gmp-6.3.0/tune/hgcd2-4.c | 39 + gmp-6.3.0/tune/hgcd2-5.c | 39 + gmp-6.3.0/tune/hgcd2.c | 49 + gmp-6.3.0/tune/hgcd_appr_lehmer.c | 39 + gmp-6.3.0/tune/hgcd_lehmer.c | 39 + gmp-6.3.0/tune/hgcd_reduce_1.c | 40 + gmp-6.3.0/tune/hgcd_reduce_2.c | 39 + gmp-6.3.0/tune/hppa.asm | 42 + gmp-6.3.0/tune/hppa2.asm | 44 + gmp-6.3.0/tune/hppa2w.asm | 44 + gmp-6.3.0/tune/ia64.asm | 47 + gmp-6.3.0/tune/jacbase1.c | 37 + gmp-6.3.0/tune/jacbase2.c | 37 + gmp-6.3.0/tune/jacbase3.c | 37 + gmp-6.3.0/tune/jacbase4.c | 37 + gmp-6.3.0/tune/many.pl | 1334 +++++++++++++ gmp-6.3.0/tune/mod_1_1-1.c | 40 + gmp-6.3.0/tune/mod_1_1-2.c | 40 + gmp-6.3.0/tune/mod_1_div.c | 45 + gmp-6.3.0/tune/mod_1_inv.c | 45 + gmp-6.3.0/tune/modlinv.c | 177 ++ gmp-6.3.0/tune/noop.c | 67 + gmp-6.3.0/tune/pentium.asm | 60 + gmp-6.3.0/tune/powerpc.asm | 53 + gmp-6.3.0/tune/powerpc64.asm | 49 + gmp-6.3.0/tune/powm_mod.c | 38 + gmp-6.3.0/tune/powm_redc.c | 40 + gmp-6.3.0/tune/pre_divrem_1.c | 40 + gmp-6.3.0/tune/set_strb.c 
| 46 + gmp-6.3.0/tune/set_strp.c | 42 + gmp-6.3.0/tune/set_strs.c | 42 + gmp-6.3.0/tune/sparcv9.asm | 45 + gmp-6.3.0/tune/speed-ext.c | 233 +++ gmp-6.3.0/tune/speed.c | 1419 +++++++++++++ gmp-6.3.0/tune/speed.h | 3981 +++++++++++++++++++++++++++++++++++++ gmp-6.3.0/tune/sqr_basecase.c | 2 + gmp-6.3.0/tune/time.c | 1598 +++++++++++++++ gmp-6.3.0/tune/tune-gcd-p.c | 225 +++ gmp-6.3.0/tune/tuneup.c | 3072 ++++++++++++++++++++++++++++ gmp-6.3.0/tune/x86_64.asm | 55 + 63 files changed, 20531 insertions(+) create mode 100644 gmp-6.3.0/tune/Makefile create mode 100644 gmp-6.3.0/tune/Makefile.am create mode 100644 gmp-6.3.0/tune/Makefile.in create mode 100644 gmp-6.3.0/tune/README create mode 100644 gmp-6.3.0/tune/alpha.asm create mode 100644 gmp-6.3.0/tune/common.c create mode 100644 gmp-6.3.0/tune/div_qr_1_tune.c create mode 100644 gmp-6.3.0/tune/div_qr_1n_pi1_1.c create mode 100644 gmp-6.3.0/tune/div_qr_1n_pi1_2.c create mode 100644 gmp-6.3.0/tune/div_qr_1n_pi1_3.c create mode 100644 gmp-6.3.0/tune/div_qr_1n_pi1_4.c create mode 100644 gmp-6.3.0/tune/divrem1div.c create mode 100644 gmp-6.3.0/tune/divrem1inv.c create mode 100644 gmp-6.3.0/tune/divrem2div.c create mode 100644 gmp-6.3.0/tune/divrem2inv.c create mode 100644 gmp-6.3.0/tune/freq.c create mode 100644 gmp-6.3.0/tune/gcdext_double.c create mode 100644 gmp-6.3.0/tune/gcdext_single.c create mode 100644 gmp-6.3.0/tune/gcdextod.c create mode 100644 gmp-6.3.0/tune/gcdextos.c create mode 100644 gmp-6.3.0/tune/hgcd2-1.c create mode 100644 gmp-6.3.0/tune/hgcd2-2.c create mode 100644 gmp-6.3.0/tune/hgcd2-3.c create mode 100644 gmp-6.3.0/tune/hgcd2-4.c create mode 100644 gmp-6.3.0/tune/hgcd2-5.c create mode 100644 gmp-6.3.0/tune/hgcd2.c create mode 100644 gmp-6.3.0/tune/hgcd_appr_lehmer.c create mode 100644 gmp-6.3.0/tune/hgcd_lehmer.c create mode 100644 gmp-6.3.0/tune/hgcd_reduce_1.c create mode 100644 gmp-6.3.0/tune/hgcd_reduce_2.c create mode 100644 gmp-6.3.0/tune/hppa.asm create mode 100644 gmp-6.3.0/tune/hppa2.asm 
create mode 100644 gmp-6.3.0/tune/hppa2w.asm create mode 100644 gmp-6.3.0/tune/ia64.asm create mode 100644 gmp-6.3.0/tune/jacbase1.c create mode 100644 gmp-6.3.0/tune/jacbase2.c create mode 100644 gmp-6.3.0/tune/jacbase3.c create mode 100644 gmp-6.3.0/tune/jacbase4.c create mode 100644 gmp-6.3.0/tune/many.pl create mode 100644 gmp-6.3.0/tune/mod_1_1-1.c create mode 100644 gmp-6.3.0/tune/mod_1_1-2.c create mode 100644 gmp-6.3.0/tune/mod_1_div.c create mode 100644 gmp-6.3.0/tune/mod_1_inv.c create mode 100644 gmp-6.3.0/tune/modlinv.c create mode 100644 gmp-6.3.0/tune/noop.c create mode 100644 gmp-6.3.0/tune/pentium.asm create mode 100644 gmp-6.3.0/tune/powerpc.asm create mode 100644 gmp-6.3.0/tune/powerpc64.asm create mode 100644 gmp-6.3.0/tune/powm_mod.c create mode 100644 gmp-6.3.0/tune/powm_redc.c create mode 100644 gmp-6.3.0/tune/pre_divrem_1.c create mode 100644 gmp-6.3.0/tune/set_strb.c create mode 100644 gmp-6.3.0/tune/set_strp.c create mode 100644 gmp-6.3.0/tune/set_strs.c create mode 100644 gmp-6.3.0/tune/sparcv9.asm create mode 100644 gmp-6.3.0/tune/speed-ext.c create mode 100644 gmp-6.3.0/tune/speed.c create mode 100644 gmp-6.3.0/tune/speed.h create mode 100644 gmp-6.3.0/tune/sqr_basecase.c create mode 100644 gmp-6.3.0/tune/time.c create mode 100644 gmp-6.3.0/tune/tune-gcd-p.c create mode 100644 gmp-6.3.0/tune/tuneup.c create mode 100644 gmp-6.3.0/tune/x86_64.asm (limited to 'gmp-6.3.0/tune') diff --git a/gmp-6.3.0/tune/Makefile b/gmp-6.3.0/tune/Makefile new file mode 100644 index 0000000..24a6e9c --- /dev/null +++ b/gmp-6.3.0/tune/Makefile @@ -0,0 +1,957 @@ +# Makefile.in generated by automake 1.15 from Makefile.am. +# tune/Makefile. Generated from Makefile.in by configure. + +# Copyright (C) 1994-2014 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + + +# Copyright 2000-2003, 2005-2011 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + +# Copyright 1996, 1998-2002 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. 
+# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + + +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/gmp 
+pkgincludedir = $(includedir)/gmp +pkglibdir = $(libdir)/gmp +pkglibexecdir = $(libexecdir)/gmp +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = pentiumm-pc-linux-gnu +host_triplet = pentiumm-pc-linux-gnu +EXTRA_PROGRAMS = speed$(EXEEXT) speed-dynamic$(EXEEXT) \ + speed-ext$(EXEEXT) tuneup$(EXEEXT) tune-gcd-p$(EXEEXT) +subdir = tune +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(noinst_HEADERS) \ + $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +am__DEPENDENCIES_1 = +am__DEPENDENCIES_2 = $(am__DEPENDENCIES_1) \ + $(top_builddir)/tests/libtests.la $(top_builddir)/libgmp.la +am_libspeed_la_OBJECTS = common.lo divrem1div.lo divrem1inv.lo \ + divrem2div.lo divrem2inv.lo div_qr_1n_pi1_1.lo \ + div_qr_1n_pi1_2.lo div_qr_1n_pi1_3.lo div_qr_1n_pi1_4.lo \ + div_qr_1_tune.lo freq.lo gcdext_single.lo gcdext_double.lo \ + gcdextod.lo gcdextos.lo hgcd_lehmer.lo hgcd_appr_lehmer.lo \ + hgcd_reduce_1.lo hgcd_reduce_2.lo jacbase1.lo jacbase2.lo \ + jacbase3.lo jacbase4.lo hgcd2-1.lo hgcd2-2.lo hgcd2-3.lo \ + hgcd2-4.lo hgcd2-5.lo mod_1_div.lo mod_1_inv.lo mod_1_1-1.lo \ + mod_1_1-2.lo modlinv.lo noop.lo powm_mod.lo powm_redc.lo \ + pre_divrem_1.lo set_strb.lo set_strs.lo set_strp.lo time.lo +libspeed_la_OBJECTS = $(am_libspeed_la_OBJECTS) +AM_V_lt = $(am__v_lt_$(V)) +am__v_lt_ = $(am__v_lt_$(AM_DEFAULT_VERBOSITY)) +am__v_lt_0 = --silent +am__v_lt_1 = 
+libspeed_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(libspeed_la_LDFLAGS) $(LDFLAGS) -o $@ +am_speed_OBJECTS = speed.$(OBJEXT) +speed_OBJECTS = $(am_speed_OBJECTS) +speed_LDADD = $(LDADD) +speed_DEPENDENCIES = $(DEPENDENCIES) $(am__DEPENDENCIES_1) +speed_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(speed_LDFLAGS) $(LDFLAGS) -o $@ +am_speed_dynamic_OBJECTS = speed.$(OBJEXT) +speed_dynamic_OBJECTS = $(am_speed_dynamic_OBJECTS) +speed_dynamic_LDADD = $(LDADD) +speed_dynamic_DEPENDENCIES = $(DEPENDENCIES) $(am__DEPENDENCIES_1) +am_speed_ext_OBJECTS = speed-ext.$(OBJEXT) +speed_ext_OBJECTS = $(am_speed_ext_OBJECTS) +speed_ext_LDADD = $(LDADD) +speed_ext_DEPENDENCIES = $(DEPENDENCIES) $(am__DEPENDENCIES_1) +speed_ext_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(speed_ext_LDFLAGS) $(LDFLAGS) -o $@ +am_tune_gcd_p_OBJECTS = tune-gcd-p.$(OBJEXT) +tune_gcd_p_OBJECTS = $(am_tune_gcd_p_OBJECTS) +tune_gcd_p_LDADD = $(LDADD) +tune_gcd_p_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(tune_gcd_p_LDFLAGS) $(LDFLAGS) -o $@ +am_tuneup_OBJECTS = tuneup.$(OBJEXT) hgcd2.$(OBJEXT) +am__objects_1 = div_qr_2.$(OBJEXT) bdiv_q.$(OBJEXT) bdiv_qr.$(OBJEXT) \ + dcpi1_div_qr.$(OBJEXT) dcpi1_divappr_q.$(OBJEXT) \ + dcpi1_bdiv_qr.$(OBJEXT) dcpi1_bdiv_q.$(OBJEXT) \ + invertappr.$(OBJEXT) invert.$(OBJEXT) binvert.$(OBJEXT) \ + divrem_2.$(OBJEXT) gcd.$(OBJEXT) gcdext.$(OBJEXT) \ + get_str.$(OBJEXT) set_str.$(OBJEXT) matrix22_mul.$(OBJEXT) \ + hgcd.$(OBJEXT) hgcd_appr.$(OBJEXT) hgcd_reduce.$(OBJEXT) \ + mul_n.$(OBJEXT) sqr.$(OBJEXT) sec_powm.$(OBJEXT) \ + mullo_n.$(OBJEXT) mul_fft.$(OBJEXT) mul.$(OBJEXT) \ + tdiv_qr.$(OBJEXT) mulmod_bnm1.$(OBJEXT) sqrmod_bnm1.$(OBJEXT) \ + 
mulmid.$(OBJEXT) mulmid_n.$(OBJEXT) toom42_mulmid.$(OBJEXT) \ + sqrlo.$(OBJEXT) sqrlo_basecase.$(OBJEXT) \ + nussbaumer_mul.$(OBJEXT) toom6h_mul.$(OBJEXT) \ + toom8h_mul.$(OBJEXT) toom6_sqr.$(OBJEXT) toom8_sqr.$(OBJEXT) \ + toom22_mul.$(OBJEXT) toom2_sqr.$(OBJEXT) toom33_mul.$(OBJEXT) \ + toom3_sqr.$(OBJEXT) toom44_mul.$(OBJEXT) toom4_sqr.$(OBJEXT) +am__objects_2 = $(am__objects_1) divrem_1.$(OBJEXT) mod_1.$(OBJEXT) +nodist_tuneup_OBJECTS = sqr_basecase.$(OBJEXT) fac_ui.$(OBJEXT) \ + $(am__objects_2) +tuneup_OBJECTS = $(am_tuneup_OBJECTS) $(nodist_tuneup_OBJECTS) +am__DEPENDENCIES_3 = $(am__DEPENDENCIES_1) libspeed.la +tuneup_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(tuneup_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_P = $(am__v_P_$(V)) +am__v_P_ = $(am__v_P_$(AM_DEFAULT_VERBOSITY)) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_$(V)) +am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_$(V)) +am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I. 
-I$(top_builddir) +depcomp = +am__depfiles_maybe = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_$(V)) +am__v_CC_ = $(am__v_CC_$(AM_DEFAULT_VERBOSITY)) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_$(V)) +am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY)) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(libspeed_la_SOURCES) $(speed_SOURCES) \ + $(speed_dynamic_SOURCES) $(speed_ext_SOURCES) \ + $(tune_gcd_p_SOURCES) $(tuneup_SOURCES) \ + $(nodist_tuneup_SOURCES) +DIST_SOURCES = $(libspeed_la_SOURCES) $(speed_SOURCES) \ + $(speed_dynamic_SOURCES) $(speed_ext_SOURCES) \ + $(tune_gcd_p_SOURCES) $(tuneup_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +HEADERS = $(noinst_HEADERS) +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. 
+am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/../mpn/Makeasm.am $(srcdir)/Makefile.in \ + README +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ABI = 32 +ACLOCAL = ${SHELL} /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/missing aclocal-1.15 +AMTAR = $${TAR-tar} +AM_DEFAULT_VERBOSITY = 1 +AR = ar +AS = as +ASMFLAGS = -Wa,--noexecstack +AUTOCONF = ${SHELL} /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/missing autoconf +AUTOHEADER = ${SHELL} /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/missing autoheader +AUTOMAKE = ${SHELL} /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/missing automake-1.15 +AWK = gawk +CALLING_CONVENTIONS_OBJS = x86call.lo x86check$U.lo +CC = gcc +CCAS = gcc -c +CC_FOR_BUILD = gcc +CFLAGS = -m32 -O2 -pedantic -fomit-frame-pointer -mtune=pentium3 -march=pentium3 +CPP = gcc -E +CPPFLAGS = +CPP_FOR_BUILD = gcc -E +CXX = +CXXCPP = +CXXFLAGS = +CYGPATH_W = echo +DEFN_LONG_LONG_LIMB = /* #undef _LONG_LONG_LIMB */ +DEFS = -DHAVE_CONFIG_H +DLLTOOL = dlltool +DSYMUTIL = +DUMPBIN = +ECHO_C = +ECHO_N = -n +ECHO_T = +EGREP = /usr/bin/grep -E +EXEEXT = +EXEEXT_FOR_BUILD = +FGREP = /usr/bin/grep -F +GMP_LDFLAGS = +GMP_LIMB_BITS = 32 +GMP_NAIL_BITS = 0 +GREP = /usr/bin/grep +HAVE_CLOCK_01 = 1 +HAVE_CPUTIME_01 = 0 +HAVE_GETRUSAGE_01 = 1 +HAVE_GETTIMEOFDAY_01 = 1 +HAVE_HOST_CPU_FAMILY_power = 0 +HAVE_HOST_CPU_FAMILY_powerpc = 0 +HAVE_SIGACTION_01 = 1 +HAVE_SIGALTSTACK_01 = 1 +HAVE_SIGSTACK_01 = 1 +HAVE_STACK_T_01 = 1 +HAVE_SYS_RESOURCE_H_01 = 1 +INSTALL = /usr/bin/install -c +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_PROGRAM = ${INSTALL} +INSTALL_SCRIPT = ${INSTALL} +INSTALL_STRIP_PROGRAM = $(install_sh) -c -s +LD = /usr/bin/ld +LDFLAGS = +LEX = flex +LEXLIB = -lfl +LEX_OUTPUT_ROOT = lex.yy +LIBCURSES = -lncurses +LIBGMPXX_LDFLAGS = +LIBGMP_DLL = 0 
+LIBGMP_LDFLAGS = +LIBM = -lm +LIBM_FOR_BUILD = -lm +LIBOBJS = +LIBREADLINE = -lreadline +LIBS = +LIBTOOL = $(SHELL) $(top_builddir)/libtool +LIPO = +LN_S = ln -s +LTLIBOBJS = +LT_SYS_LIBRARY_PATH = +M4 = m4 +MAINT = # +MAKEINFO = ${SHELL} /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/missing makeinfo +MANIFEST_TOOL = : +MKDIR_P = /usr/bin/mkdir -p +NM = /usr/bin/nm -B +NMEDIT = +OBJDUMP = objdump +OBJEXT = o +OTOOL = +OTOOL64 = +PACKAGE = gmp +PACKAGE_BUGREPORT = gmp-bugs@gmplib.org (see https://gmplib.org/manual/Reporting-Bugs.html) +PACKAGE_NAME = GNU MP +PACKAGE_STRING = GNU MP 6.3.0 +PACKAGE_TARNAME = gmp +PACKAGE_URL = http://www.gnu.org/software/gmp/ +PACKAGE_VERSION = 6.3.0 +PATH_SEPARATOR = : +RANLIB = ranlib +SED = /usr/bin/sed +SET_MAKE = +SHELL = /bin/sh +SPEED_CYCLECOUNTER_OBJ = pentium.lo +STRIP = strip +TAL_OBJECT = tal-reent.lo +TUNE_LIBS = +TUNE_SQR_OBJ = +U_FOR_BUILD = +VERSION = 6.3.0 +WITH_READLINE_01 = 1 +YACC = bison -y +YFLAGS = +abs_builddir = /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/tune +abs_srcdir = /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/tune +abs_top_builddir = /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0 +abs_top_srcdir = /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0 +ac_ct_AR = ar +ac_ct_CC = gcc +ac_ct_CXX = +ac_ct_DUMPBIN = +am__leading_dot = . +am__tar = $${TAR-tar} chof - "$$tardir" +am__untar = $${TAR-tar} xf - +bindir = ${exec_prefix}/bin +build = pentiumm-pc-linux-gnu +build_alias = +build_cpu = pentiumm +build_os = linux-gnu +build_vendor = pc +builddir = . 
+datadir = ${datarootdir} +datarootdir = ${prefix}/share +docdir = ${datarootdir}/doc/${PACKAGE_TARNAME} +dvidir = ${docdir} +exec_prefix = ${prefix} +gmp_srclinks = mpn/add.c mpn/add_1.c mpn/add_n.asm mpn/sub.c mpn/sub_1.c mpn/sub_n.asm mpn/cnd_add_n.asm mpn/cnd_sub_n.asm mpn/cnd_swap.c mpn/neg.c mpn/com.c mpn/mul_1.asm mpn/addmul_1.asm mpn/submul_1.asm mpn/add_err1_n.c mpn/add_err2_n.c mpn/add_err3_n.c mpn/sub_err1_n.c mpn/sub_err2_n.c mpn/sub_err3_n.c mpn/lshift.asm mpn/rshift.asm mpn/dive_1.asm mpn/diveby3.c mpn/divis.c mpn/divrem.c mpn/divrem_1.asm mpn/divrem_2.asm mpn/fib2_ui.c mpn/fib2m.c mpn/mod_1.c mpn/mod_34lsub1.asm mpn/mode1o.asm mpn/pre_mod_1.c mpn/dump.c mpn/mod_1_1.asm mpn/mod_1_2.c mpn/mod_1_3.c mpn/mod_1_4.asm mpn/lshiftc.c mpn/mul.c mpn/mul_fft.c mpn/mul_n.c mpn/sqr.c mpn/mul_basecase.asm mpn/sqr_basecase.asm mpn/nussbaumer_mul.c mpn/mulmid_basecase.c mpn/toom42_mulmid.c mpn/mulmid_n.c mpn/mulmid.c mpn/random.c mpn/random2.c mpn/pow_1.c mpn/rootrem.c mpn/sqrtrem.c mpn/sizeinbase.c mpn/get_str.c mpn/set_str.c mpn/compute_powtab.c mpn/scan0.c mpn/scan1.c mpn/popcount.asm mpn/hamdist.asm mpn/cmp.c mpn/zero_p.c mpn/perfsqr.c mpn/perfpow.c mpn/strongfibo.c mpn/gcd_11.asm mpn/gcd_22.c mpn/gcd_1.c mpn/gcd.c mpn/gcdext_1.c mpn/gcdext.c mpn/gcd_subdiv_step.c mpn/gcdext_lehmer.c mpn/div_q.c mpn/tdiv_qr.c mpn/jacbase.c mpn/jacobi_2.c mpn/jacobi.c mpn/get_d.c mpn/matrix22_mul.c mpn/matrix22_mul1_inverse_vector.c mpn/hgcd_matrix.c mpn/hgcd2.c mpn/hgcd_step.c mpn/hgcd_reduce.c mpn/hgcd.c mpn/hgcd_appr.c mpn/hgcd2_jacobi.c mpn/hgcd_jacobi.c mpn/mullo_n.c mpn/mullo_basecase.c mpn/sqrlo.c mpn/sqrlo_basecase.c mpn/toom22_mul.c mpn/toom32_mul.c mpn/toom42_mul.c mpn/toom52_mul.c mpn/toom62_mul.c mpn/toom33_mul.c mpn/toom43_mul.c mpn/toom53_mul.c mpn/toom54_mul.c mpn/toom63_mul.c mpn/toom44_mul.c mpn/toom6h_mul.c mpn/toom6_sqr.c mpn/toom8h_mul.c mpn/toom8_sqr.c mpn/toom_couple_handling.c mpn/toom2_sqr.c mpn/toom3_sqr.c mpn/toom4_sqr.c mpn/toom_eval_dgr3_pm1.c 
mpn/toom_eval_dgr3_pm2.c mpn/toom_eval_pm1.c mpn/toom_eval_pm2.c mpn/toom_eval_pm2exp.c mpn/toom_eval_pm2rexp.c mpn/toom_interpolate_5pts.c mpn/toom_interpolate_6pts.c mpn/toom_interpolate_7pts.c mpn/toom_interpolate_8pts.c mpn/toom_interpolate_12pts.c mpn/toom_interpolate_16pts.c mpn/invertappr.c mpn/invert.c mpn/binvert.c mpn/mulmod_bnm1.c mpn/sqrmod_bnm1.c mpn/mulmod_bknp1.c mpn/div_qr_1.c mpn/div_qr_1n_pi1.c mpn/div_qr_2.c mpn/div_qr_2n_pi1.c mpn/div_qr_2u_pi1.c mpn/sbpi1_div_q.c mpn/sbpi1_div_qr.c mpn/sbpi1_divappr_q.c mpn/dcpi1_div_q.c mpn/dcpi1_div_qr.c mpn/dcpi1_divappr_q.c mpn/mu_div_qr.c mpn/mu_divappr_q.c mpn/mu_div_q.c mpn/bdiv_q_1.asm mpn/sbpi1_bdiv_q.c mpn/sbpi1_bdiv_qr.c mpn/sbpi1_bdiv_r.c mpn/dcpi1_bdiv_q.c mpn/dcpi1_bdiv_qr.c mpn/mu_bdiv_q.c mpn/mu_bdiv_qr.c mpn/bdiv_q.c mpn/bdiv_qr.c mpn/broot.c mpn/brootinv.c mpn/bsqrt.c mpn/bsqrtinv.c mpn/divexact.c mpn/bdiv_dbm1c.asm mpn/redc_1.c mpn/redc_2.c mpn/redc_n.c mpn/powm.c mpn/powlo.c mpn/sec_powm.c mpn/sec_mul.c mpn/sec_sqr.c mpn/sec_div_qr.c mpn/sec_div_r.c mpn/sec_pi1_div_qr.c mpn/sec_pi1_div_r.c mpn/sec_add_1.c mpn/sec_sub_1.c mpn/sec_invert.c mpn/trialdiv.c mpn/remove.c mpn/and_n.c mpn/andn_n.c mpn/nand_n.c mpn/ior_n.c mpn/iorn_n.c mpn/nior_n.c mpn/xor_n.c mpn/xnor_n.c mpn/copyi.asm mpn/copyd.asm mpn/zero.c mpn/sec_tabselect.asm mpn/comb_tables.c mpn/umul.asm mpn/udiv.asm mpn/add_n_sub_n.c gmp-mparam.h +host = pentiumm-pc-linux-gnu +host_alias = +host_cpu = pentiumm +host_os = linux-gnu +host_vendor = pc +htmldir = ${docdir} +includedir = ${prefix}/include +infodir = ${datarootdir}/info +install_sh = ${SHELL} /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/install-sh +libdir = ${exec_prefix}/lib +libexecdir = ${exec_prefix}/libexec +localedir = ${datarootdir}/locale +localstatedir = ${prefix}/var +mandir = ${datarootdir}/man +mkdir_p = $(MKDIR_P) +mpn_objects = add$U.lo add_1$U.lo add_n.lo sub$U.lo sub_1$U.lo sub_n.lo cnd_add_n.lo cnd_sub_n.lo cnd_swap$U.lo neg$U.lo com$U.lo mul_1.lo addmul_1.lo 
submul_1.lo add_err1_n$U.lo add_err2_n$U.lo add_err3_n$U.lo sub_err1_n$U.lo sub_err2_n$U.lo sub_err3_n$U.lo lshift.lo rshift.lo dive_1.lo diveby3$U.lo divis$U.lo divrem$U.lo divrem_1.lo divrem_2.lo fib2_ui$U.lo fib2m$U.lo mod_1$U.lo mod_34lsub1.lo mode1o.lo pre_mod_1$U.lo dump$U.lo mod_1_1.lo mod_1_2$U.lo mod_1_3$U.lo mod_1_4.lo lshiftc$U.lo mul$U.lo mul_fft$U.lo mul_n$U.lo sqr$U.lo mul_basecase.lo sqr_basecase.lo nussbaumer_mul$U.lo mulmid_basecase$U.lo toom42_mulmid$U.lo mulmid_n$U.lo mulmid$U.lo random$U.lo random2$U.lo pow_1$U.lo rootrem$U.lo sqrtrem$U.lo sizeinbase$U.lo get_str$U.lo set_str$U.lo compute_powtab$U.lo scan0$U.lo scan1$U.lo popcount.lo hamdist.lo cmp$U.lo zero_p$U.lo perfsqr$U.lo perfpow$U.lo strongfibo$U.lo gcd_11.lo gcd_22$U.lo gcd_1$U.lo gcd$U.lo gcdext_1$U.lo gcdext$U.lo gcd_subdiv_step$U.lo gcdext_lehmer$U.lo div_q$U.lo tdiv_qr$U.lo jacbase$U.lo jacobi_2$U.lo jacobi$U.lo get_d$U.lo matrix22_mul$U.lo matrix22_mul1_inverse_vector$U.lo hgcd_matrix$U.lo hgcd2$U.lo hgcd_step$U.lo hgcd_reduce$U.lo hgcd$U.lo hgcd_appr$U.lo hgcd2_jacobi$U.lo hgcd_jacobi$U.lo mullo_n$U.lo mullo_basecase$U.lo sqrlo$U.lo sqrlo_basecase$U.lo toom22_mul$U.lo toom32_mul$U.lo toom42_mul$U.lo toom52_mul$U.lo toom62_mul$U.lo toom33_mul$U.lo toom43_mul$U.lo toom53_mul$U.lo toom54_mul$U.lo toom63_mul$U.lo toom44_mul$U.lo toom6h_mul$U.lo toom6_sqr$U.lo toom8h_mul$U.lo toom8_sqr$U.lo toom_couple_handling$U.lo toom2_sqr$U.lo toom3_sqr$U.lo toom4_sqr$U.lo toom_eval_dgr3_pm1$U.lo toom_eval_dgr3_pm2$U.lo toom_eval_pm1$U.lo toom_eval_pm2$U.lo toom_eval_pm2exp$U.lo toom_eval_pm2rexp$U.lo toom_interpolate_5pts$U.lo toom_interpolate_6pts$U.lo toom_interpolate_7pts$U.lo toom_interpolate_8pts$U.lo toom_interpolate_12pts$U.lo toom_interpolate_16pts$U.lo invertappr$U.lo invert$U.lo binvert$U.lo mulmod_bnm1$U.lo sqrmod_bnm1$U.lo mulmod_bknp1$U.lo div_qr_1$U.lo div_qr_1n_pi1$U.lo div_qr_2$U.lo div_qr_2n_pi1$U.lo div_qr_2u_pi1$U.lo sbpi1_div_q$U.lo sbpi1_div_qr$U.lo sbpi1_divappr_q$U.lo 
dcpi1_div_q$U.lo dcpi1_div_qr$U.lo dcpi1_divappr_q$U.lo mu_div_qr$U.lo mu_divappr_q$U.lo mu_div_q$U.lo bdiv_q_1.lo sbpi1_bdiv_q$U.lo sbpi1_bdiv_qr$U.lo sbpi1_bdiv_r$U.lo dcpi1_bdiv_q$U.lo dcpi1_bdiv_qr$U.lo mu_bdiv_q$U.lo mu_bdiv_qr$U.lo bdiv_q$U.lo bdiv_qr$U.lo broot$U.lo brootinv$U.lo bsqrt$U.lo bsqrtinv$U.lo divexact$U.lo bdiv_dbm1c.lo redc_1$U.lo redc_2$U.lo redc_n$U.lo powm$U.lo powlo$U.lo sec_powm$U.lo sec_mul$U.lo sec_sqr$U.lo sec_div_qr$U.lo sec_div_r$U.lo sec_pi1_div_qr$U.lo sec_pi1_div_r$U.lo sec_add_1$U.lo sec_sub_1$U.lo sec_invert$U.lo trialdiv$U.lo remove$U.lo and_n$U.lo andn_n$U.lo nand_n$U.lo ior_n$U.lo iorn_n$U.lo nior_n$U.lo xor_n$U.lo xnor_n$U.lo copyi.lo copyd.lo zero$U.lo sec_tabselect.lo comb_tables$U.lo umul.lo udiv.lo add_n_sub_n$U.lo +mpn_objs_in_libgmp = mpn/add$U.lo mpn/add_1$U.lo mpn/add_n.lo mpn/sub$U.lo mpn/sub_1$U.lo mpn/sub_n.lo mpn/cnd_add_n.lo mpn/cnd_sub_n.lo mpn/cnd_swap$U.lo mpn/neg$U.lo mpn/com$U.lo mpn/mul_1.lo mpn/addmul_1.lo mpn/submul_1.lo mpn/add_err1_n$U.lo mpn/add_err2_n$U.lo mpn/add_err3_n$U.lo mpn/sub_err1_n$U.lo mpn/sub_err2_n$U.lo mpn/sub_err3_n$U.lo mpn/lshift.lo mpn/rshift.lo mpn/dive_1.lo mpn/diveby3$U.lo mpn/divis$U.lo mpn/divrem$U.lo mpn/divrem_1.lo mpn/divrem_2.lo mpn/fib2_ui$U.lo mpn/fib2m$U.lo mpn/mod_1$U.lo mpn/mod_34lsub1.lo mpn/mode1o.lo mpn/pre_mod_1$U.lo mpn/dump$U.lo mpn/mod_1_1.lo mpn/mod_1_2$U.lo mpn/mod_1_3$U.lo mpn/mod_1_4.lo mpn/lshiftc$U.lo mpn/mul$U.lo mpn/mul_fft$U.lo mpn/mul_n$U.lo mpn/sqr$U.lo mpn/mul_basecase.lo mpn/sqr_basecase.lo mpn/nussbaumer_mul$U.lo mpn/mulmid_basecase$U.lo mpn/toom42_mulmid$U.lo mpn/mulmid_n$U.lo mpn/mulmid$U.lo mpn/random$U.lo mpn/random2$U.lo mpn/pow_1$U.lo mpn/rootrem$U.lo mpn/sqrtrem$U.lo mpn/sizeinbase$U.lo mpn/get_str$U.lo mpn/set_str$U.lo mpn/compute_powtab$U.lo mpn/scan0$U.lo mpn/scan1$U.lo mpn/popcount.lo mpn/hamdist.lo mpn/cmp$U.lo mpn/zero_p$U.lo mpn/perfsqr$U.lo mpn/perfpow$U.lo mpn/strongfibo$U.lo mpn/gcd_11.lo mpn/gcd_22$U.lo mpn/gcd_1$U.lo mpn/gcd$U.lo 
mpn/gcdext_1$U.lo mpn/gcdext$U.lo mpn/gcd_subdiv_step$U.lo mpn/gcdext_lehmer$U.lo mpn/div_q$U.lo mpn/tdiv_qr$U.lo mpn/jacbase$U.lo mpn/jacobi_2$U.lo mpn/jacobi$U.lo mpn/get_d$U.lo mpn/matrix22_mul$U.lo mpn/matrix22_mul1_inverse_vector$U.lo mpn/hgcd_matrix$U.lo mpn/hgcd2$U.lo mpn/hgcd_step$U.lo mpn/hgcd_reduce$U.lo mpn/hgcd$U.lo mpn/hgcd_appr$U.lo mpn/hgcd2_jacobi$U.lo mpn/hgcd_jacobi$U.lo mpn/mullo_n$U.lo mpn/mullo_basecase$U.lo mpn/sqrlo$U.lo mpn/sqrlo_basecase$U.lo mpn/toom22_mul$U.lo mpn/toom32_mul$U.lo mpn/toom42_mul$U.lo mpn/toom52_mul$U.lo mpn/toom62_mul$U.lo mpn/toom33_mul$U.lo mpn/toom43_mul$U.lo mpn/toom53_mul$U.lo mpn/toom54_mul$U.lo mpn/toom63_mul$U.lo mpn/toom44_mul$U.lo mpn/toom6h_mul$U.lo mpn/toom6_sqr$U.lo mpn/toom8h_mul$U.lo mpn/toom8_sqr$U.lo mpn/toom_couple_handling$U.lo mpn/toom2_sqr$U.lo mpn/toom3_sqr$U.lo mpn/toom4_sqr$U.lo mpn/toom_eval_dgr3_pm1$U.lo mpn/toom_eval_dgr3_pm2$U.lo mpn/toom_eval_pm1$U.lo mpn/toom_eval_pm2$U.lo mpn/toom_eval_pm2exp$U.lo mpn/toom_eval_pm2rexp$U.lo mpn/toom_interpolate_5pts$U.lo mpn/toom_interpolate_6pts$U.lo mpn/toom_interpolate_7pts$U.lo mpn/toom_interpolate_8pts$U.lo mpn/toom_interpolate_12pts$U.lo mpn/toom_interpolate_16pts$U.lo mpn/invertappr$U.lo mpn/invert$U.lo mpn/binvert$U.lo mpn/mulmod_bnm1$U.lo mpn/sqrmod_bnm1$U.lo mpn/mulmod_bknp1$U.lo mpn/div_qr_1$U.lo mpn/div_qr_1n_pi1$U.lo mpn/div_qr_2$U.lo mpn/div_qr_2n_pi1$U.lo mpn/div_qr_2u_pi1$U.lo mpn/sbpi1_div_q$U.lo mpn/sbpi1_div_qr$U.lo mpn/sbpi1_divappr_q$U.lo mpn/dcpi1_div_q$U.lo mpn/dcpi1_div_qr$U.lo mpn/dcpi1_divappr_q$U.lo mpn/mu_div_qr$U.lo mpn/mu_divappr_q$U.lo mpn/mu_div_q$U.lo mpn/bdiv_q_1.lo mpn/sbpi1_bdiv_q$U.lo mpn/sbpi1_bdiv_qr$U.lo mpn/sbpi1_bdiv_r$U.lo mpn/dcpi1_bdiv_q$U.lo mpn/dcpi1_bdiv_qr$U.lo mpn/mu_bdiv_q$U.lo mpn/mu_bdiv_qr$U.lo mpn/bdiv_q$U.lo mpn/bdiv_qr$U.lo mpn/broot$U.lo mpn/brootinv$U.lo mpn/bsqrt$U.lo mpn/bsqrtinv$U.lo mpn/divexact$U.lo mpn/bdiv_dbm1c.lo mpn/redc_1$U.lo mpn/redc_2$U.lo mpn/redc_n$U.lo mpn/powm$U.lo mpn/powlo$U.lo 
mpn/sec_powm$U.lo mpn/sec_mul$U.lo mpn/sec_sqr$U.lo mpn/sec_div_qr$U.lo mpn/sec_div_r$U.lo mpn/sec_pi1_div_qr$U.lo mpn/sec_pi1_div_r$U.lo mpn/sec_add_1$U.lo mpn/sec_sub_1$U.lo mpn/sec_invert$U.lo mpn/trialdiv$U.lo mpn/remove$U.lo mpn/and_n$U.lo mpn/andn_n$U.lo mpn/nand_n$U.lo mpn/ior_n$U.lo mpn/iorn_n$U.lo mpn/nior_n$U.lo mpn/xor_n$U.lo mpn/xnor_n$U.lo mpn/copyi.lo mpn/copyd.lo mpn/zero$U.lo mpn/sec_tabselect.lo mpn/comb_tables$U.lo mpn/umul.lo mpn/udiv.lo mpn/add_n_sub_n$U.lo +oldincludedir = /usr/include +pdfdir = ${docdir} +prefix = /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/bin +program_transform_name = s,x,x, +psdir = ${docdir} +sbindir = ${exec_prefix}/sbin +sharedstatedir = ${prefix}/com +srcdir = . +sysconfdir = ${prefix}/etc +target_alias = +top_build_prefix = ../ +top_builddir = .. +top_srcdir = .. +AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/tests +AM_LDFLAGS = -no-install +EXTRA_DIST = alpha.asm pentium.asm sparcv9.asm hppa.asm hppa2.asm hppa2w.asm \ + ia64.asm powerpc.asm powerpc64.asm x86_64.asm many.pl + +noinst_HEADERS = speed.h +#STATIC = + +# Prefer -static on the speed and tune programs, since that can avoid +# overheads of shared library linkages on some systems. Libtool tends to +# botch -static if configured with --disable-static, perhaps reasonably +# enough. In any event under --disable-static the only choice is a dynamic +# link so there's no point in -static. 
+# +STATIC = -static +EXTRA_LTLIBRARIES = libspeed.la +libspeed_la_SOURCES = \ + common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c \ + div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1n_pi1_3.c \ + div_qr_1n_pi1_4.c div_qr_1_tune.c \ + freq.c \ + gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c \ + hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c \ + jacbase1.c jacbase2.c jacbase3.c jacbase4.c \ + hgcd2-1.c hgcd2-2.c hgcd2-3.c hgcd2-4.c hgcd2-5.c \ + mod_1_div.c mod_1_inv.c mod_1_1-1.c mod_1_1-2.c modlinv.c \ + noop.c powm_mod.c powm_redc.c pre_divrem_1.c \ + set_strb.c set_strs.c set_strp.c time.c + +libspeed_la_DEPENDENCIES = $(SPEED_CYCLECOUNTER_OBJ) \ + $(top_builddir)/tests/libtests.la $(top_builddir)/libgmp.la + +libspeed_la_LIBADD = $(libspeed_la_DEPENDENCIES) $(LIBM) +libspeed_la_LDFLAGS = $(STATIC) +DEPENDENCIES = libspeed.la +LDADD = $(DEPENDENCIES) $(TUNE_LIBS) +speed_SOURCES = speed.c +speed_LDFLAGS = $(STATIC) +speed_dynamic_SOURCES = speed.c +speed_ext_SOURCES = speed-ext.c +speed_ext_LDFLAGS = $(STATIC) +tuneup_SOURCES = tuneup.c hgcd2.c +nodist_tuneup_SOURCES = sqr_basecase.c fac_ui.c $(TUNE_MPN_SRCS) +tuneup_DEPENDENCIES = $(TUNE_SQR_OBJ) libspeed.la +tuneup_LDADD = $(tuneup_DEPENDENCIES) $(TUNE_LIBS) +tuneup_LDFLAGS = $(STATIC) +tune_gcd_p_SOURCES = tune-gcd-p.c +tune_gcd_p_DEPENDENCIES = ../mpn/gcd.c +tune_gcd_p_LDFLAGS = $(STATIC) + +# $(MANY_CLEAN) and $(MANY_DISTCLEAN) are hooks for many.pl +CLEANFILES = $(EXTRA_PROGRAMS) $(EXTRA_LTLIBRARIES) \ + $(TUNE_MPN_SRCS) fac_ui.c sqr_asm.asm \ + stg.gnuplot stg.data \ + mtg.gnuplot mtg.data \ + fibg.gnuplot fibg.data \ + graph.gnuplot graph.data \ + $(MANY_CLEAN) + +DISTCLEANFILES = sqr_basecase.c $(MANY_DISTCLEAN) + +# Generating these little files at build time seems better than including +# them in the distribution, since the list can be changed more easily. 
+# +# mpn/generic/tdiv_qr.c uses mpn_divrem_1 and mpn_divrem_2, but only for 1 +# and 2 limb divisors, which are never used during tuning, so it doesn't +# matter whether it picks up a tuned or untuned version of those. +# +# divrem_1 and mod_1 are recompiled renamed to "_tune" to avoid a linking +# problem. If a native divrem_1 provides an mpn_divrem_1c entrypoint then +# common.c will want that, but the generic divrem_1 doesn't provide it, +# likewise for mod_1. The simplest way around this is to have the tune +# build versions renamed suitably. +# +# FIXME: Would like say mul_n.c to depend on $(top_builddir)/mul_n.c so the +# recompiled object will be rebuilt if that file changes. +TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c +TUNE_MPN_SRCS_BASIC = div_qr_2.c bdiv_q.c bdiv_qr.c \ + dcpi1_div_qr.c dcpi1_divappr_q.c dcpi1_bdiv_qr.c dcpi1_bdiv_q.c \ + invertappr.c invert.c binvert.c divrem_2.c gcd.c gcdext.c \ + get_str.c set_str.c matrix22_mul.c \ + hgcd.c hgcd_appr.c hgcd_reduce.c \ + mul_n.c sqr.c sec_powm.c \ + mullo_n.c mul_fft.c mul.c tdiv_qr.c mulmod_bnm1.c sqrmod_bnm1.c \ + mulmid.c mulmid_n.c toom42_mulmid.c sqrlo.c sqrlo_basecase.c \ + nussbaumer_mul.c toom6h_mul.c toom8h_mul.c toom6_sqr.c toom8_sqr.c \ + toom22_mul.c toom2_sqr.c toom33_mul.c toom3_sqr.c toom44_mul.c toom4_sqr.c + + +# COMPILE minus CC. +# +COMPILE_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) $(ASMFLAGS) + + +# Flags used for preprocessing (in ansi2knr rules). +# +PREPROCESS_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) + + +# Recent versions of automake (1.5 and up for instance) append automake +# generated suffixes to this $(SUFFIXES) list. This is essential for us, +# since .c must come after .s, .S and .asm. 
If .c is before .s, for +# instance, then in the mpn directory "make" will see add_n.c mentioned in +# an explicit rule (the ansi2knr stuff) and decide it must have add_n.c, +# even if add_n.c doesn't exist but add_n.s does. See GNU make +# documentation "(make)Implicit Rule Search", part 5c. +# +# On IRIX 6 native make this doesn't work properly though. Somehow .c +# remains ahead of .s, perhaps because .c.s is a builtin rule. .asm works +# fine though, and mpn/mips3 uses this. +# +SUFFIXES = .s .S .asm + +# can be overridden during development, eg. "make RM_TMP=: mul_1.lo" +RM_TMP = rm -f +all: all-am + +.SUFFIXES: +.SUFFIXES: .s .S .asm .c .lo .o .obj +$(srcdir)/Makefile.in: # $(srcdir)/Makefile.am $(srcdir)/../mpn/Makeasm.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu --ignore-deps tune/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu --ignore-deps tune/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; +$(srcdir)/../mpn/Makeasm.am $(am__empty): + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: # $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): # $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +libspeed.la: $(libspeed_la_OBJECTS) $(libspeed_la_DEPENDENCIES) $(EXTRA_libspeed_la_DEPENDENCIES) + $(AM_V_CCLD)$(libspeed_la_LINK) $(libspeed_la_OBJECTS) $(libspeed_la_LIBADD) $(LIBS) + +speed$(EXEEXT): $(speed_OBJECTS) $(speed_DEPENDENCIES) $(EXTRA_speed_DEPENDENCIES) + @rm -f speed$(EXEEXT) + $(AM_V_CCLD)$(speed_LINK) $(speed_OBJECTS) $(speed_LDADD) $(LIBS) + +speed-dynamic$(EXEEXT): $(speed_dynamic_OBJECTS) $(speed_dynamic_DEPENDENCIES) $(EXTRA_speed_dynamic_DEPENDENCIES) + @rm -f speed-dynamic$(EXEEXT) + $(AM_V_CCLD)$(LINK) $(speed_dynamic_OBJECTS) $(speed_dynamic_LDADD) $(LIBS) + +speed-ext$(EXEEXT): $(speed_ext_OBJECTS) $(speed_ext_DEPENDENCIES) $(EXTRA_speed_ext_DEPENDENCIES) + @rm -f speed-ext$(EXEEXT) + $(AM_V_CCLD)$(speed_ext_LINK) $(speed_ext_OBJECTS) $(speed_ext_LDADD) $(LIBS) + +tune-gcd-p$(EXEEXT): $(tune_gcd_p_OBJECTS) $(tune_gcd_p_DEPENDENCIES) $(EXTRA_tune_gcd_p_DEPENDENCIES) + @rm -f tune-gcd-p$(EXEEXT) + $(AM_V_CCLD)$(tune_gcd_p_LINK) $(tune_gcd_p_OBJECTS) $(tune_gcd_p_LDADD) $(LIBS) + +tuneup$(EXEEXT): $(tuneup_OBJECTS) $(tuneup_DEPENDENCIES) $(EXTRA_tuneup_DEPENDENCIES) + @rm -f tuneup$(EXEEXT) + $(AM_V_CCLD)$(tuneup_LINK) $(tuneup_OBJECTS) $(tuneup_LDADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm 
-f *.tab.c + +.c.o: + $(AM_V_CC)$(COMPILE) -c -o $@ $< + +.c.obj: + $(AM_V_CC)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.c.lo: + $(AM_V_CC)$(LTCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed 
'/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(HEADERS) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + -test -z "$(CLEANFILES)" || rm -f $(CLEANFILES) + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + -test -z "$(DISTCLEANFILES)" || rm -f $(DISTCLEANFILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \ + clean-libtool cscopelist-am ctags ctags-am distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am install-man \ + install-pdf install-pdf-am install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ + pdf pdf-am ps ps-am tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +$(top_builddir)/tests/libtests.la: + cd $(top_builddir)/tests; $(MAKE) $(AM_MAKEFLAGS) libtests.la + +tune: + $(MAKE) $(AM_MAKEFLAGS) tuneup$(EXEEXT) + ./tuneup + +allprogs: $(EXTRA_PROGRAMS) + +$(TUNE_MPN_SRCS_BASIC): + for i in $(TUNE_MPN_SRCS_BASIC); do \ + echo "#define 
TUNE_PROGRAM_BUILD 1" >$$i; \ + echo "#include \"mpn/generic/$$i\"" >>$$i; \ + done + +divrem_1.c: + echo "#define TUNE_PROGRAM_BUILD 1" >divrem_1.c + echo "#define __gmpn_divrem_1 mpn_divrem_1_tune" >>divrem_1.c + echo "#include \"mpn/generic/divrem_1.c\"" >>divrem_1.c + +mod_1.c: + echo "#define TUNE_PROGRAM_BUILD 1" >mod_1.c + echo "#define __gmpn_mod_1 mpn_mod_1_tune" >>mod_1.c + echo "#include \"mpn/generic/mod_1.c\"" >>mod_1.c + +sqr_asm.asm: $(top_builddir)/mpn/sqr_basecase.asm + echo 'define(SQR_TOOM2_THRESHOLD_OVERRIDE,SQR_TOOM2_THRESHOLD_MAX)' >sqr_asm.asm + echo 'include(../mpn/sqr_basecase.asm)' >>sqr_asm.asm + +# FIXME: Should it depend on $(top_builddir)/fac_ui.h too? +fac_ui.c: $(top_builddir)/mpz/fac_ui.c + echo "#define TUNE_PROGRAM_BUILD 1" >fac_ui.c + echo "#define __gmpz_fac_ui mpz_fac_ui_tune" >>fac_ui.c + echo "#define __gmpz_oddfac_1 mpz_oddfac_1_tune" >>fac_ui.c + echo "#include \"mpz/oddfac_1.c\"" >>fac_ui.c + echo "#include \"mpz/fac_ui.c\"" >>fac_ui.c + +# .s assembler, no preprocessing. +# +.s.o: + $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< +.s.obj: + $(CCAS) $(COMPILE_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` +.s.lo: + $(LIBTOOL) --mode=compile --tag=CC $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + +# .S assembler, preprocessed with cpp. +# +# It's necessary to run $(CPP) separately, since it seems not all compilers +# recognise .S files, in particular "cc" on HP-UX 10 and 11 doesn't (and +# will silently do nothing if given a .S). +# +# For .lo we need a helper script, as described below for .asm.lo. 
+# +.S.o: + $(CPP) $(PREPROCESS_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.S.obj: + $(CPP) $(PREPROCESS_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.S.lo: + $(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/cpp-ccas --cpp="$(CPP) $(PREPROCESS_FLAGS)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + +# .asm assembler, preprocessed with m4. +# +# .o and .obj are non-PIC and just need m4 followed by a compile. +# +# .lo is a bit tricky. Libtool (as of version 1.5) has foo.lo as a little +# text file, and .libs/foo.o and foo.o as the PIC and non-PIC objects, +# respectively. It'd be asking for lots of trouble to try to create foo.lo +# ourselves, so instead arrange to invoke libtool like a --mode=compile, but +# with a special m4-ccas script which first m4 preprocesses, then compiles. +# --tag=CC is necessary since foo.asm is otherwise unknown to libtool. +# +# Libtool adds -DPIC when building a shared object and the .asm files look +# for that. But it should be noted that the other PIC flags are on occasion +# important too, in particular FreeBSD 2.2.8 gas 1.92.3 requires -k before +# it accepts PIC constructs like @GOT, and gcc adds that flag only under +# -fPIC. (Later versions of gas are happy to accept PIC stuff any time.) 
+# +.asm.o: + $(M4) -DOPERATION_$* `test -f '$<' || echo '$(srcdir)/'`$< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.asm.obj: + $(M4) -DOPERATION_$* `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.asm.lo: + $(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/m4-ccas --m4="$(M4)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + +.NOTPARALLEL: + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/gmp-6.3.0/tune/Makefile.am b/gmp-6.3.0/tune/Makefile.am new file mode 100644 index 0000000..0f564ed --- /dev/null +++ b/gmp-6.3.0/tune/Makefile.am @@ -0,0 +1,187 @@ +## Process this file with automake to generate Makefile.in + +# Copyright 2000-2003, 2005-2011 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. 
+ + +AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/tests +AM_LDFLAGS = -no-install + +EXTRA_DIST = alpha.asm pentium.asm sparcv9.asm hppa.asm hppa2.asm hppa2w.asm \ + ia64.asm powerpc.asm powerpc64.asm x86_64.asm many.pl +noinst_HEADERS = speed.h + +# Prefer -static on the speed and tune programs, since that can avoid +# overheads of shared library linkages on some systems. Libtool tends to +# botch -static if configured with --disable-static, perhaps reasonably +# enough. In any event under --disable-static the only choice is a dynamic +# link so there's no point in -static. +# +if ENABLE_STATIC +STATIC = -static +else +STATIC = +endif + + +EXTRA_LTLIBRARIES = libspeed.la + +libspeed_la_SOURCES = \ + common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c \ + div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1n_pi1_3.c \ + div_qr_1n_pi1_4.c div_qr_1_tune.c \ + freq.c \ + gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c \ + hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c \ + jacbase1.c jacbase2.c jacbase3.c jacbase4.c \ + hgcd2-1.c hgcd2-2.c hgcd2-3.c hgcd2-4.c hgcd2-5.c \ + mod_1_div.c mod_1_inv.c mod_1_1-1.c mod_1_1-2.c modlinv.c \ + noop.c powm_mod.c powm_redc.c pre_divrem_1.c \ + set_strb.c set_strs.c set_strp.c time.c + +libspeed_la_DEPENDENCIES = $(SPEED_CYCLECOUNTER_OBJ) \ + $(top_builddir)/tests/libtests.la $(top_builddir)/libgmp.la +libspeed_la_LIBADD = $(libspeed_la_DEPENDENCIES) $(LIBM) +libspeed_la_LDFLAGS = $(STATIC) + +$(top_builddir)/tests/libtests.la: + cd $(top_builddir)/tests; $(MAKE) $(AM_MAKEFLAGS) libtests.la + + +# The library code is faster static than shared on some systems, so do +# tuning and measuring with static, since users who care about maximizing +# speed will be using that. speed-dynamic exists to show the difference. +# +# On Solaris 8, gcc 2.95.2 -static is somehow broken (it creates executables +# that immediately seg fault), so -all-static is not used. 
The only thing +# -all-static does is make libc static linked as well as libgmp, and that +# makes a difference only when measuring malloc and friends in the speed +# program. This can always be forced with "make speed_LDFLAGS=-all-static +# ..." if desired, see tune/README. + +EXTRA_PROGRAMS = speed speed-dynamic speed-ext tuneup tune-gcd-p + +DEPENDENCIES = libspeed.la +LDADD = $(DEPENDENCIES) $(TUNE_LIBS) + +speed_SOURCES = speed.c +speed_LDFLAGS = $(STATIC) + +speed_dynamic_SOURCES = speed.c + +speed_ext_SOURCES = speed-ext.c +speed_ext_LDFLAGS = $(STATIC) + +tuneup_SOURCES = tuneup.c hgcd2.c +nodist_tuneup_SOURCES = sqr_basecase.c fac_ui.c $(TUNE_MPN_SRCS) +tuneup_DEPENDENCIES = $(TUNE_SQR_OBJ) libspeed.la +tuneup_LDADD = $(tuneup_DEPENDENCIES) $(TUNE_LIBS) +tuneup_LDFLAGS = $(STATIC) + +tune_gcd_p_SOURCES = tune-gcd-p.c +tune_gcd_p_DEPENDENCIES = ../mpn/gcd.c +tune_gcd_p_LDFLAGS = $(STATIC) + + +tune: + $(MAKE) $(AM_MAKEFLAGS) tuneup$(EXEEXT) + ./tuneup + +allprogs: $(EXTRA_PROGRAMS) + +# $(MANY_CLEAN) and $(MANY_DISTCLEAN) are hooks for many.pl +CLEANFILES = $(EXTRA_PROGRAMS) $(EXTRA_LTLIBRARIES) \ + $(TUNE_MPN_SRCS) fac_ui.c sqr_asm.asm \ + stg.gnuplot stg.data \ + mtg.gnuplot mtg.data \ + fibg.gnuplot fibg.data \ + graph.gnuplot graph.data \ + $(MANY_CLEAN) +DISTCLEANFILES = sqr_basecase.c $(MANY_DISTCLEAN) + + +# Generating these little files at build time seems better than including +# them in the distribution, since the list can be changed more easily. +# +# mpn/generic/tdiv_qr.c uses mpn_divrem_1 and mpn_divrem_2, but only for 1 +# and 2 limb divisors, which are never used during tuning, so it doesn't +# matter whether it picks up a tuned or untuned version of those. +# +# divrem_1 and mod_1 are recompiled renamed to "_tune" to avoid a linking +# problem. If a native divrem_1 provides an mpn_divrem_1c entrypoint then +# common.c will want that, but the generic divrem_1 doesn't provide it, +# likewise for mod_1. 
The simplest way around this is to have the tune +# build versions renamed suitably. +# +# FIXME: Would like say mul_n.c to depend on $(top_builddir)/mul_n.c so the +# recompiled object will be rebuilt if that file changes. + +TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c +TUNE_MPN_SRCS_BASIC = div_qr_2.c bdiv_q.c bdiv_qr.c \ + dcpi1_div_qr.c dcpi1_divappr_q.c dcpi1_bdiv_qr.c dcpi1_bdiv_q.c \ + invertappr.c invert.c binvert.c divrem_2.c gcd.c gcdext.c \ + get_str.c set_str.c matrix22_mul.c \ + hgcd.c hgcd_appr.c hgcd_reduce.c \ + mul_n.c sqr.c sec_powm.c \ + mullo_n.c mul_fft.c mul.c tdiv_qr.c mulmod_bnm1.c sqrmod_bnm1.c \ + mulmid.c mulmid_n.c toom42_mulmid.c sqrlo.c sqrlo_basecase.c \ + nussbaumer_mul.c toom6h_mul.c toom8h_mul.c toom6_sqr.c toom8_sqr.c \ + toom22_mul.c toom2_sqr.c toom33_mul.c toom3_sqr.c toom44_mul.c toom4_sqr.c + +$(TUNE_MPN_SRCS_BASIC): + for i in $(TUNE_MPN_SRCS_BASIC); do \ + echo "#define TUNE_PROGRAM_BUILD 1" >$$i; \ + echo "#include \"mpn/generic/$$i\"" >>$$i; \ + done + +divrem_1.c: + echo "#define TUNE_PROGRAM_BUILD 1" >divrem_1.c + echo "#define __gmpn_divrem_1 mpn_divrem_1_tune" >>divrem_1.c + echo "#include \"mpn/generic/divrem_1.c\"" >>divrem_1.c + +mod_1.c: + echo "#define TUNE_PROGRAM_BUILD 1" >mod_1.c + echo "#define __gmpn_mod_1 mpn_mod_1_tune" >>mod_1.c + echo "#include \"mpn/generic/mod_1.c\"" >>mod_1.c + +sqr_asm.asm: $(top_builddir)/mpn/sqr_basecase.asm + echo 'define(SQR_TOOM2_THRESHOLD_OVERRIDE,SQR_TOOM2_THRESHOLD_MAX)' >sqr_asm.asm + echo 'include(../mpn/sqr_basecase.asm)' >>sqr_asm.asm + +# FIXME: Should it depend on $(top_builddir)/fac_ui.h too? 
+fac_ui.c: $(top_builddir)/mpz/fac_ui.c + echo "#define TUNE_PROGRAM_BUILD 1" >fac_ui.c + echo "#define __gmpz_fac_ui mpz_fac_ui_tune" >>fac_ui.c + echo "#define __gmpz_oddfac_1 mpz_oddfac_1_tune" >>fac_ui.c + echo "#include \"mpz/oddfac_1.c\"" >>fac_ui.c + echo "#include \"mpz/fac_ui.c\"" >>fac_ui.c + +include ../mpn/Makeasm.am + +.NOTPARALLEL: + diff --git a/gmp-6.3.0/tune/Makefile.in b/gmp-6.3.0/tune/Makefile.in new file mode 100644 index 0000000..7db531a --- /dev/null +++ b/gmp-6.3.0/tune/Makefile.in @@ -0,0 +1,957 @@ +# Makefile.in generated by automake 1.15 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2014 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# Copyright 2000-2003, 2005-2011 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. 
+# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + +# Copyright 1996, 1998-2002 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + +VPATH = @srcdir@ +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) 
;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +EXTRA_PROGRAMS = speed$(EXEEXT) speed-dynamic$(EXEEXT) \ + speed-ext$(EXEEXT) tuneup$(EXEEXT) tune-gcd-p$(EXEEXT) +subdir = tune +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = 
$(top_srcdir)/acinclude.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(noinst_HEADERS) \ + $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +am__DEPENDENCIES_1 = +am__DEPENDENCIES_2 = $(am__DEPENDENCIES_1) \ + $(top_builddir)/tests/libtests.la $(top_builddir)/libgmp.la +am_libspeed_la_OBJECTS = common.lo divrem1div.lo divrem1inv.lo \ + divrem2div.lo divrem2inv.lo div_qr_1n_pi1_1.lo \ + div_qr_1n_pi1_2.lo div_qr_1n_pi1_3.lo div_qr_1n_pi1_4.lo \ + div_qr_1_tune.lo freq.lo gcdext_single.lo gcdext_double.lo \ + gcdextod.lo gcdextos.lo hgcd_lehmer.lo hgcd_appr_lehmer.lo \ + hgcd_reduce_1.lo hgcd_reduce_2.lo jacbase1.lo jacbase2.lo \ + jacbase3.lo jacbase4.lo hgcd2-1.lo hgcd2-2.lo hgcd2-3.lo \ + hgcd2-4.lo hgcd2-5.lo mod_1_div.lo mod_1_inv.lo mod_1_1-1.lo \ + mod_1_1-2.lo modlinv.lo noop.lo powm_mod.lo powm_redc.lo \ + pre_divrem_1.lo set_strb.lo set_strs.lo set_strp.lo time.lo +libspeed_la_OBJECTS = $(am_libspeed_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = +libspeed_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(libspeed_la_LDFLAGS) $(LDFLAGS) -o $@ +am_speed_OBJECTS = speed.$(OBJEXT) +speed_OBJECTS = $(am_speed_OBJECTS) +speed_LDADD = $(LDADD) +speed_DEPENDENCIES = $(DEPENDENCIES) $(am__DEPENDENCIES_1) +speed_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(speed_LDFLAGS) $(LDFLAGS) -o $@ +am_speed_dynamic_OBJECTS = speed.$(OBJEXT) +speed_dynamic_OBJECTS = $(am_speed_dynamic_OBJECTS) +speed_dynamic_LDADD = $(LDADD) +speed_dynamic_DEPENDENCIES = $(DEPENDENCIES) $(am__DEPENDENCIES_1) +am_speed_ext_OBJECTS = speed-ext.$(OBJEXT) 
+speed_ext_OBJECTS = $(am_speed_ext_OBJECTS) +speed_ext_LDADD = $(LDADD) +speed_ext_DEPENDENCIES = $(DEPENDENCIES) $(am__DEPENDENCIES_1) +speed_ext_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(speed_ext_LDFLAGS) $(LDFLAGS) -o $@ +am_tune_gcd_p_OBJECTS = tune-gcd-p.$(OBJEXT) +tune_gcd_p_OBJECTS = $(am_tune_gcd_p_OBJECTS) +tune_gcd_p_LDADD = $(LDADD) +tune_gcd_p_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(tune_gcd_p_LDFLAGS) $(LDFLAGS) -o $@ +am_tuneup_OBJECTS = tuneup.$(OBJEXT) hgcd2.$(OBJEXT) +am__objects_1 = div_qr_2.$(OBJEXT) bdiv_q.$(OBJEXT) bdiv_qr.$(OBJEXT) \ + dcpi1_div_qr.$(OBJEXT) dcpi1_divappr_q.$(OBJEXT) \ + dcpi1_bdiv_qr.$(OBJEXT) dcpi1_bdiv_q.$(OBJEXT) \ + invertappr.$(OBJEXT) invert.$(OBJEXT) binvert.$(OBJEXT) \ + divrem_2.$(OBJEXT) gcd.$(OBJEXT) gcdext.$(OBJEXT) \ + get_str.$(OBJEXT) set_str.$(OBJEXT) matrix22_mul.$(OBJEXT) \ + hgcd.$(OBJEXT) hgcd_appr.$(OBJEXT) hgcd_reduce.$(OBJEXT) \ + mul_n.$(OBJEXT) sqr.$(OBJEXT) sec_powm.$(OBJEXT) \ + mullo_n.$(OBJEXT) mul_fft.$(OBJEXT) mul.$(OBJEXT) \ + tdiv_qr.$(OBJEXT) mulmod_bnm1.$(OBJEXT) sqrmod_bnm1.$(OBJEXT) \ + mulmid.$(OBJEXT) mulmid_n.$(OBJEXT) toom42_mulmid.$(OBJEXT) \ + sqrlo.$(OBJEXT) sqrlo_basecase.$(OBJEXT) \ + nussbaumer_mul.$(OBJEXT) toom6h_mul.$(OBJEXT) \ + toom8h_mul.$(OBJEXT) toom6_sqr.$(OBJEXT) toom8_sqr.$(OBJEXT) \ + toom22_mul.$(OBJEXT) toom2_sqr.$(OBJEXT) toom33_mul.$(OBJEXT) \ + toom3_sqr.$(OBJEXT) toom44_mul.$(OBJEXT) toom4_sqr.$(OBJEXT) +am__objects_2 = $(am__objects_1) divrem_1.$(OBJEXT) mod_1.$(OBJEXT) +nodist_tuneup_OBJECTS = sqr_basecase.$(OBJEXT) fac_ui.$(OBJEXT) \ + $(am__objects_2) +tuneup_OBJECTS = $(am_tuneup_OBJECTS) $(nodist_tuneup_OBJECTS) +am__DEPENDENCIES_3 = $(am__DEPENDENCIES_1) libspeed.la +tuneup_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) 
$(CFLAGS) \ + $(tuneup_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = +am__depfiles_maybe = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(libspeed_la_SOURCES) $(speed_SOURCES) \ + $(speed_dynamic_SOURCES) $(speed_ext_SOURCES) \ + $(tune_gcd_p_SOURCES) $(tuneup_SOURCES) \ + $(nodist_tuneup_SOURCES) +DIST_SOURCES = $(libspeed_la_SOURCES) $(speed_SOURCES) \ + $(speed_dynamic_SOURCES) $(speed_ext_SOURCES) \ + $(tune_gcd_p_SOURCES) $(tuneup_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +HEADERS = $(noinst_HEADERS) +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. 
+am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/../mpn/Makeasm.am $(srcdir)/Makefile.in \ + README +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ABI = @ABI@ +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AS = @AS@ +ASMFLAGS = @ASMFLAGS@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@ +CC = @CC@ +CCAS = @CCAS@ +CC_FOR_BUILD = @CC_FOR_BUILD@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CPP_FOR_BUILD = @CPP_FOR_BUILD@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFN_LONG_LONG_LIMB = @DEFN_LONG_LONG_LIMB@ +DEFS = @DEFS@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +EXEEXT_FOR_BUILD = @EXEEXT_FOR_BUILD@ +FGREP = @FGREP@ +GMP_LDFLAGS = @GMP_LDFLAGS@ +GMP_LIMB_BITS = @GMP_LIMB_BITS@ +GMP_NAIL_BITS = @GMP_NAIL_BITS@ +GREP = @GREP@ +HAVE_CLOCK_01 = @HAVE_CLOCK_01@ +HAVE_CPUTIME_01 = @HAVE_CPUTIME_01@ +HAVE_GETRUSAGE_01 = @HAVE_GETRUSAGE_01@ +HAVE_GETTIMEOFDAY_01 = @HAVE_GETTIMEOFDAY_01@ +HAVE_HOST_CPU_FAMILY_power = @HAVE_HOST_CPU_FAMILY_power@ +HAVE_HOST_CPU_FAMILY_powerpc = @HAVE_HOST_CPU_FAMILY_powerpc@ +HAVE_SIGACTION_01 = @HAVE_SIGACTION_01@ +HAVE_SIGALTSTACK_01 = @HAVE_SIGALTSTACK_01@ +HAVE_SIGSTACK_01 = 
@HAVE_SIGSTACK_01@ +HAVE_STACK_T_01 = @HAVE_STACK_T_01@ +HAVE_SYS_RESOURCE_H_01 = @HAVE_SYS_RESOURCE_H_01@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LIBCURSES = @LIBCURSES@ +LIBGMPXX_LDFLAGS = @LIBGMPXX_LDFLAGS@ +LIBGMP_DLL = @LIBGMP_DLL@ +LIBGMP_LDFLAGS = @LIBGMP_LDFLAGS@ +LIBM = @LIBM@ +LIBM_FOR_BUILD = @LIBM_FOR_BUILD@ +LIBOBJS = @LIBOBJS@ +LIBREADLINE = @LIBREADLINE@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +M4 = @M4@ +MAINT = @MAINT@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +RANLIB = @RANLIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +SPEED_CYCLECOUNTER_OBJ = @SPEED_CYCLECOUNTER_OBJ@ +STRIP = @STRIP@ +TAL_OBJECT = @TAL_OBJECT@ +TUNE_LIBS = @TUNE_LIBS@ +TUNE_SQR_OBJ = @TUNE_SQR_OBJ@ +U_FOR_BUILD = @U_FOR_BUILD@ +VERSION = @VERSION@ +WITH_READLINE_01 = @WITH_READLINE_01@ +YACC = @YACC@ +YFLAGS = @YFLAGS@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__leading_dot = @am__leading_dot@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ 
+build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +gmp_srclinks = @gmp_srclinks@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +mpn_objects = @mpn_objects@ +mpn_objs_in_libgmp = @mpn_objs_in_libgmp@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/tests +AM_LDFLAGS = -no-install +EXTRA_DIST = alpha.asm pentium.asm sparcv9.asm hppa.asm hppa2.asm hppa2w.asm \ + ia64.asm powerpc.asm powerpc64.asm x86_64.asm many.pl + +noinst_HEADERS = speed.h +@ENABLE_STATIC_FALSE@STATIC = + +# Prefer -static on the speed and tune programs, since that can avoid +# overheads of shared library linkages on some systems. Libtool tends to +# botch -static if configured with --disable-static, perhaps reasonably +# enough. In any event under --disable-static the only choice is a dynamic +# link so there's no point in -static. 
+# +@ENABLE_STATIC_TRUE@STATIC = -static +EXTRA_LTLIBRARIES = libspeed.la +libspeed_la_SOURCES = \ + common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c \ + div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1n_pi1_3.c \ + div_qr_1n_pi1_4.c div_qr_1_tune.c \ + freq.c \ + gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c \ + hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c \ + jacbase1.c jacbase2.c jacbase3.c jacbase4.c \ + hgcd2-1.c hgcd2-2.c hgcd2-3.c hgcd2-4.c hgcd2-5.c \ + mod_1_div.c mod_1_inv.c mod_1_1-1.c mod_1_1-2.c modlinv.c \ + noop.c powm_mod.c powm_redc.c pre_divrem_1.c \ + set_strb.c set_strs.c set_strp.c time.c + +libspeed_la_DEPENDENCIES = $(SPEED_CYCLECOUNTER_OBJ) \ + $(top_builddir)/tests/libtests.la $(top_builddir)/libgmp.la + +libspeed_la_LIBADD = $(libspeed_la_DEPENDENCIES) $(LIBM) +libspeed_la_LDFLAGS = $(STATIC) +DEPENDENCIES = libspeed.la +LDADD = $(DEPENDENCIES) $(TUNE_LIBS) +speed_SOURCES = speed.c +speed_LDFLAGS = $(STATIC) +speed_dynamic_SOURCES = speed.c +speed_ext_SOURCES = speed-ext.c +speed_ext_LDFLAGS = $(STATIC) +tuneup_SOURCES = tuneup.c hgcd2.c +nodist_tuneup_SOURCES = sqr_basecase.c fac_ui.c $(TUNE_MPN_SRCS) +tuneup_DEPENDENCIES = $(TUNE_SQR_OBJ) libspeed.la +tuneup_LDADD = $(tuneup_DEPENDENCIES) $(TUNE_LIBS) +tuneup_LDFLAGS = $(STATIC) +tune_gcd_p_SOURCES = tune-gcd-p.c +tune_gcd_p_DEPENDENCIES = ../mpn/gcd.c +tune_gcd_p_LDFLAGS = $(STATIC) + +# $(MANY_CLEAN) and $(MANY_DISTCLEAN) are hooks for many.pl +CLEANFILES = $(EXTRA_PROGRAMS) $(EXTRA_LTLIBRARIES) \ + $(TUNE_MPN_SRCS) fac_ui.c sqr_asm.asm \ + stg.gnuplot stg.data \ + mtg.gnuplot mtg.data \ + fibg.gnuplot fibg.data \ + graph.gnuplot graph.data \ + $(MANY_CLEAN) + +DISTCLEANFILES = sqr_basecase.c $(MANY_DISTCLEAN) + +# Generating these little files at build time seems better than including +# them in the distribution, since the list can be changed more easily. 
+# +# mpn/generic/tdiv_qr.c uses mpn_divrem_1 and mpn_divrem_2, but only for 1 +# and 2 limb divisors, which are never used during tuning, so it doesn't +# matter whether it picks up a tuned or untuned version of those. +# +# divrem_1 and mod_1 are recompiled renamed to "_tune" to avoid a linking +# problem. If a native divrem_1 provides an mpn_divrem_1c entrypoint then +# common.c will want that, but the generic divrem_1 doesn't provide it, +# likewise for mod_1. The simplest way around this is to have the tune +# build versions renamed suitably. +# +# FIXME: Would like say mul_n.c to depend on $(top_builddir)/mul_n.c so the +# recompiled object will be rebuilt if that file changes. +TUNE_MPN_SRCS = $(TUNE_MPN_SRCS_BASIC) divrem_1.c mod_1.c +TUNE_MPN_SRCS_BASIC = div_qr_2.c bdiv_q.c bdiv_qr.c \ + dcpi1_div_qr.c dcpi1_divappr_q.c dcpi1_bdiv_qr.c dcpi1_bdiv_q.c \ + invertappr.c invert.c binvert.c divrem_2.c gcd.c gcdext.c \ + get_str.c set_str.c matrix22_mul.c \ + hgcd.c hgcd_appr.c hgcd_reduce.c \ + mul_n.c sqr.c sec_powm.c \ + mullo_n.c mul_fft.c mul.c tdiv_qr.c mulmod_bnm1.c sqrmod_bnm1.c \ + mulmid.c mulmid_n.c toom42_mulmid.c sqrlo.c sqrlo_basecase.c \ + nussbaumer_mul.c toom6h_mul.c toom8h_mul.c toom6_sqr.c toom8_sqr.c \ + toom22_mul.c toom2_sqr.c toom33_mul.c toom3_sqr.c toom44_mul.c toom4_sqr.c + + +# COMPILE minus CC. +# +COMPILE_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) $(ASMFLAGS) + + +# Flags used for preprocessing (in ansi2knr rules). +# +PREPROCESS_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) + + +# Recent versions of automake (1.5 and up for instance) append automake +# generated suffixes to this $(SUFFIXES) list. This is essential for us, +# since .c must come after .s, .S and .asm. 
If .c is before .s, for +# instance, then in the mpn directory "make" will see add_n.c mentioned in +# an explicit rule (the ansi2knr stuff) and decide it must have add_n.c, +# even if add_n.c doesn't exist but add_n.s does. See GNU make +# documentation "(make)Implicit Rule Search", part 5c. +# +# On IRIX 6 native make this doesn't work properly though. Somehow .c +# remains ahead of .s, perhaps because .c.s is a builtin rule. .asm works +# fine though, and mpn/mips3 uses this. +# +SUFFIXES = .s .S .asm + +# can be overridden during development, eg. "make RM_TMP=: mul_1.lo" +RM_TMP = rm -f +all: all-am + +.SUFFIXES: +.SUFFIXES: .s .S .asm .c .lo .o .obj +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(srcdir)/../mpn/Makeasm.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu --ignore-deps tune/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu --ignore-deps tune/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; +$(srcdir)/../mpn/Makeasm.am $(am__empty): + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +libspeed.la: $(libspeed_la_OBJECTS) $(libspeed_la_DEPENDENCIES) $(EXTRA_libspeed_la_DEPENDENCIES) + $(AM_V_CCLD)$(libspeed_la_LINK) $(libspeed_la_OBJECTS) $(libspeed_la_LIBADD) $(LIBS) + +speed$(EXEEXT): $(speed_OBJECTS) $(speed_DEPENDENCIES) $(EXTRA_speed_DEPENDENCIES) + @rm -f speed$(EXEEXT) + $(AM_V_CCLD)$(speed_LINK) $(speed_OBJECTS) $(speed_LDADD) $(LIBS) + +speed-dynamic$(EXEEXT): $(speed_dynamic_OBJECTS) $(speed_dynamic_DEPENDENCIES) $(EXTRA_speed_dynamic_DEPENDENCIES) + @rm -f speed-dynamic$(EXEEXT) + $(AM_V_CCLD)$(LINK) $(speed_dynamic_OBJECTS) $(speed_dynamic_LDADD) $(LIBS) + +speed-ext$(EXEEXT): $(speed_ext_OBJECTS) $(speed_ext_DEPENDENCIES) $(EXTRA_speed_ext_DEPENDENCIES) + @rm -f speed-ext$(EXEEXT) + $(AM_V_CCLD)$(speed_ext_LINK) $(speed_ext_OBJECTS) $(speed_ext_LDADD) $(LIBS) + +tune-gcd-p$(EXEEXT): $(tune_gcd_p_OBJECTS) $(tune_gcd_p_DEPENDENCIES) $(EXTRA_tune_gcd_p_DEPENDENCIES) + @rm -f tune-gcd-p$(EXEEXT) + $(AM_V_CCLD)$(tune_gcd_p_LINK) $(tune_gcd_p_OBJECTS) $(tune_gcd_p_LDADD) $(LIBS) + +tuneup$(EXEEXT): $(tuneup_OBJECTS) $(tuneup_DEPENDENCIES) $(EXTRA_tuneup_DEPENDENCIES) + @rm -f tuneup$(EXEEXT) + $(AM_V_CCLD)$(tuneup_LINK) $(tuneup_OBJECTS) $(tuneup_LDADD) $(LIBS) + +mostlyclean-compile: + -rm -f 
*.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +.c.o: + $(AM_V_CC)$(COMPILE) -c -o $@ $< + +.c.obj: + $(AM_V_CC)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.c.lo: + $(AM_V_CC)$(LTCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo 
"$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(HEADERS) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + -test -z "$(CLEANFILES)" || rm -f $(CLEANFILES) + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + -test -z "$(DISTCLEANFILES)" || rm -f $(DISTCLEANFILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \ + clean-libtool cscopelist-am ctags ctags-am distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am install-man \ + install-pdf install-pdf-am install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ + pdf pdf-am ps ps-am tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +$(top_builddir)/tests/libtests.la: + cd $(top_builddir)/tests; $(MAKE) $(AM_MAKEFLAGS) libtests.la + +tune: + $(MAKE) $(AM_MAKEFLAGS) tuneup$(EXEEXT) + ./tuneup + +allprogs: $(EXTRA_PROGRAMS) + +$(TUNE_MPN_SRCS_BASIC): + for i in $(TUNE_MPN_SRCS_BASIC); do \ + echo "#define 
TUNE_PROGRAM_BUILD 1" >$$i; \ + echo "#include \"mpn/generic/$$i\"" >>$$i; \ + done + +divrem_1.c: + echo "#define TUNE_PROGRAM_BUILD 1" >divrem_1.c + echo "#define __gmpn_divrem_1 mpn_divrem_1_tune" >>divrem_1.c + echo "#include \"mpn/generic/divrem_1.c\"" >>divrem_1.c + +mod_1.c: + echo "#define TUNE_PROGRAM_BUILD 1" >mod_1.c + echo "#define __gmpn_mod_1 mpn_mod_1_tune" >>mod_1.c + echo "#include \"mpn/generic/mod_1.c\"" >>mod_1.c + +sqr_asm.asm: $(top_builddir)/mpn/sqr_basecase.asm + echo 'define(SQR_TOOM2_THRESHOLD_OVERRIDE,SQR_TOOM2_THRESHOLD_MAX)' >sqr_asm.asm + echo 'include(../mpn/sqr_basecase.asm)' >>sqr_asm.asm + +# FIXME: Should it depend on $(top_builddir)/fac_ui.h too? +fac_ui.c: $(top_builddir)/mpz/fac_ui.c + echo "#define TUNE_PROGRAM_BUILD 1" >fac_ui.c + echo "#define __gmpz_fac_ui mpz_fac_ui_tune" >>fac_ui.c + echo "#define __gmpz_oddfac_1 mpz_oddfac_1_tune" >>fac_ui.c + echo "#include \"mpz/oddfac_1.c\"" >>fac_ui.c + echo "#include \"mpz/fac_ui.c\"" >>fac_ui.c + +# .s assembler, no preprocessing. +# +.s.o: + $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< +.s.obj: + $(CCAS) $(COMPILE_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` +.s.lo: + $(LIBTOOL) --mode=compile --tag=CC $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + +# .S assembler, preprocessed with cpp. +# +# It's necessary to run $(CPP) separately, since it seems not all compilers +# recognise .S files, in particular "cc" on HP-UX 10 and 11 doesn't (and +# will silently do nothing if given a .S). +# +# For .lo we need a helper script, as described below for .asm.lo. 
+# +.S.o: + $(CPP) $(PREPROCESS_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.S.obj: + $(CPP) $(PREPROCESS_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.S.lo: + $(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/cpp-ccas --cpp="$(CPP) $(PREPROCESS_FLAGS)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + +# .asm assembler, preprocessed with m4. +# +# .o and .obj are non-PIC and just need m4 followed by a compile. +# +# .lo is a bit tricky. Libtool (as of version 1.5) has foo.lo as a little +# text file, and .libs/foo.o and foo.o as the PIC and non-PIC objects, +# respectively. It'd be asking for lots of trouble to try to create foo.lo +# ourselves, so instead arrange to invoke libtool like a --mode=compile, but +# with a special m4-ccas script which first m4 preprocesses, then compiles. +# --tag=CC is necessary since foo.asm is otherwise unknown to libtool. +# +# Libtool adds -DPIC when building a shared object and the .asm files look +# for that. But it should be noted that the other PIC flags are on occasion +# important too, in particular FreeBSD 2.2.8 gas 1.92.3 requires -k before +# it accepts PIC constructs like @GOT, and gcc adds that flag only under +# -fPIC. (Later versions of gas are happy to accept PIC stuff any time.) 
+# +.asm.o: + $(M4) -DOPERATION_$* `test -f '$<' || echo '$(srcdir)/'`$< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.asm.obj: + $(M4) -DOPERATION_$* `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.asm.lo: + $(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/m4-ccas --m4="$(M4)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + +.NOTPARALLEL: + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/gmp-6.3.0/tune/README b/gmp-6.3.0/tune/README new file mode 100644 index 0000000..f76407f --- /dev/null +++ b/gmp-6.3.0/tune/README @@ -0,0 +1,501 @@ +Copyright 2000-2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
+ + + + + + GMP SPEED MEASURING AND PARAMETER TUNING + + +The programs in this directory are for knowledgeable users who want to +measure GMP routines on their machine, and perhaps tweak some settings or +identify things that can be improved. + +The programs here are tools, not ready to run solutions. Nothing is built +in a normal "make all", but various Makefile targets described below exist. + +Relatively few systems and CPUs have been tested, so be sure to verify that +results are sensible before relying on them. + + + + +MISCELLANEOUS NOTES + +--enable-assert + + Don't configure with --enable-assert, since the extra code added by + assertion checking may influence measurements. + +Direct mapped caches + + Some effort has been made to accommodate CPUs with direct mapped caches, + by putting data blocks more or less contiguously on the stack. But this + will depend on TMP_ALLOC using alloca, and even then it may or may not + be enough. + +FreeBSD 4.2 i486 getrusage + + This getrusage seems to be a bit doubtful, it looks like it's + microsecond accurate, but sometimes ru_utime remains unchanged after a + time of many microseconds has elapsed. It'd be good to detect this in + the time.c initializations, but for now the suggestion is to pretend it + doesn't exist. + + ./configure ac_cv_func_getrusage=no + +NetBSD 1.4.1 m68k macintosh time base + + On this system it's been found getrusage often goes backwards, making it + unusable (time.c getrusage_backwards_p detects this). gettimeofday + sometimes doesn't update atomically when it crosses a 1 second boundary. + Not sure what to do about this. Expect possible intermittent failures. + +SCO OpenUNIX 8 /etc/hw + + /etc/hw takes about a second to return the cpu frequency, which suggests + perhaps it's measuring each time it runs. If this is annoying when + running the speed program repeatedly then set a GMP_CPU_FREQUENCY + environment variable (see TIME BASE section below). 
+ +Timing on GNU/Linux + + On Linux, timing currently uses the cycle counter. This is unreliable, + since the counter is not saved and restored at context switches (unlike + FreeBSD and Solaris where the cycle counter is "virtualized"). + + Using the clock_gettime method with CLOCK_PROCESS_CPUTIME_ID (posix) or + CLOCK_VIRTUAL (BSD) should be more reliable. To get clock_gettime + with glibc, one has to link with -lrt (which also drags in the pthreads + threading library). configure.in must be hacked to detect this and + arrange proper linking. Something like + + old_LIBS="$LIBS" + AC_SEARCH_LIBS(clock_gettime, rt, [AC_DEFINE(HAVE_CLOCK_GETTIME)]) + TUNE_LIBS="$LIBS" + LIBS="$old_LIBS" + + AC_SUBST(TUNE_LIBS) + + might work. + +Low resolution timebase + + Parameter tuning can be very time consuming if the only timebase + available is a 10 millisecond clock tick, to the point of being + unusable. This is currently the case on VAX and ARM systems. + + + + +PARAMETER TUNING + +The "tuneup" program runs some tests designed to find the best settings for +various thresholds, like MUL_TOOM22_THRESHOLD. Its output can be put +into gmp-mparam.h. The program is built and run with + + make tune + +If the thresholds indicated are grossly different from the values in the +selected gmp-mparam.h then there may be a performance boost in applicable +size ranges by changing gmp-mparam.h accordingly. + +Be sure to do a full reconfigure and rebuild to get any newly set thresholds +to take effect. A partial rebuild is enough sometimes, but a fresh +configure and make is certain to be correct. + +If a CPU has specific tuned parameters coming from a gmp-mparam.h in one of +the mpn subdirectories then the values from "make tune" should be similar. +But check that the configured CPU is right and there are no machine specific +effects causing a difference. 
+ +It's hoped the compiler and options used won't have too much effect on +thresholds, since for most CPUs they ultimately come down to comparisons +between assembler subroutines. Missing out on the longlong.h macros by not +using gcc will probably have an effect. + +Some thresholds produced by the tune program are merely single values chosen +from what's a range of sizes where two algorithms are pretty much the same +speed. When this happens the program is likely to give somewhat different +values on successive runs. This is noticeable on the toom3 thresholds for +instance. + + + + +SPEED PROGRAM + +The "speed" program can be used for measuring and comparing various +routines, and producing tables of data or gnuplot graphs. Compile it with + + make speed + +(Or on DOS systems "make speed.exe".) + +Here are some examples of how to use it. Check the code for all the +options. + +Draw a graph of mpn_mul_n, stepping through sizes by 10 or a factor of 1.05 +(whichever is greater). + + ./speed -s 10-5000 -t 10 -f 1.05 -P foo mpn_mul_n + gnuplot foo.gnuplot + +Compare mpn_add_n and an mpn_lshift by 1, showing times in cycles and +showing under mpn_lshift the difference between it and mpn_add_n. + + ./speed -s 1-40 -c -d mpn_add_n mpn_lshift.1 + +Using option -c for times in cycles is interesting but normally only +necessary when looking carefully at assembler subroutines. You might think +it would always give an integer value, but this doesn't happen in practice, +probably due to overheads in the time measurements. + +In the free-form output the "#" symbol against a measurement means the +corresponding routine is fastest at that size. This is a convenient visual +cue when comparing different routines. The graph data files <name>.data +don't get this since it would upset gnuplot or other data viewers. + + + + +TIME BASE + +The time measuring method is determined in time.c, based on what the +configured host has available.
A cycle counter is preferred, possibly +supplemented by another method if the counter has a limited range. A +microsecond accurate getrusage() or gettimeofday() will work quite well too. + +The cycle counters (except possibly on alpha) and gettimeofday() will depend +on the machine being otherwise idle, or rather on other jobs not stealing +CPU time from the measuring program. Short routines (those that complete +within a timeslice) should work even on a busy machine. + +Some trouble is taken by speed_measure() in common.c to avoid ill effects +from sporadic interrupts, or other intermittent things (like cron waking up +every minute). But generally an idle machine will be necessary to be +certain of consistent results. + +The CPU frequency is needed to convert between cycles and seconds, or for +when a cycle counter is supplemented by getrusage() etc. The speed program +will convert as necessary according to the output format requested. The +tune program will work with either cycles or seconds. + +freq.c knows how to get the frequency on some systems, or can measure a +cycle counter against gettimeofday() or getrusage(), but when that fails, or +needs to be overridden, an environment variable GMP_CPU_FREQUENCY can be +used (in Hertz). For example in "bash" on a 650 MHz machine, + + export GMP_CPU_FREQUENCY=650e6 + +A high precision time base makes it possible to get accurate measurements in +a shorter time. + + + + +EXAMPLE COMPARISONS - VARIOUS + +Here are some ideas for things that can be done with the speed program. + +There's always going to be a certain amount of overhead in the time +measurements, due to reading the time base, and in the loop that runs a +routine enough times to get a reading of the desired precision. Noop +functions taking various arguments are available to measure this. 
The +"overhead" printed by the speed program each time in its intro is the "noop" +routine, but note that this is just for information, it isn't deducted from +the times printed or anything. + + ./speed -s 1 noop noop_wxs noop_wxys + +To see how many cycles per limb a routine is taking, look at the time +increase when the size increments, using option -D. This avoids fixed +overheads in the measuring. Also, remember many of the assembler routines +have unrolled loops, so it might be necessary to compare times at, say, 16, +32, 48, 64 etc to see what the unrolled part is taking, as opposed to any +finishing off. + + ./speed -s 16-64 -t 16 -C -D mpn_add_n + +The -C option on its own gives cycles per limb, but is really only useful at +big sizes where fixed overheads are small compared to the code doing the +real work. Remember of course memory caching and/or page swapping will +affect results at large sizes. + + ./speed -s 500000 -C mpn_add_n + +Once a calculation stops fitting in the CPU data cache, it's going to start +taking longer. Exactly where this happens depends on the cache priming in +the measuring routines, and on what sort of "least recently used" the +hardware does. Here's an example for a CPU with a 16kbyte L1 data cache and +32-bit limb, showing a suddenly steeper curve for mpn_add_n at about 2000 +limbs. + + ./speed -s 1-4000 -t 5 -f 1.02 -P foo mpn_add_n + gnuplot foo.gnuplot + +When a routine has an unrolled loop for, say, multiples of 8 limbs and then +an ordinary loop for the remainder, it can happen that it's actually faster +to do an operation on, say, 8 limbs than it is on 7 limbs. The following +draws a graph of mpn_sub_n, to see whether times smoothly increase with +size. + + ./speed -s 1-100 -c -P foo mpn_sub_n + gnuplot foo.gnuplot + +If mpn_lshift and mpn_rshift have special case code for shifts by 1, it +ought to be faster (or at least not slower) than shifting by, say, 2 bits. 
+ + ./speed -s 1-200 -c mpn_rshift.1 mpn_rshift.2 + +An mpn_lshift by 1 can be done by mpn_add_n adding a number to itself, and +if the lshift isn't faster there's an obvious improvement that's possible. + + ./speed -s 1-200 -c mpn_lshift.1 mpn_add_n_self + +On some CPUs (AMD K6 for example) an "in-place" mpn_add_n where the +destination is one of the sources is faster than a separate destination. +Here's an example to see this. ".1" selects dst==src1 for mpn_add_n (and +mpn_sub_n), for other values see speed.h SPEED_ROUTINE_MPN_BINARY_N_CALL. + + ./speed -s 1-200 -c mpn_add_n mpn_add_n.1 + +The gmp manual points out that divisions by powers of two should be done +using a right shift because it'll be significantly faster than an actual +division. The following shows by what factor mpn_rshift is faster than +mpn_divrem_1, using division by 32 as an example. + + ./speed -s 10-20 -r mpn_rshift.5 mpn_divrem_1.32 + + + + +EXAMPLE COMPARISONS - MULTIPLICATION + +mul_basecase takes a ".<r>" parameter. If positive, it gives the second +(smaller) operand size. For example to show speeds for 3x3 up to 20x3 in +cycles, + + ./speed -s 3-20 -c mpn_mul_basecase.3 + +A negative ".<-r>" parameter fixes the size of the product to the absolute +value r. For example to show speeds for 10x10 up to 19x1 in cycles, + + ./speed -s 10-19 -c mpn_mul_basecase.-20 + +mul_basecase with no parameter does an NxN multiply, so for example to show +speeds in cycles for 1x1, 2x2, 3x3, etc, up to 20x20, + + ./speed -s 1-20 -c mpn_mul_basecase + +sqr_basecase is implemented by a "triangular" method on most CPUs, making it +up to twice as fast as mul_basecase. In practice loop overheads and the +products on the diagonal mean it falls short of this. Here's an example +running the two and showing by what factor an NxN mul_basecase is slower +than an NxN sqr_basecase. (Some versions of sqr_basecase only allow sizes +below SQR_TOOM2_THRESHOLD, so if it crashes at that point don't worry.)
+ + ./speed -s 1-20 -r mpn_sqr_basecase mpn_mul_basecase + +The technique described above with -CD for showing the time difference in +cycles per limb between two size operations can be done on an NxN +mul_basecase using -E to change the basis for the size increment to N*N. +For instance a 20x20 operation is taken to be doing 400 limbs, and a 16x16 +doing 256 limbs. The following therefore shows the per crossproduct speed +of mul_basecase and sqr_basecase at around 20x20 limbs. + + ./speed -s 16-20 -t 4 -CDE mpn_mul_basecase mpn_sqr_basecase + +Of course sqr_basecase isn't really doing NxN crossproducts, but it can be +interesting to compare it to mul_basecase as if it was. For sqr_basecase +the -F option can be used to base the deltas on N*(N+1)/2 operations, which +is the triangular products sqr_basecase does. For example, + + ./speed -s 16-20 -t 4 -CDF mpn_sqr_basecase + +Both -E and -F are preliminary and might change. A consistent approach to +using them when claiming certain per crossproduct or per triangularproduct +speeds hasn't really been established, but the increment between speeds in +the range karatsuba will call seems sensible, that being k to k/2. For +instance, if the karatsuba threshold was 20 for the multiply and 30 for the +square, + + ./speed -s 10-20 -t 10 -CDE mpn_mul_basecase + ./speed -s 15-30 -t 15 -CDF mpn_sqr_basecase + + + +EXAMPLE COMPARISONS - MALLOC + +The gmp manual recommends application programs avoid excessive initializing +and clearing of mpz_t variables (and mpq_t and mpf_t too). Every new +variable will at a minimum go through an init, a realloc for its first +store, and finally a clear. Quite how long that takes depends on the C +library. The following compares an mpz_init/realloc/clear to a 10 limb +mpz_add. Don't be surprised if the mallocing is quite slow. + + ./speed -s 10 -c mpz_init_realloc_clear mpz_add + +On some systems malloc and free are much slower when dynamic linked. 
The +speed-dynamic program can be used to see this. For example the following +measures malloc/free, first static then dynamic. + + ./speed -s 10 -c malloc_free + ./speed-dynamic -s 10 -c malloc_free + +Of course a real world program has big problems if it's doing so many +mallocs and frees that it gets slowed down by a dynamic linked malloc. + + + + + +EXAMPLE COMPARISONS - STRING CONVERSIONS + +mpn_get_str does a binary to string conversion. The base is specified with +a ".<r>" parameter, or decimal by default. Power of 2 bases are much faster +than general bases. The following compares decimal and hex for instance. + + ./speed -s 1-20 -c mpn_get_str mpn_get_str.16 + +Smaller bases need more divisions to split a given size number, and so are +slower. The following compares base 3 and base 9. On small operands 9 will +be nearly twice as fast, though at bigger sizes this reduces since in the +current implementation both divide repeatedly by 3^20 (or 3^40 for 64 bit +limbs) and those divisions come to dominate. + + ./speed -s 1-20 -cr mpn_get_str.3 mpn_get_str.9 + +mpn_set_str does a string to binary conversion. The base is specified with +a ".<r>" parameter, or decimal by default. Power of 2 bases are faster than +general bases on large conversions. + + ./speed -s 1-512 -f 2 -c mpn_set_str.8 mpn_set_str.10 + +mpn_set_str also has some special case code for decimal which is a bit +faster than the general case, basically by giving the compiler a chance to +optimize some multiplications by 10. + + ./speed -s 20-40 -c mpn_set_str.9 mpn_set_str.10 mpn_set_str.11 + + + + +EXAMPLE COMPARISONS - GCDs + +mpn_gcd_1 has a threshold for when to reduce using an initial x%y when both +x and y are single limbs. This isn't tuned currently, but a value can be +established by a measurement like + + ./speed -s 10-32 mpn_gcd_1.10 + +This runs src[0] from 10 to 32 bits, and y fixed at 10 bits.
If the div +threshold is high, say 31 so it's effectively disabled then a 32x10 bit gcd +is done by nibbling away at the 32-bit operands bit-by-bit. When the +threshold is small, say 1 bit, then an initial x%y is done to reduce it to a +10x10 bit operation. + +The threshold in mpn/generic/gcd_1.c or the various assembler +implementations can be tweaked up or down until there's no more speedups on +interesting combinations of sizes. Note that this affects only a 1x1 limb +operation and so isn't very important. (An Nx1 limb operation always does +an initial modular reduction, using mpn_mod_1 or mpn_modexact_1_odd.) + + + + +SPEED PROGRAM EXTENSIONS + +Potentially lots of things could be made available in the program, but it's +been left at only the things that have actually been wanted and are likely +to be reasonably useful in the future. + +Extensions should be fairly easy to make though. speed-ext.c is an example, +in a style that should suit one-off tests, or new code fragments under +development. + +many.pl is a script for generating a new speed program supplemented with +alternate versions of the standard routines. It can be used for measuring +experimental code, or for comparing different implementations that exist +within a CPU family. + + + + +THRESHOLD EXAMINING + +The speed program can be used to examine the speeds of different algorithms +to check the tune program has done the right thing. For example to examine +the karatsuba multiply threshold, + + ./speed -s 5-40 mpn_mul_basecase mpn_kara_mul_n + +When examining the toom3 threshold, remember it depends on the karatsuba +threshold, so the right karatsuba threshold needs to be compiled into the +library first. The tune program uses specially recompiled versions of +mpn/mul_n.c etc for this reason, but the speed program simply uses the +normal libgmp.la. + +Note further that the various routines may recurse into themselves on sizes +far enough above applicable thresholds. 
For example, mpn_kara_mul_n will +recurse into itself on sizes greater than twice the compiled-in +MUL_TOOM22_THRESHOLD. + +When doing the above comparison between mul_basecase and kara_mul_n what's +probably of interest is mul_basecase versus a kara_mul_n that does one level +of Karatsuba then calls to mul_basecase, but this only happens on sizes less +than twice the compiled MUL_TOOM22_THRESHOLD. A larger value for that +setting can be compiled-in to avoid the problem if necessary. The same +applies to toom3 and DC, though in a trickier fashion. + +There are some upper limits on some of the thresholds, arising from arrays +dimensioned according to a threshold (mpn_mul_n), or asm code with certain +sized displacements (some x86 versions of sqr_basecase). So putting huge +values for the thresholds, even just for testing, may fail. + + + + +FUTURE + +Make a program to check the time base is working properly, for small and +large measurements. Make it able to test each available method, including +perhaps the apparent resolution of each. + +Make a general mechanism for specifying operand overlap, and a syntax like +maybe "mpn_add_n.dst=src2" to select it. Some measuring routines do this +sort of thing with the "r" parameter currently. + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/gmp-6.3.0/tune/alpha.asm b/gmp-6.3.0/tune/alpha.asm new file mode 100644 index 0000000..888c77f --- /dev/null +++ b/gmp-6.3.0/tune/alpha.asm @@ -0,0 +1,59 @@ +dnl Alpha time stamp counter access routine. + +dnl Copyright 2000, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C void speed_cyclecounter (unsigned int p[2]); +C + +C The rpcc instruction returns a 64-bit value split into two 32-bit fields. +C The lower 32 bits are set by the hardware, and the upper 32 bits are set +C by the operating system. The real per-process cycle count is the sum of +C these halves. + +C Unfortunately, some operating systems don't get this right. NetBSD 1.3 is +C known to sometimes put garbage in the upper half. Whether newer NetBSD +C versions get it right, is unknown to us. + +C rpcc measures cycles elapsed in the user program and hence should be very +C accurate even on a busy system. Losing cache contents due to task +C switching may have an effect though. + +ASM_START() +PROLOGUE(speed_cyclecounter) + rpcc r0 + srl r0,32,r1 + addq r1,r0,r0 + stl r0,0(r16) + stl r31,4(r16) C zero upper return word + ret r31,(r26),1 +EPILOGUE(speed_cyclecounter) +ASM_END() diff --git a/gmp-6.3.0/tune/common.c b/gmp-6.3.0/tune/common.c new file mode 100644 index 0000000..48da6c6 --- /dev/null +++ b/gmp-6.3.0/tune/common.c @@ -0,0 +1,2945 @@ +/* Shared speed subroutines. + +Copyright 1999-2006, 2008-2017, 2019-2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __GMP_NO_ATTRIBUTE_CONST_PURE + +#include +#include +#include +#include +#include /* for qsort */ +#include +#include +#if 0 +#include +#endif + +#include "gmp-impl.h" +#include "longlong.h" + +#include "tests.h" +#include "speed.h" + + +int speed_option_addrs = 0; +int speed_option_verbose = 0; +int speed_option_cycles_broken = 0; + + +/* Provide __clz_tab even if it's not required, for the benefit of new code + being tested with many.pl. 
*/ +#ifndef COUNT_LEADING_ZEROS_NEED_CLZ_TAB +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB +#include "mp_clz_tab.c" +#undef COUNT_LEADING_ZEROS_NEED_CLZ_TAB +#endif + + +/* Hook called by speed_cache_fill() when s->cache is 1. Both candidate + bodies below are disabled with "#if 0", so as written this function does + nothing. */ +void +pentium_wbinvd(void) +{ +#if 0 + { + static int fd = -2; + + if (fd == -2) + { + fd = open ("/dev/wbinvd", O_RDWR); + if (fd == -1) + perror ("open /dev/wbinvd"); + } + + if (fd != -1) + ioctl (fd, 0, 0); + } +#endif + +#if 0 +#define WBINVDSIZE 1024*1024*2 + { + static char *p = NULL; + int i, sum; + + if (p == NULL) + p = malloc (WBINVDSIZE); + +#if 0 + for (i = 0; i < WBINVDSIZE; i++) + p[i] = i & 0xFF; +#endif + + sum = 0; + for (i = 0; i < WBINVDSIZE; i++) + sum += p[i]; + + mpn_cache_fill_dummy (sum); + } +#endif +} + + +/* Three-way comparison of the doubles at p and q: returns 1 if *p > *q, + -1 if *p < *q, and 0 otherwise. speed_measure() hands this to qsort() + through a qsort_function_t cast. */ +int +double_cmp_ptr (const double *p, const double *q) +{ + if (*p > *q) return 1; + if (*p < *q) return -1; + return 0; +} + + +/* Measure the speed of a given routine. + + The routine is run with enough repetitions to make it take at least + speed_precision * speed_unittime. This aims to minimize the effects of a + limited accuracy time base and the overhead of the measuring itself. + + Measurements are made looking for 4 results within TOLERANCE of each + other (or 3 for routines taking longer than 2 seconds). This aims to get + an accurate reading even if some runs are bloated by interrupts or task + switches or whatever. + + The given (*fun)() is expected to run its function "s->reps" many times + and return the total elapsed time measured using speed_starttime() and + speed_endtime(). If the function doesn't support the given s->size or + s->r, -1.0 should be returned. See the various base routines below. */ + +double +speed_measure (double (*fun) (struct speed_params *s), struct speed_params *s) +{ +#define TOLERANCE 1.01 /* 1% */ + const int max_zeros = 10; /* abort once more than this many 0.0 readings are seen */ + + struct speed_params s_dummy; + int i, j, e; + double t[30]; /* per-repetition times, sorted in place below */ + double t_unsorted[30]; /* the same readings in arrival order, for the failure report */ + double reps_d; + int zeros = 0; + + /* Use dummy parameters if caller doesn't provide any.
Only a few special + "fun"s will cope with this, speed_noop() is one. */ + if (s == NULL) + { + memset (&s_dummy, '\0', sizeof (s_dummy)); + s = &s_dummy; + } + + /* Start with a single repetition; the inner loop raises s->reps as needed. */ + s->reps = 1; + s->time_divisor = 1.0; + for (i = 0; i < numberof (t); i++) + { + for (;;) + { + /* Empty the operand lists; speed_operand_src/dst append to them. */ + s->src_num = 0; + s->dst_num = 0; + + t[i] = (*fun) (s); + + if (speed_option_verbose >= 3) + gmp_printf("size=%ld reps=%u r=%Md attempt=%d %.9f\n", + (long) s->size, s->reps, s->r, i, t[i]); + + /* A 0.0 reading counts as a failed measurement: retry, doubling reps while it is below 10000. */ + if (t[i] == 0.0) + { + zeros++; + if (zeros > max_zeros) + { + fprintf (stderr, "Fatal error: too many (%d) failed measurements (0.0)\n", zeros); + abort (); + } + if (s->reps < 10000) + s->reps *= 2; + + continue; + } + + /* -1.0 means fun cannot handle this s->size or s->r. */ + if (t[i] == -1.0) + return -1.0; + + /* Ran long enough: accept this reading. */ + if (t[i] >= speed_unittime * speed_precision) + break; + + /* go to a value of reps to make t[i] >= precision */ + reps_d = ceil (1.1 * s->reps + * speed_unittime * speed_precision + / MAX (t[i], speed_unittime)); + if (reps_d > 2e9 || reps_d < 1.0) + { + fprintf (stderr, "Fatal error: new reps bad: %.2f\n", reps_d); + fprintf (stderr, " (old reps %u, unittime %.4g, precision %d, t[i] %.4g)\n", + s->reps, speed_unittime, speed_precision, t[i]); + abort (); + } + s->reps = (unsigned) reps_d; + } + /* Convert the total into time per repetition. */ + t[i] /= s->reps; + t_unsorted[i] = t[i]; + + /* With speed_precision 0 the first reading is returned directly (s->time_divisor is not applied on this path). */ + if (speed_precision == 0) + return t[i]; + + /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */ + if (t[0] >= 2.0) + e = 3; + else + e = 4; + + /* Look for e many t[]'s within TOLERANCE of each other to consider a + valid measurement. Return smallest among them.
*/ + if (i >= e) + { + /* Sort the i+1 readings collected so far using double_cmp_ptr. */ + qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr); + for (j = e-1; j < i; j++) + if (t[j] <= t[j-e+1] * TOLERANCE) + return t[j-e+1] / s->time_divisor; + } + } + + /* All numberof (t) attempts were used without e readings agreeing: print the readings and report failure. */ + fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n", + e, (TOLERANCE-1.0)*100.0); + fprintf (stderr, " unsorted sorted\n"); + fprintf (stderr, " %.12f %.12f is about %.1f%%\n", + t_unsorted[0]*(TOLERANCE-1.0), t[0]*(TOLERANCE-1.0), + 100*(TOLERANCE-1.0)); + for (i = 0; i < numberof (t); i++) + fprintf (stderr, " %.09f %.09f\n", t_unsorted[i], t[i]); + + return -1.0; +} + + +/* Read all of ptr,size to get it into the CPU memory cache. + + A call to mpn_cache_fill_dummy() is used to make sure the compiler + doesn't optimize away the whole loop. Using "volatile mp_limb_t sum" + would work too, but the function call means we don't rely on every + compiler actually implementing volatile properly. + + mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking + it can inline it.
*/ + +void +mpn_cache_fill (mp_srcptr ptr, mp_size_t size) +{ + mp_limb_t sum = 0; + mp_size_t i; + + for (i = 0; i < size; i++) + sum += ptr[i]; + + mpn_cache_fill_dummy(sum); +} + + +/* Prime the cache for a destination operand. Currently this only reads + ptr,size through mpn_cache_fill(); both write variants below are disabled + with "#if 0". */ +void +mpn_cache_fill_write (mp_ptr ptr, mp_size_t size) +{ + mpn_cache_fill (ptr, size); + +#if 0 + mpn_random (ptr, size); +#endif + +#if 0 + mp_size_t i; + + for (i = 0; i < size; i++) + ptr[i] = i; +#endif +} + + +/* Append ptr,size to the s->src[] operand list. Aborts with a message on + stderr if the list is already full. */ +void +speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size) +{ + if (s->src_num >= numberof (s->src)) + { + fprintf (stderr, "speed_operand_src: no room left in s->src[]\n"); + abort (); + } + s->src[s->src_num].ptr = ptr; + s->src[s->src_num].size = size; + s->src_num++; +} + + +/* Append ptr,size to the s->dst[] operand list. Aborts with a message on + stderr if the list is already full. */ +void +speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size) +{ + if (s->dst_num >= numberof (s->dst)) + { + fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n"); + abort (); + } + s->dst[s->dst_num].ptr = ptr; + s->dst[s->dst_num].size = size; + s->dst_num++; +} + + +/* Prime the caches for the operands registered in s. + + When speed_option_addrs is set, the operand addresses are printed each + time they differ from those of the previous call (remembered in the + static "prev"). + + s->cache 0 reads every dst and src operand, s->cache 1 calls + pentium_wbinvd(), any other value does nothing. */ +void +speed_cache_fill (struct speed_params *s) +{ + static struct speed_params prev; + int i; + + /* Addresses are printed with %p, which takes a void * and so is wide + enough for any object pointer. An unsigned long cast would truncate + on systems where long is narrower than a pointer. */ + + if (speed_option_addrs) + { + int different; + + different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num); + for (i = 0; i < s->dst_num; i++) + different |= (s->dst[i].ptr != prev.dst[i].ptr); + for (i = 0; i < s->src_num; i++) + different |= (s->src[i].ptr != prev.src[i].ptr); + + if (different) + { + if (s->dst_num != 0) + { + printf ("dst"); + for (i = 0; i < s->dst_num; i++) + printf (" %p", (void *) s->dst[i].ptr); + printf (" "); + } + + if (s->src_num != 0) + { + printf ("src"); + for (i = 0; i < s->src_num; i++) + printf (" %p", (void *) s->src[i].ptr); + printf (" "); + } + /* the address of a local gives a rough idea of the stack pointer */ + printf (" (cf sp approx %p)\n", (void *) &different); + + } + + memcpy (&prev, s, sizeof(prev)); + } + + switch (s->cache) { + case 0: + for (i = 0; i < s->dst_num; i++) + mpn_cache_fill_write
(s->dst[i].ptr, s->dst[i].size); + for (i = 0; i < s->src_num; i++) + mpn_cache_fill (s->src[i].ptr, s->src[i].size); + break; + case 1: + pentium_wbinvd(); + break; + } +} + + +/* Miscellaneous options accepted by tune and speed programs under -o. + + "addrs" sets speed_option_addrs, "verbose" increments + speed_option_verbose, "verbose=N" sets it to N, and "cycles-broken" sets + speed_option_cycles_broken. Anything else is reported on stderr (so the + message is not lost when stdout is redirected) and the program exits + with status 1. */ + +void +speed_option_set (const char *s) +{ + int n; + + if (strcmp (s, "addrs") == 0) + { + speed_option_addrs = 1; + } + else if (strcmp (s, "verbose") == 0) + { + speed_option_verbose++; + } + else if (sscanf (s, "verbose=%d", &n) == 1) + { + speed_option_verbose = n; + } + else if (strcmp (s, "cycles-broken") == 0) + { + speed_option_cycles_broken = 1; + } + else + { + fprintf (stderr, "Unrecognised -o option: %s\n", s); + exit (1); + } +} + + +/* The following are basic speed running routines for various gmp functions. + Many are very similar and use speed.h macros. + + Each routine allocates its own destination space for the result of the + function, because only it can know what the function needs. + + speed_starttime() and speed_endtime() are put tight around the code to be + measured. Any setups are done outside the timed portion. + + Each routine is responsible for its own cache priming. + speed_cache_fill() is a good way to do this, see examples in speed.h. + One cache priming possibility, for CPUs with write-allocate cache, and + functions that don't take too long, is to do one dummy call before timing + so as to cache everything that gets used. But speed_measure() runs a + routine at least twice and will take the smaller time, so this might not + be necessary. + + Data alignment will be important, for source, destination and temporary + workspace. A routine can align its destination and workspace. Programs + using the routines will ensure s->xp and s->yp are aligned. Aligning + onto a CACHE_LINE_SIZE boundary is suggested. s->align_wp and + s->align_wp2 should be respected where it makes sense to do so. + SPEED_TMP_ALLOC_LIMBS is a good way to do this.
+ + A loop of the following form can be expected to turn into good assembler + code on most CPUs, thereby minimizing overhead in the measurement. It + can always be assumed s->reps >= 1. + + i = s->reps + do + foo(); + while (--i != 0); + + Additional parameters might be added to "struct speed_params" in the + future. Routines should ignore anything they don't use. + + s->size can be used creatively, and s->xp and s->yp can be ignored. For + example, speed_mpz_fac_ui() uses s->size as n for the factorial. s->r is + just a user-supplied parameter. speed_mpn_lshift() uses it as a shift, + speed_mpn_mul_1() uses it as a multiplier. */ + + +/* MPN_COPY etc can be macros, so the _CALL forms are necessary */ +double +speed_MPN_COPY (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (MPN_COPY); +} +double +speed_MPN_COPY_INCR (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (MPN_COPY_INCR); +} +double +speed_MPN_COPY_DECR (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (MPN_COPY_DECR); +} +/* mpn_copyi and mpn_copyd are only measured when a native version exists. */ +#if HAVE_NATIVE_mpn_copyi +double +speed_mpn_copyi (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (mpn_copyi); +} +#endif +#if HAVE_NATIVE_mpn_copyd +double +speed_mpn_copyd (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (mpn_copyd); +} +#endif +double +speed_memcpy (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY_BYTES (memcpy); +} +double +speed_mpn_com (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (mpn_com); +} +double +speed_mpn_neg (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (mpn_neg); +} +double +speed_mpn_sec_tabselect (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TABSELECT (mpn_sec_tabselect); +} + + +/* mpn_addmul_1 and mpn_submul_1 are always built; mpn_addmul_2 .. + mpn_addmul_8 only when the matching HAVE_NATIVE_mpn_addmul_N is set. */ +double +speed_mpn_addmul_1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1); +} +double +speed_mpn_submul_1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1); +} + +#if HAVE_NATIVE_mpn_addmul_2 +double +speed_mpn_addmul_2 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_2 (mpn_addmul_2); +}
+#endif +#if HAVE_NATIVE_mpn_addmul_3 +double +speed_mpn_addmul_3 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_3 (mpn_addmul_3); +} +#endif +#if HAVE_NATIVE_mpn_addmul_4 +double +speed_mpn_addmul_4 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_4 (mpn_addmul_4); +} +#endif +#if HAVE_NATIVE_mpn_addmul_5 +double +speed_mpn_addmul_5 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_5 (mpn_addmul_5); +} +#endif +#if HAVE_NATIVE_mpn_addmul_6 +double +speed_mpn_addmul_6 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_6 (mpn_addmul_6); +} +#endif +#if HAVE_NATIVE_mpn_addmul_7 +double +speed_mpn_addmul_7 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_7 (mpn_addmul_7); +} +#endif +#if HAVE_NATIVE_mpn_addmul_8 +double +speed_mpn_addmul_8 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_8 (mpn_addmul_8); +} +#endif + +#if HAVE_NATIVE_mpn_addaddmul_1msb0 +double +speed_mpn_addaddmul_1msb0 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_ADDADDMUL1_MSB0 (mpn_addaddmul_1msb0); +} +#endif +/* mpn_mul_1 is always built; mpn_mul_2 .. mpn_mul_6 only when the matching + HAVE_NATIVE_mpn_mul_N is set. */ +double +speed_mpn_mul_1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1); +} +double +speed_mpn_mul_1_inplace (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_mul_1); +} + +#if HAVE_NATIVE_mpn_mul_2 +double +speed_mpn_mul_2 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_2 (mpn_mul_2); +} +#endif +#if HAVE_NATIVE_mpn_mul_3 +double +speed_mpn_mul_3 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_3 (mpn_mul_3); +} +#endif +#if HAVE_NATIVE_mpn_mul_4 +double +speed_mpn_mul_4 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_4 (mpn_mul_4); +} +#endif +#if HAVE_NATIVE_mpn_mul_5 +double +speed_mpn_mul_5 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_5 (mpn_mul_5); +} +#endif +#if HAVE_NATIVE_mpn_mul_6 +double +speed_mpn_mul_6 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_6 (mpn_mul_6); +} +#endif + + +double +speed_mpn_lshift (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift);
+} +double +speed_mpn_lshiftc (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshiftc); +} +double +speed_mpn_rshift (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift); +} + + +/* The carry-in variants (if available) are good for measuring because they + won't skip a division if highsize); +} + +double +speed_mpn_binvert (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINVERT (mpn_binvert, mpn_binvert_itch); +} + +double +speed_mpn_invert (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_INVERT (mpn_invert, mpn_invert_itch); +} + +double +speed_mpn_invertappr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_INVERTAPPR (mpn_invertappr, mpn_invertappr_itch); +} + +double +speed_mpn_ni_invertappr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_INVERTAPPR (mpn_ni_invertappr, mpn_invertappr_itch); +} + +double +speed_mpn_sec_invert (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SEC_INVERT (mpn_sec_invert, mpn_sec_invert_itch); +} + +double +speed_mpn_redc_1 (struct speed_params *s) +{ + SPEED_ROUTINE_REDC_1 (mpn_redc_1); +} +double +speed_mpn_redc_2 (struct speed_params *s) +{ + SPEED_ROUTINE_REDC_2 (mpn_redc_2); +} +double +speed_mpn_redc_n (struct speed_params *s) +{ + SPEED_ROUTINE_REDC_N (mpn_redc_n); +} + + +double +speed_mpn_popcount (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount); +} +double +speed_mpn_hamdist (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist); +} + + +double +speed_mpn_add_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n); +} +double +speed_mpn_sub_n (struct speed_params *s) +{ +SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n); +} +double +speed_mpn_add_1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1 (mpn_add_1); +} +double +speed_mpn_add_1_inplace (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_add_1); +} +double +speed_mpn_sub_1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1 (mpn_sub_1); +} +double +speed_mpn_sub_1_inplace
(struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_sub_1); +} + +double +speed_mpn_add_err1_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_ERR1_N (mpn_add_err1_n); +} +double +speed_mpn_sub_err1_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_ERR1_N (mpn_sub_err1_n); +} +double +speed_mpn_add_err2_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_ERR2_N (mpn_add_err2_n); +} +double +speed_mpn_sub_err2_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_ERR2_N (mpn_sub_err2_n); +} +double +speed_mpn_add_err3_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_ERR3_N (mpn_add_err3_n); +} +double +speed_mpn_sub_err3_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_ERR3_N (mpn_sub_err3_n); +} + + +#if HAVE_NATIVE_mpn_add_n_sub_n +double +speed_mpn_add_n_sub_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_ADDSUB_N_CALL (mpn_add_n_sub_n (ap, sp, s->xp, s->yp, s->size)); +} +#endif + +#if HAVE_NATIVE_mpn_addlsh1_n == 1 +double +speed_mpn_addlsh1_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh1_n); +} +#endif +#if HAVE_NATIVE_mpn_sublsh1_n == 1 +double +speed_mpn_sublsh1_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh1_n); +} +#endif +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 +double +speed_mpn_addlsh1_n_ip1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (mpn_addlsh1_n_ip1); +} +#endif +#if HAVE_NATIVE_mpn_addlsh1_n_ip2 +double +speed_mpn_addlsh1_n_ip2 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (mpn_addlsh1_n_ip2); +} +#endif +#if HAVE_NATIVE_mpn_sublsh1_n_ip1 +double +speed_mpn_sublsh1_n_ip1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (mpn_sublsh1_n_ip1); +} +#endif +#if HAVE_NATIVE_mpn_rsblsh1_n == 1 +double +speed_mpn_rsblsh1_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh1_n); +} +#endif +#if HAVE_NATIVE_mpn_addlsh2_n == 1 +double +speed_mpn_addlsh2_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N
(mpn_addlsh2_n); +} +#endif +#if HAVE_NATIVE_mpn_sublsh2_n == 1 +double +speed_mpn_sublsh2_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh2_n); +} +#endif +#if HAVE_NATIVE_mpn_addlsh2_n_ip1 +double +speed_mpn_addlsh2_n_ip1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (mpn_addlsh2_n_ip1); +} +#endif +#if HAVE_NATIVE_mpn_addlsh2_n_ip2 +double +speed_mpn_addlsh2_n_ip2 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (mpn_addlsh2_n_ip2); +} +#endif +#if HAVE_NATIVE_mpn_sublsh2_n_ip1 +double +speed_mpn_sublsh2_n_ip1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_COPY (mpn_sublsh2_n_ip1); +} +#endif +#if HAVE_NATIVE_mpn_rsblsh2_n == 1 +double +speed_mpn_rsblsh2_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh2_n); +} +#endif +#if HAVE_NATIVE_mpn_addlsh_n +double +speed_mpn_addlsh_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_addlsh_n (wp, xp, yp, s->size, 7)); +} +#endif +#if HAVE_NATIVE_mpn_sublsh_n +double +speed_mpn_sublsh_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_sublsh_n (wp, xp, yp, s->size, 7)); +} +#endif +#if HAVE_NATIVE_mpn_addlsh_n_ip1 +double +speed_mpn_addlsh_n_ip1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_addlsh_n_ip1 (wp, s->xp, s->size, 7)); +} +#endif +#if HAVE_NATIVE_mpn_addlsh_n_ip2 +double +speed_mpn_addlsh_n_ip2 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_addlsh_n_ip2 (wp, s->xp, s->size, 7)); +} +#endif +#if HAVE_NATIVE_mpn_sublsh_n_ip1 +double +speed_mpn_sublsh_n_ip1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_sublsh_n_ip1 (wp, s->xp, s->size, 7)); +} +#endif +#if HAVE_NATIVE_mpn_rsblsh_n +double +speed_mpn_rsblsh_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_rsblsh_n (wp, xp, yp, s->size, 7)); +} +#endif +#if HAVE_NATIVE_mpn_rsh1add_n +double +speed_mpn_rsh1add_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1add_n); +} +#endif +#if
HAVE_NATIVE_mpn_rsh1sub_n +double +speed_mpn_rsh1sub_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1sub_n); +} +#endif + +double +speed_mpn_cnd_add_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_cnd_add_n (1, wp, xp, yp, s->size)); +} +double +speed_mpn_cnd_sub_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_cnd_sub_n (1, wp, xp, yp, s->size)); +} + +/* mpn_and_n etc can be macros and so have to be handled with + SPEED_ROUTINE_MPN_BINARY_N_CALL forms */ +double +speed_mpn_and_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, xp, yp, s->size)); +} +double +speed_mpn_andn_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, xp, yp, s->size)); +} +double +speed_mpn_nand_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, xp, yp, s->size)); +} +double +speed_mpn_ior_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, xp, yp, s->size)); +} +double +speed_mpn_iorn_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, xp, yp, s->size)); +} +double +speed_mpn_nior_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, xp, yp, s->size)); +} +double +speed_mpn_xor_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, xp, yp, s->size)); +} +double +speed_mpn_xnor_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, xp, yp, s->size)); +} + + +double +speed_mpn_mul_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n); +} +double +speed_mpn_sqr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SQR (mpn_sqr); +} +double +speed_mpn_mul_n_sqr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size)); +} + +double +speed_mpn_mul_basecase (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MUL(mpn_mul_basecase); +} +double +speed_mpn_mul (struct
speed_params *s) +{ + SPEED_ROUTINE_MPN_MUL(mpn_mul); +} +double +speed_mpn_sqr_basecase (struct speed_params *s) +{ + /* FIXME: size restrictions on some versions of sqr_basecase */ + SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase); +} + +#if HAVE_NATIVE_mpn_sqr_diagonal +double +speed_mpn_sqr_diagonal (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SQR (mpn_sqr_diagonal); +} +#endif + +#if HAVE_NATIVE_mpn_sqr_diag_addlsh1 +double +speed_mpn_sqr_diag_addlsh1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL (mpn_sqr_diag_addlsh1 (wp, tp, s->xp, s->size)); +} +#endif + +/* Toom squaring and multiplication wrappers. */ +double +speed_mpn_toom2_sqr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM2_SQR (mpn_toom2_sqr); +} +double +speed_mpn_toom3_sqr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM3_SQR (mpn_toom3_sqr); +} +double +speed_mpn_toom4_sqr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM4_SQR (mpn_toom4_sqr); +} +double +speed_mpn_toom6_sqr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM6_SQR (mpn_toom6_sqr); +} +double +speed_mpn_toom8_sqr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM8_SQR (mpn_toom8_sqr); +} +double +speed_mpn_toom22_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM22_MUL_N (mpn_toom22_mul); +} +double +speed_mpn_toom33_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM33_MUL_N (mpn_toom33_mul); +} +double +speed_mpn_toom44_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM44_MUL_N (mpn_toom44_mul); +} +double +speed_mpn_toom6h_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM6H_MUL_N (mpn_toom6h_mul); +} +double +speed_mpn_toom8h_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM8H_MUL_N (mpn_toom8h_mul); +} + +double +speed_mpn_toom32_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM32_MUL (mpn_toom32_mul); +} +double +speed_mpn_toom42_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM42_MUL (mpn_toom42_mul); +} +double +speed_mpn_toom43_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM43_MUL
(mpn_toom43_mul); +} +double +speed_mpn_toom63_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM63_MUL (mpn_toom63_mul); +} +/* Each *_for_* wrapper runs one Toom routine under a SPEED_ROUTINE macro + named after a second routine, e.g. mpn_toom32_mul under + SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL. Presumably this lets the two be + compared on the same operand shape -- see speed.h to confirm. */ +double +speed_mpn_toom32_for_toom43_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL (mpn_toom32_mul); +} +double +speed_mpn_toom43_for_toom32_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL (mpn_toom43_mul); +} +double +speed_mpn_toom32_for_toom53_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL (mpn_toom32_mul); +} +double +speed_mpn_toom53_for_toom32_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL (mpn_toom53_mul); +} +double +speed_mpn_toom42_for_toom53_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL (mpn_toom42_mul); +} +double +speed_mpn_toom53_for_toom42_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL (mpn_toom53_mul); +} +double +speed_mpn_toom43_for_toom54_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL (mpn_toom43_mul); +} +double +speed_mpn_toom54_for_toom43_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL (mpn_toom54_mul); +} + +double +speed_mpn_nussbaumer_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MUL_N_CALL + (mpn_nussbaumer_mul (wp, s->xp, s->size, s->yp, s->size)); +} +double +speed_mpn_nussbaumer_mul_sqr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SQR_CALL + (mpn_nussbaumer_mul (wp, s->xp, s->size, s->xp, s->size)); +} + +#if WANT_OLD_FFT_FULL +double +speed_mpn_mul_fft_full (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MUL_N_CALL + (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size)); +} +double +speed_mpn_mul_fft_full_sqr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SQR_CALL + (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size)); +} +#endif + +/* These are mod 2^N+1 multiplies and squares. If s->r is supplied it's + used as k, otherwise the best k for the size is used.
If s->size isn't a + multiple of 2^k it's rounded up to make the effective operation size. */ + +/* Shared body for the two mpn_mul_fft wrappers below. It requires + s->size >= 1 (SPEED_RESTRICT_COND), takes k from s->r when that is + non-zero and otherwise from mpn_fft_best_k (s->size, sqr), sets + pl = mpn_fft_next_size (s->size, k) and allocates pl+1 limbs for wp. + s->xp (plus s->yp when sqr is 0) is registered as source and wp as + destination, the caches are primed with speed_cache_fill(), and then + s->reps executions of "call" are timed. */ +#define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr) \ + { \ + mp_ptr wp; \ + mp_size_t pl; \ + int k; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + if (s->r != 0) \ + k = s->r; \ + else \ + k = mpn_fft_best_k (s->size, sqr); \ + \ + TMP_MARK; \ + pl = mpn_fft_next_size (s->size, k); \ + SPEED_TMP_ALLOC_LIMBS (wp, pl+1, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + if (!sqr) \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, pl+1); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +double +speed_mpn_mul_fft (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MUL_FFT_CALL + (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0); +} + +double +speed_mpn_mul_fft_sqr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MUL_FFT_CALL + (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1); +} + +double +speed_mpn_fft_mul (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MUL_N_CALL (mpn_fft_mul (wp, s->xp, s->size, s->yp, s->size)); +} + +double +speed_mpn_fft_sqr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SQR_CALL (mpn_fft_mul (wp, s->xp, s->size, s->xp, s->size)); +} + +double +speed_mpn_sqrlo (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SQRLO (mpn_sqrlo); +} +/* Restricted, via SPEED_RESTRICT_COND, to sizes that are ABOVE_THRESHOLD of + MIN (3, SQRLO_BASECASE_THRESHOLD) and BELOW_THRESHOLD of + SQRLO_DC_THRESHOLD. */ +double +speed_mpn_sqrlo_basecase (struct speed_params *s) +{ + SPEED_RESTRICT_COND (ABOVE_THRESHOLD (s->size, MIN (3, SQRLO_BASECASE_THRESHOLD)) + && BELOW_THRESHOLD (s->size, SQRLO_DC_THRESHOLD)); + SPEED_ROUTINE_MPN_SQRLO (mpn_sqrlo_basecase); +} +double +speed_mpn_mullo_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULLO_N (mpn_mullo_n); +} +double +speed_mpn_mullo_basecase (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULLO_BASECASE (mpn_mullo_basecase); +} + +double
+speed_mpn_mulmid_basecase (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMID (mpn_mulmid_basecase); +} + +double +speed_mpn_mulmid (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMID (mpn_mulmid); +} + +double +speed_mpn_mulmid_n (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMID_N (mpn_mulmid_n); +} + +double +speed_mpn_toom42_mulmid (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_TOOM42_MULMID (mpn_toom42_mulmid); +} + +double +speed_mpn_mulmod_bnm1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_mulmod_bnm1 (wp, s->size, s->xp, s->size, s->yp, s->size, tp)); +} + +double +speed_mpn_bc_mulmod_bnm1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_bc_mulmod_bnm1 (wp, s->xp, s->yp, s->size, tp)); +} + +double +speed_mpn_mulmod_bnm1_rounded (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED (mpn_mulmod_bnm1); +} + +double +speed_mpn_sqrmod_bnm1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_sqrmod_bnm1 (wp, s->size, s->xp, s->size, tp)); +} + +double +speed_mpn_mulmod_bknp1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMOD_BNP1_CALL (mpn_mulmod_bknp1 (wp, s->xp, s->yp, nk, k, tp),1); +} + +double +speed_mpn_sqrmod_bknp1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMOD_BNP1_CALL (mpn_sqrmod_bknp1 (wp, s->xp, nk, k, tp),1); +} + +static void +mpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, + unsigned k, mp_ptr tp) +{ + if (k > 2) + mpn_mulmod_bknp1 (rp, ap, bp, n, k, tp); + else + { + n *= k; + mpn_mul_n (tp, ap, bp, n); + mpn_sub_n (rp, tp, tp + n, n); + } +} + +static void +mpn_bc_sqrmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_size_t n, + unsigned k, mp_ptr tp) +{ + if (k > 2) + mpn_sqrmod_bknp1 (rp, ap, n, k, tp); + else + { + n *= k; + mpn_sqr (tp, ap, n); + mpn_sub_n (rp, tp, tp + n, n); + } +} + +double +speed_mpn_mulmod_bnp1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMOD_BNP1_CALL (mpn_bc_mulmod_bnp1 (wp, s->xp, s->yp, nk, k, 
tp),0); +} + +double +speed_mpn_sqrmod_bnp1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MULMOD_BNP1_CALL (mpn_bc_sqrmod_bnp1 (wp, s->xp, nk, k, tp),0); +} + +double +speed_mpn_matrix22_mul (struct speed_params *s) +{ + /* Speed params only includes 2 inputs, so we have to invent the + other 6. */ + + mp_ptr a; + mp_ptr r; + mp_ptr b; + mp_ptr tp; + mp_size_t itch; + unsigned i; + double t; + TMP_DECL; + + TMP_MARK; + SPEED_TMP_ALLOC_LIMBS (a, 4 * s->size, s->align_xp); + SPEED_TMP_ALLOC_LIMBS (b, 4 * s->size, s->align_yp); + SPEED_TMP_ALLOC_LIMBS (r, 8 * s->size + 4, s->align_wp); + + MPN_COPY (a, s->xp, s->size); + mpn_random (a + s->size, 3 * s->size); + MPN_COPY (b, s->yp, s->size); + mpn_random (b + s->size, 3 * s->size); + + itch = mpn_matrix22_mul_itch (s->size, s->size); + SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); + + speed_operand_src (s, a, 4 * s->size); + speed_operand_src (s, b, 4 * s->size); + speed_operand_dst (s, r, 8 * s->size + 4); + speed_operand_dst (s, tp, itch); + speed_cache_fill (s); + + speed_starttime (); + i = s->reps; + do + { + mp_size_t sz = s->size; + MPN_COPY (r + 0 * sz + 0, a + 0 * sz, sz); + MPN_COPY (r + 2 * sz + 1, a + 1 * sz, sz); + MPN_COPY (r + 4 * sz + 2, a + 2 * sz, sz); + MPN_COPY (r + 6 * sz + 3, a + 3 * sz, sz); + mpn_matrix22_mul (r, r + 2 * sz + 1, r + 4 * sz + 2, r + 6 * sz + 3, sz, + b, b + 1 * sz, b + 2 * sz, b + 3 * sz, sz, + tp); + } + while (--i != 0); + t = speed_endtime(); + TMP_FREE; + return t; +} + +double +speed_mpn_hgcd2 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2); +} +double +speed_mpn_hgcd2_1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_1); +} +double +speed_mpn_hgcd2_2 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_2); +} +double +speed_mpn_hgcd2_3 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_3); +} +double +speed_mpn_hgcd2_4 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_4); +} +double 
+speed_mpn_hgcd2_5 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_5); +} + +double +speed_mpn_hgcd (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd, mpn_hgcd_itch); +} + +double +speed_mpn_hgcd_lehmer (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_lehmer, mpn_hgcd_lehmer_itch); +} + +double +speed_mpn_hgcd_appr (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr, mpn_hgcd_appr_itch); +} + +double +speed_mpn_hgcd_appr_lehmer (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr_lehmer, mpn_hgcd_appr_lehmer_itch); +} + +double +speed_mpn_hgcd_reduce (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce, mpn_hgcd_reduce_itch); +} +double +speed_mpn_hgcd_reduce_1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_1, mpn_hgcd_reduce_1_itch); +} +double +speed_mpn_hgcd_reduce_2 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_2, mpn_hgcd_reduce_2_itch); +} + +double +speed_mpn_gcd (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_GCD (mpn_gcd); +} + +double +speed_mpn_gcdext (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext); +} +#if 0 +double +speed_mpn_gcdext_lehmer (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_GCDEXT (__gmpn_gcdext_lehmer); +} +#endif +double +speed_mpn_gcdext_single (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_single); +} +double +speed_mpn_gcdext_double (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_double); +} +double +speed_mpn_gcdext_one_single (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_single); +} +double +speed_mpn_gcdext_one_double (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_double); +} +double +speed_mpn_gcd_1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1); +} +double +speed_mpn_gcd_11 (struct speed_params *s) +{ + 
SPEED_ROUTINE_MPN_GCD_11 (mpn_gcd_11); +} +double +speed_mpn_gcd_1N (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_GCD_1N (mpn_gcd_1); +} +double +speed_mpn_gcd_22 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_GCD_22 (mpn_gcd_22); +} + +double +speed_gmp_primesieve (struct speed_params *s) +{ + SPEED_ROUTINE_GMP_PRIMESIEVE (gmp_primesieve); +} + +double +speed_mpz_nextprime (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_NEXTPRIME (mpz_nextprime); +} + +double +speed_mpz_nextprime_1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_UNARY_1 (mpz_nextprime); +} + +double +speed_mpz_prevprime (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_NEXTPRIME (mpz_prevprime); +} + +double +speed_mpz_prevprime_1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_UNARY_1 (mpz_prevprime); +} + +double +speed_mpz_jacobi (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_JACOBI (mpz_jacobi); +} +double +speed_mpn_jacobi_base (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base); +} +double +speed_mpn_jacobi_base_1 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_1); +} +double +speed_mpn_jacobi_base_2 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_2); +} +double +speed_mpn_jacobi_base_3 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_3); +} +double +speed_mpn_jacobi_base_4 (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_4); +} + + +double +speed_mpn_sqrtrem (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_sqrtrem (wp, wp2, s->xp, s->size)); +} + +double +speed_mpn_sqrt (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_sqrtrem (wp, NULL, s->xp, s->size)); +} + +double +speed_mpn_rootrem (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_rootrem (wp, wp2, s->xp, s->size, s->r)); +} + +double +speed_mpn_root (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_rootrem (wp, NULL, s->xp, s->size, s->r)); +} + + 
+double +speed_mpn_perfect_power_p (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_PERFECT_POWER (mpn_perfect_power_p); +} + +double +speed_mpn_perfect_square_p (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_PERFECT_SQUARE (mpn_perfect_square_p); +} + + +double +speed_mpz_fac_ui (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui); +} + +double +speed_mpz_2fac_ui (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_UI (mpz_2fac_ui); +} + +double +speed_mpz_primorial_ui (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_UI (mpz_primorial_ui); +} + + +double +speed_mpn_fib2_ui (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_FIB2_UI (mpn_fib2_ui); +} +double +speed_mpz_fib_ui (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_FIB_UI (mpz_fib_ui); +} +double +speed_mpz_fib2_ui (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_FIB2_UI (mpz_fib2_ui); +} +double +speed_mpz_lucnum_ui (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_LUCNUM_UI (mpz_lucnum_ui); +} +double +speed_mpz_lucnum2_ui (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_LUCNUM2_UI (mpz_lucnum2_ui); +} + + +double +speed_mpz_powm (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_POWM (mpz_powm); +} +double +speed_mpz_powm_mod (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_POWM (mpz_powm_mod); +} +double +speed_mpz_powm_redc (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_POWM (mpz_powm_redc); +} +double +speed_mpz_powm_sec (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_POWM (mpz_powm_sec); +} +double +speed_mpz_powm_ui (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_POWM_UI (mpz_powm_ui); +} + + +double +speed_binvert_limb (struct speed_params *s) +{ + SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb); +} + + +/* Time s->reps calls of noop (). */ +double +speed_noop (struct speed_params *s) +{ + unsigned i; + + speed_starttime (); + i = s->reps; + do + noop (); + while (--i != 0); + return speed_endtime (); +} + +/* Time s->reps calls of noop_wxs (wp, s->xp, s->size), where wp is a one + limb temporary allocation. */ +double +speed_noop_wxs (struct speed_params *s) +{ + mp_ptr wp; + unsigned i; + double t; + TMP_DECL; + + TMP_MARK; + wp = 
TMP_ALLOC_LIMBS (1); + + speed_starttime (); + i = s->reps; + do + noop_wxs (wp, s->xp, s->size); + while (--i != 0); + t = speed_endtime (); + + TMP_FREE; + return t; +} + +/* Same measurement as speed_noop_wxs, but calling + noop_wxys (wp, s->xp, s->yp, s->size). */ +double +speed_noop_wxys (struct speed_params *s) +{ + mp_ptr wp; + unsigned i; + double t; + TMP_DECL; + + TMP_MARK; + wp = TMP_ALLOC_LIMBS (1); + + speed_starttime (); + i = s->reps; + do + noop_wxys (wp, s->xp, s->yp, s->size); + while (--i != 0); + t = speed_endtime (); + + TMP_FREE; + return t; +} + + +/* Function body for the allocation timings below: declares `variables', + times s->reps executions of `calls', and returns the result of + speed_endtime () from the enclosing function. `s' must be in scope. */ +#define SPEED_ROUTINE_ALLOC_FREE(variables, calls) \ + { \ + unsigned i; \ + variables; \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + calls; \ + } \ + while (--i != 0); \ + return speed_endtime (); \ + } + + +/* Compare these to see how much malloc/free costs and then how much + __gmp_default_allocate/free and mpz_init/clear add. mpz_init/clear or + mpq_init/clear will be doing a 1 limb allocate, so use that as the size + when including them in comparisons. */ + +double +speed_malloc_free (struct speed_params *s) +{ + size_t bytes = s->size * GMP_LIMB_BYTES; + SPEED_ROUTINE_ALLOC_FREE (void *p, + p = malloc (bytes); + free (p)); +} + +double +speed_malloc_realloc_free (struct speed_params *s) +{ + size_t bytes = s->size * GMP_LIMB_BYTES; + SPEED_ROUTINE_ALLOC_FREE (void *p, + p = malloc (GMP_LIMB_BYTES); + p = realloc (p, bytes); + free (p)); +} + +double +speed_gmp_allocate_free (struct speed_params *s) +{ + size_t bytes = s->size * GMP_LIMB_BYTES; + SPEED_ROUTINE_ALLOC_FREE (void *p, + p = (*__gmp_allocate_func) (bytes); + (*__gmp_free_func) (p, bytes)); +} + +/* Allocate one limb, grow it to s->size limbs, then free it, mirroring + speed_malloc_realloc_free. The arguments of __gmp_reallocate_func are + (ptr, old_size, new_size), so the old size is GMP_LIMB_BYTES and the new + size is `bytes'; that also makes the size handed to __gmp_free_func match + the block being freed. Size 0 is excluded because a reallocate function + need not be able to return a 0 byte block. */ +double +speed_gmp_allocate_reallocate_free (struct speed_params *s) +{ + size_t bytes = s->size * GMP_LIMB_BYTES; + SPEED_RESTRICT_COND (s->size >= 1); + SPEED_ROUTINE_ALLOC_FREE + (void *p, + p = (*__gmp_allocate_func) (GMP_LIMB_BYTES); + p = (*__gmp_reallocate_func) (p, GMP_LIMB_BYTES, bytes); + (*__gmp_free_func) (p, bytes)); +} + +double +speed_mpz_init_clear (struct speed_params *s) +{ + SPEED_ROUTINE_ALLOC_FREE (mpz_t z, + mpz_init (z); + mpz_clear 
(z)); +} + +double +speed_mpz_init_realloc_clear (struct speed_params *s) +{ + SPEED_ROUTINE_ALLOC_FREE (mpz_t z, + mpz_init (z); + _mpz_realloc (z, s->size); + mpz_clear (z)); +} + +double +speed_mpq_init_clear (struct speed_params *s) +{ + SPEED_ROUTINE_ALLOC_FREE (mpq_t q, + mpq_init (q); + mpq_clear (q)); +} + +double +speed_mpf_init_clear (struct speed_params *s) +{ + SPEED_ROUTINE_ALLOC_FREE (mpf_t f, + mpf_init (f); + mpf_clear (f)); +} + + +/* Compare this to mpn_add_n to see how much overhead mpz_add adds. Note + that repeatedly calling mpz_add with the same data gives the branch + prediction inside it an advantage. One untimed mpz_add is done first, so + w has already been written once when timing starts. */ + +double +speed_mpz_add (struct speed_params *s) +{ + mpz_t w, x, y; + unsigned i; + double t; + + mpz_init (w); + mpz_init (x); + mpz_init (y); + + mpz_set_n (x, s->xp, s->size); + mpz_set_n (y, s->yp, s->size); + mpz_add (w, x, y); + + speed_starttime (); + i = s->reps; + do + { + mpz_add (w, x, y); + } + while (--i != 0); + t = speed_endtime (); + + mpz_clear (w); + mpz_clear (x); + mpz_clear (y); + return t; +} + + +/* Time mpz_invert (r, a, m). m is initialised from s->yp with size s->size + and has bit 0 set to make it odd; a is initialised from s->xp with size k. + k is s->size/2 when s->r is 0, s->r when s->r is below GMP_LIMB_HIGHBIT, + and otherwise s->r is read as a negative number, giving + k = s->size - (-s->r). Before timing, a is incremented until mpz_invert + reports that an inverse exists. */ + +double +speed_mpz_invert (struct speed_params *s) +{ + mpz_t a, m, r; + mp_size_t k; + unsigned i; + double t; + + if (s->r == 0) + k = s->size/2; + else if (s->r < GMP_LIMB_HIGHBIT) + k = s->r; + else /* s->r < 0 */ + k = s->size - (-s->r); + + SPEED_RESTRICT_COND (k > 0 && k <= s->size); + + mpz_init_set_n (m, s->yp, s->size); + mpz_setbit (m, 0); /* force m to odd */ + + mpz_init_set_n (a, s->xp, k); + + mpz_init (r); + while (mpz_invert (r, a, m) == 0) + mpz_add_ui (a, a, 1); + + speed_starttime (); + i = s->reps; + do + mpz_invert (r, a, m); + while (--i != 0); + t = speed_endtime (); + + mpz_clear (r); + mpz_clear (a); + mpz_clear (m); + return t; + } + +/* If r==0, calculate binomial(size,size/2), + otherwise calculate binomial(size,r). 
*/ + +double +speed_mpz_bin_uiui (struct speed_params *s) +{ + mpz_t w; + unsigned long k; + unsigned i; + double t; + + mpz_init (w); + if (s->r != 0) + k = s->r; + else + k = s->size/2; + + speed_starttime (); + i = s->reps; + do + { + mpz_bin_uiui (w, s->size, k); + } + while (--i != 0); + t = speed_endtime (); + + mpz_clear (w); + return t; +} + +/* If r==0, calculate binomial(2^size,size), + otherwise calculate binomial(2^size,r). The operand 2^size is built once, + before timing starts, by setting bit s->size in an mpz_t initialised to + zero. */ + +double +speed_mpz_bin_ui (struct speed_params *s) +{ + mpz_t w, x; + unsigned long k; + unsigned i; + double t; + + mpz_init (w); + mpz_init_set_ui (x, 0); + + mpz_setbit (x, s->size); + + if (s->r != 0) + k = s->r; + else + k = s->size; + + speed_starttime (); + i = s->reps; + do + { + mpz_bin_ui (w, x, k); + } + while (--i != 0); + t = speed_endtime (); + + mpz_clear (w); + mpz_clear (x); + return t; +} + +/* If r==0, calculate mfac(size,log(size)), + otherwise calculate mfac(size,r). log(size) here is the smallest k >= 1 + for which size >> k is zero. */ + +double +speed_mpz_mfac_uiui (struct speed_params *s) +{ + mpz_t w; + unsigned long k; + unsigned i; + double t; + + mpz_init (w); + if (s->r != 0) + k = s->r; + else + for (k = 1; s->size >> k; ++k); /* empty body, the loop only counts k up */ + + speed_starttime (); + i = s->reps; + do + { + mpz_mfac_uiui (w, s->size, k); + } + while (--i != 0); + t = speed_endtime (); + + mpz_clear (w); + return t; +} + +/* The multiplies are successively dependent so the latency is measured, not + the issue rate. There's only 10 per loop so the code doesn't get too big + since umul_ppmm is several instructions on some cpus. + + Putting the arguments as "h,l,l,h" gets slightly better code from gcc + 2.95.2 on x86: it puts only one mov between each mul, not two. That mov + will probably show up as a bogus extra cycle though. + + The measuring function macros are in three parts to avoid overflowing + preprocessor expansion space if umul_ppmm is big. + + Limitations: + + The default umul_ppmm doing h*l will be getting increasing numbers of + high zero bits in the calculation. 
CPUs with data-dependent multipliers + will want to use umul_ppmm.1 to get some randomization into the + calculation. The extra xors and fetches will be a slowdown of course. */ + +#define SPEED_MACRO_UMUL_PPMM_A \ + { \ + mp_limb_t h, l; \ + unsigned i; \ + double t; \ + \ + s->time_divisor = 10; \ + \ + h = s->xp[0]; \ + l = s->yp[0]; \ + \ + if (s->r == 1) \ + { \ + speed_starttime (); \ + i = s->reps; \ + do \ + { + +#define SPEED_MACRO_UMUL_PPMM_B \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + } \ + else \ + { \ + speed_starttime (); \ + i = s->reps; \ + do \ + { + +#define SPEED_MACRO_UMUL_PPMM_C \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + } \ + \ + /* stop the compiler optimizing away the whole calculation! */ \ + noop_1 (h); \ + noop_1 (l); \ + \ + return t; \ + } + + +double +speed_umul_ppmm (struct speed_params *s) +{ + SPEED_MACRO_UMUL_PPMM_A; + { + umul_ppmm (h, l, l, h); h ^= s->xp_block[0]; l ^= s->yp_block[0]; + umul_ppmm (h, l, l, h); h ^= s->xp_block[1]; l ^= s->yp_block[1]; + umul_ppmm (h, l, l, h); h ^= s->xp_block[2]; l ^= s->yp_block[2]; + umul_ppmm (h, l, l, h); h ^= s->xp_block[3]; l ^= s->yp_block[3]; + umul_ppmm (h, l, l, h); h ^= s->xp_block[4]; l ^= s->yp_block[4]; + umul_ppmm (h, l, l, h); h ^= s->xp_block[5]; l ^= s->yp_block[5]; + umul_ppmm (h, l, l, h); h ^= s->xp_block[6]; l ^= s->yp_block[6]; + umul_ppmm (h, l, l, h); h ^= s->xp_block[7]; l ^= s->yp_block[7]; + umul_ppmm (h, l, l, h); h ^= s->xp_block[8]; l ^= s->yp_block[8]; + umul_ppmm (h, l, l, h); h ^= s->xp_block[9]; l ^= s->yp_block[9]; + } + SPEED_MACRO_UMUL_PPMM_B; + { + umul_ppmm (h, l, l, h); + umul_ppmm (h, l, l, h); + umul_ppmm (h, l, l, h); + umul_ppmm (h, l, l, h); + umul_ppmm (h, l, l, h); + umul_ppmm (h, l, l, h); + umul_ppmm (h, l, l, h); + umul_ppmm (h, l, l, h); + umul_ppmm (h, l, l, h); + umul_ppmm (h, l, l, h); + } + SPEED_MACRO_UMUL_PPMM_C; +} + + +#if HAVE_NATIVE_mpn_umul_ppmm +double +speed_mpn_umul_ppmm (struct speed_params *s) +{ + 
SPEED_MACRO_UMUL_PPMM_A; + { + h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[0]; l ^= s->yp_block[0]; + h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[1]; l ^= s->yp_block[1]; + h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[2]; l ^= s->yp_block[2]; + h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[3]; l ^= s->yp_block[3]; + h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[4]; l ^= s->yp_block[4]; + h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[5]; l ^= s->yp_block[5]; + h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[6]; l ^= s->yp_block[6]; + h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[7]; l ^= s->yp_block[7]; + h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[8]; l ^= s->yp_block[8]; + h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[9]; l ^= s->yp_block[9]; + } + SPEED_MACRO_UMUL_PPMM_B; + { + h = mpn_umul_ppmm (&l, h, l); + h = mpn_umul_ppmm (&l, h, l); + h = mpn_umul_ppmm (&l, h, l); + h = mpn_umul_ppmm (&l, h, l); + h = mpn_umul_ppmm (&l, h, l); + h = mpn_umul_ppmm (&l, h, l); + h = mpn_umul_ppmm (&l, h, l); + h = mpn_umul_ppmm (&l, h, l); + h = mpn_umul_ppmm (&l, h, l); + h = mpn_umul_ppmm (&l, h, l); + } + SPEED_MACRO_UMUL_PPMM_C; +} +#endif + +#if HAVE_NATIVE_mpn_umul_ppmm_r +double +speed_mpn_umul_ppmm_r (struct speed_params *s) +{ + SPEED_MACRO_UMUL_PPMM_A; + { + h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[0]; l ^= s->yp_block[0]; + h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[1]; l ^= s->yp_block[1]; + h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[2]; l ^= s->yp_block[2]; + h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[3]; l ^= s->yp_block[3]; + h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[4]; l ^= s->yp_block[4]; + h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[5]; l ^= s->yp_block[5]; + h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[6]; l ^= s->yp_block[6]; + h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[7]; l ^= s->yp_block[7]; + h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[8]; l ^= s->yp_block[8]; + h = mpn_umul_ppmm_r 
(h, l, &l); h ^= s->xp_block[9]; l ^= s->yp_block[9]; + } + SPEED_MACRO_UMUL_PPMM_B; + { + h = mpn_umul_ppmm_r (h, l, &l); + h = mpn_umul_ppmm_r (h, l, &l); + h = mpn_umul_ppmm_r (h, l, &l); + h = mpn_umul_ppmm_r (h, l, &l); + h = mpn_umul_ppmm_r (h, l, &l); + h = mpn_umul_ppmm_r (h, l, &l); + h = mpn_umul_ppmm_r (h, l, &l); + h = mpn_umul_ppmm_r (h, l, &l); + h = mpn_umul_ppmm_r (h, l, &l); + h = mpn_umul_ppmm_r (h, l, &l); + } + SPEED_MACRO_UMUL_PPMM_C; +} +#endif + + +/* The divisions are successively dependent so latency is measured, not + issue rate. There's only 10 per loop so the code doesn't get too big, + especially for udiv_qrnnd_preinv and preinv2norm, which are several + instructions each. + + Note that it's only the division which is measured here, there's no data + fetching and no shifting if the divisor gets normalized. + + In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d" + generate x86 div instructions with nothing in between. + + The measuring function macros are in two parts to avoid overflowing + preprocessor expansion space if udiv_qrnnd etc are big. + + Limitations: + + Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code + generated first. + + CPUs with data-dependent divisions may want more attention paid to the + randomness of the data used. Probably the measurement wanted is over + uniformly distributed numbers, but what's here might not be giving that. 
*/ + +#define SPEED_ROUTINE_UDIV_QRNND_A(normalize) \ + { \ + double t; \ + unsigned i; \ + mp_limb_t q, r, d; \ + mp_limb_t dinv; \ + \ + s->time_divisor = 10; \ + \ + /* divisor from "r" parameter, or a default */ \ + d = s->r; \ + if (d == 0) \ + d = mp_bases[10].big_base; \ + \ + if (normalize) \ + { \ + unsigned norm; \ + count_leading_zeros (norm, d); \ + d <<= norm; \ + invert_limb (dinv, d); \ + } \ + \ + q = s->xp[0]; \ + r = s->yp[0] % d; \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { + +#define SPEED_ROUTINE_UDIV_QRNND_B \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + /* stop the compiler optimizing away the whole calculation! */ \ + noop_1 (q); \ + noop_1 (r); \ + \ + return t; \ + } + +double +speed_udiv_qrnnd (struct speed_params *s) +{ + SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION); + { + udiv_qrnnd (q, r, r, q, d); + udiv_qrnnd (q, r, r, q, d); + udiv_qrnnd (q, r, r, q, d); + udiv_qrnnd (q, r, r, q, d); + udiv_qrnnd (q, r, r, q, d); + udiv_qrnnd (q, r, r, q, d); + udiv_qrnnd (q, r, r, q, d); + udiv_qrnnd (q, r, r, q, d); + udiv_qrnnd (q, r, r, q, d); + udiv_qrnnd (q, r, r, q, d); + } + SPEED_ROUTINE_UDIV_QRNND_B; +} + +double +speed_udiv_qrnnd_c (struct speed_params *s) +{ + SPEED_ROUTINE_UDIV_QRNND_A (1); + { + __udiv_qrnnd_c (q, r, r, q, d); + __udiv_qrnnd_c (q, r, r, q, d); + __udiv_qrnnd_c (q, r, r, q, d); + __udiv_qrnnd_c (q, r, r, q, d); + __udiv_qrnnd_c (q, r, r, q, d); + __udiv_qrnnd_c (q, r, r, q, d); + __udiv_qrnnd_c (q, r, r, q, d); + __udiv_qrnnd_c (q, r, r, q, d); + __udiv_qrnnd_c (q, r, r, q, d); + __udiv_qrnnd_c (q, r, r, q, d); + } + SPEED_ROUTINE_UDIV_QRNND_B; +} + +#if HAVE_NATIVE_mpn_udiv_qrnnd +double +speed_mpn_udiv_qrnnd (struct speed_params *s) +{ + SPEED_ROUTINE_UDIV_QRNND_A (1); + { + q = mpn_udiv_qrnnd (&r, r, q, d); + q = mpn_udiv_qrnnd (&r, r, q, d); + q = mpn_udiv_qrnnd (&r, r, q, d); + q = mpn_udiv_qrnnd (&r, r, q, d); + q = mpn_udiv_qrnnd (&r, r, q, d); + q = mpn_udiv_qrnnd (&r, r, q, 
d); + q = mpn_udiv_qrnnd (&r, r, q, d); + q = mpn_udiv_qrnnd (&r, r, q, d); + q = mpn_udiv_qrnnd (&r, r, q, d); + q = mpn_udiv_qrnnd (&r, r, q, d); + } + SPEED_ROUTINE_UDIV_QRNND_B; +} +#endif + +#if HAVE_NATIVE_mpn_udiv_qrnnd_r +double +speed_mpn_udiv_qrnnd_r (struct speed_params *s) +{ + SPEED_ROUTINE_UDIV_QRNND_A (1); + { + q = mpn_udiv_qrnnd_r (r, q, d, &r); + q = mpn_udiv_qrnnd_r (r, q, d, &r); + q = mpn_udiv_qrnnd_r (r, q, d, &r); + q = mpn_udiv_qrnnd_r (r, q, d, &r); + q = mpn_udiv_qrnnd_r (r, q, d, &r); + q = mpn_udiv_qrnnd_r (r, q, d, &r); + q = mpn_udiv_qrnnd_r (r, q, d, &r); + q = mpn_udiv_qrnnd_r (r, q, d, &r); + q = mpn_udiv_qrnnd_r (r, q, d, &r); + q = mpn_udiv_qrnnd_r (r, q, d, &r); + } + SPEED_ROUTINE_UDIV_QRNND_B; +} +#endif + + +double +speed_invert_limb (struct speed_params *s) +{ + SPEED_ROUTINE_INVERT_LIMB_CALL (invert_limb (dinv, d)); +} + + +/* xp[0] might not be particularly random, but should give an indication how + "/" runs. Same for speed_operator_mod below. */ +double +speed_operator_div (struct speed_params *s) +{ + double t; + unsigned i; + mp_limb_t x, q, d; + + s->time_divisor = 10; + + /* divisor from "r" parameter, or a default */ + d = s->r; + if (d == 0) + d = mp_bases[10].big_base; + + x = s->xp[0]; + q = 0; + + speed_starttime (); + i = s->reps; + do + { + q ^= x; q /= d; + q ^= x; q /= d; + q ^= x; q /= d; + q ^= x; q /= d; + q ^= x; q /= d; + q ^= x; q /= d; + q ^= x; q /= d; + q ^= x; q /= d; + q ^= x; q /= d; + q ^= x; q /= d; + } + while (--i != 0); + t = speed_endtime (); + + /* stop the compiler optimizing away the whole calculation! 
*/ + noop_1 (q); + + return t; +} + +double +speed_operator_mod (struct speed_params *s) +{ + double t; + unsigned i; + mp_limb_t x, r, d; + + s->time_divisor = 10; + + /* divisor from "r" parameter, or a default */ + d = s->r; + if (d == 0) + d = mp_bases[10].big_base; + + x = s->xp[0]; + r = 0; + + speed_starttime (); + i = s->reps; + do + { + r ^= x; r %= d; + r ^= x; r %= d; + r ^= x; r %= d; + r ^= x; r %= d; + r ^= x; r %= d; + r ^= x; r %= d; + r ^= x; r %= d; + r ^= x; r %= d; + r ^= x; r %= d; + r ^= x; r %= d; + } + while (--i != 0); + t = speed_endtime (); + + /* stop the compiler optimizing away the whole calculation! */ + noop_1 (r); + + return t; +} + + +/* r==0 measures on data with the values uniformly distributed. This will + be typical for count_trailing_zeros in a GCD etc. + + r==1 measures on data with the resultant count uniformly distributed + between 0 and GMP_LIMB_BITS-1. This is probably sensible for + count_leading_zeros on the high limbs of divisors. */ + +int +speed_routine_count_zeros_setup (struct speed_params *s, + mp_ptr xp, int leading, int zero) +{ + int i, c; + mp_limb_t n; + + if (s->r == 0) + { + /* Make uniformly distributed data. If zero isn't allowed then change + it to 1 for leading, or 0x800..00 for trailing. */ + MPN_COPY (xp, s->xp_block, SPEED_BLOCK_SIZE); + if (! zero) + for (i = 0; i < SPEED_BLOCK_SIZE; i++) + if (xp[i] == 0) + xp[i] = leading ? 1 : GMP_LIMB_HIGHBIT; + } + else if (s->r == 1) + { + /* Make counts uniformly distributed. A randomly chosen bit is set, and + for leading the rest above it are cleared, or for trailing then the + rest below. */ + for (i = 0; i < SPEED_BLOCK_SIZE; i++) + { + mp_limb_t set = CNST_LIMB(1) << (s->yp_block[i] % GMP_LIMB_BITS); + mp_limb_t keep_below = set-1; + mp_limb_t keep_above = MP_LIMB_T_MAX ^ keep_below; + mp_limb_t keep = (leading ? keep_below : keep_above); + xp[i] = (s->xp_block[i] & keep) | set; + } + } + else + { + return 0; + } + + /* Account for the effect of n^=c. 
*/ + c = 0; + for (i = 0; i < SPEED_BLOCK_SIZE; i++) + { + n = xp[i]; + xp[i] ^= c; + + if (leading) + count_leading_zeros (c, n); + else + count_trailing_zeros (c, n); + } + + return 1; +} + +double +speed_count_leading_zeros (struct speed_params *s) +{ +#ifdef COUNT_LEADING_ZEROS_0 +#define COUNT_LEADING_ZEROS_0_ALLOWED 1 +#else +#define COUNT_LEADING_ZEROS_0_ALLOWED 0 +#endif + + SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED); + count_leading_zeros (c, n); + SPEED_ROUTINE_COUNT_ZEROS_B (); +} +double +speed_count_trailing_zeros (struct speed_params *s) +{ + SPEED_ROUTINE_COUNT_ZEROS_A (0, 0); + count_trailing_zeros (c, n); + SPEED_ROUTINE_COUNT_ZEROS_B (); +} + + +double +speed_mpn_get_str (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_GET_STR (mpn_get_str); +} + +double +speed_mpn_set_str (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_set_str (wp, xp, s->size, base)); +} +double +speed_mpn_bc_set_str (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_bc_set_str (wp, xp, s->size, base)); +} + +double +speed_MPN_ZERO (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_ZERO_CALL (MPN_ZERO (wp, s->size)); +} + + +int +speed_randinit (struct speed_params *s, gmp_randstate_ptr rstate) +{ + if (s->r == 0) + gmp_randinit_default (rstate); + else if (s->r == 1) + gmp_randinit_mt (rstate); + else + { + return gmp_randinit_lc_2exp_size (rstate, s->r); + } + return 1; +} + +double +speed_gmp_randseed (struct speed_params *s) +{ + gmp_randstate_t rstate; + unsigned i; + double t; + mpz_t x; + + SPEED_RESTRICT_COND (s->size >= 1); + SPEED_RESTRICT_COND (speed_randinit (s, rstate)); + + /* s->size bits of seed */ + mpz_init_set_n (x, s->xp, s->size); + mpz_fdiv_r_2exp (x, x, (unsigned long) s->size); + + /* cache priming */ + gmp_randseed (rstate, x); + + speed_starttime (); + i = s->reps; + do + gmp_randseed (rstate, x); + while (--i != 0); + t = speed_endtime (); + + gmp_randclear (rstate); + mpz_clear (x); + return t; +} + 
+double
+speed_gmp_randseed_ui (struct speed_params *s)
+{
+  gmp_randstate_t  rstate;
+  unsigned  i, j;
+  double    t;
+  /* Time gmp_randseed_ui, seeding from successive limbs of s->xp_block.
+     The generator is whichever one speed_randinit selects from s->r.  */
+  SPEED_RESTRICT_COND (speed_randinit (s, rstate));
+
+  /* cache priming, with a fixed seed outside the timed region */
+  gmp_randseed_ui (rstate, 123L);
+
+  speed_starttime ();
+  i = s->reps;
+  j = 0;                        /* index into s->xp_block */
+  do
+    {
+      gmp_randseed_ui (rstate, (unsigned long) s->xp_block[j]);
+      j++;
+      if (j >= SPEED_BLOCK_SIZE)  /* wrap around the block */
+        j = 0;
+    }
+  while (--i != 0);
+  t = speed_endtime ();
+
+  gmp_randclear (rstate);
+  return t;
+}
+/* Time mpz_urandomb producing s->size bits per call, using the generator
+   speed_randinit selects from s->r.  Guarded by s->size >= 0.  */
+double
+speed_mpz_urandomb (struct speed_params *s)
+{
+  gmp_randstate_t  rstate;
+  mpz_t     z;
+  unsigned  i;
+  double    t;
+
+  SPEED_RESTRICT_COND (s->size >= 0);
+  SPEED_RESTRICT_COND (speed_randinit (s, rstate));
+
+  mpz_init (z);
+
+  /* cache priming; two untimed calls */
+  mpz_urandomb (z, rstate, (unsigned long) s->size);
+  mpz_urandomb (z, rstate, (unsigned long) s->size);
+
+  speed_starttime ();
+  i = s->reps;
+  do
+    mpz_urandomb (z, rstate, (unsigned long) s->size);
+  while (--i != 0);
+  t = speed_endtime ();
+
+  mpz_clear (z);
+  gmp_randclear (rstate);
+  return t;
+}
diff --git a/gmp-6.3.0/tune/div_qr_1_tune.c b/gmp-6.3.0/tune/div_qr_1_tune.c
new file mode 100644
index 0000000..2a623f0
--- /dev/null
+++ b/gmp-6.3.0/tune/div_qr_1_tune.c
@@ -0,0 +1,50 @@
+/* mpn/generic/div_qr_1, using tuned threshold and method.
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define TUNE_PROGRAM_BUILD 1 + +#include "gmp-impl.h" + +mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t); +mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t); +mp_limb_t mpn_div_qr_1n_pi1_3 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t); +mp_limb_t mpn_div_qr_1n_pi1_4 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t); + +#if !HAVE_NATIVE_mpn_div_qr_1n_pi1 +#define __gmpn_div_qr_1n_pi1 \ + (div_qr_1n_pi1_method <= 2 \ + ? (div_qr_1n_pi1_method == 1 ? mpn_div_qr_1n_pi1_1 : mpn_div_qr_1n_pi1_2) \ + : (div_qr_1n_pi1_method == 3 ? mpn_div_qr_1n_pi1_3 : mpn_div_qr_1n_pi1_4)) +#endif + +#undef mpn_div_qr_1 +#define mpn_div_qr_1 mpn_div_qr_1_tune + +#include "mpn/generic/div_qr_1.c" diff --git a/gmp-6.3.0/tune/div_qr_1n_pi1_1.c b/gmp-6.3.0/tune/div_qr_1n_pi1_1.c new file mode 100644 index 0000000..e64a3c7 --- /dev/null +++ b/gmp-6.3.0/tune/div_qr_1n_pi1_1.c @@ -0,0 +1,38 @@ +/* mpn/generic/div_qr_1n_pi1.c method 1. + +Copyright 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef DIV_QR_1N_METHOD +#define DIV_QR_1N_METHOD 1 +#undef mpn_div_qr_1n_pi1 +#define mpn_div_qr_1n_pi1 mpn_div_qr_1n_pi1_1 + +#include "mpn/generic/div_qr_1n_pi1.c" diff --git a/gmp-6.3.0/tune/div_qr_1n_pi1_2.c b/gmp-6.3.0/tune/div_qr_1n_pi1_2.c new file mode 100644 index 0000000..c5432ea --- /dev/null +++ b/gmp-6.3.0/tune/div_qr_1n_pi1_2.c @@ -0,0 +1,38 @@ +/* mpn/generic/div_qr_1n_pi1.c method 2. + +Copyright 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef DIV_QR_1N_METHOD +#define DIV_QR_1N_METHOD 2 +#undef mpn_div_qr_1n_pi1 +#define mpn_div_qr_1n_pi1 mpn_div_qr_1n_pi1_2 + +#include "mpn/generic/div_qr_1n_pi1.c" diff --git a/gmp-6.3.0/tune/div_qr_1n_pi1_3.c b/gmp-6.3.0/tune/div_qr_1n_pi1_3.c new file mode 100644 index 0000000..826244c --- /dev/null +++ b/gmp-6.3.0/tune/div_qr_1n_pi1_3.c @@ -0,0 +1,38 @@ +/* mpn/generic/div_qr_1n_pi1.c method 3. + +Copyright 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + +#undef DIV_QR_1N_METHOD +#define DIV_QR_1N_METHOD 3 +#undef mpn_div_qr_1n_pi1 +#define mpn_div_qr_1n_pi1 mpn_div_qr_1n_pi1_3 + +#include "mpn/generic/div_qr_1n_pi1.c" diff --git a/gmp-6.3.0/tune/div_qr_1n_pi1_4.c b/gmp-6.3.0/tune/div_qr_1n_pi1_4.c new file mode 100644 index 0000000..0f69ea0 --- /dev/null +++ b/gmp-6.3.0/tune/div_qr_1n_pi1_4.c @@ -0,0 +1,38 @@ +/* mpn/generic/div_qr_1n_pi1.c method 4. + +Copyright 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef DIV_QR_1N_METHOD +#define DIV_QR_1N_METHOD 4 +#undef mpn_div_qr_1n_pi1 +#define mpn_div_qr_1n_pi1 mpn_div_qr_1n_pi1_4 + +#include "mpn/generic/div_qr_1n_pi1.c" diff --git a/gmp-6.3.0/tune/divrem1div.c b/gmp-6.3.0/tune/divrem1div.c new file mode 100644 index 0000000..0089971 --- /dev/null +++ b/gmp-6.3.0/tune/divrem1div.c @@ -0,0 +1,41 @@ +/* mpn/generic/divrem_1.c forced to use plain udiv_qrnnd. + +Copyright 2000, 2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define OPERATION_divrem_1 + +#include "gmp-impl.h" + +#undef DIVREM_1_NORM_THRESHOLD +#undef DIVREM_1_UNNORM_THRESHOLD +#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX +#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX +#define __gmpn_divrem_1 mpn_divrem_1_div + +#include "mpn/generic/divrem_1.c" diff --git a/gmp-6.3.0/tune/divrem1inv.c b/gmp-6.3.0/tune/divrem1inv.c new file mode 100644 index 0000000..82c8528 --- /dev/null +++ b/gmp-6.3.0/tune/divrem1inv.c @@ -0,0 +1,41 @@ +/* mpn/generic/divrem_1.c forced to use mul-by-inverse udiv_qrnnd_preinv. + +Copyright 2000, 2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define OPERATION_divrem_1 + +#include "gmp-impl.h" + +#undef DIVREM_1_NORM_THRESHOLD +#undef DIVREM_1_UNNORM_THRESHOLD +#define DIVREM_1_NORM_THRESHOLD 0 +#define DIVREM_1_UNNORM_THRESHOLD 0 +#define __gmpn_divrem_1 mpn_divrem_1_inv + +#include "mpn/generic/divrem_1.c" diff --git a/gmp-6.3.0/tune/divrem2div.c b/gmp-6.3.0/tune/divrem2div.c new file mode 100644 index 0000000..8331d8f --- /dev/null +++ b/gmp-6.3.0/tune/divrem2div.c @@ -0,0 +1,40 @@ +/* mpn/generic/divrem_2.c forced to use plain udiv_qrnnd. */ + +/* +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + +#ifdef DIVREM_2_THRESHOLD +#undef DIVREM_2_THRESHOLD +#endif +#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX +#define __gmpn_divrem_2 mpn_divrem_2_div + +#include "mpn/generic/divrem_2.c" diff --git a/gmp-6.3.0/tune/divrem2inv.c b/gmp-6.3.0/tune/divrem2inv.c new file mode 100644 index 0000000..8ae87f5 --- /dev/null +++ b/gmp-6.3.0/tune/divrem2inv.c @@ -0,0 +1,40 @@ +/* mpn/generic/divrem_2.c forced to use udiv_qrnnd_preinv. */ + +/* +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#ifdef DIVREM_2_THRESHOLD +#undef DIVREM_2_THRESHOLD +#endif +#define DIVREM_2_THRESHOLD 0 +#define __gmpn_divrem_2 mpn_divrem_2_inv + +#include "mpn/generic/divrem_2.c" diff --git a/gmp-6.3.0/tune/freq.c b/gmp-6.3.0/tune/freq.c new file mode 100644 index 0000000..ee38506 --- /dev/null +++ b/gmp-6.3.0/tune/freq.c @@ -0,0 +1,893 @@ +/* CPU frequency determination. + +Copyright 1999-2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* Currently we don't get a CPU frequency on the following systems, + + alphaev5-cray-unicosmk2.0.6.X + times() has been seen at 13.33 ns (75 MHz), which is probably not the + cpu frequency. Measuring the cycle counter against that would be + possible though. But currently we don't use the cycle counter due to + unicos having int==8bytes where tune/alpha.asm assumes int==4bytes. + + m68040-unknown-netbsd1.4.1 + Not sure if the system even knows the cpu frequency. There's no + cycle counter to measure, though we could perhaps make a loop taking + a known number of cycles and measure that. + + power-ibm-aix4.2.1.0 + power2-ibm-aix4.3.1.0 + powerpc604-ibm-aix4.3.1.0 + powerpc604-ibm-aix4.3.3.0 + powerpc630-ibm-aix4.3.3.0 + powerpc-unknown-netbsd1.6 + Don't know where any info hides on these. mftb is not related to the + cpu frequency so doesn't help. + + sparc-unknown-linux-gnu [maybe] + Don't know where any info hides on this. 
+
+     t90-cray-unicos10.0.X
+         The times() call seems to be for instance 2.22 nanoseconds, which
+         might be the cpu frequency (450 mhz), but need to confirm that.
+
+*/
+
+#include "config.h"
+
+#if HAVE_INVENT_H
+#include <invent.h> /* for IRIX invent_cpuinfo_t */
+#endif
+
+#include <stdio.h>
+#include <stdlib.h> /* for getenv, qsort */
+#include <string.h> /* for memcmp */
+
+#if HAVE_UNISTD_H
+#include <unistd.h> /* for sysconf */
+#endif
+
+#include <sys/types.h>
+
+#if HAVE_SYS_ATTRIBUTES_H
+#include <sys/attributes.h>   /* for IRIX attr_get(), needs sys/types.h */
+#endif
+
+#if HAVE_SYS_IOGRAPH_H
+#include <sys/iograph.h>      /* for IRIX INFO_LBL_DETAIL_INVENT */
+#endif
+
+#if HAVE_SYS_PARAM_H     /* for constants needed by NetBSD <sys/sysctl.h> */
+#include <sys/param.h>   /* and needed by HPUX <sys/pstat.h> */
+#endif
+
+#if HAVE_SYS_PSTAT_H
+#include <sys/pstat.h>   /* for HPUX pstat_getprocessor() */
+#endif
+
+#if HAVE_SYS_SYSCTL_H
+#include <sys/sysctl.h>  /* for sysctlbyname() */
+#endif
+
+#if TIME_WITH_SYS_TIME
+# include <sys/time.h>  /* for struct timeval */
+# include <time.h>
+#else
+# if HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+
+#if HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>  /* for struct rusage */
+#endif
+
+#if HAVE_SYS_PROCESSOR_H
+#include <sys/processor.h>  /* for solaris processor_info_t */
+#endif
+
+/* On AIX 5.1 with gcc 2.9-aix51-020209 in -maix64 mode, <sys/sysinfo.h>
+   gets an error about "fill" in "struct cpuinfo" having a negative size,
+   apparently due to __64BIT_KERNEL not being defined because _KERNEL is not
+   defined.  Avoid this file if we don't actually need it, which we don't on
+   AIX since there's no getsysinfo there.  */
+#if HAVE_SYS_SYSINFO_H && HAVE_GETSYSINFO
+#include <sys/sysinfo.h>  /* for OSF getsysinfo */
+#endif
+
+#if HAVE_MACHINE_HAL_SYSINFO_H
+#include <machine/hal_sysinfo.h>  /* for OSF GSI_CPU_INFO, struct cpu_info */
+#endif
+
+/* Remove definitions from NetBSD <sys/param.h>, to avoid conflicts with
+   gmp-impl.h.  */
+#ifdef MIN
+#undef MIN
+#endif
+#ifdef MAX
+#undef MAX
+#endif
+
+#include "gmp-impl.h"
+
+#include "speed.h"
+
+
+/* For use at the top of a function that has an "int help" parameter.  When
+   help is non-zero, print the description STR and return 0 from the
+   enclosing function.  Note the hidden "return".  */
+#define HELP(str)                       \
+  if (help)                             \
+    {                                   \
+      printf (" - %s\n", str);          \
+      return 0;                         \
+    }
+
+
+/* GMP_CPU_FREQUENCY environment variable.
Should be in Hertz and can be
+   floating point, for example "450e6".
+
+   A value that does not give a positive, finite frequency (empty string,
+   garbage, 0, negative) is ignored with a message on stderr and 0 is
+   returned, rather than setting an infinite or negative cycle time.  */
+static int
+freq_environment (int help)
+{
+  char    *e;
+  double  freq;
+
+  HELP ("environment variable GMP_CPU_FREQUENCY (in Hertz)");
+
+  e = getenv ("GMP_CPU_FREQUENCY");
+  if (e == NULL)
+    return 0;
+
+  /* atof gives 0.0 for unparsable input.  The negated test also rejects
+     NaN, and the second comparison rejects an infinite frequency, whose
+     reciprocal is 0.  */
+  freq = atof (e);
+  if (! (freq > 0.0 && 1.0 / freq > 0.0))
+    {
+      fprintf (stderr,
+               "Ignoring GMP_CPU_FREQUENCY \"%s\": not a positive frequency in Hertz\n",
+               e);
+      return 0;
+    }
+
+  speed_cycletime = 1.0 / freq;
+
+  if (speed_option_verbose)
+    printf ("Using GMP_CPU_FREQUENCY %.2f for cycle time %.3g\n",
+            freq, speed_cycletime);
+
+  return 1;
+}
+
+
+/* getsysinfo is available on OSF, or 4.0 and up at least.
+   The man page (on 4.0) suggests a 0 return indicates information not
+   available, but that seems to be the normal return for GSI_CPU_INFO.
+
+   Only a -1 return is treated as failure.  c.mhz is taken to be in MHz.
+   Returns 1 if speed_cycletime was set, 0 otherwise (always 0 when
+   HAVE_GETSYSINFO is not set).  */
+static int
+freq_getsysinfo (int help)
+{
+#if HAVE_GETSYSINFO
+  struct cpu_info  c;
+  int              start;
+
+  HELP ("getsysinfo() GSI_CPU_INFO");
+
+  start = 0;
+  if (getsysinfo (GSI_CPU_INFO, (caddr_t) &c, sizeof (c),
+                  &start, NULL, NULL) != -1)
+    {
+      speed_cycletime = 1e-6 / (double) c.mhz;
+      if (speed_option_verbose)
+        printf ("Using getsysinfo() GSI_CPU_INFO %u for cycle time %.3g\n",
+                c.mhz, speed_cycletime);
+      return 1;
+    }
+#endif
+  return 0;
+}
+
+
+/* In HPUX 10 and up, pstat_getprocessor() psp_iticksperclktick is the
+   number of CPU cycles (ie. the CR16 register) per CLK_TCK.  HPUX 9 doesn't
+   have that field in pst_processor though, and has no apparent
+   equivalent.
+
+   The cycle time is 1 / (clk_tck() * psp_iticksperclktick).  Returns 1 if
+   speed_cycletime was set, 0 otherwise.  */
+
+static int
+freq_pstat_getprocessor (int help)
+{
+#if HAVE_PSTAT_GETPROCESSOR && HAVE_PSP_ITICKSPERCLKTICK
+  struct pst_processor  p;
+
+  HELP ("pstat_getprocessor() psp_iticksperclktick");
+
+  if (pstat_getprocessor (&p, sizeof(p), 1, 0) != -1)
+    {
+      long  c = clk_tck();
+      speed_cycletime = 1.0 / (c * p.psp_iticksperclktick);
+      if (speed_option_verbose)
+        printf ("Using pstat_getprocessor() psp_iticksperclktick %lu and clk_tck %ld for cycle time %.3g\n",
+                (unsigned long) p.psp_iticksperclktick, c,
+                speed_cycletime);
+      return 1;
+    }
+#endif
+  return 0;
+}
+
+
+/* i386 FreeBSD 2.2.8 sysctlbyname machdep.i586_freq is in Hertz.
+   There's no obvious defines available to get this from plain sysctl.  */
+static int
+freq_sysctlbyname_i586_freq (int help)
+{
+#if HAVE_SYSCTLBYNAME
+  unsigned  val;
+  size_t    size;
+
+  HELP ("sysctlbyname() machdep.i586_freq");
+
+  size = sizeof(val);
+  if (sysctlbyname ("machdep.i586_freq", &val, &size, NULL, 0) == 0
+      && size == sizeof(val))   /* value must have the expected width */
+    {
+      speed_cycletime = 1.0 / (double) val;
+      if (speed_option_verbose)
+        printf ("Using sysctlbyname() machdep.i586_freq %u for cycle time %.3g\n",
+                val, speed_cycletime);
+      return 1;
+    }
+#endif
+  return 0;
+}
+
+
+/* i386 FreeBSD 3.3 sysctlbyname machdep.tsc_freq is in Hertz.
+   There's no obvious defines to get this from plain sysctl.
+
+   Same shape as the machdep.i586_freq probe: the value is read into an
+   unsigned and accepted only if the returned size equals sizeof(val).
+   Returns 1 if speed_cycletime was set, 0 otherwise (always 0 when
+   HAVE_SYSCTLBYNAME is not set).  */
+
+static int
+freq_sysctlbyname_tsc_freq (int help)
+{
+#if HAVE_SYSCTLBYNAME
+  unsigned  val;
+  size_t    size;
+
+  HELP ("sysctlbyname() machdep.tsc_freq");
+
+  size = sizeof(val);
+  if (sysctlbyname ("machdep.tsc_freq", &val, &size, NULL, 0) == 0
+      && size == sizeof(val))   /* value must have the expected width */
+    {
+      speed_cycletime = 1.0 / (double) val;
+      if (speed_option_verbose)
+        printf ("Using sysctlbyname() machdep.tsc_freq %u for cycle time %.3g\n",
+                val, speed_cycletime);
+      return 1;
+    }
+#endif
+  return 0;
+}
+
+
+/* Apple powerpc Darwin 1.3 sysctl hw.cpufrequency is in hertz.  For some
+   reason only seems to be available from sysctl(), not sysctlbyname().
*/
+
+static int
+freq_sysctl_hw_cpufrequency (int help)
+{
+#if HAVE_SYSCTL && defined (CTL_HW) && defined (HW_CPU_FREQ)
+  int       mib[2];
+  unsigned  val;
+  size_t    size;
+
+  HELP ("sysctl() hw.cpufrequency");
+
+  mib[0] = CTL_HW;
+  mib[1] = HW_CPU_FREQ;
+  size = sizeof(val);
+  /* NOTE(review): the returned size is not compared with sizeof(val) here;
+     confirm whether that is intentional.  */
+  if (sysctl (mib, 2, &val, &size, NULL, 0) == 0)
+    {
+      speed_cycletime = 1.0 / (double) val;
+      if (speed_option_verbose)
+        printf ("Using sysctl() hw.cpufrequency %u for cycle time %.3g\n",
+                val, speed_cycletime);
+      return 1;
+    }
+#endif
+  return 0;
+}
+
+
+/* The following sysctl hw.model strings have been observed,
+
+       Alpha FreeBSD 4.1:   Digital AlphaPC 164LX 599 MHz
+       NetBSD 1.4:          Digital AlphaPC 164LX 599 MHz
+       NetBSD 1.6.1:        CY7C601 @ 40 MHz, TMS390C602A FPU
+
+   NetBSD 1.4 doesn't seem to have sysctlbyname, so sysctl() is used.
+
+   The string is scanned from every starting position for an unsigned
+   number followed by "MHz"; the first such match is used.  Returns 1 if
+   speed_cycletime was set, 0 otherwise.  */
+
+static int
+freq_sysctl_hw_model (int help)
+{
+#if HAVE_SYSCTL && defined (CTL_HW) && defined (HW_MODEL)
+  int       mib[2];
+  char      str[128];
+  unsigned  val;
+  size_t    size;
+  char      *p;
+  int       end;
+
+  HELP ("sysctl() hw.model");
+
+  mib[0] = CTL_HW;
+  mib[1] = HW_MODEL;
+  size = sizeof(str);
+  if (sysctl (mib, 2, str, &size, NULL, 0) == 0)
+    {
+      for (p = str; *p != '\0'; p++)
+        {
+          /* %n stores into end only if the literal "MHz" was matched, so
+             end != 0 tells a full match from a bare number.  */
+          end = 0;
+          if (sscanf (p, "%u MHz%n", &val, &end) == 1 && end != 0)
+            {
+              speed_cycletime = 1e-6 / (double) val;
+              if (speed_option_verbose)
+                printf ("Using sysctl() hw.model %u for cycle time %.3g\n",
+                        val, speed_cycletime);
+              return 1;
+            }
+        }
+    }
+#endif
+  return 0;
+}
+
+
+/* /proc/cpuinfo for linux kernel.
+
+   Linux doesn't seem to have any system call to get the CPU frequency, at
+   least not in 2.0.x or 2.2.x, so it's necessary to read /proc/cpuinfo.
+
+   i386 2.0.36 - "bogomips" is the CPU frequency.
+
+   i386 2.2.13 - has both "cpu MHz" and "bogomips", and it's "cpu MHz" which
+       is the frequency.
+
+   alpha 2.2.5 - "cycle frequency [Hz]" seems to be right, "BogoMIPS" is
+       very slightly different.
+
+   alpha 2.2.18pre21 - "cycle frequency [Hz]" is 0 on at least one system,
+       "BogoMIPS" seems near enough.
+
+   powerpc 2.2.19 - "clock" is the frequency, bogomips is something weird
+  */
+/* Lines are read in file order and the first line matching any of the
+   patterns below is used; later lines are not examined.  Returns 1 if
+   speed_cycletime was set, 0 otherwise.  */
+static int
+freq_proc_cpuinfo (int help)
+{
+  FILE    *fp;
+  char    buf[128];
+  double  val;
+  int     ret = 0;
+  int     end;
+
+  HELP ("linux kernel /proc/cpuinfo file, cpu MHz or bogomips");
+
+  if ((fp = fopen ("/proc/cpuinfo", "r")) != NULL)
+    {
+      while (fgets (buf, sizeof (buf), fp) != NULL)
+        {
+          /* a zero "cycle frequency" is skipped, see alpha 2.2.18pre21 */
+          if (sscanf (buf, "cycle frequency [Hz] : %lf", &val) == 1
+              && val != 0.0)
+            {
+              speed_cycletime = 1.0 / val;
+              if (speed_option_verbose)
+                printf ("Using /proc/cpuinfo \"cycle frequency\" %.2f for cycle time %.3g\n", val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+          if (sscanf (buf, "cpu MHz : %lf\n", &val) == 1)
+            {
+              speed_cycletime = 1e-6 / val;
+              if (speed_option_verbose)
+                printf ("Using /proc/cpuinfo \"cpu MHz\" %.2f for cycle time %.3g\n", val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+          /* end != 0 only when the "MHz" suffix was matched too */
+          end = 0;
+          if (sscanf (buf, "clock : %lfMHz\n%n", &val, &end) == 1 && end != 0)
+            {
+              speed_cycletime = 1e-6 / val;
+              if (speed_option_verbose)
+                printf ("Using /proc/cpuinfo \"clock\" %.2f for cycle time %.3g\n", val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+          if (sscanf (buf, "bogomips : %lf\n", &val) == 1
+              || sscanf (buf, "BogoMIPS : %lf\n", &val) == 1)
+            {
+              speed_cycletime = 1e-6 / val;
+              if (speed_option_verbose)
+                printf ("Using /proc/cpuinfo \"bogomips\" %.2f for cycle time %.3g\n", val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+        }
+      fclose (fp);
+    }
+  return ret;
+}
+
+
+/* /bin/sysinfo for SunOS 4.
+   Prints a line like: cpu0 is a "75 MHz TI,TMS390Z55" CPU
+
+   The program is run through popen and its output is scanned line by line
+   for the cpu0 pattern.  Returns 1 if speed_cycletime was set, 0 otherwise
+   (always 0 when HAVE_POPEN is not set).  */
+static int
+freq_sunos_sysinfo (int help)
+{
+  int     ret = 0;
+#if HAVE_POPEN
+  FILE    *fp;
+  char    buf[128];
+  double  val;
+  int     end;
+
+  HELP ("SunOS /bin/sysinfo program output, cpu0");
+
+  /* Error messages are sent to /dev/null in case /bin/sysinfo doesn't
+     exist.  The brackets are necessary for some shells. */
+  if ((fp = popen ("(/bin/sysinfo) 2>/dev/null", "r")) != NULL)
+    {
+      while (fgets (buf, sizeof (buf), fp) != NULL)
+        {
+          /* end != 0 only when the " MHz" text was matched too */
+          end = 0;
+          if (sscanf (buf, " cpu0 is a \"%lf MHz%n", &val, &end) == 1
+              && end != 0)
+            {
+              speed_cycletime = 1e-6 / val;
+              if (speed_option_verbose)
+                printf ("Using /bin/sysinfo \"cpu0 MHz\" %.2f for cycle time %.3g\n", val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+        }
+      pclose (fp);
+    }
+#endif
+  return ret;
+}
+
+
+/* "/etc/hw -r cpu" for SCO OpenUnix 8, printing a line like
+	The speed of the CPU is approximately 450MHz
+
+   Returns 1 if speed_cycletime was set, 0 otherwise (always 0 when
+   HAVE_POPEN is not set).
+ */
+static int
+freq_sco_etchw (int help)
+{
+  int     ret = 0;
+#if HAVE_POPEN
+  FILE    *fp;
+  char    buf[128];
+  double  val;
+  int     end;
+
+  HELP ("SCO /etc/hw program output");
+
+  /* Error messages are sent to /dev/null in case /etc/hw doesn't exist.
+     The brackets are necessary for some shells. */
+  if ((fp = popen ("(/etc/hw -r cpu) 2>/dev/null", "r")) != NULL)
+    {
+      while (fgets (buf, sizeof (buf), fp) != NULL)
+        {
+          /* end != 0 only when the "MHz" suffix was matched too */
+          end = 0;
+          if (sscanf (buf, " The speed of the CPU is approximately %lfMHz%n",
+                      &val, &end) == 1 && end != 0)
+            {
+              speed_cycletime = 1e-6 / val;
+              if (speed_option_verbose)
+                printf ("Using /etc/hw %.2f MHz, for cycle time %.3g\n",
+                        val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+        }
+      pclose (fp);
+    }
+#endif
+  return ret;
+}
+
+
+/* attr_get("/hw/cpunum/0",INFO_LBL_DETAIL_INVENT) ic_cpu_info.cpufq for
+   IRIX 6.5.  Past versions don't have INFO_LBL_DETAIL_INVENT,
+   invent_cpuinfo_t, or /hw/cpunum/0.
+
+   The same information is available from the "hinv -c processor" command,
+   but it seems better to make a system call where possible.
*/
+/* Returns 1 if speed_cycletime was set, 0 otherwise.  */
+static int
+freq_attr_get_invent (int help)
+{
+  int     ret = 0;
+#if HAVE_ATTR_GET && HAVE_INVENT_H && defined (INFO_LBL_DETAIL_INVENT)
+  invent_cpuinfo_t  inv;
+  int               len, val;
+
+  HELP ("attr_get(\"/hw/cpunum/0\") ic_cpu_info.cpufq");
+
+  /* accept only a full-sized record of class INV_PROCESSOR */
+  len = sizeof (inv);
+  if (attr_get ("/hw/cpunum/0", INFO_LBL_DETAIL_INVENT,
+                (char *) &inv, &len, 0) == 0
+      && len == sizeof (inv)
+      && inv.ic_gen.ig_invclass == INV_PROCESSOR)
+    {
+      val = inv.ic_cpu_info.cpufq;      /* taken to be in MHz */
+      speed_cycletime = 1e-6 / val;
+      if (speed_option_verbose)
+        printf ("Using attr_get(\"/hw/cpunum/0\") ic_cpu_info.cpufq %d MHz for cycle time %.3g\n", val, speed_cycletime);
+      ret = 1;
+    }
+#endif
+  return ret;
+}
+
+
+/* FreeBSD on i386 gives a line like the following at bootup, and which can
+   be read back from /var/run/dmesg.boot.
+
+       CPU: AMD Athlon(tm) Processor (755.29-MHz 686-class CPU)
+       CPU: Pentium 4 (1707.56-MHz 686-class CPU)
+       CPU: i486 DX4 (486-class CPU)
+
+   This is useful on FreeBSD 4.x, where there's no sysctl machdep.tsc_freq
+   or machdep.i586_freq.
+
+   It's better to use /var/run/dmesg.boot than to run /sbin/dmesg, since the
+   latter prints the current system message buffer, which is a limited size
+   and can wrap around if the system is up for a long time.
+
+   Only lines starting "CPU:" are examined, and each is scanned from every
+   position for "(<number>-MHz".  Returns 1 if speed_cycletime was set, 0
+   otherwise.  */
+
+static int
+freq_bsd_dmesg (int help)
+{
+  FILE    *fp;
+  char    buf[256], *p;
+  double  val;
+  int     ret = 0;
+  int     end;
+
+  HELP ("BSD /var/run/dmesg.boot file");
+
+  if ((fp = fopen ("/var/run/dmesg.boot", "r")) != NULL)
+    {
+      while (fgets (buf, sizeof (buf), fp) != NULL)
+        {
+          if (memcmp (buf, "CPU:", 4) == 0)
+            {
+              for (p = buf; *p != '\0'; p++)
+                {
+                  /* end != 0 only when "-MHz" was matched too */
+                  end = 0;
+                  if (sscanf (p, "(%lf-MHz%n", &val, &end) == 1 && end != 0)
+                    {
+                      speed_cycletime = 1e-6 / val;
+                      if (speed_option_verbose)
+                        printf ("Using /var/run/dmesg.boot CPU: %.2f MHz for cycle time %.3g\n", val, speed_cycletime);
+                      ret = 1;
+                      /* NOTE(review): this leaves only the for loop; the
+                         while loop keeps reading, so a later matching
+                         "CPU:" line would overwrite the value -- confirm
+                         that is intended.  */
+                      break;
+                    }
+                }
+            }
+        }
+      fclose (fp);
+    }
+  return ret;
+}
+
+
+/* "hinv -c processor" for IRIX.
The following lines have been seen,
+
+              1 150 MHZ IP20 Processor
+              2 195 MHZ IP27 Processors
+              Processor 0: 500 MHZ IP35
+
+   This information is available from attr_get() on IRIX 6.5 (see above),
+   but on IRIX 6.2 it's not clear where to look, so fall back on
+   parsing.
+
+   Returns 1 if speed_cycletime was set, 0 otherwise (always 0 when
+   HAVE_POPEN is not set).  */
+
+static int
+freq_irix_hinv (int help)
+{
+  int     ret = 0;
+#if HAVE_POPEN
+  FILE    *fp;
+  char    buf[128];
+  double  val;
+  int     nproc, end;
+
+  HELP ("IRIX \"hinv -c processor\" output");
+
+  /* Error messages are sent to /dev/null in case hinv doesn't exist.  The
+     brackets are necessary for some shells. */
+  if ((fp = popen ("(hinv -c processor) 2>/dev/null", "r")) != NULL)
+    {
+      while (fgets (buf, sizeof (buf), fp) != NULL)
+        {
+          end = 0;
+          if (sscanf (buf, "Processor 0: %lf MHZ%n", &val, &end) == 1
+              && end != 0)
+            {
+            found:
+              /* also reached by the goto below for the "N freq MHZ" form */
+              speed_cycletime = 1e-6 / val;
+              if (speed_option_verbose)
+                printf ("Using hinv -c processor \"%.2f MHZ\" for cycle time %.3g\n", val, speed_cycletime);
+              ret = 1;
+              break;
+            }
+          end = 0;
+          if (sscanf (buf, "%d %lf MHZ%n", &nproc, &val, &end) == 2
+              && end != 0)
+            goto found;
+        }
+      pclose (fp);
+    }
+#endif
+  return ret;
+}
+
+
+/* processor_info() for Solaris.  "psrinfo" is the command-line interface to
+   this.  "prtconf -vp" gives similar information.
+
+   Apple Darwin has a processor_info, but in an incompatible style.  It
+   doesn't have <sys/processor.h>, so test for that.
+
+   Processors 0 to sysconf(_SC_NPROCESSORS_CONF)-1 are queried and those
+   not P_ONLINE are skipped.  Returns 0 without setting speed_cycletime if
+   the online processors report different clocks, or if no online
+   processor reported a non-zero clock (which would otherwise mean a
+   division by zero below).  Returns 1 if speed_cycletime was set.  */
+
+static int
+freq_processor_info (int help)
+{
+#if HAVE_PROCESSOR_INFO && HAVE_SYS_PROCESSOR_H
+  processor_info_t  p;
+  int  i, n, mhz = 0;
+
+  HELP ("processor_info() pi_clock");
+
+  n = sysconf (_SC_NPROCESSORS_CONF);
+  for (i = 0; i < n; i++)
+    {
+      if (processor_info (i, &p) != 0)
+        continue;
+      if (p.pi_state != P_ONLINE)
+        continue;
+
+      if (mhz != 0 && p.pi_clock != mhz)
+        {
+          fprintf (stderr,
+                   "freq_processor_info(): There's more than one CPU and they have different clock speeds\n");
+          return 0;
+        }
+
+      mhz = p.pi_clock;
+    }
+
+  /* Nothing usable was found: sysconf failed, every processor_info call
+     failed, no processor is online, or pi_clock was 0.  */
+  if (mhz == 0)
+    return 0;
+
+  speed_cycletime = 1.0e-6 / (double) mhz;
+
+  if (speed_option_verbose)
+    printf ("Using processor_info() %d mhz for cycle time %.3g\n",
+            mhz, speed_cycletime);
+  return 1;
+
+#else
+  return 0;
+#endif
+}
+
+
+/* One reading for freq_measure.  The body is the FREQ_MEASURE_ONE macro,
+   defined elsewhere, given gettimeofday as the clock and
+   speed_cyclecounter as the cycle counter.  */
+#if HAVE_SPEED_CYCLECOUNTER && HAVE_GETTIMEOFDAY
+static double
+freq_measure_gettimeofday_one (void)
+{
+#define call_gettimeofday(t)   gettimeofday (&(t), NULL)
+#define timeval_tv_sec(t)      ((t).tv_sec)
+#define timeval_tv_usec(t)     ((t).tv_usec)
+  FREQ_MEASURE_ONE ("gettimeofday", struct timeval,
+                    call_gettimeofday, speed_cyclecounter,
+                    timeval_tv_sec, timeval_tv_usec);
+}
+#endif
+
+/* As above, but with getrusage user time (ru_utime) as the clock.  */
+#if HAVE_SPEED_CYCLECOUNTER && HAVE_GETRUSAGE
+static double
+freq_measure_getrusage_one (void)
+{
+#define call_getrusage(t)   getrusage (0, &(t))
+#define rusage_tv_sec(t)    ((t).ru_utime.tv_sec)
+#define rusage_tv_usec(t)   ((t).ru_utime.tv_usec)
+  FREQ_MEASURE_ONE ("getrusage", struct rusage,
+                    call_getrusage, speed_cyclecounter,
+                    rusage_tv_sec, rusage_tv_usec);
+}
+#endif
+
+
+/* MEASURE_MATCH is how many readings within MEASURE_TOLERANCE of each other
+   are required.  This must be at least 2.
*/
+#define MEASURE_MAX_ATTEMPTS   20
+#define MEASURE_TOLERANCE      1.005  /* 0.5% */
+#define MEASURE_MATCH          3
+
+/* Take up to MEASURE_MAX_ATTEMPTS readings from ONE, re-sorting the readings
+   collected so far after each new one (double_cmp_ptr is presumably an
+   ascending comparison).  As soon as some run of MEASURE_MATCH consecutive
+   sorted readings has its last entry no more than MEASURE_TOLERANCE times
+   its first, return the mean of those two entries.  Return -1.0 if that
+   never happens.  NAME is not used in this function.  */
+
+double
+freq_measure (const char *name, double (*one) (void))
+{
+  double t[MEASURE_MAX_ATTEMPTS];
+  int taken, lo, hi, k;
+
+  for (taken = 1; taken <= numberof (t); taken++)
+    {
+      t[taken-1] = (*one) ();
+      qsort (t, taken, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
+
+      if (speed_option_verbose >= 3)
+        for (k = 0; k < taken; k++)
+          printf (" t[%d] is %.6g\n", k, t[k]);
+
+      /* slide a window of MEASURE_MATCH entries, t[lo] to t[hi], over the
+         sorted readings */
+      for (lo = 0, hi = MEASURE_MATCH-1; hi < taken; lo++, hi++)
+        if (t[hi] <= t[lo] * MEASURE_TOLERANCE)
+          return (t[hi] + t[lo]) / 2.0;
+    }
+  return -1.0;
+}
+
+/* Measure the cycle counter against getrusage().  Returns 1 with
+   speed_cycletime set to the measured value, or 0 if getrusage isn't
+   microsecond capable, the cycle counter doesn't work, freq_measure gives
+   up (-1.0), or the necessary features aren't configured.  */
+
+static int
+freq_measure_getrusage (int help)
+{
+#if HAVE_SPEED_CYCLECOUNTER && HAVE_GETRUSAGE
+  double measured;
+
+  if (! (getrusage_microseconds_p () && cycles_works_p ()))
+    return 0;
+
+  HELP ("cycle counter measured with microsecond getrusage()");
+
+  measured = freq_measure ("getrusage", freq_measure_getrusage_one);
+  if (measured == -1.0)
+    return 0;
+
+  speed_cycletime = measured;
+  if (speed_option_verbose)
+    printf ("Using getrusage() measured cycle counter %.4g (%.2f MHz)\n",
+            speed_cycletime, 1e-6/speed_cycletime);
+  return 1;
+
+#else
+  return 0;
+#endif
+}
+
+/* Same as freq_measure_getrusage, but timing with gettimeofday().  */
+
+static int
+freq_measure_gettimeofday (int help)
+{
+#if HAVE_SPEED_CYCLECOUNTER && HAVE_GETTIMEOFDAY
+  double measured;
+
+  if (! (gettimeofday_microseconds_p () && cycles_works_p ()))
+    return 0;
+
+  HELP ("cycle counter measured with microsecond gettimeofday()");
+
+  measured = freq_measure ("gettimeofday", freq_measure_gettimeofday_one);
+  if (measured == -1.0)
+    return 0;
+
+  speed_cycletime = measured;
+  if (speed_option_verbose)
+    printf ("Using gettimeofday() measured cycle counter %.4g (%.2f MHz)\n",
+            speed_cycletime, 1e-6/speed_cycletime);
+  return 1;
+#else
+  return 0;
+#endif
+}
+
+
+/* Each function returns 1 if it succeeds in setting speed_cycletime, or 0
+   if not.
+
+   In general system call tests are first since they're fast, then file
+   tests, then tests running programs.  Necessary exceptions to this rule
+   are noted.  The measuring is last since it's time consuming, and rather
+   wasteful of cpu.  */
+
+/* Call the methods one after another with HELP passed through, stopping at
+   the first that returns non-zero.  Returns 1 in that case and 0 when every
+   method returned 0.  */
+
+static int
+freq_all (int help)
+{
+  /* This should be first, so an environment variable can override
+     anything the system gives.  */
+  if (freq_environment (help))
+    return 1;
+
+  if (freq_attr_get_invent (help))
+    return 1;
+  if (freq_getsysinfo (help))
+    return 1;
+  if (freq_pstat_getprocessor (help))
+    return 1;
+  if (freq_sysctl_hw_model (help))
+    return 1;
+  if (freq_sysctl_hw_cpufrequency (help))
+    return 1;
+  if (freq_sysctlbyname_i586_freq (help))
+    return 1;
+  if (freq_sysctlbyname_tsc_freq (help))
+    return 1;
+
+  /* SCO openunix 8 puts a dummy pi_clock==16 in processor_info, so be
+     sure to check /etc/hw before that function.  */
+  if (freq_sco_etchw (help))
+    return 1;
+
+  if (freq_processor_info (help))
+    return 1;
+  if (freq_proc_cpuinfo (help))
+    return 1;
+  if (freq_bsd_dmesg (help))
+    return 1;
+  if (freq_irix_hinv (help))
+    return 1;
+  if (freq_sunos_sysinfo (help))
+    return 1;
+  if (freq_measure_getrusage (help))
+    return 1;
+  if (freq_measure_gettimeofday (help))
+    return 1;
+
+  return 0;
+}
+
+
+/* Try to establish speed_cycletime through freq_all.  Only the first call
+   does anything; later calls return immediately.  A failure is only
+   reported, on stdout, when speed_option_verbose is set.  */
+
+void
+speed_cycletime_init (void)
+{
+  static int done_once = 0;
+
+  if (! done_once)
+    {
+      done_once = 1;
+
+      if (! freq_all (0) && speed_option_verbose)
+        printf ("CPU frequency couldn't be determined\n");
+    }
+}
+
+
+/* Print to stderr the current speed_time_string and the complaint STR, then
+   call freq_all in help mode (presumably listing the methods, HELP itself
+   is defined outside this part of the file) and abort.  */
+
+void
+speed_cycletime_fail (const char *str)
+{
+  FILE *err = stderr;
+
+  fprintf (err, "Measuring with: %s\n", speed_time_string);
+  fprintf (err, "%s,\n", str);
+  fprintf (err, "but none of the following are available,\n");
+  freq_all (1);
+  abort ();
+}
+
+/* speed_time_init leaves speed_cycletime set to either 0.0 or 1.0 when the
+   CPU frequency is unknown.  0.0 is when the time base is in seconds, so
+   that's no good if cycles are wanted.  1.0 is when the time base is in
+   cycles, which conversely is no good if seconds are wanted.
*/
+
+/* Run speed_time_init, then fail via speed_cycletime_fail if speed_cycletime
+   is still the 0.0 marker.  */
+void
+speed_cycletime_need_cycles (void)
+{
+  speed_time_init ();
+  if (speed_cycletime != 0.0)
+    return;
+  speed_cycletime_fail ("Need to know CPU frequency to give times in cycles");
+}
+
+/* Run speed_time_init, then fail via speed_cycletime_fail if speed_cycletime
+   is still the 1.0 marker.  */
+void
+speed_cycletime_need_seconds (void)
+{
+  speed_time_init ();
+  if (speed_cycletime != 1.0)
+    return;
+  speed_cycletime_fail ("Need to know CPU frequency to convert cycles to seconds");
+}
diff --git a/gmp-6.3.0/tune/gcdext_double.c b/gmp-6.3.0/tune/gcdext_double.c
new file mode 100644
index 0000000..2b2ba15
--- /dev/null
+++ b/gmp-6.3.0/tune/gcdext_double.c
@@ -0,0 +1,38 @@
+/* mpn/generic/gcdext.c forced to use double limb calculations. */
+
+/*
+Copyright 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
*/ + +#include "gmp-impl.h" + +#undef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 0 +#define __gmpn_gcdext mpn_gcdext_double + +#include "../mpn/generic/gcdext.c" diff --git a/gmp-6.3.0/tune/gcdext_single.c b/gmp-6.3.0/tune/gcdext_single.c new file mode 100644 index 0000000..3c1d28c --- /dev/null +++ b/gmp-6.3.0/tune/gcdext_single.c @@ -0,0 +1,38 @@ +/* mpn/generic/gcdext.c forced to use single limb calculations. */ + +/* +Copyright 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD MP_SIZE_T_MAX +#define __gmpn_gcdext mpn_gcdext_single + +#include "../mpn/generic/gcdext.c" diff --git a/gmp-6.3.0/tune/gcdextod.c b/gmp-6.3.0/tune/gcdextod.c new file mode 100644 index 0000000..f40cae6 --- /dev/null +++ b/gmp-6.3.0/tune/gcdextod.c @@ -0,0 +1,39 @@ +/* mpn/generic/gcdext.c forced to one double limb step. */ + +/* +Copyright 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 0 +#define WANT_GCDEXT_ONE_STEP 1 +#define __gmpn_gcdext mpn_gcdext_one_double + +#include "../mpn/generic/gcdext.c" diff --git a/gmp-6.3.0/tune/gcdextos.c b/gmp-6.3.0/tune/gcdextos.c new file mode 100644 index 0000000..f51ff52 --- /dev/null +++ b/gmp-6.3.0/tune/gcdextos.c @@ -0,0 +1,39 @@ +/* mpn/generic/gcdext.c forced to one single limb step. */ + +/* +Copyright 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD MP_SIZE_T_MAX +#define WANT_GCDEXT_ONE_STEP 1 +#define __gmpn_gcdext mpn_gcdext_one_single + +#include "../mpn/generic/gcdext.c" diff --git a/gmp-6.3.0/tune/hgcd2-1.c b/gmp-6.3.0/tune/hgcd2-1.c new file mode 100644 index 0000000..1e8948c --- /dev/null +++ b/gmp-6.3.0/tune/hgcd2-1.c @@ -0,0 +1,39 @@ +/* mpn/generic/hgcd2.c method 1. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + +#undef HGCD2_DIV1_METHOD +#define HGCD2_DIV1_METHOD 1 +#define __gmpn_hgcd2 mpn_hgcd2_1 +/* Not used, but renamed to not get duplicate definitions */ +#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_1 + +#include "mpn/generic/hgcd2.c" diff --git a/gmp-6.3.0/tune/hgcd2-2.c b/gmp-6.3.0/tune/hgcd2-2.c new file mode 100644 index 0000000..bbb123b --- /dev/null +++ b/gmp-6.3.0/tune/hgcd2-2.c @@ -0,0 +1,39 @@ +/* mpn/generic/hgcd2.c method 2. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef HGCD2_DIV1_METHOD +#define HGCD2_DIV1_METHOD 2 +#define __gmpn_hgcd2 mpn_hgcd2_2 +/* Not used, but renamed to not get duplicate definitions */ +#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_2 + +#include "mpn/generic/hgcd2.c" diff --git a/gmp-6.3.0/tune/hgcd2-3.c b/gmp-6.3.0/tune/hgcd2-3.c new file mode 100644 index 0000000..ac62108 --- /dev/null +++ b/gmp-6.3.0/tune/hgcd2-3.c @@ -0,0 +1,39 @@ +/* mpn/generic/hgcd2.c method 3. 
+ +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef HGCD2_DIV1_METHOD +#define HGCD2_DIV1_METHOD 3 +#define __gmpn_hgcd2 mpn_hgcd2_3 +/* Not used, but renamed to not get duplicate definitions */ +#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_3 + +#include "mpn/generic/hgcd2.c" diff --git a/gmp-6.3.0/tune/hgcd2-4.c b/gmp-6.3.0/tune/hgcd2-4.c new file mode 100644 index 0000000..ec7f927 --- /dev/null +++ b/gmp-6.3.0/tune/hgcd2-4.c @@ -0,0 +1,39 @@ +/* mpn/generic/hgcd2.c method 4. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef HGCD2_DIV1_METHOD +#define HGCD2_DIV1_METHOD 4 +#define __gmpn_hgcd2 mpn_hgcd2_4 +/* Not used, but renamed to not get duplicate definitions */ +#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_4 + +#include "mpn/generic/hgcd2.c" diff --git a/gmp-6.3.0/tune/hgcd2-5.c b/gmp-6.3.0/tune/hgcd2-5.c new file mode 100644 index 0000000..ed66171 --- /dev/null +++ b/gmp-6.3.0/tune/hgcd2-5.c @@ -0,0 +1,39 @@ +/* mpn/generic/hgcd2.c method 5. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef HGCD2_DIV1_METHOD +#define HGCD2_DIV1_METHOD 5 +#define __gmpn_hgcd2 mpn_hgcd2_5 +/* Not used, but renamed to not get duplicate definitions */ +#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_5 + +#include "mpn/generic/hgcd2.c" diff --git a/gmp-6.3.0/tune/hgcd2.c b/gmp-6.3.0/tune/hgcd2.c new file mode 100644 index 0000000..146af72 --- /dev/null +++ b/gmp-6.3.0/tune/hgcd2.c @@ -0,0 +1,49 @@ +/* mpn/generic/hgcd2.c for tuning + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/
+
+#define TUNE_PROGRAM_BUILD 1
+
+#include "gmp-impl.h"
+
+hgcd2_func_t mpn_hgcd2_default;
+
+/* The routine mpn_hgcd2 forwards to.  Starts out as mpn_hgcd2_default;
+   presumably the tuning code repoints it at other variants -- that is not
+   visible in this file.  */
+hgcd2_func_t *hgcd2_func = &mpn_hgcd2_default;
+
+/* Pass all arguments through hgcd2_func and return its result.  */
+int
+mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
+	   struct hgcd_matrix1 *M)
+{
+  return hgcd2_func(ah, al, bh, bl, M);
+}
+
+/* From here on the name mpn_hgcd2 expands to mpn_hgcd2_default, so the
+   generic source included below is presumably compiled under that name
+   rather than clashing with the forwarder above.  This has to stay after
+   the definition of mpn_hgcd2.  */
+#undef mpn_hgcd2
+#define mpn_hgcd2 mpn_hgcd2_default
+
+#include "mpn/generic/hgcd2.c"
diff --git a/gmp-6.3.0/tune/hgcd_appr_lehmer.c b/gmp-6.3.0/tune/hgcd_appr_lehmer.c
new file mode 100644
index 0000000..aa43a07
--- /dev/null
+++ b/gmp-6.3.0/tune/hgcd_appr_lehmer.c
@@ -0,0 +1,39 @@
+/* mpn/generic/hgcd_appr.c forced to use Lehmer's quadratic algorithm. */
+
+/*
+Copyright 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
*/ + +#include "gmp-impl.h" + +#undef HGCD_APPR_THRESHOLD +#define HGCD_APPR_THRESHOLD MP_SIZE_T_MAX +#define __gmpn_hgcd_appr mpn_hgcd_appr_lehmer +#define __gmpn_hgcd_appr_itch mpn_hgcd_appr_lehmer_itch + +#include "../mpn/generic/hgcd_appr.c" diff --git a/gmp-6.3.0/tune/hgcd_lehmer.c b/gmp-6.3.0/tune/hgcd_lehmer.c new file mode 100644 index 0000000..364749d --- /dev/null +++ b/gmp-6.3.0/tune/hgcd_lehmer.c @@ -0,0 +1,39 @@ +/* mpn/generic/hgcd.c forced to use Lehmer's quadratic algorithm. */ + +/* +Copyright 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef HGCD_THRESHOLD +#define HGCD_THRESHOLD MP_SIZE_T_MAX +#define __gmpn_hgcd mpn_hgcd_lehmer +#define __gmpn_hgcd_itch mpn_hgcd_lehmer_itch + +#include "../mpn/generic/hgcd.c" diff --git a/gmp-6.3.0/tune/hgcd_reduce_1.c b/gmp-6.3.0/tune/hgcd_reduce_1.c new file mode 100644 index 0000000..5052233 --- /dev/null +++ b/gmp-6.3.0/tune/hgcd_reduce_1.c @@ -0,0 +1,40 @@ +/* mpn/generic/hgcd_reduce.c forced to use hgcd. 
*/ + +/* +Copyright 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef HGCD_REDUCE_THRESHOLD +#define HGCD_REDUCE_THRESHOLD MP_SIZE_T_MAX +#define __gmpn_hgcd_reduce mpn_hgcd_reduce_1 +#define __gmpn_hgcd_reduce_itch mpn_hgcd_reduce_1_itch + + +#include "../mpn/generic/hgcd_reduce.c" diff --git a/gmp-6.3.0/tune/hgcd_reduce_2.c b/gmp-6.3.0/tune/hgcd_reduce_2.c new file mode 100644 index 0000000..5d802e0 --- /dev/null +++ b/gmp-6.3.0/tune/hgcd_reduce_2.c @@ -0,0 +1,39 @@ +/* mpn/generic/hgcd_reduce.c forced to use hgcd_appr. */ + +/* +Copyright 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef HGCD_REDUCE_THRESHOLD +#define HGCD_REDUCE_THRESHOLD 0 +#define __gmpn_hgcd_reduce mpn_hgcd_reduce_2 +#define __gmpn_hgcd_reduce_itch mpn_hgcd_reduce_2_itch + +#include "../mpn/generic/hgcd_reduce.c" diff --git a/gmp-6.3.0/tune/hppa.asm b/gmp-6.3.0/tune/hppa.asm new file mode 100644 index 0000000..fc9d62e --- /dev/null +++ b/gmp-6.3.0/tune/hppa.asm @@ -0,0 +1,42 @@ +dnl HPPA 32-bit time stamp counter access routine. + +dnl Copyright 2000, 2002, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl void speed_cyclecounter (unsigned p[2]);
+dnl
+dnl Get the HPPA interval timer.
+
+PROLOGUE(speed_cyclecounter)
+	mfctl	%cr16,%r28		; the timer read
+	stw	%r28,0(0,%r26)		; first word of p -- assumes r26 holds p
+	bv	0(%r2)
+	stw	%r0,4(0,%r26)		; second word of p from r0 -- presumably zero. TODO confirm
+EPILOGUE(speed_cyclecounter)
diff --git a/gmp-6.3.0/tune/hppa2.asm b/gmp-6.3.0/tune/hppa2.asm
new file mode 100644
index 0000000..57ef4c4
--- /dev/null
+++ b/gmp-6.3.0/tune/hppa2.asm
@@ -0,0 +1,44 @@
+dnl  HPPA 64-bit time stamp counter access routine.
+
+dnl  Copyright 2000, 2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl void speed_cyclecounter (unsigned p[2]);
+dnl
+dnl Get the HPPA interval timer.
+
+	.level 2.0
+PROLOGUE(speed_cyclecounter)
+	mfctl	%cr16,%r28		; the timer read
+	stw	%r28,0(0,%r26)		; low word
+	extrd,u	%r28,31,32,%r28		; presumably brings the upper 32 bits down for the next store
+	bve	(%r2)
+	stw	%r28,4(0,%r26)		; high word
+EPILOGUE(speed_cyclecounter)
diff --git a/gmp-6.3.0/tune/hppa2w.asm b/gmp-6.3.0/tune/hppa2w.asm
new file mode 100644
index 0000000..215a0cc
--- /dev/null
+++ b/gmp-6.3.0/tune/hppa2w.asm
@@ -0,0 +1,44 @@
+dnl  HPPA 64-bit time stamp counter access routine.
+
+dnl  Copyright 2000, 2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl void speed_cyclecounter (unsigned p[2]);
+dnl
+dnl Get the HPPA interval timer.
+
+	.level 2.0w
+PROLOGUE(speed_cyclecounter)
+	mfctl	%cr16,%r28		; the timer read
+	stw	%r28,0(0,%r26)		; low word
+	extrd,u	%r28,31,32,%r28		; presumably brings the upper 32 bits down for the next store
+	bve	(%r2)
+	stw	%r28,4(0,%r26)		; high word
+EPILOGUE(speed_cyclecounter)
diff --git a/gmp-6.3.0/tune/ia64.asm b/gmp-6.3.0/tune/ia64.asm
new file mode 100644
index 0000000..0651111
--- /dev/null
+++ b/gmp-6.3.0/tune/ia64.asm
@@ -0,0 +1,47 @@
+dnl  IA-64 time stamp counter access routine.
+
+dnl  Copyright 2000, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C void speed_cyclecounter (unsigned int p[2]);
+C Reads ar.itc, stores 4 bytes of it, shifts it right by 32 and stores 4 bytes again -- presumably to p[0] then p[1], assuming r32 holds p.
+
+ASM_START()
+PROLOGUE(speed_cyclecounter)
+	mov	r14 = ar.itc
+	;;
+	st4	[r32] = r14, 4
+	shr.u	r14 = r14, 32
+	;;
+	st4	[r32] = r14
+	br.ret.sptk.many b0
+EPILOGUE(speed_cyclecounter)
+ASM_END()
diff --git a/gmp-6.3.0/tune/jacbase1.c b/gmp-6.3.0/tune/jacbase1.c
new file mode 100644
index 0000000..89a584d
--- /dev/null
+++ b/gmp-6.3.0/tune/jacbase1.c
@@ -0,0 +1,37 @@
+/* mpn/generic/jacbase.c method 1.
+ +Copyright 2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef JACOBI_BASE_METHOD +#define JACOBI_BASE_METHOD 1 +#define __gmpn_jacobi_base mpn_jacobi_base_1 + +#include "mpn/generic/jacbase.c" diff --git a/gmp-6.3.0/tune/jacbase2.c b/gmp-6.3.0/tune/jacbase2.c new file mode 100644 index 0000000..253d835 --- /dev/null +++ b/gmp-6.3.0/tune/jacbase2.c @@ -0,0 +1,37 @@ +/* mpn/generic/jacbase.c method 2. + +Copyright 2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef JACOBI_BASE_METHOD +#define JACOBI_BASE_METHOD 2 +#define __gmpn_jacobi_base mpn_jacobi_base_2 + +#include "mpn/generic/jacbase.c" diff --git a/gmp-6.3.0/tune/jacbase3.c b/gmp-6.3.0/tune/jacbase3.c new file mode 100644 index 0000000..4440f31 --- /dev/null +++ b/gmp-6.3.0/tune/jacbase3.c @@ -0,0 +1,37 @@ +/* mpn/generic/jacbase.c method 3. + +Copyright 2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + +#undef JACOBI_BASE_METHOD +#define JACOBI_BASE_METHOD 3 +#define __gmpn_jacobi_base mpn_jacobi_base_3 + +#include "mpn/generic/jacbase.c" diff --git a/gmp-6.3.0/tune/jacbase4.c b/gmp-6.3.0/tune/jacbase4.c new file mode 100644 index 0000000..daea3bb --- /dev/null +++ b/gmp-6.3.0/tune/jacbase4.c @@ -0,0 +1,37 @@ +/* mpn/generic/jacbase.c method 4. + +Copyright 2002, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef JACOBI_BASE_METHOD +#define JACOBI_BASE_METHOD 4 +#define __gmpn_jacobi_base mpn_jacobi_base_4 + +#include "mpn/generic/jacbase.c" diff --git a/gmp-6.3.0/tune/many.pl b/gmp-6.3.0/tune/many.pl new file mode 100644 index 0000000..524a67d --- /dev/null +++ b/gmp-6.3.0/tune/many.pl @@ -0,0 +1,1334 @@ +#! /usr/bin/perl -w + +# Copyright 2000-2002 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. 
+# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + + +# Usage: cd $builddir/tune +# perl $srcdir/tune/many.pl [-t] ... +# +# Output: speed-many.c +# try-many.c +# Makefile.many +# +# Make alternate versions of various mpn routines available for measuring +# and testing. +# +# The $srcdir and $builddir in the invocation above just means the script +# lives in the tune source directory, but should be run in the tune build +# directory. When not using a separate object directory this just becomes +# +# cd tune +# perl many.pl [-t] ... 
+# +# +# SINGLE FILES +# +# Suppose $HOME/newcode/mul_1_experiment.asm is a new implementation of +# mpn_mul_1, then +# +# cd $builddir/tune +# perl $srcdir/tune/many.pl $HOME/newcode/mul_1_experiment.asm +# +# will produce rules and renaming so that a speed program incorporating it +# can be built, +# +# make -f Makefile.many speed-many +# +# then for example it can be compared to the standard mul_1, +# +# ./speed-many -s 1-30 mpn_mul_1 mpn_mul_1_experiment +# +# An expanded try program can be used to check correctness, +# +# make -f Makefile.many try-many +# +# and run +# +# ./try-many mpn_mul_1_experiment +# +# Files can be ".c", ".S" or ".asm". ".s" files can't be used because they +# don't get any preprocessing so there's no way to do renaming of their +# functions. +# +# +# WHOLE DIRECTORIES +# +# If a directory is given, then all files in it will be made available. +# For example, +# +# cd $builddir/tune +# perl $srcdir/tune/many.pl $HOME/newcode +# +# Each file should have a suffix, like "_experiment" above. +# +# +# MPN DIRECTORIES +# +# mpn directories from the GMP source tree can be included, and this is a +# convenient way to compare multiple implementations suiting different chips +# in a CPU family. For example the following would make all x86 routines +# available, +# +# cd $builddir/tune +# perl $srcdir/tune/many.pl `find $srcdir/mpn/x86 -type d` +# +# On a new x86 chip a comparison could then be made to see how existing code +# runs. For example, +# +# make -f Makefile.many speed-many +# ./speed-many -s 1-30 -c \ +# mpn_add_n_x86 mpn_add_n_pentium mpn_add_n_k6 mpn_add_n_k7 +# +# Files in "mpn" subdirectories don't need the "_experiment" style suffix +# described above, instead a suffix is constructed from the subdirectory. +# For example "mpn/x86/k7/mmx/mod_1.asm" will generate a function +# mpn_mod_1_k7_mmx. The rule is to take the last directory name after the +# "mpn", or the last two if there's three or more. 
(Check the generated +# speed-many.c if in doubt.) +# +# +# GENERIC C +# +# The mpn/generic directory can be included too, just like any processor +# specific directory. This is a good way to compare assembler and generic C +# implementations. For example, +# +# cd $builddir/tune +# perl $srcdir/tune/many.pl $srcdir/mpn/generic +# +# or if just a few routines are of interest, then for example +# +# cd $builddir/tune +# perl $srcdir/tune/many.pl \ +# $srcdir/mpn/generic/lshift.c \ +# $srcdir/mpn/generic/mod_1.c \ +# $srcdir/mpn/generic/aorsmul_1.c +# +# giving mpn_lshift_generic etc. +# +# +# TESTS/DEVEL PROGRAMS +# +# Makefile.many also has rules to build the tests/devel programs with suitable +# renaming, and with some parameters for correctness or speed. This is less +# convenient than the speed and try programs, but provides an independent +# check. For example, +# +# make -f Makefile.many tests_mul_1_experiment +# ./tests_mul_1_experiment +# +# and for speed +# +# make -f Makefile.many tests_mul_1_experiment_sp +# ./tests_mul_1_experiment_sp +# +# Not all the programs support speed measuring, in which case only the +# correctness test will be useful. +# +# The parameters for repetitions and host clock speed are -D defines. Some +# defaults are provided at the end of Makefile.many, but probably these will +# want to be overridden. For example, +# +# rm tests_mul_1_experiment.o +# make -f Makefile.many \ +# CFLAGS_TESTS="-DSIZE=50 -DTIMES=1000 -DRANDOM -DCLOCK=175000000" \ +# tests_mul_1_experiment +# ./tests_mul_1_experiment +# +# +# OTHER NOTES +# +# The mappings of file names to functions, and the macros to then use for +# speed measuring etc are driven by @table below. The scheme isn't +# completely general, it's only got as many variations as have been needed +# so far. +# +# Some functions are only made available in speed-many, or others only in +# try-many.
An @table entry speed=>none means no speed measuring is +# available, or try=>none no try program testing. These can be removed +# if/when the respective programs get the necessary support. +# +# If a file has "1c" or "nc" carry-in entrypoints, they're renamed and made +# available too. These are recognised from PROLOGUE or MULFUNC_PROLOGUE in +# .S and .asm files, or from a line starting with "mpn_foo_1c" in a .c file +# (possibly via a #define), and on that basis are entirely optional. This +# entrypoint matching is done for the standard entrypoints too, but it would +# be very unusual to have for instance a mul_1c without a mul_1. +# +# Some mpz files are recognized. For example an experimental copy of +# mpz/powm.c could be included as powm_new.c and would be called +# mpz_powm_new. So far only speed measuring is available for these. +# +# For the ".S" and ".asm" files, both PIC and non-PIC objects are built. +# The PIC functions have a "_pic" suffix, for example "mpn_mod_1_k7_mmx_pic". +# This can be ignored for routines that don't differ for PIC, or for CPUs +# where everything is PIC anyway. +# +# K&R compilers are supported via the same ansi2knr mechanism used by +# automake, though it's hard to believe anyone will have much interest in +# measuring a compiler so old that it doesn't even have an ANSI mode. +# +# The "-t" option can be used to print a trace of the files found and what's +# done with them. A great deal of obscure output is produced, but it can +# indicate where or why some files aren't being recognised etc. For +# example, +# +# cd $builddir/tune +# perl $srcdir/tune/many.pl -t $HOME/newcode/add_n_weird.asm +# +# In general, when including new code, all that's really necessary is that +# it will compile or assemble under the current configuration. 
It's fine if +# some code doesn't actually run due to bugs, or to needing a newer CPU or +# whatever, simply don't ask for the offending routines when invoking +# speed-many or try-many, or don't try to run them on sizes they don't yet +# support, or whatever. +# +# +# CPU SPECIFICS +# +# x86 - All the x86 code will assemble on any system, but code for newer +# chips might not run on older chips. Expect SIGILLs from new +# instructions on old chips. +# +# A few "new" instructions, like cmov for instance, are done as macros +# and will generate some equivalent plain i386 code when HAVE_HOST_CPU +# in config.m4 indicates an old CPU. It won't run fast, but it does +# make it possible to test correctness. +# +# +# INTERNALS +# +# The nonsense involving $ENV is some hooks used during development to add +# additional functions temporarily. +# +# +# FUTURE +# +# Maybe the C files should be compiled pic and non-pic too. Wait until +# there's a difference that might be of interest. +# +# Warn if a file provides no functions. +# +# Allow mpz and mpn files of the same name. Currently the mpn fib2_ui +# matching hides the mpz version of that. Will need to check the file +# contents to see which it is. Would be worth allowing an "mpz_" or "mpn_" +# prefix on the filenames to have working versions of both in one directory. +# +# +# LIMITATIONS +# +# Some of the command lines can become very long when a lot of files are +# included. If this is a problem on a given system the only suggestion is +# to run many.pl for just those that are actually wanted at a particular +# time. +# +# DOS 8.3 or SysV 14 char filesystems won't work, since the long filenames +# generated will almost certainly fail to be unique. + + +use strict; +use File::Basename; +use Getopt::Std; + +my %opt; +getopts('t', \%opt); + +my @DIRECTORIES = @ARGV; +if (defined $ENV{directories}) { push @DIRECTORIES, @{$ENV{directories}} } + + +# regexp - matched against the start of the filename. 
If a grouping "(...)" +# is present then only the first such part is used. +# +# mulfunc - filenames to be generated from a multi-function file. +# +# funs - functions provided by the file, defaulting to the filename with mpn +# (or mpX). +# +# mpX - prefix like "mpz", defaulting to "mpn". +# +# ret - return value type. +# +# args, args_<fun> - arguments for the given function. If an args_<fun> is +# set then it's used, otherwise plain args is used. "mp_limb_t +# carry" is appended for carry-in variants. +# +# try - try.c TYPE_ name to use, defaulting to TYPE_fun with the function name +# in upper case. "C" is appended for carry-in variants. Can be +# 'none' for no try program entry. +# +# speed - SPEED_ROUTINE_ name to use, handled like "try". +# +# speed_flags - FLAG_ value(s) to use for speed measuring, eg. 'FLAG_R'. + + +my @table = + ( + { + 'regexp'=> 'add_n|sub_n|addlsh1_n|sublsh1_n|rsh1add_n|rsh1sub_n', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size', + 'speed' => 'SPEED_ROUTINE_MPN_BINARY_N', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + }, + { + 'regexp'=> 'aors_n', + 'mulfunc'=> ['add_n','sub_n'], + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size', + 'speed' => 'SPEED_ROUTINE_MPN_BINARY_N', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + }, + + { + 'regexp'=> 'addmul_1|submul_1', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_limb_t mult', + 'speed' => 'SPEED_ROUTINE_MPN_UNARY_1', + 'speed_flags'=> 'FLAG_R', + }, + { + 'regexp'=> 'aorsmul_1', + 'mulfunc'=> ['addmul_1','submul_1'], + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_limb_t mult', + 'speed' => 'SPEED_ROUTINE_MPN_UNARY_1', + 'speed_flags'=> 'FLAG_R', + }, + + { + 'regexp'=> 'addmul_2|submul_2', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp', + 'speed' => 'SPEED_ROUTINE_MPN_UNARY_2', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + 'try-minsize' => 2, + }, + {
+ 'regexp'=> 'addmul_3|submul_3', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp', + 'speed' => 'SPEED_ROUTINE_MPN_UNARY_3', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + 'try-minsize' => 3, + }, + { + 'regexp'=> 'addmul_4|submul_4', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp', + 'speed' => 'SPEED_ROUTINE_MPN_UNARY_4', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + 'try-minsize' => 4, + }, + { + 'regexp'=> 'addmul_5|submul_5', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp', + 'speed' => 'SPEED_ROUTINE_MPN_UNARY_5', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + 'try-minsize' => 5, + }, + { + 'regexp'=> 'addmul_6|submul_6', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp', + 'speed' => 'SPEED_ROUTINE_MPN_UNARY_6', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + 'try-minsize' => 6, + }, + { + 'regexp'=> 'addmul_7|submul_7', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp', + 'speed' => 'SPEED_ROUTINE_MPN_UNARY_7', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + 'try-minsize' => 7, + }, + { + 'regexp'=> 'addmul_8|submul_8', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr yp', + 'speed' => 'SPEED_ROUTINE_MPN_UNARY_8', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + 'try-minsize' => 8, + }, + + { + 'regexp'=> 'add_n_sub_n', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr sum, mp_ptr diff, mp_srcptr xp, mp_srcptr yp, mp_size_t size', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + }, + + { + 'regexp'=> 'com|copyi|copyd', + 'ret' => 'void', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size', + 'speed' => 'SPEED_ROUTINE_MPN_COPY', + }, + + { + 'regexp'=> 'dive_1', + 'funs' => ['divexact_1'], + 'ret' => 'void', + 'args' => 'mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor', + 'speed_flags'=> 'FLAG_R', + }, + { + 'regexp'=> 'diveby3', + 'funs' => 
['divexact_by3c'], + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr dst, mp_srcptr src, mp_size_t size', + 'carrys'=> [''], + 'speed' => 'SPEED_ROUTINE_MPN_COPY', + }, + + # mpn_preinv_divrem_1 is an optional extra entrypoint + { + 'regexp'=> 'divrem_1', + 'funs' => ['divrem_1', 'preinv_divrem_1'], + 'ret' => 'mp_limb_t', + 'args_divrem_1' => 'mp_ptr rp, mp_size_t xsize, mp_srcptr sp, mp_size_t size, mp_limb_t divisor', + 'args_preinv_divrem_1' => 'mp_ptr rp, mp_size_t xsize, mp_srcptr sp, mp_size_t size, mp_limb_t divisor, mp_limb_t inverse, unsigned shift', + 'speed_flags'=> 'FLAG_R', + 'speed_suffixes' => ['f'], + }, + { + 'regexp'=> 'pre_divrem_1', + 'funs' => ['preinv_divrem_1'], + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr qp, mp_size_t qxn, mp_srcptr ap, mp_size_t asize, mp_limb_t divisor, mp_limb_t inverse, int shift', + 'speed_flags' => 'FLAG_R', + }, + + { + 'regexp'=> 'divrem_2', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr qp, mp_size_t qxn, mp_srcptr np, mp_size_t nsize, mp_srcptr dp', + 'try' => 'none', + }, + + { + 'regexp'=> 'sb_divrem_mn', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr qp, mp_ptr np, mp_size_t nsize, mp_srcptr dp, mp_size_t dsize', + 'speed' => 'SPEED_ROUTINE_MPN_DC_DIVREM_SB', + 'try-minsize' => 3, + }, + { + 'regexp'=> 'tdiv_qr', + 'ret' => 'void', + 'args' => 'mp_ptr qp, mp_size_t qxn, mp_ptr np, mp_size_t nsize, mp_srcptr dp, mp_size_t dsize', + 'speed' => 'none', + }, + + { + 'regexp'=> 'get_str', + 'ret' => 'size_t', + 'args' => 'unsigned char *str, int base, mp_ptr mptr, mp_size_t msize', + 'speed_flags' => 'FLAG_R_OPTIONAL', + 'try' => 'none', + }, + { + 'regexp'=> 'set_str', + 'ret' => 'mp_size_t', + 'args' => 'mp_ptr xp, const unsigned char *str, size_t str_len, int base', + 'speed_flags' => 'FLAG_R_OPTIONAL', + 'try' => 'none', + }, + + { + 'regexp'=> 'fac_ui', + 'mpX' => 'mpz', + 'ret' => 'void', + 'args' => 'mpz_ptr r, unsigned long n', + 'speed_flags' => 'FLAG_NODATA', + 'try' => 'none', + }, + + { + 'regexp'=> 'fib2_ui', + 
'ret' => 'void', + 'args' => 'mp_ptr fp, mp_ptr f1p, unsigned long n', + 'rename'=> ['__gmp_fib_table'], + 'speed_flags' => 'FLAG_NODATA', + 'try' => 'none', + }, + { + 'regexp'=> 'fib_ui', + 'mpX' => 'mpz', + 'ret' => 'void', + 'args' => 'mpz_ptr fn, unsigned long n', + 'speed_flags' => 'FLAG_NODATA', + 'try' => 'none', + }, + { + 'regexp'=> 'fib2_ui', + 'mpX' => 'mpz', + 'ret' => 'void', + 'args' => 'mpz_ptr fn, mpz_ptr fnsub1, unsigned long n', + 'speed_flags' => 'FLAG_NODATA', + 'try' => 'none', + }, + + { + 'regexp'=> 'lucnum_ui', + 'mpX' => 'mpz', + 'ret' => 'void', + 'args' => 'mpz_ptr ln, unsigned long n', + 'speed_flags' => 'FLAG_NODATA', + 'try' => 'none', + }, + { + 'regexp'=> 'lucnum2_ui', + 'mpX' => 'mpz', + 'ret' => 'void', + 'args' => 'mpz_ptr ln, mpz_ptr lnsub1, unsigned long n', + 'speed_flags' => 'FLAG_NODATA', + 'try' => 'none', + }, + + { + 'regexp'=> 'gcd_1', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr xp, mp_size_t xsize, mp_limb_t y', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + 'speed_suffixes' => ['N'], + }, + { + 'regexp'=> '(gcd)(?!(_1|ext|_finda))', + 'ret' => 'mp_size_t', + 'args' => 'mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize', + }, + { + 'regexp'=> 'gcd_finda', + 'ret' => 'mp_limb_t', + 'args' => 'mp_srcptr cp', + }, + + + { + 'regexp'=> 'jacobi', + 'funs' => ['jacobi', 'legendre', 'kronecker'], + 'mpX' => 'mpz', + 'ret' => 'int', + 'args' => 'mpz_srcptr a, mpz_srcptr b', + 'try-legendre' => 'TYPE_MPZ_JACOBI', + }, + { + 'regexp'=> 'jacbase', + 'funs' => ['jacobi_base'], + 'ret' => 'mp_limb_t', + 'args' => 'mp_limb_t a, mp_limb_t b, int bit1', + 'speed' => 'SPEED_ROUTINE_MPN_JACBASE', + 'try' => 'none', + }, + + { + 'regexp'=> 'logops_n', + 'mulfunc'=> ['and_n','andn_n','nand_n','ior_n','iorn_n','nior_n','xor_n','xnor_n'], + 'ret' => 'void', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size', + 'speed' => 'SPEED_ROUTINE_MPN_BINARY_N', + }, + + { + 'regexp'=> '[lr]shift', + 'ret' => 'mp_limb_t', + 
'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, unsigned shift', + 'speed' => 'SPEED_ROUTINE_MPN_UNARY_1', + 'speed_flags'=> 'FLAG_R', + }, + + # mpn_preinv_mod_1 is an optional extra entrypoint + { + 'regexp'=> '(mod_1)(?!_rs)', + 'funs' => ['mod_1','preinv_mod_1'], + 'ret' => 'mp_limb_t', + 'args_mod_1' => 'mp_srcptr xp, mp_size_t size, mp_limb_t divisor', + 'args_preinv_mod_1'=> 'mp_srcptr xp, mp_size_t size, mp_limb_t divisor, mp_limb_t inverse', + 'speed_flags'=> 'FLAG_R', + }, + { + 'regexp'=> 'pre_mod_1', + 'funs' => ['preinv_mod_1'], + 'ret' => 'mp_limb_t', + 'args' => 'mp_srcptr xp, mp_size_t size, mp_limb_t divisor, mp_limb_t inverse', + 'speed_flags'=> 'FLAG_R', + }, + { + 'regexp'=> 'mod_34lsub1', + 'ret' => 'mp_limb_t', + 'args' => 'mp_srcptr src, mp_size_t len', + }, + { + 'regexp'=> 'invert_limb', + 'ret' => 'mp_limb_t', + 'args' => 'mp_limb_t divisor', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + 'try' => 'none', + }, + + { + # not for use with hppa reversed argument versions of mpn_umul_ppmm + 'regexp'=> 'udiv', + 'funs' => ['udiv_qrnnd','udiv_qrnnd_r'], + 'ret' => 'mp_limb_t', + 'args_udiv_qrnnd' => 'mp_limb_t *, mp_limb_t, mp_limb_t, mp_limb_t', + 'args_udiv_qrnnd_r' => 'mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t *', + 'speed' => 'none', + 'try-minsize' => 2, + }, + + { + 'regexp'=> 'mode1o', + 'funs' => ['modexact_1_odd'], + 'ret' => 'mp_limb_t', + 'args' => 'mp_srcptr src, mp_size_t size, mp_limb_t divisor', + 'speed_flags'=> 'FLAG_R', + }, + { + 'regexp'=> 'modlinv', + 'funs' => ['modlimb_invert'], + 'ret' => 'mp_limb_t', + 'args' => 'mp_limb_t v', + 'carrys'=> [''], + 'try' => 'none', + }, + + { + 'regexp'=> 'mul_1', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_limb_t mult', + 'speed' => 'SPEED_ROUTINE_MPN_UNARY_1', + 'speed_flags'=> 'FLAG_R', + }, + { + 'regexp'=> 'mul_2', + 'ret' => 'mp_limb_t', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size, mp_srcptr mult', + 'speed' => 'SPEED_ROUTINE_MPN_UNARY_2', 
+ 'speed_flags'=> 'FLAG_R', + }, + + { + 'regexp'=> 'mul_basecase', + 'ret' => 'void', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t xsize, mp_srcptr yp, mp_size_t ysize', + 'speed_flags' => 'FLAG_R_OPTIONAL | FLAG_RSIZE', + }, + { + 'regexp'=> '(mul_n)[_.]', + 'ret' => 'void', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size', + 'rename'=> ['kara_mul_n','kara_sqr_n','toom3_mul_n','toom3_sqr_n'], + }, + { + 'regexp'=> 'umul', + 'funs' => ['umul_ppmm','umul_ppmm_r'], + 'ret' => 'mp_limb_t', + 'args_umul_ppmm' => 'mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2', + 'args_umul_ppmm_r' => 'mp_limb_t m1, mp_limb_t m2, mp_limb_t *lowptr', + 'speed' => 'none', + 'try-minsize' => 3, + }, + + + { + 'regexp'=> 'popham', + 'mulfunc'=> ['popcount','hamdist'], + 'ret' => 'unsigned long', + 'args_popcount'=> 'mp_srcptr xp, mp_size_t size', + 'args_hamdist' => 'mp_srcptr xp, mp_srcptr yp, mp_size_t size', + }, + { + 'regexp'=> 'popcount', + 'ret' => 'unsigned long', + 'args' => 'mp_srcptr xp, mp_size_t size', + }, + { + 'regexp'=> 'hamdist', + 'ret' => 'unsigned long', + 'args' => 'mp_srcptr xp, mp_srcptr yp, mp_size_t size', + # extra renaming to support sharing a data table with mpn_popcount + 'rename'=> ['popcount'], + }, + + { + 'regexp'=> 'sqr_basecase', + 'ret' => 'void', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size', + 'speed' => 'SPEED_ROUTINE_MPN_SQR', + 'try' => 'TYPE_SQR', + }, + { + 'regexp'=> 'sqr_diagonal', + 'ret' => 'void', + 'args' => 'mp_ptr wp, mp_srcptr xp, mp_size_t size', + 'try' => 'none', + }, + + { + 'regexp'=> 'sqrtrem', + 'ret' => 'mp_size_t', + 'args' => 'mp_ptr root, mp_ptr rem, mp_srcptr src, mp_size_t size', + 'try' => 'none', + }, + + { + 'regexp'=> 'cntlz', + 'funs' => ['count_leading_zeros'], + 'ret' => 'unsigned', + 'args' => 'mp_limb_t', + 'macro_before' => "#undef COUNT_LEADING_ZEROS_0", # key spelled as in the $t->{'macro_before'} lookup + 'macro-speed' => +'#ifdef COUNT_LEADING_ZEROS_0 +#define COUNT_LEADING_ZEROS_0_ALLOWED 1 +#else +#define
COUNT_LEADING_ZEROS_0_ALLOWED 0 +#endif + SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED); + $fun (c, n); + SPEED_ROUTINE_COUNT_ZEROS_B ()', + 'speed_flags'=> 'FLAG_R_OPTIONAL', + 'try' => 'none', + }, + { + 'regexp'=> 'cnttz', + 'funs' => ['count_trailing_zeros'], + 'ret' => 'unsigned', + 'args' => 'mp_limb_t', + 'macro-speed' => ' + SPEED_ROUTINE_COUNT_ZEROS_A (0, 0); + $fun (c, n); + SPEED_ROUTINE_COUNT_ZEROS_B ()', + 'speed_flags' => 'FLAG_R_OPTIONAL', + 'try' => 'none', + }, + + { + 'regexp'=> 'zero', + 'ret' => 'void', + 'args' => 'mp_ptr ptr, mp_size_t size', + }, + + { + 'regexp'=> '(powm)(?!_ui)', + 'mpX' => 'mpz', + 'ret' => 'void', + 'args' => 'mpz_ptr r, mpz_srcptr b, mpz_srcptr e, mpz_srcptr m', + 'try' => 'none', + }, + { + 'regexp'=> 'powm_ui', + 'mpX' => 'mpz', + 'ret' => 'void', + 'args' => 'mpz_ptr r, mpz_srcptr b, unsigned long e, mpz_srcptr m', + 'try' => 'none', + }, + + # special for use during development + { + 'regexp'=> 'back', + 'funs' => ['back_to_back'], + 'ret' => 'void', + 'args' => 'void', + 'pic' => 'no', + 'try' => 'none', + 'speed_flags'=> 'FLAG_NODATA', + }, + ); + +if (defined $ENV{table2}) { + my @newtable = @{$ENV{table2}}; + push @newtable, @table; + @table = @newtable; +} + + +my %pictable = + ( + 'yes' => { + 'suffix' => '_pic', + 'asmflags'=> '$(ASMFLAGS_PIC)', + 'cflags' => '$(CFLAGS_PIC)', + }, + 'no' => { + 'suffix' => '', + 'asmflags'=> '', + 'cflags' => '', + }, + ); + + +my $builddir = $ENV{builddir}; +$builddir = "." if (! defined $builddir); + +my $top_builddir = "${builddir}/.."; + + +open(MAKEFILE, "<${builddir}/Makefile") + or die "Cannot open ${builddir}/Makefile: $!\n" + . "Is this a tune build directory?"; +my ($srcdir, $top_srcdir); +while (<MAKEFILE>) { # scan the build Makefile for its srcdir settings + if (/^srcdir = (.*)/) { $srcdir = $1; } + if (/^top_srcdir = (.*)/) { $top_srcdir = $1; } +} +die "Cannot find \$srcdir in Makefile\n" if (! defined $srcdir); +die "Cannot find \$top_srcdir in Makefile\n" if (!
defined $top_srcdir); +print "srcdir $srcdir\n" if $opt{'t'}; +print "top_srcdir $top_srcdir\n" if $opt{'t'}; +close(MAKEFILE); + + +open(SPEED, ">speed-many.c") or die "Cannot create speed-many.c: $!\n"; +print SPEED +"/* speed-many.c generated by many.pl - DO NOT EDIT, CHANGES WILL BE LOST */ + +"; +my $SPEED_EXTRA_ROUTINES = "#define SPEED_EXTRA_ROUTINES \\\n"; +my $SPEED_EXTRA_PROTOS = "#define SPEED_EXTRA_PROTOS \\\n"; +my $SPEED_CODE = ""; + +open(TRY, ">try-many.c") or die "Cannot create try-many.c: $!\n"; +print TRY + "/* try-many.c generated by many.pl - DO NOT EDIT, CHANGES WILL BE LOST */\n" . + "\n"; +my $TRY_EXTRA_ROUTINES = "#define EXTRA_ROUTINES \\\n"; +my $TRY_EXTRA_PROTOS = "#define EXTRA_PROTOS \\\n"; + +open(FD,"<${top_builddir}/libtool") or die "Cannot open \"${top_builddir}/libtool\": $!\n"; +my $pic_flag; +while (<FD>) { # take the first pic_flag setting found in libtool + if (/^pic_flag="?([^"]*)"?$/) { + $pic_flag=$1; + last; + } +} +close FD; +if (! defined $pic_flag) { + die "Cannot find pic_flag in ${top_builddir}/libtool"; +} + +my $CFLAGS_PIC = $pic_flag; + +my $ASMFLAGS_PIC = ""; +foreach (split /[ \t]/, $pic_flag) { + if (/^-D/) { + $ASMFLAGS_PIC .= " " . $_; + } +} + +open(MAKEFILE, ">Makefile.many") or die "Cannot create Makefile.many: $!\n"; +print MAKEFILE + "# Makefile.many generated by many.pl - DO NOT EDIT, CHANGES WILL BE LOST\n" . + "\n" . + "all: speed-many try-many\n" . + "\n" . + "#--------- begin included copy of basic Makefile ----------\n" . + "\n"; +open(FD,"<${builddir}/Makefile") or die "Cannot open \"${builddir}/Makefile\": $!\n"; +print MAKEFILE <FD>; # list-context read copies the whole Makefile +close FD; +print MAKEFILE + "\n" . + "#--------- end included copy of basic Makefile ----------\n" . + "\n" . + "CFLAGS_PIC = $CFLAGS_PIC\n" . + "ASMFLAGS_PIC = $ASMFLAGS_PIC\n" . + "\n"; + +my $CLEAN=""; +my $MANY_OBJS=""; + + +sub print_ansi2knr { + my ($base,$file,$includes) = @_; + if (! defined $file) { $file = "$base.c"; } + if (! defined $includes) { $includes = ""; } + + print MAKEFILE <${base}_.c + +EOF +} + + +# Spawning a glob is a touch slow when there's lots of files.
+my @files = (); +foreach my $dir (@DIRECTORIES) { + print "dir $dir\n" if $opt{'t'}; + if (-f $dir) { + push @files,$dir; + } else { + if (! opendir DD,$dir) { + print "Cannot open $dir: $!\n"; + } else { + push @files, map {$_="$dir/$_"} grep /\.(c|asm|S|h)$/, readdir DD; + closedir DD; + } + } +} +@files = sort @files; +print "@files ",join(" ",@files),"\n" if $opt{'t'}; + +my $count_files = 0; +my $count_functions = 0; +my %seen_obj; +my %seen_file; + +foreach my $file_full (@files) { + if (! -f $file_full) { + print "Not a file: $file_full\n"; + next; + } + if (defined $seen_file{$file_full}) { + print "Skipping duplicate file: $file_full\n"; + next; + } + $seen_file{$file_full} = 1; + + my ($FILE,$path,$lang) = fileparse($file_full,"\.[a-zA-Z]+"); + $path =~ s/\/$//; + print "file $FILE path $path lang $lang\n" if $opt{'t'}; + + my @pic_choices; + if ($lang eq '.asm') { @pic_choices=('no','yes'); } + elsif ($lang eq '.c') { @pic_choices=('no'); } + elsif ($lang eq '.S') { @pic_choices=('no','yes'); } + elsif ($lang eq '.h') { @pic_choices=('no'); } + else { next }; + + my ($t, $file_match); + foreach my $p (@table) { + # print " ",$p->{'regexp'},"\n" if $opt{'t'}; + if ($FILE =~ "^($p->{'regexp'})") { + $t = $p; + $file_match = $1; + $file_match = $2 if defined $2; + last; + } + } + next if ! defined $t; + print "match $t->{'regexp'} $FILE ($file_full)\n" if $opt{'t'}; + + if (! open FD,"<$file_full") { print "Can't open $file_full: $!\n"; next } + my @file_contents = ; + close FD; + + my $objs; + if (defined $t->{'mulfunc'}) { $objs = $t->{'mulfunc'}; } + else { $objs = [$file_match]; } + print "objs @$objs\n" if $opt{'t'}; + + my $ret = $t->{'ret'}; + if (! defined $ret && $lang eq '.h') { $ret = ''; } + if (! defined $ret) { die "$FILE return type not defined\n" }; + print "ret $ret\n" if $opt{'t'}; + + my $mpX = $t->{'mpX'}; + if (! defined $mpX) { $mpX = ($lang eq '.h' ? 
'' : 'mpn'); } + $mpX = "${mpX}_" if $mpX ne ''; + print "mpX $mpX\n" if $opt{'t'}; + + my $carrys; + if (defined $t->{'carrys'}) { $carrys = $t->{'carrys'}; } + else { $carrys = ['','c']; } + print "carrys $carrys @$carrys\n" if $opt{'t'}; + + # some restriction functions are implemented, but they're not very useful + my $restriction=''; + + my $suffix; + if ($FILE =~ ("${file_match}_(.+)")) { + $suffix = $1; + } elsif ($path =~ /\/mp[zn]\/(.*)$/) { + # derive the suffix from the path + $suffix = $1; + $suffix =~ s/\//_/g; + # use last directory name, or if there's 3 or more then the last two + if ($suffix =~ /([^_]*_)+([^_]+_[^_]+)$/) { + $suffix = $2; + } elsif ($suffix =~ /([^_]*_)*([^_]+)$/) { + $suffix = $2; + } + } else { + die "Can't determine suffix for: $file_full (path $path)\n"; + } + print "suffix $suffix\n" if $opt{'t'}; + + $count_files++; + + foreach my $obj (@{$objs}) { + print "obj $obj\n" if $opt{'t'}; + + my $obj_with_suffix = "${obj}_$suffix"; + if (defined $seen_obj{$obj_with_suffix}) { + print "Skipping duplicate object: $obj_with_suffix\n"; + print " first from: $seen_obj{$obj_with_suffix}\n"; + print " now from: $file_full\n"; + next; + } + $seen_obj{$obj_with_suffix} = $file_full; + + my $funs = $t->{'funs'}; + $funs = [$obj] if ! defined $funs; + print "funs @$funs\n" if $opt{'t'}; + + if (defined $t->{'pic'}) { @pic_choices = ('no'); } + + foreach my $pic (map {$pictable{$_}} @pic_choices) { + print "pic $pic->{'suffix'}\n" if $opt{'t'}; + + my $objbase = "${obj}_$suffix$pic->{'suffix'}"; + print "objbase $objbase\n" if $opt{'t'}; + + if ($path !~ "." && -f "${objbase}.c") { + die "Already have ${objbase}.c"; + } + + my $tmp_file = "tmp-$objbase.c"; + + my $renaming; + foreach my $fun (@{$funs}) { + if ($mpX eq 'mpn_' && $lang eq '.c') { + $renaming .= "\t\t-DHAVE_NATIVE_mpn_$fun=1 \\\n"; + } + + # The carry-in variant is with a "c" appended, unless there's a "_1" + # somewhere, eg. "modexact_1_odd", in which case that becomes "_1c". 
+ my $fun_carry = $fun; + if (! ($fun_carry =~ s/_1/_1c/)) { $fun_carry = "${fun}c"; } + + $renaming .= + "\t\t-D__g$mpX$fun=$mpX${fun}_$suffix$pic->{'suffix'} \\\n" . + "\t\t-D__g$mpX$fun_carry=$mpX${fun_carry}_$suffix$pic->{'suffix'} \\\n"; + } + foreach my $r (@{$t->{'rename'}}) { + if ($r =~ /^__gmp/) { + $renaming .= "\\\n" . + "\t\t-D$r=${r}_$suffix$pic->{'suffix'}"; + } else { + $renaming .= "\\\n" . + "\t\t-D__g$mpX$r=$mpX${r}_$suffix$pic->{'suffix'}"; + } + } + print "renaming $renaming\n" if $opt{'t'}; + + print MAKEFILE "\n"; + if ($lang eq '.asm') { + print MAKEFILE + "$objbase.o: $file_full \$(ASM_HEADERS)\n" . + " \$(M4) \$(M4FLAGS) -DOPERATION_$obj $pic->{'asmflags'} \\\n" . + "$renaming" . + " $file_full >tmp-$objbase.s\n" . + " \$(CCAS) \$(COMPILE_FLAGS) $pic->{'cflags'} tmp-$objbase.s -o $objbase.o\n" . + " \$(RM_TMP) tmp-$objbase.s\n"; + $MANY_OBJS .= " $objbase.o"; + + } elsif ($lang eq '.c') { + print MAKEFILE + "$objbase.o: $file_full\n" . + " \$(COMPILE) -DOPERATION_$obj $pic->{'cflags'} \\\n" . + "$renaming" . + " -c $file_full -o $objbase.o\n"; + print_ansi2knr($objbase, + $file_full, + " -DOPERATION_$obj\\\n$renaming\t\t"); + $MANY_OBJS .= " $objbase\$U.o"; + + } elsif ($lang eq '.S') { + print MAKEFILE + "$objbase.o: $file_full\n" . + " \$(COMPILE) -g $pic->{'asmflags'} \\\n" . + "$renaming" . + " -c $file_full -o $objbase.o\n"; + $MANY_OBJS .= " $objbase.o"; + + } elsif ($lang eq '.h') { + print MAKEFILE + "$objbase.o: tmp-$objbase.c $file_full\n" . + " \$(COMPILE) -DOPERATION_$obj $pic->{'cflags'} \\\n" . + "$renaming" . 
+ " -c tmp-$objbase.c -o $objbase.o\n"; + print_ansi2knr($objbase, + "tmp-$objbase.c", + " -DOPERATION_$obj\\\n$renaming\t\t"); + $MANY_OBJS .= " $objbase\$U.o"; + + $CLEAN .= " tmp-$objbase.c"; + open(TMP_C,">tmp-$objbase.c") + or die "Can't create tmp-$objbase.c: $!\n"; + print TMP_C +"/* tmp-$objbase.c generated by many.pl - DO NOT EDIT, CHANGES WILL BE LOST */ + +#include \"gmp.h\" +#include \"gmp-impl.h\" +#include \"longlong.h\" +#include \"speed.h\" + +"; + } + + my $tests_program = "$top_srcdir/tests/devel/$obj.c"; + if (-f $tests_program) { + $tests_program = "\$(top_srcdir)/tests/devel/$obj.c"; + print_ansi2knr("tests_${objbase}", + $tests_program, + "\\\n$renaming\t\t\$(CFLAGS_TESTS_SP)"); + print_ansi2knr("tests_${objbase}_sp", + $tests_program, + "\\\n$renaming\t\t\$(CFLAGS_TESTS_SP)"); + + print MAKEFILE <{'macro_before'}; + $macro_before = "" if ! defined $macro_before; + print TMP_C +"$macro_before +#undef $fun +#include \"$file_full\" + +"; + } + + my $args = $t->{"args_$fun"}; + if (! defined $args) { $args = $t->{'args'}; } + if (! defined $args) { die "Need args for $fun\n"; } + print "args $args\n" if $opt{'t'}; + + foreach my $carry (@$carrys) { + print "carry $carry\n" if $opt{'t'}; + + my $fun_carry = $fun; + if (! ($fun_carry =~ s/_1/_1$carry/)) { $fun_carry = "$fun$carry"; } + print "fun_carry $fun_carry\n" if $opt{'t'}; + + if ($lang =~ /\.(asm|S)/ + && ! grep(m"PROLOGUE\((.* )?$mpX$fun_carry[ ,)]",@file_contents)) { + print "no PROLOGUE $mpX$fun_carry\n" if $opt{'t'}; + next; + } + if ($lang eq '.c' + && ! grep(m"^(#define FUNCTION\s+)?$mpX$fun_carry\W", @file_contents)) { + print "no mention of $mpX$fun_carry\n" if $opt{'t'}; + next; + } + if ($lang eq '.h' + && ! 
grep(m"^#define $fun_carry\W", @file_contents)) { + print "no mention of #define $fun_carry\n" if $opt{'t'}; + next; + } + + $count_functions++; + + my $carryarg; + if (defined $t->{'carryarg'}) { $carryarg = $t->{'carryarg'}; } + if ($carry eq '') { $carryarg = ''; } + else { $carryarg = ', mp_limb_t carry'; } + print "carryarg $carryarg\n" if $opt{'t'}; + + my $funfull="$mpX${fun_carry}_$suffix$pic->{'suffix'}"; + print "funfull $funfull\n" if $opt{'t'}; + + if ($lang ne '.h') { + my $proto = "$t->{'ret'} $funfull _PROTO (($args$carryarg)); \\\n"; + $SPEED_EXTRA_PROTOS .= $proto; + $TRY_EXTRA_PROTOS .= $proto; + } + + my $try_type = $t->{"try-$fun"}; + $try_type = $t->{'try'} if ! defined $try_type; + if (! defined $try_type) { + if ($mpX eq 'mpn_') { + $try_type = "TYPE_\U$fun_carry"; + } else { + $try_type = "TYPE_\U$mpX\U$fun_carry"; + } + } + print "try_type $try_type\n" if $opt{'t'}; + + my $try_minsize = $t->{'try-minsize'}; + if (defined $try_minsize) { + $try_minsize = ", " . $try_minsize; + } else { + $try_minsize = ""; + } + print "try_minsize $try_minsize\n" if $opt{'t'}; + + if ($try_type ne 'none') { + $TRY_EXTRA_ROUTINES .= + " { TRY($mpX${fun_carry}_$suffix$pic->{'suffix'}), $try_type$try_minsize }, \\\n"; + } + + my $speed_flags = $t->{'speed_flags'}; + $speed_flags = '0' if ! defined $speed_flags; + print "speed_flags $speed_flags\n" if $opt{'t'}; + + my $speed_routine = $t->{'speed'}; + $speed_routine = "SPEED_ROUTINE_\U$mpX\U$fun" + if !defined $speed_routine; + if (! ($speed_routine =~ s/_1/_1\U$carry/)) { + $speed_routine = "$speed_routine\U$carry"; + } + print "speed_routine $speed_routine\n" if $opt{'t'}; + + my @speed_suffixes = (); + push (@speed_suffixes, '') if $speed_routine ne 'none'; + push (@speed_suffixes, @{$t->{'speed_suffixes'}}) + if defined $t->{'speed_suffixes'}; + + my $macro_speed = $t->{'macro-speed'}; + $macro_speed = "$speed_routine ($fun_carry)" if ! 
defined $macro_speed; + $macro_speed =~ s/\$fun/$fun_carry/g; + + foreach my $S (@speed_suffixes) { + my $Sfunfull="$mpX${fun_carry}${S}_$suffix$pic->{'suffix'}"; + + $SPEED_EXTRA_PROTOS .= + "double speed_$Sfunfull _PROTO ((struct speed_params *s)); \\\n"; + $SPEED_EXTRA_ROUTINES .= + " { \"$Sfunfull\", speed_$Sfunfull, $speed_flags }, \\\n"; + if ($lang eq '.h') { + print TMP_C +"double +speed_$Sfunfull (struct speed_params *s) +{ +$macro_speed +} + +"; + } else { + $SPEED_CODE .= + "double\n" . + "speed_$Sfunfull (struct speed_params *s)\n" . + "{\n" . + "$restriction" . + " $speed_routine\U$S\E ($funfull)\n" . + "}\n"; + } + } + } + } + } + } +} + + +print SPEED $SPEED_EXTRA_PROTOS . "\n"; +print SPEED $SPEED_EXTRA_ROUTINES . "\n"; +if (defined $ENV{speedinc}) { print SPEED $ENV{speedinc} . "\n"; } +print SPEED + "#include \"speed.c\"\n" . + "\n"; +print SPEED $SPEED_CODE; + +print TRY $TRY_EXTRA_ROUTINES . "\n"; +print TRY $TRY_EXTRA_PROTOS . "\n"; +my $tryinc = ""; +if (defined $ENV{tryinc}) { + $tryinc = $ENV{tryinc}; + print TRY "#include \"$tryinc\"\n"; +} +print "tryinc $tryinc\n" if $opt{'t'}; +print TRY + "#include \"try.c\"\n" . + "\n"; + +my $extra_libraries = ""; +if (defined $ENV{extra_libraries}) { $extra_libraries = $ENV{extra_libraries};} + +my $trydeps = ""; +if (defined $ENV{trydeps}) { $trydeps = $ENV{trydeps}; } +$trydeps .= " $tryinc"; +print "trydeps $trydeps\n" if $opt{'t'}; + +print MAKEFILE < +#include "gmp-impl.h" +#include "longlong.h" +#include "speed.h" + + +/* Like the standard version in gmp-impl.h, but with the expressions using a + "1-" form. This has the same number of steps, but "1-" is on the + dependent chain, whereas the "2*" in the standard version isn't. + Depending on the CPU this should be the same or a touch slower. 
*/ + +#if GMP_LIMB_BITS <= 32 +#define binvert_limb_mul1(inv,n) \ + do { \ + mp_limb_t __n = (n); \ + mp_limb_t __inv; \ + ASSERT ((__n & 1) == 1); \ + __inv = binvert_limb_table[(__n&0xFF)/2]; /* 8 */ \ + __inv = (1 - __n * __inv) * __inv + __inv; /* 16 */ \ + __inv = (1 - __n * __inv) * __inv + __inv; /* 32 */ \ + ASSERT (__inv * __n == 1); \ + (inv) = __inv; \ + } while (0) +#endif + +#if GMP_LIMB_BITS > 32 && GMP_LIMB_BITS <= 64 +#define binvert_limb_mul1(inv,n) \ + do { \ + mp_limb_t __n = (n); \ + mp_limb_t __inv; \ + ASSERT ((__n & 1) == 1); \ + __inv = binvert_limb_table[(__n&0xFF)/2]; /* 8 */ \ + __inv = (1 - __n * __inv) * __inv + __inv; /* 16 */ \ + __inv = (1 - __n * __inv) * __inv + __inv; /* 32 */ \ + __inv = (1 - __n * __inv) * __inv + __inv; /* 64 */ \ + ASSERT (__inv * __n == 1); \ + (inv) = __inv; \ + } while (0) +#endif + + +/* The loop based version used in GMP 3.0 and earlier. Usually slower than + multiplying, due to the number of steps that must be performed. Much + slower when the processor has a good multiply. */ + +#define binvert_limb_loop(inv,n) \ + do { \ + mp_limb_t __v = (n); \ + mp_limb_t __v_orig = __v; \ + mp_limb_t __make_zero = 1; \ + mp_limb_t __two_i = 1; \ + mp_limb_t __v_inv = 0; \ + \ + ASSERT ((__v & 1) == 1); \ + \ + do \ + { \ + while ((__two_i & __make_zero) == 0) \ + __two_i <<= 1, __v <<= 1; \ + __v_inv += __two_i; \ + __make_zero -= __v; \ + } \ + while (__make_zero); \ + \ + ASSERT (__v_orig * __v_inv == 1); \ + (inv) = __v_inv; \ + } while (0) + + +/* Another loop based version with conditionals, but doing a fixed number of + steps. 
*/ + +#define binvert_limb_cond(inv,n) \ + do { \ + mp_limb_t __n = (n); \ + mp_limb_t __rem = (1 - __n) >> 1; \ + mp_limb_t __inv = GMP_LIMB_HIGHBIT; \ + int __count; \ + \ + ASSERT ((__n & 1) == 1); \ + \ + __count = GMP_LIMB_BITS-1; \ + do \ + { \ + __inv >>= 1; \ + if (__rem & 1) \ + { \ + __inv |= GMP_LIMB_HIGHBIT; \ + __rem -= __n; \ + } \ + __rem >>= 1; \ + } \ + while (-- __count); \ + \ + ASSERT (__inv * __n == 1); \ + (inv) = __inv; \ + } while (0) + + +/* Another loop based bitwise version, but purely arithmetic, no + conditionals. */ + +#define binvert_limb_arith(inv,n) \ + do { \ + mp_limb_t __n = (n); \ + mp_limb_t __rem = (1 - __n) >> 1; \ + mp_limb_t __inv = GMP_LIMB_HIGHBIT; \ + mp_limb_t __lowbit; \ + int __count; \ + \ + ASSERT ((__n & 1) == 1); \ + \ + __count = GMP_LIMB_BITS-1; \ + do \ + { \ + __lowbit = __rem & 1; \ + __inv = (__inv >> 1) | (__lowbit << (GMP_LIMB_BITS-1)); \ + __rem = (__rem - (__n & -__lowbit)) >> 1; \ + } \ + while (-- __count); \ + \ + ASSERT (__inv * __n == 1); \ + (inv) = __inv; \ + } while (0) + + +double +speed_binvert_limb_mul1 (struct speed_params *s) +{ + SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_mul1); +} +double +speed_binvert_limb_loop (struct speed_params *s) +{ + SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_loop); +} +double +speed_binvert_limb_cond (struct speed_params *s) +{ + SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_cond); +} +double +speed_binvert_limb_arith (struct speed_params *s) +{ + SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_arith); +} diff --git a/gmp-6.3.0/tune/noop.c b/gmp-6.3.0/tune/noop.c new file mode 100644 index 0000000..c127b73 --- /dev/null +++ b/gmp-6.3.0/tune/noop.c @@ -0,0 +1,67 @@ +/* Noop routines. + + These are in a separate file to stop gcc recognising do-nothing functions + and optimizing away calls to them. */ + +/* +Copyright 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#include "speed.h" + + +void +noop (void) +{ +} + +/*ARGSUSED*/ +void +noop_1 (mp_limb_t n) +{ +} + +/*ARGSUSED*/ +void +noop_wxs (mp_ptr wp, mp_srcptr xp, mp_size_t size) +{ +} + +/*ARGSUSED*/ +void +noop_wxys (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size) +{ +} + +/*ARGSUSED*/ +void +mpn_cache_fill_dummy (mp_limb_t n) +{ +} diff --git a/gmp-6.3.0/tune/pentium.asm b/gmp-6.3.0/tune/pentium.asm new file mode 100644 index 0000000..fb1e833 --- /dev/null +++ b/gmp-6.3.0/tune/pentium.asm @@ -0,0 +1,60 @@ +dnl x86 pentium time stamp counter access routine. + +dnl Copyright 1999, 2000, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + + +C void speed_cyclecounter (unsigned p[2]); +C +C Get the pentium rdtsc cycle counter, storing the least significant word in +C p[0] and the most significant in p[1]. +C +C cpuid is used to serialize execution. On big measurements this won't be +C significant but it may help make small single measurements more accurate. + + .text + ALIGN(8) + +defframe(PARAM_P,4) + +PROLOGUE(speed_cyclecounter) +deflit(`FRAME',0) + pushl %ebx +FRAME_pushl() + xorl %eax, %eax + cpuid + rdtsc + movl PARAM_P, %ebx + movl %eax, (%ebx) + movl %edx, 4(%ebx) + popl %ebx + ret +EPILOGUE() diff --git a/gmp-6.3.0/tune/powerpc.asm b/gmp-6.3.0/tune/powerpc.asm new file mode 100644 index 0000000..2f4ac27 --- /dev/null +++ b/gmp-6.3.0/tune/powerpc.asm @@ -0,0 +1,53 @@ +dnl PowerPC mftb_function -- read time base registers. + +dnl Copyright 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C void mftb_function (unsigned a[2]); +C + +ASM_START() +PROLOGUE(mftb_function) + + C r3 a + +L(again): + mftbu r4 + mftb r5 + mftbu r6 + cmpw cr0, r4, r6 + bne L(again) + + stw r5, 0(r3) + stw r4, 4(r3) + blr + +EPILOGUE() diff --git a/gmp-6.3.0/tune/powerpc64.asm b/gmp-6.3.0/tune/powerpc64.asm new file mode 100644 index 0000000..1ade996 --- /dev/null +++ b/gmp-6.3.0/tune/powerpc64.asm @@ -0,0 +1,49 @@ +dnl PowerPC mftb_function -- read time base registers, 64-bit integer. + +dnl Copyright 2002-2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C void mftb_function (unsigned a[2]); +C + +ASM_START() +PROLOGUE(mftb_function) + + C r3 a + + mftb r5 + + srdi r4, r5, 32 + stw r5, 0(r3) + stw r4, 4(r3) + blr + +EPILOGUE() diff --git a/gmp-6.3.0/tune/powm_mod.c b/gmp-6.3.0/tune/powm_mod.c new file mode 100644 index 0000000..765fd7b --- /dev/null +++ b/gmp-6.3.0/tune/powm_mod.c @@ -0,0 +1,38 @@ +/* mpz/powm.c forced to use division. */ + +/* +Copyright 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + +#undef POWM_THRESHOLD +#define POWM_THRESHOLD 1 +#define __gmpz_powm mpz_powm_mod + +#include "../mpz/powm.c" diff --git a/gmp-6.3.0/tune/powm_redc.c b/gmp-6.3.0/tune/powm_redc.c new file mode 100644 index 0000000..8584614 --- /dev/null +++ b/gmp-6.3.0/tune/powm_redc.c @@ -0,0 +1,40 @@ +/* mpz/powm.c forced to use REDC. */ + +/* +Copyright 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* WANT_REDC_GLOBAL makes redc() available for speed and tune program use. */ +#undef POWM_THRESHOLD +#define POWM_THRESHOLD MP_SIZE_T_MAX +#define WANT_REDC_GLOBAL 1 +#define __gmpz_powm mpz_powm_redc + +#include "../mpz/powm.c" diff --git a/gmp-6.3.0/tune/pre_divrem_1.c b/gmp-6.3.0/tune/pre_divrem_1.c new file mode 100644 index 0000000..66d00da --- /dev/null +++ b/gmp-6.3.0/tune/pre_divrem_1.c @@ -0,0 +1,40 @@ +/* mpn_preinv_divrem_1 -- if not already in libgmp. + +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library.
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if ! USE_PREINV_DIVREM_1 + +#undef USE_PREINV_DIVREM_1 +#define USE_PREINV_DIVREM_1 1 + +#include "mpn/generic/pre_divrem_1.c" + +#endif diff --git a/gmp-6.3.0/tune/set_strb.c b/gmp-6.3.0/tune/set_strb.c new file mode 100644 index 0000000..128c41b --- /dev/null +++ b/gmp-6.3.0/tune/set_strb.c @@ -0,0 +1,46 @@ +/* mpn_set_str_basecase -- mpn_set_str forced to its basecase. + +Copyright 2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __gmpn_set_str mpn_set_str_basecase +#define __gmpn_bc_set_str mpn_bc_set_str_basecase +#define __gmpn_dc_set_str mpn_dc_set_str_basecase + +#include "gmp-impl.h" + +#ifndef SIZE_T_MAX +#define SIZE_T_MAX ((size_t) ULONG_MAX) +#endif + +#undef SET_STR_DC_THRESHOLD +#define SET_STR_DC_THRESHOLD SIZE_T_MAX /* always */ +#undef SET_STR_PRECOMPUTE_THRESHOLD +#define SET_STR_PRECOMPUTE_THRESHOLD SIZE_T_MAX /* always */ + +#include "mpn/generic/set_str.c" diff --git a/gmp-6.3.0/tune/set_strp.c b/gmp-6.3.0/tune/set_strp.c new file mode 100644 index 0000000..3053b60 --- /dev/null +++ b/gmp-6.3.0/tune/set_strp.c @@ -0,0 +1,42 @@ +/* mpn_pre_set_str -- mpn_set_str step for a caller-supplied powers table, choosing mpn_bc_set_str or mpn_dc_set_str by set_str_dc_threshold. + +Copyright 2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details.
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */ + +#include "gmp-impl.h" + +void +mpn_pre_set_str (mp_ptr wp, unsigned char *str, size_t str_len, powers_t *powtab, mp_ptr tp) +{ + if (BELOW_THRESHOLD (str_len, set_str_dc_threshold)) + mpn_bc_set_str (wp, str, str_len, powtab->base); + else + mpn_dc_set_str (wp, str, str_len, powtab, tp); +} diff --git a/gmp-6.3.0/tune/set_strs.c b/gmp-6.3.0/tune/set_strs.c new file mode 100644 index 0000000..d2a9fc2 --- /dev/null +++ b/gmp-6.3.0/tune/set_strs.c @@ -0,0 +1,42 @@ +/* mpn_set_str_subquad -- mpn_set_str forced to the sub-quadratic case. + +Copyright 2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define __gmpn_set_str mpn_set_str_subquad +#define __gmpn_bc_set_str mpn_bc_set_str_subquad +#define __gmpn_dc_set_str mpn_dc_set_str_subquad + +#include "gmp-impl.h" + +#undef SET_STR_DC_THRESHOLD +#define SET_STR_DC_THRESHOLD 2 /* never */ +#undef SET_STR_PRECOMPUTE_THRESHOLD +#define SET_STR_PRECOMPUTE_THRESHOLD 2 /* never */ + +#include "mpn/generic/set_str.c" diff --git a/gmp-6.3.0/tune/sparcv9.asm b/gmp-6.3.0/tune/sparcv9.asm new file mode 100644 index 0000000..f0981c7 --- /dev/null +++ b/gmp-6.3.0/tune/sparcv9.asm @@ -0,0 +1,45 @@ +dnl Sparc v9 32-bit time stamp counter access routine. + +dnl Copyright 2000, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C void speed_cyclecounter (unsigned p[2]); +C +C Get the sparc v9 tick counter. 
+ +ASM_START() +PROLOGUE(speed_cyclecounter) + rd %tick,%g1 + st %g1,[%o0] C low 32 bits + srlx %g1,32,%g4 + retl + st %g4,[%o0+4] C high 32 bits +EPILOGUE(speed_cyclecounter) diff --git a/gmp-6.3.0/tune/speed-ext.c b/gmp-6.3.0/tune/speed-ext.c new file mode 100644 index 0000000..e7fb8b9 --- /dev/null +++ b/gmp-6.3.0/tune/speed-ext.c @@ -0,0 +1,233 @@ +/* An example of extending the speed program to measure routines not in GMP. + +Copyright 1999, 2000, 2002, 2003, 2005 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* The extension here is three versions of an mpn arithmetic mean. These + aren't meant to be particularly useful, just examples. + + You can run something like the following to compare their speeds. + + ./speed-ext -s 1-20 -c mean_calls mean_open mean_open2 + + On RISC chips, mean_open() might be fastest if the compiler is doing a + good job. On the register starved x86s, mean_calls will be fastest. + + + Notes: + + SPEED_EXTRA_PROTOS and SPEED_EXTRA_ROUTINES are macros that get expanded + by speed.c in useful places. 
SPEED_EXTRA_PROTOS goes after the header + files, and SPEED_EXTRA_ROUTINES goes in the array of available routines. + + The advantage of this #include "speed.c" scheme is that there's no + editing of a copy of that file, and new features in new versions of it + will be immediately available. + + In a real program the routines mean_calls() etc would probably be in + separate C or assembler source files, and just the measuring + speed_mean_calls() etc would be here. Linking against other libraries + for things to measure is perfectly possible too. + + When attempting to compare two versions of the same named routine, say + like the generic and assembler versions of mpn_add_n(), creative use of + cc -D or #define is suggested, so one or both can be renamed and linked + into the same program. It'll be much easier to compare them side by side + than with separate programs for each. + + common.c has notes on writing speed measuring routines. + + Remember to link against tune/libspeed.la (or tune/.libs/libspeed.a if + not using libtool) to get common.o and other objects needed by speed.c. */ + + +#define SPEED_EXTRA_PROTOS \ + double speed_mean_calls (struct speed_params *s); \ + double speed_mean_open (struct speed_params *s); \ + double speed_mean_open2 (struct speed_params *s); + +#define SPEED_EXTRA_ROUTINES \ + { "mean_calls", speed_mean_calls }, \ + { "mean_open", speed_mean_open }, \ + { "mean_open2", speed_mean_open2 }, + +#include "speed.c" + + +/* A straightforward implementation calling mpn subroutines. + + wp,size is set to (xp,size + yp,size) / 2. The return value is the + remainder from the division. The other versions are the same. 
*/ + +mp_limb_t +mean_calls (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size) +{ + mp_limb_t c, ret; + + ASSERT (size >= 1); + + c = mpn_add_n (wp, xp, yp, size); + ret = mpn_rshift (wp, wp, size, 1) >> (GMP_LIMB_BITS-1); + wp[size-1] |= (c << (GMP_LIMB_BITS-1)); + return ret; +} + + +/* An open-coded version, making one pass over the data. The right shift is + done as the added limbs are produced. The addition code follows + mpn/generic/add_n.c. */ + +mp_limb_t +mean_open (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size) +{ + mp_limb_t w, wprev, x, y, c, ret; + mp_size_t i; + + ASSERT (size >= 1); + + x = xp[0]; + y = yp[0]; + + wprev = x + y; + c = (wprev < x); + ret = (wprev & 1); + +#define RSHIFT(hi,lo) (((lo) >> 1) | ((hi) << (GMP_LIMB_BITS-1))) + + for (i = 1; i < size; i++) + { + x = xp[i]; + y = yp[i]; + + w = x + c; + c = (w < x); + w += y; + c += (w < y); + + wp[i-1] = RSHIFT (w, wprev); + wprev = w; + } + + wp[i-1] = RSHIFT (c, wprev); + + return ret; +} + + +/* Another one-pass version, but right shifting the source limbs rather than + the result limbs. There's not much chance of this being better than the + above, but it's an alternative at least. */ + +mp_limb_t +mean_open2 (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size) +{ + mp_limb_t w, x, y, xnext, ynext, c, ret; + mp_size_t i; + + ASSERT (size >= 1); + + x = xp[0]; + y = yp[0]; + + /* ret is the low bit of x+y, c is the carry out of that low bit add */ + ret = (x ^ y) & 1; + c = (x & y) & 1; + + for (i = 0; i < size-1; i++) + { + xnext = xp[i+1]; + ynext = yp[i+1]; + x = RSHIFT (xnext, x); + y = RSHIFT (ynext, y); + + w = x + c; + c = (w < x); + w += y; + c += (w < y); + wp[i] = w; + + x = xnext; + y = ynext; + } + + wp[i] = (x >> 1) + (y >> 1) + c; + + return ret; +} + + +/* The speed measuring routines are the same apart from which function they + run, so a macro is used. Actually this macro is the same as + SPEED_ROUTINE_MPN_BINARY_N. 
*/ + +#define SPEED_ROUTINE_MEAN(mean_fun) \ + { \ + unsigned i; \ + mp_ptr wp; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + mean_fun (wp, s->xp, s->yp, s->size); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +double +speed_mean_calls (struct speed_params *s) +{ + SPEED_ROUTINE_MEAN (mean_calls); +} + +double +speed_mean_open (struct speed_params *s) +{ + SPEED_ROUTINE_MEAN (mean_open); +} + +double +speed_mean_open2 (struct speed_params *s) +{ + SPEED_ROUTINE_MEAN (mean_open2); +} diff --git a/gmp-6.3.0/tune/speed.c b/gmp-6.3.0/tune/speed.c new file mode 100644 index 0000000..f8909bc --- /dev/null +++ b/gmp-6.3.0/tune/speed.c @@ -0,0 +1,1419 @@ +/* Speed measuring program. + +Copyright 1999-2003, 2005, 2006, 2008-2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* Usage message is in the code below, run with no arguments to print it. + See README for interesting applications. + + To add a new routine foo(), create a speed_foo() function in the style of + the existing ones and add an entry in the routine[] array. Put FLAG_R if + speed_foo() wants an "r" parameter. + + The routines don't have help messages or descriptions, but most have + suggestive names. See the source code for full details. + +*/ + +#include "config.h" + +#include +#include +#include +#include + +#if HAVE_UNISTD_H +#include /* for getpid, R_OK */ +#endif + +#if TIME_WITH_SYS_TIME +# include /* for struct timeval */ +# include +#else +# if HAVE_SYS_TIME_H +# include +# else +# include +# endif +#endif + +#if HAVE_SYS_RESOURCE_H +#include /* for getrusage() */ +#endif + + +#include "gmp-impl.h" +#include "longlong.h" /* for the benefit of speed-many.c */ +#include "tests.h" +#include "speed.h" + + +#if !HAVE_DECL_OPTARG +extern char *optarg; +extern int optind, opterr; +#endif + +#if !HAVE_STRTOUL +#define strtoul(p,e,b) (unsigned long) strtol(p,e,b) +#endif + +#ifdef SPEED_EXTRA_PROTOS +SPEED_EXTRA_PROTOS +#endif +#ifdef SPEED_EXTRA_PROTOS2 +SPEED_EXTRA_PROTOS2 +#endif + + +#if GMP_LIMB_BITS == 32 +#define GMP_NUMB_0xAA (CNST_LIMB(0xAAAAAAAA) & GMP_NUMB_MASK) +#endif +#if GMP_LIMB_BITS == 64 +#define GMP_NUMB_0xAA (CNST_LIMB(0xAAAAAAAAAAAAAAAA) & GMP_NUMB_MASK) +#endif + + +#define CMP_ABSOLUTE 1 +#define CMP_RATIO 2 +#define CMP_DIFFERENCE 3 +#define CMP_DIFFPREV 4 +int option_cmp = CMP_ABSOLUTE; + +#define UNIT_SECONDS 1 +#define UNIT_CYCLES 2 +#define UNIT_CYCLESPERLIMB 3 +int option_unit = UNIT_SECONDS; + +#define DATA_RANDOM 1 +#define DATA_RANDOM2 2 +#define DATA_ZEROS 3 +#define DATA_AAS 4 +#define DATA_FFS 5 +#define DATA_2FD 6 +int option_data = 
DATA_RANDOM; + +int option_square = 0; +double option_factor = 0.0; +mp_size_t option_step = 1; +int option_gnuplot = 0; +char *option_gnuplot_basename; +struct size_array_t { + mp_size_t start, end; +} *size_array = NULL; +mp_size_t size_num = 0; +mp_size_t size_allocnum = 0; +int option_resource_usage = 0; +long option_seed = 123456789; + +struct speed_params sp; + +#define COLUMN_WIDTH 13 /* for the free-form output */ + +#define FLAG_R (1<<0) /* require ".r" */ +#define FLAG_R_OPTIONAL (1<<1) /* optional ".r" */ +#define FLAG_RSIZE (1<<2) +#define FLAG_NODATA (1<<3) /* don't alloc xp, yp */ + +const struct routine_t { + /* constants */ + const char *name; + speed_function_t fun; + int flag; +} routine[] = { + + { "noop", speed_noop }, + { "noop_wxs", speed_noop_wxs }, + { "noop_wxys", speed_noop_wxys }, + + { "mpn_add_n", speed_mpn_add_n, FLAG_R_OPTIONAL }, + { "mpn_sub_n", speed_mpn_sub_n, FLAG_R_OPTIONAL }, + { "mpn_add_1", speed_mpn_add_1, FLAG_R }, + { "mpn_add_1_inplace", speed_mpn_add_1_inplace, FLAG_R }, + { "mpn_sub_1", speed_mpn_sub_1, FLAG_R }, + { "mpn_sub_1_inplace", speed_mpn_sub_1_inplace, FLAG_R }, + + { "mpn_add_err1_n", speed_mpn_add_err1_n }, + { "mpn_add_err2_n", speed_mpn_add_err2_n }, + { "mpn_add_err3_n", speed_mpn_add_err3_n }, + { "mpn_sub_err1_n", speed_mpn_sub_err1_n }, + { "mpn_sub_err2_n", speed_mpn_sub_err2_n }, + { "mpn_sub_err3_n", speed_mpn_sub_err3_n }, + +#if HAVE_NATIVE_mpn_add_n_sub_n + { "mpn_add_n_sub_n", speed_mpn_add_n_sub_n, FLAG_R_OPTIONAL }, +#endif + + { "mpn_addmul_1", speed_mpn_addmul_1, FLAG_R }, + { "mpn_submul_1", speed_mpn_submul_1, FLAG_R }, +#if HAVE_NATIVE_mpn_addmul_2 + { "mpn_addmul_2", speed_mpn_addmul_2, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_addmul_3 + { "mpn_addmul_3", speed_mpn_addmul_3, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_addmul_4 + { "mpn_addmul_4", speed_mpn_addmul_4, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_addmul_5 + { "mpn_addmul_5", speed_mpn_addmul_5, FLAG_R_OPTIONAL 
}, +#endif +#if HAVE_NATIVE_mpn_addmul_6 + { "mpn_addmul_6", speed_mpn_addmul_6, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_addmul_7 + { "mpn_addmul_7", speed_mpn_addmul_7, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_addmul_8 + { "mpn_addmul_8", speed_mpn_addmul_8, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_addaddmul_1msb0 + { "mpn_addaddmul_1msb0", speed_mpn_addaddmul_1msb0, FLAG_R_OPTIONAL }, +#endif + { "mpn_mul_1", speed_mpn_mul_1, FLAG_R }, + { "mpn_mul_1_inplace", speed_mpn_mul_1_inplace, FLAG_R }, +#if HAVE_NATIVE_mpn_mul_2 + { "mpn_mul_2", speed_mpn_mul_2, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_mul_3 + { "mpn_mul_3", speed_mpn_mul_3, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_mul_4 + { "mpn_mul_4", speed_mpn_mul_4, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_mul_5 + { "mpn_mul_5", speed_mpn_mul_5, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_mul_6 + { "mpn_mul_6", speed_mpn_mul_6, FLAG_R_OPTIONAL }, +#endif + + { "mpn_divrem_1", speed_mpn_divrem_1, FLAG_R }, + { "mpn_divrem_1f", speed_mpn_divrem_1f, FLAG_R }, +#if HAVE_NATIVE_mpn_divrem_1c + { "mpn_divrem_1c", speed_mpn_divrem_1c, FLAG_R }, + { "mpn_divrem_1cf", speed_mpn_divrem_1cf,FLAG_R }, +#endif + { "mpn_mod_1", speed_mpn_mod_1, FLAG_R }, +#if HAVE_NATIVE_mpn_mod_1c + { "mpn_mod_1c", speed_mpn_mod_1c, FLAG_R }, +#endif + { "mpn_preinv_divrem_1", speed_mpn_preinv_divrem_1, FLAG_R }, + { "mpn_preinv_divrem_1f", speed_mpn_preinv_divrem_1f, FLAG_R }, + { "mpn_preinv_mod_1", speed_mpn_preinv_mod_1, FLAG_R }, + + { "mpn_mod_1_1", speed_mpn_mod_1_1, FLAG_R }, + { "mpn_mod_1_1_1", speed_mpn_mod_1_1_1, FLAG_R }, + { "mpn_mod_1_1_2", speed_mpn_mod_1_1_2, FLAG_R }, + { "mpn_mod_1s_2", speed_mpn_mod_1_2, FLAG_R }, + { "mpn_mod_1s_3", speed_mpn_mod_1_3, FLAG_R }, + { "mpn_mod_1s_4", speed_mpn_mod_1_4, FLAG_R }, + + { "mpn_divrem_1_div", speed_mpn_divrem_1_div, FLAG_R }, + { "mpn_divrem_1_inv", speed_mpn_divrem_1_inv, FLAG_R }, + { "mpn_divrem_1f_div", speed_mpn_divrem_1f_div, 
FLAG_R }, + { "mpn_divrem_1f_inv", speed_mpn_divrem_1f_inv, FLAG_R }, + { "mpn_mod_1_div", speed_mpn_mod_1_div, FLAG_R }, + { "mpn_mod_1_inv", speed_mpn_mod_1_inv, FLAG_R }, + + { "mpn_divrem_2", speed_mpn_divrem_2, }, + { "mpn_divrem_2_div", speed_mpn_divrem_2_div, }, + { "mpn_divrem_2_inv", speed_mpn_divrem_2_inv, }, + + { "mpn_div_qr_1n_pi1", speed_mpn_div_qr_1n_pi1, FLAG_R }, + { "mpn_div_qr_1n_pi1_1",speed_mpn_div_qr_1n_pi1_1, FLAG_R }, + { "mpn_div_qr_1n_pi1_2",speed_mpn_div_qr_1n_pi1_2, FLAG_R }, + { "mpn_div_qr_1n_pi1_3",speed_mpn_div_qr_1n_pi1_3, FLAG_R }, + { "mpn_div_qr_1n_pi1_4",speed_mpn_div_qr_1n_pi1_4, FLAG_R }, + { "mpn_div_qr_1", speed_mpn_div_qr_1, FLAG_R }, + + { "mpn_div_qr_2n", speed_mpn_div_qr_2n, }, + { "mpn_div_qr_2u", speed_mpn_div_qr_2u, }, + + { "mpn_divexact_1", speed_mpn_divexact_1, FLAG_R }, + { "mpn_divexact_by3", speed_mpn_divexact_by3 }, + + { "mpn_bdiv_q_1", speed_mpn_bdiv_q_1, FLAG_R }, + { "mpn_pi1_bdiv_q_1", speed_mpn_pi1_bdiv_q_1, FLAG_R_OPTIONAL }, + { "mpn_bdiv_dbm1c", speed_mpn_bdiv_dbm1c, FLAG_R_OPTIONAL }, + +#if HAVE_NATIVE_mpn_modexact_1_odd + { "mpn_modexact_1_odd", speed_mpn_modexact_1_odd, FLAG_R }, +#endif + { "mpn_modexact_1c_odd", speed_mpn_modexact_1c_odd, FLAG_R }, + +#if GMP_NUMB_BITS % 4 == 0 + { "mpn_mod_34lsub1", speed_mpn_mod_34lsub1 }, +#endif + + { "mpn_lshift", speed_mpn_lshift, FLAG_R }, + { "mpn_lshiftc", speed_mpn_lshiftc, FLAG_R }, + { "mpn_rshift", speed_mpn_rshift, FLAG_R }, + + { "mpn_and_n", speed_mpn_and_n, FLAG_R_OPTIONAL }, + { "mpn_andn_n", speed_mpn_andn_n, FLAG_R_OPTIONAL }, + { "mpn_nand_n", speed_mpn_nand_n, FLAG_R_OPTIONAL }, + { "mpn_ior_n", speed_mpn_ior_n, FLAG_R_OPTIONAL }, + { "mpn_iorn_n", speed_mpn_iorn_n, FLAG_R_OPTIONAL }, + { "mpn_nior_n", speed_mpn_nior_n, FLAG_R_OPTIONAL }, + { "mpn_xor_n", speed_mpn_xor_n, FLAG_R_OPTIONAL }, + { "mpn_xnor_n", speed_mpn_xnor_n, FLAG_R_OPTIONAL }, + { "mpn_com", speed_mpn_com }, + { "mpn_neg", speed_mpn_neg }, + + { "mpn_popcount", 
speed_mpn_popcount }, + { "mpn_hamdist", speed_mpn_hamdist }, + + { "mpn_matrix22_mul", speed_mpn_matrix22_mul }, + + { "mpn_hgcd2", speed_mpn_hgcd2, FLAG_NODATA }, + { "mpn_hgcd2_1", speed_mpn_hgcd2_1, FLAG_NODATA }, + { "mpn_hgcd2_2", speed_mpn_hgcd2_2, FLAG_NODATA }, + { "mpn_hgcd2_3", speed_mpn_hgcd2_3, FLAG_NODATA }, + { "mpn_hgcd2_4", speed_mpn_hgcd2_4, FLAG_NODATA }, + { "mpn_hgcd2_5", speed_mpn_hgcd2_5, FLAG_NODATA }, + { "mpn_hgcd", speed_mpn_hgcd }, + { "mpn_hgcd_lehmer", speed_mpn_hgcd_lehmer }, + { "mpn_hgcd_appr", speed_mpn_hgcd_appr }, + { "mpn_hgcd_appr_lehmer", speed_mpn_hgcd_appr_lehmer }, + + { "mpn_hgcd_reduce", speed_mpn_hgcd_reduce }, + { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1 }, + { "mpn_hgcd_reduce_2", speed_mpn_hgcd_reduce_2 }, + + { "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL }, + { "mpn_gcd_11", speed_mpn_gcd_11, FLAG_R_OPTIONAL }, + { "mpn_gcd_1N", speed_mpn_gcd_1N, FLAG_R_OPTIONAL }, + { "mpn_gcd_22", speed_mpn_gcd_22, FLAG_R_OPTIONAL }, + + { "mpn_gcd", speed_mpn_gcd }, + + { "mpn_gcdext", speed_mpn_gcdext }, + { "mpn_gcdext_single", speed_mpn_gcdext_single }, + { "mpn_gcdext_double", speed_mpn_gcdext_double }, + { "mpn_gcdext_one_single", speed_mpn_gcdext_one_single }, + { "mpn_gcdext_one_double", speed_mpn_gcdext_one_double }, +#if 0 + { "mpn_gcdext_lehmer", speed_mpn_gcdext_lehmer }, +#endif + + { "gmp_primesieve", speed_gmp_primesieve, FLAG_NODATA }, + { "mpz_nextprime", speed_mpz_nextprime }, + { "mpz_nextprime_1", speed_mpz_nextprime_1, FLAG_R_OPTIONAL }, + { "mpz_prevprime", speed_mpz_prevprime }, + { "mpz_prevprime_1", speed_mpz_prevprime_1, FLAG_R_OPTIONAL }, + + { "mpz_jacobi", speed_mpz_jacobi }, + { "mpn_jacobi_base", speed_mpn_jacobi_base }, + { "mpn_jacobi_base_1", speed_mpn_jacobi_base_1 }, + { "mpn_jacobi_base_2", speed_mpn_jacobi_base_2 }, + { "mpn_jacobi_base_3", speed_mpn_jacobi_base_3 }, + { "mpn_jacobi_base_4", speed_mpn_jacobi_base_4 }, + + { "mpn_mul", speed_mpn_mul, FLAG_R_OPTIONAL }, + { 
"mpn_mul_basecase", speed_mpn_mul_basecase,FLAG_R_OPTIONAL }, + { "mpn_sqr_basecase", speed_mpn_sqr_basecase }, +#if HAVE_NATIVE_mpn_sqr_diagonal + { "mpn_sqr_diagonal", speed_mpn_sqr_diagonal }, +#endif +#if HAVE_NATIVE_mpn_sqr_diag_addlsh1 + { "mpn_sqr_diag_addlsh1", speed_mpn_sqr_diag_addlsh1 }, +#endif + + { "mpn_mul_n", speed_mpn_mul_n }, + { "mpn_sqr", speed_mpn_sqr }, + + { "mpn_toom2_sqr", speed_mpn_toom2_sqr }, + { "mpn_toom3_sqr", speed_mpn_toom3_sqr }, + { "mpn_toom4_sqr", speed_mpn_toom4_sqr }, + { "mpn_toom6_sqr", speed_mpn_toom6_sqr }, + { "mpn_toom8_sqr", speed_mpn_toom8_sqr }, + { "mpn_toom22_mul", speed_mpn_toom22_mul }, + { "mpn_toom33_mul", speed_mpn_toom33_mul }, + { "mpn_toom44_mul", speed_mpn_toom44_mul }, + { "mpn_toom6h_mul", speed_mpn_toom6h_mul }, + { "mpn_toom8h_mul", speed_mpn_toom8h_mul }, + { "mpn_toom32_mul", speed_mpn_toom32_mul }, + { "mpn_toom42_mul", speed_mpn_toom42_mul }, + { "mpn_toom43_mul", speed_mpn_toom43_mul }, + { "mpn_toom63_mul", speed_mpn_toom63_mul }, + { "mpn_nussbaumer_mul", speed_mpn_nussbaumer_mul }, + { "mpn_nussbaumer_mul_sqr",speed_mpn_nussbaumer_mul_sqr}, +#if WANT_OLD_FFT_FULL + { "mpn_mul_fft_full", speed_mpn_mul_fft_full }, + { "mpn_mul_fft_full_sqr", speed_mpn_mul_fft_full_sqr }, +#endif + { "mpn_mul_fft", speed_mpn_mul_fft, FLAG_R_OPTIONAL }, + { "mpn_mul_fft_sqr", speed_mpn_mul_fft_sqr, FLAG_R_OPTIONAL }, + + { "mpn_sqrlo", speed_mpn_sqrlo }, + { "mpn_sqrlo_basecase", speed_mpn_sqrlo_basecase }, + { "mpn_mullo_n", speed_mpn_mullo_n }, + { "mpn_mullo_basecase", speed_mpn_mullo_basecase }, + + { "mpn_mulmid_basecase", speed_mpn_mulmid_basecase, FLAG_R_OPTIONAL }, + { "mpn_toom42_mulmid", speed_mpn_toom42_mulmid }, + { "mpn_mulmid_n", speed_mpn_mulmid_n }, + { "mpn_mulmid", speed_mpn_mulmid, FLAG_R_OPTIONAL }, + + { "mpn_bc_mulmod_bnm1", speed_mpn_bc_mulmod_bnm1 }, + { "mpn_mulmod_bnm1", speed_mpn_mulmod_bnm1 }, + { "mpn_mulmod_bnm1_rounded", speed_mpn_mulmod_bnm1_rounded }, + { "mpn_sqrmod_bnm1", 
speed_mpn_sqrmod_bnm1 }, + + { "mpn_mulmod_bknp1", speed_mpn_mulmod_bknp1, FLAG_R_OPTIONAL }, + { "mpn_sqrmod_bknp1", speed_mpn_sqrmod_bknp1, FLAG_R_OPTIONAL }, + { "mpn_mulmod_bnp1", speed_mpn_mulmod_bnp1 }, + { "mpn_sqrmod_bnp1", speed_mpn_sqrmod_bnp1 }, + + { "mpn_invert", speed_mpn_invert }, + { "mpn_invertappr", speed_mpn_invertappr }, + { "mpn_ni_invertappr", speed_mpn_ni_invertappr }, + { "mpn_binvert", speed_mpn_binvert }, + { "mpn_sec_invert", speed_mpn_sec_invert }, + + { "mpn_sbpi1_div_qr", speed_mpn_sbpi1_div_qr, FLAG_R_OPTIONAL}, + { "mpn_dcpi1_div_qr", speed_mpn_dcpi1_div_qr, FLAG_R_OPTIONAL}, + { "mpn_mu_div_qr", speed_mpn_mu_div_qr, FLAG_R_OPTIONAL}, + { "mpn_mupi_div_qr", speed_mpn_mupi_div_qr, FLAG_R_OPTIONAL}, + { "mpn_sbpi1_divappr_q", speed_mpn_sbpi1_divappr_q, FLAG_R_OPTIONAL}, + { "mpn_dcpi1_divappr_q", speed_mpn_dcpi1_divappr_q, FLAG_R_OPTIONAL}, + + { "mpn_sbpi1_bdiv_qr", speed_mpn_sbpi1_bdiv_qr }, + { "mpn_dcpi1_bdiv_qr", speed_mpn_dcpi1_bdiv_qr }, + { "mpn_sbpi1_bdiv_q", speed_mpn_sbpi1_bdiv_q }, + { "mpn_dcpi1_bdiv_q", speed_mpn_dcpi1_bdiv_q }, + { "mpn_sbpi1_bdiv_r", speed_mpn_sbpi1_bdiv_r }, + + { "mpn_broot", speed_mpn_broot, FLAG_R }, + { "mpn_broot_invm1", speed_mpn_broot_invm1, FLAG_R }, + { "mpn_brootinv", speed_mpn_brootinv, FLAG_R }, + + { "mpn_get_str", speed_mpn_get_str, FLAG_R_OPTIONAL }, + { "mpn_set_str", speed_mpn_set_str, FLAG_R_OPTIONAL }, + { "mpn_set_str_basecase", speed_mpn_bc_set_str, FLAG_R_OPTIONAL }, + + { "mpn_sqrtrem", speed_mpn_sqrtrem }, + { "mpn_rootrem", speed_mpn_rootrem, FLAG_R }, + { "mpn_sqrt", speed_mpn_sqrt }, + { "mpn_root", speed_mpn_root, FLAG_R }, + + { "mpn_perfect_power_p", speed_mpn_perfect_power_p, }, + { "mpn_perfect_square_p", speed_mpn_perfect_square_p, }, + + { "mpn_fib2_ui", speed_mpn_fib2_ui, FLAG_NODATA }, + { "mpz_fib_ui", speed_mpz_fib_ui, FLAG_NODATA }, + { "mpz_fib2_ui", speed_mpz_fib2_ui, FLAG_NODATA }, + { "mpz_lucnum_ui", speed_mpz_lucnum_ui, FLAG_NODATA }, + { "mpz_lucnum2_ui", 
speed_mpz_lucnum2_ui, FLAG_NODATA }, + + { "mpz_add", speed_mpz_add }, + { "mpz_invert", speed_mpz_invert, FLAG_R_OPTIONAL }, + { "mpz_bin_uiui", speed_mpz_bin_uiui, FLAG_NODATA | FLAG_R_OPTIONAL }, + { "mpz_bin_ui", speed_mpz_bin_ui, FLAG_NODATA | FLAG_R_OPTIONAL }, + { "mpz_fac_ui", speed_mpz_fac_ui, FLAG_NODATA }, + { "mpz_2fac_ui", speed_mpz_2fac_ui, FLAG_NODATA }, + { "mpz_mfac_uiui", speed_mpz_mfac_uiui, FLAG_NODATA | FLAG_R_OPTIONAL }, + { "mpz_primorial_ui", speed_mpz_primorial_ui, FLAG_NODATA }, + { "mpz_powm", speed_mpz_powm, FLAG_R_OPTIONAL }, + { "mpz_powm_mod", speed_mpz_powm_mod }, + { "mpz_powm_redc", speed_mpz_powm_redc }, + { "mpz_powm_sec", speed_mpz_powm_sec }, + { "mpz_powm_ui", speed_mpz_powm_ui, FLAG_R_OPTIONAL }, + + { "mpz_mod", speed_mpz_mod }, + { "mpn_redc_1", speed_mpn_redc_1 }, + { "mpn_redc_2", speed_mpn_redc_2 }, + { "mpn_redc_n", speed_mpn_redc_n }, + + { "MPN_COPY", speed_MPN_COPY }, + { "MPN_COPY_INCR", speed_MPN_COPY_INCR }, + { "MPN_COPY_DECR", speed_MPN_COPY_DECR }, + { "memcpy", speed_memcpy }, +#if HAVE_NATIVE_mpn_copyi + { "mpn_copyi", speed_mpn_copyi }, +#endif +#if HAVE_NATIVE_mpn_copyd + { "mpn_copyd", speed_mpn_copyd }, +#endif + { "mpn_sec_tabselect", speed_mpn_sec_tabselect, FLAG_R_OPTIONAL }, +#if HAVE_NATIVE_mpn_addlsh1_n == 1 + { "mpn_addlsh1_n", speed_mpn_addlsh1_n, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_sublsh1_n == 1 + { "mpn_sublsh1_n", speed_mpn_sublsh1_n, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + { "mpn_addlsh1_n_ip1", speed_mpn_addlsh1_n_ip1 }, +#endif +#if HAVE_NATIVE_mpn_addlsh1_n_ip2 + { "mpn_addlsh1_n_ip2", speed_mpn_addlsh1_n_ip2 }, +#endif +#if HAVE_NATIVE_mpn_sublsh1_n_ip1 + { "mpn_sublsh1_n_ip1", speed_mpn_sublsh1_n_ip1 }, +#endif +#if HAVE_NATIVE_mpn_rsblsh1_n == 1 + { "mpn_rsblsh1_n", speed_mpn_rsblsh1_n, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_addlsh2_n == 1 + { "mpn_addlsh2_n", speed_mpn_addlsh2_n, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_sublsh2_n == 1 
+ { "mpn_sublsh2_n", speed_mpn_sublsh2_n, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_addlsh2_n_ip1 + { "mpn_addlsh2_n_ip1", speed_mpn_addlsh2_n_ip1 }, +#endif +#if HAVE_NATIVE_mpn_addlsh2_n_ip2 + { "mpn_addlsh2_n_ip2", speed_mpn_addlsh2_n_ip2 }, +#endif +#if HAVE_NATIVE_mpn_sublsh2_n_ip1 + { "mpn_sublsh2_n_ip1", speed_mpn_sublsh2_n_ip1 }, +#endif +#if HAVE_NATIVE_mpn_rsblsh2_n == 1 + { "mpn_rsblsh2_n", speed_mpn_rsblsh2_n, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_addlsh_n + { "mpn_addlsh_n", speed_mpn_addlsh_n, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_sublsh_n + { "mpn_sublsh_n", speed_mpn_sublsh_n, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_addlsh_n_ip1 + { "mpn_addlsh_n_ip1", speed_mpn_addlsh_n_ip1 }, +#endif +#if HAVE_NATIVE_mpn_addlsh_n_ip2 + { "mpn_addlsh_n_ip2", speed_mpn_addlsh_n_ip2 }, +#endif +#if HAVE_NATIVE_mpn_sublsh_n_ip1 + { "mpn_sublsh_n_ip1", speed_mpn_sublsh_n_ip1 }, +#endif +#if HAVE_NATIVE_mpn_rsblsh_n + { "mpn_rsblsh_n", speed_mpn_rsblsh_n, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_rsh1add_n + { "mpn_rsh1add_n", speed_mpn_rsh1add_n, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_rsh1sub_n + { "mpn_rsh1sub_n", speed_mpn_rsh1sub_n, FLAG_R_OPTIONAL }, +#endif + + { "mpn_cnd_add_n", speed_mpn_cnd_add_n, FLAG_R_OPTIONAL }, + { "mpn_cnd_sub_n", speed_mpn_cnd_sub_n, FLAG_R_OPTIONAL }, + + { "MPN_ZERO", speed_MPN_ZERO }, + + { "binvert_limb", speed_binvert_limb, FLAG_NODATA }, + { "binvert_limb_mul1", speed_binvert_limb_mul1, FLAG_NODATA }, + { "binvert_limb_loop", speed_binvert_limb_loop, FLAG_NODATA }, + { "binvert_limb_cond", speed_binvert_limb_cond, FLAG_NODATA }, + { "binvert_limb_arith", speed_binvert_limb_arith, FLAG_NODATA }, + + { "malloc_free", speed_malloc_free }, + { "malloc_realloc_free", speed_malloc_realloc_free }, + { "gmp_allocate_free", speed_gmp_allocate_free }, + { "gmp_allocate_reallocate_free", speed_gmp_allocate_reallocate_free }, + { "mpz_init_clear", speed_mpz_init_clear }, + { "mpq_init_clear", 
speed_mpq_init_clear }, + { "mpf_init_clear", speed_mpf_init_clear }, + { "mpz_init_realloc_clear", speed_mpz_init_realloc_clear }, + + { "umul_ppmm", speed_umul_ppmm, FLAG_R_OPTIONAL }, +#if HAVE_NATIVE_mpn_umul_ppmm + { "mpn_umul_ppmm", speed_mpn_umul_ppmm, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_umul_ppmm_r + { "mpn_umul_ppmm_r", speed_mpn_umul_ppmm_r, FLAG_R_OPTIONAL }, +#endif + + { "count_leading_zeros", speed_count_leading_zeros, FLAG_NODATA | FLAG_R_OPTIONAL }, + { "count_trailing_zeros", speed_count_trailing_zeros, FLAG_NODATA | FLAG_R_OPTIONAL }, + + { "udiv_qrnnd", speed_udiv_qrnnd, FLAG_R_OPTIONAL }, + { "udiv_qrnnd_c", speed_udiv_qrnnd_c, FLAG_R_OPTIONAL }, +#if HAVE_NATIVE_mpn_udiv_qrnnd + { "mpn_udiv_qrnnd", speed_mpn_udiv_qrnnd, FLAG_R_OPTIONAL }, +#endif +#if HAVE_NATIVE_mpn_udiv_qrnnd_r + { "mpn_udiv_qrnnd_r", speed_mpn_udiv_qrnnd_r, FLAG_R_OPTIONAL }, +#endif + { "invert_limb", speed_invert_limb, FLAG_R_OPTIONAL }, + + { "operator_div", speed_operator_div, FLAG_R_OPTIONAL }, + { "operator_mod", speed_operator_mod, FLAG_R_OPTIONAL }, + + { "gmp_randseed", speed_gmp_randseed, FLAG_R_OPTIONAL }, + { "gmp_randseed_ui", speed_gmp_randseed_ui, FLAG_R_OPTIONAL | FLAG_NODATA }, + { "mpz_urandomb", speed_mpz_urandomb, FLAG_R_OPTIONAL | FLAG_NODATA }, + +#ifdef SPEED_EXTRA_ROUTINES + SPEED_EXTRA_ROUTINES +#endif +#ifdef SPEED_EXTRA_ROUTINES2 + SPEED_EXTRA_ROUTINES2 +#endif +}; + + +struct choice_t { + const struct routine_t *p; + mp_limb_t r; + double scale; + double time; + int no_time; + double prev_time; + const char *name; +}; +struct choice_t *choice; +int num_choices = 0; + + +void +data_fill (mp_ptr ptr, mp_size_t size) +{ + switch (option_data) { + case DATA_RANDOM: + mpn_random (ptr, size); + break; + case DATA_RANDOM2: + mpn_random2 (ptr, size); + break; + case DATA_ZEROS: + MPN_ZERO (ptr, size); + break; + case DATA_AAS: + MPN_FILL (ptr, size, GMP_NUMB_0xAA); + break; + case DATA_FFS: + MPN_FILL (ptr, size, GMP_NUMB_MAX); + break; + case 
DATA_2FD: + MPN_FILL (ptr, size, GMP_NUMB_MAX); + ptr[0] -= 2; + break; + default: + abort(); + /*NOTREACHED*/ + } +} + +/* The code here handling the various combinations of output options isn't + too attractive, but it works and is fairly clean. */ + +#define SIZE_TO_DIVISOR(n) \ + (option_square == 1 ? (n)*(n) \ + : option_square == 2 ? (n)*((n)+1)/2 \ + : (n)) + +void +run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size) +{ + const char *first_open_fastest, *first_open_notfastest, *first_close; + int i, fastest, want_data; + double fastest_time; + TMP_DECL; + + TMP_MARK; + + /* allocate data, unless all routines are NODATA */ + want_data = 0; + for (i = 0; i < num_choices; i++) + want_data |= ((choice[i].p->flag & FLAG_NODATA) == 0); + + if (want_data) + { + SPEED_TMP_ALLOC_LIMBS (sp.xp, s->size, s->align_xp); + SPEED_TMP_ALLOC_LIMBS (sp.yp, s->size, s->align_yp); + + data_fill (s->xp, s->size); + data_fill (s->yp, s->size); + } + else + { + sp.xp = NULL; + sp.yp = NULL; + } + + if (prev_size == -1 && option_cmp == CMP_DIFFPREV) + { + first_open_fastest = "(#"; + first_open_notfastest = " ("; + first_close = ")"; + } + else + { + first_open_fastest = "#"; + first_open_notfastest = " "; + first_close = ""; + } + + fastest = -1; + fastest_time = -1.0; + for (i = 0; i < num_choices; i++) + { + s->r = choice[i].r; + choice[i].time = speed_measure (choice[i].p->fun, s); + choice[i].no_time = (choice[i].time == -1.0); + if (! choice[i].no_time) + choice[i].time *= choice[i].scale; + + /* Apply the effect of CMP_DIFFPREV, but the new choice[i].prev_time + is before any differences. 
*/ + { + double t; + t = choice[i].time; + if (t != -1.0 && option_cmp == CMP_DIFFPREV && prev_size != -1) + { + if (choice[i].prev_time == -1.0) + choice[i].no_time = 1; + else + choice[i].time = choice[i].time - choice[i].prev_time; + } + choice[i].prev_time = t; + } + + if (choice[i].no_time) + continue; + + /* Look for the fastest after CMP_DIFFPREV has been applied, but + before CMP_RATIO or CMP_DIFFERENCE. There's only a fastest shown + if there's more than one routine. */ + if (num_choices > 1 && (fastest == -1 || choice[i].time < fastest_time)) + { + fastest = i; + fastest_time = choice[i].time; + } + + if (option_cmp == CMP_DIFFPREV) + { + /* Conversion for UNIT_CYCLESPERLIMB differs in CMP_DIFFPREV. */ + if (option_unit == UNIT_CYCLES) + choice[i].time /= speed_cycletime; + else if (option_unit == UNIT_CYCLESPERLIMB) + { + if (prev_size == -1) + choice[i].time /= speed_cycletime; + else + choice[i].time /= (speed_cycletime + * (SIZE_TO_DIVISOR(s->size) + - SIZE_TO_DIVISOR(prev_size))); + } + } + else + { + if (option_unit == UNIT_CYCLES) + choice[i].time /= speed_cycletime; + else if (option_unit == UNIT_CYCLESPERLIMB) + choice[i].time /= (speed_cycletime * SIZE_TO_DIVISOR(s->size)); + + if (option_cmp == CMP_RATIO && i > 0) + { + /* A ratio isn't affected by the units chosen. */ + if (choice[0].no_time || choice[0].time == 0.0) + choice[i].no_time = 1; + else + choice[i].time /= choice[0].time; + } + else if (option_cmp == CMP_DIFFERENCE && i > 0) + { + if (choice[0].no_time) + { + choice[i].no_time = 1; + continue; + } + choice[i].time -= choice[0].time; + } + } + } + + if (option_gnuplot) + { + /* In CMP_DIFFPREV, don't print anything for the first size, start + with the second where an actual difference is available. + + In CMP_RATIO, print the first column as 1.0. + + The 9 decimals printed is much more than the expected precision of + the measurements actually. */ + + if (! 
(option_cmp == CMP_DIFFPREV && prev_size == -1)) + { + fprintf (fp, "%-6ld ", s->size); + for (i = 0; i < num_choices; i++) + fprintf (fp, " %.9e", + choice[i].no_time ? 0.0 + : (option_cmp == CMP_RATIO && i == 0) ? 1.0 + : choice[i].time); + fprintf (fp, "\n"); + } + } + else + { + fprintf (fp, "%-6ld ", s->size); + for (i = 0; i < num_choices; i++) + { + char buf[128]; + int decimals; + + if (choice[i].no_time) + { + fprintf (fp, " %*s", COLUMN_WIDTH, "n/a"); + } + else + {if (option_unit == UNIT_CYCLESPERLIMB + || (option_cmp == CMP_RATIO && i > 0)) + decimals = 4; + else if (option_unit == UNIT_CYCLES) + decimals = 2; + else + decimals = 9; + + sprintf (buf, "%s%.*f%s", + i == fastest ? first_open_fastest : first_open_notfastest, + decimals, choice[i].time, first_close); + fprintf (fp, " %*s", COLUMN_WIDTH, buf); + } + } + fprintf (fp, "\n"); + } + + TMP_FREE; +} + +void +run_all (FILE *fp) +{ + mp_size_t prev_size; + int i; + TMP_DECL; + + TMP_MARK; + SPEED_TMP_ALLOC_LIMBS (sp.xp_block, SPEED_BLOCK_SIZE, sp.align_xp); + SPEED_TMP_ALLOC_LIMBS (sp.yp_block, SPEED_BLOCK_SIZE, sp.align_yp); + + data_fill (sp.xp_block, SPEED_BLOCK_SIZE); + data_fill (sp.yp_block, SPEED_BLOCK_SIZE); + + for (i = 0; i < size_num; i++) + { + sp.size = size_array[i].start; + prev_size = -1; + for (;;) + { + mp_size_t step; + + if (option_data == DATA_2FD && sp.size >= 2) + sp.xp[sp.size-1] = 2; + + run_one (fp, &sp, prev_size); + prev_size = sp.size; + + if (option_data == DATA_2FD && sp.size >= 2) + sp.xp[sp.size-1] = MP_LIMB_T_MAX; + + if (option_factor != 0.0) + { + step = (mp_size_t) (sp.size * option_factor - sp.size); + if (step < 1) + step = 1; + } + else + step = 1; + if (step < option_step) + step = option_step; + + sp.size += step; + if (sp.size > size_array[i].end) + break; + } + } + + TMP_FREE; +} + + +FILE * +fopen_for_write (const char *filename) +{ + FILE *fp; + if ((fp = fopen (filename, "w")) == NULL) + { + fprintf (stderr, "Cannot create %s\n", filename); + exit(1); + 
} + return fp; +} + +void +fclose_written (FILE *fp, const char *filename) +{ + int err; + + err = ferror (fp); + err |= fclose (fp); + + if (err) + { + fprintf (stderr, "Error writing %s\n", filename); + exit(1); + } +} + + +void +run_gnuplot (int argc, char *argv[]) +{ + char *plot_filename; + char *data_filename; + FILE *fp; + int i; + + plot_filename = (char *) (*__gmp_allocate_func) + (strlen (option_gnuplot_basename) + 20); + data_filename = (char *) (*__gmp_allocate_func) + (strlen (option_gnuplot_basename) + 20); + + sprintf (plot_filename, "%s.gnuplot", option_gnuplot_basename); + sprintf (data_filename, "%s.data", option_gnuplot_basename); + + fp = fopen_for_write (plot_filename); + + fprintf (fp, "# Generated with:\n"); + fprintf (fp, "#"); + for (i = 0; i < argc; i++) + fprintf (fp, " %s", argv[i]); + fprintf (fp, "\n"); + fprintf (fp, "\n"); + + fprintf (fp, "reset\n"); + + /* Putting the key at the top left is usually good, and you can change it + interactively if it's not. */ + fprintf (fp, "set key left\n"); + + /* write underscores, not subscripts */ + fprintf (fp, "set termoption noenhanced\n"); + + /* designed to make it possible to see crossovers easily */ + fprintf (fp, "set style data lines\n"); + + fprintf (fp, "plot "); + for (i = 0; i < num_choices; i++) + { + fprintf (fp, " \"%s\" using 1:%d", data_filename, i+2); + fprintf (fp, " title \"%s\"", choice[i].name); + + if (i != num_choices-1) + fprintf (fp, ", \\"); + fprintf (fp, "\n"); + } + + fprintf (fp, "load \"-\"\n"); + fclose_written (fp, plot_filename); + + fp = fopen_for_write (data_filename); + + /* Unbuffered so you can see where the program was up to if it crashes or + you kill it. */ + setbuf (fp, NULL); + + run_all (fp); + fclose_written (fp, data_filename); +} + + +/* Return a limb with n many one bits (starting from the least significant) */ + +#define LIMB_ONES(n) \ + ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX \ + : (n) == 0 ? 
CNST_LIMB(0) \ + : (CNST_LIMB(1) << (n)) - 1) + +mp_limb_t +r_string (const char *s) +{ + const char *s_orig = s; + long n; + + if (strcmp (s, "aas") == 0) + return GMP_NUMB_0xAA; + + { + mpz_t z; + mp_limb_t l; + int set, siz; + + mpz_init (z); + set = mpz_set_str (z, s, 0); + siz = SIZ(z); + l = (siz == 0 ? 0 : siz > 0 ? PTR(z)[0] : -PTR(z)[0]); + mpz_clear (z); + if (set == 0) + { + if (siz > 1 || siz < -1) + printf ("Warning, r parameter %s truncated to %d bits\n", + s_orig, GMP_LIMB_BITS); + return l; + } + } + + if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) + n = strtoul (s+2, (char **) &s, 16); + else + n = strtol (s, (char **) &s, 10); + + if (strcmp (s, "bits") == 0) + { + mp_limb_t l; + if (n > GMP_LIMB_BITS) + { + fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n", + n, GMP_LIMB_BITS); + exit (1); + } + mpn_random (&l, 1); + return (l | (CNST_LIMB(1) << (n-1))) & LIMB_ONES(n); + } + else if (strcmp (s, "ones") == 0) + { + if (n > GMP_LIMB_BITS) + { + fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n", + n, GMP_LIMB_BITS); + exit (1); + } + return LIMB_ONES (n); + } + else if (*s != '\0') + { + fprintf (stderr, "invalid r parameter: %s\n", s_orig); + exit (1); + } + + return n; +} + + +void +routine_find (struct choice_t *c, const char *s_orig) +{ + const char *s; + int i; + size_t nlen; + + c->name = s_orig; + s = strchr (s_orig, '*'); + if (s != NULL) + { + c->scale = atof(s_orig); + s++; + } + else + { + c->scale = 1.0; + s = s_orig; + } + + for (i = 0; i < numberof (routine); i++) + { + nlen = strlen (routine[i].name); + if (memcmp (s, routine[i].name, nlen) != 0) + continue; + + if (s[nlen] == '.') + { + /* match, with a .r parameter */ + + if (! 
(routine[i].flag & (FLAG_R|FLAG_R_OPTIONAL))) + { + fprintf (stderr, + "Choice %s bad: doesn't take a \".\" parameter\n", + s_orig); + exit (1); + } + + c->p = &routine[i]; + c->r = r_string (s + nlen + 1); + return; + } + + if (s[nlen] == '\0') + { + /* match, with no parameter */ + + if (routine[i].flag & FLAG_R) + { + fprintf (stderr, + "Choice %s bad: needs a \".\" parameter\n", + s_orig); + exit (1); + } + + c->p = &routine[i]; + c->r = 0; + return; + } + } + + fprintf (stderr, "Choice %s unrecognised\n", s_orig); + exit (1); +} + + +void +usage (void) +{ + int i; + + speed_time_init (); + + printf ("Usage: speed [-options] -s size ...\n"); + printf ("Measure the speed of some routines.\n"); + printf ("Times are in seconds, accuracy is shown.\n"); + printf ("\n"); + printf (" -p num set precision as number of time units each routine must run\n"); + printf (" -s size[-end][,size[-end]]... sizes to measure\n"); + printf (" single sizes or ranges, sep with comma or use multiple -s\n"); + printf (" -t step step through sizes by given amount\n"); + printf (" -f factor step through sizes by given factor (eg. 
1.05)\n"); + printf (" -r show times as ratios of the first routine\n"); + printf (" -d show times as difference from the first routine\n"); + printf (" -D show times as difference from previous size shown\n"); + printf (" -c show times in CPU cycles\n"); + printf (" -C show times in cycles per limb\n"); + printf (" -u print resource usage (memory) at end\n"); + printf (" -P name output plot files \"name.gnuplot\" and \"name.data\"\n"); + printf (" -a use given data: random(default), random2, zeros, aas, ffs, 2fd\n"); + printf (" -x, -y, -w, -W specify data alignments, sources and dests\n"); + printf (" -o addrs print addresses of data blocks\n"); + printf ("\n"); + printf ("If both -t and -f are used, it means step by the factor or the step, whichever\n"); + printf ("is greater.\n"); + printf ("If both -C and -D are used, it means cycles per however many limbs between a\n"); + printf ("size and the previous size.\n"); + printf ("\n"); + printf ("After running with -P, plots can be viewed with Gnuplot or Quickplot.\n"); + printf ("\"gnuplot name.gnuplot\" (use \"set logscale xy; replot\" at the prompt for\n"); + printf ("a log/log plot).\n"); + printf ("\"quickplot -s name.data\" (has interactive zooming, and note -s is important\n"); + printf ("when viewing more than one routine, it means same axis scales for all data).\n"); + printf ("\n"); + printf ("The available routines are as follows.\n"); + printf ("\n"); + + for (i = 0; i < numberof (routine); i++) + { + if (routine[i].flag & FLAG_R) + printf ("\t%s.r\n", routine[i].name); + else if (routine[i].flag & FLAG_R_OPTIONAL) + printf ("\t%s (optional .r)\n", routine[i].name); + else + printf ("\t%s\n", routine[i].name); + } + printf ("\n"); + printf ("Routines with a \".r\" need an extra parameter, for example mpn_lshift.6\n"); + printf ("r should be in decimal, or use 0xN for hexadecimal.\n"); + printf ("\n"); + printf ("Special forms for r are \"bits\" for a random N bit number, \"ones\" for\n"); + printf ("N 
one bits, or \"aas\" for 0xAA..AA.\n"); + printf ("\n"); + printf ("Times for sizes out of the range accepted by a routine are shown as 0.\n"); + printf ("The fastest routine at each size is marked with a # (free form output only).\n"); + printf ("\n"); + printf ("%s", speed_time_string); + printf ("\n"); + printf ("Gnuplot home page http://www.gnuplot.info/\n"); + printf ("Quickplot home page http://quickplot.sourceforge.net/\n"); +} + +void +check_align_option (const char *name, mp_size_t align) +{ + if (align < 0 || align > SPEED_TMP_ALLOC_ADJUST_MASK) + { + fprintf (stderr, "Alignment request out of range: %s %ld\n", + name, (long) align); + fprintf (stderr, " should be 0 to %d (limbs), inclusive\n", + SPEED_TMP_ALLOC_ADJUST_MASK); + exit (1); + } +} + +int +main (int argc, char *argv[]) +{ + int i; + int opt; + + /* Unbuffered so output goes straight out when directed to a pipe or file + and isn't lost on killing the program half way. */ + setbuf (stdout, NULL); + + for (;;) + { + opt = getopt(argc, argv, "a:CcDdEFf:o:p:P:rRs:t:ux:y:w:W:z"); + if (opt == EOF) + break; + + switch (opt) { + case 'a': + if (strcmp (optarg, "random") == 0) option_data = DATA_RANDOM; + else if (strcmp (optarg, "random2") == 0) option_data = DATA_RANDOM2; + else if (strcmp (optarg, "zeros") == 0) option_data = DATA_ZEROS; + else if (strcmp (optarg, "aas") == 0) option_data = DATA_AAS; + else if (strcmp (optarg, "ffs") == 0) option_data = DATA_FFS; + else if (strcmp (optarg, "2fd") == 0) option_data = DATA_2FD; + else + { + fprintf (stderr, "unrecognised data option: %s\n", optarg); + exit (1); + } + break; + case 'C': + if (option_unit != UNIT_SECONDS) goto bad_unit; + option_unit = UNIT_CYCLESPERLIMB; + break; + case 'c': + if (option_unit != UNIT_SECONDS) + { + bad_unit: + fprintf (stderr, "cannot use more than one of -c, -C\n"); + exit (1); + } + option_unit = UNIT_CYCLES; + break; + case 'D': + if (option_cmp != CMP_ABSOLUTE) goto bad_cmp; + option_cmp = CMP_DIFFPREV; + break; + 
case 'd': + if (option_cmp != CMP_ABSOLUTE) + { + bad_cmp: + fprintf (stderr, "cannot use more than one of -d, -D, -r\n"); + exit (1); + } + option_cmp = CMP_DIFFERENCE; + break; + case 'E': + option_square = 1; + break; + case 'F': + option_square = 2; + break; + case 'f': + option_factor = atof (optarg); + if (option_factor <= 1.0) + { + fprintf (stderr, "-f factor must be > 1.0\n"); + exit (1); + } + break; + case 'o': + speed_option_set (optarg); + break; + case 'P': + option_gnuplot = 1; + option_gnuplot_basename = optarg; + break; + case 'p': + speed_precision = atoi (optarg); + break; + case 'R': + option_seed = time (NULL); + break; + case 'r': + if (option_cmp != CMP_ABSOLUTE) + goto bad_cmp; + option_cmp = CMP_RATIO; + break; + case 's': + { + char *s; + for (s = strtok (optarg, ","); s != NULL; s = strtok (NULL, ",")) + { + if (size_num == size_allocnum) + { + size_array = (struct size_array_t *) + __gmp_allocate_or_reallocate + (size_array, + size_allocnum * sizeof(size_array[0]), + (size_allocnum+10) * sizeof(size_array[0])); + size_allocnum += 10; + } + if (sscanf (s, "%ld-%ld", + &size_array[size_num].start, + &size_array[size_num].end) != 2) + { + size_array[size_num].start = size_array[size_num].end + = atol (s); + } + + if (size_array[size_num].start < 0 + || size_array[size_num].end < 0 + || size_array[size_num].start > size_array[size_num].end) + { + fprintf (stderr, "invalid size parameter: %s\n", s); + exit (1); + } + + size_num++; + } + } + break; + case 't': + option_step = atol (optarg); + if (option_step < 1) + { + fprintf (stderr, "-t step must be >= 1\n"); + exit (1); + } + break; + case 'u': + option_resource_usage = 1; + break; + case 'z': + sp.cache = 1; + break; + case 'x': + sp.align_xp = atol (optarg); + check_align_option ("-x", sp.align_xp); + break; + case 'y': + sp.align_yp = atol (optarg); + check_align_option ("-y", sp.align_yp); + break; + case 'w': + sp.align_wp = atol (optarg); + check_align_option ("-w", sp.align_wp); + 
break; + case 'W': + sp.align_wp2 = atol (optarg); + check_align_option ("-W", sp.align_wp2); + break; + case '?': + exit(1); + } + } + + if (optind >= argc) + { + usage (); + exit (1); + } + + if (size_num == 0) + { + fprintf (stderr, "-s must be specified\n"); + exit (1); + } + + gmp_randinit_default (__gmp_rands); + __gmp_rands_initialized = 1; + gmp_randseed_ui (__gmp_rands, option_seed); + + choice = (struct choice_t *) (*__gmp_allocate_func) + ((argc - optind) * sizeof(choice[0])); + for ( ; optind < argc; optind++) + { + struct choice_t c; + routine_find (&c, argv[optind]); + choice[num_choices] = c; + num_choices++; + } + + if ((option_cmp == CMP_RATIO || option_cmp == CMP_DIFFERENCE) && + num_choices < 2) + { + fprintf (stderr, "WARNING, -d or -r does nothing when only one routine requested\n"); + } + + speed_time_init (); + if (option_unit == UNIT_CYCLES || option_unit == UNIT_CYCLESPERLIMB) + speed_cycletime_need_cycles (); + else + speed_cycletime_need_seconds (); + + if (option_gnuplot) + { + run_gnuplot (argc, argv); + } + else + { + if (option_unit == UNIT_SECONDS) + printf ("overhead %.9f secs", speed_measure (speed_noop, NULL)); + else + printf ("overhead %.2f cycles", + speed_measure (speed_noop, NULL) / speed_cycletime); + printf (", precision %d units of %.2e secs", + speed_precision, speed_unittime); + + if (speed_cycletime == 1.0 || speed_cycletime == 0.0) + printf (", CPU freq unknown\n"); + else + printf (", CPU freq %.2f MHz\n", 1e-6/speed_cycletime); + + printf (" "); + for (i = 0; i < num_choices; i++) + printf (" %*s", COLUMN_WIDTH, choice[i].name); + printf ("\n"); + + run_all (stdout); + } + + if (option_resource_usage) + { +#if HAVE_GETRUSAGE + { + /* This doesn't give data sizes on linux 2.0.x, only utime. 
*/ + struct rusage r; + if (getrusage (RUSAGE_SELF, &r) != 0) + perror ("getrusage"); + else + printf ("getrusage(): utime %ld.%06ld data %ld stack %ld maxresident %ld\n", + (long) r.ru_utime.tv_sec, (long) r.ru_utime.tv_usec, + r.ru_idrss, r.ru_isrss, r.ru_ixrss); + } +#else + printf ("getrusage() not available\n"); +#endif + + /* Linux kernel. */ + { + char buf[128]; + sprintf (buf, "/proc/%d/status", getpid()); + if (access (buf, R_OK) == 0) + { + sprintf (buf, "cat /proc/%d/status", getpid()); + system (buf); + } + + } + } + + return 0; +} diff --git a/gmp-6.3.0/tune/speed.h b/gmp-6.3.0/tune/speed.h new file mode 100644 index 0000000..f09472c --- /dev/null +++ b/gmp-6.3.0/tune/speed.h @@ -0,0 +1,3981 @@ +/* Header for speed and threshold things. + +Copyright 1999-2003, 2005, 2006, 2008-2017, 2019-2022 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#ifndef __SPEED_H__ +#define __SPEED_H__ + + +/* Pad ptr,oldsize with zero limbs (at the most significant end) to make it + newsize long. 
*/
/* Asserts newsize >= oldsize, then clears the (newsize)-(oldsize) limbs
   starting at (ptr)+(oldsize).  */
#define MPN_ZERO_EXTEND(ptr, oldsize, newsize)          \
  do {                                                  \
    ASSERT ((newsize) >= (oldsize));                    \
    MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize));    \
  } while (0)

/* A mask of the least significant n bits.  Note 1<<32 doesn't give zero on
   x86 family CPUs, hence the separate case for GMP_LIMB_BITS.  */
#define MP_LIMB_T_LOWBITMASK(n) \
  ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)


/* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */

/* Over-allocates by (align)-1 bytes with TMP_ALLOC and passes the result to
   align_pointer, so the surrounding function needs whatever TMP_ALLOC
   itself requires.  */
#define TMP_ALLOC_ALIGNED(bytes, align) \
  align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
/* Same, with the size given in limbs and the result cast to mp_ptr.  */
#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \
  ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))

/* CACHE_LINE_SIZE is our default alignment for speed operands, and the
   limit on what s->align_xp etc can then request for off-alignment.  Maybe
   this should be an option of some sort, but in any case here are some line
   sizes,

       bytes
         32   pentium
         64   athlon
         64   itanium-2 L1
        128   itanium-2 L2
*/
#define CACHE_LINE_SIZE 64 /* bytes */

/* Number of limbs in a cache line, minus one.  Used as a bit mask below,
   which presumes that count is a power of 2.  */
#define SPEED_TMP_ALLOC_ADJUST_MASK (CACHE_LINE_SIZE/GMP_LIMB_BYTES - 1)

/* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb
   alignment.

   SPEED_TMP_ALLOC_ADJUST_MASK extra limbs are allocated, then the block
   start is advanced by 0 to SPEED_TMP_ALLOC_ADJUST_MASK limbs so that the
   limb index of ptr and (align) agree in the bits selected by the mask.
   NOTE(review): __ptr - (mp_ptr) NULL is used to get the address as a limb
   count; this relies on the compiler accepting arithmetic on a null
   pointer.  */
#define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align)                        \
  do {                                                                  \
    mp_ptr     __ptr;                                                   \
    mp_size_t  __ptr_align, __ptr_add;                                  \
                                                                        \
    ASSERT ((CACHE_LINE_SIZE % GMP_LIMB_BYTES) == 0);                   \
    __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK);    \
    __ptr_align = (__ptr - (mp_ptr) NULL);                              \
    __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK;  \
    (ptr) = __ptr + __ptr_add;                                          \
  } while (0)


/* This is the size for s->xp_block and s->yp_block, used in certain + routines that want to run across many different data values and use + s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1. + + 512 means 2kbytes of data for each of xp_block and yp_block, making 4k + total, which should fit easily in any L1 data cache. 
*/

#define SPEED_BLOCK_SIZE   512 /* limbs */


/* Timing state shared with the measuring programs.  From its use in the
   speed program's main(): speed_unittime is printed as the length in
   seconds of one precision unit, speed_cycletime as seconds per CPU cycle
   (1.0 or 0.0 being reported as "CPU freq unknown"), and speed_precision is
   the value given to -p.  */
extern double  speed_unittime;
extern double  speed_cycletime;
extern int     speed_precision;
extern char    speed_time_string[];
void speed_time_init (void);
void speed_cycletime_fail (const char *str);
void speed_cycletime_init (void);
void speed_cycletime_need_cycles (void);
void speed_cycletime_need_seconds (void);
void speed_starttime (void);
double speed_endtime (void);


/* Parameters handed to every speed measuring routine.  */
struct speed_params {
  unsigned   reps;      /* how many times to run the routine */
  mp_ptr     xp;        /* first argument */
  mp_ptr     yp;        /* second argument */
  mp_size_t  size;      /* size of both arguments */
  mp_limb_t  r;         /* user supplied parameter */
  mp_size_t  align_xp;  /* alignment of xp */
  mp_size_t  align_yp;  /* alignment of yp */
  mp_size_t  align_wp;  /* intended alignment of wp */
  mp_size_t  align_wp2; /* intended alignment of wp2 */
  mp_ptr     xp_block;  /* first special SPEED_BLOCK_SIZE block */
  mp_ptr     yp_block;  /* second special SPEED_BLOCK_SIZE block */

  double     time_divisor; /* optionally set by the speed routine */

  /* used by the cache priming things */
  int        cache;
  unsigned   src_num, dst_num;
  struct {
    mp_ptr    ptr;
    mp_size_t size;
  } src[5], dst[4];
};

/* The type of a measuring routine: takes the parameters above and returns a
   double (presumably the measured time -- confirm against speed_measure).  */
typedef double (*speed_function_t) (struct speed_params *);

double speed_measure (speed_function_t fun, struct speed_params *);

/* Prototypes for speed measuring routines */

double speed_back_to_back (struct speed_params *);
double speed_count_leading_zeros (struct speed_params *);
double speed_count_trailing_zeros (struct speed_params *);
double speed_find_a (struct speed_params *);
double speed_gmp_allocate_free (struct speed_params *);
double speed_gmp_allocate_reallocate_free (struct speed_params *);
double speed_invert_limb (struct speed_params *);
double speed_malloc_free (struct speed_params *);
double speed_malloc_realloc_free (struct speed_params *);
double speed_memcpy (struct speed_params *);
double speed_binvert_limb (struct speed_params 
*); +double speed_binvert_limb_mul1 (struct speed_params *); +double speed_binvert_limb_loop (struct speed_params *); +double speed_binvert_limb_cond (struct speed_params *); +double speed_binvert_limb_arith (struct speed_params *); + +double speed_mpf_init_clear (struct speed_params *); + +double speed_mpn_add_n (struct speed_params *); +double speed_mpn_add_1 (struct speed_params *); +double speed_mpn_add_1_inplace (struct speed_params *); +double speed_mpn_add_err1_n (struct speed_params *); +double speed_mpn_add_err2_n (struct speed_params *); +double speed_mpn_add_err3_n (struct speed_params *); +double speed_mpn_addlsh_n (struct speed_params *); +double speed_mpn_addlsh1_n (struct speed_params *); +double speed_mpn_addlsh2_n (struct speed_params *); +double speed_mpn_addlsh_n_ip1 (struct speed_params *); +double speed_mpn_addlsh1_n_ip1 (struct speed_params *); +double speed_mpn_addlsh2_n_ip1 (struct speed_params *); +double speed_mpn_addlsh_n_ip2 (struct speed_params *); +double speed_mpn_addlsh1_n_ip2 (struct speed_params *); +double speed_mpn_addlsh2_n_ip2 (struct speed_params *); +double speed_mpn_add_n_sub_n (struct speed_params *); +double speed_mpn_and_n (struct speed_params *); +double speed_mpn_andn_n (struct speed_params *); +double speed_mpn_addmul_1 (struct speed_params *); +double speed_mpn_addmul_2 (struct speed_params *); +double speed_mpn_addmul_3 (struct speed_params *); +double speed_mpn_addmul_4 (struct speed_params *); +double speed_mpn_addmul_5 (struct speed_params *); +double speed_mpn_addmul_6 (struct speed_params *); +double speed_mpn_addmul_7 (struct speed_params *); +double speed_mpn_addmul_8 (struct speed_params *); +double speed_mpn_addaddmul_1msb0 (struct speed_params *); +double speed_mpn_cnd_add_n (struct speed_params *); +double speed_mpn_cnd_sub_n (struct speed_params *); +double speed_mpn_com (struct speed_params *); +double speed_mpn_neg (struct speed_params *); +double speed_mpn_copyd (struct speed_params *); +double 
speed_mpn_copyi (struct speed_params *); +double speed_MPN_COPY (struct speed_params *); +double speed_MPN_COPY_DECR (struct speed_params *); +double speed_MPN_COPY_INCR (struct speed_params *); +double speed_mpn_sec_tabselect (struct speed_params *); +double speed_mpn_divexact_1 (struct speed_params *); +double speed_mpn_divexact_by3 (struct speed_params *); +double speed_mpn_bdiv_q_1 (struct speed_params *); +double speed_mpn_pi1_bdiv_q_1 (struct speed_params *); +double speed_mpn_bdiv_dbm1c (struct speed_params *); +double speed_mpn_divrem_1 (struct speed_params *); +double speed_mpn_divrem_1f (struct speed_params *); +double speed_mpn_divrem_1c (struct speed_params *); +double speed_mpn_divrem_1cf (struct speed_params *); +double speed_mpn_divrem_1_div (struct speed_params *); +double speed_mpn_divrem_1f_div (struct speed_params *); +double speed_mpn_divrem_1_inv (struct speed_params *); +double speed_mpn_divrem_1f_inv (struct speed_params *); +double speed_mpn_divrem_2 (struct speed_params *); +double speed_mpn_divrem_2_div (struct speed_params *); +double speed_mpn_divrem_2_inv (struct speed_params *); +double speed_mpn_div_qr_1n_pi1 (struct speed_params *); +double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *); +double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *); +double speed_mpn_div_qr_1n_pi1_3 (struct speed_params *); +double speed_mpn_div_qr_1n_pi1_4 (struct speed_params *); +double speed_mpn_div_qr_1 (struct speed_params *); +double speed_mpn_div_qr_2n (struct speed_params *); +double speed_mpn_div_qr_2u (struct speed_params *); +double speed_mpn_fib2_ui (struct speed_params *); +double speed_mpn_matrix22_mul (struct speed_params *); +double speed_mpn_hgcd2 (struct speed_params *); +double speed_mpn_hgcd2_1 (struct speed_params *); +double speed_mpn_hgcd2_2 (struct speed_params *); +double speed_mpn_hgcd2_3 (struct speed_params *); +double speed_mpn_hgcd2_4 (struct speed_params *); +double speed_mpn_hgcd2_5 (struct speed_params *); +double 
speed_mpn_hgcd (struct speed_params *); +double speed_mpn_hgcd_lehmer (struct speed_params *); +double speed_mpn_hgcd_appr (struct speed_params *); +double speed_mpn_hgcd_appr_lehmer (struct speed_params *); +double speed_mpn_hgcd_reduce (struct speed_params *); +double speed_mpn_hgcd_reduce_1 (struct speed_params *); +double speed_mpn_hgcd_reduce_2 (struct speed_params *); +double speed_mpn_gcd (struct speed_params *); +double speed_mpn_gcd_1 (struct speed_params *); +double speed_mpn_gcd_11 (struct speed_params *); +double speed_mpn_gcd_1N (struct speed_params *); +double speed_mpn_gcd_22 (struct speed_params *); +double speed_mpn_gcdext (struct speed_params *); +double speed_mpn_gcdext_double (struct speed_params *); +double speed_mpn_gcdext_one_double (struct speed_params *); +double speed_mpn_gcdext_one_single (struct speed_params *); +double speed_mpn_gcdext_single (struct speed_params *); +double speed_mpn_get_str (struct speed_params *); +double speed_mpn_hamdist (struct speed_params *); +double speed_mpn_ior_n (struct speed_params *); +double speed_mpn_iorn_n (struct speed_params *); +double speed_mpn_jacobi_base (struct speed_params *); +double speed_mpn_jacobi_base_1 (struct speed_params *); +double speed_mpn_jacobi_base_2 (struct speed_params *); +double speed_mpn_jacobi_base_3 (struct speed_params *); +double speed_mpn_jacobi_base_4 (struct speed_params *); +double speed_mpn_lshift (struct speed_params *); +double speed_mpn_lshiftc (struct speed_params *); +double speed_mpn_mod_1 (struct speed_params *); +double speed_mpn_mod_1c (struct speed_params *); +double speed_mpn_mod_1_div (struct speed_params *); +double speed_mpn_mod_1_inv (struct speed_params *); +double speed_mpn_mod_1_1 (struct speed_params *); +double speed_mpn_mod_1_1_1 (struct speed_params *); +double speed_mpn_mod_1_1_2 (struct speed_params *); +double speed_mpn_mod_1_2 (struct speed_params *); +double speed_mpn_mod_1_3 (struct speed_params *); +double speed_mpn_mod_1_4 (struct 
speed_params *); +double speed_mpn_mod_34lsub1 (struct speed_params *); +double speed_mpn_modexact_1_odd (struct speed_params *); +double speed_mpn_modexact_1c_odd (struct speed_params *); +double speed_mpn_mul_1 (struct speed_params *); +double speed_mpn_mul_1_inplace (struct speed_params *); +double speed_mpn_mul_2 (struct speed_params *); +double speed_mpn_mul_3 (struct speed_params *); +double speed_mpn_mul_4 (struct speed_params *); +double speed_mpn_mul_5 (struct speed_params *); +double speed_mpn_mul_6 (struct speed_params *); +double speed_mpn_mul (struct speed_params *); +double speed_mpn_mul_basecase (struct speed_params *); +double speed_mpn_mulmid (struct speed_params *); +double speed_mpn_mulmid_basecase (struct speed_params *); +double speed_mpn_mul_fft (struct speed_params *); +double speed_mpn_mul_fft_sqr (struct speed_params *); +double speed_mpn_fft_mul (struct speed_params *); +double speed_mpn_fft_sqr (struct speed_params *); +#if WANT_OLD_FFT_FULL +double speed_mpn_mul_fft_full (struct speed_params *); +double speed_mpn_mul_fft_full_sqr (struct speed_params *); +#endif +double speed_mpn_nussbaumer_mul (struct speed_params *); +double speed_mpn_nussbaumer_mul_sqr (struct speed_params *); +double speed_mpn_mul_n (struct speed_params *); +double speed_mpn_mul_n_sqr (struct speed_params *); +double speed_mpn_mulmid_n (struct speed_params *); +double speed_mpn_sqrlo (struct speed_params *); +double speed_mpn_sqrlo_basecase (struct speed_params *); +double speed_mpn_mullo_n (struct speed_params *); +double speed_mpn_mullo_basecase (struct speed_params *); +double speed_mpn_nand_n (struct speed_params *); +double speed_mpn_nior_n (struct speed_params *); +double speed_mpn_popcount (struct speed_params *); +double speed_mpn_preinv_divrem_1 (struct speed_params *); +double speed_mpn_preinv_divrem_1f (struct speed_params *); +double speed_mpn_preinv_mod_1 (struct speed_params *); +double speed_mpn_sbpi1_div_qr (struct speed_params *); +double 
speed_mpn_dcpi1_div_qr (struct speed_params *); +double speed_mpn_sbpi1_divappr_q (struct speed_params *); +double speed_mpn_dcpi1_divappr_q (struct speed_params *); +double speed_mpn_mu_div_qr (struct speed_params *); +double speed_mpn_mu_divappr_q (struct speed_params *); +double speed_mpn_mupi_div_qr (struct speed_params *); +double speed_mpn_mu_div_q (struct speed_params *); +double speed_mpn_sbpi1_bdiv_qr (struct speed_params *); +double speed_mpn_dcpi1_bdiv_qr (struct speed_params *); +double speed_mpn_sbpi1_bdiv_q (struct speed_params *); +double speed_mpn_dcpi1_bdiv_q (struct speed_params *); +double speed_mpn_sbpi1_bdiv_r (struct speed_params *); +double speed_mpn_mu_bdiv_q (struct speed_params *); +double speed_mpn_mu_bdiv_qr (struct speed_params *); +double speed_mpn_broot (struct speed_params *); +double speed_mpn_broot_invm1 (struct speed_params *); +double speed_mpn_brootinv (struct speed_params *); +double speed_mpn_invert (struct speed_params *); +double speed_mpn_invertappr (struct speed_params *); +double speed_mpn_ni_invertappr (struct speed_params *); +double speed_mpn_sec_invert (struct speed_params *s); +double speed_mpn_binvert (struct speed_params *); +double speed_mpn_redc_1 (struct speed_params *); +double speed_mpn_redc_2 (struct speed_params *); +double speed_mpn_redc_n (struct speed_params *); +double speed_mpn_rsblsh_n (struct speed_params *); +double speed_mpn_rsblsh1_n (struct speed_params *); +double speed_mpn_rsblsh2_n (struct speed_params *); +double speed_mpn_rsh1add_n (struct speed_params *); +double speed_mpn_rsh1sub_n (struct speed_params *); +double speed_mpn_rshift (struct speed_params *); +double speed_mpn_sb_divrem_m3 (struct speed_params *); +double speed_mpn_sb_divrem_m3_div (struct speed_params *); +double speed_mpn_sb_divrem_m3_inv (struct speed_params *); +double speed_mpn_set_str (struct speed_params *); +double speed_mpn_bc_set_str (struct speed_params *); +double speed_mpn_dc_set_str (struct speed_params *); 
+double speed_mpn_set_str_pre (struct speed_params *); +double speed_mpn_sqr_basecase (struct speed_params *); +double speed_mpn_sqr_diag_addlsh1 (struct speed_params *); +double speed_mpn_sqr_diagonal (struct speed_params *); +double speed_mpn_sqr (struct speed_params *); +double speed_mpn_sqrtrem (struct speed_params *); +double speed_mpn_rootrem (struct speed_params *); +double speed_mpn_sqrt (struct speed_params *); +double speed_mpn_root (struct speed_params *); +double speed_mpn_perfect_power_p (struct speed_params *); +double speed_mpn_perfect_square_p (struct speed_params *); +double speed_mpn_sub_n (struct speed_params *); +double speed_mpn_sub_1 (struct speed_params *); +double speed_mpn_sub_1_inplace (struct speed_params *); +double speed_mpn_sub_err1_n (struct speed_params *); +double speed_mpn_sub_err2_n (struct speed_params *); +double speed_mpn_sub_err3_n (struct speed_params *); +double speed_mpn_sublsh_n (struct speed_params *); +double speed_mpn_sublsh1_n (struct speed_params *); +double speed_mpn_sublsh2_n (struct speed_params *); +double speed_mpn_sublsh_n_ip1 (struct speed_params *); +double speed_mpn_sublsh1_n_ip1 (struct speed_params *); +double speed_mpn_sublsh2_n_ip1 (struct speed_params *); +double speed_mpn_submul_1 (struct speed_params *); +double speed_mpn_toom2_sqr (struct speed_params *); +double speed_mpn_toom3_sqr (struct speed_params *); +double speed_mpn_toom4_sqr (struct speed_params *); +double speed_mpn_toom6_sqr (struct speed_params *); +double speed_mpn_toom8_sqr (struct speed_params *); +double speed_mpn_toom22_mul (struct speed_params *); +double speed_mpn_toom33_mul (struct speed_params *); +double speed_mpn_toom44_mul (struct speed_params *); +double speed_mpn_toom6h_mul (struct speed_params *); +double speed_mpn_toom8h_mul (struct speed_params *); +double speed_mpn_toom32_mul (struct speed_params *); +double speed_mpn_toom42_mul (struct speed_params *); +double speed_mpn_toom43_mul (struct speed_params *); +double 
speed_mpn_toom63_mul (struct speed_params *); +double speed_mpn_toom32_for_toom43_mul (struct speed_params *); +double speed_mpn_toom43_for_toom32_mul (struct speed_params *); +double speed_mpn_toom32_for_toom53_mul (struct speed_params *); +double speed_mpn_toom53_for_toom32_mul (struct speed_params *); +double speed_mpn_toom42_for_toom53_mul (struct speed_params *); +double speed_mpn_toom53_for_toom42_mul (struct speed_params *); +double speed_mpn_toom43_for_toom54_mul (struct speed_params *); +double speed_mpn_toom54_for_toom43_mul (struct speed_params *); +double speed_mpn_toom42_mulmid (struct speed_params *); +double speed_mpn_mulmod_bnm1 (struct speed_params *); +double speed_mpn_bc_mulmod_bnm1 (struct speed_params *); +double speed_mpn_mulmod_bnm1_rounded (struct speed_params *); +double speed_mpn_sqrmod_bnm1 (struct speed_params *); +double speed_mpn_mulmod_bknp1 (struct speed_params *); +double speed_mpn_sqrmod_bknp1 (struct speed_params *); +double speed_mpn_mulmod_bnp1 (struct speed_params *); +double speed_mpn_sqrmod_bnp1 (struct speed_params *); +double speed_mpn_udiv_qrnnd (struct speed_params *); +double speed_mpn_udiv_qrnnd_r (struct speed_params *); +double speed_mpn_umul_ppmm (struct speed_params *); +double speed_mpn_umul_ppmm_r (struct speed_params *); +double speed_mpn_xnor_n (struct speed_params *); +double speed_mpn_xor_n (struct speed_params *); +double speed_MPN_ZERO (struct speed_params *); + +double speed_mpq_init_clear (struct speed_params *); + +double speed_mpz_add (struct speed_params *); +double speed_mpz_invert (struct speed_params *); +double speed_mpz_bin_uiui (struct speed_params *); +double speed_mpz_bin_ui (struct speed_params *); +double speed_mpz_fac_ui (struct speed_params *); +double speed_mpz_2fac_ui (struct speed_params *); +double speed_mpz_mfac_uiui (struct speed_params *); +double speed_mpz_primorial_ui (struct speed_params *); +double speed_mpz_fib_ui (struct speed_params *); +double speed_mpz_fib2_ui (struct 
speed_params *); +double speed_mpz_init_clear (struct speed_params *); +double speed_mpz_init_realloc_clear (struct speed_params *); +double speed_gmp_primesieve (struct speed_params *); +double speed_mpz_nextprime (struct speed_params *); +double speed_mpz_nextprime_1 (struct speed_params *); +double speed_mpz_prevprime (struct speed_params *); +double speed_mpz_prevprime_1 (struct speed_params *); +double speed_mpz_jacobi (struct speed_params *); +double speed_mpz_lucnum_ui (struct speed_params *); +double speed_mpz_lucnum2_ui (struct speed_params *); +double speed_mpz_mod (struct speed_params *); +double speed_mpz_powm (struct speed_params *); +double speed_mpz_powm_mod (struct speed_params *); +double speed_mpz_powm_redc (struct speed_params *); +double speed_mpz_powm_sec (struct speed_params *); +double speed_mpz_powm_ui (struct speed_params *); +double speed_mpz_urandomb (struct speed_params *); + +double speed_gmp_randseed (struct speed_params *); +double speed_gmp_randseed_ui (struct speed_params *); + +double speed_noop (struct speed_params *); +double speed_noop_wxs (struct speed_params *); +double speed_noop_wxys (struct speed_params *); + +double speed_operator_div (struct speed_params *); +double speed_operator_mod (struct speed_params *); + +double speed_udiv_qrnnd (struct speed_params *); +double speed_udiv_qrnnd_preinv1 (struct speed_params *); +double speed_udiv_qrnnd_preinv2 (struct speed_params *); +double speed_udiv_qrnnd_preinv3 (struct speed_params *); +double speed_udiv_qrnnd_c (struct speed_params *); +double speed_umul_ppmm (struct speed_params *); + +/* Prototypes for other routines */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* low 32-bits in p[0], high 32-bits in p[1] */ +void speed_cyclecounter (unsigned p[2]); + +#if defined (__cplusplus) +} +#endif + +void mftb_function (unsigned p[2]); + +double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]); +int gettimeofday_microseconds_p (void); +int 
getrusage_microseconds_p (void); +int cycles_works_p (void); +long clk_tck (void); +double freq_measure (const char *, double (*)(void)); + +int double_cmp_ptr (const double *, const double *); +void pentium_wbinvd (void); +typedef int (*qsort_function_t) (const void *, const void *); + +void noop (void); +void noop_1 (mp_limb_t); +void noop_wxs (mp_ptr, mp_srcptr, mp_size_t); +void noop_wxys (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); +void mpn_cache_fill (mp_srcptr, mp_size_t); +void mpn_cache_fill_dummy (mp_limb_t); +void speed_cache_fill (struct speed_params *); +void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t); +void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t); + +extern int speed_option_addrs; +extern int speed_option_verbose; +extern int speed_option_cycles_broken; +void speed_option_set (const char *); + +mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t); +mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t); +mp_limb_t mpn_div_qr_1n_pi1_3 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t); +mp_limb_t mpn_div_qr_1n_pi1_4 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t); + +mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); +mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); +mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr); +mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr); + +int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int); +int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int); +int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int); +int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int); + +int mpn_hgcd2_1 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*); +int mpn_hgcd2_2 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*); +int mpn_hgcd2_3 (mp_limb_t, mp_limb_t, mp_limb_t, 
mp_limb_t, struct hgcd_matrix1*); +int mpn_hgcd2_4 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*); +int mpn_hgcd2_5 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*); + +mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t); +mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t); + +mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]); +mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]); + +void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t); +void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t); + +mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); +mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); +mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); +mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); +mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr); +mp_size_t mpn_hgcd_lehmer_itch (mp_size_t); + +int mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr); +mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t); + +mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr); +mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t); + +mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr); +mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t); + +mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t); +mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t); + +mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int); +void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr); + +void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr); +void 
mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr); + +int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int); + + +/* "get" is called repeatedly until it ticks over, just in case on a fast + processor it takes less than a microsecond, though this is probably + unlikely if it's a system call. + + speed_cyclecounter is called on the same side of the "get" for the start + and end measurements. It doesn't matter how long it takes from the "get" + sample to the cycles sample, since that period will cancel out in the + difference calculation (assuming it's the same each time). + + Letting the test run for more than a process time slice is probably only + going to reduce accuracy, especially for getrusage when the cycle counter + is real time, or for gettimeofday if the cycle counter is in fact process + time. Use CLK_TCK/2 as a reasonable stop. + + It'd be desirable to be quite accurate here. The default speed_precision + for a cycle counter is 10000 cycles, so to mix that with getrusage or + gettimeofday the frequency should be at least that accurate. But running + measurements for 10000 microseconds (or more) is too long. Be satisfied + with just a half clock tick (5000 microseconds usually). 
*/ + +#define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec) \ + do { \ + type st1, st, et1, et; \ + unsigned sc[2], ec[2]; \ + long dt, half_tick; \ + double dc, cyc; \ + \ + half_tick = (1000000L / clk_tck()) / 2; \ + \ + get (st1); \ + do { \ + get (st); \ + } while (usec(st) == usec(st1) && sec(st) == sec(st1)); \ + \ + getc (sc); \ + \ + for (;;) \ + { \ + get (et1); \ + do { \ + get (et); \ + } while (usec(et) == usec(et1) && sec(et) == sec(et1)); \ + \ + getc (ec); \ + \ + dc = speed_cyclecounter_diff (ec, sc); \ + \ + /* allow secs to cancel before multiplying */ \ + dt = sec(et) - sec(st); \ + dt = dt * 1000000L + (usec(et) - usec(st)); \ + \ + if (dt >= half_tick) \ + break; \ + } \ + \ + cyc = dt * 1e-6 / dc; \ + \ + if (speed_option_verbose >= 2) \ + printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n", \ + name, dc, dt, cyc); \ + \ + return dt * 1e-6 / dc; \ + \ + } while (0) + + + + +/* The measuring routines use these big macros to save duplication for + similar forms. They also get used for some automatically generated + measuring of new implementations of functions. + + Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a + function pointer is considered undesirable since it's not the way a + normal application will be calling, and some processors might do + different things with an indirect call, like not branch predicting, or + doing a full pipe flush. At least some of the "functions" measured are + actually macros too. + + The net effect is to bloat the object code, possibly in a big way, but + only what's being measured is being run, so that doesn't matter. + + The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or + ATTRIBUTE_CONST on the called functions. Adding a cast to a non-pure + function pointer doesn't work in gcc 3.2. Using an actual non-pure + function pointer variable works, but stands a real risk of a + non-optimizing compiler generating unnecessary overheads in the call. 
+ Currently the best idea is not to use those attributes for a timing + program build. __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and + gmp-impl.h to omit them from routines there. */ + +#define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0; + +/* For mpn_copy or similar. */ +#define SPEED_ROUTINE_MPN_COPY_CALL(call) \ + { \ + mp_ptr wp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 0); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } +#define SPEED_ROUTINE_MPN_COPY(function) \ + SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size)) + +#define SPEED_ROUTINE_MPN_TABSELECT(function) \ + { \ + mp_ptr xp, wp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 0); \ + \ + if (s->r == 0) \ + s->r = s->size; /* default to a quadratic shape */ \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (xp, s->size * s->r, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + speed_operand_src (s, xp, s->size * s->r); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, xp, s->size, s->r, (s->r) / 2); \ + while (--i != 0); \ + t = speed_endtime () / s->r; \ + \ + TMP_FREE; \ + return t; \ + } + + +#define SPEED_ROUTINE_MPN_COPYC(function) \ + { \ + mp_ptr wp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 0); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, s->xp, s->size, 0); \ + 
while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +/* s->size is still in limbs, and it's limbs which are copied, but + "function" takes a size in bytes not limbs. */ +#define SPEED_ROUTINE_MPN_COPY_BYTES(function) \ + { \ + mp_ptr wp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 0); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, s->xp, s->size * GMP_LIMB_BYTES); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + + +/* For mpn_add_n, mpn_sub_n, or similar. */ +#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \ + { \ + mp_ptr wp; \ + mp_ptr xp, yp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + xp = s->xp; \ + yp = s->yp; \ + \ + if (s->r == 0) ; \ + else if (s->r == 1) { xp = wp; } \ + else if (s->r == 2) { yp = wp; } \ + else if (s->r == 3) { xp = wp; yp = wp; } \ + else if (s->r == 4) { yp = xp; } \ + else { \ + TMP_FREE; \ + return -1.0; \ + } \ + \ + /* initialize wp if operand overlap */ \ + if (xp == wp || yp == wp) \ + MPN_COPY (wp, s->xp, s->size); \ + \ + speed_operand_src (s, xp, s->size); \ + speed_operand_src (s, yp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + + +/* For mpn_aors_errK_n, where 1 <= K <= 3. 
*/ +#define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K) \ + { \ + mp_ptr wp; \ + mp_ptr xp, yp; \ + mp_ptr zp[K]; \ + mp_limb_t ep[2*K]; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + /* (don't have a mechanism to specify zp alignments) */ \ + for (i = 0; i < K; i++) \ + SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0); \ + \ + xp = s->xp; \ + yp = s->yp; \ + \ + if (s->r == 0) ; \ + else if (s->r == 1) { xp = wp; } \ + else if (s->r == 2) { yp = wp; } \ + else if (s->r == 3) { xp = wp; yp = wp; } \ + else if (s->r == 4) { yp = xp; } \ + else { \ + TMP_FREE; \ + return -1.0; \ + } \ + \ + /* initialize wp if operand overlap */ \ + if (xp == wp || yp == wp) \ + MPN_COPY (wp, s->xp, s->size); \ + \ + speed_operand_src (s, xp, s->size); \ + speed_operand_src (s, yp, s->size); \ + for (i = 0; i < K; i++) \ + speed_operand_src (s, zp[i], s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function) \ + SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1) + +#define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function) \ + SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2) + +#define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function) \ + SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3) + + +/* For mpn_add_n, mpn_sub_n, or similar. 
*/ +#define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call) \ + { \ + mp_ptr ap, sp; \ + mp_ptr xp, yp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp); \ + \ + xp = s->xp; \ + yp = s->yp; \ + \ + if ((s->r & 1) != 0) { xp = ap; } \ + if ((s->r & 2) != 0) { yp = ap; } \ + if ((s->r & 4) != 0) { xp = sp; } \ + if ((s->r & 8) != 0) { yp = sp; } \ + if ((s->r & 3) == 3 || (s->r & 12) == 12) \ + { \ + TMP_FREE; \ + return -1.0; \ + } \ + \ + /* initialize ap if operand overlap */ \ + if (xp == ap || yp == ap) \ + MPN_COPY (ap, s->xp, s->size); \ + /* initialize sp if operand overlap */ \ + if (xp == sp || yp == sp) \ + MPN_COPY (sp, s->xp, s->size); \ + \ + speed_operand_src (s, xp, s->size); \ + speed_operand_src (s, yp, s->size); \ + speed_operand_dst (s, ap, s->size); \ + speed_operand_dst (s, sp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_BINARY_N(function) \ + SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size)) + +#define SPEED_ROUTINE_MPN_BINARY_NC(function) \ + SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0)) + + +/* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. 
*/
/* "call" may refer to the destination wp (s->size limbs) and to s; the
   source registered for cache priming is s->xp.  */
#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call)                            \
  {                                                                     \
    mp_ptr    wp;                                                       \
    unsigned  i;                                                        \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_dst (s, wp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      call;                                                             \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }

#define SPEED_ROUTINE_MPN_UNARY_1(function)                             \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_UNARY_1C(function)                            \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))

/* In-place form: wp is both source and destination.  wp is seeded from
   s->xp before the timed loop so the routine never reads uninitialized
   limbs; the copy is outside the timed region.  */
#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function)                     \
  {                                                                     \
    mp_ptr    wp;                                                       \
    unsigned  i;                                                        \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
    MPN_COPY (wp, s->xp, s->size);                                      \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_dst (s, wp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      (*function) (wp, wp, s->size, s->r);                              \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }

#define SPEED_ROUTINE_MPN_DIVEXACT_1(function)                          \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_BDIV_Q_1(function)                            \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

/* "call" may additionally refer to shift (count of trailing zero bits of
   s->r) and dinv (binvert_limb of s->r >> shift).  s->r must be
   non-zero.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call)                       \
  {                                                                     \
    unsigned   shift;                                                   \
    mp_limb_t  dinv;                                                    \
                                                                        \
    SPEED_RESTRICT_COND (s->size > 0);                                  \
    SPEED_RESTRICT_COND (s->r != 0);                                    \
                                                                        \
    count_trailing_zeros (shift, s->r);                                 \
    binvert_limb (dinv, s->r >> shift);                                 \
                                                                        \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call);                              \
  }
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function)                        \
  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL                                   \
    ((*function) (wp, s->xp, s->size, s->r, dinv, shift))

#define SPEED_ROUTINE_MPN_BDIV_DBM1C(function)                          \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))

#define SPEED_ROUTINE_MPN_DIVREM_1(function)                            \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1C(function)                           \
  SPEED_ROUTINE_MPN_UNARY_1_CALL                                        \
    ((*function) (wp, 0, s->xp, s->size, s->r, 0))

#define SPEED_ROUTINE_MPN_DIVREM_1F(function)                           \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1CF(function)                          \
  SPEED_ROUTINE_MPN_UNARY_1_CALL                                        \
    ((*function) (wp, s->size, s->xp, 0, s->r, 0))


/* "call" may additionally refer to shift (count of leading zero bits of
   s->r) and dinv (invert_limb of s->r << shift).  s->r must be
   non-zero.  */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call)                    \
  {                                                                     \
    unsigned   shift;                                                   \
    mp_limb_t  dinv;                                                    \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
    SPEED_RESTRICT_COND (s->r != 0);                                    \
                                                                        \
    count_leading_zeros (shift, s->r);                                  \
    invert_limb (dinv, s->r << shift);                                  \
                                                                        \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call);                              \
  }

#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function)                     \
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL                                \
    ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))

/* s->size limbs worth of fraction part */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function)                    \
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL                                \
    ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))


/* s->r is duplicated to form the multiplier, defaulting to
   MP_BASES_BIG_BASE_10.  Not sure if that's particularly useful, but at
   least it provides some control.  */
#define SPEED_ROUTINE_MPN_UNARY_N(function,N)                           \
  {                                                                     \
    mp_ptr     wp;                                                      \
    mp_size_t  wn;                                                      \
    unsigned   i;                                                       \
    double     t;                                                       \
    mp_limb_t  yp[N];                                                   \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= N);                                 \
                                                                        \
    TMP_MARK;                                                           \
    wn = s->size + N-1;                                                 \
    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);                        \
    for (i = 0; i < N; i++)                                             \
      yp[i] = (s->r != 0 ? 
s->r : MP_BASES_BIG_BASE_10); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, yp, (mp_size_t) N); \ + speed_operand_dst (s, wp, wn); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, s->xp, s->size, yp); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_UNARY_2(function) \ + SPEED_ROUTINE_MPN_UNARY_N (function, 2) +#define SPEED_ROUTINE_MPN_UNARY_3(function) \ + SPEED_ROUTINE_MPN_UNARY_N (function, 3) +#define SPEED_ROUTINE_MPN_UNARY_4(function) \ + SPEED_ROUTINE_MPN_UNARY_N (function, 4) +#define SPEED_ROUTINE_MPN_UNARY_5(function) \ + SPEED_ROUTINE_MPN_UNARY_N (function, 5) +#define SPEED_ROUTINE_MPN_UNARY_6(function) \ + SPEED_ROUTINE_MPN_UNARY_N (function, 6) +#define SPEED_ROUTINE_MPN_UNARY_7(function) \ + SPEED_ROUTINE_MPN_UNARY_N (function, 7) +#define SPEED_ROUTINE_MPN_UNARY_8(function) \ + SPEED_ROUTINE_MPN_UNARY_N (function, 8) + +#define SPEED_ROUTINE_MPN_ADDADDMUL1_MSB0(function) \ + { \ + mp_ptr wp; \ + unsigned i; \ + double t; \ + mp_limb_t r; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + r = s->r != 0 ? s->r : MP_BASES_BIG_BASE_10; \ + r &= ~GMP_NUMB_HIGHBIT; \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, s->xp, s->yp, s->size, r, r); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +/* For mpn_mul, mpn_mul_basecase, xsize=r, ysize=s->size. */ +#define SPEED_ROUTINE_MPN_MUL(function) \ + { \ + mp_ptr wp; \ + mp_size_t size1; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + size1 = (s->r == 0 ? 
s->size : s->r); \ + if (size1 < 0) size1 = -size1 - s->size; \ + \ + SPEED_RESTRICT_COND (size1 >= 1); \ + SPEED_RESTRICT_COND (s->size >= size1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, size1); \ + speed_operand_dst (s, wp, size1 + s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, s->xp, s->size, s->yp, size1); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + + +#define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \ + { \ + mp_ptr wp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, 2*s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_MUL_N(function) \ + SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size)); + +#define SPEED_ROUTINE_MPN_MULLO_N_CALL(call) \ + { \ + mp_ptr wp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_MULLO_N(function) \ + SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size)); + +#define SPEED_ROUTINE_MPN_MULLO_BASECASE(function) \ + SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, 
s->yp, s->size)); + +#define SPEED_ROUTINE_MPN_SQRLO(function) \ + { \ + mp_ptr wp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, s->xp, s->size); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +/* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size. */ +#define SPEED_ROUTINE_MPN_MULMID(function) \ + { \ + mp_ptr wp, xp; \ + mp_size_t size1; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + size1 = (s->r == 0 ? (2 * s->size - 1) : s->r); \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + SPEED_RESTRICT_COND (size1 >= s->size); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ + \ + speed_operand_src (s, xp, size1); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, size1 - s->size + 3); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, xp, size1, s->yp, s->size); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_MULMID_N(function) \ + { \ + mp_ptr wp, xp; \ + mp_size_t size1; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + size1 = 2 * s->size - 1; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ + \ + speed_operand_src (s, xp, size1); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, size1 - s->size + 3); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, xp, s->yp, s->size); \ + while (--i != 0); \ + t = 
speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_TOOM42_MULMID(function) \ + { \ + mp_ptr wp, xp, scratch; \ + mp_size_t size1, scratch_size; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + size1 = 2 * s->size - 1; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ + scratch_size = mpn_toom42_mulmid_itch (s->size); \ + SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0); \ + \ + speed_operand_src (s, xp, size1); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, size1 - s->size + 3); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, xp, s->yp, s->size, scratch); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call) \ + { \ + mp_ptr wp, tp; \ + unsigned i; \ + double t; \ + mp_size_t itch; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, 2 * s->size); \ + speed_operand_dst (s, tp, itch); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } +#define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function) \ + { \ + mp_ptr wp, tp; \ + unsigned i; \ + double t; \ + mp_size_t size, itch; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + size = mpn_mulmod_bnm1_next_size (s->size); \ + itch = mpn_mulmod_bnm1_itch (size, size, size); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp); \ + 
SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, size); \ + speed_operand_dst (s, tp, itch); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, size, s->xp, s->size, s->yp, s->size, tp); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#ifndef MOD_BKNP1_USE11 +#define MOD_BKNP1_USE11 0 +#endif +#ifndef MOD_BKNP1_ONLY3 +#define MOD_BKNP1_ONLY3 0 +#endif + +#define SPEED_ROUTINE_MPN_MULMOD_BNP1_CALL(call,use_r) \ + { \ + mp_ptr wp, tp; \ + unsigned i, k; \ + double t; \ + mp_size_t itch, nk; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + SPEED_RESTRICT_COND (!use_r || (s->r == 0) || \ + (s->r == 3) || (s->r == 5) || (s->r == 7) || \ + (s->r == 13) || (s->r == 17) || \ + ((MOD_BKNP1_USE11) && (s->r == 11))); \ + \ + if (!use_r || (s->r < 2)) \ + { \ + if (s->size % 3 == 0) {nk = s->size / (k = 3);} \ + else if (s->size % 5 == 0) {nk = s->size / (k = 5);} \ + else if (s->size % 7 == 0) {nk = s->size / (k = 7);} \ + else if (s->size % 11 == 0) {nk = s->size / (k = 11);} \ + else if (s->size % 13 == 0) {nk = s->size / (k = 13);} \ + else if (s->size % 17 == 0) {nk = s->size / (k = 17);} \ + else nk = s->size / (k = 1); \ + } \ + else nk = s->size / (k = s->r); \ + \ + if (MOD_BKNP1_ONLY3) \ + k = 3; \ + SPEED_RESTRICT_COND ((!use_r || (k > 2)) && (s->size == k * nk)); \ + SPEED_RESTRICT_COND ((GMP_NUMB_MAX % k == 0) || (nk % 3 != 0) || \ + ((MOD_BKNP1_USE11) && (k == 11))); \ + \ + itch = mpn_mulmod_bknp1_itch (s->size); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size + 2, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \ + \ + s->xp [s->size] &= 1; \ + s->yp [s->size] &= 1; \ + speed_operand_src (s, s->xp, s->size + 1); \ + speed_operand_src (s, s->yp, s->size + 1); \ + speed_operand_dst (s, wp, 2 * s->size + 2); \ + 
speed_operand_dst (s, tp, itch); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize) \ + { \ + mp_ptr wp, tspace; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= minsize); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, 2*s->size); \ + speed_operand_dst (s, tspace, tsize); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ + mpn_toom22_mul_itch (s->size, s->size), \ + MPN_TOOM22_MUL_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ + mpn_toom33_mul_itch (s->size, s->size), \ + MPN_TOOM33_MUL_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ + mpn_toom44_mul_itch (s->size, s->size), \ + MPN_TOOM44_MUL_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ + mpn_toom6h_mul_itch (s->size, s->size), \ + MPN_TOOM6H_MUL_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ + mpn_toom8h_mul_itch (s->size, s->size), \ + MPN_TOOM8H_MUL_MINSIZE) + +#define 
SPEED_ROUTINE_MPN_TOOM32_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace), \ + mpn_toom32_mul_itch (s->size, 2*s->size/3), \ + MPN_TOOM32_MUL_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM42_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \ + mpn_toom42_mul_itch (s->size, s->size/2), \ + MPN_TOOM42_MUL_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM43_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace), \ + mpn_toom43_mul_itch (s->size, s->size*3/4), \ + MPN_TOOM43_MUL_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM63_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \ + mpn_toom63_mul_itch (s->size, s->size/2), \ + MPN_TOOM63_MUL_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \ + mpn_toom32_mul_itch (s->size, 17*s->size/24), \ + MPN_TOOM32_MUL_MINSIZE) +#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \ + mpn_toom43_mul_itch (s->size, 17*s->size/24), \ + MPN_TOOM43_MUL_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \ + mpn_toom32_mul_itch (s->size, 19*s->size/30), \ + MPN_TOOM32_MUL_MINSIZE) +#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \ + mpn_toom53_mul_itch (s->size, 19*s->size/30), \ + MPN_TOOM53_MUL_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), 
\ + mpn_toom42_mul_itch (s->size, 11*s->size/20), \ + MPN_TOOM42_MUL_MINSIZE) +#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \ + mpn_toom53_mul_itch (s->size, 11*s->size/20), \ + MPN_TOOM53_MUL_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \ + mpn_toom42_mul_itch (s->size, 5*s->size/6), \ + MPN_TOOM54_MUL_MINSIZE) +#define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \ + mpn_toom54_mul_itch (s->size, 5*s->size/6), \ + MPN_TOOM54_MUL_MINSIZE) + + + +#define SPEED_ROUTINE_MPN_SQR_CALL(call) \ + { \ + mp_ptr wp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, 2*s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_SQR(function) \ + SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size)) + +#define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call) \ + { \ + mp_ptr wp, tp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, tp, 2 * s->size); \ + speed_operand_dst (s, wp, 2 * s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime () / 2; \ + \ + TMP_FREE; \ + return t; \ 
+ } + +#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize) \ + { \ + mp_ptr wp, tspace; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= minsize); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, 2*s->size); \ + speed_operand_dst (s, tspace, tsize); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_TOOM2_SQR(function) \ + SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ + mpn_toom2_sqr_itch (s->size), \ + MPN_TOOM2_SQR_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM3_SQR(function) \ + SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ + mpn_toom3_sqr_itch (s->size), \ + MPN_TOOM3_SQR_MINSIZE) + + +#define SPEED_ROUTINE_MPN_TOOM4_SQR(function) \ + SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ + mpn_toom4_sqr_itch (s->size), \ + MPN_TOOM4_SQR_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM6_SQR(function) \ + SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ + mpn_toom6_sqr_itch (s->size), \ + MPN_TOOM6_SQR_MINSIZE) + +#define SPEED_ROUTINE_MPN_TOOM8_SQR(function) \ + SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ + mpn_toom8_sqr_itch (s->size), \ + MPN_TOOM8_SQR_MINSIZE) + +#define SPEED_ROUTINE_MPN_MOD_CALL(call) \ + { \ + unsigned i; \ + \ + SPEED_RESTRICT_COND (s->size >= 0); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + \ + return speed_endtime (); \ + } + +#define SPEED_ROUTINE_MPN_MOD_1(function) \ + SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r)) + +#define 
SPEED_ROUTINE_MPN_MOD_1C(function) \ + SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0))) + +#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function) \ + SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r)); + +#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function) \ + SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0))); + +#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function) \ + SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size)) + +#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function) \ + { \ + unsigned i; \ + mp_limb_t inv; \ + \ + SPEED_RESTRICT_COND (s->size >= 0); \ + SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT); \ + \ + invert_limb (inv, s->r); \ + speed_operand_src (s, s->xp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + (*function) (s->xp, s->size, s->r, inv); \ + while (--i != 0); \ + \ + return speed_endtime (); \ + } + +#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc) \ + { \ + unsigned i; \ + mp_limb_t inv[4]; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + mpn_mod_1_1p_cps (inv, s->r); \ + speed_operand_src (s, s->xp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + pfunc (inv, s->r); \ + function (s->xp, s->size, s->r << inv[1], inv); \ + } while (--i != 0); \ + \ + return speed_endtime (); \ + } +#define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N) \ + { \ + unsigned i; \ + mp_limb_t inv[N+3]; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + pfunc (inv, s->r); \ + function (s->xp, s->size, s->r, inv); \ + } while (--i != 0); \ + \ + return speed_endtime (); \ + } + + +/* A division of 2*s->size by s->size limbs */ + +#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call) \ + { \ + unsigned i; \ + mp_ptr a, d, q, r; \ + double t; \ + gmp_pi1_t 
dinv; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (r, s->size, s->align_wp2); \ + \ + MPN_COPY (a, s->xp, s->size); \ + MPN_COPY (a+s->size, s->xp, s->size); \ + \ + MPN_COPY (d, s->yp, s->size); \ + \ + /* normalize the data */ \ + d[s->size-1] |= GMP_NUMB_HIGHBIT; \ + a[2*s->size-1] = d[s->size-1] - 1; \ + \ + invert_pi1 (dinv, d[s->size-1], d[s->size-2]); \ + \ + speed_operand_src (s, a, 2*s->size); \ + speed_operand_src (s, d, s->size); \ + speed_operand_dst (s, q, s->size+1); \ + speed_operand_dst (s, r, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + + +/* A remainder 2*s->size by s->size limbs */ + +#define SPEED_ROUTINE_MPZ_MOD(function) \ + { \ + unsigned i; \ + mpz_t a, d, r; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + mpz_init_set_n (d, s->yp, s->size); \ + \ + /* high part less than d, low part a duplicate copied in */ \ + mpz_init_set_n (a, s->xp, s->size); \ + mpz_mod (a, a, d); \ + mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size); \ + MPN_COPY (PTR(a), s->xp, s->size); \ + \ + mpz_init (r); \ + \ + speed_operand_src (s, PTR(a), SIZ(a)); \ + speed_operand_src (s, PTR(d), SIZ(d)); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (r, a, d); \ + while (--i != 0); \ + return speed_endtime (); \ + } + +#define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN) \ + { \ + unsigned i; \ + mp_ptr dp, tp, ap, qp; \ + gmp_pi1_t inv; \ + double t; \ + mp_size_t size1; \ + TMP_DECL; \ + \ + size1 = (s->r == 0 ? 
2 * s->size : s->r); \ + \ + SPEED_RESTRICT_COND (s->size >= DMIN); \ + SPEED_RESTRICT_COND (size1 - s->size >= QMIN); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2); \ + \ + /* we don't fill in dividend completely when size1 > s->size */ \ + MPN_COPY (ap, s->xp, s->size); \ + MPN_COPY (ap + size1 - s->size, s->xp, s->size); \ + \ + MPN_COPY (dp, s->yp, s->size); \ + \ + /* normalize the data */ \ + dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ + ap[size1 - 1] = dp[s->size - 1] - 1; \ + \ + invert_pi1 (inv, dp[s->size-1], dp[s->size-2]); \ + \ + speed_operand_src (s, ap, size1); \ + speed_operand_dst (s, tp, size1); \ + speed_operand_src (s, dp, s->size); \ + speed_operand_dst (s, qp, size1 - s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + MPN_COPY (tp, ap, size1); \ + function (qp, tp, size1, dp, s->size, INV); \ + } while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } +#define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn) \ + { \ + unsigned i; \ + mp_ptr dp, tp, qp, scratch; \ + double t; \ + mp_size_t itch; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + itch = itchfn (2 * s->size, s->size, 0); \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ + \ + MPN_COPY (tp, s->xp, s->size); \ + MPN_COPY (tp+s->size, s->xp, s->size); \ + \ + /* normalize the data */ \ + dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ + tp[2*s->size-1] = dp[s->size-1] - 1; \ + \ + speed_operand_dst (s, qp, s->size); \ + speed_operand_src (s, tp, 2 * s->size); \ + speed_operand_src (s, dp, s->size); \ + speed_operand_dst (s, scratch, itch); \ + 
speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + function (qp, tp, 2 * s->size, dp, s->size, scratch); \ + } while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } +#define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn) \ + { \ + unsigned i; \ + mp_ptr dp, tp, qp, rp, scratch; \ + double t; \ + mp_size_t size1, itch; \ + TMP_DECL; \ + \ + size1 = (s->r == 0 ? 2 * s->size : s->r); \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + SPEED_RESTRICT_COND (size1 >= s->size); \ + \ + itch = itchfn (size1, s->size, 0); \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ + SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \ + \ + /* we don't fill in dividend completely when size1 > s->size */ \ + MPN_COPY (tp, s->xp, s->size); \ + MPN_COPY (tp + size1 - s->size, s->xp, s->size); \ + \ + MPN_COPY (dp, s->yp, s->size); \ + \ + /* normalize the data */ \ + dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ + tp[size1 - 1] = dp[s->size - 1] - 1; \ + \ + speed_operand_dst (s, qp, size1 - s->size); \ + speed_operand_dst (s, rp, s->size); \ + speed_operand_src (s, tp, size1); \ + speed_operand_src (s, dp, s->size); \ + speed_operand_dst (s, scratch, itch); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + function (qp, rp, tp, size1, dp, s->size, scratch); \ + } while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } +#define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn) \ + { \ + unsigned i; \ + mp_ptr dp, tp, qp, rp, ip, scratch, tmp; \ + double t; \ + mp_size_t size1, itch; \ + TMP_DECL; \ + \ + size1 = (s->r == 0 ? 
2 * s->size : s->r); \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + SPEED_RESTRICT_COND (size1 >= s->size); \ + \ + itch = itchfn (size1, s->size, s->size); \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ + SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \ + SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? */ \ + \ + /* we don't fill in dividend completely when size1 > s->size */ \ + MPN_COPY (tp, s->xp, s->size); \ + MPN_COPY (tp + size1 - s->size, s->xp, s->size); \ + \ + MPN_COPY (dp, s->yp, s->size); \ + \ + /* normalize the data */ \ + dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ + tp[size1 - 1] = dp[s->size-1] - 1; \ + \ + tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size)); \ + mpn_invert (ip, dp, s->size, tmp); \ + \ + speed_operand_dst (s, qp, size1 - s->size); \ + speed_operand_dst (s, rp, s->size); \ + speed_operand_src (s, tp, size1); \ + speed_operand_src (s, dp, s->size); \ + speed_operand_src (s, ip, s->size); \ + speed_operand_dst (s, scratch, itch); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch); \ + } while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function) \ + { \ + unsigned i; \ + mp_ptr dp, tp, ap, qp; \ + mp_limb_t inv; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \ + \ + MPN_COPY (ap, s->xp, s->size); \ + MPN_COPY (ap+s->size, s->xp, s->size); \ + \ + /* divisor must be odd */ \ + MPN_COPY 
(dp, s->yp, s->size); \ + dp[0] |= 1; \ + binvert_limb (inv, dp[0]); \ + inv = -inv; \ + \ + speed_operand_src (s, ap, 2*s->size); \ + speed_operand_dst (s, tp, 2*s->size); \ + speed_operand_src (s, dp, s->size); \ + speed_operand_dst (s, qp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + MPN_COPY (tp, ap, 2*s->size); \ + function (qp, tp, 2*s->size, dp, s->size, inv); \ + } while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } +#define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function) \ + { \ + unsigned i; \ + mp_ptr dp, tp, qp; \ + mp_limb_t inv; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2); \ + \ + /* divisor must be odd */ \ + MPN_COPY (dp, s->yp, s->size); \ + dp[0] |= 1; \ + binvert_limb (inv, dp[0]); \ + inv = -inv; \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, tp, s->size); \ + speed_operand_src (s, dp, s->size); \ + speed_operand_dst (s, qp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + MPN_COPY (tp, s->xp, s->size); \ + function (qp, tp, s->size, dp, s->size, inv); \ + } while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } +#define SPEED_ROUTINE_MPN_PI1_BDIV_R(function) \ + { \ + unsigned i; \ + mp_ptr dp, tp, ap; \ + mp_limb_t inv; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \ + \ + MPN_COPY (ap, s->xp, s->size); \ + MPN_COPY (ap+s->size, s->xp, s->size); \ + \ + /* divisor must be odd */ \ + MPN_COPY (dp, s->yp, s->size); \ + dp[0] |= 1; \ + binvert_limb (inv, dp[0]); \ + inv 
= -inv; \ + \ + speed_operand_src (s, ap, 2*s->size); \ + speed_operand_dst (s, tp, 2*s->size); \ + speed_operand_src (s, dp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + MPN_COPY (tp, ap, 2*s->size); \ + function (tp, 2*s->size, dp, s->size, inv); \ + } while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } +#define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn) \ + { \ + unsigned i; \ + mp_ptr dp, qp, scratch; \ + double t; \ + mp_size_t itch; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + itch = itchfn (s->size, s->size); \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ + \ + /* divisor must be odd */ \ + MPN_COPY (dp, s->yp, s->size); \ + dp[0] |= 1; \ + \ + speed_operand_dst (s, qp, s->size); \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, dp, s->size); \ + speed_operand_dst (s, scratch, itch); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + function (qp, s->xp, s->size, dp, s->size, scratch); \ + } while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } +#define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn) \ + { \ + unsigned i; \ + mp_ptr dp, tp, qp, rp, scratch; \ + double t; \ + mp_size_t itch; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + itch = itchfn (2 * s->size, s->size); \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ + SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? 
*/ \ + \ + MPN_COPY (tp, s->xp, s->size); \ + MPN_COPY (tp+s->size, s->xp, s->size); \ + \ + /* divisor must be odd */ \ + MPN_COPY (dp, s->yp, s->size); \ + dp[0] |= 1; \ + \ + speed_operand_dst (s, qp, s->size); \ + speed_operand_dst (s, rp, s->size); \ + speed_operand_src (s, tp, 2 * s->size); \ + speed_operand_src (s, dp, s->size); \ + speed_operand_dst (s, scratch, itch); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + function (qp, rp, tp, 2 * s->size, dp, s->size, scratch); \ + } while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_BROOT(function) \ + { \ + SPEED_RESTRICT_COND (s->r & 1); \ + s->xp[0] |= 1; \ + SPEED_ROUTINE_MPN_UNARY_1_CALL \ + ((*function) (wp, s->xp, s->size, s->r)); \ + } + +#define SPEED_ROUTINE_MPN_BROOTINV(function, itch) \ + { \ + mp_ptr wp, tp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + TMP_MARK; \ + SPEED_RESTRICT_COND (s->size >= 1); \ + SPEED_RESTRICT_COND (s->r & 1); \ + wp = TMP_ALLOC_LIMBS (s->size); \ + tp = TMP_ALLOC_LIMBS ( (itch)); \ + s->xp[0] |= 1; \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + (*function) (wp, s->xp, s->size, s->r, tp); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_INVERT(function,itchfn) \ + { \ + long i; \ + mp_ptr up, tp, ip; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ + \ + MPN_COPY (up, s->xp, s->size); \ + \ + /* normalize the data */ \ + up[s->size-1] |= GMP_NUMB_HIGHBIT; \ + \ + speed_operand_src (s, up, s->size); \ + speed_operand_dst (s, tp, s->size); \ + speed_operand_dst (s, ip, s->size); \ + 
speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (ip, up, s->size, tp); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn) \ + { \ + long i; \ + mp_ptr up, tp, ip; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ + \ + MPN_COPY (up, s->xp, s->size); \ + \ + /* normalize the data */ \ + up[s->size-1] |= GMP_NUMB_HIGHBIT; \ + \ + speed_operand_src (s, up, s->size); \ + speed_operand_dst (s, tp, s->size); \ + speed_operand_dst (s, ip, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (ip, up, s->size, tp); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn) \ + { \ + long i; \ + mp_ptr up, tp, ip; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 3); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ + \ + MPN_COPY (up, s->xp, s->size); \ + \ + /* normalize the data */ \ + up[s->size-1] |= GMP_NUMB_HIGHBIT; \ + \ + speed_operand_src (s, up, s->size); \ + speed_operand_dst (s, tp, s->size); \ + speed_operand_dst (s, ip, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (ip, up, s->size, tp); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_BINVERT(function,itchfn) \ + { \ + long i; \ + mp_ptr up, tp, ip; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS 
(ip, s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ + \ + MPN_COPY (up, s->xp, s->size); \ + \ + /* normalize the data */ \ + up[0] |= 1; \ + \ + speed_operand_src (s, up, s->size); \ + speed_operand_dst (s, tp, s->size); \ + speed_operand_dst (s, ip, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (ip, up, s->size, tp); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_SEC_INVERT(function,itchfn) \ + { \ + long i; \ + mp_ptr up, mp, tp, ip; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ + \ + speed_operand_src (s, up, s->size); \ + speed_operand_dst (s, tp, s->size); \ + speed_operand_dst (s, ip, s->size); \ + speed_cache_fill (s); \ + \ + MPN_COPY (mp, s->yp, s->size); \ + /* Must be odd */ \ + mp[0] |= 1; \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + MPN_COPY (up, s->xp, s->size); \ + function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_REDC_1(function) \ + { \ + unsigned i; \ + mp_ptr cp, mp, tp, ap; \ + mp_limb_t inv; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ + \ + MPN_COPY (ap, s->xp, s->size); \ + MPN_COPY (ap+s->size, s->xp, s->size); \ + \ + /* modulus must be odd */ \ + 
MPN_COPY (mp, s->yp, s->size); \ + mp[0] |= 1; \ + binvert_limb (inv, mp[0]); \ + inv = -inv; \ + \ + speed_operand_src (s, ap, 2*s->size+1); \ + speed_operand_dst (s, tp, 2*s->size+1); \ + speed_operand_src (s, mp, s->size); \ + speed_operand_dst (s, cp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + MPN_COPY (tp, ap, 2*s->size); \ + function (cp, tp, mp, s->size, inv); \ + } while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } +#define SPEED_ROUTINE_REDC_2(function) \ + { \ + unsigned i; \ + mp_ptr cp, mp, tp, ap; \ + mp_limb_t invp[2]; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ + \ + MPN_COPY (ap, s->xp, s->size); \ + MPN_COPY (ap+s->size, s->xp, s->size); \ + \ + /* modulus must be odd */ \ + MPN_COPY (mp, s->yp, s->size); \ + mp[0] |= 1; \ + mpn_binvert (invp, mp, 2, tp); \ + invp[0] = -invp[0]; invp[1] = ~invp[1]; \ + \ + speed_operand_src (s, ap, 2*s->size+1); \ + speed_operand_dst (s, tp, 2*s->size+1); \ + speed_operand_src (s, mp, s->size); \ + speed_operand_dst (s, cp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + MPN_COPY (tp, ap, 2*s->size); \ + function (cp, tp, mp, s->size, invp); \ + } while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } +#define SPEED_ROUTINE_REDC_N(function) \ + { \ + unsigned i; \ + mp_ptr cp, mp, tp, ap, invp; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size > 8); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, 
s->align_wp2); \ + SPEED_TMP_ALLOC_LIMBS (invp, s->size, s->align_wp2); /* align? */ \ + \ + MPN_COPY (ap, s->xp, s->size); \ + MPN_COPY (ap+s->size, s->xp, s->size); \ + \ + /* modulus must be odd */ \ + MPN_COPY (mp, s->yp, s->size); \ + mp[0] |= 1; \ + mpn_binvert (invp, mp, s->size, tp); \ + \ + speed_operand_src (s, ap, 2*s->size+1); \ + speed_operand_dst (s, tp, 2*s->size+1); \ + speed_operand_src (s, mp, s->size); \ + speed_operand_dst (s, cp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do { \ + MPN_COPY (tp, ap, 2*s->size); \ + function (cp, tp, mp, s->size, invp); \ + } while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + + +#define SPEED_ROUTINE_MPN_POPCOUNT(function) \ + { \ + unsigned i; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (s->xp, s->size); \ + while (--i != 0); \ + \ + return speed_endtime (); \ + } + +#define SPEED_ROUTINE_MPN_HAMDIST(function) \ + { \ + unsigned i; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (s->xp, s->yp, s->size); \ + while (--i != 0); \ + \ + return speed_endtime (); \ + } + + +#define SPEED_ROUTINE_MPZ_UI(function) \ + { \ + mpz_t z; \ + unsigned i; \ + double t; \ + \ + SPEED_RESTRICT_COND (s->size >= 0); \ + \ + mpz_init (z); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (z, s->size); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + mpz_clear (z); \ + return t; \ + } + +#define SPEED_ROUTINE_MPZ_FAC_UI(function) SPEED_ROUTINE_MPZ_UI(function) +#define SPEED_ROUTINE_MPZ_FIB_UI(function) SPEED_ROUTINE_MPZ_UI(function) +#define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function) + + +#define 
SPEED_ROUTINE_MPZ_UNARY_1(function) \ + { \ + mpz_t z, a; \ + unsigned i; \ + mp_limb_t ls; \ + double t; \ + \ + SPEED_RESTRICT_COND (s->size >= 0); \ + \ + mpz_init (z); \ + ls = s->size; \ + mpz_roinit_n (a, &ls, s->size != 0); \ + \ + if (s->r < 2) \ + { \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (z, a); \ + while (--i != 0); \ + t = speed_endtime (); \ + } \ + else \ + { \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + int j = s->r; \ + mpz_set (z, a); \ + do \ + { \ + function (z, z); \ + } \ + while (--j != 0); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + s->time_divisor = s->r; \ + } \ + \ + mpz_clear (z); \ + return t; \ + } + + +#define SPEED_ROUTINE_MPZ_2_UI(function) \ + { \ + mpz_t z, z2; \ + unsigned i; \ + double t; \ + \ + SPEED_RESTRICT_COND (s->size >= 0); \ + \ + mpz_init (z); \ + mpz_init (z2); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (z, z2, s->size); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + mpz_clear (z); \ + mpz_clear (z2); \ + return t; \ + } + +#define SPEED_ROUTINE_MPZ_FIB2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function) +#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function) + + +#define SPEED_ROUTINE_MPN_FIB2_UI(function) \ + { \ + mp_ptr fp, f1p; \ + mp_size_t alloc; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 0); \ + \ + TMP_MARK; \ + alloc = MPN_FIB2_SIZE (s->size); \ + SPEED_TMP_ALLOC_LIMBS (fp, alloc, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (fp, f1p, s->size); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + + + +/* Calculate b^e mod m for random b and m of s->size limbs and random e of 6 + limbs. m is forced to odd so that redc can be used. e is limited in + size so the calculation doesn't take too long. 
*/ +#define SPEED_ROUTINE_MPZ_POWM(function) \ + { \ + mpz_t r, b, e, m; \ + unsigned i; \ + double t; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + mpz_init (r); \ + if (s->r < 2) \ + mpz_init_set_n (b, s->xp, s->size); \ + else \ + mpz_init_set_ui (b, s->r); \ + mpz_init_set_n (m, s->yp, s->size); \ + mpz_setbit (m, 0); /* force m to odd */ \ + mpz_init_set_n (e, s->xp_block, 6); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (r, b, e, m); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + mpz_clear (r); \ + mpz_clear (b); \ + mpz_clear (e); \ + mpz_clear (m); \ + return t; \ + } + +/* (m-2)^0xAAAAAAAA mod m */ +#define SPEED_ROUTINE_MPZ_POWM_UI(function) \ + { \ + mpz_t r, b, m; \ + unsigned long e; \ + unsigned i; \ + double t; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + mpz_init (r); \ + \ + /* force m to odd */ \ + mpz_init (m); \ + mpz_set_n (m, s->xp, s->size); \ + PTR(m)[0] |= 1; \ + \ + e = (~ (unsigned long) 0) / 3; \ + if (s->r != 0) \ + e = s->r; \ + \ + mpz_init_set (b, m); \ + mpz_sub_ui (b, b, 2); \ +/* printf ("%X\n", mpz_get_ui(m)); */ \ + i = s->reps; \ + speed_starttime (); \ + do \ + function (r, b, e, m); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + mpz_clear (r); \ + mpz_clear (b); \ + mpz_clear (m); \ + return t; \ + } + + +#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \ + { \ + mp_ptr wp, wp2, xp, yp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 0); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \ + xp = s->xp; \ + yp = s->yp; \ + \ + if (s->r == 0) ; \ + else if (s->r == 1) { xp = wp; } \ + else if (s->r == 2) { yp = wp2; } \ + else if (s->r == 3) { xp = wp; yp = wp2; } \ + else if (s->r == 4) { xp = wp2; yp = wp; } \ + else { \ + TMP_FREE; \ + return -1.0; \ + } \ + if (xp != s->xp) MPN_COPY (xp, s->xp, s->size); \ + if (yp != s->yp) MPN_COPY (yp, s->yp, s->size); \ + \ + 
speed_operand_src (s, xp, s->size); \ + speed_operand_src (s, yp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_operand_dst (s, wp2, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_ADDSUB_N(function) \ + SPEED_ROUTINE_MPN_ADDSUB_CALL \ + (function (wp, wp2, xp, yp, s->size)); + +#define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \ + SPEED_ROUTINE_MPN_ADDSUB_CALL \ + (function (wp, wp2, xp, yp, s->size, 0)); + + +/* Doing an Nx1 gcd with the given r. */ +#define SPEED_ROUTINE_MPN_GCD_1N(function) \ + { \ + mp_ptr xp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + SPEED_RESTRICT_COND (s->r != 0); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \ + MPN_COPY (xp, s->xp, s->size); \ + xp[0] |= refmpn_zero_p (xp, s->size); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (xp, s->size, s->r); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + + +/* SPEED_BLOCK_SIZE many one GCDs of s->size bits each. */ + +#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \ + { \ + unsigned i, j; \ + mp_ptr px, py; \ + mp_limb_t x_mask, y_mask; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp); \ + MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \ + MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \ + \ + x_mask = MP_LIMB_T_LOWBITMASK (s->size); \ + y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? 
s->r : s->size); \ + for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ + { \ + px[i] &= x_mask; px[i] += (px[i] == 0); \ + py[i] &= y_mask; py[i] += (py[i] == 0); \ + setup; \ + } \ + \ + speed_operand_src (s, px, SPEED_BLOCK_SIZE); \ + speed_operand_src (s, py, SPEED_BLOCK_SIZE); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + j = SPEED_BLOCK_SIZE; \ + do \ + { \ + call; \ + } \ + while (--j != 0); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + \ + s->time_divisor = SPEED_BLOCK_SIZE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_GCD_1(function) \ + SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1])) + +#define SPEED_ROUTINE_MPN_GCD_11(function) \ + SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1), \ + function (px[j-1], py[j-1])) + +/* Multiply limbs by (B+1). Then we get a gcd exceeding one limb, so + we can measure gcd_22 loop only, without gcd_11. */ +#define SPEED_ROUTINE_MPN_GCD_22(function) \ + SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1), \ + function (px[j-1], px[j-1], py[j-1], py[j-1])) + +#define SPEED_ROUTINE_MPN_JACBASE(function) \ + SPEED_ROUTINE_MPN_GCD_1_CALL \ + ({ \ + /* require xxp_block, SPEED_BLOCK_SIZE); \ + speed_operand_src (s, s->yp_block, SPEED_BLOCK_SIZE); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + mp_limb_t chain = 0; \ + do \ + { \ + for (j = 0; j < SPEED_BLOCK_SIZE; j+= 2) \ + { \ + /* randomized but successively dependent */ \ + function (s->xp_block[j] | GMP_NUMB_HIGHBIT, \ + s->xp_block[j+1] + chain, \ + s->yp_block[j] | GMP_NUMB_HIGHBIT, \ + s->yp_block[j+1], &m); \ + chain += m.u[0][0]; \ + } \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + /* make sure the compiler won't optimize away chain */ \ + noop_1 (chain); \ + \ + s->time_divisor = SPEED_BLOCK_SIZE / 2; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc) \ + { \ + mp_size_t hgcd_init_itch, hgcd_itch; \ + mp_ptr ap, 
bp, wp, tmp1; \ + struct hgcd_matrix hgcd; \ + int res; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + if (s->size < 2) \ + return -1; \ + \ + TMP_MARK; \ + \ + SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \ + \ + s->xp[s->size - 1] |= 1; \ + s->yp[s->size - 1] |= 1; \ + \ + hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \ + hgcd_itch = itchfunc (s->size); \ + \ + SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (wp, hgcd_itch, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, ap, s->size + 1); \ + speed_operand_dst (s, bp, s->size + 1); \ + speed_operand_dst (s, wp, hgcd_itch); \ + speed_operand_dst (s, tmp1, hgcd_init_itch); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + MPN_COPY (ap, s->xp, s->size); \ + MPN_COPY (bp, s->yp, s->size); \ + mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \ + res = func (ap, bp, s->size, &hgcd, wp); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc) \ + { \ + mp_size_t hgcd_init_itch, hgcd_step_itch; \ + mp_ptr ap, bp, wp, tmp1; \ + struct hgcd_matrix hgcd; \ + mp_size_t p = s->size/2; \ + int res; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + if (s->size < 2) \ + return -1; \ + \ + TMP_MARK; \ + \ + SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \ + \ + s->xp[s->size - 1] |= 1; \ + s->yp[s->size - 1] |= 1; \ + \ + hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \ + hgcd_step_itch = itchfunc (s->size, p); \ + \ + SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + 
speed_operand_dst (s, ap, s->size + 1); \ + speed_operand_dst (s, bp, s->size + 1); \ + speed_operand_dst (s, wp, hgcd_step_itch); \ + speed_operand_dst (s, tmp1, hgcd_init_itch); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + MPN_COPY (ap, s->xp, s->size); \ + MPN_COPY (bp, s->yp, s->size); \ + mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \ + res = func (&hgcd, ap, bp, s->size, p, wp); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + TMP_FREE; \ + return t; \ + } + +/* Run some GCDs of s->size limbs each. The number of different data values + is decreased as s->size**2, since GCD is a quadratic algorithm. + SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT + though, because the plain gcd is about twice as fast as gcdext. */ + +#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \ + { \ + unsigned i; \ + mp_size_t j, pieces, psize; \ + mp_ptr wp, wp2, xtmp, ytmp, px, py; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size+1, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \ + \ + pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size; \ + pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \ + pieces = MAX (pieces, 1); \ + \ + psize = pieces * s->size; \ + px = TMP_ALLOC_LIMBS (psize); \ + py = TMP_ALLOC_LIMBS (psize); \ + MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \ + MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \ + \ + /* Requirements: x >= y, y must be odd, high limbs != 0. \ + No need to ensure random numbers are really great. 
*/ \ + for (j = 0; j < pieces; j++) \ + { \ + mp_ptr x = px + j * s->size; \ + mp_ptr y = py + j * s->size; \ + if (x[s->size - 1] == 0) x[s->size - 1] = 1; \ + if (y[s->size - 1] == 0) y[s->size - 1] = 1; \ + \ + if (x[s->size - 1] < y[s->size - 1]) \ + MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]); \ + else if (x[s->size - 1] == y[s->size - 1]) \ + { \ + x[s->size - 1] = 2; \ + y[s->size - 1] = 1; \ + } \ + y[0] |= 1; \ + } \ + \ + speed_operand_src (s, px, psize); \ + speed_operand_src (s, py, psize); \ + speed_operand_dst (s, xtmp, s->size); \ + speed_operand_dst (s, ytmp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + j = pieces; \ + do \ + { \ + MPN_COPY (xtmp, px+(j - 1)*s->size, s->size); \ + MPN_COPY (ytmp, py+(j - 1)*s->size, s->size); \ + call; \ + } \ + while (--j != 0); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + \ + s->time_divisor = pieces; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_GCD(function) \ + SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size)) + +#define SPEED_ROUTINE_MPN_GCDEXT(function) \ + SPEED_ROUTINE_MPN_GCD_CALL \ + (4, { mp_size_t wp2size; \ + function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); }) + + +#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function) \ + { \ + unsigned i; \ + mp_size_t j, pieces, psize, wp2size; \ + mp_ptr wp, wp2, xtmp, ytmp, px, py; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + \ + SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \ + MPN_COPY (xtmp, s->xp, s->size); \ + MPN_COPY (ytmp, s->yp, s->size); \ + \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size+1, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \ + \ + pieces = SPEED_BLOCK_SIZE / 3; \ + psize = 3 * pieces; \ + px = TMP_ALLOC_LIMBS (psize); \ + py = TMP_ALLOC_LIMBS (psize); \ 
+ MPN_COPY (px, s->xp_block, psize); \ + MPN_COPY (py, s->yp_block, psize); \ + \ + /* x must have at least as many bits as y, \ + high limbs must be non-zero */ \ + for (j = 0; j < pieces; j++) \ + { \ + mp_ptr x = px+3*j; \ + mp_ptr y = py+3*j; \ + x[2] += (x[2] == 0); \ + y[2] += (y[2] == 0); \ + if (x[2] < y[2]) \ + MP_LIMB_T_SWAP (x[2], y[2]); \ + } \ + \ + speed_operand_src (s, px, psize); \ + speed_operand_src (s, py, psize); \ + speed_operand_dst (s, xtmp, s->size); \ + speed_operand_dst (s, ytmp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + mp_ptr x = px; \ + mp_ptr y = py; \ + mp_ptr xth = &xtmp[s->size-3]; \ + mp_ptr yth = &ytmp[s->size-3]; \ + j = pieces; \ + do \ + { \ + xth[0] = x[0], xth[1] = x[1], xth[2] = x[2]; \ + yth[0] = y[0], yth[1] = y[1], yth[2] = y[2]; \ + \ + ytmp[0] |= 1; /* y must be odd, */ \ + \ + function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \ + \ + x += 3; \ + y += 3; \ + } \ + while (--j != 0); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + \ + s->time_divisor = pieces; \ + return t; \ + } + +#define SPEED_ROUTINE_GMP_PRIMESIEVE(function) \ +{ \ + mp_ptr wp; \ + unsigned i; \ + double t; \ + mp_limb_t a = s->size * GMP_LIMB_BITS * 3; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, a); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ +} + + +/* Calculate nextprime(n) for random n of s->size bits (not limbs). 
*/ +#define SPEED_ROUTINE_MPZ_NEXTPRIME(function) \ + { \ + unsigned i, j; \ + mpz_t wp, n; \ + double t; \ + \ + SPEED_RESTRICT_COND (s->size >= 10); \ + \ + mpz_init (wp); \ + mpz_init_set_n (n, s->xp, s->size); \ + /* limit to s->size bits, as this function is very slow */ \ + mpz_tdiv_r_2exp (n, n, s->size); \ + /* set high bits so operand and result are general s->size bits */ \ + mpz_setbit (n, s->size - 1); \ + mpz_clrbit (n, s->size - 2); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + /* nextprime timing is variable, so average over many calls */ \ + j = SPEED_BLOCK_SIZE - 1; \ + /* first call starts at random n, later calls go prime to next prime */ \ + function (wp, n); \ + do \ + { \ + function (wp, wp); \ + } \ + while (--j != 0); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + mpz_clear (wp); \ + mpz_clear (n); \ + \ + s->time_divisor = SPEED_BLOCK_SIZE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPZ_JACOBI(function) \ + { \ + mpz_t a, b; \ + unsigned i; \ + mp_size_t j, pieces, psize; \ + mp_ptr px, py; \ + double t; \ + TMP_DECL; \ + \ + TMP_MARK; \ + pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1); \ + pieces = MAX (pieces, 1); \ + s->time_divisor = pieces; \ + \ + psize = pieces * s->size; \ + px = TMP_ALLOC_LIMBS (psize); \ + py = TMP_ALLOC_LIMBS (psize); \ + MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \ + MPN_COPY (py, pieces==1 ? 
s->yp : s->yp_block, psize); \ + \ + for (j = 0; j < pieces; j++) \ + { \ + mp_ptr x = px+j*s->size; \ + mp_ptr y = py+j*s->size; \ + \ + /* y odd */ \ + y[0] |= 1; \ + \ + /* high limbs non-zero */ \ + if (x[s->size-1] == 0) x[s->size-1] = 1; \ + if (y[s->size-1] == 0) y[s->size-1] = 1; \ + } \ + \ + SIZ(a) = s->size; \ + SIZ(b) = s->size; \ + \ + speed_operand_src (s, px, psize); \ + speed_operand_src (s, py, psize); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + j = pieces; \ + do \ + { \ + PTR(a) = px+(j-1)*s->size; \ + PTR(b) = py+(j-1)*s->size; \ + function (a, b); \ + } \ + while (--j != 0); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_DIVREM_2(function) \ + { \ + mp_ptr wp, xp; \ + mp_limb_t yp[2]; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + /* source is destroyed */ \ + MPN_COPY (xp, s->xp, s->size); \ + \ + /* divisor must be normalized */ \ + MPN_COPY (yp, s->yp_block, 2); \ + yp[1] |= GMP_NUMB_HIGHBIT; \ + \ + speed_operand_src (s, xp, s->size); \ + speed_operand_src (s, yp, 2); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, 0, xp, s->size, yp); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_DIV_QR_1(function) \ + { \ + mp_ptr wp, xp; \ + mp_limb_t d; \ + mp_limb_t r; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + d = s->r; \ + if (d == 0) \ + d = 1; \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i 
= s->reps; \ + do \ + r = function (wp, wp+s->size-1, s->xp, s->size, d); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_DIV_QR_1N_PI1(function) \ + { \ + mp_ptr wp, xp; \ + mp_limb_t d, dinv; \ + mp_limb_t r; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + d = s->r; \ + /* divisor must be normalized */ \ + SPEED_RESTRICT_COND (d & GMP_NUMB_HIGHBIT); \ + invert_limb (dinv, d); \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + r = function (wp, s->xp, s->size, 0, d, dinv); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm) \ + { \ + mp_ptr wp, xp; \ + mp_limb_t yp[2]; \ + mp_limb_t rp[2]; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + \ + /* divisor must be normalized */ \ + MPN_COPY (yp, s->yp_block, 2); \ + if (norm) \ + yp[1] |= GMP_NUMB_HIGHBIT; \ + else \ + { \ + yp[1] &= ~GMP_NUMB_HIGHBIT; \ + if (yp[1] == 0) \ + yp[1] = 1; \ + } \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, yp, 2); \ + speed_operand_dst (s, wp, s->size); \ + speed_operand_dst (s, rp, 2); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, rp, s->xp, s->size, yp); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MODLIMB_INVERT(function) \ + { \ + unsigned i, j; \ + mp_ptr xp; \ + mp_limb_t n = 1; \ + double t; \ + \ + xp = s->xp_block-1; \ + \ + speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; 
\ + do \ + { \ + j = SPEED_BLOCK_SIZE; \ + do \ + { \ + /* randomized but successively dependent */ \ + n += (xp[j] << 1); \ + \ + function (n, n); \ + } \ + while (--j != 0); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + /* make sure the compiler won't optimize away n */ \ + noop_1 (n); \ + \ + s->time_divisor = SPEED_BLOCK_SIZE; \ + return t; \ + } + + +#define SPEED_ROUTINE_MPN_SQRTROOT_CALL(call) \ + { \ + mp_ptr wp, wp2; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_operand_dst (s, wp2, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + + +/* Calculate worst case for perfect_power + Worst case is multiple prime factors larger than trial div limit. 
*/ +#define SPEED_ROUTINE_MPN_PERFECT_POWER(function) \ + { \ + mpz_t r; \ + unsigned i, power; \ + double t; \ + \ + SPEED_RESTRICT_COND (s->size >= 10); \ + \ + mpz_init (r); \ + power = s->size * GMP_NUMB_BITS / 17; \ + mpz_ui_pow_ui(r, (1 << 17) - 1, power - 1); \ + mpz_mul_ui(r, r, (1 << 16) + 1); /* larger than 1000th prime */ \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (PTR(r), SIZ(r)); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + mpz_clear (r); \ + return t; \ + } + +/* Calculate worst case (larger prime) for perfect_square */ +#define SPEED_ROUTINE_MPN_PERFECT_SQUARE(function) \ + { \ + mpz_t r; \ + unsigned i; \ + double t; \ + \ + SPEED_RESTRICT_COND (s->size >= 2); \ + mpz_init_set_n (r, s->xp, s->size / 2); \ + mpz_setbit (r, s->size * GMP_NUMB_BITS / 2 - 1); \ + mpz_mul (r, r, r); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (PTR(r), SIZ(r)); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + mpz_clear (r); \ + return t; \ + } + + +/* s->size controls the number of limbs in the input, s->r is the base, or + decimal by default. */ +#define SPEED_ROUTINE_MPN_GET_STR(function) \ + { \ + unsigned char *wp; \ + mp_size_t wn; \ + mp_ptr xp; \ + int base; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + base = s->r == 0 ? 
10 : s->r; \ + SPEED_RESTRICT_COND (base >= 2 && base <= 256); \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (xp, s->size + 1, s->align_xp); \ + \ + MPN_SIZEINBASE (wn, s->xp, s->size, base); \ + wp = (unsigned char *) TMP_ALLOC (wn); \ + \ + /* use this during development to guard against overflowing wp */ \ + /* \ + MPN_COPY (xp, s->xp, s->size); \ + ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wn); \ + */ \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, xp, s->size); \ + speed_operand_dst (s, (mp_ptr) wp, wn/GMP_LIMB_BYTES); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + MPN_COPY (xp, s->xp, s->size); \ + function (wp, base, xp, s->size); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +/* s->size controls the number of digits in the input, s->r is the base, or + decimal by default. */ +#define SPEED_ROUTINE_MPN_SET_STR_CALL(call) \ + { \ + unsigned char *xp; \ + mp_ptr wp; \ + mp_size_t wn; \ + unsigned i; \ + int base; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + base = s->r == 0 ? 10 : s->r; \ + SPEED_RESTRICT_COND (base >= 2 && base <= 256); \ + \ + TMP_MARK; \ + \ + xp = (unsigned char *) TMP_ALLOC (s->size); \ + for (i = 0; i < s->size; i++) \ + xp[i] = s->xp[i] % base; \ + \ + LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); \ + SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \ + \ + /* use this during development to check wn is big enough */ \ + /* \ + ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn); \ + */ \ + \ + speed_operand_src (s, (mp_ptr) xp, s->size/GMP_LIMB_BYTES); \ + speed_operand_dst (s, wp, wn); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + + +/* Run an accel gcd find_a() function over various data values. 
A set of + values is used in case some run particularly fast or slow. The size + parameter is ignored, the amount of data tested is fixed. */ + +#define SPEED_ROUTINE_MPN_GCD_FINDA(function) \ + { \ + unsigned i, j; \ + mp_limb_t cp[SPEED_BLOCK_SIZE][2]; \ + double t; \ + TMP_DECL; \ + \ + TMP_MARK; \ + \ + /* low must be odd, high must be non-zero */ \ + for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ + { \ + cp[i][0] = s->xp_block[i] | 1; \ + cp[i][1] = s->yp_block[i] + (s->yp_block[i] == 0); \ + } \ + \ + speed_operand_src (s, &cp[0][0], 2*SPEED_BLOCK_SIZE); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + j = SPEED_BLOCK_SIZE; \ + do \ + { \ + function (cp[j-1]); \ + } \ + while (--j != 0); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + \ + s->time_divisor = SPEED_BLOCK_SIZE; \ + return t; \ + } + + +/* "call" should do "count_foo_zeros(c,n)". + Give leading=1 if foo is leading zeros, leading=0 for trailing. + Give zero=1 if n=0 is allowed in the call, zero=0 if not. */ + +#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero) \ + { \ + mp_ptr xp; \ + int i, c; \ + unsigned j; \ + mp_limb_t n; \ + double t; \ + TMP_DECL; \ + \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp); \ + \ + if (! 
speed_routine_count_zeros_setup (s, xp, leading, zero)) \ + return -1.0; \ + speed_operand_src (s, xp, SPEED_BLOCK_SIZE); \ + speed_cache_fill (s); \ + \ + c = 0; \ + speed_starttime (); \ + j = s->reps; \ + do { \ + for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ + { \ + n = xp[i]; \ + n ^= c; \ + +#define SPEED_ROUTINE_COUNT_ZEROS_B() \ + } \ + } while (--j != 0); \ + t = speed_endtime (); \ + \ + /* don't let c go dead */ \ + noop_1 (c); \ + \ + s->time_divisor = SPEED_BLOCK_SIZE; \ + \ + TMP_FREE; \ + return t; \ + } \ + +#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero) \ + do { \ + SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero); \ + call; \ + SPEED_ROUTINE_COUNT_ZEROS_B (); \ + } while (0) \ + +#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero) \ + SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero) +#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun) \ + SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0) + +#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero) \ + SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero) +#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(fun) \ + SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0) + + +#define SPEED_ROUTINE_INVERT_LIMB_CALL(call) \ + { \ + unsigned i, j; \ + mp_limb_t d, dinv=0; \ + mp_ptr xp = s->xp_block - 1; \ + \ + s->time_divisor = SPEED_BLOCK_SIZE; \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + j = SPEED_BLOCK_SIZE; \ + do \ + { \ + d = dinv ^ xp[j]; \ + d |= GMP_LIMB_HIGHBIT; \ + do { call; } while (0); \ + } \ + while (--j != 0); \ + } \ + while (--i != 0); \ + \ + /* don't let the compiler optimize everything away */ \ + noop_1 (dinv); \ + \ + return speed_endtime(); \ + } + + +#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function) \ + { \ + unsigned i; \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (); \ + while (--i != 0); \ + return speed_endtime (); \ + } + + +#define SPEED_ROUTINE_MPN_ZERO_CALL(call) \ + { \ + mp_ptr wp; \ + unsigned i; \ + double t; \ + TMP_DECL; \ + \ + SPEED_RESTRICT_COND (s->size >= 0); \ 
+ \ + TMP_MARK; \ + SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ + speed_operand_dst (s, wp, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE; \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_ZERO(function) \ + SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size)) + + +#endif diff --git a/gmp-6.3.0/tune/sqr_basecase.c b/gmp-6.3.0/tune/sqr_basecase.c new file mode 100644 index 0000000..93adac5 --- /dev/null +++ b/gmp-6.3.0/tune/sqr_basecase.c @@ -0,0 +1,2 @@ +/* not sure that an empty file can compile, so put in a dummy */ +int sqr_basecase_dummy; diff --git a/gmp-6.3.0/tune/time.c b/gmp-6.3.0/tune/time.c new file mode 100644 index 0000000..5ba482b --- /dev/null +++ b/gmp-6.3.0/tune/time.c @@ -0,0 +1,1598 @@ +/* Time routines for speed measurements. + +Copyright 1999-2004, 2010-2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +/* Usage: + + The code in this file implements the lowest level of time measuring, + simple one-time measuring of time between two points. + + void speed_starttime (void) + double speed_endtime (void) + Call speed_starttime to start measuring, and then call speed_endtime + when done. + + speed_endtime returns the time taken, in seconds. Or if the timebase + is in CPU cycles and the CPU frequency is unknown then speed_endtime + returns cycles. Applications can identify the cycles return by + checking for speed_cycletime (described below) equal to 1.0. + + If some sort of temporary glitch occurs then speed_endtime returns + 0.0. Currently this is for various cases where a negative time has + occurred. This unfortunately occurs with getrusage on some systems, + and with the hppa cycle counter on hpux. + + double speed_cycletime + The time in seconds for each CPU cycle. For example on a 100 MHz CPU + this would be 1.0e-8. + + If the CPU frequency is unknown, then speed_cycletime is either 0.0 + or 1.0. It's 0.0 when speed_endtime is returning seconds, or it's + 1.0 when speed_endtime is returning cycles. + + It may be noted that "speed_endtime() / speed_cycletime" gives a + measured time in cycles, irrespective of whether speed_endtime is + returning cycles or seconds. (Assuming cycles can be had, ie. it's + either cycles already or the cpu frequency is known. See also + speed_cycletime_need_cycles below.) + + double speed_unittime + The unit of time measurement accuracy for the timing method in use. + This is in seconds or cycles, as per speed_endtime. + + char speed_time_string[] + A null-terminated string describing the time method in use. + + void speed_time_init (void) + Initialize time measuring. speed_starttime() does this + automatically, so it's only needed if an application wants to inspect + the above global variables before making a measurement. + + int speed_precision + The intended accuracy of time measurements. 
speed_measure() in + common.c for instance runs target routines with enough repetitions so + it takes at least "speed_unittime * speed_precision" (this expression + works for both cycles and seconds from speed_endtime). + + A program can provide an option so the user can set speed_precision. + If speed_precision is zero when speed_time_init or speed_starttime + first runs then it gets a default based on the measuring method + chosen. (More precision for higher accuracy methods.) + + void speed_cycletime_need_seconds (void) + Call this to demand that speed_endtime will return seconds, and not + cycles. If only cycles are available then an error is printed and + the program exits. + + void speed_cycletime_need_cycles (void) + Call this to demand that speed_cycletime is non-zero, so that + "speed_endtime() / speed_cycletime" will give times in cycles. + + + + Notes: + + Various combinations of cycle counter, read_real_time(), getrusage(), + gettimeofday() and times() can arise, according to which are available + and their precision. + + + Allowing speed_endtime() to return either seconds or cycles is only a + slight complication and makes it possible for the speed program to do + some sensible things without demanding the CPU frequency. If seconds are + being measured then it can always print seconds, and if cycles are being + measured then it can always print them without needing to know how long + they are. Also the tune program doesn't care at all what the units are. + + GMP_CPU_FREQUENCY can always be set when the automated methods in freq.c + fail. This will be needed if times in seconds are wanted but a cycle + counter is being used, or if times in cycles are wanted but getrusage or + another seconds based timer is in use. + + If the measuring method uses a cycle counter but supplements it with + getrusage or the like, then knowing the CPU frequency is mandatory since + the code compares values from the two. 
+ + + Not done: + + Solaris gethrtime() seems no more than a slow way to access the Sparc V9 + cycle counter. gethrvtime() seems to be relevant only to lightweight + processes; it doesn't for instance give nanosecond virtual time. So + neither of these is used. + + + Bugs: + + getrusage_microseconds_p is fundamentally flawed: getrusage and + gettimeofday can have resolutions other than clock ticks or microseconds, + for instance IRIX 5 has a tick of 10 ms but a getrusage of 1 ms. + + + Enhancements: + + The SGI hardware counter has 64 bits on some machines, which could be + used when available. But perhaps 32 bits is enough range, and then rely + on the getrusage supplement. + + Maybe getrusage (or times) should be used as a supplement for any + wall-clock measuring method. Currently a wall clock with a good range + (eg. a 64-bit cycle counter) is used without a supplement. + + On PowerPC the timebase registers could be used, but would have to do + something to find out the speed. On 6xx chips it's normally 1/4 bus + speed, on 4xx chips it's either that or an external clock. Measuring + against gettimeofday might be ok. 
*/ + + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include /* for getenv() */ + +#if HAVE_FCNTL_H +#include /* for open() */ +#endif + +#if HAVE_STDINT_H +#include /* for uint64_t */ +#endif + +#if HAVE_UNISTD_H +#include /* for sysconf() */ +#endif + +#include + +#if TIME_WITH_SYS_TIME +# include /* for struct timeval */ +# include +#else +# if HAVE_SYS_TIME_H +# include +# else +# include +# endif +#endif + +#if HAVE_SYS_MMAN_H +#include /* for mmap() */ +#endif + +#if HAVE_SYS_RESOURCE_H +#include /* for struct rusage */ +#endif + +#if HAVE_SYS_SYSSGI_H +#include /* for syssgi() */ +#endif + +#if HAVE_SYS_SYSTEMCFG_H +#include /* for RTC_POWER on AIX */ +#endif + +#if HAVE_SYS_TIMES_H +#include /* for times() and struct tms */ +#endif + +#include "gmp-impl.h" + +#include "speed.h" + + +/* strerror is only used for some stuff on newish systems, no need to have a + proper replacement */ +#if ! HAVE_STRERROR +#define strerror(n) "" +#endif + + +char speed_time_string[256]; +int speed_precision = 0; +double speed_unittime; +double speed_cycletime = 0.0; + + +/* don't rely on "unsigned" to "double" conversion, it's broken in SunOS 4 + native cc */ +#define M_2POWU (((double) INT_MAX + 1.0) * 2.0) + +#define M_2POW32 4294967296.0 +#define M_2POW64 (M_2POW32 * M_2POW32) + + +/* Conditionals for the time functions available are done with normal C + code, which is a lot easier than wildly nested preprocessor directives. + + The choice of what to use is partly made at run-time, according to + whether the cycle counter works and the measured accuracy of getrusage + and gettimeofday. + + A routine that's not available won't be getting called, but is an abort() + to be sure it isn't called mistakenly. + + It can be assumed that if a function exists then its data type will, but + if the function doesn't then the data type might or might not exist, so + the type can't be used unconditionally. 
The "struct_rusage" etc macros + provide dummies when the respective function doesn't exist. */ + + +#if HAVE_SPEED_CYCLECOUNTER +static const int have_cycles = HAVE_SPEED_CYCLECOUNTER; +#else +static const int have_cycles = 0; +#define speed_cyclecounter(p) ASSERT_FAIL (speed_cyclecounter not available) +#endif + +/* "stck" returns ticks since 1 Jan 1900 00:00 GMT, where each tick is 2^-12 + microseconds. Same #ifdefs here as in longlong.h. */ +#if defined (__GNUC__) && ! defined (NO_ASM) \ + && (defined (__i370__) || defined (__s390__) || defined (__mvs__)) +static const int have_stck = 1; +static const int use_stck = 1; /* always use when available */ +typedef uint64_t stck_t; /* gcc for s390 is quite new, always has uint64_t */ +#define STCK(timestamp) \ + do { \ + asm ("stck %0" : "=Q" (timestamp)); \ + } while (0) +#else +static const int have_stck = 0; +static const int use_stck = 0; +typedef unsigned long stck_t; /* dummy */ +#define STCK(timestamp) ASSERT_FAIL (stck instruction not available) +#endif +#define STCK_PERIOD (1.0 / 4096e6) /* 2^-12 microseconds */ + +/* mftb + Enhancement: On 64-bit chips mftb gives a 64-bit value, no need for mftbu + and a loop (see powerpc64.asm). */ +#if HAVE_HOST_CPU_FAMILY_powerpc +static const int have_mftb = 1; +#if defined (__GNUC__) && ! defined (NO_ASM) +#define MFTB(a) \ + do { \ + unsigned __h1, __l, __h2; \ + do { \ + asm volatile ("mftbu %0\n" \ + "mftb %1\n" \ + "mftbu %2" \ + : "=r" (__h1), \ + "=r" (__l), \ + "=r" (__h2)); \ + } while (__h1 != __h2); \ + a[0] = __l; \ + a[1] = __h1; \ + } while (0) +#else +#define MFTB(a) mftb_function (a) +#endif +#else /* ! powerpc */ +static const int have_mftb = 0; +#define MFTB(a) \ + do { \ + a[0] = 0; \ + a[1] = 0; \ + ASSERT_FAIL (mftb not available); \ + } while (0) +#endif + +/* Unicos 10.X has syssgi(), but not mmap(). 
*/ +#if HAVE_SYSSGI && HAVE_MMAP +static const int have_sgi = 1; +#else +static const int have_sgi = 0; +#endif + +#if HAVE_READ_REAL_TIME +static const int have_rrt = 1; +#else +static const int have_rrt = 0; +#define read_real_time(t,s) ASSERT_FAIL (read_real_time not available) +#define time_base_to_time(t,s) ASSERT_FAIL (time_base_to_time not available) +#define RTC_POWER 1 +#define RTC_POWER_PC 2 +#define timebasestruct_t struct timebasestruct_dummy +struct timebasestruct_dummy { + int flag; + unsigned int tb_high; + unsigned int tb_low; +}; +#endif + +#if HAVE_CLOCK_GETTIME +static const int have_cgt = 1; +#define struct_timespec struct timespec +#else +static const int have_cgt = 0; +#define struct_timespec struct timespec_dummy +#define clock_gettime(id,ts) (ASSERT_FAIL (clock_gettime not available), -1) +#define clock_getres(id,ts) (ASSERT_FAIL (clock_getres not available), -1) +#endif + +#if HAVE_GETRUSAGE +static const int have_grus = 1; +#define struct_rusage struct rusage +#else +static const int have_grus = 0; +#define getrusage(n,ru) ASSERT_FAIL (getrusage not available) +#define struct_rusage struct rusage_dummy +#endif + +#if HAVE_GETTIMEOFDAY +static const int have_gtod = 1; +#define struct_timeval struct timeval +#else +static const int have_gtod = 0; +#define gettimeofday(tv,tz) ASSERT_FAIL (gettimeofday not available) +#define struct_timeval struct timeval_dummy +#endif + +#if HAVE_TIMES +static const int have_times = 1; +#define struct_tms struct tms +#else +static const int have_times = 0; +#define times(tms) ASSERT_FAIL (times not available) +#define struct_tms struct tms_dummy +#endif + +struct tms_dummy { + long tms_utime; +}; +struct timeval_dummy { + long tv_sec; + long tv_usec; +}; +struct rusage_dummy { + struct_timeval ru_utime; +}; +struct timespec_dummy { + long tv_sec; + long tv_nsec; +}; + +static int use_cycles; +static int use_mftb; +static int use_sgi; +static int use_rrt; +static int use_cgt; +static int use_gtod; +static int 
use_grus; +static int use_times; +static int use_tick_boundary; + +static unsigned start_cycles[2]; +static stck_t start_stck; +static unsigned start_mftb[2]; +static unsigned start_sgi; +static timebasestruct_t start_rrt; +static struct_timespec start_cgt; +static struct_rusage start_grus; +static struct_timeval start_gtod; +static struct_tms start_times; + +static double cycles_limit = 1e100; +static double mftb_unittime; +static double sgi_unittime; +static double cgt_unittime; +static double grus_unittime; +static double gtod_unittime; +static double times_unittime; + +/* for RTC_POWER format, ie. seconds and nanoseconds */ +#define TIMEBASESTRUCT_SECS(t) ((t)->tb_high + (t)->tb_low * 1e-9) + + +/* Return a string representing a time in seconds, nicely formatted. + Eg. "10.25ms". */ +char * +unittime_string (double t) +{ + static char buf[128]; + + const char *unit; + int prec; + + /* choose units and scale */ + if (t < 1e-6) + t *= 1e9, unit = "ns"; + else if (t < 1e-3) + t *= 1e6, unit = "us"; + else if (t < 1.0) + t *= 1e3, unit = "ms"; + else + unit = "s"; + + /* want 4 significant figures */ + if (t < 1.0) + prec = 4; + else if (t < 10.0) + prec = 3; + else if (t < 100.0) + prec = 2; + else + prec = 1; + + sprintf (buf, "%.*f%s", prec, t, unit); + return buf; +} + + +static jmp_buf cycles_works_buf; + +static RETSIGTYPE +cycles_works_handler (int sig) +{ + longjmp (cycles_works_buf, 1); +} + +int +cycles_works_p (void) +{ + static int result = -1; + + if (result != -1) + goto done; + + /* FIXME: On linux, the cycle counter is not saved and restored over + * context switches, making it almost useless for precise cputime + * measurements. When available, it's better to use clock_gettime, + * which seems to have reasonable accuracy (tested on x86_32, + * linux-2.6.26, glibc-2.7). 
However, there are also some linux + * systems where clock_gettime is broken in one way or the other, + * like CLOCK_PROCESS_CPUTIME_ID not implemented (easy case) or + * kind-of implemented but broken (needs code to detect that), and + * on those systems a wall-clock cycle counter is the least bad + * fallback. + * + * So we need some code to disable the cycle counter on some but not + * all linux systems. */ +#ifdef SIGILL + { + RETSIGTYPE (*old_handler) (int); + unsigned cycles[2]; + + old_handler = signal (SIGILL, cycles_works_handler); + if (old_handler == SIG_ERR) + { + if (speed_option_verbose) + printf ("cycles_works_p(): SIGILL not supported, assuming speed_cyclecounter() works\n"); + goto yes; + } + if (setjmp (cycles_works_buf)) + { + if (speed_option_verbose) + printf ("cycles_works_p(): SIGILL during speed_cyclecounter(), so doesn't work\n"); + result = 0; + goto done; + } + speed_cyclecounter (cycles); + signal (SIGILL, old_handler); + if (speed_option_verbose) + printf ("cycles_works_p(): speed_cyclecounter() works\n"); + } +#else + + if (speed_option_verbose) + printf ("cycles_works_p(): SIGILL not defined, assuming speed_cyclecounter() works\n"); + goto yes; +#endif + + yes: + result = 1; + + done: + return result; +} + + +/* The number of clock ticks per second, but looking at sysconf rather than + just CLK_TCK, where possible. 
*/ +long +clk_tck (void) +{ + static long result = -1L; + if (result != -1L) + return result; + +#if HAVE_SYSCONF + result = sysconf (_SC_CLK_TCK); + if (result != -1L) + { + if (speed_option_verbose) + printf ("sysconf(_SC_CLK_TCK) is %ld per second\n", result); + return result; + } + + fprintf (stderr, + "sysconf(_SC_CLK_TCK) not working, using CLK_TCK instead\n"); +#endif + +#ifdef CLK_TCK + result = CLK_TCK; + if (speed_option_verbose) + printf ("CLK_TCK is %ld per second\n", result); + return result; +#else + fprintf (stderr, "CLK_TCK not defined, cannot continue\n"); + abort (); +#endif +} + + +/* If two times can be observed less than half a clock tick apart, then + assume "get" is microsecond accurate. + + Two times only 1 microsecond apart are not believed, since some kernels + take it upon themselves to ensure gettimeofday doesn't return the same + value twice, for the benefit of applications using it for a timestamp. + This is obviously very stupid given the speed of CPUs these days. + + Making "reps" many calls to noop_1() is designed to waste some CPU, with + a view to getting measurements 2 microseconds (or more) apart. "reps" is + increased progressively until such a period is seen. + + The outer loop "attempts" are just to allow for any random nonsense or + system load upsetting the measurements (ie. making two successive calls + to "get" come out as a longer interval than normal). + + Bugs: + + The assumption that any interval less than a half tick implies + microsecond resolution is obviously fairly rash, the true resolution + could be anything between a microsecond and that half tick. Perhaps + something special would have to be done on a system where this is the + case, since there's no obvious reliable way to detect it + automatically. 
*/ + +#define MICROSECONDS_P(name, type, get, sec, usec) \ + { \ + static int result = -1; \ + type st, et; \ + long dt, half_tick; \ + unsigned attempt, reps, i, j; \ + \ + if (result != -1) \ + return result; \ + \ + result = 0; \ + half_tick = (1000000L / clk_tck ()) / 2; \ + \ + for (attempt = 0; attempt < 5; attempt++) \ + { \ + reps = 0; \ + for (;;) \ + { \ + get (st); \ + for (i = 0; i < reps; i++) \ + for (j = 0; j < 100; j++) \ + noop_1 (CNST_LIMB(0)); \ + get (et); \ + \ + dt = (sec(et)-sec(st))*1000000L + usec(et)-usec(st); \ + \ + if (speed_option_verbose >= 2) \ + printf ("%s attempt=%u, reps=%u, dt=%ld\n", \ + name, attempt, reps, dt); \ + \ + if (dt >= 2) \ + break; \ + \ + reps = (reps == 0 ? 1 : 2*reps); \ + if (reps == 0) \ + break; /* uint overflow, not normal */ \ + } \ + \ + if (dt < half_tick) \ + { \ + result = 1; \ + break; \ + } \ + } \ + \ + if (speed_option_verbose) \ + { \ + if (result) \ + printf ("%s is microsecond accurate\n", name); \ + else \ + printf ("%s is only %s clock tick accurate\n", \ + name, unittime_string (1.0/clk_tck())); \ + } \ + return result; \ + } + + +int +gettimeofday_microseconds_p (void) +{ +#define call_gettimeofday(t) gettimeofday (&(t), NULL) +#define timeval_tv_sec(t) ((t).tv_sec) +#define timeval_tv_usec(t) ((t).tv_usec) + MICROSECONDS_P ("gettimeofday", struct_timeval, + call_gettimeofday, timeval_tv_sec, timeval_tv_usec); +} + +int +getrusage_microseconds_p (void) +{ +#define call_getrusage(t) getrusage (0, &(t)) +#define rusage_tv_sec(t) ((t).ru_utime.tv_sec) +#define rusage_tv_usec(t) ((t).ru_utime.tv_usec) + MICROSECONDS_P ("getrusage", struct_rusage, + call_getrusage, rusage_tv_sec, rusage_tv_usec); +} + +/* Test whether getrusage goes backwards, return non-zero if it does + (suggesting it's flawed). + + On a macintosh m68040-unknown-netbsd1.4.1 getrusage looks like it's + microsecond accurate, but has been seen remaining unchanged after many + microseconds have elapsed. 
It also regularly goes backwards by 1000 to + 5000 usecs, this has been seen after between 500 and 4000 attempts taking + perhaps 0.03 seconds. We consider this too broken for good measuring. + We used to have configure pretend getrusage didn't exist on this system, + but a runtime test should be more reliable, since we imagine the problem + is not confined to just this exact system tuple. */ + +int +getrusage_backwards_p (void) +{ + static int result = -1; + struct rusage start, prev, next; + long d; + int i; + + if (result != -1) + return result; + + getrusage (0, &start); + memcpy (&next, &start, sizeof (next)); + + result = 0; + i = 0; + for (;;) + { + memcpy (&prev, &next, sizeof (prev)); + getrusage (0, &next); + + if (next.ru_utime.tv_sec < prev.ru_utime.tv_sec + || (next.ru_utime.tv_sec == prev.ru_utime.tv_sec + && next.ru_utime.tv_usec < prev.ru_utime.tv_usec)) + { + if (speed_option_verbose) + printf ("getrusage went backwards (attempt %d: %ld.%06ld -> %ld.%06ld)\n", + i, + (long) prev.ru_utime.tv_sec, (long) prev.ru_utime.tv_usec, + (long) next.ru_utime.tv_sec, (long) next.ru_utime.tv_usec); + result = 1; + break; + } + + /* minimum 1000 attempts, then stop after either 0.1 seconds or 50000 + attempts, whichever comes first */ + d = 1000000 * (next.ru_utime.tv_sec - start.ru_utime.tv_sec) + + (next.ru_utime.tv_usec - start.ru_utime.tv_usec); + i++; + if (i > 50000 || (i > 1000 && d > 100000)) + break; + } + + return result; +} + +/* CLOCK_PROCESS_CPUTIME_ID looks like it's going to be in a future version + of glibc (some time post 2.2). + + CLOCK_VIRTUAL is process time, available in BSD systems (though sometimes + defined, but returning -1 for an error). 
*/ + +#ifdef CLOCK_PROCESS_CPUTIME_ID +# define CGT_ID CLOCK_PROCESS_CPUTIME_ID +#else +# ifdef CLOCK_VIRTUAL +# define CGT_ID CLOCK_VIRTUAL +# endif +#endif +#ifdef CGT_ID +const int have_cgt_id = 1; +#else +const int have_cgt_id = 0; +# define CGT_ID (ASSERT_FAIL (CGT_ID not determined), -1) +#endif + +#define CGT_DELAY_COUNT 1000 + +int +cgt_works_p (void) +{ + static int result = -1; + struct_timespec unit; + + if (! have_cgt) + return 0; + + if (! have_cgt_id) + { + if (speed_option_verbose) + printf ("clock_gettime don't know what ID to use\n"); + result = 0; + return result; + } + + if (result != -1) + return result; + + /* trial run to see if it works */ + if (clock_gettime (CGT_ID, &unit) != 0) + { + if (speed_option_verbose) + printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno)); + result = 0; + return result; + } + + /* get the resolution */ + if (clock_getres (CGT_ID, &unit) != 0) + { + if (speed_option_verbose) + printf ("clock_getres id=%d error: %s\n", CGT_ID, strerror (errno)); + result = 0; + return result; + } + + cgt_unittime = unit.tv_sec + unit.tv_nsec * 1e-9; + if (speed_option_verbose) + printf ("clock_gettime is %s accurate\n", unittime_string (cgt_unittime)); + + if (cgt_unittime < 10e-9) + { + /* Do we believe this? 
*/ + struct timespec start, end; + static volatile int counter; + double duration; + if (clock_gettime (CGT_ID, &start)) + { + if (speed_option_verbose) + printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno)); + result = 0; + return result; + } + /* Loop of at least 1000 memory accesses, ought to take at + least 100 ns*/ + for (counter = 0; counter < CGT_DELAY_COUNT; counter++) + ; + if (clock_gettime (CGT_ID, &end)) + { + if (speed_option_verbose) + printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno)); + result = 0; + return result; + } + duration = (end.tv_sec + end.tv_nsec * 1e-9 + - start.tv_sec - start.tv_nsec * 1e-9); + if (speed_option_verbose) + printf ("delay loop of %d rounds took %s (according to clock_gettime)\n", + CGT_DELAY_COUNT, unittime_string (duration)); + if (duration < 100e-9) + { + if (speed_option_verbose) + printf ("clock_gettime id=%d not believable\n", CGT_ID); + result = 0; + return result; + } + } + result = 1; + return result; +} + + +static double +freq_measure_mftb_one (void) +{ +#define call_gettimeofday(t) gettimeofday (&(t), NULL) +#define timeval_tv_sec(t) ((t).tv_sec) +#define timeval_tv_usec(t) ((t).tv_usec) + FREQ_MEASURE_ONE ("mftb", struct_timeval, + call_gettimeofday, MFTB, + timeval_tv_sec, timeval_tv_usec); +} + + +static jmp_buf mftb_works_buf; + +static RETSIGTYPE +mftb_works_handler (int sig) +{ + longjmp (mftb_works_buf, 1); +} + +int +mftb_works_p (void) +{ + unsigned a[2]; + RETSIGTYPE (*old_handler) (int); + double cycletime; + + /* suppress a warning about a[] unused */ + a[0] = 0; + + if (! 
have_mftb) + return 0; + +#ifdef SIGILL + old_handler = signal (SIGILL, mftb_works_handler); + if (old_handler == SIG_ERR) + { + if (speed_option_verbose) + printf ("mftb_works_p(): SIGILL not supported, assuming mftb works\n"); + return 1; + } + if (setjmp (mftb_works_buf)) + { + if (speed_option_verbose) + printf ("mftb_works_p(): SIGILL during mftb, so doesn't work\n"); + return 0; + } + MFTB (a); + signal (SIGILL, old_handler); + if (speed_option_verbose) + printf ("mftb_works_p(): mftb works\n"); +#else + + if (speed_option_verbose) + printf ("mftb_works_p(): SIGILL not defined, assuming mftb works\n"); +#endif + +#if ! HAVE_GETTIMEOFDAY + if (speed_option_verbose) + printf ("mftb_works_p(): no gettimeofday available to measure mftb\n"); + return 0; +#endif + + /* The time base is normally 1/4 of the bus speed on 6xx and 7xx chips, on + other chips it can be driven from an external clock. */ + cycletime = freq_measure ("mftb", freq_measure_mftb_one); + if (cycletime == -1.0) + { + if (speed_option_verbose) + printf ("mftb_works_p(): cannot measure mftb period\n"); + return 0; + } + + mftb_unittime = cycletime; + return 1; +} + + +volatile unsigned *sgi_addr; + +int +sgi_works_p (void) +{ +#if HAVE_SYSSGI && HAVE_MMAP + static int result = -1; + + size_t pagesize, offset; + __psunsigned_t phys, physpage; + void *virtpage; + unsigned period_picoseconds; + int size, fd; + + if (result != -1) + return result; + + phys = syssgi (SGI_QUERY_CYCLECNTR, &period_picoseconds); + if (phys == (__psunsigned_t) -1) + { + /* ENODEV is the error when a counter is not available */ + if (speed_option_verbose) + printf ("syssgi SGI_QUERY_CYCLECNTR error: %s\n", strerror (errno)); + result = 0; + return result; + } + sgi_unittime = period_picoseconds * 1e-12; + + /* IRIX 5 doesn't have SGI_CYCLECNTR_SIZE, assume 32 bits in that case. + Challenge/ONYX hardware has a 64 bit byte counter, but there seems no + obvious way to identify that without SGI_CYCLECNTR_SIZE. 
*/ +#ifdef SGI_CYCLECNTR_SIZE + size = syssgi (SGI_CYCLECNTR_SIZE); + if (size == -1) + { + if (speed_option_verbose) + { + printf ("syssgi SGI_CYCLECNTR_SIZE error: %s\n", strerror (errno)); + printf (" will assume size==4\n"); + } + size = 32; + } +#else + size = 32; +#endif + + if (size < 32) + { + printf ("syssgi SGI_CYCLECNTR_SIZE gives %d, expected 32 or 64\n", size); + result = 0; + return result; + } + + pagesize = getpagesize(); + offset = (size_t) phys & (pagesize-1); + physpage = phys - offset; + + /* shouldn't cross over a page boundary */ + ASSERT_ALWAYS (offset + size/8 <= pagesize); + + fd = open("/dev/mmem", O_RDONLY); + if (fd == -1) + { + if (speed_option_verbose) + printf ("open /dev/mmem: %s\n", strerror (errno)); + result = 0; + return result; + } + + virtpage = mmap (0, pagesize, PROT_READ, MAP_PRIVATE, fd, (off_t) physpage); + if (virtpage == (void *) -1) + { + if (speed_option_verbose) + printf ("mmap /dev/mmem: %s\n", strerror (errno)); + result = 0; + return result; + } + + /* address of least significant 4 bytes, knowing mips is big endian */ + sgi_addr = (unsigned *) ((char *) virtpage + offset + + size/8 - sizeof(unsigned)); + result = 1; + return result; + +#else /* ! (HAVE_SYSSGI && HAVE_MMAP) */ + return 0; +#endif +} + + +#define DEFAULT(var,n) \ + do { \ + if (! (var)) \ + (var) = (n); \ + } while (0) + +void +speed_time_init (void) +{ + double supplement_unittime = 0.0; + + static int speed_time_initialized = 0; + if (speed_time_initialized) + return; + speed_time_initialized = 1; + + speed_cycletime_init (); + + if (!speed_option_cycles_broken && have_cycles && cycles_works_p ()) + { + use_cycles = 1; + DEFAULT (speed_cycletime, 1.0); + speed_unittime = speed_cycletime; + DEFAULT (speed_precision, 10000); + strcpy (speed_time_string, "CPU cycle counter"); + + /* only used if a supplementary method is chosen below */ + cycles_limit = (have_cycles == 1 ? 
M_2POW32 : M_2POW64) / 2.0 + * speed_cycletime; + + if (have_grus && getrusage_microseconds_p() && ! getrusage_backwards_p()) + { + /* this is a good combination */ + use_grus = 1; + supplement_unittime = grus_unittime = 1.0e-6; + strcpy (speed_time_string, "CPU cycle counter, supplemented by microsecond getrusage()"); + } + else if (have_cycles == 1) + { + /* When speed_cyclecounter has a limited range, look for something + to supplement it. */ + if (have_gtod && gettimeofday_microseconds_p()) + { + use_gtod = 1; + supplement_unittime = gtod_unittime = 1.0e-6; + strcpy (speed_time_string, "CPU cycle counter, supplemented by microsecond gettimeofday()"); + } + else if (have_grus) + { + use_grus = 1; + supplement_unittime = grus_unittime = 1.0 / (double) clk_tck (); + sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick getrusage()", unittime_string (supplement_unittime)); + } + else if (have_times) + { + use_times = 1; + supplement_unittime = times_unittime = 1.0 / (double) clk_tck (); + sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick times()", unittime_string (supplement_unittime)); + } + else if (have_gtod) + { + use_gtod = 1; + supplement_unittime = gtod_unittime = 1.0 / (double) clk_tck (); + sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick gettimeofday()", unittime_string (supplement_unittime)); + } + else + { + fprintf (stderr, "WARNING: cycle counter is 32 bits and there's no other functions.\n"); + fprintf (stderr, " Wraparounds may produce bad results on long measurements.\n"); + } + } + + if (use_grus || use_times || use_gtod) + { + /* must know cycle period to compare cycles to other measuring + (via cycles_limit) */ + speed_cycletime_need_seconds (); + + if (speed_precision * supplement_unittime > cycles_limit) + { + fprintf (stderr, "WARNING: requested precision can't always be achieved due to limited range\n"); + fprintf (stderr, " cycle counter and limited precision 
supplemental method\n"); + fprintf (stderr, " (%s)\n", speed_time_string); + } + } + } + else if (have_stck) + { + strcpy (speed_time_string, "STCK timestamp"); + /* stck is in units of 2^-12 microseconds, which is very likely higher + resolution than a cpu cycle */ + if (speed_cycletime == 0.0) + speed_cycletime_fail + ("Need to know CPU frequency for effective stck unit"); + speed_unittime = MAX (speed_cycletime, STCK_PERIOD); + DEFAULT (speed_precision, 10000); + } + else if (have_mftb && mftb_works_p ()) + { + use_mftb = 1; + DEFAULT (speed_precision, 10000); + speed_unittime = mftb_unittime; + sprintf (speed_time_string, "mftb counter (%s)", + unittime_string (speed_unittime)); + } + else if (have_sgi && sgi_works_p ()) + { + use_sgi = 1; + DEFAULT (speed_precision, 10000); + speed_unittime = sgi_unittime; + sprintf (speed_time_string, "syssgi() mmap counter (%s), supplemented by millisecond getrusage()", + unittime_string (speed_unittime)); + /* supplemented with getrusage, which we assume to have 1ms resolution */ + use_grus = 1; + supplement_unittime = 1e-3; + } + else if (have_rrt) + { + timebasestruct_t t; + use_rrt = 1; + DEFAULT (speed_precision, 10000); + read_real_time (&t, sizeof(t)); + switch (t.flag) { + case RTC_POWER: + /* FIXME: What's the actual RTC resolution? 
*/ + speed_unittime = 1e-7; + strcpy (speed_time_string, "read_real_time() power nanoseconds"); + break; + case RTC_POWER_PC: + t.tb_high = 1; + t.tb_low = 0; + time_base_to_time (&t, sizeof(t)); + speed_unittime = TIMEBASESTRUCT_SECS(&t) / M_2POW32; + sprintf (speed_time_string, "%s read_real_time() powerpc ticks", + unittime_string (speed_unittime)); + break; + default: + fprintf (stderr, "ERROR: Unrecognised timebasestruct_t flag=%d\n", + t.flag); + abort (); + } + } + else if (have_cgt && cgt_works_p() && cgt_unittime < 1.5e-6) + { + /* use clock_gettime if microsecond or better resolution */ + choose_cgt: + use_cgt = 1; + speed_unittime = cgt_unittime; + DEFAULT (speed_precision, (cgt_unittime <= 0.1e-6 ? 10000 : 1000)); + strcpy (speed_time_string, "microsecond accurate clock_gettime()"); + } + else if (have_times && clk_tck() > 1000000) + { + /* Cray vector systems have times() which is clock cycle resolution + (eg. 450 MHz). */ + DEFAULT (speed_precision, 10000); + goto choose_times; + } + else if (have_grus && getrusage_microseconds_p() && ! 
getrusage_backwards_p()) + { + use_grus = 1; + speed_unittime = grus_unittime = 1.0e-6; + DEFAULT (speed_precision, 1000); + strcpy (speed_time_string, "microsecond accurate getrusage()"); + } + else if (have_gtod && gettimeofday_microseconds_p()) + { + use_gtod = 1; + speed_unittime = gtod_unittime = 1.0e-6; + DEFAULT (speed_precision, 1000); + strcpy (speed_time_string, "microsecond accurate gettimeofday()"); + } + else if (have_cgt && cgt_works_p() && cgt_unittime < 1.5/clk_tck()) + { + /* use clock_gettime if 1 tick or better resolution */ + goto choose_cgt; + } + else if (have_times) + { + use_tick_boundary = 1; + DEFAULT (speed_precision, 200); + choose_times: + use_times = 1; + speed_unittime = times_unittime = 1.0 / (double) clk_tck (); + sprintf (speed_time_string, "%s clock tick times()", + unittime_string (speed_unittime)); + } + else if (have_grus) + { + use_grus = 1; + use_tick_boundary = 1; + speed_unittime = grus_unittime = 1.0 / (double) clk_tck (); + DEFAULT (speed_precision, 200); + sprintf (speed_time_string, "%s clock tick getrusage()\n", + unittime_string (speed_unittime)); + } + else if (have_gtod) + { + use_gtod = 1; + use_tick_boundary = 1; + speed_unittime = gtod_unittime = 1.0 / (double) clk_tck (); + DEFAULT (speed_precision, 200); + sprintf (speed_time_string, "%s clock tick gettimeofday()", + unittime_string (speed_unittime)); + } + else + { + fprintf (stderr, "No time measuring method available\n"); + fprintf (stderr, "None of: speed_cyclecounter(), STCK(), getrusage(), gettimeofday(), times()\n"); + abort (); + } + + if (speed_option_verbose) + { + printf ("speed_time_init: %s\n", speed_time_string); + printf (" speed_precision %d\n", speed_precision); + printf (" speed_unittime %.2g\n", speed_unittime); + if (supplement_unittime) + printf (" supplement_unittime %.2g\n", supplement_unittime); + printf (" use_tick_boundary %d\n", use_tick_boundary); + if (have_cycles) + printf (" cycles_limit %.2g seconds\n", cycles_limit); + } +} + + 
+ +/* Burn up CPU until a clock tick boundary, for greater accuracy. Set the + corresponding "start_foo" appropriately too. */ + +void +grus_tick_boundary (void) +{ + struct_rusage prev; + getrusage (0, &prev); + do { + getrusage (0, &start_grus); + } while (start_grus.ru_utime.tv_usec == prev.ru_utime.tv_usec); +} + +void +gtod_tick_boundary (void) +{ + struct_timeval prev; + gettimeofday (&prev, NULL); + do { + gettimeofday (&start_gtod, NULL); + } while (start_gtod.tv_usec == prev.tv_usec); +} + +void +times_tick_boundary (void) +{ + struct_tms prev; + times (&prev); + do + times (&start_times); + while (start_times.tms_utime == prev.tms_utime); +} + + +/* "have_" values are tested to let unused code go dead. */ + +void +speed_starttime (void) +{ + speed_time_init (); + + if (have_grus && use_grus) + { + if (use_tick_boundary) + grus_tick_boundary (); + else + getrusage (0, &start_grus); + } + + if (have_gtod && use_gtod) + { + if (use_tick_boundary) + gtod_tick_boundary (); + else + gettimeofday (&start_gtod, NULL); + } + + if (have_times && use_times) + { + if (use_tick_boundary) + times_tick_boundary (); + else + times (&start_times); + } + + if (have_cgt && use_cgt) + clock_gettime (CGT_ID, &start_cgt); + + if (have_rrt && use_rrt) + read_real_time (&start_rrt, sizeof(start_rrt)); + + if (have_sgi && use_sgi) + start_sgi = *sgi_addr; + + if (have_mftb && use_mftb) + MFTB (start_mftb); + + if (have_stck && use_stck) + STCK (start_stck); + + /* Cycles sampled last for maximum accuracy. */ + if (have_cycles && use_cycles) + speed_cyclecounter (start_cycles); +} + + +/* Calculate the difference between two cycle counter samples, as a "double" + counter of cycles. + + The start and end values are allowed to cancel in integers in case the + counter values are bigger than the 53 bits that normally fit in a double. + + This works even if speed_cyclecounter() puts a value bigger than 32-bits + in the low word (the high word always gets a 2**32 multiplier though). 
*/ + +double +speed_cyclecounter_diff (const unsigned end[2], const unsigned start[2]) +{ + unsigned d; + double t; + + if (have_cycles == 1) + { + t = (end[0] - start[0]); + } + else + { + d = end[0] - start[0]; + t = d - (d > end[0] ? M_2POWU : 0.0); + t += (end[1] - start[1]) * M_2POW32; + } + return t; +} + + +double +speed_mftb_diff (const unsigned end[2], const unsigned start[2]) +{ + unsigned d; + double t; + + d = end[0] - start[0]; + t = (double) d - (d > end[0] ? M_2POW32 : 0.0); + t += (end[1] - start[1]) * M_2POW32; + return t; +} + + +/* Calculate the difference between "start" and "end" using fields "sec" and + "psec", where each "psec" is a "punit" of a second. + + The seconds parts are allowed to cancel before being combined with the + psec parts, in case a simple "sec+psec*punit" exceeds the precision of a + double. + + Total time is only calculated in a "double" since an integer count of + psecs might overflow. 2^32 microseconds is only a bit over an hour, or + 2^32 nanoseconds only about 4 seconds. + + The casts to "long" are for the benefit of timebasestruct_t, where the + fields are only "unsigned int", but we want a signed difference. */ + +#define DIFF_SECS_ROUTINE(sec, psec, punit) \ + { \ + long sec_diff, psec_diff; \ + sec_diff = (long) end->sec - (long) start->sec; \ + psec_diff = (long) end->psec - (long) start->psec; \ + return (double) sec_diff + punit * (double) psec_diff; \ + } + +double +timeval_diff_secs (const struct_timeval *end, const struct_timeval *start) +{ + DIFF_SECS_ROUTINE (tv_sec, tv_usec, 1e-6); +} + +double +rusage_diff_secs (const struct_rusage *end, const struct_rusage *start) +{ + DIFF_SECS_ROUTINE (ru_utime.tv_sec, ru_utime.tv_usec, 1e-6); +} + +double +timespec_diff_secs (const struct_timespec *end, const struct_timespec *start) +{ + DIFF_SECS_ROUTINE (tv_sec, tv_nsec, 1e-9); +} + +/* This is for use after time_base_to_time, ie. for seconds and nanoseconds. 
*/ +double +timebasestruct_diff_secs (const timebasestruct_t *end, + const timebasestruct_t *start) +{ + DIFF_SECS_ROUTINE (tb_high, tb_low, 1e-9); +} + + +double +speed_endtime (void) +{ +#define END_USE(name,value) \ + do { \ + if (speed_option_verbose >= 3) \ + printf ("speed_endtime(): used %s\n", name); \ + result = value; \ + goto done; \ + } while (0) + +#define END_ENOUGH(name,value) \ + do { \ + if (speed_option_verbose >= 3) \ + printf ("speed_endtime(): %s gives enough precision\n", name); \ + result = value; \ + goto done; \ + } while (0) + +#define END_EXCEED(name,value) \ + do { \ + if (speed_option_verbose >= 3) \ + printf ("speed_endtime(): cycle counter limit exceeded, used %s\n", \ + name); \ + result = value; \ + goto done; \ + } while (0) + + unsigned end_cycles[2]; + stck_t end_stck; + unsigned end_mftb[2]; + unsigned end_sgi; + timebasestruct_t end_rrt; + struct_timespec end_cgt; + struct_timeval end_gtod; + struct_rusage end_grus; + struct_tms end_times; + double t_gtod, t_grus, t_times, t_cgt; + double t_rrt, t_sgi, t_mftb, t_stck, t_cycles; + double result; + + /* Cycles sampled first for maximum accuracy. + "have_" values tested to let unused code go dead. 
*/ + + if (have_cycles && use_cycles) speed_cyclecounter (end_cycles); + if (have_stck && use_stck) STCK (end_stck); + if (have_mftb && use_mftb) MFTB (end_mftb); + if (have_sgi && use_sgi) end_sgi = *sgi_addr; + if (have_rrt && use_rrt) read_real_time (&end_rrt, sizeof(end_rrt)); + if (have_cgt && use_cgt) clock_gettime (CGT_ID, &end_cgt); + if (have_gtod && use_gtod) gettimeofday (&end_gtod, NULL); + if (have_grus && use_grus) getrusage (0, &end_grus); + if (have_times && use_times) times (&end_times); + + result = -1.0; + + if (speed_option_verbose >= 4) + { + printf ("speed_endtime():\n"); + if (use_cycles) + printf (" cycles 0x%X,0x%X -> 0x%X,0x%X\n", + start_cycles[1], start_cycles[0], + end_cycles[1], end_cycles[0]); + + if (use_stck) + printf (" stck 0x%lX -> 0x%lX\n", start_stck, end_stck); + + if (use_mftb) + printf (" mftb 0x%X,%08X -> 0x%X,%08X\n", + start_mftb[1], start_mftb[0], + end_mftb[1], end_mftb[0]); + + if (use_sgi) + printf (" sgi 0x%X -> 0x%X\n", start_sgi, end_sgi); + + if (use_rrt) + printf (" read_real_time (%d)%u,%u -> (%d)%u,%u\n", + start_rrt.flag, start_rrt.tb_high, start_rrt.tb_low, + end_rrt.flag, end_rrt.tb_high, end_rrt.tb_low); + + if (use_cgt) + printf (" clock_gettime %ld.%09ld -> %ld.%09ld\n", + (long) start_cgt.tv_sec, (long) start_cgt.tv_nsec, + (long) end_cgt.tv_sec, (long) end_cgt.tv_nsec); + + if (use_gtod) + printf (" gettimeofday %ld.%06ld -> %ld.%06ld\n", + (long) start_gtod.tv_sec, (long) start_gtod.tv_usec, + (long) end_gtod.tv_sec, (long) end_gtod.tv_usec); + + if (use_grus) + printf (" getrusage %ld.%06ld -> %ld.%06ld\n", + (long) start_grus.ru_utime.tv_sec, + (long) start_grus.ru_utime.tv_usec, + (long) end_grus.ru_utime.tv_sec, + (long) end_grus.ru_utime.tv_usec); + + if (use_times) + printf (" times %ld -> %ld\n", + start_times.tms_utime, end_times.tms_utime); + } + + if (use_rrt) + { + time_base_to_time (&start_rrt, sizeof(start_rrt)); + time_base_to_time (&end_rrt, sizeof(end_rrt)); + t_rrt = 
timebasestruct_diff_secs (&end_rrt, &start_rrt); + END_USE ("read_real_time()", t_rrt); + } + + if (use_cgt) + { + t_cgt = timespec_diff_secs (&end_cgt, &start_cgt); + END_USE ("clock_gettime()", t_cgt); + } + + if (use_grus) + { + t_grus = rusage_diff_secs (&end_grus, &start_grus); + + /* Use getrusage() if the cycle counter limit would be exceeded, or if + it provides enough accuracy already. */ + if (use_cycles) + { + if (t_grus >= speed_precision*grus_unittime) + END_ENOUGH ("getrusage()", t_grus); + if (t_grus >= cycles_limit) + END_EXCEED ("getrusage()", t_grus); + } + } + + if (use_times) + { + t_times = (end_times.tms_utime - start_times.tms_utime) * times_unittime; + + /* Use times() if the cycle counter limit would be exceeded, or if + it provides enough accuracy already. */ + if (use_cycles) + { + if (t_times >= speed_precision*times_unittime) + END_ENOUGH ("times()", t_times); + if (t_times >= cycles_limit) + END_EXCEED ("times()", t_times); + } + } + + if (use_gtod) + { + t_gtod = timeval_diff_secs (&end_gtod, &start_gtod); + + /* Use gettimeofday() if it measured a value bigger than the cycle + counter can handle. 
*/ + if (use_cycles) + { + if (t_gtod >= cycles_limit) + END_EXCEED ("gettimeofday()", t_gtod); + } + } + + if (use_mftb) + { + t_mftb = speed_mftb_diff (end_mftb, start_mftb) * mftb_unittime; + END_USE ("mftb", t_mftb); + } + + if (use_stck) + { + t_stck = (end_stck - start_stck) * STCK_PERIOD; + END_USE ("stck", t_stck); + } + + if (use_sgi) + { + t_sgi = (end_sgi - start_sgi) * sgi_unittime; + END_USE ("SGI hardware counter", t_sgi); + } + + if (use_cycles) + { + t_cycles = speed_cyclecounter_diff (end_cycles, start_cycles) + * speed_cycletime; + END_USE ("cycle counter", t_cycles); + } + + if (use_grus && getrusage_microseconds_p()) + END_USE ("getrusage()", t_grus); + + if (use_gtod && gettimeofday_microseconds_p()) + END_USE ("gettimeofday()", t_gtod); + + if (use_times) END_USE ("times()", t_times); + if (use_grus) END_USE ("getrusage()", t_grus); + if (use_gtod) END_USE ("gettimeofday()", t_gtod); + + fprintf (stderr, "speed_endtime(): oops, no time method available\n"); + abort (); + + done: + if (result < 0.0) + { + if (speed_option_verbose >= 2) + fprintf (stderr, "speed_endtime(): warning, treating negative time as zero: %.9f\n", result); + result = 0.0; + } + return result; +} diff --git a/gmp-6.3.0/tune/tune-gcd-p.c b/gmp-6.3.0/tune/tune-gcd-p.c new file mode 100644 index 0000000..3b5a4a8 --- /dev/null +++ b/gmp-6.3.0/tune/tune-gcd-p.c @@ -0,0 +1,225 @@ +/* tune-gcd-p + + Tune the choice for splitting p in divide-and-conquer gcd. + +Copyright 2008, 2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define TUNE_GCD_P 1 + +#include "../mpn/gcd.c" + +#include +#include +#include +#include + +#include "speed.h" + +/* Search for minimum over a range. FIXME: Implement golden-section / + fibonacci search*/ +static int +search (double *minp, double (*f)(void *, int), void *ctx, int start, int end) +{ + int x[4]; + double y[4]; + + int best_i; + + x[0] = start; + x[3] = end; + + y[0] = f(ctx, x[0]); + y[3] = f(ctx, x[3]); + + for (;;) + { + int i; + int length = x[3] - x[0]; + + x[1] = x[0] + length/3; + x[2] = x[0] + 2*length/3; + + y[1] = f(ctx, x[1]); + y[2] = f(ctx, x[2]); + +#if 0 + printf("%d: %f, %d: %f, %d:, %f %d: %f\n", + x[0], y[0], x[1], y[1], x[2], y[2], x[3], y[3]); +#endif + for (best_i = 0, i = 1; i < 4; i++) + if (y[i] < y[best_i]) + best_i = i; + + if (length <= 4) + break; + + if (best_i >= 2) + { + x[0] = x[1]; + y[0] = y[1]; + } + else + { + x[3] = x[2]; + y[3] = y[2]; + } + } + *minp = y[best_i]; + return x[best_i]; +} + +static int +compare_double(const void *ap, const void *bp) +{ + double a = * (const double *) ap; + double b = * (const double *) bp; + + if (a < b) + return -1; + else if (a > b) + return 1; + else + return 0; +} + +static double +median (double *v, size_t n) +{ + qsort(v, n, sizeof(*v), compare_double); + + return v[n/2]; +} + +#define TIME(res, code) do { \ + double time_measurement[5]; \ + unsigned 
time_i; \ + \ + for (time_i = 0; time_i < 5; time_i++) \ + { \ + speed_starttime(); \ + code; \ + time_measurement[time_i] = speed_endtime(); \ + } \ + res = median(time_measurement, 5); \ +} while (0) + +struct bench_data +{ + mp_size_t n; + mp_ptr ap; + mp_ptr bp; + mp_ptr up; + mp_ptr vp; + mp_ptr gp; +}; + +static double +bench_gcd (void *ctx, int p) +{ + struct bench_data *data = (struct bench_data *) ctx; + double t; + + p_table[data->n] = p; + TIME(t, { + MPN_COPY (data->up, data->ap, data->n); + MPN_COPY (data->vp, data->bp, data->n); + mpn_gcd (data->gp, data->up, data->n, data->vp, data->n); + }); + + return t; +} + +int +main(int argc, char **argv) +{ + gmp_randstate_t rands; struct bench_data data; + mp_size_t n; + + TMP_DECL; + + /* Unbuffered so if output is redirected to a file it isn't lost if the + program is killed part way through. */ + setbuf (stdout, NULL); + setbuf (stderr, NULL); + + gmp_randinit_default (rands); + + TMP_MARK; + + data.ap = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + data.bp = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + data.up = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + data.vp = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + data.gp = TMP_ALLOC_LIMBS (P_TABLE_SIZE); + + mpn_random (data.ap, P_TABLE_SIZE); + mpn_random (data.bp, P_TABLE_SIZE); + + memset (p_table, 0, sizeof(p_table)); + + for (n = 100; n < P_TABLE_SIZE; n++) + { + mp_size_t p; + mp_size_t best_p; + double best_time; + double lehmer_time; + + if (data.ap[n-1] == 0) + data.ap[n-1] = 1; + + if (data.bp[n-1] == 0) + data.bp[n-1] = 1; + + data.n = n; + + lehmer_time = bench_gcd (&data, 0); + + best_p = search (&best_time, bench_gcd, &data, n/5, 4*n/5); + if (best_time > lehmer_time) + best_p = 0; + + printf("%6zu %6zu %5.3g", n, best_p, (double) best_p / n); + if (best_p > 0) + { + double speedup = 100 * (lehmer_time - best_time) / lehmer_time; + printf(" %5.3g%%", speedup); + if (speedup < 1.0) + { + printf(" (ignored)"); + best_p = 0; + } + } + printf("\n"); + + p_table[n] = best_p; + } + TMP_FREE; + 
gmp_randclear(rands); + return 0; +} diff --git a/gmp-6.3.0/tune/tuneup.c b/gmp-6.3.0/tune/tuneup.c new file mode 100644 index 0000000..8ae211e --- /dev/null +++ b/gmp-6.3.0/tune/tuneup.c @@ -0,0 +1,3072 @@ +/* Create tuned thresholds for various algorithms. + +Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* Usage: tuneup [-t] [-t] [-p precision] + + -t turns on some diagnostic traces, a second -t turns on more traces. + + Notes: + + The code here isn't a vision of loveliness, mainly because it's subject + to ongoing changes according to new things wanting to be tuned, and + practical requirements of systems tested. + + Sometimes running the program twice produces slightly different results. + This is probably because there's so little separating algorithms near + their crossover, and on that basis it should make little or no difference + to the final speed of the relevant routines, but nothing has been done to + check that carefully. + + Algorithm: + + The thresholds are determined as follows. 
A crossover may not be a + single size but rather a range where it oscillates between method A or + method B faster. If the threshold is set making B used where A is faster + (or vice versa) that's bad. Badness is the percentage time lost and + total badness is the sum of this over all sizes measured. The threshold + is set to minimize total badness. + + Suppose, as sizes increase, method B becomes faster than method A. The + effect of the rule is that, as you look at increasing sizes, isolated + points where B is faster are ignored, but when it's consistently faster, + or faster on balance, then the threshold is set there. The same result + is obtained thinking in the other direction of A becoming faster at + smaller sizes. + + In practice the thresholds tend to be chosen to bring on the next + algorithm fairly quickly. + + This rule is attractive because it's got a basis in reason and is fairly + easy to implement, but no work has been done to actually compare it in + absolute terms to other possibilities. + + Implementation: + + In a normal library build the thresholds are constants. To tune them + selected objects are recompiled with the thresholds as global variables + instead. #define TUNE_PROGRAM_BUILD does this, with help from code at + the end of gmp-impl.h, and rules in tune/Makefile.am. + + MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The + threshold is set to "size+1" to avoid karatsuba, or to "size" to use one + level, but recurse into the basecase. + + MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value. + Other routines in turn will make use of both of those. Naturally the + dependants must be tuned first. + + In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling, + just a threshold based on comparing two routines (mpn_divrem_1 and + mpn_divexact_1), and no further use of the value determined. 
+ + Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being + just comparisons between certain routines on representative data. + + Shortcuts are applied when native (assembler) versions of routines exist. + For instance a native mpn_sqr_basecase is assumed to be always faster + than mpn_mul_basecase, with no measuring. + + No attempt is made to tune within assembler routines, for instance + DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be + written and tuned all by hand. Assembler routines that might have hard + limits are recompiled though, to make them accept a bigger range of sizes + than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr. + + Limitations: + + The FFTs aren't subject to the same badness rule as the other thresholds, + so each k is probably being brought on a touch early. This isn't likely + to make a difference, and the simpler probing means fewer tests. + +*/ + +#define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */ + +#include "config.h" + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#if HAVE_UNISTD_H +#include <unistd.h> +#endif + +#include "gmp-impl.h" +#include "longlong.h" + +#include "tests.h" +#include "speed.h" + +#if !HAVE_DECL_OPTARG +extern char *optarg; +extern int optind, opterr; +#endif + + +#define DEFAULT_MAX_SIZE 1000 /* limbs */ + +#if WANT_FFT +mp_size_t option_fft_max_size = 50000; /* limbs */ +#else +mp_size_t option_fft_max_size = 0; +#endif +int option_trace = 0; +int option_fft_trace = 0; +struct speed_params s; + +struct dat_t { + mp_size_t size; + double d; +} *dat = NULL; +int ndat = 0; +int allocdat = 0; + +/* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that + case use zero here, which for params.max_size means no limit. 
*/ +#ifndef TUNE_SQR_TOOM2_MAX +#define TUNE_SQR_TOOM2_MAX 0 +#endif + +mp_size_t mul_toom22_threshold = MP_SIZE_T_MAX; +mp_size_t mul_toom33_threshold = MUL_TOOM33_THRESHOLD_LIMIT; +mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT; +mp_size_t mul_toom6h_threshold = MUL_TOOM6H_THRESHOLD_LIMIT; +mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT; +mp_size_t mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX; +mp_size_t mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX; +mp_size_t mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX; +mp_size_t mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX; +mp_size_t mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX; +mp_size_t mul_fft_threshold = MP_SIZE_T_MAX; +mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX; +mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX; +mp_size_t sqr_toom2_threshold + = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX); +mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; +mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; +mp_size_t sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT; +mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT; +mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX; +mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX; +mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX; +mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX; +mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX; +mp_size_t sqrlo_basecase_threshold = MP_SIZE_T_MAX; +mp_size_t sqrlo_dc_threshold = MP_SIZE_T_MAX; +mp_size_t sqrlo_sqr_threshold = MP_SIZE_T_MAX; +mp_size_t mulmid_toom42_threshold = MP_SIZE_T_MAX; +mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX; +mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX; +mp_size_t div_qr_2_pi2_threshold = MP_SIZE_T_MAX; +mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX; +mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX; +mp_size_t mu_div_qr_threshold = MP_SIZE_T_MAX; +mp_size_t mu_divappr_q_threshold = MP_SIZE_T_MAX; +mp_size_t mupi_div_qr_threshold = MP_SIZE_T_MAX; 
+mp_size_t mu_div_q_threshold = MP_SIZE_T_MAX; +mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX; +mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX; +mp_size_t mu_bdiv_qr_threshold = MP_SIZE_T_MAX; +mp_size_t mu_bdiv_q_threshold = MP_SIZE_T_MAX; +mp_size_t inv_mulmod_bnm1_threshold = MP_SIZE_T_MAX; +mp_size_t inv_newton_threshold = MP_SIZE_T_MAX; +mp_size_t inv_appr_threshold = MP_SIZE_T_MAX; +mp_size_t binv_newton_threshold = MP_SIZE_T_MAX; +mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX; +mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX; +mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX; +mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX; +mp_size_t hgcd_threshold = MP_SIZE_T_MAX; +mp_size_t hgcd_appr_threshold = MP_SIZE_T_MAX; +mp_size_t hgcd_reduce_threshold = MP_SIZE_T_MAX; +mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX; +mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX; +int div_qr_1n_pi1_method = 0; +mp_size_t div_qr_1_norm_threshold = MP_SIZE_T_MAX; +mp_size_t div_qr_1_unnorm_threshold = MP_SIZE_T_MAX; +mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX; +mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX; +mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX; +mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX; +int mod_1_1p_method = 0; +mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX; +mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX; +mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX; +mp_size_t mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX; +mp_size_t preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX; +mp_size_t divrem_2_threshold = MP_SIZE_T_MAX; +mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX; +mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX; +mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX; +mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX; +mp_size_t fac_odd_threshold = 0; +mp_size_t fac_dsc_threshold = FAC_DSC_THRESHOLD_LIMIT; + +mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX; +mp_size_t fft_modf_mul_threshold = 
MP_SIZE_T_MAX; + +struct param_t { + const char *name; + speed_function_t function; + speed_function_t function2; + double step_factor; /* how much to step relatively */ + int step; /* how much to step absolutely */ + double function_fudge; /* multiplier for "function" speeds */ + int stop_since_change; + double stop_factor; + mp_size_t min_size; + int min_is_always; + mp_size_t max_size; + mp_size_t check_size; + mp_size_t size_extra; + +#define DATA_HIGH_LT_R 1 +#define DATA_HIGH_GE_R 2 + int data_high; + + int noprint; +}; + + +/* These are normally undefined when false, which suits "#if" fine. + But give them zero values so they can be used in plain C "if"s. */ +#ifndef UDIV_PREINV_ALWAYS +#define UDIV_PREINV_ALWAYS 0 +#endif +#ifndef HAVE_NATIVE_mpn_divexact_1 +#define HAVE_NATIVE_mpn_divexact_1 0 +#endif +#ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1 +#define HAVE_NATIVE_mpn_div_qr_1n_pi1 0 +#endif +#ifndef HAVE_NATIVE_mpn_divrem_1 +#define HAVE_NATIVE_mpn_divrem_1 0 +#endif +#ifndef HAVE_NATIVE_mpn_divrem_2 +#define HAVE_NATIVE_mpn_divrem_2 0 +#endif +#ifndef HAVE_NATIVE_mpn_mod_1 +#define HAVE_NATIVE_mpn_mod_1 0 +#endif +#ifndef HAVE_NATIVE_mpn_mod_1_1p +#define HAVE_NATIVE_mpn_mod_1_1p 0 +#endif +#ifndef HAVE_NATIVE_mpn_modexact_1_odd +#define HAVE_NATIVE_mpn_modexact_1_odd 0 +#endif +#ifndef HAVE_NATIVE_mpn_preinv_divrem_1 +#define HAVE_NATIVE_mpn_preinv_divrem_1 0 +#endif +#ifndef HAVE_NATIVE_mpn_preinv_mod_1 +#define HAVE_NATIVE_mpn_preinv_mod_1 0 +#endif +#ifndef HAVE_NATIVE_mpn_sqr_basecase +#define HAVE_NATIVE_mpn_sqr_basecase 0 +#endif + + +#define MAX3(a,b,c) MAX (MAX (a, b), c) + +mp_limb_t +randlimb_norm (void) +{ + mp_limb_t n; + mpn_random (&n, 1); + n |= GMP_NUMB_HIGHBIT; + return n; +} + +#define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1) + +mp_limb_t +randlimb_half (void) +{ + mp_limb_t n; + mpn_random (&n, 1); + n &= GMP_NUMB_HALFMASK; + n += (n==0); + return n; +} + + +/* Add an entry to the end of the dat[] array, reallocing to 
make it bigger + if necessary. */ +void +add_dat (mp_size_t size, double d) +{ +#define ALLOCDAT_STEP 500 + + ASSERT_ALWAYS (ndat <= allocdat); + + if (ndat == allocdat) + { + dat = (struct dat_t *) __gmp_allocate_or_reallocate + (dat, allocdat * sizeof(dat[0]), + (allocdat+ALLOCDAT_STEP) * sizeof(dat[0])); + allocdat += ALLOCDAT_STEP; + } + + dat[ndat].size = size; + dat[ndat].d = d; + ndat++; +} + + +/* Return the index into dat[] of the threshold implied by the data accumulated (callers read dat[idx].size). A non-zero FINAL enables the level-2 trace output. */ +mp_size_t +analyze_dat (int final) +{ + double x, min_x; + int j, min_j; + + /* If the threshold is set at dat[0].size, any positive values are bad. */ + x = 0.0; + for (j = 0; j < ndat; j++) + if (dat[j].d > 0.0) + x += dat[j].d; + + if (option_trace >= 2 && final) + { + printf ("\n"); + printf ("x is the sum of the badness from setting thresh at given size\n"); + printf (" (minimum x is sought)\n"); + printf ("size=%ld first x=%.4f\n", (long) dat[0].size, x); /* j == ndat after the loop above, so name entry 0 explicitly */ + } + + min_x = x; + min_j = 0; + + + /* When stepping to the next dat[j].size, positive values are no longer + bad (so subtracted), negative values become bad (so add the absolute + value, meaning subtract). 
*/ + for (j = 0; j < ndat; x -= dat[j].d, j++) + { + if (option_trace >= 2 && final) + printf ("size=%ld x=%.4f\n", (long) dat[j].size, x); + + if (x < min_x) + { + min_x = x; + min_j = j; + } + } + + return min_j; +} + + +/* Measuring for recompiled mpn/generic/div_qr_1.c, + * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */ + +mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t); + +#if defined (__cplusplus) +extern "C" { +#endif + +mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); +mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t); +void mpz_fac_ui_tune (mpz_ptr, unsigned long); + +#if defined (__cplusplus) +} +#endif + +double +speed_mpn_mod_1_tune (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune); +} +double +speed_mpn_divrem_1_tune (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune); +} +double +speed_mpz_fac_ui_tune (struct speed_params *s) +{ + SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune); +} +double +speed_mpn_div_qr_1_tune (struct speed_params *s) +{ + SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune); +} + +double +tuneup_measure (speed_function_t fun, + const struct param_t *param, + struct speed_params *s) +{ + static struct param_t dummy; + double t; + TMP_DECL; + + if (! 
param) + param = &dummy; + + s->size += param->size_extra; + + TMP_MARK; + SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0); + SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0); + + mpn_random (s->xp, s->size); + mpn_random (s->yp, s->size); + + switch (param->data_high) { + case DATA_HIGH_LT_R: + s->xp[s->size-1] %= s->r; + s->yp[s->size-1] %= s->r; + break; + case DATA_HIGH_GE_R: + s->xp[s->size-1] |= s->r; + s->yp[s->size-1] |= s->r; + break; + } + + t = speed_measure (fun, s); + + s->size -= param->size_extra; + + TMP_FREE; + return t; +} + + +#define PRINT_WIDTH 31 + +void +print_define_start (const char *name) +{ + printf ("#define %-*s ", PRINT_WIDTH, name); + if (option_trace) + printf ("...\n"); +} + +void +print_define_end_remark (const char *name, mp_size_t value, const char *remark) +{ + if (option_trace) + printf ("#define %-*s ", PRINT_WIDTH, name); + + if (value == MP_SIZE_T_MAX) + printf ("MP_SIZE_T_MAX"); + else + printf ("%5ld", (long) value); + + if (remark != NULL) + printf (" /* %s */", remark); + printf ("\n"); + fflush (stdout); +} + +void +print_define_end (const char *name, mp_size_t value) +{ + const char *remark; + if (value == MP_SIZE_T_MAX) + remark = "never"; + else if (value == 0) + remark = "always"; + else + remark = NULL; + print_define_end_remark (name, value, remark); +} + +void +print_define (const char *name, mp_size_t value) +{ + print_define_start (name); + print_define_end (name, value); +} + +void +print_define_remark (const char *name, mp_size_t value, const char *remark) +{ + print_define_start (name); + print_define_end_remark (name, value, remark); +} + +void +print_define_with_speedup (const char *name, mp_size_t value, + mp_size_t runner_up, double speedup) +{ + char buf[100]; + snprintf (buf, sizeof(buf), "%.2f%% faster than %ld", + 100.0 * (speedup - 1), (long) runner_up); /* cast to match %ld, as for every other mp_size_t printed in this file */ + print_define_remark (name, value, buf); +} + +void +one (mp_size_t *threshold, struct param_t *param) +{ + int since_positive, since_thresh_change; + int thresh_idx, 
new_thresh_idx; + +#define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0) + + DEFAULT (param->function_fudge, 1.0); + DEFAULT (param->function2, param->function); + DEFAULT (param->step_factor, 0.01); /* small steps by default */ + DEFAULT (param->step, 1); /* small steps by default */ + DEFAULT (param->stop_since_change, 80); + DEFAULT (param->stop_factor, 1.2); + DEFAULT (param->min_size, 10); + DEFAULT (param->max_size, DEFAULT_MAX_SIZE); + + if (param->check_size != 0) + { + double t1, t2; + s.size = param->check_size; + + *threshold = s.size+1; + t1 = tuneup_measure (param->function, param, &s); + + *threshold = s.size; + t2 = tuneup_measure (param->function2, param, &s); + if (t1 == -1.0 || t2 == -1.0) + { + printf ("Oops, can't run both functions at size %ld\n", + (long) s.size); + abort (); + } + t1 *= param->function_fudge; + + /* ask that t2 is at least 4% below t1 */ + if (t1 < t2*1.04) + { + if (option_trace) + printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2); + *threshold = MP_SIZE_T_MAX; + if (! param->noprint) + print_define (param->name, *threshold); + return; + } + + if (option_trace >= 2) + printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n", + (long) s.size, t1, t2); + } + + if (! param->noprint || option_trace) + print_define_start (param->name); + + ndat = 0; + since_positive = 0; + since_thresh_change = 0; + thresh_idx = 0; + + if (option_trace >= 2) + { + printf (" algorithm-A algorithm-B ratio possible\n"); + printf (" (seconds) (seconds) diff thresh\n"); + } + + for (s.size = param->min_size; + s.size < param->max_size; + s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step)) + { + double ti, tiplus1, d; + + /* + FIXME: check minimum size requirements are met, possibly by just + checking for the -1 returns from the speed functions. 
+ */ + + /* using method A at this size */ + *threshold = s.size+1; + ti = tuneup_measure (param->function, param, &s); + if (ti == -1.0) + abort (); + ti *= param->function_fudge; + + /* using method B at this size */ + *threshold = s.size; + tiplus1 = tuneup_measure (param->function2, param, &s); + if (tiplus1 == -1.0) + abort (); + + /* Calculate the fraction by which the one or the other routine is + slower. */ + if (tiplus1 >= ti) + d = (tiplus1 - ti) / tiplus1; /* negative */ + else + d = (tiplus1 - ti) / ti; /* positive */ + + add_dat (s.size, d); + + new_thresh_idx = analyze_dat (0); + + if (option_trace >= 2) + printf ("size=%ld %.9f %.9f % .4f %c %ld\n", + (long) s.size, ti, tiplus1, d, + ti > tiplus1 ? '#' : ' ', + (long) dat[new_thresh_idx].size); + + /* Stop if the last time method i was faster was more than a + certain number of measurements ago. */ +#define STOP_SINCE_POSITIVE 200 + if (d >= 0) + since_positive = 0; + else + if (++since_positive > STOP_SINCE_POSITIVE) + { + if (option_trace >= 1) + printf ("stopped due to since_positive (%d)\n", + STOP_SINCE_POSITIVE); + break; + } + + /* Stop if method A has become slower by a certain factor. */ + if (ti >= tiplus1 * param->stop_factor) + { + if (option_trace >= 1) + printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n", + param->stop_factor); + break; + } + + /* Stop if the threshold implied hasn't changed in a certain + number of measurements. (It's this condition that usually + stops the loop.) */ + if (thresh_idx != new_thresh_idx) + since_thresh_change = 0, thresh_idx = new_thresh_idx; + else + if (++since_thresh_change > param->stop_since_change) + { + if (option_trace >= 1) + printf ("stopped due to since_thresh_change (%d)\n", + param->stop_since_change); + break; + } + + /* Stop if the threshold implied is more than a certain number of + measurements ago. 
*/ +#define STOP_SINCE_AFTER 500 + if (ndat - thresh_idx > STOP_SINCE_AFTER) + { + if (option_trace >= 1) + printf ("stopped due to ndat - thresh_idx > amount (%d)\n", + STOP_SINCE_AFTER); + break; + } + + /* Stop when the size limit is reached before the end of the + crossover, but only show this as an error for >= the default max + size. FIXME: Maybe should make it a param choice whether this is + an error. */ + if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE) + { + fprintf (stderr, "%s\n", param->name); + fprintf (stderr, "sizes %ld to %ld total %d measurements\n", + (long) dat[0].size, (long) dat[ndat-1].size, ndat); + fprintf (stderr, " max size reached before end of crossover\n"); + break; + } + } + + if (option_trace >= 1) + printf ("sizes %ld to %ld total %d measurements\n", + (long) dat[0].size, (long) dat[ndat-1].size, ndat); + + *threshold = dat[analyze_dat (1)].size; + + if (param->min_is_always) + { + if (*threshold == param->min_size) + *threshold = 0; + } + + if (! param->noprint || option_trace) + print_define_end (param->name, *threshold); +} + +/* Time N different FUNCTIONS with the same parameters and size, to + select the fastest. Since *_METHOD defines start numbering from + one, if functions[i] is fastest, the value of the define is i+1. + Also output a comment with speedup compared to the next fastest + function. The NAME argument is used only for trace output. + + Returns the index of the fastest function. 
+*/ +int +one_method (int n, speed_function_t *functions, + const char *name, const char *define, + const struct param_t *param) +{ + double *t; + int i; + int method; + int method_runner_up; + + TMP_DECL; + TMP_MARK; + t = (double*) TMP_ALLOC (n * sizeof (*t)); + + for (i = 0; i < n; i++) + { + t[i] = tuneup_measure (functions[i], param, &s); + if (option_trace >= 1) + printf ("size=%ld, %s, method %d %.9f\n", + (long) s.size, name, i + 1, t[i]); + if (t[i] == -1.0) + { + printf ("Oops, can't measure all %s methods\n", name); + abort (); + } + } + method = 0; + for (i = 1; i < n; i++) + if (t[i] < t[method]) + method = i; + + method_runner_up = (method == 0); + for (i = 0; i < n; i++) + if (i != method && t[i] < t[method_runner_up]) + method_runner_up = i; + + print_define_with_speedup (define, method + 1, method_runner_up + 1, + t[method_runner_up] / t[method]); + + TMP_FREE; + return method; +} + + +/* Special probing for the fft thresholds. The size restrictions on the + FFTs mean the graph of time vs size has a step effect. See this for + example using + + ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9 + gnuplot foo.gnuplot + + The current approach is to compare routines at the midpoint of relevant + steps. Arguably a more sophisticated system of threshold data is wanted + if this step effect remains. */ + +struct fft_param_t { + const char *table_name; + const char *threshold_name; + const char *modf_threshold_name; + mp_size_t *p_threshold; + mp_size_t *p_modf_threshold; + mp_size_t first_size; + mp_size_t max_size; + speed_function_t function; + speed_function_t mul_modf_function; + speed_function_t mul_function; + mp_size_t sqr; +}; + + +/* mpn_mul_fft requires pl a multiple of 2^k limbs, but with + N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple + of 2^(k-1) bits. 
*/ + +mp_size_t +fft_step_size (int k) +{ + mp_size_t step; + + step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS; + step *= (mp_size_t) 1 << k; + + if (step <= 0) + { + printf ("Can't handle k=%d\n", k); + abort (); + } + + return step; +} + +mp_size_t +fft_next_size (mp_size_t pl, int k) +{ + mp_size_t m = fft_step_size (k); + +/* printf ("[k=%d %ld] %ld ->", k, m, pl); */ + + if (pl == 0 || (pl & (m-1)) != 0) + pl = (pl | (m-1)) + 1; + +/* printf (" %ld\n", pl); */ + return pl; +} + +#define NMAX_DEFAULT 1000000 +#define MAX_REPS 25 +#define MIN_REPS 5 + +static inline size_t +mpn_mul_fft_lcm (size_t a, unsigned int k) +{ + unsigned int l = k; + + while (a % 2 == 0 && k > 0) + { + a >>= 1; + k--; + } + return a << l; +} + +mp_size_t +fftfill (mp_size_t pl, int k, int sqr) +{ + mp_size_t maxLK; + mp_bitcnt_t N, Nprime, nprime, M; + + N = pl * GMP_NUMB_BITS; + M = N >> k; + + maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k); + + Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK; + nprime = Nprime / GMP_NUMB_BITS; + if (nprime >= (sqr ? 
SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) + { + size_t K2; + for (;;) + { + K2 = 1L << mpn_fft_best_k (nprime, sqr); + if ((nprime & (K2 - 1)) == 0) + break; + nprime = (nprime + K2 - 1) & -K2; + Nprime = nprime * GMP_LIMB_BITS; + } + } + ASSERT_ALWAYS (nprime < pl); + + return Nprime; +} + +static int +compare_double (const void *ap, const void *bp) +{ + double a = * (const double *) ap; + double b = * (const double *) bp; + + if (a < b) + return -1; + else if (a > b) + return 1; + else + return 0; +} + +double +median (double *times, int n) +{ + qsort (times, n, sizeof (double), compare_double); + return times[n/2]; +} + +#define FFT_CACHE_SIZE 25 +typedef struct fft_cache +{ + mp_size_t n; + double time; +} fft_cache_t; + +fft_cache_t fft_cache[FFT_CACHE_SIZE]; + +double +cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k, + int n_measurements) +{ + int i; + double t, ttab[MAX_REPS]; + + if (fft_cache[k].n == n) + return fft_cache[k].time; + + for (i = 0; i < n_measurements; i++) + { + speed_starttime (); + mpn_mul_fft (rp, n, ap, n, bp, n, k); + ttab[i] = speed_endtime (); + } + + t = median (ttab, n_measurements); + fft_cache[k].n = n; + fft_cache[k].time = t; + return t; +} + +#define INSERT_FFTTAB(idx, nval, kval) \ + do { \ + fft_tab[idx].n = nval; \ + fft_tab[idx].k = kval; \ + fft_tab[idx+1].n = (1 << 27) - 1; /* sentinel, 27b wide field */ \ + fft_tab[idx+1].k = (1 << 5) - 1; \ + } while (0) + +int +fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print) +{ + mp_size_t n, n1, prev_n1; + int k, best_k, last_best_k, kmax; + int eff, prev_eff; + double t0, t1; + int n_measurements; + mp_limb_t *ap, *bp, *rp; + mp_size_t alloc; + struct fft_table_nk *fft_tab; + + fft_tab = mpn_fft_table3[p->sqr]; + + for (k = 0; k < FFT_CACHE_SIZE; k++) + fft_cache[k].n = 0; + + if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) + { + nmin = (p->sqr ? 
SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD); + } + + if (print) + printf ("#define %s%*s", p->table_name, 38, ""); + + if (idx == 0) + { + INSERT_FFTTAB (0, nmin, initial_k); + + if (print) + { + printf ("\\\n { "); + printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k); + } + + idx = 1; + } + + ap = (mp_ptr) malloc (sizeof (mp_limb_t)); + if (p->sqr) + bp = ap; + else + bp = (mp_ptr) malloc (sizeof (mp_limb_t)); + rp = (mp_ptr) malloc (sizeof (mp_limb_t)); + alloc = 1; + + /* Round n to comply to initial k value */ + n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k); + + n_measurements = (18 - initial_k) | 1; + n_measurements = MAX (n_measurements, MIN_REPS); + n_measurements = MIN (n_measurements, MAX_REPS); + + last_best_k = initial_k; + best_k = initial_k; + + while (n < nmax) + { + int start_k, end_k; + + /* Assume the current best k is best until we hit its next FFT step. */ + t0 = 99999; + + prev_n1 = n + 1; + + start_k = MAX (4, best_k - 4); + end_k = MIN (24, best_k + 4); + for (k = start_k; k <= end_k; k++) + { + n1 = mpn_fft_next_size (prev_n1, k); + + eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr); + + if (eff < 70) /* avoid measuring too slow fft:s */ + continue; + + if (n1 > alloc) + { + alloc = n1; + if (p->sqr) + { + ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t)); + rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t)); + ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t)); + mpn_random (ap, alloc); + rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t)); + } + else + { + ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t)); + bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t)); + rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t)); + ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t)); + mpn_random (ap, alloc); + bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t)); + mpn_random (bp, alloc); + rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t)); + } + } + + t1 = cached_measure (rp, ap, bp, n1, k, n_measurements); + + 
if (t1 * n_measurements > 0.3) + n_measurements -= 2; + n_measurements = MAX (n_measurements, MIN_REPS); + + if (t1 < t0) + { + best_k = k; + t0 = t1; + } + } + + n1 = mpn_fft_next_size (prev_n1, best_k); + + if (last_best_k != best_k) + { + ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1); + + if (idx >= FFT_TABLE3_SIZE) + { + printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n"); + abort (); + } + INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k); + + if (print) + { + printf (", "); + if (idx % 4 == 0) + printf ("\\\n "); + printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k); + } + + if (option_trace >= 2) + { + printf ("{%lu,%u}\n", prev_n1, best_k); + fflush (stdout); + } + + last_best_k = best_k; + idx++; + } + + for (;;) + { + prev_n1 = n1; + prev_eff = fftfill (prev_n1, best_k, p->sqr); + n1 = mpn_fft_next_size (prev_n1 + 1, best_k); + eff = fftfill (n1, best_k, p->sqr); + + if (eff != prev_eff) + break; + } + + n = prev_n1; + } + + kmax = sizeof (mp_size_t) * 4; /* GMP_MP_SIZE_T_BITS / 2 */ + kmax = MIN (kmax, 25-1); + for (k = last_best_k + 1; k <= kmax; k++) + { + if (idx >= FFT_TABLE3_SIZE) + { + printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n"); + abort (); + } + INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k); + + if (print) + { + printf (", "); + if (idx % 4 == 0) + printf ("\\\n "); + printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k); + } + + idx++; + } + + if (print) + printf (" }\n"); + + free (ap); + if (! p->sqr) + free (bp); + free (rp); + + return idx; +} + +void +fft (struct fft_param_t *p) +{ + mp_size_t size; + int k, idx, initial_k; + + /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/ + +#if 1 + { + /* Use plain one() mechanism, for some reasonable initial values of k. The + advantage is that we don't depend on mpn_fft_table3, which can therefore + leave it completely uninitialized. 
*/ + + static struct param_t param; + mp_size_t thres, best_thres; + int best_k; + char buf[20]; + + best_thres = MP_SIZE_T_MAX; + best_k = -1; + + for (k = 5; k <= 7; k++) + { + param.name = p->modf_threshold_name; + param.min_size = 100; + param.max_size = 2000; + param.function = p->mul_function; + param.step_factor = 0.0; + param.step = 4; + param.function2 = p->mul_modf_function; + param.noprint = 1; + s.r = k; + one (&thres, &param); + if (thres < best_thres) + { + best_thres = thres; + best_k = k; + } + } + + *(p->p_modf_threshold) = best_thres; + sprintf (buf, "k = %d", best_k); + print_define_remark (p->modf_threshold_name, best_thres, buf); + initial_k = best_k; + } +#else + size = p->first_size; + for (;;) + { + double tk, tm; + + size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr)); + k = mpn_fft_best_k (size, p->sqr); + + if (size >= p->max_size) + break; + + s.size = size + fft_step_size (k) / 2; + s.r = k; + tk = tuneup_measure (p->mul_modf_function, NULL, &s); + if (tk == -1.0) + abort (); + + tm = tuneup_measure (p->mul_function, NULL, &s); + if (tm == -1.0) + abort (); + + if (option_trace >= 2) + printf ("at %ld size=%ld k=%d %.9f size=%ld modf %.9f\n", + (long) size, + (long) size + fft_step_size (k) / 2, k, tk, + (long) s.size, tm); + + if (tk < tm) + { + *p->p_modf_threshold = s.size; + print_define (p->modf_threshold_name, *p->p_modf_threshold); + break; + } + } + initial_k = ?; +#endif + + /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/ + + idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1); + printf ("#define %s_SIZE %d\n", p->table_name, idx); + + /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/ + + size = 2 * *p->p_modf_threshold; /* OK? 
*/ + for (;;) + { + double tk, tm; + mp_size_t mulmod_size, mul_size; + + if (size >= p->max_size) + break; + + mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2; + mul_size = (size + mulmod_size) / 2; /* middle of step */ + + s.size = mulmod_size; + tk = tuneup_measure (p->function, NULL, &s); + if (tk == -1.0) + abort (); + + s.size = mul_size; + tm = tuneup_measure (p->mul_function, NULL, &s); + if (tm == -1.0) + abort (); + + if (option_trace >= 2) + printf ("at %ld size=%ld %.9f size=%ld mul %.9f\n", + (long) size, + (long) mulmod_size, tk, + (long) mul_size, tm); + + size = mulmod_size; + + if (tk < tm) + { + *p->p_threshold = s.size; + print_define (p->threshold_name, *p->p_threshold); + break; + } + } +} + +/* Compare mpn_mul_1 to whatever fast exact single-limb division we have. This + is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch. + This is used in get_str and set_str. */ +void +relspeed_div_1_vs_mul_1 (void) +{ + const size_t max_opsize = 100; + mp_size_t n; + long j; + mp_limb_t rp[max_opsize]; + mp_limb_t ap[max_opsize]; + double multime, divtime; + + mpn_random (ap, max_opsize); + + multime = 0; + for (n = max_opsize; n > 1; n--) + { + mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10); + speed_starttime (); + for (j = speed_precision; j != 0 ; j--) + mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10); + multime += speed_endtime () / n; + } + + divtime = 0; + for (n = max_opsize; n > 1; n--) + { + /* Make input divisible for good measure. */ + ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10); + +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1 + mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10, + MP_BASES_BIG_BASE_BINVERTED_10, + MP_BASES_BIG_BASE_CTZ_10); +#else + mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10); +#endif + speed_starttime (); + for (j = speed_precision; j != 0 ; j--) + { +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! 
HAVE_NATIVE_mpn_divexact_1 + mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10, + MP_BASES_BIG_BASE_BINVERTED_10, + MP_BASES_BIG_BASE_CTZ_10); +#else + mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10); +#endif + } + divtime += speed_endtime () / n; + } + + print_define ("DIV_1_VS_MUL_1_PERCENT", (int) (100 * divtime/multime)); +} + + +/* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2, + giving wrong results. */ +void +tune_mul_n (void) +{ + static struct param_t param; + mp_size_t next_toom_start; + int something_changed; + + param.function = speed_mpn_mul_n; + + param.name = "MUL_TOOM22_THRESHOLD"; + param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE); + param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1; + one (&mul_toom22_threshold, ¶m); + + param.noprint = 1; + + /* Threshold sequence loop. Disable functions that would be used in a very + narrow range, re-measuring things when that happens. */ + something_changed = 1; + while (something_changed) + { + something_changed = 0; + + next_toom_start = mul_toom22_threshold; + + if (mul_toom33_threshold != 0) + { + param.name = "MUL_TOOM33_THRESHOLD"; + param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE); + param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1; + one (&mul_toom33_threshold, ¶m); + + if (next_toom_start * 1.05 >= mul_toom33_threshold) + { + mul_toom33_threshold = 0; + something_changed = 1; + } + } + + next_toom_start = MAX (next_toom_start, mul_toom33_threshold); + + if (mul_toom44_threshold != 0) + { + param.name = "MUL_TOOM44_THRESHOLD"; + param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE); + param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1; + one (&mul_toom44_threshold, ¶m); + + if (next_toom_start * 1.05 >= mul_toom44_threshold) + { + mul_toom44_threshold = 0; + something_changed = 1; + } + } + + next_toom_start = MAX (next_toom_start, mul_toom44_threshold); + + if (mul_toom6h_threshold != 0) + { + param.name = "MUL_TOOM6H_THRESHOLD"; + param.min_size = MAX 
(next_toom_start, MPN_TOOM6H_MUL_MINSIZE); + param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1; + one (&mul_toom6h_threshold, ¶m); + + if (next_toom_start * 1.05 >= mul_toom6h_threshold) + { + mul_toom6h_threshold = 0; + something_changed = 1; + } + } + + next_toom_start = MAX (next_toom_start, mul_toom6h_threshold); + + if (mul_toom8h_threshold != 0) + { + param.name = "MUL_TOOM8H_THRESHOLD"; + param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE); + param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1; + one (&mul_toom8h_threshold, ¶m); + + if (next_toom_start * 1.05 >= mul_toom8h_threshold) + { + mul_toom8h_threshold = 0; + something_changed = 1; + } + } + } + + print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD); + print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD); + print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD); + print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD); + + /* disabled until tuned */ + MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; +} + +void +tune_mul (void) +{ + static struct param_t param; + mp_size_t thres; + + param.noprint = 1; + + param.function = speed_mpn_toom32_for_toom43_mul; + param.function2 = speed_mpn_toom43_for_toom32_mul; + param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD"; + param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17; + one (&thres, ¶m); + mul_toom32_to_toom43_threshold = thres * 17 / 24; + print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold); + + param.function = speed_mpn_toom32_for_toom53_mul; + param.function2 = speed_mpn_toom53_for_toom32_mul; + param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD"; + param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19; + one (&thres, ¶m); + mul_toom32_to_toom53_threshold = thres * 19 / 30; + print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold); + + param.function = speed_mpn_toom42_for_toom53_mul; + param.function2 = speed_mpn_toom53_for_toom42_mul; + param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD"; + param.min_size = 
MPN_TOOM53_MUL_MINSIZE * 20 / 11; + one (&thres, ¶m); + mul_toom42_to_toom53_threshold = thres * 11 / 20; + print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold); + + param.function = speed_mpn_toom42_mul; + param.function2 = speed_mpn_toom63_mul; + param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD"; + param.min_size = MPN_TOOM63_MUL_MINSIZE * 2; + one (&thres, ¶m); + mul_toom42_to_toom63_threshold = thres / 2; + print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold); + + /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */ + param.function = speed_mpn_toom43_for_toom54_mul; + param.function2 = speed_mpn_toom54_for_toom43_mul; + param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD"; + param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5; + one (&thres, ¶m); + mul_toom43_to_toom54_threshold = thres * 5 / 6; + print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold); +} + + +void +tune_mullo (void) +{ + static struct param_t param; + + param.function = speed_mpn_mullo_n; + + param.name = "MULLO_BASECASE_THRESHOLD"; + param.min_size = 2; + param.min_is_always = 1; + param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1; + param.stop_factor = 1.5; + param.noprint = 1; + one (&mullo_basecase_threshold, ¶m); + + param.name = "MULLO_DC_THRESHOLD"; + param.min_size = 8; + param.min_is_always = 0; + param.max_size = 1000; + one (&mullo_dc_threshold, ¶m); + + if (mullo_basecase_threshold >= mullo_dc_threshold) + { + print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold); + print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase"); + } + else + { + print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold); + print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold); + } + + if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2) + { + param.name = "MULLO_MUL_N_THRESHOLD"; + param.min_size = mullo_dc_threshold; + param.max_size = 2 * mul_fft_threshold; + param.noprint = 0; + 
param.step_factor = 0.03; + one (&mullo_mul_n_threshold, ¶m); + } + else + print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX, + "without FFT use mullo forever"); +} + +void +tune_sqrlo (void) +{ + static struct param_t param; + + param.function = speed_mpn_sqrlo; + + param.name = "SQRLO_BASECASE_THRESHOLD"; + param.min_size = 2; + param.min_is_always = 1; + param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1; + param.stop_factor = 1.5; + param.noprint = 1; + one (&sqrlo_basecase_threshold, ¶m); + + param.name = "SQRLO_DC_THRESHOLD"; + param.min_size = 8; + param.min_is_always = 0; + param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1; + one (&sqrlo_dc_threshold, ¶m); + + if (sqrlo_basecase_threshold >= sqrlo_dc_threshold) + { + print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold); + print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase"); + } + else + { + print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold); + print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold); + } + + if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2) + { + param.name = "SQRLO_SQR_THRESHOLD"; + param.min_size = sqrlo_dc_threshold; + param.max_size = 2 * sqr_fft_threshold; + param.noprint = 0; + param.step_factor = 0.03; + one (&sqrlo_sqr_threshold, ¶m); + } + else + print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX, + "without FFT use sqrlo forever"); +} + +void +tune_mulmid (void) +{ + static struct param_t param; + + param.name = "MULMID_TOOM42_THRESHOLD"; + param.function = speed_mpn_mulmid_n; + param.min_size = 4; + param.max_size = 100; + one (&mulmid_toom42_threshold, ¶m); +} + +void +tune_mulmod_bnm1 (void) +{ + static struct param_t param; + + param.name = "MULMOD_BNM1_THRESHOLD"; + param.function = speed_mpn_mulmod_bnm1; + param.min_size = 4; + param.max_size = 100; + one (&mulmod_bnm1_threshold, ¶m); +} + +void +tune_sqrmod_bnm1 (void) +{ + static struct param_t param; + + param.name = "SQRMOD_BNM1_THRESHOLD"; + param.function = 
speed_mpn_sqrmod_bnm1; + param.min_size = 4; + param.max_size = 100; + one (&sqrmod_bnm1_threshold, ¶m); +} + + +/* Start the basecase from 3, since 1 is a special case, and if mul_basecase + is faster only at size==2 then we don't want to bother with extra code + just for that. Start karatsuba from 4 same as MUL above. */ + +void +tune_sqr (void) +{ + /* disabled until tuned */ + SQR_FFT_THRESHOLD = MP_SIZE_T_MAX; + + if (HAVE_NATIVE_mpn_sqr_basecase) + { + print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)"); + sqr_basecase_threshold = 0; + } + else + { + static struct param_t param; + param.name = "SQR_BASECASE_THRESHOLD"; + param.function = speed_mpn_sqr; + param.min_size = 3; + param.min_is_always = 1; + param.max_size = TUNE_SQR_TOOM2_MAX; + param.noprint = 1; + one (&sqr_basecase_threshold, ¶m); + } + + { + static struct param_t param; + param.name = "SQR_TOOM2_THRESHOLD"; + param.function = speed_mpn_sqr; + param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE); + param.max_size = TUNE_SQR_TOOM2_MAX; + param.noprint = 1; + one (&sqr_toom2_threshold, ¶m); + + if (! HAVE_NATIVE_mpn_sqr_basecase + && sqr_toom2_threshold < sqr_basecase_threshold) + { + /* Karatsuba becomes faster than mul_basecase before + sqr_basecase does. Arrange for the expression + "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which + selects mpn_sqr_basecase in mpn_sqr to be false, by setting + SQR_TOOM2_THRESHOLD to zero, making + SQR_BASECASE_THRESHOLD the toom2 threshold. */ + + sqr_basecase_threshold = SQR_TOOM2_THRESHOLD; + SQR_TOOM2_THRESHOLD = 0; + + print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold, + "toom2"); + print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD, + "never sqr_basecase"); + } + else + { + if (! 
HAVE_NATIVE_mpn_sqr_basecase) + print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold); + print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD); + } + } + + { + static struct param_t param; + mp_size_t next_toom_start; + int something_changed; + + param.function = speed_mpn_sqr; + param.noprint = 1; + + /* Threshold sequence loop. Disable functions that would be used in a very + narrow range, re-measuring things when that happens. */ + something_changed = 1; + while (something_changed) + { + something_changed = 0; + + next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold); + + sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; + param.name = "SQR_TOOM3_THRESHOLD"; + param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE); + param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1; + one (&sqr_toom3_threshold, ¶m); + + next_toom_start = MAX (next_toom_start, sqr_toom3_threshold); + + if (sqr_toom4_threshold != 0) + { + param.name = "SQR_TOOM4_THRESHOLD"; + sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; + param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE); + param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1; + one (&sqr_toom4_threshold, ¶m); + + if (next_toom_start * 1.05 >= sqr_toom4_threshold) + { + sqr_toom4_threshold = 0; + something_changed = 1; + } + } + + next_toom_start = MAX (next_toom_start, sqr_toom4_threshold); + + if (sqr_toom6_threshold != 0) + { + param.name = "SQR_TOOM6_THRESHOLD"; + sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT; + param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE); + param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1; + one (&sqr_toom6_threshold, ¶m); + + if (next_toom_start * 1.05 >= sqr_toom6_threshold) + { + sqr_toom6_threshold = 0; + something_changed = 1; + } + } + + next_toom_start = MAX (next_toom_start, sqr_toom6_threshold); + + if (sqr_toom8_threshold != 0) + { + param.name = "SQR_TOOM8_THRESHOLD"; + sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT; + param.min_size = MAX (next_toom_start, 
MPN_TOOM8_SQR_MINSIZE); + param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1; + one (&sqr_toom8_threshold, ¶m); + + if (next_toom_start * 1.05 >= sqr_toom8_threshold) + { + sqr_toom8_threshold = 0; + something_changed = 1; + } + } + } + + print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD); + print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD); + print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD); + print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD); + } +} + + +void +tune_dc_div (void) +{ + s.r = 0; /* clear to make speed function do 2n/n */ + { + static struct param_t param; + param.name = "DC_DIV_QR_THRESHOLD"; + param.function = speed_mpn_sbpi1_div_qr; + param.function2 = speed_mpn_dcpi1_div_qr; + param.min_size = 6; + one (&dc_div_qr_threshold, ¶m); + } + { + static struct param_t param; + param.name = "DC_DIVAPPR_Q_THRESHOLD"; + param.function = speed_mpn_sbpi1_divappr_q; + param.function2 = speed_mpn_dcpi1_divappr_q; + param.min_size = 6; + one (&dc_divappr_q_threshold, ¶m); + } +} + +static double +speed_mpn_sbordcpi1_div_qr (struct speed_params *s) +{ + if (s->size < DC_DIV_QR_THRESHOLD) + return speed_mpn_sbpi1_div_qr (s); + else + return speed_mpn_dcpi1_div_qr (s); +} + +void +tune_mu_div (void) +{ + s.r = 0; /* clear to make speed function do 2n/n */ + { + static struct param_t param; + param.name = "MU_DIV_QR_THRESHOLD"; + param.function = speed_mpn_dcpi1_div_qr; + param.function2 = speed_mpn_mu_div_qr; + param.min_size = mul_toom22_threshold; + param.max_size = 5000; + param.step_factor = 0.02; + one (&mu_div_qr_threshold, ¶m); + } + { + static struct param_t param; + param.name = "MU_DIVAPPR_Q_THRESHOLD"; + param.function = speed_mpn_dcpi1_divappr_q; + param.function2 = speed_mpn_mu_divappr_q; + param.min_size = mul_toom22_threshold; + param.max_size = 5000; + param.step_factor = 0.02; + one (&mu_divappr_q_threshold, ¶m); + } + { + static struct param_t param; + param.name = "MUPI_DIV_QR_THRESHOLD"; + param.function = 
speed_mpn_sbordcpi1_div_qr; + param.function2 = speed_mpn_mupi_div_qr; + param.min_size = 6; + param.min_is_always = 1; + param.max_size = 1000; + param.step_factor = 0.02; + one (&mupi_div_qr_threshold, ¶m); + } +} + +void +tune_dc_bdiv (void) +{ + s.r = 0; /* clear to make speed function do 2n/n*/ + { + static struct param_t param; + param.name = "DC_BDIV_QR_THRESHOLD"; + param.function = speed_mpn_sbpi1_bdiv_qr; + param.function2 = speed_mpn_dcpi1_bdiv_qr; + param.min_size = 4; + one (&dc_bdiv_qr_threshold, ¶m); + } + { + static struct param_t param; + param.name = "DC_BDIV_Q_THRESHOLD"; + param.function = speed_mpn_sbpi1_bdiv_q; + param.function2 = speed_mpn_dcpi1_bdiv_q; + param.min_size = 4; + one (&dc_bdiv_q_threshold, ¶m); + } +} + +void +tune_mu_bdiv (void) +{ + s.r = 0; /* clear to make speed function do 2n/n*/ + { + static struct param_t param; + param.name = "MU_BDIV_QR_THRESHOLD"; + param.function = speed_mpn_dcpi1_bdiv_qr; + param.function2 = speed_mpn_mu_bdiv_qr; + param.min_size = dc_bdiv_qr_threshold; + param.max_size = 5000; + param.step_factor = 0.02; + one (&mu_bdiv_qr_threshold, ¶m); + } + { + static struct param_t param; + param.name = "MU_BDIV_Q_THRESHOLD"; + param.function = speed_mpn_dcpi1_bdiv_q; + param.function2 = speed_mpn_mu_bdiv_q; + param.min_size = dc_bdiv_q_threshold; + param.max_size = 5000; + param.step_factor = 0.02; + one (&mu_bdiv_q_threshold, ¶m); + } +} + +void +tune_invertappr (void) +{ + static struct param_t param; + + param.function = speed_mpn_ni_invertappr; + param.name = "INV_MULMOD_BNM1_THRESHOLD"; + param.min_size = 5; + one (&inv_mulmod_bnm1_threshold, ¶m); + + param.function = speed_mpn_invertappr; + param.name = "INV_NEWTON_THRESHOLD"; + param.min_size = 5; + one (&inv_newton_threshold, ¶m); +} + +void +tune_invert (void) +{ + static struct param_t param; + + param.function = speed_mpn_invert; + param.name = "INV_APPR_THRESHOLD"; + param.min_size = 5; + one (&inv_appr_threshold, ¶m); +} + +void +tune_binvert 
(void) +{ + static struct param_t param; + + param.function = speed_mpn_binvert; + param.name = "BINV_NEWTON_THRESHOLD"; + param.min_size = 8; /* pointless with smaller operands */ + one (&binv_newton_threshold, ¶m); +} + +void +tune_redc (void) +{ +#define TUNE_REDC_2_MAX 100 +#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2 +#define WANT_REDC_2 1 +#endif + +#if WANT_REDC_2 + { + static struct param_t param; + param.name = "REDC_1_TO_REDC_2_THRESHOLD"; + param.function = speed_mpn_redc_1; + param.function2 = speed_mpn_redc_2; + param.min_size = 1; + param.min_is_always = 1; + param.max_size = TUNE_REDC_2_MAX; + param.noprint = 1; + param.stop_factor = 1.5; + one (&redc_1_to_redc_2_threshold, ¶m); + } + { + static struct param_t param; + param.name = "REDC_2_TO_REDC_N_THRESHOLD"; + param.function = speed_mpn_redc_2; + param.function2 = speed_mpn_redc_n; + param.min_size = 16; + param.noprint = 1; + one (&redc_2_to_redc_n_threshold, ¶m); + } + if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold) + { + redc_2_to_redc_n_threshold = 0; /* disable redc_2 */ + + /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as + REDC_1_TO_REDC_2_THRESHOLD. 
*/ + { + static struct param_t param; + param.name = "REDC_1_TO_REDC_2_THRESHOLD"; + param.function = speed_mpn_redc_1; + param.function2 = speed_mpn_redc_n; + param.min_size = 16; + param.noprint = 1; + one (&redc_1_to_redc_2_threshold, ¶m); + } + } + print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD); + print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD); +#else + { + static struct param_t param; + param.name = "REDC_1_TO_REDC_N_THRESHOLD"; + param.function = speed_mpn_redc_1; + param.function2 = speed_mpn_redc_n; + param.min_size = 16; + one (&redc_1_to_redc_n_threshold, ¶m); + } +#endif +} + +void +tune_matrix22_mul (void) +{ + static struct param_t param; + param.name = "MATRIX22_STRASSEN_THRESHOLD"; + param.function = speed_mpn_matrix22_mul; + param.min_size = 2; + one (&matrix22_strassen_threshold, ¶m); +} + +void +tune_hgcd2 (void) +{ + static struct param_t param; + hgcd2_func_t *f[5] = + { mpn_hgcd2_1, + mpn_hgcd2_2, + mpn_hgcd2_3, + mpn_hgcd2_4, + mpn_hgcd2_5 }; + speed_function_t speed_f[5] = + { speed_mpn_hgcd2_1, + speed_mpn_hgcd2_2, + speed_mpn_hgcd2_3, + speed_mpn_hgcd2_4, + speed_mpn_hgcd2_5 }; + int best; + + s.size = 1; + best = one_method (5, speed_f, "mpn_hgcd2", "HGCD2_DIV1_METHOD", ¶m); + + /* Use selected function when tuning hgcd and gcd */ + hgcd2_func = f[best]; +} + +void +tune_hgcd (void) +{ + static struct param_t param; + param.name = "HGCD_THRESHOLD"; + param.function = speed_mpn_hgcd; + /* We seem to get strange results for small sizes */ + param.min_size = 30; + one (&hgcd_threshold, ¶m); +} + +void +tune_hgcd_appr (void) +{ + static struct param_t param; + param.name = "HGCD_APPR_THRESHOLD"; + param.function = speed_mpn_hgcd_appr; + /* We seem to get strange results for small sizes */ + param.min_size = 50; + param.stop_since_change = 150; + one (&hgcd_appr_threshold, ¶m); +} + +void +tune_hgcd_reduce (void) +{ + static struct param_t param; + param.name = "HGCD_REDUCE_THRESHOLD"; + 
param.function = speed_mpn_hgcd_reduce; + param.min_size = 30; + param.max_size = 7000; + param.step_factor = 0.04; + one (&hgcd_reduce_threshold, ¶m); +} + +void +tune_gcd_dc (void) +{ + static struct param_t param; + param.name = "GCD_DC_THRESHOLD"; + param.function = speed_mpn_gcd; + param.min_size = hgcd_threshold; + param.max_size = 3000; + param.step_factor = 0.02; + one (&gcd_dc_threshold, ¶m); +} + +void +tune_gcdext_dc (void) +{ + static struct param_t param; + param.name = "GCDEXT_DC_THRESHOLD"; + param.function = speed_mpn_gcdext; + param.min_size = hgcd_threshold; + param.max_size = 3000; + param.step_factor = 0.02; + one (&gcdext_dc_threshold, ¶m); +} + +/* In tune_powm_sec we compute the table used by the win_size function. The + cutoff points are in exponent bits, disregarding other operand sizes. It is + not possible to use the one framework since it currently uses a granularity + of full limbs. +*/ + +/* This win_size replaces the variant in the powm code, allowing us to + control k in the k-ary algorithms. */ +int winsize; +int +win_size (mp_bitcnt_t eb) +{ + return winsize; +} + +void +tune_powm_sec (void) +{ + mp_size_t n; + int k, i; + mp_size_t itch; + mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff; + const int n_max = 3000 / GMP_NUMB_BITS; + const int n_measurements = 5; + mp_ptr rp, bp, ep, mp, tp; + double ttab[n_measurements], tk, tkp1; + TMP_DECL; + TMP_MARK; + + possible_nbits_cutoff = 0; + + k = 1; + + winsize = 10; /* the itch function needs this */ + itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max); + + rp = TMP_ALLOC_LIMBS (n_max); + bp = TMP_ALLOC_LIMBS (n_max); + ep = TMP_ALLOC_LIMBS (n_max); + mp = TMP_ALLOC_LIMBS (n_max); + tp = TMP_ALLOC_LIMBS (itch); + + mpn_random (bp, n_max); + mpn_random (mp, n_max); + mp[0] |= 1; + +/* How about taking the M operand size into account? + + An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming + B = O(M)). 
+ + Using k-ary and no sliding window, the precomputation will need time + O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) + + O(log(E)/k*M(N)), for the squarings, multiplications, respectively. + + An operation R=powm_sec(B,E,N) will take time like powm. + + Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the + main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) + + O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full + table reads, respectively. */ + + printf ("#define POWM_SEC_TABLE "); + + /* For nbits == 1, we should always use k == 1, so no need to tune + that. Starting with nbits == 2 also ensure that nbits always is + larger than the windowsize k+1. */ + for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; ) + { + n = (nbits - 1) / GMP_NUMB_BITS + 1; + + /* Generate E such that sliding-window for k and k+1 works equally + well/poorly (but sliding is not used in powm_sec, of course). */ + for (i = 0; i < n; i++) + ep[i] = ~CNST_LIMB(0); + + winsize = k; + for (i = 0; i < n_measurements; i++) + { + speed_starttime (); + mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp); + ttab[i] = speed_endtime (); + } + tk = median (ttab, n_measurements); + + winsize = k + 1; + speed_starttime (); + for (i = 0; i < n_measurements; i++) + { + speed_starttime (); + mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp); + ttab[i] = speed_endtime (); + } + tkp1 = median (ttab, n_measurements); +/* + printf ("testing: %ld, %d", nbits, k, ep[n-1]); + printf (" %10.5f %10.5f\n", tk, tkp1); +*/ + if (tkp1 < tk) + { + if (possible_nbits_cutoff) + { + /* Two consecutive sizes indicate k increase, obey. */ + + /* Must always have x[k] >= k */ + ASSERT_ALWAYS (possible_nbits_cutoff >= k); + + if (k > 1) + printf (","); + printf ("%ld", (long) possible_nbits_cutoff); + k++; + possible_nbits_cutoff = 0; + } + else + { + /* One measurement indicate k increase, save nbits for further + consideration. 
*/ + /* The new larger k gets used for sizes > the cutoff + value, hence the cutoff should be one less than the + smallest size where it gives a speedup. */ + possible_nbits_cutoff = nbits - 1; + } + } + else + possible_nbits_cutoff = 0; + + nbits_next = nbits * 65 / 64; + nbits = nbits_next + (nbits_next == nbits); + } + printf ("\n"); + TMP_FREE; +} + + +/* size_extra==1 reflects the fact that with high= mod_1_2_to_mod_1_4_threshold) + { + /* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */ + mod_1_2_to_mod_1_4_threshold = 0; + + param.function = speed_mpn_mod_1_1; + param.function2 = speed_mpn_mod_1_4; + param.min_is_always = 1; + param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake"; + param.min_size = 2; + one (&mod_1_1_to_mod_1_2_threshold, ¶m); + } + + param.function = speed_mpn_mod_1_tune; + param.function2 = NULL; + param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD"; + param.min_size = 2; + param.min_is_always = 0; + one (&mod_1u_to_mod_1_1_threshold, ¶m); + + if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold) + mod_1_1_to_mod_1_2_threshold = 0; + if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold) + mod_1_2_to_mod_1_4_threshold = 0; + + print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL); + print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold, + mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL); + print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold, + mod_1_2_to_mod_1_4_threshold == 0 ? 
"never mpn_mod_1s_2p" : NULL); + } + + { + static struct param_t param; + + param.check_size = 256; + + param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD"; + s.r = randlimb_norm (); + param.function = speed_mpn_preinv_mod_1; + param.function2 = speed_mpn_mod_1_tune; + param.min_size = 1; + one (&preinv_mod_1_to_mod_1_threshold, ¶m); + } +} + + +/* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would + imply that udiv_qrnnd_preinv is worth using, but it seems most + straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div + directly. */ + +void +tune_preinv_divrem_1 (void) +{ + static struct param_t param; + speed_function_t divrem_1; + const char *divrem_1_name; + double t1, t2; + + if (GMP_NAIL_BITS != 0) + { + print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails"); + return; + } + + /* Any native version of mpn_preinv_divrem_1 is assumed to exist because + it's faster than mpn_divrem_1. */ + if (HAVE_NATIVE_mpn_preinv_divrem_1) + { + print_define_remark ("USE_PREINV_DIVREM_1", 1, "native"); + return; + } + + /* If udiv_qrnnd_preinv is the only division method then of course + mpn_preinv_divrem_1 should be used. */ + if (UDIV_PREINV_ALWAYS) + { + print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always"); + return; + } + + /* If we've got an assembler version of mpn_divrem_1, then compare against + that, not the mpn_divrem_1_div generic C. */ + if (HAVE_NATIVE_mpn_divrem_1) + { + divrem_1 = speed_mpn_divrem_1; + divrem_1_name = "mpn_divrem_1"; + } + else + { + divrem_1 = speed_mpn_divrem_1_div; + divrem_1_name = "mpn_divrem_1_div"; + } + + param.data_high = DATA_HIGH_LT_R; /* allow skip one division */ + s.size = 200; /* generous but not too big */ + /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case, + since in general that's probably most common, though in fact for a + 64-bit limb mp_bases[10].big_base is normalized. 
*/ + s.r = urandom() & (GMP_NUMB_MASK >> 4); + if (s.r == 0) s.r = 123; + + t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s); + t2 = tuneup_measure (divrem_1, ¶m, &s); + if (t1 == -1.0 || t2 == -1.0) + { + printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n", + divrem_1_name, (long) s.size); + abort (); + } + if (option_trace >= 1) + printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n", + (long) s.size, t1, divrem_1_name, t2); + + print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL); +} + + + +void +tune_divrem_2 (void) +{ + static struct param_t param; + + /* No support for tuning native assembler code, do that by hand and put + the results in the .asm file, and there's no need for such thresholds + to appear in gmp-mparam.h. */ + if (HAVE_NATIVE_mpn_divrem_2) + return; + + if (GMP_NAIL_BITS != 0) + { + print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX, + "no preinv with nails"); + return; + } + + if (UDIV_PREINV_ALWAYS) + { + print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always"); + return; + } + + /* Tune for the integer part of mpn_divrem_2. This will very possibly be + a bit out for the fractional part, but that's too bad, the integer part + is more important. + + min_size must be >=2 since nsize>=2 is required, but is set to 4 to save + code space if plain division is better only at size==2 or size==3. 
*/ + param.name = "DIVREM_2_THRESHOLD"; + param.check_size = 256; + param.min_size = 4; + param.min_is_always = 1; + param.size_extra = 2; /* does qsize==nsize-2 divisions */ + param.stop_factor = 2.0; + + s.r = randlimb_norm (); + param.function = speed_mpn_divrem_2; + one (&divrem_2_threshold, ¶m); +} + +void +tune_div_qr_2 (void) +{ + static struct param_t param; + param.name = "DIV_QR_2_PI2_THRESHOLD"; + param.function = speed_mpn_div_qr_2n; + param.check_size = 500; + param.min_size = 4; + one (&div_qr_2_pi2_threshold, ¶m); +} + +/* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so + tune for that. Its speed can differ on odd or even divisor, so take an + average threshold for the two. + + mpn_divrem_1 can vary with highsize >= 1); + + base = s->r == 0 ? 10 : s->r; + SPEED_RESTRICT_COND (base >= 2 && base <= 256); + + TMP_MARK; + + str = (unsigned char *) TMP_ALLOC (s->size); + for (i = 0; i < s->size; i++) + str[i] = s->xp[i] % base; + + LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); + SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); + + /* use this during development to check wn is big enough */ + /* + ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn); + */ + + speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES); + speed_operand_dst (s, wp, wn); + speed_cache_fill (s); + + chars_per_limb = mp_bases[base].chars_per_limb; + un = s->size / chars_per_limb + 1; + powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un)); + size_t n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base); + powers_t *pt = powtab + n_pows; + tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un)); + + speed_starttime (); + i = s->reps; + do + { + mpn_pre_set_str (wp, str, s->size, pt, tp); + } + while (--i != 0); + t = speed_endtime (); + + TMP_FREE; + return t; +} + +void +tune_set_str (void) +{ + s.r = 10; /* decimal */ + { + static struct param_t param; + SET_STR_PRECOMPUTE_THRESHOLD = 0; + param.step_factor = 0.01; + param.name = 
"SET_STR_DC_THRESHOLD";
+    param.function = speed_mpn_pre_set_str;
+    param.min_size = 100;
+    param.max_size = 50000;
+    one (&set_str_dc_threshold, &param);
+  }
+  {
+    static struct param_t  param;
+    param.step_factor = 0.02;
+    param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
+    param.function = speed_mpn_set_str;
+    param.min_size = SET_STR_DC_THRESHOLD;
+    param.max_size = 100000;
+    one (&set_str_precompute_threshold, &param);
+  }
+}
+
+/* Fill in the MUL_FFT_* names and hooks, then run fft(); no-op if option_fft_max_size is 0.  */
+void
+tune_fft_mul (void)
+{
+  static struct fft_param_t  param;
+
+  if (option_fft_max_size == 0)
+    return;
+
+  param.table_name          = "MUL_FFT_TABLE3";
+  param.threshold_name      = "MUL_FFT_THRESHOLD";
+  param.p_threshold         = &mul_fft_threshold;
+  param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
+  param.p_modf_threshold    = &mul_fft_modf_threshold;
+  param.first_size          = MUL_TOOM33_THRESHOLD / 2;
+  param.max_size            = option_fft_max_size;
+  param.function            = speed_mpn_fft_mul;
+  param.mul_modf_function   = speed_mpn_mul_fft;
+  param.mul_function        = speed_mpn_mul_n;
+  param.sqr = 0;
+  fft (&param);
+}
+
+/* Squaring counterpart of tune_fft_mul: same setup with the SQR_FFT_* names and param.sqr = 1.  */
+void
+tune_fft_sqr (void)
+{
+  static struct fft_param_t  param;
+
+  if (option_fft_max_size == 0)
+    return;
+
+  param.table_name          = "SQR_FFT_TABLE3";
+  param.threshold_name      = "SQR_FFT_THRESHOLD";
+  param.p_threshold         = &sqr_fft_threshold;
+  param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
+  param.p_modf_threshold    = &sqr_fft_modf_threshold;
+  param.first_size          = SQR_TOOM3_THRESHOLD / 2;
+  param.max_size            = option_fft_max_size;
+  param.function            = speed_mpn_fft_sqr;
+  param.mul_modf_function   = speed_mpn_mul_fft_sqr;
+  param.mul_function        = speed_mpn_sqr;
+  param.sqr = 1;
+  fft (&param);
+}
+/* Run one() for FAC_DSC_THRESHOLD and then FAC_ODD_THRESHOLD, sharing one static param.  */
+void
+tune_fac_ui (void)
+{
+  static struct param_t  param;
+
+  param.function = speed_mpz_fac_ui_tune;
+
+  param.name = "FAC_DSC_THRESHOLD";
+  param.min_size = 70;
+  param.max_size = FAC_DSC_THRESHOLD_LIMIT;
+  one (&fac_dsc_threshold, &param);
+  /* Second run: param.function and param.max_size are not reassigned here.  */
+  param.name = "FAC_ODD_THRESHOLD";
+  param.min_size = 22;
+  param.stop_factor = 1.7;
+  param.min_is_always = 1;
+  one (&fac_odd_threshold, &param);
+}
+/* Top-level driver: set up operands, report the setup, run each tuning step in a fixed order.  */
+void
+all (void)
+{
+  time_t  start_time, end_time;
+  TMP_DECL;
+  /* Set up s.xp_block and s.yp_block, then fill both with random data.  */
+  TMP_MARK;
+  SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
+  SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
+
+  mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
+  mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
+  /* Everything from here up to the "Generated by" header is written to stderr.  */
+  fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
+
+  speed_time_init ();
+  fprintf (stderr, "Using: %s\n", speed_time_string);
+
+  fprintf (stderr, "speed_precision %d", speed_precision);
+  if (speed_unittime == 1.0)
+    fprintf (stderr, ", speed_unittime 1 cycle");
+  else
+    fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
+  if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
+    fprintf (stderr, ", CPU freq unknown\n");
+  else
+    fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
+
+  fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
+           DEFAULT_MAX_SIZE, (long) option_fft_max_size);
+  fprintf (stderr, "\n");
+  /* Header comment on stdout: today's date plus the compiler this program was built with.  */
+  time (&start_time);
+  {
+    struct tm  *tp;
+    tp = localtime (&start_time);
+    printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
+            tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
+
+#ifdef __GNUC__
+    /* gcc sub-minor version doesn't seem to come through as a define */
+    printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
+#define PRINTED_COMPILER
+#endif
+#if defined (__SUNPRO_C)
+    printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
+#define PRINTED_COMPILER
+#endif
+#if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
+    /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
+    printf ("MIPSpro C %d.%d.%d */\n",
+            _COMPILER_VERSION / 100,
+            _COMPILER_VERSION / 10 % 10,
+            _COMPILER_VERSION % 10);
+#define PRINTED_COMPILER
+#endif
+#if defined (__DECC) && defined (__DECC_VER)
+    printf ("DEC C %d */\n", __DECC_VER);
+#define PRINTED_COMPILER
+#endif
+#if ! defined (PRINTED_COMPILER)
+    printf ("system compiler */\n");
+#endif
+  }
+  printf ("\n");
+  /* Tuning steps; a newline is printed on stdout between groups.  */
+  tune_divrem_1 ();
+  tune_mod_1 ();
+  tune_preinv_divrem_1 ();
+  tune_div_qr_1 ();
+#if 0
+  tune_divrem_2 ();
+#endif
+  tune_div_qr_2 ();
+  tune_divexact_1 ();
+  tune_modexact_1_odd ();
+  printf("\n");
+
+  relspeed_div_1_vs_mul_1 ();
+  printf("\n");
+
+  tune_mul_n ();
+  printf("\n");
+
+  tune_mul ();
+  printf("\n");
+
+  tune_sqr ();
+  printf("\n");
+
+  tune_mulmid ();
+  printf("\n");
+
+  tune_mulmod_bnm1 ();
+  tune_sqrmod_bnm1 ();
+  printf("\n");
+
+  tune_fft_mul ();
+  printf("\n");
+
+  tune_fft_sqr ();
+  printf ("\n");
+
+  tune_mullo ();
+  tune_sqrlo ();
+  printf("\n");
+
+  tune_dc_div ();
+  tune_dc_bdiv ();
+
+  printf("\n");
+  tune_invertappr ();
+  tune_invert ();
+  printf("\n");
+
+  tune_binvert ();
+  tune_redc ();
+  printf("\n");
+
+  tune_mu_div ();
+  tune_mu_bdiv ();
+  printf("\n");
+
+  tune_powm_sec ();
+  printf("\n");
+
+  tune_get_str ();
+  tune_set_str ();
+  printf("\n");
+
+  tune_fac_ui ();
+  printf("\n");
+
+  tune_matrix22_mul ();
+  tune_hgcd2 ();
+  tune_hgcd ();
+  tune_hgcd_appr ();
+  tune_hgcd_reduce();
+  tune_gcd_dc ();
+  tune_gcdext_dc ();
+  tune_jacobi_base ();
+  printf("\n");
+  /* Closing line on stdout: difference between end_time and start_time, printed as seconds.  */
+  time (&end_time);
+  printf ("/* Tuneup completed successfully, took %ld seconds */\n",
+          (long) (end_time - start_time));
+
+  TMP_FREE;
+}
+
+/* Parse -f, -o, -p and -t, then call all() and exit (0); a '?' from getopt exits with 1.  */
+int
+main (int argc, char *argv[])
+{
+  int  opt;
+
+  /* Unbuffered so if output is redirected to a file it isn't lost if the
+     program is killed part way through.  */
+  setbuf (stdout, NULL);
+  setbuf (stderr, NULL);
+
+  while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
+    {
+      switch (opt) {
+      case 'f':
+        if (optarg[0] == 't')  /* argument starting with 't' sets option_fft_trace, not a size */
+          option_fft_trace = 2;
+        else
+          option_fft_max_size = atol (optarg);
+        break;
+      case 'o':
+        speed_option_set (optarg);
+        break;
+      case 'p':
+        speed_precision = atoi (optarg);
+        break;
+      case 't':
+        option_trace++;
+        break;
+      case '?':
+        exit(1);
+      }
+    }
+
+  all ();
+  exit (0);
+}
diff --git a/gmp-6.3.0/tune/x86_64.asm b/gmp-6.3.0/tune/x86_64.asm
new file mode 100644
index 0000000..b7ec44c
--- /dev/null
+++ b/gmp-6.3.0/tune/x86_64.asm
@@ -0,0 +1,55 @@
+dnl x86 pentium time stamp counter access routine.
+
+dnl Copyright 1999, 2000, 2003-2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C void speed_cyclecounter (unsigned p[2]);
+C
+C Get the pentium rdtsc cycle counter, storing the least significant word in
+C p[0] and the most significant in p[1].
+C
+C cpuid is used to serialize execution. On big measurements this won't be
+C significant but it may help make small single measurements more accurate.
+
+PROLOGUE(speed_cyclecounter)
+
+	C rdi p
+
+	movq	%rbx, %r10
+	xorl	%eax, %eax
+	cpuid
+	rdtsc
+	movl	%eax, (%rdi)
+	movl	%edx, 4(%rdi)
+	movq	%r10, %rbx
+	ret
+EPILOGUE()
-- 
cgit v1.2.3