/* * alg_config - help determine optimal values for algorithm levels * * Copyright (C) 2006 Landon Curt Noll * * Calc is open software; you can redistribute it and/or modify it under * the terms of the version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. * * Calc is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General * Public License for more details. * * A copy of version 2.1 of the GNU Lesser General Public License is * distributed with calc under the filename COPYING-LGPL. You should have * received a copy with calc; if not, write to Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * @(#) $Revision: 30.1 $ * @(#) $Id: alg_config.cal,v 30.1 2007/03/16 11:09:54 chongo Exp $ * @(#) $Source: /usr/local/src/cmd/calc/cal/RCS/alg_config.cal,v $ * * Under source code control: 2006/06/07 14:10:11 * File existed as early as: 2006 * * chongo /\oo/\ http://www.isthe.com/chongo/ * Share and enjoy! :-) http://www.isthe.com/chongo/tech/comp/calc/ */ global test_time; /* try for this many seconds in loop test */ /* * mul_loop - measure the CPU time to perform a set of multiply loops * * given: * repeat number of multiply loops to perform * x array of 5 values, each the same length in BASEB-bit words * * NOTE: When their lengths are 1 BASEB-bit word, then a * dummy loop of simple constants are used. Thus the * length == 1 is an approximation of loop overhead. * * returns: * approximate runtime to perform repeat the multiply loops * * NOTE: This is an internal support function that is normally * not called directly from the command line. Call the * function best_mul2() instead. */ define mul_loop(repeat, x) { local start; /* start of execution */ local end; /* end of execution */ local answer; /* multiplicand */ local len; /* length of each element */ local baseb_bytes; /* bytes in a BASEB-bit word */ local i; /* firewall */ if (!isint(repeat) || repeat < 0) { quit "mul_loop: 1st arg: repeat must be an integer > 0"; } if (size(*x) != 5) { quit "mul_loop: 2nd arg matrix does not have 5 elements"; } if (matdim(*x) != 1) { quit "mul_loop: 2nd arg matrix is not 1 dimensional"; } if (matmin(*x, 1) != 0) { quit "mul_loop: 2nd arg matrix index range does not start with 0"; } if (matmax(*x, 1) != 4) { quit "mul_loop: 2nd arg matrix index range does not end with 4"; } baseb_bytes = config("baseb") / 8; len = sizeof((*x)[0]) / baseb_bytes; for (i=1; i < 4; ++i) { if ((sizeof((*x)[i]) / baseb_bytes) != len) { quit "mul_loop: 2nd arg matrix elements are not of equal BASEB-bit word length"; } } /* multiply pairwise, all sets of a given length */ start = usertime(); for (i=0; i < repeat; ++i) { if (len == 1) { /* we use len == 1 to test this tester loop overhead */ answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; /**/ answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; /**/ answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; /**/ answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; /**/ answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; } else { answer = (*x)[0] * (*x)[1]; answer = (*x)[0] * (*x)[2]; answer = (*x)[0] * (*x)[3]; answer = (*x)[0] * (*x)[4]; /**/ answer = (*x)[1] * (*x)[0]; answer = (*x)[1] * (*x)[2]; answer = (*x)[1] * (*x)[3]; answer = (*x)[1] * (*x)[4]; /**/ answer = (*x)[2] * (*x)[0]; answer = (*x)[2] * (*x)[1]; answer = (*x)[2] * (*x)[3]; answer = (*x)[2] * (*x)[4]; /**/ answer = (*x)[3] * (*x)[0]; answer = (*x)[3] * (*x)[1]; answer = (*x)[3] * (*x)[2]; answer = (*x)[3] * (*x)[4]; /**/ answer = (*x)[4] * (*x)[0]; answer = (*x)[4] * (*x)[1]; answer = (*x)[4] * (*x)[2]; answer = (*x)[4] * (*x)[3]; } } /* * return duration */ end = usertime(); return end-start; } /* * mul_ratio - ratio of rates of 1st and 2nd multiply algorithms * * given: * len length in BASEB-bit words to multiply * * return: * ratio of (1st / 2nd) algorithm rate * * When want to determine a rate to a precision of 1 part in 1000. * Most systems today return CPU time to at least 10 msec precision. * So to get rates to that precision, we need to time loops to at * least 1000 times as long as the precision (10 msec * 1000) * which usually requires timing of loops that last 10 seconds or more. * * NOTE: This is an internal support function that is normally * not called directly from the command line. Call the * function best_mul2() instead. */ define mul_ratio(len) { local mat x[5]; /* array of values for mul_loop to multiply */ local mat one[5]; /* array if single BASEB-bit values */ local baseb; /* calc word size in bits */ local orig_cfg; /* caller configuration */ local loops; /* number of multiply loops to time */ local tlen; /* time to perform some number of loops */ local tover; /* est of time for loop overhead */ local alg1_rate; /* loop rate of 1st algorithm */ local alg2_rate; /* loop rate of 2nd algorithm */ local i; /* * firewall */ if (!isint(len) || len < 2) { quit "mul_ratio: 1st arg: len is not an integer > 1"; } /* * remember the caller's config state */ orig_cfg = config("all"); config("mul2", 0),; config("sq2", 0),; config("pow2", 0),; config("redc2", 0),; config("tilde", 0),; /* * initialize x, the values we will multiply * * We want these tests to be repeatable as possible, so we will seed * the PRNG in a deterministic way. */ baseb = config("baseb"); srand(sha1(sha1(baseb, config("version")))); for (i=0; i < 5; ++i) { /* force the values to be a full len words long */ x[i] = ((1<<(((len-1) * baseb) + baseb-1)) | randbit(((len-1) * baseb) + baseb-2)); /* single BASEB-bit values */ one[i] = 1; } /* * determine the number of loops needed to test 1st alg */ config("mul2", 2^31-1),; loops = 1/2; do { loops *= 2; tlen = mul_loop(loops, &x); if (config("user_debug") > 3) { printf("\t alg1 loops %d took %.3f sec\n", loops, tlen); } } while (tlen < 1.0); /* * determine the 1st algorithm rate */ loops = max(1, ceil(loops * test_time / tlen)); if (loops < 8) { if (config("user_debug") > 1) { printf(" we must expand loop test time to more than %d secs\n", ceil(test_time * (8 / loops))); } loops = 8; } if (config("user_debug") > 3) { printf("\t will try alg1 %d loops\n", loops); } tlen = mul_loop(loops, &x); if (config("user_debug") > 3) { printf("\t alg1 time = %.3f secs\n", tlen); } tover = mul_loop(loops, &one); if (config("user_debug") > 3) { printf("\t alg1 overhead look %.3f secs\n", tover); } if (tlen <= tover) { quit "mul_ratio: overhead >= loop time"; } alg1_rate = loops / (tlen - tover); if (config("user_debug") > 2) { printf("\tmultiply alg1 rate = %.3f loopsets/sec\n", alg1_rate); } if (alg1_rate <= 0.0) { quit "mul_ratio: alg1 rate was <= 0.0"; } /* * determine the number of loops needed to test 1st alg */ config("mul2", 2),; loops = 1/2; do { loops *= 2; tlen = mul_loop(loops, &x); if (config("user_debug") > 3) { printf("\t alg2 loops %d took %.3f sec\n", loops, tlen); } } while (tlen < 1.0); /* * determine the 2nd algorithm rate */ loops = max(1, ceil(loops * test_time / tlen)); if (loops < 8) { if (config("user_debug") > 1) { printf(" we must expand loop test time to more than %d secs\n", ceil(test_time * (8 / loops))); } loops = 8; } tlen = mul_loop(loops, &x); if (config("user_debug") > 3) { printf("\t alg2 time = %.3f secs\n", tlen); } tover = mul_loop(loops, &one); if (config("user_debug") > 3) { printf("\t alg2 overhead look %.3f secs\n", tover); } if (tlen <= tover) { quit "mul_ratio: overhead >= loop time"; } alg2_rate = loops / (tlen - tover); if (config("user_debug") > 2) { printf("\tmultiply alg2 rate = %.3f loopsets/sec\n", alg2_rate); } if (alg2_rate <= 0.0) { quit "mul_ratio: alg2 rate was <= 0.0"; } /* * restore old config */ config("all", orig_cfg),; /* * return alg1 / alg2 rate ratio */ return (alg1_rate / alg2_rate); } /* * best_mul2 - determine the best config("mul2") parameter * * NOTE: Due to precision problems with CPU measurements, it is not * unusual for the output of this function to vary slightly * from run to run. * * NOTE: This function is designed to take a long time to run. * We recommend setting: * * config("user_debug", 2) * * so that yon can watch the progress of this function. */ define best_mul2() { local ratio; /* previously calculated alg1/alg2 ratio */ local low; /* low loop value tested */ local high; /* low loop value tested */ local expand; /* how fast to expand the length */ /* * setup */ test_time = 15.0; if (config("user_debug") > 0) { printf("will start with loop test time of %d secs\n", test_time); } /* * firewall - must have a >1 ratio for the initial length */ high = 16; if (config("user_debug") > 0) { printf("testing multiply alg1/alg2 ratio for len = %d\n", high); } ratio = mul_ratio(high); if (config("user_debug") > 1) { printf(" multiply alg1/alg2 ratio = %.3f\n", ratio); } if (ratio <= 1.0) { quit "best_mul2: tests imply config(\"mul2\") should be < 4"; } /* * expand lengths until the ratio flips */ do { /* * determine the paramters of the next ratio test * * We will multiplicatively expand our test level until * the ratio drops below 1.0. */ expand = ((ratio >= 3.5) ? 16 : 2^round(ratio)); low = high; high *= expand; if (config("user_debug") > 1) { printf(" expand the next test range by a factor of %d\n", expand); } /* * determine the alg1/alg2 test ratio for this new length */ if (high >= 2^31) { quit "best_mul2: tests imply config(\"mul2\") should be >= 2^31"; } if (config("user_debug") > 0) { printf("testing multiply alg1/alg2 ratio for len = %d\n", high); } ratio = mul_ratio(high); if (config("user_debug") > 1) { printf(" multiply alg1/alg2 ratio = %.3f\n", ratio); } } while (ratio >= 1.0); if (config("user_debug") > 0) { printf("alg1/alg2 ratio now < 1.0, starting binary search between %d and %d\n", low, high); } /* * binary search between low and high, for where ratio is just under 1.0 */ while (low+1 < high) { /* try the mid-point */ if (config("user_debug") > 0) { printf("testing multiply alg1/alg2 ratio for len = %d\n", int((low+high)/2)); } ratio = mul_ratio(int((low+high)/2)); if (config("user_debug") > 1) { printf(" multiply alg1/alg2 ratio = %.3f\n", ratio); } /* bump lower range up if we went over */ if (ratio >= 1.0) { if (config("user_debug") > 2) { printf("\tmove low from %d up to %d\n", low, int((low+high)/2)); } low = int((low+high)/2); /* drop higher range down if we went under */ } else { if (config("user_debug") > 2) { printf("\tmove high from %d down to %d\n", high, int((low+high)/2)); } high = int((low+high)/2); } } /* * return on the suggested config("mul2") value */ if (config("user_debug") > 0) { printf("best value of config(\"mul2\") is %d\n", (ratio >= 1.0) ? high : low); } return ((ratio >= 1.0) ? high : low); } /* * sq_loop - measure the CPU time to perform a set of square loops * * given: * repeat number of square loops to perform * x array of 5 values, each the same length in BASEB-bit words * * NOTE: When their lengths are 1 BASEB-bit word, then a * dummy loop of simple constants are used. Thus the * length == 1 is an approximation of loop overhead. * returns: * approximate runtime to perform a square loop * * NOTE: This is an internal support function that is normally * not called directly from the command line. Call the * function best_sq2() instead. */ define sq_loop(repeat, x) { local start; /* start of execution */ local end; /* end of execution */ local answer; /* squared value */ local len; /* length of each element */ local baseb_bytes; /* bytes in a BASEB-bit word */ local i; /* firewall */ if (!isint(repeat) || repeat < 0) { quit "sq_loop: 1st arg: repeat must be an integer > 0"; } if (size(*x) != 5) { quit "sq_loop: 2nd arg matrix does not have 5 elements"; } if (matdim(*x) != 1) { quit "sq_loop: 2nd arg matrix is not 1 dimensional"; } if (matmin(*x, 1) != 0) { quit "sq_loop: 2nd arg matrix index range does not start with 0"; } if (matmax(*x, 1) != 4) { quit "sq_loop: 2nd arg matrix index range does not end with 4"; } baseb_bytes = config("baseb") / 8; len = sizeof((*x)[0]) / baseb_bytes; for (i=1; i < 4; ++i) { if ((sizeof((*x)[i]) / baseb_bytes) != len) { quit "sq_loop: 2nd arg matrix elements are not of equal BASEB-bit word length"; } } /* square pairwise, all sets of a given length */ start = usertime(); for (i=0; i < repeat; ++i) { if (len == 1) { /* we use len == 1 to test this tester loop overhead */ answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2; /**/ answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2; /**/ answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2; /**/ answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2; } else { /* one square loop */ answer = (*x)[0]^2; answer = (*x)[1]^2; answer = (*x)[2]^2; answer = (*x)[3]^2; answer = (*x)[4]^2; /**/ answer = (*x)[0]^2; answer = (*x)[1]^2; answer = (*x)[2]^2; answer = (*x)[3]^2; answer = (*x)[4]^2; /**/ answer = (*x)[0]^2; answer = (*x)[1]^2; answer = (*x)[2]^2; answer = (*x)[3]^2; answer = (*x)[4]^2; /**/ answer = (*x)[0]^2; answer = (*x)[1]^2; answer = (*x)[2]^2; answer = (*x)[3]^2; answer = (*x)[4]^2; } } /* * return duration */ end = usertime(); return end-start; } /* * sq_ratio - ratio of rates of 1st and 2nd square algorithms * * given: * len length in BASEB-bit words to square * * return: * ratio of (1st / 2nd) algorithm rates * * When want to determine a rate to a precision of 1 part in 1000. * Most systems today return CPU time to at least 10 msec precision. * So to get rates to that precision, we need to time loops to at * least 1000 times as long as the precision (10 msec * 1000) * which usually requires timing of loops that last 10 seconds or more. * * NOTE: This is an internal support function that is normally * not called directly from the command line. Call the * function best_sq2() instead. */ define sq_ratio(len) { local mat x[5]; /* array of values for sq_loop to square */ local mat one[5]; /* array if single BASEB-bit values */ local baseb; /* calc word size in bits */ local orig_cfg; /* caller configuration */ local loops; /* number of square loops to time */ local tlen; /* time to perform some number of loops */ local tover; /* est of time for loop overhead */ local alg1_rate; /* loop rate of 1st algorithm */ local alg2_rate; /* loop rate of 2nd algorithm */ local i; /* * firewall */ if (!isint(len) || len < 2) { quit "sq_ratio: 1st arg: len is not an integer > 1"; } /* * remember the caller's config state */ orig_cfg = config("all"); config("mul2", 0),; config("sq2", 0),; config("pow2", 0),; config("redc2", 0),; config("tilde", 0),; /* * initialize x, the values we will square * * We want these tests to be repeatable as possible, so we will seed * the PRNG in a deterministic way. */ baseb = config("baseb"); srand(sha1(sha1(baseb, config("version")))); for (i=0; i < 5; ++i) { /* force the values to be a full len words long */ x[i] = ((1<<(((len-1) * baseb) + baseb-1)) | randbit(((len-1) * baseb) + baseb-2)); /* single BASEB-bit values */ one[i] = 1; } /* * determine the number of loops needed to test 1st alg */ config("sq2", 2^31-1),; loops = 1/2; do { loops *= 2; tlen = sq_loop(loops, &x); if (config("user_debug") > 3) { printf("\t alg1 loops %d took %.3f sec\n", loops, tlen); } } while (tlen < 1.0); /* * determine the 1st algorithm rate */ loops = max(1, ceil(loops * test_time / tlen)); if (loops < 8) { if (config("user_debug") > 1) { printf(" we must expand loop test time to more than %d secs\n", ceil(test_time * (8 / loops))); } loops = 8; } tlen = sq_loop(loops, &x); if (config("user_debug") > 3) { printf("\t alg1 time = %.3f secs\n", tlen); } tover = sq_loop(loops, &one); if (config("user_debug") > 3) { printf("\t alg1 overhead look %.3f secs\n", tover); } if (tlen <= tover) { quit "sq_ratio: overhead >= loop time"; } alg1_rate = loops / (tlen - tover); if (config("user_debug") > 2) { printf("\tsquare alg1 rate = %.3f loopsets/sec\n", alg1_rate); } if (alg1_rate <= 0.0) { quit "sq_ratio: alg1 rate was <= 0.0"; } /* * determine the number of loops needed to test 1st alg */ config("sq2", 2),; loops = 1/2; do { loops *= 2; tlen = sq_loop(loops, &x); if (config("user_debug") > 3) { printf("\t alg2 loops %d took %.3f sec\n", loops, tlen); } } while (tlen < 1.0); /* * determine the 2nd algorithm rate */ loops = max(1, ceil(loops * test_time / tlen)); if (loops < 8) { if (config("user_debug") > 1) { printf(" we must expand loop test time to more than %d secs\n", ceil(test_time * (8 / loops))); } loops = 8; } tlen = sq_loop(loops, &x); if (config("user_debug") > 3) { printf("\t alg2 time = %.3f secs\n", tlen); } tover = sq_loop(loops, &one); if (config("user_debug") > 3) { printf("\t alg2 overhead look %.3f secs\n", tover); } if (tlen <= tover) { quit "sq_ratio: overhead >= loop time"; } alg2_rate = loops / (tlen - tover); if (config("user_debug") > 2) { printf("\tsquare alg2 rate = %.3f loopsets/sec\n", alg2_rate); } if (alg2_rate <= 0.0) { quit "sq_ratio: alg2 rate was <= 0.0"; } /* * restore old config */ config("all", orig_cfg),; /* * return alg1 / alg2 rate ratio */ return (alg1_rate / alg2_rate); } /* * best_sq2 - determine the best config("sq2") parameter * * NOTE: Due to precision problems with CPU measurements, it is not * unusual for the output of this function to vary slightly * from run to run. * * NOTE: This function is designed to take a long time to run. * We recommend setting: * * config("user_debug", 2) * * so that yon can watch the progress of this function. */ define best_sq2() { local ratio; /* previously calculated alg1/alg2 ratio */ local low; /* low loop value tested */ local high; /* low loop value tested */ local expand; /* how fast to expand the length */ /* * setup */ test_time = 15.0; if (config("user_debug") > 0) { printf("will start with loop test time of %d secs\n", test_time); } /* * firewall - must have a >1 ratio for the initial length */ high = 16; if (config("user_debug") > 0) { printf("testing square alg1/alg2 ratio for len = %d\n", high); } ratio = sq_ratio(high); if (config("user_debug") > 1) { printf(" square alg1/alg2 ratio = %.3f\n", ratio); } if (ratio <= 1.0) { quit "best_sq2: tests imply config(\"sq2\") should be < 4"; } /* * expand lengths until the ratio flips */ do { /* * determine the paramters of the next ratio test * * We will multiplicatively expand our test level until * the ratio drops below 1.0. */ expand = ((ratio >= 3.5) ? 16 : 2^round(ratio)); low = high; high *= expand; if (config("user_debug") > 1) { printf(" expand the next test range by a factor of %d\n", expand); } /* * determine the alg1/alg2 test ratio for this new length */ if (high >= 2^31) { quit "best_sq2: tests imply config(\"sq2\") should be >= 2^31"; } if (config("user_debug") > 0) { printf("testing square alg1/alg2 ratio for len = %d\n", high); } ratio = sq_ratio(high); if (config("user_debug") > 1) { printf(" square alg1/alg2 ratio = %.3f\n", ratio); } } while (ratio >= 1.0); if (config("user_debug") > 0) { printf("alg1/alg2 ratio now < 1.0, starting binary search between %d and %d\n", low, high); } /* * binary search between low and high, for where ratio is just under 1.0 */ while (low+1 < high) { /* try the mid-point */ if (config("user_debug") > 0) { printf("testing square alg1/alg2 ratio for len = %d\n", int((low+high)/2)); } ratio = sq_ratio(int((low+high)/2)); if (config("user_debug") > 1) { printf(" square alg1/alg2 ratio = %.3f\n", ratio); } /* bump lower range up if we went over */ if (ratio >= 1.0) { if (config("user_debug") > 2) { printf("\tmove low from %d up to %d\n", low, int((low+high)/2)); } low = int((low+high)/2); /* drop higher range down if we went under */ } else { if (config("user_debug") > 2) { printf("\tmove high from %d down to %d\n", high, int((low+high)/2)); } high = int((low+high)/2); } } /* * return on the suggested config("sq2") value */ if (config("user_debug") > 0) { printf("best value of config(\"sq2\") is %d\n", (ratio >= 1.0) ? high : low); } return ((ratio >= 1.0) ? high : low); } /* * pow_loop - measure the CPU time to perform a set of pmod loops * * given: * repeat number of pmod loops to perform * x array of 5 values, each the same length in BASEB-bit words * * NOTE: When their lengths are 1 BASEB-bit word, then a * dummy loop of simple constants are used. Thus the * length == 1 is an approximation of loop overhead. * * ex exponent for pmod value * * returns: * approximate runtime to perform a pmod loop * * NOTE: This is an internal support function that is normally * not called directly from the command line. Call the * function best_pow2() instead. */ define pow_loop(repeat, x, ex) { local start; /* start of execution */ local end; /* end of execution */ local answer; /* pmod value */ local len; /* length of each element */ local baseb_bytes; /* bytes in a BASEB-bit word */ local i; /* firewall */ if (!isint(repeat) || repeat < 0) { quit "pow_loop: 1st arg: repeat must be an integer > 0"; } if (size(*x) != 5) { quit "pow_loop: 2nd arg matrix does not have 5 elements"; } if (matdim(*x) != 1) { quit "pow_loop: 2nd arg matrix is not 1 dimensional"; } if (matmin(*x, 1) != 0) { quit "pow_loop: 2nd arg matrix index range does not start with 0"; } if (matmax(*x, 1) != 4) { quit "pow_loop: 2nd arg matrix index range does not end with 4"; } baseb_bytes = config("baseb") / 8; len = sizeof((*x)[0]) / baseb_bytes; for (i=1; i < 4; ++i) { if ((sizeof((*x)[i]) / baseb_bytes) != len) { quit "pow_loop: 2nd arg matrix elements are not of equal BASEB-bit word length"; } } if (!isint(ex) || ex < 3) { quit" pow_loop: 3rd arg ex is not an integer > 2"; } /* pmod pairwise, all sets of a given length */ start = usertime(); for (i=0; i < repeat; ++i) { if (len == 1) { /* we use len == 1 to test this tester loop overhead */ answer = pmod(0,0,0); answer = pmod(0,0,0); answer = pmod(0,0,0); answer = pmod(0,0,0); /**/ answer = pmod(0,0,0); answer = pmod(0,0,0); answer = pmod(0,0,0); answer = pmod(0,0,0); /**/ answer = pmod(0,0,0); answer = pmod(0,0,0); answer = pmod(0,0,0); answer = pmod(0,0,0); /**/ answer = pmod(0,0,0); answer = pmod(0,0,0); answer = pmod(0,0,0); answer = pmod(0,0,0); /**/ answer = pmod(0,0,0); answer = pmod(0,0,0); answer = pmod(0,0,0); answer = pmod(0,0,0); /**/ answer = pmod(0,0,0); answer = pmod(0,0,0); answer = pmod(0,0,0); answer = pmod(0,0,0); } else { answer = pmod((*x)[0], ex, (*x)[1]); answer = pmod((*x)[0], ex, (*x)[2]); answer = pmod((*x)[0], ex, (*x)[3]); answer = pmod((*x)[0], ex, (*x)[4]); /**/ answer = pmod((*x)[1], ex, (*x)[0]); answer = pmod((*x)[1], ex, (*x)[2]); answer = pmod((*x)[1], ex, (*x)[3]); answer = pmod((*x)[1], ex, (*x)[4]); /**/ answer = pmod((*x)[2], ex, (*x)[0]); answer = pmod((*x)[2], ex, (*x)[1]); answer = pmod((*x)[2], ex, (*x)[3]); answer = pmod((*x)[2], ex, (*x)[4]); /**/ answer = pmod((*x)[3], ex, (*x)[0]); answer = pmod((*x)[3], ex, (*x)[1]); answer = pmod((*x)[3], ex, (*x)[2]); answer = pmod((*x)[3], ex, (*x)[4]); /**/ answer = pmod((*x)[4], ex, (*x)[0]); answer = pmod((*x)[4], ex, (*x)[1]); answer = pmod((*x)[4], ex, (*x)[2]); answer = pmod((*x)[4], ex, (*x)[3]); } } /* * return duration */ end = usertime(); return end-start; } /* * pow_ratio - ratio of rates of 1st and 2nd pmod algorithms * * given: * len length in BASEB-bit words to pmod * * return: * ratio of (1st / 2nd) algorithm rates * * When want to determine a rate to a precision of 1 part in 1000. * Most systems today return CPU time to at least 10 msec precision. * So to get rates to that precision, we need to time loops to at * least 1000 times as long as the precision (10 msec * 1000) * which usually requires timing of loops that last 10 seconds or more. * * NOTE: This is an internal support function that is normally * not called directly from the command line. Call the * function best_pow2() instead. */ define pow_ratio(len) { local mat x[5]; /* array of values for pow_loop to pmod */ local mat one[5]; /* array if single BASEB-bit values */ local baseb; /* calc word size in bits */ local orig_cfg; /* caller configuration */ local loops; /* number of pmod loops to time */ local tlen; /* time to perform some number of loops */ local tover; /* est of time for loop overhead */ local alg1_rate; /* loop rate of 1st algorithm */ local alg2_rate; /* loop rate of 2nd algorithm */ local ex; /* exponent to use in pow_loop() */ local i; /* * firewall */ if (!isint(len) || len < 2) { quit "pow_ratio: 1st arg: len is not an integer > 1"; } /* * remember the caller's config state */ orig_cfg = config("all"); config("mul2", 0),; config("sq2", 0),; config("pow2", 0),; config("redc2", 0),; config("tilde", 0),; /* * setup */ ex = 5; /* * initialize x, the values we will pmod * * We want these tests to be repeatable as possible, so we will seed * the PRNG in a deterministic way. */ baseb = config("baseb"); srand(sha1(sha1(ex, baseb, config("version")))); for (i=0; i < 5; ++i) { /* force the values to be a full len words long */ x[i] = ((1<<(((len-1) * baseb) + baseb-1)) | randbit(((len-1) * baseb) + baseb-2)); /* single BASEB-bit values */ one[i] = 1; } /* * determine the number of loops needed to test 1st alg */ config("pow2", 2^31-1),; config("redc2", 2^31-1),; loops = 1/2; do { loops *= 2; tlen = pow_loop(loops, &x, ex); if (config("user_debug") > 3) { printf("\t alg1 loops %d took %.3f sec\n", loops, tlen); } } while (tlen < 1.0); /* * determine the 1st algorithm rate */ loops = max(1, ceil(loops * test_time / tlen)); if (loops < 8) { if (config("user_debug") > 1) { printf(" we must expand loop test time to more than %d secs\n", ceil(test_time * (8 / loops))); } loops = 8; } tlen = pow_loop(loops, &x, ex); if (config("user_debug") > 3) { printf("\t alg1 time = %.3f secs\n", tlen); } tover = pow_loop(loops, &one, ex); if (config("user_debug") > 3) { printf("\t alg1 overhead look %.3f secs\n", tover); } if (tlen <= tover) { quit "pow_ratio: overhead >= loop time"; } alg1_rate = loops / (tlen - tover); if (config("user_debug") > 2) { printf("\tpmod alg1 rate = %.3f loopsets/sec\n", alg1_rate); } if (alg1_rate <= 0.0) { quit "pow_ratio: alg1 rate was <= 0.0"; } /* * determine the number of loops needed to test 1st alg */ config("pow2", 2),; config("redc2", 2^31-1),; loops = 1/2; do { loops *= 2; tlen = pow_loop(loops, &x, ex); if (config("user_debug") > 3) { printf("\t alg2 loops %d took %.3f sec\n", loops, tlen); } } while (tlen < 1.0); /* * determine the 2nd algorithm rate */ loops = max(1, ceil(loops * test_time / tlen)); if (loops < 8) { if (config("user_debug") > 1) { printf(" we must expand loop test time to more than %d secs\n", ceil(test_time * (8 / loops))); } loops = 8; } tlen = pow_loop(loops, &x, ex); if (config("user_debug") > 3) { printf("\t alg2 time = %.3f secs\n", tlen); } tover = pow_loop(loops, &one, ex); if (config("user_debug") > 3) { printf("\t alg2 overhead look %.3f secs\n", tover); } if (tlen <= tover) { quit "pow_ratio: overhead >= loop time"; } alg2_rate = loops / (tlen - tover); if (config("user_debug") > 2) { printf("\tpmod alg2 rate = %.3f loopsets/sec\n", alg2_rate); } if (alg2_rate <= 0.0) { quit "pow_ratio: alg2 rate was <= 0.0"; } /* * restore old config */ config("all", orig_cfg),; /* * return alg1 / alg2 rate ratio */ return (alg1_rate / alg2_rate); } /* * best_pow2 - determine the best config("pow2") parameter w/o REDC2 * * NOTE: Due to precision problems with CPU measurements, it is not * unusual for the output of this function to vary slightly * from run to run. * * NOTE: This function is designed to take a long time to run. * We recommend setting: * * config("user_debug", 2) * * so that yon can watch the progress of this function. */ define best_pow2() { local ratio; /* previously calculated alg1/alg2 ratio */ local low; /* low loop value tested */ local high; /* low loop value tested */ local expand; /* how fast to expand the length */ local looped; /* 1 ==> we have expanded lengths before */ /* * setup */ test_time = 15.0; if (config("user_debug") > 0) { printf("will start with loop test time of %d secs\n", test_time); } /* * firewall - must have a >1.02 ratio for the initial length * * We select 1.02 because of the precision of the CPU timing. We * want to firt move into an area where the 1st algoritm clearly * dominates. */ low = 4; high = 4; do { high *= 4; if (config("user_debug") > 0) { printf("testing pmod alg1/alg2 ratio for len = %d\n", high); } ratio = pow_ratio(high); if (config("user_debug") > 1) { printf(" pmod alg1/alg2 ratio = %.3f\n", ratio); if (ratio > 1.0 && ratio <= 1.02) { printf(" while alg1 is slightly better than alg2, it is not clearly better\n"); } } } while (ratio <= 1.02); if (config("user_debug") > 0) { printf("starting the pow2 search above %d\n", high); } /* * expand lengths until the ratio flips */ looped = 0; do { /* * determine the paramters of the next ratio test * * We will multiplicatively expand our test level until * the ratio drops below 1.0. * * NOTE: At low lengths, the ratios seen to be very small * so we force an expansion of 4 to speed us on our * way to larger lengths. At these somewhat larger * lengths, the ratios usually don't get faster than * 1.25, so we need to expand force a more rapid * expansion than normal. At lengths longer than * 2k, the time to test becomes very long, so we * want to slow down at these higher lengths. */ expand = 2; if (looped) { low = high; } high *= expand; if (config("user_debug") > 1) { printf(" expand the next test range by a factor of %d\n", expand); } /* * determine the alg1/alg2 test ratio for this new length */ if (high >= 2^31) { quit "best_pow2: tests imply config(\"pow2\") should be >= 2^31"; } if (config("user_debug") > 0) { printf("testing pmod alg1/alg2 ratio for len = %d\n", high); } ratio = pow_ratio(high); if (config("user_debug") > 1) { printf(" pmod alg1/alg2 ratio = %.3f\n", ratio); } looped = 1; } while (ratio >= 1.0); if (config("user_debug") > 0) { printf("alg1/alg2 ratio now < 1.0, starting binary search between %d and %d\n", low, high); } /* * binary search between low and high, for where ratio is just under 1.0 */ while (low+1 < high) { /* try the mid-point */ if (config("user_debug") > 0) { printf("testing pmod alg1/alg2 ratio for len = %d\n", int((low+high)/2)); } ratio = pow_ratio(int((low+high)/2)); if (config("user_debug") > 1) { printf(" pmod alg1/alg2 ratio = %.3f\n", ratio); } /* bump lower range up if we went over */ if (ratio >= 1.0) { if (config("user_debug") > 2) { printf("\tmove low from %d up to %d\n", low, int((low+high)/2)); } low = int((low+high)/2); /* drop higher range down if we went under */ } else { if (config("user_debug") > 2) { printf("\tmove high from %d down to %d\n", high, int((low+high)/2)); } high = int((low+high)/2); } } /* * return on the suggested config("pow2") value */ if (config("user_debug") > 0) { printf("best value of config(\"pow2\") is %d\n", (ratio >= 1.0) ? high : low); } return ((ratio >= 1.0) ? high : low); }