/*
 * alg_config - help determine optimal values for algorithm levels
 *
 * Copyright (C) 2006,2014 Landon Curt Noll
 *
 * Calc is open software; you can redistribute it and/or modify it under
 * the terms of the version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * Calc is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General
 * Public License for more details.
 *
 * A copy of version 2.1 of the GNU Lesser General Public License is
 * distributed with calc under the filename COPYING-LGPL. You should have
 * received a copy with calc; if not, write to Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * @(#) $Revision: 30.11 $
 * @(#) $Id: alg_config.cal,v 30.11 2014/09/07 06:13:04 chongo Exp $
 * @(#) $Source: /usr/local/src/bin/calc/cal/RCS/alg_config.cal,v $
 *
 * Under source code control: 2006/06/07 14:10:11
 * File existed as early as: 2006
 *
 * chongo /\oo/\ http://www.isthe.com/chongo/
 * Share and enjoy! :-) http://www.isthe.com/chongo/tech/comp/calc/
 */


/*
 * test_time is assigned by the best_*() functions and read by the
 * *_ratio() functions when they size their timed loops.
 */
static test_time;       /* try for this many seconds in loop test */


/*
 * close_to_one - set to 1 if the ratio is close enough to 1
 *
 * given:
 *      ratio   the ratio of time between two algorithms
 *
 * returns:
 *      1       When ratio is near 1.0
 *      0       otherwise
 *
 * We consider the range [0.995, 1.005] to be close enough to 1 to be
 * called unity because of the precision of the CPU timing.
*/
define close_to_one(ratio)
{
    /* firewall */
    if (!isreal(ratio)) {
        quit "close: 1st arg: must be a real number";
    }

    /* anything inside the closed interval [0.995, 1.005] counts as unity */
    if ((ratio >= 0.995) && (ratio <= 1.005)) {
        return 1;
    }
    return 0;
}


/*
 * mul_loop - measure the CPU time to perform a set of multiply loops
 *
 * given:
 *      repeat  number of multiply loops to perform (0 is accepted and
 *              simply times an empty loop)
 *      x       address of a 1-dimensional matrix indexed 0..4 whose 5
 *              values all have the same length in BASEB-bit words
 *
 *              NOTE: When their lengths are 1 BASEB-bit word, then a
 *                    dummy loop of simple constants is used.  Thus the
 *                    length == 1 is an approximation of loop overhead.
 *
 * returns:
 *      difference between the usertime() readings taken just before and
 *      just after the repeat multiply loops
 *
 * Each loop performs 20 multiplies: every ordered pair of distinct
 * elements of x.  The products are discarded; only the time matters.
 *
 * NOTE: This is an internal support function that is normally
 *       not called directly from the command line.  Call the
 *       function best_mul2() instead.
 */
define mul_loop(repeat, x)
{
    local start;        /* start of execution */
    local end;          /* end of execution */
    local answer;       /* product, computed only to be timed */
    local len;          /* length of each element */
    local baseb_bytes;  /* bytes in a BASEB-bit word */
    local i;

    /* firewall */
    if (!isint(repeat) || repeat < 0) {
        quit "mul_loop: 1st arg: repeat must be an integer >= 0";
    }
    if (size(*x) != 5) {
        quit "mul_loop: 2nd arg matrix does not have 5 elements";
    }
    if (matdim(*x) != 1) {
        quit "mul_loop: 2nd arg matrix is not 1 dimensional";
    }
    if (matmin(*x, 1) != 0) {
        quit "mul_loop: 2nd arg matrix index range does not start with 0";
    }
    if (matmax(*x, 1) != 4) {
        quit "mul_loop: 2nd arg matrix index range does not end with 4";
    }

    /* every element, including the last one, must match element 0 */
    baseb_bytes = config("baseb") / 8;
    len = sizeof((*x)[0]) / baseb_bytes;
    for (i=1; i < 5; ++i) {
        if ((sizeof((*x)[i]) / baseb_bytes) != len) {
            quit "mul_loop: 2nd arg matrix elements are not of "
                 "equal BASEB-bit word length";
        }
    }

    /* multiply pairwise, all sets of a given length */
    start = usertime();
    for (i=0; i < repeat; ++i) {
        if (len == 1) {
            /* we use len == 1 to test this tester loop overhead */
            answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
            /**/
            answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
            /**/
            answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
            /**/
            answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
            /**/
            answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
        } else {
            answer = (*x)[0] * (*x)[1];
            answer = (*x)[0] * (*x)[2];
            answer = (*x)[0] * (*x)[3];
            answer = (*x)[0] * (*x)[4];
            /**/
            answer = (*x)[1] * (*x)[0];
            answer = (*x)[1] * (*x)[2];
            answer = (*x)[1] * (*x)[3];
            answer = (*x)[1] * (*x)[4];
            /**/
            answer = (*x)[2] * (*x)[0];
            answer = (*x)[2] * (*x)[1];
            answer = (*x)[2] * (*x)[3];
            answer = (*x)[2] * (*x)[4];
            /**/
            answer = (*x)[3] * (*x)[0];
            answer = (*x)[3] * (*x)[1];
            answer = (*x)[3] * (*x)[2];
            answer = (*x)[3] * (*x)[4];
            /**/
            answer = (*x)[4] * (*x)[0];
            answer = (*x)[4] * (*x)[1];
            answer = (*x)[4] * (*x)[2];
            answer = (*x)[4] * (*x)[3];
        }
    }

    /*
     * return duration
     */
    end = usertime();
    return end-start;
}


/*
 * mul_ratio - ratio of rates of 1st and 2nd multiply algorithms
 *
 * given:
 *      len     length in BASEB-bit words to multiply
 *
 * return:
 *      ratio of (1st / 2nd) algorithm rate.
 *
 * We want to determine a rate to a precision of 1 part in 1000.
 * Most systems today return CPU time to at least 10 msec precision.
 * So to get rates to that precision, we need to time loops to at
 * least 1000 times as long as the precision (10 msec * 1000)
 * which usually requires timing of loops that last 10 seconds or more.
 *
 * NOTE: This is an internal support function that is normally
 *       not called directly from the command line.  Call the
 *       function best_mul2() instead.
*/
define mul_ratio(len)
{
    local mat x[5];     /* array of values for mul_loop to multiply */
    local mat one[5];   /* array of single BASEB-bit values */
    local baseb;        /* calc word size in bits */
    local orig_cfg;     /* caller configuration */
    local loops;        /* number of multiply loops to time */
    local tlen;         /* time to perform some number of loops */
    local tover;        /* est of time for loop overhead */
    local alg1_rate;    /* loop rate of 1st algorithm */
    local alg2_rate;    /* loop rate of 2nd algorithm */
    local ret;          /* return ratio */
    local i;

    /*
     * firewall
     */
    if (!isint(len) || len < 2) {
        quit "mul_ratio: 1st arg: len is not an integer > 1";
    }

    /*
     * remember the caller's config state
     *
     * Everything changed below is put back before this function returns
     * or gives up with a quit.
     */
    orig_cfg = config("all");
    config("mul2", 0),;
    config("sq2", 0),;
    config("pow2", 0),;
    config("redc2", 0),;
    config("tilde", 0),;

    /*
     * initialize x, the values we will multiply
     *
     * We want these tests to be repeatable as possible, so we will seed
     * the PRNG in a deterministic way.
     */
    baseb = config("baseb");
    srand(sha1(sha1(baseb, config("version"))));
    for (i=0; i < 5; ++i) {
        /* force the values to be a full len words long */
        x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
                randbit(((len-1) * baseb) + baseb-2));
        /* single BASEB-bit values */
        one[i] = 1;
    }

    /*
     * determine the number of loops needed to test 1st alg
     *
     * Double the loop count until one timing run lasts at least 1 second.
     * NOTE(review): mul2 = 2^31-1 presumably keeps the alternate multiply
     * from ever being selected - confirm against the config help.
     */
    config("mul2", 2^31-1),;
    loops = 1/2;
    do {
        loops *= 2;
        tlen = mul_loop(loops, &x);
        if (config("user_debug") > 3) {
            printf("\t alg1 loops %d took %.3f sec\n", loops, tlen);
        }
    } while (tlen < 1.0);

    /*
     * determine the 1st algorithm rate
     *
     * Scale the loop count to last about test_time seconds, but never
     * time fewer than 16 loops.
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
        if (config("user_debug") > 1) {
            printf(" we must expand alg1 loop test time to about %d secs\n",
                   ceil(test_time * (16 / loops)));
        }
        loops = 16;
    }
    if (config("user_debug") > 3) {
        printf("\t will try alg1 %d loops\n", loops);
    }
    tlen = mul_loop(loops, &x);
    if (config("user_debug") > 3) {
        printf("\t alg1 time = %.3f secs\n", tlen);
    }
    tover = mul_loop(loops, &one);
    if (config("user_debug") > 3) {
        printf("\t alg1 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
        config("all", orig_cfg),;
        quit "mul_ratio: overhead >= loop time";
    }
    alg1_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
        printf("\tmultiply alg1 rate = %.3f loopsets/sec\n", alg1_rate);
    }
    if (alg1_rate <= 0.0) {
        config("all", orig_cfg),;
        quit "mul_ratio: alg1 rate was <= 0.0";
    }

    /*
     * determine the number of loops needed to test 2nd alg
     */
    config("mul2", 2),;
    loops = 1/2;
    do {
        loops *= 2;
        tlen = mul_loop(loops, &x);
        if (config("user_debug") > 3) {
            printf("\t alg2 loops %d took %.3f sec\n", loops, tlen);
        }
    } while (tlen < 1.0);

    /*
     * determine the 2nd algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
        if (config("user_debug") > 1) {
            printf(" we must expand alg2 loop test time to about %d secs\n",
                   ceil(test_time * (16 / loops)));
        }
        loops = 16;
    }
    tlen = mul_loop(loops, &x);
    if (config("user_debug") > 3) {
        printf("\t alg2 time = %.3f secs\n", tlen);
    }
    tover = mul_loop(loops, &one);
    if (config("user_debug") > 3) {
        printf("\t alg2 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
        config("all", orig_cfg),;
        quit "mul_ratio: overhead >= loop time";
    }
    alg2_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
        printf("\tmultiply alg2 rate = %.3f loopsets/sec\n", alg2_rate);
    }
    if (alg2_rate <= 0.0) {
        config("all", orig_cfg),;
        quit "mul_ratio: alg2 rate was <= 0.0";
    }

    /*
     * restore old config
     */
    config("all", orig_cfg),;

    /*
     * return alg1 / alg2 rate ratio
     */
    ret = alg1_rate / alg2_rate;
    if (config("user_debug") > 2) {
        printf("\tprecise ratio is: %.6f mul_ratio will return: %.3f\n",
               alg1_rate / alg2_rate, ret);
    }
    return ret;
}


/*
 * best_mul2 - determine the best config("mul2") parameter
 *
 * NOTE: Due to precision problems with CPU measurements, it is not
 *       unusual for the output of this function to vary slightly
 *       from run to run.
 *
 * NOTE: This function is designed to take a long time to run.
 *       We recommend setting:
 *
 *              config("user_debug", 2)
 *
 *       so that you can watch the progress of this function.
*/
define best_mul2()
{
    local ratio;        /* previously calculated alg1/alg2 ratio */
    local low;          /* low loop value tested */
    local high;         /* high loop value tested */
    local mid;          /* between low and high */
    local best_val;     /* value found with ratio closest to unity */
    local best_ratio;   /* closest ratio found to unity */
    local expand;       /* how fast to expand the length */

    /*
     * setup
     */
    test_time = 30.0;
    printf("The best_mul2() function will take a LONG time to run!\n");
    printf("It is important that best_mul2() run on an othwewise idle host!\n");
    if (config("user_debug") <= 0) {
        printf("To monitor progress, set user_debug to 2: "
               "config(\"user_debug\", 2)\n");
    }
    printf("Starting with loop test time of %d secs\n", test_time);

    /*
     * firewall - must have a >1 ratio for the initial length
     */
    high = 16;
    best_val = high;
    if (config("user_debug") > 0) {
        printf("testing multiply alg1/alg2 ratio for len = %d\n", high);
    }
    ratio = mul_ratio(high);
    best_ratio = ratio;
    if (config("user_debug") > 1) {
        printf(" multiply alg1/alg2 ratio = %.6f\n", ratio);
    }
    if (ratio < 1.0) {
        quit "best_mul2: tests imply mul2 should be < 16, which seems bogus";
    }

    /*
     * expand lengths until the ratio flips
     */
    do {
        /*
         * determine the parameters of the next ratio test
         *
         * We will multiplicatively expand our test level until
         * the ratio drops below 1.0.
         */
        expand = ((ratio >= 10) ? 1024 : 2^round(ratio));
        low = high;
        high *= expand;
        if (config("user_debug") > 1) {
            printf(" expand the next test range by a factor of %d\n",
                   expand);
        }

        /*
         * determine the alg1/alg2 test ratio for this new length
         */
        if (high >= 2^31) {
            quit "best_mul2: test implies mul2 >= 2^31, which seems bogus";
        }
        if (config("user_debug") > 0) {
            printf("testing multiply alg1/alg2 ratio for len = %d\n", high);
        }
        ratio = mul_ratio(high);
        if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
            best_val = high;
            best_ratio = ratio;
            if (config("user_debug") > 1) {
                printf(" len %d has a new cloest ratio to unity: %.6f\n",
                       best_val, best_ratio);
            }
        }
        if (config("user_debug") > 1) {
            printf(" multiply alg1/alg2 ratio = %.6f\n", ratio);
        }
    } while (ratio > 1.0);

    /*
     * If we previously expanded more than by a factor of 2, then
     * we may have jumped over the crossover point.  So now
     * drop down powers of two until the ratio is again >= 1.0
     */
    if (expand > 2) {
        do {
            /*
             * contract by 2
             */
            high /= 2;
            low = high / 2;
            if (config("user_debug") > 0) {
                printf("retesting multiply alg1/alg2 ratio for len = %d\n",
                       high);
            }
            ratio = mul_ratio(high);
            if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
                best_val = high;
                best_ratio = ratio;
                if (config("user_debug") > 1) {
                    printf(" len %d has a new cloest ratio to unity: %.6f\n",
                           best_val, best_ratio);
                }
            }
            if (config("user_debug") > 1) {
                printf(" multiply alg1/alg2 ratio = %.6f\n", ratio);
            }
        } while (ratio <= 1.0);

        /* now that the ratio flipped again, use the previous range */
        low = high;
        high = high * 2;
    }
    if (config("user_debug") > 0) {
        printf("Starting binary search between %d and %d\n", low, high);
    }

    /*
     * binary search between low and high, for where ratio is just under 1.0
     */
    while (low+1 < high) {

        /* try the mid-point */
        mid = int((low+high)/2);
        if (config("user_debug") > 0) {
            printf("testing multiply alg1/alg2 ratio for len = %d\n", mid);
        }
        ratio = mul_ratio(mid);
        if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
            best_val = mid;
            best_ratio = ratio;
            if (config("user_debug") > 1) {
                printf(" len %d has a new cloest ratio to unity: %.6f\n",
                       best_val, best_ratio);
            }
        }
        if (config("user_debug") > 1) {
            printf(" len %d multiply alg1/alg2 ratio = %.6f\n", mid, ratio);
        }

        /* stop search if near unity */
        if (close_to_one(ratio)) {
            low = mid;
            high = mid;
            if (config("user_debug") > 0) {
                printf("\twe are close enough to unity ratio at: %d\n", mid);
            }
            break;
        }

        /* bump lower range up if we went over */
        if (ratio > 1.0) {
            if (config("user_debug") > 2) {
                printf("\tmove low from %d up to %d\n", low, mid);
            }
            low = mid;

        /* drop higher range down if we went under */
        } else {
            if (config("user_debug") > 2) {
                printf("\tmove high from %d down to %d\n", high, mid);
            }
            high = mid;
        }

        /* report on test loop progress */
        if (config("user_debug") > 1) {
            printf("\tsetting low: %d high: %d diff: %d\n",
                   low, high, high-low);
        }
    }

    /*
     * return on the suggested config("mul2") value
     *
     * Recompute mid from the final range, as best_sq2() does, so the
     * value returned never depends on which midpoint happened to be
     * tested last.
     */
    mid = int((low+high)/2);
    if (config("user_debug") > 0) {
        printf("Best value for multiply is near %d\n", best_val);
        printf("Best multiply alg1/alg2 ratio is: %.6f\n", best_ratio);
        printf("We suggest placing this line in your .calcrc:\n");
        printf("config(\"mul2\", %d),;\n", best_val);
        printf("WARNING: It is believed that the output "
               "of this resource file is bogus!\n");
        printf("WARNING: You may NOT wish to follow the above suggeston.\n");
    }
    return mid;
}


/*
 * sq_loop - measure the CPU time to perform a set of square loops
 *
 * given:
 *      repeat  number of square loops to perform
 *      x       array of 5 values, each the same length in BASEB-bit words
 *
 *              NOTE: When their lengths are 1 BASEB-bit word, then a
 *                    dummy loop of simple constants is used.  Thus the
 *                    length == 1 is an approximation of loop overhead.
 * returns:
 *      approximate runtime to perform a square loop
 *
 * NOTE: This is an internal support function that is normally
 *       not called directly from the command line.  Call the
 *       function best_sq2() instead.
*/
define sq_loop(repeat, x)
{
    local start;        /* start of execution */
    local end;          /* end of execution */
    local answer;       /* squared value, computed only to be timed */
    local len;          /* length of each element */
    local baseb_bytes;  /* bytes in a BASEB-bit word */
    local i;

    /* firewall */
    if (!isint(repeat) || repeat < 0) {
        quit "sq_loop: 1st arg: repeat must be an integer >= 0";
    }
    if (size(*x) != 5) {
        quit "sq_loop: 2nd arg matrix does not have 5 elements";
    }
    if (matdim(*x) != 1) {
        quit "sq_loop: 2nd arg matrix is not 1 dimensional";
    }
    if (matmin(*x, 1) != 0) {
        quit "sq_loop: 2nd arg matrix index range does not start with 0";
    }
    if (matmax(*x, 1) != 4) {
        quit "sq_loop: 2nd arg matrix index range does not end with 4";
    }

    /* every element, including the last one, must match element 0 */
    baseb_bytes = config("baseb") / 8;
    len = sizeof((*x)[0]) / baseb_bytes;
    for (i=1; i < 5; ++i) {
        if ((sizeof((*x)[i]) / baseb_bytes) != len) {
            quit "sq_loop: 2nd arg matrix elements are not of equal "
                 "BASEB-bit word length";
        }
    }

    /* square each value, 4 passes over all 5 values per loop */
    start = usertime();
    for (i=0; i < repeat; ++i) {
        if (len == 1) {
            /* we use len == 1 to test this tester loop overhead */
            answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
            answer = 0^2;
            /**/
            answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
            answer = 0^2;
            /**/
            answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
            answer = 0^2;
            /**/
            answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
            answer = 0^2;
        } else {
            /* one square loop */
            answer = (*x)[0]^2;
            answer = (*x)[1]^2;
            answer = (*x)[2]^2;
            answer = (*x)[3]^2;
            answer = (*x)[4]^2;
            /**/
            answer = (*x)[0]^2;
            answer = (*x)[1]^2;
            answer = (*x)[2]^2;
            answer = (*x)[3]^2;
            answer = (*x)[4]^2;
            /**/
            answer = (*x)[0]^2;
            answer = (*x)[1]^2;
            answer = (*x)[2]^2;
            answer = (*x)[3]^2;
            answer = (*x)[4]^2;
            /**/
            answer = (*x)[0]^2;
            answer = (*x)[1]^2;
            answer = (*x)[2]^2;
            answer = (*x)[3]^2;
            answer = (*x)[4]^2;
        }
    }

    /*
     * return duration
     */
    end = usertime();
    return end-start;
}


/*
 * sq_ratio - ratio of rates of 1st and 2nd square algorithms
 *
 * given:
 *      len     length in BASEB-bit
words to square
 *
 * return:
 *      ratio of (1st / 2nd) algorithm rates
 *
 * We want to determine a rate to a precision of 1 part in 1000.
 * Most systems today return CPU time to at least 10 msec precision.
 * So to get rates to that precision, we need to time loops to at
 * least 1000 times as long as the precision (10 msec * 1000)
 * which usually requires timing of loops that last 10 seconds or more.
 *
 * NOTE: This is an internal support function that is normally
 *       not called directly from the command line.  Call the
 *       function best_sq2() instead.
 */
define sq_ratio(len)
{
    local mat x[5];     /* array of values for sq_loop to square */
    local mat one[5];   /* array of single BASEB-bit values */
    local baseb;        /* calc word size in bits */
    local orig_cfg;     /* caller configuration */
    local loops;        /* number of square loops to time */
    local tlen;         /* time to perform some number of loops */
    local tover;        /* est of time for loop overhead */
    local alg1_rate;    /* loop rate of 1st algorithm */
    local alg2_rate;    /* loop rate of 2nd algorithm */
    local ret;          /* return ratio */
    local i;

    /*
     * firewall
     */
    if (!isint(len) || len < 2) {
        quit "sq_ratio: 1st arg: len is not an integer > 1";
    }

    /*
     * remember the caller's config state
     *
     * Everything changed below is put back before this function returns
     * or gives up with a quit.
     */
    orig_cfg = config("all");
    config("mul2", 0),;
    config("sq2", 0),;
    config("pow2", 0),;
    config("redc2", 0),;
    config("tilde", 0),;

    /*
     * initialize x, the values we will square
     *
     * We want these tests to be repeatable as possible, so we will seed
     * the PRNG in a deterministic way.
     */
    baseb = config("baseb");
    srand(sha1(sha1(baseb, config("version"))));
    for (i=0; i < 5; ++i) {
        /* force the values to be a full len words long */
        x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
                randbit(((len-1) * baseb) + baseb-2));
        /* single BASEB-bit values */
        one[i] = 1;
    }

    /*
     * determine the number of loops needed to test 1st alg
     *
     * Double the loop count until one timing run lasts at least 1 second.
     */
    config("sq2", 2^31-1),;
    loops = 1/2;
    do {
        loops *= 2;
        tlen = sq_loop(loops, &x);
        if (config("user_debug") > 3) {
            printf("\t alg1 loops %d took %.3f sec\n", loops, tlen);
        }
    } while (tlen < 1.0);

    /*
     * determine the 1st algorithm rate
     *
     * Scale the loop count to last about test_time seconds, but never
     * time fewer than 16 loops.
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
        if (config("user_debug") > 1) {
            printf(" we must expand alg1 loop test time to about %d secs\n",
                   ceil(test_time * (16 / loops)));
        }
        loops = 16;
    }
    tlen = sq_loop(loops, &x);
    if (config("user_debug") > 3) {
        printf("\t alg1 time = %.3f secs\n", tlen);
    }
    tover = sq_loop(loops, &one);
    if (config("user_debug") > 3) {
        printf("\t alg1 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
        config("all", orig_cfg),;
        quit "sq_ratio: overhead >= loop time";
    }
    alg1_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
        printf("\tsquare alg1 rate = %.3f loopsets/sec\n", alg1_rate);
    }
    if (alg1_rate <= 0.0) {
        config("all", orig_cfg),;
        quit "sq_ratio: alg1 rate was <= 0.0";
    }

    /*
     * determine the number of loops needed to test 2nd alg
     */
    config("sq2", 2),;
    loops = 1/2;
    do {
        loops *= 2;
        tlen = sq_loop(loops, &x);
        if (config("user_debug") > 3) {
            printf("\t alg2 loops %d took %.3f sec\n", loops, tlen);
        }
    } while (tlen < 1.0);

    /*
     * determine the 2nd algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
        if (config("user_debug") > 1) {
            printf(" we must expand alg2 loop test time to about %d secs\n",
                   ceil(test_time * (16 / loops)));
        }
        loops = 16;
    }
    tlen = sq_loop(loops, &x);
    if (config("user_debug") > 3) {
        printf("\t alg2 time = %.3f secs\n", tlen);
    }
    tover = sq_loop(loops, &one);
    if (config("user_debug") > 3) {
        printf("\t alg2 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
        config("all", orig_cfg),;
        quit "sq_ratio: overhead >= loop time";
    }
    alg2_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
        printf("\tsquare alg2 rate = %.3f loopsets/sec\n", alg2_rate);
    }
    if (alg2_rate <= 0.0) {
        config("all", orig_cfg),;
        quit "sq_ratio: alg2 rate was <= 0.0";
    }

    /*
     * restore old config
     */
    config("all", orig_cfg),;

    /*
     * return alg1 / alg2 rate ratio
     */
    ret = alg1_rate / alg2_rate;
    if (config("user_debug") > 2) {
        printf("\tprecise ratio is: %.6f sq_ratio will return: %.3f\n",
               alg1_rate / alg2_rate, ret);
    }
    return ret;
}


/*
 * best_sq2 - determine the best config("sq2") parameter
 *
 * NOTE: Due to precision problems with CPU measurements, it is not
 *       unusual for the output of this function to vary slightly
 *       from run to run.
 *
 * NOTE: This function is designed to take a long time to run.
 *       We recommend setting:
 *
 *              config("user_debug", 2)
 *
 *       so that you can watch the progress of this function.
 */
define best_sq2()
{
    local ratio;        /* previously calculated alg1/alg2 ratio */
    local low;          /* low loop value tested */
    local high;         /* high loop value tested */
    local mid;          /* between low and high */
    local best_val;     /* value found with ratio closest to unity */
    local best_ratio;   /* closest ratio found to unity */
    local expand;       /* how fast to expand the length */

    /*
     * setup
     */
    test_time = 30.0;
    printf("The best_sq2() function will take a LONG time to run!\n");
    printf("It is important that best_sq2() run on an othwewise idle host!\n");
    if (config("user_debug") <= 0) {
        printf("To monitor progress, set user_debug to 2: "
               "config(\"user_debug\", 2)\n");
    }
    printf("Starting with loop test time of %d secs\n", test_time);

    /*
     * firewall - must have a >1 ratio for the initial length
     */
    high = 16;
    best_val = high;
    if (config("user_debug") > 0) {
        printf("testing square alg1/alg2 ratio for len = %d\n", high);
    }
    ratio = sq_ratio(high);
    best_ratio = ratio;
    if (config("user_debug") > 1) {
        printf(" square alg1/alg2 ratio = %.3f\n", ratio);
    }
    if (ratio < 1.0) {
        quit "best_sq2: test implies sq2 < 16, which seems bogus";
    }

    /*
     * expand lengths until the ratio flips
     */
    do {
        /*
         * determine the parameters of the next ratio test
         *
         * We will multiplicatively expand our test level until
         * the ratio drops below 1.0.
         */
        expand = ((ratio >= 10) ? 1024 : 2^round(ratio));
        low = high;
        high *= expand;
        if (config("user_debug") > 1) {
            printf(" expand the next test range by a factor of %d\n",
                   expand);
        }

        /*
         * determine the alg1/alg2 test ratio for this new length
         */
        if (high >= 2^31) {
            quit "best_sq2: tests imply sq2 >= 2^31, which seems bogus";
        }
        if (config("user_debug") > 0) {
            printf("testing square alg1/alg2 ratio for len = %d\n", high);
        }
        ratio = sq_ratio(high);
        if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
            best_val = high;
            best_ratio = ratio;
            if (config("user_debug") > 1) {
                printf(" len %d has a new cloest ratio to unity: %.6f\n",
                       best_val, best_ratio);
            }
        }
        if (config("user_debug") > 1) {
            printf(" square alg1/alg2 ratio = %.3f\n", ratio);
        }
    } while (ratio > 1.0);

    /*
     * If we previously expanded more than by a factor of 2, then
     * we may have jumped over the crossover point.  So now
     * drop down powers of two until the ratio is again >= 1.0
     *
     * This must retest with sq_ratio(): measuring the multiply ratio
     * here would steer the square search with the wrong algorithm.
     */
    if (expand > 2) {
        do {
            /*
             * contract by 2
             */
            high /= 2;
            low = high / 2;
            if (config("user_debug") > 0) {
                printf("retesting square alg1/alg2 ratio for len = %d\n",
                       high);
            }
            ratio = sq_ratio(high);
            if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
                best_val = high;
                best_ratio = ratio;
                if (config("user_debug") > 1) {
                    printf(" len %d has a new cloest ratio to unity: %.6f\n",
                           best_val, best_ratio);
                }
            }
            if (config("user_debug") > 1) {
                printf(" square alg1/alg2 ratio = %.6f\n", ratio);
            }
        } while (ratio <= 1.0);

        /* now that the ratio flipped again, use the previous range */
        low = high;
        high = high * 2;
    }
    if (config("user_debug") > 0) {
        printf("Starting binary search between %d and %d\n", low, high);
    }

    /*
     * binary search between low and high, for where ratio is just under 1.0
     */
    while (low+1 < high) {

        /* try the mid-point */
        mid = int((low+high)/2);
        if (config("user_debug") > 0) {
            printf("testing square alg1/alg2 ratio for len = %d\n", mid);
        }
        ratio = sq_ratio(mid);
        if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
            best_val = mid;
            best_ratio = ratio;
            if (config("user_debug") > 1) {
                printf(" len %d has a new cloest ratio to unity: %.6f\n",
                       best_val, best_ratio);
            }
        }
        if (config("user_debug") > 1) {
            printf(" len %d square alg1/alg2 ratio = %.6f\n", mid, ratio);
        }

        /* stop search if near unity */
        if (close_to_one(ratio)) {
            low = mid;
            high = mid;
            if (config("user_debug") > 0) {
                printf("\twe are close enough to unity ratio at: %d\n", mid);
            }
            break;
        }

        /* bump lower range up if we went over */
        if (ratio > 1.0) {
            if (config("user_debug") > 2) {
                printf("\tmove low from %d up to %d\n", low, mid);
            }
            low = mid;

        /* drop higher range down if we went under */
        } else {
            if (config("user_debug") > 2) {
                printf("\tmove high from %d down to %d\n", high, mid);
            }
            high = mid;
        }

        /* report on test loop progress */
        if (config("user_debug") > 1) {
            printf("\tsetting low: %d high: %d diff: %d\n",
                   low, high, high-low);
        }
    }

    /*
     * return on the suggested config("sq2") value
     */
    mid = int((low+high)/2);
    if (config("user_debug") > 0) {
        printf("Best value for square is near %d\n", best_val);
        printf("Best square alg1/alg2 ratio is: %.6f\n", best_ratio);
        printf("We suggest placing this line in your .calcrc:\n");
        printf("config(\"sq2\", %d),;\n", best_val);
        printf("WARNING: It is believed that the output "
               "of this resource file is bogus!\n");
        printf("WARNING: You may NOT wish to follow the above suggeston.\n");
    }
    return mid;
}


/*
 * pow_loop - measure the CPU time to perform a set of pmod loops
 *
 * given:
 *      repeat  number of pmod loops to perform
 *      x       array of 5 values, each the same length in BASEB-bit words
 *
 *              NOTE: When their lengths are 1 BASEB-bit word, then a
 *                    dummy loop of simple constants is used.  Thus the
 *                    length == 1 is an approximation of loop overhead.
 *
 *      ex      exponent for pmod value
 *
 * returns:
 *      approximate runtime to perform a pmod loop
 *
 * NOTE: This is an internal support function that is normally
 *       not called directly from the command line.  Call the
 *       function best_pow2() instead.
*/
define pow_loop(repeat, x, ex)
{
    local start;        /* start of execution */
    local end;          /* end of execution */
    local answer;       /* pmod value, computed only to be timed */
    local len;          /* length of each element */
    local baseb_bytes;  /* bytes in a BASEB-bit word */
    local i;

    /* firewall */
    if (!isint(repeat) || repeat < 0) {
        quit "pow_loop: 1st arg: repeat must be an integer >= 0";
    }
    if (size(*x) != 5) {
        quit "pow_loop: 2nd arg matrix does not have 5 elements";
    }
    if (matdim(*x) != 1) {
        quit "pow_loop: 2nd arg matrix is not 1 dimensional";
    }
    if (matmin(*x, 1) != 0) {
        quit "pow_loop: 2nd arg matrix index range does not start with 0";
    }
    if (matmax(*x, 1) != 4) {
        quit "pow_loop: 2nd arg matrix index range does not end with 4";
    }

    /* every element, including the last one, must match element 0 */
    baseb_bytes = config("baseb") / 8;
    len = sizeof((*x)[0]) / baseb_bytes;
    for (i=1; i < 5; ++i) {
        if ((sizeof((*x)[i]) / baseb_bytes) != len) {
            quit "pow_loop: 2nd arg matrix elements are not of "
                 "equal BASEB-bit word length";
        }
    }
    if (!isint(ex) || ex < 3) {
        quit "pow_loop: 3rd arg ex is not an integer > 2";
    }

    /* pmod pairwise, all sets of a given length */
    start = usertime();
    for (i=0; i < repeat; ++i) {
        if (len == 1) {
            /*
             * we use len == 1 to test this tester loop overhead
             *
             * The dummy branch makes exactly as many pmod calls (20) as
             * the real branch below, so the overhead is not over-counted.
             */
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            /**/
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            /**/
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            /**/
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            /**/
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            answer = pmod(0,0,0); answer = pmod(0,0,0);
        } else {
            answer = pmod((*x)[0], ex, (*x)[1]);
            answer = pmod((*x)[0], ex, (*x)[2]);
            answer = pmod((*x)[0], ex, (*x)[3]);
            answer = pmod((*x)[0], ex, (*x)[4]);
            /**/
            answer = pmod((*x)[1], ex, (*x)[0]);
            answer = pmod((*x)[1], ex, (*x)[2]);
            answer = pmod((*x)[1], ex, (*x)[3]);
            answer = pmod((*x)[1], ex, (*x)[4]);
            /**/
            answer = pmod((*x)[2], ex, (*x)[0]);
            answer = pmod((*x)[2], ex, (*x)[1]);
            answer = pmod((*x)[2], ex, (*x)[3]);
            answer = pmod((*x)[2], ex, (*x)[4]);
            /**/
            answer = pmod((*x)[3], ex, (*x)[0]);
            answer = pmod((*x)[3], ex, (*x)[1]);
            answer = pmod((*x)[3], ex, (*x)[2]);
            answer = pmod((*x)[3], ex, (*x)[4]);
            /**/
            answer = pmod((*x)[4], ex, (*x)[0]);
            answer = pmod((*x)[4], ex, (*x)[1]);
            answer = pmod((*x)[4], ex, (*x)[2]);
            answer = pmod((*x)[4], ex, (*x)[3]);
        }
    }

    /*
     * return duration
     */
    end = usertime();
    return end-start;
}


/*
 * pow_ratio - ratio of rates of 1st and 2nd pmod algorithms
 *
 * given:
 *      len     length in BASEB-bit words to pmod
 *
 * return:
 *      ratio of (1st / 2nd) algorithm rates
 *
 * We want to determine a rate to a precision of 1 part in 1000.
 * Most systems today return CPU time to at least 10 msec precision.
 * So to get rates to that precision, we need to time loops to at
 * least 1000 times as long as the precision (10 msec * 1000)
 * which usually requires timing of loops that last 10 seconds or more.
 *
 * NOTE: This is an internal support function that is normally
 *       not called directly from the command line.  Call the
 *       function best_pow2() instead.
*/
define pow_ratio(len)
{
    local mat x[5];     /* array of values for pow_loop to pmod */
    local mat one[5];   /* array of single BASEB-bit values */
    local baseb;        /* calc word size in bits */
    local orig_cfg;     /* caller configuration */
    local loops;        /* number of pmod loops to time */
    local tlen;         /* time to perform some number of loops */
    local tover;        /* est of time for loop overhead */
    local alg1_rate;    /* loop rate of 1st algorithm */
    local alg2_rate;    /* loop rate of 2nd algorithm */
    local ex;           /* exponent to use in pow_loop() */
    local ret;          /* return ratio */
    local i;

    /*
     * firewall
     */
    if (!isint(len) || len < 2) {
        quit "pow_ratio: 1st arg: len is not an integer > 1";
    }

    /*
     * remember the caller's config state
     *
     * Everything changed below is put back before this function returns
     * or gives up with a quit.
     */
    orig_cfg = config("all");
    config("mul2", 0),;
    config("sq2", 0),;
    config("pow2", 0),;
    config("redc2", 0),;
    config("tilde", 0),;

    /*
     * setup
     */
    ex = 7;

    /*
     * initialize x, the values we will pmod
     *
     * We want these tests to be repeatable as possible, so we will seed
     * the PRNG in a deterministic way.
     */
    baseb = config("baseb");
    srand(sha1(sha1(ex, baseb, config("version"))));
    for (i=0; i < 5; ++i) {
        /* force the values to be a full len words long */
        x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
                randbit(((len-1) * baseb) + baseb-2));
        /* single BASEB-bit values */
        one[i] = 1;
    }

    /*
     * determine the number of loops needed to test 1st alg
     *
     * Double the loop count until one timing run lasts at least 1 second.
     */
    config("pow2", 2^31-1),;
    config("redc2", 2^31-1),;
    loops = 1/2;
    do {
        loops *= 2;
        tlen = pow_loop(loops, &x, ex);
        if (config("user_debug") > 3) {
            printf("\t alg1 loops %d took %.3f sec\n", loops, tlen);
        }
    } while (tlen < 1.0);

    /*
     * determine the 1st algorithm rate
     *
     * Scale the loop count to last about test_time seconds, but never
     * time fewer than 16 loops.
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
        if (config("user_debug") > 1) {
            printf(" we must expand alg1 loop test time to about %d secs\n",
                   ceil(test_time * (16 / loops)));
        }
        loops = 16;
    }
    tlen = pow_loop(loops, &x, ex);
    if (config("user_debug") > 3) {
        printf("\t alg1 time = %.3f secs\n", tlen);
    }
    tover = pow_loop(loops, &one, ex);
    if (config("user_debug") > 3) {
        printf("\t alg1 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
        config("all", orig_cfg),;
        quit "pow_ratio: overhead >= loop time";
    }
    alg1_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
        printf("\tpmod alg1 rate = %.3f loopsets/sec\n", alg1_rate);
    }
    if (alg1_rate <= 0.0) {
        config("all", orig_cfg),;
        quit "pow_ratio: alg1 rate was <= 0.0";
    }

    /*
     * determine the number of loops needed to test 2nd alg
     */
    config("pow2", 2),;
    config("redc2", 2^31-1),;
    loops = 1/2;
    do {
        loops *= 2;
        tlen = pow_loop(loops, &x, ex);
        if (config("user_debug") > 3) {
            printf("\t alg2 loops %d took %.3f sec\n", loops, tlen);
        }
    } while (tlen < 1.0);

    /*
     * determine the 2nd algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
        if (config("user_debug") > 1) {
            printf(" we must expand alg2 loop test time to about %d secs\n",
                   ceil(test_time * (16 / loops)));
        }
        loops = 16;
    }
    tlen = pow_loop(loops, &x, ex);
    if (config("user_debug") > 3) {
        printf("\t alg2 time = %.3f secs\n", tlen);
    }
    tover = pow_loop(loops, &one, ex);
    if (config("user_debug") > 3) {
        printf("\t alg2 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
        config("all", orig_cfg),;
        quit "pow_ratio: overhead >= loop time";
    }
    alg2_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
        printf("\tpmod alg2 rate = %.3f loopsets/sec\n", alg2_rate);
    }
    if (alg2_rate <= 0.0) {
        config("all", orig_cfg),;
        quit "pow_ratio: alg2 rate was <= 0.0";
    }

    /*
     * restore old config
     */
    config("all", orig_cfg),;

    /*
     * return alg1 / alg2 rate ratio
     */
    ret = alg1_rate / alg2_rate;
    if (config("user_debug") > 2) {
        printf("\tprecise ratio is: %.6f pow_ratio will return: %.3f\n",
               alg1_rate / alg2_rate, ret);
    }
    return ret;
}


/*
 * best_pow2 - determine the best config("pow2") parameter w/o REDC2
 *
 * NOTE: Due to precision problems with CPU measurements, it is not
 *       unusual for the output of this function to vary slightly
 *       from run to run.
 *
 * NOTE: This function is designed to take a long time to run.
 *       We recommend setting:
 *
 *              config("user_debug", 2)
 *
 *       so that yon can watch the progress of this function.
 */
define best_pow2()
{
    local ratio;        /* previously calculated alg1/alg2 ratio */
    local low;          /* low loop value tested */
    local high;         /* high loop value tested */
    local mid;          /* between low and high */
    local best_val;     /* value found with ratio closest to unity */
    local best_ratio;   /* cloest ratio found to unity */
    local expand;       /* how fast to expand the length */
    local looped;       /* 1 ==> we have expanded lengths before */

    /*
     * setup
     */
    test_time = 60.0;
    printf("The best_pow2() function will take a LONG time to run!\n");
    printf("It is important that best_pow2() run on an othwewise idle host!\n");
    if (config("user_debug") <= 0) {
        printf("To monitor progress, set user_debug to 2: "
               "config(\"user_debug\", 2)\n");
    }
    printf("Starting with loop test time of %d secs\n", test_time);

    /*
     * firewall - must have a >1.02 ratio for the initial length
     *
     * We select 1.02 because of the precision of the CPU timing.  We
     * want to firt move into an area where the 1st algoritm clearly
     * dominates.
*/ low = 4; high = 4; best_val = high; best_ratio = 1e10; /* not a real value */ do { high *= 4; if (config("user_debug") > 0) { printf("testing pmod alg1/alg2 ratio for len = %d\n", high); } ratio = pow_ratio(high); if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) { best_val = high; best_ratio = ratio; if (config("user_debug") > 1) { printf(" len %d has a new cloest ratio to unity: %.6f\n", best_val, best_ratio); } } if (config("user_debug") > 1) { printf(" pmod alg1/alg2 ratio = %.3f\n", ratio); if (ratio > 1.0 && ratio <= 1.02) { printf(" while alg1 is slightly better than alg2, " "it is not clearly better\n"); } } } while (ratio <= 1.02); if (config("user_debug") > 0) { printf("starting the pow2 search above %d\n", high); } /* * expand lengths until the ratio flips */ looped = 0; do { /* * determine the paramters of the next ratio test * * We will multiplicatively expand our test level until * the ratio drops below 1.0. * * NOTE: At low lengths, the ratios seen to be very small * so we force an expansion of 4 to speed us on our * way to larger lengths. At these somewhat larger * lengths, the ratios usually don't get faster than * 1.25, so we need to expand force a more rapid * expansion than normal. At lengths longer than * 2k, the time to test becomes very long, so we * want to slow down at these higher lengths. 
*/ expand = 2; if (looped) { low = high; } high *= expand; if (config("user_debug") > 1) { printf(" expand the next test range by a factor of %d\n", expand); } /* * determine the alg1/alg2 test ratio for this new length */ if (high >= 2^31) { quit "best_pow2: test implies pow2 >= 2^31, which seems bogus"; } if (config("user_debug") > 0) { printf("testing pmod alg1/alg2 ratio for len = %d\n", high); } ratio = pow_ratio(high); if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) { best_val = high; best_ratio = ratio; if (config("user_debug") > 1) { printf(" len %d has a new cloest ratio to unity: %.6f\n", best_val, best_ratio); } } if (config("user_debug") > 1) { printf(" pmod alg1/alg2 ratio = %.6f\n", ratio); } looped = 1; } while (ratio > 1.0); if (config("user_debug") > 0) { printf("Starting binary search between %d and %d\n", low, high); } /* * binary search between low and high, for where ratio is just under 1.0 */ while (low+1 < high) { /* try the mid-point */ mid = int((low+high)/2); if (config("user_debug") > 0) { printf("testing pow2 alg1/alg2 ratio for len = %d\n", mid); } ratio = pow_ratio(mid); if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) { best_val = mid; best_ratio = ratio; if (config("user_debug") > 1) { printf(" len %d has a new cloest ratio to unity: %.6f\n", best_val, best_ratio); } } if (config("user_debug") > 1) { printf(" len %d pmod alg1/alg2 ratio = %.6f\n", mid, ratio); } /* stop search if near unity */ if (close_to_one(ratio)) { low = mid; high = mid; if (config("user_debug") > 0) { printf("\twe are close enough to unity ratio at: %d\n", mid); } break; } /* bump lower range up if we went over */ if (ratio > 1.0) { if (config("user_debug") > 2) { printf("\tmove low from %d up to %d\n", low, mid); } low = mid; /* drop higher range down if we went under */ } else { if (config("user_debug") > 2) { printf("\tmove high from %d down to %d\n", high, mid); } high = mid; } /* report on test loop progress */ if (config("user_debug") > 1) { printf("\tsetting 
low: %d high: %d diff: %d\n", low, high, high-low); } } /* * return on the suggested config("pow2") value */ mid = int((low+high)/2); if (config("user_debug") > 0) { printf("Best value for pmod is near %d\n", best_val); printf("Best pmod alg1/alg2 ratio is: %.6f\n", best_ratio); printf("We suggest placing this line in your .calcrc:\n"); printf("config(\"pow2\", %d),;\n", best_val); printf("WARNING: It is believed that the output " "of this resource file is bogus!\n"); printf("WARNING: You may NOT wish to follow the above suggeston.\n"); } return mid; }