mirror of
https://github.com/lcn2/calc.git
synced 2025-08-16 01:03:29 +03:00
Some folks might think: “you still use RCS”?!? And we will say, hey, at least we switched from SCCS to RCS back in … I think it was around 1994 ... at least we are keeping up! :-) :-) :-) Logs say that SCCS version 18 became RCS version 19 on 1994 March 18. RCS served us well. But now it is time to move on. And so we are switching to git. Calc releases produce a lot of file changes. In the 125 releases of calc since 1996, when I started managing calc releases, there have been 15473 file mods!
1523 lines
42 KiB
Plaintext
1523 lines
42 KiB
Plaintext
/*
|
|
* alg_config - help determine optimal values for algorithm levels
|
|
*
|
|
* Copyright (C) 2006,2014 Landon Curt Noll
|
|
*
|
|
* Calc is open software; you can redistribute it and/or modify it under
|
|
* the terms of the version 2.1 of the GNU Lesser General Public License
|
|
* as published by the Free Software Foundation.
|
|
*
|
|
* Calc is distributed in the hope that it will be useful, but WITHOUT
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General
|
|
* Public License for more details.
|
|
*
|
|
* A copy of version 2.1 of the GNU Lesser General Public License is
|
|
* distributed with calc under the filename COPYING-LGPL. You should have
|
|
* received a copy with calc; if not, write to Free Software Foundation, Inc.
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
*
|
|
* Under source code control: 2006/06/07 14:10:11
|
|
* File existed as early as: 2006
|
|
*
|
|
* chongo <was here> /\oo/\ http://www.isthe.com/chongo/
|
|
* Share and enjoy! :-) http://www.isthe.com/chongo/tech/comp/calc/
|
|
*/
|
|
|
|
static test_time; /* target number of seconds for each timed loop test; assigned by the best_*() functions and read by the *_ratio() functions */
|
|
|
|
|
|
/*
 * close_to_one - determine if a ratio is close enough to 1
 *
 * given:
 *      ratio   the ratio of time between two algorithms
 *
 * returns:
 *      1       when ratio is within [0.995, 1.005]
 *      0       otherwise
 *
 * We consider the range [0.995, 1.005] to be close enough to 1 to be
 * called unity because of the precision of the CPU timing.
 */
define close_to_one(ratio)
{
    /* firewall */
    if (!isreal(ratio)) {
        quit "close_to_one: 1st arg: must be a real number";
    }

    /* anywhere inside the closed interval counts as unity */
    if ((ratio >= 0.995) && (ratio <= 1.005)) {
        return 1;
    }

    /* the ratio is far from unity */
    return 0;
}
|
|
|
|
|
|
/*
 * mul_loop - measure the CPU time to perform a set of multiply loops
 *
 * given:
 *      repeat  number of multiply loops to perform (an integer >= 0)
 *      x       pointer to a 1-dimensional matrix, indexed 0 thru 4, of
 *              5 values that are each the same length in BASEB-bit words
 *
 *              NOTE: When their lengths are 1 BASEB-bit word, then a
 *                    dummy loop of simple constants is used.  Thus
 *                    length == 1 is an approximation of loop overhead.
 *
 * returns:
 *      approximate usertime() difference needed to perform repeat
 *      multiply loops, where each loop performs 20 multiplies
 *
 * NOTE: This is an internal support function that is normally
 *       not called directly from the command line.  Call the
 *       function best_mul2() instead.
 */
define mul_loop(repeat, x)
{
    local start;        /* usertime() at the start of the timed loop */
    local end;          /* usertime() at the end of the timed loop */
    local answer;       /* product, computed only so that it can be timed */
    local len;          /* length of each element in BASEB-bit words */
    local baseb_bytes;  /* bytes in a BASEB-bit word */
    local i;

    /* firewall */
    if (!isint(repeat) || repeat < 0) {
        quit "mul_loop: 1st arg: repeat must be an integer >= 0";
    }
    if (size(*x) != 5) {
        quit "mul_loop: 2nd arg matrix does not have 5 elements";
    }
    if (matdim(*x) != 1) {
        quit "mul_loop: 2nd arg matrix is not 1 dimensional";
    }
    if (matmin(*x, 1) != 0) {
        quit "mul_loop: 2nd arg matrix index range does not start with 0";
    }
    if (matmax(*x, 1) != 4) {
        quit "mul_loop: 2nd arg matrix index range does not end with 4";
    }

    /* all 5 elements, including the last one, must be the same length */
    baseb_bytes = config("baseb") / 8;
    len = sizeof((*x)[0]) / baseb_bytes;
    for (i=1; i < 5; ++i) {
        if ((sizeof((*x)[i]) / baseb_bytes) != len) {
            quit "mul_loop: 2nd arg matrix elements are not of "
                 "equal BASEB-bit word length";
        }
    }

    /*
     * multiply pairwise, all sets of a given length
     *
     * The multiplies are deliberately unrolled so that the only loop
     * overhead being timed is that of the outer repeat loop.
     */
    start = usertime();
    for (i=0; i < repeat; ++i) {

        if (len == 1) {
            /* we use len == 1 to test this tester loop overhead */
            answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
            /**/
            answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
            /**/
            answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
            /**/
            answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
            /**/
            answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
        } else {
            answer = (*x)[0] * (*x)[1];
            answer = (*x)[0] * (*x)[2];
            answer = (*x)[0] * (*x)[3];
            answer = (*x)[0] * (*x)[4];
            /**/
            answer = (*x)[1] * (*x)[0];
            answer = (*x)[1] * (*x)[2];
            answer = (*x)[1] * (*x)[3];
            answer = (*x)[1] * (*x)[4];
            /**/
            answer = (*x)[2] * (*x)[0];
            answer = (*x)[2] * (*x)[1];
            answer = (*x)[2] * (*x)[3];
            answer = (*x)[2] * (*x)[4];
            /**/
            answer = (*x)[3] * (*x)[0];
            answer = (*x)[3] * (*x)[1];
            answer = (*x)[3] * (*x)[2];
            answer = (*x)[3] * (*x)[4];
            /**/
            answer = (*x)[4] * (*x)[0];
            answer = (*x)[4] * (*x)[1];
            answer = (*x)[4] * (*x)[2];
            answer = (*x)[4] * (*x)[3];
        }
    }

    /*
     * return duration
     */
    end = usertime();
    return end-start;
}
|
|
|
|
|
|
/*
 * mul_ratio - ratio of rates of 1st and 2nd multiply algorithms
 *
 * given:
 *      len     length in BASEB-bit words to multiply (an integer > 1)
 *
 * returns:
 *      ratio of (1st / 2nd) algorithm rate
 *
 * We want to determine a rate to a precision of 1 part in 1000.
 * Most systems today return CPU time to at least 10 msec precision.
 * So to get rates to that precision, we need to time loops to at
 * least 1000 times as long as the precision (10 msec * 1000)
 * which usually requires timing of loops that last 10 seconds or more.
 *
 * The 1st algorithm is timed with config("mul2") set to 2^31-1 and the
 * 2nd algorithm is timed with config("mul2") set to 2.  The caller's
 * configuration is saved on entry and restored before returning.
 *
 * The static test_time holds the number of seconds each timed test
 * should try to last.
 *
 * NOTE: This is an internal support function that is normally
 *       not called directly from the command line.  Call the
 *       function best_mul2() instead.
 */
define mul_ratio(len)
{
    local mat x[5];     /* array of values for mul_loop to multiply */
    local mat one[5];   /* array of single BASEB-bit values */
    local baseb;        /* calc word size in bits */
    local orig_cfg;     /* caller configuration */
    local loops;        /* number of multiply loops to time */
    local tlen;         /* time to perform some number of loops */
    local tover;        /* est of time for loop overhead */
    local alg1_rate;    /* loop rate of 1st algorithm */
    local alg2_rate;    /* loop rate of 2nd algorithm */
    local ret;          /* alg1 / alg2 rate ratio to return */
    local i;

    /*
     * firewall
     */
    if (!isint(len) || len < 2) {
        quit "mul_ratio: 1st arg: len is not an integer > 1";
    }

    /*
     * remember the caller's config state
     */
    orig_cfg = config("all");
    config("mul2", 0),;
    config("sq2", 0),;
    config("pow2", 0),;
    config("redc2", 0),;
    config("tilde", 0),;

    /*
     * initialize x, the values we will multiply
     *
     * We want these tests to be as repeatable as possible, so we will
     * seed the PRNG in a deterministic way.
     */
    baseb = config("baseb");
    srand(sha1(sha1(baseb, config("version"))));
    for (i=0; i < 5; ++i) {
        /* force the values to be a full len words long */
        x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
                randbit(((len-1) * baseb) + baseb-2));
        /* single BASEB-bit values */
        one[i] = 1;
    }

    /*
     * determine the number of loops needed to test 1st alg
     *
     * Starting from 1 loop, double the loop count until a timing
     * run lasts at least 1 second.
     */
    config("mul2", 2^31-1),;
    loops = 1/2;
    do {
        loops *= 2;
        tlen = mul_loop(loops, &x);
        if (config("user_debug") > 3) {
            printf("\t alg1 loops %d took %.3f sec\n", loops, tlen);
        }
    } while (tlen < 1.0);

    /*
     * determine the 1st algorithm rate
     *
     * Scale the loop count so the run should last about test_time
     * seconds, but never time fewer than 16 loops.
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
        if (config("user_debug") > 1) {
            printf(" we must expand alg1 loop test time to about %d secs\n",
                   ceil(test_time * (16 / loops)));
        }
        loops = 16;
    }
    if (config("user_debug") > 3) {
        printf("\t will try alg1 %d loops\n", loops);
    }
    tlen = mul_loop(loops, &x);
    if (config("user_debug") > 3) {
        printf("\t alg1 time = %.3f secs\n", tlen);
    }
    tover = mul_loop(loops, &one);
    if (config("user_debug") > 3) {
        printf("\t alg1 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
        quit "mul_ratio: overhead >= loop time";
    }
    alg1_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
        printf("\tmultiply alg1 rate = %.3f loopsets/sec\n", alg1_rate);
    }
    if (alg1_rate <= 0.0) {
        quit "mul_ratio: alg1 rate was <= 0.0";
    }

    /*
     * determine the number of loops needed to test 2nd alg
     */
    config("mul2", 2),;
    loops = 1/2;
    do {
        loops *= 2;
        tlen = mul_loop(loops, &x);
        if (config("user_debug") > 3) {
            printf("\t alg2 loops %d took %.3f sec\n", loops, tlen);
        }
    } while (tlen < 1.0);

    /*
     * determine the 2nd algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
        if (config("user_debug") > 1) {
            printf(" we must expand alg2 loop test time to about %d secs\n",
                   ceil(test_time * (16 / loops)));
        }
        loops = 16;
    }
    tlen = mul_loop(loops, &x);
    if (config("user_debug") > 3) {
        printf("\t alg2 time = %.3f secs\n", tlen);
    }
    tover = mul_loop(loops, &one);
    if (config("user_debug") > 3) {
        printf("\t alg2 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
        quit "mul_ratio: overhead >= loop time";
    }
    alg2_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
        printf("\tmultiply alg2 rate = %.3f loopsets/sec\n", alg2_rate);
    }
    if (alg2_rate <= 0.0) {
        quit "mul_ratio: alg2 rate was <= 0.0";
    }

    /*
     * restore old config
     */
    config("all", orig_cfg),;

    /*
     * return alg1 / alg2 rate ratio
     */
    ret = alg1_rate / alg2_rate;
    if (config("user_debug") > 2) {
        /* show the precise ratio with enough digits to be meaningful */
        printf("\tprecise ratio is: %.6f mul_ratio will return: %.3f\n",
               alg1_rate / alg2_rate, ret);
    }
    return ret;
}
|
|
|
|
|
|
/*
 * best_mul2 - determine the best config("mul2") parameter
 *
 * Starting with a length of 8 BASEB-bit words, the test length is
 * doubled until the alg1/alg2 rate ratio reported by mul_ratio() is
 * no longer > 1.0.  A binary search is then performed between the last
 * two lengths.  Along the way, the length whose ratio was closest to
 * unity is remembered and is printed as the suggested value.
 *
 * returns:
 *      the midpoint (rounded down) of the final search range
 *
 * NOTE: Due to precision problems with CPU measurements, it is not
 *       unusual for the output of this function to vary slightly
 *       from run to run.
 *
 * NOTE: This function is designed to take a long time to run.
 *       We recommend setting:
 *
 *          config("user_debug", 2)
 *
 *       so that you can watch the progress of this function.
 */
define best_mul2()
{
    local ratio;        /* most recently calculated alg1/alg2 ratio */
    local low;          /* low loop value tested */
    local high;         /* high loop value tested */
    local mid;          /* between low and high */
    local best_val;     /* value found with ratio closest to unity */
    local best_ratio;   /* closest ratio found to unity */
    local expand;       /* how fast to expand the length */

    /*
     * setup
     */
    printf("WARNING: This tool may not be computing the correct best value\n");
    test_time = 5.0;
    printf("The best_mul2() function will take a LONG time to run!\n");
    printf("It is important that best_mul2() run on an othwewise idle host!\n");
    if (config("user_debug") <= 0) {
        printf("To monitor progress, set user_debug to 2: "
               "config(\"user_debug\", 2)\n");
    }
    printf("Starting with loop test time of %d secs\n", test_time);

    /*
     * firewall - must have a >1 ratio for the initial length
     */
    high = 8;
    best_val = high;
    if (config("user_debug") > 0) {
        printf("testing multiply alg1/alg2 ratio for len = %d\n", high);
    }
    ratio = mul_ratio(high);
    best_ratio = ratio;
    if (config("user_debug") > 1) {
        printf(" multiply alg1/alg2 ratio = %.6f\n", ratio);
    }
    if (ratio < 1.0) {
        quit "best_mul2: tests imply mul2 should be < 16, which seems bogus";
    }

    /*
     * expand lengths until the ratio flips
     */
    do {
        /*
         * determine the parameters of the next ratio test
         *
         * We will multiplicatively expand our test level until
         * the ratio drops below 1.0.
         */
        expand = 2;
        low = high;
        high *= expand;
        if (config("user_debug") > 1) {
            printf(" expand the next test range by a factor of %d\n",
                   expand);
        }

        /*
         * determine the alg1/alg2 test ratio for this new length
         */
        if (high >= 2^31) {
            quit "best_mul2: test implies mul2 >= 2^31, which seems bogus";
        }
        if (config("user_debug") > 0) {
            printf("testing multiply alg1/alg2 ratio for len = %d\n", high);
        }
        ratio = mul_ratio(high);
        if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
            best_val = high;
            best_ratio = ratio;
            if (config("user_debug") > 1) {
                printf(" len %d has a new cloest ratio to unity: %.6f\n",
                       best_val, best_ratio);
            }
        }
        if (config("user_debug") > 1) {
            printf(" multiply alg1/alg2 ratio = %.6f\n", ratio);
        }
    } while (ratio > 1.0);

    /*
     * If we previously expanded more than by a factor of 2, then
     * we may have jumped over the crossover point.  So now
     * drop down powers of two until the ratio is again >= 1.0
     *
     * NOTE: expand is currently always 2, so this code only comes
     *       into play if the expansion factor above is changed.
     */
    if (expand > 2) {
        do {

            /*
             * contract by 2
             */
            high /= 2;
            low = high / 2;
            if (config("user_debug") > 0) {
                printf("retesting multiply alg1/alg2 ratio for len = %d\n",
                       high);
            }
            ratio = mul_ratio(high);
            if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
                best_val = high;
                best_ratio = ratio;
                if (config("user_debug") > 1) {
                    printf(" len %d has a new cloest ratio to unity: %.6f\n",
                           best_val, best_ratio);
                }
            }
            if (config("user_debug") > 1) {
                printf(" multiply alg1/alg2 ratio = %.6f\n", ratio);
            }

        } while (ratio <= 1.0);

        /* now that the ratio flipped again, use the previous range */
        low = high;
        high = high * 2;
    }
    if (config("user_debug") > 0) {
        printf("Starting binary search between %d and %d\n", low, high);
    }

    /*
     * binary search between low and high, for where ratio is just under 1.0
     */
    while (low+1 < high) {

        /* try the mid-point */
        mid = int((low+high)/2);
        if (config("user_debug") > 0) {
            printf("testing multiply alg1/alg2 ratio for len = %d\n", mid);
        }
        ratio = mul_ratio(mid);
        if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
            best_val = mid;
            best_ratio = ratio;
            if (config("user_debug") > 1) {
                printf(" len %d has a new cloest ratio to unity: %.6f\n",
                       best_val, best_ratio);
            }
        }
        if (config("user_debug") > 1) {
            printf(" len %d multiply alg1/alg2 ratio = %.6f\n", mid, ratio);
        }

        /* stop search if near unity */
        if (close_to_one(ratio)) {
            low = mid;
            high = mid;
            if (config("user_debug") > 0) {
                printf("\twe are close enough to unity ratio at: %d\n", mid);
            }
            break;
        }

        /* bump lower range up if we went over */
        if (ratio > 1.0) {
            if (config("user_debug") > 2) {
                printf("\tmove low from %d up to %d\n",
                       low, mid);
            }
            low = mid;

        /* drop higher range down if we went under */
        } else {
            if (config("user_debug") > 2) {
                printf("\tmove high from %d down to %d\n",
                       high, mid);
            }
            high = mid;
        }

        /* report on test loop progress */
        if (config("user_debug") > 1) {
            printf("\tsetting low: %d high: %d diff: %d\n",
                   low, high, high-low);
        }
    }

    /*
     * return on the suggested config("mul2") value
     *
     * As best_sq2() does, return the midpoint of the final search
     * range instead of whichever midpoint happened to be tested last.
     */
    mid = int((low+high)/2);
    if (config("user_debug") > 0) {
        printf("Best value for multiply is near %d\n", best_val);
        printf("Best multiply alg1/alg2 ratio is: %.6f\n", best_ratio);
        printf("We suggest placing this line in your .calcrc:\n");
        printf("config(\"mul2\", %d),;\n", best_val);
        printf("WARNING: It is believed that the output "
               "of this resource file is bogus!\n");
        printf("WARNING: You may NOT wish to follow the above suggeston.\n");
    }
    return mid;
}
|
|
|
|
|
|
/*
 * sq_loop - measure the CPU time to perform a set of square loops
 *
 * given:
 *      repeat  number of square loops to perform (an integer >= 0)
 *      x       pointer to a 1-dimensional matrix, indexed 0 thru 4, of
 *              5 values that are each the same length in BASEB-bit words
 *
 *              NOTE: When their lengths are 1 BASEB-bit word, then a
 *                    dummy loop of simple constants is used.  Thus
 *                    length == 1 is an approximation of loop overhead.
 *
 * returns:
 *      approximate usertime() difference needed to perform repeat
 *      square loops, where each loop performs 20 squarings
 *
 * NOTE: This is an internal support function that is normally
 *       not called directly from the command line.  Call the
 *       function best_sq2() instead.
 */
define sq_loop(repeat, x)
{
    local start;        /* usertime() at the start of the timed loop */
    local end;          /* usertime() at the end of the timed loop */
    local answer;       /* squared value, computed only to be timed */
    local len;          /* length of each element in BASEB-bit words */
    local baseb_bytes;  /* bytes in a BASEB-bit word */
    local i;

    /* firewall */
    if (!isint(repeat) || repeat < 0) {
        quit "sq_loop: 1st arg: repeat must be an integer >= 0";
    }
    if (size(*x) != 5) {
        quit "sq_loop: 2nd arg matrix does not have 5 elements";
    }
    if (matdim(*x) != 1) {
        quit "sq_loop: 2nd arg matrix is not 1 dimensional";
    }
    if (matmin(*x, 1) != 0) {
        quit "sq_loop: 2nd arg matrix index range does not start with 0";
    }
    if (matmax(*x, 1) != 4) {
        quit "sq_loop: 2nd arg matrix index range does not end with 4";
    }

    /* all 5 elements, including the last one, must be the same length */
    baseb_bytes = config("baseb") / 8;
    len = sizeof((*x)[0]) / baseb_bytes;
    for (i=1; i < 5; ++i) {
        if ((sizeof((*x)[i]) / baseb_bytes) != len) {
            quit "sq_loop: 2nd arg matrix elements are not of equal "
                 "BASEB-bit word length";
        }
    }

    /*
     * square each value, all sets of a given length
     *
     * The squarings are deliberately unrolled so that the only loop
     * overhead being timed is that of the outer repeat loop.
     */
    start = usertime();
    for (i=0; i < repeat; ++i) {

        if (len == 1) {
            /* we use len == 1 to test this tester loop overhead */
            answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
            answer = 0^2;
            /**/
            answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
            answer = 0^2;
            /**/
            answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
            answer = 0^2;
            /**/
            answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
            answer = 0^2;
        } else {
            /* one square loop */
            answer = (*x)[0]^2;
            answer = (*x)[1]^2;
            answer = (*x)[2]^2;
            answer = (*x)[3]^2;
            answer = (*x)[4]^2;
            /**/
            answer = (*x)[0]^2;
            answer = (*x)[1]^2;
            answer = (*x)[2]^2;
            answer = (*x)[3]^2;
            answer = (*x)[4]^2;
            /**/
            answer = (*x)[0]^2;
            answer = (*x)[1]^2;
            answer = (*x)[2]^2;
            answer = (*x)[3]^2;
            answer = (*x)[4]^2;
            /**/
            answer = (*x)[0]^2;
            answer = (*x)[1]^2;
            answer = (*x)[2]^2;
            answer = (*x)[3]^2;
            answer = (*x)[4]^2;
        }
    }

    /*
     * return duration
     */
    end = usertime();
    return end-start;
}
|
|
|
|
|
|
/*
 * sq_ratio - ratio of rates of 1st and 2nd square algorithms
 *
 * given:
 *      len     length in BASEB-bit words to square (an integer > 1)
 *
 * returns:
 *      ratio of (1st / 2nd) algorithm rates
 *
 * We want to determine a rate to a precision of 1 part in 1000.
 * Most systems today return CPU time to at least 10 msec precision.
 * So to get rates to that precision, we need to time loops to at
 * least 1000 times as long as the precision (10 msec * 1000)
 * which usually requires timing of loops that last 10 seconds or more.
 *
 * The 1st algorithm is timed with config("sq2") set to 2^31-1 and the
 * 2nd algorithm is timed with config("sq2") set to 2.  The caller's
 * configuration is saved on entry and restored before returning.
 *
 * The static test_time holds the number of seconds each timed test
 * should try to last.
 *
 * NOTE: This is an internal support function that is normally
 *       not called directly from the command line.  Call the
 *       function best_sq2() instead.
 */
define sq_ratio(len)
{
    local mat x[5];     /* array of values for sq_loop to square */
    local mat one[5];   /* array of single BASEB-bit values */
    local baseb;        /* calc word size in bits */
    local orig_cfg;     /* caller configuration */
    local loops;        /* number of square loops to time */
    local tlen;         /* time to perform some number of loops */
    local tover;        /* est of time for loop overhead */
    local alg1_rate;    /* loop rate of 1st algorithm */
    local alg2_rate;    /* loop rate of 2nd algorithm */
    local ret;          /* alg1 / alg2 rate ratio to return */
    local i;

    /*
     * firewall
     */
    if (!isint(len) || len < 2) {
        quit "sq_ratio: 1st arg: len is not an integer > 1";
    }

    /*
     * remember the caller's config state
     */
    orig_cfg = config("all");
    config("mul2", 0),;
    config("sq2", 0),;
    config("pow2", 0),;
    config("redc2", 0),;
    config("tilde", 0),;

    /*
     * initialize x, the values we will square
     *
     * We want these tests to be as repeatable as possible, so we will
     * seed the PRNG in a deterministic way.
     */
    baseb = config("baseb");
    srand(sha1(sha1(baseb, config("version"))));
    for (i=0; i < 5; ++i) {
        /* force the values to be a full len words long */
        x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
                randbit(((len-1) * baseb) + baseb-2));
        /* single BASEB-bit values */
        one[i] = 1;
    }

    /*
     * determine the number of loops needed to test 1st alg
     *
     * Starting from 1 loop, double the loop count until a timing
     * run lasts at least 1 second.
     */
    config("sq2", 2^31-1),;
    loops = 1/2;
    do {
        loops *= 2;
        tlen = sq_loop(loops, &x);
        if (config("user_debug") > 3) {
            printf("\t alg1 loops %d took %.3f sec\n", loops, tlen);
        }
    } while (tlen < 1.0);

    /*
     * determine the 1st algorithm rate
     *
     * Scale the loop count so the run should last about test_time
     * seconds, but never time fewer than 16 loops.
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
        if (config("user_debug") > 1) {
            printf(" we must expand alg1 loop test time to about %d secs\n",
                   ceil(test_time * (16 / loops)));
        }
        loops = 16;
    }
    tlen = sq_loop(loops, &x);
    if (config("user_debug") > 3) {
        printf("\t alg1 time = %.3f secs\n", tlen);
    }
    tover = sq_loop(loops, &one);
    if (config("user_debug") > 3) {
        printf("\t alg1 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
        quit "sq_ratio: overhead >= loop time";
    }
    alg1_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
        printf("\tsquare alg1 rate = %.3f loopsets/sec\n", alg1_rate);
    }
    if (alg1_rate <= 0.0) {
        quit "sq_ratio: alg1 rate was <= 0.0";
    }

    /*
     * determine the number of loops needed to test 2nd alg
     */
    config("sq2", 2),;
    loops = 1/2;
    do {
        loops *= 2;
        tlen = sq_loop(loops, &x);
        if (config("user_debug") > 3) {
            printf("\t alg2 loops %d took %.3f sec\n", loops, tlen);
        }
    } while (tlen < 1.0);

    /*
     * determine the 2nd algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
        if (config("user_debug") > 1) {
            printf(" we must expand alg2 loop test time to about %d secs\n",
                   ceil(test_time * (16 / loops)));
        }
        loops = 16;
    }
    tlen = sq_loop(loops, &x);
    if (config("user_debug") > 3) {
        printf("\t alg2 time = %.3f secs\n", tlen);
    }
    tover = sq_loop(loops, &one);
    if (config("user_debug") > 3) {
        printf("\t alg2 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
        quit "sq_ratio: overhead >= loop time";
    }
    alg2_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
        printf("\tsquare alg2 rate = %.3f loopsets/sec\n", alg2_rate);
    }
    if (alg2_rate <= 0.0) {
        quit "sq_ratio: alg2 rate was <= 0.0";
    }

    /*
     * restore old config
     */
    config("all", orig_cfg),;

    /*
     * return alg1 / alg2 rate ratio
     */
    ret = alg1_rate / alg2_rate;
    if (config("user_debug") > 2) {
        /* show the precise ratio with enough digits to be meaningful */
        printf("\tprecise ratio is: %.6f sq_ratio will return: %.3f\n",
               alg1_rate / alg2_rate, ret);
    }
    return ret;
}
|
|
|
|
|
|
/*
 * best_sq2 - determine the best config("sq2") parameter
 *
 * Starting with a length of 8 BASEB-bit words, the test length is
 * doubled until the alg1/alg2 rate ratio reported by sq_ratio() is
 * no longer > 1.0.  A binary search is then performed between the last
 * two lengths.  Along the way, the length whose ratio was closest to
 * unity is remembered and is printed as the suggested value.
 *
 * returns:
 *      the midpoint (rounded down) of the final search range
 *
 * NOTE: Due to precision problems with CPU measurements, it is not
 *       unusual for the output of this function to vary slightly
 *       from run to run.
 *
 * NOTE: This function is designed to take a long time to run.
 *       We recommend setting:
 *
 *          config("user_debug", 2)
 *
 *       so that you can watch the progress of this function.
 */
define best_sq2()
{
    local ratio;        /* most recently calculated alg1/alg2 ratio */
    local low;          /* low loop value tested */
    local high;         /* high loop value tested */
    local mid;          /* between low and high */
    local best_val;     /* value found with ratio closest to unity */
    local best_ratio;   /* closest ratio found to unity */
    local expand;       /* how fast to expand the length */

    /*
     * setup
     */
    printf("WARNING: This tool may not be computing the correct best value\n");
    test_time = 5.0;
    printf("The best_sq2() function will take a LONG time to run!\n");
    printf("It is important that best_sq2() run on an othwewise idle host!\n");
    if (config("user_debug") <= 0) {
        printf("To monitor progress, set user_debug to 2: "
               "config(\"user_debug\", 2)\n");
    }
    printf("Starting with loop test time of %d secs\n", test_time);

    /*
     * firewall - must have a >1 ratio for the initial length
     */
    high = 8;
    best_val = high;
    if (config("user_debug") > 0) {
        printf("testing square alg1/alg2 ratio for len = %d\n", high);
    }
    ratio = sq_ratio(high);
    best_ratio = ratio;
    if (config("user_debug") > 1) {
        printf(" square alg1/alg2 ratio = %.3f\n", ratio);
    }
    if (ratio < 1.0) {
        quit "best_sq2: test implies sq2 < 16, which seems bogus";
    }

    /*
     * expand lengths until the ratio flips
     */
    do {
        /*
         * determine the parameters of the next ratio test
         *
         * We will multiplicatively expand our test level until
         * the ratio drops below 1.0.
         */
        expand = 2;
        low = high;
        high *= expand;
        if (config("user_debug") > 1) {
            printf(" expand the next test range by a factor of %d\n",
                   expand);
        }

        /*
         * determine the alg1/alg2 test ratio for this new length
         */
        if (high >= 2^31) {
            quit "best_sq2: tests imply sq2 >= 2^31, which seems bogus";
        }
        if (config("user_debug") > 0) {
            printf("testing square alg1/alg2 ratio for len = %d\n", high);
        }
        ratio = sq_ratio(high);
        if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
            best_val = high;
            best_ratio = ratio;
            if (config("user_debug") > 1) {
                printf(" len %d has a new cloest ratio to unity: %.6f\n",
                       best_val, best_ratio);
            }
        }
        if (config("user_debug") > 1) {
            printf(" square alg1/alg2 ratio = %.3f\n", ratio);
        }
    } while (ratio > 1.0);

    /*
     * If we previously expanded more than by a factor of 2, then
     * we may have jumped over the crossover point.  So now
     * drop down powers of two until the ratio is again >= 1.0
     *
     * NOTE: expand is currently always 2, so this code only comes
     *       into play if the expansion factor above is changed.
     */
    if (expand > 2) {
        do {

            /*
             * contract by 2
             */
            high /= 2;
            low = high / 2;
            if (config("user_debug") > 0) {
                printf("retesting square alg1/alg2 ratio for len = %d\n",
                       high);
            }
            /* retest with the square ratio, not the multiply ratio */
            ratio = sq_ratio(high);
            if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
                best_val = high;
                best_ratio = ratio;
                if (config("user_debug") > 1) {
                    printf(" len %d has a new cloest ratio to unity: %.6f\n",
                           best_val, best_ratio);
                }
            }
            if (config("user_debug") > 1) {
                printf(" square alg1/alg2 ratio = %.6f\n", ratio);
            }

        } while (ratio <= 1.0);

        /* now that the ratio flipped again, use the previous range */
        low = high;
        high = high * 2;
    }
    if (config("user_debug") > 0) {
        printf("Starting binary search between %d and %d\n", low, high);
    }

    /*
     * binary search between low and high, for where ratio is just under 1.0
     */
    while (low+1 < high) {

        /* try the mid-point */
        mid = int((low+high)/2);
        if (config("user_debug") > 0) {
            printf("testing square alg1/alg2 ratio for len = %d\n", mid);
        }
        ratio = sq_ratio(mid);
        if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
            best_val = mid;
            best_ratio = ratio;
            if (config("user_debug") > 1) {
                printf(" len %d has a new cloest ratio to unity: %.6f\n",
                       best_val, best_ratio);
            }
        }
        if (config("user_debug") > 1) {
            printf(" len %d square alg1/alg2 ratio = %.6f\n", mid, ratio);
        }

        /* stop search if near unity */
        if (close_to_one(ratio)) {
            low = mid;
            high = mid;
            if (config("user_debug") > 0) {
                printf("\twe are close enough to unity ratio at: %d\n", mid);
            }
            break;
        }

        /* bump lower range up if we went over */
        if (ratio > 1.0) {
            if (config("user_debug") > 2) {
                printf("\tmove low from %d up to %d\n",
                       low, mid);
            }
            low = mid;

        /* drop higher range down if we went under */
        } else {
            if (config("user_debug") > 2) {
                printf("\tmove high from %d down to %d\n",
                       high, mid);
            }
            high = mid;
        }

        /* report on test loop progress */
        if (config("user_debug") > 1) {
            printf("\tsetting low: %d high: %d diff: %d\n",
                   low, high, high-low);
        }
    }

    /*
     * return on the suggested config("sq2") value
     */
    mid = int((low+high)/2);
    if (config("user_debug") > 0) {
        printf("Best value for square is near %d\n", best_val);
        printf("Best square alg1/alg2 ratio is: %.6f\n", best_ratio);
        printf("We suggest placing this line in your .calcrc:\n");
        printf("config(\"sq2\", %d),;\n", best_val);
        printf("WARNING: It is believed that the output "
               "of this resource file is bogus!\n");
        printf("WARNING: You may NOT wish to follow the above suggeston.\n");
    }
    return mid;
}
|
|
|
|
|
|
/*
 * pow_loop - measure the CPU time to perform a set of pmod loops
 *
 * given:
 *      repeat  number of pmod loops to perform (an integer >= 0)
 *      x       pointer to a 1-dimensional matrix, indexed 0 thru 4, of
 *              5 values that are each the same length in BASEB-bit words
 *
 *              NOTE: When their lengths are 1 BASEB-bit word, then a
 *                    dummy loop of simple constants is used.  Thus
 *                    length == 1 is an approximation of loop overhead.
 *
 *      ex      exponent for pmod value (an integer > 2)
 *
 * returns:
 *      approximate usertime() difference needed to perform repeat
 *      pmod loops, where each loop performs 20 pmod calls
 *
 * NOTE: This is an internal support function that is normally
 *       not called directly from the command line.  Call the
 *       function best_pow2() instead.
 */
define pow_loop(repeat, x, ex)
{
    local start;        /* usertime() at the start of the timed loop */
    local end;          /* usertime() at the end of the timed loop */
    local answer;       /* pmod value, computed only so it can be timed */
    local len;          /* length of each element in BASEB-bit words */
    local baseb_bytes;  /* bytes in a BASEB-bit word */
    local i;

    /* firewall */
    if (!isint(repeat) || repeat < 0) {
        quit "pow_loop: 1st arg: repeat must be an integer >= 0";
    }
    if (size(*x) != 5) {
        quit "pow_loop: 2nd arg matrix does not have 5 elements";
    }
    if (matdim(*x) != 1) {
        quit "pow_loop: 2nd arg matrix is not 1 dimensional";
    }
    if (matmin(*x, 1) != 0) {
        quit "pow_loop: 2nd arg matrix index range does not start with 0";
    }
    if (matmax(*x, 1) != 4) {
        quit "pow_loop: 2nd arg matrix index range does not end with 4";
    }

    /* all 5 elements, including the last one, must be the same length */
    baseb_bytes = config("baseb") / 8;
    len = sizeof((*x)[0]) / baseb_bytes;
    for (i=1; i < 5; ++i) {
        if ((sizeof((*x)[i]) / baseb_bytes) != len) {
            quit "pow_loop: 2nd arg matrix elements are not of "
                 "equal BASEB-bit word length";
        }
    }
    if (!isint(ex) || ex < 3) {
        quit "pow_loop: 3rd arg ex is not an integer > 2";
    }

    /*
     * pmod pairwise, all sets of a given length
     *
     * The pmod calls are deliberately unrolled so that the only loop
     * overhead being timed is that of the outer repeat loop.
     */
    start = usertime();
    for (i=0; i < repeat; ++i) {

        if (len == 1) {
            /*
             * we use len == 1 to test this tester loop overhead
             *
             * There are 20 dummy calls here to match the 20 pmod
             * calls performed by the real loop below.
             */
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            /**/
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            /**/
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            /**/
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            /**/
            answer = pmod(0,0,0); answer = pmod(0,0,0);
            answer = pmod(0,0,0); answer = pmod(0,0,0);
        } else {
            answer = pmod((*x)[0], ex, (*x)[1]);
            answer = pmod((*x)[0], ex, (*x)[2]);
            answer = pmod((*x)[0], ex, (*x)[3]);
            answer = pmod((*x)[0], ex, (*x)[4]);
            /**/
            answer = pmod((*x)[1], ex, (*x)[0]);
            answer = pmod((*x)[1], ex, (*x)[2]);
            answer = pmod((*x)[1], ex, (*x)[3]);
            answer = pmod((*x)[1], ex, (*x)[4]);
            /**/
            answer = pmod((*x)[2], ex, (*x)[0]);
            answer = pmod((*x)[2], ex, (*x)[1]);
            answer = pmod((*x)[2], ex, (*x)[3]);
            answer = pmod((*x)[2], ex, (*x)[4]);
            /**/
            answer = pmod((*x)[3], ex, (*x)[0]);
            answer = pmod((*x)[3], ex, (*x)[1]);
            answer = pmod((*x)[3], ex, (*x)[2]);
            answer = pmod((*x)[3], ex, (*x)[4]);
            /**/
            answer = pmod((*x)[4], ex, (*x)[0]);
            answer = pmod((*x)[4], ex, (*x)[1]);
            answer = pmod((*x)[4], ex, (*x)[2]);
            answer = pmod((*x)[4], ex, (*x)[3]);
        }
    }

    /*
     * return duration
     */
    end = usertime();
    return end-start;
}
|
|
|
|
|
|
/*
 * pow_ratio - ratio of rates of 1st and 2nd pmod algorithms
 *
 * given:
 *      len     length in BASEB-bit words to pmod
 *
 * return:
 *      ratio of (1st / 2nd) algorithm rates
 *
 * We want to determine a rate to a precision of 1 part in 1000.
 * Most systems today return CPU time to at least 10 msec precision.
 * So to get rates to that precision, we need to time loops to at
 * least 1000 times as long as the precision (10 msec * 1000)
 * which usually requires timing of loops that last 10 seconds or more.
 *
 * The length of each timed test is taken from the file static test_time.
 * The caller's configuration is saved on entry and restored before the
 * ratio is returned.
 *
 * NOTE: This is an internal support function that is normally
 *       not called directly from the command line.  Call the
 *       function best_pow2() instead.
 */
define pow_ratio(len)
{
    local mat x[5];     /* array of values for pow_loop to pmod */
    local mat one[5];   /* array of single BASEB-bit values */
    local baseb;        /* calc word size in bits */
    local orig_cfg;     /* caller configuration */
    local loops;        /* number of pmod loops to time */
    local tlen;         /* time to perform some number of loops */
    local tover;        /* est of time for loop overhead */
    local alg1_rate;    /* loop rate of 1st algorithm */
    local alg2_rate;    /* loop rate of 2nd algorithm */
    local ex;           /* exponent to use in pow_loop() */
    local ret;          /* return ratio */
    local i;

    /*
     * firewall
     */
    if (!isint(len) || len < 2) {
        quit "pow_ratio: 1st arg: len is not an integer > 1";
    }

    /*
     * remember the caller's config state
     */
    orig_cfg = config("all");
    config("mul2", 0),;
    config("sq2", 0),;
    config("pow2", 0),;
    config("redc2", 0),;
    config("tilde", 0),;

    /*
     * setup
     */
    ex = 7;

    /*
     * initialize x, the values we will pmod
     *
     * We want these tests to be as repeatable as possible, so we will seed
     * the PRNG in a deterministic way.
     */
    baseb = config("baseb");
    srand(sha1(sha1(ex, baseb, config("version"))));
    for (i=0; i < 5; ++i) {
        /* force the values to be a full len words long */
        x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
                randbit(((len-1) * baseb) + baseb-2));
        /* single BASEB-bit values */
        one[i] = 1;
    }

    /*
     * determine the number of loops needed to test 1st alg
     *
     * The loop count is doubled (1, 2, 4, ...) until a timing run
     * lasts at least 1 second.
     */
    config("pow2", 2^31-1),;
    config("redc2", 2^31-1),;
    loops = 1/2;
    do {
        loops *= 2;
        tlen = pow_loop(loops, &x, ex);
        if (config("user_debug") > 3) {
            printf("\t alg1 loops %d took %.3f sec\n", loops, tlen);
        }
    } while (tlen < 1.0);

    /*
     * determine the 1st algorithm rate
     *
     * Scale the loop count so that the test lasts about test_time seconds,
     * but never time fewer than 16 loops.
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
        if (config("user_debug") > 1) {
            printf(" we must expand alg1 loop test time to about %d secs\n",
                   ceil(test_time * (16 / loops)));
        }
        loops = 16;
    }
    tlen = pow_loop(loops, &x, ex);
    if (config("user_debug") > 3) {
        printf("\t alg1 time = %.3f secs\n", tlen);
    }
    tover = pow_loop(loops, &one, ex);
    if (config("user_debug") > 3) {
        printf("\t alg1 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
        quit "pow_ratio: overhead >= loop time";
    }
    alg1_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
        printf("\tpmod alg1 rate = %.3f loopsets/sec\n", alg1_rate);
    }
    if (alg1_rate <= 0.0) {
        quit "pow_ratio: alg1 rate was <= 0.0";
    }

    /*
     * determine the number of loops needed to test 2nd alg
     */
    config("pow2", 2),;
    config("redc2", 2^31-1),;
    loops = 1/2;
    do {
        loops *= 2;
        tlen = pow_loop(loops, &x, ex);
        if (config("user_debug") > 3) {
            printf("\t alg2 loops %d took %.3f sec\n", loops, tlen);
        }
    } while (tlen < 1.0);

    /*
     * determine the 2nd algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
        if (config("user_debug") > 1) {
            printf(" we must expand alg2 loop test time to about %d secs\n",
                   ceil(test_time * (16 / loops)));
        }
        loops = 16;
    }
    tlen = pow_loop(loops, &x, ex);
    if (config("user_debug") > 3) {
        printf("\t alg2 time = %.3f secs\n", tlen);
    }
    tover = pow_loop(loops, &one, ex);
    if (config("user_debug") > 3) {
        printf("\t alg2 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
        quit "pow_ratio: overhead >= loop time";
    }
    alg2_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
        printf("\tpmod alg2 rate = %.3f loopsets/sec\n", alg2_rate);
    }
    if (alg2_rate <= 0.0) {
        quit "pow_ratio: alg2 rate was <= 0.0";
    }

    /*
     * restore old config
     */
    config("all", orig_cfg),;

    /*
     * return alg1 / alg2 rate ratio
     *
     * The precise ratio is printed with 6 decimal places so that it shows
     * more detail than the 3 decimal place value printed after it.
     */
    ret = alg1_rate / alg2_rate;
    if (config("user_debug") > 2) {
        printf("\tprecise ratio is: %.6f pow_ratio will return: %.3f\n",
               alg1_rate / alg2_rate, ret);
    }
    return ret;
}
|
|
|
|
|
|
/*
 * best_pow2 - determine the best config("pow2") parameter w/o REDC2
 *
 * returns:
 *      the length, in BASEB-bit words, where the search ended
 *
 *      The length whose alg1/alg2 ratio was closest to unity is printed
 *      as the suggested config("pow2") value when user_debug is > 0.
 *
 * NOTE: Due to precision problems with CPU measurements, it is not
 *       unusual for the output of this function to vary slightly
 *       from run to run.
 *
 * NOTE: This function is designed to take a long time to run.
 *       We recommend setting:
 *
 *          config("user_debug", 2)
 *
 *       so that you can watch the progress of this function.
 */
define best_pow2()
{
    local ratio;        /* previously calculated alg1/alg2 ratio */
    local low;          /* low loop value tested */
    local high;         /* high loop value tested */
    local mid;          /* between low and high */
    local best_val;     /* value found with ratio closest to unity */
    local best_ratio;   /* closest ratio found to unity */
    local expand;       /* how fast to expand the length */
    local looped;       /* 1 ==> we have expanded lengths before */

    /*
     * setup
     */
    printf("WARNING: This tool may not be computing the correct best value\n");
    test_time = 60.0;
    printf("The best_pow2() function will take a LONG time to run!\n");
    printf("It is important that best_pow2() run on an othwewise idle host!\n");
    if (config("user_debug") <= 0) {
        printf("To monitor progress, set user_debug to 2: "
               "config(\"user_debug\", 2)\n");
    }
    printf("Starting with loop test time of %d secs\n", test_time);

    /*
     * firewall - must have a >1.02 ratio for the initial length
     *
     * We select 1.02 because of the precision of the CPU timing.  We
     * want to first move into an area where the 1st algorithm clearly
     * dominates.
     */
    low = 4;
    high = 4;
    best_val = high;
    best_ratio = 1e10;  /* not a real value */
    do {
        high *= 4;

        /*
         * Apply the same upper bound that the expansion loop below uses,
         * so that this loop cannot keep growing the length without limit
         * when the 1st algorithm never clearly dominates.
         */
        if (high >= 2^31) {
            quit "best_pow2: test implies pow2 >= 2^31, which seems bogus";
        }
        if (config("user_debug") > 0) {
            printf("testing pmod alg1/alg2 ratio for len = %d\n", high);
        }
        ratio = pow_ratio(high);
        if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
            best_val = high;
            best_ratio = ratio;
            if (config("user_debug") > 1) {
                printf(" len %d has a new cloest ratio to unity: %.6f\n",
                       best_val, best_ratio);
            }
        }
        if (config("user_debug") > 1) {
            printf(" pmod alg1/alg2 ratio = %.3f\n", ratio);
            if (ratio > 1.0 && ratio <= 1.02) {
                printf(" while alg1 is slightly better than alg2, "
                       "it is not clearly better\n");
            }
        }
    } while (ratio <= 1.02);
    if (config("user_debug") > 0) {
        printf("starting the pow2 search above %d\n", high);
    }

    /*
     * expand lengths until the ratio flips
     */
    looped = 0;
    do {
        /*
         * determine the parameters of the next ratio test
         *
         * We will multiplicatively expand our test level until
         * the ratio drops to 1.0 or below.  The expansion factor
         * is fixed at 2.
         *
         * On every pass after the first, low is moved up to the
         * previously tested high before high is expanded.
         */
        expand = 2;
        if (looped) {
            low = high;
        }
        high *= expand;
        if (config("user_debug") > 1) {
            printf(" expand the next test range by a factor of %d\n",
                   expand);
        }

        /*
         * determine the alg1/alg2 test ratio for this new length
         */
        if (high >= 2^31) {
            quit "best_pow2: test implies pow2 >= 2^31, which seems bogus";
        }
        if (config("user_debug") > 0) {
            printf("testing pmod alg1/alg2 ratio for len = %d\n", high);
        }
        ratio = pow_ratio(high);
        if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
            best_val = high;
            best_ratio = ratio;
            if (config("user_debug") > 1) {
                printf(" len %d has a new cloest ratio to unity: %.6f\n",
                       best_val, best_ratio);
            }
        }
        if (config("user_debug") > 1) {
            printf(" pmod alg1/alg2 ratio = %.6f\n", ratio);
        }
        looped = 1;
    } while (ratio > 1.0);
    if (config("user_debug") > 0) {
        printf("Starting binary search between %d and %d\n", low, high);
    }

    /*
     * binary search between low and high, for where ratio is just under 1.0
     */
    while (low+1 < high) {

        /* try the mid-point */
        mid = int((low+high)/2);
        if (config("user_debug") > 0) {
            printf("testing pow2 alg1/alg2 ratio for len = %d\n", mid);
        }
        ratio = pow_ratio(mid);
        if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
            best_val = mid;
            best_ratio = ratio;
            if (config("user_debug") > 1) {
                printf(" len %d has a new cloest ratio to unity: %.6f\n",
                       best_val, best_ratio);
            }
        }
        if (config("user_debug") > 1) {
            printf(" len %d pmod alg1/alg2 ratio = %.6f\n", mid, ratio);
        }

        /* stop search if near unity */
        if (close_to_one(ratio)) {
            low = mid;
            high = mid;
            if (config("user_debug") > 0) {
                printf("\twe are close enough to unity ratio at: %d\n", mid);
            }
            break;
        }

        if (ratio > 1.0) {
            /* bump lower range up if we went over */
            if (config("user_debug") > 2) {
                printf("\tmove low from %d up to %d\n",
                       low, mid);
            }
            low = mid;
        } else {
            /* drop higher range down if we went under */
            if (config("user_debug") > 2) {
                printf("\tmove high from %d down to %d\n",
                       high, mid);
            }
            high = mid;
        }

        /* report on test loop progress */
        if (config("user_debug") > 1) {
            printf("\tsetting low: %d high: %d diff: %d\n",
                   low, high, high-low);
        }
    }

    /*
     * report the suggested config("pow2") value, return the search mid-point
     */
    mid = int((low+high)/2);
    if (config("user_debug") > 0) {
        printf("Best value for pmod is near %d\n", best_val);
        printf("Best pmod alg1/alg2 ratio is: %.6f\n", best_ratio);
        printf("We suggest placing this line in your .calcrc:\n");
        printf("config(\"pow2\", %d),;\n", best_val);
        printf("WARNING: It is believed that the output "
               "of this resource file is bogus!\n");
        printf("WARNING: You may NOT wish to follow the above suggeston.\n");
    }
    return mid;
}
|