calc/cal/alg_config.cal

/*
 * alg_config - help determine optimal values for algorithm levels
 *
 * Copyright (C) 2006,2014  Landon Curt Noll
 *
 * Calc is open software; you can redistribute it and/or modify it under
 * the terms of the version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * Calc is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU Lesser General
 * Public License for more details.
 *
 * A copy of version 2.1 of the GNU Lesser General Public License is
 * distributed with calc under the filename COPYING-LGPL.  You should have
 * received a copy with calc; if not, write to Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 *
 * Under source code control:	2006/06/07 14:10:11
 * File existed as early as:	2006
 *
 * chongo <was here> /\oo/\	http://www.isthe.com/chongo/
 * Share and enjoy!  :-)	http://www.isthe.com/chongo/tech/comp/calc/
 */

/*
 * test_time - target duration, in seconds, of each timed loop test
 *
 * Assigned by best_mul2() and best_sq2() before they start testing, and
 * read by mul_ratio() and sq_ratio() to scale the number of loops timed.
 */
static test_time;


/*
 * close_to_one - set to 1 if the ratio is close enough to 1
 *
 * given:
 *	ratio	the ratio of time between two algorithms
 *
 * returns:
 *	1	When ratio is near 1.0
 *	0	otherwise
 *
 * We consider the range [0.995, 1.005] to be close enough to 1 to be called
 * unity because of the precision of the CPU timing.
 */
define close_to_one(ratio)
{
    /*
     * firewall - only a real value can be compared against the bounds
     *
     * The message names this function in full, as the other functions
     * in this file do.
     */
    if (!isreal(ratio)) {
	quit "close_to_one: 1st arg: must be a real number";
    }

    /* outside of the closed interval [0.995, 1.005] is far from unity */
    if ((ratio < 0.995) || (ratio > 1.005)) {
	return 0;
    }

    /* we are close to unity */
    return 1;
}


/*
 * mul_loop - measure the CPU time to perform a set of multiply loops
 *
 * given:
 *	repeat	number of multiply loops to perform
 *	x	array of 5 values, each the same length in BASEB-bit words
 *
 *		NOTE: When their lengths are 1 BASEB-bit word, then a
 *		      dummy loop of simple constants are used.  Thus the
 *		      length == 1 is an approximation of loop overhead.
 *
 * returns:
 *	approximate CPU runtime to perform repeat multiply loops
 *
 * NOTE: This is an internal support function that is normally
 *	 not called directly from the command line.  Call the
 *	 function best_mul2() instead.
 */
define mul_loop(repeat, x)
{
    local start;	/* usertime() before the timed loop */
    local end;		/* usertime() after the timed loop */
    local answer;	/* product, computed only for its cost */
    local len;		/* sizeof() of each element in BASEB-bit words */
    local baseb_bytes;	/* bytes in a BASEB-bit word */
    local i;

    /*
     * firewall
     *
     * A repeat of 0 is accepted (it times an empty loop), so the
     * message states >= 0 to match the check.
     */
    if (!isint(repeat) || repeat < 0) {
	quit "mul_loop: 1st arg: repeat must be an integer >= 0";
    }
    if (size(*x) != 5) {
	quit "mul_loop: 2nd arg matrix does not have 5 elements";
    }
    if (matdim(*x) != 1) {
	quit "mul_loop: 2nd arg matrix is not 1 dimensional";
    }
    if (matmin(*x, 1) != 0) {
	quit "mul_loop: 2nd arg matrix index range does not start with 0";
    }
    if (matmax(*x, 1) != 4) {
	quit "mul_loop: 2nd arg matrix index range does not end with 4";
    }

    /*
     * all 5 elements must have the same length as element 0
     *
     * The loop runs through index 4 so that the last element is
     * checked as well.
     */
    baseb_bytes = config("baseb") / 8;
    len = sizeof((*x)[0]) / baseb_bytes;
    for (i=1; i < 5; ++i) {
	if ((sizeof((*x)[i]) / baseb_bytes) != len) {
	    quit "mul_loop: 2nd arg matrix elements are not of "
	         "equal BASEB-bit word length";
	}
    }

    /*
     * multiply pairwise, all sets of a given length
     *
     * Both branches perform 20 multiplies per pass so that the
     * len == 1 case approximates the overhead of the other case.
     * The multiplies are left unrolled so that no inner loop cost
     * is added to the timed region.
     */
    start = usertime();
    for (i=0; i < repeat; ++i) {

	if (len == 1) {
	    /* we use len == 1 to test this tester loop overhead */
	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
	    /**/
	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
	    /**/
	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
	    /**/
	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
	    /**/
	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
	} else {
	    /* every ordered pair (j, k) with j != k */
	    answer = (*x)[0] * (*x)[1];
	    answer = (*x)[0] * (*x)[2];
	    answer = (*x)[0] * (*x)[3];
	    answer = (*x)[0] * (*x)[4];
	    /**/
	    answer = (*x)[1] * (*x)[0];
	    answer = (*x)[1] * (*x)[2];
	    answer = (*x)[1] * (*x)[3];
	    answer = (*x)[1] * (*x)[4];
	    /**/
	    answer = (*x)[2] * (*x)[0];
	    answer = (*x)[2] * (*x)[1];
	    answer = (*x)[2] * (*x)[3];
	    answer = (*x)[2] * (*x)[4];
	    /**/
	    answer = (*x)[3] * (*x)[0];
	    answer = (*x)[3] * (*x)[1];
	    answer = (*x)[3] * (*x)[2];
	    answer = (*x)[3] * (*x)[4];
	    /**/
	    answer = (*x)[4] * (*x)[0];
	    answer = (*x)[4] * (*x)[1];
	    answer = (*x)[4] * (*x)[2];
	    answer = (*x)[4] * (*x)[3];
	}
    }

    /*
     * return duration
     */
    end = usertime();
    return end-start;
}


/*
 * mul_ratio - ratio of rates of 1st and 2nd multiply algorithms
 *
 * given:
 *	len	length in BASEB-bit words to multiply
 *
 * return:
 *	ratio of (1st / 2nd) algorithm rate.
 *
 * We want to determine a rate to a precision of 1 part in 1000.
 * Most systems today return CPU time to at least 10 msec precision.
 * So to get rates to that precision, we need to time loops to at
 * least 1000 times as long as the precision (10 msec * 1000)
 * which usually requires timing of loops that last 10 seconds or more.
 *
 * NOTE: This is an internal support function that is normally
 *	 not called directly from the command line.  Call the
 *	 function best_mul2() instead.
 */
define mul_ratio(len)
{
    local mat x[5];		/* array of values for mul_loop to multiply */
    local mat one[5];		/* array of single BASEB-bit values */
    local baseb;		/* calc word size in bits */
    local orig_cfg;		/* caller configuration */
    local loops;		/* number of multiply loops to time */
    local tlen;			/* time to perform some number of loops */
    local tover;		/* est of time for loop overhead */
    local alg1_rate;		/* loop rate of 1st algorithm */
    local alg2_rate;		/* loop rate of 2nd algorithm */
    local ret;			/* alg1 / alg2 rate ratio to return */
    local i;

    /*
     * firewall
     */
    if (!isint(len) || len < 2) {
	quit "mul_ratio: 1st arg: len is not an integer > 1";
    }

    /*
     * remember the caller's config state
     *
     * The algorithm levels and tilde printing are cleared here; the
     * saved state is restored just before the ratio is returned.
     */
    orig_cfg = config("all");
    config("mul2", 0),;
    config("sq2", 0),;
    config("pow2", 0),;
    config("redc2", 0),;
    config("tilde", 0),;

    /*
     * initialize x, the values we will multiply
     *
     * We want these tests to be repeatable as possible, so we will seed
     * the PRNG in a deterministic way.
     */
    baseb = config("baseb");
    srand(sha1(sha1(baseb, config("version"))));
    for (i=0; i < 5; ++i) {
	/* force the values to be a full len words long */
	x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
		    randbit(((len-1) * baseb) + baseb-2));
	/* single BASEB-bit values */
        one[i] = 1;
    }

    /*
     * determine the number of loops needed to test 1st alg
     *
     * loops starts at 1/2 so that the first doubling tests 1 loop;
     * we keep doubling until a run takes at least 1 second.
     */
    config("mul2", 2^31-1),;
    loops = 1/2;
    do {
	loops *= 2;
	tlen = mul_loop(loops, &x);
	if (config("user_debug") > 3) {
	    printf("\t    alg1 loops %d took %.3f sec\n", loops, tlen);
	}
    } while (tlen < 1.0);

    /*
     * determine the 1st algorithm rate
     *
     * Scale the loop count to last about test_time seconds, but
     * never time fewer than 16 loops.
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
	if (config("user_debug") > 1) {
	    printf("    we must expand alg1 loop test time to about %d secs\n",
		ceil(test_time * (16 / loops)));
	}
	loops = 16;
    }
    if (config("user_debug") > 3) {
	printf("\t    will try alg1 %d loops\n", loops);
    }
    tlen = mul_loop(loops, &x);
    if (config("user_debug") > 3) {
	printf("\t    alg1 time = %.3f secs\n", tlen);
    }
    /* the same number of loops on 1-word values estimates the overhead */
    tover = mul_loop(loops, &one);
    if (config("user_debug") > 3) {
	printf("\t    alg1 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
	quit "mul_ratio: overhead >= loop time";
    }
    alg1_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
	printf("\tmultiply alg1 rate = %.3f loopsets/sec\n", alg1_rate);
    }
    if (alg1_rate <= 0.0) {
	quit "mul_ratio: alg1 rate was <= 0.0";
    }

    /*
     * determine the number of loops needed to test 2nd alg
     */
    config("mul2", 2),;
    loops = 1/2;
    do {
	loops *= 2;
	tlen = mul_loop(loops, &x);
	if (config("user_debug") > 3) {
	    printf("\t    alg2 loops %d took %.3f sec\n", loops, tlen);
	}
    } while (tlen < 1.0);

    /*
     * determine the 2nd algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
	if (config("user_debug") > 1) {
	    printf("    we must expand alg2 loop test time to about %d secs\n",
		ceil(test_time * (16 / loops)));
	}
	loops = 16;
    }
    tlen = mul_loop(loops, &x);
    if (config("user_debug") > 3) {
	printf("\t    alg2 time = %.3f secs\n", tlen);
    }
    tover = mul_loop(loops, &one);
    if (config("user_debug") > 3) {
	printf("\t    alg2 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
	quit "mul_ratio: overhead >= loop time";
    }
    alg2_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
	printf("\tmultiply alg2 rate = %.3f loopsets/sec\n", alg2_rate);
    }
    if (alg2_rate <= 0.0) {
	quit "mul_ratio: alg2 rate was <= 0.0";
    }

    /*
     * restore old config
     */
    config("all", orig_cfg),;

    /*
     * return alg1 / alg2 rate ratio
     *
     * The precise ratio is printed to 6 decimal places; a bare %.f
     * would print it with no fractional digits at all.
     */
    ret = alg1_rate / alg2_rate;
    if (config("user_debug") > 2) {
	printf("\tprecise ratio is: %.6f mul_ratio will return: %.3f\n",
		alg1_rate / alg2_rate, ret);
    }
    return ret;
}


/*
 * best_mul2 - determine the best config("mul2") parameter
 *
 * NOTE: Due to precision problems with CPU measurements, it is not
 *	 unusual for the output of this function to vary slightly
 *	 from run to run.
 *
 * NOTE: This function is designed to take a long time to run.
 *	  We recommend setting:
 *
 *		config("user_debug", 2)
 *
 *	  so that you can watch the progress of this function.
 */
define best_mul2()
{
    local ratio;	/* most recently calculated alg1/alg2 ratio */
    local low;		/* low loop value tested */
    local high;		/* high loop value tested */
    local mid;		/* between low and high */
    local best_val;	/* value found with ratio closest to unity */
    local best_ratio;	/* closest ratio found to unity */
    local expand;	/* how fast to expand the length */

    /*
     * setup
     */
    printf("WARNING: This tool may not be computing the correct best value\n");
    test_time = 5.0;
    printf("The best_mul2() function will take a LONG time to run!\n");
    printf("It is important that best_mul2() run on an othwewise idle host!\n");
    if (config("user_debug") <= 0) {
	printf("To monitor progress, set user_debug to 2: "
	       "config(\"user_debug\", 2)\n");
    }
    printf("Starting with loop test time of %d secs\n", test_time);

    /*
     * firewall - must have a >1 ratio for the initial length
     *
     * The initial length tested is 8, so a ratio below 1.0 here
     * implies a mul2 value below 8.
     */
    high = 8;
    best_val = high;
    if (config("user_debug") > 0) {
	printf("testing multiply alg1/alg2 ratio for len = %d\n", high);
    }
    ratio = mul_ratio(high);
    best_ratio = ratio;
    if (config("user_debug") > 1) {
	printf("    multiply alg1/alg2 ratio = %.6f\n", ratio);
    }
    if (ratio < 1.0) {
	quit "best_mul2: tests imply mul2 should be < 8, which seems bogus";
    }

    /*
     * expand lengths until the ratio flips
     */
    do {
	/*
	 * determine the parameters of the next ratio test
	 *
	 * We will multiplicatively expand our test level until
	 * the ratio drops below 1.0.
	 */
	expand = 2;
	low = high;
	high *= expand;
	if (config("user_debug") > 1) {
	    printf("    expand the next test range by a factor of %d\n",
	           expand);
	}

	/*
	 * determine the alg1/alg2 test ratio for this new length
	 */
	if (high >= 2^31) {
	    quit "best_mul2: test implies mul2 >= 2^31, which seems bogus";
	}
	if (config("user_debug") > 0) {
	    printf("testing multiply alg1/alg2 ratio for len = %d\n", high);
	}
	ratio = mul_ratio(high);
	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
	    best_val = high;
	    best_ratio = ratio;
	    if (config("user_debug") > 1) {
		printf("    len %d has a new cloest ratio to unity: %.6f\n",
		       best_val, best_ratio);
	    }
	}
	if (config("user_debug") > 1) {
	    printf("    multiply alg1/alg2 ratio = %.6f\n", ratio);
	}
    } while (ratio > 1.0);

    /*
     * If we previously expanded more than by a factor of 2, then
     * we may have jumped over the crossover point.  So now
     * drop down powers of two until the ratio is again >= 1.0
     *
     * NOTE: expand is always 2 above, so this branch only runs if
     *	     the expansion factor is changed.
     */
    if (expand > 2) {
	do {

	    /*
	     * contract by 2
	     */
	    high /= 2;
	    low = high / 2;
	    if (config("user_debug") > 0) {
		printf("retesting multiply alg1/alg2 ratio for len = %d\n",
		       high);
	    }
	    ratio = mul_ratio(high);
	    if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
		best_val = high;
		best_ratio = ratio;
		if (config("user_debug") > 1) {
		    printf("    len %d has a new cloest ratio to unity: %.6f\n",
			   best_val, best_ratio);
		}
	    }
	    if (config("user_debug") > 1) {
		printf("    multiply alg1/alg2 ratio = %.6f\n", ratio);
	    }

	} while (ratio <= 1.0);

	/* now that the ratio flipped again, use the previous range */
	low = high;
	high = high * 2;
    }
    if (config("user_debug") > 0) {
	printf("Starting binary search between %d and %d\n", low, high);
    }

    /*
     * binary search between low and high, for where ratio is just under 1.0
     */
    while (low+1 < high) {

	/* try the mid-point */
	mid = int((low+high)/2);
	if (config("user_debug") > 0) {
	    printf("testing multiply alg1/alg2 ratio for len = %d\n", mid);
	}
	ratio = mul_ratio(mid);
	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
	    best_val = mid;
	    best_ratio = ratio;
	    if (config("user_debug") > 1) {
		printf("    len %d has a new cloest ratio to unity: %.6f\n",
		       best_val, best_ratio);
	    }
	}
	if (config("user_debug") > 1) {
	    printf("    len %d multiply alg1/alg2 ratio = %.6f\n", mid, ratio);
	}

	/* stop search if near unity */
	if (close_to_one(ratio)) {
	    low = mid;
	    high = mid;
	    if (config("user_debug") > 0) {
		printf("\twe are close enough to unity ratio at: %d\n", mid);
	    }
	    break;
	}

	/* bump lower range up if we went over */
	if (ratio > 1.0) {
	    if (config("user_debug") > 2) {
		printf("\tmove low from %d up to %d\n",
		    low, mid);
	    }
	    low = mid;

	/* drop higher range down if we went under */
	} else {
	    if (config("user_debug") > 2) {
		printf("\tmove high from %d down to %d\n",
		     high, mid);
	    }
	    high = mid;
	}

	/* report on test loop progress */
	if (config("user_debug") > 1) {
	    printf("\tsetting low: %d high: %d diff: %d\n",
		   low, high, high-low);
	}
    }

    /*
     * return on the suggested config("mul2") value
     *
     * The mid-point is recomputed from the final low and high, as
     * best_sq2() does, so the return value does not depend on which
     * mid-point happened to be tested last.
     */
    mid = int((low+high)/2);
    if (config("user_debug") > 0) {
	printf("Best value for multiply is near %d\n", best_val);
	printf("Best multiply alg1/alg2 ratio is: %.6f\n", best_ratio);
	printf("We suggest placing this line in your .calcrc:\n");
	printf("config(\"mul2\", %d),;\n", best_val);
	printf("WARNING: It is believed that the output "
	       "of this resource file is bogus!\n");
	printf("WARNING: You may NOT wish to follow the above suggeston.\n");
    }
    return mid;
}


/*
 * sq_loop - measure the CPU time to perform a set of square loops
 *
 * given:
 *	repeat	number of square loops to perform
 *	x	array of 5 values, each the same length in BASEB-bit words
 *
 *		NOTE: When their lengths are 1 BASEB-bit word, then a
 *		      dummy loop of simple constants are used.  Thus the
 *		      length == 1 is an approximation of loop overhead.
 * returns:
 *	approximate CPU runtime to perform repeat square loops
 *
 * NOTE: This is an internal support function that is normally
 *	 not called directly from the command line.  Call the
 *	 function best_sq2() instead.
 */
define sq_loop(repeat, x)
{
    local start;	/* usertime() before the timed loop */
    local end;		/* usertime() after the timed loop */
    local answer;	/* squared value, computed only for its cost */
    local len;		/* sizeof() of each element in BASEB-bit words */
    local baseb_bytes;	/* bytes in a BASEB-bit word */
    local i;

    /*
     * firewall
     *
     * A repeat of 0 is accepted (it times an empty loop), so the
     * message states >= 0 to match the check.
     */
    if (!isint(repeat) || repeat < 0) {
	quit "sq_loop: 1st arg: repeat must be an integer >= 0";
    }
    if (size(*x) != 5) {
	quit "sq_loop: 2nd arg matrix does not have 5 elements";
    }
    if (matdim(*x) != 1) {
	quit "sq_loop: 2nd arg matrix is not 1 dimensional";
    }
    if (matmin(*x, 1) != 0) {
	quit "sq_loop: 2nd arg matrix index range does not start with 0";
    }
    if (matmax(*x, 1) != 4) {
	quit "sq_loop: 2nd arg matrix index range does not end with 4";
    }

    /*
     * all 5 elements must have the same length as element 0
     *
     * The loop runs through index 4 so that the last element is
     * checked as well.
     */
    baseb_bytes = config("baseb") / 8;
    len = sizeof((*x)[0]) / baseb_bytes;
    for (i=1; i < 5; ++i) {
	if ((sizeof((*x)[i]) / baseb_bytes) != len) {
	    quit "sq_loop: 2nd arg matrix elements are not of equal "
	         "BASEB-bit word length";
	}
    }

    /*
     * square each value 4 times per pass
     *
     * Both branches perform 20 squarings per pass so that the
     * len == 1 case approximates the overhead of the other case.
     */
    start = usertime();
    for (i=0; i < repeat; ++i) {

	if (len == 1) {
	    /* we use len == 1 to test this tester loop overhead */
	    answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
	    answer = 0^2;
	    /**/
	    answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
	    answer = 0^2;
	    /**/
	    answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
	    answer = 0^2;
	    /**/
	    answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
	    answer = 0^2;
	} else {
	    /* one square loop */
	    answer = (*x)[0]^2;
	    answer = (*x)[1]^2;
	    answer = (*x)[2]^2;
	    answer = (*x)[3]^2;
	    answer = (*x)[4]^2;
	    /**/
	    answer = (*x)[0]^2;
	    answer = (*x)[1]^2;
	    answer = (*x)[2]^2;
	    answer = (*x)[3]^2;
	    answer = (*x)[4]^2;
	    /**/
	    answer = (*x)[0]^2;
	    answer = (*x)[1]^2;
	    answer = (*x)[2]^2;
	    answer = (*x)[3]^2;
	    answer = (*x)[4]^2;
	    /**/
	    answer = (*x)[0]^2;
	    answer = (*x)[1]^2;
	    answer = (*x)[2]^2;
	    answer = (*x)[3]^2;
	    answer = (*x)[4]^2;
	}
    }

    /*
     * return duration
     */
    end = usertime();
    return end-start;
}


/*
 * sq_ratio - ratio of rates of 1st and 2nd square algorithms
 *
 * given:
 *	len	length in BASEB-bit words to square
 *
 * return:
 *	ratio of (1st / 2nd) algorithm rates
 *
 * We want to determine a rate to a precision of 1 part in 1000.
 * Most systems today return CPU time to at least 10 msec precision.
 * So to get rates to that precision, we need to time loops to at
 * least 1000 times as long as the precision (10 msec * 1000)
 * which usually requires timing of loops that last 10 seconds or more.
 *
 * NOTE: This is an internal support function that is normally
 *	 not called directly from the command line.  Call the
 *	 function best_sq2() instead.
 */
define sq_ratio(len)
{
    local mat x[5];		/* array of values for sq_loop to square */
    local mat one[5];		/* array of single BASEB-bit values */
    local baseb;		/* calc word size in bits */
    local orig_cfg;		/* caller configuration */
    local loops;		/* number of square loops to time */
    local tlen;			/* time to perform some number of loops */
    local tover;		/* est of time for loop overhead */
    local alg1_rate;		/* loop rate of 1st algorithm */
    local alg2_rate;		/* loop rate of 2nd algorithm */
    local ret;			/* alg1 / alg2 rate ratio to return */
    local i;

    /*
     * firewall
     */
    if (!isint(len) || len < 2) {
	quit "sq_ratio: 1st arg: len is not an integer > 1";
    }

    /*
     * remember the caller's config state
     *
     * The algorithm levels and tilde printing are cleared here; the
     * saved state is restored just before the ratio is returned.
     */
    orig_cfg = config("all");
    config("mul2", 0),;
    config("sq2", 0),;
    config("pow2", 0),;
    config("redc2", 0),;
    config("tilde", 0),;

    /*
     * initialize x, the values we will square
     *
     * We want these tests to be repeatable as possible, so we will seed
     * the PRNG in a deterministic way.
     */
    baseb = config("baseb");
    srand(sha1(sha1(baseb, config("version"))));
    for (i=0; i < 5; ++i) {
	/* force the values to be a full len words long */
	x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
		    randbit(((len-1) * baseb) + baseb-2));
	/* single BASEB-bit values */
        one[i] = 1;
    }

    /*
     * determine the number of loops needed to test 1st alg
     *
     * loops starts at 1/2 so that the first doubling tests 1 loop;
     * we keep doubling until a run takes at least 1 second.
     */
    config("sq2", 2^31-1),;
    loops = 1/2;
    do {
	loops *= 2;
	tlen = sq_loop(loops, &x);
	if (config("user_debug") > 3) {
	    printf("\t    alg1 loops %d took %.3f sec\n", loops, tlen);
	}
    } while (tlen < 1.0);

    /*
     * determine the 1st algorithm rate
     *
     * Scale the loop count to last about test_time seconds, but
     * never time fewer than 16 loops.
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
	if (config("user_debug") > 1) {
	    printf("    we must expand alg1 loop test time to about %d secs\n",
		ceil(test_time * (16 / loops)));
	}
	loops = 16;
    }
    tlen = sq_loop(loops, &x);
    if (config("user_debug") > 3) {
	printf("\t    alg1 time = %.3f secs\n", tlen);
    }
    /* the same number of loops on 1-word values estimates the overhead */
    tover = sq_loop(loops, &one);
    if (config("user_debug") > 3) {
	printf("\t    alg1 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
	quit "sq_ratio: overhead >= loop time";
    }
    alg1_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
	printf("\tsquare alg1 rate = %.3f loopsets/sec\n", alg1_rate);
    }
    if (alg1_rate <= 0.0) {
	quit "sq_ratio: alg1 rate was <= 0.0";
    }

    /*
     * determine the number of loops needed to test 2nd alg
     */
    config("sq2", 2),;
    loops = 1/2;
    do {
	loops *= 2;
	tlen = sq_loop(loops, &x);
	if (config("user_debug") > 3) {
	    printf("\t    alg2 loops %d took %.3f sec\n", loops, tlen);
	}
    } while (tlen < 1.0);

    /*
     * determine the 2nd algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
	if (config("user_debug") > 1) {
	    printf("    we must expand alg2 loop test time to about %d secs\n",
		ceil(test_time * (16 / loops)));
	}
	loops = 16;
    }
    tlen = sq_loop(loops, &x);
    if (config("user_debug") > 3) {
	printf("\t    alg2 time = %.3f secs\n", tlen);
    }
    tover = sq_loop(loops, &one);
    if (config("user_debug") > 3) {
	printf("\t    alg2 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
	quit "sq_ratio: overhead >= loop time";
    }
    alg2_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
	printf("\tsquare alg2 rate = %.3f loopsets/sec\n", alg2_rate);
    }
    if (alg2_rate <= 0.0) {
	quit "sq_ratio: alg2 rate was <= 0.0";
    }

    /*
     * restore old config
     */
    config("all", orig_cfg),;

    /*
     * return alg1 / alg2 rate ratio
     *
     * The precise ratio is printed to 6 decimal places; a bare %.f
     * would print it with no fractional digits at all.
     */
    ret = alg1_rate / alg2_rate;
    if (config("user_debug") > 2) {
	printf("\tprecise ratio is: %.6f sq_ratio will return: %.3f\n",
		alg1_rate / alg2_rate, ret);
    }
    return ret;
}


/*
 * best_sq2 - determine the best config("sq2") parameter
 *
 * NOTE: Due to precision problems with CPU measurements, it is not
 *	 unusual for the output of this function to vary slightly
 *	 from run to run.
 *
 * NOTE: This function is designed to take a long time to run.
 *	  We recommend setting:
 *
 *		config("user_debug", 2)
 *
 *	  so that you can watch the progress of this function.
 */
define best_sq2()
{
    local ratio;	/* most recently calculated alg1/alg2 ratio */
    local low;		/* low loop value tested */
    local high;		/* high loop value tested */
    local mid;		/* between low and high */
    local best_val;	/* value found with ratio closest to unity */
    local best_ratio;	/* closest ratio found to unity */
    local expand;	/* how fast to expand the length */

    /*
     * setup
     */
    printf("WARNING: This tool may not be computing the correct best value\n");
    test_time = 5.0;
    printf("The best_sq2() function will take a LONG time to run!\n");
    printf("It is important that best_sq2() run on an othwewise idle host!\n");
    if (config("user_debug") <= 0) {
	printf("To monitor progress, set user_debug to 2: "
	       "config(\"user_debug\", 2)\n");
    }
    printf("Starting with loop test time of %d secs\n", test_time);

    /*
     * firewall - must have a >1 ratio for the initial length
     *
     * The initial length tested is 8, so a ratio below 1.0 here
     * implies a sq2 value below 8.
     */
    high = 8;
    best_val = high;
    if (config("user_debug") > 0) {
	printf("testing square alg1/alg2 ratio for len = %d\n", high);
    }
    ratio = sq_ratio(high);
    best_ratio = ratio;
    if (config("user_debug") > 1) {
	printf("    square alg1/alg2 ratio = %.3f\n", ratio);
    }
    if (ratio < 1.0) {
	quit "best_sq2: test implies sq2 < 8, which seems bogus";
    }

    /*
     * expand lengths until the ratio flips
     */
    do {
	/*
	 * determine the parameters of the next ratio test
	 *
	 * We will multiplicatively expand our test level until
	 * the ratio drops below 1.0.
	 */
	expand = 2;
	low = high;
	high *= expand;
	if (config("user_debug") > 1) {
	    printf("    expand the next test range by a factor of %d\n",
		   expand);
	}

	/*
	 * determine the alg1/alg2 test ratio for this new length
	 */
	if (high >= 2^31) {
	    quit "best_sq2: tests imply sq2 >= 2^31, which seems bogus";
	}
	if (config("user_debug") > 0) {
	    printf("testing square alg1/alg2 ratio for len = %d\n", high);
	}
	ratio = sq_ratio(high);
	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
	    best_val = high;
	    best_ratio = ratio;
	    if (config("user_debug") > 1) {
		printf("    len %d has a new cloest ratio to unity: %.6f\n",
		       best_val, best_ratio);
	    }
	}
	if (config("user_debug") > 1) {
	    printf("    square alg1/alg2 ratio = %.3f\n", ratio);
	}
    } while (ratio > 1.0);

    /*
     * If we previously expanded more than by a factor of 2, then
     * we may have jumped over the crossover point.  So now
     * drop down powers of two until the ratio is again >= 1.0
     *
     * NOTE: expand is always 2 above, so this branch only runs if
     *	     the expansion factor is changed.  It retests with
     *	     sq_ratio(), the square test, not the multiply test.
     */
    if (expand > 2) {
	do {

	    /*
	     * contract by 2
	     */
	    high /= 2;
	    low = high / 2;
	    if (config("user_debug") > 0) {
		printf("retesting square alg1/alg2 ratio for len = %d\n",
		       high);
	    }
	    ratio = sq_ratio(high);
	    if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
		best_val = high;
		best_ratio = ratio;
		if (config("user_debug") > 1) {
		    printf("    len %d has a new cloest ratio to unity: %.6f\n",
			   best_val, best_ratio);
		}
	    }
	    if (config("user_debug") > 1) {
		printf("    square alg1/alg2 ratio = %.6f\n", ratio);
	    }

	} while (ratio <= 1.0);

	/* now that the ratio flipped again, use the previous range */
	low = high;
	high = high * 2;
    }
    if (config("user_debug") > 0) {
	printf("Starting binary search between %d and %d\n", low, high);
    }

    /*
     * binary search between low and high, for where ratio is just under 1.0
     */
    while (low+1 < high) {

	/* try the mid-point */
	mid = int((low+high)/2);
	if (config("user_debug") > 0) {
	    printf("testing square alg1/alg2 ratio for len = %d\n", mid);
	}
	ratio = sq_ratio(mid);
	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
	    best_val = mid;
	    best_ratio = ratio;
	    if (config("user_debug") > 1) {
		printf("    len %d has a new cloest ratio to unity: %.6f\n",
		       best_val, best_ratio);
	    }
	}
	if (config("user_debug") > 1) {
	    printf("    len %d square alg1/alg2 ratio = %.6f\n", mid, ratio);
	}

	/* stop search if near unity */
	if (close_to_one(ratio)) {
	    low = mid;
	    high = mid;
	    if (config("user_debug") > 0) {
		printf("\twe are close enough to unity ratio at: %d\n", mid);
	    }
	    break;
	}

	/* bump lower range up if we went over */
	if (ratio > 1.0) {
	    if (config("user_debug") > 2) {
		printf("\tmove low from %d up to %d\n",
		    low, mid);
	    }
	    low = mid;

	/* drop higher range down if we went under */
	} else {
	    if (config("user_debug") > 2) {
		printf("\tmove high from %d down to %d\n",
		     high, mid);
	    }
	    high = mid;
	}

	/* report on test loop progress */
	if (config("user_debug") > 1) {
	    printf("\tsetting low: %d high: %d diff: %d\n",
		   low, high, high-low);
	}
    }

    /*
     * return on the suggested config("sq2") value
     *
     * The value returned is the mid-point of the final low and high.
     */
    mid = int((low+high)/2);
    if (config("user_debug") > 0) {
	printf("Best value for square is near %d\n", best_val);
	printf("Best square alg1/alg2 ratio is: %.6f\n", best_ratio);
	printf("We suggest placing this line in your .calcrc:\n");
	printf("config(\"sq2\", %d),;\n", best_val);
	printf("WARNING: It is believed that the output "
	       "of this resource file is bogus!\n");
	printf("WARNING: You may NOT wish to follow the above suggeston.\n");
    }
    return mid;
}


/*
 * pow_loop - measure the CPU time to perform a set of pmod loops
 *
 * given:
 *	repeat	number of pmod loops to perform
 *	x	array of 5 values, each the same length in BASEB-bit words
 *
 *		NOTE: When their lengths are 1 BASEB-bit word, then a
 *		      dummy loop of simple constants are used.  Thus the
 *		      length == 1 is an approximation of loop overhead.
 *
 *	ex	exponent for pmod value
 *
 * returns:
 *	approximate runtime to perform a pmod loop
 *
 * NOTE: This is an internal support function that is normally
 *	 not called directly from the command line.  Call the
 *	 function best_pow2() instead.
 */
define pow_loop(repeat, x, ex)
{
    local start;	/* usertime() before the timed loop */
    local end;		/* usertime() after the timed loop */
    local answer;	/* pmod value, computed only for its cost */
    local len;		/* sizeof() of each element in BASEB-bit words */
    local baseb_bytes;	/* bytes in a BASEB-bit word */
    local i;

    /*
     * firewall
     *
     * A repeat of 0 is accepted (it times an empty loop), so the
     * message states >= 0 to match the check.
     */
    if (!isint(repeat) || repeat < 0) {
	quit "pow_loop: 1st arg: repeat must be an integer >= 0";
    }
    if (size(*x) != 5) {
	quit "pow_loop: 2nd arg matrix does not have 5 elements";
    }
    if (matdim(*x) != 1) {
	quit "pow_loop: 2nd arg matrix is not 1 dimensional";
    }
    if (matmin(*x, 1) != 0) {
	quit "pow_loop: 2nd arg matrix index range does not start with 0";
    }
    if (matmax(*x, 1) != 4) {
	quit "pow_loop: 2nd arg matrix index range does not end with 4";
    }

    /*
     * all 5 elements must have the same length as element 0
     *
     * The loop runs through index 4 so that the last element is
     * checked as well.
     */
    baseb_bytes = config("baseb") / 8;
    len = sizeof((*x)[0]) / baseb_bytes;
    for (i=1; i < 5; ++i) {
	if ((sizeof((*x)[i]) / baseb_bytes) != len) {
	    quit "pow_loop: 2nd arg matrix elements are not of "
	         "equal BASEB-bit word length";
	}
    }
    if (!isint(ex) || ex < 3) {
	quit "pow_loop: 3rd arg ex is not an integer > 2";
    }

    /*
     * pmod pairwise, all sets of a given length
     *
     * Both branches perform 20 pmod calls per pass so that the
     * len == 1 case approximates the overhead of the other case.
     */
    start = usertime();
    for (i=0; i < repeat; ++i) {

	if (len == 1) {
	    /* we use len == 1 to test this tester loop overhead */
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    /**/
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    /**/
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    /**/
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    /**/
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	} else {
	    /* every ordered pair (base j, modulus k) with j != k */
	    answer = pmod((*x)[0], ex, (*x)[1]);
	    answer = pmod((*x)[0], ex, (*x)[2]);
	    answer = pmod((*x)[0], ex, (*x)[3]);
	    answer = pmod((*x)[0], ex, (*x)[4]);
	    /**/
	    answer = pmod((*x)[1], ex, (*x)[0]);
	    answer = pmod((*x)[1], ex, (*x)[2]);
	    answer = pmod((*x)[1], ex, (*x)[3]);
	    answer = pmod((*x)[1], ex, (*x)[4]);
	    /**/
	    answer = pmod((*x)[2], ex, (*x)[0]);
	    answer = pmod((*x)[2], ex, (*x)[1]);
	    answer = pmod((*x)[2], ex, (*x)[3]);
	    answer = pmod((*x)[2], ex, (*x)[4]);
	    /**/
	    answer = pmod((*x)[3], ex, (*x)[0]);
	    answer = pmod((*x)[3], ex, (*x)[1]);
	    answer = pmod((*x)[3], ex, (*x)[2]);
	    answer = pmod((*x)[3], ex, (*x)[4]);
	    /**/
	    answer = pmod((*x)[4], ex, (*x)[0]);
	    answer = pmod((*x)[4], ex, (*x)[1]);
	    answer = pmod((*x)[4], ex, (*x)[2]);
	    answer = pmod((*x)[4], ex, (*x)[3]);
	}
    }

    /*
     * return duration
     */
    end = usertime();
    return end-start;
}


/*
 * pow_ratio - ratio of rates of 1st and 2nd pmod algorithms
 *
 * given:
 *	len	length in BASEB-bit words to pmod
 *
 * return:
 *	ratio of (1st / 2nd) algorithm rates
 *
 * We want to determine a rate to a precision of 1 part in 1000.
 * Most systems today return CPU time to at least 10 msec precision.
 * So to get rates to that precision, we need to time loops to at
 * least 1000 times as long as the precision (10 msec * 1000)
 * which usually requires timing of loops that last 10 seconds or more.
 *
 * NOTE: This is an internal support function that is normally
 *	 not called directly from the command line.  Call the
 *	 function best_pow2() instead.
 */
define pow_ratio(len)
{
    local mat x[5];		/* full-length values for pow_loop to pmod */
    local mat one[5];		/* single-word values, used to time loop overhead */
    local baseb;		/* calc word size in bits */
    local orig_cfg;		/* caller configuration */
    local loops;		/* number of pmod loops to time */
    local tlen;			/* time to perform some number of loops */
    local tover;		/* est of time for loop overhead */
    local alg1_rate;		/* loop rate of 1st algorithm */
    local alg2_rate;		/* loop rate of 2nd algorithm */
    local ex;			/* exponent to use in pow_loop() */
    local ret;			/* alg1/alg2 rate ratio that is returned */
    local i;

    /*
     * firewall
     */
    if (!isint(len) || len < 2) {
	quit "pow_ratio: 1st arg: len is not an integer > 1";
    }

    /*
     * remember the caller's config state
     *
     * The saved state is put back just before the ratio is computed
     * and returned.  Until then the algorithm levels and the tilde
     * setting are forced to the values set below.
     */
    orig_cfg = config("all");
    config("mul2", 0),;
    config("sq2", 0),;
    config("pow2", 0),;
    config("redc2", 0),;
    config("tilde", 0),;

    /*
     * setup
     */
    ex = 7;

    /*
     * initialize x, the values we will pmod
     *
     * We want these tests to be repeatable as possible, so we will seed
     * the PRNG in a deterministic way.
     */
    baseb = config("baseb");
    srand(sha1(sha1(ex, baseb, config("version"))));
    for (i=0; i < 5; ++i) {
	/* force the values to be a full len words long (top bit is set) */
	x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
		    randbit(((len-1) * baseb) + baseb-2));
	/* single BASEB-bit values */
	one[i] = 1;
    }

    /*
     * determine the number of loops needed to test 1st alg
     *
     * pow2 and redc2 are both raised to 2^31-1 for the 1st algorithm
     * timing.  The loop count is doubled until one timed call of
     * pow_loop() lasts at least 1 second.
     */
    config("pow2", 2^31-1),;
    config("redc2", 2^31-1),;
    loops = 1/2;
    do {
	loops *= 2;
	tlen = pow_loop(loops, &x, ex);
	if (config("user_debug") > 3) {
	    printf("\t    alg1 loops %d took %.3f sec\n", loops, tlen);
	}
    } while (tlen < 1.0);

    /*
     * determine the 1st algorithm rate
     *
     * Scale the loop count so that the timed run lasts roughly
     * test_time seconds (test_time is the file-level static), but
     * never time fewer than 16 loops.
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
	if (config("user_debug") > 1) {
	    printf("    we must expand alg1 loop test time to about %d secs\n",
		ceil(test_time * (16 / loops)));
	}
	loops = 16;
    }
    tlen = pow_loop(loops, &x, ex);
    if (config("user_debug") > 3) {
	printf("\t    alg1 time = %.3f secs\n", tlen);
    }
    /* the same number of loops over single-word values gives the overhead */
    tover = pow_loop(loops, &one, ex);
    if (config("user_debug") > 3) {
	printf("\t    alg1 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
	quit "pow_ratio: overhead >= loop time";
    }
    alg1_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
	printf("\tpmod alg1 rate = %.3f loopsets/sec\n", alg1_rate);
    }
    if (alg1_rate <= 0.0) {
	quit "pow_ratio: alg1 rate was <= 0.0";
    }

    /*
     * determine the number of loops needed to test 2nd alg
     *
     * pow2 is dropped to 2 for the 2nd algorithm timing while redc2
     * stays at 2^31-1.  As before, the loop count is doubled until
     * one timed call of pow_loop() lasts at least 1 second.
     */
    config("pow2", 2),;
    config("redc2", 2^31-1),;
    loops = 1/2;
    do {
	loops *= 2;
	tlen = pow_loop(loops, &x, ex);
	if (config("user_debug") > 3) {
	    printf("\t    alg2 loops %d took %.3f sec\n", loops, tlen);
	}
    } while (tlen < 1.0);

    /*
     * determine the 2nd algorithm rate
     *
     * Same scaling as for the 1st algorithm: aim for about test_time
     * seconds, with a floor of 16 loops.
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 16) {
	if (config("user_debug") > 1) {
	    printf("    we must expand alg2 loop test time to about %d secs\n",
		ceil(test_time * (16 / loops)));
	}
	loops = 16;
    }
    tlen = pow_loop(loops, &x, ex);
    if (config("user_debug") > 3) {
	printf("\t    alg2 time = %.3f secs\n", tlen);
    }
    tover = pow_loop(loops, &one, ex);
    if (config("user_debug") > 3) {
	printf("\t    alg2 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
	quit "pow_ratio: overhead >= loop time";
    }
    alg2_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
	printf("\tpmod alg2 rate = %.3f loopsets/sec\n", alg2_rate);
    }
    if (alg2_rate <= 0.0) {
	quit "pow_ratio: alg2 rate was <= 0.0";
    }

    /*
     * restore old config
     */
    config("all", orig_cfg),;

    /*
     * return alg1 / alg2 rate ratio
     *
     * The "precise" value is printed with 6 decimal places so that it
     * really does show more digits than the 3 place value next to it.
     */
    ret = alg1_rate / alg2_rate;
    if (config("user_debug") > 2) {
	printf("\tprecise ratio is: %.6f pow_ratio will return: %.3f\n",
		alg1_rate / alg2_rate, ret);
    }
    return ret;
}


/*
 * best_pow2 - determine the best config("pow2") parameter w/o REDC2
 *
 * NOTE: Due to precision problems with CPU measurements, it is not
 *	 unusual for the output of this function to vary slightly
 *	 from run to run.
 *
 * NOTE: This function is designed to take a long time to run.
 *	  We recommend setting:
 *
 *		config("user_debug", 2)
 *
 *	  so that you can watch the progress of this function.
 */
define best_pow2()
{
    local ratio;	/* most recently calculated alg1/alg2 ratio */
    local low;		/* lower search bound: a length measured with ratio > 1.0 */
    local high;		/* length under test, later the upper search bound */
    local mid;		/* between low and high */
    local best_val;	/* value found with ratio closest to unity */
    local best_ratio;	/* closest ratio found to unity */
    local expand;	/* how fast to expand the length */

    /*
     * setup
     *
     * test_time is the file-level static that pow_ratio() reads to
     * decide how long each timed run should last.
     */
    printf("WARNING: This tool may not be computing the correct best value\n");
    test_time = 60.0;
    printf("The best_pow2() function will take a LONG time to run!\n");
    printf("It is important that best_pow2() run on an othwewise idle host!\n");
    if (config("user_debug") <= 0) {
	printf("To monitor progress, set user_debug to 2: "
	       "config(\"user_debug\", 2)\n");
    }
    printf("Starting with loop test time of %d secs\n", test_time);

    /*
     * firewall - must have a >1.02 ratio for the initial length
     *
     * We select 1.02 because of the precision of the CPU timing.  We
     * want to first move into an area where the 1st algorithm clearly
     * dominates.
     *
     * The length is multiplied by 4 on each pass (16, 64, ...).  The
     * same 2^31 limit used by the expansion loop below is applied here
     * so that this loop cannot grow the length without bound when the
     * ratio never rises above 1.02.
     */
    low = 4;
    high = 4;
    best_val = high;
    best_ratio = 1e10;	/* not a real value */
    do {
	high *= 4;
	if (high >= 2^31) {
	    quit "best_pow2: test implies pow2 >= 2^31, which seems bogus";
	}
	if (config("user_debug") > 0) {
	    printf("testing pmod alg1/alg2 ratio for len = %d\n", high);
	}
	ratio = pow_ratio(high);
	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
	    best_val = high;
	    best_ratio = ratio;
	    if (config("user_debug") > 1) {
		printf("    len %d has a new cloest ratio to unity: %.6f\n",
		       best_val, best_ratio);
	    }
	}
	if (config("user_debug") > 1) {
	    printf("    pmod alg1/alg2 ratio = %.3f\n", ratio);
	    if (ratio > 1.0 && ratio <= 1.02) {
		printf("    while alg1 is slightly better than alg2, "
		       "it is not clearly better\n");
	    }
	}
    } while (ratio <= 1.02);
    if (config("user_debug") > 0) {
	printf("starting the pow2 search above %d\n", high);
    }

    /*
     * expand lengths until the ratio flips
     */
    do {
	/*
	 * determine the parameters of the next ratio test
	 *
	 * We double the test length until the ratio drops to
	 * 1.0 or below.
	 *
	 * On entry to every pass, high is a length whose measured
	 * ratio was > 1.0 (> 1.02 on the first pass, thanks to the
	 * firewall loop above).  It therefore becomes the lower
	 * bound, so that the binary search below starts from a
	 * length that was actually measured.
	 */
	low = high;
	expand = 2;
	high *= expand;
	if (config("user_debug") > 1) {
	    printf("    expand the next test range by a factor of %d\n",
		   expand);
	}

	/*
	 * determine the alg1/alg2 test ratio for this new length
	 */
	if (high >= 2^31) {
	    quit "best_pow2: test implies pow2 >= 2^31, which seems bogus";
	}
	if (config("user_debug") > 0) {
	    printf("testing pmod alg1/alg2 ratio for len = %d\n", high);
	}
	ratio = pow_ratio(high);
	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
	    best_val = high;
	    best_ratio = ratio;
	    if (config("user_debug") > 1) {
		printf("    len %d has a new cloest ratio to unity: %.6f\n",
		       best_val, best_ratio);
	    }
	}
	if (config("user_debug") > 1) {
	    printf("    pmod alg1/alg2 ratio = %.6f\n", ratio);
	}
    } while (ratio > 1.0);
    if (config("user_debug") > 0) {
	printf("Starting binary search between %d and %d\n", low, high);
    }

    /*
     * binary search between low and high, for where ratio is just under 1.0
     *
     * At this point the ratio measured at low was > 1.0 and the ratio
     * measured at high was <= 1.0.
     */
    while (low+1 < high) {

	/* try the mid-point */
	mid = int((low+high)/2);
	if (config("user_debug") > 0) {
	    printf("testing pow2 alg1/alg2 ratio for len = %d\n", mid);
	}
	ratio = pow_ratio(mid);
	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
	    best_val = mid;
	    best_ratio = ratio;
	    if (config("user_debug") > 1) {
		printf("    len %d has a new cloest ratio to unity: %.6f\n",
		       best_val, best_ratio);
	    }
	}
	if (config("user_debug") > 1) {
	    printf("    len %d pmod alg1/alg2 ratio = %.6f\n", mid, ratio);
	}

	/* stop search if near unity */
	if (close_to_one(ratio)) {
	    low = mid;
	    high = mid;
	    if (config("user_debug") > 0) {
		printf("\twe are close enough to unity ratio at: %d\n", mid);
	    }
	    break;
	}

	/* bump lower range up if we went over */
	if (ratio > 1.0) {
	    if (config("user_debug") > 2) {
		printf("\tmove low from %d up to %d\n",
		    low, mid);
	    }
	    low = mid;

	/* drop higher range down if we went under */
	} else {
	    if (config("user_debug") > 2) {
		printf("\tmove high from %d down to %d\n",
		     high, mid);
	    }
	    high = mid;
	}

	/* report on test loop progress */
	if (config("user_debug") > 1) {
	    printf("\tsetting low: %d high: %d diff: %d\n",
		   low, high, high-low);
	}
    }

    /*
     * return on the suggested config("pow2") value
     *
     * NOTE(review): the value returned is the final search mid-point,
     *		     while the value printed in the suggested .calcrc
     *		     line is best_val.  The two can differ; confirm
     *		     which one callers are meant to use.
     */
    mid = int((low+high)/2);
    if (config("user_debug") > 0) {
	printf("Best value for pmod is near %d\n", best_val);
	printf("Best pmod alg1/alg2 ratio is: %.6f\n", best_ratio);
	printf("We suggest placing this line in your .calcrc:\n");
	printf("config(\"pow2\", %d),;\n", best_val);
	printf("WARNING: It is believed that the output "
	       "of this resource file is bogus!\n");
	printf("WARNING: You may NOT wish to follow the above suggeston.\n");
    }
    return mid;
}