calc/cal/alg_config.cal

/*
 * alg_config - help determine optimal values for algorithm levels
 *
 * Copyright (C) 2006  Landon Curt Noll
 *
 * Calc is open software; you can redistribute it and/or modify it under
 * the terms of the version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * Calc is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU Lesser General
 * Public License for more details.
 *
 * A copy of version 2.1 of the GNU Lesser General Public License is
 * distributed with calc under the filename COPYING-LGPL.  You should have
 * received a copy with calc; if not, write to Free Software Foundation, Inc.
 * 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
 *
 * @(#) $Revision: 29.15 $
 * @(#) $Id: alg_config.cal,v 29.15 2006/12/16 11:18:46 chongo Exp $
 * @(#) $Source: /usr/local/src/cmd/calc/cal/RCS/alg_config.cal,v $
 *
 * Under source code control:	2006/06/07 14:10:11
 * File existed as early as:	2006
 *
 * chongo <was here> /\oo/\	http://www.isthe.com/chongo/
 * Share and enjoy!  :-)	http://www.isthe.com/chongo/tech/comp/calc/
 */


global test_time;	/* try for this many seconds in loop test */


/*
 * mul_loop - measure the CPU time to perform a set of multiply loops
 *
 * given:
 *	repeat	number of multiply loops to perform
 *	x	array of 5 values, each the same length in BASEB-bit words
 *
 *		NOTE: When their lengths are 1 BASEB-bit word, then a
 *		      dummy loop of simple constants are used.  Thus the
 *		      length == 1 is an approximation of loop overhead.
 *
 * returns:
 *	approximate runtime to perform repeat the multiply loops
 *
 * NOTE: This is an internal support function that is normally
 *	 not called directly from the command line.  Call the
 *	 function best_mul2() instead.
 */
define mul_loop(repeat, x)
{
    local start;	/* start of execution */
    local end;		/* end of execution */
    local answer;	/* multiplicand */
    local len;		/* length of each element */
    local baseb_bytes;	/* bytes in a BASEB-bit word */
    local i;

    /* firewall */
    if (!isint(repeat) || repeat < 0) {
	quit "mul_loop: 1st arg: repeat must be an integer > 0";
    }
    if (size(*x) != 5) {
	quit "mul_loop: 2nd arg matrix does not have 5 elements";
    }
    if (matdim(*x) != 1) {
	quit "mul_loop: 2nd arg matrix is not 1 dimensional";
    }
    if (matmin(*x, 1) != 0) {
	quit "mul_loop: 2nd arg matrix index range does not start with 0";
    }
    if (matmax(*x, 1) != 4) {
	quit "mul_loop: 2nd arg matrix index range does not end with 4";
    }

    baseb_bytes = config("baseb") / 8;
    len = sizeof((*x)[0]) / baseb_bytes;
    for (i=1; i < 4; ++i) {
	if ((sizeof((*x)[i]) / baseb_bytes) != len) {
	    quit "mul_loop: 2nd arg matrix elements are not of equal BASEB-bit word length";
	}
    }

    /* multiply pairwise, all sets of a given length */
    start = usertime();
    for (i=0; i < repeat; ++i) {

	if (len == 1) {
	    /* we use len == 1 to test this tester loop overhead */
	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
	    /**/
	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
	    /**/
	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
	    /**/
	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
	    /**/
	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
	} else {
	    answer = (*x)[0] * (*x)[1];
	    answer = (*x)[0] * (*x)[2];
	    answer = (*x)[0] * (*x)[3];
	    answer = (*x)[0] * (*x)[4];
	    /**/
	    answer = (*x)[1] * (*x)[0];
	    answer = (*x)[1] * (*x)[2];
	    answer = (*x)[1] * (*x)[3];
	    answer = (*x)[1] * (*x)[4];
	    /**/
	    answer = (*x)[2] * (*x)[0];
	    answer = (*x)[2] * (*x)[1];
	    answer = (*x)[2] * (*x)[3];
	    answer = (*x)[2] * (*x)[4];
	    /**/
	    answer = (*x)[3] * (*x)[0];
	    answer = (*x)[3] * (*x)[1];
	    answer = (*x)[3] * (*x)[2];
	    answer = (*x)[3] * (*x)[4];
	    /**/
	    answer = (*x)[4] * (*x)[0];
	    answer = (*x)[4] * (*x)[1];
	    answer = (*x)[4] * (*x)[2];
	    answer = (*x)[4] * (*x)[3];
	}
    }

    /*
     * return duration
     */
    end = usertime();
    return end-start;
}


/*
 * mul_ratio - ratio of rates of 1st and 2nd multiply algorithms
 *
 * given:
 *	len	length in BASEB-bit words to multiply
 *
 * return:
 *	ratio of (1st / 2nd) algorithm rate
 *
 * When want to determine a rate to a precision of 1 part in 1000.
 * Most systems today return CPU time to at least 10 msec precision.
 * So to get rates to that precision, we need to time loops to at
 * least 1000 times as long as the precision (10 msec * 1000)
 * which usually requires timing of loops that last 10 seconds or more.
 *
 * NOTE: This is an internal support function that is normally
 *	 not called directly from the command line.  Call the
 *	 function best_mul2() instead.
 */
define mul_ratio(len)
{
    local mat x[5];		/* array of values for mul_loop to multiply */
    local mat one[5];		/* array if single BASEB-bit values */
    local baseb;		/* calc word size in bits */
    local orig_cfg;		/* caller configuration */
    local loops;		/* number of multiply loops to time */
    local tlen;			/* time to perform some number of loops */
    local tover;		/* est of time for loop overhead */
    local alg1_rate;		/* loop rate of 1st algorithm */
    local alg2_rate;		/* loop rate of 2nd algorithm */
    local i;

    /*
     * firewall
     */
    if (!isint(len) || len < 2) {
	quit "mul_ratio: 1st arg: len is not an integer > 1";
    }

    /*
     * remember the caller's config state
     */
    orig_cfg = config("all");
    config("mul2", 0),;
    config("sq2", 0),;
    config("pow2", 0),;
    config("redc2", 0),;
    config("tilde", 0),;

    /*
     * initialize x, the values we will multiply
     *
     * We want these tests to be repeatable as possible, so we will seed
     * the PRNG in a deterministic way.
     */
    baseb = config("baseb");
    srand(sha1(sha1(baseb, config("version"))));
    for (i=0; i < 5; ++i) {
	/* force the values to be a full len words long */
	x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
		    randbit(((len-1) * baseb) + baseb-2));
	/* single BASEB-bit values */
        one[i] = 1;
    }

    /*
     * determine the number of loops needed to test 1st alg
     */
    config("mul2", 2^31-1),;
    loops = 1/2;
    do {
	loops *= 2;
	tlen = mul_loop(loops, &x);
	if (config("user_debug") > 3) {
	    printf("\t    alg1 loops %d took %.3f sec\n", loops, tlen);
	}
    } while (tlen < 1.0);

    /*
     * determine the 1st algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 8) {
	if (config("user_debug") > 1) {
	    printf("    we must expand loop test time to more than %d secs\n",
		ceil(test_time * (8 / loops)));
	}
	loops = 8;
    }
    if (config("user_debug") > 3) {
	printf("\t    will try alg1 %d loops\n", loops);
    }
    tlen = mul_loop(loops, &x);
    if (config("user_debug") > 3) {
	printf("\t    alg1 time = %.3f secs\n", tlen);
    }
    tover = mul_loop(loops, &one);
    if (config("user_debug") > 3) {
	printf("\t    alg1 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
	quit "mul_ratio: overhead >= loop time";
    }
    alg1_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
	printf("\tmultiply alg1 rate = %.3f loopsets/sec\n", alg1_rate);
    }
    if (alg1_rate <= 0.0) {
	quit "mul_ratio: alg1 rate was <= 0.0";
    }

    /*
     * determine the number of loops needed to test 1st alg
     */
    config("mul2", 2),;
    loops = 1/2;
    do {
	loops *= 2;
	tlen = mul_loop(loops, &x);
	if (config("user_debug") > 3) {
	    printf("\t    alg2 loops %d took %.3f sec\n", loops, tlen);
	}
    } while (tlen < 1.0);

    /*
     * determine the 2nd algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 8) {
	if (config("user_debug") > 1) {
	    printf("    we must expand loop test time to more than %d secs\n",
		ceil(test_time * (8 / loops)));
	}
	loops = 8;
    }
    tlen = mul_loop(loops, &x);
    if (config("user_debug") > 3) {
	printf("\t    alg2 time = %.3f secs\n", tlen);
    }
    tover = mul_loop(loops, &one);
    if (config("user_debug") > 3) {
	printf("\t    alg2 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
	quit "mul_ratio: overhead >= loop time";
    }
    alg2_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
	printf("\tmultiply alg2 rate = %.3f loopsets/sec\n", alg2_rate);
    }
    if (alg2_rate <= 0.0) {
	quit "mul_ratio: alg2 rate was <= 0.0";
    }

    /*
     * restore old config
     */
    config("all", orig_cfg),;

    /*
     * return alg1 / alg2 rate ratio
     */
    return (alg1_rate / alg2_rate);
}


/*
 * best_mul2 - determine the best config("mul2") parameter
 *
 * NOTE: Due to precision problems with CPU measurements, it is not
 *	 unusual for the output of this function to vary slightly
 *	 from run to run.
 *
 * NOTE: This function is designed to take a long time to run.
 *	  We recommend setting:
 *
 *		config("user_debug", 2)
 *
 *	  so that yon can watch the progress of this function.
 */
define best_mul2()
{
    local ratio;	/* previously calculated alg1/alg2 ratio */
    local low;		/* low loop value tested */
    local high;		/* low loop value tested */
    local expand;	/* how fast to expand the length */

    /*
     * setup
     */
    test_time = 15.0;
    if (config("user_debug") > 0) {
	printf("will start with loop test time of %d secs\n", test_time);
    }

    /*
     * firewall - must have a >1 ratio for the initial length
     */
    high = 16;
    if (config("user_debug") > 0) {
	printf("testing multiply alg1/alg2 ratio for len = %d\n", high);
    }
    ratio = mul_ratio(high);
    if (config("user_debug") > 1) {
	printf("    multiply alg1/alg2 ratio = %.3f\n", ratio);
    }
    if (ratio <= 1.0) {
	quit "best_mul2: tests imply config(\"mul2\") should be < 4";
    }

    /*
     * expand lengths until the ratio flips
     */
    do {
	/*
	 * determine the paramters of the next ratio test
	 *
	 * We will multiplicatively expand our test level until
	 * the ratio drops below 1.0.
	 */
	expand = ((ratio >= 3.5) ? 16 : 2^round(ratio));
	low = high;
	high *= expand;
	if (config("user_debug") > 1) {
	    printf("    expand the next test range by a factor of %d\n",
	           expand);
	}

	/*
	 * determine the alg1/alg2 test ratio for this new length
	 */
	if (high >= 2^31) {
	    quit "best_mul2: tests imply config(\"mul2\") should be >= 2^31";
	}
	if (config("user_debug") > 0) {
	    printf("testing multiply alg1/alg2 ratio for len = %d\n", high);
	}
	ratio = mul_ratio(high);
	if (config("user_debug") > 1) {
	    printf("    multiply alg1/alg2 ratio = %.3f\n", ratio);
	}
    } while (ratio >= 1.0);
    if (config("user_debug") > 0) {
	printf("alg1/alg2 ratio now < 1.0, starting binary search between %d and %d\n",
	    low, high);
    }

    /*
     * binary search between low and high, for where ratio is just under 1.0
     */
    while (low+1 < high) {

    	/* try the mid-point */
	if (config("user_debug") > 0) {
	    printf("testing multiply alg1/alg2 ratio for len = %d\n",
	    	int((low+high)/2));
	}
	ratio = mul_ratio(int((low+high)/2));
	if (config("user_debug") > 1) {
	    printf("    multiply alg1/alg2 ratio = %.3f\n", ratio);
	}

	/* bump lower range up if we went over */
	if (ratio >= 1.0) {
	    if (config("user_debug") > 2) {
	    	printf("\tmove low from %d up to %d\n",
		    low, int((low+high)/2));
	    }
	    low = int((low+high)/2);

	/* drop higher range down if we went under */
	} else {
	    if (config("user_debug") > 2) {
	    	printf("\tmove high from %d down to %d\n",
		     high, int((low+high)/2));
	    }
	    high = int((low+high)/2);
	}
    }

    /*
     * return on the suggested config("mul2") value
     */
    if (config("user_debug") > 0) {
	printf("best value of config(\"mul2\") is %d\n",
	    (ratio >= 1.0) ? high : low);
    }
    return ((ratio >= 1.0) ? high : low);
}


/*
 * sq_loop - measure the CPU time to perform a set of square loops
 *
 * given:
 *	repeat	number of square loops to perform
 *	x	array of 5 values, each the same length in BASEB-bit words
 *
 *		NOTE: When their lengths are 1 BASEB-bit word, then a
 *		      dummy loop of simple constants are used.  Thus the
 *		      length == 1 is an approximation of loop overhead.
 * returns:
 *	approximate runtime to perform a square loop
 *
 * NOTE: This is an internal support function that is normally
 *	 not called directly from the command line.  Call the
 *	 function best_sq2() instead.
 */
define sq_loop(repeat, x)
{
    local start;	/* start of execution */
    local end;		/* end of execution */
    local answer;	/* squared value */
    local len;		/* length of each element */
    local baseb_bytes;	/* bytes in a BASEB-bit word */
    local i;

    /* firewall */
    if (!isint(repeat) || repeat < 0) {
	quit "sq_loop: 1st arg: repeat must be an integer > 0";
    }
    if (size(*x) != 5) {
	quit "sq_loop: 2nd arg matrix does not have 5 elements";
    }
    if (matdim(*x) != 1) {
	quit "sq_loop: 2nd arg matrix is not 1 dimensional";
    }
    if (matmin(*x, 1) != 0) {
	quit "sq_loop: 2nd arg matrix index range does not start with 0";
    }
    if (matmax(*x, 1) != 4) {
	quit "sq_loop: 2nd arg matrix index range does not end with 4";
    }
    baseb_bytes = config("baseb") / 8;
    len = sizeof((*x)[0]) / baseb_bytes;
    for (i=1; i < 4; ++i) {
	if ((sizeof((*x)[i]) / baseb_bytes) != len) {
	    quit "sq_loop: 2nd arg matrix elements are not of equal BASEB-bit word length";
	}
    }

    /* square pairwise, all sets of a given length */
    start = usertime();
    for (i=0; i < repeat; ++i) {

	if (len == 1) {
	    /* we use len == 1 to test this tester loop overhead */
	    answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
	    answer = 0^2;
	    /**/
	    answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
	    answer = 0^2;
	    /**/
	    answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
	    answer = 0^2;
	    /**/
	    answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
	    answer = 0^2;
	} else {
	    /* one square loop */
	    answer = (*x)[0]^2;
	    answer = (*x)[1]^2;
	    answer = (*x)[2]^2;
	    answer = (*x)[3]^2;
	    answer = (*x)[4]^2;
	    /**/
	    answer = (*x)[0]^2;
	    answer = (*x)[1]^2;
	    answer = (*x)[2]^2;
	    answer = (*x)[3]^2;
	    answer = (*x)[4]^2;
	    /**/
	    answer = (*x)[0]^2;
	    answer = (*x)[1]^2;
	    answer = (*x)[2]^2;
	    answer = (*x)[3]^2;
	    answer = (*x)[4]^2;
	    /**/
	    answer = (*x)[0]^2;
	    answer = (*x)[1]^2;
	    answer = (*x)[2]^2;
	    answer = (*x)[3]^2;
	    answer = (*x)[4]^2;
	}
    }

    /*
     * return duration
     */
    end = usertime();
    return end-start;
}


/*
 * sq_ratio - ratio of rates of 1st and 2nd square algorithms
 *
 * given:
 *	len	length in BASEB-bit words to square
 *
 * return:
 *	ratio of (1st / 2nd) algorithm rates
 *
 * When want to determine a rate to a precision of 1 part in 1000.
 * Most systems today return CPU time to at least 10 msec precision.
 * So to get rates to that precision, we need to time loops to at
 * least 1000 times as long as the precision (10 msec * 1000)
 * which usually requires timing of loops that last 10 seconds or more.
 *
 * NOTE: This is an internal support function that is normally
 *	 not called directly from the command line.  Call the
 *	 function best_sq2() instead.
 */
define sq_ratio(len)
{
    local mat x[5];		/* array of values for sq_loop to square */
    local mat one[5];		/* array if single BASEB-bit values */
    local baseb;		/* calc word size in bits */
    local orig_cfg;		/* caller configuration */
    local loops;		/* number of square loops to time */
    local tlen;			/* time to perform some number of loops */
    local tover;		/* est of time for loop overhead */
    local alg1_rate;		/* loop rate of 1st algorithm */
    local alg2_rate;		/* loop rate of 2nd algorithm */
    local i;

    /*
     * firewall
     */
    if (!isint(len) || len < 2) {
	quit "sq_ratio: 1st arg: len is not an integer > 1";
    }

    /*
     * remember the caller's config state
     */
    orig_cfg = config("all");
    config("mul2", 0),;
    config("sq2", 0),;
    config("pow2", 0),;
    config("redc2", 0),;
    config("tilde", 0),;

    /*
     * initialize x, the values we will square
     *
     * We want these tests to be repeatable as possible, so we will seed
     * the PRNG in a deterministic way.
     */
    baseb = config("baseb");
    srand(sha1(sha1(baseb, config("version"))));
    for (i=0; i < 5; ++i) {
	/* force the values to be a full len words long */
	x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
		    randbit(((len-1) * baseb) + baseb-2));
	/* single BASEB-bit values */
        one[i] = 1;
    }

    /*
     * determine the number of loops needed to test 1st alg
     */
    config("sq2", 2^31-1),;
    loops = 1/2;
    do {
	loops *= 2;
	tlen = sq_loop(loops, &x);
	if (config("user_debug") > 3) {
	    printf("\t    alg1 loops %d took %.3f sec\n", loops, tlen);
	}
    } while (tlen < 1.0);

    /*
     * determine the 1st algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 8) {
	if (config("user_debug") > 1) {
	    printf("    we must expand loop test time to more than %d secs\n",
		ceil(test_time * (8 / loops)));
	}
	loops = 8;
    }
    tlen = sq_loop(loops, &x);
    if (config("user_debug") > 3) {
	printf("\t    alg1 time = %.3f secs\n", tlen);
    }
    tover = sq_loop(loops, &one);
    if (config("user_debug") > 3) {
	printf("\t    alg1 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
	quit "sq_ratio: overhead >= loop time";
    }
    alg1_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
	printf("\tsquare alg1 rate = %.3f loopsets/sec\n", alg1_rate);
    }
    if (alg1_rate <= 0.0) {
	quit "sq_ratio: alg1 rate was <= 0.0";
    }

    /*
     * determine the number of loops needed to test 1st alg
     */
    config("sq2", 2),;
    loops = 1/2;
    do {
	loops *= 2;
	tlen = sq_loop(loops, &x);
	if (config("user_debug") > 3) {
	    printf("\t    alg2 loops %d took %.3f sec\n", loops, tlen);
	}
    } while (tlen < 1.0);

    /*
     * determine the 2nd algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 8) {
	if (config("user_debug") > 1) {
	    printf("    we must expand loop test time to more than %d secs\n",
		ceil(test_time * (8 / loops)));
	}
	loops = 8;
    }
    tlen = sq_loop(loops, &x);
    if (config("user_debug") > 3) {
	printf("\t    alg2 time = %.3f secs\n", tlen);
    }
    tover = sq_loop(loops, &one);
    if (config("user_debug") > 3) {
	printf("\t    alg2 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
	quit "sq_ratio: overhead >= loop time";
    }
    alg2_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
	printf("\tsquare alg2 rate = %.3f loopsets/sec\n", alg2_rate);
    }
    if (alg2_rate <= 0.0) {
	quit "sq_ratio: alg2 rate was <= 0.0";
    }

    /*
     * restore old config
     */
    config("all", orig_cfg),;

    /*
     * return alg1 / alg2 rate ratio
     */
    return (alg1_rate / alg2_rate);
}


/*
 * best_sq2 - determine the best config("sq2") parameter
 *
 * NOTE: Due to precision problems with CPU measurements, it is not
 *	 unusual for the output of this function to vary slightly
 *	 from run to run.
 *
 * NOTE: This function is designed to take a long time to run.
 *	  We recommend setting:
 *
 *		config("user_debug", 2)
 *
 *	  so that yon can watch the progress of this function.
 */
define best_sq2()
{
    local ratio;	/* previously calculated alg1/alg2 ratio */
    local low;		/* low loop value tested */
    local high;		/* low loop value tested */
    local expand;	/* how fast to expand the length */

    /*
     * setup
     */
    test_time = 15.0;
    if (config("user_debug") > 0) {
	printf("will start with loop test time of %d secs\n", test_time);
    }

    /*
     * firewall - must have a >1 ratio for the initial length
     */
    high = 16;
    if (config("user_debug") > 0) {
	printf("testing square alg1/alg2 ratio for len = %d\n", high);
    }
    ratio = sq_ratio(high);
    if (config("user_debug") > 1) {
	printf("    square alg1/alg2 ratio = %.3f\n", ratio);
    }
    if (ratio <= 1.0) {
	quit "best_sq2: tests imply config(\"sq2\") should be < 4";
    }

    /*
     * expand lengths until the ratio flips
     */
    do {
	/*
	 * determine the paramters of the next ratio test
	 *
	 * We will multiplicatively expand our test level until
	 * the ratio drops below 1.0.
	 */
	expand = ((ratio >= 3.5) ? 16 : 2^round(ratio));
	low = high;
	high *= expand;
	if (config("user_debug") > 1) {
	    printf("    expand the next test range by a factor of %d\n",
	           expand);
	}

	/*
	 * determine the alg1/alg2 test ratio for this new length
	 */
	if (high >= 2^31) {
	    quit "best_sq2: tests imply config(\"sq2\") should be >= 2^31";
	}
	if (config("user_debug") > 0) {
	    printf("testing square alg1/alg2 ratio for len = %d\n", high);
	}
	ratio = sq_ratio(high);
	if (config("user_debug") > 1) {
	    printf("    square alg1/alg2 ratio = %.3f\n", ratio);
	}
    } while (ratio >= 1.0);
    if (config("user_debug") > 0) {
	printf("alg1/alg2 ratio now < 1.0, starting binary search between %d and %d\n",
	    low, high);
    }

    /*
     * binary search between low and high, for where ratio is just under 1.0
     */
    while (low+1 < high) {

    	/* try the mid-point */
	if (config("user_debug") > 0) {
	    printf("testing square alg1/alg2 ratio for len = %d\n",
	    	int((low+high)/2));
	}
	ratio = sq_ratio(int((low+high)/2));
	if (config("user_debug") > 1) {
	    printf("    square alg1/alg2 ratio = %.3f\n", ratio);
	}

	/* bump lower range up if we went over */
	if (ratio >= 1.0) {
	    if (config("user_debug") > 2) {
	    	printf("\tmove low from %d up to %d\n",
		    low, int((low+high)/2));
	    }
	    low = int((low+high)/2);

	/* drop higher range down if we went under */
	} else {
	    if (config("user_debug") > 2) {
	    	printf("\tmove high from %d down to %d\n",
		     high, int((low+high)/2));
	    }
	    high = int((low+high)/2);
	}
    }

    /*
     * return on the suggested config("sq2") value
     */
    if (config("user_debug") > 0) {
	printf("best value of config(\"sq2\") is %d\n",
	    (ratio >= 1.0) ? high : low);
    }
    return ((ratio >= 1.0) ? high : low);
}


/*
 * pow_loop - measure the CPU time to perform a set of pmod loops
 *
 * given:
 *	repeat	number of pmod loops to perform
 *	x	array of 5 values, each the same length in BASEB-bit words
 *
 *		NOTE: When their lengths are 1 BASEB-bit word, then a
 *		      dummy loop of simple constants are used.  Thus the
 *		      length == 1 is an approximation of loop overhead.
 *
 *	ex	exponent for pmod value
 *
 * returns:
 *	approximate runtime to perform a pmod loop
 *
 * NOTE: This is an internal support function that is normally
 *	 not called directly from the command line.  Call the
 *	 function best_pow2() instead.
 */
define pow_loop(repeat, x, ex)
{
    local start;	/* start of execution */
    local end;		/* end of execution */
    local answer;	/* pmod value */
    local len;		/* length of each element */
    local baseb_bytes;	/* bytes in a BASEB-bit word */
    local i;

    /* firewall */
    if (!isint(repeat) || repeat < 0) {
	quit "pow_loop: 1st arg: repeat must be an integer > 0";
    }
    if (size(*x) != 5) {
	quit "pow_loop: 2nd arg matrix does not have 5 elements";
    }
    if (matdim(*x) != 1) {
	quit "pow_loop: 2nd arg matrix is not 1 dimensional";
    }
    if (matmin(*x, 1) != 0) {
	quit "pow_loop: 2nd arg matrix index range does not start with 0";
    }
    if (matmax(*x, 1) != 4) {
	quit "pow_loop: 2nd arg matrix index range does not end with 4";
    }
    baseb_bytes = config("baseb") / 8;
    len = sizeof((*x)[0]) / baseb_bytes;
    for (i=1; i < 4; ++i) {
	if ((sizeof((*x)[i]) / baseb_bytes) != len) {
	    quit "pow_loop: 2nd arg matrix elements are not of equal BASEB-bit word length";
	}
    }
    if (!isint(ex) || ex < 3) {
	quit" pow_loop: 3rd arg ex is not an integer > 2";
    }

    /* pmod pairwise, all sets of a given length */
    start = usertime();
    for (i=0; i < repeat; ++i) {

	if (len == 1) {
	    /* we use len == 1 to test this tester loop overhead */
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    /**/
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    /**/
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    /**/
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    /**/
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    /**/
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	    answer = pmod(0,0,0); answer = pmod(0,0,0);
	} else {
	    answer = pmod((*x)[0], ex, (*x)[1]);
	    answer = pmod((*x)[0], ex, (*x)[2]);
	    answer = pmod((*x)[0], ex, (*x)[3]);
	    answer = pmod((*x)[0], ex, (*x)[4]);
	    /**/
	    answer = pmod((*x)[1], ex, (*x)[0]);
	    answer = pmod((*x)[1], ex, (*x)[2]);
	    answer = pmod((*x)[1], ex, (*x)[3]);
	    answer = pmod((*x)[1], ex, (*x)[4]);
	    /**/
	    answer = pmod((*x)[2], ex, (*x)[0]);
	    answer = pmod((*x)[2], ex, (*x)[1]);
	    answer = pmod((*x)[2], ex, (*x)[3]);
	    answer = pmod((*x)[2], ex, (*x)[4]);
	    /**/
	    answer = pmod((*x)[3], ex, (*x)[0]);
	    answer = pmod((*x)[3], ex, (*x)[1]);
	    answer = pmod((*x)[3], ex, (*x)[2]);
	    answer = pmod((*x)[3], ex, (*x)[4]);
	    /**/
	    answer = pmod((*x)[4], ex, (*x)[0]);
	    answer = pmod((*x)[4], ex, (*x)[1]);
	    answer = pmod((*x)[4], ex, (*x)[2]);
	    answer = pmod((*x)[4], ex, (*x)[3]);
	}
    }

    /*
     * return duration
     */
    end = usertime();
    return end-start;
}


/*
 * pow_ratio - ratio of rates of 1st and 2nd pmod algorithms
 *
 * given:
 *	len	length in BASEB-bit words to pmod
 *
 * return:
 *	ratio of (1st / 2nd) algorithm rates
 *
 * When want to determine a rate to a precision of 1 part in 1000.
 * Most systems today return CPU time to at least 10 msec precision.
 * So to get rates to that precision, we need to time loops to at
 * least 1000 times as long as the precision (10 msec * 1000)
 * which usually requires timing of loops that last 10 seconds or more.
 *
 * NOTE: This is an internal support function that is normally
 *	 not called directly from the command line.  Call the
 *	 function best_pow2() instead.
 */
define pow_ratio(len)
{
    local mat x[5];		/* array of values for pow_loop to pmod */
    local mat one[5];		/* array if single BASEB-bit values */
    local baseb;		/* calc word size in bits */
    local orig_cfg;		/* caller configuration */
    local loops;		/* number of pmod loops to time */
    local tlen;			/* time to perform some number of loops */
    local tover;		/* est of time for loop overhead */
    local alg1_rate;		/* loop rate of 1st algorithm */
    local alg2_rate;		/* loop rate of 2nd algorithm */
    local ex;			/* exponent to use in pow_loop() */
    local i;

    /*
     * firewall
     */
    if (!isint(len) || len < 2) {
	quit "pow_ratio: 1st arg: len is not an integer > 1";
    }

    /*
     * remember the caller's config state
     */
    orig_cfg = config("all");
    config("mul2", 0),;
    config("sq2", 0),;
    config("pow2", 0),;
    config("redc2", 0),;
    config("tilde", 0),;

    /*
     * setup
     */
    ex = 5;

    /*
     * initialize x, the values we will pmod
     *
     * We want these tests to be repeatable as possible, so we will seed
     * the PRNG in a deterministic way.
     */
    baseb = config("baseb");
    srand(sha1(sha1(ex, baseb, config("version"))));
    for (i=0; i < 5; ++i) {
	/* force the values to be a full len words long */
	x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
		    randbit(((len-1) * baseb) + baseb-2));
	/* single BASEB-bit values */
        one[i] = 1;
    }

    /*
     * determine the number of loops needed to test 1st alg
     */
    config("pow2", 2^31-1),;
    config("redc2", 2^31-1),;
    loops = 1/2;
    do {
	loops *= 2;
	tlen = pow_loop(loops, &x, ex);
	if (config("user_debug") > 3) {
	    printf("\t    alg1 loops %d took %.3f sec\n", loops, tlen);
	}
    } while (tlen < 1.0);

    /*
     * determine the 1st algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 8) {
	if (config("user_debug") > 1) {
	    printf("    we must expand loop test time to more than %d secs\n",
		ceil(test_time * (8 / loops)));
	}
	loops = 8;
    }
    tlen = pow_loop(loops, &x, ex);
    if (config("user_debug") > 3) {
	printf("\t    alg1 time = %.3f secs\n", tlen);
    }
    tover = pow_loop(loops, &one, ex);
    if (config("user_debug") > 3) {
	printf("\t    alg1 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
	quit "pow_ratio: overhead >= loop time";
    }
    alg1_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
	printf("\tpmod alg1 rate = %.3f loopsets/sec\n", alg1_rate);
    }
    if (alg1_rate <= 0.0) {
	quit "pow_ratio: alg1 rate was <= 0.0";
    }

    /*
     * determine the number of loops needed to test 1st alg
     */
    config("pow2", 2),;
    config("redc2", 2^31-1),;
    loops = 1/2;
    do {
	loops *= 2;
	tlen = pow_loop(loops, &x, ex);
	if (config("user_debug") > 3) {
	    printf("\t    alg2 loops %d took %.3f sec\n", loops, tlen);
	}
    } while (tlen < 1.0);

    /*
     * determine the 2nd algorithm rate
     */
    loops = max(1, ceil(loops * test_time / tlen));
    if (loops < 8) {
	if (config("user_debug") > 1) {
	    printf("    we must expand loop test time to more than %d secs\n",
		ceil(test_time * (8 / loops)));
	}
	loops = 8;
    }
    tlen = pow_loop(loops, &x, ex);
    if (config("user_debug") > 3) {
	printf("\t    alg2 time = %.3f secs\n", tlen);
    }
    tover = pow_loop(loops, &one, ex);
    if (config("user_debug") > 3) {
	printf("\t    alg2 overhead look %.3f secs\n", tover);
    }
    if (tlen <= tover) {
	quit "pow_ratio: overhead >= loop time";
    }
    alg2_rate = loops / (tlen - tover);
    if (config("user_debug") > 2) {
	printf("\tpmod alg2 rate = %.3f loopsets/sec\n", alg2_rate);
    }
    if (alg2_rate <= 0.0) {
	quit "pow_ratio: alg2 rate was <= 0.0";
    }

    /*
     * restore old config
     */
    config("all", orig_cfg),;

    /*
     * return alg1 / alg2 rate ratio
     */
    return (alg1_rate / alg2_rate);
}


/*
 * best_pow2 - determine the best config("pow2") parameter w/o REDC2
 *
 * NOTE: Due to precision problems with CPU measurements, it is not
 *	 unusual for the output of this function to vary slightly
 *	 from run to run.
 *
 * NOTE: This function is designed to take a long time to run.
 *	  We recommend setting:
 *
 *		config("user_debug", 2)
 *
 *	  so that yon can watch the progress of this function.
 */
define best_pow2()
{
    local ratio;	/* previously calculated alg1/alg2 ratio */
    local low;		/* low loop value tested */
    local high;		/* low loop value tested */
    local expand;	/* how fast to expand the length */
    local looped;	/* 1 ==> we have expanded lengths before */

    /*
     * setup
     */
    test_time = 15.0;
    if (config("user_debug") > 0) {
	printf("will start with loop test time of %d secs\n", test_time);
    }

    /*
     * firewall - must have a >1.02 ratio for the initial length
     *
     * We select 1.02 because of the precision of the CPU timing.  We
     * want to firt move into an area where the 1st algoritm clearly
     * dominates.
     */
    low = 4;
    high = 4;
    do {
	high *= 4;
	if (config("user_debug") > 0) {
	    printf("testing pmod alg1/alg2 ratio for len = %d\n", high);
	}
	ratio = pow_ratio(high);
	if (config("user_debug") > 1) {
	    printf("    pmod alg1/alg2 ratio = %.3f\n", ratio);
	    if (ratio > 1.0 && ratio <= 1.02) {
	    printf("    while alg1 is slightly better than alg2, it is not clearly better\n");
	    }
	}
    } while (ratio <= 1.02);
    if (config("user_debug") > 0) {
	printf("starting the pow2 search above %d\n", high);
    }

    /*
     * expand lengths until the ratio flips
     */
    looped = 0;
    do {
	/*
	 * determine the paramters of the next ratio test
	 *
	 * We will multiplicatively expand our test level until
	 * the ratio drops below 1.0.
	 *
	 * NOTE: At low lengths, the ratios seen to be very small
	 *	 so we force an expansion of 4 to speed us on our
	 *	 way to larger lengths.  At these somewhat larger
	 *	 lengths, the ratios usually don't get faster than
	 *	 1.25, so we need to expand force a more rapid
	 *	 expansion than normal.  At lengths longer than
	 *	 2k, the time to test becomes very long, so we
	 *	 want to slow down at these higher lengths.
	 */
	expand = 2;
	if (looped) {
	    low = high;
	}
	high *= expand;
	if (config("user_debug") > 1) {
	    printf("    expand the next test range by a factor of %d\n",
	    	   expand);
	}

	/*
	 * determine the alg1/alg2 test ratio for this new length
	 */
	if (high >= 2^31) {
	    quit "best_pow2: tests imply config(\"pow2\") should be >= 2^31";
	}
	if (config("user_debug") > 0) {
	    printf("testing pmod alg1/alg2 ratio for len = %d\n", high);
	}
	ratio = pow_ratio(high);
	if (config("user_debug") > 1) {
	    printf("    pmod alg1/alg2 ratio = %.3f\n", ratio);
	}
	looped = 1;
    } while (ratio >= 1.0);
    if (config("user_debug") > 0) {
	printf("alg1/alg2 ratio now < 1.0, starting binary search between %d and %d\n",
	    low, high);
    }

    /*
     * binary search between low and high, for where ratio is just under 1.0
     */
    while (low+1 < high) {

    	/* try the mid-point */
	if (config("user_debug") > 0) {
	    printf("testing pmod alg1/alg2 ratio for len = %d\n",
	    	int((low+high)/2));
	}
	ratio = pow_ratio(int((low+high)/2));
	if (config("user_debug") > 1) {
	    printf("    pmod alg1/alg2 ratio = %.3f\n", ratio);
	}

	/* bump lower range up if we went over */
	if (ratio >= 1.0) {
	    if (config("user_debug") > 2) {
	    	printf("\tmove low from %d up to %d\n",
		    low, int((low+high)/2));
	    }
	    low = int((low+high)/2);

	/* drop higher range down if we went under */
	} else {
	    if (config("user_debug") > 2) {
	    	printf("\tmove high from %d down to %d\n",
		     high, int((low+high)/2));
	    }
	    high = int((low+high)/2);
	}
    }

    /*
     * return on the suggested config("pow2") value
     */
    if (config("user_debug") > 0) {
	printf("best value of config(\"pow2\") is %d\n",
	    (ratio >= 1.0) ? high : low);
    }
    return ((ratio >= 1.0) ? high : low);
}