calc/zmul.c

/*
 * zmul - faster than usual multiplying and squaring routines
 *
 * Copyright (C) 1999-2007,2021-2023  David I. Bell
 *
 * Calc is open software; you can redistribute it and/or modify it under
 * the terms of the version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * Calc is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU Lesser General
 * Public License for more details.
 *
 * A copy of version 2.1 of the GNU Lesser General Public License is
 * distributed with calc under the filename COPYING-LGPL.  You should have
 * received a copy with calc; if not, write to Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 *
 * Under source code control:	1991/01/09 20:01:31
 * File existed as early as:	1991
 *
 * Share and enjoy!  :-)	http://www.isthe.com/chongo/tech/comp/calc/
 */

/*
 * Faster than usual multiplying and squaring routines.
 * The algorithm used is the reasonably simple one from Knuth, volume 2,
 * section 4.3.3.  These recursive routines are of speed O(N^1.585)
 * instead of O(N^2).  The usual multiplication and (almost usual) squaring
 * algorithms are used for small numbers.  On a 386 with its compiler, the
 * two algorithms are equal in speed at about 100 decimal digits.
 */


#include "config.h"
#include "zmath.h"


#include "errtbl.h"
#include "banned.h"	/* include after system header <> includes */


STATIC HALF *tempbuf;		/* temporary buffer for multiply and square */

S_FUNC LEN domul(HALF *v1, LEN size1, HALF *v2, LEN size2, HALF *ans);
S_FUNC LEN dosquare(HALF *vp, LEN size, HALF *ans);


/*
 * Multiply two numbers using the following formula recursively:
 *	(A*S+B)*(C*S+D) = (S^2+S)*A*C + S*(A-B)*(D-C) + (S+1)*B*D
 * where S is a power of 2^16, and so multiplies by it are shifts, and
 * A,B,C,D are the left and right HALFs of the numbers to be multiplied.
 *
 * given:
 *	z1		numbers to multiply
 *	z2		numbers to multiply
 *	res		result of multiplication
 */
void
zmul(ZVALUE z1, ZVALUE z2, ZVALUE *res)
{
	LEN len;		/* size of array */

	if (ziszero(z1) || ziszero(z2)) {
		*res = _zero_;
		return;
	}
	if (zisunit(z1)) {
		zcopy(z2, res);
		res->sign = (z1.sign != z2.sign);
		return;
	}
	if (zisunit(z2)) {
		zcopy(z1, res);
		res->sign = (z1.sign != z2.sign);
		return;
	}

	/*
	 * Allocate a temporary buffer for the recursion levels to use.
	 * An array needs to be allocated large enough for all of the
	 * temporary results to fit in.	 This size is about twice the size
	 * of the largest original number, since each recursion level uses
	 * the size of its given number, and whose size is 1/2 the size of
	 * the previous level.	The sum of the infinite series is 2.
	 * Add some extra words because of rounding when dividing by 2
	 * and also because of the extra word that each multiply needs.
	 */
	len = z1.len;
	if (len < z2.len)
		len = z2.len;
	len = 2 * len + 64;
	tempbuf = zalloctemp(len);

	res->sign = (z1.sign != z2.sign);
	res->v = alloc(z1.len + z2.len + 2);
	res->len = domul(z1.v, z1.len, z2.v, z2.len, res->v);
}


/*
 * Recursive routine to multiply two numbers by splitting them up into
 * two numbers of half the size, and using the results of multiplying the
 * sub-pieces.  The result is placed in the indicated array, which must be
 * large enough for the result plus one extra word (size1 + size2 + 1).
 * Returns the actual size of the result with leading zeroes stripped.
 * This also uses a temporary array which must be twice as large as
 * one more than the size of the number at the top level recursive call.
 *
 * given:
 *	v1		first number
 *	size1		size of first number
 *	v2		second number
 *	size2		size of second number
 *	ans		location for result
 */
S_FUNC LEN
domul(HALF *v1, LEN size1, HALF *v2, LEN size2, HALF *ans)
{
	LEN shift;		/* amount numbers are shifted by */
	LEN sizeA;		/* size of left half of first number */
	LEN sizeB;		/* size of right half of first number */
	LEN sizeC;		/* size of left half of second number */
	LEN sizeD;		/* size of right half of second number */
	LEN sizeAB;		/* size of subtraction of A and B */
	LEN sizeDC;		/* size of subtraction of D and C */
	LEN sizeABDC;		/* size of product of above two results */
	LEN subsize;		/* size of difference of HALFs */
	LEN copysize;		/* size of number left to copy */
	LEN sizetotal;		/* total size of product */
	LEN len;		/* temporary length */
	HALF *baseA;		/* base of left half of first number */
	HALF *baseB;		/* base of right half of first number */
	HALF *baseC;		/* base of left half of second number */
	HALF *baseD;		/* base of right half of second number */
	HALF *baseAB;		/* base of result of subtraction of A and B */
	HALF *baseDC;		/* base of result of subtraction of D and C */
	HALF *baseABDC;		/* base of product of above two results */
	HALF *baseAC;		/* base of product of A and C */
	HALF *baseBD;		/* base of product of B and D */
	FULL carry;		/* carry digit for small multiplications */
	FULL carryACBD;		/* carry from addition of A*C and B*D */
	FULL digit;		/* single digit multiplying by */
	HALF *temp;		/* base for temporary calculations */
	bool neg;		/* whether intermediate term is negative */
	register HALF *hd, *h1=NULL, *h2=NULL;	/* for inner loops */
	SIUNION sival;		/* for addition of digits */

	/* firewall */
	if (v1 == NULL) {
		math_error("%s: v1 NULL", __func__);
		not_reached();
	}
	if (ans == NULL) {
		math_error("%s: ans NULL", __func__);
		not_reached();
	}

	/*
	 * Trim the numbers of leading zeroes and initialize the
	 * estimated size of the result.
	 */
	hd = &v1[size1 - 1];
	while ((*hd == 0) && (size1 > 1)) {
		hd--;
		size1--;
	}
	hd = &v2[size2 - 1];
	while ((*hd == 0) && (size2 > 1)) {
		hd--;
		size2--;
	}
	sizetotal = size1 + size2;

	/*
	 * First check for zero answer.
	 */
	if (((size1 == 1) && (*v1 == 0)) || ((size2 == 1) && (*v2 == 0))) {
		*ans = 0;
		return 1;
	}

	/*
	 * Exchange the two numbers if necessary to make the number of
	 * digits of the first number be greater than or equal to the
	 * second number.
	 */
	if (size1 < size2) {
		len = size1; size1 = size2; size2 = len;
		hd = v1; v1 = v2; v2 = hd;
	}

	/*
	 * If the smaller number has only a few digits, then calculate
	 * the result in the normal manner in order to avoid the overhead
	 * of the recursion for small numbers.	The number of digits where
	 * the algorithm changes is settable from 2 to maxint.
	 */
	if (size2 < conf->mul2) {
		/*
		 * First clear the top part of the result, and then multiply
		 * by the lowest digit to get the first partial sum.  Later
		 * products will then add into this result.
		 */
		hd = &ans[size1];
		len = size2;
		while (len--)
			*hd++ = 0;

		digit = *v2++;
		h1 = v1;
		hd = ans;
		carry = 0;
		len = size1;
		while (len >= 4) {	/* expand the loop some */
			len -= 4;
			sival.ivalue = ((FULL) *h1++) * digit + carry;
			/* ignore Saber-C warning #112 - get ushort from uint */
			/*	  OK to ignore on name domul`sival */
			*hd++ = sival.silow;
			carry = sival.sihigh;
			sival.ivalue = ((FULL) *h1++) * digit + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
			sival.ivalue = ((FULL) *h1++) * digit + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
			sival.ivalue = ((FULL) *h1++) * digit + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}
		while (len--) {
			sival.ivalue = ((FULL) *h1++) * digit + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}
		*hd = (HALF)carry;

		/*
		 * Now multiply by the remaining digits of the second number,
		 * adding each product into the final result.
		 */
		h2 = ans;
		while (--size2 > 0) {
			digit = *v2++;
			h1 = v1;
			hd = ++h2;
			if (digit == 0)
				continue;
			carry = 0;
			len = size1;
			while (len >= 4) {	/* expand the loop some */
				len -= 4;
				sival.ivalue = ((FULL) *h1++) * digit
					+ ((FULL) *hd) + carry;
				*hd++ = sival.silow;
				carry = sival.sihigh;
				sival.ivalue = ((FULL) *h1++) * digit
					+ ((FULL) *hd) + carry;
				*hd++ = sival.silow;
				carry = sival.sihigh;
				sival.ivalue = ((FULL) *h1++) * digit
					+ ((FULL) *hd) + carry;
				*hd++ = sival.silow;
				carry = sival.sihigh;
				sival.ivalue = ((FULL) *h1++) * digit
					+ ((FULL) *hd) + carry;
				*hd++ = sival.silow;
				carry = sival.sihigh;
			}
			while (len--) {
				sival.ivalue = ((FULL) *h1++) * digit
					+ ((FULL) *hd) + carry;
				*hd++ = sival.silow;
				carry = sival.sihigh;
			}
			while (carry) {
				sival.ivalue = ((FULL) *hd) + carry;
				*hd++ = sival.silow;
				carry = sival.sihigh;
			}
		}

		/*
		 * Now return the true size of the number.
		 */
		len = sizetotal;
		hd = &ans[len - 1];
		while ((*hd == 0) && (len > 1)) {
			hd--;
			len--;
		}
		return len;
	}

	/*
	 * Need to multiply by a large number.
	 * Allocate temporary space for calculations, and calculate the
	 * value for the shift.	 The shift value is 1/2 the size of the
	 * larger (first) number (rounded up).	The amount of temporary
	 * space needed is twice the size of the shift, plus one more word
	 * for the multiply to use.
	 */
	shift = (size1 + 1) / 2;
	temp = tempbuf;
	tempbuf += (2 * shift) + 1;

	/*
	 * Determine the sizes and locations of all the numbers.
	 * The value of sizeC can be negative, and this is checked later.
	 * The value of sizeD is limited by the full size of the number.
	 */
	baseA = v1 + shift;
	baseB = v1;
	baseC = v2 + ((shift <= size2)	? shift : size2);
	baseD = v2;
	baseAB = ans;
	baseDC = ans + shift;
	baseAC = ans + shift * 2;
	baseBD = ans;

	sizeA = size1 - shift;
	sizeC = size2 - shift;

	sizeB = shift;
	hd = &baseB[shift - 1];
	while ((*hd == 0) && (sizeB > 1)) {
		hd--;
		sizeB--;
	}

	sizeD = shift;
	if (sizeD > size2)
		sizeD = size2;
	hd = &baseD[sizeD - 1];
	while ((*hd == 0) && (sizeD > 1)) {
		hd--;
		sizeD--;
	}

	/*
	 * If the smaller number has a high half of zero, then calculate
	 * the result by breaking up the first number into two numbers
	 * and combining the results using the obvious formula:
	 *	(A*S+B) * D = (A*D)*S + B*D
	 */
	if (sizeC <= 0) {
		len = domul(baseB, sizeB, baseD, sizeD, ans);
		hd = &ans[len];
		len = sizetotal - len;
		while (len--)
			*hd++ = 0;

		/*
		 * Add the second number into the first number, shifted
		 * over at the correct position.
		 */
		len = domul(baseA, sizeA, baseD, sizeD, temp);
		h1 = temp;
		hd = ans + shift;
		carry = 0;
		while (len--) {
			sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}
		while (carry) {
			sival.ivalue = ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}

		/*
		 * Determine the final size of the number and return it.
		 */
		len = sizetotal;
		hd = &ans[len - 1];
		while ((*hd == 0) && (len > 1)) {
			hd--;
			len--;
		}
		tempbuf = temp;
		return len;
	}

	/*
	 * Now we know that the high HALFs of the numbers are nonzero,
	 * so we can use the complete formula.
	 *	(A*S+B)*(C*S+D) = (S^2+S)*A*C + S*(A-B)*(D-C) + (S+1)*B*D.
	 * The steps are done in the following order:
	 *	A-B
	 *	D-C
	 *	(A-B)*(D-C)
	 *	S^2*A*C + B*D
	 *	(S^2+S)*A*C + (S+1)*B*D				(*)
	 *	(S^2+S)*A*C + S*(A-B)*(D-C) + (S+1)*B*D
	 *
	 * Note: step (*) above can produce a result which is larger than
	 * the final product will be, and this is where the extra word
	 * needed in the product comes from.  After the final subtraction is
	 * done, the result fits in the expected size.	Using the extra word
	 * is easier than suppressing the carries and borrows everywhere.
	 *
	 * Begin by forming the product (A-B)*(D-C) into a temporary
	 * location that we save until the final step.	Do each subtraction
	 * at positions 0 and S.  Be very careful about the relative sizes
	 * of the numbers since this result can be negative.  For the first
	 * step calculate the absolute difference of A and B into a temporary
	 * location at position 0 of the result.  Negate the sign if A is
	 * smaller than B.
	 */
	neg = false;
	if (sizeA == sizeB) {
		len = sizeA;
		h1 = &baseA[len - 1];
		h2 = &baseB[len - 1];
		while ((len > 1) && (*h1 == *h2)) {
			len--;
			h1--;
			h2--;
		}
	}
	if ((sizeA > sizeB) || ((sizeA == sizeB) && h1 && h2 && (*h1 > *h2))) {
		h1 = baseA;
		h2 = baseB;
		sizeAB = sizeA;
		subsize = sizeB;
	} else {
		neg = !neg;
		h1 = baseB;
		h2 = baseA;
		sizeAB = sizeB;
		subsize = sizeA;
	}
	copysize = sizeAB - subsize;

	hd = baseAB;
	carry = 0;
	while (subsize--) {
		sival.ivalue = BASE1 - ((FULL) *h1++) + ((FULL) *h2++) + carry;
		*hd++ = (HALF)(BASE1 - sival.silow);
		carry = sival.sihigh;
	}
	while (copysize--) {
		sival.ivalue = (BASE1 - ((FULL) *h1++)) + carry;
		*hd++ = (HALF)(BASE1 - sival.silow);
		carry = sival.sihigh;
	}

	hd = &baseAB[sizeAB - 1];
	while ((*hd == 0) && (sizeAB > 1)) {
		hd--;
		sizeAB--;
	}

	/*
	 * This completes the calculation of abs(A-B).	For the next step
	 * calculate the absolute difference of D and C into a temporary
	 * location at position S of the result.  Negate the sign if C is
	 * larger than D.
	 */
	if (sizeC == sizeD) {
		len = sizeC;
		h1 = &baseC[len - 1];
		h2 = &baseD[len - 1];
		while ((len > 1) && (*h1 == *h2)) {
			len--;
			h1--;
			h2--;
		}
	}
	if ((sizeC > sizeD) || ((sizeC == sizeD) && (*h1 > *h2))) {
		neg = !neg;
		h1 = baseC;
		h2 = baseD;
		sizeDC = sizeC;
		subsize = sizeD;
	} else {
		h1 = baseD;
		h2 = baseC;
		sizeDC = sizeD;
		subsize = sizeC;
	}
	copysize = sizeDC - subsize;

	hd = baseDC;
	carry = 0;
	while (subsize--) {
		sival.ivalue = BASE1 - ((FULL) *h1++) + ((FULL) *h2++) + carry;
		*hd++ = (HALF)(BASE1 - sival.silow);
		carry = sival.sihigh;
	}
	while (copysize--) {
		sival.ivalue = (BASE1 - ((FULL) *h1++)) + carry;
		*hd++ = (HALF)(BASE1 - sival.silow);
		carry = sival.sihigh;
	}
	hd = &baseDC[sizeDC - 1];
	while ((*hd == 0) && (sizeDC > 1)) {
		hd--;
		sizeDC--;
	}

	/*
	 * This completes the calculation of abs(D-C).	Now multiply
	 * together abs(A-B) and abs(D-C) into a temporary location,
	 * which is preserved until the final steps.
	 */
	baseABDC = temp;
	sizeABDC = domul(baseAB, sizeAB, baseDC, sizeDC, baseABDC);

	/*
	 * Now calculate B*D and A*C into one of their two final locations.
	 * Make sure the high order digits of the products are zeroed since
	 * this initializes the final result.  Be careful about this zeroing
	 * since the size of the high order words might be smaller than
	 * the shift size.  Do B*D first since the multiplies use one more
	 * word than the size of the product.  Also zero the final extra
	 * word in the result for possible carries to use.
	 */
	len = domul(baseB, sizeB, baseD, sizeD, baseBD);
	hd = &baseBD[len];
	len = shift * 2 - len;
	while (len--)
		*hd++ = 0;

	len = domul(baseA, sizeA, baseC, sizeC, baseAC);
	hd = &baseAC[len];
	len = sizetotal - shift * 2 - len + 1;
	while (len--)
		*hd++ = 0;

	/*
	 * Now add in A*C and B*D into themselves at the other shifted
	 * position that they need.  This addition is tricky in order to
	 * make sure that the two additions cannot interfere with each other.
	 * Therefore we first add in the top half of B*D and the lower half
	 * of A*C.  The sources and destinations of these two additions
	 * overlap, and so the same answer results from the two additions,
	 * thus only two pointers suffice for both additions.  Keep the
	 * final carry from these additions for later use since we cannot
	 * afford to change the top half of A*C yet.
	 */
	h1 = baseBD + shift;
	h2 = baseAC;
	carryACBD = 0;
	len = shift;
	while (len--) {
		sival.ivalue = ((FULL) *h1) + ((FULL) *h2) + carryACBD;
		*h1++ = sival.silow;
		*h2++ = sival.silow;
		carryACBD = sival.sihigh;
	}

	/*
	 * Now add in the bottom half of B*D and the top half of A*C.
	 * These additions are straightforward, except that A*C should
	 * be done first because of possible carries from B*D, and the
	 * top half of A*C might not exist.  Add in one of the carries
	 * from the previous addition while we are at it.
	 */
	h1 = baseAC + shift;
	hd = baseAC;
	carry = carryACBD;
	len = sizetotal - 3 * shift;
	while (len--) {
		sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}
	while (carry) {
		sival.ivalue = ((FULL) *hd) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}

	h1 = baseBD;
	hd = baseBD + shift;
	carry = 0;
	len = shift;
	while (len--) {
		sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}
	while (carry) {
		sival.ivalue = ((FULL) *hd) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}

	/*
	 * Now finally add in the other delayed carry from the
	 * above addition.
	 */
	hd = baseAC + shift;
	while (carryACBD) {
		sival.ivalue = ((FULL) *hd) + carryACBD;
		*hd++ = sival.silow;
		carryACBD = sival.sihigh;
	}

	/*
	 * Now finally add or subtract (A-B)*(D-C) into the final result at
	 * the correct position (S), according to whether it is positive or
	 * negative.  When subtracting, the answer cannot go negative.
	 */
	h1 = baseABDC;
	hd = ans + shift;
	carry = 0;
	len = sizeABDC;
	if (neg) {
		while (len--) {
			sival.ivalue = BASE1 - ((FULL) *hd) +
				((FULL) *h1++) + carry;
			*hd++ = (HALF)(BASE1 - sival.silow);
			carry = sival.sihigh;
		}
		while (carry) {
			sival.ivalue = BASE1 - ((FULL) *hd) + carry;
			*hd++ = (HALF)(BASE1 - sival.silow);
			carry = sival.sihigh;
		}
	} else {
		while (len--) {
			sival.ivalue = ((FULL) *h1++) + ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}
		while (carry) {
			sival.ivalue = ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}
	}

	/*
	 * Finally determine the size of the final result and return that.
	 */
	len = sizetotal;
	hd = &ans[len - 1];
	while ((*hd == 0) && (len > 1)) {
		hd--;
		len--;
	}
	tempbuf = temp;
	return len;
}


/*
 * Square a number by using the following formula recursively:
 *	(A*S+B)^2 = (S^2+S)*A^2 + (S+1)*B^2 - S*(A-B)^2
 * where S is a power of 2^16, and so multiplies by it are shifts,
 * and A and B are the left and right HALFs of the number to square.
 */
void
zsquare(ZVALUE z, ZVALUE *res)
{
	LEN len;

	/* firewall */
	if (res == NULL) {
		math_error("%s: res NULL", __func__);
		not_reached();
	}

	if (ziszero(z)) {
		*res = _zero_;
		return;
	}
	if (zisunit(z)) {
		*res = _one_;
		return;
	}

	/*
	 * Allocate a temporary array if necessary for the recursion to use.
	 * The array needs to be allocated large enough for all of the
	 * temporary results to fit in.	 This size is about 3 times the
	 * size of the original number, since each recursion level uses 3/2
	 * of the size of its given number, and whose size is 1/2 the size
	 * of the previous level.  The sum of the infinite series is 3.
	 * Allocate some extra words for rounding up the sizes.
	 */
	len = 3 * z.len + 32;
	tempbuf = zalloctemp(len);

	res->sign = 0;
	res->v = alloc((z.len+2) * 2);
	/*
	 * Without the memset below, Purify reports that dosquare()
	 *	 will read uninitialized memory at the dosquare() line below
	 *	 the comment:
	 *
	 *		uninitialized memory read (see zsquare)
	 *
	 * This problem occurs during regression test #622 and may
	 * be duplicated by executing:
	 *
	 *	config("sq2", 2);
	 *	0xffff0000ffffffff00000000ffff0000000000000000ffff^2;
	 */
	memset((char *)res->v, 0, ((z.len+2) * 2)*sizeof(HALF));
	res->len = dosquare(z.v, z.len, res->v);
}


/*
 * Recursive routine to square a number by splitting it up into two numbers
 * of half the size, and using the results of squaring the sub-pieces.
 * The result is placed in the indicated array, which must be large
 * enough for the result (size * 2).  Returns the size of the result.
 * This uses a temporary array which must be 3 times as large as the
 * size of the number at the top level recursive call.
 *
 * given:
 *	vp		value to be squared
 *	size		length of value to square
 *	ans		location for result
 */
S_FUNC LEN
dosquare(HALF *vp, LEN size, HALF *ans)
{
	LEN shift;		/* amount numbers are shifted by */
	LEN sizeA;		/* size of left half of number to square */
	LEN sizeB;		/* size of right half of number to square */
	LEN sizeAA;		/* size of square of left half */
	LEN sizeBB;		/* size of square of right half */
	LEN sizeAABB;		/* size of sum of squares of A and B */
	LEN sizeAB;		/* size of difference of A and B */
	LEN sizeABAB;		/* size of square of difference of A and B */
	LEN subsize;		/* size of difference of HALFs */
	LEN copysize;		/* size of number left to copy */
	LEN sumsize;		/* size of sum */
	LEN sizetotal;		/* total size of square */
	LEN len;		/* temporary length */
	LEN len1;		/* another temporary length */
	FULL carry;		/* carry digit for small multiplications */
	FULL digit;		/* single digit multiplying by */
	HALF *temp;		/* base for temporary calculations */
	HALF *baseA;		/* base of left half of number */
	HALF *baseB;		/* base of right half of number */
	HALF *baseAA;		/* base of square of left half of number */
	HALF *baseBB;		/* base of square of right half of number */
	HALF *baseAABB;		/* base of sum of squares of A and B */
	HALF *baseAB;		/* base of difference of A and B */
	HALF *baseABAB;		/* base of square of difference of A and B */
	register HALF *hd, *h1, *h2, *h3;	/* for inner loops */
	SIUNION sival;		/* for addition of digits */

	/* firewall */
	if (vp == NULL) {
		math_error("%s: vp NULL", __func__);
		not_reached();
	}
	if (ans == NULL) {
		math_error("%s: ans NULL", __func__);
		not_reached();
	}

	/*
	 * First trim the number of leading zeroes.
	 */
	hd = &vp[size - 1];
	while ((*hd == 0) && (size > 1)) {
		size--;
		hd--;
	}
	sizetotal = size + size;

	/*
	 * If the number has only a small number of digits, then use the
	 * (almost) normal multiplication method.  Multiply each halfword
	 * only by those halfwords further on in the number.  Missed terms
	 * will then be the same pairs of products repeated, and the squares
	 * of each halfword.  The first case is handled by doubling the
	 * result.  The second case is handled explicitly.  The number of
	 * digits where the algorithm changes is settable from 2 to maxint.
	 */
	if (size < conf->sq2) {
		hd = ans;
		len = sizetotal;
		while (len--)
			*hd++ = 0;

		h2 = vp;
		hd = ans + 1;
		for (len = size; len--; hd += 2) {
			digit = (FULL) *h2++;
			if (digit == 0)
				continue;
			h3 = h2;
			h1 = hd;
			carry = 0;
			len1 = len;
			while (len1 >= 4) {	/* expand the loop some */
				len1 -= 4;
				sival.ivalue = (digit * ((FULL) *h3++))
					+ ((FULL) *h1) + carry;
				*h1++ = sival.silow;
				sival.ivalue = (digit * ((FULL) *h3++))
					+ ((FULL) *h1) + ((FULL) sival.sihigh);
				*h1++ = sival.silow;
				sival.ivalue = (digit * ((FULL) *h3++))
					+ ((FULL) *h1) + ((FULL) sival.sihigh);
				*h1++ = sival.silow;
				sival.ivalue = (digit * ((FULL) *h3++))
					+ ((FULL) *h1) + ((FULL) sival.sihigh);
				*h1++ = sival.silow;
				carry = sival.sihigh;
			}
			while (len1--) {
				sival.ivalue = (digit * ((FULL) *h3++))
					+ ((FULL) *h1) + carry;
				*h1++ = sival.silow;
				carry = sival.sihigh;
			}
			while (carry) {
				sival.ivalue = ((FULL) *h1) + carry;
				*h1++ = sival.silow;
				carry = sival.sihigh;
			}
		}

		/*
		 * Now double the result.
		 * There is no final carry to worry about because we
		 * handle all digits of the result which must fit.
		 */
		carry = 0;
		hd = ans;
		len = sizetotal;
		while (len--) {
			digit = ((FULL) *hd);
			sival.ivalue = digit + digit + carry;
			/* ignore Saber-C warning #112 - get ushort from uint */
			/*	  OK to ignore on name dosquare`sival */
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}

		/*
		 * Now add in the squares of each halfword.
		 */
		carry = 0;
		hd = ans;
		h3 = vp;
		len = size;
		while (len--) {
			digit = ((FULL) *h3++);
			sival.ivalue = digit * digit + ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
			sival.ivalue = ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}
		while (carry) {
			sival.ivalue = ((FULL) *hd) + carry;
			*hd++ = sival.silow;
			carry = sival.sihigh;
		}

		/*
		 * Finally return the size of the result.
		 */
		len = sizetotal;
		hd = &ans[len - 1];
		while ((*hd == 0) && (len > 1)) {
			len--;
			hd--;
		}
		return len;
	}

	/*
	 * The number to be squared is large.
	 * Allocate temporary space and determine the sizes and
	 * positions of the values to be calculated.
	 */
	temp = tempbuf;
	tempbuf += (3 * (size + 1) / 2);

	sizeA = size / 2;
	sizeB = size - sizeA;
	shift = sizeB;
	baseA = vp + sizeB;
	baseB = vp;
	baseAA = &ans[shift * 2];
	baseBB = ans;
	baseAABB = temp;
	baseAB = temp;
	baseABAB = &temp[shift];

	/*
	 * Trim the second number of leading zeroes.
	 */
	hd = &baseB[sizeB - 1];
	while ((*hd == 0) && (sizeB > 1)) {
		sizeB--;
		hd--;
	}

	/*
	 * Now to proceed to calculate the result using the formula.
	 *	(A*S+B)^2 = (S^2+S)*A^2 + (S+1)*B^2 - S*(A-B)^2.
	 * The steps are done in the following order:
	 *	S^2*A^2 + B^2
	 *	A^2 + B^2
	 *	(S^2+S)*A^2 + (S+1)*B^2
	 *	(A-B)^2
	 *	(S^2+S)*A^2 + (S+1)*B^2 - S*(A-B)^2.
	 *
	 * Begin by forming the squares of two the HALFs concatenated
	 * together in the final result location.  Make sure that the
	 * highest words of the results are zero.
	 */
	sizeBB = dosquare(baseB, sizeB, baseBB);
	hd = &baseBB[sizeBB];
	len = shift * 2 - sizeBB;
	while (len--)
		*hd++ = 0;

	sizeAA = dosquare(baseA, sizeA, baseAA);
	hd = &baseAA[sizeAA];
	len = sizetotal - shift * 2 - sizeAA;
	while (len--)
		*hd++ = 0;

	/*
	 * Sum the two squares into a temporary location.
	 */
	if (sizeAA >= sizeBB) {
		h1 = baseAA;
		h2 = baseBB;
		sizeAABB = sizeAA;
		sumsize = sizeBB;
	} else {
		h1 = baseBB;
		h2 = baseAA;
		sizeAABB = sizeBB;
		sumsize = sizeAA;
	}
	copysize = sizeAABB - sumsize;

	hd = baseAABB;
	carry = 0;
	while (sumsize--) {
		sival.ivalue = ((FULL) *h1++) + ((FULL) *h2++) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}
	while (copysize--) {
		sival.ivalue = ((FULL) *h1++) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}
	if (carry) {
		*hd = (HALF)carry;
		sizeAABB++;
	}

	/*
	 * Add the sum back into the previously calculated squares
	 * shifted over to the proper location.
	 */
	h1 = baseAABB;
	hd = ans + shift;
	carry = 0;
	len = sizeAABB;
	while (len--) {
		sival.ivalue = ((FULL) *hd) + ((FULL) *h1++) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}
	while (carry) {
		/* uninitialized memory read (see zsquare) */
		sival.ivalue = ((FULL) *hd) + carry;
		*hd++ = sival.silow;
		carry = sival.sihigh;
	}

	/*
	 * Calculate the absolute value of the difference of the two HALFs
	 * into a temporary location.
	 */
	if (sizeA == sizeB) {
		len = sizeA;
		h1 = &baseA[len - 1];
		h2 = &baseB[len - 1];
		while ((len > 1) && (*h1 == *h2)) {
			len--;
			h1--;
			h2--;
		}
	}
	if ((sizeA > sizeB) || ((sizeA == sizeB) && (*h1 > *h2))) {
		h1 = baseA;
		h2 = baseB;
		sizeAB = sizeA;
		subsize = sizeB;
	} else {
		h1 = baseB;
		h2 = baseA;
		sizeAB = sizeB;
		subsize = sizeA;
	}
	copysize = sizeAB - subsize;

	hd = baseAB;
	carry = 0;
	while (subsize--) {
		sival.ivalue = BASE1 - ((FULL) *h1++) + ((FULL) *h2++) + carry;
		*hd++ = (HALF)(BASE1 - sival.silow);
		carry = sival.sihigh;
	}
	while (copysize--) {
		sival.ivalue = (BASE1 - ((FULL) *h1++)) + carry;
		*hd++ = (HALF)(BASE1 - sival.silow);
		carry = sival.sihigh;
	}

	hd = &baseAB[sizeAB - 1];
	while ((*hd == 0) && (sizeAB > 1)) {
		sizeAB--;
		hd--;
	}

	/*
	 * Now square the number into another temporary location,
	 * and subtract that from the final result.
	 */
	sizeABAB = dosquare(baseAB, sizeAB, baseABAB);

	h1 = baseABAB;
	hd = ans + shift;
	carry = 0;
	while (sizeABAB--) {
		sival.ivalue = BASE1 - ((FULL) *hd) + ((FULL) *h1++) + carry;
		*hd++ = (HALF)(BASE1 - sival.silow);
		carry = sival.sihigh;
	}
	while (carry) {
		sival.ivalue = BASE1 - ((FULL) *hd) + carry;
		*hd++ = (HALF)(BASE1 - sival.silow);
		carry = sival.sihigh;
	}

	/*
	 * Return the size of the result.
	 */
	len = sizetotal;
	hd = &ans[len - 1];
	while ((*hd == 0) && (len > 1)) {
		len--;
		hd--;
	}
	tempbuf = temp;
	return len;
}


/*
 * Return a pointer to a buffer to be used for holding a temporary number.
 * The buffer will be at least as large as the specified number of HALFs,
 * and remains valid until the next call to this routine.  The buffer cannot
 * be freed by the caller.  There is only one temporary buffer, and so as to
 * avoid possible conflicts this is only used by the lowest level routines
 * such as divide, multiply, and square.
 *
 * given:
 *	len		required number of HALFs in buffer
 */
HALF *
zalloctemp(LEN len)
{
	HALF *hp;
	STATIC LEN buflen;	/* current length of temp buffer */
	STATIC HALF *bufptr;	/* pointer to current temp buffer */

	if (len <= buflen)
		return bufptr;

	/*
	 * We need to grow the temporary buffer.
	 * First free any existing buffer, and then allocate the new one.
	 * While we are at it, make the new buffer bigger than necessary
	 * in order to reduce the number of reallocations.
	 */
	len += 100;
	if (buflen) {
		buflen = 0;
		free(bufptr);
	}
	/* don't call alloc() because _math_abort_ may not be set right */
	hp = (HALF *) malloc((len+1) * sizeof(HALF));
	if (hp == NULL) {
		math_error("No memory for temp buffer");
		not_reached();
	}
	bufptr = hp;
	buflen = len;
	return hp;
}