1 /** Fundamental operations for arbitrary-precision arithmetic
2  *
3  * These functions are for internal use only.
4  *
5  * Copyright: Copyright (C) 2008 Don Clugston.  All rights reserved.
6  * License:   BSD style: $(LICENSE)
7  * Authors:   Don Clugston
8  */
9 /* References:
10   - R.P. Brent and P. Zimmermann, "Modern Computer Arithmetic", 
11     Version 0.2, p. 26, (June 2009).
12   - C. Burkinel and J. Ziegler, "Fast Recursive Division", MPI-I-98-1-022, 
13     Max-Planck Institute fuer Informatik, (Oct 1998).
14   - G. Hanrot, M. Quercia, and P. Zimmermann, "The Middle Product Algorithm, I.",
15     INRIA 4664, (Dec 2002).
16   - M. Bodrato and A. Zanoni, "What about Toom-Cook Matrices Optimality?",
17     http://bodrato.it/papers (2006).
18   - A. Fog, "Optimizing subroutines in assembly language", 
19     www.agner.org/optimize (2008).
20   - A. Fog, "The microarchitecture of Intel and AMD CPU's",
21     www.agner.org/optimize (2008).
22   - A. Fog, "Instruction tables: Lists of instruction latencies, throughputs
23     and micro-operation breakdowns for Intel and AMD CPU's.", www.agner.org/optimize (2008).
24 */ 
25 module tango.math.internal.BiguintCore;
26 
27 //version=TangoBignumNoAsm;       /// temporal: see ticket #1878
28 
29 version(GNU){
30     // GDC is a filthy liar. It can't actually do inline asm.
31 } else version(TangoBignumNoAsm) {
32 
33 } else version(D_InlineAsm_X86) {
34     version = Naked_D_InlineAsm_X86;
35 } else version(LLVM_InlineAsm_X86) { 
36     version = Naked_D_InlineAsm_X86; 
37 }
38 
39 version(Naked_D_InlineAsm_X86) { 
40 private import tango.math.internal.BignumX86;
41 } else {
42 private import tango.math.internal.BignumNoAsm;
43 }
44 version(build){// bud/build won't link properly without this.
45     static import tango.math.internal.BignumX86;
46 }
47 
48 alias multibyteAddSub!('+') multibyteAdd;
49 alias multibyteAddSub!('-') multibyteSub;
50 
51 // private import tango.core.Cpuid;
52 static this()
53 {
54     CACHELIMIT = 8000; // tango.core.Cpuid.datacache[0].size/2;
55     FASTDIVLIMIT = 100;
56 }
57 
58 private:
59 // Limits for when to switch between algorithms.
60 const int CACHELIMIT;   // Half the size of the data cache.
61 const int FASTDIVLIMIT; // crossover to recursive division
62 
63 
64 // These constants are used by shift operations
65 static if (BigDigit.sizeof == int.sizeof) {
66     enum { LG2BIGDIGITBITS = 5, BIGDIGITSHIFTMASK = 31 };
67     alias ushort BIGHALFDIGIT;
68 } else static if (BigDigit.sizeof == long.sizeof) {
69     alias uint BIGHALFDIGIT;
70     enum { LG2BIGDIGITBITS = 6, BIGDIGITSHIFTMASK = 63 };
71 } else static assert(0, "Unsupported BigDigit size");
72 
73 const BigDigit [] ZERO = [0];
74 const BigDigit [] ONE = [1];
75 const BigDigit [] TWO = [2];
76 const BigDigit [] TEN = [10];
77 
78 public:       
79 
80 /// BigUint performs memory management and wraps the low-level calls.
81 struct BigUint {
82 private:
83     invariant() {
84         assert( data.length == 1 || data[$-1] != 0 );
85     }
86     BigDigit [] data = ZERO; 
87     static BigUint opCall(BigDigit [] x) {
88        BigUint a;
89        a.data = x;
90        return a;
91     }
92 public: // for development only, will be removed eventually
93     // Equivalent to BigUint[numbytes-$..$]
94     BigUint sliceHighestBytes(uint numbytes) {
95         BigUint x;
96         x.data = data[$ - (numbytes>>2) .. $];
97         return x;
98     }
99     // Length in uints
100     int uintLength() {
101         static if (BigDigit.sizeof == uint.sizeof) {
102             return data.length;
103         } else static if (BigDigit.sizeof == ulong.sizeof) {
104             return data.length * 2 - 
105             ((data[$-1] & 0xFFFF_FFFF_0000_0000L) ? 1 : 0);
106         }
107     }
108     int ulongLength() {
109         static if (BigDigit.sizeof == uint.sizeof) {
110             return (data.length + 1) >> 1;
111         } else static if (BigDigit.sizeof == ulong.sizeof) {
112             return data.length;
113         }
114     }
115 
116     // The value at (cast(ulong[])data)[n]
117     ulong peekUlong(int n) {
118         static if (BigDigit.sizeof == int.sizeof) {
119             if (data.length == n*2 + 1) return data[n*2];
120             version(LittleEndian) {
121                 return data[n*2] + ((cast(ulong)data[n*2 + 1]) << 32 );
122             } else {
123                 return data[n*2 + 1] + ((cast(ulong)data[n*2]) << 32 );
124             }
125         } else static if (BigDigit.sizeof == long.sizeof) {
126             return data[n];
127         }
128     }
129     uint peekUint(int n) {
130         static if (BigDigit.sizeof == int.sizeof) {
131             return data[n];
132         } else {
133             ulong x = data[n >> 1];
134             return (n & 1) ? cast(uint)(x >> 32) : cast(uint)x;
135         }
136     }
137 public:
138     ///
139     void opAssign(ulong u) {
140         if (u == 0) data = ZERO;
141         else if (u == 1) data = ONE;
142         else if (u == 2) data = TWO;
143         else if (u == 10) data = TEN;
144         else {
145             uint ulo = cast(uint)(u & 0xFFFF_FFFF);
146             uint uhi = cast(uint)(u >> 32);
147             if (uhi==0) {
148               data = new BigDigit[1];
149               data[0] = ulo;
150             } else {
151               data = new BigDigit[2];
152               data[0] = ulo;
153               data[1] = uhi;
154             }
155         }
156     }
157     
158     void opAssign(BigUint a)
159     {
160 		data = a.data;
161 	}
162     
163 ///
164 int opCmp(BigUint y)
165 {
166     if (data.length != y.data.length) {
167         return (data.length > y.data.length) ?  1 : -1;
168     }
169     uint k = highestDifferentDigit(data, y.data);
170     if (data[k] == y.data[k]) return 0;
171     return data[k] > y.data[k] ? 1 : -1;
172 }
173 
174 ///
175 int opCmp(ulong y)
176 {
177     if (data.length>2) return 1;
178     uint ylo = cast(uint)(y & 0xFFFF_FFFF);
179     uint yhi = cast(uint)(y >> 32);
180     if (data.length == 2 && data[1] != yhi) {
181         return data[1] > yhi ? 1: -1;
182     }
183     if (data[0] == ylo) return 0;
184     return data[0] > ylo ? 1: -1;
185 }
186 
187 const bool opEquals(const ref BigUint y) {
188        return y.data[] == data[];
189 }
190 
191 const bool opEquals(ulong y) {
192     if (data.length>2) return 0;
193     uint ylo = cast(uint)(y & 0xFFFF_FFFF);
194     uint yhi = cast(uint)(y >> 32);
195     if (data.length==2 && data[1]!=yhi) return 0;
196     if (data.length==1 && yhi!=0) return 0;
197     return (data[0] == ylo);
198 }
199 
200 
201 bool isZero() { return data.length == 1 && data[0] == 0; }
202 
203 int numBytes() {
204     return data.length * BigDigit.sizeof;
205 }
206 
207 // the extra bytes are added to the start of the string
208 char [] toDecimalString(int frontExtraBytes)
209 {
210     uint predictlength = 20+20*(data.length/2); // just over 19
211     char [] buff = new char[frontExtraBytes + predictlength];
212     int sofar = biguintToDecimal(buff, data.dup);       
213     return buff[sofar-frontExtraBytes..$];
214 }
215 
216 /** Convert to a hex string, printing a minimum number of digits 'minPadding',
217  *  allocating an additional 'frontExtraBytes' at the start of the string.
218  *  Padding is done with padChar, which may be '0' or ' '.
219  *  'separator' is a digit separation character. If non-zero, it is inserted
220  *  between every 8 digits.
221  *  Separator characters do not contribute to the minPadding.
222  */
223 char [] toHexString(int frontExtraBytes, char separator = 0, int minPadding=0, char padChar = '0')
224 {
225     // Calculate number of extra padding bytes
226     size_t extraPad = (minPadding > data.length * 2 * BigDigit.sizeof) 
227         ? minPadding - data.length * 2 * BigDigit.sizeof : 0;
228 
229     // Length not including separator bytes                
230     size_t lenBytes = data.length * 2 * BigDigit.sizeof;
231 
232     // Calculate number of separator bytes
233     size_t mainSeparatorBytes = separator ? (lenBytes  / 8) - 1 : 0;
234     size_t totalSeparatorBytes = separator ? ((extraPad + lenBytes + 7) / 8) - 1: 0;
235 
236     char [] buff = new char[lenBytes + extraPad + totalSeparatorBytes + frontExtraBytes];
237     biguintToHex(buff[$ - lenBytes - mainSeparatorBytes .. $], data, separator);
238     if (extraPad > 0) {
239         if (separator) {
240             size_t start = frontExtraBytes; // first index to pad
241             if (extraPad &7) {
242                 // Do 1 to 7 extra zeros.
243                 buff[frontExtraBytes .. frontExtraBytes + (extraPad & 7)] = padChar;
244                 buff[frontExtraBytes + (extraPad & 7)] = (padChar == ' ' ? ' ' : separator);
245                 start += (extraPad & 7) + 1;
246             }
247             for (int i=0; i< (extraPad >> 3); ++i) {
248                 buff[start .. start + 8] = padChar;
249                 buff[start + 8] = (padChar == ' ' ? ' ' : separator);
250                 start += 9;
251             }
252         } else {
253             buff[frontExtraBytes .. frontExtraBytes + extraPad]=padChar;
254         }
255     }
256     int z = frontExtraBytes;
257     if (lenBytes > minPadding) {
258         // Strip leading zeros.
259         int maxStrip = lenBytes - minPadding;
260         while (z< buff.length-1 && (buff[z]=='0' || buff[z]==padChar) && maxStrip>0) {
261             ++z; --maxStrip;
262         }
263     }
264     if (padChar!='0') {
265         // Convert leading zeros into padChars.
266         for (size_t k= z; k< buff.length-1 && (buff[k]=='0' || buff[k]==padChar); ++k) {
267             if (buff[k]=='0') buff[k]=padChar;
268         }
269     }
270     return buff[z-frontExtraBytes..$];
271 }
272 
273 // return false if invalid character found
274 bool fromHexString(const(char []) s)
275 {
276     //Strip leading zeros
277     int firstNonZero = 0;    
278     while ((firstNonZero < s.length - 1) && 
279         (s[firstNonZero]=='0' || s[firstNonZero]=='_')) {
280             ++firstNonZero;
281     }    
282     int len = (s.length - firstNonZero + 15)/4;
283     data = new BigDigit[len+1];
284     uint part = 0;
285     uint sofar = 0;
286     uint partcount = 0;
287     assert(s.length>0);
288     for (int i = s.length - 1; i>=firstNonZero; --i) {
289         assert(i>=0);
290         char c = s[i];
291         if (s[i]=='_') continue;
292         uint x = (c>='0' && c<='9') ? c - '0' 
293                : (c>='A' && c<='F') ? c - 'A' + 10 
294                : (c>='a' && c<='f') ? c - 'a' + 10
295                : 100;
296         if (x==100) return false;
297         part >>= 4;
298         part |= (x<<(32-4));
299         ++partcount;
300         if (partcount==8) {
301             data[sofar] = part;
302             ++sofar;
303             partcount = 0;
304             part = 0;
305         }
306     }
307     if (part) {
308         for ( ; partcount != 8; ++partcount) part >>= 4;
309         data[sofar] = part;
310         ++sofar;
311     }
312     if (sofar == 0) data = ZERO;
313     else data = data[0..sofar];
314     return true;
315 }
316 
317 // return true if OK; false if erroneous characters found
318 bool fromDecimalString(const(char []) s)
319 {
320     //Strip leading zeros
321     int firstNonZero = 0;    
322     while ((firstNonZero < s.length - 1) && 
323         (s[firstNonZero]=='0' || s[firstNonZero]=='_')) {
324             ++firstNonZero;
325     }
326     if (firstNonZero == s.length - 1 && s.length > 1) {
327         data = ZERO;
328         return true;
329     }
330     uint predictlength = (18*2 + 2*(s.length-firstNonZero)) / 19;
331     data = new BigDigit[predictlength];
332     uint hi = biguintFromDecimal(data, s[firstNonZero..$]);
333     data.length = hi;
334     return true;
335 }
336 
337 ////////////////////////
338 //
339 // All of these member functions create a new BigUint.
340 
341 // return x >> y
342 BigUint opShr(ulong y)
343 {
344     assert(y>0);
345     uint bits = cast(uint)y & BIGDIGITSHIFTMASK;
346     if ((y>>LG2BIGDIGITBITS) >= data.length) return BigUint(ZERO);
347     uint words = cast(uint)(y >> LG2BIGDIGITBITS);
348     if (bits==0) {
349         return BigUint(data[words..$]);
350     } else {
351         uint [] result = new BigDigit[data.length - words];
352         multibyteShr(result, data[words..$], bits);
353         if (result.length>1 && result[$-1]==0) return BigUint(result[0..$-1]);
354         else return BigUint(result);
355     }
356 }
357 
358 // return x << y
359 BigUint opShl(ulong y)
360 {
361     assert(y>0);
362     if (isZero()) return this;
363     uint bits = cast(uint)y & BIGDIGITSHIFTMASK;
364     assert ((y>>LG2BIGDIGITBITS) < cast(ulong)(uint.max));
365     uint words = cast(uint)(y >> LG2BIGDIGITBITS);
366     BigDigit [] result = new BigDigit[data.length + words+1];
367     result[0..words] = 0;
368     if (bits==0) {
369         result[words..words+data.length] = data[];
370         return BigUint(result[0..words+data.length]);
371     } else {
372         uint c = multibyteShl(result[words..words+data.length], data, bits);
373         if (c==0) return BigUint(result[0..words+data.length]);
374         result[$-1] = c;
375         return BigUint(result);
376     }
377 }
378 
379 // If wantSub is false, return x+y, leaving sign unchanged
380 // If wantSub is true, return abs(x-y), negating sign if x<y
381 static BigUint addOrSubInt(BigUint x, ulong y, bool wantSub, bool *sign) {
382     BigUint r;
383     if (wantSub) { // perform a subtraction
384         if (x.data.length > 2) {
385             r.data = subInt(x.data, y);                
386         } else { // could change sign!
387             ulong xx = x.data[0];
388             if (x.data.length > 1) xx+= (cast(ulong)x.data[1]) << 32;
389             ulong d;
390             if (xx <= y) {
391                 d = y - xx;
392                 *sign = !*sign;
393             } else {
394                 d = xx - y;
395             }
396             if (d==0) {
397                 r = 0;
398                 return r;
399             }
400             r.data = new BigDigit[ d > uint.max ? 2: 1];
401             r.data[0] = cast(uint)(d & 0xFFFF_FFFF);
402             if (d > uint.max) r.data[1] = cast(uint)(d>>32);
403         }
404     } else {
405         r.data = addInt(x.data, y);
406     }
407     return r;
408 }
409 
410 // If wantSub is false, return x + y, leaving sign unchanged.
411 // If wantSub is true, return abs(x - y), negating sign if x<y
412 static BigUint addOrSub(BigUint x, BigUint y, bool wantSub, bool *sign) {
413     BigUint r;
414     if (wantSub) { // perform a subtraction
415         r.data = sub(x.data, y.data, sign);
416         if (r.isZero()) {
417             *sign = false;
418         }
419     } else {
420         r.data = add(x.data, y.data);
421     }
422     return r;
423 }
424 
425 
426 //  return x*y.
427 //  y must not be zero.
428 static BigUint mulInt(BigUint x, ulong y)
429 {
430     if (y==0 || x == 0) return BigUint(ZERO);
431     uint hi = cast(uint)(y >>> 32);
432     uint lo = cast(uint)(y & 0xFFFF_FFFF);
433     uint [] result = new BigDigit[x.data.length+1+(hi!=0)];
434     result[x.data.length] = multibyteMul(result[0..x.data.length], x.data, lo, 0);
435     if (hi!=0) {
436         result[x.data.length+1] = multibyteMulAdd!('+')(result[1..x.data.length+1],
437             x.data, hi, 0);
438     }
439     return BigUint(removeLeadingZeros(result));
440 }
441 
442 /*  return x*y.
443  */
444 static BigUint mul(BigUint x, BigUint y)
445 {
446     if (y==0 || x == 0) return BigUint(ZERO);
447 
448     uint len = x.data.length + y.data.length;
449     BigDigit [] result = new BigDigit[len];
450     if (y.data.length > x.data.length) {
451         mulInternal(result, y.data, x.data);
452     } else {
453         if (x.data[]==y.data[]) squareInternal(result, x.data);
454         else mulInternal(result, x.data, y.data);
455     }
456     // the highest element could be zero, 
457     // in which case we need to reduce the length
458     return BigUint(removeLeadingZeros(result));
459 }
460 
461 // return x/y
462 static BigUint divInt(BigUint x, uint y) {
463     uint [] result = new BigDigit[x.data.length];
464     if ((y&(-y))==y) {
465         assert(y!=0, "BigUint division by zero");
466         // perfect power of 2
467         uint b = 0;
468         for (;y!=1; y>>=1) {
469             ++b;
470         }
471         multibyteShr(result, x.data, b);
472     } else {
473         result[] = x.data[];
474         uint rem = multibyteDivAssign(result, y, 0);
475     }
476     return BigUint(removeLeadingZeros(result));
477 }
478 
479 // return x%y
480 static uint modInt(BigUint x, uint y) {
481     assert(y!=0);
482     if ((y&(-y))==y) { // perfect power of 2        
483         return x.data[0]&(y-1);   
484     } else {
485         // horribly inefficient - malloc, copy, & store are unnecessary.
486         uint [] wasteful = new BigDigit[x.data.length];
487         wasteful[] = x.data[];
488         uint rem = multibyteDivAssign(wasteful, y, 0);
489         delete wasteful;
490         return rem;
491     }   
492 }
493 
494 // return x/y
495 static BigUint div(BigUint x, BigUint y)
496 {
497     if (y.data.length > x.data.length) return BigUint(ZERO);
498     if (y.data.length == 1) return divInt(x, y.data[0]);
499     BigDigit [] result = new BigDigit[x.data.length - y.data.length + 1];
500     divModInternal(result, null, x.data, y.data);
501     return BigUint(removeLeadingZeros(result));
502 }
503 
504 // return x%y
505 static BigUint mod(BigUint x, BigUint y)
506 {
507     if (y.data.length > x.data.length) return x;
508     if (y.data.length == 1) {
509         BigDigit [] result = new BigDigit[1];
510         result[0] = modInt(x, y.data[0]);
511         return BigUint(result);
512     }
513     BigDigit [] result = new BigDigit[x.data.length - y.data.length + 1];
514     BigDigit [] rem = new BigDigit[y.data.length];
515     divModInternal(result, rem, x.data, y.data);
516     return BigUint(removeLeadingZeros(rem));
517 }
518 
519 /**
520  * Return a BigUint which is x raised to the power of y.
521  * Method: Powers of 2 are removed from x, then left-to-right binary
522  * exponentiation is used.
523  * Memory allocation is minimized: at most one temporary BigUint is used.
524  */
525 static BigUint pow(BigUint x, ulong y)
526 {
527     // Deal with the degenerate cases first.
528     if (y==0) return BigUint(ONE);
529     if (y==1) return x;
530     if (x==0 || x==1) return x;
531    
532     BigUint result;
533      
534     // Simplify, step 1: Remove all powers of 2.
535     uint firstnonzero = firstNonZeroDigit(x.data);
536     
537     // See if x can now fit into a single digit.            
538     bool singledigit = ((x.data.length - firstnonzero) == 1);
539     // If true, then x0 is that digit, and we must calculate x0 ^^ y0.
540     BigDigit x0 = x.data[firstnonzero];
541     assert(x0 !=0);
542     size_t xlength = x.data.length;
543     ulong y0;
544     uint evenbits = 0; // number of even bits in the bottom of x
545     while (!(x0 & 1)) { x0 >>= 1; ++evenbits; }
546     
547     if ((x.data.length- firstnonzero == 2)) {
548         // Check for a single digit straddling a digit boundary
549         BigDigit x1 = x.data[firstnonzero+1];
550         if ((x1 >> evenbits) == 0) {
551             x0 |= (x1 << (BigDigit.sizeof * 8 - evenbits));
552             singledigit = true;
553         }
554     }
555     uint evenshiftbits = 0; // Total powers of 2 to shift by, at the end
556     
557     // Simplify, step 2: For singledigits, see if we can trivially reduce y
558     
559     BigDigit finalMultiplier = 1;
560    
561     if (singledigit) {
562         // x fits into a single digit. Raise it to the highest power we can
563         // that still fits into a single digit, then reduce the exponent accordingly.
564         // We're quite likely to have a residual multiply at the end.
565         // For example, 10^^100 = (((5^^13)^^7) * 5^^9) * 2^^100.
566         // and 5^^13 still fits into a uint.
567         evenshiftbits  = cast(uint)( (evenbits * y) & BIGDIGITSHIFTMASK);
568         if (x0 == 1) { // Perfect power of 2
569              result = 1;
570              return result<< (evenbits + firstnonzero*BigDigit.sizeof)*y;
571         } else {
572             int p = highestPowerBelowUintMax(x0);
573             if (y <= p) { // Just do it with pow               
574                 result = intpow(x0, y);
575                 if (evenshiftbits+firstnonzero == 0) return result;
576                 return result<< (evenbits + firstnonzero*BigDigit.sizeof)*y;
577             }
578             y0 = y/p;
579             finalMultiplier = intpow(x0, y - y0*p);
580             x0 = intpow(x0, p);
581         }
582         xlength = 1;
583     }
584 
585     // Check for overflow and allocate result buffer
586     // Single digit case: +1 is for final multiplier, + 1 is for spare evenbits.
587     ulong estimatelength = singledigit ? firstnonzero*y + y0*1 + 2 + ((evenbits*y) >> LG2BIGDIGITBITS) 
588         : x.data.length * y; // estimated length in BigDigits
589     // (Estimated length can overestimate by a factor of 2, if x.data.length ~ 2).
590     if (estimatelength > uint.max/(4*BigDigit.sizeof)) assert(0, "Overflow in BigInt.pow");
591     
592     // The result buffer includes space for all the trailing zeros
593     BigDigit [] resultBuffer = new BigDigit[cast(size_t)estimatelength];
594     
595     // Do all the powers of 2!
596     size_t result_start = cast(size_t)(firstnonzero*y + singledigit? ((evenbits*y) >> LG2BIGDIGITBITS) : 0);
597     resultBuffer[0..result_start] = 0;
598     BigDigit [] t1 = resultBuffer[result_start..$];
599     BigDigit [] r1;
600     
601     if (singledigit) {
602         r1 = t1[0..1];
603         r1[0] = x0;
604         y = y0;        
605     } else {
606         // It's not worth right shifting by evenbits unless we also shrink the length after each 
607         // multiply or squaring operation. That might still be worthwhile for large y.
608         r1 = t1[0..x.data.length - firstnonzero];
609         r1[0..$] = x.data[firstnonzero..$];
610     }    
611 
612     if (y>1) {    // Set r1 = r1 ^^ y.
613          
614         // The secondary buffer only needs space for the multiplication results    
615         BigDigit [] secondaryBuffer = new BigDigit[resultBuffer.length - result_start];
616         BigDigit [] t2 = secondaryBuffer;
617         BigDigit [] r2;
618     
619         int shifts = 63; // num bits in a long
620         while(!(y & 0x8000_0000_0000_0000L)) {
621             y <<= 1;
622             --shifts;
623         }
624         y <<=1;
625    
626         while(y!=0) {
627             r2 = t2[0 .. r1.length*2];
628             squareInternal(r2, r1);
629             if (y & 0x8000_0000_0000_0000L) {           
630                 r1 = t1[0 .. r2.length + xlength];
631                 if (xlength == 1) {
632                     r1[$-1] = multibyteMul(r1[0 .. $-1], r2, x0, 0);
633                 } else {
634                     mulInternal(r1, r2, x.data);
635                 }
636             } else {
637                 r1 = t1[0 .. r2.length];
638                 r1[] = r2[];
639             }
640             y <<=1;
641             shifts--;
642         }
643         while (shifts>0) {
644             r2 = t2[0 .. r1.length * 2];
645             squareInternal(r2, r1);
646             r1 = t1[0 .. r2.length];
647             r1[] = r2[];
648             --shifts;
649         }
650     }   
651 
652     if (finalMultiplier!=1) {
653         BigDigit carry = multibyteMul(r1, r1, finalMultiplier, 0);
654         if (carry) {
655             r1 = t1[0 .. r1.length + 1];
656             r1[$-1] = carry;
657         }
658     }
659     if (evenshiftbits) {
660         BigDigit carry = multibyteShl(r1, r1, evenshiftbits);
661         if (carry!=0) {
662             r1 = t1[0 .. r1.length + 1];
663             r1[$ - 1] = carry;
664         }
665     }    
666     while(r1[$ - 1]==0) {
667         r1=r1[0 .. $ - 1];
668     }
669     result.data = resultBuffer[0 .. result_start + r1.length];
670     return result;
671 }
672 
673 } // end BigUint
674 
675 
676 // Remove leading zeros from x, to restore the BigUint invariant
677 BigDigit[] removeLeadingZeros(BigDigit [] x)
678 {
679     size_t k = x.length;
680     while(k>1 && x[k - 1]==0) --k;
681     return x[0 .. k];
682 }
683 
684 debug(UnitTest) {
685 unittest {
686 // Bug 1650.
687    BigUint r = BigUint([5]);
688    BigUint t = BigUint([7]);
689    BigUint s = BigUint.mod(r, t);
690    assert(s==5);
691 }
692 }
693 
694 
695 
696 debug (UnitTest) {
697 // Pow tests
698 unittest {
699     BigUint r, s;
700     r.fromHexString("80000000_00000001".dup);
701     s = BigUint.pow(r, 5);
702     r.fromHexString("08000000_00000000_50000000_00000001_40000000_00000002_80000000".dup
703       ~ "_00000002_80000000_00000001".dup);
704     assert(s == r);
705     s = 10;
706     s = BigUint.pow(s, 39);
707     r.fromDecimalString("1000000000000000000000000000000000000000".dup);
708     assert(s == r);
709     r.fromHexString("1_E1178E81_00000000".dup);
710     s = BigUint.pow(r, 15); // Regression test: this used to overflow array bounds
711 
712 }
713 
714 // Radix conversion tests
715 unittest {   
716     BigUint r;
717     r.fromHexString("1_E1178E81_00000000".dup);
718     assert(r.toHexString(0, '_', 0) == "1_E1178E81_00000000");
719     assert(r.toHexString(0, '_', 20) == "0001_E1178E81_00000000");
720     assert(r.toHexString(0, '_', 16+8) == "00000001_E1178E81_00000000");
721     assert(r.toHexString(0, '_', 16+9) == "0_00000001_E1178E81_00000000");
722     assert(r.toHexString(0, '_', 16+8+8) ==   "00000000_00000001_E1178E81_00000000");
723     assert(r.toHexString(0, '_', 16+8+8+1) ==      "0_00000000_00000001_E1178E81_00000000");
724     assert(r.toHexString(0, '_', 16+8+8+1, ' ') == "                  1_E1178E81_00000000");
725     assert(r.toHexString(0, 0, 16+8+8+1) == "00000000000000001E1178E8100000000");
726     r = 0;
727     assert(r.toHexString(0, '_', 0) == "0");
728     assert(r.toHexString(0, '_', 7) == "0000000");
729     assert(r.toHexString(0, '_', 7, ' ') == "      0");
730     assert(r.toHexString(0, '#', 9) == "0#00000000");
731     assert(r.toHexString(0, 0, 9) == "000000000");
732     
733 }
734 }
735 
736 private:
737 
738 // works for any type
739 T intpow(T)(T x, ulong n)
740 {
741     T p;
742 
743     switch (n)
744     {
745     case 0:
746         p = 1;
747         break;
748 
749     case 1:
750         p = x;
751         break;
752 
753     case 2:
754         p = x * x;
755         break;
756 
757     default:
758         p = 1;
759         while (1){
760             if (n & 1)
761                 p *= x;
762             n >>= 1;
763             if (!n)
764                 break;
765             x *= x;
766         }
767         break;
768     }
769     return p;
770 }
771 
772 
773 //  returns the maximum power of x that will fit in a uint.
774 int highestPowerBelowUintMax(uint x)
775 {
776      assert(x>1);     
777      const ubyte [22] maxpwr = [31, 20, 15, 13, 12, 11, 10, 10, 9, 9,
778                                  8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7];
779      if (x<24) return maxpwr[x-2]; 
780      if (x<41) return 6;
781      if (x<85) return 5;
782      if (x<256) return 4;
783      if (x<1626) return 3;
784      if (x<65536) return 2;
785      return 1;
786 }
787 
788 //  returns the maximum power of x that will fit in a ulong.
789 int highestPowerBelowUlongMax(uint x)
790 {
791      assert(x>1);     
792      const ubyte [39] maxpwr = [63, 40, 31, 27, 24, 22, 21, 20, 19, 18,
793                                  17, 17, 16, 16, 15, 15, 15, 15, 14, 14,
794                                  14, 14, 13, 13, 13, 13, 13, 13, 13, 12,
795                                  12, 12, 12, 12, 12, 12, 12, 12, 12];
796      if (x<41) return maxpwr[x-2]; 
797      if (x<57) return 11;
798      if (x<85) return 10;
799      if (x<139) return 9;
800      if (x<256) return 8;
801      if (x<566) return 7;
802      if (x<1626) return 6;
803      if (x<7132) return 5;
804      if (x<65536) return 4;
805      if (x<2642246) return 3;
806      return 2;
807 } 
808 
809 version(UnitTest) {
810 int slowHighestPowerBelowUintMax(uint x)
811 {
812      int pwr = 1;
813      for (ulong q = x;x*q < cast(ulong)uint.max; ) {
814          q*=x; ++pwr;
815      } 
816      return pwr;
817 }
818 
819 unittest {
820   assert(highestPowerBelowUintMax(10)==9);
821   for (int k=82; k<88; ++k) {assert(highestPowerBelowUintMax(k)== slowHighestPowerBelowUintMax(k)); }
822 }
823 }
824 
825 
826 /*  General unsigned subtraction routine for bigints.
827  *  Sets result = x - y. If the result is negative, negative will be true.
828  */
829 BigDigit [] sub(in BigDigit[] x, in BigDigit[] y, bool *negative)
830 {
831     if (x.length == y.length) {
832         // There's a possibility of cancellation, if x and y are almost equal.
833         int last = highestDifferentDigit(x, y);
834         BigDigit [] result = new BigDigit[last+1];
835         if (x[last] < y[last]) { // we know result is negative
836             multibyteSub(result[0..last+1], y[0..last+1], x[0..last+1], 0);
837             *negative = true;
838         } else { // positive or zero result
839             multibyteSub(result[0..last+1], x[0..last+1], y[0..last+1], 0);
840             *negative = false;
841         }
842         while (result.length > 1 && result[$-1] == 0) {
843             result = result[0..$-1];
844         }
845         return result;
846     }
847     // Lengths are different
848     const (BigDigit) [] large, small;
849     if (x.length < y.length) {
850         *negative = true;
851         large = y; small = x;
852     } else {
853         *negative = false;
854         large = x; small = y;
855     }
856     
857     BigDigit [] result = new BigDigit[large.length];
858     BigDigit carry = multibyteSub(result[0..small.length], large[0..small.length], small, 0);
859     result[small.length..$] = large[small.length..$];
860     if (carry) {
861         multibyteIncrementAssign!('-')(result[small.length..$], carry);
862     }
863     while (result.length > 1 && result[$-1] == 0) {
864         result = result[0..$-1];
865     }    
866     return result;
867 }
868 
869 
870 // return a + b
871 BigDigit [] add(BigDigit[] a, BigDigit [] b) {
872     BigDigit [] x, y;
873     if (a.length<b.length) { x = b; y = a; } else { x = a; y = b; }
874     // now we know x.length > y.length
875     // create result. add 1 in case it overflows
876     BigDigit [] result = new BigDigit[x.length + 1];
877     
878     BigDigit carry = multibyteAdd(result[0..y.length], x[0..y.length], y, 0);
879     if (x.length != y.length){
880         result[y.length..$-1]= x[y.length..$];
881         carry  = multibyteIncrementAssign!('+')(result[y.length..$-1], carry);
882     }
883     if (carry) {
884         result[$-1] = carry;
885         return result;
886     } else return result[0..$-1];
887 }
888     
889 /**  return x + y
890  */
891 BigDigit [] addInt(BigDigit[] x, ulong y)
892 {
893     uint hi = cast(uint)(y >>> 32);
894     uint lo = cast(uint)(y& 0xFFFF_FFFF);
895     uint len = x.length;
896     if (x.length < 2 && hi!=0) ++len;
897     BigDigit [] result = new BigDigit[len+1];
898     result[0..x.length] = x[]; 
899     if (x.length < 2 && hi!=0) { result[1]=hi; hi=0; }	
900     uint carry = multibyteIncrementAssign!('+')(result[0..$-1], lo);
901     if (hi!=0) carry += multibyteIncrementAssign!('+')(result[1..$-1], hi);
902     if (carry) {
903         result[$-1] = carry;
904         return result;
905     } else return result[0..$-1];
906 }
907 
908 /** Return x - y.
909  *  x must be greater than y.
910  */  
911 BigDigit [] subInt(BigDigit[] x, ulong y)
912 {
913     uint hi = cast(uint)(y >>> 32);
914     uint lo = cast(uint)(y & 0xFFFF_FFFF);
915     BigDigit [] result = new BigDigit[x.length];
916     result[] = x[];
917     multibyteIncrementAssign!('-')(result[], lo);
918     if (hi) multibyteIncrementAssign!('-')(result[1..$], hi);
919     if (result[$-1]==0) return result[0..$-1];
920     else return result; 
921 }
922 
923 /**  General unsigned multiply routine for bigints.
924  *  Sets result = x * y.
925  *
926  *  The length of y must not be larger than the length of x.
927  *  Different algorithms are used, depending on the lengths of x and y.
928  *  TODO: "Modern Computer Arithmetic" suggests the OddEvenKaratsuba algorithm for the
929  *  unbalanced case. (But I doubt it would be faster in practice).
930  *  
931  */
932 void mulInternal(BigDigit[] result, in BigDigit[] x, in BigDigit[] y)
933 {
934     assert( result.length == x.length + y.length );
935     assert( y.length > 0 );
936     assert( x.length >= y.length);
937     if (y.length <= KARATSUBALIMIT) {
938         // Small multiplier, we'll just use the asm classic multiply.
939         if (y.length==1) { // Trivial case, no cache effects to worry about
940             result[x.length] = multibyteMul(result[0..x.length], x, y[0], 0);
941             return;
942         }
943         if (x.length + y.length < CACHELIMIT) return mulSimple(result, x, y);
944         
945         // If x is so big that it won't fit into the cache, we divide it into chunks            
946         // Every chunk must be greater than y.length.
947         // We make the first chunk shorter, if necessary, to ensure this.
948         
949         uint chunksize = CACHELIMIT/y.length;
950         uint residual  =  x.length % chunksize;
951         if (residual < y.length) { chunksize -= y.length; }
952         // Use schoolbook multiply.
953         mulSimple(result[0 .. chunksize + y.length], x[0..chunksize], y);
954         uint done = chunksize;        
955     
956         while (done < x.length) {            
957             // result[done .. done+ylength] already has a value.
958             chunksize = (done + (CACHELIMIT/y.length) < x.length) ? (CACHELIMIT/y.length) :  x.length - done;
959             BigDigit [KARATSUBALIMIT] partial;
960             partial[0..y.length] = result[done..done+y.length];
961             mulSimple(result[done..done+chunksize+y.length], x[done..done+chunksize], y);
962             addAssignSimple(result[done..done+chunksize + y.length], partial[0..y.length]);
963             done += chunksize;
964         }
965         return;
966     }
967     
968     uint half = (x.length >> 1) + (x.length & 1);
969     if (2*y.length*y.length <= x.length*x.length) {
970         // UNBALANCED MULTIPLY
971         // Use school multiply to cut into quasi-squares of Karatsuba-size
972         // or larger. The ratio of the two sides of the 'square' must be 
973         // between 1.414:1 and 1:1. Use Karatsuba on each chunk. 
974         //
975         // For maximum performance, we want the ratio to be as close to 
976         // 1:1 as possible. To achieve this, we can either pad x or y.
977         // The best choice depends on the modulus x%y.       
978         uint numchunks = x.length / y.length;
979         uint chunksize = y.length;
980         uint extra =  x.length % y.length;
981         uint maxchunk = chunksize + extra;
982         bool paddingY; // true = we're padding Y, false = we're padding X.
983         if (extra * extra * 2 < y.length*y.length) {
984             // The leftover bit is small enough that it should be incorporated
985             // in the existing chunks.            
986             // Make all the chunks a tiny bit bigger
987             // (We're padding y with zeros)
988             chunksize += extra / cast(double)numchunks;
989             extra = x.length - chunksize*numchunks;
990             // there will probably be a few left over.
991             // Every chunk will either have size chunksize, or chunksize+1.
992             maxchunk = chunksize + 1;
993             paddingY = true;
994             assert(chunksize + extra + chunksize *(numchunks-1) == x.length );
995         } else  {
996             // the extra bit is large enough that it's worth making a new chunk.
997             // (This means we're padding x with zeros, when doing the first one).
998             maxchunk = chunksize;
999             ++numchunks;
1000             paddingY = false;
1001             assert(extra + chunksize *(numchunks-1) == x.length );
1002         }
1003         // We make the buffer a bit bigger so we have space for the partial sums.
1004         BigDigit [] scratchbuff = new BigDigit[karatsubaRequiredBuffSize(maxchunk) + y.length];
1005         BigDigit [] partial = scratchbuff[$ - y.length .. $];
1006         uint done; // how much of X have we done so far?
1007         double residual = 0;
1008         if (paddingY) {
1009             // If the first chunk is bigger, do it first. We're padding y. 
1010           mulKaratsuba(result[0 .. y.length + chunksize + (extra > 0 ? 1 : 0 )], 
1011                         x[0 .. chunksize + (extra>0?1:0)], y, scratchbuff);
1012           done = chunksize + (extra > 0 ? 1 : 0);
1013           if (extra) --extra;
1014         } else { // We're padding X. Begin with the extra bit.
1015             mulKaratsuba(result[0 .. y.length + extra], y, x[0..extra], scratchbuff);
1016             done = extra;
1017             extra = 0;
1018         }
1019         auto basechunksize = chunksize;
1020         while (done < x.length) {
1021             chunksize = basechunksize + (extra > 0 ? 1 : 0);
1022             if (extra) --extra;
1023             partial[] = result[done .. done+y.length];
1024             mulKaratsuba(result[done .. done + y.length + chunksize], 
1025                        x[done .. done+chunksize], y, scratchbuff);
1026             addAssignSimple(result[done .. done + y.length + chunksize], partial);
1027             done += chunksize;
1028         }
1029         delete scratchbuff;
1030     } else {
1031         // Balanced. Use Karatsuba directly.
1032         BigDigit [] scratchbuff = new BigDigit[karatsubaRequiredBuffSize(x.length)];
1033         mulKaratsuba(result, x, y, scratchbuff);
1034         delete scratchbuff;
1035     }
1036 }
1037 
1038 /**  General unsigned squaring routine for BigInts.
1039  *   Sets result = x*x.
1040  *   NOTE: If the highest half-digit of x is zero, the highest digit of result will
1041  *   also be zero.
1042  */
1043 void squareInternal(BigDigit[] result, BigDigit[] x)
1044 {
1045   // TODO: Squaring is potentially half a multiply, plus add the squares of 
1046   // the diagonal elements.
1047   assert(result.length == 2*x.length);
1048   if (x.length <= KARATSUBASQUARELIMIT) {
1049       if (x.length==1) {
1050          result[1] = multibyteMul(result[0..1], x, x[0], 0);
1051          return;
1052       }
1053       return squareSimple(result, x);
1054   }
1055   // The nice thing about squaring is that it always stays balanced
1056   BigDigit [] scratchbuff = new BigDigit[karatsubaRequiredBuffSize(x.length)];
1057   squareKaratsuba(result, x, scratchbuff);
1058   delete scratchbuff;  
1059 }
1060 
1061 
1062 import tango.core.BitManip : bsr;
1063 
1064 /// if remainder is null, only calculate quotient.
1065 void divModInternal(BigDigit [] quotient, BigDigit[] remainder, BigDigit [] u, BigDigit [] v)
1066 {
1067     assert(quotient.length == u.length - v.length + 1);
1068     assert(remainder==null || remainder.length == v.length);
1069     assert(v.length > 1);
1070     assert(u.length >= v.length);
1071     
1072     // Normalize by shifting v left just enough so that
1073     // its high-order bit is on, and shift u left the
1074     // same amount. The highest bit of u will never be set.
1075    
1076     BigDigit [] vn = new BigDigit[v.length];
1077     BigDigit [] un = new BigDigit[u.length + 1];
1078     // How much to left shift v, so that its MSB is set.
1079     uint s = BIGDIGITSHIFTMASK - bsr(v[$-1]);
1080     if (s!=0) {
1081         multibyteShl(vn, v, s);        
1082         un[$-1] = multibyteShl(un[0..$-1], u, s);
1083     } else {
1084         vn[] = v[];
1085         un[0..$-1] = u[];
1086         un[$-1] = 0;
1087     }
1088     if (quotient.length<FASTDIVLIMIT) {
1089         schoolbookDivMod(quotient, un, vn);
1090     } else {
1091         fastDivMod(quotient, un, vn);        
1092     }
1093     
1094     // Unnormalize remainder, if required.
1095     if (remainder != null) {
1096         if (s == 0) remainder[] = un[0..vn.length];
1097         else multibyteShr(remainder, un[0..vn.length+1], s);
1098     }
1099     delete un;
1100     delete vn;
1101 }
1102 
1103 debug(UnitTest)
1104 {
1105 unittest {
1106     uint [] u = [0, 0xFFFF_FFFE, 0x8000_0000];
1107     uint [] v = [0xFFFF_FFFF, 0x8000_0000];
1108     uint [] q = new uint[u.length - v.length + 1];
1109     uint [] r = new uint[2];
1110     divModInternal(q, r, u, v);
1111     assert(q[]==[0xFFFF_FFFFu, 0]);
1112     assert(r[]==[0xFFFF_FFFFu, 0x7FFF_FFFF]);
1113     u = [0, 0xFFFF_FFFE, 0x8000_0001];
1114     v = [0xFFFF_FFFF, 0x8000_0000];
1115     divModInternal(q, r, u, v);
1116 }
1117 }
1118 
1119 private:
1120 // Converts a big uint to a hexadecimal string.
1121 //
1122 // Optionally, a separator character (eg, an underscore) may be added between
1123 // every 8 digits.
1124 // buff.length must be data.length*8 if separator is zero,
1125 // or data.length*9 if separator is non-zero. It will be completely filled.
1126 char [] biguintToHex(char [] buff, BigDigit [] data, char separator=0)
1127 {
1128     int x=0;
1129     for (int i=data.length - 1; i>=0; --i) {
1130         toHexZeroPadded(buff[x..x+8], data[i]);
1131         x+=8;
1132         if (separator) {
1133             if (i>0) buff[x] = separator;
1134             ++x;
1135         }
1136     }
1137     return buff;
1138 }
1139 
1140 /** Convert a big uint into a decimal string.
1141  *
1142  * Params:
1143  *  data    The biguint to be converted. Will be destroyed.
1144  *  buff    The destination buffer for the decimal string. Must be
1145  *          large enough to store the result, including leading zeros.
1146  *          Will be filled backwards, starting from buff[$-1].
1147  *
1148  * buff.length must be >= (data.length*32)/log2(10) = 9.63296 * data.length.
1149  * Returns:
1150  *    the lowest index of buff which was used.
1151  */
1152 int biguintToDecimal(char [] buff, BigDigit [] data){
1153     int sofar = buff.length;
1154     // Might be better to divide by (10^38/2^32) since that gives 38 digits for
1155     // the price of 3 divisions and a shr; this version only gives 27 digits
1156     // for 3 divisions.
1157     while(data.length>1) {
1158         uint rem = multibyteDivAssign(data, 10_0000_0000, 0);
1159         itoaZeroPadded(buff[sofar-9 .. sofar], rem);
1160         sofar -= 9;
1161         if (data[$-1]==0 && data.length>1) {
1162             data.length = data.length - 1;
1163         }
1164     }
1165     itoaZeroPadded(buff[sofar-10 .. sofar], data[0]);
1166     sofar -= 10;
1167     // and strip off the leading zeros
1168     while(sofar!= buff.length-1 && buff[sofar] == '0') sofar++;    
1169     return sofar;
1170 }
1171 
1172 /** Convert a decimal string into a big uint.
1173  *
1174  * Params:
1175  *  data    The biguint to be receive the result. Must be large enough to 
1176  *          store the result.
1177  *  s       The decimal string. May contain 0..9, or _. Will be preserved.
1178  *
1179  * The required length for the destination buffer is slightly less than
1180  *  1 + s.length/log2(10) = 1 + s.length/3.3219.
1181  *
1182  * Returns:
1183  *    the highest index of data which was used.
1184  */
1185 int biguintFromDecimal(BigDigit [] data, const(char []) s) {
1186     // Convert to base 1e19 = 10_000_000_000_000_000_000.
1187     // (this is the largest power of 10 that will fit into a long).
1188     // The length will be less than 1 + s.length/log2(10) = 1 + s.length/3.3219.
1189     // 485 bits will only just fit into 146 decimal digits.
1190     uint lo = 0;
1191     uint x = 0;
1192     ulong y = 0;
1193     uint hi = 0;
1194     data[0] = 0; // initially number is 0.
1195     data[1] = 0;    
1196    
1197     for (int i= (s[0]=='-' || s[0]=='+')? 1 : 0; i<s.length; ++i) {            
1198         if (s[i] == '_') continue;
1199         x *= 10;
1200         x += s[i] - '0';
1201         ++lo;
1202         if (lo==9) {
1203             y = x;
1204             x = 0;
1205         }
1206         if (lo==18) {
1207             y *= 10_0000_0000;
1208             y += x;
1209             x = 0;
1210         }
1211         if (lo==19) {
1212             y *= 10;
1213             y += x;
1214             x = 0;
1215             // Multiply existing number by 10^19, then add y1.
1216             if (hi>0) {
1217                 data[hi] = multibyteMul(data[0..hi], data[0..hi], 1220703125*2, 0); // 5^13*2 = 0x9184_E72A
1218                 ++hi;
1219                 data[hi] = multibyteMul(data[0..hi], data[0..hi], 15625*262144, 0); // 5^6*2^18 = 0xF424_0000
1220                 ++hi;
1221             } else hi = 2;
1222             uint c = multibyteIncrementAssign!('+')(data[0..hi], cast(uint)(y&0xFFFF_FFFF));
1223             c += multibyteIncrementAssign!('+')(data[1..hi], cast(uint)(y>>32));
1224             if (c!=0) {
1225                 data[hi]=c;
1226                 ++hi;
1227             }
1228             y = 0;
1229             lo = 0;
1230         }
1231     }
1232     // Now set y = all remaining digits.
1233     if (lo>=18) {
1234     } else if (lo>=9) {
1235         for (int k=9; k<lo; ++k) y*=10;
1236         y+=x;
1237     } else {
1238         for (int k=0; k<lo; ++k) y*=10;
1239         y+=x;
1240     }
1241     if (lo!=0) {
1242         if (hi==0)  {
1243             *cast(ulong *)(&data[hi]) = y;
1244             hi=2;
1245         } else {
1246             while (lo>0) {
1247                 uint c = multibyteMul(data[0..hi], data[0..hi], 10, 0);
1248                 if (c!=0) { data[hi]=c; ++hi; }                
1249                 --lo;
1250             }
1251             uint c = multibyteIncrementAssign!('+')(data[0..hi], cast(uint)(y&0xFFFF_FFFF));
1252             if (y>0xFFFF_FFFFL) {
1253                 c += multibyteIncrementAssign!('+')(data[1..hi], cast(uint)(y>>32));
1254             }
1255             if (c!=0) { data[hi]=c; ++hi; }
1256           //  hi+=2;
1257         }
1258     }
1259     if (hi>1 && data[hi-1]==0) --hi;
1260     return hi;
1261 }
1262 
1263 
1264 private:
1265 // ------------------------
1266 // These in-place functions are only for internal use; they are incompatible
1267 // with COW.
1268 
1269 // Classic 'schoolbook' multiplication.
1270 void mulSimple(BigDigit[] result, in BigDigit [] left, in BigDigit[] right)
1271 in {    
1272     assert(result.length == left.length + right.length);
1273     assert(right.length>1);
1274 }
1275 body {
1276     result[left.length] = multibyteMul(result[0..left.length], left, right[0], 0);   
1277     multibyteMultiplyAccumulate(result[1..$], left, right[1..$]);
1278 }
1279 
1280 // Classic 'schoolbook' squaring
1281 void squareSimple(BigDigit[] result, BigDigit [] x)
1282 in {    
1283     assert(result.length == 2*x.length);
1284     assert(x.length>1);
1285 }
1286 body {
1287     multibyteSquare(result, x);
1288 }
1289 
1290 
1291 // add two uints of possibly different lengths. Result must be as long
1292 // as the larger length.
1293 // Returns carry (0 or 1).
1294 uint addSimple(BigDigit [] result, BigDigit [] left, BigDigit [] right)
1295 in {
1296     assert(result.length == left.length);
1297     assert(left.length >= right.length);
1298     assert(right.length>0);
1299 }
1300 body {
1301     uint carry = multibyteAdd(result[0..right.length],
1302             left[0..right.length], right, 0);
1303     if (right.length < left.length) {
1304         result[right.length..left.length] = left[right.length .. $];            
1305         carry = multibyteIncrementAssign!('+')(result[right.length..$], carry);
1306     }
1307     return carry;
1308 }
1309 
1310 //  result = left - right
1311 // returns carry (0 or 1)
1312 BigDigit subSimple(BigDigit [] result, BigDigit [] left, BigDigit [] right)
1313 in {
1314     assert(result.length == left.length);
1315     assert(left.length >= right.length);
1316     assert(right.length>0);
1317 }
1318 body {
1319     BigDigit carry = multibyteSub(result[0..right.length],
1320             left[0..right.length], right, 0);
1321     if (right.length < left.length) {
1322         result[right.length..left.length] = left[right.length .. $];            
1323         carry = multibyteIncrementAssign!('-')(result[right.length..$], carry);
1324     } //else if (result.length==left.length+1) { result[$-1] = carry; carry=0; }
1325     return carry;
1326 }
1327 
1328 
1329 /* result = result - right 
1330  * Returns carry = 1 if result was less than right.
1331 */
1332 BigDigit subAssignSimple(BigDigit [] result, BigDigit [] right)
1333 {
1334     assert(result.length >= right.length);
1335     uint c = multibyteSub(result[0..right.length], result[0..right.length], right, 0); 
1336     if (c && result.length > right.length) c = multibyteIncrementAssign!('-')(result[right.length .. $], c);
1337     return c;
1338 }
1339 
1340 /* result = result + right
1341 */
1342 BigDigit addAssignSimple(BigDigit [] result, BigDigit [] right)
1343 {
1344     assert(result.length >= right.length);
1345     uint c = multibyteAdd(result[0..right.length], result[0..right.length], right, 0);
1346     if (c && result.length > right.length) {
1347        c = multibyteIncrementAssign!('+')(result[right.length .. $], c);
1348     }
1349     return c;
1350 }
1351 
1352 /* performs result += wantSub? - right : right;
1353 */
1354 BigDigit addOrSubAssignSimple(BigDigit [] result, BigDigit [] right, bool wantSub)
1355 {
1356   if (wantSub) return subAssignSimple(result, right);
1357   else return addAssignSimple(result, right);
1358 }
1359 
1360 
1361 // return true if x<y, considering leading zeros
1362 bool less(in BigDigit[] x, in BigDigit[] y)
1363 {
1364     assert(x.length >= y.length);
1365     uint k = x.length-1;
1366     while(x[k]==0 && k>=y.length) --k; 
1367     if (k>=y.length) return false;
1368     while (k>0 && x[k]==y[k]) --k;
1369     return x[k] < y[k];
1370 }
1371 
1372 // Set result = abs(x-y), return true if result is negative(x<y), false if x<=y.
1373 bool inplaceSub(BigDigit[] result, in BigDigit[] x, in BigDigit[] y)
1374 {
1375     assert(result.length == (x.length >= y.length) ? x.length : y.length);
1376     
1377     size_t minlen;
1378     bool negative;
1379     if (x.length >= y.length) {
1380         minlen = y.length;
1381         negative = less(x, y);
1382     } else {
1383        minlen = x.length;
1384        negative = !less(y, x);
1385     }
1386     const(BigDigit)[] large, small;
1387     if (negative) { large = y; small=x; } else { large=x; small=y; }
1388        
1389     BigDigit carry = multibyteSub(result[0..minlen], large[0..minlen], small[0..minlen], 0);
1390     if (x.length != y.length) {
1391         result[minlen..large.length]= large[minlen..$];
1392         result[large.length..$] = 0;
1393         if (carry) multibyteIncrementAssign!('-')(result[minlen..$], carry);
1394     }
1395     return negative;
1396 }
1397 
1398 /* Determine how much space is required for the temporaries
1399  * when performing a Karatsuba multiplication. 
1400  */
1401 uint karatsubaRequiredBuffSize(uint xlen)
1402 {
1403     return xlen <= KARATSUBALIMIT ? 0 : 2*xlen; // - KARATSUBALIMIT+2;
1404 }
1405 
1406 /* Sets result = x*y, using Karatsuba multiplication.
1407 * x must be longer or equal to y.
1408 * Valid only for balanced multiplies, where x is not shorter than y.
1409 * It is superior to schoolbook multiplication if and only if 
1410 *    sqrt(2)*y.length > x.length > y.length.
1411 * Karatsuba multiplication is O(n^1.59), whereas schoolbook is O(n^2)
1412 * The maximum allowable length of x and y is uint.max; but better algorithms
1413 * should be used far before that length is reached.
1414 * Params:
1415 * scratchbuff      An array long enough to store all the temporaries. Will be destroyed.
1416 */
1417 void mulKaratsuba(BigDigit [] result, in BigDigit [] x, in BigDigit[] y, BigDigit [] scratchbuff)
1418 {
1419     assert(x.length >= y.length);
1420 	  assert(result.length < uint.max, "Operands too large");
1421     assert(result.length == x.length + y.length);
1422     if (x.length <= KARATSUBALIMIT) {
1423         return mulSimple(result, x, y);
1424     }
1425     // Must be almost square (otherwise, a schoolbook iteration is better)
1426     assert(2L * y.length * y.length > (x.length-1) * (x.length-1),
1427         "Bigint Internal Error: Asymmetric Karatsuba");
1428         
1429     // The subtractive version of Karatsuba multiply uses the following result:
1430     // (Nx1 + x0)*(Ny1 + y0) = (N*N)*x1y1 + x0y0 + N * (x0y0 + x1y1 - mid)
1431     // where mid = (x0-x1)*(y0-y1)
1432     // requiring 3 multiplies of length N, instead of 4.
1433     // The advantage of the subtractive over the additive version is that
1434     // the mid multiply cannot exceed length N. But there are subtleties:
1435     // (x0-x1),(y0-y1) may be negative or zero. To keep it simple, we 
1436     // retain all of the leading zeros in the subtractions
1437     
1438     // half length, round up.
1439     uint half = (x.length >> 1) + (x.length & 1);
1440     
1441     const(BigDigit) [] x0 = x[0 .. half];
1442     const(BigDigit) [] x1 = x[half .. $];    
1443     const(BigDigit) [] y0 = y[0 .. half];
1444     const(BigDigit) [] y1 = y[half .. $];
1445     BigDigit [] mid = scratchbuff[0 .. half*2];
1446     BigDigit [] newscratchbuff = scratchbuff[half*2 .. $];
1447     BigDigit [] resultLow = result[0 .. 2*half];
1448     BigDigit [] resultHigh = result[2*half .. $];
1449      // initially use result to store temporaries
1450     BigDigit [] xdiff= result[0 .. half];
1451     BigDigit [] ydiff = result[half .. half*2];
1452     
1453     // First, we calculate mid, and sign of mid
1454     bool midNegative = inplaceSub(xdiff, x0, x1)
1455                       ^ inplaceSub(ydiff, y0, y1);
1456     mulKaratsuba(mid, xdiff, ydiff, newscratchbuff);
1457     
1458     // Low half of result gets x0 * y0. High half gets x1 * y1
1459   
1460     mulKaratsuba(resultLow, x0, y0, newscratchbuff);
1461     
1462     if (2L * y1.length * y1.length < x1.length * x1.length) {
1463         // an asymmetric situation has been created.
1464         // Worst case is if x:y = 1.414 : 1, then x1:y1 = 2.41 : 1.
1465         // Applying one schoolbook multiply gives us two pieces each 1.2:1
1466         if (y1.length <= KARATSUBALIMIT) {
1467             mulSimple(resultHigh, x1, y1);
1468         } else {
1469             // divide x1 in two, then use schoolbook multiply on the two pieces.
1470             uint quarter = (x1.length >> 1) + (x1.length & 1);
1471             bool ysmaller = (quarter >= y1.length);
1472             mulKaratsuba(resultHigh[0..quarter+y1.length], ysmaller ? x1[0..quarter] : y1, 
1473                 ysmaller ? y1 : x1[0..quarter], newscratchbuff);
1474             // Save the part which will be overwritten.
1475             bool ysmaller2 = ((x1.length - quarter) >= y1.length);
1476             newscratchbuff[0..y1.length] = resultHigh[quarter..quarter + y1.length];
1477             mulKaratsuba(resultHigh[quarter..$], ysmaller2 ? x1[quarter..$] : y1, 
1478                 ysmaller2 ? y1 : x1[quarter..$], newscratchbuff[y1.length..$]);
1479 
1480             resultHigh[quarter..$].addAssignSimple(newscratchbuff[0..y1.length]);                
1481         }
1482     } else mulKaratsuba(resultHigh, x1, y1, newscratchbuff);
1483 
1484     /* We now have result = x0y0 + (N*N)*x1y1
1485        Before adding or subtracting mid, we must calculate
1486        result += N * (x0y0 + x1y1)    
1487        We can do this with three half-length additions. With a = x0y0, b = x1y1:
1488                       aHI aLO
1489         +       aHI   aLO
1490         +       bHI   bLO
1491         +  bHI  bLO
1492         =  R3   R2    R1   R0        
1493         R1 = aHI + bLO + aLO
1494         R2 = aHI + bLO + aHI + carry_from_R1
1495         R3 = bHi + carry_from_R2
1496          Can also do use newscratchbuff:
1497 
1498 //    It might actually be quicker to do it in two full-length additions:        
1499 //    newscratchbuff[2*half] = addSimple(newscratchbuff[0..2*half], result[0..2*half], result[2*half..$]);
1500 //    addAssignSimple(result[half..$], newscratchbuff[0..2*half+1]);
1501    */
1502     BigDigit[] R1 = result[half..half*2];
1503     BigDigit[] R2 = result[half*2..half*3];
1504     BigDigit[] R3 = result[half*3..$];
1505     BigDigit c1 = multibyteAdd(R2, R2, R1, 0); // c1:R2 = R2 + R1
1506     BigDigit c2 = multibyteAdd(R1, R2, result[0..half], 0); // c2:R1 = R2 + R1 + R0
1507     BigDigit c3 = addAssignSimple(R2, R3); // R2 = R2 + R1 + R3
1508     if (c1+c2) multibyteIncrementAssign!('+')(result[half*2..$], c1+c2);
1509     if (c1+c3) multibyteIncrementAssign!('+')(R3, c1+c3);
1510      
1511     // And finally we subtract mid
1512     addOrSubAssignSimple(result[half..$], mid, !midNegative);
1513 }
1514 
1515 void squareKaratsuba(BigDigit [] result, BigDigit [] x, BigDigit [] scratchbuff)
1516 {
1517     // See mulKaratsuba for implementation comments.
1518     // Squaring is simpler, since it never gets asymmetric.
1519 	  assert(result.length < uint.max, "Operands too large");
1520     assert(result.length == 2*x.length);
1521     if (x.length <= KARATSUBASQUARELIMIT) {
1522         return squareSimple(result, x);
1523     }
1524     // half length, round up.
1525     uint half = (x.length >> 1) + (x.length & 1);
1526     
1527     BigDigit [] x0 = x[0 .. half];
1528     BigDigit [] x1 = x[half .. $];    
1529     BigDigit [] mid = scratchbuff[0 .. half*2];
1530     BigDigit [] newscratchbuff = scratchbuff[half*2 .. $];
1531      // initially use result to store temporaries
1532     BigDigit [] xdiff= result[0 .. half];
1533     BigDigit [] ydiff = result[half .. half*2];
1534     
1535     // First, we calculate mid. We don't need its sign
1536     inplaceSub(xdiff, x0, x1);
1537     squareKaratsuba(mid, xdiff, newscratchbuff);
1538   
1539     // Set result = x0x0 + (N*N)*x1x1
1540     squareKaratsuba(result[0 .. 2*half], x0, newscratchbuff);
1541     squareKaratsuba(result[2*half .. $], x1, newscratchbuff);
1542 
1543     /* result += N * (x0x0 + x1x1)    
1544        Do this with three half-length additions. With a = x0x0, b = x1x1:
1545         R1 = aHI + bLO + aLO
1546         R2 = aHI + bLO + aHI + carry_from_R1
1547         R3 = bHi + carry_from_R2
1548     */
1549     BigDigit[] R1 = result[half..half*2];
1550     BigDigit[] R2 = result[half*2..half*3];
1551     BigDigit[] R3 = result[half*3..$];
1552     BigDigit c1 = multibyteAdd(R2, R2, R1, 0); // c1:R2 = R2 + R1
1553     BigDigit c2 = multibyteAdd(R1, R2, result[0..half], 0); // c2:R1 = R2 + R1 + R0
1554     BigDigit c3 = addAssignSimple(R2, R3); // R2 = R2 + R1 + R3
1555     if (c1+c2) multibyteIncrementAssign!('+')(result[half*2..$], c1+c2);
1556     if (c1+c3) multibyteIncrementAssign!('+')(R3, c1+c3);
1557      
1558     // And finally we subtract mid, which is always positive
1559     subAssignSimple(result[half..$], mid);
1560 }
1561 
1562 /* Knuth's Algorithm D, as presented in 
1563  * H.S. Warren, "Hacker's Delight", Addison-Wesley Professional (2002).
1564  * Also described in "Modern Computer Arithmetic" 0.2, Exercise 1.8.18.
1565  * Given u and v, calculates  quotient  = u/v, u = u%v.
1566  * v must be normalized (ie, the MSB of v must be 1).
1567  * The most significant words of quotient and u may be zero.
1568  * u[0..v.length] holds the remainder.
1569  */
1570 void schoolbookDivMod(BigDigit [] quotient, BigDigit [] u, in BigDigit [] v)
1571 {
1572     assert(quotient.length == u.length - v.length);
1573     assert(v.length > 1);
1574     assert(u.length >= v.length);
1575     assert((v[$-1]&0x8000_0000)!=0);
1576     assert(u[$-1] < v[$-1]);
1577     // BUG: This code only works if BigDigit is uint.
1578     uint vhi = v[$-1];
1579     uint vlo = v[$-2];
1580         
1581     for (int j = u.length - v.length - 1; j >= 0; j--) {
1582         // Compute estimate of quotient[j],
1583         // qhat = (three most significant words of u)/(two most sig words of v).
1584         uint qhat;               
1585         if (u[j + v.length] == vhi) {
1586             // uu/vhi could exceed uint.max (it will be 0x8000_0000 or 0x8000_0001)
1587             qhat = uint.max;
1588         } else {
1589             uint ulo = u[j + v.length - 2];
1590 version(Naked_D_InlineAsm_X86) {
1591             // Note: On DMD, this is only ~10% faster than the non-asm code. 
1592             uint *p = &u[j + v.length - 1];
1593             asm {
1594                 mov EAX, p;
1595                 mov EDX, [EAX+4];
1596                 mov EAX, [EAX];
1597                 div dword ptr [vhi];
1598                 mov qhat, EAX;
1599                 mov ECX, EDX;
1600 div3by2correction:                
1601                 mul dword ptr [vlo]; // EDX:EAX = qhat * vlo
1602                 sub EAX, ulo;
1603                 sbb EDX, ECX;
1604                 jbe div3by2done;
1605                 mov EAX, qhat;
1606                 dec EAX;
1607                 mov qhat, EAX;
1608                 add ECX, dword ptr [vhi];
1609                 jnc div3by2correction;
1610 div3by2done:    ;
1611 }
1612             } else { // version(InlineAsm)
1613                 ulong uu = (cast(ulong)(u[j+v.length]) << 32) | u[j+v.length-1];
1614                 ulong bigqhat = uu / vhi;
1615                 ulong rhat =  uu - bigqhat * vhi;
1616                 qhat = cast(uint)bigqhat;            
1617        again:
1618                 if (cast(ulong)qhat*vlo > ((rhat<<32) + ulo)) {
1619                     --qhat;
1620                     rhat += vhi;
1621                     if (!(rhat & 0xFFFF_FFFF_0000_0000L)) goto again;
1622                 }
1623             } // version(InlineAsm)
1624         } 
1625         // Multiply and subtract.
1626         uint carry = multibyteMulAdd!('-')(u[j..j+v.length], v, qhat, 0);
1627 
1628         if (u[j+v.length] < carry) {
1629             // If we subtracted too much, add back
1630             --qhat;
1631             carry -= multibyteAdd(u[j..j+v.length],u[j..j+v.length], v, 0);
1632         }
1633         quotient[j] = qhat;
1634         u[j + v.length] = u[j + v.length] - carry;
1635     }
1636 }
1637 
1638 private:
1639 // TODO: Replace with a library call
1640 void itoaZeroPadded(char[] output, uint value, int radix = 10) {
1641     int x = output.length - 1;
1642     for( ; x>=0; --x) {
1643         output[x]= cast(char)(value % radix + '0');
1644         value /= radix;
1645     }
1646 }
1647 
1648 void toHexZeroPadded(char[] output, uint value) {
1649     int x = output.length - 1;
1650     const char [] hexDigits = "0123456789ABCDEF";
1651     for( ; x>=0; --x) {        
1652         output[x] = hexDigits[value & 0xF];
1653         value >>= 4;
1654     }
1655 }
1656 
1657 private:
1658     
1659 // Returns the highest value of i for which left[i]!=right[i],
1660 // or 0 if left[]==right[]
1661 int highestDifferentDigit(in BigDigit [] left, in BigDigit [] right)
1662 {
1663     assert(left.length == right.length);
1664     for (int i=left.length-1; i>0; --i) {
1665         if (left[i]!=right[i]) return i;
1666     }
1667     return 0;
1668 }
1669 
1670 // Returns the lowest value of i for which x[i]!=0.
1671 int firstNonZeroDigit(BigDigit[] x)
1672 {
1673     int k = 0;
1674     while (x[k]==0) {
1675         ++k;
1676         assert(k<x.length);
1677     }
1678     return k;
1679 }
1680 
1681 /* Calculate quotient and remainder of u / v using fast recursive division.
1682   v must be normalised, and must be at least half as long as u.
1683   Given u and v, v normalised, calculates  quotient  = u/v, u = u%v.
1684   Algorithm is described in 
1685   - C. Burkinel and J. Ziegler, "Fast Recursive Division", MPI-I-98-1-022, 
1686     Max-Planck Institute fuer Informatik, (Oct 1998).
1687   - R.P. Brent and P. Zimmermann, "Modern Computer Arithmetic", 
1688     Version 0.2, p. 26, (June 2008).
1689 Returns:    
1690     u[0..v.length] is the remainder. u[v.length..$] is corrupted.
1691     scratch is temporary storage space, must be at least as long as quotient.
1692 */
1693 void recursiveDivMod(BigDigit[] quotient, BigDigit[] u, in BigDigit[] v,
1694                      BigDigit[] scratch)
1695 in {
1696     assert(quotient.length == u.length - v.length);
1697     assert(u.length <= 2 * v.length, "Asymmetric division"); // use base-case division to get it to this situation
1698     assert(v.length > 1);
1699     assert(u.length >= v.length);
1700     assert((v[$ - 1] & 0x8000_0000) != 0);
1701     assert(scratch.length >= quotient.length);
1702     
1703 }
1704 body {
1705     if(quotient.length < FASTDIVLIMIT) {
1706         return schoolbookDivMod(quotient, u, v);
1707     }
1708     uint k = quotient.length >> 1;
1709     uint h = k + v.length;
1710 
1711     recursiveDivMod(quotient[k .. $], u[2 * k .. $], v[k .. $], scratch);
1712     adjustRemainder(quotient[k .. $], u[k .. h], v, k,
1713             scratch[0 .. quotient.length]);
1714     recursiveDivMod(quotient[0 .. k], u[k .. h], v[k .. $], scratch);
1715     adjustRemainder(quotient[0 .. k], u[0 .. v.length], v, k,
1716             scratch[0 .. 2 * k]);
1717 }
1718 
1719 // rem -= quot * v[0..k].
1720 // If would make rem negative, decrease quot until rem is >=0.
1721 // Needs (quot.length * k) scratch space to store the result of the multiply. 
1722 void adjustRemainder(BigDigit[] quot, BigDigit[] rem, in BigDigit[] v, int k,
1723                      BigDigit[] scratch)
1724 {
1725     assert(rem.length == v.length);
1726     mulInternal(scratch, quot, v[0 .. k]);
1727     uint carry = subAssignSimple(rem, scratch);
1728     while(carry) {
1729         multibyteIncrementAssign!('-')(quot, 1); // quot--
1730         carry -= multibyteAdd(rem, rem, v, 0);
1731     }
1732 }
1733 
1734 // Cope with unbalanced division by performing block schoolbook division.
1735 void fastDivMod(BigDigit [] quotient, BigDigit [] u, in BigDigit [] v)
1736 {
1737     assert(quotient.length == u.length - v.length);
1738     assert(v.length > 1);
1739     assert(u.length >= v.length);
1740     assert((v[$-1] & 0x8000_0000)!=0);
1741     BigDigit [] scratch = new BigDigit[v.length];
1742 
1743     // Perform block schoolbook division, with 'v.length' blocks.
1744     uint m = u.length - v.length;
1745     while (m > v.length) {
1746         recursiveDivMod(quotient[m-v.length..m], 
1747             u[m - v.length..m + v.length], v, scratch);
1748         m -= v.length;
1749     }
1750     recursiveDivMod(quotient[0..m], u[0..m + v.length], v, scratch);
1751     delete scratch;
1752 }
1753 
1754 debug(UnitTest)
1755 {
1756 import tango.stdc.stdio;
1757 
1758 void printBiguint(uint [] data)
1759 {
1760     char [] buff = new char[data.length*9];
1761     printf("%.*s\n", biguintToHex(buff, data, '_'));
1762 }
1763 
1764 void printDecimalBigUint(BigUint data)
1765 {
1766    printf("%.*s\n", data.toDecimalString(0)); 
1767 }
1768 
1769 unittest{
1770   uint [] a, b;
1771   a = new uint[43];
1772   b = new uint[179];
1773   for (int i=0; i<a.length; ++i) a[i] = 0x1234_B6E9 + i;
1774   for (int i=0; i<b.length; ++i) b[i] = 0x1BCD_8763 - i*546;
1775   
1776   a[$-1] |= 0x8000_0000;
1777   uint [] r = new uint[a.length];
1778   uint [] q = new uint[b.length-a.length+1];
1779  
1780   divModInternal(q, r, b, a);
1781   q = q[0..$-1];
1782   uint [] r1 = r.dup;
1783   uint [] q1 = q.dup;  
1784   fastDivMod(q, b, a);
1785   r = b[0..a.length];
1786   assert(r[]==r1[]);
1787   assert(q[]==q1[]);
1788 }
1789 }
1790