tango.text.convert.Integer source code

1 /*******************************************************************************
2 
3         copyright:      Copyright (c) 2004 Kris Bell. All rights reserved
4 
5         license:        BSD style: $(LICENSE)
6         
7         version:        Initial release: Nov 2005
8         
9         author:         Kris
10 
11         A set of functions for converting between string and integer 
12         values. 
13 
14         Applying the D "import alias" mechanism to this module is highly
15         recommended, in order to limit namespace pollution:
16         ---
17         import Integer = tango.text.convert.Integer;
18 
19         auto i = Integer.parse ("32767");
20         ---
21         
22 *******************************************************************************/
23 
24 module tango.text.convert.Integer;
25 
26 private import tango.core.Exception;
27 private import tango.core.Octal;
28 
/******************************************************************************

        Parse a 32-bit signed integer from the provided 'digits' string.

        The text is converted via toLong(), so it is inspected for a
        sign and an optional radix prefix. A radix may be provided as
        an argument instead, whereupon it must match the prefix (where
        present). When radix is set to zero, conversion will default
        to decimal.

        Params:
        digits = the text to convert
        radix  = the numeric base, or zero to have it detected

        Returns: the parsed value, narrowed to an int

        Throws: IllegalArgumentException where the input text is not
        parsable in its entirety, or where the parsed value lies outside
        the range int.min .. int.max

        See_also: the low level functions parse() and convert()

******************************************************************************/
int toInt(T) (const(T[]) digits, uint radix=0)
{
        auto x = toLong (digits, radix);

        // check both ends of the range: the cast below would otherwise
        // silently discard the upper bits of an out-of-range value
        if (x > int.max || x < int.min)
            throw new IllegalArgumentException ("Integer.toInt :: integer overflow");
        return cast(int) x;
}
51 
/******************************************************************************

        Parse a 64-bit signed integer from the provided 'digits' string.

        The string is inspected for a sign and an optional radix
        prefix. A radix may be provided as an argument instead,
        whereupon it must match the prefix (where present). When
        radix is set to zero, conversion will default to decimal.

        Params:
        digits = the text to convert
        radix  = the numeric base, or zero to have it detected

        Returns: the value produced by parse()

        Throws: IllegalArgumentException where the input text is not
        parsable in its entirety. This includes text from which no
        digit at all could be converted, such as an empty string.

        See_also: the low level functions parse() and convert()

******************************************************************************/
long toLong(T) (const(T[]) digits, uint radix=0)
{
        size_t len;

        auto x = parse (digits, radix, &len);

        // parse() leaves len at zero when nothing was converted; treat
        // that as invalid, in the same way toUlong() does, rather than
        // quietly returning zero for an empty string
        if (len == 0 || len < digits.length)
            throw new IllegalArgumentException ("Integer.toLong :: invalid literal");
        return x;
}
76 
/******************************************************************************

        Parse an unsigned integer value from the provided 'digits' string.

        The string is inspected for an optional radix prefix. A
        radix may be provided as an argument instead, whereupon
        it must match the prefix (where present). When radix is
        set to zero, conversion will default to decimal.

        Params:
        digits = the text to convert
        radix  = the numeric base, or zero to have it detected

        Returns: the converted value

        Throws: IllegalArgumentException where a '-' sign is present,
        where no digit could be converted, or where the input text is
        not parsable in its entirety.

        See_also: the low level functions parse() and convert()

******************************************************************************/
ulong toUlong(T) (const(T[]) digits, uint radix=0)
{
        bool negative = false;

        // skip whitespace, sign and radix prefix; may update radix
        immutable skipped = trim (digits, negative, radix);
        if (negative)
            throw new IllegalArgumentException ("Integer.toUlong :: invalid literal");

        size_t used = 0;
        auto result = convert (digits[skipped..$], radix, &used);

        // at least one digit is required, and nothing may trail it
        immutable complete = used != 0 && skipped + used >= digits.length;
        if (! complete)
            throw new IllegalArgumentException ("Integer.toUlong :: invalid literal");

        return result;
}
107 
/******************************************************************************

        Convenience wrapper: formats 'i' into a temporary buffer and
        returns a freshly allocated char[] copy of the result.

        See format() for details of the 'fmt' notation

******************************************************************************/

char[] toString (long i, const(char[]) fmt = null)
{
        // stack scratch space; presumably sized for 64 binary digits
        // plus a two character radix prefix
        char[66] scratch = void;

        auto text = format (scratch[], i, fmt);
        return text.dup;
}
122                
/******************************************************************************

        Convenience wrapper: formats 'i' into a temporary buffer and
        returns a freshly allocated wchar[] copy of the result.

        See format() for details of the 'fmt' notation

******************************************************************************/

wchar[] toString16 (long i, const(wchar[]) fmt = null)
{
        // stack scratch space; presumably sized for 64 binary digits
        // plus a two character radix prefix
        wchar[66] scratch = void;

        auto text = format (scratch[], i, fmt);
        return text.dup;
}
137                
/******************************************************************************

        Convenience wrapper: formats 'i' into a temporary buffer and
        returns a freshly allocated dchar[] copy of the result.

        See format() for details of the 'fmt' notation

******************************************************************************/

dchar[] toString32 (long i, const(dchar[]) fmt = null)
{
        // stack scratch space; presumably sized for 64 binary digits
        // plus a two character radix prefix
        dchar[66] scratch = void;

        auto text = format (scratch[], i, fmt);
        return text.dup;
}
152                
/*******************************************************************************

        Format 'i' as text within the provided 'dst' buffer, according to
        a format specification of the following shape:
        ---
        type width prefix
        ---

        Type is one of [d, g, u, b, x, o] or uppercase equivalent, and
        dictates the conversion radix or other semantics. An empty
        specification is treated as "d".

        Width is optional and indicates a minimum width for zero-padding,
        while the optional prefix is one of ['#', ' ', '+'] and indicates
        what variety of prefix should be placed in the output. e.g.
        ---
        "d"     => integer
        "u"     => unsigned
        "o"     => octal
        "b"     => binary
        "x"     => hexadecimal
        "X"     => hexadecimal uppercase

        "d+"    => integer prefixed with "+"
        "b#"    => binary prefixed with "0b"
        "x#"    => hexadecimal prefixed with "0x"
        "X#"    => hexadecimal prefixed with "0X"

        "d8"    => decimal padded to 8 places as required
        "b8"    => binary padded to 8 places as required
        "b8#"   => binary padded to 8 places and prefixed with "0b"
        ---

        Note that the specified width is exclusive of the prefix, though
        the width padding will be shrunk as necessary in order to ensure
        a requested prefix can be inserted into the provided output.

        Returns: the text produced by formatter() for the decoded
        specification

*******************************************************************************/
T[] format(T) (T[] dst, long i, const(T[]) fmt = null)
{
        char    kind,
                lead;
        int     pad;

        // split the specification into its three components ...
        decode (fmt, kind, lead, pad);

        // ... and let formatter() do the actual conversion
        return formatter (dst, i, kind, lead, pad);
}
199 
/*******************************************************************************

        Split a format specification into its components.

        The first character becomes 'type' ('d' where the specification
        is empty). Of the remaining characters, decimal digits are
        accumulated into 'width' and any other character is stored in
        'pre', the last such character winning.

*******************************************************************************/
private void decode(T) (in T[] fmt, ref char type, out char pre, out int width)
{
        // nothing supplied: plain decimal
        if (fmt.length is 0)
           {
           type = 'd';
           return;
           }

        type = cast(char) fmt[0];

        foreach (c; fmt[1 .. $])
                {
                immutable isDigit = c >= '0' && c <= '9';

                if (isDigit)
                    width = width * 10 + (c - '0');
                else
                   pre = cast(char) c;
                }
}
/*******************************************************************************

        One row of the conversion table used by formatter()

*******************************************************************************/
private struct _FormatterInfo(T)
{
    uint               radix;       // numeric base applied to the value
    immutable(T)[]     prefix;      // text placed ahead of the digits
    immutable(T)[]     numbers;     // digit characters, indexed by digit value
}

/*******************************************************************************

        Convert 'i' to text at the tail of 'dst', using an already
        decoded format specification.

        Params:
        dst   = the output buffer; the result is a slice of its tail
        i     = the value to convert
        type  = one of [d, g, u, b, o, x] or uppercase equivalent
        pre   = prefix selector: '#' requests the radix prefix, while
                ' ' and '+' select the lead character of a non-negative
                'd' or 'g' conversion
        width = minimum number of digits, zero-padded; zero disables
                padding

        Returns: a slice of 'dst' holding the formatted text. Where the
        type is not recognized, or where 'dst' cannot hold the digits
        and prefix, a newly allocated message is returned instead:
        "{unknown format 'c'}" or "{output width too small}".

*******************************************************************************/
T[] formatter(T) (T[] dst, long i, char type, char pre, int width)
{
        __gshared immutable immutable(T)[] lower = "0123456789abcdef";
        __gshared immutable immutable(T)[] upper = "0123456789ABCDEF";
        
        alias _FormatterInfo!(T) Info;

        __gshared immutable Info[] formats = 
                [
                {10, null, lower}, 
                {10, "-",  lower}, 
                {10, " ",  lower}, 
                {10, "+",  lower}, 
                { 2, "0b", lower}, 
                { 8, "0o", lower}, 
                {16, "0x", lower}, 
                {16, "0X", upper},
                ];

        ubyte index;
        int len = cast(int)dst.length;

        if (len)
           {
           switch (type)
                  {
                  case 'd':
                  case 'D':
                  case 'g':
                  case 'G':
                       if (i < 0)
                          {
                          // negating long.min wraps back onto itself, yet
                          // the unsigned cast applied further down still
                          // yields the correct magnitude
                          index = 1;
                          i = -i;
                          }
                       else
                          if (pre is ' ')
                              index = 2;
                          else
                             if (pre is '+')
                                 index = 3;
                       goto case 'U';
                  case 'u':
                  case 'U':
                       // decimal rows always emit their (possibly empty)
                       // table prefix
                       pre = '#';
                       break;

                  case 'b':
                  case 'B':
                       index = 4;
                       break;

                  case 'o':
                  case 'O':
                       index = 5;
                       break;

                  case 'x':
                       index = 6;
                       break;

                  case 'X':
                       index = 7;
                       break;

                  default:
                       {
                       // assemble the message from T-typed literals: a
                       // cast of a char[] to wchar[] or dchar[] would
                       // reinterpret its bytes rather than convert them
                       immutable(T)[] head = "{unknown format '";
                       immutable(T)[] tail = "'}";

                       T[] msg = head.dup;
                       msg ~= cast(T) type;
                       msg ~= tail;
                       return msg;
                       }
                  }

           auto info = &formats[index];
           auto numbers = info.numbers;
           auto radix = info.radix;

           // convert number to text, least significant digit first,
           // filling dst from its tail. The loops stop early when the
           // buffer is exhausted, leaving len at zero
           auto p = dst.ptr + len;
           if (uint.max >= cast(ulong) i)
              {
              // 32-bit arithmetic suffices for this value
              auto v = cast (uint) i;
              do {
                 *--p = numbers [v % radix];
                 } while ((v /= radix) && --len);
              }
           else
              {
              auto v = cast (ulong) i;
              do {
                 *--p = numbers [cast(uint) (v % radix)];
                 } while ((v /= radix) && --len);
              }
        
           auto prefix = (pre is '#') ? info.prefix : null;
           if (len > prefix.length)
              {
              // len becomes the index at which the prefix will begin
              len -= prefix.length + 1;

              // prefix number with zeros? 
              if (width)
                 {
                 // index the prefix would have with full padding; the
                 // len > 0 test shrinks the padding to fit the buffer
                 width = cast(int)dst.length - width - cast(int)prefix.length;
                 while (len > width && len > 0)
                       {
                       *--p = '0';
                       --len;
                       }
                 }
              // write optional prefix string ...
              dst [len .. len + prefix.length] = prefix[];

              // return slice of provided output buffer
              return dst [len .. $];                               
              }
           }
        
        // typed literal, for the same reason as the message above
        immutable(T)[] tooSmall = "{output width too small}";
        return tooSmall.dup;
}
340 
341 
/******************************************************************************

        Parse an integer value from the provided 'digits' string. 

        Leading whitespace, a sign and an optional radix prefix are
        handled by trim(); the remaining text is handed to convert().
        A radix may be provided as an argument instead, whereupon it
        must match the prefix (where present). When radix is set to
        zero, conversion will default to decimal.

        A non-null 'ate' will return the number of characters used
        to construct the returned value, or zero where no digit was
        converted.

        Throws: none. The 'ate' param should be checked for valid input.

******************************************************************************/
long parse(T) (T[] digits, uint radix=0, size_t* ate=null)
{
        bool negative;

        immutable skipped   = trim (digits, negative, radix);
        auto      magnitude = convert (digits[skipped..$], radix, ate);

        // only account for the skipped lead when digits were found, so
        // that a lone "-" is not reported as a parsed zero
        if (ate !is null && *ate != 0)
            *ate += skipped;

        if (negative)
            return cast(long) (-magnitude);
        return cast(long) magnitude;
}
370 
/******************************************************************************

        Convert the provided 'digits' into an integer value,
        without checking for a sign or radix. The radix defaults
        to decimal (10).

        The characters '0'..'9' carry the values 0..9, while letters
        of either case carry the values 10..35. Conversion stops at
        the first character which is neither, or whose value is not
        below the radix.

        Returns the value and updates 'ate' (where non-null) with the
        number of characters consumed.

        Throws: none. The 'ate' param should be checked for valid input.

******************************************************************************/
ulong convert(T) (const(T[]) digits, uint radix=10, size_t* ate=null)
{
        uint  used;
        ulong total;

        foreach (ch; digits)
                {
                uint digit;

                // map the character onto its digit value
                if (ch >= '0' && ch <= '9')
                    digit = ch - '0';
                else
                   if (ch >= 'a' && ch <= 'z')
                       digit = ch - 'a' + 10;
                   else
                      if (ch >= 'A' && ch <= 'Z')
                          digit = ch - 'A' + 10;
                      else
                         break;

                // not a digit of this radix
                if (digit >= radix)
                    break;

                total = total * radix + digit;
                ++used;
                }

        if (ate !is null)
            *ate = used;

        return total;
}
415 
416 
/******************************************************************************

        Strip leading whitespace, extract an optional +/- sign,
        and an optional radix prefix. If the radix value matches
        an optional prefix, or the radix is zero, the prefix will
        be consumed and assigned. Where the radix is non zero and
        does not match an explicit prefix, the latter will remain 
        unconsumed. Otherwise, radix will default to 10.

        Params:
        digits = the text to inspect
        sign   = set to true for each '-' and to false for each '+'
                 encountered; left untouched otherwise
        radix  = the requested radix on input (zero to detect it), and
                 the effective radix on output. Left untouched where
                 'digits' is empty.

        Returns the number of characters consumed.

******************************************************************************/
size_t trim(T) (const(T[]) digits, ref bool sign, ref uint radix)
{
        size_t    pos = 0;
        immutable len = digits.length;

        if (len)
           {
           // strip off whitespace and sign characters. The bound is
           // tested ahead of each read, such that text made up solely
           // of these characters is never read beyond its end
           for (; pos < len; ++pos)
               {
               auto c = digits[pos];

               if (c is ' ' || c is '\t')
                   continue;

               if (c is '-')
                   sign = true;
               else
                  if (c is '+')
                      sign = false;
                  else
                     break;
               }

           // strip off a radix specifier also? This needs a '0' with
           // at least one further character behind it
           auto   r = radix;
           size_t prefix = 0;

           if (len - pos > 1 && digits[pos] is '0')
              {
              switch (digits[pos + 1])
                     {
                     case 'x':
                     case 'X':
                          r = 16;
                          prefix = 2;
                          break;

                     case 'b':
                     case 'B':
                          r = 2;
                          prefix = 2;
                          break;

                     case 'o':
                     case 'O':
                          r = 8;
                          prefix = 2;
                          break;

                     default:
                          break;
                     }
              }

           // default the radix to 10
           if (r is 0)
               radix = 10;
           else
              // explicit radix must match (optional) prefix
              if (radix != r)
                 {
                 if (radix)
                     prefix = 0;         // mismatch: leave prefix in place
                 else
                    radix = r;           // adopt the detected radix
                 }

           pos += prefix;
           }

        // return number of characters eaten
        return pos;
}
495 
496 
/******************************************************************************

        quick & dirty text-to-unsigned int converter. Use only when you
        know what the content is, or use parse() or convert() instead.

        Only the characters '0'..'9' are accepted as digits, whatever
        the radix; conversion stops at the first other character. No
        sign, whitespace or overflow handling is performed.

        Return the parsed uint
        
******************************************************************************/

uint atoi(T) (T[] s, int radix = 10)
{
        uint total;

        for (size_t k = 0; k < s.length; ++k)
            {
            auto ch = s[k];

            if (ch < '0' || ch > '9')
                break;

            total = total * radix + (ch - '0');
            }
        return total;
}
517 
518 
/******************************************************************************

        quick & dirty unsigned to text converter, where the provided output
        must be large enough to house the result (10 digits in the largest
        case). For mainstream use, consider utilizing format() instead.

        Digits are written from the tail of 'output' backwards, each
        one as '0' plus the digit value. The writes are not checked
        against the bounds of 'output'.

        Returns a populated slice of the provided output
        
******************************************************************************/
T[] itoa(T) (T[] output, uint value, int radix = 10)
{
        auto pos = output.length;

        do {
           // deliberately unchecked, as per the contract noted above
           output.ptr[--pos] = cast(T)(value % radix + '0');
           value /= radix;
           } while (value);

        return output[pos .. $];
}
537 
538 
/******************************************************************************

        Consume a number from the input without converting it. Argument
        'fp' enables floating-point consumption. Supports hex input for
        numbers which are prefixed appropriately

        Leading whitespace, sign and radix prefix are skipped by way of
        trim(), and are part of the returned slice. With 'fp' enabled,
        an optional '.', decimal fraction digits and an exponent (an
        'e' or 'E', an optional sign, and decimal digits) are consumed
        in addition.

        Params:
        src = the text to scan
        fp  = true to accept a fraction and an exponent also

        Returns: the leading slice of 'src' which was consumed, or null
        where 'src' is empty or nothing remains once trim() is done

        Since version 0.99.9

******************************************************************************/

T[] consume(T) (T[] src, bool fp=false)
{
        bool    sign;
        uint    radix;

        immutable len = src.length;

        // remove leading space, and sign
        auto pos = trim (src, sign, radix);

        // bail out if the string is empty
        if (pos >= len)
            return null;

        immutable start = pos;

        // each read below is preceded by a bounds test, such that the
        // scan neither inspects nor steps over anything beyond 'src'

        // read leading digits
        for (; pos < len; ++pos)
            {
            auto c = src[pos];

            if (c >= '0' && c <= '9')
                continue;

            if (radix is 16 && ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')))
                continue;

            break;
            }

        if (fp)
           {
           // gobble up a point
           if (pos < len && src[pos] is '.')
               ++pos;

           // read fractional digits
           while (pos < len && src[pos] >= '0' && src[pos] <= '9')
                  ++pos;

           // consume exponent, provided something preceded it
           if (pos > start && pos < len && (src[pos] is 'e' || src[pos] is 'E'))
              {
              ++pos;

              if (pos < len && (src[pos] is '+' || src[pos] is '-'))
                  ++pos;

              while (pos < len && src[pos] >= '0' && src[pos] <= '9')
                     ++pos;
              }
           }

        return src [0 .. pos];
}
595 
596 
/******************************************************************************

        Unit tests, compiled in via the UnitTest debug identifier

******************************************************************************/

debug (UnitTest)
{

        unittest
        {
        char[64] scratch;

        // one formatting expectation
        struct Sample
        {
                long   value;
                string spec;
                string expect;
        }

        // high level wrappers
        assert (toInt("1") is 1);
        assert (toLong("1") is 1);
        assert (toInt("1", 10) is 1);
        assert (toLong("1", 10) is 1);
        assert (toUlong("1", 10) is 1);
        assert (toUlong("18446744073709551615") is ulong.max);

        // quick & dirty converters
        assert (atoi ("12345") is 12345);
        assert (itoa (scratch, 12345) == "12345");

        // wide input, with and without a sign
        assert(parse( "0"w ) ==  0 );
        assert(parse( "1"w ) ==  1 );
        assert(parse( "-1"w ) ==  -1 );
        assert(parse( "+1"w ) ==  1 );

        // numerical limits
        assert(parse( "-2147483648" ) == int.min );
        assert(parse(  "2147483647" ) == int.max );
        assert(parse(  "4294967295" ) == uint.max );

        assert(parse( "-9223372036854775808" ) == long.min );
        assert(parse( "9223372036854775807" ) == long.max );
        assert(parse( "18446744073709551615" ) == ulong.max );

        // hex: single digits of either case map onto 10 .. 15
        foreach (idx, text; ["a", "b", "c", "d", "e", "f",
                             "A", "B", "C", "D", "E", "F"])
                 assert(parse( text, 16) == 0x0A + (idx % 6) );

        assert(parse( "FFFF", 16) == ushort.max );
        assert(parse( "ffffFFFF", 16) == uint.max );
        assert(parse( "ffffFFFFffffFFFF", 16u ) == ulong.max );

        // oct
        assert(parse( "55", 8) == octal!(55) );
        assert(parse( "100", 8) == octal!(100) );

        // bin
        assert(parse( "10000", 2) == 0x10 );

        // trim
        assert(parse( "    \t20") == 20 );
        assert(parse( "    \t-20") == -20 );
        assert(parse( "-    \t 20") == -20 );

        // recognise radix prefix
        assert(parse( "0xFFFF" ) == ushort.max );
        assert(parse( "0XffffFFFF" ) == uint.max );
        assert(parse( "0o55") == octal!(55) );
        assert(parse( "0O55" ) == octal!(55) );
        assert(parse( "0b10000") == 0x10 );
        assert(parse( "0B10000") == 0x10 );

        // prefix tests: a truncated prefix is a plain zero ...
        auto text = "0x";
        assert(parse( text[0..1] ) ==  0 );

        // ... and a prefix in conflict with an explicit radix stays unconsumed
        foreach (conflicting; ["0x10", "0b10", "0o10"])
                 assert(parse(conflicting, 10) == 0);

        assert(parse("0b10") == 0b10);
        assert(parse("0o10") == octal!(10));
        assert(parse("0b10", 2) == 0b10);
        assert(parse("0o10", 8) == octal!(10));

        // revised tests
        foreach (s; [
                    Sample ( 10, "d",   "10"),
                    Sample (-10, "d",   "-10"),

                    Sample ( 10, "u",   "10"),
                    Sample ( 10, "U",   "10"),
                    Sample ( 10, "g",   "10"),
                    Sample ( 10, "G",   "10"),
                    Sample ( 10, "o",   "12"),
                    Sample ( 10, "O",   "12"),
                    Sample ( 10, "b",   "1010"),
                    Sample ( 10, "B",   "1010"),
                    Sample ( 10, "x",   "a"),
                    Sample ( 10, "X",   "A"),

                    Sample ( 10, "d+",  "+10"),
                    Sample ( 10, "d ",  " 10"),
                    Sample ( 10, "d#",  "10"),
                    Sample ( 10, "x#",  "0xa"),
                    Sample ( 10, "X#",  "0XA"),
                    Sample ( 10, "b#",  "0b1010"),
                    Sample ( 10, "o#",  "0o12"),

                    Sample ( 10, "d1",  "10"),
                    Sample ( 10, "d8",  "00000010"),
                    Sample ( 10, "x8",  "0000000a"),
                    Sample ( 10, "X8",  "0000000A"),
                    Sample ( 10, "b8",  "00001010"),
                    Sample ( 10, "o8",  "00000012"),

                    Sample ( 10, "d1#", "10"),
                    Sample ( 10, "d6#", "000010"),
                    Sample ( 10, "x6#", "0x00000a"),
                    Sample ( 10, "X6#", "0X00000A"),
                    ])
                 assert (format(scratch, s.value, s.spec) == s.expect);

        // padding is shrunk to fit a buffer of just eight characters
        char[8] narrow;
        foreach (s; [
                    Sample (10, "b12#", "0b001010"),
                    Sample (10, "o12#", "0o000012"),
                    ])
                 assert (format(narrow, s.value, s.spec) == s.expect);
        }
}
716 
/******************************************************************************

        Manual demonstration, compiled in via the Integer debug
        identifier: prints a range of format() and consume() results

******************************************************************************/

debug (Integer)
{
        import tango.io.Stdout;

        void main()
        {
                char[8] buf;

                // print one formatted value beneath the given layout;
                // the output is handed back so that calls may chain
                auto show (const(char)[] layout, long value, const(char)[] spec = null)
                {
                        return Stdout.formatln (layout, format(buf, value, spec));
                }

                show ("d '{}'", 10);
                show ("d '{}'", -10);

                show ("u '{}'", 10L, "u");
                show ("U '{}'", 10L, "U");
                show ("g '{}'", 10L, "g");
                show ("G '{}'", 10L, "G");
                show ("o '{}'", 10L, "o");
                show ("O '{}'", 10L, "O");
                show ("b '{}'", 10L, "b");
                show ("B '{}'", 10L, "B");
                show ("x '{}'", 10L, "x");
                show ("X '{}'", 10L, "X");

                show ("d+ '{}'", 10L, "d+");
                show ("ds '{}'", 10L, "d ");
                show ("d# '{}'", 10L, "d#");
                show ("x# '{}'", 10L, "x#");
                show ("X# '{}'", 10L, "X#");
                show ("b# '{}'", 10L, "b#");
                show ("o# '{}'", 10L, "o#");

                show ("d1 '{}'", 10L, "d1");
                show ("d8 '{}'", 10L, "d8");
                show ("x8 '{}'", 10L, "x8");
                show ("X8 '{}'", 10L, "X8");
                show ("b8 '{}'", 10L, "b8");
                show ("o8 '{}'", 10L, "o8");

                show ("d1# '{}'", 10L, "d1#");
                show ("d6# '{}'", 10L, "d6#");
                show ("x6# '{}'", 10L, "x6#");
                show ("X6# '{}'", 10L, "X6#");

                show ("b12# '{}'", 10L, "b12#");
                show ("o12# '{}'", 10L, "o12#").newline;

                // integer consumption, then floating-point consumption
                foreach (text; ["10", "0x1f", "0.123"])
                         Stdout.formatln (consume(text));
                Stdout.formatln (consume("0.123", true));
                Stdout.formatln (consume("0.123e-10", true)).newline;

                // the same again, with trailing content
                foreach (text; ["10  s", "0x1f   s", "0.123  s"])
                         Stdout.formatln (consume(text));
                Stdout.formatln (consume("0.123  s", true));
                Stdout.formatln (consume("0.123e-10  s", true)).newline;
        }
}
779 
780 
781