1 /*******************************************************************************
2 
3         copyright:      Copyright (c) 2004 Kris Bell. All rights reserved
4 
5         license:        BSD style: $(LICENSE)
6 
7         version:        Initial release: Oct 2004
8 
9         authors:        Kris
10 
11         Fast Unicode transcoders. These are particularly sensitive to
12         minor changes on 32bit x86 devices, because the register set of
13         those devices is so small. Beware of subtle changes which might
14         extend the execution-period by as much as 200%. Because of this,
15         three of the six transcoders might read past the end of input by
16         one, two, or three bytes before arresting themselves. Note that
17         support for streaming adds a 15% overhead to the dchar => char
18         conversion, but has little effect on the others.
19 
20         These routines were tuned on an Intel P4; other devices may work
21         more efficiently with a slightly different approach, though this
22         is likely to be reasonably optimal on AMD x86 CPUs also. These
23         algorithms would benefit significantly from those extra AMD64
24         registers. On a 3GHz P4, the dchar/char conversions take around
25         2500ns to process an array of 1000 ASCII elements. Invoking the
26         memory manager doubles that period, and quadruples the time for
27         arrays of 100 elements. Memory allocation can slow down notably
28         in a multi-threaded environment, so avoid that where possible.
29 
30         Surrogate-pairs are dealt with in a non-optimal fashion when
31         transcoding between utf16 and utf8. Such cases are considered
32         to be boundary-conditions for this module.
33 
34         There are three common cases where the input may be incomplete,
35         including each 'widening' case of utf8 => utf16, utf8 => utf32,
36         and utf16 => utf32. An edge-case is utf16 => utf8, if surrogate
37         pairs are present. Such cases will throw an exception, unless
38         streaming-mode is enabled ~ in the latter mode, an additional
39         integer is returned indicating how many elements of the input
40         have been consumed. In all cases, a correct slice of the output
41         is returned.
42 
43         For details on Unicode processing see:
44         $(UL $(LINK http://www.utf-8.com/))
45         $(UL $(LINK http://www.hackcraft.net/xmlUnicode/))
46         $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html/))
47         $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/))
48 
49 *******************************************************************************/
50 
51 module tango.text.convert.Utf;
52 
// Error hook called by the transcoders in this module when they meet invalid
// or incomplete input. Only the extern (C) declaration lives here; the
// definition is not visible in this file. 'msg' describes the failure and
// 'idx' is the element offset supplied by the caller (defaults to 0).
public extern (C) void onUnicodeError (const(char[]) msg, size_t idx = 0);
54 
55 /*******************************************************************************
56 
57         Symmetric calls for equivalent types; these return the provided
58         input with no conversion
59 
60 *******************************************************************************/
61 
// utf8 => utf8: nothing to transcode, so 'src' is handed back untouched.
// 'dst' and 'ate' exist only for signature symmetry and are never used.
inout(char[]) toString (inout(char[]) src, char[] dst = null, size_t* ate=null)
{
        return src;
}

// utf16 => utf16: returns 'src' as-is; 'dst' and 'ate' are ignored.
inout(wchar[]) toString16 (inout(wchar[]) src, wchar[] dst = null, size_t* ate=null)
{
        return src;
}

// utf32 => utf32: returns 'src' as-is; 'dst' and 'ate' are ignored.
inout(dchar[]) toString32 (inout(dchar[]) src, dchar[] dst = null, size_t* ate=null)
{
        return src;
}
65 
66 /*******************************************************************************
67 
68         Encode Utf8 up to a maximum of 4 bytes long (five & six byte
69         variations are not supported).
70 
71         If the output is provided off the stack, it should be large
72         enough to encompass the entire transcoding; failing to do
73         so will cause the output to be moved onto the heap instead.
74 
75         Returns a slice of the output buffer, corresponding to the
76         converted characters. For optimum performance, the returned
77         buffer should be specified as 'output' on subsequent calls.
78         For example:
79 
80         ---
81         char[] output;
82 
83         char[] result = toString (input, output);
84 
85         // reset output after a realloc
86         if (result.length > output.length)
87             output = result;
88         ---
89 
90         Where 'ate' is provided, it will be set to the number of 
91         elements consumed from the input, and the output buffer 
92         will not be resized (or allocated). This represents a
93         streaming mode, where slices of the input are processed
94         in sequence rather than all at one time (should use 'ate'
95         as an index for slicing into unconsumed input).
96 
97 *******************************************************************************/
98 
char[] toString (const(wchar[]) input, char[] output=null, size_t* ate=null)
{
        // utf16 => utf8.
        //
        // input  : utf16 text to encode
        // output : optional destination; grown on the heap when too small,
        //          except in streaming mode where it is never resized
        // ate    : when non-null selects streaming mode and receives the
        //          number of wchars consumed from 'input'
        //
        // Returns the slice of 'output' holding the encoded text.

        if (ate)
            *ate = input.length;
        else
           {
           // potentially reallocate output
           auto estimate = input.length * 2 + 3;
           if (output.length < estimate)
               output.length = estimate;
           }

        char* pOut = output.ptr;
        char* pEnd = pOut + output.length;

        // the index is a size_t: an 'int' index would truncate on 64bit hosts
        for (size_t eaten = 0; eaten < input.length; ++eaten)
            {
            wchar b = input[eaten];

            // Fewer than three free bytes left? Measured as a distance rather
            // than against "end - 3", which wraps around for a null or very
            // short buffer and would let the first write go out of bounds.
            if (pEnd - pOut < 3)
               {
               // if streaming, just return the unused input
               if (ate)
                  {
                  *ate = eaten;
                  break;
                  }

               // reallocate the output buffer, always leaving room for the
               // three bytes the next element may need
               auto len = pOut - output.ptr;
               output.length = len + len / 2 + 3;
               pOut = output.ptr + len;
               pEnd = output.ptr + output.length;
               }

            if (b < 0x80)
                *pOut++ = cast(char) b;
            else
               if (b < 0x0800)
                  {
                  // two byte sequence: 110xxxxx 10xxxxxx
                  pOut[0] = cast(char)(0xc0 | ((b >> 6) & 0x1f));
                  pOut[1] = cast(char)(0x80 | (b & 0x3f));
                  pOut += 2;
                  }
               else
                  if (b < 0xd800 || b > 0xdfff)
                     {
                     // three byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
                     pOut[0] = cast(char)(0xe0 | ((b >> 12) & 0x0f));
                     pOut[1] = cast(char)(0x80 | ((b >> 6)  & 0x3f));
                     pOut[2] = cast(char)(0x80 | (b & 0x3f));
                     pOut += 3;
                     }
                  else
                     if (ate is null)
                         // surrogate pairs, non-streaming: redo the whole
                         // conversion by way of utf32
                         return toString (toString32 (input, null, null), output);
                     else
                        {
                        // Surrogate pairs, streaming: convert just this one
                        // code point via utf32 so that everything converted
                        // so far is kept and '*ate' keeps advancing. (Passing
                        // a null buffer to toString32 in streaming mode
                        // consumes nothing at all.)

                        // leading surrogate without its partner yet: leave it
                        // for the next call
                        if (b < 0xdc00 && input.length - eaten < 2)
                           {
                           *ate = eaten;
                           break;
                           }

                        dchar[1] wide;
                        size_t   used,
                                 done;

                        auto point = toString32 (input[eaten .. $], wide, &used);
                        if (point.length == 0)
                           {
                           *ate = eaten;
                           break;
                           }

                        // the utf32 => utf8 encoder refuses to write unless
                        // there is room; 'done' stays 0 in that case
                        auto utf8 = toString (point, pOut[0 .. pEnd - pOut], &done);
                        if (done == 0)
                           {
                           *ate = eaten;
                           break;
                           }

                        pOut += utf8.length;

                        // skip the additional wchar(s) of the pair
                        eaten += used - 1;
                        }
            }

        // return the produced output
        return output [0..(pOut - output.ptr)];
}
158 
159 /*******************************************************************************
160 
161         Decode Utf8 produced by the above toString() method.
162 
163         If the output is provided off the stack, it should be large
164         enough to encompass the entire transcoding; failing to do
165         so will cause the output to be moved onto the heap instead.
166 
167         Returns a slice of the output buffer, corresponding to the
168         converted characters. For optimum performance, the returned
169         buffer should be specified as 'output' on subsequent calls.
170 
171         Where 'ate' is provided, it will be set to the number of 
172         elements consumed from the input, and the output buffer 
173         will not be resized (or allocated). This represents a
174         streaming mode, where slices of the input are processed
175         in sequence rather than all at one time (should use 'ate'
176         as an index for slicing into unconsumed input).
177 
178 *******************************************************************************/
179 
wchar[] toString16 (const(char[]) input, wchar[] output=null, size_t* ate=null)
{
        // utf8 => utf16.
        //
        // input  : utf8 text to decode
        // output : optional destination; grown to input.length when too
        //          small, except in streaming mode where it is never resized
        // ate    : when non-null selects streaming mode and receives the
        //          number of chars consumed from 'input'
        //
        // Returns the slice of 'output' holding the decoded text. Incomplete
        // trailing input is reported via onUnicodeError() (with the offset of
        // the truncated sequence) unless streaming, where it is left unconsumed.

        size_t         produced;
        const(char)*   pIn = input.ptr;
        const(char)*   pMax = pIn + input.length;

        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        while (pIn < pMax && produced < output.length)
        {
                wchar  b = cast(wchar) *pIn;

                // number of input bytes making up this sequence
                size_t need = 1;
                if (b & 0x80)
                    need = (b < 0xe0) ? 2 : (b < 0xf0) ? 3 : 4;

                // four byte sequences become surrogate pairs; when not
                // streaming, redo the whole conversion by way of utf32
                if (need == 4 && ate is null)
                    return toString16 (toString32 (input, null, null), output);

                // never read beyond the end of the input
                if (cast(size_t)(pMax - pIn) < need)
                {
                    if (ate is null)
                       {
                       onUnicodeError ("Unicode.toString16 : incomplete utf8 input", pIn - input.ptr);
                       return output [0..produced];
                       }

                    // streaming: leave the partial sequence for the next call
                    break;
                }

                if (need == 4)
                {
                    // Streaming surrogate pair: convert only this code point
                    // via utf32, keeping what has been produced so far.
                    // (Passing a null buffer to toString32 in streaming mode
                    // consumes nothing at all.)
                    dchar[1] wide;
                    size_t   used,
                             done;

                    auto point = toString32 (pIn[0 .. need], wide, &used);
                    if (point.length == 0)
                        break;

                    // 'done' stays 0 when the remaining output is too small
                    auto pair = toString16 (point, output[produced .. $], &done);
                    if (done == 0)
                        break;

                    produced += pair.length;
                    pIn += used;
                    continue;
                }

                if (need == 2)
                    b = cast(wchar)(((b & 0x1f) << 6) | (pIn[1] & 0x3f));
                else
                   if (need == 3)
                       b = cast(wchar)(((b & 0x0f) << 12) | ((pIn[1] & 0x3f) << 6) | (pIn[2] & 0x3f));

                output[produced++] = b;
                pIn += need;
        }

        // do we still have some input left?
        if (ate)
            *ate = pIn - input.ptr;
        else
           if (pIn < pMax)
               // this should never happen!
               onUnicodeError ("Unicode.toString16 : utf8 overflow", pIn - input.ptr);

        // return the produced output
        return output [0..produced];
}
251 
252 
253 /*******************************************************************************
254 
255         Encode Utf8 up to a maximum of 4 bytes long (five & six
256         byte variations are not supported). Throws an exception
257         where the input dchar is greater than 0x10ffff.
258 
259         If the output is provided off the stack, it should be large
260         enough to encompass the entire transcoding; failing to do
261         so will cause the output to be moved onto the heap instead.
262 
263         Returns a slice of the output buffer, corresponding to the
264         converted characters. For optimum performance, the returned
265         buffer should be specified as 'output' on subsequent calls.
266 
267         Where 'ate' is provided, it will be set to the number of 
268         elements consumed from the input, and the output buffer 
269         will not be resized (or allocated). This represents a
270         streaming mode, where slices of the input are processed
271         in sequence rather than all at one time (should use 'ate'
272         as an index for slicing into unconsumed input).
273 
274 *******************************************************************************/
275 
char[] toString (const(dchar[]) input, char[] output=null, size_t* ate=null)
{
        // utf32 => utf8.
        //
        // input  : utf32 text to encode
        // output : optional destination; grown on the heap when too small,
        //          except in streaming mode where it is never resized
        // ate    : when non-null selects streaming mode and receives the
        //          number of dchars consumed from 'input'
        //
        // Returns the slice of 'output' holding the encoded text. Values of
        // 0x110000 and above are reported via onUnicodeError().

        if (ate)
            *ate = input.length;
        else
           {
           // potentially reallocate output
           auto estimate = input.length * 2 + 4;
           if (output.length < estimate)
               output.length = estimate;
           }

        char* pOut = output.ptr;
        char* pEnd = pOut + output.length;

        // the index is a size_t: an 'int' index would truncate on 64bit hosts
        foreach (size_t eaten, dchar b; input)
                {
                // Fewer than four free bytes left? Measured as a distance
                // rather than against "end - 4", which wraps around for a
                // null or very short buffer.
                if (pEnd - pOut < 4)
                   {
                   // if streaming, just return the unused input
                   if (ate)
                      {
                      *ate = eaten;
                      break;
                      }

                   // Reallocate the output buffer. Growing by half alone is
                   // not enough: with 7 bytes used the old formula produced a
                   // 10 byte buffer, and the four byte write below then ran
                   // past its end. Always leave room for one full sequence.
                   auto len = pOut - output.ptr;
                   output.length = len + len / 2 + 4;
                   pOut = output.ptr + len;
                   pEnd = output.ptr + output.length;
                   }

                if (b < 0x80)
                    *pOut++ = cast(char) b;
                else
                   if (b < 0x0800)
                      {
                      // two byte sequence: 110xxxxx 10xxxxxx
                      pOut[0] = cast(char)(0xc0 | ((b >> 6) & 0x1f));
                      pOut[1] = cast(char)(0x80 | (b & 0x3f));
                      pOut += 2;
                      }
                   else
                      if (b < 0x10000)
                         {
                         // three byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
                         pOut[0] = cast(char)(0xe0 | ((b >> 12) & 0x0f));
                         pOut[1] = cast(char)(0x80 | ((b >> 6)  & 0x3f));
                         pOut[2] = cast(char)(0x80 | (b & 0x3f));
                         pOut += 3;
                         }
                      else
                         if (b < 0x110000)
                            {
                            // four byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                            pOut[0] = cast(char)(0xf0 | ((b >> 18) & 0x07));
                            pOut[1] = cast(char)(0x80 | ((b >> 12) & 0x3f));
                            pOut[2] = cast(char)(0x80 | ((b >> 6)  & 0x3f));
                            pOut[3] = cast(char)(0x80 | (b & 0x3f));
                            pOut += 4;
                            }
                         else
                            onUnicodeError ("Unicode.toString : invalid dchar", eaten);
                }

        // return the produced output
        return output [0..(pOut - output.ptr)];
}
343 
344 
345 /*******************************************************************************
346 
347         Decode Utf8 produced by the above toString() method.
348 
349         If the output is provided off the stack, it should be large
350         enough to encompass the entire transcoding; failing to do
351         so will cause the output to be moved onto the heap instead.
352 
353         Returns a slice of the output buffer, corresponding to the
354         converted characters. For optimum performance, the returned
355         buffer should be specified as 'output' on subsequent calls.
356 
357         Where 'ate' is provided, it will be set to the number of 
358         elements consumed from the input, and the output buffer 
359         will not be resized (or allocated). This represents a
360         streaming mode, where slices of the input are processed
361         in sequence rather than all at one time (should use 'ate'
362         as an index for slicing into unconsumed input).
363 
364 *******************************************************************************/
365 
dchar[] toString32 (const(char[]) input, dchar[] output=null, size_t* ate=null)
{
        // utf8 => utf32.
        //
        // input  : utf8 text to decode
        // output : optional destination; grown to input.length when too
        //          small, except in streaming mode where it is never resized
        // ate    : when non-null selects streaming mode and receives the
        //          number of chars consumed from 'input'
        //
        // Returns the slice of 'output' holding the decoded text. Incomplete
        // trailing input is reported via onUnicodeError() (with the offset of
        // the truncated sequence) unless streaming, where it is left unconsumed.

        size_t         produced;
        const(char)*   pIn = input.ptr;
        const(char)*   pMax = pIn + input.length;

        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        while (pIn < pMax && produced < output.length)
        {
                dchar  b = cast(dchar) *pIn;

                // number of input bytes making up this sequence
                size_t need = 1;
                if (b & 0x80)
                    need = (b < 0xe0) ? 2 : (b < 0xf0) ? 3 : 4;

                // never read beyond the end of the input
                if (cast(size_t)(pMax - pIn) < need)
                {
                    if (ate is null)
                       {
                       onUnicodeError ("Unicode.toString32 : incomplete utf8 input", pIn - input.ptr);
                       return output [0..produced];
                       }

                    // streaming: leave the partial sequence for the next call
                    break;
                }

                if (need == 2)
                   {
                   b &= 0x1f;
                   b = (b << 6) | (pIn[1] & 0x3f);
                   }
                else
                   if (need == 3)
                      {
                      b &= 0x0f;
                      b = (b << 6) | (pIn[1] & 0x3f);
                      b = (b << 6) | (pIn[2] & 0x3f);
                      }
                   else
                      if (need == 4)
                         {
                         b &= 0x07;
                         b = (b << 6) | (pIn[1] & 0x3f);
                         b = (b << 6) | (pIn[2] & 0x3f);
                         b = (b << 6) | (pIn[3] & 0x3f);

                         if (b >= 0x110000)
                             onUnicodeError ("Unicode.toString32 : invalid utf8 input", pIn - input.ptr);
                         }

                output[produced++] = b;
                pIn += need;
        }

        // do we still have some input left?
        if (ate)
            *ate = pIn - input.ptr;
        else
           if (pIn < pMax)
               // this should never happen!
               onUnicodeError ("Unicode.toString32 : utf8 overflow", pIn - input.ptr);

        // return the produced output
        return output [0..produced];
}
445 
446 /*******************************************************************************
447 
        Encode Utf16 up to a maximum of 2 wchars (one surrogate pair) per
        dchar. Throws an exception where the input dchar is greater than 0x10ffff.
450 
451         If the output is provided off the stack, it should be large
452         enough to encompass the entire transcoding; failing to do
453         so will cause the output to be moved onto the heap instead.
454 
455         Returns a slice of the output buffer, corresponding to the
456         converted characters. For optimum performance, the returned
457         buffer should be specified as 'output' on subsequent calls.
458 
459         Where 'ate' is provided, it will be set to the number of 
460         elements consumed from the input, and the output buffer 
461         will not be resized (or allocated). This represents a
462         streaming mode, where slices of the input are processed
463         in sequence rather than all at one time (should use 'ate'
464         as an index for slicing into unconsumed input).
465 
466 *******************************************************************************/
467 
wchar[] toString16 (const(dchar[]) input, wchar[] output=null, size_t* ate=null)
{
        // utf32 => utf16.
        //
        // input  : utf32 text to encode
        // output : optional destination; grown on the heap when too small,
        //          except in streaming mode where it is never resized
        // ate    : when non-null selects streaming mode and receives the
        //          number of dchars consumed from 'input'
        //
        // Returns the slice of 'output' holding the encoded text. Values of
        // 0x110000 and above are reported via onUnicodeError().

        if (ate)
            *ate = input.length;
        else
           {
           // worst case is one surrogate pair per input element
           size_t estimate = input.length * 2 + 2;
           if (output.length < estimate)
               output.length = estimate;
           }

        wchar* pOut = output.ptr;
        wchar* pEnd = pOut + output.length;

        // the index is a size_t: an 'int' index would truncate on 64bit hosts
        foreach (size_t eaten, dchar b; input)
                {
                // Fewer than two free elements left? Measured as a distance
                // rather than against "end - 2", which wraps around for a
                // null or very short buffer.
                if (pEnd - pOut < 2)
                   {
                   // if streaming, just return the unused input
                   if (ate)
                      {
                      *ate = eaten;
                      break;
                      }

                   // reallocate the output buffer, always leaving room for
                   // one surrogate pair
                   size_t len = pOut - output.ptr;
                   output.length = len + len / 2 + 2;
                   pOut = output.ptr + len;
                   pEnd = output.ptr + output.length;
                   }

                if (b < 0x10000)
                    *pOut++ = cast(wchar) b;
                else
                   if (b < 0x110000)
                      {
                      // split into leading (0xd800..) and trailing (0xdc00..) surrogates
                      pOut[0] = cast(wchar)(0xd800 | (((b - 0x10000) >> 10) & 0x3ff));
                      pOut[1] = cast(wchar)(0xdc00 | ((b - 0x10000) & 0x3ff));
                      pOut += 2;
                      }
                   else
                      onUnicodeError ("Unicode.toString16 : invalid dchar", eaten);
                }

        // return the produced output
        return output [0..(pOut - output.ptr)];
}
517 
518 /*******************************************************************************
519 
520         Decode Utf16 produced by the above toString16() method.
521 
522         If the output is provided off the stack, it should be large
523         enough to encompass the entire transcoding; failing to do
524         so will cause the output to be moved onto the heap instead.
525 
526         Returns a slice of the output buffer, corresponding to the
527         converted characters. For optimum performance, the returned
528         buffer should be specified as 'output' on subsequent calls.
529 
530         Where 'ate' is provided, it will be set to the number of 
531         elements consumed from the input, and the output buffer 
532         will not be resized (or allocated). This represents a
533         streaming mode, where slices of the input are processed
534         in sequence rather than all at one time (should use 'ate'
535         as an index for slicing into unconsumed input).
536 
537 *******************************************************************************/
538 
dchar[] toString32 (const(wchar[]) input, dchar[] output=null, size_t* ate=null)
{
        // utf16 => utf32.
        //
        // input  : utf16 text to decode
        // output : optional destination; grown to input.length when too
        //          small, except in streaming mode where it is never resized
        // ate    : when non-null selects streaming mode and receives the
        //          number of wchars consumed from 'input'
        //
        // Returns the slice of 'output' holding the decoded text. Unpaired
        // surrogates are reported via onUnicodeError() as invalid input; a
        // leading surrogate at the very end of the input is reported as
        // incomplete, or left unconsumed when streaming. Error offsets refer
        // to the first wchar of the offending sequence.

        size_t         produced;
        bool           failed;
        const(wchar)*  pIn = input.ptr;
        const(wchar)*  pMax = pIn + input.length;

        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        while (pIn < pMax && produced < output.length)
        {
                dchar  b = cast(dchar) *pIn;
                size_t need = 1;

                if (b >= 0xd800 && b <= 0xdfff)
                {
                    need = 2;

                    // a trailing surrogate must not appear on its own
                    if (b >= 0xdc00)
                       {
                       onUnicodeError ("Unicode.toString32 : invalid utf16 input", pIn - input.ptr);
                       failed = true;
                       break;
                       }

                    // never read beyond the end of the input
                    if (pMax - pIn < 2)
                       {
                       if (ate is null)
                          {
                          onUnicodeError ("Unicode.toString32 : incomplete utf16 input", pIn - input.ptr);
                          failed = true;
                          }
                       // streaming: leave the leading surrogate for the next call
                       break;
                       }

                    // the partner has to be a trailing surrogate
                    dchar lo = pIn[1];
                    if (lo < 0xdc00 || lo > 0xdfff)
                       {
                       onUnicodeError ("Unicode.toString32 : invalid utf16 input", pIn - input.ptr);
                       failed = true;
                       break;
                       }

                    // simple conversion ~ see http://www.unicode.org/faq/utf_bom.html#35
                    b = ((b - 0xd7c0) << 10) + (lo - 0xdc00);
                }

                output[produced++] = b;
                pIn += need;
        }

        // do we still have some input left?
        if (ate)
            *ate = pIn - input.ptr;
        else
           if (pIn < pMax && !failed)
               // this should never happen!
               onUnicodeError ("Unicode.toString32 : utf16 overflow", pIn - input.ptr);

        // return the produced output
        return output [0..produced];
}
595 
596 
597 /*******************************************************************************
598 
599         Decodes a single dchar from the given src text, and indicates how
600         many chars were consumed from src to do so.
601 
602 *******************************************************************************/
603 
dchar decode (const(char[]) src, ref size_t ate)
{
        // Decode one code point in streaming mode, using a one element
        // buffer on the stack; 'ate' receives the number of chars consumed.
        dchar[1] ret;
        auto decoded = toString32 (src, ret, &ate);

        // Nothing is decoded when 'src' is empty or holds only a partial
        // sequence. Indexing the empty result would be out of bounds, so
        // return dchar.init instead; the caller can tell by 'ate' being 0.
        return decoded.length ? decoded[0] : dchar.init;
}
609 
610 /*******************************************************************************
611 
612         Decodes a single dchar from the given src text, and indicates how
613         many wchars were consumed from src to do so.
614 
615 *******************************************************************************/
616 
dchar decode (const(wchar[]) src, ref size_t ate)
{
        // Decode one code point in streaming mode, using a one element
        // buffer on the stack; 'ate' receives the number of wchars consumed.
        dchar[1] ret;
        auto decoded = toString32 (src, ret, &ate);

        // Nothing is decoded when 'src' is empty or holds only a partial
        // sequence. Indexing the empty result would be out of bounds, so
        // return dchar.init instead; the caller can tell by 'ate' being 0.
        return decoded.length ? decoded[0] : dchar.init;
}
622 
623 /*******************************************************************************
624 
625         Encode a dchar into the provided dst array, and return a slice of 
626         it representing the encoding
627 
628 *******************************************************************************/
629 
char[] encode (char[] dst, dchar c)
{
        // view the single argument as a one element utf32 array and run it
        // through the regular (non-streaming) utf32 => utf8 transcoder
        dchar[] single = (&c)[0 .. 1];
        return toString (single, dst);
}
634 
635 /*******************************************************************************
636 
637         Encode a dchar into the provided dst array, and return a slice of 
638         it representing the encoding
639 
640 *******************************************************************************/
641 
wchar[] encode (wchar[] dst, dchar c)
{
        // view the single argument as a one element utf32 array and run it
        // through the regular (non-streaming) utf32 => utf16 transcoder
        dchar[] single = (&c)[0 .. 1];
        return toString16 (single, dst);
}
646 
647 /*******************************************************************************
648 
649         Is the given character valid?
650 
651 *******************************************************************************/
652 
bool isValid (dchar c)
{
        // the surrogate range is rejected
        if (c >= 0xD800 && c <= 0xDFFF)
            return false;

        // everything else is fine up to and including 0x10FFFF
        return c <= 0x10FFFF;
}
657 
658 /*******************************************************************************
659 
660         Convert from a char[] into the type of the dst provided. 
661 
662         Returns a slice of the given dst, where it is sufficiently large
663         to house the result, or a heap-allocated array otherwise. Returns
664         the original input where no conversion is required.
665 
666 *******************************************************************************/
667 
// char => char: no conversion required, the input itself is returned and
// 'dst' is left alone
inout(T[]) fromString8(T) (inout(char[]) s, T[] dst) if (is (T == char))
{
        return s;
}

// char => wchar or dchar: dispatch on the element type of 'dst'
T[] fromString8(T) (const(char[]) s, T[] dst) if (!is (T == char))
{
        static if (is (T == wchar))
        {
                return .toString16 (s, dst);
        }
        else static if (is (T == dchar))
        {
                return .toString32 (s, dst);
        }
}
681 
682 /*******************************************************************************
683 
684         Convert from a wchar[] into the type of the dst provided. 
685 
686         Returns a slice of the given dst, where it is sufficiently large
687         to house the result, or a heap-allocated array otherwise. Returns
688         the original input where no conversion is required.
689 
690 *******************************************************************************/
691 
// wchar => wchar: no conversion required, the input itself is returned and
// 'dst' is left alone
inout(T[]) fromString16(T) (inout(wchar[]) s, T[] dst) if (is (T == wchar))
{
        return s;
}

// wchar => char or dchar: dispatch on the element type of 'dst'
T[] fromString16(T) (const(wchar[]) s, T[] dst) if (!is (T == wchar))
{
        static if (is (T == char))
        {
                return .toString (s, dst);
        }
        else static if (is (T == dchar))
        {
                return .toString32 (s, dst);
        }
}
705 
706 /*******************************************************************************
707 
708         Convert from a dchar[] into the type of the dst provided. 
709 
710         Returns a slice of the given dst, where it is sufficiently large
711         to house the result, or a heap-allocated array otherwise. Returns
712         the original input where no conversion is required.
713 
714 *******************************************************************************/
715 
// dchar => dchar: no conversion required, the input itself is returned and
// 'dst' is left alone
inout(T[]) fromString32(T) (inout(dchar[]) s, T[] dst) if (is (T == dchar))
{
        return s;
}

// dchar => char or wchar: dispatch on the element type of 'dst'
T[] fromString32(T) (const(dchar[]) s, T[] dst) if (!is (T == dchar))
{
        static if (is (T == char))
        {
                return .toString (s, dst);
        }
        else static if (is (T == wchar))
        {
                return .toString16 (s, dst);
        }
}
729 
730 /*******************************************************************************
731 
732         Adjust the content such that no partial encodings exist on the 
733         left side of the provided text.
734 
735         Returns a slice of the input
736 
737 *******************************************************************************/
738 
T[] cropLeft(T) (T[] s)
{
        // Element types are compared with qualifiers stripped, so that
        // const and immutable text (e.g. string, wstring) is cropped too.
        static if (is (immutable(T) == immutable(char)))
        {
                   // Leading continuation bytes (10xxxxxx) are the tail of
                   // a sequence whose first byte is missing: drop them all,
                   // whatever kind of byte follows.
                   size_t i = 0;
                   while (i < s.length && (s[i] & 0xc0) == 0x80)
                          ++i;
                   return s [i..$];
        }
        else static if (is (immutable(T) == immutable(wchar)))
        {
                   // skip if first char is a trailing surrogate (an empty
                   // slice has nothing to inspect)
                   if (s.length && (s[0] & 0xfc00) == 0xdc00)
                       return s [1..$];
                   return s;
        }
        else
                   // other element types cannot hold partial encodings
                   return s;
}
753 
754 /*******************************************************************************
755 
756         Adjust the content such that no partial encodings exist on the 
757         right side of the provided text.
758 
759         Returns a slice of the input
760 
761 *******************************************************************************/
762 
T[] cropRight(T) (T[] s)
{
        if (s.length)
           {
           // Element types are compared with qualifiers stripped, so that
           // const and immutable text (e.g. string, wstring) is cropped too.
           static if (is (immutable(T) == immutable(char)))
           {
                      // walk backwards over the trailing non-ascii bytes,
                      // index 0 included, looking for a lead byte
                      size_t i = s.length - 1;
                      while (s[i] & 0x80)
                      {
                             if ((s[i] & 0xc0) == 0xc0)
                                {
                                // located the first byte of a sequence
                                ubyte  b = s[i];
                                size_t have = s.length - i;

                                // Expected length: 110xxxxx => 2, 1110xxxx => 3,
                                // 11110xxx => 4. The 0x10 bit only counts once
                                // the 0x20 bit is set; testing it on its own
                                // mistakes leads 0xd0..0xdf for longer sequences.
                                size_t need = 2;
                                if (b & 0x20)
                                   {
                                   ++need;
                                   if (b & 0x10)
                                       ++need;
                                   }

                                // keep everything if the sequence is complete,
                                // otherwise cut it off at its lead byte
                                if (have == need)
                                    return s;
                                return s [0..i];
                                }

                             if (i == 0)
                                 break;
                             --i;
                      }
           }
           else static if (is (immutable(T) == immutable(wchar)))
           {
                      // skip if last char is a leading surrogate
                      if ((s[$-1] & 0xfc00) == 0xd800)
                           return s [0..$-1];
           }
           }
        return s;
}
801 
802 
803 
804 /*******************************************************************************
805 
806 *******************************************************************************/
807 
debug (Utf)
{
        import tango.io.Console;

        // Manual check: print a sample string, then the result of cropping
        // it after removing 0..5 elements from its left and from its right.
        void main()
        {
                auto s = "[\xc2\xa2\xc2\xa2\xc2\xa2]";
                Cout (s).newline;

                // progressively drop elements from the front
                foreach (skip; 0 .. 6)
                         Cout (cropLeft(s[skip..$])).newline;

                // progressively drop elements from the back
                foreach (drop; 0 .. 6)
                         Cout (cropRight(s[0..$-drop])).newline;
        }
}