1 /*******************************************************************************
2 3 copyright: Copyright (c) 2004 Kris Bell. All rights reserved
4 5 license: BSD style: $(LICENSE)
6 7 version: Initial release: Oct 2004
8 9 authors: Kris
10 11 Fast Unicode transcoders. These are particularly sensitive to
12 minor changes on 32bit x86 devices, because the register set of
13 those devices is so small. Beware of subtle changes which might
14 extend the execution-period by as much as 200%. Because of this,
15 three of the six transcoders might read past the end of input by
16 one, two, or three bytes before arresting themselves. Note that
17 support for streaming adds a 15% overhead to the dchar => char
18 conversion, but has little effect on the others.
19 20 These routines were tuned on an Intel P4; other devices may work
21 more efficiently with a slightly different approach, though this
22 is likely to be reasonably optimal on AMD x86 CPUs also. These
23 algorithms would benefit significantly from those extra AMD64
24 registers. On a 3GHz P4, the dchar/char conversions take around
25 2500ns to process an array of 1000 ASCII elements. Invoking the
26 memory manager doubles that period, and quadruples the time for
27 arrays of 100 elements. Memory allocation can slow down notably
28 in a multi-threaded environment, so avoid that where possible.
29 30 Surrogate-pairs are dealt with in a non-optimal fashion when
31 transcoding between utf16 and utf8. Such cases are considered
32 to be boundary-conditions for this module.
33 34 There are three common cases where the input may be incomplete,
35 including each 'widening' case of utf8 => utf16, utf8 => utf32,
36 and utf16 => utf32. An edge-case is utf16 => utf8, if surrogate
37 pairs are present. Such cases will throw an exception, unless
38 streaming-mode is enabled ~ in the latter mode, an additional
39 integer is returned indicating how many elements of the input
40 have been consumed. In all cases, a correct slice of the output
41 is returned.
42 43 For details on Unicode processing see:
44 $(UL $(LINK http://www.utf-8.com/))
45 $(UL $(LINK http://www.hackcraft.net/xmlUnicode/))
46 $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html/))
47 $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/))
48 49 *******************************************************************************/50 51 moduletango.text.convert.Utf;

// Error hook called by the transcoders in this module when they meet
// malformed or incomplete input. 'msg' describes the problem and 'idx' is
// the element offset supplied by the caller. Only the declaration is
// visible here; presumably the implementation throws ~ confirm against the
// runtime that defines it.
public extern (C) void onUnicodeError (const(char[]) msg, size_t idx = 0);

/*******************************************************************************

        Identity conversions: source and destination share one encoding,
        so the input is handed straight back untouched. The 'dst' and
        'ate' parameters exist purely for signature symmetry with the
        real transcoders; neither is read nor written.

*******************************************************************************/

inout(char[]) toString (inout(char[]) src, char[] dst = null, size_t* ate=null)
{
        return src;
}

inout(wchar[]) toString16 (inout(wchar[]) src, wchar[] dst = null, size_t* ate=null)
{
        return src;
}

inout(dchar[]) toString32 (inout(dchar[]) src, dchar[] dst = null, size_t* ate=null)
{
        return src;
}

/*******************************************************************************

        Encode Utf8 up to a maximum of 4 bytes long (five & six byte
        variations are not supported).

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.
        For example:

        ---
        char[] output;

        char[] result = toString (input, output);

        // reset output after a realloc
        if (result.length > output.length)
            output = result;
        ---

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input, and the output buffer
        will not be resized (or allocated). This represents a
        streaming mode, where slices of the input are processed
        in sequence rather than all at one time (should use 'ate'
        as an index for slicing into unconsumed input).

        Surrogate pairs are not encoded inline: on meeting one, the
        whole input is re-transcoded by way of toString32().
        NOTE(review): when 'ate' is set that call receives a null
        buffer it is not allowed to grow, so the result looks to be
        an empty slice with *ate == 0 ~ confirm before streaming
        text which contains surrogate pairs.

*******************************************************************************/

char[] toString (const(wchar[]) input, char[] output=null, size_t* ate=null)
{
        if (ate)
            *ate = input.length;
        else
           {
           // potentially reallocate output
           auto estimate = input.length * 2 + 3;
           if (output.length < estimate)
               output.length = estimate;
           }

        // pMax leaves room for the longest sequence written inline (3 bytes)
        char* pOut = output.ptr;
        char* pMax = pOut + output.length - 3;

        foreach (eaten, wchar b; input)
                {
                // about to overflow the output?
                if (pOut > pMax)
                   {
                   // if streaming, just return the unused input
                   if (ate)
                      {
                      *ate = eaten;
                      break;
                      }

                   // reallocate the output buffer
                   auto len = pOut - output.ptr;
                   output.length = len + len / 2;
                   pOut = output.ptr + len;
                   pMax = output.ptr + output.length - 3;
                   }

                if (b < 0x80)
                    *pOut++ = cast(char) b;
                else
                   if (b < 0x0800)
                      {
                      // 110xxxxx 10xxxxxx
                      pOut[0] = cast(char)(0xc0 | ((b >> 6) & 0x1f));
                      pOut[1] = cast(char)(0x80 | (b & 0x3f));
                      pOut += 2;
                      }
                   else
                      if (b < 0xd800 || b > 0xdfff)
                         {
                         // 1110xxxx 10xxxxxx 10xxxxxx
                         pOut[0] = cast(char)(0xe0 | ((b >> 12) & 0x0f));
                         pOut[1] = cast(char)(0x80 | ((b >> 6)  & 0x3f));
                         pOut[2] = cast(char)(0x80 | (b & 0x3f));
                         pOut += 3;
                         }
                      else
                         // deal with surrogate-pairs
                         return toString (toString32(input, null, ate), output);
                }

        // return the produced output
        return output [0..(pOut - output.ptr)];
}

/*******************************************************************************

        Decode Utf8 produced by the above toString() method.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input, and the output buffer
        will not be resized (or allocated). This represents a
        streaming mode, where slices of the input are processed
        in sequence rather than all at one time (should use 'ate'
        as an index for slicing into unconsumed input).

        Multi-byte sequences are read before the end-of-input test,
        so a truncated trailing sequence causes a read of up to two
        bytes beyond the input before it is detected. Four-byte
        sequences are delegated to toString32().

*******************************************************************************/

wchar[] toString16 (const(char[]) input, wchar[] output=null, size_t* ate=null)
{
        // counts output elements, hence size_t rather than int
        size_t       produced;
        const(char)* pIn = input.ptr;
        const(char)* pMax = pIn + input.length;
        const(char)* pValid;

        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        if (input.length)
        foreach (ref wchar d; output)
                {
                // remember where this sequence began, for streaming rollback
                pValid = pIn;
                wchar b = cast(wchar) *pIn;

                if (b & 0x80)
                   {
                   if (b < 0xe0)
                      {
                      b &= 0x1f;
                      b = cast(wchar)((b << 6) | (*++pIn & 0x3f));
                      }
                   else
                      {
                      if (b < 0xf0)
                         {
                         b &= 0x0f;
                         b = cast(wchar)((b << 6) | (pIn[1] & 0x3f));
                         b = cast(wchar)((b << 6) | (pIn[2] & 0x3f));
                         pIn += 2;
                         }
                      else
                         // deal with surrogate-pairs
                         return toString16 (toString32(input, null, ate), output);
                      }
                   }
                d = b;
                ++produced;

                // did we read past the end of the input?
                if (++pIn >= pMax)
                   {
                   if (pIn > pMax)
                      {
                      // yep ~ return tail or throw error?
                      if (ate)
                         {
                         pIn = pValid;
                         --produced;
                         break;
                         }
                      onUnicodeError ("Unicode.toString16 : incomplete utf8 input", pIn - input.ptr);
                      }
                   else
                      break;
                   }
                }

        // do we still have some input left?
        if (ate)
            *ate = pIn - input.ptr;
        else
           if (pIn < pMax)
               // this should never happen!
               onUnicodeError ("Unicode.toString16 : utf8 overflow", pIn - input.ptr);

        // return the produced output
        return output [0..produced];
}


/*******************************************************************************

        Encode Utf8 up to a maximum of 4 bytes long (five & six
        byte variations are not supported). Reports an error via
        onUnicodeError() where the input dchar is greater than
        0x10ffff.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input, and the output buffer
        will not be resized (or allocated). This represents a
        streaming mode, where slices of the input are processed
        in sequence rather than all at one time (should use 'ate'
        as an index for slicing into unconsumed input).

*******************************************************************************/

char[] toString (const(dchar[]) input, char[] output=null, size_t* ate=null)
{
        if (ate)
            *ate = input.length;
        else
           {
           // potentially reallocate output
           auto estimate = input.length * 2 + 4;
           if (output.length < estimate)
               output.length = estimate;
           }

        // pMax leaves room for the longest sequence (4 bytes)
        char* pOut = output.ptr;
        char* pMax = pOut + output.length - 4;

        foreach (eaten, dchar b; input)
                {
                // about to overflow the output?
                if (pOut > pMax)
                   {
                   // if streaming, just return the unused input
                   if (ate)
                      {
                      *ate = eaten;
                      break;
                      }

                   // reallocate the output buffer
                   auto len = pOut - output.ptr;
                   output.length = len + len / 2;
                   pOut = output.ptr + len;
                   pMax = output.ptr + output.length - 4;
                   }

                if (b < 0x80)
                    *pOut++ = cast(char) b;
                else
                   if (b < 0x0800)
                      {
                      // 110xxxxx 10xxxxxx
                      pOut[0] = cast(char)(0xc0 | ((b >> 6) & 0x1f));
                      pOut[1] = cast(char)(0x80 | (b & 0x3f));
                      pOut += 2;
                      }
                   else
                      if (b < 0x10000)
                         {
                         // 1110xxxx 10xxxxxx 10xxxxxx
                         pOut[0] = cast(char)(0xe0 | ((b >> 12) & 0x0f));
                         pOut[1] = cast(char)(0x80 | ((b >> 6)  & 0x3f));
                         pOut[2] = cast(char)(0x80 | (b & 0x3f));
                         pOut += 3;
                         }
                      else
                         if (b < 0x110000)
                            {
                            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                            pOut[0] = cast(char)(0xf0 | ((b >> 18) & 0x07));
                            pOut[1] = cast(char)(0x80 | ((b >> 12) & 0x3f));
                            pOut[2] = cast(char)(0x80 | ((b >> 6)  & 0x3f));
                            pOut[3] = cast(char)(0x80 | (b & 0x3f));
                            pOut += 4;
                            }
                         else
                            onUnicodeError ("Unicode.toString : invalid dchar", eaten);
                }

        // return the produced output
        return output [0..(pOut - output.ptr)];
}


/*******************************************************************************

        Decode Utf8 produced by the above toString() method.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input, and the output buffer
        will not be resized (or allocated). This represents a
        streaming mode, where slices of the input are processed
        in sequence rather than all at one time (should use 'ate'
        as an index for slicing into unconsumed input).

        Multi-byte sequences are read before the end-of-input test,
        so a truncated trailing sequence causes a read of up to
        three bytes beyond the input before it is detected.

*******************************************************************************/

dchar[] toString32 (const(char[]) input, dchar[] output=null, size_t* ate=null)
{
        // counts output elements, hence size_t rather than int
        size_t       produced;
        const(char)* pIn = input.ptr;
        const(char)* pMax = pIn + input.length;
        const(char)* pValid;

        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        if (input.length)
        foreach (ref dchar d; output)
                {
                // remember where this sequence began, for streaming rollback
                pValid = pIn;
                dchar b = cast(dchar) *pIn;

                if (b & 0x80)
                   {
                   if (b < 0xe0)
                      {
                      b &= 0x1f;
                      b = (b << 6) | (*++pIn & 0x3f);
                      }
                   else
                      {
                      if (b < 0xf0)
                         {
                         b &= 0x0f;
                         b = (b << 6) | (pIn[1] & 0x3f);
                         b = (b << 6) | (pIn[2] & 0x3f);
                         pIn += 2;
                         }
                      else
                         {
                         b &= 0x07;
                         b = (b << 6) | (pIn[1] & 0x3f);
                         b = (b << 6) | (pIn[2] & 0x3f);
                         b = (b << 6) | (pIn[3] & 0x3f);

                         if (b >= 0x110000)
                             onUnicodeError ("Unicode.toString32 : invalid utf8 input", pIn - input.ptr);
                         pIn += 3;
                         }
                      }
                   }
                d = b;
                ++produced;

                // did we read past the end of the input?
                if (++pIn >= pMax)
                   {
                   if (pIn > pMax)
                      {
                      // yep ~ return tail or throw error?
                      if (ate)
                         {
                         pIn = pValid;
                         --produced;
                         break;
                         }
                      onUnicodeError ("Unicode.toString32 : incomplete utf8 input", pIn - input.ptr);
                      }
                   else
                      break;
                   }
                }

        // do we still have some input left?
        if (ate)
            *ate = pIn - input.ptr;
        else
           if (pIn < pMax)
               // this should never happen!
               onUnicodeError ("Unicode.toString32 : utf8 overflow", pIn - input.ptr);

        // return the produced output
        return output [0..produced];
}

/*******************************************************************************

        Transcode Utf32 into Utf16, writing at most two wchars (one
        surrogate pair) per input dchar. An input dchar greater than
        0x10ffff is reported through onUnicodeError().

        When 'ate' is null the output is grown as required, so a
        buffer supplied off the stack which turns out to be too small
        is replaced by a heap allocation.

        When 'ate' is supplied the output is never resized: conversion
        stops once fewer than two free slots remain, and '*ate' holds
        the number of input elements consumed (use it as the index for
        slicing the unconsumed remainder).

        Returns the slice of the output buffer holding the converted
        text. For best performance pass the returned buffer back in
        as 'output' on subsequent calls.

*******************************************************************************/

wchar[] toString16 (const(dchar[]) input, wchar[] output=null, size_t* ate=null)
{
        if (ate is null)
           {
           // worst case is two wchars per dchar, plus the guard margin
           size_t needed = input.length * 2 + 2;
           if (needed > output.length)
               output.length = needed;
           }
        else
           *ate = input.length;

        wchar* dst   = output.ptr;
        wchar* limit = dst + output.length - 2;

        foreach (index, dchar c; input)
                {
                // fewer than two slots left?
                if (dst > limit)
                   {
                   // streaming: report what was consumed and stop
                   if (ate !is null)
                      {
                      *ate = index;
                      break;
                      }

                   // otherwise grow the buffer by half and re-anchor
                   size_t used = dst - output.ptr;
                   output.length = used + used / 2;
                   dst   = output.ptr + used;
                   limit = output.ptr + output.length - 2;
                   }

                if (c >= 0x110000)
                    onUnicodeError ("Unicode.toString16 : invalid dchar", index);
                else
                   if (c >= 0x10000)
                      {
                      // split into a high/low surrogate pair
                      auto v = c - 0x10000;
                      dst[0] = cast(wchar)(0xd800 | ((v >> 10) & 0x3ff));
                      dst[1] = cast(wchar)(0xdc00 | (v & 0x3ff));
                      dst += 2;
                      }
                   else
                      *dst++ = cast(wchar) c;
                }

        return output [0 .. (dst - output.ptr)];
}

/*******************************************************************************

        Decode Utf16 produced by the above toString16() method.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Where 'ate' is provided, it will be set to the number of
        elements consumed from the input, and the output buffer
        will not be resized (or allocated). This represents a
        streaming mode, where slices of the input are processed
        in sequence rather than all at one time (should use 'ate'
        as an index for slicing into unconsumed input).

        The second half of a surrogate pair is read before the
        end-of-input test, so a dangling leading surrogate causes a
        read of one element beyond the input before it is detected.

*******************************************************************************/

dchar[] toString32 (const(wchar[]) input, dchar[] output=null, size_t* ate=null)
{
        // counts output elements, hence size_t rather than int
        size_t        produced;
        const(wchar)* pIn = input.ptr;
        const(wchar)* pMax = pIn + input.length;
        const(wchar)* pValid;

        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        if (input.length)
        foreach (ref dchar d; output)
                {
                // remember where this element began, for streaming rollback
                pValid = pIn;
                dchar b = cast(dchar) *pIn;

                // simple conversion ~ see http://www.unicode.org/faq/utf_bom.html#35
                if (b >= 0xd800 && b <= 0xdfff)
                    b = ((b - 0xd7c0) << 10) + (*++pIn - 0xdc00);

                if (b >= 0x110000)
                    onUnicodeError ("Unicode.toString32 : invalid utf16 input", pIn - input.ptr);

                d = b;
                ++produced;

                // did we read past the end of the input?
                if (++pIn >= pMax)
                   {
                   if (pIn > pMax)
                      {
                      // yep ~ return tail or throw error?
                      if (ate)
                         {
                         pIn = pValid;
                         --produced;
                         break;
                         }
                      onUnicodeError ("Unicode.toString32 : incomplete utf16 input", pIn - input.ptr);
                      }
                   else
                      break;
                   }
                }

        // do we still have some input left?
        if (ate)
            *ate = pIn - input.ptr;
        else
           if (pIn < pMax)
               // this should never happen!
               onUnicodeError ("Unicode.toString32 : utf16 overflow", pIn - input.ptr);

        // return the produced output
        return output [0..produced];
}


/*******************************************************************************

        Decodes a single dchar from the given src text, and indicates how
        many chars were consumed from src to do so.

        Where src is empty, or holds only a truncated sequence, nothing
        can be decoded: 'ate' is left at zero and the problem is reported
        through onUnicodeError() instead of indexing an empty result.
        Should that hook return, dchar.init is handed back.

*******************************************************************************/

dchar decode (const(char[]) src, ref size_t ate)
{
        dchar[1] ret;

        // streaming mode: decode at most one element into the local buffer
        auto result = toString32 (src, ret, &ate);
        if (result.length is 0)
           {
           onUnicodeError ("Unicode.decode : incomplete utf8 input", ate);
           return dchar.init;
           }
        return result[0];
}

/*******************************************************************************

        Decodes a single dchar from the given src text, and indicates how
        many wchars were consumed from src to do so.

        Where src is empty, or holds only a dangling leading surrogate,
        nothing can be decoded: 'ate' is left at zero and the problem is
        reported through onUnicodeError() instead of indexing an empty
        result. Should that hook return, dchar.init is handed back.

*******************************************************************************/

dchar decode (const(wchar[]) src, ref size_t ate)
{
        dchar[1] ret;

        // streaming mode: decode at most one element into the local buffer
        auto result = toString32 (src, ret, &ate);
        if (result.length is 0)
           {
           onUnicodeError ("Unicode.decode : incomplete utf16 input", ate);
           return dchar.init;
           }
        return result[0];
}

/*******************************************************************************

        Encodes one dchar as Utf8 by presenting it to toString() as a
        single-element array, with dst as the output buffer. Returns
        the slice toString() hands back for that encoding.

*******************************************************************************/

char[] encode (char[] dst, dchar c)
{
        // view the by-value parameter as a one-element array
        dchar[] single = (&c)[0 .. 1];
        return toString (single, dst);
}

/*******************************************************************************

        Encodes one dchar as Utf16 by presenting it to toString16() as
        a single-element array, with dst as the output buffer. Returns
        the slice toString16() hands back for that encoding.

*******************************************************************************/

wchar[] encode (wchar[] dst, dchar c)
{
        // view the by-value parameter as a one-element array
        dchar[] single = (&c)[0 .. 1];
        return toString16 (single, dst);
}

/*******************************************************************************

        Reports whether c is a usable code point: no greater than
        0x10FFFF and outside the surrogate range 0xD800 .. 0xDFFF.

*******************************************************************************/

bool isValid (dchar c)
{
        // beyond the last code point?
        if (c > 0x10FFFF)
            return false;

        // anything left is fine unless it is a surrogate
        return c < 0xD800 || c > 0xDFFF;
}

/*******************************************************************************

        Convert char[] text into the element type of the dst provided.

        With T == char no conversion is needed and the input itself is
        returned. For wchar or dchar the work is forwarded to
        toString16() or toString32() respectively, with dst as the
        output buffer, and whatever slice they return is passed back.

*******************************************************************************/

inout(T[]) fromString8(T) (inout(char[]) s, T[] dst) if (is (T == char))
{
        // same encoding ~ nothing to do
        return s;
}

T[] fromString8(T) (const(char[]) s, T[] dst) if (!is (T == char))
{
        static if (is (T == dchar))
                   return .toString32 (s, dst);
        else
           static if (is (T == wchar))
                      return .toString16 (s, dst);
}

/*******************************************************************************

        Convert wchar[] text into the element type of the dst provided.

        With T == wchar no conversion is needed and the input itself
        is returned. For char or dchar the work is forwarded to
        toString() or toString32() respectively, with dst as the
        output buffer, and whatever slice they return is passed back.

*******************************************************************************/

inout(T[]) fromString16(T) (inout(wchar[]) s, T[] dst) if (is (T == wchar))
{
        // same encoding ~ nothing to do
        return s;
}

T[] fromString16(T) (const(wchar[]) s, T[] dst) if (!is (T == wchar))
{
        static if (is (T == dchar))
                   return .toString32 (s, dst);
        else
           static if (is (T == char))
                      return .toString (s, dst);
}

/*******************************************************************************

        Convert dchar[] text into the element type of the dst provided.

        With T == dchar no conversion is needed and the input itself
        is returned. For char or wchar the work is forwarded to
        toString() or toString16() respectively, with dst as the
        output buffer, and whatever slice they return is passed back.

*******************************************************************************/

inout(T[]) fromString32(T) (inout(dchar[]) s, T[] dst) if (is (T == dchar))
{
        // same encoding ~ nothing to do
        return s;
}

T[] fromString32(T) (const(dchar[]) s, T[] dst) if (!is (T == dchar))
{
        static if (is (T == wchar))
                   return .toString16 (s, dst);
        else
           static if (is (T == char))
                      return .toString (s, dst);
}

/*******************************************************************************

        Adjust the content such that no partial encodings exist on the
        left side of the provided text.

        For Utf8, leading continuation bytes (10xxxxxx) are dropped up
        to the first lead byte or ASCII byte. For Utf16, a leading
        trailing-surrogate is dropped. Other element types, and empty
        input, are returned unchanged. Element qualifiers are ignored,
        so const and immutable text (e.g. string literals) is cropped
        in the same way as mutable text.

        Returns a slice of the input

*******************************************************************************/

T[] cropLeft(T) (T[] s)
{
        // comparing the immutable forms strips const/immutable from T
        static if (is (immutable(T) == immutable(char)))
                  {
                  size_t i = 0;

                  // step over orphaned continuation bytes
                  while (i < s.length && (s[i] & 0x80))
                        {
                        // stop on the first byte of a sequence
                        if ((s[i] & 0xc0) is 0xc0)
                             break;
                        ++i;
                        }
                  return s [i..$];
                  }
        else
           static if (is (immutable(T) == immutable(wchar)))
                     {
                     // skip if first char is a trailing surrogate
                     if (s.length && (s[0] & 0xfffffc00) is 0xdc00)
                         return s [1..$];
                     return s;
                     }
           else
              return s;
}

/*******************************************************************************

        Adjust the content such that no partial encodings exist on the
        right side of the provided text.

        For Utf8, the text is scanned backwards over continuation bytes
        to the last lead byte; where the bytes from there to the end do
        not make up exactly the length that lead byte announces, the
        text is cut just before it. Text ending in ASCII, or holding no
        lead byte at all, is returned unchanged. For Utf16, a trailing
        leading-surrogate is dropped. Element qualifiers are ignored,
        so const and immutable text is handled like mutable text.

        Returns a slice of the input

*******************************************************************************/

T[] cropRight(T) (T[] s)
{
        if (s.length)
           {
           // comparing the immutable forms strips const/immutable from T
           static if (is (immutable(T) == immutable(char)))
                      for (size_t i = s.length; i-- > 0;)
                          {
                          ubyte b = s[i];

                          // plain ASCII terminates the search
                          if ((b & 0x80) is 0)
                               break;

                          if ((b & 0xc0) is 0xc0)
                             {
                             // located the first byte of a sequence
                             size_t need = 2;

                             // 111xxxxx is three bytes or more; the 0x10
                             // bit only counts once 0x20 is set, since
                             // two-byte leads 0xd0..0xdf carry it too
                             if (b & 0x20)
                                {
                                ++need;
                                if (b & 0x10)
                                    ++need;
                                }

                             // is the sequence complete?
                             return (s.length - i is need) ? s : s [0..i];
                             }
                          }

           static if (is (immutable(T) == immutable(wchar)))
                      // skip if last char is a leading surrogate
                      if ((s[$-1] & 0xfffffc00) is 0xd800)
                           return s [0..$-1];
           }
        return s;
}



/*******************************************************************************

        Manual check, compiled only with -debug=Utf: prints a sample
        containing three two-byte sequences, then the result of
        cropLeft() with 0 .. 5 elements removed from the front and of
        cropRight() with 0 .. 5 elements removed from the back.

*******************************************************************************/

debug (Utf)
{
        import tango.io.Console;

        void main()
        {
                auto text = "[\xc2\xa2\xc2\xa2\xc2\xa2]";
                Cout (text).newline;

                // slice progressively further into the text
                foreach (skip; 0 .. 6)
                         Cout (cropLeft(text[skip..$])).newline;

                // slice progressively further off the end
                foreach (drop; 0 .. 6)
                         Cout (cropRight(text[0..$-drop])).newline;
        }
}