1 // Written in the D programming language
2 
3 /*
4  *  Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
5  *  Written by Walter Bright
6  *
7  *  This software is provided 'as-is', without any express or implied
8  *  warranty. In no event will the authors be held liable for any damages
9  *  arising from the use of this software.
10  *
11  *  Permission is granted to anyone to use this software for any purpose,
12  *  including commercial applications, and to alter it and redistribute it
13  *  freely, subject to the following restrictions:
14  *
15  *  o  The origin of this software must not be misrepresented; you must not
16  *     claim that you wrote the original software. If you use this software
17  *     in a product, an acknowledgment in the product documentation would be
18  *     appreciated but is not required.
19  *  o  Altered source versions must be plainly marked as such, and must not
20  *     be misrepresented as being the original software.
21  *  o  This notice may not be removed or altered from any source
22  *     distribution.
23  */
24 
25 /********************************************
26  * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
27  *
28  * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
29  * wchar type.
30  * For linux systems, the C wchar_t type is UTF-32 and corresponds to
31  * the D utf.dchar type.
32  *
33  * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
34  *
35  * See_Also:
36  *	$(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
37  *	$(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
38  *	$(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
39  * Macros:
40  *	WIKI = Phobos/StdUtf
41  */
42 
43 module rt.compiler.util.utf;
44 
45 
46 
/* Unicode error hook, declared here with C linkage; no body is given in
 * this declaration. The functions below call it when they meet a
 * malformed sequence, passing a description of the problem and the index
 * at which it was detected.
 * NOTE(review): callers are written as if this does not return normally
 * (presumably it throws) -- confirm against its definition.
 */
extern (C) void onUnicodeError( char[] msg, size_t idx );
/*******************************
 * Check whether c is a legal UTF-32 code point.
 *
 * A code point is accepted when it is at most 0x10FFFF and does not fall
 * in the surrogate block 0xD800 .. 0xDFFF.
 *
 * \uFFFE and \uFFFF are accepted by this function: applications may use
 * them internally, although the Unicode standard does not allow them
 * for interchange.
 *
 * Returns: true if c is valid, false if not.
 */


bool isValidDchar(dchar c)
{
    // Everything below the surrogate block is fine.
    if (c < 0xD800)
        return true;

    /* Otherwise c must lie above the surrogate block and within the
     * Unicode range. FFFE and FFFF are deliberately not excluded here
     * (thanks to Arcane Jill).
     */
    return c > 0xDFFF && c <= 0x10FFFF;
}
70 
71 debug import tango.stdc.stdio : printf;
72 
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    // An ordinary ASCII letter is a valid code point ...
    assert(isValidDchar(cast(dchar)'a'));

    // ... while a value beyond 0x10FFFF is not.
    assert(!isValidDchar(cast(dchar)0x1FFFFF));
}
79 
80 
/* This array gives the length of a UTF-8 sequence indexed by the value
 * of the leading byte. An FF represents an illegal starting value of
 * a UTF-8 sequence.
 * FF is used instead of 0 to avoid having loops hang: a loop that
 * advances by the stride would never make progress with a stride of 0.
 *
 * Layout (16 entries per row):
 *	0x00 .. 0x7F	1
 *	0x80 .. 0xBF	0xFF
 *	0xC0 .. 0xDF	2
 *	0xE0 .. 0xEF	3
 *	0xF0 .. 0xF7	4
 *	0xF8 .. 0xFB	5
 *	0xFC .. 0xFD	6
 *	0xFE .. 0xFF	0xFF
 * Note that the table still reports lengths 5 and 6, even though the
 * char[] decode() in this module rejects sequences longer than 4 bytes.
 */

ubyte[256] UTF8stride =
[
    // 0x00 .. 0x7F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0x80 .. 0xBF: not a valid first byte
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0 .. 0xDF
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xE0 .. 0xEF
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    // 0xF0 .. 0xFF
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
106 
/**
 * stride() returns the length of the UTF-8 sequence starting at index i
 * in string s, as given by the UTF8stride table for the byte s[i].
 * Returns:
 *	The number of bytes in the UTF-8 sequence, or
 *	0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
uint stride(in char[] s, size_t i)
{
    char lead = s[i];		// first byte of the sequence

    return UTF8stride[lead];
}
118 
/**
 * stride() returns the length of the UTF-16 sequence starting at index i
 * in string s.
 * Returns:
 *	2 if s[i] is a high surrogate (0xD800 .. 0xDBFF), otherwise 1.
 */
uint stride(in wchar[] s, size_t i)
{
    uint unit = s[i];

    if (unit >= 0xD800 && unit <= 0xDBFF)
        return 2;		// high surrogate: a second unit follows
    return 1;
}
127 
/**
 * stride() returns the length of the UTF-32 sequence starting at index i
 * in string s. Neither argument is inspected, because every UTF-32
 * sequence occupies exactly one element.
 * Returns: The return value will always be 1.
 */
uint stride(in dchar[] s, size_t i)
{
    // One dchar per character, whatever s and i are.
    return 1u;
}
137 
/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * onUnicodeError is called if a byte that cannot start a sequence is
 * met on the way, or if stepping sequence by sequence overshoots i
 * (i.e. i was not at the start of a character).
 */

size_t toUCSindex(in char[] s, size_t i)
{
    size_t count;		// characters seen so far
    size_t pos;			// byte offset of the next sequence
    bool   bad;			// set when an illegal lead byte is found

    while (pos < i)
    {
        size_t width = UTF8stride[s[pos]];

        if (width == 0xFF)
        {
            bad = true;
            break;
        }
        count++;
        pos += width;
    }

    // Either an illegal lead byte, or i fell inside a sequence.
    if (bad || pos > i)
        onUnicodeError("invalid UTF-8 sequence", pos);

    return count;
}
164 
165 /** ditto */
size_t toUCSindex(in wchar[] s, size_t i)
{
    // Walk the string one character at a time, counting characters until
    // the UTF-16 offset reaches (or passes) i.
    size_t count;
    size_t pos;

    while (pos < i)
    {
        uint unit = s[pos];

        // A high surrogate takes two units, anything else one.
        pos += (unit >= 0xD800 && unit <= 0xDBFF) ? 2 : 1;
        count++;
    }

    // Overshooting means i pointed into the middle of a surrogate pair.
    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);

    return count;
}
184 
185 /** ditto */
size_t toUCSindex(in dchar[] s, size_t i)
{
    // In UTF-32 every element is one character, so the element index is
    // already the character index; s is not examined.
    size_t ucsIndex = i;

    return ucsIndex;
}
190 
/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index,
 * i.e. the offset in s at which character number n starts.
 *
 * onUnicodeError is called when a byte that cannot start a UTF-8
 * sequence is found where a sequence should begin.
 */

size_t toUTFindex(in char[] s, size_t n)
{
    size_t pos;			// byte offset reached so far

    // Skip over n whole sequences.
    for (size_t remaining = n; remaining != 0; remaining--)
    {
        uint width = UTF8stride[s[pos]];

        if (width == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);
        pos += width;
    }
    return pos;
}
208 
209 /** ditto */
size_t toUTFindex(in wchar[] s, size_t n)
{
    // Skip over n characters; a character introduced by a high surrogate
    // occupies two UTF-16 units, every other one a single unit.
    size_t pos;

    for (size_t remaining = n; remaining != 0; remaining--)
    {
        wchar unit = s[pos];

        if (unit >= 0xD800 && unit <= 0xDBFF)
            pos += 2;
        else
            pos += 1;
    }
    return pos;
}
221 
222 /** ditto */
size_t toUTFindex(in dchar[] s, size_t n)
{
    // Character number n lives in element n of a UTF-32 array; s is not
    // examined.
    size_t utfIndex = n;

    return utfIndex;
}
227 
228 /* =================== Decode ======================= */
229 
/***************
 * Decodes and returns the character starting at s[idx]. On success idx is
 * advanced past the decoded character. If the character is not well
 * formed, onUnicodeError is called with the index of the offending
 * sequence and idx remains unchanged.
 *
 * Only the 1 to 4 byte forms are accepted. Reported as errors are:
 * a continuation byte or a 5/6 byte lead byte in first position, a
 * sequence running off the end of s, overlong forms, trailing bytes that
 * are not of the form 10xxxxxx, and results rejected by isValidDchar.
 */
dchar decode(in char[] s, ref size_t idx)
    in
    {
        // idx is unsigned, so only the upper bound needs checking
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    body
    {
        size_t len = s.length;
        dchar V;
        size_t i = idx;
        char u = s[i];

        if (u & 0x80)
        {   uint n;
            char u2;

            /* Multi-byte encodings accepted here:
             *	110xxxxx 10xxxxxx
             *	1110xxxx 10xxxxxx 10xxxxxx
             *	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             * The 5 and 6 byte forms (111110xx ..., 1111110x ...) are
             * rejected.
             */

            // n = number of leading 1 bits of u = length of the sequence
            for (n = 1; ; n++)
            {
                if (n > 4)
                    goto Lerr;		// 5 or more leading 1 bits
                if (((u << n) & 0x80) == 0)
                {
                    if (n == 1)
                        goto Lerr;	// 10xxxxxx cannot start a sequence
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = cast(dchar)(u & ((1 << (7 - n)) - 1));

            if (i + (n - 1) >= len)
                goto Lerr;			// off end of string

            /* The following combinations are overlong, and illegal:
             *	1100000x (10xxxxxx)
             *	11100000 100xxxxx (10xxxxxx)
             *	11110000 1000xxxx (10xxxxxx 10xxxxxx)
             * Lead bytes of the 5 and 6 byte forms never get this far
             * (they fail the n > 4 test above), so they need no overlong
             * check of their own.
             */
            u2 = s[i + 1];
            if ((u & 0xFE) == 0xC0 ||
                (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
                (u == 0xF0 && (u2 & 0xF0) == 0x80))
                goto Lerr;			// overlong combination

            // Fold in 6 payload bits from each trailing byte.
            for (uint j = 1; j != n; j++)
            {
                u = s[i + j];
                if ((u & 0xC0) != 0x80)
                    goto Lerr;			// trailing bytes are 10xxxxxx
                V = (V << 6) | (u & 0x3F);
            }
            if (!isValidDchar(V))
                goto Lerr;			// surrogate or above 0x10FFFF
            i += n;
        }
        else
        {
            // Plain ASCII: the byte is the character.
            V = cast(dchar) u;
            i++;
        }

        idx = i;
        return V;

      Lerr:
        // i still equals the original idx here; idx itself is untouched.
        onUnicodeError("invalid UTF-8 sequence", i);
        return V; // dummy return
    }
321 
unittest
{   size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    // Plain ASCII: one byte per character.
    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    // Two byte sequence.
    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    // Three byte sequence.
    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    // Malformed input: every entry must be rejected.
    static char[][] s4 =
    [	"\xE2\x89",		// too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    foreach (char[] bad; s4)
    {
        /* The outcome is recorded in a flag and asserted outside the
         * try block. An assert(0) placed inside the try would itself be
         * caught by catch (Object), so the test could never fail.
         */
        bool rejected = false;

        i = 0;
        try
        {
            c = decode(bad, i);
        }
        catch (Object o)
        {
            rejected = true;
        }
        assert(rejected);
        assert(i == 0);		// idx is left unchanged on failure
    }
}
373 
374 /** ditto */
375 
dchar decode(in wchar[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    body
    {
        // UTF-16 flavour: one unit, or a high/low surrogate pair.
        char[] problem;		// stays empty unless the input is malformed
        size_t i = idx;
        uint u = s[i];

        if (u < 0x80)
        {
            i++;		// ASCII
        }
        else if (u >= 0xD800 && u <= 0xDBFF)
        {
            // High surrogate: a low surrogate has to follow.
            if (i + 1 == s.length)
                problem = "surrogate UTF-16 high value past end of string";
            else
            {
                uint u2 = s[i + 1];

                if (u2 < 0xDC00 || u2 > 0xDFFF)
                    problem = "surrogate UTF-16 low value out of range";
                else
                {
                    // Combine both halves into one code point.
                    u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
                    i += 2;
                }
            }
        }
        else if (u >= 0xDC00 && u <= 0xDFFF)
            problem = "unpaired surrogate UTF-16 value";
        else if (u == 0xFFFE || u == 0xFFFF)
            problem = "illegal UTF-16 value";
        else
            i++;

        if (problem.length != 0)
        {
            // i and u are untouched on every error path; idx is not written.
            onUnicodeError(problem, i);
            return cast(dchar)u; // dummy return
        }

        idx = i;
        return cast(dchar)u;
    }
431 
432 /** ditto */
433 
dchar decode(in dchar[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    body
    {
        // UTF-32 flavour: the element is the character, it only needs
        // to pass isValidDchar.
        size_t i = idx;
        dchar c = s[i];

        if (isValidDchar(c))
        {
            idx = i + 1;
            return c;
        }

        // idx is left as it was.
        onUnicodeError("invalid UTF-32 value", i);
        return c; // dummy return
    }
453 
454 
455 /* =================== Encode ======================= */
456 
/*******************************
 * Encodes character c and appends it to array s[].
 *
 * For the char[] overload the character is written as a 1 to 4 byte
 * UTF-8 sequence. c must satisfy isValidDchar.
 */
void encode(ref char[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        char[] r = s;

        if (c <= 0x7F)
        {
            r ~= cast(char) c;		// ASCII goes out unchanged
        }
        else
        {
            char[4] tmp;
            uint used;			// length of the sequence
            uint lead;			// marker bits of the first byte

            if (c <= 0x7FF)
            {
                used = 2;
                lead = 0xC0;
            }
            else if (c <= 0xFFFF)
            {
                used = 3;
                lead = 0xE0;
            }
            else if (c <= 0x10FFFF)
            {
                used = 4;
                lead = 0xF0;
            }
            else
            {
                assert(0);
            }

            // Fill the trailing bytes back to front, 6 bits at a time;
            // whatever is left over belongs in the first byte.
            uint bits = c;
            for (uint k = used - 1; k != 0; k--)
            {
                tmp[k] = cast(char)(0x80 | (bits & 0x3F));
                bits >>= 6;
            }
            tmp[0] = cast(char)(lead | bits);

            r ~= tmp[0 .. used];
        }
        s = r;
    }
507 
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] s = "abcd".dup;

    // ASCII: one byte appended.
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    // U+00A9: two bytes appended.
    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");	// BUG: fix compiler

    // U+2260: three bytes appended.
    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}
526 
527 /** ditto */
528 
void encode(ref wchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // UTF-16 flavour: one unit up to 0xFFFF, a surrogate pair above.
        wchar[] r = s;

        if (c > 0xFFFF)
        {
            uint offset = c - 0x10000;	// value split across the pair
            wchar[2] pair;

            pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);	// high
            pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);		// low
            r ~= pair;
        }
        else
        {
            r ~= cast(wchar) c;
        }
        s = r;
    }
552 
553 /** ditto */
void encode(ref dchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // UTF-32 flavour: the code point is stored as is.
        s ~= c;
    }
563 
564 /* =================== Validation ======================= */
565 
/* Runs decode() over the whole of s, one character after another.
 * The decoded values are discarded; any problem is reported by decode().
 */
void validate(char[] s)
{
    size_t pos = 0;
    size_t end = s.length;

    while (pos < end)
        decode(s, pos);		// advances pos past the character
}
576 
/* Runs decode() over the whole of s, one character after another.
 * The decoded values are discarded; any problem is reported by decode().
 */
void validate(wchar[] s)
{
    size_t pos = 0;
    size_t end = s.length;

    while (pos < end)
        decode(s, pos);		// advances pos past the character
}
587 
/* Runs decode() over the whole of s, one character after another.
 * The decoded values are discarded; any problem is reported by decode().
 */
void validate(dchar[] s)
{
    size_t pos = 0;
    size_t end = s.length;

    while (pos < end)
        decode(s, pos);		// advances pos past the character
}
598 
599 /* =================== Conversion to UTF8 ======================= */
600 
/* Encodes the single character c as UTF-8 into buf and returns the slice
 * of buf that was filled (1 to 4 bytes). c must satisfy isValidDchar.
 */
char[] toUTF8(char[4] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // Nothing above 0x10FFFF can be encoded.
        if (c > 0x10FFFF)
            assert(0);

        if (c <= 0x7F)
        {
            buf[0] = cast(char) c;
            return buf[0 .. 1];
        }

        // The last byte of every multi-byte form carries the low 6 bits.
        char last = cast(char)(0x80 | (c & 0x3F));

        if (c <= 0x7FF)
        {
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = last;
            return buf[0 .. 2];
        }

        // The byte before it carries bits 6 .. 11.
        char middle = cast(char)(0x80 | ((c >> 6) & 0x3F));

        if (c <= 0xFFFF)
        {
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = middle;
            buf[2] = last;
            return buf[0 .. 3];
        }

        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = middle;
        buf[3] = last;
        return buf[0 .. 4];
    }
636 
/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * For char[] input the string is already UTF-8: it is checked with
 * validate() in the in contract and then returned as is, not copied.
 */
char[] toUTF8(char[] s)
    in
    {
        validate(s);
    }
    body
    {
        char[] same = s;	// no conversion needed

        return same;
    }
649 
650 /** ditto */
char[] toUTF8(in wchar[] s)
{
    // UTF-16 to UTF-8. A leading run of ASCII is copied unit for unit;
    // from the first non-ASCII unit on, the rest of the input is decoded
    // by foreach and re-encoded with encode().
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {	wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;		// fast path for ascii
        else
        {
            // Keep only what has been copied so far, then append the
            // remainder one decoded character at a time. The loop
            // variable is named d (as in the dchar[] overload) so that
            // it does not shadow the wchar c declared above.
            r.length = i;
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return r;
}
676 
677 /** ditto */
char[] toUTF8(in dchar[] s)
{
    // UTF-32 to UTF-8. The leading ASCII run is copied directly; the
    // remainder, if any, goes through encode().
    size_t slen = s.length;

    // Length of the leading run of ASCII characters.
    size_t ascii = 0;
    while (ascii < slen && s[ascii] <= 0x7F)
        ascii++;

    char[] r;
    r.length = slen;
    for (size_t k = 0; k < ascii; k++)
        r[k] = cast(char)s[k];		// fast path for ascii

    if (ascii < slen)
    {
        // Drop the unused tail and append the encoded remainder.
        r.length = ascii;
        foreach (dchar d; s[ascii .. slen])
        {
            encode(r, d);
        }
    }
    return r;
}
703 
704 /* =================== Conversion to UTF16 ======================= */
705 
/* Encodes the single character c as UTF-16 into buf and returns the slice
 * of buf that was filled: one unit for c <= 0xFFFF, otherwise a
 * high/low surrogate pair. c must satisfy isValidDchar.
 */
wchar[] toUTF16(wchar[2] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        if (c > 0xFFFF)
        {
            uint offset = c - 0x10000;	// value split across the pair

            buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
            return buf[0 .. 2];
        }

        buf[0] = cast(wchar) c;
        return buf[0 .. 1];
    }
725 
/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() additionally appends a terminating zero and returns a
 * pointer to the first element; it is suitable for calling the 'W'
 * functions in the Win32 API that take an LPWSTR or LPCWSTR argument.
 */
wchar[] toUTF16(in char[] s)
{
    wchar[] r;
    size_t slen = s.length;

    // Size the array for slen units, then empty it again before appending
    // (presumably to reserve the storage up front -- TODO confirm).
    r.length = slen;
    r.length = 0;

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c > 0x7F)
        {
            // Multi-byte sequence: decode() advances i itself.
            c = decode(s, i);
            encode(r, c);
        }
        else
        {
            r ~= cast(wchar)c;
            i++;
        }
    }
    return r;
}
754 
755 alias wchar* wptr;
756 /** ditto */
wptr toUTF16z(in char[] s)
{
    // Same conversion as toUTF16(char[]), followed by a terminating zero;
    // a pointer to the first unit is returned instead of the array.
    wchar[] r;
    size_t slen = s.length;

    // One extra unit is allowed for the terminator.
    r.length = slen + 1;
    r.length = 0;

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c > 0x7F)
        {
            // Multi-byte sequence: decode() advances i itself.
            c = decode(s, i);
            encode(r, c);
        }
        else
        {
            r ~= cast(wchar)c;
            i++;
        }
    }
    r ~= "\000";
    return r.ptr;
}
781 
782 /** ditto */
wchar[] toUTF16(wchar[] s)
    in
    {
        validate(s);
    }
    body
    {
        // Already UTF-16: checked by the in contract, returned uncopied.
        wchar[] same = s;

        return same;
    }
792 
793 /** ditto */
wchar[] toUTF16(in dchar[] s)
{
    // UTF-32 to UTF-16: every code point is handed to encode() in turn.
    wchar[] r;

    // Size the array for s.length units, then empty it again before
    // appending (presumably to reserve the storage -- TODO confirm).
    r.length = s.length;
    r.length = 0;

    foreach (dchar c; s)
    {
        encode(r, c);
    }
    return r;
}
807 
808 /* =================== Conversion to UTF32 ======================= */
809 
/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 *
 * The result is built in an array as long as s and then trimmed to the
 * number of characters actually produced.
 */
dchar[] toUTF32(in char[] s)
{
    size_t slen = s.length;
    size_t produced = 0;	// characters written to r so far
    dchar[] r;

    r.length = slen;		// r[] will never be longer than s[]

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c < 0x80)
            i++;		// c is ascii, no need for decode
        else
            c = decode(s, i);	// decode() advances i itself

        r[produced] = c;
        produced++;
    }
    return r[0 .. produced];
}
831 
832 /** ditto */
dchar[] toUTF32(in wchar[] s)
{
    // UTF-16 to UTF-32: built in an array as long as s, then trimmed to
    // the number of characters actually produced.
    size_t slen = s.length;
    size_t produced = 0;	// characters written to r so far
    dchar[] r;

    r.length = slen;		// r[] will never be longer than s[]

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c < 0x80)
            i++;		// c is ascii, no need for decode
        else
            c = decode(s, i);	// decode() advances i itself

        r[produced] = c;
        produced++;
    }
    return r[0 .. produced];
}
851 
852 /** ditto */
dchar[] toUTF32(dchar[] s)
    in
    {
        validate(s);
    }
    body
    {
        // Already UTF-32: checked by the in contract, returned uncopied.
        dchar[] same = s;

        return same;
    }
862 
863 /* ================================ tests ================================== */
864 
unittest
{
    /* Converts between all three encodings, starting from each of them
     * in turn, and checks every result against the expected text given
     * in the matching encoding.
     */
    void roundTrip(char[] u8, wchar[] u16, dchar[] u32)
    {
        char[] c;
        wchar[] w;
        dchar[] d;

        debug(utf) printf("utf.toUTF.unittest\n");

        // From UTF-8.
        c = u8;
        w = toUTF16(c);
        //foreach (wchar x; w) printf("c = x%x\n", x);
        assert(w == u16);
        d = toUTF32(c);
        assert(d == u32);

        // From UTF-16.
        c = toUTF8(w);
        assert(c == u8);
        d = toUTF32(w);
        assert(d == u32);

        // From UTF-32.
        c = toUTF8(d);
        assert(c == u8);
        w = toUTF16(d);
        assert(w == u16);
    }

    // ASCII only.
    roundTrip("hello"c, "hello"w, "hello"d);

    // A character needing three UTF-8 bytes and one UTF-16 unit.
    roundTrip("hel\u1234o"c, "hel\u1234o"w, "hel\u1234o"d);

    // A character needing four UTF-8 bytes and a UTF-16 surrogate pair.
    roundTrip("he\U0010AAAAllo"c, "he\U0010AAAAllo"w, "he\U0010AAAAllo"d);
}