// Written in the D programming language

/*
 *  Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
 *  Written by Walter Bright
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  o  The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgment in the product documentation would be
 *     appreciated but is not required.
 *  o  Altered source versions must be plainly marked as such, and must not
 *     be misrepresented as being the original software.
 *  o  This notice may not be removed or altered from any source
 *     distribution.
 */

/********************************************
 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
 *
 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
 * wchar type.
 * For linux systems, the C wchar_t type is UTF-32 and corresponds to
 * the D utf.dchar type.
 *
 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
 *
 * See_Also:
 *      $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
 *      $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
 *      $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
 * Macros:
 *      WIKI = Phobos/StdUtf
 */

module rt.compiler.util.utf;



// Error hook declared here and defined outside this module; it receives a
// message and the index of the offending code unit.
extern (C) void onUnicodeError( char[] msg, size_t idx );

/*******************************
 * Test if c is a valid UTF-32 character.
 *
 * A value is accepted when it is at most 0x10FFFF and is not a surrogate
 * (0xD800 .. 0xDFFF).
 *
 * \uFFFE and \uFFFF are considered valid by this function,
 * as they are permitted for internal use by an application,
 * but they are not allowed for interchange by the Unicode standard.
 *
 * Returns: true if it is, false if not.
 */
bool isValidDchar(dchar c)
{
    // The surrogate range is reserved for UTF-16 pairs and is never a
    // character on its own.
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;

    /* Note: FFFE and FFFF are specifically permitted by the
     * Unicode standard for application internal use, but are not
     * allowed for interchange, so they are deliberately not excluded.
     * (thanks to Arcane Jill)
     */
    return c <= 0x10FFFF;
}

debug import tango.stdc.stdio : printf;

unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");
    assert(isValidDchar(cast(dchar)'a'));
    assert(!isValidDchar(cast(dchar)0x1FFFFF));
}


/* Length in bytes of a UTF-8 sequence, indexed by the value of its
 * leading byte. An entry of 0xFF marks a byte that cannot start a
 * sequence.
 * 0xFF is used instead of 0 so that a loop advancing by the stride
 * cannot hang on an illegal byte.
 */
ubyte[256] UTF8stride =
[
    // 0x00 .. 0x7F: single byte (ASCII)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0x80 .. 0xBF: continuation bytes, never a start
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0 .. 0xDF: two-byte lead
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xE0 .. 0xEF: three-byte lead
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    // 0xF0 .. 0xFF: four-, five- and six-byte leads, then two illegal bytes
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];

/**
 * stride() returns the length of a UTF-8 sequence starting at index i
 * in string s.
110 * Returns: 111 * The number of bytes in the UTF-8 sequence or 112 * 0xFF meaning s[i] is not the start of of UTF-8 sequence. 113 */ 114 uint stride(in char[] s, size_t i) 115 { 116 return UTF8stride[s[i]]; 117 } 118 119 /** 120 * stride() returns the length of a UTF-16 sequence starting at index i 121 * in string s. 122 */ 123 uint stride(in wchar[] s, size_t i) 124 { uint u = s[i]; 125 return 1 + (u >= 0xD800 && u <= 0xDBFF); 126 } 127 128 /** 129 * stride() returns the length of a UTF-32 sequence starting at index i 130 * in string s. 131 * Returns: The return value will always be 1. 132 */ 133 uint stride(in dchar[] s, size_t i) 134 { 135 return 1; 136 } 137 138 /******************************************* 139 * Given an index i into an array of characters s[], 140 * and assuming that index i is at the start of a UTF character, 141 * determine the number of UCS characters up to that index i. 142 */ 143 144 size_t toUCSindex(in char[] s, size_t i) 145 { 146 size_t n; 147 size_t j; 148 size_t stride; 149 150 for (j = 0; j < i; j += stride) 151 { 152 stride = UTF8stride[s[j]]; 153 if (stride == 0xFF) 154 goto Lerr; 155 n++; 156 } 157 if (j > i) 158 { 159 Lerr: 160 onUnicodeError("invalid UTF-8 sequence", j); 161 } 162 return n; 163 } 164 165 /** ditto */ 166 size_t toUCSindex(in wchar[] s, size_t i) 167 { 168 size_t n; 169 size_t j; 170 171 for (j = 0; j < i; ) 172 { uint u = s[j]; 173 174 j += 1 + (u >= 0xD800 && u <= 0xDBFF); 175 n++; 176 } 177 if (j > i) 178 { 179 Lerr: 180 onUnicodeError("invalid UTF-16 sequence", j); 181 } 182 return n; 183 } 184 185 /** ditto */ 186 size_t toUCSindex(in dchar[] s, size_t i) 187 { 188 return i; 189 } 190 191 /****************************************** 192 * Given a UCS index n into an array of characters s[], return the UTF index. 
193 */ 194 195 size_t toUTFindex(in char[] s, size_t n) 196 { 197 size_t i; 198 199 while (n--) 200 { 201 uint j = UTF8stride[s[i]]; 202 if (j == 0xFF) 203 onUnicodeError("invalid UTF-8 sequence", i); 204 i += j; 205 } 206 return i; 207 } 208 209 /** ditto */ 210 size_t toUTFindex(in wchar[] s, size_t n) 211 { 212 size_t i; 213 214 while (n--) 215 { wchar u = s[i]; 216 217 i += 1 + (u >= 0xD800 && u <= 0xDBFF); 218 } 219 return i; 220 } 221 222 /** ditto */ 223 size_t toUTFindex(in dchar[] s, size_t n) 224 { 225 return n; 226 } 227 228 /* =================== Decode ======================= */ 229 230 /*************** 231 * Decodes and returns character starting at s[idx]. idx is advanced past the 232 * decoded character. If the character is not well formed, a UtfException is 233 * thrown and idx remains unchanged. 234 */ 235 dchar decode(in char[] s, ref size_t idx) 236 in 237 { 238 assert(idx >= 0 && idx < s.length); 239 } 240 out (result) 241 { 242 assert(isValidDchar(result)); 243 } 244 body 245 { 246 size_t len = s.length; 247 dchar V; 248 size_t i = idx; 249 char u = s[i]; 250 251 if (u & 0x80) 252 { uint n; 253 char u2; 254 255 /* The following encodings are valid, except for the 5 and 6 byte 256 * combinations: 257 * 0xxxxxxx 258 * 110xxxxx 10xxxxxx 259 * 1110xxxx 10xxxxxx 10xxxxxx 260 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 261 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 262 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 263 */ 264 for (n = 1; ; n++) 265 { 266 if (n > 4) 267 goto Lerr; // only do the first 4 of 6 encodings 268 if (((u << n) & 0x80) == 0) 269 { 270 if (n == 1) 271 goto Lerr; 272 break; 273 } 274 } 275 276 // Pick off (7 - n) significant bits of B from first byte of octet 277 V = cast(dchar)(u & ((1 << (7 - n)) - 1)); 278 279 if (i + (n - 1) >= len) 280 goto Lerr; // off end of string 281 282 /* The following combinations are overlong, and illegal: 283 * 1100000x (10xxxxxx) 284 * 11100000 100xxxxx (10xxxxxx) 285 * 11110000 1000xxxx 
(10xxxxxx 10xxxxxx) 286 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) 287 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) 288 */ 289 u2 = s[i + 1]; 290 if ((u & 0xFE) == 0xC0 || 291 (u == 0xE0 && (u2 & 0xE0) == 0x80) || 292 (u == 0xF0 && (u2 & 0xF0) == 0x80) || 293 (u == 0xF8 && (u2 & 0xF8) == 0x80) || 294 (u == 0xFC && (u2 & 0xFC) == 0x80)) 295 goto Lerr; // overlong combination 296 297 for (uint j = 1; j != n; j++) 298 { 299 u = s[i + j]; 300 if ((u & 0xC0) != 0x80) 301 goto Lerr; // trailing bytes are 10xxxxxx 302 V = (V << 6) | (u & 0x3F); 303 } 304 if (!isValidDchar(V)) 305 goto Lerr; 306 i += n; 307 } 308 else 309 { 310 V = cast(dchar) u; 311 i++; 312 } 313 314 idx = i; 315 return V; 316 317 Lerr: 318 onUnicodeError("invalid UTF-8 sequence", i); 319 return V; // dummy return 320 } 321 322 unittest 323 { size_t i; 324 dchar c; 325 326 debug(utf) printf("utf.decode.unittest\n"); 327 328 static s1 = "abcd"c; 329 i = 0; 330 c = decode(s1, i); 331 assert(c == cast(dchar)'a'); 332 assert(i == 1); 333 c = decode(s1, i); 334 assert(c == cast(dchar)'b'); 335 assert(i == 2); 336 337 static s2 = "\xC2\xA9"c; 338 i = 0; 339 c = decode(s2, i); 340 assert(c == cast(dchar)'\u00A9'); 341 assert(i == 2); 342 343 static s3 = "\xE2\x89\xA0"c; 344 i = 0; 345 c = decode(s3, i); 346 assert(c == cast(dchar)'\u2260'); 347 assert(i == 3); 348 349 static char[][] s4 = 350 [ "\xE2\x89", // too short 351 "\xC0\x8A", 352 "\xE0\x80\x8A", 353 "\xF0\x80\x80\x8A", 354 "\xF8\x80\x80\x80\x8A", 355 "\xFC\x80\x80\x80\x80\x8A", 356 ]; 357 358 for (int j = 0; j < s4.length; j++) 359 { 360 try 361 { 362 i = 0; 363 c = decode(s4[j], i); 364 assert(0); 365 } 366 catch (Object o) 367 { 368 i = 23; 369 } 370 assert(i == 23); 371 } 372 } 373 374 /** ditto */ 375 376 dchar decode(in wchar[] s, ref size_t idx) 377 in 378 { 379 assert(idx >= 0 && idx < s.length); 380 } 381 out (result) 382 { 383 assert(isValidDchar(result)); 384 } 385 body 386 { 387 char[] msg; 388 dchar V; 389 size_t i = idx; 
390 uint u = s[i]; 391 392 if (u & ~0x7F) 393 { if (u >= 0xD800 && u <= 0xDBFF) 394 { uint u2; 395 396 if (i + 1 == s.length) 397 { msg = "surrogate UTF-16 high value past end of string"; 398 goto Lerr; 399 } 400 u2 = s[i + 1]; 401 if (u2 < 0xDC00 || u2 > 0xDFFF) 402 { msg = "surrogate UTF-16 low value out of range"; 403 goto Lerr; 404 } 405 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); 406 i += 2; 407 } 408 else if (u >= 0xDC00 && u <= 0xDFFF) 409 { msg = "unpaired surrogate UTF-16 value"; 410 goto Lerr; 411 } 412 else if (u == 0xFFFE || u == 0xFFFF) 413 { msg = "illegal UTF-16 value"; 414 goto Lerr; 415 } 416 else 417 i++; 418 } 419 else 420 { 421 i++; 422 } 423 424 idx = i; 425 return cast(dchar)u; 426 427 Lerr: 428 onUnicodeError(msg, i); 429 return cast(dchar)u; // dummy return 430 } 431 432 /** ditto */ 433 434 dchar decode(in dchar[] s, ref size_t idx) 435 in 436 { 437 assert(idx >= 0 && idx < s.length); 438 } 439 body 440 { 441 size_t i = idx; 442 dchar c = s[i]; 443 444 if (!isValidDchar(c)) 445 goto Lerr; 446 idx = i + 1; 447 return c; 448 449 Lerr: 450 onUnicodeError("invalid UTF-32 value", i); 451 return c; // dummy return 452 } 453 454 455 /* =================== Encode ======================= */ 456 457 /******************************* 458 * Encodes character c and appends it to array s[]. 
459 */ 460 void encode(ref char[] s, dchar c) 461 in 462 { 463 assert(isValidDchar(c)); 464 } 465 body 466 { 467 char[] r = s; 468 469 if (c <= 0x7F) 470 { 471 r ~= cast(char) c; 472 } 473 else 474 { 475 char[4] buf; 476 uint L; 477 478 if (c <= 0x7FF) 479 { 480 buf[0] = cast(char)(0xC0 | (c >> 6)); 481 buf[1] = cast(char)(0x80 | (c & 0x3F)); 482 L = 2; 483 } 484 else if (c <= 0xFFFF) 485 { 486 buf[0] = cast(char)(0xE0 | (c >> 12)); 487 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 488 buf[2] = cast(char)(0x80 | (c & 0x3F)); 489 L = 3; 490 } 491 else if (c <= 0x10FFFF) 492 { 493 buf[0] = cast(char)(0xF0 | (c >> 18)); 494 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 495 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 496 buf[3] = cast(char)(0x80 | (c & 0x3F)); 497 L = 4; 498 } 499 else 500 { 501 assert(0); 502 } 503 r ~= buf[0 .. L]; 504 } 505 s = r; 506 } 507 508 unittest 509 { 510 debug(utf) printf("utf.encode.unittest\n"); 511 512 char[] s = "abcd".dup; 513 encode(s, cast(dchar)'a'); 514 assert(s.length == 5); 515 assert(s == "abcda"); 516 517 encode(s, cast(dchar)'\u00A9'); 518 assert(s.length == 7); 519 assert(s == "abcda\xC2\xA9"); 520 //assert(s == "abcda\u00A9"); // BUG: fix compiler 521 522 encode(s, cast(dchar)'\u2260'); 523 assert(s.length == 10); 524 assert(s == "abcda\xC2\xA9\xE2\x89\xA0"); 525 } 526 527 /** ditto */ 528 529 void encode(ref wchar[] s, dchar c) 530 in 531 { 532 assert(isValidDchar(c)); 533 } 534 body 535 { 536 wchar[] r = s; 537 538 if (c <= 0xFFFF) 539 { 540 r ~= cast(wchar) c; 541 } 542 else 543 { 544 wchar[2] buf; 545 546 buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 547 buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00); 548 r ~= buf; 549 } 550 s = r; 551 } 552 553 /** ditto */ 554 void encode(ref dchar[] s, dchar c) 555 in 556 { 557 assert(isValidDchar(c)); 558 } 559 body 560 { 561 s ~= c; 562 } 563 564 /* =================== Validation ======================= */ 565 566 void validate(char[] s) 567 { 568 
size_t len = s.length; 569 size_t i; 570 571 for (i = 0; i < len; ) 572 { 573 decode(s, i); 574 } 575 } 576 577 void validate(wchar[] s) 578 { 579 size_t len = s.length; 580 size_t i; 581 582 for (i = 0; i < len; ) 583 { 584 decode(s, i); 585 } 586 } 587 588 void validate(dchar[] s) 589 { 590 size_t len = s.length; 591 size_t i; 592 593 for (i = 0; i < len; ) 594 { 595 decode(s, i); 596 } 597 } 598 599 /* =================== Conversion to UTF8 ======================= */ 600 601 char[] toUTF8(char[4] buf, dchar c) 602 in 603 { 604 assert(isValidDchar(c)); 605 } 606 body 607 { 608 if (c <= 0x7F) 609 { 610 buf[0] = cast(char) c; 611 return buf[0 .. 1]; 612 } 613 else if (c <= 0x7FF) 614 { 615 buf[0] = cast(char)(0xC0 | (c >> 6)); 616 buf[1] = cast(char)(0x80 | (c & 0x3F)); 617 return buf[0 .. 2]; 618 } 619 else if (c <= 0xFFFF) 620 { 621 buf[0] = cast(char)(0xE0 | (c >> 12)); 622 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 623 buf[2] = cast(char)(0x80 | (c & 0x3F)); 624 return buf[0 .. 3]; 625 } 626 else if (c <= 0x10FFFF) 627 { 628 buf[0] = cast(char)(0xF0 | (c >> 18)); 629 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 630 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 631 buf[3] = cast(char)(0x80 | (c & 0x3F)); 632 return buf[0 .. 4]; 633 } 634 assert(0); 635 } 636 637 /******************* 638 * Encodes string s into UTF-8 and returns the encoded string. 639 */ 640 char[] toUTF8(char[] s) 641 in 642 { 643 validate(s); 644 } 645 body 646 { 647 return s; 648 } 649 650 /** ditto */ 651 char[] toUTF8(in wchar[] s) 652 { 653 char[] r; 654 size_t i; 655 size_t slen = s.length; 656 657 r.length = slen; 658 659 for (i = 0; i < slen; i++) 660 { wchar c = s[i]; 661 662 if (c <= 0x7F) 663 r[i] = cast(char)c; // fast path for ascii 664 else 665 { 666 r.length = i; 667 foreach (dchar c; s[i .. 
slen]) 668 { 669 encode(r, c); 670 } 671 break; 672 } 673 } 674 return r; 675 } 676 677 /** ditto */ 678 char[] toUTF8(in dchar[] s) 679 { 680 char[] r; 681 size_t i; 682 size_t slen = s.length; 683 684 r.length = slen; 685 686 for (i = 0; i < slen; i++) 687 { dchar c = s[i]; 688 689 if (c <= 0x7F) 690 r[i] = cast(char)c; // fast path for ascii 691 else 692 { 693 r.length = i; 694 foreach (dchar d; s[i .. slen]) 695 { 696 encode(r, d); 697 } 698 break; 699 } 700 } 701 return r; 702 } 703 704 /* =================== Conversion to UTF16 ======================= */ 705 706 wchar[] toUTF16(wchar[2] buf, dchar c) 707 in 708 { 709 assert(isValidDchar(c)); 710 } 711 body 712 { 713 if (c <= 0xFFFF) 714 { 715 buf[0] = cast(wchar) c; 716 return buf[0 .. 1]; 717 } 718 else 719 { 720 buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 721 buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00); 722 return buf[0 .. 2]; 723 } 724 } 725 726 /**************** 727 * Encodes string s into UTF-16 and returns the encoded string. 728 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take 729 * an LPWSTR or LPCWSTR argument. 
730 */ 731 wchar[] toUTF16(in char[] s) 732 { 733 wchar[] r; 734 size_t slen = s.length; 735 736 r.length = slen; 737 r.length = 0; 738 for (size_t i = 0; i < slen; ) 739 { 740 dchar c = s[i]; 741 if (c <= 0x7F) 742 { 743 i++; 744 r ~= cast(wchar)c; 745 } 746 else 747 { 748 c = decode(s, i); 749 encode(r, c); 750 } 751 } 752 return r; 753 } 754 755 alias wchar* wptr; 756 /** ditto */ 757 wptr toUTF16z(in char[] s) 758 { 759 wchar[] r; 760 size_t slen = s.length; 761 762 r.length = slen + 1; 763 r.length = 0; 764 for (size_t i = 0; i < slen; ) 765 { 766 dchar c = s[i]; 767 if (c <= 0x7F) 768 { 769 i++; 770 r ~= cast(wchar)c; 771 } 772 else 773 { 774 c = decode(s, i); 775 encode(r, c); 776 } 777 } 778 r ~= "\000"; 779 return r.ptr; 780 } 781 782 /** ditto */ 783 wchar[] toUTF16(wchar[] s) 784 in 785 { 786 validate(s); 787 } 788 body 789 { 790 return s; 791 } 792 793 /** ditto */ 794 wchar[] toUTF16(in dchar[] s) 795 { 796 wchar[] r; 797 size_t slen = s.length; 798 799 r.length = slen; 800 r.length = 0; 801 for (size_t i = 0; i < slen; i++) 802 { 803 encode(r, s[i]); 804 } 805 return r; 806 } 807 808 /* =================== Conversion to UTF32 ======================= */ 809 810 /***** 811 * Encodes string s into UTF-32 and returns the encoded string. 812 */ 813 dchar[] toUTF32(in char[] s) 814 { 815 dchar[] r; 816 size_t slen = s.length; 817 size_t j = 0; 818 819 r.length = slen; // r[] will never be longer than s[] 820 for (size_t i = 0; i < slen; ) 821 { 822 dchar c = s[i]; 823 if (c >= 0x80) 824 c = decode(s, i); 825 else 826 i++; // c is ascii, no need for decode 827 r[j++] = c; 828 } 829 return cast(dchar[])r[0 .. 
j]; 830 } 831 832 /** ditto */ 833 dchar[] toUTF32(in wchar[] s) 834 { 835 dchar[] r; 836 size_t slen = s.length; 837 size_t j = 0; 838 839 r.length = slen; // r[] will never be longer than s[] 840 for (size_t i = 0; i < slen; ) 841 { 842 dchar c = s[i]; 843 if (c >= 0x80) 844 c = decode(s, i); 845 else 846 i++; // c is ascii, no need for decode 847 r[j++] = c; 848 } 849 return r[0 .. j]; 850 } 851 852 /** ditto */ 853 dchar[] toUTF32(dchar[] s) 854 in 855 { 856 validate(s); 857 } 858 body 859 { 860 return s; 861 } 862 863 /* ================================ tests ================================== */ 864 865 unittest 866 { 867 debug(utf) printf("utf.toUTF.unittest\n"); 868 869 char[] c; 870 wchar[] w; 871 dchar[] d; 872 873 c = "hello"; 874 w = toUTF16(c); 875 assert(w == "hello"); 876 d = toUTF32(c); 877 assert(d == "hello"); 878 879 c = toUTF8(w); 880 assert(c == "hello"); 881 d = toUTF32(w); 882 assert(d == "hello"); 883 884 c = toUTF8(d); 885 assert(c == "hello"); 886 w = toUTF16(d); 887 assert(w == "hello"); 888 889 debug(utf) printf("utf.toUTF.unittest\n"); 890 891 c = "hel\u1234o"; 892 w = toUTF16(c); 893 assert(w == "hel\u1234o"); 894 d = toUTF32(c); 895 assert(d == "hel\u1234o"); 896 897 c = toUTF8(w); 898 assert(c == "hel\u1234o"); 899 d = toUTF32(w); 900 assert(d == "hel\u1234o"); 901 902 c = toUTF8(d); 903 assert(c == "hel\u1234o"); 904 w = toUTF16(d); 905 assert(w == "hel\u1234o"); 906 907 debug(utf) printf("utf.toUTF.unittest\n"); 908 909 c = "he\U0010AAAAllo"; 910 w = toUTF16(c); 911 //foreach (wchar c; w) printf("c = x%x\n", c); 912 //foreach (wchar c; cast(wchar[])"he\U0010AAAAllo") printf("c = x%x\n", c); 913 assert(w == "he\U0010AAAAllo"); 914 d = toUTF32(c); 915 assert(d == "he\U0010AAAAllo"); 916 917 c = toUTF8(w); 918 assert(c == "he\U0010AAAAllo"); 919 d = toUTF32(w); 920 assert(d == "he\U0010AAAAllo"); 921 922 c = toUTF8(d); 923 assert(c == "he\U0010AAAAllo"); 924 w = toUTF16(d); 925 assert(w == "he\U0010AAAAllo"); 926 }