1 /******************************************************************************* 2 3 copyright: Copyright (c) 2004 Kris Bell. All rights reserved 4 5 license: BSD style: $(LICENSE) 6 7 version: Apr 2004: Initial release 8 Dec 2006: South Seas version 9 10 author: Kris 11 12 13 Placeholder for a variety of wee functions. These functions are all 14 templated with the intent of being used for arrays of char, wchar, 15 and dchar. However, they operate correctly with other array types 16 also. 17 18 Several of these functions return an index value, representing where 19 some criteria was identified. When said criteria is not matched, the 20 functions return a value representing the array length provided to 21 them. That is, for those scenarios where C functions might typically 22 return -1 these functions return length instead. This operate nicely 23 with D slices: 24 --- 25 auto text = "happy:faces"; 26 27 assert (text[0 .. locate (text, ':')] == "happy"); 28 29 assert (text[0 .. locate (text, '!')] == "happy:faces"); 30 --- 31 32 The contains() function is more convenient for trivial lookup 33 cases: 34 --- 35 if (contains ("fubar", '!')) 36 ... 37 --- 38 39 Note that where some functions expect a size_t as an argument, the 40 D template-matching algorithm will fail where an int is provided 41 instead. This is the typically the cause of "template not found" 42 errors. Also note that name overloading is not supported cleanly 43 by IFTI at this time, so is not applied here. 44 45 46 Applying the D "import alias" mechanism to this module is highly 47 recommended, in order to limit namespace pollution: 48 --- 49 import Util = tango.text.Util; 50 51 auto s = Util.trim (" foo "); 52 --- 53 54 55 Function templates: 56 --- 57 trim (source) // trim whitespace 58 triml (source) // trim whitespace 59 trimr (source) // trim whitespace 60 strip (source, match) // trim elements 61 stripl (source, match) // trim elements 62 stripr (source, match) // trim elements 63 chopl (source, match) // trim pattern match 64 chopr (source, match) // trim pattern match 65 delimit (src, set) // split on delims 66 split (source, pattern) // split on pattern 67 splitLines (source); // split on lines 68 head (source, pattern, tail) // split to head & tail 69 join (source, postfix, output) // join text segments 70 prefix (dst, prefix, content...) // prefix text segments 71 postfix (dst, postfix, content...) // postfix text segments 72 combine (dst, prefix, postfix, content...) // combine lotsa stuff 73 repeat (source, count, output) // repeat source 74 replace (source, match, replacement) // replace chars 75 substitute (source, match, replacement) // replace/remove matches 76 count (source, match) // count instances 77 contains (source, match) // has char? 78 containsPattern (source, match) // has pattern? 79 index (source, match, start) // find match index 80 locate (source, match, start) // find char 81 locatePrior (source, match, start) // find prior char 82 locatePattern (source, match, start); // find pattern 83 locatePatternPrior (source, match, start); // find prior pattern 84 indexOf (s*, match, length) // low-level lookup 85 mismatch (s1*, s2*, length) // low-level compare 86 matching (s1*, s2*, length) // low-level compare 87 isSpace (match) // is whitespace? 88 unescape(source, output) // convert '\' prefixes 89 layout (destination, format ...) // featherweight printf 90 lines (str) // foreach lines 91 quotes (str, set) // foreach quotes 92 delimiters (str, set) // foreach delimiters 93 patterns (str, pattern) // foreach patterns 94 --- 95 96 Please note that any 'pattern' referred to within this module 97 refers to a pattern of characters, and not some kind of regex 98 descriptor. Use the Regex module for regex operation. 99 100 *******************************************************************************/ 101 102 module tango.text.Util; 103 104 /****************************************************************************** 105 106 Trim the provided array by stripping whitespace from both 107 ends. Returns a slice of the original content 108 109 ******************************************************************************/ 110 111 112 T[] trim(T) (T[] source) 113 { 114 auto head = source.ptr; 115 auto tail = head + source.length; 116 117 while (head < tail && isSpace(*head)) 118 ++head; 119 120 while (tail > head && isSpace(*(tail-1))) 121 --tail; 122 123 return head [0 .. tail - head]; 124 } 125 126 /****************************************************************************** 127 128 Trim the provided array by stripping whitespace from the left. 129 Returns a slice of the original content 130 131 ******************************************************************************/ 132 133 T[] triml(T) (T[] source) 134 { 135 auto head = source.ptr; 136 auto tail = head + source.length; 137 138 while (head < tail && isSpace(*head)) 139 ++head; 140 141 return head [0 .. tail - head]; 142 } 143 144 /****************************************************************************** 145 146 Trim the provided array by stripping whitespace from the right. 147 Returns a slice of the original content 148 149 ******************************************************************************/ 150 151 T[] trimr(T) (T[] source) 152 { 153 auto head = source.ptr; 154 auto tail = head + source.length; 155 156 while (tail > head && isSpace(*(tail-1))) 157 --tail; 158 159 return head [0 .. tail - head]; 160 } 161 162 /****************************************************************************** 163 164 Trim the given array by stripping the provided match from 165 both ends. Returns a slice of the original content 166 167 ******************************************************************************/ 168 169 T[] strip(T, S) (T[] source, S match) 170 { 171 auto head = source.ptr; 172 auto tail = head + source.length; 173 174 while (head < tail && *head is match) 175 ++head; 176 177 while (tail > head && *(tail-1) is match) 178 --tail; 179 180 return head [0 .. tail - head]; 181 } 182 183 /****************************************************************************** 184 185 Trim the given array by stripping the provided match from 186 the left hand side. Returns a slice of the original content 187 188 ******************************************************************************/ 189 190 T[] stripl(T, S) (T[] source, S match) 191 { 192 auto head = source.ptr; 193 auto tail = head + source.length; 194 195 while (head < tail && *head is match) 196 ++head; 197 198 return head [0 .. tail - head]; 199 } 200 201 /****************************************************************************** 202 203 Trim the given array by stripping the provided match from 204 the right hand side. Returns a slice of the original content 205 206 ******************************************************************************/ 207 208 T[] stripr(T, S) (T[] source, S match) 209 { 210 auto head = source.ptr; 211 auto tail = head + source.length; 212 213 while (tail > head && *(tail-1) is match) 214 --tail; 215 216 return head [0 .. tail - head]; 217 } 218 219 /****************************************************************************** 220 221 Chop the given source by stripping the provided match from 222 the left hand side. Returns a slice of the original content 223 224 ******************************************************************************/ 225 226 T[] chopl(T, S) (T[] source, S match) 227 { 228 if (match.length <= source.length) 229 if (source[0 .. match.length] == match) 230 source = source [match.length .. $]; 231 232 return source; 233 } 234 235 /****************************************************************************** 236 237 Chop the given source by stripping the provided match from 238 the right hand side. Returns a slice of the original content 239 240 ******************************************************************************/ 241 242 T[] chopr(T, S) (T[] source, S match) 243 { 244 if (match.length <= source.length) 245 if (source[$-match.length .. $] == match) 246 source = source [0 .. $-match.length]; 247 248 return source; 249 } 250 251 /****************************************************************************** 252 253 Replace all instances of one element with another (in place) 254 255 ******************************************************************************/ 256 257 T[] replace(T, S) (T[] source, S match, S replacement) 258 { 259 foreach (ref c; source) 260 if (c is match) 261 c = replacement; 262 return source; 263 } 264 265 /****************************************************************************** 266 267 Substitute all instances of match from source. Set replacement 268 to null in order to remove instead of replace 269 270 ******************************************************************************/ 271 272 T[] substitute(T) (const(T)[] source, const(T)[] match, const(T)[] replacement) 273 { 274 T[] output; 275 276 foreach (s; patterns (source, match, replacement)) 277 output ~= s; 278 return output; 279 } 280 281 /****************************************************************************** 282 283 Count all instances of match within source 284 285 ******************************************************************************/ 286 287 size_t count(T) (const(T)[] source, const(T)[] match) 288 { 289 size_t c; 290 291 foreach (s; patterns (source, match)) 292 ++c; 293 assert(c > 0); 294 return c - 1; 295 } 296 297 /****************************************************************************** 298 299 Returns whether or not the provided array contains an instance 300 of the given match 301 302 ******************************************************************************/ 303 304 bool contains(T) (const(T)[] source, const(T) match) 305 { 306 return indexOf (source.ptr, match, source.length) != source.length; 307 } 308 309 /****************************************************************************** 310 311 Returns whether or not the provided array contains an instance 312 of the given match 313 314 ******************************************************************************/ 315 316 bool containsPattern(T) (const(T)[] source, const(T)[] match) 317 { 318 return locatePattern (source, match) != source.length; 319 } 320 321 /****************************************************************************** 322 323 Return the index of the next instance of 'match' starting at 324 position 'start', or source.length where there is no match. 325 326 Parameter 'start' defaults to 0 327 328 ******************************************************************************/ 329 size_t index(T) (const(T)[] source, const(T)[] match, size_t start=0) 330 { 331 return (match.length is 1) ? locate (source, match[0], start) 332 : locatePattern (source, match, start); 333 } 334 335 /****************************************************************************** 336 337 Return the index of the prior instance of 'match' starting 338 just before 'start', or source.length where there is no match. 339 340 Parameter 'start' defaults to source.length 341 342 ******************************************************************************/ 343 size_t rindex(T) (const(T)[] source, const(T)[] match, size_t start=size_t.max) 344 { 345 return (match.length is 1) ? locatePrior (source, match[0], start) 346 : locatePatternPrior (source, match, start); 347 } 348 349 /****************************************************************************** 350 351 Return the index of the next instance of 'match' starting at 352 position 'start', or source.length where there is no match. 353 354 Parameter 'start' defaults to 0 355 356 ******************************************************************************/ 357 size_t locate(T) (const(T)[] source, const(T) match, size_t start=0) 358 { 359 if (start > source.length) 360 start = source.length; 361 362 return indexOf (source.ptr+start, match, source.length - start) + start; 363 } 364 365 /****************************************************************************** 366 367 Return the index of the prior instance of 'match' starting 368 just before 'start', or source.length where there is no match. 369 370 Parameter 'start' defaults to source.length 371 372 ******************************************************************************/ 373 size_t locatePrior(T) (const(T)[] source, const(T) match, size_t start=size_t.max) 374 { 375 if (start > source.length) 376 start = source.length; 377 378 while (start > 0) 379 if (source[--start] is match) 380 return start; 381 return source.length; 382 } 383 384 /****************************************************************************** 385 386 Return the index of the next instance of 'match' starting at 387 position 'start', or source.length where there is no match. 388 389 Parameter 'start' defaults to 0 390 391 ******************************************************************************/ 392 size_t locatePattern(T) (const(T)[] source, const(T)[] match, size_t start=0) 393 { 394 size_t idx; 395 const(T)* p = source.ptr + start; 396 size_t extent = source.length - start - match.length + 1; 397 398 if (match.length && extent <= source.length) 399 { 400 while (extent) 401 if ((idx = indexOf (p, match[0], extent)) is extent) 402 break; 403 else 404 if (matching (p+=idx, match.ptr, match.length)) 405 return p - source.ptr; 406 else 407 { 408 extent -= (idx+1); 409 ++p; 410 } 411 } 412 return source.length; 413 } 414 415 /****************************************************************************** 416 417 Return the index of the prior instance of 'match' starting 418 just before 'start', or source.length where there is no match. 419 420 Parameter 'start' defaults to source.length 421 422 ******************************************************************************/ 423 size_t locatePatternPrior(T) (const(T)[] source, const(T)[] match, size_t start=size_t.max) 424 { 425 auto len = source.length; 426 427 if (start > len) 428 start = len; 429 430 if (match.length && match.length <= len) 431 while (start) 432 { 433 start = locatePrior (source, match[0], start); 434 if (start is len) 435 break; 436 else 437 if ((start + match.length) <= len) 438 if (matching (source.ptr+start, match.ptr, match.length)) 439 return start; 440 } 441 442 return len; 443 } 444 445 /****************************************************************************** 446 447 Split the provided array on the first pattern instance, and 448 return the resultant head and tail. The pattern is excluded 449 from the two segments. 450 451 Where a segment is not found, tail will be null and the return 452 value will be the original array. 453 454 ******************************************************************************/ 455 456 T[] head(T, S) (T[] src, S[] pattern, out T[] tail) 457 { 458 auto i = locatePattern (src, pattern); 459 if (i != src.length) 460 { 461 tail = src [i + pattern.length .. $]; 462 src = src [0 .. i]; 463 } 464 return src; 465 } 466 467 /****************************************************************************** 468 469 Split the provided array on the last pattern instance, and 470 return the resultant head and tail. The pattern is excluded 471 from the two segments. 472 473 Where a segment is not found, head will be null and the return 474 value will be the original array. 475 476 ******************************************************************************/ 477 478 T[] tail(T, S) (T[] src, S[] pattern, out T[] head) 479 { 480 auto i = locatePatternPrior (src, pattern); 481 if (i != src.length) 482 { 483 head = src [0 .. i]; 484 src = src [i + pattern.length .. $]; 485 } 486 return src; 487 } 488 489 /****************************************************************************** 490 491 Split the provided array wherever a delimiter-set instance is 492 found, and return the resultant segments. The delimiters are 493 excluded from each of the segments. Note that delimiters are 494 matched as a set of alternates rather than as a pattern. 495 496 Splitting on a single delimiter is considerably faster than 497 splitting upon a set of alternatives. 498 499 Note that the src content is not duplicated by this function, 500 but is sliced instead. 501 502 ******************************************************************************/ 503 504 T[][] delimit(T, M) (T[] src, const(M)[] set) 505 { 506 T[][] result; 507 508 foreach (segment; delimiters (src, set)) 509 result ~= segment; 510 return result; 511 } 512 513 /****************************************************************************** 514 515 Split the provided array wherever a pattern instance is 516 found, and return the resultant segments. The pattern is 517 excluded from each of the segments. 518 519 Note that the src content is not duplicated by this function, 520 but is sliced instead. 521 522 ******************************************************************************/ 523 524 inout(T)[][] split(T) (inout(T)[] src, const(T)[] pattern) 525 { 526 const(T)[][] result; 527 528 foreach (segment; patterns (cast(const(T)[])src, pattern)) 529 result ~= segment; 530 return cast(inout(T)[][])result; 531 } 532 533 /****************************************************************************** 534 535 Convert text into a set of lines, where each line is identified 536 by a \n or \r\n combination. The line terminator is stripped from 537 each resultant array 538 539 Note that the src content is not duplicated by this function, but 540 is sliced instead. 541 542 ******************************************************************************/ 543 544 alias toLines splitLines; 545 T[][] toLines(T) (T[] src) 546 { 547 548 T[][] result; 549 550 foreach (line; lines (src)) 551 result ~= line; 552 return result; 553 } 554 555 /****************************************************************************** 556 557 Return the indexed line, where each line is identified by a \n 558 or \r\n combination. The line terminator is stripped from the 559 resultant line 560 561 Note that src content is not duplicated by this function, but 562 is sliced instead. 563 564 ******************************************************************************/ 565 566 T[] lineOf(T) (T[] src, size_t index) 567 { 568 int i = 0; 569 foreach (line; lines (src)) 570 if (i++ is index) 571 return line; 572 return null; 573 } 574 575 /****************************************************************************** 576 577 Combine a series of text segments together, each appended with 578 a postfix pattern. An optional output buffer can be provided to 579 avoid heap activity - it should be large enough to contain the 580 entire output, otherwise the heap will be used instead. 581 582 Returns a valid slice of the output, containing the concatenated 583 text. 584 585 ******************************************************************************/ 586 587 T[] join(T) (const(T[])[] src, const(T)[] postfix=null, T[] dst=null) 588 { 589 return combine!(T) (dst, null, postfix, src); 590 } 591 592 /****************************************************************************** 593 594 Combine a series of text segments together, each prepended with 595 a prefix pattern. An optional output buffer can be provided to 596 avoid heap activity - it should be large enough to contain the 597 entire output, otherwise the heap will be used instead. 598 599 Note that, unlike join(), the output buffer is specified first 600 such that a set of trailing strings can be provided. 601 602 Returns a valid slice of the output, containing the concatenated 603 text. 604 605 ******************************************************************************/ 606 607 T[] prefix(T) (T[] dst, const(T)[] prefix, const(T[])[] src...) 608 { 609 return combine!(T) (dst, prefix, null, src); 610 } 611 612 /****************************************************************************** 613 614 Combine a series of text segments together, each appended with an 615 optional postfix pattern. An optional output buffer can be provided 616 to avoid heap activity - it should be large enough to contain the 617 entire output, otherwise the heap will be used instead. 618 619 Note that, unlike join(), the output buffer is specified first 620 such that a set of trailing strings can be provided. 621 622 Returns a valid slice of the output, containing the concatenated 623 text. 624 625 ******************************************************************************/ 626 627 T[] postfix(T) (T[] dst, const(T)[] postfix, const(T[])[] src...) 628 { 629 return combine!(T) (dst, null, postfix, src); 630 } 631 632 /****************************************************************************** 633 634 Combine a series of text segments together, each prefixed and/or 635 postfixed with optional strings. An optional output buffer can be 636 provided to avoid heap activity - which should be large enough to 637 contain the entire output, otherwise the heap will be used instead. 638 639 Note that, unlike join(), the output buffer is specified first 640 such that a set of trailing strings can be provided. 641 642 Returns a valid slice of the output, containing the concatenated 643 text. 644 645 ******************************************************************************/ 646 647 T[] combine(T) (T[] dst, const(T)[] prefix, const(T)[] postfix, const(T[])[] src ...) 648 { 649 size_t len = src.length * prefix.length + 650 src.length * postfix.length; 651 652 foreach (segment; src) 653 len += segment.length; 654 655 if (dst.length < len) 656 dst.length = len; 657 658 T* p = dst.ptr; 659 foreach (segment; src) 660 { 661 p[0 .. prefix.length] = prefix[]; 662 p += prefix.length; 663 p[0 .. segment.length] = segment[]; 664 p += segment.length; 665 p[0 .. postfix.length] = postfix[]; 666 p += postfix.length; 667 } 668 669 // remove trailing seperator 670 if (len) 671 len -= postfix.length; 672 return dst [0 .. len]; 673 } 674 675 /****************************************************************************** 676 677 Repeat an array for a specific number of times. An optional output 678 buffer can be provided to avoid heap activity - it should be large 679 enough to contain the entire output, otherwise the heap will be used 680 instead. 681 682 Returns a valid slice of the output, containing the concatenated 683 text. 684 685 ******************************************************************************/ 686 687 T[] repeat(T) (const(T)[] src, size_t count, T[] dst=null) 688 { 689 size_t len = src.length * count; 690 if (len is 0) 691 return null; 692 693 if (dst.length < len) 694 dst.length = len; 695 696 for (auto p = dst.ptr; count--; p += src.length) 697 p[0 .. src.length] = src[]; 698 699 return dst [0 .. len]; 700 } 701 702 /****************************************************************************** 703 704 Is the argument a whitespace character? 705 706 ******************************************************************************/ 707 708 bool isSpace(T) (T c) 709 { 710 static if (T.sizeof is 1) 711 return (c <= 32 && (c is ' ' || c is '\t' || c is '\r' || c is '\n' || c is '\f' || c is '\v')); 712 else 713 return (c <= 32 && (c is ' ' || c is '\t' || c is '\r' || c is '\n' || c is '\f' || c is '\v')) || (c is '\u2028' || c is '\u2029'); 714 } 715 716 /****************************************************************************** 717 718 Return whether or not the two arrays have matching content 719 720 ******************************************************************************/ 721 722 bool matching(T) (const(T)* s1, const(T)* s2, size_t length) 723 { 724 return mismatch(s1, s2, length) is length; 725 } 726 727 /****************************************************************************** 728 729 Returns the index of the first match in str, failing once 730 length is reached. Note that we return 'length' for failure 731 and a 0-based index on success 732 733 ******************************************************************************/ 734 735 size_t indexOf(T) (const(T)* str, const(T) match, size_t length) 736 { 737 //assert (str); 738 739 static if (T.sizeof == 1) 740 enum : size_t {m1 = cast(size_t) 0x0101010101010101, 741 m2 = cast(size_t) 0x8080808080808080} 742 static if (T.sizeof == 2) 743 enum : size_t {m1 = cast(size_t) 0x0001000100010001, 744 m2 = cast(size_t) 0x8000800080008000} 745 static if (T.sizeof == 4) 746 enum : size_t {m1 = cast(size_t) 0x0000000100000001, 747 m2 = cast(size_t) 0x8000000080000000} 748 749 static if (T.sizeof < size_t.sizeof) 750 { 751 if (length) 752 { 753 size_t m = match; 754 m += m << (8 * T.sizeof); 755 756 static if (T.sizeof < size_t.sizeof / 2) 757 m += (m << (8 * T.sizeof * 2)); 758 759 static if (T.sizeof < size_t.sizeof / 4) 760 m += (m << (8 * T.sizeof * 4)); 761 762 auto p = str; 763 auto e = p + length - size_t.sizeof/T.sizeof; 764 while (p < e) 765 { 766 // clear matching T segments 767 auto v = (*cast(size_t*) p) ^ m; 768 // test for zero, courtesy of Alan Mycroft 769 if ((v - m1) & ~v & m2) 770 break; 771 p += size_t.sizeof/T.sizeof; 772 } 773 774 e += size_t.sizeof/T.sizeof; 775 while (p < e) 776 if (*p++ is match) 777 return cast(size_t) (p - str - 1); 778 } 779 return length; 780 } 781 else 782 { 783 auto len = length; 784 for (auto p=str-1; len--;) 785 if (*++p is match) 786 return cast(size_t) (p - str); 787 return length; 788 } 789 } 790 791 /****************************************************************************** 792 793 Returns the index of a mismatch between s1 & s2, failing when 794 length is reached. Note that we return 'length' upon failure 795 (array content matches) and a 0-based index upon success. 796 797 Use this as a faster opEquals. Also provides the basis for a 798 faster opCmp, since the index of the first mismatched character 799 can be used to determine the return value 800 801 ******************************************************************************/ 802 803 size_t mismatch(T) (const(T)* s1, const(T)* s2, size_t length) 804 { 805 assert (s1 && s2); 806 807 static if (T.sizeof < size_t.sizeof) 808 { 809 if (length) 810 { 811 auto start = s1; 812 auto e = start + length - size_t.sizeof/T.sizeof; 813 814 while (s1 < e) 815 { 816 if (*cast(size_t*) s1 != *cast(size_t*) s2) 817 break; 818 s1 += size_t.sizeof/T.sizeof; 819 s2 += size_t.sizeof/T.sizeof; 820 } 821 822 e += size_t.sizeof/T.sizeof; 823 while (s1 < e) 824 if (*s1++ != *s2++) 825 return s1 - start - 1; 826 } 827 return length; 828 } 829 else 830 { 831 auto len = length; 832 for (auto p=s1-1; len--;) 833 if (*++p != *s2++) 834 return p - s1; 835 return length; 836 } 837 } 838 839 /****************************************************************************** 840 841 Iterator to isolate lines. 842 843 Converts text into a set of lines, where each line is identified 844 by a \n or \r\n combination. The line terminator is stripped from 845 each resultant array. 846 847 --- 848 foreach (line; lines ("one\ntwo\nthree")) 849 ... 850 --- 851 852 ******************************************************************************/ 853 854 LineFruct!(T) lines(T) (T[] src) 855 { 856 LineFruct!(T) lines; 857 lines.src = src; 858 return lines; 859 } 860 861 /****************************************************************************** 862 863 Iterator to isolate text elements. 864 865 Splits the provided array wherever a delimiter-set instance is 866 found, and return the resultant segments. The delimiters are 867 excluded from each of the segments. Note that delimiters are 868 matched as a set of alternates rather than as a pattern. 869 870 Splitting on a single delimiter is considerably faster than 871 splitting upon a set of alternatives. 872 873 --- 874 foreach (segment; delimiters ("one,two;three", ",;")) 875 ... 876 --- 877 878 ******************************************************************************/ 879 880 DelimFruct!(T, M) delimiters(T, M) (T[] src, const(M)[] set) 881 { 882 DelimFruct!(T, M) elements; 883 elements.set = set; 884 elements.src = src; 885 return elements; 886 } 887 888 /****************************************************************************** 889 890 Iterator to isolate text elements. 891 892 Split the provided array wherever a pattern instance is found, 893 and return the resultant segments. Pattern are excluded from 894 each of the segments, and an optional sub argument enables 895 replacement. 896 897 --- 898 foreach (segment; patterns ("one, two, three", ", ")) 899 ... 900 --- 901 902 ******************************************************************************/ 903 904 PatternFruct!(T) patterns(T) (const(T)[] src, const(T)[] pattern, const(T)[] sub=null) 905 { 906 PatternFruct!(T) elements; 907 elements.pattern = pattern; 908 elements.sub = sub; 909 elements.src = src; 910 return elements; 911 } 912 913 /****************************************************************************** 914 915 Iterator to isolate optionally quoted text elements. 916 917 As per elements(), but with the extension of being quote-aware; 918 the set of delimiters is ignored inside a pair of quotes. Note 919 that an unterminated quote will consume remaining content. 920 921 --- 922 foreach (quote; quotes ("one two 'three four' five", " ")) 923 ... 924 --- 925 926 ******************************************************************************/ 927 928 QuoteFruct!(T, M) quotes(T, M) (T[] src, const(M)[] set) 929 { 930 QuoteFruct!(T, M) quotes; 931 quotes.set = set; 932 quotes.src = src; 933 return quotes; 934 } 935 936 /******************************************************************************* 937 938 Arranges text strings in order, using indices to specify where 939 each particular argument should be positioned within the text. 940 This is handy for collating I18N components, or as a simplistic 941 and lightweight formatter. Indices range from zero through nine. 942 943 --- 944 // write ordered text to the console 945 char[64] tmp; 946 947 Cout (layout (tmp, "%1 is after %0", "zero", "one")).newline; 948 --- 949 950 *******************************************************************************/ 951 952 T[] layout(T) (T[] output, const(T[])[] layout ...) 953 { 954 const(T)[] badarg = cast(const(T)[])"{index out of range}"; 955 const(T)[] toosmall = cast(const(T)[])"{output buffer too small}"; 956 957 size_t pos, 958 args; 959 bool state; 960 961 args = layout.length - 1; 962 foreach (c; layout[0]) 963 { 964 if (state) 965 { 966 state = false; 967 if (c >= '0' && c <= '9') 968 { 969 size_t index = c - '0'; 970 if (index < args) 971 { 972 const(T)[] x = layout[index+1]; 973 974 size_t limit = pos + x.length; 975 if (limit < output.length) 976 { 977 output [pos .. limit] = x[]; 978 pos = limit; 979 continue; 980 } 981 else 982 return toosmall.dup; 983 } 984 else 985 return badarg.dup; 986 } 987 } 988 else 989 if (c is '%') 990 { 991 state = true; 992 continue; 993 } 994 995 if (pos < output.length) 996 { 997 output[pos] = c; 998 ++pos; 999 } 1000 else 1001 return toosmall.dup; 1002 } 1003 1004 return output [0..pos]; 1005 } 1006 1007 /****************************************************************************** 1008 1009 Convert 'escaped' chars to normal ones: \t => ^t for example. 1010 Supports \" \' \\ \a \b \f \n \r \t \v 1011 1012 ******************************************************************************/ 1013 1014 inout(T)[] unescape(T) (inout(T)[] src, T[] dst = null) 1015 { 1016 size_t delta; 1017 auto s = src.ptr; 1018 auto len = src.length; 1019 1020 // take a peek first to see if there's anything 1021 if ((delta = indexOf (s, '\\', len)) < len) 1022 { 1023 // make some room if not enough provided 1024 if (dst.length < src.length) 1025 dst.length = src.length; 1026 auto d = dst.ptr; 1027 1028 // copy segments over, a chunk at a time 1029 do { 1030 d [0 .. delta] = s [0 .. delta]; 1031 len -= delta; 1032 s += delta; 1033 d += delta; 1034 1035 // bogus trailing '\' 1036 if (len < 2) 1037 { 1038 *d++ = '\\'; 1039 len = 0; 1040 break; 1041 } 1042 1043 // translate \char 1044 T c = s[1]; 1045 switch (c) 1046 { 1047 case '\\': 1048 break; 1049 case '\'': 1050 c = '\''; 1051 break; 1052 case '"': 1053 c = '"'; 1054 break; 1055 case 'a': 1056 c = '\a'; 1057 break; 1058 case 'b': 1059 c = '\b'; 1060 break; 1061 case 'f': 1062 c = '\f'; 1063 break; 1064 case 'n': 1065 c = '\n'; 1066 break; 1067 case 'r': 1068 c = '\r'; 1069 break; 1070 case 't': 1071 c = '\t'; 1072 break; 1073 case 'v': 1074 c = '\v'; 1075 break; 1076 default: 1077 *d++ = '\\'; 1078 } 1079 *d++ = c; 1080 len -= 2; 1081 s += 2; 1082 } while ((delta = indexOf (s, '\\', len)) < len); 1083 1084 // copy tail too 1085 d [0 .. len] = s [0 .. len]; 1086 return cast(inout)(dst [0 .. (d + len) - dst.ptr]); 1087 } 1088 return src; 1089 } 1090 1091 1092 /****************************************************************************** 1093 1094 jhash() -- hash a variable-length key into a 32-bit value 1095 1096 k : the key (the unaligned variable-length array of bytes) 1097 len : the length of the key, counting by bytes 1098 level : can be any 4-byte value 1099 1100 Returns a 32-bit value. Every bit of the key affects every bit of 1101 the return value. Every 1-bit and 2-bit delta achieves avalanche. 1102 1103 About 4.3*len + 80 X86 instructions, with excellent pipelining 1104 1105 The best hash table sizes are powers of 2. There is no need to do 1106 mod a prime (mod is sooo slow!). If you need less than 32 bits, 1107 use a bitmask. For example, if you need only 10 bits, do 1108 1109 h = (h & hashmask(10)); 1110 1111 In which case, the hash table should have hashsize(10) elements. 1112 If you are hashing n strings (ub1 **)k, do it like this: 1113 1114 for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h); 1115 1116 By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use 1117 this code any way you wish, private, educational, or commercial. 1118 It's free. 1119 1120 See http://burtleburtle.net/bob/hash/evahash.html 1121 Use for hash table lookup, or anything where one collision in 2^32 1122 is acceptable. Do NOT use for cryptographic purposes. 1123 1124 ******************************************************************************/ 1125 1126 @trusted nothrow pure 1127 size_t jhash (const(ubyte)* k, size_t len, size_t c = 0) 1128 { 1129 size_t a = 0x9e3779b9, 1130 b = 0x9e3779b9, 1131 i = len; 1132 1133 // handle most of the key 1134 while (i >= 12) 1135 { 1136 a += *cast(uint *)(k+0); 1137 b += *cast(uint *)(k+4); 1138 c += *cast(uint *)(k+8); 1139 1140 a -= b; a -= c; a ^= (c>>13); 1141 b -= c; b -= a; b ^= (a<<8); 1142 c -= a; c -= b; c ^= (b>>13); 1143 a -= b; a -= c; a ^= (c>>12); 1144 b -= c; b -= a; b ^= (a<<16); 1145 c -= a; c -= b; c ^= (b>>5); 1146 a -= b; a -= c; a ^= (c>>3); 1147 b -= c; b -= a; b ^= (a<<10); 1148 c -= a; c -= b; c ^= (b>>15); 1149 k += 12; i -= 12; 1150 } 1151 1152 // handle the last 11 bytes 1153 c += len; 1154 switch (i) 1155 { 1156 case 11: c+=(cast(uint)k[10]<<24); goto case; 1157 case 10: c+=(cast(uint)k[9]<<16); goto case; 1158 case 9 : c+=(cast(uint)k[8]<<8); goto case; 1159 case 8 : b+=(cast(uint)k[7]<<24); goto case; 1160 case 7 : b+=(cast(uint)k[6]<<16); goto case; 1161 case 6 : b+=(cast(uint)k[5]<<8); goto case; 1162 case 5 : b+=(cast(uint)k[4]); goto case; 1163 case 4 : a+=(cast(uint)k[3]<<24); goto case; 1164 case 3 : a+=(cast(uint)k[2]<<16); goto case; 1165 case 2 : a+=(cast(uint)k[1]<<8); goto case; 1166 case 1 : a+=(cast(uint)k[0]); goto default; 1167 default: 1168 } 1169 1170 a -= b; a -= c; a ^= (c>>13); 1171 b -= c; b -= a; b ^= (a<<8); 1172 c -= a; c -= b; c ^= (b>>13); 1173 a -= b; a -= c; a ^= (c>>12); 1174 b -= c; b -= a; b ^= (a<<16); 1175 c -= a; c -= b; c ^= (b>>5); 1176 a -= b; a -= c; a ^= (c>>3); 1177 b -= c; b -= a; b ^= (a<<10); 1178 c -= a; c -= b; c ^= (b>>15); 1179 1180 return c; 1181 } 1182 1183 /// ditto 1184 @trusted nothrow pure 1185 size_t jhash (const(void)[] x, size_t c = 0) 1186 { 1187 return jhash (cast(ubyte*) x.ptr, x.length, c); 1188 } 1189 1190 1191 /****************************************************************************** 1192 1193 Helper fruct for iterator lines(). A fruct is a low 1194 impact mechanism for capturing context relating to an 1195 opApply (conjunction of the names struct and foreach) 1196 1197 ******************************************************************************/ 1198 1199 private struct LineFruct(T) 1200 { 1201 private T[] src; 1202 1203 int opApply (scope int delegate (ref T[] line) dg) 1204 { 1205 int ret; 1206 size_t pos, 1207 mark; 1208 T[] line; 1209 1210 enum T nl = '\n'; 1211 enum T cr = '\r'; 1212 1213 while ((pos = locate (src, nl, mark)) < src.length) 1214 { 1215 auto end = pos; 1216 if (end && src[end-1] is cr) 1217 --end; 1218 1219 line = src [mark .. end]; 1220 if ((ret = dg (line)) != 0) 1221 return ret; 1222 mark = pos + 1; 1223 } 1224 1225 line = src [mark .. $]; 1226 if (mark <= src.length) 1227 ret = dg (line); 1228 1229 return ret; 1230 } 1231 } 1232 1233 /****************************************************************************** 1234 1235 Helper fruct for iterator delims(). A fruct is a low 1236 impact mechanism for capturing context relating to an 1237 opApply (conjunction of the names struct and foreach) 1238 1239 ******************************************************************************/ 1240 1241 private struct DelimFruct(T, M) 1242 { 1243 private T[] src; 1244 private const(M)[] set; 1245 1246 int opApply (scope int delegate (ref T[] token) dg) 1247 { 1248 int ret; 1249 size_t pos, 1250 mark; 1251 T[] token; 1252 1253 // optimize for single delimiter case 1254 if (set.length is 1) 1255 while ((pos = locate (src, set[0], mark)) < src.length) 1256 { 1257 token = src [mark .. pos]; 1258 if ((ret = dg (token)) != 0) 1259 return ret; 1260 mark = pos + 1; 1261 } 1262 else 1263 if (set.length > 1) 1264 foreach (i, elem; src) 1265 if (contains (set, elem)) 1266 { 1267 token = src [mark .. i]; 1268 if ((ret = dg (token)) != 0) 1269 return ret; 1270 mark = i + 1; 1271 } 1272 1273 token = src [mark .. $]; 1274 if (mark <= src.length) 1275 ret = dg (token); 1276 1277 return ret; 1278 } 1279 } 1280 1281 /****************************************************************************** 1282 1283 Helper fruct for iterator patterns(). A fruct is a low 1284 impact mechanism for capturing context relating to an 1285 opApply (conjunction of the names struct and foreach) 1286 1287 ******************************************************************************/ 1288 1289 private struct PatternFruct(T) 1290 { 1291 private const(T)[] src, 1292 sub, 1293 pattern; 1294 1295 int opApply (scope int delegate (ref const(T)[] token) dg) 1296 { 1297 int ret; 1298 size_t pos, 1299 mark; 1300 const(T)[] token; 1301 1302 while ((pos = index (src, pattern, mark)) < src.length) 1303 { 1304 token = src [mark .. pos]; 1305 if ((ret = dg(token)) != 0) 1306 return ret; 1307 if (sub.ptr && (ret = dg(sub)) != 0) 1308 return ret; 1309 mark = pos + pattern.length; 1310 } 1311 1312 token = src [mark .. $]; 1313 if (mark <= src.length) 1314 ret = dg (token); 1315 1316 return ret; 1317 } 1318 } 1319 1320 /****************************************************************************** 1321 1322 Helper fruct for iterator quotes(). A fruct is a low 1323 impact mechanism for capturing context relating to an 1324 opApply (conjunction of the names struct and foreach) 1325 1326 ******************************************************************************/ 1327 1328 private struct QuoteFruct(T, M) 1329 { 1330 private T[] src; 1331 private const(M)[] set; 1332 1333 int opApply (scope int delegate (ref const(T)[] token) dg) 1334 { 1335 int ret; 1336 size_t mark; 1337 const(T)[] token; 1338 1339 if (set.length) 1340 for (size_t i=0; i < src.length; ++i) 1341 { 1342 T c = src[i]; 1343 if (c is '"' || c is '\'') 1344 i = locate (src, c, i+1); 1345 else 1346 if (contains (set, c)) 1347 { 1348 token = src [mark .. i]; 1349 if ((ret = dg (token)) != 0) 1350 return ret; 1351 mark = i + 1; 1352 } 1353 } 1354 1355 token = src [mark .. $]; 1356 if (mark <= src.length) 1357 ret = dg (token); 1358 1359 return ret; 1360 } 1361 } 1362 1363 1364 /****************************************************************************** 1365 1366 ******************************************************************************/ 1367 1368 debug (UnitTest) 1369 { 1370 unittest 1371 { 1372 char[64] tmp; 1373 1374 assert (isSpace (' ') && !isSpace ('d')); 1375 1376 assert (indexOf ("abc".ptr, 'a', 3) is 0); 1377 assert (indexOf ("abc".ptr, 'b', 3) is 1); 1378 assert (indexOf ("abc".ptr, 'c', 3) is 2); 1379 assert (indexOf ("abc".ptr, 'd', 3) is 3); 1380 assert (indexOf ("abcabcabc".ptr, 'd', 9) is 9); 1381 1382 assert (indexOf ("abc"d.ptr, cast(dchar)'c', 3) is 2); 1383 assert (indexOf ("abc"d.ptr, cast(dchar)'d', 3) is 3); 1384 1385 assert (indexOf ("abc"w.ptr, cast(wchar)'c', 3) is 2); 1386 assert (indexOf ("abc"w.ptr, cast(wchar)'d', 3) is 3); 1387 assert (indexOf ("abcdefghijklmnopqrstuvwxyz"w.ptr, cast(wchar)'x', 25) is 23); 1388 1389 assert (mismatch ("abc".ptr, "abc".ptr, 3) is 3); 1390 assert (mismatch ("abc".ptr, "abd".ptr, 3) is 2); 1391 assert (mismatch ("abc".ptr, "acc".ptr, 3) is 1); 1392 assert (mismatch ("abc".ptr, "ccc".ptr, 3) is 0); 1393 1394 assert (mismatch ("abc"w.ptr, "abc"w.ptr, 3) is 3); 1395 assert (mismatch ("abc"w.ptr, "acc"w.ptr, 3) is 1); 1396 1397 assert (mismatch ("abc"d.ptr, "abc"d.ptr, 3) is 3); 1398 assert (mismatch ("abc"d.ptr, "acc"d.ptr, 3) is 1); 1399 1400 assert (matching ("abc".ptr, "abc".ptr, 3)); 1401 assert (matching ("abc".ptr, "abb".ptr, 3) is false); 1402 1403 assert (contains ("abc", 'a')); 1404 assert (contains ("abc", 'b')); 1405 assert (contains ("abc", 'c')); 1406 assert (contains ("abc", 'd') is false); 1407 1408 assert (containsPattern ("abc", "ab")); 1409 assert (containsPattern ("abc", "bc")); 1410 assert (containsPattern ("abc", "abc")); 1411 assert (containsPattern ("abc", "zabc") is false); 1412 assert (containsPattern ("abc", "abcd") is false); 1413 assert (containsPattern ("abc", "za") is false); 1414 assert (containsPattern ("abc", "cd") is false); 1415 1416 assert (trim ("") == ""); 1417 assert (trim (" abc ") == "abc"); 1418 assert (trim (" ") == ""); 1419 1420 assert (strip ("", '%') == ""); 1421 assert (strip ("%abc%%%", '%') == "abc"); 1422 assert (strip ("#####", '#') == ""); 1423 assert (stripl ("#####", '#') == ""); 1424 assert (stripl (" ###", ' ') == "###"); 1425 assert (stripl ("#####", 's') == "#####"); 1426 assert (stripr ("#####", '#') == ""); 1427 assert (stripr ("### ", ' ') == "###"); 1428 assert (stripr ("#####", 's') == "#####"); 1429 1430 assert (replace ("abc".dup, 'b', ':') == "a:c"); 1431 assert (substitute ("abc".dup, "bc", "x") == "ax"); 1432 1433 assert (locate ("abc", 'c', 1) is 2); 1434 1435 assert (locate ("abc", 'c') is 2); 1436 assert (locate ("abc", 'a') is 0); 1437 assert (locate ("abc", 'd') is 3); 1438 assert (locate ("", 'c') is 0); 1439 1440 assert (locatePrior ("abce", 'c') is 2); 1441 assert (locatePrior ("abce", 'a') is 0); 1442 assert (locatePrior ("abce", 'd') is 4); 1443 assert (locatePrior ("abce", 'c', 3) is 2); 1444 assert (locatePrior ("abce", 'c', 2) is 4); 1445 assert (locatePrior ("", 'c') is 0); 1446 1447 auto x = delimit ("::b", ":"); 1448 assert (x.length is 3 && x[0] == "" && x[1] == "" && x[2] == "b"); 1449 x = delimit ("a:bc:d", ":"); 1450 assert (x.length is 3 && x[0] == "a" && x[1] == "bc" && x[2] == "d"); 1451 x = delimit ("abcd", ":"); 1452 assert (x.length is 1 && x[0] == "abcd"); 1453 x = delimit ("abcd:", ":"); 1454 assert (x.length is 2 && x[0] == "abcd" && x[1] == ""); 1455 x = delimit ("a;b$c#d:e@f", ";:$#@"); 1456 assert (x.length is 6 && x[0]=="a" && x[1]=="b" && x[2]=="c" && 1457 x[3]=="d" && x[4]=="e" && x[5]=="f"); 1458 1459 assert (locatePattern ("abcdefg", "") is 7); 1460 assert (locatePattern ("abcdefg", "g") is 6); 1461 assert (locatePattern ("abcdefg", "abcdefg") is 0); 1462 assert (locatePattern ("abcdefg", "abcdefgx") is 7); 1463 assert (locatePattern ("abcdefg", "cce") is 7); 1464 assert (locatePattern ("abcdefg", "cde") is 2); 1465 assert (locatePattern ("abcdefgcde", "cde", 3) is 7); 1466 1467 assert (locatePatternPrior ("abcdefg", "") is 7); 1468 assert (locatePatternPrior ("abcdefg", "cce") is 7); 1469 assert (locatePatternPrior ("abcdefg", "cde") is 2); 1470 assert (locatePatternPrior ("abcdefgcde", "cde", 6) is 2); 1471 assert (locatePatternPrior ("abcdefgcde", "cde", 4) is 2); 1472 assert (locatePatternPrior ("abcdefg", "abcdefgx") is 7); 1473 1474 x = splitLines ("a\nb\n"); 1475 assert (x.length is 3 && x[0] == "a" && x[1] == "b" && x[2] == ""); 1476 x = splitLines ("a\r\n"); 1477 assert (x.length is 2 && x[0] == "a" && x[1] == ""); 1478 1479 x = splitLines ("a"); 1480 assert (x.length is 1 && x[0] == "a"); 1481 x = splitLines (""); 1482 assert (x.length is 1); 1483 1484 const(char)[][] q; 1485 foreach (element; quotes ("1 'avcc cc ' 3", " ")) 1486 q ~= element; 1487 assert (q.length is 3 && q[0] == "1" && q[1] == "'avcc cc '" && q[2] == "3"); 1488 1489 assert (layout (tmp, "%1,%%%c %0", "abc", "efg") == "efg,%c abc"); 1490 1491 x = split ("one, two, three", ","); 1492 assert (x.length is 3 && x[0] == "one" && x[1] == " two" && x[2] == " three"); 1493 x = split ("one, two, three", ", "); 1494 assert (x.length is 3 && x[0] == "one" && x[1] == "two" && x[2] == "three"); 1495 x = split ("one, two, three", ",,"); 1496 assert (x.length is 1 && x[0] == "one, two, three"); 1497 x = split ("one,,", ","); 1498 assert (x.length is 3 && x[0] == "one" && x[1] == "" && x[2] == ""); 1499 1500 immutable(char)[] h, t; 1501 h = head ("one:two:three", ":", t); 1502 assert (h == "one" && t == "two:three"); 1503 h = head ("one:::two:three", ":::", t); 1504 assert (h == "one" && t == "two:three"); 1505 h = head ("one:two:three", "*", t); 1506 assert (h == "one:two:three" && t is null); 1507 1508 t = tail ("one:two:three", ":", h); 1509 assert (h == "one:two" && t == "three"); 1510 t = tail ("one:::two:three", ":::", h); 1511 assert (h == "one" && t == "two:three"); 1512 t = tail ("one:two:three", "*", h); 1513 assert (t == "one:two:three" && h is null); 1514 1515 assert (chopl("hello world", "hello ") == "world"); 1516 assert (chopl("hello", "hello") == ""); 1517 assert (chopl("hello world", " ") == "hello world"); 1518 assert (chopl("hello world", "") == "hello world"); 1519 1520 assert (chopr("hello world", " world") == "hello"); 1521 assert (chopr("hello", "hello") == ""); 1522 assert (chopr("hello world", " ") == "hello world"); 1523 assert (chopr("hello world", "") == "hello world"); 1524 1525 const(char)[][] foo = ["one", "two", "three"]; 1526 auto j = join (foo); 1527 assert (j == "onetwothree"); 1528 j = join (foo, ", "); 1529 assert (j == "one, two, three"); 1530 j = join (foo, " ", tmp); 1531 assert (j == "one two three"); 1532 assert (j.ptr is tmp.ptr); 1533 1534 assert (repeat ("abc", 0) == ""); 1535 assert (repeat ("abc", 1) == "abc"); 1536 assert (repeat ("abc", 2) == "abcabc"); 1537 assert (repeat ("abc", 4) == "abcabcabcabc"); 1538 assert (repeat ("", 4) == ""); 1539 char[10] rep; 1540 assert (repeat ("abc", 0, rep) == ""); 1541 assert (repeat ("abc", 1, rep) == "abc"); 1542 assert (repeat ("abc", 2, rep) == "abcabc"); 1543 assert (repeat ("", 4, rep) == ""); 1544 1545 assert (unescape ("abc") == "abc"); 1546 assert (unescape ("abc\\") == "abc\\"); 1547 assert (unescape ("abc\\t") == "abc\t"); 1548 assert (unescape ("abc\\tc") == "abc\tc"); 1549 assert (unescape ("\\t") == "\t"); 1550 assert (unescape ("\\tx") == "\tx"); 1551 assert (unescape ("\\v\\vx") == "\v\vx"); 1552 assert (unescape ("abc\\t\\a\\bc") == "abc\t\a\bc"); 1553 } 1554 } 1555 1556 1557 1558 debug (Util) 1559 { 1560 auto x = import("Util.d"); 1561 1562 void main() 1563 { 1564 mismatch ("".ptr, S(x).ptr, 0); 1565 indexOf ("".ptr, '@', 0); 1566 char[] s; 1567 split (s, " "); 1568 //indexOf (s.ptr, '@', 0); 1569 1570 } 1571 }