1 /******************************************************************************* 2 3 copyright: Copyright (c) 2007-2008 Jascha Wetzel. All rights reserved. 4 5 license: BSD style: $(LICENSE) 6 7 version: Initial release: Jan 2008 8 9 authors: Jascha Wetzel 10 11 This is a regular expression compiler and interpreter based on the Tagged NFA/DFA method. 12 13 The Regex class is not thread safe 14 15 See <a href="http://en.wikipedia.org/wiki/Regular_expression">Wikpedia's article on regular expressions</a> 16 for details on regular expressions in general. 17 18 The used method implies, that the expressions are <i>regular</i>, in the way language theory defines it, 19 as opposed to what "regular expression" means in most implementations 20 (e.g. PCRE or those from the standard libraries of Perl, Java or Python). 21 The advantage of this method is it's performance, it's disadvantage is the inability to realize some features 22 that Perl-like regular expressions have (e.g. back-references). 23 See <a href="http://swtch.com/~rsc/regexp/regexp1.html">"Regular Expression Matching Can Be Simple And Fast"</a> 24 for details on the differences. 25 26 The time for matching a regular expression against an input string of length N is in O(M*N), where M depends on the 27 number of matching brackets and the complexity of the expression. That is, M is constant wrt. the input 28 and therefore matching is a linear-time process. 29 30 The syntax of a regular expressions is as follows. 31 <i>X</i> and <i>Y</i> stand for an arbitrary regular expression. 32 33 <table border=1 cellspacing=0 cellpadding=5> 34 <caption>Operators</caption> 35 $(TR $(TD X|Y) $(TD alternation, i.e. X or Y) ) 36 $(TR $(TD (X)) $(TD matching brackets - creates a sub-match) ) 37 $(TR $(TD (?X)) $(TD non-matching brackets - only groups X, no sub-match is created) ) 38 $(TR $(TD [Z]) $(TD character class specification, Z is a string of characters or character ranges, e.g. [a-zA-Z0-9_.\-]) ) 39 $(TR $(TD [^Z]) $(TD negated character class specification) ) 40 $(TR $(TD <X) $(TD lookbehind, X may be a single character or a character class) ) 41 $(TR $(TD >X) $(TD lookahead, X may be a single character or a character class) ) 42 $(TR $(TD ^) $(TD start of input or start of line) ) 43 $(TR $(TD $) $(TD end of input or end of line) ) 44 $(TR $(TD \b) $(TD start or end of word, equals (?<\s>\S|<\S>\s)) ) 45 $(TR $(TD \B) $(TD opposite of \b, equals (?<\S>\S|<\s>\s)) ) 46 </table> 47 48 <table border=1 cellspacing=0 cellpadding=5> 49 <caption>Quantifiers</caption> 50 $(TR $(TD X?) $(TD zero or one) ) 51 $(TR $(TD X*) $(TD zero or more) ) 52 $(TR $(TD X+) $(TD one or more) ) 53 $(TR $(TD X{n,m}) $(TD at least n, at most m instances of X.<br>If n is missing, it's set to 0.<br>If m is missing, it is set to infinity.) ) 54 $(TR $(TD X??) $(TD non-greedy version of the above operators) ) 55 $(TR $(TD X*?) $(TD see above) ) 56 $(TR $(TD X+?) $(TD see above) ) 57 $(TR $(TD X{n,m}?) $(TD see above) ) 58 </table> 59 60 <table border=1 cellspacing=0 cellpadding=5> 61 <caption>Pre-defined character classes</caption> 62 $(TR $(TD .) $(TD any printable character) ) 63 $(TR $(TD \s) $(TD whitespace) ) 64 $(TR $(TD \S) $(TD non-whitespace) ) 65 $(TR $(TD \w) $(TD alpha-numeric characters or underscore) ) 66 $(TR $(TD \W) $(TD opposite of \w) ) 67 $(TR $(TD \d) $(TD digits) ) 68 $(TR $(TD \D) $(TD non-digit) ) 69 </table> 70 71 Note that "alphanumeric" only applies to Latin-1. 72 *******************************************************************************/ 73 module tango.text.Regex; 74 75 static import tango.core.Array; 76 77 static import tango.text.convert.Utf; 78 79 debug(TangoRegex) import tango.io.Stdout; 80 81 /* ***************************************************************************** 82 A simple pair 83 *******************************************************************************/ 84 private struct Pair(T) 85 { 86 static Pair opCall(T a, T b) 87 { 88 Pair p; 89 p.a = a; 90 p.b = b; 91 return p; 92 } 93 94 union 95 { 96 struct { 97 T first, second; 98 } 99 struct { 100 T a, b; 101 } 102 } 103 } 104 105 /* ***************************************************************************** 106 Double linked list 107 *******************************************************************************/ 108 private class List(T) 109 { 110 class Element 111 { 112 T value; 113 Element prev, 114 next; 115 116 this(T v) 117 { 118 value = v; 119 } 120 } 121 122 size_t len; 123 Element head, 124 tail; 125 126 List // opCatAssign(T v) 127 opOpAssign(string op)(T v) if (op == "~") 128 { 129 if ( tail is null ) 130 head = tail = new Element(v); 131 else { 132 tail.next = new Element(v); 133 tail.next.prev = tail; 134 tail = tail.next; 135 } 136 ++len; 137 return this; 138 } 139 140 List insertAfter(T w, T v) 141 { 142 foreach ( e; &this.elements ) 143 { 144 if ( e.value is w ) 145 return insertAfter(e, v); 146 } 147 return null; 148 } 149 150 List insertAfter(Element e, T v) 151 { 152 auto tmp = new Element(v); 153 tmp.prev = e; 154 tmp.next = e.next; 155 e.next.prev = tmp; 156 e.next = tmp; 157 if ( e is tail ) 158 tail = tmp; 159 ++len; 160 return this; 161 } 162 163 List // opCatAssign(List l) 164 opOpAssign(string op)(List l) if (op == "~") 165 { 166 if ( l.empty() ) 167 return this; 168 if ( tail is null ) { 169 head = l.head; 170 tail = l.tail; 171 } 172 else { 173 tail.next = l.head; 174 tail.next.prev = tail; 175 tail = l.tail; 176 } 177 len += l.len; 178 return this; 179 } 180 181 List pushFront(T v) 182 { 183 if ( head is null ) 184 head = tail = new Element(v); 185 else 186 { 187 head.prev = new Element(v); 188 head.prev.next = head; 189 head = head.prev; 190 } 191 ++len; 192 return this; 193 } 194 195 List insertBefore(T w, T v) 196 { 197 foreach ( e; &this.elements ) 198 { 199 if ( e.value is w ) 200 return insertBefore(e, v); 201 } 202 return null; 203 } 204 205 List insertBefore(Element e, T v) 206 { 207 auto tmp = new Element(v); 208 tmp.prev = e.prev; 209 tmp.next = e; 210 e.prev.next = tmp; 211 e.prev = tmp; 212 if ( e is head ) 213 head = tmp; 214 ++len; 215 return this; 216 } 217 218 List pushFront(List l) 219 { 220 if ( l.empty() ) 221 return this; 222 if ( head is null ) { 223 head = l.head; 224 tail = l.tail; 225 } 226 else { 227 head.prev = l.tail; 228 head.prev.next = head; 229 head = l.head; 230 } 231 len += l.len; 232 return this; 233 } 234 235 size_t length() 236 { 237 return len; 238 } 239 240 bool empty() 241 { 242 return head is null; 243 } 244 245 void clear() 246 { 247 head = null; 248 tail = null; 249 len = 0; 250 } 251 252 void pop() 253 { 254 remove(tail); 255 } 256 257 void remove(Element e) 258 { 259 if ( e is null ) 260 return; 261 if ( e.prev is null ) 262 head = e.next; 263 else 264 e.prev.next = e.next; 265 if ( e.next is null ) 266 tail = e.prev; 267 else 268 e.next.prev = e.prev; 269 --len; 270 } 271 272 int elements(int delegate(ref Element) dg) 273 { 274 for ( Element e=head; e !is null; e = e.next ) 275 { 276 int ret = dg(e); 277 if ( ret ) 278 return ret; 279 } 280 return 0; 281 } 282 283 int elements_reverse(int delegate(ref Element) dg) 284 { 285 for ( Element e=tail; e !is null; e = e.prev ) 286 { 287 int ret = dg(e); 288 if ( ret ) 289 return ret; 290 } 291 return 0; 292 } 293 294 int opApply(int delegate(ref T) dg) 295 { 296 for ( Element e=head; e !is null; e = e.next ) 297 { 298 int ret = dg(e.value); 299 if ( ret ) 300 return ret; 301 } 302 return 0; 303 } 304 305 int opApplyReverse(int delegate(ref T) dg) 306 { 307 for ( Element e=tail; e !is null; e = e.prev ) 308 { 309 int ret = dg(e.value); 310 if ( ret ) 311 return ret; 312 } 313 return 0; 314 } 315 } 316 317 /* ***************************************************************************** 318 Stack based on dynamic array 319 *******************************************************************************/ 320 private struct Stack(T) 321 { 322 size_t _top; 323 T[] stack; 324 325 void push(T v) 326 { 327 if ( _top >= stack.length ) 328 stack.length = stack.length*2+1; 329 stack[_top] = v; 330 ++_top; 331 } 332 // alias push opCatAssign; 333 334 void // opCatAssign(T[] vs) 335 opOpAssign(string op)(T[] vs) if (op == "~") 336 { 337 size_t end = _top+vs.length; 338 if ( end > stack.length ) 339 stack.length = end*2; 340 stack[_top..end] = vs[]; 341 _top = end; 342 } 343 344 void pop(size_t num) 345 { 346 assert(_top>=num); 347 _top -= num; 348 } 349 350 T pop() 351 { 352 assert(_top>0); 353 return stack[--_top]; 354 } 355 356 T top() 357 { 358 assert(_top>0); 359 return stack[_top-1]; 360 } 361 362 T* topPtr() 363 { 364 assert(_top>0); 365 return &stack[_top-1]; 366 } 367 368 bool empty() 369 { 370 return _top == 0; 371 } 372 373 void clear() 374 { 375 _top = 0; 376 } 377 378 size_t length() 379 { 380 return _top; 381 } 382 383 T[] array() 384 { 385 return stack[0.._top]; 386 } 387 388 T opIndex(size_t i) 389 { 390 return stack[i]; 391 } 392 393 Stack dup() 394 { 395 Stack s; 396 s._top = _top; 397 s.stack = stack.dup; 398 return s; 399 } 400 } 401 402 /* ************************************************************************************************ 403 Set container based on assoc array 404 **************************************************************************************************/ 405 private struct Set(T) 406 { 407 bool[T] data; 408 409 static Set opCall() 410 { 411 Set s; 412 return s; 413 } 414 415 static Set opCall(T v) 416 { 417 Set s; 418 s ~= v; 419 return s; 420 } 421 422 void opAddAssign(T v) 423 { 424 data[v] = true; 425 } 426 427 void opAddAssign(Set s) 428 { 429 foreach ( v; s.elements ) 430 data[v] = true; 431 } 432 alias opAddAssign opCatAssign; 433 434 size_t length() 435 { 436 return data.length; 437 } 438 439 T[] elements() 440 { 441 return data.keys; 442 } 443 444 bool remove(T v) 445 { 446 if ( (v in data) is null ) 447 return false; 448 data.remove(v); 449 return true; 450 } 451 452 bool contains(T v) 453 { 454 return (v in data) !is null; 455 } 456 457 bool contains(Set s) 458 { 459 Set tmp = s - this; 460 return tmp.empty; 461 } 462 463 bool empty() 464 { 465 return data.length==0; 466 } 467 468 Set opSub(Set s) 469 { 470 Set res = dup; 471 foreach ( v; s.elements ) 472 res.remove(v); 473 return res; 474 } 475 476 Set dup() 477 { 478 Set s; 479 foreach ( v; data.keys ) 480 s.data[v] = true; 481 return s; 482 } 483 } 484 485 /* ************************************************************************************************ 486 487 **************************************************************************************************/ 488 void quickSort(T)(T[] a) 489 { 490 quickSort(a,cast(size_t)0,a.length); 491 } 492 493 void quickSort(T)(T[] a, size_t l, size_t r) 494 { 495 T t; 496 auto i = r-l; 497 if ( i < 3 ) 498 { 499 if ( i < 2 ) 500 return; 501 if ( a[l] < a[l+1] ) 502 return; 503 t = a[l]; 504 a[l] = a[l+1]; 505 a[l+1] = t; 506 return; 507 } 508 509 auto p = a[l]; 510 i = l; 511 auto j = r; 512 513 while ( true ) 514 { 515 ++i; 516 for ( ; i < j && a[i] < p; ++i ) {} 517 --j; 518 for ( ; i < j && a[j] >= p; --j ) {} 519 if ( i >= j ) 520 break; 521 t = a[i]; 522 a[i] = a[j]; 523 a[j] = t; 524 } 525 --i; 526 a[l] = a[i]; 527 a[i] = p; 528 529 quickSort(a, l, i); 530 quickSort(a, i+1, r); 531 } 532 import tango.math.Math; 533 534 /* ************************************************************************************************ 535 A range of characters 536 **************************************************************************************************/ 537 struct CharRange(char_t) 538 { 539 char_t l_, r_; 540 541 static CharRange opCall(char_t c) 542 { 543 CharRange cr; 544 cr.l_ = c; 545 cr.r_ = c; 546 return cr; 547 } 548 549 static CharRange opCall(char_t a, char_t b) 550 { 551 CharRange cr; 552 cr.l_ = min(a,b); 553 cr.r_ = max(a,b); 554 return cr; 555 } 556 557 char_t l() 558 { 559 return l_; 560 } 561 562 char_t r() 563 { 564 return r_; 565 } 566 567 /* ******************************************************************************************** 568 Compares the ranges according to their beginning. 569 **********************************************************************************************/ 570 int opCmp(const ref CharRange cr) const 571 { 572 if ( l_ == cr.l_ ) 573 return 0; 574 if ( l_ < cr.l_ ) 575 return -1; 576 return 1; 577 } 578 579 580 const bool opEquals(ref const CharRange cr) 581 { 582 if ( l_ == cr.l_ && r_ == cr.r_ ) 583 return true; 584 return false; 585 } 586 587 bool contains(char_t c) 588 { 589 return c >= l_ && c <= r_; 590 } 591 592 bool contains(CharRange cr) 593 { 594 return l_ <= cr.l_ && r_ >= cr.r_; 595 } 596 597 bool intersects(CharRange cr) 598 { 599 return r_ >= cr.l_ && l_ <= cr.r_; 600 } 601 602 CharRange intersect(CharRange cr) 603 { 604 assert(intersects(cr)); 605 CharRange ir; 606 ir.l_ = max(l_, cr.l_); 607 ir.r_ = min(r_, cr.r_); 608 if ( ir.l_ > ir.r_ ) 609 ir.l_ = ir.r_ = char_t.min; 610 return ir; 611 } 612 613 CharRange[] subtract(CharRange cr) 614 { 615 CharRange[] sr; 616 if ( cr.contains(this) ) 617 return sr; 618 if ( !intersects(cr) ) 619 sr ~= this; 620 else 621 { 622 CharRange d; 623 if ( contains(cr) ) 624 { 625 d.l_ = l_; 626 d.r_ = cast(char_t)(cr.l_-1); 627 if ( d.l_ <= d.r_ ) 628 sr ~= d; 629 d.l_ = cast(char_t)(cr.r_+1); 630 d.r_ = r_; 631 if ( d.l_ <= d.r_ ) 632 sr ~= d; 633 } 634 else if ( cr.r_ > l_ ) 635 { 636 d.l_ = cast(char_t)(cr.r_+1); 637 d.r_ = r_; 638 if ( d.l_ <= d.r_ ) 639 sr ~= d; 640 } 641 else if ( cr.l_ < r_ ) 642 { 643 d.l_ = l_; 644 d.r_ = cast(char_t)(cr.l_-1); 645 if ( d.l_ <= d.r_ ) 646 sr ~= d; 647 } 648 } 649 return sr; 650 } 651 652 string toString() 653 { 654 string str; 655 656 if ( l_ == r_ ) 657 { 658 if ( l_ > 0x20 && l_ < 0x7f ) 659 str = Format.convert("'{}'", l_).idup; 660 else 661 str = Format.convert("({:x})", cast(int)l_).idup; 662 } 663 else 664 { 665 if ( l_ > 0x20 && l_ < 0x7f ) 666 str = Format.convert("'{}'", l_).idup; 667 else 668 str = Format.convert("({:x})", cast(int)l_).idup; 669 str ~= "-"; 670 if ( r_ > 0x20 && r_ < 0x7f ) 671 str ~= Format.convert("'{}'", r_).idup; 672 else 673 str ~= Format.convert("({:x})", cast(int)r_).idup; 674 } 675 return str; 676 } 677 } 678 679 import tango.core.Compiler; 680 681 static if(DMDFE_Version == 2063) 682 { 683 /* Workabout for http://d.puremagic.com/issues/show_bug.cgi?id=10425 */ 684 void _bug(CharRange!(dchar) a) {} 685 } 686 687 /* ************************************************************************************************ 688 Represents a class of characters as used in regular expressions (e.g. [0-9a-z], etc.) 689 **************************************************************************************************/ 690 struct CharClass(char_t) 691 { 692 alias CharRange!(char_t) range_t; 693 694 //--------------------------------------------------------------------------------------------- 695 // pre-defined character classes 696 static const CharClass!(char_t) 697 line_startend = {parts: [ 698 {l_:0x00, r_:0x00}, 699 {l_:0x0a, r_:0x0a}, 700 {l_:0x13, r_:0x13} 701 ]}, 702 digit = {parts: [ 703 {l_:0x30, r_:0x39} 704 ]}, 705 whitespace = {parts: [ 706 {l_:0x09, r_:0x09}, 707 {l_:0x0a, r_:0x0a}, 708 {l_:0x0b, r_:0x0b}, 709 {l_:0x13, r_:0x13}, 710 {l_:0x14, r_:0x14}, 711 {l_:0x20, r_:0x20} 712 ]}; 713 714 // 8bit classes 715 static if ( is(char_t == char) ) 716 { 717 static const CharClass!(char_t) 718 any_char = {parts: [ 719 {l_:0x01, r_:0xff} 720 ]}, 721 dot_oper = {parts: [ 722 {l_:0x01, r_:0xff} 723 ]}, 724 alphanum_ = {parts: [ 725 {l_:0x30, r_:0x39}, 726 {l_:0x41, r_:0x5a}, 727 {l_:0x5f, r_:0x5f}, 728 {l_:0x61, r_:0x7a} 729 ]}; 730 } 731 // 16bit and 32bit classes 732 static if ( is(char_t == wchar) || is(char_t == dchar) ) 733 { 734 static const CharClass!(char_t) 735 any_char = {parts: [ 736 {l_:0x0001, r_:0xffff} 737 ]}, 738 dot_oper = {parts: [ 739 {l_:0x0001, r_:0xffff} 740 ]}, 741 alphanum_ = {parts: [ 742 {l_:0x30, r_:0x39}, 743 {l_:0x41, r_:0x5a}, 744 {l_:0x5f, r_:0x5f}, 745 {l_:0x61, r_:0x7a} 746 ]}; 747 } 748 749 //--------------------------------------------------------------------------------------------- 750 range_t[] parts; 751 752 invariant() 753 { 754 // foreach ( i, p; parts ) 755 // assert(p.l_<=p.r_, Int.toString(i)~": "~Int.toString(p.l_)~" > "~Int.toString(p.r_)); 756 } 757 758 static CharClass opCall(const(CharClass) cc) 759 { 760 CharClass ncc; 761 ncc.parts = cc.parts.dup; 762 return ncc; 763 } 764 765 int opCmp(const ref CharClass cc) const 766 { 767 if ( parts.length < cc.parts.length ) 768 return -1; 769 if ( parts.length > cc.parts.length ) 770 return 1; 771 foreach ( i, p; cc.parts ) 772 { 773 if ( p.l_ != parts[i].l_ || p.r_ != parts[i].r_ ) 774 return 1; 775 } 776 return 0; 777 } 778 779 bool empty() 780 { 781 return parts.length <= 0; 782 } 783 784 bool matches(char_t c) 785 { 786 foreach ( p; parts ) 787 { 788 if ( p.contains(c) ) 789 return true; 790 } 791 return false; 792 } 793 794 CharClass intersect(CharClass cc) 795 { 796 CharClass ic; 797 foreach ( p; parts ) 798 { 799 foreach ( cp; cc.parts ) 800 { 801 if ( p.intersects(cp) ) 802 ic.parts ~= p.intersect(cp); 803 } 804 } 805 return ic; 806 } 807 808 // requires the class to be optimized 809 bool contains(range_t cr) 810 { 811 foreach ( p; parts ) 812 { 813 if ( p.contains(cr) ) 814 return true; 815 } 816 return false; 817 } 818 819 // requires the class to be optimized 820 bool contains(CharClass cc) 821 { 822 Louter: foreach ( p; cc.parts ) 823 { 824 foreach ( p2; parts ) 825 { 826 if ( p2.contains(p) ) 827 continue Louter; 828 } 829 return false; 830 } 831 return true; 832 } 833 834 void subtract(CharClass cc) 835 { 836 negate(); 837 add(cc); 838 negate(); 839 } 840 841 void add(CharClass cc) 842 { 843 parts ~= cc.parts; 844 } 845 846 void add(range_t cr) 847 { 848 parts ~= cr; 849 } 850 851 void add(char_t c) 852 { 853 parts ~= CharRange!(char_t)(c); 854 } 855 856 /* ******************************************************************************************** 857 Requires the CharClass to be optimized. 858 **********************************************************************************************/ 859 void negate() 860 { 861 optimize(); 862 char_t start = char_t.min; 863 864 // first part touches left boundary of value range 865 if ( parts.length > 0 && parts[0].l_ == start ) 866 { 867 start = parts[0].r_; 868 if ( start < char_t.max ) 869 ++start; 870 871 foreach ( i, ref cr; parts[0 .. $-1] ) 872 { 873 cr.l_ = start; 874 cr.r_ = cast(char_t)(parts[i+1].l_-1); 875 start = parts[i+1].r_; 876 if ( start < char_t.max ) 877 ++start; 878 } 879 if ( start != char_t.max ) { 880 parts[$-1].l_ = start; 881 parts[$-1].r_ = char_t.max; 882 } 883 else 884 parts.length = parts.length-1; 885 return; 886 } 887 888 foreach ( i, ref cr; parts ) 889 { 890 char_t tmp = cast(char_t)(cr.l_-1); 891 cr.l_ = start; 892 start = cr.r_; 893 if ( start < char_t.max ) 894 ++start; 895 cr.r_ = tmp; 896 } 897 898 // last part does not touch right boundary 899 if ( start != char_t.max ) 900 parts ~= range_t(start, char_t.max); 901 } 902 903 void optimize() 904 { 905 if ( empty() ) 906 return; 907 908 tango.core.Array.sort(parts); 909 910 size_t i = 0; 911 foreach ( p; parts[1 .. $] ) 912 { 913 if ( p.l_ > parts[i].r_+1 ) { 914 ++i; 915 parts[i].l_ = p.l_; 916 parts[i].r_ = p.r_; 917 continue; 918 } 919 parts[i].r_ = max(p.r_, parts[i].r_); 920 if ( parts[i].r_ >= char_t.max ) 921 break; 922 } 923 parts.length = i+1; 924 } 925 926 string toString() 927 { 928 string str; 929 str ~= "["; 930 foreach ( p; parts ) 931 str ~= p.toString(); 932 str ~= "]"; 933 return str; 934 } 935 } 936 937 debug(UnitTest) 938 { 939 unittest 940 { 941 static CharClass!(char) cc = { parts: [{l_:0,r_:10},{l_:0,r_:6},{l_:5,r_:12},{l_:12,r_:17},{l_:20,r_:100}] }; 942 assert(cc.toString(), "[(0)-(a)(0)-(6)(5)-(c)(c)-(11)(14)-'d']"); 943 cc.optimize(); 944 assert(cc.toString(), "[(0)-(11)(14)-'d']"); 945 cc.negate(); 946 assert(cc.toString(), " [(12)-(13)'e'-(ff)]"); 947 cc.optimize(); 948 assert(cc.toString(), "[(0)-(11)(14)-'d']"); 949 cc.negate(); 950 assert(cc.toString(), "[(12)-(13)'e'-(ff)]"); 951 952 static CharClass!(char) cc2 = { parts: [] }; 953 assert(cc.toString(), "[]"); 954 cc2.optimize(); 955 assert(cc.toString(), "[]"); 956 cc2.negate(); 957 assert(cc.toString(), "[(0)-(ff)]"); 958 cc2.optimize(); 959 assert(cc.toString(), "[(0)-(ff)]"); 960 cc2.negate(); 961 assert(cc.toString(), "[]"); 962 963 static CharClass!(char) cc3 = { parts: [{l_:0,r_:100},{l_:200,r_:0xff},] }; 964 assert(cc3.toString(), "[(0)-'d'(c8)-(ff)]"); 965 cc3.negate(); 966 assert(cc.toString(), "['e'-(c7)]"); 967 cc3.negate(); 968 assert(cc.toString(), "[(0)-'d'(c8)-(ff)]"); 969 970 static CharClass!(char) cc4 = { parts: [{l_:0,r_:200},{l_:100,r_:0xff},] }; 971 assert(cc.toString(), "[(0)-(c8)'d'-(ff)]"); 972 cc4.optimize(); 973 assert(cc.toString(), "[(9)-(13)(20)-'~'(a0)-(ff)(100)-(17f)(180)-(24f)(20a3)-(20b5)]"); 974 975 static CharClass!(dchar) cc5 = { parts: [{l_:0x9,r_:0x13},{0x20,r_:'~'},{l_:0xa0,r_:0xff},{l_:0x100,r_:0x17f},{l_:0x180,r_:0x24f},{l_:0x20a3,r_:0x20b5}] }; 976 cc5.optimize(); 977 assert(cc.toString(), "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]"); 978 cc5.negate(); 979 assert(cc.toString(), "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]"); 980 cc5.optimize(); 981 assert(cc.toString(), "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]"); 982 cc5.negate(); 983 assert(cc.toString(), "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]"); 984 } 985 } 986 987 /* ************************************************************************************************ 988 989 **************************************************************************************************/ 990 private struct Predicate(char_t) 991 { 992 alias const(char_t)[] string_t; 993 alias CharClass!(char_t) cc_t; 994 alias CharRange!(char_t) cr_t; 995 996 // generic data 997 enum Type { 998 consume, epsilon, lookahead, lookbehind 999 } 1000 1001 cc_t input; 1002 Type type; 1003 1004 // data for compiled predicates 1005 enum uint MAX_BITMAP_LENGTH = 256, 1006 MAX_SEARCH_LENGTH = 256; 1007 enum MatchMode { 1008 generic, generic_l, 1009 single_char, bitmap, string_search, // consume 1010 single_char_l, bitmap_l, string_search_l // lookahead 1011 } 1012 1013 MatchMode mode; 1014 // data_chr had to be pulled out of the union due to 1015 // http://d.puremagic.com/issues/show_bug.cgi?id=2632 --- don't put it back 1016 // in until this is resolved! 1017 // 1018 // Keep in mind that data_str.length can't be modified directly unless the 1019 // new length is strictly greater than the old length. This is essentially 1020 // because ubyte.sizeof can be less than char_t.sizeof. If you set 1021 // data_str.length to anything less than or equal to data_bmp.length, 1022 // data_str will not be reallocated: only the length value will change but 1023 // nothing will realize that not that much space has actually been 1024 // allocated. data_str will be too small and you'll likely get segfaults or 1025 // such. 1026 // 1027 // In short: don't mess with data_str.length. If you have to, remove the 1028 // union entirely. 1029 // 1030 // -- Deewiant 1031 union { 1032 ubyte[] data_bmp; 1033 string_t data_str; 1034 }; 1035 char_t data_chr; 1036 1037 1038 void compile() 1039 { 1040 assert(input.parts.length > 0); 1041 1042 // single char? 1043 if ( input.parts.length == 1 && input.parts[0].l_ == input.parts[0].r_ ) 1044 { 1045 mode = type==Type.consume ? MatchMode.single_char : MatchMode.single_char_l; 1046 data_chr = input.parts[0].l_; 1047 return; 1048 } 1049 // check whether we can use a bitmap 1050 foreach ( p; input.parts ) 1051 { 1052 if ( p.l_ > MAX_BITMAP_LENGTH || p.r_ > MAX_BITMAP_LENGTH ) 1053 goto LnoBitmap; 1054 } 1055 1056 // setup bitmap 1057 data_bmp.length = MAX_BITMAP_LENGTH/8; 1058 foreach ( p; input.parts ) 1059 { 1060 for ( char_t c = p.l_; c <= p.r_; ++c ) 1061 data_bmp[c/8] |= 1 << (c&7); 1062 } 1063 mode = type==Type.consume ? MatchMode.bitmap : MatchMode.bitmap_l; 1064 return; 1065 1066 LnoBitmap: 1067 /* 1068 // check whether the class is small enough to justify a string-search 1069 // TODO: consider inverse class for 8bit chars? 1070 uint class_size; 1071 foreach ( p; input.parts ) 1072 class_size += cast(uint)p.r_+1-p.l_; 1073 if ( class_size > MAX_SEARCH_LENGTH ) 1074 goto Lgeneric; 1075 data_str.length = class_size; 1076 size_t ind; 1077 foreach ( p; input.parts ) 1078 { 1079 for ( char_t c = p.l_; c <= p.r_; ++c ) 1080 data_str[ind++] = c; 1081 } 1082 mode = type==Type.consume ? MatchMode.string_search : MatchMode.string_search_l; 1083 return; 1084 */ 1085 Lgeneric: 1086 data_str = cast(char_t[])input.parts; 1087 mode = type==Type.consume ? MatchMode.generic : MatchMode.generic_l; 1088 } 1089 1090 bool matches(char_t c) 1091 { 1092 if ( type == Type.consume || type == Type.lookahead ) 1093 return input.matches(c); 1094 assert(0); 1095 } 1096 1097 Predicate intersect(Predicate p) 1098 { 1099 Predicate p2; 1100 if ( type != Type.epsilon && p.type != Type.epsilon ) 1101 p2.input = input.intersect(p.input); 1102 return p2; 1103 } 1104 1105 bool intersects(Predicate p) 1106 { 1107 if ( type != p.type ) 1108 return false; 1109 foreach ( cr; input.parts ) 1110 { 1111 foreach ( cr2; p.input.parts ) 1112 { 1113 if ( cr.intersects(cr2) ) 1114 return true; 1115 } 1116 } 1117 return false; 1118 } 1119 1120 bool exceedsMax(uint maxc) 1121 { 1122 foreach ( p; input.parts ) 1123 { 1124 if ( p.l_ > maxc || p.r_ > maxc ) 1125 return true; 1126 } 1127 1128 return false; 1129 } 1130 1131 @property bool empty() 1132 { 1133 return type != Type.epsilon && input.empty(); 1134 } 1135 1136 void subtract(Predicate p) 1137 { 1138 if ( type != Type.epsilon && p.type != Type.epsilon ) 1139 input.subtract(p.input); 1140 } 1141 1142 void negate() 1143 { 1144 assert(type != Type.epsilon); 1145 input.negate(); 1146 } 1147 1148 void optimize() 1149 { 1150 assert(type != Type.epsilon); 1151 input.optimize(); 1152 } 1153 1154 int opCmp(const ref Predicate p) const 1155 { 1156 return input.opCmp(p.input); 1157 } 1158 1159 const bool opEquals(ref const Predicate p) 1160 { 1161 if ( type != p.type ) 1162 return false; 1163 if ( input.opCmp(p.input) != 0 ) 1164 return false; 1165 return true; 1166 } 1167 1168 cc_t getInput() 1169 { 1170 return input; 1171 } 1172 1173 void setInput(const(cc_t) cc) 1174 { 1175 input = cast(cc_t)cc; 1176 } 1177 1178 void appendInput(cr_t cr) 1179 { 1180 input.add(cr); 1181 } 1182 1183 void appendInput(cc_t cc) 1184 { 1185 input.add(cc); 1186 } 1187 1188 void appendInput(Predicate p) 1189 { 1190 input.add(p.input); 1191 } 1192 1193 string toString() 1194 { 1195 string str; 1196 switch ( type ) 1197 { 1198 case Type.consume: str = input.toString(); break; 1199 case Type.epsilon: str = "eps"; break; 1200 case Type.lookahead: str = "la:"~input.toString(); break; 1201 case Type.lookbehind: str = "lb:"~input.toString(); break; 1202 default: 1203 assert(0); 1204 } 1205 return str; 1206 } 1207 } 1208 import Utf = tango.text.convert.Utf; 1209 import tango.text.convert.Format; 1210 1211 /* ************************************************************************************************ 1212 1213 **************************************************************************************************/ 1214 class RegExpException : Exception 1215 { 1216 this(string msg) 1217 { 1218 super("RegExp: "~msg); 1219 } 1220 } 1221 1222 /* ************************************************************************************************ 1223 TNFA state 1224 **************************************************************************************************/ 1225 private class TNFAState(char_t) 1226 { 1227 bool accept = false, 1228 visited = false; 1229 size_t index; 1230 List!(TNFATransition!(char_t)) transitions; 1231 1232 this() 1233 { 1234 transitions = new List!(TNFATransition!(char_t)); 1235 } 1236 } 1237 1238 1239 /* ************************************************************************************************ 1240 Priority classes used to linearize priorities after non-linear transition creation. 1241 **************************************************************************************************/ 1242 private enum PriorityClass { 1243 greedy=0, normal=1, reluctant=2, extraReluctant=3 1244 } 1245 1246 /* ******************************************************************************** 1247 TNFA tagged transition 1248 ***********************************************************************************/ 1249 private class TNFATransition(char_t) 1250 { 1251 TNFAState!(char_t) target; 1252 Predicate!(char_t) predicate; 1253 uint priority, 1254 tag; /// one-based tag number, 0 = untagged 1255 PriorityClass priorityClass; 1256 1257 this(PriorityClass pc) 1258 { 1259 priorityClass = pc; 1260 } 1261 1262 /****************************************************************************** 1263 Move through states only going via epsilon transitions, and only choosing 1264 the one with highest priority. If the highest priority transition from a 1265 state isn't an epsilon transition, false is returned. 1266 If the accepting NFA state can be reached in this manner, true is returned. 1267 1268 NOTE: This method does not look for cycles which should be kept in mind for 1269 later. larsivi 20090827 1270 *******************************************************************************/ 1271 bool canFinish() 1272 { 1273 TNFAState!(char_t) t = target; 1274 while (!t.accept) { 1275 TNFATransition!(char_t) highestPriTrans; 1276 foreach (trans; t.transitions) { 1277 if (!highestPriTrans || highestPriTrans.priority > trans.priority) 1278 highestPriTrans = trans; 1279 } 1280 if (!(highestPriTrans.predicate.type == Predicate!(char_t).Type.epsilon)) 1281 return false; 1282 1283 t = highestPriTrans.target; 1284 } 1285 return true; 1286 } 1287 1288 } 1289 1290 /* ************************************************************************************************ 1291 Fragments of TNFAs as used in the Thompson method 1292 **************************************************************************************************/ 1293 private class TNFAFragment(char_t) 1294 { 1295 alias TNFAState!(char_t) state_t; 1296 alias TNFATransition!(char_t) trans_t; 1297 1298 List!(trans_t) entries, /// transitions to be added to the entry state 1299 exits, /// transitions to be added to the exit state 1300 entry_state, /// transitions to write the entry state to 1301 exit_state; /// transitions to write the exit state to 1302 1303 bool swapMatchingBracketSyntax; 1304 1305 this() 1306 { 1307 entries = new List!(trans_t); 1308 exits = new List!(trans_t); 1309 entry_state = new List!(trans_t); 1310 exit_state = new List!(trans_t); 1311 } 1312 1313 /* ******************************************************************************************** 1314 Write the given state as entry state to this fragment. 1315 **********************************************************************************************/ 1316 void setEntry(state_t state) 1317 { 1318 state.transitions ~= entries; 1319 foreach ( t; entry_state ) 1320 t.target = state; 1321 } 1322 1323 /* ******************************************************************************************** 1324 Write the given state as exit state to this fragment. 1325 **********************************************************************************************/ 1326 void setExit(state_t state) 1327 { 1328 state.transitions ~= exits; 1329 foreach ( t; exit_state ) 1330 t.target = state; 1331 } 1332 } 1333 1334 /* ************************************************************************************************ 1335 Tagged NFA 1336 **************************************************************************************************/ 1337 private final class TNFA(char_t) 1338 { 1339 alias TNFATransition!(char_t) trans_t; 1340 alias TNFAFragment!(char_t) frag_t; 1341 alias TNFAState!(char_t) state_t; 1342 alias Predicate!(char_t) predicate_t; 1343 alias char_t[] string_t; 1344 alias CharRange!(char_t) range_t; 1345 alias CharClass!(char_t) cc_t; 1346 1347 string_t pattern; 1348 state_t[] states; 1349 state_t start; 1350 1351 bool swapMatchingBracketSyntax; /// whether to make (?...) matching and (...) non-matching 1352 1353 /* ******************************************************************************************** 1354 Creates the TNFA from the given regex pattern 1355 **********************************************************************************************/ 1356 this(string_t regex) 1357 { 1358 next_tag = 1; 1359 transitions = new List!(trans_t); 1360 1361 pattern = regex; 1362 } 1363 1364 /* ******************************************************************************************** 1365 Print the TNFA (tabular representation of the delta function) 1366 **********************************************************************************************/ 1367 debug(TangoRegex) void print() 1368 { 1369 foreach ( int i, s; states ) 1370 { 1371 Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" "); 1372 1373 bool first=true; 1374 Stdout(" {"); 1375 foreach ( t; s.transitions ) 1376 { 1377 Stdout.format("{}{}{}:{}->{}", first?"":", ", t.priority, "gnrx"[t.priorityClass], t.predicate.toString, t.target is null?-1:t.target.index); 1378 if ( t.tag > 0 ) { 1379 Stdout.format(" t{}", t.tag); 1380 } 1381 first = false; 1382 } 1383 Stdout("}").newline; 1384 } 1385 } 1386 1387 uint tagCount() 1388 { 1389 return next_tag-1; 1390 } 1391 1392 /* ******************************************************************************************** 1393 Constructs the TNFA using extended Thompson method. 1394 Uses a slightly extended version of Dijkstra's shunting yard algorithm to convert 1395 the regexp from infix notation. 1396 **********************************************************************************************/ 1397 void parse(bool unanchored) 1398 { 1399 List!(frag_t) frags = new List!(frag_t); 1400 Stack!(Operator) opStack; 1401 Stack!(uint) tagStack; 1402 Stack!(Pair!(uint)) occurStack; 1403 opStack.push(Operator.eos); 1404 1405 /* **************************************************************************************** 1406 Perform action on operator stack 1407 ******************************************************************************************/ 1408 bool perform(Operator next_op, bool explicit_operator=true) 1409 { 1410 // calculate index in action matrix 1411 int index = cast(int)opStack.top()*(Operator.max+1); 1412 index += cast(int)next_op; 1413 1414 debug(tnfa) Stdout.formatln("\t{}:{} -> {} {} frag(s)", 1415 operator_names[opStack.top], operator_names[next_op], action_names[action_lookup[index]], frags.length 1416 ); 1417 switch ( action_lookup[index] ) 1418 { 1419 case Act.pua: 1420 opStack.push(next_op); 1421 if ( next_op == Operator.open_par ) { 1422 tagStack.push(next_tag); 1423 next_tag += 2; 1424 } 1425 break; 1426 case Act.poc: 1427 switch ( opStack.top() ) 1428 { 1429 case Operator.concat: constructConcat(frags); break; 1430 case Operator.altern: constructAltern(frags); break; 1431 case Operator.zero_one_g: constructZeroOne(frags, PriorityClass.greedy); break; 1432 case Operator.zero_one_ng: constructZeroOne(frags, PriorityClass.reluctant); break; 1433 case Operator.zero_one_xr: constructZeroOne(frags, PriorityClass.extraReluctant); break; 1434 case Operator.zero_more_g: constructZeroMore(frags, PriorityClass.greedy); break; 1435 case Operator.zero_more_ng: constructZeroMore(frags, PriorityClass.reluctant); break; 1436 case Operator.zero_more_xr: constructZeroMore(frags, PriorityClass.extraReluctant); break; 1437 case Operator.one_more_g: constructOneMore(frags, PriorityClass.greedy); break; 1438 case Operator.one_more_ng: constructOneMore(frags, PriorityClass.reluctant); break; 1439 case Operator.one_more_xr: constructOneMore(frags, PriorityClass.extraReluctant); break; 1440 case Operator.occur_g: 1441 Pair!(uint) occur = occurStack.pop(); 1442 constructOccur(frags, occur.a, occur.b, PriorityClass.greedy); 1443 break; 1444 case Operator.occur_ng: 1445 Pair!(uint) occur = occurStack.pop(); 1446 constructOccur(frags, occur.a, occur.b, PriorityClass.reluctant); 1447 break; 1448 default: 1449 throw new RegExpException("cannot process operand at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 1450 } 1451 opStack.pop(); 1452 1453 perform(next_op, false); 1454 break; 1455 case Act.poa: 1456 opStack.pop(); 1457 break; 1458 case Act.pca: 1459 if ( opStack.top() == Operator.open_par ) 1460 { 1461 if ( tagStack.empty() ) 1462 throw new RegExpException(Format.convert("Missing opening parentheses for closing parentheses at char {} \"{}\"", cursor, Utf.toString(pattern[cursor..$])).idup); 1463 constructBracket(frags, tagStack.top()); 1464 tagStack.pop(); 1465 } 1466 else { 1467 assert(opStack.top() == Operator.open_par_nm); 1468 constructBracket(frags); 1469 } 1470 opStack.pop(); 1471 break; 1472 case Act.don: 1473 return true; 1474 case Act.err: 1475 default: 1476 throw new RegExpException(Format.convert("Unexpected operand at char {} \"{}\" in \"{}\"", cursor, Utf.toString(pattern[cursor..$]), Utf.toString(pattern)).idup); 1477 } 1478 1479 return false; 1480 } 1481 1482 // add implicit extra reluctant .* (with . == any_char) at the beginning for unanchored matches 1483 // and matching bracket for total match group 1484 if ( unanchored ) { 1485 frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume); 1486 perform(Operator.zero_more_xr, false); 1487 perform(Operator.concat, false); 1488 perform(Operator.open_par, false); 1489 } 1490 1491 // convert regex to postfix and create TNFA 1492 bool implicit_concat; 1493 predicate_t.Type pred_type; 1494 1495 while ( !endOfPattern() ) 1496 { 1497 pred_type = predicate_t.Type.consume; 1498 1499 dchar c = readPattern(); 1500 switch ( c ) 1501 { 1502 case '|': 1503 perform(Operator.altern); 1504 implicit_concat = false; 1505 break; 1506 case '(': 1507 if ( implicit_concat ) 1508 perform(Operator.concat, false); 1509 implicit_concat = false; 1510 if ( peekPattern() == '?' ) { 1511 readPattern(); 1512 perform(swapMatchingBracketSyntax?Operator.open_par:Operator.open_par_nm); 1513 } 1514 else 1515 perform(swapMatchingBracketSyntax?Operator.open_par_nm:Operator.open_par); 1516 break; 1517 case ')': 1518 perform(Operator.close_par); 1519 break; 1520 case '?': 1521 if ( peekPattern() == '?' ) { 1522 readPattern(); 1523 perform(Operator.zero_one_ng); 1524 } 1525 else 1526 perform(Operator.zero_one_g); 1527 break; 1528 case '*': 1529 if ( peekPattern() == '?' ) { 1530 readPattern(); 1531 perform(Operator.zero_more_ng); 1532 } 1533 else 1534 perform(Operator.zero_more_g); 1535 break; 1536 case '+': 1537 if ( peekPattern() == '?' ) { 1538 readPattern(); 1539 perform(Operator.one_more_ng); 1540 } 1541 else 1542 perform(Operator.one_more_g); 1543 break; 1544 case '{': 1545 Pair!(uint) occur; 1546 parseOccurCount(occur.a, occur.b); 1547 occurStack.push(occur); 1548 if ( peekPattern() == '?' ) { 1549 readPattern(); 1550 perform(Operator.occur_ng); 1551 } 1552 else 1553 perform(Operator.occur_g); 1554 break; 1555 case '[': 1556 if ( implicit_concat ) 1557 perform(Operator.concat, false); 1558 implicit_concat = true; 1559 frags ~= constructCharClass(pred_type); 1560 break; 1561 case '.': 1562 if ( implicit_concat ) 1563 perform(Operator.concat, false); 1564 implicit_concat = true; 1565 frags ~= constructChars(cc_t.dot_oper, pred_type); 1566 break; 1567 case '$': 1568 if ( implicit_concat ) 1569 perform(Operator.concat, false); 1570 implicit_concat = true; 1571 1572 frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookahead); 1573 break; 1574 case '^': 1575 if ( implicit_concat ) 1576 perform(Operator.concat, false); 1577 implicit_concat = true; 1578 1579 frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookbehind); 1580 break; 1581 case '>': 1582 c = readPattern(); 1583 pred_type = predicate_t.Type.lookahead; 1584 if ( c == '[' ) 1585 goto case '['; 1586 else if ( c == '\\' ) 1587 goto case '\\'; 1588 else if ( c == '.' ) 1589 goto case '.'; 1590 else 1591 goto default; 1592 case '<': 1593 c = readPattern(); 1594 pred_type = predicate_t.Type.lookbehind; 1595 if ( c == '[' ) 1596 goto case '['; 1597 else if ( c == '\\' ) 1598 goto case '\\'; 1599 else if ( c == '.' ) 1600 goto case '.'; 1601 else 1602 goto default; 1603 case '\\': 1604 c = readPattern(); 1605 1606 if ( implicit_concat ) 1607 perform(Operator.concat, false); 1608 implicit_concat = true; 1609 1610 switch ( c ) 1611 { 1612 case 't': 1613 frags ~= constructSingleChar('\t', pred_type); 1614 break; 1615 case 'n': 1616 frags ~= constructSingleChar('\n', pred_type); 1617 break; 1618 case 'r': 1619 frags ~= constructSingleChar('\r', pred_type); 1620 break; 1621 case 'w': // alphanumeric and _ 1622 frags ~= constructChars(cc_t.alphanum_, pred_type); 1623 break; 1624 case 'W': // non-(alphanum and _) 1625 auto cc = cc_t(cc_t.alphanum_); 1626 cc.negate(); 1627 frags ~= constructChars(cc, pred_type); 1628 break; 1629 case 's': // whitespace 1630 frags ~= constructChars(cc_t.whitespace, pred_type); 1631 break; 1632 case 'S': // non-whitespace 1633 auto cc = cc_t(cc_t.whitespace); 1634 cc.negate(); 1635 frags ~= constructChars(cc, pred_type); 1636 break; 1637 case 'd': // digit 1638 frags ~= constructChars(cc_t.digit, pred_type); 1639 break; 1640 case 'D': // non-digit 1641 auto cc = cc_t(cc_t.digit); 1642 cc.negate(); 1643 frags ~= constructChars(cc, pred_type); 1644 break; 1645 case 'b': // either end of word 1646 if ( pred_type != predicate_t.Type.consume ) 1647 throw new RegExpException("Escape sequence \\b not allowed in look-ahead or -behind"); 1648 1649 // create (?<\S>\s|<\s>\S) 1650 auto cc = cc_t(cc_t.whitespace); 1651 cc.negate(); 1652 1653 perform(Operator.open_par_nm); 1654 1655 frags ~= constructChars(cc, predicate_t.Type.lookbehind); 1656 perform(Operator.concat, false); 1657 frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead); 1658 perform(Operator.altern, false); 1659 frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind); 1660 perform(Operator.concat, false); 1661 frags ~= constructChars(cc, predicate_t.Type.lookahead); 1662 1663 perform(Operator.close_par, false); 1664 break; 1665 case 'B': // neither end of word 1666 if ( pred_type != predicate_t.Type.consume ) 1667 throw new RegExpException("Escape sequence \\B not allowed in look-ahead or -behind"); 1668 1669 // create (?<\S>\S|<\s>\s) 1670 auto cc = cc_t(cc_t.whitespace); 1671 cc.negate(); 1672 1673 perform(Operator.open_par_nm); 1674 1675 frags ~= constructChars(cc, predicate_t.Type.lookbehind); 1676 perform(Operator.concat, false); 1677 frags ~= constructChars(cc, predicate_t.Type.lookahead); 1678 perform(Operator.altern, false); 1679 frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind); 1680 perform(Operator.concat, false); 1681 frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead); 1682 1683 perform(Operator.close_par, false); 1684 break; 1685 case '(': 1686 case ')': 1687 case '[': 1688 case ']': 1689 case '{': 1690 case '}': 1691 case '*': 1692 case '+': 1693 case '?': 1694 case '.': 1695 case '\\': 1696 case '^': 1697 case '$': 1698 case '|': 1699 case '<': 1700 case '>': 1701 default: 1702 frags ~= constructSingleChar(c, pred_type); 1703 break; 1704 // throw new RegExpException(Format.convert("Unknown escape sequence \\{}", c)); 1705 } 1706 break; 1707 1708 default: 1709 if ( implicit_concat ) 1710 perform(Operator.concat, false); 1711 implicit_concat = true; 1712 frags ~= constructSingleChar(c, pred_type); 1713 } 1714 } 1715 1716 // add implicit reluctant .* (with . == any_char) at the end for unanchored matches 1717 if ( unanchored ) 1718 { 1719 perform(Operator.close_par, false); 1720 if ( implicit_concat ) 1721 perform(Operator.concat, false); 1722 frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume); 1723 perform(Operator.zero_more_ng, false); 1724 } 1725 1726 // empty operator stack 1727 while ( !perform(Operator.eos) ) {} 1728 1729 // set start and finish states 1730 start = addState(); 1731 state_t finish = addState(); 1732 finish.accept = true; 1733 1734 foreach ( f; frags ) { 1735 f.setExit(finish); 1736 f.setEntry(start); 1737 } 1738 1739 // set transition priorities 1740 List!(trans_t)[PriorityClass.max+1] trans; 1741 foreach ( ref t; trans ) 1742 t = new List!(trans_t); 1743 1744 Stack!(trans_t) todo; 1745 state_t state = start; 1746 1747 while ( !todo.empty() || !state.visited ) 1748 { 1749 if ( !state.visited ) 1750 { 1751 state.visited = true; 1752 foreach_reverse ( t; state.transitions ) 1753 todo.push(t); 1754 } 1755 1756 if ( todo.empty() ) 1757 break; 1758 trans_t t = todo.top(); 1759 todo.pop(); 1760 assert(t.priorityClass<=PriorityClass.max); 1761 trans[t.priorityClass] ~= t; 1762 state = t.target; 1763 } 1764 1765 uint nextPrio; 1766 foreach ( ts; trans ) 1767 { 1768 foreach ( t; ts ) 1769 t.priority = nextPrio++; 1770 } 1771 } 1772 1773 private: 1774 uint next_tag; 1775 size_t cursor, 1776 next_cursor; 1777 List!(trans_t) transitions; 1778 1779 state_t[state_t] clonedStates; 1780 trans_t[trans_t] clonedTransitions; 1781 1782 /// RegEx operators 1783 enum Operator { 1784 eos, concat, altern, open_par, close_par, 1785 zero_one_g, zero_more_g, one_more_g, // greedy 1786 zero_one_ng, zero_more_ng, one_more_ng, // non-greedy/reluctant 1787 zero_one_xr, zero_more_xr, one_more_xr, // extra-reluctant 1788 open_par_nm, occur_g, occur_ng 1789 } 1790 __gshared immutable const(char)[][] operator_names = ["EOS", "concat", "|", "(", ")", "?", "*", "+", "??", "*?", "+?", "??x", "*?x", "+?x", "(?", "{x,y}", "{x,y}?"]; 1791 1792 /// Actions for to-postfix transformation 1793 enum Act { 1794 pua, poc, poa, pca, don, err 1795 } 1796 __gshared immutable const(char)[][] action_names = ["push+advance", "pop+copy", "pop+advance", "pop+copy+advance", "done", "error"]; 1797 1798 /// Action lookup for to-postfix transformation 1799 __gshared immutable Act[] action_lookup = 1800 [ 1801 // eos concat | ( ) ? * + ?? *? +? ??extra *?extra +?extra (? {x,y} {x,y}? 1802 Act.don, Act.pua, Act.pua, Act.pua, Act.err, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1803 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1804 Act.poc, Act.pua, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1805 Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1806 Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, 1807 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1808 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1809 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1810 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1811 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1812 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1813 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1814 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1815 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1816 Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1817 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1818 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc 1819 ]; 1820 1821 final dchar peekPattern() 1822 { 1823 auto tmp = next_cursor; 1824 if ( tmp < pattern.length ) 1825 return decode(pattern, tmp); 1826 return 0; 1827 } 1828 1829 final dchar readPattern() 1830 { 1831 cursor = next_cursor; 1832 if ( next_cursor < pattern.length ) 1833 return decode(pattern, next_cursor); 1834 return 0; 1835 } 1836 1837 final bool endOfPattern() 1838 { 1839 return next_cursor >= pattern.length; 1840 } 1841 1842 state_t addState() 1843 { 1844 state_t s = new state_t; 1845 s.index = states.length; 1846 states ~= s; 1847 return s; 1848 } 1849 1850 trans_t addTransition(PriorityClass pc = PriorityClass.normal) 1851 { 1852 trans_t trans = new trans_t(pc); 1853 transitions ~= trans; 1854 return trans; 1855 } 1856 1857 uint parseNumber() 1858 { 1859 uint res; 1860 while ( !endOfPattern() ) 1861 { 1862 auto c = peekPattern(); 1863 if ( c < '0' || c > '9' ) 1864 break; 1865 res = res*10+(c-'0'); 1866 readPattern(); 1867 } 1868 return res; 1869 } 1870 1871 void parseOccurCount(out uint minOccur, out uint maxOccur) 1872 { 1873 assert(pattern[cursor] == '{'); 1874 1875 minOccur = parseNumber(); 1876 if ( peekPattern() == '}' ) { 1877 readPattern(); 1878 maxOccur = minOccur; 1879 return; 1880 } 1881 if ( peekPattern() != ',' ) 1882 throw new RegExpException("Invalid occurence range at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 1883 readPattern(); 1884 maxOccur = parseNumber(); 1885 if ( peekPattern() != '}' ) 1886 throw new RegExpException("Invalid occurence range at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 1887 readPattern(); 1888 if ( maxOccur > 0 && maxOccur < minOccur ) 1889 throw new RegExpException("Invalid occurence range (max < min) at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 1890 } 1891 1892 trans_t clone(trans_t t) 1893 { 1894 if ( t is null ) 1895 return null; 1896 trans_t* tmp = t in clonedTransitions; 1897 if ( tmp !is null ) 1898 return *tmp; 1899 1900 trans_t t2 = new trans_t(t.priorityClass); 1901 clonedTransitions[t] = t2; 1902 t2.tag = t.tag; 1903 t2.priority = t.priority; 1904 t2.predicate = t.predicate; 1905 t2.target = clone(t.target); 1906 transitions ~= t2; 1907 return t2; 1908 } 1909 1910 state_t clone(state_t s) 1911 { 1912 if ( s is null ) 1913 return null; 1914 state_t* tmp = s in clonedStates; 1915 if ( tmp !is null ) 1916 return *tmp; 1917 1918 state_t s2 = new state_t; 1919 clonedStates[s] = s2; 1920 s2.accept = s.accept; 1921 s2.visited = s.visited; 1922 foreach ( t; s.transitions ) 1923 s2.transitions ~= clone(t); 1924 s2.index = states.length; 1925 states ~= s2; 1926 return s2; 1927 } 1928 1929 frag_t clone(frag_t f) 1930 { 1931 if ( f is null ) 1932 return null; 1933 clonedStates = null; 1934 clonedTransitions = null; 1935 1936 frag_t f2 = new frag_t; 1937 foreach ( t; f.entries ) 1938 f2.entries ~= clone(t); 1939 foreach ( t; f.exits ) 1940 f2.exits ~= clone(t); 1941 foreach ( t; f.entry_state ) 1942 f2.entry_state ~= clone(t); 1943 foreach ( t; f.exit_state ) 1944 f2.exit_state ~= clone(t); 1945 return f2; 1946 } 1947 1948 //--------------------------------------------------------------------------------------------- 1949 // Thompson constructions of NFA fragments 1950 1951 frag_t constructSingleChar(char_t c, predicate_t.Type type) 1952 { 1953 debug(tnfa) Stdout.formatln("constructCharFrag {}", c); 1954 1955 trans_t trans = addTransition(); 1956 trans.predicate.appendInput(CharRange!(char_t)(c)); 1957 1958 trans.predicate.type = type; 1959 1960 frag_t frag = new frag_t; 1961 frag.exit_state ~= trans; 1962 frag.entries ~= trans; 1963 return frag; 1964 } 1965 1966 frag_t constructChars(string_t chars, in predicate_t.Type type) 1967 { 1968 cc_t cc; 1969 for ( int i = 0; i < chars.length; ++i ) 1970 cc.add(chars[i]); 1971 1972 return constructChars(cc, type); 1973 } 1974 1975 frag_t constructChars(const(cc_t) charclass, in predicate_t.Type type) 1976 { 1977 debug(tnfa) Stdout.format("constructChars type={}", type); 1978 1979 trans_t trans = addTransition(); 1980 trans.predicate.type = type; 1981 1982 trans.predicate.setInput(cc_t(charclass)); 1983 1984 trans.predicate.optimize(); 1985 debug(tnfa) Stdout.formatln("-> {}", trans.predicate.toString); 1986 1987 frag_t frag = new frag_t; 1988 frag.exit_state ~= trans; 1989 frag.entries ~= trans; 1990 return frag; 1991 } 1992 1993 frag_t constructCharClass(predicate_t.Type type) 1994 { 1995 debug(tnfa) Stdout.format("constructCharClass type={}", type); 1996 auto oldCursor = cursor; 1997 1998 trans_t trans = addTransition(); 1999 2000 bool negated=false; 2001 if ( peekPattern() == '^' ) { 2002 readPattern(); 2003 negated = true; 2004 } 2005 2006 char_t last; 2007 bool have_range_start, 2008 first_char = true; 2009 for ( ; !endOfPattern() && peekPattern() != ']'; ) 2010 { 2011 dchar c = readPattern(); 2012 switch ( c ) 2013 { 2014 case '-': 2015 if ( first_char ) { 2016 trans.predicate.appendInput(range_t(c)); 2017 break; 2018 } 2019 if ( !have_range_start ) 2020 throw new RegExpException("Missing range start for '-' operator after \""~Utf.toString(pattern).idup~"\""); 2021 else if ( endOfPattern() || peekPattern() == ']' ) 2022 throw new RegExpException("Missing range end for '-' operator after \""~Utf.toString(pattern).idup~"\""); 2023 else { 2024 c = readPattern(); 2025 trans.predicate.appendInput(range_t(last, c)); 2026 have_range_start = false; 2027 } 2028 break; 2029 case '\\': 2030 if ( endOfPattern() ) 2031 throw new RegExpException("unexpected end of string after \""~Utf.toString(pattern).idup~"\""); 2032 c = readPattern(); 2033 switch ( c ) 2034 { 2035 case 't': 2036 c = '\t'; 2037 break; 2038 case 'n': 2039 c = '\n'; 2040 break; 2041 case 'r': 2042 c = '\r'; 2043 break; 2044 default: 2045 break; 2046 } 2047 goto default; 2048 default: 2049 if ( have_range_start ) 2050 trans.predicate.appendInput(range_t(last)); 2051 last = c; 2052 have_range_start = true; 2053 } 2054 first_char = false; 2055 } 2056 if ( !endOfPattern() ) 2057 readPattern(); 2058 if ( last != char_t.init ) 2059 trans.predicate.appendInput(range_t(last)); 2060 debug(tnfa) Stdout.formatln(" {}", pattern[oldCursor..cursor]); 2061 2062 if ( negated ) { 2063 auto tmp = cc_t(cc_t.any_char); 2064 tmp.subtract(trans.predicate.input); 2065 trans.predicate.input = tmp; 2066 } 2067 else 2068 trans.predicate.optimize(); 2069 debug(tnfa) Stdout.formatln("-> {}", trans.predicate.toString); 2070 2071 trans.predicate.type = type; 2072 2073 frag_t frag = new frag_t; 2074 frag.exit_state ~= trans; 2075 frag.entries ~= trans; 2076 return frag; 2077 } 2078 2079 void constructBracket(List!(frag_t) frags, uint tag=0) 2080 { 2081 debug(tnfa) Stdout.formatln("constructBracket"); 2082 2083 state_t entry = addState(), 2084 exit = addState(); 2085 frags.tail.value.setEntry(entry); 2086 frags.tail.value.setExit(exit); 2087 2088 trans_t tag1 = addTransition(), 2089 tag2 = addTransition(); 2090 tag1.predicate.type = predicate_t.Type.epsilon; 2091 tag2.predicate.type = predicate_t.Type.epsilon; 2092 if ( tag > 0 ) 2093 { 2094 // make sure the tag indeces for bracket x are always 2095 // x*2 for the opening bracket and x*2+1 for the closing bracket 2096 tag1.tag = tag++; 2097 tag2.tag = tag; 2098 } 2099 tag1.target = entry; 2100 exit.transitions ~= tag2; 2101 2102 frag_t frag = new frag_t; 2103 frag.entries ~= tag1; 2104 frag.exit_state ~= tag2; 2105 frags.pop(); 2106 frags ~= frag; 2107 } 2108 2109 void constructOneMore(List!(frag_t) frags, PriorityClass prioClass) 2110 { 2111 debug(tnfa) Stdout.formatln("constructOneMore"); 2112 2113 if ( frags.empty() ) 2114 throw new RegExpException("too few arguments for + at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2115 2116 trans_t repeat = addTransition(prioClass), 2117 cont = addTransition(); 2118 repeat.predicate.type = predicate_t.Type.epsilon; 2119 cont.predicate.type = predicate_t.Type.epsilon; 2120 2121 state_t s = addState(); 2122 frags.tail.value.setExit(s); 2123 s.transitions ~= repeat; 2124 s.transitions ~= cont; 2125 2126 frag_t frag = new frag_t; 2127 frag.entries ~= frags.tail.value.entries; 2128 frag.entry_state ~= frags.tail.value.entry_state; 2129 frag.entry_state ~= repeat; 2130 frag.exit_state ~= cont; 2131 frags.pop(); 2132 frags ~= frag; 2133 } 2134 2135 void constructZeroMore(List!(frag_t) frags, PriorityClass prioClass) 2136 { 2137 debug(tnfa) Stdout.formatln("constructZeroMore"); 2138 2139 if ( frags.empty() ) 2140 throw new RegExpException("too few arguments for * at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2141 2142 trans_t enter = addTransition(prioClass), 2143 repeat = addTransition(prioClass), 2144 skip = addTransition(); 2145 skip.predicate.type = predicate_t.Type.epsilon; 2146 repeat.predicate.type = predicate_t.Type.epsilon; 2147 enter.predicate.type = predicate_t.Type.epsilon; 2148 2149 state_t entry = addState(), 2150 exit = addState(); 2151 frags.tail.value.setEntry(entry); 2152 frags.tail.value.setExit(exit); 2153 exit.transitions ~= repeat; 2154 enter.target = entry; 2155 2156 frag_t frag = new frag_t; 2157 frag.entries ~= skip; 2158 frag.entries ~= enter; 2159 frag.exit_state ~= skip; 2160 frag.entry_state ~= repeat; 2161 frags.pop(); 2162 frags ~= frag; 2163 } 2164 2165 void constructZeroOne(List!(frag_t) frags, PriorityClass prioClass) 2166 { 2167 debug(tnfa) Stdout.formatln("constructZeroOne"); 2168 2169 if ( frags.empty() ) 2170 throw new RegExpException("too few arguments for ? at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2171 2172 trans_t use = addTransition(prioClass), 2173 skip = addTransition(); 2174 use.predicate.type = predicate_t.Type.epsilon; 2175 skip.predicate.type = predicate_t.Type.epsilon; 2176 2177 state_t s = addState(); 2178 frags.tail.value.setEntry(s); 2179 use.target = s; 2180 2181 frag_t frag = new frag_t; 2182 frag.entries ~= use; 2183 frag.entries ~= skip; 2184 frag.exits ~= frags.tail.value.exits; 2185 frag.exit_state ~= frags.tail.value.exit_state; 2186 frag.exit_state ~= skip; 2187 frags.pop(); 2188 frags ~= frag; 2189 } 2190 2191 void constructOccur(List!(frag_t) frags, uint minOccur, uint maxOccur, PriorityClass prioClass) 2192 { 2193 debug(tnfa) Stdout.formatln("constructOccur {},{}", minOccur, maxOccur); 2194 2195 if ( frags.empty() ) 2196 throw new RegExpException("too few arguments for {x,y} at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2197 2198 state_t s; 2199 frag_t total = new frag_t, 2200 prev; 2201 2202 for ( int i = 0; i < minOccur; ++i ) 2203 { 2204 frag_t f = clone(frags.tail.value); 2205 if ( prev !is null ) { 2206 s = addState(); 2207 prev.setExit(s); 2208 f.setEntry(s); 2209 } 2210 else { 2211 total.entries = f.entries; 2212 total.entry_state = f.entry_state; 2213 } 2214 prev = f; 2215 } 2216 2217 if ( maxOccur == 0 ) 2218 { 2219 frag_t f = frags.tail.value; 2220 trans_t t = addTransition(); 2221 t.predicate.type = predicate_t.Type.epsilon; 2222 f.entries ~= t; 2223 f.exit_state ~= t; 2224 2225 t = addTransition(); 2226 t.predicate.type = predicate_t.Type.epsilon; 2227 f.exits ~= t; 2228 f.entry_state ~= t; 2229 2230 s = addState(); 2231 f.setEntry(s); 2232 2233 if ( prev !is null ) 2234 prev.setExit(s); 2235 else { 2236 total.entries = f.entries; 2237 total.entry_state = f.entry_state; 2238 } 2239 2240 prev = f; 2241 } 2242 2243 for ( int i = minOccur; i < maxOccur; ++i ) 2244 { 2245 frag_t f; 2246 if ( i < maxOccur-1 ) 2247 f = clone(frags.tail.value); 2248 else 2249 f = frags.tail.value; 2250 trans_t t = addTransition(); 2251 t.predicate.type = predicate_t.Type.epsilon; 2252 f.entries ~= t; 2253 f.exit_state ~= t; 2254 2255 if ( prev !is null ) { 2256 s = addState(); 2257 prev.setExit(s); 2258 f.setEntry(s); 2259 } 2260 else { 2261 total.entries = f.entries; 2262 total.entry_state = f.entry_state; 2263 } 2264 prev = f; 2265 } 2266 2267 total.exits = prev.exits; 2268 total.exit_state = prev.exit_state; 2269 2270 frags.pop(); 2271 frags ~= total; 2272 } 2273 2274 void constructAltern(List!(frag_t) frags) 2275 { 2276 debug(tnfa) Stdout.formatln("constructAltern"); 2277 2278 if ( frags.empty() || frags.head is frags.tail ) 2279 throw new RegExpException("too few arguments for | at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2280 2281 frag_t frag = new frag_t, 2282 f1 = frags.tail.value, 2283 f2 = frags.tail.prev.value; 2284 frag.entry_state ~= f2.entry_state; 2285 frag.entry_state ~= f1.entry_state; 2286 frag.exit_state ~= f2.exit_state; 2287 frag.exit_state ~= f1.exit_state; 2288 frag.entries ~= f2.entries; 2289 frag.entries ~= f1.entries; 2290 frag.exits ~= f2.exits; 2291 frag.exits ~= f1.exits; 2292 2293 frags.pop(); 2294 frags.pop(); 2295 frags ~= frag; 2296 } 2297 2298 void constructConcat(List!(frag_t) frags) 2299 { 2300 debug(tnfa) Stdout.formatln("constructConcat"); 2301 2302 if ( frags.empty() || frags.head is frags.tail ) 2303 throw new RegExpException("too few operands for concatenation at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2304 2305 frag_t f1 = frags.tail.value, 2306 f2 = frags.tail.prev.value; 2307 2308 state_t state = addState(); 2309 f2.setExit(state); 2310 f1.setEntry(state); 2311 2312 frag_t frag = new frag_t; 2313 frag.entries ~= f2.entries; 2314 frag.exits ~= f1.exits; 2315 frag.entry_state ~= f2.entry_state; 2316 frag.exit_state ~= f1.exit_state; 2317 frags.pop(); 2318 frags.pop(); 2319 frags ~= frag; 2320 } 2321 } 2322 import tango.core.Array; 2323 2324 /* ************************************************************************************************ 2325 Tagged DFA 2326 **************************************************************************************************/ 2327 private class TDFA(char_t) 2328 { 2329 alias Predicate!(char_t) predicate_t; 2330 alias CharRange!(char_t) range_t; 2331 alias CharClass!(char_t) charclass_t; 2332 alias char_t[] string_t; 2333 2334 enum uint CURRENT_POSITION_REGISTER = ~0; 2335 2336 /* ******************************************************************************************** 2337 Tag map assignment command 2338 **********************************************************************************************/ 2339 struct Command 2340 { 2341 uint dst, /// register index to recieve data 2342 src; /// register index or CURRENT_POSITION_REGISTER for current position 2343 2344 string toString() 2345 { 2346 return Format.convert("{}<-{}", dst, src==CURRENT_POSITION_REGISTER?"p":Format.convert("{}", src)).idup; 2347 } 2348 2349 /* **************************************************************************************** 2350 Order transitions by the order of their predicates. 2351 ******************************************************************************************/ 2352 int opCmp(const ref Command cmd) const 2353 { 2354 if ( src == CURRENT_POSITION_REGISTER && cmd.src != CURRENT_POSITION_REGISTER ) 2355 return 1; 2356 if ( src != CURRENT_POSITION_REGISTER && cmd.src == CURRENT_POSITION_REGISTER ) 2357 return -1; 2358 if ( dst < cmd.dst ) 2359 return -1; 2360 if ( dst == cmd.dst ) 2361 return 0; 2362 return 1; 2363 } 2364 } 2365 2366 struct TagIndex 2367 { 2368 uint tag, 2369 index; 2370 2371 const int opCmp(ref const TagIndex o) 2372 { 2373 if (tag == o.tag && index == o.index) 2374 return 0; 2375 if (tag > o.tag) 2376 return 1; 2377 if (tag < o.tag) 2378 return -1; 2379 if (index > o.index) 2380 return 1; 2381 if (index < o.index) 2382 return -1; 2383 assert(0); 2384 } 2385 } 2386 2387 /* ******************************************************************************************** 2388 TDFA state 2389 **********************************************************************************************/ 2390 class State 2391 { 2392 enum Mode { 2393 GENERIC, MIXED, LOOKUP 2394 } 2395 2396 enum uint LOOKUP_LENGTH = 256, 2397 INVALID_STATE = 255; 2398 2399 bool accept = false; 2400 bool reluctant = false; 2401 size_t index; 2402 Transition[] transitions, 2403 generic_transitions; 2404 Command[] finishers; 2405 2406 ubyte[] lookup; 2407 Mode mode; 2408 2409 void optimize() 2410 { 2411 // merge transitions with equal targets (same state index and equal commands) 2412 size_t[] remove_indeces; 2413 foreach ( i, t; transitions[0 .. $-1] ) 2414 { 2415 foreach ( t2; transitions[i+1 .. $] ) 2416 { 2417 if ( t.predicate.type != t2.predicate.type || !t.equalTarget(t2) ) 2418 continue; 2419 t2.predicate.appendInput(t.predicate); 2420 remove_indeces ~= i; 2421 break; 2422 } 2423 } 2424 2425 // remove transitions that have been merged into another 2426 if ( remove_indeces.length > 0 ) 2427 { 2428 Transition[] tmp; 2429 tmp.length = transitions.length - remove_indeces.length; 2430 size_t next_remove, next; 2431 foreach ( i, t; transitions ) 2432 { 2433 if ( next_remove < remove_indeces.length && remove_indeces[next_remove] == i ) { 2434 ++next_remove; 2435 continue; 2436 } 2437 tmp[next++] = t; 2438 } 2439 transitions = tmp; 2440 2441 foreach ( t; transitions ) 2442 t.predicate.optimize(); 2443 } 2444 } 2445 2446 void createLookup() 2447 { 2448 size_t count; 2449 foreach ( t; transitions ) 2450 { 2451 if ( !t.predicate.exceedsMax(LOOKUP_LENGTH) ) 2452 ++count; 2453 } 2454 2455 if ( count < 1 || transitions.length > INVALID_STATE ) { 2456 generic_transitions.length = transitions.length; 2457 generic_transitions[] = transitions[]; 2458 return; 2459 } 2460 2461 foreach ( t; transitions ) 2462 { 2463 if ( t.predicate.exceedsMax(LOOKUP_LENGTH) ) 2464 generic_transitions ~= t; 2465 } 2466 2467 // setup lookup table 2468 lookup.length = LOOKUP_LENGTH; 2469 lookup[] = INVALID_STATE; 2470 foreach ( i, t; transitions ) 2471 { 2472 foreach ( p; t.predicate.input.parts ) 2473 { 2474 if ( p.l_ >= lookup.length ) 2475 continue; 2476 for ( char_t c = p.l_; c <= min(p.r_, LOOKUP_LENGTH-1); ++c ) 2477 lookup[c] = cast(ubyte)i; 2478 } 2479 } 2480 2481 mode = count < transitions.length? Mode.MIXED : Mode.LOOKUP; 2482 } 2483 } 2484 2485 /* ******************************************************************************************** 2486 TDFA transition 2487 **********************************************************************************************/ 2488 class Transition 2489 { 2490 State target; 2491 predicate_t predicate; 2492 Command[] commands; 2493 2494 /* **************************************************************************************** 2495 Order transitions by the order of their predicates. 2496 ******************************************************************************************/ 2497 override final int opCmp(Object o) 2498 { 2499 Transition t = cast(Transition)o; 2500 assert(t !is null); 2501 return predicate.opCmp(t.predicate); 2502 } 2503 2504 override final const bool opEquals(const Object o) 2505 { 2506 auto t = cast(const(Transition))o; 2507 if ( t is null ) 2508 return false; 2509 if ( equalTarget(t) && t.predicate == predicate ) 2510 return true; 2511 return false; 2512 } 2513 2514 const bool equalTarget(in Transition t) 2515 { 2516 if ( t.target.index != target.index ) 2517 return false; 2518 if ( commands.length != t.commands.length ) 2519 return false; 2520 Louter: foreach ( cmd; commands ) 2521 { 2522 foreach ( cmd2; t.commands ) 2523 { 2524 if ( cmd == cmd2 ) 2525 continue Louter; 2526 } 2527 return false; 2528 } 2529 return true; 2530 } 2531 } 2532 2533 2534 State[] states; 2535 State start; 2536 Command[] initializer; 2537 uint num_tags; 2538 2539 uint[TagIndex] registers; 2540 uint next_register; 2541 2542 uint num_regs() 2543 { 2544 return next_register; 2545 } 2546 2547 /* ******************************************************************************************** 2548 Constructs the TDFA from the given TNFA using extended power set method 2549 **********************************************************************************************/ 2550 this(TNFA!(char_t) tnfa) 2551 { 2552 num_tags = tnfa.tagCount(); 2553 2554 next_register = num_tags; 2555 for ( int i = 1; i <= num_tags; ++i ) { 2556 TagIndex ti; 2557 ti.tag = i; 2558 registers[ti] = i-1; 2559 } 2560 2561 // create epsilon closure of TNFA start state 2562 SubsetState subset_start = new SubsetState; 2563 StateElement se = new StateElement; 2564 se.nfa_state = tnfa.start; 2565 subset_start.elms ~= se; 2566 2567 // apply lookbehind closure for string/line start 2568 predicate_t tmp_pred; 2569 tmp_pred.setInput(CharClass!(char_t).line_startend); 2570 subset_start = epsilonClosure(lookbehindClosure(epsilonClosure(subset_start, subset_start), tmp_pred), subset_start); 2571 2572 start = addState(); 2573 subset_start.dfa_state = start; 2574 2575 // generate initializer and finisher commands for TDFA start state 2576 generateInitializers(subset_start); 2577 generateFinishers(subset_start); 2578 2579 // initialize stack for state traversal 2580 List!(SubsetState) subset_states = new List!(SubsetState), 2581 unmarked = new List!(SubsetState); 2582 subset_states ~= subset_start; 2583 unmarked ~= subset_start; 2584 debug(tdfa) Stdout.formatln("\n{} = {}\n", subset_start.dfa_state.index, subset_start.toString); 2585 2586 while ( !unmarked.empty() ) 2587 { 2588 SubsetState state = unmarked.tail.value; 2589 unmarked.pop(); 2590 2591 // create transitions for each class, creating new states when necessary 2592 foreach ( pred; disjointPredicates(state) ) 2593 { 2594 // find NFA state we reach with pred 2595 // reach will set predicate type correctly 2596 debug(tdfa) Stdout.format("from {} with {} reach", state.dfa_state.index, pred.toString); 2597 SubsetState target = reach(state, pred); 2598 if ( target is null ) { 2599 continue; 2600 debug(tdfa) Stdout.formatln(" nothing - lookbehind at beginning, skipping"); 2601 } 2602 debug(tdfa) Stdout.formatln(" {}", target.toString); 2603 target = epsilonClosure(lookbehindClosure(epsilonClosure(target, state), pred), state); 2604 2605 Transition trans = new Transition; 2606 state.dfa_state.transitions ~= trans; 2607 debug (tdfa_new_trans) Stdout.formatln("Creating with pred: {}", pred); 2608 trans.predicate = pred; 2609 2610 // generate indeces for pos commands 2611 // delay creation of pos command until we have reorder-commands 2612 uint[uint] cmds = null; 2613 foreach ( e; target.elms ) 2614 { 2615 foreach ( tag, ref index; e.tags ) 2616 { 2617 bool found=false; 2618 foreach ( e2; state.elms ) 2619 { 2620 int* i = tag in e2.tags; 2621 if ( i !is null && *i == index ) { 2622 found=true; 2623 break; 2624 } 2625 } 2626 if ( !found ) 2627 { 2628 // if index is < 0 it is a temporary index 2629 // used only to distinguish the state from existing ones. 2630 // the previous index can be reused instead. 2631 if ( index < 0 ) 2632 index = -index-1; 2633 cmds[tag] = index; 2634 } 2635 else 2636 assert(index>=0); 2637 } 2638 } 2639 2640 // check whether a state exists that is identical except for tag index reorder-commands 2641 bool exists=false; 2642 foreach ( equivTarget; subset_states ) 2643 { 2644 if ( reorderTagIndeces(target, equivTarget, state, trans) ) { 2645 target = equivTarget; 2646 exists = true; 2647 break; 2648 } 2649 } 2650 // else create new target state 2651 if ( !exists ) 2652 { 2653 State ts = addState(); 2654 target.dfa_state = ts; 2655 subset_states ~= target; 2656 unmarked ~= target; 2657 debug(tdfa_add) { 2658 Stdout.formatln("\nAdded {} = {}\n", target.dfa_state.index, target.toString); 2659 } 2660 generateFinishers(target); 2661 } 2662 2663 // now generate pos commands, rewriting reorder-commands if existent 2664 foreach ( tag, index; cmds ) 2665 { 2666 // check whether reordering used this tag, if so, overwrite the command directly, 2667 // for it's effect would be overwritten by a subsequent pos-command anyway 2668 uint reg = registerFromTagIndex(tag, index); 2669 bool found = false; 2670 foreach ( ref cmd; trans.commands ) 2671 { 2672 if ( cmd.src == reg ) { 2673 found = true; 2674 cmd.src = CURRENT_POSITION_REGISTER; 2675 break; 2676 } 2677 } 2678 if ( !found ) { 2679 Command cmd; 2680 cmd.dst = reg; 2681 cmd.src = CURRENT_POSITION_REGISTER; 2682 trans.commands ~= cmd; 2683 } 2684 } 2685 2686 trans.target = target.dfa_state; 2687 debug(tdfa) { 2688 Stdout.formatln("=> from {} with {} reach {}", state.dfa_state.index, pred.toString, target.dfa_state.index); 2689 } 2690 } 2691 2692 state.dfa_state.optimize(); 2693 } 2694 2695 // renumber registers continuously 2696 uint[uint] regNums; 2697 2698 for ( next_register = 0; next_register < num_tags; ++next_register ) 2699 regNums[next_register] = next_register; 2700 2701 void renumberCommand(ref Command cmd) 2702 { 2703 if ( cmd.src != CURRENT_POSITION_REGISTER && (cmd.src in regNums) is null ) 2704 regNums[cmd.src] = next_register++; 2705 if ( (cmd.dst in regNums) is null ) 2706 regNums[cmd.dst] = next_register++; 2707 if ( cmd.src != CURRENT_POSITION_REGISTER ) 2708 cmd.src = regNums[cmd.src]; 2709 cmd.dst = regNums[cmd.dst]; 2710 } 2711 2712 foreach ( state; states ) 2713 { 2714 foreach ( ref cmd; state.finishers ) 2715 renumberCommand(cmd); 2716 // make sure pos-commands are executed after reorder-commands and 2717 // reorder-commands do not overwrite each other 2718 tango.core.Array.sort(state.finishers); 2719 2720 foreach ( trans; state.transitions ) 2721 { 2722 foreach ( ref cmd; trans.commands ) 2723 renumberCommand(cmd); 2724 tango.core.Array.sort(trans.commands); 2725 trans.predicate.compile(); 2726 } 2727 } 2728 2729 debug(TangoRegex) 2730 { 2731 foreach ( ref v; registers ) 2732 { 2733 if ( (v in regNums) !is null ) 2734 v = regNums[v]; 2735 } 2736 } 2737 2738 minimizeDFA(); 2739 2740 foreach ( state; states ) 2741 state.createLookup(); 2742 2743 // TODO: optimize memory layout of TDFA 2744 2745 // TODO: add lookahead for string-end somewhere 2746 // TODO: mark dead-end states (not leaving a non-finishing susbet) 2747 // TODO: mark states that can leave the finishing subset of DFA states or use a greedy transition 2748 // (execution may stop in that state) 2749 } 2750 2751 /* ******************************************************************************************** 2752 Print the TDFA (tabular representation of the delta function) 2753 **********************************************************************************************/ 2754 debug(TangoRegex) void print() 2755 { 2756 Stdout.formatln("#tags = {}", num_tags); 2757 2758 auto tis = new TagIndex[registers.length]; 2759 foreach ( k, v; registers ) 2760 tis [v] = k; 2761 foreach ( r, ti; tis ) { 2762 Stdout.formatln("tag({},{}) in reg {}", ti.tag, ti.index, r); 2763 } 2764 Stdout.formatln("Initializer:"); 2765 foreach ( cmd; initializer ) { 2766 Stdout.formatln("{}", cmd.toString); 2767 } 2768 Stdout.formatln("Delta function:"); 2769 foreach ( int i, s; states ) 2770 { 2771 Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" "); 2772 2773 bool first=true; 2774 Stdout(" {"); 2775 foreach ( t; s.transitions ) 2776 { 2777 Stdout.format("{}{}->{} (", first?"":", ", t.predicate.toString, t.target.index); 2778 bool firstcmd=true; 2779 foreach ( cmd; t.commands ) 2780 { 2781 if ( firstcmd ) 2782 firstcmd = false; 2783 else 2784 Stdout(","); 2785 Stdout.format("{}", cmd.toString); 2786 } 2787 Stdout(")"); 2788 first = false; 2789 } 2790 Stdout("} ("); 2791 2792 bool firstcmd=true; 2793 foreach ( cmd; s.finishers ) 2794 { 2795 if ( firstcmd ) 2796 firstcmd = false; 2797 else 2798 Stdout(","); 2799 Stdout.format("{}", cmd.toString); 2800 } 2801 Stdout.formatln(")"); 2802 } 2803 } 2804 2805 private: 2806 /* ******************************************************************************************** 2807 A (TNFA state, tags) pair element of a subset state. 2808 **********************************************************************************************/ 2809 class StateElement 2810 { 2811 TNFAState!(char_t) nfa_state; 2812 int[uint] tags; 2813 // use place-value priority with 2 places, value(maxPrio) > value(lastPrio) 2814 uint maxPriority, 2815 lastPriority; 2816 2817 bool prioGreater(StateElement se) 2818 { 2819 if ( maxPriority < se.maxPriority ) 2820 return true; 2821 if ( maxPriority == se.maxPriority ) { 2822 assert(lastPriority != se.lastPriority); 2823 return lastPriority < se.lastPriority; 2824 } 2825 return false; 2826 } 2827 2828 override int opCmp(Object o) 2829 { 2830 StateElement se = cast(StateElement)o; 2831 assert(se !is null); 2832 if ( maxPriority < se.maxPriority ) 2833 return 1; 2834 if ( maxPriority == se.maxPriority ) 2835 { 2836 if ( lastPriority == se.lastPriority ) 2837 return 0; 2838 return lastPriority < se.lastPriority; 2839 } 2840 return -1; 2841 } 2842 2843 override string toString() 2844 { 2845 const(char)[] str; 2846 str = Format.convert("{} p{}.{} {{", nfa_state.index, maxPriority, lastPriority); 2847 bool first = true; 2848 foreach ( k, v; tags ) { 2849 str ~= Format.convert("{}m({},{})", first?"":",", k, v); 2850 first = false; 2851 } 2852 str ~= "}"; 2853 return str.idup; 2854 } 2855 } 2856 2857 /* ******************************************************************************************** 2858 Represents a state in the NFA to DFA conversion. 2859 Contains the set of states (StateElements) the NFA might be in at the same time and the 2860 corresponding DFA state that we create. 2861 **********************************************************************************************/ 2862 class SubsetState 2863 { 2864 StateElement[] elms; 2865 State dfa_state; 2866 2867 this(StateElement[] elms=null) 2868 { 2869 this.elms = elms; 2870 } 2871 2872 int opApply(int delegate (ref TNFATransition!(char_t)) dg) 2873 { 2874 int res; 2875 foreach ( elm; elms ) 2876 { 2877 foreach ( t; elm.nfa_state.transitions ) 2878 { 2879 res = dg(t); 2880 if ( res ) 2881 return res; 2882 } 2883 } 2884 return res; 2885 } 2886 2887 override string toString() 2888 { 2889 string str = "[ "; 2890 bool first = true; 2891 foreach ( s; elms ) { 2892 if ( !first ) 2893 str ~= ", "; 2894 str ~= s.toString(); 2895 first = false; 2896 } 2897 return str~" ]"; 2898 } 2899 } 2900 2901 /* ******************************************************************************************** 2902 Temporary structure used for disjoint predicate computation 2903 **********************************************************************************************/ 2904 struct Mark 2905 { 2906 char_t c; 2907 bool end; /// false = start of range 2908 2909 int opCmp(const ref Mark m) const 2910 { 2911 if ( c < m.c ) 2912 return -1; 2913 if ( c > m.c ) 2914 return 1; 2915 if ( end < m.end ) 2916 return -1; 2917 if ( end > m.end ) 2918 return 1; 2919 return 0; 2920 } 2921 } 2922 2923 /* ******************************************************************************************** 2924 Calculates the register index for a given tag map entry. The TDFA implementation uses 2925 registers to save potential tag positions, the index space gets linearized here. 2926 2927 Params: tag = tag number 2928 index = tag map index 2929 Returns: index of the register to use for the tag map entry 2930 **********************************************************************************************/ 2931 uint registerFromTagIndex(uint tag, uint index) 2932 { 2933 if ( index > 0 ) 2934 { 2935 TagIndex ti; 2936 ti.tag = tag; 2937 ti.index = index; 2938 uint* i = ti in registers; 2939 if ( i !is null ) 2940 return *i; 2941 return registers[ti] = next_register++; 2942 } 2943 else 2944 return tag-1; 2945 } 2946 2947 Mark[] marks_; 2948 2949 /* ******************************************************************************************** 2950 Add new TDFA state to the automaton. 2951 **********************************************************************************************/ 2952 State addState() 2953 { 2954 State s = new State; 2955 s.index = states.length; 2956 states ~= s; 2957 return s; 2958 } 2959 2960 /* ******************************************************************************************** 2961 Creates disjoint predicates from all outgoing, potentially overlapping TNFA transitions. 2962 2963 Params: state = SubsetState to create the predicates from 2964 Returns: List of disjoint predicates that can be used for a DFA state 2965 **********************************************************************************************/ 2966 predicate_t[] disjointPredicates(SubsetState state) 2967 { 2968 alias CharRange!(char_t) range_t; 2969 debug(tdfa) Stdout.formatln("disjointPredicates()"); 2970 2971 size_t num_marks; 2972 foreach ( t; state ) 2973 { 2974 // partitioning will consider lookbehind transitions, 2975 // st. lb-closure will not expand for transitions with a superset of the lb-predicate 2976 if ( t.predicate.type != predicate_t.Type.epsilon ) 2977 { 2978 debug(tdfa) Stdout.formatln("{}", t.predicate.toString); 2979 if ( marks_.length < num_marks+2*t.predicate.getInput().parts.length ) 2980 marks_.length = num_marks+2*t.predicate.getInput().parts.length; 2981 foreach ( p; t.predicate.getInput().parts ) { 2982 marks_[num_marks++] = Mark(p.l(), false); 2983 marks_[num_marks++] = Mark(p.r(), true); 2984 } 2985 } 2986 } 2987 2988 if ( num_marks <= 1 ) 2989 throw new Exception("disjointPredicates: No transitions in subset state"); 2990 2991 debug(tdfa) Stdout("\nsorting...").newline; 2992 // using built-in sort somtimes gives an AV in TypeInfo.swap 2993 quickSort(marks_[0 .. num_marks]); 2994 assert(!marks_[0].end); 2995 2996 debug(tdfa) 2997 { 2998 Stdout("\nsorted marks:\n"); 2999 bool first=true; 3000 foreach ( m; marks_[0 .. num_marks] ) 3001 { 3002 if ( first ) 3003 first = false; 3004 else 3005 Stdout(","); 3006 if ( m.c > 0x20 && m.c < 0x7f ) 3007 Stdout.format("{}{}", m.end?"e":"s", m.c); 3008 else 3009 Stdout.format("{}{:x}", m.end?"e":"s", cast(int)m.c); 3010 } 3011 Stdout.newline; 3012 } 3013 3014 size_t next, 3015 active = 1; 3016 char_t start = marks_[0].c, 3017 end; 3018 range_t[] disjoint = new range_t[num_marks/2+1]; 3019 3020 foreach ( m; marks_[1 .. num_marks] ) 3021 { 3022 if ( m.end ) 3023 { 3024 assert(active>0); 3025 --active; 3026 if ( m.c < start ) 3027 continue; 3028 end = m.c; 3029 // the next range cannot start at the same pos 3030 // because starts are sorted before endings 3031 if ( active > 0 ) 3032 ++m.c; 3033 } 3034 else 3035 { 3036 ++active; 3037 if ( active == 1 ) 3038 { 3039 // skip uncovered interval 3040 if ( m.c > start ) { 3041 start = m.c; 3042 continue; 3043 } 3044 end = m.c; 3045 ++m.c; 3046 } 3047 // skip range start if cursor already marks it 3048 else if ( m.c <= start ) 3049 continue; 3050 else 3051 end = m.c-1; 3052 } 3053 3054 // save range 3055 if ( disjoint.length <= next ) 3056 disjoint.length = disjoint.length*2; 3057 3058 disjoint[next].l_ = start; 3059 disjoint[next].r_ = end; 3060 ++next; 3061 3062 // advance cursor 3063 start = m.c; 3064 } 3065 disjoint.length = next; 3066 3067 // merge isolated ranges into sets of ranges 3068 // no range in a set may occur separated from the others in any predicate 3069 predicate_t[] preds; 3070 preds.length = 1; 3071 Lmerge: foreach ( r; disjoint ) 3072 { 3073 if ( preds[$-1].empty() ) 3074 preds[$-1].appendInput(r); 3075 else 3076 { 3077 // we can merge r into the current predicate if 3078 // pred contains r <=> pred contains all the other ranges 3079 foreach ( t; state ) 3080 { 3081 if ( t.predicate.type == predicate_t.Type.epsilon ) 3082 continue; 3083 3084 if ( t.predicate.getInput().contains(r) 3085 != t.predicate.getInput().contains(preds[$-1].getInput()) ) 3086 { 3087 preds.length = preds.length+1; 3088 break; 3089 } 3090 } 3091 preds[$-1].appendInput(r); 3092 } 3093 } 3094 3095 debug(tdfa) 3096 { 3097 Stdout("\ndisjoint ranges:\n"); 3098 first=true; 3099 foreach ( r; disjoint ) 3100 { 3101 if ( first ) 3102 first = false; 3103 else 3104 Stdout(","); 3105 Stdout.format("{}", r); 3106 } 3107 Stdout.newline; 3108 Stdout("\ndisjoint predicates:\n"); 3109 first=true; 3110 foreach ( ref p; preds ) 3111 { 3112 if ( first ) 3113 first = false; 3114 else 3115 Stdout(","); 3116 Stdout.format("{}", p.toString); 3117 } 3118 Stdout.newline; 3119 } 3120 3121 debug(tdfa) Stdout.formatln("disjointPredicates() end"); 3122 return preds; 3123 } 3124 3125 /* ******************************************************************************************** 3126 Finds all TNFA states that can be reached directly with the given predicate and creates 3127 a new SubsetState containing those target states. 3128 3129 Params: subst = SubsetState to start from 3130 pred = predicate that is matched against outgoing transitions 3131 Returns: SubsetState containing the reached target states 3132 **********************************************************************************************/ 3133 SubsetState reach(SubsetState subst, ref predicate_t pred) 3134 { 3135 // to handle the special case of overlapping consume and lookahead predicates, 3136 // we find the different intersecting predicate types 3137 bool have_consume, 3138 have_lookahead; 3139 foreach ( t; subst ) 3140 { 3141 if ( t.predicate.type != predicate_t.Type.consume && t.predicate.type != predicate_t.Type.lookahead ) 3142 continue; 3143 auto intpred = t.predicate.intersect(pred); 3144 if ( !intpred.empty() ) 3145 { 3146 if ( t.predicate.type == predicate_t.Type.consume ) 3147 have_consume = true; 3148 else if ( t.predicate.type == predicate_t.Type.lookahead ) 3149 have_lookahead = true; 3150 } 3151 } 3152 3153 // if there is consume/lookahead overlap, 3154 // lookahead predicates are handled first 3155 predicate_t.Type processed_type; 3156 if ( have_lookahead ) 3157 processed_type = predicate_t.Type.lookahead; 3158 else if ( have_consume ) 3159 processed_type = predicate_t.Type.consume; 3160 else { 3161 debug(tdfa) Stdout.formatln("\nERROR: reach found no consume/lookahead symbol for {} in \n{}", pred.toString, subst.toString); 3162 return null; 3163 } 3164 pred.type = processed_type; 3165 3166 // add destination states to new subsetstate 3167 SubsetState r = new SubsetState; 3168 foreach ( s; subst.elms ) 3169 { 3170 foreach ( t; s.nfa_state.transitions ) 3171 { 3172 if ( t.predicate.type != processed_type ) 3173 continue; 3174 auto intpred = t.predicate.intersect(pred); 3175 if ( !intpred.empty() ) { 3176 StateElement se = new StateElement; 3177 se.maxPriority = max(t.priority, s.maxPriority); 3178 se.lastPriority = t.priority; 3179 se.nfa_state = t.target; 3180 se.tags = s.tags; 3181 r.elms ~= se; 3182 } 3183 } 3184 } 3185 3186 // if we prioritized lookaheads, the states that may consume are also added to the new subset state 3187 // this behaviour is somewhat similar to an epsilon closure 3188 if ( have_lookahead && have_consume ) 3189 { 3190 foreach ( s; subst.elms ) 3191 { 3192 foreach ( t; s.nfa_state.transitions ) 3193 { 3194 if ( t.predicate.type != predicate_t.Type.consume ) 3195 continue; 3196 auto intpred = t.predicate.intersect(pred); 3197 if ( !intpred.empty() ) { 3198 r.elms ~= s; 3199 break; 3200 } 3201 } 3202 } 3203 } 3204 return r; 3205 } 3206 3207 /* ******************************************************************************************** 3208 Extends the given SubsetState with the states that are reached through lookbehind transitions. 3209 3210 Params: from = SubsetState to create the lookbehind closure for 3211 previous = predicate "from" was reached with 3212 Returns: SubsetState containing "from" and all states of it's lookbehind closure 3213 **********************************************************************************************/ 3214 SubsetState lookbehindClosure(SubsetState from, predicate_t pred) 3215 { 3216 List!(StateElement) stack = new List!(StateElement); 3217 StateElement[size_t] closure; 3218 3219 foreach ( e; from.elms ) 3220 { 3221 stack ~= e; 3222 closure[e.nfa_state.index] = e; 3223 } 3224 3225 while ( !stack.empty() ) 3226 { 3227 StateElement se = stack.tail.value; 3228 stack.pop(); 3229 foreach ( t; se.nfa_state.transitions ) 3230 { 3231 if ( t.predicate.type != predicate_t.Type.lookbehind ) 3232 continue; 3233 if ( t.predicate.intersect(pred).empty() ) 3234 continue; 3235 uint new_maxPri = max(t.priority, se.maxPriority); 3236 3237 StateElement* tmp = t.target.index in closure; 3238 if ( tmp !is null ) 3239 { 3240 // if higher prio (smaller value) exists, do not use this transition 3241 if ( tmp.maxPriority < new_maxPri ) { 3242 // debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}, continuing", t.target.index, tmp.maxPriority, new_maxPri); 3243 continue; 3244 } 3245 else if ( tmp.maxPriority == new_maxPri ) 3246 { 3247 // "equal lastPrio -> first-come-first-serve" 3248 // doesn't work for lexer - how to solve it properly? 3249 if ( tmp.lastPriority <= t.priority ) { 3250 // debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}, continuing", t.target.index, tmp.lastPriority, t.priority); 3251 continue; 3252 } 3253 // else 3254 // debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}", t.target.index, t.priority, tmp.lastPriority); 3255 } 3256 // else 3257 // debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}", t.target.index, new_maxPri, tmp.maxPriority); 3258 } 3259 3260 StateElement new_se = new StateElement; 3261 new_se.maxPriority = max(t.priority, se.maxPriority); 3262 new_se.lastPriority = t.priority; 3263 new_se.nfa_state = t.target; 3264 new_se.tags = se.tags; 3265 3266 closure[t.target.index] = new_se; 3267 stack ~= new_se; 3268 } 3269 } 3270 3271 SubsetState res = new SubsetState; 3272 res.elms = closure.values; 3273 return res; 3274 } 3275 3276 /* ******************************************************************************************** 3277 Generates the epsilon closure of the given subset state, creating tag map entries 3278 if tags are passed. Takes priorities into account, effectively realizing 3279 greediness and reluctancy. 3280 3281 Params: from = SubsetState to create the epsilon closure for 3282 previous = SubsetState "from" was reached from 3283 Returns: SubsetState containing "from" and all states of it's epsilon closure 3284 **********************************************************************************************/ 3285 SubsetState epsilonClosure(SubsetState from, SubsetState previous) 3286 { 3287 int firstFreeIndex=-1; 3288 foreach ( e; previous.elms ) 3289 { 3290 foreach ( ti; e.tags ) 3291 firstFreeIndex = max(firstFreeIndex, cast(int)ti); 3292 } 3293 ++firstFreeIndex; 3294 3295 List!(StateElement) stack = new List!(StateElement); 3296 StateElement[size_t] closure; 3297 3298 foreach ( e; from.elms ) 3299 { 3300 stack ~= e; 3301 closure[e.nfa_state.index] = e; 3302 } 3303 3304 while ( !stack.empty() ) 3305 { 3306 StateElement se = stack.tail.value; 3307 stack.pop(); 3308 foreach ( t; se.nfa_state.transitions ) 3309 { 3310 if ( t.predicate.type != predicate_t.Type.epsilon ) 3311 continue; 3312 // this is different from Ville Laurikari's algorithm, but it's crucial 3313 // to take the max (instead of t.priority) to make reluctant operators work 3314 uint new_maxPri = max(t.priority, se.maxPriority); 3315 3316 StateElement* tmp = t.target.index in closure; 3317 if ( tmp !is null ) 3318 { 3319 // if higher prio (smaller value) exists, do not use this transition 3320 if ( tmp.maxPriority < new_maxPri ) { 3321 // debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}, continuing", t.target.index, tmp.maxPriority, new_maxPri); 3322 continue; 3323 } 3324 else if ( tmp.maxPriority == new_maxPri ) 3325 { 3326 // "equal lastPrio -> first-come-first-serve" 3327 // doesn't work for lexer - how to solve it properly? 3328 if ( tmp.lastPriority <= t.priority ) { 3329 // debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}, continuing", t.target.index, tmp.lastPriority, t.priority); 3330 continue; 3331 } 3332 // else 3333 // debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}", t.target.index, t.priority, tmp.lastPriority); 3334 } 3335 // else 3336 // debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}", t.target.index, new_maxPri, tmp.maxPriority); 3337 } 3338 3339 auto new_se = new StateElement; 3340 new_se.maxPriority = new_maxPri; 3341 new_se.lastPriority = t.priority; 3342 new_se.nfa_state = t.target; 3343 3344 if ( t.tag > 0 ) 3345 { 3346 foreach ( k, v; se.tags ) 3347 new_se.tags[k] = v; 3348 new_se.tags[t.tag] = firstFreeIndex; 3349 } 3350 else 3351 new_se.tags = se.tags; 3352 3353 closure[t.target.index] = new_se; 3354 stack ~= new_se; 3355 } 3356 } 3357 3358 SubsetState res = new SubsetState; 3359 res.elms = closure.values; 3360 3361 // optimize tag usage 3362 // all we need to do is to check whether the largest tag-index from the 3363 // previous state is actually used in the new state and move all tags with 3364 // firstFreeIndex down by one if not, but only if firstFreeIndex is not 0 3365 if ( firstFreeIndex > 0 ) 3366 { 3367 bool seenLastUsedIndex = false; 3368 sluiLoop: foreach ( e; res.elms ) 3369 { 3370 foreach ( i; e.tags ) 3371 { 3372 if ( i == firstFreeIndex-1 ) { 3373 seenLastUsedIndex = true; 3374 break sluiLoop; 3375 } 3376 } 3377 } 3378 if ( !seenLastUsedIndex ) 3379 { 3380 foreach ( e; res.elms ) 3381 { 3382 foreach ( ref i; e.tags ) 3383 { 3384 // mark index by making it negative 3385 // to signal that it can be decremented 3386 // after it has been detected to be a newly used index 3387 if ( i == firstFreeIndex ) 3388 i = -firstFreeIndex; 3389 } 3390 } 3391 } 3392 } 3393 3394 return res; 3395 } 3396 3397 /* ******************************************************************************************** 3398 Tries to create commands that reorder the tag map of "previous", such that "from" becomes 3399 tag-wise identical to "to". If successful, these commands are added to "trans". This 3400 is done for state re-use. 3401 3402 Params: from = SubsetState to check for tag-wise equality to "to" 3403 to = existing SubsetState that we want to re-use 3404 previous = SubsetState we're coming from 3405 trans = Transition we went along 3406 Returns: true if "from" is tag-wise identical to "to" and the necessary commands have 3407 been added to "trans" 3408 **********************************************************************************************/ 3409 bool reorderTagIndeces(SubsetState from, SubsetState to, SubsetState previous, Transition trans) 3410 { 3411 if ( from.elms.length != to.elms.length ) 3412 return false; 3413 3414 bool[Command] 3415 cmds; 3416 uint[TagIndex] 3417 reorderedIndeces; 3418 StateElement[TagIndex] 3419 reordered_elements; 3420 3421 Louter: foreach ( fe; from.elms ) 3422 { 3423 foreach ( te; to.elms ) 3424 { 3425 if ( te.nfa_state.index != fe.nfa_state.index ) 3426 continue; 3427 if ( fe.tags.length != te.tags.length ) 3428 return false; 3429 foreach ( tag, findex; fe.tags ) 3430 { 3431 if ( (tag in te.tags) is null ) 3432 return false; 3433 3434 TagIndex ti; 3435 ti.tag = tag; 3436 ti.index = te.tags[tag]; 3437 3438 // apply priority for conflicting tag indeces 3439 if ( (ti in reorderedIndeces) !is null ) 3440 { 3441 auto rse = reordered_elements[ti]; 3442 auto ri = reorderedIndeces[ti]; 3443 if ( ri != findex 3444 && ( rse.maxPriority < fe.maxPriority 3445 || rse.maxPriority == fe.maxPriority 3446 && rse.lastPriority <= fe.lastPriority ) 3447 ) 3448 continue; 3449 Command cmd; 3450 cmd.src = registerFromTagIndex(tag,ri); 3451 cmd.dst = registerFromTagIndex(tag,te.tags[tag]); 3452 cmds.remove(cmd); 3453 } 3454 // if target index differs, create reordering command 3455 if ( te.tags[tag] != findex ) 3456 { 3457 Command cmd; 3458 cmd.src = registerFromTagIndex(tag,findex); 3459 cmd.dst = registerFromTagIndex(tag,te.tags[tag]); 3460 cmds[cmd] = true; 3461 } 3462 3463 reorderedIndeces[ti] = findex; 3464 reordered_elements[ti] = fe; 3465 } 3466 continue Louter; 3467 } 3468 return false; 3469 } 3470 3471 debug(tdfa) { 3472 Stdout.formatln("\nreorder {} to {}\n", from.toString, to.dfa_state.index); 3473 } 3474 3475 trans.commands ~= cmds.keys; 3476 return true; 3477 } 3478 3479 /* ******************************************************************************************** 3480 Generate tag map initialization commands for start state. 3481 **********************************************************************************************/ 3482 void generateInitializers(SubsetState start) 3483 { 3484 uint[uint] cmds; 3485 foreach ( nds; start.elms ) 3486 { 3487 foreach ( k, v; nds.tags ) 3488 cmds[k] = v; 3489 } 3490 3491 foreach ( k, v; cmds ) { 3492 Command cmd; 3493 cmd.dst = registerFromTagIndex(k,v); 3494 cmd.src = CURRENT_POSITION_REGISTER; 3495 initializer ~= cmd; 3496 } 3497 } 3498 3499 /* ******************************************************************************************** 3500 Generates finisher commands for accepting states. 3501 **********************************************************************************************/ 3502 void generateFinishers(SubsetState r) 3503 { 3504 // if at least one of the TNFA states accepts, 3505 // set the finishers from active tags in increasing priority 3506 StateElement[] sorted_elms = r.elms.dup; 3507 tango.core.Array.sort(sorted_elms); 3508 bool reluctant = false; 3509 foreach ( se; sorted_elms ) { 3510 debug (Finishers) Stdout.formatln("Finisher: {}", se); 3511 if ( se.nfa_state.accept ) 3512 { 3513 r.dfa_state.accept = true; 3514 3515 // Knowing that we're looking at an epsilon closure with an accepting 3516 // state, we look at the involved transitions - if the path from the 3517 // nfa state in the set with the highest incoming priority (last in 3518 // sorted_elms list) to the accepting nfa state is via the highest 3519 // priority transitions, and they are all epsilon transitions, this 3520 // suggests we're looking at a regex ending with a reluctant pattern. 3521 // The NFA->DFA transformation will most likely extend the automata 3522 // further, but we want the matching to stop here. 3523 // NOTE: The grounds for choosing the last element in sorted_elms 3524 // are somewhat weak (empirical testing), but sofar no new 3525 // regressions have been discovered. larsivi 20090827 3526 TNFATransition!(char_t) highestPriTrans; 3527 if (!(sorted_elms[$-1] && sorted_elms[$-1].nfa_state && 3528 sorted_elms[$-1].nfa_state)) 3529 throw new Exception ("Something is NULL that is expected to 3530 be non-null", __FILE__, __LINE__); 3531 3532 foreach ( trans; sorted_elms[$-1].nfa_state.transitions ) { 3533 if (trans.canFinish()) { 3534 r.dfa_state.reluctant = true; 3535 break; 3536 } 3537 } 3538 3539 bool[uint] finished_tags; 3540 { 3541 foreach ( t, i; se.tags ) 3542 if ( i > 0 && !(t in finished_tags) ) { 3543 finished_tags[t] = true; 3544 Command cmd; 3545 cmd.dst = registerFromTagIndex(t, 0); 3546 cmd.src = registerFromTagIndex(t, i); 3547 r.dfa_state.finishers ~= cmd; 3548 } 3549 } 3550 } 3551 } 3552 } 3553 3554 /* ******************************************************************************************** 3555 Assumes that the command-lists are sorted and transitions are optimized 3556 **********************************************************************************************/ 3557 void minimizeDFA() 3558 { 3559 class DiffTable 3560 { 3561 this(size_t num) { 3562 diff_ = new bool[num*(num+1)/2]; 3563 } 3564 3565 ~this() { diff_.destroy; } 3566 3567 bool opCall(size_t a, size_t b) 3568 { 3569 if ( a < b ) 3570 return diff_[b*(b+1)/2+a]; 3571 return diff_[a*(a+1)/2+b]; 3572 } 3573 3574 void set(size_t a, size_t b) 3575 { 3576 if ( a < b ) 3577 diff_[b*(b+1)/2+a] = true; 3578 else 3579 diff_[a*(a+1)/2+b] = true; 3580 } 3581 3582 bool[] diff_; 3583 } 3584 3585 debug(tdfa) Stdout.formatln("Minimizing TDFA"); 3586 3587 scope diff = new DiffTable(states.length); 3588 bool new_diff = true; 3589 3590 while ( new_diff ) 3591 { 3592 new_diff = false; 3593 foreach ( i, a; states[0 .. $-1] ) 3594 { 3595 Linner: foreach ( j, b; states[i+1 .. $] ) 3596 { 3597 if ( diff(i, j+i+1) ) 3598 continue; 3599 3600 // assume optimized transitions 3601 if ( a.accept != b.accept || a.transitions.length != b.transitions.length ) { 3602 diff.set(i, j+i+1); 3603 new_diff = true; 3604 continue; 3605 } 3606 3607 if ( a.accept ) // b accepts too 3608 { 3609 // assume sorted finishers 3610 if ( a.finishers.length != b.finishers.length ) { 3611 diff.set(i, j+i+1); 3612 new_diff = true; 3613 continue; 3614 } 3615 foreach ( k, cmd; a.finishers ) 3616 { 3617 if ( cmd != b.finishers[k] ) { 3618 diff.set(i, j+i+1); 3619 new_diff = true; 3620 continue Linner; 3621 } 3622 } 3623 } 3624 3625 Ltrans: foreach ( ta; a.transitions ) 3626 { 3627 foreach ( tb; b.transitions ) 3628 { 3629 if ( ta.predicate.intersects(tb.predicate) ) 3630 { 3631 if ( diff(ta.target.index, tb.target.index) ) { 3632 diff.set(i, j+i+1); 3633 new_diff = true; 3634 continue Linner; 3635 } 3636 // assume sorted commands 3637 if ( ta.commands.length != tb.commands.length ) { 3638 diff.set(i, j+i+1); 3639 new_diff = true; 3640 continue Linner; 3641 } 3642 foreach ( k, cmd; ta.commands ) 3643 { 3644 if ( cmd != tb.commands[k] ) { 3645 diff.set(i, j+i+1); 3646 new_diff = true; 3647 continue Linner; 3648 } 3649 } 3650 continue Ltrans; 3651 } 3652 } 3653 3654 diff.set(i, j+i+1); 3655 new_diff = true; 3656 continue Linner; 3657 } 3658 3659 } 3660 } 3661 } 3662 3663 foreach ( i, a; states[0 .. $-1] ) 3664 { 3665 foreach ( j, b; states[i+1 .. $] ) 3666 { 3667 if ( !diff(i, j+i+1) ) 3668 { 3669 debug(tdfa) Stdout.formatln("State {} == {}", i, j+i+1); 3670 // remap b to a 3671 foreach ( k, c; states ) 3672 { 3673 foreach ( t; c.transitions ) 3674 { 3675 if ( t.target.index == j+i+1 ) 3676 t.target = a; 3677 } 3678 } 3679 } 3680 } 3681 } 3682 3683 } 3684 } 3685 import tango.text.Util; 3686 3687 /************************************************************************************************** 3688 Regular expression compiler and interpreter. 3689 **************************************************************************************************/ 3690 class RegExpT(char_t) 3691 { 3692 alias TDFA!(dchar) tdfa_t; 3693 alias TNFA!(dchar) tnfa_t; 3694 alias CharClass!(dchar) charclass_t; 3695 alias Predicate!(dchar) predicate_t; 3696 3697 /********************************************************************************************** 3698 Construct a RegExpT object. 3699 Params: 3700 pattern = Regular expression. 3701 Throws: RegExpException if there are any compilation errors. 3702 Example: 3703 Declare two variables and assign to them a Regex object: 3704 --- 3705 auto r = new Regex("pattern"); 3706 auto s = new Regex(r"p[1-5]\s*"); 3707 --- 3708 **********************************************************************************************/ 3709 this(const(char_t)[] pattern, const(char_t)[] attributes=null) 3710 { 3711 this(pattern, false, true); 3712 } 3713 3714 /** ditto */ 3715 this(const(char_t)[] pattern, bool swapMBS, bool unanchored, bool printNFA=false) 3716 { 3717 pattern_ = pattern; 3718 3719 debug(TangoRegex) {} 3720 else { 3721 static if ( is(char_t == dchar) ) { 3722 scope tnfa_t tnfa_ = new tnfa_t(pattern_); 3723 } 3724 else { 3725 scope tnfa_t tnfa_ = new tnfa_t(tango.text.convert.Utf.toString32(pattern_)); 3726 } 3727 } 3728 3729 tnfa_.swapMatchingBracketSyntax = swapMBS; 3730 tnfa_.parse(unanchored); 3731 if ( printNFA ) { 3732 debug(TangoRegex) Stdout.formatln("\nTNFA:"); 3733 debug(TangoRegex) tnfa_.print; 3734 } 3735 tdfa_ = new tdfa_t(tnfa_); 3736 registers_.length = tdfa_.num_regs(); 3737 } 3738 3739 /********************************************************************************************** 3740 Generate instance of Regex. 3741 Params: 3742 pattern = Regular expression. 3743 Throws: RegExpException if there are any compilation errors. 3744 Example: 3745 Declare two variables and assign to them a Regex object: 3746 --- 3747 auto r = Regex("pattern"); 3748 auto s = Regex(r"p[1-5]\s*"); 3749 --- 3750 **********************************************************************************************/ 3751 static RegExpT!(char_t) opCall(const(char_t)[] pattern, const(char_t)[] attributes = null) 3752 { 3753 return new RegExpT!(char_t)(pattern, attributes); 3754 } 3755 3756 /********************************************************************************************** 3757 Set up for start of foreach loop. 3758 Returns: Instance of RegExpT set up to search input. 3759 Example: 3760 --- 3761 import tango.io.Stdout; 3762 import tango.text.Regex; 3763 3764 void main() 3765 { 3766 foreach(m; Regex("ab").search("qwerabcabcababqwer")) 3767 Stdout.formatln("{}[{}]{}", m.pre, m.match(0), m.post); 3768 } 3769 // Prints: 3770 // qwer[ab]cabcababqwer 3771 // qwerabc[ab]cababqwer 3772 // qwerabcabc[ab]abqwer 3773 // qwerabcabcab[ab]qwer 3774 --- 3775 **********************************************************************************************/ 3776 public RegExpT!(char_t) search(const(char_t)[] input) 3777 { 3778 input_ = input; 3779 next_start_ = 0; 3780 last_start_ = 0; 3781 return this; 3782 } 3783 3784 /** ditto */ 3785 public int opApply(int delegate(RegExpT!(char_t)) dg) 3786 { 3787 int result; 3788 while ( !result && test() ) 3789 result = dg(this); 3790 return result; 3791 } 3792 3793 /********************************************************************************************** 3794 Search input for match. 3795 Returns: false for no match, true for match 3796 **********************************************************************************************/ 3797 bool test(const(char_t)[] input) 3798 { 3799 this.input_ = input; 3800 next_start_ = 0; 3801 last_start_ = 0; 3802 return test(); 3803 } 3804 3805 /********************************************************************************************** 3806 Pick up where last test(input) or test() left off, and search again. 3807 Returns: false for no match, true for match 3808 **********************************************************************************************/ 3809 bool test() 3810 { 3811 // initialize registers 3812 assert(registers_.length == tdfa_.num_regs()); 3813 registers_[0..$] = -1; 3814 foreach ( cmd; tdfa_.initializer ) { 3815 assert(cmd.src == tdfa_.CURRENT_POSITION_REGISTER); 3816 registers_[cmd.dst] = 0; 3817 } 3818 3819 tdfa_t.Transition* tp, tp_end; 3820 3821 // DFA execution 3822 auto inp = input_[next_start_ .. $]; 3823 auto s = tdfa_.start; 3824 3825 debug(TangoRegex) Stdout.formatln("{}{}: {}", s.accept?"*":" ", s.index, inp); 3826 LmainLoop: for ( size_t p, next_p; p < inp.length; ) 3827 { 3828 Lread_char: 3829 dchar c = cast(dchar)inp[p]; 3830 if ( c & 0x80 ) 3831 c = decode(inp, next_p); 3832 else 3833 next_p = p+1; 3834 3835 Lprocess_char: 3836 debug(TangoRegex) Stdout.formatln("{} (0x{:x})", c, cast(int)c); 3837 3838 tdfa_t.Transition t = void; 3839 switch ( s.mode ) 3840 { 3841 case s.Mode.LOOKUP: 3842 if ( c < s.LOOKUP_LENGTH ) 3843 { 3844 debug(TangoRegex) Stdout.formatln("lookup"); 3845 auto i = s.lookup[c]; 3846 if ( i == s.INVALID_STATE ) 3847 break LmainLoop; 3848 t = s.transitions[ i ]; 3849 if ( t.predicate.type != t.predicate.Type.consume ) 3850 goto Lno_consume; 3851 goto Lconsume; 3852 } 3853 break LmainLoop; 3854 3855 case s.Mode.MIXED: 3856 if ( c < s.LOOKUP_LENGTH ) 3857 { 3858 debug(TangoRegex) Stdout.formatln("mixed"); 3859 auto i = s.lookup[c]; 3860 if ( i == s.INVALID_STATE ) 3861 break; 3862 t = s.transitions[ i ]; 3863 if ( t.predicate.type != t.predicate.Type.consume ) 3864 goto Lno_consume; 3865 goto Lconsume; 3866 } 3867 break; 3868 3869 case s.Mode.GENERIC: 3870 default: 3871 break; 3872 } 3873 3874 Ltrans_loop: for ( tp = &s.generic_transitions[0], tp_end = tp+s.generic_transitions.length; 3875 tp < tp_end; ++tp ) 3876 { 3877 t = *tp; 3878 switch ( t.predicate.mode ) 3879 { 3880 // single char 3881 case predicate_t.MatchMode.single_char: 3882 debug(TangoRegex) Stdout.formatln("single char 0x{:x} == 0x{:x}", cast(int)c, cast(int)t.predicate.data_chr); 3883 if ( c != t.predicate.data_chr ) 3884 continue Ltrans_loop; 3885 goto Lconsume; 3886 case predicate_t.MatchMode.single_char_l: 3887 debug(TangoRegex) Stdout.formatln("single char 0x{:x} == 0x{:x}", cast(int)c, cast(int)t.predicate.data_chr); 3888 if ( c != t.predicate.data_chr ) 3889 continue Ltrans_loop; 3890 goto Lno_consume; 3891 3892 // bitmap 3893 case predicate_t.MatchMode.bitmap: 3894 debug(TangoRegex) Stdout.formatln("bitmap {}\n{}", c, t.predicate.toString); 3895 if ( c <= predicate_t.MAX_BITMAP_LENGTH && ( t.predicate.data_bmp[c/8] & (1 << (c&7)) ) ) 3896 goto Lconsume; 3897 continue Ltrans_loop; 3898 case predicate_t.MatchMode.bitmap_l: 3899 debug(TangoRegex) Stdout.formatln("bitmap {}\n{}", c, t.predicate.toString); 3900 if ( c <= predicate_t.MAX_BITMAP_LENGTH && ( t.predicate.data_bmp[c/8] & (1 << (c&7)) ) ) 3901 goto Lno_consume; 3902 continue Ltrans_loop; 3903 3904 // string search 3905 case predicate_t.MatchMode.string_search: 3906 debug(TangoRegex) Stdout.formatln("string search {} in {}", c, t.predicate.data_str); 3907 if ( indexOf(t.predicate.data_str.ptr, c, t.predicate.data_str.length) >= t.predicate.data_str.length ) 3908 continue Ltrans_loop; 3909 goto Lconsume; 3910 case predicate_t.MatchMode.string_search_l: 3911 debug(TangoRegex) Stdout.formatln("string search {} in {}", c, t.predicate.data_str); 3912 if ( indexOf(t.predicate.data_str.ptr, c, t.predicate.data_str.length) >= t.predicate.data_str.length ) 3913 continue Ltrans_loop; 3914 goto Lno_consume; 3915 3916 // generic 3917 case predicate_t.MatchMode.generic: 3918 debug(TangoRegex) Stdout.formatln("generic {}\n{}", c, t.predicate.toString); 3919 for ( auto cmp = t.predicate.data_str.ptr, 3920 cmpend = cmp + t.predicate.data_str.length; 3921 cmp < cmpend; ++cmp ) 3922 { 3923 if ( c < *cmp ) { 3924 ++cmp; 3925 continue; 3926 } 3927 ++cmp; 3928 if ( c <= *cmp ) 3929 goto Lconsume; 3930 } 3931 continue Ltrans_loop; 3932 case predicate_t.MatchMode.generic_l: 3933 debug(TangoRegex) Stdout.formatln("generic {}\n{}", c, t.predicate.toString); 3934 for ( auto cmp = t.predicate.data_str.ptr, 3935 cmpend = cmp + t.predicate.data_str.length; 3936 cmp < cmpend; ++cmp ) 3937 { 3938 if ( c < *cmp ) { 3939 ++cmp; 3940 continue; 3941 } 3942 ++cmp; 3943 if ( c <= *cmp ) 3944 goto Lno_consume; 3945 } 3946 continue Ltrans_loop; 3947 3948 default: 3949 assert(0); 3950 } 3951 3952 Lconsume: 3953 p = next_p; 3954 Lno_consume: 3955 3956 s = t.target; 3957 debug(TangoRegex) Stdout.formatln("{}{}: {}", s.accept?"*":" ", s.index, inp[p..$]); 3958 debug(TangoRegex) Stdout.formatln("{} commands", t.commands.length); 3959 3960 foreach ( cmd; t.commands ) 3961 { 3962 if ( cmd.src == tdfa_.CURRENT_POSITION_REGISTER ) 3963 registers_[cmd.dst] = cast(int)p; 3964 else 3965 registers_[cmd.dst] = registers_[cmd.src]; 3966 } 3967 3968 if (s.accept && s.reluctant) 3969 // Don't continue matching, the current find should be correct 3970 goto Laccept; 3971 3972 // if all input was consumed and we do not already accept, try to 3973 // add an explicit string/line end 3974 if ( p >= inp.length ) 3975 { 3976 if ( s.accept || c == 0 ) 3977 break; 3978 c = 0; 3979 goto Lprocess_char; 3980 } 3981 goto Lread_char; 3982 } 3983 // no applicable transition 3984 break; 3985 } 3986 3987 if ( s.accept ) 3988 { 3989 Laccept: 3990 foreach ( cmd; s.finishers ) { 3991 assert(cmd.src != tdfa_.CURRENT_POSITION_REGISTER); 3992 registers_[cmd.dst] = registers_[cmd.src]; 3993 } 3994 if ( registers_.length > 1 && registers_[1] >= 0 ) { 3995 last_start_ = next_start_; 3996 next_start_ += registers_[1]; 3997 } 3998 return true; 3999 } 4000 4001 return false; 4002 } 4003 4004 /********************************************************************************************** 4005 Return submatch with the given index. 4006 Params: 4007 index = 0 returns whole match, index > 0 returns submatch of bracket #index 4008 Returns: 4009 Slice of input for the requested submatch, or null if no such submatch exists. 4010 **********************************************************************************************/ 4011 const(char_t)[] match(uint index) 4012 { 4013 if ( index > tdfa_.num_tags ) 4014 return null; 4015 int start = cast(int)last_start_+registers_[index*2], 4016 end = cast(int)last_start_+registers_[index*2+1]; 4017 if ( start >= 0 && start < end && end <= input_.length ) 4018 return input_[start .. end]; 4019 return null; 4020 } 4021 4022 /** ditto */ 4023 const(char_t)[] opIndex(uint index) 4024 { 4025 return match(index); 4026 } 4027 4028 /********************************************************************************************** 4029 Return the slice of the input that precedes the matched substring. 4030 If no match was found, null is returned. 4031 **********************************************************************************************/ 4032 const(char_t)[] pre() 4033 { 4034 auto start = registers_[0]; 4035 if ( start < 0 ) 4036 return null; 4037 return input_[0 .. last_start_+start]; 4038 } 4039 4040 /********************************************************************************************** 4041 Return the slice of the input that follows the matched substring. 4042 If no match was found, the whole slice of the input that was processed in the last test. 4043 **********************************************************************************************/ 4044 const(char_t)[] post() 4045 { 4046 if ( registers_[1] >= 0 ) 4047 return input_[next_start_ .. $]; 4048 return input_[last_start_ .. $]; 4049 } 4050 4051 /********************************************************************************************** 4052 Splits the input at the matches of this regular expression into an array of slices. 4053 Example: 4054 --- 4055 import tango.io.Stdout; 4056 import tango.text.Regex; 4057 4058 void main() 4059 { 4060 auto strs = Regex("ab").split("abcabcababqwer"); 4061 foreach( s; strs ) 4062 Stdout.formatln("{}", s); 4063 } 4064 // Prints: 4065 // c 4066 // c 4067 // qwer 4068 --- 4069 **********************************************************************************************/ 4070 inout(char_t)[][] split(inout(char_t)[] input) 4071 { 4072 auto res = new inout(char_t)[][PREALLOC]; 4073 uint index; 4074 inout(char_t)[] tmp = input; 4075 4076 foreach ( r; search(input) ) 4077 { 4078 tmp = cast(inout(char_t)[])pre(); 4079 res[index++] = tmp[last_start_ .. $]; 4080 if ( index >= res.length ) 4081 res.length = res.length*2; 4082 tmp = cast(inout(char_t)[])post(); 4083 } 4084 4085 res[index++] = tmp; 4086 res.length = index; 4087 return res; 4088 } 4089 4090 /********************************************************************************************** 4091 Returns a copy of the input with all matches replaced by replacement. 4092 **********************************************************************************************/ 4093 char_t[] replaceAll(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null) 4094 { 4095 const(char_t)[] tmp = input; 4096 if ( output_buffer.length <= 0 ) 4097 output_buffer = new char_t[input.length+replacement.length]; 4098 output_buffer.length = 0; 4099 4100 foreach ( r; search(input) ) 4101 { 4102 tmp = pre(); 4103 if ( tmp.length > last_start_ ) 4104 output_buffer ~= tmp[last_start_ .. $]; 4105 output_buffer ~= replacement; 4106 tmp = post(); 4107 } 4108 output_buffer ~= tmp; 4109 return output_buffer; 4110 } 4111 4112 /********************************************************************************************** 4113 Returns a copy of the input with the last match replaced by replacement. 4114 **********************************************************************************************/ 4115 char_t[] replaceLast(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null) 4116 { 4117 const(char_t)[] tmp_pre, tmp_post; 4118 if ( output_buffer.length <= 0 ) 4119 output_buffer = new char_t[input.length+replacement.length]; 4120 output_buffer.length = 0; 4121 4122 foreach ( r; search(input) ) { 4123 tmp_pre = pre(); 4124 tmp_post = post(); 4125 } 4126 4127 if ( tmp_pre !is null || tmp_post !is null ) { 4128 output_buffer ~= tmp_pre; 4129 output_buffer ~= replacement; 4130 output_buffer ~= tmp_post; 4131 } 4132 else 4133 output_buffer ~= input; 4134 4135 return output_buffer; 4136 } 4137 4138 /********************************************************************************************** 4139 Returns a copy of the input with the first match replaced by replacement. 4140 **********************************************************************************************/ 4141 char_t[] replaceFirst(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null) 4142 { 4143 const(char_t)[] tmp = input; 4144 if ( output_buffer.length <= 0 ) 4145 output_buffer = new char_t[input.length+replacement.length]; 4146 output_buffer.length = 0; 4147 4148 if ( test(input) ) 4149 { 4150 tmp = pre(); 4151 if ( tmp.length > last_start_ ) 4152 output_buffer ~= tmp[last_start_ .. $]; 4153 output_buffer ~= replacement; 4154 tmp = post(); 4155 } 4156 output_buffer ~= tmp; 4157 return output_buffer; 4158 } 4159 4160 /********************************************************************************************** 4161 Calls dg for each match and replaces it with dg's return value. 4162 **********************************************************************************************/ 4163 char_t[] replaceAll(const(char_t)[] input, char_t[] delegate(RegExpT!(char_t)) dg, char_t[] output_buffer=null) 4164 { 4165 const(char_t)[] tmp = input; 4166 if ( output_buffer.length <= 0 ) 4167 output_buffer = new char_t[input.length]; 4168 output_buffer.length = 0; 4169 4170 foreach ( r; search(input) ) 4171 { 4172 tmp = pre(); 4173 if ( tmp.length > last_start_ ) 4174 output_buffer ~= tmp[last_start_ .. $]; 4175 output_buffer ~= dg(this); 4176 tmp = post(); 4177 } 4178 output_buffer ~= tmp; 4179 return output_buffer; 4180 } 4181 4182 /********************************************************************************************** 4183 Compiles the regular expression to D code. 4184 4185 NOTE : Remember to import this module (tango.text.Regex) in the module where you put the 4186 generated D code. 4187 4188 **********************************************************************************************/ 4189 // TODO: input-end special case 4190 const(char)[] compileToD(const(char)[] func_name = "match", bool lexer=false) 4191 { 4192 const(char)[] code; 4193 const(char)[] str_type; 4194 static if ( is(char_t == char) ) 4195 str_type = "const(char)[]"; 4196 static if ( is(char_t == wchar) ) 4197 str_type = "const(wchar)[]"; 4198 static if ( is(char_t == dchar) ) 4199 str_type = "const(dchar)[]"; 4200 4201 if ( lexer ) 4202 code = Format.convert("// {}\nbool {}({} input, out uint token, out {} match", pattern_, func_name, str_type, str_type); 4203 else { 4204 code = Format.convert("// {}\nbool match({} input", pattern_, str_type); 4205 code ~= Format.convert(", ref {}[] groups", str_type); 4206 } 4207 code ~= Format.convert(")\n{{\n uint s = {};", tdfa_.start.index); 4208 4209 uint num_vars = tdfa_.num_regs(); 4210 if ( num_vars > 0 ) 4211 { 4212 if ( lexer ) 4213 code ~= "\n static int "; 4214 else 4215 code ~= "\n int "; 4216 bool first = true; 4217 for ( int i = 0, used = 0; i < num_vars; ++i ) 4218 { 4219 bool hasInit = false; 4220 foreach ( cmd; tdfa_.initializer ) 4221 { 4222 if ( cmd.dst == i ) { 4223 hasInit = true; 4224 break; 4225 } 4226 } 4227 4228 if ( first ) 4229 first = false; 4230 else 4231 code ~= ", "; 4232 if ( used > 0 && used % 10 == 0 ) 4233 code ~= "\n "; 4234 ++used; 4235 code ~= Format.convert("r{}", i); 4236 4237 if ( hasInit ) 4238 code ~= "=0"; 4239 else 4240 code ~= "=-1"; 4241 } 4242 code ~= ";"; 4243 } 4244 4245 code ~= "\n\n for ( size_t p, next_p; p < input.length; )\n {"; 4246 code ~= "\n dchar c = cast(dchar)input[p];\n if ( c & 0x80 )\n decode(input, next_p);"; 4247 code ~= "\n else\n next_p = p+1;\n switch ( s )\n {"; 4248 4249 size_t[] finish_states; 4250 foreach ( s; tdfa_.states ) 4251 { 4252 code ~= Format.convert("\n case {}:", s.index); 4253 4254 if ( s.accept ) 4255 { 4256 finish_states ~= s.index; 4257 4258 tdfa_t.State target; 4259 foreach ( t; s.transitions ) 4260 { 4261 if ( target is null ) 4262 target = t.target; 4263 else if ( target !is t.target ) 4264 { 4265 target = null; 4266 break; 4267 } 4268 } 4269 if ( target !is null && target is s ) 4270 s.transitions = null; 4271 } 4272 4273 bool first_if=true; 4274 charclass_t cc, ccTest; 4275 4276 tango.core.Array.sort(s.transitions); 4277 foreach ( t; s.transitions ) 4278 { 4279 ccTest.add(t.predicate.getInput()); 4280 ccTest.optimize(); 4281 if ( t.predicate.getInput() < ccTest ) 4282 cc = t.predicate.getInput(); 4283 else 4284 cc = ccTest; 4285 4286 if ( first_if ) { 4287 code ~= "\n if ( "; 4288 first_if = false; 4289 } 4290 else 4291 code ~= "\n else if ( "; 4292 bool first_cond=true; 4293 foreach ( cr; cc.parts ) 4294 { 4295 if ( first_cond ) 4296 first_cond = false; 4297 else 4298 code ~= " || "; 4299 if ( cr.l() == cr.r() ) 4300 code ~= Format.convert("c == 0x{:x}", cast(int)cr.l()); 4301 else 4302 code ~= Format.convert("c >= 0x{:x} && c <= 0x{:x}", cast(int)cr.l(), cast(int)cr.r()); 4303 } 4304 code ~= Format.convert(" ) {{\n s = {};", t.target.index); 4305 4306 if ( t.predicate.type == typeof(t.predicate.type).consume ) 4307 code ~= "\n p = next_p;"; 4308 foreach ( cmd; t.commands ) 4309 code ~= compileCommand(cmd, " "); 4310 /* 4311 // if inp ends here and we do not already accept, try to add an explicit string/line end 4312 if ( p >= inp.length && !s.accept && c != 0 ) { 4313 c = 0; 4314 goto Lprocess_char; 4315 } 4316 */ 4317 code ~= "\n }"; 4318 } 4319 4320 if ( !first_if ) 4321 code ~= Format.convert( 4322 "\n else\n {};\n break;", 4323 s.accept?Format.convert("goto finish{}", s.index):"return false" 4324 ); 4325 else 4326 code ~= Format.convert("\n {};", s.accept?Format.convert("goto finish{}", s.index):"return false"); 4327 } 4328 4329 // create finisher groups 4330 size_t[][size_t] finisherGroup; 4331 foreach ( fs; finish_states ) 4332 { 4333 // check if finisher group with same commands exists 4334 bool haveFinisher = false; 4335 foreach ( fg; finisherGroup.keys ) 4336 { 4337 bool equalCommands = false; 4338 if ( tdfa_.states[fs].finishers.length == tdfa_.states[fg].finishers.length ) 4339 { 4340 equalCommands = true; 4341 foreach ( i, cmd; tdfa_.states[fs].finishers ) 4342 { 4343 if ( cmd != tdfa_.states[fg].finishers[i] ) { 4344 equalCommands = false; 4345 break; 4346 } 4347 } 4348 } 4349 if ( equalCommands ) { 4350 // use existing group for this state 4351 finisherGroup[fg] ~= fs; 4352 haveFinisher = true; 4353 break; 4354 } 4355 } 4356 // create new group 4357 if ( !haveFinisher ) 4358 finisherGroup[fs] ~= fs; 4359 } 4360 4361 4362 code ~= "\n default:\n assert(0);\n }\n }\n\n switch ( s )\n {"; 4363 foreach ( group, states; finisherGroup ) 4364 { 4365 foreach ( s; states ) 4366 code ~= Format.convert("\n case {}: finish{}:", s, s); 4367 4368 foreach ( cmd; tdfa_.states[group].finishers ) 4369 { 4370 if ( lexer ) 4371 { 4372 if ( tdfa_.states[group].finishers.length > 1 ) 4373 throw new RegExpException("Lexer error: more than one finisher in flm lexer!"); 4374 if ( cmd.dst % 2 == 0 || cmd.dst >= tdfa_.num_tags ) 4375 throw new RegExpException(Format.convert("Lexer error: unexpected dst register {} in flm lexer!", cmd.dst).idup); 4376 code ~= Format.convert("\n match = input[0 .. r{}];\n token = {};", cmd.src, cmd.dst/2); 4377 } 4378 else 4379 code ~= compileCommand(cmd, " "); 4380 } 4381 4382 code ~= "\n break;"; 4383 } 4384 code ~= "\n default:\n return false;\n }\n"; 4385 4386 if ( !lexer ) 4387 { 4388 code ~= Format.convert("\n groups.length = {};", tdfa_.num_tags/2); 4389 for ( int i = 0; i < tdfa_.num_tags/2; ++i ) 4390 code ~= Format.convert("\n if ( r{} > -1 && r{} > -1 )\n groups[{}] = input[r{} .. r{}];", 2*i, 2*i+1, i, 2*i, 2*i+1); 4391 } 4392 4393 code ~= "\n return true;\n}"; 4394 return code; 4395 } 4396 4397 /********************************************************************************************* 4398 Get the pattern with which this regex was constructed. 4399 **********************************************************************************************/ 4400 public const(char_t)[] pattern() 4401 { 4402 return pattern_; 4403 } 4404 4405 /********************************************************************************************* 4406 Get the tag count of this regex, representing the number of sub-matches. 4407 4408 This value is the max valid value for match/opIndex. 4409 **********************************************************************************************/ 4410 uint tagCount() 4411 { 4412 return tdfa_.num_tags; 4413 } 4414 4415 int[] registers_; 4416 size_t next_start_, 4417 last_start_; 4418 4419 debug(TangoRegex) tnfa_t tnfa_; 4420 tdfa_t tdfa_; 4421 private: 4422 enum int PREALLOC = 16; 4423 const(char_t)[] input_, 4424 pattern_; 4425 4426 const(char)[] compileCommand(tdfa_t.Command cmd, const(char)[] indent) 4427 { 4428 const(char)[] code, 4429 dst; 4430 code ~= Format.convert("\n{}r{} = ", indent, cmd.dst); 4431 if ( cmd.src == tdfa_.CURRENT_POSITION_REGISTER ) 4432 code ~= "p;"; 4433 else 4434 code ~= Format.convert("r{};", cmd.src); 4435 return code; 4436 } 4437 } 4438 4439 alias RegExpT!(char) Regex; 4440 4441 debug(utf) import tango.stdc.stdio; 4442 // the following block is stolen from phobos. 4443 // the copyright notice applies for this block only. 4444 /* 4445 * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com 4446 * Written by Walter Bright 4447 * 4448 * This software is provided 'as-is', without any express or implied 4449 * warranty. In no event will the authors be held liable for any damages 4450 * arising from the use of this software. 4451 * 4452 * Permission is granted to anyone to use this software for any purpose, 4453 * including commercial applications, and to alter it and redistribute it 4454 * freely, subject to the following restrictions: 4455 * 4456 * o The origin of this software must not be misrepresented; you must not 4457 * claim that you wrote the original software. If you use this software 4458 * in a product, an acknowledgment in the product documentation would be 4459 * appreciated but is not required. 4460 * o Altered source versions must be plainly marked as such, and must not 4461 * be misrepresented as being the original software. 4462 * o This notice may not be removed or altered from any source 4463 * distribution. 4464 */ 4465 4466 class UtfException : Exception 4467 { 4468 size_t idx; /// index in string of where error occurred 4469 4470 this(immutable(char)[] s, size_t i) 4471 { 4472 idx = i; 4473 super(s); 4474 } 4475 } 4476 4477 bool isValidDchar(dchar c) 4478 { 4479 /* Note: FFFE and FFFF are specifically permitted by the 4480 * Unicode standard for application internal use, but are not 4481 * allowed for interchange. 4482 * (thanks to Arcane Jill) 4483 */ 4484 4485 return c < 0xD800 || 4486 (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/); 4487 } 4488 4489 /* ************* 4490 * Decodes and returns character starting at s[idx]. idx is advanced past the 4491 * decoded character. If the character is not well formed, a UtfException is 4492 * thrown and idx remains unchanged. 4493 */ 4494 4495 dchar decode(const(char)[] s, ref size_t idx) 4496 { 4497 size_t len = s.length; 4498 dchar V; 4499 size_t i = idx; 4500 char u = s[i]; 4501 4502 if (u & 0x80) 4503 { uint n; 4504 char u2; 4505 4506 /* The following encodings are valid, except for the 5 and 6 byte 4507 * combinations: 4508 * 0xxxxxxx 4509 * 110xxxxx 10xxxxxx 4510 * 1110xxxx 10xxxxxx 10xxxxxx 4511 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4512 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 4513 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 4514 */ 4515 for (n = 1; ; n++) 4516 { 4517 if (n > 4) 4518 goto Lerr; // only do the first 4 of 6 encodings 4519 if (((u << n) & 0x80) == 0) 4520 { 4521 if (n == 1) 4522 goto Lerr; 4523 break; 4524 } 4525 } 4526 4527 // Pick off (7 - n) significant bits of B from first byte of octet 4528 V = cast(dchar)(u & ((1 << (7 - n)) - 1)); 4529 4530 if (i + (n - 1) >= len) 4531 goto Lerr; // off end of string 4532 4533 /* The following combinations are overlong, and illegal: 4534 * 1100000x (10xxxxxx) 4535 * 11100000 100xxxxx (10xxxxxx) 4536 * 11110000 1000xxxx (10xxxxxx 10xxxxxx) 4537 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) 4538 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) 4539 */ 4540 u2 = s[i + 1]; 4541 if ((u & 0xFE) == 0xC0 || 4542 (u == 0xE0 && (u2 & 0xE0) == 0x80) || 4543 (u == 0xF0 && (u2 & 0xF0) == 0x80) || 4544 (u == 0xF8 && (u2 & 0xF8) == 0x80) || 4545 (u == 0xFC && (u2 & 0xFC) == 0x80)) 4546 goto Lerr; // overlong combination 4547 4548 for (uint j = 1; j != n; j++) 4549 { 4550 u = s[i + j]; 4551 if ((u & 0xC0) != 0x80) 4552 goto Lerr; // trailing bytes are 10xxxxxx 4553 V = (V << 6) | (u & 0x3F); 4554 } 4555 if (!isValidDchar(V)) 4556 goto Lerr; 4557 i += n; 4558 } 4559 else 4560 { 4561 V = cast(dchar) u; 4562 i++; 4563 } 4564 4565 idx = i; 4566 return V; 4567 4568 Lerr: 4569 throw new Exception("invalid UTF-8 sequence"); 4570 } 4571 4572 /* ditto */ 4573 4574 dchar decode(const(wchar)[] s, ref size_t idx) 4575 in 4576 { 4577 assert(idx >= 0 && idx < s.length); 4578 } 4579 out (result) 4580 { 4581 assert(isValidDchar(result)); 4582 } 4583 body 4584 { 4585 const(char)[] msg; 4586 dchar V; 4587 size_t i = idx; 4588 uint u = s[i]; 4589 4590 if (u & ~0x7F) 4591 { if (u >= 0xD800 && u <= 0xDBFF) 4592 { uint u2; 4593 4594 if (i + 1 == s.length) 4595 { msg = "surrogate UTF-16 high value past end of string"; 4596 goto Lerr; 4597 } 4598 u2 = s[i + 1]; 4599 if (u2 < 0xDC00 || u2 > 0xDFFF) 4600 { msg = "surrogate UTF-16 low value out of range"; 4601 goto Lerr; 4602 } 4603 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); 4604 i += 2; 4605 } 4606 else if (u >= 0xDC00 && u <= 0xDFFF) 4607 { msg = "unpaired surrogate UTF-16 value"; 4608 goto Lerr; 4609 } 4610 else if (u == 0xFFFE || u == 0xFFFF) 4611 { msg = "illegal UTF-16 value"; 4612 goto Lerr; 4613 } 4614 else 4615 i++; 4616 } 4617 else 4618 { 4619 i++; 4620 } 4621 4622 idx = i; 4623 return cast(dchar)u; 4624 4625 Lerr: 4626 throw new UtfException(msg.idup, i); 4627 } 4628 4629 /* ditto */ 4630 4631 dchar decode(const(dchar)[] s, ref size_t idx) 4632 in 4633 { 4634 assert(idx >= 0 && idx < s.length); 4635 } 4636 body 4637 { 4638 size_t i = idx; 4639 dchar c = s[i]; 4640 4641 if (!isValidDchar(c)) 4642 goto Lerr; 4643 idx = i + 1; 4644 return c; 4645 4646 Lerr: 4647 throw new UtfException("5invalid UTF-32 value", i); 4648 } 4649 4650 4651 4652 /* =================== Encode ======================= */ 4653 4654 /* ***************************** 4655 * Encodes character c and appends it to array s[]. 4656 */ 4657 4658 void encode(ref const(char)[] s, dchar c) 4659 in 4660 { 4661 assert(isValidDchar(c)); 4662 } 4663 body 4664 { 4665 const(char)[] r = s; 4666 4667 if (c <= 0x7F) 4668 { 4669 r ~= cast(char) c; 4670 } 4671 else 4672 { 4673 char[4] buf; 4674 uint L; 4675 4676 if (c <= 0x7FF) 4677 { 4678 buf[0] = cast(char)(0xC0 | (c >> 6)); 4679 buf[1] = cast(char)(0x80 | (c & 0x3F)); 4680 L = 2; 4681 } 4682 else if (c <= 0xFFFF) 4683 { 4684 buf[0] = cast(char)(0xE0 | (c >> 12)); 4685 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 4686 buf[2] = cast(char)(0x80 | (c & 0x3F)); 4687 L = 3; 4688 } 4689 else if (c <= 0x10FFFF) 4690 { 4691 buf[0] = cast(char)(0xF0 | (c >> 18)); 4692 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 4693 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 4694 buf[3] = cast(char)(0x80 | (c & 0x3F)); 4695 L = 4; 4696 } 4697 else 4698 { 4699 assert(0); 4700 } 4701 r ~= buf[0 .. L]; 4702 } 4703 s = r; 4704 } 4705 4706 /* ditto */ 4707 4708 void encode(ref const(wchar)[] s, dchar c) 4709 in 4710 { 4711 assert(isValidDchar(c)); 4712 } 4713 body 4714 { 4715 const(wchar)[] r = s; 4716 4717 if (c <= 0xFFFF) 4718 { 4719 r ~= cast(wchar) c; 4720 } 4721 else 4722 { 4723 wchar[2] buf; 4724 4725 buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 4726 buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00); 4727 r ~= buf; 4728 } 4729 s = r; 4730 } 4731 4732 /* ditto */ 4733 4734 void encode(ref const(dchar)[] s, dchar c) 4735 in 4736 { 4737 assert(isValidDchar(c)); 4738 } 4739 body 4740 { 4741 s ~= c; 4742 } 4743 4744 debug(UnitTest) 4745 { 4746 unittest 4747 { 4748 // match a fixed string 4749 Regex r; 4750 r = new Regex("abc"); 4751 assert(r.test("abc123")); 4752 assert(r.test("feabc123")); 4753 assert(!r.test("abe123")); 4754 4755 // match a non-ASCII string 4756 r = new Regex("☃"); 4757 assert(r.test("a☃c123")); 4758 4759 // capture 4760 r = new Regex("☃(c)"); 4761 assert(r.test("a☃c123")); 4762 assert(r.match(1) == "c"); 4763 4764 // dot 4765 r = new Regex(".."); 4766 assert(r.test("a☃c123")); 4767 4768 // dot capture 4769 r = new Regex("(..)"); 4770 assert(r.test("a☃c123")); 4771 assert(r.match(1) == "a☃"); 4772 4773 4774 4775 // two captures 4776 r = new Regex("(.)(.)"); 4777 assert(r.test("a☃c123")); 4778 assert(r.match(1) == "a"); 4779 assert(r.match(2) == "☃"); 4780 4781 // multiple letters 4782 r = new Regex(".*(e+).*"); 4783 assert(r.test("aeeeeec123")); 4784 assert(r.match(1) == "eeeee", "Expected: eeeee Got: " ~ r.match(1)); 4785 assert(!r.test("abfua")); 4786 4787 // multiple snowmen 4788 r = new Regex(".*(☃+).*"); 4789 assert(r.test("a☃☃☃☃☃c123")); 4790 assert(r.match(1) == "☃☃☃☃☃", "Expected: ☃☃☃☃☃ Got: " ~ r.match(1)); 4791 assert(!r.test("abeua")); 4792 4793 /** 4794 "*" quantifier bug. In "(☃+)" pattern test ((r.match(0) == "☃☃☃☃☃")&&(r.match(1) == "☃☃☃☃☃")) has passed. 4795 */ 4796 // multiple snowmen 4797 r = new Regex("(☃+)"); 4798 assert(r.test("a☃☃☃☃☃c123")); 4799 assert(r.match(0) == "☃☃☃☃☃"); 4800 assert(r.match(1) == "☃☃☃☃☃"); 4801 assert(r.test("a☃beua")); 4802 } 4803 4804 debug(RegexTestOnly) 4805 { 4806 import tango.io.Stdout; 4807 void main() { Stdout("All tests pass").newline(); } 4808 } 4809 }