1 /******************************************************************************* 2 3 copyright: Copyright (c) 2007-2008 Jascha Wetzel. All rights reserved. 4 5 license: BSD style: $(LICENSE) 6 7 version: Initial release: Jan 2008 8 9 authors: Jascha Wetzel 10 11 This is a regular expression compiler and interpreter based on the Tagged NFA/DFA method. 12 13 The Regex class is not thread safe 14 15 See <a href="http://en.wikipedia.org/wiki/Regular_expression">Wikpedia's article on regular expressions</a> 16 for details on regular expressions in general. 17 18 The used method implies, that the expressions are <i>regular</i>, in the way language theory defines it, 19 as opposed to what "regular expression" means in most implementations 20 (e.g. PCRE or those from the standard libraries of Perl, Java or Python). 21 The advantage of this method is it's performance, it's disadvantage is the inability to realize some features 22 that Perl-like regular expressions have (e.g. back-references). 23 See <a href="http://swtch.com/~rsc/regexp/regexp1.html">"Regular Expression Matching Can Be Simple And Fast"</a> 24 for details on the differences. 25 26 The time for matching a regular expression against an input string of length N is in O(M*N), where M depends on the 27 number of matching brackets and the complexity of the expression. That is, M is constant wrt. the input 28 and therefore matching is a linear-time process. 29 30 The syntax of a regular expressions is as follows. 31 <i>X</i> and <i>Y</i> stand for an arbitrary regular expression. 32 33 <table border=1 cellspacing=0 cellpadding=5> 34 <caption>Operators</caption> 35 $(TR $(TD X|Y) $(TD alternation, i.e. X or Y) ) 36 $(TR $(TD (X)) $(TD matching brackets - creates a sub-match) ) 37 $(TR $(TD (?X)) $(TD non-matching brackets - only groups X, no sub-match is created) ) 38 $(TR $(TD [Z]) $(TD character class specification, Z is a string of characters or character ranges, e.g. [a-zA-Z0-9_.\-]) ) 39 $(TR $(TD [^Z]) $(TD negated character class specification) ) 40 $(TR $(TD <X) $(TD lookbehind, X may be a single character or a character class) ) 41 $(TR $(TD >X) $(TD lookahead, X may be a single character or a character class) ) 42 $(TR $(TD ^) $(TD start of input or start of line) ) 43 $(TR $(TD $) $(TD end of input or end of line) ) 44 $(TR $(TD \b) $(TD start or end of word, equals (?<\s>\S|<\S>\s)) ) 45 $(TR $(TD \B) $(TD opposite of \b, equals (?<\S>\S|<\s>\s)) ) 46 </table> 47 48 <table border=1 cellspacing=0 cellpadding=5> 49 <caption>Quantifiers</caption> 50 $(TR $(TD X?) $(TD zero or one) ) 51 $(TR $(TD X*) $(TD zero or more) ) 52 $(TR $(TD X+) $(TD one or more) ) 53 $(TR $(TD X{n,m}) $(TD at least n, at most m instances of X.<br>If n is missing, it's set to 0.<br>If m is missing, it is set to infinity.) ) 54 $(TR $(TD X??) $(TD non-greedy version of the above operators) ) 55 $(TR $(TD X*?) $(TD see above) ) 56 $(TR $(TD X+?) $(TD see above) ) 57 $(TR $(TD X{n,m}?) $(TD see above) ) 58 </table> 59 60 <table border=1 cellspacing=0 cellpadding=5> 61 <caption>Pre-defined character classes</caption> 62 $(TR $(TD .) $(TD any printable character) ) 63 $(TR $(TD \s) $(TD whitespace) ) 64 $(TR $(TD \S) $(TD non-whitespace) ) 65 $(TR $(TD \w) $(TD alpha-numeric characters or underscore) ) 66 $(TR $(TD \W) $(TD opposite of \w) ) 67 $(TR $(TD \d) $(TD digits) ) 68 $(TR $(TD \D) $(TD non-digit) ) 69 </table> 70 71 Note that "alphanumeric" only applies to Latin-1. 72 *******************************************************************************/ 73 module tango.text.Regex; 74 75 static import tango.core.Array; 76 77 static import tango.text.convert.Utf; 78 79 debug(TangoRegex) import tango.io.Stdout; 80 81 /* ***************************************************************************** 82 A simple pair 83 *******************************************************************************/ 84 private struct Pair(T) 85 { 86 static Pair opCall(T a, T b) 87 { 88 Pair p; 89 p.a = a; 90 p.b = b; 91 return p; 92 } 93 94 union 95 { 96 struct { 97 T first, second; 98 } 99 struct { 100 T a, b; 101 } 102 } 103 } 104 105 /* ***************************************************************************** 106 Double linked list 107 *******************************************************************************/ 108 private class List(T) 109 { 110 class Element 111 { 112 T value; 113 Element prev, 114 next; 115 116 this(T v) 117 { 118 value = v; 119 } 120 } 121 122 size_t len; 123 Element head, 124 tail; 125 126 List opCatAssign(T v) 127 { 128 if ( tail is null ) 129 head = tail = new Element(v); 130 else { 131 tail.next = new Element(v); 132 tail.next.prev = tail; 133 tail = tail.next; 134 } 135 ++len; 136 return this; 137 } 138 139 List insertAfter(T w, T v) 140 { 141 foreach ( e; &this.elements ) 142 { 143 if ( e.value is w ) 144 return insertAfter(e, v); 145 } 146 return null; 147 } 148 149 List insertAfter(Element e, T v) 150 { 151 auto tmp = new Element(v); 152 tmp.prev = e; 153 tmp.next = e.next; 154 e.next.prev = tmp; 155 e.next = tmp; 156 if ( e is tail ) 157 tail = tmp; 158 ++len; 159 return this; 160 } 161 162 List opCatAssign(List l) 163 { 164 if ( l.empty() ) 165 return this; 166 if ( tail is null ) { 167 head = l.head; 168 tail = l.tail; 169 } 170 else { 171 tail.next = l.head; 172 tail.next.prev = tail; 173 tail = l.tail; 174 } 175 len += l.len; 176 return this; 177 } 178 179 List pushFront(T v) 180 { 181 if ( head is null ) 182 head = tail = new Element(v); 183 else 184 { 185 head.prev = new Element(v); 186 head.prev.next = head; 187 head = head.prev; 188 } 189 ++len; 190 return this; 191 } 192 193 List insertBefore(T w, T v) 194 { 195 foreach ( e; &this.elements ) 196 { 197 if ( e.value is w ) 198 return insertBefore(e, v); 199 } 200 return null; 201 } 202 203 List insertBefore(Element e, T v) 204 { 205 auto tmp = new Element(v); 206 tmp.prev = e.prev; 207 tmp.next = e; 208 e.prev.next = tmp; 209 e.prev = tmp; 210 if ( e is head ) 211 head = tmp; 212 ++len; 213 return this; 214 } 215 216 List pushFront(List l) 217 { 218 if ( l.empty() ) 219 return this; 220 if ( head is null ) { 221 head = l.head; 222 tail = l.tail; 223 } 224 else { 225 head.prev = l.tail; 226 head.prev.next = head; 227 head = l.head; 228 } 229 len += l.len; 230 return this; 231 } 232 233 size_t length() 234 { 235 return len; 236 } 237 238 bool empty() 239 { 240 return head is null; 241 } 242 243 void clear() 244 { 245 head = null; 246 tail = null; 247 len = 0; 248 } 249 250 void pop() 251 { 252 remove(tail); 253 } 254 255 void remove(Element e) 256 { 257 if ( e is null ) 258 return; 259 if ( e.prev is null ) 260 head = e.next; 261 else 262 e.prev.next = e.next; 263 if ( e.next is null ) 264 tail = e.prev; 265 else 266 e.next.prev = e.prev; 267 --len; 268 } 269 270 int elements(int delegate(ref Element) dg) 271 { 272 for ( Element e=head; e !is null; e = e.next ) 273 { 274 int ret = dg(e); 275 if ( ret ) 276 return ret; 277 } 278 return 0; 279 } 280 281 int elements_reverse(int delegate(ref Element) dg) 282 { 283 for ( Element e=tail; e !is null; e = e.prev ) 284 { 285 int ret = dg(e); 286 if ( ret ) 287 return ret; 288 } 289 return 0; 290 } 291 292 int opApply(int delegate(ref T) dg) 293 { 294 for ( Element e=head; e !is null; e = e.next ) 295 { 296 int ret = dg(e.value); 297 if ( ret ) 298 return ret; 299 } 300 return 0; 301 } 302 303 int opApplyReverse(int delegate(ref T) dg) 304 { 305 for ( Element e=tail; e !is null; e = e.prev ) 306 { 307 int ret = dg(e.value); 308 if ( ret ) 309 return ret; 310 } 311 return 0; 312 } 313 } 314 315 /* ***************************************************************************** 316 Stack based on dynamic array 317 *******************************************************************************/ 318 private struct Stack(T) 319 { 320 size_t _top; 321 T[] stack; 322 323 void push(T v) 324 { 325 if ( _top >= stack.length ) 326 stack.length = stack.length*2+1; 327 stack[_top] = v; 328 ++_top; 329 } 330 alias push opCatAssign; 331 332 void opCatAssign(T[] vs) 333 { 334 size_t end = _top+vs.length; 335 if ( end > stack.length ) 336 stack.length = end*2; 337 stack[_top..end] = vs[]; 338 _top = end; 339 } 340 341 void pop(size_t num) 342 { 343 assert(_top>=num); 344 _top -= num; 345 } 346 347 T pop() 348 { 349 assert(_top>0); 350 return stack[--_top]; 351 } 352 353 T top() 354 { 355 assert(_top>0); 356 return stack[_top-1]; 357 } 358 359 T* topPtr() 360 { 361 assert(_top>0); 362 return &stack[_top-1]; 363 } 364 365 bool empty() 366 { 367 return _top == 0; 368 } 369 370 void clear() 371 { 372 _top = 0; 373 } 374 375 size_t length() 376 { 377 return _top; 378 } 379 380 T[] array() 381 { 382 return stack[0.._top]; 383 } 384 385 T opIndex(size_t i) 386 { 387 return stack[i]; 388 } 389 390 Stack dup() 391 { 392 Stack s; 393 s._top = _top; 394 s.stack = stack.dup; 395 return s; 396 } 397 } 398 399 /* ************************************************************************************************ 400 Set container based on assoc array 401 **************************************************************************************************/ 402 private struct Set(T) 403 { 404 bool[T] data; 405 406 static Set opCall() 407 { 408 Set s; 409 return s; 410 } 411 412 static Set opCall(T v) 413 { 414 Set s; 415 s ~= v; 416 return s; 417 } 418 419 void opAddAssign(T v) 420 { 421 data[v] = true; 422 } 423 424 void opAddAssign(Set s) 425 { 426 foreach ( v; s.elements ) 427 data[v] = true; 428 } 429 alias opAddAssign opCatAssign; 430 431 size_t length() 432 { 433 return data.length; 434 } 435 436 T[] elements() 437 { 438 return data.keys; 439 } 440 441 bool remove(T v) 442 { 443 if ( (v in data) is null ) 444 return false; 445 data.remove(v); 446 return true; 447 } 448 449 bool contains(T v) 450 { 451 return (v in data) !is null; 452 } 453 454 bool contains(Set s) 455 { 456 Set tmp = s - this; 457 return tmp.empty; 458 } 459 460 bool empty() 461 { 462 return data.length==0; 463 } 464 465 Set opSub(Set s) 466 { 467 Set res = dup; 468 foreach ( v; s.elements ) 469 res.remove(v); 470 return res; 471 } 472 473 Set dup() 474 { 475 Set s; 476 foreach ( v; data.keys ) 477 s.data[v] = true; 478 return s; 479 } 480 } 481 482 /* ************************************************************************************************ 483 484 **************************************************************************************************/ 485 void quickSort(T)(T[] a) 486 { 487 quickSort(a,cast(size_t)0,a.length); 488 } 489 490 void quickSort(T)(T[] a, size_t l, size_t r) 491 { 492 T t; 493 auto i = r-l; 494 if ( i < 3 ) 495 { 496 if ( i < 2 ) 497 return; 498 if ( a[l] < a[l+1] ) 499 return; 500 t = a[l]; 501 a[l] = a[l+1]; 502 a[l+1] = t; 503 return; 504 } 505 506 auto p = a[l]; 507 i = l; 508 auto j = r; 509 510 while ( true ) 511 { 512 ++i; 513 for ( ; i < j && a[i] < p; ++i ) {} 514 --j; 515 for ( ; i < j && a[j] >= p; --j ) {} 516 if ( i >= j ) 517 break; 518 t = a[i]; 519 a[i] = a[j]; 520 a[j] = t; 521 } 522 --i; 523 a[l] = a[i]; 524 a[i] = p; 525 526 quickSort(a, l, i); 527 quickSort(a, i+1, r); 528 } 529 import tango.math.Math; 530 531 /* ************************************************************************************************ 532 A range of characters 533 **************************************************************************************************/ 534 struct CharRange(char_t) 535 { 536 char_t l_, r_; 537 538 static CharRange opCall(char_t c) 539 { 540 CharRange cr; 541 cr.l_ = c; 542 cr.r_ = c; 543 return cr; 544 } 545 546 static CharRange opCall(char_t a, char_t b) 547 { 548 CharRange cr; 549 cr.l_ = min(a,b); 550 cr.r_ = max(a,b); 551 return cr; 552 } 553 554 char_t l() 555 { 556 return l_; 557 } 558 559 char_t r() 560 { 561 return r_; 562 } 563 564 /* ******************************************************************************************** 565 Compares the ranges according to their beginning. 566 **********************************************************************************************/ 567 int opCmp(const ref CharRange cr) const 568 { 569 if ( l_ == cr.l_ ) 570 return 0; 571 if ( l_ < cr.l_ ) 572 return -1; 573 return 1; 574 } 575 576 577 const bool opEquals(ref const CharRange cr) 578 { 579 if ( l_ == cr.l_ && r_ == cr.r_ ) 580 return true; 581 return false; 582 } 583 584 bool contains(char_t c) 585 { 586 return c >= l_ && c <= r_; 587 } 588 589 bool contains(CharRange cr) 590 { 591 return l_ <= cr.l_ && r_ >= cr.r_; 592 } 593 594 bool intersects(CharRange cr) 595 { 596 return r_ >= cr.l_ && l_ <= cr.r_; 597 } 598 599 CharRange intersect(CharRange cr) 600 { 601 assert(intersects(cr)); 602 CharRange ir; 603 ir.l_ = max(l_, cr.l_); 604 ir.r_ = min(r_, cr.r_); 605 if ( ir.l_ > ir.r_ ) 606 ir.l_ = ir.r_ = char_t.min; 607 return ir; 608 } 609 610 CharRange[] subtract(CharRange cr) 611 { 612 CharRange[] sr; 613 if ( cr.contains(this) ) 614 return sr; 615 if ( !intersects(cr) ) 616 sr ~= this; 617 else 618 { 619 CharRange d; 620 if ( contains(cr) ) 621 { 622 d.l_ = l_; 623 d.r_ = cast(char_t)(cr.l_-1); 624 if ( d.l_ <= d.r_ ) 625 sr ~= d; 626 d.l_ = cast(char_t)(cr.r_+1); 627 d.r_ = r_; 628 if ( d.l_ <= d.r_ ) 629 sr ~= d; 630 } 631 else if ( cr.r_ > l_ ) 632 { 633 d.l_ = cast(char_t)(cr.r_+1); 634 d.r_ = r_; 635 if ( d.l_ <= d.r_ ) 636 sr ~= d; 637 } 638 else if ( cr.l_ < r_ ) 639 { 640 d.l_ = l_; 641 d.r_ = cast(char_t)(cr.l_-1); 642 if ( d.l_ <= d.r_ ) 643 sr ~= d; 644 } 645 } 646 return sr; 647 } 648 649 string toString() 650 { 651 string str; 652 653 if ( l_ == r_ ) 654 { 655 if ( l_ > 0x20 && l_ < 0x7f ) 656 str = Format.convert("'{}'", l_).idup; 657 else 658 str = Format.convert("({:x})", cast(int)l_).idup; 659 } 660 else 661 { 662 if ( l_ > 0x20 && l_ < 0x7f ) 663 str = Format.convert("'{}'", l_).idup; 664 else 665 str = Format.convert("({:x})", cast(int)l_).idup; 666 str ~= "-"; 667 if ( r_ > 0x20 && r_ < 0x7f ) 668 str ~= Format.convert("'{}'", r_).idup; 669 else 670 str ~= Format.convert("({:x})", cast(int)r_).idup; 671 } 672 return str; 673 } 674 } 675 676 import tango.core.Compiler; 677 678 static if(DMDFE_Version == 2063) 679 { 680 /* Workabout for http://d.puremagic.com/issues/show_bug.cgi?id=10425 */ 681 void _bug(CharRange!(dchar) a) {} 682 } 683 684 /* ************************************************************************************************ 685 Represents a class of characters as used in regular expressions (e.g. [0-9a-z], etc.) 686 **************************************************************************************************/ 687 struct CharClass(char_t) 688 { 689 alias CharRange!(char_t) range_t; 690 691 //--------------------------------------------------------------------------------------------- 692 // pre-defined character classes 693 static const CharClass!(char_t) 694 line_startend = {parts: [ 695 {l_:0x00, r_:0x00}, 696 {l_:0x0a, r_:0x0a}, 697 {l_:0x13, r_:0x13} 698 ]}, 699 digit = {parts: [ 700 {l_:0x30, r_:0x39} 701 ]}, 702 whitespace = {parts: [ 703 {l_:0x09, r_:0x09}, 704 {l_:0x0a, r_:0x0a}, 705 {l_:0x0b, r_:0x0b}, 706 {l_:0x13, r_:0x13}, 707 {l_:0x14, r_:0x14}, 708 {l_:0x20, r_:0x20} 709 ]}; 710 711 // 8bit classes 712 static if ( is(char_t == char) ) 713 { 714 static const CharClass!(char_t) 715 any_char = {parts: [ 716 {l_:0x01, r_:0xff} 717 ]}, 718 dot_oper = {parts: [ 719 {l_:0x01, r_:0xff} 720 ]}, 721 alphanum_ = {parts: [ 722 {l_:0x30, r_:0x39}, 723 {l_:0x41, r_:0x5a}, 724 {l_:0x5f, r_:0x5f}, 725 {l_:0x61, r_:0x7a} 726 ]}; 727 } 728 // 16bit and 32bit classes 729 static if ( is(char_t == wchar) || is(char_t == dchar) ) 730 { 731 static const CharClass!(char_t) 732 any_char = {parts: [ 733 {l_:0x0001, r_:0xffff} 734 ]}, 735 dot_oper = {parts: [ 736 {l_:0x0001, r_:0xffff} 737 ]}, 738 alphanum_ = {parts: [ 739 {l_:0x30, r_:0x39}, 740 {l_:0x41, r_:0x5a}, 741 {l_:0x5f, r_:0x5f}, 742 {l_:0x61, r_:0x7a} 743 ]}; 744 } 745 746 //--------------------------------------------------------------------------------------------- 747 range_t[] parts; 748 749 invariant() 750 { 751 // foreach ( i, p; parts ) 752 // assert(p.l_<=p.r_, Int.toString(i)~": "~Int.toString(p.l_)~" > "~Int.toString(p.r_)); 753 } 754 755 static CharClass opCall(const(CharClass) cc) 756 { 757 CharClass ncc; 758 ncc.parts = cc.parts.dup; 759 return ncc; 760 } 761 762 int opCmp(const ref CharClass cc) const 763 { 764 if ( parts.length < cc.parts.length ) 765 return -1; 766 if ( parts.length > cc.parts.length ) 767 return 1; 768 foreach ( i, p; cc.parts ) 769 { 770 if ( p.l_ != parts[i].l_ || p.r_ != parts[i].r_ ) 771 return 1; 772 } 773 return 0; 774 } 775 776 bool empty() 777 { 778 return parts.length <= 0; 779 } 780 781 bool matches(char_t c) 782 { 783 foreach ( p; parts ) 784 { 785 if ( p.contains(c) ) 786 return true; 787 } 788 return false; 789 } 790 791 CharClass intersect(CharClass cc) 792 { 793 CharClass ic; 794 foreach ( p; parts ) 795 { 796 foreach ( cp; cc.parts ) 797 { 798 if ( p.intersects(cp) ) 799 ic.parts ~= p.intersect(cp); 800 } 801 } 802 return ic; 803 } 804 805 // requires the class to be optimized 806 bool contains(range_t cr) 807 { 808 foreach ( p; parts ) 809 { 810 if ( p.contains(cr) ) 811 return true; 812 } 813 return false; 814 } 815 816 // requires the class to be optimized 817 bool contains(CharClass cc) 818 { 819 Louter: foreach ( p; cc.parts ) 820 { 821 foreach ( p2; parts ) 822 { 823 if ( p2.contains(p) ) 824 continue Louter; 825 } 826 return false; 827 } 828 return true; 829 } 830 831 void subtract(CharClass cc) 832 { 833 negate(); 834 add(cc); 835 negate(); 836 } 837 838 void add(CharClass cc) 839 { 840 parts ~= cc.parts; 841 } 842 843 void add(range_t cr) 844 { 845 parts ~= cr; 846 } 847 848 void add(char_t c) 849 { 850 parts ~= CharRange!(char_t)(c); 851 } 852 853 /* ******************************************************************************************** 854 Requires the CharClass to be optimized. 855 **********************************************************************************************/ 856 void negate() 857 { 858 optimize(); 859 char_t start = char_t.min; 860 861 // first part touches left boundary of value range 862 if ( parts.length > 0 && parts[0].l_ == start ) 863 { 864 start = parts[0].r_; 865 if ( start < char_t.max ) 866 ++start; 867 868 foreach ( i, ref cr; parts[0 .. $-1] ) 869 { 870 cr.l_ = start; 871 cr.r_ = cast(char_t)(parts[i+1].l_-1); 872 start = parts[i+1].r_; 873 if ( start < char_t.max ) 874 ++start; 875 } 876 if ( start != char_t.max ) { 877 parts[$-1].l_ = start; 878 parts[$-1].r_ = char_t.max; 879 } 880 else 881 parts.length = parts.length-1; 882 return; 883 } 884 885 foreach ( i, ref cr; parts ) 886 { 887 char_t tmp = cast(char_t)(cr.l_-1); 888 cr.l_ = start; 889 start = cr.r_; 890 if ( start < char_t.max ) 891 ++start; 892 cr.r_ = tmp; 893 } 894 895 // last part does not touch right boundary 896 if ( start != char_t.max ) 897 parts ~= range_t(start, char_t.max); 898 } 899 900 void optimize() 901 { 902 if ( empty() ) 903 return; 904 905 tango.core.Array.sort(parts); 906 907 size_t i = 0; 908 foreach ( p; parts[1 .. $] ) 909 { 910 if ( p.l_ > parts[i].r_+1 ) { 911 ++i; 912 parts[i].l_ = p.l_; 913 parts[i].r_ = p.r_; 914 continue; 915 } 916 parts[i].r_ = max(p.r_, parts[i].r_); 917 if ( parts[i].r_ >= char_t.max ) 918 break; 919 } 920 parts.length = i+1; 921 } 922 923 string toString() 924 { 925 string str; 926 str ~= "["; 927 foreach ( p; parts ) 928 str ~= p.toString(); 929 str ~= "]"; 930 return str; 931 } 932 } 933 934 debug(UnitTest) 935 { 936 unittest 937 { 938 static CharClass!(char) cc = { parts: [{l_:0,r_:10},{l_:0,r_:6},{l_:5,r_:12},{l_:12,r_:17},{l_:20,r_:100}] }; 939 assert(cc.toString(), "[(0)-(a)(0)-(6)(5)-(c)(c)-(11)(14)-'d']"); 940 cc.optimize(); 941 assert(cc.toString(), "[(0)-(11)(14)-'d']"); 942 cc.negate(); 943 assert(cc.toString(), " [(12)-(13)'e'-(ff)]"); 944 cc.optimize(); 945 assert(cc.toString(), "[(0)-(11)(14)-'d']"); 946 cc.negate(); 947 assert(cc.toString(), "[(12)-(13)'e'-(ff)]"); 948 949 static CharClass!(char) cc2 = { parts: [] }; 950 assert(cc.toString(), "[]"); 951 cc2.optimize(); 952 assert(cc.toString(), "[]"); 953 cc2.negate(); 954 assert(cc.toString(), "[(0)-(ff)]"); 955 cc2.optimize(); 956 assert(cc.toString(), "[(0)-(ff)]"); 957 cc2.negate(); 958 assert(cc.toString(), "[]"); 959 960 static CharClass!(char) cc3 = { parts: [{l_:0,r_:100},{l_:200,r_:0xff},] }; 961 assert(cc3.toString(), "[(0)-'d'(c8)-(ff)]"); 962 cc3.negate(); 963 assert(cc.toString(), "['e'-(c7)]"); 964 cc3.negate(); 965 assert(cc.toString(), "[(0)-'d'(c8)-(ff)]"); 966 967 static CharClass!(char) cc4 = { parts: [{l_:0,r_:200},{l_:100,r_:0xff},] }; 968 assert(cc.toString(), "[(0)-(c8)'d'-(ff)]"); 969 cc4.optimize(); 970 assert(cc.toString(), "[(9)-(13)(20)-'~'(a0)-(ff)(100)-(17f)(180)-(24f)(20a3)-(20b5)]"); 971 972 static CharClass!(dchar) cc5 = { parts: [{l_:0x9,r_:0x13},{0x20,r_:'~'},{l_:0xa0,r_:0xff},{l_:0x100,r_:0x17f},{l_:0x180,r_:0x24f},{l_:0x20a3,r_:0x20b5}] }; 973 cc5.optimize(); 974 assert(cc.toString(), "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]"); 975 cc5.negate(); 976 assert(cc.toString(), "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]"); 977 cc5.optimize(); 978 assert(cc.toString(), "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]"); 979 cc5.negate(); 980 assert(cc.toString(), "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]"); 981 } 982 } 983 984 /* ************************************************************************************************ 985 986 **************************************************************************************************/ 987 private struct Predicate(char_t) 988 { 989 alias const(char_t)[] string_t; 990 alias CharClass!(char_t) cc_t; 991 alias CharRange!(char_t) cr_t; 992 993 // generic data 994 enum Type { 995 consume, epsilon, lookahead, lookbehind 996 } 997 998 cc_t input; 999 Type type; 1000 1001 // data for compiled predicates 1002 enum uint MAX_BITMAP_LENGTH = 256, 1003 MAX_SEARCH_LENGTH = 256; 1004 enum MatchMode { 1005 generic, generic_l, 1006 single_char, bitmap, string_search, // consume 1007 single_char_l, bitmap_l, string_search_l // lookahead 1008 } 1009 1010 MatchMode mode; 1011 // data_chr had to be pulled out of the union due to 1012 // http://d.puremagic.com/issues/show_bug.cgi?id=2632 --- don't put it back 1013 // in until this is resolved! 1014 // 1015 // Keep in mind that data_str.length can't be modified directly unless the 1016 // new length is strictly greater than the old length. This is essentially 1017 // because ubyte.sizeof can be less than char_t.sizeof. If you set 1018 // data_str.length to anything less than or equal to data_bmp.length, 1019 // data_str will not be reallocated: only the length value will change but 1020 // nothing will realize that not that much space has actually been 1021 // allocated. data_str will be too small and you'll likely get segfaults or 1022 // such. 1023 // 1024 // In short: don't mess with data_str.length. If you have to, remove the 1025 // union entirely. 1026 // 1027 // -- Deewiant 1028 union { 1029 ubyte[] data_bmp; 1030 string_t data_str; 1031 }; 1032 char_t data_chr; 1033 1034 1035 void compile() 1036 { 1037 assert(input.parts.length > 0); 1038 1039 // single char? 1040 if ( input.parts.length == 1 && input.parts[0].l_ == input.parts[0].r_ ) 1041 { 1042 mode = type==Type.consume ? MatchMode.single_char : MatchMode.single_char_l; 1043 data_chr = input.parts[0].l_; 1044 return; 1045 } 1046 // check whether we can use a bitmap 1047 foreach ( p; input.parts ) 1048 { 1049 if ( p.l_ > MAX_BITMAP_LENGTH || p.r_ > MAX_BITMAP_LENGTH ) 1050 goto LnoBitmap; 1051 } 1052 1053 // setup bitmap 1054 data_bmp.length = MAX_BITMAP_LENGTH/8; 1055 foreach ( p; input.parts ) 1056 { 1057 for ( char_t c = p.l_; c <= p.r_; ++c ) 1058 data_bmp[c/8] |= 1 << (c&7); 1059 } 1060 mode = type==Type.consume ? MatchMode.bitmap : MatchMode.bitmap_l; 1061 return; 1062 1063 LnoBitmap: 1064 /* 1065 // check whether the class is small enough to justify a string-search 1066 // TODO: consider inverse class for 8bit chars? 1067 uint class_size; 1068 foreach ( p; input.parts ) 1069 class_size += cast(uint)p.r_+1-p.l_; 1070 if ( class_size > MAX_SEARCH_LENGTH ) 1071 goto Lgeneric; 1072 data_str.length = class_size; 1073 size_t ind; 1074 foreach ( p; input.parts ) 1075 { 1076 for ( char_t c = p.l_; c <= p.r_; ++c ) 1077 data_str[ind++] = c; 1078 } 1079 mode = type==Type.consume ? MatchMode.string_search : MatchMode.string_search_l; 1080 return; 1081 */ 1082 Lgeneric: 1083 data_str = cast(char_t[])input.parts; 1084 mode = type==Type.consume ? MatchMode.generic : MatchMode.generic_l; 1085 } 1086 1087 bool matches(char_t c) 1088 { 1089 if ( type == Type.consume || type == Type.lookahead ) 1090 return input.matches(c); 1091 assert(0); 1092 } 1093 1094 Predicate intersect(Predicate p) 1095 { 1096 Predicate p2; 1097 if ( type != Type.epsilon && p.type != Type.epsilon ) 1098 p2.input = input.intersect(p.input); 1099 return p2; 1100 } 1101 1102 bool intersects(Predicate p) 1103 { 1104 if ( type != p.type ) 1105 return false; 1106 foreach ( cr; input.parts ) 1107 { 1108 foreach ( cr2; p.input.parts ) 1109 { 1110 if ( cr.intersects(cr2) ) 1111 return true; 1112 } 1113 } 1114 return false; 1115 } 1116 1117 bool exceedsMax(uint maxc) 1118 { 1119 foreach ( p; input.parts ) 1120 { 1121 if ( p.l_ > maxc || p.r_ > maxc ) 1122 return true; 1123 } 1124 1125 return false; 1126 } 1127 1128 @property bool empty() 1129 { 1130 return type != Type.epsilon && input.empty(); 1131 } 1132 1133 void subtract(Predicate p) 1134 { 1135 if ( type != Type.epsilon && p.type != Type.epsilon ) 1136 input.subtract(p.input); 1137 } 1138 1139 void negate() 1140 { 1141 assert(type != Type.epsilon); 1142 input.negate(); 1143 } 1144 1145 void optimize() 1146 { 1147 assert(type != Type.epsilon); 1148 input.optimize(); 1149 } 1150 1151 int opCmp(const ref Predicate p) const 1152 { 1153 return input.opCmp(p.input); 1154 } 1155 1156 const bool opEquals(ref const Predicate p) 1157 { 1158 if ( type != p.type ) 1159 return false; 1160 if ( input.opCmp(p.input) != 0 ) 1161 return false; 1162 return true; 1163 } 1164 1165 cc_t getInput() 1166 { 1167 return input; 1168 } 1169 1170 void setInput(const(cc_t) cc) 1171 { 1172 input = cast(cc_t)cc; 1173 } 1174 1175 void appendInput(cr_t cr) 1176 { 1177 input.add(cr); 1178 } 1179 1180 void appendInput(cc_t cc) 1181 { 1182 input.add(cc); 1183 } 1184 1185 void appendInput(Predicate p) 1186 { 1187 input.add(p.input); 1188 } 1189 1190 string toString() 1191 { 1192 string str; 1193 switch ( type ) 1194 { 1195 case Type.consume: str = input.toString(); break; 1196 case Type.epsilon: str = "eps"; break; 1197 case Type.lookahead: str = "la:"~input.toString(); break; 1198 case Type.lookbehind: str = "lb:"~input.toString(); break; 1199 default: 1200 assert(0); 1201 } 1202 return str; 1203 } 1204 } 1205 import Utf = tango.text.convert.Utf; 1206 import tango.text.convert.Format; 1207 1208 /* ************************************************************************************************ 1209 1210 **************************************************************************************************/ 1211 class RegExpException : Exception 1212 { 1213 this(string msg) 1214 { 1215 super("RegExp: "~msg); 1216 } 1217 } 1218 1219 /* ************************************************************************************************ 1220 TNFA state 1221 **************************************************************************************************/ 1222 private class TNFAState(char_t) 1223 { 1224 bool accept = false, 1225 visited = false; 1226 size_t index; 1227 List!(TNFATransition!(char_t)) transitions; 1228 1229 this() 1230 { 1231 transitions = new List!(TNFATransition!(char_t)); 1232 } 1233 } 1234 1235 1236 /* ************************************************************************************************ 1237 Priority classes used to linearize priorities after non-linear transition creation. 1238 **************************************************************************************************/ 1239 private enum PriorityClass { 1240 greedy=0, normal=1, reluctant=2, extraReluctant=3 1241 } 1242 1243 /* ******************************************************************************** 1244 TNFA tagged transition 1245 ***********************************************************************************/ 1246 private class TNFATransition(char_t) 1247 { 1248 TNFAState!(char_t) target; 1249 Predicate!(char_t) predicate; 1250 uint priority, 1251 tag; /// one-based tag number, 0 = untagged 1252 PriorityClass priorityClass; 1253 1254 this(PriorityClass pc) 1255 { 1256 priorityClass = pc; 1257 } 1258 1259 /****************************************************************************** 1260 Move through states only going via epsilon transitions, and only choosing 1261 the one with highest priority. If the highest priority transition from a 1262 state isn't an epsilon transition, false is returned. 1263 If the accepting NFA state can be reached in this manner, true is returned. 1264 1265 NOTE: This method does not look for cycles which should be kept in mind for 1266 later. larsivi 20090827 1267 *******************************************************************************/ 1268 bool canFinish() 1269 { 1270 TNFAState!(char_t) t = target; 1271 while (!t.accept) { 1272 TNFATransition!(char_t) highestPriTrans; 1273 foreach (trans; t.transitions) { 1274 if (!highestPriTrans || highestPriTrans.priority > trans.priority) 1275 highestPriTrans = trans; 1276 } 1277 if (!(highestPriTrans.predicate.type == Predicate!(char_t).Type.epsilon)) 1278 return false; 1279 1280 t = highestPriTrans.target; 1281 } 1282 return true; 1283 } 1284 1285 } 1286 1287 /* ************************************************************************************************ 1288 Fragments of TNFAs as used in the Thompson method 1289 **************************************************************************************************/ 1290 private class TNFAFragment(char_t) 1291 { 1292 alias TNFAState!(char_t) state_t; 1293 alias TNFATransition!(char_t) trans_t; 1294 1295 List!(trans_t) entries, /// transitions to be added to the entry state 1296 exits, /// transitions to be added to the exit state 1297 entry_state, /// transitions to write the entry state to 1298 exit_state; /// transitions to write the exit state to 1299 1300 bool swapMatchingBracketSyntax; 1301 1302 this() 1303 { 1304 entries = new List!(trans_t); 1305 exits = new List!(trans_t); 1306 entry_state = new List!(trans_t); 1307 exit_state = new List!(trans_t); 1308 } 1309 1310 /* ******************************************************************************************** 1311 Write the given state as entry state to this fragment. 1312 **********************************************************************************************/ 1313 void setEntry(state_t state) 1314 { 1315 state.transitions ~= entries; 1316 foreach ( t; entry_state ) 1317 t.target = state; 1318 } 1319 1320 /* ******************************************************************************************** 1321 Write the given state as exit state to this fragment. 1322 **********************************************************************************************/ 1323 void setExit(state_t state) 1324 { 1325 state.transitions ~= exits; 1326 foreach ( t; exit_state ) 1327 t.target = state; 1328 } 1329 } 1330 1331 /* ************************************************************************************************ 1332 Tagged NFA 1333 **************************************************************************************************/ 1334 private final class TNFA(char_t) 1335 { 1336 alias TNFATransition!(char_t) trans_t; 1337 alias TNFAFragment!(char_t) frag_t; 1338 alias TNFAState!(char_t) state_t; 1339 alias Predicate!(char_t) predicate_t; 1340 alias char_t[] string_t; 1341 alias CharRange!(char_t) range_t; 1342 alias CharClass!(char_t) cc_t; 1343 1344 string_t pattern; 1345 state_t[] states; 1346 state_t start; 1347 1348 bool swapMatchingBracketSyntax; /// whether to make (?...) matching and (...) non-matching 1349 1350 /* ******************************************************************************************** 1351 Creates the TNFA from the given regex pattern 1352 **********************************************************************************************/ 1353 this(string_t regex) 1354 { 1355 next_tag = 1; 1356 transitions = new List!(trans_t); 1357 1358 pattern = regex; 1359 } 1360 1361 /* ******************************************************************************************** 1362 Print the TNFA (tabular representation of the delta function) 1363 **********************************************************************************************/ 1364 debug(TangoRegex) void print() 1365 { 1366 foreach ( int i, s; states ) 1367 { 1368 Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" "); 1369 1370 bool first=true; 1371 Stdout(" {"); 1372 foreach ( t; s.transitions ) 1373 { 1374 Stdout.format("{}{}{}:{}->{}", first?"":", ", t.priority, "gnrx"[t.priorityClass], t.predicate.toString, t.target is null?-1:t.target.index); 1375 if ( t.tag > 0 ) { 1376 Stdout.format(" t{}", t.tag); 1377 } 1378 first = false; 1379 } 1380 Stdout("}").newline; 1381 } 1382 } 1383 1384 uint tagCount() 1385 { 1386 return next_tag-1; 1387 } 1388 1389 /* ******************************************************************************************** 1390 Constructs the TNFA using extended Thompson method. 1391 Uses a slightly extended version of Dijkstra's shunting yard algorithm to convert 1392 the regexp from infix notation. 1393 **********************************************************************************************/ 1394 void parse(bool unanchored) 1395 { 1396 List!(frag_t) frags = new List!(frag_t); 1397 Stack!(Operator) opStack; 1398 Stack!(uint) tagStack; 1399 Stack!(Pair!(uint)) occurStack; 1400 opStack ~= Operator.eos; 1401 1402 /* **************************************************************************************** 1403 Perform action on operator stack 1404 ******************************************************************************************/ 1405 bool perform(Operator next_op, bool explicit_operator=true) 1406 { 1407 // calculate index in action matrix 1408 int index = cast(int)opStack.top()*(Operator.max+1); 1409 index += cast(int)next_op; 1410 1411 debug(tnfa) Stdout.formatln("\t{}:{} -> {} {} frag(s)", 1412 operator_names[opStack.top], operator_names[next_op], action_names[action_lookup[index]], frags.length 1413 ); 1414 switch ( action_lookup[index] ) 1415 { 1416 case Act.pua: 1417 opStack ~= next_op; 1418 if ( next_op == Operator.open_par ) { 1419 tagStack ~= next_tag; 1420 next_tag += 2; 1421 } 1422 break; 1423 case Act.poc: 1424 switch ( opStack.top() ) 1425 { 1426 case Operator.concat: constructConcat(frags); break; 1427 case Operator.altern: constructAltern(frags); break; 1428 case Operator.zero_one_g: constructZeroOne(frags, PriorityClass.greedy); break; 1429 case Operator.zero_one_ng: constructZeroOne(frags, PriorityClass.reluctant); break; 1430 case Operator.zero_one_xr: constructZeroOne(frags, PriorityClass.extraReluctant); break; 1431 case Operator.zero_more_g: constructZeroMore(frags, PriorityClass.greedy); break; 1432 case Operator.zero_more_ng: constructZeroMore(frags, PriorityClass.reluctant); break; 1433 case Operator.zero_more_xr: constructZeroMore(frags, PriorityClass.extraReluctant); break; 1434 case Operator.one_more_g: constructOneMore(frags, PriorityClass.greedy); break; 1435 case Operator.one_more_ng: constructOneMore(frags, PriorityClass.reluctant); break; 1436 case Operator.one_more_xr: constructOneMore(frags, PriorityClass.extraReluctant); break; 1437 case Operator.occur_g: 1438 Pair!(uint) occur = occurStack.pop(); 1439 constructOccur(frags, occur.a, occur.b, PriorityClass.greedy); 1440 break; 1441 case Operator.occur_ng: 1442 Pair!(uint) occur = occurStack.pop(); 1443 constructOccur(frags, occur.a, occur.b, PriorityClass.reluctant); 1444 break; 1445 default: 1446 throw new RegExpException("cannot process operand at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 1447 } 1448 opStack.pop(); 1449 1450 perform(next_op, false); 1451 break; 1452 case Act.poa: 1453 opStack.pop(); 1454 break; 1455 case Act.pca: 1456 if ( opStack.top() == Operator.open_par ) 1457 { 1458 if ( tagStack.empty() ) 1459 throw new RegExpException(Format.convert("Missing opening parentheses for closing parentheses at char {} \"{}\"", cursor, Utf.toString(pattern[cursor..$])).idup); 1460 constructBracket(frags, tagStack.top()); 1461 tagStack.pop(); 1462 } 1463 else { 1464 assert(opStack.top() == Operator.open_par_nm); 1465 constructBracket(frags); 1466 } 1467 opStack.pop(); 1468 break; 1469 case Act.don: 1470 return true; 1471 case Act.err: 1472 default: 1473 throw new RegExpException(Format.convert("Unexpected operand at char {} \"{}\" in \"{}\"", cursor, Utf.toString(pattern[cursor..$]), Utf.toString(pattern)).idup); 1474 } 1475 1476 return false; 1477 } 1478 1479 // add implicit extra reluctant .* (with . == any_char) at the beginning for unanchored matches 1480 // and matching bracket for total match group 1481 if ( unanchored ) { 1482 frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume); 1483 perform(Operator.zero_more_xr, false); 1484 perform(Operator.concat, false); 1485 perform(Operator.open_par, false); 1486 } 1487 1488 // convert regex to postfix and create TNFA 1489 bool implicit_concat; 1490 predicate_t.Type pred_type; 1491 1492 while ( !endOfPattern() ) 1493 { 1494 pred_type = predicate_t.Type.consume; 1495 1496 dchar c = readPattern(); 1497 switch ( c ) 1498 { 1499 case '|': 1500 perform(Operator.altern); 1501 implicit_concat = false; 1502 break; 1503 case '(': 1504 if ( implicit_concat ) 1505 perform(Operator.concat, false); 1506 implicit_concat = false; 1507 if ( peekPattern() == '?' ) { 1508 readPattern(); 1509 perform(swapMatchingBracketSyntax?Operator.open_par:Operator.open_par_nm); 1510 } 1511 else 1512 perform(swapMatchingBracketSyntax?Operator.open_par_nm:Operator.open_par); 1513 break; 1514 case ')': 1515 perform(Operator.close_par); 1516 break; 1517 case '?': 1518 if ( peekPattern() == '?' ) { 1519 readPattern(); 1520 perform(Operator.zero_one_ng); 1521 } 1522 else 1523 perform(Operator.zero_one_g); 1524 break; 1525 case '*': 1526 if ( peekPattern() == '?' ) { 1527 readPattern(); 1528 perform(Operator.zero_more_ng); 1529 } 1530 else 1531 perform(Operator.zero_more_g); 1532 break; 1533 case '+': 1534 if ( peekPattern() == '?' ) { 1535 readPattern(); 1536 perform(Operator.one_more_ng); 1537 } 1538 else 1539 perform(Operator.one_more_g); 1540 break; 1541 case '{': 1542 Pair!(uint) occur; 1543 parseOccurCount(occur.a, occur.b); 1544 occurStack ~= occur; 1545 if ( peekPattern() == '?' ) { 1546 readPattern(); 1547 perform(Operator.occur_ng); 1548 } 1549 else 1550 perform(Operator.occur_g); 1551 break; 1552 case '[': 1553 if ( implicit_concat ) 1554 perform(Operator.concat, false); 1555 implicit_concat = true; 1556 frags ~= constructCharClass(pred_type); 1557 break; 1558 case '.': 1559 if ( implicit_concat ) 1560 perform(Operator.concat, false); 1561 implicit_concat = true; 1562 frags ~= constructChars(cc_t.dot_oper, pred_type); 1563 break; 1564 case '$': 1565 if ( implicit_concat ) 1566 perform(Operator.concat, false); 1567 implicit_concat = true; 1568 1569 frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookahead); 1570 break; 1571 case '^': 1572 if ( implicit_concat ) 1573 perform(Operator.concat, false); 1574 implicit_concat = true; 1575 1576 frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookbehind); 1577 break; 1578 case '>': 1579 c = readPattern(); 1580 pred_type = predicate_t.Type.lookahead; 1581 if ( c == '[' ) 1582 goto case '['; 1583 else if ( c == '\\' ) 1584 goto case '\\'; 1585 else if ( c == '.' ) 1586 goto case '.'; 1587 else 1588 goto default; 1589 case '<': 1590 c = readPattern(); 1591 pred_type = predicate_t.Type.lookbehind; 1592 if ( c == '[' ) 1593 goto case '['; 1594 else if ( c == '\\' ) 1595 goto case '\\'; 1596 else if ( c == '.' ) 1597 goto case '.'; 1598 else 1599 goto default; 1600 case '\\': 1601 c = readPattern(); 1602 1603 if ( implicit_concat ) 1604 perform(Operator.concat, false); 1605 implicit_concat = true; 1606 1607 switch ( c ) 1608 { 1609 case 't': 1610 frags ~= constructSingleChar('\t', pred_type); 1611 break; 1612 case 'n': 1613 frags ~= constructSingleChar('\n', pred_type); 1614 break; 1615 case 'r': 1616 frags ~= constructSingleChar('\r', pred_type); 1617 break; 1618 case 'w': // alphanumeric and _ 1619 frags ~= constructChars(cc_t.alphanum_, pred_type); 1620 break; 1621 case 'W': // non-(alphanum and _) 1622 auto cc = cc_t(cc_t.alphanum_); 1623 cc.negate(); 1624 frags ~= constructChars(cc, pred_type); 1625 break; 1626 case 's': // whitespace 1627 frags ~= constructChars(cc_t.whitespace, pred_type); 1628 break; 1629 case 'S': // non-whitespace 1630 auto cc = cc_t(cc_t.whitespace); 1631 cc.negate(); 1632 frags ~= constructChars(cc, pred_type); 1633 break; 1634 case 'd': // digit 1635 frags ~= constructChars(cc_t.digit, pred_type); 1636 break; 1637 case 'D': // non-digit 1638 auto cc = cc_t(cc_t.digit); 1639 cc.negate(); 1640 frags ~= constructChars(cc, pred_type); 1641 break; 1642 case 'b': // either end of word 1643 if ( pred_type != predicate_t.Type.consume ) 1644 throw new RegExpException("Escape sequence \\b not allowed in look-ahead or -behind"); 1645 1646 // create (?<\S>\s|<\s>\S) 1647 auto cc = cc_t(cc_t.whitespace); 1648 cc.negate(); 1649 1650 perform(Operator.open_par_nm); 1651 1652 frags ~= constructChars(cc, predicate_t.Type.lookbehind); 1653 perform(Operator.concat, false); 1654 frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead); 1655 perform(Operator.altern, false); 1656 frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind); 1657 perform(Operator.concat, false); 1658 frags ~= constructChars(cc, predicate_t.Type.lookahead); 1659 1660 perform(Operator.close_par, false); 1661 break; 1662 case 'B': // neither end of word 1663 if ( pred_type != predicate_t.Type.consume ) 1664 throw new RegExpException("Escape sequence \\B not allowed in look-ahead or -behind"); 1665 1666 // create (?<\S>\S|<\s>\s) 1667 auto cc = cc_t(cc_t.whitespace); 1668 cc.negate(); 1669 1670 perform(Operator.open_par_nm); 1671 1672 frags ~= constructChars(cc, predicate_t.Type.lookbehind); 1673 perform(Operator.concat, false); 1674 frags ~= constructChars(cc, predicate_t.Type.lookahead); 1675 perform(Operator.altern, false); 1676 frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind); 1677 perform(Operator.concat, false); 1678 frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead); 1679 1680 perform(Operator.close_par, false); 1681 break; 1682 case '(': 1683 case ')': 1684 case '[': 1685 case ']': 1686 case '{': 1687 case '}': 1688 case '*': 1689 case '+': 1690 case '?': 1691 case '.': 1692 case '\\': 1693 case '^': 1694 case '$': 1695 case '|': 1696 case '<': 1697 case '>': 1698 default: 1699 frags ~= constructSingleChar(c, pred_type); 1700 break; 1701 // throw new RegExpException(Format.convert("Unknown escape sequence \\{}", c)); 1702 } 1703 break; 1704 1705 default: 1706 if ( implicit_concat ) 1707 perform(Operator.concat, false); 1708 implicit_concat = true; 1709 frags ~= constructSingleChar(c, pred_type); 1710 } 1711 } 1712 1713 // add implicit reluctant .* (with . == any_char) at the end for unanchored matches 1714 if ( unanchored ) 1715 { 1716 perform(Operator.close_par, false); 1717 if ( implicit_concat ) 1718 perform(Operator.concat, false); 1719 frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume); 1720 perform(Operator.zero_more_ng, false); 1721 } 1722 1723 // empty operator stack 1724 while ( !perform(Operator.eos) ) {} 1725 1726 // set start and finish states 1727 start = addState(); 1728 state_t finish = addState(); 1729 finish.accept = true; 1730 1731 foreach ( f; frags ) { 1732 f.setExit(finish); 1733 f.setEntry(start); 1734 } 1735 1736 // set transition priorities 1737 List!(trans_t)[PriorityClass.max+1] trans; 1738 foreach ( ref t; trans ) 1739 t = new List!(trans_t); 1740 1741 Stack!(trans_t) todo; 1742 state_t state = start; 1743 1744 while ( !todo.empty() || !state.visited ) 1745 { 1746 if ( !state.visited ) 1747 { 1748 state.visited = true; 1749 foreach_reverse ( t; state.transitions ) 1750 todo ~= t; 1751 } 1752 1753 if ( todo.empty() ) 1754 break; 1755 trans_t t = todo.top(); 1756 todo.pop(); 1757 assert(t.priorityClass<=PriorityClass.max); 1758 trans[t.priorityClass] ~= t; 1759 state = t.target; 1760 } 1761 1762 uint nextPrio; 1763 foreach ( ts; trans ) 1764 { 1765 foreach ( t; ts ) 1766 t.priority = nextPrio++; 1767 } 1768 } 1769 1770 private: 1771 uint next_tag; 1772 size_t cursor, 1773 next_cursor; 1774 List!(trans_t) transitions; 1775 1776 state_t[state_t] clonedStates; 1777 trans_t[trans_t] clonedTransitions; 1778 1779 /// RegEx operators 1780 enum Operator { 1781 eos, concat, altern, open_par, close_par, 1782 zero_one_g, zero_more_g, one_more_g, // greedy 1783 zero_one_ng, zero_more_ng, one_more_ng, // non-greedy/reluctant 1784 zero_one_xr, zero_more_xr, one_more_xr, // extra-reluctant 1785 open_par_nm, occur_g, occur_ng 1786 } 1787 __gshared immutable const(char)[][] operator_names = ["EOS", "concat", "|", "(", ")", "?", "*", "+", "??", "*?", "+?", "??x", "*?x", "+?x", "(?", "{x,y}", "{x,y}?"]; 1788 1789 /// Actions for to-postfix transformation 1790 enum Act { 1791 pua, poc, poa, pca, don, err 1792 } 1793 __gshared immutable const(char)[][] action_names = ["push+advance", "pop+copy", "pop+advance", "pop+copy+advance", "done", "error"]; 1794 1795 /// Action lookup for to-postfix transformation 1796 __gshared immutable Act[] action_lookup = 1797 [ 1798 // eos concat | ( ) ? * + ?? *? +? ??extra *?extra +?extra (? {x,y} {x,y}? 1799 Act.don, Act.pua, Act.pua, Act.pua, Act.err, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1800 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1801 Act.poc, Act.pua, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1802 Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1803 Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, 1804 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1805 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1806 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1807 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1808 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1809 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1810 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1811 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1812 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1813 Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1814 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1815 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc 1816 ]; 1817 1818 final dchar peekPattern() 1819 { 1820 auto tmp = next_cursor; 1821 if ( tmp < pattern.length ) 1822 return decode(pattern, tmp); 1823 return 0; 1824 } 1825 1826 final dchar readPattern() 1827 { 1828 cursor = next_cursor; 1829 if ( next_cursor < pattern.length ) 1830 return decode(pattern, next_cursor); 1831 return 0; 1832 } 1833 1834 final bool endOfPattern() 1835 { 1836 return next_cursor >= pattern.length; 1837 } 1838 1839 state_t addState() 1840 { 1841 state_t s = new state_t; 1842 s.index = states.length; 1843 states ~= s; 1844 return s; 1845 } 1846 1847 trans_t addTransition(PriorityClass pc = PriorityClass.normal) 1848 { 1849 trans_t trans = new trans_t(pc); 1850 transitions ~= trans; 1851 return trans; 1852 } 1853 1854 uint parseNumber() 1855 { 1856 uint res; 1857 while ( !endOfPattern() ) 1858 { 1859 auto c = peekPattern(); 1860 if ( c < '0' || c > '9' ) 1861 break; 1862 res = res*10+(c-'0'); 1863 readPattern(); 1864 } 1865 return res; 1866 } 1867 1868 void parseOccurCount(out uint minOccur, out uint maxOccur) 1869 { 1870 assert(pattern[cursor] == '{'); 1871 1872 minOccur = parseNumber(); 1873 if ( peekPattern() == '}' ) { 1874 readPattern(); 1875 maxOccur = minOccur; 1876 return; 1877 } 1878 if ( peekPattern() != ',' ) 1879 throw new RegExpException("Invalid occurence range at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 1880 readPattern(); 1881 maxOccur = parseNumber(); 1882 if ( peekPattern() != '}' ) 1883 throw new RegExpException("Invalid occurence range at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 1884 readPattern(); 1885 if ( maxOccur > 0 && maxOccur < minOccur ) 1886 throw new RegExpException("Invalid occurence range (max < min) at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 1887 } 1888 1889 trans_t clone(trans_t t) 1890 { 1891 if ( t is null ) 1892 return null; 1893 trans_t* tmp = t in clonedTransitions; 1894 if ( tmp !is null ) 1895 return *tmp; 1896 1897 trans_t t2 = new trans_t(t.priorityClass); 1898 clonedTransitions[t] = t2; 1899 t2.tag = t.tag; 1900 t2.priority = t.priority; 1901 t2.predicate = t.predicate; 1902 t2.target = clone(t.target); 1903 transitions ~= t2; 1904 return t2; 1905 } 1906 1907 state_t clone(state_t s) 1908 { 1909 if ( s is null ) 1910 return null; 1911 state_t* tmp = s in clonedStates; 1912 if ( tmp !is null ) 1913 return *tmp; 1914 1915 state_t s2 = new state_t; 1916 clonedStates[s] = s2; 1917 s2.accept = s.accept; 1918 s2.visited = s.visited; 1919 foreach ( t; s.transitions ) 1920 s2.transitions ~= clone(t); 1921 s2.index = states.length; 1922 states ~= s2; 1923 return s2; 1924 } 1925 1926 frag_t clone(frag_t f) 1927 { 1928 if ( f is null ) 1929 return null; 1930 clonedStates = null; 1931 clonedTransitions = null; 1932 1933 frag_t f2 = new frag_t; 1934 foreach ( t; f.entries ) 1935 f2.entries ~= clone(t); 1936 foreach ( t; f.exits ) 1937 f2.exits ~= clone(t); 1938 foreach ( t; f.entry_state ) 1939 f2.entry_state ~= clone(t); 1940 foreach ( t; f.exit_state ) 1941 f2.exit_state ~= clone(t); 1942 return f2; 1943 } 1944 1945 //--------------------------------------------------------------------------------------------- 1946 // Thompson constructions of NFA fragments 1947 1948 frag_t constructSingleChar(char_t c, predicate_t.Type type) 1949 { 1950 debug(tnfa) Stdout.formatln("constructCharFrag {}", c); 1951 1952 trans_t trans = addTransition(); 1953 trans.predicate.appendInput(CharRange!(char_t)(c)); 1954 1955 trans.predicate.type = type; 1956 1957 frag_t frag = new frag_t; 1958 frag.exit_state ~= trans; 1959 frag.entries ~= trans; 1960 return frag; 1961 } 1962 1963 frag_t constructChars(string_t chars, in predicate_t.Type type) 1964 { 1965 cc_t cc; 1966 for ( int i = 0; i < chars.length; ++i ) 1967 cc.add(chars[i]); 1968 1969 return constructChars(cc, type); 1970 } 1971 1972 frag_t constructChars(const(cc_t) charclass, in predicate_t.Type type) 1973 { 1974 debug(tnfa) Stdout.format("constructChars type={}", type); 1975 1976 trans_t trans = addTransition(); 1977 trans.predicate.type = type; 1978 1979 trans.predicate.setInput(cc_t(charclass)); 1980 1981 trans.predicate.optimize(); 1982 debug(tnfa) Stdout.formatln("-> {}", trans.predicate.toString); 1983 1984 frag_t frag = new frag_t; 1985 frag.exit_state ~= trans; 1986 frag.entries ~= trans; 1987 return frag; 1988 } 1989 1990 frag_t constructCharClass(predicate_t.Type type) 1991 { 1992 debug(tnfa) Stdout.format("constructCharClass type={}", type); 1993 auto oldCursor = cursor; 1994 1995 trans_t trans = addTransition(); 1996 1997 bool negated=false; 1998 if ( peekPattern() == '^' ) { 1999 readPattern(); 2000 negated = true; 2001 } 2002 2003 char_t last; 2004 bool have_range_start, 2005 first_char = true; 2006 for ( ; !endOfPattern() && peekPattern() != ']'; ) 2007 { 2008 dchar c = readPattern(); 2009 switch ( c ) 2010 { 2011 case '-': 2012 if ( first_char ) { 2013 trans.predicate.appendInput(range_t(c)); 2014 break; 2015 } 2016 if ( !have_range_start ) 2017 throw new RegExpException("Missing range start for '-' operator after \""~Utf.toString(pattern).idup~"\""); 2018 else if ( endOfPattern() || peekPattern() == ']' ) 2019 throw new RegExpException("Missing range end for '-' operator after \""~Utf.toString(pattern).idup~"\""); 2020 else { 2021 c = readPattern(); 2022 trans.predicate.appendInput(range_t(last, c)); 2023 have_range_start = false; 2024 } 2025 break; 2026 case '\\': 2027 if ( endOfPattern() ) 2028 throw new RegExpException("unexpected end of string after \""~Utf.toString(pattern).idup~"\""); 2029 c = readPattern(); 2030 switch ( c ) 2031 { 2032 case 't': 2033 c = '\t'; 2034 break; 2035 case 'n': 2036 c = '\n'; 2037 break; 2038 case 'r': 2039 c = '\r'; 2040 break; 2041 default: 2042 break; 2043 } 2044 goto default; 2045 default: 2046 if ( have_range_start ) 2047 trans.predicate.appendInput(range_t(last)); 2048 last = c; 2049 have_range_start = true; 2050 } 2051 first_char = false; 2052 } 2053 if ( !endOfPattern() ) 2054 readPattern(); 2055 if ( last != char_t.init ) 2056 trans.predicate.appendInput(range_t(last)); 2057 debug(tnfa) Stdout.formatln(" {}", pattern[oldCursor..cursor]); 2058 2059 if ( negated ) { 2060 auto tmp = cc_t(cc_t.any_char); 2061 tmp.subtract(trans.predicate.input); 2062 trans.predicate.input = tmp; 2063 } 2064 else 2065 trans.predicate.optimize(); 2066 debug(tnfa) Stdout.formatln("-> {}", trans.predicate.toString); 2067 2068 trans.predicate.type = type; 2069 2070 frag_t frag = new frag_t; 2071 frag.exit_state ~= trans; 2072 frag.entries ~= trans; 2073 return frag; 2074 } 2075 2076 void constructBracket(List!(frag_t) frags, uint tag=0) 2077 { 2078 debug(tnfa) Stdout.formatln("constructBracket"); 2079 2080 state_t entry = addState(), 2081 exit = addState(); 2082 frags.tail.value.setEntry(entry); 2083 frags.tail.value.setExit(exit); 2084 2085 trans_t tag1 = addTransition(), 2086 tag2 = addTransition(); 2087 tag1.predicate.type = predicate_t.Type.epsilon; 2088 tag2.predicate.type = predicate_t.Type.epsilon; 2089 if ( tag > 0 ) 2090 { 2091 // make sure the tag indeces for bracket x are always 2092 // x*2 for the opening bracket and x*2+1 for the closing bracket 2093 tag1.tag = tag++; 2094 tag2.tag = tag; 2095 } 2096 tag1.target = entry; 2097 exit.transitions ~= tag2; 2098 2099 frag_t frag = new frag_t; 2100 frag.entries ~= tag1; 2101 frag.exit_state ~= tag2; 2102 frags.pop(); 2103 frags ~= frag; 2104 } 2105 2106 void constructOneMore(List!(frag_t) frags, PriorityClass prioClass) 2107 { 2108 debug(tnfa) Stdout.formatln("constructOneMore"); 2109 2110 if ( frags.empty() ) 2111 throw new RegExpException("too few arguments for + at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2112 2113 trans_t repeat = addTransition(prioClass), 2114 cont = addTransition(); 2115 repeat.predicate.type = predicate_t.Type.epsilon; 2116 cont.predicate.type = predicate_t.Type.epsilon; 2117 2118 state_t s = addState(); 2119 frags.tail.value.setExit(s); 2120 s.transitions ~= repeat; 2121 s.transitions ~= cont; 2122 2123 frag_t frag = new frag_t; 2124 frag.entries ~= frags.tail.value.entries; 2125 frag.entry_state ~= frags.tail.value.entry_state; 2126 frag.entry_state ~= repeat; 2127 frag.exit_state ~= cont; 2128 frags.pop(); 2129 frags ~= frag; 2130 } 2131 2132 void constructZeroMore(List!(frag_t) frags, PriorityClass prioClass) 2133 { 2134 debug(tnfa) Stdout.formatln("constructZeroMore"); 2135 2136 if ( frags.empty() ) 2137 throw new RegExpException("too few arguments for * at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2138 2139 trans_t enter = addTransition(prioClass), 2140 repeat = addTransition(prioClass), 2141 skip = addTransition(); 2142 skip.predicate.type = predicate_t.Type.epsilon; 2143 repeat.predicate.type = predicate_t.Type.epsilon; 2144 enter.predicate.type = predicate_t.Type.epsilon; 2145 2146 state_t entry = addState(), 2147 exit = addState(); 2148 frags.tail.value.setEntry(entry); 2149 frags.tail.value.setExit(exit); 2150 exit.transitions ~= repeat; 2151 enter.target = entry; 2152 2153 frag_t frag = new frag_t; 2154 frag.entries ~= skip; 2155 frag.entries ~= enter; 2156 frag.exit_state ~= skip; 2157 frag.entry_state ~= repeat; 2158 frags.pop(); 2159 frags ~= frag; 2160 } 2161 2162 void constructZeroOne(List!(frag_t) frags, PriorityClass prioClass) 2163 { 2164 debug(tnfa) Stdout.formatln("constructZeroOne"); 2165 2166 if ( frags.empty() ) 2167 throw new RegExpException("too few arguments for ? at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2168 2169 trans_t use = addTransition(prioClass), 2170 skip = addTransition(); 2171 use.predicate.type = predicate_t.Type.epsilon; 2172 skip.predicate.type = predicate_t.Type.epsilon; 2173 2174 state_t s = addState(); 2175 frags.tail.value.setEntry(s); 2176 use.target = s; 2177 2178 frag_t frag = new frag_t; 2179 frag.entries ~= use; 2180 frag.entries ~= skip; 2181 frag.exits ~= frags.tail.value.exits; 2182 frag.exit_state ~= frags.tail.value.exit_state; 2183 frag.exit_state ~= skip; 2184 frags.pop(); 2185 frags ~= frag; 2186 } 2187 2188 void constructOccur(List!(frag_t) frags, uint minOccur, uint maxOccur, PriorityClass prioClass) 2189 { 2190 debug(tnfa) Stdout.formatln("constructOccur {},{}", minOccur, maxOccur); 2191 2192 if ( frags.empty() ) 2193 throw new RegExpException("too few arguments for {x,y} at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2194 2195 state_t s; 2196 frag_t total = new frag_t, 2197 prev; 2198 2199 for ( int i = 0; i < minOccur; ++i ) 2200 { 2201 frag_t f = clone(frags.tail.value); 2202 if ( prev !is null ) { 2203 s = addState(); 2204 prev.setExit(s); 2205 f.setEntry(s); 2206 } 2207 else { 2208 total.entries = f.entries; 2209 total.entry_state = f.entry_state; 2210 } 2211 prev = f; 2212 } 2213 2214 if ( maxOccur == 0 ) 2215 { 2216 frag_t f = frags.tail.value; 2217 trans_t t = addTransition(); 2218 t.predicate.type = predicate_t.Type.epsilon; 2219 f.entries ~= t; 2220 f.exit_state ~= t; 2221 2222 t = addTransition(); 2223 t.predicate.type = predicate_t.Type.epsilon; 2224 f.exits ~= t; 2225 f.entry_state ~= t; 2226 2227 s = addState(); 2228 f.setEntry(s); 2229 2230 if ( prev !is null ) 2231 prev.setExit(s); 2232 else { 2233 total.entries = f.entries; 2234 total.entry_state = f.entry_state; 2235 } 2236 2237 prev = f; 2238 } 2239 2240 for ( int i = minOccur; i < maxOccur; ++i ) 2241 { 2242 frag_t f; 2243 if ( i < maxOccur-1 ) 2244 f = clone(frags.tail.value); 2245 else 2246 f = frags.tail.value; 2247 trans_t t = addTransition(); 2248 t.predicate.type = predicate_t.Type.epsilon; 2249 f.entries ~= t; 2250 f.exit_state ~= t; 2251 2252 if ( prev !is null ) { 2253 s = addState(); 2254 prev.setExit(s); 2255 f.setEntry(s); 2256 } 2257 else { 2258 total.entries = f.entries; 2259 total.entry_state = f.entry_state; 2260 } 2261 prev = f; 2262 } 2263 2264 total.exits = prev.exits; 2265 total.exit_state = prev.exit_state; 2266 2267 frags.pop(); 2268 frags ~= total; 2269 } 2270 2271 void constructAltern(List!(frag_t) frags) 2272 { 2273 debug(tnfa) Stdout.formatln("constructAltern"); 2274 2275 if ( frags.empty() || frags.head is frags.tail ) 2276 throw new RegExpException("too few arguments for | at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2277 2278 frag_t frag = new frag_t, 2279 f1 = frags.tail.value, 2280 f2 = frags.tail.prev.value; 2281 frag.entry_state ~= f2.entry_state; 2282 frag.entry_state ~= f1.entry_state; 2283 frag.exit_state ~= f2.exit_state; 2284 frag.exit_state ~= f1.exit_state; 2285 frag.entries ~= f2.entries; 2286 frag.entries ~= f1.entries; 2287 frag.exits ~= f2.exits; 2288 frag.exits ~= f1.exits; 2289 2290 frags.pop(); 2291 frags.pop(); 2292 frags ~= frag; 2293 } 2294 2295 void constructConcat(List!(frag_t) frags) 2296 { 2297 debug(tnfa) Stdout.formatln("constructConcat"); 2298 2299 if ( frags.empty() || frags.head is frags.tail ) 2300 throw new RegExpException("too few operands for concatenation at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2301 2302 frag_t f1 = frags.tail.value, 2303 f2 = frags.tail.prev.value; 2304 2305 state_t state = addState(); 2306 f2.setExit(state); 2307 f1.setEntry(state); 2308 2309 frag_t frag = new frag_t; 2310 frag.entries ~= f2.entries; 2311 frag.exits ~= f1.exits; 2312 frag.entry_state ~= f2.entry_state; 2313 frag.exit_state ~= f1.exit_state; 2314 frags.pop(); 2315 frags.pop(); 2316 frags ~= frag; 2317 } 2318 } 2319 import tango.core.Array; 2320 2321 /* ************************************************************************************************ 2322 Tagged DFA 2323 **************************************************************************************************/ 2324 private class TDFA(char_t) 2325 { 2326 alias Predicate!(char_t) predicate_t; 2327 alias CharRange!(char_t) range_t; 2328 alias CharClass!(char_t) charclass_t; 2329 alias char_t[] string_t; 2330 2331 enum uint CURRENT_POSITION_REGISTER = ~0; 2332 2333 /* ******************************************************************************************** 2334 Tag map assignment command 2335 **********************************************************************************************/ 2336 struct Command 2337 { 2338 uint dst, /// register index to recieve data 2339 src; /// register index or CURRENT_POSITION_REGISTER for current position 2340 2341 string toString() 2342 { 2343 return Format.convert("{}<-{}", dst, src==CURRENT_POSITION_REGISTER?"p":Format.convert("{}", src)).idup; 2344 } 2345 2346 /* **************************************************************************************** 2347 Order transitions by the order of their predicates. 2348 ******************************************************************************************/ 2349 int opCmp(const ref Command cmd) const 2350 { 2351 if ( src == CURRENT_POSITION_REGISTER && cmd.src != CURRENT_POSITION_REGISTER ) 2352 return 1; 2353 if ( src != CURRENT_POSITION_REGISTER && cmd.src == CURRENT_POSITION_REGISTER ) 2354 return -1; 2355 if ( dst < cmd.dst ) 2356 return -1; 2357 if ( dst == cmd.dst ) 2358 return 0; 2359 return 1; 2360 } 2361 } 2362 2363 struct TagIndex 2364 { 2365 uint tag, 2366 index; 2367 2368 const int opCmp(ref const TagIndex o) 2369 { 2370 if (tag == o.tag && index == o.index) 2371 return 0; 2372 if (tag > o.tag) 2373 return 1; 2374 if (tag < o.tag) 2375 return -1; 2376 if (index > o.index) 2377 return 1; 2378 if (index < o.index) 2379 return -1; 2380 assert(0); 2381 } 2382 } 2383 2384 /* ******************************************************************************************** 2385 TDFA state 2386 **********************************************************************************************/ 2387 class State 2388 { 2389 enum Mode { 2390 GENERIC, MIXED, LOOKUP 2391 } 2392 2393 enum uint LOOKUP_LENGTH = 256, 2394 INVALID_STATE = 255; 2395 2396 bool accept = false; 2397 bool reluctant = false; 2398 size_t index; 2399 Transition[] transitions, 2400 generic_transitions; 2401 Command[] finishers; 2402 2403 ubyte[] lookup; 2404 Mode mode; 2405 2406 void optimize() 2407 { 2408 // merge transitions with equal targets (same state index and equal commands) 2409 size_t[] remove_indeces; 2410 foreach ( i, t; transitions[0 .. $-1] ) 2411 { 2412 foreach ( t2; transitions[i+1 .. $] ) 2413 { 2414 if ( t.predicate.type != t2.predicate.type || !t.equalTarget(t2) ) 2415 continue; 2416 t2.predicate.appendInput(t.predicate); 2417 remove_indeces ~= i; 2418 break; 2419 } 2420 } 2421 2422 // remove transitions that have been merged into another 2423 if ( remove_indeces.length > 0 ) 2424 { 2425 Transition[] tmp; 2426 tmp.length = transitions.length - remove_indeces.length; 2427 size_t next_remove, next; 2428 foreach ( i, t; transitions ) 2429 { 2430 if ( next_remove < remove_indeces.length && remove_indeces[next_remove] == i ) { 2431 ++next_remove; 2432 continue; 2433 } 2434 tmp[next++] = t; 2435 } 2436 transitions = tmp; 2437 2438 foreach ( t; transitions ) 2439 t.predicate.optimize(); 2440 } 2441 } 2442 2443 void createLookup() 2444 { 2445 size_t count; 2446 foreach ( t; transitions ) 2447 { 2448 if ( !t.predicate.exceedsMax(LOOKUP_LENGTH) ) 2449 ++count; 2450 } 2451 2452 if ( count < 1 || transitions.length > INVALID_STATE ) { 2453 generic_transitions.length = transitions.length; 2454 generic_transitions[] = transitions[]; 2455 return; 2456 } 2457 2458 foreach ( t; transitions ) 2459 { 2460 if ( t.predicate.exceedsMax(LOOKUP_LENGTH) ) 2461 generic_transitions ~= t; 2462 } 2463 2464 // setup lookup table 2465 lookup.length = LOOKUP_LENGTH; 2466 lookup[] = INVALID_STATE; 2467 foreach ( i, t; transitions ) 2468 { 2469 foreach ( p; t.predicate.input.parts ) 2470 { 2471 if ( p.l_ >= lookup.length ) 2472 continue; 2473 for ( char_t c = p.l_; c <= min(p.r_, LOOKUP_LENGTH-1); ++c ) 2474 lookup[c] = cast(ubyte)i; 2475 } 2476 } 2477 2478 mode = count < transitions.length? Mode.MIXED : Mode.LOOKUP; 2479 } 2480 } 2481 2482 /* ******************************************************************************************** 2483 TDFA transition 2484 **********************************************************************************************/ 2485 class Transition 2486 { 2487 State target; 2488 predicate_t predicate; 2489 Command[] commands; 2490 2491 /* **************************************************************************************** 2492 Order transitions by the order of their predicates. 2493 ******************************************************************************************/ 2494 override final int opCmp(Object o) 2495 { 2496 Transition t = cast(Transition)o; 2497 assert(t !is null); 2498 return predicate.opCmp(t.predicate); 2499 } 2500 2501 override final const bool opEquals(const Object o) 2502 { 2503 auto t = cast(const(Transition))o; 2504 if ( t is null ) 2505 return false; 2506 if ( equalTarget(t) && t.predicate == predicate ) 2507 return true; 2508 return false; 2509 } 2510 2511 const bool equalTarget(in Transition t) 2512 { 2513 if ( t.target.index != target.index ) 2514 return false; 2515 if ( commands.length != t.commands.length ) 2516 return false; 2517 Louter: foreach ( cmd; commands ) 2518 { 2519 foreach ( cmd2; t.commands ) 2520 { 2521 if ( cmd == cmd2 ) 2522 continue Louter; 2523 } 2524 return false; 2525 } 2526 return true; 2527 } 2528 } 2529 2530 2531 State[] states; 2532 State start; 2533 Command[] initializer; 2534 uint num_tags; 2535 2536 uint[TagIndex] registers; 2537 uint next_register; 2538 2539 uint num_regs() 2540 { 2541 return next_register; 2542 } 2543 2544 /* ******************************************************************************************** 2545 Constructs the TDFA from the given TNFA using extended power set method 2546 **********************************************************************************************/ 2547 this(TNFA!(char_t) tnfa) 2548 { 2549 num_tags = tnfa.tagCount(); 2550 2551 next_register = num_tags; 2552 for ( int i = 1; i <= num_tags; ++i ) { 2553 TagIndex ti; 2554 ti.tag = i; 2555 registers[ti] = i-1; 2556 } 2557 2558 // create epsilon closure of TNFA start state 2559 SubsetState subset_start = new SubsetState; 2560 StateElement se = new StateElement; 2561 se.nfa_state = tnfa.start; 2562 subset_start.elms ~= se; 2563 2564 // apply lookbehind closure for string/line start 2565 predicate_t tmp_pred; 2566 tmp_pred.setInput(CharClass!(char_t).line_startend); 2567 subset_start = epsilonClosure(lookbehindClosure(epsilonClosure(subset_start, subset_start), tmp_pred), subset_start); 2568 2569 start = addState(); 2570 subset_start.dfa_state = start; 2571 2572 // generate initializer and finisher commands for TDFA start state 2573 generateInitializers(subset_start); 2574 generateFinishers(subset_start); 2575 2576 // initialize stack for state traversal 2577 List!(SubsetState) subset_states = new List!(SubsetState), 2578 unmarked = new List!(SubsetState); 2579 subset_states ~= subset_start; 2580 unmarked ~= subset_start; 2581 debug(tdfa) Stdout.formatln("\n{} = {}\n", subset_start.dfa_state.index, subset_start.toString); 2582 2583 while ( !unmarked.empty() ) 2584 { 2585 SubsetState state = unmarked.tail.value; 2586 unmarked.pop(); 2587 2588 // create transitions for each class, creating new states when necessary 2589 foreach ( pred; disjointPredicates(state) ) 2590 { 2591 // find NFA state we reach with pred 2592 // reach will set predicate type correctly 2593 debug(tdfa) Stdout.format("from {} with {} reach", state.dfa_state.index, pred.toString); 2594 SubsetState target = reach(state, pred); 2595 if ( target is null ) { 2596 continue; 2597 debug(tdfa) Stdout.formatln(" nothing - lookbehind at beginning, skipping"); 2598 } 2599 debug(tdfa) Stdout.formatln(" {}", target.toString); 2600 target = epsilonClosure(lookbehindClosure(epsilonClosure(target, state), pred), state); 2601 2602 Transition trans = new Transition; 2603 state.dfa_state.transitions ~= trans; 2604 debug (tdfa_new_trans) Stdout.formatln("Creating with pred: {}", pred); 2605 trans.predicate = pred; 2606 2607 // generate indeces for pos commands 2608 // delay creation of pos command until we have reorder-commands 2609 uint[uint] cmds = null; 2610 foreach ( e; target.elms ) 2611 { 2612 foreach ( tag, ref index; e.tags ) 2613 { 2614 bool found=false; 2615 foreach ( e2; state.elms ) 2616 { 2617 int* i = tag in e2.tags; 2618 if ( i !is null && *i == index ) { 2619 found=true; 2620 break; 2621 } 2622 } 2623 if ( !found ) 2624 { 2625 // if index is < 0 it is a temporary index 2626 // used only to distinguish the state from existing ones. 2627 // the previous index can be reused instead. 2628 if ( index < 0 ) 2629 index = -index-1; 2630 cmds[tag] = index; 2631 } 2632 else 2633 assert(index>=0); 2634 } 2635 } 2636 2637 // check whether a state exists that is identical except for tag index reorder-commands 2638 bool exists=false; 2639 foreach ( equivTarget; subset_states ) 2640 { 2641 if ( reorderTagIndeces(target, equivTarget, state, trans) ) { 2642 target = equivTarget; 2643 exists = true; 2644 break; 2645 } 2646 } 2647 // else create new target state 2648 if ( !exists ) 2649 { 2650 State ts = addState(); 2651 target.dfa_state = ts; 2652 subset_states ~= target; 2653 unmarked ~= target; 2654 debug(tdfa_add) { 2655 Stdout.formatln("\nAdded {} = {}\n", target.dfa_state.index, target.toString); 2656 } 2657 generateFinishers(target); 2658 } 2659 2660 // now generate pos commands, rewriting reorder-commands if existent 2661 foreach ( tag, index; cmds ) 2662 { 2663 // check whether reordering used this tag, if so, overwrite the command directly, 2664 // for it's effect would be overwritten by a subsequent pos-command anyway 2665 uint reg = registerFromTagIndex(tag, index); 2666 bool found = false; 2667 foreach ( ref cmd; trans.commands ) 2668 { 2669 if ( cmd.src == reg ) { 2670 found = true; 2671 cmd.src = CURRENT_POSITION_REGISTER; 2672 break; 2673 } 2674 } 2675 if ( !found ) { 2676 Command cmd; 2677 cmd.dst = reg; 2678 cmd.src = CURRENT_POSITION_REGISTER; 2679 trans.commands ~= cmd; 2680 } 2681 } 2682 2683 trans.target = target.dfa_state; 2684 debug(tdfa) { 2685 Stdout.formatln("=> from {} with {} reach {}", state.dfa_state.index, pred.toString, target.dfa_state.index); 2686 } 2687 } 2688 2689 state.dfa_state.optimize(); 2690 } 2691 2692 // renumber registers continuously 2693 uint[uint] regNums; 2694 2695 for ( next_register = 0; next_register < num_tags; ++next_register ) 2696 regNums[next_register] = next_register; 2697 2698 void renumberCommand(ref Command cmd) 2699 { 2700 if ( cmd.src != CURRENT_POSITION_REGISTER && (cmd.src in regNums) is null ) 2701 regNums[cmd.src] = next_register++; 2702 if ( (cmd.dst in regNums) is null ) 2703 regNums[cmd.dst] = next_register++; 2704 if ( cmd.src != CURRENT_POSITION_REGISTER ) 2705 cmd.src = regNums[cmd.src]; 2706 cmd.dst = regNums[cmd.dst]; 2707 } 2708 2709 foreach ( state; states ) 2710 { 2711 foreach ( ref cmd; state.finishers ) 2712 renumberCommand(cmd); 2713 // make sure pos-commands are executed after reorder-commands and 2714 // reorder-commands do not overwrite each other 2715 tango.core.Array.sort(state.finishers); 2716 2717 foreach ( trans; state.transitions ) 2718 { 2719 foreach ( ref cmd; trans.commands ) 2720 renumberCommand(cmd); 2721 tango.core.Array.sort(trans.commands); 2722 trans.predicate.compile(); 2723 } 2724 } 2725 2726 debug(TangoRegex) 2727 { 2728 foreach ( ref v; registers ) 2729 { 2730 if ( (v in regNums) !is null ) 2731 v = regNums[v]; 2732 } 2733 } 2734 2735 minimizeDFA(); 2736 2737 foreach ( state; states ) 2738 state.createLookup(); 2739 2740 // TODO: optimize memory layout of TDFA 2741 2742 // TODO: add lookahead for string-end somewhere 2743 // TODO: mark dead-end states (not leaving a non-finishing susbet) 2744 // TODO: mark states that can leave the finishing subset of DFA states or use a greedy transition 2745 // (execution may stop in that state) 2746 } 2747 2748 /* ******************************************************************************************** 2749 Print the TDFA (tabular representation of the delta function) 2750 **********************************************************************************************/ 2751 debug(TangoRegex) void print() 2752 { 2753 Stdout.formatln("#tags = {}", num_tags); 2754 2755 auto tis = new TagIndex[registers.length]; 2756 foreach ( k, v; registers ) 2757 tis [v] = k; 2758 foreach ( r, ti; tis ) { 2759 Stdout.formatln("tag({},{}) in reg {}", ti.tag, ti.index, r); 2760 } 2761 Stdout.formatln("Initializer:"); 2762 foreach ( cmd; initializer ) { 2763 Stdout.formatln("{}", cmd.toString); 2764 } 2765 Stdout.formatln("Delta function:"); 2766 foreach ( int i, s; states ) 2767 { 2768 Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" "); 2769 2770 bool first=true; 2771 Stdout(" {"); 2772 foreach ( t; s.transitions ) 2773 { 2774 Stdout.format("{}{}->{} (", first?"":", ", t.predicate.toString, t.target.index); 2775 bool firstcmd=true; 2776 foreach ( cmd; t.commands ) 2777 { 2778 if ( firstcmd ) 2779 firstcmd = false; 2780 else 2781 Stdout(","); 2782 Stdout.format("{}", cmd.toString); 2783 } 2784 Stdout(")"); 2785 first = false; 2786 } 2787 Stdout("} ("); 2788 2789 bool firstcmd=true; 2790 foreach ( cmd; s.finishers ) 2791 { 2792 if ( firstcmd ) 2793 firstcmd = false; 2794 else 2795 Stdout(","); 2796 Stdout.format("{}", cmd.toString); 2797 } 2798 Stdout.formatln(")"); 2799 } 2800 } 2801 2802 private: 2803 /* ******************************************************************************************** 2804 A (TNFA state, tags) pair element of a subset state. 2805 **********************************************************************************************/ 2806 class StateElement 2807 { 2808 TNFAState!(char_t) nfa_state; 2809 int[uint] tags; 2810 // use place-value priority with 2 places, value(maxPrio) > value(lastPrio) 2811 uint maxPriority, 2812 lastPriority; 2813 2814 bool prioGreater(StateElement se) 2815 { 2816 if ( maxPriority < se.maxPriority ) 2817 return true; 2818 if ( maxPriority == se.maxPriority ) { 2819 assert(lastPriority != se.lastPriority); 2820 return lastPriority < se.lastPriority; 2821 } 2822 return false; 2823 } 2824 2825 override int opCmp(Object o) 2826 { 2827 StateElement se = cast(StateElement)o; 2828 assert(se !is null); 2829 if ( maxPriority < se.maxPriority ) 2830 return 1; 2831 if ( maxPriority == se.maxPriority ) 2832 { 2833 if ( lastPriority == se.lastPriority ) 2834 return 0; 2835 return lastPriority < se.lastPriority; 2836 } 2837 return -1; 2838 } 2839 2840 override string toString() 2841 { 2842 const(char)[] str; 2843 str = Format.convert("{} p{}.{} {{", nfa_state.index, maxPriority, lastPriority); 2844 bool first = true; 2845 foreach ( k, v; tags ) { 2846 str ~= Format.convert("{}m({},{})", first?"":",", k, v); 2847 first = false; 2848 } 2849 str ~= "}"; 2850 return str.idup; 2851 } 2852 } 2853 2854 /* ******************************************************************************************** 2855 Represents a state in the NFA to DFA conversion. 2856 Contains the set of states (StateElements) the NFA might be in at the same time and the 2857 corresponding DFA state that we create. 2858 **********************************************************************************************/ 2859 class SubsetState 2860 { 2861 StateElement[] elms; 2862 State dfa_state; 2863 2864 this(StateElement[] elms=null) 2865 { 2866 this.elms = elms; 2867 } 2868 2869 int opApply(int delegate (ref TNFATransition!(char_t)) dg) 2870 { 2871 int res; 2872 foreach ( elm; elms ) 2873 { 2874 foreach ( t; elm.nfa_state.transitions ) 2875 { 2876 res = dg(t); 2877 if ( res ) 2878 return res; 2879 } 2880 } 2881 return res; 2882 } 2883 2884 override string toString() 2885 { 2886 string str = "[ "; 2887 bool first = true; 2888 foreach ( s; elms ) { 2889 if ( !first ) 2890 str ~= ", "; 2891 str ~= s.toString(); 2892 first = false; 2893 } 2894 return str~" ]"; 2895 } 2896 } 2897 2898 /* ******************************************************************************************** 2899 Temporary structure used for disjoint predicate computation 2900 **********************************************************************************************/ 2901 struct Mark 2902 { 2903 char_t c; 2904 bool end; /// false = start of range 2905 2906 int opCmp(const ref Mark m) const 2907 { 2908 if ( c < m.c ) 2909 return -1; 2910 if ( c > m.c ) 2911 return 1; 2912 if ( end < m.end ) 2913 return -1; 2914 if ( end > m.end ) 2915 return 1; 2916 return 0; 2917 } 2918 } 2919 2920 /* ******************************************************************************************** 2921 Calculates the register index for a given tag map entry. The TDFA implementation uses 2922 registers to save potential tag positions, the index space gets linearized here. 2923 2924 Params: tag = tag number 2925 index = tag map index 2926 Returns: index of the register to use for the tag map entry 2927 **********************************************************************************************/ 2928 uint registerFromTagIndex(uint tag, uint index) 2929 { 2930 if ( index > 0 ) 2931 { 2932 TagIndex ti; 2933 ti.tag = tag; 2934 ti.index = index; 2935 uint* i = ti in registers; 2936 if ( i !is null ) 2937 return *i; 2938 return registers[ti] = next_register++; 2939 } 2940 else 2941 return tag-1; 2942 } 2943 2944 Mark[] marks_; 2945 2946 /* ******************************************************************************************** 2947 Add new TDFA state to the automaton. 2948 **********************************************************************************************/ 2949 State addState() 2950 { 2951 State s = new State; 2952 s.index = states.length; 2953 states ~= s; 2954 return s; 2955 } 2956 2957 /* ******************************************************************************************** 2958 Creates disjoint predicates from all outgoing, potentially overlapping TNFA transitions. 2959 2960 Params: state = SubsetState to create the predicates from 2961 Returns: List of disjoint predicates that can be used for a DFA state 2962 **********************************************************************************************/ 2963 predicate_t[] disjointPredicates(SubsetState state) 2964 { 2965 alias CharRange!(char_t) range_t; 2966 debug(tdfa) Stdout.formatln("disjointPredicates()"); 2967 2968 size_t num_marks; 2969 foreach ( t; state ) 2970 { 2971 // partitioning will consider lookbehind transitions, 2972 // st. lb-closure will not expand for transitions with a superset of the lb-predicate 2973 if ( t.predicate.type != predicate_t.Type.epsilon ) 2974 { 2975 debug(tdfa) Stdout.formatln("{}", t.predicate.toString); 2976 if ( marks_.length < num_marks+2*t.predicate.getInput().parts.length ) 2977 marks_.length = num_marks+2*t.predicate.getInput().parts.length; 2978 foreach ( p; t.predicate.getInput().parts ) { 2979 marks_[num_marks++] = Mark(p.l(), false); 2980 marks_[num_marks++] = Mark(p.r(), true); 2981 } 2982 } 2983 } 2984 2985 if ( num_marks <= 1 ) 2986 throw new Exception("disjointPredicates: No transitions in subset state"); 2987 2988 debug(tdfa) Stdout("\nsorting...").newline; 2989 // using built-in sort somtimes gives an AV in TypeInfo.swap 2990 quickSort(marks_[0 .. num_marks]); 2991 assert(!marks_[0].end); 2992 2993 debug(tdfa) 2994 { 2995 Stdout("\nsorted marks:\n"); 2996 bool first=true; 2997 foreach ( m; marks_[0 .. num_marks] ) 2998 { 2999 if ( first ) 3000 first = false; 3001 else 3002 Stdout(","); 3003 if ( m.c > 0x20 && m.c < 0x7f ) 3004 Stdout.format("{}{}", m.end?"e":"s", m.c); 3005 else 3006 Stdout.format("{}{:x}", m.end?"e":"s", cast(int)m.c); 3007 } 3008 Stdout.newline; 3009 } 3010 3011 size_t next, 3012 active = 1; 3013 char_t start = marks_[0].c, 3014 end; 3015 range_t[] disjoint = new range_t[num_marks/2+1]; 3016 3017 foreach ( m; marks_[1 .. num_marks] ) 3018 { 3019 if ( m.end ) 3020 { 3021 assert(active>0); 3022 --active; 3023 if ( m.c < start ) 3024 continue; 3025 end = m.c; 3026 // the next range cannot start at the same pos 3027 // because starts are sorted before endings 3028 if ( active > 0 ) 3029 ++m.c; 3030 } 3031 else 3032 { 3033 ++active; 3034 if ( active == 1 ) 3035 { 3036 // skip uncovered interval 3037 if ( m.c > start ) { 3038 start = m.c; 3039 continue; 3040 } 3041 end = m.c; 3042 ++m.c; 3043 } 3044 // skip range start if cursor already marks it 3045 else if ( m.c <= start ) 3046 continue; 3047 else 3048 end = m.c-1; 3049 } 3050 3051 // save range 3052 if ( disjoint.length <= next ) 3053 disjoint.length = disjoint.length*2; 3054 3055 disjoint[next].l_ = start; 3056 disjoint[next].r_ = end; 3057 ++next; 3058 3059 // advance cursor 3060 start = m.c; 3061 } 3062 disjoint.length = next; 3063 3064 // merge isolated ranges into sets of ranges 3065 // no range in a set may occur separated from the others in any predicate 3066 predicate_t[] preds; 3067 preds.length = 1; 3068 Lmerge: foreach ( r; disjoint ) 3069 { 3070 if ( preds[$-1].empty() ) 3071 preds[$-1].appendInput(r); 3072 else 3073 { 3074 // we can merge r into the current predicate if 3075 // pred contains r <=> pred contains all the other ranges 3076 foreach ( t; state ) 3077 { 3078 if ( t.predicate.type == predicate_t.Type.epsilon ) 3079 continue; 3080 3081 if ( t.predicate.getInput().contains(r) 3082 != t.predicate.getInput().contains(preds[$-1].getInput()) ) 3083 { 3084 preds.length = preds.length+1; 3085 break; 3086 } 3087 } 3088 preds[$-1].appendInput(r); 3089 } 3090 } 3091 3092 debug(tdfa) 3093 { 3094 Stdout("\ndisjoint ranges:\n"); 3095 first=true; 3096 foreach ( r; disjoint ) 3097 { 3098 if ( first ) 3099 first = false; 3100 else 3101 Stdout(","); 3102 Stdout.format("{}", r); 3103 } 3104 Stdout.newline; 3105 Stdout("\ndisjoint predicates:\n"); 3106 first=true; 3107 foreach ( ref p; preds ) 3108 { 3109 if ( first ) 3110 first = false; 3111 else 3112 Stdout(","); 3113 Stdout.format("{}", p.toString); 3114 } 3115 Stdout.newline; 3116 } 3117 3118 debug(tdfa) Stdout.formatln("disjointPredicates() end"); 3119 return preds; 3120 } 3121 3122 /* ******************************************************************************************** 3123 Finds all TNFA states that can be reached directly with the given predicate and creates 3124 a new SubsetState containing those target states. 3125 3126 Params: subst = SubsetState to start from 3127 pred = predicate that is matched against outgoing transitions 3128 Returns: SubsetState containing the reached target states 3129 **********************************************************************************************/ 3130 SubsetState reach(SubsetState subst, ref predicate_t pred) 3131 { 3132 // to handle the special case of overlapping consume and lookahead predicates, 3133 // we find the different intersecting predicate types 3134 bool have_consume, 3135 have_lookahead; 3136 foreach ( t; subst ) 3137 { 3138 if ( t.predicate.type != predicate_t.Type.consume && t.predicate.type != predicate_t.Type.lookahead ) 3139 continue; 3140 auto intpred = t.predicate.intersect(pred); 3141 if ( !intpred.empty() ) 3142 { 3143 if ( t.predicate.type == predicate_t.Type.consume ) 3144 have_consume = true; 3145 else if ( t.predicate.type == predicate_t.Type.lookahead ) 3146 have_lookahead = true; 3147 } 3148 } 3149 3150 // if there is consume/lookahead overlap, 3151 // lookahead predicates are handled first 3152 predicate_t.Type processed_type; 3153 if ( have_lookahead ) 3154 processed_type = predicate_t.Type.lookahead; 3155 else if ( have_consume ) 3156 processed_type = predicate_t.Type.consume; 3157 else { 3158 debug(tdfa) Stdout.formatln("\nERROR: reach found no consume/lookahead symbol for {} in \n{}", pred.toString, subst.toString); 3159 return null; 3160 } 3161 pred.type = processed_type; 3162 3163 // add destination states to new subsetstate 3164 SubsetState r = new SubsetState; 3165 foreach ( s; subst.elms ) 3166 { 3167 foreach ( t; s.nfa_state.transitions ) 3168 { 3169 if ( t.predicate.type != processed_type ) 3170 continue; 3171 auto intpred = t.predicate.intersect(pred); 3172 if ( !intpred.empty() ) { 3173 StateElement se = new StateElement; 3174 se.maxPriority = max(t.priority, s.maxPriority); 3175 se.lastPriority = t.priority; 3176 se.nfa_state = t.target; 3177 se.tags = s.tags; 3178 r.elms ~= se; 3179 } 3180 } 3181 } 3182 3183 // if we prioritized lookaheads, the states that may consume are also added to the new subset state 3184 // this behaviour is somewhat similar to an epsilon closure 3185 if ( have_lookahead && have_consume ) 3186 { 3187 foreach ( s; subst.elms ) 3188 { 3189 foreach ( t; s.nfa_state.transitions ) 3190 { 3191 if ( t.predicate.type != predicate_t.Type.consume ) 3192 continue; 3193 auto intpred = t.predicate.intersect(pred); 3194 if ( !intpred.empty() ) { 3195 r.elms ~= s; 3196 break; 3197 } 3198 } 3199 } 3200 } 3201 return r; 3202 } 3203 3204 /* ******************************************************************************************** 3205 Extends the given SubsetState with the states that are reached through lookbehind transitions. 3206 3207 Params: from = SubsetState to create the lookbehind closure for 3208 previous = predicate "from" was reached with 3209 Returns: SubsetState containing "from" and all states of it's lookbehind closure 3210 **********************************************************************************************/ 3211 SubsetState lookbehindClosure(SubsetState from, predicate_t pred) 3212 { 3213 List!(StateElement) stack = new List!(StateElement); 3214 StateElement[size_t] closure; 3215 3216 foreach ( e; from.elms ) 3217 { 3218 stack ~= e; 3219 closure[e.nfa_state.index] = e; 3220 } 3221 3222 while ( !stack.empty() ) 3223 { 3224 StateElement se = stack.tail.value; 3225 stack.pop(); 3226 foreach ( t; se.nfa_state.transitions ) 3227 { 3228 if ( t.predicate.type != predicate_t.Type.lookbehind ) 3229 continue; 3230 if ( t.predicate.intersect(pred).empty() ) 3231 continue; 3232 uint new_maxPri = max(t.priority, se.maxPriority); 3233 3234 StateElement* tmp = t.target.index in closure; 3235 if ( tmp !is null ) 3236 { 3237 // if higher prio (smaller value) exists, do not use this transition 3238 if ( tmp.maxPriority < new_maxPri ) { 3239 // debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}, continuing", t.target.index, tmp.maxPriority, new_maxPri); 3240 continue; 3241 } 3242 else if ( tmp.maxPriority == new_maxPri ) 3243 { 3244 // "equal lastPrio -> first-come-first-serve" 3245 // doesn't work for lexer - how to solve it properly? 3246 if ( tmp.lastPriority <= t.priority ) { 3247 // debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}, continuing", t.target.index, tmp.lastPriority, t.priority); 3248 continue; 3249 } 3250 // else 3251 // debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}", t.target.index, t.priority, tmp.lastPriority); 3252 } 3253 // else 3254 // debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}", t.target.index, new_maxPri, tmp.maxPriority); 3255 } 3256 3257 StateElement new_se = new StateElement; 3258 new_se.maxPriority = max(t.priority, se.maxPriority); 3259 new_se.lastPriority = t.priority; 3260 new_se.nfa_state = t.target; 3261 new_se.tags = se.tags; 3262 3263 closure[t.target.index] = new_se; 3264 stack ~= new_se; 3265 } 3266 } 3267 3268 SubsetState res = new SubsetState; 3269 res.elms = closure.values; 3270 return res; 3271 } 3272 3273 /* ******************************************************************************************** 3274 Generates the epsilon closure of the given subset state, creating tag map entries 3275 if tags are passed. Takes priorities into account, effectively realizing 3276 greediness and reluctancy. 3277 3278 Params: from = SubsetState to create the epsilon closure for 3279 previous = SubsetState "from" was reached from 3280 Returns: SubsetState containing "from" and all states of it's epsilon closure 3281 **********************************************************************************************/ 3282 SubsetState epsilonClosure(SubsetState from, SubsetState previous) 3283 { 3284 int firstFreeIndex=-1; 3285 foreach ( e; previous.elms ) 3286 { 3287 foreach ( ti; e.tags ) 3288 firstFreeIndex = max(firstFreeIndex, cast(int)ti); 3289 } 3290 ++firstFreeIndex; 3291 3292 List!(StateElement) stack = new List!(StateElement); 3293 StateElement[size_t] closure; 3294 3295 foreach ( e; from.elms ) 3296 { 3297 stack ~= e; 3298 closure[e.nfa_state.index] = e; 3299 } 3300 3301 while ( !stack.empty() ) 3302 { 3303 StateElement se = stack.tail.value; 3304 stack.pop(); 3305 foreach ( t; se.nfa_state.transitions ) 3306 { 3307 if ( t.predicate.type != predicate_t.Type.epsilon ) 3308 continue; 3309 // this is different from Ville Laurikari's algorithm, but it's crucial 3310 // to take the max (instead of t.priority) to make reluctant operators work 3311 uint new_maxPri = max(t.priority, se.maxPriority); 3312 3313 StateElement* tmp = t.target.index in closure; 3314 if ( tmp !is null ) 3315 { 3316 // if higher prio (smaller value) exists, do not use this transition 3317 if ( tmp.maxPriority < new_maxPri ) { 3318 // debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}, continuing", t.target.index, tmp.maxPriority, new_maxPri); 3319 continue; 3320 } 3321 else if ( tmp.maxPriority == new_maxPri ) 3322 { 3323 // "equal lastPrio -> first-come-first-serve" 3324 // doesn't work for lexer - how to solve it properly? 3325 if ( tmp.lastPriority <= t.priority ) { 3326 // debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}, continuing", t.target.index, tmp.lastPriority, t.priority); 3327 continue; 3328 } 3329 // else 3330 // debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}", t.target.index, t.priority, tmp.lastPriority); 3331 } 3332 // else 3333 // debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}", t.target.index, new_maxPri, tmp.maxPriority); 3334 } 3335 3336 auto new_se = new StateElement; 3337 new_se.maxPriority = new_maxPri; 3338 new_se.lastPriority = t.priority; 3339 new_se.nfa_state = t.target; 3340 3341 if ( t.tag > 0 ) 3342 { 3343 foreach ( k, v; se.tags ) 3344 new_se.tags[k] = v; 3345 new_se.tags[t.tag] = firstFreeIndex; 3346 } 3347 else 3348 new_se.tags = se.tags; 3349 3350 closure[t.target.index] = new_se; 3351 stack ~= new_se; 3352 } 3353 } 3354 3355 SubsetState res = new SubsetState; 3356 res.elms = closure.values; 3357 3358 // optimize tag usage 3359 // all we need to do is to check whether the largest tag-index from the 3360 // previous state is actually used in the new state and move all tags with 3361 // firstFreeIndex down by one if not, but only if firstFreeIndex is not 0 3362 if ( firstFreeIndex > 0 ) 3363 { 3364 bool seenLastUsedIndex = false; 3365 sluiLoop: foreach ( e; res.elms ) 3366 { 3367 foreach ( i; e.tags ) 3368 { 3369 if ( i == firstFreeIndex-1 ) { 3370 seenLastUsedIndex = true; 3371 break sluiLoop; 3372 } 3373 } 3374 } 3375 if ( !seenLastUsedIndex ) 3376 { 3377 foreach ( e; res.elms ) 3378 { 3379 foreach ( ref i; e.tags ) 3380 { 3381 // mark index by making it negative 3382 // to signal that it can be decremented 3383 // after it has been detected to be a newly used index 3384 if ( i == firstFreeIndex ) 3385 i = -firstFreeIndex; 3386 } 3387 } 3388 } 3389 } 3390 3391 return res; 3392 } 3393 3394 /* ******************************************************************************************** 3395 Tries to create commands that reorder the tag map of "previous", such that "from" becomes 3396 tag-wise identical to "to". If successful, these commands are added to "trans". This 3397 is done for state re-use. 3398 3399 Params: from = SubsetState to check for tag-wise equality to "to" 3400 to = existing SubsetState that we want to re-use 3401 previous = SubsetState we're coming from 3402 trans = Transition we went along 3403 Returns: true if "from" is tag-wise identical to "to" and the necessary commands have 3404 been added to "trans" 3405 **********************************************************************************************/ 3406 bool reorderTagIndeces(SubsetState from, SubsetState to, SubsetState previous, Transition trans) 3407 { 3408 if ( from.elms.length != to.elms.length ) 3409 return false; 3410 3411 bool[Command] 3412 cmds; 3413 uint[TagIndex] 3414 reorderedIndeces; 3415 StateElement[TagIndex] 3416 reordered_elements; 3417 3418 Louter: foreach ( fe; from.elms ) 3419 { 3420 foreach ( te; to.elms ) 3421 { 3422 if ( te.nfa_state.index != fe.nfa_state.index ) 3423 continue; 3424 if ( fe.tags.length != te.tags.length ) 3425 return false; 3426 foreach ( tag, findex; fe.tags ) 3427 { 3428 if ( (tag in te.tags) is null ) 3429 return false; 3430 3431 TagIndex ti; 3432 ti.tag = tag; 3433 ti.index = te.tags[tag]; 3434 3435 // apply priority for conflicting tag indeces 3436 if ( (ti in reorderedIndeces) !is null ) 3437 { 3438 auto rse = reordered_elements[ti]; 3439 auto ri = reorderedIndeces[ti]; 3440 if ( ri != findex 3441 && ( rse.maxPriority < fe.maxPriority 3442 || rse.maxPriority == fe.maxPriority 3443 && rse.lastPriority <= fe.lastPriority ) 3444 ) 3445 continue; 3446 Command cmd; 3447 cmd.src = registerFromTagIndex(tag,ri); 3448 cmd.dst = registerFromTagIndex(tag,te.tags[tag]); 3449 cmds.remove(cmd); 3450 } 3451 // if target index differs, create reordering command 3452 if ( te.tags[tag] != findex ) 3453 { 3454 Command cmd; 3455 cmd.src = registerFromTagIndex(tag,findex); 3456 cmd.dst = registerFromTagIndex(tag,te.tags[tag]); 3457 cmds[cmd] = true; 3458 } 3459 3460 reorderedIndeces[ti] = findex; 3461 reordered_elements[ti] = fe; 3462 } 3463 continue Louter; 3464 } 3465 return false; 3466 } 3467 3468 debug(tdfa) { 3469 Stdout.formatln("\nreorder {} to {}\n", from.toString, to.dfa_state.index); 3470 } 3471 3472 trans.commands ~= cmds.keys; 3473 return true; 3474 } 3475 3476 /* ******************************************************************************************** 3477 Generate tag map initialization commands for start state. 3478 **********************************************************************************************/ 3479 void generateInitializers(SubsetState start) 3480 { 3481 uint[uint] cmds; 3482 foreach ( nds; start.elms ) 3483 { 3484 foreach ( k, v; nds.tags ) 3485 cmds[k] = v; 3486 } 3487 3488 foreach ( k, v; cmds ) { 3489 Command cmd; 3490 cmd.dst = registerFromTagIndex(k,v); 3491 cmd.src = CURRENT_POSITION_REGISTER; 3492 initializer ~= cmd; 3493 } 3494 } 3495 3496 /* ******************************************************************************************** 3497 Generates finisher commands for accepting states. 3498 **********************************************************************************************/ 3499 void generateFinishers(SubsetState r) 3500 { 3501 // if at least one of the TNFA states accepts, 3502 // set the finishers from active tags in increasing priority 3503 StateElement[] sorted_elms = r.elms.dup; 3504 tango.core.Array.sort(sorted_elms); 3505 bool reluctant = false; 3506 foreach ( se; sorted_elms ) { 3507 debug (Finishers) Stdout.formatln("Finisher: {}", se); 3508 if ( se.nfa_state.accept ) 3509 { 3510 r.dfa_state.accept = true; 3511 3512 // Knowing that we're looking at an epsilon closure with an accepting 3513 // state, we look at the involved transitions - if the path from the 3514 // nfa state in the set with the highest incoming priority (last in 3515 // sorted_elms list) to the accepting nfa state is via the highest 3516 // priority transitions, and they are all epsilon transitions, this 3517 // suggests we're looking at a regex ending with a reluctant pattern. 3518 // The NFA->DFA transformation will most likely extend the automata 3519 // further, but we want the matching to stop here. 3520 // NOTE: The grounds for choosing the last element in sorted_elms 3521 // are somewhat weak (empirical testing), but sofar no new 3522 // regressions have been discovered. larsivi 20090827 3523 TNFATransition!(char_t) highestPriTrans; 3524 if (!(sorted_elms[$-1] && sorted_elms[$-1].nfa_state && 3525 sorted_elms[$-1].nfa_state)) 3526 throw new Exception ("Something is NULL that is expected to 3527 be non-null", __FILE__, __LINE__); 3528 3529 foreach ( trans; sorted_elms[$-1].nfa_state.transitions ) { 3530 if (trans.canFinish()) { 3531 r.dfa_state.reluctant = true; 3532 break; 3533 } 3534 } 3535 3536 bool[uint] finished_tags; 3537 { 3538 foreach ( t, i; se.tags ) 3539 if ( i > 0 && !(t in finished_tags) ) { 3540 finished_tags[t] = true; 3541 Command cmd; 3542 cmd.dst = registerFromTagIndex(t, 0); 3543 cmd.src = registerFromTagIndex(t, i); 3544 r.dfa_state.finishers ~= cmd; 3545 } 3546 } 3547 } 3548 } 3549 } 3550 3551 /* ******************************************************************************************** 3552 Assumes that the command-lists are sorted and transitions are optimized 3553 **********************************************************************************************/ 3554 void minimizeDFA() 3555 { 3556 class DiffTable 3557 { 3558 this(size_t num) { 3559 diff_ = new bool[num*(num+1)/2]; 3560 } 3561 3562 ~this() { diff_.destroy; } 3563 3564 bool opCall(size_t a, size_t b) 3565 { 3566 if ( a < b ) 3567 return diff_[b*(b+1)/2+a]; 3568 return diff_[a*(a+1)/2+b]; 3569 } 3570 3571 void set(size_t a, size_t b) 3572 { 3573 if ( a < b ) 3574 diff_[b*(b+1)/2+a] = true; 3575 else 3576 diff_[a*(a+1)/2+b] = true; 3577 } 3578 3579 bool[] diff_; 3580 } 3581 3582 debug(tdfa) Stdout.formatln("Minimizing TDFA"); 3583 3584 scope diff = new DiffTable(states.length); 3585 bool new_diff = true; 3586 3587 while ( new_diff ) 3588 { 3589 new_diff = false; 3590 foreach ( i, a; states[0 .. $-1] ) 3591 { 3592 Linner: foreach ( j, b; states[i+1 .. $] ) 3593 { 3594 if ( diff(i, j+i+1) ) 3595 continue; 3596 3597 // assume optimized transitions 3598 if ( a.accept != b.accept || a.transitions.length != b.transitions.length ) { 3599 diff.set(i, j+i+1); 3600 new_diff = true; 3601 continue; 3602 } 3603 3604 if ( a.accept ) // b accepts too 3605 { 3606 // assume sorted finishers 3607 if ( a.finishers.length != b.finishers.length ) { 3608 diff.set(i, j+i+1); 3609 new_diff = true; 3610 continue; 3611 } 3612 foreach ( k, cmd; a.finishers ) 3613 { 3614 if ( cmd != b.finishers[k] ) { 3615 diff.set(i, j+i+1); 3616 new_diff = true; 3617 continue Linner; 3618 } 3619 } 3620 } 3621 3622 Ltrans: foreach ( ta; a.transitions ) 3623 { 3624 foreach ( tb; b.transitions ) 3625 { 3626 if ( ta.predicate.intersects(tb.predicate) ) 3627 { 3628 if ( diff(ta.target.index, tb.target.index) ) { 3629 diff.set(i, j+i+1); 3630 new_diff = true; 3631 continue Linner; 3632 } 3633 // assume sorted commands 3634 if ( ta.commands.length != tb.commands.length ) { 3635 diff.set(i, j+i+1); 3636 new_diff = true; 3637 continue Linner; 3638 } 3639 foreach ( k, cmd; ta.commands ) 3640 { 3641 if ( cmd != tb.commands[k] ) { 3642 diff.set(i, j+i+1); 3643 new_diff = true; 3644 continue Linner; 3645 } 3646 } 3647 continue Ltrans; 3648 } 3649 } 3650 3651 diff.set(i, j+i+1); 3652 new_diff = true; 3653 continue Linner; 3654 } 3655 3656 } 3657 } 3658 } 3659 3660 foreach ( i, a; states[0 .. $-1] ) 3661 { 3662 foreach ( j, b; states[i+1 .. $] ) 3663 { 3664 if ( !diff(i, j+i+1) ) 3665 { 3666 debug(tdfa) Stdout.formatln("State {} == {}", i, j+i+1); 3667 // remap b to a 3668 foreach ( k, c; states ) 3669 { 3670 foreach ( t; c.transitions ) 3671 { 3672 if ( t.target.index == j+i+1 ) 3673 t.target = a; 3674 } 3675 } 3676 } 3677 } 3678 } 3679 3680 } 3681 } 3682 import tango.text.Util; 3683 3684 /************************************************************************************************** 3685 Regular expression compiler and interpreter. 3686 **************************************************************************************************/ 3687 class RegExpT(char_t) 3688 { 3689 alias TDFA!(dchar) tdfa_t; 3690 alias TNFA!(dchar) tnfa_t; 3691 alias CharClass!(dchar) charclass_t; 3692 alias Predicate!(dchar) predicate_t; 3693 3694 /********************************************************************************************** 3695 Construct a RegExpT object. 3696 Params: 3697 pattern = Regular expression. 3698 Throws: RegExpException if there are any compilation errors. 3699 Example: 3700 Declare two variables and assign to them a Regex object: 3701 --- 3702 auto r = new Regex("pattern"); 3703 auto s = new Regex(r"p[1-5]\s*"); 3704 --- 3705 **********************************************************************************************/ 3706 this(const(char_t)[] pattern, const(char_t)[] attributes=null) 3707 { 3708 this(pattern, false, true); 3709 } 3710 3711 /** ditto */ 3712 this(const(char_t)[] pattern, bool swapMBS, bool unanchored, bool printNFA=false) 3713 { 3714 pattern_ = pattern; 3715 3716 debug(TangoRegex) {} 3717 else { 3718 static if ( is(char_t == dchar) ) { 3719 scope tnfa_t tnfa_ = new tnfa_t(pattern_); 3720 } 3721 else { 3722 scope tnfa_t tnfa_ = new tnfa_t(tango.text.convert.Utf.toString32(pattern_)); 3723 } 3724 } 3725 3726 tnfa_.swapMatchingBracketSyntax = swapMBS; 3727 tnfa_.parse(unanchored); 3728 if ( printNFA ) { 3729 debug(TangoRegex) Stdout.formatln("\nTNFA:"); 3730 debug(TangoRegex) tnfa_.print; 3731 } 3732 tdfa_ = new tdfa_t(tnfa_); 3733 registers_.length = tdfa_.num_regs(); 3734 } 3735 3736 /********************************************************************************************** 3737 Generate instance of Regex. 3738 Params: 3739 pattern = Regular expression. 3740 Throws: RegExpException if there are any compilation errors. 3741 Example: 3742 Declare two variables and assign to them a Regex object: 3743 --- 3744 auto r = Regex("pattern"); 3745 auto s = Regex(r"p[1-5]\s*"); 3746 --- 3747 **********************************************************************************************/ 3748 static RegExpT!(char_t) opCall(const(char_t)[] pattern, const(char_t)[] attributes = null) 3749 { 3750 return new RegExpT!(char_t)(pattern, attributes); 3751 } 3752 3753 /********************************************************************************************** 3754 Set up for start of foreach loop. 3755 Returns: Instance of RegExpT set up to search input. 3756 Example: 3757 --- 3758 import tango.io.Stdout; 3759 import tango.text.Regex; 3760 3761 void main() 3762 { 3763 foreach(m; Regex("ab").search("qwerabcabcababqwer")) 3764 Stdout.formatln("{}[{}]{}", m.pre, m.match(0), m.post); 3765 } 3766 // Prints: 3767 // qwer[ab]cabcababqwer 3768 // qwerabc[ab]cababqwer 3769 // qwerabcabc[ab]abqwer 3770 // qwerabcabcab[ab]qwer 3771 --- 3772 **********************************************************************************************/ 3773 public RegExpT!(char_t) search(const(char_t)[] input) 3774 { 3775 input_ = input; 3776 next_start_ = 0; 3777 last_start_ = 0; 3778 return this; 3779 } 3780 3781 /** ditto */ 3782 public int opApply(int delegate(RegExpT!(char_t)) dg) 3783 { 3784 int result; 3785 while ( !result && test() ) 3786 result = dg(this); 3787 return result; 3788 } 3789 3790 /********************************************************************************************** 3791 Search input for match. 3792 Returns: false for no match, true for match 3793 **********************************************************************************************/ 3794 bool test(const(char_t)[] input) 3795 { 3796 this.input_ = input; 3797 next_start_ = 0; 3798 last_start_ = 0; 3799 return test(); 3800 } 3801 3802 /********************************************************************************************** 3803 Pick up where last test(input) or test() left off, and search again. 3804 Returns: false for no match, true for match 3805 **********************************************************************************************/ 3806 bool test() 3807 { 3808 // initialize registers 3809 assert(registers_.length == tdfa_.num_regs()); 3810 registers_[0..$] = -1; 3811 foreach ( cmd; tdfa_.initializer ) { 3812 assert(cmd.src == tdfa_.CURRENT_POSITION_REGISTER); 3813 registers_[cmd.dst] = 0; 3814 } 3815 3816 tdfa_t.Transition* tp, tp_end; 3817 3818 // DFA execution 3819 auto inp = input_[next_start_ .. $]; 3820 auto s = tdfa_.start; 3821 3822 debug(TangoRegex) Stdout.formatln("{}{}: {}", s.accept?"*":" ", s.index, inp); 3823 LmainLoop: for ( size_t p, next_p; p < inp.length; ) 3824 { 3825 Lread_char: 3826 dchar c = cast(dchar)inp[p]; 3827 if ( c & 0x80 ) 3828 c = decode(inp, next_p); 3829 else 3830 next_p = p+1; 3831 3832 Lprocess_char: 3833 debug(TangoRegex) Stdout.formatln("{} (0x{:x})", c, cast(int)c); 3834 3835 tdfa_t.Transition t = void; 3836 switch ( s.mode ) 3837 { 3838 case s.Mode.LOOKUP: 3839 if ( c < s.LOOKUP_LENGTH ) 3840 { 3841 debug(TangoRegex) Stdout.formatln("lookup"); 3842 auto i = s.lookup[c]; 3843 if ( i == s.INVALID_STATE ) 3844 break LmainLoop; 3845 t = s.transitions[ i ]; 3846 if ( t.predicate.type != t.predicate.Type.consume ) 3847 goto Lno_consume; 3848 goto Lconsume; 3849 } 3850 break LmainLoop; 3851 3852 case s.Mode.MIXED: 3853 if ( c < s.LOOKUP_LENGTH ) 3854 { 3855 debug(TangoRegex) Stdout.formatln("mixed"); 3856 auto i = s.lookup[c]; 3857 if ( i == s.INVALID_STATE ) 3858 break; 3859 t = s.transitions[ i ]; 3860 if ( t.predicate.type != t.predicate.Type.consume ) 3861 goto Lno_consume; 3862 goto Lconsume; 3863 } 3864 break; 3865 3866 case s.Mode.GENERIC: 3867 default: 3868 break; 3869 } 3870 3871 Ltrans_loop: for ( tp = &s.generic_transitions[0], tp_end = tp+s.generic_transitions.length; 3872 tp < tp_end; ++tp ) 3873 { 3874 t = *tp; 3875 switch ( t.predicate.mode ) 3876 { 3877 // single char 3878 case predicate_t.MatchMode.single_char: 3879 debug(TangoRegex) Stdout.formatln("single char 0x{:x} == 0x{:x}", cast(int)c, cast(int)t.predicate.data_chr); 3880 if ( c != t.predicate.data_chr ) 3881 continue Ltrans_loop; 3882 goto Lconsume; 3883 case predicate_t.MatchMode.single_char_l: 3884 debug(TangoRegex) Stdout.formatln("single char 0x{:x} == 0x{:x}", cast(int)c, cast(int)t.predicate.data_chr); 3885 if ( c != t.predicate.data_chr ) 3886 continue Ltrans_loop; 3887 goto Lno_consume; 3888 3889 // bitmap 3890 case predicate_t.MatchMode.bitmap: 3891 debug(TangoRegex) Stdout.formatln("bitmap {}\n{}", c, t.predicate.toString); 3892 if ( c <= predicate_t.MAX_BITMAP_LENGTH && ( t.predicate.data_bmp[c/8] & (1 << (c&7)) ) ) 3893 goto Lconsume; 3894 continue Ltrans_loop; 3895 case predicate_t.MatchMode.bitmap_l: 3896 debug(TangoRegex) Stdout.formatln("bitmap {}\n{}", c, t.predicate.toString); 3897 if ( c <= predicate_t.MAX_BITMAP_LENGTH && ( t.predicate.data_bmp[c/8] & (1 << (c&7)) ) ) 3898 goto Lno_consume; 3899 continue Ltrans_loop; 3900 3901 // string search 3902 case predicate_t.MatchMode.string_search: 3903 debug(TangoRegex) Stdout.formatln("string search {} in {}", c, t.predicate.data_str); 3904 if ( indexOf(t.predicate.data_str.ptr, c, t.predicate.data_str.length) >= t.predicate.data_str.length ) 3905 continue Ltrans_loop; 3906 goto Lconsume; 3907 case predicate_t.MatchMode.string_search_l: 3908 debug(TangoRegex) Stdout.formatln("string search {} in {}", c, t.predicate.data_str); 3909 if ( indexOf(t.predicate.data_str.ptr, c, t.predicate.data_str.length) >= t.predicate.data_str.length ) 3910 continue Ltrans_loop; 3911 goto Lno_consume; 3912 3913 // generic 3914 case predicate_t.MatchMode.generic: 3915 debug(TangoRegex) Stdout.formatln("generic {}\n{}", c, t.predicate.toString); 3916 for ( auto cmp = t.predicate.data_str.ptr, 3917 cmpend = cmp + t.predicate.data_str.length; 3918 cmp < cmpend; ++cmp ) 3919 { 3920 if ( c < *cmp ) { 3921 ++cmp; 3922 continue; 3923 } 3924 ++cmp; 3925 if ( c <= *cmp ) 3926 goto Lconsume; 3927 } 3928 continue Ltrans_loop; 3929 case predicate_t.MatchMode.generic_l: 3930 debug(TangoRegex) Stdout.formatln("generic {}\n{}", c, t.predicate.toString); 3931 for ( auto cmp = t.predicate.data_str.ptr, 3932 cmpend = cmp + t.predicate.data_str.length; 3933 cmp < cmpend; ++cmp ) 3934 { 3935 if ( c < *cmp ) { 3936 ++cmp; 3937 continue; 3938 } 3939 ++cmp; 3940 if ( c <= *cmp ) 3941 goto Lno_consume; 3942 } 3943 continue Ltrans_loop; 3944 3945 default: 3946 assert(0); 3947 } 3948 3949 Lconsume: 3950 p = next_p; 3951 Lno_consume: 3952 3953 s = t.target; 3954 debug(TangoRegex) Stdout.formatln("{}{}: {}", s.accept?"*":" ", s.index, inp[p..$]); 3955 debug(TangoRegex) Stdout.formatln("{} commands", t.commands.length); 3956 3957 foreach ( cmd; t.commands ) 3958 { 3959 if ( cmd.src == tdfa_.CURRENT_POSITION_REGISTER ) 3960 registers_[cmd.dst] = cast(int)p; 3961 else 3962 registers_[cmd.dst] = registers_[cmd.src]; 3963 } 3964 3965 if (s.accept && s.reluctant) 3966 // Don't continue matching, the current find should be correct 3967 goto Laccept; 3968 3969 // if all input was consumed and we do not already accept, try to 3970 // add an explicit string/line end 3971 if ( p >= inp.length ) 3972 { 3973 if ( s.accept || c == 0 ) 3974 break; 3975 c = 0; 3976 goto Lprocess_char; 3977 } 3978 goto Lread_char; 3979 } 3980 // no applicable transition 3981 break; 3982 } 3983 3984 if ( s.accept ) 3985 { 3986 Laccept: 3987 foreach ( cmd; s.finishers ) { 3988 assert(cmd.src != tdfa_.CURRENT_POSITION_REGISTER); 3989 registers_[cmd.dst] = registers_[cmd.src]; 3990 } 3991 if ( registers_.length > 1 && registers_[1] >= 0 ) { 3992 last_start_ = next_start_; 3993 next_start_ += registers_[1]; 3994 } 3995 return true; 3996 } 3997 3998 return false; 3999 } 4000 4001 /********************************************************************************************** 4002 Return submatch with the given index. 4003 Params: 4004 index = 0 returns whole match, index > 0 returns submatch of bracket #index 4005 Returns: 4006 Slice of input for the requested submatch, or null if no such submatch exists. 4007 **********************************************************************************************/ 4008 const(char_t)[] match(uint index) 4009 { 4010 if ( index > tdfa_.num_tags ) 4011 return null; 4012 int start = cast(int)last_start_+registers_[index*2], 4013 end = cast(int)last_start_+registers_[index*2+1]; 4014 if ( start >= 0 && start < end && end <= input_.length ) 4015 return input_[start .. end]; 4016 return null; 4017 } 4018 4019 /** ditto */ 4020 const(char_t)[] opIndex(uint index) 4021 { 4022 return match(index); 4023 } 4024 4025 /********************************************************************************************** 4026 Return the slice of the input that precedes the matched substring. 4027 If no match was found, null is returned. 4028 **********************************************************************************************/ 4029 const(char_t)[] pre() 4030 { 4031 auto start = registers_[0]; 4032 if ( start < 0 ) 4033 return null; 4034 return input_[0 .. last_start_+start]; 4035 } 4036 4037 /********************************************************************************************** 4038 Return the slice of the input that follows the matched substring. 4039 If no match was found, the whole slice of the input that was processed in the last test. 4040 **********************************************************************************************/ 4041 const(char_t)[] post() 4042 { 4043 if ( registers_[1] >= 0 ) 4044 return input_[next_start_ .. $]; 4045 return input_[last_start_ .. $]; 4046 } 4047 4048 /********************************************************************************************** 4049 Splits the input at the matches of this regular expression into an array of slices. 4050 Example: 4051 --- 4052 import tango.io.Stdout; 4053 import tango.text.Regex; 4054 4055 void main() 4056 { 4057 auto strs = Regex("ab").split("abcabcababqwer"); 4058 foreach( s; strs ) 4059 Stdout.formatln("{}", s); 4060 } 4061 // Prints: 4062 // c 4063 // c 4064 // qwer 4065 --- 4066 **********************************************************************************************/ 4067 inout(char_t)[][] split(inout(char_t)[] input) 4068 { 4069 auto res = new inout(char_t)[][PREALLOC]; 4070 uint index; 4071 inout(char_t)[] tmp = input; 4072 4073 foreach ( r; search(input) ) 4074 { 4075 tmp = cast(inout(char_t)[])pre(); 4076 res[index++] = tmp[last_start_ .. $]; 4077 if ( index >= res.length ) 4078 res.length = res.length*2; 4079 tmp = cast(inout(char_t)[])post(); 4080 } 4081 4082 res[index++] = tmp; 4083 res.length = index; 4084 return res; 4085 } 4086 4087 /********************************************************************************************** 4088 Returns a copy of the input with all matches replaced by replacement. 4089 **********************************************************************************************/ 4090 char_t[] replaceAll(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null) 4091 { 4092 const(char_t)[] tmp = input; 4093 if ( output_buffer.length <= 0 ) 4094 output_buffer = new char_t[input.length+replacement.length]; 4095 output_buffer.length = 0; 4096 4097 foreach ( r; search(input) ) 4098 { 4099 tmp = pre(); 4100 if ( tmp.length > last_start_ ) 4101 output_buffer ~= tmp[last_start_ .. $]; 4102 output_buffer ~= replacement; 4103 tmp = post(); 4104 } 4105 output_buffer ~= tmp; 4106 return output_buffer; 4107 } 4108 4109 /********************************************************************************************** 4110 Returns a copy of the input with the last match replaced by replacement. 4111 **********************************************************************************************/ 4112 char_t[] replaceLast(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null) 4113 { 4114 const(char_t)[] tmp_pre, tmp_post; 4115 if ( output_buffer.length <= 0 ) 4116 output_buffer = new char_t[input.length+replacement.length]; 4117 output_buffer.length = 0; 4118 4119 foreach ( r; search(input) ) { 4120 tmp_pre = pre(); 4121 tmp_post = post(); 4122 } 4123 4124 if ( tmp_pre !is null || tmp_post !is null ) { 4125 output_buffer ~= tmp_pre; 4126 output_buffer ~= replacement; 4127 output_buffer ~= tmp_post; 4128 } 4129 else 4130 output_buffer ~= input; 4131 4132 return output_buffer; 4133 } 4134 4135 /********************************************************************************************** 4136 Returns a copy of the input with the first match replaced by replacement. 4137 **********************************************************************************************/ 4138 char_t[] replaceFirst(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null) 4139 { 4140 const(char_t)[] tmp = input; 4141 if ( output_buffer.length <= 0 ) 4142 output_buffer = new char_t[input.length+replacement.length]; 4143 output_buffer.length = 0; 4144 4145 if ( test(input) ) 4146 { 4147 tmp = pre(); 4148 if ( tmp.length > last_start_ ) 4149 output_buffer ~= tmp[last_start_ .. $]; 4150 output_buffer ~= replacement; 4151 tmp = post(); 4152 } 4153 output_buffer ~= tmp; 4154 return output_buffer; 4155 } 4156 4157 /********************************************************************************************** 4158 Calls dg for each match and replaces it with dg's return value. 4159 **********************************************************************************************/ 4160 char_t[] replaceAll(const(char_t)[] input, char_t[] delegate(RegExpT!(char_t)) dg, char_t[] output_buffer=null) 4161 { 4162 const(char_t)[] tmp = input; 4163 if ( output_buffer.length <= 0 ) 4164 output_buffer = new char_t[input.length]; 4165 output_buffer.length = 0; 4166 4167 foreach ( r; search(input) ) 4168 { 4169 tmp = pre(); 4170 if ( tmp.length > last_start_ ) 4171 output_buffer ~= tmp[last_start_ .. $]; 4172 output_buffer ~= dg(this); 4173 tmp = post(); 4174 } 4175 output_buffer ~= tmp; 4176 return output_buffer; 4177 } 4178 4179 /********************************************************************************************** 4180 Compiles the regular expression to D code. 4181 4182 NOTE : Remember to import this module (tango.text.Regex) in the module where you put the 4183 generated D code. 4184 4185 **********************************************************************************************/ 4186 // TODO: input-end special case 4187 const(char)[] compileToD(const(char)[] func_name = "match", bool lexer=false) 4188 { 4189 const(char)[] code; 4190 const(char)[] str_type; 4191 static if ( is(char_t == char) ) 4192 str_type = "const(char)[]"; 4193 static if ( is(char_t == wchar) ) 4194 str_type = "const(wchar)[]"; 4195 static if ( is(char_t == dchar) ) 4196 str_type = "const(dchar)[]"; 4197 4198 if ( lexer ) 4199 code = Format.convert("// {}\nbool {}({} input, out uint token, out {} match", pattern_, func_name, str_type, str_type); 4200 else { 4201 code = Format.convert("// {}\nbool match({} input", pattern_, str_type); 4202 code ~= Format.convert(", ref {}[] groups", str_type); 4203 } 4204 code ~= Format.convert(")\n{{\n uint s = {};", tdfa_.start.index); 4205 4206 uint num_vars = tdfa_.num_regs(); 4207 if ( num_vars > 0 ) 4208 { 4209 if ( lexer ) 4210 code ~= "\n static int "; 4211 else 4212 code ~= "\n int "; 4213 bool first = true; 4214 for ( int i = 0, used = 0; i < num_vars; ++i ) 4215 { 4216 bool hasInit = false; 4217 foreach ( cmd; tdfa_.initializer ) 4218 { 4219 if ( cmd.dst == i ) { 4220 hasInit = true; 4221 break; 4222 } 4223 } 4224 4225 if ( first ) 4226 first = false; 4227 else 4228 code ~= ", "; 4229 if ( used > 0 && used % 10 == 0 ) 4230 code ~= "\n "; 4231 ++used; 4232 code ~= Format.convert("r{}", i); 4233 4234 if ( hasInit ) 4235 code ~= "=0"; 4236 else 4237 code ~= "=-1"; 4238 } 4239 code ~= ";"; 4240 } 4241 4242 code ~= "\n\n for ( size_t p, next_p; p < input.length; )\n {"; 4243 code ~= "\n dchar c = cast(dchar)input[p];\n if ( c & 0x80 )\n decode(input, next_p);"; 4244 code ~= "\n else\n next_p = p+1;\n switch ( s )\n {"; 4245 4246 size_t[] finish_states; 4247 foreach ( s; tdfa_.states ) 4248 { 4249 code ~= Format.convert("\n case {}:", s.index); 4250 4251 if ( s.accept ) 4252 { 4253 finish_states ~= s.index; 4254 4255 tdfa_t.State target; 4256 foreach ( t; s.transitions ) 4257 { 4258 if ( target is null ) 4259 target = t.target; 4260 else if ( target !is t.target ) 4261 { 4262 target = null; 4263 break; 4264 } 4265 } 4266 if ( target !is null && target is s ) 4267 s.transitions = null; 4268 } 4269 4270 bool first_if=true; 4271 charclass_t cc, ccTest; 4272 4273 tango.core.Array.sort(s.transitions); 4274 foreach ( t; s.transitions ) 4275 { 4276 ccTest.add(t.predicate.getInput()); 4277 ccTest.optimize(); 4278 if ( t.predicate.getInput() < ccTest ) 4279 cc = t.predicate.getInput(); 4280 else 4281 cc = ccTest; 4282 4283 if ( first_if ) { 4284 code ~= "\n if ( "; 4285 first_if = false; 4286 } 4287 else 4288 code ~= "\n else if ( "; 4289 bool first_cond=true; 4290 foreach ( cr; cc.parts ) 4291 { 4292 if ( first_cond ) 4293 first_cond = false; 4294 else 4295 code ~= " || "; 4296 if ( cr.l() == cr.r() ) 4297 code ~= Format.convert("c == 0x{:x}", cast(int)cr.l()); 4298 else 4299 code ~= Format.convert("c >= 0x{:x} && c <= 0x{:x}", cast(int)cr.l(), cast(int)cr.r()); 4300 } 4301 code ~= Format.convert(" ) {{\n s = {};", t.target.index); 4302 4303 if ( t.predicate.type == typeof(t.predicate.type).consume ) 4304 code ~= "\n p = next_p;"; 4305 foreach ( cmd; t.commands ) 4306 code ~= compileCommand(cmd, " "); 4307 /* 4308 // if inp ends here and we do not already accept, try to add an explicit string/line end 4309 if ( p >= inp.length && !s.accept && c != 0 ) { 4310 c = 0; 4311 goto Lprocess_char; 4312 } 4313 */ 4314 code ~= "\n }"; 4315 } 4316 4317 if ( !first_if ) 4318 code ~= Format.convert( 4319 "\n else\n {};\n break;", 4320 s.accept?Format.convert("goto finish{}", s.index):"return false" 4321 ); 4322 else 4323 code ~= Format.convert("\n {};", s.accept?Format.convert("goto finish{}", s.index):"return false"); 4324 } 4325 4326 // create finisher groups 4327 size_t[][size_t] finisherGroup; 4328 foreach ( fs; finish_states ) 4329 { 4330 // check if finisher group with same commands exists 4331 bool haveFinisher = false; 4332 foreach ( fg; finisherGroup.keys ) 4333 { 4334 bool equalCommands = false; 4335 if ( tdfa_.states[fs].finishers.length == tdfa_.states[fg].finishers.length ) 4336 { 4337 equalCommands = true; 4338 foreach ( i, cmd; tdfa_.states[fs].finishers ) 4339 { 4340 if ( cmd != tdfa_.states[fg].finishers[i] ) { 4341 equalCommands = false; 4342 break; 4343 } 4344 } 4345 } 4346 if ( equalCommands ) { 4347 // use existing group for this state 4348 finisherGroup[fg] ~= fs; 4349 haveFinisher = true; 4350 break; 4351 } 4352 } 4353 // create new group 4354 if ( !haveFinisher ) 4355 finisherGroup[fs] ~= fs; 4356 } 4357 4358 4359 code ~= "\n default:\n assert(0);\n }\n }\n\n switch ( s )\n {"; 4360 foreach ( group, states; finisherGroup ) 4361 { 4362 foreach ( s; states ) 4363 code ~= Format.convert("\n case {}: finish{}:", s, s); 4364 4365 foreach ( cmd; tdfa_.states[group].finishers ) 4366 { 4367 if ( lexer ) 4368 { 4369 if ( tdfa_.states[group].finishers.length > 1 ) 4370 throw new RegExpException("Lexer error: more than one finisher in flm lexer!"); 4371 if ( cmd.dst % 2 == 0 || cmd.dst >= tdfa_.num_tags ) 4372 throw new RegExpException(Format.convert("Lexer error: unexpected dst register {} in flm lexer!", cmd.dst).idup); 4373 code ~= Format.convert("\n match = input[0 .. r{}];\n token = {};", cmd.src, cmd.dst/2); 4374 } 4375 else 4376 code ~= compileCommand(cmd, " "); 4377 } 4378 4379 code ~= "\n break;"; 4380 } 4381 code ~= "\n default:\n return false;\n }\n"; 4382 4383 if ( !lexer ) 4384 { 4385 code ~= Format.convert("\n groups.length = {};", tdfa_.num_tags/2); 4386 for ( int i = 0; i < tdfa_.num_tags/2; ++i ) 4387 code ~= Format.convert("\n if ( r{} > -1 && r{} > -1 )\n groups[{}] = input[r{} .. r{}];", 2*i, 2*i+1, i, 2*i, 2*i+1); 4388 } 4389 4390 code ~= "\n return true;\n}"; 4391 return code; 4392 } 4393 4394 /********************************************************************************************* 4395 Get the pattern with which this regex was constructed. 4396 **********************************************************************************************/ 4397 public const(char_t)[] pattern() 4398 { 4399 return pattern_; 4400 } 4401 4402 /********************************************************************************************* 4403 Get the tag count of this regex, representing the number of sub-matches. 4404 4405 This value is the max valid value for match/opIndex. 4406 **********************************************************************************************/ 4407 uint tagCount() 4408 { 4409 return tdfa_.num_tags; 4410 } 4411 4412 int[] registers_; 4413 size_t next_start_, 4414 last_start_; 4415 4416 debug(TangoRegex) tnfa_t tnfa_; 4417 tdfa_t tdfa_; 4418 private: 4419 enum int PREALLOC = 16; 4420 const(char_t)[] input_, 4421 pattern_; 4422 4423 const(char)[] compileCommand(tdfa_t.Command cmd, const(char)[] indent) 4424 { 4425 const(char)[] code, 4426 dst; 4427 code ~= Format.convert("\n{}r{} = ", indent, cmd.dst); 4428 if ( cmd.src == tdfa_.CURRENT_POSITION_REGISTER ) 4429 code ~= "p;"; 4430 else 4431 code ~= Format.convert("r{};", cmd.src); 4432 return code; 4433 } 4434 } 4435 4436 alias RegExpT!(char) Regex; 4437 4438 debug(utf) import tango.stdc.stdio; 4439 // the following block is stolen from phobos. 4440 // the copyright notice applies for this block only. 4441 /* 4442 * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com 4443 * Written by Walter Bright 4444 * 4445 * This software is provided 'as-is', without any express or implied 4446 * warranty. In no event will the authors be held liable for any damages 4447 * arising from the use of this software. 4448 * 4449 * Permission is granted to anyone to use this software for any purpose, 4450 * including commercial applications, and to alter it and redistribute it 4451 * freely, subject to the following restrictions: 4452 * 4453 * o The origin of this software must not be misrepresented; you must not 4454 * claim that you wrote the original software. If you use this software 4455 * in a product, an acknowledgment in the product documentation would be 4456 * appreciated but is not required. 4457 * o Altered source versions must be plainly marked as such, and must not 4458 * be misrepresented as being the original software. 4459 * o This notice may not be removed or altered from any source 4460 * distribution. 4461 */ 4462 4463 class UtfException : Exception 4464 { 4465 size_t idx; /// index in string of where error occurred 4466 4467 this(immutable(char)[] s, size_t i) 4468 { 4469 idx = i; 4470 super(s); 4471 } 4472 } 4473 4474 bool isValidDchar(dchar c) 4475 { 4476 /* Note: FFFE and FFFF are specifically permitted by the 4477 * Unicode standard for application internal use, but are not 4478 * allowed for interchange. 4479 * (thanks to Arcane Jill) 4480 */ 4481 4482 return c < 0xD800 || 4483 (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/); 4484 } 4485 4486 /* ************* 4487 * Decodes and returns character starting at s[idx]. idx is advanced past the 4488 * decoded character. If the character is not well formed, a UtfException is 4489 * thrown and idx remains unchanged. 4490 */ 4491 4492 dchar decode(const(char)[] s, ref size_t idx) 4493 { 4494 size_t len = s.length; 4495 dchar V; 4496 size_t i = idx; 4497 char u = s[i]; 4498 4499 if (u & 0x80) 4500 { uint n; 4501 char u2; 4502 4503 /* The following encodings are valid, except for the 5 and 6 byte 4504 * combinations: 4505 * 0xxxxxxx 4506 * 110xxxxx 10xxxxxx 4507 * 1110xxxx 10xxxxxx 10xxxxxx 4508 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4509 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 4510 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 4511 */ 4512 for (n = 1; ; n++) 4513 { 4514 if (n > 4) 4515 goto Lerr; // only do the first 4 of 6 encodings 4516 if (((u << n) & 0x80) == 0) 4517 { 4518 if (n == 1) 4519 goto Lerr; 4520 break; 4521 } 4522 } 4523 4524 // Pick off (7 - n) significant bits of B from first byte of octet 4525 V = cast(dchar)(u & ((1 << (7 - n)) - 1)); 4526 4527 if (i + (n - 1) >= len) 4528 goto Lerr; // off end of string 4529 4530 /* The following combinations are overlong, and illegal: 4531 * 1100000x (10xxxxxx) 4532 * 11100000 100xxxxx (10xxxxxx) 4533 * 11110000 1000xxxx (10xxxxxx 10xxxxxx) 4534 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) 4535 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) 4536 */ 4537 u2 = s[i + 1]; 4538 if ((u & 0xFE) == 0xC0 || 4539 (u == 0xE0 && (u2 & 0xE0) == 0x80) || 4540 (u == 0xF0 && (u2 & 0xF0) == 0x80) || 4541 (u == 0xF8 && (u2 & 0xF8) == 0x80) || 4542 (u == 0xFC && (u2 & 0xFC) == 0x80)) 4543 goto Lerr; // overlong combination 4544 4545 for (uint j = 1; j != n; j++) 4546 { 4547 u = s[i + j]; 4548 if ((u & 0xC0) != 0x80) 4549 goto Lerr; // trailing bytes are 10xxxxxx 4550 V = (V << 6) | (u & 0x3F); 4551 } 4552 if (!isValidDchar(V)) 4553 goto Lerr; 4554 i += n; 4555 } 4556 else 4557 { 4558 V = cast(dchar) u; 4559 i++; 4560 } 4561 4562 idx = i; 4563 return V; 4564 4565 Lerr: 4566 throw new Exception("invalid UTF-8 sequence"); 4567 } 4568 4569 /* ditto */ 4570 4571 dchar decode(const(wchar)[] s, ref size_t idx) 4572 in 4573 { 4574 assert(idx >= 0 && idx < s.length); 4575 } 4576 out (result) 4577 { 4578 assert(isValidDchar(result)); 4579 } 4580 body 4581 { 4582 const(char)[] msg; 4583 dchar V; 4584 size_t i = idx; 4585 uint u = s[i]; 4586 4587 if (u & ~0x7F) 4588 { if (u >= 0xD800 && u <= 0xDBFF) 4589 { uint u2; 4590 4591 if (i + 1 == s.length) 4592 { msg = "surrogate UTF-16 high value past end of string"; 4593 goto Lerr; 4594 } 4595 u2 = s[i + 1]; 4596 if (u2 < 0xDC00 || u2 > 0xDFFF) 4597 { msg = "surrogate UTF-16 low value out of range"; 4598 goto Lerr; 4599 } 4600 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); 4601 i += 2; 4602 } 4603 else if (u >= 0xDC00 && u <= 0xDFFF) 4604 { msg = "unpaired surrogate UTF-16 value"; 4605 goto Lerr; 4606 } 4607 else if (u == 0xFFFE || u == 0xFFFF) 4608 { msg = "illegal UTF-16 value"; 4609 goto Lerr; 4610 } 4611 else 4612 i++; 4613 } 4614 else 4615 { 4616 i++; 4617 } 4618 4619 idx = i; 4620 return cast(dchar)u; 4621 4622 Lerr: 4623 throw new UtfException(msg.idup, i); 4624 } 4625 4626 /* ditto */ 4627 4628 dchar decode(const(dchar)[] s, ref size_t idx) 4629 in 4630 { 4631 assert(idx >= 0 && idx < s.length); 4632 } 4633 body 4634 { 4635 size_t i = idx; 4636 dchar c = s[i]; 4637 4638 if (!isValidDchar(c)) 4639 goto Lerr; 4640 idx = i + 1; 4641 return c; 4642 4643 Lerr: 4644 throw new UtfException("5invalid UTF-32 value", i); 4645 } 4646 4647 4648 4649 /* =================== Encode ======================= */ 4650 4651 /* ***************************** 4652 * Encodes character c and appends it to array s[]. 4653 */ 4654 4655 void encode(ref const(char)[] s, dchar c) 4656 in 4657 { 4658 assert(isValidDchar(c)); 4659 } 4660 body 4661 { 4662 const(char)[] r = s; 4663 4664 if (c <= 0x7F) 4665 { 4666 r ~= cast(char) c; 4667 } 4668 else 4669 { 4670 char[4] buf; 4671 uint L; 4672 4673 if (c <= 0x7FF) 4674 { 4675 buf[0] = cast(char)(0xC0 | (c >> 6)); 4676 buf[1] = cast(char)(0x80 | (c & 0x3F)); 4677 L = 2; 4678 } 4679 else if (c <= 0xFFFF) 4680 { 4681 buf[0] = cast(char)(0xE0 | (c >> 12)); 4682 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 4683 buf[2] = cast(char)(0x80 | (c & 0x3F)); 4684 L = 3; 4685 } 4686 else if (c <= 0x10FFFF) 4687 { 4688 buf[0] = cast(char)(0xF0 | (c >> 18)); 4689 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 4690 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 4691 buf[3] = cast(char)(0x80 | (c & 0x3F)); 4692 L = 4; 4693 } 4694 else 4695 { 4696 assert(0); 4697 } 4698 r ~= buf[0 .. L]; 4699 } 4700 s = r; 4701 } 4702 4703 /* ditto */ 4704 4705 void encode(ref const(wchar)[] s, dchar c) 4706 in 4707 { 4708 assert(isValidDchar(c)); 4709 } 4710 body 4711 { 4712 const(wchar)[] r = s; 4713 4714 if (c <= 0xFFFF) 4715 { 4716 r ~= cast(wchar) c; 4717 } 4718 else 4719 { 4720 wchar[2] buf; 4721 4722 buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 4723 buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00); 4724 r ~= buf; 4725 } 4726 s = r; 4727 } 4728 4729 /* ditto */ 4730 4731 void encode(ref const(dchar)[] s, dchar c) 4732 in 4733 { 4734 assert(isValidDchar(c)); 4735 } 4736 body 4737 { 4738 s ~= c; 4739 } 4740 4741 debug(UnitTest) 4742 { 4743 unittest 4744 { 4745 // match a fixed string 4746 Regex r; 4747 r = new Regex("abc"); 4748 assert(r.test("abc123")); 4749 assert(r.test("feabc123")); 4750 assert(!r.test("abe123")); 4751 4752 // match a non-ASCII string 4753 r = new Regex("☃"); 4754 assert(r.test("a☃c123")); 4755 4756 // capture 4757 r = new Regex("☃(c)"); 4758 assert(r.test("a☃c123")); 4759 assert(r.match(1) == "c"); 4760 4761 // dot 4762 r = new Regex(".."); 4763 assert(r.test("a☃c123")); 4764 4765 // dot capture 4766 r = new Regex("(..)"); 4767 assert(r.test("a☃c123")); 4768 assert(r.match(1) == "a☃"); 4769 4770 4771 4772 // two captures 4773 r = new Regex("(.)(.)"); 4774 assert(r.test("a☃c123")); 4775 assert(r.match(1) == "a"); 4776 assert(r.match(2) == "☃"); 4777 4778 // multiple letters 4779 r = new Regex(".*(e+).*"); 4780 assert(r.test("aeeeeec123")); 4781 assert(r.match(1) == "eeeee", "Expected: eeeee Got: " ~ r.match(1)); 4782 assert(!r.test("abfua")); 4783 4784 // multiple snowmen 4785 r = new Regex(".*(☃+).*"); 4786 assert(r.test("a☃☃☃☃☃c123")); 4787 assert(r.match(1) == "☃☃☃☃☃", "Expected: ☃☃☃☃☃ Got: " ~ r.match(1)); 4788 assert(!r.test("abeua")); 4789 4790 /** 4791 "*" quantifier bug. In "(☃+)" pattern test ((r.match(0) == "☃☃☃☃☃")&&(r.match(1) == "☃☃☃☃☃")) has passed. 4792 */ 4793 // multiple snowmen 4794 r = new Regex("(☃+)"); 4795 assert(r.test("a☃☃☃☃☃c123")); 4796 assert(r.match(0) == "☃☃☃☃☃"); 4797 assert(r.match(1) == "☃☃☃☃☃"); 4798 assert(r.test("a☃beua")); 4799 } 4800 4801 debug(RegexTestOnly) 4802 { 4803 import tango.io.Stdout; 4804 void main() { Stdout("All tests pass").newline(); } 4805 } 4806 }