1 /******************************************************************************* 2 3 copyright: Copyright (c) 2007-2008 Jascha Wetzel. All rights reserved. 4 5 license: BSD style: $(LICENSE) 6 7 version: Initial release: Jan 2008 8 9 authors: Jascha Wetzel 10 11 This is a regular expression compiler and interpreter based on the Tagged NFA/DFA method. 12 13 The Regex class is not thread safe 14 15 See <a href="http://en.wikipedia.org/wiki/Regular_expression">Wikpedia's article on regular expressions</a> 16 for details on regular expressions in general. 17 18 The used method implies, that the expressions are <i>regular</i>, in the way language theory defines it, 19 as opposed to what "regular expression" means in most implementations 20 (e.g. PCRE or those from the standard libraries of Perl, Java or Python). 21 The advantage of this method is it's performance, it's disadvantage is the inability to realize some features 22 that Perl-like regular expressions have (e.g. back-references). 23 See <a href="http://swtch.com/~rsc/regexp/regexp1.html">"Regular Expression Matching Can Be Simple And Fast"</a> 24 for details on the differences. 25 26 The time for matching a regular expression against an input string of length N is in O(M*N), where M depends on the 27 number of matching brackets and the complexity of the expression. That is, M is constant wrt. the input 28 and therefore matching is a linear-time process. 29 30 The syntax of a regular expressions is as follows. 31 <i>X</i> and <i>Y</i> stand for an arbitrary regular expression. 32 33 <table border=1 cellspacing=0 cellpadding=5> 34 <caption>Operators</caption> 35 $(TR $(TD X|Y) $(TD alternation, i.e. X or Y) ) 36 $(TR $(TD (X)) $(TD matching brackets - creates a sub-match) ) 37 $(TR $(TD (?X)) $(TD non-matching brackets - only groups X, no sub-match is created) ) 38 $(TR $(TD [Z]) $(TD character class specification, Z is a string of characters or character ranges, e.g. [a-zA-Z0-9_.\-]) ) 39 $(TR $(TD [^Z]) $(TD negated character class specification) ) 40 $(TR $(TD <X) $(TD lookbehind, X may be a single character or a character class) ) 41 $(TR $(TD >X) $(TD lookahead, X may be a single character or a character class) ) 42 $(TR $(TD ^) $(TD start of input or start of line) ) 43 $(TR $(TD $) $(TD end of input or end of line) ) 44 $(TR $(TD \b) $(TD start or end of word, equals (?<\s>\S|<\S>\s)) ) 45 $(TR $(TD \B) $(TD opposite of \b, equals (?<\S>\S|<\s>\s)) ) 46 </table> 47 48 <table border=1 cellspacing=0 cellpadding=5> 49 <caption>Quantifiers</caption> 50 $(TR $(TD X?) $(TD zero or one) ) 51 $(TR $(TD X*) $(TD zero or more) ) 52 $(TR $(TD X+) $(TD one or more) ) 53 $(TR $(TD X{n,m}) $(TD at least n, at most m instances of X.<br>If n is missing, it's set to 0.<br>If m is missing, it is set to infinity.) ) 54 $(TR $(TD X??) $(TD non-greedy version of the above operators) ) 55 $(TR $(TD X*?) $(TD see above) ) 56 $(TR $(TD X+?) $(TD see above) ) 57 $(TR $(TD X{n,m}?) $(TD see above) ) 58 </table> 59 60 <table border=1 cellspacing=0 cellpadding=5> 61 <caption>Pre-defined character classes</caption> 62 $(TR $(TD .) $(TD any printable character) ) 63 $(TR $(TD \s) $(TD whitespace) ) 64 $(TR $(TD \S) $(TD non-whitespace) ) 65 $(TR $(TD \w) $(TD alpha-numeric characters or underscore) ) 66 $(TR $(TD \W) $(TD opposite of \w) ) 67 $(TR $(TD \d) $(TD digits) ) 68 $(TR $(TD \D) $(TD non-digit) ) 69 </table> 70 71 Note that "alphanumeric" only applies to Latin-1. 72 *******************************************************************************/ 73 module tango.text.Regex; 74 75 debug(TangoRegex) import tango.io.Stdout; 76 77 /* ***************************************************************************** 78 A simple pair 79 *******************************************************************************/ 80 private struct Pair(T) 81 { 82 static Pair opCall(T a, T b) 83 { 84 Pair p; 85 p.a = a; 86 p.b = b; 87 return p; 88 } 89 90 union 91 { 92 struct { 93 T first, second; 94 } 95 struct { 96 T a, b; 97 } 98 } 99 } 100 101 /* ***************************************************************************** 102 Double linked list 103 *******************************************************************************/ 104 private class List(T) 105 { 106 class Element 107 { 108 T value; 109 Element prev, 110 next; 111 112 this(T v) 113 { 114 value = v; 115 } 116 } 117 118 size_t len; 119 Element head, 120 tail; 121 122 List opCatAssign(T v) 123 { 124 if ( tail is null ) 125 head = tail = new Element(v); 126 else { 127 tail.next = new Element(v); 128 tail.next.prev = tail; 129 tail = tail.next; 130 } 131 ++len; 132 return this; 133 } 134 135 List insertAfter(T w, T v) 136 { 137 foreach ( e; &this.elements ) 138 { 139 if ( e.value is w ) 140 return insertAfter(e, v); 141 } 142 return null; 143 } 144 145 List insertAfter(Element e, T v) 146 { 147 auto tmp = new Element(v); 148 tmp.prev = e; 149 tmp.next = e.next; 150 e.next.prev = tmp; 151 e.next = tmp; 152 if ( e is tail ) 153 tail = tmp; 154 ++len; 155 return this; 156 } 157 158 List opCatAssign(List l) 159 { 160 if ( l.empty() ) 161 return this; 162 if ( tail is null ) { 163 head = l.head; 164 tail = l.tail; 165 } 166 else { 167 tail.next = l.head; 168 tail.next.prev = tail; 169 tail = l.tail; 170 } 171 len += l.len; 172 return this; 173 } 174 175 List pushFront(T v) 176 { 177 if ( head is null ) 178 head = tail = new Element(v); 179 else 180 { 181 head.prev = new Element(v); 182 head.prev.next = head; 183 head = head.prev; 184 } 185 ++len; 186 return this; 187 } 188 189 List insertBefore(T w, T v) 190 { 191 foreach ( e; &this.elements ) 192 { 193 if ( e.value is w ) 194 return insertBefore(e, v); 195 } 196 return null; 197 } 198 199 List insertBefore(Element e, T v) 200 { 201 auto tmp = new Element(v); 202 tmp.prev = e.prev; 203 tmp.next = e; 204 e.prev.next = tmp; 205 e.prev = tmp; 206 if ( e is head ) 207 head = tmp; 208 ++len; 209 return this; 210 } 211 212 List pushFront(List l) 213 { 214 if ( l.empty() ) 215 return this; 216 if ( head is null ) { 217 head = l.head; 218 tail = l.tail; 219 } 220 else { 221 head.prev = l.tail; 222 head.prev.next = head; 223 head = l.head; 224 } 225 len += l.len; 226 return this; 227 } 228 229 size_t length() 230 { 231 return len; 232 } 233 234 bool empty() 235 { 236 return head is null; 237 } 238 239 void clear() 240 { 241 head = null; 242 tail = null; 243 len = 0; 244 } 245 246 void pop() 247 { 248 remove(tail); 249 } 250 251 void remove(Element e) 252 { 253 if ( e is null ) 254 return; 255 if ( e.prev is null ) 256 head = e.next; 257 else 258 e.prev.next = e.next; 259 if ( e.next is null ) 260 tail = e.prev; 261 else 262 e.next.prev = e.prev; 263 --len; 264 } 265 266 int elements(int delegate(ref Element) dg) 267 { 268 for ( Element e=head; e !is null; e = e.next ) 269 { 270 int ret = dg(e); 271 if ( ret ) 272 return ret; 273 } 274 return 0; 275 } 276 277 int elements_reverse(int delegate(ref Element) dg) 278 { 279 for ( Element e=tail; e !is null; e = e.prev ) 280 { 281 int ret = dg(e); 282 if ( ret ) 283 return ret; 284 } 285 return 0; 286 } 287 288 int opApply(int delegate(ref T) dg) 289 { 290 for ( Element e=head; e !is null; e = e.next ) 291 { 292 int ret = dg(e.value); 293 if ( ret ) 294 return ret; 295 } 296 return 0; 297 } 298 299 int opApplyReverse(int delegate(ref T) dg) 300 { 301 for ( Element e=tail; e !is null; e = e.prev ) 302 { 303 int ret = dg(e.value); 304 if ( ret ) 305 return ret; 306 } 307 return 0; 308 } 309 } 310 311 /* ***************************************************************************** 312 Stack based on dynamic array 313 *******************************************************************************/ 314 private struct Stack(T) 315 { 316 size_t _top; 317 T[] stack; 318 319 void push(T v) 320 { 321 if ( _top >= stack.length ) 322 stack.length = stack.length*2+1; 323 stack[_top] = v; 324 ++_top; 325 } 326 alias push opCatAssign; 327 328 void opCatAssign(T[] vs) 329 { 330 size_t end = _top+vs.length; 331 if ( end > stack.length ) 332 stack.length = end*2; 333 stack[_top..end] = vs[]; 334 _top = end; 335 } 336 337 void pop(size_t num) 338 { 339 assert(_top>=num); 340 _top -= num; 341 } 342 343 T pop() 344 { 345 assert(_top>0); 346 return stack[--_top]; 347 } 348 349 T top() 350 { 351 assert(_top>0); 352 return stack[_top-1]; 353 } 354 355 T* topPtr() 356 { 357 assert(_top>0); 358 return &stack[_top-1]; 359 } 360 361 bool empty() 362 { 363 return _top == 0; 364 } 365 366 void clear() 367 { 368 _top = 0; 369 } 370 371 size_t length() 372 { 373 return _top; 374 } 375 376 T[] array() 377 { 378 return stack[0.._top]; 379 } 380 381 T opIndex(size_t i) 382 { 383 return stack[i]; 384 } 385 386 Stack dup() 387 { 388 Stack s; 389 s._top = _top; 390 s.stack = stack.dup; 391 return s; 392 } 393 } 394 395 /* ************************************************************************************************ 396 Set container based on assoc array 397 **************************************************************************************************/ 398 private struct Set(T) 399 { 400 bool[T] data; 401 402 static Set opCall() 403 { 404 Set s; 405 return s; 406 } 407 408 static Set opCall(T v) 409 { 410 Set s; 411 s ~= v; 412 return s; 413 } 414 415 void opAddAssign(T v) 416 { 417 data[v] = true; 418 } 419 420 void opAddAssign(Set s) 421 { 422 foreach ( v; s.elements ) 423 data[v] = true; 424 } 425 alias opAddAssign opCatAssign; 426 427 size_t length() 428 { 429 return data.length; 430 } 431 432 T[] elements() 433 { 434 return data.keys; 435 } 436 437 bool remove(T v) 438 { 439 if ( (v in data) is null ) 440 return false; 441 data.remove(v); 442 return true; 443 } 444 445 bool contains(T v) 446 { 447 return (v in data) !is null; 448 } 449 450 bool contains(Set s) 451 { 452 Set tmp = s - this; 453 return tmp.empty; 454 } 455 456 bool empty() 457 { 458 return data.length==0; 459 } 460 461 Set opSub(Set s) 462 { 463 Set res = dup; 464 foreach ( v; s.elements ) 465 res.remove(v); 466 return res; 467 } 468 469 Set dup() 470 { 471 Set s; 472 foreach ( v; data.keys ) 473 s.data[v] = true; 474 return s; 475 } 476 } 477 478 /* ************************************************************************************************ 479 480 **************************************************************************************************/ 481 void quickSort(T)(T[] a) 482 { 483 quickSort(a,cast(size_t)0,a.length); 484 } 485 486 void quickSort(T)(T[] a, size_t l, size_t r) 487 { 488 T t; 489 auto i = r-l; 490 if ( i < 3 ) 491 { 492 if ( i < 2 ) 493 return; 494 if ( a[l] < a[l+1] ) 495 return; 496 t = a[l]; 497 a[l] = a[l+1]; 498 a[l+1] = t; 499 return; 500 } 501 502 auto p = a[l]; 503 i = l; 504 auto j = r; 505 506 while ( true ) 507 { 508 ++i; 509 for ( ; i < j && a[i] < p; ++i ) {} 510 --j; 511 for ( ; i < j && a[j] >= p; --j ) {} 512 if ( i >= j ) 513 break; 514 t = a[i]; 515 a[i] = a[j]; 516 a[j] = t; 517 } 518 --i; 519 a[l] = a[i]; 520 a[i] = p; 521 522 quickSort(a, l, i); 523 quickSort(a, i+1, r); 524 } 525 import tango.math.Math; 526 527 /* ************************************************************************************************ 528 A range of characters 529 **************************************************************************************************/ 530 struct CharRange(char_t) 531 { 532 char_t l_, r_; 533 534 static CharRange opCall(char_t c) 535 { 536 CharRange cr; 537 cr.l_ = c; 538 cr.r_ = c; 539 return cr; 540 } 541 542 static CharRange opCall(char_t a, char_t b) 543 { 544 CharRange cr; 545 cr.l_ = min(a,b); 546 cr.r_ = max(a,b); 547 return cr; 548 } 549 550 char_t l() 551 { 552 return l_; 553 } 554 555 char_t r() 556 { 557 return r_; 558 } 559 560 /* ******************************************************************************************** 561 Compares the ranges according to their beginning. 562 **********************************************************************************************/ 563 int opCmp(const ref CharRange cr) const 564 { 565 if ( l_ == cr.l_ ) 566 return 0; 567 if ( l_ < cr.l_ ) 568 return -1; 569 return 1; 570 } 571 572 573 const bool opEquals(ref const CharRange cr) 574 { 575 if ( l_ == cr.l_ && r_ == cr.r_ ) 576 return true; 577 return false; 578 } 579 580 bool contains(char_t c) 581 { 582 return c >= l_ && c <= r_; 583 } 584 585 bool contains(CharRange cr) 586 { 587 return l_ <= cr.l_ && r_ >= cr.r_; 588 } 589 590 bool intersects(CharRange cr) 591 { 592 return r_ >= cr.l_ && l_ <= cr.r_; 593 } 594 595 CharRange intersect(CharRange cr) 596 { 597 assert(intersects(cr)); 598 CharRange ir; 599 ir.l_ = max(l_, cr.l_); 600 ir.r_ = min(r_, cr.r_); 601 if ( ir.l_ > ir.r_ ) 602 ir.l_ = ir.r_ = char_t.min; 603 return ir; 604 } 605 606 CharRange[] subtract(CharRange cr) 607 { 608 CharRange[] sr; 609 if ( cr.contains(this) ) 610 return sr; 611 if ( !intersects(cr) ) 612 sr ~= this; 613 else 614 { 615 CharRange d; 616 if ( contains(cr) ) 617 { 618 d.l_ = l_; 619 d.r_ = cast(char_t)(cr.l_-1); 620 if ( d.l_ <= d.r_ ) 621 sr ~= d; 622 d.l_ = cast(char_t)(cr.r_+1); 623 d.r_ = r_; 624 if ( d.l_ <= d.r_ ) 625 sr ~= d; 626 } 627 else if ( cr.r_ > l_ ) 628 { 629 d.l_ = cast(char_t)(cr.r_+1); 630 d.r_ = r_; 631 if ( d.l_ <= d.r_ ) 632 sr ~= d; 633 } 634 else if ( cr.l_ < r_ ) 635 { 636 d.l_ = l_; 637 d.r_ = cast(char_t)(cr.l_-1); 638 if ( d.l_ <= d.r_ ) 639 sr ~= d; 640 } 641 } 642 return sr; 643 } 644 645 string toString() 646 { 647 string str; 648 649 if ( l_ == r_ ) 650 { 651 if ( l_ > 0x20 && l_ < 0x7f ) 652 str = Format.convert("'{}'", l_).idup; 653 else 654 str = Format.convert("({:x})", cast(int)l_).idup; 655 } 656 else 657 { 658 if ( l_ > 0x20 && l_ < 0x7f ) 659 str = Format.convert("'{}'", l_).idup; 660 else 661 str = Format.convert("({:x})", cast(int)l_).idup; 662 str ~= "-"; 663 if ( r_ > 0x20 && r_ < 0x7f ) 664 str ~= Format.convert("'{}'", r_).idup; 665 else 666 str ~= Format.convert("({:x})", cast(int)r_).idup; 667 } 668 return str; 669 } 670 } 671 672 import tango.core.Compiler; 673 674 static if(DMDFE_Version == 2063) 675 { 676 /* Workabout for http://d.puremagic.com/issues/show_bug.cgi?id=10425 */ 677 void _bug(CharRange!(dchar) a) {} 678 } 679 680 /* ************************************************************************************************ 681 Represents a class of characters as used in regular expressions (e.g. [0-9a-z], etc.) 682 **************************************************************************************************/ 683 struct CharClass(char_t) 684 { 685 alias CharRange!(char_t) range_t; 686 687 //--------------------------------------------------------------------------------------------- 688 // pre-defined character classes 689 static const CharClass!(char_t) 690 line_startend = {parts: [ 691 {l_:0x00, r_:0x00}, 692 {l_:0x0a, r_:0x0a}, 693 {l_:0x13, r_:0x13} 694 ]}, 695 digit = {parts: [ 696 {l_:0x30, r_:0x39} 697 ]}, 698 whitespace = {parts: [ 699 {l_:0x09, r_:0x09}, 700 {l_:0x0a, r_:0x0a}, 701 {l_:0x0b, r_:0x0b}, 702 {l_:0x13, r_:0x13}, 703 {l_:0x14, r_:0x14}, 704 {l_:0x20, r_:0x20} 705 ]}; 706 707 // 8bit classes 708 static if ( is(char_t == char) ) 709 { 710 static const CharClass!(char_t) 711 any_char = {parts: [ 712 {l_:0x01, r_:0xff} 713 ]}, 714 dot_oper = {parts: [ 715 {l_:0x01, r_:0xff} 716 ]}, 717 alphanum_ = {parts: [ 718 {l_:0x30, r_:0x39}, 719 {l_:0x41, r_:0x5a}, 720 {l_:0x5f, r_:0x5f}, 721 {l_:0x61, r_:0x7a} 722 ]}; 723 } 724 // 16bit and 32bit classes 725 static if ( is(char_t == wchar) || is(char_t == dchar) ) 726 { 727 static const CharClass!(char_t) 728 any_char = {parts: [ 729 {l_:0x0001, r_:0xffff} 730 ]}, 731 dot_oper = {parts: [ 732 {l_:0x0001, r_:0xffff} 733 ]}, 734 alphanum_ = {parts: [ 735 {l_:0x30, r_:0x39}, 736 {l_:0x41, r_:0x5a}, 737 {l_:0x5f, r_:0x5f}, 738 {l_:0x61, r_:0x7a} 739 ]}; 740 } 741 742 //--------------------------------------------------------------------------------------------- 743 range_t[] parts; 744 745 invariant() 746 { 747 // foreach ( i, p; parts ) 748 // assert(p.l_<=p.r_, Int.toString(i)~": "~Int.toString(p.l_)~" > "~Int.toString(p.r_)); 749 } 750 751 static CharClass opCall(const(CharClass) cc) 752 { 753 CharClass ncc; 754 ncc.parts = cc.parts.dup; 755 return ncc; 756 } 757 758 int opCmp(const ref CharClass cc) const 759 { 760 if ( parts.length < cc.parts.length ) 761 return -1; 762 if ( parts.length > cc.parts.length ) 763 return 1; 764 foreach ( i, p; cc.parts ) 765 { 766 if ( p.l_ != parts[i].l_ || p.r_ != parts[i].r_ ) 767 return 1; 768 } 769 return 0; 770 } 771 772 bool empty() 773 { 774 return parts.length <= 0; 775 } 776 777 bool matches(char_t c) 778 { 779 foreach ( p; parts ) 780 { 781 if ( p.contains(c) ) 782 return true; 783 } 784 return false; 785 } 786 787 CharClass intersect(CharClass cc) 788 { 789 CharClass ic; 790 foreach ( p; parts ) 791 { 792 foreach ( cp; cc.parts ) 793 { 794 if ( p.intersects(cp) ) 795 ic.parts ~= p.intersect(cp); 796 } 797 } 798 return ic; 799 } 800 801 // requires the class to be optimized 802 bool contains(range_t cr) 803 { 804 foreach ( p; parts ) 805 { 806 if ( p.contains(cr) ) 807 return true; 808 } 809 return false; 810 } 811 812 // requires the class to be optimized 813 bool contains(CharClass cc) 814 { 815 Louter: foreach ( p; cc.parts ) 816 { 817 foreach ( p2; parts ) 818 { 819 if ( p2.contains(p) ) 820 continue Louter; 821 } 822 return false; 823 } 824 return true; 825 } 826 827 void subtract(CharClass cc) 828 { 829 negate(); 830 add(cc); 831 negate(); 832 } 833 834 void add(CharClass cc) 835 { 836 parts ~= cc.parts; 837 } 838 839 void add(range_t cr) 840 { 841 parts ~= cr; 842 } 843 844 void add(char_t c) 845 { 846 parts ~= CharRange!(char_t)(c); 847 } 848 849 /* ******************************************************************************************** 850 Requires the CharClass to be optimized. 851 **********************************************************************************************/ 852 void negate() 853 { 854 optimize(); 855 char_t start = char_t.min; 856 857 // first part touches left boundary of value range 858 if ( parts.length > 0 && parts[0].l_ == start ) 859 { 860 start = parts[0].r_; 861 if ( start < char_t.max ) 862 ++start; 863 864 foreach ( i, ref cr; parts[0 .. $-1] ) 865 { 866 cr.l_ = start; 867 cr.r_ = cast(char_t)(parts[i+1].l_-1); 868 start = parts[i+1].r_; 869 if ( start < char_t.max ) 870 ++start; 871 } 872 if ( start != char_t.max ) { 873 parts[$-1].l_ = start; 874 parts[$-1].r_ = char_t.max; 875 } 876 else 877 parts.length = parts.length-1; 878 return; 879 } 880 881 foreach ( i, ref cr; parts ) 882 { 883 char_t tmp = cast(char_t)(cr.l_-1); 884 cr.l_ = start; 885 start = cr.r_; 886 if ( start < char_t.max ) 887 ++start; 888 cr.r_ = tmp; 889 } 890 891 // last part does not touch right boundary 892 if ( start != char_t.max ) 893 parts ~= range_t(start, char_t.max); 894 } 895 896 void optimize() 897 { 898 if ( empty() ) 899 return; 900 901 parts.sort; 902 903 size_t i = 0; 904 foreach ( p; parts[1 .. $] ) 905 { 906 if ( p.l_ > parts[i].r_+1 ) { 907 ++i; 908 parts[i].l_ = p.l_; 909 parts[i].r_ = p.r_; 910 continue; 911 } 912 parts[i].r_ = max(p.r_, parts[i].r_); 913 if ( parts[i].r_ >= char_t.max ) 914 break; 915 } 916 parts.length = i+1; 917 } 918 919 string toString() 920 { 921 string str; 922 str ~= "["; 923 foreach ( p; parts ) 924 str ~= p.toString(); 925 str ~= "]"; 926 return str; 927 } 928 } 929 930 debug(UnitTest) 931 { 932 unittest 933 { 934 static CharClass!(char) cc = { parts: [{l_:0,r_:10},{l_:0,r_:6},{l_:5,r_:12},{l_:12,r_:17},{l_:20,r_:100}] }; 935 assert(cc.toString(), "[(0)-(a)(0)-(6)(5)-(c)(c)-(11)(14)-'d']"); 936 cc.optimize(); 937 assert(cc.toString(), "[(0)-(11)(14)-'d']"); 938 cc.negate(); 939 assert(cc.toString(), " [(12)-(13)'e'-(ff)]"); 940 cc.optimize(); 941 assert(cc.toString(), "[(0)-(11)(14)-'d']"); 942 cc.negate(); 943 assert(cc.toString(), "[(12)-(13)'e'-(ff)]"); 944 945 static CharClass!(char) cc2 = { parts: [] }; 946 assert(cc.toString(), "[]"); 947 cc2.optimize(); 948 assert(cc.toString(), "[]"); 949 cc2.negate(); 950 assert(cc.toString(), "[(0)-(ff)]"); 951 cc2.optimize(); 952 assert(cc.toString(), "[(0)-(ff)]"); 953 cc2.negate(); 954 assert(cc.toString(), "[]"); 955 956 static CharClass!(char) cc3 = { parts: [{l_:0,r_:100},{l_:200,r_:0xff},] }; 957 assert(cc3.toString(), "[(0)-'d'(c8)-(ff)]"); 958 cc3.negate(); 959 assert(cc.toString(), "['e'-(c7)]"); 960 cc3.negate(); 961 assert(cc.toString(), "[(0)-'d'(c8)-(ff)]"); 962 963 static CharClass!(char) cc4 = { parts: [{l_:0,r_:200},{l_:100,r_:0xff},] }; 964 assert(cc.toString(), "[(0)-(c8)'d'-(ff)]"); 965 cc4.optimize(); 966 assert(cc.toString(), "[(9)-(13)(20)-'~'(a0)-(ff)(100)-(17f)(180)-(24f)(20a3)-(20b5)]"); 967 968 static CharClass!(dchar) cc5 = { parts: [{l_:0x9,r_:0x13},{0x20,r_:'~'},{l_:0xa0,r_:0xff},{l_:0x100,r_:0x17f},{l_:0x180,r_:0x24f},{l_:0x20a3,r_:0x20b5}] }; 969 cc5.optimize(); 970 assert(cc.toString(), "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]"); 971 cc5.negate(); 972 assert(cc.toString(), "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]"); 973 cc5.optimize(); 974 assert(cc.toString(), "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]"); 975 cc5.negate(); 976 assert(cc.toString(), "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]"); 977 } 978 } 979 980 /* ************************************************************************************************ 981 982 **************************************************************************************************/ 983 private struct Predicate(char_t) 984 { 985 alias const(char_t)[] string_t; 986 alias CharClass!(char_t) cc_t; 987 alias CharRange!(char_t) cr_t; 988 989 // generic data 990 enum Type { 991 consume, epsilon, lookahead, lookbehind 992 } 993 994 cc_t input; 995 Type type; 996 997 // data for compiled predicates 998 enum uint MAX_BITMAP_LENGTH = 256, 999 MAX_SEARCH_LENGTH = 256; 1000 enum MatchMode { 1001 generic, generic_l, 1002 single_char, bitmap, string_search, // consume 1003 single_char_l, bitmap_l, string_search_l // lookahead 1004 } 1005 1006 MatchMode mode; 1007 // data_chr had to be pulled out of the union due to 1008 // http://d.puremagic.com/issues/show_bug.cgi?id=2632 --- don't put it back 1009 // in until this is resolved! 1010 // 1011 // Keep in mind that data_str.length can't be modified directly unless the 1012 // new length is strictly greater than the old length. This is essentially 1013 // because ubyte.sizeof can be less than char_t.sizeof. If you set 1014 // data_str.length to anything less than or equal to data_bmp.length, 1015 // data_str will not be reallocated: only the length value will change but 1016 // nothing will realize that not that much space has actually been 1017 // allocated. data_str will be too small and you'll likely get segfaults or 1018 // such. 1019 // 1020 // In short: don't mess with data_str.length. If you have to, remove the 1021 // union entirely. 1022 // 1023 // -- Deewiant 1024 union { 1025 ubyte[] data_bmp; 1026 string_t data_str; 1027 }; 1028 char_t data_chr; 1029 1030 1031 void compile() 1032 { 1033 assert(input.parts.length > 0); 1034 1035 // single char? 1036 if ( input.parts.length == 1 && input.parts[0].l_ == input.parts[0].r_ ) 1037 { 1038 mode = type==Type.consume ? MatchMode.single_char : MatchMode.single_char_l; 1039 data_chr = input.parts[0].l_; 1040 return; 1041 } 1042 // check whether we can use a bitmap 1043 foreach ( p; input.parts ) 1044 { 1045 if ( p.l_ > MAX_BITMAP_LENGTH || p.r_ > MAX_BITMAP_LENGTH ) 1046 goto LnoBitmap; 1047 } 1048 1049 // setup bitmap 1050 data_bmp.length = MAX_BITMAP_LENGTH/8; 1051 foreach ( p; input.parts ) 1052 { 1053 for ( char_t c = p.l_; c <= p.r_; ++c ) 1054 data_bmp[c/8] |= 1 << (c&7); 1055 } 1056 mode = type==Type.consume ? MatchMode.bitmap : MatchMode.bitmap_l; 1057 return; 1058 1059 LnoBitmap: 1060 /* 1061 // check whether the class is small enough to justify a string-search 1062 // TODO: consider inverse class for 8bit chars? 1063 uint class_size; 1064 foreach ( p; input.parts ) 1065 class_size += cast(uint)p.r_+1-p.l_; 1066 if ( class_size > MAX_SEARCH_LENGTH ) 1067 goto Lgeneric; 1068 data_str.length = class_size; 1069 size_t ind; 1070 foreach ( p; input.parts ) 1071 { 1072 for ( char_t c = p.l_; c <= p.r_; ++c ) 1073 data_str[ind++] = c; 1074 } 1075 mode = type==Type.consume ? MatchMode.string_search : MatchMode.string_search_l; 1076 return; 1077 */ 1078 Lgeneric: 1079 data_str = cast(char_t[])input.parts; 1080 mode = type==Type.consume ? MatchMode.generic : MatchMode.generic_l; 1081 } 1082 1083 bool matches(char_t c) 1084 { 1085 if ( type == Type.consume || type == Type.lookahead ) 1086 return input.matches(c); 1087 assert(0); 1088 } 1089 1090 Predicate intersect(Predicate p) 1091 { 1092 Predicate p2; 1093 if ( type != Type.epsilon && p.type != Type.epsilon ) 1094 p2.input = input.intersect(p.input); 1095 return p2; 1096 } 1097 1098 bool intersects(Predicate p) 1099 { 1100 if ( type != p.type ) 1101 return false; 1102 foreach ( cr; input.parts ) 1103 { 1104 foreach ( cr2; p.input.parts ) 1105 { 1106 if ( cr.intersects(cr2) ) 1107 return true; 1108 } 1109 } 1110 return false; 1111 } 1112 1113 bool exceedsMax(uint maxc) 1114 { 1115 foreach ( p; input.parts ) 1116 { 1117 if ( p.l_ > maxc || p.r_ > maxc ) 1118 return true; 1119 } 1120 1121 return false; 1122 } 1123 1124 @property bool empty() 1125 { 1126 return type != Type.epsilon && input.empty(); 1127 } 1128 1129 void subtract(Predicate p) 1130 { 1131 if ( type != Type.epsilon && p.type != Type.epsilon ) 1132 input.subtract(p.input); 1133 } 1134 1135 void negate() 1136 { 1137 assert(type != Type.epsilon); 1138 input.negate(); 1139 } 1140 1141 void optimize() 1142 { 1143 assert(type != Type.epsilon); 1144 input.optimize(); 1145 } 1146 1147 int opCmp(const ref Predicate p) const 1148 { 1149 return input.opCmp(p.input); 1150 } 1151 1152 const bool opEquals(ref const Predicate p) 1153 { 1154 if ( type != p.type ) 1155 return false; 1156 if ( input.opCmp(p.input) != 0 ) 1157 return false; 1158 return true; 1159 } 1160 1161 cc_t getInput() 1162 { 1163 return input; 1164 } 1165 1166 void setInput(const(cc_t) cc) 1167 { 1168 input = cast(cc_t)cc; 1169 } 1170 1171 void appendInput(cr_t cr) 1172 { 1173 input.add(cr); 1174 } 1175 1176 void appendInput(cc_t cc) 1177 { 1178 input.add(cc); 1179 } 1180 1181 void appendInput(Predicate p) 1182 { 1183 input.add(p.input); 1184 } 1185 1186 string toString() 1187 { 1188 string str; 1189 switch ( type ) 1190 { 1191 case Type.consume: str = input.toString(); break; 1192 case Type.epsilon: str = "eps"; break; 1193 case Type.lookahead: str = "la:"~input.toString(); break; 1194 case Type.lookbehind: str = "lb:"~input.toString(); break; 1195 default: 1196 assert(0); 1197 } 1198 return str; 1199 } 1200 } 1201 import Utf = tango.text.convert.Utf; 1202 import tango.text.convert.Format; 1203 1204 /* ************************************************************************************************ 1205 1206 **************************************************************************************************/ 1207 class RegExpException : Exception 1208 { 1209 this(string msg) 1210 { 1211 super("RegExp: "~msg); 1212 } 1213 } 1214 1215 /* ************************************************************************************************ 1216 TNFA state 1217 **************************************************************************************************/ 1218 private class TNFAState(char_t) 1219 { 1220 bool accept = false, 1221 visited = false; 1222 size_t index; 1223 List!(TNFATransition!(char_t)) transitions; 1224 1225 this() 1226 { 1227 transitions = new List!(TNFATransition!(char_t)); 1228 } 1229 } 1230 1231 1232 /* ************************************************************************************************ 1233 Priority classes used to linearize priorities after non-linear transition creation. 1234 **************************************************************************************************/ 1235 private enum PriorityClass { 1236 greedy=0, normal=1, reluctant=2, extraReluctant=3 1237 } 1238 1239 /* ******************************************************************************** 1240 TNFA tagged transition 1241 ***********************************************************************************/ 1242 private class TNFATransition(char_t) 1243 { 1244 TNFAState!(char_t) target; 1245 Predicate!(char_t) predicate; 1246 uint priority, 1247 tag; /// one-based tag number, 0 = untagged 1248 PriorityClass priorityClass; 1249 1250 this(PriorityClass pc) 1251 { 1252 priorityClass = pc; 1253 } 1254 1255 /****************************************************************************** 1256 Move through states only going via epsilon transitions, and only choosing 1257 the one with highest priority. If the highest priority transition from a 1258 state isn't an epsilon transition, false is returned. 1259 If the accepting NFA state can be reached in this manner, true is returned. 1260 1261 NOTE: This method does not look for cycles which should be kept in mind for 1262 later. larsivi 20090827 1263 *******************************************************************************/ 1264 bool canFinish() 1265 { 1266 TNFAState!(char_t) t = target; 1267 while (!t.accept) { 1268 TNFATransition!(char_t) highestPriTrans; 1269 foreach (trans; t.transitions) { 1270 if (!highestPriTrans || highestPriTrans.priority > trans.priority) 1271 highestPriTrans = trans; 1272 } 1273 if (!(highestPriTrans.predicate.type == Predicate!(char_t).Type.epsilon)) 1274 return false; 1275 1276 t = highestPriTrans.target; 1277 } 1278 return true; 1279 } 1280 1281 } 1282 1283 /* ************************************************************************************************ 1284 Fragments of TNFAs as used in the Thompson method 1285 **************************************************************************************************/ 1286 private class TNFAFragment(char_t) 1287 { 1288 alias TNFAState!(char_t) state_t; 1289 alias TNFATransition!(char_t) trans_t; 1290 1291 List!(trans_t) entries, /// transitions to be added to the entry state 1292 exits, /// transitions to be added to the exit state 1293 entry_state, /// transitions to write the entry state to 1294 exit_state; /// transitions to write the exit state to 1295 1296 bool swapMatchingBracketSyntax; 1297 1298 this() 1299 { 1300 entries = new List!(trans_t); 1301 exits = new List!(trans_t); 1302 entry_state = new List!(trans_t); 1303 exit_state = new List!(trans_t); 1304 } 1305 1306 /* ******************************************************************************************** 1307 Write the given state as entry state to this fragment. 1308 **********************************************************************************************/ 1309 void setEntry(state_t state) 1310 { 1311 state.transitions ~= entries; 1312 foreach ( t; entry_state ) 1313 t.target = state; 1314 } 1315 1316 /* ******************************************************************************************** 1317 Write the given state as exit state to this fragment. 1318 **********************************************************************************************/ 1319 void setExit(state_t state) 1320 { 1321 state.transitions ~= exits; 1322 foreach ( t; exit_state ) 1323 t.target = state; 1324 } 1325 } 1326 1327 /* ************************************************************************************************ 1328 Tagged NFA 1329 **************************************************************************************************/ 1330 private final class TNFA(char_t) 1331 { 1332 alias TNFATransition!(char_t) trans_t; 1333 alias TNFAFragment!(char_t) frag_t; 1334 alias TNFAState!(char_t) state_t; 1335 alias Predicate!(char_t) predicate_t; 1336 alias char_t[] string_t; 1337 alias CharRange!(char_t) range_t; 1338 alias CharClass!(char_t) cc_t; 1339 1340 string_t pattern; 1341 state_t[] states; 1342 state_t start; 1343 1344 bool swapMatchingBracketSyntax; /// whether to make (?...) matching and (...) non-matching 1345 1346 /* ******************************************************************************************** 1347 Creates the TNFA from the given regex pattern 1348 **********************************************************************************************/ 1349 this(string_t regex) 1350 { 1351 next_tag = 1; 1352 transitions = new List!(trans_t); 1353 1354 pattern = regex; 1355 } 1356 1357 /* ******************************************************************************************** 1358 Print the TNFA (tabular representation of the delta function) 1359 **********************************************************************************************/ 1360 debug(TangoRegex) void print() 1361 { 1362 foreach ( int i, s; states ) 1363 { 1364 Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" "); 1365 1366 bool first=true; 1367 Stdout(" {"); 1368 foreach ( t; s.transitions ) 1369 { 1370 Stdout.format("{}{}{}:{}->{}", first?"":", ", t.priority, "gnrx"[t.priorityClass], t.predicate.toString, t.target is null?-1:t.target.index); 1371 if ( t.tag > 0 ) { 1372 Stdout.format(" t{}", t.tag); 1373 } 1374 first = false; 1375 } 1376 Stdout("}").newline; 1377 } 1378 } 1379 1380 uint tagCount() 1381 { 1382 return next_tag-1; 1383 } 1384 1385 /* ******************************************************************************************** 1386 Constructs the TNFA using extended Thompson method. 1387 Uses a slightly extended version of Dijkstra's shunting yard algorithm to convert 1388 the regexp from infix notation. 1389 **********************************************************************************************/ 1390 void parse(bool unanchored) 1391 { 1392 List!(frag_t) frags = new List!(frag_t); 1393 Stack!(Operator) opStack; 1394 Stack!(uint) tagStack; 1395 Stack!(Pair!(uint)) occurStack; 1396 opStack ~= Operator.eos; 1397 1398 /* **************************************************************************************** 1399 Perform action on operator stack 1400 ******************************************************************************************/ 1401 bool perform(Operator next_op, bool explicit_operator=true) 1402 { 1403 // calculate index in action matrix 1404 int index = cast(int)opStack.top()*(Operator.max+1); 1405 index += cast(int)next_op; 1406 1407 debug(tnfa) Stdout.formatln("\t{}:{} -> {} {} frag(s)", 1408 operator_names[opStack.top], operator_names[next_op], action_names[action_lookup[index]], frags.length 1409 ); 1410 switch ( action_lookup[index] ) 1411 { 1412 case Act.pua: 1413 opStack ~= next_op; 1414 if ( next_op == Operator.open_par ) { 1415 tagStack ~= next_tag; 1416 next_tag += 2; 1417 } 1418 break; 1419 case Act.poc: 1420 switch ( opStack.top() ) 1421 { 1422 case Operator.concat: constructConcat(frags); break; 1423 case Operator.altern: constructAltern(frags); break; 1424 case Operator.zero_one_g: constructZeroOne(frags, PriorityClass.greedy); break; 1425 case Operator.zero_one_ng: constructZeroOne(frags, PriorityClass.reluctant); break; 1426 case Operator.zero_one_xr: constructZeroOne(frags, PriorityClass.extraReluctant); break; 1427 case Operator.zero_more_g: constructZeroMore(frags, PriorityClass.greedy); break; 1428 case Operator.zero_more_ng: constructZeroMore(frags, PriorityClass.reluctant); break; 1429 case Operator.zero_more_xr: constructZeroMore(frags, PriorityClass.extraReluctant); break; 1430 case Operator.one_more_g: constructOneMore(frags, PriorityClass.greedy); break; 1431 case Operator.one_more_ng: constructOneMore(frags, PriorityClass.reluctant); break; 1432 case Operator.one_more_xr: constructOneMore(frags, PriorityClass.extraReluctant); break; 1433 case Operator.occur_g: 1434 Pair!(uint) occur = occurStack.pop(); 1435 constructOccur(frags, occur.a, occur.b, PriorityClass.greedy); 1436 break; 1437 case Operator.occur_ng: 1438 Pair!(uint) occur = occurStack.pop(); 1439 constructOccur(frags, occur.a, occur.b, PriorityClass.reluctant); 1440 break; 1441 default: 1442 throw new RegExpException("cannot process operand at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 1443 } 1444 opStack.pop(); 1445 1446 perform(next_op, false); 1447 break; 1448 case Act.poa: 1449 opStack.pop(); 1450 break; 1451 case Act.pca: 1452 if ( opStack.top() == Operator.open_par ) 1453 { 1454 if ( tagStack.empty() ) 1455 throw new RegExpException(Format.convert("Missing opening parentheses for closing parentheses at char {} \"{}\"", cursor, Utf.toString(pattern[cursor..$])).idup); 1456 constructBracket(frags, tagStack.top()); 1457 tagStack.pop(); 1458 } 1459 else { 1460 assert(opStack.top() == Operator.open_par_nm); 1461 constructBracket(frags); 1462 } 1463 opStack.pop(); 1464 break; 1465 case Act.don: 1466 return true; 1467 case Act.err: 1468 default: 1469 throw new RegExpException(Format.convert("Unexpected operand at char {} \"{}\" in \"{}\"", cursor, Utf.toString(pattern[cursor..$]), Utf.toString(pattern)).idup); 1470 } 1471 1472 return false; 1473 } 1474 1475 // add implicit extra reluctant .* (with . == any_char) at the beginning for unanchored matches 1476 // and matching bracket for total match group 1477 if ( unanchored ) { 1478 frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume); 1479 perform(Operator.zero_more_xr, false); 1480 perform(Operator.concat, false); 1481 perform(Operator.open_par, false); 1482 } 1483 1484 // convert regex to postfix and create TNFA 1485 bool implicit_concat; 1486 predicate_t.Type pred_type; 1487 1488 while ( !endOfPattern() ) 1489 { 1490 pred_type = predicate_t.Type.consume; 1491 1492 dchar c = readPattern(); 1493 switch ( c ) 1494 { 1495 case '|': 1496 perform(Operator.altern); 1497 implicit_concat = false; 1498 break; 1499 case '(': 1500 if ( implicit_concat ) 1501 perform(Operator.concat, false); 1502 implicit_concat = false; 1503 if ( peekPattern() == '?' ) { 1504 readPattern(); 1505 perform(swapMatchingBracketSyntax?Operator.open_par:Operator.open_par_nm); 1506 } 1507 else 1508 perform(swapMatchingBracketSyntax?Operator.open_par_nm:Operator.open_par); 1509 break; 1510 case ')': 1511 perform(Operator.close_par); 1512 break; 1513 case '?': 1514 if ( peekPattern() == '?' ) { 1515 readPattern(); 1516 perform(Operator.zero_one_ng); 1517 } 1518 else 1519 perform(Operator.zero_one_g); 1520 break; 1521 case '*': 1522 if ( peekPattern() == '?' ) { 1523 readPattern(); 1524 perform(Operator.zero_more_ng); 1525 } 1526 else 1527 perform(Operator.zero_more_g); 1528 break; 1529 case '+': 1530 if ( peekPattern() == '?' ) { 1531 readPattern(); 1532 perform(Operator.one_more_ng); 1533 } 1534 else 1535 perform(Operator.one_more_g); 1536 break; 1537 case '{': 1538 Pair!(uint) occur; 1539 parseOccurCount(occur.a, occur.b); 1540 occurStack ~= occur; 1541 if ( peekPattern() == '?' ) { 1542 readPattern(); 1543 perform(Operator.occur_ng); 1544 } 1545 else 1546 perform(Operator.occur_g); 1547 break; 1548 case '[': 1549 if ( implicit_concat ) 1550 perform(Operator.concat, false); 1551 implicit_concat = true; 1552 frags ~= constructCharClass(pred_type); 1553 break; 1554 case '.': 1555 if ( implicit_concat ) 1556 perform(Operator.concat, false); 1557 implicit_concat = true; 1558 frags ~= constructChars(cc_t.dot_oper, pred_type); 1559 break; 1560 case '$': 1561 if ( implicit_concat ) 1562 perform(Operator.concat, false); 1563 implicit_concat = true; 1564 1565 frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookahead); 1566 break; 1567 case '^': 1568 if ( implicit_concat ) 1569 perform(Operator.concat, false); 1570 implicit_concat = true; 1571 1572 frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookbehind); 1573 break; 1574 case '>': 1575 c = readPattern(); 1576 pred_type = predicate_t.Type.lookahead; 1577 if ( c == '[' ) 1578 goto case '['; 1579 else if ( c == '\\' ) 1580 goto case '\\'; 1581 else if ( c == '.' ) 1582 goto case '.'; 1583 else 1584 goto default; 1585 case '<': 1586 c = readPattern(); 1587 pred_type = predicate_t.Type.lookbehind; 1588 if ( c == '[' ) 1589 goto case '['; 1590 else if ( c == '\\' ) 1591 goto case '\\'; 1592 else if ( c == '.' ) 1593 goto case '.'; 1594 else 1595 goto default; 1596 case '\\': 1597 c = readPattern(); 1598 1599 if ( implicit_concat ) 1600 perform(Operator.concat, false); 1601 implicit_concat = true; 1602 1603 switch ( c ) 1604 { 1605 case 't': 1606 frags ~= constructSingleChar('\t', pred_type); 1607 break; 1608 case 'n': 1609 frags ~= constructSingleChar('\n', pred_type); 1610 break; 1611 case 'r': 1612 frags ~= constructSingleChar('\r', pred_type); 1613 break; 1614 case 'w': // alphanumeric and _ 1615 frags ~= constructChars(cc_t.alphanum_, pred_type); 1616 break; 1617 case 'W': // non-(alphanum and _) 1618 auto cc = cc_t(cc_t.alphanum_); 1619 cc.negate(); 1620 frags ~= constructChars(cc, pred_type); 1621 break; 1622 case 's': // whitespace 1623 frags ~= constructChars(cc_t.whitespace, pred_type); 1624 break; 1625 case 'S': // non-whitespace 1626 auto cc = cc_t(cc_t.whitespace); 1627 cc.negate(); 1628 frags ~= constructChars(cc, pred_type); 1629 break; 1630 case 'd': // digit 1631 frags ~= constructChars(cc_t.digit, pred_type); 1632 break; 1633 case 'D': // non-digit 1634 auto cc = cc_t(cc_t.digit); 1635 cc.negate(); 1636 frags ~= constructChars(cc, pred_type); 1637 break; 1638 case 'b': // either end of word 1639 if ( pred_type != predicate_t.Type.consume ) 1640 throw new RegExpException("Escape sequence \\b not allowed in look-ahead or -behind"); 1641 1642 // create (?<\S>\s|<\s>\S) 1643 auto cc = cc_t(cc_t.whitespace); 1644 cc.negate(); 1645 1646 perform(Operator.open_par_nm); 1647 1648 frags ~= constructChars(cc, predicate_t.Type.lookbehind); 1649 perform(Operator.concat, false); 1650 frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead); 1651 perform(Operator.altern, false); 1652 frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind); 1653 perform(Operator.concat, false); 1654 frags ~= constructChars(cc, predicate_t.Type.lookahead); 1655 1656 perform(Operator.close_par, false); 1657 break; 1658 case 'B': // neither end of word 1659 if ( pred_type != predicate_t.Type.consume ) 1660 throw new RegExpException("Escape sequence \\B not allowed in look-ahead or -behind"); 1661 1662 // create (?<\S>\S|<\s>\s) 1663 auto cc = cc_t(cc_t.whitespace); 1664 cc.negate(); 1665 1666 perform(Operator.open_par_nm); 1667 1668 frags ~= constructChars(cc, predicate_t.Type.lookbehind); 1669 perform(Operator.concat, false); 1670 frags ~= constructChars(cc, predicate_t.Type.lookahead); 1671 perform(Operator.altern, false); 1672 frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind); 1673 perform(Operator.concat, false); 1674 frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead); 1675 1676 perform(Operator.close_par, false); 1677 break; 1678 case '(': 1679 case ')': 1680 case '[': 1681 case ']': 1682 case '{': 1683 case '}': 1684 case '*': 1685 case '+': 1686 case '?': 1687 case '.': 1688 case '\\': 1689 case '^': 1690 case '$': 1691 case '|': 1692 case '<': 1693 case '>': 1694 default: 1695 frags ~= constructSingleChar(c, pred_type); 1696 break; 1697 // throw new RegExpException(Format.convert("Unknown escape sequence \\{}", c)); 1698 } 1699 break; 1700 1701 default: 1702 if ( implicit_concat ) 1703 perform(Operator.concat, false); 1704 implicit_concat = true; 1705 frags ~= constructSingleChar(c, pred_type); 1706 } 1707 } 1708 1709 // add implicit reluctant .* (with . == any_char) at the end for unanchored matches 1710 if ( unanchored ) 1711 { 1712 perform(Operator.close_par, false); 1713 if ( implicit_concat ) 1714 perform(Operator.concat, false); 1715 frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume); 1716 perform(Operator.zero_more_ng, false); 1717 } 1718 1719 // empty operator stack 1720 while ( !perform(Operator.eos) ) {} 1721 1722 // set start and finish states 1723 start = addState(); 1724 state_t finish = addState(); 1725 finish.accept = true; 1726 1727 foreach ( f; frags ) { 1728 f.setExit(finish); 1729 f.setEntry(start); 1730 } 1731 1732 // set transition priorities 1733 List!(trans_t)[PriorityClass.max+1] trans; 1734 foreach ( ref t; trans ) 1735 t = new List!(trans_t); 1736 1737 Stack!(trans_t) todo; 1738 state_t state = start; 1739 1740 while ( !todo.empty() || !state.visited ) 1741 { 1742 if ( !state.visited ) 1743 { 1744 state.visited = true; 1745 foreach_reverse ( t; state.transitions ) 1746 todo ~= t; 1747 } 1748 1749 if ( todo.empty() ) 1750 break; 1751 trans_t t = todo.top(); 1752 todo.pop(); 1753 assert(t.priorityClass<=PriorityClass.max); 1754 trans[t.priorityClass] ~= t; 1755 state = t.target; 1756 } 1757 1758 uint nextPrio; 1759 foreach ( ts; trans ) 1760 { 1761 foreach ( t; ts ) 1762 t.priority = nextPrio++; 1763 } 1764 } 1765 1766 private: 1767 uint next_tag; 1768 size_t cursor, 1769 next_cursor; 1770 List!(trans_t) transitions; 1771 1772 state_t[state_t] clonedStates; 1773 trans_t[trans_t] clonedTransitions; 1774 1775 /// RegEx operators 1776 enum Operator { 1777 eos, concat, altern, open_par, close_par, 1778 zero_one_g, zero_more_g, one_more_g, // greedy 1779 zero_one_ng, zero_more_ng, one_more_ng, // non-greedy/reluctant 1780 zero_one_xr, zero_more_xr, one_more_xr, // extra-reluctant 1781 open_par_nm, occur_g, occur_ng 1782 } 1783 __gshared immutable const(char)[][] operator_names = ["EOS", "concat", "|", "(", ")", "?", "*", "+", "??", "*?", "+?", "??x", "*?x", "+?x", "(?", "{x,y}", "{x,y}?"]; 1784 1785 /// Actions for to-postfix transformation 1786 enum Act { 1787 pua, poc, poa, pca, don, err 1788 } 1789 __gshared immutable const(char)[][] action_names = ["push+advance", "pop+copy", "pop+advance", "pop+copy+advance", "done", "error"]; 1790 1791 /// Action lookup for to-postfix transformation 1792 __gshared immutable Act[] action_lookup = 1793 [ 1794 // eos concat | ( ) ? * + ?? *? +? ??extra *?extra +?extra (? {x,y} {x,y}? 1795 Act.don, Act.pua, Act.pua, Act.pua, Act.err, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1796 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1797 Act.poc, Act.pua, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1798 Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1799 Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, 1800 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1801 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1802 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1803 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1804 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1805 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1806 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1807 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1808 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1809 Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, 1810 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, 1811 Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc 1812 ]; 1813 1814 final dchar peekPattern() 1815 { 1816 auto tmp = next_cursor; 1817 if ( tmp < pattern.length ) 1818 return decode(pattern, tmp); 1819 return 0; 1820 } 1821 1822 final dchar readPattern() 1823 { 1824 cursor = next_cursor; 1825 if ( next_cursor < pattern.length ) 1826 return decode(pattern, next_cursor); 1827 return 0; 1828 } 1829 1830 final bool endOfPattern() 1831 { 1832 return next_cursor >= pattern.length; 1833 } 1834 1835 state_t addState() 1836 { 1837 state_t s = new state_t; 1838 s.index = states.length; 1839 states ~= s; 1840 return s; 1841 } 1842 1843 trans_t addTransition(PriorityClass pc = PriorityClass.normal) 1844 { 1845 trans_t trans = new trans_t(pc); 1846 transitions ~= trans; 1847 return trans; 1848 } 1849 1850 uint parseNumber() 1851 { 1852 uint res; 1853 while ( !endOfPattern() ) 1854 { 1855 auto c = peekPattern(); 1856 if ( c < '0' || c > '9' ) 1857 break; 1858 res = res*10+(c-'0'); 1859 readPattern(); 1860 } 1861 return res; 1862 } 1863 1864 void parseOccurCount(out uint minOccur, out uint maxOccur) 1865 { 1866 assert(pattern[cursor] == '{'); 1867 1868 minOccur = parseNumber(); 1869 if ( peekPattern() == '}' ) { 1870 readPattern(); 1871 maxOccur = minOccur; 1872 return; 1873 } 1874 if ( peekPattern() != ',' ) 1875 throw new RegExpException("Invalid occurence range at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 1876 readPattern(); 1877 maxOccur = parseNumber(); 1878 if ( peekPattern() != '}' ) 1879 throw new RegExpException("Invalid occurence range at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 1880 readPattern(); 1881 if ( maxOccur > 0 && maxOccur < minOccur ) 1882 throw new RegExpException("Invalid occurence range (max < min) at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 1883 } 1884 1885 trans_t clone(trans_t t) 1886 { 1887 if ( t is null ) 1888 return null; 1889 trans_t* tmp = t in clonedTransitions; 1890 if ( tmp !is null ) 1891 return *tmp; 1892 1893 trans_t t2 = new trans_t(t.priorityClass); 1894 clonedTransitions[t] = t2; 1895 t2.tag = t.tag; 1896 t2.priority = t.priority; 1897 t2.predicate = t.predicate; 1898 t2.target = clone(t.target); 1899 transitions ~= t2; 1900 return t2; 1901 } 1902 1903 state_t clone(state_t s) 1904 { 1905 if ( s is null ) 1906 return null; 1907 state_t* tmp = s in clonedStates; 1908 if ( tmp !is null ) 1909 return *tmp; 1910 1911 state_t s2 = new state_t; 1912 clonedStates[s] = s2; 1913 s2.accept = s.accept; 1914 s2.visited = s.visited; 1915 foreach ( t; s.transitions ) 1916 s2.transitions ~= clone(t); 1917 s2.index = states.length; 1918 states ~= s2; 1919 return s2; 1920 } 1921 1922 frag_t clone(frag_t f) 1923 { 1924 if ( f is null ) 1925 return null; 1926 clonedStates = null; 1927 clonedTransitions = null; 1928 1929 frag_t f2 = new frag_t; 1930 foreach ( t; f.entries ) 1931 f2.entries ~= clone(t); 1932 foreach ( t; f.exits ) 1933 f2.exits ~= clone(t); 1934 foreach ( t; f.entry_state ) 1935 f2.entry_state ~= clone(t); 1936 foreach ( t; f.exit_state ) 1937 f2.exit_state ~= clone(t); 1938 return f2; 1939 } 1940 1941 //--------------------------------------------------------------------------------------------- 1942 // Thompson constructions of NFA fragments 1943 1944 frag_t constructSingleChar(char_t c, predicate_t.Type type) 1945 { 1946 debug(tnfa) Stdout.formatln("constructCharFrag {}", c); 1947 1948 trans_t trans = addTransition(); 1949 trans.predicate.appendInput(CharRange!(char_t)(c)); 1950 1951 trans.predicate.type = type; 1952 1953 frag_t frag = new frag_t; 1954 frag.exit_state ~= trans; 1955 frag.entries ~= trans; 1956 return frag; 1957 } 1958 1959 frag_t constructChars(string_t chars, in predicate_t.Type type) 1960 { 1961 cc_t cc; 1962 for ( int i = 0; i < chars.length; ++i ) 1963 cc.add(chars[i]); 1964 1965 return constructChars(cc, type); 1966 } 1967 1968 frag_t constructChars(const(cc_t) charclass, in predicate_t.Type type) 1969 { 1970 debug(tnfa) Stdout.format("constructChars type={}", type); 1971 1972 trans_t trans = addTransition(); 1973 trans.predicate.type = type; 1974 1975 trans.predicate.setInput(cc_t(charclass)); 1976 1977 trans.predicate.optimize(); 1978 debug(tnfa) Stdout.formatln("-> {}", trans.predicate.toString); 1979 1980 frag_t frag = new frag_t; 1981 frag.exit_state ~= trans; 1982 frag.entries ~= trans; 1983 return frag; 1984 } 1985 1986 frag_t constructCharClass(predicate_t.Type type) 1987 { 1988 debug(tnfa) Stdout.format("constructCharClass type={}", type); 1989 auto oldCursor = cursor; 1990 1991 trans_t trans = addTransition(); 1992 1993 bool negated=false; 1994 if ( peekPattern() == '^' ) { 1995 readPattern(); 1996 negated = true; 1997 } 1998 1999 char_t last; 2000 bool have_range_start, 2001 first_char = true; 2002 for ( ; !endOfPattern() && peekPattern() != ']'; ) 2003 { 2004 dchar c = readPattern(); 2005 switch ( c ) 2006 { 2007 case '-': 2008 if ( first_char ) { 2009 trans.predicate.appendInput(range_t(c)); 2010 break; 2011 } 2012 if ( !have_range_start ) 2013 throw new RegExpException("Missing range start for '-' operator after \""~Utf.toString(pattern).idup~"\""); 2014 else if ( endOfPattern() || peekPattern() == ']' ) 2015 throw new RegExpException("Missing range end for '-' operator after \""~Utf.toString(pattern).idup~"\""); 2016 else { 2017 c = readPattern(); 2018 trans.predicate.appendInput(range_t(last, c)); 2019 have_range_start = false; 2020 } 2021 break; 2022 case '\\': 2023 if ( endOfPattern() ) 2024 throw new RegExpException("unexpected end of string after \""~Utf.toString(pattern).idup~"\""); 2025 c = readPattern(); 2026 switch ( c ) 2027 { 2028 case 't': 2029 c = '\t'; 2030 break; 2031 case 'n': 2032 c = '\n'; 2033 break; 2034 case 'r': 2035 c = '\r'; 2036 break; 2037 default: 2038 break; 2039 } 2040 goto default; 2041 default: 2042 if ( have_range_start ) 2043 trans.predicate.appendInput(range_t(last)); 2044 last = c; 2045 have_range_start = true; 2046 } 2047 first_char = false; 2048 } 2049 if ( !endOfPattern() ) 2050 readPattern(); 2051 if ( last != char_t.init ) 2052 trans.predicate.appendInput(range_t(last)); 2053 debug(tnfa) Stdout.formatln(" {}", pattern[oldCursor..cursor]); 2054 2055 if ( negated ) { 2056 auto tmp = cc_t(cc_t.any_char); 2057 tmp.subtract(trans.predicate.input); 2058 trans.predicate.input = tmp; 2059 } 2060 else 2061 trans.predicate.optimize(); 2062 debug(tnfa) Stdout.formatln("-> {}", trans.predicate.toString); 2063 2064 trans.predicate.type = type; 2065 2066 frag_t frag = new frag_t; 2067 frag.exit_state ~= trans; 2068 frag.entries ~= trans; 2069 return frag; 2070 } 2071 2072 void constructBracket(List!(frag_t) frags, uint tag=0) 2073 { 2074 debug(tnfa) Stdout.formatln("constructBracket"); 2075 2076 state_t entry = addState(), 2077 exit = addState(); 2078 frags.tail.value.setEntry(entry); 2079 frags.tail.value.setExit(exit); 2080 2081 trans_t tag1 = addTransition(), 2082 tag2 = addTransition(); 2083 tag1.predicate.type = predicate_t.Type.epsilon; 2084 tag2.predicate.type = predicate_t.Type.epsilon; 2085 if ( tag > 0 ) 2086 { 2087 // make sure the tag indeces for bracket x are always 2088 // x*2 for the opening bracket and x*2+1 for the closing bracket 2089 tag1.tag = tag++; 2090 tag2.tag = tag; 2091 } 2092 tag1.target = entry; 2093 exit.transitions ~= tag2; 2094 2095 frag_t frag = new frag_t; 2096 frag.entries ~= tag1; 2097 frag.exit_state ~= tag2; 2098 frags.pop(); 2099 frags ~= frag; 2100 } 2101 2102 void constructOneMore(List!(frag_t) frags, PriorityClass prioClass) 2103 { 2104 debug(tnfa) Stdout.formatln("constructOneMore"); 2105 2106 if ( frags.empty() ) 2107 throw new RegExpException("too few arguments for + at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2108 2109 trans_t repeat = addTransition(prioClass), 2110 cont = addTransition(); 2111 repeat.predicate.type = predicate_t.Type.epsilon; 2112 cont.predicate.type = predicate_t.Type.epsilon; 2113 2114 state_t s = addState(); 2115 frags.tail.value.setExit(s); 2116 s.transitions ~= repeat; 2117 s.transitions ~= cont; 2118 2119 frag_t frag = new frag_t; 2120 frag.entries ~= frags.tail.value.entries; 2121 frag.entry_state ~= frags.tail.value.entry_state; 2122 frag.entry_state ~= repeat; 2123 frag.exit_state ~= cont; 2124 frags.pop(); 2125 frags ~= frag; 2126 } 2127 2128 void constructZeroMore(List!(frag_t) frags, PriorityClass prioClass) 2129 { 2130 debug(tnfa) Stdout.formatln("constructZeroMore"); 2131 2132 if ( frags.empty() ) 2133 throw new RegExpException("too few arguments for * at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2134 2135 trans_t enter = addTransition(prioClass), 2136 repeat = addTransition(prioClass), 2137 skip = addTransition(); 2138 skip.predicate.type = predicate_t.Type.epsilon; 2139 repeat.predicate.type = predicate_t.Type.epsilon; 2140 enter.predicate.type = predicate_t.Type.epsilon; 2141 2142 state_t entry = addState(), 2143 exit = addState(); 2144 frags.tail.value.setEntry(entry); 2145 frags.tail.value.setExit(exit); 2146 exit.transitions ~= repeat; 2147 enter.target = entry; 2148 2149 frag_t frag = new frag_t; 2150 frag.entries ~= skip; 2151 frag.entries ~= enter; 2152 frag.exit_state ~= skip; 2153 frag.entry_state ~= repeat; 2154 frags.pop(); 2155 frags ~= frag; 2156 } 2157 2158 void constructZeroOne(List!(frag_t) frags, PriorityClass prioClass) 2159 { 2160 debug(tnfa) Stdout.formatln("constructZeroOne"); 2161 2162 if ( frags.empty() ) 2163 throw new RegExpException("too few arguments for ? at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2164 2165 trans_t use = addTransition(prioClass), 2166 skip = addTransition(); 2167 use.predicate.type = predicate_t.Type.epsilon; 2168 skip.predicate.type = predicate_t.Type.epsilon; 2169 2170 state_t s = addState(); 2171 frags.tail.value.setEntry(s); 2172 use.target = s; 2173 2174 frag_t frag = new frag_t; 2175 frag.entries ~= use; 2176 frag.entries ~= skip; 2177 frag.exits ~= frags.tail.value.exits; 2178 frag.exit_state ~= frags.tail.value.exit_state; 2179 frag.exit_state ~= skip; 2180 frags.pop(); 2181 frags ~= frag; 2182 } 2183 2184 void constructOccur(List!(frag_t) frags, uint minOccur, uint maxOccur, PriorityClass prioClass) 2185 { 2186 debug(tnfa) Stdout.formatln("constructOccur {},{}", minOccur, maxOccur); 2187 2188 if ( frags.empty() ) 2189 throw new RegExpException("too few arguments for {x,y} at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2190 2191 state_t s; 2192 frag_t total = new frag_t, 2193 prev; 2194 2195 for ( int i = 0; i < minOccur; ++i ) 2196 { 2197 frag_t f = clone(frags.tail.value); 2198 if ( prev !is null ) { 2199 s = addState(); 2200 prev.setExit(s); 2201 f.setEntry(s); 2202 } 2203 else { 2204 total.entries = f.entries; 2205 total.entry_state = f.entry_state; 2206 } 2207 prev = f; 2208 } 2209 2210 if ( maxOccur == 0 ) 2211 { 2212 frag_t f = frags.tail.value; 2213 trans_t t = addTransition(); 2214 t.predicate.type = predicate_t.Type.epsilon; 2215 f.entries ~= t; 2216 f.exit_state ~= t; 2217 2218 t = addTransition(); 2219 t.predicate.type = predicate_t.Type.epsilon; 2220 f.exits ~= t; 2221 f.entry_state ~= t; 2222 2223 s = addState(); 2224 f.setEntry(s); 2225 2226 if ( prev !is null ) 2227 prev.setExit(s); 2228 else { 2229 total.entries = f.entries; 2230 total.entry_state = f.entry_state; 2231 } 2232 2233 prev = f; 2234 } 2235 2236 for ( int i = minOccur; i < maxOccur; ++i ) 2237 { 2238 frag_t f; 2239 if ( i < maxOccur-1 ) 2240 f = clone(frags.tail.value); 2241 else 2242 f = frags.tail.value; 2243 trans_t t = addTransition(); 2244 t.predicate.type = predicate_t.Type.epsilon; 2245 f.entries ~= t; 2246 f.exit_state ~= t; 2247 2248 if ( prev !is null ) { 2249 s = addState(); 2250 prev.setExit(s); 2251 f.setEntry(s); 2252 } 2253 else { 2254 total.entries = f.entries; 2255 total.entry_state = f.entry_state; 2256 } 2257 prev = f; 2258 } 2259 2260 total.exits = prev.exits; 2261 total.exit_state = prev.exit_state; 2262 2263 frags.pop(); 2264 frags ~= total; 2265 } 2266 2267 void constructAltern(List!(frag_t) frags) 2268 { 2269 debug(tnfa) Stdout.formatln("constructAltern"); 2270 2271 if ( frags.empty() || frags.head is frags.tail ) 2272 throw new RegExpException("too few arguments for | at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2273 2274 frag_t frag = new frag_t, 2275 f1 = frags.tail.value, 2276 f2 = frags.tail.prev.value; 2277 frag.entry_state ~= f2.entry_state; 2278 frag.entry_state ~= f1.entry_state; 2279 frag.exit_state ~= f2.exit_state; 2280 frag.exit_state ~= f1.exit_state; 2281 frag.entries ~= f2.entries; 2282 frag.entries ~= f1.entries; 2283 frag.exits ~= f2.exits; 2284 frag.exits ~= f1.exits; 2285 2286 frags.pop(); 2287 frags.pop(); 2288 frags ~= frag; 2289 } 2290 2291 void constructConcat(List!(frag_t) frags) 2292 { 2293 debug(tnfa) Stdout.formatln("constructConcat"); 2294 2295 if ( frags.empty() || frags.head is frags.tail ) 2296 throw new RegExpException("too few operands for concatenation at \""~Utf.toString(pattern[cursor..$]).idup~"\""); 2297 2298 frag_t f1 = frags.tail.value, 2299 f2 = frags.tail.prev.value; 2300 2301 state_t state = addState(); 2302 f2.setExit(state); 2303 f1.setEntry(state); 2304 2305 frag_t frag = new frag_t; 2306 frag.entries ~= f2.entries; 2307 frag.exits ~= f1.exits; 2308 frag.entry_state ~= f2.entry_state; 2309 frag.exit_state ~= f1.exit_state; 2310 frags.pop(); 2311 frags.pop(); 2312 frags ~= frag; 2313 } 2314 } 2315 import tango.core.Array; 2316 2317 /* ************************************************************************************************ 2318 Tagged DFA 2319 **************************************************************************************************/ 2320 private class TDFA(char_t) 2321 { 2322 alias Predicate!(char_t) predicate_t; 2323 alias CharRange!(char_t) range_t; 2324 alias CharClass!(char_t) charclass_t; 2325 alias char_t[] string_t; 2326 2327 enum uint CURRENT_POSITION_REGISTER = ~0; 2328 2329 /* ******************************************************************************************** 2330 Tag map assignment command 2331 **********************************************************************************************/ 2332 struct Command 2333 { 2334 uint dst, /// register index to recieve data 2335 src; /// register index or CURRENT_POSITION_REGISTER for current position 2336 2337 string toString() 2338 { 2339 return Format.convert("{}<-{}", dst, src==CURRENT_POSITION_REGISTER?"p":Format.convert("{}", src)).idup; 2340 } 2341 2342 /* **************************************************************************************** 2343 Order transitions by the order of their predicates. 2344 ******************************************************************************************/ 2345 int opCmp(const ref Command cmd) const 2346 { 2347 if ( src == CURRENT_POSITION_REGISTER && cmd.src != CURRENT_POSITION_REGISTER ) 2348 return 1; 2349 if ( src != CURRENT_POSITION_REGISTER && cmd.src == CURRENT_POSITION_REGISTER ) 2350 return -1; 2351 if ( dst < cmd.dst ) 2352 return -1; 2353 if ( dst == cmd.dst ) 2354 return 0; 2355 return 1; 2356 } 2357 } 2358 2359 struct TagIndex 2360 { 2361 uint tag, 2362 index; 2363 2364 const int opCmp(ref const TagIndex o) 2365 { 2366 if (tag == o.tag && index == o.index) 2367 return 0; 2368 if (tag > o.tag) 2369 return 1; 2370 if (tag < o.tag) 2371 return -1; 2372 if (index > o.index) 2373 return 1; 2374 if (index < o.index) 2375 return -1; 2376 assert(0); 2377 } 2378 } 2379 2380 /* ******************************************************************************************** 2381 TDFA state 2382 **********************************************************************************************/ 2383 class State 2384 { 2385 enum Mode { 2386 GENERIC, MIXED, LOOKUP 2387 } 2388 2389 enum uint LOOKUP_LENGTH = 256, 2390 INVALID_STATE = 255; 2391 2392 bool accept = false; 2393 bool reluctant = false; 2394 size_t index; 2395 Transition[] transitions, 2396 generic_transitions; 2397 Command[] finishers; 2398 2399 ubyte[] lookup; 2400 Mode mode; 2401 2402 void optimize() 2403 { 2404 // merge transitions with equal targets (same state index and equal commands) 2405 size_t[] remove_indeces; 2406 foreach ( i, t; transitions[0 .. $-1] ) 2407 { 2408 foreach ( t2; transitions[i+1 .. $] ) 2409 { 2410 if ( t.predicate.type != t2.predicate.type || !t.equalTarget(t2) ) 2411 continue; 2412 t2.predicate.appendInput(t.predicate); 2413 remove_indeces ~= i; 2414 break; 2415 } 2416 } 2417 2418 // remove transitions that have been merged into another 2419 if ( remove_indeces.length > 0 ) 2420 { 2421 Transition[] tmp; 2422 tmp.length = transitions.length - remove_indeces.length; 2423 size_t next_remove, next; 2424 foreach ( i, t; transitions ) 2425 { 2426 if ( next_remove < remove_indeces.length && remove_indeces[next_remove] == i ) { 2427 ++next_remove; 2428 continue; 2429 } 2430 tmp[next++] = t; 2431 } 2432 transitions = tmp; 2433 2434 foreach ( t; transitions ) 2435 t.predicate.optimize(); 2436 } 2437 } 2438 2439 void createLookup() 2440 { 2441 size_t count; 2442 foreach ( t; transitions ) 2443 { 2444 if ( !t.predicate.exceedsMax(LOOKUP_LENGTH) ) 2445 ++count; 2446 } 2447 2448 if ( count < 1 || transitions.length > INVALID_STATE ) { 2449 generic_transitions.length = transitions.length; 2450 generic_transitions[] = transitions[]; 2451 return; 2452 } 2453 2454 foreach ( t; transitions ) 2455 { 2456 if ( t.predicate.exceedsMax(LOOKUP_LENGTH) ) 2457 generic_transitions ~= t; 2458 } 2459 2460 // setup lookup table 2461 lookup.length = LOOKUP_LENGTH; 2462 lookup[] = INVALID_STATE; 2463 foreach ( i, t; transitions ) 2464 { 2465 foreach ( p; t.predicate.input.parts ) 2466 { 2467 if ( p.l_ >= lookup.length ) 2468 continue; 2469 for ( char_t c = p.l_; c <= min(p.r_, LOOKUP_LENGTH-1); ++c ) 2470 lookup[c] = cast(ubyte)i; 2471 } 2472 } 2473 2474 mode = count < transitions.length? Mode.MIXED : Mode.LOOKUP; 2475 } 2476 } 2477 2478 /* ******************************************************************************************** 2479 TDFA transition 2480 **********************************************************************************************/ 2481 class Transition 2482 { 2483 State target; 2484 predicate_t predicate; 2485 Command[] commands; 2486 2487 /* **************************************************************************************** 2488 Order transitions by the order of their predicates. 2489 ******************************************************************************************/ 2490 override final int opCmp(Object o) 2491 { 2492 Transition t = cast(Transition)o; 2493 assert(t !is null); 2494 return predicate.opCmp(t.predicate); 2495 } 2496 2497 final const bool opEquals(const Object o) 2498 { 2499 auto t = cast(const(Transition))o; 2500 if ( t is null ) 2501 return false; 2502 if ( equalTarget(t) && t.predicate == predicate ) 2503 return true; 2504 return false; 2505 } 2506 2507 const bool equalTarget(in Transition t) 2508 { 2509 if ( t.target.index != target.index ) 2510 return false; 2511 if ( commands.length != t.commands.length ) 2512 return false; 2513 Louter: foreach ( cmd; commands ) 2514 { 2515 foreach ( cmd2; t.commands ) 2516 { 2517 if ( cmd == cmd2 ) 2518 continue Louter; 2519 } 2520 return false; 2521 } 2522 return true; 2523 } 2524 } 2525 2526 2527 State[] states; 2528 State start; 2529 Command[] initializer; 2530 uint num_tags; 2531 2532 uint[TagIndex] registers; 2533 uint next_register; 2534 2535 uint num_regs() 2536 { 2537 return next_register; 2538 } 2539 2540 /* ******************************************************************************************** 2541 Constructs the TDFA from the given TNFA using extended power set method 2542 **********************************************************************************************/ 2543 this(TNFA!(char_t) tnfa) 2544 { 2545 num_tags = tnfa.tagCount(); 2546 2547 next_register = num_tags; 2548 for ( int i = 1; i <= num_tags; ++i ) { 2549 TagIndex ti; 2550 ti.tag = i; 2551 registers[ti] = i-1; 2552 } 2553 2554 // create epsilon closure of TNFA start state 2555 SubsetState subset_start = new SubsetState; 2556 StateElement se = new StateElement; 2557 se.nfa_state = tnfa.start; 2558 subset_start.elms ~= se; 2559 2560 // apply lookbehind closure for string/line start 2561 predicate_t tmp_pred; 2562 tmp_pred.setInput(CharClass!(char_t).line_startend); 2563 subset_start = epsilonClosure(lookbehindClosure(epsilonClosure(subset_start, subset_start), tmp_pred), subset_start); 2564 2565 start = addState(); 2566 subset_start.dfa_state = start; 2567 2568 // generate initializer and finisher commands for TDFA start state 2569 generateInitializers(subset_start); 2570 generateFinishers(subset_start); 2571 2572 // initialize stack for state traversal 2573 List!(SubsetState) subset_states = new List!(SubsetState), 2574 unmarked = new List!(SubsetState); 2575 subset_states ~= subset_start; 2576 unmarked ~= subset_start; 2577 debug(tdfa) Stdout.formatln("\n{} = {}\n", subset_start.dfa_state.index, subset_start.toString); 2578 2579 while ( !unmarked.empty() ) 2580 { 2581 SubsetState state = unmarked.tail.value; 2582 unmarked.pop(); 2583 2584 // create transitions for each class, creating new states when necessary 2585 foreach ( pred; disjointPredicates(state) ) 2586 { 2587 // find NFA state we reach with pred 2588 // reach will set predicate type correctly 2589 debug(tdfa) Stdout.format("from {} with {} reach", state.dfa_state.index, pred.toString); 2590 SubsetState target = reach(state, pred); 2591 if ( target is null ) { 2592 continue; 2593 debug(tdfa) Stdout.formatln(" nothing - lookbehind at beginning, skipping"); 2594 } 2595 debug(tdfa) Stdout.formatln(" {}", target.toString); 2596 target = epsilonClosure(lookbehindClosure(epsilonClosure(target, state), pred), state); 2597 2598 Transition trans = new Transition; 2599 state.dfa_state.transitions ~= trans; 2600 debug (tdfa_new_trans) Stdout.formatln("Creating with pred: {}", pred); 2601 trans.predicate = pred; 2602 2603 // generate indeces for pos commands 2604 // delay creation of pos command until we have reorder-commands 2605 uint[uint] cmds = null; 2606 foreach ( e; target.elms ) 2607 { 2608 foreach ( tag, ref index; e.tags ) 2609 { 2610 bool found=false; 2611 foreach ( e2; state.elms ) 2612 { 2613 int* i = tag in e2.tags; 2614 if ( i !is null && *i == index ) { 2615 found=true; 2616 break; 2617 } 2618 } 2619 if ( !found ) 2620 { 2621 // if index is < 0 it is a temporary index 2622 // used only to distinguish the state from existing ones. 2623 // the previous index can be reused instead. 2624 if ( index < 0 ) 2625 index = -index-1; 2626 cmds[tag] = index; 2627 } 2628 else 2629 assert(index>=0); 2630 } 2631 } 2632 2633 // check whether a state exists that is identical except for tag index reorder-commands 2634 bool exists=false; 2635 foreach ( equivTarget; subset_states ) 2636 { 2637 if ( reorderTagIndeces(target, equivTarget, state, trans) ) { 2638 target = equivTarget; 2639 exists = true; 2640 break; 2641 } 2642 } 2643 // else create new target state 2644 if ( !exists ) 2645 { 2646 State ts = addState(); 2647 target.dfa_state = ts; 2648 subset_states ~= target; 2649 unmarked ~= target; 2650 debug(tdfa_add) { 2651 Stdout.formatln("\nAdded {} = {}\n", target.dfa_state.index, target.toString); 2652 } 2653 generateFinishers(target); 2654 } 2655 2656 // now generate pos commands, rewriting reorder-commands if existent 2657 foreach ( tag, index; cmds ) 2658 { 2659 // check whether reordering used this tag, if so, overwrite the command directly, 2660 // for it's effect would be overwritten by a subsequent pos-command anyway 2661 uint reg = registerFromTagIndex(tag, index); 2662 bool found = false; 2663 foreach ( ref cmd; trans.commands ) 2664 { 2665 if ( cmd.src == reg ) { 2666 found = true; 2667 cmd.src = CURRENT_POSITION_REGISTER; 2668 break; 2669 } 2670 } 2671 if ( !found ) { 2672 Command cmd; 2673 cmd.dst = reg; 2674 cmd.src = CURRENT_POSITION_REGISTER; 2675 trans.commands ~= cmd; 2676 } 2677 } 2678 2679 trans.target = target.dfa_state; 2680 debug(tdfa) { 2681 Stdout.formatln("=> from {} with {} reach {}", state.dfa_state.index, pred.toString, target.dfa_state.index); 2682 } 2683 } 2684 2685 state.dfa_state.optimize(); 2686 } 2687 2688 // renumber registers continuously 2689 uint[uint] regNums; 2690 2691 for ( next_register = 0; next_register < num_tags; ++next_register ) 2692 regNums[next_register] = next_register; 2693 2694 void renumberCommand(ref Command cmd) 2695 { 2696 if ( cmd.src != CURRENT_POSITION_REGISTER && (cmd.src in regNums) is null ) 2697 regNums[cmd.src] = next_register++; 2698 if ( (cmd.dst in regNums) is null ) 2699 regNums[cmd.dst] = next_register++; 2700 if ( cmd.src != CURRENT_POSITION_REGISTER ) 2701 cmd.src = regNums[cmd.src]; 2702 cmd.dst = regNums[cmd.dst]; 2703 } 2704 2705 foreach ( state; states ) 2706 { 2707 foreach ( ref cmd; state.finishers ) 2708 renumberCommand(cmd); 2709 // make sure pos-commands are executed after reorder-commands and 2710 // reorder-commands do not overwrite each other 2711 state.finishers.sort; 2712 2713 foreach ( trans; state.transitions ) 2714 { 2715 foreach ( ref cmd; trans.commands ) 2716 renumberCommand(cmd); 2717 trans.commands.sort; 2718 trans.predicate.compile(); 2719 } 2720 } 2721 2722 debug(TangoRegex) 2723 { 2724 foreach ( ref v; registers ) 2725 { 2726 if ( (v in regNums) !is null ) 2727 v = regNums[v]; 2728 } 2729 } 2730 2731 minimizeDFA(); 2732 2733 foreach ( state; states ) 2734 state.createLookup(); 2735 2736 // TODO: optimize memory layout of TDFA 2737 2738 // TODO: add lookahead for string-end somewhere 2739 // TODO: mark dead-end states (not leaving a non-finishing susbet) 2740 // TODO: mark states that can leave the finishing subset of DFA states or use a greedy transition 2741 // (execution may stop in that state) 2742 } 2743 2744 /* ******************************************************************************************** 2745 Print the TDFA (tabular representation of the delta function) 2746 **********************************************************************************************/ 2747 debug(TangoRegex) void print() 2748 { 2749 Stdout.formatln("#tags = {}", num_tags); 2750 2751 auto tis = new TagIndex[registers.length]; 2752 foreach ( k, v; registers ) 2753 tis [v] = k; 2754 foreach ( r, ti; tis ) { 2755 Stdout.formatln("tag({},{}) in reg {}", ti.tag, ti.index, r); 2756 } 2757 Stdout.formatln("Initializer:"); 2758 foreach ( cmd; initializer ) { 2759 Stdout.formatln("{}", cmd.toString); 2760 } 2761 Stdout.formatln("Delta function:"); 2762 foreach ( int i, s; states ) 2763 { 2764 Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" "); 2765 2766 bool first=true; 2767 Stdout(" {"); 2768 foreach ( t; s.transitions ) 2769 { 2770 Stdout.format("{}{}->{} (", first?"":", ", t.predicate.toString, t.target.index); 2771 bool firstcmd=true; 2772 foreach ( cmd; t.commands ) 2773 { 2774 if ( firstcmd ) 2775 firstcmd = false; 2776 else 2777 Stdout(","); 2778 Stdout.format("{}", cmd.toString); 2779 } 2780 Stdout(")"); 2781 first = false; 2782 } 2783 Stdout("} ("); 2784 2785 bool firstcmd=true; 2786 foreach ( cmd; s.finishers ) 2787 { 2788 if ( firstcmd ) 2789 firstcmd = false; 2790 else 2791 Stdout(","); 2792 Stdout.format("{}", cmd.toString); 2793 } 2794 Stdout.formatln(")"); 2795 } 2796 } 2797 2798 private: 2799 /* ******************************************************************************************** 2800 A (TNFA state, tags) pair element of a subset state. 2801 **********************************************************************************************/ 2802 class StateElement 2803 { 2804 TNFAState!(char_t) nfa_state; 2805 int[uint] tags; 2806 // use place-value priority with 2 places, value(maxPrio) > value(lastPrio) 2807 uint maxPriority, 2808 lastPriority; 2809 2810 bool prioGreater(StateElement se) 2811 { 2812 if ( maxPriority < se.maxPriority ) 2813 return true; 2814 if ( maxPriority == se.maxPriority ) { 2815 assert(lastPriority != se.lastPriority); 2816 return lastPriority < se.lastPriority; 2817 } 2818 return false; 2819 } 2820 2821 override int opCmp(Object o) 2822 { 2823 StateElement se = cast(StateElement)o; 2824 assert(se !is null); 2825 if ( maxPriority < se.maxPriority ) 2826 return 1; 2827 if ( maxPriority == se.maxPriority ) 2828 { 2829 if ( lastPriority == se.lastPriority ) 2830 return 0; 2831 return lastPriority < se.lastPriority; 2832 } 2833 return -1; 2834 } 2835 2836 override string toString() 2837 { 2838 const(char)[] str; 2839 str = Format.convert("{} p{}.{} {{", nfa_state.index, maxPriority, lastPriority); 2840 bool first = true; 2841 foreach ( k, v; tags ) { 2842 str ~= Format.convert("{}m({},{})", first?"":",", k, v); 2843 first = false; 2844 } 2845 str ~= "}"; 2846 return str.idup; 2847 } 2848 } 2849 2850 /* ******************************************************************************************** 2851 Represents a state in the NFA to DFA conversion. 2852 Contains the set of states (StateElements) the NFA might be in at the same time and the 2853 corresponding DFA state that we create. 2854 **********************************************************************************************/ 2855 class SubsetState 2856 { 2857 StateElement[] elms; 2858 State dfa_state; 2859 2860 this(StateElement[] elms=null) 2861 { 2862 this.elms = elms; 2863 } 2864 2865 int opApply(int delegate (ref TNFATransition!(char_t)) dg) 2866 { 2867 int res; 2868 foreach ( elm; elms ) 2869 { 2870 foreach ( t; elm.nfa_state.transitions ) 2871 { 2872 res = dg(t); 2873 if ( res ) 2874 return res; 2875 } 2876 } 2877 return res; 2878 } 2879 2880 override string toString() 2881 { 2882 string str = "[ "; 2883 bool first = true; 2884 foreach ( s; elms ) { 2885 if ( !first ) 2886 str ~= ", "; 2887 str ~= s.toString(); 2888 first = false; 2889 } 2890 return str~" ]"; 2891 } 2892 } 2893 2894 /* ******************************************************************************************** 2895 Temporary structure used for disjoint predicate computation 2896 **********************************************************************************************/ 2897 struct Mark 2898 { 2899 char_t c; 2900 bool end; /// false = start of range 2901 2902 int opCmp(const ref Mark m) const 2903 { 2904 if ( c < m.c ) 2905 return -1; 2906 if ( c > m.c ) 2907 return 1; 2908 if ( end < m.end ) 2909 return -1; 2910 if ( end > m.end ) 2911 return 1; 2912 return 0; 2913 } 2914 } 2915 2916 /* ******************************************************************************************** 2917 Calculates the register index for a given tag map entry. The TDFA implementation uses 2918 registers to save potential tag positions, the index space gets linearized here. 2919 2920 Params: tag = tag number 2921 index = tag map index 2922 Returns: index of the register to use for the tag map entry 2923 **********************************************************************************************/ 2924 uint registerFromTagIndex(uint tag, uint index) 2925 { 2926 if ( index > 0 ) 2927 { 2928 TagIndex ti; 2929 ti.tag = tag; 2930 ti.index = index; 2931 uint* i = ti in registers; 2932 if ( i !is null ) 2933 return *i; 2934 return registers[ti] = next_register++; 2935 } 2936 else 2937 return tag-1; 2938 } 2939 2940 Mark[] marks_; 2941 2942 /* ******************************************************************************************** 2943 Add new TDFA state to the automaton. 2944 **********************************************************************************************/ 2945 State addState() 2946 { 2947 State s = new State; 2948 s.index = states.length; 2949 states ~= s; 2950 return s; 2951 } 2952 2953 /* ******************************************************************************************** 2954 Creates disjoint predicates from all outgoing, potentially overlapping TNFA transitions. 2955 2956 Params: state = SubsetState to create the predicates from 2957 Returns: List of disjoint predicates that can be used for a DFA state 2958 **********************************************************************************************/ 2959 predicate_t[] disjointPredicates(SubsetState state) 2960 { 2961 alias CharRange!(char_t) range_t; 2962 debug(tdfa) Stdout.formatln("disjointPredicates()"); 2963 2964 size_t num_marks; 2965 foreach ( t; state ) 2966 { 2967 // partitioning will consider lookbehind transitions, 2968 // st. lb-closure will not expand for transitions with a superset of the lb-predicate 2969 if ( t.predicate.type != predicate_t.Type.epsilon ) 2970 { 2971 debug(tdfa) Stdout.formatln("{}", t.predicate.toString); 2972 if ( marks_.length < num_marks+2*t.predicate.getInput().parts.length ) 2973 marks_.length = num_marks+2*t.predicate.getInput().parts.length; 2974 foreach ( p; t.predicate.getInput().parts ) { 2975 marks_[num_marks++] = Mark(p.l(), false); 2976 marks_[num_marks++] = Mark(p.r(), true); 2977 } 2978 } 2979 } 2980 2981 if ( num_marks <= 1 ) 2982 throw new Exception("disjointPredicates: No transitions in subset state"); 2983 2984 debug(tdfa) Stdout("\nsorting...").newline; 2985 // using built-in sort somtimes gives an AV in TypeInfo.swap 2986 quickSort(marks_[0 .. num_marks]); 2987 assert(!marks_[0].end); 2988 2989 debug(tdfa) 2990 { 2991 Stdout("\nsorted marks:\n"); 2992 bool first=true; 2993 foreach ( m; marks_[0 .. num_marks] ) 2994 { 2995 if ( first ) 2996 first = false; 2997 else 2998 Stdout(","); 2999 if ( m.c > 0x20 && m.c < 0x7f ) 3000 Stdout.format("{}{}", m.end?"e":"s", m.c); 3001 else 3002 Stdout.format("{}{:x}", m.end?"e":"s", cast(int)m.c); 3003 } 3004 Stdout.newline; 3005 } 3006 3007 size_t next, 3008 active = 1; 3009 char_t start = marks_[0].c, 3010 end; 3011 range_t[] disjoint = new range_t[num_marks/2+1]; 3012 3013 foreach ( m; marks_[1 .. num_marks] ) 3014 { 3015 if ( m.end ) 3016 { 3017 assert(active>0); 3018 --active; 3019 if ( m.c < start ) 3020 continue; 3021 end = m.c; 3022 // the next range cannot start at the same pos 3023 // because starts are sorted before endings 3024 if ( active > 0 ) 3025 ++m.c; 3026 } 3027 else 3028 { 3029 ++active; 3030 if ( active == 1 ) 3031 { 3032 // skip uncovered interval 3033 if ( m.c > start ) { 3034 start = m.c; 3035 continue; 3036 } 3037 end = m.c; 3038 ++m.c; 3039 } 3040 // skip range start if cursor already marks it 3041 else if ( m.c <= start ) 3042 continue; 3043 else 3044 end = m.c-1; 3045 } 3046 3047 // save range 3048 if ( disjoint.length <= next ) 3049 disjoint.length = disjoint.length*2; 3050 3051 disjoint[next].l_ = start; 3052 disjoint[next].r_ = end; 3053 ++next; 3054 3055 // advance cursor 3056 start = m.c; 3057 } 3058 disjoint.length = next; 3059 3060 // merge isolated ranges into sets of ranges 3061 // no range in a set may occur separated from the others in any predicate 3062 predicate_t[] preds; 3063 preds.length = 1; 3064 Lmerge: foreach ( r; disjoint ) 3065 { 3066 if ( preds[$-1].empty() ) 3067 preds[$-1].appendInput(r); 3068 else 3069 { 3070 // we can merge r into the current predicate if 3071 // pred contains r <=> pred contains all the other ranges 3072 foreach ( t; state ) 3073 { 3074 if ( t.predicate.type == predicate_t.Type.epsilon ) 3075 continue; 3076 3077 if ( t.predicate.getInput().contains(r) 3078 != t.predicate.getInput().contains(preds[$-1].getInput()) ) 3079 { 3080 preds.length = preds.length+1; 3081 break; 3082 } 3083 } 3084 preds[$-1].appendInput(r); 3085 } 3086 } 3087 3088 debug(tdfa) 3089 { 3090 Stdout("\ndisjoint ranges:\n"); 3091 first=true; 3092 foreach ( r; disjoint ) 3093 { 3094 if ( first ) 3095 first = false; 3096 else 3097 Stdout(","); 3098 Stdout.format("{}", r); 3099 } 3100 Stdout.newline; 3101 Stdout("\ndisjoint predicates:\n"); 3102 first=true; 3103 foreach ( ref p; preds ) 3104 { 3105 if ( first ) 3106 first = false; 3107 else 3108 Stdout(","); 3109 Stdout.format("{}", p.toString); 3110 } 3111 Stdout.newline; 3112 } 3113 3114 debug(tdfa) Stdout.formatln("disjointPredicates() end"); 3115 return preds; 3116 } 3117 3118 /* ******************************************************************************************** 3119 Finds all TNFA states that can be reached directly with the given predicate and creates 3120 a new SubsetState containing those target states. 3121 3122 Params: subst = SubsetState to start from 3123 pred = predicate that is matched against outgoing transitions 3124 Returns: SubsetState containing the reached target states 3125 **********************************************************************************************/ 3126 SubsetState reach(SubsetState subst, ref predicate_t pred) 3127 { 3128 // to handle the special case of overlapping consume and lookahead predicates, 3129 // we find the different intersecting predicate types 3130 bool have_consume, 3131 have_lookahead; 3132 foreach ( t; subst ) 3133 { 3134 if ( t.predicate.type != predicate_t.Type.consume && t.predicate.type != predicate_t.Type.lookahead ) 3135 continue; 3136 auto intpred = t.predicate.intersect(pred); 3137 if ( !intpred.empty() ) 3138 { 3139 if ( t.predicate.type == predicate_t.Type.consume ) 3140 have_consume = true; 3141 else if ( t.predicate.type == predicate_t.Type.lookahead ) 3142 have_lookahead = true; 3143 } 3144 } 3145 3146 // if there is consume/lookahead overlap, 3147 // lookahead predicates are handled first 3148 predicate_t.Type processed_type; 3149 if ( have_lookahead ) 3150 processed_type = predicate_t.Type.lookahead; 3151 else if ( have_consume ) 3152 processed_type = predicate_t.Type.consume; 3153 else { 3154 debug(tdfa) Stdout.formatln("\nERROR: reach found no consume/lookahead symbol for {} in \n{}", pred.toString, subst.toString); 3155 return null; 3156 } 3157 pred.type = processed_type; 3158 3159 // add destination states to new subsetstate 3160 SubsetState r = new SubsetState; 3161 foreach ( s; subst.elms ) 3162 { 3163 foreach ( t; s.nfa_state.transitions ) 3164 { 3165 if ( t.predicate.type != processed_type ) 3166 continue; 3167 auto intpred = t.predicate.intersect(pred); 3168 if ( !intpred.empty() ) { 3169 StateElement se = new StateElement; 3170 se.maxPriority = max(t.priority, s.maxPriority); 3171 se.lastPriority = t.priority; 3172 se.nfa_state = t.target; 3173 se.tags = s.tags; 3174 r.elms ~= se; 3175 } 3176 } 3177 } 3178 3179 // if we prioritized lookaheads, the states that may consume are also added to the new subset state 3180 // this behaviour is somewhat similar to an epsilon closure 3181 if ( have_lookahead && have_consume ) 3182 { 3183 foreach ( s; subst.elms ) 3184 { 3185 foreach ( t; s.nfa_state.transitions ) 3186 { 3187 if ( t.predicate.type != predicate_t.Type.consume ) 3188 continue; 3189 auto intpred = t.predicate.intersect(pred); 3190 if ( !intpred.empty() ) { 3191 r.elms ~= s; 3192 break; 3193 } 3194 } 3195 } 3196 } 3197 return r; 3198 } 3199 3200 /* ******************************************************************************************** 3201 Extends the given SubsetState with the states that are reached through lookbehind transitions. 3202 3203 Params: from = SubsetState to create the lookbehind closure for 3204 previous = predicate "from" was reached with 3205 Returns: SubsetState containing "from" and all states of it's lookbehind closure 3206 **********************************************************************************************/ 3207 SubsetState lookbehindClosure(SubsetState from, predicate_t pred) 3208 { 3209 List!(StateElement) stack = new List!(StateElement); 3210 StateElement[size_t] closure; 3211 3212 foreach ( e; from.elms ) 3213 { 3214 stack ~= e; 3215 closure[e.nfa_state.index] = e; 3216 } 3217 3218 while ( !stack.empty() ) 3219 { 3220 StateElement se = stack.tail.value; 3221 stack.pop(); 3222 foreach ( t; se.nfa_state.transitions ) 3223 { 3224 if ( t.predicate.type != predicate_t.Type.lookbehind ) 3225 continue; 3226 if ( t.predicate.intersect(pred).empty() ) 3227 continue; 3228 uint new_maxPri = max(t.priority, se.maxPriority); 3229 3230 StateElement* tmp = t.target.index in closure; 3231 if ( tmp !is null ) 3232 { 3233 // if higher prio (smaller value) exists, do not use this transition 3234 if ( tmp.maxPriority < new_maxPri ) { 3235 // debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}, continuing", t.target.index, tmp.maxPriority, new_maxPri); 3236 continue; 3237 } 3238 else if ( tmp.maxPriority == new_maxPri ) 3239 { 3240 // "equal lastPrio -> first-come-first-serve" 3241 // doesn't work for lexer - how to solve it properly? 3242 if ( tmp.lastPriority <= t.priority ) { 3243 // debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}, continuing", t.target.index, tmp.lastPriority, t.priority); 3244 continue; 3245 } 3246 // else 3247 // debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}", t.target.index, t.priority, tmp.lastPriority); 3248 } 3249 // else 3250 // debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}", t.target.index, new_maxPri, tmp.maxPriority); 3251 } 3252 3253 StateElement new_se = new StateElement; 3254 new_se.maxPriority = max(t.priority, se.maxPriority); 3255 new_se.lastPriority = t.priority; 3256 new_se.nfa_state = t.target; 3257 new_se.tags = se.tags; 3258 3259 closure[t.target.index] = new_se; 3260 stack ~= new_se; 3261 } 3262 } 3263 3264 SubsetState res = new SubsetState; 3265 res.elms = closure.values; 3266 return res; 3267 } 3268 3269 /* ******************************************************************************************** 3270 Generates the epsilon closure of the given subset state, creating tag map entries 3271 if tags are passed. Takes priorities into account, effectively realizing 3272 greediness and reluctancy. 3273 3274 Params: from = SubsetState to create the epsilon closure for 3275 previous = SubsetState "from" was reached from 3276 Returns: SubsetState containing "from" and all states of it's epsilon closure 3277 **********************************************************************************************/ 3278 SubsetState epsilonClosure(SubsetState from, SubsetState previous) 3279 { 3280 int firstFreeIndex=-1; 3281 foreach ( e; previous.elms ) 3282 { 3283 foreach ( ti; e.tags ) 3284 firstFreeIndex = max(firstFreeIndex, cast(int)ti); 3285 } 3286 ++firstFreeIndex; 3287 3288 List!(StateElement) stack = new List!(StateElement); 3289 StateElement[size_t] closure; 3290 3291 foreach ( e; from.elms ) 3292 { 3293 stack ~= e; 3294 closure[e.nfa_state.index] = e; 3295 } 3296 3297 while ( !stack.empty() ) 3298 { 3299 StateElement se = stack.tail.value; 3300 stack.pop(); 3301 foreach ( t; se.nfa_state.transitions ) 3302 { 3303 if ( t.predicate.type != predicate_t.Type.epsilon ) 3304 continue; 3305 // this is different from Ville Laurikari's algorithm, but it's crucial 3306 // to take the max (instead of t.priority) to make reluctant operators work 3307 uint new_maxPri = max(t.priority, se.maxPriority); 3308 3309 StateElement* tmp = t.target.index in closure; 3310 if ( tmp !is null ) 3311 { 3312 // if higher prio (smaller value) exists, do not use this transition 3313 if ( tmp.maxPriority < new_maxPri ) { 3314 // debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}, continuing", t.target.index, tmp.maxPriority, new_maxPri); 3315 continue; 3316 } 3317 else if ( tmp.maxPriority == new_maxPri ) 3318 { 3319 // "equal lastPrio -> first-come-first-serve" 3320 // doesn't work for lexer - how to solve it properly? 3321 if ( tmp.lastPriority <= t.priority ) { 3322 // debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}, continuing", t.target.index, tmp.lastPriority, t.priority); 3323 continue; 3324 } 3325 // else 3326 // debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}", t.target.index, t.priority, tmp.lastPriority); 3327 } 3328 // else 3329 // debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}", t.target.index, new_maxPri, tmp.maxPriority); 3330 } 3331 3332 auto new_se = new StateElement; 3333 new_se.maxPriority = new_maxPri; 3334 new_se.lastPriority = t.priority; 3335 new_se.nfa_state = t.target; 3336 3337 if ( t.tag > 0 ) 3338 { 3339 foreach ( k, v; se.tags ) 3340 new_se.tags[k] = v; 3341 new_se.tags[t.tag] = firstFreeIndex; 3342 } 3343 else 3344 new_se.tags = se.tags; 3345 3346 closure[t.target.index] = new_se; 3347 stack ~= new_se; 3348 } 3349 } 3350 3351 SubsetState res = new SubsetState; 3352 res.elms = closure.values; 3353 3354 // optimize tag usage 3355 // all we need to do is to check whether the largest tag-index from the 3356 // previous state is actually used in the new state and move all tags with 3357 // firstFreeIndex down by one if not, but only if firstFreeIndex is not 0 3358 if ( firstFreeIndex > 0 ) 3359 { 3360 bool seenLastUsedIndex = false; 3361 sluiLoop: foreach ( e; res.elms ) 3362 { 3363 foreach ( i; e.tags ) 3364 { 3365 if ( i == firstFreeIndex-1 ) { 3366 seenLastUsedIndex = true; 3367 break sluiLoop; 3368 } 3369 } 3370 } 3371 if ( !seenLastUsedIndex ) 3372 { 3373 foreach ( e; res.elms ) 3374 { 3375 foreach ( ref i; e.tags ) 3376 { 3377 // mark index by making it negative 3378 // to signal that it can be decremented 3379 // after it has been detected to be a newly used index 3380 if ( i == firstFreeIndex ) 3381 i = -firstFreeIndex; 3382 } 3383 } 3384 } 3385 } 3386 3387 return res; 3388 } 3389 3390 /* ******************************************************************************************** 3391 Tries to create commands that reorder the tag map of "previous", such that "from" becomes 3392 tag-wise identical to "to". If successful, these commands are added to "trans". This 3393 is done for state re-use. 3394 3395 Params: from = SubsetState to check for tag-wise equality to "to" 3396 to = existing SubsetState that we want to re-use 3397 previous = SubsetState we're coming from 3398 trans = Transition we went along 3399 Returns: true if "from" is tag-wise identical to "to" and the necessary commands have 3400 been added to "trans" 3401 **********************************************************************************************/ 3402 bool reorderTagIndeces(SubsetState from, SubsetState to, SubsetState previous, Transition trans) 3403 { 3404 if ( from.elms.length != to.elms.length ) 3405 return false; 3406 3407 bool[Command] 3408 cmds; 3409 uint[TagIndex] 3410 reorderedIndeces; 3411 StateElement[TagIndex] 3412 reordered_elements; 3413 3414 Louter: foreach ( fe; from.elms ) 3415 { 3416 foreach ( te; to.elms ) 3417 { 3418 if ( te.nfa_state.index != fe.nfa_state.index ) 3419 continue; 3420 if ( fe.tags.length != te.tags.length ) 3421 return false; 3422 foreach ( tag, findex; fe.tags ) 3423 { 3424 if ( (tag in te.tags) is null ) 3425 return false; 3426 3427 TagIndex ti; 3428 ti.tag = tag; 3429 ti.index = te.tags[tag]; 3430 3431 // apply priority for conflicting tag indeces 3432 if ( (ti in reorderedIndeces) !is null ) 3433 { 3434 auto rse = reordered_elements[ti]; 3435 auto ri = reorderedIndeces[ti]; 3436 if ( ri != findex 3437 && ( rse.maxPriority < fe.maxPriority 3438 || rse.maxPriority == fe.maxPriority 3439 && rse.lastPriority <= fe.lastPriority ) 3440 ) 3441 continue; 3442 Command cmd; 3443 cmd.src = registerFromTagIndex(tag,ri); 3444 cmd.dst = registerFromTagIndex(tag,te.tags[tag]); 3445 cmds.remove(cmd); 3446 } 3447 // if target index differs, create reordering command 3448 if ( te.tags[tag] != findex ) 3449 { 3450 Command cmd; 3451 cmd.src = registerFromTagIndex(tag,findex); 3452 cmd.dst = registerFromTagIndex(tag,te.tags[tag]); 3453 cmds[cmd] = true; 3454 } 3455 3456 reorderedIndeces[ti] = findex; 3457 reordered_elements[ti] = fe; 3458 } 3459 continue Louter; 3460 } 3461 return false; 3462 } 3463 3464 debug(tdfa) { 3465 Stdout.formatln("\nreorder {} to {}\n", from.toString, to.dfa_state.index); 3466 } 3467 3468 trans.commands ~= cmds.keys; 3469 return true; 3470 } 3471 3472 /* ******************************************************************************************** 3473 Generate tag map initialization commands for start state. 3474 **********************************************************************************************/ 3475 void generateInitializers(SubsetState start) 3476 { 3477 uint[uint] cmds; 3478 foreach ( nds; start.elms ) 3479 { 3480 foreach ( k, v; nds.tags ) 3481 cmds[k] = v; 3482 } 3483 3484 foreach ( k, v; cmds ) { 3485 Command cmd; 3486 cmd.dst = registerFromTagIndex(k,v); 3487 cmd.src = CURRENT_POSITION_REGISTER; 3488 initializer ~= cmd; 3489 } 3490 } 3491 3492 /* ******************************************************************************************** 3493 Generates finisher commands for accepting states. 3494 **********************************************************************************************/ 3495 void generateFinishers(SubsetState r) 3496 { 3497 // if at least one of the TNFA states accepts, 3498 // set the finishers from active tags in increasing priority 3499 StateElement[] sorted_elms = r.elms.dup.sort; 3500 bool reluctant = false; 3501 foreach ( se; sorted_elms ) { 3502 debug (Finishers) Stdout.formatln("Finisher: {}", se); 3503 if ( se.nfa_state.accept ) 3504 { 3505 r.dfa_state.accept = true; 3506 3507 // Knowing that we're looking at an epsilon closure with an accepting 3508 // state, we look at the involved transitions - if the path from the 3509 // nfa state in the set with the highest incoming priority (last in 3510 // sorted_elms list) to the accepting nfa state is via the highest 3511 // priority transitions, and they are all epsilon transitions, this 3512 // suggests we're looking at a regex ending with a reluctant pattern. 3513 // The NFA->DFA transformation will most likely extend the automata 3514 // further, but we want the matching to stop here. 3515 // NOTE: The grounds for choosing the last element in sorted_elms 3516 // are somewhat weak (empirical testing), but sofar no new 3517 // regressions have been discovered. larsivi 20090827 3518 TNFATransition!(char_t) highestPriTrans; 3519 if (!(sorted_elms[$-1] && sorted_elms[$-1].nfa_state && 3520 sorted_elms[$-1].nfa_state)) 3521 throw new Exception ("Something is NULL that is expected to 3522 be non-null", __FILE__, __LINE__); 3523 3524 foreach ( trans; sorted_elms[$-1].nfa_state.transitions ) { 3525 if (trans.canFinish()) { 3526 r.dfa_state.reluctant = true; 3527 break; 3528 } 3529 } 3530 3531 bool[uint] finished_tags; 3532 { 3533 foreach ( t, i; se.tags ) 3534 if ( i > 0 && !(t in finished_tags) ) { 3535 finished_tags[t] = true; 3536 Command cmd; 3537 cmd.dst = registerFromTagIndex(t, 0); 3538 cmd.src = registerFromTagIndex(t, i); 3539 r.dfa_state.finishers ~= cmd; 3540 } 3541 } 3542 } 3543 } 3544 } 3545 3546 /* ******************************************************************************************** 3547 Assumes that the command-lists are sorted and transitions are optimized 3548 **********************************************************************************************/ 3549 void minimizeDFA() 3550 { 3551 class DiffTable 3552 { 3553 this(size_t num) { 3554 diff_ = new bool[num*(num+1)/2]; 3555 } 3556 3557 ~this() { delete diff_; } 3558 3559 bool opCall(size_t a, size_t b) 3560 { 3561 if ( a < b ) 3562 return diff_[b*(b+1)/2+a]; 3563 return diff_[a*(a+1)/2+b]; 3564 } 3565 3566 void set(size_t a, size_t b) 3567 { 3568 if ( a < b ) 3569 diff_[b*(b+1)/2+a] = true; 3570 else 3571 diff_[a*(a+1)/2+b] = true; 3572 } 3573 3574 bool[] diff_; 3575 } 3576 3577 debug(tdfa) Stdout.formatln("Minimizing TDFA"); 3578 3579 scope diff = new DiffTable(states.length); 3580 bool new_diff = true; 3581 3582 while ( new_diff ) 3583 { 3584 new_diff = false; 3585 foreach ( i, a; states[0 .. $-1] ) 3586 { 3587 Linner: foreach ( j, b; states[i+1 .. $] ) 3588 { 3589 if ( diff(i, j+i+1) ) 3590 continue; 3591 3592 // assume optimized transitions 3593 if ( a.accept != b.accept || a.transitions.length != b.transitions.length ) { 3594 diff.set(i, j+i+1); 3595 new_diff = true; 3596 continue; 3597 } 3598 3599 if ( a.accept ) // b accepts too 3600 { 3601 // assume sorted finishers 3602 if ( a.finishers.length != b.finishers.length ) { 3603 diff.set(i, j+i+1); 3604 new_diff = true; 3605 continue; 3606 } 3607 foreach ( k, cmd; a.finishers ) 3608 { 3609 if ( cmd != b.finishers[k] ) { 3610 diff.set(i, j+i+1); 3611 new_diff = true; 3612 continue Linner; 3613 } 3614 } 3615 } 3616 3617 Ltrans: foreach ( ta; a.transitions ) 3618 { 3619 foreach ( tb; b.transitions ) 3620 { 3621 if ( ta.predicate.intersects(tb.predicate) ) 3622 { 3623 if ( diff(ta.target.index, tb.target.index) ) { 3624 diff.set(i, j+i+1); 3625 new_diff = true; 3626 continue Linner; 3627 } 3628 // assume sorted commands 3629 if ( ta.commands.length != tb.commands.length ) { 3630 diff.set(i, j+i+1); 3631 new_diff = true; 3632 continue Linner; 3633 } 3634 foreach ( k, cmd; ta.commands ) 3635 { 3636 if ( cmd != tb.commands[k] ) { 3637 diff.set(i, j+i+1); 3638 new_diff = true; 3639 continue Linner; 3640 } 3641 } 3642 continue Ltrans; 3643 } 3644 } 3645 3646 diff.set(i, j+i+1); 3647 new_diff = true; 3648 continue Linner; 3649 } 3650 3651 } 3652 } 3653 } 3654 3655 foreach ( i, a; states[0 .. $-1] ) 3656 { 3657 foreach ( j, b; states[i+1 .. $] ) 3658 { 3659 if ( !diff(i, j+i+1) ) 3660 { 3661 debug(tdfa) Stdout.formatln("State {} == {}", i, j+i+1); 3662 // remap b to a 3663 foreach ( k, c; states ) 3664 { 3665 foreach ( t; c.transitions ) 3666 { 3667 if ( t.target.index == j+i+1 ) 3668 t.target = a; 3669 } 3670 } 3671 } 3672 } 3673 } 3674 3675 } 3676 } 3677 import tango.text.Util; 3678 3679 /************************************************************************************************** 3680 Regular expression compiler and interpreter. 3681 **************************************************************************************************/ 3682 class RegExpT(char_t) 3683 { 3684 alias TDFA!(dchar) tdfa_t; 3685 alias TNFA!(dchar) tnfa_t; 3686 alias CharClass!(dchar) charclass_t; 3687 alias Predicate!(dchar) predicate_t; 3688 3689 /********************************************************************************************** 3690 Construct a RegExpT object. 3691 Params: 3692 pattern = Regular expression. 3693 Throws: RegExpException if there are any compilation errors. 3694 Example: 3695 Declare two variables and assign to them a Regex object: 3696 --- 3697 auto r = new Regex("pattern"); 3698 auto s = new Regex(r"p[1-5]\s*"); 3699 --- 3700 **********************************************************************************************/ 3701 this(const(char_t)[] pattern, const(char_t)[] attributes=null) 3702 { 3703 this(pattern, false, true); 3704 } 3705 3706 /** ditto */ 3707 this(const(char_t)[] pattern, bool swapMBS, bool unanchored, bool printNFA=false) 3708 { 3709 pattern_ = pattern; 3710 3711 debug(TangoRegex) {} 3712 else { 3713 static if ( is(char_t == dchar) ) { 3714 scope tnfa_t tnfa_ = new tnfa_t(pattern_); 3715 } 3716 else { 3717 scope tnfa_t tnfa_ = new tnfa_t(tango.text.convert.Utf.toString32(pattern_)); 3718 } 3719 } 3720 3721 tnfa_.swapMatchingBracketSyntax = swapMBS; 3722 tnfa_.parse(unanchored); 3723 if ( printNFA ) { 3724 debug(TangoRegex) Stdout.formatln("\nTNFA:"); 3725 debug(TangoRegex) tnfa_.print; 3726 } 3727 tdfa_ = new tdfa_t(tnfa_); 3728 registers_.length = tdfa_.num_regs(); 3729 } 3730 3731 /********************************************************************************************** 3732 Generate instance of Regex. 3733 Params: 3734 pattern = Regular expression. 3735 Throws: RegExpException if there are any compilation errors. 3736 Example: 3737 Declare two variables and assign to them a Regex object: 3738 --- 3739 auto r = Regex("pattern"); 3740 auto s = Regex(r"p[1-5]\s*"); 3741 --- 3742 **********************************************************************************************/ 3743 static RegExpT!(char_t) opCall(const(char_t)[] pattern, const(char_t)[] attributes = null) 3744 { 3745 return new RegExpT!(char_t)(pattern, attributes); 3746 } 3747 3748 /********************************************************************************************** 3749 Set up for start of foreach loop. 3750 Returns: Instance of RegExpT set up to search input. 3751 Example: 3752 --- 3753 import tango.io.Stdout; 3754 import tango.text.Regex; 3755 3756 void main() 3757 { 3758 foreach(m; Regex("ab").search("qwerabcabcababqwer")) 3759 Stdout.formatln("{}[{}]{}", m.pre, m.match(0), m.post); 3760 } 3761 // Prints: 3762 // qwer[ab]cabcababqwer 3763 // qwerabc[ab]cababqwer 3764 // qwerabcabc[ab]abqwer 3765 // qwerabcabcab[ab]qwer 3766 --- 3767 **********************************************************************************************/ 3768 public RegExpT!(char_t) search(const(char_t)[] input) 3769 { 3770 input_ = input; 3771 next_start_ = 0; 3772 last_start_ = 0; 3773 return this; 3774 } 3775 3776 /** ditto */ 3777 public int opApply(int delegate(ref RegExpT!(char_t)) dg) 3778 { 3779 int result; 3780 while ( !result && test() ) 3781 result = dg(this); 3782 return result; 3783 } 3784 3785 /********************************************************************************************** 3786 Search input for match. 3787 Returns: false for no match, true for match 3788 **********************************************************************************************/ 3789 bool test(const(char_t)[] input) 3790 { 3791 this.input_ = input; 3792 next_start_ = 0; 3793 last_start_ = 0; 3794 return test(); 3795 } 3796 3797 /********************************************************************************************** 3798 Pick up where last test(input) or test() left off, and search again. 3799 Returns: false for no match, true for match 3800 **********************************************************************************************/ 3801 bool test() 3802 { 3803 // initialize registers 3804 assert(registers_.length == tdfa_.num_regs()); 3805 registers_[0..$] = -1; 3806 foreach ( cmd; tdfa_.initializer ) { 3807 assert(cmd.src == tdfa_.CURRENT_POSITION_REGISTER); 3808 registers_[cmd.dst] = 0; 3809 } 3810 3811 tdfa_t.Transition* tp, tp_end; 3812 3813 // DFA execution 3814 auto inp = input_[next_start_ .. $]; 3815 auto s = tdfa_.start; 3816 3817 debug(TangoRegex) Stdout.formatln("{}{}: {}", s.accept?"*":" ", s.index, inp); 3818 LmainLoop: for ( size_t p, next_p; p < inp.length; ) 3819 { 3820 Lread_char: 3821 dchar c = cast(dchar)inp[p]; 3822 if ( c & 0x80 ) 3823 c = decode(inp, next_p); 3824 else 3825 next_p = p+1; 3826 3827 Lprocess_char: 3828 debug(TangoRegex) Stdout.formatln("{} (0x{:x})", c, cast(int)c); 3829 3830 tdfa_t.Transition t = void; 3831 switch ( s.mode ) 3832 { 3833 case s.Mode.LOOKUP: 3834 if ( c < s.LOOKUP_LENGTH ) 3835 { 3836 debug(TangoRegex) Stdout.formatln("lookup"); 3837 auto i = s.lookup[c]; 3838 if ( i == s.INVALID_STATE ) 3839 break LmainLoop; 3840 t = s.transitions[ i ]; 3841 if ( t.predicate.type != t.predicate.Type.consume ) 3842 goto Lno_consume; 3843 goto Lconsume; 3844 } 3845 break LmainLoop; 3846 3847 case s.Mode.MIXED: 3848 if ( c < s.LOOKUP_LENGTH ) 3849 { 3850 debug(TangoRegex) Stdout.formatln("mixed"); 3851 auto i = s.lookup[c]; 3852 if ( i == s.INVALID_STATE ) 3853 break; 3854 t = s.transitions[ i ]; 3855 if ( t.predicate.type != t.predicate.Type.consume ) 3856 goto Lno_consume; 3857 goto Lconsume; 3858 } 3859 break; 3860 3861 case s.Mode.GENERIC: 3862 default: 3863 break; 3864 } 3865 3866 Ltrans_loop: for ( tp = &s.generic_transitions[0], tp_end = tp+s.generic_transitions.length; 3867 tp < tp_end; ++tp ) 3868 { 3869 t = *tp; 3870 switch ( t.predicate.mode ) 3871 { 3872 // single char 3873 case predicate_t.MatchMode.single_char: 3874 debug(TangoRegex) Stdout.formatln("single char 0x{:x} == 0x{:x}", cast(int)c, cast(int)t.predicate.data_chr); 3875 if ( c != t.predicate.data_chr ) 3876 continue Ltrans_loop; 3877 goto Lconsume; 3878 case predicate_t.MatchMode.single_char_l: 3879 debug(TangoRegex) Stdout.formatln("single char 0x{:x} == 0x{:x}", cast(int)c, cast(int)t.predicate.data_chr); 3880 if ( c != t.predicate.data_chr ) 3881 continue Ltrans_loop; 3882 goto Lno_consume; 3883 3884 // bitmap 3885 case predicate_t.MatchMode.bitmap: 3886 debug(TangoRegex) Stdout.formatln("bitmap {}\n{}", c, t.predicate.toString); 3887 if ( c <= predicate_t.MAX_BITMAP_LENGTH && ( t.predicate.data_bmp[c/8] & (1 << (c&7)) ) ) 3888 goto Lconsume; 3889 continue Ltrans_loop; 3890 case predicate_t.MatchMode.bitmap_l: 3891 debug(TangoRegex) Stdout.formatln("bitmap {}\n{}", c, t.predicate.toString); 3892 if ( c <= predicate_t.MAX_BITMAP_LENGTH && ( t.predicate.data_bmp[c/8] & (1 << (c&7)) ) ) 3893 goto Lno_consume; 3894 continue Ltrans_loop; 3895 3896 // string search 3897 case predicate_t.MatchMode.string_search: 3898 debug(TangoRegex) Stdout.formatln("string search {} in {}", c, t.predicate.data_str); 3899 if ( indexOf(t.predicate.data_str.ptr, c, t.predicate.data_str.length) >= t.predicate.data_str.length ) 3900 continue Ltrans_loop; 3901 goto Lconsume; 3902 case predicate_t.MatchMode.string_search_l: 3903 debug(TangoRegex) Stdout.formatln("string search {} in {}", c, t.predicate.data_str); 3904 if ( indexOf(t.predicate.data_str.ptr, c, t.predicate.data_str.length) >= t.predicate.data_str.length ) 3905 continue Ltrans_loop; 3906 goto Lno_consume; 3907 3908 // generic 3909 case predicate_t.MatchMode.generic: 3910 debug(TangoRegex) Stdout.formatln("generic {}\n{}", c, t.predicate.toString); 3911 for ( auto cmp = t.predicate.data_str.ptr, 3912 cmpend = cmp + t.predicate.data_str.length; 3913 cmp < cmpend; ++cmp ) 3914 { 3915 if ( c < *cmp ) { 3916 ++cmp; 3917 continue; 3918 } 3919 ++cmp; 3920 if ( c <= *cmp ) 3921 goto Lconsume; 3922 } 3923 continue Ltrans_loop; 3924 case predicate_t.MatchMode.generic_l: 3925 debug(TangoRegex) Stdout.formatln("generic {}\n{}", c, t.predicate.toString); 3926 for ( auto cmp = t.predicate.data_str.ptr, 3927 cmpend = cmp + t.predicate.data_str.length; 3928 cmp < cmpend; ++cmp ) 3929 { 3930 if ( c < *cmp ) { 3931 ++cmp; 3932 continue; 3933 } 3934 ++cmp; 3935 if ( c <= *cmp ) 3936 goto Lno_consume; 3937 } 3938 continue Ltrans_loop; 3939 3940 default: 3941 assert(0); 3942 } 3943 3944 Lconsume: 3945 p = next_p; 3946 Lno_consume: 3947 3948 s = t.target; 3949 debug(TangoRegex) Stdout.formatln("{}{}: {}", s.accept?"*":" ", s.index, inp[p..$]); 3950 debug(TangoRegex) Stdout.formatln("{} commands", t.commands.length); 3951 3952 foreach ( cmd; t.commands ) 3953 { 3954 if ( cmd.src == tdfa_.CURRENT_POSITION_REGISTER ) 3955 registers_[cmd.dst] = cast(int)p; 3956 else 3957 registers_[cmd.dst] = registers_[cmd.src]; 3958 } 3959 3960 if (s.accept && s.reluctant) 3961 // Don't continue matching, the current find should be correct 3962 goto Laccept; 3963 3964 // if all input was consumed and we do not already accept, try to 3965 // add an explicit string/line end 3966 if ( p >= inp.length ) 3967 { 3968 if ( s.accept || c == 0 ) 3969 break; 3970 c = 0; 3971 goto Lprocess_char; 3972 } 3973 goto Lread_char; 3974 } 3975 // no applicable transition 3976 break; 3977 } 3978 3979 if ( s.accept ) 3980 { 3981 Laccept: 3982 foreach ( cmd; s.finishers ) { 3983 assert(cmd.src != tdfa_.CURRENT_POSITION_REGISTER); 3984 registers_[cmd.dst] = registers_[cmd.src]; 3985 } 3986 if ( registers_.length > 1 && registers_[1] >= 0 ) { 3987 last_start_ = next_start_; 3988 next_start_ += registers_[1]; 3989 } 3990 return true; 3991 } 3992 3993 return false; 3994 } 3995 3996 /********************************************************************************************** 3997 Return submatch with the given index. 3998 Params: 3999 index = 0 returns whole match, index > 0 returns submatch of bracket #index 4000 Returns: 4001 Slice of input for the requested submatch, or null if no such submatch exists. 4002 **********************************************************************************************/ 4003 const(char_t)[] match(uint index) 4004 { 4005 if ( index > tdfa_.num_tags ) 4006 return null; 4007 int start = cast(int)last_start_+registers_[index*2], 4008 end = cast(int)last_start_+registers_[index*2+1]; 4009 if ( start >= 0 && start < end && end <= input_.length ) 4010 return input_[start .. end]; 4011 return null; 4012 } 4013 4014 /** ditto */ 4015 const(char_t)[] opIndex(uint index) 4016 { 4017 return match(index); 4018 } 4019 4020 /********************************************************************************************** 4021 Return the slice of the input that precedes the matched substring. 4022 If no match was found, null is returned. 4023 **********************************************************************************************/ 4024 const(char_t)[] pre() 4025 { 4026 auto start = registers_[0]; 4027 if ( start < 0 ) 4028 return null; 4029 return input_[0 .. last_start_+start]; 4030 } 4031 4032 /********************************************************************************************** 4033 Return the slice of the input that follows the matched substring. 4034 If no match was found, the whole slice of the input that was processed in the last test. 4035 **********************************************************************************************/ 4036 const(char_t)[] post() 4037 { 4038 if ( registers_[1] >= 0 ) 4039 return input_[next_start_ .. $]; 4040 return input_[last_start_ .. $]; 4041 } 4042 4043 /********************************************************************************************** 4044 Splits the input at the matches of this regular expression into an array of slices. 4045 Example: 4046 --- 4047 import tango.io.Stdout; 4048 import tango.text.Regex; 4049 4050 void main() 4051 { 4052 auto strs = Regex("ab").split("abcabcababqwer"); 4053 foreach( s; strs ) 4054 Stdout.formatln("{}", s); 4055 } 4056 // Prints: 4057 // c 4058 // c 4059 // qwer 4060 --- 4061 **********************************************************************************************/ 4062 inout(char_t)[][] split(inout(char_t)[] input) 4063 { 4064 auto res = new inout(char_t)[][PREALLOC]; 4065 uint index; 4066 inout(char_t)[] tmp = input; 4067 4068 foreach ( r; search(input) ) 4069 { 4070 tmp = cast(inout(char_t)[])pre(); 4071 res[index++] = tmp[last_start_ .. $]; 4072 if ( index >= res.length ) 4073 res.length = res.length*2; 4074 tmp = cast(inout(char_t)[])post(); 4075 } 4076 4077 res[index++] = tmp; 4078 res.length = index; 4079 return res; 4080 } 4081 4082 /********************************************************************************************** 4083 Returns a copy of the input with all matches replaced by replacement. 4084 **********************************************************************************************/ 4085 char_t[] replaceAll(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null) 4086 { 4087 const(char_t)[] tmp = input; 4088 if ( output_buffer.length <= 0 ) 4089 output_buffer = new char_t[input.length+replacement.length]; 4090 output_buffer.length = 0; 4091 4092 foreach ( r; search(input) ) 4093 { 4094 tmp = pre(); 4095 if ( tmp.length > last_start_ ) 4096 output_buffer ~= tmp[last_start_ .. $]; 4097 output_buffer ~= replacement; 4098 tmp = post(); 4099 } 4100 output_buffer ~= tmp; 4101 return output_buffer; 4102 } 4103 4104 /********************************************************************************************** 4105 Returns a copy of the input with the last match replaced by replacement. 4106 **********************************************************************************************/ 4107 char_t[] replaceLast(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null) 4108 { 4109 const(char_t)[] tmp_pre, tmp_post; 4110 if ( output_buffer.length <= 0 ) 4111 output_buffer = new char_t[input.length+replacement.length]; 4112 output_buffer.length = 0; 4113 4114 foreach ( r; search(input) ) { 4115 tmp_pre = pre(); 4116 tmp_post = post(); 4117 } 4118 4119 if ( tmp_pre !is null || tmp_post !is null ) { 4120 output_buffer ~= tmp_pre; 4121 output_buffer ~= replacement; 4122 output_buffer ~= tmp_post; 4123 } 4124 else 4125 output_buffer ~= input; 4126 4127 return output_buffer; 4128 } 4129 4130 /********************************************************************************************** 4131 Returns a copy of the input with the first match replaced by replacement. 4132 **********************************************************************************************/ 4133 char_t[] replaceFirst(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null) 4134 { 4135 const(char_t)[] tmp = input; 4136 if ( output_buffer.length <= 0 ) 4137 output_buffer = new char_t[input.length+replacement.length]; 4138 output_buffer.length = 0; 4139 4140 if ( test(input) ) 4141 { 4142 tmp = pre(); 4143 if ( tmp.length > last_start_ ) 4144 output_buffer ~= tmp[last_start_ .. $]; 4145 output_buffer ~= replacement; 4146 tmp = post(); 4147 } 4148 output_buffer ~= tmp; 4149 return output_buffer; 4150 } 4151 4152 /********************************************************************************************** 4153 Calls dg for each match and replaces it with dg's return value. 4154 **********************************************************************************************/ 4155 char_t[] replaceAll(const(char_t)[] input, char_t[] delegate(RegExpT!(char_t)) dg, char_t[] output_buffer=null) 4156 { 4157 const(char_t)[] tmp = input; 4158 if ( output_buffer.length <= 0 ) 4159 output_buffer = new char_t[input.length]; 4160 output_buffer.length = 0; 4161 4162 foreach ( r; search(input) ) 4163 { 4164 tmp = pre(); 4165 if ( tmp.length > last_start_ ) 4166 output_buffer ~= tmp[last_start_ .. $]; 4167 output_buffer ~= dg(this); 4168 tmp = post(); 4169 } 4170 output_buffer ~= tmp; 4171 return output_buffer; 4172 } 4173 4174 /********************************************************************************************** 4175 Compiles the regular expression to D code. 4176 4177 NOTE : Remember to import this module (tango.text.Regex) in the module where you put the 4178 generated D code. 4179 4180 **********************************************************************************************/ 4181 // TODO: input-end special case 4182 const(char)[] compileToD(const(char)[] func_name = "match", bool lexer=false) 4183 { 4184 const(char)[] code; 4185 const(char)[] str_type; 4186 static if ( is(char_t == char) ) 4187 str_type = "const(char)[]"; 4188 static if ( is(char_t == wchar) ) 4189 str_type = "const(wchar)[]"; 4190 static if ( is(char_t == dchar) ) 4191 str_type = "const(dchar)[]"; 4192 4193 if ( lexer ) 4194 code = Format.convert("// {}\nbool {}({} input, out uint token, out {} match", pattern_, func_name, str_type, str_type); 4195 else { 4196 code = Format.convert("// {}\nbool match({} input", pattern_, str_type); 4197 code ~= Format.convert(", ref {}[] groups", str_type); 4198 } 4199 code ~= Format.convert(")\n{{\n uint s = {};", tdfa_.start.index); 4200 4201 uint num_vars = tdfa_.num_regs(); 4202 if ( num_vars > 0 ) 4203 { 4204 if ( lexer ) 4205 code ~= "\n static int "; 4206 else 4207 code ~= "\n int "; 4208 bool first = true; 4209 for ( int i = 0, used = 0; i < num_vars; ++i ) 4210 { 4211 bool hasInit = false; 4212 foreach ( cmd; tdfa_.initializer ) 4213 { 4214 if ( cmd.dst == i ) { 4215 hasInit = true; 4216 break; 4217 } 4218 } 4219 4220 if ( first ) 4221 first = false; 4222 else 4223 code ~= ", "; 4224 if ( used > 0 && used % 10 == 0 ) 4225 code ~= "\n "; 4226 ++used; 4227 code ~= Format.convert("r{}", i); 4228 4229 if ( hasInit ) 4230 code ~= "=0"; 4231 else 4232 code ~= "=-1"; 4233 } 4234 code ~= ";"; 4235 } 4236 4237 code ~= "\n\n for ( size_t p, next_p; p < input.length; )\n {"; 4238 code ~= "\n dchar c = cast(dchar)input[p];\n if ( c & 0x80 )\n decode(input, next_p);"; 4239 code ~= "\n else\n next_p = p+1;\n switch ( s )\n {"; 4240 4241 size_t[] finish_states; 4242 foreach ( s; tdfa_.states ) 4243 { 4244 code ~= Format.convert("\n case {}:", s.index); 4245 4246 if ( s.accept ) 4247 { 4248 finish_states ~= s.index; 4249 4250 tdfa_t.State target; 4251 foreach ( t; s.transitions ) 4252 { 4253 if ( target is null ) 4254 target = t.target; 4255 else if ( target !is t.target ) 4256 { 4257 target = null; 4258 break; 4259 } 4260 } 4261 if ( target !is null && target is s ) 4262 s.transitions = null; 4263 } 4264 4265 bool first_if=true; 4266 charclass_t cc, ccTest; 4267 4268 foreach ( t; s.transitions.sort ) 4269 { 4270 ccTest.add(t.predicate.getInput()); 4271 ccTest.optimize(); 4272 if ( t.predicate.getInput() < ccTest ) 4273 cc = t.predicate.getInput(); 4274 else 4275 cc = ccTest; 4276 4277 if ( first_if ) { 4278 code ~= "\n if ( "; 4279 first_if = false; 4280 } 4281 else 4282 code ~= "\n else if ( "; 4283 bool first_cond=true; 4284 foreach ( cr; cc.parts ) 4285 { 4286 if ( first_cond ) 4287 first_cond = false; 4288 else 4289 code ~= " || "; 4290 if ( cr.l() == cr.r() ) 4291 code ~= Format.convert("c == 0x{:x}", cast(int)cr.l()); 4292 else 4293 code ~= Format.convert("c >= 0x{:x} && c <= 0x{:x}", cast(int)cr.l(), cast(int)cr.r()); 4294 } 4295 code ~= Format.convert(" ) {{\n s = {};", t.target.index); 4296 4297 if ( t.predicate.type == typeof(t.predicate.type).consume ) 4298 code ~= "\n p = next_p;"; 4299 foreach ( cmd; t.commands ) 4300 code ~= compileCommand(cmd, " "); 4301 /* 4302 // if inp ends here and we do not already accept, try to add an explicit string/line end 4303 if ( p >= inp.length && !s.accept && c != 0 ) { 4304 c = 0; 4305 goto Lprocess_char; 4306 } 4307 */ 4308 code ~= "\n }"; 4309 } 4310 4311 if ( !first_if ) 4312 code ~= Format.convert( 4313 "\n else\n {};\n break;", 4314 s.accept?Format.convert("goto finish{}", s.index):"return false" 4315 ); 4316 else 4317 code ~= Format.convert("\n {};", s.accept?Format.convert("goto finish{}", s.index):"return false"); 4318 } 4319 4320 // create finisher groups 4321 size_t[][size_t] finisherGroup; 4322 foreach ( fs; finish_states ) 4323 { 4324 // check if finisher group with same commands exists 4325 bool haveFinisher = false; 4326 foreach ( fg; finisherGroup.keys ) 4327 { 4328 bool equalCommands = false; 4329 if ( tdfa_.states[fs].finishers.length == tdfa_.states[fg].finishers.length ) 4330 { 4331 equalCommands = true; 4332 foreach ( i, cmd; tdfa_.states[fs].finishers ) 4333 { 4334 if ( cmd != tdfa_.states[fg].finishers[i] ) { 4335 equalCommands = false; 4336 break; 4337 } 4338 } 4339 } 4340 if ( equalCommands ) { 4341 // use existing group for this state 4342 finisherGroup[fg] ~= fs; 4343 haveFinisher = true; 4344 break; 4345 } 4346 } 4347 // create new group 4348 if ( !haveFinisher ) 4349 finisherGroup[fs] ~= fs; 4350 } 4351 4352 4353 code ~= "\n default:\n assert(0);\n }\n }\n\n switch ( s )\n {"; 4354 foreach ( group, states; finisherGroup ) 4355 { 4356 foreach ( s; states ) 4357 code ~= Format.convert("\n case {}: finish{}:", s, s); 4358 4359 foreach ( cmd; tdfa_.states[group].finishers ) 4360 { 4361 if ( lexer ) 4362 { 4363 if ( tdfa_.states[group].finishers.length > 1 ) 4364 throw new RegExpException("Lexer error: more than one finisher in flm lexer!"); 4365 if ( cmd.dst % 2 == 0 || cmd.dst >= tdfa_.num_tags ) 4366 throw new RegExpException(Format.convert("Lexer error: unexpected dst register {} in flm lexer!", cmd.dst).idup); 4367 code ~= Format.convert("\n match = input[0 .. r{}];\n token = {};", cmd.src, cmd.dst/2); 4368 } 4369 else 4370 code ~= compileCommand(cmd, " "); 4371 } 4372 4373 code ~= "\n break;"; 4374 } 4375 code ~= "\n default:\n return false;\n }\n"; 4376 4377 if ( !lexer ) 4378 { 4379 code ~= Format.convert("\n groups.length = {};", tdfa_.num_tags/2); 4380 for ( int i = 0; i < tdfa_.num_tags/2; ++i ) 4381 code ~= Format.convert("\n if ( r{} > -1 && r{} > -1 )\n groups[{}] = input[r{} .. r{}];", 2*i, 2*i+1, i, 2*i, 2*i+1); 4382 } 4383 4384 code ~= "\n return true;\n}"; 4385 return code; 4386 } 4387 4388 /********************************************************************************************* 4389 Get the pattern with which this regex was constructed. 4390 **********************************************************************************************/ 4391 public const(char_t)[] pattern() 4392 { 4393 return pattern_; 4394 } 4395 4396 /********************************************************************************************* 4397 Get the tag count of this regex, representing the number of sub-matches. 4398 4399 This value is the max valid value for match/opIndex. 4400 **********************************************************************************************/ 4401 uint tagCount() 4402 { 4403 return tdfa_.num_tags; 4404 } 4405 4406 int[] registers_; 4407 size_t next_start_, 4408 last_start_; 4409 4410 debug(TangoRegex) tnfa_t tnfa_; 4411 tdfa_t tdfa_; 4412 private: 4413 enum int PREALLOC = 16; 4414 const(char_t)[] input_, 4415 pattern_; 4416 4417 const(char)[] compileCommand(tdfa_t.Command cmd, const(char)[] indent) 4418 { 4419 const(char)[] code, 4420 dst; 4421 code ~= Format.convert("\n{}r{} = ", indent, cmd.dst); 4422 if ( cmd.src == tdfa_.CURRENT_POSITION_REGISTER ) 4423 code ~= "p;"; 4424 else 4425 code ~= Format.convert("r{};", cmd.src); 4426 return code; 4427 } 4428 } 4429 4430 alias RegExpT!(char) Regex; 4431 4432 debug(utf) import tango.stdc.stdio; 4433 // the following block is stolen from phobos. 4434 // the copyright notice applies for this block only. 4435 /* 4436 * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com 4437 * Written by Walter Bright 4438 * 4439 * This software is provided 'as-is', without any express or implied 4440 * warranty. In no event will the authors be held liable for any damages 4441 * arising from the use of this software. 4442 * 4443 * Permission is granted to anyone to use this software for any purpose, 4444 * including commercial applications, and to alter it and redistribute it 4445 * freely, subject to the following restrictions: 4446 * 4447 * o The origin of this software must not be misrepresented; you must not 4448 * claim that you wrote the original software. If you use this software 4449 * in a product, an acknowledgment in the product documentation would be 4450 * appreciated but is not required. 4451 * o Altered source versions must be plainly marked as such, and must not 4452 * be misrepresented as being the original software. 4453 * o This notice may not be removed or altered from any source 4454 * distribution. 4455 */ 4456 4457 class UtfException : Exception 4458 { 4459 size_t idx; /// index in string of where error occurred 4460 4461 this(immutable(char)[] s, size_t i) 4462 { 4463 idx = i; 4464 super(s); 4465 } 4466 } 4467 4468 bool isValidDchar(dchar c) 4469 { 4470 /* Note: FFFE and FFFF are specifically permitted by the 4471 * Unicode standard for application internal use, but are not 4472 * allowed for interchange. 4473 * (thanks to Arcane Jill) 4474 */ 4475 4476 return c < 0xD800 || 4477 (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/); 4478 } 4479 4480 /* ************* 4481 * Decodes and returns character starting at s[idx]. idx is advanced past the 4482 * decoded character. If the character is not well formed, a UtfException is 4483 * thrown and idx remains unchanged. 4484 */ 4485 4486 dchar decode(const(char)[] s, ref size_t idx) 4487 { 4488 size_t len = s.length; 4489 dchar V; 4490 size_t i = idx; 4491 char u = s[i]; 4492 4493 if (u & 0x80) 4494 { uint n; 4495 char u2; 4496 4497 /* The following encodings are valid, except for the 5 and 6 byte 4498 * combinations: 4499 * 0xxxxxxx 4500 * 110xxxxx 10xxxxxx 4501 * 1110xxxx 10xxxxxx 10xxxxxx 4502 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4503 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 4504 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 4505 */ 4506 for (n = 1; ; n++) 4507 { 4508 if (n > 4) 4509 goto Lerr; // only do the first 4 of 6 encodings 4510 if (((u << n) & 0x80) == 0) 4511 { 4512 if (n == 1) 4513 goto Lerr; 4514 break; 4515 } 4516 } 4517 4518 // Pick off (7 - n) significant bits of B from first byte of octet 4519 V = cast(dchar)(u & ((1 << (7 - n)) - 1)); 4520 4521 if (i + (n - 1) >= len) 4522 goto Lerr; // off end of string 4523 4524 /* The following combinations are overlong, and illegal: 4525 * 1100000x (10xxxxxx) 4526 * 11100000 100xxxxx (10xxxxxx) 4527 * 11110000 1000xxxx (10xxxxxx 10xxxxxx) 4528 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) 4529 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) 4530 */ 4531 u2 = s[i + 1]; 4532 if ((u & 0xFE) == 0xC0 || 4533 (u == 0xE0 && (u2 & 0xE0) == 0x80) || 4534 (u == 0xF0 && (u2 & 0xF0) == 0x80) || 4535 (u == 0xF8 && (u2 & 0xF8) == 0x80) || 4536 (u == 0xFC && (u2 & 0xFC) == 0x80)) 4537 goto Lerr; // overlong combination 4538 4539 for (uint j = 1; j != n; j++) 4540 { 4541 u = s[i + j]; 4542 if ((u & 0xC0) != 0x80) 4543 goto Lerr; // trailing bytes are 10xxxxxx 4544 V = (V << 6) | (u & 0x3F); 4545 } 4546 if (!isValidDchar(V)) 4547 goto Lerr; 4548 i += n; 4549 } 4550 else 4551 { 4552 V = cast(dchar) u; 4553 i++; 4554 } 4555 4556 idx = i; 4557 return V; 4558 4559 Lerr: 4560 throw new Exception("invalid UTF-8 sequence"); 4561 } 4562 4563 /* ditto */ 4564 4565 dchar decode(const(wchar)[] s, ref size_t idx) 4566 in 4567 { 4568 assert(idx >= 0 && idx < s.length); 4569 } 4570 out (result) 4571 { 4572 assert(isValidDchar(result)); 4573 } 4574 body 4575 { 4576 const(char)[] msg; 4577 dchar V; 4578 size_t i = idx; 4579 uint u = s[i]; 4580 4581 if (u & ~0x7F) 4582 { if (u >= 0xD800 && u <= 0xDBFF) 4583 { uint u2; 4584 4585 if (i + 1 == s.length) 4586 { msg = "surrogate UTF-16 high value past end of string"; 4587 goto Lerr; 4588 } 4589 u2 = s[i + 1]; 4590 if (u2 < 0xDC00 || u2 > 0xDFFF) 4591 { msg = "surrogate UTF-16 low value out of range"; 4592 goto Lerr; 4593 } 4594 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); 4595 i += 2; 4596 } 4597 else if (u >= 0xDC00 && u <= 0xDFFF) 4598 { msg = "unpaired surrogate UTF-16 value"; 4599 goto Lerr; 4600 } 4601 else if (u == 0xFFFE || u == 0xFFFF) 4602 { msg = "illegal UTF-16 value"; 4603 goto Lerr; 4604 } 4605 else 4606 i++; 4607 } 4608 else 4609 { 4610 i++; 4611 } 4612 4613 idx = i; 4614 return cast(dchar)u; 4615 4616 Lerr: 4617 throw new UtfException(msg.idup, i); 4618 } 4619 4620 /* ditto */ 4621 4622 dchar decode(const(dchar)[] s, ref size_t idx) 4623 in 4624 { 4625 assert(idx >= 0 && idx < s.length); 4626 } 4627 body 4628 { 4629 size_t i = idx; 4630 dchar c = s[i]; 4631 4632 if (!isValidDchar(c)) 4633 goto Lerr; 4634 idx = i + 1; 4635 return c; 4636 4637 Lerr: 4638 throw new UtfException("5invalid UTF-32 value", i); 4639 } 4640 4641 4642 4643 /* =================== Encode ======================= */ 4644 4645 /* ***************************** 4646 * Encodes character c and appends it to array s[]. 4647 */ 4648 4649 void encode(ref const(char)[] s, dchar c) 4650 in 4651 { 4652 assert(isValidDchar(c)); 4653 } 4654 body 4655 { 4656 const(char)[] r = s; 4657 4658 if (c <= 0x7F) 4659 { 4660 r ~= cast(char) c; 4661 } 4662 else 4663 { 4664 char[4] buf; 4665 uint L; 4666 4667 if (c <= 0x7FF) 4668 { 4669 buf[0] = cast(char)(0xC0 | (c >> 6)); 4670 buf[1] = cast(char)(0x80 | (c & 0x3F)); 4671 L = 2; 4672 } 4673 else if (c <= 0xFFFF) 4674 { 4675 buf[0] = cast(char)(0xE0 | (c >> 12)); 4676 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 4677 buf[2] = cast(char)(0x80 | (c & 0x3F)); 4678 L = 3; 4679 } 4680 else if (c <= 0x10FFFF) 4681 { 4682 buf[0] = cast(char)(0xF0 | (c >> 18)); 4683 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 4684 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 4685 buf[3] = cast(char)(0x80 | (c & 0x3F)); 4686 L = 4; 4687 } 4688 else 4689 { 4690 assert(0); 4691 } 4692 r ~= buf[0 .. L]; 4693 } 4694 s = r; 4695 } 4696 4697 /* ditto */ 4698 4699 void encode(ref const(wchar)[] s, dchar c) 4700 in 4701 { 4702 assert(isValidDchar(c)); 4703 } 4704 body 4705 { 4706 const(wchar)[] r = s; 4707 4708 if (c <= 0xFFFF) 4709 { 4710 r ~= cast(wchar) c; 4711 } 4712 else 4713 { 4714 wchar[2] buf; 4715 4716 buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 4717 buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00); 4718 r ~= buf; 4719 } 4720 s = r; 4721 } 4722 4723 /* ditto */ 4724 4725 void encode(ref const(dchar)[] s, dchar c) 4726 in 4727 { 4728 assert(isValidDchar(c)); 4729 } 4730 body 4731 { 4732 s ~= c; 4733 } 4734 4735 debug(UnitTest) 4736 { 4737 unittest 4738 { 4739 // match a fixed string 4740 Regex r; 4741 r = new Regex("abc"); 4742 assert(r.test("abc123")); 4743 assert(r.test("feabc123")); 4744 assert(!r.test("abe123")); 4745 4746 // match a non-ASCII string 4747 r = new Regex("☃"); 4748 assert(r.test("a☃c123")); 4749 4750 // capture 4751 r = new Regex("☃(c)"); 4752 assert(r.test("a☃c123")); 4753 assert(r.match(1) == "c"); 4754 4755 // dot 4756 r = new Regex(".."); 4757 assert(r.test("a☃c123")); 4758 4759 // dot capture 4760 r = new Regex("(..)"); 4761 assert(r.test("a☃c123")); 4762 assert(r.match(1) == "a☃"); 4763 4764 4765 4766 // two captures 4767 r = new Regex("(.)(.)"); 4768 assert(r.test("a☃c123")); 4769 assert(r.match(1) == "a"); 4770 assert(r.match(2) == "☃"); 4771 4772 // multiple letters 4773 r = new Regex(".*(e+).*"); 4774 assert(r.test("aeeeeec123")); 4775 assert(r.match(1) == "eeeee", "Expected: eeeee Got: " ~ r.match(1)); 4776 assert(!r.test("abfua")); 4777 4778 // multiple snowmen 4779 r = new Regex(".*(☃+).*"); 4780 assert(r.test("a☃☃☃☃☃c123")); 4781 assert(r.match(1) == "☃☃☃☃☃", "Expected: ☃☃☃☃☃ Got: " ~ r.match(1)); 4782 assert(!r.test("abeua")); 4783 4784 /** 4785 "*" quantifier bug. In "(☃+)" pattern test ((r.match(0) == "☃☃☃☃☃")&&(r.match(1) == "☃☃☃☃☃")) has passed. 4786 */ 4787 // multiple snowmen 4788 r = new Regex("(☃+)"); 4789 assert(r.test("a☃☃☃☃☃c123")); 4790 assert(r.match(0) == "☃☃☃☃☃"); 4791 assert(r.match(1) == "☃☃☃☃☃"); 4792 assert(r.test("a☃beua")); 4793 } 4794 4795 debug(RegexTestOnly) 4796 { 4797 import tango.io.Stdout; 4798 void main() { Stdout("All tests pass").newline(); } 4799 } 4800 }