tango.text.Regex source code

1 /*******************************************************************************
2 
3     copyright:      Copyright (c) 2007-2008 Jascha Wetzel. All rights reserved.
4 
5     license:        BSD style: $(LICENSE)
6 
7     version:        Initial release: Jan 2008
8 
9     authors:        Jascha Wetzel
10 
11     This is a regular expression compiler and interpreter based on the Tagged NFA/DFA method.
12 
13     The Regex class is not thread safe
14 
15     See <a href="http://en.wikipedia.org/wiki/Regular_expression">Wikpedia's article on regular expressions</a>
16     for details on regular expressions in general.
17 
18     The used method implies, that the expressions are <i>regular</i>, in the way language theory defines it,
19     as opposed to what &quot;regular expression&quot; means in most implementations
20     (e.g. PCRE or those from the standard libraries of Perl, Java or Python).
21     The advantage of this method is it's performance, it's disadvantage is the inability to realize some features
22     that Perl-like regular expressions have (e.g. back-references).
23     See <a href="http://swtch.com/~rsc/regexp/regexp1.html">&quot;Regular Expression Matching Can Be Simple And Fast&quot;</a>
24     for details on the differences.
25 
26     The time for matching a regular expression against an input string of length N is in O(M*N), where M depends on the
27     number of matching brackets and the complexity of the expression. That is, M is constant wrt. the input
28     and therefore matching is a linear-time process.
29 
30     The syntax of a regular expressions is as follows.
31     <i>X</i> and <i>Y</i> stand for an arbitrary regular expression.
32 
33     <table border=1 cellspacing=0 cellpadding=5>
34     <caption>Operators</caption>
35     $(TR $(TD X|Y) $(TD alternation, i.e. X or Y) )
36     $(TR $(TD (X)) $(TD matching brackets - creates a sub-match) )
37     $(TR $(TD (?X)) $(TD non-matching brackets - only groups X, no sub-match is created) )
38     $(TR $(TD [Z]) $(TD character class specification, Z is a string of characters or character ranges, e.g. [a-zA-Z0-9_.\-]) )
39     $(TR $(TD [^Z]) $(TD negated character class specification) )
40     $(TR $(TD &lt;X) $(TD lookbehind, X may be a single character or a character class) )
41     $(TR $(TD &gt;X) $(TD lookahead, X may be a single character or a character class) )
42     $(TR $(TD ^) $(TD start of input or start of line) )
43     $(TR $(TD $) $(TD end of input or end of line) )
44     $(TR $(TD \b) $(TD start or end of word, equals (?&lt;\s&gt;\S|&lt;\S&gt;\s)) )
45     $(TR $(TD \B) $(TD opposite of \b, equals (?&lt;\S&gt;\S|&lt;\s&gt;\s)) )
46     </table>
47 
48     <table border=1 cellspacing=0 cellpadding=5>
49     <caption>Quantifiers</caption>
50     $(TR $(TD X?) $(TD zero or one) )
51     $(TR $(TD X*) $(TD zero or more) )
52     $(TR $(TD X+) $(TD one or more) )
53     $(TR $(TD X{n,m}) $(TD at least n, at most m instances of X.<br>If n is missing, it's set to 0.<br>If m is missing, it is set to infinity.) )
54     $(TR $(TD X??) $(TD non-greedy version of the above operators) )
55     $(TR $(TD X*?) $(TD see above) )
56     $(TR $(TD X+?) $(TD see above) )
57     $(TR $(TD X{n,m}?) $(TD see above) )
58     </table>
59 
60     <table border=1 cellspacing=0 cellpadding=5>
61     <caption>Pre-defined character classes</caption>
62     $(TR $(TD .) $(TD any printable character) )
63     $(TR $(TD \s) $(TD whitespace) )
64     $(TR $(TD \S) $(TD non-whitespace) )
65     $(TR $(TD \w) $(TD alpha-numeric characters or underscore) )
66     $(TR $(TD \W) $(TD opposite of \w) )
67     $(TR $(TD \d) $(TD digits) )
68     $(TR $(TD \D) $(TD non-digit) )
69     </table>
70 
71     Note that "alphanumeric" only applies to Latin-1.
72 *******************************************************************************/
73 module tango.text.Regex;
74 
75 static import tango.core.Array;
76 
77 static import tango.text.convert.Utf;
78 
79 debug(TangoRegex) import tango.io.Stdout;
80 
81 /* *****************************************************************************
82     A simple pair
83 *******************************************************************************/
84 private struct Pair(T)
85 {
86     static Pair opCall(T a, T b)
87     {
88         Pair p;
89         p.a = a;
90         p.b = b;
91         return p;
92     }
93 
94     union
95     {
96         struct {
97             T first, second;
98         }
99         struct {
100             T a, b;
101         }
102     }
103 }
104 
105 /* *****************************************************************************
106     Double linked list
107 *******************************************************************************/
108 private class List(T)
109 {
110     class Element
111     {
112         T value;
113         Element prev,
114                 next;
115 
116         this(T v)
117         {
118             value = v;
119         }
120     }
121 
122     size_t  len;
123     Element head,
124             tail;
125 
126     List // opCatAssign(T v)
127 	    opOpAssign(string op)(T v) if (op == "~")
128     {
129         if ( tail is null )
130             head = tail = new Element(v);
131         else {
132             tail.next = new Element(v);
133             tail.next.prev = tail;
134             tail = tail.next;
135         }
136         ++len;
137         return this;
138     }
139 
140     List insertAfter(T w, T v)
141     {
142         foreach ( e; &this.elements )
143         {
144             if ( e.value is w )
145                 return insertAfter(e, v);
146         }
147         return null;
148     }
149 
150     List insertAfter(Element e, T v)
151     {
152         auto tmp = new Element(v);
153         tmp.prev = e;
154         tmp.next = e.next;
155         e.next.prev = tmp;
156         e.next = tmp;
157         if ( e is tail )
158             tail = tmp;
159         ++len;
160         return this;
161     }
162 
163     List // opCatAssign(List l)
164 	    opOpAssign(string op)(List l) if (op == "~")
165     {
166         if ( l.empty() )
167             return this;
168         if ( tail is null ) {
169             head = l.head;
170             tail = l.tail;
171         }
172         else {
173             tail.next = l.head;
174             tail.next.prev = tail;
175             tail = l.tail;
176         }
177         len += l.len;
178         return this;
179     }
180 
181     List pushFront(T v)
182     {
183         if ( head is null )
184             head = tail = new Element(v);
185         else
186         {
187             head.prev = new Element(v);
188             head.prev.next = head;
189             head = head.prev;
190         }
191         ++len;
192         return this;
193     }
194 
195     List insertBefore(T w, T v)
196     {
197         foreach ( e; &this.elements )
198         {
199             if ( e.value is w )
200                 return insertBefore(e, v);
201         }
202         return null;
203     }
204 
205     List insertBefore(Element e, T v)
206     {
207         auto tmp = new Element(v);
208         tmp.prev = e.prev;
209         tmp.next = e;
210         e.prev.next = tmp;
211         e.prev = tmp;
212         if ( e is head )
213             head = tmp;
214         ++len;
215         return this;
216     }
217 
218     List pushFront(List l)
219     {
220         if ( l.empty() )
221             return this;
222         if ( head is null ) {
223             head = l.head;
224             tail = l.tail;
225         }
226         else {
227             head.prev = l.tail;
228             head.prev.next = head;
229             head = l.head;
230         }
231         len += l.len;
232         return this;
233     }
234 
235     size_t length()
236     {
237         return len;
238     }
239 
240     bool empty()
241     {
242         return head is null;
243     }
244 
245     void clear()
246     {
247         head = null;
248         tail = null;
249         len = 0;
250     }
251 
252     void pop()
253     {
254         remove(tail);
255     }
256 
257     void remove(Element e)
258     {
259         if ( e is null )
260             return;
261         if ( e.prev is null )
262             head = e.next;
263         else
264             e.prev.next = e.next;
265         if ( e.next is null )
266             tail = e.prev;
267         else
268             e.next.prev = e.prev;
269         --len;
270     }
271 
272     int elements(int delegate(ref Element) dg)
273     {
274         for ( Element e=head; e !is null; e = e.next )
275         {
276             int ret = dg(e);
277             if ( ret )
278                 return ret;
279         }
280         return 0;
281     }
282 
283     int elements_reverse(int delegate(ref Element) dg)
284     {
285         for ( Element e=tail; e !is null; e = e.prev )
286         {
287             int ret = dg(e);
288             if ( ret )
289                 return ret;
290         }
291         return 0;
292     }
293 
294     int opApply(int delegate(ref T) dg)
295     {
296         for ( Element e=head; e !is null; e = e.next )
297         {
298             int ret = dg(e.value);
299             if ( ret )
300                 return ret;
301         }
302         return 0;
303     }
304 
305     int opApplyReverse(int delegate(ref T) dg)
306     {
307         for ( Element e=tail; e !is null; e = e.prev )
308         {
309             int ret = dg(e.value);
310             if ( ret )
311                 return ret;
312         }
313         return 0;
314     }
315 }
316 
317 /* *****************************************************************************
318     Stack based on dynamic array
319 *******************************************************************************/
320 private struct Stack(T)
321 {
322     size_t  _top;
323     T[]     stack;
324 
325     void push(T v)
326     {
327         if ( _top >= stack.length )
328             stack.length = stack.length*2+1;
329         stack[_top] = v;
330         ++_top;
331     }
332     // alias push opCatAssign;
333 
334     void // opCatAssign(T[] vs)
335 	    opOpAssign(string op)(T[] vs) if (op == "~")
336     {
337         size_t end = _top+vs.length;
338         if ( end > stack.length )
339             stack.length = end*2;
340         stack[_top..end] = vs[];
341         _top = end;
342     }
343 
344     void pop(size_t num)
345     {
346         assert(_top>=num);
347         _top -= num;
348     }
349 
350     T pop()
351     {
352         assert(_top>0);
353         return stack[--_top];
354     }
355 
356     T top()
357     {
358         assert(_top>0);
359         return stack[_top-1];
360     }
361 
362     T* topPtr()
363     {
364         assert(_top>0);
365         return &stack[_top-1];
366     }
367 
368     bool empty()
369     {
370         return _top == 0;
371     }
372 
373     void clear()
374     {
375         _top = 0;
376     }
377 
378     size_t length()
379     {
380         return _top;
381     }
382 
383     T[] array()
384     {
385         return stack[0.._top];
386     }
387 
388     T opIndex(size_t i)
389     {
390         return stack[i];
391     }
392 
393     Stack dup()
394     {
395         Stack s;
396         s._top = _top;
397         s.stack = stack.dup;
398         return s;
399     }
400 }
401 
402 /* ************************************************************************************************
403     Set container based on assoc array
404 **************************************************************************************************/
405 private struct Set(T)
406 {
407     bool[T] data;
408 
409     static Set opCall()
410     {
411         Set s;
412         return s;
413     }
414 
415     static Set opCall(T v)
416     {
417         Set s;
418         s ~= v;
419         return s;
420     }
421 
422     void opAddAssign(T v)
423     {
424         data[v] = true;
425     }
426 
427     void opAddAssign(Set s)
428     {
429         foreach ( v; s.elements )
430             data[v] = true;
431     }
432     alias opAddAssign opCatAssign;
433 
434     size_t length()
435     {
436         return data.length;
437     }
438 
439     T[] elements()
440     {
441         return data.keys;
442     }
443 
444     bool remove(T v)
445     {
446         if ( (v in data) is null )
447             return false;
448         data.remove(v);
449         return true;
450     }
451 
452     bool contains(T v)
453     {
454         return (v in data) !is null;
455     }
456 
457     bool contains(Set s)
458     {
459         Set tmp = s - this;
460         return tmp.empty;
461     }
462 
463     bool empty()
464     {
465         return data.length==0;
466     }
467 
468     Set opSub(Set s)
469     {
470         Set res = dup;
471         foreach ( v; s.elements )
472             res.remove(v);
473         return res;
474     }
475 
476     Set dup()
477     {
478         Set s;
479         foreach ( v; data.keys )
480             s.data[v] = true;
481         return s;
482     }
483 }
484 
485 /* ************************************************************************************************
486 
487 **************************************************************************************************/
488 void quickSort(T)(T[] a)
489 {
490     quickSort(a,cast(size_t)0,a.length);
491 }
492 
493 void quickSort(T)(T[] a, size_t l, size_t r)
494 {
495     T t;
496     auto i = r-l;
497     if ( i < 3 )
498     {
499         if ( i < 2 )
500             return;
501         if ( a[l] < a[l+1] )
502             return;
503         t = a[l];
504         a[l] = a[l+1];
505         a[l+1] = t;
506         return;
507     }
508 
509     auto p = a[l];
510     i = l;
511     auto j = r;
512 
513     while ( true )
514     {
515         ++i;
516         for ( ; i < j && a[i] < p; ++i ) {}
517         --j;
518         for ( ; i < j && a[j] >= p; --j ) {}
519         if ( i >= j )
520             break;
521         t = a[i];
522         a[i] = a[j];
523         a[j] = t;
524     }
525     --i;
526     a[l] = a[i];
527     a[i] = p;
528 
529     quickSort(a, l, i);
530     quickSort(a, i+1, r);
531 }
532 import tango.math.Math;
533 
534 /* ************************************************************************************************
535     A range of characters
536 **************************************************************************************************/
537 struct CharRange(char_t)
538 {
539     char_t  l_, r_;
540 
541     static CharRange opCall(char_t c)
542     {
543         CharRange cr;
544         cr.l_ = c;
545         cr.r_ = c;
546         return cr;
547     }
548 
549     static CharRange opCall(char_t a, char_t b)
550     {
551         CharRange cr;
552         cr.l_ = min(a,b);
553         cr.r_ = max(a,b);
554         return cr;
555     }
556 
557     char_t l()
558     {
559         return l_;
560     }
561 
562     char_t r()
563     {
564         return r_;
565     }
566 
567     /* ********************************************************************************************
568         Compares the ranges according to their beginning.
569     **********************************************************************************************/
570     int opCmp(const ref CharRange cr) const
571     {
572         if ( l_ == cr.l_ )
573             return 0;
574         if ( l_ < cr.l_ )
575             return -1;
576         return 1;
577     }
578 
579 
580     const bool opEquals(ref const CharRange cr)
581     {
582         if ( l_ == cr.l_ && r_ == cr.r_ )
583             return true;
584         return false;
585     }
586 
587     bool contains(char_t c)
588     {
589         return c >= l_ && c <= r_;
590     }
591 
592     bool contains(CharRange cr)
593     {
594         return l_ <= cr.l_ && r_ >= cr.r_;
595     }
596 
597     bool intersects(CharRange cr)
598     {
599         return r_ >= cr.l_ && l_ <= cr.r_;
600     }
601 
602     CharRange intersect(CharRange cr)
603     {
604         assert(intersects(cr));
605         CharRange ir;
606         ir.l_ = max(l_, cr.l_);
607         ir.r_ = min(r_, cr.r_);
608         if ( ir.l_ > ir.r_ )
609             ir.l_ = ir.r_ = char_t.min;
610         return ir;
611     }
612 
613     CharRange[] subtract(CharRange cr)
614     {
615         CharRange[] sr;
616         if ( cr.contains(this) )
617             return sr;
618         if ( !intersects(cr) )
619             sr ~= this;
620         else
621         {
622             CharRange d;
623             if ( contains(cr) )
624             {
625                 d.l_ = l_;
626                 d.r_ = cast(char_t)(cr.l_-1);
627                 if ( d.l_ <= d.r_ )
628                     sr ~= d;
629                 d.l_ = cast(char_t)(cr.r_+1);
630                 d.r_ = r_;
631                 if ( d.l_ <= d.r_ )
632                     sr ~= d;
633             }
634             else if ( cr.r_ > l_ )
635             {
636                 d.l_ = cast(char_t)(cr.r_+1);
637                 d.r_ = r_;
638                 if ( d.l_ <= d.r_ )
639                     sr ~= d;
640             }
641             else if ( cr.l_ < r_ )
642             {
643                 d.l_ = l_;
644                 d.r_ = cast(char_t)(cr.l_-1);
645                 if ( d.l_ <= d.r_ )
646                     sr ~= d;
647             }
648         }
649         return sr;
650     }
651 
652     string toString()
653     {
654         string str;
655 
656         if ( l_ == r_ )
657         {
658             if ( l_ > 0x20 && l_ < 0x7f )
659                 str = Format.convert("'{}'", l_).idup;
660             else
661                 str = Format.convert("({:x})", cast(int)l_).idup;
662         }
663         else
664         {
665             if ( l_ > 0x20 && l_ < 0x7f )
666                 str = Format.convert("'{}'", l_).idup;
667             else
668                 str = Format.convert("({:x})", cast(int)l_).idup;
669             str ~= "-";
670             if ( r_ > 0x20 && r_ < 0x7f )
671                 str ~= Format.convert("'{}'", r_).idup;
672             else
673                 str ~= Format.convert("({:x})", cast(int)r_).idup;
674         }
675         return str;
676     }
677 }
678 
679 import tango.core.Compiler;
680 
681 static if(DMDFE_Version == 2063)
682 {
683     /* Workabout for http://d.puremagic.com/issues/show_bug.cgi?id=10425 */
684     void _bug(CharRange!(dchar) a) {}
685 }
686 
687 /* ************************************************************************************************
688     Represents a class of characters as used in regular expressions (e.g. [0-9a-z], etc.)
689 **************************************************************************************************/
690 struct CharClass(char_t)
691 {
692     alias CharRange!(char_t) range_t;
693 
694     //---------------------------------------------------------------------------------------------
695     // pre-defined character classes
696     static const CharClass!(char_t)
697         line_startend = {parts: [
698             {l_:0x00, r_:0x00},
699             {l_:0x0a, r_:0x0a},
700             {l_:0x13, r_:0x13}
701         ]},
702         digit = {parts: [
703             {l_:0x30, r_:0x39}
704         ]},
705         whitespace = {parts: [
706             {l_:0x09, r_:0x09},
707             {l_:0x0a, r_:0x0a},
708             {l_:0x0b, r_:0x0b},
709             {l_:0x13, r_:0x13},
710             {l_:0x14, r_:0x14},
711             {l_:0x20, r_:0x20}
712         ]};
713 
714     // 8bit classes
715     static if ( is(char_t == char) )
716     {
717         static const CharClass!(char_t)
718             any_char = {parts: [
719                 {l_:0x01, r_:0xff}
720             ]},
721             dot_oper = {parts: [
722                 {l_:0x01, r_:0xff}
723             ]},
724             alphanum_ = {parts: [
725                 {l_:0x30, r_:0x39},
726                 {l_:0x41, r_:0x5a},
727                 {l_:0x5f, r_:0x5f},
728                 {l_:0x61, r_:0x7a}
729             ]};
730     }
731     // 16bit and 32bit classes
732     static if ( is(char_t == wchar) || is(char_t == dchar) )
733     {
734         static const CharClass!(char_t)
735             any_char = {parts: [
736                 {l_:0x0001, r_:0xffff}
737             ]},
738             dot_oper = {parts: [
739                 {l_:0x0001, r_:0xffff}
740             ]},
741             alphanum_ = {parts: [
742                 {l_:0x30, r_:0x39},
743                 {l_:0x41, r_:0x5a},
744                 {l_:0x5f, r_:0x5f},
745                 {l_:0x61, r_:0x7a}
746             ]};
747     }
748 
749     //---------------------------------------------------------------------------------------------
750     range_t[] parts;
751 
752     invariant()
753     {
754 //        foreach ( i, p; parts )
755 //            assert(p.l_<=p.r_, Int.toString(i)~": "~Int.toString(p.l_)~" > "~Int.toString(p.r_));
756     }
757 
758     static CharClass opCall(const(CharClass) cc)
759     {
760         CharClass ncc;
761         ncc.parts = cc.parts.dup;
762         return ncc;
763     }
764 
765     int opCmp(const ref CharClass cc) const
766     {
767         if ( parts.length < cc.parts.length )
768             return -1;
769         if ( parts.length > cc.parts.length )
770             return 1;
771         foreach ( i, p; cc.parts )
772         {
773             if ( p.l_ != parts[i].l_ || p.r_ != parts[i].r_ )
774                 return 1;
775         }
776         return 0;
777     }
778 
779     bool empty()
780     {
781         return parts.length <= 0;
782     }
783 
784     bool matches(char_t c)
785     {
786         foreach ( p; parts )
787         {
788             if ( p.contains(c) )
789                 return true;
790         }
791         return false;
792     }
793 
794     CharClass intersect(CharClass cc)
795     {
796         CharClass ic;
797         foreach ( p; parts )
798         {
799             foreach ( cp; cc.parts )
800             {
801                 if ( p.intersects(cp) )
802                     ic.parts ~= p.intersect(cp);
803             }
804         }
805         return ic;
806     }
807 
808     // requires the class to be optimized
809     bool contains(range_t cr)
810     {
811         foreach ( p; parts )
812         {
813             if ( p.contains(cr) )
814                 return true;
815         }
816         return false;
817     }
818 
819     // requires the class to be optimized
820     bool contains(CharClass cc)
821     {
822         Louter: foreach ( p; cc.parts )
823         {
824             foreach ( p2; parts )
825             {
826                 if ( p2.contains(p) )
827                     continue Louter;
828             }
829             return false;
830         }
831         return true;
832     }
833 
834     void subtract(CharClass cc)
835     {
836         negate();
837         add(cc);
838         negate();
839     }
840 
841     void add(CharClass cc)
842     {
843         parts ~= cc.parts;
844     }
845 
846     void add(range_t cr)
847     {
848         parts ~= cr;
849     }
850 
851     void add(char_t c)
852     {
853         parts ~= CharRange!(char_t)(c);
854     }
855 
856     /* ********************************************************************************************
857         Requires the CharClass to be optimized.
858     **********************************************************************************************/
859     void negate()
860     {
861         optimize();
862         char_t  start = char_t.min;
863 
864         // first part touches left boundary of value range
865         if ( parts.length > 0 && parts[0].l_ == start )
866         {
867             start = parts[0].r_;
868             if ( start < char_t.max )
869                 ++start;
870 
871             foreach ( i, ref cr; parts[0 .. $-1] )
872             {
873                 cr.l_ = start;
874                 cr.r_ = cast(char_t)(parts[i+1].l_-1);
875                 start = parts[i+1].r_;
876                 if ( start < char_t.max )
877                     ++start;
878             }
879             if ( start != char_t.max ) {
880                 parts[$-1].l_ = start;
881                 parts[$-1].r_ = char_t.max;
882             }
883             else
884                 parts.length = parts.length-1;
885             return;
886         }
887 
888         foreach ( i, ref cr; parts )
889         {
890             char_t tmp = cast(char_t)(cr.l_-1);
891             cr.l_ = start;
892             start = cr.r_;
893             if ( start < char_t.max )
894                 ++start;
895             cr.r_ = tmp;
896         }
897 
898         // last part does not touch right boundary
899         if ( start != char_t.max )
900             parts ~= range_t(start, char_t.max);
901     }
902 
903     void optimize()
904     {
905         if ( empty() )
906             return;
907 
908         tango.core.Array.sort(parts);
909 
910         size_t i = 0;
911         foreach ( p; parts[1 .. $] )
912         {
913             if ( p.l_ > parts[i].r_+1 ) {
914                 ++i;
915                 parts[i].l_ = p.l_;
916                 parts[i].r_ = p.r_;
917                 continue;
918             }
919             parts[i].r_ = max(p.r_, parts[i].r_);
920             if ( parts[i].r_ >= char_t.max )
921                 break;
922         }
923         parts.length = i+1;
924     }
925 
926     string toString()
927     {
928         string str;
929         str ~= "[";
930         foreach ( p; parts )
931             str ~= p.toString();
932         str ~= "]";
933         return str;
934     }
935 }
936 
937 debug(UnitTest)
938 {
939 unittest
940 {
941     static CharClass!(char) cc = { parts: [{l_:0,r_:10},{l_:0,r_:6},{l_:5,r_:12},{l_:12,r_:17},{l_:20,r_:100}] };
942     assert(cc.toString(), "[(0)-(a)(0)-(6)(5)-(c)(c)-(11)(14)-'d']");
943     cc.optimize();
944     assert(cc.toString(),  "[(0)-(11)(14)-'d']");
945     cc.negate();
946     assert(cc.toString(),  " [(12)-(13)'e'-(ff)]");
947     cc.optimize();
948     assert(cc.toString(),  "[(0)-(11)(14)-'d']");
949     cc.negate();
950     assert(cc.toString(),  "[(12)-(13)'e'-(ff)]");
951 
952     static CharClass!(char) cc2 = { parts: [] };
953     assert(cc.toString(),  "[]");
954     cc2.optimize();
955     assert(cc.toString(),  "[]");
956     cc2.negate();
957     assert(cc.toString(),  "[(0)-(ff)]");
958     cc2.optimize();
959     assert(cc.toString(),  "[(0)-(ff)]");
960     cc2.negate();
961     assert(cc.toString(),  "[]");
962 
963     static CharClass!(char) cc3 = { parts: [{l_:0,r_:100},{l_:200,r_:0xff},] };
964     assert(cc3.toString(), "[(0)-'d'(c8)-(ff)]");
965     cc3.negate();
966     assert(cc.toString(),  "['e'-(c7)]");
967     cc3.negate();
968     assert(cc.toString(),  "[(0)-'d'(c8)-(ff)]");
969 
970     static CharClass!(char) cc4 = { parts: [{l_:0,r_:200},{l_:100,r_:0xff},] };
971     assert(cc.toString(),  "[(0)-(c8)'d'-(ff)]");
972     cc4.optimize();
973     assert(cc.toString(),  "[(9)-(13)(20)-'~'(a0)-(ff)(100)-(17f)(180)-(24f)(20a3)-(20b5)]");
974 
975     static CharClass!(dchar) cc5 = { parts: [{l_:0x9,r_:0x13},{0x20,r_:'~'},{l_:0xa0,r_:0xff},{l_:0x100,r_:0x17f},{l_:0x180,r_:0x24f},{l_:0x20a3,r_:0x20b5}] };
976     cc5.optimize();
977     assert(cc.toString(),  "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]");
978     cc5.negate();
979     assert(cc.toString(),  "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]");
980     cc5.optimize();
981     assert(cc.toString(),  "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]");
982     cc5.negate();
983     assert(cc.toString(),  "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]");
984 }
985 }
986 
987 /* ************************************************************************************************
988 
989 **************************************************************************************************/
990 private struct Predicate(char_t)
991 {
992     alias const(char_t)[]       string_t;
993     alias CharClass!(char_t)    cc_t;
994     alias CharRange!(char_t)    cr_t;
995 
996     // generic data
997     enum Type {
998         consume, epsilon, lookahead, lookbehind
999     }
1000 
1001     cc_t    input;
1002     Type    type;
1003 
1004     // data for compiled predicates
1005     enum uint  MAX_BITMAP_LENGTH = 256,
1006                MAX_SEARCH_LENGTH = 256;
1007     enum MatchMode {
1008         generic, generic_l,
1009         single_char, bitmap, string_search,         // consume
1010         single_char_l, bitmap_l, string_search_l    // lookahead
1011     }
1012 
1013     MatchMode   mode;
1014     // data_chr had to be pulled out of the union due to
1015     // http://d.puremagic.com/issues/show_bug.cgi?id=2632 --- don't put it back
1016     // in until this is resolved!
1017     //
1018     // Keep in mind that data_str.length can't be modified directly unless the
1019     // new length is strictly greater than the old length. This is essentially
1020     // because ubyte.sizeof can be less than char_t.sizeof. If you set
1021     // data_str.length to anything less than or equal to data_bmp.length,
1022     // data_str will not be reallocated: only the length value will change but
1023     // nothing will realize that not that much space has actually been
1024     // allocated. data_str will be too small and you'll likely get segfaults or
1025     // such.
1026     //
1027     // In short: don't mess with data_str.length. If you have to, remove the
1028     // union entirely.
1029     //
1030     // -- Deewiant
1031     union {
1032         ubyte[]     data_bmp;
1033         string_t    data_str;
1034     };
1035     char_t      data_chr;
1036 
1037 
1038     void compile()
1039     {
1040         assert(input.parts.length > 0);
1041 
1042         // single char?
1043         if ( input.parts.length == 1 && input.parts[0].l_ == input.parts[0].r_ )
1044         {
1045             mode = type==Type.consume ? MatchMode.single_char : MatchMode.single_char_l;
1046             data_chr = input.parts[0].l_;
1047             return;
1048         }
1049         // check whether we can use a bitmap
1050         foreach ( p; input.parts )
1051         {
1052             if ( p.l_ > MAX_BITMAP_LENGTH || p.r_ > MAX_BITMAP_LENGTH )
1053                 goto LnoBitmap;
1054         }
1055 
1056         // setup bitmap
1057         data_bmp.length = MAX_BITMAP_LENGTH/8;
1058         foreach ( p; input.parts )
1059         {
1060             for ( char_t c = p.l_; c <= p.r_; ++c )
1061                 data_bmp[c/8] |= 1 << (c&7);
1062         }
1063         mode = type==Type.consume ? MatchMode.bitmap : MatchMode.bitmap_l;
1064         return;
1065 
1066     LnoBitmap:
1067 /*
1068         // check whether the class is small enough to justify a string-search
1069         // TODO: consider inverse class for 8bit chars?
1070         uint class_size;
1071         foreach ( p; input.parts )
1072             class_size += cast(uint)p.r_+1-p.l_;
1073         if ( class_size > MAX_SEARCH_LENGTH )
1074             goto Lgeneric;
1075         data_str.length = class_size;
1076         size_t ind;
1077         foreach ( p; input.parts )
1078         {
1079             for ( char_t c = p.l_; c <= p.r_; ++c )
1080                 data_str[ind++] = c;
1081         }
1082         mode = type==Type.consume ? MatchMode.string_search : MatchMode.string_search_l;
1083         return;
1084 */
1085     Lgeneric:
1086         data_str = cast(char_t[])input.parts;
1087         mode = type==Type.consume ? MatchMode.generic : MatchMode.generic_l;
1088     }
1089 
1090     bool matches(char_t c)
1091     {
1092         if ( type == Type.consume || type == Type.lookahead )
1093             return input.matches(c);
1094         assert(0);
1095     }
1096 
1097     Predicate intersect(Predicate p)
1098     {
1099         Predicate p2;
1100         if ( type != Type.epsilon && p.type != Type.epsilon )
1101             p2.input = input.intersect(p.input);
1102         return p2;
1103     }
1104 
1105     bool intersects(Predicate p)
1106     {
1107         if ( type != p.type )
1108             return false;
1109         foreach ( cr; input.parts )
1110         {
1111             foreach ( cr2; p.input.parts )
1112             {
1113                 if ( cr.intersects(cr2) )
1114                     return true;
1115             }
1116         }
1117         return false;
1118     }
1119 
1120     bool exceedsMax(uint maxc)
1121     {
1122         foreach ( p; input.parts )
1123         {
1124             if ( p.l_ > maxc || p.r_ > maxc )
1125                 return true;
1126         }
1127 
1128         return false;
1129     }
1130 
1131     @property bool empty()
1132     {
1133         return type != Type.epsilon && input.empty();
1134     }
1135 
1136     void subtract(Predicate p)
1137     {
1138         if ( type != Type.epsilon && p.type != Type.epsilon )
1139             input.subtract(p.input);
1140     }
1141 
1142     void negate()
1143     {
1144         assert(type != Type.epsilon);
1145         input.negate();
1146     }
1147 
1148     void optimize()
1149     {
1150         assert(type != Type.epsilon);
1151         input.optimize();
1152     }
1153 
1154     int opCmp(const ref Predicate p) const
1155     {
1156         return input.opCmp(p.input);
1157     }
1158 
1159     const bool opEquals(ref const Predicate p)
1160     {
1161         if ( type != p.type )
1162             return false;
1163         if ( input.opCmp(p.input) != 0 )
1164             return false;
1165         return true;
1166     }
1167 
1168     cc_t getInput()
1169     {
1170         return input;
1171     }
1172 
1173     void setInput(const(cc_t) cc)
1174     {
1175         input = cast(cc_t)cc;
1176     }
1177 
1178     void appendInput(cr_t cr)
1179     {
1180         input.add(cr);
1181     }
1182 
1183     void appendInput(cc_t cc)
1184     {
1185         input.add(cc);
1186     }
1187 
1188     void appendInput(Predicate p)
1189     {
1190         input.add(p.input);
1191     }
1192 
1193     string toString()
1194     {
1195         string str;
1196         switch ( type )
1197         {
1198             case Type.consume:      str = input.toString();       break;
1199             case Type.epsilon:      str = "eps";                break;
1200             case Type.lookahead:    str = "la:"~input.toString(); break;
1201             case Type.lookbehind:   str = "lb:"~input.toString(); break;
1202             default:
1203                 assert(0);
1204         }
1205         return str;
1206     }
1207 }
1208 import Utf = tango.text.convert.Utf;
1209 import tango.text.convert.Format;
1210 
1211 /* ************************************************************************************************
1212 
1213 **************************************************************************************************/
1214 class RegExpException : Exception
1215 {
1216     this(string msg)
1217     {
1218         super("RegExp: "~msg);
1219     }
1220 }
1221 
1222 /* ************************************************************************************************
1223     TNFA state
1224 **************************************************************************************************/
1225 private class TNFAState(char_t)
1226 {
1227     bool    accept = false,
1228             visited = false;
1229     size_t  index;
1230     List!(TNFATransition!(char_t))  transitions;
1231 
1232     this()
1233     {
1234         transitions = new List!(TNFATransition!(char_t));
1235     }
1236 }
1237 
1238 
1239 /* ************************************************************************************************
1240     Priority classes used to linearize priorities after non-linear transition creation.
1241 **************************************************************************************************/
1242 private enum PriorityClass {
1243     greedy=0, normal=1, reluctant=2, extraReluctant=3
1244 }
1245 
1246 /* ********************************************************************************
1247     TNFA tagged transition
1248 ***********************************************************************************/
1249 private class TNFATransition(char_t)
1250 {
1251     TNFAState!(char_t)  target;
1252     Predicate!(char_t)  predicate;
1253     uint                priority,
1254                         tag;        /// one-based tag number, 0 = untagged
1255     PriorityClass       priorityClass;
1256 
1257     this(PriorityClass pc)
1258     {
1259         priorityClass = pc;
1260     }
1261 
1262     /******************************************************************************
1263         Move through states only going via epsilon transitions, and only choosing
1264         the one with highest priority. If the highest priority transition from a
1265         state isn't an epsilon transition, false is returned.
1266         If the accepting NFA state can be reached in this manner, true is returned.
1267 
1268         NOTE: This method does not look for cycles which should be kept in mind for
1269         later. larsivi 20090827
1270     *******************************************************************************/
1271     bool canFinish()
1272     {
1273         TNFAState!(char_t)  t = target;
1274         while (!t.accept) {
1275             TNFATransition!(char_t) highestPriTrans;
1276             foreach (trans; t.transitions) {
1277                 if (!highestPriTrans || highestPriTrans.priority > trans.priority)
1278                     highestPriTrans = trans;
1279             }
1280             if (!(highestPriTrans.predicate.type == Predicate!(char_t).Type.epsilon))
1281                 return false;
1282 
1283             t = highestPriTrans.target;
1284         }
1285         return true;
1286     }
1287 
1288 }
1289 
1290 /* ************************************************************************************************
1291     Fragments of TNFAs as used in the Thompson method
1292 **************************************************************************************************/
1293 private class TNFAFragment(char_t)
1294 {
1295     alias TNFAState!(char_t)        state_t;
1296     alias TNFATransition!(char_t)   trans_t;
1297 
1298     List!(trans_t)  entries,        /// transitions to be added to the entry state
1299                     exits,          /// transitions to be added to the exit state
1300                     entry_state,    /// transitions to write the entry state to
1301                     exit_state;     /// transitions to write the exit state to
1302 
1303     bool swapMatchingBracketSyntax;
1304 
1305     this()
1306     {
1307         entries     = new List!(trans_t);
1308         exits       = new List!(trans_t);
1309         entry_state = new List!(trans_t);
1310         exit_state  = new List!(trans_t);
1311     }
1312 
1313     /* ********************************************************************************************
1314         Write the given state as entry state to this fragment.
1315     **********************************************************************************************/
1316     void setEntry(state_t state)
1317     {
1318         state.transitions ~= entries;
1319         foreach ( t; entry_state )
1320             t.target = state;
1321     }
1322 
1323     /* ********************************************************************************************
1324         Write the given state as exit state to this fragment.
1325     **********************************************************************************************/
1326     void setExit(state_t state)
1327     {
1328         state.transitions ~= exits;
1329         foreach ( t; exit_state )
1330             t.target = state;
1331     }
1332 }
1333 
1334 /* ************************************************************************************************
1335     Tagged NFA
1336 **************************************************************************************************/
1337 private final class TNFA(char_t)
1338 {
1339     alias TNFATransition!(char_t)   trans_t;
1340     alias TNFAFragment!(char_t)     frag_t;
1341     alias TNFAState!(char_t)        state_t;
1342     alias Predicate!(char_t)        predicate_t;
1343     alias char_t[]                  string_t;
1344     alias CharRange!(char_t)        range_t;
1345     alias CharClass!(char_t)        cc_t;
1346 
1347     string_t    pattern;
1348     state_t[]   states;
1349     state_t     start;
1350 
1351     bool swapMatchingBracketSyntax; /// whether to make (?...) matching and (...) non-matching
1352 
1353     /* ********************************************************************************************
1354         Creates the TNFA from the given regex pattern
1355     **********************************************************************************************/
1356     this(string_t regex)
1357     {
1358         next_tag        = 1;
1359         transitions     = new List!(trans_t);
1360 
1361         pattern = regex;
1362     }
1363 
1364     /* ********************************************************************************************
1365         Print the TNFA (tabular representation of the delta function)
1366     **********************************************************************************************/
1367     debug(TangoRegex) void print()
1368     {
1369         foreach ( int i, s; states )
1370         {
1371             Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" ");
1372 
1373             bool first=true;
1374             Stdout(" {");
1375             foreach ( t; s.transitions )
1376             {
1377                 Stdout.format("{}{}{}:{}->{}", first?"":", ", t.priority, "gnrx"[t.priorityClass], t.predicate.toString, t.target is null?-1:t.target.index);
1378                 if ( t.tag > 0 ) {
1379                     Stdout.format(" t{}", t.tag);
1380                 }
1381                 first = false;
1382             }
1383             Stdout("}").newline;
1384         }
1385     }
1386 
1387     uint tagCount()
1388     {
1389         return next_tag-1;
1390     }
1391 
1392     /* ********************************************************************************************
1393         Constructs the TNFA using extended Thompson method.
1394         Uses a slightly extended version of Dijkstra's shunting yard algorithm to convert
1395         the regexp from infix notation.
1396     **********************************************************************************************/
1397     void parse(bool unanchored)
1398     {
1399         List!(frag_t)       frags       = new List!(frag_t);
1400         Stack!(Operator)    opStack;
1401         Stack!(uint)        tagStack;
1402         Stack!(Pair!(uint)) occurStack;
1403         opStack.push(Operator.eos);
1404 
1405         /* ****************************************************************************************
1406             Perform action on operator stack
1407         ******************************************************************************************/
1408         bool perform(Operator next_op, bool explicit_operator=true)
1409         {
1410             // calculate index in action matrix
1411             int index = cast(int)opStack.top()*(Operator.max+1);
1412             index += cast(int)next_op;
1413 
1414             debug(tnfa) Stdout.formatln("\t{}:{} -> {}  {} frag(s)",
1415                 operator_names[opStack.top], operator_names[next_op], action_names[action_lookup[index]], frags.length
1416             );
1417             switch ( action_lookup[index] )
1418             {
1419                 case Act.pua:
1420                     opStack.push(next_op);
1421                     if ( next_op == Operator.open_par ) {
1422                         tagStack.push(next_tag);
1423                         next_tag += 2;
1424                     }
1425                     break;
1426                 case Act.poc:
1427                     switch ( opStack.top() )
1428                     {
1429                         case Operator.concat:       constructConcat(frags);                             break;
1430                         case Operator.altern:       constructAltern(frags);                             break;
1431                         case Operator.zero_one_g:   constructZeroOne(frags, PriorityClass.greedy);      break;
1432                         case Operator.zero_one_ng:  constructZeroOne(frags, PriorityClass.reluctant);   break;
1433                         case Operator.zero_one_xr:  constructZeroOne(frags, PriorityClass.extraReluctant);  break;
1434                         case Operator.zero_more_g:  constructZeroMore(frags, PriorityClass.greedy);     break;
1435                         case Operator.zero_more_ng: constructZeroMore(frags, PriorityClass.reluctant);  break;
1436                         case Operator.zero_more_xr: constructZeroMore(frags, PriorityClass.extraReluctant); break;
1437                         case Operator.one_more_g:   constructOneMore(frags, PriorityClass.greedy);      break;
1438                         case Operator.one_more_ng:  constructOneMore(frags, PriorityClass.reluctant);   break;
1439                         case Operator.one_more_xr:  constructOneMore(frags, PriorityClass.extraReluctant);  break;
1440                         case Operator.occur_g:
1441                             Pair!(uint) occur = occurStack.pop();
1442                             constructOccur(frags, occur.a, occur.b, PriorityClass.greedy);
1443                             break;
1444                         case Operator.occur_ng:
1445                             Pair!(uint) occur = occurStack.pop();
1446                             constructOccur(frags, occur.a, occur.b, PriorityClass.reluctant);
1447                             break;
1448                         default:
1449                             throw new RegExpException("cannot process operand at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
1450                     }
1451                     opStack.pop();
1452 
1453                     perform(next_op, false);
1454                     break;
1455                 case Act.poa:
1456                     opStack.pop();
1457                     break;
1458                 case Act.pca:
1459                     if ( opStack.top() == Operator.open_par )
1460                     {
1461                         if ( tagStack.empty() )
1462                             throw new RegExpException(Format.convert("Missing opening parentheses for closing parentheses at char {} \"{}\"", cursor, Utf.toString(pattern[cursor..$])).idup);
1463                         constructBracket(frags, tagStack.top());
1464                         tagStack.pop();
1465                     }
1466                     else {
1467                         assert(opStack.top() == Operator.open_par_nm);
1468                         constructBracket(frags);
1469                     }
1470                     opStack.pop();
1471                     break;
1472                 case Act.don:
1473                     return true;
1474                 case Act.err:
1475                 default:
1476                     throw new RegExpException(Format.convert("Unexpected operand at char {} \"{}\" in \"{}\"", cursor, Utf.toString(pattern[cursor..$]), Utf.toString(pattern)).idup);
1477             }
1478 
1479             return false;
1480         }
1481 
1482         // add implicit extra reluctant .* (with . == any_char) at the beginning for unanchored matches
1483         // and matching bracket for total match group
1484         if ( unanchored ) {
1485             frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume);
1486             perform(Operator.zero_more_xr, false);
1487             perform(Operator.concat, false);
1488             perform(Operator.open_par, false);
1489         }
1490 
1491         // convert regex to postfix and create TNFA
1492         bool implicit_concat;
1493         predicate_t.Type pred_type;
1494 
1495         while ( !endOfPattern() )
1496         {
1497             pred_type = predicate_t.Type.consume;
1498 
1499             dchar c = readPattern();
1500             switch ( c )
1501             {
1502                 case '|':
1503                     perform(Operator.altern);
1504                     implicit_concat = false;
1505                     break;
1506                 case '(':
1507                     if ( implicit_concat )
1508                         perform(Operator.concat, false);
1509                     implicit_concat = false;
1510                     if ( peekPattern() == '?' ) {
1511                         readPattern();
1512                         perform(swapMatchingBracketSyntax?Operator.open_par:Operator.open_par_nm);
1513                     }
1514                     else
1515                         perform(swapMatchingBracketSyntax?Operator.open_par_nm:Operator.open_par);
1516                     break;
1517                 case ')':
1518                     perform(Operator.close_par);
1519                     break;
1520                 case '?':
1521                     if ( peekPattern() == '?' ) {
1522                         readPattern();
1523                         perform(Operator.zero_one_ng);
1524                     }
1525                     else
1526                         perform(Operator.zero_one_g);
1527                     break;
1528                 case '*':
1529                     if ( peekPattern() == '?' ) {
1530                         readPattern();
1531                         perform(Operator.zero_more_ng);
1532                     }
1533                     else
1534                         perform(Operator.zero_more_g);
1535                     break;
1536                 case '+':
1537                     if ( peekPattern() == '?' ) {
1538                         readPattern();
1539                         perform(Operator.one_more_ng);
1540                     }
1541                     else
1542                         perform(Operator.one_more_g);
1543                     break;
1544                 case '{':
1545                     Pair!(uint) occur;
1546                     parseOccurCount(occur.a, occur.b);
1547                     occurStack.push(occur);
1548                     if ( peekPattern() == '?' ) {
1549                         readPattern();
1550                         perform(Operator.occur_ng);
1551                     }
1552                     else
1553                         perform(Operator.occur_g);
1554                     break;
1555                 case '[':
1556                     if ( implicit_concat )
1557                         perform(Operator.concat, false);
1558                     implicit_concat = true;
1559                     frags ~= constructCharClass(pred_type);
1560                     break;
1561                 case '.':
1562                     if ( implicit_concat )
1563                         perform(Operator.concat, false);
1564                     implicit_concat = true;
1565                     frags ~= constructChars(cc_t.dot_oper, pred_type);
1566                     break;
1567                 case '$':
1568                     if ( implicit_concat )
1569                         perform(Operator.concat, false);
1570                     implicit_concat = true;
1571 
1572                     frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookahead);
1573                     break;
1574                 case '^':
1575                     if ( implicit_concat )
1576                         perform(Operator.concat, false);
1577                     implicit_concat = true;
1578 
1579                     frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookbehind);
1580                     break;
1581                 case '>':
1582                     c = readPattern();
1583                     pred_type = predicate_t.Type.lookahead;
1584                     if ( c == '[' )
1585                         goto case '[';
1586                     else if ( c == '\\' )
1587                         goto case '\\';
1588                     else if ( c == '.' )
1589                         goto case '.';
1590                     else
1591                         goto default;
1592                 case '<':
1593                     c = readPattern();
1594                     pred_type = predicate_t.Type.lookbehind;
1595                     if ( c == '[' )
1596                         goto case '[';
1597                     else if ( c == '\\' )
1598                         goto case '\\';
1599                     else if ( c == '.' )
1600                         goto case '.';
1601                     else
1602                         goto default;
1603                 case '\\':
1604                     c = readPattern();
1605 
1606                     if ( implicit_concat )
1607                         perform(Operator.concat, false);
1608                     implicit_concat = true;
1609 
1610                     switch ( c )
1611                     {
1612                         case 't':
1613                             frags ~= constructSingleChar('\t', pred_type);
1614                             break;
1615                         case 'n':
1616                             frags ~= constructSingleChar('\n', pred_type);
1617                             break;
1618                         case 'r':
1619                             frags ~= constructSingleChar('\r', pred_type);
1620                             break;
1621                         case 'w':   // alphanumeric and _
1622                             frags ~= constructChars(cc_t.alphanum_, pred_type);
1623                             break;
1624                         case 'W':   // non-(alphanum and _)
1625                             auto cc = cc_t(cc_t.alphanum_);
1626                             cc.negate();
1627                             frags ~= constructChars(cc, pred_type);
1628                             break;
1629                         case 's':   // whitespace
1630                             frags ~= constructChars(cc_t.whitespace, pred_type);
1631                             break;
1632                         case 'S':   // non-whitespace
1633                             auto cc = cc_t(cc_t.whitespace);
1634                             cc.negate();
1635                             frags ~= constructChars(cc, pred_type);
1636                             break;
1637                         case 'd':   // digit
1638                             frags ~= constructChars(cc_t.digit, pred_type);
1639                             break;
1640                         case 'D':   // non-digit
1641                             auto cc = cc_t(cc_t.digit);
1642                             cc.negate();
1643                             frags ~= constructChars(cc, pred_type);
1644                             break;
1645                         case 'b':   // either end of word
1646                             if ( pred_type != predicate_t.Type.consume )
1647                                 throw new RegExpException("Escape sequence \\b not allowed in look-ahead or -behind");
1648 
1649                             // create (?<\S>\s|<\s>\S)
1650                             auto cc = cc_t(cc_t.whitespace);
1651                             cc.negate();
1652 
1653                             perform(Operator.open_par_nm);
1654 
1655                             frags ~= constructChars(cc, predicate_t.Type.lookbehind);
1656                             perform(Operator.concat, false);
1657                             frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead);
1658                             perform(Operator.altern, false);
1659                             frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind);
1660                             perform(Operator.concat, false);
1661                             frags ~= constructChars(cc, predicate_t.Type.lookahead);
1662 
1663                             perform(Operator.close_par, false);
1664                             break;
1665                         case 'B':   // neither end of word
1666                             if ( pred_type != predicate_t.Type.consume )
1667                                 throw new RegExpException("Escape sequence \\B not allowed in look-ahead or -behind");
1668 
1669                             // create (?<\S>\S|<\s>\s)
1670                             auto cc = cc_t(cc_t.whitespace);
1671                             cc.negate();
1672 
1673                             perform(Operator.open_par_nm);
1674 
1675                             frags ~= constructChars(cc, predicate_t.Type.lookbehind);
1676                             perform(Operator.concat, false);
1677                             frags ~= constructChars(cc, predicate_t.Type.lookahead);
1678                             perform(Operator.altern, false);
1679                             frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind);
1680                             perform(Operator.concat, false);
1681                             frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead);
1682 
1683                             perform(Operator.close_par, false);
1684                             break;
1685                         case '(':
1686                         case ')':
1687                         case '[':
1688                         case ']':
1689                         case '{':
1690                         case '}':
1691                         case '*':
1692                         case '+':
1693                         case '?':
1694                         case '.':
1695                         case '\\':
1696                         case '^':
1697                         case '$':
1698                         case '|':
1699                         case '<':
1700                         case '>':
1701                         default:
1702                             frags ~= constructSingleChar(c, pred_type);
1703                             break;
1704 //                            throw new RegExpException(Format.convert("Unknown escape sequence \\{}", c));
1705                     }
1706                     break;
1707 
1708                 default:
1709                     if ( implicit_concat )
1710                         perform(Operator.concat, false);
1711                     implicit_concat = true;
1712                     frags ~= constructSingleChar(c, pred_type);
1713             }
1714         }
1715 
1716         // add implicit reluctant .* (with . == any_char) at the end for unanchored matches
1717         if ( unanchored )
1718         {
1719             perform(Operator.close_par, false);
1720             if ( implicit_concat )
1721                 perform(Operator.concat, false);
1722             frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume);
1723             perform(Operator.zero_more_ng, false);
1724         }
1725 
1726         // empty operator stack
1727         while ( !perform(Operator.eos) ) {}
1728 
1729         // set start and finish states
1730         start = addState();
1731         state_t finish = addState();
1732         finish.accept = true;
1733 
1734         foreach ( f; frags ) {
1735             f.setExit(finish);
1736             f.setEntry(start);
1737         }
1738 
1739         // set transition priorities
1740         List!(trans_t)[PriorityClass.max+1] trans;
1741         foreach ( ref t; trans )
1742             t = new List!(trans_t);
1743 
1744         Stack!(trans_t) todo;
1745         state_t state = start;
1746 
1747         while ( !todo.empty() || !state.visited )
1748         {
1749             if ( !state.visited )
1750             {
1751                 state.visited = true;
1752                 foreach_reverse ( t; state.transitions )
1753                     todo.push(t);
1754             }
1755 
1756             if ( todo.empty() )
1757                 break;
1758             trans_t t = todo.top();
1759             todo.pop();
1760             assert(t.priorityClass<=PriorityClass.max);
1761             trans[t.priorityClass] ~= t;
1762             state = t.target;
1763         }
1764 
1765         uint nextPrio;
1766         foreach ( ts; trans )
1767         {
1768             foreach ( t; ts )
1769                 t.priority = nextPrio++;
1770         }
1771     }
1772 
1773 private:
1774     uint            next_tag;
1775     size_t          cursor,
1776                     next_cursor;
1777     List!(trans_t)  transitions;
1778 
1779     state_t[state_t]    clonedStates;
1780     trans_t[trans_t]    clonedTransitions;
1781 
1782     /// RegEx operators
1783     enum Operator {
1784         eos, concat, altern, open_par, close_par,
1785         zero_one_g, zero_more_g, one_more_g,        // greedy
1786         zero_one_ng, zero_more_ng, one_more_ng,     // non-greedy/reluctant
1787         zero_one_xr, zero_more_xr, one_more_xr,     // extra-reluctant
1788         open_par_nm, occur_g, occur_ng
1789     }
1790     __gshared immutable const(char)[][] operator_names = ["EOS", "concat", "|", "(", ")", "?", "*", "+", "??", "*?", "+?", "??x", "*?x", "+?x", "(?", "{x,y}", "{x,y}?"];
1791 
1792     /// Actions for to-postfix transformation
1793     enum Act {
1794         pua, poc, poa, pca, don, err
1795     }
1796     __gshared immutable const(char)[][] action_names = ["push+advance", "pop+copy", "pop+advance", "pop+copy+advance", "done", "error"];
1797 
1798     /// Action lookup for to-postfix transformation
1799     __gshared immutable Act[] action_lookup =
1800     [
1801     //  eos      concat   |        (        )        ?        *        +        ??       *?       +?       ??extra  *?extra  +?extra  (?       {x,y}    {x,y}?
1802         Act.don, Act.pua, Act.pua, Act.pua, Act.err, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1803         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1804         Act.poc, Act.pua, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1805         Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1806         Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err,
1807         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1808         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1809         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1810         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1811         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1812         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1813         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1814         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1815         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1816         Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1817         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1818         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc
1819     ];
1820 
1821     final dchar peekPattern()
1822     {
1823         auto tmp = next_cursor;
1824         if ( tmp < pattern.length )
1825             return decode(pattern, tmp);
1826         return 0;
1827     }
1828 
1829     final dchar readPattern()
1830     {
1831         cursor = next_cursor;
1832         if ( next_cursor < pattern.length )
1833             return decode(pattern, next_cursor);
1834         return 0;
1835     }
1836 
1837     final bool endOfPattern()
1838     {
1839         return next_cursor >= pattern.length;
1840     }
1841 
1842     state_t addState()
1843     {
1844         state_t s = new state_t;
1845         s.index = states.length;
1846         states ~= s;
1847         return s;
1848     }
1849 
1850     trans_t addTransition(PriorityClass pc = PriorityClass.normal)
1851     {
1852         trans_t trans = new trans_t(pc);
1853         transitions ~= trans;
1854         return trans;
1855     }
1856 
1857     uint parseNumber()
1858     {
1859         uint res;
1860         while ( !endOfPattern() )
1861         {
1862             auto c = peekPattern();
1863             if ( c < '0' || c > '9' )
1864                 break;
1865             res = res*10+(c-'0');
1866             readPattern();
1867         }
1868         return res;
1869     }
1870 
1871     void parseOccurCount(out uint minOccur, out uint maxOccur)
1872     {
1873         assert(pattern[cursor] == '{');
1874 
1875         minOccur = parseNumber();
1876         if ( peekPattern() == '}' ) {
1877             readPattern();
1878             maxOccur = minOccur;
1879             return;
1880         }
1881         if ( peekPattern() != ',' )
1882             throw new RegExpException("Invalid occurence range at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
1883         readPattern();
1884         maxOccur = parseNumber();
1885         if ( peekPattern() != '}' )
1886             throw new RegExpException("Invalid occurence range at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
1887         readPattern();
1888         if ( maxOccur > 0 && maxOccur < minOccur )
1889             throw new RegExpException("Invalid occurence range (max < min) at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
1890     }
1891 
1892     trans_t clone(trans_t t)
1893     {
1894         if ( t is null )
1895             return null;
1896         trans_t* tmp = t in clonedTransitions;
1897         if ( tmp !is null )
1898             return *tmp;
1899 
1900         trans_t t2 = new trans_t(t.priorityClass);
1901         clonedTransitions[t] = t2;
1902         t2.tag = t.tag;
1903         t2.priority = t.priority;
1904         t2.predicate = t.predicate;
1905         t2.target = clone(t.target);
1906         transitions ~= t2;
1907         return t2;
1908     }
1909 
1910     state_t clone(state_t s)
1911     {
1912         if ( s is null )
1913             return null;
1914         state_t* tmp = s in clonedStates;
1915         if ( tmp !is null )
1916             return *tmp;
1917 
1918         state_t s2 = new state_t;
1919         clonedStates[s] = s2;
1920         s2.accept = s.accept;
1921         s2.visited = s.visited;
1922         foreach ( t; s.transitions )
1923             s2.transitions ~= clone(t);
1924         s2.index = states.length;
1925         states ~= s2;
1926         return s2;
1927     }
1928 
1929     frag_t clone(frag_t f)
1930     {
1931         if ( f is null )
1932             return null;
1933         clonedStates = null;
1934         clonedTransitions = null;
1935 
1936         frag_t f2 = new frag_t;
1937         foreach ( t; f.entries )
1938             f2.entries ~= clone(t);
1939         foreach ( t; f.exits )
1940             f2.exits ~= clone(t);
1941         foreach ( t; f.entry_state )
1942             f2.entry_state ~= clone(t);
1943         foreach ( t; f.exit_state )
1944             f2.exit_state ~= clone(t);
1945         return f2;
1946     }
1947 
1948     //---------------------------------------------------------------------------------------------
1949     // Thompson constructions of NFA fragments
1950 
1951     frag_t constructSingleChar(char_t c, predicate_t.Type type)
1952     {
1953         debug(tnfa) Stdout.formatln("constructCharFrag {}", c);
1954 
1955         trans_t trans = addTransition();
1956         trans.predicate.appendInput(CharRange!(char_t)(c));
1957 
1958         trans.predicate.type = type;
1959 
1960         frag_t frag = new frag_t;
1961         frag.exit_state ~= trans;
1962         frag.entries    ~= trans;
1963         return frag;
1964     }
1965 
1966     frag_t constructChars(string_t chars, in predicate_t.Type type)
1967     {
1968         cc_t cc;
1969         for ( int i = 0; i < chars.length; ++i )
1970             cc.add(chars[i]);
1971 
1972         return constructChars(cc, type);
1973     }
1974 
1975     frag_t constructChars(const(cc_t) charclass, in predicate_t.Type type)
1976     {
1977         debug(tnfa) Stdout.format("constructChars type={}", type);
1978 
1979         trans_t trans = addTransition();
1980         trans.predicate.type = type;
1981 
1982         trans.predicate.setInput(cc_t(charclass));
1983 
1984         trans.predicate.optimize();
1985         debug(tnfa) Stdout.formatln("-> {}", trans.predicate.toString);
1986 
1987         frag_t frag = new frag_t;
1988         frag.exit_state ~= trans;
1989         frag.entries    ~= trans;
1990         return frag;
1991     }
1992 
1993     frag_t constructCharClass(predicate_t.Type type)
1994     {
1995         debug(tnfa) Stdout.format("constructCharClass type={}", type);
1996         auto oldCursor = cursor;
1997 
1998         trans_t trans = addTransition();
1999 
2000         bool negated=false;
2001         if ( peekPattern() == '^' ) {
2002             readPattern();
2003             negated = true;
2004         }
2005 
2006         char_t  last;
2007         bool    have_range_start,
2008                 first_char = true;
2009         for ( ; !endOfPattern() && peekPattern() != ']'; )
2010         {
2011             dchar c = readPattern();
2012             switch ( c )
2013             {
2014                 case '-':
2015                     if ( first_char ) {
2016                         trans.predicate.appendInput(range_t(c));
2017                         break;
2018                     }
2019                     if ( !have_range_start )
2020                         throw new RegExpException("Missing range start for '-' operator after \""~Utf.toString(pattern).idup~"\"");
2021                     else if ( endOfPattern() || peekPattern() == ']' )
2022                         throw new RegExpException("Missing range end for '-' operator after \""~Utf.toString(pattern).idup~"\"");
2023                     else {
2024                         c = readPattern();
2025                         trans.predicate.appendInput(range_t(last, c));
2026                         have_range_start = false;
2027                     }
2028                     break;
2029                 case '\\':
2030                     if ( endOfPattern() )
2031                         throw new RegExpException("unexpected end of string after \""~Utf.toString(pattern).idup~"\"");
2032                     c = readPattern();
2033                     switch ( c )
2034                     {
2035                         case 't':
2036                             c = '\t';
2037                             break;
2038                         case 'n':
2039                             c = '\n';
2040                             break;
2041                         case 'r':
2042                             c = '\r';
2043                             break;
2044                         default:
2045                             break;
2046                     }
2047                     goto default;
2048                 default:
2049                     if ( have_range_start )
2050                         trans.predicate.appendInput(range_t(last));
2051                     last = c;
2052                     have_range_start = true;
2053             }
2054             first_char = false;
2055         }
2056         if ( !endOfPattern() )
2057             readPattern();
2058         if ( last != char_t.init )
2059             trans.predicate.appendInput(range_t(last));
2060         debug(tnfa) Stdout.formatln(" {}", pattern[oldCursor..cursor]);
2061 
2062         if ( negated ) {
2063             auto tmp = cc_t(cc_t.any_char);
2064             tmp.subtract(trans.predicate.input);
2065             trans.predicate.input = tmp;
2066         }
2067         else
2068             trans.predicate.optimize();
2069         debug(tnfa) Stdout.formatln("-> {}", trans.predicate.toString);
2070 
2071         trans.predicate.type = type;
2072 
2073         frag_t frag = new frag_t;
2074         frag.exit_state ~= trans;
2075         frag.entries    ~= trans;
2076         return frag;
2077     }
2078 
2079     void constructBracket(List!(frag_t) frags, uint tag=0)
2080     {
2081         debug(tnfa) Stdout.formatln("constructBracket");
2082 
2083         state_t entry = addState(),
2084                 exit = addState();
2085         frags.tail.value.setEntry(entry);
2086         frags.tail.value.setExit(exit);
2087 
2088         trans_t tag1 = addTransition(),
2089                 tag2 = addTransition();
2090         tag1.predicate.type = predicate_t.Type.epsilon;
2091         tag2.predicate.type = predicate_t.Type.epsilon;
2092         if ( tag > 0 )
2093         {
2094             // make sure the tag indeces for bracket x are always
2095             // x*2 for the opening bracket and x*2+1 for the closing bracket
2096             tag1.tag = tag++;
2097             tag2.tag = tag;
2098         }
2099         tag1.target = entry;
2100         exit.transitions ~= tag2;
2101 
2102         frag_t frag = new frag_t;
2103         frag.entries ~= tag1;
2104         frag.exit_state ~= tag2;
2105         frags.pop();
2106         frags ~= frag;
2107     }
2108 
2109     void constructOneMore(List!(frag_t) frags, PriorityClass prioClass)
2110     {
2111         debug(tnfa) Stdout.formatln("constructOneMore");
2112 
2113         if ( frags.empty() )
2114             throw new RegExpException("too few arguments for + at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2115 
2116         trans_t repeat = addTransition(prioClass),
2117                 cont = addTransition();
2118         repeat.predicate.type = predicate_t.Type.epsilon;
2119         cont.predicate.type = predicate_t.Type.epsilon;
2120 
2121         state_t s = addState();
2122         frags.tail.value.setExit(s);
2123         s.transitions ~= repeat;
2124         s.transitions ~= cont;
2125 
2126         frag_t frag = new frag_t;
2127         frag.entries ~= frags.tail.value.entries;
2128         frag.entry_state ~= frags.tail.value.entry_state;
2129         frag.entry_state ~= repeat;
2130         frag.exit_state ~= cont;
2131         frags.pop();
2132         frags ~= frag;
2133     }
2134 
2135     void constructZeroMore(List!(frag_t) frags, PriorityClass prioClass)
2136     {
2137         debug(tnfa) Stdout.formatln("constructZeroMore");
2138 
2139         if ( frags.empty() )
2140             throw new RegExpException("too few arguments for * at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2141 
2142         trans_t enter = addTransition(prioClass),
2143                 repeat = addTransition(prioClass),
2144                 skip = addTransition();
2145         skip.predicate.type = predicate_t.Type.epsilon;
2146         repeat.predicate.type = predicate_t.Type.epsilon;
2147         enter.predicate.type = predicate_t.Type.epsilon;
2148 
2149         state_t entry = addState(),
2150                 exit = addState();
2151         frags.tail.value.setEntry(entry);
2152         frags.tail.value.setExit(exit);
2153         exit.transitions ~= repeat;
2154         enter.target = entry;
2155 
2156         frag_t frag = new frag_t;
2157         frag.entries ~= skip;
2158         frag.entries ~= enter;
2159         frag.exit_state ~= skip;
2160         frag.entry_state ~= repeat;
2161         frags.pop();
2162         frags ~= frag;
2163     }
2164 
2165     void constructZeroOne(List!(frag_t) frags, PriorityClass prioClass)
2166     {
2167         debug(tnfa) Stdout.formatln("constructZeroOne");
2168 
2169         if ( frags.empty() )
2170             throw new RegExpException("too few arguments for ? at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2171 
2172         trans_t use = addTransition(prioClass),
2173                 skip = addTransition();
2174         use.predicate.type = predicate_t.Type.epsilon;
2175         skip.predicate.type = predicate_t.Type.epsilon;
2176 
2177         state_t s = addState();
2178         frags.tail.value.setEntry(s);
2179         use.target = s;
2180 
2181         frag_t frag = new frag_t;
2182         frag.entries ~= use;
2183         frag.entries ~= skip;
2184         frag.exits ~= frags.tail.value.exits;
2185         frag.exit_state ~= frags.tail.value.exit_state;
2186         frag.exit_state ~= skip;
2187         frags.pop();
2188         frags ~= frag;
2189     }
2190 
2191     void constructOccur(List!(frag_t) frags, uint minOccur, uint maxOccur, PriorityClass prioClass)
2192     {
2193         debug(tnfa) Stdout.formatln("constructOccur {},{}", minOccur, maxOccur);
2194 
2195         if ( frags.empty() )
2196             throw new RegExpException("too few arguments for {x,y} at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2197 
2198         state_t s;
2199         frag_t  total = new frag_t,
2200                 prev;
2201 
2202         for ( int i = 0; i < minOccur; ++i )
2203         {
2204             frag_t f = clone(frags.tail.value);
2205             if ( prev !is null ) {
2206                 s = addState();
2207                 prev.setExit(s);
2208                 f.setEntry(s);
2209             }
2210             else {
2211                 total.entries = f.entries;
2212                 total.entry_state = f.entry_state;
2213             }
2214             prev = f;
2215         }
2216 
2217         if ( maxOccur == 0 )
2218         {
2219             frag_t f = frags.tail.value;
2220             trans_t t = addTransition();
2221             t.predicate.type = predicate_t.Type.epsilon;
2222             f.entries ~= t;
2223             f.exit_state ~= t;
2224 
2225             t = addTransition();
2226             t.predicate.type = predicate_t.Type.epsilon;
2227             f.exits ~= t;
2228             f.entry_state ~= t;
2229 
2230             s = addState();
2231             f.setEntry(s);
2232 
2233             if ( prev !is null )
2234                 prev.setExit(s);
2235             else {
2236                 total.entries = f.entries;
2237                 total.entry_state = f.entry_state;
2238             }
2239 
2240             prev = f;
2241         }
2242 
2243         for ( int i = minOccur; i < maxOccur; ++i )
2244         {
2245             frag_t f;
2246             if ( i < maxOccur-1 )
2247                 f = clone(frags.tail.value);
2248             else
2249                 f = frags.tail.value;
2250             trans_t t = addTransition();
2251             t.predicate.type = predicate_t.Type.epsilon;
2252             f.entries ~= t;
2253             f.exit_state ~= t;
2254 
2255             if ( prev !is null ) {
2256                 s = addState();
2257                 prev.setExit(s);
2258                 f.setEntry(s);
2259             }
2260             else {
2261                 total.entries = f.entries;
2262                 total.entry_state = f.entry_state;
2263             }
2264             prev = f;
2265         }
2266 
2267         total.exits = prev.exits;
2268         total.exit_state = prev.exit_state;
2269 
2270         frags.pop();
2271         frags ~= total;
2272     }
2273 
2274     void constructAltern(List!(frag_t) frags)
2275     {
2276         debug(tnfa) Stdout.formatln("constructAltern");
2277 
2278         if ( frags.empty() || frags.head is frags.tail )
2279             throw new RegExpException("too few arguments for | at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2280 
2281         frag_t  frag = new frag_t,
2282                 f1 = frags.tail.value,
2283                 f2 = frags.tail.prev.value;
2284         frag.entry_state ~= f2.entry_state;
2285         frag.entry_state ~= f1.entry_state;
2286         frag.exit_state ~= f2.exit_state;
2287         frag.exit_state ~= f1.exit_state;
2288         frag.entries ~= f2.entries;
2289         frag.entries ~= f1.entries;
2290         frag.exits ~= f2.exits;
2291         frag.exits ~= f1.exits;
2292 
2293         frags.pop();
2294         frags.pop();
2295         frags ~= frag;
2296     }
2297 
2298     void constructConcat(List!(frag_t) frags)
2299     {
2300         debug(tnfa) Stdout.formatln("constructConcat");
2301 
2302         if ( frags.empty() || frags.head is frags.tail )
2303             throw new RegExpException("too few operands for concatenation at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2304 
2305         frag_t  f1 = frags.tail.value,
2306                 f2 = frags.tail.prev.value;
2307 
2308         state_t state = addState();
2309         f2.setExit(state);
2310         f1.setEntry(state);
2311 
2312         frag_t frag = new frag_t;
2313         frag.entries ~= f2.entries;
2314         frag.exits ~= f1.exits;
2315         frag.entry_state ~= f2.entry_state;
2316         frag.exit_state ~= f1.exit_state;
2317         frags.pop();
2318         frags.pop();
2319         frags ~= frag;
2320     }
2321 }
2322 import tango.core.Array;
2323 
2324 /* ************************************************************************************************
2325     Tagged DFA
2326 **************************************************************************************************/
2327 private class TDFA(char_t)
2328 {
2329     alias Predicate!(char_t)    predicate_t;
2330     alias CharRange!(char_t)    range_t;
2331     alias CharClass!(char_t)    charclass_t;
2332     alias char_t[]              string_t;
2333 
2334     enum uint CURRENT_POSITION_REGISTER = ~0;
2335 
2336     /* ********************************************************************************************
2337         Tag map assignment command
2338     **********************************************************************************************/
2339     struct Command
2340     {
2341         uint        dst,    /// register index to recieve data
2342                     src;    /// register index or CURRENT_POSITION_REGISTER for current position
2343 
2344         string toString()
2345         {
2346             return Format.convert("{}<-{}", dst, src==CURRENT_POSITION_REGISTER?"p":Format.convert("{}", src)).idup;
2347         }
2348 
2349         /* ****************************************************************************************
2350             Order transitions by the order of their predicates.
2351         ******************************************************************************************/
2352         int opCmp(const ref Command cmd) const
2353         {
2354             if ( src == CURRENT_POSITION_REGISTER && cmd.src != CURRENT_POSITION_REGISTER )
2355                 return 1;
2356             if ( src != CURRENT_POSITION_REGISTER && cmd.src == CURRENT_POSITION_REGISTER )
2357                 return -1;
2358             if ( dst < cmd.dst )
2359                 return -1;
2360             if ( dst == cmd.dst )
2361                 return 0;
2362             return 1;
2363         }
2364     }
2365 
2366     struct TagIndex
2367     {
2368         uint    tag,
2369                 index;
2370 
2371         const int opCmp(ref const TagIndex o)
2372         {
2373             if (tag == o.tag && index == o.index)
2374                 return 0;
2375             if (tag > o.tag)
2376                 return 1;
2377             if (tag < o.tag)
2378                 return -1;
2379             if (index > o.index)
2380                 return 1;
2381             if (index < o.index)
2382                 return -1;
2383             assert(0);
2384         }
2385     }
2386 
2387     /* ********************************************************************************************
2388         TDFA state
2389     **********************************************************************************************/
2390     class State
2391     {
2392         enum Mode {
2393             GENERIC, MIXED, LOOKUP
2394         }
2395 
2396         enum uint  LOOKUP_LENGTH = 256,
2397                    INVALID_STATE = 255;
2398 
2399         bool            accept = false;
2400         bool            reluctant = false;
2401         size_t          index;
2402         Transition[]    transitions,
2403                         generic_transitions;
2404         Command[]       finishers;
2405 
2406         ubyte[]         lookup;
2407         Mode            mode;
2408 
2409         void optimize()
2410         {
2411             // merge transitions with equal targets (same state index and equal commands)
2412             size_t[] remove_indeces;
2413             foreach ( i, t; transitions[0 .. $-1] )
2414             {
2415                 foreach ( t2; transitions[i+1 .. $] )
2416                 {
2417                     if ( t.predicate.type != t2.predicate.type || !t.equalTarget(t2) )
2418                         continue;
2419                     t2.predicate.appendInput(t.predicate);
2420                     remove_indeces ~= i;
2421                     break;
2422                 }
2423             }
2424 
2425             // remove transitions that have been merged into another
2426             if ( remove_indeces.length > 0 )
2427             {
2428                 Transition[] tmp;
2429                 tmp.length = transitions.length - remove_indeces.length;
2430                 size_t next_remove, next;
2431                 foreach ( i, t; transitions )
2432                 {
2433                     if ( next_remove < remove_indeces.length && remove_indeces[next_remove] == i ) {
2434                         ++next_remove;
2435                         continue;
2436                     }
2437                     tmp[next++] = t;
2438                 }
2439                 transitions = tmp;
2440 
2441                 foreach ( t; transitions )
2442                     t.predicate.optimize();
2443             }
2444         }
2445 
2446         void createLookup()
2447         {
2448             size_t count;
2449             foreach ( t; transitions )
2450             {
2451                 if ( !t.predicate.exceedsMax(LOOKUP_LENGTH) )
2452                     ++count;
2453             }
2454 
2455             if ( count < 1 || transitions.length > INVALID_STATE ) {
2456                 generic_transitions.length = transitions.length;
2457                 generic_transitions[] = transitions[];
2458                 return;
2459             }
2460 
2461             foreach ( t; transitions )
2462             {
2463                 if ( t.predicate.exceedsMax(LOOKUP_LENGTH) )
2464                     generic_transitions ~= t;
2465             }
2466 
2467             // setup lookup table
2468             lookup.length = LOOKUP_LENGTH;
2469             lookup[] = INVALID_STATE;
2470             foreach ( i, t; transitions )
2471             {
2472                 foreach ( p; t.predicate.input.parts )
2473                 {
2474                     if ( p.l_ >= lookup.length )
2475                         continue;
2476                     for ( char_t c = p.l_; c <= min(p.r_, LOOKUP_LENGTH-1); ++c )
2477                         lookup[c] = cast(ubyte)i;
2478                 }
2479             }
2480 
2481             mode = count < transitions.length? Mode.MIXED : Mode.LOOKUP;
2482         }
2483     }
2484 
2485     /* ********************************************************************************************
2486         TDFA transition
2487     **********************************************************************************************/
2488     class Transition
2489     {
2490         State       target;
2491         predicate_t predicate;
2492         Command[]   commands;
2493 
2494         /* ****************************************************************************************
2495             Order transitions by the order of their predicates.
2496         ******************************************************************************************/
2497         override final int opCmp(Object o)
2498         {
2499             Transition t = cast(Transition)o;
2500             assert(t !is null);
2501             return predicate.opCmp(t.predicate);
2502         }
2503 
2504         override final const bool opEquals(const Object o)
2505         {
2506             auto t = cast(const(Transition))o;
2507             if ( t is null )
2508                 return false;
2509             if ( equalTarget(t) && t.predicate == predicate )
2510                 return true;
2511             return false;
2512         }
2513 
2514         const bool equalTarget(in Transition t)
2515         {
2516             if ( t.target.index != target.index )
2517                 return false;
2518             if ( commands.length != t.commands.length )
2519                 return false;
2520             Louter: foreach ( cmd; commands )
2521             {
2522                 foreach ( cmd2; t.commands )
2523                 {
2524                     if ( cmd == cmd2 )
2525                         continue Louter;
2526                 }
2527                 return false;
2528             }
2529             return true;
2530         }
2531     }
2532 
2533 
2534     State[]     states;
2535     State       start;
2536     Command[]   initializer;
2537     uint        num_tags;
2538 
2539     uint[TagIndex]  registers;
2540     uint            next_register;
2541 
2542     uint num_regs()
2543     {
2544         return next_register;
2545     }
2546 
2547     /* ********************************************************************************************
2548         Constructs the TDFA from the given TNFA using extended power set method
2549     **********************************************************************************************/
2550     this(TNFA!(char_t) tnfa)
2551     {
2552         num_tags        = tnfa.tagCount();
2553 
2554         next_register   = num_tags;
2555         for ( int i = 1; i <= num_tags; ++i ) {
2556             TagIndex ti;
2557             ti.tag = i;
2558             registers[ti] = i-1;
2559         }
2560 
2561         // create epsilon closure of TNFA start state
2562         SubsetState subset_start    = new SubsetState;
2563         StateElement se             = new StateElement;
2564         se.nfa_state = tnfa.start;
2565         subset_start.elms ~= se;
2566 
2567         // apply lookbehind closure for string/line start
2568         predicate_t tmp_pred;
2569         tmp_pred.setInput(CharClass!(char_t).line_startend);
2570         subset_start = epsilonClosure(lookbehindClosure(epsilonClosure(subset_start, subset_start), tmp_pred), subset_start);
2571 
2572         start = addState();
2573         subset_start.dfa_state = start;
2574 
2575         // generate initializer and finisher commands for TDFA start state
2576         generateInitializers(subset_start);
2577         generateFinishers(subset_start);
2578 
2579         // initialize stack for state traversal
2580         List!(SubsetState)  subset_states   = new List!(SubsetState),
2581                             unmarked        = new List!(SubsetState);
2582         subset_states   ~= subset_start;
2583         unmarked        ~= subset_start;
2584         debug(tdfa) Stdout.formatln("\n{} = {}\n", subset_start.dfa_state.index, subset_start.toString);
2585 
2586         while ( !unmarked.empty() )
2587         {
2588             SubsetState state = unmarked.tail.value;
2589             unmarked.pop();
2590 
2591             // create transitions for each class, creating new states when necessary
2592             foreach ( pred; disjointPredicates(state) )
2593             {
2594                 // find NFA state we reach with pred
2595                 // reach will set predicate type correctly
2596                 debug(tdfa) Stdout.format("from {} with {} reach", state.dfa_state.index, pred.toString);
2597                 SubsetState target = reach(state, pred);
2598                 if ( target is null ) {
2599                     continue;
2600                     debug(tdfa) Stdout.formatln(" nothing - lookbehind at beginning, skipping");
2601                 }
2602                 debug(tdfa) Stdout.formatln(" {}", target.toString);
2603                 target = epsilonClosure(lookbehindClosure(epsilonClosure(target, state), pred), state);
2604 
2605                 Transition trans = new Transition;
2606                 state.dfa_state.transitions ~= trans;
2607                 debug (tdfa_new_trans) Stdout.formatln("Creating with pred: {}", pred);
2608                 trans.predicate = pred;
2609 
2610                 // generate indeces for pos commands
2611                 // delay creation of pos command until we have reorder-commands
2612                 uint[uint] cmds = null;
2613                 foreach ( e; target.elms )
2614                 {
2615                     foreach ( tag, ref index; e.tags )
2616                     {
2617                         bool found=false;
2618                         foreach ( e2; state.elms )
2619                         {
2620                             int* i = tag in e2.tags;
2621                             if ( i !is null && *i == index ) {
2622                                 found=true;
2623                                 break;
2624                             }
2625                         }
2626                         if ( !found )
2627                         {
2628                             // if index is < 0 it is a temporary index
2629                             // used only to distinguish the state from existing ones.
2630                             // the previous index can be reused instead.
2631                             if ( index < 0 )
2632                                 index = -index-1;
2633                             cmds[tag] = index;
2634                         }
2635                         else
2636                             assert(index>=0);
2637                     }
2638                 }
2639 
2640                 // check whether a state exists that is identical except for tag index reorder-commands
2641                 bool exists=false;
2642                 foreach ( equivTarget; subset_states )
2643                 {
2644                     if ( reorderTagIndeces(target, equivTarget, state, trans) ) {
2645                         target = equivTarget;
2646                         exists = true;
2647                         break;
2648                     }
2649                 }
2650                 // else create new target state
2651                 if ( !exists )
2652                 {
2653                     State ts = addState();
2654                     target.dfa_state = ts;
2655                     subset_states   ~= target;
2656                     unmarked        ~= target;
2657                     debug(tdfa_add) {
2658                         Stdout.formatln("\nAdded {} = {}\n", target.dfa_state.index, target.toString);
2659                     }
2660                     generateFinishers(target);
2661                 }
2662 
2663                 // now generate pos commands, rewriting reorder-commands if existent
2664                 foreach ( tag, index; cmds )
2665                 {
2666                     // check whether reordering used this tag, if so, overwrite the command directly,
2667                     // for it's effect would be overwritten by a subsequent pos-command anyway
2668                     uint reg = registerFromTagIndex(tag, index);
2669                     bool found = false;
2670                     foreach ( ref cmd; trans.commands )
2671                     {
2672                         if ( cmd.src == reg ) {
2673                             found = true;
2674                             cmd.src = CURRENT_POSITION_REGISTER;
2675                             break;
2676                         }
2677                     }
2678                     if ( !found ) {
2679                         Command cmd;
2680                         cmd.dst = reg;
2681                         cmd.src = CURRENT_POSITION_REGISTER;
2682                         trans.commands ~= cmd;
2683                     }
2684                 }
2685 
2686                 trans.target = target.dfa_state;
2687                 debug(tdfa) {
2688                     Stdout.formatln("=> from {} with {} reach {}", state.dfa_state.index, pred.toString, target.dfa_state.index);
2689                 }
2690             }
2691 
2692             state.dfa_state.optimize();
2693         }
2694 
2695         // renumber registers continuously
2696         uint[uint]  regNums;
2697 
2698         for ( next_register = 0; next_register < num_tags; ++next_register )
2699             regNums[next_register] = next_register;
2700 
2701         void renumberCommand(ref Command cmd)
2702         {
2703             if ( cmd.src != CURRENT_POSITION_REGISTER && (cmd.src in regNums) is null )
2704                 regNums[cmd.src] = next_register++;
2705             if ( (cmd.dst in regNums) is null )
2706                 regNums[cmd.dst] = next_register++;
2707             if ( cmd.src != CURRENT_POSITION_REGISTER )
2708                 cmd.src = regNums[cmd.src];
2709             cmd.dst = regNums[cmd.dst];
2710         }
2711 
2712         foreach ( state; states )
2713         {
2714             foreach ( ref cmd; state.finishers )
2715                 renumberCommand(cmd);
2716             // make sure pos-commands are executed after reorder-commands and
2717             // reorder-commands do not overwrite each other
2718             tango.core.Array.sort(state.finishers);
2719 
2720             foreach ( trans; state.transitions )
2721             {
2722                 foreach ( ref cmd; trans.commands )
2723                     renumberCommand(cmd);
2724                 tango.core.Array.sort(trans.commands);
2725                 trans.predicate.compile();
2726             }
2727         }
2728 
2729         debug(TangoRegex)
2730         {
2731             foreach ( ref v; registers )
2732             {
2733                 if ( (v in regNums) !is null )
2734                     v = regNums[v];
2735             }
2736         }
2737 
2738         minimizeDFA();
2739 
2740         foreach ( state; states )
2741             state.createLookup();
2742 
2743         // TODO: optimize memory layout of TDFA
2744 
2745         // TODO: add lookahead for string-end somewhere
2746         // TODO: mark dead-end states (not leaving a non-finishing susbet)
2747         // TODO: mark states that can leave the finishing subset of DFA states or use a greedy transition
2748         //       (execution may stop in that state)
2749     }
2750 
2751     /* ********************************************************************************************
2752         Print the TDFA (tabular representation of the delta function)
2753     **********************************************************************************************/
2754     debug(TangoRegex) void print()
2755     {
2756         Stdout.formatln("#tags = {}", num_tags);
2757 
2758         auto tis = new TagIndex[registers.length];
2759         foreach ( k, v; registers )
2760             tis [v] = k;
2761         foreach ( r, ti; tis ) {
2762             Stdout.formatln("tag({},{}) in reg {}", ti.tag, ti.index, r);
2763         }
2764         Stdout.formatln("Initializer:");
2765         foreach ( cmd; initializer ) {
2766             Stdout.formatln("{}", cmd.toString);
2767         }
2768         Stdout.formatln("Delta function:");
2769         foreach ( int i, s; states )
2770         {
2771             Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" ");
2772 
2773             bool first=true;
2774             Stdout(" {");
2775             foreach ( t; s.transitions )
2776             {
2777                 Stdout.format("{}{}->{} (", first?"":", ", t.predicate.toString, t.target.index);
2778                 bool firstcmd=true;
2779                 foreach ( cmd; t.commands )
2780                 {
2781                     if ( firstcmd )
2782                         firstcmd = false;
2783                     else
2784                         Stdout(",");
2785                     Stdout.format("{}", cmd.toString);
2786                 }
2787                 Stdout(")");
2788                 first = false;
2789             }
2790             Stdout("} (");
2791 
2792             bool firstcmd=true;
2793             foreach ( cmd; s.finishers )
2794             {
2795                 if ( firstcmd )
2796                     firstcmd = false;
2797                 else
2798                     Stdout(",");
2799                 Stdout.format("{}", cmd.toString);
2800             }
2801             Stdout.formatln(")");
2802         }
2803     }
2804 
2805 private:
2806     /* ********************************************************************************************
2807         A (TNFA state, tags) pair element of a subset state.
2808     **********************************************************************************************/
2809     class StateElement
2810     {
2811         TNFAState!(char_t)  nfa_state;
2812         int[uint]           tags;
2813         // use place-value priority with 2 places, value(maxPrio) > value(lastPrio)
2814         uint                maxPriority,
2815                             lastPriority;
2816 
2817         bool prioGreater(StateElement se)
2818         {
2819             if ( maxPriority < se.maxPriority )
2820                 return true;
2821             if ( maxPriority == se.maxPriority ) {
2822                 assert(lastPriority != se.lastPriority);
2823                 return lastPriority < se.lastPriority;
2824             }
2825             return false;
2826         }
2827 
2828         override int opCmp(Object o)
2829         {
2830             StateElement se = cast(StateElement)o;
2831             assert(se !is null);
2832             if ( maxPriority < se.maxPriority )
2833                 return 1;
2834             if ( maxPriority == se.maxPriority )
2835             {
2836                 if ( lastPriority == se.lastPriority )
2837                     return 0;
2838                 return lastPriority < se.lastPriority;
2839             }
2840             return -1;
2841         }
2842 
2843         override string toString()
2844         {
2845             const(char)[] str;
2846             str = Format.convert("{} p{}.{} {{", nfa_state.index, maxPriority, lastPriority);
2847             bool first = true;
2848             foreach ( k, v; tags ) {
2849                 str ~= Format.convert("{}m({},{})", first?"":",", k, v);
2850                 first = false;
2851             }
2852             str ~= "}";
2853             return str.idup;
2854         }
2855     }
2856 
2857     /* ********************************************************************************************
2858         Represents a state in the NFA to DFA conversion.
2859         Contains the set of states (StateElements) the NFA might be in at the same time and the
2860         corresponding DFA state that we create.
2861     **********************************************************************************************/
2862     class SubsetState
2863     {
2864         StateElement[]  elms;
2865         State           dfa_state;
2866 
2867         this(StateElement[] elms=null)
2868         {
2869             this.elms = elms;
2870         }
2871 
2872         int opApply(int delegate (ref TNFATransition!(char_t)) dg)
2873         {
2874             int res;
2875             foreach ( elm; elms )
2876             {
2877                 foreach ( t; elm.nfa_state.transitions )
2878                 {
2879                     res = dg(t);
2880                     if ( res )
2881                         return res;
2882                 }
2883             }
2884             return res;
2885         }
2886 
2887         override string toString()
2888         {
2889             string str = "[ ";
2890             bool first = true;
2891             foreach ( s; elms ) {
2892                 if ( !first )
2893                     str ~= ", ";
2894                 str ~= s.toString();
2895                 first = false;
2896             }
2897             return str~" ]";
2898         }
2899     }
2900 
2901     /* ********************************************************************************************
2902         Temporary structure used for disjoint predicate computation
2903     **********************************************************************************************/
2904     struct Mark
2905     {
2906         char_t  c;
2907         bool    end;    /// false = start of range
2908 
2909         int opCmp(const ref Mark m) const
2910         {
2911             if ( c < m.c )
2912                 return -1;
2913             if ( c > m.c )
2914                 return 1;
2915             if ( end < m.end )
2916                 return -1;
2917             if ( end > m.end )
2918                 return 1;
2919             return 0;
2920         }
2921     }
2922 
2923     /* ********************************************************************************************
2924         Calculates the register index for a given tag map entry. The TDFA implementation uses
2925         registers to save potential tag positions, the index space gets linearized here.
2926 
2927         Params:     tag =   tag number
2928                     index = tag map index
2929         Returns:    index of the register to use for the tag map entry
2930     **********************************************************************************************/
2931     uint registerFromTagIndex(uint tag, uint index)
2932     {
2933         if ( index > 0 )
2934         {
2935             TagIndex ti;
2936             ti.tag = tag;
2937             ti.index = index;
2938             uint* i = ti in registers;
2939             if ( i !is null )
2940                 return *i;
2941             return registers[ti] = next_register++;
2942         }
2943         else
2944             return tag-1;
2945     }
2946 
2947     Mark[] marks_;
2948 
2949     /* ********************************************************************************************
2950         Add new TDFA state to the automaton.
2951     **********************************************************************************************/
2952     State addState()
2953     {
2954         State s = new State;
2955         s.index = states.length;
2956         states ~= s;
2957         return s;
2958     }
2959 
2960     /* ********************************************************************************************
2961         Creates disjoint predicates from all outgoing, potentially overlapping TNFA transitions.
2962 
2963         Params:     state = SubsetState to create the predicates from
2964         Returns:    List of disjoint predicates that can be used for a DFA state
2965     **********************************************************************************************/
2966     predicate_t[] disjointPredicates(SubsetState state)
2967     {
2968         alias CharRange!(char_t) range_t;
2969         debug(tdfa) Stdout.formatln("disjointPredicates()");
2970 
2971         size_t num_marks;
2972         foreach ( t; state )
2973         {
2974             // partitioning will consider lookbehind transitions,
2975             // st. lb-closure will not expand for transitions with a superset of the lb-predicate
2976             if ( t.predicate.type != predicate_t.Type.epsilon )
2977             {
2978                 debug(tdfa) Stdout.formatln("{}", t.predicate.toString);
2979                 if ( marks_.length < num_marks+2*t.predicate.getInput().parts.length )
2980                     marks_.length = num_marks+2*t.predicate.getInput().parts.length;
2981                 foreach ( p; t.predicate.getInput().parts ) {
2982                     marks_[num_marks++] = Mark(p.l(), false);
2983                     marks_[num_marks++] = Mark(p.r(), true);
2984                 }
2985             }
2986         }
2987 
2988         if ( num_marks <= 1 )
2989             throw new Exception("disjointPredicates: No transitions in subset state");
2990 
2991         debug(tdfa) Stdout("\nsorting...").newline;
2992         // using built-in sort somtimes gives an AV in TypeInfo.swap
2993         quickSort(marks_[0 .. num_marks]);
2994         assert(!marks_[0].end);
2995 
2996         debug(tdfa)
2997         {
2998             Stdout("\nsorted marks:\n");
2999             bool first=true;
3000             foreach ( m; marks_[0 .. num_marks] )
3001             {
3002                 if ( first )
3003                     first = false;
3004                 else
3005                     Stdout(",");
3006                 if ( m.c > 0x20 && m.c < 0x7f )
3007                     Stdout.format("{}{}", m.end?"e":"s", m.c);
3008                 else
3009                     Stdout.format("{}{:x}", m.end?"e":"s", cast(int)m.c);
3010             }
3011             Stdout.newline;
3012         }
3013 
3014         size_t  next,
3015                 active = 1;
3016         char_t  start = marks_[0].c,
3017                 end;
3018         range_t[]   disjoint = new range_t[num_marks/2+1];
3019 
3020         foreach ( m; marks_[1 .. num_marks] )
3021         {
3022             if ( m.end )
3023             {
3024                 assert(active>0);
3025                 --active;
3026                 if ( m.c < start )
3027                     continue;
3028                 end = m.c;
3029                 // the next range cannot start at the same pos
3030                 // because starts are sorted before endings
3031                 if ( active > 0 )
3032                     ++m.c;
3033             }
3034             else
3035             {
3036                 ++active;
3037                 if ( active == 1 )
3038                 {
3039                     // skip uncovered interval
3040                     if ( m.c > start ) {
3041                         start = m.c;
3042                         continue;
3043                     }
3044                     end = m.c;
3045                     ++m.c;
3046                 }
3047                 // skip range start if cursor already marks it
3048                 else if ( m.c <= start )
3049                     continue;
3050                 else
3051                     end = m.c-1;
3052             }
3053 
3054             // save range
3055             if ( disjoint.length <= next )
3056                 disjoint.length = disjoint.length*2;
3057 
3058             disjoint[next].l_ = start;
3059             disjoint[next].r_ = end;
3060             ++next;
3061 
3062             // advance cursor
3063             start = m.c;
3064         }
3065         disjoint.length = next;
3066 
3067         // merge isolated ranges into sets of ranges
3068         // no range in a set may occur separated from the others in any predicate
3069         predicate_t[]   preds;
3070         preds.length = 1;
3071         Lmerge: foreach ( r; disjoint )
3072         {
3073             if ( preds[$-1].empty() )
3074                 preds[$-1].appendInput(r);
3075             else
3076             {
3077                 // we can merge r into the current predicate if
3078                 // pred contains r <=> pred contains all the other ranges
3079                 foreach ( t; state )
3080                 {
3081                     if ( t.predicate.type == predicate_t.Type.epsilon )
3082                         continue;
3083 
3084                     if ( t.predicate.getInput().contains(r)
3085                         != t.predicate.getInput().contains(preds[$-1].getInput()) )
3086                     {
3087                         preds.length = preds.length+1;
3088                         break;
3089                     }
3090                 }
3091                 preds[$-1].appendInput(r);
3092             }
3093         }
3094 
3095         debug(tdfa)
3096         {
3097             Stdout("\ndisjoint ranges:\n");
3098             first=true;
3099             foreach ( r; disjoint )
3100             {
3101                 if ( first )
3102                     first = false;
3103                 else
3104                     Stdout(",");
3105                 Stdout.format("{}", r);
3106             }
3107             Stdout.newline;
3108             Stdout("\ndisjoint predicates:\n");
3109             first=true;
3110             foreach ( ref p; preds )
3111             {
3112                 if ( first )
3113                     first = false;
3114                 else
3115                     Stdout(",");
3116                 Stdout.format("{}", p.toString);
3117             }
3118             Stdout.newline;
3119         }
3120 
3121         debug(tdfa) Stdout.formatln("disjointPredicates() end");
3122         return preds;
3123     }
3124 
3125     /* ********************************************************************************************
3126         Finds all TNFA states that can be reached directly with the given predicate and creates
3127         a new SubsetState containing those target states.
3128 
3129         Params:     subst = SubsetState to start from
3130                     pred =  predicate that is matched against outgoing transitions
3131         Returns:    SubsetState containing the reached target states
3132     **********************************************************************************************/
3133     SubsetState reach(SubsetState subst, ref predicate_t pred)
3134     {
3135         // to handle the special case of overlapping consume and lookahead predicates,
3136         // we find the different intersecting predicate types
3137         bool    have_consume,
3138                 have_lookahead;
3139         foreach ( t; subst )
3140         {
3141             if ( t.predicate.type != predicate_t.Type.consume && t.predicate.type != predicate_t.Type.lookahead )
3142                 continue;
3143             auto intpred = t.predicate.intersect(pred);
3144             if ( !intpred.empty() )
3145             {
3146                 if ( t.predicate.type == predicate_t.Type.consume )
3147                     have_consume = true;
3148                 else if ( t.predicate.type == predicate_t.Type.lookahead )
3149                     have_lookahead = true;
3150             }
3151         }
3152 
3153         // if there is consume/lookahead overlap,
3154         // lookahead predicates are handled first
3155         predicate_t.Type processed_type;
3156         if ( have_lookahead )
3157             processed_type = predicate_t.Type.lookahead;
3158         else if ( have_consume )
3159             processed_type = predicate_t.Type.consume;
3160         else {
3161             debug(tdfa) Stdout.formatln("\nERROR: reach found no consume/lookahead symbol for {} in \n{}", pred.toString, subst.toString);
3162             return null;
3163         }
3164         pred.type = processed_type;
3165 
3166         // add destination states to new subsetstate
3167         SubsetState r = new SubsetState;
3168         foreach ( s; subst.elms )
3169         {
3170             foreach ( t; s.nfa_state.transitions )
3171             {
3172                 if ( t.predicate.type != processed_type )
3173                     continue;
3174                 auto intpred = t.predicate.intersect(pred);
3175                 if ( !intpred.empty() ) {
3176                     StateElement se = new StateElement;
3177                     se.maxPriority = max(t.priority, s.maxPriority);
3178                     se.lastPriority = t.priority;
3179                     se.nfa_state = t.target;
3180                     se.tags = s.tags;
3181                     r.elms ~= se;
3182                 }
3183             }
3184         }
3185 
3186         // if we prioritized lookaheads, the states that may consume are also added to the new subset state
3187         // this behaviour is somewhat similar to an epsilon closure
3188         if ( have_lookahead && have_consume )
3189         {
3190             foreach ( s; subst.elms )
3191             {
3192                 foreach ( t; s.nfa_state.transitions )
3193                 {
3194                     if ( t.predicate.type != predicate_t.Type.consume )
3195                         continue;
3196                     auto intpred = t.predicate.intersect(pred);
3197                     if ( !intpred.empty() ) {
3198                         r.elms ~= s;
3199                         break;
3200                     }
3201                 }
3202             }
3203         }
3204         return r;
3205     }
3206 
3207     /* ********************************************************************************************
3208         Extends the given SubsetState with the states that are reached through lookbehind transitions.
3209 
3210         Params:     from =      SubsetState to create the lookbehind closure for
3211                     previous =  predicate "from" was reached with
3212         Returns:    SubsetState containing "from" and all states of it's lookbehind closure
3213     **********************************************************************************************/
3214     SubsetState lookbehindClosure(SubsetState from, predicate_t pred)
3215     {
3216         List!(StateElement) stack = new List!(StateElement);
3217         StateElement[size_t]  closure;
3218 
3219         foreach ( e; from.elms )
3220         {
3221             stack ~= e;
3222             closure[e.nfa_state.index] = e;
3223         }
3224 
3225         while ( !stack.empty() )
3226         {
3227             StateElement se = stack.tail.value;
3228             stack.pop();
3229             foreach ( t; se.nfa_state.transitions )
3230             {
3231                 if ( t.predicate.type != predicate_t.Type.lookbehind )
3232                     continue;
3233                 if ( t.predicate.intersect(pred).empty() )
3234                     continue;
3235                 uint new_maxPri = max(t.priority, se.maxPriority);
3236 
3237                 StateElement* tmp = t.target.index in closure;
3238                 if ( tmp !is null )
3239                 {
3240                     // if higher prio (smaller value) exists, do not use this transition
3241                     if ( tmp.maxPriority < new_maxPri ) {
3242 //                         debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}, continuing", t.target.index, tmp.maxPriority, new_maxPri);
3243                         continue;
3244                     }
3245                     else if ( tmp.maxPriority == new_maxPri )
3246                     {
3247                         // "equal lastPrio -> first-come-first-serve"
3248                         // doesn't work for lexer - how to solve it properly?
3249                         if ( tmp.lastPriority <= t.priority ) {
3250 //                             debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}, continuing", t.target.index, tmp.lastPriority, t.priority);
3251                             continue;
3252                         }
3253 //                         else
3254 //                             debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}", t.target.index, t.priority, tmp.lastPriority);
3255                     }
3256 //                     else
3257 //                         debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}", t.target.index, new_maxPri, tmp.maxPriority);
3258                 }
3259 
3260                 StateElement new_se = new StateElement;
3261                 new_se.maxPriority = max(t.priority, se.maxPriority);
3262                 new_se.lastPriority = t.priority;
3263                 new_se.nfa_state = t.target;
3264                 new_se.tags = se.tags;
3265 
3266                 closure[t.target.index] = new_se;
3267                 stack ~= new_se;
3268             }
3269         }
3270 
3271         SubsetState res = new SubsetState;
3272         res.elms = closure.values;
3273         return res;
3274     }
3275 
3276     /* ********************************************************************************************
3277         Generates the epsilon closure of the given subset state, creating tag map entries
3278         if tags are passed. Takes priorities into account, effectively realizing
3279         greediness and reluctancy.
3280 
3281         Params:     from =      SubsetState to create the epsilon closure for
3282                     previous =  SubsetState "from" was reached from
3283         Returns:    SubsetState containing "from" and all states of it's epsilon closure
3284     **********************************************************************************************/
3285     SubsetState epsilonClosure(SubsetState from, SubsetState previous)
3286     {
3287         int firstFreeIndex=-1;
3288         foreach ( e; previous.elms )
3289         {
3290             foreach ( ti; e.tags )
3291                 firstFreeIndex = max(firstFreeIndex, cast(int)ti);
3292         }
3293         ++firstFreeIndex;
3294 
3295         List!(StateElement) stack = new List!(StateElement);
3296         StateElement[size_t]  closure;
3297 
3298         foreach ( e; from.elms )
3299         {
3300             stack ~= e;
3301             closure[e.nfa_state.index] = e;
3302         }
3303 
3304         while ( !stack.empty() )
3305         {
3306             StateElement se = stack.tail.value;
3307             stack.pop();
3308             foreach ( t; se.nfa_state.transitions )
3309             {
3310                 if ( t.predicate.type != predicate_t.Type.epsilon )
3311                     continue;
3312                 // this is different from Ville Laurikari's algorithm, but it's crucial
3313                 // to take the max (instead of t.priority) to make reluctant operators work
3314                 uint new_maxPri = max(t.priority, se.maxPriority);
3315 
3316                 StateElement* tmp = t.target.index in closure;
3317                 if ( tmp !is null )
3318                 {
3319                     // if higher prio (smaller value) exists, do not use this transition
3320                     if ( tmp.maxPriority < new_maxPri ) {
3321 //                         debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}, continuing", t.target.index, tmp.maxPriority, new_maxPri);
3322                         continue;
3323                     }
3324                     else if ( tmp.maxPriority == new_maxPri )
3325                     {
3326                         // "equal lastPrio -> first-come-first-serve"
3327                         // doesn't work for lexer - how to solve it properly?
3328                         if ( tmp.lastPriority <= t.priority ) {
3329 //                             debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}, continuing", t.target.index, tmp.lastPriority, t.priority);
3330                             continue;
3331                         }
3332 //                         else
3333 //                             debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}", t.target.index, t.priority, tmp.lastPriority);
3334                     }
3335 //                     else
3336 //                         debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}", t.target.index, new_maxPri, tmp.maxPriority);
3337                 }
3338 
3339                 auto new_se = new StateElement;
3340                 new_se.maxPriority = new_maxPri;
3341                 new_se.lastPriority = t.priority;
3342                 new_se.nfa_state = t.target;
3343 
3344                 if ( t.tag > 0 )
3345                 {
3346                     foreach ( k, v; se.tags )
3347                         new_se.tags[k] = v;
3348                     new_se.tags[t.tag] = firstFreeIndex;
3349                 }
3350                 else
3351                     new_se.tags = se.tags;
3352 
3353                 closure[t.target.index] = new_se;
3354                 stack ~= new_se;
3355             }
3356         }
3357 
3358         SubsetState res = new SubsetState;
3359         res.elms = closure.values;
3360 
3361         // optimize tag usage
3362         // all we need to do is to check whether the largest tag-index from the
3363         // previous state is actually used in the new state and move all tags with
3364         // firstFreeIndex down by one if not, but only if firstFreeIndex is not 0
3365         if ( firstFreeIndex > 0 )
3366         {
3367             bool seenLastUsedIndex = false;
3368             sluiLoop: foreach ( e; res.elms )
3369             {
3370                 foreach ( i; e.tags )
3371                 {
3372                     if ( i == firstFreeIndex-1 ) {
3373                         seenLastUsedIndex = true;
3374                         break sluiLoop;
3375                     }
3376                 }
3377             }
3378             if ( !seenLastUsedIndex )
3379             {
3380                 foreach ( e; res.elms )
3381                 {
3382                     foreach ( ref i; e.tags )
3383                     {
3384                         // mark index by making it negative
3385                         // to signal that it can be decremented
3386                         // after it has been detected to be a newly used index
3387                         if ( i == firstFreeIndex )
3388                             i = -firstFreeIndex;
3389                     }
3390                 }
3391             }
3392         }
3393 
3394         return res;
3395     }
3396 
3397     /* ********************************************************************************************
3398         Tries to create commands that reorder the tag map of "previous", such that "from" becomes
3399         tag-wise identical to "to". If successful, these commands are added to "trans". This
3400         is done for state re-use.
3401 
3402         Params:     from =      SubsetState to check for tag-wise equality to "to"
3403                     to =        existing SubsetState that we want to re-use
3404                     previous =  SubsetState we're coming from
3405                     trans =     Transition we went along
3406         Returns:    true if "from" is tag-wise identical to "to" and the necessary commands have
3407                     been added to "trans"
3408     **********************************************************************************************/
3409     bool reorderTagIndeces(SubsetState from, SubsetState to, SubsetState previous, Transition trans)
3410     {
3411         if ( from.elms.length != to.elms.length )
3412             return false;
3413 
3414         bool[Command]
3415             cmds;
3416         uint[TagIndex]
3417             reorderedIndeces;
3418         StateElement[TagIndex]
3419             reordered_elements;
3420 
3421         Louter: foreach ( fe; from.elms )
3422         {
3423             foreach ( te; to.elms )
3424             {
3425                 if ( te.nfa_state.index != fe.nfa_state.index )
3426                     continue;
3427                 if ( fe.tags.length != te.tags.length )
3428                     return false;
3429                 foreach ( tag, findex; fe.tags )
3430                 {
3431                     if ( (tag in te.tags) is null )
3432                         return false;
3433 
3434                     TagIndex ti;
3435                     ti.tag = tag;
3436                     ti.index = te.tags[tag];
3437 
3438                     // apply priority for conflicting tag indeces
3439                     if ( (ti in reorderedIndeces) !is null )
3440                     {
3441                         auto rse = reordered_elements[ti];
3442                         auto ri = reorderedIndeces[ti];
3443                         if ( ri != findex
3444                             && ( rse.maxPriority < fe.maxPriority
3445                                 || rse.maxPriority == fe.maxPriority
3446                                 && rse.lastPriority <= fe.lastPriority )
3447                         )
3448                             continue;
3449                         Command cmd;
3450                         cmd.src = registerFromTagIndex(tag,ri);
3451                         cmd.dst = registerFromTagIndex(tag,te.tags[tag]);
3452                         cmds.remove(cmd);
3453                     }
3454                     // if target index differs, create reordering command
3455                     if ( te.tags[tag] != findex )
3456                     {
3457                         Command cmd;
3458                         cmd.src = registerFromTagIndex(tag,findex);
3459                         cmd.dst = registerFromTagIndex(tag,te.tags[tag]);
3460                         cmds[cmd] = true;
3461                     }
3462 
3463                     reorderedIndeces[ti] = findex;
3464                     reordered_elements[ti] = fe;
3465                 }
3466                 continue Louter;
3467             }
3468             return false;
3469         }
3470 
3471         debug(tdfa) {
3472             Stdout.formatln("\nreorder {} to {}\n", from.toString, to.dfa_state.index);
3473         }
3474 
3475         trans.commands ~= cmds.keys;
3476         return true;
3477     }
3478 
3479     /* ********************************************************************************************
3480         Generate tag map initialization commands for start state.
3481     **********************************************************************************************/
3482     void generateInitializers(SubsetState start)
3483     {
3484         uint[uint] cmds;
3485         foreach ( nds; start.elms )
3486         {
3487             foreach ( k, v; nds.tags )
3488                 cmds[k] = v;
3489         }
3490 
3491         foreach ( k, v; cmds ) {
3492             Command cmd;
3493             cmd.dst = registerFromTagIndex(k,v);
3494             cmd.src = CURRENT_POSITION_REGISTER;
3495             initializer ~= cmd;
3496         }
3497     }
3498 
3499     /* ********************************************************************************************
3500         Generates finisher commands for accepting states.
3501     **********************************************************************************************/
3502     void generateFinishers(SubsetState r)
3503     {
3504         // if at least one of the TNFA states accepts,
3505         // set the finishers from active tags in increasing priority
3506         StateElement[]  sorted_elms = r.elms.dup;
3507         tango.core.Array.sort(sorted_elms);
3508         bool reluctant = false;
3509         foreach ( se; sorted_elms ) {
3510             debug (Finishers) Stdout.formatln("Finisher: {}", se);
3511             if ( se.nfa_state.accept )
3512             {
3513                 r.dfa_state.accept = true;
3514 
3515                 // Knowing that we're looking at an epsilon closure with an accepting
3516                 // state, we look at the involved transitions - if the path from the
3517                 // nfa state in the set with the highest incoming priority (last in
3518                 // sorted_elms list) to the accepting nfa state is via the highest
3519                 // priority transitions, and they are all epsilon transitions, this
3520                 // suggests we're looking at a regex ending with a reluctant pattern.
3521                 // The NFA->DFA transformation will most likely extend the automata
3522                 // further, but we want the matching to stop here.
3523                 // NOTE: The grounds for choosing the last element in sorted_elms
3524                 // are somewhat weak (empirical testing), but sofar no new
3525                 // regressions have been discovered. larsivi 20090827
3526                 TNFATransition!(char_t) highestPriTrans;
3527                 if (!(sorted_elms[$-1] && sorted_elms[$-1].nfa_state &&
3528                             sorted_elms[$-1].nfa_state))
3529                     throw new Exception ("Something is NULL that is expected to
3530                             be non-null", __FILE__, __LINE__);
3531 
3532                 foreach ( trans; sorted_elms[$-1].nfa_state.transitions ) {
3533                     if (trans.canFinish()) {
3534                         r.dfa_state.reluctant = true;
3535                         break;
3536                     }
3537                 }
3538 
3539                 bool[uint]  finished_tags;
3540                 {
3541                     foreach ( t, i; se.tags )
3542                         if ( i > 0 && !(t in finished_tags) ) {
3543                             finished_tags[t] = true;
3544                             Command cmd;
3545                             cmd.dst = registerFromTagIndex(t, 0);
3546                             cmd.src = registerFromTagIndex(t, i);
3547                             r.dfa_state.finishers ~= cmd;
3548                         }
3549                 }
3550             }
3551         }
3552     }
3553 
3554     /* ********************************************************************************************
3555         Assumes that the command-lists are sorted and transitions are optimized
3556     **********************************************************************************************/
3557     void minimizeDFA()
3558     {
3559         class DiffTable
3560         {
3561             this(size_t num) {
3562                 diff_ = new bool[num*(num+1)/2];
3563             }
3564 
3565             ~this() { diff_.destroy; }
3566 
3567             bool opCall(size_t a, size_t b)
3568             {
3569                 if ( a < b )
3570                     return diff_[b*(b+1)/2+a];
3571                 return diff_[a*(a+1)/2+b];
3572             }
3573 
3574             void set(size_t a, size_t b)
3575             {
3576                 if ( a < b )
3577                     diff_[b*(b+1)/2+a] = true;
3578                 else
3579                     diff_[a*(a+1)/2+b] = true;
3580             }
3581 
3582             bool[]  diff_;
3583         }
3584 
3585         debug(tdfa) Stdout.formatln("Minimizing TDFA");
3586 
3587         scope diff = new DiffTable(states.length);
3588         bool new_diff = true;
3589 
3590         while ( new_diff )
3591         {
3592             new_diff = false;
3593             foreach ( i, a; states[0 .. $-1] )
3594             {
3595                 Linner: foreach ( j, b; states[i+1 .. $] )
3596                 {
3597                     if ( diff(i, j+i+1) )
3598                         continue;
3599 
3600                     // assume optimized transitions
3601                     if ( a.accept != b.accept || a.transitions.length != b.transitions.length ) {
3602                         diff.set(i, j+i+1);
3603                         new_diff = true;
3604                         continue;
3605                     }
3606 
3607                     if ( a.accept ) // b accepts too
3608                     {
3609                         // assume sorted finishers
3610                         if ( a.finishers.length != b.finishers.length ) {
3611                             diff.set(i, j+i+1);
3612                             new_diff = true;
3613                             continue;
3614                         }
3615                         foreach ( k, cmd; a.finishers )
3616                         {
3617                             if ( cmd != b.finishers[k] ) {
3618                                 diff.set(i, j+i+1);
3619                                 new_diff = true;
3620                                 continue Linner;
3621                             }
3622                         }
3623                     }
3624 
3625                     Ltrans: foreach ( ta; a.transitions )
3626                     {
3627                         foreach ( tb; b.transitions )
3628                         {
3629                             if ( ta.predicate.intersects(tb.predicate) )
3630                             {
3631                                 if ( diff(ta.target.index, tb.target.index) ) {
3632                                     diff.set(i, j+i+1);
3633                                     new_diff = true;
3634                                     continue Linner;
3635                                 }
3636                                 // assume sorted commands
3637                                 if ( ta.commands.length != tb.commands.length ) {
3638                                     diff.set(i, j+i+1);
3639                                     new_diff = true;
3640                                     continue Linner;
3641                                 }
3642                                 foreach ( k, cmd; ta.commands )
3643                                 {
3644                                     if ( cmd != tb.commands[k] ) {
3645                                         diff.set(i, j+i+1);
3646                                         new_diff = true;
3647                                         continue Linner;
3648                                     }
3649                                 }
3650                                 continue Ltrans;
3651                             }
3652                         }
3653 
3654                         diff.set(i, j+i+1);
3655                         new_diff = true;
3656                         continue Linner;
3657                     }
3658 
3659                 }
3660             }
3661         }
3662 
3663         foreach ( i, a; states[0 .. $-1] )
3664         {
3665             foreach ( j, b; states[i+1 .. $] )
3666             {
3667                 if ( !diff(i, j+i+1) )
3668                 {
3669                     debug(tdfa) Stdout.formatln("State {} == {}", i, j+i+1);
3670                     // remap b to a
3671                     foreach ( k, c; states )
3672                     {
3673                         foreach ( t; c.transitions )
3674                         {
3675                             if ( t.target.index == j+i+1 )
3676                                 t.target = a;
3677                         }
3678                     }
3679                 }
3680             }
3681         }
3682 
3683     }
3684 }
3685 import tango.text.Util;
3686 
3687 /**************************************************************************************************
3688     Regular expression compiler and interpreter.
3689 **************************************************************************************************/
3690 class RegExpT(char_t)
3691 {
3692     alias TDFA!(dchar)      tdfa_t;
3693     alias TNFA!(dchar)      tnfa_t;
3694     alias CharClass!(dchar) charclass_t;
3695     alias Predicate!(dchar) predicate_t;
3696 
3697     /**********************************************************************************************
3698         Construct a RegExpT object.
3699         Params:
3700             pattern = Regular expression.
3701         Throws: RegExpException if there are any compilation errors.
3702         Example:
3703             Declare two variables and assign to them a Regex object:
3704             ---
3705             auto r = new Regex("pattern");
3706             auto s = new Regex(r"p[1-5]\s*");
3707             ---
3708     **********************************************************************************************/
3709     this(const(char_t)[] pattern, const(char_t)[] attributes=null)
3710     {
3711         this(pattern, false, true);
3712     }
3713 
3714     /** ditto */
3715     this(const(char_t)[] pattern, bool swapMBS, bool unanchored, bool printNFA=false)
3716     {
3717         pattern_ = pattern;
3718 
3719         debug(TangoRegex) {}
3720         else {
3721             static if ( is(char_t == dchar) ) {
3722                 scope tnfa_t tnfa_ = new tnfa_t(pattern_);
3723             }
3724             else {
3725                 scope tnfa_t tnfa_ = new tnfa_t(tango.text.convert.Utf.toString32(pattern_));
3726             }
3727         }
3728        
3729         tnfa_.swapMatchingBracketSyntax = swapMBS;
3730         tnfa_.parse(unanchored);
3731         if ( printNFA ) {
3732             debug(TangoRegex) Stdout.formatln("\nTNFA:");
3733             debug(TangoRegex) tnfa_.print;
3734         }
3735         tdfa_ = new tdfa_t(tnfa_);
3736         registers_.length = tdfa_.num_regs();
3737     }
3738 
3739     /**********************************************************************************************
3740         Generate instance of Regex.
3741         Params:
3742             pattern = Regular expression.
3743         Throws: RegExpException if there are any compilation errors.
3744         Example:
3745             Declare two variables and assign to them a Regex object:
3746             ---
3747             auto r = Regex("pattern");
3748             auto s = Regex(r"p[1-5]\s*");
3749             ---
3750     **********************************************************************************************/
3751     static RegExpT!(char_t) opCall(const(char_t)[] pattern, const(char_t)[] attributes = null)
3752     {
3753         return new RegExpT!(char_t)(pattern, attributes);
3754     }
3755 
3756     /**********************************************************************************************
3757         Set up for start of foreach loop.
3758         Returns:    Instance of RegExpT set up to search input.
3759         Example:
3760             ---
3761             import tango.io.Stdout;
3762             import tango.text.Regex;
3763 
3764             void main()
3765             {
3766                 foreach(m; Regex("ab").search("qwerabcabcababqwer"))
3767                     Stdout.formatln("{}[{}]{}", m.pre, m.match(0), m.post);
3768             }
3769             // Prints:
3770             // qwer[ab]cabcababqwer
3771             // qwerabc[ab]cababqwer
3772             // qwerabcabc[ab]abqwer
3773             // qwerabcabcab[ab]qwer
3774             ---
3775     **********************************************************************************************/
3776     public RegExpT!(char_t) search(const(char_t)[] input)
3777     {
3778         input_ = input;
3779         next_start_ = 0;
3780         last_start_ = 0;
3781         return this;
3782     }
3783 
3784     /** ditto */
3785     public int opApply(int delegate(RegExpT!(char_t)) dg)
3786     {
3787         int result;
3788         while ( !result && test() )
3789             result = dg(this);
3790         return result;
3791     }
3792 
3793     /**********************************************************************************************
3794         Search input for match.
3795         Returns: false for no match, true for match
3796     **********************************************************************************************/
3797     bool test(const(char_t)[] input)
3798     {
3799         this.input_ = input;
3800         next_start_ = 0;
3801         last_start_ = 0;
3802         return test();
3803     }
3804 
3805     /**********************************************************************************************
3806         Pick up where last test(input) or test() left off, and search again.
3807         Returns: false for no match, true for match
3808     **********************************************************************************************/
3809     bool test()
3810     {
3811         // initialize registers
3812         assert(registers_.length == tdfa_.num_regs());
3813         registers_[0..$] = -1;
3814         foreach ( cmd; tdfa_.initializer ) {
3815             assert(cmd.src == tdfa_.CURRENT_POSITION_REGISTER);
3816             registers_[cmd.dst] = 0;
3817         }
3818 
3819         tdfa_t.Transition* tp, tp_end;
3820 
3821         // DFA execution
3822         auto inp = input_[next_start_ .. $];
3823         auto s = tdfa_.start;
3824 
3825         debug(TangoRegex) Stdout.formatln("{}{}: {}", s.accept?"*":" ", s.index, inp);
3826         LmainLoop: for ( size_t p, next_p; p < inp.length; )
3827         {
3828         Lread_char:
3829             dchar c = cast(dchar)inp[p];
3830             if ( c & 0x80 )
3831                 c = decode(inp, next_p);
3832             else
3833                 next_p = p+1;
3834 
3835         Lprocess_char:
3836             debug(TangoRegex) Stdout.formatln("{} (0x{:x})", c, cast(int)c);
3837 
3838             tdfa_t.Transition t = void;
3839             switch ( s.mode )
3840             {
3841                 case s.Mode.LOOKUP:
3842                     if ( c < s.LOOKUP_LENGTH )
3843                     {
3844                         debug(TangoRegex) Stdout.formatln("lookup");
3845                         auto i = s.lookup[c];
3846                         if ( i == s.INVALID_STATE )
3847                             break LmainLoop;
3848                         t = s.transitions[ i ];
3849                         if ( t.predicate.type != t.predicate.Type.consume )
3850                             goto Lno_consume;
3851                         goto Lconsume;
3852                     }
3853                     break LmainLoop;
3854 
3855                 case s.Mode.MIXED:
3856                     if ( c < s.LOOKUP_LENGTH )
3857                     {
3858                         debug(TangoRegex) Stdout.formatln("mixed");
3859                         auto i = s.lookup[c];
3860                         if ( i == s.INVALID_STATE )
3861                             break;
3862                         t = s.transitions[ i ];
3863                         if ( t.predicate.type != t.predicate.Type.consume )
3864                             goto Lno_consume;
3865                         goto Lconsume;
3866                     }
3867                     break;
3868 
3869                 case s.Mode.GENERIC:
3870                 default:
3871                     break;
3872             }
3873 
3874             Ltrans_loop: for ( tp = &s.generic_transitions[0], tp_end = tp+s.generic_transitions.length;
3875                 tp < tp_end; ++tp )
3876             {
3877                 t = *tp;
3878                 switch ( t.predicate.mode )
3879                 {
3880                     // single char
3881                     case predicate_t.MatchMode.single_char:
3882                         debug(TangoRegex) Stdout.formatln("single char 0x{:x} == 0x{:x}", cast(int)c, cast(int)t.predicate.data_chr);
3883                         if ( c != t.predicate.data_chr )
3884                             continue Ltrans_loop;
3885                         goto Lconsume;
3886                     case predicate_t.MatchMode.single_char_l:
3887                         debug(TangoRegex) Stdout.formatln("single char 0x{:x} == 0x{:x}", cast(int)c, cast(int)t.predicate.data_chr);
3888                         if ( c != t.predicate.data_chr )
3889                             continue Ltrans_loop;
3890                         goto Lno_consume;
3891 
3892                     // bitmap
3893                     case predicate_t.MatchMode.bitmap:
3894                         debug(TangoRegex) Stdout.formatln("bitmap {}\n{}", c, t.predicate.toString);
3895                         if ( c <= predicate_t.MAX_BITMAP_LENGTH && ( t.predicate.data_bmp[c/8] & (1 << (c&7)) ) )
3896                             goto Lconsume;
3897                         continue Ltrans_loop;
3898                     case predicate_t.MatchMode.bitmap_l:
3899                         debug(TangoRegex) Stdout.formatln("bitmap {}\n{}", c, t.predicate.toString);
3900                         if ( c <= predicate_t.MAX_BITMAP_LENGTH && ( t.predicate.data_bmp[c/8] & (1 << (c&7)) ) )
3901                             goto Lno_consume;
3902                         continue Ltrans_loop;
3903 
3904                     // string search
3905                     case predicate_t.MatchMode.string_search:
3906                         debug(TangoRegex) Stdout.formatln("string search {} in {}", c, t.predicate.data_str);
3907                         if ( indexOf(t.predicate.data_str.ptr, c, t.predicate.data_str.length) >= t.predicate.data_str.length )
3908                             continue Ltrans_loop;
3909                         goto Lconsume;
3910                     case predicate_t.MatchMode.string_search_l:
3911                         debug(TangoRegex) Stdout.formatln("string search {} in {}", c, t.predicate.data_str);
3912                         if ( indexOf(t.predicate.data_str.ptr, c, t.predicate.data_str.length) >= t.predicate.data_str.length )
3913                             continue Ltrans_loop;
3914                         goto Lno_consume;
3915 
3916                     // generic
3917                     case predicate_t.MatchMode.generic:
3918                         debug(TangoRegex) Stdout.formatln("generic {}\n{}", c, t.predicate.toString);
3919                         for ( auto cmp = t.predicate.data_str.ptr,
3920                             cmpend = cmp + t.predicate.data_str.length;
3921                             cmp < cmpend; ++cmp )
3922                         {
3923                             if ( c < *cmp ) {
3924                                 ++cmp;
3925                                 continue;
3926                             }
3927                             ++cmp;
3928                             if ( c <= *cmp )
3929                                 goto Lconsume;
3930                         }
3931                         continue Ltrans_loop;
3932                     case predicate_t.MatchMode.generic_l:
3933                         debug(TangoRegex) Stdout.formatln("generic {}\n{}", c, t.predicate.toString);
3934                         for ( auto cmp = t.predicate.data_str.ptr,
3935                             cmpend = cmp + t.predicate.data_str.length;
3936                             cmp < cmpend; ++cmp )
3937                         {
3938                             if ( c < *cmp ) {
3939                                 ++cmp;
3940                                 continue;
3941                             }
3942                             ++cmp;
3943                             if ( c <= *cmp )
3944                                 goto Lno_consume;
3945                         }
3946                         continue Ltrans_loop;
3947 
3948                     default:
3949                         assert(0);
3950                 }
3951 
3952             Lconsume:
3953                 p = next_p;
3954             Lno_consume:
3955 
3956                 s = t.target;
3957                 debug(TangoRegex) Stdout.formatln("{}{}: {}", s.accept?"*":" ", s.index, inp[p..$]);
3958                 debug(TangoRegex) Stdout.formatln("{} commands", t.commands.length);
3959 
3960                 foreach ( cmd; t.commands )
3961                 {
3962                     if ( cmd.src == tdfa_.CURRENT_POSITION_REGISTER )
3963                         registers_[cmd.dst] = cast(int)p;
3964                     else
3965                         registers_[cmd.dst] = registers_[cmd.src];
3966                 }
3967 
3968                 if (s.accept && s.reluctant)
3969                     // Don't continue matching, the current find should be correct
3970                     goto Laccept;
3971 
3972                 // if all input was consumed and we do not already accept, try to
3973                 // add an explicit string/line end
3974                 if ( p >= inp.length )
3975                 {
3976                     if ( s.accept || c == 0 )
3977                         break;
3978                     c = 0;
3979                     goto Lprocess_char;
3980                 }
3981                 goto Lread_char;
3982             }
3983             // no applicable transition
3984             break;
3985         }
3986 
3987         if ( s.accept )
3988         {
3989         Laccept:
3990             foreach ( cmd; s.finishers ) {
3991                 assert(cmd.src != tdfa_.CURRENT_POSITION_REGISTER);
3992                 registers_[cmd.dst] = registers_[cmd.src];
3993             }
3994             if ( registers_.length > 1 && registers_[1] >= 0 ) {
3995                 last_start_ = next_start_;
3996                 next_start_ += registers_[1];
3997             }
3998             return true;
3999         }
4000 
4001         return false;
4002     }
4003 
4004     /**********************************************************************************************
4005         Return submatch with the given index.
4006         Params:
4007             index = 0 returns whole match, index > 0 returns submatch of bracket #index
4008         Returns:
4009             Slice of input for the requested submatch, or null if no such submatch exists.
4010     **********************************************************************************************/
4011     const(char_t)[] match(uint index)
4012     {
4013         if ( index > tdfa_.num_tags )
4014             return null;
4015         int start   = cast(int)last_start_+registers_[index*2],
4016             end     = cast(int)last_start_+registers_[index*2+1];
4017         if ( start >= 0 && start < end && end <= input_.length )
4018             return input_[start .. end];
4019         return null;
4020     }
4021 
4022     /** ditto */
4023     const(char_t)[] opIndex(uint index)
4024     {
4025         return match(index);
4026     }
4027 
4028     /**********************************************************************************************
4029         Return the slice of the input that precedes the matched substring.
4030         If no match was found, null is returned.
4031     **********************************************************************************************/
4032     const(char_t)[] pre()
4033     {
4034         auto start = registers_[0];
4035         if ( start < 0 )
4036             return null;
4037         return input_[0 .. last_start_+start];
4038     }
4039 
4040     /**********************************************************************************************
4041         Return the slice of the input that follows the matched substring.
4042         If no match was found, the whole slice of the input that was processed in the last test.
4043     **********************************************************************************************/
4044     const(char_t)[] post()
4045     {
4046         if ( registers_[1] >= 0 )
4047             return input_[next_start_ .. $];
4048         return input_[last_start_ .. $];
4049     }
4050 
4051     /**********************************************************************************************
4052         Splits the input at the matches of this regular expression into an array of slices.
4053         Example:
4054             ---
4055             import tango.io.Stdout;
4056             import tango.text.Regex;
4057 
4058             void main()
4059             {
4060                 auto strs = Regex("ab").split("abcabcababqwer");
4061                 foreach( s; strs )
4062                     Stdout.formatln("{}", s);
4063             }
4064             // Prints:
4065             // c
4066             // c
4067             // qwer
4068             ---
4069     **********************************************************************************************/
4070     inout(char_t)[][] split(inout(char_t)[] input)
4071     {
4072         auto res = new inout(char_t)[][PREALLOC];
4073         uint index;
4074         inout(char_t)[] tmp = input;
4075 
4076         foreach ( r; search(input) )
4077         {
4078             tmp = cast(inout(char_t)[])pre();
4079             res[index++] = tmp[last_start_ .. $];
4080             if ( index >= res.length )
4081                 res.length = res.length*2;
4082             tmp = cast(inout(char_t)[])post();
4083         }
4084 
4085         res[index++] = tmp;
4086         res.length = index;
4087         return res;
4088     }
4089 
4090     /**********************************************************************************************
4091         Returns a copy of the input with all matches replaced by replacement.
4092     **********************************************************************************************/
4093     char_t[] replaceAll(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null)
4094     {
4095         const(char_t)[] tmp = input;
4096         if ( output_buffer.length <= 0 )
4097             output_buffer = new char_t[input.length+replacement.length];
4098         output_buffer.length = 0;
4099 
4100         foreach ( r; search(input) )
4101         {
4102             tmp = pre();
4103             if ( tmp.length > last_start_ )
4104                 output_buffer ~= tmp[last_start_ .. $];
4105             output_buffer ~= replacement;
4106             tmp = post();
4107         }
4108         output_buffer ~= tmp;
4109         return output_buffer;
4110     }
4111 
4112     /**********************************************************************************************
4113         Returns a copy of the input with the last match replaced by replacement.
4114     **********************************************************************************************/
4115     char_t[] replaceLast(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null)
4116     {
4117         const(char_t)[] tmp_pre, tmp_post;
4118         if ( output_buffer.length <= 0 )
4119             output_buffer = new char_t[input.length+replacement.length];
4120         output_buffer.length = 0;
4121 
4122         foreach ( r; search(input) ) {
4123             tmp_pre = pre();
4124             tmp_post = post();
4125         }
4126 
4127         if ( tmp_pre !is null || tmp_post !is null ) {
4128             output_buffer ~= tmp_pre;
4129             output_buffer ~= replacement;
4130             output_buffer ~= tmp_post;
4131         }
4132         else
4133             output_buffer ~= input;
4134 
4135         return output_buffer;
4136     }
4137 
4138     /**********************************************************************************************
4139         Returns a copy of the input with the first match replaced by replacement.
4140     **********************************************************************************************/
4141     char_t[] replaceFirst(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null)
4142     {
4143         const(char_t)[] tmp = input;
4144         if ( output_buffer.length <= 0 )
4145             output_buffer = new char_t[input.length+replacement.length];
4146         output_buffer.length = 0;
4147 
4148         if ( test(input) )
4149         {
4150             tmp = pre();
4151             if ( tmp.length > last_start_ )
4152                 output_buffer ~= tmp[last_start_ .. $];
4153             output_buffer ~= replacement;
4154             tmp = post();
4155         }
4156         output_buffer ~= tmp;
4157         return output_buffer;
4158     }
4159 
4160     /**********************************************************************************************
4161         Calls dg for each match and replaces it with dg's return value.
4162     **********************************************************************************************/
4163     char_t[] replaceAll(const(char_t)[] input, char_t[] delegate(RegExpT!(char_t)) dg, char_t[] output_buffer=null)
4164     {
4165         const(char_t)[]    tmp = input;
4166         if ( output_buffer.length <= 0 )
4167             output_buffer = new char_t[input.length];
4168         output_buffer.length = 0;
4169 
4170         foreach ( r; search(input) )
4171         {
4172             tmp = pre();
4173             if ( tmp.length > last_start_ )
4174                 output_buffer ~= tmp[last_start_ .. $];
4175             output_buffer ~= dg(this);
4176             tmp = post();
4177         }
4178         output_buffer ~= tmp;
4179         return output_buffer;
4180     }
4181 
4182     /**********************************************************************************************
4183         Compiles the regular expression to D code.
4184 
4185         NOTE : Remember to import this module (tango.text.Regex) in the module where you put the
4186         generated D code.
4187 
4188     **********************************************************************************************/
4189     // TODO: input-end special case
4190     const(char)[] compileToD(const(char)[] func_name = "match", bool lexer=false)
4191     {
4192         const(char)[] code;
4193         const(char)[] str_type;
4194         static if ( is(char_t == char) )
4195             str_type = "const(char)[]";
4196         static if ( is(char_t == wchar) )
4197             str_type = "const(wchar)[]";
4198         static if ( is(char_t == dchar) )
4199             str_type = "const(dchar)[]";
4200 
4201         if ( lexer )
4202             code = Format.convert("// {}\nbool {}({} input, out uint token, out {} match", pattern_, func_name, str_type, str_type);
4203         else {
4204             code = Format.convert("// {}\nbool match({} input", pattern_, str_type);
4205             code ~= Format.convert(", ref {}[] groups", str_type);
4206         }
4207         code ~= Format.convert(")\n{{\n    uint s = {};", tdfa_.start.index);
4208 
4209         uint num_vars = tdfa_.num_regs();
4210         if ( num_vars > 0 )
4211         {
4212             if ( lexer )
4213                 code ~= "\n    static int ";
4214             else
4215                 code ~= "\n    int ";
4216             bool first = true;
4217             for ( int i = 0, used = 0; i < num_vars; ++i )
4218             {
4219                 bool hasInit = false;
4220                 foreach ( cmd; tdfa_.initializer )
4221                 {
4222                     if ( cmd.dst == i ) {
4223                         hasInit = true;
4224                         break;
4225                     }
4226                 }
4227 
4228                 if ( first )
4229                     first = false;
4230                 else
4231                     code ~= ", ";
4232                 if ( used > 0 && used % 10 == 0 )
4233                     code ~= "\n        ";
4234                 ++used;
4235                 code ~= Format.convert("r{}", i);
4236 
4237                 if ( hasInit )
4238                     code ~= "=0";
4239                 else
4240                     code ~= "=-1";
4241             }
4242             code ~= ";";
4243         }
4244 
4245         code ~= "\n\n    for ( size_t p, next_p; p < input.length; )\n    {";
4246         code ~= "\n        dchar c = cast(dchar)input[p];\n        if ( c & 0x80 )\n            decode(input, next_p);";
4247         code ~= "\n        else\n            next_p = p+1;\n        switch ( s )\n        {";
4248 
4249         size_t[] finish_states;
4250         foreach ( s; tdfa_.states )
4251         {
4252             code ~= Format.convert("\n            case {}:", s.index);
4253 
4254             if ( s.accept )
4255             {
4256                 finish_states ~= s.index;
4257 
4258                 tdfa_t.State target;
4259                 foreach ( t; s.transitions )
4260                 {
4261                     if ( target is null )
4262                         target = t.target;
4263                     else if ( target !is t.target )
4264                     {
4265                         target = null;
4266                         break;
4267                     }
4268                 }
4269                 if ( target !is null && target is s )
4270                     s.transitions = null;
4271             }
4272 
4273             bool first_if=true;
4274             charclass_t cc, ccTest;
4275 
4276             tango.core.Array.sort(s.transitions);
4277             foreach ( t; s.transitions )
4278             {
4279                 ccTest.add(t.predicate.getInput());
4280                 ccTest.optimize();
4281                 if ( t.predicate.getInput() < ccTest )
4282                     cc = t.predicate.getInput();
4283                 else
4284                     cc = ccTest;
4285 
4286                 if ( first_if ) {
4287                     code ~= "\n                if ( ";
4288                     first_if = false;
4289                 }
4290                 else
4291                     code ~= "\n                else if ( ";
4292                 bool first_cond=true;
4293                 foreach ( cr; cc.parts )
4294                 {
4295                     if ( first_cond )
4296                         first_cond = false;
4297                     else
4298                         code ~= " || ";
4299                     if ( cr.l() == cr.r() )
4300                         code ~= Format.convert("c == 0x{:x}", cast(int)cr.l());
4301                     else
4302                         code ~= Format.convert("c >= 0x{:x} && c <= 0x{:x}", cast(int)cr.l(), cast(int)cr.r());
4303                 }
4304                 code ~= Format.convert(" ) {{\n                    s = {};", t.target.index);
4305 
4306                 if ( t.predicate.type == typeof(t.predicate.type).consume )
4307                     code ~= "\n                    p = next_p;";
4308                 foreach ( cmd; t.commands )
4309                     code ~= compileCommand(cmd, "                    ");
4310 /*
4311                 // if inp ends here and we do not already accept, try to add an explicit string/line end
4312                 if ( p >= inp.length && !s.accept && c != 0 ) {
4313                     c = 0;
4314                     goto Lprocess_char;
4315                 }
4316 */
4317                 code ~= "\n                }";
4318             }
4319 
4320             if ( !first_if )
4321                 code ~= Format.convert(
4322                     "\n                else\n                    {};\n                break;",
4323                     s.accept?Format.convert("goto finish{}", s.index):"return false"
4324                 );
4325             else
4326                 code ~= Format.convert("\n                {};", s.accept?Format.convert("goto finish{}", s.index):"return false");
4327         }
4328 
4329         // create finisher groups
4330         size_t[][size_t] finisherGroup;
4331         foreach ( fs; finish_states )
4332         {
4333             // check if finisher group with same commands exists
4334             bool haveFinisher = false;
4335             foreach ( fg; finisherGroup.keys )
4336             {
4337                 bool equalCommands = false;
4338                 if ( tdfa_.states[fs].finishers.length == tdfa_.states[fg].finishers.length )
4339                 {
4340                     equalCommands = true;
4341                     foreach ( i, cmd; tdfa_.states[fs].finishers )
4342                     {
4343                         if ( cmd != tdfa_.states[fg].finishers[i] ) {
4344                             equalCommands = false;
4345                             break;
4346                         }
4347                     }
4348                 }
4349                 if ( equalCommands ) {
4350                     // use existing group for this state
4351                     finisherGroup[fg] ~= fs;
4352                     haveFinisher = true;
4353                     break;
4354                 }
4355             }
4356             // create new group
4357             if ( !haveFinisher )
4358                 finisherGroup[fs] ~= fs;
4359         }
4360 
4361 
4362         code ~= "\n            default:\n                assert(0);\n        }\n    }\n\n    switch ( s )\n    {";
4363         foreach ( group, states; finisherGroup )
4364         {
4365             foreach ( s; states )
4366                 code ~= Format.convert("\n        case {}: finish{}:", s, s);
4367 
4368             foreach ( cmd; tdfa_.states[group].finishers )
4369             {
4370                 if ( lexer )
4371                 {
4372                     if ( tdfa_.states[group].finishers.length > 1 )
4373                         throw new RegExpException("Lexer error: more than one finisher in flm lexer!");
4374                     if ( cmd.dst % 2 == 0 || cmd.dst >= tdfa_.num_tags )
4375                         throw new RegExpException(Format.convert("Lexer error: unexpected dst register {} in flm lexer!", cmd.dst).idup);
4376                     code ~= Format.convert("\n            match = input[0 .. r{}];\n            token = {};", cmd.src, cmd.dst/2);
4377                 }
4378                 else
4379                     code ~= compileCommand(cmd, "            ");
4380             }
4381 
4382             code ~= "\n            break;";
4383         }
4384         code ~= "\n        default:\n            return false;\n    }\n";
4385 
4386         if ( !lexer )
4387         {
4388             code ~= Format.convert("\n    groups.length = {};", tdfa_.num_tags/2);
4389             for ( int i = 0; i < tdfa_.num_tags/2; ++i )
4390                 code ~= Format.convert("\n    if ( r{} > -1 && r{} > -1 )\n        groups[{}] = input[r{} .. r{}];", 2*i, 2*i+1, i, 2*i, 2*i+1);
4391         }
4392 
4393         code ~= "\n    return true;\n}";
4394         return code;
4395     }
4396 
4397     /*********************************************************************************************
4398         Get the pattern with which this regex was constructed.
4399     **********************************************************************************************/
4400     public const(char_t)[] pattern()
4401     {
4402         return pattern_;
4403     }
4404 
4405     /*********************************************************************************************
4406         Get the tag count of this regex, representing the number of sub-matches.
4407 
4408         This value is the max valid value for match/opIndex.
4409     **********************************************************************************************/
4410     uint tagCount()
4411     {
4412         return tdfa_.num_tags;
4413     }
4414 
4415     int[]       registers_;
4416     size_t      next_start_,
4417                 last_start_;
4418 
4419     debug(TangoRegex) tnfa_t tnfa_;
4420     tdfa_t      tdfa_;
4421 private:
4422     enum int           PREALLOC = 16;
4423     const(char_t)[]    input_,
4424                        pattern_;
4425 
4426     const(char)[] compileCommand(tdfa_t.Command cmd, const(char)[] indent)
4427     {
4428         const(char)[]  code,
4429                 dst;
4430         code ~= Format.convert("\n{}r{} = ", indent, cmd.dst);
4431         if ( cmd.src == tdfa_.CURRENT_POSITION_REGISTER )
4432             code ~= "p;";
4433         else
4434             code ~= Format.convert("r{};", cmd.src);
4435         return code;
4436     }
4437 }
4438 
4439 alias RegExpT!(char) Regex;
4440 
4441 debug(utf) import tango.stdc.stdio;
4442 // the following block is stolen from phobos.
4443 // the copyright notice applies for this block only.
4444 /*
4445  *  Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
4446  *  Written by Walter Bright
4447  *
4448  *  This software is provided 'as-is', without any express or implied
4449  *  warranty. In no event will the authors be held liable for any damages
4450  *  arising from the use of this software.
4451  *
4452  *  Permission is granted to anyone to use this software for any purpose,
4453  *  including commercial applications, and to alter it and redistribute it
4454  *  freely, subject to the following restrictions:
4455  *
4456  *  o  The origin of this software must not be misrepresented; you must not
4457  *     claim that you wrote the original software. If you use this software
4458  *     in a product, an acknowledgment in the product documentation would be
4459  *     appreciated but is not required.
4460  *  o  Altered source versions must be plainly marked as such, and must not
4461  *     be misrepresented as being the original software.
4462  *  o  This notice may not be removed or altered from any source
4463  *     distribution.
4464  */
4465 
4466 class UtfException : Exception
4467 {
4468     size_t idx; /// index in string of where error occurred
4469 
4470     this(immutable(char)[] s, size_t i)
4471     {
4472         idx = i;
4473         super(s);
4474     }
4475 }
4476 
4477 bool isValidDchar(dchar c)
4478 {
4479     /* Note: FFFE and FFFF are specifically permitted by the
4480      * Unicode standard for application internal use, but are not
4481      * allowed for interchange.
4482      * (thanks to Arcane Jill)
4483      */
4484 
4485     return c < 0xD800 ||
4486         (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
4487 }
4488 
4489 /* *************
4490  * Decodes and returns character starting at s[idx]. idx is advanced past the
4491  * decoded character. If the character is not well formed, a UtfException is
4492  * thrown and idx remains unchanged.
4493  */
4494 
4495 dchar decode(const(char)[] s, ref size_t idx)
4496     {
4497         size_t len = s.length;
4498         dchar V;
4499         size_t i = idx;
4500         char u = s[i];
4501 
4502         if (u & 0x80)
4503         {   uint n;
4504             char u2;
4505 
4506             /* The following encodings are valid, except for the 5 and 6 byte
4507              * combinations:
4508              *  0xxxxxxx
4509              *  110xxxxx 10xxxxxx
4510              *  1110xxxx 10xxxxxx 10xxxxxx
4511              *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
4512              *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
4513              *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
4514              */
4515             for (n = 1; ; n++)
4516             {
4517                 if (n > 4)
4518                     goto Lerr;          // only do the first 4 of 6 encodings
4519                 if (((u << n) & 0x80) == 0)
4520                 {
4521                     if (n == 1)
4522                         goto Lerr;
4523                     break;
4524                 }
4525             }
4526 
4527             // Pick off (7 - n) significant bits of B from first byte of octet
4528             V = cast(dchar)(u & ((1 << (7 - n)) - 1));
4529 
4530             if (i + (n - 1) >= len)
4531                 goto Lerr;                      // off end of string
4532 
4533             /* The following combinations are overlong, and illegal:
4534              *  1100000x (10xxxxxx)
4535              *  11100000 100xxxxx (10xxxxxx)
4536              *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
4537              *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
4538              *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
4539              */
4540             u2 = s[i + 1];
4541             if ((u & 0xFE) == 0xC0 ||
4542                 (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
4543                 (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
4544                 (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
4545                 (u == 0xFC && (u2 & 0xFC) == 0x80))
4546                 goto Lerr;                      // overlong combination
4547 
4548             for (uint j = 1; j != n; j++)
4549             {
4550                 u = s[i + j];
4551                 if ((u & 0xC0) != 0x80)
4552                     goto Lerr;                  // trailing bytes are 10xxxxxx
4553                 V = (V << 6) | (u & 0x3F);
4554             }
4555             if (!isValidDchar(V))
4556                 goto Lerr;
4557             i += n;
4558         }
4559         else
4560         {
4561             V = cast(dchar) u;
4562             i++;
4563         }
4564 
4565         idx = i;
4566         return V;
4567 
4568       Lerr:
4569         throw new Exception("invalid UTF-8 sequence");
4570     }
4571 
4572 /*  ditto */
4573 
4574 dchar decode(const(wchar)[] s, ref size_t idx)
4575     in
4576     {
4577         assert(idx >= 0 && idx < s.length);
4578     }
4579     out (result)
4580     {
4581         assert(isValidDchar(result));
4582     }
4583     body
4584     {
4585         const(char)[] msg;
4586         dchar V;
4587         size_t i = idx;
4588         uint u = s[i];
4589 
4590         if (u & ~0x7F)
4591         {   if (u >= 0xD800 && u <= 0xDBFF)
4592             {   uint u2;
4593 
4594                 if (i + 1 == s.length)
4595                 {   msg = "surrogate UTF-16 high value past end of string";
4596                     goto Lerr;
4597                 }
4598                 u2 = s[i + 1];
4599                 if (u2 < 0xDC00 || u2 > 0xDFFF)
4600                 {   msg = "surrogate UTF-16 low value out of range";
4601                     goto Lerr;
4602                 }
4603                 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
4604                 i += 2;
4605             }
4606             else if (u >= 0xDC00 && u <= 0xDFFF)
4607             {   msg = "unpaired surrogate UTF-16 value";
4608                 goto Lerr;
4609             }
4610             else if (u == 0xFFFE || u == 0xFFFF)
4611             {   msg = "illegal UTF-16 value";
4612                 goto Lerr;
4613             }
4614             else
4615                 i++;
4616         }
4617         else
4618         {
4619             i++;
4620         }
4621 
4622         idx = i;
4623         return cast(dchar)u;
4624 
4625       Lerr:
4626         throw new UtfException(msg.idup, i);
4627     }
4628 
4629 /*  ditto */
4630 
4631 dchar decode(const(dchar)[] s, ref size_t idx)
4632     in
4633     {
4634         assert(idx >= 0 && idx < s.length);
4635     }
4636     body
4637     {
4638         size_t i = idx;
4639         dchar c = s[i];
4640 
4641         if (!isValidDchar(c))
4642             goto Lerr;
4643         idx = i + 1;
4644         return c;
4645 
4646       Lerr:
4647         throw new UtfException("5invalid UTF-32 value", i);
4648     }
4649 
4650 
4651 
4652 /* =================== Encode ======================= */
4653 
4654 /* *****************************
4655  * Encodes character c and appends it to array s[].
4656  */
4657 
4658 void encode(ref const(char)[] s, dchar c)
4659     in
4660     {
4661         assert(isValidDchar(c));
4662     }
4663     body
4664     {
4665         const(char)[] r = s;
4666 
4667         if (c <= 0x7F)
4668         {
4669             r ~= cast(char) c;
4670         }
4671         else
4672         {
4673             char[4] buf;
4674             uint L;
4675 
4676             if (c <= 0x7FF)
4677             {
4678                 buf[0] = cast(char)(0xC0 | (c >> 6));
4679                 buf[1] = cast(char)(0x80 | (c & 0x3F));
4680                 L = 2;
4681             }
4682             else if (c <= 0xFFFF)
4683             {
4684                 buf[0] = cast(char)(0xE0 | (c >> 12));
4685                 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
4686                 buf[2] = cast(char)(0x80 | (c & 0x3F));
4687                 L = 3;
4688             }
4689             else if (c <= 0x10FFFF)
4690             {
4691                 buf[0] = cast(char)(0xF0 | (c >> 18));
4692                 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
4693                 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
4694                 buf[3] = cast(char)(0x80 | (c & 0x3F));
4695                 L = 4;
4696             }
4697             else
4698             {
4699                 assert(0);
4700             }
4701             r ~= buf[0 .. L];
4702         }
4703         s = r;
4704     }
4705 
4706 /*  ditto */
4707 
4708 void encode(ref const(wchar)[] s, dchar c)
4709     in
4710     {
4711         assert(isValidDchar(c));
4712     }
4713     body
4714     {
4715         const(wchar)[] r = s;
4716 
4717         if (c <= 0xFFFF)
4718         {
4719             r ~= cast(wchar) c;
4720         }
4721         else
4722         {
4723             wchar[2] buf;
4724 
4725             buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
4726             buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
4727             r ~= buf;
4728         }
4729         s = r;
4730     }
4731 
4732 /*  ditto */
4733 
4734 void encode(ref const(dchar)[] s, dchar c)
4735     in
4736     {
4737         assert(isValidDchar(c));
4738     }
4739     body
4740     {
4741         s ~= c;
4742     }
4743 
4744 debug(UnitTest)
4745 {
4746     unittest
4747     {
4748         // match a fixed string
4749         Regex r;
4750         r = new Regex("abc");
4751         assert(r.test("abc123"));
4752         assert(r.test("feabc123"));
4753         assert(!r.test("abe123"));
4754 
4755         // match a non-ASCII string
4756         r = new Regex("☃");
4757         assert(r.test("a☃c123"));
4758 
4759         // capture
4760         r = new Regex("☃(c)");
4761         assert(r.test("a☃c123"));
4762         assert(r.match(1) == "c");
4763 
4764         // dot
4765         r = new Regex("..");
4766         assert(r.test("a☃c123"));
4767 
4768         // dot capture
4769         r = new Regex("(..)");
4770         assert(r.test("a☃c123"));
4771         assert(r.match(1) == "a☃");
4772 
4773 
4774 
4775         // two captures
4776         r = new Regex("(.)(.)");
4777         assert(r.test("a☃c123"));
4778         assert(r.match(1) == "a");
4779         assert(r.match(2) == "☃");
4780 
4781         // multiple letters
4782         r = new Regex(".*(e+).*");
4783         assert(r.test("aeeeeec123"));
4784         assert(r.match(1) == "eeeee", "Expected: eeeee Got: " ~ r.match(1));
4785         assert(!r.test("abfua"));
4786 
4787         // multiple snowmen
4788         r = new Regex(".*(☃+).*");
4789         assert(r.test("a☃☃☃☃☃c123"));
4790         assert(r.match(1) == "☃☃☃☃☃", "Expected: ☃☃☃☃☃ Got: " ~ r.match(1));
4791         assert(!r.test("abeua"));
4792 
4793         /**
4794            "*" quantifier bug. In "(☃+)" pattern test ((r.match(0) == "☃☃☃☃☃")&&(r.match(1) == "☃☃☃☃☃")) has passed.
4795         */
4796         // multiple snowmen
4797         r = new Regex("(☃+)");
4798         assert(r.test("a☃☃☃☃☃c123"));
4799         assert(r.match(0) == "☃☃☃☃☃");
4800         assert(r.match(1) == "☃☃☃☃☃");
4801         assert(r.test("a☃beua"));
4802     }
4803 
4804     debug(RegexTestOnly)
4805     {
4806         import tango.io.Stdout;
4807         void main() { Stdout("All tests pass").newline(); }
4808     }
4809 }