1 /*******************************************************************************
2 
3     copyright:      Copyright (c) 2007-2008 Jascha Wetzel. All rights reserved.
4 
5     license:        BSD style: $(LICENSE)
6 
7     version:        Initial release: Jan 2008
8 
9     authors:        Jascha Wetzel
10 
11     This is a regular expression compiler and interpreter based on the Tagged NFA/DFA method.
12 
13     The Regex class is not thread safe
14 
15     See <a href="http://en.wikipedia.org/wiki/Regular_expression">Wikpedia's article on regular expressions</a>
16     for details on regular expressions in general.
17 
18     The used method implies, that the expressions are <i>regular</i>, in the way language theory defines it,
19     as opposed to what &quot;regular expression&quot; means in most implementations
20     (e.g. PCRE or those from the standard libraries of Perl, Java or Python).
21     The advantage of this method is it's performance, it's disadvantage is the inability to realize some features
22     that Perl-like regular expressions have (e.g. back-references).
23     See <a href="http://swtch.com/~rsc/regexp/regexp1.html">&quot;Regular Expression Matching Can Be Simple And Fast&quot;</a>
24     for details on the differences.
25 
26     The time for matching a regular expression against an input string of length N is in O(M*N), where M depends on the
27     number of matching brackets and the complexity of the expression. That is, M is constant wrt. the input
28     and therefore matching is a linear-time process.
29 
30     The syntax of a regular expressions is as follows.
31     <i>X</i> and <i>Y</i> stand for an arbitrary regular expression.
32 
33     <table border=1 cellspacing=0 cellpadding=5>
34     <caption>Operators</caption>
35     $(TR $(TD X|Y) $(TD alternation, i.e. X or Y) )
36     $(TR $(TD (X)) $(TD matching brackets - creates a sub-match) )
37     $(TR $(TD (?X)) $(TD non-matching brackets - only groups X, no sub-match is created) )
38     $(TR $(TD [Z]) $(TD character class specification, Z is a string of characters or character ranges, e.g. [a-zA-Z0-9_.\-]) )
39     $(TR $(TD [^Z]) $(TD negated character class specification) )
40     $(TR $(TD &lt;X) $(TD lookbehind, X may be a single character or a character class) )
41     $(TR $(TD &gt;X) $(TD lookahead, X may be a single character or a character class) )
42     $(TR $(TD ^) $(TD start of input or start of line) )
43     $(TR $(TD $) $(TD end of input or end of line) )
44     $(TR $(TD \b) $(TD start or end of word, equals (?&lt;\s&gt;\S|&lt;\S&gt;\s)) )
45     $(TR $(TD \B) $(TD opposite of \b, equals (?&lt;\S&gt;\S|&lt;\s&gt;\s)) )
46     </table>
47 
48     <table border=1 cellspacing=0 cellpadding=5>
49     <caption>Quantifiers</caption>
50     $(TR $(TD X?) $(TD zero or one) )
51     $(TR $(TD X*) $(TD zero or more) )
52     $(TR $(TD X+) $(TD one or more) )
53     $(TR $(TD X{n,m}) $(TD at least n, at most m instances of X.<br>If n is missing, it's set to 0.<br>If m is missing, it is set to infinity.) )
54     $(TR $(TD X??) $(TD non-greedy version of the above operators) )
55     $(TR $(TD X*?) $(TD see above) )
56     $(TR $(TD X+?) $(TD see above) )
57     $(TR $(TD X{n,m}?) $(TD see above) )
58     </table>
59 
60     <table border=1 cellspacing=0 cellpadding=5>
61     <caption>Pre-defined character classes</caption>
62     $(TR $(TD .) $(TD any printable character) )
63     $(TR $(TD \s) $(TD whitespace) )
64     $(TR $(TD \S) $(TD non-whitespace) )
65     $(TR $(TD \w) $(TD alpha-numeric characters or underscore) )
66     $(TR $(TD \W) $(TD opposite of \w) )
67     $(TR $(TD \d) $(TD digits) )
68     $(TR $(TD \D) $(TD non-digit) )
69     </table>
70 
71     Note that "alphanumeric" only applies to Latin-1.
72 *******************************************************************************/
73 module tango.text.Regex;
74 
75 static import tango.core.Array;
76 
77 static import tango.text.convert.Utf;
78 
79 debug(TangoRegex) import tango.io.Stdout;
80 
81 /* *****************************************************************************
82     A simple pair
83 *******************************************************************************/
84 private struct Pair(T)
85 {
86     static Pair opCall(T a, T b)
87     {
88         Pair p;
89         p.a = a;
90         p.b = b;
91         return p;
92     }
93 
94     union
95     {
96         struct {
97             T first, second;
98         }
99         struct {
100             T a, b;
101         }
102     }
103 }
104 
105 /* *****************************************************************************
106     Double linked list
107 *******************************************************************************/
108 private class List(T)
109 {
110     class Element
111     {
112         T value;
113         Element prev,
114                 next;
115 
116         this(T v)
117         {
118             value = v;
119         }
120     }
121 
122     size_t  len;
123     Element head,
124             tail;
125 
126     List opCatAssign(T v)
127     {
128         if ( tail is null )
129             head = tail = new Element(v);
130         else {
131             tail.next = new Element(v);
132             tail.next.prev = tail;
133             tail = tail.next;
134         }
135         ++len;
136         return this;
137     }
138 
139     List insertAfter(T w, T v)
140     {
141         foreach ( e; &this.elements )
142         {
143             if ( e.value is w )
144                 return insertAfter(e, v);
145         }
146         return null;
147     }
148 
149     List insertAfter(Element e, T v)
150     {
151         auto tmp = new Element(v);
152         tmp.prev = e;
153         tmp.next = e.next;
154         e.next.prev = tmp;
155         e.next = tmp;
156         if ( e is tail )
157             tail = tmp;
158         ++len;
159         return this;
160     }
161 
162     List opCatAssign(List l)
163     {
164         if ( l.empty() )
165             return this;
166         if ( tail is null ) {
167             head = l.head;
168             tail = l.tail;
169         }
170         else {
171             tail.next = l.head;
172             tail.next.prev = tail;
173             tail = l.tail;
174         }
175         len += l.len;
176         return this;
177     }
178 
179     List pushFront(T v)
180     {
181         if ( head is null )
182             head = tail = new Element(v);
183         else
184         {
185             head.prev = new Element(v);
186             head.prev.next = head;
187             head = head.prev;
188         }
189         ++len;
190         return this;
191     }
192 
193     List insertBefore(T w, T v)
194     {
195         foreach ( e; &this.elements )
196         {
197             if ( e.value is w )
198                 return insertBefore(e, v);
199         }
200         return null;
201     }
202 
203     List insertBefore(Element e, T v)
204     {
205         auto tmp = new Element(v);
206         tmp.prev = e.prev;
207         tmp.next = e;
208         e.prev.next = tmp;
209         e.prev = tmp;
210         if ( e is head )
211             head = tmp;
212         ++len;
213         return this;
214     }
215 
216     List pushFront(List l)
217     {
218         if ( l.empty() )
219             return this;
220         if ( head is null ) {
221             head = l.head;
222             tail = l.tail;
223         }
224         else {
225             head.prev = l.tail;
226             head.prev.next = head;
227             head = l.head;
228         }
229         len += l.len;
230         return this;
231     }
232 
233     size_t length()
234     {
235         return len;
236     }
237 
238     bool empty()
239     {
240         return head is null;
241     }
242 
243     void clear()
244     {
245         head = null;
246         tail = null;
247         len = 0;
248     }
249 
250     void pop()
251     {
252         remove(tail);
253     }
254 
255     void remove(Element e)
256     {
257         if ( e is null )
258             return;
259         if ( e.prev is null )
260             head = e.next;
261         else
262             e.prev.next = e.next;
263         if ( e.next is null )
264             tail = e.prev;
265         else
266             e.next.prev = e.prev;
267         --len;
268     }
269 
270     int elements(int delegate(ref Element) dg)
271     {
272         for ( Element e=head; e !is null; e = e.next )
273         {
274             int ret = dg(e);
275             if ( ret )
276                 return ret;
277         }
278         return 0;
279     }
280 
281     int elements_reverse(int delegate(ref Element) dg)
282     {
283         for ( Element e=tail; e !is null; e = e.prev )
284         {
285             int ret = dg(e);
286             if ( ret )
287                 return ret;
288         }
289         return 0;
290     }
291 
292     int opApply(int delegate(ref T) dg)
293     {
294         for ( Element e=head; e !is null; e = e.next )
295         {
296             int ret = dg(e.value);
297             if ( ret )
298                 return ret;
299         }
300         return 0;
301     }
302 
303     int opApplyReverse(int delegate(ref T) dg)
304     {
305         for ( Element e=tail; e !is null; e = e.prev )
306         {
307             int ret = dg(e.value);
308             if ( ret )
309                 return ret;
310         }
311         return 0;
312     }
313 }
314 
315 /* *****************************************************************************
316     Stack based on dynamic array
317 *******************************************************************************/
318 private struct Stack(T)
319 {
320     size_t  _top;
321     T[]     stack;
322 
323     void push(T v)
324     {
325         if ( _top >= stack.length )
326             stack.length = stack.length*2+1;
327         stack[_top] = v;
328         ++_top;
329     }
330     alias push opCatAssign;
331 
332     void opCatAssign(T[] vs)
333     {
334         size_t end = _top+vs.length;
335         if ( end > stack.length )
336             stack.length = end*2;
337         stack[_top..end] = vs[];
338         _top = end;
339     }
340 
341     void pop(size_t num)
342     {
343         assert(_top>=num);
344         _top -= num;
345     }
346 
347     T pop()
348     {
349         assert(_top>0);
350         return stack[--_top];
351     }
352 
353     T top()
354     {
355         assert(_top>0);
356         return stack[_top-1];
357     }
358 
359     T* topPtr()
360     {
361         assert(_top>0);
362         return &stack[_top-1];
363     }
364 
365     bool empty()
366     {
367         return _top == 0;
368     }
369 
370     void clear()
371     {
372         _top = 0;
373     }
374 
375     size_t length()
376     {
377         return _top;
378     }
379 
380     T[] array()
381     {
382         return stack[0.._top];
383     }
384 
385     T opIndex(size_t i)
386     {
387         return stack[i];
388     }
389 
390     Stack dup()
391     {
392         Stack s;
393         s._top = _top;
394         s.stack = stack.dup;
395         return s;
396     }
397 }
398 
399 /* ************************************************************************************************
400     Set container based on assoc array
401 **************************************************************************************************/
402 private struct Set(T)
403 {
404     bool[T] data;
405 
406     static Set opCall()
407     {
408         Set s;
409         return s;
410     }
411 
412     static Set opCall(T v)
413     {
414         Set s;
415         s ~= v;
416         return s;
417     }
418 
419     void opAddAssign(T v)
420     {
421         data[v] = true;
422     }
423 
424     void opAddAssign(Set s)
425     {
426         foreach ( v; s.elements )
427             data[v] = true;
428     }
429     alias opAddAssign opCatAssign;
430 
431     size_t length()
432     {
433         return data.length;
434     }
435 
436     T[] elements()
437     {
438         return data.keys;
439     }
440 
441     bool remove(T v)
442     {
443         if ( (v in data) is null )
444             return false;
445         data.remove(v);
446         return true;
447     }
448 
449     bool contains(T v)
450     {
451         return (v in data) !is null;
452     }
453 
454     bool contains(Set s)
455     {
456         Set tmp = s - this;
457         return tmp.empty;
458     }
459 
460     bool empty()
461     {
462         return data.length==0;
463     }
464 
465     Set opSub(Set s)
466     {
467         Set res = dup;
468         foreach ( v; s.elements )
469             res.remove(v);
470         return res;
471     }
472 
473     Set dup()
474     {
475         Set s;
476         foreach ( v; data.keys )
477             s.data[v] = true;
478         return s;
479     }
480 }
481 
482 /* ************************************************************************************************
483 
484 **************************************************************************************************/
485 void quickSort(T)(T[] a)
486 {
487     quickSort(a,cast(size_t)0,a.length);
488 }
489 
490 void quickSort(T)(T[] a, size_t l, size_t r)
491 {
492     T t;
493     auto i = r-l;
494     if ( i < 3 )
495     {
496         if ( i < 2 )
497             return;
498         if ( a[l] < a[l+1] )
499             return;
500         t = a[l];
501         a[l] = a[l+1];
502         a[l+1] = t;
503         return;
504     }
505 
506     auto p = a[l];
507     i = l;
508     auto j = r;
509 
510     while ( true )
511     {
512         ++i;
513         for ( ; i < j && a[i] < p; ++i ) {}
514         --j;
515         for ( ; i < j && a[j] >= p; --j ) {}
516         if ( i >= j )
517             break;
518         t = a[i];
519         a[i] = a[j];
520         a[j] = t;
521     }
522     --i;
523     a[l] = a[i];
524     a[i] = p;
525 
526     quickSort(a, l, i);
527     quickSort(a, i+1, r);
528 }
529 import tango.math.Math;
530 
531 /* ************************************************************************************************
532     A range of characters
533 **************************************************************************************************/
534 struct CharRange(char_t)
535 {
536     char_t  l_, r_;
537 
538     static CharRange opCall(char_t c)
539     {
540         CharRange cr;
541         cr.l_ = c;
542         cr.r_ = c;
543         return cr;
544     }
545 
546     static CharRange opCall(char_t a, char_t b)
547     {
548         CharRange cr;
549         cr.l_ = min(a,b);
550         cr.r_ = max(a,b);
551         return cr;
552     }
553 
554     char_t l()
555     {
556         return l_;
557     }
558 
559     char_t r()
560     {
561         return r_;
562     }
563 
564     /* ********************************************************************************************
565         Compares the ranges according to their beginning.
566     **********************************************************************************************/
567     int opCmp(const ref CharRange cr) const
568     {
569         if ( l_ == cr.l_ )
570             return 0;
571         if ( l_ < cr.l_ )
572             return -1;
573         return 1;
574     }
575 
576 
577     const bool opEquals(ref const CharRange cr)
578     {
579         if ( l_ == cr.l_ && r_ == cr.r_ )
580             return true;
581         return false;
582     }
583 
584     bool contains(char_t c)
585     {
586         return c >= l_ && c <= r_;
587     }
588 
589     bool contains(CharRange cr)
590     {
591         return l_ <= cr.l_ && r_ >= cr.r_;
592     }
593 
594     bool intersects(CharRange cr)
595     {
596         return r_ >= cr.l_ && l_ <= cr.r_;
597     }
598 
599     CharRange intersect(CharRange cr)
600     {
601         assert(intersects(cr));
602         CharRange ir;
603         ir.l_ = max(l_, cr.l_);
604         ir.r_ = min(r_, cr.r_);
605         if ( ir.l_ > ir.r_ )
606             ir.l_ = ir.r_ = char_t.min;
607         return ir;
608     }
609 
610     CharRange[] subtract(CharRange cr)
611     {
612         CharRange[] sr;
613         if ( cr.contains(this) )
614             return sr;
615         if ( !intersects(cr) )
616             sr ~= this;
617         else
618         {
619             CharRange d;
620             if ( contains(cr) )
621             {
622                 d.l_ = l_;
623                 d.r_ = cast(char_t)(cr.l_-1);
624                 if ( d.l_ <= d.r_ )
625                     sr ~= d;
626                 d.l_ = cast(char_t)(cr.r_+1);
627                 d.r_ = r_;
628                 if ( d.l_ <= d.r_ )
629                     sr ~= d;
630             }
631             else if ( cr.r_ > l_ )
632             {
633                 d.l_ = cast(char_t)(cr.r_+1);
634                 d.r_ = r_;
635                 if ( d.l_ <= d.r_ )
636                     sr ~= d;
637             }
638             else if ( cr.l_ < r_ )
639             {
640                 d.l_ = l_;
641                 d.r_ = cast(char_t)(cr.l_-1);
642                 if ( d.l_ <= d.r_ )
643                     sr ~= d;
644             }
645         }
646         return sr;
647     }
648 
649     string toString()
650     {
651         string str;
652 
653         if ( l_ == r_ )
654         {
655             if ( l_ > 0x20 && l_ < 0x7f )
656                 str = Format.convert("'{}'", l_).idup;
657             else
658                 str = Format.convert("({:x})", cast(int)l_).idup;
659         }
660         else
661         {
662             if ( l_ > 0x20 && l_ < 0x7f )
663                 str = Format.convert("'{}'", l_).idup;
664             else
665                 str = Format.convert("({:x})", cast(int)l_).idup;
666             str ~= "-";
667             if ( r_ > 0x20 && r_ < 0x7f )
668                 str ~= Format.convert("'{}'", r_).idup;
669             else
670                 str ~= Format.convert("({:x})", cast(int)r_).idup;
671         }
672         return str;
673     }
674 }
675 
676 import tango.core.Compiler;
677 
678 static if(DMDFE_Version == 2063)
679 {
680     /* Workabout for http://d.puremagic.com/issues/show_bug.cgi?id=10425 */
681     void _bug(CharRange!(dchar) a) {}
682 }
683 
684 /* ************************************************************************************************
685     Represents a class of characters as used in regular expressions (e.g. [0-9a-z], etc.)
686 **************************************************************************************************/
687 struct CharClass(char_t)
688 {
689     alias CharRange!(char_t) range_t;
690 
691     //---------------------------------------------------------------------------------------------
692     // pre-defined character classes
693     static const CharClass!(char_t)
694         line_startend = {parts: [
695             {l_:0x00, r_:0x00},
696             {l_:0x0a, r_:0x0a},
697             {l_:0x13, r_:0x13}
698         ]},
699         digit = {parts: [
700             {l_:0x30, r_:0x39}
701         ]},
702         whitespace = {parts: [
703             {l_:0x09, r_:0x09},
704             {l_:0x0a, r_:0x0a},
705             {l_:0x0b, r_:0x0b},
706             {l_:0x13, r_:0x13},
707             {l_:0x14, r_:0x14},
708             {l_:0x20, r_:0x20}
709         ]};
710 
711     // 8bit classes
712     static if ( is(char_t == char) )
713     {
714         static const CharClass!(char_t)
715             any_char = {parts: [
716                 {l_:0x01, r_:0xff}
717             ]},
718             dot_oper = {parts: [
719                 {l_:0x01, r_:0xff}
720             ]},
721             alphanum_ = {parts: [
722                 {l_:0x30, r_:0x39},
723                 {l_:0x41, r_:0x5a},
724                 {l_:0x5f, r_:0x5f},
725                 {l_:0x61, r_:0x7a}
726             ]};
727     }
728     // 16bit and 32bit classes
729     static if ( is(char_t == wchar) || is(char_t == dchar) )
730     {
731         static const CharClass!(char_t)
732             any_char = {parts: [
733                 {l_:0x0001, r_:0xffff}
734             ]},
735             dot_oper = {parts: [
736                 {l_:0x0001, r_:0xffff}
737             ]},
738             alphanum_ = {parts: [
739                 {l_:0x30, r_:0x39},
740                 {l_:0x41, r_:0x5a},
741                 {l_:0x5f, r_:0x5f},
742                 {l_:0x61, r_:0x7a}
743             ]};
744     }
745 
746     //---------------------------------------------------------------------------------------------
747     range_t[] parts;
748 
749     invariant()
750     {
751 //        foreach ( i, p; parts )
752 //            assert(p.l_<=p.r_, Int.toString(i)~": "~Int.toString(p.l_)~" > "~Int.toString(p.r_));
753     }
754 
755     static CharClass opCall(const(CharClass) cc)
756     {
757         CharClass ncc;
758         ncc.parts = cc.parts.dup;
759         return ncc;
760     }
761 
762     int opCmp(const ref CharClass cc) const
763     {
764         if ( parts.length < cc.parts.length )
765             return -1;
766         if ( parts.length > cc.parts.length )
767             return 1;
768         foreach ( i, p; cc.parts )
769         {
770             if ( p.l_ != parts[i].l_ || p.r_ != parts[i].r_ )
771                 return 1;
772         }
773         return 0;
774     }
775 
776     bool empty()
777     {
778         return parts.length <= 0;
779     }
780 
781     bool matches(char_t c)
782     {
783         foreach ( p; parts )
784         {
785             if ( p.contains(c) )
786                 return true;
787         }
788         return false;
789     }
790 
791     CharClass intersect(CharClass cc)
792     {
793         CharClass ic;
794         foreach ( p; parts )
795         {
796             foreach ( cp; cc.parts )
797             {
798                 if ( p.intersects(cp) )
799                     ic.parts ~= p.intersect(cp);
800             }
801         }
802         return ic;
803     }
804 
805     // requires the class to be optimized
806     bool contains(range_t cr)
807     {
808         foreach ( p; parts )
809         {
810             if ( p.contains(cr) )
811                 return true;
812         }
813         return false;
814     }
815 
816     // requires the class to be optimized
817     bool contains(CharClass cc)
818     {
819         Louter: foreach ( p; cc.parts )
820         {
821             foreach ( p2; parts )
822             {
823                 if ( p2.contains(p) )
824                     continue Louter;
825             }
826             return false;
827         }
828         return true;
829     }
830 
831     void subtract(CharClass cc)
832     {
833         negate();
834         add(cc);
835         negate();
836     }
837 
838     void add(CharClass cc)
839     {
840         parts ~= cc.parts;
841     }
842 
843     void add(range_t cr)
844     {
845         parts ~= cr;
846     }
847 
848     void add(char_t c)
849     {
850         parts ~= CharRange!(char_t)(c);
851     }
852 
853     /* ********************************************************************************************
854         Requires the CharClass to be optimized.
855     **********************************************************************************************/
856     void negate()
857     {
858         optimize();
859         char_t  start = char_t.min;
860 
861         // first part touches left boundary of value range
862         if ( parts.length > 0 && parts[0].l_ == start )
863         {
864             start = parts[0].r_;
865             if ( start < char_t.max )
866                 ++start;
867 
868             foreach ( i, ref cr; parts[0 .. $-1] )
869             {
870                 cr.l_ = start;
871                 cr.r_ = cast(char_t)(parts[i+1].l_-1);
872                 start = parts[i+1].r_;
873                 if ( start < char_t.max )
874                     ++start;
875             }
876             if ( start != char_t.max ) {
877                 parts[$-1].l_ = start;
878                 parts[$-1].r_ = char_t.max;
879             }
880             else
881                 parts.length = parts.length-1;
882             return;
883         }
884 
885         foreach ( i, ref cr; parts )
886         {
887             char_t tmp = cast(char_t)(cr.l_-1);
888             cr.l_ = start;
889             start = cr.r_;
890             if ( start < char_t.max )
891                 ++start;
892             cr.r_ = tmp;
893         }
894 
895         // last part does not touch right boundary
896         if ( start != char_t.max )
897             parts ~= range_t(start, char_t.max);
898     }
899 
900     void optimize()
901     {
902         if ( empty() )
903             return;
904 
905         tango.core.Array.sort(parts);
906 
907         size_t i = 0;
908         foreach ( p; parts[1 .. $] )
909         {
910             if ( p.l_ > parts[i].r_+1 ) {
911                 ++i;
912                 parts[i].l_ = p.l_;
913                 parts[i].r_ = p.r_;
914                 continue;
915             }
916             parts[i].r_ = max(p.r_, parts[i].r_);
917             if ( parts[i].r_ >= char_t.max )
918                 break;
919         }
920         parts.length = i+1;
921     }
922 
923     string toString()
924     {
925         string str;
926         str ~= "[";
927         foreach ( p; parts )
928             str ~= p.toString();
929         str ~= "]";
930         return str;
931     }
932 }
933 
934 debug(UnitTest)
935 {
936 unittest
937 {
938     static CharClass!(char) cc = { parts: [{l_:0,r_:10},{l_:0,r_:6},{l_:5,r_:12},{l_:12,r_:17},{l_:20,r_:100}] };
939     assert(cc.toString(), "[(0)-(a)(0)-(6)(5)-(c)(c)-(11)(14)-'d']");
940     cc.optimize();
941     assert(cc.toString(),  "[(0)-(11)(14)-'d']");
942     cc.negate();
943     assert(cc.toString(),  " [(12)-(13)'e'-(ff)]");
944     cc.optimize();
945     assert(cc.toString(),  "[(0)-(11)(14)-'d']");
946     cc.negate();
947     assert(cc.toString(),  "[(12)-(13)'e'-(ff)]");
948 
949     static CharClass!(char) cc2 = { parts: [] };
950     assert(cc.toString(),  "[]");
951     cc2.optimize();
952     assert(cc.toString(),  "[]");
953     cc2.negate();
954     assert(cc.toString(),  "[(0)-(ff)]");
955     cc2.optimize();
956     assert(cc.toString(),  "[(0)-(ff)]");
957     cc2.negate();
958     assert(cc.toString(),  "[]");
959 
960     static CharClass!(char) cc3 = { parts: [{l_:0,r_:100},{l_:200,r_:0xff},] };
961     assert(cc3.toString(), "[(0)-'d'(c8)-(ff)]");
962     cc3.negate();
963     assert(cc.toString(),  "['e'-(c7)]");
964     cc3.negate();
965     assert(cc.toString(),  "[(0)-'d'(c8)-(ff)]");
966 
967     static CharClass!(char) cc4 = { parts: [{l_:0,r_:200},{l_:100,r_:0xff},] };
968     assert(cc.toString(),  "[(0)-(c8)'d'-(ff)]");
969     cc4.optimize();
970     assert(cc.toString(),  "[(9)-(13)(20)-'~'(a0)-(ff)(100)-(17f)(180)-(24f)(20a3)-(20b5)]");
971 
972     static CharClass!(dchar) cc5 = { parts: [{l_:0x9,r_:0x13},{0x20,r_:'~'},{l_:0xa0,r_:0xff},{l_:0x100,r_:0x17f},{l_:0x180,r_:0x24f},{l_:0x20a3,r_:0x20b5}] };
973     cc5.optimize();
974     assert(cc.toString(),  "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]");
975     cc5.negate();
976     assert(cc.toString(),  "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]");
977     cc5.optimize();
978     assert(cc.toString(),  "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]");
979     cc5.negate();
980     assert(cc.toString(),  "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]");
981 }
982 }
983 
984 /* ************************************************************************************************
985 
986 **************************************************************************************************/
987 private struct Predicate(char_t)
988 {
989     alias const(char_t)[]       string_t;
990     alias CharClass!(char_t)    cc_t;
991     alias CharRange!(char_t)    cr_t;
992 
993     // generic data
994     enum Type {
995         consume, epsilon, lookahead, lookbehind
996     }
997 
998     cc_t    input;
999     Type    type;
1000 
1001     // data for compiled predicates
1002     enum uint  MAX_BITMAP_LENGTH = 256,
1003                MAX_SEARCH_LENGTH = 256;
1004     enum MatchMode {
1005         generic, generic_l,
1006         single_char, bitmap, string_search,         // consume
1007         single_char_l, bitmap_l, string_search_l    // lookahead
1008     }
1009 
1010     MatchMode   mode;
1011     // data_chr had to be pulled out of the union due to
1012     // http://d.puremagic.com/issues/show_bug.cgi?id=2632 --- don't put it back
1013     // in until this is resolved!
1014     //
1015     // Keep in mind that data_str.length can't be modified directly unless the
1016     // new length is strictly greater than the old length. This is essentially
1017     // because ubyte.sizeof can be less than char_t.sizeof. If you set
1018     // data_str.length to anything less than or equal to data_bmp.length,
1019     // data_str will not be reallocated: only the length value will change but
1020     // nothing will realize that not that much space has actually been
1021     // allocated. data_str will be too small and you'll likely get segfaults or
1022     // such.
1023     //
1024     // In short: don't mess with data_str.length. If you have to, remove the
1025     // union entirely.
1026     //
1027     // -- Deewiant
1028     union {
1029         ubyte[]     data_bmp;
1030         string_t    data_str;
1031     };
1032     char_t      data_chr;
1033 
1034 
1035     void compile()
1036     {
1037         assert(input.parts.length > 0);
1038 
1039         // single char?
1040         if ( input.parts.length == 1 && input.parts[0].l_ == input.parts[0].r_ )
1041         {
1042             mode = type==Type.consume ? MatchMode.single_char : MatchMode.single_char_l;
1043             data_chr = input.parts[0].l_;
1044             return;
1045         }
1046         // check whether we can use a bitmap
1047         foreach ( p; input.parts )
1048         {
1049             if ( p.l_ > MAX_BITMAP_LENGTH || p.r_ > MAX_BITMAP_LENGTH )
1050                 goto LnoBitmap;
1051         }
1052 
1053         // setup bitmap
1054         data_bmp.length = MAX_BITMAP_LENGTH/8;
1055         foreach ( p; input.parts )
1056         {
1057             for ( char_t c = p.l_; c <= p.r_; ++c )
1058                 data_bmp[c/8] |= 1 << (c&7);
1059         }
1060         mode = type==Type.consume ? MatchMode.bitmap : MatchMode.bitmap_l;
1061         return;
1062 
1063     LnoBitmap:
1064 /*
1065         // check whether the class is small enough to justify a string-search
1066         // TODO: consider inverse class for 8bit chars?
1067         uint class_size;
1068         foreach ( p; input.parts )
1069             class_size += cast(uint)p.r_+1-p.l_;
1070         if ( class_size > MAX_SEARCH_LENGTH )
1071             goto Lgeneric;
1072         data_str.length = class_size;
1073         size_t ind;
1074         foreach ( p; input.parts )
1075         {
1076             for ( char_t c = p.l_; c <= p.r_; ++c )
1077                 data_str[ind++] = c;
1078         }
1079         mode = type==Type.consume ? MatchMode.string_search : MatchMode.string_search_l;
1080         return;
1081 */
1082     Lgeneric:
1083         data_str = cast(char_t[])input.parts;
1084         mode = type==Type.consume ? MatchMode.generic : MatchMode.generic_l;
1085     }
1086 
1087     bool matches(char_t c)
1088     {
1089         if ( type == Type.consume || type == Type.lookahead )
1090             return input.matches(c);
1091         assert(0);
1092     }
1093 
1094     Predicate intersect(Predicate p)
1095     {
1096         Predicate p2;
1097         if ( type != Type.epsilon && p.type != Type.epsilon )
1098             p2.input = input.intersect(p.input);
1099         return p2;
1100     }
1101 
1102     bool intersects(Predicate p)
1103     {
1104         if ( type != p.type )
1105             return false;
1106         foreach ( cr; input.parts )
1107         {
1108             foreach ( cr2; p.input.parts )
1109             {
1110                 if ( cr.intersects(cr2) )
1111                     return true;
1112             }
1113         }
1114         return false;
1115     }
1116 
1117     bool exceedsMax(uint maxc)
1118     {
1119         foreach ( p; input.parts )
1120         {
1121             if ( p.l_ > maxc || p.r_ > maxc )
1122                 return true;
1123         }
1124 
1125         return false;
1126     }
1127 
1128     @property bool empty()
1129     {
1130         return type != Type.epsilon && input.empty();
1131     }
1132 
1133     void subtract(Predicate p)
1134     {
1135         if ( type != Type.epsilon && p.type != Type.epsilon )
1136             input.subtract(p.input);
1137     }
1138 
1139     void negate()
1140     {
1141         assert(type != Type.epsilon);
1142         input.negate();
1143     }
1144 
1145     void optimize()
1146     {
1147         assert(type != Type.epsilon);
1148         input.optimize();
1149     }
1150 
1151     int opCmp(const ref Predicate p) const
1152     {
1153         return input.opCmp(p.input);
1154     }
1155 
1156     const bool opEquals(ref const Predicate p)
1157     {
1158         if ( type != p.type )
1159             return false;
1160         if ( input.opCmp(p.input) != 0 )
1161             return false;
1162         return true;
1163     }
1164 
1165     cc_t getInput()
1166     {
1167         return input;
1168     }
1169 
1170     void setInput(const(cc_t) cc)
1171     {
1172         input = cast(cc_t)cc;
1173     }
1174 
1175     void appendInput(cr_t cr)
1176     {
1177         input.add(cr);
1178     }
1179 
1180     void appendInput(cc_t cc)
1181     {
1182         input.add(cc);
1183     }
1184 
1185     void appendInput(Predicate p)
1186     {
1187         input.add(p.input);
1188     }
1189 
1190     string toString()
1191     {
1192         string str;
1193         switch ( type )
1194         {
1195             case Type.consume:      str = input.toString();       break;
1196             case Type.epsilon:      str = "eps";                break;
1197             case Type.lookahead:    str = "la:"~input.toString(); break;
1198             case Type.lookbehind:   str = "lb:"~input.toString(); break;
1199             default:
1200                 assert(0);
1201         }
1202         return str;
1203     }
1204 }
1205 import Utf = tango.text.convert.Utf;
1206 import tango.text.convert.Format;
1207 
1208 /* ************************************************************************************************
1209 
1210 **************************************************************************************************/
1211 class RegExpException : Exception
1212 {
1213     this(string msg)
1214     {
1215         super("RegExp: "~msg);
1216     }
1217 }
1218 
1219 /* ************************************************************************************************
1220     TNFA state
1221 **************************************************************************************************/
1222 private class TNFAState(char_t)
1223 {
1224     bool    accept = false,
1225             visited = false;
1226     size_t  index;
1227     List!(TNFATransition!(char_t))  transitions;
1228 
1229     this()
1230     {
1231         transitions = new List!(TNFATransition!(char_t));
1232     }
1233 }
1234 
1235 
1236 /* ************************************************************************************************
1237     Priority classes used to linearize priorities after non-linear transition creation.
1238 **************************************************************************************************/
1239 private enum PriorityClass {
1240     greedy=0, normal=1, reluctant=2, extraReluctant=3
1241 }
1242 
1243 /* ********************************************************************************
1244     TNFA tagged transition
1245 ***********************************************************************************/
1246 private class TNFATransition(char_t)
1247 {
1248     TNFAState!(char_t)  target;
1249     Predicate!(char_t)  predicate;
1250     uint                priority,
1251                         tag;        /// one-based tag number, 0 = untagged
1252     PriorityClass       priorityClass;
1253 
1254     this(PriorityClass pc)
1255     {
1256         priorityClass = pc;
1257     }
1258 
1259     /******************************************************************************
1260         Move through states only going via epsilon transitions, and only choosing
1261         the one with highest priority. If the highest priority transition from a
1262         state isn't an epsilon transition, false is returned.
1263         If the accepting NFA state can be reached in this manner, true is returned.
1264 
1265         NOTE: This method does not look for cycles which should be kept in mind for
1266         later. larsivi 20090827
1267     *******************************************************************************/
1268     bool canFinish()
1269     {
1270         TNFAState!(char_t)  t = target;
1271         while (!t.accept) {
1272             TNFATransition!(char_t) highestPriTrans;
1273             foreach (trans; t.transitions) {
1274                 if (!highestPriTrans || highestPriTrans.priority > trans.priority)
1275                     highestPriTrans = trans;
1276             }
1277             if (!(highestPriTrans.predicate.type == Predicate!(char_t).Type.epsilon))
1278                 return false;
1279 
1280             t = highestPriTrans.target;
1281         }
1282         return true;
1283     }
1284 
1285 }
1286 
1287 /* ************************************************************************************************
1288     Fragments of TNFAs as used in the Thompson method
1289 **************************************************************************************************/
1290 private class TNFAFragment(char_t)
1291 {
1292     alias TNFAState!(char_t)        state_t;
1293     alias TNFATransition!(char_t)   trans_t;
1294 
1295     List!(trans_t)  entries,        /// transitions to be added to the entry state
1296                     exits,          /// transitions to be added to the exit state
1297                     entry_state,    /// transitions to write the entry state to
1298                     exit_state;     /// transitions to write the exit state to
1299 
1300     bool swapMatchingBracketSyntax;
1301 
1302     this()
1303     {
1304         entries     = new List!(trans_t);
1305         exits       = new List!(trans_t);
1306         entry_state = new List!(trans_t);
1307         exit_state  = new List!(trans_t);
1308     }
1309 
1310     /* ********************************************************************************************
1311         Write the given state as entry state to this fragment.
1312     **********************************************************************************************/
1313     void setEntry(state_t state)
1314     {
1315         state.transitions ~= entries;
1316         foreach ( t; entry_state )
1317             t.target = state;
1318     }
1319 
1320     /* ********************************************************************************************
1321         Write the given state as exit state to this fragment.
1322     **********************************************************************************************/
1323     void setExit(state_t state)
1324     {
1325         state.transitions ~= exits;
1326         foreach ( t; exit_state )
1327             t.target = state;
1328     }
1329 }
1330 
1331 /* ************************************************************************************************
1332     Tagged NFA
1333 **************************************************************************************************/
1334 private final class TNFA(char_t)
1335 {
1336     alias TNFATransition!(char_t)   trans_t;
1337     alias TNFAFragment!(char_t)     frag_t;
1338     alias TNFAState!(char_t)        state_t;
1339     alias Predicate!(char_t)        predicate_t;
1340     alias char_t[]                  string_t;
1341     alias CharRange!(char_t)        range_t;
1342     alias CharClass!(char_t)        cc_t;
1343 
1344     string_t    pattern;
1345     state_t[]   states;
1346     state_t     start;
1347 
1348     bool swapMatchingBracketSyntax; /// whether to make (?...) matching and (...) non-matching
1349 
1350     /* ********************************************************************************************
1351         Creates the TNFA from the given regex pattern
1352     **********************************************************************************************/
1353     this(string_t regex)
1354     {
1355         next_tag        = 1;
1356         transitions     = new List!(trans_t);
1357 
1358         pattern = regex;
1359     }
1360 
1361     /* ********************************************************************************************
1362         Print the TNFA (tabular representation of the delta function)
1363     **********************************************************************************************/
1364     debug(TangoRegex) void print()
1365     {
1366         foreach ( int i, s; states )
1367         {
1368             Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" ");
1369 
1370             bool first=true;
1371             Stdout(" {");
1372             foreach ( t; s.transitions )
1373             {
1374                 Stdout.format("{}{}{}:{}->{}", first?"":", ", t.priority, "gnrx"[t.priorityClass], t.predicate.toString, t.target is null?-1:t.target.index);
1375                 if ( t.tag > 0 ) {
1376                     Stdout.format(" t{}", t.tag);
1377                 }
1378                 first = false;
1379             }
1380             Stdout("}").newline;
1381         }
1382     }
1383 
1384     uint tagCount()
1385     {
1386         return next_tag-1;
1387     }
1388 
1389     /* ********************************************************************************************
1390         Constructs the TNFA using extended Thompson method.
1391         Uses a slightly extended version of Dijkstra's shunting yard algorithm to convert
1392         the regexp from infix notation.
1393     **********************************************************************************************/
1394     void parse(bool unanchored)
1395     {
1396         List!(frag_t)       frags       = new List!(frag_t);
1397         Stack!(Operator)    opStack;
1398         Stack!(uint)        tagStack;
1399         Stack!(Pair!(uint)) occurStack;
1400         opStack ~= Operator.eos;
1401 
1402         /* ****************************************************************************************
1403             Perform action on operator stack
1404         ******************************************************************************************/
1405         bool perform(Operator next_op, bool explicit_operator=true)
1406         {
1407             // calculate index in action matrix
1408             int index = cast(int)opStack.top()*(Operator.max+1);
1409             index += cast(int)next_op;
1410 
1411             debug(tnfa) Stdout.formatln("\t{}:{} -> {}  {} frag(s)",
1412                 operator_names[opStack.top], operator_names[next_op], action_names[action_lookup[index]], frags.length
1413             );
1414             switch ( action_lookup[index] )
1415             {
1416                 case Act.pua:
1417                     opStack ~= next_op;
1418                     if ( next_op == Operator.open_par ) {
1419                         tagStack ~= next_tag;
1420                         next_tag += 2;
1421                     }
1422                     break;
1423                 case Act.poc:
1424                     switch ( opStack.top() )
1425                     {
1426                         case Operator.concat:       constructConcat(frags);                             break;
1427                         case Operator.altern:       constructAltern(frags);                             break;
1428                         case Operator.zero_one_g:   constructZeroOne(frags, PriorityClass.greedy);      break;
1429                         case Operator.zero_one_ng:  constructZeroOne(frags, PriorityClass.reluctant);   break;
1430                         case Operator.zero_one_xr:  constructZeroOne(frags, PriorityClass.extraReluctant);  break;
1431                         case Operator.zero_more_g:  constructZeroMore(frags, PriorityClass.greedy);     break;
1432                         case Operator.zero_more_ng: constructZeroMore(frags, PriorityClass.reluctant);  break;
1433                         case Operator.zero_more_xr: constructZeroMore(frags, PriorityClass.extraReluctant); break;
1434                         case Operator.one_more_g:   constructOneMore(frags, PriorityClass.greedy);      break;
1435                         case Operator.one_more_ng:  constructOneMore(frags, PriorityClass.reluctant);   break;
1436                         case Operator.one_more_xr:  constructOneMore(frags, PriorityClass.extraReluctant);  break;
1437                         case Operator.occur_g:
1438                             Pair!(uint) occur = occurStack.pop();
1439                             constructOccur(frags, occur.a, occur.b, PriorityClass.greedy);
1440                             break;
1441                         case Operator.occur_ng:
1442                             Pair!(uint) occur = occurStack.pop();
1443                             constructOccur(frags, occur.a, occur.b, PriorityClass.reluctant);
1444                             break;
1445                         default:
1446                             throw new RegExpException("cannot process operand at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
1447                     }
1448                     opStack.pop();
1449 
1450                     perform(next_op, false);
1451                     break;
1452                 case Act.poa:
1453                     opStack.pop();
1454                     break;
1455                 case Act.pca:
1456                     if ( opStack.top() == Operator.open_par )
1457                     {
1458                         if ( tagStack.empty() )
1459                             throw new RegExpException(Format.convert("Missing opening parentheses for closing parentheses at char {} \"{}\"", cursor, Utf.toString(pattern[cursor..$])).idup);
1460                         constructBracket(frags, tagStack.top());
1461                         tagStack.pop();
1462                     }
1463                     else {
1464                         assert(opStack.top() == Operator.open_par_nm);
1465                         constructBracket(frags);
1466                     }
1467                     opStack.pop();
1468                     break;
1469                 case Act.don:
1470                     return true;
1471                 case Act.err:
1472                 default:
1473                     throw new RegExpException(Format.convert("Unexpected operand at char {} \"{}\" in \"{}\"", cursor, Utf.toString(pattern[cursor..$]), Utf.toString(pattern)).idup);
1474             }
1475 
1476             return false;
1477         }
1478 
1479         // add implicit extra reluctant .* (with . == any_char) at the beginning for unanchored matches
1480         // and matching bracket for total match group
1481         if ( unanchored ) {
1482             frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume);
1483             perform(Operator.zero_more_xr, false);
1484             perform(Operator.concat, false);
1485             perform(Operator.open_par, false);
1486         }
1487 
1488         // convert regex to postfix and create TNFA
1489         bool implicit_concat;
1490         predicate_t.Type pred_type;
1491 
1492         while ( !endOfPattern() )
1493         {
1494             pred_type = predicate_t.Type.consume;
1495 
1496             dchar c = readPattern();
1497             switch ( c )
1498             {
1499                 case '|':
1500                     perform(Operator.altern);
1501                     implicit_concat = false;
1502                     break;
1503                 case '(':
1504                     if ( implicit_concat )
1505                         perform(Operator.concat, false);
1506                     implicit_concat = false;
1507                     if ( peekPattern() == '?' ) {
1508                         readPattern();
1509                         perform(swapMatchingBracketSyntax?Operator.open_par:Operator.open_par_nm);
1510                     }
1511                     else
1512                         perform(swapMatchingBracketSyntax?Operator.open_par_nm:Operator.open_par);
1513                     break;
1514                 case ')':
1515                     perform(Operator.close_par);
1516                     break;
1517                 case '?':
1518                     if ( peekPattern() == '?' ) {
1519                         readPattern();
1520                         perform(Operator.zero_one_ng);
1521                     }
1522                     else
1523                         perform(Operator.zero_one_g);
1524                     break;
1525                 case '*':
1526                     if ( peekPattern() == '?' ) {
1527                         readPattern();
1528                         perform(Operator.zero_more_ng);
1529                     }
1530                     else
1531                         perform(Operator.zero_more_g);
1532                     break;
1533                 case '+':
1534                     if ( peekPattern() == '?' ) {
1535                         readPattern();
1536                         perform(Operator.one_more_ng);
1537                     }
1538                     else
1539                         perform(Operator.one_more_g);
1540                     break;
1541                 case '{':
1542                     Pair!(uint) occur;
1543                     parseOccurCount(occur.a, occur.b);
1544                     occurStack ~= occur;
1545                     if ( peekPattern() == '?' ) {
1546                         readPattern();
1547                         perform(Operator.occur_ng);
1548                     }
1549                     else
1550                         perform(Operator.occur_g);
1551                     break;
1552                 case '[':
1553                     if ( implicit_concat )
1554                         perform(Operator.concat, false);
1555                     implicit_concat = true;
1556                     frags ~= constructCharClass(pred_type);
1557                     break;
1558                 case '.':
1559                     if ( implicit_concat )
1560                         perform(Operator.concat, false);
1561                     implicit_concat = true;
1562                     frags ~= constructChars(cc_t.dot_oper, pred_type);
1563                     break;
1564                 case '$':
1565                     if ( implicit_concat )
1566                         perform(Operator.concat, false);
1567                     implicit_concat = true;
1568 
1569                     frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookahead);
1570                     break;
1571                 case '^':
1572                     if ( implicit_concat )
1573                         perform(Operator.concat, false);
1574                     implicit_concat = true;
1575 
1576                     frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookbehind);
1577                     break;
1578                 case '>':
1579                     c = readPattern();
1580                     pred_type = predicate_t.Type.lookahead;
1581                     if ( c == '[' )
1582                         goto case '[';
1583                     else if ( c == '\\' )
1584                         goto case '\\';
1585                     else if ( c == '.' )
1586                         goto case '.';
1587                     else
1588                         goto default;
1589                 case '<':
1590                     c = readPattern();
1591                     pred_type = predicate_t.Type.lookbehind;
1592                     if ( c == '[' )
1593                         goto case '[';
1594                     else if ( c == '\\' )
1595                         goto case '\\';
1596                     else if ( c == '.' )
1597                         goto case '.';
1598                     else
1599                         goto default;
1600                 case '\\':
1601                     c = readPattern();
1602 
1603                     if ( implicit_concat )
1604                         perform(Operator.concat, false);
1605                     implicit_concat = true;
1606 
1607                     switch ( c )
1608                     {
1609                         case 't':
1610                             frags ~= constructSingleChar('\t', pred_type);
1611                             break;
1612                         case 'n':
1613                             frags ~= constructSingleChar('\n', pred_type);
1614                             break;
1615                         case 'r':
1616                             frags ~= constructSingleChar('\r', pred_type);
1617                             break;
1618                         case 'w':   // alphanumeric and _
1619                             frags ~= constructChars(cc_t.alphanum_, pred_type);
1620                             break;
1621                         case 'W':   // non-(alphanum and _)
1622                             auto cc = cc_t(cc_t.alphanum_);
1623                             cc.negate();
1624                             frags ~= constructChars(cc, pred_type);
1625                             break;
1626                         case 's':   // whitespace
1627                             frags ~= constructChars(cc_t.whitespace, pred_type);
1628                             break;
1629                         case 'S':   // non-whitespace
1630                             auto cc = cc_t(cc_t.whitespace);
1631                             cc.negate();
1632                             frags ~= constructChars(cc, pred_type);
1633                             break;
1634                         case 'd':   // digit
1635                             frags ~= constructChars(cc_t.digit, pred_type);
1636                             break;
1637                         case 'D':   // non-digit
1638                             auto cc = cc_t(cc_t.digit);
1639                             cc.negate();
1640                             frags ~= constructChars(cc, pred_type);
1641                             break;
1642                         case 'b':   // either end of word
1643                             if ( pred_type != predicate_t.Type.consume )
1644                                 throw new RegExpException("Escape sequence \\b not allowed in look-ahead or -behind");
1645 
1646                             // create (?<\S>\s|<\s>\S)
1647                             auto cc = cc_t(cc_t.whitespace);
1648                             cc.negate();
1649 
1650                             perform(Operator.open_par_nm);
1651 
1652                             frags ~= constructChars(cc, predicate_t.Type.lookbehind);
1653                             perform(Operator.concat, false);
1654                             frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead);
1655                             perform(Operator.altern, false);
1656                             frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind);
1657                             perform(Operator.concat, false);
1658                             frags ~= constructChars(cc, predicate_t.Type.lookahead);
1659 
1660                             perform(Operator.close_par, false);
1661                             break;
1662                         case 'B':   // neither end of word
1663                             if ( pred_type != predicate_t.Type.consume )
1664                                 throw new RegExpException("Escape sequence \\B not allowed in look-ahead or -behind");
1665 
1666                             // create (?<\S>\S|<\s>\s)
1667                             auto cc = cc_t(cc_t.whitespace);
1668                             cc.negate();
1669 
1670                             perform(Operator.open_par_nm);
1671 
1672                             frags ~= constructChars(cc, predicate_t.Type.lookbehind);
1673                             perform(Operator.concat, false);
1674                             frags ~= constructChars(cc, predicate_t.Type.lookahead);
1675                             perform(Operator.altern, false);
1676                             frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind);
1677                             perform(Operator.concat, false);
1678                             frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead);
1679 
1680                             perform(Operator.close_par, false);
1681                             break;
1682                         case '(':
1683                         case ')':
1684                         case '[':
1685                         case ']':
1686                         case '{':
1687                         case '}':
1688                         case '*':
1689                         case '+':
1690                         case '?':
1691                         case '.':
1692                         case '\\':
1693                         case '^':
1694                         case '$':
1695                         case '|':
1696                         case '<':
1697                         case '>':
1698                         default:
1699                             frags ~= constructSingleChar(c, pred_type);
1700                             break;
1701 //                            throw new RegExpException(Format.convert("Unknown escape sequence \\{}", c));
1702                     }
1703                     break;
1704 
1705                 default:
1706                     if ( implicit_concat )
1707                         perform(Operator.concat, false);
1708                     implicit_concat = true;
1709                     frags ~= constructSingleChar(c, pred_type);
1710             }
1711         }
1712 
1713         // add implicit reluctant .* (with . == any_char) at the end for unanchored matches
1714         if ( unanchored )
1715         {
1716             perform(Operator.close_par, false);
1717             if ( implicit_concat )
1718                 perform(Operator.concat, false);
1719             frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume);
1720             perform(Operator.zero_more_ng, false);
1721         }
1722 
1723         // empty operator stack
1724         while ( !perform(Operator.eos) ) {}
1725 
1726         // set start and finish states
1727         start = addState();
1728         state_t finish = addState();
1729         finish.accept = true;
1730 
1731         foreach ( f; frags ) {
1732             f.setExit(finish);
1733             f.setEntry(start);
1734         }
1735 
1736         // set transition priorities
1737         List!(trans_t)[PriorityClass.max+1] trans;
1738         foreach ( ref t; trans )
1739             t = new List!(trans_t);
1740 
1741         Stack!(trans_t) todo;
1742         state_t state = start;
1743 
1744         while ( !todo.empty() || !state.visited )
1745         {
1746             if ( !state.visited )
1747             {
1748                 state.visited = true;
1749                 foreach_reverse ( t; state.transitions )
1750                     todo ~= t;
1751             }
1752 
1753             if ( todo.empty() )
1754                 break;
1755             trans_t t = todo.top();
1756             todo.pop();
1757             assert(t.priorityClass<=PriorityClass.max);
1758             trans[t.priorityClass] ~= t;
1759             state = t.target;
1760         }
1761 
1762         uint nextPrio;
1763         foreach ( ts; trans )
1764         {
1765             foreach ( t; ts )
1766                 t.priority = nextPrio++;
1767         }
1768     }
1769 
1770 private:
1771     uint            next_tag;
1772     size_t          cursor,
1773                     next_cursor;
1774     List!(trans_t)  transitions;
1775 
1776     state_t[state_t]    clonedStates;
1777     trans_t[trans_t]    clonedTransitions;
1778 
1779     /// RegEx operators
1780     enum Operator {
1781         eos, concat, altern, open_par, close_par,
1782         zero_one_g, zero_more_g, one_more_g,        // greedy
1783         zero_one_ng, zero_more_ng, one_more_ng,     // non-greedy/reluctant
1784         zero_one_xr, zero_more_xr, one_more_xr,     // extra-reluctant
1785         open_par_nm, occur_g, occur_ng
1786     }
1787     __gshared immutable const(char)[][] operator_names = ["EOS", "concat", "|", "(", ")", "?", "*", "+", "??", "*?", "+?", "??x", "*?x", "+?x", "(?", "{x,y}", "{x,y}?"];
1788 
1789     /// Actions for to-postfix transformation
1790     enum Act {
1791         pua, poc, poa, pca, don, err
1792     }
1793     __gshared immutable const(char)[][] action_names = ["push+advance", "pop+copy", "pop+advance", "pop+copy+advance", "done", "error"];
1794 
1795     /// Action lookup for to-postfix transformation
1796     __gshared immutable Act[] action_lookup =
1797     [
1798     //  eos      concat   |        (        )        ?        *        +        ??       *?       +?       ??extra  *?extra  +?extra  (?       {x,y}    {x,y}?
1799         Act.don, Act.pua, Act.pua, Act.pua, Act.err, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1800         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1801         Act.poc, Act.pua, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1802         Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1803         Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err,
1804         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1805         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1806         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1807         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1808         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1809         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1810         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1811         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1812         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1813         Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1814         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1815         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc
1816     ];
1817 
1818     final dchar peekPattern()
1819     {
1820         auto tmp = next_cursor;
1821         if ( tmp < pattern.length )
1822             return decode(pattern, tmp);
1823         return 0;
1824     }
1825 
1826     final dchar readPattern()
1827     {
1828         cursor = next_cursor;
1829         if ( next_cursor < pattern.length )
1830             return decode(pattern, next_cursor);
1831         return 0;
1832     }
1833 
1834     final bool endOfPattern()
1835     {
1836         return next_cursor >= pattern.length;
1837     }
1838 
1839     state_t addState()
1840     {
1841         state_t s = new state_t;
1842         s.index = states.length;
1843         states ~= s;
1844         return s;
1845     }
1846 
1847     trans_t addTransition(PriorityClass pc = PriorityClass.normal)
1848     {
1849         trans_t trans = new trans_t(pc);
1850         transitions ~= trans;
1851         return trans;
1852     }
1853 
1854     uint parseNumber()
1855     {
1856         uint res;
1857         while ( !endOfPattern() )
1858         {
1859             auto c = peekPattern();
1860             if ( c < '0' || c > '9' )
1861                 break;
1862             res = res*10+(c-'0');
1863             readPattern();
1864         }
1865         return res;
1866     }
1867 
1868     void parseOccurCount(out uint minOccur, out uint maxOccur)
1869     {
1870         assert(pattern[cursor] == '{');
1871 
1872         minOccur = parseNumber();
1873         if ( peekPattern() == '}' ) {
1874             readPattern();
1875             maxOccur = minOccur;
1876             return;
1877         }
1878         if ( peekPattern() != ',' )
1879             throw new RegExpException("Invalid occurence range at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
1880         readPattern();
1881         maxOccur = parseNumber();
1882         if ( peekPattern() != '}' )
1883             throw new RegExpException("Invalid occurence range at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
1884         readPattern();
1885         if ( maxOccur > 0 && maxOccur < minOccur )
1886             throw new RegExpException("Invalid occurence range (max < min) at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
1887     }
1888 
1889     trans_t clone(trans_t t)
1890     {
1891         if ( t is null )
1892             return null;
1893         trans_t* tmp = t in clonedTransitions;
1894         if ( tmp !is null )
1895             return *tmp;
1896 
1897         trans_t t2 = new trans_t(t.priorityClass);
1898         clonedTransitions[t] = t2;
1899         t2.tag = t.tag;
1900         t2.priority = t.priority;
1901         t2.predicate = t.predicate;
1902         t2.target = clone(t.target);
1903         transitions ~= t2;
1904         return t2;
1905     }
1906 
1907     state_t clone(state_t s)
1908     {
1909         if ( s is null )
1910             return null;
1911         state_t* tmp = s in clonedStates;
1912         if ( tmp !is null )
1913             return *tmp;
1914 
1915         state_t s2 = new state_t;
1916         clonedStates[s] = s2;
1917         s2.accept = s.accept;
1918         s2.visited = s.visited;
1919         foreach ( t; s.transitions )
1920             s2.transitions ~= clone(t);
1921         s2.index = states.length;
1922         states ~= s2;
1923         return s2;
1924     }
1925 
1926     frag_t clone(frag_t f)
1927     {
1928         if ( f is null )
1929             return null;
1930         clonedStates = null;
1931         clonedTransitions = null;
1932 
1933         frag_t f2 = new frag_t;
1934         foreach ( t; f.entries )
1935             f2.entries ~= clone(t);
1936         foreach ( t; f.exits )
1937             f2.exits ~= clone(t);
1938         foreach ( t; f.entry_state )
1939             f2.entry_state ~= clone(t);
1940         foreach ( t; f.exit_state )
1941             f2.exit_state ~= clone(t);
1942         return f2;
1943     }
1944 
1945     //---------------------------------------------------------------------------------------------
1946     // Thompson constructions of NFA fragments
1947 
1948     frag_t constructSingleChar(char_t c, predicate_t.Type type)
1949     {
1950         debug(tnfa) Stdout.formatln("constructCharFrag {}", c);
1951 
1952         trans_t trans = addTransition();
1953         trans.predicate.appendInput(CharRange!(char_t)(c));
1954 
1955         trans.predicate.type = type;
1956 
1957         frag_t frag = new frag_t;
1958         frag.exit_state ~= trans;
1959         frag.entries    ~= trans;
1960         return frag;
1961     }
1962 
1963     frag_t constructChars(string_t chars, in predicate_t.Type type)
1964     {
1965         cc_t cc;
1966         for ( int i = 0; i < chars.length; ++i )
1967             cc.add(chars[i]);
1968 
1969         return constructChars(cc, type);
1970     }
1971 
1972     frag_t constructChars(const(cc_t) charclass, in predicate_t.Type type)
1973     {
1974         debug(tnfa) Stdout.format("constructChars type={}", type);
1975 
1976         trans_t trans = addTransition();
1977         trans.predicate.type = type;
1978 
1979         trans.predicate.setInput(cc_t(charclass));
1980 
1981         trans.predicate.optimize();
1982         debug(tnfa) Stdout.formatln("-> {}", trans.predicate.toString);
1983 
1984         frag_t frag = new frag_t;
1985         frag.exit_state ~= trans;
1986         frag.entries    ~= trans;
1987         return frag;
1988     }
1989 
1990     frag_t constructCharClass(predicate_t.Type type)
1991     {
1992         debug(tnfa) Stdout.format("constructCharClass type={}", type);
1993         auto oldCursor = cursor;
1994 
1995         trans_t trans = addTransition();
1996 
1997         bool negated=false;
1998         if ( peekPattern() == '^' ) {
1999             readPattern();
2000             negated = true;
2001         }
2002 
2003         char_t  last;
2004         bool    have_range_start,
2005                 first_char = true;
2006         for ( ; !endOfPattern() && peekPattern() != ']'; )
2007         {
2008             dchar c = readPattern();
2009             switch ( c )
2010             {
2011                 case '-':
2012                     if ( first_char ) {
2013                         trans.predicate.appendInput(range_t(c));
2014                         break;
2015                     }
2016                     if ( !have_range_start )
2017                         throw new RegExpException("Missing range start for '-' operator after \""~Utf.toString(pattern).idup~"\"");
2018                     else if ( endOfPattern() || peekPattern() == ']' )
2019                         throw new RegExpException("Missing range end for '-' operator after \""~Utf.toString(pattern).idup~"\"");
2020                     else {
2021                         c = readPattern();
2022                         trans.predicate.appendInput(range_t(last, c));
2023                         have_range_start = false;
2024                     }
2025                     break;
2026                 case '\\':
2027                     if ( endOfPattern() )
2028                         throw new RegExpException("unexpected end of string after \""~Utf.toString(pattern).idup~"\"");
2029                     c = readPattern();
2030                     switch ( c )
2031                     {
2032                         case 't':
2033                             c = '\t';
2034                             break;
2035                         case 'n':
2036                             c = '\n';
2037                             break;
2038                         case 'r':
2039                             c = '\r';
2040                             break;
2041                         default:
2042                             break;
2043                     }
2044                     goto default;
2045                 default:
2046                     if ( have_range_start )
2047                         trans.predicate.appendInput(range_t(last));
2048                     last = c;
2049                     have_range_start = true;
2050             }
2051             first_char = false;
2052         }
2053         if ( !endOfPattern() )
2054             readPattern();
2055         if ( last != char_t.init )
2056             trans.predicate.appendInput(range_t(last));
2057         debug(tnfa) Stdout.formatln(" {}", pattern[oldCursor..cursor]);
2058 
2059         if ( negated ) {
2060             auto tmp = cc_t(cc_t.any_char);
2061             tmp.subtract(trans.predicate.input);
2062             trans.predicate.input = tmp;
2063         }
2064         else
2065             trans.predicate.optimize();
2066         debug(tnfa) Stdout.formatln("-> {}", trans.predicate.toString);
2067 
2068         trans.predicate.type = type;
2069 
2070         frag_t frag = new frag_t;
2071         frag.exit_state ~= trans;
2072         frag.entries    ~= trans;
2073         return frag;
2074     }
2075 
2076     void constructBracket(List!(frag_t) frags, uint tag=0)
2077     {
2078         debug(tnfa) Stdout.formatln("constructBracket");
2079 
2080         state_t entry = addState(),
2081                 exit = addState();
2082         frags.tail.value.setEntry(entry);
2083         frags.tail.value.setExit(exit);
2084 
2085         trans_t tag1 = addTransition(),
2086                 tag2 = addTransition();
2087         tag1.predicate.type = predicate_t.Type.epsilon;
2088         tag2.predicate.type = predicate_t.Type.epsilon;
2089         if ( tag > 0 )
2090         {
2091             // make sure the tag indeces for bracket x are always
2092             // x*2 for the opening bracket and x*2+1 for the closing bracket
2093             tag1.tag = tag++;
2094             tag2.tag = tag;
2095         }
2096         tag1.target = entry;
2097         exit.transitions ~= tag2;
2098 
2099         frag_t frag = new frag_t;
2100         frag.entries ~= tag1;
2101         frag.exit_state ~= tag2;
2102         frags.pop();
2103         frags ~= frag;
2104     }
2105 
2106     void constructOneMore(List!(frag_t) frags, PriorityClass prioClass)
2107     {
2108         debug(tnfa) Stdout.formatln("constructOneMore");
2109 
2110         if ( frags.empty() )
2111             throw new RegExpException("too few arguments for + at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2112 
2113         trans_t repeat = addTransition(prioClass),
2114                 cont = addTransition();
2115         repeat.predicate.type = predicate_t.Type.epsilon;
2116         cont.predicate.type = predicate_t.Type.epsilon;
2117 
2118         state_t s = addState();
2119         frags.tail.value.setExit(s);
2120         s.transitions ~= repeat;
2121         s.transitions ~= cont;
2122 
2123         frag_t frag = new frag_t;
2124         frag.entries ~= frags.tail.value.entries;
2125         frag.entry_state ~= frags.tail.value.entry_state;
2126         frag.entry_state ~= repeat;
2127         frag.exit_state ~= cont;
2128         frags.pop();
2129         frags ~= frag;
2130     }
2131 
2132     void constructZeroMore(List!(frag_t) frags, PriorityClass prioClass)
2133     {
2134         debug(tnfa) Stdout.formatln("constructZeroMore");
2135 
2136         if ( frags.empty() )
2137             throw new RegExpException("too few arguments for * at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2138 
2139         trans_t enter = addTransition(prioClass),
2140                 repeat = addTransition(prioClass),
2141                 skip = addTransition();
2142         skip.predicate.type = predicate_t.Type.epsilon;
2143         repeat.predicate.type = predicate_t.Type.epsilon;
2144         enter.predicate.type = predicate_t.Type.epsilon;
2145 
2146         state_t entry = addState(),
2147                 exit = addState();
2148         frags.tail.value.setEntry(entry);
2149         frags.tail.value.setExit(exit);
2150         exit.transitions ~= repeat;
2151         enter.target = entry;
2152 
2153         frag_t frag = new frag_t;
2154         frag.entries ~= skip;
2155         frag.entries ~= enter;
2156         frag.exit_state ~= skip;
2157         frag.entry_state ~= repeat;
2158         frags.pop();
2159         frags ~= frag;
2160     }
2161 
2162     void constructZeroOne(List!(frag_t) frags, PriorityClass prioClass)
2163     {
2164         debug(tnfa) Stdout.formatln("constructZeroOne");
2165 
2166         if ( frags.empty() )
2167             throw new RegExpException("too few arguments for ? at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2168 
2169         trans_t use = addTransition(prioClass),
2170                 skip = addTransition();
2171         use.predicate.type = predicate_t.Type.epsilon;
2172         skip.predicate.type = predicate_t.Type.epsilon;
2173 
2174         state_t s = addState();
2175         frags.tail.value.setEntry(s);
2176         use.target = s;
2177 
2178         frag_t frag = new frag_t;
2179         frag.entries ~= use;
2180         frag.entries ~= skip;
2181         frag.exits ~= frags.tail.value.exits;
2182         frag.exit_state ~= frags.tail.value.exit_state;
2183         frag.exit_state ~= skip;
2184         frags.pop();
2185         frags ~= frag;
2186     }
2187 
2188     void constructOccur(List!(frag_t) frags, uint minOccur, uint maxOccur, PriorityClass prioClass)
2189     {
2190         debug(tnfa) Stdout.formatln("constructOccur {},{}", minOccur, maxOccur);
2191 
2192         if ( frags.empty() )
2193             throw new RegExpException("too few arguments for {x,y} at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2194 
2195         state_t s;
2196         frag_t  total = new frag_t,
2197                 prev;
2198 
2199         for ( int i = 0; i < minOccur; ++i )
2200         {
2201             frag_t f = clone(frags.tail.value);
2202             if ( prev !is null ) {
2203                 s = addState();
2204                 prev.setExit(s);
2205                 f.setEntry(s);
2206             }
2207             else {
2208                 total.entries = f.entries;
2209                 total.entry_state = f.entry_state;
2210             }
2211             prev = f;
2212         }
2213 
2214         if ( maxOccur == 0 )
2215         {
2216             frag_t f = frags.tail.value;
2217             trans_t t = addTransition();
2218             t.predicate.type = predicate_t.Type.epsilon;
2219             f.entries ~= t;
2220             f.exit_state ~= t;
2221 
2222             t = addTransition();
2223             t.predicate.type = predicate_t.Type.epsilon;
2224             f.exits ~= t;
2225             f.entry_state ~= t;
2226 
2227             s = addState();
2228             f.setEntry(s);
2229 
2230             if ( prev !is null )
2231                 prev.setExit(s);
2232             else {
2233                 total.entries = f.entries;
2234                 total.entry_state = f.entry_state;
2235             }
2236 
2237             prev = f;
2238         }
2239 
2240         for ( int i = minOccur; i < maxOccur; ++i )
2241         {
2242             frag_t f;
2243             if ( i < maxOccur-1 )
2244                 f = clone(frags.tail.value);
2245             else
2246                 f = frags.tail.value;
2247             trans_t t = addTransition();
2248             t.predicate.type = predicate_t.Type.epsilon;
2249             f.entries ~= t;
2250             f.exit_state ~= t;
2251 
2252             if ( prev !is null ) {
2253                 s = addState();
2254                 prev.setExit(s);
2255                 f.setEntry(s);
2256             }
2257             else {
2258                 total.entries = f.entries;
2259                 total.entry_state = f.entry_state;
2260             }
2261             prev = f;
2262         }
2263 
2264         total.exits = prev.exits;
2265         total.exit_state = prev.exit_state;
2266 
2267         frags.pop();
2268         frags ~= total;
2269     }
2270 
2271     void constructAltern(List!(frag_t) frags)
2272     {
2273         debug(tnfa) Stdout.formatln("constructAltern");
2274 
2275         if ( frags.empty() || frags.head is frags.tail )
2276             throw new RegExpException("too few arguments for | at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2277 
2278         frag_t  frag = new frag_t,
2279                 f1 = frags.tail.value,
2280                 f2 = frags.tail.prev.value;
2281         frag.entry_state ~= f2.entry_state;
2282         frag.entry_state ~= f1.entry_state;
2283         frag.exit_state ~= f2.exit_state;
2284         frag.exit_state ~= f1.exit_state;
2285         frag.entries ~= f2.entries;
2286         frag.entries ~= f1.entries;
2287         frag.exits ~= f2.exits;
2288         frag.exits ~= f1.exits;
2289 
2290         frags.pop();
2291         frags.pop();
2292         frags ~= frag;
2293     }
2294 
2295     void constructConcat(List!(frag_t) frags)
2296     {
2297         debug(tnfa) Stdout.formatln("constructConcat");
2298 
2299         if ( frags.empty() || frags.head is frags.tail )
2300             throw new RegExpException("too few operands for concatenation at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2301 
2302         frag_t  f1 = frags.tail.value,
2303                 f2 = frags.tail.prev.value;
2304 
2305         state_t state = addState();
2306         f2.setExit(state);
2307         f1.setEntry(state);
2308 
2309         frag_t frag = new frag_t;
2310         frag.entries ~= f2.entries;
2311         frag.exits ~= f1.exits;
2312         frag.entry_state ~= f2.entry_state;
2313         frag.exit_state ~= f1.exit_state;
2314         frags.pop();
2315         frags.pop();
2316         frags ~= frag;
2317     }
2318 }
2319 import tango.core.Array;
2320 
2321 /* ************************************************************************************************
2322     Tagged DFA
2323 **************************************************************************************************/
2324 private class TDFA(char_t)
2325 {
2326     alias Predicate!(char_t)    predicate_t;
2327     alias CharRange!(char_t)    range_t;
2328     alias CharClass!(char_t)    charclass_t;
2329     alias char_t[]              string_t;
2330 
2331     enum uint CURRENT_POSITION_REGISTER = ~0;
2332 
2333     /* ********************************************************************************************
2334         Tag map assignment command
2335     **********************************************************************************************/
2336     struct Command
2337     {
2338         uint        dst,    /// register index to recieve data
2339                     src;    /// register index or CURRENT_POSITION_REGISTER for current position
2340 
2341         string toString()
2342         {
2343             return Format.convert("{}<-{}", dst, src==CURRENT_POSITION_REGISTER?"p":Format.convert("{}", src)).idup;
2344         }
2345 
2346         /* ****************************************************************************************
2347             Order transitions by the order of their predicates.
2348         ******************************************************************************************/
2349         int opCmp(const ref Command cmd) const
2350         {
2351             if ( src == CURRENT_POSITION_REGISTER && cmd.src != CURRENT_POSITION_REGISTER )
2352                 return 1;
2353             if ( src != CURRENT_POSITION_REGISTER && cmd.src == CURRENT_POSITION_REGISTER )
2354                 return -1;
2355             if ( dst < cmd.dst )
2356                 return -1;
2357             if ( dst == cmd.dst )
2358                 return 0;
2359             return 1;
2360         }
2361     }
2362 
2363     struct TagIndex
2364     {
2365         uint    tag,
2366                 index;
2367 
2368         const int opCmp(ref const TagIndex o)
2369         {
2370             if (tag == o.tag && index == o.index)
2371                 return 0;
2372             if (tag > o.tag)
2373                 return 1;
2374             if (tag < o.tag)
2375                 return -1;
2376             if (index > o.index)
2377                 return 1;
2378             if (index < o.index)
2379                 return -1;
2380             assert(0);
2381         }
2382     }
2383 
2384     /* ********************************************************************************************
2385         TDFA state
2386     **********************************************************************************************/
2387     class State
2388     {
2389         enum Mode {
2390             GENERIC, MIXED, LOOKUP
2391         }
2392 
2393         enum uint  LOOKUP_LENGTH = 256,
2394                    INVALID_STATE = 255;
2395 
2396         bool            accept = false;
2397         bool            reluctant = false;
2398         size_t          index;
2399         Transition[]    transitions,
2400                         generic_transitions;
2401         Command[]       finishers;
2402 
2403         ubyte[]         lookup;
2404         Mode            mode;
2405 
2406         void optimize()
2407         {
2408             // merge transitions with equal targets (same state index and equal commands)
2409             size_t[] remove_indeces;
2410             foreach ( i, t; transitions[0 .. $-1] )
2411             {
2412                 foreach ( t2; transitions[i+1 .. $] )
2413                 {
2414                     if ( t.predicate.type != t2.predicate.type || !t.equalTarget(t2) )
2415                         continue;
2416                     t2.predicate.appendInput(t.predicate);
2417                     remove_indeces ~= i;
2418                     break;
2419                 }
2420             }
2421 
2422             // remove transitions that have been merged into another
2423             if ( remove_indeces.length > 0 )
2424             {
2425                 Transition[] tmp;
2426                 tmp.length = transitions.length - remove_indeces.length;
2427                 size_t next_remove, next;
2428                 foreach ( i, t; transitions )
2429                 {
2430                     if ( next_remove < remove_indeces.length && remove_indeces[next_remove] == i ) {
2431                         ++next_remove;
2432                         continue;
2433                     }
2434                     tmp[next++] = t;
2435                 }
2436                 transitions = tmp;
2437 
2438                 foreach ( t; transitions )
2439                     t.predicate.optimize();
2440             }
2441         }
2442 
2443         void createLookup()
2444         {
2445             size_t count;
2446             foreach ( t; transitions )
2447             {
2448                 if ( !t.predicate.exceedsMax(LOOKUP_LENGTH) )
2449                     ++count;
2450             }
2451 
2452             if ( count < 1 || transitions.length > INVALID_STATE ) {
2453                 generic_transitions.length = transitions.length;
2454                 generic_transitions[] = transitions[];
2455                 return;
2456             }
2457 
2458             foreach ( t; transitions )
2459             {
2460                 if ( t.predicate.exceedsMax(LOOKUP_LENGTH) )
2461                     generic_transitions ~= t;
2462             }
2463 
2464             // setup lookup table
2465             lookup.length = LOOKUP_LENGTH;
2466             lookup[] = INVALID_STATE;
2467             foreach ( i, t; transitions )
2468             {
2469                 foreach ( p; t.predicate.input.parts )
2470                 {
2471                     if ( p.l_ >= lookup.length )
2472                         continue;
2473                     for ( char_t c = p.l_; c <= min(p.r_, LOOKUP_LENGTH-1); ++c )
2474                         lookup[c] = cast(ubyte)i;
2475                 }
2476             }
2477 
2478             mode = count < transitions.length? Mode.MIXED : Mode.LOOKUP;
2479         }
2480     }
2481 
2482     /* ********************************************************************************************
2483         TDFA transition
2484     **********************************************************************************************/
2485     class Transition
2486     {
2487         State       target;
2488         predicate_t predicate;
2489         Command[]   commands;
2490 
2491         /* ****************************************************************************************
2492             Order transitions by the order of their predicates.
2493         ******************************************************************************************/
2494         override final int opCmp(Object o)
2495         {
2496             Transition t = cast(Transition)o;
2497             assert(t !is null);
2498             return predicate.opCmp(t.predicate);
2499         }
2500 
2501         override final const bool opEquals(const Object o)
2502         {
2503             auto t = cast(const(Transition))o;
2504             if ( t is null )
2505                 return false;
2506             if ( equalTarget(t) && t.predicate == predicate )
2507                 return true;
2508             return false;
2509         }
2510 
2511         const bool equalTarget(in Transition t)
2512         {
2513             if ( t.target.index != target.index )
2514                 return false;
2515             if ( commands.length != t.commands.length )
2516                 return false;
2517             Louter: foreach ( cmd; commands )
2518             {
2519                 foreach ( cmd2; t.commands )
2520                 {
2521                     if ( cmd == cmd2 )
2522                         continue Louter;
2523                 }
2524                 return false;
2525             }
2526             return true;
2527         }
2528     }
2529 
2530 
2531     State[]     states;
2532     State       start;
2533     Command[]   initializer;
2534     uint        num_tags;
2535 
2536     uint[TagIndex]  registers;
2537     uint            next_register;
2538 
2539     uint num_regs()
2540     {
2541         return next_register;
2542     }
2543 
2544     /* ********************************************************************************************
2545         Constructs the TDFA from the given TNFA using extended power set method
2546     **********************************************************************************************/
2547     this(TNFA!(char_t) tnfa)
2548     {
2549         num_tags        = tnfa.tagCount();
2550 
2551         next_register   = num_tags;
2552         for ( int i = 1; i <= num_tags; ++i ) {
2553             TagIndex ti;
2554             ti.tag = i;
2555             registers[ti] = i-1;
2556         }
2557 
2558         // create epsilon closure of TNFA start state
2559         SubsetState subset_start    = new SubsetState;
2560         StateElement se             = new StateElement;
2561         se.nfa_state = tnfa.start;
2562         subset_start.elms ~= se;
2563 
2564         // apply lookbehind closure for string/line start
2565         predicate_t tmp_pred;
2566         tmp_pred.setInput(CharClass!(char_t).line_startend);
2567         subset_start = epsilonClosure(lookbehindClosure(epsilonClosure(subset_start, subset_start), tmp_pred), subset_start);
2568 
2569         start = addState();
2570         subset_start.dfa_state = start;
2571 
2572         // generate initializer and finisher commands for TDFA start state
2573         generateInitializers(subset_start);
2574         generateFinishers(subset_start);
2575 
2576         // initialize stack for state traversal
2577         List!(SubsetState)  subset_states   = new List!(SubsetState),
2578                             unmarked        = new List!(SubsetState);
2579         subset_states   ~= subset_start;
2580         unmarked        ~= subset_start;
2581         debug(tdfa) Stdout.formatln("\n{} = {}\n", subset_start.dfa_state.index, subset_start.toString);
2582 
2583         while ( !unmarked.empty() )
2584         {
2585             SubsetState state = unmarked.tail.value;
2586             unmarked.pop();
2587 
2588             // create transitions for each class, creating new states when necessary
2589             foreach ( pred; disjointPredicates(state) )
2590             {
2591                 // find NFA state we reach with pred
2592                 // reach will set predicate type correctly
2593                 debug(tdfa) Stdout.format("from {} with {} reach", state.dfa_state.index, pred.toString);
2594                 SubsetState target = reach(state, pred);
2595                 if ( target is null ) {
2596                     continue;
2597                     debug(tdfa) Stdout.formatln(" nothing - lookbehind at beginning, skipping");
2598                 }
2599                 debug(tdfa) Stdout.formatln(" {}", target.toString);
2600                 target = epsilonClosure(lookbehindClosure(epsilonClosure(target, state), pred), state);
2601 
2602                 Transition trans = new Transition;
2603                 state.dfa_state.transitions ~= trans;
2604                 debug (tdfa_new_trans) Stdout.formatln("Creating with pred: {}", pred);
2605                 trans.predicate = pred;
2606 
2607                 // generate indeces for pos commands
2608                 // delay creation of pos command until we have reorder-commands
2609                 uint[uint] cmds = null;
2610                 foreach ( e; target.elms )
2611                 {
2612                     foreach ( tag, ref index; e.tags )
2613                     {
2614                         bool found=false;
2615                         foreach ( e2; state.elms )
2616                         {
2617                             int* i = tag in e2.tags;
2618                             if ( i !is null && *i == index ) {
2619                                 found=true;
2620                                 break;
2621                             }
2622                         }
2623                         if ( !found )
2624                         {
2625                             // if index is < 0 it is a temporary index
2626                             // used only to distinguish the state from existing ones.
2627                             // the previous index can be reused instead.
2628                             if ( index < 0 )
2629                                 index = -index-1;
2630                             cmds[tag] = index;
2631                         }
2632                         else
2633                             assert(index>=0);
2634                     }
2635                 }
2636 
2637                 // check whether a state exists that is identical except for tag index reorder-commands
2638                 bool exists=false;
2639                 foreach ( equivTarget; subset_states )
2640                 {
2641                     if ( reorderTagIndeces(target, equivTarget, state, trans) ) {
2642                         target = equivTarget;
2643                         exists = true;
2644                         break;
2645                     }
2646                 }
2647                 // else create new target state
2648                 if ( !exists )
2649                 {
2650                     State ts = addState();
2651                     target.dfa_state = ts;
2652                     subset_states   ~= target;
2653                     unmarked        ~= target;
2654                     debug(tdfa_add) {
2655                         Stdout.formatln("\nAdded {} = {}\n", target.dfa_state.index, target.toString);
2656                     }
2657                     generateFinishers(target);
2658                 }
2659 
2660                 // now generate pos commands, rewriting reorder-commands if existent
2661                 foreach ( tag, index; cmds )
2662                 {
2663                     // check whether reordering used this tag, if so, overwrite the command directly,
2664                     // for it's effect would be overwritten by a subsequent pos-command anyway
2665                     uint reg = registerFromTagIndex(tag, index);
2666                     bool found = false;
2667                     foreach ( ref cmd; trans.commands )
2668                     {
2669                         if ( cmd.src == reg ) {
2670                             found = true;
2671                             cmd.src = CURRENT_POSITION_REGISTER;
2672                             break;
2673                         }
2674                     }
2675                     if ( !found ) {
2676                         Command cmd;
2677                         cmd.dst = reg;
2678                         cmd.src = CURRENT_POSITION_REGISTER;
2679                         trans.commands ~= cmd;
2680                     }
2681                 }
2682 
2683                 trans.target = target.dfa_state;
2684                 debug(tdfa) {
2685                     Stdout.formatln("=> from {} with {} reach {}", state.dfa_state.index, pred.toString, target.dfa_state.index);
2686                 }
2687             }
2688 
2689             state.dfa_state.optimize();
2690         }
2691 
2692         // renumber registers continuously
2693         uint[uint]  regNums;
2694 
2695         for ( next_register = 0; next_register < num_tags; ++next_register )
2696             regNums[next_register] = next_register;
2697 
2698         void renumberCommand(ref Command cmd)
2699         {
2700             if ( cmd.src != CURRENT_POSITION_REGISTER && (cmd.src in regNums) is null )
2701                 regNums[cmd.src] = next_register++;
2702             if ( (cmd.dst in regNums) is null )
2703                 regNums[cmd.dst] = next_register++;
2704             if ( cmd.src != CURRENT_POSITION_REGISTER )
2705                 cmd.src = regNums[cmd.src];
2706             cmd.dst = regNums[cmd.dst];
2707         }
2708 
2709         foreach ( state; states )
2710         {
2711             foreach ( ref cmd; state.finishers )
2712                 renumberCommand(cmd);
2713             // make sure pos-commands are executed after reorder-commands and
2714             // reorder-commands do not overwrite each other
2715             tango.core.Array.sort(state.finishers);
2716 
2717             foreach ( trans; state.transitions )
2718             {
2719                 foreach ( ref cmd; trans.commands )
2720                     renumberCommand(cmd);
2721                 tango.core.Array.sort(trans.commands);
2722                 trans.predicate.compile();
2723             }
2724         }
2725 
2726         debug(TangoRegex)
2727         {
2728             foreach ( ref v; registers )
2729             {
2730                 if ( (v in regNums) !is null )
2731                     v = regNums[v];
2732             }
2733         }
2734 
2735         minimizeDFA();
2736 
2737         foreach ( state; states )
2738             state.createLookup();
2739 
2740         // TODO: optimize memory layout of TDFA
2741 
2742         // TODO: add lookahead for string-end somewhere
2743         // TODO: mark dead-end states (not leaving a non-finishing susbet)
2744         // TODO: mark states that can leave the finishing subset of DFA states or use a greedy transition
2745         //       (execution may stop in that state)
2746     }
2747 
2748     /* ********************************************************************************************
2749         Print the TDFA (tabular representation of the delta function)
2750     **********************************************************************************************/
2751     debug(TangoRegex) void print()
2752     {
2753         Stdout.formatln("#tags = {}", num_tags);
2754 
2755         auto tis = new TagIndex[registers.length];
2756         foreach ( k, v; registers )
2757             tis [v] = k;
2758         foreach ( r, ti; tis ) {
2759             Stdout.formatln("tag({},{}) in reg {}", ti.tag, ti.index, r);
2760         }
2761         Stdout.formatln("Initializer:");
2762         foreach ( cmd; initializer ) {
2763             Stdout.formatln("{}", cmd.toString);
2764         }
2765         Stdout.formatln("Delta function:");
2766         foreach ( int i, s; states )
2767         {
2768             Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" ");
2769 
2770             bool first=true;
2771             Stdout(" {");
2772             foreach ( t; s.transitions )
2773             {
2774                 Stdout.format("{}{}->{} (", first?"":", ", t.predicate.toString, t.target.index);
2775                 bool firstcmd=true;
2776                 foreach ( cmd; t.commands )
2777                 {
2778                     if ( firstcmd )
2779                         firstcmd = false;
2780                     else
2781                         Stdout(",");
2782                     Stdout.format("{}", cmd.toString);
2783                 }
2784                 Stdout(")");
2785                 first = false;
2786             }
2787             Stdout("} (");
2788 
2789             bool firstcmd=true;
2790             foreach ( cmd; s.finishers )
2791             {
2792                 if ( firstcmd )
2793                     firstcmd = false;
2794                 else
2795                     Stdout(",");
2796                 Stdout.format("{}", cmd.toString);
2797             }
2798             Stdout.formatln(")");
2799         }
2800     }
2801 
2802 private:
2803     /* ********************************************************************************************
2804         A (TNFA state, tags) pair element of a subset state.
2805     **********************************************************************************************/
2806     class StateElement
2807     {
2808         TNFAState!(char_t)  nfa_state;
2809         int[uint]           tags;
2810         // use place-value priority with 2 places, value(maxPrio) > value(lastPrio)
2811         uint                maxPriority,
2812                             lastPriority;
2813 
2814         bool prioGreater(StateElement se)
2815         {
2816             if ( maxPriority < se.maxPriority )
2817                 return true;
2818             if ( maxPriority == se.maxPriority ) {
2819                 assert(lastPriority != se.lastPriority);
2820                 return lastPriority < se.lastPriority;
2821             }
2822             return false;
2823         }
2824 
2825         override int opCmp(Object o)
2826         {
2827             StateElement se = cast(StateElement)o;
2828             assert(se !is null);
2829             if ( maxPriority < se.maxPriority )
2830                 return 1;
2831             if ( maxPriority == se.maxPriority )
2832             {
2833                 if ( lastPriority == se.lastPriority )
2834                     return 0;
2835                 return lastPriority < se.lastPriority;
2836             }
2837             return -1;
2838         }
2839 
2840         override string toString()
2841         {
2842             const(char)[] str;
2843             str = Format.convert("{} p{}.{} {{", nfa_state.index, maxPriority, lastPriority);
2844             bool first = true;
2845             foreach ( k, v; tags ) {
2846                 str ~= Format.convert("{}m({},{})", first?"":",", k, v);
2847                 first = false;
2848             }
2849             str ~= "}";
2850             return str.idup;
2851         }
2852     }
2853 
2854     /* ********************************************************************************************
2855         Represents a state in the NFA to DFA conversion.
2856         Contains the set of states (StateElements) the NFA might be in at the same time and the
2857         corresponding DFA state that we create.
2858     **********************************************************************************************/
2859     class SubsetState
2860     {
2861         StateElement[]  elms;
2862         State           dfa_state;
2863 
2864         this(StateElement[] elms=null)
2865         {
2866             this.elms = elms;
2867         }
2868 
2869         int opApply(int delegate (ref TNFATransition!(char_t)) dg)
2870         {
2871             int res;
2872             foreach ( elm; elms )
2873             {
2874                 foreach ( t; elm.nfa_state.transitions )
2875                 {
2876                     res = dg(t);
2877                     if ( res )
2878                         return res;
2879                 }
2880             }
2881             return res;
2882         }
2883 
2884         override string toString()
2885         {
2886             string str = "[ ";
2887             bool first = true;
2888             foreach ( s; elms ) {
2889                 if ( !first )
2890                     str ~= ", ";
2891                 str ~= s.toString();
2892                 first = false;
2893             }
2894             return str~" ]";
2895         }
2896     }
2897 
2898     /* ********************************************************************************************
2899         Temporary structure used for disjoint predicate computation
2900     **********************************************************************************************/
2901     struct Mark
2902     {
2903         char_t  c;
2904         bool    end;    /// false = start of range
2905 
2906         int opCmp(const ref Mark m) const
2907         {
2908             if ( c < m.c )
2909                 return -1;
2910             if ( c > m.c )
2911                 return 1;
2912             if ( end < m.end )
2913                 return -1;
2914             if ( end > m.end )
2915                 return 1;
2916             return 0;
2917         }
2918     }
2919 
2920     /* ********************************************************************************************
2921         Calculates the register index for a given tag map entry. The TDFA implementation uses
2922         registers to save potential tag positions, the index space gets linearized here.
2923 
2924         Params:     tag =   tag number
2925                     index = tag map index
2926         Returns:    index of the register to use for the tag map entry
2927     **********************************************************************************************/
2928     uint registerFromTagIndex(uint tag, uint index)
2929     {
2930         if ( index > 0 )
2931         {
2932             TagIndex ti;
2933             ti.tag = tag;
2934             ti.index = index;
2935             uint* i = ti in registers;
2936             if ( i !is null )
2937                 return *i;
2938             return registers[ti] = next_register++;
2939         }
2940         else
2941             return tag-1;
2942     }
2943 
2944     Mark[] marks_;
2945 
2946     /* ********************************************************************************************
2947         Add new TDFA state to the automaton.
2948     **********************************************************************************************/
2949     State addState()
2950     {
2951         State s = new State;
2952         s.index = states.length;
2953         states ~= s;
2954         return s;
2955     }
2956 
2957     /* ********************************************************************************************
2958         Creates disjoint predicates from all outgoing, potentially overlapping TNFA transitions.
2959 
2960         Params:     state = SubsetState to create the predicates from
2961         Returns:    List of disjoint predicates that can be used for a DFA state
2962     **********************************************************************************************/
2963     predicate_t[] disjointPredicates(SubsetState state)
2964     {
2965         alias CharRange!(char_t) range_t;
2966         debug(tdfa) Stdout.formatln("disjointPredicates()");
2967 
2968         size_t num_marks;
2969         foreach ( t; state )
2970         {
2971             // partitioning will consider lookbehind transitions,
2972             // st. lb-closure will not expand for transitions with a superset of the lb-predicate
2973             if ( t.predicate.type != predicate_t.Type.epsilon )
2974             {
2975                 debug(tdfa) Stdout.formatln("{}", t.predicate.toString);
2976                 if ( marks_.length < num_marks+2*t.predicate.getInput().parts.length )
2977                     marks_.length = num_marks+2*t.predicate.getInput().parts.length;
2978                 foreach ( p; t.predicate.getInput().parts ) {
2979                     marks_[num_marks++] = Mark(p.l(), false);
2980                     marks_[num_marks++] = Mark(p.r(), true);
2981                 }
2982             }
2983         }
2984 
2985         if ( num_marks <= 1 )
2986             throw new Exception("disjointPredicates: No transitions in subset state");
2987 
2988         debug(tdfa) Stdout("\nsorting...").newline;
2989         // using built-in sort somtimes gives an AV in TypeInfo.swap
2990         quickSort(marks_[0 .. num_marks]);
2991         assert(!marks_[0].end);
2992 
2993         debug(tdfa)
2994         {
2995             Stdout("\nsorted marks:\n");
2996             bool first=true;
2997             foreach ( m; marks_[0 .. num_marks] )
2998             {
2999                 if ( first )
3000                     first = false;
3001                 else
3002                     Stdout(",");
3003                 if ( m.c > 0x20 && m.c < 0x7f )
3004                     Stdout.format("{}{}", m.end?"e":"s", m.c);
3005                 else
3006                     Stdout.format("{}{:x}", m.end?"e":"s", cast(int)m.c);
3007             }
3008             Stdout.newline;
3009         }
3010 
3011         size_t  next,
3012                 active = 1;
3013         char_t  start = marks_[0].c,
3014                 end;
3015         range_t[]   disjoint = new range_t[num_marks/2+1];
3016 
3017         foreach ( m; marks_[1 .. num_marks] )
3018         {
3019             if ( m.end )
3020             {
3021                 assert(active>0);
3022                 --active;
3023                 if ( m.c < start )
3024                     continue;
3025                 end = m.c;
3026                 // the next range cannot start at the same pos
3027                 // because starts are sorted before endings
3028                 if ( active > 0 )
3029                     ++m.c;
3030             }
3031             else
3032             {
3033                 ++active;
3034                 if ( active == 1 )
3035                 {
3036                     // skip uncovered interval
3037                     if ( m.c > start ) {
3038                         start = m.c;
3039                         continue;
3040                     }
3041                     end = m.c;
3042                     ++m.c;
3043                 }
3044                 // skip range start if cursor already marks it
3045                 else if ( m.c <= start )
3046                     continue;
3047                 else
3048                     end = m.c-1;
3049             }
3050 
3051             // save range
3052             if ( disjoint.length <= next )
3053                 disjoint.length = disjoint.length*2;
3054 
3055             disjoint[next].l_ = start;
3056             disjoint[next].r_ = end;
3057             ++next;
3058 
3059             // advance cursor
3060             start = m.c;
3061         }
3062         disjoint.length = next;
3063 
3064         // merge isolated ranges into sets of ranges
3065         // no range in a set may occur separated from the others in any predicate
3066         predicate_t[]   preds;
3067         preds.length = 1;
3068         Lmerge: foreach ( r; disjoint )
3069         {
3070             if ( preds[$-1].empty() )
3071                 preds[$-1].appendInput(r);
3072             else
3073             {
3074                 // we can merge r into the current predicate if
3075                 // pred contains r <=> pred contains all the other ranges
3076                 foreach ( t; state )
3077                 {
3078                     if ( t.predicate.type == predicate_t.Type.epsilon )
3079                         continue;
3080 
3081                     if ( t.predicate.getInput().contains(r)
3082                         != t.predicate.getInput().contains(preds[$-1].getInput()) )
3083                     {
3084                         preds.length = preds.length+1;
3085                         break;
3086                     }
3087                 }
3088                 preds[$-1].appendInput(r);
3089             }
3090         }
3091 
3092         debug(tdfa)
3093         {
3094             Stdout("\ndisjoint ranges:\n");
3095             first=true;
3096             foreach ( r; disjoint )
3097             {
3098                 if ( first )
3099                     first = false;
3100                 else
3101                     Stdout(",");
3102                 Stdout.format("{}", r);
3103             }
3104             Stdout.newline;
3105             Stdout("\ndisjoint predicates:\n");
3106             first=true;
3107             foreach ( ref p; preds )
3108             {
3109                 if ( first )
3110                     first = false;
3111                 else
3112                     Stdout(",");
3113                 Stdout.format("{}", p.toString);
3114             }
3115             Stdout.newline;
3116         }
3117 
3118         debug(tdfa) Stdout.formatln("disjointPredicates() end");
3119         return preds;
3120     }
3121 
3122     /* ********************************************************************************************
3123         Finds all TNFA states that can be reached directly with the given predicate and creates
3124         a new SubsetState containing those target states.
3125 
3126         Params:     subst = SubsetState to start from
3127                     pred =  predicate that is matched against outgoing transitions
3128         Returns:    SubsetState containing the reached target states
3129     **********************************************************************************************/
3130     SubsetState reach(SubsetState subst, ref predicate_t pred)
3131     {
3132         // to handle the special case of overlapping consume and lookahead predicates,
3133         // we find the different intersecting predicate types
3134         bool    have_consume,
3135                 have_lookahead;
3136         foreach ( t; subst )
3137         {
3138             if ( t.predicate.type != predicate_t.Type.consume && t.predicate.type != predicate_t.Type.lookahead )
3139                 continue;
3140             auto intpred = t.predicate.intersect(pred);
3141             if ( !intpred.empty() )
3142             {
3143                 if ( t.predicate.type == predicate_t.Type.consume )
3144                     have_consume = true;
3145                 else if ( t.predicate.type == predicate_t.Type.lookahead )
3146                     have_lookahead = true;
3147             }
3148         }
3149 
3150         // if there is consume/lookahead overlap,
3151         // lookahead predicates are handled first
3152         predicate_t.Type processed_type;
3153         if ( have_lookahead )
3154             processed_type = predicate_t.Type.lookahead;
3155         else if ( have_consume )
3156             processed_type = predicate_t.Type.consume;
3157         else {
3158             debug(tdfa) Stdout.formatln("\nERROR: reach found no consume/lookahead symbol for {} in \n{}", pred.toString, subst.toString);
3159             return null;
3160         }
3161         pred.type = processed_type;
3162 
3163         // add destination states to new subsetstate
3164         SubsetState r = new SubsetState;
3165         foreach ( s; subst.elms )
3166         {
3167             foreach ( t; s.nfa_state.transitions )
3168             {
3169                 if ( t.predicate.type != processed_type )
3170                     continue;
3171                 auto intpred = t.predicate.intersect(pred);
3172                 if ( !intpred.empty() ) {
3173                     StateElement se = new StateElement;
3174                     se.maxPriority = max(t.priority, s.maxPriority);
3175                     se.lastPriority = t.priority;
3176                     se.nfa_state = t.target;
3177                     se.tags = s.tags;
3178                     r.elms ~= se;
3179                 }
3180             }
3181         }
3182 
3183         // if we prioritized lookaheads, the states that may consume are also added to the new subset state
3184         // this behaviour is somewhat similar to an epsilon closure
3185         if ( have_lookahead && have_consume )
3186         {
3187             foreach ( s; subst.elms )
3188             {
3189                 foreach ( t; s.nfa_state.transitions )
3190                 {
3191                     if ( t.predicate.type != predicate_t.Type.consume )
3192                         continue;
3193                     auto intpred = t.predicate.intersect(pred);
3194                     if ( !intpred.empty() ) {
3195                         r.elms ~= s;
3196                         break;
3197                     }
3198                 }
3199             }
3200         }
3201         return r;
3202     }
3203 
3204     /* ********************************************************************************************
3205         Extends the given SubsetState with the states that are reached through lookbehind transitions.
3206 
3207         Params:     from =      SubsetState to create the lookbehind closure for
3208                     previous =  predicate "from" was reached with
3209         Returns:    SubsetState containing "from" and all states of it's lookbehind closure
3210     **********************************************************************************************/
3211     SubsetState lookbehindClosure(SubsetState from, predicate_t pred)
3212     {
3213         List!(StateElement) stack = new List!(StateElement);
3214         StateElement[size_t]  closure;
3215 
3216         foreach ( e; from.elms )
3217         {
3218             stack ~= e;
3219             closure[e.nfa_state.index] = e;
3220         }
3221 
3222         while ( !stack.empty() )
3223         {
3224             StateElement se = stack.tail.value;
3225             stack.pop();
3226             foreach ( t; se.nfa_state.transitions )
3227             {
3228                 if ( t.predicate.type != predicate_t.Type.lookbehind )
3229                     continue;
3230                 if ( t.predicate.intersect(pred).empty() )
3231                     continue;
3232                 uint new_maxPri = max(t.priority, se.maxPriority);
3233 
3234                 StateElement* tmp = t.target.index in closure;
3235                 if ( tmp !is null )
3236                 {
3237                     // if higher prio (smaller value) exists, do not use this transition
3238                     if ( tmp.maxPriority < new_maxPri ) {
3239 //                         debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}, continuing", t.target.index, tmp.maxPriority, new_maxPri);
3240                         continue;
3241                     }
3242                     else if ( tmp.maxPriority == new_maxPri )
3243                     {
3244                         // "equal lastPrio -> first-come-first-serve"
3245                         // doesn't work for lexer - how to solve it properly?
3246                         if ( tmp.lastPriority <= t.priority ) {
3247 //                             debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}, continuing", t.target.index, tmp.lastPriority, t.priority);
3248                             continue;
3249                         }
3250 //                         else
3251 //                             debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}", t.target.index, t.priority, tmp.lastPriority);
3252                     }
3253 //                     else
3254 //                         debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}", t.target.index, new_maxPri, tmp.maxPriority);
3255                 }
3256 
3257                 StateElement new_se = new StateElement;
3258                 new_se.maxPriority = max(t.priority, se.maxPriority);
3259                 new_se.lastPriority = t.priority;
3260                 new_se.nfa_state = t.target;
3261                 new_se.tags = se.tags;
3262 
3263                 closure[t.target.index] = new_se;
3264                 stack ~= new_se;
3265             }
3266         }
3267 
3268         SubsetState res = new SubsetState;
3269         res.elms = closure.values;
3270         return res;
3271     }
3272 
3273     /* ********************************************************************************************
3274         Generates the epsilon closure of the given subset state, creating tag map entries
3275         if tags are passed. Takes priorities into account, effectively realizing
3276         greediness and reluctancy.
3277 
3278         Params:     from =      SubsetState to create the epsilon closure for
3279                     previous =  SubsetState "from" was reached from
3280         Returns:    SubsetState containing "from" and all states of it's epsilon closure
3281     **********************************************************************************************/
3282     SubsetState epsilonClosure(SubsetState from, SubsetState previous)
3283     {
3284         int firstFreeIndex=-1;
3285         foreach ( e; previous.elms )
3286         {
3287             foreach ( ti; e.tags )
3288                 firstFreeIndex = max(firstFreeIndex, cast(int)ti);
3289         }
3290         ++firstFreeIndex;
3291 
3292         List!(StateElement) stack = new List!(StateElement);
3293         StateElement[size_t]  closure;
3294 
3295         foreach ( e; from.elms )
3296         {
3297             stack ~= e;
3298             closure[e.nfa_state.index] = e;
3299         }
3300 
3301         while ( !stack.empty() )
3302         {
3303             StateElement se = stack.tail.value;
3304             stack.pop();
3305             foreach ( t; se.nfa_state.transitions )
3306             {
3307                 if ( t.predicate.type != predicate_t.Type.epsilon )
3308                     continue;
3309                 // this is different from Ville Laurikari's algorithm, but it's crucial
3310                 // to take the max (instead of t.priority) to make reluctant operators work
3311                 uint new_maxPri = max(t.priority, se.maxPriority);
3312 
3313                 StateElement* tmp = t.target.index in closure;
3314                 if ( tmp !is null )
3315                 {
3316                     // if higher prio (smaller value) exists, do not use this transition
3317                     if ( tmp.maxPriority < new_maxPri ) {
3318 //                         debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}, continuing", t.target.index, tmp.maxPriority, new_maxPri);
3319                         continue;
3320                     }
3321                     else if ( tmp.maxPriority == new_maxPri )
3322                     {
3323                         // "equal lastPrio -> first-come-first-serve"
3324                         // doesn't work for lexer - how to solve it properly?
3325                         if ( tmp.lastPriority <= t.priority ) {
3326 //                             debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}, continuing", t.target.index, tmp.lastPriority, t.priority);
3327                             continue;
3328                         }
3329 //                         else
3330 //                             debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}", t.target.index, t.priority, tmp.lastPriority);
3331                     }
3332 //                     else
3333 //                         debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}", t.target.index, new_maxPri, tmp.maxPriority);
3334                 }
3335 
3336                 auto new_se = new StateElement;
3337                 new_se.maxPriority = new_maxPri;
3338                 new_se.lastPriority = t.priority;
3339                 new_se.nfa_state = t.target;
3340 
3341                 if ( t.tag > 0 )
3342                 {
3343                     foreach ( k, v; se.tags )
3344                         new_se.tags[k] = v;
3345                     new_se.tags[t.tag] = firstFreeIndex;
3346                 }
3347                 else
3348                     new_se.tags = se.tags;
3349 
3350                 closure[t.target.index] = new_se;
3351                 stack ~= new_se;
3352             }
3353         }
3354 
3355         SubsetState res = new SubsetState;
3356         res.elms = closure.values;
3357 
3358         // optimize tag usage
3359         // all we need to do is to check whether the largest tag-index from the
3360         // previous state is actually used in the new state and move all tags with
3361         // firstFreeIndex down by one if not, but only if firstFreeIndex is not 0
3362         if ( firstFreeIndex > 0 )
3363         {
3364             bool seenLastUsedIndex = false;
3365             sluiLoop: foreach ( e; res.elms )
3366             {
3367                 foreach ( i; e.tags )
3368                 {
3369                     if ( i == firstFreeIndex-1 ) {
3370                         seenLastUsedIndex = true;
3371                         break sluiLoop;
3372                     }
3373                 }
3374             }
3375             if ( !seenLastUsedIndex )
3376             {
3377                 foreach ( e; res.elms )
3378                 {
3379                     foreach ( ref i; e.tags )
3380                     {
3381                         // mark index by making it negative
3382                         // to signal that it can be decremented
3383                         // after it has been detected to be a newly used index
3384                         if ( i == firstFreeIndex )
3385                             i = -firstFreeIndex;
3386                     }
3387                 }
3388             }
3389         }
3390 
3391         return res;
3392     }
3393 
3394     /* ********************************************************************************************
3395         Tries to create commands that reorder the tag map of "previous", such that "from" becomes
3396         tag-wise identical to "to". If successful, these commands are added to "trans". This
3397         is done for state re-use.
3398 
3399         Params:     from =      SubsetState to check for tag-wise equality to "to"
3400                     to =        existing SubsetState that we want to re-use
3401                     previous =  SubsetState we're coming from
3402                     trans =     Transition we went along
3403         Returns:    true if "from" is tag-wise identical to "to" and the necessary commands have
3404                     been added to "trans"
3405     **********************************************************************************************/
3406     bool reorderTagIndeces(SubsetState from, SubsetState to, SubsetState previous, Transition trans)
3407     {
3408         if ( from.elms.length != to.elms.length )
3409             return false;
3410 
3411         bool[Command]
3412             cmds;
3413         uint[TagIndex]
3414             reorderedIndeces;
3415         StateElement[TagIndex]
3416             reordered_elements;
3417 
3418         Louter: foreach ( fe; from.elms )
3419         {
3420             foreach ( te; to.elms )
3421             {
3422                 if ( te.nfa_state.index != fe.nfa_state.index )
3423                     continue;
3424                 if ( fe.tags.length != te.tags.length )
3425                     return false;
3426                 foreach ( tag, findex; fe.tags )
3427                 {
3428                     if ( (tag in te.tags) is null )
3429                         return false;
3430 
3431                     TagIndex ti;
3432                     ti.tag = tag;
3433                     ti.index = te.tags[tag];
3434 
3435                     // apply priority for conflicting tag indeces
3436                     if ( (ti in reorderedIndeces) !is null )
3437                     {
3438                         auto rse = reordered_elements[ti];
3439                         auto ri = reorderedIndeces[ti];
3440                         if ( ri != findex
3441                             && ( rse.maxPriority < fe.maxPriority
3442                                 || rse.maxPriority == fe.maxPriority
3443                                 && rse.lastPriority <= fe.lastPriority )
3444                         )
3445                             continue;
3446                         Command cmd;
3447                         cmd.src = registerFromTagIndex(tag,ri);
3448                         cmd.dst = registerFromTagIndex(tag,te.tags[tag]);
3449                         cmds.remove(cmd);
3450                     }
3451                     // if target index differs, create reordering command
3452                     if ( te.tags[tag] != findex )
3453                     {
3454                         Command cmd;
3455                         cmd.src = registerFromTagIndex(tag,findex);
3456                         cmd.dst = registerFromTagIndex(tag,te.tags[tag]);
3457                         cmds[cmd] = true;
3458                     }
3459 
3460                     reorderedIndeces[ti] = findex;
3461                     reordered_elements[ti] = fe;
3462                 }
3463                 continue Louter;
3464             }
3465             return false;
3466         }
3467 
3468         debug(tdfa) {
3469             Stdout.formatln("\nreorder {} to {}\n", from.toString, to.dfa_state.index);
3470         }
3471 
3472         trans.commands ~= cmds.keys;
3473         return true;
3474     }
3475 
3476     /* ********************************************************************************************
3477         Generate tag map initialization commands for start state.
3478     **********************************************************************************************/
3479     void generateInitializers(SubsetState start)
3480     {
3481         uint[uint] cmds;
3482         foreach ( nds; start.elms )
3483         {
3484             foreach ( k, v; nds.tags )
3485                 cmds[k] = v;
3486         }
3487 
3488         foreach ( k, v; cmds ) {
3489             Command cmd;
3490             cmd.dst = registerFromTagIndex(k,v);
3491             cmd.src = CURRENT_POSITION_REGISTER;
3492             initializer ~= cmd;
3493         }
3494     }
3495 
3496     /* ********************************************************************************************
3497         Generates finisher commands for accepting states.
3498     **********************************************************************************************/
3499     void generateFinishers(SubsetState r)
3500     {
3501         // if at least one of the TNFA states accepts,
3502         // set the finishers from active tags in increasing priority
3503         StateElement[]  sorted_elms = r.elms.dup;
3504         tango.core.Array.sort(sorted_elms);
3505         bool reluctant = false;
3506         foreach ( se; sorted_elms ) {
3507             debug (Finishers) Stdout.formatln("Finisher: {}", se);
3508             if ( se.nfa_state.accept )
3509             {
3510                 r.dfa_state.accept = true;
3511 
3512                 // Knowing that we're looking at an epsilon closure with an accepting
3513                 // state, we look at the involved transitions - if the path from the
3514                 // nfa state in the set with the highest incoming priority (last in
3515                 // sorted_elms list) to the accepting nfa state is via the highest
3516                 // priority transitions, and they are all epsilon transitions, this
3517                 // suggests we're looking at a regex ending with a reluctant pattern.
3518                 // The NFA->DFA transformation will most likely extend the automata
3519                 // further, but we want the matching to stop here.
3520                 // NOTE: The grounds for choosing the last element in sorted_elms
3521                 // are somewhat weak (empirical testing), but sofar no new
3522                 // regressions have been discovered. larsivi 20090827
3523                 TNFATransition!(char_t) highestPriTrans;
3524                 if (!(sorted_elms[$-1] && sorted_elms[$-1].nfa_state &&
3525                             sorted_elms[$-1].nfa_state))
3526                     throw new Exception ("Something is NULL that is expected to
3527                             be non-null", __FILE__, __LINE__);
3528 
3529                 foreach ( trans; sorted_elms[$-1].nfa_state.transitions ) {
3530                     if (trans.canFinish()) {
3531                         r.dfa_state.reluctant = true;
3532                         break;
3533                     }
3534                 }
3535 
3536                 bool[uint]  finished_tags;
3537                 {
3538                     foreach ( t, i; se.tags )
3539                         if ( i > 0 && !(t in finished_tags) ) {
3540                             finished_tags[t] = true;
3541                             Command cmd;
3542                             cmd.dst = registerFromTagIndex(t, 0);
3543                             cmd.src = registerFromTagIndex(t, i);
3544                             r.dfa_state.finishers ~= cmd;
3545                         }
3546                 }
3547             }
3548         }
3549     }
3550 
3551     /* ********************************************************************************************
3552         Assumes that the command-lists are sorted and transitions are optimized
3553     **********************************************************************************************/
3554     void minimizeDFA()
3555     {
3556         class DiffTable
3557         {
3558             this(size_t num) {
3559                 diff_ = new bool[num*(num+1)/2];
3560             }
3561 
3562             ~this() { diff_.destroy; }
3563 
3564             bool opCall(size_t a, size_t b)
3565             {
3566                 if ( a < b )
3567                     return diff_[b*(b+1)/2+a];
3568                 return diff_[a*(a+1)/2+b];
3569             }
3570 
3571             void set(size_t a, size_t b)
3572             {
3573                 if ( a < b )
3574                     diff_[b*(b+1)/2+a] = true;
3575                 else
3576                     diff_[a*(a+1)/2+b] = true;
3577             }
3578 
3579             bool[]  diff_;
3580         }
3581 
3582         debug(tdfa) Stdout.formatln("Minimizing TDFA");
3583 
3584         scope diff = new DiffTable(states.length);
3585         bool new_diff = true;
3586 
3587         while ( new_diff )
3588         {
3589             new_diff = false;
3590             foreach ( i, a; states[0 .. $-1] )
3591             {
3592                 Linner: foreach ( j, b; states[i+1 .. $] )
3593                 {
3594                     if ( diff(i, j+i+1) )
3595                         continue;
3596 
3597                     // assume optimized transitions
3598                     if ( a.accept != b.accept || a.transitions.length != b.transitions.length ) {
3599                         diff.set(i, j+i+1);
3600                         new_diff = true;
3601                         continue;
3602                     }
3603 
3604                     if ( a.accept ) // b accepts too
3605                     {
3606                         // assume sorted finishers
3607                         if ( a.finishers.length != b.finishers.length ) {
3608                             diff.set(i, j+i+1);
3609                             new_diff = true;
3610                             continue;
3611                         }
3612                         foreach ( k, cmd; a.finishers )
3613                         {
3614                             if ( cmd != b.finishers[k] ) {
3615                                 diff.set(i, j+i+1);
3616                                 new_diff = true;
3617                                 continue Linner;
3618                             }
3619                         }
3620                     }
3621 
3622                     Ltrans: foreach ( ta; a.transitions )
3623                     {
3624                         foreach ( tb; b.transitions )
3625                         {
3626                             if ( ta.predicate.intersects(tb.predicate) )
3627                             {
3628                                 if ( diff(ta.target.index, tb.target.index) ) {
3629                                     diff.set(i, j+i+1);
3630                                     new_diff = true;
3631                                     continue Linner;
3632                                 }
3633                                 // assume sorted commands
3634                                 if ( ta.commands.length != tb.commands.length ) {
3635                                     diff.set(i, j+i+1);
3636                                     new_diff = true;
3637                                     continue Linner;
3638                                 }
3639                                 foreach ( k, cmd; ta.commands )
3640                                 {
3641                                     if ( cmd != tb.commands[k] ) {
3642                                         diff.set(i, j+i+1);
3643                                         new_diff = true;
3644                                         continue Linner;
3645                                     }
3646                                 }
3647                                 continue Ltrans;
3648                             }
3649                         }
3650 
3651                         diff.set(i, j+i+1);
3652                         new_diff = true;
3653                         continue Linner;
3654                     }
3655 
3656                 }
3657             }
3658         }
3659 
3660         foreach ( i, a; states[0 .. $-1] )
3661         {
3662             foreach ( j, b; states[i+1 .. $] )
3663             {
3664                 if ( !diff(i, j+i+1) )
3665                 {
3666                     debug(tdfa) Stdout.formatln("State {} == {}", i, j+i+1);
3667                     // remap b to a
3668                     foreach ( k, c; states )
3669                     {
3670                         foreach ( t; c.transitions )
3671                         {
3672                             if ( t.target.index == j+i+1 )
3673                                 t.target = a;
3674                         }
3675                     }
3676                 }
3677             }
3678         }
3679 
3680     }
3681 }
3682 import tango.text.Util;
3683 
3684 /**************************************************************************************************
3685     Regular expression compiler and interpreter.
3686 **************************************************************************************************/
3687 class RegExpT(char_t)
3688 {
3689     alias TDFA!(dchar)      tdfa_t;
3690     alias TNFA!(dchar)      tnfa_t;
3691     alias CharClass!(dchar) charclass_t;
3692     alias Predicate!(dchar) predicate_t;
3693 
3694     /**********************************************************************************************
3695         Construct a RegExpT object.
3696         Params:
3697             pattern = Regular expression.
3698         Throws: RegExpException if there are any compilation errors.
3699         Example:
3700             Declare two variables and assign to them a Regex object:
3701             ---
3702             auto r = new Regex("pattern");
3703             auto s = new Regex(r"p[1-5]\s*");
3704             ---
3705     **********************************************************************************************/
3706     this(const(char_t)[] pattern, const(char_t)[] attributes=null)
3707     {
3708         this(pattern, false, true);
3709     }
3710 
3711     /** ditto */
3712     this(const(char_t)[] pattern, bool swapMBS, bool unanchored, bool printNFA=false)
3713     {
3714         pattern_ = pattern;
3715 
3716         debug(TangoRegex) {}
3717         else {
3718             static if ( is(char_t == dchar) ) {
3719                 scope tnfa_t tnfa_ = new tnfa_t(pattern_);
3720             }
3721             else {
3722                 scope tnfa_t tnfa_ = new tnfa_t(tango.text.convert.Utf.toString32(pattern_));
3723             }
3724         }
3725        
3726         tnfa_.swapMatchingBracketSyntax = swapMBS;
3727         tnfa_.parse(unanchored);
3728         if ( printNFA ) {
3729             debug(TangoRegex) Stdout.formatln("\nTNFA:");
3730             debug(TangoRegex) tnfa_.print;
3731         }
3732         tdfa_ = new tdfa_t(tnfa_);
3733         registers_.length = tdfa_.num_regs();
3734     }
3735 
3736     /**********************************************************************************************
3737         Generate instance of Regex.
3738         Params:
3739             pattern = Regular expression.
3740         Throws: RegExpException if there are any compilation errors.
3741         Example:
3742             Declare two variables and assign to them a Regex object:
3743             ---
3744             auto r = Regex("pattern");
3745             auto s = Regex(r"p[1-5]\s*");
3746             ---
3747     **********************************************************************************************/
3748     static RegExpT!(char_t) opCall(const(char_t)[] pattern, const(char_t)[] attributes = null)
3749     {
3750         return new RegExpT!(char_t)(pattern, attributes);
3751     }
3752 
3753     /**********************************************************************************************
3754         Set up for start of foreach loop.
3755         Returns:    Instance of RegExpT set up to search input.
3756         Example:
3757             ---
3758             import tango.io.Stdout;
3759             import tango.text.Regex;
3760 
3761             void main()
3762             {
3763                 foreach(m; Regex("ab").search("qwerabcabcababqwer"))
3764                     Stdout.formatln("{}[{}]{}", m.pre, m.match(0), m.post);
3765             }
3766             // Prints:
3767             // qwer[ab]cabcababqwer
3768             // qwerabc[ab]cababqwer
3769             // qwerabcabc[ab]abqwer
3770             // qwerabcabcab[ab]qwer
3771             ---
3772     **********************************************************************************************/
3773     public RegExpT!(char_t) search(const(char_t)[] input)
3774     {
3775         input_ = input;
3776         next_start_ = 0;
3777         last_start_ = 0;
3778         return this;
3779     }
3780 
3781     /** ditto */
3782     public int opApply(int delegate(RegExpT!(char_t)) dg)
3783     {
3784         int result;
3785         while ( !result && test() )
3786             result = dg(this);
3787         return result;
3788     }
3789 
3790     /**********************************************************************************************
3791         Search input for match.
3792         Returns: false for no match, true for match
3793     **********************************************************************************************/
3794     bool test(const(char_t)[] input)
3795     {
3796         this.input_ = input;
3797         next_start_ = 0;
3798         last_start_ = 0;
3799         return test();
3800     }
3801 
3802     /**********************************************************************************************
3803         Pick up where last test(input) or test() left off, and search again.
3804         Returns: false for no match, true for match
3805     **********************************************************************************************/
3806     bool test()
3807     {
3808         // initialize registers
3809         assert(registers_.length == tdfa_.num_regs());
3810         registers_[0..$] = -1;
3811         foreach ( cmd; tdfa_.initializer ) {
3812             assert(cmd.src == tdfa_.CURRENT_POSITION_REGISTER);
3813             registers_[cmd.dst] = 0;
3814         }
3815 
3816         tdfa_t.Transition* tp, tp_end;
3817 
3818         // DFA execution
3819         auto inp = input_[next_start_ .. $];
3820         auto s = tdfa_.start;
3821 
3822         debug(TangoRegex) Stdout.formatln("{}{}: {}", s.accept?"*":" ", s.index, inp);
3823         LmainLoop: for ( size_t p, next_p; p < inp.length; )
3824         {
3825         Lread_char:
3826             dchar c = cast(dchar)inp[p];
3827             if ( c & 0x80 )
3828                 c = decode(inp, next_p);
3829             else
3830                 next_p = p+1;
3831 
3832         Lprocess_char:
3833             debug(TangoRegex) Stdout.formatln("{} (0x{:x})", c, cast(int)c);
3834 
3835             tdfa_t.Transition t = void;
3836             switch ( s.mode )
3837             {
3838                 case s.Mode.LOOKUP:
3839                     if ( c < s.LOOKUP_LENGTH )
3840                     {
3841                         debug(TangoRegex) Stdout.formatln("lookup");
3842                         auto i = s.lookup[c];
3843                         if ( i == s.INVALID_STATE )
3844                             break LmainLoop;
3845                         t = s.transitions[ i ];
3846                         if ( t.predicate.type != t.predicate.Type.consume )
3847                             goto Lno_consume;
3848                         goto Lconsume;
3849                     }
3850                     break LmainLoop;
3851 
3852                 case s.Mode.MIXED:
3853                     if ( c < s.LOOKUP_LENGTH )
3854                     {
3855                         debug(TangoRegex) Stdout.formatln("mixed");
3856                         auto i = s.lookup[c];
3857                         if ( i == s.INVALID_STATE )
3858                             break;
3859                         t = s.transitions[ i ];
3860                         if ( t.predicate.type != t.predicate.Type.consume )
3861                             goto Lno_consume;
3862                         goto Lconsume;
3863                     }
3864                     break;
3865 
3866                 case s.Mode.GENERIC:
3867                 default:
3868                     break;
3869             }
3870 
3871             Ltrans_loop: for ( tp = &s.generic_transitions[0], tp_end = tp+s.generic_transitions.length;
3872                 tp < tp_end; ++tp )
3873             {
3874                 t = *tp;
3875                 switch ( t.predicate.mode )
3876                 {
3877                     // single char
3878                     case predicate_t.MatchMode.single_char:
3879                         debug(TangoRegex) Stdout.formatln("single char 0x{:x} == 0x{:x}", cast(int)c, cast(int)t.predicate.data_chr);
3880                         if ( c != t.predicate.data_chr )
3881                             continue Ltrans_loop;
3882                         goto Lconsume;
3883                     case predicate_t.MatchMode.single_char_l:
3884                         debug(TangoRegex) Stdout.formatln("single char 0x{:x} == 0x{:x}", cast(int)c, cast(int)t.predicate.data_chr);
3885                         if ( c != t.predicate.data_chr )
3886                             continue Ltrans_loop;
3887                         goto Lno_consume;
3888 
3889                     // bitmap
3890                     case predicate_t.MatchMode.bitmap:
3891                         debug(TangoRegex) Stdout.formatln("bitmap {}\n{}", c, t.predicate.toString);
3892                         if ( c <= predicate_t.MAX_BITMAP_LENGTH && ( t.predicate.data_bmp[c/8] & (1 << (c&7)) ) )
3893                             goto Lconsume;
3894                         continue Ltrans_loop;
3895                     case predicate_t.MatchMode.bitmap_l:
3896                         debug(TangoRegex) Stdout.formatln("bitmap {}\n{}", c, t.predicate.toString);
3897                         if ( c <= predicate_t.MAX_BITMAP_LENGTH && ( t.predicate.data_bmp[c/8] & (1 << (c&7)) ) )
3898                             goto Lno_consume;
3899                         continue Ltrans_loop;
3900 
3901                     // string search
3902                     case predicate_t.MatchMode.string_search:
3903                         debug(TangoRegex) Stdout.formatln("string search {} in {}", c, t.predicate.data_str);
3904                         if ( indexOf(t.predicate.data_str.ptr, c, t.predicate.data_str.length) >= t.predicate.data_str.length )
3905                             continue Ltrans_loop;
3906                         goto Lconsume;
3907                     case predicate_t.MatchMode.string_search_l:
3908                         debug(TangoRegex) Stdout.formatln("string search {} in {}", c, t.predicate.data_str);
3909                         if ( indexOf(t.predicate.data_str.ptr, c, t.predicate.data_str.length) >= t.predicate.data_str.length )
3910                             continue Ltrans_loop;
3911                         goto Lno_consume;
3912 
3913                     // generic
3914                     case predicate_t.MatchMode.generic:
3915                         debug(TangoRegex) Stdout.formatln("generic {}\n{}", c, t.predicate.toString);
3916                         for ( auto cmp = t.predicate.data_str.ptr,
3917                             cmpend = cmp + t.predicate.data_str.length;
3918                             cmp < cmpend; ++cmp )
3919                         {
3920                             if ( c < *cmp ) {
3921                                 ++cmp;
3922                                 continue;
3923                             }
3924                             ++cmp;
3925                             if ( c <= *cmp )
3926                                 goto Lconsume;
3927                         }
3928                         continue Ltrans_loop;
3929                     case predicate_t.MatchMode.generic_l:
3930                         debug(TangoRegex) Stdout.formatln("generic {}\n{}", c, t.predicate.toString);
3931                         for ( auto cmp = t.predicate.data_str.ptr,
3932                             cmpend = cmp + t.predicate.data_str.length;
3933                             cmp < cmpend; ++cmp )
3934                         {
3935                             if ( c < *cmp ) {
3936                                 ++cmp;
3937                                 continue;
3938                             }
3939                             ++cmp;
3940                             if ( c <= *cmp )
3941                                 goto Lno_consume;
3942                         }
3943                         continue Ltrans_loop;
3944 
3945                     default:
3946                         assert(0);
3947                 }
3948 
3949             Lconsume:
3950                 p = next_p;
3951             Lno_consume:
3952 
3953                 s = t.target;
3954                 debug(TangoRegex) Stdout.formatln("{}{}: {}", s.accept?"*":" ", s.index, inp[p..$]);
3955                 debug(TangoRegex) Stdout.formatln("{} commands", t.commands.length);
3956 
3957                 foreach ( cmd; t.commands )
3958                 {
3959                     if ( cmd.src == tdfa_.CURRENT_POSITION_REGISTER )
3960                         registers_[cmd.dst] = cast(int)p;
3961                     else
3962                         registers_[cmd.dst] = registers_[cmd.src];
3963                 }
3964 
3965                 if (s.accept && s.reluctant)
3966                     // Don't continue matching, the current find should be correct
3967                     goto Laccept;
3968 
3969                 // if all input was consumed and we do not already accept, try to
3970                 // add an explicit string/line end
3971                 if ( p >= inp.length )
3972                 {
3973                     if ( s.accept || c == 0 )
3974                         break;
3975                     c = 0;
3976                     goto Lprocess_char;
3977                 }
3978                 goto Lread_char;
3979             }
3980             // no applicable transition
3981             break;
3982         }
3983 
3984         if ( s.accept )
3985         {
3986         Laccept:
3987             foreach ( cmd; s.finishers ) {
3988                 assert(cmd.src != tdfa_.CURRENT_POSITION_REGISTER);
3989                 registers_[cmd.dst] = registers_[cmd.src];
3990             }
3991             if ( registers_.length > 1 && registers_[1] >= 0 ) {
3992                 last_start_ = next_start_;
3993                 next_start_ += registers_[1];
3994             }
3995             return true;
3996         }
3997 
3998         return false;
3999     }
4000 
4001     /**********************************************************************************************
4002         Return submatch with the given index.
4003         Params:
4004             index = 0 returns whole match, index > 0 returns submatch of bracket #index
4005         Returns:
4006             Slice of input for the requested submatch, or null if no such submatch exists.
4007     **********************************************************************************************/
4008     const(char_t)[] match(uint index)
4009     {
4010         if ( index > tdfa_.num_tags )
4011             return null;
4012         int start   = cast(int)last_start_+registers_[index*2],
4013             end     = cast(int)last_start_+registers_[index*2+1];
4014         if ( start >= 0 && start < end && end <= input_.length )
4015             return input_[start .. end];
4016         return null;
4017     }
4018 
4019     /** ditto */
4020     const(char_t)[] opIndex(uint index)
4021     {
4022         return match(index);
4023     }
4024 
4025     /**********************************************************************************************
4026         Return the slice of the input that precedes the matched substring.
4027         If no match was found, null is returned.
4028     **********************************************************************************************/
4029     const(char_t)[] pre()
4030     {
4031         auto start = registers_[0];
4032         if ( start < 0 )
4033             return null;
4034         return input_[0 .. last_start_+start];
4035     }
4036 
4037     /**********************************************************************************************
4038         Return the slice of the input that follows the matched substring.
4039         If no match was found, the whole slice of the input that was processed in the last test.
4040     **********************************************************************************************/
4041     const(char_t)[] post()
4042     {
4043         if ( registers_[1] >= 0 )
4044             return input_[next_start_ .. $];
4045         return input_[last_start_ .. $];
4046     }
4047 
4048     /**********************************************************************************************
4049         Splits the input at the matches of this regular expression into an array of slices.
4050         Example:
4051             ---
4052             import tango.io.Stdout;
4053             import tango.text.Regex;
4054 
4055             void main()
4056             {
4057                 auto strs = Regex("ab").split("abcabcababqwer");
4058                 foreach( s; strs )
4059                     Stdout.formatln("{}", s);
4060             }
4061             // Prints:
4062             // c
4063             // c
4064             // qwer
4065             ---
4066     **********************************************************************************************/
4067     inout(char_t)[][] split(inout(char_t)[] input)
4068     {
4069         auto res = new inout(char_t)[][PREALLOC];
4070         uint index;
4071         inout(char_t)[] tmp = input;
4072 
4073         foreach ( r; search(input) )
4074         {
4075             tmp = cast(inout(char_t)[])pre();
4076             res[index++] = tmp[last_start_ .. $];
4077             if ( index >= res.length )
4078                 res.length = res.length*2;
4079             tmp = cast(inout(char_t)[])post();
4080         }
4081 
4082         res[index++] = tmp;
4083         res.length = index;
4084         return res;
4085     }
4086 
4087     /**********************************************************************************************
4088         Returns a copy of the input with all matches replaced by replacement.
4089     **********************************************************************************************/
4090     char_t[] replaceAll(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null)
4091     {
4092         const(char_t)[] tmp = input;
4093         if ( output_buffer.length <= 0 )
4094             output_buffer = new char_t[input.length+replacement.length];
4095         output_buffer.length = 0;
4096 
4097         foreach ( r; search(input) )
4098         {
4099             tmp = pre();
4100             if ( tmp.length > last_start_ )
4101                 output_buffer ~= tmp[last_start_ .. $];
4102             output_buffer ~= replacement;
4103             tmp = post();
4104         }
4105         output_buffer ~= tmp;
4106         return output_buffer;
4107     }
4108 
4109     /**********************************************************************************************
4110         Returns a copy of the input with the last match replaced by replacement.
4111     **********************************************************************************************/
4112     char_t[] replaceLast(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null)
4113     {
4114         const(char_t)[] tmp_pre, tmp_post;
4115         if ( output_buffer.length <= 0 )
4116             output_buffer = new char_t[input.length+replacement.length];
4117         output_buffer.length = 0;
4118 
4119         foreach ( r; search(input) ) {
4120             tmp_pre = pre();
4121             tmp_post = post();
4122         }
4123 
4124         if ( tmp_pre !is null || tmp_post !is null ) {
4125             output_buffer ~= tmp_pre;
4126             output_buffer ~= replacement;
4127             output_buffer ~= tmp_post;
4128         }
4129         else
4130             output_buffer ~= input;
4131 
4132         return output_buffer;
4133     }
4134 
4135     /**********************************************************************************************
4136         Returns a copy of the input with the first match replaced by replacement.
4137     **********************************************************************************************/
4138     char_t[] replaceFirst(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null)
4139     {
4140         const(char_t)[] tmp = input;
4141         if ( output_buffer.length <= 0 )
4142             output_buffer = new char_t[input.length+replacement.length];
4143         output_buffer.length = 0;
4144 
4145         if ( test(input) )
4146         {
4147             tmp = pre();
4148             if ( tmp.length > last_start_ )
4149                 output_buffer ~= tmp[last_start_ .. $];
4150             output_buffer ~= replacement;
4151             tmp = post();
4152         }
4153         output_buffer ~= tmp;
4154         return output_buffer;
4155     }
4156 
4157     /**********************************************************************************************
4158         Calls dg for each match and replaces it with dg's return value.
4159     **********************************************************************************************/
4160     char_t[] replaceAll(const(char_t)[] input, char_t[] delegate(RegExpT!(char_t)) dg, char_t[] output_buffer=null)
4161     {
4162         const(char_t)[]    tmp = input;
4163         if ( output_buffer.length <= 0 )
4164             output_buffer = new char_t[input.length];
4165         output_buffer.length = 0;
4166 
4167         foreach ( r; search(input) )
4168         {
4169             tmp = pre();
4170             if ( tmp.length > last_start_ )
4171                 output_buffer ~= tmp[last_start_ .. $];
4172             output_buffer ~= dg(this);
4173             tmp = post();
4174         }
4175         output_buffer ~= tmp;
4176         return output_buffer;
4177     }
4178 
4179     /**********************************************************************************************
4180         Compiles the regular expression to D code.
4181 
4182         NOTE : Remember to import this module (tango.text.Regex) in the module where you put the
4183         generated D code.
4184 
4185     **********************************************************************************************/
4186     // TODO: input-end special case
4187     const(char)[] compileToD(const(char)[] func_name = "match", bool lexer=false)
4188     {
4189         const(char)[] code;
4190         const(char)[] str_type;
4191         static if ( is(char_t == char) )
4192             str_type = "const(char)[]";
4193         static if ( is(char_t == wchar) )
4194             str_type = "const(wchar)[]";
4195         static if ( is(char_t == dchar) )
4196             str_type = "const(dchar)[]";
4197 
4198         if ( lexer )
4199             code = Format.convert("// {}\nbool {}({} input, out uint token, out {} match", pattern_, func_name, str_type, str_type);
4200         else {
4201             code = Format.convert("// {}\nbool match({} input", pattern_, str_type);
4202             code ~= Format.convert(", ref {}[] groups", str_type);
4203         }
4204         code ~= Format.convert(")\n{{\n    uint s = {};", tdfa_.start.index);
4205 
4206         uint num_vars = tdfa_.num_regs();
4207         if ( num_vars > 0 )
4208         {
4209             if ( lexer )
4210                 code ~= "\n    static int ";
4211             else
4212                 code ~= "\n    int ";
4213             bool first = true;
4214             for ( int i = 0, used = 0; i < num_vars; ++i )
4215             {
4216                 bool hasInit = false;
4217                 foreach ( cmd; tdfa_.initializer )
4218                 {
4219                     if ( cmd.dst == i ) {
4220                         hasInit = true;
4221                         break;
4222                     }
4223                 }
4224 
4225                 if ( first )
4226                     first = false;
4227                 else
4228                     code ~= ", ";
4229                 if ( used > 0 && used % 10 == 0 )
4230                     code ~= "\n        ";
4231                 ++used;
4232                 code ~= Format.convert("r{}", i);
4233 
4234                 if ( hasInit )
4235                     code ~= "=0";
4236                 else
4237                     code ~= "=-1";
4238             }
4239             code ~= ";";
4240         }
4241 
4242         code ~= "\n\n    for ( size_t p, next_p; p < input.length; )\n    {";
4243         code ~= "\n        dchar c = cast(dchar)input[p];\n        if ( c & 0x80 )\n            decode(input, next_p);";
4244         code ~= "\n        else\n            next_p = p+1;\n        switch ( s )\n        {";
4245 
4246         size_t[] finish_states;
4247         foreach ( s; tdfa_.states )
4248         {
4249             code ~= Format.convert("\n            case {}:", s.index);
4250 
4251             if ( s.accept )
4252             {
4253                 finish_states ~= s.index;
4254 
4255                 tdfa_t.State target;
4256                 foreach ( t; s.transitions )
4257                 {
4258                     if ( target is null )
4259                         target = t.target;
4260                     else if ( target !is t.target )
4261                     {
4262                         target = null;
4263                         break;
4264                     }
4265                 }
4266                 if ( target !is null && target is s )
4267                     s.transitions = null;
4268             }
4269 
4270             bool first_if=true;
4271             charclass_t cc, ccTest;
4272 
4273             tango.core.Array.sort(s.transitions);
4274             foreach ( t; s.transitions )
4275             {
4276                 ccTest.add(t.predicate.getInput());
4277                 ccTest.optimize();
4278                 if ( t.predicate.getInput() < ccTest )
4279                     cc = t.predicate.getInput();
4280                 else
4281                     cc = ccTest;
4282 
4283                 if ( first_if ) {
4284                     code ~= "\n                if ( ";
4285                     first_if = false;
4286                 }
4287                 else
4288                     code ~= "\n                else if ( ";
4289                 bool first_cond=true;
4290                 foreach ( cr; cc.parts )
4291                 {
4292                     if ( first_cond )
4293                         first_cond = false;
4294                     else
4295                         code ~= " || ";
4296                     if ( cr.l() == cr.r() )
4297                         code ~= Format.convert("c == 0x{:x}", cast(int)cr.l());
4298                     else
4299                         code ~= Format.convert("c >= 0x{:x} && c <= 0x{:x}", cast(int)cr.l(), cast(int)cr.r());
4300                 }
4301                 code ~= Format.convert(" ) {{\n                    s = {};", t.target.index);
4302 
4303                 if ( t.predicate.type == typeof(t.predicate.type).consume )
4304                     code ~= "\n                    p = next_p;";
4305                 foreach ( cmd; t.commands )
4306                     code ~= compileCommand(cmd, "                    ");
4307 /*
4308                 // if inp ends here and we do not already accept, try to add an explicit string/line end
4309                 if ( p >= inp.length && !s.accept && c != 0 ) {
4310                     c = 0;
4311                     goto Lprocess_char;
4312                 }
4313 */
4314                 code ~= "\n                }";
4315             }
4316 
4317             if ( !first_if )
4318                 code ~= Format.convert(
4319                     "\n                else\n                    {};\n                break;",
4320                     s.accept?Format.convert("goto finish{}", s.index):"return false"
4321                 );
4322             else
4323                 code ~= Format.convert("\n                {};", s.accept?Format.convert("goto finish{}", s.index):"return false");
4324         }
4325 
4326         // create finisher groups
4327         size_t[][size_t] finisherGroup;
4328         foreach ( fs; finish_states )
4329         {
4330             // check if finisher group with same commands exists
4331             bool haveFinisher = false;
4332             foreach ( fg; finisherGroup.keys )
4333             {
4334                 bool equalCommands = false;
4335                 if ( tdfa_.states[fs].finishers.length == tdfa_.states[fg].finishers.length )
4336                 {
4337                     equalCommands = true;
4338                     foreach ( i, cmd; tdfa_.states[fs].finishers )
4339                     {
4340                         if ( cmd != tdfa_.states[fg].finishers[i] ) {
4341                             equalCommands = false;
4342                             break;
4343                         }
4344                     }
4345                 }
4346                 if ( equalCommands ) {
4347                     // use existing group for this state
4348                     finisherGroup[fg] ~= fs;
4349                     haveFinisher = true;
4350                     break;
4351                 }
4352             }
4353             // create new group
4354             if ( !haveFinisher )
4355                 finisherGroup[fs] ~= fs;
4356         }
4357 
4358 
4359         code ~= "\n            default:\n                assert(0);\n        }\n    }\n\n    switch ( s )\n    {";
4360         foreach ( group, states; finisherGroup )
4361         {
4362             foreach ( s; states )
4363                 code ~= Format.convert("\n        case {}: finish{}:", s, s);
4364 
4365             foreach ( cmd; tdfa_.states[group].finishers )
4366             {
4367                 if ( lexer )
4368                 {
4369                     if ( tdfa_.states[group].finishers.length > 1 )
4370                         throw new RegExpException("Lexer error: more than one finisher in flm lexer!");
4371                     if ( cmd.dst % 2 == 0 || cmd.dst >= tdfa_.num_tags )
4372                         throw new RegExpException(Format.convert("Lexer error: unexpected dst register {} in flm lexer!", cmd.dst).idup);
4373                     code ~= Format.convert("\n            match = input[0 .. r{}];\n            token = {};", cmd.src, cmd.dst/2);
4374                 }
4375                 else
4376                     code ~= compileCommand(cmd, "            ");
4377             }
4378 
4379             code ~= "\n            break;";
4380         }
4381         code ~= "\n        default:\n            return false;\n    }\n";
4382 
4383         if ( !lexer )
4384         {
4385             code ~= Format.convert("\n    groups.length = {};", tdfa_.num_tags/2);
4386             for ( int i = 0; i < tdfa_.num_tags/2; ++i )
4387                 code ~= Format.convert("\n    if ( r{} > -1 && r{} > -1 )\n        groups[{}] = input[r{} .. r{}];", 2*i, 2*i+1, i, 2*i, 2*i+1);
4388         }
4389 
4390         code ~= "\n    return true;\n}";
4391         return code;
4392     }
4393 
4394     /*********************************************************************************************
4395         Get the pattern with which this regex was constructed.
4396     **********************************************************************************************/
4397     public const(char_t)[] pattern()
4398     {
4399         return pattern_;
4400     }
4401 
4402     /*********************************************************************************************
4403         Get the tag count of this regex, representing the number of sub-matches.
4404 
4405         This value is the max valid value for match/opIndex.
4406     **********************************************************************************************/
4407     uint tagCount()
4408     {
4409         return tdfa_.num_tags;
4410     }
4411 
4412     int[]       registers_;
4413     size_t      next_start_,
4414                 last_start_;
4415 
4416     debug(TangoRegex) tnfa_t tnfa_;
4417     tdfa_t      tdfa_;
4418 private:
4419     enum int           PREALLOC = 16;
4420     const(char_t)[]    input_,
4421                        pattern_;
4422 
4423     const(char)[] compileCommand(tdfa_t.Command cmd, const(char)[] indent)
4424     {
4425         const(char)[]  code,
4426                 dst;
4427         code ~= Format.convert("\n{}r{} = ", indent, cmd.dst);
4428         if ( cmd.src == tdfa_.CURRENT_POSITION_REGISTER )
4429             code ~= "p;";
4430         else
4431             code ~= Format.convert("r{};", cmd.src);
4432         return code;
4433     }
4434 }
4435 
4436 alias RegExpT!(char) Regex;
4437 
4438 debug(utf) import tango.stdc.stdio;
4439 // the following block is stolen from phobos.
4440 // the copyright notice applies for this block only.
4441 /*
4442  *  Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
4443  *  Written by Walter Bright
4444  *
4445  *  This software is provided 'as-is', without any express or implied
4446  *  warranty. In no event will the authors be held liable for any damages
4447  *  arising from the use of this software.
4448  *
4449  *  Permission is granted to anyone to use this software for any purpose,
4450  *  including commercial applications, and to alter it and redistribute it
4451  *  freely, subject to the following restrictions:
4452  *
4453  *  o  The origin of this software must not be misrepresented; you must not
4454  *     claim that you wrote the original software. If you use this software
4455  *     in a product, an acknowledgment in the product documentation would be
4456  *     appreciated but is not required.
4457  *  o  Altered source versions must be plainly marked as such, and must not
4458  *     be misrepresented as being the original software.
4459  *  o  This notice may not be removed or altered from any source
4460  *     distribution.
4461  */
4462 
4463 class UtfException : Exception
4464 {
4465     size_t idx; /// index in string of where error occurred
4466 
4467     this(immutable(char)[] s, size_t i)
4468     {
4469         idx = i;
4470         super(s);
4471     }
4472 }
4473 
4474 bool isValidDchar(dchar c)
4475 {
4476     /* Note: FFFE and FFFF are specifically permitted by the
4477      * Unicode standard for application internal use, but are not
4478      * allowed for interchange.
4479      * (thanks to Arcane Jill)
4480      */
4481 
4482     return c < 0xD800 ||
4483         (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
4484 }
4485 
4486 /* *************
4487  * Decodes and returns character starting at s[idx]. idx is advanced past the
4488  * decoded character. If the character is not well formed, a UtfException is
4489  * thrown and idx remains unchanged.
4490  */
4491 
4492 dchar decode(const(char)[] s, ref size_t idx)
4493     {
4494         size_t len = s.length;
4495         dchar V;
4496         size_t i = idx;
4497         char u = s[i];
4498 
4499         if (u & 0x80)
4500         {   uint n;
4501             char u2;
4502 
4503             /* The following encodings are valid, except for the 5 and 6 byte
4504              * combinations:
4505              *  0xxxxxxx
4506              *  110xxxxx 10xxxxxx
4507              *  1110xxxx 10xxxxxx 10xxxxxx
4508              *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
4509              *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
4510              *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
4511              */
4512             for (n = 1; ; n++)
4513             {
4514                 if (n > 4)
4515                     goto Lerr;          // only do the first 4 of 6 encodings
4516                 if (((u << n) & 0x80) == 0)
4517                 {
4518                     if (n == 1)
4519                         goto Lerr;
4520                     break;
4521                 }
4522             }
4523 
4524             // Pick off (7 - n) significant bits of B from first byte of octet
4525             V = cast(dchar)(u & ((1 << (7 - n)) - 1));
4526 
4527             if (i + (n - 1) >= len)
4528                 goto Lerr;                      // off end of string
4529 
4530             /* The following combinations are overlong, and illegal:
4531              *  1100000x (10xxxxxx)
4532              *  11100000 100xxxxx (10xxxxxx)
4533              *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
4534              *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
4535              *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
4536              */
4537             u2 = s[i + 1];
4538             if ((u & 0xFE) == 0xC0 ||
4539                 (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
4540                 (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
4541                 (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
4542                 (u == 0xFC && (u2 & 0xFC) == 0x80))
4543                 goto Lerr;                      // overlong combination
4544 
4545             for (uint j = 1; j != n; j++)
4546             {
4547                 u = s[i + j];
4548                 if ((u & 0xC0) != 0x80)
4549                     goto Lerr;                  // trailing bytes are 10xxxxxx
4550                 V = (V << 6) | (u & 0x3F);
4551             }
4552             if (!isValidDchar(V))
4553                 goto Lerr;
4554             i += n;
4555         }
4556         else
4557         {
4558             V = cast(dchar) u;
4559             i++;
4560         }
4561 
4562         idx = i;
4563         return V;
4564 
4565       Lerr:
4566         throw new Exception("invalid UTF-8 sequence");
4567     }
4568 
4569 /*  ditto */
4570 
4571 dchar decode(const(wchar)[] s, ref size_t idx)
4572     in
4573     {
4574         assert(idx >= 0 && idx < s.length);
4575     }
4576     out (result)
4577     {
4578         assert(isValidDchar(result));
4579     }
4580     body
4581     {
4582         const(char)[] msg;
4583         dchar V;
4584         size_t i = idx;
4585         uint u = s[i];
4586 
4587         if (u & ~0x7F)
4588         {   if (u >= 0xD800 && u <= 0xDBFF)
4589             {   uint u2;
4590 
4591                 if (i + 1 == s.length)
4592                 {   msg = "surrogate UTF-16 high value past end of string";
4593                     goto Lerr;
4594                 }
4595                 u2 = s[i + 1];
4596                 if (u2 < 0xDC00 || u2 > 0xDFFF)
4597                 {   msg = "surrogate UTF-16 low value out of range";
4598                     goto Lerr;
4599                 }
4600                 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
4601                 i += 2;
4602             }
4603             else if (u >= 0xDC00 && u <= 0xDFFF)
4604             {   msg = "unpaired surrogate UTF-16 value";
4605                 goto Lerr;
4606             }
4607             else if (u == 0xFFFE || u == 0xFFFF)
4608             {   msg = "illegal UTF-16 value";
4609                 goto Lerr;
4610             }
4611             else
4612                 i++;
4613         }
4614         else
4615         {
4616             i++;
4617         }
4618 
4619         idx = i;
4620         return cast(dchar)u;
4621 
4622       Lerr:
4623         throw new UtfException(msg.idup, i);
4624     }
4625 
4626 /*  ditto */
4627 
4628 dchar decode(const(dchar)[] s, ref size_t idx)
4629     in
4630     {
4631         assert(idx >= 0 && idx < s.length);
4632     }
4633     body
4634     {
4635         size_t i = idx;
4636         dchar c = s[i];
4637 
4638         if (!isValidDchar(c))
4639             goto Lerr;
4640         idx = i + 1;
4641         return c;
4642 
4643       Lerr:
4644         throw new UtfException("5invalid UTF-32 value", i);
4645     }
4646 
4647 
4648 
4649 /* =================== Encode ======================= */
4650 
4651 /* *****************************
4652  * Encodes character c and appends it to array s[].
4653  */
4654 
4655 void encode(ref const(char)[] s, dchar c)
4656     in
4657     {
4658         assert(isValidDchar(c));
4659     }
4660     body
4661     {
4662         const(char)[] r = s;
4663 
4664         if (c <= 0x7F)
4665         {
4666             r ~= cast(char) c;
4667         }
4668         else
4669         {
4670             char[4] buf;
4671             uint L;
4672 
4673             if (c <= 0x7FF)
4674             {
4675                 buf[0] = cast(char)(0xC0 | (c >> 6));
4676                 buf[1] = cast(char)(0x80 | (c & 0x3F));
4677                 L = 2;
4678             }
4679             else if (c <= 0xFFFF)
4680             {
4681                 buf[0] = cast(char)(0xE0 | (c >> 12));
4682                 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
4683                 buf[2] = cast(char)(0x80 | (c & 0x3F));
4684                 L = 3;
4685             }
4686             else if (c <= 0x10FFFF)
4687             {
4688                 buf[0] = cast(char)(0xF0 | (c >> 18));
4689                 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
4690                 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
4691                 buf[3] = cast(char)(0x80 | (c & 0x3F));
4692                 L = 4;
4693             }
4694             else
4695             {
4696                 assert(0);
4697             }
4698             r ~= buf[0 .. L];
4699         }
4700         s = r;
4701     }
4702 
4703 /*  ditto */
4704 
4705 void encode(ref const(wchar)[] s, dchar c)
4706     in
4707     {
4708         assert(isValidDchar(c));
4709     }
4710     body
4711     {
4712         const(wchar)[] r = s;
4713 
4714         if (c <= 0xFFFF)
4715         {
4716             r ~= cast(wchar) c;
4717         }
4718         else
4719         {
4720             wchar[2] buf;
4721 
4722             buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
4723             buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
4724             r ~= buf;
4725         }
4726         s = r;
4727     }
4728 
4729 /*  ditto */
4730 
4731 void encode(ref const(dchar)[] s, dchar c)
4732     in
4733     {
4734         assert(isValidDchar(c));
4735     }
4736     body
4737     {
4738         s ~= c;
4739     }
4740 
4741 debug(UnitTest)
4742 {
4743     unittest
4744     {
4745         // match a fixed string
4746         Regex r;
4747         r = new Regex("abc");
4748         assert(r.test("abc123"));
4749         assert(r.test("feabc123"));
4750         assert(!r.test("abe123"));
4751 
4752         // match a non-ASCII string
4753         r = new Regex("☃");
4754         assert(r.test("a☃c123"));
4755 
4756         // capture
4757         r = new Regex("☃(c)");
4758         assert(r.test("a☃c123"));
4759         assert(r.match(1) == "c");
4760 
4761         // dot
4762         r = new Regex("..");
4763         assert(r.test("a☃c123"));
4764 
4765         // dot capture
4766         r = new Regex("(..)");
4767         assert(r.test("a☃c123"));
4768         assert(r.match(1) == "a☃");
4769 
4770 
4771 
4772         // two captures
4773         r = new Regex("(.)(.)");
4774         assert(r.test("a☃c123"));
4775         assert(r.match(1) == "a");
4776         assert(r.match(2) == "☃");
4777 
4778         // multiple letters
4779         r = new Regex(".*(e+).*");
4780         assert(r.test("aeeeeec123"));
4781         assert(r.match(1) == "eeeee", "Expected: eeeee Got: " ~ r.match(1));
4782         assert(!r.test("abfua"));
4783 
4784         // multiple snowmen
4785         r = new Regex(".*(☃+).*");
4786         assert(r.test("a☃☃☃☃☃c123"));
4787         assert(r.match(1) == "☃☃☃☃☃", "Expected: ☃☃☃☃☃ Got: " ~ r.match(1));
4788         assert(!r.test("abeua"));
4789 
4790         /**
4791            "*" quantifier bug. In "(☃+)" pattern test ((r.match(0) == "☃☃☃☃☃")&&(r.match(1) == "☃☃☃☃☃")) has passed.
4792         */
4793         // multiple snowmen
4794         r = new Regex("(☃+)");
4795         assert(r.test("a☃☃☃☃☃c123"));
4796         assert(r.match(0) == "☃☃☃☃☃");
4797         assert(r.match(1) == "☃☃☃☃☃");
4798         assert(r.test("a☃beua"));
4799     }
4800 
4801     debug(RegexTestOnly)
4802     {
4803         import tango.io.Stdout;
4804         void main() { Stdout("All tests pass").newline(); }
4805     }
4806 }