tango.text.Regex source code

1 /*******************************************************************************
2 
3     copyright:      Copyright (c) 2007-2008 Jascha Wetzel. All rights reserved.
4 
5     license:        BSD style: $(LICENSE)
6 
7     version:        Initial release: Jan 2008
8 
9     authors:        Jascha Wetzel
10 
11     This is a regular expression compiler and interpreter based on the Tagged NFA/DFA method.
12 
13     The Regex class is not thread safe
14 
15     See <a href="http://en.wikipedia.org/wiki/Regular_expression">Wikpedia's article on regular expressions</a>
16     for details on regular expressions in general.
17 
18     The used method implies, that the expressions are <i>regular</i>, in the way language theory defines it,
19     as opposed to what &quot;regular expression&quot; means in most implementations
20     (e.g. PCRE or those from the standard libraries of Perl, Java or Python).
21     The advantage of this method is it's performance, it's disadvantage is the inability to realize some features
22     that Perl-like regular expressions have (e.g. back-references).
23     See <a href="http://swtch.com/~rsc/regexp/regexp1.html">&quot;Regular Expression Matching Can Be Simple And Fast&quot;</a>
24     for details on the differences.
25 
26     The time for matching a regular expression against an input string of length N is in O(M*N), where M depends on the
27     number of matching brackets and the complexity of the expression. That is, M is constant wrt. the input
28     and therefore matching is a linear-time process.
29 
30     The syntax of a regular expressions is as follows.
31     <i>X</i> and <i>Y</i> stand for an arbitrary regular expression.
32 
33     <table border=1 cellspacing=0 cellpadding=5>
34     <caption>Operators</caption>
35     $(TR $(TD X|Y) $(TD alternation, i.e. X or Y) )
36     $(TR $(TD (X)) $(TD matching brackets - creates a sub-match) )
37     $(TR $(TD (?X)) $(TD non-matching brackets - only groups X, no sub-match is created) )
38     $(TR $(TD [Z]) $(TD character class specification, Z is a string of characters or character ranges, e.g. [a-zA-Z0-9_.\-]) )
39     $(TR $(TD [^Z]) $(TD negated character class specification) )
40     $(TR $(TD &lt;X) $(TD lookbehind, X may be a single character or a character class) )
41     $(TR $(TD &gt;X) $(TD lookahead, X may be a single character or a character class) )
42     $(TR $(TD ^) $(TD start of input or start of line) )
43     $(TR $(TD $) $(TD end of input or end of line) )
44     $(TR $(TD \b) $(TD start or end of word, equals (?&lt;\s&gt;\S|&lt;\S&gt;\s)) )
45     $(TR $(TD \B) $(TD opposite of \b, equals (?&lt;\S&gt;\S|&lt;\s&gt;\s)) )
46     </table>
47 
48     <table border=1 cellspacing=0 cellpadding=5>
49     <caption>Quantifiers</caption>
50     $(TR $(TD X?) $(TD zero or one) )
51     $(TR $(TD X*) $(TD zero or more) )
52     $(TR $(TD X+) $(TD one or more) )
53     $(TR $(TD X{n,m}) $(TD at least n, at most m instances of X.<br>If n is missing, it's set to 0.<br>If m is missing, it is set to infinity.) )
54     $(TR $(TD X??) $(TD non-greedy version of the above operators) )
55     $(TR $(TD X*?) $(TD see above) )
56     $(TR $(TD X+?) $(TD see above) )
57     $(TR $(TD X{n,m}?) $(TD see above) )
58     </table>
59 
60     <table border=1 cellspacing=0 cellpadding=5>
61     <caption>Pre-defined character classes</caption>
62     $(TR $(TD .) $(TD any printable character) )
63     $(TR $(TD \s) $(TD whitespace) )
64     $(TR $(TD \S) $(TD non-whitespace) )
65     $(TR $(TD \w) $(TD alpha-numeric characters or underscore) )
66     $(TR $(TD \W) $(TD opposite of \w) )
67     $(TR $(TD \d) $(TD digits) )
68     $(TR $(TD \D) $(TD non-digit) )
69     </table>
70 
71     Note that "alphanumeric" only applies to Latin-1.
72 *******************************************************************************/
73 module tango.text.Regex;
74 
75 debug(TangoRegex) import tango.io.Stdout;
76 
77 /* *****************************************************************************
78     A simple pair
79 *******************************************************************************/
80 private struct Pair(T)
81 {
82     static Pair opCall(T a, T b)
83     {
84         Pair p;
85         p.a = a;
86         p.b = b;
87         return p;
88     }
89 
90     union
91     {
92         struct {
93             T first, second;
94         }
95         struct {
96             T a, b;
97         }
98     }
99 }
100 
101 /* *****************************************************************************
102     Double linked list
103 *******************************************************************************/
104 private class List(T)
105 {
106     class Element
107     {
108         T value;
109         Element prev,
110                 next;
111 
112         this(T v)
113         {
114             value = v;
115         }
116     }
117 
118     size_t  len;
119     Element head,
120             tail;
121 
122     List opCatAssign(T v)
123     {
124         if ( tail is null )
125             head = tail = new Element(v);
126         else {
127             tail.next = new Element(v);
128             tail.next.prev = tail;
129             tail = tail.next;
130         }
131         ++len;
132         return this;
133     }
134 
135     List insertAfter(T w, T v)
136     {
137         foreach ( e; &this.elements )
138         {
139             if ( e.value is w )
140                 return insertAfter(e, v);
141         }
142         return null;
143     }
144 
145     List insertAfter(Element e, T v)
146     {
147         auto tmp = new Element(v);
148         tmp.prev = e;
149         tmp.next = e.next;
150         e.next.prev = tmp;
151         e.next = tmp;
152         if ( e is tail )
153             tail = tmp;
154         ++len;
155         return this;
156     }
157 
158     List opCatAssign(List l)
159     {
160         if ( l.empty() )
161             return this;
162         if ( tail is null ) {
163             head = l.head;
164             tail = l.tail;
165         }
166         else {
167             tail.next = l.head;
168             tail.next.prev = tail;
169             tail = l.tail;
170         }
171         len += l.len;
172         return this;
173     }
174 
175     List pushFront(T v)
176     {
177         if ( head is null )
178             head = tail = new Element(v);
179         else
180         {
181             head.prev = new Element(v);
182             head.prev.next = head;
183             head = head.prev;
184         }
185         ++len;
186         return this;
187     }
188 
189     List insertBefore(T w, T v)
190     {
191         foreach ( e; &this.elements )
192         {
193             if ( e.value is w )
194                 return insertBefore(e, v);
195         }
196         return null;
197     }
198 
199     List insertBefore(Element e, T v)
200     {
201         auto tmp = new Element(v);
202         tmp.prev = e.prev;
203         tmp.next = e;
204         e.prev.next = tmp;
205         e.prev = tmp;
206         if ( e is head )
207             head = tmp;
208         ++len;
209         return this;
210     }
211 
212     List pushFront(List l)
213     {
214         if ( l.empty() )
215             return this;
216         if ( head is null ) {
217             head = l.head;
218             tail = l.tail;
219         }
220         else {
221             head.prev = l.tail;
222             head.prev.next = head;
223             head = l.head;
224         }
225         len += l.len;
226         return this;
227     }
228 
229     size_t length()
230     {
231         return len;
232     }
233 
234     bool empty()
235     {
236         return head is null;
237     }
238 
239     void clear()
240     {
241         head = null;
242         tail = null;
243         len = 0;
244     }
245 
246     void pop()
247     {
248         remove(tail);
249     }
250 
251     void remove(Element e)
252     {
253         if ( e is null )
254             return;
255         if ( e.prev is null )
256             head = e.next;
257         else
258             e.prev.next = e.next;
259         if ( e.next is null )
260             tail = e.prev;
261         else
262             e.next.prev = e.prev;
263         --len;
264     }
265 
266     int elements(int delegate(ref Element) dg)
267     {
268         for ( Element e=head; e !is null; e = e.next )
269         {
270             int ret = dg(e);
271             if ( ret )
272                 return ret;
273         }
274         return 0;
275     }
276 
277     int elements_reverse(int delegate(ref Element) dg)
278     {
279         for ( Element e=tail; e !is null; e = e.prev )
280         {
281             int ret = dg(e);
282             if ( ret )
283                 return ret;
284         }
285         return 0;
286     }
287 
288     int opApply(int delegate(ref T) dg)
289     {
290         for ( Element e=head; e !is null; e = e.next )
291         {
292             int ret = dg(e.value);
293             if ( ret )
294                 return ret;
295         }
296         return 0;
297     }
298 
299     int opApplyReverse(int delegate(ref T) dg)
300     {
301         for ( Element e=tail; e !is null; e = e.prev )
302         {
303             int ret = dg(e.value);
304             if ( ret )
305                 return ret;
306         }
307         return 0;
308     }
309 }
310 
311 /* *****************************************************************************
312     Stack based on dynamic array
313 *******************************************************************************/
314 private struct Stack(T)
315 {
316     size_t  _top;
317     T[]     stack;
318 
319     void push(T v)
320     {
321         if ( _top >= stack.length )
322             stack.length = stack.length*2+1;
323         stack[_top] = v;
324         ++_top;
325     }
326     alias push opCatAssign;
327 
328     void opCatAssign(T[] vs)
329     {
330         size_t end = _top+vs.length;
331         if ( end > stack.length )
332             stack.length = end*2;
333         stack[_top..end] = vs[];
334         _top = end;
335     }
336 
337     void pop(size_t num)
338     {
339         assert(_top>=num);
340         _top -= num;
341     }
342 
343     T pop()
344     {
345         assert(_top>0);
346         return stack[--_top];
347     }
348 
349     T top()
350     {
351         assert(_top>0);
352         return stack[_top-1];
353     }
354 
355     T* topPtr()
356     {
357         assert(_top>0);
358         return &stack[_top-1];
359     }
360 
361     bool empty()
362     {
363         return _top == 0;
364     }
365 
366     void clear()
367     {
368         _top = 0;
369     }
370 
371     size_t length()
372     {
373         return _top;
374     }
375 
376     T[] array()
377     {
378         return stack[0.._top];
379     }
380 
381     T opIndex(size_t i)
382     {
383         return stack[i];
384     }
385 
386     Stack dup()
387     {
388         Stack s;
389         s._top = _top;
390         s.stack = stack.dup;
391         return s;
392     }
393 }
394 
395 /* ************************************************************************************************
396     Set container based on assoc array
397 **************************************************************************************************/
398 private struct Set(T)
399 {
400     bool[T] data;
401 
402     static Set opCall()
403     {
404         Set s;
405         return s;
406     }
407 
408     static Set opCall(T v)
409     {
410         Set s;
411         s ~= v;
412         return s;
413     }
414 
415     void opAddAssign(T v)
416     {
417         data[v] = true;
418     }
419 
420     void opAddAssign(Set s)
421     {
422         foreach ( v; s.elements )
423             data[v] = true;
424     }
425     alias opAddAssign opCatAssign;
426 
427     size_t length()
428     {
429         return data.length;
430     }
431 
432     T[] elements()
433     {
434         return data.keys;
435     }
436 
437     bool remove(T v)
438     {
439         if ( (v in data) is null )
440             return false;
441         data.remove(v);
442         return true;
443     }
444 
445     bool contains(T v)
446     {
447         return (v in data) !is null;
448     }
449 
450     bool contains(Set s)
451     {
452         Set tmp = s - this;
453         return tmp.empty;
454     }
455 
456     bool empty()
457     {
458         return data.length==0;
459     }
460 
461     Set opSub(Set s)
462     {
463         Set res = dup;
464         foreach ( v; s.elements )
465             res.remove(v);
466         return res;
467     }
468 
469     Set dup()
470     {
471         Set s;
472         foreach ( v; data.keys )
473             s.data[v] = true;
474         return s;
475     }
476 }
477 
478 /* ************************************************************************************************
479 
480 **************************************************************************************************/
481 void quickSort(T)(T[] a)
482 {
483     quickSort(a,cast(size_t)0,a.length);
484 }
485 
486 void quickSort(T)(T[] a, size_t l, size_t r)
487 {
488     T t;
489     auto i = r-l;
490     if ( i < 3 )
491     {
492         if ( i < 2 )
493             return;
494         if ( a[l] < a[l+1] )
495             return;
496         t = a[l];
497         a[l] = a[l+1];
498         a[l+1] = t;
499         return;
500     }
501 
502     auto p = a[l];
503     i = l;
504     auto j = r;
505 
506     while ( true )
507     {
508         ++i;
509         for ( ; i < j && a[i] < p; ++i ) {}
510         --j;
511         for ( ; i < j && a[j] >= p; --j ) {}
512         if ( i >= j )
513             break;
514         t = a[i];
515         a[i] = a[j];
516         a[j] = t;
517     }
518     --i;
519     a[l] = a[i];
520     a[i] = p;
521 
522     quickSort(a, l, i);
523     quickSort(a, i+1, r);
524 }
525 import tango.math.Math;
526 
527 /* ************************************************************************************************
528     A range of characters
529 **************************************************************************************************/
530 struct CharRange(char_t)
531 {
532     char_t  l_, r_;
533 
534     static CharRange opCall(char_t c)
535     {
536         CharRange cr;
537         cr.l_ = c;
538         cr.r_ = c;
539         return cr;
540     }
541 
542     static CharRange opCall(char_t a, char_t b)
543     {
544         CharRange cr;
545         cr.l_ = min(a,b);
546         cr.r_ = max(a,b);
547         return cr;
548     }
549 
550     char_t l()
551     {
552         return l_;
553     }
554 
555     char_t r()
556     {
557         return r_;
558     }
559 
560     /* ********************************************************************************************
561         Compares the ranges according to their beginning.
562     **********************************************************************************************/
563     int opCmp(const ref CharRange cr) const
564     {
565         if ( l_ == cr.l_ )
566             return 0;
567         if ( l_ < cr.l_ )
568             return -1;
569         return 1;
570     }
571 
572 
573     const bool opEquals(ref const CharRange cr)
574     {
575         if ( l_ == cr.l_ && r_ == cr.r_ )
576             return true;
577         return false;
578     }
579 
580     bool contains(char_t c)
581     {
582         return c >= l_ && c <= r_;
583     }
584 
585     bool contains(CharRange cr)
586     {
587         return l_ <= cr.l_ && r_ >= cr.r_;
588     }
589 
590     bool intersects(CharRange cr)
591     {
592         return r_ >= cr.l_ && l_ <= cr.r_;
593     }
594 
595     CharRange intersect(CharRange cr)
596     {
597         assert(intersects(cr));
598         CharRange ir;
599         ir.l_ = max(l_, cr.l_);
600         ir.r_ = min(r_, cr.r_);
601         if ( ir.l_ > ir.r_ )
602             ir.l_ = ir.r_ = char_t.min;
603         return ir;
604     }
605 
606     CharRange[] subtract(CharRange cr)
607     {
608         CharRange[] sr;
609         if ( cr.contains(this) )
610             return sr;
611         if ( !intersects(cr) )
612             sr ~= this;
613         else
614         {
615             CharRange d;
616             if ( contains(cr) )
617             {
618                 d.l_ = l_;
619                 d.r_ = cast(char_t)(cr.l_-1);
620                 if ( d.l_ <= d.r_ )
621                     sr ~= d;
622                 d.l_ = cast(char_t)(cr.r_+1);
623                 d.r_ = r_;
624                 if ( d.l_ <= d.r_ )
625                     sr ~= d;
626             }
627             else if ( cr.r_ > l_ )
628             {
629                 d.l_ = cast(char_t)(cr.r_+1);
630                 d.r_ = r_;
631                 if ( d.l_ <= d.r_ )
632                     sr ~= d;
633             }
634             else if ( cr.l_ < r_ )
635             {
636                 d.l_ = l_;
637                 d.r_ = cast(char_t)(cr.l_-1);
638                 if ( d.l_ <= d.r_ )
639                     sr ~= d;
640             }
641         }
642         return sr;
643     }
644 
645     string toString()
646     {
647         string str;
648 
649         if ( l_ == r_ )
650         {
651             if ( l_ > 0x20 && l_ < 0x7f )
652                 str = Format.convert("'{}'", l_).idup;
653             else
654                 str = Format.convert("({:x})", cast(int)l_).idup;
655         }
656         else
657         {
658             if ( l_ > 0x20 && l_ < 0x7f )
659                 str = Format.convert("'{}'", l_).idup;
660             else
661                 str = Format.convert("({:x})", cast(int)l_).idup;
662             str ~= "-";
663             if ( r_ > 0x20 && r_ < 0x7f )
664                 str ~= Format.convert("'{}'", r_).idup;
665             else
666                 str ~= Format.convert("({:x})", cast(int)r_).idup;
667         }
668         return str;
669     }
670 }
671 
672 import tango.core.Compiler;
673 
674 static if(DMDFE_Version == 2063)
675 {
676     /* Workabout for http://d.puremagic.com/issues/show_bug.cgi?id=10425 */
677     void _bug(CharRange!(dchar) a) {}
678 }
679 
680 /* ************************************************************************************************
681     Represents a class of characters as used in regular expressions (e.g. [0-9a-z], etc.)
682 **************************************************************************************************/
683 struct CharClass(char_t)
684 {
685     alias CharRange!(char_t) range_t;
686 
687     //---------------------------------------------------------------------------------------------
688     // pre-defined character classes
689     static const CharClass!(char_t)
690         line_startend = {parts: [
691             {l_:0x00, r_:0x00},
692             {l_:0x0a, r_:0x0a},
693             {l_:0x13, r_:0x13}
694         ]},
695         digit = {parts: [
696             {l_:0x30, r_:0x39}
697         ]},
698         whitespace = {parts: [
699             {l_:0x09, r_:0x09},
700             {l_:0x0a, r_:0x0a},
701             {l_:0x0b, r_:0x0b},
702             {l_:0x13, r_:0x13},
703             {l_:0x14, r_:0x14},
704             {l_:0x20, r_:0x20}
705         ]};
706 
707     // 8bit classes
708     static if ( is(char_t == char) )
709     {
710         static const CharClass!(char_t)
711             any_char = {parts: [
712                 {l_:0x01, r_:0xff}
713             ]},
714             dot_oper = {parts: [
715                 {l_:0x01, r_:0xff}
716             ]},
717             alphanum_ = {parts: [
718                 {l_:0x30, r_:0x39},
719                 {l_:0x41, r_:0x5a},
720                 {l_:0x5f, r_:0x5f},
721                 {l_:0x61, r_:0x7a}
722             ]};
723     }
724     // 16bit and 32bit classes
725     static if ( is(char_t == wchar) || is(char_t == dchar) )
726     {
727         static const CharClass!(char_t)
728             any_char = {parts: [
729                 {l_:0x0001, r_:0xffff}
730             ]},
731             dot_oper = {parts: [
732                 {l_:0x0001, r_:0xffff}
733             ]},
734             alphanum_ = {parts: [
735                 {l_:0x30, r_:0x39},
736                 {l_:0x41, r_:0x5a},
737                 {l_:0x5f, r_:0x5f},
738                 {l_:0x61, r_:0x7a}
739             ]};
740     }
741 
742     //---------------------------------------------------------------------------------------------
743     range_t[] parts;
744 
745     invariant()
746     {
747 //        foreach ( i, p; parts )
748 //            assert(p.l_<=p.r_, Int.toString(i)~": "~Int.toString(p.l_)~" > "~Int.toString(p.r_));
749     }
750 
751     static CharClass opCall(const(CharClass) cc)
752     {
753         CharClass ncc;
754         ncc.parts = cc.parts.dup;
755         return ncc;
756     }
757 
758     int opCmp(const ref CharClass cc) const
759     {
760         if ( parts.length < cc.parts.length )
761             return -1;
762         if ( parts.length > cc.parts.length )
763             return 1;
764         foreach ( i, p; cc.parts )
765         {
766             if ( p.l_ != parts[i].l_ || p.r_ != parts[i].r_ )
767                 return 1;
768         }
769         return 0;
770     }
771 
772     bool empty()
773     {
774         return parts.length <= 0;
775     }
776 
777     bool matches(char_t c)
778     {
779         foreach ( p; parts )
780         {
781             if ( p.contains(c) )
782                 return true;
783         }
784         return false;
785     }
786 
787     CharClass intersect(CharClass cc)
788     {
789         CharClass ic;
790         foreach ( p; parts )
791         {
792             foreach ( cp; cc.parts )
793             {
794                 if ( p.intersects(cp) )
795                     ic.parts ~= p.intersect(cp);
796             }
797         }
798         return ic;
799     }
800 
801     // requires the class to be optimized
802     bool contains(range_t cr)
803     {
804         foreach ( p; parts )
805         {
806             if ( p.contains(cr) )
807                 return true;
808         }
809         return false;
810     }
811 
812     // requires the class to be optimized
813     bool contains(CharClass cc)
814     {
815         Louter: foreach ( p; cc.parts )
816         {
817             foreach ( p2; parts )
818             {
819                 if ( p2.contains(p) )
820                     continue Louter;
821             }
822             return false;
823         }
824         return true;
825     }
826 
827     void subtract(CharClass cc)
828     {
829         negate();
830         add(cc);
831         negate();
832     }
833 
834     void add(CharClass cc)
835     {
836         parts ~= cc.parts;
837     }
838 
839     void add(range_t cr)
840     {
841         parts ~= cr;
842     }
843 
844     void add(char_t c)
845     {
846         parts ~= CharRange!(char_t)(c);
847     }
848 
849     /* ********************************************************************************************
850         Requires the CharClass to be optimized.
851     **********************************************************************************************/
852     void negate()
853     {
854         optimize();
855         char_t  start = char_t.min;
856 
857         // first part touches left boundary of value range
858         if ( parts.length > 0 && parts[0].l_ == start )
859         {
860             start = parts[0].r_;
861             if ( start < char_t.max )
862                 ++start;
863 
864             foreach ( i, ref cr; parts[0 .. $-1] )
865             {
866                 cr.l_ = start;
867                 cr.r_ = cast(char_t)(parts[i+1].l_-1);
868                 start = parts[i+1].r_;
869                 if ( start < char_t.max )
870                     ++start;
871             }
872             if ( start != char_t.max ) {
873                 parts[$-1].l_ = start;
874                 parts[$-1].r_ = char_t.max;
875             }
876             else
877                 parts.length = parts.length-1;
878             return;
879         }
880 
881         foreach ( i, ref cr; parts )
882         {
883             char_t tmp = cast(char_t)(cr.l_-1);
884             cr.l_ = start;
885             start = cr.r_;
886             if ( start < char_t.max )
887                 ++start;
888             cr.r_ = tmp;
889         }
890 
891         // last part does not touch right boundary
892         if ( start != char_t.max )
893             parts ~= range_t(start, char_t.max);
894     }
895 
896     void optimize()
897     {
898         if ( empty() )
899             return;
900 
901         parts.sort;
902 
903         size_t i = 0;
904         foreach ( p; parts[1 .. $] )
905         {
906             if ( p.l_ > parts[i].r_+1 ) {
907                 ++i;
908                 parts[i].l_ = p.l_;
909                 parts[i].r_ = p.r_;
910                 continue;
911             }
912             parts[i].r_ = max(p.r_, parts[i].r_);
913             if ( parts[i].r_ >= char_t.max )
914                 break;
915         }
916         parts.length = i+1;
917     }
918 
919     string toString()
920     {
921         string str;
922         str ~= "[";
923         foreach ( p; parts )
924             str ~= p.toString();
925         str ~= "]";
926         return str;
927     }
928 }
929 
930 debug(UnitTest)
931 {
932 unittest
933 {
934     static CharClass!(char) cc = { parts: [{l_:0,r_:10},{l_:0,r_:6},{l_:5,r_:12},{l_:12,r_:17},{l_:20,r_:100}] };
935     assert(cc.toString(), "[(0)-(a)(0)-(6)(5)-(c)(c)-(11)(14)-'d']");
936     cc.optimize();
937     assert(cc.toString(),  "[(0)-(11)(14)-'d']");
938     cc.negate();
939     assert(cc.toString(),  " [(12)-(13)'e'-(ff)]");
940     cc.optimize();
941     assert(cc.toString(),  "[(0)-(11)(14)-'d']");
942     cc.negate();
943     assert(cc.toString(),  "[(12)-(13)'e'-(ff)]");
944 
945     static CharClass!(char) cc2 = { parts: [] };
946     assert(cc.toString(),  "[]");
947     cc2.optimize();
948     assert(cc.toString(),  "[]");
949     cc2.negate();
950     assert(cc.toString(),  "[(0)-(ff)]");
951     cc2.optimize();
952     assert(cc.toString(),  "[(0)-(ff)]");
953     cc2.negate();
954     assert(cc.toString(),  "[]");
955 
956     static CharClass!(char) cc3 = { parts: [{l_:0,r_:100},{l_:200,r_:0xff},] };
957     assert(cc3.toString(), "[(0)-'d'(c8)-(ff)]");
958     cc3.negate();
959     assert(cc.toString(),  "['e'-(c7)]");
960     cc3.negate();
961     assert(cc.toString(),  "[(0)-'d'(c8)-(ff)]");
962 
963     static CharClass!(char) cc4 = { parts: [{l_:0,r_:200},{l_:100,r_:0xff},] };
964     assert(cc.toString(),  "[(0)-(c8)'d'-(ff)]");
965     cc4.optimize();
966     assert(cc.toString(),  "[(9)-(13)(20)-'~'(a0)-(ff)(100)-(17f)(180)-(24f)(20a3)-(20b5)]");
967 
968     static CharClass!(dchar) cc5 = { parts: [{l_:0x9,r_:0x13},{0x20,r_:'~'},{l_:0xa0,r_:0xff},{l_:0x100,r_:0x17f},{l_:0x180,r_:0x24f},{l_:0x20a3,r_:0x20b5}] };
969     cc5.optimize();
970     assert(cc.toString(),  "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]");
971     cc5.negate();
972     assert(cc.toString(),  "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]");
973     cc5.optimize();
974     assert(cc.toString(),  "[(0)-(8)(14)-(1f)(7f)-(9f)(250)-(20a2)(20b6)-(10ffff)]");
975     cc5.negate();
976     assert(cc.toString(),  "[(9)-(13)(20)-'~'(a0)-(24f)(20a3)-(20b5)]");
977 }
978 }
979 
980 /* ************************************************************************************************
981 
982 **************************************************************************************************/
983 private struct Predicate(char_t)
984 {
985     alias const(char_t)[]       string_t;
986     alias CharClass!(char_t)    cc_t;
987     alias CharRange!(char_t)    cr_t;
988 
989     // generic data
990     enum Type {
991         consume, epsilon, lookahead, lookbehind
992     }
993 
994     cc_t    input;
995     Type    type;
996 
997     // data for compiled predicates
998     enum uint  MAX_BITMAP_LENGTH = 256,
999                MAX_SEARCH_LENGTH = 256;
1000     enum MatchMode {
1001         generic, generic_l,
1002         single_char, bitmap, string_search,         // consume
1003         single_char_l, bitmap_l, string_search_l    // lookahead
1004     }
1005 
1006     MatchMode   mode;
1007     // data_chr had to be pulled out of the union due to
1008     // http://d.puremagic.com/issues/show_bug.cgi?id=2632 --- don't put it back
1009     // in until this is resolved!
1010     //
1011     // Keep in mind that data_str.length can't be modified directly unless the
1012     // new length is strictly greater than the old length. This is essentially
1013     // because ubyte.sizeof can be less than char_t.sizeof. If you set
1014     // data_str.length to anything less than or equal to data_bmp.length,
1015     // data_str will not be reallocated: only the length value will change but
1016     // nothing will realize that not that much space has actually been
1017     // allocated. data_str will be too small and you'll likely get segfaults or
1018     // such.
1019     //
1020     // In short: don't mess with data_str.length. If you have to, remove the
1021     // union entirely.
1022     //
1023     // -- Deewiant
1024     union {
1025         ubyte[]     data_bmp;
1026         string_t    data_str;
1027     };
1028     char_t      data_chr;
1029 
1030 
1031     void compile()
1032     {
1033         assert(input.parts.length > 0);
1034 
1035         // single char?
1036         if ( input.parts.length == 1 && input.parts[0].l_ == input.parts[0].r_ )
1037         {
1038             mode = type==Type.consume ? MatchMode.single_char : MatchMode.single_char_l;
1039             data_chr = input.parts[0].l_;
1040             return;
1041         }
1042         // check whether we can use a bitmap
1043         foreach ( p; input.parts )
1044         {
1045             if ( p.l_ > MAX_BITMAP_LENGTH || p.r_ > MAX_BITMAP_LENGTH )
1046                 goto LnoBitmap;
1047         }
1048 
1049         // setup bitmap
1050         data_bmp.length = MAX_BITMAP_LENGTH/8;
1051         foreach ( p; input.parts )
1052         {
1053             for ( char_t c = p.l_; c <= p.r_; ++c )
1054                 data_bmp[c/8] |= 1 << (c&7);
1055         }
1056         mode = type==Type.consume ? MatchMode.bitmap : MatchMode.bitmap_l;
1057         return;
1058 
1059     LnoBitmap:
1060 /*
1061         // check whether the class is small enough to justify a string-search
1062         // TODO: consider inverse class for 8bit chars?
1063         uint class_size;
1064         foreach ( p; input.parts )
1065             class_size += cast(uint)p.r_+1-p.l_;
1066         if ( class_size > MAX_SEARCH_LENGTH )
1067             goto Lgeneric;
1068         data_str.length = class_size;
1069         size_t ind;
1070         foreach ( p; input.parts )
1071         {
1072             for ( char_t c = p.l_; c <= p.r_; ++c )
1073                 data_str[ind++] = c;
1074         }
1075         mode = type==Type.consume ? MatchMode.string_search : MatchMode.string_search_l;
1076         return;
1077 */
1078     Lgeneric:
1079         data_str = cast(char_t[])input.parts;
1080         mode = type==Type.consume ? MatchMode.generic : MatchMode.generic_l;
1081     }
1082 
1083     bool matches(char_t c)
1084     {
1085         if ( type == Type.consume || type == Type.lookahead )
1086             return input.matches(c);
1087         assert(0);
1088     }
1089 
1090     Predicate intersect(Predicate p)
1091     {
1092         Predicate p2;
1093         if ( type != Type.epsilon && p.type != Type.epsilon )
1094             p2.input = input.intersect(p.input);
1095         return p2;
1096     }
1097 
1098     bool intersects(Predicate p)
1099     {
1100         if ( type != p.type )
1101             return false;
1102         foreach ( cr; input.parts )
1103         {
1104             foreach ( cr2; p.input.parts )
1105             {
1106                 if ( cr.intersects(cr2) )
1107                     return true;
1108             }
1109         }
1110         return false;
1111     }
1112 
1113     bool exceedsMax(uint maxc)
1114     {
1115         foreach ( p; input.parts )
1116         {
1117             if ( p.l_ > maxc || p.r_ > maxc )
1118                 return true;
1119         }
1120 
1121         return false;
1122     }
1123 
1124     @property bool empty()
1125     {
1126         return type != Type.epsilon && input.empty();
1127     }
1128 
1129     void subtract(Predicate p)
1130     {
1131         if ( type != Type.epsilon && p.type != Type.epsilon )
1132             input.subtract(p.input);
1133     }
1134 
1135     void negate()
1136     {
1137         assert(type != Type.epsilon);
1138         input.negate();
1139     }
1140 
1141     void optimize()
1142     {
1143         assert(type != Type.epsilon);
1144         input.optimize();
1145     }
1146 
1147     int opCmp(const ref Predicate p) const
1148     {
1149         return input.opCmp(p.input);
1150     }
1151 
1152     const bool opEquals(ref const Predicate p)
1153     {
1154         if ( type != p.type )
1155             return false;
1156         if ( input.opCmp(p.input) != 0 )
1157             return false;
1158         return true;
1159     }
1160 
1161     cc_t getInput()
1162     {
1163         return input;
1164     }
1165 
1166     void setInput(const(cc_t) cc)
1167     {
1168         input = cast(cc_t)cc;
1169     }
1170 
1171     void appendInput(cr_t cr)
1172     {
1173         input.add(cr);
1174     }
1175 
1176     void appendInput(cc_t cc)
1177     {
1178         input.add(cc);
1179     }
1180 
1181     void appendInput(Predicate p)
1182     {
1183         input.add(p.input);
1184     }
1185 
1186     string toString()
1187     {
1188         string str;
1189         switch ( type )
1190         {
1191             case Type.consume:      str = input.toString();       break;
1192             case Type.epsilon:      str = "eps";                break;
1193             case Type.lookahead:    str = "la:"~input.toString(); break;
1194             case Type.lookbehind:   str = "lb:"~input.toString(); break;
1195             default:
1196                 assert(0);
1197         }
1198         return str;
1199     }
1200 }
1201 import Utf = tango.text.convert.Utf;
1202 import tango.text.convert.Format;
1203 
1204 /* ************************************************************************************************
1205 
1206 **************************************************************************************************/
1207 class RegExpException : Exception
1208 {
1209     this(string msg)
1210     {
1211         super("RegExp: "~msg);
1212     }
1213 }
1214 
1215 /* ************************************************************************************************
1216     TNFA state
1217 **************************************************************************************************/
1218 private class TNFAState(char_t)
1219 {
1220     bool    accept = false,
1221             visited = false;
1222     size_t  index;
1223     List!(TNFATransition!(char_t))  transitions;
1224 
1225     this()
1226     {
1227         transitions = new List!(TNFATransition!(char_t));
1228     }
1229 }
1230 
1231 
1232 /* ************************************************************************************************
1233     Priority classes used to linearize priorities after non-linear transition creation.
1234 **************************************************************************************************/
1235 private enum PriorityClass {
1236     greedy=0, normal=1, reluctant=2, extraReluctant=3
1237 }
1238 
1239 /* ********************************************************************************
1240     TNFA tagged transition
1241 ***********************************************************************************/
1242 private class TNFATransition(char_t)
1243 {
1244     TNFAState!(char_t)  target;
1245     Predicate!(char_t)  predicate;
1246     uint                priority,
1247                         tag;        /// one-based tag number, 0 = untagged
1248     PriorityClass       priorityClass;
1249 
1250     this(PriorityClass pc)
1251     {
1252         priorityClass = pc;
1253     }
1254 
1255     /******************************************************************************
1256         Move through states only going via epsilon transitions, and only choosing
1257         the one with highest priority. If the highest priority transition from a
1258         state isn't an epsilon transition, false is returned.
1259         If the accepting NFA state can be reached in this manner, true is returned.
1260 
1261         NOTE: This method does not look for cycles which should be kept in mind for
1262         later. larsivi 20090827
1263     *******************************************************************************/
1264     bool canFinish()
1265     {
1266         TNFAState!(char_t)  t = target;
1267         while (!t.accept) {
1268             TNFATransition!(char_t) highestPriTrans;
1269             foreach (trans; t.transitions) {
1270                 if (!highestPriTrans || highestPriTrans.priority > trans.priority)
1271                     highestPriTrans = trans;
1272             }
1273             if (!(highestPriTrans.predicate.type == Predicate!(char_t).Type.epsilon))
1274                 return false;
1275 
1276             t = highestPriTrans.target;
1277         }
1278         return true;
1279     }
1280 
1281 }
1282 
1283 /* ************************************************************************************************
1284     Fragments of TNFAs as used in the Thompson method
1285 **************************************************************************************************/
1286 private class TNFAFragment(char_t)
1287 {
1288     alias TNFAState!(char_t)        state_t;
1289     alias TNFATransition!(char_t)   trans_t;
1290 
1291     List!(trans_t)  entries,        /// transitions to be added to the entry state
1292                     exits,          /// transitions to be added to the exit state
1293                     entry_state,    /// transitions to write the entry state to
1294                     exit_state;     /// transitions to write the exit state to
1295 
1296     bool swapMatchingBracketSyntax;
1297 
1298     this()
1299     {
1300         entries     = new List!(trans_t);
1301         exits       = new List!(trans_t);
1302         entry_state = new List!(trans_t);
1303         exit_state  = new List!(trans_t);
1304     }
1305 
1306     /* ********************************************************************************************
1307         Write the given state as entry state to this fragment.
1308     **********************************************************************************************/
1309     void setEntry(state_t state)
1310     {
1311         state.transitions ~= entries;
1312         foreach ( t; entry_state )
1313             t.target = state;
1314     }
1315 
1316     /* ********************************************************************************************
1317         Write the given state as exit state to this fragment.
1318     **********************************************************************************************/
1319     void setExit(state_t state)
1320     {
1321         state.transitions ~= exits;
1322         foreach ( t; exit_state )
1323             t.target = state;
1324     }
1325 }
1326 
1327 /* ************************************************************************************************
1328     Tagged NFA
1329 **************************************************************************************************/
1330 private final class TNFA(char_t)
1331 {
1332     alias TNFATransition!(char_t)   trans_t;
1333     alias TNFAFragment!(char_t)     frag_t;
1334     alias TNFAState!(char_t)        state_t;
1335     alias Predicate!(char_t)        predicate_t;
1336     alias char_t[]                  string_t;
1337     alias CharRange!(char_t)        range_t;
1338     alias CharClass!(char_t)        cc_t;
1339 
1340     string_t    pattern;
1341     state_t[]   states;
1342     state_t     start;
1343 
1344     bool swapMatchingBracketSyntax; /// whether to make (?...) matching and (...) non-matching
1345 
1346     /* ********************************************************************************************
1347         Creates the TNFA from the given regex pattern
1348     **********************************************************************************************/
1349     this(string_t regex)
1350     {
1351         next_tag        = 1;
1352         transitions     = new List!(trans_t);
1353 
1354         pattern = regex;
1355     }
1356 
1357     /* ********************************************************************************************
1358         Print the TNFA (tabular representation of the delta function)
1359     **********************************************************************************************/
1360     debug(TangoRegex) void print()
1361     {
1362         foreach ( int i, s; states )
1363         {
1364             Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" ");
1365 
1366             bool first=true;
1367             Stdout(" {");
1368             foreach ( t; s.transitions )
1369             {
1370                 Stdout.format("{}{}{}:{}->{}", first?"":", ", t.priority, "gnrx"[t.priorityClass], t.predicate.toString, t.target is null?-1:t.target.index);
1371                 if ( t.tag > 0 ) {
1372                     Stdout.format(" t{}", t.tag);
1373                 }
1374                 first = false;
1375             }
1376             Stdout("}").newline;
1377         }
1378     }
1379 
1380     uint tagCount()
1381     {
1382         return next_tag-1;
1383     }
1384 
1385     /* ********************************************************************************************
1386         Constructs the TNFA using extended Thompson method.
1387         Uses a slightly extended version of Dijkstra's shunting yard algorithm to convert
1388         the regexp from infix notation.
1389     **********************************************************************************************/
1390     void parse(bool unanchored)
1391     {
1392         List!(frag_t)       frags       = new List!(frag_t);
1393         Stack!(Operator)    opStack;
1394         Stack!(uint)        tagStack;
1395         Stack!(Pair!(uint)) occurStack;
1396         opStack ~= Operator.eos;
1397 
1398         /* ****************************************************************************************
1399             Perform action on operator stack
1400         ******************************************************************************************/
1401         bool perform(Operator next_op, bool explicit_operator=true)
1402         {
1403             // calculate index in action matrix
1404             int index = cast(int)opStack.top()*(Operator.max+1);
1405             index += cast(int)next_op;
1406 
1407             debug(tnfa) Stdout.formatln("\t{}:{} -> {}  {} frag(s)",
1408                 operator_names[opStack.top], operator_names[next_op], action_names[action_lookup[index]], frags.length
1409             );
1410             switch ( action_lookup[index] )
1411             {
1412                 case Act.pua:
1413                     opStack ~= next_op;
1414                     if ( next_op == Operator.open_par ) {
1415                         tagStack ~= next_tag;
1416                         next_tag += 2;
1417                     }
1418                     break;
1419                 case Act.poc:
1420                     switch ( opStack.top() )
1421                     {
1422                         case Operator.concat:       constructConcat(frags);                             break;
1423                         case Operator.altern:       constructAltern(frags);                             break;
1424                         case Operator.zero_one_g:   constructZeroOne(frags, PriorityClass.greedy);      break;
1425                         case Operator.zero_one_ng:  constructZeroOne(frags, PriorityClass.reluctant);   break;
1426                         case Operator.zero_one_xr:  constructZeroOne(frags, PriorityClass.extraReluctant);  break;
1427                         case Operator.zero_more_g:  constructZeroMore(frags, PriorityClass.greedy);     break;
1428                         case Operator.zero_more_ng: constructZeroMore(frags, PriorityClass.reluctant);  break;
1429                         case Operator.zero_more_xr: constructZeroMore(frags, PriorityClass.extraReluctant); break;
1430                         case Operator.one_more_g:   constructOneMore(frags, PriorityClass.greedy);      break;
1431                         case Operator.one_more_ng:  constructOneMore(frags, PriorityClass.reluctant);   break;
1432                         case Operator.one_more_xr:  constructOneMore(frags, PriorityClass.extraReluctant);  break;
1433                         case Operator.occur_g:
1434                             Pair!(uint) occur = occurStack.pop();
1435                             constructOccur(frags, occur.a, occur.b, PriorityClass.greedy);
1436                             break;
1437                         case Operator.occur_ng:
1438                             Pair!(uint) occur = occurStack.pop();
1439                             constructOccur(frags, occur.a, occur.b, PriorityClass.reluctant);
1440                             break;
1441                         default:
1442                             throw new RegExpException("cannot process operand at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
1443                     }
1444                     opStack.pop();
1445 
1446                     perform(next_op, false);
1447                     break;
1448                 case Act.poa:
1449                     opStack.pop();
1450                     break;
1451                 case Act.pca:
1452                     if ( opStack.top() == Operator.open_par )
1453                     {
1454                         if ( tagStack.empty() )
1455                             throw new RegExpException(Format.convert("Missing opening parentheses for closing parentheses at char {} \"{}\"", cursor, Utf.toString(pattern[cursor..$])).idup);
1456                         constructBracket(frags, tagStack.top());
1457                         tagStack.pop();
1458                     }
1459                     else {
1460                         assert(opStack.top() == Operator.open_par_nm);
1461                         constructBracket(frags);
1462                     }
1463                     opStack.pop();
1464                     break;
1465                 case Act.don:
1466                     return true;
1467                 case Act.err:
1468                 default:
1469                     throw new RegExpException(Format.convert("Unexpected operand at char {} \"{}\" in \"{}\"", cursor, Utf.toString(pattern[cursor..$]), Utf.toString(pattern)).idup);
1470             }
1471 
1472             return false;
1473         }
1474 
1475         // add implicit extra reluctant .* (with . == any_char) at the beginning for unanchored matches
1476         // and matching bracket for total match group
1477         if ( unanchored ) {
1478             frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume);
1479             perform(Operator.zero_more_xr, false);
1480             perform(Operator.concat, false);
1481             perform(Operator.open_par, false);
1482         }
1483 
1484         // convert regex to postfix and create TNFA
1485         bool implicit_concat;
1486         predicate_t.Type pred_type;
1487 
1488         while ( !endOfPattern() )
1489         {
1490             pred_type = predicate_t.Type.consume;
1491 
1492             dchar c = readPattern();
1493             switch ( c )
1494             {
1495                 case '|':
1496                     perform(Operator.altern);
1497                     implicit_concat = false;
1498                     break;
1499                 case '(':
1500                     if ( implicit_concat )
1501                         perform(Operator.concat, false);
1502                     implicit_concat = false;
1503                     if ( peekPattern() == '?' ) {
1504                         readPattern();
1505                         perform(swapMatchingBracketSyntax?Operator.open_par:Operator.open_par_nm);
1506                     }
1507                     else
1508                         perform(swapMatchingBracketSyntax?Operator.open_par_nm:Operator.open_par);
1509                     break;
1510                 case ')':
1511                     perform(Operator.close_par);
1512                     break;
1513                 case '?':
1514                     if ( peekPattern() == '?' ) {
1515                         readPattern();
1516                         perform(Operator.zero_one_ng);
1517                     }
1518                     else
1519                         perform(Operator.zero_one_g);
1520                     break;
1521                 case '*':
1522                     if ( peekPattern() == '?' ) {
1523                         readPattern();
1524                         perform(Operator.zero_more_ng);
1525                     }
1526                     else
1527                         perform(Operator.zero_more_g);
1528                     break;
1529                 case '+':
1530                     if ( peekPattern() == '?' ) {
1531                         readPattern();
1532                         perform(Operator.one_more_ng);
1533                     }
1534                     else
1535                         perform(Operator.one_more_g);
1536                     break;
1537                 case '{':
1538                     Pair!(uint) occur;
1539                     parseOccurCount(occur.a, occur.b);
1540                     occurStack ~= occur;
1541                     if ( peekPattern() == '?' ) {
1542                         readPattern();
1543                         perform(Operator.occur_ng);
1544                     }
1545                     else
1546                         perform(Operator.occur_g);
1547                     break;
1548                 case '[':
1549                     if ( implicit_concat )
1550                         perform(Operator.concat, false);
1551                     implicit_concat = true;
1552                     frags ~= constructCharClass(pred_type);
1553                     break;
1554                 case '.':
1555                     if ( implicit_concat )
1556                         perform(Operator.concat, false);
1557                     implicit_concat = true;
1558                     frags ~= constructChars(cc_t.dot_oper, pred_type);
1559                     break;
1560                 case '$':
1561                     if ( implicit_concat )
1562                         perform(Operator.concat, false);
1563                     implicit_concat = true;
1564 
1565                     frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookahead);
1566                     break;
1567                 case '^':
1568                     if ( implicit_concat )
1569                         perform(Operator.concat, false);
1570                     implicit_concat = true;
1571 
1572                     frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookbehind);
1573                     break;
1574                 case '>':
1575                     c = readPattern();
1576                     pred_type = predicate_t.Type.lookahead;
1577                     if ( c == '[' )
1578                         goto case '[';
1579                     else if ( c == '\\' )
1580                         goto case '\\';
1581                     else if ( c == '.' )
1582                         goto case '.';
1583                     else
1584                         goto default;
1585                 case '<':
1586                     c = readPattern();
1587                     pred_type = predicate_t.Type.lookbehind;
1588                     if ( c == '[' )
1589                         goto case '[';
1590                     else if ( c == '\\' )
1591                         goto case '\\';
1592                     else if ( c == '.' )
1593                         goto case '.';
1594                     else
1595                         goto default;
1596                 case '\\':
1597                     c = readPattern();
1598 
1599                     if ( implicit_concat )
1600                         perform(Operator.concat, false);
1601                     implicit_concat = true;
1602 
1603                     switch ( c )
1604                     {
1605                         case 't':
1606                             frags ~= constructSingleChar('\t', pred_type);
1607                             break;
1608                         case 'n':
1609                             frags ~= constructSingleChar('\n', pred_type);
1610                             break;
1611                         case 'r':
1612                             frags ~= constructSingleChar('\r', pred_type);
1613                             break;
1614                         case 'w':   // alphanumeric and _
1615                             frags ~= constructChars(cc_t.alphanum_, pred_type);
1616                             break;
1617                         case 'W':   // non-(alphanum and _)
1618                             auto cc = cc_t(cc_t.alphanum_);
1619                             cc.negate();
1620                             frags ~= constructChars(cc, pred_type);
1621                             break;
1622                         case 's':   // whitespace
1623                             frags ~= constructChars(cc_t.whitespace, pred_type);
1624                             break;
1625                         case 'S':   // non-whitespace
1626                             auto cc = cc_t(cc_t.whitespace);
1627                             cc.negate();
1628                             frags ~= constructChars(cc, pred_type);
1629                             break;
1630                         case 'd':   // digit
1631                             frags ~= constructChars(cc_t.digit, pred_type);
1632                             break;
1633                         case 'D':   // non-digit
1634                             auto cc = cc_t(cc_t.digit);
1635                             cc.negate();
1636                             frags ~= constructChars(cc, pred_type);
1637                             break;
1638                         case 'b':   // either end of word
1639                             if ( pred_type != predicate_t.Type.consume )
1640                                 throw new RegExpException("Escape sequence \\b not allowed in look-ahead or -behind");
1641 
1642                             // create (?<\S>\s|<\s>\S)
1643                             auto cc = cc_t(cc_t.whitespace);
1644                             cc.negate();
1645 
1646                             perform(Operator.open_par_nm);
1647 
1648                             frags ~= constructChars(cc, predicate_t.Type.lookbehind);
1649                             perform(Operator.concat, false);
1650                             frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead);
1651                             perform(Operator.altern, false);
1652                             frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind);
1653                             perform(Operator.concat, false);
1654                             frags ~= constructChars(cc, predicate_t.Type.lookahead);
1655 
1656                             perform(Operator.close_par, false);
1657                             break;
1658                         case 'B':   // neither end of word
1659                             if ( pred_type != predicate_t.Type.consume )
1660                                 throw new RegExpException("Escape sequence \\B not allowed in look-ahead or -behind");
1661 
1662                             // create (?<\S>\S|<\s>\s)
1663                             auto cc = cc_t(cc_t.whitespace);
1664                             cc.negate();
1665 
1666                             perform(Operator.open_par_nm);
1667 
1668                             frags ~= constructChars(cc, predicate_t.Type.lookbehind);
1669                             perform(Operator.concat, false);
1670                             frags ~= constructChars(cc, predicate_t.Type.lookahead);
1671                             perform(Operator.altern, false);
1672                             frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind);
1673                             perform(Operator.concat, false);
1674                             frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead);
1675 
1676                             perform(Operator.close_par, false);
1677                             break;
1678                         case '(':
1679                         case ')':
1680                         case '[':
1681                         case ']':
1682                         case '{':
1683                         case '}':
1684                         case '*':
1685                         case '+':
1686                         case '?':
1687                         case '.':
1688                         case '\\':
1689                         case '^':
1690                         case '$':
1691                         case '|':
1692                         case '<':
1693                         case '>':
1694                         default:
1695                             frags ~= constructSingleChar(c, pred_type);
1696                             break;
1697 //                            throw new RegExpException(Format.convert("Unknown escape sequence \\{}", c));
1698                     }
1699                     break;
1700 
1701                 default:
1702                     if ( implicit_concat )
1703                         perform(Operator.concat, false);
1704                     implicit_concat = true;
1705                     frags ~= constructSingleChar(c, pred_type);
1706             }
1707         }
1708 
1709         // add implicit reluctant .* (with . == any_char) at the end for unanchored matches
1710         if ( unanchored )
1711         {
1712             perform(Operator.close_par, false);
1713             if ( implicit_concat )
1714                 perform(Operator.concat, false);
1715             frags ~= constructChars(cc_t.any_char, predicate_t.Type.consume);
1716             perform(Operator.zero_more_ng, false);
1717         }
1718 
1719         // empty operator stack
1720         while ( !perform(Operator.eos) ) {}
1721 
1722         // set start and finish states
1723         start = addState();
1724         state_t finish = addState();
1725         finish.accept = true;
1726 
1727         foreach ( f; frags ) {
1728             f.setExit(finish);
1729             f.setEntry(start);
1730         }
1731 
1732         // set transition priorities
1733         List!(trans_t)[PriorityClass.max+1] trans;
1734         foreach ( ref t; trans )
1735             t = new List!(trans_t);
1736 
1737         Stack!(trans_t) todo;
1738         state_t state = start;
1739 
1740         while ( !todo.empty() || !state.visited )
1741         {
1742             if ( !state.visited )
1743             {
1744                 state.visited = true;
1745                 foreach_reverse ( t; state.transitions )
1746                     todo ~= t;
1747             }
1748 
1749             if ( todo.empty() )
1750                 break;
1751             trans_t t = todo.top();
1752             todo.pop();
1753             assert(t.priorityClass<=PriorityClass.max);
1754             trans[t.priorityClass] ~= t;
1755             state = t.target;
1756         }
1757 
1758         uint nextPrio;
1759         foreach ( ts; trans )
1760         {
1761             foreach ( t; ts )
1762                 t.priority = nextPrio++;
1763         }
1764     }
1765 
1766 private:
1767     uint            next_tag;
1768     size_t          cursor,
1769                     next_cursor;
1770     List!(trans_t)  transitions;
1771 
1772     state_t[state_t]    clonedStates;
1773     trans_t[trans_t]    clonedTransitions;
1774 
1775     /// RegEx operators
1776     enum Operator {
1777         eos, concat, altern, open_par, close_par,
1778         zero_one_g, zero_more_g, one_more_g,        // greedy
1779         zero_one_ng, zero_more_ng, one_more_ng,     // non-greedy/reluctant
1780         zero_one_xr, zero_more_xr, one_more_xr,     // extra-reluctant
1781         open_par_nm, occur_g, occur_ng
1782     }
1783     __gshared immutable const(char)[][] operator_names = ["EOS", "concat", "|", "(", ")", "?", "*", "+", "??", "*?", "+?", "??x", "*?x", "+?x", "(?", "{x,y}", "{x,y}?"];
1784 
1785     /// Actions for to-postfix transformation
1786     enum Act {
1787         pua, poc, poa, pca, don, err
1788     }
1789     __gshared immutable const(char)[][] action_names = ["push+advance", "pop+copy", "pop+advance", "pop+copy+advance", "done", "error"];
1790 
1791     /// Action lookup for to-postfix transformation
1792     __gshared immutable Act[] action_lookup =
1793     [
1794     //  eos      concat   |        (        )        ?        *        +        ??       *?       +?       ??extra  *?extra  +?extra  (?       {x,y}    {x,y}?
1795         Act.don, Act.pua, Act.pua, Act.pua, Act.err, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1796         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1797         Act.poc, Act.pua, Act.poc, Act.pua, Act.poc, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1798         Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1799         Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err, Act.err,
1800         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1801         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1802         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1803         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1804         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1805         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1806         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1807         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1808         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1809         Act.err, Act.pua, Act.pua, Act.pua, Act.pca, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua, Act.pua,
1810         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc,
1811         Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.poc, Act.pua, Act.poc, Act.poc
1812     ];
1813 
1814     final dchar peekPattern()
1815     {
1816         auto tmp = next_cursor;
1817         if ( tmp < pattern.length )
1818             return decode(pattern, tmp);
1819         return 0;
1820     }
1821 
1822     final dchar readPattern()
1823     {
1824         cursor = next_cursor;
1825         if ( next_cursor < pattern.length )
1826             return decode(pattern, next_cursor);
1827         return 0;
1828     }
1829 
1830     final bool endOfPattern()
1831     {
1832         return next_cursor >= pattern.length;
1833     }
1834 
1835     state_t addState()
1836     {
1837         state_t s = new state_t;
1838         s.index = states.length;
1839         states ~= s;
1840         return s;
1841     }
1842 
1843     trans_t addTransition(PriorityClass pc = PriorityClass.normal)
1844     {
1845         trans_t trans = new trans_t(pc);
1846         transitions ~= trans;
1847         return trans;
1848     }
1849 
1850     uint parseNumber()
1851     {
1852         uint res;
1853         while ( !endOfPattern() )
1854         {
1855             auto c = peekPattern();
1856             if ( c < '0' || c > '9' )
1857                 break;
1858             res = res*10+(c-'0');
1859             readPattern();
1860         }
1861         return res;
1862     }
1863 
1864     void parseOccurCount(out uint minOccur, out uint maxOccur)
1865     {
1866         assert(pattern[cursor] == '{');
1867 
1868         minOccur = parseNumber();
1869         if ( peekPattern() == '}' ) {
1870             readPattern();
1871             maxOccur = minOccur;
1872             return;
1873         }
1874         if ( peekPattern() != ',' )
1875             throw new RegExpException("Invalid occurence range at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
1876         readPattern();
1877         maxOccur = parseNumber();
1878         if ( peekPattern() != '}' )
1879             throw new RegExpException("Invalid occurence range at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
1880         readPattern();
1881         if ( maxOccur > 0 && maxOccur < minOccur )
1882             throw new RegExpException("Invalid occurence range (max < min) at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
1883     }
1884 
1885     trans_t clone(trans_t t)
1886     {
1887         if ( t is null )
1888             return null;
1889         trans_t* tmp = t in clonedTransitions;
1890         if ( tmp !is null )
1891             return *tmp;
1892 
1893         trans_t t2 = new trans_t(t.priorityClass);
1894         clonedTransitions[t] = t2;
1895         t2.tag = t.tag;
1896         t2.priority = t.priority;
1897         t2.predicate = t.predicate;
1898         t2.target = clone(t.target);
1899         transitions ~= t2;
1900         return t2;
1901     }
1902 
1903     state_t clone(state_t s)
1904     {
1905         if ( s is null )
1906             return null;
1907         state_t* tmp = s in clonedStates;
1908         if ( tmp !is null )
1909             return *tmp;
1910 
1911         state_t s2 = new state_t;
1912         clonedStates[s] = s2;
1913         s2.accept = s.accept;
1914         s2.visited = s.visited;
1915         foreach ( t; s.transitions )
1916             s2.transitions ~= clone(t);
1917         s2.index = states.length;
1918         states ~= s2;
1919         return s2;
1920     }
1921 
1922     frag_t clone(frag_t f)
1923     {
1924         if ( f is null )
1925             return null;
1926         clonedStates = null;
1927         clonedTransitions = null;
1928 
1929         frag_t f2 = new frag_t;
1930         foreach ( t; f.entries )
1931             f2.entries ~= clone(t);
1932         foreach ( t; f.exits )
1933             f2.exits ~= clone(t);
1934         foreach ( t; f.entry_state )
1935             f2.entry_state ~= clone(t);
1936         foreach ( t; f.exit_state )
1937             f2.exit_state ~= clone(t);
1938         return f2;
1939     }
1940 
1941     //---------------------------------------------------------------------------------------------
1942     // Thompson constructions of NFA fragments
1943 
1944     frag_t constructSingleChar(char_t c, predicate_t.Type type)
1945     {
1946         debug(tnfa) Stdout.formatln("constructCharFrag {}", c);
1947 
1948         trans_t trans = addTransition();
1949         trans.predicate.appendInput(CharRange!(char_t)(c));
1950 
1951         trans.predicate.type = type;
1952 
1953         frag_t frag = new frag_t;
1954         frag.exit_state ~= trans;
1955         frag.entries    ~= trans;
1956         return frag;
1957     }
1958 
1959     frag_t constructChars(string_t chars, in predicate_t.Type type)
1960     {
1961         cc_t cc;
1962         for ( int i = 0; i < chars.length; ++i )
1963             cc.add(chars[i]);
1964 
1965         return constructChars(cc, type);
1966     }
1967 
1968     frag_t constructChars(const(cc_t) charclass, in predicate_t.Type type)
1969     {
1970         debug(tnfa) Stdout.format("constructChars type={}", type);
1971 
1972         trans_t trans = addTransition();
1973         trans.predicate.type = type;
1974 
1975         trans.predicate.setInput(cc_t(charclass));
1976 
1977         trans.predicate.optimize();
1978         debug(tnfa) Stdout.formatln("-> {}", trans.predicate.toString);
1979 
1980         frag_t frag = new frag_t;
1981         frag.exit_state ~= trans;
1982         frag.entries    ~= trans;
1983         return frag;
1984     }
1985 
1986     frag_t constructCharClass(predicate_t.Type type)
1987     {
1988         debug(tnfa) Stdout.format("constructCharClass type={}", type);
1989         auto oldCursor = cursor;
1990 
1991         trans_t trans = addTransition();
1992 
1993         bool negated=false;
1994         if ( peekPattern() == '^' ) {
1995             readPattern();
1996             negated = true;
1997         }
1998 
1999         char_t  last;
2000         bool    have_range_start,
2001                 first_char = true;
2002         for ( ; !endOfPattern() && peekPattern() != ']'; )
2003         {
2004             dchar c = readPattern();
2005             switch ( c )
2006             {
2007                 case '-':
2008                     if ( first_char ) {
2009                         trans.predicate.appendInput(range_t(c));
2010                         break;
2011                     }
2012                     if ( !have_range_start )
2013                         throw new RegExpException("Missing range start for '-' operator after \""~Utf.toString(pattern).idup~"\"");
2014                     else if ( endOfPattern() || peekPattern() == ']' )
2015                         throw new RegExpException("Missing range end for '-' operator after \""~Utf.toString(pattern).idup~"\"");
2016                     else {
2017                         c = readPattern();
2018                         trans.predicate.appendInput(range_t(last, c));
2019                         have_range_start = false;
2020                     }
2021                     break;
2022                 case '\\':
2023                     if ( endOfPattern() )
2024                         throw new RegExpException("unexpected end of string after \""~Utf.toString(pattern).idup~"\"");
2025                     c = readPattern();
2026                     switch ( c )
2027                     {
2028                         case 't':
2029                             c = '\t';
2030                             break;
2031                         case 'n':
2032                             c = '\n';
2033                             break;
2034                         case 'r':
2035                             c = '\r';
2036                             break;
2037                         default:
2038                             break;
2039                     }
2040                     goto default;
2041                 default:
2042                     if ( have_range_start )
2043                         trans.predicate.appendInput(range_t(last));
2044                     last = c;
2045                     have_range_start = true;
2046             }
2047             first_char = false;
2048         }
2049         if ( !endOfPattern() )
2050             readPattern();
2051         if ( last != char_t.init )
2052             trans.predicate.appendInput(range_t(last));
2053         debug(tnfa) Stdout.formatln(" {}", pattern[oldCursor..cursor]);
2054 
2055         if ( negated ) {
2056             auto tmp = cc_t(cc_t.any_char);
2057             tmp.subtract(trans.predicate.input);
2058             trans.predicate.input = tmp;
2059         }
2060         else
2061             trans.predicate.optimize();
2062         debug(tnfa) Stdout.formatln("-> {}", trans.predicate.toString);
2063 
2064         trans.predicate.type = type;
2065 
2066         frag_t frag = new frag_t;
2067         frag.exit_state ~= trans;
2068         frag.entries    ~= trans;
2069         return frag;
2070     }
2071 
2072     void constructBracket(List!(frag_t) frags, uint tag=0)
2073     {
2074         debug(tnfa) Stdout.formatln("constructBracket");
2075 
2076         state_t entry = addState(),
2077                 exit = addState();
2078         frags.tail.value.setEntry(entry);
2079         frags.tail.value.setExit(exit);
2080 
2081         trans_t tag1 = addTransition(),
2082                 tag2 = addTransition();
2083         tag1.predicate.type = predicate_t.Type.epsilon;
2084         tag2.predicate.type = predicate_t.Type.epsilon;
2085         if ( tag > 0 )
2086         {
2087             // make sure the tag indeces for bracket x are always
2088             // x*2 for the opening bracket and x*2+1 for the closing bracket
2089             tag1.tag = tag++;
2090             tag2.tag = tag;
2091         }
2092         tag1.target = entry;
2093         exit.transitions ~= tag2;
2094 
2095         frag_t frag = new frag_t;
2096         frag.entries ~= tag1;
2097         frag.exit_state ~= tag2;
2098         frags.pop();
2099         frags ~= frag;
2100     }
2101 
2102     void constructOneMore(List!(frag_t) frags, PriorityClass prioClass)
2103     {
2104         debug(tnfa) Stdout.formatln("constructOneMore");
2105 
2106         if ( frags.empty() )
2107             throw new RegExpException("too few arguments for + at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2108 
2109         trans_t repeat = addTransition(prioClass),
2110                 cont = addTransition();
2111         repeat.predicate.type = predicate_t.Type.epsilon;
2112         cont.predicate.type = predicate_t.Type.epsilon;
2113 
2114         state_t s = addState();
2115         frags.tail.value.setExit(s);
2116         s.transitions ~= repeat;
2117         s.transitions ~= cont;
2118 
2119         frag_t frag = new frag_t;
2120         frag.entries ~= frags.tail.value.entries;
2121         frag.entry_state ~= frags.tail.value.entry_state;
2122         frag.entry_state ~= repeat;
2123         frag.exit_state ~= cont;
2124         frags.pop();
2125         frags ~= frag;
2126     }
2127 
2128     void constructZeroMore(List!(frag_t) frags, PriorityClass prioClass)
2129     {
2130         debug(tnfa) Stdout.formatln("constructZeroMore");
2131 
2132         if ( frags.empty() )
2133             throw new RegExpException("too few arguments for * at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2134 
2135         trans_t enter = addTransition(prioClass),
2136                 repeat = addTransition(prioClass),
2137                 skip = addTransition();
2138         skip.predicate.type = predicate_t.Type.epsilon;
2139         repeat.predicate.type = predicate_t.Type.epsilon;
2140         enter.predicate.type = predicate_t.Type.epsilon;
2141 
2142         state_t entry = addState(),
2143                 exit = addState();
2144         frags.tail.value.setEntry(entry);
2145         frags.tail.value.setExit(exit);
2146         exit.transitions ~= repeat;
2147         enter.target = entry;
2148 
2149         frag_t frag = new frag_t;
2150         frag.entries ~= skip;
2151         frag.entries ~= enter;
2152         frag.exit_state ~= skip;
2153         frag.entry_state ~= repeat;
2154         frags.pop();
2155         frags ~= frag;
2156     }
2157 
2158     void constructZeroOne(List!(frag_t) frags, PriorityClass prioClass)
2159     {
2160         debug(tnfa) Stdout.formatln("constructZeroOne");
2161 
2162         if ( frags.empty() )
2163             throw new RegExpException("too few arguments for ? at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2164 
2165         trans_t use = addTransition(prioClass),
2166                 skip = addTransition();
2167         use.predicate.type = predicate_t.Type.epsilon;
2168         skip.predicate.type = predicate_t.Type.epsilon;
2169 
2170         state_t s = addState();
2171         frags.tail.value.setEntry(s);
2172         use.target = s;
2173 
2174         frag_t frag = new frag_t;
2175         frag.entries ~= use;
2176         frag.entries ~= skip;
2177         frag.exits ~= frags.tail.value.exits;
2178         frag.exit_state ~= frags.tail.value.exit_state;
2179         frag.exit_state ~= skip;
2180         frags.pop();
2181         frags ~= frag;
2182     }
2183 
2184     void constructOccur(List!(frag_t) frags, uint minOccur, uint maxOccur, PriorityClass prioClass)
2185     {
2186         debug(tnfa) Stdout.formatln("constructOccur {},{}", minOccur, maxOccur);
2187 
2188         if ( frags.empty() )
2189             throw new RegExpException("too few arguments for {x,y} at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2190 
2191         state_t s;
2192         frag_t  total = new frag_t,
2193                 prev;
2194 
2195         for ( int i = 0; i < minOccur; ++i )
2196         {
2197             frag_t f = clone(frags.tail.value);
2198             if ( prev !is null ) {
2199                 s = addState();
2200                 prev.setExit(s);
2201                 f.setEntry(s);
2202             }
2203             else {
2204                 total.entries = f.entries;
2205                 total.entry_state = f.entry_state;
2206             }
2207             prev = f;
2208         }
2209 
2210         if ( maxOccur == 0 )
2211         {
2212             frag_t f = frags.tail.value;
2213             trans_t t = addTransition();
2214             t.predicate.type = predicate_t.Type.epsilon;
2215             f.entries ~= t;
2216             f.exit_state ~= t;
2217 
2218             t = addTransition();
2219             t.predicate.type = predicate_t.Type.epsilon;
2220             f.exits ~= t;
2221             f.entry_state ~= t;
2222 
2223             s = addState();
2224             f.setEntry(s);
2225 
2226             if ( prev !is null )
2227                 prev.setExit(s);
2228             else {
2229                 total.entries = f.entries;
2230                 total.entry_state = f.entry_state;
2231             }
2232 
2233             prev = f;
2234         }
2235 
2236         for ( int i = minOccur; i < maxOccur; ++i )
2237         {
2238             frag_t f;
2239             if ( i < maxOccur-1 )
2240                 f = clone(frags.tail.value);
2241             else
2242                 f = frags.tail.value;
2243             trans_t t = addTransition();
2244             t.predicate.type = predicate_t.Type.epsilon;
2245             f.entries ~= t;
2246             f.exit_state ~= t;
2247 
2248             if ( prev !is null ) {
2249                 s = addState();
2250                 prev.setExit(s);
2251                 f.setEntry(s);
2252             }
2253             else {
2254                 total.entries = f.entries;
2255                 total.entry_state = f.entry_state;
2256             }
2257             prev = f;
2258         }
2259 
2260         total.exits = prev.exits;
2261         total.exit_state = prev.exit_state;
2262 
2263         frags.pop();
2264         frags ~= total;
2265     }
2266 
2267     void constructAltern(List!(frag_t) frags)
2268     {
2269         debug(tnfa) Stdout.formatln("constructAltern");
2270 
2271         if ( frags.empty() || frags.head is frags.tail )
2272             throw new RegExpException("too few arguments for | at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2273 
2274         frag_t  frag = new frag_t,
2275                 f1 = frags.tail.value,
2276                 f2 = frags.tail.prev.value;
2277         frag.entry_state ~= f2.entry_state;
2278         frag.entry_state ~= f1.entry_state;
2279         frag.exit_state ~= f2.exit_state;
2280         frag.exit_state ~= f1.exit_state;
2281         frag.entries ~= f2.entries;
2282         frag.entries ~= f1.entries;
2283         frag.exits ~= f2.exits;
2284         frag.exits ~= f1.exits;
2285 
2286         frags.pop();
2287         frags.pop();
2288         frags ~= frag;
2289     }
2290 
2291     void constructConcat(List!(frag_t) frags)
2292     {
2293         debug(tnfa) Stdout.formatln("constructConcat");
2294 
2295         if ( frags.empty() || frags.head is frags.tail )
2296             throw new RegExpException("too few operands for concatenation at \""~Utf.toString(pattern[cursor..$]).idup~"\"");
2297 
2298         frag_t  f1 = frags.tail.value,
2299                 f2 = frags.tail.prev.value;
2300 
2301         state_t state = addState();
2302         f2.setExit(state);
2303         f1.setEntry(state);
2304 
2305         frag_t frag = new frag_t;
2306         frag.entries ~= f2.entries;
2307         frag.exits ~= f1.exits;
2308         frag.entry_state ~= f2.entry_state;
2309         frag.exit_state ~= f1.exit_state;
2310         frags.pop();
2311         frags.pop();
2312         frags ~= frag;
2313     }
2314 }
2315 import tango.core.Array;
2316 
2317 /* ************************************************************************************************
2318     Tagged DFA
2319 **************************************************************************************************/
2320 private class TDFA(char_t)
2321 {
2322     alias Predicate!(char_t)    predicate_t;
2323     alias CharRange!(char_t)    range_t;
2324     alias CharClass!(char_t)    charclass_t;
2325     alias char_t[]              string_t;
2326 
2327     enum uint CURRENT_POSITION_REGISTER = ~0;
2328 
2329     /* ********************************************************************************************
2330         Tag map assignment command
2331     **********************************************************************************************/
2332     struct Command
2333     {
2334         uint        dst,    /// register index to recieve data
2335                     src;    /// register index or CURRENT_POSITION_REGISTER for current position
2336 
2337         string toString()
2338         {
2339             return Format.convert("{}<-{}", dst, src==CURRENT_POSITION_REGISTER?"p":Format.convert("{}", src)).idup;
2340         }
2341 
2342         /* ****************************************************************************************
2343             Order transitions by the order of their predicates.
2344         ******************************************************************************************/
2345         int opCmp(const ref Command cmd) const
2346         {
2347             if ( src == CURRENT_POSITION_REGISTER && cmd.src != CURRENT_POSITION_REGISTER )
2348                 return 1;
2349             if ( src != CURRENT_POSITION_REGISTER && cmd.src == CURRENT_POSITION_REGISTER )
2350                 return -1;
2351             if ( dst < cmd.dst )
2352                 return -1;
2353             if ( dst == cmd.dst )
2354                 return 0;
2355             return 1;
2356         }
2357     }
2358 
2359     struct TagIndex
2360     {
2361         uint    tag,
2362                 index;
2363 
2364         const int opCmp(ref const TagIndex o)
2365         {
2366             if (tag == o.tag && index == o.index)
2367                 return 0;
2368             if (tag > o.tag)
2369                 return 1;
2370             if (tag < o.tag)
2371                 return -1;
2372             if (index > o.index)
2373                 return 1;
2374             if (index < o.index)
2375                 return -1;
2376             assert(0);
2377         }
2378     }
2379 
2380     /* ********************************************************************************************
2381         TDFA state
2382     **********************************************************************************************/
2383     class State
2384     {
2385         enum Mode {
2386             GENERIC, MIXED, LOOKUP
2387         }
2388 
2389         enum uint  LOOKUP_LENGTH = 256,
2390                    INVALID_STATE = 255;
2391 
2392         bool            accept = false;
2393         bool            reluctant = false;
2394         size_t          index;
2395         Transition[]    transitions,
2396                         generic_transitions;
2397         Command[]       finishers;
2398 
2399         ubyte[]         lookup;
2400         Mode            mode;
2401 
2402         void optimize()
2403         {
2404             // merge transitions with equal targets (same state index and equal commands)
2405             size_t[] remove_indeces;
2406             foreach ( i, t; transitions[0 .. $-1] )
2407             {
2408                 foreach ( t2; transitions[i+1 .. $] )
2409                 {
2410                     if ( t.predicate.type != t2.predicate.type || !t.equalTarget(t2) )
2411                         continue;
2412                     t2.predicate.appendInput(t.predicate);
2413                     remove_indeces ~= i;
2414                     break;
2415                 }
2416             }
2417 
2418             // remove transitions that have been merged into another
2419             if ( remove_indeces.length > 0 )
2420             {
2421                 Transition[] tmp;
2422                 tmp.length = transitions.length - remove_indeces.length;
2423                 size_t next_remove, next;
2424                 foreach ( i, t; transitions )
2425                 {
2426                     if ( next_remove < remove_indeces.length && remove_indeces[next_remove] == i ) {
2427                         ++next_remove;
2428                         continue;
2429                     }
2430                     tmp[next++] = t;
2431                 }
2432                 transitions = tmp;
2433 
2434                 foreach ( t; transitions )
2435                     t.predicate.optimize();
2436             }
2437         }
2438 
2439         void createLookup()
2440         {
2441             size_t count;
2442             foreach ( t; transitions )
2443             {
2444                 if ( !t.predicate.exceedsMax(LOOKUP_LENGTH) )
2445                     ++count;
2446             }
2447 
2448             if ( count < 1 || transitions.length > INVALID_STATE ) {
2449                 generic_transitions.length = transitions.length;
2450                 generic_transitions[] = transitions[];
2451                 return;
2452             }
2453 
2454             foreach ( t; transitions )
2455             {
2456                 if ( t.predicate.exceedsMax(LOOKUP_LENGTH) )
2457                     generic_transitions ~= t;
2458             }
2459 
2460             // setup lookup table
2461             lookup.length = LOOKUP_LENGTH;
2462             lookup[] = INVALID_STATE;
2463             foreach ( i, t; transitions )
2464             {
2465                 foreach ( p; t.predicate.input.parts )
2466                 {
2467                     if ( p.l_ >= lookup.length )
2468                         continue;
2469                     for ( char_t c = p.l_; c <= min(p.r_, LOOKUP_LENGTH-1); ++c )
2470                         lookup[c] = cast(ubyte)i;
2471                 }
2472             }
2473 
2474             mode = count < transitions.length? Mode.MIXED : Mode.LOOKUP;
2475         }
2476     }
2477 
2478     /* ********************************************************************************************
2479         TDFA transition
2480     **********************************************************************************************/
2481     class Transition
2482     {
2483         State       target;
2484         predicate_t predicate;
2485         Command[]   commands;
2486 
2487         /* ****************************************************************************************
2488             Order transitions by the order of their predicates.
2489         ******************************************************************************************/
2490         override final int opCmp(Object o)
2491         {
2492             Transition t = cast(Transition)o;
2493             assert(t !is null);
2494             return predicate.opCmp(t.predicate);
2495         }
2496 
2497         final const bool opEquals(const Object o)
2498         {
2499             auto t = cast(const(Transition))o;
2500             if ( t is null )
2501                 return false;
2502             if ( equalTarget(t) && t.predicate == predicate )
2503                 return true;
2504             return false;
2505         }
2506 
2507         const bool equalTarget(in Transition t)
2508         {
2509             if ( t.target.index != target.index )
2510                 return false;
2511             if ( commands.length != t.commands.length )
2512                 return false;
2513             Louter: foreach ( cmd; commands )
2514             {
2515                 foreach ( cmd2; t.commands )
2516                 {
2517                     if ( cmd == cmd2 )
2518                         continue Louter;
2519                 }
2520                 return false;
2521             }
2522             return true;
2523         }
2524     }
2525 
2526 
2527     State[]     states;
2528     State       start;
2529     Command[]   initializer;
2530     uint        num_tags;
2531 
2532     uint[TagIndex]  registers;
2533     uint            next_register;
2534 
2535     uint num_regs()
2536     {
2537         return next_register;
2538     }
2539 
2540     /* ********************************************************************************************
2541         Constructs the TDFA from the given TNFA using extended power set method
2542     **********************************************************************************************/
2543     this(TNFA!(char_t) tnfa)
2544     {
2545         num_tags        = tnfa.tagCount();
2546 
2547         next_register   = num_tags;
2548         for ( int i = 1; i <= num_tags; ++i ) {
2549             TagIndex ti;
2550             ti.tag = i;
2551             registers[ti] = i-1;
2552         }
2553 
2554         // create epsilon closure of TNFA start state
2555         SubsetState subset_start    = new SubsetState;
2556         StateElement se             = new StateElement;
2557         se.nfa_state = tnfa.start;
2558         subset_start.elms ~= se;
2559 
2560         // apply lookbehind closure for string/line start
2561         predicate_t tmp_pred;
2562         tmp_pred.setInput(CharClass!(char_t).line_startend);
2563         subset_start = epsilonClosure(lookbehindClosure(epsilonClosure(subset_start, subset_start), tmp_pred), subset_start);
2564 
2565         start = addState();
2566         subset_start.dfa_state = start;
2567 
2568         // generate initializer and finisher commands for TDFA start state
2569         generateInitializers(subset_start);
2570         generateFinishers(subset_start);
2571 
2572         // initialize stack for state traversal
2573         List!(SubsetState)  subset_states   = new List!(SubsetState),
2574                             unmarked        = new List!(SubsetState);
2575         subset_states   ~= subset_start;
2576         unmarked        ~= subset_start;
2577         debug(tdfa) Stdout.formatln("\n{} = {}\n", subset_start.dfa_state.index, subset_start.toString);
2578 
2579         while ( !unmarked.empty() )
2580         {
2581             SubsetState state = unmarked.tail.value;
2582             unmarked.pop();
2583 
2584             // create transitions for each class, creating new states when necessary
2585             foreach ( pred; disjointPredicates(state) )
2586             {
2587                 // find NFA state we reach with pred
2588                 // reach will set predicate type correctly
2589                 debug(tdfa) Stdout.format("from {} with {} reach", state.dfa_state.index, pred.toString);
2590                 SubsetState target = reach(state, pred);
2591                 if ( target is null ) {
2592                     continue;
2593                     debug(tdfa) Stdout.formatln(" nothing - lookbehind at beginning, skipping");
2594                 }
2595                 debug(tdfa) Stdout.formatln(" {}", target.toString);
2596                 target = epsilonClosure(lookbehindClosure(epsilonClosure(target, state), pred), state);
2597 
2598                 Transition trans = new Transition;
2599                 state.dfa_state.transitions ~= trans;
2600                 debug (tdfa_new_trans) Stdout.formatln("Creating with pred: {}", pred);
2601                 trans.predicate = pred;
2602 
2603                 // generate indeces for pos commands
2604                 // delay creation of pos command until we have reorder-commands
2605                 uint[uint] cmds = null;
2606                 foreach ( e; target.elms )
2607                 {
2608                     foreach ( tag, ref index; e.tags )
2609                     {
2610                         bool found=false;
2611                         foreach ( e2; state.elms )
2612                         {
2613                             int* i = tag in e2.tags;
2614                             if ( i !is null && *i == index ) {
2615                                 found=true;
2616                                 break;
2617                             }
2618                         }
2619                         if ( !found )
2620                         {
2621                             // if index is < 0 it is a temporary index
2622                             // used only to distinguish the state from existing ones.
2623                             // the previous index can be reused instead.
2624                             if ( index < 0 )
2625                                 index = -index-1;
2626                             cmds[tag] = index;
2627                         }
2628                         else
2629                             assert(index>=0);
2630                     }
2631                 }
2632 
2633                 // check whether a state exists that is identical except for tag index reorder-commands
2634                 bool exists=false;
2635                 foreach ( equivTarget; subset_states )
2636                 {
2637                     if ( reorderTagIndeces(target, equivTarget, state, trans) ) {
2638                         target = equivTarget;
2639                         exists = true;
2640                         break;
2641                     }
2642                 }
2643                 // else create new target state
2644                 if ( !exists )
2645                 {
2646                     State ts = addState();
2647                     target.dfa_state = ts;
2648                     subset_states   ~= target;
2649                     unmarked        ~= target;
2650                     debug(tdfa_add) {
2651                         Stdout.formatln("\nAdded {} = {}\n", target.dfa_state.index, target.toString);
2652                     }
2653                     generateFinishers(target);
2654                 }
2655 
2656                 // now generate pos commands, rewriting reorder-commands if existent
2657                 foreach ( tag, index; cmds )
2658                 {
2659                     // check whether reordering used this tag, if so, overwrite the command directly,
2660                     // for it's effect would be overwritten by a subsequent pos-command anyway
2661                     uint reg = registerFromTagIndex(tag, index);
2662                     bool found = false;
2663                     foreach ( ref cmd; trans.commands )
2664                     {
2665                         if ( cmd.src == reg ) {
2666                             found = true;
2667                             cmd.src = CURRENT_POSITION_REGISTER;
2668                             break;
2669                         }
2670                     }
2671                     if ( !found ) {
2672                         Command cmd;
2673                         cmd.dst = reg;
2674                         cmd.src = CURRENT_POSITION_REGISTER;
2675                         trans.commands ~= cmd;
2676                     }
2677                 }
2678 
2679                 trans.target = target.dfa_state;
2680                 debug(tdfa) {
2681                     Stdout.formatln("=> from {} with {} reach {}", state.dfa_state.index, pred.toString, target.dfa_state.index);
2682                 }
2683             }
2684 
2685             state.dfa_state.optimize();
2686         }
2687 
2688         // renumber registers continuously
2689         uint[uint]  regNums;
2690 
2691         for ( next_register = 0; next_register < num_tags; ++next_register )
2692             regNums[next_register] = next_register;
2693 
2694         void renumberCommand(ref Command cmd)
2695         {
2696             if ( cmd.src != CURRENT_POSITION_REGISTER && (cmd.src in regNums) is null )
2697                 regNums[cmd.src] = next_register++;
2698             if ( (cmd.dst in regNums) is null )
2699                 regNums[cmd.dst] = next_register++;
2700             if ( cmd.src != CURRENT_POSITION_REGISTER )
2701                 cmd.src = regNums[cmd.src];
2702             cmd.dst = regNums[cmd.dst];
2703         }
2704 
2705         foreach ( state; states )
2706         {
2707             foreach ( ref cmd; state.finishers )
2708                 renumberCommand(cmd);
2709             // make sure pos-commands are executed after reorder-commands and
2710             // reorder-commands do not overwrite each other
2711             state.finishers.sort;
2712 
2713             foreach ( trans; state.transitions )
2714             {
2715                 foreach ( ref cmd; trans.commands )
2716                     renumberCommand(cmd);
2717                 trans.commands.sort;
2718                 trans.predicate.compile();
2719             }
2720         }
2721 
2722         debug(TangoRegex)
2723         {
2724             foreach ( ref v; registers )
2725             {
2726                 if ( (v in regNums) !is null )
2727                     v = regNums[v];
2728             }
2729         }
2730 
2731         minimizeDFA();
2732 
2733         foreach ( state; states )
2734             state.createLookup();
2735 
2736         // TODO: optimize memory layout of TDFA
2737 
2738         // TODO: add lookahead for string-end somewhere
2739         // TODO: mark dead-end states (not leaving a non-finishing susbet)
2740         // TODO: mark states that can leave the finishing subset of DFA states or use a greedy transition
2741         //       (execution may stop in that state)
2742     }
2743 
2744     /* ********************************************************************************************
2745         Print the TDFA (tabular representation of the delta function)
2746     **********************************************************************************************/
2747     debug(TangoRegex) void print()
2748     {
2749         Stdout.formatln("#tags = {}", num_tags);
2750 
2751         auto tis = new TagIndex[registers.length];
2752         foreach ( k, v; registers )
2753             tis [v] = k;
2754         foreach ( r, ti; tis ) {
2755             Stdout.formatln("tag({},{}) in reg {}", ti.tag, ti.index, r);
2756         }
2757         Stdout.formatln("Initializer:");
2758         foreach ( cmd; initializer ) {
2759             Stdout.formatln("{}", cmd.toString);
2760         }
2761         Stdout.formatln("Delta function:");
2762         foreach ( int i, s; states )
2763         {
2764             Stdout.format("{}{:d2}{}", s is start?">":" ", i, s.accept?"*":" ");
2765 
2766             bool first=true;
2767             Stdout(" {");
2768             foreach ( t; s.transitions )
2769             {
2770                 Stdout.format("{}{}->{} (", first?"":", ", t.predicate.toString, t.target.index);
2771                 bool firstcmd=true;
2772                 foreach ( cmd; t.commands )
2773                 {
2774                     if ( firstcmd )
2775                         firstcmd = false;
2776                     else
2777                         Stdout(",");
2778                     Stdout.format("{}", cmd.toString);
2779                 }
2780                 Stdout(")");
2781                 first = false;
2782             }
2783             Stdout("} (");
2784 
2785             bool firstcmd=true;
2786             foreach ( cmd; s.finishers )
2787             {
2788                 if ( firstcmd )
2789                     firstcmd = false;
2790                 else
2791                     Stdout(",");
2792                 Stdout.format("{}", cmd.toString);
2793             }
2794             Stdout.formatln(")");
2795         }
2796     }
2797 
2798 private:
2799     /* ********************************************************************************************
2800         A (TNFA state, tags) pair element of a subset state.
2801     **********************************************************************************************/
2802     class StateElement
2803     {
2804         TNFAState!(char_t)  nfa_state;
2805         int[uint]           tags;
2806         // use place-value priority with 2 places, value(maxPrio) > value(lastPrio)
2807         uint                maxPriority,
2808                             lastPriority;
2809 
2810         bool prioGreater(StateElement se)
2811         {
2812             if ( maxPriority < se.maxPriority )
2813                 return true;
2814             if ( maxPriority == se.maxPriority ) {
2815                 assert(lastPriority != se.lastPriority);
2816                 return lastPriority < se.lastPriority;
2817             }
2818             return false;
2819         }
2820 
2821         override int opCmp(Object o)
2822         {
2823             StateElement se = cast(StateElement)o;
2824             assert(se !is null);
2825             if ( maxPriority < se.maxPriority )
2826                 return 1;
2827             if ( maxPriority == se.maxPriority )
2828             {
2829                 if ( lastPriority == se.lastPriority )
2830                     return 0;
2831                 return lastPriority < se.lastPriority;
2832             }
2833             return -1;
2834         }
2835 
2836         override string toString()
2837         {
2838             const(char)[] str;
2839             str = Format.convert("{} p{}.{} {{", nfa_state.index, maxPriority, lastPriority);
2840             bool first = true;
2841             foreach ( k, v; tags ) {
2842                 str ~= Format.convert("{}m({},{})", first?"":",", k, v);
2843                 first = false;
2844             }
2845             str ~= "}";
2846             return str.idup;
2847         }
2848     }
2849 
2850     /* ********************************************************************************************
2851         Represents a state in the NFA to DFA conversion.
2852         Contains the set of states (StateElements) the NFA might be in at the same time and the
2853         corresponding DFA state that we create.
2854     **********************************************************************************************/
2855     class SubsetState
2856     {
2857         StateElement[]  elms;
2858         State           dfa_state;
2859 
2860         this(StateElement[] elms=null)
2861         {
2862             this.elms = elms;
2863         }
2864 
2865         int opApply(int delegate (ref TNFATransition!(char_t)) dg)
2866         {
2867             int res;
2868             foreach ( elm; elms )
2869             {
2870                 foreach ( t; elm.nfa_state.transitions )
2871                 {
2872                     res = dg(t);
2873                     if ( res )
2874                         return res;
2875                 }
2876             }
2877             return res;
2878         }
2879 
2880         override string toString()
2881         {
2882             string str = "[ ";
2883             bool first = true;
2884             foreach ( s; elms ) {
2885                 if ( !first )
2886                     str ~= ", ";
2887                 str ~= s.toString();
2888                 first = false;
2889             }
2890             return str~" ]";
2891         }
2892     }
2893 
2894     /* ********************************************************************************************
2895         Temporary structure used for disjoint predicate computation
2896     **********************************************************************************************/
2897     struct Mark
2898     {
2899         char_t  c;
2900         bool    end;    /// false = start of range
2901 
2902         int opCmp(const ref Mark m) const
2903         {
2904             if ( c < m.c )
2905                 return -1;
2906             if ( c > m.c )
2907                 return 1;
2908             if ( end < m.end )
2909                 return -1;
2910             if ( end > m.end )
2911                 return 1;
2912             return 0;
2913         }
2914     }
2915 
2916     /* ********************************************************************************************
2917         Calculates the register index for a given tag map entry. The TDFA implementation uses
2918         registers to save potential tag positions, the index space gets linearized here.
2919 
2920         Params:     tag =   tag number
2921                     index = tag map index
2922         Returns:    index of the register to use for the tag map entry
2923     **********************************************************************************************/
2924     uint registerFromTagIndex(uint tag, uint index)
2925     {
2926         if ( index > 0 )
2927         {
2928             TagIndex ti;
2929             ti.tag = tag;
2930             ti.index = index;
2931             uint* i = ti in registers;
2932             if ( i !is null )
2933                 return *i;
2934             return registers[ti] = next_register++;
2935         }
2936         else
2937             return tag-1;
2938     }
2939 
2940     Mark[] marks_;
2941 
2942     /* ********************************************************************************************
2943         Add new TDFA state to the automaton.
2944     **********************************************************************************************/
2945     State addState()
2946     {
2947         State s = new State;
2948         s.index = states.length;
2949         states ~= s;
2950         return s;
2951     }
2952 
2953     /* ********************************************************************************************
2954         Creates disjoint predicates from all outgoing, potentially overlapping TNFA transitions.
2955 
2956         Params:     state = SubsetState to create the predicates from
2957         Returns:    List of disjoint predicates that can be used for a DFA state
2958     **********************************************************************************************/
2959     predicate_t[] disjointPredicates(SubsetState state)
2960     {
2961         alias CharRange!(char_t) range_t;
2962         debug(tdfa) Stdout.formatln("disjointPredicates()");
2963 
2964         size_t num_marks;
2965         foreach ( t; state )
2966         {
2967             // partitioning will consider lookbehind transitions,
2968             // st. lb-closure will not expand for transitions with a superset of the lb-predicate
2969             if ( t.predicate.type != predicate_t.Type.epsilon )
2970             {
2971                 debug(tdfa) Stdout.formatln("{}", t.predicate.toString);
2972                 if ( marks_.length < num_marks+2*t.predicate.getInput().parts.length )
2973                     marks_.length = num_marks+2*t.predicate.getInput().parts.length;
2974                 foreach ( p; t.predicate.getInput().parts ) {
2975                     marks_[num_marks++] = Mark(p.l(), false);
2976                     marks_[num_marks++] = Mark(p.r(), true);
2977                 }
2978             }
2979         }
2980 
2981         if ( num_marks <= 1 )
2982             throw new Exception("disjointPredicates: No transitions in subset state");
2983 
2984         debug(tdfa) Stdout("\nsorting...").newline;
2985         // using built-in sort somtimes gives an AV in TypeInfo.swap
2986         quickSort(marks_[0 .. num_marks]);
2987         assert(!marks_[0].end);
2988 
2989         debug(tdfa)
2990         {
2991             Stdout("\nsorted marks:\n");
2992             bool first=true;
2993             foreach ( m; marks_[0 .. num_marks] )
2994             {
2995                 if ( first )
2996                     first = false;
2997                 else
2998                     Stdout(",");
2999                 if ( m.c > 0x20 && m.c < 0x7f )
3000                     Stdout.format("{}{}", m.end?"e":"s", m.c);
3001                 else
3002                     Stdout.format("{}{:x}", m.end?"e":"s", cast(int)m.c);
3003             }
3004             Stdout.newline;
3005         }
3006 
3007         size_t  next,
3008                 active = 1;
3009         char_t  start = marks_[0].c,
3010                 end;
3011         range_t[]   disjoint = new range_t[num_marks/2+1];
3012 
3013         foreach ( m; marks_[1 .. num_marks] )
3014         {
3015             if ( m.end )
3016             {
3017                 assert(active>0);
3018                 --active;
3019                 if ( m.c < start )
3020                     continue;
3021                 end = m.c;
3022                 // the next range cannot start at the same pos
3023                 // because starts are sorted before endings
3024                 if ( active > 0 )
3025                     ++m.c;
3026             }
3027             else
3028             {
3029                 ++active;
3030                 if ( active == 1 )
3031                 {
3032                     // skip uncovered interval
3033                     if ( m.c > start ) {
3034                         start = m.c;
3035                         continue;
3036                     }
3037                     end = m.c;
3038                     ++m.c;
3039                 }
3040                 // skip range start if cursor already marks it
3041                 else if ( m.c <= start )
3042                     continue;
3043                 else
3044                     end = m.c-1;
3045             }
3046 
3047             // save range
3048             if ( disjoint.length <= next )
3049                 disjoint.length = disjoint.length*2;
3050 
3051             disjoint[next].l_ = start;
3052             disjoint[next].r_ = end;
3053             ++next;
3054 
3055             // advance cursor
3056             start = m.c;
3057         }
3058         disjoint.length = next;
3059 
3060         // merge isolated ranges into sets of ranges
3061         // no range in a set may occur separated from the others in any predicate
3062         predicate_t[]   preds;
3063         preds.length = 1;
3064         Lmerge: foreach ( r; disjoint )
3065         {
3066             if ( preds[$-1].empty() )
3067                 preds[$-1].appendInput(r);
3068             else
3069             {
3070                 // we can merge r into the current predicate if
3071                 // pred contains r <=> pred contains all the other ranges
3072                 foreach ( t; state )
3073                 {
3074                     if ( t.predicate.type == predicate_t.Type.epsilon )
3075                         continue;
3076 
3077                     if ( t.predicate.getInput().contains(r)
3078                         != t.predicate.getInput().contains(preds[$-1].getInput()) )
3079                     {
3080                         preds.length = preds.length+1;
3081                         break;
3082                     }
3083                 }
3084                 preds[$-1].appendInput(r);
3085             }
3086         }
3087 
3088         debug(tdfa)
3089         {
3090             Stdout("\ndisjoint ranges:\n");
3091             first=true;
3092             foreach ( r; disjoint )
3093             {
3094                 if ( first )
3095                     first = false;
3096                 else
3097                     Stdout(",");
3098                 Stdout.format("{}", r);
3099             }
3100             Stdout.newline;
3101             Stdout("\ndisjoint predicates:\n");
3102             first=true;
3103             foreach ( ref p; preds )
3104             {
3105                 if ( first )
3106                     first = false;
3107                 else
3108                     Stdout(",");
3109                 Stdout.format("{}", p.toString);
3110             }
3111             Stdout.newline;
3112         }
3113 
3114         debug(tdfa) Stdout.formatln("disjointPredicates() end");
3115         return preds;
3116     }
3117 
3118     /* ********************************************************************************************
3119         Finds all TNFA states that can be reached directly with the given predicate and creates
3120         a new SubsetState containing those target states.
3121 
3122         Params:     subst = SubsetState to start from
3123                     pred =  predicate that is matched against outgoing transitions
3124         Returns:    SubsetState containing the reached target states
3125     **********************************************************************************************/
3126     SubsetState reach(SubsetState subst, ref predicate_t pred)
3127     {
3128         // to handle the special case of overlapping consume and lookahead predicates,
3129         // we find the different intersecting predicate types
3130         bool    have_consume,
3131                 have_lookahead;
3132         foreach ( t; subst )
3133         {
3134             if ( t.predicate.type != predicate_t.Type.consume && t.predicate.type != predicate_t.Type.lookahead )
3135                 continue;
3136             auto intpred = t.predicate.intersect(pred);
3137             if ( !intpred.empty() )
3138             {
3139                 if ( t.predicate.type == predicate_t.Type.consume )
3140                     have_consume = true;
3141                 else if ( t.predicate.type == predicate_t.Type.lookahead )
3142                     have_lookahead = true;
3143             }
3144         }
3145 
3146         // if there is consume/lookahead overlap,
3147         // lookahead predicates are handled first
3148         predicate_t.Type processed_type;
3149         if ( have_lookahead )
3150             processed_type = predicate_t.Type.lookahead;
3151         else if ( have_consume )
3152             processed_type = predicate_t.Type.consume;
3153         else {
3154             debug(tdfa) Stdout.formatln("\nERROR: reach found no consume/lookahead symbol for {} in \n{}", pred.toString, subst.toString);
3155             return null;
3156         }
3157         pred.type = processed_type;
3158 
3159         // add destination states to new subsetstate
3160         SubsetState r = new SubsetState;
3161         foreach ( s; subst.elms )
3162         {
3163             foreach ( t; s.nfa_state.transitions )
3164             {
3165                 if ( t.predicate.type != processed_type )
3166                     continue;
3167                 auto intpred = t.predicate.intersect(pred);
3168                 if ( !intpred.empty() ) {
3169                     StateElement se = new StateElement;
3170                     se.maxPriority = max(t.priority, s.maxPriority);
3171                     se.lastPriority = t.priority;
3172                     se.nfa_state = t.target;
3173                     se.tags = s.tags;
3174                     r.elms ~= se;
3175                 }
3176             }
3177         }
3178 
3179         // if we prioritized lookaheads, the states that may consume are also added to the new subset state
3180         // this behaviour is somewhat similar to an epsilon closure
3181         if ( have_lookahead && have_consume )
3182         {
3183             foreach ( s; subst.elms )
3184             {
3185                 foreach ( t; s.nfa_state.transitions )
3186                 {
3187                     if ( t.predicate.type != predicate_t.Type.consume )
3188                         continue;
3189                     auto intpred = t.predicate.intersect(pred);
3190                     if ( !intpred.empty() ) {
3191                         r.elms ~= s;
3192                         break;
3193                     }
3194                 }
3195             }
3196         }
3197         return r;
3198     }
3199 
3200     /* ********************************************************************************************
3201         Extends the given SubsetState with the states that are reached through lookbehind transitions.
3202 
3203         Params:     from =      SubsetState to create the lookbehind closure for
3204                     previous =  predicate "from" was reached with
3205         Returns:    SubsetState containing "from" and all states of it's lookbehind closure
3206     **********************************************************************************************/
3207     SubsetState lookbehindClosure(SubsetState from, predicate_t pred)
3208     {
3209         List!(StateElement) stack = new List!(StateElement);
3210         StateElement[size_t]  closure;
3211 
3212         foreach ( e; from.elms )
3213         {
3214             stack ~= e;
3215             closure[e.nfa_state.index] = e;
3216         }
3217 
3218         while ( !stack.empty() )
3219         {
3220             StateElement se = stack.tail.value;
3221             stack.pop();
3222             foreach ( t; se.nfa_state.transitions )
3223             {
3224                 if ( t.predicate.type != predicate_t.Type.lookbehind )
3225                     continue;
3226                 if ( t.predicate.intersect(pred).empty() )
3227                     continue;
3228                 uint new_maxPri = max(t.priority, se.maxPriority);
3229 
3230                 StateElement* tmp = t.target.index in closure;
3231                 if ( tmp !is null )
3232                 {
3233                     // if higher prio (smaller value) exists, do not use this transition
3234                     if ( tmp.maxPriority < new_maxPri ) {
3235 //                         debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}, continuing", t.target.index, tmp.maxPriority, new_maxPri);
3236                         continue;
3237                     }
3238                     else if ( tmp.maxPriority == new_maxPri )
3239                     {
3240                         // "equal lastPrio -> first-come-first-serve"
3241                         // doesn't work for lexer - how to solve it properly?
3242                         if ( tmp.lastPriority <= t.priority ) {
3243 //                             debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}, continuing", t.target.index, tmp.lastPriority, t.priority);
3244                             continue;
3245                         }
3246 //                         else
3247 //                             debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}", t.target.index, t.priority, tmp.lastPriority);
3248                     }
3249 //                     else
3250 //                         debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}", t.target.index, new_maxPri, tmp.maxPriority);
3251                 }
3252 
3253                 StateElement new_se = new StateElement;
3254                 new_se.maxPriority = max(t.priority, se.maxPriority);
3255                 new_se.lastPriority = t.priority;
3256                 new_se.nfa_state = t.target;
3257                 new_se.tags = se.tags;
3258 
3259                 closure[t.target.index] = new_se;
3260                 stack ~= new_se;
3261             }
3262         }
3263 
3264         SubsetState res = new SubsetState;
3265         res.elms = closure.values;
3266         return res;
3267     }
3268 
3269     /* ********************************************************************************************
3270         Generates the epsilon closure of the given subset state, creating tag map entries
3271         if tags are passed. Takes priorities into account, effectively realizing
3272         greediness and reluctancy.
3273 
3274         Params:     from =      SubsetState to create the epsilon closure for
3275                     previous =  SubsetState "from" was reached from
3276         Returns:    SubsetState containing "from" and all states of it's epsilon closure
3277     **********************************************************************************************/
3278     SubsetState epsilonClosure(SubsetState from, SubsetState previous)
3279     {
3280         int firstFreeIndex=-1;
3281         foreach ( e; previous.elms )
3282         {
3283             foreach ( ti; e.tags )
3284                 firstFreeIndex = max(firstFreeIndex, cast(int)ti);
3285         }
3286         ++firstFreeIndex;
3287 
3288         List!(StateElement) stack = new List!(StateElement);
3289         StateElement[size_t]  closure;
3290 
3291         foreach ( e; from.elms )
3292         {
3293             stack ~= e;
3294             closure[e.nfa_state.index] = e;
3295         }
3296 
3297         while ( !stack.empty() )
3298         {
3299             StateElement se = stack.tail.value;
3300             stack.pop();
3301             foreach ( t; se.nfa_state.transitions )
3302             {
3303                 if ( t.predicate.type != predicate_t.Type.epsilon )
3304                     continue;
3305                 // this is different from Ville Laurikari's algorithm, but it's crucial
3306                 // to take the max (instead of t.priority) to make reluctant operators work
3307                 uint new_maxPri = max(t.priority, se.maxPriority);
3308 
3309                 StateElement* tmp = t.target.index in closure;
3310                 if ( tmp !is null )
3311                 {
3312                     // if higher prio (smaller value) exists, do not use this transition
3313                     if ( tmp.maxPriority < new_maxPri ) {
3314 //                         debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}, continuing", t.target.index, tmp.maxPriority, new_maxPri);
3315                         continue;
3316                     }
3317                     else if ( tmp.maxPriority == new_maxPri )
3318                     {
3319                         // "equal lastPrio -> first-come-first-serve"
3320                         // doesn't work for lexer - how to solve it properly?
3321                         if ( tmp.lastPriority <= t.priority ) {
3322 //                             debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}, continuing", t.target.index, tmp.lastPriority, t.priority);
3323                             continue;
3324                         }
3325 //                         else
3326 //                             debug(tdfa) Stdout.formatln("lastPrio({}) {} beats {}", t.target.index, t.priority, tmp.lastPriority);
3327                     }
3328 //                     else
3329 //                         debug(tdfa) Stdout.formatln("maxPrio({}) {} beats {}", t.target.index, new_maxPri, tmp.maxPriority);
3330                 }
3331 
3332                 auto new_se = new StateElement;
3333                 new_se.maxPriority = new_maxPri;
3334                 new_se.lastPriority = t.priority;
3335                 new_se.nfa_state = t.target;
3336 
3337                 if ( t.tag > 0 )
3338                 {
3339                     foreach ( k, v; se.tags )
3340                         new_se.tags[k] = v;
3341                     new_se.tags[t.tag] = firstFreeIndex;
3342                 }
3343                 else
3344                     new_se.tags = se.tags;
3345 
3346                 closure[t.target.index] = new_se;
3347                 stack ~= new_se;
3348             }
3349         }
3350 
3351         SubsetState res = new SubsetState;
3352         res.elms = closure.values;
3353 
3354         // optimize tag usage
3355         // all we need to do is to check whether the largest tag-index from the
3356         // previous state is actually used in the new state and move all tags with
3357         // firstFreeIndex down by one if not, but only if firstFreeIndex is not 0
3358         if ( firstFreeIndex > 0 )
3359         {
3360             bool seenLastUsedIndex = false;
3361             sluiLoop: foreach ( e; res.elms )
3362             {
3363                 foreach ( i; e.tags )
3364                 {
3365                     if ( i == firstFreeIndex-1 ) {
3366                         seenLastUsedIndex = true;
3367                         break sluiLoop;
3368                     }
3369                 }
3370             }
3371             if ( !seenLastUsedIndex )
3372             {
3373                 foreach ( e; res.elms )
3374                 {
3375                     foreach ( ref i; e.tags )
3376                     {
3377                         // mark index by making it negative
3378                         // to signal that it can be decremented
3379                         // after it has been detected to be a newly used index
3380                         if ( i == firstFreeIndex )
3381                             i = -firstFreeIndex;
3382                     }
3383                 }
3384             }
3385         }
3386 
3387         return res;
3388     }
3389 
3390     /* ********************************************************************************************
3391         Tries to create commands that reorder the tag map of "previous", such that "from" becomes
3392         tag-wise identical to "to". If successful, these commands are added to "trans". This
3393         is done for state re-use.
3394 
3395         Params:     from =      SubsetState to check for tag-wise equality to "to"
3396                     to =        existing SubsetState that we want to re-use
3397                     previous =  SubsetState we're coming from
3398                     trans =     Transition we went along
3399         Returns:    true if "from" is tag-wise identical to "to" and the necessary commands have
3400                     been added to "trans"
3401     **********************************************************************************************/
3402     bool reorderTagIndeces(SubsetState from, SubsetState to, SubsetState previous, Transition trans)
3403     {
3404         if ( from.elms.length != to.elms.length )
3405             return false;
3406 
3407         bool[Command]
3408             cmds;
3409         uint[TagIndex]
3410             reorderedIndeces;
3411         StateElement[TagIndex]
3412             reordered_elements;
3413 
3414         Louter: foreach ( fe; from.elms )
3415         {
3416             foreach ( te; to.elms )
3417             {
3418                 if ( te.nfa_state.index != fe.nfa_state.index )
3419                     continue;
3420                 if ( fe.tags.length != te.tags.length )
3421                     return false;
3422                 foreach ( tag, findex; fe.tags )
3423                 {
3424                     if ( (tag in te.tags) is null )
3425                         return false;
3426 
3427                     TagIndex ti;
3428                     ti.tag = tag;
3429                     ti.index = te.tags[tag];
3430 
3431                     // apply priority for conflicting tag indeces
3432                     if ( (ti in reorderedIndeces) !is null )
3433                     {
3434                         auto rse = reordered_elements[ti];
3435                         auto ri = reorderedIndeces[ti];
3436                         if ( ri != findex
3437                             && ( rse.maxPriority < fe.maxPriority
3438                                 || rse.maxPriority == fe.maxPriority
3439                                 && rse.lastPriority <= fe.lastPriority )
3440                         )
3441                             continue;
3442                         Command cmd;
3443                         cmd.src = registerFromTagIndex(tag,ri);
3444                         cmd.dst = registerFromTagIndex(tag,te.tags[tag]);
3445                         cmds.remove(cmd);
3446                     }
3447                     // if target index differs, create reordering command
3448                     if ( te.tags[tag] != findex )
3449                     {
3450                         Command cmd;
3451                         cmd.src = registerFromTagIndex(tag,findex);
3452                         cmd.dst = registerFromTagIndex(tag,te.tags[tag]);
3453                         cmds[cmd] = true;
3454                     }
3455 
3456                     reorderedIndeces[ti] = findex;
3457                     reordered_elements[ti] = fe;
3458                 }
3459                 continue Louter;
3460             }
3461             return false;
3462         }
3463 
3464         debug(tdfa) {
3465             Stdout.formatln("\nreorder {} to {}\n", from.toString, to.dfa_state.index);
3466         }
3467 
3468         trans.commands ~= cmds.keys;
3469         return true;
3470     }
3471 
3472     /* ********************************************************************************************
3473         Generate tag map initialization commands for start state.
3474     **********************************************************************************************/
3475     void generateInitializers(SubsetState start)
3476     {
3477         uint[uint] cmds;
3478         foreach ( nds; start.elms )
3479         {
3480             foreach ( k, v; nds.tags )
3481                 cmds[k] = v;
3482         }
3483 
3484         foreach ( k, v; cmds ) {
3485             Command cmd;
3486             cmd.dst = registerFromTagIndex(k,v);
3487             cmd.src = CURRENT_POSITION_REGISTER;
3488             initializer ~= cmd;
3489         }
3490     }
3491 
3492     /* ********************************************************************************************
3493         Generates finisher commands for accepting states.
3494     **********************************************************************************************/
3495     void generateFinishers(SubsetState r)
3496     {
3497         // if at least one of the TNFA states accepts,
3498         // set the finishers from active tags in increasing priority
3499         StateElement[]  sorted_elms = r.elms.dup.sort;
3500         bool reluctant = false;
3501         foreach ( se; sorted_elms ) {
3502             debug (Finishers) Stdout.formatln("Finisher: {}", se);
3503             if ( se.nfa_state.accept )
3504             {
3505                 r.dfa_state.accept = true;
3506 
3507                 // Knowing that we're looking at an epsilon closure with an accepting
3508                 // state, we look at the involved transitions - if the path from the
3509                 // nfa state in the set with the highest incoming priority (last in
3510                 // sorted_elms list) to the accepting nfa state is via the highest
3511                 // priority transitions, and they are all epsilon transitions, this
3512                 // suggests we're looking at a regex ending with a reluctant pattern.
3513                 // The NFA->DFA transformation will most likely extend the automata
3514                 // further, but we want the matching to stop here.
3515                 // NOTE: The grounds for choosing the last element in sorted_elms
3516                 // are somewhat weak (empirical testing), but sofar no new
3517                 // regressions have been discovered. larsivi 20090827
3518                 TNFATransition!(char_t) highestPriTrans;
3519                 if (!(sorted_elms[$-1] && sorted_elms[$-1].nfa_state &&
3520                             sorted_elms[$-1].nfa_state))
3521                     throw new Exception ("Something is NULL that is expected to
3522                             be non-null", __FILE__, __LINE__);
3523 
3524                 foreach ( trans; sorted_elms[$-1].nfa_state.transitions ) {
3525                     if (trans.canFinish()) {
3526                         r.dfa_state.reluctant = true;
3527                         break;
3528                     }
3529                 }
3530 
3531                 bool[uint]  finished_tags;
3532                 {
3533                     foreach ( t, i; se.tags )
3534                         if ( i > 0 && !(t in finished_tags) ) {
3535                             finished_tags[t] = true;
3536                             Command cmd;
3537                             cmd.dst = registerFromTagIndex(t, 0);
3538                             cmd.src = registerFromTagIndex(t, i);
3539                             r.dfa_state.finishers ~= cmd;
3540                         }
3541                 }
3542             }
3543         }
3544     }
3545 
3546     /* ********************************************************************************************
3547         Assumes that the command-lists are sorted and transitions are optimized
3548     **********************************************************************************************/
3549     void minimizeDFA()
3550     {
3551         class DiffTable
3552         {
3553             this(size_t num) {
3554                 diff_ = new bool[num*(num+1)/2];
3555             }
3556 
3557             ~this() { delete diff_; }
3558 
3559             bool opCall(size_t a, size_t b)
3560             {
3561                 if ( a < b )
3562                     return diff_[b*(b+1)/2+a];
3563                 return diff_[a*(a+1)/2+b];
3564             }
3565 
3566             void set(size_t a, size_t b)
3567             {
3568                 if ( a < b )
3569                     diff_[b*(b+1)/2+a] = true;
3570                 else
3571                     diff_[a*(a+1)/2+b] = true;
3572             }
3573 
3574             bool[]  diff_;
3575         }
3576 
3577         debug(tdfa) Stdout.formatln("Minimizing TDFA");
3578 
3579         scope diff = new DiffTable(states.length);
3580         bool new_diff = true;
3581 
3582         while ( new_diff )
3583         {
3584             new_diff = false;
3585             foreach ( i, a; states[0 .. $-1] )
3586             {
3587                 Linner: foreach ( j, b; states[i+1 .. $] )
3588                 {
3589                     if ( diff(i, j+i+1) )
3590                         continue;
3591 
3592                     // assume optimized transitions
3593                     if ( a.accept != b.accept || a.transitions.length != b.transitions.length ) {
3594                         diff.set(i, j+i+1);
3595                         new_diff = true;
3596                         continue;
3597                     }
3598 
3599                     if ( a.accept ) // b accepts too
3600                     {
3601                         // assume sorted finishers
3602                         if ( a.finishers.length != b.finishers.length ) {
3603                             diff.set(i, j+i+1);
3604                             new_diff = true;
3605                             continue;
3606                         }
3607                         foreach ( k, cmd; a.finishers )
3608                         {
3609                             if ( cmd != b.finishers[k] ) {
3610                                 diff.set(i, j+i+1);
3611                                 new_diff = true;
3612                                 continue Linner;
3613                             }
3614                         }
3615                     }
3616 
3617                     Ltrans: foreach ( ta; a.transitions )
3618                     {
3619                         foreach ( tb; b.transitions )
3620                         {
3621                             if ( ta.predicate.intersects(tb.predicate) )
3622                             {
3623                                 if ( diff(ta.target.index, tb.target.index) ) {
3624                                     diff.set(i, j+i+1);
3625                                     new_diff = true;
3626                                     continue Linner;
3627                                 }
3628                                 // assume sorted commands
3629                                 if ( ta.commands.length != tb.commands.length ) {
3630                                     diff.set(i, j+i+1);
3631                                     new_diff = true;
3632                                     continue Linner;
3633                                 }
3634                                 foreach ( k, cmd; ta.commands )
3635                                 {
3636                                     if ( cmd != tb.commands[k] ) {
3637                                         diff.set(i, j+i+1);
3638                                         new_diff = true;
3639                                         continue Linner;
3640                                     }
3641                                 }
3642                                 continue Ltrans;
3643                             }
3644                         }
3645 
3646                         diff.set(i, j+i+1);
3647                         new_diff = true;
3648                         continue Linner;
3649                     }
3650 
3651                 }
3652             }
3653         }
3654 
3655         foreach ( i, a; states[0 .. $-1] )
3656         {
3657             foreach ( j, b; states[i+1 .. $] )
3658             {
3659                 if ( !diff(i, j+i+1) )
3660                 {
3661                     debug(tdfa) Stdout.formatln("State {} == {}", i, j+i+1);
3662                     // remap b to a
3663                     foreach ( k, c; states )
3664                     {
3665                         foreach ( t; c.transitions )
3666                         {
3667                             if ( t.target.index == j+i+1 )
3668                                 t.target = a;
3669                         }
3670                     }
3671                 }
3672             }
3673         }
3674 
3675     }
3676 }
3677 import tango.text.Util;
3678 
3679 /**************************************************************************************************
3680     Regular expression compiler and interpreter.
3681 **************************************************************************************************/
3682 class RegExpT(char_t)
3683 {
3684     alias TDFA!(dchar)      tdfa_t;
3685     alias TNFA!(dchar)      tnfa_t;
3686     alias CharClass!(dchar) charclass_t;
3687     alias Predicate!(dchar) predicate_t;
3688 
3689     /**********************************************************************************************
3690         Construct a RegExpT object.
3691         Params:
3692             pattern = Regular expression.
3693         Throws: RegExpException if there are any compilation errors.
3694         Example:
3695             Declare two variables and assign to them a Regex object:
3696             ---
3697             auto r = new Regex("pattern");
3698             auto s = new Regex(r"p[1-5]\s*");
3699             ---
3700     **********************************************************************************************/
3701     this(const(char_t)[] pattern, const(char_t)[] attributes=null)
3702     {
3703         this(pattern, false, true);
3704     }
3705 
3706     /** ditto */
3707     this(const(char_t)[] pattern, bool swapMBS, bool unanchored, bool printNFA=false)
3708     {
3709         pattern_ = pattern;
3710 
3711         debug(TangoRegex) {}
3712         else {
3713             static if ( is(char_t == dchar) ) {
3714                 scope tnfa_t tnfa_ = new tnfa_t(pattern_);
3715             }
3716             else {
3717                 scope tnfa_t tnfa_ = new tnfa_t(tango.text.convert.Utf.toString32(pattern_));
3718             }
3719         }
3720        
3721         tnfa_.swapMatchingBracketSyntax = swapMBS;
3722         tnfa_.parse(unanchored);
3723         if ( printNFA ) {
3724             debug(TangoRegex) Stdout.formatln("\nTNFA:");
3725             debug(TangoRegex) tnfa_.print;
3726         }
3727         tdfa_ = new tdfa_t(tnfa_);
3728         registers_.length = tdfa_.num_regs();
3729     }
3730 
3731     /**********************************************************************************************
3732         Generate instance of Regex.
3733         Params:
3734             pattern = Regular expression.
3735         Throws: RegExpException if there are any compilation errors.
3736         Example:
3737             Declare two variables and assign to them a Regex object:
3738             ---
3739             auto r = Regex("pattern");
3740             auto s = Regex(r"p[1-5]\s*");
3741             ---
3742     **********************************************************************************************/
3743     static RegExpT!(char_t) opCall(const(char_t)[] pattern, const(char_t)[] attributes = null)
3744     {
3745         return new RegExpT!(char_t)(pattern, attributes);
3746     }
3747 
3748     /**********************************************************************************************
3749         Set up for start of foreach loop.
3750         Returns:    Instance of RegExpT set up to search input.
3751         Example:
3752             ---
3753             import tango.io.Stdout;
3754             import tango.text.Regex;
3755 
3756             void main()
3757             {
3758                 foreach(m; Regex("ab").search("qwerabcabcababqwer"))
3759                     Stdout.formatln("{}[{}]{}", m.pre, m.match(0), m.post);
3760             }
3761             // Prints:
3762             // qwer[ab]cabcababqwer
3763             // qwerabc[ab]cababqwer
3764             // qwerabcabc[ab]abqwer
3765             // qwerabcabcab[ab]qwer
3766             ---
3767     **********************************************************************************************/
3768     public RegExpT!(char_t) search(const(char_t)[] input)
3769     {
3770         input_ = input;
3771         next_start_ = 0;
3772         last_start_ = 0;
3773         return this;
3774     }
3775 
3776     /** ditto */
3777     public int opApply(int delegate(ref RegExpT!(char_t)) dg)
3778     {
3779         int result;
3780         while ( !result && test() )
3781             result = dg(this);
3782         return result;
3783     }
3784 
3785     /**********************************************************************************************
3786         Search input for match.
3787         Returns: false for no match, true for match
3788     **********************************************************************************************/
3789     bool test(const(char_t)[] input)
3790     {
3791         this.input_ = input;
3792         next_start_ = 0;
3793         last_start_ = 0;
3794         return test();
3795     }
3796 
3797     /**********************************************************************************************
3798         Pick up where last test(input) or test() left off, and search again.
3799         Returns: false for no match, true for match
3800     **********************************************************************************************/
3801     bool test()
3802     {
3803         // initialize registers
3804         assert(registers_.length == tdfa_.num_regs());
3805         registers_[0..$] = -1;
3806         foreach ( cmd; tdfa_.initializer ) {
3807             assert(cmd.src == tdfa_.CURRENT_POSITION_REGISTER);
3808             registers_[cmd.dst] = 0;
3809         }
3810 
3811         tdfa_t.Transition* tp, tp_end;
3812 
3813         // DFA execution
3814         auto inp = input_[next_start_ .. $];
3815         auto s = tdfa_.start;
3816 
3817         debug(TangoRegex) Stdout.formatln("{}{}: {}", s.accept?"*":" ", s.index, inp);
3818         LmainLoop: for ( size_t p, next_p; p < inp.length; )
3819         {
3820         Lread_char:
3821             dchar c = cast(dchar)inp[p];
3822             if ( c & 0x80 )
3823                 c = decode(inp, next_p);
3824             else
3825                 next_p = p+1;
3826 
3827         Lprocess_char:
3828             debug(TangoRegex) Stdout.formatln("{} (0x{:x})", c, cast(int)c);
3829 
3830             tdfa_t.Transition t = void;
3831             switch ( s.mode )
3832             {
3833                 case s.Mode.LOOKUP:
3834                     if ( c < s.LOOKUP_LENGTH )
3835                     {
3836                         debug(TangoRegex) Stdout.formatln("lookup");
3837                         auto i = s.lookup[c];
3838                         if ( i == s.INVALID_STATE )
3839                             break LmainLoop;
3840                         t = s.transitions[ i ];
3841                         if ( t.predicate.type != t.predicate.Type.consume )
3842                             goto Lno_consume;
3843                         goto Lconsume;
3844                     }
3845                     break LmainLoop;
3846 
3847                 case s.Mode.MIXED:
3848                     if ( c < s.LOOKUP_LENGTH )
3849                     {
3850                         debug(TangoRegex) Stdout.formatln("mixed");
3851                         auto i = s.lookup[c];
3852                         if ( i == s.INVALID_STATE )
3853                             break;
3854                         t = s.transitions[ i ];
3855                         if ( t.predicate.type != t.predicate.Type.consume )
3856                             goto Lno_consume;
3857                         goto Lconsume;
3858                     }
3859                     break;
3860 
3861                 case s.Mode.GENERIC:
3862                 default:
3863                     break;
3864             }
3865 
3866             Ltrans_loop: for ( tp = &s.generic_transitions[0], tp_end = tp+s.generic_transitions.length;
3867                 tp < tp_end; ++tp )
3868             {
3869                 t = *tp;
3870                 switch ( t.predicate.mode )
3871                 {
3872                     // single char
3873                     case predicate_t.MatchMode.single_char:
3874                         debug(TangoRegex) Stdout.formatln("single char 0x{:x} == 0x{:x}", cast(int)c, cast(int)t.predicate.data_chr);
3875                         if ( c != t.predicate.data_chr )
3876                             continue Ltrans_loop;
3877                         goto Lconsume;
3878                     case predicate_t.MatchMode.single_char_l:
3879                         debug(TangoRegex) Stdout.formatln("single char 0x{:x} == 0x{:x}", cast(int)c, cast(int)t.predicate.data_chr);
3880                         if ( c != t.predicate.data_chr )
3881                             continue Ltrans_loop;
3882                         goto Lno_consume;
3883 
3884                     // bitmap
3885                     case predicate_t.MatchMode.bitmap:
3886                         debug(TangoRegex) Stdout.formatln("bitmap {}\n{}", c, t.predicate.toString);
3887                         if ( c <= predicate_t.MAX_BITMAP_LENGTH && ( t.predicate.data_bmp[c/8] & (1 << (c&7)) ) )
3888                             goto Lconsume;
3889                         continue Ltrans_loop;
3890                     case predicate_t.MatchMode.bitmap_l:
3891                         debug(TangoRegex) Stdout.formatln("bitmap {}\n{}", c, t.predicate.toString);
3892                         if ( c <= predicate_t.MAX_BITMAP_LENGTH && ( t.predicate.data_bmp[c/8] & (1 << (c&7)) ) )
3893                             goto Lno_consume;
3894                         continue Ltrans_loop;
3895 
3896                     // string search
3897                     case predicate_t.MatchMode.string_search:
3898                         debug(TangoRegex) Stdout.formatln("string search {} in {}", c, t.predicate.data_str);
3899                         if ( indexOf(t.predicate.data_str.ptr, c, t.predicate.data_str.length) >= t.predicate.data_str.length )
3900                             continue Ltrans_loop;
3901                         goto Lconsume;
3902                     case predicate_t.MatchMode.string_search_l:
3903                         debug(TangoRegex) Stdout.formatln("string search {} in {}", c, t.predicate.data_str);
3904                         if ( indexOf(t.predicate.data_str.ptr, c, t.predicate.data_str.length) >= t.predicate.data_str.length )
3905                             continue Ltrans_loop;
3906                         goto Lno_consume;
3907 
3908                     // generic
3909                     case predicate_t.MatchMode.generic:
3910                         debug(TangoRegex) Stdout.formatln("generic {}\n{}", c, t.predicate.toString);
3911                         for ( auto cmp = t.predicate.data_str.ptr,
3912                             cmpend = cmp + t.predicate.data_str.length;
3913                             cmp < cmpend; ++cmp )
3914                         {
3915                             if ( c < *cmp ) {
3916                                 ++cmp;
3917                                 continue;
3918                             }
3919                             ++cmp;
3920                             if ( c <= *cmp )
3921                                 goto Lconsume;
3922                         }
3923                         continue Ltrans_loop;
3924                     case predicate_t.MatchMode.generic_l:
3925                         debug(TangoRegex) Stdout.formatln("generic {}\n{}", c, t.predicate.toString);
3926                         for ( auto cmp = t.predicate.data_str.ptr,
3927                             cmpend = cmp + t.predicate.data_str.length;
3928                             cmp < cmpend; ++cmp )
3929                         {
3930                             if ( c < *cmp ) {
3931                                 ++cmp;
3932                                 continue;
3933                             }
3934                             ++cmp;
3935                             if ( c <= *cmp )
3936                                 goto Lno_consume;
3937                         }
3938                         continue Ltrans_loop;
3939 
3940                     default:
3941                         assert(0);
3942                 }
3943 
3944             Lconsume:
3945                 p = next_p;
3946             Lno_consume:
3947 
3948                 s = t.target;
3949                 debug(TangoRegex) Stdout.formatln("{}{}: {}", s.accept?"*":" ", s.index, inp[p..$]);
3950                 debug(TangoRegex) Stdout.formatln("{} commands", t.commands.length);
3951 
3952                 foreach ( cmd; t.commands )
3953                 {
3954                     if ( cmd.src == tdfa_.CURRENT_POSITION_REGISTER )
3955                         registers_[cmd.dst] = cast(int)p;
3956                     else
3957                         registers_[cmd.dst] = registers_[cmd.src];
3958                 }
3959 
3960                 if (s.accept && s.reluctant)
3961                     // Don't continue matching, the current find should be correct
3962                     goto Laccept;
3963 
3964                 // if all input was consumed and we do not already accept, try to
3965                 // add an explicit string/line end
3966                 if ( p >= inp.length )
3967                 {
3968                     if ( s.accept || c == 0 )
3969                         break;
3970                     c = 0;
3971                     goto Lprocess_char;
3972                 }
3973                 goto Lread_char;
3974             }
3975             // no applicable transition
3976             break;
3977         }
3978 
3979         if ( s.accept )
3980         {
3981         Laccept:
3982             foreach ( cmd; s.finishers ) {
3983                 assert(cmd.src != tdfa_.CURRENT_POSITION_REGISTER);
3984                 registers_[cmd.dst] = registers_[cmd.src];
3985             }
3986             if ( registers_.length > 1 && registers_[1] >= 0 ) {
3987                 last_start_ = next_start_;
3988                 next_start_ += registers_[1];
3989             }
3990             return true;
3991         }
3992 
3993         return false;
3994     }
3995 
3996     /**********************************************************************************************
3997         Return submatch with the given index.
3998         Params:
3999             index = 0 returns whole match, index > 0 returns submatch of bracket #index
4000         Returns:
4001             Slice of input for the requested submatch, or null if no such submatch exists.
4002     **********************************************************************************************/
4003     const(char_t)[] match(uint index)
4004     {
4005         if ( index > tdfa_.num_tags )
4006             return null;
4007         int start   = cast(int)last_start_+registers_[index*2],
4008             end     = cast(int)last_start_+registers_[index*2+1];
4009         if ( start >= 0 && start < end && end <= input_.length )
4010             return input_[start .. end];
4011         return null;
4012     }
4013 
4014     /** ditto */
4015     const(char_t)[] opIndex(uint index)
4016     {
4017         return match(index);
4018     }
4019 
4020     /**********************************************************************************************
4021         Return the slice of the input that precedes the matched substring.
4022         If no match was found, null is returned.
4023     **********************************************************************************************/
4024     const(char_t)[] pre()
4025     {
4026         auto start = registers_[0];
4027         if ( start < 0 )
4028             return null;
4029         return input_[0 .. last_start_+start];
4030     }
4031 
4032     /**********************************************************************************************
4033         Return the slice of the input that follows the matched substring.
4034         If no match was found, the whole slice of the input that was processed in the last test.
4035     **********************************************************************************************/
4036     const(char_t)[] post()
4037     {
4038         if ( registers_[1] >= 0 )
4039             return input_[next_start_ .. $];
4040         return input_[last_start_ .. $];
4041     }
4042 
4043     /**********************************************************************************************
4044         Splits the input at the matches of this regular expression into an array of slices.
4045         Example:
4046             ---
4047             import tango.io.Stdout;
4048             import tango.text.Regex;
4049 
4050             void main()
4051             {
4052                 auto strs = Regex("ab").split("abcabcababqwer");
4053                 foreach( s; strs )
4054                     Stdout.formatln("{}", s);
4055             }
4056             // Prints:
4057             // c
4058             // c
4059             // qwer
4060             ---
4061     **********************************************************************************************/
4062     inout(char_t)[][] split(inout(char_t)[] input)
4063     {
4064         auto res = new inout(char_t)[][PREALLOC];
4065         uint index;
4066         inout(char_t)[] tmp = input;
4067 
4068         foreach ( r; search(input) )
4069         {
4070             tmp = cast(inout(char_t)[])pre();
4071             res[index++] = tmp[last_start_ .. $];
4072             if ( index >= res.length )
4073                 res.length = res.length*2;
4074             tmp = cast(inout(char_t)[])post();
4075         }
4076 
4077         res[index++] = tmp;
4078         res.length = index;
4079         return res;
4080     }
4081 
4082     /**********************************************************************************************
4083         Returns a copy of the input with all matches replaced by replacement.
4084     **********************************************************************************************/
4085     char_t[] replaceAll(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null)
4086     {
4087         const(char_t)[] tmp = input;
4088         if ( output_buffer.length <= 0 )
4089             output_buffer = new char_t[input.length+replacement.length];
4090         output_buffer.length = 0;
4091 
4092         foreach ( r; search(input) )
4093         {
4094             tmp = pre();
4095             if ( tmp.length > last_start_ )
4096                 output_buffer ~= tmp[last_start_ .. $];
4097             output_buffer ~= replacement;
4098             tmp = post();
4099         }
4100         output_buffer ~= tmp;
4101         return output_buffer;
4102     }
4103 
4104     /**********************************************************************************************
4105         Returns a copy of the input with the last match replaced by replacement.
4106     **********************************************************************************************/
4107     char_t[] replaceLast(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null)
4108     {
4109         const(char_t)[] tmp_pre, tmp_post;
4110         if ( output_buffer.length <= 0 )
4111             output_buffer = new char_t[input.length+replacement.length];
4112         output_buffer.length = 0;
4113 
4114         foreach ( r; search(input) ) {
4115             tmp_pre = pre();
4116             tmp_post = post();
4117         }
4118 
4119         if ( tmp_pre !is null || tmp_post !is null ) {
4120             output_buffer ~= tmp_pre;
4121             output_buffer ~= replacement;
4122             output_buffer ~= tmp_post;
4123         }
4124         else
4125             output_buffer ~= input;
4126 
4127         return output_buffer;
4128     }
4129 
4130     /**********************************************************************************************
4131         Returns a copy of the input with the first match replaced by replacement.
4132     **********************************************************************************************/
4133     char_t[] replaceFirst(const(char_t)[] input, const(char_t)[] replacement, char_t[] output_buffer=null)
4134     {
4135         const(char_t)[] tmp = input;
4136         if ( output_buffer.length <= 0 )
4137             output_buffer = new char_t[input.length+replacement.length];
4138         output_buffer.length = 0;
4139 
4140         if ( test(input) )
4141         {
4142             tmp = pre();
4143             if ( tmp.length > last_start_ )
4144                 output_buffer ~= tmp[last_start_ .. $];
4145             output_buffer ~= replacement;
4146             tmp = post();
4147         }
4148         output_buffer ~= tmp;
4149         return output_buffer;
4150     }
4151 
4152     /**********************************************************************************************
4153         Calls dg for each match and replaces it with dg's return value.
4154     **********************************************************************************************/
4155     char_t[] replaceAll(const(char_t)[] input, char_t[] delegate(RegExpT!(char_t)) dg, char_t[] output_buffer=null)
4156     {
4157         const(char_t)[]    tmp = input;
4158         if ( output_buffer.length <= 0 )
4159             output_buffer = new char_t[input.length];
4160         output_buffer.length = 0;
4161 
4162         foreach ( r; search(input) )
4163         {
4164             tmp = pre();
4165             if ( tmp.length > last_start_ )
4166                 output_buffer ~= tmp[last_start_ .. $];
4167             output_buffer ~= dg(this);
4168             tmp = post();
4169         }
4170         output_buffer ~= tmp;
4171         return output_buffer;
4172     }
4173 
4174     /**********************************************************************************************
4175         Compiles the regular expression to D code.
4176 
4177         NOTE : Remember to import this module (tango.text.Regex) in the module where you put the
4178         generated D code.
4179 
4180     **********************************************************************************************/
4181     // TODO: input-end special case
4182     const(char)[] compileToD(const(char)[] func_name = "match", bool lexer=false)
4183     {
4184         const(char)[] code;
4185         const(char)[] str_type;
4186         static if ( is(char_t == char) )
4187             str_type = "const(char)[]";
4188         static if ( is(char_t == wchar) )
4189             str_type = "const(wchar)[]";
4190         static if ( is(char_t == dchar) )
4191             str_type = "const(dchar)[]";
4192 
4193         if ( lexer )
4194             code = Format.convert("// {}\nbool {}({} input, out uint token, out {} match", pattern_, func_name, str_type, str_type);
4195         else {
4196             code = Format.convert("// {}\nbool match({} input", pattern_, str_type);
4197             code ~= Format.convert(", ref {}[] groups", str_type);
4198         }
4199         code ~= Format.convert(")\n{{\n    uint s = {};", tdfa_.start.index);
4200 
4201         uint num_vars = tdfa_.num_regs();
4202         if ( num_vars > 0 )
4203         {
4204             if ( lexer )
4205                 code ~= "\n    static int ";
4206             else
4207                 code ~= "\n    int ";
4208             bool first = true;
4209             for ( int i = 0, used = 0; i < num_vars; ++i )
4210             {
4211                 bool hasInit = false;
4212                 foreach ( cmd; tdfa_.initializer )
4213                 {
4214                     if ( cmd.dst == i ) {
4215                         hasInit = true;
4216                         break;
4217                     }
4218                 }
4219 
4220                 if ( first )
4221                     first = false;
4222                 else
4223                     code ~= ", ";
4224                 if ( used > 0 && used % 10 == 0 )
4225                     code ~= "\n        ";
4226                 ++used;
4227                 code ~= Format.convert("r{}", i);
4228 
4229                 if ( hasInit )
4230                     code ~= "=0";
4231                 else
4232                     code ~= "=-1";
4233             }
4234             code ~= ";";
4235         }
4236 
4237         code ~= "\n\n    for ( size_t p, next_p; p < input.length; )\n    {";
4238         code ~= "\n        dchar c = cast(dchar)input[p];\n        if ( c & 0x80 )\n            decode(input, next_p);";
4239         code ~= "\n        else\n            next_p = p+1;\n        switch ( s )\n        {";
4240 
4241         size_t[] finish_states;
4242         foreach ( s; tdfa_.states )
4243         {
4244             code ~= Format.convert("\n            case {}:", s.index);
4245 
4246             if ( s.accept )
4247             {
4248                 finish_states ~= s.index;
4249 
4250                 tdfa_t.State target;
4251                 foreach ( t; s.transitions )
4252                 {
4253                     if ( target is null )
4254                         target = t.target;
4255                     else if ( target !is t.target )
4256                     {
4257                         target = null;
4258                         break;
4259                     }
4260                 }
4261                 if ( target !is null && target is s )
4262                     s.transitions = null;
4263             }
4264 
4265             bool first_if=true;
4266             charclass_t cc, ccTest;
4267 
4268             foreach ( t; s.transitions.sort )
4269             {
4270                 ccTest.add(t.predicate.getInput());
4271                 ccTest.optimize();
4272                 if ( t.predicate.getInput() < ccTest )
4273                     cc = t.predicate.getInput();
4274                 else
4275                     cc = ccTest;
4276 
4277                 if ( first_if ) {
4278                     code ~= "\n                if ( ";
4279                     first_if = false;
4280                 }
4281                 else
4282                     code ~= "\n                else if ( ";
4283                 bool first_cond=true;
4284                 foreach ( cr; cc.parts )
4285                 {
4286                     if ( first_cond )
4287                         first_cond = false;
4288                     else
4289                         code ~= " || ";
4290                     if ( cr.l() == cr.r() )
4291                         code ~= Format.convert("c == 0x{:x}", cast(int)cr.l());
4292                     else
4293                         code ~= Format.convert("c >= 0x{:x} && c <= 0x{:x}", cast(int)cr.l(), cast(int)cr.r());
4294                 }
4295                 code ~= Format.convert(" ) {{\n                    s = {};", t.target.index);
4296 
4297                 if ( t.predicate.type == typeof(t.predicate.type).consume )
4298                     code ~= "\n                    p = next_p;";
4299                 foreach ( cmd; t.commands )
4300                     code ~= compileCommand(cmd, "                    ");
4301 /*
4302                 // if inp ends here and we do not already accept, try to add an explicit string/line end
4303                 if ( p >= inp.length && !s.accept && c != 0 ) {
4304                     c = 0;
4305                     goto Lprocess_char;
4306                 }
4307 */
4308                 code ~= "\n                }";
4309             }
4310 
4311             if ( !first_if )
4312                 code ~= Format.convert(
4313                     "\n                else\n                    {};\n                break;",
4314                     s.accept?Format.convert("goto finish{}", s.index):"return false"
4315                 );
4316             else
4317                 code ~= Format.convert("\n                {};", s.accept?Format.convert("goto finish{}", s.index):"return false");
4318         }
4319 
4320         // create finisher groups
4321         size_t[][size_t] finisherGroup;
4322         foreach ( fs; finish_states )
4323         {
4324             // check if finisher group with same commands exists
4325             bool haveFinisher = false;
4326             foreach ( fg; finisherGroup.keys )
4327             {
4328                 bool equalCommands = false;
4329                 if ( tdfa_.states[fs].finishers.length == tdfa_.states[fg].finishers.length )
4330                 {
4331                     equalCommands = true;
4332                     foreach ( i, cmd; tdfa_.states[fs].finishers )
4333                     {
4334                         if ( cmd != tdfa_.states[fg].finishers[i] ) {
4335                             equalCommands = false;
4336                             break;
4337                         }
4338                     }
4339                 }
4340                 if ( equalCommands ) {
4341                     // use existing group for this state
4342                     finisherGroup[fg] ~= fs;
4343                     haveFinisher = true;
4344                     break;
4345                 }
4346             }
4347             // create new group
4348             if ( !haveFinisher )
4349                 finisherGroup[fs] ~= fs;
4350         }
4351 
4352 
4353         code ~= "\n            default:\n                assert(0);\n        }\n    }\n\n    switch ( s )\n    {";
4354         foreach ( group, states; finisherGroup )
4355         {
4356             foreach ( s; states )
4357                 code ~= Format.convert("\n        case {}: finish{}:", s, s);
4358 
4359             foreach ( cmd; tdfa_.states[group].finishers )
4360             {
4361                 if ( lexer )
4362                 {
4363                     if ( tdfa_.states[group].finishers.length > 1 )
4364                         throw new RegExpException("Lexer error: more than one finisher in flm lexer!");
4365                     if ( cmd.dst % 2 == 0 || cmd.dst >= tdfa_.num_tags )
4366                         throw new RegExpException(Format.convert("Lexer error: unexpected dst register {} in flm lexer!", cmd.dst).idup);
4367                     code ~= Format.convert("\n            match = input[0 .. r{}];\n            token = {};", cmd.src, cmd.dst/2);
4368                 }
4369                 else
4370                     code ~= compileCommand(cmd, "            ");
4371             }
4372 
4373             code ~= "\n            break;";
4374         }
4375         code ~= "\n        default:\n            return false;\n    }\n";
4376 
4377         if ( !lexer )
4378         {
4379             code ~= Format.convert("\n    groups.length = {};", tdfa_.num_tags/2);
4380             for ( int i = 0; i < tdfa_.num_tags/2; ++i )
4381                 code ~= Format.convert("\n    if ( r{} > -1 && r{} > -1 )\n        groups[{}] = input[r{} .. r{}];", 2*i, 2*i+1, i, 2*i, 2*i+1);
4382         }
4383 
4384         code ~= "\n    return true;\n}";
4385         return code;
4386     }
4387 
4388     /*********************************************************************************************
4389         Get the pattern with which this regex was constructed.
4390     **********************************************************************************************/
4391     public const(char_t)[] pattern()
4392     {
4393         return pattern_;
4394     }
4395 
4396     /*********************************************************************************************
4397         Get the tag count of this regex, representing the number of sub-matches.
4398 
4399         This value is the max valid value for match/opIndex.
4400     **********************************************************************************************/
4401     uint tagCount()
4402     {
4403         return tdfa_.num_tags;
4404     }
4405 
4406     int[]       registers_;
4407     size_t      next_start_,
4408                 last_start_;
4409 
4410     debug(TangoRegex) tnfa_t tnfa_;
4411     tdfa_t      tdfa_;
4412 private:
4413     enum int           PREALLOC = 16;
4414     const(char_t)[]    input_,
4415                        pattern_;
4416 
4417     const(char)[] compileCommand(tdfa_t.Command cmd, const(char)[] indent)
4418     {
4419         const(char)[]  code,
4420                 dst;
4421         code ~= Format.convert("\n{}r{} = ", indent, cmd.dst);
4422         if ( cmd.src == tdfa_.CURRENT_POSITION_REGISTER )
4423             code ~= "p;";
4424         else
4425             code ~= Format.convert("r{};", cmd.src);
4426         return code;
4427     }
4428 }
4429 
4430 alias RegExpT!(char) Regex;
4431 
4432 debug(utf) import tango.stdc.stdio;
4433 // the following block is stolen from phobos.
4434 // the copyright notice applies for this block only.
4435 /*
4436  *  Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
4437  *  Written by Walter Bright
4438  *
4439  *  This software is provided 'as-is', without any express or implied
4440  *  warranty. In no event will the authors be held liable for any damages
4441  *  arising from the use of this software.
4442  *
4443  *  Permission is granted to anyone to use this software for any purpose,
4444  *  including commercial applications, and to alter it and redistribute it
4445  *  freely, subject to the following restrictions:
4446  *
4447  *  o  The origin of this software must not be misrepresented; you must not
4448  *     claim that you wrote the original software. If you use this software
4449  *     in a product, an acknowledgment in the product documentation would be
4450  *     appreciated but is not required.
4451  *  o  Altered source versions must be plainly marked as such, and must not
4452  *     be misrepresented as being the original software.
4453  *  o  This notice may not be removed or altered from any source
4454  *     distribution.
4455  */
4456 
4457 class UtfException : Exception
4458 {
4459     size_t idx; /// index in string of where error occurred
4460 
4461     this(immutable(char)[] s, size_t i)
4462     {
4463         idx = i;
4464         super(s);
4465     }
4466 }
4467 
4468 bool isValidDchar(dchar c)
4469 {
4470     /* Note: FFFE and FFFF are specifically permitted by the
4471      * Unicode standard for application internal use, but are not
4472      * allowed for interchange.
4473      * (thanks to Arcane Jill)
4474      */
4475 
4476     return c < 0xD800 ||
4477         (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
4478 }
4479 
4480 /* *************
4481  * Decodes and returns character starting at s[idx]. idx is advanced past the
4482  * decoded character. If the character is not well formed, a UtfException is
4483  * thrown and idx remains unchanged.
4484  */
4485 
4486 dchar decode(const(char)[] s, ref size_t idx)
4487     {
4488         size_t len = s.length;
4489         dchar V;
4490         size_t i = idx;
4491         char u = s[i];
4492 
4493         if (u & 0x80)
4494         {   uint n;
4495             char u2;
4496 
4497             /* The following encodings are valid, except for the 5 and 6 byte
4498              * combinations:
4499              *  0xxxxxxx
4500              *  110xxxxx 10xxxxxx
4501              *  1110xxxx 10xxxxxx 10xxxxxx
4502              *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
4503              *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
4504              *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
4505              */
4506             for (n = 1; ; n++)
4507             {
4508                 if (n > 4)
4509                     goto Lerr;          // only do the first 4 of 6 encodings
4510                 if (((u << n) & 0x80) == 0)
4511                 {
4512                     if (n == 1)
4513                         goto Lerr;
4514                     break;
4515                 }
4516             }
4517 
4518             // Pick off (7 - n) significant bits of B from first byte of octet
4519             V = cast(dchar)(u & ((1 << (7 - n)) - 1));
4520 
4521             if (i + (n - 1) >= len)
4522                 goto Lerr;                      // off end of string
4523 
4524             /* The following combinations are overlong, and illegal:
4525              *  1100000x (10xxxxxx)
4526              *  11100000 100xxxxx (10xxxxxx)
4527              *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
4528              *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
4529              *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
4530              */
4531             u2 = s[i + 1];
4532             if ((u & 0xFE) == 0xC0 ||
4533                 (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
4534                 (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
4535                 (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
4536                 (u == 0xFC && (u2 & 0xFC) == 0x80))
4537                 goto Lerr;                      // overlong combination
4538 
4539             for (uint j = 1; j != n; j++)
4540             {
4541                 u = s[i + j];
4542                 if ((u & 0xC0) != 0x80)
4543                     goto Lerr;                  // trailing bytes are 10xxxxxx
4544                 V = (V << 6) | (u & 0x3F);
4545             }
4546             if (!isValidDchar(V))
4547                 goto Lerr;
4548             i += n;
4549         }
4550         else
4551         {
4552             V = cast(dchar) u;
4553             i++;
4554         }
4555 
4556         idx = i;
4557         return V;
4558 
4559       Lerr:
4560         throw new Exception("invalid UTF-8 sequence");
4561     }
4562 
4563 /*  ditto */
4564 
4565 dchar decode(const(wchar)[] s, ref size_t idx)
4566     in
4567     {
4568         assert(idx >= 0 && idx < s.length);
4569     }
4570     out (result)
4571     {
4572         assert(isValidDchar(result));
4573     }
4574     body
4575     {
4576         const(char)[] msg;
4577         dchar V;
4578         size_t i = idx;
4579         uint u = s[i];
4580 
4581         if (u & ~0x7F)
4582         {   if (u >= 0xD800 && u <= 0xDBFF)
4583             {   uint u2;
4584 
4585                 if (i + 1 == s.length)
4586                 {   msg = "surrogate UTF-16 high value past end of string";
4587                     goto Lerr;
4588                 }
4589                 u2 = s[i + 1];
4590                 if (u2 < 0xDC00 || u2 > 0xDFFF)
4591                 {   msg = "surrogate UTF-16 low value out of range";
4592                     goto Lerr;
4593                 }
4594                 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
4595                 i += 2;
4596             }
4597             else if (u >= 0xDC00 && u <= 0xDFFF)
4598             {   msg = "unpaired surrogate UTF-16 value";
4599                 goto Lerr;
4600             }
4601             else if (u == 0xFFFE || u == 0xFFFF)
4602             {   msg = "illegal UTF-16 value";
4603                 goto Lerr;
4604             }
4605             else
4606                 i++;
4607         }
4608         else
4609         {
4610             i++;
4611         }
4612 
4613         idx = i;
4614         return cast(dchar)u;
4615 
4616       Lerr:
4617         throw new UtfException(msg.idup, i);
4618     }
4619 
4620 /*  ditto */
4621 
4622 dchar decode(const(dchar)[] s, ref size_t idx)
4623     in
4624     {
4625         assert(idx >= 0 && idx < s.length);
4626     }
4627     body
4628     {
4629         size_t i = idx;
4630         dchar c = s[i];
4631 
4632         if (!isValidDchar(c))
4633             goto Lerr;
4634         idx = i + 1;
4635         return c;
4636 
4637       Lerr:
4638         throw new UtfException("5invalid UTF-32 value", i);
4639     }
4640 
4641 
4642 
4643 /* =================== Encode ======================= */
4644 
4645 /* *****************************
4646  * Encodes character c and appends it to array s[].
4647  */
4648 
4649 void encode(ref const(char)[] s, dchar c)
4650     in
4651     {
4652         assert(isValidDchar(c));
4653     }
4654     body
4655     {
4656         const(char)[] r = s;
4657 
4658         if (c <= 0x7F)
4659         {
4660             r ~= cast(char) c;
4661         }
4662         else
4663         {
4664             char[4] buf;
4665             uint L;
4666 
4667             if (c <= 0x7FF)
4668             {
4669                 buf[0] = cast(char)(0xC0 | (c >> 6));
4670                 buf[1] = cast(char)(0x80 | (c & 0x3F));
4671                 L = 2;
4672             }
4673             else if (c <= 0xFFFF)
4674             {
4675                 buf[0] = cast(char)(0xE0 | (c >> 12));
4676                 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
4677                 buf[2] = cast(char)(0x80 | (c & 0x3F));
4678                 L = 3;
4679             }
4680             else if (c <= 0x10FFFF)
4681             {
4682                 buf[0] = cast(char)(0xF0 | (c >> 18));
4683                 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
4684                 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
4685                 buf[3] = cast(char)(0x80 | (c & 0x3F));
4686                 L = 4;
4687             }
4688             else
4689             {
4690                 assert(0);
4691             }
4692             r ~= buf[0 .. L];
4693         }
4694         s = r;
4695     }
4696 
4697 /*  ditto */
4698 
4699 void encode(ref const(wchar)[] s, dchar c)
4700     in
4701     {
4702         assert(isValidDchar(c));
4703     }
4704     body
4705     {
4706         const(wchar)[] r = s;
4707 
4708         if (c <= 0xFFFF)
4709         {
4710             r ~= cast(wchar) c;
4711         }
4712         else
4713         {
4714             wchar[2] buf;
4715 
4716             buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
4717             buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
4718             r ~= buf;
4719         }
4720         s = r;
4721     }
4722 
4723 /*  ditto */
4724 
4725 void encode(ref const(dchar)[] s, dchar c)
4726     in
4727     {
4728         assert(isValidDchar(c));
4729     }
4730     body
4731     {
4732         s ~= c;
4733     }
4734 
4735 debug(UnitTest)
4736 {
4737     unittest
4738     {
4739         // match a fixed string
4740         Regex r;
4741         r = new Regex("abc");
4742         assert(r.test("abc123"));
4743         assert(r.test("feabc123"));
4744         assert(!r.test("abe123"));
4745 
4746         // match a non-ASCII string
4747         r = new Regex("☃");
4748         assert(r.test("a☃c123"));
4749 
4750         // capture
4751         r = new Regex("☃(c)");
4752         assert(r.test("a☃c123"));
4753         assert(r.match(1) == "c");
4754 
4755         // dot
4756         r = new Regex("..");
4757         assert(r.test("a☃c123"));
4758 
4759         // dot capture
4760         r = new Regex("(..)");
4761         assert(r.test("a☃c123"));
4762         assert(r.match(1) == "a☃");
4763 
4764 
4765 
4766         // two captures
4767         r = new Regex("(.)(.)");
4768         assert(r.test("a☃c123"));
4769         assert(r.match(1) == "a");
4770         assert(r.match(2) == "☃");
4771 
4772         // multiple letters
4773         r = new Regex(".*(e+).*");
4774         assert(r.test("aeeeeec123"));
4775         assert(r.match(1) == "eeeee", "Expected: eeeee Got: " ~ r.match(1));
4776         assert(!r.test("abfua"));
4777 
4778         // multiple snowmen
4779         r = new Regex(".*(☃+).*");
4780         assert(r.test("a☃☃☃☃☃c123"));
4781         assert(r.match(1) == "☃☃☃☃☃", "Expected: ☃☃☃☃☃ Got: " ~ r.match(1));
4782         assert(!r.test("abeua"));
4783 
4784         /**
4785            "*" quantifier bug. In "(☃+)" pattern test ((r.match(0) == "☃☃☃☃☃")&&(r.match(1) == "☃☃☃☃☃")) has passed.
4786         */
4787         // multiple snowmen
4788         r = new Regex("(☃+)");
4789         assert(r.test("a☃☃☃☃☃c123"));
4790         assert(r.match(0) == "☃☃☃☃☃");
4791         assert(r.match(1) == "☃☃☃☃☃");
4792         assert(r.test("a☃beua"));
4793     }
4794 
4795     debug(RegexTestOnly)
4796     {
4797         import tango.io.Stdout;
4798         void main() { Stdout("All tests pass").newline(); }
4799     }
4800 }