tango.text.Util source code

1 /*******************************************************************************
2 
3         copyright:      Copyright (c) 2004 Kris Bell. All rights reserved
4 
5         license:        BSD style: $(LICENSE)
6 
7         version:        Apr 2004: Initial release
8                         Dec 2006: South Seas version
9 
10         author:         Kris
11 
12 
13         Placeholder for a variety of wee functions. These functions are all
14         templated with the intent of being used for arrays of char, wchar,
15         and dchar. However, they operate correctly with other array types
16         also.
17 
18         Several of these functions return an index value, representing where
19         some criteria was identified. When said criteria is not matched, the
20         functions return a value representing the array length provided to
21         them. That is, for those scenarios where C functions might typically
22         return -1 these functions return length instead. This operate nicely
23         with D slices:
24         ---
25         auto text = "happy:faces";
26         
27         assert (text[0 .. locate (text, ':')] == "happy");
28         
29         assert (text[0 .. locate (text, '!')] == "happy:faces");
30         ---
31 
32         The contains() function is more convenient for trivial lookup
33         cases:
34         ---
35         if (contains ("fubar", '!'))
36             ...
37         ---
38 
39         Note that where some functions expect a size_t as an argument, the
40         D template-matching algorithm will fail where an int is provided
41         instead. This is the typically the cause of "template not found"
42         errors. Also note that name overloading is not supported cleanly
43         by IFTI at this time, so is not applied here.
44 
45 
46         Applying the D "import alias" mechanism to this module is highly
47         recommended, in order to limit namespace pollution:
48         ---
49         import Util = tango.text.Util;
50 
51         auto s = Util.trim ("  foo ");
52         ---
53                 
54 
55         Function templates:
56         ---
57         trim (source)                               // trim whitespace
58         triml (source)                              // trim whitespace
59         trimr (source)                              // trim whitespace
60         strip (source, match)                       // trim elements
61         stripl (source, match)                      // trim elements
62         stripr (source, match)                      // trim elements
63         chopl (source, match)                       // trim pattern match
64         chopr (source, match)                       // trim pattern match
65         delimit (src, set)                          // split on delims
66         split (source, pattern)                     // split on pattern
67         splitLines (source);                        // split on lines
68         head (source, pattern, tail)                // split to head & tail
69         join (source, postfix, output)              // join text segments
70         prefix (dst, prefix, content...)            // prefix text segments
71         postfix (dst, postfix, content...)          // postfix text segments
72         combine (dst, prefix, postfix, content...)  // combine lotsa stuff
73         repeat (source, count, output)              // repeat source 
74         replace (source, match, replacement)        // replace chars
75         substitute (source, match, replacement)     // replace/remove matches
76         count (source, match)                       // count instances
77         contains (source, match)                    // has char?
78         containsPattern (source, match)             // has pattern?
79         index (source, match, start)                // find match index
80         locate (source, match, start)               // find char
81         locatePrior (source, match, start)          // find prior char
82         locatePattern (source, match, start);       // find pattern
83         locatePatternPrior (source, match, start);  // find prior pattern
84         indexOf (s*, match, length)                 // low-level lookup
85         mismatch (s1*, s2*, length)                 // low-level compare
86         matching (s1*, s2*, length)                 // low-level compare
87         isSpace (match)                             // is whitespace?
88         unescape(source, output)                    // convert '\' prefixes
89         layout (destination, format ...)            // featherweight printf
90         lines (str)                                 // foreach lines
91         quotes (str, set)                           // foreach quotes
92         delimiters (str, set)                       // foreach delimiters
93         patterns (str, pattern)                     // foreach patterns
94         ---
95 
96         Please note that any 'pattern' referred to within this module
97         refers to a pattern of characters, and not some kind of regex
98         descriptor. Use the Regex module for regex operation.
99 
100 *******************************************************************************/
101 
102 module tango.text.Util;
103 
104 /******************************************************************************
105 
106         Trim the provided array by stripping whitespace from both
107         ends. Returns a slice of the original content
108 
109 ******************************************************************************/
110 
111 
112 T[] trim(T) (T[] source)
113 {
114         auto head = source.ptr;
115         auto tail = head + source.length;
116 
117         while (head < tail && isSpace(*head))
118                ++head;
119 
120         while (tail > head && isSpace(*(tail-1)))
121                --tail;
122 
123         return head [0 .. tail - head];
124 }
125 
126 /******************************************************************************
127 
128         Trim the provided array by stripping whitespace from the left.
129         Returns a slice of the original content
130 
131 ******************************************************************************/
132 
133 T[] triml(T) (T[] source)
134 {
135         auto head = source.ptr;
136         auto tail = head + source.length;
137 
138         while (head < tail && isSpace(*head))
139                ++head;
140 
141         return head [0 .. tail - head];
142 }
143 
144 /******************************************************************************
145 
146         Trim the provided array by stripping whitespace from the right.
147         Returns a slice of the original content
148 
149 ******************************************************************************/
150 
151 T[] trimr(T) (T[] source)
152 {
153         auto head = source.ptr;
154         auto tail = head + source.length;
155 
156         while (tail > head && isSpace(*(tail-1)))
157                --tail;
158 
159         return head [0 .. tail - head];
160 }
161 
162 /******************************************************************************
163 
164         Trim the given array by stripping the provided match from
165         both ends. Returns a slice of the original content
166 
167 ******************************************************************************/
168 
169 T[] strip(T, S) (T[] source, S match)
170 {
171         auto head = source.ptr;
172         auto tail = head + source.length;
173 
174         while (head < tail && *head is match)
175                ++head;
176 
177         while (tail > head && *(tail-1) is match)
178                --tail;
179 
180         return head [0 .. tail - head];
181 }
182 
183 /******************************************************************************
184 
185         Trim the given array by stripping the provided match from
186         the left hand side. Returns a slice of the original content
187 
188 ******************************************************************************/
189 
190 T[] stripl(T, S) (T[] source, S match)
191 {
192         auto head = source.ptr;
193         auto tail = head + source.length;
194 
195         while (head < tail && *head is match)
196                ++head;
197 
198         return head [0 .. tail - head];
199 }
200 
201 /******************************************************************************
202 
203         Trim the given array by stripping the provided match from
204         the right hand side. Returns a slice of the original content
205 
206 ******************************************************************************/
207 
208 T[] stripr(T, S) (T[] source, S match)
209 {
210         auto head = source.ptr;
211         auto tail = head + source.length;
212 
213         while (tail > head && *(tail-1) is match)
214                --tail;
215 
216         return head [0 .. tail - head];
217 }
218 
219 /******************************************************************************
220 
221         Chop the given source by stripping the provided match from
222         the left hand side. Returns a slice of the original content
223 
224 ******************************************************************************/
225 
226 T[] chopl(T, S) (T[] source, S match)
227 {
228         if (match.length <= source.length)
229             if (source[0 .. match.length] == match)
230                 source = source [match.length .. $];
231 
232         return source;
233 }
234 
235 /******************************************************************************
236 
237         Chop the given source by stripping the provided match from
238         the right hand side. Returns a slice of the original content
239 
240 ******************************************************************************/
241 
242 T[] chopr(T, S) (T[] source, S match)
243 {
244         if (match.length <= source.length)
245             if (source[$-match.length .. $] == match)
246                 source = source [0 .. $-match.length];
247 
248         return source;
249 }
250 
251 /******************************************************************************
252 
253         Replace all instances of one element with another (in place)
254 
255 ******************************************************************************/
256 
257 T[] replace(T, S) (T[] source, S match, S replacement)
258 {
259         foreach (ref c; source)
260                  if (c is match)
261                      c = replacement;
262         return source;
263 }
264 
265 /******************************************************************************
266 
267         Substitute all instances of match from source. Set replacement
268         to null in order to remove instead of replace
269 
270 ******************************************************************************/
271 
272 T[] substitute(T) (const(T)[] source, const(T)[] match, const(T)[] replacement)
273 {
274         T[] output;
275 
276         foreach (s; patterns (source, match, replacement))
277                     output ~= s;
278         return output;
279 }
280 
281 /******************************************************************************
282 
283         Count all instances of match within source 
284 
285 ******************************************************************************/
286 
287 size_t count(T) (const(T)[] source, const(T)[] match)
288 {
289         size_t c;
290 
291         foreach (s; patterns (source, match))
292                     ++c;
293         assert(c > 0);
294         return c - 1;
295 }
296 
297 /******************************************************************************
298 
299         Returns whether or not the provided array contains an instance
300         of the given match
301         
302 ******************************************************************************/
303 
304 bool contains(T) (const(T)[] source, const(T) match)
305 {
306         return indexOf (source.ptr, match, source.length) != source.length;
307 }
308 
309 /******************************************************************************
310 
311         Returns whether or not the provided array contains an instance
312         of the given match
313         
314 ******************************************************************************/
315 
316 bool containsPattern(T) (const(T)[] source, const(T)[] match)
317 {
318         return locatePattern (source, match) != source.length;
319 }
320 
321 /******************************************************************************
322 
323         Return the index of the next instance of 'match' starting at
324         position 'start', or source.length where there is no match.
325 
326         Parameter 'start' defaults to 0
327 
328 ******************************************************************************/
329 size_t index(T) (const(T)[] source, const(T)[] match, size_t start=0)
330 {
331         return (match.length is 1) ? locate (source, match[0], start) 
332                                    : locatePattern (source, match, start);
333 }
334 
335 /******************************************************************************
336 
337         Return the index of the prior instance of 'match' starting
338         just before 'start', or source.length where there is no match.
339 
340         Parameter 'start' defaults to source.length
341 
342 ******************************************************************************/
343 size_t rindex(T) (const(T)[] source, const(T)[] match, size_t start=size_t.max)
344 {
345         return (match.length is 1) ? locatePrior (source, match[0], start) 
346                                    : locatePatternPrior (source, match, start);
347 }
348 
349 /******************************************************************************
350 
351         Return the index of the next instance of 'match' starting at
352         position 'start', or source.length where there is no match.
353 
354         Parameter 'start' defaults to 0
355 
356 ******************************************************************************/
357 size_t locate(T) (const(T)[] source, const(T) match, size_t start=0)
358 {
359         if (start > source.length)
360             start = source.length;
361         
362         return indexOf (source.ptr+start, match, source.length - start) + start;
363 }
364 
365 /******************************************************************************
366 
367         Return the index of the prior instance of 'match' starting
368         just before 'start', or source.length where there is no match.
369 
370         Parameter 'start' defaults to source.length
371 
372 ******************************************************************************/
373 size_t locatePrior(T) (const(T)[] source, const(T) match, size_t start=size_t.max)
374 {
375         if (start > source.length)
376             start = source.length;
377 
378         while (start > 0)
379                if (source[--start] is match)
380                    return start;
381         return source.length;
382 }
383 
384 /******************************************************************************
385 
386         Return the index of the next instance of 'match' starting at
387         position 'start', or source.length where there is no match. 
388 
389         Parameter 'start' defaults to 0
390 
391 ******************************************************************************/
392 size_t locatePattern(T) (const(T)[] source, const(T)[] match, size_t start=0)
393 {
394         size_t    idx;
395         const(T)* p = source.ptr + start;
396         size_t    extent = source.length - start - match.length + 1;
397 
398         if (match.length && extent <= source.length)
399             {
400             while (extent)
401                    if ((idx = indexOf (p, match[0], extent)) is extent)
402                         break;
403                    else
404                       if (matching (p+=idx, match.ptr, match.length))
405                           return p - source.ptr;
406                       else
407                          {
408                          extent -= (idx+1);
409                          ++p;
410                          }
411             }
412         return source.length;
413 }
414    
415 /******************************************************************************
416 
417         Return the index of the prior instance of 'match' starting
418         just before 'start', or source.length where there is no match.
419 
420         Parameter 'start' defaults to source.length
421 
422 ******************************************************************************/
423 size_t locatePatternPrior(T) (const(T)[] source, const(T)[] match, size_t start=size_t.max)
424 {
425         auto len = source.length;
426         
427         if (start > len)
428             start = len;
429 
430         if (match.length && match.length <= len)
431             while (start)
432                   {
433                   start = locatePrior (source, match[0], start);
434                   if (start is len)
435                       break;
436                   else
437                      if ((start + match.length) <= len)
438                           if (matching (source.ptr+start, match.ptr, match.length))
439                               return start;
440                   }
441 
442         return len;
443 }
444 
445 /******************************************************************************
446 
447         Split the provided array on the first pattern instance, and 
448         return the resultant head and tail. The pattern is excluded 
449         from the two segments. 
450 
451         Where a segment is not found, tail will be null and the return
452         value will be the original array.
453         
454 ******************************************************************************/
455 
456 T[] head(T, S) (T[] src, S[] pattern, out T[] tail)
457 {
458         auto i = locatePattern (src, pattern);
459         if (i != src.length)
460            {
461            tail = src [i + pattern.length .. $];
462            src = src [0 .. i];
463            }
464         return src;
465 }
466 
467 /******************************************************************************
468 
469         Split the provided array on the last pattern instance, and 
470         return the resultant head and tail. The pattern is excluded 
471         from the two segments. 
472 
473         Where a segment is not found, head will be null and the return
474         value will be the original array.
475         
476 ******************************************************************************/
477 
478 T[] tail(T, S) (T[] src, S[] pattern, out T[] head)
479 {
480         auto i = locatePatternPrior (src, pattern);
481         if (i != src.length)
482            {
483            head = src [0 .. i];
484            src = src [i + pattern.length .. $];
485            }
486         return src;
487 }
488 
489 /******************************************************************************
490 
491         Split the provided array wherever a delimiter-set instance is
492         found, and return the resultant segments. The delimiters are
493         excluded from each of the segments. Note that delimiters are
494         matched as a set of alternates rather than as a pattern.
495 
496         Splitting on a single delimiter is considerably faster than
497         splitting upon a set of alternatives. 
498 
499         Note that the src content is not duplicated by this function, 
500         but is sliced instead.
501 
502 ******************************************************************************/
503 
504 T[][] delimit(T, M) (T[] src, const(M)[] set)
505 {
506         T[][] result;
507 
508         foreach (segment; delimiters (src, set))
509                  result ~= segment;
510         return result;
511 }
512 
513 /******************************************************************************
514 
515         Split the provided array wherever a pattern instance is
516         found, and return the resultant segments. The pattern is
517         excluded from each of the segments.
518         
519         Note that the src content is not duplicated by this function, 
520         but is sliced instead.
521 
522 ******************************************************************************/
523 
524 inout(T)[][] split(T) (inout(T)[] src, const(T)[] pattern)
525 {
526         const(T)[][] result;
527 
528         foreach (segment; patterns (cast(const(T)[])src, pattern))
529                  result ~= segment;
530         return cast(inout(T)[][])result;
531 }
532 
533 /******************************************************************************
534 
535         Convert text into a set of lines, where each line is identified
536         by a \n or \r\n combination. The line terminator is stripped from
537         each resultant array
538 
539         Note that the src content is not duplicated by this function, but
540         is sliced instead.
541 
542 ******************************************************************************/
543 
544 alias toLines splitLines;
545 T[][] toLines(T) (T[] src)
546 {
547 
548         T[][] result;
549 
550         foreach (line; lines (src))
551                  result ~= line;
552         return result;
553 }
554 
555 /******************************************************************************
556 
557         Return the indexed line, where each line is identified by a \n 
558         or \r\n combination. The line terminator is stripped from the 
559         resultant line
560 
561         Note that src content is not duplicated by this function, but
562         is sliced instead.
563 
564 ******************************************************************************/
565 
566 T[] lineOf(T) (T[] src, size_t index)
567 {
568         int i = 0;
569         foreach (line; lines (src))
570                  if (i++ is index)
571                      return line;
572         return null;
573 }
574 
575 /******************************************************************************
576 
577         Combine a series of text segments together, each appended with 
578         a postfix pattern. An optional output buffer can be provided to
579         avoid heap activity - it should be large enough to contain the 
580         entire output, otherwise the heap will be used instead.
581 
582         Returns a valid slice of the output, containing the concatenated
583         text.
584 
585 ******************************************************************************/
586 
587 T[] join(T) (const(T[])[] src, const(T)[] postfix=null, T[] dst=null)
588 {
589         return combine!(T) (dst, null, postfix, src);
590 }
591 
592 /******************************************************************************
593 
594         Combine a series of text segments together, each prepended with 
595         a prefix pattern. An optional output buffer can be provided to 
596         avoid heap activity - it should be large enough to contain the 
597         entire output, otherwise the heap will be used instead.
598 
599         Note that, unlike join(), the output buffer is specified first
600         such that a set of trailing strings can be provided. 
601 
602         Returns a valid slice of the output, containing the concatenated
603         text.
604 
605 ******************************************************************************/
606 
607 T[] prefix(T) (T[] dst, const(T)[] prefix, const(T[])[] src...)
608 {
609         return combine!(T) (dst, prefix, null, src);
610 }
611 
612 /******************************************************************************
613 
614         Combine a series of text segments together, each appended with an 
615         optional postfix pattern. An optional output buffer can be provided
616         to avoid heap activity - it should be large enough to contain the 
617         entire output, otherwise the heap will be used instead.
618 
619         Note that, unlike join(), the output buffer is specified first
620         such that a set of trailing strings can be provided. 
621 
622         Returns a valid slice of the output, containing the concatenated
623         text.
624 
625 ******************************************************************************/
626 
627 T[] postfix(T) (T[] dst, const(T)[] postfix, const(T[])[] src...)
628 {
629         return combine!(T) (dst, null, postfix, src);
630 }
631 
632 /******************************************************************************
633 
634         Combine a series of text segments together, each prefixed and/or 
635         postfixed with optional strings. An optional output buffer can be 
636         provided to avoid heap activity - which should be large enough to 
637         contain the entire output, otherwise the heap will be used instead.
638 
639         Note that, unlike join(), the output buffer is specified first
640         such that a set of trailing strings can be provided. 
641 
642         Returns a valid slice of the output, containing the concatenated
643         text.
644 
645 ******************************************************************************/
646 
647 T[] combine(T) (T[] dst, const(T)[] prefix, const(T)[] postfix, const(T[])[] src ...)
648 {
649         size_t len = src.length * prefix.length + 
650                    src.length * postfix.length;
651 
652         foreach (segment; src)
653                  len += segment.length;
654                
655         if (dst.length < len)
656             dst.length = len;
657             
658         T* p = dst.ptr;
659         foreach (segment; src)
660                 {
661                 p[0 .. prefix.length] = prefix[];
662                 p += prefix.length;
663                 p[0 .. segment.length] = segment[];
664                 p += segment.length;
665                 p[0 .. postfix.length] = postfix[];
666                 p += postfix.length;
667                 }
668 
669         // remove trailing seperator
670         if (len)
671             len -= postfix.length;
672         return dst [0 .. len];       
673 }
674 
675 /******************************************************************************
676 
677         Repeat an array for a specific number of times. An optional output 
678         buffer can be provided to avoid heap activity - it should be large 
679         enough to contain the entire output, otherwise the heap will be used 
680         instead.
681 
682         Returns a valid slice of the output, containing the concatenated
683         text.
684 
685 ******************************************************************************/
686 
687 T[] repeat(T) (const(T)[] src, size_t count, T[] dst=null)
688 {
689         size_t len = src.length * count;
690         if (len is 0)
691             return null;
692 
693         if (dst.length < len)
694             dst.length = len;
695             
696         for (auto p = dst.ptr; count--; p += src.length)
697              p[0 .. src.length] = src[];
698 
699         return dst [0 .. len];
700 }
701 
702 /******************************************************************************
703 
704         Is the argument a whitespace character?
705 
706 ******************************************************************************/
707 
708 bool isSpace(T) (T c)
709 {
710         static if (T.sizeof is 1)
711                    return (c <= 32 && (c is ' ' || c is '\t' || c is '\r' || c is '\n' || c is '\f' || c is '\v'));
712         else
713            return (c <= 32 && (c is ' ' || c is '\t' || c is '\r' || c is '\n' || c is '\f' || c is '\v')) || (c is '\u2028' || c is '\u2029');
714 }
715 
716 /******************************************************************************
717 
718         Return whether or not the two arrays have matching content
719         
720 ******************************************************************************/
721 
722 bool matching(T) (const(T)* s1, const(T)* s2, size_t length)
723 {
724         return mismatch(s1, s2, length) is length;
725 }
726 
727 /******************************************************************************
728 
729         Returns the index of the first match in str, failing once
730         length is reached. Note that we return 'length' for failure
731         and a 0-based index on success
732 
733 ******************************************************************************/
734 
735 size_t indexOf(T) (const(T)* str, const(T) match, size_t length)
736 {
737         //assert (str);
738 
739         static if (T.sizeof == 1)
740                    enum : size_t {m1 = cast(size_t) 0x0101010101010101, 
741                                   m2 = cast(size_t) 0x8080808080808080}
742         static if (T.sizeof == 2)
743                    enum : size_t {m1 = cast(size_t) 0x0001000100010001, 
744                                   m2 = cast(size_t) 0x8000800080008000}
745         static if (T.sizeof == 4)
746                    enum : size_t {m1 = cast(size_t) 0x0000000100000001, 
747                                   m2 = cast(size_t) 0x8000000080000000}
748 
749         static if (T.sizeof < size_t.sizeof)
750         {
751                 if (length)
752                    {
753                    size_t m = match;
754                    m += m << (8 * T.sizeof);
755 
756                    static if (T.sizeof < size_t.sizeof / 2)
757                               m += (m << (8 * T.sizeof * 2));
758 
759                    static if (T.sizeof < size_t.sizeof / 4)
760                               m += (m << (8 * T.sizeof * 4));
761 
762                    auto p = str;
763                    auto e = p + length - size_t.sizeof/T.sizeof;
764                    while (p < e)
765                          {
766                          // clear matching T segments
767                          auto v = (*cast(size_t*) p) ^ m;
768                          // test for zero, courtesy of Alan Mycroft
769                          if ((v - m1) & ~v & m2)
770                               break;
771                          p += size_t.sizeof/T.sizeof;
772                          }
773 
774                    e += size_t.sizeof/T.sizeof;
775                    while (p < e)
776                           if (*p++ is match)
777                               return cast(size_t) (p - str - 1);
778                    }
779                 return length;
780         }
781         else
782         {
783                 auto len = length;
784                 for (auto p=str-1; len--;)
785                      if (*++p is match)
786                          return cast(size_t) (p - str);
787                 return length;
788         }
789 }
790 
791 /******************************************************************************
792 
793         Returns the index of a mismatch between s1 & s2, failing when
794         length is reached. Note that we return 'length' upon failure
795         (array content matches) and a 0-based index upon success.
796 
797         Use this as a faster opEquals. Also provides the basis for a
798         faster opCmp, since the index of the first mismatched character
799         can be used to determine the return value
800 
801 ******************************************************************************/
802 
803 size_t mismatch(T) (const(T)* s1, const(T)* s2, size_t length)
804 {
805         assert (s1 && s2);
806 
807         static if (T.sizeof < size_t.sizeof)
808         {
809                 if (length)
810                    {
811                    auto start = s1;
812                    auto e = start + length - size_t.sizeof/T.sizeof;
813 
814                    while (s1 < e)
815                          {
816                          if (*cast(size_t*) s1 != *cast(size_t*) s2)
817                              break;
818                          s1 += size_t.sizeof/T.sizeof;
819                          s2 += size_t.sizeof/T.sizeof;
820                          }
821 
822                    e += size_t.sizeof/T.sizeof;
823                    while (s1 < e)
824                           if (*s1++ != *s2++)
825                               return s1 - start - 1;
826                    }
827                 return length;
828         }
829         else
830         {
831                 auto len = length;
832                 for (auto p=s1-1; len--;)
833                      if (*++p != *s2++)
834                          return p - s1;
835                 return length;
836         }
837 }
838 
839 /******************************************************************************
840 
841         Iterator to isolate lines.
842 
843         Converts text into a set of lines, where each line is identified
844         by a \n or \r\n combination. The line terminator is stripped from
845         each resultant array.
846 
847         ---
848         foreach (line; lines ("one\ntwo\nthree"))
849                  ...
850         ---
851         
852 ******************************************************************************/
853 
854 LineFruct!(T) lines(T) (T[] src)
855 {
856         LineFruct!(T) lines;
857         lines.src = src;
858         return lines;
859 }
860 
861 /******************************************************************************
862 
863         Iterator to isolate text elements.
864 
865         Splits the provided array wherever a delimiter-set instance is
866         found, and return the resultant segments. The delimiters are
867         excluded from each of the segments. Note that delimiters are
868         matched as a set of alternates rather than as a pattern.
869 
870         Splitting on a single delimiter is considerably faster than
871         splitting upon a set of alternatives.
872 
873         ---
874         foreach (segment; delimiters ("one,two;three", ",;"))
875                  ...
876         ---
877         
878 ******************************************************************************/
879 
880 DelimFruct!(T, M) delimiters(T, M) (T[] src, const(M)[] set)
881 {
882         DelimFruct!(T, M) elements;
883         elements.set = set;
884         elements.src = src;
885         return elements;
886 }
887 
888 /******************************************************************************
889 
890         Iterator to isolate text elements.
891 
892         Split the provided array wherever a pattern instance is found, 
893         and return the resultant segments. Pattern are excluded from
894         each of the segments, and an optional sub argument enables 
895         replacement.
896         
897         ---
898         foreach (segment; patterns ("one, two, three", ", "))
899                  ...
900         ---
901         
902 ******************************************************************************/
903 
904 PatternFruct!(T) patterns(T) (const(T)[] src, const(T)[] pattern, const(T)[] sub=null)
905 {
906         PatternFruct!(T) elements;
907         elements.pattern = pattern;
908         elements.sub = sub;
909         elements.src = src;
910         return elements;
911 }
912 
913 /******************************************************************************
914 
915         Iterator to isolate optionally quoted text elements.
916 
917         As per elements(), but with the extension of being quote-aware;
918         the set of delimiters is ignored inside a pair of quotes. Note
919         that an unterminated quote will consume remaining content.
920         
921         ---
922         foreach (quote; quotes ("one two 'three four' five", " "))
923                  ...
924         ---
925         
926 ******************************************************************************/
927 
928 QuoteFruct!(T, M) quotes(T, M) (T[] src, const(M)[] set)
929 {
930         QuoteFruct!(T, M) quotes;
931         quotes.set = set;
932         quotes.src = src;
933         return quotes;
934 }
935 
936 /*******************************************************************************
937 
938         Arranges text strings in order, using indices to specify where
939         each particular argument should be positioned within the text. 
940         This is handy for collating I18N components, or as a simplistic
941         and lightweight formatter. Indices range from zero through nine. 
942         
943         ---
944         // write ordered text to the console
945         char[64] tmp;
946 
947         Cout (layout (tmp, "%1 is after %0", "zero", "one")).newline;
948         ---
949 
950 *******************************************************************************/
951 
952 T[] layout(T) (T[] output, const(T[])[] layout ...)
953 {
954         const(T)[] badarg  = cast(const(T)[])"{index out of range}";
955         const(T)[] toosmall = cast(const(T)[])"{output buffer too small}";
956         
957         size_t  pos,
958                 args;
959         bool    state;
960 
961         args = layout.length - 1;
962         foreach (c; layout[0])
963                 {
964                 if (state)
965                    {
966                    state = false;
967                    if (c >= '0' && c <= '9')
968                       {
969                       size_t index = c - '0';
970                       if (index < args)
971                          {
972                          const(T)[] x = layout[index+1];
973 
974                          size_t limit = pos + x.length;
975                          if (limit < output.length)
976                             {
977                             output [pos .. limit] = x[];
978                             pos = limit;
979                             continue;
980                             } 
981                          else
982                             return toosmall.dup;
983                          }
984                       else
985                          return badarg.dup;
986                       }
987                    }
988                 else
989                    if (c is '%')
990                       {
991                       state = true;
992                       continue;
993                       }
994 
995                 if (pos < output.length)
996                    {
997                    output[pos] = c;
998                    ++pos;
999                    }
1000                 else     
1001                    return toosmall.dup;
1002                 }
1003 
1004         return output [0..pos];
1005 }
1006 
1007 /******************************************************************************
1008 
1009         Convert 'escaped' chars to normal ones: \t => ^t for example.
1010         Supports \" \' \\ \a \b \f \n \r \t \v
1011         
1012 ******************************************************************************/
1013 
1014 inout(T)[] unescape(T) (inout(T)[] src, T[] dst = null)
1015 {
1016         size_t delta;
1017         auto s = src.ptr;
1018         auto len = src.length;
1019 
1020         // take a peek first to see if there's anything
1021         if ((delta = indexOf (s, '\\', len)) < len)
1022            {
1023            // make some room if not enough provided
1024            if (dst.length < src.length)
1025                dst.length = src.length;
1026            auto d = dst.ptr;
1027 
1028            // copy segments over, a chunk at a time
1029            do {
1030               d [0 .. delta] = s [0 .. delta];
1031               len -= delta;
1032               s += delta;
1033               d += delta;
1034 
1035               // bogus trailing '\'
1036               if (len < 2)
1037                  {
1038                  *d++ = '\\';
1039                  len = 0;
1040                  break;
1041                  }
1042 
1043               // translate \char
1044               T c = s[1];
1045               switch (c)
1046                      {
1047                       case '\\':
1048                            break;
1049                       case '\'':
1050                            c = '\'';
1051                            break;
1052                       case '"':
1053                            c = '"';
1054                            break;
1055                       case 'a':
1056                            c = '\a';
1057                            break;
1058                       case 'b':
1059                            c = '\b';
1060                            break;
1061                       case 'f':
1062                            c = '\f';
1063                            break;
1064                       case 'n':
1065                            c = '\n';
1066                            break;
1067                       case 'r':
1068                            c = '\r';
1069                            break;
1070                       case 't':
1071                            c = '\t';
1072                            break;
1073                       case 'v':
1074                            c = '\v';
1075                            break;
1076                       default:
1077                            *d++ = '\\';
1078                      }
1079               *d++ = c;  
1080               len -= 2;           
1081               s += 2;
1082               } while ((delta = indexOf (s, '\\', len)) < len);
1083 
1084            // copy tail too
1085            d [0 .. len] = s [0 .. len];
1086            return cast(inout)(dst [0 .. (d + len) - dst.ptr]);
1087            }
1088         return src;
1089 }
1090 
1091 
1092 /******************************************************************************
1093 
1094         jhash() -- hash a variable-length key into a 32-bit value
1095 
1096           k     : the key (the unaligned variable-length array of bytes)
1097           len   : the length of the key, counting by bytes
1098           level : can be any 4-byte value
1099 
1100         Returns a 32-bit value.  Every bit of the key affects every bit of
1101         the return value.  Every 1-bit and 2-bit delta achieves avalanche.
1102 
1103         About 4.3*len + 80 X86 instructions, with excellent pipelining
1104 
1105         The best hash table sizes are powers of 2.  There is no need to do
1106         mod a prime (mod is sooo slow!).  If you need less than 32 bits,
1107         use a bitmask.  For example, if you need only 10 bits, do
1108 
1109                     h = (h & hashmask(10));
1110 
1111         In which case, the hash table should have hashsize(10) elements.
1112         If you are hashing n strings (ub1 **)k, do it like this:
1113 
1114                     for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
1115 
1116         By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.  You may use 
1117         this code any way you wish, private, educational, or commercial.  
1118         It's free.
1119 
1120         See http://burtleburtle.net/bob/hash/evahash.html
1121         Use for hash table lookup, or anything where one collision in 2^32 
1122         is acceptable. Do NOT use for cryptographic purposes.
1123 
1124 ******************************************************************************/
1125 
1126 @trusted nothrow pure
1127 size_t jhash (const(ubyte)* k, size_t len, size_t c = 0)
1128 {
1129         size_t a = 0x9e3779b9,
1130              b = 0x9e3779b9,
1131              i = len;
1132 
1133         // handle most of the key 
1134         while (i >= 12) 
1135               {
1136               a += *cast(uint *)(k+0);
1137               b += *cast(uint *)(k+4);
1138               c += *cast(uint *)(k+8);
1139 
1140               a -= b; a -= c; a ^= (c>>13); 
1141               b -= c; b -= a; b ^= (a<<8); 
1142               c -= a; c -= b; c ^= (b>>13); 
1143               a -= b; a -= c; a ^= (c>>12);  
1144               b -= c; b -= a; b ^= (a<<16); 
1145               c -= a; c -= b; c ^= (b>>5); 
1146               a -= b; a -= c; a ^= (c>>3);  
1147               b -= c; b -= a; b ^= (a<<10); 
1148               c -= a; c -= b; c ^= (b>>15); 
1149               k += 12; i -= 12;
1150               }
1151 
1152         // handle the last 11 bytes 
1153         c += len;
1154         switch (i)
1155                {
1156                case 11: c+=(cast(uint)k[10]<<24); goto case;
1157                case 10: c+=(cast(uint)k[9]<<16); goto case;
1158                case 9 : c+=(cast(uint)k[8]<<8); goto case;
1159                case 8 : b+=(cast(uint)k[7]<<24); goto case;
1160                case 7 : b+=(cast(uint)k[6]<<16); goto case;
1161                case 6 : b+=(cast(uint)k[5]<<8); goto case;
1162                case 5 : b+=(cast(uint)k[4]); goto case;
1163                case 4 : a+=(cast(uint)k[3]<<24); goto case;
1164                case 3 : a+=(cast(uint)k[2]<<16); goto case;
1165                case 2 : a+=(cast(uint)k[1]<<8); goto case;
1166                case 1 : a+=(cast(uint)k[0]); goto default;
1167                default:
1168                }
1169 
1170         a -= b; a -= c; a ^= (c>>13); 
1171         b -= c; b -= a; b ^= (a<<8); 
1172         c -= a; c -= b; c ^= (b>>13); 
1173         a -= b; a -= c; a ^= (c>>12);  
1174         b -= c; b -= a; b ^= (a<<16); 
1175         c -= a; c -= b; c ^= (b>>5); 
1176         a -= b; a -= c; a ^= (c>>3);  
1177         b -= c; b -= a; b ^= (a<<10); 
1178         c -= a; c -= b; c ^= (b>>15); 
1179 
1180         return c;
1181 }
1182 
1183 /// ditto
1184 @trusted nothrow pure
1185 size_t jhash (const(void)[] x, size_t c = 0)
1186 {
1187         return jhash (cast(ubyte*) x.ptr, x.length, c);
1188 }
1189 
1190 
1191 /******************************************************************************
1192       
1193         Helper fruct for iterator lines(). A fruct is a low 
1194         impact mechanism for capturing context relating to an 
1195         opApply (conjunction of the names struct and foreach)
1196         
1197 ******************************************************************************/
1198 
1199 private struct LineFruct(T)
1200 {
1201         private T[] src;
1202 
1203         int opApply (scope int delegate (ref T[] line) dg)
1204         {
1205                 int        ret;
1206                 size_t     pos,
1207                            mark;
1208                 T[] line;
1209 
1210                 enum T nl = '\n';
1211                 enum T cr = '\r';
1212 
1213                 while ((pos = locate (src, nl, mark)) < src.length)
1214                       {
1215                       auto end = pos;
1216                       if (end && src[end-1] is cr)
1217                           --end;
1218 
1219                       line = src [mark .. end];
1220                       if ((ret = dg (line)) != 0)
1221                            return ret;
1222                       mark = pos + 1;
1223                       }
1224 
1225                 line = src [mark .. $];
1226                 if (mark <= src.length)
1227                     ret = dg (line);
1228 
1229                 return ret;
1230         }
1231 }
1232 
1233 /******************************************************************************
1234 
1235         Helper fruct for iterator delims(). A fruct is a low 
1236         impact mechanism for capturing context relating to an 
1237         opApply (conjunction of the names struct and foreach)
1238         
1239 ******************************************************************************/
1240 
1241 private struct DelimFruct(T, M)
1242 {
1243         private T[] src;
1244         private const(M)[] set;
1245 
1246         int opApply (scope int delegate (ref T[] token) dg)
1247         {
1248                 int     ret;
1249                 size_t  pos,
1250                         mark;
1251                 T[]     token;
1252 
1253                 // optimize for single delimiter case
1254                 if (set.length is 1)
1255                     while ((pos = locate (src, set[0], mark)) < src.length)
1256                           {
1257                           token = src [mark .. pos];
1258                           if ((ret = dg (token)) != 0)
1259                                return ret;
1260                           mark = pos + 1;
1261                           }
1262                 else
1263                    if (set.length > 1)
1264                        foreach (i, elem; src)
1265                                 if (contains (set, elem))
1266                                    {
1267                                    token = src [mark .. i];
1268                                    if ((ret = dg (token)) != 0)
1269                                         return ret;
1270                                    mark = i + 1;
1271                                    }
1272 
1273                 token = src [mark .. $];
1274                 if (mark <= src.length)
1275                     ret = dg (token);
1276 
1277                 return ret;
1278         }
1279 }
1280 
1281 /******************************************************************************
1282 
1283         Helper fruct for iterator patterns(). A fruct is a low 
1284         impact mechanism for capturing context relating to an 
1285         opApply (conjunction of the names struct and foreach)
1286         
1287 ******************************************************************************/
1288 
1289 private struct PatternFruct(T)
1290 {
1291         private const(T)[] src,
1292                            sub,
1293                            pattern;
1294 
1295         int opApply (scope int delegate (ref const(T)[] token) dg)
1296         {
1297                 int        ret;
1298                 size_t     pos,
1299                            mark;
1300                 const(T)[] token;
1301 
1302                 while ((pos = index (src, pattern, mark)) < src.length)
1303                       {
1304                       token = src [mark .. pos];
1305                       if ((ret = dg(token)) != 0)
1306                            return ret;
1307                       if (sub.ptr && (ret = dg(sub)) != 0)
1308                           return ret;
1309                       mark = pos + pattern.length;
1310                       }
1311 
1312                 token = src [mark .. $];
1313                 if (mark <= src.length)
1314                     ret = dg (token);
1315 
1316                 return ret;
1317         }
1318 }
1319 
1320 /******************************************************************************
1321 
1322         Helper fruct for iterator quotes(). A fruct is a low 
1323         impact mechanism for capturing context relating to an 
1324         opApply (conjunction of the names struct and foreach)
1325         
1326 ******************************************************************************/
1327 
1328 private struct QuoteFruct(T, M)
1329 {
1330         private T[]        src;
1331         private const(M)[] set;
1332         
1333         int opApply (scope int delegate (ref const(T)[] token) dg)
1334         {
1335                 int     ret;
1336                 size_t  mark;
1337                 const(T)[]     token;
1338 
1339                 if (set.length)
1340                     for (size_t i=0; i < src.length; ++i)
1341                         {
1342                         T c = src[i];
1343                         if (c is '"' || c is '\'')
1344                             i = locate (src, c, i+1);
1345                         else
1346                            if (contains (set, c))
1347                               {
1348                               token = src [mark .. i];
1349                               if ((ret = dg (token)) != 0)
1350                                    return ret;
1351                               mark = i + 1;
1352                               }
1353                         }
1354                 
1355                 token = src [mark .. $];
1356                 if (mark <= src.length)
1357                     ret = dg (token);
1358 
1359                 return ret;
1360         }
1361 }
1362 
1363 
1364 /******************************************************************************
1365 
1366 ******************************************************************************/
1367 
1368 debug (UnitTest)
1369 {    
1370         unittest
1371         {
1372         char[64] tmp;
1373 
1374         assert (isSpace (' ') && !isSpace ('d'));
1375 
1376         assert (indexOf ("abc".ptr, 'a', 3) is 0);
1377         assert (indexOf ("abc".ptr, 'b', 3) is 1);
1378         assert (indexOf ("abc".ptr, 'c', 3) is 2);
1379         assert (indexOf ("abc".ptr, 'd', 3) is 3);
1380         assert (indexOf ("abcabcabc".ptr, 'd', 9) is 9);
1381 
1382         assert (indexOf ("abc"d.ptr, cast(dchar)'c', 3) is 2);
1383         assert (indexOf ("abc"d.ptr, cast(dchar)'d', 3) is 3);
1384                                 
1385         assert (indexOf ("abc"w.ptr, cast(wchar)'c', 3) is 2);
1386         assert (indexOf ("abc"w.ptr, cast(wchar)'d', 3) is 3);
1387         assert (indexOf ("abcdefghijklmnopqrstuvwxyz"w.ptr, cast(wchar)'x', 25) is 23);
1388 
1389         assert (mismatch ("abc".ptr, "abc".ptr, 3) is 3);
1390         assert (mismatch ("abc".ptr, "abd".ptr, 3) is 2);
1391         assert (mismatch ("abc".ptr, "acc".ptr, 3) is 1);
1392         assert (mismatch ("abc".ptr, "ccc".ptr, 3) is 0);
1393 
1394         assert (mismatch ("abc"w.ptr, "abc"w.ptr, 3) is 3);
1395         assert (mismatch ("abc"w.ptr, "acc"w.ptr, 3) is 1);
1396                                                 
1397         assert (mismatch ("abc"d.ptr, "abc"d.ptr, 3) is 3);
1398         assert (mismatch ("abc"d.ptr, "acc"d.ptr, 3) is 1);
1399 
1400         assert (matching ("abc".ptr, "abc".ptr, 3));
1401         assert (matching ("abc".ptr, "abb".ptr, 3) is false);
1402         
1403         assert (contains ("abc", 'a'));
1404         assert (contains ("abc", 'b'));
1405         assert (contains ("abc", 'c'));
1406         assert (contains ("abc", 'd') is false);
1407 
1408         assert (containsPattern ("abc", "ab"));
1409         assert (containsPattern ("abc", "bc"));
1410         assert (containsPattern ("abc", "abc"));
1411         assert (containsPattern ("abc", "zabc") is false);
1412         assert (containsPattern ("abc", "abcd") is false);
1413         assert (containsPattern ("abc", "za") is false);
1414         assert (containsPattern ("abc", "cd") is false);
1415 
1416         assert (trim ("") == "");
1417         assert (trim (" abc  ") == "abc");
1418         assert (trim ("   ") == "");
1419 
1420         assert (strip ("", '%') == "");
1421         assert (strip ("%abc%%%", '%') == "abc");
1422         assert (strip ("#####", '#') == "");
1423         assert (stripl ("#####", '#') == "");
1424         assert (stripl (" ###", ' ') == "###");
1425         assert (stripl ("#####", 's') == "#####");
1426         assert (stripr ("#####", '#') == "");
1427         assert (stripr ("### ", ' ') == "###");
1428         assert (stripr ("#####", 's') == "#####");
1429 
1430         assert (replace ("abc".dup, 'b', ':') == "a:c");
1431         assert (substitute ("abc".dup, "bc", "x") == "ax");
1432 
1433         assert (locate ("abc", 'c', 1) is 2);
1434 
1435         assert (locate ("abc", 'c') is 2);
1436         assert (locate ("abc", 'a') is 0);
1437         assert (locate ("abc", 'd') is 3);
1438         assert (locate ("", 'c') is 0);
1439 
1440         assert (locatePrior ("abce", 'c') is 2);
1441         assert (locatePrior ("abce", 'a') is 0);
1442         assert (locatePrior ("abce", 'd') is 4);
1443         assert (locatePrior ("abce", 'c', 3) is 2);
1444         assert (locatePrior ("abce", 'c', 2) is 4);
1445         assert (locatePrior ("", 'c') is 0);
1446 
1447         auto x = delimit ("::b", ":");
1448         assert (x.length is 3 && x[0] == "" && x[1] == "" && x[2] == "b");
1449         x = delimit ("a:bc:d", ":");
1450         assert (x.length is 3 && x[0] == "a" && x[1] == "bc" && x[2] == "d");
1451         x = delimit ("abcd", ":");
1452         assert (x.length is 1 && x[0] == "abcd");
1453         x = delimit ("abcd:", ":");
1454         assert (x.length is 2 && x[0] == "abcd" && x[1] == "");
1455         x = delimit ("a;b$c#d:e@f", ";:$#@");
1456         assert (x.length is 6 && x[0]=="a" && x[1]=="b" && x[2]=="c" &&
1457                                  x[3]=="d" && x[4]=="e" && x[5]=="f");
1458 
1459         assert (locatePattern ("abcdefg", "") is 7);
1460         assert (locatePattern ("abcdefg", "g") is 6);
1461         assert (locatePattern ("abcdefg", "abcdefg") is 0);
1462         assert (locatePattern ("abcdefg", "abcdefgx") is 7);
1463         assert (locatePattern ("abcdefg", "cce") is 7);
1464         assert (locatePattern ("abcdefg", "cde") is 2);
1465         assert (locatePattern ("abcdefgcde", "cde", 3) is 7);
1466 
1467         assert (locatePatternPrior ("abcdefg", "") is 7);
1468         assert (locatePatternPrior ("abcdefg", "cce") is 7);
1469         assert (locatePatternPrior ("abcdefg", "cde") is 2);
1470         assert (locatePatternPrior ("abcdefgcde", "cde", 6) is 2);
1471         assert (locatePatternPrior ("abcdefgcde", "cde", 4) is 2);
1472         assert (locatePatternPrior ("abcdefg", "abcdefgx") is 7);
1473 
1474         x = splitLines ("a\nb\n");
1475         assert (x.length is 3 && x[0] == "a" && x[1] == "b" && x[2] == "");
1476         x = splitLines ("a\r\n");
1477         assert (x.length is 2 && x[0] == "a" && x[1] == "");
1478 
1479         x = splitLines ("a");
1480         assert (x.length is 1 && x[0] == "a");
1481         x = splitLines ("");
1482         assert (x.length is 1);
1483 
1484         const(char)[][] q;
1485         foreach (element; quotes ("1 'avcc   cc ' 3", " "))
1486                  q ~= element;
1487         assert (q.length is 3 && q[0] == "1" && q[1] == "'avcc   cc '" && q[2] == "3");
1488 
1489         assert (layout (tmp, "%1,%%%c %0", "abc", "efg") == "efg,%c abc");
1490 
1491         x = split ("one, two, three", ",");
1492         assert (x.length is 3 && x[0] == "one" && x[1] == " two" && x[2] == " three");
1493         x = split ("one, two, three", ", ");
1494         assert (x.length is 3 && x[0] == "one" && x[1] == "two" && x[2] == "three");
1495         x = split ("one, two, three", ",,");
1496         assert (x.length is 1 && x[0] == "one, two, three");
1497         x = split ("one,,", ",");
1498         assert (x.length is 3 && x[0] == "one" && x[1] == "" && x[2] == "");
1499 
1500         immutable(char)[] h, t;
1501         h =  head ("one:two:three", ":", t);
1502         assert (h == "one" && t == "two:three");
1503         h = head ("one:::two:three", ":::", t);
1504         assert (h == "one" && t == "two:three");
1505         h = head ("one:two:three", "*", t);
1506         assert (h == "one:two:three" && t is null);
1507 
1508         t =  tail ("one:two:three", ":", h);
1509         assert (h == "one:two" && t == "three");
1510         t = tail ("one:::two:three", ":::", h);
1511         assert (h == "one" && t == "two:three");
1512         t = tail ("one:two:three", "*", h);
1513         assert (t == "one:two:three" && h is null);
1514 
1515         assert (chopl("hello world", "hello ") == "world");
1516         assert (chopl("hello", "hello") == "");
1517         assert (chopl("hello world", " ") == "hello world");
1518         assert (chopl("hello world", "") == "hello world");
1519 
1520         assert (chopr("hello world", " world") == "hello");
1521         assert (chopr("hello", "hello") == "");
1522         assert (chopr("hello world", " ") == "hello world");
1523         assert (chopr("hello world", "") == "hello world");
1524 
1525         const(char)[][] foo = ["one", "two", "three"];
1526         auto j = join (foo);
1527         assert (j == "onetwothree");
1528         j = join (foo, ", ");
1529         assert (j == "one, two, three");
1530         j = join (foo, " ", tmp);
1531         assert (j == "one two three");
1532         assert (j.ptr is tmp.ptr);
1533 
1534         assert (repeat ("abc", 0) == "");
1535         assert (repeat ("abc", 1) == "abc");
1536         assert (repeat ("abc", 2) == "abcabc");
1537         assert (repeat ("abc", 4) == "abcabcabcabc");
1538         assert (repeat ("", 4) == "");
1539         char[10] rep;
1540         assert (repeat ("abc", 0, rep) == "");
1541         assert (repeat ("abc", 1, rep) == "abc");
1542         assert (repeat ("abc", 2, rep) == "abcabc");
1543         assert (repeat ("", 4, rep) == "");
1544 
1545         assert (unescape ("abc") == "abc");
1546         assert (unescape ("abc\\") == "abc\\");
1547         assert (unescape ("abc\\t") == "abc\t");
1548         assert (unescape ("abc\\tc") == "abc\tc");
1549         assert (unescape ("\\t") == "\t");
1550         assert (unescape ("\\tx") == "\tx");
1551         assert (unescape ("\\v\\vx") == "\v\vx");
1552         assert (unescape ("abc\\t\\a\\bc") == "abc\t\a\bc");
1553         }
1554 }
1555 
1556 
1557 
1558 debug (Util)
1559 {
1560         auto x = import("Util.d");
1561         
1562         void main()
1563         {
1564                 mismatch ("".ptr, S(x).ptr, 0);
1565                 indexOf ("".ptr, '@', 0);
1566                 char[] s;
1567                 split (s, " ");
1568                 //indexOf (s.ptr, '@', 0);
1569 
1570         }
1571 }