1 /*******************************************************************************
2 
3         copyright:      Copyright (c) 2009 Kris Bell. All rights reserved
4 
5         license:        BSD style: $(LICENSE)
6 
7         version:        May 2009: Initial release
8 
9         since:          0.99.9
10 
11         author:         Kris
12 
13 *******************************************************************************/
14 
15 module tango.text.Search;
16 
17 private import Util = tango.text.Util;
18 
19 /******************************************************************************
20 
21         Returns a lightweight pattern matcher, good for short patterns 
22         and/or short to medium length content. Brute-force approach with
23         fast multi-byte comparisons
24 
25 ******************************************************************************/
26 
27 FindFruct!(T) find(T) (const(T)[] what)
28 {
29         return FindFruct!(T) (what);
30 }
31 
32 /******************************************************************************
33 
34         Returns a welterweight pattern matcher, good for long patterns 
35         and/or extensive content. Based on the QS algorithm which is a
36         Boyer-Moore variant. Does not allocate memory for the alphabet.
37 
38         Generally becomes faster as the match-length grows
39 
40 ******************************************************************************/
41 
42 SearchFruct!(T) search(T) (const(T)[] what)
43 {
44         return SearchFruct!(T) (what);
45 }
46 
47 /******************************************************************************
48 
49         Convenient bundle of lightweight find utilities, without the
50         hassle of IFTI problems. Create one of these using the find() 
51         function:
52         ---
53         auto match = find ("foo");
54         auto content = "wumpus foo bar"
55 
56         // search in the forward direction
57         auto index = match.forward (content);
58         assert (index is 7);
59 
60         // search again - returns length when no match found
61         assert (match.forward(content, index+1) is content.length);
62         ---
63 
64         Searching operates both forward and backward, with an optional
65         start offset (can be more convenient than slicing the content).
66         There are methods to replace matches within given content, and 
67         others which return foreach() iterators for traversing content.
68 
69         SearchFruct is a more sophisticated variant, which operates more
70         efficiently on longer matches and/or more extensive content.
71 
72 ******************************************************************************/
73 
74 package struct FindFruct(T)
75 {       
76         private const(T)[] what;
77 
78         /***********************************************************************
79 
80                 Search forward in the given content, starting at the 
81                 optional index.
82 
83                 Returns the index of a match, or content.length where
84                 no match was located.
85 
86         ***********************************************************************/
87 
88         size_t forward (const(T)[] content, size_t ofs = 0)
89         {
90                 return Util.index (content, what, ofs);
91         }
92 
93         /***********************************************************************
94 
95                 Search backward in the given content, starting at the 
96                 optional index.
97 
98                 Returns the index of a match, or content.length where
99                 no match was located.
100 
101         ***********************************************************************/
102 
103         size_t reverse (const(T)[] content, size_t ofs = size_t.max)
104         {       
105                 return Util.rindex (content, what, ofs);
106         }
107 
108         /***********************************************************************
109 
110                 Return the match text
111 
112         ***********************************************************************/
113 
114         @property
115         const(T)[] match ()
116         {
117                 return what;
118         }
119 
120         /***********************************************************************
121 
122                 Reset the text to match
123 
124         ***********************************************************************/
125 
126         @property
127         void match (const(T)[] what)
128         {
129                 this.what = what;
130         }
131 
132         /***********************************************************************
133 
134                 Returns true if there is a match within the given content
135 
136         ***********************************************************************/
137 
138         bool within (const(T)[] content)
139         {       
140                 return forward(content) != content.length;
141         }
142 
143         /***********************************************************************
144                 
145                 Returns number of matches within the given content
146 
147         ***********************************************************************/
148 
149         size_t count (const(T)[] content)
150         {       
151                 size_t mark, count;
152 
153                 while ((mark = Util.index (content, what, mark)) != content.length)
154                         ++count, ++mark;
155                 return count;
156         }
157 
158         /***********************************************************************
159 
160                 Replace all matches with the given character. Use method
161                 tokens() instead to avoid heap activity.
162 
163                 Returns a copy of the content with replacements made
164 
165         ***********************************************************************/
166 
167         T[] replace (const(T)[] content, T chr)
168         {     
169                 return replace (content, (&chr)[0..1]);  
170         }
171 
172         /***********************************************************************
173 
174                 Replace all matches with the given substitution. Use 
175                 method tokens() instead to avoid heap activity.
176 
177                 Returns a copy of the content with replacements made
178 
179         ***********************************************************************/
180 
181         T[] replace (const(T)[] content, const(T)[] sub = null)
182         {  
183                 T[] output;
184 
185                 foreach (s; tokens (content, sub))
186                          output ~= s;
187                 return output;
188         }
189 
190         /***********************************************************************
191 
192                 Returns a foreach() iterator which exposes text segments
193                 between all matches within the given content. Substitution
194                 text is also injected in place of each match, and null can
195                 be used to indicate removal instead:
196                 ---
197                 char[] result;
198 
199                 auto match = find ("foo");
200                 foreach (token; match.tokens ("$foo&&foo*", "bar"))
201                          result ~= token;
202                 assert (result == "$bar&&bar*");
203                 ---
204                 
205                 This mechanism avoids internal heap activity.                
206 
207         ***********************************************************************/
208 
209         Util.PatternFruct!(T) tokens (const(T)[] content, const(T)[] sub = null)
210         {
211                 return Util.patterns (content, what, sub);
212         }
213         
214         /***********************************************************************
215 
216                 Returns a foreach() iterator which exposes the indices of
217                 all matches within the given content:
218                 ---
219                 int count;
220 
221                 auto f = find ("foo");
222                 foreach (index; f.indices("$foo&&foo*"))
223                          ++count;
224                 assert (count is 2);
225                 ---
226 
227         ***********************************************************************/
228 
229         Indices indices (const(T)[] content)
230         {
231                 return Indices (what, content);
232         }
233  
234         /***********************************************************************
235 
236                 Simple foreach() iterator
237 
238         ***********************************************************************/
239 
240         private struct Indices
241         {
242                 const(T)[]     what,
243                                content;
244 
245                 int opApply (scope int delegate (ref size_t index) dg)
246                 {
247                         int    ret;
248                         size_t mark;
249 
250                         while ((mark = Util.index(content, what, mark)) != content.length)                        
251                                 if ((ret = dg(mark)) is 0)                                
252                                      ++mark;       
253                                 else
254                                    break;                        
255                         return ret;   
256                 }     
257         } 
258 }
259 
260 
261 /******************************************************************************
262 
263         Convenient bundle of welterweight search utilities, without the
264         hassle of IFTI problems. Create one of these using the search() 
265         function:
266         ---
267         auto match = search ("foo");
268         auto content = "wumpus foo bar"
269 
270         // search in the forward direction
271         auto index = match.forward (content);
272         assert (index is 7);
273 
274         // search again - returns length when no match found
275         assert (match.forward(content, index+1) is content.length);
276         ---
277 
278         Searching operates both forward and backward, with an optional
279         start offset (can be more convenient than slicing the content).
280         There are methods to replace matches within given content, and 
281         others which return foreach() iterators for traversing content.
282 
283         FindFruct is a simpler variant, which can operate efficiently on 
284         short matches and/or short content (employs brute-force strategy)
285 
286 ******************************************************************************/
287 
288 package struct SearchFruct(T)
289 {
290         private const(T)[]             what;
291         private bool                   fore;
292         private int[256]               offsets = void;
293 
294         /***********************************************************************
295 
296                 Construct the fruct
297 
298         ***********************************************************************/
299 
300         static SearchFruct opCall (const(T)[] what) 
301         {
302                 SearchFruct find = void;
303                 find.match = what;
304                 return find;
305         }
306         
307         /***********************************************************************
308 
309                 Return the match text
310 
311         ***********************************************************************/
312 
313         @property
314         const(T)[] match ()
315         {
316                 return what;
317         }
318 
319         /***********************************************************************
320 
321                 Reset the text to match
322 
323         ***********************************************************************/
324 
325         @property
326         void match (const(T)[] what)
327         {
328                 offsets[] = cast(int)(what.length + 1);
329                 this.fore = true;
330                 this.what = what;
331                 reset();
332         }
333 
334         /***********************************************************************
335 
336                 Search forward in the given content, starting at the 
337                 optional index.
338 
339                 Returns the index of a match, or content.length where
340                 no match was located.
341 
342         ***********************************************************************/
343 
344         size_t forward (const(T)[] content, size_t ofs = 0) 
345         {
346                 if (! fore)
347                       flip();
348 
349                 if (ofs > content.length)
350                     ofs = content.length;
351 
352                 return find (cast(char*) what.ptr, what.length * T.sizeof, 
353                              cast(char*) content.ptr, content.length * T.sizeof, 
354                              ofs * T.sizeof) / T.sizeof;
355         }
356 
357         /***********************************************************************
358 
359                 Search backward in the given content, starting at the 
360                 optional index.
361 
362                 Returns the index of a match, or content.length where
363                 no match was located.
364 
365         ***********************************************************************/
366 
367         size_t reverse (const(T)[] content, size_t ofs = size_t.max) 
368         {
369                 if (fore)
370                     flip();
371 
372                 if (ofs > content.length)
373                     ofs = content.length;
374 
375                 return rfind (cast(char*) what.ptr, what.length * T.sizeof, 
376                               cast(char*) content.ptr, content.length * T.sizeof, 
377                               ofs * T.sizeof) / T.sizeof;
378         }
379 
380         /***********************************************************************
381 
382                 Returns true if there is a match within the given content
383 
384         ***********************************************************************/
385 
386         bool within (const(T)[] content)
387         {       
388                 return forward(content) != content.length;
389         }
390 
391         /***********************************************************************
392                 
393                 Returns number of matches within the given content
394 
395         ***********************************************************************/
396 
397         size_t count (const(T)[] content)
398         {       
399                 size_t mark, count;
400 
401                 while ((mark = forward (content, mark)) != content.length)
402                         ++count, ++mark;
403                 return count;
404         }
405 
406         /***********************************************************************
407 
408                 Replace all matches with the given character. Use method
409                 tokens() instead to avoid heap activity.
410 
411                 Returns a copy of the content with replacements made
412 
413         ***********************************************************************/
414 
415         T[] replace (const(T)[] content, T chr)
416         {     
417                 return replace (content, (&chr)[0..1]);  
418         }
419 
420         /***********************************************************************
421 
422                 Replace all matches with the given substitution. Use 
423                 method tokens() instead to avoid heap activity.
424 
425                 Returns a copy of the content with replacements made
426 
427         ***********************************************************************/
428 
429         T[] replace (const(T)[] content, const(T)[] sub = null)
430         {  
431                 T[] output;
432 
433                 foreach (s; tokens (content, sub))
434                          output ~= s;
435                 return output;
436         }
437 
438         /***********************************************************************
439 
440                 Returns a foreach() iterator which exposes text segments
441                 between all matches within the given content. Substitution
442                 text is also injected in place of each match, and null can
443                 be used to indicate removal instead:
444                 ---
445                 char[] result;
446 
447                 auto match = search ("foo");
448                 foreach (token; match.tokens("$foo&&foo*", "bar"))
449                          result ~= token;
450                 assert (result == "$bar&&bar*");
451                 ---
452                 
453                 This mechanism avoids internal heap activity             
454 
455         ***********************************************************************/
456 
457         Substitute tokens (const(T)[] content, const(T)[] sub = null)
458         {
459                 return Substitute (sub, what, content, &forward);
460         }
461         
462         /***********************************************************************
463 
464                 Returns a foreach() iterator which exposes the indices of
465                 all matches within the given content:
466                 ---
467                 int count;
468 
469                 auto match = search ("foo");
470                 foreach (index; match.indices("$foo&&foo*"))
471                          ++count;
472                 assert (count is 2);
473                 ---
474 
475         ***********************************************************************/
476 
477         Indices indices (const(T)[] content)
478         {
479                 return Indices (content, &forward);
480         }
481         
482         /***********************************************************************
483 
484         ***********************************************************************/
485 
486         private size_t find (char* what, size_t wlen, char* content, size_t len, size_t ofs) 
487         {
488                 auto s = content;
489                 content += ofs;
490                 auto e = s + len - wlen;
491                 while (content <= e)
492                        if (*what is *content && matches(what, content, wlen))
493                            return content - s;
494                        else
495                           content += offsets [content[wlen]];
496                 return len;
497         }
498 
499         /***********************************************************************
500 
501         ***********************************************************************/
502 
503         private size_t rfind (char* what, size_t wlen, char* content, size_t len, size_t ofs) 
504         {
505                 auto s = content;
506                 auto e = s + ofs - wlen;
507                 while (e >= content)
508                        if (*what is *e && matches(what, e, wlen))
509                            return e - s;
510                        else
511                           e -= offsets [*(e-1)];
512                 return len;
513         }
514 
515         /***********************************************************************
516 
517         ***********************************************************************/
518 
519         private static bool matches (char* a, char* b, size_t length)
520         {
521                 while (length > size_t.sizeof)
522                        if (*cast(size_t*) a is *cast(size_t*) b)
523                             a += size_t.sizeof, b += size_t.sizeof, length -= size_t.sizeof;
524                        else
525                           return false;
526 
527                 while (length--)
528                        if (*a++ != *b++) 
529                            return false;
530                 return true;
531         }
532 
533         /***********************************************************************
534 
535                 Construct lookup table. We force the alphabet to be char[]
536                 always, and consider wider characters to be longer patterns
537                 instead
538 
539         ***********************************************************************/
540 
541         private void reset ()
542         {
543                 auto what = cast(char[]) this.what;
544                 if (fore)   
545                     for (int i=0; i < cast(int)what.length; ++i)
546                          offsets[what[i]] = cast(int)what.length - i;
547                 else
548                    for (int i=cast(int)what.length; i--;)
549                         offsets[what[i]] = cast(int)(i+1);
550         }
551 
552         /***********************************************************************
553 
554                 Reverse lookup-table direction
555 
556         ***********************************************************************/
557 
558         private void flip ()
559         {
560                 fore ^= true;
561                 reset();
562         }
563 
564         /***********************************************************************
565 
566                 Simple foreach() iterator
567 
568         ***********************************************************************/
569 
570         private struct Indices
571         {
572                 const(T)[]    content;
573                 size_t delegate(const(T)[], size_t) call;
574 
575                 int opApply (scope int delegate (ref size_t index) dg)
576                 {
577                         int     ret;
578                         size_t  mark;
579 
580                         while ((mark = call(content, mark)) != content.length)
581                                 if ((ret = dg(mark)) is 0)
582                                      ++mark;
583                                 else
584                                    break;
585                         return ret;   
586                 }     
587         } 
588 
589         /***********************************************************************
590 
591                 Substitution foreach() iterator
592 
593         ***********************************************************************/
594 
595         private struct Substitute
596         {
597                 private const(T)[] sub, 
598                                    what,
599                                    content;
600                 size_t             delegate(const(T)[], size_t) call;
601 
602                 int opApply (scope int delegate (ref const(T)[] token) dg)
603                 {
604                         int        ret;
605                         size_t     pos,
606                                    mark;
607                         const(T)[] token;
608 
609                         while ((pos = call (content, mark)) < content.length)
610                               {
611                               token = content [mark .. pos];
612                               if ((ret = dg(token)) != 0)
613                                    return ret;
614                               if (sub.ptr && (ret = dg(sub)) != 0)
615                                   return ret;
616                               mark = pos + what.length;
617                               }
618 
619                         token = content [mark .. $];
620                         if (mark <= content.length)
621                             ret = dg (token);
622                         return ret;
623                 }
624         }
625 }
626 
627 
628 
629 
630 /******************************************************************************
631 
632 ******************************************************************************/
633 
634 debug (Search)
635 {
636         import tango.io.Stdout;
637         import tango.time.StopWatch;
638 
639         auto x = import("Search.d");
640         
641         void main()
642         {
643                 StopWatch elapsed;
644         
645                 auto match = search("foo");
646                 auto index = match.reverse ("foo foo");
647                 assert (index is 4);
648                 index = match.reverse ("foo foo", index);
649                 assert (index is 0);
650                 index = match.reverse ("foo foo", 1);
651                 assert (index is 7);
652 
653                 foreach (index; find("delegate").indices(x))
654                          Stdout.formatln ("< {}", index);
655 
656                 foreach (index; search("delegate").indices(x))
657                          Stdout.formatln ("> {}", index);
658 
659                 elapsed.start;
660                 for (auto i=5000; i--;)
661                      Util.mismatch (x.ptr, x.ptr, x.length);
662                 Stdout.formatln ("mismatch {}", elapsed.stop);
663 
664                 elapsed.start;
665                 for (auto i=5000; i--;)
666                      Util.indexOf (x.ptr, '@', cast(uint) x.length);
667                 Stdout.formatln ("indexOf {}", elapsed.stop);
668 
669                 elapsed.start;
670                 for (auto i=5000; i--;)
671                      Util.locatePattern (x, "indexOf {}");
672                 Stdout.formatln ("pattern {}", elapsed.stop);
673 
674                 elapsed.start;
675                 auto f = find ("indexOf {}");
676                 for (auto i=5000; i--;)
677                      f.forward(x);
678                 Stdout.formatln ("find {}", elapsed.stop);
679 
680                 elapsed.start;
681                 auto s = search ("indexOf {}");
682                 for (auto i=5000; i--;)
683                      s.forward(x);
684                 Stdout.formatln ("search {}", elapsed.stop);
685         }
686 }
687