1 /******************************************************************************* 2 3 copyright: Copyright (c) 2009 Kris Bell. All rights reserved 4 5 license: BSD style: $(LICENSE) 6 7 version: May 2009: Initial release 8 9 since: 0.99.9 10 11 author: Kris 12 13 *******************************************************************************/ 14 15 module tango.text.Search; 16 17 private import Util = tango.text.Util; 18 19 /****************************************************************************** 20 21 Returns a lightweight pattern matcher, good for short patterns 22 and/or short to medium length content. Brute-force approach with 23 fast multi-byte comparisons 24 25 ******************************************************************************/ 26 27 FindFruct!(T) find(T) (const(T)[] what) 28 { 29 return FindFruct!(T) (what); 30 } 31 32 /****************************************************************************** 33 34 Returns a welterweight pattern matcher, good for long patterns 35 and/or extensive content. Based on the QS algorithm which is a 36 Boyer-Moore variant. Does not allocate memory for the alphabet. 37 38 Generally becomes faster as the match-length grows 39 40 ******************************************************************************/ 41 42 SearchFruct!(T) search(T) (const(T)[] what) 43 { 44 return SearchFruct!(T) (what); 45 } 46 47 /****************************************************************************** 48 49 Convenient bundle of lightweight find utilities, without the 50 hassle of IFTI problems. Create one of these using the find() 51 function: 52 --- 53 auto match = find ("foo"); 54 auto content = "wumpus foo bar" 55 56 // search in the forward direction 57 auto index = match.forward (content); 58 assert (index is 7); 59 60 // search again - returns length when no match found 61 assert (match.forward(content, index+1) is content.length); 62 --- 63 64 Searching operates both forward and backward, with an optional 65 start offset (can be more convenient than slicing the content). 66 There are methods to replace matches within given content, and 67 others which return foreach() iterators for traversing content. 68 69 SearchFruct is a more sophisticated variant, which operates more 70 efficiently on longer matches and/or more extensive content. 71 72 ******************************************************************************/ 73 74 package struct FindFruct(T) 75 { 76 private const(T)[] what; 77 78 /*********************************************************************** 79 80 Search forward in the given content, starting at the 81 optional index. 82 83 Returns the index of a match, or content.length where 84 no match was located. 85 86 ***********************************************************************/ 87 88 size_t forward (const(T)[] content, size_t ofs = 0) 89 { 90 return Util.index (content, what, ofs); 91 } 92 93 /*********************************************************************** 94 95 Search backward in the given content, starting at the 96 optional index. 97 98 Returns the index of a match, or content.length where 99 no match was located. 100 101 ***********************************************************************/ 102 103 size_t reverse (const(T)[] content, size_t ofs = size_t.max) 104 { 105 return Util.rindex (content, what, ofs); 106 } 107 108 /*********************************************************************** 109 110 Return the match text 111 112 ***********************************************************************/ 113 114 @property 115 const(T)[] match () 116 { 117 return what; 118 } 119 120 /*********************************************************************** 121 122 Reset the text to match 123 124 ***********************************************************************/ 125 126 @property 127 void match (const(T)[] what) 128 { 129 this.what = what; 130 } 131 132 /*********************************************************************** 133 134 Returns true if there is a match within the given content 135 136 ***********************************************************************/ 137 138 bool within (const(T)[] content) 139 { 140 return forward(content) != content.length; 141 } 142 143 /*********************************************************************** 144 145 Returns number of matches within the given content 146 147 ***********************************************************************/ 148 149 size_t count (const(T)[] content) 150 { 151 size_t mark, count; 152 153 while ((mark = Util.index (content, what, mark)) != content.length) 154 ++count, ++mark; 155 return count; 156 } 157 158 /*********************************************************************** 159 160 Replace all matches with the given character. Use method 161 tokens() instead to avoid heap activity. 162 163 Returns a copy of the content with replacements made 164 165 ***********************************************************************/ 166 167 T[] replace (const(T)[] content, T chr) 168 { 169 return replace (content, (&chr)[0..1]); 170 } 171 172 /*********************************************************************** 173 174 Replace all matches with the given substitution. Use 175 method tokens() instead to avoid heap activity. 176 177 Returns a copy of the content with replacements made 178 179 ***********************************************************************/ 180 181 T[] replace (const(T)[] content, const(T)[] sub = null) 182 { 183 T[] output; 184 185 foreach (s; tokens (content, sub)) 186 output ~= s; 187 return output; 188 } 189 190 /*********************************************************************** 191 192 Returns a foreach() iterator which exposes text segments 193 between all matches within the given content. Substitution 194 text is also injected in place of each match, and null can 195 be used to indicate removal instead: 196 --- 197 char[] result; 198 199 auto match = find ("foo"); 200 foreach (token; match.tokens ("$foo&&foo*", "bar")) 201 result ~= token; 202 assert (result == "$bar&&bar*"); 203 --- 204 205 This mechanism avoids internal heap activity. 206 207 ***********************************************************************/ 208 209 Util.PatternFruct!(T) tokens (const(T)[] content, const(T)[] sub = null) 210 { 211 return Util.patterns (content, what, sub); 212 } 213 214 /*********************************************************************** 215 216 Returns a foreach() iterator which exposes the indices of 217 all matches within the given content: 218 --- 219 int count; 220 221 auto f = find ("foo"); 222 foreach (index; f.indices("$foo&&foo*")) 223 ++count; 224 assert (count is 2); 225 --- 226 227 ***********************************************************************/ 228 229 Indices indices (const(T)[] content) 230 { 231 return Indices (what, content); 232 } 233 234 /*********************************************************************** 235 236 Simple foreach() iterator 237 238 ***********************************************************************/ 239 240 private struct Indices 241 { 242 const(T)[] what, 243 content; 244 245 int opApply (scope int delegate (ref size_t index) dg) 246 { 247 int ret; 248 size_t mark; 249 250 while ((mark = Util.index(content, what, mark)) != content.length) 251 if ((ret = dg(mark)) is 0) 252 ++mark; 253 else 254 break; 255 return ret; 256 } 257 } 258 } 259 260 261 /****************************************************************************** 262 263 Convenient bundle of welterweight search utilities, without the 264 hassle of IFTI problems. Create one of these using the search() 265 function: 266 --- 267 auto match = search ("foo"); 268 auto content = "wumpus foo bar" 269 270 // search in the forward direction 271 auto index = match.forward (content); 272 assert (index is 7); 273 274 // search again - returns length when no match found 275 assert (match.forward(content, index+1) is content.length); 276 --- 277 278 Searching operates both forward and backward, with an optional 279 start offset (can be more convenient than slicing the content). 280 There are methods to replace matches within given content, and 281 others which return foreach() iterators for traversing content. 282 283 FindFruct is a simpler variant, which can operate efficiently on 284 short matches and/or short content (employs brute-force strategy) 285 286 ******************************************************************************/ 287 288 package struct SearchFruct(T) 289 { 290 private const(T)[] what; 291 private bool fore; 292 private int[256] offsets = void; 293 294 /*********************************************************************** 295 296 Construct the fruct 297 298 ***********************************************************************/ 299 300 static SearchFruct opCall (const(T)[] what) 301 { 302 SearchFruct find = void; 303 find.match = what; 304 return find; 305 } 306 307 /*********************************************************************** 308 309 Return the match text 310 311 ***********************************************************************/ 312 313 @property 314 const(T)[] match () 315 { 316 return what; 317 } 318 319 /*********************************************************************** 320 321 Reset the text to match 322 323 ***********************************************************************/ 324 325 @property 326 void match (const(T)[] what) 327 { 328 offsets[] = cast(int)(what.length + 1); 329 this.fore = true; 330 this.what = what; 331 reset(); 332 } 333 334 /*********************************************************************** 335 336 Search forward in the given content, starting at the 337 optional index. 338 339 Returns the index of a match, or content.length where 340 no match was located. 341 342 ***********************************************************************/ 343 344 size_t forward (const(T)[] content, size_t ofs = 0) 345 { 346 if (! fore) 347 flip(); 348 349 if (ofs > content.length) 350 ofs = content.length; 351 352 return find (cast(char*) what.ptr, what.length * T.sizeof, 353 cast(char*) content.ptr, content.length * T.sizeof, 354 ofs * T.sizeof) / T.sizeof; 355 } 356 357 /*********************************************************************** 358 359 Search backward in the given content, starting at the 360 optional index. 361 362 Returns the index of a match, or content.length where 363 no match was located. 364 365 ***********************************************************************/ 366 367 size_t reverse (const(T)[] content, size_t ofs = size_t.max) 368 { 369 if (fore) 370 flip(); 371 372 if (ofs > content.length) 373 ofs = content.length; 374 375 return rfind (cast(char*) what.ptr, what.length * T.sizeof, 376 cast(char*) content.ptr, content.length * T.sizeof, 377 ofs * T.sizeof) / T.sizeof; 378 } 379 380 /*********************************************************************** 381 382 Returns true if there is a match within the given content 383 384 ***********************************************************************/ 385 386 bool within (const(T)[] content) 387 { 388 return forward(content) != content.length; 389 } 390 391 /*********************************************************************** 392 393 Returns number of matches within the given content 394 395 ***********************************************************************/ 396 397 size_t count (const(T)[] content) 398 { 399 size_t mark, count; 400 401 while ((mark = forward (content, mark)) != content.length) 402 ++count, ++mark; 403 return count; 404 } 405 406 /*********************************************************************** 407 408 Replace all matches with the given character. Use method 409 tokens() instead to avoid heap activity. 410 411 Returns a copy of the content with replacements made 412 413 ***********************************************************************/ 414 415 T[] replace (const(T)[] content, T chr) 416 { 417 return replace (content, (&chr)[0..1]); 418 } 419 420 /*********************************************************************** 421 422 Replace all matches with the given substitution. Use 423 method tokens() instead to avoid heap activity. 424 425 Returns a copy of the content with replacements made 426 427 ***********************************************************************/ 428 429 T[] replace (const(T)[] content, const(T)[] sub = null) 430 { 431 T[] output; 432 433 foreach (s; tokens (content, sub)) 434 output ~= s; 435 return output; 436 } 437 438 /*********************************************************************** 439 440 Returns a foreach() iterator which exposes text segments 441 between all matches within the given content. Substitution 442 text is also injected in place of each match, and null can 443 be used to indicate removal instead: 444 --- 445 char[] result; 446 447 auto match = search ("foo"); 448 foreach (token; match.tokens("$foo&&foo*", "bar")) 449 result ~= token; 450 assert (result == "$bar&&bar*"); 451 --- 452 453 This mechanism avoids internal heap activity 454 455 ***********************************************************************/ 456 457 Substitute tokens (const(T)[] content, const(T)[] sub = null) 458 { 459 return Substitute (sub, what, content, &forward); 460 } 461 462 /*********************************************************************** 463 464 Returns a foreach() iterator which exposes the indices of 465 all matches within the given content: 466 --- 467 int count; 468 469 auto match = search ("foo"); 470 foreach (index; match.indices("$foo&&foo*")) 471 ++count; 472 assert (count is 2); 473 --- 474 475 ***********************************************************************/ 476 477 Indices indices (const(T)[] content) 478 { 479 return Indices (content, &forward); 480 } 481 482 /*********************************************************************** 483 484 ***********************************************************************/ 485 486 private size_t find (char* what, size_t wlen, char* content, size_t len, size_t ofs) 487 { 488 auto s = content; 489 content += ofs; 490 auto e = s + len - wlen; 491 while (content <= e) 492 if (*what is *content && matches(what, content, wlen)) 493 return content - s; 494 else 495 content += offsets [content[wlen]]; 496 return len; 497 } 498 499 /*********************************************************************** 500 501 ***********************************************************************/ 502 503 private size_t rfind (char* what, size_t wlen, char* content, size_t len, size_t ofs) 504 { 505 auto s = content; 506 auto e = s + ofs - wlen; 507 while (e >= content) 508 if (*what is *e && matches(what, e, wlen)) 509 return e - s; 510 else 511 e -= offsets [*(e-1)]; 512 return len; 513 } 514 515 /*********************************************************************** 516 517 ***********************************************************************/ 518 519 private static bool matches (char* a, char* b, size_t length) 520 { 521 while (length > size_t.sizeof) 522 if (*cast(size_t*) a is *cast(size_t*) b) 523 a += size_t.sizeof, b += size_t.sizeof, length -= size_t.sizeof; 524 else 525 return false; 526 527 while (length--) 528 if (*a++ != *b++) 529 return false; 530 return true; 531 } 532 533 /*********************************************************************** 534 535 Construct lookup table. We force the alphabet to be char[] 536 always, and consider wider characters to be longer patterns 537 instead 538 539 ***********************************************************************/ 540 541 private void reset () 542 { 543 auto what = cast(char[]) this.what; 544 if (fore) 545 for (int i=0; i < cast(int)what.length; ++i) 546 offsets[what[i]] = cast(int)what.length - i; 547 else 548 for (int i=cast(int)what.length; i--;) 549 offsets[what[i]] = cast(int)(i+1); 550 } 551 552 /*********************************************************************** 553 554 Reverse lookup-table direction 555 556 ***********************************************************************/ 557 558 private void flip () 559 { 560 fore ^= true; 561 reset(); 562 } 563 564 /*********************************************************************** 565 566 Simple foreach() iterator 567 568 ***********************************************************************/ 569 570 private struct Indices 571 { 572 const(T)[] content; 573 size_t delegate(const(T)[], size_t) call; 574 575 int opApply (scope int delegate (ref size_t index) dg) 576 { 577 int ret; 578 size_t mark; 579 580 while ((mark = call(content, mark)) != content.length) 581 if ((ret = dg(mark)) is 0) 582 ++mark; 583 else 584 break; 585 return ret; 586 } 587 } 588 589 /*********************************************************************** 590 591 Substitution foreach() iterator 592 593 ***********************************************************************/ 594 595 private struct Substitute 596 { 597 private const(T)[] sub, 598 what, 599 content; 600 size_t delegate(const(T)[], size_t) call; 601 602 int opApply (scope int delegate (ref const(T)[] token) dg) 603 { 604 int ret; 605 size_t pos, 606 mark; 607 const(T)[] token; 608 609 while ((pos = call (content, mark)) < content.length) 610 { 611 token = content [mark .. pos]; 612 if ((ret = dg(token)) != 0) 613 return ret; 614 if (sub.ptr && (ret = dg(sub)) != 0) 615 return ret; 616 mark = pos + what.length; 617 } 618 619 token = content [mark .. $]; 620 if (mark <= content.length) 621 ret = dg (token); 622 return ret; 623 } 624 } 625 } 626 627 628 629 630 /****************************************************************************** 631 632 ******************************************************************************/ 633 634 debug (Search) 635 { 636 import tango.io.Stdout; 637 import tango.time.StopWatch; 638 639 auto x = import("Search.d"); 640 641 void main() 642 { 643 StopWatch elapsed; 644 645 auto match = search("foo"); 646 auto index = match.reverse ("foo foo"); 647 assert (index is 4); 648 index = match.reverse ("foo foo", index); 649 assert (index is 0); 650 index = match.reverse ("foo foo", 1); 651 assert (index is 7); 652 653 foreach (index; find("delegate").indices(x)) 654 Stdout.formatln ("< {}", index); 655 656 foreach (index; search("delegate").indices(x)) 657 Stdout.formatln ("> {}", index); 658 659 elapsed.start; 660 for (auto i=5000; i--;) 661 Util.mismatch (x.ptr, x.ptr, x.length); 662 Stdout.formatln ("mismatch {}", elapsed.stop); 663 664 elapsed.start; 665 for (auto i=5000; i--;) 666 Util.indexOf (x.ptr, '@', cast(uint) x.length); 667 Stdout.formatln ("indexOf {}", elapsed.stop); 668 669 elapsed.start; 670 for (auto i=5000; i--;) 671 Util.locatePattern (x, "indexOf {}"); 672 Stdout.formatln ("pattern {}", elapsed.stop); 673 674 elapsed.start; 675 auto f = find ("indexOf {}"); 676 for (auto i=5000; i--;) 677 f.forward(x); 678 Stdout.formatln ("find {}", elapsed.stop); 679 680 elapsed.start; 681 auto s = search ("indexOf {}"); 682 for (auto i=5000; i--;) 683 s.forward(x); 684 Stdout.formatln ("search {}", elapsed.stop); 685 } 686 } 687