/*******************************************************************************

        Copyright:      Copyright (C) 2007 Aaron Craelius and Kris Bell
                        All rights reserved.

        License:        BSD style: $(LICENSE)

        version:        Initial release: February 2008

        Authors:        Aaron, Kris

*******************************************************************************/

module tango.text.xml.PullParser;

private import tango.text.Util : indexOf;

private import tango.core.Exception : XmlException;

private import Integer = tango.text.convert.Integer;

private import Utf = tango.text.convert.Utf : toString;

/*******************************************************************************

        Use -version=whitespace to retain whitespace as data nodes. We
        see a 25% increase in token count and 10% throughput drop when
        parsing "hamlet.xml" with this option enabled (pullparser alone)

*******************************************************************************/

version (whitespace)
    version = retainwhite;
else
{
    version = stripwhite;
    version = partialwhite;
}

/*******************************************************************************

        The XML node types

*******************************************************************************/

public enum XmlNodeType {Element, Data, Attribute, CData,
                         Comment, PI, Doctype, Document};

/*******************************************************************************

        Values returned by the pull-parser

*******************************************************************************/

public enum XmlTokenType {Done, StartElement, Attribute, EndElement,
                          EndEmptyElement, Data, Comment, CData,
                          Doctype, PI, None};


/*******************************************************************************

        Token based xml Parser. Templated to operate with char[], wchar[],
        and dchar[] content.

        The parser is constructed with some tradeoffs relating to document
        integrity. It is generally optimized for well-formed documents.
        Every look-ahead and scan in this class is bounded by the end of
        the supplied content, so a truncated or malformed document ends in
        endOfInput() (or a parse error) instead of reading beyond the
        buffer. There are various compilation options to enable checks and
        balances, depending on how things should be handled. We'll settle
        on a common configuration over the next few weeks, but for now all
        settings are somewhat experimental. Partly because making some tiny
        unrelated change to the code can cause notable throughput changes,
        and we need to track that down.

        We're not yet clear why these swings are so pronounced (for changes
        outside the code path) but they seem to be related to the alignment
        of codegen. It could be a cache-line issue, or something else. We'll
        figure it out, yet it's interesting that some hardware buttons are
        clearly being pushed

*******************************************************************************/

class PullParser(Ch = char)
{
    public int              depth;      // element nesting level
    public const(Ch)[]      prefix;     // namespace prefix of current token
    public const(Ch)[]      rawValue;   // slice of content for current token
    public const(Ch)[]      localName;  // local name of current token
    public XmlTokenType     type = XmlTokenType.None;

    package XmlText!(Ch)    text;
    private bool            stream;
    private const(char)[]   errMsg;

    /***********************************************************************

            Construct a parser on the given content (may be null)

    ***********************************************************************/

    this(const(Ch[]) content = null)
    {
        reset (content);
    }

    /***********************************************************************

            Consume the next token and return its type

            Returns XmlTokenType.Done when the content is exhausted. An
            XmlException is thrown (via error) on malformed markup, or
            on premature end of content while elements are still open
            and streaming mode is off.

    ***********************************************************************/

    @property final XmlTokenType next()
    {
        auto e = text.end;
        auto p = text.point;

        // at end of document?
        if (p >= e)
            return endOfInput();

        version (stripwhite)
        {
            // strip leading whitespace
            while (*p <= 32)
                if (++p >= e)
                    return endOfInput();
        }

        // StartElement or Attribute?
        if (type < XmlTokenType.EndElement)
        {
            version (retainwhite)
            {
                // strip leading whitespace (thanks to DRK)
                while (*p <= 32)
                    if (++p >= e)
                        return endOfInput();
            }

            switch (*p)
            {
                case '>':
                    // termination of StartElement
                    ++depth;
                    ++p;
                    break;

                case '/':
                    // empty element closure
                    text.point = p;
                    return doEndEmptyElement();

                default:
                    // must be attribute instead
                    text.point = p;
                    return doAttributeName();
            }

            // the '>' may have been the very last character
            if (p >= e)
                return endOfInput();
        }

        // consume data between elements?
        if (*p != '<')
        {
            auto q = p;
            while (++p < e && *p != '<') {}

            if (p < e)
            {
                version (partialwhite)
                {
                    // include leading whitespace, but never step
                    // back beyond the start of the content
                    while (q > text.text.ptr && *(q-1) <= 32)
                        --q;
                }
                text.point = p;
                rawValue = q [0 .. p - q];
                return type = XmlTokenType.Data;
            }
            return endOfInput();
        }

        // must be a '<' character, so peek ahead (when there is
        // something left to peek at)
        auto avail = e - p;
        if (avail < 2)
            return endOfInput();

        switch (p[1])
        {
            case '!':
                // one of the following ...
                if (avail >= 4 && p[2..4] == "--")
                {
                    text.point = p + 4;
                    return doComment();
                }
                else
                if (avail >= 9)
                {
                    if (p[2..9] == "[CDATA[")
                    {
                        text.point = p + 9;
                        return doCData();
                    }
                    else
                    if (p[2..9] == "DOCTYPE")
                    {
                        text.point = p + 9;
                        return doDoctype();
                    }
                }
                return doUnexpected("!", p);

            case '\?':
                // must be PI data
                text.point = p + 2;
                return doPI();

            case '/':
                // should be a closing element name
                p += 2;
                auto q = p;
                while (q < e && (*q > 63 || text.name[*q]))
                    ++q;

                // check if we ran past the end
                if (q >= e)
                    return endOfInput();

                if (*q is ':')
                {
                    prefix = p[0 .. q - p];
                    p = ++q;
                    while (q < e && (*q > 63 || text.attributeName[*q]))
                        ++q;

                    localName = p[0 .. q - p];
                    if (q >= e)
                        return endOfInput();
                }
                else
                {
                    prefix = null;
                    localName = p[0 .. q - p];
                }

                while (*q <= 32)
                    if (++q >= e)
                        return endOfInput();

                if (*q is '>')
                {
                    --depth;
                    text.point = q + 1;
                    return type = XmlTokenType.EndElement;
                }
                return doExpected(">", q);

            default:
                // scan new element name
                auto q = ++p;
                while (q < e && (*q > 63 || text.name[*q]))
                    ++q;

                // check if we ran past the end
                if (q >= e)
                    return endOfInput();

                if (*q != ':')
                {
                    prefix = null;
                    localName = p [0 .. q - p];
                }
                else
                {
                    prefix = p[0 .. q - p];
                    p = ++q;
                    while (q < e && (*q > 63 || text.attributeName[*q]))
                        ++q;
                    localName = p[0 .. q - p];
                }

                text.point = q;
                return type = XmlTokenType.StartElement;
        }
    }

    /***********************************************************************

            Parse an attribute (name, '=', quoted value) starting at
            the current point. Sets prefix, localName and rawValue
            (the value without its quotes) and returns Attribute.

    ***********************************************************************/

    private XmlTokenType doAttributeName()
    {
        auto p = text.point;
        auto q = p;
        auto e = text.end;

        while (q < e && (*q > 63 || text.attributeName[*q]))
            ++q;
        if (q >= e)
            return endOfInput();

        if (*q is ':')
        {
            prefix = p[0 .. q - p];
            p = ++q;

            while (q < e && (*q > 63 || text.attributeName[*q]))
                ++q;

            localName = p[0 .. q - p];
        }
        else
        {
            prefix = null;
            localName = p[0 .. q - p];
        }

        // skip whitespace between the name and '='
        while (q < e && *q <= 32)
            ++q;
        if (q >= e)
            return endOfInput();

        if (*q is '=')
        {
            // skip '=' and any whitespace before the value
            ++q;
            while (q < e && *q <= 32)
                ++q;
            if (q >= e)
                return endOfInput();

            auto quote = *q;
            switch (quote)
            {
                case '"':
                case '\'':
                    p = q + 1;
                    // find the matching quote without leaving the content
                    while (++q < e && *q != quote) {}
                    if (q < e)
                    {
                        rawValue = p[0 .. q - p];
                        text.point = q + 1;   // skip end quote
                        return type = XmlTokenType.Attribute;
                    }
                    return endOfInput();

                default:
                    return doExpected("\' or \"", q);
            }
        }

        return doExpected ("=", q);
    }

    /***********************************************************************

            Consume the "/>" which closes an empty element. Clears
            prefix and localName and returns EndEmptyElement.

    ***********************************************************************/

    private XmlTokenType doEndEmptyElement()
    {
        if (text.end - text.point >= 2 &&
            text.point[0] is '/' && text.point[1] is '>')
        {
            localName = prefix = null;
            text.point += 2;
            return type = XmlTokenType.EndEmptyElement;
        }
        return doExpected("/>", text.point);
    }

    /***********************************************************************

            Consume a comment body up to the closing "-->". rawValue
            is set to the text between the delimiters.

    ***********************************************************************/

    private XmlTokenType doComment()
    {
        auto e = text.end;
        auto p = text.point;
        auto q = p;

        while (p < e)
        {
            while (*p != '-')
                if (++p >= e)
                    return endOfInput();

            if (e - p >= 3 && p[0..3] == "-->")
            {
                text.point = p + 3;
                rawValue = q [0 .. p - q];
                return type = XmlTokenType.Comment;
            }
            ++p;
        }

        return endOfInput();
    }

    /***********************************************************************

            Consume a CDATA section up to the closing "]]>". rawValue
            is set to the complete text between the delimiters.

    ***********************************************************************/

    private XmlTokenType doCData()
    {
        auto e = text.end;
        auto p = text.point;

        // remember the start of the section once, outside of the
        // loop, so that a ']' which does not begin "]]>" remains
        // part of the value
        auto q = p;

        while (p < e)
        {
            while (*p != ']')
                if (++p >= e)
                    return endOfInput();

            if (e - p >= 3 && p[0..3] == "]]>")
            {
                text.point = p + 3;
                rawValue = q [0 .. p - q];
                return type = XmlTokenType.CData;
            }
            ++p;
        }

        return endOfInput();
    }

    /***********************************************************************

            Consume a processing instruction up to the closing "?>".
            rawValue is set to the text between "<?" and "?>".

    ***********************************************************************/

    private XmlTokenType doPI()
    {
        auto e = text.end;
        auto p = text.point;
        auto q = p;

        while (p < e)
        {
            while (*p != '\?')
                if (++p >= e)
                    return endOfInput();

            if (p + 1 < e && p[1] == '>')
            {
                rawValue = q [0 .. p - q];
                text.point = p + 2;
                return type = XmlTokenType.PI;
            }
            ++p;
        }
        return endOfInput();
    }

    /***********************************************************************

            Consume a DOCTYPE declaration up to its closing '>'. A
            bracketed "[ ... ]" subset is skipped as a unit, so a '>'
            inside of it does not end the declaration. rawValue is
            set to the declaration text after leading whitespace.

    ***********************************************************************/

    private XmlTokenType doDoctype()
    {
        auto e = text.end;
        auto p = text.point;

        // strip leading whitespace
        while (p < e && *p <= 32)
            ++p;
        if (p >= e)
            return endOfInput();

        auto q = p;
        while (p < e)
        {
            if (*p is '>')
            {
                rawValue = q [0 .. p - q];
                prefix = null;
                text.point = p + 1;
                return type = XmlTokenType.Doctype;
            }
            else
            {
                if (*p == '[')
                    do {
                       if (++p >= e)
                           return endOfInput();
                       } while (*p != ']');
                ++p;
            }
        }

        // ran out of content before the closing '>'
        return endOfInput();
    }

    /***********************************************************************

            Handle exhaustion of the content: raises "Unexpected EOF"
            when elements remain open and streaming mode is off,
            otherwise returns Done.

    ***********************************************************************/

    private XmlTokenType endOfInput ()
    {
        if (depth && (stream is false))
            error ("Unexpected EOF");

        return XmlTokenType.Done;
    }

    /***********************************************************************

            Raise a parse error for unexpected content at p

    ***********************************************************************/

    private XmlTokenType doUnexpected (const(char[]) msg, const(Ch)* p)
    {
        return position ("parse error :: unexpected " ~ msg, p);
    }

    /***********************************************************************

            Raise a parse error naming what was expected at p, and
            the character found there instead

    ***********************************************************************/

    private XmlTokenType doExpected (const(char[]) msg, const(Ch)* p)
    {
        char[6] tmp = void;
        return position ("parse error :: expected " ~ msg ~ " instead of " ~ Utf.toString(p[0..1], tmp), p);
    }

    /***********************************************************************

            Append the offset of p within the content to msg, and
            raise the error

    ***********************************************************************/

    private XmlTokenType position (const(char[]) msg, const(Ch)* p)
    {
        return error (msg ~ " at position " ~ Integer.toString(p-text.text.ptr));
    }

    /***********************************************************************

            Record msg as the last error and throw an XmlException
            carrying a copy of it. Never returns normally.

    ***********************************************************************/

    @property protected final XmlTokenType error (const(char[]) msg)
    {
        errMsg = msg;
        throw new XmlException (msg.idup);
    }

    /***********************************************************************

            Return the raw value of the current token

    ***********************************************************************/

    @property final const const(Ch[]) value()
    {
        return rawValue;
    }

    /***********************************************************************

            Return the name of the current token, as "prefix:localName"
            when a prefix is present (this form allocates)

    ***********************************************************************/

    @property final const const(Ch[]) name()
    {
        if (prefix.length)
            return prefix ~ ":" ~ localName;
        return localName;
    }

    /***********************************************************************

            Returns the text of the last error

    ***********************************************************************/

    @property final const const(char[]) error()
    {
        return errMsg;
    }

    /***********************************************************************

            Reset the parser

    ***********************************************************************/

    final bool reset()
    {
        text.reset (text.text);
        reset_();
        return true;
    }

    /***********************************************************************

            Reset parser with new content

    ***********************************************************************/

    final void reset(const(Ch[]) newText)
    {
        text.reset (newText);
        reset_();
    }

    /***********************************************************************

            experimental: set streaming mode

            Use at your own risk, may be removed.

    ***********************************************************************/

    final void incremental (bool yes = true)
    {
        stream = yes;
    }

    /***********************************************************************

            Clear the parse state, then position the parser after an
            optional UTF8 BOM (8-bit content only), leading whitespace
            and an optional "<?xml ... ?>" declaration.

    ***********************************************************************/

    private void reset_()
    {
        depth = 0;
        errMsg = null;
        type = XmlTokenType.None;

        auto p = text.point;
        if (p)
        {
            auto e = text.end;

            static if (Ch.sizeof == 1)
            {
                // consume UTF8 BOM
                if (e - p >= 3 && p[0] is 0xef && p[1] is 0xbb && p[2] is 0xbf)
                    p += 3;
            }

            //TODO enable optional declaration parsing
            while (p < e && *p <= 32)
                ++p;

            if (e - p >= 5 && p[0] is '<' && p[1] is '\?' && p[2..5] == "xml")
            {
                p += 5;
                while (p < e && *p != '\?')
                    ++p;

                // skip the closing "?>", without stepping beyond
                // the end of a truncated declaration
                p = (e - p >= 2) ? p + 2 : e;
            }
            text.point = p;
        }
    }
}


/*******************************************************************************

        The content being parsed, along with the current parse position
        and the character-class tables used when scanning names

*******************************************************************************/

package struct XmlText(Ch)
{
    package const(Ch)*      end;        // one past the last character
    package size_t          len;        // length of the content
    package const(Ch)[]     text;       // the content itself
    package const(Ch)*      point;      // current parse position

    // attach new content and rewind to its first character
    final void reset(const(Ch[]) newText)
    {
        this.text = newText;
        this.len = newText.length;
        this.point = text.ptr;
        this.end = point + len;
    }

    // non-zero where a character in the range 0..63 may appear
    // within an element name; characters above 63 are accepted
    // by the scanning loops without consulting the table
    __gshared immutable ubyte[64] name =
    [
         // 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
            0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,  // 0
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1
            0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,  // 2
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0   // 3
    ];

    // as above, for attribute names and for local names which
    // follow a prefix
    __gshared immutable ubyte[64] attributeName =
    [
         // 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
            0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,  // 0
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1
            0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,  // 2
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0   // 3
    ];
}

/*******************************************************************************

        Unit tests

*******************************************************************************/

debug (UnitTest)
{
    /***********************************************************************

            Walk the token sequence expected for testXML

    ***********************************************************************/

    void testParser(Ch)(PullParser!(Ch) itr)
    {
        /*      assert(itr.next);
                assert(itr.value == "");
                assert(itr.type == XmlTokenType.Declaration, Integer.toString(itr.type));
                assert(itr.next);
                assert(itr.value == "version");
                assert(itr.next);
                assert(itr.value == "1.0");*/
        assert(itr.next);
        assert(itr.value == "element [ <!ELEMENT element (#PCDATA)>]");
        assert(itr.type == XmlTokenType.Doctype);
        assert(itr.next);
        assert(itr.localName == "element");
        assert(itr.type == XmlTokenType.StartElement);
        assert(itr.depth == 0);
        assert(itr.next);
        assert(itr.localName == "attr");
        assert(itr.value == "1");
        assert(itr.next);
        assert(itr.type == XmlTokenType.Attribute);
        assert(itr.localName == "attr2");
        assert(itr.value == "two");
        assert(itr.next);
        assert(itr.value == "comment");
        assert(itr.next);
        assert(itr.rawValue == "test&Z");
        assert(itr.next);
        assert(itr.prefix == "qual");
        assert(itr.localName == "elem");
        assert(itr.next);
        assert(itr.type == XmlTokenType.EndEmptyElement);
        assert(itr.next);
        assert(itr.localName == "el2");
        assert(itr.depth == 1);
        assert(itr.next);
        assert(itr.localName == "attr3");
        assert(itr.value == "3three", itr.value);
        assert(itr.next);
        assert(itr.rawValue == "sdlgjsh");
        assert(itr.next);
        assert(itr.localName == "el3");
        assert(itr.depth == 2);
        assert(itr.next);
        assert(itr.type == XmlTokenType.EndEmptyElement);
        assert(itr.next);
        assert(itr.value == "data");
        assert(itr.next);
        //      assert(itr.qvalue == "pi", itr.qvalue);
        //      assert(itr.value == "test");
        assert(itr.rawValue == "pi test", itr.rawValue);
        assert(itr.next);
        assert(itr.localName == "el2");
        assert(itr.next);
        assert(itr.localName == "element");
        assert(!itr.next);
    }


    /***********************************************************************

            Sample document for testParser

    ***********************************************************************/

    __gshared immutable immutable(char)[] testXML = "<?xml version=\"1.0\" ?><!DOCTYPE element [ <!ELEMENT element (#PCDATA)>]><element " ~
        "attr=\"1\" attr2=\"two\"><!--comment-->test&Z<qual:elem /><el2 attr3 = " ~
        "'3three'><![CDATA[sdlgjsh]]><el3 />data<?pi test?></el2></element>";

    unittest
    {
        auto itr = new PullParser!(char)(testXML);
        testParser (itr);
    }

    unittest
    {
        // a ']' inside a CDATA section which does not start "]]>"
        // must remain part of the value
        auto itr = new PullParser!(char)("<a><![CDATA[x]y]]></a>");
        assert(itr.next == XmlTokenType.StartElement);
        assert(itr.next == XmlTokenType.CData);
        assert(itr.value == "x]y");
        assert(itr.next == XmlTokenType.EndElement);
        assert(itr.next == XmlTokenType.Done);
    }
}