tango.text.xml.PullParser source code

1 /*******************************************************************************
2  
3         Copyright: Copyright (C) 2007 Aaron Craelius and Kris Bell  
4                    All rights reserved.
5 
6         License:   BSD style: $(LICENSE)
7 
8         version:   Initial release: February 2008      
9 
10         Authors:   Aaron, Kris
11 
12 *******************************************************************************/
13 
14 module tango.text.xml.PullParser;
15 
16 private import tango.text.Util : indexOf;
17 
18 private import tango.core.Exception : XmlException;
19 
20 private import Integer = tango.text.convert.Integer;
21 
22 private import Utf = tango.text.convert.Utf : toString;
23 
24 /*******************************************************************************
25 
26         Use -version=whitespace to retain whitespace as data nodes. We
27         see a 25% increase in token count and a 10% throughput drop when
28         parsing "hamlet.xml" with this option enabled (pullparser alone)
29 
30 *******************************************************************************/
31 
// Whitespace policy selection. Compiling with -version=whitespace defines
// "retainwhite"; otherwise both "stripwhite" and "partialwhite" are defined.
// These identifiers are tested by version blocks inside PullParser.next().
version (whitespace)
{
        version = retainwhite;
}
else
{
        version = stripwhite;
        version = partialwhite;
}
39 
40 /*******************************************************************************
41 
42         The XML node types 
43 
44 *******************************************************************************/
45 
// Node kinds, numbered from zero in declaration order.
public enum XmlNodeType
{
        Element,
        Data,
        Attribute,
        CData,
        Comment,
        PI,
        Doctype,
        Document
}
48 
49 /*******************************************************************************
50 
51         Values returned by the pull-parser
52 
53 *******************************************************************************/
54 
// Token kinds, numbered from zero in declaration order. The order is
// significant: PullParser.next() uses "type < XmlTokenType.EndElement" to
// detect that the previous token was a StartElement or an Attribute, so
// those members must stay ahead of EndElement.
public enum XmlTokenType
{
        Done,
        StartElement,
        Attribute,
        EndElement,
        EndEmptyElement,
        Data,
        Comment,
        CData,
        Doctype,
        PI,
        None
}
58 
59 
/*******************************************************************************

        Token based xml Parser.  Templated to operate with char[], wchar[],
        and dchar[] content.

        The parser is optimized for well-formed documents and performs very
        little validation. Every scan and look-ahead is bounded by the
        extent of the supplied content, so a truncated or malformed
        document never causes memory outside the content slice to be read.

        Running out of input is reported through endOfInput(): the result
        is XmlTokenType.Done, except that an XmlException ("Unexpected EOF")
        is thrown when the parser is inside an element (depth is non-zero)
        and incremental mode is off. A construct that is recognizably wrong
        raises an XmlException carrying the offending position.

        The content is never copied: prefix, localName and rawValue are
        slices of the content handed to the constructor or to reset().

*******************************************************************************/

class PullParser(Ch = char)
{
        /// number of '>' terminated start tags seen, less the end tags seen
        public int                      depth;
        /// namespace prefix of the current token (empty when there is none)
        public const(Ch)[]              prefix;
        /// raw (undecoded) value of the current token
        public const(Ch)[]              rawValue;
        /// local name of the current token
        public const(Ch)[]              localName;
        /// type of the most recently returned token
        public XmlTokenType             type = XmlTokenType.None;

        package XmlText!(Ch)            text;
        private bool                    stream;
        private const(char)[]           errMsg;

        /***********************************************************************

                Construct a parser on the given content (may be null)

        ***********************************************************************/

        this(const(Ch[]) content = null)
        {
                reset (content);
        }

        /***********************************************************************

                Consume the next token and return its type

                On success the token type is also stored in 'type', the
                relevant ones of prefix, localName and rawValue are updated,
                and the read position moves past the token.

                Returns: the token type, or XmlTokenType.Done when the
                         content is exhausted (see endOfInput)

                Throws:  XmlException on an unexpected construct, or on
                         end of input as described for endOfInput

        ***********************************************************************/

        @property final XmlTokenType next()
        {
                auto e = text.end;
                auto p = text.point;

                // at end of document?
                if (p >= e)
                    return endOfInput();

                version (stripwhite)
                {
                        // strip leading whitespace
                        while (*p <= 32)
                               if (++p >= e)
                                   return endOfInput();
                }

                // previous token was a StartElement or Attribute, so we are
                // still inside the tag: expect '>', "/>" or an attribute
                if (type < XmlTokenType.EndElement)
                {
                        version (retainwhite)
                        {
                                // strip leading whitespace (thanks to DRK)
                                while (*p <= 32)
                                       if (++p >= e)
                                           return endOfInput();
                        }

                        switch (*p)
                        {
                                case '>':
                                     // termination of StartElement
                                     ++depth;
                                     ++p;
                                     break;

                                case '/':
                                     // empty element closure
                                     text.point = p;
                                     return doEndEmptyElement();

                                default:
                                     // must be attribute instead
                                     text.point = p;
                                     return doAttributeName();
                        }

                        // the '>' may have been the last character available
                        if (p >= e)
                            return endOfInput();
                }

                // consume data between elements?
                if (*p != '<')
                {
                        auto q = p;
                        while (++p < e && *p != '<') {}

                        if (p < e)
                        {
                                version (partialwhite)
                                {
                                        // include leading whitespace, but never
                                        // step back beyond the content start
                                        auto start = text.text.ptr;
                                        while (q > start && *(q-1) <= 32)
                                               --q;
                                }
                                text.point = p;
                                rawValue = q [0 .. p - q];
                                return type = XmlTokenType.Data;
                        }
                        return endOfInput();
                }

                // must be a '<' character, so peek ahead (when possible)
                if (p + 1 >= e)
                    return endOfInput();

                switch (p[1])
                {
                        case '!':
                             // one of the following ...
                             if (matches (p + 2, e, "--"))
                             {
                                     text.point = p + 4;
                                     return doComment();
                             }
                             if (matches (p + 2, e, "[CDATA["))
                             {
                                     text.point = p + 9;
                                     return doCData();
                             }
                             if (matches (p + 2, e, "DOCTYPE"))
                             {
                                     text.point = p + 9;
                                     return doDoctype();
                             }
                             return doUnexpected("!", p);

                        case '?':
                             // must be PI data
                             text.point = p + 2;
                             return doPI();

                        case '/':
                        {
                             // should be a closing element name
                             p += 2;
                             auto q = p;
                             while (q < e && (*q > 63 || text.name[*q]))
                                    ++q;

                             if (q < e && *q is ':')
                             {
                                     prefix = p[0 .. q - p];
                                     p = ++q;
                                     while (q < e && (*q > 63 || text.attributeName[*q]))
                                            ++q;

                                     localName = p[0 .. q - p];
                             }
                             else
                             {
                                     prefix = null;
                                     localName = p[0 .. q - p];
                             }

                             // name ran up to the end of the content?
                             if (q >= e)
                                 return endOfInput();

                             while (*q <= 32)
                                    if (++q >= e)
                                        return endOfInput();

                             if (*q is '>')
                             {
                                     --depth;
                                     text.point = q + 1;
                                     return type = XmlTokenType.EndElement;
                             }
                             return doExpected(">", q);
                        }

                        default:
                        {
                             // scan new element name
                             auto q = ++p;
                             while (q < e && (*q > 63 || text.name[*q]))
                                    ++q;

                             // check if we ran into the end
                             if (q >= e)
                                 return endOfInput();

                             if (*q != ':')
                             {
                                     prefix = null;
                                     localName = p [0 .. q - p];
                             }
                             else
                             {
                                     prefix = p[0 .. q - p];
                                     p = ++q;
                                     while (q < e && (*q > 63 || text.attributeName[*q]))
                                            ++q;
                                     localName = p[0 .. q - p];
                             }

                             text.point = q;
                             return type = XmlTokenType.StartElement;
                        }
                }
        }

        /***********************************************************************

                True when at least token.length characters are available
                in [p, e) and they equal token. Never reads at or beyond e.

        ***********************************************************************/

        private static bool matches (const(Ch)* p, const(Ch)* e, const(Ch)[] token)
        {
                return (e - p) >= cast(ptrdiff_t) token.length
                    && p[0 .. token.length] == token;
        }

        /***********************************************************************

                Parse an attribute of the form  [prefix:]name = 'value'
                (single or double quotes) starting at the read position.
                Sets prefix, localName and rawValue (the text between the
                quotes) and moves the read position past the closing quote.

                Returns: XmlTokenType.Attribute, or the endOfInput result
                         when the content ends inside the attribute

                Throws:  XmlException when '=' or the opening quote is
                         missing

        ***********************************************************************/

        private XmlTokenType doAttributeName()
        {
                auto p = text.point;
                auto q = p;
                auto e = text.end;

                while (q < e && (*q > 63 || text.attributeName[*q]))
                       ++q;
                if (q >= e)
                    return endOfInput();

                if (*q is ':')
                {
                        prefix = p[0 .. q - p];
                        p = ++q;

                        while (q < e && (*q > 63 || text.attributeName[*q]))
                               ++q;

                        localName = p[0 .. q - p];
                        if (q >= e)
                            return endOfInput();
                }
                else
                {
                        prefix = null;
                        localName = p[0 .. q - p];
                }

                // optional whitespace ahead of the '='
                if (*q <= 32)
                {
                        while (++q < e && *q <= 32) {}
                        if (q >= e)
                            return endOfInput();
                }

                if (*q is '=')
                {
                        // optional whitespace after the '='
                        while (++q < e && *q <= 32) {}
                        if (q >= e)
                            return endOfInput();

                        auto quote = *q;
                        switch (quote)
                        {
                                case '"':
                                case '\'':
                                     p = q + 1;
                                     // find the matching quote, within bounds
                                     while (++q < e && *q != quote) {}
                                     if (q < e)
                                     {
                                             rawValue = p[0 .. q - p];
                                             text.point = q + 1;   // skip end quote
                                             return type = XmlTokenType.Attribute;
                                     }
                                     return endOfInput();

                                default:
                                     return doExpected("\' or \"", q);
                        }
                }

                return doExpected ("=", q);
        }

        /***********************************************************************

                Consume the "/>" that closes an empty element. Clears prefix
                and localName on success.

                Returns: XmlTokenType.EndEmptyElement

                Throws:  XmlException when the read position does not hold
                         a complete "/>"

        ***********************************************************************/

        private XmlTokenType doEndEmptyElement()
        {
                auto p = text.point;

                if (p + 1 < text.end && p[0] is '/' && p[1] is '>')
                {
                        localName = prefix = null;
                        text.point = p + 2;
                        return type = XmlTokenType.EndEmptyElement;
                }
                return doExpected("/>", p);
        }

        /***********************************************************************

                Consume a comment body; the read position is just past the
                opening "<!--". rawValue becomes the text ahead of "-->".

                Returns: XmlTokenType.Comment, or the endOfInput result when
                         no terminating "-->" is present

        ***********************************************************************/

        private XmlTokenType doComment()
        {
                auto e = text.end;
                auto p = text.point;
                auto q = p;

                while (p < e)
                {
                        while (*p != '-')
                               if (++p >= e)
                                   return endOfInput();

                        if (matches (p, e, "-->"))
                        {
                                text.point = p + 3;
                                rawValue = q [0 .. p - q];
                                return type = XmlTokenType.Comment;
                        }
                        ++p;
                }

                return endOfInput();
        }

        /***********************************************************************

                Consume a CDATA body; the read position is just past the
                opening "<![CDATA[". rawValue becomes everything ahead of
                the terminating "]]>", including any ']' characters that
                are part of the data.

                Returns: XmlTokenType.CData, or the endOfInput result when
                         no terminating "]]>" is present

        ***********************************************************************/

        private XmlTokenType doCData()
        {
                auto e = text.end;
                auto p = text.point;

                // start of the section: must not move when a lone ']' is
                // skipped, otherwise the data ahead of it would be dropped
                auto q = p;

                while (p < e)
                {
                        while (*p != ']')
                               if (++p >= e)
                                   return endOfInput();

                        if (matches (p, e, "]]>"))
                        {
                                text.point = p + 3;
                                rawValue = q [0 .. p - q];
                                return type = XmlTokenType.CData;
                        }
                        ++p;
                }

                return endOfInput();
        }

        /***********************************************************************

                Consume a processing instruction; the read position is just
                past the opening "<?". rawValue becomes the text ahead of
                the terminating "?>".

                Returns: XmlTokenType.PI, or the endOfInput result when no
                         terminating "?>" is present

        ***********************************************************************/

        private XmlTokenType doPI()
        {
                auto e = text.end;
                auto p = text.point;
                auto q = p;

                while (p < e)
                {
                        while (*p != '?')
                               if (++p >= e)
                                   return endOfInput();

                        if (p + 1 < e && p[1] == '>')
                        {
                                rawValue = q [0 .. p - q];
                                text.point = p + 2;
                                return type = XmlTokenType.PI;
                        }
                        ++p;
                }
                return endOfInput();
        }

        /***********************************************************************

                Consume a doctype; the read position is just past the
                opening "<!DOCTYPE". Leading whitespace is skipped and
                rawValue becomes the text up to the closing '>'. A '['
                starts a section that is skipped up to the next ']', so
                a '>' inside it does not end the doctype.

                Returns: XmlTokenType.Doctype, or the endOfInput result when
                         the content ends first

        ***********************************************************************/

        private XmlTokenType doDoctype()
        {
                auto e = text.end;
                auto p = text.point;

                // strip leading whitespace
                while (p < e && *p <= 32)
                       ++p;
                if (p >= e)
                    return endOfInput();

                auto q = p;
                while (p < e)
                {
                        if (*p is '>')
                        {
                                rawValue = q [0 .. p - q];
                                prefix = null;
                                text.point = p + 1;
                                return type = XmlTokenType.Doctype;
                        }

                        if (*p == '[')
                            do {
                               if (++p >= e)
                                   return endOfInput();
                               } while (*p != ']');
                        ++p;
                }

                return endOfInput();
        }

        /***********************************************************************

                Report that the content ran out. Neither 'type' nor the
                read position is changed.

                Returns: XmlTokenType.Done

                Throws:  XmlException ("Unexpected EOF") when depth is
                         non-zero and incremental mode is off

        ***********************************************************************/

        private XmlTokenType endOfInput ()
        {
                if (depth && (stream is false))
                    error ("Unexpected EOF");

                return XmlTokenType.Done;
        }

        /***********************************************************************

                Raise a parse error for an unexpected construct at p

        ***********************************************************************/

        private XmlTokenType doUnexpected (const(char[]) msg, const(Ch)* p)
        {
                return position ("parse error :: unexpected  " ~ msg, p);
        }

        /***********************************************************************

                Raise a parse error naming what was expected and the
                character found instead. p must address a character
                within the content.

        ***********************************************************************/

        private XmlTokenType doExpected (const(char[]) msg, const(Ch)* p)
        {
                char[6] tmp = void;
                return position ("parse error :: expected  " ~ msg ~ " instead of " ~ Utf.toString(p[0..1], tmp), p);
        }

        /***********************************************************************

                Raise an error with the offset of p, relative to the start
                of the content, appended to the message

        ***********************************************************************/

        private XmlTokenType position (const(char[]) msg, const(Ch)* p)
        {
                return error (msg ~ " at position " ~ Integer.toString(p-text.text.ptr));
        }

        /***********************************************************************

                Record msg as the last error and throw it as an
                XmlException. Does not return normally.

        ***********************************************************************/

        @property protected final XmlTokenType error (const(char[]) msg)
        {
                errMsg = msg;
                throw new XmlException (msg.idup);
        }

        /***********************************************************************

                Return the raw value of the current token

        ***********************************************************************/

        @property final const const(Ch[]) value()
        {
                return rawValue;
        }

        /***********************************************************************

                Return the name of the current token. This is localName
                alone when there is no prefix; otherwise a newly
                allocated "prefix:localName".

        ***********************************************************************/

        @property final const const(Ch[]) name()
        {
                if (prefix.length)
                    return prefix ~ ":" ~ localName;
                return localName;
        }

        /***********************************************************************

                Returns the text of the last error

        ***********************************************************************/

        @property final const const(char[]) error()
        {
                return errMsg;
        }

        /***********************************************************************

                Reset the parser to the start of the current content

                Returns: true, always

        ***********************************************************************/

        final bool reset()
        {
                text.reset (text.text);
                reset_();
                return true;
        }

        /***********************************************************************

                Reset parser with new content

        ***********************************************************************/

        final void reset(const(Ch[]) newText)
        {
                text.reset (newText);
                reset_();
        }

        /***********************************************************************

                experimental: set streaming mode

                While enabled, running out of input inside an element
                yields XmlTokenType.Done rather than an exception.

                Use at your own risk, may be removed.

        ***********************************************************************/

        final void incremental (bool yes = true)
        {
                stream = yes;
        }

        /***********************************************************************

                Clear the parse state and place the read position at the
                first token: beyond a UTF8 byte-order mark (one-byte
                character types only), leading whitespace, and a leading
                "<?xml" declaration. The declaration is skipped rather
                than parsed, and the read position never ends up beyond
                the end of the content.

        ***********************************************************************/

        private void reset_()
        {
                depth = 0;
                errMsg = null;
                type = XmlTokenType.None;

                auto p = text.point;
                if (p)
                {
                        auto e = text.end;

                        static if (Ch.sizeof == 1)
                        {
                                // consume UTF8 BOM
                                if (e - p >= 3 && p[0] is 0xef && p[1] is 0xbb && p[2] is 0xbf)
                                    p += 3;
                        }

                        //TODO enable optional declaration parsing
                        while (p < e && *p <= 32)
                               ++p;

                        if (matches (p, e, "<?xml"))
                        {
                                p += 5;
                                while (p < e && *p != '?')
                                       ++p;

                                // step over the "?>", staying within bounds
                                // when the declaration is truncated
                                p += 2;
                                if (p > e)
                                    p = e;
                        }
                        text.point = p;
                }
        }
}
635 
636 
637 /*******************************************************************************
638 
639 *******************************************************************************/
640 
// Cursor over a slice of XML content, plus the character-class tables used
// when scanning names.
package struct XmlText(Ch)
{
        package const(Ch)*     end;             // one past the last character
        package size_t  len;                    // number of characters
        package const(Ch)[]    text;            // the content itself
        package const(Ch)*     point;           // current read position

        // Attach to newText and place the read position at its start.
        final void reset(const(Ch[]) newText)
        {
                text  = newText;
                len   = text.length;
                point = newText.ptr;
                end   = newText.ptr + newText.length;
        }

        // Build a 64 entry lookup table (evaluated at compile time) holding
        // 0 for each character listed in 'stops' and 1 for everything else.
        private static ubyte[64] stopTable (string stops)
        {
                ubyte[64] table = 1;
                foreach (c; stops)
                         table[c] = 0;
                return table;
        }

        // Indexed by character codes 0..63: non-zero where the character
        // does not terminate an element name. Zero for NUL, tab, LF, CR,
        // space and  / : > ?
        __gshared immutable ubyte[64] name = stopTable ("\x00\t\n\r /:>?");

        // As above for attribute names (also used for the local part of a
        // prefixed name). Zero for NUL, tab, LF, CR, space and  ! / : < = > ?
        __gshared immutable ubyte[64] attributeName = stopTable ("\x00\t\n\r !/:<=>?");
}
674 
675 /*******************************************************************************
676 
677 *******************************************************************************/
678 
debug (UnitTest)
{
        /***********************************************************************

                Walk the parser over testXML and verify each token in turn.
                The leading xml declaration yields no token (it is consumed
                when the parser is reset), so the doctype comes first.

        ***********************************************************************/

        void testParser(Ch)(PullParser!(Ch) itr)
        {
                // <!DOCTYPE element [ ... ]>
                assert(itr.next != XmlTokenType.Done);
                assert(itr.value == "element [ <!ELEMENT element (#PCDATA)>]");
                assert(itr.type == XmlTokenType.Doctype);

                // <element
                assert(itr.next != XmlTokenType.Done);
                assert(itr.localName == "element");
                assert(itr.type == XmlTokenType.StartElement);
                assert(itr.depth == 0);

                // attr="1"
                assert(itr.next != XmlTokenType.Done);
                assert(itr.localName == "attr");
                assert(itr.value == "1");

                // attr2="two"
                assert(itr.next != XmlTokenType.Done);
                assert(itr.type == XmlTokenType.Attribute);
                assert(itr.localName == "attr2");
                assert(itr.value == "two");

                // <!--comment-->
                assert(itr.next != XmlTokenType.Done);
                assert(itr.value == "comment");

                // character data, left undecoded
                assert(itr.next != XmlTokenType.Done);
                assert(itr.rawValue == "test&amp;&#x5a;");

                // <qual:elem />
                assert(itr.next != XmlTokenType.Done);
                assert(itr.prefix == "qual");
                assert(itr.localName == "elem");
                assert(itr.next != XmlTokenType.Done);
                assert(itr.type == XmlTokenType.EndEmptyElement);

                // <el2 attr3 = '3three'>
                assert(itr.next != XmlTokenType.Done);
                assert(itr.localName == "el2");
                assert(itr.depth == 1);
                assert(itr.next != XmlTokenType.Done);
                assert(itr.localName == "attr3");
                assert(itr.value == "3three", itr.value);

                // <![CDATA[sdlgjsh]]>
                assert(itr.next != XmlTokenType.Done);
                assert(itr.rawValue == "sdlgjsh");

                // <el3 />
                assert(itr.next != XmlTokenType.Done);
                assert(itr.localName == "el3");
                assert(itr.depth == 2);
                assert(itr.next != XmlTokenType.Done);
                assert(itr.type == XmlTokenType.EndEmptyElement);

                // data
                assert(itr.next != XmlTokenType.Done);
                assert(itr.value == "data");

                // <?pi test?>
                assert(itr.next != XmlTokenType.Done);
                assert(itr.rawValue == "pi test", itr.rawValue);

                // </el2>
                assert(itr.next != XmlTokenType.Done);
                assert(itr.localName == "el2");

                // </element>
                assert(itr.next != XmlTokenType.Done);
                assert(itr.localName == "element");

                // nothing more
                assert(itr.next == XmlTokenType.Done);
        }


        /***********************************************************************

                Test document, written one construct per line

        ***********************************************************************/

        __gshared immutable string testXML =
                "<?xml version=\"1.0\" ?>" ~
                "<!DOCTYPE element [ <!ELEMENT element (#PCDATA)>]>" ~
                "<element attr=\"1\" attr2=\"two\">" ~
                "<!--comment-->test&amp;&#x5a;<qual:elem />" ~
                "<el2 attr3 = '3three'>" ~
                "<![CDATA[sdlgjsh]]><el3 />data<?pi test?>" ~
                "</el2></element>";

        unittest
        {
                testParser (new PullParser!(char)(testXML));
        }
}