1 /*******************************************************************************
2 
3         Copyright: Copyright (C) 2008 Kris Bell.  All rights reserved.
4 
5         License:   BSD style: $(LICENSE)
6 
7         version:   Aug 2008: Initial release
8 
9         Authors:   Kris
10 
11 *******************************************************************************/
12 
13 module tango.text.xml.DocEntity;
14 
15 private import Util = tango.text.Util;
16 
/******************************************************************************

        Convert XML entity patterns to normal characters

        <pre>
        &amp;  => &
        &quot; => "
        &apos; => '
        &lt;   => <
        &gt;   => >
        </pre>

        Only the five patterns above are translated. Any other '&' is
        copied to the output unchanged.

        Params:
        src = the text to decode
        dst = optional workspace. It is resized when shorter than src

        Returns:
        A slice of dst holding the decoded text. When src contains no
        '&' at all the result is a plain copy of src.

******************************************************************************/

T[] fromEntity (T) (const(T)[] src, T[] dst = null)
{
        ptrdiff_t delta;
        auto s = src.ptr;
        auto len = src.length;

        // take a peek first to see if there's anything (an index equal
        // to len is treated as "no '&' found")
        if ((delta = Util.indexOf (s, '&', len)) < len)
           {
           // make some room if not enough provided. The output is never
           // longer than the input, since each entity shrinks to one char
           if (dst.length < src.length)
               dst.length = src.length;
           auto d = dst.ptr;

           // copy segments over, a chunk at a time
           do {
              d [0 .. delta] = s [0 .. delta];
              len -= delta;
              s += delta;
              d += delta;

              // translate entity: s[0] is the '&', and len still counts it.
              // token becomes the number of input chars consumed
              auto token = 0;

              // pointer indexing is not bounds checked, so s[1] may only
              // be inspected when the '&' is not the last input character
              if (len > 1)
                  switch (s[1])
                         {
                         case 'a':
                              if (len > 4 && s[1..5] == "amp;")
                                 {
                                 *d++ = '&';
                                 token = 5;
                                 }
                              else
                                 if (len > 5 && s[1..6] == "apos;")
                                    {
                                    *d++ = '\'';
                                    token = 6;
                                    }
                              break;

                         case 'g':
                              if (len > 3 && s[1..4] == "gt;")
                                 {
                                 *d++ = '>';
                                 token = 4;
                                 }
                              break;

                         case 'l':
                              if (len > 3 && s[1..4] == "lt;")
                                 {
                                 *d++ = '<';
                                 token = 4;
                                 }
                              break;

                         case 'q':
                              if (len > 5 && s[1..6] == "quot;")
                                 {
                                 *d++ = '"';
                                 token = 6;
                                 }
                              break;

                         default:
                              break;
                         }

              // not a recognised entity: pass the '&' through as is
              if (token is 0)
                 {
                 *d++ = '&';
                 token = 1;
                 }

              s += token;
              len -= token;
              } while ((delta = Util.indexOf (s, '&', len)) < len);

           // copy tail too
           d [0 .. len] = s [0 .. len];
           return dst [0 .. (d + len) - dst.ptr];
           }

        // nothing to translate: hand back a copy of the input
        if (dst.length < src.length)
            dst.length = src.length;
        dst[0..src.length] = src;
        return dst[0..src.length];
}
98 
99 
/******************************************************************************

        Convert XML entity patterns to normal characters

        <pre>
        &amp;  => &
        &quot; => "
        &apos; => '
        &lt;   => <
        &gt;   => >
        </pre>

        This variant does not require an interim workspace, and instead
        emits directly via the provided delegate

        Only the five patterns above are translated. Any other '&' is
        emitted unchanged.

        Params:
        src  = the text to decode
        emit = called with each successive piece of output, in order.
               Pieces between entities are slices of src and may be
               empty (for example when src begins with '&')

******************************************************************************/

void fromEntity (T) (const(T)[] src, scope void delegate(const(T)[]) emit)
{
        ptrdiff_t delta;
        auto s = src.ptr;
        auto len = src.length;

        // take a peek first to see if there's anything (an index equal
        // to len is treated as "no '&' found")
        if ((delta = Util.indexOf (s, '&', len)) < len)
           {
           // copy segments over, a chunk at a time
           do {
              emit (s [0 .. delta]);
              len -= delta;
              s += delta;

              // translate entity: s[0] is the '&', and len still counts it.
              // token becomes the number of input chars consumed
              auto token = 0;

              // pointer indexing is not bounds checked, so s[1] may only
              // be inspected when the '&' is not the last input character
              if (len > 1)
                  switch (s[1])
                         {
                         case 'a':
                              if (len > 4 && s[1..5] == "amp;")
                                 {
                                 emit ("&");
                                 token = 5;
                                 }
                              else
                                 if (len > 5 && s[1..6] == "apos;")
                                    {
                                    emit ("'");
                                    token = 6;
                                    }
                              break;

                         case 'g':
                              if (len > 3 && s[1..4] == "gt;")
                                 {
                                 emit (">");
                                 token = 4;
                                 }
                              break;

                         case 'l':
                              if (len > 3 && s[1..4] == "lt;")
                                 {
                                 emit ("<");
                                 token = 4;
                                 }
                              break;

                         case 'q':
                              if (len > 5 && s[1..6] == "quot;")
                                 {
                                 emit ("\"");
                                 token = 6;
                                 }
                              break;

                         default:
                              break;
                         }

              // not a recognised entity: pass the '&' through as is
              if (token is 0)
                 {
                 emit ("&");
                 token = 1;
                 }

              s += token;
              len -= token;
              } while ((delta = Util.indexOf (s, '&', len)) < len);

           // copy tail too
           emit (s [0 .. len]);
           }
        else
           emit (src);
}
173 
174 
/******************************************************************************

        Convert reserved chars to entities. For example: " => &quot;

        <pre>
        "  => &quot;
        >  => &gt;
        <  => &lt;
        &  => &amp;
        '  => &apos;
        </pre>

        A slice of the provided output buffer is returned. The output
        buffer should be sufficiently large to accommodate the converted
        output, or it will be resized, in which case the result may no
        longer refer to the caller's buffer. A buffer of exactly the
        required size is used as is.

        Params:
        src = the text to encode
        dst = optional output buffer

        Returns:
        A slice of dst holding the encoded text. When src contains no
        reserved chars the result is a plain copy of src.

******************************************************************************/

T[] toEntity(T) (const(T)[] src, T[] dst = null)
{
        const(T)[]  entity;
        auto s = src.ptr;
        auto t = s;                     // start of input not yet copied
        auto e = s + src.length;

        // number of elements written to dst so far. Kept as size_t so it
        // cannot truncate when mixed with array lengths on 64-bit targets
        size_t index = 0;

        while (s < e)
               switch (*s)
                      {
                      case '"':
                           entity = "&quot;";
                           goto common;

                      case '>':
                           entity = "&gt;";
                           goto common;

                      case '<':
                           entity = "&lt;";
                           goto common;

                      case '&':
                           entity = "&amp;";
                           goto common;

                      case '\'':
                           entity = "&apos;";
                           goto common;

                      common:
                           // s never trails t, so the difference is non-negative
                           auto len = cast(size_t) (s - t);

                           // grow only when the pending text plus the entity
                           // really does not fit. index never exceeds
                           // dst.length, so the new length always suffices
                           if (dst.length < index + len + entity.length)
                               dst.length = (dst.length + len + entity.length) + dst.length / 2;

                           // flush the untouched text preceding this char
                           dst [index .. index + len] = t [0 .. len];
                           index += len;

                           // then the replacement entity
                           dst [index .. index + entity.length] = entity;
                           index += entity.length;
                           t = ++s;
                           break;

                      default:
                           ++s;
                           break;
                      }


        // did we change anything?
        if (index)
           {
           // copy tail too
           auto len = cast(size_t) (e - t);
           if (dst.length < index + len)
               dst.length = index + len;

           dst [index .. index + len] = t [0 .. len];
           return dst [0 .. index + len];
           }

        // nothing to encode: hand back a copy of the input
        if (dst.length < src.length)
            dst.length = src.length;
        dst[0..src.length] = src;
        return dst[0..src.length];
}
252 
253 
/******************************************************************************

        Convert reserved chars to entities. For example: " => &quot;

        This variant does not require an interim workspace, and instead
        emits directly via the provided delegate

        Params:
        src  = the text to encode
        emit = called with each successive piece of output, in order:
               non-empty runs of untouched text (slices of src), the
               replacement entities, and finally the remaining tail
               (which may be empty). When nothing needs replacing, emit
               is called once with src itself

******************************************************************************/

void toEntity(T) (const(T)[] src, scope void delegate(const(T)[]) emit)
{
        auto cursor   = src.ptr;                // current scan position
        auto pending  = cursor;                 // start of text not yet emitted
        const limit   = cursor + src.length;    // one past the last element
        bool replaced = false;                  // has any entity been emitted?

        for (; cursor < limit; ++cursor)
            {
            const(T)[] substitute;

            switch (*cursor)
                   {
                   case '"':  substitute = "&quot;"; break;
                   case '>':  substitute = "&gt;";   break;
                   case '<':  substitute = "&lt;";   break;
                   case '&':  substitute = "&amp;";  break;
                   case '\'': substitute = "&apos;"; break;

                   // ordinary char: leave it pending and move along
                   default:   continue;
                   }

            // flush whatever plain text precedes the reserved char
            if (cursor > pending)
                emit (pending [0 .. cursor - pending]);

            emit (substitute);
            pending  = cursor + 1;
            replaced = true;
            }

        // did we change anything? Copy tail also
        if (replaced)
            emit (pending [0 .. limit - pending]);
        else
           emit (src);
}
311 
312 
313 
/*******************************************************************************

        Self-test, compiled only when the DocEntity debug identifier is set

*******************************************************************************/

debug (DocEntity)
{
        import tango.io.Console;

        void main()
        {
                // decoding input must yield expected
                void checkDecode (string input, string expected)
                {
                        auto decoded = fromEntity (input);
                        assert (decoded == expected);
                }

                // encoding input must yield expected
                void checkEncode (string input, string expected)
                {
                        auto encoded = toEntity (input);
                        assert (encoded == expected);
                }

                // each entity on its own, then in combination
                checkDecode ("&amp;", "&");
                checkDecode ("&quot;", "\"");
                checkDecode ("&apos;", "'");
                checkDecode ("&gt;", ">");
                checkDecode ("&lt;", "<");
                checkDecode ("&lt;&amp;&apos;", "<&'");
                checkDecode ("*&lt;&amp;&apos;*", "*<&'*");

                // plain text, trailing '&', and unrecognised patterns
                checkDecode ("abc", "abc");
                checkDecode ("abc&", "abc&");
                checkDecode ("abc&lt;", "abc<");
                checkDecode ("abc&gt;goo", "abc>goo");
                checkDecode ("&amp;", "&");
                checkDecode ("&quot;&apos;", "\"'");
                checkDecode ("&q&s", "&q&s");

                // each reserved char, then mixed with ordinary text
                checkEncode (">", "&gt;");
                checkEncode ("<", "&lt;");
                checkEncode ("&", "&amp;");
                checkEncode ("'", "&apos;");
                checkEncode ("\"", "&quot;");
                checkEncode ("^^>*>*", "^^&gt;*&gt;*");
        }
}