tango.text.convert.UnicodeBom source code

1 /*******************************************************************************
2 
3         copyright:      Copyright (c) 2004 Kris Bell. All rights reserved
4 
5         license:        BSD style: $(LICENSE)
6 
7         version:        Initial release: December 2005      
8         
9         author:         Kris
10 
11 *******************************************************************************/
12 
13 module tango.text.convert.UnicodeBom;
14 
15 private import  tango.core.ByteSwap;
16 
17 private import  Utf = tango.text.convert.Utf;
18 
19 
20 private extern (C) void onUnicodeError (const(char[]) msg, size_t idx = 0);
21 
22 /*******************************************************************************
23 
24         see http://icu.sourceforge.net/docs/papers/forms_of_unicode/#t2
25 
26 *******************************************************************************/
27 
enum Encoding
{
        Unknown,        // encoding not yet known
        UTF_8N,         // UTF-8
        UTF_8,          // UTF-8
        UTF_16,         // UTF-16, byte order not specified
        UTF_16BE,       // UTF-16, big-endian
        UTF_16LE,       // UTF-16, little-endian
        UTF_32,         // UTF-32, byte order not specified
        UTF_32BE,       // UTF-32, big-endian
        UTF_32LE,       // UTF-32, little-endian
}
39 
40 /*******************************************************************************
41 
42         Convert unicode content
43 
44         Unicode is an encoding of textual material. The purpose of this module 
45         is to interface external-encoding with a programmer-defined internal-
46         encoding. This internal encoding is declared via the template argument 
47         T, whilst the external encoding is either specified or derived.
48 
49         Three internal encodings are supported: char, wchar, and dchar. The
50         methods herein operate upon arrays of this type. That is, decode()
51         returns an array of the type, while encode() expects an array of said
52         type.
53 
54         Supported external encodings are as follows:
55 
56                 Encoding.Unknown 
57                 Encoding.UTF_8N
58                 Encoding.UTF_8
59                 Encoding.UTF_16
60                 Encoding.UTF_16BE
61                 Encoding.UTF_16LE 
62                 Encoding.UTF_32 
63                 Encoding.UTF_32BE
64                 Encoding.UTF_32LE 
65 
66         These can be divided into non-explicit and explicit encodings:
67 
68                 Encoding.Unknown 
69                 Encoding.UTF_8
70                 Encoding.UTF_16
71                 Encoding.UTF_32 
72 
73 
74                 Encoding.UTF_8N
75                 Encoding.UTF_16BE
76                 Encoding.UTF_16LE 
77                 Encoding.UTF_32BE
78                 Encoding.UTF_32LE 
79         
80         The former group of non-explicit encodings may be used to 'discover'
81         an unknown encoding, by examining the first few bytes of the content
82         for a signature. This signature is optional, but is often written such 
83         that the content is self-describing. When an encoding is unknown, using 
84         one of the non-explicit encodings will cause the decode() method to look 
85         for a signature and adjust itself accordingly. It is possible that a 
86         ZWNBSP character might be confused with the signature; today's unicode 
87         content is supposed to use the WORD-JOINER character instead.
88        
89         The group of explicit encodings are for use when the content encoding 
90         is known. These *must* be used when converting back to external encoding, 
91         since written content must be in a known format. It should be noted that, 
92         during a decode() operation, the existence of a signature is in conflict 
93         with these explicit varieties.
94 
95 
96         See 
97         $(LINK http://www.utf-8.com/)
98         $(LINK http://www.hackcraft.net/xmlUnicode/)
99         $(LINK http://www.unicode.org/faq/utf_bom.html/)
100         $(LINK http://www.azillionmonkeys.com/qed/unicode.html/)
101         $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)
102 
103 *******************************************************************************/
104 
class UnicodeBom(T) : BomSniffer
{
        // Unsupported element types are rejected at compile time. A mere
        // pragma(msg) would let the instantiation compile into a converter
        // that silently produces nothing.
        static assert (is (T == char) || is (T == wchar) || is (T == dchar),
                       "Template type must be char, wchar, or dchar");

        /***********************************************************************

                Construct an instance using the given external encoding ~ one
                of the Encoding.xx types

        ***********************************************************************/

        this (Encoding encoding)
        {
                setup (encoding);
        }

        /***********************************************************************

                Convert the provided external content into an array of T.

                The content is inspected for a BOM signature, which is
                stripped. onUnicodeError() is invoked if a signature is
                present when, according to the current encoding, it should
                not be. Conversely, it is invoked when no known signature
                is found, the current encoding expects one, and that
                encoding has no fallback.

                When the current encoding expects a signature, this
                instance is reconfigured as a side effect: to the detected
                encoding when a BOM was found, otherwise to the fallback
                encoding of the current one.

                Where 'ate' is provided, it will be set to the amount
                consumed from the input and the decoder operates in
                streaming-mode. That is: 'dst' should be supplied since
                it is not resized or allocated. The length of a stripped
                BOM (in bytes) is added to the reported value.

                Note that 'content' is handed to swapBytes() as is, so
                the caller's buffer is byte-swapped in place whenever the
                encoding's byte order differs from the host's.

                NOTE(review): 'ate' mixes units for UTF-16/32 input ~ the
                BOM length is in bytes while the converters presumably
                count elements. Confirm what callers expect.

        ***********************************************************************/

        final T[] decode (void[] content, T[] dst=null, size_t* ate=null)
        {
                // look for a BOM
                auto info = test (content);

                if (settings.test)
                   {
                   // the current encoding expects a signature
                   if (info)
                      {
                      // got one ~ adopt the encoding it describes
                      setup (info.encoding, true);

                      // strip BOM from content
                      content = content [info.bom.length .. $];
                      }
                   else
                      {
                      // can this encoding be defaulted?
                      if (settings.fallback)
                          setup (settings.fallback, false);
                      else
                         onUnicodeError ("UnicodeBom.decode :: unknown or missing BOM");
                      }
                   }
                else
                   {
                   // found a BOM when using an explicit encoding
                   if (info)
                       onUnicodeError ("UnicodeBom.decode :: explicit encoding does not permit BOM");
                   }

                // convert it to internal representation
                auto ret = into (swapBytes(content), settings.type, dst, ate);

                // account for the signature removed above
                if (ate && info)
                    *ate += info.bom.length;
                return ret;
        }

        /***********************************************************************

                Perform encoding of content. Note that the encoding must be
                of the explicit variety by the time we get here; otherwise
                onUnicodeError() is invoked.

                The caller's 'content' is left untouched: when no
                transcoding is required and the bytes have to be swapped,
                the swap is applied to a copy.

        ***********************************************************************/

        final void[] encode (T[] content, void[] dst=null)
        {
                if (settings.test)
                    onUnicodeError ("UnicodeBom.encode :: cannot write to a non-specific encoding");

                // convert it to external representation
                auto ret = from (content, settings.type, dst);

                // from() hands back 'content' itself when T already matches
                // the external element type. Swapping that in place would
                // corrupt the caller's text, so swap a private copy instead.
                if (ret.length && ret.ptr is cast(void*) content.ptr && swapRequired())
                    ret = ret.dup;

                return swapBytes (ret);
        }

        /***********************************************************************

                Does the current encoding store its elements in a byte
                order different from that of the host?

        ***********************************************************************/

        private final bool swapRequired ()
        {
                // on a little-endian host it is the big-endian encodings
                // which need swapping; on a big-endian host the reverse
                bool swap = settings.bigEndian;

                version (BigEndian)
                         swap = !swap;

                return settings.endian && swap;
        }

        /***********************************************************************

                Swap bytes around, as required by the encoding. The swap
                is applied directly to 'content', which is also returned.

        ***********************************************************************/

        private final void[] swapBytes (void[] content)
        {
                if (swapRequired())
                   {
                   // only Utf16 and Utf32 entries have endian concerns
                   if (settings.type == Utf16)
                       ByteSwap.swap16 (content.ptr, content.length);
                   else
                       ByteSwap.swap32 (content.ptr, content.length);
                   }
                return content;
        }

        /***********************************************************************

                Convert from 'type' (one of Utf8, Utf16, Utf32) into the
                given T.

                When 'type' already corresponds to T, the input is simply
                reinterpreted as T[] without copying. An unrecognized
                'type' yields null.

                Where 'ate' is provided, it will be set to the amount
                consumed from the input and the decoder operates in
                streaming-mode. That is: 'dst' should be supplied since
                it is not resized or allocated.

                NOTE(review): in the no-conversion case 'ate' receives
                x.length, which is a byte count because x is void[].

        ***********************************************************************/

        static T[] into (void[] x, uint type, T[] dst=null, size_t* ate = null)
        {
                // the 'type' value which corresponds to T itself
                static if (is (T == char))
                           enum native = Utf8;
                else static if (is (T == wchar))
                           enum native = Utf16;
                else
                           enum native = Utf32;

                // same representation ~ nothing to convert
                if (type == native)
                   {
                   if (ate)
                       *ate = x.length;
                   return cast(T[]) x;
                   }

                static if (is (T == char))
                          {
                          if (type == Utf16)
                              return Utf.toString (cast(wchar[]) x, dst, ate);
                          if (type == Utf32)
                              return Utf.toString (cast(dchar[]) x, dst, ate);
                          }
                else static if (is (T == wchar))
                          {
                          if (type == Utf8)
                              return Utf.toString16 (cast(char[]) x, dst, ate);
                          if (type == Utf32)
                              return Utf.toString16 (cast(dchar[]) x, dst, ate);
                          }
                else
                          {
                          if (type == Utf8)
                              return Utf.toString32 (cast(char[]) x, dst, ate);
                          if (type == Utf16)
                              return Utf.toString32 (cast(wchar[]) x, dst, ate);
                          }

                return null;
        }


        /***********************************************************************

                Convert from T into the given 'type' (one of Utf8, Utf16,
                Utf32).

                When 'type' already corresponds to T, 'x' itself is
                returned without copying. An unrecognized 'type' yields
                null.

                Where 'ate' is provided, it will be set to the number of
                elements consumed from 'x'. 'dst' is passed on to the Utf
                converter as the output buffer.

        ***********************************************************************/

        static void[] from (T[] x, uint type, void[] dst=null, size_t* ate=null)
        {
                // the 'type' value which corresponds to T itself
                static if (is (T == char))
                           enum native = Utf8;
                else static if (is (T == wchar))
                           enum native = Utf16;
                else
                           enum native = Utf32;

                // same representation ~ nothing to convert
                if (type == native)
                   {
                   if (ate)
                       *ate = x.length;
                   return x;
                   }

                static if (is (T == char))
                          {
                          if (type == Utf16)
                              return Utf.toString16 (x, cast(wchar[]) dst, ate);
                          if (type == Utf32)
                              return Utf.toString32 (x, cast(dchar[]) dst, ate);
                          }
                else static if (is (T == wchar))
                          {
                          if (type == Utf8)
                              return Utf.toString (x, cast(char[]) dst, ate);
                          if (type == Utf32)
                              return Utf.toString32 (x, cast(dchar[]) dst, ate);
                          }
                else
                          {
                          if (type == Utf8)
                              return Utf.toString (x, cast(char[]) dst, ate);
                          if (type == Utf16)
                              return Utf.toString16 (x, cast(wchar[]) dst, ate);
                          }

                return null;
        }
}
343 
344 
345 
346 /*******************************************************************************
347 
348         Handle byte-order-mark prefixes  
349 
350 *******************************************************************************/
351 
class BomSniffer
{
        private bool            found;        // was an encoding discovered?
        private Encoding        encoder;      // the current encoding
        private const(Info)*    settings;     // pointer to encoding configuration
                                              // (null until setup() is called)

        // configuration of one Encoding.xx value
        private struct  Info
                {
                int           type;          // type of element (char/wchar/dchar)
                Encoding      encoding;      // Encoding.xx encoding
                const(char)[] bom;           // pattern to match for signature
                bool          test,          // should we test for this encoding?
                              endian,        // this encoding have endian concerns?
                              bigEndian;     // is this a big-endian encoding?
                Encoding      fallback;      // can this encoding be defaulted?
                }

        // element type identifiers, stored in Info.type
        private enum {Utf8, Utf16, Utf32}

        // One row per Encoding member, in declaration order: setup() uses
        // the Encoding value as an index into this table
        private __gshared const Info[] lookup =
        [
        {Utf8,  Encoding.Unknown,  null,        true,  false, false, Encoding.UTF_8},
        {Utf8,  Encoding.UTF_8N,   null,        true,  false, false, Encoding.UTF_8},
        {Utf8,  Encoding.UTF_8,    "\xef\xbb\xbf",   false},
        {Utf16, Encoding.UTF_16,   null,        true,  false, false, Encoding.UTF_16BE},
        {Utf16, Encoding.UTF_16BE, "\xfe\xff",     false, true, true},
        {Utf16, Encoding.UTF_16LE, "\xff\xfe",     false, true},
        {Utf32, Encoding.UTF_32,   null,        true,  false, false, Encoding.UTF_32BE},
        {Utf32, Encoding.UTF_32BE, "\x00\x00\xfe\xff", false, true, true},
        {Utf32, Encoding.UTF_32LE, "\xff\xfe\x00\x00", false, true},
        ];

        /***********************************************************************

                Return the current encoding. This is either the originally
                specified encoding, or a derived one obtained by inspecting
                the content for a BOM. The latter is performed as part of
                the decode() method

        ***********************************************************************/

        @property final Encoding encoding ()
        {
                return encoder;
        }

        /***********************************************************************

                Was an encoding located in the text (configured via setup)

        ***********************************************************************/

        @property final bool encoded ()
        {
                return found;
        }

        /***********************************************************************

                Return the signature (BOM) of the current encoding. The
                result is null where the encoding has no signature, or
                where this instance has not been configured via setup()

        ***********************************************************************/

        @property final const(void)[] signature ()
        {
                // no configuration yet ~ avoid dereferencing a null pointer
                if (settings is null)
                    return null;

                return settings.bom;
        }

        /***********************************************************************

                Configure this instance for the given encoding, and record
                whether that encoding was discovered in the content

        ***********************************************************************/

        final void setup (Encoding encoding, bool found = false)
        {
                this.encoder  = encoding;
                this.found    = found;
                this.settings = &lookup[encoding];
        }

        /***********************************************************************

                Scan the BOM signatures looking for one which prefixes the
                content. We scan in reverse order to get the longest match
                first: the four-byte UTF_32LE signature begins with the two
                bytes of the UTF_16LE one, and so must be tried earlier.

                Returns the matching table entry, or null where the content
                does not begin with any known signature

        ***********************************************************************/

        static final const(Info)* test (void[] content)
        {
                foreach_reverse (ref entry; lookup)
                        {
                        auto sig = entry.bom;

                        // entries without a signature cannot match
                        if (sig.ptr is null || sig.length > content.length)
                            continue;

                        if (content[0 .. sig.length] == sig)
                            return &entry;
                        }
                return null;
        }
}
453 
454 /*******************************************************************************
455 
456 *******************************************************************************/
457 
debug (UnitTest)
{
        unittest
        {
                // UTF-8 text, without and with a leading UTF-8 signature
                void[] plain  = "abc\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86".dup;
                void[] signed = "\xef\xbb\xbf" ~ plain;

                auto   bom = new UnicodeBom!(char)(Encoding.Unknown);
                size_t ate;
                char[256] buf;

                // decode the input; all of it should be reported as consumed
                // and the decoder should be configured for UTF_8 afterwards
                void check (void[] input)
                {
                        auto temp = bom.decode (input, buf, &ate);
                        assert (ate == input.length);
                        assert (bom.encoding == Encoding.UTF_8);
                }

                // the signature switches the decoder from Unknown to UTF_8 ...
                check (signed);

                // ... after which content without a signature is accepted
                check (plain);
        }
}
477 
debug (UnicodeBom)
{
        import tango.io.Stdout;

        void main()
        {
                // UTF-8 text, without and with a leading UTF-8 signature
                void[] plain  = "abc\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86".dup;
                void[] signed = "\xef\xbb\xbf" ~ plain;

                auto   bom = new UnicodeBom!(char)(Encoding.Unknown);
                size_t ate;
                char[256] buf;

                // decode the input; the result should equal the unsigned
                // text, all input should be reported as consumed, and the
                // decoder should be configured for UTF_8 afterwards
                void check (void[] input)
                {
                        auto temp = bom.decode (input, buf, &ate);
                        assert (temp == plain);
                        assert (ate == input.length);
                        assert (bom.encoding == Encoding.UTF_8);
                }

                // the signature switches the decoder from Unknown to UTF_8 ...
                check (signed);

                // ... after which content without a signature is accepted
                check (plain);
        }
}