/*******************************************************************************

        copyright:      Copyright (c) 2004 Kris Bell. All rights reserved

        license:        BSD style: $(LICENSE)

        version:        Initial release: December 2005

        author:         Kris

*******************************************************************************/

module tango.text.convert.UnicodeBom;

private import  tango.core.ByteSwap;

private import  Utf = tango.text.convert.Utf;


private extern (C) void onUnicodeError (const(char[]) msg, size_t idx = 0);

/*******************************************************************************

        see http://icu.sourceforge.net/docs/papers/forms_of_unicode/#t2

        The numeric value of each member is used as an index into
        BomSniffer.lookup, so the order here must match that table.

*******************************************************************************/

enum Encoding
{
        Unknown,
        UTF_8N,
        UTF_8,
        UTF_16,
        UTF_16BE,
        UTF_16LE,
        UTF_32,
        UTF_32BE,
        UTF_32LE,
}

/*******************************************************************************

        Convert unicode content

        Unicode is an encoding of textual material. The purpose of this module
        is to interface external-encoding with a programmer-defined internal-
        encoding. This internal encoding is declared via the template argument
        T, whilst the external encoding is either specified or derived.

        Three internal encodings are supported: char, wchar, and dchar. The
        methods herein operate upon arrays of this type. That is, decode()
        returns an array of the type, while encode() expects an array of said
        type.

        Supported external encodings are as follows:

                Encoding.Unknown
                Encoding.UTF_8N
                Encoding.UTF_8
                Encoding.UTF_16
                Encoding.UTF_16BE
                Encoding.UTF_16LE
                Encoding.UTF_32
                Encoding.UTF_32BE
                Encoding.UTF_32LE

        These can be divided into non-explicit and explicit encodings:

                Encoding.Unknown
                Encoding.UTF_8
                Encoding.UTF_16
                Encoding.UTF_32


                Encoding.UTF_8N
                Encoding.UTF_16BE
                Encoding.UTF_16LE
                Encoding.UTF_32BE
                Encoding.UTF_32LE

        The former group of non-explicit encodings may be used to 'discover'
        an unknown encoding, by examining the first few bytes of the content
        for a signature. This signature is optional, but is often written such
        that the content is self-describing. When an encoding is unknown, using
        one of the non-explicit encodings will cause the decode() method to look
        for a signature and adjust itself accordingly. It is possible that a
        ZWNBSP character might be confused with the signature; today's unicode
        content is supposed to use the WORD-JOINER character instead.

        The group of explicit encodings are for use when the content encoding
        is known. These *must* be used when converting back to external encoding,
        since written content must be in a known format. It should be noted that,
        during a decode() operation, the existence of a signature is in conflict
        with these explicit varieties.


        See
        $(LINK http://www.utf-8.com/)
        $(LINK http://www.hackcraft.net/xmlUnicode/)
        $(LINK http://www.unicode.org/faq/utf_bom.html/)
        $(LINK http://www.azillionmonkeys.com/qed/unicode.html/)
        $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)

*******************************************************************************/

class UnicodeBom(T) : BomSniffer
{
        // into() and from() only have branches for these three types; any
        // other T would yield converters that silently return null, so the
        // instantiation is rejected outright
        static assert (is (T == char) || is (T == wchar) || is (T == dchar),
                       "Template type must be char, wchar, or dchar");

        /***********************************************************************

                Construct an instance using the given external encoding ~ one
                of the Encoding.xx types

        ***********************************************************************/

        this (Encoding encoding)
        {
                setup (encoding);
        }

        /***********************************************************************

                Convert the provided content. The content is inspected
                for a BOM signature, which is stripped. An exception is
                thrown if a signature is present when, according to the
                encoding type, it should not be. Conversely, an exception
                is thrown if there is no known signature where the current
                encoding expects one to be present.

                Where 'ate' is provided, it will be set to the number of
                bytes consumed from 'content' (including any stripped
                signature) and the decoder operates in streaming-mode.
                That is: 'dst' should be supplied since it is not resized
                or allocated.

                The content is passed through swapBytes() before it is
                converted; see the note on that method regarding the
                caller's buffer.

        ***********************************************************************/

        final T[] decode (void[] content, T[] dst=null, size_t* ate=null)
        {
                // look for a BOM
                auto info = test (content);

                // are we expecting a BOM?
                if (lookup[encoding].test)
                {
                        if (info)
                        {
                                // yep ~ and we got one
                                setup (info.encoding, true);

                                // strip BOM from content
                                content = content [info.bom.length .. $];
                        }
                        else
                        {
                                // can this encoding be defaulted?
                                if (settings.fallback)
                                        setup (settings.fallback, false);
                                else
                                        onUnicodeError ("UnicodeBom.decode :: unknown or missing BOM");
                        }
                }
                else
                {
                        if (info)
                                // found a BOM when using an explicit encoding
                                onUnicodeError ("UnicodeBom.decode :: explicit encoding does not permit BOM");
                }

                // convert it to internal representation
                auto ret = into (swapBytes(content), settings.type, dst, ate);

                // into() reports bytes of the stripped content; account for
                // the signature so the total is relative to the original input
                if (ate && info)
                        *ate += info.bom.length;
                return ret;
        }

        /***********************************************************************

                Perform encoding of content. Note that the encoding must be
                of the explicit variety by the time we get here

                When T already matches the external element type, from()
                hands back 'content' itself, and that same array is then
                passed to swapBytes().

        ***********************************************************************/

        final void[] encode (T[] content, void[] dst=null)
        {
                if (settings.test)
                        onUnicodeError ("UnicodeBom.encode :: cannot write to a non-specific encoding");

                // convert it to external representation, and write
                return swapBytes (from (content, settings.type, dst));
        }

        /***********************************************************************

                Swap bytes around, as required by the encoding

                A swap is performed only for encodings flagged as having
                endian concerns, and only when the encoding's byte order
                differs from the host (the bigEndian flag is inverted on
                BigEndian builds).

                NOTE(review): ByteSwap is handed the buffer's pointer and
                the same slice is returned, so the swap presumably happens
                in place ~ callers of decode() and encode() should expect
                the supplied buffer to be altered. Confirm against ByteSwap.

        ***********************************************************************/

        private final void[] swapBytes (void[] content)
        {
                bool endian = settings.endian;
                bool swap   = settings.bigEndian;

                version (BigEndian)
                         swap = !swap;

                if (endian && swap)
                {
                        if (settings.type == Utf16)
                                ByteSwap.swap16 (content.ptr, content.length);
                        else
                                ByteSwap.swap32 (content.ptr, content.length);
                }
                return content;
        }

        /***********************************************************************

                Convert from 'type' into the given T.

                Where 'ate' is provided, it will be set to the number of
                bytes of 'x' consumed and the decoder operates in
                streaming-mode. That is: 'dst' should be supplied since
                it is not resized or allocated.

                Params:
                x    = raw external content, in host byte order
                type = one of Utf8, Utf16, Utf32 describing 'x'
                dst  = optional output buffer, forwarded to Utf
                ate  = optional; receives the consumed byte count

                Returns: the content as T[]. When 'type' already matches T
                this is 'x' itself, re-typed without copying. An unrecognised
                'type' yields null and leaves 'ate' untouched.

        ***********************************************************************/

        static T[] into (void[] x, uint type, T[] dst=null, size_t* ate = null)
        {
                T[] ret;

                // Size in bytes of one source element for the branches which
                // delegate to Utf; stays zero otherwise. The pass-through
                // branches store x.length, which is already a byte count.
                // NOTE(review): assumes Utf reports 'ate' in elements of the
                // array it is given ~ confirm against tango.text.convert.Utf
                size_t unit = 0;

                static if (is (T == char))
                {
                        if (type == Utf8)
                        {
                                if (ate)
                                        *ate = x.length;
                                ret = cast(char[]) x;
                        }
                        else if (type == Utf16)
                        {
                                ret = Utf.toString (cast(wchar[]) x, dst, ate);
                                unit = wchar.sizeof;
                        }
                        else if (type == Utf32)
                        {
                                ret = Utf.toString (cast(dchar[]) x, dst, ate);
                                unit = dchar.sizeof;
                        }
                }

                static if (is (T == wchar))
                {
                        if (type == Utf16)
                        {
                                if (ate)
                                        *ate = x.length;
                                ret = cast(wchar[]) x;
                        }
                        else if (type == Utf8)
                        {
                                ret = Utf.toString16 (cast(char[]) x, dst, ate);
                                unit = char.sizeof;
                        }
                        else if (type == Utf32)
                        {
                                ret = Utf.toString16 (cast(dchar[]) x, dst, ate);
                                unit = dchar.sizeof;
                        }
                }

                static if (is (T == dchar))
                {
                        if (type == Utf32)
                        {
                                if (ate)
                                        *ate = x.length;
                                ret = cast(dchar[]) x;
                        }
                        else if (type == Utf8)
                        {
                                ret = Utf.toString32 (cast(char[]) x, dst, ate);
                                unit = char.sizeof;
                        }
                        else if (type == Utf16)
                        {
                                ret = Utf.toString32 (cast(wchar[]) x, dst, ate);
                                unit = wchar.sizeof;
                        }
                }

                // express the consumed amount in bytes, so that it agrees
                // with the pass-through branches and with the signature
                // length which decode() adds afterwards
                if (ate && unit > 1)
                        *ate *= unit;

                return ret;
        }


        /***********************************************************************

                Convert from T into the given 'type'.

                Where 'ate' is provided, it will be set to the number of
                elements consumed from the input and the conversion operates
                in streaming-mode. That is: 'dst' should be supplied since
                it is not resized or allocated.

                Params:
                x    = internal content
                type = one of Utf8, Utf16, Utf32 selecting the output form
                dst  = optional output buffer, re-typed and forwarded to Utf
                ate  = optional; receives the consumed count

                Returns: the converted content. When 'type' already matches T
                this is 'x' itself, without copying. An unrecognised 'type'
                yields null and leaves 'ate' untouched.

        ***********************************************************************/

        static void[] from (T[] x, uint type, void[] dst=null, size_t* ate=null)
        {
                void[] ret;

                static if (is (T == char))
                {
                        if (type == Utf8)
                        {
                                if (ate)
                                        *ate = x.length;
                                ret = x;
                        }
                        else if (type == Utf16)
                                ret = Utf.toString16 (x, cast(wchar[]) dst, ate);
                        else if (type == Utf32)
                                ret = Utf.toString32 (x, cast(dchar[]) dst, ate);
                }

                static if (is (T == wchar))
                {
                        if (type == Utf16)
                        {
                                if (ate)
                                        *ate = x.length;
                                ret = x;
                        }
                        else if (type == Utf8)
                                ret = Utf.toString (x, cast(char[]) dst, ate);
                        else if (type == Utf32)
                                ret = Utf.toString32 (x, cast(dchar[]) dst, ate);
                }

                static if (is (T == dchar))
                {
                        if (type == Utf32)
                        {
                                if (ate)
                                        *ate = x.length;
                                ret = x;
                        }
                        else if (type == Utf8)
                                ret = Utf.toString (x, cast(char[]) dst, ate);
                        else if (type == Utf16)
                                ret = Utf.toString16 (x, cast(wchar[]) dst, ate);
                }

                return ret;
        }
}



/*******************************************************************************

        Handle byte-order-mark prefixes

*******************************************************************************/

class BomSniffer
{
        private bool            found;          // was an encoding discovered?
        private Encoding        encoder;        // the current encoding
        private const(Info)*    settings;       // pointer to encoding configuration

        private struct Info
        {
                int             type;           // type of element (char/wchar/dchar)
                Encoding        encoding;       // Encoding.xx encoding
                const(char)[]   bom;            // pattern to match for signature
                bool            test,           // should we test for this encoding?
                                endian,         // this encoding have endian concerns?
                                bigEndian;      // is this a big-endian encoding?
                Encoding        fallback;       // can this encoding be defaulted?
        }

        private enum {Utf8, Utf16, Utf32}

        // One row per Encoding member, in declaration order: setup() and
        // decode() index this table directly with the Encoding value.
        // NOTE(review): UTF_8N carries test=true while UTF_8 owns the
        // signature with test=false, which looks inverted relative to the
        // explicit/non-explicit grouping in the module documentation ~
        // left untouched since decode() depends on the current values
        private __gshared const Info[] lookup =
        [
        {Utf8,  Encoding.Unknown,  null,        true,  false, false, Encoding.UTF_8},
        {Utf8,  Encoding.UTF_8N,   null,        true,  false, false, Encoding.UTF_8},
        {Utf8,  Encoding.UTF_8,    x"efbbbf",   false},
        {Utf16, Encoding.UTF_16,   null,        true,  false, false, Encoding.UTF_16BE},
        {Utf16, Encoding.UTF_16BE, x"feff",     false, true, true},
        {Utf16, Encoding.UTF_16LE, x"fffe",     false, true},
        {Utf32, Encoding.UTF_32,   null,        true,  false, false, Encoding.UTF_32BE},
        {Utf32, Encoding.UTF_32BE, x"0000feff", false, true, true},
        {Utf32, Encoding.UTF_32LE, x"fffe0000", false, true},
        ];

        /***********************************************************************

                Return the current encoding. This is either the originally
                specified encoding, or a derived one obtained by inspecting
                the content for a BOM. The latter is performed as part of
                the decode() method

        ***********************************************************************/

        @property final Encoding encoding ()
        {
                return encoder;
        }

        /***********************************************************************

                Was an encoding located in the text (configured via setup)

        ***********************************************************************/

        @property final bool encoded ()
        {
                return found;
        }

        /***********************************************************************

                Return the signature (BOM) of the current encoding

        ***********************************************************************/

        @property final const(void)[] signature ()
        {
                return settings.bom;
        }

        /***********************************************************************

                Configure this instance with unicode converters

                Params:
                encoding = the encoding to adopt; selects the lookup row
                found    = whether the encoding was discovered in content

        ***********************************************************************/

        final void setup (Encoding encoding, bool found = false)
        {
                this.settings = &lookup[encoding];
                this.encoder = encoding;
                this.found = found;
        }

        /***********************************************************************

                Scan the BOM signatures looking for a match. We scan in
                reverse order to get the longest match first

                Returns: a pointer to the matching lookup row, or null when
                the content does not begin with any known signature

        ***********************************************************************/

        static final const(Info)* test (void[] content)
        {
                // walk the table from last row to first without ever forming
                // a pointer that lies before the start of the array
                for (size_t i = lookup.length; i-- > 0;)
                {
                        const(Info)* info = &lookup[i];

                        // rows without a signature cannot be matched
                        size_t len = info.bom.length;
                        if (len == 0 || len > content.length)
                                continue;

                        // compare as raw bytes
                        if (cast(const(ubyte)[]) content[0 .. len] == cast(const(ubyte)[]) info.bom)
                                return info;
                }
                return null;
        }
}

/*******************************************************************************

*******************************************************************************/

debug (UnitTest)
{
        unittest
        {
                void[] INPUT2 = "abc\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86".dup;
                void[] INPUT = x"efbbbf" ~ INPUT2;
                auto bom = new UnicodeBom!(char)(Encoding.Unknown);
                size_t ate;
                char[256] buf;

                // signature present: it is stripped, yet counted in 'ate'
                auto temp = bom.decode (INPUT, buf, &ate);
                assert (ate == INPUT.length);
                assert (bom.encoding == Encoding.UTF_8);

                // no signature: the previously derived encoding is retained
                temp = bom.decode (INPUT2, buf, &ate);
                assert (ate == INPUT2.length);
                assert (bom.encoding == Encoding.UTF_8);
        }
}

debug (UnicodeBom)
{
        import tango.io.Stdout;

        void main()
        {
                void[] INPUT2 = "abc\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86".dup;
                void[] INPUT = x"efbbbf" ~ INPUT2;
                auto bom = new UnicodeBom!(char)(Encoding.Unknown);
                size_t ate;
                char[256] buf;

                auto temp = bom.decode (INPUT, buf, &ate);
                assert (temp == INPUT2);
                assert (ate == INPUT.length);
                assert (bom.encoding == Encoding.UTF_8);

                temp = bom.decode (INPUT2, buf, &ate);
                assert (temp == INPUT2);
                assert (ate == INPUT2.length);
                assert (bom.encoding == Encoding.UTF_8);
        }
}