/*******************************************************************************

        copyright:      Copyright (c) 2005 Kris Bell. All rights reserved

        license:        BSD style: $(LICENSE)

        version:        Initial release: December 2005

        author:         Kris

*******************************************************************************/

module tango.io.UnicodeFile;

private import tango.io.device.File;

public  import tango.text.convert.UnicodeBom;

/*******************************************************************************

        Read and write Unicode files

        A Unicode file is treated here as an external encoding of text.
        This module bridges that external encoding and an internal one
        chosen by the programmer. The internal encoding is selected via
        the template argument T; the external encoding is either given
        explicitly or derived from the file content.

        T may be char, wchar, or dchar, and every method works on arrays
        of that type: read() returns such an array, whilst write() and
        append() accept one.

        Supported external encodings are as follows:

        $(UL
        $(LI Encoding.Unknown)
        $(LI Encoding.UTF_8)
        $(LI Encoding.UTF_8N)
        $(LI Encoding.UTF_16)
        $(LI Encoding.UTF_16BE)
        $(LI Encoding.UTF_16LE)
        $(LI Encoding.UTF_32)
        $(LI Encoding.UTF_32BE)
        $(LI Encoding.UTF_32LE))

        These fall into two groups: implicit and explicit. The implicit
        subset is:

        $(UL
        $(LI Encoding.Unknown)
        $(LI Encoding.UTF_8)
        $(LI Encoding.UTF_16)
        $(LI Encoding.UTF_32))

        An implicit encoding can be used to 'discover' the real encoding
        of a file by examining its first few bytes for a signature (a
        byte-order mark). The signature is optional for every file, but
        is often written so that the content is self-describing. When
        the encoding is not known in advance, choosing one of the
        implicit encodings makes read() look for a signature and adjust
        itself accordingly. Note that a ZWNBSP character at the start of
        the content might be mistaken for the signature; current files
        are supposed to use the WORD-JOINER character instead.

        The explicit subset is:

        $(UL
        $(LI Encoding.UTF_8N)
        $(LI Encoding.UTF_16BE)
        $(LI Encoding.UTF_16LE)
        $(LI Encoding.UTF_32BE)
        $(LI Encoding.UTF_32LE))

        Explicit encodings are for use when the file encoding is known.
        They *must* be used when writing or appending, because written
        content has to be in a known format. During a read operation,
        the presence of a signature conflicts with an explicit encoding.

        read() returns the current content of the file. write() sets
        the file content, and file length, to the provided array.
        append() adds content to the tail of the file; when appending,
        it is your responsibility to ensure the existing and current
        encodings are correctly matched.

        The file is identified by a plain path string. This class
        declares no base class and offers only the read, write and
        append operations described above; it does not itself provide
        facilities for inspecting the file system.

        See these links for more info:
        $(UL
        $(LI $(LINK http://www.utf-8.com/))
        $(LI $(LINK http://www.hackcraft.net/xmlUnicode/))
        $(LI $(LINK http://www.unicode.org/faq/utf_bom.html/))
        $(LI $(LINK http://www.azillionmonkeys.com/qed/unicode.html/))
        $(LI $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)))

*******************************************************************************/

class UnicodeFile(T)
{
        private UnicodeBom!(T)  bom_;
        private const(char)[]   path_;

        /***********************************************************************

                Construct a UnicodeFile for the file at the given path.

                Params:
                path     = location of the file, retained as given
                encoding = the external file encoding; should be one
                           of the Encoding values

        ***********************************************************************/

        this (const(char)[] path, Encoding encoding)
        {
                path_ = path;
                bom_  = new UnicodeBom!(T)(encoding);
        }

        /***********************************************************************

                Call-site shortcut for creating a UnicodeFile instance.
                This permits the same syntax as struct usage, so may
                expose a migration path.

                Returns: a newly allocated UnicodeFile for name and
                         encoding

        ***********************************************************************/

        static UnicodeFile opCall (const(char)[] name, Encoding encoding)
        {
                auto instance = new UnicodeFile (name, encoding);
                return instance;
        }

        /***********************************************************************

                Return the associated file path, as an immutable copy
                of the path given at construction.

        ***********************************************************************/

        override immutable(char)[] toString ()
        {
                immutable(char)[] copy = path_.idup;
                return copy;
        }

        /***********************************************************************

                Return the current encoding, as reported by the
                associated bom instance. This is either the encoding
                originally specified, or a derived one obtained by
                inspecting the file content for a bom. The latter is
                performed as part of the read() method.

        ***********************************************************************/

        Encoding encoding ()
        {
                return bom_.encoding;
        }

        /***********************************************************************

                Return the associated bom instance. Use this to find
                more information about the encoding status.

        ***********************************************************************/

        UnicodeBom!(T) bom ()
        {
                return bom_;
        }

        /***********************************************************************

                Return the content of the file, decoded to an array of
                T by the associated bom instance.

                The content is inspected for a bom signature, which is
                stripped. An exception is thrown if a signature is
                present when, according to the encoding type, it should
                not be. Conversely, an exception is thrown if there is
                no known signature where the current encoding expects
                one to be present.

        ***********************************************************************/

        final T[] read ()
        {
                // load the raw file content, then hand it to the decoder
                auto raw = File.get (path_);
                return bom_.decode (raw);
        }

        /***********************************************************************

                Set the file content and length to reflect the given
                array. The content will be encoded accordingly.

                Params:
                content  = the text to store; a copy is encoded, so the
                           caller's array is passed to the encoder only
                           as a duplicate
                writeBom = when true, the signature reported by the bom
                           instance is written ahead of the content

        ***********************************************************************/

        final void write (const(T)[] content, bool writeBom)
        {
                // encode first (may throw an exception), so that the
                // file is not opened when the conversion fails
                void[] encoded = bom_.encode (content.dup);

                scope output = new File (path_, File.ReadWriteCreate);
                scope (exit)
                       output.close();

                // optional signature goes first
                if (writeBom)
                    output.write (bom_.signature);

                // followed by the encoded content
                output.write (encoded);
        }

        /***********************************************************************

                Append content to the file; the content will be encoded
                accordingly. No signature is written by this method.

                Note that it is your responsibility to ensure the
                existing and current encodings are correctly matched.

        ***********************************************************************/

        final void append (const(T)[] content)
        {
                // encode a copy of the content (may throw an exception)
                // before anything is added to the file
                auto encoded = bom_.encode (content.dup);
                File.append (path_, encoded);
        }
}


/*******************************************************************************

        Debug driver: reads "UnicodeFile.d" using Encoding.UTF_8 and
        prints the decoded content to Stdout.

*******************************************************************************/

debug (UnicodeFile)
{
        import tango.io.Stdout;

        void main()
        {
                auto source = UnicodeFile!(char)("UnicodeFile.d", Encoding.UTF_8);
                auto text = source.read();
                Stdout (text).newline;
        }
}