1 /*******************************************************************************
2 
3         copyright:      Copyright (c) 2005 Kris Bell. All rights reserved
4 
5         license:        BSD style: $(LICENSE)
6 
7         version:        Initial release: December 2005
8 
9         author:         Kris
10 
11 *******************************************************************************/
12 
13 module tango.io.UnicodeFile;
14 
15 private import tango.io.device.File;
16 
17 public  import tango.text.convert.UnicodeBom;
18 
19 /*******************************************************************************
20 
21         Read and write Unicode files
22 
23         For our purposes, Unicode files are an encoding of textual material.
24         The goal of this module is to interface that external-encoding with
25         a programmer-defined internal-encoding. This internal encoding is
26         declared via the template argument T, whilst the external encoding
27         is either specified or derived.
28 
29         Three internal encodings are supported: char, wchar, and dchar. The
30         methods herein operate upon arrays of this type. For example, read()
31         returns an array of the type, whilst write() and append() expect an
32         array of said type.
33 
34         Supported external encodings are as follows:
35 
36         $(UL
37           $(LI Encoding.Unknown)
38           $(LI Encoding.UTF_8)
39           $(LI Encoding.UTF_8N)
40           $(LI Encoding.UTF_16)
41           $(LI Encoding.UTF_16BE)
42           $(LI Encoding.UTF_16LE)
43           $(LI Encoding.UTF_32)
44           $(LI Encoding.UTF_32BE)
45           $(LI Encoding.UTF_32LE))
46 
47         These can be divided into implicit and explicit encodings. Here is
48         the implicit subset:
49 
50         $(UL
51           $(LI Encoding.Unknown)
52           $(LI Encoding.UTF_8)
53           $(LI Encoding.UTF_16)
54           $(LI Encoding.UTF_32))
55 
56         Implicit encodings may be used to 'discover'
57         an unknown encoding, by examining the first few bytes of the file
58         content for a signature. This signature is optional for all files,
59         but is often written such that the content is self-describing. When
60         the encoding is unknown, using one of the non-explicit encodings will
61         cause the read() method to look for a signature and adjust itself
62         accordingly. It is possible that a ZWNBSP character might be confused
63         with the signature; today's files are supposed to use the WORD-JOINER
64         character instead.
65 
66         Explicit encodings are as follows:
67 
68         $(UL
69           $(LI Encoding.UTF_8N)
70           $(LI Encoding.UTF_16BE)
71           $(LI Encoding.UTF_16LE)
72           $(LI Encoding.UTF_32BE)
73           $(LI Encoding.UTF_32LE))
74 
        This group of encodings is for use when the file encoding is
76         known. These *must* be used when writing or appending, since written
77         content must be in a known format. It should be noted that, during a
78         read operation, the presence of a signature is in conflict with these
79         explicit varieties.
80 
81         Method read() returns the current content of the file, whilst write()
82         sets the file content, and file length, to the provided array. Method
83         append() adds content to the tail of the file. When appending, it is
84         your responsibility to ensure the existing and current encodings are
85         correctly matched.
86 
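        Note that this class has no superclass: it holds only the path of
        the file. Methods to inspect the file system, check the status of
        a file or directory, and other such facilities are not exposed
        here, and must be obtained elsewhere (for example, via FilePath).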
90 
91         See these links for more info:
92       $(UL
93         $(LI $(LINK http://www.utf-8.com/))
94         $(LI $(LINK http://www.hackcraft.net/xmlUnicode/))
        $(LI $(LINK http://www.unicode.org/faq/utf_bom.html))
        $(LI $(LINK http://www.azillionmonkeys.com/qed/unicode.html))
97         $(LI $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)))
98 
99 *******************************************************************************/
100 
class UnicodeFile(T)
{
        // converter between the internal encoding T and the external one
        private UnicodeBom!(T)  bom_;
        // path of the file this instance reads from and writes to
        private const(char)[]   path_;

        /***********************************************************************

                Construct a UnicodeFile for the file at the given path.

                Params:
                path     = path of the file to operate upon; the slice is
                           retained as-is (it is not copied)
                encoding = the external file encoding, which should be
                           one of the Encoding.* types

        ***********************************************************************/

        this (const(char)[] path, Encoding encoding)
        {
                path_ = path;
                bom_ = new UnicodeBom!(T)(encoding);
        }

        /***********************************************************************

                Factory shortcut, permitting struct-like construction
                syntax at the call site: UnicodeFile!(T)(name, encoding).
                Simply forwards both arguments to the constructor.

        ***********************************************************************/

        static UnicodeFile opCall (const(char)[] name, Encoding encoding)
        {
                auto instance = new UnicodeFile (name, encoding);
                return instance;
        }

        /***********************************************************************

                Return an immutable copy of the associated file path.

        ***********************************************************************/

        override immutable(char)[] toString ()
        {
                auto copy = path_.idup;
                return copy;
        }

        /***********************************************************************

                Return the encoding currently held by the bom instance.
                This is either the encoding given at construction, or one
                derived from a bom found in the file content; the latter
                is determined as part of the read() method.

        ***********************************************************************/

        Encoding encoding ()
        {
                auto current = bom_.encoding;
                return current;
        }

        /***********************************************************************

                Return the bom instance used for encoding and decoding,
                through which further detail about the encoding status
                can be obtained.

        ***********************************************************************/

        UnicodeBom!(T) bom ()
        {
                return bom_;
        }

        /***********************************************************************

                Load the entire file and return it decoded as an array
                of T.

                Decoding is delegated to the bom instance, which inspects
                the content for a bom signature and strips it. An
                exception is expected where a signature is present though
                the encoding type says it should not be, and likewise
                where no known signature is found though the current
                encoding expects one.

        ***********************************************************************/

        final T[] read ()
        {
                return bom_.decode (File.get (path_));
        }

        /***********************************************************************

                Set the file content and length to reflect the given
                array, encoded into the external representation.

                Params:
                content  = the text to store; a copy is encoded, so the
                           provided array itself is not handed to the
                           encoder
                writeBom = when true, the bom signature is written ahead
                           of the encoded content

        ***********************************************************************/

        final void write (const(T)[] content, bool writeBom)
        {
                // encode before touching the file, such that a conversion
                // exception is raised while the file remains unopened
                void[] encoded = bom_.encode (content.dup);

                scope sink = new File (path_, File.ReadWriteCreate);
                try {
                    // optional signature first, followed by the payload
                    if (writeBom)
                        sink.write (bom_.signature);

                    sink.write (encoded);
                    }
                finally
                    {
                    // close the file on every exit path
                    sink.close();
                    }
        }

        /***********************************************************************

                Append content to the tail of the file, encoded into the
                external representation. No bom signature is written.

                Note that it is your responsibility to ensure the
                existing and current encodings are correctly matched.

        ***********************************************************************/

        final void append (const(T)[] content)
        {
                // convert a copy to the external form (may throw an exception)
                auto encoded = bom_.encode (content.dup);
                File.append (path_, encoded);
        }
}
227 
228 
229 /*******************************************************************************
230 
231 *******************************************************************************/
232 
debug (UnicodeFile)
{
        import tango.io.Stdout;

        /***********************************************************************

                Debug-only entry point: reads the file "UnicodeFile.d" via
                a UnicodeFile!(char) declared with Encoding.UTF_8, and
                emits the decoded text to Stdout followed by a newline.

        ***********************************************************************/

        void main()
        {
                auto source = UnicodeFile!(char)("UnicodeFile.d", Encoding.UTF_8);
                auto text = source.read();
                Stdout (text).newline;
        }
}