• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.apache.velocity.io;
2 
3 /*
4  * Licensed to the Apache Software Foundation (ASF) under one
5  * or more contributor license agreements.  See the NOTICE file
6  * distributed with this work for additional information
7  * regarding copyright ownership.  The ASF licenses this file
8  * to you under the Apache License, Version 2.0 (the
9  * "License"); you may not use this file except in compliance
10  * with the License.  You may obtain a copy of the License at
11  *
12  *   http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing,
15  * software distributed under the License is distributed on an
16  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17  * KIND, either express or implied.  See the License for the
18  * specific language governing permissions and limitations
19  * under the License.
20  */
21 
22 
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.PushbackInputStream;
26 
27 import java.util.Locale;
28 
/**
 * This is an input stream that is unicode BOM aware. This allows you to e.g. read
 * Windows Notepad Unicode files as Velocity templates.
 *
 * It allows you to check the actual encoding of a file by calling {@link #getEncodingFromStream()} on
 * the input stream reader.
 *
 * The BOM is inspected once, in the constructor. At most {@code MAX_BOM_SIZE} bytes are read from the
 * wrapped stream for that purpose; every byte that is not part of a skipped BOM is pushed back and
 * is returned again by the regular {@code read} methods.
 *
 * This class is not thread safe! When more than one thread wants to use an instance of UnicodeInputStream,
 * the caller must provide synchronization.
 *
 * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a>
 * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
 * @version $Id$
 * @since 1.5
 */
public class UnicodeInputStream
    extends InputStream
{

    /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte [] { (byte)0xef, (byte)0xbb, (byte)0xbf });

    /** BOM Marker for UTF 16, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte [] { (byte)0xff, (byte)0xfe });

    /** BOM Marker for UTF 16, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte [] { (byte)0xfe, (byte)0xff });

    /** BOM Marker for UTF 32, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte [] { (byte)0xff, (byte)0xfe, (byte)0x00, (byte)0x00 });

    /** BOM Marker for UTF 32, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte [] { (byte)0x00, (byte)0x00, (byte)0xfe, (byte)0xff });

    /** The maximum amount of bytes to read for a BOM */
    private static final int MAX_BOM_SIZE = 4;

    /** Buffer for BOM reading */
    private final byte [] buf = new byte[MAX_BOM_SIZE];

    /** Buffer pointer: the number of bytes read into {@link #buf} so far. */
    private int pos = 0;

    /** The stream encoding as read from the BOM or null. */
    private final String encoding;

    /** True if the BOM itself should be skipped and not read. */
    private final boolean skipBOM;

    /** The wrapped stream; its pushback buffer is large enough to hold every byte read during BOM detection. */
    private final PushbackInputStream inputStream;

    /**
     * Creates a new UnicodeInputStream object. Skips a BOM which defines the file encoding.
     *
     * @param  inputStream The input stream to use for reading.
     * @throws IllegalStateException When the BOM could not be read from the stream.
     * @throws IOException Declared for source compatibility; read errors during BOM detection are
     *         reported as {@link IllegalStateException}.
     */
    public UnicodeInputStream(final InputStream inputStream)
            throws IllegalStateException, IOException
    {
        this(inputStream, true);
    }

    /**
     * Creates a new UnicodeInputStream object.
     *
     * The BOM detection runs as part of construction and calls the overridable {@link #readEncoding()};
     * a subclass overriding that method is invoked before its own fields are initialized.
     *
     * @param  inputStream The input stream to use for reading.
     * @param skipBOM If this is set to true, a BOM read from the stream is discarded. This parameter should normally be true.
     * @throws IllegalStateException When the BOM could not be read from the stream.
     * @throws IOException Declared for source compatibility; read errors during BOM detection are
     *         reported as {@link IllegalStateException}.
     */
    public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
            throws IllegalStateException, IOException
    {
        super();

        this.skipBOM = skipBOM;
        this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

        try
        {
            this.encoding = readEncoding();
        }
        catch (IOException ioe)
        {
            throw new IllegalStateException("Could not read BOM from Stream", ioe);
        }
    }

    /**
     * Returns true if the input stream discards the BOM.
     *
     * @return  True if the input stream discards the BOM.
     */
    public boolean isSkipBOM()
    {
        return skipBOM;
    }

    /**
     * Returns the encoding that was detected from the BOM when this stream was created.
     * No I/O is performed by this method.
     *
     * @return  The encoding based on the BOM, or null if the stream did not start with a known BOM.
     */
    public String getEncodingFromStream()
    {
        return encoding;
    }

    /**
     * This method gets the encoding from the stream contents if a BOM exists. If no BOM exists, the encoding
     * is undefined.
     *
     * Bytes read while probing are pushed back onto the stream afterwards, except for the BOM itself
     * when BOM skipping is enabled.
     *
     * @return The encoding of this streams contents as decided by the BOM or null if no BOM was found.
     * @throws IOException When reading from or pushing back to the underlying stream fails.
     */
    protected String readEncoding()
        throws IOException
    {
        pos = 0;

        UnicodeBOM detected = null;

        // read first byte.
        if (readByte())
        {
            // The first byte selects the candidate BOM(s):
            //
            // 00 00 FE FF --> UTF 32 BE
            // EF BB BF    --> UTF 8
            // FE FF       --> UTF 16 BE
            // FF FE       --> UTF 16 LE
            // FF FE 00 00 --> UTF 32 LE

            switch (buf[0])
            {
            case (byte)0x00: // UTF32 BE
                detected = match(UTF32BE_BOM, null);
                break;
            case (byte)0xef: // UTF8
                detected = match(UTF8_BOM, null);
                break;
            case (byte)0xfe: // UTF16 BE
                detected = match(UTF16BE_BOM, null);
                break;
            case (byte)0xff: // UTF16/32 LE
                detected = match(UTF16LE_BOM, null);

                if (detected != null)
                {
                    // The UTF-32LE BOM starts with the UTF-16LE BOM; try the longer
                    // one and fall back to UTF-16LE when it does not match.
                    detected = match(UTF32LE_BOM, detected);
                }
                break;

            default:
                detected = null;
                break;
            }
        }

        pushback(detected);

        return (detected != null) ? detected.getEncoding() : null;
    }

    /**
     * Compares the bytes buffered so far (reading more from the stream as needed) with a BOM.
     *
     * @param matchEncoding The BOM to compare the start of the stream with.
     * @param noMatchEncoding The value to return when the stream does not start with that BOM.
     * @return matchEncoding if all of its bytes match, noMatchEncoding on a mismatch or premature end of stream.
     * @throws IOException When reading from the underlying stream fails.
     */
    private UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
        throws IOException
    {
        // Read-only access to the internal array; getBytes() would allocate a copy per call.
        final byte [] bom = matchEncoding.bytes;

        for (int i = 0; i < bom.length; i++)
        {
            if (pos <= i) // Byte has not yet been read
            {
                if (!readByte())
                {
                    return noMatchEncoding;
                }
            }

            if (bom[i] != buf[i])
            {
                return noMatchEncoding;
            }
        }

        return matchEncoding;
    }

    /**
     * Reads one byte from the underlying stream into the BOM buffer.
     *
     * @return true if a byte was stored, false if the end of the stream was reached.
     * @throws IOException When reading fails or the BOM buffer is already full.
     */
    private boolean readByte()
            throws IOException
    {
        int res = inputStream.read();
        if (res == -1)
        {
            return false;
        }

        if (pos >= buf.length)
        {
            throw new IOException("BOM read error");
        }

        buf[pos++] = (byte) res;
        return true;
    }

    /**
     * Pushes the bytes read during BOM detection back onto the stream.
     *
     * @param matchBOM The BOM that was detected, or null if none was found.
     * @throws IOException When the bytes can not be pushed back.
     */
    private void pushback(final UnicodeBOM matchBOM)
        throws IOException
    {
        int count = pos; // By default, all bytes are pushed back.
        int start = 0;

        if (matchBOM != null && skipBOM)
        {
            // We have a match (some bytes are part of the BOM)
            // and we want to skip the BOM. Push back only the bytes
            // after the BOM.
            start = matchBOM.bytes.length;
            count = (pos - start);

            if (count < 0)
            {
                throw new IllegalStateException("Match has more bytes than available!");
            }
        }

        inputStream.unread(buf, start, count);
    }

    /**
     * Closes the underlying stream.
     *
     * @throws IOException When closing the underlying stream fails.
     * @see java.io.InputStream#close()
     */
    @Override
    public void close()
        throws IOException
    {
        inputStream.close();
    }

    /**
     * @return the value reported by the underlying pushback stream.
     * @throws IOException When the underlying stream fails.
     * @see java.io.InputStream#available()
     */
    @Override
    public int available()
        throws IOException
    {
        return inputStream.available();
    }

    /**
     * Delegates to the underlying pushback stream.
     *
     * @param readlimit passed through to the underlying stream.
     * @see java.io.InputStream#mark(int)
     */
    @Override
    public void mark(final int readlimit)
    {
        inputStream.mark(readlimit);
    }

    /**
     * @return mark supported, as reported by the underlying pushback stream.
     * @see java.io.InputStream#markSupported()
     */
    @Override
    public boolean markSupported()
    {
        return inputStream.markSupported();
    }

    /**
     * @return the next byte of data, or -1 at the end of the stream.
     * @throws IOException When the underlying stream fails.
     * @see java.io.InputStream#read()
     */
    @Override
    public int read()
        throws IOException
    {
        return inputStream.read();
    }

    /**
     * @param b buffer
     * @return the number of bytes read, or -1 at the end of the stream.
     * @throws IOException When the underlying stream fails.
     * @see java.io.InputStream#read(byte[])
     */
    @Override
    public int read(final byte [] b)
        throws IOException
    {
        return inputStream.read(b);
    }

    /**
     * @param b buffer
     * @param off offset
     * @param len length
     * @return the number of bytes read, or -1 at the end of the stream.
     * @throws IOException When the underlying stream fails.
     * @see java.io.InputStream#read(byte[], int, int)
     */
    @Override
    public int read(final byte [] b, final int off, final int len)
        throws IOException
    {
        return inputStream.read(b, off, len);
    }

    /**
     * Delegates to the underlying pushback stream.
     *
     * @throws IOException As thrown by the underlying stream.
     * @see java.io.InputStream#reset()
     */
    @Override
    public void reset()
        throws IOException
    {
        inputStream.reset();
    }

    /**
     * @param n the number of bytes to skip.
     * @return skipped count
     * @throws IOException When the underlying stream fails.
     * @see java.io.InputStream#skip(long)
     */
    @Override
    public long skip(final long n)
        throws IOException
    {
        return inputStream.skip(n);
    }


    /**
     * Helper function to compare encodings. The comparison ignores case as well as
     * any '-' and '_' characters, so "utf-8", "UTF_8" and "UTF8" are all the same encoding.
     *
     * A null name (e.g. the result of {@link #getEncodingFromStream()} when no BOM was found)
     * is only the same as another null name.
     *
     * @param left first encoding name, may be null
     * @param right second encoding name, may be null
     * @return true for same encoding
     */
    public static boolean sameEncoding(String left, String right)
    {
        if (left == null || right == null)
        {
            return left == right;
        }
        return normalizeEncodingName(left).equals(normalizeEncodingName(right));
    }

    /**
     * Upper-cases an encoding name (locale independent) and strips '-' and '_'.
     */
    private static String normalizeEncodingName(final String name)
    {
        return name.toUpperCase(Locale.ROOT).replace("-", "").replace("_", "");
    }

    /**
     * Helper class to bundle encoding and BOM marker.
     *
     * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
     * @version $Id$
     */
    static final class UnicodeBOM
    {
        private final String encoding;

        private final byte [] bytes;

        private UnicodeBOM(final String encoding, final byte [] bytes)
        {
            this.encoding = encoding;
            // Private copy: the BOM constants are shared, so they must not alias a caller's array.
            this.bytes = bytes.clone();
        }

        /**
         * @return the name of the encoding announced by this BOM.
         */
        String getEncoding()
        {
            return encoding;
        }

        /**
         * @return a copy of the BOM bytes; modifying it does not affect this BOM.
         */
        byte [] getBytes()
        {
            return bytes.clone();
        }
    }
}
412