1 package org.apache.velocity.io; 2 3 /* 4 * Licensed to the Apache Software Foundation (ASF) under one 5 * or more contributor license agreements. See the NOTICE file 6 * distributed with this work for additional information 7 * regarding copyright ownership. The ASF licenses this file 8 * to you under the Apache License, Version 2.0 (the 9 * "License"); you may not use this file except in compliance 10 * with the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, 15 * software distributed under the License is distributed on an 16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 17 * KIND, either express or implied. See the License for the 18 * specific language governing permissions and limitations 19 * under the License. 20 */ 21 22 23 import java.io.IOException; 24 import java.io.InputStream; 25 import java.io.PushbackInputStream; 26 27 import java.util.Locale; 28 29 /** 30 * This is an input stream that is unicode BOM aware. This allows you to e.g. read 31 * Windows Notepad Unicode files as Velocity templates. 32 * 33 * It allows you to check the actual encoding of a file by calling {@link #getEncodingFromStream()} on 34 * the input stream reader. 35 * 36 * This class is not thread safe! When more than one thread wants to use an instance of UnicodeInputStream, 37 * the caller must provide synchronization. 38 * 39 * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a> 40 * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a> 41 * @version $Id$ 42 * @since 1.5 43 */ 44 public class UnicodeInputStream 45 extends InputStream 46 { 47 48 /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */ 49 public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte [] { (byte)0xef, (byte)0xbb, (byte)0xbf }); 50 51 /** BOM Marker for UTF 16, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html */ 52 public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte [] { (byte)0xff, (byte)0xfe }); 53 54 /** BOM Marker for UTF 16, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html */ 55 public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte [] { (byte)0xfe, (byte)0xff }); 56 57 /** 58 * BOM Marker for UTF 32, little endian. See http://www.unicode.org/unicode/faq/utf_bom.html 59 * 60 */ 61 public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte [] { (byte)0xff, (byte)0xfe, (byte)0x00, (byte)0x00 }); 62 63 /** 64 * BOM Marker for UTF 32, big endian. See http://www.unicode.org/unicode/faq/utf_bom.html 65 * 66 */ 67 public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte [] { (byte)0x00, (byte)0x00, (byte)0xfe, (byte)0xff }); 68 69 /** The maximum amount of bytes to read for a BOM */ 70 private static final int MAX_BOM_SIZE = 4; 71 72 /** Buffer for BOM reading */ 73 private byte [] buf = new byte[MAX_BOM_SIZE]; 74 75 /** Buffer pointer. */ 76 private int pos = 0; 77 78 /** The stream encoding as read from the BOM or null. */ 79 private final String encoding; 80 81 /** True if the BOM itself should be skipped and not read. */ 82 private final boolean skipBOM; 83 84 private final PushbackInputStream inputStream; 85 86 /** 87 * Creates a new UnicodeInputStream object. Skips a BOM which defines the file encoding. 88 * 89 * @param inputStream The input stream to use for reading. 90 * @throws IllegalStateException 91 * @throws IOException 92 */ UnicodeInputStream(final InputStream inputStream)93 public UnicodeInputStream(final InputStream inputStream) 94 throws IllegalStateException, IOException 95 { 96 this(inputStream, true); 97 } 98 99 /** 100 * Creates a new UnicodeInputStream object. 101 * 102 * @param inputStream The input stream to use for reading. 103 * @param skipBOM If this is set to true, a BOM read from the stream is discarded. This parameter should normally be true. 104 * @throws IllegalStateException 105 * @throws IOException 106 */ UnicodeInputStream(final InputStream inputStream, boolean skipBOM)107 public UnicodeInputStream(final InputStream inputStream, boolean skipBOM) 108 throws IllegalStateException, IOException 109 { 110 super(); 111 112 this.skipBOM = skipBOM; 113 this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE); 114 115 try 116 { 117 this.encoding = readEncoding(); 118 } 119 catch (IOException ioe) 120 { 121 throw new IllegalStateException("Could not read BOM from Stream", ioe); 122 } 123 } 124 125 /** 126 * Returns true if the input stream discards the BOM. 127 * 128 * @return True if the input stream discards the BOM. 129 */ isSkipBOM()130 public boolean isSkipBOM() 131 { 132 return skipBOM; 133 } 134 135 /** 136 * Read encoding based on BOM. 137 * 138 * @return The encoding based on the BOM. 139 * 140 * @throws IllegalStateException When a problem reading the BOM occured. 141 */ getEncodingFromStream()142 public String getEncodingFromStream() 143 { 144 return encoding; 145 } 146 147 /** 148 * This method gets the encoding from the stream contents if a BOM exists. If no BOM exists, the encoding 149 * is undefined. 150 * 151 * @return The encoding of this streams contents as decided by the BOM or null if no BOM was found. 152 * @throws IOException 153 */ readEncoding()154 protected String readEncoding() 155 throws IOException 156 { 157 pos = 0; 158 159 UnicodeBOM encoding = null; 160 161 // read first byte. 162 if (readByte()) 163 { 164 // Build a list of matches 165 // 166 // 00 00 FE FF --> UTF 32 BE 167 // EF BB BF --> UTF 8 168 // FE FF --> UTF 16 BE 169 // FF FE --> UTF 16 LE 170 // FF FE 00 00 --> UTF 32 LE 171 172 switch (buf[0]) 173 { 174 case (byte)0x00: // UTF32 BE 175 encoding = match(UTF32BE_BOM, null); 176 break; 177 case (byte)0xef: // UTF8 178 encoding = match(UTF8_BOM, null); 179 break; 180 case (byte)0xfe: // UTF16 BE 181 encoding = match(UTF16BE_BOM, null); 182 break; 183 case (byte)0xff: // UTF16/32 LE 184 encoding = match(UTF16LE_BOM, null); 185 186 if (encoding != null) 187 { 188 encoding = match(UTF32LE_BOM, encoding); 189 } 190 break; 191 192 default: 193 encoding = null; 194 break; 195 } 196 } 197 198 pushback(encoding); 199 200 return (encoding != null) ? encoding.getEncoding() : null; 201 } 202 match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)203 private UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding) 204 throws IOException 205 { 206 byte [] bom = matchEncoding.getBytes(); 207 208 for (int i = 0; i < bom.length; i++) 209 { 210 if (pos <= i) // Byte has not yet been read 211 { 212 if (!readByte()) 213 { 214 return noMatchEncoding; 215 } 216 } 217 218 if (bom[i] != buf[i]) 219 { 220 return noMatchEncoding; 221 } 222 } 223 224 return matchEncoding; 225 } 226 readByte()227 private boolean readByte() 228 throws IOException 229 { 230 int res = inputStream.read(); 231 if (res == -1) 232 { 233 return false; 234 } 235 236 if (pos >= buf.length) 237 { 238 throw new IOException("BOM read error"); 239 } 240 241 buf[pos++] = (byte) res; 242 return true; 243 } 244 pushback(final UnicodeBOM matchBOM)245 private void pushback(final UnicodeBOM matchBOM) 246 throws IOException 247 { 248 int count = pos; // By default, all bytes are pushed back. 249 int start = 0; 250 251 if (matchBOM != null && skipBOM) 252 { 253 // We have a match (some bytes are part of the BOM) 254 // and we want to skip the BOM. Push back only the bytes 255 // after the BOM. 256 start = matchBOM.getBytes().length; 257 count = (pos - start); 258 259 if (count < 0) 260 { 261 throw new IllegalStateException("Match has more bytes than available!"); 262 } 263 } 264 265 inputStream.unread(buf, start, count); 266 } 267 268 /** 269 * @throws IOException 270 * @see java.io.InputStream#close() 271 */ 272 @Override close()273 public void close() 274 throws IOException 275 { 276 inputStream.close(); 277 } 278 279 /** 280 * @throws IOException 281 * @see java.io.InputStream#available() 282 */ 283 @Override available()284 public int available() 285 throws IOException 286 { 287 return inputStream.available(); 288 } 289 290 /** 291 * @param readlimit 292 * @see java.io.InputStream#mark(int) 293 */ 294 @Override mark(final int readlimit)295 public void mark(final int readlimit) 296 { 297 inputStream.mark(readlimit); 298 } 299 300 /** 301 * @return mark supported 302 * @see java.io.InputStream#markSupported() 303 */ 304 @Override markSupported()305 public boolean markSupported() 306 { 307 return inputStream.markSupported(); 308 } 309 310 /** 311 * @return read char 312 * @see java.io.InputStream#read() 313 */ 314 @Override read()315 public int read() 316 throws IOException 317 { 318 return inputStream.read(); 319 } 320 321 /** 322 * @param b buffer 323 * @return read chars count 324 * @see java.io.InputStream#read(byte[]) 325 */ 326 @Override read(final byte [] b)327 public int read(final byte [] b) 328 throws IOException 329 { 330 return inputStream.read(b); 331 } 332 333 /** 334 * @param b buffer 335 * @param off offset 336 * @param len length 337 * @return reac char 338 * @see java.io.InputStream#read(byte[], int, int) 339 */ 340 @Override read(final byte [] b, final int off, final int len)341 public int read(final byte [] b, final int off, final int len) 342 throws IOException 343 { 344 return inputStream.read(b, off, len); 345 } 346 347 /** 348 * @see java.io.InputStream#reset() 349 */ 350 @Override reset()351 public void reset() 352 throws IOException 353 { 354 inputStream.reset(); 355 } 356 357 /** 358 * @param n 359 * @return skipped count 360 * @see java.io.InputStream#skip(long) 361 */ 362 @Override skip(final long n)363 public long skip(final long n) 364 throws IOException 365 { 366 return inputStream.skip(n); 367 } 368 369 370 /** 371 * Helper function to compare encodings 372 * @param left 373 * @param right 374 * @return true for same encoding 375 */ sameEncoding(String left, String right)376 public static boolean sameEncoding(String left, String right) 377 { 378 left = left.toUpperCase(Locale.ROOT).replace("-", "").replace("_",""); 379 right = right.toUpperCase(Locale.ROOT).replace("-", "").replace("_",""); 380 return left.equals(right); 381 } 382 383 /** 384 * Helper class to bundle encoding and BOM marker. 385 * 386 * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a> 387 * @version $Id$ 388 */ 389 static final class UnicodeBOM 390 { 391 private final String encoding; 392 393 private final byte [] bytes; 394 UnicodeBOM(final String encoding, final byte [] bytes)395 private UnicodeBOM(final String encoding, final byte [] bytes) 396 { 397 this.encoding = encoding; 398 this.bytes = bytes; 399 } 400 getEncoding()401 String getEncoding() 402 { 403 return encoding; 404 } 405 getBytes()406 byte [] getBytes() 407 { 408 return bytes; 409 } 410 } 411 } 412