1 /** 2 * Copyright (c) 2008, http://www.snakeyaml.org 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package org.yaml.snakeyaml.reader; 17 18 /** 19 version: 1.1 / 2007-01-25 20 - changed BOM recognition ordering (longer boms first) 21 22 Original pseudocode : Thomas Weidenfeller 23 Implementation tweaked: Aki Nieminen 24 Implementation changed: Andrey Somov 25 * UTF-32 removed because it is not supported by YAML 26 * no default encoding 27 28 http://www.unicode.org/unicode/faq/utf_bom.html 29 BOMs: 30 00 00 FE FF = UTF-32, big-endian 31 FF FE 00 00 = UTF-32, little-endian 32 EF BB BF = UTF-8, 33 FE FF = UTF-16, big-endian 34 FF FE = UTF-16, little-endian 35 36 Win2k Notepad: 37 Unicode format = UTF-16LE 38 ***/ 39 40 import java.io.IOException; 41 import java.io.InputStream; 42 import java.io.InputStreamReader; 43 import java.io.PushbackInputStream; 44 import java.io.Reader; 45 import java.nio.charset.Charset; 46 import java.nio.charset.CharsetDecoder; 47 import java.nio.charset.CodingErrorAction; 48 49 /** 50 * Generic unicode textreader, which will use BOM mark to identify the encoding 51 * to be used. If BOM is not found then use a given default or system encoding. 
/**
 * Reader that inspects the first bytes of an {@link InputStream} for a
 * Unicode byte order mark (BOM) and decodes the rest of the stream with the
 * matching charset. The recognised BOMs are UTF-8 (EF BB BF), UTF-16BE
 * (FE FF) and UTF-16LE (FF FE); the BOM itself is never returned to the
 * caller. When no BOM is present the stream is decoded as UTF-8.
 */
public class UnicodeReader extends Reader {
    private static final Charset UTF8 = Charset.forName("UTF-8");
    private static final Charset UTF16BE = Charset.forName("UTF-16BE");
    private static final Charset UTF16LE = Charset.forName("UTF-16LE");

    /** Raw byte source; the pushback buffer holds the bytes sniffed for a BOM. */
    PushbackInputStream internalIn;

    /** Decoding reader; stays {@code null} until {@link #init()} has run. */
    InputStreamReader internalIn2 = null;

    /** Length in bytes of the longest recognised BOM (UTF-8). */
    private static final int BOM_SIZE = 3;

    /**
     * @param in
     *            InputStream to be read
     */
    public UnicodeReader(InputStream in) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
    }

    /**
     * Get the stream encoding, or {@code null} if the stream is
     * uninitialized. Call {@link #init()} or a read method to initialize it.
     *
     * @return the encoding name reported by the underlying
     *         {@link InputStreamReader}, or {@code null} before
     *         initialization
     */
    public String getEncoding() {
        // Guard against use before init(): the decoder does not exist yet.
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Read ahead up to {@value #BOM_SIZE} bytes and check them for a BOM.
     * Bytes that are not part of a BOM are pushed back onto the stream, so
     * only the BOM itself is skipped. Does nothing when the reader has
     * already been initialized.
     *
     * @throws IOException
     *             if reading from or pushing back to the stream fails
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];
        int n = fillBuffer(bom);

        Charset encoding;
        int bomLength;
        if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
                && bom[2] == (byte) 0xBF) {
            encoding = UTF8;
            bomLength = 3;
        } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
            encoding = UTF16BE;
            bomLength = 2;
        } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
            encoding = UTF16LE;
            bomLength = 2;
        } else {
            // Unicode BOM mark not found: keep every byte, decode as UTF-8
            encoding = UTF8;
            bomLength = 0;
        }

        // Give back everything that was read past the BOM.
        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }

        CharsetDecoder decoder = encoding.newDecoder().onUnmappableCharacter(
                CodingErrorAction.REPORT);
        internalIn2 = new InputStreamReader(internalIn, decoder);
    }

    /**
     * Fill {@code buffer} from the byte stream, repeating the read until the
     * buffer is full or the stream ends. A single {@code read} call may
     * legally deliver fewer bytes than requested, which would otherwise
     * split a BOM and prevent its detection.
     *
     * @return the number of bytes stored in {@code buffer} (0 at end of
     *         stream)
     */
    private int fillBuffer(byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int count = internalIn.read(buffer, total, buffer.length - total);
            if (count <= 0) {
                // -1 is end of stream; 0 would mean no progress, so stop
                // rather than spin.
                break;
            }
            total += count;
        }
        return total;
    }

    /**
     * Close the reader and the underlying stream. If nothing has been read
     * yet the stream is closed directly, without sniffing for a BOM first.
     *
     * @throws IOException
     *             if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            internalIn.close();
        }
    }

    /**
     * Read decoded characters into {@code cbuf}, initializing the reader
     * (and skipping any BOM) on first use.
     *
     * @param cbuf
     *            destination buffer
     * @param off
     *            offset in {@code cbuf} at which to start storing characters
     * @param len
     *            maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException
     *             if reading or decoding fails
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }
}