• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2008, http://www.snakeyaml.org
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 package org.yaml.snakeyaml.reader;
17 
18 /**
19  version: 1.1 / 2007-01-25
20  - changed BOM recognition ordering (longer boms first)
21 
22  Original pseudocode   : Thomas Weidenfeller
23  Implementation tweaked: Aki Nieminen
24  Implementation changed: Andrey Somov
25  * UTF-32 removed because it is not supported by YAML
26  * no default encoding
27 
28  http://www.unicode.org/unicode/faq/utf_bom.html
29  BOMs:
30  00 00 FE FF    = UTF-32, big-endian
31  FF FE 00 00    = UTF-32, little-endian
32  EF BB BF       = UTF-8,
33  FE FF          = UTF-16, big-endian
34  FF FE          = UTF-16, little-endian
35 
36  Win2k Notepad:
37  Unicode format = UTF-16LE
38  ***/
39 
40 import java.io.IOException;
41 import java.io.InputStream;
42 import java.io.InputStreamReader;
43 import java.io.PushbackInputStream;
44 import java.io.Reader;
45 import java.nio.charset.Charset;
46 import java.nio.charset.CharsetDecoder;
47 import java.nio.charset.CodingErrorAction;
48 
/**
 * Generic Unicode text reader which uses the byte order mark (BOM) at the
 * start of the stream to identify the encoding. UTF-8, UTF-16BE and UTF-16LE
 * BOMs are recognised; the BOM itself is consumed and never returned to the
 * caller. If no BOM is found the stream is decoded as UTF-8.
 * <p>
 * The underlying stream is not touched until the first call to
 * {@link #read(char[], int, int)} or {@link #init()}.
 */
public class UnicodeReader extends Reader {
    private static final Charset UTF8 = Charset.forName("UTF-8");
    private static final Charset UTF16BE = Charset.forName("UTF-16BE");
    private static final Charset UTF16LE = Charset.forName("UTF-16LE");

    /** Longest BOM that is recognised (the UTF-8 one), in bytes. */
    private static final int BOM_SIZE = 3;

    /** Wraps the caller's stream so that non-BOM look-ahead bytes can be pushed back. */
    PushbackInputStream internalIn;

    /** Decoding reader; {@code null} until {@link #init()} has detected the encoding. */
    InputStreamReader internalIn2 = null;

    /**
     * @param in
     *            InputStream to be read
     */
    public UnicodeReader(InputStream in) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
    }

    /**
     * Get the stream encoding.
     *
     * @return the encoding name reported by the underlying
     *         {@link InputStreamReader}, or {@code null} if the stream has not
     *         been initialized yet (call {@link #init()} or a read method
     *         first)
     */
    public String getEncoding() {
        // Before the first read there is no decoder yet; report "unknown"
        // instead of failing with a NullPointerException.
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Read ahead up to {@link #BOM_SIZE} bytes and check for a BOM. Bytes that
     * are not part of a BOM are pushed back to the stream; only BOM bytes are
     * skipped. Calling this method more than once has no further effect.
     *
     * @throws IOException
     *             if reading from or pushing back to the underlying stream fails
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        // A single read() may legally return fewer bytes than requested even
        // when more are coming, so keep reading until the look-ahead buffer is
        // full or the stream ends.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < bom.length) {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count < 0) {
                break;
            }
            n += count;
        }

        Charset encoding;
        int bomLength;
        if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
            encoding = UTF8;
            bomLength = 3;
        } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
            encoding = UTF16BE;
            bomLength = 2;
        } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
            encoding = UTF16LE;
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes and fall back to UTF-8
            encoding = UTF8;
            bomLength = 0;
        }

        // Give back everything that was read beyond the BOM.
        if (n > bomLength) {
            internalIn.unread(bom, bomLength, n - bomLength);
        }

        // Unmappable input is reported as an error rather than silently replaced.
        CharsetDecoder decoder = encoding.newDecoder().onUnmappableCharacter(
                CodingErrorAction.REPORT);
        internalIn2 = new InputStreamReader(internalIn, decoder);
    }

    /**
     * Close the reader and the underlying stream. If nothing has been read
     * yet, the underlying stream is closed directly without reading from it.
     *
     * @throws IOException
     *             if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            // No need to sniff the BOM (a potentially blocking or failing
            // read) just to close the stream.
            internalIn.close();
        }
    }

    /**
     * Read decoded characters into a portion of an array, detecting the
     * encoding first if that has not happened yet.
     *
     * @param cbuf
     *            destination buffer
     * @param off
     *            offset at which to start storing characters
     * @param len
     *            maximum number of characters to read
     * @return the number of characters read, or -1 at the end of the stream
     * @throws IOException
     *             if an I/O error occurs or the input cannot be decoded
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }
}