• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org.apache.commons.io.input;
18 
19 import static org.apache.commons.io.IOUtils.EOF;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24 import java.nio.ByteBuffer;
25 import java.nio.CharBuffer;
26 import java.nio.charset.Charset;
27 import java.nio.charset.CharsetEncoder;
28 import java.nio.charset.CoderResult;
29 import java.nio.charset.CodingErrorAction;
30 import java.util.Objects;
31 
32 import org.apache.commons.io.Charsets;
33 import org.apache.commons.io.charset.CharsetEncoders;
34 
35 /**
36  * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte
37  * stream using a specified charset encoding. The stream is transformed using a {@link CharsetEncoder} object,
38  * guaranteeing that all charset encodings supported by the JRE are handled correctly. In particular for charsets such
39  * as UTF-16, the implementation ensures that one and only one byte order marker is produced.
40  * <p>
41  * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy
42  * a read request on the {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore
43  * no well defined correlation between the current position of the {@link Reader} and that of the
44  * {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader}
45  * in a {@link java.io.BufferedReader}.
46  * </p>
47  * <p>
48  * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the
49  * following example, reading from {@code in2} would return the same byte sequence as reading from {@code in} (provided
50  * that the initial byte sequence is legal with respect to the charset encoding):
51  * </p>
52  *
53  * <pre>
54  * InputStream inputStream = ...
55  * Charset cs = ...
56  * InputStreamReader reader = new InputStreamReader(inputStream, cs);
57  * ReaderInputStream in2 = new ReaderInputStream(reader, cs);
58  * </pre>
59  * <p>
60  * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the
61  * control flow is reversed: both classes transform a character stream into a byte stream, but
62  * {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream} pulls it
63  * from the underlying stream.
64  * </p>
65  * <p>
66  * Note that while there are use cases where there is no alternative to using this class, very often the need to use
67  * this class is an indication of a flaw in the design of the code. This class is typically used in situations where an
68  * existing API only accepts an {@link InputStream}, but where the most natural way to produce the data is as a
69  * character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may
70  * appear is when implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
71  * </p>
72  * <p>
73  * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()}
74  * are not supported.
75  * </p>
76  * <p>
77  * Instances of {@link ReaderInputStream} are not thread safe.
78  * </p>
79  *
80  * @see org.apache.commons.io.output.WriterOutputStream
81  * @since 2.0
82  */
83 public class ReaderInputStream extends InputStream {
84     private static final int DEFAULT_BUFFER_SIZE = 1024;
85 
checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize)86     static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
87         final float minRequired = minBufferSize(charsetEncoder);
88         if (bufferSize < minRequired) {
89             throw new IllegalArgumentException(
90                 String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired, charsetEncoder.charset().displayName()));
91         }
92         return bufferSize;
93     }
94 
minBufferSize(final CharsetEncoder charsetEncoder)95     static float minBufferSize(final CharsetEncoder charsetEncoder) {
96         return charsetEncoder.maxBytesPerChar() * 2;
97     }
98 
99     private final Reader reader;
100 
101     private final CharsetEncoder charsetEncoder;
102 
103     /**
104      * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader
105      * into this buffer.
106      */
107     private final CharBuffer encoderIn;
108     /**
109      * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the
110      * decoder to the buffer provided by the caller.
111      */
112     private final ByteBuffer encoderOut;
113 
114     private CoderResult lastCoderResult;
115 
116     private boolean endOfInput;
117 
118     /**
119      * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size
120      * of {@value #DEFAULT_BUFFER_SIZE} characters.
121      *
122      * @param reader the target {@link Reader}
123      * @deprecated 2.5 use {@link #ReaderInputStream(Reader, Charset)} instead
124      */
125     @Deprecated
ReaderInputStream(final Reader reader)126     public ReaderInputStream(final Reader reader) {
127         this(reader, Charset.defaultCharset());
128     }
129 
130     /**
131      * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value #DEFAULT_BUFFER_SIZE}
132      * characters.
133      *
134      * <p>
135      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input
136      * and unmappable characters.
137      * </p>
138      *
139      * @param reader the target {@link Reader}
140      * @param charset the charset encoding
141      */
ReaderInputStream(final Reader reader, final Charset charset)142     public ReaderInputStream(final Reader reader, final Charset charset) {
143         this(reader, charset, DEFAULT_BUFFER_SIZE);
144     }
145 
146     /**
147      * Constructs a new {@link ReaderInputStream}.
148      *
149      * <p>
150      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input
151      * and unmappable characters.
152      * </p>
153      *
154      * @param reader the target {@link Reader}.
155      * @param charset the charset encoding.
156      * @param bufferSize the size of the input buffer in number of characters.
157      */
ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize)158     public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
159         // @formatter:off
160         this(reader,
161             Charsets.toCharset(charset).newEncoder()
162                     .onMalformedInput(CodingErrorAction.REPLACE)
163                     .onUnmappableCharacter(CodingErrorAction.REPLACE),
164              bufferSize);
165         // @formatter:on
166     }
167 
168     /**
169      * Constructs a new {@link ReaderInputStream}.
170      *
171      * <p>
172      * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller
173      * of this constructor should do this when providing an encoder which had already been in use.
174      * </p>
175      *
176      * @param reader the target {@link Reader}
177      * @param charsetEncoder the charset encoder
178      * @since 2.1
179      */
ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder)180     public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
181         this(reader, charsetEncoder, DEFAULT_BUFFER_SIZE);
182     }
183 
184     /**
185      * Constructs a new {@link ReaderInputStream}.
186      *
187      * <p>
188      * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller
189      * of this constructor should do this when providing an encoder which had already been in use.
190      * </p>
191      *
192      * @param reader the target {@link Reader}
193      * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
194      * @param bufferSize the size of the input buffer in number of characters
195      * @since 2.1
196      */
ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize)197     public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
198         this.reader = reader;
199         this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
200         this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
201         this.encoderIn.flip();
202         this.encoderOut = ByteBuffer.allocate(128);
203         this.encoderOut.flip();
204     }
205 
206     /**
207      * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value #DEFAULT_BUFFER_SIZE}
208      * characters.
209      *
210      * <p>
211      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input
212      * and unmappable characters.
213      * </p>
214      *
215      * @param reader the target {@link Reader}
216      * @param charsetName the name of the charset encoding
217      */
ReaderInputStream(final Reader reader, final String charsetName)218     public ReaderInputStream(final Reader reader, final String charsetName) {
219         this(reader, charsetName, DEFAULT_BUFFER_SIZE);
220     }
221 
222     /**
223      * Constructs a new {@link ReaderInputStream}.
224      *
225      * <p>
226      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input
227      * and unmappable characters.
228      * </p>
229      *
230      * @param reader the target {@link Reader}
231      * @param charsetName the name of the charset encoding, null maps to the default Charset.
232      * @param bufferSize the size of the input buffer in number of characters
233      */
ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize)234     public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
235         this(reader, Charsets.toCharset(charsetName), bufferSize);
236     }
237 
238     /**
239      * Close the stream. This method will cause the underlying {@link Reader} to be closed.
240      *
241      * @throws IOException if an I/O error occurs.
242      */
243     @Override
close()244     public void close() throws IOException {
245         reader.close();
246     }
247 
248     /**
249      * Fills the internal char buffer from the reader.
250      *
251      * @throws IOException If an I/O error occurs
252      */
fillBuffer()253     private void fillBuffer() throws IOException {
254         if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
255             encoderIn.compact();
256             final int position = encoderIn.position();
257             // We don't use Reader#read(CharBuffer) here because it is more efficient
258             // to write directly to the underlying char array (the default implementation
259             // copies data to a temporary char array).
260             final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
261             if (c == EOF) {
262                 endOfInput = true;
263             } else {
264                 encoderIn.position(position + c);
265             }
266             encoderIn.flip();
267         }
268         encoderOut.compact();
269         lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
270         if (endOfInput) {
271             lastCoderResult = charsetEncoder.flush(encoderOut);
272         }
273         if (lastCoderResult.isError()) {
274             lastCoderResult.throwException();
275         }
276         encoderOut.flip();
277     }
278 
279     /**
280      * Gets the CharsetEncoder.
281      *
282      * @return the CharsetEncoder.
283      */
getCharsetEncoder()284     CharsetEncoder getCharsetEncoder() {
285         return charsetEncoder;
286     }
287 
288     /**
289      * Read a single byte.
290      *
291      * @return either the byte read or {@code -1} if the end of the stream has been reached
292      * @throws IOException if an I/O error occurs.
293      */
294     @Override
read()295     public int read() throws IOException {
296         for (;;) {
297             if (encoderOut.hasRemaining()) {
298                 return encoderOut.get() & 0xFF;
299             }
300             fillBuffer();
301             if (endOfInput && !encoderOut.hasRemaining()) {
302                 return EOF;
303             }
304         }
305     }
306 
307     /**
308      * Read the specified number of bytes into an array.
309      *
310      * @param b the byte array to read into
311      * @return the number of bytes read or {@code -1} if the end of the stream has been reached
312      * @throws IOException if an I/O error occurs.
313      */
314     @Override
read(final byte[] b)315     public int read(final byte[] b) throws IOException {
316         return read(b, 0, b.length);
317     }
318 
319     /**
320      * Read the specified number of bytes into an array.
321      *
322      * @param array the byte array to read into
323      * @param off the offset to start reading bytes into
324      * @param len the number of bytes to read
325      * @return the number of bytes read or {@code -1} if the end of the stream has been reached
326      * @throws IOException if an I/O error occurs.
327      */
328     @Override
read(final byte[] array, int off, int len)329     public int read(final byte[] array, int off, int len) throws IOException {
330         Objects.requireNonNull(array, "array");
331         if (len < 0 || off < 0 || off + len > array.length) {
332             throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len);
333         }
334         int read = 0;
335         if (len == 0) {
336             return 0; // Always return 0 if len == 0
337         }
338         while (len > 0) {
339             if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
340                 final int c = Math.min(encoderOut.remaining(), len);
341                 encoderOut.get(array, off, c);
342                 off += c;
343                 len -= c;
344                 read += c;
345             } else if (endOfInput) { // Already reach EOF in the last read
346                 break;
347             } else { // Read again
348                 fillBuffer();
349             }
350         }
351         return read == 0 && endOfInput ? EOF : read;
352     }
353 }
354