• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 package org.apache.commons.io.input;
18 
19 import static org.apache.commons.io.IOUtils.EOF;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.Reader;
24 import java.nio.ByteBuffer;
25 import java.nio.CharBuffer;
26 import java.nio.charset.Charset;
27 import java.nio.charset.CharsetEncoder;
28 import java.nio.charset.CoderResult;
29 import java.nio.charset.CodingErrorAction;
30 import java.util.Objects;
31 
32 import org.apache.commons.io.Charsets;
33 import org.apache.commons.io.IOUtils;
34 import org.apache.commons.io.build.AbstractOrigin;
35 import org.apache.commons.io.build.AbstractStreamBuilder;
36 import org.apache.commons.io.charset.CharsetEncoders;
37 
38 /**
39  * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding.
40  * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In
41  * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced.
42  * <p>
43  * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the
44  * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the
45  * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a
46  * {@link java.io.BufferedReader}.
47  * </p>
48  * <p>
49  * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the following example, reading from {@code in2}
50  * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding):
51  * </p>
52  * <p>
53  * To build an instance, see {@link Builder}.
54  * </p>
55  * <pre>
56  * InputStream inputStream = ...
57  * Charset cs = ...
58  * InputStreamReader reader = new InputStreamReader(inputStream, cs);
59  * ReaderInputStream in2 = ReaderInputStream.builder()
60  *   .setReader(reader)
61  *   .setCharset(cs)
62  *   .get();
63  * </pre>
64  * <p>
65  * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the control flow is reversed: both classes
66  * transform a character stream into a byte stream, but {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream}
67  * pulls it from the underlying stream.
68  * </p>
69  * <p>
70  * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in
71  * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way
72  * to produce the data is as a character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may appear is when
73  * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
74  * </p>
75  * <p>
76  * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported.
77  * </p>
78  * <p>
79  * Instances of {@link ReaderInputStream} are not thread safe.
80  * </p>
81  *
82  * @see org.apache.commons.io.output.WriterOutputStream
83  * @since 2.0
84  */
85 public class ReaderInputStream extends InputStream {
86 
87     /**
88      * Builds a new {@link ReaderInputStream} instance.
89      * <p>
90      * For example:
91      * </p>
92      * <pre>{@code
93      * ReaderInputStream s = ReaderInputStream.builder()
94      *   .setPath(path)
95      *   .setCharsetEncoder(Charset.defaultCharset().newEncoder())
96      *   .get();}
97      * </pre>
98      *
99      * @since 2.12.0
100      */
101     public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> {
102 
103         private CharsetEncoder charsetEncoder = newEncoder(getCharset());
104 
105         /**
106          * Constructs a new instance.
107          * <p>
108          * This builder use the aspects Reader, Charset, CharsetEncoder, buffer size.
109          * </p>
110          * <p>
111          * You must provide an origin that can be converted to a Reader by this builder, otherwise, this call will throw an
112          * {@link UnsupportedOperationException}.
113          * </p>
114          *
115          * @return a new instance.
116          * @throws UnsupportedOperationException if the origin cannot provide a Reader.
117          * @throws IllegalStateException if the {@code origin} is {@code null}.
118          * @see AbstractOrigin#getReader(Charset)
119          */
120         @SuppressWarnings("resource")
121         @Override
get()122         public ReaderInputStream get() throws IOException {
123             return new ReaderInputStream(checkOrigin().getReader(getCharset()), charsetEncoder, getBufferSize());
124         }
125 
getCharsetEncoder()126         CharsetEncoder getCharsetEncoder() {
127             return charsetEncoder;
128         }
129 
130         @Override
setCharset(final Charset charset)131         public Builder setCharset(final Charset charset) {
132             super.setCharset(charset);
133             charsetEncoder = newEncoder(getCharset());
134             return this;
135         }
136 
137         /**
138          * Sets the charset encoder. Assumes that the caller has configured the encoder.
139          *
140          * @param newEncoder the charset encoder, null resets to a default encoder.
141          * @return this
142          */
setCharsetEncoder(final CharsetEncoder newEncoder)143         public Builder setCharsetEncoder(final CharsetEncoder newEncoder) {
144             charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault()));
145             super.setCharset(charsetEncoder.charset());
146             return this;
147         }
148 
149     }
150 
151     /**
152      * Constructs a new {@link Builder}.
153      *
154      * @return a new {@link Builder}.
155      * @since 2.12.0
156      */
builder()157     public static Builder builder() {
158         return new Builder();
159     }
160 
checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize)161     static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
162         final float minRequired = minBufferSize(charsetEncoder);
163         if (bufferSize < minRequired) {
164             throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired,
165                     charsetEncoder.charset().displayName()));
166         }
167         return bufferSize;
168     }
169 
minBufferSize(final CharsetEncoder charsetEncoder)170     static float minBufferSize(final CharsetEncoder charsetEncoder) {
171         return charsetEncoder.maxBytesPerChar() * 2;
172     }
173 
newEncoder(final Charset charset)174     private static CharsetEncoder newEncoder(final Charset charset) {
175         // @formatter:off
176         return Charsets.toCharset(charset).newEncoder()
177                 .onMalformedInput(CodingErrorAction.REPLACE)
178                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
179         // @formatter:on
180     }
181 
182     private final Reader reader;
183 
184     private final CharsetEncoder charsetEncoder;
185 
186     /**
187      * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer.
188      */
189     private final CharBuffer encoderIn;
190     /**
191      * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the
192      * caller.
193      */
194     private final ByteBuffer encoderOut;
195 
196     private CoderResult lastCoderResult;
197 
198     private boolean endOfInput;
199 
200     /**
201      * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size of
202      * {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
203      *
204      * @param reader the target {@link Reader}
205      * @deprecated Use {@link ReaderInputStream#builder()} instead
206      */
207     @Deprecated
ReaderInputStream(final Reader reader)208     public ReaderInputStream(final Reader reader) {
209         this(reader, Charset.defaultCharset());
210     }
211 
212     /**
213      * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
214      *
215      * <p>
216      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
217      * </p>
218      *
219      * @param reader  the target {@link Reader}
220      * @param charset the charset encoding
221      * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses.
222      */
223     @Deprecated
ReaderInputStream(final Reader reader, final Charset charset)224     public ReaderInputStream(final Reader reader, final Charset charset) {
225         this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE);
226     }
227 
228     /**
229      * Constructs a new {@link ReaderInputStream}.
230      *
231      * <p>
232      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
233      * </p>
234      *
235      * @param reader     the target {@link Reader}.
236      * @param charset    the charset encoding.
237      * @param bufferSize the size of the input buffer in number of characters.
238      * @deprecated Use {@link ReaderInputStream#builder()} instead
239      */
240     @Deprecated
ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize)241     public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
242         // @formatter:off
243         this(reader,
244             Charsets.toCharset(charset).newEncoder()
245                     .onMalformedInput(CodingErrorAction.REPLACE)
246                     .onUnmappableCharacter(CodingErrorAction.REPLACE),
247              bufferSize);
248         // @formatter:on
249     }
250 
251     /**
252      * Constructs a new {@link ReaderInputStream}.
253      *
254      * <p>
255      * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
256      * an encoder which had already been in use.
257      * </p>
258      *
259      * @param reader         the target {@link Reader}
260      * @param charsetEncoder the charset encoder
261      * @since 2.1
262      * @deprecated Use {@link ReaderInputStream#builder()} instead
263      */
264     @Deprecated
ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder)265     public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
266         this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE);
267     }
268 
269     /**
270      * Constructs a new {@link ReaderInputStream}.
271      *
272      * <p>
273      * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing
274      * an encoder which had already been in use.
275      * </p>
276      *
277      * @param reader         the target {@link Reader}
278      * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
279      * @param bufferSize     the size of the input buffer in number of characters
280      * @since 2.1
281      * @deprecated Use {@link ReaderInputStream#builder()} instead
282      */
283     @Deprecated
ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize)284     public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
285         this.reader = reader;
286         this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
287         this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
288         this.encoderIn.flip();
289         this.encoderOut = ByteBuffer.allocate(128);
290         this.encoderOut.flip();
291     }
292 
293     /**
294      * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters.
295      *
296      * <p>
297      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
298      * </p>
299      *
300      * @param reader      the target {@link Reader}
301      * @param charsetName the name of the charset encoding
302      * @deprecated Use {@link ReaderInputStream#builder()} instead
303      */
304     @Deprecated
ReaderInputStream(final Reader reader, final String charsetName)305     public ReaderInputStream(final Reader reader, final String charsetName) {
306         this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE);
307     }
308 
309     /**
310      * Constructs a new {@link ReaderInputStream}.
311      *
312      * <p>
313      * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters.
314      * </p>
315      *
316      * @param reader      the target {@link Reader}
317      * @param charsetName the name of the charset encoding, null maps to the default Charset.
318      * @param bufferSize  the size of the input buffer in number of characters
319      * @deprecated Use {@link ReaderInputStream#builder()} instead
320      */
321     @Deprecated
ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize)322     public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
323         this(reader, Charsets.toCharset(charsetName), bufferSize);
324     }
325 
326     /**
327      * Closes the stream. This method will cause the underlying {@link Reader} to be closed.
328      *
329      * @throws IOException if an I/O error occurs.
330      */
331     @Override
close()332     public void close() throws IOException {
333         reader.close();
334     }
335 
336     /**
337      * Fills the internal char buffer from the reader.
338      *
339      * @throws IOException If an I/O error occurs
340      */
fillBuffer()341     private void fillBuffer() throws IOException {
342         if (endOfInput) {
343             return;
344         }
345         if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) {
346             encoderIn.compact();
347             final int position = encoderIn.position();
348             // We don't use Reader#read(CharBuffer) here because it is more efficient
349             // to write directly to the underlying char array (the default implementation
350             // copies data to a temporary char array).
351             final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
352             if (c == EOF) {
353                 endOfInput = true;
354             } else {
355                 encoderIn.position(position + c);
356             }
357             encoderIn.flip();
358         }
359         encoderOut.compact();
360         lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
361         if (endOfInput) {
362             lastCoderResult = charsetEncoder.flush(encoderOut);
363         }
364         if (lastCoderResult.isError()) {
365             lastCoderResult.throwException();
366         }
367         encoderOut.flip();
368     }
369 
370     /**
371      * Gets the CharsetEncoder.
372      *
373      * @return the CharsetEncoder.
374      */
getCharsetEncoder()375     CharsetEncoder getCharsetEncoder() {
376         return charsetEncoder;
377     }
378 
379     /**
380      * Reads a single byte.
381      *
382      * @return either the byte read or {@code -1} if the end of the stream has been reached
383      * @throws IOException if an I/O error occurs.
384      */
385     @Override
read()386     public int read() throws IOException {
387         for (;;) {
388             if (encoderOut.hasRemaining()) {
389                 return encoderOut.get() & 0xFF;
390             }
391             fillBuffer();
392             if (endOfInput && !encoderOut.hasRemaining()) {
393                 return EOF;
394             }
395         }
396     }
397 
398     /**
399      * Reads the specified number of bytes into an array.
400      *
401      * @param b the byte array to read into
402      * @return the number of bytes read or {@code -1} if the end of the stream has been reached
403      * @throws IOException if an I/O error occurs.
404      */
405     @Override
read(final byte[] b)406     public int read(final byte[] b) throws IOException {
407         return read(b, 0, b.length);
408     }
409 
410     /**
411      * Reads the specified number of bytes into an array.
412      *
413      * @param array the byte array to read into
414      * @param off   the offset to start reading bytes into
415      * @param len   the number of bytes to read
416      * @return the number of bytes read or {@code -1} if the end of the stream has been reached
417      * @throws IOException if an I/O error occurs.
418      */
419     @Override
read(final byte[] array, int off, int len)420     public int read(final byte[] array, int off, int len) throws IOException {
421         Objects.requireNonNull(array, "array");
422         if (len < 0 || off < 0 || off + len > array.length) {
423             throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len);
424         }
425         int read = 0;
426         if (len == 0) {
427             return 0; // Always return 0 if len == 0
428         }
429         while (len > 0) {
430             if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
431                 final int c = Math.min(encoderOut.remaining(), len);
432                 encoderOut.get(array, off, c);
433                 off += c;
434                 len -= c;
435                 read += c;
436             } else if (endOfInput) { // Already reach EOF in the last read
437                 break;
438             } else { // Read again
439                 fillBuffer();
440             }
441         }
442         return read == 0 && endOfInput ? EOF : read;
443     }
444 }
445