/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Objects;

import org.apache.commons.io.Charsets;
import org.apache.commons.io.charset.CharsetEncoders;

/**
 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte
 * stream using a specified charset encoding. The stream is transformed using a {@link CharsetEncoder} object,
 * guaranteeing that all charset encodings supported by the JRE are handled correctly. In particular for charsets such
 * as UTF-16, the implementation ensures that one and only one byte order marker is produced.
 * <p>
 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy
 * a read request on the {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore
 * no well defined correlation between the current position of the {@link Reader} and that of the
 * {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader}
 * in a {@link java.io.BufferedReader}.
 * </p>
 * <p>
 * {@link ReaderInputStream} implements the inverse transformation of {@link java.io.InputStreamReader}; in the
 * following example, reading from {@code in2} would return the same byte sequence as reading from {@code in} (provided
 * that the initial byte sequence is legal with respect to the charset encoding):
 * </p>
 *
 * <pre>
 * InputStream inputStream = ...
 * Charset cs = ...
 * InputStreamReader reader = new InputStreamReader(inputStream, cs);
 * ReaderInputStream in2 = new ReaderInputStream(reader, cs);
 * </pre>
 * <p>
 * {@link ReaderInputStream} implements the same transformation as {@link java.io.OutputStreamWriter}, except that the
 * control flow is reversed: both classes transform a character stream into a byte stream, but
 * {@link java.io.OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream} pulls it
 * from the underlying stream.
 * </p>
 * <p>
 * Note that while there are use cases where there is no alternative to using this class, very often the need to use
 * this class is an indication of a flaw in the design of the code. This class is typically used in situations where an
 * existing API only accepts an {@link InputStream}, but where the most natural way to produce the data is as a
 * character stream, i.e. by providing a {@link Reader} instance. An example of a situation where this problem may
 * appear is when implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework.
 * </p>
 * <p>
 * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()}
 * are not supported.
 * </p>
 * <p>
 * Instances of {@link ReaderInputStream} are not thread safe.
 * </p>
 *
 * @see org.apache.commons.io.output.WriterOutputStream
 * @since 2.0
 */
public class ReaderInputStream extends InputStream {

    private static final int DEFAULT_BUFFER_SIZE = 1024;

    /**
     * Validates that the requested input buffer size is large enough for the given encoder.
     *
     * @param charsetEncoder the encoder the buffer will feed.
     * @param bufferSize the requested buffer size in characters.
     * @return {@code bufferSize}, unchanged, when it is acceptable.
     * @throws IllegalArgumentException if {@code bufferSize} is smaller than {@link #minBufferSize(CharsetEncoder)}.
     */
    static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) {
        final float minRequired = minBufferSize(charsetEncoder);
        if (bufferSize < minRequired) {
            throw new IllegalArgumentException(
                String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired, charsetEncoder.charset().displayName()));
        }
        return bufferSize;
    }

    /**
     * Computes the smallest accepted buffer size for the given encoder: twice its
     * {@link CharsetEncoder#maxBytesPerChar() maximum bytes per character}.
     *
     * @param charsetEncoder the encoder to inspect.
     * @return the minimum buffer size.
     */
    static float minBufferSize(final CharsetEncoder charsetEncoder) {
        return charsetEncoder.maxBytesPerChar() * 2;
    }

    private final Reader reader;

    private final CharsetEncoder charsetEncoder;

    /**
     * CharBuffer used as input for the encoder. It should be reasonably large as we read data from the underlying Reader
     * into this buffer.
     */
    private final CharBuffer encoderIn;

    /**
     * ByteBuffer used as output for the encoder. This buffer can be small as it is only used to transfer data from the
     * encoder to the buffer provided by the caller.
     */
    private final ByteBuffer encoderOut;

    /** Result of the most recent encode/flush operation; {@code null} until the first fill. */
    private CoderResult lastCoderResult;

    /** Set once the underlying reader has reported end of stream. */
    private boolean endOfInput;

    /**
     * Constructs a new {@link ReaderInputStream} that uses the default character encoding with a default input buffer size
     * of {@value #DEFAULT_BUFFER_SIZE} characters.
     *
     * @param reader the target {@link Reader}
     * @deprecated 2.5 use {@link #ReaderInputStream(Reader, Charset)} instead
     */
    @Deprecated
    public ReaderInputStream(final Reader reader) {
        this(reader, Charset.defaultCharset());
    }

    /**
     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value #DEFAULT_BUFFER_SIZE}
     * characters.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input
     * and unmappable characters.
     * </p>
     *
     * @param reader the target {@link Reader}
     * @param charset the charset encoding
     */
    public ReaderInputStream(final Reader reader, final Charset charset) {
        this(reader, charset, DEFAULT_BUFFER_SIZE);
    }

    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input
     * and unmappable characters.
     * </p>
     *
     * @param reader the target {@link Reader}.
     * @param charset the charset encoding.
     * @param bufferSize the size of the input buffer in number of characters.
     */
    public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) {
        // @formatter:off
        this(reader,
             Charsets.toCharset(charset).newEncoder()
                     .onMalformedInput(CodingErrorAction.REPLACE)
                     .onUnmappableCharacter(CodingErrorAction.REPLACE),
             bufferSize);
        // @formatter:on
    }

    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller
     * of this constructor should do this when providing an encoder which had already been in use.
     * </p>
     *
     * @param reader the target {@link Reader}
     * @param charsetEncoder the charset encoder
     * @since 2.1
     */
    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) {
        this(reader, charsetEncoder, DEFAULT_BUFFER_SIZE);
    }

    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller
     * of this constructor should do this when providing an encoder which had already been in use.
     * </p>
     *
     * @param reader the target {@link Reader}
     * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder.
     * @param bufferSize the size of the input buffer in number of characters
     * @since 2.1
     */
    public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) {
        this.reader = reader;
        this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder);
        this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize));
        // Both buffers start out "empty for reading": flip() on a fresh buffer sets limit = 0.
        this.encoderIn.flip();
        this.encoderOut = ByteBuffer.allocate(128);
        this.encoderOut.flip();
    }

    /**
     * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value #DEFAULT_BUFFER_SIZE}
     * characters.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input
     * and unmappable characters.
     * </p>
     *
     * @param reader the target {@link Reader}
     * @param charsetName the name of the charset encoding
     */
    public ReaderInputStream(final Reader reader, final String charsetName) {
        this(reader, charsetName, DEFAULT_BUFFER_SIZE);
    }

    /**
     * Constructs a new {@link ReaderInputStream}.
     *
     * <p>
     * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input
     * and unmappable characters.
     * </p>
     *
     * @param reader the target {@link Reader}
     * @param charsetName the name of the charset encoding, null maps to the default Charset.
     * @param bufferSize the size of the input buffer in number of characters
     */
    public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) {
        this(reader, Charsets.toCharset(charsetName), bufferSize);
    }

    /**
     * Close the stream. This method will cause the underlying {@link Reader} to be closed.
     *
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }

    /**
     * Fills the internal char buffer from the reader and encodes its content into the internal byte buffer.
     * <p>
     * Once the end of the reader has been reached and the encoder has been flushed, this method does nothing:
     * calling {@link CharsetEncoder#encode(CharBuffer, ByteBuffer, boolean)} again on a flushed encoder would throw
     * an {@link IllegalStateException}, which previously surfaced when {@link #read()} was called again after it had
     * already returned end-of-stream.
     * </p>
     *
     * @throws IOException If an I/O error occurs
     */
    private void fillBuffer() throws IOException {
        if (endOfInput) {
            // The final encode + flush already ran in the call that detected EOF.
            return;
        }
        // Only pull more characters when the encoder consumed what it had (or on the very first call);
        // after an overflow the pending input must be drained first.
        if (lastCoderResult == null || lastCoderResult.isUnderflow()) {
            encoderIn.compact();
            final int position = encoderIn.position();
            // We don't use Reader#read(CharBuffer) here because it is more efficient
            // to write directly to the underlying char array (the default implementation
            // copies data to a temporary char array).
            final int c = reader.read(encoderIn.array(), position, encoderIn.remaining());
            if (c == EOF) {
                endOfInput = true;
            } else {
                encoderIn.position(position + c);
            }
            encoderIn.flip();
        }
        encoderOut.compact();
        lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput);
        if (endOfInput) {
            lastCoderResult = charsetEncoder.flush(encoderOut);
        }
        if (lastCoderResult.isError()) {
            lastCoderResult.throwException();
        }
        encoderOut.flip();
    }

    /**
     * Gets the CharsetEncoder.
     *
     * @return the CharsetEncoder.
     */
    CharsetEncoder getCharsetEncoder() {
        return charsetEncoder;
    }

    /**
     * Read a single byte.
     *
     * @return either the byte read or {@code -1} if the end of the stream has been reached
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read() throws IOException {
        for (;;) {
            if (encoderOut.hasRemaining()) {
                return encoderOut.get() & 0xFF;
            }
            fillBuffer();
            if (endOfInput && !encoderOut.hasRemaining()) {
                return EOF;
            }
        }
    }

    /**
     * Read the specified number of bytes into an array.
     *
     * @param b the byte array to read into
     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read(final byte[] b) throws IOException {
        return read(b, 0, b.length);
    }

    /**
     * Read the specified number of bytes into an array.
     *
     * @param array the byte array to read into
     * @param off the offset to start reading bytes into
     * @param len the number of bytes to read
     * @return the number of bytes read or {@code -1} if the end of the stream has been reached
     * @throws NullPointerException if {@code array} is null.
     * @throws IndexOutOfBoundsException if {@code off} or {@code len} is negative, or if {@code off + len} exceeds
     *         the array length.
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read(final byte[] array, int off, int len) throws IOException {
        Objects.requireNonNull(array, "array");
        // Compare against (array.length - off) rather than computing off + len, which can overflow int.
        if (len < 0 || off < 0 || len > array.length - off) {
            throw new IndexOutOfBoundsException("Array size=" + array.length + ", offset=" + off + ", length=" + len);
        }
        int read = 0;
        if (len == 0) {
            return 0; // Always return 0 if len == 0
        }
        while (len > 0) {
            if (encoderOut.hasRemaining()) { // Data from the last read not fully copied
                final int c = Math.min(encoderOut.remaining(), len);
                encoderOut.get(array, off, c);
                off += c;
                len -= c;
                read += c;
            } else if (endOfInput) { // Already reach EOF in the last read
                break;
            } else { // Read again
                fillBuffer();
            }
        }
        return read == 0 && endOfInput ? EOF : read;
    }
}