/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input.compatibility;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.XmlStreamWriter;

/**
 * Character stream that handles all the necessary Voodoo to figure out the
 * charset encoding of the XML document within the stream.
 * <p>
 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
 * This one IS a character stream.
 * </p>
 * <p>
 * All this has to be done without consuming characters from the stream,
 * otherwise the XML parser will not recognize the document as valid XML.
 * This is not 100% true, but it's close enough (the UTF-8 BOM is not handled
 * by all parsers right now, XmlStreamReader handles it and things work in all
 * parsers).
 * </p>
 * <p>
 * The XmlStreamReader class handles the charset encoding of XML documents in
 * Files, raw streams and HTTP streams by offering a wide set of constructors.
 * </p>
 * <p>
 * By default the charset encoding detection is lenient; the constructors with
 * the lenient flag can be used for strict detection (following the HTTP MIME
 * and XML specifications). All this is nicely explained by Mark Pilgrim in his
 * blog, <a
 * href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
 * Determining the character encoding of a feed</a>.
 * </p>
 * <p>
 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
 * Apache License 2.0.
 * </p>
 *
 * @see XmlStreamWriter
 */
public class XmlStreamReader extends Reader {

    private static final String UTF_8 = StandardCharsets.UTF_8.name();

    private static final String US_ASCII = StandardCharsets.US_ASCII.name();

    private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();

    private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();

    private static final String UTF_16 = StandardCharsets.UTF_16.name();

    private static final String UTF_32BE = "UTF-32BE";

    private static final String UTF_32LE = "UTF-32LE";

    private static final String UTF_32 = "UTF-32";

    private static final String EBCDIC = "CP1047";

    private static String staticDefaultEncoding;

    private static final Pattern CHARSET_PATTERN = Pattern
            .compile("charset=[\"']?([.[^; \"']]*)[\"']?");

    public static final Pattern ENCODING_PATTERN = Pattern.compile(
            "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
            Pattern.MULTILINE);

    // The message patterns are kept as plain strings and formatted through the static
    // MessageFormat.format(String, Object...): MessageFormat instances are not synchronized,
    // so sharing them in static fields is unsafe when readers are created concurrently.
    private static final String RAW_EX_1 =
            "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    private static final String RAW_EX_2 =
            "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    private static final String HTTP_EX_1 =
            "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";

    private static final String HTTP_EX_2 =
            "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

    private static final String HTTP_EX_3 =
            "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";

    /**
     * Returns the encoding indicated by the BOM at the current position of the stream.
     * <p>
     * Only the UTF-16BE, UTF-16LE and UTF-8 BOMs are recognized. If a BOM is
     * found it is consumed; otherwise the stream is reset to where it was.
     * </p>
     *
     * @param is stream to inspect, must support mark/reset.
     * @return the BOM encoding, or null if no recognized BOM is present.
     * @throws IOException thrown if there is a problem reading the stream.
     */
    static String getBOMEncoding(final BufferedInputStream is)
            throws IOException {
        String encoding = null;
        final int[] bytes = new int[3];
        is.mark(3);
        bytes[0] = is.read();
        bytes[1] = is.read();
        bytes[2] = is.read();

        if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
            encoding = UTF_16BE;
            // Three bytes were read but the BOM is only two: rewind and skip exactly two.
            is.reset();
            is.read();
            is.read();
        } else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
            encoding = UTF_16LE;
            is.reset();
            is.read();
            is.read();
        } else if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
            // The three bytes already read are exactly the UTF-8 BOM; nothing to rewind.
            encoding = UTF_8;
        } else {
            is.reset();
        }
        return encoding;
    }

    /**
     * Returns the value of the charset parameter of a content-type header, upper-cased.
     *
     * @param httpContentType the content-type header value, may be null.
     * @return the charset parameter value, or null if it is not present or if
     *         httpContentType is null.
     */
    static String getContentTypeEncoding(final String httpContentType) {
        String encoding = null;
        if (httpContentType != null) {
            final int i = httpContentType.indexOf(";");
            if (i > -1) {
                final String postMime = httpContentType.substring(i + 1);
                final Matcher m = CHARSET_PATTERN.matcher(postMime);
                encoding = m.find() ? m.group(1) : null;
                encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
            }
        }
        return encoding;
    }

    /**
     * Returns the MIME type part of a content-type header, that is, the trimmed
     * text before the first ';'.
     *
     * @param httpContentType the content-type header value, may be null.
     * @return the MIME type, or null if httpContentType is null.
     */
    static String getContentTypeMime(final String httpContentType) {
        String mime = null;
        if (httpContentType != null) {
            final int i = httpContentType.indexOf(";");
            mime = (i == -1 ? httpContentType : httpContentType.substring(0, i)).trim();
        }
        return mime;
    }

    /**
     * Returns the default encoding to use if none is set in HTTP content-type,
     * XML prolog and the rules based on content-type are not adequate.
     * <p>
     * If it is null the content-type based rules are used.
     *
     * @return the default encoding to use.
     */
    public static String getDefaultEncoding() {
        return staticDefaultEncoding;
    }

    /**
     * Returns the best guess for the encoding by looking at the first four bytes
     * of the stream, which are compared against the byte patterns the start of
     * an XML declaration ({@code <?} / {@code <?xm}) takes in UTF-16BE, UTF-16LE,
     * UTF-8 and EBCDIC. The stream is reset afterwards, so nothing is consumed.
     *
     * @param is stream to inspect, must support mark/reset.
     * @return the guessed encoding, or null if none of the patterns match.
     * @throws IOException thrown if there is a problem reading the stream.
     */
    private static String getXMLGuessEncoding(final BufferedInputStream is)
            throws IOException {
        String encoding = null;
        final int[] bytes = new int[4];
        is.mark(4);
        bytes[0] = is.read();
        bytes[1] = is.read();
        bytes[2] = is.read();
        bytes[3] = is.read();
        is.reset();

        if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00
                && bytes[3] == 0x3F) {
            encoding = UTF_16BE;
        } else if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F
                && bytes[3] == 0x00) {
            encoding = UTF_16LE;
        } else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78
                && bytes[3] == 0x6D) {
            encoding = UTF_8;
        } else if (bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7
                && bytes[3] == 0x94) {
            encoding = EBCDIC;
        }
        return encoding;
    }

    /**
     * Returns the encoding declared in the {@code <?xml encoding=...?>} prolog.
     * <p>
     * At most {@link IOUtils#DEFAULT_BUFFER_SIZE} bytes are read, decoded with
     * the guessed encoding, and searched for the first '&gt;'. On success the
     * stream is reset to where it was before the call.
     * </p>
     *
     * @param is stream to inspect, must support mark/reset.
     * @param guessedEnc encoding used to decode the first bytes; if null nothing
     *        is read and null is returned.
     * @return the declared encoding in upper case without its quotes, or null if
     *         none is declared.
     * @throws IOException thrown if the stream ends, or the buffer fills up,
     *         before a '&gt;' is found, or if there is a problem reading the stream.
     */
    private static String getXmlProlog(final BufferedInputStream is, final String guessedEnc)
            throws IOException {
        if (guessedEnc == null) {
            return null;
        }
        final int limit = IOUtils.DEFAULT_BUFFER_SIZE;
        final byte[] bytes = IOUtils.byteArray();
        is.mark(limit);
        int offset = 0;
        int firstGT = -1;
        boolean endOfStream = false;
        String xmlProlog = "";
        // Inspect what has been read so far BEFORE asking for more, so that no
        // additional (possibly blocking) read is issued once '>' has been seen.
        while (firstGT == -1 && offset < limit) {
            final int count = is.read(bytes, offset, limit - offset);
            if (count == -1) {
                endOfStream = true;
                break;
            }
            offset += count;
            xmlProlog = new String(bytes, 0, offset, guessedEnc);
            firstGT = xmlProlog.indexOf('>');
        }
        if (firstGT == -1) {
            if (endOfStream) {
                throw new IOException("Unexpected end of XML stream");
            }
            throw new IOException(
                    "XML prolog or ROOT element not found on first "
                            + offset + " bytes");
        }
        // Give the bytes back: the caller still has to decode the whole document.
        is.reset();

        // Join the lines of the prolog: '.' in ENCODING_PATTERN does not match line terminators.
        final StringBuilder prolog = new StringBuilder();
        try (BufferedReader bReader = new BufferedReader(new StringReader(
                xmlProlog.substring(0, firstGT + 1)))) {
            String line;
            while ((line = bReader.readLine()) != null) {
                prolog.append(line);
            }
        }
        String encoding = null;
        final Matcher m = ENCODING_PATTERN.matcher(prolog);
        if (m.find()) {
            final String quoted = m.group(1).toUpperCase(Locale.ROOT);
            // Strip the surrounding single or double quotes.
            encoding = quoted.substring(1, quoted.length() - 1);
        }
        return encoding;
    }

    /**
     * Indicates if the MIME type belongs to the APPLICATION XML family.
     *
     * @param mime the MIME type, may be null.
     * @return true if mime is one of the application XML types or is
     *         {@code application/*+xml}.
     */
    static boolean isAppXml(final String mime) {
        return mime != null
                && (mime.equals("application/xml")
                        || mime.equals("application/xml-dtd")
                        || mime.equals("application/xml-external-parsed-entity")
                        || (mime.startsWith("application/") && mime.endsWith("+xml")));
    }

    /**
     * Indicates if the MIME type belongs to the TEXT XML family.
     *
     * @param mime the MIME type, may be null.
     * @return true if mime is one of the text XML types or is {@code text/*+xml}.
     */
    static boolean isTextXml(final String mime) {
        return mime != null
                && (mime.equals("text/xml")
                        || mime.equals("text/xml-external-parsed-entity")
                        || (mime.startsWith("text/") && mime.endsWith("+xml")));
    }

    /**
     * Sets the default encoding to use if none is set in HTTP content-type, XML
     * prolog and the rules based on content-type are not adequate.
     * <p>
     * If it is set to null the content-type based rules are used.
     * <p>
     * By default it is null.
     *
     * @param encoding charset encoding to default to.
     */
    public static void setDefaultEncoding(final String encoding) {
        staticDefaultEncoding = encoding;
    }

    /** Builds the exception thrown when detection on an HTTP stream fails. */
    private static XmlStreamReaderException httpException(final String pattern, final String cTMime,
            final String cTEnc, final String bomEnc, final String xmlGuessEnc, final String xmlEnc,
            final InputStream is) {
        return new XmlStreamReaderException(
                MessageFormat.format(pattern, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc),
                cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
    }

    /** Builds the exception thrown when detection on a raw stream fails. */
    private static XmlStreamReaderException rawException(final String pattern, final String bomEnc,
            final String xmlGuessEnc, final String xmlEnc, final InputStream is) {
        return new XmlStreamReaderException(
                MessageFormat.format(pattern, bomEnc, xmlGuessEnc, xmlEnc),
                bomEnc, xmlGuessEnc, xmlEnc, is);
    }

    private Reader reader;

    private String encoding;

    private final String defaultEncoding;

    /**
     * Creates a Reader for a File.
     * <p>
     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset,
     * if this is also missing defaults to UTF-8.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     *
     * @param file File to create a Reader from.
     * @throws IOException thrown if there is a problem reading the file.
     */
    @SuppressWarnings("resource") // The file stream is closed when this object is closed.
    public XmlStreamReader(final File file) throws IOException {
        // NOTE(review): if detection fails with an IOException the stream opened here is never closed.
        this(Files.newInputStream(file.toPath()));
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     *
     * @param inputStream InputStream to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream.
     */
    public XmlStreamReader(final InputStream inputStream) throws IOException {
        this(inputStream, true);
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else 'UTF-8' is used.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     *
     * @param inputStream InputStream to create a Reader from.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specification.
     */
    public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException,
            XmlStreamReaderException {
        defaultEncoding = staticDefaultEncoding;
        try {
            doRawStream(inputStream);
        } catch (final XmlStreamReaderException ex) {
            if (!lenient) {
                throw ex;
            }
            doLenientDetection(null, ex);
        }
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the
     * content-type encoding. If there is not content-type encoding checks the
     * XML prolog encoding. If there is not XML prolog encoding uses the default
     * encoding mandated by the content-type MIME type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     *
     * @param inputStream InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @throws IOException thrown if there is a problem reading the file.
     */
    public XmlStreamReader(final InputStream inputStream, final String httpContentType)
            throws IOException {
        this(inputStream, httpContentType, true);
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header. This constructor is lenient regarding the encoding detection.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the
     * content-type encoding. If there is not content-type encoding checks the
     * XML prolog encoding. If there is not XML prolog encoding uses the default
     * encoding mandated by the content-type MIME type.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else 'UTF-8' is used.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     *
     * @param inputStream InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @throws IOException thrown if there is a problem reading the file.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specification.
     */
    public XmlStreamReader(final InputStream inputStream, final String httpContentType,
            final boolean lenient) throws IOException, XmlStreamReaderException {
        this(inputStream, httpContentType, lenient, null);
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header. This constructor is lenient regarding the encoding detection.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the
     * content-type encoding. If there is not content-type encoding checks the
     * XML prolog encoding. If there is not XML prolog encoding uses the default
     * encoding mandated by the content-type MIME type.
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * <p>
     * Else 'UTF-8' is used.
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     *
     * @param inputStream InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @param defaultEncoding the default encoding to use; if null the static
     *        default encoding is used.
     * @throws IOException thrown if there is a problem reading the file.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specification.
     */
    public XmlStreamReader(final InputStream inputStream, final String httpContentType,
            final boolean lenient, final String defaultEncoding) throws IOException,
            XmlStreamReaderException {
        this.defaultEncoding = defaultEncoding == null ? staticDefaultEncoding
                : defaultEncoding;
        try {
            doHttpStream(inputStream, httpContentType, lenient);
        } catch (final XmlStreamReaderException ex) {
            if (!lenient) {
                throw ex;
            }
            doLenientDetection(httpContentType, ex);
        }
    }

    /**
     * Creates a Reader using the InputStream of a URL.
     * <p>
     * If the URL is not of type HTTP and there is not 'content-type' header in
     * the fetched data it uses the same logic used for Files.
     * <p>
     * If the URL is a HTTP Url or there is a 'content-type' header in the
     * fetched data it uses the same logic used for an InputStream with
     * content-type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     *
     * @param url URL to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of
     *         the URL.
     */
    public XmlStreamReader(final URL url) throws IOException {
        // TODO URLConnection leak.
        this(url.openConnection());
    }

    /**
     * Creates a Reader using the InputStream of a URLConnection.
     * <p>
     * If the URLConnection is not of type HttpURLConnection and there is not
     * 'content-type' header in the fetched data it uses the same logic used for
     * files.
     * <p>
     * If the URLConnection is a HTTP Url or there is a 'content-type' header in
     * the fetched data it uses the same logic used for an InputStream with
     * content-type.
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     *
     * @param conn URLConnection to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of
     *         the URLConnection.
     */
    public XmlStreamReader(final URLConnection conn) throws IOException {
        defaultEncoding = staticDefaultEncoding;
        final boolean lenient = true;
        if (conn instanceof HttpURLConnection || conn.getContentType() != null) {
            try {
                doHttpStream(conn.getInputStream(), conn.getContentType(),
                        lenient);
            } catch (final XmlStreamReaderException ex) {
                doLenientDetection(conn.getContentType(), ex);
            }
        } else {
            try {
                doRawStream(conn.getInputStream());
            } catch (final XmlStreamReaderException ex) {
                doLenientDetection(null, ex);
            }
        }
    }

    /**
     * Calculates the encoding of an HTTP stream from everything that was sniffed.
     * <p>
     * In lenient mode a non-null XML prolog encoding wins outright. Otherwise the
     * MIME type must belong to the application or text XML families, and the
     * content-type charset, BOM and prolog are reconciled.
     * </p>
     *
     * @param cTMime MIME type of the content-type header.
     * @param cTEnc charset of the content-type header, may be null.
     * @param bomEnc encoding indicated by the BOM, may be null.
     * @param xmlGuessEnc encoding guessed from the first bytes, may be null.
     * @param xmlEnc encoding declared in the XML prolog, may be null.
     * @param is the stream; it is only passed on to a thrown XmlStreamReaderException.
     * @param lenient indicates if the detection is relaxed.
     * @return the encoding to use.
     * @throws IOException (an XmlStreamReaderException) if the inputs are inconsistent
     *         or the MIME type is not an XML one.
     */
    String calculateHttpEncoding(final String cTMime, final String cTEnc,
            final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final InputStream is,
            final boolean lenient) throws IOException {
        final String encoding;
        if (lenient && xmlEnc != null) {
            encoding = xmlEnc;
        } else {
            final boolean appXml = isAppXml(cTMime);
            final boolean textXml = isTextXml(cTMime);
            if (!appXml && !textXml) {
                throw httpException(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            if (cTEnc == null) {
                if (appXml) {
                    encoding = calculateRawEncoding(bomEnc, xmlGuessEnc,
                            xmlEnc, is);
                } else {
                    encoding = defaultEncoding == null ? US_ASCII
                            : defaultEncoding;
                }
            } else if (bomEnc != null
                    && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
                // An explicit byte order in the content-type must not be accompanied by a BOM.
                throw httpException(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
            } else if (cTEnc.equals(UTF_16)) {
                if (bomEnc == null || !bomEnc.startsWith(UTF_16)) {
                    throw httpException(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
                }
                encoding = bomEnc;
            } else if (bomEnc != null
                    && (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE))) {
                throw httpException(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
            } else if (cTEnc.equals(UTF_32)) {
                if (bomEnc == null || !bomEnc.startsWith(UTF_32)) {
                    throw httpException(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
                }
                encoding = bomEnc;
            } else {
                encoding = cTEnc;
            }
        }
        return encoding;
    }

    /**
     * Calculates the encoding of a raw stream from the BOM, the guess made from
     * the first bytes and the encoding declared in the XML prolog.
     *
     * @param bomEnc encoding indicated by the BOM, may be null.
     * @param xmlGuessEnc encoding guessed from the first bytes, may be null.
     * @param xmlEnc encoding declared in the XML prolog, may be null.
     * @param is the stream; it is only passed on to a thrown XmlStreamReaderException.
     * @return the encoding to use.
     * @throws IOException (an XmlStreamReaderException) if the BOM is unknown or
     *         disagrees with the guessed or declared encoding.
     */
    String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc,
            final String xmlEnc, final InputStream is) throws IOException {
        final String encoding;
        if (bomEnc == null) {
            if (xmlGuessEnc == null || xmlEnc == null) {
                encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
            } else if (xmlEnc.equals(UTF_16)
                    && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
                // The prolog does not state the byte order; the guess does.
                encoding = xmlGuessEnc;
            } else if (xmlEnc.equals(UTF_32)
                    && (xmlGuessEnc.equals(UTF_32BE) || xmlGuessEnc.equals(UTF_32LE))) {
                encoding = xmlGuessEnc;
            } else {
                encoding = xmlEnc;
            }
        } else if (bomEnc.equals(UTF_8)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
                throw rawException(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
                throw rawException(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            encoding = UTF_8;
        } else {
            if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
                if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
                    throw rawException(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc, is);
                }
                if (xmlEnc != null && !xmlEnc.equals(UTF_16)
                        && !xmlEnc.equals(bomEnc)) {
                    throw rawException(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc, is);
                }
            } else if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
                if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
                    throw rawException(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc, is);
                }
                if (xmlEnc != null && !xmlEnc.equals(UTF_32)
                        && !xmlEnc.equals(bomEnc)) {
                    throw rawException(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc, is);
                }
            } else {
                throw rawException(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            encoding = bomEnc;
        }
        return encoding;
    }

    /**
     * Closes the XmlStreamReader stream.
     *
     * @throws IOException thrown if there was a problem closing the stream.
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }

    /**
     * Runs the detection for a stream that comes with a content-type header and
     * prepares the underlying reader with the resulting encoding.
     */
    private void doHttpStream(final InputStream inputStream, final String httpContentType,
            final boolean lenient) throws IOException {
        final BufferedInputStream pis = new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE);
        final String cTMime = getContentTypeMime(httpContentType);
        final String cTEnc = getContentTypeEncoding(httpContentType);
        final String bomEnc = getBOMEncoding(pis);
        final String xmlGuessEnc = getXMLGuessEncoding(pis);
        final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
        final String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc,
                xmlGuessEnc, xmlEnc, pis, lenient);
        prepareReader(pis, encoding);
    }

    /**
     * Fallback used after a failed detection in lenient mode.
     * <p>
     * A 'text/html' content type is retried as 'text/xml'. If that is not
     * applicable or fails again, the first non-null of the XML prolog encoding,
     * the content-type encoding and the default encoding (UTF-8 if none is set)
     * is used.
     * </p>
     */
    private void doLenientDetection(final String httpContentType,
            final XmlStreamReaderException ex) throws IOException {
        XmlStreamReaderException failure = ex;
        if (httpContentType != null && httpContentType.startsWith("text/html")) {
            final String xmlContentType = "text/xml"
                    + httpContentType.substring("text/html".length());
            try {
                doHttpStream(ex.getInputStream(), xmlContentType, true);
                failure = null;
            } catch (final XmlStreamReaderException ex2) {
                failure = ex2;
            }
        }
        if (failure != null) {
            String encoding = failure.getXmlEncoding();
            if (encoding == null) {
                encoding = failure.getContentTypeEncoding();
            }
            if (encoding == null) {
                encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
            }
            prepareReader(failure.getInputStream(), encoding);
        }
    }

    /**
     * Runs the detection for a stream without content-type information and
     * prepares the underlying reader with the resulting encoding.
     */
    private void doRawStream(final InputStream inputStream)
            throws IOException {
        final BufferedInputStream pis = new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE);
        final String bomEnc = getBOMEncoding(pis);
        final String xmlGuessEnc = getXMLGuessEncoding(pis);
        final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
        final String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
        prepareReader(pis, encoding);
    }

    /**
     * Returns the charset encoding of the XmlStreamReader.
     *
     * @return charset encoding.
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * Creates the delegate reader over the stream with the given encoding and
     * records that encoding.
     */
    private void prepareReader(final InputStream inputStream, final String encoding)
            throws IOException {
        reader = new InputStreamReader(inputStream, encoding);
        this.encoding = encoding;
    }

    /**
     * Reads characters from the underlying reader into a portion of an array.
     *
     * @param buf destination buffer.
     * @param offset offset at which to start storing characters.
     * @param len maximum number of characters to read.
     * @return whatever the underlying reader returns for this read.
     * @throws IOException thrown if there is a problem reading the stream.
     */
    @Override
    public int read(final char[] buf, final int offset, final int len) throws IOException {
        return reader.read(buf, offset, len);
    }

}