1 // Copyright (c) 2012 Jeff Ichnowski 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions 6 // are met: 7 // 8 // * Redistributions of source code must retain the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer. 11 // 12 // * Redistributions in binary form must reproduce the above 13 // copyright notice, this list of conditions and the following 14 // disclaimer in the documentation and/or other materials 15 // provided with the distribution. 16 // 17 // * Neither the name of the OWASP nor the names of its 18 // contributors may be used to endorse or promote products 19 // derived from this software without specific prior written 20 // permission. 21 // 22 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 27 // INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 28 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 29 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 31 // STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 33 // OF THE POSSIBILITY OF SUCH DAMAGE. 34 35 package org.owasp.encoder; 36 37 import java.io.IOException; 38 import java.io.Writer; 39 import java.nio.CharBuffer; 40 import java.nio.charset.CoderResult; 41 42 /** 43 * Encode -- fluent interface for contextual encoding. 
Example usage in a JSP: 44 * 45 * <pre> 46 * <input value="<%=Encode.forHtml(value)%>" /> 47 * </pre> 48 * 49 * <p>There are two versions of each contextual encoding method. The first 50 * takes a {@code String} argument and returns the encoded version as a 51 * {@code String}. The second version writes the encoded version directly 52 * to a {@code Writer}.</p> 53 * 54 * <p>Please make sure to read and understand the context that the method encodes 55 * for. Encoding for the incorrect context will likely lead to exposing a 56 * cross-site scripting vulnerability. Those new to XSS mitigation may find it 57 * useful to read the 58 * <a href="https://cheatsheetseries.owasp.org/cheatsheets/Cross_Site_Scripting_Prevention_Cheat_Sheet.html"> 59 * Cross Site Scripting Prevention Cheat Sheet</a> that is part of the OWASP Cheat Sheet series for background 60 * material. 61 * </p> 62 * 63 * @author Jeff Ichnowski 64 */ 65 public final class Encode { 66 /** No instances. */ Encode()67 private Encode() {} 68 69 /** 70 * <p>Encodes for (X)HTML text content and text attributes. Since 71 * this method encodes for both contexts, it may be slightly less 72 * efficient to use this method over the methods targeted towards 73 * the specific contexts ({@link #forHtmlAttribute(String)} and 74 * {@link #forHtmlContent(String)}). 
* In general this method should
     * be preferred unless you are really concerned with saving a few
     * bytes or are writing a framework that utilizes this
     * package.</p>
     *
     * <b>Example JSP Usage</b>
     * <pre>
     *     &lt;div&gt;&lt;%=Encode.forHtml(unsafeData)%&gt;&lt;/div&gt;
     *
     *     &lt;input value="&lt;%=Encode.forHtml(unsafeData)%&gt;" /&gt;
     * </pre>
     *
     * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
     *   <caption><b>Encoding Table</b></caption>
     *   <thead>
     *     <tr>
     *       <th align="left" class="colFirst">Input</th>
     *       <th align="left" class="colLast">Result</th>
     *     </tr>
     *   </thead>
     *   <tbody>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code &}</td>
     *       <td class="colLast">{@code &amp;}</td>
     *     </tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code <}</td>
     *       <td class="colLast">{@code &lt;}</td>
     *     </tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code >}</td>
     *       <td class="colLast">{@code &gt;}</td>
     *     </tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code "}</td>
     *       <td class="colLast">{@code &#34;}</td>
     *     </tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code '}</td>
     *       <td class="colLast">{@code &#39;}</td>
     *     </tr>
     *   </tbody>
     * </table>
     *
     * <p><b>Additional Notes</b></p>
     * <ul>
     * <li>The encoding of the greater-than sign ({@code >}) is not
     * strictly required, but is included for maximum
     * compatibility.</li>
     *
     * <li>Numeric encoding is used for double-quote character ({@code
     * &#34;}) as it is shorter than the also valid {@code &quot;}.</li>
     *
     * <li>Carriage return (U+0D), line-feed (U+0A), horizontal tab
     * (U+09) and space (U+20) are valid in quoted attributes and in
     * block in an unescaped form.</li>
     *
     * <li>Surrogate pairs are passed through only if valid.</li>
     *
     * <li>Characters that are not <a
     * href="http://www.w3.org/TR/REC-xml/#charsets">valid according
     * to the
XML specification</a> are replaced by a space character 136 * as they could lead to parsing errors. In particular only {@code #x9 137 * | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | 138 * [#x10000-#x10FFFF]} are considered valid.</li> 139 * </ul> 140 * 141 * @param input the data to encode 142 * @return the data encoded for html. 143 */ forHtml(String input)144 public static String forHtml(String input) { 145 return forXml(input); 146 } 147 148 /** 149 * See {@link #forHtml(String)} for description of encoding. This 150 * version writes directly to a Writer without an intervening string. 151 * 152 * @param out where to write encoded output 153 * @param input the input string to encode 154 * @throws IOException if thrown by writer 155 */ forHtml(Writer out, String input)156 public static void forHtml(Writer out, String input) throws IOException { 157 forXml(out, input); 158 } 159 160 /** 161 * <p>This method encodes for HTML text content. It does not escape 162 * quotation characters and is thus unsafe for use with 163 * HTML attributes. 
* Use either {@link #forHtml(String)} or {@link #forHtmlAttribute(String)} for those
     * methods.</p>
     *
     * <b>Example JSP Usage</b>
     * <pre>
     *     &lt;div&gt;&lt;%=Encode.forHtmlContent(unsafeData)%&gt;&lt;/div&gt;
     * </pre>
     * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
     *   <caption><b>Encoding Table</b></caption>
     *   <thead>
     *     <tr>
     *       <th align="left" class="colFirst">Input</th>
     *       <th align="left" class="colLast">Result</th>
     *     </tr>
     *   </thead>
     *   <tbody>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code &}</td>
     *       <td class="colLast">{@code &amp;}</td>
     *     </tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code <}</td>
     *       <td class="colLast">{@code &lt;}</td>
     *     </tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code >}</td>
     *       <td class="colLast">{@code &gt;}</td>
     *     </tr>
     *   </tbody>
     * </table>
     *
     * <p><b>Additional Notes</b></p>
     * <ul>
     * <li>Single-quote character ({@code '}) and double-quote
     * character ({@code "}) do not require encoding in HTML
     * blocks, unlike other HTML contexts.</li>
     *
     * <li>The encoding of the greater-than sign ({@code >}) is not
     * strictly required, but is included for maximum
     * compatibility.</li>
     *
     * <li>Carriage return (U+0D), line-feed (U+0A), horizontal tab
     * (U+09) and space (U+20) are valid in quoted attributes and in
     * block in an unescaped form.</li>
     *
     * <li>Surrogate pairs are passed through only if valid.</li>
     *
     * <li>Characters that are not <a
     * href="http://www.w3.org/TR/REC-xml/#charsets">valid according
     * to the XML specification</a> are replaced by a space character
     * as they could lead to parsing errors.
In particular only {@code #x9 214 * | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | 215 * [#x10000-#x10FFFF]} are considered valid.</li> 216 * </ul> 217 * 218 * @param input the input to encode 219 * @return the encoded result 220 */ forHtmlContent(String input)221 public static String forHtmlContent(String input) { 222 return forXmlContent(input); 223 } 224 225 /** 226 * See {@link #forHtmlContent(String)} for description of encoding. This 227 * version writes directly to a Writer without an intervening string. 228 * 229 * @param out where to write encoded output 230 * @param input the input string to encode 231 * @throws IOException if thrown by writer 232 */ forHtmlContent(Writer out, String input)233 public static void forHtmlContent(Writer out, String input) 234 throws IOException 235 { 236 forXmlContent(out, input); 237 } 238 239 /** 240 * <p>This method encodes for HTML text attributes. Do not use for JavaScript event attributes or for attributes 241 * that are interpreted as a URL. 
* Instead use {@link #forJavaScript(String)} and {@link #forUriComponent(String)}
     * respectively for those.</p>
     *
     * <b>Example JSP Usage</b>
     * <pre>
     *     &lt;input value="&lt;%=Encode.forHtmlAttribute(unsafeData)%&gt;" title='&lt;%=Encode.forHtmlAttribute(moreUnsafeData)%&gt;' /&gt;
     * </pre>
     *
     * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
     *   <caption><b>Encoding Table</b></caption>
     *   <thead>
     *     <tr>
     *       <th align="left" class="colFirst">Input</th>
     *       <th align="left" class="colLast">Result</th>
     *     </tr>
     *   </thead>
     *   <tbody>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code &}</td>
     *       <td class="colLast">{@code &amp;}</td>
     *     </tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code <}</td>
     *       <td class="colLast">{@code &lt;}</td>
     *     </tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code "}</td>
     *       <td class="colLast">{@code &#34;}</td>
     *     </tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code '}</td>
     *       <td class="colLast">{@code &#39;}</td>
     *     </tr>
     *   </tbody>
     * </table>
     *
     * <p><b>Additional Notes</b></p>
     * <ul>
     * <li>When using this method, the caller must provide quotes around the attribute value.</li>
     *
     * <li>Both the single-quote character ({@code '}) and the
     * double-quote character ({@code "}) are encoded so this is safe
     * for HTML attributes with either enclosing character.</li>
     *
     * <li>The encoding of the greater-than sign ({@code >}) is not
     * required for attributes.</li>
     *
     * <li>Numeric encoding is used for double-quote character ({@code
     * &#34;}) as it is shorter than the also valid {@code &quot;}.</li>
     *
     * <li>Carriage return (U+0D), line-feed (U+0A), horizontal tab
     * (U+09) and space (U+20) are valid in quoted attributes and in
     * block in an unescaped form.</li>
     *
     * <li>Surrogate pairs are passed through only if valid.</li>
     *
<li>Characters that are not <a 298 * href="http://www.w3.org/TR/REC-xml/#charsets">valid according 299 * to the XML specification</a> are replaced by a space character 300 * as they could lead to parsing errors. In particular only {@code #x9 301 * | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | 302 * [#x10000-#x10FFFF]} are considered valid.</li> 303 * </ul> 304 * 305 * @param input the input to encode 306 * @return the encoded result 307 */ forHtmlAttribute(String input)308 public static String forHtmlAttribute(String input) { 309 return forXmlAttribute(input); 310 } 311 312 /** 313 * See {@link #forHtmlAttribute(String)} for description of encoding. This 314 * version writes directly to a Writer without an intervening string. 315 * 316 * @param out where to write encoded output 317 * @param input the input string to encode 318 * @throws IOException if thrown by writer 319 */ forHtmlAttribute(Writer out, String input)320 public static void forHtmlAttribute(Writer out, String input) 321 throws IOException 322 { 323 forXmlAttribute(out, input); 324 } 325 326 327 /** 328 * <p>Encodes for unquoted HTML attribute values. {@link 329 * #forHtml(String)} or {@link #forHtmlAttribute(String)} should 330 * usually be preferred over this method as quoted attributes are 331 * XHTML compliant.</p> 332 * 333 * <p>When using this method, the caller is not required to 334 * provide quotes around the attribute (since it is encoded for 335 * such context). The caller should make sure that the attribute 336 * value does not abut unsafe characters--and thus should usually 337 * err on the side of including a space character after the 338 * value.</p> 339 * 340 * <p>Use of this method is discouraged as quoted attributes are 341 * generally more compatible and safer. 
* Also note that no
     * attempt has been made to optimize this encoding, though it is
     * still probably faster than other encoding libraries.</p>
     *
     * <b>Example JSP Usage</b>
     * <pre>
     *     &lt;input value=&lt;%=Encode.forHtmlUnquotedAttribute(input)%&gt; &gt;
     * </pre>
     *
     * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
     *   <caption><b>Encoding Table</b></caption>
     *   <thead>
     *     <tr>
     *       <th align="left" class="colFirst">Input</th>
     *       <th align="left" class="colLast">Result</th>
     *     </tr>
     *   </thead>
     *   <tbody>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code U+0009} (horizontal tab)</td>
     *       <td class="colLast">{@code &#9;}</td></tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code U+000A} (line feed)</td>
     *       <td class="colLast">{@code &#10;}</td></tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code U+000C} (form feed)</td>
     *       <td class="colLast">{@code &#12;}</td></tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code U+000D} (carriage return)</td>
     *       <td class="colLast">{@code &#13;}</td></tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code U+0020} (space)</td>
     *       <td class="colLast">{@code &#32;}</td></tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code &}</td>
     *       <td class="colLast">{@code &amp;}</td></tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code <}</td>
     *       <td class="colLast">{@code &lt;}</td></tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code >}</td>
     *       <td class="colLast">{@code &gt;}</td></tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code "}</td>
     *       <td class="colLast">{@code &#34;}</td></tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code '}</td>
     *       <td class="colLast">{@code &#39;}</td></tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code /}</td>
     *       <td class="colLast">{@code &#47;}</td></tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code =}</td>
     *       <td class="colLast">{@code &#61;}</td></tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code `}</td>
     *       <td class="colLast">{@code &#96;}</td></tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code U+0085} (next line)</td>
     *       <td class="colLast">{@code &#133;}</td></tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code U+2028} (line separator)</td>
     *       <td class="colLast">{@code &#8232;}</td></tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code U+2029} (paragraph separator)</td>
     *       <td class="colLast">{@code
}</td></tr> 407 * </tbody> 408 * </table> 409 * 410 * <p><b>Additional Notes</b></p> 411 * <ul> 412 * <li>The following characters are <i>not</i> encoded: 413 * {@code 0-9, a-z, A-Z}, {@code !}, {@code 414 * #}, {@code $}, {@code %}, 415 * {@code (}, {@code )}, {@code 416 * *}, {@code +}, {@code ,}, 417 * {@code -}, {@code .}, {@code 418 * [}, {@code \}, {@code ]}, 419 * {@code ^}, {@code _}, {@code 420 * }}.</li> 421 * 422 * <li>Surrogate pairs are passed through only if valid. Invalid 423 * surrogate pairs are replaced by a hyphen (-).</li> 424 * 425 * <li>Characters in the C0 and C1 control blocks and not 426 * otherwise listed above are considered invalid and replaced by a 427 * hyphen (-) character.</li> 428 * 429 * <li>Unicode "non-characters" are replaced by hyphens (-).</li> 430 * </ul> 431 * 432 * @param input the attribute value to be encoded. 433 * @return the attribute value encoded for unquoted attribute 434 * context. 435 */ forHtmlUnquotedAttribute(String input)436 public static String forHtmlUnquotedAttribute(String input) { 437 return encode(Encoders.HTML_UNQUOTED_ATTRIBUTE_ENCODER, input); 438 } 439 440 /** 441 * See {@link #forHtmlUnquotedAttribute(String)} for description of encoding. This 442 * version writes directly to a Writer without an intervening string. 443 * 444 * @param out where to write encoded output 445 * @param input the input string to encode 446 * @throws IOException if thrown by writer 447 */ forHtmlUnquotedAttribute(Writer out, String input)448 public static void forHtmlUnquotedAttribute(Writer out, String input) 449 throws IOException 450 { 451 encode(Encoders.HTML_UNQUOTED_ATTRIBUTE_ENCODER, out, input); 452 } 453 454 455 // HTML comment encoding is not currently supported because 456 // of the number of vendor-specific sequences that would need 457 // to be handled (e.g. 
"<!--[if IE]-->" 458 459 // public static String forHtmlComment(String input) { 460 // // only alphanumeric and space, everything else becomes a space 461 // 462 // // HTML comment context needs to avoid browser extensions 463 // // such as "<!--[if IE]-->" 464 // throw new UnsupportedOperationException(); 465 // } 466 467 /** 468 * Encodes for CSS strings. The context must be surrounded by quotation 469 * characters. It is safe for use in both style blocks and attributes in 470 * HTML. 471 * 472 * <b>Example JSP Usage</b> 473 * <pre> 474 * <div style="background: url('<=Encode.forCssString(...)%>');"> 475 * 476 * <style type="text/css"> 477 * background: url('<%=Encode.forCssString(...)%>'); 478 * </style> 479 * </pre> 480 * 481 * <b>Encoding Notes</b> 482 * <ul> 483 * 484 * <li>The following characters are encoded using hexadecimal 485 * encodings: {@code U+0000} - {@code U+001f}, 486 * {@code "}, 487 * {@code '}, 488 * {@code \}, 489 * {@code <}, 490 * {@code &}, 491 * {@code /}, 492 * {@code >}, 493 * {@code U+007f}, 494 * line separator ({@code U+2028}), 495 * paragraph separator ({@code U+2029}).</li> 496 * 497 * <li>Any character requiring encoding is encoded as {@code \xxx} 498 * where {@code xxx} is the shortest hexadecimal representation of 499 * its Unicode code point (after decoding surrogate pairs if 500 * necessary). This encoding is never zero padded. Thus, for 501 * example, the tab character is encoded as {@code \9}, not {@code 502 * \0009}.</li> 503 * 504 * <li>The encoder looks ahead 1 character in the input and 505 * appends a space to an encoding to avoid the next character 506 * becoming part of the hexadecimal encoded sequence. Thus 507 * “{@code '1}” is encoded as “{@code \27 508 * 1}”, and not as “{@code \271}”. If a space 509 * is not necessary, it is not included, thus “{@code 510 * 'x}” is encoded as “{@code \27x}”, and not as 511 * “{@code \27 x}”.</li> 512 * 513 * <li>Surrogate pairs are passed through only if valid. 
Invalid 514 * surrogate pairs are replaced by an underscore (_).</li> 515 * 516 * <li>Unicode "non-characters" are replaced by underscores (_).</li> 517 * 518 * </ul> 519 * 520 * @param input the input to encode 521 * @return the encoded result 522 */ forCssString(String input)523 public static String forCssString(String input) { 524 // need to watch out for CSS expressions 525 return encode(Encoders.CSS_STRING_ENCODER, input); 526 } 527 528 /** 529 * See {@link #forCssString(String)} for description of encoding. This 530 * version writes directly to a Writer without an intervening string. 531 * 532 * @param out where to write encoded output 533 * @param input the input string to encode 534 * @throws IOException if thrown by writer 535 */ forCssString(Writer out, String input)536 public static void forCssString(Writer out, String input) 537 throws IOException 538 { 539 encode(Encoders.CSS_STRING_ENCODER, out, input); 540 } 541 542 /** 543 * Encodes for CSS URL contexts. The context must be surrounded by {@code "url("} 544 * and {@code ")"}. It is safe for use in both style blocks and attributes in HTML. 545 * Note: this does not do any checking on the quality or safety of the URL 546 * itself. The caller should insure that the URL is safe for embedding 547 * (e.g. input validation) by other means. 
548 * 549 * <b>Example JSP Usage</b> 550 * <pre> 551 * <div style="background:url(<=Encode.forCssUrl(...)%>);"> 552 * 553 * <style type="text/css"> 554 * background: url('<%=Encode.forCssUrl(...)%>'); 555 * </style> 556 * </pre> 557 * <b>Encoding Notes</b> 558 * <ul> 559 * 560 * <li>The following characters are encoded using hexadecimal 561 * encodings: {@code U+0000} - {@code U+001f}, 562 * {@code "}, 563 * {@code '}, 564 * {@code \}, 565 * {@code <}, 566 * {@code &}, 567 * {@code /}, 568 * {@code >}, 569 * {@code U+007f}, 570 * line separator ({@code U+2028}), 571 * paragraph separator ({@code U+2029}).</li> 572 * 573 * <li>Any character requiring encoding is encoded as {@code \xxx} 574 * where {@code xxx} is the shortest hexadecimal representation of 575 * its Unicode code point (after decoding surrogate pairs if 576 * necessary). This encoding is never zero padded. Thus, for 577 * example, the tab character is encoded as {@code \9}, not {@code 578 * \0009}.</li> 579 * 580 * <li>The encoder looks ahead 1 character in the input and 581 * appends a space to an encoding to avoid the next character 582 * becoming part of the hexadecimal encoded sequence. Thus 583 * “{@code '1}” is encoded as “{@code \27 584 * 1}”, and not as “{@code \271}”. If a space 585 * is not necessary, it is not included, thus “{@code 586 * 'x}” is encoded as “{@code \27x}”, and not as 587 * “{@code \27 x}”.</li> 588 * 589 * <li>Surrogate pairs are passed through only if valid. Invalid 590 * surrogate pairs are replaced by an underscore (_).</li> 591 * 592 * <li>Unicode "non-characters" are replaced by underscores (_).</li> 593 * 594 * </ul> 595 * 596 * @param input the input to encode 597 * @return the encoded result 598 */ forCssUrl(String input)599 public static String forCssUrl(String input) { 600 return encode(Encoders.CSS_URL_ENCODER, input); 601 } 602 603 /** 604 * See {@link #forCssUrl(String)} for description of encoding. 
This 605 * version writes directly to a Writer without an intervening string. 606 * 607 * @param out where to write encoded output 608 * @param input the input string to encode 609 * @throws IOException if thrown by writer 610 */ forCssUrl(Writer out, String input)611 public static void forCssUrl(Writer out, String input) 612 throws IOException 613 { 614 encode(Encoders.CSS_URL_ENCODER, out, input); 615 } 616 617 /** 618 * <p>Performs percent-encoding of a URL according to RFC 3986. The provided 619 * URL is assumed to a valid URL. This method does not do any checking on 620 * the quality or safety of the URL itself. In many applications it may 621 * be better to use {@link java.net.URI} instead. Note: this is a 622 * particularly dangerous context to put untrusted content in, as for 623 * example a "javascript:" URL provided by a malicious user would be 624 * "properly" escaped, and still execute.</p> 625 * 626 * <b>Encoding Table</b> 627 * <p>The following characters are <i>not</i> encoded:</p> 628 * <pre> 629 * U+20: ! # $ & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; = ? 630 * U+40: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] _ 631 * U+60: a b c d e f g h i j k l m n o p q r s t u v w x y z ~ 632 * </pre> 633 * 634 * <b>Encoding Notes</b> 635 * <ul> 636 * 637 * <li>The single-quote character({@code '}) <b>is not encoded</b>.</li> 638 * 639 * <li>This encoding is not intended to be used standalone. The 640 * output should be encoded to the target context. For example: 641 * {@code <a 642 * href="<%=Encode.forHtmlAttribute(Encode.forUri(uri))%>">...</a>}. 643 * (Note, the single-quote character ({@code '}) is not 644 * encoded.)</li> 645 * 646 * <li>URL encoding is an encoding for bytes, not unicode. The 647 * input string is thus first encoded as a sequence of UTF-8 648 * byte. The bytes are then encoded as {@code %xx} where {@code 649 * xx} is the two-digit hexadecimal representation of the 650 * byte. 
(The implementation does this as one step for 651 * performance.)</li> 652 * 653 * <li>Surrogate pairs are first decoded to a Unicode code point 654 * before encoding as UTF-8.</li> 655 * 656 * <li>Invalid characters (e.g. partial or invalid surrogate 657 * pairs), are replaced with a hyphen ({@code -}) character.</li> 658 * 659 * </ul> 660 * 661 * @param input the input to encode 662 * @return the encoded result 663 */ forUri(String input)664 @Deprecated public static String forUri(String input) { 665 return encode(Encoders.URI_ENCODER, input); 666 } 667 668 /** 669 * See {@link #forUri(String)} for description of encoding. This 670 * version writes directly to a Writer without an intervening string. 671 * 672 * @param out where to write encoded output 673 * @param input the input string to encode 674 * @throws IOException if thrown by writer 675 * 676 * @deprecated There is never a need to encode a complete URI with this form of encoding. 677 */ forUri(Writer out, String input)678 @Deprecated public static void forUri(Writer out, String input) 679 throws IOException 680 { 681 encode(Encoders.URI_ENCODER, out, input); 682 } 683 684 /** 685 * Performs percent-encoding for a component of a URI, such as a query 686 * parameter name or value, path or query-string. In particular this 687 * method insures that special characters in the component do not get 688 * interpreted as part of another component. 689 * 690 * <pre> 691 * <a href="http://www.owasp.org/<%=Encode.forUriComponent(...)%>?query#fragment"> 692 * 693 * <a href="/search?value=<%=Encode.forUriComponent(...)%>&order=1#top"> 694 * </pre> 695 * 696 * <b>Encoding Table</b> 697 * <p>The following characters are <i>not</i> encoded:</p> 698 * <pre> 699 * U+20: - . 
0 1 2 3 4 5 6 7 8 9 700 * U+40: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ 701 * U+60: a b c d e f g h i j k l m n o p q r s t u v w x y z ~ 702 * </pre> 703 * 704 * <b>Encoding Notes</b> 705 * <ul> 706 * 707 * <li>Unlike {@link #forUri(String)} this method is safe to be 708 * used in most containing contexts, including: HTML/XML, CSS, 709 * and JavaScript contexts.</li> 710 * 711 * <li>URL encoding is an encoding for bytes, not unicode. The 712 * input string is thus first encoded as a sequence of UTF-8 713 * byte. The bytes are then encoded as {@code %xx} where {@code 714 * xx} is the two-digit hexadecimal representation of the 715 * byte. (The implementation does this as one step for 716 * performance.)</li> 717 * 718 * <li>Surrogate pairs are first decoded to a Unicode code point 719 * before encoding as UTF-8.</li> 720 * 721 * <li>Invalid characters (e.g. partial or invalid surrogate 722 * pairs), are replaced with a hyphen ({@code -}) character.</li> 723 * 724 * </ul> 725 * 726 * @param input the input to encode 727 * @return the encoded result 728 */ forUriComponent(String input)729 public static String forUriComponent(String input) { 730 return encode(Encoders.URI_COMPONENT_ENCODER, input); 731 } 732 733 /** 734 * See {@link #forUriComponent(String)} for description of encoding. This 735 * version writes directly to a Writer without an intervening string. 736 * 737 * @param out where to write encoded output 738 * @param input the input string to encode 739 * @throws IOException if thrown by writer 740 */ forUriComponent(Writer out, String input)741 public static void forUriComponent(Writer out, String input) 742 throws IOException 743 { 744 encode(Encoders.URI_COMPONENT_ENCODER, out, input); 745 } 746 747 /** 748 * Encoder for XML and XHTML. See {@link #forHtml(String)} for a 749 * description of the encoding and context. 
750 * 751 * @see #forHtml(String) 752 * @param input the input to encode 753 * @return the encoded result 754 */ forXml(String input)755 public static String forXml(String input) { 756 return encode(Encoders.XML_ENCODER, input); 757 } 758 759 /** 760 * See {@link #forXml(String)} for description of encoding. This 761 * version writes directly to a Writer without an intervening string. 762 * 763 * @param out where to write encoded output 764 * @param input the input string to encode 765 * @throws IOException if thrown by writer 766 */ forXml(Writer out, String input)767 public static void forXml(Writer out, String input) 768 throws IOException 769 { 770 encode(Encoders.XML_ENCODER, out, input); 771 } 772 773 /** 774 * Encoder for XML and XHTML text content. See {@link 775 * #forHtmlContent(String)} for description of encoding and 776 * context. 777 * 778 * @see #forHtmlContent(String) 779 * @param input the input to encode 780 * @return the encoded result 781 */ forXmlContent(String input)782 public static String forXmlContent(String input) { 783 return encode(Encoders.XML_CONTENT_ENCODER, input); 784 } 785 786 /** 787 * See {@link #forXmlContent(String)} for description of encoding. This 788 * version writes directly to a Writer without an intervening string. 789 * 790 * @param out where to write encoded output 791 * @param input the input string to encode 792 * @throws IOException if thrown by writer 793 */ forXmlContent(Writer out, String input)794 public static void forXmlContent(Writer out, String input) 795 throws IOException 796 { 797 encode(Encoders.XML_CONTENT_ENCODER, out, input); 798 } 799 800 /** 801 * Encoder for XML and XHTML attribute content. See {@link 802 * #forHtmlAttribute(String)} for description of encoding and 803 * context. 
804 * 805 * @see #forHtmlAttribute(String) 806 * @param input the input to encode 807 * @return the encoded result 808 */ forXmlAttribute(String input)809 public static String forXmlAttribute(String input) { 810 return encode(Encoders.XML_ATTRIBUTE_ENCODER, input); 811 } 812 813 /** 814 * See {@link #forXmlAttribute(String)} for description of encoding. This 815 * version writes directly to a Writer without an intervening string. 816 * 817 * @param out where to write encoded output 818 * @param input the input string to encode 819 * @throws IOException if thrown by writer 820 */ forXmlAttribute(Writer out, String input)821 public static void forXmlAttribute(Writer out, String input) 822 throws IOException 823 { 824 encode(Encoders.XML_ATTRIBUTE_ENCODER, out, input); 825 } 826 827 /** 828 * Encoder for XML comments. <strong>NOT FOR USE WITH 829 * (X)HTML CONTEXTS.</strong> (X)HTML comments may be interpreted by 830 * browsers as something other than a comment, typically in vendor 831 * specific extensions (e.g. {@code <--if[IE]-->}). 832 * For (X)HTML it is recommend that unsafe content never be included 833 * in a comment. 834 * 835 * <p>The caller must provide the comment start and end sequences.</p> 836 * 837 * <p>This method replaces all invalid XML characters with spaces, 838 * and replaces the "--" sequence (which is invalid in XML comments) 839 * with "-~" (hyphen-tilde). 
<b>This encoding behavior may change 840 * in future releases.</b> If the comments need to be decoded, the 841 * caller will need to come up with their own encode/decode system.</p> 842 * 843 * <pre> 844 * out.println("<?xml version='1.0'?>"); 845 * out.println("<data>"); 846 * out.println("<!-- "+Encode.forXmlComment(comment)+" -->"); 847 * out.println("</data>"); 848 * </pre> 849 * 850 * @param input the input to encode 851 * @return the encoded result 852 */ forXmlComment(String input)853 public static String forXmlComment(String input) { 854 return encode(Encoders.XML_COMMENT_ENCODER, input); 855 } 856 857 /** 858 * See {@link #forXmlComment(String)} for description of encoding. This 859 * version writes directly to a Writer without an intervening string. 860 * 861 * @param out where to write encoded output 862 * @param input the input string to encode 863 * @throws IOException if thrown by writer 864 */ forXmlComment(Writer out, String input)865 public static void forXmlComment(Writer out, String input) 866 throws IOException 867 { 868 encode(Encoders.XML_COMMENT_ENCODER, out, input); 869 } 870 871 /** 872 * Encodes data for an XML CDATA section. On the chance that the input 873 * contains a terminating {@code "]]>"}, it will be replaced by 874 * {@code "]]>]]<![CDATA[>"}. 875 * As with all XML contexts, characters that are invalid according to the 876 * XML specification will be replaced by a space character. Caller must 877 * provide the CDATA section boundaries. 878 * 879 * <pre> 880 * <xml-data><![CDATA[<%=Encode.forCDATA(...)%>]]></xml-data> 881 * </pre> 882 * 883 * @param input the input to encode 884 * @return the encoded result 885 */ forCDATA(String input)886 public static String forCDATA(String input) { 887 return encode(Encoders.CDATA_ENCODER, input); 888 } 889 890 /** 891 * See {@link #forCDATA(String)} for description of encoding. This 892 * version writes directly to a Writer without an intervening string. 
893 * 894 * @param out where to write encoded output 895 * @param input the input string to encode 896 * @throws IOException if thrown by writer 897 */ forCDATA(Writer out, String input)898 public static void forCDATA(Writer out, String input) 899 throws IOException 900 { 901 encode(Encoders.CDATA_ENCODER, out, input); 902 } 903 904 /** 905 * Encodes for a Java string. This method will use "\b", "\t", "\r", "\f", 906 * "\n", "\"", "\'", "\\", octal and unicode escapes. Valid surrogate 907 * pairing is not checked. The caller must provide the enclosing quotation 908 * characters. This method is useful for when writing code generators and 909 * outputting debug messages. 910 * 911 * <pre> 912 * out.println("public class Hello {"); 913 * out.println(" public static void main(String[] args) {"); 914 * out.println(" System.out.println(\"" + Encode.forJava(message) + "\");"); 915 * out.println(" }"); 916 * out.println("}"); 917 * </pre> 918 * 919 * @param input the input to encode 920 * @return the input encoded for java strings. 921 */ forJava(String input)922 public static String forJava(String input) { 923 return encode(Encoders.JAVA_ENCODER, input); 924 } 925 926 /** 927 * See {@link #forJava(String)} for description of encoding. This 928 * version writes directly to a Writer without an intervening string. 929 * 930 * @param out where to write encoded output 931 * @param input the input string to encode 932 * @throws IOException if thrown by writer 933 */ forJava(Writer out, String input)934 public static void forJava(Writer out, String input) 935 throws IOException 936 { 937 encode(Encoders.JAVA_ENCODER, out, input); 938 } 939 940 /** 941 * <p>Encodes for a JavaScript string. It is safe for use in HTML 942 * script attributes (such as {@code onclick}), script 943 * blocks, JSON files, and JavaScript source. The caller MUST 944 * provide the surrounding quotation characters for the string. 
945 * Since this performs additional encoding so it can work in all 946 * of the JavaScript contexts listed, it may be slightly less 947 * efficient than using one of the methods targeted to a specific 948 * JavaScript context ({@link #forJavaScriptAttribute(String)}, 949 * {@link #forJavaScriptBlock}, {@link #forJavaScriptSource}). 950 * Unless you are interested in saving a few bytes of output or 951 * are writing a framework on top of this library, it is recommend 952 * that you use this method over the others.</p> 953 * 954 * <b>Example JSP Usage:</b> 955 * <pre> 956 * <button onclick="alert('<%=Encode.forJavaScript(data)%>');"> 957 * <script type="text/javascript"> 958 * var data = "<%=Encode.forJavaScript(data)%>"; 959 * </script> 960 * </pre> 961 * 962 * <table cellspacing="1" class="memberSummary" cellpadding="1" border="0"> 963 * <caption><b>Encoding Description</b></caption> 964 * <thead> 965 * <tr> 966 * <th align="left" colspan="2" class="colFirst">Input Character</th> 967 * <th align="left" class="colLast">Encoded Result</th> 968 * <th align="left" class="colLast">Notes</th> 969 * </tr> 970 * </thead> 971 * <tbody> 972 * <tr class="altColor"> 973 * <td class="colFirst">U+0008</td><td><i>BS</i></td> 974 * <td class="colLast"><code>\b</code></td> 975 * <td class="colLast">Backspace character</td> 976 * </tr> 977 * <tr class="rowColor"> 978 * <td class="colFirst">U+0009</td><td><i>HT</i></td> 979 * <td class="colLast"><code>\t</code></td> 980 * <td class="colLast">Horizontal tab character</td> 981 * </tr> 982 * <tr class="altColor"> 983 * <td class="colFirst">U+000A</td><td><i>LF</i></td> 984 * <td class="colLast"><code>\n</code></td> 985 * <td class="colLast">Line feed character</td> 986 * </tr> 987 * <tr class="rowColor"> 988 * <td class="colFirst">U+000C</td><td><i>FF</i></td> 989 * <td class="colLast"><code>\f</code></td> 990 * <td class="colLast">Form feed character</td> 991 * </tr> 992 * <tr class="altColor"> 993 * <td 
class="colFirst">U+000D</td><td><i>CR</i></td> 994 * <td class="colLast"><code>\r</code></td> 995 * <td class="colLast">Carriage return character</td> 996 * </tr> 997 * <tr class="rowColor"> 998 * <td class="colFirst">U+0022</td><td><code>"</code></td> 999 * <td class="colLast"><code>\x22</code></td> 1000 * <td class="colLast">The encoding <code>\"</code> is not used here because 1001 * it is not safe for use in HTML attributes. (In HTML 1002 * attributes, it would also be correct to use 1003 * "\&quot;".)</td> 1004 * </tr> 1005 * <tr class="altColor"> 1006 * <td class="colFirst">U+0026</td><td><code>&</code></td> 1007 * <td class="colLast"><code>\x26</code></td> 1008 * <td class="colLast">Ampersand character</td> 1009 * </tr> 1010 * <tr class="rowColor"> 1011 * <td class="colFirst">U+0027</td><td><code>'</code></td> 1012 * <td class="colLast"><code>\x27</code></td> 1013 * <td class="colLast">The encoding <code>\'</code> is not used here because 1014 * it is not safe for use in HTML attributes. 
(In HTML 1015 * attributes, it would also be correct to use 1016 * "\&#39;".)</td> 1017 * </tr> 1018 * <tr class="altColor"> 1019 * <td class="colFirst">U+002F</td><td><code>/</code></td> 1020 * <td class="colLast"><code>\/</code></td> 1021 * <td class="colLast">This encoding is used to avoid an input sequence 1022 * "</" from prematurely terminating a </script> 1023 * block.</td> 1024 * </tr> 1025 * <tr class="rowColor"> 1026 * <td class="colFirst">U+005C</td><td><code>\</code></td> 1027 * <td class="colLast"><code>\\</code></td> 1028 * <td class="colLast"></td> 1029 * </tr> 1030 * <tr class="altColor"> 1031 * <td class="colFirst" colspan="2">U+0000 to U+001F</td> 1032 * <td class="colLast"><code>\x##</code></td> 1033 * <td class="colLast">Hexadecimal encoding is used for characters in this 1034 * range that were not already mentioned in above.</td> 1035 * </tr> 1036 * </tbody> 1037 * </table> 1038 * 1039 * @param input the input string to encode 1040 * @return the input encoded for JavaScript 1041 * @see #forJavaScriptAttribute(String) 1042 * @see #forJavaScriptBlock(String) 1043 */ forJavaScript(String input)1044 public static String forJavaScript(String input) { 1045 return encode(Encoders.JAVASCRIPT_ENCODER, input); 1046 } 1047 1048 /** 1049 * See {@link #forJavaScript(String)} for description of encoding. This 1050 * version writes directly to a Writer without an intervening string. 1051 * 1052 * @param out where to write encoded output 1053 * @param input the input string to encode 1054 * @throws IOException if thrown by writer 1055 */ forJavaScript(Writer out, String input)1056 public static void forJavaScript(Writer out, String input) 1057 throws IOException 1058 { 1059 encode(Encoders.JAVASCRIPT_ENCODER, out, input); 1060 } 1061 1062 /** 1063 * <p>This method encodes for JavaScript strings contained within 1064 * HTML script attributes (such as {@code onclick}). It is 1065 * NOT safe for use in script blocks. 
The caller MUST provide the 1066 * surrounding quotation characters. This method performs the 1067 * same encode as {@link #forJavaScript(String)} with the 1068 * exception that <code>/</code> is not escaped.</p> 1069 * 1070 * <p><strong>Unless you are interested in saving a few bytes of 1071 * output or are writing a framework on top of this library, it is 1072 * recommend that you use {@link #forJavaScript(String)} over this 1073 * method.</strong></p> 1074 * 1075 * <b>Example JSP Usage:</b> 1076 * <pre> 1077 * <button onclick="alert('<%=Encode.forJavaScriptAttribute(data)%>');"> 1078 * </pre> 1079 * 1080 * @param input the input string to encode 1081 * @return the input encoded for JavaScript 1082 * @see #forJavaScript(String) 1083 * @see #forJavaScriptBlock(String) 1084 */ forJavaScriptAttribute(String input)1085 public static String forJavaScriptAttribute(String input) { 1086 return encode(Encoders.JAVASCRIPT_ATTRIBUTE_ENCODER, input); 1087 } 1088 1089 /** 1090 * See {@link #forJavaScriptAttribute(String)} for description of encoding. This 1091 * version writes directly to a Writer without an intervening string. 1092 * 1093 * @param out where to write encoded output 1094 * @param input the input string to encode 1095 * @throws IOException if thrown by writer 1096 */ forJavaScriptAttribute(Writer out, String input)1097 public static void forJavaScriptAttribute(Writer out, String input) 1098 throws IOException 1099 { 1100 encode(Encoders.JAVASCRIPT_ATTRIBUTE_ENCODER, out, input); 1101 } 1102 1103 /** 1104 * <p>This method encodes for JavaScript strings contained within 1105 * HTML script blocks. It is NOT safe for use in script 1106 * attributes (such as <code>onclick</code>). The caller must 1107 * provide the surrounding quotation characters. 
This method 1108 * performs the same encode as {@link #forJavaScript(String)} with 1109 * the exception that <code>"</code> and <code>'</code> are 1110 * encoded as <code>\"</code> and <code>\'</code> 1111 * respectively.</p> 1112 * 1113 * <p><strong>Unless you are interested in saving a few bytes of 1114 * output or are writing a framework on top of this library, it is 1115 * recommend that you use {@link #forJavaScript(String)} over this 1116 * method.</strong></p> 1117 * 1118 * <b>Example JSP Usage:</b> 1119 * <pre> 1120 * <script type="text/javascript"> 1121 * var data = "<%=Encode.forJavaScriptBlock(data)%>"; 1122 * </script> 1123 * </pre> 1124 * 1125 * @param input the input string to encode 1126 * @return the input encoded for JavaScript 1127 * @see #forJavaScript(String) 1128 * @see #forJavaScriptAttribute(String) 1129 */ forJavaScriptBlock(String input)1130 public static String forJavaScriptBlock(String input) { 1131 return encode(Encoders.JAVASCRIPT_BLOCK_ENCODER, input); 1132 } 1133 1134 /** 1135 * See {@link #forJavaScriptBlock(String)} for description of encoding. This 1136 * version writes directly to a Writer without an intervening string. 1137 * 1138 * @param out where to write encoded output 1139 * @param input the input string to encode 1140 * @throws IOException if thrown by writer 1141 */ forJavaScriptBlock(Writer out, String input)1142 public static void forJavaScriptBlock(Writer out, String input) 1143 throws IOException 1144 { 1145 encode(Encoders.JAVASCRIPT_BLOCK_ENCODER, out, input); 1146 } 1147 1148 /** 1149 * <p>This method encodes for JavaScript strings contained within 1150 * a JavaScript or JSON file. <strong>This method is NOT safe for 1151 * use in ANY context embedded in HTML.</strong> The caller must 1152 * provide the surrounding quotation characters. 
This method 1153 * performs the same encode as {@link #forJavaScript(String)} with 1154 * the exception that <code>/</code> and <code>&</code> are not 1155 * escaped and <code>"</code> and <code>'</code> are encoded as 1156 * <code>\"</code> and <code>\'</code> respectively.</p> 1157 * 1158 * <p><strong>Unless you are interested in saving a few bytes of 1159 * output or are writing a framework on top of this library, it is 1160 * recommend that you use {@link #forJavaScript(String)} over this 1161 * method.</strong></p> 1162 * 1163 * <b>Example JSP Usage:</b> 1164 * This example is serving up JavaScript source directly: 1165 * <pre> 1166 * <%@page contentType="text/javascript; charset=UTF-8"%> 1167 * var data = "<%=Encode.forJavaScriptSource(data)%>"; 1168 * </pre> 1169 * 1170 * This example is serving up JSON data (users of this use-case 1171 * are encouraged to read up on "JSON Hijacking"): 1172 * <pre> 1173 * <%@page contentType="application/json; charset=UTF-8"%> 1174 * <% myapp.jsonHijackingPreventionMeasure(); %> 1175 * {"data":"<%=Encode.forJavaScriptSource(data)%>"} 1176 * </pre> 1177 * 1178 * @param input the input string to encode 1179 * @return the input encoded for JavaScript 1180 * @see #forJavaScript(String) 1181 * @see #forJavaScriptAttribute(String) 1182 * @see #forJavaScriptBlock(String) 1183 */ forJavaScriptSource(String input)1184 public static String forJavaScriptSource(String input) { 1185 return encode(Encoders.JAVASCRIPT_SOURCE_ENCODER, input); 1186 } 1187 1188 /** 1189 * See {@link #forJavaScriptSource(String)} for description of encoding. This 1190 * version writes directly to a Writer without an intervening string. 
1191 * 1192 * @param out where to write encoded output 1193 * @param input the input string to encode 1194 * @throws IOException if thrown by writer 1195 */ forJavaScriptSource(Writer out, String input)1196 public static void forJavaScriptSource(Writer out, String input) 1197 throws IOException 1198 { 1199 encode(Encoders.JAVASCRIPT_SOURCE_ENCODER, out, input); 1200 } 1201 1202 // Additional? 1203 // MySQL 1204 // PostreSQL 1205 // Oracle 1206 // ... 1207 1208 /** 1209 * Core encoding loop shared by public methods. It first uses the 1210 * encoder to scan the input for characters that need encoding. If 1211 * no characters require encoding, the input string is returned. 1212 * Otherwise a buffer is used to encode the remainder 1213 * of the input. 1214 * 1215 * @param encoder the encoder to use 1216 * @param str the string to encode 1217 * @return the input string encoded with the provided encoder. 1218 */ encode(Encoder encoder, String str)1219 static String encode(Encoder encoder, String str) { 1220 if (str == null) { 1221 // consistent with String.valueOf(...) use "null" for null. 1222 str = "null"; 1223 } 1224 1225 // quick pass--see if we need to actually encode anything, if not 1226 // return the value unchanged. 1227 final int n = str.length(); 1228 int j = encoder.firstEncodedOffset(str, 0, n); 1229 1230 if (j == n) { 1231 return str; 1232 } 1233 1234 // otherwise, we need to encode. We use a buffer to avoid 1235 // excessive memory allocation for these calls. Note: this means that 1236 // an encoder implementation must NEVER call this method internally. 1237 return new Buffer().encode(encoder, str, j); 1238 } 1239 1240 /** 1241 * Core encoding loop shared by public methods. It first uses the 1242 * encoder to scan the input for characters that need encoding. If no 1243 * characters require encoding, the input string is written directly to 1244 * the writer. Otherwise a buffer is used to encode the 1245 * remainder of the input to the buffers. 
This version saves a wrapping 1246 * in an String. 1247 * 1248 * @param encoder the encoder to use 1249 * @param out the writer for the encoded output 1250 * @param str the string to encode 1251 * @throws IOException if thrown by the writer 1252 */ encode(Encoder encoder, Writer out, String str)1253 static void encode(Encoder encoder, Writer out, String str) 1254 throws IOException 1255 { 1256 if (str == null) { 1257 // consistent with String.valueOf(...) use "null" for null. 1258 str = "null"; 1259 } 1260 1261 // quick pass--see if we need to actually encode anything, if not 1262 // return the value unchanged. 1263 final int n = str.length(); 1264 int j = encoder.firstEncodedOffset(str, 0, n); 1265 1266 if (j == n) { 1267 out.write(str); 1268 return; 1269 } 1270 1271 // otherwise, we need to encode. We use a buffer to avoid 1272 // excessive memory allocation for these calls. Note: this means that 1273 // an encoder implementation must NEVER call this method internally. 1274 new Buffer().encode(encoder, out, str, j); 1275 } 1276 1277 /** 1278 * A buffer used for encoding. 1279 */ 1280 static class Buffer { 1281 /** 1282 * Input buffer size, used to extract a copy of the input 1283 * from a string and then send to the encoder. 1284 */ 1285 static final int INPUT_BUFFER_SIZE = 1024; 1286 /** 1287 * Output buffer size used to store the encoded output before 1288 * wrapping in a string. 1289 */ 1290 static final int OUTPUT_BUFFER_SIZE = INPUT_BUFFER_SIZE * 2; 1291 1292 /** 1293 * The input buffer. A heap-allocated, array-backed buffer of 1294 * INPUT_BUFFER_SIZE used for holding the characters to encode. 1295 */ 1296 final CharBuffer _input = CharBuffer.allocate(INPUT_BUFFER_SIZE); 1297 /** 1298 * The output buffer. A heap-allocated, array-backed buffer of 1299 * OUTPUT_BUFFER_SIZE used for holding the encoded output. 1300 */ 1301 final CharBuffer _output = CharBuffer.allocate(OUTPUT_BUFFER_SIZE); 1302 1303 /** 1304 * The core String encoding routine of this class. 
It uses the input 1305 * and output buffers to allow the encoders to work in reuse arrays. 1306 * When the input and/or output exceeds the capacity of the reused 1307 * arrays, temporary ones are allocated and then discarded after 1308 * the encode is done. 1309 * 1310 * @param encoder the encoder to use 1311 * @param str the string to encode 1312 * @param j the offset in {@code str} to start encoding 1313 * @return the encoded result 1314 */ encode(Encoder encoder, String str, int j)1315 String encode(Encoder encoder, String str, int j) { 1316 final int n = str.length(); 1317 final int remaining = n - j; 1318 1319 if (remaining <= INPUT_BUFFER_SIZE && j <= OUTPUT_BUFFER_SIZE) { 1320 // the remaining input to encode fits completely in the pre- 1321 // allocated buffer. 1322 str.getChars(0, j, _output.array(), 0); 1323 str.getChars(j, n, _input.array(), 0); 1324 1325 _input.limit(remaining).position(0); 1326 _output.clear().position(j); 1327 1328 CoderResult cr = encoder.encodeArrays(_input, _output, true); 1329 if (cr.isUnderflow()) { 1330 return new String(_output.array(), 0, _output.position()); 1331 } 1332 1333 // else, it's an overflow, we need to use a new output buffer 1334 // we'll allocate this buffer to be the exact size of the worst 1335 // case, guaranteeing a second overflow would not be possible. 
1336 CharBuffer tmp = CharBuffer.allocate(_output.position() 1337 + encoder.maxEncodedLength(_input.remaining())); 1338 1339 // copy over everything that has been encoded so far 1340 tmp.put(_output.array(), 0, _output.position()); 1341 1342 cr = encoder.encodeArrays(_input, tmp, true); 1343 if (cr.isOverflow()) { 1344 throw new AssertionError("unexpected result from encoder"); 1345 } 1346 1347 return new String(tmp.array(), 0, tmp.position()); 1348 } else { 1349 // the input it too large for our pre-allocated buffers 1350 // we'll use a temporary direct heap allocation 1351 final int m = j + encoder.maxEncodedLength(remaining); 1352 CharBuffer buffer = CharBuffer.allocate(m); 1353 str.getChars(0, j, buffer.array(), 0); 1354 str.getChars(j, n, buffer.array(), m - remaining); 1355 1356 CharBuffer input = buffer.duplicate(); 1357 input.limit(m).position(m-remaining); 1358 buffer.position(j); 1359 1360 CoderResult cr = encoder.encodeArrays(input, buffer, true); 1361 1362 if (cr.isOverflow()) { 1363 throw new AssertionError("unexpected result from encoder"); 1364 } 1365 1366 return new String(buffer.array(), 0, buffer.position()); 1367 } 1368 } 1369 1370 /** 1371 * The core Writer encoding routing of this class. It uses the 1372 * input and output buffers to allow the encoders to reuse arrays. 1373 * Unlike the string version, this method will never allocate more 1374 * memory, instead encoding is done in batches and flushed to the 1375 * writer in batches as large as possible. 1376 * 1377 * @param encoder the encoder to use 1378 * @param out where to write the encoded output 1379 * @param str the string to encode 1380 * @param j the position in the string at which the first character 1381 * needs encoding. 1382 * @throws IOException if thrown by the writer. 
1383 */ encode(Encoder encoder, Writer out, String str, int j)1384 void encode(Encoder encoder, Writer out, String str, int j) 1385 throws IOException 1386 { 1387 out.write(str, 0, j); 1388 1389 final int n = str.length(); 1390 1391 _input.clear(); 1392 _output.clear(); 1393 1394 final char[] inputArray = _input.array(); 1395 final char[] outputArray = _output.array(); 1396 1397 for (;;) { 1398 final int remainingInput = n - j; 1399 final int startPosition = _input.position(); 1400 final int batchSize = Math.min(remainingInput, _input.remaining()); 1401 str.getChars(j, j+batchSize, inputArray, startPosition); 1402 1403 _input.limit(startPosition + batchSize); 1404 1405 1406 for (;;) { 1407 CoderResult cr = encoder.encodeArrays( 1408 _input, _output, batchSize == remainingInput); 1409 1410 if (cr.isUnderflow()) { 1411 // get next input batch 1412 break; 1413 } 1414 1415 // else, output buffer full, flush and continue. 1416 out.write(outputArray, 0, _output.position()); 1417 _output.clear(); 1418 } 1419 1420 j += _input.position() - startPosition; 1421 1422 if (j == n) { 1423 // done. flush remaining output buffer and return 1424 out.write(outputArray, 0, _output.position()); 1425 return; 1426 } 1427 1428 _input.compact(); 1429 } 1430 } 1431 } 1432 } 1433