1 // Copyright (c) 2012 Jeff Ichnowski 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions 6 // are met: 7 // 8 // * Redistributions of source code must retain the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer. 11 // 12 // * Redistributions in binary form must reproduce the above 13 // copyright notice, this list of conditions and the following 14 // disclaimer in the documentation and/or other materials 15 // provided with the distribution. 16 // 17 // * Neither the name of the OWASP nor the names of its 18 // contributors may be used to endorse or promote products 19 // derived from this software without specific prior written 20 // permission. 21 // 22 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 27 // INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 28 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 29 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 31 // STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 33 // OF THE POSSIBILITY OF SUCH DAMAGE. 34 35 package org.owasp.encoder; 36 37 import java.io.IOException; 38 import java.io.Writer; 39 import java.nio.CharBuffer; 40 import java.nio.charset.CoderResult; 41 42 /** 43 * Encode -- fluent interface for contextual encoding. 
Example usage in a JSP: 44 * 45 * <pre> 46 * <input value="<%=Encode.forHtml(value)%>" /> 47 * </pre> 48 * 49 * <p>There are two versions of each contextual encoding method. The first 50 * takes a {@code String} argument and returns the encoded version as a 51 * {@code String}. The second version writes the encoded version directly 52 * to a {@code Writer}.</p> 53 * 54 * <p>Please make sure to read and understand the context that the method encodes 55 * for. Encoding for the incorrect context will likely lead to exposing a 56 * cross-site scripting vulnerability. Those new to XSS mitigation may find it 57 * useful to read the 58 * <a href="https://cheatsheetseries.owasp.org/cheatsheets/Cross_Site_Scripting_Prevention_Cheat_Sheet.html"> 59 * Cross Site Scripting Prevention Cheat Sheet</a> that is part of the OWASP Cheat Sheet series for background 60 * material. 61 * </p> 62 * 63 * @author Jeff Ichnowski 64 */ 65 public final class Encode { 66 /** No instances. */ Encode()67 private Encode() {} 68 69 /** 70 * <p>Encodes for (X)HTML text content and text attributes. Since 71 * this method encodes for both contexts, it may be slightly less 72 * efficient to use this method over the methods targeted towards 73 * the specific contexts ({@link #forHtmlAttribute(String)} and 74 * {@link #forHtmlContent(String)}). 
In general this method should
     * be preferred unless you are really concerned with saving a few
     * bytes or are writing a framework that utilizes this
     * package.</p>
     *
     * <b>Example JSP Usage</b>
     * <pre>
     *     &lt;div&gt;&lt;%=Encode.forHtml(unsafeData)%&gt;&lt;/div&gt;
     *
     *     &lt;input value="&lt;%=Encode.forHtml(unsafeData)%&gt;" /&gt;
     * </pre>
     *
     * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
     *   <caption><b>Encoding Table</b></caption>
     *   <thead>
     *     <tr>
     *       <th align="left" class="colFirst">Input</th>
     *       <th align="left" class="colLast">Result</th>
     *     </tr>
     *   </thead>
     *   <tbody>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code &}</td>
     *       <td class="colLast">{@code &amp;}</td>
     *     </tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code <}</td>
     *       <td class="colLast">{@code &lt;}</td>
     *     </tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code >}</td>
     *       <td class="colLast">{@code &gt;}</td>
     *     </tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code "}</td>
     *       <td class="colLast">{@code &#34;}</td>
     *     </tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code '}</td>
     *       <td class="colLast">{@code &#39;}</td>
     *     </tr>
     *   </tbody>
     * </table>
     *
     * <p><b>Additional Notes</b></p>
     * <ul>
     * <li>The encoding of the greater-than sign ({@code >}) is not
     * strictly required, but is included for maximum
     * compatibility.</li>
     *
     * <li>Numeric encoding is used for double-quote character ({@code
     * "}) as it is shorter than the also valid {@code &quot;}.</li>
     *
     * <li>Carriage return (U+0D), line-feed (U+0A), horizontal tab
     * (U+09) and space (U+20) are valid in quoted attributes and in
     * block in an unescaped form.</li>
     *
     * <li>Surrogate pairs are passed through only if valid.</li>
     *
     * <li>Characters that are not <a
     * href="http://www.w3.org/TR/REC-xml/#charsets">valid according
     * to the
XML specification</a> are replaced by a space character 136 * as they could lead to parsing errors. In particular only {@code #x9 137 * | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | 138 * [#x10000-#x10FFFF]} are considered valid.</li> 139 * </ul> 140 * 141 * @param input the data to encode 142 * @return the data encoded for html. 143 */ forHtml(String input)144 public static String forHtml(String input) { 145 return forXml(input); 146 } 147 148 /** 149 * See {@link #forHtml(String)} for description of encoding. This 150 * version writes directly to a Writer without an intervening string. 151 * 152 * @param out where to write encoded output 153 * @param input the input string to encode 154 * @throws IOException if thrown by writer 155 */ forHtml(Writer out, String input)156 public static void forHtml(Writer out, String input) throws IOException { 157 forXml(out, input); 158 } 159 160 /** 161 * <p>This method encodes for HTML text content. It does not escape 162 * quotation characters and is thus unsafe for use with 163 * HTML attributes. 
Use either {@link #forHtml(String)} or {@link #forHtmlAttribute(String)} for those
     * contexts.</p>
     *
     * <b>Example JSP Usage</b>
     * <pre>
     *     &lt;div&gt;&lt;%=Encode.forHtmlContent(unsafeData)%&gt;&lt;/div&gt;
     * </pre>
     * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
     *   <caption><b>Encoding Table</b></caption>
     *   <thead>
     *     <tr>
     *       <th align="left" class="colFirst">Input</th>
     *       <th align="left" class="colLast">Result</th>
     *     </tr>
     *   </thead>
     *   <tbody>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code &}</td>
     *       <td class="colLast">{@code &amp;}</td>
     *     </tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code <}</td>
     *       <td class="colLast">{@code &lt;}</td>
     *     </tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code >}</td>
     *       <td class="colLast">{@code &gt;}</td>
     *     </tr>
     *   </tbody>
     * </table>
     *
     * <p><b>Additional Notes</b></p>
     * <ul>
     * <li>Single-quote character ({@code '}) and double-quote
     * character ({@code "}) do not require encoding in HTML
     * blocks, unlike other HTML contexts.</li>
     *
     * <li>The encoding of the greater-than sign ({@code >}) is not
     * strictly required, but is included for maximum
     * compatibility.</li>
     *
     * <li>Carriage return (U+0D), line-feed (U+0A), horizontal tab
     * (U+09) and space (U+20) are valid in quoted attributes and in
     * block in an unescaped form.</li>
     *
     * <li>Surrogate pairs are passed through only if valid.</li>
     *
     * <li>Characters that are not <a
     * href="http://www.w3.org/TR/REC-xml/#charsets">valid according
     * to the XML specification</a> are replaced by a space character
     * as they could lead to parsing errors.
In particular only {@code #x9 214 * | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | 215 * [#x10000-#x10FFFF]} are considered valid.</li> 216 * </ul> 217 * 218 * @param input the input to encode 219 * @return the encoded result 220 */ forHtmlContent(String input)221 public static String forHtmlContent(String input) { 222 return forXmlContent(input); 223 } 224 225 /** 226 * See {@link #forHtmlContent(String)} for description of encoding. This 227 * version writes directly to a Writer without an intervening string. 228 * 229 * @param out where to write encoded output 230 * @param input the input string to encode 231 * @throws IOException if thrown by writer 232 */ forHtmlContent(Writer out, String input)233 public static void forHtmlContent(Writer out, String input) 234 throws IOException 235 { 236 forXmlContent(out, input); 237 } 238 239 /** 240 * <p>This method encodes for HTML text attributes. Do not use for JavaScript event attributes or for attributes 241 * that are interpreted as a URL. 
Instead use {@link #forJavaScript(String)} and {@link #forUriComponent(String)}
     * respectively for those.</p>
     *
     * <b>Example JSP Usage</b>
     * <pre>
     *     &lt;input value="&lt;%=Encode.forHtmlAttribute(unsafeData)%&gt;" /&gt;
     * </pre>
     *
     * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
     *   <caption><b>Encoding Table</b></caption>
     *   <thead>
     *     <tr>
     *       <th align="left" class="colFirst">Input</th>
     *       <th align="left" class="colLast">Result</th>
     *     </tr>
     *   </thead>
     *   <tbody>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code &}</td>
     *       <td class="colLast">{@code &amp;}</td>
     *     </tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code <}</td>
     *       <td class="colLast">{@code &lt;}</td>
     *     </tr>
     *     <tr class="altColor">
     *       <td class="colFirst">{@code "}</td>
     *       <td class="colLast">{@code &#34;}</td>
     *     </tr>
     *     <tr class="rowColor">
     *       <td class="colFirst">{@code '}</td>
     *       <td class="colLast">{@code &#39;}</td>
     *     </tr>
     *   </tbody>
     * </table>
     *
     * <p><b>Additional Notes</b></p>
     * <ul>
     * <li>Both the single-quote character ({@code '}) and the
     * double-quote character ({@code "}) are encoded so this is safe
     * for HTML attributes with either enclosing character.</li>
     *
     * <li>The encoding of the greater-than sign ({@code >}) is not
     * required for attributes.</li>
     *
     * <li>Numeric encoding is used for double-quote character ({@code
     * "}) as it is shorter than the also valid {@code &quot;}.</li>
     *
     * <li>Carriage return (U+0D), line-feed (U+0A), horizontal tab
     * (U+09) and space (U+20) are valid in quoted attributes and in
     * block in an unescaped form.</li>
     *
     * <li>Surrogate pairs are passed through only if valid.</li>
     *
     * <li>Characters that are not <a
     * href="http://www.w3.org/TR/REC-xml/#charsets">valid according
     * to the XML specification</a> are replaced by a space character
298 * as they could lead to parsing errors. In particular only {@code #x9 299 * | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | 300 * [#x10000-#x10FFFF]} are considered valid.</li> 301 * </ul> 302 * 303 * @param input the input to encode 304 * @return the encoded result 305 */ forHtmlAttribute(String input)306 public static String forHtmlAttribute(String input) { 307 return forXmlAttribute(input); 308 } 309 310 /** 311 * See {@link #forHtmlAttribute(String)} for description of encoding. This 312 * version writes directly to a Writer without an intervening string. 313 * 314 * @param out where to write encoded output 315 * @param input the input string to encode 316 * @throws IOException if thrown by writer 317 */ forHtmlAttribute(Writer out, String input)318 public static void forHtmlAttribute(Writer out, String input) 319 throws IOException 320 { 321 forXmlAttribute(out, input); 322 } 323 324 325 /** 326 * <p>Encodes for unquoted HTML attribute values. {@link 327 * #forHtml(String)} or {@link #forHtmlAttribute(String)} should 328 * usually be preferred over this method as quoted attributes are 329 * XHTML compliant.</p> 330 * 331 * <p>When using this method, the caller is not required to 332 * provide quotes around the attribute (since it is encoded for 333 * such context). The caller should make sure that the attribute 334 * value does not abut unsafe characters--and thus should usually 335 * err on the side of including a space character after the 336 * value.</p> 337 * 338 * <p>Use of this method is discouraged as quoted attributes are 339 * generally more compatible and safer. 
Also note that no
     * attempt has been made to optimize this encoding, though it is
     * still probably faster than other encoding libraries.</p>
     *
     * <b>Example JSP Usage</b>
     * <pre>
     *     &lt;input value=&lt;%=Encode.forHtmlUnquotedAttribute(input)%&gt; &gt;
     * </pre>
     *
     * <table border="0" class="memberSummary" summary="Shows the input and results of encoding">
     * <caption><b>Encoding Table</b></caption>
     * <thead>
     * <tr>
     *   <th align="left" class="colFirst">Input</th>
     *   <th align="left" class="colLast">Result</th>
     * </tr>
     * </thead>
     * <tbody>
     * <tr class="altColor">
     *   <td class="colFirst">{@code U+0009} (horizontal tab)</td>
     *   <td class="colLast">{@code &#9;}</td></tr>
     * <tr class="rowColor">
     *   <td class="colFirst">{@code U+000A} (line feed)</td>
     *   <td class="colLast">{@code &#10;}</td></tr>
     * <tr class="altColor">
     *   <td class="colFirst">{@code U+000C} (form feed)</td>
     *   <td class="colLast">{@code &#12;}</td></tr>
     * <tr class="rowColor">
     *   <td class="colFirst">{@code U+000D} (carriage return)</td>
     *   <td class="colLast">{@code &#13;}</td></tr>
     * <tr class="altColor">
     *   <td class="colFirst">{@code U+0020} (space)</td>
     *   <td class="colLast">{@code &#32;}</td></tr>
     * <tr class="rowColor">
     *   <td class="colFirst">{@code &}</td>
     *   <td class="colLast">{@code &amp;}</td></tr>
     * <tr class="altColor">
     *   <td class="colFirst">{@code <}</td>
     *   <td class="colLast">{@code &lt;}</td></tr>
     * <tr class="rowColor">
     *   <td class="colFirst">{@code >}</td>
     *   <td class="colLast">{@code &gt;}</td></tr>
     * <tr class="altColor">
     *   <td class="colFirst">{@code "}</td>
     *   <td class="colLast">{@code &#34;}</td></tr>
     * <tr class="rowColor">
     *   <td class="colFirst">{@code '}</td>
     *   <td class="colLast">{@code &#39;}</td></tr>
     * <tr class="altColor">
     *   <td class="colFirst">{@code /}</td>
     *   <td class="colLast">{@code &#47;}</td></tr>
     * <tr class="rowColor">
     *   <td class="colFirst">{@code =}</td>
     *   <td class="colLast">{@code &#61;}</td></tr>
     * <tr class="altColor">
     *   <td class="colFirst">{@code `}</td>
     *   <td class="colLast">{@code &#96;}</td></tr>
     * <tr class="rowColor">
     *   <td class="colFirst">{@code U+0085} (next line)</td>
     *   <td class="colLast">{@code &#133;}</td></tr>
     * <tr class="altColor">
     *   <td class="colFirst">{@code U+2028} (line separator)</td>
     *   <td class="colLast">{@code &#8232;}</td></tr>
     * <tr class="rowColor">
     *   <td class="colFirst">{@code U+2029} (paragraph separator)</td>
     *   <td class="colLast">{@code &#8233;
}</td></tr> 405 * </tbody> 406 * </table> 407 * 408 * <p><b>Additional Notes</b></p> 409 * <ul> 410 * <li>The following characters are <i>not</i> encoded: 411 * {@code 0-9, a-z, A-Z}, {@code !}, {@code 412 * #}, {@code $}, {@code %}, 413 * {@code (}, {@code )}, {@code 414 * *}, {@code +}, {@code ,}, 415 * {@code -}, {@code .}, {@code 416 * [}, {@code \}, {@code ]}, 417 * {@code ^}, {@code _}, {@code 418 * }}.</li> 419 * 420 * <li>Surrogate pairs are passed through only if valid. Invalid 421 * surrogate pairs are replaced by a hyphen (-).</li> 422 * 423 * <li>Characters in the C0 and C1 control blocks and not 424 * otherwise listed above are considered invalid and replaced by a 425 * hyphen (-) character.</li> 426 * 427 * <li>Unicode "non-characters" are replaced by hyphens (-).</li> 428 * </ul> 429 * 430 * @param input the attribute value to be encoded. 431 * @return the attribute value encoded for unquoted attribute 432 * context. 433 */ forHtmlUnquotedAttribute(String input)434 public static String forHtmlUnquotedAttribute(String input) { 435 return encode(Encoders.HTML_UNQUOTED_ATTRIBUTE_ENCODER, input); 436 } 437 438 /** 439 * See {@link #forHtmlUnquotedAttribute(String)} for description of encoding. This 440 * version writes directly to a Writer without an intervening string. 441 * 442 * @param out where to write encoded output 443 * @param input the input string to encode 444 * @throws IOException if thrown by writer 445 */ forHtmlUnquotedAttribute(Writer out, String input)446 public static void forHtmlUnquotedAttribute(Writer out, String input) 447 throws IOException 448 { 449 encode(Encoders.HTML_UNQUOTED_ATTRIBUTE_ENCODER, out, input); 450 } 451 452 453 // HTML comment encoding is not currently supported because 454 // of the number of vendor-specific sequences that would need 455 // to be handled (e.g. 
"<!--[if IE]-->" 456 457 // public static String forHtmlComment(String input) { 458 // // only alphanumeric and space, everything else becomes a space 459 // 460 // // HTML comment context needs to avoid browser extensions 461 // // such as "<!--[if IE]-->" 462 // throw new UnsupportedOperationException(); 463 // } 464 465 /** 466 * Encodes for CSS strings. The context must be surrounded by quotation 467 * characters. It is safe for use in both style blocks and attributes in 468 * HTML. 469 * 470 * <b>Example JSP Usage</b> 471 * <pre> 472 * <div style="background: url('<=Encode.forCssString(...)%>');"> 473 * 474 * <style type="text/css"> 475 * background: url('<%=Encode.forCssString(...)%>'); 476 * </style> 477 * </pre> 478 * 479 * <b>Encoding Notes</b> 480 * <ul> 481 * 482 * <li>The following characters are encoded using hexadecimal 483 * encodings: {@code U+0000} - {@code U+001f}, 484 * {@code "}, 485 * {@code '}, 486 * {@code \}, 487 * {@code <}, 488 * {@code &}, 489 * {@code /}, 490 * {@code >}, 491 * {@code U+007f}, 492 * line separator ({@code U+2028}), 493 * paragraph separator ({@code U+2029}).</li> 494 * 495 * <li>Any character requiring encoding is encoded as {@code \xxx} 496 * where {@code xxx} is the shortest hexadecimal representation of 497 * its Unicode code point (after decoding surrogate pairs if 498 * necessary). This encoding is never zero padded. Thus, for 499 * example, the tab character is encoded as {@code \9}, not {@code 500 * \0009}.</li> 501 * 502 * <li>The encoder looks ahead 1 character in the input and 503 * appends a space to an encoding to avoid the next character 504 * becoming part of the hexadecimal encoded sequence. Thus 505 * “{@code '1}” is encoded as “{@code \27 506 * 1}”, and not as “{@code \271}”. If a space 507 * is not necessary, it is not included, thus “{@code 508 * 'x}” is encoded as “{@code \27x}”, and not as 509 * “{@code \27 x}”.</li> 510 * 511 * <li>Surrogate pairs are passed through only if valid. 
Invalid 512 * surrogate pairs are replaced by an underscore (_).</li> 513 * 514 * <li>Unicode "non-characters" are replaced by underscores (_).</li> 515 * 516 * </ul> 517 * 518 * @param input the input to encode 519 * @return the encoded result 520 */ forCssString(String input)521 public static String forCssString(String input) { 522 // need to watch out for CSS expressions 523 return encode(Encoders.CSS_STRING_ENCODER, input); 524 } 525 526 /** 527 * See {@link #forCssString(String)} for description of encoding. This 528 * version writes directly to a Writer without an intervening string. 529 * 530 * @param out where to write encoded output 531 * @param input the input string to encode 532 * @throws IOException if thrown by writer 533 */ forCssString(Writer out, String input)534 public static void forCssString(Writer out, String input) 535 throws IOException 536 { 537 encode(Encoders.CSS_STRING_ENCODER, out, input); 538 } 539 540 /** 541 * Encodes for CSS URL contexts. The context must be surrounded by {@code "url("} 542 * and {@code ")"}. It is safe for use in both style blocks and attributes in HTML. 543 * Note: this does not do any checking on the quality or safety of the URL 544 * itself. The caller should insure that the URL is safe for embedding 545 * (e.g. input validation) by other means. 
546 * 547 * <b>Example JSP Usage</b> 548 * <pre> 549 * <div style="background:url(<=Encode.forCssUrl(...)%>);"> 550 * 551 * <style type="text/css"> 552 * background: url('<%=Encode.forCssUrl(...)%>'); 553 * </style> 554 * </pre> 555 * <b>Encoding Notes</b> 556 * <ul> 557 * 558 * <li>The following characters are encoded using hexadecimal 559 * encodings: {@code U+0000} - {@code U+001f}, 560 * {@code "}, 561 * {@code '}, 562 * {@code \}, 563 * {@code <}, 564 * {@code &}, 565 * {@code /}, 566 * {@code >}, 567 * {@code U+007f}, 568 * line separator ({@code U+2028}), 569 * paragraph separator ({@code U+2029}).</li> 570 * 571 * <li>Any character requiring encoding is encoded as {@code \xxx} 572 * where {@code xxx} is the shortest hexadecimal representation of 573 * its Unicode code point (after decoding surrogate pairs if 574 * necessary). This encoding is never zero padded. Thus, for 575 * example, the tab character is encoded as {@code \9}, not {@code 576 * \0009}.</li> 577 * 578 * <li>The encoder looks ahead 1 character in the input and 579 * appends a space to an encoding to avoid the next character 580 * becoming part of the hexadecimal encoded sequence. Thus 581 * “{@code '1}” is encoded as “{@code \27 582 * 1}”, and not as “{@code \271}”. If a space 583 * is not necessary, it is not included, thus “{@code 584 * 'x}” is encoded as “{@code \27x}”, and not as 585 * “{@code \27 x}”.</li> 586 * 587 * <li>Surrogate pairs are passed through only if valid. Invalid 588 * surrogate pairs are replaced by an underscore (_).</li> 589 * 590 * <li>Unicode "non-characters" are replaced by underscores (_).</li> 591 * 592 * </ul> 593 * 594 * @param input the input to encode 595 * @return the encoded result 596 */ forCssUrl(String input)597 public static String forCssUrl(String input) { 598 return encode(Encoders.CSS_URL_ENCODER, input); 599 } 600 601 /** 602 * See {@link #forCssUrl(String)} for description of encoding. 
This 603 * version writes directly to a Writer without an intervening string. 604 * 605 * @param out where to write encoded output 606 * @param input the input string to encode 607 * @throws IOException if thrown by writer 608 */ forCssUrl(Writer out, String input)609 public static void forCssUrl(Writer out, String input) 610 throws IOException 611 { 612 encode(Encoders.CSS_URL_ENCODER, out, input); 613 } 614 615 /** 616 * <p>Performs percent-encoding of a URL according to RFC 3986. The provided 617 * URL is assumed to a valid URL. This method does not do any checking on 618 * the quality or safety of the URL itself. In many applications it may 619 * be better to use {@link java.net.URI} instead. Note: this is a 620 * particularly dangerous context to put untrusted content in, as for 621 * example a "javascript:" URL provided by a malicious user would be 622 * "properly" escaped, and still execute.</p> 623 * 624 * <b>Encoding Table</b> 625 * <p>The following characters are <i>not</i> encoded:</p> 626 * <pre> 627 * U+20: ! # $ & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; = ? 628 * U+40: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] _ 629 * U+60: a b c d e f g h i j k l m n o p q r s t u v w x y z ~ 630 * </pre> 631 * 632 * <b>Encoding Notes</b> 633 * <ul> 634 * 635 * <li>The single-quote character({@code '}) <b>is not encoded</b>.</li> 636 * 637 * <li>This encoding is not intended to be used standalone. The 638 * output should be encoded to the target context. For example: 639 * {@code <a 640 * href="<%=Encode.forHtmlAttribute(Encode.forUri(uri))%>">...</a>}. 641 * (Note, the single-quote character ({@code '}) is not 642 * encoded.)</li> 643 * 644 * <li>URL encoding is an encoding for bytes, not unicode. The 645 * input string is thus first encoded as a sequence of UTF-8 646 * byte. The bytes are then encoded as {@code %xx} where {@code 647 * xx} is the two-digit hexadecimal representation of the 648 * byte. 
(The implementation does this as one step for 649 * performance.)</li> 650 * 651 * <li>Surrogate pairs are first decoded to a Unicode code point 652 * before encoding as UTF-8.</li> 653 * 654 * <li>Invalid characters (e.g. partial or invalid surrogate 655 * pairs), are replaced with a hyphen ({@code -}) character.</li> 656 * 657 * </ul> 658 * 659 * @param input the input to encode 660 * @return the encoded result 661 */ forUri(String input)662 @Deprecated public static String forUri(String input) { 663 return encode(Encoders.URI_ENCODER, input); 664 } 665 666 /** 667 * See {@link #forUri(String)} for description of encoding. This 668 * version writes directly to a Writer without an intervening string. 669 * 670 * @param out where to write encoded output 671 * @param input the input string to encode 672 * @throws IOException if thrown by writer 673 * 674 * @deprecated There is never a need to encode a complete URI with this form of encoding. 675 */ forUri(Writer out, String input)676 @Deprecated public static void forUri(Writer out, String input) 677 throws IOException 678 { 679 encode(Encoders.URI_ENCODER, out, input); 680 } 681 682 /** 683 * Performs percent-encoding for a component of a URI, such as a query 684 * parameter name or value, path or query-string. In particular this 685 * method insures that special characters in the component do not get 686 * interpreted as part of another component. 687 * 688 * <pre> 689 * <a href="http://www.owasp.org/<%=Encode.forUriComponent(...)%>?query#fragment"> 690 * 691 * <a href="/search?value=<%=Encode.forUriComponent(...)%>&order=1#top"> 692 * </pre> 693 * 694 * <b>Encoding Table</b> 695 * <p>The following characters are <i>not</i> encoded:</p> 696 * <pre> 697 * U+20: - . 
0 1 2 3 4 5 6 7 8 9 698 * U+40: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ 699 * U+60: a b c d e f g h i j k l m n o p q r s t u v w x y z ~ 700 * </pre> 701 * 702 * <b>Encoding Notes</b> 703 * <ul> 704 * 705 * <li>Unlike {@link #forUri(String)} this method is safe to be 706 * used in most containing contexts, including: HTML/XML, CSS, 707 * and JavaScript contexts.</li> 708 * 709 * <li>URL encoding is an encoding for bytes, not unicode. The 710 * input string is thus first encoded as a sequence of UTF-8 711 * byte. The bytes are then encoded as {@code %xx} where {@code 712 * xx} is the two-digit hexadecimal representation of the 713 * byte. (The implementation does this as one step for 714 * performance.)</li> 715 * 716 * <li>Surrogate pairs are first decoded to a Unicode code point 717 * before encoding as UTF-8.</li> 718 * 719 * <li>Invalid characters (e.g. partial or invalid surrogate 720 * pairs), are replaced with a hyphen ({@code -}) character.</li> 721 * 722 * </ul> 723 * 724 * @param input the input to encode 725 * @return the encoded result 726 */ forUriComponent(String input)727 public static String forUriComponent(String input) { 728 return encode(Encoders.URI_COMPONENT_ENCODER, input); 729 } 730 731 /** 732 * See {@link #forUriComponent(String)} for description of encoding. This 733 * version writes directly to a Writer without an intervening string. 734 * 735 * @param out where to write encoded output 736 * @param input the input string to encode 737 * @throws IOException if thrown by writer 738 */ forUriComponent(Writer out, String input)739 public static void forUriComponent(Writer out, String input) 740 throws IOException 741 { 742 encode(Encoders.URI_COMPONENT_ENCODER, out, input); 743 } 744 745 /** 746 * Encoder for XML and XHTML. See {@link #forHtml(String)} for a 747 * description of the encoding and context. 
748 * 749 * @see #forHtml(String) 750 * @param input the input to encode 751 * @return the encoded result 752 */ forXml(String input)753 public static String forXml(String input) { 754 return encode(Encoders.XML_ENCODER, input); 755 } 756 757 /** 758 * See {@link #forXml(String)} for description of encoding. This 759 * version writes directly to a Writer without an intervening string. 760 * 761 * @param out where to write encoded output 762 * @param input the input string to encode 763 * @throws IOException if thrown by writer 764 */ forXml(Writer out, String input)765 public static void forXml(Writer out, String input) 766 throws IOException 767 { 768 encode(Encoders.XML_ENCODER, out, input); 769 } 770 771 /** 772 * Encoder for XML and XHTML text content. See {@link 773 * #forHtmlContent(String)} for description of encoding and 774 * context. 775 * 776 * @see #forHtmlContent(String) 777 * @param input the input to encode 778 * @return the encoded result 779 */ forXmlContent(String input)780 public static String forXmlContent(String input) { 781 return encode(Encoders.XML_CONTENT_ENCODER, input); 782 } 783 784 /** 785 * See {@link #forXmlContent(String)} for description of encoding. This 786 * version writes directly to a Writer without an intervening string. 787 * 788 * @param out where to write encoded output 789 * @param input the input string to encode 790 * @throws IOException if thrown by writer 791 */ forXmlContent(Writer out, String input)792 public static void forXmlContent(Writer out, String input) 793 throws IOException 794 { 795 encode(Encoders.XML_CONTENT_ENCODER, out, input); 796 } 797 798 /** 799 * Encoder for XML and XHTML attribute content. See {@link 800 * #forHtmlAttribute(String)} for description of encoding and 801 * context. 
802 * 803 * @see #forHtmlAttribute(String) 804 * @param input the input to encode 805 * @return the encoded result 806 */ forXmlAttribute(String input)807 public static String forXmlAttribute(String input) { 808 return encode(Encoders.XML_ATTRIBUTE_ENCODER, input); 809 } 810 811 /** 812 * See {@link #forXmlAttribute(String)} for description of encoding. This 813 * version writes directly to a Writer without an intervening string. 814 * 815 * @param out where to write encoded output 816 * @param input the input string to encode 817 * @throws IOException if thrown by writer 818 */ forXmlAttribute(Writer out, String input)819 public static void forXmlAttribute(Writer out, String input) 820 throws IOException 821 { 822 encode(Encoders.XML_ATTRIBUTE_ENCODER, out, input); 823 } 824 825 /** 826 * Encoder for XML comments. <strong>NOT FOR USE WITH 827 * (X)HTML CONTEXTS.</strong> (X)HTML comments may be interpreted by 828 * browsers as something other than a comment, typically in vendor 829 * specific extensions (e.g. {@code <--if[IE]-->}). 830 * For (X)HTML it is recommend that unsafe content never be included 831 * in a comment. 832 * 833 * <p>The caller must provide the comment start and end sequences.</p> 834 * 835 * <p>This method replaces all invalid XML characters with spaces, 836 * and replaces the "--" sequence (which is invalid in XML comments) 837 * with "-~" (hyphen-tilde). 
<b>This encoding behavior may change 838 * in future releases.</b> If the comments need to be decoded, the 839 * caller will need to come up with their own encode/decode system.</p> 840 * 841 * <pre> 842 * out.println("<?xml version='1.0'?>"); 843 * out.println("<data>"); 844 * out.println("<!-- "+Encode.forXmlComment(comment)+" -->"); 845 * out.println("</data>"); 846 * </pre> 847 * 848 * @param input the input to encode 849 * @return the encoded result 850 */ forXmlComment(String input)851 public static String forXmlComment(String input) { 852 return encode(Encoders.XML_COMMENT_ENCODER, input); 853 } 854 855 /** 856 * See {@link #forXmlComment(String)} for description of encoding. This 857 * version writes directly to a Writer without an intervening string. 858 * 859 * @param out where to write encoded output 860 * @param input the input string to encode 861 * @throws IOException if thrown by writer 862 */ forXmlComment(Writer out, String input)863 public static void forXmlComment(Writer out, String input) 864 throws IOException 865 { 866 encode(Encoders.XML_COMMENT_ENCODER, out, input); 867 } 868 869 /** 870 * Encodes data for an XML CDATA section. On the chance that the input 871 * contains a terminating {@code "]]>"}, it will be replaced by 872 * {@code "]]>]]<![CDATA[>"}. 873 * As with all XML contexts, characters that are invalid according to the 874 * XML specification will be replaced by a space character. Caller must 875 * provide the CDATA section boundaries. 876 * 877 * <pre> 878 * <xml-data><![CDATA[<%=Encode.forCDATA(...)%>]]></xml-data> 879 * </pre> 880 * 881 * @param input the input to encode 882 * @return the encoded result 883 */ forCDATA(String input)884 public static String forCDATA(String input) { 885 return encode(Encoders.CDATA_ENCODER, input); 886 } 887 888 /** 889 * See {@link #forCDATA(String)} for description of encoding. This 890 * version writes directly to a Writer without an intervening string. 
891 * 892 * @param out where to write encoded output 893 * @param input the input string to encode 894 * @throws IOException if thrown by writer 895 */ forCDATA(Writer out, String input)896 public static void forCDATA(Writer out, String input) 897 throws IOException 898 { 899 encode(Encoders.CDATA_ENCODER, out, input); 900 } 901 902 /** 903 * Encodes for a Java string. This method will use "\b", "\t", "\r", "\f", 904 * "\n", "\"", "\'", "\\", octal and unicode escapes. Valid surrogate 905 * pairing is not checked. The caller must provide the enclosing quotation 906 * characters. This method is useful for when writing code generators and 907 * outputting debug messages. 908 * 909 * <pre> 910 * out.println("public class Hello {"); 911 * out.println(" public static void main(String[] args) {"); 912 * out.println(" System.out.println(\"" + Encode.forJava(message) + "\");"); 913 * out.println(" }"); 914 * out.println("}"); 915 * </pre> 916 * 917 * @param input the input to encode 918 * @return the input encoded for java strings. 919 */ forJava(String input)920 public static String forJava(String input) { 921 return encode(Encoders.JAVA_ENCODER, input); 922 } 923 924 /** 925 * See {@link #forJava(String)} for description of encoding. This 926 * version writes directly to a Writer without an intervening string. 927 * 928 * @param out where to write encoded output 929 * @param input the input string to encode 930 * @throws IOException if thrown by writer 931 */ forJava(Writer out, String input)932 public static void forJava(Writer out, String input) 933 throws IOException 934 { 935 encode(Encoders.JAVA_ENCODER, out, input); 936 } 937 938 /** 939 * <p>Encodes for a JavaScript string. It is safe for use in HTML 940 * script attributes (such as {@code onclick}), script 941 * blocks, JSON files, and JavaScript source. The caller MUST 942 * provide the surrounding quotation characters for the string. 
943 * Since this performs additional encoding so it can work in all 944 * of the JavaScript contexts listed, it may be slightly less 945 * efficient than using one of the methods targeted to a specific 946 * JavaScript context ({@link #forJavaScriptAttribute(String)}, 947 * {@link #forJavaScriptBlock}, {@link #forJavaScriptSource}). 948 * Unless you are interested in saving a few bytes of output or 949 * are writing a framework on top of this library, it is recommend 950 * that you use this method over the others.</p> 951 * 952 * <b>Example JSP Usage:</b> 953 * <pre> 954 * <button onclick="alert('<%=Encode.forJavaScript(data)%>');"> 955 * <script type="text/javascript"> 956 * var data = "<%=Encode.forJavaScript(data)%>"; 957 * </script> 958 * </pre> 959 * 960 * <table cellspacing="1" class="memberSummary" cellpadding="1" border="0"> 961 * <caption><b>Encoding Description</b></caption> 962 * <thead> 963 * <tr> 964 * <th align="left" colspan="2" class="colFirst">Input Character</th> 965 * <th align="left" class="colLast">Encoded Result</th> 966 * <th align="left" class="colLast">Notes</th> 967 * </tr> 968 * </thead> 969 * <tbody> 970 * <tr class="altColor"> 971 * <td class="colFirst">U+0008</td><td><i>BS</i></td> 972 * <td class="colLast"><code>\b</code></td> 973 * <td class="colLast">Backspace character</td> 974 * </tr> 975 * <tr class="rowColor"> 976 * <td class="colFirst">U+0009</td><td><i>HT</i></td> 977 * <td class="colLast"><code>\t</code></td> 978 * <td class="colLast">Horizontal tab character</td> 979 * </tr> 980 * <tr class="altColor"> 981 * <td class="colFirst">U+000A</td><td><i>LF</i></td> 982 * <td class="colLast"><code>\n</code></td> 983 * <td class="colLast">Line feed character</td> 984 * </tr> 985 * <tr class="rowColor"> 986 * <td class="colFirst">U+000C</td><td><i>FF</i></td> 987 * <td class="colLast"><code>\f</code></td> 988 * <td class="colLast">Form feed character</td> 989 * </tr> 990 * <tr class="altColor"> 991 * <td 
class="colFirst">U+000D</td><td><i>CR</i></td> 992 * <td class="colLast"><code>\r</code></td> 993 * <td class="colLast">Carriage return character</td> 994 * </tr> 995 * <tr class="rowColor"> 996 * <td class="colFirst">U+0022</td><td><code>"</code></td> 997 * <td class="colLast"><code>\x22</code></td> 998 * <td class="colLast">The encoding <code>\"</code> is not used here because 999 * it is not safe for use in HTML attributes. (In HTML 1000 * attributes, it would also be correct to use 1001 * "\&quot;".)</td> 1002 * </tr> 1003 * <tr class="altColor"> 1004 * <td class="colFirst">U+0026</td><td><code>&</code></td> 1005 * <td class="colLast"><code>\x26</code></td> 1006 * <td class="colLast">Ampersand character</td> 1007 * </tr> 1008 * <tr class="rowColor"> 1009 * <td class="colFirst">U+0027</td><td><code>'</code></td> 1010 * <td class="colLast"><code>\x27</code></td> 1011 * <td class="colLast">The encoding <code>\'</code> is not used here because 1012 * it is not safe for use in HTML attributes. 
(In HTML 1013 * attributes, it would also be correct to use 1014 * "\&#39;".)</td> 1015 * </tr> 1016 * <tr class="altColor"> 1017 * <td class="colFirst">U+002F</td><td><code>/</code></td> 1018 * <td class="colLast"><code>\/</code></td> 1019 * <td class="colLast">This encoding is used to avoid an input sequence 1020 * "</" from prematurely terminating a </script> 1021 * block.</td> 1022 * </tr> 1023 * <tr class="rowColor"> 1024 * <td class="colFirst">U+005C</td><td><code>\</code></td> 1025 * <td class="colLast"><code>\\</code></td> 1026 * <td class="colLast"></td> 1027 * </tr> 1028 * <tr class="altColor"> 1029 * <td class="colFirst" colspan="2">U+0000 to U+001F</td> 1030 * <td class="colLast"><code>\x##</code></td> 1031 * <td class="colLast">Hexadecimal encoding is used for characters in this 1032 * range that were not already mentioned in above.</td> 1033 * </tr> 1034 * </tbody> 1035 * </table> 1036 * 1037 * @param input the input string to encode 1038 * @return the input encoded for JavaScript 1039 * @see #forJavaScriptAttribute(String) 1040 * @see #forJavaScriptBlock(String) 1041 */ forJavaScript(String input)1042 public static String forJavaScript(String input) { 1043 return encode(Encoders.JAVASCRIPT_ENCODER, input); 1044 } 1045 1046 /** 1047 * See {@link #forJavaScript(String)} for description of encoding. This 1048 * version writes directly to a Writer without an intervening string. 1049 * 1050 * @param out where to write encoded output 1051 * @param input the input string to encode 1052 * @throws IOException if thrown by writer 1053 */ forJavaScript(Writer out, String input)1054 public static void forJavaScript(Writer out, String input) 1055 throws IOException 1056 { 1057 encode(Encoders.JAVASCRIPT_ENCODER, out, input); 1058 } 1059 1060 /** 1061 * <p>This method encodes for JavaScript strings contained within 1062 * HTML script attributes (such as {@code onclick}). It is 1063 * NOT safe for use in script blocks. 
The caller MUST provide the 1064 * surrounding quotation characters. This method performs the 1065 * same encode as {@link #forJavaScript(String)} with the 1066 * exception that <code>/</code> is not escaped.</p> 1067 * 1068 * <p><strong>Unless you are interested in saving a few bytes of 1069 * output or are writing a framework on top of this library, it is 1070 * recommend that you use {@link #forJavaScript(String)} over this 1071 * method.</strong></p> 1072 * 1073 * <b>Example JSP Usage:</b> 1074 * <pre> 1075 * <button onclick="alert('<%=Encode.forJavaScriptAttribute(data)%>');"> 1076 * </pre> 1077 * 1078 * @param input the input string to encode 1079 * @return the input encoded for JavaScript 1080 * @see #forJavaScript(String) 1081 * @see #forJavaScriptBlock(String) 1082 */ forJavaScriptAttribute(String input)1083 public static String forJavaScriptAttribute(String input) { 1084 return encode(Encoders.JAVASCRIPT_ATTRIBUTE_ENCODER, input); 1085 } 1086 1087 /** 1088 * See {@link #forJavaScriptAttribute(String)} for description of encoding. This 1089 * version writes directly to a Writer without an intervening string. 1090 * 1091 * @param out where to write encoded output 1092 * @param input the input string to encode 1093 * @throws IOException if thrown by writer 1094 */ forJavaScriptAttribute(Writer out, String input)1095 public static void forJavaScriptAttribute(Writer out, String input) 1096 throws IOException 1097 { 1098 encode(Encoders.JAVASCRIPT_ATTRIBUTE_ENCODER, out, input); 1099 } 1100 1101 /** 1102 * <p>This method encodes for JavaScript strings contained within 1103 * HTML script blocks. It is NOT safe for use in script 1104 * attributes (such as <code>onclick</code>). The caller must 1105 * provide the surrounding quotation characters. 
This method 1106 * performs the same encode as {@link #forJavaScript(String)} with 1107 * the exception that <code>"</code> and <code>'</code> are 1108 * encoded as <code>\"</code> and <code>\'</code> 1109 * respectively.</p> 1110 * 1111 * <p><strong>Unless you are interested in saving a few bytes of 1112 * output or are writing a framework on top of this library, it is 1113 * recommend that you use {@link #forJavaScript(String)} over this 1114 * method.</strong></p> 1115 * 1116 * <b>Example JSP Usage:</b> 1117 * <pre> 1118 * <script type="text/javascript"> 1119 * var data = "<%=Encode.forJavaScriptBlock(data)%>"; 1120 * </script> 1121 * </pre> 1122 * 1123 * @param input the input string to encode 1124 * @return the input encoded for JavaScript 1125 * @see #forJavaScript(String) 1126 * @see #forJavaScriptAttribute(String) 1127 */ forJavaScriptBlock(String input)1128 public static String forJavaScriptBlock(String input) { 1129 return encode(Encoders.JAVASCRIPT_BLOCK_ENCODER, input); 1130 } 1131 1132 /** 1133 * See {@link #forJavaScriptBlock(String)} for description of encoding. This 1134 * version writes directly to a Writer without an intervening string. 1135 * 1136 * @param out where to write encoded output 1137 * @param input the input string to encode 1138 * @throws IOException if thrown by writer 1139 */ forJavaScriptBlock(Writer out, String input)1140 public static void forJavaScriptBlock(Writer out, String input) 1141 throws IOException 1142 { 1143 encode(Encoders.JAVASCRIPT_BLOCK_ENCODER, out, input); 1144 } 1145 1146 /** 1147 * <p>This method encodes for JavaScript strings contained within 1148 * a JavaScript or JSON file. <strong>This method is NOT safe for 1149 * use in ANY context embedded in HTML.</strong> The caller must 1150 * provide the surrounding quotation characters. 
This method 1151 * performs the same encode as {@link #forJavaScript(String)} with 1152 * the exception that <code>/</code> and <code>&</code> are not 1153 * escaped and <code>"</code> and <code>'</code> are encoded as 1154 * <code>\"</code> and <code>\'</code> respectively.</p> 1155 * 1156 * <p><strong>Unless you are interested in saving a few bytes of 1157 * output or are writing a framework on top of this library, it is 1158 * recommend that you use {@link #forJavaScript(String)} over this 1159 * method.</strong></p> 1160 * 1161 * <b>Example JSP Usage:</b> 1162 * This example is serving up JavaScript source directly: 1163 * <pre> 1164 * <%@page contentType="text/javascript; charset=UTF-8"%> 1165 * var data = "<%=Encode.forJavaScriptSource(data)%>"; 1166 * </pre> 1167 * 1168 * This example is serving up JSON data (users of this use-case 1169 * are encouraged to read up on "JSON Hijacking"): 1170 * <pre> 1171 * <%@page contentType="application/json; charset=UTF-8"%> 1172 * <% myapp.jsonHijackingPreventionMeasure(); %> 1173 * {"data":"<%=Encode.forJavaScriptSource(data)%>"} 1174 * </pre> 1175 * 1176 * @param input the input string to encode 1177 * @return the input encoded for JavaScript 1178 * @see #forJavaScript(String) 1179 * @see #forJavaScriptAttribute(String) 1180 * @see #forJavaScriptBlock(String) 1181 */ forJavaScriptSource(String input)1182 public static String forJavaScriptSource(String input) { 1183 return encode(Encoders.JAVASCRIPT_SOURCE_ENCODER, input); 1184 } 1185 1186 /** 1187 * See {@link #forJavaScriptSource(String)} for description of encoding. This 1188 * version writes directly to a Writer without an intervening string. 
1189 * 1190 * @param out where to write encoded output 1191 * @param input the input string to encode 1192 * @throws IOException if thrown by writer 1193 */ forJavaScriptSource(Writer out, String input)1194 public static void forJavaScriptSource(Writer out, String input) 1195 throws IOException 1196 { 1197 encode(Encoders.JAVASCRIPT_SOURCE_ENCODER, out, input); 1198 } 1199 1200 // Additional? 1201 // MySQL 1202 // PostreSQL 1203 // Oracle 1204 // ... 1205 1206 /** 1207 * Core encoding loop shared by public methods. It first uses the 1208 * encoder to scan the input for characters that need encoding. If 1209 * no characters require encoding, the input string is returned. 1210 * Otherwise a buffer is used to encode the remainder 1211 * of the input. 1212 * 1213 * @param encoder the encoder to use 1214 * @param str the string to encode 1215 * @return the input string encoded with the provided encoder. 1216 */ encode(Encoder encoder, String str)1217 static String encode(Encoder encoder, String str) { 1218 if (str == null) { 1219 // consistent with String.valueOf(...) use "null" for null. 1220 str = "null"; 1221 } 1222 1223 // quick pass--see if we need to actually encode anything, if not 1224 // return the value unchanged. 1225 final int n = str.length(); 1226 int j = encoder.firstEncodedOffset(str, 0, n); 1227 1228 if (j == n) { 1229 return str; 1230 } 1231 1232 // otherwise, we need to encode. We use a buffer to avoid 1233 // excessive memory allocation for these calls. Note: this means that 1234 // an encoder implementation must NEVER call this method internally. 1235 return new Buffer().encode(encoder, str, j); 1236 } 1237 1238 /** 1239 * Core encoding loop shared by public methods. It first uses the 1240 * encoder to scan the input for characters that need encoding. If no 1241 * characters require encoding, the input string is written directly to 1242 * the writer. Otherwise a buffer is used to encode the 1243 * remainder of the input to the buffers. 
This version saves a wrapping 1244 * in an String. 1245 * 1246 * @param encoder the encoder to use 1247 * @param out the writer for the encoded output 1248 * @param str the string to encode 1249 * @throws IOException if thrown by the writer 1250 */ encode(Encoder encoder, Writer out, String str)1251 static void encode(Encoder encoder, Writer out, String str) 1252 throws IOException 1253 { 1254 if (str == null) { 1255 // consistent with String.valueOf(...) use "null" for null. 1256 str = "null"; 1257 } 1258 1259 // quick pass--see if we need to actually encode anything, if not 1260 // return the value unchanged. 1261 final int n = str.length(); 1262 int j = encoder.firstEncodedOffset(str, 0, n); 1263 1264 if (j == n) { 1265 out.write(str); 1266 return; 1267 } 1268 1269 // otherwise, we need to encode. We use a buffer to avoid 1270 // excessive memory allocation for these calls. Note: this means that 1271 // an encoder implementation must NEVER call this method internally. 1272 new Buffer().encode(encoder, out, str, j); 1273 } 1274 1275 /** 1276 * A buffer used for encoding. 1277 */ 1278 static class Buffer { 1279 /** 1280 * Input buffer size, used to extract a copy of the input 1281 * from a string and then send to the encoder. 1282 */ 1283 static final int INPUT_BUFFER_SIZE = 1024; 1284 /** 1285 * Output buffer size used to store the encoded output before 1286 * wrapping in a string. 1287 */ 1288 static final int OUTPUT_BUFFER_SIZE = INPUT_BUFFER_SIZE * 2; 1289 1290 /** 1291 * The input buffer. A heap-allocated, array-backed buffer of 1292 * INPUT_BUFFER_SIZE used for holding the characters to encode. 1293 */ 1294 final CharBuffer _input = CharBuffer.allocate(INPUT_BUFFER_SIZE); 1295 /** 1296 * The output buffer. A heap-allocated, array-backed buffer of 1297 * OUTPUT_BUFFER_SIZE used for holding the encoded output. 1298 */ 1299 final CharBuffer _output = CharBuffer.allocate(OUTPUT_BUFFER_SIZE); 1300 1301 /** 1302 * The core String encoding routine of this class. 
It uses the input 1303 * and output buffers to allow the encoders to work in reuse arrays. 1304 * When the input and/or output exceeds the capacity of the reused 1305 * arrays, temporary ones are allocated and then discarded after 1306 * the encode is done. 1307 * 1308 * @param encoder the encoder to use 1309 * @param str the string to encode 1310 * @param j the offset in {@code str} to start encoding 1311 * @return the encoded result 1312 */ encode(Encoder encoder, String str, int j)1313 String encode(Encoder encoder, String str, int j) { 1314 final int n = str.length(); 1315 final int remaining = n - j; 1316 1317 if (remaining <= INPUT_BUFFER_SIZE && j <= OUTPUT_BUFFER_SIZE) { 1318 // the remaining input to encode fits completely in the pre- 1319 // allocated buffer. 1320 str.getChars(0, j, _output.array(), 0); 1321 str.getChars(j, n, _input.array(), 0); 1322 1323 _input.limit(remaining).position(0); 1324 _output.clear().position(j); 1325 1326 CoderResult cr = encoder.encodeArrays(_input, _output, true); 1327 if (cr.isUnderflow()) { 1328 return new String(_output.array(), 0, _output.position()); 1329 } 1330 1331 // else, it's an overflow, we need to use a new output buffer 1332 // we'll allocate this buffer to be the exact size of the worst 1333 // case, guaranteeing a second overflow would not be possible. 
1334 CharBuffer tmp = CharBuffer.allocate(_output.position() 1335 + encoder.maxEncodedLength(_input.remaining())); 1336 1337 // copy over everything that has been encoded so far 1338 tmp.put(_output.array(), 0, _output.position()); 1339 1340 cr = encoder.encodeArrays(_input, tmp, true); 1341 if (cr.isOverflow()) { 1342 throw new AssertionError("unexpected result from encoder"); 1343 } 1344 1345 return new String(tmp.array(), 0, tmp.position()); 1346 } else { 1347 // the input it too large for our pre-allocated buffers 1348 // we'll use a temporary direct heap allocation 1349 final int m = j + encoder.maxEncodedLength(remaining); 1350 CharBuffer buffer = CharBuffer.allocate(m); 1351 str.getChars(0, j, buffer.array(), 0); 1352 str.getChars(j, n, buffer.array(), m - remaining); 1353 1354 CharBuffer input = buffer.duplicate(); 1355 input.limit(m).position(m-remaining); 1356 buffer.position(j); 1357 1358 CoderResult cr = encoder.encodeArrays(input, buffer, true); 1359 1360 if (cr.isOverflow()) { 1361 throw new AssertionError("unexpected result from encoder"); 1362 } 1363 1364 return new String(buffer.array(), 0, buffer.position()); 1365 } 1366 } 1367 1368 /** 1369 * The core Writer encoding routing of this class. It uses the 1370 * input and output buffers to allow the encoders to reuse arrays. 1371 * Unlike the string version, this method will never allocate more 1372 * memory, instead encoding is done in batches and flushed to the 1373 * writer in batches as large as possible. 1374 * 1375 * @param encoder the encoder to use 1376 * @param out where to write the encoded output 1377 * @param str the string to encode 1378 * @param j the position in the string at which the first character 1379 * needs encoding. 1380 * @throws IOException if thrown by the writer. 
1381 */ encode(Encoder encoder, Writer out, String str, int j)1382 void encode(Encoder encoder, Writer out, String str, int j) 1383 throws IOException 1384 { 1385 out.write(str, 0, j); 1386 1387 final int n = str.length(); 1388 1389 _input.clear(); 1390 _output.clear(); 1391 1392 final char[] inputArray = _input.array(); 1393 final char[] outputArray = _output.array(); 1394 1395 for (;;) { 1396 final int remainingInput = n - j; 1397 final int startPosition = _input.position(); 1398 final int batchSize = Math.min(remainingInput, _input.remaining()); 1399 str.getChars(j, j+batchSize, inputArray, startPosition); 1400 1401 _input.limit(startPosition + batchSize); 1402 1403 1404 for (;;) { 1405 CoderResult cr = encoder.encodeArrays( 1406 _input, _output, batchSize == remainingInput); 1407 1408 if (cr.isUnderflow()) { 1409 // get next input batch 1410 break; 1411 } 1412 1413 // else, output buffer full, flush and continue. 1414 out.write(outputArray, 0, _output.position()); 1415 _output.clear(); 1416 } 1417 1418 j += _input.position() - startPosition; 1419 1420 if (j == n) { 1421 // done. flush remaining output buffer and return 1422 out.write(outputArray, 0, _output.position()); 1423 return; 1424 } 1425 1426 _input.compact(); 1427 } 1428 } 1429 } 1430 } 1431