// Copyright (c) 2012 Jeff Ichnowski
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
//     * Redistributions of source code must retain the above
//       copyright notice, this list of conditions and the following
//       disclaimer.
//
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials
//       provided with the distribution.
//
//     * Neither the name of the OWASP nor the names of its
//       contributors may be used to endorse or promote products
//       derived from this software without specific prior written
//       permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
// OF THE POSSIBILITY OF SUCH DAMAGE.
package org.owasp.encoder;

import java.nio.CharBuffer;
import java.nio.charset.CoderResult;

/**
 * <p>
 * HTMLEncoder -- an encoder for HTML contexts. Currently most HTML-based
 * contexts are properly handled by {@link XMLEncoder}. The remaining
 * HTML-specific context of "unquoted attributes" could not be added to the
 * XMLEncoder without slowing it down. This class implements that remaining
 * context: <strong>unquoted attribute values</strong>.</p>
 *
 * <p>
 * Note: because this context is likely small strings, and hopefully rarely
 * used, no effort was put into optimizing this encoder.</p>
 *
 * <p>
 * Summary of the mapping implemented below: printable ASCII other than the
 * characters listed next is copied verbatim; tab, CR, LF, FF, space, NEL,
 * {@code "}, {@code '}, {@code /}, {@code =}, {@code `}, LINE SEPARATOR and
 * PARAGRAPH SEPARATOR become decimal numeric character references;
 * {@code &}, {@code <} and {@code >} become named character references;
 * control characters, unpaired surrogates and non-characters are replaced
 * by {@code '-'}.</p>
 *
 * @author Jeff Ichnowski
 */
class HTMLEncoder extends Encoder {

    /**
     * Number of characters in the encoding prefix and suffix when using decimal
     * numeric encodings of the form "&amp;#...;" (that is: ampersand, hash
     * sign and the trailing semicolon).
     */
    private static final int ENCODE_AFFIX_CHAR_COUNT = 3;

    /**
     * Encoding for '\t': the numeric character reference {@code &#9;}.
     */
    private static final char[] TAB = "&#9;".toCharArray();
    /**
     * Encoding for '&amp;': the named character reference {@code &amp;}.
     */
    private static final char[] AMP = "&amp;".toCharArray();
    /**
     * Encoding for '&lt;': the named character reference {@code &lt;}.
     */
    private static final char[] LT = "&lt;".toCharArray();
    /**
     * Encoding for '&gt;': the named character reference {@code &gt;}.
     */
    private static final char[] GT = "&gt;".toCharArray();

    // The large table-switch implementation used here is fast to
    // implement but slower at runtime than tuned-for-expected-input
    // encoders that use selective if/else's.  Look at the results of
    // BenchmarkTest to see the difference.  See note in javadoc as to
    // reasoning.
    // On Core i7 (Sandybridge)
    // Baseline is 371.401009 ns/op
    // Benchmarked Encode.forXml: 324.219992 ns/op (-12.70% on baseline)
    // Benchmarked Encode.forHtmlUnquotedAttribute: 821.583263 ns/op (+121.21% on baseline)

    /**
     * Returns an upper bound on the number of output characters produced for
     * {@code n} input characters.
     *
     * @param n the number of input characters
     * @return {@code n} times the length of the longest single-character
     * encoding (7)
     */
    @Override
    int maxEncodedLength(int n) {
        // The longest encoding of a single input char is the 7-character
        // decimal reference used for the line separator and paragraph
        // separator, i.e. "&#8232;" or "&#8233;": 3 affix characters plus
        // 4 digits.
        return n * (ENCODE_AFFIX_CHAR_COUNT + 4);
    }

    /**
     * Scans {@code input} for the first character that cannot be copied to
     * the output unchanged.
     *
     * @param input the string to scan
     * @param off the index of the first character to examine
     * @param len the number of characters to examine
     * @return the index of the first character that requires encoding or
     * replacement, or {@code off + len} when every character in the range is
     * safe as-is
     */
    @Override
    int firstEncodedOffset(String input, int off, int len) {
        final int n = off + len;
        for (int i = off; i < n; ++i) {
            final char ch = input.charAt(i);

            switch (ch) {
            // characters that encodeArrays rewrites as character references
            case '\t':
            case '\r':
            case '\f':
            case '\n':
            case ' ':
            case Unicode.NEL:
            case '\"':
            case '\'':
            case '/':
            case '=':
            case '`':
            case '&':
            case '<':
            case '>':
                return i;

            case '!':
            case '#':
            case '$':
            case '%':
            case '(':
            case ')':
            case '*':
            case '+':
            case ',':
            case '-':
            case '.':

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            case ':':
            case ';':
            case '?':
            case '@':

            case 'A':
            case 'B':
            case 'C':
            case 'D':
            case 'E':
            case 'F':
            case 'G':
            case 'H':
            case 'I':
            case 'J':
            case 'K':
            case 'L':
            case 'M':
            case 'N':
            case 'O':
            case 'P':
            case 'Q':
            case 'R':
            case 'S':
            case 'T':
            case 'U':
            case 'V':
            case 'W':
            case 'X':
            case 'Y':
            case 'Z':

            case '[':
            case '\\':
            case ']':
            case '^':
            case '_':

            case 'a':
            case 'b':
            case 'c':
            case 'd':
            case 'e':
            case 'f':
            case 'g':
            case 'h':
            case 'i':
            case 'j':
            case 'k':
            case 'l':
            case 'm':
            case 'n':
            case 'o':
            case 'p':
            case 'q':
            case 'r':
            case 's':
            case 't':
            case 'u':
            case 'v':
            case 'w':
            case 'x':
            case 'y':
            case 'z':

            case '{':
            case '|':
            case '}':
            case '~':
                break; // valid

            default:

                if (Character.isHighSurrogate(ch)) {
                    if (i + 1 < n) {
                        if (Character.isLowSurrogate(input.charAt(i + 1))) {
                            int cp = Character.toCodePoint(ch, input.charAt(i + 1));
                            if (Unicode.isNonCharacter(cp)) {
                                return i;
                            } else {
                                // well-formed pair: skip the low surrogate too
                                ++i;
                            }
                            break;
                        }
                        // high surrogate followed by something else: falls
                        // through to the range check below, which rejects it
                    } else {
                        // high surrogate is the last char of the range
                        return i;
                    }
                }

                if (ch <= Unicode.MAX_C1_CTRL_CHAR
                    || Character.MIN_SURROGATE <= ch && ch <= Character.MAX_SURROGATE
                    || ch > '\ufffd'
                    || ('\ufdd0' <= ch && ch <= '\ufdef')
                    || ch == Unicode.LINE_SEPARATOR || ch == Unicode.PARAGRAPH_SEPARATOR)
                {
                    return i;
                }
            }
        }
        return n;
    }

    /**
     * Appends a source array verbatim to the output array. Caller must ensure
     * there is enough space in the array for the output.
     *
     * @param src the characters to copy
     * @param out the output buffer
     * @param j the offset where to write in the output buffer
     * @return {@code j + src.length}
     */
    static int append(char[] src, char[] out, int j) {
        System.arraycopy(src, 0, out, j, src.length);
        return j + src.length;
    }

    /**
     * Appends the numerically encoded version of {@code codePoint} to the
     * output buffer, in the decimal form "&amp;#...;". Caller must ensure
     * there is enough space for the output.
     *
     * <p>
     * At most four decimal digits are written, so only values in the range
     * 0..9999 are encoded correctly; the high-order digits of larger values
     * would be dropped. Every call site in this class passes a value within
     * that range.</p>
     *
     * @param codePoint the character to encode
     * @param out the output buffer
     * @param j the offset where to write in the output buffer
     * @return {@code j} + the encoded length.
     */
    static int encode(int codePoint, char[] out, int j) {
        out[j++] = '&';
        out[j++] = '#';
        // emit digits most-significant first, skipping leading zeros
        if (codePoint >= 1000) {
            out[j++] = (char) (codePoint / 1000 % 10 + '0');
        }
        if (codePoint >= 100) {
            out[j++] = (char) (codePoint / 100 % 10 + '0');
        }
        if (codePoint >= 10) {
            out[j++] = (char) (codePoint / 10 % 10 + '0');
        }
        out[j++] = (char) (codePoint % 10 + '0');
        out[j++] = ';';
        return j;
    }

    //CSOFF: MethodLength
    /**
     * Encodes characters from the backing array of {@code input} into the
     * backing array of {@code output}.
     *
     * <p>
     * Before anything is written for an input character, the remaining output
     * space is checked; if the encoding does not fit, the method returns via
     * {@code overflow(...)} with the index of that (unconsumed) character.
     * When the input is exhausted, or ends in a high surrogate while
     * {@code endOfInput} is {@code false}, it returns via
     * {@code underflow(...)}. NOTE(review): {@code overflow} and
     * {@code underflow} are inherited and not visible here; presumably they
     * store the positions back into the buffers and return the matching
     * {@link CoderResult} -- confirm against {@code Encoder}.</p>
     *
     * @param input the input buffer; must be array-backed
     * @param output the output buffer; must be array-backed
     * @param endOfInput {@code true} if no further input will follow, in
     * which case a trailing unpaired high surrogate is treated as invalid
     * instead of being left for the next call
     * @return the result of {@code overflow(...)} or {@code underflow(...)}
     */
    @Override
    CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) {
        final char[] in = input.array();
        final char[] out = output.array();
        int i = input.arrayOffset() + input.position();
        final int n = input.arrayOffset() + input.limit();
        int j = output.arrayOffset() + output.position();
        final int m = output.arrayOffset() + output.limit();

    charLoop:
        for (; i < n; ++i) {
            final char ch = in[i];

            // gigantic switch, hopefully compiled to a tableswitch.
            // this approach appears to be slower than the if/else
            // approach used in the other encoders.  Perhaps an artifact
            // of the CPU's branch predictor, or possible additional
            // overhead of range checking, or having the entire table
            // available to the cache.  If time allows, it would be
            // interesting to find out.
            switch (ch) {
            case '\t':
                if (j + TAB.length > m) {
                    return overflow(input, i, output, j);
                }
                j = append(TAB, out, j);
                break;

            // two-digit decimal references: 3 affix chars + 2 digits
            case '\r':
            case '\n':
            case '\f':
            case ' ':
            case '\"':
            case '\'':
            case '/':
            case '=':
            case '`':
                if (ENCODE_AFFIX_CHAR_COUNT + 2 + j > m) {
                    return overflow(input, i, output, j);
                }
                j = encode(ch, out, j);
                break;

            // three-digit decimal reference: 3 affix chars + 3 digits
            case Unicode.NEL:
                if (ENCODE_AFFIX_CHAR_COUNT + 3 + j > m) {
                    return overflow(input, i, output, j);
                }
                j = encode(ch, out, j);
                break;

            case '&':
                if (j + AMP.length > m) {
                    return overflow(input, i, output, j);
                }
                j = append(AMP, out, j);
                break;

            case '<':
                if (j + LT.length > m) {
                    return overflow(input, i, output, j);
                }
                j = append(LT, out, j);
                break;

            case '>':
                if (j + GT.length > m) {
                    return overflow(input, i, output, j);
                }
                j = append(GT, out, j);
                break;

            case '!':
            case '#':
            case '$':
            case '%':
            case '(':
            case ')':
            case '*':
            case '+':
            case ',':
            case '-':
            case '.':

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            case ':':
            case ';':
            case '?':
            case '@':

            case 'A':
            case 'B':
            case 'C':
            case 'D':
            case 'E':
            case 'F':
            case 'G':
            case 'H':
            case 'I':
            case 'J':
            case 'K':
            case 'L':
            case 'M':
            case 'N':
            case 'O':
            case 'P':
            case 'Q':
            case 'R':
            case 'S':
            case 'T':
            case 'U':
            case 'V':
            case 'W':
            case 'X':
            case 'Y':
            case 'Z':

            case '[':
            case '\\':
            case ']':
            case '^':
            case '_':

            case 'a':
            case 'b':
            case 'c':
            case 'd':
            case 'e':
            case 'f':
            case 'g':
            case 'h':
            case 'i':
            case 'j':
            case 'k':
            case 'l':
            case 'm':
            case 'n':
            case 'o':
            case 'p':
            case 'q':
            case 'r':
            case 's':
            case 't':
            case 'u':
            case 'v':
            case 'w':
            case 'x':
            case 'y':
            case 'z':
            case '{':
            case '|':
            case '}':
            case '~':
                // safe printable ASCII: copied unchanged
                if (j >= m) {
                    return overflow(input, i, output, j);
                }
                out[j++] = ch;
                break;
            default:

                if (Character.isHighSurrogate(ch)) {
                    if (i + 1 < n) {
                        if (Character.isLowSurrogate(in[i + 1])) {
                            int cp = Character.toCodePoint(ch, in[i + 1]);
                            if (Unicode.isNonCharacter(cp)) {
                                // the whole pair collapses to a single '-'
                                if (j >= m) {
                                    return overflow(input, i, output, j);
                                }
                                out[j++] = '-';
                                ++i;
                            } else {
                                // need room for both halves of the pair
                                if (j + 1 >= m) {
                                    return overflow(input, i, output, j);
                                }
                                out[j++] = ch;
                                out[j++] = in[++i];
                            }
                            break;
                        }
                        // high surrogate not followed by a low surrogate:
                        // falls through and is replaced as invalid below
                    } else if (!endOfInput) {
                        // the matching low surrogate may arrive with the
                        // next chunk; stop here without consuming ch
                        break charLoop;
                    }
                }

                if (j >= m) {
                    return overflow(input, i, output, j);
                }

                if (ch <= Unicode.MAX_C1_CTRL_CHAR
                    || Character.MIN_SURROGATE <= ch && ch <= Character.MAX_SURROGATE
                    || ch > '\ufffd'
                    || ('\ufdd0' <= ch && ch <= '\ufdef'))
                {
                    // invalid
                    out[j++] = '-';
                } else if (ch == Unicode.LINE_SEPARATOR || ch == Unicode.PARAGRAPH_SEPARATOR) {
                    // four-digit decimal reference: 3 affix chars + 4 digits
                    if (ENCODE_AFFIX_CHAR_COUNT + 4 + j > m) {
                        return overflow(input, i, output, j);
                    }
                    j = encode(ch, out, j);
                } else {
                    out[j++] = ch;
                }
            }
        }

        return underflow(input, i, output, j);
    }
    //CSON: MethodLength
}