• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 Jeff Ichnowski
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions
6 // are met:
7 //
8 //     * Redistributions of source code must retain the above
9 //       copyright notice, this list of conditions and the following
10 //       disclaimer.
11 //
12 //     * Redistributions in binary form must reproduce the above
13 //       copyright notice, this list of conditions and the following
14 //       disclaimer in the documentation and/or other materials
15 //       provided with the distribution.
16 //
17 //     * Neither the name of the OWASP nor the names of its
18 //       contributors may be used to endorse or promote products
19 //       derived from this software without specific prior written
20 //       permission.
21 //
22 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
27 // INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
28 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31 // STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
33 // OF THE POSSIBILITY OF SUCH DAMAGE.
34 package org.owasp.encoder;
35 
36 import java.nio.CharBuffer;
37 import java.nio.charset.CoderResult;
38 
39 /**
40  * <p>
41  * HTMLEncoder -- an encoder for HTML contexts. Currently most HTML-based
42  * contexts are properly handled by {@link XMLEncoder}. The remaining
43  * HTML-specific context of "unquoted attributes" could not be added to the
44  * XMLEncoder without slowing it down. This class implements that remaining
45  * context: <strong>unquoted attribute values</strong>.</p>
46  *
47  * <p>
48  * Note: because this context is likely small strings, and hopefully rarely
49  * used, no effort was put into optimizing this encoder.</p>
50  *
51  * @author Jeff Ichnowski
52  */
53 class HTMLEncoder extends Encoder {
54 
55     /**
56      * Number of characters in the encoding prefix and suffix when using decimal
57      * numeric encodings of the form "&#...;".
58      */
59     private static final int ENCODE_AFFIX_CHAR_COUNT = 3;
60 
61     /**
62      * Encoding for '\t'.
63      */
64     private static final char[] TAB = "&#9;".toCharArray();
65     /**
66      * Encoding for '&amp;'.
67      */
68     private static final char[] AMP = "&amp;".toCharArray();
69     /**
70      * Encoding for '&lt;'.
71      */
72     private static final char[] LT = "&lt;".toCharArray();
73     /**
74      * Encoding for '&gt;'.
75      */
76     private static final char[] GT = "&gt;".toCharArray();
77 
78     // The large table-switch implementation used here is fast to
79     // implement but slower at runtime than tuned-for-expected-input
80     // encoders that use selective if/else's.  Look at the results of
81     // BenchmarkTest to see the difference.  See note in javadoc as to
82     // reasoning.
83     // On Core i7 (Sandybridge)
84     // Baseline is 371.401009 ns/op
85     // Benchmarked Encode.forXml: 324.219992 ns/op (-12.70% on baseline)
86     // Benchmarked Encode.forHtmlUnquotedAttribute: 821.583263 ns/op (+121.21% on baseline)
87     @Override
maxEncodedLength(int n)88     int maxEncodedLength(int n) {
89         // if everything is line separators and paragraph separators then
90         // we get "&#8283;"
91         return n * (ENCODE_AFFIX_CHAR_COUNT + 4);
92     }
93 
94     @Override
firstEncodedOffset(String input, int off, int len)95     int firstEncodedOffset(String input, int off, int len) {
96         final int n = off + len;
97         for (int i = off; i < n; ++i) {
98             final char ch = input.charAt(i);
99 
100             switch (ch) {
101                 case '\t':
102                 case '\r':
103                 case '\f':
104                 case '\n':
105                 case ' ':
106                 case Unicode.NEL:
107                 case '\"':
108                 case '\'':
109                 case '/':
110                 case '=':
111                 case '`':
112                 case '&':
113                 case '<':
114                 case '>':
115                     return i;
116 
117                 case '!':
118                 case '#':
119                 case '$':
120                 case '%':
121                 case '(':
122                 case ')':
123                 case '*':
124                 case '+':
125                 case ',':
126                 case '-':
127                 case '.':
128 
129                 case '0':
130                 case '1':
131                 case '2':
132                 case '3':
133                 case '4':
134                 case '5':
135                 case '6':
136                 case '7':
137                 case '8':
138                 case '9':
139                 case ':':
140                 case ';':
141                 case '?':
142                 case '@':
143 
144                 case 'A':
145                 case 'B':
146                 case 'C':
147                 case 'D':
148                 case 'E':
149                 case 'F':
150                 case 'G':
151                 case 'H':
152                 case 'I':
153                 case 'J':
154                 case 'K':
155                 case 'L':
156                 case 'M':
157                 case 'N':
158                 case 'O':
159                 case 'P':
160                 case 'Q':
161                 case 'R':
162                 case 'S':
163                 case 'T':
164                 case 'U':
165                 case 'V':
166                 case 'W':
167                 case 'X':
168                 case 'Y':
169                 case 'Z':
170 
171                 case '[':
172                 case '\\':
173                 case ']':
174                 case '^':
175                 case '_':
176 
177                 case 'a':
178                 case 'b':
179                 case 'c':
180                 case 'd':
181                 case 'e':
182                 case 'f':
183                 case 'g':
184                 case 'h':
185                 case 'i':
186                 case 'j':
187                 case 'k':
188                 case 'l':
189                 case 'm':
190                 case 'n':
191                 case 'o':
192                 case 'p':
193                 case 'q':
194                 case 'r':
195                 case 's':
196                 case 't':
197                 case 'u':
198                 case 'v':
199                 case 'w':
200                 case 'x':
201                 case 'y':
202                 case 'z':
203 
204                 case '{':
205                 case '|':
206                 case '}':
207                 case '~':
208                     break; // valid
209 
210                 default:
211 
212                     if (Character.isHighSurrogate(ch)) {
213                         if (i + 1 < n) {
214                             if (Character.isLowSurrogate(input.charAt(i + 1))) {
215                                 int cp = Character.toCodePoint(ch, input.charAt(i + 1));
216                                 if (Unicode.isNonCharacter(cp)) {
217                                     return i;
218                                 } else {
219                                     ++i;
220                                 }
221                                 break;
222                             }
223                         } else {
224                             return i;
225                         }
226                     }
227 
228                     if (ch <= Unicode.MAX_C1_CTRL_CHAR
229                             || Character.MIN_SURROGATE <= ch && ch <= Character.MAX_SURROGATE
230                             || ch > '\ufffd'
231                             || ('\ufdd0' <= ch && ch <= '\ufdef')
232                             || ch == Unicode.LINE_SEPARATOR || ch == Unicode.PARAGRAPH_SEPARATOR)
233                     {
234                         return i;
235                     }
236             }
237         }
238         return n;
239     }
240 
241     /**
242      * Appends a source array verbatim to the output array. Caller must insure
243      * there is enough space in the array for the output.
244      *
245      * @param src the characters to copy
246      * @param out the output buffer
247      * @param j the offset where to write in the output buffer
248      * @return {@code j + src.length}
249      */
append(char[] src, char[] out, int j)250     static int append(char[] src, char[] out, int j) {
251         System.arraycopy(src, 0, out, j, src.length);
252         return j + src.length;
253     }
254 
255     /**
256      * Appends the numerically encoded version of {@code codePoint} to the
257      * output buffer. Caller must insure there is enough space for the output.
258      *
259      * @param codePoint the character to encode
260      * @param out the output buffer
261      * @param j the offset where to write in the output buffer
262      * @return {@code j} + the encoded length.
263      */
encode(int codePoint, char[] out, int j)264     static int encode(int codePoint, char[] out, int j) {
265         out[j++] = '&';
266         out[j++] = '#';
267         if (codePoint >= 1000) {
268             out[j++] = (char) (codePoint / 1000 % 10 + '0');
269         }
270         if (codePoint >= 100) {
271             out[j++] = (char) (codePoint / 100 % 10 + '0');
272         }
273         if (codePoint >= 10) {
274             out[j++] = (char) (codePoint / 10 % 10 + '0');
275         }
276         out[j++] = (char) (codePoint % 10 + '0');
277         out[j++] = ';';
278         return j;
279     }
280 
281     //CSOFF: MethodLength
282     @Override
encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput)283     CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) {
284         final char[] in = input.array();
285         final char[] out = output.array();
286         int i = input.arrayOffset() + input.position();
287         final int n = input.arrayOffset() + input.limit();
288         int j = output.arrayOffset() + output.position();
289         final int m = output.arrayOffset() + output.limit();
290 
291         charLoop:
292         for (; i < n; ++i) {
293             final char ch = in[i];
294 
295             // gigantic switch, hopefully compiled to a tableswitch.
296             // this approach appears to be slower than the if/else
297             // approach used in the other encoders.  Perhaps an artifact
298             // of the CPU's branch predictor, or possible additional
299             // overhead of range checking, or having the entire table
300             // available to the cache.  If time allows, it would
301             // interesting to find out.
302             switch (ch) {
303                 case '\t':
304                     if (j + TAB.length > m) {
305                         return overflow(input, i, output, j);
306                     }
307                     j = append(TAB, out, j);
308                     break;
309 
310                 case '\r':
311                 case '\n':
312                 case '\f':
313                 case ' ':
314                 case '\"':
315                 case '\'':
316                 case '/':
317                 case '=':
318                 case '`':
319                     if (ENCODE_AFFIX_CHAR_COUNT + 2 + j > m) {
320                         return overflow(input, i, output, j);
321                     }
322                     j = encode(ch, out, j);
323                     break;
324 
325                 case Unicode.NEL:
326                     if (ENCODE_AFFIX_CHAR_COUNT + 3 + j > m) {
327                         return overflow(input, i, output, j);
328                     }
329                     j = encode(ch, out, j);
330                     break;
331 
332                 case '&':
333                     if (j + AMP.length > m) {
334                         return overflow(input, i, output, j);
335                     }
336                     j = append(AMP, out, j);
337                     break;
338 
339                 case '<':
340                     if (j + LT.length > m) {
341                         return overflow(input, i, output, j);
342                     }
343                     j = append(LT, out, j);
344                     break;
345 
346                 case '>':
347                     if (j + GT.length > m) {
348                         return overflow(input, i, output, j);
349                     }
350                     j = append(GT, out, j);
351                     break;
352 
353                 case '!':
354                 case '#':
355                 case '$':
356                 case '%':
357                 case '(':
358                 case ')':
359                 case '*':
360                 case '+':
361                 case ',':
362                 case '-':
363                 case '.':
364 
365                 case '0':
366                 case '1':
367                 case '2':
368                 case '3':
369                 case '4':
370                 case '5':
371                 case '6':
372                 case '7':
373                 case '8':
374                 case '9':
375                 case ':':
376                 case ';':
377                 case '?':
378                 case '@':
379 
380                 case 'A':
381                 case 'B':
382                 case 'C':
383                 case 'D':
384                 case 'E':
385                 case 'F':
386                 case 'G':
387                 case 'H':
388                 case 'I':
389                 case 'J':
390                 case 'K':
391                 case 'L':
392                 case 'M':
393                 case 'N':
394                 case 'O':
395                 case 'P':
396                 case 'Q':
397                 case 'R':
398                 case 'S':
399                 case 'T':
400                 case 'U':
401                 case 'V':
402                 case 'W':
403                 case 'X':
404                 case 'Y':
405                 case 'Z':
406 
407                 case '[':
408                 case '\\':
409                 case ']':
410                 case '^':
411                 case '_':
412 
413                 case 'a':
414                 case 'b':
415                 case 'c':
416                 case 'd':
417                 case 'e':
418                 case 'f':
419                 case 'g':
420                 case 'h':
421                 case 'i':
422                 case 'j':
423                 case 'k':
424                 case 'l':
425                 case 'm':
426                 case 'n':
427                 case 'o':
428                 case 'p':
429                 case 'q':
430                 case 'r':
431                 case 's':
432                 case 't':
433                 case 'u':
434                 case 'v':
435                 case 'w':
436                 case 'x':
437                 case 'y':
438                 case 'z':
439                 case '{':
440                 case '|':
441                 case '}':
442                 case '~':
443                     if (j >= m) {
444                         return overflow(input, i, output, j);
445                     }
446                     out[j++] = ch;
447                     break;
448                 default:
449 
450                     if (Character.isHighSurrogate(ch)) {
451                         if (i + 1 < n) {
452                             if (Character.isLowSurrogate(in[i + 1])) {
453                                 int cp = Character.toCodePoint(ch, in[i + 1]);
454                                 if (Unicode.isNonCharacter(cp)) {
455                                     if (j >= m) {
456                                         return overflow(input, i, output, j);
457                                     }
458                                     out[j++] = '-';
459                                     ++i;
460                                 } else {
461                                     if (j + 1 >= m) {
462                                         return overflow(input, i, output, j);
463                                     }
464                                     out[j++] = ch;
465                                     out[j++] = in[++i];
466                                 }
467                                 break;
468                             }
469                         } else if (!endOfInput) {
470                             break charLoop;
471                         }
472                     }
473 
474                     if (j >= m) {
475                         return overflow(input, i, output, j);
476                     }
477 
478                     if (ch <= Unicode.MAX_C1_CTRL_CHAR
479                             || Character.MIN_SURROGATE <= ch && ch <= Character.MAX_SURROGATE
480                             || ch > '\ufffd'
481                             || ('\ufdd0' <= ch && ch <= '\ufdef'))
482                     {
483                         // invalid
484                         out[j++] = '-';
485                     } else if (ch == Unicode.LINE_SEPARATOR || ch == Unicode.PARAGRAPH_SEPARATOR) {
486                         if (ENCODE_AFFIX_CHAR_COUNT + 4 + j > m) {
487                             return overflow(input, i, output, j);
488                         }
489                         j = encode(ch, out, j);
490                     } else {
491                         out[j++] = ch;
492                     }
493             }
494         }
495 
496         return underflow(input, i, output, j);
497     }
498     //CSON: MethodLength
499 }
500