• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 Jeff Ichnowski
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions
6 // are met:
7 //
8 //     * Redistributions of source code must retain the above
9 //       copyright notice, this list of conditions and the following
10 //       disclaimer.
11 //
12 //     * Redistributions in binary form must reproduce the above
13 //       copyright notice, this list of conditions and the following
14 //       disclaimer in the documentation and/or other materials
15 //       provided with the distribution.
16 //
17 //     * Neither the name of the OWASP nor the names of its
18 //       contributors may be used to endorse or promote products
19 //       derived from this software without specific prior written
20 //       permission.
21 //
22 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
27 // INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
28 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31 // STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
33 // OF THE POSSIBILITY OF SUCH DAMAGE.
34 package org.owasp.encoder;
35 
36 import java.nio.CharBuffer;
37 import java.nio.charset.CoderResult;
38 
39 /**
40  * XMLEncoder -- encoder for XML attribute and content data. It uses XML entity
41  * entity ("&...;") to encode valid but significant characters. Characters
42  * that are invalid according to the XML specification are replaced by a space
43  * character (U+0020). This encoder supports several modes of operation,
44  * allowing for varying contexts, such as: attribute data between single-quotes,
45  * attribute data between double-quotes, attribute data with indeterminate
46  * quotes, content, or a context safe for all of the above.
47  *
48  * @author jeffi
49  */
50 class XMLEncoder extends Encoder {
51 
52     // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
53     // Unicode Noncharacters (Unicode Standard 16.7)
54     //  U+FFFE &  U+FFFF
55     // U+1FFFE & U+1FFFF
56     // U+2FFFE & U+2FFFF
57     // ...
58     // U+10FFFE & U+10FFFF
59     // U+FDD0 .. U+FDEF
60     // Control Characters
61     // U+0000 .. U+001F <-- CR, LF, TAB are in this range and ok.
62     // U+007f .. U+009F <-- U+85 = NEL (next line) = CR+LF in one = ok.
63     // Note: the standard says it is a good practice to replace noncharacters
64     // with U+FFFD "replacement character".
65     /**
66      * A bit-mask of valid characters with code-points in the range 0--63.
67      */
68     private static final long BASE_VALID_MASK
69             = (1L << '\t') | (1L << '\r') | (1L << '\n');
70 
71     /**
72      * Maximum number of encoded characters per input character.
73      */
74     static final int MAX_ENCODED_CHAR_LENGTH = 5;
75     /**
76      * The encoded length of an ampersand.
77      */
78     static final int AMP_LENGTH = 5;
79     /**
80      * The encoded length of a less-than sign.
81      */
82     static final int LT_LENGTH = 4;
83     /**
84      * The encoded length of a greater-than sign.
85      */
86     static final int GT_LENGTH = 4;
87     /**
88      * The encoded length of an apostrophe.
89      */
90     static final int APOS_LENGTH = 5;
91     /**
92      * The encoded length of a double-quotation character.
93      */
94     static final int QUOT_LENGTH = 5;
95 
96     /**
97      * An enum of supported "modes" of operation for the XMLEncoder.
98      */
99     enum Mode {
100 
101         /**
102          * All significant characters are encoded (&amp; &lt; &gt; ' "). This
103          * mode is safe for use in either content or attributes. See note on
104          * {@link #CONTENT} for explanation of why '>' is encoded.
105          */
106         ALL("&<>\'\""),
107         /**
108          * Characters are encoded for content (a.k.a. "CharData"). This means
109          * &amp; &lt; and &gt;. Note: &gt; only requires encoding if it follows
110          * "]]". However for maximum compatibility and to avoid the overhead of
111          * looking for "]]", we just always encode '>' to '&amp;gt;'.
112          */
113         CONTENT("&<>"),
114         /**
115          * Characters are encoded for attribute values--either single or double
116          * quoted. This means the characters &amp; &lt ' and " are encoded.
117          * Note: &gt; is NOT encoded, and thus this mode is not suitable for
118          * content.
119          */
120         ATTRIBUTE("&<\'\""),
121         /**
122          * Characters are encoded for single-quoted attribute values. Thus, the
123          * same as {@link #ATTRIBUTE} except ' is not encoded.
124          */
125         SINGLE_QUOTED_ATTRIBUTE("&<\'"),
126         /**
127          * Characters are encoded for double-quoted attribute values. Thus, the
128          * same as {@link #ATTRIBUTE} except " is not encoded.
129          */
130         DOUBLE_QUOTED_ATTRIBUTE("&<\""),;
131 
132         /**
133          * The bit-mask of characters that do not need encoding in this mode.
134          */
135         private final long _validMask;
136 
137         /**
138          * Sole constructor.
139          *
140          * @param encodedChars -- a string of characters must be encoded in this
141          * mode. This string is converted to a bit-mask.
142          */
Mode(String encodedChars)143         Mode(String encodedChars) {
144             long encodeMask = 0;
145             for (int i = 0, n = encodedChars.length(); i < n; ++i) {
146                 encodeMask |= 1L << encodedChars.charAt(i);
147             }
148             _validMask = BASE_VALID_MASK | ((-1L << ' ') & ~(encodeMask));
149         }
150 
151         /**
152          * Accessor for {@link #_validMask}.
153          *
154          * @return {@link #_validMask}
155          */
validMask()156         long validMask() {
157             return _validMask;
158         }
159     }
160 
161     /**
162      * Character to use as a replacement for invalid characters (Not to be
163      * confused with characters that require encoding). Invalid characters have
164      * no encoding, and are not allowed in the context.
165      */
166     static final char INVALID_CHARACTER_REPLACEMENT = ' ';
167 
168     /**
169      * The mask of valid characters extracted from the mode for efficiency.
170      */
171     private final long _validMask;
172     /**
173      * The mode of operation--only really stored to provide a relevant toString
174      * implementation.
175      */
176     private final Mode _mode;
177 
178     /**
179      * Default constructor--equivalent to XMLEncoder(Mode.ALL).
180      */
XMLEncoder()181     XMLEncoder() {
182         this(Mode.ALL);
183     }
184 
185     /**
186      * Creates an XMLEncoder for the specified mode constant.
187      *
188      * @param mode the mode of the encoder.
189      */
XMLEncoder(Mode mode)190     XMLEncoder(Mode mode) {
191         _mode = mode;
192         _validMask = mode.validMask();
193     }
194 
195     @Override
maxEncodedLength(int n)196     public int maxEncodedLength(int n) {
197         // "&amp;" = 5 chars.
198         return n * MAX_ENCODED_CHAR_LENGTH;
199     }
200 
201     @Override
firstEncodedOffset(String input, int off, int len)202     public int firstEncodedOffset(String input, int off, int len) {
203         final int n = off + len;
204 
205         for (int i = off; i < n; ++i) {
206             char ch = input.charAt(i);
207             if (ch < Unicode.DEL) {
208                 if (ch <= '>' && (_validMask & (1L << ch)) == 0) {
209                     // either needs encoding or is invalid
210                     return i;
211 //                } else {
212 //                    // valid
213                 }
214             } else if (ch < Character.MIN_HIGH_SURROGATE) {
215                 if (ch <= Unicode.MAX_C1_CTRL_CHAR && ch != Unicode.NEL) {
216                     return i;
217 //                } else {
218 //                    // valid
219                 }
220             } else if (ch <= Character.MAX_HIGH_SURROGATE) {
221                 if (i + 1 < n && Character.isLowSurrogate(input.charAt(i + 1))) {
222                     int cp = Character.toCodePoint(ch, input.charAt(i + 1));
223                     if (Unicode.isNonCharacter(cp)) {
224                         // noncharacter
225                         return i;
226                     }
227                     ++i;
228                 } else {
229                     return i;
230                 }
231             } else if (ch <= Character.MAX_LOW_SURROGATE
232                     || ch > '\ufffd'
233                     || ('\ufdd0' <= ch && ch <= '\ufdef'))
234             {
235                 return i;
236 //            } else {
237 //                // valid
238             }
239         }
240 
241         return n;
242     }
243 
244     /**
245      * {@inheritDoc}
246      */
encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput)247     protected CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) {
248         final char[] in = input.array();
249         final char[] out = output.array();
250         int i = input.arrayOffset() + input.position();
251         final int n = input.arrayOffset() + input.limit();
252         int j = output.arrayOffset() + output.position();
253         final int m = output.arrayOffset() + output.limit();
254 
255         for (; i < n; ++i) {
256             final char ch = in[i];
257             if (ch < Unicode.DEL) {
258                 if (ch > '>' || ((_validMask & (1L << ch)) != 0)) {
259                     // Common case ('>' .. '~') reached in two branches
260                     if (j >= m) {
261                         return overflow(input, i, output, j);
262                     }
263                     out[j++] = ch;
264                 } else {
265                     switch (ch) {
266                         case '&':
267                             if (j + AMP_LENGTH > m) {
268                                 return overflow(input, i, output, j);
269                             }
270                             out[j++] = '&';
271                             out[j++] = 'a';
272                             out[j++] = 'm';
273                             out[j++] = 'p';
274                             out[j++] = ';';
275                             break;
276                         case '<':
277                             if (j + LT_LENGTH > m) {
278                                 return overflow(input, i, output, j);
279                             }
280                             out[j++] = '&';
281                             out[j++] = 'l';
282                             out[j++] = 't';
283                             out[j++] = ';';
284                             break;
285                         case '>':
286                             if (j + GT_LENGTH > m) {
287                                 return overflow(input, i, output, j);
288                             }
289                             out[j++] = '&';
290                             out[j++] = 'g';
291                             out[j++] = 't';
292                             out[j++] = ';';
293                             break;
294                         case '\'':
295                             // &apos; is valid in XML, but not in HTML, and numeric code is shorter
296                             if (j + APOS_LENGTH > m) {
297                                 return overflow(input, i, output, j);
298                             }
299                             out[j++] = '&';
300                             out[j++] = '#';
301                             out[j++] = '3';
302                             out[j++] = '9';
303                             out[j++] = ';';
304                             break;
305                         case '\"':
306                             // &quot; is valid in XML and HTML, but numeric code is shorter
307                             if (j + QUOT_LENGTH > m) {
308                                 return overflow(input, i, output, j);
309                             }
310                             out[j++] = '&';
311                             out[j++] = '#';
312                             out[j++] = '3';
313                             out[j++] = '4';
314                             out[j++] = ';';
315                             break;
316                         default:
317                             // invalid character
318                             if (j >= m) {
319                                 return overflow(input, i, output, j);
320                             }
321                             out[j++] = INVALID_CHARACTER_REPLACEMENT;
322                             break;
323                     }
324                 }
325             } else if (ch < Character.MIN_HIGH_SURROGATE) {
326                 if (j >= m) {
327                     return overflow(input, i, output, j);
328                 }
329                 if (ch > Unicode.MAX_C1_CTRL_CHAR || ch == Unicode.NEL) {
330                     out[j++] = ch;
331                 } else {
332                     // C1 control code
333                     out[j++] = INVALID_CHARACTER_REPLACEMENT;
334                 }
335             } else if (ch <= Character.MAX_HIGH_SURROGATE) {
336                 if (i + 1 < n) {
337                     if (Character.isLowSurrogate(in[i + 1])) {
338                         int cp = Character.toCodePoint(ch, in[i + 1]);
339                         if (Unicode.isNonCharacter(cp)) {
340                             // noncharacter
341                             if (j >= m) {
342                                 return overflow(input, i, output, j);
343                             }
344                             out[j++] = INVALID_CHARACTER_REPLACEMENT;
345                             ++i;
346                         } else {
347                             if (j + 1 >= m) {
348                                 return overflow(input, i, output, j);
349                             }
350                             out[j++] = ch;
351                             out[j++] = in[++i];
352                         }
353                     } else {
354                         // high without low
355                         if (j >= m) {
356                             return overflow(input, i, output, j);
357                         }
358                         out[j++] = INVALID_CHARACTER_REPLACEMENT;
359                     }
360                 } else if (endOfInput) {
361                     // end of input, high without low = invalid
362                     if (j >= m) {
363                         return overflow(input, i, output, j);
364                     }
365                     out[j++] = INVALID_CHARACTER_REPLACEMENT;
366                 } else {
367                     break;
368                 }
369             } else if (// low surrogate without preceding high surrogate
370                     ch <= Character.MAX_LOW_SURROGATE
371                     // or non-characters
372                     || ch > '\ufffd'
373                     || ('\ufdd0' <= ch && ch <= '\ufdef'))
374             {
375                 if (j >= m) {
376                     return overflow(input, i, output, j);
377                 }
378                 out[j++] = INVALID_CHARACTER_REPLACEMENT;
379             } else {
380                 if (j >= m) {
381                     return overflow(input, i, output, j);
382                 }
383                 out[j++] = ch;
384             }
385         }
386 
387         return underflow(input, i, output, j);
388     }
389 
390     @Override
toString()391     public String toString() {
392         return "XMLEncoder(" + _mode + ")";
393     }
394 }
395