// Copyright (c) 2012 Jeff Ichnowski
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
//     * Redistributions of source code must retain the above
//       copyright notice, this list of conditions and the following
//       disclaimer.
//
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials
//       provided with the distribution.
//
//     * Neither the name of the OWASP nor the names of its
//       contributors may be used to endorse or promote products
//       derived from this software without specific prior written
//       permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
// OF THE POSSIBILITY OF SUCH DAMAGE.
34 package org.owasp.encoder;
35 
36 import java.nio.CharBuffer;
37 import java.nio.charset.CoderResult;
38 
39 /**
40  * CDATAEncoder -- encoder for CDATA sections. CDATA sections are generally good
41  * for including large blocks of text that contain characters that normally
42  * require encoding (ampersand, quotes, less-than, etc...). The CDATA context
43  * however still does not allow invalid characters, and can be closed by the
44  * sequence "]]>". This encoder removes invalid XML characters, and encodes
45  * "]]>" (to "]]]]><![CDATA[>"). The result is that the data integrity is
46  * maintained, but the code receiving the output will have to handle multiple
47  * CDATA events. As an alternate approach, the caller could pre-encode "]]>" to
48  * something of their choosing (e.g. data.replaceAll("\\]\\]>", "]] >")), then
49  * use this encoder to remove any invalid XML characters.
50  *
51  * @author Jeff Ichnowski
52  */
53 class CDATAEncoder extends Encoder {
54 
55     /**
56      * The encoding of @{code "]]>"}.
57      */
58     private static final char[] CDATA_END_ENCODED
59             = "]]]]><![CDATA[>".toCharArray();
60 
61     /**
62      * Length of {@code "]]]]><![CDATA[>"}.
63      */
64     private static final int CDATA_END_ENCODED_LENGTH = 15;
65 
66     /**
67      * Length of {@code "]]>"}.
68      */
69     private static final int CDATA_END_LENGTH = 3;
70 
71     @Override
maxEncodedLength(int n)72     protected int maxEncodedLength(int n) {
73         // "]" becomes "]" (1 -> 1)
74         // "]]" becomes "]]" (2 -> 2)
75         // "]]>" becomes "]]]]><![CDATA[>" (3 -> 15)
76         // "]]>]" becomes "]]]]><![CDATA[>]" (3 -> 15 + 1 -> 1)
77         // ...
78 
79         int worstCase = n / CDATA_END_LENGTH;
80         int remainder = n % CDATA_END_LENGTH;
81 
82         return worstCase * CDATA_END_ENCODED_LENGTH + remainder;
83 
84 //        return (n - remainder) * 5 + remainder;
85     }
86 
87     @Override
firstEncodedOffset(String input, int off, int len)88     protected int firstEncodedOffset(String input, int off, int len) {
89         final int n = off + len;
90         //int closeCount = 0; //unused...
91         for (int i = off; i < n; ++i) {
92             char ch = input.charAt(i);
93             if (ch <= Unicode.MAX_ASCII) {
94                 if (ch != ']') {
95                     if (ch < ' ' && ch != '\n' && ch != '\r' && ch != '\t') {
96                         return i;
97 //                    } else {
98 //                        // valid
99                     }
100 
101                 } else if (i + 1 < n) {
102                     if (input.charAt(i + 1) != ']') {
103                         // "]x" (next character is safe for this to be ']')
104                     } else {
105                         // "]]?"
106                         // keep looping through ']'
107                         for (; i + 2 < n && input.charAt(i + 2) == ']'; ++i) {
108                             // valid
109                         }
110                         // at this point we've looped through a sequence
111                         // of 2 or more "]", if the next character is ">"
112                         // we need to encode "]]>".
113                         if (i + 2 < n) {
114                             if (input.charAt(i + 2) == '>') {
115                                 return i;
116 //                                } else {
117 //                                    // valid
118                             }
119 
120                         } else {
121                             return n;
122                         }
123                     }
124                 } else {
125                     return n;
126                 }
127             } else if (ch < Character.MIN_HIGH_SURROGATE) {
128                 if (ch <= Unicode.MAX_C1_CTRL_CHAR && ch != Unicode.NEL) {
129                     return i;
130 //                } else {
131 //                    // valid
132                 }
133             } else if (ch <= Character.MAX_HIGH_SURROGATE) {
134                 if (i + 1 < n) {
135                     if (Character.isLowSurrogate(input.charAt(i + 1))) {
136                         int cp = Character.toCodePoint(ch, input.charAt(i + 1));
137                         if (Unicode.isNonCharacter(cp)) {
138                             return i;
139                         } else {
140                             ++i;
141                             // valid pair
142                         }
143                     } else {
144                         return i;
145                     }
146                 } else {
147                     // end of input, high without low = invalid
148                     return i;
149                 }
150             } else if (// low surrogate without preceding high surrogate
151                     ch <= Character.MAX_LOW_SURROGATE
152                     // or non-characters
153                     || ch > '\ufffd'
154                     || ('\ufdd0' <= ch && ch <= '\ufdef'))
155             {
156                 return i;
157 //            } else {
158 //                // valid
159             }
160 
161         }
162         return n;
163     }
164 
165     @Override
encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput)166     protected CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) {
167         final char[] in = input.array();
168         final char[] out = output.array();
169         int i = input.arrayOffset() + input.position();
170         final int n = input.arrayOffset() + input.limit();
171         int j = output.arrayOffset() + output.position();
172         final int m = output.arrayOffset() + output.limit();
173 
174         for (; i < n; ++i) {
175             char ch = in[i];
176             if (ch <= Unicode.MAX_ASCII) {
177                 if (ch != ']') {
178                     if (j >= m) {
179                         return overflow(input, i, output, j);
180                     }
181                     if (ch >= ' ' || ch == '\n' || ch == '\r' || ch == '\t') {
182                         out[j++] = ch;
183                     } else {
184                         out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
185                     }
186                 } else if (i + 1 < n) {
187                     if (in[i + 1] != ']') {
188                         // "]x" (next character is safe for this to be ']')
189                         if (j >= m) {
190                             return overflow(input, i, output, j);
191                         }
192                         out[j++] = ']';
193                     } else {
194                         // "]]?"
195                         // keep looping through ']'
196                         for (; i + 2 < n && in[i + 2] == ']'; ++i) {
197                             if (j >= m) {
198                                 return overflow(input, i, output, j);
199                             }
200                             out[j++] = ']';
201                         }
202                         // at this point we've looped through a sequence
203                         // of 2 or more "]", if the next character is ">"
204                         // we need to encode "]]>".
205                         if (i + 2 < n) {
206                             if (in[i + 2] == '>') {
207                                 if (j + CDATA_END_ENCODED_LENGTH > m) {
208                                     return overflow(input, i, output, j);
209                                 }
210                                 System.arraycopy(CDATA_END_ENCODED, 0, out, j, CDATA_END_ENCODED_LENGTH);
211                                 j += CDATA_END_ENCODED_LENGTH;
212                                 i += 2;
213                             } else {
214                                 if (j >= m) {
215                                     return overflow(input, i, output, j);
216                                 }
217                                 out[j++] = ']';
218                             }
219                         } else if (endOfInput) {
220                             if (j + 2 > m) {
221                                 return overflow(input, i, output, j);
222                             }
223                             out[j++] = ']';
224                             out[j++] = ']';
225                             i = n;
226                             break;
227                         } else {
228                             break;
229                         }
230                     }
231                 } else if (endOfInput) {
232                     // seen "]", then end of input.
233                     if (j >= m) {
234                         return overflow(input, i, output, j);
235                     }
236                     out[j++] = ']';
237                     i++;
238                     break;
239                 } else {
240                     break;
241                 }
242             } else if (ch < Character.MIN_HIGH_SURROGATE) {
243                 if (ch > Unicode.MAX_C1_CTRL_CHAR || ch == Unicode.NEL) {
244                     if (j >= m) {
245                         return overflow(input, i, output, j);
246                     }
247                     out[j++] = ch;
248                 } else {
249                     // C1 control code
250                     if (j >= m) {
251                         return overflow(input, i, output, j);
252                     }
253                     out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
254                 }
255             } else if (ch <= Character.MAX_HIGH_SURROGATE) {
256                 if (i + 1 < n) {
257                     if (Character.isLowSurrogate(in[i + 1])) {
258                         int cp = Character.toCodePoint(ch, in[i + 1]);
259                         if (Unicode.isNonCharacter(cp)) {
260                             if (j >= m) {
261                                 return overflow(input, i, output, j);
262                             }
263                             out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
264                             ++i;
265                         } else {
266                             if (j + 1 >= m) {
267                                 return overflow(input, i, output, j);
268                             }
269                             out[j++] = ch;
270                             out[j++] = in[++i];
271                         }
272                     } else {
273                         // high without low
274                         if (j >= m) {
275                             return overflow(input, i, output, j);
276                         }
277                         out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
278                     }
279                 } else if (endOfInput) {
280                     // end of input, high without low = invalid
281                     if (j >= m) {
282                         return overflow(input, i, output, j);
283                     }
284                     out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
285                 } else {
286                     break;
287                 }
288             } else if (// low surrogate without preceding high surrogate
289                     ch <= Character.MAX_LOW_SURROGATE
290                     // or non-characters
291                     || ch > '\ufffd'
292                     || ('\ufdd0' <= ch && ch <= '\ufdef'))
293             {
294                 if (j >= m) {
295                     return overflow(input, i, output, j);
296                 }
297                 out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
298             } else {
299                 if (j >= m) {
300                     return overflow(input, i, output, j);
301                 }
302                 out[j++] = ch;
303             }
304         }
305         return underflow(input, i, output, j);
306     }
307 
308     @Override
toString()309     public String toString() {
310         return "CDATAEncoder";
311     }
312 }
313