• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2012 Jeff Ichnowski
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions
6 // are met:
7 //
8 //     * Redistributions of source code must retain the above
9 //       copyright notice, this list of conditions and the following
10 //       disclaimer.
11 //
12 //     * Redistributions in binary form must reproduce the above
13 //       copyright notice, this list of conditions and the following
14 //       disclaimer in the documentation and/or other materials
15 //       provided with the distribution.
16 //
17 //     * Neither the name of the OWASP nor the names of its
18 //       contributors may be used to endorse or promote products
19 //       derived from this software without specific prior written
20 //       permission.
21 //
22 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
27 // INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
28 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31 // STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
33 // OF THE POSSIBILITY OF SUCH DAMAGE.
34 package org.owasp.encoder;
35 
36 import java.nio.CharBuffer;
37 import java.nio.charset.CoderResult;
38 
39 /**
40  * XMLCommentEncoder -- Encodes for the XML/HTML comment context. The sequence "--" is not allowed in comments, and must be
41  * removed/replaced. We also must be careful of trailing hyphens at end of input, as they could combine with the external comment
42  * ending sequence "-->" to become "--->", which is also invalid. As with all XML-based context, invalid XML characters are not
43  * allowed.
44  *
45  * @author Jeff Ichnowski
46  */
47 class XMLCommentEncoder extends Encoder {
48 
49     /**
50      * This is the character used to replace a hyphen when a sequence of hypens is encountered.
51      */
52     static final char HYPHEN_REPLACEMENT = '~';
53 
54     // Input:
55     // <!-- foo -- bar -->
56     // Possible Options:
57     // <!-- foo &ndash; bar -->
58     // <!-- foo == bar -->
59     // <!-- foo __ bar -->
60     // <!-- foo - - bar -->
61     // <!-- foo \u2010\u2010 bar --> (Unicode Hyphen)
62     // <!-- foo \u2013 bar --> (Unicode en-dash)
63     // Note: HTML comments differ, in that they cannot start with: ">", "->".
64     // On IE, "<!--[if ..." has special interpretation
65     // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
66     @Override
maxEncodedLength(int n)67     protected int maxEncodedLength(int n) {
68         return n;
69     }
70 
71     @Override
firstEncodedOffset(String input, int off, int len)72     protected int firstEncodedOffset(String input, int off, int len) {
73         final int n = off + len;
74         for (int i = off; i < n; ++i) {
75             char ch = input.charAt(i);
76             if (ch <= Unicode.MAX_ASCII) {
77                 if (ch == '-') {
78                     if (i + 1 < n) {
79                         if (input.charAt(i + 1) == '-') {
80                             return i;
81 //                        } else {
82 //                            // valid
83                         }
84                     } else {
85                         return i;
86                     }
87                 } else if (ch < ' ' && ch != '\n' && ch != '\r' && ch != '\t') {
88                     return i;
89 //                } else {
90 //                    // valid
91                 }
92             } else if (ch < Character.MIN_HIGH_SURROGATE) {
93                 if (ch <= Unicode.MAX_C1_CTRL_CHAR && ch != Unicode.NEL) {
94                     return i;
95 //                } else {
96 //                    // valid
97                 }
98             } else if (ch <= Character.MAX_HIGH_SURROGATE) {
99                 if (i + 1 < n && Character.isLowSurrogate(input.charAt(i + 1))) {
100                     int cp = Character.toCodePoint(ch, input.charAt(i + 1));
101                     if (Unicode.isNonCharacter(cp)) {
102                         // noncharacter
103                         return i;
104                     }
105                     ++i;
106                 } else {
107                     return i;
108                 }
109             } else if (ch <= Character.MAX_LOW_SURROGATE || ch > '\ufffd'
110                     || ('\ufdd0' <= ch && ch <= '\ufdef'))
111             {
112                 return i;
113 //            } else {
114 //                // valid
115             }
116         }
117         return n;
118     }
119 
120     @Override
encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput)121     protected CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) {
122         final char[] in = input.array();
123         final char[] out = output.array();
124         int i = input.arrayOffset() + input.position();
125         final int n = input.arrayOffset() + input.limit();
126         int j = output.arrayOffset() + output.position();
127         final int m = output.arrayOffset() + output.limit();
128 
129         for (; i < n; ++i) {
130             char ch = in[i];
131             if (ch <= Unicode.MAX_ASCII) {
132                 if (ch == '-') {
133                     if (i + 1 < n) {
134                         if (in[i + 1] == '-') {
135                             if (j + 1 >= m) {
136                                 return overflow(input, i, output, j);
137                             }
138                             out[j++] = '-';
139                             out[j++] = HYPHEN_REPLACEMENT;
140                             ++i;
141                         } else {
142                             if (j >= m) {
143                                 return overflow(input, i, output, j);
144                             }
145                             out[j++] = '-';
146                         }
147                     } else if (endOfInput) {
148                         if (j >= m) {
149                             return overflow(input, i, output, j);
150                         }
151                         out[j++] = HYPHEN_REPLACEMENT;
152                     } else {
153                         // saw '-' at the end of the buffer, but this is not
154                         // end of input, we need to see the next character
155                         // before deciding what to do.
156                         break;
157                     }
158                 } else if (ch > ' ' || ch == '\n' || ch == '\r' || ch == '\t') {
159                     if (j >= m) {
160                         return overflow(input, i, output, j);
161                     }
162                     out[j++] = ch;
163                 } else {
164                     if (j >= m) {
165                         return overflow(input, i, output, j);
166                     }
167                     out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
168                 }
169             } else if (ch < Character.MIN_HIGH_SURROGATE) {
170                 if (j >= m) {
171                     return overflow(input, i, output, j);
172                 }
173                 if (ch > Unicode.MAX_C1_CTRL_CHAR || ch == Unicode.NEL) {
174                     out[j++] = ch;
175                 } else {
176                     // C1 control code
177                     out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
178                 }
179             } else if (ch <= Character.MAX_HIGH_SURROGATE) {
180                 if (i + 1 < n) {
181                     if (Character.isLowSurrogate(in[i + 1])) {
182                         int cp = Character.toCodePoint(ch, in[i + 1]);
183                         if (Unicode.isNonCharacter(cp)) {
184                             // noncharacter
185                             if (j >= m) {
186                                 return overflow(input, i, output, j);
187                             }
188                             out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
189                             ++i;
190                         } else {
191                             if (j + 1 >= m) {
192                                 return overflow(input, i, output, j);
193                             }
194                             out[j++] = ch;
195                             out[j++] = in[++i];
196                         }
197                     } else {
198                         // high without low
199                         if (j >= m) {
200                             return overflow(input, i, output, j);
201                         }
202                         out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
203                     }
204                 } else if (endOfInput) {
205                     // end of input, high without low = invalid
206                     if (j >= m) {
207                         return overflow(input, i, output, j);
208                     }
209                     out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
210                 } else {
211                     break;
212                 }
213             } else if (// low surrogate without preceding high surrogate
214                     ch <= Character.MAX_LOW_SURROGATE
215                     // or non-characters
216                     || ch > '\ufffd'
217                     || ('\ufdd0' <= ch && ch <= '\ufdef'))
218             {
219                 if (j >= m) {
220                     return overflow(input, i, output, j);
221                 }
222                 out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
223             } else {
224                 if (j >= m) {
225                     return overflow(input, i, output, j);
226                 }
227                 out[j++] = ch;
228             }
229         }
230         return underflow(input, i, output, j);
231     }
232 
233     @Override
toString()234     public String toString() {
235         return "XMLCommentEncoder";
236     }
237 }
238