1 // Copyright (c) 2012 Jeff Ichnowski 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions 6 // are met: 7 // 8 // * Redistributions of source code must retain the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer. 11 // 12 // * Redistributions in binary form must reproduce the above 13 // copyright notice, this list of conditions and the following 14 // disclaimer in the documentation and/or other materials 15 // provided with the distribution. 16 // 17 // * Neither the name of the OWASP nor the names of its 18 // contributors may be used to endorse or promote products 19 // derived from this software without specific prior written 20 // permission. 21 // 22 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 27 // INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 28 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 29 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 31 // STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 33 // OF THE POSSIBILITY OF SUCH DAMAGE. 34 package org.owasp.encoder; 35 36 import java.nio.CharBuffer; 37 import java.nio.charset.CoderResult; 38 39 /** 40 * XMLEncoder -- encoder for XML attribute and content data. It uses XML entity 41 * entity ("&...;") to encode valid but significant characters. Characters 42 * that are invalid according to the XML specification are replaced by a space 43 * character (U+0020). This encoder supports several modes of operation, 44 * allowing for varying contexts, such as: attribute data between single-quotes, 45 * attribute data between double-quotes, attribute data with indeterminate 46 * quotes, content, or a context safe for all of the above. 47 * 48 * @author jeffi 49 */ 50 class XMLEncoder extends Encoder { 51 52 // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] 53 // Unicode Noncharacters (Unicode Standard 16.7) 54 // U+FFFE & U+FFFF 55 // U+1FFFE & U+1FFFF 56 // U+2FFFE & U+2FFFF 57 // ... 58 // U+10FFFE & U+10FFFF 59 // U+FDD0 .. U+FDEF 60 // Control Characters 61 // U+0000 .. U+001F <-- CR, LF, TAB are in this range and ok. 62 // U+007f .. U+009F <-- U+85 = NEL (next line) = CR+LF in one = ok. 63 // Note: the standard says it is a good practice to replace noncharacters 64 // with U+FFFD "replacement character". 65 /** 66 * A bit-mask of valid characters with code-points in the range 0--63. 67 */ 68 private static final long BASE_VALID_MASK 69 = (1L << '\t') | (1L << '\r') | (1L << '\n'); 70 71 /** 72 * Maximum number of encoded characters per input character. 73 */ 74 static final int MAX_ENCODED_CHAR_LENGTH = 5; 75 /** 76 * The encoded length of an ampersand. 77 */ 78 static final int AMP_LENGTH = 5; 79 /** 80 * The encoded length of a less-than sign. 81 */ 82 static final int LT_LENGTH = 4; 83 /** 84 * The encoded length of a greater-than sign. 85 */ 86 static final int GT_LENGTH = 4; 87 /** 88 * The encoded length of an apostrophe. 89 */ 90 static final int APOS_LENGTH = 5; 91 /** 92 * The encoded length of a double-quotation character. 93 */ 94 static final int QUOT_LENGTH = 5; 95 96 /** 97 * An enum of supported "modes" of operation for the XMLEncoder. 98 */ 99 enum Mode { 100 101 /** 102 * All significant characters are encoded (& < > ' "). This 103 * mode is safe for use in either content or attributes. See note on 104 * {@link #CONTENT} for explanation of why '>' is encoded. 105 */ 106 ALL("&<>\'\""), 107 /** 108 * Characters are encoded for content (a.k.a. "CharData"). This means 109 * & < and >. Note: > only requires encoding if it follows 110 * "]]". However for maximum compatibility and to avoid the overhead of 111 * looking for "]]", we just always encode '>' to '&gt;'. 112 */ 113 CONTENT("&<>"), 114 /** 115 * Characters are encoded for attribute values--either single or double 116 * quoted. This means the characters & < ' and " are encoded. 117 * Note: > is NOT encoded, and thus this mode is not suitable for 118 * content. 119 */ 120 ATTRIBUTE("&<\'\""), 121 /** 122 * Characters are encoded for single-quoted attribute values. Thus, the 123 * same as {@link #ATTRIBUTE} except ' is not encoded. 124 */ 125 SINGLE_QUOTED_ATTRIBUTE("&<\'"), 126 /** 127 * Characters are encoded for double-quoted attribute values. Thus, the 128 * same as {@link #ATTRIBUTE} except " is not encoded. 129 */ 130 DOUBLE_QUOTED_ATTRIBUTE("&<\""),; 131 132 /** 133 * The bit-mask of characters that do not need encoding in this mode. 134 */ 135 private final long _validMask; 136 137 /** 138 * Sole constructor. 139 * 140 * @param encodedChars -- a string of characters must be encoded in this 141 * mode. This string is converted to a bit-mask. 142 */ Mode(String encodedChars)143 Mode(String encodedChars) { 144 long encodeMask = 0; 145 for (int i = 0, n = encodedChars.length(); i < n; ++i) { 146 encodeMask |= 1L << encodedChars.charAt(i); 147 } 148 _validMask = BASE_VALID_MASK | ((-1L << ' ') & ~(encodeMask)); 149 } 150 151 /** 152 * Accessor for {@link #_validMask}. 153 * 154 * @return {@link #_validMask} 155 */ validMask()156 long validMask() { 157 return _validMask; 158 } 159 } 160 161 /** 162 * Character to use as a replacement for invalid characters (Not to be 163 * confused with characters that require encoding). Invalid characters have 164 * no encoding, and are not allowed in the context. 165 */ 166 static final char INVALID_CHARACTER_REPLACEMENT = ' '; 167 168 /** 169 * The mask of valid characters extracted from the mode for efficiency. 170 */ 171 private final long _validMask; 172 /** 173 * The mode of operation--only really stored to provide a relevant toString 174 * implementation. 175 */ 176 private final Mode _mode; 177 178 /** 179 * Default constructor--equivalent to XMLEncoder(Mode.ALL). 180 */ XMLEncoder()181 XMLEncoder() { 182 this(Mode.ALL); 183 } 184 185 /** 186 * Creates an XMLEncoder for the specified mode constant. 187 * 188 * @param mode the mode of the encoder. 189 */ XMLEncoder(Mode mode)190 XMLEncoder(Mode mode) { 191 _mode = mode; 192 _validMask = mode.validMask(); 193 } 194 195 @Override maxEncodedLength(int n)196 public int maxEncodedLength(int n) { 197 // "&" = 5 chars. 198 return n * MAX_ENCODED_CHAR_LENGTH; 199 } 200 201 @Override firstEncodedOffset(String input, int off, int len)202 public int firstEncodedOffset(String input, int off, int len) { 203 final int n = off + len; 204 205 for (int i = off; i < n; ++i) { 206 char ch = input.charAt(i); 207 if (ch < Unicode.DEL) { 208 if (ch <= '>' && (_validMask & (1L << ch)) == 0) { 209 // either needs encoding or is invalid 210 return i; 211 // } else { 212 // // valid 213 } 214 } else if (ch < Character.MIN_HIGH_SURROGATE) { 215 if (ch <= Unicode.MAX_C1_CTRL_CHAR && ch != Unicode.NEL) { 216 return i; 217 // } else { 218 // // valid 219 } 220 } else if (ch <= Character.MAX_HIGH_SURROGATE) { 221 if (i + 1 < n && Character.isLowSurrogate(input.charAt(i + 1))) { 222 int cp = Character.toCodePoint(ch, input.charAt(i + 1)); 223 if (Unicode.isNonCharacter(cp)) { 224 // noncharacter 225 return i; 226 } 227 ++i; 228 } else { 229 return i; 230 } 231 } else if (ch <= Character.MAX_LOW_SURROGATE 232 || ch > '\ufffd' 233 || ('\ufdd0' <= ch && ch <= '\ufdef')) 234 { 235 return i; 236 // } else { 237 // // valid 238 } 239 } 240 241 return n; 242 } 243 244 /** 245 * {@inheritDoc} 246 */ encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput)247 protected CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) { 248 final char[] in = input.array(); 249 final char[] out = output.array(); 250 int i = input.arrayOffset() + input.position(); 251 final int n = input.arrayOffset() + input.limit(); 252 int j = output.arrayOffset() + output.position(); 253 final int m = output.arrayOffset() + output.limit(); 254 255 for (; i < n; ++i) { 256 final char ch = in[i]; 257 if (ch < Unicode.DEL) { 258 if (ch > '>' || ((_validMask & (1L << ch)) != 0)) { 259 // Common case ('>' .. '~') reached in two branches 260 if (j >= m) { 261 return overflow(input, i, output, j); 262 } 263 out[j++] = ch; 264 } else { 265 switch (ch) { 266 case '&': 267 if (j + AMP_LENGTH > m) { 268 return overflow(input, i, output, j); 269 } 270 out[j++] = '&'; 271 out[j++] = 'a'; 272 out[j++] = 'm'; 273 out[j++] = 'p'; 274 out[j++] = ';'; 275 break; 276 case '<': 277 if (j + LT_LENGTH > m) { 278 return overflow(input, i, output, j); 279 } 280 out[j++] = '&'; 281 out[j++] = 'l'; 282 out[j++] = 't'; 283 out[j++] = ';'; 284 break; 285 case '>': 286 if (j + GT_LENGTH > m) { 287 return overflow(input, i, output, j); 288 } 289 out[j++] = '&'; 290 out[j++] = 'g'; 291 out[j++] = 't'; 292 out[j++] = ';'; 293 break; 294 case '\'': 295 // ' is valid in XML, but not in HTML, and numeric code is shorter 296 if (j + APOS_LENGTH > m) { 297 return overflow(input, i, output, j); 298 } 299 out[j++] = '&'; 300 out[j++] = '#'; 301 out[j++] = '3'; 302 out[j++] = '9'; 303 out[j++] = ';'; 304 break; 305 case '\"': 306 // " is valid in XML and HTML, but numeric code is shorter 307 if (j + QUOT_LENGTH > m) { 308 return overflow(input, i, output, j); 309 } 310 out[j++] = '&'; 311 out[j++] = '#'; 312 out[j++] = '3'; 313 out[j++] = '4'; 314 out[j++] = ';'; 315 break; 316 default: 317 // invalid character 318 if (j >= m) { 319 return overflow(input, i, output, j); 320 } 321 out[j++] = INVALID_CHARACTER_REPLACEMENT; 322 break; 323 } 324 } 325 } else if (ch < Character.MIN_HIGH_SURROGATE) { 326 if (j >= m) { 327 return overflow(input, i, output, j); 328 } 329 if (ch > Unicode.MAX_C1_CTRL_CHAR || ch == Unicode.NEL) { 330 out[j++] = ch; 331 } else { 332 // C1 control code 333 out[j++] = INVALID_CHARACTER_REPLACEMENT; 334 } 335 } else if (ch <= Character.MAX_HIGH_SURROGATE) { 336 if (i + 1 < n) { 337 if (Character.isLowSurrogate(in[i + 1])) { 338 int cp = Character.toCodePoint(ch, in[i + 1]); 339 if (Unicode.isNonCharacter(cp)) { 340 // noncharacter 341 if (j >= m) { 342 return overflow(input, i, output, j); 343 } 344 out[j++] = INVALID_CHARACTER_REPLACEMENT; 345 ++i; 346 } else { 347 if (j + 1 >= m) { 348 return overflow(input, i, output, j); 349 } 350 out[j++] = ch; 351 out[j++] = in[++i]; 352 } 353 } else { 354 // high without low 355 if (j >= m) { 356 return overflow(input, i, output, j); 357 } 358 out[j++] = INVALID_CHARACTER_REPLACEMENT; 359 } 360 } else if (endOfInput) { 361 // end of input, high without low = invalid 362 if (j >= m) { 363 return overflow(input, i, output, j); 364 } 365 out[j++] = INVALID_CHARACTER_REPLACEMENT; 366 } else { 367 break; 368 } 369 } else if (// low surrogate without preceding high surrogate 370 ch <= Character.MAX_LOW_SURROGATE 371 // or non-characters 372 || ch > '\ufffd' 373 || ('\ufdd0' <= ch && ch <= '\ufdef')) 374 { 375 if (j >= m) { 376 return overflow(input, i, output, j); 377 } 378 out[j++] = INVALID_CHARACTER_REPLACEMENT; 379 } else { 380 if (j >= m) { 381 return overflow(input, i, output, j); 382 } 383 out[j++] = ch; 384 } 385 } 386 387 return underflow(input, i, output, j); 388 } 389 390 @Override toString()391 public String toString() { 392 return "XMLEncoder(" + _mode + ")"; 393 } 394 } 395