// Copyright (c) 2012 Jeff Ichnowski
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
//     * Redistributions of source code must retain the above
//       copyright notice, this list of conditions and the following
//       disclaimer.
//
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials
//       provided with the distribution.
//
//     * Neither the name of the OWASP nor the names of its
//       contributors may be used to endorse or promote products
//       derived from this software without specific prior written
//       permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
// OF THE POSSIBILITY OF SUCH DAMAGE.
package org.owasp.encoder;

import java.nio.CharBuffer;
import java.nio.charset.CoderResult;

/**
 * CDATAEncoder -- encoder for CDATA sections. CDATA sections are generally
 * good for including large blocks of text that contain characters that
 * normally require encoding (ampersand, quotes, less-than, etc...). The CDATA
 * context however still does not allow invalid characters, and can be closed
 * by the sequence {@code "]]>"}. This encoder replaces invalid XML characters,
 * and encodes {@code "]]>"} (to {@code "]]]]><![CDATA[>"}). The result is that
 * the data integrity is maintained, but the code receiving the output will
 * have to handle multiple CDATA events. As an alternate approach, the caller
 * could pre-encode {@code "]]>"} to something of their choosing (e.g.
 * {@code data.replaceAll("\\]\\]>", "]] >")}), then use this encoder to
 * replace any invalid XML characters.
 *
 * @author Jeff Ichnowski
 */
class CDATAEncoder extends Encoder {

    /**
     * The encoding of {@code "]]>"}.
     */
    private static final char[] CDATA_END_ENCODED
        = "]]]]><![CDATA[>".toCharArray();

    /**
     * Length of {@code "]]]]><![CDATA[>"}. Derived from the array so the two
     * can never disagree.
     */
    private static final int CDATA_END_ENCODED_LENGTH = CDATA_END_ENCODED.length;

    /**
     * Length of {@code "]]>"}.
     */
    private static final int CDATA_END_LENGTH = 3;

    /**
     * Returns an upper bound on the encoded length of an input of {@code n}
     * characters. The only expansion this encoder performs is
     * {@code "]]>"} to {@code "]]]]><![CDATA[>"}; every other character maps
     * to exactly one output character.
     *
     * @param n the number of input characters
     * @return the maximum number of output characters for that input
     */
    @Override
    protected int maxEncodedLength(int n) {
        // "]" becomes "]" (1 -> 1)
        // "]]" becomes "]]" (2 -> 2)
        // "]]>" becomes "]]]]><![CDATA[>" (3 -> 15)
        // "]]>]" becomes "]]]]><![CDATA[>]" (3 -> 15 + 1 -> 1)
        // ...
        final int worstCase = n / CDATA_END_LENGTH;
        final int remainder = n % CDATA_END_LENGTH;

        return worstCase * CDATA_END_ENCODED_LENGTH + remainder;
    }

    /**
     * Scans {@code input} in the range {@code [off, off + len)} for the first
     * character that cannot be copied to the output unchanged.
     *
     * @param input the string to scan
     * @param off index of the first character to examine
     * @param len number of characters to examine
     * @return the index of the first character needing encoding, or
     *         {@code off + len} when the whole range can be passed through.
     *         For a {@code "]]>"} sequence the returned index is that of the
     *         first of the two {@code ']'} immediately preceding the
     *         {@code '>'}.
     */
    @Override
    protected int firstEncodedOffset(String input, int off, int len) {
        final int n = off + len;
        for (int i = off; i < n; ++i) {
            final char ch = input.charAt(i);
            if (ch <= Unicode.MAX_ASCII) {
                if (ch != ']') {
                    // control characters other than LF, CR and TAB are invalid
                    if (ch < ' ' && ch != '\n' && ch != '\r' && ch != '\t') {
                        return i;
                    }
                } else if (i + 1 < n) {
                    // "]x" needs no action; only "]]" can start "]]>"
                    if (input.charAt(i + 1) == ']') {
                        // "]]?" -- skip ahead so that i addresses the last
                        // two ']' of the run
                        while (i + 2 < n && input.charAt(i + 2) == ']') {
                            ++i;
                        }
                        // at this point we've looped through a sequence
                        // of 2 or more "]", if the next character is ">"
                        // we need to encode "]]>".
                        if (i + 2 < n) {
                            if (input.charAt(i + 2) == '>') {
                                return i;
                            }
                        } else {
                            // the run of ']' reaches the end of the range
                            return n;
                        }
                    }
                } else {
                    // single ']' is the last character of the range
                    return n;
                }
            } else if (ch < Character.MIN_HIGH_SURROGATE) {
                // C1 control codes (except NEL) are invalid
                if (ch <= Unicode.MAX_C1_CTRL_CHAR && ch != Unicode.NEL) {
                    return i;
                }
            } else if (ch <= Character.MAX_HIGH_SURROGATE) {
                if (i + 1 < n) {
                    if (Character.isLowSurrogate(input.charAt(i + 1))) {
                        final int cp = Character.toCodePoint(ch, input.charAt(i + 1));
                        if (Unicode.isNonCharacter(cp)) {
                            return i;
                        }
                        // valid pair, step over the low surrogate
                        ++i;
                    } else {
                        // high without low
                        return i;
                    }
                } else {
                    // end of input, high without low = invalid
                    return i;
                }
            } else if (// low surrogate without preceding high surrogate
                ch <= Character.MAX_LOW_SURROGATE
                    // or non-characters
                    || ch > '\ufffd'
                    || ('\ufdd0' <= ch && ch <= '\ufdef')) {
                return i;
            }
        }
        return n;
    }

    /**
     * Encodes characters from the backing array of {@code input} into the
     * backing array of {@code output}. Invalid characters are replaced with
     * {@code XMLEncoder.INVALID_CHARACTER_REPLACEMENT} and {@code "]]>"} is
     * written as {@code "]]]]><![CDATA[>"}.
     *
     * <p>When {@code endOfInput} is {@code false} and the input ends with
     * characters whose handling depends on what follows (one or two trailing
     * {@code ']'}, or a lone high surrogate), those characters are left
     * unconsumed and the method finishes through {@code underflow}.</p>
     *
     * @param input source buffer; must be array-backed
     * @param output destination buffer; must be array-backed
     * @param endOfInput {@code true} if no further input will follow
     * @return the result of {@code overflow(...)} when the output has no room
     *         for the next encoded unit, otherwise the result of
     *         {@code underflow(...)} (both helpers are inherited and are
     *         presumably what record the reached positions in the buffers)
     */
    @Override
    protected CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) {
        final char[] in = input.array();
        final char[] out = output.array();
        int i = input.arrayOffset() + input.position();
        final int n = input.arrayOffset() + input.limit();
        int j = output.arrayOffset() + output.position();
        final int m = output.arrayOffset() + output.limit();

        for (; i < n; ++i) {
            final char ch = in[i];
            if (ch <= Unicode.MAX_ASCII) {
                if (ch != ']') {
                    if (j >= m) {
                        return overflow(input, i, output, j);
                    }
                    if (ch >= ' ' || ch == '\n' || ch == '\r' || ch == '\t') {
                        out[j++] = ch;
                    } else {
                        out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
                    }
                } else if (i + 1 < n) {
                    if (in[i + 1] != ']') {
                        // "]x" (next character is safe for this to be ']')
                        if (j >= m) {
                            return overflow(input, i, output, j);
                        }
                        out[j++] = ']';
                    } else {
                        // "]]?"
                        // keep looping through ']', emitting all but the
                        // last two of the run
                        while (i + 2 < n && in[i + 2] == ']') {
                            if (j >= m) {
                                return overflow(input, i, output, j);
                            }
                            out[j++] = ']';
                            ++i;
                        }
                        // at this point we've looped through a sequence
                        // of 2 or more "]", if the next character is ">"
                        // we need to encode "]]>".
                        if (i + 2 < n) {
                            if (in[i + 2] == '>') {
                                if (j + CDATA_END_ENCODED_LENGTH > m) {
                                    return overflow(input, i, output, j);
                                }
                                System.arraycopy(CDATA_END_ENCODED, 0, out, j, CDATA_END_ENCODED_LENGTH);
                                j += CDATA_END_ENCODED_LENGTH;
                                // consume "]]>"; the loop increment covers
                                // the third character
                                i += 2;
                            } else {
                                // "]]x": emit the first ']' now, the second
                                // is handled as "]x" on the next iteration
                                if (j >= m) {
                                    return overflow(input, i, output, j);
                                }
                                out[j++] = ']';
                            }
                        } else if (endOfInput) {
                            // input ends in "]]" and nothing more can follow
                            if (j + 2 > m) {
                                return overflow(input, i, output, j);
                            }
                            out[j++] = ']';
                            out[j++] = ']';
                            i = n;
                            break;
                        } else {
                            // a '>' may still arrive; leave "]]" unconsumed
                            break;
                        }
                    }
                } else if (endOfInput) {
                    // seen "]", then end of input.
                    if (j >= m) {
                        return overflow(input, i, output, j);
                    }
                    out[j++] = ']';
                    i++;
                    break;
                } else {
                    // more input may follow; leave the "]" unconsumed
                    break;
                }
            } else if (ch < Character.MIN_HIGH_SURROGATE) {
                if (j >= m) {
                    return overflow(input, i, output, j);
                }
                if (ch > Unicode.MAX_C1_CTRL_CHAR || ch == Unicode.NEL) {
                    out[j++] = ch;
                } else {
                    // C1 control code
                    out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
                }
            } else if (ch <= Character.MAX_HIGH_SURROGATE) {
                if (i + 1 < n) {
                    if (Character.isLowSurrogate(in[i + 1])) {
                        final int cp = Character.toCodePoint(ch, in[i + 1]);
                        if (Unicode.isNonCharacter(cp)) {
                            // the pair collapses to a single replacement
                            if (j >= m) {
                                return overflow(input, i, output, j);
                            }
                            out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
                            ++i;
                        } else {
                            // valid pair needs two output slots
                            if (j + 1 >= m) {
                                return overflow(input, i, output, j);
                            }
                            out[j++] = ch;
                            out[j++] = in[++i];
                        }
                    } else {
                        // high without low
                        if (j >= m) {
                            return overflow(input, i, output, j);
                        }
                        out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
                    }
                } else if (endOfInput) {
                    // end of input, high without low = invalid
                    if (j >= m) {
                        return overflow(input, i, output, j);
                    }
                    out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
                } else {
                    // the low surrogate may still arrive; leave unconsumed
                    break;
                }
            } else {
                if (j >= m) {
                    return overflow(input, i, output, j);
                }
                if (// low surrogate without preceding high surrogate
                    ch <= Character.MAX_LOW_SURROGATE
                        // or non-characters
                        || ch > '\ufffd'
                        || ('\ufdd0' <= ch && ch <= '\ufdef')) {
                    out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
                } else {
                    out[j++] = ch;
                }
            }
        }
        return underflow(input, i, output, j);
    }

    /**
     * Returns the fixed name of this encoder.
     *
     * @return the string {@code "CDATAEncoder"}
     */
    @Override
    public String toString() {
        return "CDATAEncoder";
    }
}