1 // Copyright (c) 2012 Jeff Ichnowski 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions 6 // are met: 7 // 8 // * Redistributions of source code must retain the above 9 // copyright notice, this list of conditions and the following 10 // disclaimer. 11 // 12 // * Redistributions in binary form must reproduce the above 13 // copyright notice, this list of conditions and the following 14 // disclaimer in the documentation and/or other materials 15 // provided with the distribution. 16 // 17 // * Neither the name of the OWASP nor the names of its 18 // contributors may be used to endorse or promote products 19 // derived from this software without specific prior written 20 // permission. 21 // 22 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 27 // INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 28 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 29 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 31 // STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 33 // OF THE POSSIBILITY OF SUCH DAMAGE. 34 package org.owasp.encoder; 35 36 import java.nio.CharBuffer; 37 import java.nio.charset.CoderResult; 38 39 /** 40 * XMLCommentEncoder -- Encodes for the XML/HTML comment context. The sequence "--" is not allowed in comments, and must be 41 * removed/replaced. We also must be careful of trailing hyphens at end of input, as they could combine with the external comment 42 * ending sequence "-->" to become "--->", which is also invalid. As with all XML-based context, invalid XML characters are not 43 * allowed. 44 * 45 * @author Jeff Ichnowski 46 */ 47 class XMLCommentEncoder extends Encoder { 48 49 /** 50 * This is the character used to replace a hyphen when a sequence of hypens is encountered. 51 */ 52 static final char HYPHEN_REPLACEMENT = '~'; 53 54 // Input: 55 // <!-- foo -- bar --> 56 // Possible Options: 57 // <!-- foo – bar --> 58 // <!-- foo == bar --> 59 // <!-- foo __ bar --> 60 // <!-- foo - - bar --> 61 // <!-- foo \u2010\u2010 bar --> (Unicode Hyphen) 62 // <!-- foo \u2013 bar --> (Unicode en-dash) 63 // Note: HTML comments differ, in that they cannot start with: ">", "->". 64 // On IE, "<!--[if ..." has special interpretation 65 // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] 66 @Override maxEncodedLength(int n)67 protected int maxEncodedLength(int n) { 68 return n; 69 } 70 71 @Override firstEncodedOffset(String input, int off, int len)72 protected int firstEncodedOffset(String input, int off, int len) { 73 final int n = off + len; 74 for (int i = off; i < n; ++i) { 75 char ch = input.charAt(i); 76 if (ch <= Unicode.MAX_ASCII) { 77 if (ch == '-') { 78 if (i + 1 < n) { 79 if (input.charAt(i + 1) == '-') { 80 return i; 81 // } else { 82 // // valid 83 } 84 } else { 85 return i; 86 } 87 } else if (ch < ' ' && ch != '\n' && ch != '\r' && ch != '\t') { 88 return i; 89 // } else { 90 // // valid 91 } 92 } else if (ch < Character.MIN_HIGH_SURROGATE) { 93 if (ch <= Unicode.MAX_C1_CTRL_CHAR && ch != Unicode.NEL) { 94 return i; 95 // } else { 96 // // valid 97 } 98 } else if (ch <= Character.MAX_HIGH_SURROGATE) { 99 if (i + 1 < n && Character.isLowSurrogate(input.charAt(i + 1))) { 100 int cp = Character.toCodePoint(ch, input.charAt(i + 1)); 101 if (Unicode.isNonCharacter(cp)) { 102 // noncharacter 103 return i; 104 } 105 ++i; 106 } else { 107 return i; 108 } 109 } else if (ch <= Character.MAX_LOW_SURROGATE || ch > '\ufffd' 110 || ('\ufdd0' <= ch && ch <= '\ufdef')) 111 { 112 return i; 113 // } else { 114 // // valid 115 } 116 } 117 return n; 118 } 119 120 @Override encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput)121 protected CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) { 122 final char[] in = input.array(); 123 final char[] out = output.array(); 124 int i = input.arrayOffset() + input.position(); 125 final int n = input.arrayOffset() + input.limit(); 126 int j = output.arrayOffset() + output.position(); 127 final int m = output.arrayOffset() + output.limit(); 128 129 for (; i < n; ++i) { 130 char ch = in[i]; 131 if (ch <= Unicode.MAX_ASCII) { 132 if (ch == '-') { 133 if (i + 1 < n) { 134 if (in[i + 1] == '-') { 135 if (j + 1 >= m) { 136 return overflow(input, i, output, j); 137 } 138 out[j++] = '-'; 139 out[j++] = HYPHEN_REPLACEMENT; 140 ++i; 141 } else { 142 if (j >= m) { 143 return overflow(input, i, output, j); 144 } 145 out[j++] = '-'; 146 } 147 } else if (endOfInput) { 148 if (j >= m) { 149 return overflow(input, i, output, j); 150 } 151 out[j++] = HYPHEN_REPLACEMENT; 152 } else { 153 // saw '-' at the end of the buffer, but this is not 154 // end of input, we need to see the next character 155 // before deciding what to do. 156 break; 157 } 158 } else if (ch > ' ' || ch == '\n' || ch == '\r' || ch == '\t') { 159 if (j >= m) { 160 return overflow(input, i, output, j); 161 } 162 out[j++] = ch; 163 } else { 164 if (j >= m) { 165 return overflow(input, i, output, j); 166 } 167 out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT; 168 } 169 } else if (ch < Character.MIN_HIGH_SURROGATE) { 170 if (j >= m) { 171 return overflow(input, i, output, j); 172 } 173 if (ch > Unicode.MAX_C1_CTRL_CHAR || ch == Unicode.NEL) { 174 out[j++] = ch; 175 } else { 176 // C1 control code 177 out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT; 178 } 179 } else if (ch <= Character.MAX_HIGH_SURROGATE) { 180 if (i + 1 < n) { 181 if (Character.isLowSurrogate(in[i + 1])) { 182 int cp = Character.toCodePoint(ch, in[i + 1]); 183 if (Unicode.isNonCharacter(cp)) { 184 // noncharacter 185 if (j >= m) { 186 return overflow(input, i, output, j); 187 } 188 out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT; 189 ++i; 190 } else { 191 if (j + 1 >= m) { 192 return overflow(input, i, output, j); 193 } 194 out[j++] = ch; 195 out[j++] = in[++i]; 196 } 197 } else { 198 // high without low 199 if (j >= m) { 200 return overflow(input, i, output, j); 201 } 202 out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT; 203 } 204 } else if (endOfInput) { 205 // end of input, high without low = invalid 206 if (j >= m) { 207 return overflow(input, i, output, j); 208 } 209 out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT; 210 } else { 211 break; 212 } 213 } else if (// low surrogate without preceding high surrogate 214 ch <= Character.MAX_LOW_SURROGATE 215 // or non-characters 216 || ch > '\ufffd' 217 || ('\ufdd0' <= ch && ch <= '\ufdef')) 218 { 219 if (j >= m) { 220 return overflow(input, i, output, j); 221 } 222 out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT; 223 } else { 224 if (j >= m) { 225 return overflow(input, i, output, j); 226 } 227 out[j++] = ch; 228 } 229 } 230 return underflow(input, i, output, j); 231 } 232 233 @Override toString()234 public String toString() { 235 return "XMLCommentEncoder"; 236 } 237 } 238