1 // Copyright (c) 2012, Mike Samuel 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions 6 // are met: 7 // 8 // Redistributions of source code must retain the above copyright 9 // notice, this list of conditions and the following disclaimer. 10 // Redistributions in binary form must reproduce the above copyright 11 // notice, this list of conditions and the following disclaimer in the 12 // documentation and/or other materials provided with the distribution. 13 // Neither the name of the OWASP nor the names of its contributors may 14 // be used to endorse or promote products derived from this software 15 // without specific prior written permission. 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 // POSSIBILITY OF SUCH DAMAGE. 28 29 package org.owasp.html; 30 31 import java.io.IOException; 32 33 import com.google.common.annotations.VisibleForTesting; 34 35 /** Encoders and decoders for HTML. */ 36 final class Encoding { 37 38 /** 39 * Decodes HTML entities to produce a string containing only valid 40 * Unicode scalar values. 41 */ 42 @VisibleForTesting decodeHtml(String s)43 static String decodeHtml(String s) { 44 int firstAmp = s.indexOf('&'); 45 int safeLimit = longestPrefixOfGoodCodeunits(s); 46 if ((firstAmp & safeLimit) < 0) { return s; } 47 48 StringBuilder sb; 49 { 50 int n = s.length(); 51 sb = new StringBuilder(n); 52 int pos = 0; 53 int amp = firstAmp; 54 while (amp >= 0) { 55 long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n); 56 int end = (int) (endAndCodepoint >>> 32); 57 int codepoint = (int) endAndCodepoint; 58 sb.append(s, pos, amp).appendCodePoint(codepoint); 59 pos = end; 60 amp = s.indexOf('&', end); 61 } 62 sb.append(s, pos, n); 63 } 64 65 stripBannedCodeunits( 66 sb, 67 firstAmp < 0 68 ? safeLimit : safeLimit < 0 69 ? firstAmp : Math.min(firstAmp, safeLimit)); 70 71 return sb.toString(); 72 } 73 74 /** 75 * Returns the portion of its input that consists of XML safe chars. 76 * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a> 77 */ 78 @TCB stripBannedCodeunits(String s)79 static String stripBannedCodeunits(String s) { 80 int safeLimit = longestPrefixOfGoodCodeunits(s); 81 if (safeLimit < 0) { return s; } 82 83 StringBuilder sb = new StringBuilder(s); 84 stripBannedCodeunits(sb, safeLimit); 85 return sb.toString(); 86 } 87 88 /** 89 * Leaves in the input buffer only code-units that comprise XML safe chars. 90 * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a> 91 */ 92 @TCB stripBannedCodeunits(StringBuilder sb)93 static void stripBannedCodeunits(StringBuilder sb) { 94 stripBannedCodeunits(sb, 0); 95 } 96 97 @TCB stripBannedCodeunits(StringBuilder sb, int start)98 private static void stripBannedCodeunits(StringBuilder sb, int start) { 99 int k = start; 100 for (int i = start, n = sb.length(); i < n; ++i) { 101 char ch = sb.charAt(i); 102 if (ch < 0x20) { 103 if (IS_BANNED_ASCII[ch]) { 104 continue; 105 } 106 } else if (0xd800 <= ch) { 107 if (ch <= 0xdfff) { 108 if (i+1 < n) { 109 char next = sb.charAt(i+1); 110 if (Character.isSurrogatePair(ch, next)) { 111 sb.setCharAt(k++, ch); 112 sb.setCharAt(k++, next); 113 ++i; 114 } 115 } 116 continue; 117 } else if ((ch & 0xfffe) == 0xfffe) { 118 continue; 119 } 120 } 121 sb.setCharAt(k++, ch); 122 } 123 sb.setLength(k); 124 } 125 126 /** 127 * The number of code-units at the front of s that form code-points in the 128 * XML Character production. 129 * @return -1 if all of s is in the XML Character production. 130 */ 131 @TCB longestPrefixOfGoodCodeunits(String s)132 private static int longestPrefixOfGoodCodeunits(String s) { 133 int n = s.length(), i; 134 for (i = 0; i < n; ++i) { 135 char ch = s.charAt(i); 136 if (ch < 0x20) { 137 if (IS_BANNED_ASCII[ch]) { 138 return i; 139 } 140 } else if (0xd800 <= ch) { 141 if (ch <= 0xdfff) { 142 if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) { 143 ++i; // Skip over low surrogate since we know it's ok. 144 } else { 145 return i; 146 } 147 } else if ((ch & 0xfffe) == 0xfffe) { 148 return i; 149 } 150 } 151 } 152 return -1; 153 } 154 155 /** 156 * Writes the HTML equivalent of the given plain text to output. 157 * For example, {@code escapeHtmlOnto("1 < 2", w)}, 158 * is equivalent to {@code w.append("1 < 2")} but possibly with fewer 159 * smaller appends. 160 * Elides code-units that are not valid XML Characters. 161 * @see <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">XML Ch. 2.2 - Characters</a> 162 */ 163 @TCB encodeHtmlOnto(String plainText, Appendable output)164 static void encodeHtmlOnto(String plainText, Appendable output) 165 throws IOException { 166 int n = plainText.length(); 167 int pos = 0; 168 for (int i = 0; i < n; ++i) { 169 char ch = plainText.charAt(i); 170 if (ch < REPLACEMENTS.length) { 171 String repl = REPLACEMENTS[ch]; 172 if (repl != null) { 173 output.append(plainText, pos, i).append(repl); 174 pos = i + 1; 175 } 176 } else if (((char) 0xd800) <= ch) { 177 if (ch <= ((char) 0xdfff)) { 178 char next; 179 if (i + 1 < n 180 && Character.isSurrogatePair( 181 ch, next = plainText.charAt(i + 1))) { 182 // Emit supplemental codepoints as entity so that they cannot 183 // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper 184 // and get involved in UTF-16/UCS-2 confusion. 185 int codepoint = Character.toCodePoint(ch, next); 186 output.append(plainText, pos, i); 187 appendNumericEntity(codepoint, output); 188 ++i; 189 pos = i + 1; 190 } else { 191 output.append(plainText, pos, i); 192 // Elide the orphaned surrogate. 193 pos = i + 1; 194 } 195 } else if (0xff00 <= ch) { 196 output.append(plainText, pos, i); 197 pos = i + 1; 198 // Is a control character or possible full-width version of a 199 // special character. 200 if ((ch & 0xfffe) == 0xfffe) { 201 // Elide since not an the XML Character. 202 } else { 203 appendNumericEntity(ch, output); 204 } 205 } 206 } 207 } 208 output.append(plainText, pos, n); 209 } 210 211 @TCB appendNumericEntity(int codepoint, Appendable output)212 static void appendNumericEntity(int codepoint, Appendable output) 213 throws IOException { 214 if (codepoint < 100) { 215 // TODO: is this dead code due to REPLACEMENTS above. 216 output.append("&#"); 217 if (codepoint < 10) { 218 output.append((char) ('0' + codepoint)); 219 } else { 220 output.append((char) ('0' + (codepoint / 10))); 221 output.append((char) ('0' + (codepoint % 10))); 222 } 223 output.append(";"); 224 } else { 225 int nDigits = (codepoint < 0x1000 226 ? codepoint < 0x100 ? 2 : 3 227 : (codepoint < 0x10000 ? 4 228 : codepoint < 0x100000 ? 5 : 6)); 229 output.append("&#x"); 230 for (int digit = nDigits; --digit >= 0;) { 231 int hexDigit = (codepoint >>> (digit << 2)) & 0xf; 232 output.append(HEX_NUMERAL[hexDigit]); 233 } 234 output.append(";"); 235 } 236 } 237 238 private static final char[] HEX_NUMERAL = { 239 '0', '1', '2', '3', '4', '5', '6', '7', 240 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 241 }; 242 243 /** Maps ASCII chars that need to be encoded to an equivalent HTML entity. */ 244 static final String[] REPLACEMENTS = new String[0x61]; 245 static { 246 for (int i = 0; i < ' '; ++i) { 247 // We elide control characters so that we can ensure that our output is 248 // in the intersection of valid HTML5 and XML. According to 249 // http://www.w3.org/TR/2008/REC-xml-20081126/#charsets 250 // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] 251 // | [#xE000-#xFFFD] | [#x10000-#x10FFFF] 252 if (i != '\t' && i != '\n' && i != '\r') { 253 REPLACEMENTS[i] = ""; // Elide 254 } 255 } 256 // """ is shorter than """ 257 REPLACEMENTS['"'] = "&#" + ((int) '"') + ";"; // Attribute delimiter. 258 REPLACEMENTS['&'] = "&"; // HTML special. 259 // We don't use ' since that is not in the intersection of HTML&XML. 260 REPLACEMENTS['\''] = "&#" + ((int) '\'') + ";"; // Attribute delimiter. 261 REPLACEMENTS['+'] = "&#" + ((int) '+') + ";"; // UTF-7 special. 262 REPLACEMENTS['<'] = "<"; // HTML special. 263 REPLACEMENTS['='] = "&#" + ((int) '=') + ";"; // Special in attributes. 264 REPLACEMENTS['>'] = ">"; // HTML special. 265 REPLACEMENTS['@'] = "&#" + ((int) '@') + ";"; // Conditional compilation. 266 REPLACEMENTS['`'] = "&#" + ((int) '`') + ";"; // Attribute delimiter. 267 } 268 269 /** 270 * {@code DECODES_TO_SELF[c]} is true iff the codepoint c decodes to itself in 271 * an HTML5 text node or properly quoted attribute value. 272 */ 273 private static boolean[] IS_BANNED_ASCII = new boolean[0x20]; 274 static { 275 for (int i = 0; i < IS_BANNED_ASCII.length; ++i) { 276 IS_BANNED_ASCII[i] = !(i == '\t' || i == '\n' || i == '\r'); 277 } 278 } 279 280 } 281