// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html;

import java.util.LinkedList;
import java.util.List;
import javax.annotation.Nullable;

import com.google.common.collect.Lists;

/**
 * Consumes an HTML stream, and dispatches events to a policy object which
 * decides which elements and attributes to allow.
 */
public final class HtmlSanitizer {

  // According to Opera the maximum table nesting depth seen in the wild is
  // 795, but 99.99% of documents have a table nesting depth of less than 22.
  // Since each table has a nesting depth of 4 (incl. TBODY), this leads to a
  // document depth of 90 (incl. HTML & BODY).
  // Obviously table nesting depth is not the same as whole document depth,
  // but it is the best proxy I have available.
  // See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for
  // the original data.

  // Webkit defines the maximum HTML parser tree depth as 512.
  // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
  // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;

  // The first number gives us a lower bound on the nesting depth we allow,
  // 90, and the second gives us an upper bound: 512.
  // We do not want to bump right up against that limit.
  // 256 is substantially larger than the lower bound and well clear of the
  // upper bound.
  /** Nesting limit handed to the tag balancer by {@link #sanitize}. */
  private static final int NESTING_LIMIT = 256;

  /**
   * Receives events based on the HTML stream, and applies a policy to decide
   * what HTML constructs to allow.
   * Typically, implementations use an {@link HtmlStreamRenderer} to produce
   * the sanitized output.
   *
   * <p>
   * <b>Implementations of this class are in the TCB.</b></p>
   */
  @TCB
  public interface Policy extends HtmlStreamEventReceiver {
    /**
     * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input.
     *
     * @param elementName a normalized (lower-case for non-namespaced names)
     *     element name.
     * @param attrs a list of alternating attribute name and value pairs.
     *     For efficiency, this list may be mutated by this policy during this
     *     method call, but ownership reverts to the caller on method exit.
     *     The values are raw -- HTML entities have been decoded.
     *     Specifically, implementations are allowed to use a list iterator
     *     and remove all disallowed attributes, add necessary attributes, and
     *     then pass the list to an {@link HtmlStreamRenderer}.
     */
    void openTag(String elementName, List<String> attrs);

    /**
     * Called when an HTML tag like {@code </foo>} is seen in the input.
     *
     * @param elementName a normalized (lower-case for non-namespaced names)
     *     element name.
     */
    void closeTag(String elementName);

    /**
     * Called when textual content is seen.
     * @param textChunk raw content -- HTML entities have been decoded.
     */
    void text(String textChunk);
  }

  /**
   * Sanitizes the given HTML by applying the given policy to it.
   *
   * <p>
   * This method is not in the TCB.
   *
   * <p>
   * This method has no return value since policies are assumed to render things
   * they accept and do nothing on things they reject.
   * Use {@link HtmlStreamRenderer} to render content to an output buffer.
   *
   * <p>
   * Events are not sent to the policy directly: they pass through a
   * {@link TagBalancingHtmlStreamEventReceiver} that wraps it.
   *
   * @param html A snippet of HTML to sanitize.  {@code null} is treated as the
   *     empty string and will not result in a {@code NullPointerException}.
   * @param policy The Policy that will receive events based on the tokens in
   *     HTML.  Typically, this policy ends up routing the events to an
   *     {@link HtmlStreamRenderer} after filtering.
   *     {@link HtmlPolicyBuilder} provides an easy way to create policies.
   */
  public static void sanitize(@Nullable String html, final Policy policy) {
    if (html == null) { html = ""; }

    TagBalancingHtmlStreamEventReceiver balancer
        = new TagBalancingHtmlStreamEventReceiver(policy);
    balancer.setNestingLimit(NESTING_LIMIT);

    balancer.openDocument();

    HtmlLexer lexer = new HtmlLexer(html);
    // Use a linked list so that policies can use Iterator.remove() in an O(1)
    // way.  The same list instance is cleared and reused for every open tag.
    LinkedList<String> attrs = Lists.newLinkedList();
    while (lexer.hasNext()) {
      HtmlToken token = lexer.next();
      switch (token.type) {
        case TEXT:
          balancer.text(
              Encoding.decodeHtml(html.substring(token.start, token.end)));
          break;
        case UNESCAPED:
          balancer.text(Encoding.stripBannedCodeunits(
              html.substring(token.start, token.end)));
          break;
        case TAGBEGIN:
          if (html.charAt(token.start + 1) == '/') {  // A close tag.
            // Skip the leading "</" to get at the element name.
            balancer.closeTag(HtmlLexer.canonicalName(
                html.substring(token.start + 2, token.end)));
            skipToTagEnd(lexer);
          } else {
            attrs.clear();
            readAttributes(html, lexer, attrs);
            // Skip the leading "<" to get at the element name.
            balancer.openTag(
                HtmlLexer.canonicalName(
                    html.substring(token.start + 1, token.end)),
                attrs);
          }
          break;
        default:
          // Ignore comments, XML prologues, processing instructions, and other
          // stuff that shouldn't show up in the output.
          break;
      }
    }

    balancer.closeDocument();
  }

  /**
   * Consumes tokens from the lexer up to and including the next
   * {@code TAGEND} token, or until the lexer is exhausted.
   *
   * @param lexer the token source, positioned inside a tag.
   */
  private static void skipToTagEnd(HtmlLexer lexer) {
    while (lexer.hasNext()
           && lexer.next().type != HtmlTokenType.TAGEND) {
      // skip tokens until we see a ">"
    }
  }

  /**
   * Reads the body of an open tag, appending alternating attribute names and
   * values to {@code attrs}.  Tokens are consumed up to and including the
   * {@code TAGEND} token, or until the lexer is exhausted.
   *
   * <p>
   * Names are canonicalized via {@link HtmlLexer#canonicalName}; values have
   * their quotes stripped and are then HTML-decoded.  A name that is not
   * followed by a value is paired with a copy of itself.
   * Tokens of any other type are dropped.
   *
   * @param html the text the lexer's token offsets refer to.
   * @param lexer the token source, positioned just past a {@code TAGBEGIN}.
   * @param attrs receives the name/value pairs; not cleared by this method.
   */
  private static void readAttributes(
      String html, HtmlLexer lexer, LinkedList<String> attrs) {
    // False while the last element of attrs is a name still awaiting a value.
    boolean attrsReadyForName = true;
    tagBody:
    while (lexer.hasNext()) {
      HtmlToken tagBodyToken = lexer.next();
      switch (tagBodyToken.type) {
        case ATTRNAME:
          if (!attrsReadyForName) {
            // Last attribute added was valueless.
            attrs.add(attrs.getLast());
          } else {
            attrsReadyForName = false;
          }
          attrs.add(HtmlLexer.canonicalName(
              html.substring(tagBodyToken.start, tagBodyToken.end)));
          break;
        case ATTRVALUE:
          attrs.add(Encoding.decodeHtml(stripQuotes(
              html.substring(tagBodyToken.start, tagBodyToken.end))));
          attrsReadyForName = true;
          break;
        case TAGEND:
          break tagBody;
        default:
          // Just drop anything not recognized
          break;
      }
    }
    if (!attrsReadyForName) {
      // The final attribute was valueless, so use its name as its value.
      attrs.add(attrs.getLast());
    }
  }

  /**
   * Removes the quotes delimiting a still-encoded attribute value.
   *
   * <p>
   * If the value ends with a single or double quote, that quote is removed,
   * along with a matching quote at the start if there is one.
   * A value that does not end with a quote is returned unchanged.
   *
   * @param encodedAttributeValue the attribute value as it appears in the
   *     input, before entity decoding.
   * @return the value without its delimiting quotes.
   */
  private static String stripQuotes(String encodedAttributeValue) {
    int n = encodedAttributeValue.length();
    if (n == 0) { return encodedAttributeValue; }

    char last = encodedAttributeValue.charAt(n - 1);
    if (last != '"' && last != '\'') { return encodedAttributeValue; }

    // Browsers deal with missing left quotes : <img src=foo.png">
    // but generally do not deal with missing right : <img src="foo.png>
    // so the left quote is only dropped when it matches the right one.
    int start = (n != 1 && last == encodedAttributeValue.charAt(0)) ? 1 : 0;
    return encodedAttributeValue.substring(start, n - 1);
  }

}