// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
28 
29 package org.owasp.html;
30 
31 import java.util.LinkedList;
32 import java.util.List;
33 import javax.annotation.Nullable;
34 
35 import com.google.common.collect.Lists;
36 
37 /**
38  * Consumes an HTML stream, and dispatches events to a policy object which
39  * decides which elements and attributes to allow.
40  */
41 public final class HtmlSanitizer {
42 
43   /**
44    * Receives events based on the HTML stream, and applies a policy to decide
45    * what HTML constructs to allow.
46    * Typically, implementations use an {@link HtmlStreamRenderer} to produce
47    * the sanitized output.
48    *
49    * <p>
50    * <b>Implementations of this class are in the TCB.</b></p>
51    */
52   @TCB
53   public interface Policy extends HtmlStreamEventReceiver {
54     /**
55      * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input.
56      *
57      * @param elementName a normalized (lower-case for non-namespaced names)
58      *     element name.
59      * @param attrs a list of alternating attribute name and value pairs.
60      *     For efficiency, this list may be mutated by this during this method
61      *     call, but ownership reverts to the caller on method exit.
62      *     The values are raw -- HTML entities have been decoded.
63      *     Specifically, implementations are allowed to use a list iterator
64      *     and remove all disallowed attributes, add necessary attributes, and
65      *     then pass the list to an {@link HtmlStreamRenderer}.
66      */
openTag(String elementName, List<String> attrs)67     void openTag(String elementName, List<String> attrs);
68 
69     /**
70      * Called when an HTML tag like {@code </foo>} is seen in the input.
71      *
72      * @param elementName a normalized (lower-case for non-namespaced names)
73      *     element name.
74      */
closeTag(String elementName)75     void closeTag(String elementName);
76 
77     /**
78      * Called when textual content is seen.
79      * @param textChunk raw content -- HTML entities have been decoded.
80      */
text(String textChunk)81     void text(String textChunk);
82   }
83 
84   /**
85    * Sanitizes the given HTML by applying the given policy to it.
86    *
87    * <p>
88    * This method is not in the TCB.
89    *
90    * <p>
91    * This method has no return value since policies are assumed to render things
92    * they accept and do nothing on things they reject.
93    * Use {@link HtmlStreamRenderer} to render content to an output buffer.
94    *
95    * @param html A snippet of HTML to sanitize.  {@code null} is treated as the
96    *     empty string and will not result in a {@code NullPointerException}.
97    * @param policy The Policy that will receive events based on the tokens in
98    *     HTML.  Typically, this policy ends up routing the events to an
99    *     {@link HtmlStreamRenderer} after filtering.
100    *     {@link HtmlPolicyBuilder} provides an easy way to create policies.
101    */
sanitize(@ullable String html, final Policy policy)102   public static void sanitize(@Nullable String html, final Policy policy) {
103     if (html == null) { html = ""; }
104 
105     TagBalancingHtmlStreamEventReceiver balancer
106         = new TagBalancingHtmlStreamEventReceiver(policy);
107 
108     // According to Opera the maximum table nesting depth seen in the wild is
109     // 795, but 99.99% of documents have a table nesting depth of less than 22.
110     // Since each table has a nesting depth of 4 (incl. TBODY), this leads to a
111     // document depth of 90 (incl. HTML & BODY).
112     // Obviously table nesting depth is not the same as whole document depth,
113     // but it is the best proxy I have available.
114     // See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for
115     // the original data.
116 
117     // Webkit defines the maximum HTML parser tree depth as 512.
118     // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
119     // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;
120 
121     // The first number gives us a lower bound on the nesting depth we allow,
122     // 90, and the second gives us an upper bound: 512.
123     // We do not want to bump right up against that limit.
124     // 256 is substantially larger than the lower bound and well clear of the
125     // upper bound.
126     balancer.setNestingLimit(256);
127 
128     balancer.openDocument();
129 
130     HtmlLexer lexer = new HtmlLexer(html);
131     // Use a linked list so that policies can use Iterator.remove() in an O(1)
132     // way.
133     LinkedList<String> attrs = Lists.newLinkedList();
134     while (lexer.hasNext()) {
135       HtmlToken token = lexer.next();
136       switch (token.type) {
137         case TEXT:
138           balancer.text(
139               Encoding.decodeHtml(html.substring(token.start, token.end)));
140           break;
141         case UNESCAPED:
142           balancer.text(Encoding.stripBannedCodeunits(
143               html.substring(token.start, token.end)));
144           break;
145         case TAGBEGIN:
146           if (html.charAt(token.start + 1) == '/') {  // A close tag.
147             balancer.closeTag(HtmlLexer.canonicalName(
148                 html.substring(token.start + 2, token.end)));
149             while (lexer.hasNext()
150                    && lexer.next().type != HtmlTokenType.TAGEND) {
151               // skip tokens until we see a ">"
152             }
153           } else {
154             attrs.clear();
155 
156             boolean attrsReadyForName = true;
157             tagBody:
158             while (lexer.hasNext()) {
159               HtmlToken tagBodyToken = lexer.next();
160               switch (tagBodyToken.type) {
161                 case ATTRNAME:
162                   if (!attrsReadyForName) {
163                     // Last attribute added was valueless.
164                     attrs.add(attrs.getLast());
165                   } else {
166                     attrsReadyForName = false;
167                   }
168                   attrs.add(HtmlLexer.canonicalName(
169                       html.substring(tagBodyToken.start, tagBodyToken.end)));
170                   break;
171                 case ATTRVALUE:
172                   attrs.add(Encoding.decodeHtml(stripQuotes(
173                       html.substring(tagBodyToken.start, tagBodyToken.end))));
174                   attrsReadyForName = true;
175                   break;
176                 case TAGEND:
177                   break tagBody;
178                 default:
179                   // Just drop anything not recognized
180               }
181             }
182             if (!attrsReadyForName) {
183               attrs.add(attrs.getLast());
184             }
185             balancer.openTag(
186                 HtmlLexer.canonicalName(
187                     html.substring(token.start + 1, token.end)),
188                 attrs);
189           }
190           break;
191         default:
192           // Ignore comments, XML prologues, processing instructions, and other
193           // stuff that shouldn't show up in the output.
194           break;
195       }
196     }
197 
198     balancer.closeDocument();
199   }
200 
stripQuotes(String encodedAttributeValue)201   private static String stripQuotes(String encodedAttributeValue) {
202     int n = encodedAttributeValue.length();
203     if (n > 0) {
204       char last = encodedAttributeValue.charAt(n - 1);
205       if (last == '"' || last == '\'') {
206         int start = 0;
207         if (n != 1 && last == encodedAttributeValue.charAt(0)) {
208           start = 1;
209         } else {
210           // Browsers deal with missing left quotes : <img src=foo.png">
211           // but generally do not deal with missing right : <img src="foo.png>
212         }
213         return encodedAttributeValue.substring(start, n - 1);
214       }
215     }
216     return encodedAttributeValue;
217   }
218 
219 }
220