001    // Copyright (c) 2011, Mike Samuel
002    // All rights reserved.
003    //
004    // Redistribution and use in source and binary forms, with or without
005    // modification, are permitted provided that the following conditions
006    // are met:
007    //
008    // Redistributions of source code must retain the above copyright
009    // notice, this list of conditions and the following disclaimer.
010    // Redistributions in binary form must reproduce the above copyright
011    // notice, this list of conditions and the following disclaimer in the
012    // documentation and/or other materials provided with the distribution.
013    // Neither the name of the OWASP nor the names of its contributors may
014    // be used to endorse or promote products derived from this software
015    // without specific prior written permission.
016    // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017    // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018    // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019    // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020    // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021    // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022    // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023    // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024    // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025    // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026    // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027    // POSSIBILITY OF SUCH DAMAGE.
028    
029    package org.owasp.html;
030    
031    import java.util.LinkedList;
032    import java.util.List;
033    import javax.annotation.Nullable;
034    
035    import com.google.common.collect.Lists;
036    
037    /**
038     * Consumes an HTML stream, and dispatches events to a policy object which
039     * decides which elements and attributes to allow.
040     */
041    public final class HtmlSanitizer {
042    
043      /**
044       * Receives events based on the HTML stream, and applies a policy to decide
045       * what HTML constructs to allow.
046       * Typically, implementations use an {@link HtmlStreamRenderer} to produce
047       * the sanitized output.
048       *
049       * <p>
050       * <b>Implementations of this class are in the TCB.</b></p>
051       */
052      @TCB
053      public interface Policy extends HtmlStreamEventReceiver {
054        /**
055         * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input.
056         *
057         * @param elementName a normalized (lower-case for non-namespaced names)
058         *     element name.
059         * @param attrs a list of alternating attribute name and value pairs.
060         *     For efficiency, this list may be mutated by this during this method
061         *     call, but ownership reverts to the caller on method exit.
062         *     The values are raw -- HTML entities have been decoded.
063         *     Specifically, implementations are allowed to use a list iterator
064         *     and remove all disallowed attributes, add necessary attributes, and
065         *     then pass the list to an {@link HtmlStreamRenderer}.
066         */
067        void openTag(String elementName, List<String> attrs);
068    
069        /**
070         * Called when an HTML tag like {@code </foo>} is seen in the input.
071         *
072         * @param elementName a normalized (lower-case for non-namespaced names)
073         *     element name.
074         */
075        void closeTag(String elementName);
076    
077        /**
078         * Called when textual content is seen.
079         * @param textChunk raw content -- HTML entities have been decoded.
080         */
081        void text(String textChunk);
082      }
083    
084      /**
085       * Sanitizes the given HTML by applying the given policy to it.
086       *
087       * <p>
088       * This method is not in the TCB.
089       *
090       * <p>
091       * This method has no return value since policies are assumed to render things
092       * they accept and do nothing on things they reject.
093       * Use {@link HtmlStreamRenderer} to render content to an output buffer.
094       *
095       * @param html A snippet of HTML to sanitize.  {@code null} is treated as the
096       *     empty string and will not result in a {@code NullPointerException}.
097       * @param policy The Policy that will receive events based on the tokens in
098       *     HTML.  Typically, this policy ends up routing the events to an
099       *     {@link HtmlStreamRenderer} after filtering.
100       *     {@link HtmlPolicyBuilder} provides an easy way to create policies.
101       */
102      public static void sanitize(@Nullable String html, final Policy policy) {
103        if (html == null) { html = ""; }
104    
105        TagBalancingHtmlStreamEventReceiver balancer
106            = new TagBalancingHtmlStreamEventReceiver(policy);
107    
108        // According to Opera the maximum table nesting depth seen in the wild is
109        // 795, but 99.99% of documents have a table nesting depth of less than 22.
110        // Since each table has a nesting depth of 4 (incl. TBODY), this leads to a
111        // document depth of 90 (incl. HTML & BODY).
112        // Obviously table nesting depth is not the same as whole document depth,
113        // but it is the best proxy I have available.
114        // See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for
115        // the original data.
116    
117        // Webkit defines the maximum HTML parser tree depth as 512.
118        // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
119        // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;
120    
121        // The first number gives us a lower bound on the nesting depth we allow,
122        // 90, and the second gives us an upper bound: 512.
123        // We do not want to bump right up against that limit.
124        // 256 is substantially larger than the lower bound and well clear of the
125        // upper bound.
126        balancer.setNestingLimit(256);
127    
128        balancer.openDocument();
129    
130        HtmlLexer lexer = new HtmlLexer(html);
131        // Use a linked list so that policies can use Iterator.remove() in an O(1)
132        // way.
133        LinkedList<String> attrs = Lists.newLinkedList();
134        while (lexer.hasNext()) {
135          HtmlToken token = lexer.next();
136          switch (token.type) {
137            case TEXT:
138              balancer.text(
139                  Encoding.decodeHtml(html.substring(token.start, token.end)));
140              break;
141            case UNESCAPED:
142              balancer.text(Encoding.stripBannedCodeunits(
143                  html.substring(token.start, token.end)));
144              break;
145            case TAGBEGIN:
146              if (html.charAt(token.start + 1) == '/') {  // A close tag.
147                balancer.closeTag(HtmlLexer.canonicalName(
148                    html.substring(token.start + 2, token.end)));
149                while (lexer.hasNext()
150                       && lexer.next().type != HtmlTokenType.TAGEND) {
151                  // skip tokens until we see a ">"
152                }
153              } else {
154                attrs.clear();
155    
156                boolean attrsReadyForName = true;
157                tagBody:
158                while (lexer.hasNext()) {
159                  HtmlToken tagBodyToken = lexer.next();
160                  switch (tagBodyToken.type) {
161                    case ATTRNAME:
162                      if (!attrsReadyForName) {
163                        // Last attribute added was valueless.
164                        attrs.add(attrs.getLast());
165                      } else {
166                        attrsReadyForName = false;
167                      }
168                      attrs.add(HtmlLexer.canonicalName(
169                          html.substring(tagBodyToken.start, tagBodyToken.end)));
170                      break;
171                    case ATTRVALUE:
172                      attrs.add(Encoding.decodeHtml(stripQuotes(
173                          html.substring(tagBodyToken.start, tagBodyToken.end))));
174                      attrsReadyForName = true;
175                      break;
176                    case TAGEND:
177                      break tagBody;
178                    default:
179                      // Just drop anything not recognized
180                  }
181                }
182                if (!attrsReadyForName) {
183                  attrs.add(attrs.getLast());
184                }
185                balancer.openTag(
186                    HtmlLexer.canonicalName(
187                        html.substring(token.start + 1, token.end)),
188                    attrs);
189              }
190              break;
191            default:
192              // Ignore comments, XML prologues, processing instructions, and other
193              // stuff that shouldn't show up in the output.
194              break;
195          }
196        }
197    
198        balancer.closeDocument();
199      }
200    
201      private static String stripQuotes(String encodedAttributeValue) {
202        int n = encodedAttributeValue.length();
203        if (n > 0) {
204          char last = encodedAttributeValue.charAt(n - 1);
205          if (last == '"' || last == '\'') {
206            int start = 0;
207            if (n != 1 && last == encodedAttributeValue.charAt(0)) {
208              start = 1;
209            } else {
210              // Browsers deal with missing left quotes : <img src=foo.png">
211              // but generally do not deal with missing right : <img src="foo.png>
212            }
213            return encodedAttributeValue.substring(start, n - 1);
214          }
215        }
216        return encodedAttributeValue;
217      }
218    
219    }