// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
28 
29 package org.owasp.html;
30 
31 import junit.framework.TestCase;
32 
33 import javax.annotation.Nullable;
34 
35 import org.junit.Test;
36 
37 
38 public class HtmlSanitizerTest extends TestCase {
39 
40   @Test
testEmpty()41   public static final void testEmpty() throws Exception {
42     assertEquals("", sanitize(""));
43     assertEquals("", sanitize(null));
44   }
45 
46   @Test
testSimpleText()47   public static final void testSimpleText() throws Exception {
48     assertEquals("hello world", sanitize("hello world"));
49   }
50 
51   @Test
testEntities1()52   public static final void testEntities1() throws Exception {
53     assertEquals("<hello world>", sanitize("<hello world>"));
54   }
55 
56   @Test
testEntities2()57   public static final void testEntities2() throws Exception {
58     assertEquals("<b>hello <i>world</i></b>",
59                  sanitize("<b>hello <i>world</i></b>"));
60   }
61 
62   @Test
testUnknownTagsRemoved()63   public static final void testUnknownTagsRemoved() throws Exception {
64     assertEquals("<b>hello <i>world</i></b>",
65                  sanitize("<b>hello <bogus></bogus><i>world</i></b>"));
66   }
67 
68   @Test
testUnsafeTagsRemoved()69   public static final void testUnsafeTagsRemoved() throws Exception {
70     assertEquals("<b>hello <i>world</i></b>",
71                  sanitize("<b>hello <i>world</i>"
72                           + "<script src=foo.js></script></b>"));
73   }
74 
75   @Test
testUnsafeAttributesRemoved()76   public static final void testUnsafeAttributesRemoved() throws Exception {
77     assertEquals(
78         "<b>hello <i>world</i></b>",
79         sanitize("<b>hello <i onclick=\"takeOverWorld(this)\">world</i></b>"));
80   }
81 
82   @Test
testCruftEscaped()83   public static final void testCruftEscaped() throws Exception {
84     assertEquals("<b>hello <i>world&lt;</i></b> &amp; tomorrow the universe",
85                  sanitize(
86                      "<b>hello <i>world<</i></b> & tomorrow the universe"));
87   }
88 
89   @Test
testTagCruftRemoved()90   public static final void testTagCruftRemoved() throws Exception {
91     assertEquals("<b id=\"p-foo\">hello <i>world&lt;</i></b>",
92                  sanitize("<b id=\"foo\" / -->hello <i>world<</i></b>"));
93   }
94 
95   @Test
testIdsAndClassesPrefixed()96   public static final void testIdsAndClassesPrefixed() throws Exception {
97     assertEquals(
98         "<b id=\"p-foo\" class=\"p-boo p-bar p-baz\">"
99         + "hello <i>world&lt;</i></b>",
100         sanitize(
101             "<b id=\"foo\" class=\"boo bar baz\">hello <i>world<</i></b>"));
102   }
103 
104   @Test
testSpecialCharsInAttributes()105   public static final void testSpecialCharsInAttributes() throws Exception {
106     assertEquals(
107         "<b title=\"a&lt;b &amp;&amp; c&gt;b\">bar</b>",
108         sanitize("<b title=\"a<b && c>b\">bar</b>"));
109   }
110 
111   @Test
testUnclosedTags()112   public static final void testUnclosedTags() throws Exception {
113     assertEquals("<div id=\"p-foo\">Bar<br />Baz</div>",
114                  sanitize("<div id=\"foo\">Bar<br>Baz"));
115   }
116 
117   @Test
testUnopenedTags()118   public static final void testUnopenedTags() throws Exception {
119     assertEquals("Foo<b>Bar</b>Baz",
120                  sanitize("Foo<b></select>Bar</b></b>Baz</select>"));
121   }
122 
123   @Test
testUnsafeEndTags()124   public static final void testUnsafeEndTags() throws Exception {
125     assertEquals(
126         "",
127         sanitize(
128             "</meta http-equiv=\"refesh\""
129             + " content=\"1;URL=http://evilgadget.com\">"));
130   }
131 
132   @Test
testEmptyEndTags()133   public static final void testEmptyEndTags() throws Exception {
134     assertEquals("<input />", sanitize("<input></input>"));
135   }
136 
137   @Test
testOnLoadStripped()138   public static final void testOnLoadStripped() throws Exception {
139     assertEquals(
140         "<img />",
141         sanitize("<img src=http://foo.com/bar ONLOAD=alert(1)>"));
142   }
143 
144   @Test
testClosingTagParameters()145   public static final void testClosingTagParameters() throws Exception {
146     assertEquals(
147         "<p>Hello world</p>",
148         sanitize("<p>Hello world</b style=\"width:expression(alert(1))\">"));
149   }
150 
151   @Test
testOptionalEndTags()152   public static final void testOptionalEndTags() throws Exception {
153     // Should not be
154     //     "<ol> <li>A</li> <li>B<li>C </li></li></ol>"
155     // The difference is significant because in the first, the item contains no
156     // space after 'A", but in the third, the item contains 'C' and a space.
157     assertEquals(
158         "<ol><li>A</li><li>B</li><li>C </li></ol>",
159         sanitize("<ol> <li>A</li> <li>B<li>C </ol>"));
160   }
161 
162   @Test
testFoldingOfHtmlAndBodyTags()163   public static final void testFoldingOfHtmlAndBodyTags() throws Exception {
164     assertEquals(
165         "<p>P 1</p>",
166         sanitize("<html><head><title>Foo</title></head>"
167                  + "<body><p>P 1</p></body></html>"));
168     assertEquals(
169         "Hello",
170         sanitize("<body bgcolor=\"blue\">Hello</body>"));
171     assertEquals(
172         "<p>Foo</p><p>One</p><p>Two</p>Three<p>Four</p>",
173         sanitize(
174             "<html>"
175             + "<head>"
176             + "<title>Blah</title>"
177             + "<p>Foo</p>"
178             + "</head>"
179             + "<body>"
180             + "<p>One"
181             + "<p>Two</p>"
182             + "Three"
183             + "<p>Four</p>"
184             + "</body>"
185             + "</html>"));
186   }
187 
188   @Test
testEmptyAndValuelessAttributes()189   public static final void testEmptyAndValuelessAttributes() throws Exception {
190     assertEquals(
191         "<input checked=\"checked\" type=\"checkbox\" id=\"\" class=\"\" />",
192         sanitize("<input checked type=checkbox id=\"\" class=>"));
193   }
194 
195   @Test
testSgmlShortTags()196   public static final void testSgmlShortTags() throws Exception {
197     // We make no attempt to correctly handle SGML short tags since they are
198     // not implemented consistently across browsers, and have been removed from
199     // HTML 5.
200     //
201     // According to http://www.w3.org/QA/2007/10/shorttags.html
202     //      Shorttags - the odd side of HTML 4.01
203     //      ...
204     //      It uses an ill-known feature of SGML called shorthand markup, which
205     //      was authorized in HTML up to HTML 4.01. But what used to be a "cool"
206     //      feature for SGML experts becomes a liability in HTML, where the
207     //      construct is more likely to appear as a typo than as a conscious
208     //      choice.
209     //
210     //      All could be fine if this form typo-that-happens-to-be-legal was
211     //      properly implemented in contemporary HTML user-agents. It is not.
212     assertEquals("<p></p>", sanitize("<p/b/"));  // Short-tag discarded.
213     assertEquals("<p></p>", sanitize("<p<b>"));  // Discard <b attribute
214     assertEquals(
215         // This behavior for short tags is not ideal, but it is safe.
216         "<p href=\"/\">first part of the text&lt;/&gt; second part</p>",
217         sanitize("<p<a href=\"/\">first part of the text</> second part"));
218   }
219 
220   @Test
testNul()221   public static final void testNul() throws Exception {
222     assertEquals(
223         "<a title="
224         + "\"harmless  SCRIPT&#61;javascript:alert(1) ignored&#61;ignored\">"
225         + "</a>",
226         sanitize(
227             "<A TITLE="
228             + "\"harmless\0  SCRIPT=javascript:alert(1) ignored=ignored\">"
229             ));
230   }
231 
232   @Test
testDigitsInAttrNames()233   public static final void testDigitsInAttrNames() throws Exception {
234     // See bug 614 for details.
235     assertEquals(
236         "<div>Hello</div>",
237         sanitize(
238             "<div style1=\"expression(\'alert(1)\")\">Hello</div>"
239             ));
240   }
241 
242   @Test
testSupplementaryCodepointEncoding()243   public static final void testSupplementaryCodepointEncoding()
244       throws Exception {
245     // &#xd87e;&#xdc1a; is not appropriate.
246     // &#x2f81a; is appropriate as is the unencoded form.
247     assertEquals(
248         "&#x2f81a; | &#x2f81a; | &#x2f81a;",
249         sanitize("&#x2F81A; | \ud87e\udc1a | &#xd87e;&#xdc1a;"));
250   }
251 
252   @Test
testDeeplyNestedTagsDoS()253   public static final void testDeeplyNestedTagsDoS() throws Exception {
254     String sanitized = sanitize(stringRepeatedTimes("<div>", 20000));
255     int n = sanitized.length() / "<div></div>".length();
256     assertTrue("" + n, 50 <= n && n <= 1000);
257     int middle = n * "<div>".length();
258     assertEquals(sanitized.substring(0, middle),
259                  stringRepeatedTimes("<div>", n));
260     assertEquals(sanitized.substring(middle),
261                  stringRepeatedTimes("</div>", n));
262   }
263 
264   @Test
testInnerHTMLIE8()265   public static final void testInnerHTMLIE8() throws Exception {
266     // Apparently, in quirks mode, IE8 does a poor job producing innerHTML
267     // values.  Given
268     //     <div attr="``foo=bar">
269     // we encode &#96; but if JavaScript does:
270     //    nodeA.innerHTML = nodeB.innerHTML;
271     // and nodeB contains the DIV above, then IE8 will produce
272     //     <div attr=``foo=bar>
273     // as the value of nodeB.innerHTML and assign it to nodeA.
274     // IE8's HTML parser treats `` as a blank attribute value and foo=bar
275     // becomes a separate attribute.
276     // Adding a space at the end of the attribute prevents this by forcing
277     // IE8 to put double quotes around the attribute when computing
278     // nodeB.innerHTML.
279     assertEquals(
280         "<div title=\"&#96;&#96;onmouseover&#61;alert(1337) \"></div>",
281         sanitize("<div title=\"``onmouseover=alert(1337)\">"));
282   }
283 
284   @Test
testNabobsOfNegativism()285   public static final void testNabobsOfNegativism() throws Exception {
286     // Treating <noscript> as raw-text gains us nothing security-wise.
287     assertEquals("<noscript></noscript>",
288                  sanitize("<noscript><evil></noscript>"));
289     assertEquals("<noscript>I <b>&lt;3</b> Ponies</noscript>",
290                  sanitize("<noscript>I <b><3</b> Ponies</noscript>"));
291     assertEquals("<noscript>I <b>&lt;3</b> Ponies</noscript>",
292                  sanitize("<NOSCRIPT>I <b><3</b> Ponies</noscript><evil>"));
293     assertEquals("<noframes>I <b>&lt;3</b> Ponies</noframes>",
294                  sanitize("<noframes>I <b><3</b> Ponies</noframes><evil>"));
295     assertEquals("<noembed>I <b>&lt;3</b> Ponies</noembed>",
296                  sanitize("<noembed>I <b><3</b> Ponies</noembed><evil>"));
297     assertEquals("<noxss>I <b>&lt;3</b> Ponies</noxss>",
298                  sanitize("<noxss>I <b><3</b> Ponies</noxss><evil>"));
299     assertEquals(
300         "&lt;noscript&gt;I &lt;b&gt;&lt;3&lt;/b&gt; Ponies&lt;/noscript&gt;",
301         sanitize("<xmp><noscript>I <b><3</b> Ponies</noscript></xmp>"));
302   }
303 
304   @Test
testNULs()305   public static final void testNULs() throws Exception {
306     assertEquals("<b>Hello, </b>", sanitize("<b>Hello, \u0000</b>"));
307     assertEquals("<b>Hello, </b>", sanitize("<b>Hello, \u0000"));
308     assertEquals("",               sanitize("\u0000"));
309     assertEquals("<b>Hello, </b>", sanitize("<b>Hello, &#0;</b>"));
310     assertEquals("",               sanitize("&#0;"));
311   }
312 
313   @Test
testQMarkMeta()314   public static final void testQMarkMeta() throws Exception {
315     assertEquals(
316         "Hello, <b>World</b>!",
317         sanitize(
318             ""
319             // An XML Prologue.
320             // HTML5 treats it as ignorable content via the bogus comment state.
321             + "<?xml version=\"1\" ?>"
322             + "Hello, "
323             // An XML Processing instruction.
324             // HTML5 treats it as ignorable content via the bogus comment state.
325             + "<?processing instruction?>"
326             + "<b>World"
327             // Appears in HTML copied from outlook.
328             + "<?xml:namespace prefix = o ns = "
329             + "\"urn:schemas-microsoft-com:office:office\" />"
330             + "</b>!"));
331   }
332 
333   @Test
testScriptInIframe()334   public static final void testScriptInIframe() throws Exception {
335     assertEquals(
336         "<iframe></iframe>",
337         sanitize(
338             "<iframe>\n"
339             + "  <script>alert(Hi)</script>\n"
340             + "</iframe>"));
341   }
342 
sanitize(@ullable String html)343   private static String sanitize(@Nullable String html) throws Exception {
344     StringBuilder sb = new StringBuilder();
345     HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
346         sb,
347         new Handler<String>() {
348           public void handle(String errorMessage) {
349             fail(errorMessage);
350           }
351         });
352 
353     HtmlSanitizer.Policy policy = new HtmlPolicyBuilder()
354         // Allow these tags.
355        .allowElements(
356            "a", "b", "br", "div", "i", "iframe", "img", "input", "li",
357            "ol", "p", "span", "ul", "noscript", "noframes", "noembed", "noxss")
358        // And these attributes.
359        .allowAttributes(
360            "dir", "checked", "class", "href", "id", "target", "title", "type")
361        .globally()
362        // Cleanup IDs and CLASSes and prefix them with p- to move to a separate
363        // name-space.
364        .allowAttributes("id", "class")
365        .matching(
366            new AttributePolicy() {
367             public String apply(
368                 String elementName, String attributeName, String value) {
369               return value.replaceAll("(?:^|\\s)([a-zA-Z])", " p-$1")
370                   .replaceAll("\\s+", " ")
371                   .trim();
372             }
373            })
374        .globally()
375        // Don't throw out useless <img> and <input> elements to ease debugging.
376        .allowWithoutAttributes("img", "input")
377        .build(renderer);
378 
379     HtmlSanitizer.sanitize(html, policy);
380 
381     return sb.toString();
382   }
383 
stringRepeatedTimes(String s, int n)384   private static final String stringRepeatedTimes(String s, int n) {
385     StringBuilder sb = new StringBuilder(s.length() * n);
386     while (--n >= 0) {
387       sb.append(s);
388     }
389     return sb.toString();
390   }
391 }
392