// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
028 029 package org.owasp.html; 030 031 import com.google.common.annotations.VisibleForTesting; 032 import java.io.Closeable; 033 import java.io.Flushable; 034 import java.io.IOException; 035 import java.util.Iterator; 036 import java.util.List; 037 import javax.annotation.WillCloseWhenClosed; 038 import javax.annotation.concurrent.NotThreadSafe; 039 040 /** 041 * Given a series of HTML tokens, writes valid, normalized HTML to the output. 042 * The output will have well-defined tag boundaries, but there may be orphaned 043 * or missing close and open tags. 044 * The result of two renderers can always be concatenated to produce a larger 045 * snippet of HTML, but if the first was called with 046 * {@code writeOpenTag("plaintext", ...)}, then any tags in the second will not 047 * be interpreted as tags in the concatenated version. 048 */ 049 @TCB 050 @NotThreadSafe 051 public class HtmlStreamRenderer implements HtmlStreamEventReceiver { 052 053 private final Appendable output; 054 private final Handler<? super IOException> ioExHandler; 055 private final Handler<? super String> badHtmlHandler; 056 private String lastTagOpened; 057 private StringBuilder pendingUnescaped; 058 private boolean open; 059 060 /** 061 * Factory. 062 * @param output the buffer to which HTML is streamed. 063 * @param ioExHandler called with any exception raised by output. 064 * @param badHtmlHandler receives alerts when HTML cannot be rendered because 065 * there is not valid HTML tree that results from that series of calls. 066 * E.g. it is not possible to create an HTML {@code <style>} element whose 067 * textual content is {@code "</style>"}. 068 */ 069 public static HtmlStreamRenderer create( 070 @WillCloseWhenClosed Appendable output, 071 Handler<? super IOException> ioExHandler, 072 Handler<? 
super String> badHtmlHandler) { 073 if (output instanceof Closeable) { 074 return new CloseableHtmlStreamRenderer( 075 output, ioExHandler, badHtmlHandler); 076 } else { 077 return new HtmlStreamRenderer(output, ioExHandler, badHtmlHandler); 078 } 079 } 080 081 /** 082 * Factory. 083 * @param output the buffer to which HTML is streamed. 084 * @param badHtmlHandler receives alerts when HTML cannot be rendered because 085 * there is not valid HTML tree that results from that series of calls. 086 * E.g. it is not possible to create an HTML {@code <style>} element whose 087 * textual content is {@code "</style>"}. 088 */ 089 public static HtmlStreamRenderer create( 090 StringBuilder output, Handler<? super String> badHtmlHandler) { 091 // Propagate since StringBuilder should not throw IOExceptions. 092 return create(output, Handler.PROPAGATE, badHtmlHandler); 093 } 094 095 private HtmlStreamRenderer( 096 Appendable output, Handler<? super IOException> ioExHandler, 097 Handler<? super String> badHtmlHandler) { 098 this.output = output; 099 this.ioExHandler = ioExHandler; 100 this.badHtmlHandler = badHtmlHandler; 101 } 102 103 /** 104 * Called when the series of calls make no sense. 105 * May be overridden to throw an unchecked throwable, to log, or to take some 106 * other action. 107 * 108 * @param message for human consumption. 109 * @param identifier an HTML identifier associated with the message. 110 */ 111 private final void error(String message, CharSequence identifier) { 112 if (badHtmlHandler != Handler.DO_NOTHING) { // Avoid string append. 
113 badHtmlHandler.handle(message + " : " + identifier); 114 } 115 } 116 117 public final void openDocument() throws IllegalStateException { 118 if (open) { throw new IllegalStateException(); } 119 open = true; 120 } 121 122 public final void closeDocument() throws IllegalStateException { 123 if (!open) { throw new IllegalStateException(); } 124 if (pendingUnescaped != null) { 125 closeTag(lastTagOpened); 126 } 127 open = false; 128 if (output instanceof Flushable) { 129 try { 130 ((Flushable) output).flush(); 131 } catch (IOException ex) { 132 ioExHandler.handle(ex); 133 } 134 } 135 } 136 137 public final boolean isDocumentOpen() { 138 return open; 139 } 140 141 public final void openTag(String elementName, List<String> attrs) { 142 try { 143 writeOpenTag(elementName, attrs); 144 } catch (IOException ex) { 145 ioExHandler.handle(ex); 146 } 147 } 148 149 private void writeOpenTag(String elementName, List<? extends String> attrs) 150 throws IOException { 151 if (!open) { throw new IllegalStateException(); } 152 elementName = safeName(elementName); 153 if (!isValidHtmlName(elementName)) { 154 error("Invalid element name", elementName); 155 return; 156 } 157 if (pendingUnescaped != null) { 158 error("Tag content cannot appear inside CDATA element", elementName); 159 return; 160 } 161 162 switch (HtmlTextEscapingMode.getModeForTag(elementName)) { 163 case CDATA_SOMETIMES: 164 case CDATA: 165 case PLAIN_TEXT: 166 lastTagOpened = elementName; 167 pendingUnescaped = new StringBuilder(); 168 break; 169 default: 170 } 171 172 output.append('<').append(elementName); 173 174 for (Iterator<? 
extends String> attrIt = attrs.iterator(); 175 attrIt.hasNext();) { 176 String name = attrIt.next(); 177 String value = attrIt.next(); 178 name = HtmlLexer.canonicalName(name); 179 if (!isValidHtmlName(name)) { 180 error("Invalid attr name", name); 181 continue; 182 } 183 output.append(' ').append(name).append('=').append('"'); 184 Encoding.encodeHtmlOnto(value, output); 185 if (value.indexOf('`') != -1) { 186 // Apparently, in quirks mode, IE8 does a poor job producing innerHTML 187 // values. Given 188 // <div attr="``foo=bar"> 189 // we encode ` but if JavaScript does: 190 // nodeA.innerHTML = nodeB.innerHTML; 191 // and nodeB contains the DIV above, then IE8 will produce 192 // <div attr=``foo=bar> 193 // as the value of nodeB.innerHTML and assign it to nodeA. 194 // IE8's HTML parser treats `` as a blank attribute value and foo=bar 195 // becomes a separate attribute. 196 // Adding a space at the end of the attribute prevents this by forcing 197 // IE8 to put double quotes around the attribute when computing 198 // nodeB.innerHTML. 199 output.append(' '); 200 } 201 output.append('"'); 202 } 203 204 // Limit our output to the intersection of valid XML and valid HTML5 when 205 // the output contains no special HTML5 elements like <title>, <script>, or 206 // <textarea>. 
207 if (HtmlTextEscapingMode.isVoidElement(elementName)) { 208 output.append(" /"); 209 } 210 211 output.append('>'); 212 } 213 214 public final void closeTag(String elementName) { 215 try { 216 writeCloseTag(safeName(elementName)); 217 } catch (IOException ex) { 218 ioExHandler.handle(ex); 219 } 220 } 221 222 private final void writeCloseTag(String elementName) 223 throws IOException { 224 if (!open) { throw new IllegalStateException(); } 225 elementName = HtmlLexer.canonicalName(elementName); 226 if (!isValidHtmlName(elementName)) { 227 error("Invalid element name", elementName); 228 return; 229 } 230 231 if (pendingUnescaped != null) { 232 if (!lastTagOpened.equals(elementName)) { 233 error("Tag content cannot appear inside CDATA element", elementName); 234 return; 235 } else { 236 StringBuilder cdataContent = pendingUnescaped; 237 pendingUnescaped = null; 238 Encoding.stripBannedCodeunits(cdataContent); 239 int problemIndex = checkHtmlCdataCloseable(lastTagOpened, cdataContent); 240 if (problemIndex == -1) { 241 output.append(cdataContent); 242 } else { 243 error( 244 "Invalid CDATA text content", 245 cdataContent.subSequence( 246 problemIndex, 247 Math.min(problemIndex + 10, cdataContent.length()))); 248 // Still output the close tag. 249 } 250 } 251 if ("plaintext".equals(elementName)) { return; } 252 } 253 output.append("</").append(elementName).append(">"); 254 } 255 256 public final void text(String text) { 257 try { 258 writeText(text); 259 } catch (IOException ex) { 260 ioExHandler.handle(ex); 261 } 262 } 263 264 private final void writeText(String text) throws IOException { 265 if (!open) { throw new IllegalStateException(); } 266 if (pendingUnescaped != null) { 267 pendingUnescaped.append(text); 268 } else { 269 Encoding.encodeHtmlOnto(text, output); // Works for RCDATA. 
270 } 271 } 272 273 private static int checkHtmlCdataCloseable( 274 String localName, StringBuilder sb) { 275 int escapingTextSpanStart = -1; 276 for (int i = 0, n = sb.length(); i < n; ++i) { 277 char ch = sb.charAt(i); 278 switch (ch) { 279 case '<': 280 if (i + 3 < n 281 && '!' == sb.charAt(i + 1) 282 && '-' == sb.charAt(i + 2) 283 && '-' == sb.charAt(i + 3)) { 284 if (escapingTextSpanStart == -1) { 285 escapingTextSpanStart = i; 286 } else { 287 return i; 288 } 289 } else if (i + 1 + localName.length() < n 290 && '/' == sb.charAt(i + 1) 291 && Strings.regionMatchesIgnoreCase( 292 sb, i + 2, localName, 0, localName.length())) { 293 // A close tag contained in the content. 294 if (escapingTextSpanStart < 0) { 295 // We could try some recovery strategies here. 296 // E.g. prepending "/<!--\n" to sb if "script".equals(localName) 297 return i; 298 } 299 if (!"script".equals(localName)) { 300 // Script tags are commonly included inside script tags. 301 // <script><!--document.write('<script>f()</script>');--></script> 302 // but this does not happen in other CDATA element types. 303 // Actually allowing an end tag inside others is problematic. 304 // Specifically, 305 // <style><!--</style>-->/* foo */</style> 306 // displays the text "/* foo */" on some browsers. 307 return i; 308 } 309 } 310 break; 311 case '>': 312 // From the HTML5 spec: 313 // The text in style, script, title, and textarea elements must not 314 // have an escaping text span start that is not followed by an 315 // escaping text span end. 316 // We look left since the HTML 5 spec allows the escaping text span 317 // end to share dashes with the start. 318 if (i >= 2 && '-' == sb.charAt(i - 1) && '-' == sb.charAt(i - 2)) { 319 if (escapingTextSpanStart < 0) { return i - 2; } 320 escapingTextSpanStart = -1; 321 } 322 break; 323 default: 324 break; 325 } 326 } 327 if (escapingTextSpanStart >= 0) { 328 // We could try recovery strategies here. 329 // E.g. 
appending "//-->" to the buffer if "script".equals(localName) 330 return escapingTextSpanStart; 331 } 332 return -1; 333 } 334 335 336 @VisibleForTesting 337 static boolean isValidHtmlName(String name) { 338 int n = name.length(); 339 if (n == 0) { return false; } 340 if (n > 128) { return false; } 341 boolean isNamespaced = false; 342 for (int i = 0; i < n; ++i) { 343 char ch = name.charAt(i); 344 switch (ch) { 345 case ':': 346 if (isNamespaced) { return false; } 347 isNamespaced = true; 348 if (i == 0 || i + 1 == n) { return false; } 349 break; 350 case '-': 351 if (i == 0 || i + 1 == n) { return false; } 352 break; 353 default: 354 if (ch <= '9') { 355 if (i == 0 || ch < '0') { return false; } 356 } else if ('A' <= ch && ch <= 'z') { 357 if ('Z' < ch && ch < 'a') { return false; } 358 } else { 359 return false; 360 } 361 break; 362 } 363 } 364 return true; 365 } 366 367 /** 368 * Canonicalizes the element name and possibly substitutes an alternative 369 * that has more consistent semantics. 370 */ 371 static String safeName(String elementName) { 372 elementName = HtmlLexer.canonicalName(elementName); 373 374 // Substitute a reliably non-raw-text element for raw-text and 375 // plain-text elements. 376 switch (elementName.length()) { 377 case 3: 378 if ("xmp".equals(elementName)) { return "pre"; } 379 break; 380 case 7: 381 if ("listing".equals(elementName)) { return "pre"; } 382 break; 383 case 9: 384 if ("plaintext".equals(elementName)) { return "pre"; } 385 break; 386 } 387 return elementName; 388 } 389 390 static class CloseableHtmlStreamRenderer extends HtmlStreamRenderer 391 implements Closeable { 392 private final Closeable closeable; 393 394 CloseableHtmlStreamRenderer( 395 @WillCloseWhenClosed 396 Appendable output, Handler<? super IOException> errorHandler, 397 Handler<? 
super String> badHtmlHandler) { 398 super(output, errorHandler, badHtmlHandler); 399 this.closeable = (Closeable) output; 400 } 401 402 public void close() throws IOException { 403 if (isDocumentOpen()) { closeDocument(); } 404 closeable.close(); 405 } 406 } 407 }