001    // Copyright (c) 2011, Mike Samuel
002    // All rights reserved.
003    //
004    // Redistribution and use in source and binary forms, with or without
005    // modification, are permitted provided that the following conditions
006    // are met:
007    //
008    // Redistributions of source code must retain the above copyright
009    // notice, this list of conditions and the following disclaimer.
010    // Redistributions in binary form must reproduce the above copyright
011    // notice, this list of conditions and the following disclaimer in the
012    // documentation and/or other materials provided with the distribution.
013    // Neither the name of the OWASP nor the names of its contributors may
014    // be used to endorse or promote products derived from this software
015    // without specific prior written permission.
016    // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017    // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018    // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019    // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020    // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021    // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022    // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023    // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024    // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025    // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026    // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027    // POSSIBILITY OF SUCH DAMAGE.
028    
029    package org.owasp.html;
030    
031    import com.google.common.annotations.VisibleForTesting;
032    import java.io.Closeable;
033    import java.io.Flushable;
034    import java.io.IOException;
035    import java.util.Iterator;
036    import java.util.List;
037    import javax.annotation.WillCloseWhenClosed;
038    import javax.annotation.concurrent.NotThreadSafe;
039    
040    /**
041     * Given a series of HTML tokens, writes valid, normalized HTML to the output.
042     * The output will have well-defined tag boundaries, but there may be orphaned
043     * or missing close and open tags.
044     * The result of two renderers can always be concatenated to produce a larger
045     * snippet of HTML, but if the first was called with
046     * {@code writeOpenTag("plaintext", ...)}, then any tags in the second will not
047     * be interpreted as tags in the concatenated version.
048     */
049    @TCB
050    @NotThreadSafe
051    public class HtmlStreamRenderer implements HtmlStreamEventReceiver {
052    
053      private final Appendable output;
054      private final Handler<? super IOException> ioExHandler;
055      private final Handler<? super String> badHtmlHandler;
056      private String lastTagOpened;
057      private StringBuilder pendingUnescaped;
058      private boolean open;
059    
060      /**
061       * Factory.
062       * @param output the buffer to which HTML is streamed.
063       * @param ioExHandler called with any exception raised by output.
064       * @param badHtmlHandler receives alerts when HTML cannot be rendered because
065       *    there is not valid HTML tree that results from that series of calls.
066       *    E.g. it is not possible to create an HTML {@code <style>} element whose
067       *    textual content is {@code "</style>"}.
068       */
069      public static HtmlStreamRenderer create(
070          @WillCloseWhenClosed Appendable output,
071          Handler<? super IOException> ioExHandler,
072          Handler<? super String> badHtmlHandler) {
073        if (output instanceof Closeable) {
074          return new CloseableHtmlStreamRenderer(
075              output, ioExHandler, badHtmlHandler);
076        } else {
077          return new HtmlStreamRenderer(output, ioExHandler, badHtmlHandler);
078        }
079      }
080    
081      /**
082       * Factory.
083       * @param output the buffer to which HTML is streamed.
084       * @param badHtmlHandler receives alerts when HTML cannot be rendered because
085       *    there is not valid HTML tree that results from that series of calls.
086       *    E.g. it is not possible to create an HTML {@code <style>} element whose
087       *    textual content is {@code "</style>"}.
088       */
089      public static HtmlStreamRenderer create(
090          StringBuilder output, Handler<? super String> badHtmlHandler) {
091        // Propagate since StringBuilder should not throw IOExceptions.
092        return create(output, Handler.PROPAGATE, badHtmlHandler);
093      }
094    
095      private HtmlStreamRenderer(
096          Appendable output, Handler<? super IOException> ioExHandler,
097          Handler<? super String> badHtmlHandler) {
098        this.output = output;
099        this.ioExHandler = ioExHandler;
100        this.badHtmlHandler = badHtmlHandler;
101      }
102    
103      /**
104       * Called when the series of calls make no sense.
105       * May be overridden to throw an unchecked throwable, to log, or to take some
106       * other action.
107       *
108       * @param message for human consumption.
109       * @param identifier an HTML identifier associated with the message.
110       */
111      private final void error(String message, CharSequence identifier) {
112        if (badHtmlHandler != Handler.DO_NOTHING) {   // Avoid string append.
113          badHtmlHandler.handle(message + " : " + identifier);
114        }
115      }
116    
117      public final void openDocument() throws IllegalStateException {
118        if (open) { throw new IllegalStateException(); }
119        open = true;
120      }
121    
122      public final void closeDocument() throws IllegalStateException {
123        if (!open) { throw new IllegalStateException(); }
124        if (pendingUnescaped != null) {
125          closeTag(lastTagOpened);
126        }
127        open = false;
128        if (output instanceof Flushable) {
129          try {
130            ((Flushable) output).flush();
131          } catch (IOException ex) {
132            ioExHandler.handle(ex);
133          }
134        }
135      }
136    
137      public final boolean isDocumentOpen() {
138        return open;
139      }
140    
141      public final void openTag(String elementName, List<String> attrs) {
142        try {
143          writeOpenTag(elementName, attrs);
144        } catch (IOException ex) {
145          ioExHandler.handle(ex);
146        }
147      }
148    
149      private void writeOpenTag(String elementName, List<? extends String> attrs)
150          throws IOException {
151        if (!open) { throw new IllegalStateException(); }
152        elementName = safeName(elementName);
153        if (!isValidHtmlName(elementName)) {
154          error("Invalid element name", elementName);
155          return;
156        }
157        if (pendingUnescaped != null) {
158          error("Tag content cannot appear inside CDATA element", elementName);
159          return;
160        }
161    
162        switch (HtmlTextEscapingMode.getModeForTag(elementName)) {
163          case CDATA_SOMETIMES:
164          case CDATA:
165          case PLAIN_TEXT:
166            lastTagOpened = elementName;
167            pendingUnescaped = new StringBuilder();
168            break;
169          default:
170        }
171    
172        output.append('<').append(elementName);
173    
174        for (Iterator<? extends String> attrIt = attrs.iterator();
175             attrIt.hasNext();) {
176          String name = attrIt.next();
177          String value = attrIt.next();
178          name = HtmlLexer.canonicalName(name);
179          if (!isValidHtmlName(name)) {
180            error("Invalid attr name", name);
181            continue;
182          }
183          output.append(' ').append(name).append('=').append('"');
184          Encoding.encodeHtmlOnto(value, output);
185          if (value.indexOf('`') != -1) {
186            // Apparently, in quirks mode, IE8 does a poor job producing innerHTML
187            // values.  Given
188            //     <div attr="``foo=bar">
189            // we encode &#96; but if JavaScript does:
190            //    nodeA.innerHTML = nodeB.innerHTML;
191            // and nodeB contains the DIV above, then IE8 will produce
192            //     <div attr=``foo=bar>
193            // as the value of nodeB.innerHTML and assign it to nodeA.
194            // IE8's HTML parser treats `` as a blank attribute value and foo=bar
195            // becomes a separate attribute.
196            // Adding a space at the end of the attribute prevents this by forcing
197            // IE8 to put double quotes around the attribute when computing
198            // nodeB.innerHTML.
199            output.append(' ');
200          }
201          output.append('"');
202        }
203    
204        // Limit our output to the intersection of valid XML and valid HTML5 when
205        // the output contains no special HTML5 elements like <title>, <script>, or
206        // <textarea>.
207        if (HtmlTextEscapingMode.isVoidElement(elementName)) {
208          output.append(" /");
209        }
210    
211        output.append('>');
212      }
213    
214      public final void closeTag(String elementName) {
215        try {
216          writeCloseTag(safeName(elementName));
217        } catch (IOException ex) {
218          ioExHandler.handle(ex);
219        }
220      }
221    
222      private final void writeCloseTag(String elementName)
223          throws IOException {
224        if (!open) { throw new IllegalStateException(); }
225        elementName = HtmlLexer.canonicalName(elementName);
226        if (!isValidHtmlName(elementName)) {
227          error("Invalid element name", elementName);
228          return;
229        }
230    
231        if (pendingUnescaped != null) {
232          if (!lastTagOpened.equals(elementName)) {
233            error("Tag content cannot appear inside CDATA element", elementName);
234            return;
235          } else {
236            StringBuilder cdataContent = pendingUnescaped;
237            pendingUnescaped = null;
238            Encoding.stripBannedCodeunits(cdataContent);
239            int problemIndex = checkHtmlCdataCloseable(lastTagOpened, cdataContent);
240            if (problemIndex == -1) {
241              output.append(cdataContent);
242            } else {
243              error(
244                  "Invalid CDATA text content",
245                  cdataContent.subSequence(
246                      problemIndex,
247                      Math.min(problemIndex + 10, cdataContent.length())));
248              // Still output the close tag.
249            }
250          }
251          if ("plaintext".equals(elementName)) { return; }
252        }
253        output.append("</").append(elementName).append(">");
254      }
255    
256      public final void text(String text) {
257        try {
258          writeText(text);
259        } catch (IOException ex) {
260          ioExHandler.handle(ex);
261        }
262      }
263    
264      private final void writeText(String text) throws IOException {
265        if (!open) { throw new IllegalStateException(); }
266        if (pendingUnescaped != null) {
267          pendingUnescaped.append(text);
268        } else {
269          Encoding.encodeHtmlOnto(text, output);  // Works for RCDATA.
270        }
271      }
272    
273      private static int checkHtmlCdataCloseable(
274          String localName, StringBuilder sb) {
275        int escapingTextSpanStart = -1;
276        for (int i = 0, n = sb.length(); i < n; ++i) {
277          char ch = sb.charAt(i);
278          switch (ch) {
279            case '<':
280              if (i + 3 < n
281                  && '!' == sb.charAt(i + 1)
282                  && '-' == sb.charAt(i + 2)
283                  && '-' == sb.charAt(i + 3)) {
284                if (escapingTextSpanStart == -1) {
285                  escapingTextSpanStart = i;
286                } else {
287                  return i;
288                }
289              } else if (i + 1 + localName.length() < n
290                         && '/' == sb.charAt(i + 1)
291                         && Strings.regionMatchesIgnoreCase(
292                             sb, i + 2, localName, 0, localName.length())) {
293                // A close tag contained in the content.
294                if (escapingTextSpanStart < 0) {
295                  // We could try some recovery strategies here.
296                  // E.g. prepending "/<!--\n" to sb if "script".equals(localName)
297                  return i;
298                }
299                if (!"script".equals(localName)) {
300                  // Script tags are commonly included inside script tags.
301                  // <script><!--document.write('<script>f()</script>');--></script>
302                  // but this does not happen in other CDATA element types.
303                  // Actually allowing an end tag inside others is problematic.
304                  // Specifically,
305                  // <style><!--</style>-->/* foo */</style>
306                  // displays the text "/* foo */" on some browsers.
307                  return i;
308                }
309              }
310              break;
311            case '>':
312              // From the HTML5 spec:
313              //    The text in style, script, title, and textarea elements must not
314              //    have an escaping text span start that is not followed by an
315              //    escaping text span end.
316              // We look left since the HTML 5 spec allows the escaping text span
317              // end to share dashes with the start.
318              if (i >= 2 && '-' == sb.charAt(i - 1) && '-' == sb.charAt(i - 2)) {
319                if (escapingTextSpanStart < 0) { return i - 2; }
320                escapingTextSpanStart = -1;
321              }
322              break;
323            default:
324              break;
325          }
326        }
327        if (escapingTextSpanStart >= 0) {
328          // We could try recovery strategies here.
329          // E.g. appending "//-->" to the buffer if "script".equals(localName)
330          return escapingTextSpanStart;
331        }
332        return -1;
333      }
334    
335    
336      @VisibleForTesting
337      static boolean isValidHtmlName(String name) {
338        int n = name.length();
339        if (n == 0) { return false; }
340        if (n > 128) { return false; }
341        boolean isNamespaced = false;
342        for (int i = 0; i < n; ++i) {
343          char ch = name.charAt(i);
344          switch (ch) {
345            case ':':
346              if (isNamespaced) { return false; }
347              isNamespaced = true;
348              if (i == 0 || i + 1 == n) { return false; }
349              break;
350            case '-':
351              if (i == 0 || i + 1 == n) { return false; }
352              break;
353            default:
354              if (ch <= '9') {
355                if (i == 0 || ch < '0') { return false; }
356              } else if ('A' <= ch && ch <= 'z') {
357                if ('Z' < ch && ch < 'a') { return false; }
358              } else {
359                return false;
360              }
361              break;
362          }
363        }
364        return true;
365      }
366    
367      /**
368       * Canonicalizes the element name and possibly substitutes an alternative
369       * that has more consistent semantics.
370       */
371      static String safeName(String elementName) {
372        elementName = HtmlLexer.canonicalName(elementName);
373    
374        // Substitute a reliably non-raw-text element for raw-text and
375        // plain-text elements.
376        switch (elementName.length()) {
377          case 3:
378            if ("xmp".equals(elementName)) { return "pre"; }
379            break;
380          case 7:
381            if ("listing".equals(elementName)) { return "pre"; }
382            break;
383          case 9:
384            if ("plaintext".equals(elementName)) { return "pre"; }
385            break;
386        }
387        return elementName;
388      }
389    
390      static class CloseableHtmlStreamRenderer extends HtmlStreamRenderer
391          implements Closeable {
392        private final Closeable closeable;
393    
394        CloseableHtmlStreamRenderer(
395            @WillCloseWhenClosed
396            Appendable output, Handler<? super IOException> errorHandler,
397            Handler<? super String> badHtmlHandler) {
398          super(output, errorHandler, badHtmlHandler);
399          this.closeable = (Closeable) output;
400        }
401    
402        public void close() throws IOException {
403          if (isDocumentOpen()) { closeDocument(); }
404          closeable.close();
405        }
406      }
407    }