1 /* 2 * Copyright (C) 2010 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.streamhtmlparser; 18 19 /** 20 * Methods exposed for HTML parsing of text to facilitate implementation 21 * of Automatic context-aware escaping. The HTML parser also embeds a 22 * Javascript parser for processing Javascript fragments. In the future, 23 * it will also embed other specific parsers and hence most likely remain 24 * the main interface to callers of this package. 25 * 26 * <p>Note: These are the exact methods exposed in the original C++ Parser. The 27 * names are simply modified to conform to Java. 28 */ 29 public interface HtmlParser extends Parser { 30 31 /** 32 * The Parser Mode requested for parsing a given template. 33 * Currently we support: 34 * <ul> 35 * <li>{@code HTML} for HTML templates. 36 * <li>{@code JS} for javascript templates. 37 * <li>{@code CSS} for Cascading Style-Sheets templates. 38 * <li>{@code HTML_IN_TAG} for HTML templates that consist only of 39 * HTML attribute name and value pairs. This is typically the case for 40 * a template that is being included from a parent template where the 41 * parent template contains the start and the closing of the HTML tag. 42 * This is a special mode, for standard HTML templates please use 43 * {@link #HTML}. 44 * An example of such as template is: 45 * <p><code>class="someClass" target="_blank"</code></p> 46 * <p>Which could be included from a parent template that contains 47 * an anchor tag, say:</p> 48 * <p><code><a href="/bla" ["INCLUDED_TEMPLATE"]></code></p> 49 * </ul> 50 */ 51 public enum Mode { 52 HTML, 53 JS, 54 CSS, 55 HTML_IN_TAG 56 } 57 58 /** 59 * Indicates the type of HTML attribute that the parser is currently in or 60 * {@code NONE} if the parser is not currently in an attribute. 61 * {@code URI} is for attributes taking a URI such as "href" and "src". 62 * {@code JS} is for attributes taking javascript such as "onclick". 63 * {@code STYLE} is for the "style" attribute. 64 * All other attributes fall under {@code REGULAR}. 65 * 66 * Returned by {@link HtmlParser#getAttributeType()} 67 */ 68 public enum ATTR_TYPE { 69 NONE, 70 REGULAR, 71 URI, 72 JS, 73 STYLE 74 } 75 76 /** 77 * All the states in which the parser can be. These are external states. 78 * The parser has many more internal states that are not exposed and which 79 * are instead mapped to one of these external ones. 80 * {@code STATE_TEXT} the parser is in HTML proper. 81 * {@code STATE_TAG} the parser is inside an HTML tag name. 82 * {@code STATE_COMMENT} the parser is inside an HTML comment. 83 * {@code STATE_ATTR} the parser is inside an HTML attribute name. 84 * {@code STATE_VALUE} the parser is inside an HTML attribute value. 85 * {@code STATE_JS_FILE} the parser is inside javascript code. 86 * {@code STATE_CSS_FILE} the parser is inside CSS code. 87 * 88 * <p>All these states map exactly to those exposed in the C++ (original) 89 * version of the HtmlParser. 90 */ 91 public final static ExternalState STATE_TEXT = 92 new ExternalState("STATE_TEXT"); 93 public final static ExternalState STATE_TAG = 94 new ExternalState("STATE_TAG"); 95 public final static ExternalState STATE_COMMENT = 96 new ExternalState("STATE_COMMENT"); 97 public final static ExternalState STATE_ATTR = 98 new ExternalState("STATE_ATTR"); 99 public final static ExternalState STATE_VALUE = 100 new ExternalState("STATE_VALUE"); 101 public final static ExternalState STATE_JS_FILE = 102 new ExternalState("STATE_JS_FILE"); 103 public final static ExternalState STATE_CSS_FILE = 104 new ExternalState("STATE_CSS_FILE"); 105 106 /** 107 * Returns {@code true} if the parser is currently processing Javascript. 108 * Such is the case if and only if, the parser is processing an attribute 109 * that takes Javascript, a Javascript script block or the parser 110 * is (re)set with {@link Mode#JS}. 111 * 112 * @return {@code true} if the parser is processing Javascript, 113 * {@code false} otherwise 114 */ inJavascript()115 public boolean inJavascript(); 116 117 /** 118 * Returns {@code true} if the parser is currently processing 119 * a Javascript litteral that is quoted. The caller will typically 120 * invoke this method after determining that the parser is processing 121 * Javascript. Knowing whether the element is quoted or not helps 122 * determine which escaping to apply to it when needed. 123 * 124 * @return {@code true} if and only if the parser is inside a quoted 125 * Javascript literal 126 */ isJavascriptQuoted()127 public boolean isJavascriptQuoted(); 128 129 130 /** 131 * Returns {@code true} if and only if the parser is currently within 132 * an attribute, be it within the attribute name or the attribute value. 133 * 134 * @return {@code true} if and only if inside an attribute 135 */ inAttribute()136 public boolean inAttribute(); 137 138 /** 139 * Returns {@code true} if and only if the parser is currently within 140 * a CSS context. A CSS context is one of the below: 141 * <ul> 142 * <li>Inside a STYLE tag. 143 * <li>Inside a STYLE attribute. 144 * <li>Inside a CSS file when the parser was reset in the CSS mode. 145 * </ul> 146 * 147 * @return {@code true} if and only if the parser is inside CSS 148 */ inCss()149 public boolean inCss(); 150 151 /** 152 * Returns the type of the attribute that the parser is in 153 * or {@code ATTR_TYPE.NONE} if we are not parsing an attribute. 154 * The caller will typically invoke this method after determining 155 * that the parser is processing an attribute. 156 * 157 * <p>This is useful to determine which escaping to apply based 158 * on the type of value this attribute expects. 159 * 160 * @return type of the attribute 161 * @see HtmlParser.ATTR_TYPE 162 */ getAttributeType()163 public ATTR_TYPE getAttributeType(); 164 165 /** 166 * Returns {@code true} if and only if the parser is currently within 167 * an attribute value and that attribute value is quoted. 168 * 169 * @return {@code true} if and only if the attribute value is quoted 170 */ isAttributeQuoted()171 public boolean isAttributeQuoted(); 172 173 174 /** 175 * Returns the name of the HTML tag if the parser is currently within one. 176 * Note that the name may be incomplete if the parser is currently still 177 * parsing the name. Returns an empty {@code String} if the parser is not 178 * in a tag as determined by {@code getCurrentExternalState}. 179 * 180 * @return the name of the HTML tag or an empty {@code String} if we are 181 * not within an HTML tag 182 */ getTag()183 public String getTag(); 184 185 /** 186 * Returns the name of the HTML attribute the parser is currently processing. 187 * If the parser is still parsing the name, then the returned name 188 * may be incomplete. Returns an empty {@code String} if the parser is not 189 * in an attribute as determined by {@code getCurrentExternalState}. 190 * 191 * @return the name of the HTML attribute or an empty {@code String} 192 * if we are not within an HTML attribute 193 */ getAttribute()194 public String getAttribute(); 195 196 /** 197 * Returns the value of an HTML attribute if the parser is currently 198 * within one. If the parser is currently parsing the value, the returned 199 * value may be incomplete. The caller will typically first determine 200 * that the parser is processing a value by calling 201 * {@code getCurrentExternalState}. 202 * 203 * @return the value, could be an empty {@code String} if the parser is not 204 * in an HTML attribute value 205 */ getValue()206 public String getValue(); 207 208 /** 209 * Returns the current position of the parser within the HTML attribute 210 * value, zero being the position of the first character in the value. 211 * The caller will typically first determine that the parser is 212 * processing a value by calling {@link #getState()}. 213 * 214 * @return the index or zero if the parser is not processing a value 215 */ getValueIndex()216 public int getValueIndex(); 217 218 /** 219 * Returns {@code true} if and only if the current position of the parser is 220 * at the start of a URL HTML attribute value. This is the case when the 221 * following three conditions are all met: 222 * <p> 223 * <ol> 224 * <li>The parser is in an HTML attribute value. 225 * <li>The HTML attribute expects a URL, as determined by 226 * {@link #getAttributeType()} returning {@code .ATTR_TYPE#URI}. 227 * <li>The parser has not yet seen any characters from that URL. 228 * </ol> 229 * 230 * <p> This method may be used by an Html Sanitizer or an Auto-Escape system 231 * to determine whether to validate the URL for well-formedness and validate 232 * the scheme of the URL (e.g. {@code HTTP}, {@code HTTPS}) is safe. 233 * In particular, it is recommended to use this method instead of 234 * checking that {@link #getValueIndex()} is {@code 0} to support attribute 235 * types where the URL does not start at index zero, such as the 236 * {@code content} attribute of the {@code meta} HTML tag. 237 * 238 * @return {@code true} if and only if the parser is at the start of the URL 239 */ isUrlStart()240 public boolean isUrlStart(); 241 242 /** 243 * Resets the state of the parser, allowing for reuse of the 244 * {@code HtmlParser} object. 245 * 246 * <p>See the {@link HtmlParser.Mode} enum for information on all 247 * the valid modes. 248 * 249 * @param mode is an enum representing the high-level state of the parser 250 */ resetMode(HtmlParser.Mode mode)251 public void resetMode(HtmlParser.Mode mode); 252 253 /** 254 * A specialized directive to tell the parser there is some content 255 * that will be inserted here but that it will not get to parse. Used 256 * by the template system that may not be able to give some content 257 * to the parser but wants it to know there typically will be content 258 * inserted at that point. This is a hint used in corner cases within 259 * parsing of HTML attribute names and values where content we do not 260 * get to see could affect our parsing and alter our current state. 261 * 262 * <p>Returns {@code false} if and only if the parser encountered 263 * a fatal error which prevents it from continuing further parsing. 264 * 265 * <p>Note: The return value is different from the C++ Parser which 266 * always returns {@code true} but in my opinion makes more sense. 267 * 268 * @throws ParseException if an unrecoverable error occurred during parsing 269 */ insertText()270 public void insertText() throws ParseException; 271 272 /** 273 * Returns the state the Javascript parser is in. 274 * 275 * <p>See {@link JavascriptParser} for more information on the valid 276 * external states. The caller will typically first determine that the 277 * parser is processing Javascript and then invoke this method to 278 * obtain more fine-grained state information. 279 * 280 * @return external state of the javascript parser 281 */ getJavascriptState()282 public ExternalState getJavascriptState(); 283 } 284