1 /* 2 * Copyright (C) 2010 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.streamhtmlparser.impl; 18 19 import com.google.common.collect.Maps; 20 import com.google.streamhtmlparser.ExternalState; 21 import com.google.streamhtmlparser.JavascriptParser; 22 import com.google.streamhtmlparser.util.HtmlUtils; 23 import com.google.streamhtmlparser.util.JavascriptTokenBuffer; 24 25 import java.util.Map; 26 27 /** 28 * <p>Many comments copied almost verbatim from the original C version. 29 */ 30 public class JavascriptParserImpl extends GenericParser 31 implements JavascriptParser { 32 33 final static InternalState JS_TEXT; 34 final static InternalState JS_Q; 35 final static InternalState JS_Q_E; 36 final static InternalState JS_DQ; 37 final static InternalState JS_DQ_E; 38 final static InternalState JS_SLASH; 39 final static InternalState JS_REGEXP_SLASH; 40 final static InternalState JS_REGEXP; 41 final static InternalState JS_REGEXP_BRK; 42 final static InternalState JS_REGEXP_BRK_E; 43 final static InternalState JS_REGEXP_E; 44 final static InternalState JS_COM_LN; 45 final static InternalState JS_COM_ML; 46 final static InternalState JS_COM_ML_CLOSE; 47 final static InternalState JS_COM_AFTER; 48 49 static { 50 JS_TEXT = InternalState.getInstanceJavascript("JS_TEXT"); 51 JS_Q = InternalState.getInstanceJavascript("JS_Q"); 52 JS_Q_E = InternalState.getInstanceJavascript("JS_Q_E"); 53 JS_DQ = InternalState.getInstanceJavascript("JS_DQ"); 54 JS_DQ_E = InternalState.getInstanceJavascript("JS_DQ_E"); 55 JS_SLASH = InternalState.getInstanceJavascript("JS_SLASH"); 56 JS_REGEXP = InternalState.getInstanceJavascript("JS_REGEXP"); 57 JS_REGEXP_SLASH = InternalState.getInstanceJavascript("JS_REGEXP_SLASH"); 58 JS_REGEXP_E = InternalState.getInstanceJavascript("JS_REGEXP_E"); 59 JS_REGEXP_BRK = InternalState.getInstanceJavascript("JS_REGEXP_BRK"); 60 JS_REGEXP_BRK_E = InternalState.getInstanceJavascript("JS_REGEXP_BRK_E"); 61 JS_COM_LN = InternalState.getInstanceJavascript("COMMENT_LN"); 62 JS_COM_ML = InternalState.getInstanceJavascript("COMMENT_ML"); 63 JS_COM_ML_CLOSE = InternalState.getInstanceJavascript("COMMENT_ML_CLOSE"); 64 JS_COM_AFTER = InternalState.getInstanceJavascript("COMMENT_AFTER"); 65 } 66 67 private static final Map<InternalState, ExternalState> STATE_MAPPING = 68 Maps.newHashMap(); 69 static { initializeStateMapping()70 initializeStateMapping(); 71 } 72 73 private static final ParserStateTable STATE_TABLE = new ParserStateTable(); 74 static { initializeParserStateTable()75 initializeParserStateTable(); 76 } 77 78 private final JavascriptTokenBuffer ccBuffer; 79 80 /** 81 * Creates a {@code JavascriptParserImpl} object. 82 */ JavascriptParserImpl()83 public JavascriptParserImpl() { 84 super(STATE_TABLE, STATE_MAPPING, JS_TEXT); 85 ccBuffer = new JavascriptTokenBuffer(); 86 } 87 88 /** 89 * Creates a {@code JavascriptParserImpl} object that is a copy 90 * of the one provided. 91 * 92 * @param aJavascriptParserImpl the {@code JavascriptParserImpl} to copy 93 */ JavascriptParserImpl(JavascriptParserImpl aJavascriptParserImpl)94 public JavascriptParserImpl(JavascriptParserImpl aJavascriptParserImpl) { 95 super(aJavascriptParserImpl); 96 ccBuffer = new JavascriptTokenBuffer(aJavascriptParserImpl.ccBuffer); 97 } 98 99 @Override reset()100 public void reset() { 101 super.reset(); 102 currentState = JS_TEXT; 103 } 104 105 @Override handleEnterState(InternalState currentState, InternalState expectedNextState, char input)106 protected InternalState handleEnterState(InternalState currentState, 107 InternalState expectedNextState, 108 char input) { 109 InternalState nextState = expectedNextState; 110 if (currentState == JS_SLASH) { 111 nextState = enterStateJsSlash(currentState, input); 112 } else if (currentState == JS_COM_AFTER) { 113 enterStateJsCommentAfter(); 114 } 115 return nextState; 116 } 117 118 @Override handleExitState(InternalState currentState, InternalState expectedNextState, char input)119 protected InternalState handleExitState(InternalState currentState, 120 InternalState expectedNextState, 121 char input) { 122 // Nothing to do - no handlers for exit states 123 return expectedNextState; 124 } 125 126 @Override handleInState(InternalState currentState, char input)127 protected InternalState handleInState(InternalState currentState, 128 char input) { 129 if (currentState == JS_TEXT) { 130 inStateJsText(input); 131 } 132 return currentState; 133 } 134 135 /** 136 * Called every time we find a slash ('/') character in the javascript 137 * text (except for slashes that close comments or regexp literals). 138 * 139 * <p>Comment copied verbatim from the corresponding C-version. 140 * 141 * <p>Implements the logic to figure out if this slash character is a 142 * division operator or if it opens a regular expression literal. 143 * This is heavily inspired by the syntactic resynchronization 144 * for javascript 2.0: 145 * 146 * <p>When we receive a '/', we look at the previous non space character 147 * to figure out if it's the ending of a punctuator that can precede a 148 * regexp literal, in which case we assume the current '/' is part of a 149 * regular expression literal (or the opening of a javascript comment, 150 * but that part is dealt with in the state machine). The exceptions to 151 * this are unary operators, so we look back a second character to rule 152 * out '++' and '--'. 153 * 154 * <p> Although it is not straightforward to figure out if the binary 155 * operator is a postfix of the previous expression or a prefix of the 156 * regular expression, we rule out the later as it is an uncommon practice. 157 * 158 * <p>If we ruled out the previous token to be a valid regexp preceding 159 * punctuator, we extract the last identifier in the buffer and match 160 * against a list of keywords that are known to precede expressions in 161 * the grammar. If we get a match on any of these keywords, then we are 162 * opening a regular expression, if not, then we have a division operator. 163 * 164 * <p>Known cases that are accepted by the grammar but we handle 165 * differently, although I (falmeida) don't believe there is a 166 * legitimate usage for those: 167 * Division of a regular expression: var result = /test/ / 5; 168 * Prefix unary increment of a regular expression: var result = ++/test/; 169 * Division of an object literal: { a: 1 } /x/.exec('x'); 170 * 171 * @param state being entered to 172 * @param input character being processed 173 * @return state next state to go to, may be the same as the one we 174 * were called with 175 * 176 * <a>http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html> 177 * Syntactic Resynchronization</a> 178 */ enterStateJsSlash(InternalState state, char input)179 private InternalState enterStateJsSlash(InternalState state, char input) { 180 181 InternalState nextState = state; 182 int position = -1; 183 184 // Consume the last whitespace 185 if (HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(position))) { 186 --position; 187 } 188 189 switch (ccBuffer.getChar(position)) { 190 // Ignore unary increment 191 case '+': 192 if (ccBuffer.getChar(position - 1) != '+') { 193 nextState = JS_REGEXP_SLASH; 194 } 195 break; 196 case '-': 197 // Ignore unary decrement 198 if (ccBuffer.getChar(position - 1) != '-') { 199 nextState = JS_REGEXP_SLASH; 200 } 201 break; 202 // List of punctuator endings except ), ], }, + and - * 203 case '=': 204 case '<': 205 case '>': 206 case '&': 207 case '|': 208 case '!': 209 case '%': 210 case '*': 211 case '/': 212 case ',': 213 case ';': 214 case '?': 215 case ':': 216 case '^': 217 case '~': 218 case '{': 219 case '(': 220 case '[': 221 case '}': 222 case '\0': 223 nextState = JS_REGEXP_SLASH; 224 break; 225 default: 226 String lastIdentifier = ccBuffer.getLastIdentifier(); 227 if (lastIdentifier != null && HtmlUtils 228 .isJavascriptRegexpPrefix(lastIdentifier)) { 229 nextState = JS_REGEXP_SLASH; 230 } 231 } 232 ccBuffer.appendChar(input); 233 return nextState; 234 } 235 236 /** 237 * Called at the end of a javascript comment. 238 * 239 * <p>When we open a comment, the initial '/' was inserted into the ring 240 * buffer, but it is not a token and should be considered whitespace 241 * for parsing purposes. 242 * 243 * <p>When we first saw the '/' character, we didn't yet know if it was 244 * the beginning of a comment, a division operator, or a regexp. 245 * 246 * <p>In this function we just replace the inital '/' with a whitespace 247 * character, unless we had a preceding whitespace character, in which 248 * case we just remove the '/'. This is needed to ensure all spaces in 249 * the buffer are correctly folded. 250 */ enterStateJsCommentAfter()251 private void enterStateJsCommentAfter() { 252 if (HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(-2))) { 253 ccBuffer.popChar(); 254 } else { 255 ccBuffer.setChar(-1, ' '); 256 } 257 } 258 inStateJsText(char input)259 private void inStateJsText(char input) { 260 ccBuffer.appendChar(input); 261 } 262 263 // ======================================================= // 264 // SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE. // 265 // ======================================================= // 266 registerMapping(InternalState internalState, ExternalState externalState)267 private static void registerMapping(InternalState internalState, 268 ExternalState externalState) { 269 STATE_MAPPING.put(internalState, externalState); 270 } 271 initializeStateMapping()272 private static void initializeStateMapping() { 273 // Each parser implementation must map the error state appropriately. 274 registerMapping(InternalState.INTERNAL_ERROR_STATE, 275 JavascriptParser.STATE_ERROR); 276 277 registerMapping(JS_TEXT, JavascriptParser.STATE_TEXT); 278 registerMapping(JS_Q, JavascriptParser.STATE_Q); 279 registerMapping(JS_Q_E, JavascriptParser.STATE_Q); 280 registerMapping(JS_DQ, JavascriptParser.STATE_DQ); 281 registerMapping(JS_DQ_E, JavascriptParser.STATE_DQ); 282 registerMapping(JS_SLASH, JavascriptParser.STATE_TEXT); 283 registerMapping(JS_REGEXP_SLASH, JavascriptParser.STATE_TEXT); 284 registerMapping(JS_REGEXP, JavascriptParser.STATE_REGEXP); 285 registerMapping(JS_REGEXP_BRK,JavascriptParser.STATE_REGEXP); 286 registerMapping(JS_REGEXP_BRK_E, JavascriptParser.STATE_REGEXP); 287 registerMapping(JS_REGEXP_E,JavascriptParser.STATE_REGEXP); 288 registerMapping(JS_COM_LN, JavascriptParser.STATE_COMMENT); 289 registerMapping(JS_COM_ML, JavascriptParser.STATE_COMMENT); 290 registerMapping(JS_COM_ML_CLOSE, JavascriptParser.STATE_COMMENT); 291 registerMapping(JS_COM_AFTER, JavascriptParser.STATE_TEXT); 292 } 293 registerTransition(String expression, InternalState source, InternalState to)294 private static void registerTransition(String expression, 295 InternalState source, 296 InternalState to) { 297 // It seems to silly to go through a StateTableTransition here 298 // but it adds extra data checking. 299 StateTableTransition stt = new StateTableTransition(expression, 300 source, to); 301 STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(), 302 stt.getTo()); 303 } 304 initializeParserStateTable()305 private static void initializeParserStateTable() { 306 registerTransition("[:default:]", JS_COM_AFTER, JS_TEXT); 307 registerTransition("/", JS_COM_AFTER, JS_SLASH); 308 registerTransition("\"", JS_COM_AFTER, JS_DQ); 309 registerTransition("\'", JS_COM_AFTER, JS_Q); 310 registerTransition("[:default:]", JS_COM_ML_CLOSE, JS_COM_ML); 311 registerTransition("/", JS_COM_ML_CLOSE,JS_COM_AFTER); 312 registerTransition("[:default:]", JS_COM_ML, JS_COM_ML); 313 registerTransition("*", JS_COM_ML, JS_COM_ML_CLOSE); 314 registerTransition("[:default:]", JS_COM_LN,JS_COM_LN); 315 registerTransition("\n", JS_COM_LN,JS_COM_AFTER); 316 registerTransition("[:default:]", JS_REGEXP_E, JS_REGEXP); 317 registerTransition("[:default:]", JS_REGEXP_BRK_E, JS_REGEXP_BRK); 318 registerTransition("[:default:]", JS_REGEXP_BRK, JS_REGEXP_BRK); 319 registerTransition("]", JS_REGEXP_BRK, JS_REGEXP); 320 registerTransition("\\", JS_REGEXP_BRK, JS_REGEXP_BRK_E); 321 registerTransition("[:default:]", JS_REGEXP, JS_REGEXP); 322 registerTransition("/", JS_REGEXP, JS_TEXT); 323 registerTransition("[", JS_REGEXP, JS_REGEXP_BRK); 324 registerTransition("\\", JS_REGEXP, JS_REGEXP_E); 325 registerTransition("[:default:]", JS_REGEXP_SLASH, JS_REGEXP); 326 registerTransition("[", JS_REGEXP_SLASH, JS_REGEXP_BRK); 327 registerTransition("\\", JS_REGEXP_SLASH, JS_REGEXP_E); 328 registerTransition("*", JS_REGEXP_SLASH, JS_COM_ML); 329 registerTransition("/", JS_REGEXP_SLASH, JS_COM_LN); 330 registerTransition("[:default:]", JS_SLASH, JS_TEXT); 331 registerTransition("*", JS_SLASH, JS_COM_ML); 332 registerTransition("/", JS_SLASH, JS_COM_LN); 333 registerTransition("[:default:]", JS_DQ_E,JS_DQ); 334 registerTransition("[:default:]", JS_DQ,JS_DQ); 335 registerTransition("\"", JS_DQ, JS_TEXT); 336 registerTransition("\\", JS_DQ, JS_DQ_E); 337 registerTransition("[:default:]", JS_Q_E,JS_Q); 338 registerTransition("[:default:]", JS_Q,JS_Q); 339 registerTransition("\'", JS_Q, JS_TEXT); 340 registerTransition("\\", JS_Q, JS_Q_E); 341 registerTransition("[:default:]", JS_TEXT, JS_TEXT); 342 registerTransition("/", JS_TEXT, JS_SLASH); 343 registerTransition("\"", JS_TEXT, JS_DQ); 344 registerTransition("\'", JS_TEXT, JS_Q); 345 } 346 }