/* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.streamhtmlparser.impl; import com.google.common.base.Preconditions; import com.google.common.collect.Maps; import com.google.streamhtmlparser.ExternalState; import com.google.streamhtmlparser.HtmlParser; import com.google.streamhtmlparser.ParseException; import com.google.streamhtmlparser.util.CharacterRecorder; import com.google.streamhtmlparser.util.EntityResolver; import com.google.streamhtmlparser.util.HtmlUtils; import java.util.Map; /** * A custom specialized parser - ported from the main C++ version - used to * implement context-aware escaping of run-time data in web-application * templates. * *
This is the main class in the package. It implements the * {@code HtmlParser} interface. * *
This class is not thread-safe; in particular you cannot invoke any * state changing operations (such as {@code parse}) from multiple threads * on the same object. * *
If you are looking at this class, chances are very high you are
* implementing Auto-Escaping for a new template system. Please see the
* landing page including a design document at
* Auto-Escape Landing Page.
*/
public class HtmlParserImpl extends GenericParser implements HtmlParser {
/*
* Internal representation of the parser state, which is at a
* finer-granularity than the external state as given to callers.
* The relationship between {@code InternalState} and {@code ExternalState}
* is many-to-one.
*
* Both for performance reasons and to leverage a state-flow machine
* that is automatically generated from Python for multiple target
* languages, this object uses a static {@code ParserStateTable} that
* is read-only and obtained from the generated code in {@code HtmlParserFsm}.
* That code also maintains the mapping from internal states
* ({@code InternalState}) to external states ({@code ExternalState}).
*/
/**
 * Creates a fresh parser. The static state table, the state mapping and
 * the {@code TEXT} state are handed to the {@code GenericParser}
 * constructor; every recorder and helper parser starts out empty and the
 * attribute-value bookkeeping is zeroed.
 */
public HtmlParserImpl() {
  super(STATE_TABLE, STATE_MAPPING, TEXT);

  // Scalar bookkeeping first.
  this.insideJavascript = false;
  this.textInsideUrlValue = false;
  this.valueIndex = 0;

  // Recorders for the pieces of markup callers can query.
  this.tag = new CharacterRecorder();
  this.attr = new CharacterRecorder();
  this.value = new CharacterRecorder();
  this.cdataCloseTag = new CharacterRecorder();

  // Helpers used while inside javascript content.
  this.entityResolver = new EntityResolver();
  this.jsParser = new JavascriptParserImpl();
}
/**
* Creates an {@code HtmlParserImpl} that is a copy of the one provided.
*
* @param aHtmlParserImpl the {@code HtmlParserImpl} object to copy
*/
public HtmlParserImpl(HtmlParserImpl aHtmlParserImpl) {
super(aHtmlParserImpl);
tag = new CharacterRecorder(aHtmlParserImpl.tag);
attr = new CharacterRecorder(aHtmlParserImpl.attr);
value = new CharacterRecorder(aHtmlParserImpl.value);
cdataCloseTag = new CharacterRecorder(aHtmlParserImpl.cdataCloseTag);
entityResolver = new EntityResolver(aHtmlParserImpl.entityResolver);
jsParser = new JavascriptParserImpl(aHtmlParserImpl.jsParser);
insideJavascript = aHtmlParserImpl.insideJavascript;
valueIndex = aHtmlParserImpl.valueIndex;
textInsideUrlValue = aHtmlParserImpl.textInsideUrlValue;
}
/**
 * Reports whether the parser is currently inside javascript content.
 * That requires the javascript flag to be set and the parser to be either
 * in an attribute value, in one of the CDATA states, or in a JS file.
 */
@Override
public boolean inJavascript() {
  if (!insideJavascript) {
    return false;
  }
  if (getState() == STATE_VALUE) {
    return true;
  }
  final InternalState internal = currentState;
  return internal == CDATA_TEXT
      || internal == CDATA_COM_START
      || internal == CDATA_COM_START_DASH
      || internal == CDATA_COM_BODY
      || internal == CDATA_COM_DASH
      || internal == CDATA_COM_DASH_DASH
      || internal == CDATA_LT
      || internal == CDATA_MAY_CLOSE
      || internal == JS_FILE;
}
/**
 * Reports whether the parser is inside javascript and the javascript
 * parser is in its single-quoted ({@code STATE_Q}) or double-quoted
 * ({@code STATE_DQ}) state.
 */
@Override
public boolean isJavascriptQuoted() {
  if (!inJavascript()) {
    return false;
  }
  final ExternalState jsState = jsParser.getState();
  return jsState == JavascriptParserImpl.STATE_Q
      || jsState == JavascriptParserImpl.STATE_DQ;
}
/**
 * Reports whether the external state is an attribute name
 * ({@code STATE_ATTR}) or an attribute value ({@code STATE_VALUE}).
 */
@Override
public boolean inAttribute() {
  final ExternalState external = getState();
  if (external == null) {
    return false;
  }
  return external == STATE_ATTR || external == STATE_VALUE;
}
/**
 * Resets the state of the parser to a state consistent with the
 * {@code Mode} provided. All finer-grained state (recorders, value index,
 * URL-text flag, javascript parser) is put back to its default, so use
 * this only when you want to parse text from a very clean slate.
 *
 * <p>See the {@link HtmlParser.Mode} enum for information on all
 * the valid modes.
 *
 * @param mode is an enum representing the high-level state of the parser
 * @throws IllegalArgumentException if {@code mode} is not one of the
 *     modes handled here
 */
@Override
public void resetMode(Mode mode) {
  // Wipe the fine-grained state before choosing the start state.
  insideJavascript = false;
  tag.reset();
  attr.reset();
  value.reset();
  cdataCloseTag.reset();
  valueIndex = 0;
  textInsideUrlValue = false;
  jsParser.reset();

  final InternalState startState;
  switch (mode) {
    case HTML:
      startState = TEXT;
      break;
    case JS:
      startState = JS_FILE;
      break;
    case CSS:
      startState = CSS_FILE;
      break;
    case HTML_IN_TAG:
      startState = TAG_SPACE;
      break;
    default:
      throw new IllegalArgumentException("Did not recognize Mode: " +
          mode.toString());
  }
  currentState = startState;
  // Only the JS mode starts out inside javascript.
  if (mode == Mode.JS) {
    insideJavascript = true;
  }
}
/**
 * Puts the parser back into its initial HTML-parsing state: first the
 * superclass state is reset, then the parser is switched to
 * {@code Mode.HTML}.
 */
public void reset() {
  super.reset();
  this.resetMode(Mode.HTML);
}
/**
 * Advances the position within the attribute value and, when the value
 * holds javascript, feeds the character to the javascript parser after
 * entity resolution. While the entity resolver is still in the middle of
 * an entity (neither {@code COMPLETED} nor {@code NOT_STARTED}), the
 * javascript parser is not invoked.
 *
 * Called for every character inside an attribute value.
 *
 * @param input character read
 * @throws ParseException if an unrecoverable error occurred during parsing
 */
private void inStateValue(char input) throws ParseException {
  valueIndex++;
  if (!insideJavascript) {
    return;
  }

  final EntityResolver.Status resolution = entityResolver.processChar(input);
  if (resolution == EntityResolver.Status.COMPLETED) {
    // A whole entity was decoded: hand over the decoded text instead.
    jsParser.parse(entityResolver.getEntity());
    entityResolver.reset();
    return;
  }
  if (resolution == EntityResolver.Status.NOT_STARTED) {
    jsParser.parse(input);
  }
}
/**
 * Reacts to the tag that was just read.
 *
 * A {@code script} tag resets the javascript parser, turns the javascript
 * flag on and moves to {@code CDATA_TEXT}. The {@code style},
 * {@code title} and {@code textarea} tags also move to {@code CDATA_TEXT}
 * but turn the javascript flag off. For code simplification, CDATA and
 * RCDATA sections are treated the same. Any other tag leaves both the
 * state and the flag untouched.
 *
 * Called when the parser leaves a tag definition.
 *
 * @param state current state
 * @return state next state, could be the same as current state
 */
private InternalState tagClose(InternalState state) {
  final String name = getTag();

  if ("script".equals(name)) {
    jsParser.reset();
    insideJavascript = true;
    return CDATA_TEXT;
  }

  final boolean otherCdataTag = "style".equals(name)
      || "title".equals(name)
      || "textarea".equals(name);
  if (otherCdataTag) {
    insideJavascript = false;
    return CDATA_TEXT;
  }

  return state;
}
/**
 * Passes the character on to the javascript parser, but only while the
 * javascript flag is set; otherwise the character is ignored here.
 *
 * Called inside CDATA blocks to parse javascript.
 *
 * @param input character read
 * @throws ParseException if an unrecoverable error occurred during parsing
 */
private void inStateCdata(char input) throws ParseException {
  if (!insideJavascript) {
    return;
  }
  jsParser.parse(input);
}
/**
 * Begins recording the candidate closing tag name, so that it can later
 * be compared with the open tag to decide whether the CDATA section ends.
 *
 * Called when encountering a '<' character in a CDATA section.
 */
private void enterStateCdataMayClose() {
  this.cdataCloseTag.startRecording();
}
/**
 * Decides whether the CDATA element is being closed. It is closed when
 * the recorded text, minus its leading '/', matches the current tag name
 * (ignoring case) and the character read is '>' or HTML whitespace. In
 * that case the tag is cleared and the javascript flag is turned off;
 * otherwise parsing continues in {@code CDATA_TEXT}.
 *
 * Called when reading what could be a closing CDATA tag.
 *
 * @param expectedNextState the expected state to go to next
 *     unless we want to change it here
 * @param input the character read
 * @return the next state to go to
 */
private InternalState exitStateCdataMayClose(
    InternalState expectedNextState,
    char input) {
  cdataCloseTag.stopRecording();
  final String recorded = cdataCloseTag.getContent();
  // Developer error if the recording does not begin with '/'.
  Preconditions.checkState(recorded.startsWith("/"));

  final boolean closesCurrentTag =
      recorded.substring(1).equalsIgnoreCase(getTag())
      && (input == '>' || HtmlUtils.isHtmlSpace(input));
  if (!closesCurrentTag) {
    return CDATA_TEXT;
  }

  tag.clear();
  insideJavascript = false;
  return expectedNextState;
}
// ======================================================= //
// SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE. //
// ======================================================= //
/**
 * Records in {@code STATE_MAPPING} which external state an internal
 * state is reported as.
 *
 * @param internalState the internal state used as the key
 * @param externalState the external state it maps to
 */
private static void registerMapping(
    InternalState internalState, ExternalState externalState) {
  STATE_MAPPING.put(internalState, externalState);
}
/**
 * Fills {@code STATE_MAPPING} with one entry per internal state, giving
 * the external state reported to callers for it. Several internal states
 * share the same external state (many-to-one).
 */
private static void initializeStateMapping() {
// Each parser implementation must map the error state appropriately.
registerMapping(InternalState.INTERNAL_ERROR_STATE, HtmlParser.STATE_ERROR);
registerMapping(TEXT, HtmlParser.STATE_TEXT);
// Tag start and tag name are both reported as being inside a tag.
registerMapping(TAG_START, HtmlParser.STATE_TAG);
registerMapping(TAG_NAME, HtmlParser.STATE_TAG);
// Declarations are reported as plain text.
registerMapping(DECL_START, HtmlParser.STATE_TEXT);
registerMapping(DECL_BODY, HtmlParser.STATE_TEXT);
// COM_OPEN is still reported as text; the remaining comment states are
// reported as comment.
registerMapping(COM_OPEN, HtmlParser.STATE_TEXT);
registerMapping(COM_BODY, HtmlParser.STATE_COMMENT);
registerMapping(COM_DASH, HtmlParser.STATE_COMMENT);
registerMapping(COM_DASH_DASH, HtmlParser.STATE_COMMENT);
// Processing instructions are reported as plain text.
registerMapping(PI, HtmlParser.STATE_TEXT);
registerMapping(PI_MAY_END, HtmlParser.STATE_TEXT);
registerMapping(TAG_SPACE, HtmlParser.STATE_TAG);
registerMapping(TAG_CLOSE, HtmlParser.STATE_TEXT);
// Attribute names.
registerMapping(ATTR, HtmlParser.STATE_ATTR);
registerMapping(ATTR_SPACE, HtmlParser.STATE_ATTR);
// Every flavour of attribute value (unquoted, single- and double-quoted).
registerMapping(VALUE, HtmlParser.STATE_VALUE);
registerMapping(VALUE_TEXT, HtmlParser.STATE_VALUE);
registerMapping(VALUE_Q_START, HtmlParser.STATE_VALUE);
registerMapping(VALUE_Q, HtmlParser.STATE_VALUE);
registerMapping(VALUE_DQ_START, HtmlParser.STATE_VALUE);
registerMapping(VALUE_DQ, HtmlParser.STATE_VALUE);
// All CDATA states are reported as plain text.
registerMapping(CDATA_COM_START, HtmlParser.STATE_TEXT);
registerMapping(CDATA_COM_START_DASH, HtmlParser.STATE_TEXT);
registerMapping(CDATA_COM_BODY, HtmlParser.STATE_TEXT);
registerMapping(CDATA_COM_DASH, HtmlParser.STATE_TEXT);
registerMapping(CDATA_COM_DASH_DASH, HtmlParser.STATE_TEXT);
registerMapping(CDATA_TEXT, HtmlParser.STATE_TEXT);
registerMapping(CDATA_LT, HtmlParser.STATE_TEXT);
registerMapping(CDATA_MAY_CLOSE, HtmlParser.STATE_TEXT);
// Whole-file modes have dedicated external states.
registerMapping(JS_FILE, HtmlParser.STATE_JS_FILE);
registerMapping(CSS_FILE, HtmlParser.STATE_CSS_FILE);
}
/**
 * Adds one transition to {@code STATE_TABLE}.
 *
 * @param expression the input expression that triggers the transition
 * @param source the state the transition starts from
 * @param to the state the transition leads to
 */
private static void registerTransition(String expression,
    InternalState source,
    InternalState to) {
  // Routing the arguments through a StateTableTransition may seem silly,
  // but it adds extra data checking.
  final StateTableTransition transition =
      new StateTableTransition(expression, source, to);
  STATE_TABLE.setExpression(
      transition.getExpression(), transition.getFrom(), transition.getTo());
}
// NOTE: The "[:default:]" transition should be registered before any
// other transitions for a given state or it will over-write them.
//
// Populates STATE_TABLE with every transition of the HTML state machine.
// Each call reads as (input expression, source state, destination state).
// Transitions are grouped by source state; within a group the
// "[:default:]" entry, when there is one, comes first (see NOTE above).
// NOTE(review): TAG_NAME, TAG_SPACE, ATTR and ATTR_SPACE register no
// "[:default:]" entry; how the table treats unmatched input for them is
// not visible here - confirm against ParserStateTable.
private static void initializeParserStateTable() {
// Whole-file modes: stay in the same state for any input.
registerTransition("[:default:]", CSS_FILE, CSS_FILE);
registerTransition("[:default:]", JS_FILE, JS_FILE);
// CDATA_MAY_CLOSE: reading what could be a closing tag inside CDATA.
registerTransition("[:default:]", CDATA_MAY_CLOSE, CDATA_TEXT);
registerTransition(" \t\n\r", CDATA_MAY_CLOSE, TAG_SPACE);
registerTransition(">", CDATA_MAY_CLOSE, TEXT);
registerTransition("A-Za-z0-9/_:-", CDATA_MAY_CLOSE, CDATA_MAY_CLOSE);
// CDATA_LT: a '<' was seen inside CDATA.
registerTransition("[:default:]", CDATA_LT, CDATA_TEXT);
registerTransition("!", CDATA_LT, CDATA_COM_START);
registerTransition("/", CDATA_LT, CDATA_MAY_CLOSE);
// CDATA_TEXT: plain CDATA content.
registerTransition("[:default:]", CDATA_TEXT, CDATA_TEXT);
registerTransition("<", CDATA_TEXT, CDATA_LT);
// Comment handling inside CDATA.
registerTransition("[:default:]", CDATA_COM_DASH_DASH, CDATA_COM_BODY);
registerTransition(">", CDATA_COM_DASH_DASH, CDATA_TEXT);
registerTransition("-", CDATA_COM_DASH_DASH, CDATA_COM_DASH_DASH);
registerTransition("[:default:]", CDATA_COM_DASH, CDATA_COM_BODY);
registerTransition("-", CDATA_COM_DASH, CDATA_COM_DASH_DASH);
registerTransition("[:default:]", CDATA_COM_BODY, CDATA_COM_BODY);
registerTransition("-", CDATA_COM_BODY, CDATA_COM_DASH);
registerTransition("[:default:]", CDATA_COM_START_DASH, CDATA_TEXT);
registerTransition("-", CDATA_COM_START_DASH, CDATA_COM_BODY);
registerTransition("[:default:]", CDATA_COM_START, CDATA_TEXT);
registerTransition("-", CDATA_COM_START, CDATA_COM_START_DASH);
// Double-quoted attribute values end at the closing double quote.
registerTransition("[:default:]", VALUE_DQ, VALUE_DQ);
registerTransition("\"", VALUE_DQ, TAG_SPACE);
registerTransition("[:default:]", VALUE_DQ_START, VALUE_DQ);
registerTransition("\"", VALUE_DQ_START, TAG_SPACE);
// Single-quoted attribute values end at the closing single quote.
registerTransition("[:default:]", VALUE_Q, VALUE_Q);
registerTransition("\'", VALUE_Q, TAG_SPACE);
registerTransition("[:default:]", VALUE_Q_START, VALUE_Q);
registerTransition("\'", VALUE_Q_START, TAG_SPACE);
// Unquoted attribute values end at whitespace or '>'.
registerTransition("[:default:]", VALUE_TEXT, VALUE_TEXT);
registerTransition(" \t\n\r", VALUE_TEXT, TAG_SPACE);
registerTransition(">", VALUE_TEXT, TAG_CLOSE);
// VALUE: just after '=', before the value itself starts.
registerTransition("[:default:]", VALUE, VALUE_TEXT);
registerTransition(">", VALUE, TAG_CLOSE);
registerTransition(" \t\n\r", VALUE, VALUE);
registerTransition("\"", VALUE, VALUE_DQ_START);
registerTransition("\'", VALUE, VALUE_Q_START);
// ATTR_SPACE: whitespace after an attribute name (no default entry).
registerTransition("=", ATTR_SPACE, VALUE);
registerTransition("/", ATTR_SPACE, TAG_SPACE);
registerTransition("A-Za-z0-9_:-", ATTR_SPACE, ATTR);
registerTransition(" \t\n\r", ATTR_SPACE, ATTR_SPACE);
registerTransition(">", ATTR_SPACE, TAG_CLOSE);
// ATTR: inside an attribute name (no default entry).
registerTransition(" \t\n\r", ATTR, ATTR_SPACE);
registerTransition("=", ATTR, VALUE);
registerTransition("/", ATTR, TAG_SPACE);
registerTransition(">", ATTR, TAG_CLOSE);
registerTransition("A-Za-z0-9_:.-", ATTR, ATTR);
// TAG_CLOSE: just after the '>' that ends a tag.
registerTransition("[:default:]", TAG_CLOSE, TEXT);
registerTransition("<", TAG_CLOSE, TAG_START);
// TAG_SPACE: between the tag name and attributes (no default entry).
registerTransition("/", TAG_SPACE, TAG_SPACE);
registerTransition("A-Za-z0-9_:-", TAG_SPACE, ATTR);
registerTransition(" \t\n\r", TAG_SPACE, TAG_SPACE);
registerTransition(">", TAG_SPACE, TAG_CLOSE);
// Processing instructions end at "?>".
registerTransition("[:default:]", PI_MAY_END, PI);
registerTransition(">", PI_MAY_END, TEXT);
registerTransition("[:default:]", PI, PI);
registerTransition("?", PI, PI_MAY_END);
// HTML comments outside CDATA.
registerTransition("[:default:]", COM_DASH_DASH, COM_BODY);
registerTransition(">", COM_DASH_DASH, TEXT);
registerTransition("-", COM_DASH_DASH, COM_DASH_DASH);
registerTransition("[:default:]", COM_DASH, COM_BODY);
registerTransition("-", COM_DASH, COM_DASH_DASH);
registerTransition("[:default:]", COM_BODY, COM_BODY);
registerTransition("-", COM_BODY, COM_DASH);
registerTransition("[:default:]", COM_OPEN, TEXT);
registerTransition("-", COM_OPEN, COM_BODY);
// Declarations ("<!...") end at '>'; a '-' right after "<!" opens a comment.
registerTransition("[:default:]", DECL_BODY, DECL_BODY);
registerTransition(">", DECL_BODY, TEXT);
registerTransition("[:default:]", DECL_START, DECL_BODY);
registerTransition(">", DECL_START, TEXT);
registerTransition("-", DECL_START, COM_OPEN);
// TAG_NAME: inside the tag name (no default entry).
registerTransition(">", TAG_NAME, TAG_CLOSE);
registerTransition(" \t\n\r", TAG_NAME, TAG_SPACE);
registerTransition("A-Za-z0-9/_:-", TAG_NAME, TAG_NAME);
// Manual change to remain in-sync with CL 10597850 in C HtmlParser.
registerTransition("[:default:]", TAG_START, TEXT);
registerTransition("<", TAG_START, TAG_START);
// End of manual change.
registerTransition("!", TAG_START, DECL_START);
registerTransition("?", TAG_START, PI);
registerTransition("A-Za-z0-9/_:-", TAG_START, TAG_NAME);
// TEXT: plain text until a '<' starts a tag.
registerTransition("[:default:]", TEXT, TEXT);
registerTransition("<", TEXT, TAG_START);
}
}
/*
 * The relationship between InternalState and ExternalState is a
 * many-to-one relationship.
 */
// One constant per internal state of the HTML state machine. They are
// declared blank here and each is assigned exactly once in the static
// initializer that follows.
private static final InternalState TEXT;
private static final InternalState TAG_START;
private static final InternalState TAG_NAME;
private static final InternalState DECL_START;
private static final InternalState DECL_BODY;
private static final InternalState COM_OPEN;
private static final InternalState COM_BODY;
private static final InternalState COM_DASH;
private static final InternalState COM_DASH_DASH;
private static final InternalState PI;
private static final InternalState PI_MAY_END;
private static final InternalState TAG_SPACE;
private static final InternalState TAG_CLOSE;
private static final InternalState ATTR;
private static final InternalState ATTR_SPACE;
private static final InternalState VALUE;
private static final InternalState VALUE_TEXT;
private static final InternalState VALUE_Q_START;
private static final InternalState VALUE_Q;
private static final InternalState VALUE_DQ_START;
private static final InternalState VALUE_DQ;
private static final InternalState CDATA_COM_START;
private static final InternalState CDATA_COM_START_DASH;
private static final InternalState CDATA_COM_BODY;
private static final InternalState CDATA_COM_DASH;
private static final InternalState CDATA_COM_DASH_DASH;
private static final InternalState CDATA_TEXT;
private static final InternalState CDATA_LT;
private static final InternalState CDATA_MAY_CLOSE;
private static final InternalState JS_FILE;
private static final InternalState CSS_FILE;
// Binds every InternalState constant to the instance that
// InternalState.getInstanceHtml returns for the constant's own name.
// NOTE(review): the call order is kept as-is in case instance creation is
// order-sensitive - confirm against InternalState.
static {
TEXT = InternalState.getInstanceHtml("TEXT");
TAG_START = InternalState.getInstanceHtml("TAG_START");
TAG_NAME = InternalState.getInstanceHtml("TAG_NAME");
DECL_START = InternalState.getInstanceHtml("DECL_START");
DECL_BODY = InternalState.getInstanceHtml("DECL_BODY");
COM_OPEN = InternalState.getInstanceHtml("COM_OPEN");
COM_BODY = InternalState.getInstanceHtml("COM_BODY");
COM_DASH = InternalState.getInstanceHtml("COM_DASH");
COM_DASH_DASH = InternalState.getInstanceHtml("COM_DASH_DASH");
PI =InternalState.getInstanceHtml("PI");
PI_MAY_END = InternalState.getInstanceHtml("PI_MAY_END");
TAG_SPACE = InternalState.getInstanceHtml("TAG_SPACE");
TAG_CLOSE = InternalState.getInstanceHtml("TAG_CLOSE");
ATTR = InternalState.getInstanceHtml("ATTR");
ATTR_SPACE = InternalState.getInstanceHtml("ATTR_SPACE");
VALUE = InternalState.getInstanceHtml("VALUE");
VALUE_TEXT = InternalState.getInstanceHtml("VALUE_TEXT");
VALUE_Q_START = InternalState.getInstanceHtml("VALUE_Q_START");
VALUE_Q = InternalState.getInstanceHtml("VALUE_Q");
VALUE_DQ_START = InternalState.getInstanceHtml("VALUE_DQ_START");
VALUE_DQ = InternalState.getInstanceHtml("VALUE_DQ");
CDATA_COM_START = InternalState.getInstanceHtml("CDATA_COM_START");
CDATA_COM_START_DASH =
InternalState.getInstanceHtml("CDATA_COM_START_DASH");
CDATA_COM_BODY = InternalState.getInstanceHtml("CDATA_COM_BODY");
CDATA_COM_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH");
CDATA_COM_DASH_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH_DASH");
CDATA_TEXT = InternalState.getInstanceHtml("CDATA_TEXT");
CDATA_LT = InternalState.getInstanceHtml("CDATA_LT");
CDATA_MAY_CLOSE = InternalState.getInstanceHtml("CDATA_MAY_CLOSE");
JS_FILE = InternalState.getInstanceHtml("JS_FILE");
CSS_FILE = InternalState.getInstanceHtml("CSS_FILE");
}
private static final Map
*
*
* @return {@code true} if and only if the parser is inside CSS
*/
@Override
public boolean inCss() {
return (currentState == CSS_FILE
|| (getState() == STATE_VALUE
&& (getAttributeType() == ATTR_TYPE.STYLE))
|| ("style".equals(getTag())));
}
/**
 * Classifies the attribute the parser is currently in.
 *
 * @return {@code NONE} when not inside an attribute; otherwise
 *     {@code JS}, {@code URI} or {@code STYLE} as determined by
 *     {@code HtmlUtils}; {@code URI} for a {@code meta} tag's
 *     {@code content} attribute whose value parses as a URL redirect;
 *     and {@code REGULAR} in every other case
 */
@Override
public ATTR_TYPE getAttributeType() {
  final String attributeName = getAttribute();
  if (!inAttribute()) {
    return ATTR_TYPE.NONE;
  }

  if (HtmlUtils.isAttributeJavascript(attributeName)) {
    return ATTR_TYPE.JS;
  }
  if (HtmlUtils.isAttributeUri(attributeName)) {
    return ATTR_TYPE.URI;
  }
  if (HtmlUtils.isAttributeStyle(attributeName)) {
    return ATTR_TYPE.STYLE;
  }

  // Special logic to handle the "content" attribute of the "meta" tag.
  if ("meta".equals(getTag()) && "content".equals(getAttribute())) {
    final HtmlUtils.META_REDIRECT_TYPE redirect =
        HtmlUtils.parseContentAttributeForUrl(getValue());
    final boolean holdsUrl =
        redirect == HtmlUtils.META_REDIRECT_TYPE.URL_START
        || redirect == HtmlUtils.META_REDIRECT_TYPE.URL;
    if (holdsUrl) {
      return ATTR_TYPE.URI;
    }
  }

  return ATTR_TYPE.REGULAR;
}
/**
 * Exposes the current state of the embedded javascript parser.
 *
 * @return whatever {@code jsParser.getState()} reports
 */
@Override
public ExternalState getJavascriptState() {
  return this.jsParser.getState();
}
/**
 * Reports whether the parser is inside a quoted attribute value, i.e. in
 * one of the single-quoted or double-quoted value states.
 */
@Override
public boolean isAttributeQuoted() {
  final InternalState internal = currentState;
  final boolean singleQuoted =
      internal == VALUE_Q_START || internal == VALUE_Q;
  final boolean doubleQuoted =
      internal == VALUE_DQ_START || internal == VALUE_DQ;
  return singleQuoted || doubleQuoted;
}
/**
 * Returns the recorded tag name in lower case.
 *
 * <p>The conversion uses {@code Locale.ROOT} rather than the JVM default
 * locale. Under a Turkish default locale, for example, "SCRIPT" would
 * otherwise lower-case to a string containing a dotless i and the
 * comparisons against "script", "style" and "meta" made elsewhere in this
 * class would silently fail.
 *
 * @return the lower-cased content of the tag recorder
 */
@Override
public String getTag() {
  return tag.getContent().toLowerCase(java.util.Locale.ROOT);
}
/**
 * Returns the recorded attribute name in lower case, or the empty string
 * when the parser is not inside an attribute.
 *
 * <p>The conversion uses {@code Locale.ROOT} rather than the JVM default
 * locale, so that names containing 'I' (such as "ONCLICK") lower-case the
 * same way everywhere and attribute classification does not depend on
 * where the code runs.
 *
 * @return the lower-cased attribute name, or {@code ""} outside attributes
 */
@Override
public String getAttribute() {
  if (!inAttribute()) {
    return "";
  }
  return attr.getContent().toLowerCase(java.util.Locale.ROOT);
}
/**
 * Returns the attribute value recorded so far, or the empty string when
 * the parser is not inside an attribute value.
 */
@Override
public String getValue() {
  if (getState() == STATE_VALUE) {
    return value.getContent();
  }
  return "";
}
/**
 * Returns the current index within the attribute value, or 0 when the
 * parser is not inside an attribute value.
 */
@Override
public int getValueIndex() {
  return (getState() == STATE_VALUE) ? valueIndex : 0;
}
/**
 * Reports whether the parser sits at the very start of a URL inside an
 * attribute value.
 *
 * @return {@code false} when not inside an attribute value, when the
 *     attribute is not of URI type, or when an {@code insertText()}
 *     directive was received at the start of the URL; for the
 *     {@code meta} tag, whether the content value parses as
 *     {@code URL_START}; otherwise whether the value index is 0
 */
@Override
public boolean isUrlStart() {
  // Not in a value, not a URI attribute, or text was already inserted at
  // the start of the URL: in all three cases this is not a URL start.
  final boolean ruledOut = getState() != STATE_VALUE
      || getAttributeType() != ATTR_TYPE.URI
      || textInsideUrlValue;
  if (ruledOut) {
    return false;
  }

  if (!"meta".equals(getTag())) {
    // For all other URI attributes, check if we are at index 0.
    return getValueIndex() == 0;
  }

  // At this point, we know we are in the "content" attribute
  // or we would not have the URI attribute type.
  final HtmlUtils.META_REDIRECT_TYPE redirect =
      HtmlUtils.parseContentAttributeForUrl(getValue());
  return redirect == HtmlUtils.META_REDIRECT_TYPE.URL_START;
}
/**
 * A specialized directive telling the parser that some content will be
 * inserted at this point which the parser will not get to see. It is a
 * hint for corner cases in attribute names and values, where unseen
 * content could alter the current state.
 *
 * <p>Two cases are handled:
 * <ul>
 * <li>At the start of a URL inside a URI attribute value, the parser
 *     remembers that text was inserted there.
 * <li>In the {@code VALUE} state, before any attribute value has been
 *     parsed, the next state is set to {@code VALUE_TEXT}.
 * </ul>
 *
 * @throws ParseException if an unrecoverable error occurred during parsing
 */
@Override
public void insertText() throws ParseException {
  // Case: Inside URL attribute value.
  final boolean atUrlStart = getState() == STATE_VALUE
      && getAttributeType() == ATTR_TYPE.URI
      && isUrlStart();
  if (atUrlStart) {
    textInsideUrlValue = true;
  }

  // Case: Before parsing any attribute value.
  final boolean beforeValue = (currentState == VALUE);
  if (beforeValue) {
    setNextState(VALUE_TEXT);
  }
}
/**
 * Dispatches to the state-specific "enter" handler for the state given.
 * Only the {@code TAG_CLOSE} handler may change the next state; every
 * other branch returns {@code expectedNextState} unchanged.
 *
 * @param currentState the state the handler is invoked for
 * @param expectedNextState the next state proposed by the caller
 * @param input the character read
 * @return the next state to go to
 */
@Override
protected InternalState handleEnterState(InternalState currentState,
    InternalState expectedNextState,
    char input) {
  if (currentState == TAG_NAME) {
    enterTagName();
    return expectedNextState;
  }
  if (currentState == ATTR) {
    enterAttribute();
    return expectedNextState;
  }
  if (currentState == TAG_CLOSE) {
    return tagClose(currentState);
  }
  if (currentState == CDATA_MAY_CLOSE) {
    enterStateCdataMayClose();
    return expectedNextState;
  }
  if (currentState == VALUE) {
    enterValue();
    return expectedNextState;
  }

  final boolean valueContent = currentState == VALUE_TEXT
      || currentState == VALUE_Q
      || currentState == VALUE_DQ;
  if (valueContent) {
    enterValueContent();
  }
  return expectedNextState;
}
/**
 * Dispatches to the state-specific "exit" handler for the state given.
 * Only the {@code CDATA_MAY_CLOSE} handler may change the next state;
 * every other branch returns {@code expectedNextState} unchanged.
 *
 * @param currentState the state the handler is invoked for
 * @param expectedNextState the next state proposed by the caller
 * @param input the character read
 * @return the next state to go to
 */
@Override
protected InternalState handleExitState(InternalState currentState,
    InternalState expectedNextState,
    char input) {
  if (currentState == TAG_NAME) {
    exitTagName();
    return expectedNextState;
  }
  if (currentState == ATTR) {
    exitAttribute();
    return expectedNextState;
  }
  if (currentState == CDATA_MAY_CLOSE) {
    return exitStateCdataMayClose(expectedNextState, input);
  }

  final boolean valueContent = (currentState == VALUE_TEXT)
      || (currentState == VALUE_Q)
      || (currentState == VALUE_DQ);
  if (valueContent) {
    exitValueContent();
  }
  return expectedNextState;
}
/**
 * Processes a character for the state given: CDATA-like states (and
 * {@code JS_FILE}) go through {@code inStateCdata}, attribute-value
 * content states go through {@code inStateValue}, and all other states
 * are left alone.
 *
 * @param currentState the state the handler is invoked for
 * @param input the character read
 * @return {@code currentState}, unchanged
 * @throws ParseException if an unrecoverable error occurred during parsing
 */
@Override
protected InternalState handleInState(InternalState currentState,
    char input) throws ParseException {
  final boolean cdataLike = currentState == CDATA_TEXT
      || currentState == CDATA_COM_START
      || currentState == CDATA_COM_START_DASH
      || currentState == CDATA_COM_BODY
      || currentState == CDATA_COM_DASH
      || currentState == CDATA_COM_DASH_DASH
      || currentState == CDATA_LT
      || currentState == CDATA_MAY_CLOSE
      || currentState == JS_FILE;
  if (cdataLike) {
    inStateCdata(input);
    return currentState;
  }

  final boolean valueContent = currentState == VALUE_TEXT
      || currentState == VALUE_Q
      || currentState == VALUE_DQ;
  if (valueContent) {
    inStateValue(input);
  }
  return currentState;
}
/**
 * Offers the character to every {@code CharacterRecorder}; each one
 * decides for itself whether to keep it. There is no check that exactly
 * one recorder is active: testing on the C++ parser did not establish
 * that such a guarantee holds.
 *
 * @param input the character to offer to the recorders
 */
@Override
protected void record(char input) {
  this.attr.maybeRecord(input);
  this.tag.maybeRecord(input);
  this.value.maybeRecord(input);
  this.cdataCloseTag.maybeRecord(input);
}
/**
 * Turns on recording of the HTML tag name. Called when the parser
 * enters a new tag.
 */
private void enterTagName() {
  this.tag.startRecording();
}
/**
 * Stops recording the tag name. When the recorded name begins with '/'
 * (a closing tag), the tag recorder is reset.
 */
private void exitTagName() {
  tag.stopRecording();
  final boolean closingTag = tag.getContent().startsWith("/");
  if (closingTag) {
    tag.reset();
  }
}
/**
 * Turns on recording of the HTML attribute name. Called when the parser
 * enters a new HTML attribute.
 */
private void enterAttribute() {
  this.attr.startRecording();
}
/**
 * Turns off recording of the HTML attribute name.
 */
private void exitAttribute() {
  this.attr.stopRecording();
}
/**
 * Restarts the bookkeeping for a new attribute value: the value index
 * and the URL-text flag are cleared, and the javascript flag is set
 * according to whether the attribute takes javascript. For javascript
 * attributes the entity resolver and the javascript parser are reset
 * as well.
 *
 * Called when the parser enters a new HTML attribute value.
 */
private void enterValue() {
  valueIndex = 0;
  textInsideUrlValue = false;

  final boolean javascriptAttribute =
      HtmlUtils.isAttributeJavascript(getAttribute());
  if (javascriptAttribute) {
    entityResolver.reset();
    jsParser.reset();
  }
  insideJavascript = javascriptAttribute;
}
/**
 * Starts recording the contents of the attribute value.
 *
 * Called when entering an attribute value.
 */
private void enterValueContent() {
  this.value.startRecording();
}
/**
 * Ends the recording of the attribute value and clears the javascript
 * flag, in case the value held javascript.
 */
private void exitValueContent() {
  this.value.stopRecording();
  this.insideJavascript = false;
}
/**
* Processes javascript after performing entity resolution and updates
* the position within the attribute value.
* If the status of the entity resolution is IN_PROGRESS
,
* we don't invoke the javascript parser.
*
*