/* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.streamhtmlparser.impl; import com.google.common.base.Preconditions; /** * Holds a state table which is defined as the set of all * recognized state transitions and the set of characters that * trigger them. * *
The logic of what character causes what state transition is derived from * a base definition written as a Python configuration file in the original * C++ parser. * *
This class provides methods to initially build the state table and then * methods at parsing time to determine the transitions to subsequent states. * *
Note on characters outside the extended ASCII range: Currently, all state * transitions in the Python configuration file trigger only on extended * ASCII characters, that is characters in the Unicode space of [U+0000 to * U+00FF]. We use that property to design a more efficient state transition * representation. When receiving characters outside that ASCII range, we * simply apply the DEFAULT transition for the given state - as we do for any * character that is not a hot character for that state. If no default * transition exists, we switch to the Internal Error state. * *
Technical note: In Java, a {@code char} is a code unit and in some cases * may not represent a complete Unicode code point. However, when that happens, * the code units that follow for that code point are all in the surrogate area * [U+D800 - U+DFFF] and hence outside of the ASCII range and will not trigger * any incorrect state transitions. * *
This class is storage-inefficient but it is static at least * and not generated for each Parser instance. */ class ParserStateTable { /** * A limit on how many different states we can have in one state table. * Can be increased should it no longer be sufficient. */ private static final int MAX_STATES = 256; /** * We only check transitions for (extended) ASCII characters, hence * characters in the range 0 to MAX_CHARS -1. */ private static final int MAX_CHARS = 256; /** * Records all state transitions except those identified as DEFAULT * transitions. It is two dimensional: Stores a target {@code InternalState} * given a source state (referenced by its numeric ID) and the current * character. */ private final InternalState[][] stateTable; /** * Records all DEFAULT state transitions. These are transitions provided * using the {@code "[:default:]"} syntax in the Python configuration file. * There can be only one such transition for any given source state, hence * the array is one dimensional. */ private final InternalState[] defaultStateTable; public ParserStateTable() { stateTable = new InternalState[MAX_STATES][MAX_CHARS]; defaultStateTable = new InternalState[MAX_STATES]; } /** * Returns the state to go to when receiving the current {@code char} * in the {@code from} state. * Returns {@code InternalState.INTERNAL_ERROR_STATE} if there is no * state transition for that character and no default state transition * for that state. * *
For ASCII characters, first look-up an explicit state transition for * the current character. If none is found, look-up a default transition. For * non-ASCII characters, look-up a default transition only. * * @param from the source state * @param currentChar the character received * @return the state to move to or {@code InternalState.INTERNAL_ERROR_STATE} */ InternalState getNextState(InternalState from, int currentChar) { // TODO: Consider throwing run-time error here. if (from == null || currentChar < 0) return InternalState.INTERNAL_ERROR_STATE; int id = from.getId(); if (id < 0 || id >= MAX_STATES) { return InternalState.INTERNAL_ERROR_STATE; } InternalState result = null; if (currentChar < MAX_CHARS) { result = stateTable[id][currentChar]; } if (result == null) { result = defaultStateTable[from.getId()]; } return result != null ? result : InternalState.INTERNAL_ERROR_STATE; } void setExpression(String expr, InternalState from, InternalState to) { if ((expr == null) || (from == null) || (to == null)) { return; } // This special string indicates a default state transition. if ("[:default:]".equals(expr)) { setDefaultDestination(from, to); return; } int i = 0; while (i < expr.length()) { // If next char is a '-' which is not the last character of the expr if (i < (expr.length() - 2) && expr.charAt(i + 1) == '-') { setRange(from, expr.charAt(i), expr.charAt(i + 2), to); i += 2; } else { setDestination(from, expr.charAt(i), to); i++; } } } private void fill(InternalState from, InternalState to) { char c; for (c = 0; c < MAX_CHARS; c++) { setDestination(from, c, to); } } private void setDefaultDestination(InternalState from, InternalState to) { Preconditions.checkNotNull(from); // Developer error if it triggers Preconditions.checkNotNull(to); // Developer error if it triggers int id = from.getId(); if ((id < 0) || (id >= MAX_STATES)) { return; } // TODO: Consider asserting if there was a state transition defined. defaultStateTable[from.getId()] = to; } private void setDestination(InternalState from, char chr, InternalState to) { Preconditions.checkNotNull(from); // Developer error if it triggers Preconditions.checkNotNull(to); // Developer error if it triggers Preconditions.checkArgument(chr >= 0 && chr < MAX_CHARS, "char must be in ASCII set: %c", chr); int id = from.getId(); if ((id < 0) || (id >= MAX_STATES)) { return; } stateTable[from.getId()][chr] = to; } private void setRange(InternalState from, char start, char end, InternalState to) { // Developer error if either trigger. Preconditions.checkArgument(start >= 0 && start < MAX_CHARS, "char must be in ASCII set: %c", start); Preconditions.checkArgument(end >= 0 && end < MAX_CHARS, "char must be in ASCII set: %c", end); char c; for (c = start; c <= end; c++) { setDestination(from, c, to); } } }