/* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.streamhtmlparser; import com.google.streamhtmlparser.impl.HtmlParserImpl; import java.util.Set; import java.util.logging.Logger; /** * A factory class to obtain instances of an {@link HtmlParser}. * Currently each instance is a new object given these are fairly * light-weight. * *

In the unlikely case that this class fails to initialize properly * (a developer error), an error is emitted to the error console and the logs * and the specialized parser creation methods will throw * an {@link AssertionError} on all invokations. */ public class HtmlParserFactory { private static final Logger logger = Logger.getLogger(HtmlParserFactory.class.getName()); /** * To provide additional options when creating an {@code HtmlParser} using * {@link HtmlParserFactory#createParserInAttribute(HtmlParser.ATTR_TYPE, * boolean, Set)} */ public enum AttributeOptions { /** * Indicates that the attribute value is Javascript-quoted. Only takes * effect for Javascript-accepting attributes - as identified by * {@link HtmlParser.ATTR_TYPE#JS} - and only when the attribute is also * HTML quoted. */ JS_QUOTED, /** * Indicates the attribute value is only a part of a URL as opposed to a * full URL. In particular, the value is not at the start of a URL and * hence does not necessitate validation of the URL scheme. * Only valid for URI-accepting attributes - as identified by * {@link HtmlParser.ATTR_TYPE#URI}. */ URL_PARTIAL, } /** * To provide additional options when creating an {@code HtmlParser} using * {@link HtmlParserFactory#createParserInMode(HtmlParser.Mode, Set)} */ public enum ModeOptions { /** * Indicates that the parser is inside a quoted {@code String}. Only * valid in the {@link HtmlParser.Mode#JS} mode. */ JS_QUOTED } private static final HtmlParser parserInDefaultAttr = createParser(); private static final HtmlParser parserInDefaultAttrQ = createParser(); private static final HtmlParser parserInUriAttrComplete = createParser(); private static final HtmlParser parserInUriAttrQComplete = createParser(); private static final HtmlParser parserInUriAttrPartial = createParser(); private static final HtmlParser parserInUriAttrQPartial = createParser(); private static final HtmlParser parserInJsAttr = createParser(); private static final HtmlParser parserInJsAttrQ = createParser(); private static final HtmlParser parserInQJsAttr = createParser(); private static final HtmlParser parserInStyleAttr = createParser(); private static final HtmlParser parserInStyleAttrQ = createParser(); private static final HtmlParser parserInJsQ = createParser(); /** * Protects all the createParserXXX methods by throwing a run-time exception * if this class failed to initialize properly. */ private static boolean initSuccess = false; static { try { initializeParsers(); initSuccess = true; } catch (ParseException e) { // Log a severe error and print it to stderr along with a stack trace. String error = HtmlParserFactory.class.getName() + " Failed initialization: " + e.getMessage(); logger.severe(error); System.err.println(error); e.printStackTrace(); } } // Static class. private HtmlParserFactory() { } // COV_NF_LINE /** * Returns an {@code HtmlParser} object ready to parse HTML input. * * @return an {@code HtmlParser} in the provided mode */ public static HtmlParser createParser() { return new HtmlParserImpl(); } /** * Returns an {@code HtmlParser} object initialized with the * requested Mode. Provide non {@code null} options to provide * a more precise initialization with the desired Mode. * * @param mode the mode to reset the parser with * @param options additional options or {@code null} for none * @return an {@code HtmlParser} in the provided mode * @throws AssertionError when this class failed to initialize */ public static HtmlParser createParserInMode(HtmlParser.Mode mode, Set options) { requireInitialized(); if (options != null && options.contains(ModeOptions.JS_QUOTED)) return createParser(parserInJsQ); // With no options given, this method is just a convenience wrapper for // the two calls below. HtmlParser parser = new HtmlParserImpl(); parser.resetMode(mode); return parser; } /** * Returns an {@code HtmlParser} that is a copy of the one * supplied. It holds the same internal state and hence can * proceed with parsing in-lieu of the supplied parser. * * @param aHtmlParser a {@code HtmlParser} to copy from * @return an {@code HtmlParser} that is a copy of the provided one * @throws AssertionError when this class failed to initialize */ public static HtmlParser createParser(HtmlParser aHtmlParser) { requireInitialized(); // Should never get a ClassCastException since there is only one // implementation of the HtmlParser interface. return new HtmlParserImpl((HtmlParserImpl) aHtmlParser); } /** * A very specialized {@code HtmlParser} accessor that returns a parser * in a state where it expects to read the value of an attribute * of an HTML tag. This is only useful when the parser has not seen a * certain HTML tag and an attribute name and needs to continue parsing * from a state as though it has. * *

For example, to create a parser in a state akin to that * after the parser has parsed "<a href=\"", invoke: *

   *   createParserInAttribute(HtmlParser.ATTR_TYPE.URI, true)}
   * 
* *

You must provide the proper value of quoting or the parser * will go into an unexpected state. * As a special-case, when called with the {@code HtmlParser.ATTR_TYPE} * of {@code HtmlParser.ATTR_TYPE.NONE}, the parser is created in a state * inside an HTML tag where it expects an attribute name not an attribute * value. It becomes equivalent to a parser initialized in the * {@code HTML_IN_TAG} mode. * * @param attrtype the attribute type which the parser should be in * @param quoted whether the attribute value is enclosed in double quotes * @param options additional options or {@code null} for none * @return an {@code HtmlParser} initialized in the given attribute type * and quoting * @throws AssertionError when this class failed to initialize */ public static HtmlParser createParserInAttribute( HtmlParser.ATTR_TYPE attrtype, boolean quoted, Set options) { requireInitialized(); HtmlParser parser; switch (attrtype) { case REGULAR: parser = createParser( quoted ? parserInDefaultAttrQ : parserInDefaultAttr); break; case URI: if (options != null && options.contains(AttributeOptions.URL_PARTIAL)) parser = createParser( quoted ? parserInUriAttrQPartial : parserInUriAttrPartial); else parser = createParser( quoted ? parserInUriAttrQComplete : parserInUriAttrComplete); break; case JS: // Note: We currently do not support the case of the value being // inside a Javascript quoted string that is in an unquoted HTML // attribute, such as . // It would be simple to add but currently we assume Javascript // quoted attribute values are always HTML quoted. if (quoted) { if (options != null && options.contains(AttributeOptions.JS_QUOTED)) parser = createParser(parserInQJsAttr); else parser = createParser(parserInJsAttrQ); } else { parser = createParser(parserInJsAttr); } break; case STYLE: parser = createParser( quoted ? parserInStyleAttrQ : parserInStyleAttr); break; case NONE: parser = createParserInMode(HtmlParser.Mode.HTML_IN_TAG, null); break; default: throw new IllegalArgumentException( "Did not recognize ATTR_TYPE given: " + attrtype); } return parser; } /** * Initializes a set of static parsers to be subsequently used * by the various createParserXXX methods. * The parsers are set to their proper states by making them parse * an appropriate HTML input fragment. This approach is the most likely * to ensure all their internal state is consistent. * *

In the very unexpected case of the parsing failing (developer error), * this class will fail to initialize properly. * *

In addition: *

* * @throws ParseException if parsing failed. */ private static void initializeParsers() throws ParseException { parserInDefaultAttr.parse(") // and a fake query parameter. final String fakeUrlPrefix = "http://example.com/fakequeryparam="; parserInUriAttrPartial.parse("