/* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.streamhtmlparser.util; import com.google.common.collect.ImmutableSortedSet; import java.util.Set; import java.util.regex.Pattern; import java.util.regex.Matcher; /** * Utility functions for HTML and Javascript that are most likely * not interesting to users outside this package. * *
The HtmlParser
will be open-sourced hence we took the
* decision to keep these utilities in this package as well as not to
* leverage others that may exist in the google3
code base.
*
*
The functionality exposed is designed to be 100% compatible with * the corresponding logic in the C-version of the HtmlParser as such * we are particularly concerned with cross-language compatibility. * *
Note: The words {@code Javascript} and {@code ECMAScript} are used * interchangeably unless otherwise noted. */ public final class HtmlUtils { /** * static utility class */ private HtmlUtils() { } // COV_NF_LINE /** * Indicates the type of content contained in the {@code content} HTML * attribute of the {@code meta} HTML tag. Used by * {@link HtmlUtils#parseContentAttributeForUrl(String)}. *
The values are: *
The token {@code void} was added to the list. Several keywords are
* defined in Ecmascript 4 not Ecmascript 3. However, to keep the logic
* simple we do not differentiate on the version and bundle them all together.
*/
private static final Set Currently returns {@code true} for any attribute name that starts
* with "on" which is not exactly correct but we trust a developer to
* not use non-spec compliant attribute names (e.g. onbogus).
*
* @param attribute the name of an HTML attribute
* @return {@code true} if the attribute name starts with "on" and is
* therefore assumed to expect javascript code; {@code false} if the input
* is null or does not start with "on"
*/
public static boolean isAttributeJavascript(String attribute) {
  // A missing attribute name can never denote an event handler.
  if (attribute == null) {
    return false;
  }
  // Any name carrying the "on" prefix is treated as expecting javascript;
  // the prefix comparison is case-sensitive.
  return attribute.startsWith("on");
}
/**
 * Tells whether the given HTML attribute takes a {@code style} as its
 * value. Only the attribute literally named {@code style} qualifies; the
 * comparison is exact and case-sensitive.
 *
 * @param attribute the name of an HTML attribute, may be {@code null}
 * @return {@code true} iff {@code attribute} equals {@code "style"};
 *         {@code false} otherwise, including for {@code null}
 */
public static boolean isAttributeStyle(String attribute) {
  if (attribute == null) {
    return false;
  }
  return attribute.equals("style");
}
/**
 * Determines if the HTML attribute specified expects a {@code URI}
 * for its value. For example, both {@code href} and {@code src}
 * expect a {@code URI} but {@code style} does not. Returns
 * {@code false} if the attribute given was {@code null}.
 *
 * <p>The {@code null} case is handled explicitly rather than delegated to
 * the set lookup, because {@link java.util.Set#contains(Object)} is allowed
 * to throw {@code NullPointerException} on sets that reject null elements.
 *
 * @param attribute the name of an HTML attribute, may be {@code null}
 * @return {@code true} if the attribute name is a member of
 *         {@code ATTRIBUTE_EXPECTS_URI}; otherwise {@code false}
 *
 * @see #ATTRIBUTE_EXPECTS_URI
 */
public static boolean isAttributeUri(String attribute) {
  // Guard first so the documented null contract does not depend on how the
  // backing set treats null lookups.
  if (attribute == null) {
    return false;
  }
  return ATTRIBUTE_EXPECTS_URI.contains(attribute);
}
/**
* Determines if the specified character is an HTML whitespace character.
* A character is an HTML whitespace character if and only if it is one
* of the characters below.
* Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3, in
* particular, this list is quite different from that in
* This function expects to receive the value of the {@code content} HTML
* attribute. This attribute takes on different meanings depending on the
* value of the {@code http-equiv} HTML attribute of the same {@code meta}
* tag. Since we may not have access to the {@code http-equiv} attribute,
* we instead rely on parsing the given value to determine if it contains
* a URL.
*
* The specification of the {@code meta} HTML tag can be found in:
* http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh
*
* We return {@link HtmlUtils.META_REDIRECT_TYPE} indicating whether the
* value contains a URL and whether we are at the start of the URL or past
* the start. We are at the start of the URL if and only if one of the two
* conditions below is true:
* Examples:
*
*
*
* <ul>
* <li>A {@code Space} character
* <li>A {@code Tab} character
* <li>A {@code Line feed} character
* <li>A {@code Carriage Return} character
* <li>A {@code Zero-Width Space} character
* </ul>
*
* Note: The list includes the zero-width space ({@code &#x200B;})
* which is not included in the C version.
*
* @param chr the {@code char} to check
* @return {@code true} if the character is an HTML whitespace character
*
* White space
*/
public static boolean isHtmlSpace(char chr) {
  // Set.contains takes an Object, so the primitive is boxed to a Character
  // before the membership test; the boxing is simply made explicit here.
  final Character candidate = Character.valueOf(chr);
  return HTML_WHITESPACE.contains(candidate);
}
/**
 * Checks whether the given character is an ECMAScript whitespace or line
 * terminator character, by looking it up in {@code JAVASCRIPT_WHITESPACE}.
 * A character qualifies if and only if it is one of the characters below:
 * <ul>
 * <li>White-space characters ({@code Tab}, {@code Vertical Tab},
 *     {@code Form Feed}, {@code Space}, {@code No-break space})
 * <li>Line terminating characters ({@code Line Feed},
 *     {@code Carriage Return}, {@code Line separator},
 *     {@code Paragraph Separator})
 * </ul>
 *
 * <p>Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3; in
 * particular, this list is quite different from that in
 * {@code Character.isWhitespace}. See the ECMAScript Language Specification.
 *
 * @param chr the {@code char} to check
 * @return {@code true} if {@code chr} is a member of
 *         {@code JAVASCRIPT_WHITESPACE}; otherwise {@code false}
 */
public static boolean isJavascriptWhitespace(char chr) {
  // Set.contains takes an Object, so the primitive is boxed to a Character
  // before the membership test; the boxing is simply made explicit here.
  final Character candidate = Character.valueOf(chr);
  return JAVASCRIPT_WHITESPACE.contains(candidate);
}
/**
 * Checks whether the given character may appear in an ECMAScript
 * identifier. The check is deliberately approximate, in particular:
 * <ul>
 * <li>Only ASCII letters are accepted; Unicode letters are rejected.
 * <li>Digits are accepted unconditionally, so no distinction is made
 *     between the first character of an identifier and later ones.
 * </ul>
 *
 * <p>We are considering leveraging {@code Character.isJavaIdentifierStart}
 * and {@code Character.isJavaIdentifierPart} given that Java and Javascript
 * follow similar identifier naming rules, but doing so would lose
 * compatibility with the C-version.
 *
 * @param chr {@code char} to check
 * @return {@code true} if {@code chr} is an ASCII letter, an ASCII digit,
 *         an underscore or a dollar sign; otherwise {@code false}
 */
public static boolean isJavascriptIdentifier(char chr) {
  // The two punctuation characters permitted in identifiers.
  if (chr == '_' || chr == '$') {
    return true;
  }
  final boolean lowercase = chr >= 'a' && chr <= 'z';
  final boolean uppercase = chr >= 'A' && chr <= 'Z';
  final boolean digit = chr >= '0' && chr <= '9';
  return lowercase || uppercase || digit;
}
/**
 * Tells whether the given token may immediately precede a regular
 * expression literal in the javascript grammar. The token is looked up in
 * {@code REGEXP_TOKEN_PREFIXS}, the {@code Set} of identifiers that can
 * come before a regular expression.
 *
 * @param input the {@code String} token to check
 * @return {@code true} iff {@code input} is a member of
 *         {@code REGEXP_TOKEN_PREFIXS}
 */
public static boolean isJavascriptRegexpPrefix(String input) {
  final boolean precedesRegexp = REGEXP_TOKEN_PREFIXS.contains(input);
  return precedesRegexp;
}
/**
 * Encodes the specified character using Ascii for convenient insertion into
 * a single-quote enclosed {@code String}.
 * <ul>
 * <li>Single quote and back-slash are backslash-escaped.
 * <li>All other printable Ascii characters (codes 32 through 126) are
 *     returned as-is.
 * <li>Carriage Return, Line Feed and Horizontal Tab are returned as their
 *     two-character backslash escapes.
 * <li>Every other character is returned as a backslash, the letter
 *     {@code u} and the character code as four zero-padded lowercase
 *     hexadecimal digits.
 * </ul>
 *
 * @param chr {@code char} to encode
 * @return an Ascii-friendly encoding of the given {@code char}
 */
public static String encodeCharForAscii(char chr) {
  // Characters that have a dedicated backslash escape.
  switch (chr) {
    case '\'':
      return "\\'";
    case '\\':
      return "\\\\";
    case '\n':
      return "\\n";
    case '\r':
      return "\\r";
    case '\t':
      return "\\t";
    default:
      break;
  }
  if (chr >= 32 && chr <= 126) {
    // Printable Ascii passes through untouched; no Formatter round-trip is
    // needed to turn a char into a String.
    return String.valueOf(chr);
  }
  // Zero-padded hex-encoding with a minimum width of four digits. The cast
  // to int is required because the x conversion does not accept a char.
  return String.format("\\u%04x", (int) chr);
}
/**
 * Parses the given {@code String} to determine if it contains a URL in the
 * format followed by the {@code content} attribute of the {@code meta}
 * HTML tag.
 *
 * <p>The value is searched with {@code META_REDIRECT_PATTERN} and the
 * outcome is reported as follows:
 * <ul>
 * <li>{@code NONE} when {@code value} is {@code null} or the pattern is
 *     not found anywhere in it.
 * <li>{@code URL_START} when the pattern is found and the match ends
 *     exactly at the end of {@code value}, i.e. nothing follows it yet.
 * <li>{@code URL} when the pattern is found and further characters follow
 *     the end of the match.
 * </ul>
 *
 * <p>NOTE(review): presumably the pattern matches the refresh prefix up to
 * and including {@code URL=}, so that {@code "5; URL="} yields
 * {@code URL_START} and {@code "5; URL=http://www.google.com"} yields
 * {@code URL}; confirm against the pattern definition.
 *
 * @param value {@code String} to parse, may be {@code null}
 * @return {@link HtmlUtils.META_REDIRECT_TYPE} indicating the presence
 *         of a URL in the given value
 */
public static META_REDIRECT_TYPE parseContentAttributeForUrl(String value) {
  if (value == null) {
    return META_REDIRECT_TYPE.NONE;
  }

  final Matcher redirect = META_REDIRECT_PATTERN.matcher(value);
  if (!redirect.find()) {
    return META_REDIRECT_TYPE.NONE;
  }

  // Anything after the end of the match means we are already past the
  // start of the URL.
  final boolean hasTrailingContent = redirect.end() < value.length();
  return hasTrailingContent
      ? META_REDIRECT_TYPE.URL
      : META_REDIRECT_TYPE.URL_START;
}
}
* <meta http-equiv="refresh" content="5; URL=http://www.google.com">
*
*
* <meta http-equiv="refresh" content="5; URL=">
*
*
* <meta http-equiv="content-type" content="text/html">
*
*