/* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.streamhtmlparser.util; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableMap; import java.util.Map; /** *
Decodes (unescapes) HTML entities with the complication that these * are received one character at a time hence must be stored temporarily. * Also, we may receive some "junk" characters before the actual * entity which we will discard. * *
This class is designed to be 100% compatible with the corresponding
* logic in the C-version of the
* {@link com.google.security.streamhtmlparser.HtmlParser}, found
* in htmlparser.c
. There are however a few intentional
* differences outlined below:
*
In the C version, {@code processChar}
* returns the output {@code String} whereas in Java, we return
* a status code and then provide the {@code String} in a separate
* method getEntity
. It is cleaner as it avoids the
* need to return empty {@code String}s during incomplete processing.
* Valid HTML entities have one of the following three forms: *
{@code &#dd;}
where dd is a number in decimal (base 10) form.
* {@code &#x|Xyy;}
where yy is a hex-number (base 16).
* &<html-entity>;
where
* <html-entity>
is one of lt
,
* gt
, amp
, quot
or
* apos
.
* A reset
method is provided to facilitate object re-use.
*/
public class EntityResolver {
/**
* Returned in processChar
method.
*
*
NOT_STARTED
indicates we are still processing
* trailing characters before the start of an entity.
* The caller may want to save the characters it provided us.
* IN_PROGRESS
indicates we are currently processing
* characters part of an entity.
* COMPLETED
indicates we have finished processing
* an entity. The caller can then invoke getEntity
* then re-set the object for future re-use.
*
* Returns the full state of the {@code EntityResolver}
* in a human readable form. The format of the returned String
* is not specified and is subject to change.
*
* @return full state of this object
*/
@Override
public String toString() {
  // Snapshot each piece first so the format call itself stays short.
  // Evaluation order (status, then buffer length, then buffer text)
  // matches the order of the format arguments.
  final String statusText = status.toString();
  final int bufferedCount = sb.length();
  final String bufferedText = sb.toString();
  return String.format(
      "Status: %s; Contents (%d): %s", statusText, bufferedCount, bufferedText);
}
/**
 * Returns the decoded HTML entity held by this resolver.
 *
 * <p>{@code processChar} assigns the value at the moment it switches to
 * {@code COMPLETED}, so this accessor is meant to be called only after
 * that status has been returned. The method itself performs no status
 * check; it hands back whatever the field currently holds.
 *
 * <p>NOTE(review): the value seen before completion depends on how the
 * field is initialised and reset, which is not visible here. It is
 * presumably an empty {@code String}; confirm against the constructor.
 *
 * @return the current contents of the entity field
 */
public String getEntity() {
  return this.entity;
}
/**
 * Feeds one character of the input stream into the resolver and advances
 * its state machine.
 *
 * <ul>
 *   <li>{@code NOT_STARTED}: every character is ignored until an
 *       {@code '&'} arrives, which is buffered and moves the resolver to
 *       {@code IN_PROGRESS}.
 *   <li>{@code IN_PROGRESS}: a {@code ';'} or a character accepted by
 *       {@code HtmlUtils.isHtmlSpace} terminates the entity, which is then
 *       decoded. Any other character is buffered, unless the buffer has
 *       already reached {@code MAX_ENTITY_SIZE}; in that case processing
 *       stops and the raw buffered text plus this character becomes the
 *       result.
 *   <li>{@code COMPLETED}: the character is ignored.
 * </ul>
 *
 * <p>The decoded (or raw) text is not returned here; it is stored in the
 * entity field once the status becomes {@code COMPLETED}.
 *
 * @param input the {@code char} to process
 * @return the resolver's status after this character has been handled
 * @throws IllegalStateException if the resolver is {@code NOT_STARTED}
 *     but its buffer is not empty
 */
public Status processChar(char input) {
  // A non-empty buffer while NOT_STARTED is a developer error.
  Preconditions.checkState(status != Status.NOT_STARTED || sb.length() == 0);

  if (status == Status.NOT_STARTED) {
    // Anything other than '&' precedes the entity and is dropped.
    if (input == '&') {
      sb.append(input);
      status = Status.IN_PROGRESS;
    }
    return status;
  }

  if (status != Status.IN_PROGRESS) {
    // Already finished: ignore the character.
    return status;
  }

  final boolean endOfEntity = (input == ';') || HtmlUtils.isHtmlSpace(input);
  if (endOfEntity) {
    status = Status.COMPLETED;
    entity = convertEntity(input);
  } else if (sb.length() >= MAX_ENTITY_SIZE) {
    // Too long to be an entity: give up and return the raw text.
    status = Status.COMPLETED;
    entity = uncovertedInput(input);
  } else {
    sb.append(input);
  }
  return status;
}
/**
 * Decodes the complete HTML entity currently held in the buffer and
 * returns the result. The buffer itself is left untouched.
 *
 * <p>Three shapes are recognised, all starting with {@code '&'}:
 * <ul>
 *   <li>{@code &#x..} / {@code &#X..}: hexadecimal numeric character
 *       reference.
 *   <li>{@code &#..}: decimal numeric character reference.
 *   <li>a key present in {@code HTML_ENTITIES_MAP}.
 * </ul>
 *
 * <p>Anything else is returned unconverted, i.e. the buffered text
 * followed by {@code terminator}. That includes a lonely {@code '&'},
 * an unknown name, digits that do not parse, and a number that is not a
 * valid Unicode code point.
 *
 * @param terminator the last character read, unused on successful
 *     conversions since it is the end delimiter of the entity
 * @return the decoded entity, or the original input if it could not be
 *     decoded
 * @throws IllegalArgumentException if the buffer is empty or does not
 *     start with {@code '&'} (developer error)
 */
private String convertEntity(char terminator) {
  // Developer error if the buffer was empty or does not start with '&'.
  Preconditions.checkArgument(sb.length() > 0);
  Preconditions.checkArgument(sb.charAt(0) == '&');

  if (sb.length() > 1) {
    if (sb.charAt(1) == '#') {
      if (sb.length() <= 2) {  // Just "&#": nothing to decode.
        return uncovertedInput(terminator);
      }
      try {
        final char radixMarker = sb.charAt(2);
        if ((radixMarker == 'x') || (radixMarker == 'X')) {  // Hex NCR
          return new String(Character.toChars(
              Integer.parseInt(sb.substring(3), 16)));
        } else {  // Decimal NCR
          return new String(Character.toChars(
              Integer.parseInt(sb.substring(2))));
        }
      } catch (IllegalArgumentException e) {
        // Covers both failure modes on malformed input:
        //  - NumberFormatException (a subclass) when the digits do not parse;
        //  - IllegalArgumentException from Character.toChars when the number
        //    is negative or beyond the Unicode range.
        // Either way the input is passed through as-is rather than letting
        // the exception reach the caller.
        return uncovertedInput(terminator);
      }
    }

    // See if it matches any of the few recognized entities.
    String key = sb.toString();
    if (HTML_ENTITIES_MAP.containsKey(key)) {
      return HTML_ENTITIES_MAP.get(key);
    }
  }
  // Covers the case of a lonely '&' given or valid/invalid unknown entities.
  return uncovertedInput(terminator);
}
/**
 * Builds the pass-through result used when the buffered text cannot be
 * decoded: the buffer contents followed by the character that ended
 * processing.
 *
 * @param terminator the last character read
 * @return the buffered text with {@code terminator} appended
 */
private String uncovertedInput(char terminator) {
  final String buffered = sb.toString();
  return buffered + terminator;
}
}