/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.streamhtmlparser.util;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;

import java.util.Map;

/**
 * <p>Decodes (unescapes) HTML entities with the complication that these
 * are received one character at a time hence must be stored temporarily.
 * Also, we may receive some "junk" characters before the actual
 * entity which we will discard.
 *
 * <p>This class is designed to be 100% compatible with the corresponding
 * logic in the C-version of the
 * {@link com.google.security.streamhtmlparser.HtmlParser}, found
 * in <code>htmlparser.c</code>. There are however a few intentional
 * differences outlined below:
 * <ul>
 *   <li>We accept lower and upper-case hex NCRs, the C-version
 *       accepts only lower-case ones.
 *   <li>The output on some invalid inputs may be different. This is
 *       currently in the process of consolidation with Filipe.
 *   <li>The API is a bit different, I find this one better suited
 *       for Java. In particular, the C method <code>processChar</code>
 *       returns the output {@code String} whereas in Java, we return
 *       a status code and then provide the {@code String} in a separate
 *       method <code>getEntity</code>. It is cleaner as it avoids the
 *       need to return empty {@code String}s during incomplete processing.
 * </ul>
 *
 * <p>Valid HTML entities have one of the following three forms:
 * <ul>
 *   <li><code>&amp;#dd;</code> where dd is a number in decimal (base 10) form.
 *   <li><code>&amp;#x|Xyy;</code> where yy is a hex-number (base 16).
 *   <li><code>&amp;&lt;html-entity&gt;;</code> where
 *       <code>&lt;html-entity&gt;</code> is one of <code>lt</code>,
 *       <code>gt</code>, <code>amp</code>, <code>quot</code> or
 *       <code>apos</code>.
 * </ul>
 *
 * <p>A <code>reset</code> method is provided to facilitate object re-use.
 */
public class EntityResolver {

  /**
   * Returned in <code>processChar</code> method.
   * <p>
   * <ul>
   *   <li><code>NOT_STARTED</code> indicates we are still processing
   *       leading characters before the start of an entity.
   *       The caller may want to save the characters it provided us.
   *   <li><code>IN_PROGRESS</code> indicates we are currently processing
   *       characters part of an entity.
   *   <li><code>COMPLETED</code> indicates we have finished processing
   *       an entity. The caller can then invoke <code>getEntity</code>
   *       then re-set the object for future re-use.
   * </ul>
   */
  public enum Status {
    NOT_STARTED("Not Started"),
    IN_PROGRESS("In Progress"),
    COMPLETED("Completed");

    private final String message;

    Status(String message) {
      this.message = message;
    }

    /**
     * Returns a brief description of the {@code Status} for
     * debugging purposes. The format of the returned {@code String}
     * is not fully specified nor guaranteed to remain the same.
     */
    @Override
    public String toString() {
      return message;
    }
  }

  /**
   * How many characters to store as we are processing an entity. Once we
   * reach that size, we know the entity is definitely invalid. The size
   * is higher than needed but keeping it as-is for compatibility with
   * the C-version.
   */
  private static final int MAX_ENTITY_SIZE = 10;

  /**
   * Map containing the recognized HTML entities and their decoded values.
   * Keys include the leading '&amp;' because they are compared against the
   * raw buffer contents, which always start with it. The trailing ';' is
   * not included in the key but it is accounted for.
   */
  private static final Map<String, String> HTML_ENTITIES_MAP =
      new ImmutableMap.Builder<String, String>()
          .put("&lt", "<")
          .put("&gt", ">")
          .put("&amp", "&")
          .put("&quot", "\"")
          .put("&apos", "'")
          .build();

  /** Storage for received characters until an HTML entity is complete. */
  private final StringBuilder sb;

  /**
   * Indicates the state we are in. see {@link EntityResolver.Status}.
   */
  private Status status;

  /** Decoded entity; empty until processing reaches {@code COMPLETED}. */
  private String entity;

  /**
   * Constructs an entity resolver that is initially empty and
   * with status {@code NOT_STARTED}, see {@link EntityResolver.Status}.
   */
  public EntityResolver() {
    sb = new StringBuilder();
    status = Status.NOT_STARTED;
    entity = "";
  }

  /**
   * Constructs an entity resolver that is an exact copy of
   * the one provided. In particular it has the same contents
   * and status.
   *
   * @param aEntityResolver the entity resolver to copy
   */
  public EntityResolver(EntityResolver aEntityResolver) {
    // The buffer is mutable, so it gets its own copy; the other two fields
    // are immutable values and can be shared.
    sb = new StringBuilder(aEntityResolver.sb);
    entity = aEntityResolver.entity;
    status = aEntityResolver.status;
  }

  /**
   * Returns the object to its original state for re-use, deleting any
   * stored characters that may be present.
   */
  public void reset() {
    status = Status.NOT_STARTED;
    sb.setLength(0);
    entity = "";
  }

  /**
   * Returns the full state of the <code>EntityResolver</code>
   * in a human readable form. The format of the returned <code>String</code>
   * is not specified and is subject to change.
   *
   * @return full state of this object
   */
  @Override
  public String toString() {
    return String.format("Status: %s; Contents (%d): %s", status.toString(),
        sb.length(), sb.toString());
  }

  /**
   * Returns the decoded HTML Entity. Should only be called
   * after {@code processChar} returned status {@code COMPLETED}.
   *
   * @return the decoded HTML Entity or an empty {@code String} if
   *         we were called with any status other than {@code COMPLETED}
   */
  public String getEntity() {
    return entity;
  }

  /**
   * Processes a character from the input stream, advancing the decoding
   * of the HTML entity currently being received, if any.
   *
   * <p>Characters seen before a '&amp;' are ignored. Once a '&amp;' has been
   * seen, characters are buffered until a ';' or an HTML space terminates
   * the entity, or until the buffer is full, at which point the entity is
   * considered invalid and the raw input is made available instead.
   * Characters provided after completion are ignored until {@code reset}
   * is called.
   *
   * @param input the {@code char} to process
   * @return the status after processing {@code input}; when it is
   *         {@code COMPLETED} the result is available from
   *         {@code getEntity}
   * @throws IllegalStateException if the object is in an inconsistent state
   *         (characters stored while the status is {@code NOT_STARTED})
   */
  public Status processChar(char input) {
    // Developer error if the precondition fails.
    Preconditions.checkState(status != Status.NOT_STARTED || sb.length() == 0);
    if (status == Status.NOT_STARTED) {
      if (input == '&') {
        sb.append(input);
        status = Status.IN_PROGRESS;
      }
    } else if (status == Status.IN_PROGRESS) {
      if ((input == ';') || (HtmlUtils.isHtmlSpace(input))) {
        status = Status.COMPLETED;
        entity = convertEntity(input);
      } else if (sb.length() < MAX_ENTITY_SIZE) {
        sb.append(input);
      } else {
        // Too long to be a valid entity: give up and hand back the raw input.
        status = Status.COMPLETED;
        entity = unconvertedInput(input);
      }
    }
    // Status.COMPLETED: ignore the character, do nothing.
    return status;
  }

  /**
   * Performs the decoding of a complete HTML entity held in the buffer.
   * See <a href="http://www.w3.org/TR/REC-html40/charset.html#h-5.3.1">
   * Numeric Character References</a>.
   *
   * @param terminator the last character read, unused on successful
   *        conversions since it is the end delimiter of the entity
   * @return The decoded entity or the original input if we could not decode it.
   */
  private String convertEntity(char terminator) {
    // Developer error if the buffer was empty or does not start with '&'.
    Preconditions.checkArgument(sb.length() > 0);
    Preconditions.checkArgument(sb.charAt(0) == '&');

    if (sb.length() > 1) {
      if (sb.charAt(1) == '#') {
        if (sb.length() <= 2) {  // Error => return content as-is.
          return unconvertedInput(terminator);
        }
        try {
          char radixMarker = sb.charAt(2);
          int codePoint;
          if ((radixMarker == 'x') || (radixMarker == 'X')) {  // Hex NCR
            codePoint = Integer.parseInt(sb.substring(3), 16);
          } else {  // Decimal NCR
            codePoint = Integer.parseInt(sb.substring(2));
          }
          // Character.toChars throws IllegalArgumentException on values
          // outside the Unicode range (negative or above U+10FFFF); such
          // references come straight from the input stream, so treat them
          // as undecodable rather than letting the exception escape.
          if (!Character.isValidCodePoint(codePoint)) {
            return unconvertedInput(terminator);
          }
          return new String(Character.toChars(codePoint));
        } catch (NumberFormatException e) {
          // Not a number in the expected radix => return content as-is.
          return unconvertedInput(terminator);
        }
      }

      // See if it matches any of the few recognized entities.
      String decoded = HTML_ENTITIES_MAP.get(sb.toString());
      if (decoded != null) {
        return decoded;
      }
    }
    // Covers the case of a lonely '&' given or valid/invalid unknown entities.
    return unconvertedInput(terminator);
  }

  /**
   * Returns the raw characters received so far followed by the given
   * terminator, for use when the buffered input cannot be decoded.
   *
   * @param terminator the last character read, appended to the output
   * @return the buffer contents followed by {@code terminator}
   */
  private String unconvertedInput(char terminator) {
    return sb.toString() + terminator;
  }
}