• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2010 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.clearsilver.jsilver.functions.html;
18 
import com.google.clearsilver.jsilver.functions.TextFilter;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
25 
26 /**
27  * This class implements the html_strip function. It removes html tags from text, and expands
28  * numbered and named html entities to their corresponding special characters.
29  */
30 public class HtmlStripFunction implements TextFilter {
31 
32   // The maximum length of an entity (preceded by an &)
33   private static final int MAX_AMP_LENGTH = 9;
34 
35   // The state the strip function can be, normal, in an amp escaped entity or
36   // inside a html tag.
37   private enum State {
38     DEFAULT, IN_AMP, IN_TAG
39   }
40 
41   // Map of entity names to special characters.
42   private static final Map<String, String> entityValues;
43 
44   // Initialize the entityName lookup map.
45   static {
46     Map<String, String> tempMap = new HashMap<String, String>();
47 
48     // Html specific characters.
49     tempMap.put("amp", "&");
50     tempMap.put("quot", "\"");
51     tempMap.put("gt", ">");
52     tempMap.put("lt", "<");
53 
54     tempMap.put("agrave", "\u00e0");
55     tempMap.put("aacute", "\u00e1");
56     tempMap.put("acirc", "\u00e2");
57     tempMap.put("atilde", "\u00e3");
58     tempMap.put("auml", "\u00e4");
59     tempMap.put("aring", "\u00e5");
60     tempMap.put("aelig", "\u00e6");
61     tempMap.put("ccedil", "\u00e7");
62     tempMap.put("egrave", "\u00e8");
63     tempMap.put("eacute", "\u00e9");
64     tempMap.put("ecirc", "\u00ea");
65     tempMap.put("euml", "\u00eb");
66     tempMap.put("eth", "\u00f0");
67     tempMap.put("igrave", "\u00ec");
68     tempMap.put("iacute", "\u00ed");
69     tempMap.put("icirc", "\u00ee");
70     tempMap.put("iuml", "\u00ef");
71     tempMap.put("ntilde", "\u00f1");
72     tempMap.put("nbsp", " ");
73     tempMap.put("ograve", "\u00f2");
74     tempMap.put("oacute", "\u00f3");
75     tempMap.put("ocirc", "\u00f4");
76     tempMap.put("otilde", "\u00f5");
77     tempMap.put("ouml", "\u00f6");
78     tempMap.put("oslash", "\u00f8");
79     tempMap.put("szlig", "\u00df");
80     tempMap.put("thorn", "\u00fe");
81     tempMap.put("ugrave", "\u00f9");
82     tempMap.put("uacute", "\u00fa");
83     tempMap.put("ucirc", "\u00fb");
84     tempMap.put("uuml", "\u00fc");
85     tempMap.put("yacute", "\u00fd");
86 
87     // Clearsilver's Copyright symbol!
88     tempMap.put("copy", "(C)");
89 
90     // Copy the temporary map to an unmodifiable map for the static lookup.
91     entityValues = Collections.unmodifiableMap(tempMap);
92   }
93 
94   @Override
filter(String in, Appendable out)95   public void filter(String in, Appendable out) throws IOException {
96     char[] inChars = in.toCharArray();
97 
98     // Holds the contents of an & (amp) entity before its decoded.
99     StringBuilder amp = new StringBuilder();
100     State state = State.DEFAULT;
101 
102     // Loop over the input string, ignoring tags, and decoding entities.
103     for (int i = 0; i < inChars.length; i++) {
104       char c = inChars[i];
105       switch (state) {
106 
107         case DEFAULT:
108           switch (c) {
109             case '&':
110               state = State.IN_AMP;
111               break;
112             case '<':
113               state = State.IN_TAG;
114               break;
115             default:
116               // If this is isn't the start of an amp of a tag, treat as plain
117               // text and just output.
118               out.append(c);
119           }
120           break;
121 
122         case IN_TAG:
123           // When in a tag, all input is ignored until the end of the tag.
124           if (c == '>') {
125             state = State.DEFAULT;
126           }
127           break;
128 
129         case IN_AMP:
130           // Semi-colon terminates an entity, try and decode it.
131           if (c == ';') {
132             state = State.DEFAULT;
133             appendDecodedEntityReference(out, amp);
134             amp = new StringBuilder();
135           } else {
136             if (amp.length() < MAX_AMP_LENGTH) {
137               // If this is not the last character in the input, append to the
138               // amp buffer and continue, if it is the last, dump the buffer
139               // to stop the contents of it being lost.
140               if (i != inChars.length - 1) {
141                 amp.append(c);
142               } else {
143                 out.append('&').append(amp).append(c);
144               }
145             } else {
146               // More than 8 chars, so not a valid entity, dump as plain text.
147               out.append('&').append(amp).append(c);
148               amp = new StringBuilder();
149               state = State.DEFAULT;
150             }
151           }
152           break;
153       }
154     }
155   }
156 
157   /**
158    * Attempts to decode the entity provided, if it succeeds appends it to the out string.
159    *
160    * @param out the string builder to add the decoded entity to.
161    * @param entityName to decode.
162    */
appendDecodedEntityReference(Appendable out, CharSequence entityName)163   private void appendDecodedEntityReference(Appendable out, CharSequence entityName)
164       throws IOException {
165 
166     // All the valid entities are at least two characters long.
167     if (entityName.length() < 2) {
168       return;
169     }
170 
171     entityName = entityName.toString().toLowerCase();
172 
173     // Numbered entity.
174     if (entityName.charAt(0) == '#') {
175       appendNumberedEntity(out, entityName.subSequence(1, entityName.length()));
176       return;
177     }
178 
179     // If the entity is not a numeric value, try looking it up by name.
180     String entity = entityValues.get(entityName);
181 
182     // If there is an entity by that name add it to the output.
183     if (entity != null) {
184       out.append(entity);
185     }
186   }
187 
188   /**
189    * Appends an entity to a string by numeric code.
190    *
191    * @param out the string to add the entity to.
192    * @param entity the numeric code for the entity as a char sequence.
193    */
appendNumberedEntity(Appendable out, CharSequence entity)194   private void appendNumberedEntity(Appendable out, CharSequence entity) throws IOException {
195 
196     if (entity.length() != 0) {
197       try {
198         char c;
199         // Hex numbered entities start with x.
200         if (entity.charAt(0) == 'x') {
201           c = (char) Integer.parseInt(entity.subSequence(1, entity.length()).toString(), 16);
202         } else {
203           // If its numbered, but not hex, its decimal.
204           c = (char) Integer.parseInt(entity.toString(), 10);
205         }
206 
207         // Don't append null characters, this is to remain Clearsilver compatible.
208         if (c != '\u0000') {
209           out.append(c);
210         }
211       } catch (NumberFormatException e) {
212         // Do nothing if this is not a valid numbered entity.
213       }
214     }
215   }
216 }
217