/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 package com.google.clearsilver.jsilver.template;
18 
19 import java.io.IOException;
20 
/**
 * HTML whitespace stripper to be used by JSilver. Outside of HTML tags it removes leading and
 * trailing whitespace on each line, replaces every run of contiguous whitespace characters with
 * just the first character of the run, and removes lines that contain nothing but whitespace.
 *
 * <p>It does not strip whitespace inside the following elements:
 * <ul>
 * <li> PRE
 * <li> VERBATIM
 * <li> TEXTAREA
 * </ul>
 *
 * <p>Inside HTML tags (i.e. between '&lt;' and '&gt;') and inside SCRIPT elements it only strips
 * empty lines and leading whitespace. Trailing whitespace is left alone there since that is more
 * costly to remove and tends to not be common based on how templates are created (they don't
 * have trailing whitespace).
 *
 * <p>Loadtests indicate that this class can strip whitespace almost as quickly as just reading
 * every character from a string (20% slower).
 *
 * <p>While not strictly compatible with the JNI Clearsilver whitestripping function, we are not
 * aware of any differences that yield functionally different HTML output. However, we encourage
 * users to verify for themselves and report any differences.
 *
 * <p>Instances carry mutable parsing state between calls and perform no synchronization.
 */
public class HtmlWhiteSpaceStripper implements Appendable {

  // Object to output stripped content to.
  private final Appendable out;
  // Level of whitespace stripping to perform. (Currently not used).
  // TODO: Determine what the exact differences are in levels in
  // JNI Clearsilver and see if it is worth porting it.
  private final int level;

  // Has any non-whitespace character been seen since the start of the line.
  private boolean nonWsSeen = false;
  // First whitespace char of the run of whitespace we are currently skipping. It is written out
  // right before the next non-whitespace character, and dropped if a linefeed comes first.
  // 0 signifies no pending whitespace.
  private char pendingWs = 0;

  // We just saw the start of an HTML tag '<'.
  private boolean startHtmlTag = false;
  // Are we currently in an opening HTML tag (not "</").
  private boolean inOpenTag = false;
  // Are we currently in a closing HTML tag.
  private boolean inCloseTag = false;
  // Are we currently in an HTML tag name.
  private boolean inTagName = false;

  // Nesting depth of <textarea> elements we are inside of.
  private int textAreaScope = 0;
  // Nesting depth of <pre> elements we are inside of.
  private int preScope = 0;
  // Nesting depth of <verbatim> elements we are inside of.
  private int verbatimScope = 0;
  // Nesting depth of <script> elements we are inside of.
  private int scriptScope = 0;

  // Used to hold HTML tag element name.
  private final StringBuilder tagName = new StringBuilder(16);

  /**
   * Intermediate Appendable object that strips whitespace as it passes through characters to
   * another Appendable object.
   *
   * @param out The Appendable object to dump the stripped output to.
   */
  public HtmlWhiteSpaceStripper(Appendable out) {
    this(out, 1);
  }

  /**
   * Intermediate Appendable object that strips whitespace as it passes through characters to
   * another Appendable object.
   *
   * @param out The Appendable object to dump the stripped output to.
   * @param level Ignored for now.
   */
  public HtmlWhiteSpaceStripper(Appendable out, int level) {
    this.out = out;
    this.level = level;
  }

  /**
   * Returns the string form of the wrapped output object, i.e. whatever has been emitted so far
   * when that object is something like a {@link StringBuilder}.
   */
  @Override
  public String toString() {
    return out.toString();
  }

  /**
   * Strips and forwards every character of {@code csq}.
   *
   * @param csq The characters to process. As required by {@link Appendable}, {@code null} is
   *     treated as the four characters {@code "null"}.
   * @return this stripper.
   * @throws IOException if the wrapped Appendable fails.
   */
  @Override
  public Appendable append(CharSequence csq) throws IOException {
    CharSequence seq = (csq == null) ? "null" : csq;
    return append(seq, 0, seq.length());
  }

  /**
   * Strips and forwards the characters of {@code csq} in the range {@code [start, end)}.
   *
   * @param csq The characters to process. As required by {@link Appendable}, {@code null} is
   *     treated as the four characters {@code "null"}.
   * @param start Index of the first character to process.
   * @param end Index after the last character to process.
   * @return this stripper.
   * @throws IOException if the wrapped Appendable fails.
   */
  @Override
  public Appendable append(CharSequence csq, int start, int end) throws IOException {
    CharSequence seq = (csq == null) ? "null" : csq;
    for (int i = start; i < end; i++) {
      append(seq.charAt(i));
    }
    return this;
  }

  /**
   * Feeds a single character through the tag-tracking state machine and emits it (or not)
   * according to the stripping rules that apply at the current position.
   *
   * @param c The character to process.
   * @return this stripper.
   * @throws IOException if the wrapped Appendable fails.
   */
  @Override
  public Appendable append(char c) throws IOException {
    if (inOpenTag || inCloseTag) {
      // In an HTML tag.
      if (startHtmlTag) {
        // This is the first character in an HTML tag.
        if (c == '/') {
          // We are in a close tag. startHtmlTag stays set so the next char starts the name.
          inOpenTag = false;
          inCloseTag = true;
        } else {
          // This is the first non-'/' character in an HTML tag.
          startHtmlTag = false;
          if (isTagNameStartChar(c)) {
            // We have a valid tag name first char.
            inTagName = true;
            tagName.append(c);
          }
        }
      } else if (inTagName) {
        // We were last parsing the name of the HTML tag.
        if (isTagNameChar(c)) {
          tagName.append(c);
        } else {
          // Name is complete. Must run before the '>' check below clears inOpenTag, since
          // processTagName() uses it to tell opening from closing tags.
          processTagName();
        }
      }
      if (c == '>') {
        // We are at the end of the tag.
        inOpenTag = false;
        inCloseTag = false;
        nonWsSeen = true;
      }
      stripLeadingWsAndEmptyLines(c);
    } else {
      // Outside of HTML tag.
      if (c == '<') {
        // Starting a new HTML tag.
        inOpenTag = true;
        startHtmlTag = true;
      }
      if (preScope > 0 || verbatimScope > 0 || textAreaScope > 0) {
        // In an HTML element that we want to preserve whitespace in.
        out.append(c);
      } else if (scriptScope > 0) {
        // Only drop leading whitespace and empty lines inside scripts.
        stripLeadingWsAndEmptyLines(c);
      } else {
        stripAll(c);
      }
    }

    return this;
  }

  /**
   * Light stripping used inside tags and scripts: drops whitespace at the start of a line and
   * linefeeds that would produce an empty line; everything else is passed through unchanged.
   */
  private void stripLeadingWsAndEmptyLines(char c) throws IOException {
    if (c == '\n') {
      // Only end the line if it had content, which deletes empty lines.
      if (nonWsSeen) {
        out.append(c);
      }
      nonWsSeen = false;
    } else if (isWs(c)) {
      // Whitespace is kept only once the line has content, i.e. leading whitespace is dropped.
      if (nonWsSeen) {
        out.append(c);
      }
    } else {
      nonWsSeen = true;
      out.append(c);
    }
  }

  /**
   * Full stripping used for content that is safe to remove whitespace from: drops leading and
   * trailing whitespace, drops empty lines, and collapses each run of whitespace to the first
   * character of that run.
   */
  private void stripAll(char c) throws IOException {
    if (c == '\n') {
      if (nonWsSeen) {
        // We don't want blank lines so we don't output linefeed unless we
        // saw non-whitespace.
        out.append(c);
      }
      // We don't want trailing whitespace.
      pendingWs = 0;
      nonWsSeen = false;
    } else if (isWs(c)) {
      // Leading whitespace is omitted. Otherwise remember only the first char of the run;
      // later whitespace in the same run must not overwrite it.
      if (nonWsSeen && pendingWs == 0) {
        pendingWs = c;
      }
    } else {
      if (pendingWs != 0) {
        out.append(pendingWs);
        pendingWs = 0;
      }
      nonWsSeen = true;
      out.append(c);
    }
  }

  /** Adds {@code inc} to a scope depth, never letting the result drop below zero. */
  private static int updateScope(int current, int inc) {
    int updated = current + inc;
    return updated < 0 ? 0 : updated;
  }

  /**
   * Consumes the buffered tag name and, if it names one of the special elements, increments
   * (opening tag) or decrements (closing tag) the matching scope depth.
   *
   * <p>This code assumes well-formed HTML as input with HTML elements opening and closing
   * properly in the right order.
   */
  private void processTagName() {
    inTagName = false;
    String name = tagName.toString();
    tagName.setLength(0);
    int inc = inOpenTag ? 1 : -1;
    if ("textarea".equalsIgnoreCase(name)) {
      textAreaScope = updateScope(textAreaScope, inc);
    } else if ("pre".equalsIgnoreCase(name)) {
      preScope = updateScope(preScope, inc);
    } else if ("verbatim".equalsIgnoreCase(name)) {
      verbatimScope = updateScope(verbatimScope, inc);
    } else if ("script".equalsIgnoreCase(name)) {
      scriptScope = updateScope(scriptScope, inc);
    }
  }

  /** Returns whether {@code c} may start a tag name (an ASCII letter). */
  private static boolean isTagNameStartChar(char c) {
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
  }

  // From W3C HTML spec.
  /** Returns whether {@code c} may appear inside a tag name. */
  private static boolean isTagNameChar(char c) {
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') || (c == '_')
        || (c == '-') || (c == ':') || (c == '.');
  }

  /**
   * Returns whether {@code c} is a non-linefeed whitespace character.
   *
   * <p>Note, we treat '\n' as a separate special character as it has special rules since it
   * determines what a 'line' of content is for doing leading and trailing whitespace removal and
   * empty line removal.
   */
  private static boolean isWs(char c) {
    return c == ' ' || c == '\t' || c == '\r';
  }
}
278