/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 package com.google.clearsilver.jsilver.functions.html;
18 
19 import com.google.clearsilver.jsilver.functions.TextFilter;
20 import com.google.clearsilver.jsilver.functions.escape.HtmlEscapeFunction;
21 import com.google.clearsilver.jsilver.functions.escape.SimpleEscapingFunction;
22 
23 import java.io.IOException;
24 import java.util.regex.Matcher;
25 import java.util.regex.Pattern;
26 
27 /**
28  * This class implements the ClearSilver text_html function.
29  *
30  * It converts plain text into html, including adding 'tt' tags to ascii art and linking email and
31  * web addresses.
32  *
33  * Note this implementation differs from ClearSilver, in that it html escapes the contents of links
34  * and mailtos.
35  */
36 public class TextHtmlFunction implements TextFilter {
37 
38   // These regular expressions are adapted from html.c in the ClearSilver
39   // source.
40 
41   // Regular expression used to match email addresses, taken from the
42   // ClearSilver source to maintain compatibility.
43   private static final String EMAIL_REGEXP =
44       "[^]\\[@:;<>\\\"()\\s\\p{Cntrl}]+@[-+a-zA-Z0-9]+\\.[-+a-zA-Z0-9\\.]+[-+a-zA-Z0-9]";
45 
46   // Regular expression used to match urls without a scheme (www.foo.com),
47   // adapted from the ClearSilver source to maintain compatibility.
48   private static final String WITH_SCHEME_REGEXP = "(?:http|https|ftp|mailto):[^\\s>\"]*";
49 
50   // Regular expression used to match urls with a scheme (http://www.foo.com),
51   // adapted from the ClearSilver source to maintain compatibility.
52   private static final String WITHOUT_SCHEME_REGEXP = "www\\.[-a-z0-9\\.]+[^\\s;\">]*";
53 
54   // Pattern to match any string in the input that is linkable.
55   private static final Pattern LINKABLES =
56       Pattern.compile("(" + EMAIL_REGEXP + ")|(" + WITH_SCHEME_REGEXP + ")|("
57           + WITHOUT_SCHEME_REGEXP + ")", Pattern.CASE_INSENSITIVE);
58 
59   // Matching groups for the LINKABLES pattern.
60   private static final int EMAIL_GROUP = 1;
61   private static final int WITH_SCHEME_GROUP = 2;
62 
63   // We don't have access to the global html escaper here, so create a new one.
64   private final HtmlEscapeFunction htmlEscaper = new HtmlEscapeFunction(false);
65 
66   // Escapes a small set of non-safe html characters, and does a a very small
67   // amount of formatting.
68   private final SimpleEscapingFunction htmlCharEscaper =
69       new SimpleEscapingFunction(new char[] {'<', '>', '&', '\n', '\r'}) {
70 
71         @Override
72         protected String getEscapeString(char c) {
73           switch (c) {
74             case '<':
75               return "&lt;";
76             case '>':
77               return "&gt;";
78             case '&':
79               return "&amp;";
80             case '\n':
81               return "<br/>\n";
82             case '\r':
83               return "";
84             default:
85               return null;
86           }
87         }
88 
89       };
90 
91   @Override
filter(String in, Appendable out)92   public void filter(String in, Appendable out) throws IOException {
93 
94     boolean hasAsciiArt = hasAsciiArt(in);
95 
96     // Add 'tt' tag to a string that contains 'ascii-art'.
97     if (hasAsciiArt) {
98       out.append("<tt>");
99     }
100 
101     splitAndConvert(in, out);
102 
103     if (hasAsciiArt) {
104       out.append("</tt>");
105     }
106   }
107 
108   /**
109    * Splits the input string into blocks of normal text or linkable text. The linkable text is
110    * converted into anchor tags before being appended to the output. The normal text is escaped and
111    * appended to the output.
112    */
splitAndConvert(String in, Appendable out)113   private void splitAndConvert(String in, Appendable out) throws IOException {
114     Matcher matcher = LINKABLES.matcher(in);
115     int end = in.length();
116     int matchStart;
117     int matchEnd;
118     int regionStart = 0;
119 
120     // Keep looking for email addresses and web links until there are none left.
121     while (matcher.find()) {
122       matchStart = matcher.start();
123       matchEnd = matcher.end();
124 
125       // Escape all the text from the end of the previous match to the start of
126       // this match, and append it to the output.
127       htmlCharEscaper.filter(in.subSequence(regionStart, matchStart).toString(), out);
128 
129       // Don't include a . or , in the text that is linked.
130       if (in.charAt(matchEnd - 1) == ',' || in.charAt(matchEnd - 1) == '.') {
131         matchEnd--;
132       }
133 
134       if (matcher.group(EMAIL_GROUP) != null) {
135         formatEmail(in, matchStart, matchEnd, out);
136       } else {
137         formatUrl(in, matchStart, matchEnd,
138         // Add a scheme if the one wasn't found.
139             matcher.group(WITH_SCHEME_GROUP) == null, out);
140       }
141 
142       regionStart = matchEnd;
143     }
144 
145     // Escape the text after the last match, and append it to the output.
146     htmlCharEscaper.filter(in.substring(regionStart, end), out);
147   }
148 
149   /**
150    * Formats the input sequence into a suitable mailto: anchor tag and appends it to the output.
151    *
152    * @param in The string that contains the email.
153    * @param start The start of the email address in the whole string.
154    * @param end The end of the email in the whole string.
155    * @param out The text output that the email address should be appended to.
156    * @throws IOException
157    */
formatEmail(String in, int start, int end, Appendable out)158   private void formatEmail(String in, int start, int end, Appendable out) throws IOException {
159 
160     String emailPart = in.substring(start, end);
161 
162     out.append("<a href=\"mailto:");
163     htmlEscaper.filter(emailPart, out);
164     out.append("\">");
165     htmlEscaper.filter(emailPart, out);
166     out.append("</a>");
167   }
168 
169   /**
170    * Formats the input sequence into a suitable anchor tag and appends it to the output.
171    *
172    * @param in The string that contains the url.
173    * @param start The start of the url in the containing string.
174    * @param end The end of the url in the containing string.
175    * @param addScheme true if 'http://' should be added to the anchor.
176    * @param out The text output that the url should be appended to.
177    * @throws IOException
178    */
formatUrl(String in, int start, int end, boolean addScheme, Appendable out)179   private void formatUrl(String in, int start, int end, boolean addScheme, Appendable out)
180       throws IOException {
181 
182     String urlPart = in.substring(start, end);
183 
184     out.append(" <a target=\"_blank\" href=\"");
185     if (addScheme) {
186       out.append("http://");
187     }
188     htmlEscaper.filter(urlPart, out);
189     out.append("\">");
190     htmlEscaper.filter(urlPart, out);
191     out.append("</a>");
192   }
193 
194   /**
195    * Attempts to detect if a string contains ascii art, whitespace such as tabs will suppress ascii
196    * art detection.
197    *
198    * This method takes its conditions from ClearSilver to maintain compatibility. See
199    * has_space_formatting in html.c in the ClearSilver source.
200    *
201    * @param in The string to analyze for ascii art.
202    * @return true if it is believed that the string contains ascii art.
203    */
hasAsciiArt(String in)204   private boolean hasAsciiArt(String in) {
205     int spaces = 0;
206     int returns = 0;
207     int asciiArt = 0;
208     int x = 0;
209     char[] inChars = in.toCharArray();
210 
211     int length = in.length();
212     for (x = 0; x < length; x++) {
213 
214       switch (inChars[x]) {
215         case '\t':
216           return false;
217 
218         case '\r':
219           break;
220 
221         case ' ':
222           // Ignore spaces after full stops.
223           if (x == 0 || inChars[x - 1] != '.') {
224             spaces++;
225           }
226           break;
227 
228         case '\n':
229           spaces = 0;
230           returns++;
231           break;
232 
233         // Characters to count towards the art total.
234         case '/':
235         case '\\':
236         case '<':
237         case '>':
238         case ':':
239         case '[':
240         case ']':
241         case '!':
242         case '@':
243         case '#':
244         case '$':
245         case '%':
246         case '^':
247         case '&':
248         case '*':
249         case '(':
250         case ')':
251         case '|':
252           asciiArt++;
253           if (asciiArt > 3) {
254             return true;
255           }
256           break;
257 
258         default:
259           if (returns > 2) {
260             return false;
261           }
262           if (spaces > 2) {
263             return false;
264           }
265           returns = 0;
266           spaces = 0;
267           asciiArt = 0;
268           break;
269       }
270     }
271 
272     return false;
273   }
274 }
275