/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.clearsilver.jsilver.functions.html;

import com.google.clearsilver.jsilver.functions.TextFilter;
import com.google.clearsilver.jsilver.functions.escape.HtmlEscapeFunction;
import com.google.clearsilver.jsilver.functions.escape.SimpleEscapingFunction;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * This class implements the ClearSilver text_html function.
 *
 * It converts plain text into html, including adding 'tt' tags to ascii art and linking email and
 * web addresses.
 *
 * Note this implementation differs from ClearSilver, in that it html escapes the contents of links
 * and mailtos.
 */
public class TextHtmlFunction implements TextFilter {

  // These regular expressions are adapted from html.c in the ClearSilver
  // source.

  // Regular expression used to match email addresses, taken from the
  // ClearSilver source to maintain compatibility.
  private static final String EMAIL_REGEXP =
      "[^]\\[@:;<>\\\"()\\s\\p{Cntrl}]+@[-+a-zA-Z0-9]+\\.[-+a-zA-Z0-9\\.]+[-+a-zA-Z0-9]";

  // Regular expression used to match urls with a scheme (http://www.foo.com),
  // adapted from the ClearSilver source to maintain compatibility.
  private static final String WITH_SCHEME_REGEXP = "(?:http|https|ftp|mailto):[^\\s>\"]*";

  // Regular expression used to match urls without a scheme (www.foo.com),
  // adapted from the ClearSilver source to maintain compatibility.
  private static final String WITHOUT_SCHEME_REGEXP = "www\\.[-a-z0-9\\.]+[^\\s;\">]*";

  // Pattern to match any string in the input that is linkable. Each alternative
  // is wrapped in its own capturing group so the kind of match can be told apart.
  private static final Pattern LINKABLES =
      Pattern.compile("(" + EMAIL_REGEXP + ")|(" + WITH_SCHEME_REGEXP + ")|("
          + WITHOUT_SCHEME_REGEXP + ")", Pattern.CASE_INSENSITIVE);

  // Matching groups for the LINKABLES pattern.
  private static final int EMAIL_GROUP = 1;
  private static final int WITH_SCHEME_GROUP = 2;

  // We don't have access to the global html escaper here, so create a new one.
  private final HtmlEscapeFunction htmlEscaper = new HtmlEscapeFunction(false);

  // Escapes a small set of non-safe html characters, and does a very small
  // amount of formatting (newlines become line breaks, carriage returns are dropped).
  private final SimpleEscapingFunction htmlCharEscaper =
      new SimpleEscapingFunction(new char[] {'<', '>', '&', '\n', '\r'}) {

        @Override
        protected String getEscapeString(char c) {
          switch (c) {
            case '<':
              return "&lt;";
            case '>':
              return "&gt;";
            case '&':
              return "&amp;";
            case '\n':
              // Keep the newline so the generated markup stays readable.
              return "<br/>\n";
            case '\r':
              // Carriage returns are removed from the output.
              return "";
            default:
              return null;
          }
        }

      };

  /**
   * Converts plain text to html and appends the result to the output. The whole result is wrapped
   * in a 'tt' element when the input is judged to contain ascii art.
   *
   * @param in The plain text to convert.
   * @param out The output that the generated html is appended to.
   * @throws IOException if appending to the output fails.
   */
  @Override
  public void filter(String in, Appendable out) throws IOException {

    boolean hasAsciiArt = hasAsciiArt(in);

    // Add 'tt' tag to a string that contains 'ascii-art'.
    if (hasAsciiArt) {
      out.append("<tt>");
    }

    splitAndConvert(in, out);

    if (hasAsciiArt) {
      out.append("</tt>");
    }
  }

  /**
   * Splits the input string into blocks of normal text or linkable text. The linkable text is
   * converted into anchor tags before being appended to the output. The normal text is escaped and
   * appended to the output.
   *
   * @param in The plain text to convert.
   * @param out The output that the generated html is appended to.
   * @throws IOException if appending to the output fails.
   */
  private void splitAndConvert(String in, Appendable out) throws IOException {
    Matcher matcher = LINKABLES.matcher(in);
    int regionStart = 0;

    // Keep looking for email addresses and web links until there are none left.
    while (matcher.find()) {
      int matchStart = matcher.start();
      int matchEnd = matcher.end();

      // Escape all the text from the end of the previous match to the start of
      // this match, and append it to the output.
      htmlCharEscaper.filter(in.substring(regionStart, matchStart), out);

      // Don't include a . or , in the text that is linked. Only a single trailing
      // character is dropped; it is emitted as normal text with the next region.
      char lastMatched = in.charAt(matchEnd - 1);
      if (lastMatched == ',' || lastMatched == '.') {
        matchEnd--;
      }

      if (matcher.group(EMAIL_GROUP) != null) {
        formatEmail(in, matchStart, matchEnd, out);
      } else {
        // Add a scheme if one wasn't found.
        boolean addScheme = matcher.group(WITH_SCHEME_GROUP) == null;
        formatUrl(in, matchStart, matchEnd, addScheme, out);
      }

      regionStart = matchEnd;
    }

    // Escape the text after the last match, and append it to the output.
    htmlCharEscaper.filter(in.substring(regionStart), out);
  }

  /**
   * Formats the input sequence into a suitable mailto: anchor tag and appends it to the output.
   *
   * @param in The string that contains the email.
   * @param start The start of the email address in the whole string.
   * @param end The end of the email in the whole string.
   * @param out The text output that the email address should be appended to.
   * @throws IOException if appending to the output fails.
   */
  private void formatEmail(String in, int start, int end, Appendable out) throws IOException {

    String emailPart = in.substring(start, end);

    out.append("<a href=\"mailto:");
    htmlEscaper.filter(emailPart, out);
    out.append("\">");
    htmlEscaper.filter(emailPart, out);
    out.append("</a>");
  }

  /**
   * Formats the input sequence into a suitable anchor tag and appends it to the output.
   *
   * @param in The string that contains the url.
   * @param start The start of the url in the containing string.
   * @param end The end of the url in the containing string.
   * @param addScheme true if 'http://' should be added to the anchor.
   * @param out The text output that the url should be appended to.
   * @throws IOException if appending to the output fails.
   */
  private void formatUrl(String in, int start, int end, boolean addScheme, Appendable out)
      throws IOException {

    String urlPart = in.substring(start, end);

    // The leading space before the anchor is part of the emitted markup and is kept as is.
    out.append(" <a target=\"_blank\" href=\"");
    if (addScheme) {
      out.append("http://");
    }
    htmlEscaper.filter(urlPart, out);
    out.append("\">");
    htmlEscaper.filter(urlPart, out);
    out.append("</a>");
  }

  /**
   * Attempts to detect if a string contains ascii art, whitespace such as tabs will suppress ascii
   * art detection.
   *
   * This method takes its conditions from ClearSilver to maintain compatibility. See
   * has_space_formatting in html.c in the ClearSilver source.
   *
   * The scan returns true as soon as more than three 'art' characters have been counted since the
   * counters were last reset. Any character that is not a tab, carriage return, space, newline or
   * 'art' character ends the scan with false if more than two newlines or more than two counted
   * spaces precede it, and otherwise resets all counters.
   *
   * @param in The string to analyze for ascii art.
   * @return true if it is believed that the string contains ascii art.
   */
  private static boolean hasAsciiArt(String in) {
    int spaces = 0;
    int returns = 0;
    int asciiArt = 0;

    int length = in.length();
    for (int x = 0; x < length; x++) {

      switch (in.charAt(x)) {
        case '\t':
          // A tab anywhere suppresses detection.
          return false;

        case '\r':
          // Carriage returns do not affect any counter.
          break;

        case ' ':
          // Ignore spaces after full stops.
          if (x == 0 || in.charAt(x - 1) != '.') {
            spaces++;
          }
          break;

        case '\n':
          spaces = 0;
          returns++;
          break;

        // Characters to count towards the art total.
        case '/':
        case '\\':
        case '<':
        case '>':
        case ':':
        case '[':
        case ']':
        case '!':
        case '@':
        case '#':
        case '$':
        case '%':
        case '^':
        case '&':
        case '*':
        case '(':
        case ')':
        case '|':
          asciiArt++;
          if (asciiArt > 3) {
            return true;
          }
          break;

        default:
          if (returns > 2) {
            return false;
          }
          if (spaces > 2) {
            return false;
          }
          returns = 0;
          spaces = 0;
          asciiArt = 0;
          break;
      }
    }

    return false;
  }
}