/*
 * Copyright (C) 2007 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.android.util;

import android.compat.annotation.UnsupportedAppUsage;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Logic for parsing a text message typed by the user, looking for smileys,
 * URLs, acronyms, formatting (e.g., '*'s for bold), me commands
 * (e.g., "/me is asleep"), and punctuation.
 *
 * <p>Parsing breaks the text up into tokens and groups them into a list of
 * parts, which is what the client reads back (see {@link #getParts()}).
 */
public abstract class AbstractMessageParser {
  /**
   * Interface representing the set of resources needed by a message parser.
   *
   * @author jessan (Jessan Hutchison-Quillian)
   */
  public static interface Resources {

    /** Get the known set of URL schemes. */
    public Set<String> getSchemes();

    /**
     * Get the possible values for the last part of a domain name.
     * Values are expected to be reversed in the Trie.
     */
    public TrieNode getDomainSuffixes();

    /** Get the smileys accepted by the parser. */
    public TrieNode getSmileys();

    /** Get the acronyms accepted by the parser. */
    public TrieNode getAcronyms();
  }

  /**
   * Subclasses must define the schemes, domains, smileys and acronyms
   * that are necessary for parsing.
   */
  protected abstract Resources getResources();

  /** Music note that indicates user is listening to a music track. */
  public static final String musicNote = "\u266B ";

  // The text being parsed; parse() drops a leading "/me " command from it.
  private String text;
  // Index into text of the next character that has not been consumed yet.
  private int nextChar;
  // Counter that getCharClass() increments to give every punctuation
  // character its own class; it starts at 10, above the fixed classes 0-4.
  private int nextClass;
  // Parts assembled from the tokens by buildParts().
  private ArrayList<Part> parts;
  // Tokens produced while scanning the text.
  private ArrayList<Token> tokens;
  // Start tokens of formats that are currently open, keyed by format character.
  private HashMap<Character,Format> formatStart;
  // Switches selecting which kinds of content the parser looks for.
  private boolean parseSmilies;
  private boolean parseAcronyms;
  private boolean parseFormatting;
  private boolean parseUrls;
  private boolean parseMeText;
  private boolean parseMusic;

  /**
   * Create a message parser to parse urls, formatting, acronyms, smileys,
   * /me text and music.
   *
   * @param text the text to parse
   */
  public AbstractMessageParser(String text) {
    this(text, true, true, true, true, true, true);
  }

  /**
   * Create a message parser, specifying the kinds of text to parse.
   *
   * @param text the text to parse
   * @param parseSmilies whether to look for smileys
   * @param parseAcronyms whether to look for acronyms
   * @param parseFormatting whether to interpret formatting characters
   * @param parseUrls whether to look for URLs
   * @param parseMusic whether to look for a leading music track marker
   * @param parseMeText whether to look for a leading "/me" command
   */
  public AbstractMessageParser(String text, boolean parseSmilies,
      boolean parseAcronyms, boolean parseFormatting, boolean parseUrls,
      boolean parseMusic, boolean parseMeText) {
    this.text = text;
    this.nextChar = 0;
    this.nextClass = 10;
    this.parts = new ArrayList<Part>();
    this.tokens = new ArrayList<Token>();
    this.formatStart = new HashMap<Character,Format>();
    this.parseSmilies = parseSmilies;
    this.parseAcronyms = parseAcronyms;
    this.parseFormatting = parseFormatting;
    this.parseUrls = parseUrls;
    this.parseMusic = parseMusic;
    this.parseMeText = parseMeText;
  }

  /** Returns the raw
text being parsed. */ getRawText()118 public final String getRawText() { return text; } 119 120 /** Return the number of parts. */ getPartCount()121 public final int getPartCount() { return parts.size(); } 122 123 /** Return the part at the given index. */ getPart(int index)124 public final Part getPart(int index) { return parts.get(index); } 125 126 /** Return the list of parts from the parsed text */ getParts()127 public final List<Part> getParts() { return parts; } 128 129 /** Parses the text string into an internal representation. */ parse()130 public void parse() { 131 // Look for music track (of which there would be only one and it'll be the 132 // first token) 133 if (parseMusicTrack()) { 134 buildParts(null); 135 return; 136 } 137 138 // Look for me commands. 139 String meText = null; 140 if (parseMeText && text.startsWith("/me") && (text.length() > 3) && 141 Character.isWhitespace(text.charAt(3))) { 142 meText = text.substring(0, 4); 143 text = text.substring(4); 144 } 145 146 // Break the text into tokens. 147 boolean wasSmiley = false; 148 while (nextChar < text.length()) { 149 if (!isWordBreak(nextChar)) { 150 if (!wasSmiley || !isSmileyBreak(nextChar)) { 151 throw new AssertionError("last chunk did not end at word break"); 152 } 153 } 154 155 if (parseSmiley()) { 156 wasSmiley = true; 157 } else { 158 wasSmiley = false; 159 160 if (!parseAcronym() && !parseURL() && !parseFormatting()) { 161 parseText(); 162 } 163 } 164 } 165 166 // Trim the whitespace before and after media components. 167 for (int i = 0; i < tokens.size(); ++i) { 168 if (tokens.get(i).isMedia()) { 169 if ((i > 0) && (tokens.get(i - 1) instanceof Html)) { 170 ((Html)tokens.get(i - 1)).trimLeadingWhitespace(); 171 } 172 if ((i + 1 < tokens.size()) && (tokens.get(i + 1) instanceof Html)) { 173 ((Html)tokens.get(i + 1)).trimTrailingWhitespace(); 174 } 175 } 176 } 177 178 // Remove any empty html tokens. 
179 for (int i = 0; i < tokens.size(); ++i) { 180 if (tokens.get(i).isHtml() && 181 (tokens.get(i).toHtml(true).length() == 0)) { 182 tokens.remove(i); 183 --i; // visit this index again 184 } 185 } 186 187 buildParts(meText); 188 } 189 190 /** 191 * Get a the appropriate Token for a given URL 192 * 193 * @param text the anchor text 194 * @param url the url 195 * 196 */ tokenForUrl(String url, String text)197 public static Token tokenForUrl(String url, String text) { 198 if(url == null) { 199 return null; 200 } 201 202 //Look for video links 203 Video video = Video.matchURL(url, text); 204 if (video != null) { 205 return video; 206 } 207 208 // Look for video links. 209 YouTubeVideo ytVideo = YouTubeVideo.matchURL(url, text); 210 if (ytVideo != null) { 211 return ytVideo; 212 } 213 214 // Look for photo links. 215 Photo photo = Photo.matchURL(url, text); 216 if (photo != null) { 217 return photo; 218 } 219 220 // Look for photo links. 221 FlickrPhoto flickrPhoto = FlickrPhoto.matchURL(url, text); 222 if (flickrPhoto != null) { 223 return flickrPhoto; 224 } 225 226 //Not media, so must be a regular URL 227 return new Link(url, text); 228 } 229 230 /** 231 * Builds the parts list. 232 * 233 * @param meText any meText parsed from the message 234 */ buildParts(String meText)235 private void buildParts(String meText) { 236 for (int i = 0; i < tokens.size(); ++i) { 237 Token token = tokens.get(i); 238 if (token.isMedia() || (parts.size() == 0) || lastPart().isMedia()) { 239 parts.add(new Part()); 240 } 241 lastPart().add(token); 242 } 243 244 // The first part inherits the meText of the line. 245 if (parts.size() > 0) { 246 parts.get(0).setMeText(meText); 247 } 248 } 249 250 /** Returns the last part in the list. */ lastPart()251 private Part lastPart() { return parts.get(parts.size() - 1); } 252 253 /** 254 * Looks for a music track (\u266B is first character, everything else is 255 * track info). 
256 */ parseMusicTrack()257 private boolean parseMusicTrack() { 258 259 if (parseMusic && text.startsWith(musicNote)) { 260 addToken(new MusicTrack(text.substring(musicNote.length()))); 261 nextChar = text.length(); 262 return true; 263 } 264 return false; 265 } 266 267 /** Consumes all of the text in the next word . */ parseText()268 private void parseText() { 269 StringBuilder buf = new StringBuilder(); 270 int start = nextChar; 271 do { 272 char ch = text.charAt(nextChar++); 273 switch (ch) { 274 case '<': buf.append("<"); break; 275 case '>': buf.append(">"); break; 276 case '&': buf.append("&"); break; 277 case '"': buf.append("""); break; 278 case '\'': buf.append("'"); break; 279 case '\n': buf.append("<br>"); break; 280 default: buf.append(ch); break; 281 } 282 } while (!isWordBreak(nextChar)); 283 284 addToken(new Html(text.substring(start, nextChar), buf.toString())); 285 } 286 287 /** 288 * Looks for smileys (e.g., ":)") in the text. The set of known smileys is 289 * loaded from a file into a trie at server start. 290 */ parseSmiley()291 private boolean parseSmiley() { 292 if(!parseSmilies) { 293 return false; 294 } 295 TrieNode match = longestMatch(getResources().getSmileys(), this, nextChar, 296 true); 297 if (match == null) { 298 return false; 299 } else { 300 int previousCharClass = getCharClass(nextChar - 1); 301 int nextCharClass = getCharClass(nextChar + match.getText().length()); 302 if ((previousCharClass == 2 || previousCharClass == 3) 303 && (nextCharClass == 2 || nextCharClass == 3)) { 304 return false; 305 } 306 addToken(new Smiley(match.getText())); 307 nextChar += match.getText().length(); 308 return true; 309 } 310 } 311 312 /** Looks for acronyms (e.g., "lol") in the text. 
313 */ parseAcronym()314 private boolean parseAcronym() { 315 if(!parseAcronyms) { 316 return false; 317 } 318 TrieNode match = longestMatch(getResources().getAcronyms(), this, nextChar); 319 if (match == null) { 320 return false; 321 } else { 322 addToken(new Acronym(match.getText(), match.getValue())); 323 nextChar += match.getText().length(); 324 return true; 325 } 326 } 327 328 /** Determines if this is an allowable domain character. */ isDomainChar(char c)329 private boolean isDomainChar(char c) { 330 return c == '-' || Character.isLetter(c) || Character.isDigit(c); 331 } 332 333 /** Determines if the given string is a valid domain. */ isValidDomain(String domain)334 private boolean isValidDomain(String domain) { 335 // For hostnames, check that it ends with a known domain suffix 336 if (matches(getResources().getDomainSuffixes(), reverse(domain))) { 337 return true; 338 } 339 return false; 340 } 341 342 /** 343 * Looks for a URL in two possible forms: either a proper URL with a known 344 * scheme or a domain name optionally followed by a path, query, or query. 345 */ parseURL()346 private boolean parseURL() { 347 // Make sure this is a valid place to start a URL. 348 if (!parseUrls || !isURLBreak(nextChar)) { 349 return false; 350 } 351 352 int start = nextChar; 353 354 // Search for the first block of letters. 355 int index = start; 356 while ((index < text.length()) && isDomainChar(text.charAt(index))) { 357 index += 1; 358 } 359 360 String url = ""; 361 boolean done = false; 362 363 if (index == text.length()) { 364 return false; 365 } else if (text.charAt(index) == ':') { 366 // Make sure this is a known scheme. 367 String scheme = text.substring(nextChar, index); 368 if (!getResources().getSchemes().contains(scheme)) { 369 return false; 370 } 371 } else if (text.charAt(index) == '.') { 372 // Search for the end of the domain name. 
373 while (index < text.length()) { 374 char ch = text.charAt(index); 375 if ((ch != '.') && !isDomainChar(ch)) { 376 break; 377 } else { 378 index += 1; 379 } 380 } 381 382 // Make sure the domain name has a valid suffix. Since tries look for 383 // prefix matches, we reverse all the strings to get suffix comparisons. 384 String domain = text.substring(nextChar, index); 385 if (!isValidDomain(domain)) { 386 return false; 387 } 388 389 // Search for a port. We deal with this specially because a colon can 390 // also be a punctuation character. 391 if ((index + 1 < text.length()) && (text.charAt(index) == ':')) { 392 char ch = text.charAt(index + 1); 393 if (Character.isDigit(ch)) { 394 index += 1; 395 while ((index < text.length()) && 396 Character.isDigit(text.charAt(index))) { 397 index += 1; 398 } 399 } 400 } 401 402 // The domain name should be followed by end of line, whitespace, 403 // punctuation, or a colon, slash, question, or hash character. The 404 // tricky part here is that some URL characters are also punctuation, so 405 // we need to distinguish them. Since we looked for ports above, a colon 406 // is always punctuation here. To distinguish '?' cases, we look at the 407 // character that follows it. 408 if (index == text.length()) { 409 done = true; 410 } else { 411 char ch = text.charAt(index); 412 if (ch == '?') { 413 // If the next character is whitespace or punctuation (or missing), 414 // then this question mark looks like punctuation. 415 if (index + 1 == text.length()) { 416 done = true; 417 } else { 418 char ch2 = text.charAt(index + 1); 419 if (Character.isWhitespace(ch2) || isPunctuation(ch2)) { 420 done = true; 421 } 422 } 423 } else if (isPunctuation(ch)) { 424 done = true; 425 } else if (Character.isWhitespace(ch)) { 426 done = true; 427 } else if ((ch == '/') || (ch == '#')) { 428 // In this case, the URL is not done. We will search for the end of 429 // it below. 
430 } else { 431 return false; 432 } 433 } 434 435 // We will assume the user meant HTTP. (One weird case is where they 436 // type a port of 443. That could mean HTTPS, but they might also want 437 // HTTP. We'll let them specify if they don't want HTTP.) 438 url = "http://"; 439 } else { 440 return false; 441 } 442 443 // If the URL is not done, search for the end, which is just before the 444 // next whitespace character. 445 if (!done) { 446 while ((index < text.length()) && 447 !Character.isWhitespace(text.charAt(index))) { 448 index += 1; 449 } 450 } 451 452 String urlText = text.substring(start, index); 453 url += urlText; 454 455 // Figure out the appropriate token type. 456 addURLToken(url, urlText); 457 458 nextChar = index; 459 return true; 460 } 461 462 /** 463 * Adds the appropriate token for the given URL. This might be a simple 464 * link or it might be a recognized media type. 465 */ addURLToken(String url, String text)466 private void addURLToken(String url, String text) { 467 addToken(tokenForUrl(url, text)); 468 } 469 470 /** 471 * Deal with formatting characters. 472 * 473 * Parsing is as follows: 474 * - Treat all contiguous strings of formatting characters as one block. 475 * (This method processes one block.) 476 * - Only a single instance of a particular format character within a block 477 * is used to determine whether to turn on/off that type of formatting; 478 * other instances simply print the character itself. 479 * - If the format is to be turned on, we use the _first_ instance; if it 480 * is to be turned off, we use the _last_ instance (by appending the 481 * format.) 
482 * 483 * Example: 484 * **string** turns into <b>*string*</b> 485 */ parseFormatting()486 private boolean parseFormatting() { 487 if(!parseFormatting) { 488 return false; 489 } 490 int endChar = nextChar; 491 while ((endChar < text.length()) && isFormatChar(text.charAt(endChar))) { 492 endChar += 1; 493 } 494 495 if ((endChar == nextChar) || !isWordBreak(endChar)) { 496 return false; 497 } 498 499 // Keeps track of whether we've seen a character (in map if we've seen it) 500 // and whether we should append a closing format token (if value in 501 // map is TRUE). Linked hashmap for consistent ordering. 502 LinkedHashMap<Character, Boolean> seenCharacters = 503 new LinkedHashMap<Character, Boolean>(); 504 505 for (int index = nextChar; index < endChar; ++index) { 506 char ch = text.charAt(index); 507 Character key = Character.valueOf(ch); 508 if (seenCharacters.containsKey(key)) { 509 // Already seen this character, just append an unmatched token, which 510 // will print plaintext character 511 addToken(new Format(ch, false)); 512 } else { 513 Format start = formatStart.get(key); 514 if (start != null) { 515 // Match the start token, and ask an end token to be appended 516 start.setMatched(true); 517 formatStart.remove(key); 518 seenCharacters.put(key, Boolean.TRUE); 519 } else { 520 // Append start token 521 start = new Format(ch, true); 522 formatStart.put(key, start); 523 addToken(start); 524 seenCharacters.put(key, Boolean.FALSE); 525 } 526 } 527 } 528 529 // Append any necessary end tokens 530 for (Character key : seenCharacters.keySet()) { 531 if (seenCharacters.get(key) == Boolean.TRUE) { 532 Format end = new Format(key.charValue(), false); 533 end.setMatched(true); 534 addToken(end); 535 } 536 } 537 538 nextChar = endChar; 539 return true; 540 } 541 542 /** Determines whether the given index could be a possible word break. 
*/ isWordBreak(int index)543 private boolean isWordBreak(int index) { 544 return getCharClass(index - 1) != getCharClass(index); 545 } 546 547 /** Determines whether the given index could be a possible smiley break. */ isSmileyBreak(int index)548 private boolean isSmileyBreak(int index) { 549 if (index > 0 && index < text.length()) { 550 if (isSmileyBreak(text.charAt(index - 1), text.charAt(index))) { 551 return true; 552 } 553 } 554 555 return false; 556 } 557 558 /** 559 * Verifies that the character before the given index is end of line, 560 * whitespace, or punctuation. 561 */ isURLBreak(int index)562 private boolean isURLBreak(int index) { 563 switch (getCharClass(index - 1)) { 564 case 2: 565 case 3: 566 case 4: 567 return false; 568 569 case 0: 570 case 1: 571 default: 572 return true; 573 } 574 } 575 576 /** Returns the class for the character at the given index. */ getCharClass(int index)577 private int getCharClass(int index) { 578 if ((index < 0) || (text.length() <= index)) { 579 return 0; 580 } 581 582 char ch = text.charAt(index); 583 if (Character.isWhitespace(ch)) { 584 return 1; 585 } else if (Character.isLetter(ch)) { 586 return 2; 587 } else if (Character.isDigit(ch)) { 588 return 3; 589 } else if (isPunctuation(ch)) { 590 // For punctuation, we return a unique value every time so that they are 591 // always different from any other character. Punctuation should always 592 // be considered a possible word break. 593 return ++nextClass; 594 } else { 595 return 4; 596 } 597 } 598 599 /** 600 * Returns true if <code>c1</code> could be the last character of 601 * a smiley and <code>c2</code> could be the first character of 602 * a different smiley, if {@link #isWordBreak} would not already 603 * recognize that this is possible. 604 */ isSmileyBreak(char c1, char c2)605 private static boolean isSmileyBreak(char c1, char c2) { 606 switch (c1) { 607 /* 608 * These characters can end smileys, but don't normally end words. 
609 */ 610 case '$': case '&': case '*': case '+': case '-': 611 case '/': case '<': case '=': case '>': case '@': 612 case '[': case '\\': case ']': case '^': case '|': 613 case '}': case '~': 614 switch (c2) { 615 /* 616 * These characters can begin smileys, but don't normally 617 * begin words. 618 */ 619 case '#': case '$': case '%': case '*': case '/': 620 case '<': case '=': case '>': case '@': case '[': 621 case '\\': case '^': case '~': 622 return true; 623 } 624 } 625 626 return false; 627 } 628 629 /** Determines whether the given character is punctuation. */ isPunctuation(char ch)630 private static boolean isPunctuation(char ch) { 631 switch (ch) { 632 case '.': case ',': case '"': case ':': case ';': 633 case '?': case '!': case '(': case ')': 634 return true; 635 636 default: 637 return false; 638 } 639 } 640 641 /** 642 * Determines whether the given character is the beginning or end of a 643 * section with special formatting. 644 */ isFormatChar(char ch)645 private static boolean isFormatChar(char ch) { 646 switch (ch) { 647 case '*': case '_': case '^': 648 return true; 649 650 default: 651 return false; 652 } 653 } 654 655 /** Represents a unit of parsed output. 
*/ 656 public static abstract class Token { 657 @UnsupportedAppUsage(implicitMember = 658 "values()[Lcom/google/android/util/AbstractMessageParser$Token$Type;") 659 public enum Type { 660 661 @UnsupportedAppUsage 662 HTML ("html"), 663 @UnsupportedAppUsage 664 FORMAT ("format"), // subtype of HTML 665 @UnsupportedAppUsage 666 LINK ("l"), 667 @UnsupportedAppUsage 668 SMILEY ("e"), 669 @UnsupportedAppUsage 670 ACRONYM ("a"), 671 @UnsupportedAppUsage 672 MUSIC ("m"), 673 @UnsupportedAppUsage 674 GOOGLE_VIDEO ("v"), 675 @UnsupportedAppUsage 676 YOUTUBE_VIDEO ("yt"), 677 @UnsupportedAppUsage 678 PHOTO ("p"), 679 @UnsupportedAppUsage 680 FLICKR ("f"); 681 682 //stringreps for HTML and FORMAT don't really matter 683 //because they don't define getInfo(), which is where it is used 684 //For the other types, code depends on their stringreps 685 private String stringRep; 686 Type(String stringRep)687 Type(String stringRep) { 688 this.stringRep = stringRep; 689 } 690 691 /** {@inheritDoc} */ toString()692 public String toString() { 693 return this.stringRep; 694 } 695 } 696 697 protected Type type; 698 protected String text; 699 Token(Type type, String text)700 protected Token(Type type, String text) { 701 this.type = type; 702 this.text = text; 703 } 704 705 /** Returns the type of the token. */ getType()706 public Type getType() { return type; } 707 708 /** 709 * Get the relevant information about a token 710 * 711 * @return a list of strings representing the token, not null 712 * The first item is always a string representation of the type 713 */ getInfo()714 public List<String> getInfo() { 715 List<String> info = new ArrayList<String>(); 716 info.add(getType().toString()); 717 return info; 718 } 719 720 /** Returns the raw text of the token. 
*/ getRawText()721 public String getRawText() { return text; } 722 isMedia()723 public boolean isMedia() { return false; } isHtml()724 public abstract boolean isHtml(); isArray()725 public boolean isArray() { return !isHtml(); } 726 toHtml(boolean caps)727 public String toHtml(boolean caps) { throw new AssertionError("not html"); } 728 729 // The token can change the caps of the text after that point. controlCaps()730 public boolean controlCaps() { return false; } setCaps()731 public boolean setCaps() { return false; } 732 } 733 734 /** Represents a simple string of html text. */ 735 public static class Html extends Token { 736 private String html; 737 Html(String text, String html)738 public Html(String text, String html) { 739 super(Type.HTML, text); 740 this.html = html; 741 } 742 isHtml()743 public boolean isHtml() { return true; } toHtml(boolean caps)744 public String toHtml(boolean caps) { 745 return caps ? html.toUpperCase() : html; 746 } 747 /** 748 * Not supported. Info should not be needed for this type 749 */ getInfo()750 public List<String> getInfo() { 751 throw new UnsupportedOperationException(); 752 } 753 trimLeadingWhitespace()754 public void trimLeadingWhitespace() { 755 text = trimLeadingWhitespace(text); 756 html = trimLeadingWhitespace(html); 757 } 758 trimTrailingWhitespace()759 public void trimTrailingWhitespace() { 760 text = trimTrailingWhitespace(text); 761 html = trimTrailingWhitespace(html); 762 } 763 trimLeadingWhitespace(String text)764 private static String trimLeadingWhitespace(String text) { 765 int index = 0; 766 while ((index < text.length()) && 767 Character.isWhitespace(text.charAt(index))) { 768 ++index; 769 } 770 return text.substring(index); 771 } 772 trimTrailingWhitespace(String text)773 public static String trimTrailingWhitespace(String text) { 774 int index = text.length(); 775 while ((index > 0) && Character.isWhitespace(text.charAt(index - 1))) { 776 --index; 777 } 778 return text.substring(0, index); 779 } 780 } 781 782 
/** Represents a music track token at the beginning. */ 783 public static class MusicTrack extends Token { 784 private String track; 785 MusicTrack(String track)786 public MusicTrack(String track) { 787 super(Type.MUSIC, track); 788 this.track = track; 789 } 790 getTrack()791 public String getTrack() { return track; } 792 isHtml()793 public boolean isHtml() { return false; } 794 getInfo()795 public List<String> getInfo() { 796 List<String> info = super.getInfo(); 797 info.add(getTrack()); 798 return info; 799 } 800 } 801 802 /** Represents a link that was found in the input. */ 803 public static class Link extends Token { 804 private String url; 805 Link(String url, String text)806 public Link(String url, String text) { 807 super(Type.LINK, text); 808 this.url = url; 809 } 810 getURL()811 public String getURL() { return url; } 812 isHtml()813 public boolean isHtml() { return false; } 814 getInfo()815 public List<String> getInfo() { 816 List<String> info = super.getInfo(); 817 info.add(getURL()); 818 info.add(getRawText()); 819 return info; 820 } 821 } 822 823 /** Represents a link to a Google Video. */ 824 public static class Video extends Token { 825 /** Pattern for a video URL. */ 826 private static final Pattern URL_PATTERN = Pattern.compile( 827 "(?i)http://video\\.google\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/videoplay\\?" 828 + ".*?\\bdocid=(-?\\d+).*"); 829 830 private String docid; 831 Video(String docid, String text)832 public Video(String docid, String text) { 833 super(Type.GOOGLE_VIDEO, text); 834 this.docid = docid; 835 } 836 getDocID()837 public String getDocID() { return docid; } 838 isHtml()839 public boolean isHtml() { return false; } isMedia()840 public boolean isMedia() { return true; } 841 842 /** Returns a Video object if the given url is to a video. 
*/ matchURL(String url, String text)843 public static Video matchURL(String url, String text) { 844 Matcher m = URL_PATTERN.matcher(url); 845 if (m.matches()) { 846 return new Video(m.group(1), text); 847 } else { 848 return null; 849 } 850 } 851 getInfo()852 public List<String> getInfo() { 853 List<String> info = super.getInfo(); 854 info.add(getRssUrl(docid)); 855 info.add(getURL(docid)); 856 return info; 857 } 858 859 /** Returns the URL for the RSS description of the given video. */ getRssUrl(String docid)860 public static String getRssUrl(String docid) { 861 return "http://video.google.com/videofeed" 862 + "?type=docid&output=rss&sourceid=gtalk&docid=" + docid; 863 } 864 865 /** (For testing purposes:) Returns a video URL with the given parts. */ getURL(String docid)866 public static String getURL(String docid) { 867 return getURL(docid, null); 868 } 869 870 /** (For testing purposes:) Returns a video URL with the given parts. */ getURL(String docid, String extraParams)871 public static String getURL(String docid, String extraParams) { 872 if (extraParams == null) { 873 extraParams = ""; 874 } else if (extraParams.length() > 0) { 875 extraParams += "&"; 876 } 877 return "http://video.google.com/videoplay?" + extraParams 878 + "docid=" + docid; 879 } 880 } 881 882 /** Represents a link to a YouTube video. */ 883 public static class YouTubeVideo extends Token { 884 /** Pattern for a video URL. */ 885 private static final Pattern URL_PATTERN = Pattern.compile( 886 "(?i)http://(?:[a-z0-9]+\\.)?youtube\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/watch\\?" 
887 + ".*\\bv=([-_a-zA-Z0-9=]+).*"); 888 889 private String docid; 890 YouTubeVideo(String docid, String text)891 public YouTubeVideo(String docid, String text) { 892 super(Type.YOUTUBE_VIDEO, text); 893 this.docid = docid; 894 } 895 getDocID()896 public String getDocID() { return docid; } 897 isHtml()898 public boolean isHtml() { return false; } isMedia()899 public boolean isMedia() { return true; } 900 901 /** Returns a Video object if the given url is to a video. */ matchURL(String url, String text)902 public static YouTubeVideo matchURL(String url, String text) { 903 Matcher m = URL_PATTERN.matcher(url); 904 if (m.matches()) { 905 return new YouTubeVideo(m.group(1), text); 906 } else { 907 return null; 908 } 909 } 910 getInfo()911 public List<String> getInfo() { 912 List<String> info = super.getInfo(); 913 info.add(getRssUrl(docid)); 914 info.add(getURL(docid)); 915 return info; 916 } 917 918 /** Returns the URL for the RSS description of the given video. */ getRssUrl(String docid)919 public static String getRssUrl(String docid) { 920 return "http://youtube.com/watch?v=" + docid; 921 } 922 923 /** (For testing purposes:) Returns a video URL with the given parts. */ getURL(String docid)924 public static String getURL(String docid) { 925 return getURL(docid, null); 926 } 927 928 /** (For testing purposes:) Returns a video URL with the given parts. */ getURL(String docid, String extraParams)929 public static String getURL(String docid, String extraParams) { 930 if (extraParams == null) { 931 extraParams = ""; 932 } else if (extraParams.length() > 0) { 933 extraParams += "&"; 934 } 935 return "http://youtube.com/watch?" + extraParams + "v=" + docid; 936 } 937 938 /** (For testing purposes:) Returns a video URL with the given parts. 939 * @param http If true, includes http:// 940 * @param prefix If non-null/non-blank, adds to URL before youtube.com. 941 * (e.g., prefix="br." 
--> "br.youtube.com") 942 */ getPrefixedURL(boolean http, String prefix, String docid, String extraParams)943 public static String getPrefixedURL(boolean http, String prefix, 944 String docid, String extraParams) { 945 String protocol = ""; 946 947 if (http) { 948 protocol = "http://"; 949 } 950 951 if (prefix == null) { 952 prefix = ""; 953 } 954 955 if (extraParams == null) { 956 extraParams = ""; 957 } else if (extraParams.length() > 0) { 958 extraParams += "&"; 959 } 960 961 return protocol + prefix + "youtube.com/watch?" + extraParams + "v=" + 962 docid; 963 } 964 } 965 966 /** Represents a link to a Picasa photo or album. */ 967 public static class Photo extends Token { 968 /** Pattern for an album or photo URL. */ 969 // TODO (katyarogers) searchbrowse includes search lists and tags, 970 // it follows a different pattern than albums - would be nice to add later 971 private static final Pattern URL_PATTERN = Pattern.compile( 972 "http://picasaweb.google.com/([^/?#&]+)/+((?!searchbrowse)[^/?#&]+)(?:/|/photo)?(?:\\?[^#]*)?(?:#(.*))?"); 973 974 private String user; 975 private String album; 976 private String photo; // null for albums 977 Photo(String user, String album, String photo, String text)978 public Photo(String user, String album, String photo, String text) { 979 super(Type.PHOTO, text); 980 this.user = user; 981 this.album = album; 982 this.photo = photo; 983 } 984 getUser()985 public String getUser() { return user; } getAlbum()986 public String getAlbum() { return album; } getPhoto()987 public String getPhoto() { return photo; } 988 isHtml()989 public boolean isHtml() { return false; } isMedia()990 public boolean isMedia() { return true; } 991 992 /** Returns a Photo object if the given url is to a photo or album. 
*/ matchURL(String url, String text)993 public static Photo matchURL(String url, String text) { 994 Matcher m = URL_PATTERN.matcher(url); 995 if (m.matches()) { 996 return new Photo(m.group(1), m.group(2), m.group(3), text); 997 } else { 998 return null; 999 } 1000 } 1001 getInfo()1002 public List<String> getInfo() { 1003 List<String> info = super.getInfo(); 1004 info.add(getRssUrl(getUser())); 1005 info.add(getAlbumURL(getUser(), getAlbum())); 1006 if (getPhoto() != null) { 1007 info.add(getPhotoURL(getUser(), getAlbum(), getPhoto())); 1008 } else { 1009 info.add((String)null); 1010 } 1011 return info; 1012 } 1013 1014 /** Returns the URL for the RSS description of the user's albums. */ getRssUrl(String user)1015 public static String getRssUrl(String user) { 1016 return "http://picasaweb.google.com/data/feed/api/user/" + user + 1017 "?category=album&alt=rss"; 1018 } 1019 1020 /** Returns the URL for an album. */ getAlbumURL(String user, String album)1021 public static String getAlbumURL(String user, String album) { 1022 return "http://picasaweb.google.com/" + user + "/" + album; 1023 } 1024 1025 /** Returns the URL for a particular photo. */ getPhotoURL(String user, String album, String photo)1026 public static String getPhotoURL(String user, String album, String photo) { 1027 return "http://picasaweb.google.com/" + user + "/" + album + "/photo#" 1028 + photo; 1029 } 1030 } 1031 1032 /** Represents a link to a Flickr photo or album. */ 1033 public static class FlickrPhoto extends Token { 1034 /** Pattern for a user album or photo URL. 
*/ 1035 private static final Pattern URL_PATTERN = Pattern.compile( 1036 "http://(?:www.)?flickr.com/photos/([^/?#&]+)/?([^/?#&]+)?/?.*"); 1037 private static final Pattern GROUPING_PATTERN = Pattern.compile( 1038 "http://(?:www.)?flickr.com/photos/([^/?#&]+)/(tags|sets)/" + 1039 "([^/?#&]+)/?"); 1040 1041 private static final String SETS = "sets"; 1042 private static final String TAGS = "tags"; 1043 1044 private String user; 1045 private String photo; // null for user album 1046 private String grouping; // either "tags" or "sets" 1047 private String groupingId; // sets or tags identifier 1048 FlickrPhoto(String user, String photo, String grouping, String groupingId, String text)1049 public FlickrPhoto(String user, String photo, String grouping, 1050 String groupingId, String text) { 1051 super(Type.FLICKR, text); 1052 1053 /* System wide tags look like the URL to a Flickr user. */ 1054 if (!TAGS.equals(user)) { 1055 this.user = user; 1056 // Don't consider slide show URL a photo 1057 this.photo = (!"show".equals(photo) ? photo : null); 1058 this.grouping = grouping; 1059 this.groupingId = groupingId; 1060 } else { 1061 this.user = null; 1062 this.photo = null; 1063 this.grouping = TAGS; 1064 this.groupingId = photo; 1065 } 1066 } 1067 getUser()1068 public String getUser() { return user; } getPhoto()1069 public String getPhoto() { return photo; } getGrouping()1070 public String getGrouping() { return grouping; } getGroupingId()1071 public String getGroupingId() { return groupingId; } 1072 isHtml()1073 public boolean isHtml() { return false; } isMedia()1074 public boolean isMedia() { return true; } 1075 1076 /** 1077 * Returns a FlickrPhoto object if the given url is to a photo or Flickr 1078 * user. 
1079 */ matchURL(String url, String text)1080 public static FlickrPhoto matchURL(String url, String text) { 1081 Matcher m = GROUPING_PATTERN.matcher(url); 1082 if (m.matches()) { 1083 return new FlickrPhoto(m.group(1), null, m.group(2), m.group(3), text); 1084 } 1085 1086 m = URL_PATTERN.matcher(url); 1087 if (m.matches()) { 1088 return new FlickrPhoto(m.group(1), m.group(2), null, null, text); 1089 } else { 1090 return null; 1091 } 1092 } 1093 getInfo()1094 public List<String> getInfo() { 1095 List<String> info = super.getInfo(); 1096 info.add(getUrl()); 1097 info.add(getUser() != null ? getUser() : ""); 1098 info.add(getPhoto() != null ? getPhoto() : ""); 1099 info.add(getGrouping() != null ? getGrouping() : ""); 1100 info.add(getGroupingId() != null ? getGroupingId() : ""); 1101 return info; 1102 } 1103 getUrl()1104 public String getUrl() { 1105 if (SETS.equals(grouping)) { 1106 return getUserSetsURL(user, groupingId); 1107 } else if (TAGS.equals(grouping)) { 1108 if (user != null) { 1109 return getUserTagsURL(user, groupingId); 1110 } else { 1111 return getTagsURL(groupingId); 1112 } 1113 } else if (photo != null) { 1114 return getPhotoURL(user, photo); 1115 } else { 1116 return getUserURL(user); 1117 } 1118 } 1119 1120 /** Returns the URL for the RSS description. */ getRssUrl(String user)1121 public static String getRssUrl(String user) { 1122 return null; 1123 } 1124 1125 /** Returns the URL for a particular tag. */ getTagsURL(String tag)1126 public static String getTagsURL(String tag) { 1127 return "http://flickr.com/photos/tags/" + tag; 1128 } 1129 1130 /** Returns the URL to the user's Flickr homepage. */ getUserURL(String user)1131 public static String getUserURL(String user) { 1132 return "http://flickr.com/photos/" + user; 1133 } 1134 1135 /** Returns the URL for a particular photo. 
*/ getPhotoURL(String user, String photo)1136 public static String getPhotoURL(String user, String photo) { 1137 return "http://flickr.com/photos/" + user + "/" + photo; 1138 } 1139 1140 /** Returns the URL for a user tag photo set. */ getUserTagsURL(String user, String tagId)1141 public static String getUserTagsURL(String user, String tagId) { 1142 return "http://flickr.com/photos/" + user + "/tags/" + tagId; 1143 } 1144 1145 /** Returns the URL for user set. */ getUserSetsURL(String user, String setId)1146 public static String getUserSetsURL(String user, String setId) { 1147 return "http://flickr.com/photos/" + user + "/sets/" + setId; 1148 } 1149 } 1150 1151 /** Represents a smiley that was found in the input. */ 1152 public static class Smiley extends Token { 1153 // TODO: Pass the SWF URL down to the client. 1154 Smiley(String text)1155 public Smiley(String text) { 1156 super(Type.SMILEY, text); 1157 } 1158 isHtml()1159 public boolean isHtml() { return false; } 1160 getInfo()1161 public List<String> getInfo() { 1162 List<String> info = super.getInfo(); 1163 info.add(getRawText()); 1164 return info; 1165 } 1166 } 1167 1168 /** Represents an acronym that was found in the input. */ 1169 public static class Acronym extends Token { 1170 private String value; 1171 // TODO: SWF 1172 Acronym(String text, String value)1173 public Acronym(String text, String value) { 1174 super(Type.ACRONYM, text); 1175 this.value = value; 1176 } 1177 getValue()1178 public String getValue() { return value; } 1179 isHtml()1180 public boolean isHtml() { return false; } 1181 getInfo()1182 public List<String> getInfo() { 1183 List<String> info = super.getInfo(); 1184 info.add(getRawText()); 1185 info.add(getValue()); 1186 return info; 1187 } 1188 } 1189 1190 /** Represents a character that changes formatting. 
*/ 1191 public static class Format extends Token { 1192 private char ch; 1193 private boolean start; 1194 private boolean matched; 1195 Format(char ch, boolean start)1196 public Format(char ch, boolean start) { 1197 super(Type.FORMAT, String.valueOf(ch)); 1198 this.ch = ch; 1199 this.start = start; 1200 } 1201 setMatched(boolean matched)1202 public void setMatched(boolean matched) { this.matched = matched; } 1203 isHtml()1204 public boolean isHtml() { return true; } 1205 toHtml(boolean caps)1206 public String toHtml(boolean caps) { 1207 // This character only implies special formatting if it was matched. 1208 // Otherwise, it was just a plain old character. 1209 if (matched) { 1210 return start ? getFormatStart(ch) : getFormatEnd(ch); 1211 } else { 1212 // We have to make sure we escape HTML characters as usual. 1213 return (ch == '"') ? """ : String.valueOf(ch); 1214 } 1215 } 1216 1217 /** 1218 * Not supported. Info should not be needed for this type 1219 */ getInfo()1220 public List<String> getInfo() { 1221 throw new UnsupportedOperationException(); 1222 } 1223 controlCaps()1224 public boolean controlCaps() { return (ch == '^'); } setCaps()1225 public boolean setCaps() { return start; } 1226 getFormatStart(char ch)1227 private String getFormatStart(char ch) { 1228 switch (ch) { 1229 case '*': return "<b>"; 1230 case '_': return "<i>"; 1231 case '^': return "<b><font color=\"#005FFF\">"; // TODO: all caps 1232 case '"': return "<font color=\"#999999\">\u201c"; 1233 default: throw new AssertionError("unknown format '" + ch + "'"); 1234 } 1235 } 1236 getFormatEnd(char ch)1237 private String getFormatEnd(char ch) { 1238 switch (ch) { 1239 case '*': return "</b>"; 1240 case '_': return "</i>"; 1241 case '^': return "</font></b>"; // TODO: all caps 1242 case '"': return "\u201d</font>"; 1243 default: throw new AssertionError("unknown format '" + ch + "'"); 1244 } 1245 } 1246 } 1247 1248 /** Adds the given token to the parsed output. 
*/ addToken(Token token)1249 private void addToken(Token token) { 1250 tokens.add(token); 1251 } 1252 1253 /** Converts the entire message into a single HTML display string. */ toHtml()1254 public String toHtml() { 1255 StringBuilder html = new StringBuilder(); 1256 1257 for (Part part : parts) { 1258 boolean caps = false; 1259 1260 html.append("<p>"); 1261 for (Token token : part.getTokens()) { 1262 if (token.isHtml()) { 1263 html.append(token.toHtml(caps)); 1264 } else { 1265 switch (token.getType()) { 1266 case LINK: 1267 html.append("<a href=\""); 1268 html.append(((Link)token).getURL()); 1269 html.append("\">"); 1270 html.append(token.getRawText()); 1271 html.append("</a>"); 1272 break; 1273 1274 case SMILEY: 1275 // TODO: link to an appropriate image 1276 html.append(token.getRawText()); 1277 break; 1278 1279 case ACRONYM: 1280 html.append(token.getRawText()); 1281 break; 1282 1283 case MUSIC: 1284 // TODO: include a music glyph 1285 html.append(((MusicTrack)token).getTrack()); 1286 break; 1287 1288 case GOOGLE_VIDEO: 1289 // TODO: include a Google Video icon 1290 html.append("<a href=\""); 1291 html.append(((Video)token).getURL(((Video)token).getDocID())); 1292 html.append("\">"); 1293 html.append(token.getRawText()); 1294 html.append("</a>"); 1295 break; 1296 1297 case YOUTUBE_VIDEO: 1298 // TODO: include a YouTube icon 1299 html.append("<a href=\""); 1300 html.append(((YouTubeVideo)token).getURL( 1301 ((YouTubeVideo)token).getDocID())); 1302 html.append("\">"); 1303 html.append(token.getRawText()); 1304 html.append("</a>"); 1305 break; 1306 1307 case PHOTO: { 1308 // TODO: include a Picasa Web icon 1309 html.append("<a href=\""); 1310 html.append(Photo.getAlbumURL( 1311 ((Photo)token).getUser(), ((Photo)token).getAlbum())); 1312 html.append("\">"); 1313 html.append(token.getRawText()); 1314 html.append("</a>"); 1315 break; 1316 } 1317 1318 case FLICKR: 1319 // TODO: include a Flickr icon 1320 Photo p = (Photo) token; 1321 html.append("<a href=\""); 1322 
html.append(((FlickrPhoto)token).getUrl()); 1323 html.append("\">"); 1324 html.append(token.getRawText()); 1325 html.append("</a>"); 1326 break; 1327 1328 default: 1329 throw new AssertionError("unknown token type: " + token.getType()); 1330 } 1331 } 1332 1333 if (token.controlCaps()) { 1334 caps = token.setCaps(); 1335 } 1336 } 1337 html.append("</p>\n"); 1338 } 1339 1340 return html.toString(); 1341 } 1342 1343 /** Returns the reverse of the given string. */ reverse(String str)1344 protected static String reverse(String str) { 1345 StringBuilder buf = new StringBuilder(); 1346 for (int i = str.length() - 1; i >= 0; --i) { 1347 buf.append(str.charAt(i)); 1348 } 1349 return buf.toString(); 1350 } 1351 1352 public static class TrieNode { 1353 private final HashMap<Character,TrieNode> children = 1354 new HashMap<Character,TrieNode>(); 1355 private String text; 1356 private String value; 1357 TrieNode()1358 public TrieNode() { this(""); } TrieNode(String text)1359 public TrieNode(String text) { 1360 this.text = text; 1361 } 1362 exists()1363 public final boolean exists() { return value != null; } getText()1364 public final String getText() { return text; } getValue()1365 public final String getValue() { return value; } setValue(String value)1366 public void setValue(String value) { this.value = value; } 1367 getChild(char ch)1368 public TrieNode getChild(char ch) { 1369 return children.get(Character.valueOf(ch)); 1370 } 1371 getOrCreateChild(char ch)1372 public TrieNode getOrCreateChild(char ch) { 1373 Character key = Character.valueOf(ch); 1374 TrieNode node = children.get(key); 1375 if (node == null) { 1376 node = new TrieNode(text + String.valueOf(ch)); 1377 children.put(key, node); 1378 } 1379 return node; 1380 } 1381 1382 /** Adds the given string into the trie. 
*/ addToTrie(TrieNode root, String str, String value)1383 public static void addToTrie(TrieNode root, String str, String value) { 1384 int index = 0; 1385 while (index < str.length()) { 1386 root = root.getOrCreateChild(str.charAt(index++)); 1387 } 1388 root.setValue(value); 1389 } 1390 } 1391 1392 1393 1394 /** Determines whether the given string is in the given trie. */ matches(TrieNode root, String str)1395 private static boolean matches(TrieNode root, String str) { 1396 int index = 0; 1397 while (index < str.length()) { 1398 root = root.getChild(str.charAt(index++)); 1399 if (root == null) { 1400 break; 1401 } else if (root.exists()) { 1402 return true; 1403 } 1404 } 1405 return false; 1406 } 1407 1408 /** 1409 * Returns the longest substring of the given string, starting at the given 1410 * index, that exists in the trie. 1411 */ longestMatch( TrieNode root, AbstractMessageParser p, int start)1412 private static TrieNode longestMatch( 1413 TrieNode root, AbstractMessageParser p, int start) { 1414 return longestMatch(root, p, start, false); 1415 } 1416 1417 /** 1418 * Returns the longest substring of the given string, starting at the given 1419 * index, that exists in the trie, with a special tokenizing case for 1420 * smileys if specified. 1421 */ longestMatch( TrieNode root, AbstractMessageParser p, int start, boolean smiley)1422 private static TrieNode longestMatch( 1423 TrieNode root, AbstractMessageParser p, int start, boolean smiley) { 1424 int index = start; 1425 TrieNode bestMatch = null; 1426 while (index < p.getRawText().length()) { 1427 root = root.getChild(p.getRawText().charAt(index++)); 1428 if (root == null) { 1429 break; 1430 } else if (root.exists()) { 1431 if (p.isWordBreak(index)) { 1432 bestMatch = root; 1433 } else if (smiley && p.isSmileyBreak(index)) { 1434 bestMatch = root; 1435 } 1436 } 1437 } 1438 return bestMatch; 1439 } 1440 1441 1442 /** Represents set of tokens that are delivered as a single message. 
*/
  public static class Part {
    private String meText;
    private final ArrayList<Token> tokens;

    /** Creates a part with no tokens and no me-text. */
    public Part() {
      this.tokens = new ArrayList<Token>();
    }

    /**
     * Returns the type code of this part: "s" when sending or "r" otherwise,
     * followed by "d" for a media part, "m" for a part with me-text, or
     * nothing.
     */
    public String getType(boolean isSend) {
      String direction = isSend ? "s" : "r";
      return direction + getPartType();
    }

    /** Returns "d" for media, "m" when me-text is set, otherwise "". */
    private String getPartType() {
      if (isMedia()) {
        return "d";
      }
      if (meText != null) {
        return "m";
      }
      return "";
    }

    /** Returns true if this part holds exactly one token and it is media. */
    public boolean isMedia() {
      if (tokens.size() != 1) {
        return false;
      }
      return tokens.get(0).isMedia();
    }

    /**
     * Convenience method for getting the Token of a Part that represents
     * a media Token. Parts of this kind will always only have a single Token
     *
     * @return if this.isMedia(),
     *         returns the Token representing the media contained in this Part,
     *         otherwise returns null;
     */
    public Token getMediaToken() {
      return isMedia() ? tokens.get(0) : null;
    }

    /**
     * Adds the given token to this part.
     *
     * @throws AssertionError if this part is already a media part
     */
    public void add(Token token) {
      if (isMedia()) {
        throw new AssertionError("media ");
      }
      tokens.add(token);
    }

    public void setMeText(String meText) {
      this.meText = meText;
    }

    /**
     * Returns the original text of this part: the me-text, if any, followed by
     * the raw text of every token in order.
     */
    public String getRawText() {
      StringBuilder raw = new StringBuilder();
      if (meText != null) {
        raw.append(meText);
      }
      for (Token token : tokens) {
        raw.append(token.getRawText());
      }
      return raw.toString();
    }

    /** Returns the tokens in this part. */
    public ArrayList<Token> getTokens() { return tokens; }

    /** Adds the tokens into the given builder as an array. */
//  public void toArray(JSArrayBuilder array) {
//    if (isMedia()) {
//      // For media, we send its array (i.e., we don't wrap this in another
//      // array as we do for non-media parts).
//      tokens.get(0).toArray(array);
//    } else {
//      array.beginArray();
//      addToArray(array);
//      array.endArray();
//    }
//  }
  }
}