1 /* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.android.util; 18 19 import java.util.ArrayList; 20 import java.util.HashMap; 21 import java.util.LinkedHashMap; 22 import java.util.regex.Matcher; 23 import java.util.regex.Pattern; 24 import java.util.Set; 25 import java.util.List; 26 27 /** 28 * 29 * Logic for parsing a text message typed by the user looking for smileys, 30 * urls, acronyms,formatting (e.g., '*'s for bold), me commands 31 * (e.g., "/me is asleep"), and punctuation. 32 * 33 * It constructs an array, which breaks the text up into its 34 * constituent pieces, which we return to the client. 35 * 36 */ 37 public abstract class AbstractMessageParser { 38 /** 39 * Interface representing the set of resources needed by a message parser 40 * 41 * @author jessan (Jessan Hutchison-Quillian) 42 */ 43 public static interface Resources { 44 45 /** Get the known set of URL schemes. */ getSchemes()46 public Set<String> getSchemes(); 47 48 /** Get the possible values for the last part of a domain name. 49 * Values are expected to be reversed in the Trie. 50 */ getDomainSuffixes()51 public TrieNode getDomainSuffixes(); 52 53 /** Get the smileys accepted by the parser. */ getSmileys()54 public TrieNode getSmileys(); 55 56 /** Get the acronyms accepted by the parser. 
*/ getAcronyms()57 public TrieNode getAcronyms(); 58 } 59 60 /** 61 * Subclasses must define the schemes, domains, smileys and acronyms 62 * that are necessary for parsing 63 */ getResources()64 protected abstract Resources getResources(); 65 66 /** Music note that indicates user is listening to a music track. */ 67 public static final String musicNote = "\u266B "; 68 69 private String text; 70 private int nextChar; 71 private int nextClass; 72 private ArrayList<Part> parts; 73 private ArrayList<Token> tokens; 74 private HashMap<Character,Format> formatStart; 75 private boolean parseSmilies; 76 private boolean parseAcronyms; 77 private boolean parseFormatting; 78 private boolean parseUrls; 79 private boolean parseMeText; 80 private boolean parseMusic; 81 82 /** 83 * Create a message parser to parse urls, formatting, acronyms, smileys, 84 * /me text and music 85 * 86 * @param text the text to parse 87 */ AbstractMessageParser(String text)88 public AbstractMessageParser(String text) { 89 this(text, true, true, true, true, true, true); 90 } 91 92 /** 93 * Create a message parser, specifying the kinds of text to parse 94 * 95 * @param text the text to parse 96 * 97 */ AbstractMessageParser(String text, boolean parseSmilies, boolean parseAcronyms, boolean parseFormatting, boolean parseUrls, boolean parseMusic, boolean parseMeText)98 public AbstractMessageParser(String text, boolean parseSmilies, 99 boolean parseAcronyms, boolean parseFormatting, boolean parseUrls, 100 boolean parseMusic, boolean parseMeText) { 101 this.text = text; 102 this.nextChar = 0; 103 this.nextClass = 10; 104 this.parts = new ArrayList<Part>(); 105 this.tokens = new ArrayList<Token>(); 106 this.formatStart = new HashMap<Character,Format>(); 107 this.parseSmilies = parseSmilies; 108 this.parseAcronyms = parseAcronyms; 109 this.parseFormatting = parseFormatting; 110 this.parseUrls = parseUrls; 111 this.parseMusic = parseMusic; 112 this.parseMeText = parseMeText; 113 } 114 115 /** Returns the raw 
text being parsed. */ getRawText()116 public final String getRawText() { return text; } 117 118 /** Return the number of parts. */ getPartCount()119 public final int getPartCount() { return parts.size(); } 120 121 /** Return the part at the given index. */ getPart(int index)122 public final Part getPart(int index) { return parts.get(index); } 123 124 /** Return the list of parts from the parsed text */ getParts()125 public final List<Part> getParts() { return parts; } 126 127 /** Parses the text string into an internal representation. */ parse()128 public void parse() { 129 // Look for music track (of which there would be only one and it'll be the 130 // first token) 131 if (parseMusicTrack()) { 132 buildParts(null); 133 return; 134 } 135 136 // Look for me commands. 137 String meText = null; 138 if (parseMeText && text.startsWith("/me") && (text.length() > 3) && 139 Character.isWhitespace(text.charAt(3))) { 140 meText = text.substring(0, 4); 141 text = text.substring(4); 142 } 143 144 // Break the text into tokens. 145 boolean wasSmiley = false; 146 while (nextChar < text.length()) { 147 if (!isWordBreak(nextChar)) { 148 if (!wasSmiley || !isSmileyBreak(nextChar)) { 149 throw new AssertionError("last chunk did not end at word break"); 150 } 151 } 152 153 if (parseSmiley()) { 154 wasSmiley = true; 155 } else { 156 wasSmiley = false; 157 158 if (!parseAcronym() && !parseURL() && !parseFormatting()) { 159 parseText(); 160 } 161 } 162 } 163 164 // Trim the whitespace before and after media components. 165 for (int i = 0; i < tokens.size(); ++i) { 166 if (tokens.get(i).isMedia()) { 167 if ((i > 0) && (tokens.get(i - 1) instanceof Html)) { 168 ((Html)tokens.get(i - 1)).trimLeadingWhitespace(); 169 } 170 if ((i + 1 < tokens.size()) && (tokens.get(i + 1) instanceof Html)) { 171 ((Html)tokens.get(i + 1)).trimTrailingWhitespace(); 172 } 173 } 174 } 175 176 // Remove any empty html tokens. 
177 for (int i = 0; i < tokens.size(); ++i) { 178 if (tokens.get(i).isHtml() && 179 (tokens.get(i).toHtml(true).length() == 0)) { 180 tokens.remove(i); 181 --i; // visit this index again 182 } 183 } 184 185 buildParts(meText); 186 } 187 188 /** 189 * Get a the appropriate Token for a given URL 190 * 191 * @param text the anchor text 192 * @param url the url 193 * 194 */ tokenForUrl(String url, String text)195 public static Token tokenForUrl(String url, String text) { 196 if(url == null) { 197 return null; 198 } 199 200 //Look for video links 201 Video video = Video.matchURL(url, text); 202 if (video != null) { 203 return video; 204 } 205 206 // Look for video links. 207 YouTubeVideo ytVideo = YouTubeVideo.matchURL(url, text); 208 if (ytVideo != null) { 209 return ytVideo; 210 } 211 212 // Look for photo links. 213 Photo photo = Photo.matchURL(url, text); 214 if (photo != null) { 215 return photo; 216 } 217 218 // Look for photo links. 219 FlickrPhoto flickrPhoto = FlickrPhoto.matchURL(url, text); 220 if (flickrPhoto != null) { 221 return flickrPhoto; 222 } 223 224 //Not media, so must be a regular URL 225 return new Link(url, text); 226 } 227 228 /** 229 * Builds the parts list. 230 * 231 * @param meText any meText parsed from the message 232 */ buildParts(String meText)233 private void buildParts(String meText) { 234 for (int i = 0; i < tokens.size(); ++i) { 235 Token token = tokens.get(i); 236 if (token.isMedia() || (parts.size() == 0) || lastPart().isMedia()) { 237 parts.add(new Part()); 238 } 239 lastPart().add(token); 240 } 241 242 // The first part inherits the meText of the line. 243 if (parts.size() > 0) { 244 parts.get(0).setMeText(meText); 245 } 246 } 247 248 /** Returns the last part in the list. */ lastPart()249 private Part lastPart() { return parts.get(parts.size() - 1); } 250 251 /** 252 * Looks for a music track (\u266B is first character, everything else is 253 * track info). 
254 */ parseMusicTrack()255 private boolean parseMusicTrack() { 256 257 if (parseMusic && text.startsWith(musicNote)) { 258 addToken(new MusicTrack(text.substring(musicNote.length()))); 259 nextChar = text.length(); 260 return true; 261 } 262 return false; 263 } 264 265 /** Consumes all of the text in the next word . */ parseText()266 private void parseText() { 267 StringBuilder buf = new StringBuilder(); 268 int start = nextChar; 269 do { 270 char ch = text.charAt(nextChar++); 271 switch (ch) { 272 case '<': buf.append("<"); break; 273 case '>': buf.append(">"); break; 274 case '&': buf.append("&"); break; 275 case '"': buf.append("""); break; 276 case '\'': buf.append("'"); break; 277 case '\n': buf.append("<br>"); break; 278 default: buf.append(ch); break; 279 } 280 } while (!isWordBreak(nextChar)); 281 282 addToken(new Html(text.substring(start, nextChar), buf.toString())); 283 } 284 285 /** 286 * Looks for smileys (e.g., ":)") in the text. The set of known smileys is 287 * loaded from a file into a trie at server start. 288 */ parseSmiley()289 private boolean parseSmiley() { 290 if(!parseSmilies) { 291 return false; 292 } 293 TrieNode match = longestMatch(getResources().getSmileys(), this, nextChar, 294 true); 295 if (match == null) { 296 return false; 297 } else { 298 int previousCharClass = getCharClass(nextChar - 1); 299 int nextCharClass = getCharClass(nextChar + match.getText().length()); 300 if ((previousCharClass == 2 || previousCharClass == 3) 301 && (nextCharClass == 2 || nextCharClass == 3)) { 302 return false; 303 } 304 addToken(new Smiley(match.getText())); 305 nextChar += match.getText().length(); 306 return true; 307 } 308 } 309 310 /** Looks for acronyms (e.g., "lol") in the text. 
311 */ parseAcronym()312 private boolean parseAcronym() { 313 if(!parseAcronyms) { 314 return false; 315 } 316 TrieNode match = longestMatch(getResources().getAcronyms(), this, nextChar); 317 if (match == null) { 318 return false; 319 } else { 320 addToken(new Acronym(match.getText(), match.getValue())); 321 nextChar += match.getText().length(); 322 return true; 323 } 324 } 325 326 /** Determines if this is an allowable domain character. */ isDomainChar(char c)327 private boolean isDomainChar(char c) { 328 return c == '-' || Character.isLetter(c) || Character.isDigit(c); 329 } 330 331 /** Determines if the given string is a valid domain. */ isValidDomain(String domain)332 private boolean isValidDomain(String domain) { 333 // For hostnames, check that it ends with a known domain suffix 334 if (matches(getResources().getDomainSuffixes(), reverse(domain))) { 335 return true; 336 } 337 return false; 338 } 339 340 /** 341 * Looks for a URL in two possible forms: either a proper URL with a known 342 * scheme or a domain name optionally followed by a path, query, or query. 343 */ parseURL()344 private boolean parseURL() { 345 // Make sure this is a valid place to start a URL. 346 if (!parseUrls || !isURLBreak(nextChar)) { 347 return false; 348 } 349 350 int start = nextChar; 351 352 // Search for the first block of letters. 353 int index = start; 354 while ((index < text.length()) && isDomainChar(text.charAt(index))) { 355 index += 1; 356 } 357 358 String url = ""; 359 boolean done = false; 360 361 if (index == text.length()) { 362 return false; 363 } else if (text.charAt(index) == ':') { 364 // Make sure this is a known scheme. 365 String scheme = text.substring(nextChar, index); 366 if (!getResources().getSchemes().contains(scheme)) { 367 return false; 368 } 369 } else if (text.charAt(index) == '.') { 370 // Search for the end of the domain name. 
371 while (index < text.length()) { 372 char ch = text.charAt(index); 373 if ((ch != '.') && !isDomainChar(ch)) { 374 break; 375 } else { 376 index += 1; 377 } 378 } 379 380 // Make sure the domain name has a valid suffix. Since tries look for 381 // prefix matches, we reverse all the strings to get suffix comparisons. 382 String domain = text.substring(nextChar, index); 383 if (!isValidDomain(domain)) { 384 return false; 385 } 386 387 // Search for a port. We deal with this specially because a colon can 388 // also be a punctuation character. 389 if ((index + 1 < text.length()) && (text.charAt(index) == ':')) { 390 char ch = text.charAt(index + 1); 391 if (Character.isDigit(ch)) { 392 index += 1; 393 while ((index < text.length()) && 394 Character.isDigit(text.charAt(index))) { 395 index += 1; 396 } 397 } 398 } 399 400 // The domain name should be followed by end of line, whitespace, 401 // punctuation, or a colon, slash, question, or hash character. The 402 // tricky part here is that some URL characters are also punctuation, so 403 // we need to distinguish them. Since we looked for ports above, a colon 404 // is always punctuation here. To distinguish '?' cases, we look at the 405 // character that follows it. 406 if (index == text.length()) { 407 done = true; 408 } else { 409 char ch = text.charAt(index); 410 if (ch == '?') { 411 // If the next character is whitespace or punctuation (or missing), 412 // then this question mark looks like punctuation. 413 if (index + 1 == text.length()) { 414 done = true; 415 } else { 416 char ch2 = text.charAt(index + 1); 417 if (Character.isWhitespace(ch2) || isPunctuation(ch2)) { 418 done = true; 419 } 420 } 421 } else if (isPunctuation(ch)) { 422 done = true; 423 } else if (Character.isWhitespace(ch)) { 424 done = true; 425 } else if ((ch == '/') || (ch == '#')) { 426 // In this case, the URL is not done. We will search for the end of 427 // it below. 
428 } else { 429 return false; 430 } 431 } 432 433 // We will assume the user meant HTTP. (One weird case is where they 434 // type a port of 443. That could mean HTTPS, but they might also want 435 // HTTP. We'll let them specify if they don't want HTTP.) 436 url = "http://"; 437 } else { 438 return false; 439 } 440 441 // If the URL is not done, search for the end, which is just before the 442 // next whitespace character. 443 if (!done) { 444 while ((index < text.length()) && 445 !Character.isWhitespace(text.charAt(index))) { 446 index += 1; 447 } 448 } 449 450 String urlText = text.substring(start, index); 451 url += urlText; 452 453 // Figure out the appropriate token type. 454 addURLToken(url, urlText); 455 456 nextChar = index; 457 return true; 458 } 459 460 /** 461 * Adds the appropriate token for the given URL. This might be a simple 462 * link or it might be a recognized media type. 463 */ addURLToken(String url, String text)464 private void addURLToken(String url, String text) { 465 addToken(tokenForUrl(url, text)); 466 } 467 468 /** 469 * Deal with formatting characters. 470 * 471 * Parsing is as follows: 472 * - Treat all contiguous strings of formatting characters as one block. 473 * (This method processes one block.) 474 * - Only a single instance of a particular format character within a block 475 * is used to determine whether to turn on/off that type of formatting; 476 * other instances simply print the character itself. 477 * - If the format is to be turned on, we use the _first_ instance; if it 478 * is to be turned off, we use the _last_ instance (by appending the 479 * format.) 
480 * 481 * Example: 482 * **string** turns into <b>*string*</b> 483 */ parseFormatting()484 private boolean parseFormatting() { 485 if(!parseFormatting) { 486 return false; 487 } 488 int endChar = nextChar; 489 while ((endChar < text.length()) && isFormatChar(text.charAt(endChar))) { 490 endChar += 1; 491 } 492 493 if ((endChar == nextChar) || !isWordBreak(endChar)) { 494 return false; 495 } 496 497 // Keeps track of whether we've seen a character (in map if we've seen it) 498 // and whether we should append a closing format token (if value in 499 // map is TRUE). Linked hashmap for consistent ordering. 500 LinkedHashMap<Character, Boolean> seenCharacters = 501 new LinkedHashMap<Character, Boolean>(); 502 503 for (int index = nextChar; index < endChar; ++index) { 504 char ch = text.charAt(index); 505 Character key = Character.valueOf(ch); 506 if (seenCharacters.containsKey(key)) { 507 // Already seen this character, just append an unmatched token, which 508 // will print plaintext character 509 addToken(new Format(ch, false)); 510 } else { 511 Format start = formatStart.get(key); 512 if (start != null) { 513 // Match the start token, and ask an end token to be appended 514 start.setMatched(true); 515 formatStart.remove(key); 516 seenCharacters.put(key, Boolean.TRUE); 517 } else { 518 // Append start token 519 start = new Format(ch, true); 520 formatStart.put(key, start); 521 addToken(start); 522 seenCharacters.put(key, Boolean.FALSE); 523 } 524 } 525 } 526 527 // Append any necessary end tokens 528 for (Character key : seenCharacters.keySet()) { 529 if (seenCharacters.get(key) == Boolean.TRUE) { 530 Format end = new Format(key.charValue(), false); 531 end.setMatched(true); 532 addToken(end); 533 } 534 } 535 536 nextChar = endChar; 537 return true; 538 } 539 540 /** Determines whether the given index could be a possible word break. 
*/ isWordBreak(int index)541 private boolean isWordBreak(int index) { 542 return getCharClass(index - 1) != getCharClass(index); 543 } 544 545 /** Determines whether the given index could be a possible smiley break. */ isSmileyBreak(int index)546 private boolean isSmileyBreak(int index) { 547 if (index > 0 && index < text.length()) { 548 if (isSmileyBreak(text.charAt(index - 1), text.charAt(index))) { 549 return true; 550 } 551 } 552 553 return false; 554 } 555 556 /** 557 * Verifies that the character before the given index is end of line, 558 * whitespace, or punctuation. 559 */ isURLBreak(int index)560 private boolean isURLBreak(int index) { 561 switch (getCharClass(index - 1)) { 562 case 2: 563 case 3: 564 case 4: 565 return false; 566 567 case 0: 568 case 1: 569 default: 570 return true; 571 } 572 } 573 574 /** Returns the class for the character at the given index. */ getCharClass(int index)575 private int getCharClass(int index) { 576 if ((index < 0) || (text.length() <= index)) { 577 return 0; 578 } 579 580 char ch = text.charAt(index); 581 if (Character.isWhitespace(ch)) { 582 return 1; 583 } else if (Character.isLetter(ch)) { 584 return 2; 585 } else if (Character.isDigit(ch)) { 586 return 3; 587 } else if (isPunctuation(ch)) { 588 // For punctuation, we return a unique value every time so that they are 589 // always different from any other character. Punctuation should always 590 // be considered a possible word break. 591 return ++nextClass; 592 } else { 593 return 4; 594 } 595 } 596 597 /** 598 * Returns true if <code>c1</code> could be the last character of 599 * a smiley and <code>c2</code> could be the first character of 600 * a different smiley, if {@link #isWordBreak} would not already 601 * recognize that this is possible. 602 */ isSmileyBreak(char c1, char c2)603 private static boolean isSmileyBreak(char c1, char c2) { 604 switch (c1) { 605 /* 606 * These characters can end smileys, but don't normally end words. 
607 */ 608 case '$': case '&': case '*': case '+': case '-': 609 case '/': case '<': case '=': case '>': case '@': 610 case '[': case '\\': case ']': case '^': case '|': 611 case '}': case '~': 612 switch (c2) { 613 /* 614 * These characters can begin smileys, but don't normally 615 * begin words. 616 */ 617 case '#': case '$': case '%': case '*': case '/': 618 case '<': case '=': case '>': case '@': case '[': 619 case '\\': case '^': case '~': 620 return true; 621 } 622 } 623 624 return false; 625 } 626 627 /** Determines whether the given character is punctuation. */ isPunctuation(char ch)628 private static boolean isPunctuation(char ch) { 629 switch (ch) { 630 case '.': case ',': case '"': case ':': case ';': 631 case '?': case '!': case '(': case ')': 632 return true; 633 634 default: 635 return false; 636 } 637 } 638 639 /** 640 * Determines whether the given character is the beginning or end of a 641 * section with special formatting. 642 */ isFormatChar(char ch)643 private static boolean isFormatChar(char ch) { 644 switch (ch) { 645 case '*': case '_': case '^': 646 return true; 647 648 default: 649 return false; 650 } 651 } 652 653 /** Represents a unit of parsed output. 
*/ 654 public static abstract class Token { 655 public enum Type { 656 657 HTML ("html"), 658 FORMAT ("format"), // subtype of HTML 659 LINK ("l"), 660 SMILEY ("e"), 661 ACRONYM ("a"), 662 MUSIC ("m"), 663 GOOGLE_VIDEO ("v"), 664 YOUTUBE_VIDEO ("yt"), 665 PHOTO ("p"), 666 FLICKR ("f"); 667 668 //stringreps for HTML and FORMAT don't really matter 669 //because they don't define getInfo(), which is where it is used 670 //For the other types, code depends on their stringreps 671 private String stringRep; 672 Type(String stringRep)673 Type(String stringRep) { 674 this.stringRep = stringRep; 675 } 676 677 /** {@inheritDoc} */ toString()678 public String toString() { 679 return this.stringRep; 680 } 681 } 682 683 protected Type type; 684 protected String text; 685 Token(Type type, String text)686 protected Token(Type type, String text) { 687 this.type = type; 688 this.text = text; 689 } 690 691 /** Returns the type of the token. */ getType()692 public Type getType() { return type; } 693 694 /** 695 * Get the relevant information about a token 696 * 697 * @return a list of strings representing the token, not null 698 * The first item is always a string representation of the type 699 */ getInfo()700 public List<String> getInfo() { 701 List<String> info = new ArrayList<String>(); 702 info.add(getType().toString()); 703 return info; 704 } 705 706 /** Returns the raw text of the token. */ getRawText()707 public String getRawText() { return text; } 708 isMedia()709 public boolean isMedia() { return false; } isHtml()710 public abstract boolean isHtml(); isArray()711 public boolean isArray() { return !isHtml(); } 712 toHtml(boolean caps)713 public String toHtml(boolean caps) { throw new AssertionError("not html"); } 714 715 // The token can change the caps of the text after that point. controlCaps()716 public boolean controlCaps() { return false; } setCaps()717 public boolean setCaps() { return false; } 718 } 719 720 /** Represents a simple string of html text. 
*/ 721 public static class Html extends Token { 722 private String html; 723 Html(String text, String html)724 public Html(String text, String html) { 725 super(Type.HTML, text); 726 this.html = html; 727 } 728 isHtml()729 public boolean isHtml() { return true; } toHtml(boolean caps)730 public String toHtml(boolean caps) { 731 return caps ? html.toUpperCase() : html; 732 } 733 /** 734 * Not supported. Info should not be needed for this type 735 */ getInfo()736 public List<String> getInfo() { 737 throw new UnsupportedOperationException(); 738 } 739 trimLeadingWhitespace()740 public void trimLeadingWhitespace() { 741 text = trimLeadingWhitespace(text); 742 html = trimLeadingWhitespace(html); 743 } 744 trimTrailingWhitespace()745 public void trimTrailingWhitespace() { 746 text = trimTrailingWhitespace(text); 747 html = trimTrailingWhitespace(html); 748 } 749 trimLeadingWhitespace(String text)750 private static String trimLeadingWhitespace(String text) { 751 int index = 0; 752 while ((index < text.length()) && 753 Character.isWhitespace(text.charAt(index))) { 754 ++index; 755 } 756 return text.substring(index); 757 } 758 trimTrailingWhitespace(String text)759 public static String trimTrailingWhitespace(String text) { 760 int index = text.length(); 761 while ((index > 0) && Character.isWhitespace(text.charAt(index - 1))) { 762 --index; 763 } 764 return text.substring(0, index); 765 } 766 } 767 768 /** Represents a music track token at the beginning. */ 769 public static class MusicTrack extends Token { 770 private String track; 771 MusicTrack(String track)772 public MusicTrack(String track) { 773 super(Type.MUSIC, track); 774 this.track = track; 775 } 776 getTrack()777 public String getTrack() { return track; } 778 isHtml()779 public boolean isHtml() { return false; } 780 getInfo()781 public List<String> getInfo() { 782 List<String> info = super.getInfo(); 783 info.add(getTrack()); 784 return info; 785 } 786 } 787 788 /** Represents a link that was found in the input. 
*/ 789 public static class Link extends Token { 790 private String url; 791 Link(String url, String text)792 public Link(String url, String text) { 793 super(Type.LINK, text); 794 this.url = url; 795 } 796 getURL()797 public String getURL() { return url; } 798 isHtml()799 public boolean isHtml() { return false; } 800 getInfo()801 public List<String> getInfo() { 802 List<String> info = super.getInfo(); 803 info.add(getURL()); 804 info.add(getRawText()); 805 return info; 806 } 807 } 808 809 /** Represents a link to a Google Video. */ 810 public static class Video extends Token { 811 /** Pattern for a video URL. */ 812 private static final Pattern URL_PATTERN = Pattern.compile( 813 "(?i)http://video\\.google\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/videoplay\\?" 814 + ".*?\\bdocid=(-?\\d+).*"); 815 816 private String docid; 817 Video(String docid, String text)818 public Video(String docid, String text) { 819 super(Type.GOOGLE_VIDEO, text); 820 this.docid = docid; 821 } 822 getDocID()823 public String getDocID() { return docid; } 824 isHtml()825 public boolean isHtml() { return false; } isMedia()826 public boolean isMedia() { return true; } 827 828 /** Returns a Video object if the given url is to a video. */ matchURL(String url, String text)829 public static Video matchURL(String url, String text) { 830 Matcher m = URL_PATTERN.matcher(url); 831 if (m.matches()) { 832 return new Video(m.group(1), text); 833 } else { 834 return null; 835 } 836 } 837 getInfo()838 public List<String> getInfo() { 839 List<String> info = super.getInfo(); 840 info.add(getRssUrl(docid)); 841 info.add(getURL(docid)); 842 return info; 843 } 844 845 /** Returns the URL for the RSS description of the given video. */ getRssUrl(String docid)846 public static String getRssUrl(String docid) { 847 return "http://video.google.com/videofeed" 848 + "?type=docid&output=rss&sourceid=gtalk&docid=" + docid; 849 } 850 851 /** (For testing purposes:) Returns a video URL with the given parts. 
*/ getURL(String docid)852 public static String getURL(String docid) { 853 return getURL(docid, null); 854 } 855 856 /** (For testing purposes:) Returns a video URL with the given parts. */ getURL(String docid, String extraParams)857 public static String getURL(String docid, String extraParams) { 858 if (extraParams == null) { 859 extraParams = ""; 860 } else if (extraParams.length() > 0) { 861 extraParams += "&"; 862 } 863 return "http://video.google.com/videoplay?" + extraParams 864 + "docid=" + docid; 865 } 866 } 867 868 /** Represents a link to a YouTube video. */ 869 public static class YouTubeVideo extends Token { 870 /** Pattern for a video URL. */ 871 private static final Pattern URL_PATTERN = Pattern.compile( 872 "(?i)http://(?:[a-z0-9]+\\.)?youtube\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/watch\\?" 873 + ".*\\bv=([-_a-zA-Z0-9=]+).*"); 874 875 private String docid; 876 YouTubeVideo(String docid, String text)877 public YouTubeVideo(String docid, String text) { 878 super(Type.YOUTUBE_VIDEO, text); 879 this.docid = docid; 880 } 881 getDocID()882 public String getDocID() { return docid; } 883 isHtml()884 public boolean isHtml() { return false; } isMedia()885 public boolean isMedia() { return true; } 886 887 /** Returns a Video object if the given url is to a video. */ matchURL(String url, String text)888 public static YouTubeVideo matchURL(String url, String text) { 889 Matcher m = URL_PATTERN.matcher(url); 890 if (m.matches()) { 891 return new YouTubeVideo(m.group(1), text); 892 } else { 893 return null; 894 } 895 } 896 getInfo()897 public List<String> getInfo() { 898 List<String> info = super.getInfo(); 899 info.add(getRssUrl(docid)); 900 info.add(getURL(docid)); 901 return info; 902 } 903 904 /** Returns the URL for the RSS description of the given video. */ getRssUrl(String docid)905 public static String getRssUrl(String docid) { 906 return "http://youtube.com/watch?v=" + docid; 907 } 908 909 /** (For testing purposes:) Returns a video URL with the given parts. 
*/ getURL(String docid)910 public static String getURL(String docid) { 911 return getURL(docid, null); 912 } 913 914 /** (For testing purposes:) Returns a video URL with the given parts. */ getURL(String docid, String extraParams)915 public static String getURL(String docid, String extraParams) { 916 if (extraParams == null) { 917 extraParams = ""; 918 } else if (extraParams.length() > 0) { 919 extraParams += "&"; 920 } 921 return "http://youtube.com/watch?" + extraParams + "v=" + docid; 922 } 923 924 /** (For testing purposes:) Returns a video URL with the given parts. 925 * @param http If true, includes http:// 926 * @param prefix If non-null/non-blank, adds to URL before youtube.com. 927 * (e.g., prefix="br." --> "br.youtube.com") 928 */ getPrefixedURL(boolean http, String prefix, String docid, String extraParams)929 public static String getPrefixedURL(boolean http, String prefix, 930 String docid, String extraParams) { 931 String protocol = ""; 932 933 if (http) { 934 protocol = "http://"; 935 } 936 937 if (prefix == null) { 938 prefix = ""; 939 } 940 941 if (extraParams == null) { 942 extraParams = ""; 943 } else if (extraParams.length() > 0) { 944 extraParams += "&"; 945 } 946 947 return protocol + prefix + "youtube.com/watch?" + extraParams + "v=" + 948 docid; 949 } 950 } 951 952 /** Represents a link to a Picasa photo or album. */ 953 public static class Photo extends Token { 954 /** Pattern for an album or photo URL. 
*/ 955 // TODO (katyarogers) searchbrowse includes search lists and tags, 956 // it follows a different pattern than albums - would be nice to add later 957 private static final Pattern URL_PATTERN = Pattern.compile( 958 "http://picasaweb.google.com/([^/?#&]+)/+((?!searchbrowse)[^/?#&]+)(?:/|/photo)?(?:\\?[^#]*)?(?:#(.*))?"); 959 960 private String user; 961 private String album; 962 private String photo; // null for albums 963 Photo(String user, String album, String photo, String text)964 public Photo(String user, String album, String photo, String text) { 965 super(Type.PHOTO, text); 966 this.user = user; 967 this.album = album; 968 this.photo = photo; 969 } 970 getUser()971 public String getUser() { return user; } getAlbum()972 public String getAlbum() { return album; } getPhoto()973 public String getPhoto() { return photo; } 974 isHtml()975 public boolean isHtml() { return false; } isMedia()976 public boolean isMedia() { return true; } 977 978 /** Returns a Photo object if the given url is to a photo or album. */ matchURL(String url, String text)979 public static Photo matchURL(String url, String text) { 980 Matcher m = URL_PATTERN.matcher(url); 981 if (m.matches()) { 982 return new Photo(m.group(1), m.group(2), m.group(3), text); 983 } else { 984 return null; 985 } 986 } 987 getInfo()988 public List<String> getInfo() { 989 List<String> info = super.getInfo(); 990 info.add(getRssUrl(getUser())); 991 info.add(getAlbumURL(getUser(), getAlbum())); 992 if (getPhoto() != null) { 993 info.add(getPhotoURL(getUser(), getAlbum(), getPhoto())); 994 } else { 995 info.add((String)null); 996 } 997 return info; 998 } 999 1000 /** Returns the URL for the RSS description of the user's albums. */ getRssUrl(String user)1001 public static String getRssUrl(String user) { 1002 return "http://picasaweb.google.com/data/feed/api/user/" + user + 1003 "?category=album&alt=rss"; 1004 } 1005 1006 /** Returns the URL for an album. 
// Remaining URL helpers of the Picasa Web Photo token (its class header is above this chunk).

  /** Returns the URL for an album. */
  public static String getAlbumURL(String user, String album) {
    return "http://picasaweb.google.com/" + user + "/" + album;
  }

  /** Returns the URL for a particular photo. */
  public static String getPhotoURL(String user, String album, String photo) {
    return "http://picasaweb.google.com/" + user + "/" + album + "/photo#"
           + photo;
  }
}  // end of Photo

/** Represents a link to a Flickr photo or album. */
public static class FlickrPhoto extends Token {
  /**
   * Pattern for a user album or photo URL. Group 1 is the first path
   * segment after "/photos/" and group 2 the optional second segment.
   * Host dots are escaped so that only "flickr.com" (optionally prefixed
   * with "www.") matches; unescaped, each dot would accept any character.
   */
  private static final Pattern URL_PATTERN = Pattern.compile(
      "http://(?:www\\.)?flickr\\.com/photos/([^/?#&]+)/?([^/?#&]+)?/?.*");

  /**
   * Pattern for a user's "tags" or "sets" URL: group 1 is the user,
   * group 2 the grouping keyword and group 3 the grouping identifier.
   */
  private static final Pattern GROUPING_PATTERN = Pattern.compile(
      "http://(?:www\\.)?flickr\\.com/photos/([^/?#&]+)/(tags|sets)/" +
      "([^/?#&]+)/?");

  private static final String SETS = "sets";
  private static final String TAGS = "tags";

  private String user;
  private String photo;       // null for user album
  private String grouping;    // either "tags" or "sets"
  private String groupingId;  // sets or tags identifier

  /**
   * Creates a Flickr token.
   *
   * @param user the first path segment after "/photos/"
   * @param photo the second path segment, if any
   * @param grouping "tags", "sets", or null
   * @param groupingId the identifier of the tag or set, or null
   * @param text the raw text this token was parsed from
   */
  public FlickrPhoto(String user, String photo, String grouping,
      String groupingId, String text) {
    super(Type.FLICKR, text);

    /* System wide tags look like the URL to a Flickr user. */
    if (!TAGS.equals(user)) {
      this.user = user;
      // Don't consider slide show URL a photo
      this.photo = (!"show".equals(photo) ? photo : null);
      this.grouping = grouping;
      this.groupingId = groupingId;
    } else {
      // ".../photos/tags/<tag>": the "user" segment is really the keyword
      // and the "photo" segment is the tag itself.
      this.user = null;
      this.photo = null;
      this.grouping = TAGS;
      this.groupingId = photo;
    }
  }

  /** Returns the user, or null for a system-wide tag link. */
  public String getUser() { return user; }

  /** Returns the photo, or null when the link is not to a single photo. */
  public String getPhoto() { return photo; }

  /** Returns "tags", "sets", or null when the link has no grouping. */
  public String getGrouping() { return grouping; }

  /** Returns the tag or set identifier, or null when there is no grouping. */
  public String getGroupingId() { return groupingId; }

  /** Always false: this token does not render itself as HTML. */
  public boolean isHtml() { return false; }

  /** Always true: a Flickr link is a media token. */
  public boolean isMedia() { return true; }

  /**
   * Returns a FlickrPhoto object if the given url is to a photo or Flickr
   * user.
   *
   * @param url the URL to test; it must match one of the patterns in full
   * @param text the raw text to store on the resulting token
   * @return the new token, or null if neither pattern matches
   */
  public static FlickrPhoto matchURL(String url, String text) {
    // The grouping pattern is tried first; the general pattern would also
    // accept such a URL but would read "tags"/"sets" as a photo.
    Matcher m = GROUPING_PATTERN.matcher(url);
    if (m.matches()) {
      return new FlickrPhoto(m.group(1), null, m.group(2), m.group(3), text);
    }

    m = URL_PATTERN.matcher(url);
    if (m.matches()) {
      return new FlickrPhoto(m.group(1), m.group(2), null, null, text);
    } else {
      return null;
    }
  }

  /**
   * Returns the superclass info list extended with, in order, the URL, the
   * user, the photo, the grouping and the grouping id. Missing values are
   * reported as empty strings.
   */
  public List<String> getInfo() {
    List<String> info = super.getInfo();
    info.add(getUrl());
    info.add(getUser() != null ? getUser() : "");
    info.add(getPhoto() != null ? getPhoto() : "");
    info.add(getGrouping() != null ? getGrouping() : "");
    info.add(getGroupingId() != null ? getGroupingId() : "");
    return info;
  }

  /**
   * Builds the flickr.com URL for this token: a user set, a user or
   * system-wide tag, a single photo, or otherwise the user's page.
   */
  public String getUrl() {
    if (SETS.equals(grouping)) {
      return getUserSetsURL(user, groupingId);
    } else if (TAGS.equals(grouping)) {
      if (user != null) {
        return getUserTagsURL(user, groupingId);
      } else {
        return getTagsURL(groupingId);
      }
    } else if (photo != null) {
      return getPhotoURL(user, photo);
    } else {
      return getUserURL(user);
    }
  }

  /** Returns the URL for the RSS description; this is always null here. */
  public static String getRssUrl(String user) {
    return null;
  }

  /** Returns the URL for a particular tag. */
  public static String getTagsURL(String tag) {
    return "http://flickr.com/photos/tags/" + tag;
  }

  /** Returns the URL to the user's Flickr homepage. */
  public static String getUserURL(String user) {
    return "http://flickr.com/photos/" + user;
  }

  /** Returns the URL for a particular photo. */
  public static String getPhotoURL(String user, String photo) {
    return "http://flickr.com/photos/" + user + "/" + photo;
  }

  /** Returns the URL for a user tag photo set. */
  public static String getUserTagsURL(String user, String tagId) {
    return "http://flickr.com/photos/" + user + "/tags/" + tagId;
  }

  /** Returns the URL for user set. */
  public static String getUserSetsURL(String user, String setId) {
    return "http://flickr.com/photos/" + user + "/sets/" + setId;
  }
}

/** Represents a smiley that was found in the input. */
public static class Smiley extends Token {
  // TODO: Pass the SWF URL down to the client.

  /**
   * Creates a smiley token.
   *
   * @param text the raw smiley text as found in the input
   */
  public Smiley(String text) {
    super(Type.SMILEY, text);
  }

  /** Always false: this token does not render itself as HTML. */
  public boolean isHtml() { return false; }

  /** Returns the superclass info list extended with the raw smiley text. */
  public List<String> getInfo() {
    List<String> info = super.getInfo();
    info.add(getRawText());
    return info;
  }
}
*/ 1155 public static class Acronym extends Token { 1156 private String value; 1157 // TODO: SWF 1158 Acronym(String text, String value)1159 public Acronym(String text, String value) { 1160 super(Type.ACRONYM, text); 1161 this.value = value; 1162 } 1163 getValue()1164 public String getValue() { return value; } 1165 isHtml()1166 public boolean isHtml() { return false; } 1167 getInfo()1168 public List<String> getInfo() { 1169 List<String> info = super.getInfo(); 1170 info.add(getRawText()); 1171 info.add(getValue()); 1172 return info; 1173 } 1174 } 1175 1176 /** Represents a character that changes formatting. */ 1177 public static class Format extends Token { 1178 private char ch; 1179 private boolean start; 1180 private boolean matched; 1181 Format(char ch, boolean start)1182 public Format(char ch, boolean start) { 1183 super(Type.FORMAT, String.valueOf(ch)); 1184 this.ch = ch; 1185 this.start = start; 1186 } 1187 setMatched(boolean matched)1188 public void setMatched(boolean matched) { this.matched = matched; } 1189 isHtml()1190 public boolean isHtml() { return true; } 1191 toHtml(boolean caps)1192 public String toHtml(boolean caps) { 1193 // This character only implies special formatting if it was matched. 1194 // Otherwise, it was just a plain old character. 1195 if (matched) { 1196 return start ? getFormatStart(ch) : getFormatEnd(ch); 1197 } else { 1198 // We have to make sure we escape HTML characters as usual. 1199 return (ch == '"') ? """ : String.valueOf(ch); 1200 } 1201 } 1202 1203 /** 1204 * Not supported. 
Info should not be needed for this type 1205 */ getInfo()1206 public List<String> getInfo() { 1207 throw new UnsupportedOperationException(); 1208 } 1209 controlCaps()1210 public boolean controlCaps() { return (ch == '^'); } setCaps()1211 public boolean setCaps() { return start; } 1212 getFormatStart(char ch)1213 private String getFormatStart(char ch) { 1214 switch (ch) { 1215 case '*': return "<b>"; 1216 case '_': return "<i>"; 1217 case '^': return "<b><font color=\"#005FFF\">"; // TODO: all caps 1218 case '"': return "<font color=\"#999999\">\u201c"; 1219 default: throw new AssertionError("unknown format '" + ch + "'"); 1220 } 1221 } 1222 getFormatEnd(char ch)1223 private String getFormatEnd(char ch) { 1224 switch (ch) { 1225 case '*': return "</b>"; 1226 case '_': return "</i>"; 1227 case '^': return "</font></b>"; // TODO: all caps 1228 case '"': return "\u201d</font>"; 1229 default: throw new AssertionError("unknown format '" + ch + "'"); 1230 } 1231 } 1232 } 1233 1234 /** Adds the given token to the parsed output. */ addToken(Token token)1235 private void addToken(Token token) { 1236 tokens.add(token); 1237 } 1238 1239 /** Converts the entire message into a single HTML display string. 
*/ toHtml()1240 public String toHtml() { 1241 StringBuilder html = new StringBuilder(); 1242 1243 for (Part part : parts) { 1244 boolean caps = false; 1245 1246 html.append("<p>"); 1247 for (Token token : part.getTokens()) { 1248 if (token.isHtml()) { 1249 html.append(token.toHtml(caps)); 1250 } else { 1251 switch (token.getType()) { 1252 case LINK: 1253 html.append("<a href=\""); 1254 html.append(((Link)token).getURL()); 1255 html.append("\">"); 1256 html.append(token.getRawText()); 1257 html.append("</a>"); 1258 break; 1259 1260 case SMILEY: 1261 // TODO: link to an appropriate image 1262 html.append(token.getRawText()); 1263 break; 1264 1265 case ACRONYM: 1266 html.append(token.getRawText()); 1267 break; 1268 1269 case MUSIC: 1270 // TODO: include a music glyph 1271 html.append(((MusicTrack)token).getTrack()); 1272 break; 1273 1274 case GOOGLE_VIDEO: 1275 // TODO: include a Google Video icon 1276 html.append("<a href=\""); 1277 html.append(((Video)token).getURL(((Video)token).getDocID())); 1278 html.append("\">"); 1279 html.append(token.getRawText()); 1280 html.append("</a>"); 1281 break; 1282 1283 case YOUTUBE_VIDEO: 1284 // TODO: include a YouTube icon 1285 html.append("<a href=\""); 1286 html.append(((YouTubeVideo)token).getURL( 1287 ((YouTubeVideo)token).getDocID())); 1288 html.append("\">"); 1289 html.append(token.getRawText()); 1290 html.append("</a>"); 1291 break; 1292 1293 case PHOTO: { 1294 // TODO: include a Picasa Web icon 1295 html.append("<a href=\""); 1296 html.append(Photo.getAlbumURL( 1297 ((Photo)token).getUser(), ((Photo)token).getAlbum())); 1298 html.append("\">"); 1299 html.append(token.getRawText()); 1300 html.append("</a>"); 1301 break; 1302 } 1303 1304 case FLICKR: 1305 // TODO: include a Flickr icon 1306 Photo p = (Photo) token; 1307 html.append("<a href=\""); 1308 html.append(((FlickrPhoto)token).getUrl()); 1309 html.append("\">"); 1310 html.append(token.getRawText()); 1311 html.append("</a>"); 1312 break; 1313 1314 default: 1315 throw 
new AssertionError("unknown token type: " + token.getType()); 1316 } 1317 } 1318 1319 if (token.controlCaps()) { 1320 caps = token.setCaps(); 1321 } 1322 } 1323 html.append("</p>\n"); 1324 } 1325 1326 return html.toString(); 1327 } 1328 1329 /** Returns the reverse of the given string. */ reverse(String str)1330 protected static String reverse(String str) { 1331 StringBuilder buf = new StringBuilder(); 1332 for (int i = str.length() - 1; i >= 0; --i) { 1333 buf.append(str.charAt(i)); 1334 } 1335 return buf.toString(); 1336 } 1337 1338 public static class TrieNode { 1339 private final HashMap<Character,TrieNode> children = 1340 new HashMap<Character,TrieNode>(); 1341 private String text; 1342 private String value; 1343 TrieNode()1344 public TrieNode() { this(""); } TrieNode(String text)1345 public TrieNode(String text) { 1346 this.text = text; 1347 } 1348 exists()1349 public final boolean exists() { return value != null; } getText()1350 public final String getText() { return text; } getValue()1351 public final String getValue() { return value; } setValue(String value)1352 public void setValue(String value) { this.value = value; } 1353 getChild(char ch)1354 public TrieNode getChild(char ch) { 1355 return children.get(Character.valueOf(ch)); 1356 } 1357 getOrCreateChild(char ch)1358 public TrieNode getOrCreateChild(char ch) { 1359 Character key = Character.valueOf(ch); 1360 TrieNode node = children.get(key); 1361 if (node == null) { 1362 node = new TrieNode(text + String.valueOf(ch)); 1363 children.put(key, node); 1364 } 1365 return node; 1366 } 1367 1368 /** Adds the given string into the trie. */ addToTrie(TrieNode root, String str, String value)1369 public static void addToTrie(TrieNode root, String str, String value) { 1370 int index = 0; 1371 while (index < str.length()) { 1372 root = root.getOrCreateChild(str.charAt(index++)); 1373 } 1374 root.setValue(value); 1375 } 1376 } 1377 1378 1379 1380 /** Determines whether the given string is in the given trie. 
*/ matches(TrieNode root, String str)1381 private static boolean matches(TrieNode root, String str) { 1382 int index = 0; 1383 while (index < str.length()) { 1384 root = root.getChild(str.charAt(index++)); 1385 if (root == null) { 1386 break; 1387 } else if (root.exists()) { 1388 return true; 1389 } 1390 } 1391 return false; 1392 } 1393 1394 /** 1395 * Returns the longest substring of the given string, starting at the given 1396 * index, that exists in the trie. 1397 */ longestMatch( TrieNode root, AbstractMessageParser p, int start)1398 private static TrieNode longestMatch( 1399 TrieNode root, AbstractMessageParser p, int start) { 1400 return longestMatch(root, p, start, false); 1401 } 1402 1403 /** 1404 * Returns the longest substring of the given string, starting at the given 1405 * index, that exists in the trie, with a special tokenizing case for 1406 * smileys if specified. 1407 */ longestMatch( TrieNode root, AbstractMessageParser p, int start, boolean smiley)1408 private static TrieNode longestMatch( 1409 TrieNode root, AbstractMessageParser p, int start, boolean smiley) { 1410 int index = start; 1411 TrieNode bestMatch = null; 1412 while (index < p.getRawText().length()) { 1413 root = root.getChild(p.getRawText().charAt(index++)); 1414 if (root == null) { 1415 break; 1416 } else if (root.exists()) { 1417 if (p.isWordBreak(index)) { 1418 bestMatch = root; 1419 } else if (smiley && p.isSmileyBreak(index)) { 1420 bestMatch = root; 1421 } 1422 } 1423 } 1424 return bestMatch; 1425 } 1426 1427 1428 /** Represents set of tokens that are delivered as a single message. */ 1429 public static class Part { 1430 private String meText; 1431 private ArrayList<Token> tokens; 1432 Part()1433 public Part() { 1434 this.tokens = new ArrayList<Token>(); 1435 } 1436 getType(boolean isSend)1437 public String getType(boolean isSend) { 1438 return (isSend ? 
"s" : "r") + getPartType(); 1439 } 1440 getPartType()1441 private String getPartType() { 1442 if (isMedia()) { 1443 return "d"; 1444 } else if (meText != null) { 1445 return "m"; 1446 } else { 1447 return ""; 1448 } 1449 } 1450 isMedia()1451 public boolean isMedia() { 1452 return (tokens.size() == 1) && tokens.get(0).isMedia(); 1453 } 1454 /** 1455 * Convenience method for getting the Token of a Part that represents 1456 * a media Token. Parts of this kind will always only have a single Token 1457 * 1458 * @return if this.isMedia(), 1459 * returns the Token representing the media contained in this Part, 1460 * otherwise returns null; 1461 */ getMediaToken()1462 public Token getMediaToken() { 1463 if(isMedia()) { 1464 return tokens.get(0); 1465 } 1466 return null; 1467 } 1468 1469 /** Adds the given token to this part. */ add(Token token)1470 public void add(Token token) { 1471 if (isMedia()) { 1472 throw new AssertionError("media "); 1473 } 1474 tokens.add(token); 1475 } 1476 setMeText(String meText)1477 public void setMeText(String meText) { 1478 this.meText = meText; 1479 } 1480 1481 /** Returns the original text of this part. */ getRawText()1482 public String getRawText() { 1483 StringBuilder buf = new StringBuilder(); 1484 if (meText != null) { 1485 buf.append(meText); 1486 } 1487 for (int i = 0; i < tokens.size(); ++i) { 1488 buf.append(tokens.get(i).getRawText()); 1489 } 1490 return buf.toString(); 1491 } 1492 1493 /** Returns the tokens in this part. */ getTokens()1494 public ArrayList<Token> getTokens() { return tokens; } 1495 1496 /** Adds the tokens into the given builder as an array. */ 1497 // public void toArray(JSArrayBuilder array) { 1498 // if (isMedia()) { 1499 // // For media, we send its array (i.e., we don't wrap this in another 1500 // // array as we do for non-media parts). 
1501 // tokens.get(0).toArray(array); 1502 // } else { 1503 // array.beginArray(); 1504 // addToArray(array); 1505 // array.endArray(); 1506 // } 1507 // } 1508 } 1509 } 1510