1 /* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.android.util; 18 19 import android.compat.annotation.UnsupportedAppUsage; 20 21 import java.util.ArrayList; 22 import java.util.HashMap; 23 import java.util.LinkedHashMap; 24 import java.util.List; 25 import java.util.Set; 26 import java.util.regex.Matcher; 27 import java.util.regex.Pattern; 28 29 /** 30 * 31 * Logic for parsing a text message typed by the user looking for smileys, 32 * urls, acronyms,formatting (e.g., '*'s for bold), me commands 33 * (e.g., "/me is asleep"), and punctuation. 34 * 35 * It constructs an array, which breaks the text up into its 36 * constituent pieces, which we return to the client. 37 * 38 */ 39 public abstract class AbstractMessageParser { 40 /** 41 * Interface representing the set of resources needed by a message parser 42 * 43 * @author jessan (Jessan Hutchison-Quillian) 44 */ 45 public static interface Resources { 46 47 /** Get the known set of URL schemes. */ getSchemes()48 public Set<String> getSchemes(); 49 50 /** Get the possible values for the last part of a domain name. 51 * Values are expected to be reversed in the Trie. 52 */ getDomainSuffixes()53 public TrieNode getDomainSuffixes(); 54 55 /** Get the smileys accepted by the parser. */ getSmileys()56 public TrieNode getSmileys(); 57 58 /** Get the acronyms accepted by the parser. 
*/ getAcronyms()59 public TrieNode getAcronyms(); 60 } 61 62 /** 63 * Subclasses must define the schemes, domains, smileys and acronyms 64 * that are necessary for parsing 65 */ getResources()66 protected abstract Resources getResources(); 67 68 /** Music note that indicates user is listening to a music track. */ 69 public static final String musicNote = "\u266B "; 70 71 private String text; 72 private int nextChar; 73 private int nextClass; 74 private ArrayList<Part> parts; 75 private ArrayList<Token> tokens; 76 private HashMap<Character,Format> formatStart; 77 private boolean parseSmilies; 78 private boolean parseAcronyms; 79 private boolean parseFormatting; 80 private boolean parseUrls; 81 private boolean parseMeText; 82 private boolean parseMusic; 83 84 /** 85 * Create a message parser to parse urls, formatting, acronyms, smileys, 86 * /me text and music 87 * 88 * @param text the text to parse 89 */ AbstractMessageParser(String text)90 public AbstractMessageParser(String text) { 91 this(text, true, true, true, true, true, true); 92 } 93 94 /** 95 * Create a message parser, specifying the kinds of text to parse 96 * 97 * @param text the text to parse 98 * 99 */ AbstractMessageParser(String text, boolean parseSmilies, boolean parseAcronyms, boolean parseFormatting, boolean parseUrls, boolean parseMusic, boolean parseMeText)100 public AbstractMessageParser(String text, boolean parseSmilies, 101 boolean parseAcronyms, boolean parseFormatting, boolean parseUrls, 102 boolean parseMusic, boolean parseMeText) { 103 this.text = text; 104 this.nextChar = 0; 105 this.nextClass = 10; 106 this.parts = new ArrayList<Part>(); 107 this.tokens = new ArrayList<Token>(); 108 this.formatStart = new HashMap<Character,Format>(); 109 this.parseSmilies = parseSmilies; 110 this.parseAcronyms = parseAcronyms; 111 this.parseFormatting = parseFormatting; 112 this.parseUrls = parseUrls; 113 this.parseMusic = parseMusic; 114 this.parseMeText = parseMeText; 115 } 116 117 /** Returns the raw 
text being parsed. */ getRawText()118 public final String getRawText() { return text; } 119 120 /** Return the number of parts. */ getPartCount()121 public final int getPartCount() { return parts.size(); } 122 123 /** Return the part at the given index. */ getPart(int index)124 public final Part getPart(int index) { return parts.get(index); } 125 126 /** Return the list of parts from the parsed text */ getParts()127 public final List<Part> getParts() { return parts; } 128 129 /** Parses the text string into an internal representation. */ parse()130 public void parse() { 131 // Look for music track (of which there would be only one and it'll be the 132 // first token) 133 if (parseMusicTrack()) { 134 buildParts(null); 135 return; 136 } 137 138 // Look for me commands. 139 String meText = null; 140 if (parseMeText && text.startsWith("/me") && (text.length() > 3) && 141 Character.isWhitespace(text.charAt(3))) { 142 meText = text.substring(0, 4); 143 text = text.substring(4); 144 } 145 146 // Break the text into tokens. 147 boolean wasSmiley = false; 148 while (nextChar < text.length()) { 149 if (!isWordBreak(nextChar)) { 150 if (!wasSmiley || !isSmileyBreak(nextChar)) { 151 throw new AssertionError("last chunk did not end at word break"); 152 } 153 } 154 155 if (parseSmiley()) { 156 wasSmiley = true; 157 } else { 158 wasSmiley = false; 159 160 if (!parseAcronym() && !parseURL() && !parseFormatting()) { 161 parseText(); 162 } 163 } 164 } 165 166 // Trim the whitespace before and after media components. 167 for (int i = 0; i < tokens.size(); ++i) { 168 if (tokens.get(i).isMedia()) { 169 if ((i > 0) && (tokens.get(i - 1) instanceof Html)) { 170 ((Html)tokens.get(i - 1)).trimLeadingWhitespace(); 171 } 172 if ((i + 1 < tokens.size()) && (tokens.get(i + 1) instanceof Html)) { 173 ((Html)tokens.get(i + 1)).trimTrailingWhitespace(); 174 } 175 } 176 } 177 178 // Remove any empty html tokens. 
179 for (int i = 0; i < tokens.size(); ++i) { 180 if (tokens.get(i).isHtml() && 181 (tokens.get(i).toHtml(true).length() == 0)) { 182 tokens.remove(i); 183 --i; // visit this index again 184 } 185 } 186 187 buildParts(meText); 188 } 189 190 /** 191 * Get a the appropriate Token for a given URL 192 * 193 * @param text the anchor text 194 * @param url the url 195 * 196 */ tokenForUrl(String url, String text)197 public static Token tokenForUrl(String url, String text) { 198 if(url == null) { 199 return null; 200 } 201 202 //Look for video links 203 Video video = Video.matchURL(url, text); 204 if (video != null) { 205 return video; 206 } 207 208 // Look for video links. 209 YouTubeVideo ytVideo = YouTubeVideo.matchURL(url, text); 210 if (ytVideo != null) { 211 return ytVideo; 212 } 213 214 // Look for photo links. 215 Photo photo = Photo.matchURL(url, text); 216 if (photo != null) { 217 return photo; 218 } 219 220 // Look for photo links. 221 FlickrPhoto flickrPhoto = FlickrPhoto.matchURL(url, text); 222 if (flickrPhoto != null) { 223 return flickrPhoto; 224 } 225 226 //Not media, so must be a regular URL 227 return new Link(url, text); 228 } 229 230 /** 231 * Builds the parts list. 232 * 233 * @param meText any meText parsed from the message 234 */ buildParts(String meText)235 private void buildParts(String meText) { 236 for (int i = 0; i < tokens.size(); ++i) { 237 Token token = tokens.get(i); 238 if (token.isMedia() || (parts.size() == 0) || lastPart().isMedia()) { 239 parts.add(new Part()); 240 } 241 lastPart().add(token); 242 } 243 244 // The first part inherits the meText of the line. 245 if (parts.size() > 0) { 246 parts.get(0).setMeText(meText); 247 } 248 } 249 250 /** Returns the last part in the list. */ lastPart()251 private Part lastPart() { return parts.get(parts.size() - 1); } 252 253 /** 254 * Looks for a music track (\u266B is first character, everything else is 255 * track info). 
256 */ parseMusicTrack()257 private boolean parseMusicTrack() { 258 259 if (parseMusic && text.startsWith(musicNote)) { 260 addToken(new MusicTrack(text.substring(musicNote.length()))); 261 nextChar = text.length(); 262 return true; 263 } 264 return false; 265 } 266 267 /** Consumes all of the text in the next word . */ parseText()268 private void parseText() { 269 StringBuilder buf = new StringBuilder(); 270 int start = nextChar; 271 do { 272 char ch = text.charAt(nextChar++); 273 switch (ch) { 274 case '<': buf.append("<"); break; 275 case '>': buf.append(">"); break; 276 case '&': buf.append("&"); break; 277 case '"': buf.append("""); break; 278 case '\'': buf.append("'"); break; 279 case '\n': buf.append("<br>"); break; 280 default: buf.append(ch); break; 281 } 282 } while (!isWordBreak(nextChar)); 283 284 addToken(new Html(text.substring(start, nextChar), buf.toString())); 285 } 286 287 /** 288 * Looks for smileys (e.g., ":)") in the text. The set of known smileys is 289 * loaded from a file into a trie at server start. 290 */ parseSmiley()291 private boolean parseSmiley() { 292 if(!parseSmilies) { 293 return false; 294 } 295 TrieNode match = longestMatch(getResources().getSmileys(), this, nextChar, 296 true); 297 if (match == null) { 298 return false; 299 } else { 300 int previousCharClass = getCharClass(nextChar - 1); 301 int nextCharClass = getCharClass(nextChar + match.getText().length()); 302 if ((previousCharClass == 2 || previousCharClass == 3) 303 && (nextCharClass == 2 || nextCharClass == 3)) { 304 return false; 305 } 306 addToken(new Smiley(match.getText())); 307 nextChar += match.getText().length(); 308 return true; 309 } 310 } 311 312 /** Looks for acronyms (e.g., "lol") in the text. 
313 */ parseAcronym()314 private boolean parseAcronym() { 315 if(!parseAcronyms) { 316 return false; 317 } 318 TrieNode match = longestMatch(getResources().getAcronyms(), this, nextChar); 319 if (match == null) { 320 return false; 321 } else { 322 addToken(new Acronym(match.getText(), match.getValue())); 323 nextChar += match.getText().length(); 324 return true; 325 } 326 } 327 328 /** Determines if this is an allowable domain character. */ isDomainChar(char c)329 private boolean isDomainChar(char c) { 330 return c == '-' || Character.isLetter(c) || Character.isDigit(c); 331 } 332 333 /** Determines if the given string is a valid domain. */ isValidDomain(String domain)334 private boolean isValidDomain(String domain) { 335 // For hostnames, check that it ends with a known domain suffix 336 if (matches(getResources().getDomainSuffixes(), reverse(domain))) { 337 return true; 338 } 339 return false; 340 } 341 342 /** 343 * Looks for a URL in two possible forms: either a proper URL with a known 344 * scheme or a domain name optionally followed by a path, query, or query. 345 */ parseURL()346 private boolean parseURL() { 347 // Make sure this is a valid place to start a URL. 348 if (!parseUrls || !isURLBreak(nextChar)) { 349 return false; 350 } 351 352 int start = nextChar; 353 354 // Search for the first block of letters. 355 int index = start; 356 while ((index < text.length()) && isDomainChar(text.charAt(index))) { 357 index += 1; 358 } 359 360 String url = ""; 361 boolean done = false; 362 363 if (index == text.length()) { 364 return false; 365 } else if (text.charAt(index) == ':') { 366 // Make sure this is a known scheme. 367 String scheme = text.substring(nextChar, index); 368 if (!getResources().getSchemes().contains(scheme)) { 369 return false; 370 } 371 } else if (text.charAt(index) == '.') { 372 // Search for the end of the domain name. 
373 while (index < text.length()) { 374 char ch = text.charAt(index); 375 if ((ch != '.') && !isDomainChar(ch)) { 376 break; 377 } else { 378 index += 1; 379 } 380 } 381 382 // Make sure the domain name has a valid suffix. Since tries look for 383 // prefix matches, we reverse all the strings to get suffix comparisons. 384 String domain = text.substring(nextChar, index); 385 if (!isValidDomain(domain)) { 386 return false; 387 } 388 389 // Search for a port. We deal with this specially because a colon can 390 // also be a punctuation character. 391 if ((index + 1 < text.length()) && (text.charAt(index) == ':')) { 392 char ch = text.charAt(index + 1); 393 if (Character.isDigit(ch)) { 394 index += 1; 395 while ((index < text.length()) && 396 Character.isDigit(text.charAt(index))) { 397 index += 1; 398 } 399 } 400 } 401 402 // The domain name should be followed by end of line, whitespace, 403 // punctuation, or a colon, slash, question, or hash character. The 404 // tricky part here is that some URL characters are also punctuation, so 405 // we need to distinguish them. Since we looked for ports above, a colon 406 // is always punctuation here. To distinguish '?' cases, we look at the 407 // character that follows it. 408 if (index == text.length()) { 409 done = true; 410 } else { 411 char ch = text.charAt(index); 412 if (ch == '?') { 413 // If the next character is whitespace or punctuation (or missing), 414 // then this question mark looks like punctuation. 415 if (index + 1 == text.length()) { 416 done = true; 417 } else { 418 char ch2 = text.charAt(index + 1); 419 if (Character.isWhitespace(ch2) || isPunctuation(ch2)) { 420 done = true; 421 } 422 } 423 } else if (isPunctuation(ch)) { 424 done = true; 425 } else if (Character.isWhitespace(ch)) { 426 done = true; 427 } else if ((ch == '/') || (ch == '#')) { 428 // In this case, the URL is not done. We will search for the end of 429 // it below. 
430 } else { 431 return false; 432 } 433 } 434 435 // We will assume the user meant HTTP. (One weird case is where they 436 // type a port of 443. That could mean HTTPS, but they might also want 437 // HTTP. We'll let them specify if they don't want HTTP.) 438 url = "http://"; 439 } else { 440 return false; 441 } 442 443 // If the URL is not done, search for the end, which is just before the 444 // next whitespace character. 445 if (!done) { 446 while ((index < text.length()) && 447 !Character.isWhitespace(text.charAt(index))) { 448 index += 1; 449 } 450 } 451 452 String urlText = text.substring(start, index); 453 url += urlText; 454 455 // Figure out the appropriate token type. 456 addURLToken(url, urlText); 457 458 nextChar = index; 459 return true; 460 } 461 462 /** 463 * Adds the appropriate token for the given URL. This might be a simple 464 * link or it might be a recognized media type. 465 */ addURLToken(String url, String text)466 private void addURLToken(String url, String text) { 467 addToken(tokenForUrl(url, text)); 468 } 469 470 /** 471 * Deal with formatting characters. 472 * 473 * Parsing is as follows: 474 * - Treat all contiguous strings of formatting characters as one block. 475 * (This method processes one block.) 476 * - Only a single instance of a particular format character within a block 477 * is used to determine whether to turn on/off that type of formatting; 478 * other instances simply print the character itself. 479 * - If the format is to be turned on, we use the _first_ instance; if it 480 * is to be turned off, we use the _last_ instance (by appending the 481 * format.) 
482 * 483 * Example: 484 * **string** turns into <b>*string*</b> 485 */ parseFormatting()486 private boolean parseFormatting() { 487 if(!parseFormatting) { 488 return false; 489 } 490 int endChar = nextChar; 491 while ((endChar < text.length()) && isFormatChar(text.charAt(endChar))) { 492 endChar += 1; 493 } 494 495 if ((endChar == nextChar) || !isWordBreak(endChar)) { 496 return false; 497 } 498 499 // Keeps track of whether we've seen a character (in map if we've seen it) 500 // and whether we should append a closing format token (if value in 501 // map is TRUE). Linked hashmap for consistent ordering. 502 LinkedHashMap<Character, Boolean> seenCharacters = 503 new LinkedHashMap<Character, Boolean>(); 504 505 for (int index = nextChar; index < endChar; ++index) { 506 char ch = text.charAt(index); 507 Character key = Character.valueOf(ch); 508 if (seenCharacters.containsKey(key)) { 509 // Already seen this character, just append an unmatched token, which 510 // will print plaintext character 511 addToken(new Format(ch, false)); 512 } else { 513 Format start = formatStart.get(key); 514 if (start != null) { 515 // Match the start token, and ask an end token to be appended 516 start.setMatched(true); 517 formatStart.remove(key); 518 seenCharacters.put(key, Boolean.TRUE); 519 } else { 520 // Append start token 521 start = new Format(ch, true); 522 formatStart.put(key, start); 523 addToken(start); 524 seenCharacters.put(key, Boolean.FALSE); 525 } 526 } 527 } 528 529 // Append any necessary end tokens 530 for (Character key : seenCharacters.keySet()) { 531 if (seenCharacters.get(key) == Boolean.TRUE) { 532 Format end = new Format(key.charValue(), false); 533 end.setMatched(true); 534 addToken(end); 535 } 536 } 537 538 nextChar = endChar; 539 return true; 540 } 541 542 /** Determines whether the given index could be a possible word break. 
*/ isWordBreak(int index)543 private boolean isWordBreak(int index) { 544 return getCharClass(index - 1) != getCharClass(index); 545 } 546 547 /** Determines whether the given index could be a possible smiley break. */ isSmileyBreak(int index)548 private boolean isSmileyBreak(int index) { 549 if (index > 0 && index < text.length()) { 550 if (isSmileyBreak(text.charAt(index - 1), text.charAt(index))) { 551 return true; 552 } 553 } 554 555 return false; 556 } 557 558 /** 559 * Verifies that the character before the given index is end of line, 560 * whitespace, or punctuation. 561 */ isURLBreak(int index)562 private boolean isURLBreak(int index) { 563 switch (getCharClass(index - 1)) { 564 case 2: 565 case 3: 566 case 4: 567 return false; 568 569 case 0: 570 case 1: 571 default: 572 return true; 573 } 574 } 575 576 /** Returns the class for the character at the given index. */ getCharClass(int index)577 private int getCharClass(int index) { 578 if ((index < 0) || (text.length() <= index)) { 579 return 0; 580 } 581 582 char ch = text.charAt(index); 583 if (Character.isWhitespace(ch)) { 584 return 1; 585 } else if (Character.isLetter(ch)) { 586 return 2; 587 } else if (Character.isDigit(ch)) { 588 return 3; 589 } else if (isPunctuation(ch)) { 590 // For punctuation, we return a unique value every time so that they are 591 // always different from any other character. Punctuation should always 592 // be considered a possible word break. 593 return ++nextClass; 594 } else { 595 return 4; 596 } 597 } 598 599 /** 600 * Returns true if <code>c1</code> could be the last character of 601 * a smiley and <code>c2</code> could be the first character of 602 * a different smiley, if {@link #isWordBreak} would not already 603 * recognize that this is possible. 604 */ isSmileyBreak(char c1, char c2)605 private static boolean isSmileyBreak(char c1, char c2) { 606 switch (c1) { 607 /* 608 * These characters can end smileys, but don't normally end words. 
609 */ 610 case '$': case '&': case '*': case '+': case '-': 611 case '/': case '<': case '=': case '>': case '@': 612 case '[': case '\\': case ']': case '^': case '|': 613 case '}': case '~': 614 switch (c2) { 615 /* 616 * These characters can begin smileys, but don't normally 617 * begin words. 618 */ 619 case '#': case '$': case '%': case '*': case '/': 620 case '<': case '=': case '>': case '@': case '[': 621 case '\\': case '^': case '~': 622 return true; 623 } 624 } 625 626 return false; 627 } 628 629 /** Determines whether the given character is punctuation. */ isPunctuation(char ch)630 private static boolean isPunctuation(char ch) { 631 switch (ch) { 632 case '.': case ',': case '"': case ':': case ';': 633 case '?': case '!': case '(': case ')': 634 return true; 635 636 default: 637 return false; 638 } 639 } 640 641 /** 642 * Determines whether the given character is the beginning or end of a 643 * section with special formatting. 644 */ isFormatChar(char ch)645 private static boolean isFormatChar(char ch) { 646 switch (ch) { 647 case '*': case '_': case '^': 648 return true; 649 650 default: 651 return false; 652 } 653 } 654 655 /** Represents a unit of parsed output. 
*/ 656 public static abstract class Token { 657 @UnsupportedAppUsage(implicitMember = 658 "values()[Lcom/google/android/util/AbstractMessageParser$Token$Type;") 659 public enum Type { 660 @UnsupportedAppUsage 661 HTML ("html"), 662 @UnsupportedAppUsage 663 FORMAT ("format"), // subtype of HTML 664 @UnsupportedAppUsage 665 LINK ("l"), 666 @UnsupportedAppUsage 667 SMILEY ("e"), 668 @UnsupportedAppUsage 669 ACRONYM ("a"), 670 @UnsupportedAppUsage 671 MUSIC ("m"), 672 @UnsupportedAppUsage 673 GOOGLE_VIDEO ("v"), 674 @UnsupportedAppUsage 675 YOUTUBE_VIDEO ("yt"), 676 @UnsupportedAppUsage 677 PHOTO ("p"), 678 @UnsupportedAppUsage 679 FLICKR ("f"); 680 681 //stringreps for HTML and FORMAT don't really matter 682 //because they don't define getInfo(), which is where it is used 683 //For the other types, code depends on their stringreps 684 private String stringRep; 685 Type(String stringRep)686 Type(String stringRep) { 687 this.stringRep = stringRep; 688 } 689 690 /** {@inheritDoc} */ toString()691 public String toString() { 692 return this.stringRep; 693 } 694 } 695 696 protected Type type; 697 protected String text; 698 Token(Type type, String text)699 protected Token(Type type, String text) { 700 this.type = type; 701 this.text = text; 702 } 703 704 /** Returns the type of the token. */ getType()705 public Type getType() { return type; } 706 707 /** 708 * Get the relevant information about a token 709 * 710 * @return a list of strings representing the token, not null 711 * The first item is always a string representation of the type 712 */ getInfo()713 public List<String> getInfo() { 714 List<String> info = new ArrayList<String>(); 715 info.add(getType().toString()); 716 return info; 717 } 718 719 /** Returns the raw text of the token. 
*/ getRawText()720 public String getRawText() { return text; } 721 isMedia()722 public boolean isMedia() { return false; } isHtml()723 public abstract boolean isHtml(); isArray()724 public boolean isArray() { return !isHtml(); } 725 toHtml(boolean caps)726 public String toHtml(boolean caps) { throw new AssertionError("not html"); } 727 728 // The token can change the caps of the text after that point. controlCaps()729 public boolean controlCaps() { return false; } setCaps()730 public boolean setCaps() { return false; } 731 } 732 733 /** Represents a simple string of html text. */ 734 public static class Html extends Token { 735 private String html; 736 Html(String text, String html)737 public Html(String text, String html) { 738 super(Type.HTML, text); 739 this.html = html; 740 } 741 isHtml()742 public boolean isHtml() { return true; } toHtml(boolean caps)743 public String toHtml(boolean caps) { 744 return caps ? html.toUpperCase() : html; 745 } 746 /** 747 * Not supported. Info should not be needed for this type 748 */ getInfo()749 public List<String> getInfo() { 750 throw new UnsupportedOperationException(); 751 } 752 trimLeadingWhitespace()753 public void trimLeadingWhitespace() { 754 text = trimLeadingWhitespace(text); 755 html = trimLeadingWhitespace(html); 756 } 757 trimTrailingWhitespace()758 public void trimTrailingWhitespace() { 759 text = trimTrailingWhitespace(text); 760 html = trimTrailingWhitespace(html); 761 } 762 trimLeadingWhitespace(String text)763 private static String trimLeadingWhitespace(String text) { 764 int index = 0; 765 while ((index < text.length()) && 766 Character.isWhitespace(text.charAt(index))) { 767 ++index; 768 } 769 return text.substring(index); 770 } 771 trimTrailingWhitespace(String text)772 public static String trimTrailingWhitespace(String text) { 773 int index = text.length(); 774 while ((index > 0) && Character.isWhitespace(text.charAt(index - 1))) { 775 --index; 776 } 777 return text.substring(0, index); 778 } 779 } 780 781 
/** Represents a music track token at the beginning. */ 782 public static class MusicTrack extends Token { 783 private String track; 784 MusicTrack(String track)785 public MusicTrack(String track) { 786 super(Type.MUSIC, track); 787 this.track = track; 788 } 789 getTrack()790 public String getTrack() { return track; } 791 isHtml()792 public boolean isHtml() { return false; } 793 getInfo()794 public List<String> getInfo() { 795 List<String> info = super.getInfo(); 796 info.add(getTrack()); 797 return info; 798 } 799 } 800 801 /** Represents a link that was found in the input. */ 802 public static class Link extends Token { 803 private String url; 804 Link(String url, String text)805 public Link(String url, String text) { 806 super(Type.LINK, text); 807 this.url = url; 808 } 809 getURL()810 public String getURL() { return url; } 811 isHtml()812 public boolean isHtml() { return false; } 813 getInfo()814 public List<String> getInfo() { 815 List<String> info = super.getInfo(); 816 info.add(getURL()); 817 info.add(getRawText()); 818 return info; 819 } 820 } 821 822 /** Represents a link to a Google Video. */ 823 public static class Video extends Token { 824 /** Pattern for a video URL. */ 825 private static final Pattern URL_PATTERN = Pattern.compile( 826 "(?i)http://video\\.google\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/videoplay\\?" 827 + ".*?\\bdocid=(-?\\d+).*"); 828 829 private String docid; 830 Video(String docid, String text)831 public Video(String docid, String text) { 832 super(Type.GOOGLE_VIDEO, text); 833 this.docid = docid; 834 } 835 getDocID()836 public String getDocID() { return docid; } 837 isHtml()838 public boolean isHtml() { return false; } isMedia()839 public boolean isMedia() { return true; } 840 841 /** Returns a Video object if the given url is to a video. 
*/ matchURL(String url, String text)842 public static Video matchURL(String url, String text) { 843 Matcher m = URL_PATTERN.matcher(url); 844 if (m.matches()) { 845 return new Video(m.group(1), text); 846 } else { 847 return null; 848 } 849 } 850 getInfo()851 public List<String> getInfo() { 852 List<String> info = super.getInfo(); 853 info.add(getRssUrl(docid)); 854 info.add(getURL(docid)); 855 return info; 856 } 857 858 /** Returns the URL for the RSS description of the given video. */ getRssUrl(String docid)859 public static String getRssUrl(String docid) { 860 return "http://video.google.com/videofeed" 861 + "?type=docid&output=rss&sourceid=gtalk&docid=" + docid; 862 } 863 864 /** (For testing purposes:) Returns a video URL with the given parts. */ getURL(String docid)865 public static String getURL(String docid) { 866 return getURL(docid, null); 867 } 868 869 /** (For testing purposes:) Returns a video URL with the given parts. */ getURL(String docid, String extraParams)870 public static String getURL(String docid, String extraParams) { 871 if (extraParams == null) { 872 extraParams = ""; 873 } else if (extraParams.length() > 0) { 874 extraParams += "&"; 875 } 876 return "http://video.google.com/videoplay?" + extraParams 877 + "docid=" + docid; 878 } 879 } 880 881 /** Represents a link to a YouTube video. */ 882 public static class YouTubeVideo extends Token { 883 /** Pattern for a video URL. */ 884 private static final Pattern URL_PATTERN = Pattern.compile( 885 "(?i)http://(?:[a-z0-9]+\\.)?youtube\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/watch\\?" 
886 + ".*\\bv=([-_a-zA-Z0-9=]+).*"); 887 888 private String docid; 889 YouTubeVideo(String docid, String text)890 public YouTubeVideo(String docid, String text) { 891 super(Type.YOUTUBE_VIDEO, text); 892 this.docid = docid; 893 } 894 getDocID()895 public String getDocID() { return docid; } 896 isHtml()897 public boolean isHtml() { return false; } isMedia()898 public boolean isMedia() { return true; } 899 900 /** Returns a Video object if the given url is to a video. */ matchURL(String url, String text)901 public static YouTubeVideo matchURL(String url, String text) { 902 Matcher m = URL_PATTERN.matcher(url); 903 if (m.matches()) { 904 return new YouTubeVideo(m.group(1), text); 905 } else { 906 return null; 907 } 908 } 909 getInfo()910 public List<String> getInfo() { 911 List<String> info = super.getInfo(); 912 info.add(getRssUrl(docid)); 913 info.add(getURL(docid)); 914 return info; 915 } 916 917 /** Returns the URL for the RSS description of the given video. */ getRssUrl(String docid)918 public static String getRssUrl(String docid) { 919 return "http://youtube.com/watch?v=" + docid; 920 } 921 922 /** (For testing purposes:) Returns a video URL with the given parts. */ getURL(String docid)923 public static String getURL(String docid) { 924 return getURL(docid, null); 925 } 926 927 /** (For testing purposes:) Returns a video URL with the given parts. */ getURL(String docid, String extraParams)928 public static String getURL(String docid, String extraParams) { 929 if (extraParams == null) { 930 extraParams = ""; 931 } else if (extraParams.length() > 0) { 932 extraParams += "&"; 933 } 934 return "http://youtube.com/watch?" + extraParams + "v=" + docid; 935 } 936 937 /** (For testing purposes:) Returns a video URL with the given parts. 938 * @param http If true, includes http:// 939 * @param prefix If non-null/non-blank, adds to URL before youtube.com. 940 * (e.g., prefix="br." 
--> "br.youtube.com") 941 */ getPrefixedURL(boolean http, String prefix, String docid, String extraParams)942 public static String getPrefixedURL(boolean http, String prefix, 943 String docid, String extraParams) { 944 String protocol = ""; 945 946 if (http) { 947 protocol = "http://"; 948 } 949 950 if (prefix == null) { 951 prefix = ""; 952 } 953 954 if (extraParams == null) { 955 extraParams = ""; 956 } else if (extraParams.length() > 0) { 957 extraParams += "&"; 958 } 959 960 return protocol + prefix + "youtube.com/watch?" + extraParams + "v=" + 961 docid; 962 } 963 } 964 965 /** Represents a link to a Picasa photo or album. */ 966 public static class Photo extends Token { 967 /** Pattern for an album or photo URL. */ 968 // TODO (katyarogers) searchbrowse includes search lists and tags, 969 // it follows a different pattern than albums - would be nice to add later 970 private static final Pattern URL_PATTERN = Pattern.compile( 971 "http://picasaweb.google.com/([^/?#&]+)/+((?!searchbrowse)[^/?#&]+)(?:/|/photo)?(?:\\?[^#]*)?(?:#(.*))?"); 972 973 private String user; 974 private String album; 975 private String photo; // null for albums 976 Photo(String user, String album, String photo, String text)977 public Photo(String user, String album, String photo, String text) { 978 super(Type.PHOTO, text); 979 this.user = user; 980 this.album = album; 981 this.photo = photo; 982 } 983 getUser()984 public String getUser() { return user; } getAlbum()985 public String getAlbum() { return album; } getPhoto()986 public String getPhoto() { return photo; } 987 isHtml()988 public boolean isHtml() { return false; } isMedia()989 public boolean isMedia() { return true; } 990 991 /** Returns a Photo object if the given url is to a photo or album. 
*/ matchURL(String url, String text)992 public static Photo matchURL(String url, String text) { 993 Matcher m = URL_PATTERN.matcher(url); 994 if (m.matches()) { 995 return new Photo(m.group(1), m.group(2), m.group(3), text); 996 } else { 997 return null; 998 } 999 } 1000 getInfo()1001 public List<String> getInfo() { 1002 List<String> info = super.getInfo(); 1003 info.add(getRssUrl(getUser())); 1004 info.add(getAlbumURL(getUser(), getAlbum())); 1005 if (getPhoto() != null) { 1006 info.add(getPhotoURL(getUser(), getAlbum(), getPhoto())); 1007 } else { 1008 info.add((String)null); 1009 } 1010 return info; 1011 } 1012 1013 /** Returns the URL for the RSS description of the user's albums. */ getRssUrl(String user)1014 public static String getRssUrl(String user) { 1015 return "http://picasaweb.google.com/data/feed/api/user/" + user + 1016 "?category=album&alt=rss"; 1017 } 1018 1019 /** Returns the URL for an album. */ getAlbumURL(String user, String album)1020 public static String getAlbumURL(String user, String album) { 1021 return "http://picasaweb.google.com/" + user + "/" + album; 1022 } 1023 1024 /** Returns the URL for a particular photo. */ getPhotoURL(String user, String album, String photo)1025 public static String getPhotoURL(String user, String album, String photo) { 1026 return "http://picasaweb.google.com/" + user + "/" + album + "/photo#" 1027 + photo; 1028 } 1029 } 1030 1031 /** Represents a link to a Flickr photo or album. */ 1032 public static class FlickrPhoto extends Token { 1033 /** Pattern for a user album or photo URL. 
*/ 1034 private static final Pattern URL_PATTERN = Pattern.compile( 1035 "http://(?:www.)?flickr.com/photos/([^/?#&]+)/?([^/?#&]+)?/?.*"); 1036 private static final Pattern GROUPING_PATTERN = Pattern.compile( 1037 "http://(?:www.)?flickr.com/photos/([^/?#&]+)/(tags|sets)/" + 1038 "([^/?#&]+)/?"); 1039 1040 private static final String SETS = "sets"; 1041 private static final String TAGS = "tags"; 1042 1043 private String user; 1044 private String photo; // null for user album 1045 private String grouping; // either "tags" or "sets" 1046 private String groupingId; // sets or tags identifier 1047 FlickrPhoto(String user, String photo, String grouping, String groupingId, String text)1048 public FlickrPhoto(String user, String photo, String grouping, 1049 String groupingId, String text) { 1050 super(Type.FLICKR, text); 1051 1052 /* System wide tags look like the URL to a Flickr user. */ 1053 if (!TAGS.equals(user)) { 1054 this.user = user; 1055 // Don't consider slide show URL a photo 1056 this.photo = (!"show".equals(photo) ? photo : null); 1057 this.grouping = grouping; 1058 this.groupingId = groupingId; 1059 } else { 1060 this.user = null; 1061 this.photo = null; 1062 this.grouping = TAGS; 1063 this.groupingId = photo; 1064 } 1065 } 1066 getUser()1067 public String getUser() { return user; } getPhoto()1068 public String getPhoto() { return photo; } getGrouping()1069 public String getGrouping() { return grouping; } getGroupingId()1070 public String getGroupingId() { return groupingId; } 1071 isHtml()1072 public boolean isHtml() { return false; } isMedia()1073 public boolean isMedia() { return true; } 1074 1075 /** 1076 * Returns a FlickrPhoto object if the given url is to a photo or Flickr 1077 * user. 
1078 */ matchURL(String url, String text)1079 public static FlickrPhoto matchURL(String url, String text) { 1080 Matcher m = GROUPING_PATTERN.matcher(url); 1081 if (m.matches()) { 1082 return new FlickrPhoto(m.group(1), null, m.group(2), m.group(3), text); 1083 } 1084 1085 m = URL_PATTERN.matcher(url); 1086 if (m.matches()) { 1087 return new FlickrPhoto(m.group(1), m.group(2), null, null, text); 1088 } else { 1089 return null; 1090 } 1091 } 1092 getInfo()1093 public List<String> getInfo() { 1094 List<String> info = super.getInfo(); 1095 info.add(getUrl()); 1096 info.add(getUser() != null ? getUser() : ""); 1097 info.add(getPhoto() != null ? getPhoto() : ""); 1098 info.add(getGrouping() != null ? getGrouping() : ""); 1099 info.add(getGroupingId() != null ? getGroupingId() : ""); 1100 return info; 1101 } 1102 getUrl()1103 public String getUrl() { 1104 if (SETS.equals(grouping)) { 1105 return getUserSetsURL(user, groupingId); 1106 } else if (TAGS.equals(grouping)) { 1107 if (user != null) { 1108 return getUserTagsURL(user, groupingId); 1109 } else { 1110 return getTagsURL(groupingId); 1111 } 1112 } else if (photo != null) { 1113 return getPhotoURL(user, photo); 1114 } else { 1115 return getUserURL(user); 1116 } 1117 } 1118 1119 /** Returns the URL for the RSS description. */ getRssUrl(String user)1120 public static String getRssUrl(String user) { 1121 return null; 1122 } 1123 1124 /** Returns the URL for a particular tag. */ getTagsURL(String tag)1125 public static String getTagsURL(String tag) { 1126 return "http://flickr.com/photos/tags/" + tag; 1127 } 1128 1129 /** Returns the URL to the user's Flickr homepage. */ getUserURL(String user)1130 public static String getUserURL(String user) { 1131 return "http://flickr.com/photos/" + user; 1132 } 1133 1134 /** Returns the URL for a particular photo. 
*/ getPhotoURL(String user, String photo)1135 public static String getPhotoURL(String user, String photo) { 1136 return "http://flickr.com/photos/" + user + "/" + photo; 1137 } 1138 1139 /** Returns the URL for a user tag photo set. */ getUserTagsURL(String user, String tagId)1140 public static String getUserTagsURL(String user, String tagId) { 1141 return "http://flickr.com/photos/" + user + "/tags/" + tagId; 1142 } 1143 1144 /** Returns the URL for user set. */ getUserSetsURL(String user, String setId)1145 public static String getUserSetsURL(String user, String setId) { 1146 return "http://flickr.com/photos/" + user + "/sets/" + setId; 1147 } 1148 } 1149 1150 /** Represents a smiley that was found in the input. */ 1151 public static class Smiley extends Token { 1152 // TODO: Pass the SWF URL down to the client. 1153 Smiley(String text)1154 public Smiley(String text) { 1155 super(Type.SMILEY, text); 1156 } 1157 isHtml()1158 public boolean isHtml() { return false; } 1159 getInfo()1160 public List<String> getInfo() { 1161 List<String> info = super.getInfo(); 1162 info.add(getRawText()); 1163 return info; 1164 } 1165 } 1166 1167 /** Represents an acronym that was found in the input. */ 1168 public static class Acronym extends Token { 1169 private String value; 1170 // TODO: SWF 1171 Acronym(String text, String value)1172 public Acronym(String text, String value) { 1173 super(Type.ACRONYM, text); 1174 this.value = value; 1175 } 1176 getValue()1177 public String getValue() { return value; } 1178 isHtml()1179 public boolean isHtml() { return false; } 1180 getInfo()1181 public List<String> getInfo() { 1182 List<String> info = super.getInfo(); 1183 info.add(getRawText()); 1184 info.add(getValue()); 1185 return info; 1186 } 1187 } 1188 1189 /** Represents a character that changes formatting. 
*/ 1190 public static class Format extends Token { 1191 private char ch; 1192 private boolean start; 1193 private boolean matched; 1194 Format(char ch, boolean start)1195 public Format(char ch, boolean start) { 1196 super(Type.FORMAT, String.valueOf(ch)); 1197 this.ch = ch; 1198 this.start = start; 1199 } 1200 setMatched(boolean matched)1201 public void setMatched(boolean matched) { this.matched = matched; } 1202 isHtml()1203 public boolean isHtml() { return true; } 1204 toHtml(boolean caps)1205 public String toHtml(boolean caps) { 1206 // This character only implies special formatting if it was matched. 1207 // Otherwise, it was just a plain old character. 1208 if (matched) { 1209 return start ? getFormatStart(ch) : getFormatEnd(ch); 1210 } else { 1211 // We have to make sure we escape HTML characters as usual. 1212 return (ch == '"') ? """ : String.valueOf(ch); 1213 } 1214 } 1215 1216 /** 1217 * Not supported. Info should not be needed for this type 1218 */ getInfo()1219 public List<String> getInfo() { 1220 throw new UnsupportedOperationException(); 1221 } 1222 controlCaps()1223 public boolean controlCaps() { return (ch == '^'); } setCaps()1224 public boolean setCaps() { return start; } 1225 getFormatStart(char ch)1226 private String getFormatStart(char ch) { 1227 switch (ch) { 1228 case '*': return "<b>"; 1229 case '_': return "<i>"; 1230 case '^': return "<b><font color=\"#005FFF\">"; // TODO: all caps 1231 case '"': return "<font color=\"#999999\">\u201c"; 1232 default: throw new AssertionError("unknown format '" + ch + "'"); 1233 } 1234 } 1235 getFormatEnd(char ch)1236 private String getFormatEnd(char ch) { 1237 switch (ch) { 1238 case '*': return "</b>"; 1239 case '_': return "</i>"; 1240 case '^': return "</font></b>"; // TODO: all caps 1241 case '"': return "\u201d</font>"; 1242 default: throw new AssertionError("unknown format '" + ch + "'"); 1243 } 1244 } 1245 } 1246 1247 /** Adds the given token to the parsed output. 
*/ addToken(Token token)1248 private void addToken(Token token) { 1249 tokens.add(token); 1250 } 1251 1252 /** Converts the entire message into a single HTML display string. */ toHtml()1253 public String toHtml() { 1254 StringBuilder html = new StringBuilder(); 1255 1256 for (Part part : parts) { 1257 boolean caps = false; 1258 1259 html.append("<p>"); 1260 for (Token token : part.getTokens()) { 1261 if (token.isHtml()) { 1262 html.append(token.toHtml(caps)); 1263 } else { 1264 switch (token.getType()) { 1265 case LINK: 1266 html.append("<a href=\""); 1267 html.append(((Link)token).getURL()); 1268 html.append("\">"); 1269 html.append(token.getRawText()); 1270 html.append("</a>"); 1271 break; 1272 1273 case SMILEY: 1274 // TODO: link to an appropriate image 1275 html.append(token.getRawText()); 1276 break; 1277 1278 case ACRONYM: 1279 html.append(token.getRawText()); 1280 break; 1281 1282 case MUSIC: 1283 // TODO: include a music glyph 1284 html.append(((MusicTrack)token).getTrack()); 1285 break; 1286 1287 case GOOGLE_VIDEO: 1288 // TODO: include a Google Video icon 1289 html.append("<a href=\""); 1290 html.append(((Video)token).getURL(((Video)token).getDocID())); 1291 html.append("\">"); 1292 html.append(token.getRawText()); 1293 html.append("</a>"); 1294 break; 1295 1296 case YOUTUBE_VIDEO: 1297 // TODO: include a YouTube icon 1298 html.append("<a href=\""); 1299 html.append(((YouTubeVideo)token).getURL( 1300 ((YouTubeVideo)token).getDocID())); 1301 html.append("\">"); 1302 html.append(token.getRawText()); 1303 html.append("</a>"); 1304 break; 1305 1306 case PHOTO: { 1307 // TODO: include a Picasa Web icon 1308 html.append("<a href=\""); 1309 html.append(Photo.getAlbumURL( 1310 ((Photo)token).getUser(), ((Photo)token).getAlbum())); 1311 html.append("\">"); 1312 html.append(token.getRawText()); 1313 html.append("</a>"); 1314 break; 1315 } 1316 1317 case FLICKR: 1318 // TODO: include a Flickr icon 1319 Photo p = (Photo) token; 1320 html.append("<a href=\""); 1321 
html.append(((FlickrPhoto)token).getUrl()); 1322 html.append("\">"); 1323 html.append(token.getRawText()); 1324 html.append("</a>"); 1325 break; 1326 1327 default: 1328 throw new AssertionError("unknown token type: " + token.getType()); 1329 } 1330 } 1331 1332 if (token.controlCaps()) { 1333 caps = token.setCaps(); 1334 } 1335 } 1336 html.append("</p>\n"); 1337 } 1338 1339 return html.toString(); 1340 } 1341 1342 /** Returns the reverse of the given string. */ reverse(String str)1343 protected static String reverse(String str) { 1344 StringBuilder buf = new StringBuilder(); 1345 for (int i = str.length() - 1; i >= 0; --i) { 1346 buf.append(str.charAt(i)); 1347 } 1348 return buf.toString(); 1349 } 1350 1351 public static class TrieNode { 1352 private final HashMap<Character,TrieNode> children = 1353 new HashMap<Character,TrieNode>(); 1354 private String text; 1355 private String value; 1356 TrieNode()1357 public TrieNode() { this(""); } TrieNode(String text)1358 public TrieNode(String text) { 1359 this.text = text; 1360 } 1361 exists()1362 public final boolean exists() { return value != null; } getText()1363 public final String getText() { return text; } getValue()1364 public final String getValue() { return value; } setValue(String value)1365 public void setValue(String value) { this.value = value; } 1366 getChild(char ch)1367 public TrieNode getChild(char ch) { 1368 return children.get(Character.valueOf(ch)); 1369 } 1370 getOrCreateChild(char ch)1371 public TrieNode getOrCreateChild(char ch) { 1372 Character key = Character.valueOf(ch); 1373 TrieNode node = children.get(key); 1374 if (node == null) { 1375 node = new TrieNode(text + String.valueOf(ch)); 1376 children.put(key, node); 1377 } 1378 return node; 1379 } 1380 1381 /** Adds the given string into the trie. 
*/ addToTrie(TrieNode root, String str, String value)1382 public static void addToTrie(TrieNode root, String str, String value) { 1383 int index = 0; 1384 while (index < str.length()) { 1385 root = root.getOrCreateChild(str.charAt(index++)); 1386 } 1387 root.setValue(value); 1388 } 1389 } 1390 1391 1392 1393 /** Determines whether the given string is in the given trie. */ matches(TrieNode root, String str)1394 private static boolean matches(TrieNode root, String str) { 1395 int index = 0; 1396 while (index < str.length()) { 1397 root = root.getChild(str.charAt(index++)); 1398 if (root == null) { 1399 break; 1400 } else if (root.exists()) { 1401 return true; 1402 } 1403 } 1404 return false; 1405 } 1406 1407 /** 1408 * Returns the longest substring of the given string, starting at the given 1409 * index, that exists in the trie. 1410 */ longestMatch( TrieNode root, AbstractMessageParser p, int start)1411 private static TrieNode longestMatch( 1412 TrieNode root, AbstractMessageParser p, int start) { 1413 return longestMatch(root, p, start, false); 1414 } 1415 1416 /** 1417 * Returns the longest substring of the given string, starting at the given 1418 * index, that exists in the trie, with a special tokenizing case for 1419 * smileys if specified. 1420 */ longestMatch( TrieNode root, AbstractMessageParser p, int start, boolean smiley)1421 private static TrieNode longestMatch( 1422 TrieNode root, AbstractMessageParser p, int start, boolean smiley) { 1423 int index = start; 1424 TrieNode bestMatch = null; 1425 while (index < p.getRawText().length()) { 1426 root = root.getChild(p.getRawText().charAt(index++)); 1427 if (root == null) { 1428 break; 1429 } else if (root.exists()) { 1430 if (p.isWordBreak(index)) { 1431 bestMatch = root; 1432 } else if (smiley && p.isSmileyBreak(index)) { 1433 bestMatch = root; 1434 } 1435 } 1436 } 1437 return bestMatch; 1438 } 1439 1440 1441 /** Represents set of tokens that are delivered as a single message. 
*/
  public static class Part {
    private String meText;
    private ArrayList<Token> tokens;

    /** Creates a part that holds no tokens and has no me text. */
    public Part() {
      tokens = new ArrayList<Token>();
    }

    /**
     * Returns the type string of this part: "s" when sending or "r"
     * otherwise, followed by "d" for a media part, "m" for a part with me
     * text, or nothing.
     */
    public String getType(boolean isSend) {
      String direction = isSend ? "s" : "r";
      return direction + getPartType();
    }

    /** Returns the suffix of the type string; media wins over me text. */
    private String getPartType() {
      if (isMedia()) {
        return "d";
      }
      if (meText != null) {
        return "m";
      }
      return "";
    }

    /** Returns true if this part holds exactly one token and it is media. */
    public boolean isMedia() {
      if (tokens.size() != 1) {
        return false;
      }
      return tokens.get(0).isMedia();
    }

    /**
     * Convenience method for getting the Token of a Part that represents
     * a media Token. Parts of this kind will always only have a single Token
     *
     * @return if this.isMedia(),
     *         returns the Token representing the media contained in this Part,
     *         otherwise returns null;
     */
    public Token getMediaToken() {
      return isMedia() ? tokens.get(0) : null;
    }

    /**
     * Adds the given token to this part.
     *
     * @throws AssertionError if this part already is a media part
     */
    public void add(Token token) {
      if (isMedia()) {
        throw new AssertionError("media ");
      }
      tokens.add(token);
    }

    /** Sets the me text of this part. */
    public void setMeText(String meText) {
      this.meText = meText;
    }

    /**
     * Returns the original text of this part: the me text, if any, followed
     * by the raw text of every token.
     */
    public String getRawText() {
      StringBuilder raw = new StringBuilder();
      if (meText != null) {
        raw.append(meText);
      }
      for (Token token : tokens) {
        raw.append(token.getRawText());
      }
      return raw.toString();
    }

    /** Returns the tokens in this part. */
    public ArrayList<Token> getTokens() { return tokens; }

    /** Adds the tokens into the given builder as an array. */
    //  public void toArray(JSArrayBuilder array) {
    //    if (isMedia()) {
    //      // For media, we send its array (i.e., we don't wrap this in another
    //      // array as we do for non-media parts).
    //      tokens.get(0).toArray(array);
    //    } else {
    //      array.beginArray();
    //      addToArray(array);
    //      array.endArray();
    //    }
    //  }
  }
}