/*
 * Copyright (C) 2007 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package android.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Commonly used regular expression patterns.
 *
 * <p>All members are static; the class cannot be instantiated.
 */
public class Patterns {
    /**
     * Regular expression to match all IANA top-level domains.
     * List accurate as of 2011/07/18.  List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     *
     * <p>The alternatives are wrapped in capturing groups; see
     * {@link #TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL} for the non-capturing variant.
     *
     * @deprecated Due to the recent proliferation of gTLDs, this API is
     * expected to become out-of-date very quickly. Therefore it is now
     * deprecated.
     */
    @Deprecated
    public static final String TOP_LEVEL_DOMAIN_STR =
        "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
        + "|(biz|b[abdefghijmnorstvwyz])"
        + "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
        + "|d[ejkmoz]"
        + "|(edu|e[cegrstu])"
        + "|f[ijkmor]"
        + "|(gov|g[abdefghilmnpqrstuwy])"
        + "|h[kmnrtu]"
        + "|(info|int|i[delmnoqrst])"
        + "|(jobs|j[emop])"
        + "|k[eghimnprwyz]"
        + "|l[abcikrstuvy]"
        + "|(mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
        + "|(name|net|n[acefgilopruz])"
        + "|(org|om)"
        + "|(pro|p[aefghklmnrstwy])"
        + "|qa"
        + "|r[eosuw]"
        + "|s[abcdeghijklmnortuvyz]"
        + "|(tel|travel|t[cdfghjklmnoprtvwz])"
        + "|u[agksyz]"
        + "|v[aceginu]"
        + "|w[fs]"
        // Internationalized TLDs: native-script forms first, then their punycode (xn--) forms.
        + "|(\u03b4\u03bf\u03ba\u03b9\u03bc\u03ae|\u0438\u0441\u043f\u044b\u0442\u0430\u043d\u0438\u0435|\u0440\u0444|\u0441\u0440\u0431|\u05d8\u05e2\u05e1\u05d8|\u0622\u0632\u0645\u0627\u06cc\u0634\u06cc|\u0625\u062e\u062a\u0628\u0627\u0631|\u0627\u0644\u0627\u0631\u062f\u0646|\u0627\u0644\u062c\u0632\u0627\u0626\u0631|\u0627\u0644\u0633\u0639\u0648\u062f\u064a\u0629|\u0627\u0644\u0645\u063a\u0631\u0628|\u0627\u0645\u0627\u0631\u0627\u062a|\u0628\u06be\u0627\u0631\u062a|\u062a\u0648\u0646\u0633|\u0633\u0648\u0631\u064a\u0629|\u0641\u0644\u0633\u0637\u064a\u0646|\u0642\u0637\u0631|\u0645\u0635\u0631|\u092a\u0930\u0940\u0915\u094d\u0937\u093e|\u092d\u093e\u0930\u0924|\u09ad\u09be\u09b0\u09a4|\u0a2d\u0a3e\u0a30\u0a24|\u0aad\u0abe\u0ab0\u0aa4|\u0b87\u0ba8\u0bcd\u0ba4\u0bbf\u0baf\u0bbe|\u0b87\u0bb2\u0b99\u0bcd\u0b95\u0bc8|\u0b9a\u0bbf\u0b99\u0bcd\u0b95\u0baa\u0bcd\u0baa\u0bc2\u0bb0\u0bcd|\u0baa\u0bb0\u0bbf\u0b9f\u0bcd\u0b9a\u0bc8|\u0c2d\u0c3e\u0c30\u0c24\u0c4d|\u0dbd\u0d82\u0d9a\u0dcf|\u0e44\u0e17\u0e22|\u30c6\u30b9\u30c8|\u4e2d\u56fd|\u4e2d\u570b|\u53f0\u6e7e|\u53f0\u7063|\u65b0\u52a0\u5761|\u6d4b\u8bd5|\u6e2c\u8a66|\u9999\u6e2f|\ud14c\uc2a4\ud2b8|\ud55c\uad6d|xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-3e0b707e|xn\\-\\-45brj9c|xn\\-\\-80akhbyknj4f|xn\\-\\-90a3ac|xn\\-\\-9t4b11yi5a|xn\\-\\-clchc0ea0b2g2a9gcd|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fpcrj9c3d|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-gecrj9c|xn\\-\\-h2brj9c|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-lgbbat1ad8j|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgbbh1a71e|xn\\-\\-mgbc0a9azcg|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-ogbpf8fl|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-s9brj9c|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-xkc2dl3a5ee0h|xn\\-\\-yfro4i67o|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah|xxx)"
        + "|y[et]"
        + "|z[amw])";

    /**
     * Regular expression pattern to match all IANA top-level domains.
     * Compiled form of {@link #TOP_LEVEL_DOMAIN_STR}.
     *
     * @deprecated This API is deprecated. See {@link #TOP_LEVEL_DOMAIN_STR}.
     */
    @Deprecated
    public static final Pattern TOP_LEVEL_DOMAIN =
        Pattern.compile(TOP_LEVEL_DOMAIN_STR);

    /**
     * Regular expression to match all IANA top-level domains for WEB_URL.
     * List accurate as of 2011/07/18.  List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     *
     * <p>Same alternatives as {@link #TOP_LEVEL_DOMAIN_STR}, but every group
     * is non-capturing ({@code (?:...)}), so embedding this string in a larger
     * expression does not shift that expression's group numbering.
     *
     * @deprecated This API is deprecated. See {@link #TOP_LEVEL_DOMAIN_STR}.
     */
    @Deprecated
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
        + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
        + "|(?:biz|b[abdefghijmnorstvwyz])"
        + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
        + "|d[ejkmoz]"
        + "|(?:edu|e[cegrstu])"
        + "|f[ijkmor]"
        + "|(?:gov|g[abdefghilmnpqrstuwy])"
        + "|h[kmnrtu]"
        + "|(?:info|int|i[delmnoqrst])"
        + "|(?:jobs|j[emop])"
        + "|k[eghimnprwyz]"
        + "|l[abcikrstuvy]"
        + "|(?:mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
        + "|(?:name|net|n[acefgilopruz])"
        + "|(?:org|om)"
        + "|(?:pro|p[aefghklmnrstwy])"
        + "|qa"
        + "|r[eosuw]"
        + "|s[abcdeghijklmnortuvyz]"
        + "|(?:tel|travel|t[cdfghjklmnoprtvwz])"
        + "|u[agksyz]"
        + "|v[aceginu]"
        + "|w[fs]"
        // Internationalized TLDs: native-script forms first, then their punycode (xn--) forms.
        + "|(?:\u03b4\u03bf\u03ba\u03b9\u03bc\u03ae|\u0438\u0441\u043f\u044b\u0442\u0430\u043d\u0438\u0435|\u0440\u0444|\u0441\u0440\u0431|\u05d8\u05e2\u05e1\u05d8|\u0622\u0632\u0645\u0627\u06cc\u0634\u06cc|\u0625\u062e\u062a\u0628\u0627\u0631|\u0627\u0644\u0627\u0631\u062f\u0646|\u0627\u0644\u062c\u0632\u0627\u0626\u0631|\u0627\u0644\u0633\u0639\u0648\u062f\u064a\u0629|\u0627\u0644\u0645\u063a\u0631\u0628|\u0627\u0645\u0627\u0631\u0627\u062a|\u0628\u06be\u0627\u0631\u062a|\u062a\u0648\u0646\u0633|\u0633\u0648\u0631\u064a\u0629|\u0641\u0644\u0633\u0637\u064a\u0646|\u0642\u0637\u0631|\u0645\u0635\u0631|\u092a\u0930\u0940\u0915\u094d\u0937\u093e|\u092d\u093e\u0930\u0924|\u09ad\u09be\u09b0\u09a4|\u0a2d\u0a3e\u0a30\u0a24|\u0aad\u0abe\u0ab0\u0aa4|\u0b87\u0ba8\u0bcd\u0ba4\u0bbf\u0baf\u0bbe|\u0b87\u0bb2\u0b99\u0bcd\u0b95\u0bc8|\u0b9a\u0bbf\u0b99\u0bcd\u0b95\u0baa\u0bcd\u0baa\u0bc2\u0bb0\u0bcd|\u0baa\u0bb0\u0bbf\u0b9f\u0bcd\u0b9a\u0bc8|\u0c2d\u0c3e\u0c30\u0c24\u0c4d|\u0dbd\u0d82\u0d9a\u0dcf|\u0e44\u0e17\u0e22|\u30c6\u30b9\u30c8|\u4e2d\u56fd|\u4e2d\u570b|\u53f0\u6e7e|\u53f0\u7063|\u65b0\u52a0\u5761|\u6d4b\u8bd5|\u6e2c\u8a66|\u9999\u6e2f|\ud14c\uc2a4\ud2b8|\ud55c\uad6d|xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-3e0b707e|xn\\-\\-45brj9c|xn\\-\\-80akhbyknj4f|xn\\-\\-90a3ac|xn\\-\\-9t4b11yi5a|xn\\-\\-clchc0ea0b2g2a9gcd|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fpcrj9c3d|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-gecrj9c|xn\\-\\-h2brj9c|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-lgbbat1ad8j|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgbbh1a71e|xn\\-\\-mgbc0a9azcg|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-ogbpf8fl|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-s9brj9c|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-xkc2dl3a5ee0h|xn\\-\\-yfro4i67o|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah|xxx)"
        + "|y[et]"
        + "|z[amw]))";

    /**
     * Good characters for Internationalized Resource Identifiers (IRI).
     * This comprises the most commonly used Unicode characters allowed in IRI
     * as detailed in RFC 3987.
     *
     * <p>The value is the <em>contents</em> of a character class (no
     * surrounding brackets): ASCII letters and digits plus the ranges
     * U+00A0..U+D7FF, U+F900..U+FDCF and U+FDF0..U+FFEF. Code points from
     * U+D800 to U+F8FF are therefore not included, which means characters that
     * need a surrogate pair (two UTF-16 code units) cannot match.
     */
    public static final String GOOD_IRI_CHAR =
        "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";

    /**
     * Pattern for a dotted-quad IPv4 address.
     *
     * <p>Each octet is limited to the range 0-255. Three-digit forms starting
     * with 0 or 1 (for example {@code 001}) are accepted; the first octet may
     * not be a single {@code 0}. The whole address is capture group 1 and the
     * four octets are groups 2-5.
     */
    public static final Pattern IP_ADDRESS
        = Pattern.compile(
            "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]"
            + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]"
            + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
            + "|[1-9][0-9]|[0-9]))");

    /**
     * A single host-name label: starts and ends with a {@link #GOOD_IRI_CHAR},
     * with hyphens allowed only in between. The quantifiers cap the label at
     * 1 + 61 + 1 = 63 characters, following RFC 1035 Section 2.3.4, which
     * limits labels to a maximum of 63 octets.
     */
    private static final String IRI
        = "[" + GOOD_IRI_CHAR + "]([" + GOOD_IRI_CHAR + "\\-]{0,61}[" + GOOD_IRI_CHAR + "]){0,1}";

    /** Character-class contents for a generic TLD: {@link #GOOD_IRI_CHAR} without the digits. */
    private static final String GOOD_GTLD_CHAR =
        "a-zA-Z\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";

    /** A generic TLD: 2 to 63 characters from {@link #GOOD_GTLD_CHAR}. */
    private static final String GTLD = "[" + GOOD_GTLD_CHAR + "]{2,63}";

    /** One or more dot-terminated labels followed by a TLD. */
    private static final String HOST_NAME = "(" + IRI + "\\.)+" + GTLD;

    /**
     * Pattern for a domain name: either a host name (labels followed by a
     * generic TLD) or an IPv4 address as matched by {@link #IP_ADDRESS}.
     */
    public static final Pattern DOMAIN_NAME
        // pattern() yields the regex source, exactly what implicit string
        // conversion of a Pattern produces; spelling it out makes that visible.
        = Pattern.compile("(" + HOST_NAME + "|" + IP_ADDRESS.pattern() + ")");

    /**
     *  Regular expression pattern to match most part of RFC 3987
     *  Internationalized URLs, aka IRIs.  Commonly used Unicode characters are
     *  added.
     *
     *  <p>Structure: an optional scheme ({@code http}, {@code https} or
     *  {@code rtsp}, optionally capitalized) with optional
     *  {@code user[:password]@} credentials, then a {@link #DOMAIN_NAME},
     *  an optional port, and an optional path/query/fragment part.
     */
    public static final Pattern WEB_URL = Pattern.compile(
        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
        + "(?:" + DOMAIN_NAME.pattern() + ")"
        + "(?:\\:\\d{1,5})?)" // plus optional port number
        + "(\\/(?:(?:[" + GOOD_IRI_CHAR + "\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus optional path and query params
        + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
        + "(?:\\b|$)"); // and finally, a word boundary or end of
                        // input.  This is to stop foo.sure from
                        // matching as foo.su

    /**
     * Pattern for things that look like e-mail addresses.
     *
     * <p>The local part is 1 to 256 characters drawn from ASCII letters,
     * digits and {@code + . _ % -}. After the {@code @} comes a first label of
     * up to 65 characters, followed by one or more dot-separated labels of up
     * to 26 characters each; labels start with a letter or digit and may
     * contain hyphens afterwards.
     */
    public static final Pattern EMAIL_ADDRESS
        = Pattern.compile(
            // "\\+" is listed twice in this class; the repeat is redundant but harmless
            // and is kept so the pattern text stays unchanged.
            "[a-zA-Z0-9\\+\\.\\_\\%\\-\\+]{1,256}" +
            "\\@" +
            "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" +
            "(" +
                "\\." +
                "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,25}" +
            ")+"
        );

    /**
     * This pattern is intended for searching for things that look like they
     * might be phone numbers in arbitrary text, not for validating whether
     * something is in fact a phone number.  It will miss many things that
     * are legitimate phone numbers.
     *
     * <p> The pattern matches the following:
     * <ul>
     * <li>Optionally, a + sign followed immediately by one or more digits. Spaces, dots, or dashes
     * may follow.
     * <li>Optionally, sets of digits in parentheses, separated by spaces, dots, or dashes.
     * <li>A string starting and ending with a digit, containing digits, spaces, dots, and/or dashes.
     * </ul>
     */
    public static final Pattern PHONE
        = Pattern.compile(                      // sdd = space, dot, or dash
                "(\\+[0-9]+[\\- \\.]*)?"        // +<digits><sdd>*
                + "(\\([0-9]+\\)[\\- \\.]*)?"   // (<digits>)<sdd>*
                + "([0-9][0-9\\- \\.]+[0-9])"); // <digit><digit|sdd>+<digit>

    /**
     *  Convenience method to take all of the non-null matching groups in a
     *  regex Matcher and return them as a concatenated string.
     *
     *  <p>Only the numbered capture groups (1 through
     *  {@link Matcher#groupCount()}) are used; group 0, the whole match, is
     *  not appended. Groups that did not participate in the match are skipped.
     *
     *  @param matcher      The Matcher object from which grouped text will
     *                      be extracted; it must already hold a successful match
     *
     *  @return             A String comprising all of the non-null matched
     *                      groups concatenated together
     */
    public static final String concatGroups(Matcher matcher) {
        StringBuilder b = new StringBuilder();
        final int numGroups = matcher.groupCount();

        // Group indices are 1-based; index 0 would be the entire match.
        for (int i = 1; i <= numGroups; i++) {
            String s = matcher.group(i);

            if (s != null) {
                b.append(s);
            }
        }

        return b.toString();
    }

    /**
     * Convenience method to return only the digits and plus signs
     * in the matching string.
     *
     * <p>Works on the whole current match ({@link Matcher#group()}) and keeps
     * every character for which {@link Character#isDigit(char)} is true, plus
     * any {@code '+'}; everything else is dropped.
     *
     * @param matcher      The Matcher object from which digits and plus will
     *                     be extracted; it must already hold a successful match
     *
     * @return             A String comprising all of the digits and plus in
     *                     the match
     */
    public static final String digitsAndPlusOnly(Matcher matcher) {
        StringBuilder buffer = new StringBuilder();
        String matchingRegion = matcher.group();

        for (int i = 0, size = matchingRegion.length(); i < size; i++) {
            char character = matchingRegion.charAt(i);

            if (character == '+' || Character.isDigit(character)) {
                buffer.append(character);
            }
        }
        return buffer.toString();
    }

    /**
     * Do not create this static utility class.
     */
    private Patterns() {}
}