1 /* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package android.util; 18 19 import java.util.regex.Matcher; 20 import java.util.regex.Pattern; 21 22 /** 23 * Commonly used regular expression patterns. 24 */ 25 public class Patterns { 26 /** 27 * Regular expression to match all IANA top-level domains. 28 * List accurate as of 2010/02/05. List taken from: 29 * http://data.iana.org/TLD/tlds-alpha-by-domain.txt 30 * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py 31 */ 32 public static final String TOP_LEVEL_DOMAIN_STR = 33 "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])" 34 + "|(biz|b[abdefghijmnorstvwyz])" 35 + "|(cat|com|coop|c[acdfghiklmnoruvxyz])" 36 + "|d[ejkmoz]" 37 + "|(edu|e[cegrstu])" 38 + "|f[ijkmor]" 39 + "|(gov|g[abdefghilmnpqrstuwy])" 40 + "|h[kmnrtu]" 41 + "|(info|int|i[delmnoqrst])" 42 + "|(jobs|j[emop])" 43 + "|k[eghimnprwyz]" 44 + "|l[abcikrstuvy]" 45 + "|(mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])" 46 + "|(name|net|n[acefgilopruz])" 47 + "|(org|om)" 48 + "|(pro|p[aefghklmnrstwy])" 49 + "|qa" 50 + "|r[eosuw]" 51 + "|s[abcdeghijklmnortuvyz]" 52 + "|(tel|travel|t[cdfghjklmnoprtvwz])" 53 + "|u[agksyz]" 54 + "|v[aceginu]" 55 + "|w[fs]" 56 + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)" 57 + "|y[etu]" 58 + "|z[amw])"; 59 60 /** 61 * Regular expression pattern to match all IANA top-level domains. 62 */ 63 public static final Pattern TOP_LEVEL_DOMAIN = 64 Pattern.compile(TOP_LEVEL_DOMAIN_STR); 65 66 /** 67 * Regular expression to match all IANA top-level domains for WEB_URL. 68 * List accurate as of 2010/02/05. List taken from: 69 * http://data.iana.org/TLD/tlds-alpha-by-domain.txt 70 * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py 71 */ 72 public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = 73 "(?:" 74 + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])" 75 + "|(?:biz|b[abdefghijmnorstvwyz])" 76 + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])" 77 + "|d[ejkmoz]" 78 + "|(?:edu|e[cegrstu])" 79 + "|f[ijkmor]" 80 + "|(?:gov|g[abdefghilmnpqrstuwy])" 81 + "|h[kmnrtu]" 82 + "|(?:info|int|i[delmnoqrst])" 83 + "|(?:jobs|j[emop])" 84 + "|k[eghimnprwyz]" 85 + "|l[abcikrstuvy]" 86 + "|(?:mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])" 87 + "|(?:name|net|n[acefgilopruz])" 88 + "|(?:org|om)" 89 + "|(?:pro|p[aefghklmnrstwy])" 90 + "|qa" 91 + "|r[eosuw]" 92 + "|s[abcdeghijklmnortuvyz]" 93 + "|(?:tel|travel|t[cdfghjklmnoprtvwz])" 94 + "|u[agksyz]" 95 + "|v[aceginu]" 96 + "|w[fs]" 97 + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)" 98 + "|y[etu]" 99 + "|z[amw]))"; 100 101 /** 102 * Good characters for Internationalized Resource Identifiers (IRI). 103 * This comprises most common used Unicode characters allowed in IRI 104 * as detailed in RFC 3987. 105 * Specifically, those two byte Unicode characters are not included. 106 */ 107 public static final String GOOD_IRI_CHAR = 108 "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF"; 109 110 /** 111 * Regular expression pattern to match most part of RFC 3987 112 * Internationalized URLs, aka IRIs. Commonly used Unicode characters are 113 * added. 114 */ 115 public static final Pattern WEB_URL = Pattern.compile( 116 "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" 117 + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" 118 + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" 119 + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+" // named host 120 + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL 121 + "|(?:(?:25[0-5]|2[0-4]" // or ip address 122 + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" 123 + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" 124 + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" 125 + "|[1-9][0-9]|[0-9])))" 126 + "(?:\\:\\d{1,5})?)" // plus option port number 127 + "(\\/(?:(?:[" + GOOD_IRI_CHAR + "\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params 128 + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?" 129 + "(?:\\b|$)"); // and finally, a word boundary or end of 130 // input. This is to stop foo.sure from 131 // matching as foo.su 132 133 public static final Pattern IP_ADDRESS 134 = Pattern.compile( 135 "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]" 136 + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]" 137 + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" 138 + "|[1-9][0-9]|[0-9]))"); 139 140 public static final Pattern DOMAIN_NAME 141 = Pattern.compile( 142 "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+" 143 + TOP_LEVEL_DOMAIN + ")|" 144 + IP_ADDRESS + ")"); 145 146 public static final Pattern EMAIL_ADDRESS 147 = Pattern.compile( 148 "[a-zA-Z0-9\\+\\.\\_\\%\\-\\+]{1,256}" + 149 "\\@" + 150 "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" + 151 "(" + 152 "\\." + 153 "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,25}" + 154 ")+" 155 ); 156 157 /** 158 * This pattern is intended for searching for things that look like they 159 * might be phone numbers in arbitrary text, not for validating whether 160 * something is in fact a phone number. It will miss many things that 161 * are legitimate phone numbers. 162 * 163 * <p> The pattern matches the following: 164 * <ul> 165 * <li>Optionally, a + sign followed immediately by one or more digits. Spaces, dots, or dashes 166 * may follow. 167 * <li>Optionally, sets of digits in parentheses, separated by spaces, dots, or dashes. 168 * <li>A string starting and ending with a digit, containing digits, spaces, dots, and/or dashes. 169 * </ul> 170 */ 171 public static final Pattern PHONE 172 = Pattern.compile( // sdd = space, dot, or dash 173 "(\\+[0-9]+[\\- \\.]*)?" // +<digits><sdd>* 174 + "(\\([0-9]+\\)[\\- \\.]*)?" // (<digits>)<sdd>* 175 + "([0-9][0-9\\- \\.][0-9\\- \\.]+[0-9])"); // <digit><digit|sdd>+<digit> 176 177 /** 178 * Convenience method to take all of the non-null matching groups in a 179 * regex Matcher and return them as a concatenated string. 180 * 181 * @param matcher The Matcher object from which grouped text will 182 * be extracted 183 * 184 * @return A String comprising all of the non-null matched 185 * groups concatenated together 186 */ concatGroups(Matcher matcher)187 public static final String concatGroups(Matcher matcher) { 188 StringBuilder b = new StringBuilder(); 189 final int numGroups = matcher.groupCount(); 190 191 for (int i = 1; i <= numGroups; i++) { 192 String s = matcher.group(i); 193 194 System.err.println("Group(" + i + ") : " + s); 195 196 if (s != null) { 197 b.append(s); 198 } 199 } 200 201 return b.toString(); 202 } 203 204 /** 205 * Convenience method to return only the digits and plus signs 206 * in the matching string. 207 * 208 * @param matcher The Matcher object from which digits and plus will 209 * be extracted 210 * 211 * @return A String comprising all of the digits and plus in 212 * the match 213 */ digitsAndPlusOnly(Matcher matcher)214 public static final String digitsAndPlusOnly(Matcher matcher) { 215 StringBuilder buffer = new StringBuilder(); 216 String matchingRegion = matcher.group(); 217 218 for (int i = 0, size = matchingRegion.length(); i < size; i++) { 219 char character = matchingRegion.charAt(i); 220 221 if (character == '+' || Character.isDigit(character)) { 222 buffer.append(character); 223 } 224 } 225 return buffer.toString(); 226 } 227 228 /** 229 * Do not create this static utility class. 230 */ Patterns()231 private Patterns() {} 232 } 233