• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2007 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package android.util;
18 
19 import java.util.regex.Matcher;
20 import java.util.regex.Pattern;
21 
/**
 * Commonly used regular expression patterns, plus a few helpers for
 * post-processing the text captured by a {@link Matcher}.
 *
 * <p>This is a static utility class and cannot be instantiated.
 */
public class Patterns {
    /**
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of 2010/02/05.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     *
     *  <p>This variant uses capturing groups; see
     *  {@link #TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL} for the non-capturing form.
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
        "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
        + "|(biz|b[abdefghijmnorstvwyz])"
        + "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
        + "|d[ejkmoz]"
        + "|(edu|e[cegrstu])"
        + "|f[ijkmor]"
        + "|(gov|g[abdefghilmnpqrstuwy])"
        + "|h[kmnrtu]"
        + "|(info|int|i[delmnoqrst])"
        + "|(jobs|j[emop])"
        + "|k[eghimnprwyz]"
        + "|l[abcikrstuvy]"
        + "|(mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
        + "|(name|net|n[acefgilopruz])"
        + "|(org|om)"
        + "|(pro|p[aefghklmnrstwy])"
        + "|qa"
        + "|r[eosuw]"
        + "|s[abcdeghijklmnortuvyz]"
        + "|(tel|travel|t[cdfghjklmnoprtvwz])"
        + "|u[agksyz]"
        + "|v[aceginu]"
        + "|w[fs]"
        + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)"
        + "|y[etu]"
        + "|z[amw])";

    /**
     *  Regular expression pattern to match all IANA top-level domains.
     *  Compiled from {@link #TOP_LEVEL_DOMAIN_STR}.
     */
    public static final Pattern TOP_LEVEL_DOMAIN =
        Pattern.compile(TOP_LEVEL_DOMAIN_STR);

    /**
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of 2010/02/05.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     *
     *  <p>Same alternatives as {@link #TOP_LEVEL_DOMAIN_STR}, but every group
     *  is non-capturing so that embedding it in {@link #WEB_URL} does not
     *  add capture groups there.
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
        + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
        + "|(?:biz|b[abdefghijmnorstvwyz])"
        + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
        + "|d[ejkmoz]"
        + "|(?:edu|e[cegrstu])"
        + "|f[ijkmor]"
        + "|(?:gov|g[abdefghilmnpqrstuwy])"
        + "|h[kmnrtu]"
        + "|(?:info|int|i[delmnoqrst])"
        + "|(?:jobs|j[emop])"
        + "|k[eghimnprwyz]"
        + "|l[abcikrstuvy]"
        + "|(?:mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
        + "|(?:name|net|n[acefgilopruz])"
        + "|(?:org|om)"
        + "|(?:pro|p[aefghklmnrstwy])"
        + "|qa"
        + "|r[eosuw]"
        + "|s[abcdeghijklmnortuvyz]"
        + "|(?:tel|travel|t[cdfghjklmnoprtvwz])"
        + "|u[agksyz]"
        + "|v[aceginu]"
        + "|w[fs]"
        + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)"
        + "|y[etu]"
        + "|z[amw]))";

    /**
     * Good characters for Internationalized Resource Identifiers (IRI).
     * This comprises the most commonly used Unicode characters allowed in IRI
     * as detailed in RFC 3987.
     *
     * <p>The value is the <em>contents</em> of a regex character class (no
     * surrounding brackets): ASCII letters and digits plus the ranges
     * U+00A0 to U+D7FF, U+F900 to U+FDCF and U+FDF0 to U+FFEF.  Characters
     * that need two UTF-16 code units (a surrogate pair) are not included.
     */
    public static final String GOOD_IRI_CHAR =
        "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";

    /**
     *  Regular expression pattern to match most part of RFC 3987
     *  Internationalized URLs, aka IRIs.  Commonly used Unicode characters are
     *  added.
     *
     *  <p>Structure: an optional scheme (http, https or rtsp) with optional
     *  user-info, then a named host ending in a known top-level domain or a
     *  dotted-quad IP address, an optional port, and an optional path/query.
     */
    public static final Pattern WEB_URL = Pattern.compile(
        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
        + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+"   // named host
        + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL
        + "|(?:(?:25[0-5]|2[0-4]" // or ip address
        + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
        + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
        + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
        + "|[1-9][0-9]|[0-9])))"
        + "(?:\\:\\d{1,5})?)" // plus optional port number
        + "(\\/(?:(?:[" + GOOD_IRI_CHAR + "\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus optional query params
        + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
        + "(?:\\b|$)"); // and finally, a word boundary or end of
                        // input.  This is to stop foo.sure from
                        // matching as foo.su

    /**
     * Regular expression pattern to match a dotted-quad IPv4 address whose
     * four components are each in the range 0-255.
     */
    public static final Pattern IP_ADDRESS
        = Pattern.compile(
            "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]"
            + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]"
            + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
            + "|[1-9][0-9]|[0-9]))");

    /**
     * Regular expression pattern to match a host: either one or more
     * dot-terminated labels built from {@link #GOOD_IRI_CHAR} (hyphens allowed
     * inside a label) followed by a {@link #TOP_LEVEL_DOMAIN}, or an
     * {@link #IP_ADDRESS}.
     */
    public static final Pattern DOMAIN_NAME
        = Pattern.compile(
            "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+"
            + TOP_LEVEL_DOMAIN + ")|"
            + IP_ADDRESS + ")");

    /**
     * Regular expression pattern to match things that look like e-mail
     * addresses: a local part of 1 to 256 characters (ASCII letters, digits
     * and {@code + . _ % -}), an {@code @}, a host label, and one or more
     * further dot-separated labels.  The domain is not checked against the
     * top-level domain list.
     */
    public static final Pattern EMAIL_ADDRESS
        = Pattern.compile(
            "[a-zA-Z0-9\\+\\.\\_\\%\\-\\+]{1,256}" +
            "\\@" +
            "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" +
            "(" +
                "\\." +
                "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,25}" +
            ")+"
        );

    /**
     * This pattern is intended for searching for things that look like they
     * might be phone numbers in arbitrary text, not for validating whether
     * something is in fact a phone number.  It will miss many things that
     * are legitimate phone numbers.
     *
     * <p> The pattern matches the following:
     * <ul>
     * <li>Optionally, a + sign followed immediately by one or more digits. Spaces, dots, or dashes
     * may follow.
     * <li>Optionally, sets of digits in parentheses, separated by spaces, dots, or dashes.
     * <li>A string starting and ending with a digit, containing digits, spaces, dots, and/or dashes.
     * </ul>
     */
    public static final Pattern PHONE
        = Pattern.compile(                                  // sdd = space, dot, or dash
                "(\\+[0-9]+[\\- \\.]*)?"                    // +<digits><sdd>*
                + "(\\([0-9]+\\)[\\- \\.]*)?"               // (<digits>)<sdd>*
                + "([0-9][0-9\\- \\.][0-9\\- \\.]+[0-9])"); // <digit><digit|sdd>+<digit>

    /**
     *  Convenience method to take all of the non-null matching groups in a
     *  regex Matcher and return them as a concatenated string.
     *
     *  <p>Only the numbered capturing groups (1 through
     *  {@code matcher.groupCount()}) are used; group 0, the whole match, is
     *  not.  The method produces no output other than its return value.
     *
     *  @param matcher      The Matcher object from which grouped text will
     *                      be extracted
     *
     *  @return             A String comprising all of the non-null matched
     *                      groups concatenated together
     *
     *  @throws IllegalStateException if the matcher has not yet attempted a
     *                      match, or its previous match operation failed
     */
    public static final String concatGroups(Matcher matcher) {
        StringBuilder b = new StringBuilder();
        final int numGroups = matcher.groupCount();

        for (int i = 1; i <= numGroups; i++) {
            String s = matcher.group(i);

            // A group that did not participate in the match is reported as
            // null; leave it out rather than appending the text "null".
            if (s != null) {
                b.append(s);
            }
        }

        return b.toString();
    }

    /**
     * Convenience method to return only the digits and plus signs
     * in the matching string.
     *
     * @param matcher      The Matcher object from which digits and plus will
     *                     be extracted
     *
     * @return             A String comprising all of the digits and plus in
     *                     the match
     *
     * @throws IllegalStateException if the matcher has not yet attempted a
     *                     match, or its previous match operation failed
     */
    public static final String digitsAndPlusOnly(Matcher matcher) {
        StringBuilder buffer = new StringBuilder();
        String matchingRegion = matcher.group();

        for (int i = 0, size = matchingRegion.length(); i < size; i++) {
            char character = matchingRegion.charAt(i);

            if (character == '+' || Character.isDigit(character)) {
                buffer.append(character);
            }
        }
        return buffer.toString();
    }

    /**
     * Do not create this static utility class.
     */
    private Patterns() {}
}
233