• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2
3from urllib2 import urlopen
4
# Java source text printed ahead of the generated plain TLD pattern.  It ends
# right after the '=' of the declaration; the opening '"(' of the pattern is
# produced by Bucket.dump() for the first bucket.
TLD_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of 2011/07/18.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
# Closes the Java string literal left open by the last bucket and ends the
# statement.
TLD_SUFFIX = '";'

# Java source text printed ahead of the generated WEB_URL variant.  Unlike
# TLD_PREFIX it already contains the first string fragment, which opens a
# non-capturing group.
URL_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of 2011/07/18.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""

# Only the statement terminator: makePattern() itself appends the closing
# '))"' for the WEB_URL variant.
URL_SUFFIX = ';'
# Indentation placed at the start of every generated line of Java.
TAB = '        '
29
class BucketOutput:
    """Text accumulator for one bucket that also tracks the current line width.

    The buffer starts out holding the TAB indent.  Appending with ``+=``
    grows both the buffer and the running line length.  addPipe() inserts a
    '|' separator, first breaking onto a fresh continuation line when the
    current line has grown past 90 characters.
    """

    def __init__(self):
        self.buffer = TAB
        self.lineLength = len(TAB)

    def __iadd__(self, other):
        """Append *other* to the buffer and count it towards the line length."""
        self.buffer = self.buffer + other
        self.lineLength = self.lineLength + len(other)
        return self

    def addPipe(self):
        """Append a '|' separator, wrapping the line first if it is too long."""
        tooLong = self.lineLength > 90
        if tooLong:
            # Close the current Java string literal, start a new indented
            # line and reopen a literal with '+ "'.  The running length is
            # reset to the indent only; the '+ "' itself is not counted.
            self.buffer = ''.join([self.buffer, '"\n', TAB, '+ "'])
            self.lineLength = len(TAB)

        self.__iadd__('|')

    def value(self):
        """Return everything accumulated so far."""
        return self.buffer
51
class Bucket:
    """The TLDs that share one leading character.

    Two-character entries are stored by their second character only (in
    ``letters``) so they can be rendered as a character class such as
    ``a[cdef]``; every other entry is kept whole in ``words``.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Render this bucket as one line (or several wrapped lines) of Java.

        isWebUrl -- use non-capturing '(?:' groups and the WEB_URL opener.
        isFirst  -- this bucket opens the pattern, so no leading '|'.
        isLast   -- leave the string literal open for the caller to close.

        Returns '' when the bucket holds nothing.  Sorts ``words`` and
        ``letters`` in place as a side effect.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        hasWords = bool(self.words)
        hasLetters = bool(self.letters)
        out = BucketOutput()

        # Opening fragment of the Java string literal for this bucket.
        if not isFirst:
            out += '+ "|'
        elif isWebUrl:
            out += '+ "'
        else:
            out += '"('

        # Full words are wrapped in a group of their own.
        if hasWords:
            out += '('
            if isWebUrl:
                out += '?:'

        for position, word in enumerate(self.words):
            if position:
                out.addPipe()
            # Escape every '-' with two backslash characters in the output.
            out += word.replace('-', '\\\\-')

        if hasWords and hasLetters:
            out.addPipe()

        # Two-letter TLDs: a lone one is written out, several share a class.
        if len(self.letters) == 1:
            out += '%c%c' % (self.base, self.letters[0])
        elif hasLetters:
            out += '%c[' % self.base
            out += ''.join(self.letters)
            out += ']'

        if hasWords:
            out += ')'

        if not isLast:
            out += '"\n'

        return out.value()

    def add(self, line):
        """File one TLD into this bucket; empty and '#' comment lines are ignored."""
        size = len(line)

        if size == 0 or line.startswith('#'):
            return

        if size == 2:
            # The first character is implied by the bucket; keep the second.
            self.letters.append(line[1])
            return

        self.words.append(line)
123
def getBucket(buckets, line):
    """Return the Bucket for the first character of *line*, creating it on demand.

    *buckets* maps a single character to its Bucket; a newly created Bucket
    is stored in the mapping before being returned.
    """
    key = line[0]
    found = buckets.get(key)
    if found is not None:
        return found

    created = Bucket(key)
    buckets[key] = created
    return created
133
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print one generated Java pattern declaration to standard output.

    prefix   -- Java source text emitted before the pattern body.
    suffix   -- Java source text emitted after the closing parenthesis.
    buckets  -- mapping of leading character to Bucket; only 'a'..'z' are
                rendered, in alphabetical order.
    isWebUrl -- render the WEB_URL variant (non-capturing groups, and an
                extra ')' plus the closing quote before the suffix).

    The first and last *non-empty* buckets are the ones marked isFirst /
    isLast.  Tying those flags to the letters 'a' and 'z' would drop the
    opening of the pattern, or leave its last string literal closed before
    the final ')', whenever one of those two letters had no TLDs.

    Buckets are looked up without being created, so rendering never adds
    entries to *buckets*.
    """
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = buckets.get(chr(code))
        if bucket is not None and (bucket.words or bucket.letters):
            populated.append(bucket)

    pieces = [prefix]
    lastIndex = len(populated) - 1
    for index, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(index == 0),
                                  isLast=(index == lastIndex)))

    # The last bucket leaves its string literal open: close the group(s),
    # and for WEB_URL the literal too (the plain suffix supplies its own '"').
    pieces.append('))"' if isWebUrl else ')')
    pieces.append(suffix)

    # Single-argument call form: same output as the Python 2 print statement,
    # and also valid on Python 3.
    print(''.join(pieces))
152
if __name__ == "__main__":
    # Fetch the IANA TLD list.  The finally clause releases the connection
    # even if reading the response fails part-way through.
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        f.close()

    buckets = {}

    for domain in domains:
        # Normalise before choosing a bucket, so that surrounding whitespace
        # can never become a bucket key and blank lines are skipped outright.
        domain = domain.strip().lower()

        if not domain:
            continue

        # getBucket() keys on the first character.  '#' comment lines are
        # routed to a bucket too, but Bucket.add() discards them.
        getBucket(buckets, domain).add(domain)

        if domain.startswith('xn--'):
            # Punycode-encoded TLD: besides the 'xn--' form added above, also
            # add the decoded name.  Under Python 2, repr() of the decoded
            # unicode string yields u'...' with non-ASCII characters written
            # as backslash escapes; [2:-1] strips the u' and ' wrapper.
            puny = domain[4:]
            result = puny.decode('punycode')
            result = repr(result)
            # Keyed on the first character of 'xn--', i.e. the 'x' bucket.
            getBucket(buckets, 'xn--').add(result[2:-1])

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
174