• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2
3from urllib2 import urlopen
4
# Text printed before the generated pattern for the plain TLD constant: a
# Javadoc-style comment followed by the opening of a Java
# `public static final String` declaration. The bucket dumps are appended
# directly after this text by makePattern().
# NOTE(review): the "List accurate as of 2011/07/18" date is fixed text, while
# the TLD list itself is downloaded every time the script runs -- update the
# date by hand when regenerating.
TLD_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of 2011/07/18.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
# Text printed after the plain TLD pattern: the closing double quote of the
# emitted string literal plus the statement-terminating semicolon.
TLD_SUFFIX = '";'

# Same role as TLD_PREFIX, for the WEB_URL variant of the constant. Besides
# the comment and declaration it already emits the first string fragment,
# "(?:", so the bucket dumps that follow are joined to it with '+'.
# NOTE(review): carries the same fixed 2011/07/18 date as TLD_PREFIX.
URL_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of 2011/07/18.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""

# Only the semicolon: for the WEB_URL variant makePattern() appends the
# closing '))"' itself, so the quote is not part of the suffix.
URL_SUFFIX = ';'
28
class Bucket:
    """Holds the TLDs that share one leading character and renders them.

    Two-character entries are stored as just their second character in
    `letters`; every other entry is stored whole in `words`. dump() turns
    both lists into one line of concatenated string-literal source text
    containing a regular-expression alternation.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter  # leading character shared by the entries
        self.words = []         # entries whose length is not exactly two
        self.letters = []       # second characters of two-character entries

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Render this bucket as one indented fragment of output text.

        Sorts `words` and `letters` in place before rendering.

        Args:
            isWebUrl: when true, the word group is opened with '(?:' rather
                than '(', and a first bucket starts with '+ "' rather
                than '"('.
            isFirst: when true, the fragment is not prefixed with '+ "|'.
            isLast: when true, the closing double quote and newline are
                left off so the caller can append its own ending.

        Returns:
            The fragment, or '' when the bucket holds no entries at all.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        grouped = len(self.words) != 0
        pieces = ['        ']

        # Opening of the string fragment.
        if not isFirst:
            pieces.append('+ "|')
        elif isWebUrl:
            pieces.append('+ "')
        else:
            pieces.append('"(')

        # Multi-character entries are wrapped in their own group.
        if grouped:
            pieces.append('(?:' if isWebUrl else '(')

        # Each '-' is preceded by two backslash characters in the output.
        pieces.append('|'.join(entry.replace('-', '\\\\-')
                               for entry in self.words))

        if grouped and self.letters:
            pieces.append('|')

        # One two-character entry is written out in full; several become
        # the base character followed by a character class.
        if len(self.letters) == 1:
            pieces.append('%c%c' % (self.base, self.letters[0]))
        elif self.letters:
            pieces.append('%c[' % self.base)
            pieces.append(''.join(self.letters))
            pieces.append(']')

        if grouped:
            pieces.append(')')

        if not isLast:
            pieces.append('"\n')

        return ''.join(pieces)

    def add(self, line):
        """Record one entry; empty strings and '#' lines are ignored."""
        size = len(line)

        if size == 0 or line.startswith('#'):
            return

        if size != 2:
            self.words.append(line)
        else:
            # Keep only the character after the base character.
            self.letters.append(line[1:2])
100
def getBucket(buckets, line):
    """Return the Bucket for the first character of `line`.

    `buckets` is a dict keyed by that single character. When no entry (or a
    None entry) is stored for it, a new Bucket is created, stored in the
    dict and returned.
    """
    key = line[0]
    existing = buckets.get(key)

    if existing is not None:
        return existing

    created = Bucket(key)
    buckets[key] = created
    return created
110
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Assemble one generated constant declaration and print it to stdout.

    The printed text is `prefix`, the dumps of the buckets for 'a' through
    'z' in alphabetical order, the closing parentheses, and `suffix`.

    Args:
        prefix: text emitted before the first bucket.
        suffix: text emitted after the closing parentheses.
        buckets: dict mapping a leading character to its Bucket. Buckets are
            fetched with getBucket(), so a letter with no entry gets an empty
            Bucket added to the dict (which dumps as '').
        isWebUrl: forwarded to every Bucket.dump() call; also selects the
            closing text.

    Returns:
        None. The assembled text is only printed.
    """
    pieces = [prefix]

    # 'a' is dumped as the first bucket so it opens the expression instead
    # of being joined with '|'.
    pieces.append(getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl))

    # range() excludes its upper bound, so this covers 'b' through 'y';
    # 'z' is handled separately below.
    for code in range(ord('b'), ord('z')):
        pieces.append(getBucket(buckets, chr(code)).dump(isWebUrl=isWebUrl))

    # The last bucket leaves its string literal unterminated so the closing
    # text can be appended inside the same literal.
    pieces.append(getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl))

    if isWebUrl:
        # NOTE(review): two ')' are emitted although the web-URL prefix opens
        # only one '(?:' -- presumably the extra one closes a group opened by
        # whatever consumes this constant; confirm before changing.
        pieces.append('))"')
    else:
        # The closing quote for this variant comes from `suffix`.
        pieces.append(')')

    pieces.append(suffix)

    output = ''.join(pieces)

    # Parenthesised form: prints the same single string under the Python 2
    # print statement and is also valid Python 3 syntax.
    print(output)
129
if __name__ == "__main__":
    # Download the current IANA TLD list (one entry per line).
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        # Close the connection even if the read fails part-way.
        f.close()

    # Maps a leading character to the Bucket collecting its TLDs.
    buckets = {}

    for domain in domains:
        # Normalise once: lower-case and drop the line ending / padding.
        domain = domain.lower().strip()

        # Skip blank lines and '#' comment lines up front so they do not
        # create buckets of their own (Bucket.add() would ignore them anyway).
        if not domain or domain.startswith('#'):
            continue

        getBucket(buckets, domain[0]).add(domain)

        if domain.startswith('xn--'):
            # Punycode-encoded TLD: besides the ASCII form added above, also
            # add the decoded name. repr() of the decoded unicode string has
            # the form u'...' with escapes for non-ASCII characters;
            # slicing off the leading u' and the trailing ' keeps just the
            # escaped text.
            puny = domain[4:]
            result = puny.decode('punycode')
            result = repr(result)
            # getBucket() keys on the first character only, so this is the
            # 'x' bucket.
            getBucket(buckets, 'xn--').add(result[2:-1])

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
151