#!/usr/bin/env python
"""Generate Java regular-expression constants matching the IANA TLD list.

The script downloads the IANA list of top-level domains, groups the names by
their first letter and prints two Java string constants to standard output:
TOP_LEVEL_DOMAIN_STR and TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL.

It runs unchanged on Python 2 and Python 3.
"""

try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

# NOTE(review): the whitespace inside the templates below and in TAB only
# affects the layout of the generated Java source -- it always ends up
# outside the Java string literals.  Confirm that the indentation matches
# the style of the file the output is pasted into.
# NOTE: the "List accurate as of" date is a hard-coded literal; it is not
# derived from the downloaded list.
TLD_PREFIX = r"""
    /**
     * Regular expression to match all IANA top-level domains.
     * List accurate as of 2011/07/18. List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
TLD_SUFFIX = '";'

URL_PREFIX = r"""
    /**
     * Regular expression to match all IANA top-level domains for WEB_URL.
     * List accurate as of 2011/07/18. List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""

URL_SUFFIX = ';'
TAB = '        '


class BucketOutput(object):
    """Accumulates the Java source text emitted for one bucket.

    Tracks the length of the current output line so that long alternations
    can be wrapped into several concatenated Java string literals.
    """

    def __init__(self):
        self.buffer = TAB
        self.lineLength = len(TAB)

    def __iadd__(self, other):
        """Append `other` to the buffer and count it towards the line length."""
        self.buffer += other
        self.lineLength += len(other)
        return self

    def addPipe(self):
        """Append a '|' separator, first wrapping the line if it is too long.

        Wrapping closes the current Java string literal, starts a new line and
        reopens a literal with the '+' concatenation operator.
        """
        if self.lineLength > 90:
            self.buffer += '"\n'
            self.buffer += TAB
            self.buffer += '+ "'
            self.lineLength = len(TAB)

        self += '|'

    def value(self):
        """Return the text accumulated so far."""
        return self.buffer


class Bucket(object):
    """All TLDs sharing one first letter.

    Two-character names are stored in `letters` as their second character
    only, so they can be emitted as a character class such as ``a[cd]``.
    Every other name is stored in full in `words`.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return the Java source line(s) for this bucket.

        isWebUrl -- emit non-capturing groups, ``(?:...)``, instead of ``(...)``.
        isFirst  -- this is the first bucket of the pattern: no leading '|'.
        isLast   -- this is the last bucket: the Java string literal is left
                    open so that the caller can append the closing text.

        Returns '' when the bucket is empty.  Sorts `words` and `letters`
        in place.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        output = BucketOutput()

        if not isFirst:
            output += '+ "|'
        elif isWebUrl:
            output += '+ "'
        else:
            output += '"('

        if self.words:
            output += '(?:' if isWebUrl else '('

            for index, word in enumerate(self.words):
                if index:
                    output.addPipe()
                # Two backslashes in the Java source become a single one in
                # the Java string, i.e. the regex sees an escaped "\-".
                output += word.replace('-', '\\\\-')

            if self.letters:
                output.addPipe()

        if len(self.letters) == 1:
            output += self.base + self.letters[0]
        elif self.letters:
            output += '%s[%s]' % (self.base, ''.join(self.letters))

        if self.words:
            output += ')'

        if not isLast:
            output += '"\n'

        return output.value()

    def add(self, line):
        """Record one TLD; empty lines and '#' comment lines are ignored."""
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1:2])
        else:
            self.words.append(line)


def getBucket(buckets, line):
    """Return the bucket for the first character of `line`, creating it if needed."""
    letter = line[0]
    bucket = buckets.get(letter)

    if bucket is None:
        bucket = Bucket(letter)
        buckets[letter] = bucket

    return bucket


def _javaEscape(text):
    r"""Return `text` with every non-ASCII character written as \uXXXX.

    Java source only understands the \uXXXX form, so characters beyond the
    Basic Multilingual Plane are written as a UTF-16 surrogate pair.
    """
    pieces = []
    for char in text:
        code = ord(char)
        if code < 0x80:
            pieces.append(str(char))
        elif code > 0xFFFF:
            code -= 0x10000
            pieces.append('\\u%04x\\u%04x'
                          % (0xD800 + (code >> 10), 0xDC00 + (code & 0x3FF)))
        else:
            pieces.append('\\u%04x' % code)
    return ''.join(pieces)


def buildBuckets(lines):
    """Group the TLDs found in `lines` into a dict of Bucket objects.

    `lines` is an iterable of byte or text strings, one entry per line of the
    IANA list.  Blank lines and '#' comments are skipped and names are
    lower-cased.  For every punycode ("xn--") name the decoded Unicode form is
    added as well, escaped for use in Java source.
    """
    buckets = {}

    for raw in lines:
        if isinstance(raw, bytes):
            raw = raw.decode('ascii')

        domain = raw.strip().lower()
        if not domain or domain.startswith('#'):
            continue

        bucket = getBucket(buckets, domain)
        bucket.add(domain)

        if domain.startswith('xn--'):
            decoded = domain[4:].encode('ascii').decode('punycode')
            # The decoded name is kept in the same ('x') bucket as its
            # punycode form.
            bucket.add(_javaEscape(decoded))

    return buckets


def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print the Java declaration built from `buckets` and return its text.

    prefix, suffix -- text emitted before and after the generated pattern.
    buckets        -- dict mapping a first letter to its Bucket; buckets
                      missing for any letter 'a'..'z' are created.
    isWebUrl       -- generate the WEB_URL variant (non-capturing groups).

    The first and last *non-empty* buckets open and close the pattern, so the
    result stays well formed even when no TLD starts with 'a' or 'z'.
    """
    letters = [chr(code) for code in range(ord('a'), ord('z') + 1)]
    candidates = [getBucket(buckets, letter) for letter in letters]
    populated = [b for b in candidates if b.words or b.letters]

    pieces = [prefix]
    lastIndex = len(populated) - 1
    for index, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(index == 0),
                                  isLast=(index == lastIndex)))

    pieces.append('))"' if isWebUrl else ')')
    pieces.append(suffix)

    output = ''.join(pieces)
    print(output)
    return output


def main():
    """Download the IANA TLD list and print both Java pattern constants."""
    response = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = response.readlines()
    finally:
        response.close()

    buckets = buildBuckets(domains)

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)


if __name__ == "__main__":
    main()