#!/usr/bin/env python
r"""Generate Java string constants that match every IANA top-level domain.

The script downloads the IANA TLD list, groups the entries by their first
character and prints two Java declarations: TOP_LEVEL_DOMAIN_STR and
TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL.  Internationalized entries ("xn--...")
are emitted twice: once in their ASCII form and once decoded, with the
non-ASCII characters written as Java \uXXXX escapes.

Runs on both Python 2 and Python 3.
"""

from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2

TLD_LIST_URL = 'http://data.iana.org/TLD/tlds-alpha-by-domain.txt'

# Prefix that marks an internationalized (punycode-encoded) entry.
IDN_PREFIX = 'xn--'

# Leading whitespace of every generated pattern line.  Purely cosmetic in the
# emitted Java source.
LINE_INDENT = '        '

# NOTE: the "List accurate as of" date below is a fixed literal; it is not
# derived from the downloaded list and has to be updated by hand.
TLD_PREFIX = r"""
    /**
     * Regular expression to match all IANA top-level domains.
     * List accurate as of 2011/07/18. List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
TLD_SUFFIX = '";'

URL_PREFIX = r"""
    /**
     * Regular expression to match all IANA top-level domains for WEB_URL.
     * List accurate as of 2011/07/18. List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""
URL_SUFFIX = ';'


class Bucket(object):
    """All domains that share one leading character.

    Two-character domains are stored as their second character only (in
    ``letters``) so they can be emitted as a compact character class such as
    ``a[cdef]``; every other domain is stored whole in ``words``.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return this bucket as one line of a Java string concatenation.

        Args:
            isWebUrl: use non-capturing ``(?:...)`` groups and the WEB_URL
                line opener instead of plain ``(...)`` groups.
            isFirst: this is the first emitted line, so it opens the
                expression instead of continuing it with ``+ "|``.
            isLast: this is the final emitted line; the closing quote and
                newline are left off so the caller can append the closing
                parentheses inside the same Java string literal.

        Returns:
            The generated line, or ``''`` when the bucket holds nothing.
            ``words`` and ``letters`` are sorted in place as a side effect.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        if not isFirst:
            opener = '+ "|'
        elif isWebUrl:
            opener = '+ "'
        else:
            opener = '"('

        # Each '-' is preceded by two backslash characters: the Java compiler
        # folds them into one, which then escapes the '-' for the regex engine.
        alternatives = [word.replace('-', '\\\\-') for word in self.words]

        if len(self.letters) == 1:
            alternatives.append(self.base + self.letters[0])
        elif self.letters:
            alternatives.append(
                '%s[%s]' % (self.base, ''.join(self.letters)))

        body = '|'.join(alternatives)
        if self.words:
            # A group is only emitted when whole words are present; a bucket
            # holding just two-letter domains is written bare (e.g. z[amw]).
            body = ('(?:' if isWebUrl else '(') + body + ')'

        line = LINE_INDENT + opener + body
        if not isLast:
            line += '"\n'
        return line

    def add(self, line):
        """Record one domain; empty strings and '#' comment lines are ignored."""
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1])
        else:
            self.words.append(line)


def getBucket(buckets, line):
    """Return the Bucket keyed by the first character of ``line``.

    The bucket is created and stored in ``buckets`` on first use.
    """
    letter = line[0]
    bucket = buckets.get(letter)
    if bucket is None:
        bucket = Bucket(letter)
        buckets[letter] = bucket
    return bucket


def javaUnicodeEscape(text):
    r"""Return ``text`` with every non-ASCII character as Java \uXXXX escapes.

    Characters outside the Basic Multilingual Plane become a surrogate pair,
    because a Java \u escape always carries exactly four hex digits.  ASCII
    characters are passed through untouched.
    """
    pieces = []
    for char in text:
        if ord(char) < 0x80:
            pieces.append(char)
            continue
        # bytearray yields ints on both Python 2 and Python 3.
        units = bytearray(char.encode('utf-16-be'))
        for offset in range(0, len(units), 2):
            pieces.append(
                '\\u%02x%02x' % (units[offset], units[offset + 1]))
    return ''.join(pieces)


def addDomain(buckets, domain):
    """File one raw line of the IANA list into ``buckets``.

    The line is stripped and lower-cased first.  Blank lines and '#' comment
    lines are skipped.  An "xn--" entry is added twice to the same bucket:
    as written, and punycode-decoded with non-ASCII characters escaped for
    Java source.
    """
    domain = domain.strip().lower()
    if not domain or domain.startswith('#'):
        return

    bucket = getBucket(buckets, domain)
    bucket.add(domain)

    if domain.startswith(IDN_PREFIX):
        label = domain[len(IDN_PREFIX):]
        decoded = label.encode('ascii').decode('punycode')
        bucket.add(javaUnicodeEscape(decoded))


def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print one complete Java declaration and return the printed text.

    Buckets 'a' through 'z' are emitted in order; empty or missing ones are
    skipped.  The first and last *populated* buckets open and close the
    expression, so the result stays well-formed even when no domain starts
    with 'a' or 'z'.

    Args:
        prefix: text emitted before the pattern lines.
        suffix: text emitted after the closing parentheses.
        buckets: mapping of leading character to Bucket.
        isWebUrl: generate the WEB_URL flavour of the pattern.

    Returns:
        The generated text, exactly as printed (without print's newline).

    Raises:
        ValueError: if no bucket from 'a' to 'z' contains a domain.
    """
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = buckets.get(chr(code))
        if bucket is not None and (bucket.words or bucket.letters):
            populated.append(bucket)

    if not populated:
        raise ValueError('no top-level domains to emit')

    lastIndex = len(populated) - 1
    parts = [prefix]
    for index, bucket in enumerate(populated):
        parts.append(bucket.dump(isWebUrl=isWebUrl,
                                 isFirst=(index == 0),
                                 isLast=(index == lastIndex)))

    # NOTE(review): the WEB_URL flavour closes one more group than this
    # script opens (URL_PREFIX opens a single "(?:").  Presumably the code
    # that embeds TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL opens the other one --
    # confirm before "balancing" this.
    parts.append('))"' if isWebUrl else ')')
    parts.append(suffix)

    output = ''.join(parts)
    print(output)
    return output


def fetchTldLines(url=TLD_LIST_URL):
    """Download the TLD list from ``url`` and return its lines as text."""
    # closing() guarantees the connection is released even if read() raises,
    # and works with Python 2 responses that are not context managers.
    with closing(urlopen(url)) as response:
        payload = response.read()
    return payload.decode('ascii').splitlines()


def main():
    """Fetch the IANA list and print both Java declarations."""
    buckets = {}
    for line in fetchTldLines():
        addDomain(buckets, line)

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)


if __name__ == "__main__":
    main()