diff options
author | PJ Eby <distutils-sig@python.org> | 2007-05-31 17:30:55 +0000 |
---|---|---|
committer | PJ Eby <distutils-sig@python.org> | 2007-05-31 17:30:55 +0000 |
commit | b364978eee6eaf2e03999ab0590a16278a03b13e (patch) | |
tree | 82cfe1b08ac83e76cb354191573a21beb4060695 /setuptools/package_index.py | |
parent | 89111e6143f3a9bb510433f529d4281681b7c66e (diff) | |
download | external_python_setuptools-b364978eee6eaf2e03999ab0590a16278a03b13e.tar.gz external_python_setuptools-b364978eee6eaf2e03999ab0590a16278a03b13e.tar.bz2 external_python_setuptools-b364978eee6eaf2e03999ab0590a16278a03b13e.zip |
Backport fixes and doc updates; prep for 0.6c6 release
--HG--
branch : setuptools-0.6
extra : convert_revision : svn%3A6015fed2-1504-0410-9fe1-9d1591cc4771/sandbox/branches/setuptools-0.6%4055712
Diffstat (limited to 'setuptools/package_index.py')
-rwxr-xr-x | setuptools/package_index.py | 48 |
1 files changed, 44 insertions, 4 deletions
diff --git a/setuptools/package_index.py b/setuptools/package_index.py index e4f96f0b..3da253a5 100755 --- a/setuptools/package_index.py +++ b/setuptools/package_index.py @@ -132,14 +132,14 @@ def find_external_links(url, page): rels = map(str.strip, rel.lower().split(',')) if 'homepage' in rels or 'download' in rels: for match in HREF.finditer(tag): - yield urlparse.urljoin(url, match.group(1)) + yield urlparse.urljoin(url, htmldecode(match.group(1))) for tag in ("<th>Home Page", "<th>Download URL"): pos = page.find(tag) if pos!=-1: match = HREF.search(page,pos) if match: - yield urlparse.urljoin(url, match.group(1)) + yield urlparse.urljoin(url, htmldecode(match.group(1))) user_agent = "Python-urllib/%s setuptools/%s" % ( urllib2.__version__, require('setuptools')[0].version @@ -200,7 +200,7 @@ class PackageIndex(Environment): if url.startswith(self.index_url) and getattr(f,'code',None)!=404: page = self.process_index(url, page) for match in HREF.finditer(page): - link = urlparse.urljoin(base, match.group(1)) + link = urlparse.urljoin(base, htmldecode(match.group(1))) self.process_url(link) def process_filename(self, fn, nested=False): @@ -262,7 +262,7 @@ class PackageIndex(Environment): # process an index page into the package-page index for match in HREF.finditer(page): - scan( urlparse.urljoin(url, match.group(1)) ) + scan( urlparse.urljoin(url, htmldecode(match.group(1))) ) pkg, ver = scan(url) # ensure this page is in the page index if pkg: @@ -611,6 +611,8 @@ class PackageIndex(Environment): self.url_ok(url, True) # raises error if not allowed return self._attempt_download(url, filename) + + def scan_url(self, url): self.process_url(url, True) @@ -652,6 +654,44 @@ class PackageIndex(Environment): def warn(self, msg, *args): log.warn(msg, *args) +# This pattern matches a character entity reference (a decimal numeric +# reference, a hexadecimal numeric reference, or a named reference). 
+entity_sub = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub + +def uchr(c): + if not isinstance(c, int): + return c + if c>255: return unichr(c) + return chr(c) + +def decode_entity(match): + what = match.group(1) + if what.startswith('#x'): + what = int(what[2:], 16) + elif what.startswith('#'): + what = int(what[1:]) + else: + from htmlentitydefs import name2codepoint + what = name2codepoint.get(what, match.group(0)) + return uchr(what) + +def htmldecode(text): + """Decode HTML entities in the given text.""" + return entity_sub(decode_entity, text) + + + + + + + + + + + + + + |