diff options
author | Dirley Rodrigues <dirleyrls@gmail.com> | 2013-02-04 11:30:58 -0200 |
---|---|---|
committer | Dirley Rodrigues <dirleyrls@gmail.com> | 2013-02-04 11:30:58 -0200 |
commit | e3207bd63bcf365a1f91b7c3e75a4b3354435501 (patch) | |
tree | 42942c5ca5bfca15527d3bc55ea0d1d24f10fbec /setuptools/package_index.py | |
parent | cf2a28328628a15a95ec354f8c3a4421d3652e31 (diff) | |
download | external_python_setuptools-e3207bd63bcf365a1f91b7c3e75a4b3354435501.tar.gz external_python_setuptools-e3207bd63bcf365a1f91b7c3e75a4b3354435501.tar.bz2 external_python_setuptools-e3207bd63bcf365a1f91b7c3e75a4b3354435501.zip |
Improve external links finder to not yield duplicate links.
--HG--
branch : distribute
extra : rebase_source : 78e932fca32ee0ee1f50794cf998f4e7db78131b
Diffstat (limited to 'setuptools/package_index.py')
-rwxr-xr-x | setuptools/package_index.py | 10 |
1 file changed, 8 insertions, 2 deletions
```diff
diff --git a/setuptools/package_index.py b/setuptools/package_index.py
index 0ee21e3b..4393c83a 100755
--- a/setuptools/package_index.py
+++ b/setuptools/package_index.py
@@ -139,20 +139,26 @@ REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
 
 def find_external_links(url, page):
     """Find rel="homepage" and rel="download" links in `page`, yielding URLs"""
+    seen = set()
 
     for match in REL.finditer(page):
         tag, rel = match.groups()
         rels = map(str.strip, rel.lower().split(','))
         if 'homepage' in rels or 'download' in rels:
             for match in HREF.finditer(tag):
-                yield urlparse.urljoin(url, htmldecode(match.group(1)))
+                url = urlparse.urljoin(url, htmldecode(match.group(1)))
+                if not url in seen:
+                    yield url
 
     for tag in ("<th>Home Page", "<th>Download URL"):
         pos = page.find(tag)
         if pos!=-1:
             match = HREF.search(page,pos)
             if match:
-                yield urlparse.urljoin(url, htmldecode(match.group(1)))
+                url = urlparse.urljoin(url, htmldecode(match.group(1)))
+                if not url in seen:
+                    yield url
+
 
 user_agent = "Python-urllib/%s distribute/%s" % (
     sys.version[:3], require('distribute')[0].version
```