author | PJ Eby <distutils-sig@python.org> | 2005-06-12 03:44:07 +0000 |
---|---|---|
committer | PJ Eby <distutils-sig@python.org> | 2005-06-12 03:44:07 +0000 |
commit | 8f64fbe5e5a016bd88f19d108e126be7c23e757a (patch) | |
tree | 2bec42958fb6ab9976b8cf48bfdeafab0fdb9d99 /setuptools | |
parent | 449f35743f5d0bc97ebfdba00bb8e7acc79465af (diff) | |
download | external_python_setuptools-8f64fbe5e5a016bd88f19d108e126be7c23e757a.tar.gz external_python_setuptools-8f64fbe5e5a016bd88f19d108e126be7c23e757a.tar.bz2 external_python_setuptools-8f64fbe5e5a016bd88f19d108e126be7c23e757a.zip | |
Move package index/downloading stuff to setuptools.package_index module.
--HG--
branch : setuptools
extra : convert_revision : svn%3A6015fed2-1504-0410-9fe1-9d1591cc4771/sandbox/trunk/setuptools%4041051
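The module this commit introduces treats a download `spec` as one of three things: a URL, an existing local path, or a requirement string. A minimal Python 2 sketch of that dispatch, using the same `URL_SCHEME` regex as the new module (the `classify` helper and the example specs are illustrative, not part of the patch):

```python
# Sketch of the spec dispatch used by PackageIndex.download (see diff below).
# URL_SCHEME is copied verbatim from the new module; classify() is hypothetical.
import os.path, re

URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match

def classify(spec):
    if URL_SCHEME(spec):
        return 'url'              # would be downloaded to tmpdir
    elif os.path.exists(spec):
        return 'local path'       # returned as-is
    else:
        return 'requirement'      # parsed and searched for in the index

for spec in ('http://example.com/Foo-1.0.tar.gz', '/tmp', 'FooBar>=1.2'):
    print spec, '->', classify(spec)
```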
Diffstat (limited to 'setuptools')
-rwxr-xr-x | setuptools/package_index.py | 369 |
1 file changed, 369 insertions, 0 deletions
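The heart of the new file is `distros_for_url`; the long comment in the diff below explains why sdist filenames are ambiguous. The splitting loop can be run in isolation (a standalone Python 2 sketch reusing the `EXTENSIONS` list from the patch; `possible_splits` is a hypothetical helper, not part of the module):

```python
# Standalone sketch of the name/version splitting done by distros_for_url.
EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz".split()

def possible_splits(base):
    # strip a known archive extension, as the patched code does
    for ext in EXTENSIONS:
        if base.endswith(ext):
            base = base[:-len(ext)]
            break
    else:
        return
    # yield every plausible (name, version) split of the remainder
    parts = base.split('-')
    for p in range(1, len(parts) + 1):
        yield '-'.join(parts[:p]), '-'.join(parts[p:])

# "adns-python-1.1.0" is genuinely ambiguous; all three splits come out:
# ('adns', 'python-1.1.0'), ('adns-python', '1.1.0'), ('adns-python-1.1.0', '')
print list(possible_splits('adns-python-1.1.0.tar.gz'))
```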
diff --git a/setuptools/package_index.py b/setuptools/package_index.py
new file mode 100755
index 00000000..9e48de35
--- /dev/null
+++ b/setuptools/package_index.py
@@ -0,0 +1,369 @@
+"""PyPI and direct package downloading"""
+
+import sys, os.path, re, urlparse, urllib2
+from pkg_resources import *
+
+HREF = re.compile(r"""href\s*=\s*['"]?([^'"> ]+)""", re.I)
+URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):',re.I).match
+EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz".split()
+
+__all__ = [
+    'PackageIndex', 'distros_for_url',
+]
+
+
+def distros_for_url(url, metadata=None):
+    """Yield egg or source distribution objects that might be found at a URL"""
+
+    path = urlparse.urlparse(url)[2]
+    base = urllib2.unquote(path.split('/')[-1])
+
+    if base.endswith('.egg'):
+        dist = Distribution.from_filename(base, metadata)
+        dist.path = url
+        yield dist
+        return  # only one, unambiguous interpretation
+
+    for ext in EXTENSIONS:
+        if base.endswith(ext):
+            base = base[:-len(ext)]
+            break
+    else:
+        return  # no extension matched
+
+    # Generate alternative interpretations of a source distro name
+    # Because some packages are ambiguous as to name/versions split
+    # e.g. "adns-python-1.1.0", "egenix-mx-commercial", etc.
+    # So, we generate each possible interpretation (e.g. "adns, python-1.1.0"
+    # "adns-python, 1.1.0", and "adns-python-1.1.0, no version").  In practice,
+    # the spurious interpretations should be ignored, because in the event
+    # there's also an "adns" package, the spurious "python-1.1.0" version will
+    # compare lower than any numeric version number, and is therefore unlikely
+    # to match a request for it.  It's still a potential problem, though, and
+    # in the long run PyPI and the distutils should go for "safe" names and
+    # versions in distribution archive names (sdist and bdist).
+
+    parts = base.split('-')
+    for p in range(1,len(parts)+1):
+        yield Distribution(
+            url, metadata, '-'.join(parts[:p]), '-'.join(parts[p:]),
+            distro_type = SOURCE_DIST
+        )
+
+
+class PackageIndex(AvailableDistributions):
+    """A distribution index that scans web pages for download URLs"""
+
+    def __init__(self,index_url="http://www.python.org/pypi",*args,**kw):
+        AvailableDistributions.__init__(self,*args,**kw)
+        self.index_url = index_url + "/"[:not index_url.endswith('/')]
+        self.scanned_urls = {}
+        self.fetched_urls = {}
+        self.package_pages = {}
+
+    def scan_url(self, url):
+        self.process_url(url, True)
+
+    def process_url(self, url, retrieve=False):
+        if url in self.scanned_urls and not retrieve:
+            return
+
+        self.scanned_urls[url] = True
+        dists = list(distros_for_url(url))
+        map(self.add, dists)
+
+        if dists or not retrieve or url in self.fetched_urls:
+            # don't need the actual page
+            return
+
+        f = self.open_url(url)
+        self.fetched_urls[url] = self.fetched_urls[f.url] = True
+        if 'html' not in f.headers['content-type'].lower():
+            f.close()   # not html, we can't process it
+            return
+
+        base = f.url    # handle redirects
+        page = f.read()
+        f.close()
+        if url.startswith(self.index_url):
+            self.process_index(url, page)
+        else:
+            for match in HREF.finditer(page):
+                link = urlparse.urljoin(base, match.group(1))
+                self.process_url(link)
+
+    def find_packages(self,requirement):
+        self.scan_url(self.index_url + requirement.distname)
+        if not self.package_pages.get(requirement.key):
+            # We couldn't find the target package, so search the index page too
+            self.scan_url(self.index_url)
+        for url in self.package_pages.get(requirement.key,()):
+            # scan each page that might be related to the desired package
+            self.scan_url(url)
+
+    def process_index(self,url,page):
+        def scan(link):
+            if link.startswith(self.index_url):
+                parts = map(
+                    urllib2.unquote, link[len(self.index_url):].split('/')
+                )
+                if len(parts)==2:
+                    # it's a package page, sanitize and index it
+                    pkg = safe_name(parts[0])
+                    ver = safe_version(parts[1])
+                    self.package_pages.setdefault(pkg.lower(),{})[link] = True
+
+        if url==self.index_url or 'Index of Packages</title>' in page:
+            # process an index page into the package-page index
+            for match in HREF.finditer(page):
+                scan( urlparse.urljoin(url, match.group(1)) )
+        else:
+            scan(url)   # ensure this page is in the page index
+            # process individual package page
+            for tag in ("<th>Home Page", "<th>Download URL"):
+                pos = page.find(tag)
+                if pos!=-1:
+                    match = HREF.search(page,pos)
+                    if match:
+                        # Process the found URL
+                        self.scan_url(urlparse.urljoin(url, match.group(1)))
+
+    def obtain(self,requirement):
+        self.find_packages(requirement)
+        for dist in self.get(requirement.key, ()):
+            if dist in requirement:
+                return dist
+
+    def download(self, spec, tmpdir):
+        """Locate and/or download `spec`, returning a local filename
+
+        `spec` may be a ``Requirement`` object, or a string containing a URL,
+        an existing local filename, or a package/version requirement spec
+        (i.e. the string form of a ``Requirement`` object).
+
+        If necessary, the requirement is searched for in the package index.
+        If the download is successful, the return value is a local file path,
+        and it is a subpath of `tmpdir` if the distribution had to be
+        downloaded.  If no matching distribution is found, return ``None``.
+        Various errors may be raised if a problem occurs during downloading.
+        """
+
+        if not isinstance(spec,Requirement):
+            scheme = URL_SCHEME(spec)
+            if scheme:
+                # It's a url, download it to tmpdir
+                return self._download_url(scheme.group(1), spec, tmpdir)
+
+            elif os.path.exists(spec):
+                # Existing file or directory, just return it
+                return spec
+            else:
+                try:
+                    spec = Requirement.parse(spec)
+                except ValueError:
+                    raise RuntimeError(
+                        "Not a URL, existing file, or requirement spec: %r" %
+                        (spec,)
+                    )
+
+        # process a Requirement
+        dist = self.best_match(spec,[])
+        if dist is not None:
+            return self.download(dist.path, tmpdir)
+
+        return None
+
+    dl_blocksize = 8192
+
+    def _download_to(self, url, filename):
+        # Download the file
+        fp, tfp = None, None
+        try:
+            fp = self.open_url(url)
+            if isinstance(fp, urllib2.HTTPError):
+                raise RuntimeError(
+                    "Can't download %s: %s %s" % (url, fp.code,fp.msg)
+                )
+
+            headers = fp.info()
+            blocknum = 0
+            bs = self.dl_blocksize
+            size = -1
+
+            if "content-length" in headers:
+                size = int(headers["Content-Length"])
+                self.reporthook(url, filename, blocknum, bs, size)
+
+            tfp = open(filename,'wb')
+            while True:
+                block = fp.read(bs)
+                if block:
+                    tfp.write(block)
+                    blocknum += 1
+                    self.reporthook(url, filename, blocknum, bs, size)
+                else:
+                    break
+            return headers
+
+        finally:
+            if fp: fp.close()
+            if tfp: tfp.close()
+
+    def reporthook(self, url, filename, blocknum, blksize, size):
+        pass    # no-op
+
+    def open_url(self, url):
+        try:
+            return urllib2.urlopen(url)
+        except urllib2.HTTPError, v:
+            return v
+        except urllib2.URLError, v:
+            raise RuntimeError("Download error: %s" % v.reason)
+
+    def _download_url(self, scheme, url, tmpdir):
+
+        # Determine download filename
+        #
+        name = filter(None,urlparse.urlparse(url)[2].split('/'))
+        if name:
+            name = name[-1]
+            while '..' in name:
+                name = name.replace('..','.').replace('\\','_')
+        else:
+            name = "__downloaded__"    # default if URL has no path contents
+
+        filename = os.path.join(tmpdir,name)
+
+        # Download the file
+        #
+        if scheme=='svn' or scheme.startswith('svn+'):
+            return self._download_svn(url, filename)
+        else:
+            headers = self._download_to(url, filename)
+            if 'html' in headers['content-type'].lower():
+                return self._download_html(url, headers, filename, tmpdir)
+            else:
+                return filename
+
+    def _download_html(self, url, headers, filename, tmpdir):
+        # Check for a sourceforge URL
+        sf_url = url.startswith('http://prdownloads.')
+        file = open(filename)
+        for line in file:
+            if line.strip():
+                # Check for a subversion index page
+                if re.search(r'<title>Revision \d+:', line):
+                    # it's a subversion index page:
+                    file.close()
+                    os.unlink(filename)
+                    return self._download_svn(url, filename)
+                # Check for a SourceForge header
+                elif sf_url:
+                    if re.search(r'^<HTML><HEAD>', line, re.I):
+                        continue    # skip first line
+                    elif re.search(r'<TITLE>Select a Mirror for File:',line):
+                        # Sourceforge mirror page
+                        page = file.read()
+                        file.close()
+                        os.unlink(filename)
+                        return self._download_sourceforge(url, page, tmpdir)
+                break   # not an index page
+        file.close()
+        raise RuntimeError("Unexpected HTML page found at "+url)
+
+    def _download_svn(self, url, filename):
+        os.system("svn checkout -q %s %s" % (url, filename))
+        return filename
+
+    def _download_sourceforge(self, source_url, sf_page, tmpdir):
+        """Download package from randomly-selected SourceForge mirror"""
+
+        mirror_regex = re.compile(r'HREF=(/.*?\?use_mirror=[^>]*)')
+        urls = [m.group(1) for m in mirror_regex.finditer(sf_page)]
+        if not urls:
+            raise RuntimeError(
+                "URL looks like a Sourceforge mirror page, but no URLs found"
+            )
+
+        import random
+        url = urlparse.urljoin(source_url, random.choice(urls))
+        f = self.open_url(url)
+        match = re.search(
+            r'<META HTTP-EQUIV="refresh" content=".*?URL=(.*?)"',
+            f.read()
+        )
+        f.close()
+
+        if match:
+            download_url = match.group(1)
+            scheme = URL_SCHEME(download_url)
+            return self._download_url(scheme.group(1), download_url, tmpdir)
+        else:
+            raise RuntimeError(
+                'No META HTTP-EQUIV="refresh" found in Sourceforge page at %s'
+                % url
+            )
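End to end, the new class would be driven roughly as follows (a Python 2 sketch; `PackageIndex`, `Requirement.parse`, and `download` come from the diff above, while the project name and temporary directory are illustrative):

```python
# Python 2 usage sketch for the new module; the requirement string and
# temporary directory below are illustrative, not from the patch.
import tempfile
from pkg_resources import Requirement
from setuptools.package_index import PackageIndex

index = PackageIndex()                  # defaults to http://www.python.org/pypi
req = Requirement.parse('FooBar>=1.2')  # hypothetical project

tmpdir = tempfile.mkdtemp()
path = index.download(req, tmpdir)      # local filename, or None if not found
if path is None:
    print 'no matching distribution for', req
else:
    print 'downloaded to', path
```

Per the `download` docstring, the same call also accepts a plain URL or an existing local filename, in which case the index is never consulted.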