diff options
Diffstat (limited to 'asn1crypto/_iri.py')
-rw-r--r-- | asn1crypto/_iri.py | 288 |
1 files changed, 288 insertions, 0 deletions
diff --git a/asn1crypto/_iri.py b/asn1crypto/_iri.py new file mode 100644 index 0000000..57ddd40 --- /dev/null +++ b/asn1crypto/_iri.py @@ -0,0 +1,288 @@ +# coding: utf-8 + +""" +Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports +the following items: + + - iri_to_uri() + - uri_to_iri() +""" + +from __future__ import unicode_literals, division, absolute_import, print_function + +from encodings import idna # noqa +import codecs +import re +import sys + +from ._errors import unwrap +from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types + +if sys.version_info < (3,): + from urlparse import urlsplit, urlunsplit + from urllib import ( + quote as urlquote, + unquote as unquote_to_bytes, + ) + +else: + from urllib.parse import ( + quote as urlquote, + unquote_to_bytes, + urlsplit, + urlunsplit, + ) + + +def iri_to_uri(value): + """ + Normalizes and encodes a unicode IRI into an ASCII byte string URI + + :param value: + A unicode string of an IRI + + :return: + A byte string of the ASCII-encoded URI + """ + + if not isinstance(value, str_cls): + raise TypeError(unwrap( + ''' + value must be a unicode string, not %s + ''', + type_name(value) + )) + + scheme = None + # Python 2.6 doesn't split properly is the URL doesn't start with http:// or https:// + if sys.version_info < (2, 7) and not value.startswith('http://') and not value.startswith('https://'): + real_prefix = None + prefix_match = re.match('^[^:]*://', value) + if prefix_match: + real_prefix = prefix_match.group(0) + value = 'http://' + value[len(real_prefix):] + parsed = urlsplit(value) + if real_prefix: + value = real_prefix + value[7:] + scheme = _urlquote(real_prefix[:-3]) + else: + parsed = urlsplit(value) + + if scheme is None: + scheme = _urlquote(parsed.scheme) + hostname = parsed.hostname + if hostname is not None: + hostname = hostname.encode('idna') + # RFC 3986 allows userinfo to contain sub-delims + username = _urlquote(parsed.username, safe='!$&\'()*+,;=') + password = _urlquote(parsed.password, safe='!$&\'()*+,;=') + port = parsed.port + if port is not None: + port = str_cls(port).encode('ascii') + + netloc = b'' + if username is not None: + netloc += username + if password: + netloc += b':' + password + netloc += b'@' + if hostname is not None: + netloc += hostname + if port is not None: + default_http = scheme == b'http' and port == b'80' + default_https = scheme == b'https' and port == b'443' + if not default_http and not default_https: + netloc += b':' + port + + # RFC 3986 allows a path to contain sub-delims, plus "@" and ":" + path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:') + # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?" + query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:') + # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?" + fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:') + + if query is None and fragment is None and path == b'/': + path = None + + # Python 2.7 compat + if path is None: + path = '' + + output = urlunsplit((scheme, netloc, path, query, fragment)) + if isinstance(output, str_cls): + output = output.encode('latin1') + return output + + +def uri_to_iri(value): + """ + Converts an ASCII URI byte string into a unicode IRI + + :param value: + An ASCII-encoded byte string of the URI + + :return: + A unicode string of the IRI + """ + + if not isinstance(value, byte_cls): + raise TypeError(unwrap( + ''' + value must be a byte string, not %s + ''', + type_name(value) + )) + + parsed = urlsplit(value) + + scheme = parsed.scheme + if scheme is not None: + scheme = scheme.decode('ascii') + + username = _urlunquote(parsed.username, remap=[':', '@']) + password = _urlunquote(parsed.password, remap=[':', '@']) + hostname = parsed.hostname + if hostname: + hostname = hostname.decode('idna') + port = parsed.port + if port and not isinstance(port, int_types): + port = port.decode('ascii') + + netloc = '' + if username is not None: + netloc += username + if password: + netloc += ':' + password + netloc += '@' + if hostname is not None: + netloc += hostname + if port is not None: + netloc += ':' + str_cls(port) + + path = _urlunquote(parsed.path, remap=['/'], preserve=True) + query = _urlunquote(parsed.query, remap=['&', '='], preserve=True) + fragment = _urlunquote(parsed.fragment) + + return urlunsplit((scheme, netloc, path, query, fragment)) + + +def _iri_utf8_errors_handler(exc): + """ + Error handler for decoding UTF-8 parts of a URI into an IRI. Leaves byte + sequences encoded in %XX format, but as part of a unicode string. + + :param exc: + The UnicodeDecodeError exception + + :return: + A 2-element tuple of (replacement unicode string, integer index to + resume at) + """ + + bytes_as_ints = bytes_to_list(exc.object[exc.start:exc.end]) + replacements = ['%%%02x' % num for num in bytes_as_ints] + return (''.join(replacements), exc.end) + + +codecs.register_error('iriutf8', _iri_utf8_errors_handler) + + +def _urlquote(string, safe=''): + """ + Quotes a unicode string for use in a URL + + :param string: + A unicode string + + :param safe: + A unicode string of character to not encode + + :return: + None (if string is None) or an ASCII byte string of the quoted string + """ + + if string is None or string == '': + return None + + # Anything already hex quoted is pulled out of the URL and unquoted if + # possible + escapes = [] + if re.search('%[0-9a-fA-F]{2}', string): + # Try to unquote any percent values, restoring them if they are not + # valid UTF-8. Also, requote any safe chars since encoded versions of + # those are functionally different than the unquoted ones. + def _try_unescape(match): + byte_string = unquote_to_bytes(match.group(0)) + unicode_string = byte_string.decode('utf-8', 'iriutf8') + for safe_char in list(safe): + unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char)) + return unicode_string + string = re.sub('(?:%[0-9a-fA-F]{2})+', _try_unescape, string) + + # Once we have the minimal set of hex quoted values, removed them from + # the string so that they are not double quoted + def _extract_escape(match): + escapes.append(match.group(0).encode('ascii')) + return '\x00' + string = re.sub('%[0-9a-fA-F]{2}', _extract_escape, string) + + output = urlquote(string.encode('utf-8'), safe=safe.encode('utf-8')) + if not isinstance(output, byte_cls): + output = output.encode('ascii') + + # Restore the existing quoted values that we extracted + if len(escapes) > 0: + def _return_escape(_): + return escapes.pop(0) + output = re.sub(b'%00', _return_escape, output) + + return output + + +def _urlunquote(byte_string, remap=None, preserve=None): + """ + Unquotes a URI portion from a byte string into unicode using UTF-8 + + :param byte_string: + A byte string of the data to unquote + + :param remap: + A list of characters (as unicode) that should be re-mapped to a + %XX encoding. This is used when characters are not valid in part of a + URL. + + :param preserve: + A bool - indicates that the chars to be remapped if they occur in + non-hex form, should be preserved. E.g. / for URL path. + + :return: + A unicode string + """ + + if byte_string is None: + return byte_string + + if byte_string == b'': + return '' + + if preserve: + replacements = ['\x1A', '\x1C', '\x1D', '\x1E', '\x1F'] + preserve_unmap = {} + for char in remap: + replacement = replacements.pop(0) + preserve_unmap[replacement] = char + byte_string = byte_string.replace(char.encode('ascii'), replacement.encode('ascii')) + + byte_string = unquote_to_bytes(byte_string) + + if remap: + for char in remap: + byte_string = byte_string.replace(char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii')) + + output = byte_string.decode('utf-8', 'iriutf8') + + if preserve: + for replacement, original in preserve_unmap.items(): + output = output.replace(replacement, original) + + return output |