1 files changed, 288 insertions, 0 deletions
diff --git a/asn1crypto/_iri.py b/asn1crypto/_iri.py
new file mode 100644
index 0000000..57ddd40
--- /dev/null
+++ b/asn1crypto/_iri.py
@@ -0,0 +1,288 @@
+# coding: utf-8
+
+"""
+Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports
+the following items:
+
+ - iri_to_uri()
+ - uri_to_iri()
+"""
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from encodings import idna  # noqa
+import codecs
+import re
+import sys
+
+from ._errors import unwrap
+from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types
+
+if sys.version_info < (3,):
+    from urlparse import urlsplit, urlunsplit
+    from urllib import (
+        quote as urlquote,
+        unquote as unquote_to_bytes,
+    )
+
+else:
+    from urllib.parse import (
+        quote as urlquote,
+        unquote_to_bytes,
+        urlsplit,
+        urlunsplit,
+    )
+
+
+def iri_to_uri(value):
+    """
+    Normalizes and encodes a unicode IRI into an ASCII byte string URI
+
+    :param value:
+        A unicode string of an IRI - any other type raises a TypeError
+
+    :return:
+        A byte string of the ASCII-encoded URI
+    """
+
+    if not isinstance(value, str_cls):
+        raise TypeError(unwrap(
+            '''
+            value must be a unicode string, not %s
+            ''',
+            type_name(value)
+        ))
+
+    scheme = None
+    # Python 2.6 doesn't split properly if the URL doesn't start with http:// or https://
+    if sys.version_info < (2, 7) and not value.startswith('http://') and not value.startswith('https://'):
+        real_prefix = None
+        prefix_match = re.match('^[^:]*://', value)
+        if prefix_match:
+            real_prefix = prefix_match.group(0)
+            value = 'http://' + value[len(real_prefix):]  # parse with a stand-in http:// prefix
+        parsed = urlsplit(value)
+        if real_prefix:
+            value = real_prefix + value[7:]  # 7 == len('http://'), puts the real prefix back
+            scheme = _urlquote(real_prefix[:-3])  # drop the trailing "://"
+    else:
+        parsed = urlsplit(value)
+
+    if scheme is None:
+        scheme = _urlquote(parsed.scheme)
+    hostname = parsed.hostname
+    if hostname is not None:
+        hostname = hostname.encode('idna')
+    # RFC 3986 allows userinfo to contain sub-delims
+    username = _urlquote(parsed.username, safe='!$&\'()*+,;=')
+    password = _urlquote(parsed.password, safe='!$&\'()*+,;=')
+    port = parsed.port
+    if port is not None:
+        port = str_cls(port).encode('ascii')
+
+    netloc = b''
+    if username is not None:
+        netloc += username
+        if password:
+            netloc += b':' + password
+        netloc += b'@'
+    if hostname is not None:
+        netloc += hostname
+    if port is not None:
+        default_http = scheme == b'http' and port == b'80'
+        default_https = scheme == b'https' and port == b'443'
+        if not default_http and not default_https:  # the scheme's default port is left out
+            netloc += b':' + port
+
+    # RFC 3986 allows a path to contain sub-delims, plus "@" and ":"
+    path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:')
+    # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?"
+    query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:')
+    # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?"
+    fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:')
+
+    if query is None and fragment is None and path == b'/':
+        path = None  # drop a lone "/" when there is no query or fragment
+
+    # Python 2.7 compat - pass an empty string for the path rather than None
+    if path is None:
+        path = ''
+
+    output = urlunsplit((scheme, netloc, path, query, fragment))
+    if isinstance(output, str_cls):
+        output = output.encode('latin1')  # a unicode result is converted so bytes are returned
+    return output
+
+
+def uri_to_iri(value):
+    """
+    Converts an ASCII URI byte string into a unicode IRI
+
+    :param value:
+        An ASCII-encoded byte string of the URI - any other type raises a TypeError
+
+    :return:
+        A unicode string of the IRI
+    """
+
+    if not isinstance(value, byte_cls):
+        raise TypeError(unwrap(
+            '''
+            value must be a byte string, not %s
+            ''',
+            type_name(value)
+        ))
+
+    parsed = urlsplit(value)
+
+    scheme = parsed.scheme
+    if scheme is not None:
+        scheme = scheme.decode('ascii')
+
+    username = _urlunquote(parsed.username, remap=[':', '@'])  # ":" and "@" come back as %3a and %40
+    password = _urlunquote(parsed.password, remap=[':', '@'])  # ":" and "@" come back as %3a and %40
+    hostname = parsed.hostname
+    if hostname:
+        hostname = hostname.decode('idna')
+    port = parsed.port
+    if port and not isinstance(port, int_types):
+        port = port.decode('ascii')
+
+    netloc = ''
+    if username is not None:
+        netloc += username
+        if password:
+            netloc += ':' + password
+        netloc += '@'
+    if hostname is not None:
+        netloc += hostname
+    if port is not None:
+        netloc += ':' + str_cls(port)
+
+    path = _urlunquote(parsed.path, remap=['/'], preserve=True)  # literal "/" kept, encoded ones stay %2f
+    query = _urlunquote(parsed.query, remap=['&', '='], preserve=True)  # literal "&" and "=" kept as-is
+    fragment = _urlunquote(parsed.fragment)
+
+    return urlunsplit((scheme, netloc, path, query, fragment))
+
+
+def _iri_utf8_errors_handler(exc):
+    """
+    Error handler for decoding UTF-8 parts of a URI into an IRI. Leaves byte
+    sequences encoded in %XX format, but as part of a unicode string.
+
+    :param exc:
+        The UnicodeDecodeError exception
+
+    :return:
+        A 2-element tuple of (replacement unicode string, integer index to
+        resume at)
+    """
+
+    bytes_as_ints = bytes_to_list(exc.object[exc.start:exc.end])
+    replacements = ['%%%02x' % num for num in bytes_as_ints]  # e.g. 0xe9 -> '%e9' (lowercase hex)
+    return (''.join(replacements), exc.end)
+
+
+codecs.register_error('iriutf8', _iri_utf8_errors_handler)  # name passed to .decode('utf-8', 'iriutf8')
+
+
+def _urlquote(string, safe=''):
+    """
+    Quotes a unicode string for use in a URL
+
+    :param string:
+        A unicode string
+
+    :param safe:
+        A unicode string of characters to not encode
+
+    :return:
+        None (if string is None or empty) or an ASCII byte string of the quoted string
+    """
+
+    if string is None or string == '':
+        return None
+
+    # Anything already hex quoted is pulled out of the URL and unquoted if
+    # possible
+    escapes = []
+    if re.search('%[0-9a-fA-F]{2}', string):
+        # Try to unquote any percent values, restoring them if they are not
+        # valid UTF-8. Also, requote any safe chars since encoded versions of
+        # those are functionally different than the unquoted ones.
+        def _try_unescape(match):
+            byte_string = unquote_to_bytes(match.group(0))
+            unicode_string = byte_string.decode('utf-8', 'iriutf8')
+            for safe_char in list(safe):
+                unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char))
+            return unicode_string
+        string = re.sub('(?:%[0-9a-fA-F]{2})+', _try_unescape, string)
+
+        # Once we have the minimal set of hex quoted values, remove them from
+        # the string so that they are not double quoted
+        def _extract_escape(match):
+            escapes.append(match.group(0).encode('ascii'))
+            return '\x00'  # placeholder - the quoting step below turns it into %00
+        string = re.sub('%[0-9a-fA-F]{2}', _extract_escape, string)
+
+    output = urlquote(string.encode('utf-8'), safe=safe.encode('utf-8'))
+    if not isinstance(output, byte_cls):
+        output = output.encode('ascii')
+
+    # Restore the extracted quoted values. NOTE(review): a NUL from the input also becomes %00 - confirm
+    if len(escapes) > 0:
+        def _return_escape(_):
+            return escapes.pop(0)  # handed back in the order they were extracted
+        output = re.sub(b'%00', _return_escape, output)
+
+    return output
+
+
+def _urlunquote(byte_string, remap=None, preserve=None):
+    """
+    Unquotes a URI portion from a byte string into unicode using UTF-8
+
+    :param byte_string:
+        A byte string of the data to unquote
+
+    :param remap:
+        A list of characters (as unicode) that should be re-mapped to a
+        %XX encoding. This is used when characters are not valid in part of a
+        URL.
+
+    :param preserve:
+        A bool - if True, chars listed in remap that occur in literal (non-hex)
+        form are kept as-is, e.g. / for URL path. Needs remap to be a list.
+
+    :return:
+        None (if byte_string is None) or a unicode string
+    """
+
+    if byte_string is None:
+        return byte_string
+
+    if byte_string == b'':
+        return ''
+
+    if preserve:
+        replacements = ['\x1A', '\x1C', '\x1D', '\x1E', '\x1F']  # placeholders, one used per remap char
+        preserve_unmap = {}
+        for char in remap:
+            replacement = replacements.pop(0)
+            preserve_unmap[replacement] = char
+            byte_string = byte_string.replace(char.encode('ascii'), replacement.encode('ascii'))
+
+    byte_string = unquote_to_bytes(byte_string)
+
+    if remap:
+        for char in remap:
+            byte_string = byte_string.replace(char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii'))
+
+    output = byte_string.decode('utf-8', 'iriutf8')  # bytes that are not valid UTF-8 are left as %XX
+
+    if preserve:
+        for replacement, original in preserve_unmap.items():
+            output = output.replace(replacement, original)
+
+    return output