aboutsummaryrefslogtreecommitdiffstats
path: root/asn1crypto/_iri.py
diff options
context:
space:
mode:
Diffstat (limited to 'asn1crypto/_iri.py')
-rw-r--r--asn1crypto/_iri.py288
1 files changed, 288 insertions, 0 deletions
diff --git a/asn1crypto/_iri.py b/asn1crypto/_iri.py
new file mode 100644
index 0000000..57ddd40
--- /dev/null
+++ b/asn1crypto/_iri.py
@@ -0,0 +1,288 @@
+# coding: utf-8
+
+"""
+Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports
+the following items:
+
+ - iri_to_uri()
+ - uri_to_iri()
+"""
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from encodings import idna # noqa
+import codecs
+import re
+import sys
+
+from ._errors import unwrap
+from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types
+
+if sys.version_info < (3,):
+ from urlparse import urlsplit, urlunsplit
+ from urllib import (
+ quote as urlquote,
+ unquote as unquote_to_bytes,
+ )
+
+else:
+ from urllib.parse import (
+ quote as urlquote,
+ unquote_to_bytes,
+ urlsplit,
+ urlunsplit,
+ )
+
+
def iri_to_uri(value):
    """
    Normalizes and encodes a unicode IRI into an ASCII byte string URI

    The host is IDNA-encoded (IP literals are kept in brackets), the
    remaining components are percent-encoded as UTF-8, and the default port
    for http (80) and https (443) is dropped.

    :param value:
        A unicode string of an IRI

    :raises:
        TypeError - when value is not a unicode string

    :return:
        A byte string of the ASCII-encoded URI
    """

    if not isinstance(value, str_cls):
        raise TypeError(unwrap(
            '''
            value must be a unicode string, not %s
            ''',
            type_name(value)
        ))

    scheme = None
    # Python 2.6 doesn't split properly if the URL doesn't start with http:// or https://
    if sys.version_info < (2, 7) and not value.startswith('http://') and not value.startswith('https://'):
        real_prefix = None
        prefix_match = re.match('^[^:]*://', value)
        if prefix_match:
            real_prefix = prefix_match.group(0)
            # Temporarily swap in a prefix that urlsplit() handles correctly
            value = 'http://' + value[len(real_prefix):]
        parsed = urlsplit(value)
        if real_prefix:
            # Restore the real prefix (7 == len('http://')) and remember the
            # real scheme, without the trailing "://"
            value = real_prefix + value[7:]
            scheme = _urlquote(real_prefix[:-3])
    else:
        parsed = urlsplit(value)

    if scheme is None:
        scheme = _urlquote(parsed.scheme)
    hostname = parsed.hostname
    if hostname is not None:
        if ':' in hostname:
            # A colon can only appear in an IP literal such as [::1]. The
            # hostname attribute strips the square brackets, so they must be
            # restored or the host can not be told apart from the port.
            hostname = b'[' + hostname.encode('ascii') + b']'
        else:
            hostname = hostname.encode('idna')
    # RFC 3986 allows userinfo to contain sub-delims
    username = _urlquote(parsed.username, safe='!$&\'()*+,;=')
    password = _urlquote(parsed.password, safe='!$&\'()*+,;=')
    port = parsed.port
    if port is not None:
        port = str_cls(port).encode('ascii')

    netloc = b''
    if username is not None:
        netloc += username
        if password:
            netloc += b':' + password
        netloc += b'@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        # The default port of the scheme is omitted from the output
        default_http = scheme == b'http' and port == b'80'
        default_https = scheme == b'https' and port == b'443'
        if not default_http and not default_https:
            netloc += b':' + port

    # RFC 3986 allows a path to contain sub-delims, plus "@" and ":"
    path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:')
    # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?"
    query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:')
    # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?"
    fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:')

    # A bare "/" path is dropped when there is no query or fragment
    if query is None and fragment is None and path == b'/':
        path = None

    # Python 2.7 compat
    if path is None:
        path = ''

    output = urlunsplit((scheme, netloc, path, query, fragment))
    if isinstance(output, str_cls):
        output = output.encode('latin1')
    return output
+
+
def uri_to_iri(value):
    """
    Converts an ASCII URI byte string into a unicode IRI

    The host is IDNA-decoded (IP literals are kept in brackets) and the
    percent-encoded parts of the other components are decoded as UTF-8
    where that does not change the meaning of the component.

    :param value:
        An ASCII-encoded byte string of the URI

    :raises:
        TypeError - when value is not a byte string

    :return:
        A unicode string of the IRI
    """

    if not isinstance(value, byte_cls):
        raise TypeError(unwrap(
            '''
            value must be a byte string, not %s
            ''',
            type_name(value)
        ))

    parsed = urlsplit(value)

    scheme = parsed.scheme
    if scheme is not None:
        scheme = scheme.decode('ascii')

    # ":" and "@" delimit the userinfo, so decoded occurrences stay escaped
    username = _urlunquote(parsed.username, remap=[':', '@'])
    password = _urlunquote(parsed.password, remap=[':', '@'])
    hostname = parsed.hostname
    if hostname:
        if b':' in hostname:
            # A colon can only appear in an IP literal such as [::1]. The
            # hostname attribute strips the square brackets, so they must be
            # restored or the host can not be told apart from the port.
            hostname = '[' + hostname.decode('ascii') + ']'
        else:
            hostname = hostname.decode('idna')
    port = parsed.port
    if port and not isinstance(port, int_types):
        port = port.decode('ascii')

    netloc = ''
    if username is not None:
        netloc += username
        if password:
            netloc += ':' + password
        netloc += '@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        netloc += ':' + str_cls(port)

    # Literal delimiters are preserved, but escaped ones must stay escaped
    path = _urlunquote(parsed.path, remap=['/'], preserve=True)
    query = _urlunquote(parsed.query, remap=['&', '='], preserve=True)
    fragment = _urlunquote(parsed.fragment)

    return urlunsplit((scheme, netloc, path, query, fragment))
+
+
+def _iri_utf8_errors_handler(exc):
+ """
+ Error handler for decoding UTF-8 parts of a URI into an IRI. Leaves byte
+ sequences encoded in %XX format, but as part of a unicode string.
+
+ :param exc:
+ The UnicodeDecodeError exception
+
+ :return:
+ A 2-element tuple of (replacement unicode string, integer index to
+ resume at)
+ """
+
+ bytes_as_ints = bytes_to_list(exc.object[exc.start:exc.end])
+ replacements = ['%%%02x' % num for num in bytes_as_ints]
+ return (''.join(replacements), exc.end)
+
+
+codecs.register_error('iriutf8', _iri_utf8_errors_handler)
+
+
+def _urlquote(string, safe=''):
+ """
+ Quotes a unicode string for use in a URL
+
+ :param string:
+ A unicode string
+
+ :param safe:
+ A unicode string of character to not encode
+
+ :return:
+ None (if string is None) or an ASCII byte string of the quoted string
+ """
+
+ if string is None or string == '':
+ return None
+
+ # Anything already hex quoted is pulled out of the URL and unquoted if
+ # possible
+ escapes = []
+ if re.search('%[0-9a-fA-F]{2}', string):
+ # Try to unquote any percent values, restoring them if they are not
+ # valid UTF-8. Also, requote any safe chars since encoded versions of
+ # those are functionally different than the unquoted ones.
+ def _try_unescape(match):
+ byte_string = unquote_to_bytes(match.group(0))
+ unicode_string = byte_string.decode('utf-8', 'iriutf8')
+ for safe_char in list(safe):
+ unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char))
+ return unicode_string
+ string = re.sub('(?:%[0-9a-fA-F]{2})+', _try_unescape, string)
+
+ # Once we have the minimal set of hex quoted values, removed them from
+ # the string so that they are not double quoted
+ def _extract_escape(match):
+ escapes.append(match.group(0).encode('ascii'))
+ return '\x00'
+ string = re.sub('%[0-9a-fA-F]{2}', _extract_escape, string)
+
+ output = urlquote(string.encode('utf-8'), safe=safe.encode('utf-8'))
+ if not isinstance(output, byte_cls):
+ output = output.encode('ascii')
+
+ # Restore the existing quoted values that we extracted
+ if len(escapes) > 0:
+ def _return_escape(_):
+ return escapes.pop(0)
+ output = re.sub(b'%00', _return_escape, output)
+
+ return output
+
+
+def _urlunquote(byte_string, remap=None, preserve=None):
+ """
+ Unquotes a URI portion from a byte string into unicode using UTF-8
+
+ :param byte_string:
+ A byte string of the data to unquote
+
+ :param remap:
+ A list of characters (as unicode) that should be re-mapped to a
+ %XX encoding. This is used when characters are not valid in part of a
+ URL.
+
+ :param preserve:
+ A bool - indicates that the chars to be remapped if they occur in
+ non-hex form, should be preserved. E.g. / for URL path.
+
+ :return:
+ A unicode string
+ """
+
+ if byte_string is None:
+ return byte_string
+
+ if byte_string == b'':
+ return ''
+
+ if preserve:
+ replacements = ['\x1A', '\x1C', '\x1D', '\x1E', '\x1F']
+ preserve_unmap = {}
+ for char in remap:
+ replacement = replacements.pop(0)
+ preserve_unmap[replacement] = char
+ byte_string = byte_string.replace(char.encode('ascii'), replacement.encode('ascii'))
+
+ byte_string = unquote_to_bytes(byte_string)
+
+ if remap:
+ for char in remap:
+ byte_string = byte_string.replace(char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii'))
+
+ output = byte_string.decode('utf-8', 'iriutf8')
+
+ if preserve:
+ for replacement, original in preserve_unmap.items():
+ output = output.replace(replacement, original)
+
+ return output