""" authlib.util.urls ~~~~~~~~~~~~~~~~~ Wrapper functions for URL encoding and decoding. """ import re from urllib.parse import quote as _quote from urllib.parse import unquote as _unquote from urllib.parse import urlencode as _urlencode import urllib.parse as urlparse from .encoding import to_unicode, to_bytes always_safe = ( 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' '0123456789_.-' ) urlencoded = set(always_safe) | set('=&;:%+~,*@!()/?') INVALID_HEX_PATTERN = re.compile(r'%[^0-9A-Fa-f]|%[0-9A-Fa-f][^0-9A-Fa-f]') def url_encode(params): encoded = [] for k, v in params: encoded.append((to_bytes(k), to_bytes(v))) return to_unicode(_urlencode(encoded)) def url_decode(query): """Decode a query string in x-www-form-urlencoded format into a sequence of two-element tuples. Unlike urlparse.parse_qsl(..., strict_parsing=True) urldecode will enforce correct formatting of the query string by validation. If validation fails a ValueError will be raised. urllib.parse_qsl will only raise errors if any of name-value pairs omits the equals sign. """ # Check if query contains invalid characters if query and not set(query) <= urlencoded: error = ("Error trying to decode a non urlencoded string. " "Found invalid characters: %s " "in the string: '%s'. " "Please ensure the request/response body is " "x-www-form-urlencoded.") raise ValueError(error % (set(query) - urlencoded, query)) # Check for correctly hex encoded values using a regular expression # All encoded values begin with % followed by two hex characters # correct = %00, %A0, %0A, %FF # invalid = %G0, %5H, %PO if INVALID_HEX_PATTERN.search(query): raise ValueError('Invalid hex encoding in query string.') # We encode to utf-8 prior to parsing because parse_qsl behaves # differently on unicode input in python 2 and 3. # Python 2.7 # >>> urlparse.parse_qsl(u'%E5%95%A6%E5%95%A6') # u'\xe5\x95\xa6\xe5\x95\xa6' # Python 2.7, non unicode input gives the same # >>> urlparse.parse_qsl('%E5%95%A6%E5%95%A6') # '\xe5\x95\xa6\xe5\x95\xa6' # but now we can decode it to unicode # >>> urlparse.parse_qsl('%E5%95%A6%E5%95%A6').decode('utf-8') # u'\u5566\u5566' # Python 3.3 however # >>> urllib.parse.parse_qsl(u'%E5%95%A6%E5%95%A6') # u'\u5566\u5566' # We want to allow queries such as "c2" whereas urlparse.parse_qsl # with the strict_parsing flag will not. params = urlparse.parse_qsl(query, keep_blank_values=True) # unicode all the things decoded = [] for k, v in params: decoded.append((to_unicode(k), to_unicode(v))) return decoded def add_params_to_qs(query, params): """Extend a query with a list of two-tuples.""" if isinstance(params, dict): params = params.items() qs = urlparse.parse_qsl(query, keep_blank_values=True) qs.extend(params) return url_encode(qs) def add_params_to_uri(uri, params, fragment=False): """Add a list of two-tuples to the uri query components.""" sch, net, path, par, query, fra = urlparse.urlparse(uri) if fragment: fra = add_params_to_qs(fra, params) else: query = add_params_to_qs(query, params) return urlparse.urlunparse((sch, net, path, par, query, fra)) def quote(s, safe=b'/'): return to_unicode(_quote(to_bytes(s), safe)) def unquote(s): return to_unicode(_unquote(s)) def quote_url(s): return quote(s, b'~@#$&()*!+=:;,.?/\'') def extract_params(raw): """Extract parameters and return them as a list of 2-tuples. Will successfully extract parameters from urlencoded query strings, dicts, or lists of 2-tuples. Empty strings/dicts/lists will return an empty list of parameters. Any other input will result in a return value of None. """ if isinstance(raw, (list, tuple)): try: raw = dict(raw) except (TypeError, ValueError): return None if isinstance(raw, dict): params = [] for k, v in raw.items(): params.append((to_unicode(k), to_unicode(v))) return params if not raw: return None try: return url_decode(raw) except ValueError: return None def is_valid_url(url): parsed = urlparse.urlparse(url) return parsed.scheme and parsed.hostname