# -*- coding: utf-8 -*-
#
# Copyright (C) 2020 Radim Rehurek
#
# This code is distributed under the terms and conditions
# from the MIT License (MIT).
#

"""Helper functions for documentation, etc."""

import inspect
import logging
import urllib.parse

logger = logging.getLogger(__name__)

WORKAROUND_SCHEMES = ['s3', 's3n', 's3u', 's3a', 'gs']
QUESTION_MARK_PLACEHOLDER = '///smart_open.utils.QUESTION_MARK_PLACEHOLDER///'


def inspect_kwargs(kallable):
    """Collect the parameters of a callable that have default values.

    Parameters
    ----------
    kallable: callable
        The function or method to inspect.

    Returns
    -------
    dict
        A mapping of parameter name to default value, for every parameter
        that declares a default.

    """
    #
    # inspect.getargspec got deprecated in Py3.4, and calling it spews
    # deprecation warnings that we'd prefer to avoid.  Unfortunately, older
    # versions of Python (<3.3) did not have inspect.signature, so we need to
    # handle them the old-fashioned getargspec way.
    #
    # NOTE: inspect.getargspec was removed in Python 3.11, so on modern
    # interpreters this fallback raises AttributeError if it is ever reached.
    #
    try:
        signature = inspect.signature(kallable)
    except AttributeError:
        try:
            args, _varargs, _keywords, defaults = inspect.getargspec(kallable)
        except TypeError:
            #
            # Happens under Py2.7 with mocking.
            #
            return {}

        if not defaults:
            return {}
        supported_keywords = args[-len(defaults):]
        return dict(zip(supported_keywords, defaults))
    else:
        #
        # Parameter.empty is a sentinel, so compare by identity.  A value
        # comparison would invoke the default's own __ne__, which may return
        # a non-boolean (e.g. array-likes) or an arbitrary answer.
        #
        return {
            name: param.default
            for name, param in signature.parameters.items()
            if param.default is not inspect.Parameter.empty
        }


def check_kwargs(kallable, kwargs):
    """Check which keyword arguments the callable supports.

    Parameters
    ----------
    kallable: callable
        A function or method to test
    kwargs: dict
        The keyword arguments to check.  If the callable doesn't support any
        of these, a warning message will get printed.

    Returns
    -------
    dict
        A dictionary of argument names and values supported by the callable.

    """
    # Only membership is needed, so test against the dict directly.
    supported_keywords = inspect_kwargs(kallable)
    unsupported_keywords = [k for k in sorted(kwargs) if k not in supported_keywords]
    supported_kwargs = {k: v for (k, v) in kwargs.items() if k in supported_keywords}

    if unsupported_keywords:
        logger.warning('ignoring unsupported keyword arguments: %r', unsupported_keywords)

    return supported_kwargs


def clamp(value, minval=0, maxval=None):
    """Clamp a numeric value to a specific range.

    Parameters
    ----------
    value: numeric
        The value to clamp.

    minval: numeric
        The lower bound.

    maxval: numeric
        The upper bound.

    Returns
    -------
    numeric
        The clamped value.  It will be in the range ``[minval, maxval]``.

    """
    # The upper bound is applied first, so minval wins if the bounds conflict.
    if maxval is not None:
        value = min(value, maxval)
    value = max(value, minval)
    return value


def make_range_string(start=None, stop=None):
    """Create a byte range specifier in accordance with RFC-2616.

    Parameters
    ----------
    start: int, optional
        The start of the byte range.  If unspecified, stop is interpreted as
        an offset from EOF.

    stop: int, optional
        The end of the byte range.  If unspecified, indicates EOF.

    Returns
    -------
    str
        A byte range specifier.

    Raises
    ------
    ValueError
        If both start and stop are None.

    """
    #
    # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
    #
    if start is None and stop is None:
        raise ValueError("make_range_string requires either a stop or start value")
    start_str = '' if start is None else str(start)
    stop_str = '' if stop is None else str(stop)
    return 'bytes=%s-%s' % (start_str, stop_str)


def parse_content_range(content_range):
    """Extract units, start, stop, and length from a content range header like "bytes 0-846981/846982".

    Assumes a properly formatted content-range header from S3.
    See werkzeug.http.parse_content_range_header for a more robust version.

    Parameters
    ----------
    content_range: str
        The content-range header to parse.

    Returns
    -------
    tuple (units: str, start: int, stop: int, length: int)
        The units and three integers from the content-range header.

    """
    units, numbers = content_range.split(' ', 1)
    # Named byte_range to avoid shadowing the builtin range().
    byte_range, length = numbers.split('/', 1)
    start, stop = byte_range.split('-', 1)
    return units, int(start), int(stop), int(length)


def safe_urlsplit(url):
    """This is a hack to prevent the regular urlsplit from splitting around question marks.

    A question mark (?) in a URL typically indicates the start of a
    querystring, and the standard library's urlparse function handles the
    querystring separately.  Unfortunately, question marks can also appear
    _inside_ the actual URL for some schemes like S3, GS.

    Replaces question marks with a special placeholder substring prior to
    splitting.  This work-around behavior is disabled in the unlikely event the
    placeholder is already part of the URL.  If this affects you, consider
    changing the value of QUESTION_MARK_PLACEHOLDER to something more suitable.

    See Also
    --------
    https://bugs.python.org/issue43882
    https://github.com/python/cpython/blob/3.7/Lib/urllib/parse.py
    https://github.com/RaRe-Technologies/smart_open/issues/285
    https://github.com/RaRe-Technologies/smart_open/issues/458
    smart_open/utils.py:QUESTION_MARK_PLACEHOLDER

    """
    sr = urllib.parse.urlsplit(url, allow_fragments=False)

    needs_workaround = (
        sr.scheme in WORKAROUND_SCHEMES
        and '?' in url
        and QUESTION_MARK_PLACEHOLDER not in url
    )
    if not needs_workaround:
        return sr

    #
    # This is safe because people will _almost never_ use the below
    # substring in a URL.  If they do, then they're asking for trouble,
    # and this special handling will simply not happen for them.
    #
    placeholder = QUESTION_MARK_PLACEHOLDER
    masked_url = url.replace('?', placeholder)
    sr = urllib.parse.urlsplit(masked_url, allow_fragments=False)

    # Restore the question marks; query and fragment are deliberately empty.
    path = sr.path.replace(placeholder, '?')
    return urllib.parse.SplitResult(sr.scheme, sr.netloc, path, '', '')