# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import codecs
import gzip
import json
import os
import sys
import tempfile
import unicodedata
from collections.abc import Iterable
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING, Literal, Optional

from hypothesis.configuration import storage_directory
from hypothesis.control import _current_build_context
from hypothesis.errors import InvalidArgument
from hypothesis.internal.intervalsets import IntervalSet, IntervalsT

if TYPE_CHECKING:
    from typing import TypeAlias

# See https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
#
# Every string accepted as a Unicode general category name in this module:
# the one-letter major classes ("L", "M", ...) and their two-letter
# subcategories ("Lu", "Ll", ...).
CategoryName: "TypeAlias" = Literal[
    "L",  #  Letter
    "Lu",  # Letter, uppercase
    "Ll",  # Letter, lowercase
    "Lt",  # Letter, titlecase
    "Lm",  # Letter, modifier
    "Lo",  # Letter, other
    "M",  #  Mark
    "Mn",  # Mark, nonspacing
    "Mc",  # Mark, spacing combining
    "Me",  # Mark, enclosing
    "N",  #  Number
    "Nd",  # Number, decimal digit
    "Nl",  # Number, letter
    "No",  # Number, other
    "P",  #  Punctuation
    "Pc",  # Punctuation, connector
    "Pd",  # Punctuation, dash
    "Ps",  # Punctuation, open
    "Pe",  # Punctuation, close
    "Pi",  # Punctuation, initial quote
    "Pf",  # Punctuation, final quote
    "Po",  # Punctuation, other
    "S",  #  Symbol
    "Sm",  # Symbol, math
    "Sc",  # Symbol, currency
    "Sk",  # Symbol, modifier
    "So",  # Symbol, other
    "Z",  #  Separator
    "Zs",  # Separator, space
    "Zl",  # Separator, line
    "Zp",  # Separator, paragraph
    "C",  #  Other
    "Cc",  # Other, control
    "Cf",  # Other, format
    "Cs",  # Other, surrogate
    "Co",  # Other, private use
    "Cn",  # Other, not assigned
]
# Any iterable of category names (list, tuple, set, ...).
Categories: "TypeAlias" = Iterable[CategoryName]
# A tuple of category names, of any length.
CategoriesTuple: "TypeAlias" = tuple[CategoryName, ...]


def charmap_file(fname: str = "charmap") -> Path:
    """Return the location of the gzipped JSON cache file called `fname`.

    The path is resolved by ``storage_directory`` from the ``unicode_data``
    component, the Unicode database version of the running interpreter
    (``unicodedata.unidata_version``), and ``<fname>.json.gz``.
    """
    unicode_version = unicodedata.unidata_version
    basename = f"{fname}.json.gz"
    return storage_directory("unicode_data", unicode_version, basename)


# Process-wide memo for charmap(); stays None until the table is first built.
_charmap = None


def charmap() -> dict[CategoryName, IntervalsT]:
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    The table is built at most once per process and memoised in the
    module-level ``_charmap``.  On first use it is loaded from the on-disk
    cache at ``charmap_file()``; if that fails for any reason it is computed
    from ``unicodedata`` and written back to the cache on a best-effort basis.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            with gzip.GzipFile(f, "rb") as d:
                tmp_charmap = dict(json.load(d))

        except Exception:
            # This loop is reduced to using only local variables for performance;
            # indexing and updating containers is a ~3x slowdown.  This doesn't fix
            # https://github.com/HypothesisWorks/hypothesis/issues/2108 but it helps.
            category = unicodedata.category  # Local variable -> ~20% speedup!
            tmp_charmap = {}
            last_cat = category(chr(0))
            last_start = 0
            for i in range(1, sys.maxunicode + 1):
                cat = category(chr(i))
                if cat != last_cat:
                    tmp_charmap.setdefault(last_cat, []).append((last_start, i - 1))
                    last_cat, last_start = cat, i
            tmp_charmap.setdefault(last_cat, []).append((last_start, sys.maxunicode))

            tmpfile = None
            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                tmpdir.mkdir(exist_ok=True, parents=True)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as o:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    o.write(result.encode())

                os.renames(tmpfile, f)
            except Exception:
                # Persisting the cache is optional, so failures are ignored -
                # but mkstemp() leaves deletion to us, so don't let a failed
                # write or rename litter the tmp directory.
                if tmpfile is not None:
                    try:
                        os.unlink(tmpfile)
                    except OSError:
                        pass

        # convert between lists and tuples
        candidate = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # Check that each value is a tuple of 2-tuples (that is, tuples of
        # length 2), that both elements of each are integers, and that the
        # flattened endpoints are in ascending order.
        for vs in candidate.values():
            ints = list(sum(vs, ()))
            assert all(isinstance(x, int) for x in ints)
            assert ints == sorted(ints)
            assert all(len(tup) == 2 for tup in vs)
        # Only publish the table once it has passed validation, so that a
        # failed check cannot leave bad data memoised for later calls.
        _charmap = candidate

    assert _charmap is not None
    return _charmap


@lru_cache(maxsize=None)
def intervals_from_codec(codec_name: str) -> IntervalSet:  # pragma: no cover
    """Return an IntervalSet of characters which are part of this codec.

    `codec_name` must already be the canonical name reported by
    ``codecs.lookup``.  The result is loaded from the on-disk cache at
    ``charmap_file(f"codec-{codec_name}")`` when possible; otherwise every
    codepoint is test-encoded and the result is written to the cache on a
    best-effort basis.
    """
    assert codec_name == codecs.lookup(codec_name).name
    fname = charmap_file(f"codec-{codec_name}")
    try:
        with gzip.GzipFile(fname) as gzf:
            encodable_intervals = json.load(gzf)

    except Exception:
        loaded_from_cache = False
        # This loop is kinda slow, but hopefully we don't need to do it very often!
        encodable_intervals = []
        for i in range(sys.maxunicode + 1):
            try:
                chr(i).encode(codec_name)
            except Exception:  # usually _but not always_ UnicodeEncodeError
                pass
            else:
                encodable_intervals.append((i, i))
    else:
        loaded_from_cache = True

    res = IntervalSet(encodable_intervals)
    # Union with itself to get the merged form of the intervals.
    res = res.union(res)
    if loaded_from_cache:
        # The cache file is already in place; rewriting it would only waste
        # I/O (and the rename can fail where the destination must not exist).
        return res

    tmpfile = None
    try:
        # Write the Unicode table atomically
        tmpdir = storage_directory("tmp")
        tmpdir.mkdir(exist_ok=True, parents=True)
        fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
        os.close(fd)
        # Explicitly set the mtime to get reproducible output
        with gzip.GzipFile(tmpfile, "wb", mtime=1) as o:
            o.write(json.dumps(res.intervals).encode())
        os.renames(tmpfile, fname)
    except Exception:
        # Persisting the cache is optional, so failures are ignored - but
        # mkstemp() leaves deletion to us, so remove the orphaned temp file.
        if tmpfile is not None:
            try:
                os.unlink(tmpfile)
            except OSError:
                pass
    return res


# Process-wide memo for categories(); stays None until first computed.
_categories: Optional[Categories] = None


def categories() -> Categories:
    """Return a tuple of all Unicode categories, in a normalised order.

    The categories are the keys of ``charmap()``, ordered by ascending number
    of codepoint intervals, except that "Cc" and "Cs" are always moved to the
    end.  The tuple is computed once and memoised in ``_categories``.

    >>> categories() # doctest: +ELLIPSIS
    ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
    """
    global _categories
    if _categories is not None:
        return _categories

    table = charmap()
    ordered = sorted(table, key=lambda name: len(table[name]))
    # "Cc" (Other, Control) and "Cs" (Other, Surrogate) always go last.
    trailing = ("Cc", "Cs")
    for name in trailing:
        ordered.remove(name)
    ordered.extend(trailing)
    _categories = tuple(ordered)
    return _categories


def as_general_categories(cats: Categories, name: str = "cats") -> CategoriesTuple:
    """Return a tuple of Unicode categories in a normalised order.

    This function expands one-letter designations of a major class to include
    all subclasses:

    >>> as_general_categories(['N'])
    ('Nd', 'Nl', 'No')

    See section 4.5 of the Unicode standard for more on classes:
    https://www.unicode.org/versions/Unicode10.0.0/ch04.pdf

    `name` is only used in the error message, to identify the offending
    argument.

    If the collection ``cats`` includes any element that is neither a major
    class nor one of the categories returned by ``categories()``,
    InvalidArgument is raised.
    """
    major_classes = ("L", "M", "N", "P", "S", "Z", "C")
    cs = categories()
    if hasattr(cats, "__next__"):
        # `cats` is traversed twice below; a one-shot iterator would be
        # exhausted by the first pass, silently skipping both the expansion
        # of major classes and the validation.
        cats = tuple(cats)
    out = set(cats)
    for c in cats:
        if c in major_classes:
            # Replace the major class by all of its two-letter subclasses.
            out.discard(c)
            out.update(x for x in cs if x.startswith(c))
        elif c not in cs:
            raise InvalidArgument(
                f"In {name}={cats!r}, {c!r} is not a valid Unicode category."
            )
    # Emit in the canonical order of categories(), dropping duplicates.
    return tuple(c for c in cs if c in out)


# Memoised results of _query_for_key(), keyed by the unordered set of
# categories.  Seeded with the empty set, which covers no codepoints.
category_index_cache: dict[frozenset[CategoryName], IntervalsT] = {frozenset(): ()}


def _category_key(cats: Optional[Iterable[str]]) -> CategoriesTuple:
    """Return a normalised tuple of the Unicode categories that are in `cats`,
    in the order used by ``categories()``.

    If `cats` is None then default to including all categories.
    Any item in `cats` that is not one of ``categories()`` will be excluded.

    >>> _category_key(['Lu', 'Me', 'Cs'])
    ('Me', 'Lu', 'Cs')
    """
    cs = categories()
    if cats is None:
        return tuple(cs)
    if hasattr(cats, "__next__"):
        # Each `in` test below would advance a one-shot iterator, dropping
        # every item it skips over - materialise it first.
        cats = tuple(cats)
    return tuple(c for c in cs if c in cats)


def _query_for_key(key: Categories) -> IntervalsT:
    """Return the codepoint intervals covering every character that belongs
    to at least one of the categories in `key`.

    Results are memoised in ``category_index_cache`` unless the current build
    context's provider sets ``avoid_realization``.

    >>> _query_for_key(categories())
    ((0, 1114111),)
    >>> _query_for_key(('Zl', 'Zp', 'Co'))
    ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    key = tuple(key)
    # The cache is keyed on the unordered set of categories, so that different
    # orderings of the same categories share one entry.
    cache_key = frozenset(key)
    context = _current_build_context.value
    use_cache = context is None or not context.data.provider.avoid_realization
    if use_cache:
        if cache_key in category_index_cache:
            return category_index_cache[cache_key]
    elif not key:  # pragma: no cover  # only on alternative backends
        return ()
    assert key
    if set(key) == set(categories()):
        # Every category was requested, so the answer is all of Unicode.
        result = IntervalSet([(0, sys.maxunicode)])
    else:
        # Recurse on all but the last category, then add the last one.
        earlier = IntervalSet(_query_for_key(key[:-1]))
        result = earlier.union(IntervalSet(charmap()[key[-1]]))
    assert isinstance(result, IntervalSet)
    intervals = result.intervals
    if use_cache:
        category_index_cache[cache_key] = intervals
    return intervals


# Memoised results of query().  The key is (category key, min codepoint,
# max codepoint, included-character intervals, excluded-character intervals).
limited_category_index_cache: dict[
    tuple[CategoriesTuple, int, int, IntervalsT, IntervalsT], IntervalSet
] = {}


def query(
    *,
    categories: Optional[Categories] = None,
    min_codepoint: Optional[int] = None,
    max_codepoint: Optional[int] = None,
    include_characters: str = "",
    exclude_characters: str = "",
) -> IntervalSet:
    """Return an IntervalSet covering the codepoints for all characters
    that meet the criteria.

    A character is selected if it is in one of `categories` (all categories
    when None) and lies within ``[min_codepoint, max_codepoint]`` (defaulting
    to 0 and ``sys.maxunicode``), or if it appears in `include_characters`.
    Characters in `exclude_characters` are then removed.  Results are memoised
    in ``limited_category_index_cache`` unless the current build context's
    provider sets ``avoid_realization``.

    >>> query().intervals
    ((0, 1114111),)
    >>> query(min_codepoint=0, max_codepoint=128).intervals
    ((0, 128),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu']).intervals
    ((65, 90),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'],
    ...       include_characters='☃').intervals
    ((65, 90), (9731, 9731))
    """
    lo = 0 if min_codepoint is None else min_codepoint
    hi = sys.maxunicode if max_codepoint is None else max_codepoint
    catkey = _category_key(categories)
    included = IntervalSet.from_string(include_characters or "")
    excluded = IntervalSet.from_string(exclude_characters or "")
    qkey = (catkey, lo, hi, included.intervals, excluded.intervals)

    context = _current_build_context.value
    use_cache = context is None or not context.data.provider.avoid_realization
    if use_cache and qkey in limited_category_index_cache:
        return limited_category_index_cache[qkey]

    # Clip each category interval to [lo, hi], dropping those fully outside.
    clipped = [
        (max(start, lo), min(end, hi))
        for start, end in _query_for_key(catkey)
        if end >= lo and start <= hi
    ]
    result = (IntervalSet(clipped) | included) - excluded
    if use_cache:
        limited_category_index_cache[qkey] = result
    return result