# BSD 3-Clause License; see https://github.com/scikit-hep/uproot4/blob/main/LICENSE """ This module represents external libraries that define "array-like" types so that users can choose an output format. The :doc:`uproot.interpretation.library.NumPy` library always works (NumPy is Uproot's only strict dependency) and outputs NumPy arrays for single arrays and dict/tuple/list as groups. Objects and jagged arrays are not efficiently represented, but it provides a zero-dependency least common denominator. The :doc:`uproot.interpretation.library.Awkward` library is the default and depends on Awkward Array (``awkward``). It is usually the best option, as it was designed for Uproot. The :doc:`uproot.interpretation.library.Pandas` library outputs ``pandas.Series`` for single arrays and ``pandas.DataFrame`` as groups. Objects are not efficiently represented, but some jagged arrays are encoded as ``pandas.MultiIndex``. Lazy arrays (:doc:`uproot.behaviors.TBranch.lazy`) can only use the :doc:`uproot.interpretation.library.Awkward` library. """ from __future__ import absolute_import import itertools import json import gc import numpy import uproot def _rename(name, context): if context is None or "rename" not in context: return name else: return context["rename"] class Library(object): """ Abstract superclass of array-library handlers, for libraries such as NumPy, Awkward Array, and Pandas. A library is used in the finalization and grouping stages of producing an array, converting it from internal representations like :doc:`uproot.interpretation.jagged.JaggedArray`, :doc:`uproot.interpretation.strings.StringArray`, and :doc:`uproot.interpretation.objects.ObjectArray` into the library's equivalents. It can also be required for concatenation and other late-stage operations on the output arrays. Libraries are usually selected by a string name. These names are held in a private registry in the :doc:`uproot.interpretation.library` module. """ @property def imported(self): """ Attempts to import the library and returns the imported module. """ raise AssertionError def empty(self, shape, dtype): """ Args: shape (tuple of int): NumPy array ``shape``. (The first item must be zero.) dtype (``numpy.dtype`` or its constructor argument): NumPy array ``dtype``. Returns an empty NumPy-like array. """ return numpy.empty(shape, dtype) def zeros(self, shape, dtype): """ Args: shape (tuple of int): NumPy array ``shape``. (The first item must be zero.) dtype (``numpy.dtype`` or its constructor argument): NumPy array ``dtype``. Returns a NumPy-like array of zeros. """ return numpy.zeros(shape, dtype) def finalize(self, array, branch, interpretation, entry_start, entry_stop): """ Args: array (array): Internal, temporary, trimmed array. If this is a NumPy array, it may be identical to the output array. branch (:doc:`uproot.behaviors.TBranch.TBranch`): The ``TBranch`` that is represented by this array. interpretation (:doc:`uproot.interpretation.Interpretation`): The interpretation that produced the ``array``. entry_start (int): First entry that is included in the output. entry_stop (int): FIrst entry that is excluded (one greater than the last entry that is included) in the output. Create a library-appropriate output array for this temporary ``array``. This array would represent one ``TBranch`` (i.e. not a "group"). """ raise AssertionError def group(self, arrays, expression_context, how): u""" Args: arrays (dict of str \u2192 array): Mapping from names to finalized array objets to combine into a group. expression_context (list of (str, dict) tuples): Expression strings and a dict of metadata about each. how (None, str, or container type): Library-dependent instructions for grouping. The only recognized container types are ``tuple``, ``list``, and ``dict``. Note that the container *type itself* must be passed as ``how``, not an instance of that type (i.e. ``how=tuple``, not ``how=()``). Combine the finalized ``arrays`` into a library-appropriate group type. """ if how is tuple: return tuple(arrays[name] for name, _ in expression_context) elif how is list: return [arrays[name] for name, _ in expression_context] elif how is dict or how is None: return dict( (_rename(name, c), arrays[name]) for name, c in expression_context ) else: raise TypeError( "for library {0}, how must be tuple, list, dict, or None (for " "dict)".format(self.name) ) def global_index(self, array, global_offset): """ Args: array (array): The library-appropriate array whose global index needs adjustment. global_offset (int): A number to add to the global index of ``array`` to correct it. Apply *in-place* corrections to the global index of ``array`` by adding ``global_offset``. Even though the operation is performed *in-place*, this method returns the ``array``. """ return array def concatenate(self, all_arrays): """ Args: all_arrays (list of arrays): A list of library-appropriate arrays that need to be concatenated. Returns a concatenated version of ``all_arrays``. """ raise AssertionError def __repr__(self): return repr(self.name) def __eq__(self, other): return type(_libraries[self.name]) is type(_libraries[other.name]) # noqa: E721 class NumPy(Library): u""" A :doc:`uproot.interpretation.library.Library` that presents ``TBranch`` data as NumPy arrays. The standard name for this library is ``"np"``. The single-``TBranch`` form for this library is a ``numpy.ndarray``. If the data are non-numerical, they will be converted into Python objects and stored in an array with ``dtype="O"``. This is inefficient, but it is the minimal-dependency option for Python. The "group" behavior for this library is: * ``how=dict`` or ``how=None``: a dict of str \u2192 array, mapping the names to arrays. * ``how=tuple``: a tuple of arrays, in the order requested. (Names are lost.) * ``how=list``: a list of arrays, in the order requested. (Names are lost.) Since NumPy arrays are not indexed, ``global_index`` has no effect. """ name = "np" @property def imported(self): import numpy return numpy def finalize(self, array, branch, interpretation, entry_start, entry_stop): if isinstance(array, uproot.interpretation.jagged.JaggedArray) and isinstance( array.content, uproot.interpretation.objects.StridedObjectArray, ): out = numpy.zeros(len(array), dtype=object) for i, x in enumerate(array): out[i] = numpy.zeros(len(x), dtype=object) for j, y in enumerate(x): out[i][j] = y return out elif isinstance( array, ( uproot.interpretation.jagged.JaggedArray, uproot.interpretation.strings.StringArray, uproot.interpretation.objects.ObjectArray, uproot.interpretation.objects.StridedObjectArray, ), ): out = numpy.zeros(len(array), dtype=object) for i, x in enumerate(array): out[i] = x return out else: return array def concatenate(self, all_arrays): if len(all_arrays) == 0: return all_arrays if isinstance(all_arrays[0], (tuple, list)): keys = uproot._util.range(len(all_arrays[0])) elif isinstance(all_arrays[0], dict): keys = list(all_arrays[0]) else: raise AssertionError(repr(all_arrays[0])) to_concatenate = dict((k, []) for k in keys) for arrays in all_arrays: for k in keys: to_concatenate[k].append(arrays[k]) concatenated = dict((k, numpy.concatenate(to_concatenate[k])) for k in keys) if isinstance(all_arrays[0], tuple): return tuple(concatenated[k] for k in keys) elif isinstance(all_arrays[0], list): return [concatenated[k] for k in keys] elif isinstance(all_arrays[0], dict): return concatenated def _strided_to_awkward(awkward, path, interpretation, data): contents = [] names = [] for name, member in interpretation.members: if not name.startswith("@"): p = name if len(path) != 0: p = path + "/" + name if isinstance(member, uproot.interpretation.objects.AsStridedObjects): contents.append(_strided_to_awkward(awkward, p, member, data)) else: contents.append( awkward.from_numpy( numpy.array(data[p]), regulararray=True, highlevel=False ) ) names.append(name) parameters = { "__record__": uproot.model.classname_decode(interpretation.model.__name__)[0] } length = len(data) if len(contents) == 0 else None return awkward.layout.RecordArray(contents, names, length, parameters=parameters) # FIXME: _object_to_awkward_json and _awkward_json_to_array are slow functions # with the right outputs to be replaced by compiled versions in awkward._io. def _object_to_awkward_json(form, obj): if form["class"] == "NumpyArray": return obj elif form["class"] == "RecordArray": out = {} for name, subform in form["contents"].items(): if not name.startswith("@"): if obj.has_member(name): out[name] = _object_to_awkward_json(subform, obj.member(name)) else: out[name] = _object_to_awkward_json(subform, getattr(obj, name)) return out elif form["class"][:15] == "ListOffsetArray": if form["parameters"].get("__array__") == "string": return obj elif form["parameters"].get("__array__") == "sorted_map": key_form = form["content"]["contents"][0] value_form = form["content"]["contents"][1] return [ ( _object_to_awkward_json(key_form, x), _object_to_awkward_json(value_form, y), ) for x, y in obj.items() ] else: subform = form["content"] return [_object_to_awkward_json(subform, x) for x in obj] elif form["class"] == "RegularArray": subform = form["content"] return [_object_to_awkward_json(subform, x) for x in obj] else: raise AssertionError(form["class"]) def _awkward_p(form): out = form["parameters"] out.pop("uproot", None) return out def _awkward_offsets(awkward, form, array): if isinstance(array, awkward.layout.EmptyArray): if form["offsets"] == "i32": return awkward.layout.Index32(numpy.zeros(1, dtype=numpy.int32)) elif form["offsets"] == "u32": return awkward.layout.IndexU32(numpy.zeros(1, dtype=numpy.uint32)) elif form["offsets"] == "i64": return awkward.layout.Index64(numpy.zeros(1, dtype=numpy.int64)) else: raise AssertionError(form["offsets"]) else: if form["offsets"] == "i32": return awkward.layout.Index32( numpy.asarray(array.offsets, dtype=numpy.int32) ) elif form["offsets"] == "u32": return awkward.layout.IndexU32( numpy.asarray(array.offsets, dtype=numpy.uint32) ) elif form["offsets"] == "i64": return awkward.layout.Index64( numpy.asarray(array.offsets, dtype=numpy.int64) ) else: raise AssertionError(form["offsets"]) def _awkward_json_to_array(awkward, form, array): if form["class"] == "NumpyArray": if isinstance(array, awkward.layout.EmptyArray): dtype = awkward.forms.Form.fromjson(json.dumps(form)).to_numpy() return awkward.layout.NumpyArray(numpy.empty(0, dtype=dtype)) else: return array elif form["class"] == "RecordArray": contents = [] names = [] for name, subform in form["contents"].items(): if not name.startswith("@"): if isinstance(array, awkward.layout.EmptyArray): contents.append(_awkward_json_to_array(awkward, subform, array)) else: contents.append( _awkward_json_to_array(awkward, subform, array[name]) ) names.append(name) length = len(array) if len(contents) == 0 else None return awkward.layout.RecordArray( contents, names, length, parameters=_awkward_p(form) ) elif form["class"][:15] == "ListOffsetArray": if form["parameters"].get("__array__") == "string": if isinstance(array, awkward.layout.EmptyArray): content = awkward.layout.NumpyArray( numpy.empty(0, dtype=numpy.uint8), parameters=_awkward_p(form["content"]), ) return awkward.layout.ListOffsetArray64( awkward.layout.Index64(numpy.array([0], dtype=numpy.uint8)), content, parameters=_awkward_p(form), ) else: content = _awkward_json_to_array( awkward, form["content"], array.content ) return type(array)(array.offsets, content, parameters=_awkward_p(form)) elif form["parameters"].get("__array__") == "sorted_map": offsets = _awkward_offsets(awkward, form, array) key_form = form["content"]["contents"][0] value_form = form["content"]["contents"][1] if isinstance(array, awkward.layout.EmptyArray): keys = _awkward_json_to_array(awkward, key_form, array) values = _awkward_json_to_array(awkward, value_form, array) content = awkward.layout.RecordArray( (keys, values), None, 0, parameters=_awkward_p(form["content"]), ) else: keys = _awkward_json_to_array(awkward, key_form, array.content["0"]) values = _awkward_json_to_array(awkward, value_form, array.content["1"]) length = len(array.content) if len(keys) == 0 else None content = awkward.layout.RecordArray( (keys, values), None, length, parameters=_awkward_p(form["content"]), ) cls = getattr(awkward.layout, form["class"]) return cls(offsets, content, parameters=_awkward_p(form)) else: offsets = _awkward_offsets(awkward, form, array) if isinstance(array, awkward.layout.EmptyArray): content = _awkward_json_to_array(awkward, form["content"], array) else: content = _awkward_json_to_array( awkward, form["content"], array.content ) cls = getattr(awkward.layout, form["class"]) return cls(offsets, content, parameters=_awkward_p(form)) elif form["class"] == "RegularArray": if isinstance(array, awkward.layout.EmptyArray): content = _awkward_json_to_array(awkward, form["content"], array) else: content = _awkward_json_to_array(awkward, form["content"], array.content) return awkward.layout.RegularArray( content, form["size"], parameters=_awkward_p(form) ) else: raise AssertionError(form["class"]) class Awkward(Library): u""" A :doc:`uproot.interpretation.library.Library` that presents ``TBranch`` data as Awkward Arrays. The standard name for this library is ``"ak"``. This is the default for all functions that require a :doc:`uproot.interpretation.library.Library`, though Uproot does not explicitly depend on Awkward Array. If you are confronted with a message that Awkward Array is not installed, either install ``awkward`` or select another library (likely :doc:`uproot.interpretation.library.NumPy`). Both the single-``TBranch`` and "group" forms for this library are ``ak.Array``, though groups are always arrays of records. Awkward Array was originally developed for Uproot, so the data structures are usually optimial for Uproot data. The "group" behavior for this library is: * ``how=None``: an array of Awkward records. * ``how=dict``: a dict of str \u2192 array, mapping the names to arrays. * ``how=tuple``: a tuple of arrays, in the order requested. (Names are lost.) * ``how=list``: a list of arrays, in the order requested. (Names are lost.) Since Awkward arrays are not indexed, ``global_index`` has no effect. """ name = "ak" @property def imported(self): return uproot.extras.awkward() def finalize(self, array, branch, interpretation, entry_start, entry_stop): awkward = self.imported if isinstance(array, awkward.layout.Content): return awkward.Array(array) elif isinstance(array, uproot.interpretation.objects.StridedObjectArray): return awkward.Array( _strided_to_awkward(awkward, "", array.interpretation, array.array) ) elif isinstance(array, uproot.interpretation.jagged.JaggedArray) and isinstance( array.content, uproot.interpretation.objects.StridedObjectArray ): content = _strided_to_awkward( awkward, "", array.content.interpretation, array.content.array ) if issubclass(array.offsets.dtype.type, numpy.int32): offsets = awkward.layout.Index32(array.offsets) layout = awkward.layout.ListOffsetArray32(offsets, content) else: offsets = awkward.layout.Index64(array.offsets) layout = awkward.layout.ListOffsetArray64(offsets, content) return awkward.Array(layout) elif isinstance(array, uproot.interpretation.jagged.JaggedArray): content = awkward.from_numpy( array.content, regulararray=True, highlevel=False ) if issubclass(array.offsets.dtype.type, numpy.int32): offsets = awkward.layout.Index32(array.offsets) layout = awkward.layout.ListOffsetArray32(offsets, content) else: offsets = awkward.layout.Index64(array.offsets) layout = awkward.layout.ListOffsetArray64(offsets, content) return awkward.Array(layout) elif isinstance(array, uproot.interpretation.strings.StringArray): content = awkward.layout.NumpyArray( numpy.frombuffer(array.content, dtype=numpy.dtype(numpy.uint8)), parameters={"__array__": "char"}, ) if issubclass(array.offsets.dtype.type, numpy.int32): offsets = awkward.layout.Index32(array.offsets) layout = awkward.layout.ListOffsetArray32( offsets, content, parameters={"__array__": "string"} ) elif issubclass(array.offsets.dtype.type, numpy.uint32): offsets = awkward.layout.IndexU32(array.offsets) layout = awkward.layout.ListOffsetArrayU32( offsets, content, parameters={"__array__": "string"} ) elif issubclass(array.offsets.dtype.type, numpy.int64): offsets = awkward.layout.Index64(array.offsets) layout = awkward.layout.ListOffsetArray64( offsets, content, parameters={"__array__": "string"} ) else: raise AssertionError(repr(array.offsets.dtype)) return awkward.Array(layout) elif isinstance(interpretation, uproot.interpretation.objects.AsObjects): try: form = json.loads( interpretation.awkward_form(interpretation.branch.file).tojson( verbose=True ) ) except uproot.interpretation.objects.CannotBeAwkward as err: raise ValueError( """cannot produce Awkward Arrays for interpretation {0} because {1} instead, try library="np" instead of library="ak" or globally set uproot.default_library in file {2} in object {3}""".format( repr(interpretation), err.because, interpretation.branch.file.file_path, interpretation.branch.object_path, ) ) unlabeled = awkward.from_iter( (_object_to_awkward_json(form, x) for x in array), highlevel=False ) return awkward.Array(_awkward_json_to_array(awkward, form, unlabeled)) elif array.dtype.names is not None: length, shape = array.shape[0], array.shape[1:] array = array.reshape(-1) contents = [] for name in array.dtype.names: contents.append( awkward.from_numpy( numpy.array(array[name]), regulararray=True, highlevel=False ) ) if len(contents) != 0: length = None out = awkward.layout.RecordArray(contents, array.dtype.names, length) for size in shape[::-1]: out = awkward.layout.RegularArray(out, size) return awkward.Array(out) else: return awkward.from_numpy(array, regulararray=True) def group(self, arrays, expression_context, how): awkward = self.imported if how is tuple: return tuple(arrays[name] for name, _ in expression_context) elif how is list: return [arrays[name] for name, _ in expression_context] elif how is dict: return dict( (_rename(name, c), arrays[name]) for name, c in expression_context ) elif how is None: if len(expression_context) == 0: return awkward.Array(awkward.layout.RecordArray([], keys=[])) else: return awkward.Array( dict( (_rename(name, c), arrays[name]) for name, c in expression_context ) ) elif how == "zip": nonjagged = [] offsets = [] jaggeds = [] renamed_arrays = {} for name, context in expression_context: array = renamed_arrays[_rename(name, context)] = arrays[name] if context["is_jagged"]: if len(offsets) == 0: offsets.append(array.layout.offsets) jaggeds.append([_rename(name, context)]) else: for o, j in zip(offsets, jaggeds): if numpy.array_equal(array.layout.offsets, o): j.append(_rename(name, context)) break else: offsets.append(array.layout.offsets) jaggeds.append([_rename(name, context)]) else: nonjagged.append(_rename(name, context)) out = None if len(nonjagged) != 0: if len(nonjagged) == 0: out = awkward.Array(awkward.layout.RecordArray([], keys=[])) else: out = awkward.Array( dict((name, renamed_arrays[name]) for name in nonjagged), ) for number, jagged in enumerate(jaggeds): cut = len(jagged[0]) for name in jagged: cut = min(cut, len(name)) while cut > 0 and ( name[:cut] != jagged[0][:cut] or name[cut - 1] not in ("_", ".", "/") ): cut -= 1 if cut == 0: break if cut == 0: common = "jagged{0}".format(number) if len(jagged) == 0: subarray = awkward.Array( awkward.layout.RecordArray([], keys=[]) ) else: subarray = awkward.zip( dict((name, renamed_arrays[name]) for name in jagged) ) else: common = jagged[0][:cut].strip("_./") if len(jagged) == 0: subarray = awkward.Array( awkward.layout.RecordArray([], keys=[]) ) else: subarray = awkward.zip( dict( (name[cut:].strip("_./"), renamed_arrays[name]) for name in jagged ) ) if out is None: out = awkward.Array({common: subarray}) else: for name in jagged: out = awkward.with_field(out, subarray, common) return out else: raise TypeError( 'for library {0}, how must be tuple, list, dict, "zip" for ' "a record array with jagged arrays zipped, if possible, or " "None, for an unzipped record array".format(self.name) ) def concatenate(self, all_arrays): awkward = self.imported if len(all_arrays) == 0: return all_arrays if isinstance(all_arrays[0], (tuple, list)): keys = uproot._util.range(len(all_arrays[0])) elif isinstance(all_arrays[0], dict): keys = list(all_arrays[0]) else: return awkward.concatenate(all_arrays) to_concatenate = dict((k, []) for k in keys) for arrays in all_arrays: for k in keys: to_concatenate[k].append(arrays[k]) concatenated = dict((k, awkward.concatenate(to_concatenate[k])) for k in keys) if isinstance(all_arrays[0], tuple): return tuple(concatenated[k] for k in keys) elif isinstance(all_arrays[0], list): return [concatenated[k] for k in keys] elif isinstance(all_arrays[0], dict): return concatenated def _pandas_rangeindex(): import pandas return (getattr(pandas, "RangeIndex", pandas.Int64Index), pandas.Int64Index) def _strided_to_pandas(path, interpretation, data, arrays, columns): for name, member in interpretation.members: if not name.startswith("@"): p = path + (name,) if isinstance(member, uproot.interpretation.objects.AsStridedObjects): _strided_to_pandas(p, member, data, arrays, columns) else: arrays.append(data["/".join(p)]) columns.append(p) def _pandas_basic_index(pandas, entry_start, entry_stop): if hasattr(pandas, "RangeIndex"): return pandas.RangeIndex(entry_start, entry_stop) else: return pandas.Int64Index(uproot._util.range(entry_start, entry_stop)) def _pandas_only_series(pandas, original_arrays, expression_context): arrays = {} names = [] for name, context in expression_context: if isinstance(original_arrays[name], pandas.Series): arrays[_rename(name, context)] = original_arrays[name] names.append(_rename(name, context)) else: df = original_arrays[name] for subname in df.columns: if df.leaflist: if isinstance(subname, tuple): path = (_rename(name, context),) + subname else: path = (_rename(name, context), subname) else: path = _rename(name, context) + subname arrays[path] = df[subname] names.append(path) return arrays, names def _pandas_memory_efficient(pandas, series, names): # Pandas copies the data, so at least feed columns one by one gc.collect() out = None for name in names: if out is None: out = series[name].to_frame(name=name) else: out[name] = series[name] del series[name] if out is None: return pandas.DataFrame(data=series, columns=names) else: return out class Pandas(Library): u""" A :doc:`uproot.interpretation.library.Library` that presents ``TBranch`` data as Pandas Series and DataFrames. The standard name for this library is ``"pd"``. The single-``TBranch`` (with a single ``TLeaf``) form for this library is ``pandas.Series``, and the "group" form is ``pandas.DataFrame``. The "group" behavior for this library is: * ``how=None`` or a string: passed to ``pandas.merge`` as its ``how`` parameter, which would be relevant if jagged arrays with different multiplicity are requested. * ``how=dict``: a dict of str \u2192 array, mapping the names to ``pandas.Series``. * ``how=tuple``: a tuple of ``pandas.Series``, in the order requested. (Names are assigned to the ``pandas.Series``.) * ``how=list``: a list of ``pandas.Series``, in the order requested. (Names are assigned to the ``pandas.Series``.) Pandas Series and DataFrames are indexed, so ``global_index`` adjusts them. """ name = "pd" @property def imported(self): return uproot.extras.pandas() def finalize(self, array, branch, interpretation, entry_start, entry_stop): pandas = self.imported if isinstance(array, uproot.interpretation.objects.StridedObjectArray): arrays = [] columns = [] _strided_to_pandas((), array.interpretation, array.array, arrays, columns) maxlen = max(len(x) for x in columns) if maxlen == 1: columns = [x[0] for x in columns] else: columns = pandas.MultiIndex.from_tuples( [x + ("",) * (maxlen - len(x)) for x in columns] ) index = _pandas_basic_index(pandas, entry_start, entry_stop) out = pandas.DataFrame( dict(zip(columns, arrays)), columns=columns, index=index ) out.leaflist = maxlen != 1 return out elif isinstance(array, uproot.interpretation.jagged.JaggedArray) and isinstance( array.content, uproot.interpretation.objects.StridedObjectArray ): index = pandas.MultiIndex.from_arrays( array.parents_localindex(entry_start, entry_stop), names=["entry", "subentry"], ) arrays = [] columns = [] _strided_to_pandas( (), array.content.interpretation, array.content.array, arrays, columns ) maxlen = max(len(x) for x in columns) if maxlen == 1: columns = [x[0] for x in columns] else: columns = pandas.MultiIndex.from_tuples( [x + ("",) * (maxlen - len(x)) for x in columns] ) out = pandas.DataFrame( dict(zip(columns, arrays)), columns=columns, index=index ) out.leaflist = maxlen != 1 return out elif isinstance(array, uproot.interpretation.jagged.JaggedArray): index = pandas.MultiIndex.from_arrays( array.parents_localindex(entry_start, entry_stop), names=["entry", "subentry"], ) return pandas.Series(array.content, index=index) elif isinstance( array, ( uproot.interpretation.strings.StringArray, uproot.interpretation.objects.ObjectArray, ), ): out = numpy.zeros(len(array), dtype=object) for i, x in enumerate(array): out[i] = x index = _pandas_basic_index(pandas, entry_start, entry_stop) return pandas.Series(out, index=index) elif array.dtype.names is not None and len(array.shape) != 1: names = [] arrays = {} for n in array.dtype.names: for tup in itertools.product( *[uproot._util.range(d) for d in array.shape[1:]] ): name = (n + "".join("[" + str(x) + "]" for x in tup),) names.append(name) arrays[name] = array[n][(slice(None),) + tup] index = _pandas_basic_index(pandas, entry_start, entry_stop) out = pandas.DataFrame(arrays, columns=names, index=index) out.leaflist = True return out elif array.dtype.names is not None: columns = pandas.MultiIndex.from_tuples([(x,) for x in array.dtype.names]) arrays = dict((y, array[x]) for x, y in zip(array.dtype.names, columns)) index = _pandas_basic_index(pandas, entry_start, entry_stop) out = pandas.DataFrame(arrays, columns=columns, index=index) out.leaflist = True return out elif len(array.shape) != 1: names = [] arrays = {} for tup in itertools.product( *[uproot._util.range(d) for d in array.shape[1:]] ): name = "".join("[" + str(x) + "]" for x in tup) names.append(name) arrays[name] = array[(slice(None),) + tup] index = _pandas_basic_index(pandas, entry_start, entry_stop) out = pandas.DataFrame(arrays, columns=names, index=index) out.leaflist = False return out else: index = _pandas_basic_index(pandas, entry_start, entry_stop) return pandas.Series(array, index=index) def group(self, arrays, expression_context, how): pandas = self.imported if how is tuple: return tuple(arrays[name] for name, _ in expression_context) elif how is list: return [arrays[name] for name, _ in expression_context] elif how is dict: return dict( (_rename(name, c), arrays[name]) for name, c in expression_context ) elif uproot._util.isstr(how) or how is None: arrays, names = _pandas_only_series(pandas, arrays, expression_context) if any(isinstance(x, tuple) for x in names): longest = max(len(x) for x in names if isinstance(x, tuple)) newarrays, newnames = {}, [] for x in names: if not isinstance(x, tuple): y = (x,) + (None,) * (longest - 1) else: y = x + (None,) * (longest - len(x)) newarrays[y] = arrays[x] newnames.append(y) arrays = newarrays names = pandas.MultiIndex.from_tuples(newnames) if all(isinstance(x.index, _pandas_rangeindex()) for x in arrays.values()): return _pandas_memory_efficient(pandas, arrays, names) indexes = [] groups = [] for name in names: array = arrays[name] if isinstance(array.index, pandas.MultiIndex): for index, group in zip(indexes, groups): if numpy.array_equal(array.index, index): group.append(name) break else: indexes.append(array.index) groups.append([name]) if how is None: flat_index = None dfs = [[] for x in indexes] group_names = [[] for x in indexes] for index, group, df, gn in zip(indexes, groups, dfs, group_names): for name in names: array = arrays[name] if isinstance(array.index, _pandas_rangeindex()): if flat_index is None or len(flat_index) != len( array.index ): flat_index = pandas.MultiIndex.from_arrays( [array.index] ) df.append( pandas.Series(array.values, index=flat_index).reindex( index ) ) gn.append(name) elif name in group: df.append(array) gn.append(name) out = [] for index, df, gn in zip(indexes, dfs, group_names): out.append( pandas.DataFrame( data=dict(zip(gn, df)), index=index, columns=gn ) ) if len(out) == 1: return out[0] else: return tuple(out) else: out = None for index, group in zip(indexes, groups): only = dict((name, arrays[name]) for name in group) df = pandas.DataFrame(data=only, index=index, columns=group) if out is None: out = df else: out = pandas.merge( out, df, how=how, left_index=True, right_index=True ) flat_names = [ name for name in names if isinstance(arrays[name].index, _pandas_rangeindex()) ] if len(flat_names) > 0: flat_index = pandas.MultiIndex.from_arrays( [arrays[flat_names[0]].index] ) only = dict( (name, pandas.Series(arrays[name].values, index=flat_index)) for name in flat_names ) df = pandas.DataFrame( data=only, index=flat_index, columns=flat_names ) out = pandas.merge( df.reindex(out.index), out, how=how, left_index=True, right_index=True, ) return out else: raise TypeError( "for library {0}, how must be tuple, list, dict, str (for " "pandas.merge's 'how' parameter, or None (for one or more" "DataFrames without merging)".format(self.name) ) def global_index(self, arrays, global_offset): if type(arrays.index).__name__ == "MultiIndex": if hasattr(arrays.index.levels[0], "arrays"): index = arrays.index.levels[0].arrays # pandas>=0.24.0 else: index = arrays.index.levels[0].values # pandas<0.24.0 numpy.add(index, global_offset, out=index) elif type(arrays.index).__name__ == "RangeIndex": if hasattr(arrays.index, "start") and hasattr(arrays.index, "stop"): index_start = arrays.index.start # pandas>=0.25.0 index_stop = arrays.index.stop else: index_start = arrays.index._start # pandas<0.25.0 index_stop = arrays.index._stop arrays.index = type(arrays.index)( index_start + global_offset, index_stop + global_offset ) else: if hasattr(arrays.index, "arrays"): index = arrays.index.arrays # pandas>=0.24.0 else: index = arrays.index.values # pandas<0.24.0 numpy.add(index, global_offset, out=index) return arrays def concatenate(self, all_arrays): pandas = self.imported if len(all_arrays) == 0: return all_arrays if isinstance(all_arrays[0], (tuple, list)): keys = uproot._util.range(len(all_arrays[0])) elif isinstance(all_arrays[0], dict): keys = list(all_arrays[0]) else: return pandas.concat(all_arrays) to_concatenate = dict((k, []) for k in keys) for arrays in all_arrays: for k in keys: to_concatenate[k].append(arrays[k]) concatenated = dict((k, pandas.concat(to_concatenate[k])) for k in keys) if isinstance(all_arrays[0], tuple): return tuple(concatenated[k] for k in keys) elif isinstance(all_arrays[0], list): return [concatenated[k] for k in keys] elif isinstance(all_arrays[0], dict): return concatenated _libraries = { NumPy.name: NumPy(), Awkward.name: Awkward(), Pandas.name: Pandas(), } _libraries["numpy"] = _libraries[NumPy.name] _libraries["Numpy"] = _libraries[NumPy.name] _libraries["NumPy"] = _libraries[NumPy.name] _libraries["NUMPY"] = _libraries[NumPy.name] _libraries["awkward1"] = _libraries[Awkward.name] _libraries["Awkward1"] = _libraries[Awkward.name] _libraries["AWKWARD1"] = _libraries[Awkward.name] _libraries["awkward"] = _libraries[Awkward.name] _libraries["Awkward"] = _libraries[Awkward.name] _libraries["AWKWARD"] = _libraries[Awkward.name] _libraries["pandas"] = _libraries[Pandas.name] _libraries["Pandas"] = _libraries[Pandas.name] _libraries["PANDAS"] = _libraries[Pandas.name] def _regularize_library(library): if isinstance(library, Library): if library.name in _libraries: return _libraries[library.name] else: raise ValueError( "library {0} ({1}) cannot be used in this function".format( type(library).__name__, repr(library.name) ) ) elif isinstance(library, type) and issubclass(library, Library): if library().name in _libraries: return _libraries[library().name] else: raise ValueError( "library {0} ({1}) cannot be used in this function".format( library.__name__, repr(library().name) ) ) else: try: return _libraries[library] except KeyError: raise ValueError( """library {0} not recognized (for this function); """ """try "np" (NumPy), "ak" (Awkward Array), or "pd" (Pandas) """ """instead""".format(repr(library)) ) _libraries_lazy = {Awkward.name: _libraries[Awkward.name]} _libraries_lazy["awkward1"] = _libraries_lazy[Awkward.name] _libraries_lazy["Awkward1"] = _libraries_lazy[Awkward.name] _libraries_lazy["AWKWARD1"] = _libraries_lazy[Awkward.name] _libraries_lazy["awkward"] = _libraries_lazy[Awkward.name] _libraries_lazy["Awkward"] = _libraries_lazy[Awkward.name] _libraries_lazy["AWKWARD"] = _libraries_lazy[Awkward.name] def _regularize_library_lazy(library): if isinstance(library, Library): if library.name in _libraries_lazy: return _libraries_lazy[library.name] else: raise ValueError( "library {0} ({1}) cannot be used in this function".format( type(library).__name__, repr(library.name) ) ) elif isinstance(library, type) and issubclass(library, Library): if library().name in _libraries_lazy: return _libraries_lazy[library().name] else: raise ValueError( "library {0} ({1}) cannot be used in this function".format( library.__name__, repr(library().name) ) ) else: try: return _libraries_lazy[library] except KeyError: raise ValueError( """library {0} not recognized (for this function); """ """try "ak" (Awkward Array) """ """instead""".format(repr(library)) )