# BSD 3-Clause License; see https://github.com/scikit-hep/uproot4/blob/main/LICENSE """ This module defines an :doc:`uproot.interpretation.Interpretation` and temporary array for string data. Note that :doc:`uproot.interpretation.strings.AsStrings` is an interpretation for top-level strings, but :doc:`uproot.containers.AsString` can be nested within any other :doc:`uproot.containers.AsContainer`. The :doc:`uproot.interpretation.strings.StringArray` class only holds data while an array is being built from ``TBaskets``. Its final form is determined by the :doc:`uproot.interpretation.library.Library`. """ from __future__ import absolute_import import struct import numpy import uproot _string_4byte_size = struct.Struct(">I") class AsStrings(uproot.interpretation.Interpretation): """ Args: header_bytes (int): Number of bytes to skip at the beginning of each entry. length_bytes ("1-5" or "4"): Method used to determine the length of a string: "1-5" means one byte if the length is less than 256, otherwise the true length is in the next four bytes; "4" means always four bytes. typename (None or str): If None, construct a plausible C++ typename. Otherwise, take the suggestion as given. original (None, :doc:`uproot.model.Model`, or :doc:`uproot.containers.Container`): If this interpretation is derived from :ref:`uproot.interpretation.objects.AsObjects.simplify`, this is a reminder of the original :ref:`uproot.interpretation.objects.AsObjects.model`. An :doc:`uproot.interpretation.Interpretation` for an array of strings. This cannot be nested within other :doc:`uproot.interpretation.Interpretation` objects; it can only represent a ``TBranch`` that only contains strings (not strings within ``std::vector``, for instance). Note that the :doc:`uproot.containers.AsString` class is for strings nested within other objects. (:ref:`uproot.interpretation.objects.AsObjects.simplify` converts an :doc:`uproot.interpretation.objects.AsObjects` of :doc:`uproot.containers.AsString` into a :doc:`uproot.interpretation.strings.AsStrings`.) """ def __init__( self, header_bytes=0, length_bytes="1-5", typename=None, original=None ): self._header_bytes = header_bytes if length_bytes in ("1-5", "4"): self._length_bytes = length_bytes else: raise ValueError("length_bytes must be '1-5' or '4'") self._typename = typename self._original = original @property def header_bytes(self): """ The number of bytes to skip at the beginning of each entry. """ return self._header_bytes @property def length_bytes(self): """ Method used to determine the length of a string: "1-5" means one byte if the length is less than 256, otherwise the true length is in the next four bytes; "4" means always four bytes. """ return self._length_bytes @property def original(self): """ If not None, this was the original :ref:`uproot.interpretation.objects.AsObjects.model` from an :doc:`uproot.interpretation.objects.AsObjects` that was simplified into this :doc:`uproot.interpretation.jagged.AsJagged`. """ return self._original def __repr__(self): args = [] if self._header_bytes != 0: args.append("header_bytes={0}".format(self._header_bytes)) if self._length_bytes != "1-5": args.append("length_bytes={0}".format(repr(self._length_bytes))) return "AsStrings({0})".format(", ".join(args)) def __eq__(self, other): return ( isinstance(other, AsStrings) and self._header_bytes == other._header_bytes and self._length_bytes == other._length_bytes ) @property def typename(self): if self._typename is None: return "char*" else: return self._typename @property def numpy_dtype(self): return numpy.dtype(object) def awkward_form( self, file, index_format="i64", header=False, tobject_header=True, breadcrumbs=(), ): awkward = uproot.extras.awkward() return awkward.forms.ListOffsetForm( index_format, awkward.forms.NumpyForm((), 1, "B", parameters={"__array__": "char"}), parameters={ "__array__": "string", "uproot": { "as": "strings", "header_bytes": self._header_bytes, "length_bytes": self._length_bytes, }, }, ) @property def cache_key(self): return "{0}({1},{2})".format( type(self).__name__, self._header_bytes, repr(self._length_bytes) ) def basket_array( self, data, byte_offsets, basket, branch, context, cursor_offset, library ): self.hook_before_basket_array( data=data, byte_offsets=byte_offsets, basket=basket, branch=branch, context=context, cursor_offset=cursor_offset, library=library, ) if byte_offsets is None: counts = numpy.empty(len(data), dtype=numpy.int32) outdata = numpy.empty(len(data), dtype=data.dtype) pos = 0 entry_num = 0 len_outdata = 0 if self._length_bytes == "1-5": while True: if pos >= len(data): break size = data[pos] pos += 1 if size == 255: (size,) = _string_4byte_size.unpack(data[pos : pos + 4]) pos += 4 counts[entry_num] = size entry_num += 1 outdata[len_outdata : len_outdata + size] = data[pos : pos + size] len_outdata += size pos += size elif self._length_bytes == "4": while True: if pos >= len(data): break (size,) = _string_4byte_size.unpack(data[pos : pos + 4]) pos += 4 counts[entry_num] = size entry_num += 1 outdata[len_outdata : len_outdata + size] = data[pos : pos + size] len_outdata += size pos += size else: raise AssertionError(repr(self._length_bytes)) counts = counts[:entry_num] data = outdata[:len_outdata] else: byte_starts = byte_offsets[:-1] + self._header_bytes byte_stops = byte_offsets[1:] if self._length_bytes == "1-5": length_header_size = numpy.ones(len(byte_starts), dtype=numpy.int32) length_header_size[data[byte_starts] == 255] += 4 elif self._length_bytes == "4": length_header_size = numpy.full(len(byte_starts), 4, dtype=numpy.int32) else: raise AssertionError(repr(self._length_bytes)) byte_starts += length_header_size mask = numpy.zeros(len(data), dtype=numpy.int8) mask[byte_starts[byte_starts < len(data)]] = 1 numpy.add.at(mask, byte_stops[byte_stops < len(data)], -1) numpy.cumsum(mask, out=mask) data = data[mask.view(numpy.bool_)] counts = byte_stops - byte_starts offsets = numpy.empty(len(counts) + 1, dtype=numpy.int32) offsets[0] = 0 numpy.cumsum(counts, out=offsets[1:]) if hasattr(data, "tobytes"): data = data.tobytes() else: data = data.tostring() output = StringArray(offsets, data) self.hook_after_basket_array( data=data, byte_offsets=byte_offsets, basket=basket, branch=branch, context=context, output=output, cursor_offset=cursor_offset, library=library, ) return output def final_array( self, basket_arrays, entry_start, entry_stop, entry_offsets, library, branch ): self.hook_before_final_array( basket_arrays=basket_arrays, entry_start=entry_start, entry_stop=entry_stop, entry_offsets=entry_offsets, library=library, branch=branch, ) basket_offsets = {} basket_content = {} for k, v in basket_arrays.items(): basket_offsets[k] = v.offsets basket_content[k] = v.content if entry_start >= entry_stop: output = StringArray(library.zeros((1,), numpy.int64), b"") else: length = 0 start = entry_offsets[0] for basket_num, stop in enumerate(entry_offsets[1:]): if start <= entry_start and entry_stop <= stop: length += entry_stop - entry_start elif start <= entry_start < stop: length += stop - entry_start elif start <= entry_stop <= stop: length += entry_stop - start elif entry_start < stop and start <= entry_stop: length += stop - start start = stop offsets = numpy.empty((length + 1,), numpy.int64) before = 0 start = entry_offsets[0] contents = [] for basket_num, stop in enumerate(entry_offsets[1:]): if start <= entry_start and entry_stop <= stop: local_start = entry_start - start local_stop = entry_stop - start off, cnt = basket_offsets[basket_num], basket_content[basket_num] offsets[:] = ( before - off[local_start] + off[local_start : local_stop + 1] ) before += off[local_stop] - off[local_start] contents.append(cnt[off[local_start] : off[local_stop]]) elif start <= entry_start < stop: local_start = entry_start - start local_stop = stop - start off, cnt = basket_offsets[basket_num], basket_content[basket_num] offsets[: stop - entry_start + 1] = ( before - off[local_start] + off[local_start : local_stop + 1] ) before += off[local_stop] - off[local_start] contents.append(cnt[off[local_start] : off[local_stop]]) elif start <= entry_stop <= stop: local_start = 0 local_stop = entry_stop - start off, cnt = basket_offsets[basket_num], basket_content[basket_num] offsets[start - entry_start :] = ( before - off[local_start] + off[local_start : local_stop + 1] ) before += off[local_stop] - off[local_start] contents.append(cnt[off[local_start] : off[local_stop]]) elif entry_start < stop and start <= entry_stop: off, cnt = basket_offsets[basket_num], basket_content[basket_num] offsets[start - entry_start : stop - entry_start + 1] = ( before - off[0] + off ) before += off[-1] - off[0] contents.append(cnt[off[0] : off[-1]]) start = stop output = StringArray(offsets, b"".join(contents)) self.hook_before_library_finalize( basket_arrays=basket_arrays, entry_start=entry_start, entry_stop=entry_stop, entry_offsets=entry_offsets, library=library, branch=branch, output=output, ) output = library.finalize(output, branch, self, entry_start, entry_stop) self.hook_after_final_array( basket_arrays=basket_arrays, entry_start=entry_start, entry_stop=entry_stop, entry_offsets=entry_offsets, library=library, branch=branch, output=output, ) return output class StringArray(object): """ Args: offsets (array of ``numpy.int32``): Starting and stopping indexes for each string. The length of the ``offsets`` is one greater than the number of strings. content (array): Contiguous array of character data for all strings of the array. Temporary array filled by :ref:`uproot.interpretation.strings.AsStrings.basket_array`, which will be turned into a NumPy, Awkward, or other array, depending on the specified :doc:`uproot.interpretation.library.Library`. """ def __init__(self, offsets, content): self._offsets = offsets self._content = content def __repr__(self): if len(self._content) > 100: left, right = self._content[:45], self._content[-45:] content = repr(left) + " ... " + repr(right) else: content = repr(self._content) return "StringArray({0}, {1})".format(self._offsets, content) @property def offsets(self): """ Starting and stopping indexes for each string. The length of the ``offsets`` is one greater than the number of strings. """ return self._offsets @property def content(self): """ Contiguous array of character data for all strings of the array. """ return self._content def __getitem__(self, where): data = self._content[self._offsets[where] : self._offsets[where + 1]] return uproot._util.ensure_str(data) def __len__(self): return len(self._offsets) - 1 def __iter__(self): start = self._offsets[0] content = self._content for stop in self._offsets[1:]: yield uproot._util.ensure_str(content[start:stop]) start = stop