########################################################################
#
#       License: BSD
#       Created: October 14, 2002
#       Author:  Francesc Altet - faltet@carabos.com
#
#       $Id: leaf.py 3398 2008-01-11 17:26:06Z faltet $
#
########################################################################

"""Here is defined the Leaf class.

See Leaf class docstring for more info.

Classes:

    Leaf

Functions:

    calc_chunksize

Misc variables:

    __version__

"""

import sys
import warnings
import math

import numpy

import tables
from tables.flavor import (
    check_flavor, internal_flavor, alias_map as flavor_alias_map )
from tables import hdf5Extension
from tables import utilsExtension
from tables.node import Node
from tables.filters import Filters
from tables.utils import idx2long, byteorders, lazyattr
from tables.parameters import CHUNKTIMES, BUFFERTIMES
from tables.exceptions import PerformanceWarning


__version__ = "$Revision: 3398 $"


def csformula(expectedsizeinMB):
    """Return the fitted chunksize (in bytes) for `expectedsizeinMB`.

    The result is ``basesize * int(2**log10(expectedsizeinMB))``, so the
    argument must be strictly positive (see `limit_es()`).
    """
    # For a basesize of 8 KB, this will return:
    # 8 KB for datasets <= 1 MB
    # 1 MB for datasets >= 10 TB
    basesize = 8*1024   # 8 KB is a good minimum
    return basesize * int(2**math.log10(expectedsizeinMB))


def limit_es(expectedsizeinMB):
    """Protection against creating too small or too large chunks.

    Return `expectedsizeinMB` clamped to the ``[1, 10**7]`` range
    (i.e. between 1 MB and 10 TB).
    """
    if expectedsizeinMB < 1:        # < 1 MB
        expectedsizeinMB = 1
    elif expectedsizeinMB > 10**7:  # > 10 TB
        expectedsizeinMB = 10**7
    return expectedsizeinMB


def calc_chunksize(expectedsizeinMB):
    """Compute the optimum HDF5 chunksize for I/O purposes.

    Rationale: HDF5 takes the data in bunches of chunksize length to
    write them on disk.  A B-tree in memory is used to map structures
    on disk.  The more chunks that are allocated for a dataset the
    larger the B-tree.  Large B-trees take memory and cause file
    storage overhead as well as more disk I/O and higher contention
    for the meta data cache.  You have to balance between memory and
    I/O overhead (small B-trees) and time to access to data (big
    B-trees).

    The tuning of the chunksize parameter affects the performance and
    the memory consumed.  This is based on my own experiments and, as
    always, your mileage may vary.

    The expected size is first clamped with `limit_es()` and then
    rounded down to a power of ten before applying `csformula()`, so
    the returned chunksize only changes at decimal orders of magnitude.
    """
    expectedsizeinMB = limit_es(expectedsizeinMB)
    zone = int(math.log10(expectedsizeinMB))
    expectedsizeinMB = 10**zone
    chunksize = csformula(expectedsizeinMB)
    return chunksize


class Leaf(Node):
    """
    Abstract base class for all PyTables leaves.

    A leaf is a node (see the `Node` class) which hangs from a group
    (see the `Group` class) but, unlike a group, it can not have any
    further children below it (i.e. it is an end node).

    This definition includes all nodes which contain actual data
    (datasets handled by the `Table`, `Array`, `CArray`, `EArray` and
    `VLArray` classes) and unsupported nodes (the `UnImplemented`
    class) --these classes do in fact inherit from `Leaf`.

    Public instance variables
    -------------------------

    The following instance variables are provided in addition to those
    in `Node`:

    byteorder
        The byte ordering of the leaf data *on disk*.
    chunkshape
        The HDF5 chunk size for chunked leaves (a tuple).

        This is read-only because you cannot change the chunk size of a
        leaf once it has been created.
    extdim
        The index of the enlargeable dimension (-1 if none).
    filters
        Filter properties for this leaf --see `Filters`.
    flavor
        The type of the data object read from this leaf.

        It can be any of 'numpy', 'numarray', 'numeric' or 'python'
        (the set of supported flavors depends on which packages you
        have installed on your system).

        You can (and are encouraged to) use this property to get, set
        and delete the ``FLAVOR`` HDF5 attribute of the leaf.  When the
        leaf has no such attribute, the default flavor is used.
    maindim
        The dimension along which iterators work.

        Its value is 0 (i.e. the first dimension) when the dataset is
        not extendable, and `Leaf.extdim` (where available) for
        extendable ones.
    nrows
        The length of the main dimension of the leaf data.
    nrowsinbuf
        The number of rows that fit in internal input buffers.

        You can change this to fine-tune the speed or memory
        requirements of your application.
    shape
        The shape of data in the leaf.

    Public instance variables -- aliases
    ------------------------------------

    The following instance variables are just easier-to-write aliases
    to their `Node` counterparts (indicated between parentheses):

    attrs
        The associated `AttributeSet` instance (`Node._v_attrs`).
    hdf5name
        The name of this node in the hosting HDF5 file
        (`Node._v_hdf5name`).
    name
        The name of this node in its parent group (`Node._v_name`).
    objectID
        A node identifier (may change from run to run).
        (`Node._v_objectID`).
    title
        A description for this node (`Node._v_title`).

    Public methods
    --------------

    * close([flush])
    * copy([newparent][, newname][, overwrite][, createparents][, **kwargs])
    * delAttr(name)
    * flush()
    * getAttr(name)
    * isVisible()
    * move([newparent][, newname][, overwrite])
    * remove()
    * rename(newname)
    * setAttr(name, value)
    * _f_close([flush])
    * __len__()
    """

    # Properties
    # ~~~~~~~~~~

    # Node property aliases
    # `````````````````````
    # These are a little hard to override, but so are properties.
    attrs = Node._v_attrs
    title = Node._v_title

    # Read-only node property aliases
    # ```````````````````````````````
    name = property(
        lambda self: self._v_name, None, None,
        "The name of this node in its parent group (a string)." )

    hdf5name = property(
        lambda self: self._v_hdf5name, None, None,
        "The name of this node in the hosting HDF5 file (a string)." )

    chunkshape = property(
        lambda self: self._v_chunkshape, None, None,
        """
        The HDF5 chunk size for chunked leaves (a tuple).

        This is read-only because you cannot change the chunk size of a
        leaf once it has been created.
        """ )

    objectID = property(
        lambda self: self._v_objectID, None, None,
        "A node identifier (may change from run to run)." )

    # Lazy read-only attributes
    # `````````````````````````
    @lazyattr
    def filters(self):
        """Filter properties for this leaf."""
        return Filters._from_leaf(self)

    # Other properties
    # ````````````````
    def _getmaindim(self):
        """Return `extdim` when it is non-negative, else 0."""
        if self.extdim < 0:
            return 0  # choose the first dimension
        return self.extdim

    maindim = property(
        _getmaindim, None, None,
        """
        The dimension along which iterators work.

        Its value is 0 (i.e. the first dimension) when the dataset is
        not extendable, and `Leaf.extdim` (where available) for
        extendable ones.
        """ )

    def _setflavor(self, flavor):
        """Validate `flavor` and store it both in the attribute set
        and in the cached private value."""
        self._v_file._checkWritable()
        check_flavor(flavor)
        self._v_attrs.FLAVOR = self._flavor = flavor  # logs the change

    def _delflavor(self):
        """Delete the ``FLAVOR`` attribute and fall back to the
        internal flavor."""
        del self._v_attrs.FLAVOR
        self._flavor = internal_flavor

    flavor = property(
        lambda self: self._flavor, _setflavor, _delflavor,
        """
        The representation of data read from this leaf.

        It can be any of 'numpy', 'numarray', 'numeric' or 'python'
        (the set of supported flavors depends on which packages you
        have installed on your system).

        You can (and are encouraged to) use this property to get, set
        and delete the ``FLAVOR`` HDF5 attribute of the leaf.  When the
        leaf has no such attribute, the default flavor is used.
        """ )

    # Special methods
    # ~~~~~~~~~~~~~~~
    def __init__(self, parentNode, name,
                 new=False, filters=None,
                 byteorder=None, _log=True):
        """
        Set up the leaf state and delegate to `Node.__init__()`.

        For new leaves (`new` is true) the filters default to the ones
        of the parent group and `byteorder` must be ``None``,
        ``'little'`` or ``'big'`` (a `ValueError` is raised otherwise).
        """
        self._v_new = new
        """Is this the first time the node has been created?"""
        self.nrowsinbuf = None
        """
        The number of rows that fit in internal input buffers.

        You can change this to fine-tune the speed or memory
        requirements of your application.
        """
        self._flavor = None
        """Private storage for the `flavor` property."""

        if new:
            # Get filter properties from parent group if not given.
            if filters is None:
                filters = parentNode._v_filters
            self.__dict__['filters'] = filters  # bypass the property

            if byteorder not in (None, 'little', 'big'):
                raise ValueError(
                    "the byteorder can only take 'little' or 'big' values "
                    "and you passed: %s" % byteorder)
            self.byteorder = byteorder
            """The byte ordering of the leaf data *on disk*."""

        # Existing filters need not be read since `filters`
        # is a lazy property that automatically handles their loading.

        super(Leaf, self).__init__(parentNode, name, _log)

    def __len__(self):
        """Return the length of the main dimension of the leaf data."""
        return self.nrows

    def __str__(self):
        """
        The string representation for this object is its pathname in
        the HDF5 object tree plus some additional metainfo.
        """
        # Get this class name
        classname = self.__class__.__name__
        # The title
        title = self._v_title
        # The filters
        filters = ""
        if self.filters.fletcher32:
            filters += ", fletcher32"
        if self.filters.complevel:
            # Shuffle is only reported together with compression.
            if self.filters.shuffle:
                filters += ", shuffle"
            filters += ", %s(%s)" % (self.filters.complib,
                                     self.filters.complevel)
        return "%s (%s%s%s) %r" % \
               (self._v_pathname, classname, self.shape, filters, title)

    # Private methods
    # ~~~~~~~~~~~~~~~
    def _g_postInitHook(self):
        """
        Code to be run after node creation and before creation logging.

        This method gets or sets the flavor of the leaf.
        """
        super(Leaf, self)._g_postInitHook()
        if self._v_new:  # set flavor of new node
            if self._flavor is None:
                self._flavor = internal_flavor
            else:  # flavor set at creation time, do not log
                self._v_attrs._g__setattr('FLAVOR', self._flavor)
        else:  # get flavor of existing node (if any)
            flavor = getattr(self._v_attrs, 'FLAVOR', internal_flavor)
            # Translate possible flavor aliases to their canonical name.
            self._flavor = flavor_alias_map.get(flavor, flavor)
        assert self._flavor is not None

    def _calc_chunkshape(self, expectedrows, rowsize, itemsize):
        """
        Calculate the shape for the HDF5 chunk.

        `expectedrows` times `rowsize` (bytes) gives the expected size
        of the dataset, which determines the chunk size in bytes via
        `calc_chunksize()`; `itemsize` (bytes) converts that into a
        number of items.  Returns a tuple with as many entries as
        ``self.shape`` (or ``(1,)`` for scalar shapes).
        """
        # In case of a scalar shape, return the unit chunksize.  This
        # does not depend on the expected size, so check it first.
        if self.shape == ():
            return (1,)

        # Compute the chunksize
        MB = 1024 * 1024
        expectedsizeinMB = (expectedrows * rowsize) / MB
        chunksize = calc_chunksize(expectedsizeinMB)

        maindim = self.maindim
        # Compute the chunknitems
        chunknitems = chunksize // itemsize
        # Safeguard against itemsizes being extremely large
        if chunknitems == 0:
            chunknitems = 1
        chunkshape = list(self.shape)
        # Check whether trimming the main dimension is enough
        chunkshape[maindim] = 1
        newchunknitems = numpy.prod(chunkshape, dtype='int64')
        if newchunknitems <= chunknitems:
            chunkshape[maindim] = chunknitems // newchunknitems
        else:
            # No, so start trimming other dimensions as well
            for j in xrange(len(chunkshape)):
                # Check whether trimming this dimension is enough
                chunkshape[j] = 1
                newchunknitems = numpy.prod(chunkshape, dtype='int64')
                if newchunknitems <= chunknitems:
                    chunkshape[j] = chunknitems // newchunknitems
                    break
            else:
                # Oops, we ran out of the loop without a break
                # Set the last dimension to chunknitems
                chunkshape[-1] = chunknitems

        return tuple(chunkshape)

    def _calc_nrowsinbuf(self, chunkshape, rowsize, itemsize):
        """
        Calculate the number of rows that fit in a PyTables buffer.

        The buffer size is `CHUNKTIMES` times the chunk size in bytes
        (the product of `chunkshape` times `itemsize`).  The result is
        at least 1.  A `PerformanceWarning` is issued when `rowsize`
        exceeds `BUFFERTIMES` times the buffer size.
        """
        # Compute the nrowsinbuf
        # Use an int64 type to avoid overflows in 32-bit systems
        # Fixes ticket #90
        chunksize = numpy.prod(chunkshape, dtype='int64') * itemsize
        buffersize = chunksize * CHUNKTIMES
        nrowsinbuf = buffersize // rowsize
        # Safeguard against row sizes being extremely large
        if nrowsinbuf == 0:
            nrowsinbuf = 1
            # If rowsize is too large, issue a Performance warning
            maxrowsize = BUFFERTIMES * buffersize
            if rowsize > maxrowsize:
                warnings.warn("""\
The Leaf ``%s`` is exceeding the maximum recommended rowsize (%d bytes);
be ready to see PyTables asking for *lots* of memory and possibly slow
I/O. You may want to reduce the rowsize by trimming the value of
dimensions that are orthogonal (and preferably close) to the main
dimension of this leave. Alternatively, in case you have specified a
very small/large chunksize, you may want to increase/decrease it."""
                              % (self._v_pathname, maxrowsize),
                              PerformanceWarning)
                # It is difficult to foresee the level of code nesting
                # to reach user code.
                #f = sys._getframe(8)
                ###Caller --> %s (%s:%s)"""
                #    f.f_code.co_name,
                #    f.f_code.co_filename, f.f_lineno,),
        return nrowsinbuf

    # This method is appropriate for calls to __getitem__ methods
    def _processRange(self, start, stop, step, dim=None):
        """
        Normalize a ``(start, stop, step)`` range over one dimension.

        The range is resolved against ``self.nrows`` when `dim` is
        ``None`` and against ``self.shape[dim]`` otherwise.  A
        `ValueError` is raised for negative steps.  Returns the
        normalized ``(start, stop, step)`` tuple, with `start` clamped
        to `stop` for empty ranges.
        """
        if dim is None:
            nrows = self.nrows  # self.shape[self.maindim]
        else:
            nrows = self.shape[dim]

        if step and step < 0:
            raise ValueError("slice step cannot be negative")
        # (start, stop, step) = slice(start, stop, step).indices(nrows)  # Python > 2.3
        # The next function is a substitute for slice().indices in order to
        # support full 64-bit integer for slices (Python 2.4 does not
        # support that yet)
        # F. Altet 2005-05-08
        # In order to convert possible numpy.integer values to long ones
        # F. Altet 2006-05-02
        if start is not None:
            start = idx2long(start)
        if stop is not None:
            stop = idx2long(stop)
        if step is not None:
            step = idx2long(step)
        (start, stop, step) = utilsExtension.getIndices(
            slice(start, stop, step), long(nrows) )
        # Some protection against empty ranges
        if start > stop:
            start = stop
        return (start, stop, step)

    # This method is appropriate for calls to read() methods
    def _processRangeRead(self, start, stop, step):
        """
        Normalize a ``(start, stop, step)`` range for read operations.

        When only `start` is given (`stop` is ``None``) a single row is
        selected: `step` is forced to 1 and `stop` is set right after
        `start`.  In that case an `IndexError` is raised if `start` is
        not below the number of rows of a non-empty leaf.  The result
        is finally normalized by `_processRange()` over the main
        dimension.
        """
        nrows = self.nrows
        if start is not None and stop is None:
            # Protection against start greater than available records
            # nrows == 0 is a special case for empty objects
            if nrows > 0 and start >= nrows:
                raise IndexError(
                    "start of range (%s) is greater than "
                    "number of rows (%s)" % (start, nrows) )
            step = 1
            if start == -1:  # corner case
                # ``start + 1`` would be 0, which would select nothing.
                stop = nrows
            else:
                stop = start + 1
        # Finally, get the correct values (over the main dimension)
        start, stop, step = self._processRange(start, stop, step)

        return (start, stop, step)

    def _g_copy(self, newParent, newName, recursive, _log=True, **kwargs):
        """
        Copy this leaf into `newParent` with name `newName`.

        Recognised keyword arguments are ``start``, ``stop``, ``step``,
        ``title``, ``filters``, ``stats`` and ``copyuserattrs`` (see
        `Leaf.copy()`).  The actual copy is delegated to
        ``self._g_copyWithStats()``, which is not defined in this class
        and is expected to return a ``(newnode, nbytes)`` pair.
        `recursive` is not used here.  Returns the new node.
        """
        # Compute default arguments.
        start = kwargs.get('start', 0)
        stop = kwargs.get('stop', self.nrows)
        step = kwargs.get('step', 1)
        title = kwargs.get('title', self._v_title)
        filters = kwargs.get('filters', self.filters)
        stats = kwargs.get('stats', None)

        # Fix arguments with explicit None values for backwards compatibility.
        if stop is None:
            stop = self.nrows
        if title is None:
            title = self._v_title
        if filters is None:
            filters = self.filters

        # Compute the correct indices.
        (start, stop, step) = self._processRangeRead(start, stop, step)

        # Create a copy of the object.
        (newNode, nbytes) = self._g_copyWithStats(
            newParent, newName, start, stop, step, title, filters, _log)

        # Copy user attributes if requested (or the flavor at least).
        if kwargs.get('copyuserattrs', True):
            self._v_attrs._g_copy(newNode._v_attrs)
        elif 'FLAVOR' in self._v_attrs:
            newNode._v_attrs._g__setattr('FLAVOR', self._flavor)
        newNode._flavor = self._flavor  # update cached value

        # Update statistics if needed.
        if stats is not None:
            stats['leaves'] += 1
            stats['bytes'] += nbytes

        return newNode

    def _g_fix_byteorder_data(self, data, dbyteorder):
        """
        Fix the byteorder of data passed in constructors.

        `dbyteorder` is translated through the `byteorders` mapping.
        If no byteorder was given to the constructor, the leaf adopts
        the one of `data`.  Returns `data`, byteswapped in place (or a
        byteswapped copy when `data` is not writeable) if its byteorder
        differs from the one of the leaf.
        """
        dbyteorder = byteorders[dbyteorder]
        # If self.byteorder has not been passed as an argument of
        # the constructor, then set it to the same value of data.
        if self.byteorder is None:
            self.byteorder = dbyteorder
        # Do an additional in-place byteswap of data if the in-memory
        # byteorder doesn't match that of the on-disk.  This is the only
        # place that we have to do the conversion manually.  In all the
        # other cases, it will be HDF5 the responsible of doing the
        # byteswap properly.
        if dbyteorder in ('little', 'big'):
            if dbyteorder != self.byteorder:
                # if data is not writeable, do a copy first
                if not data.flags.writeable:
                    data = data.copy()
                data.byteswap(True)
        else:
            # Fix the byteorder again, no matter which byteorder have
            # specified the user in the constructor.
            self.byteorder = "irrelevant"
        return data

    # Public methods
    # ~~~~~~~~~~~~~~

    # Tree manipulation
    # `````````````````
    def remove(self):
        """
        Remove this node from the hierarchy.

        This method has the behavior described in `Node._f_remove()`.
        Please note that there is no ``recursive`` flag since leaves do
        not have child nodes.
        """
        self._f_remove(False)

    def rename(self, newname):
        """
        Rename this node in place.

        This method has the behavior described in `Node._f_rename()`.
        """
        self._f_rename(newname)

    def move( self, newparent=None, newname=None,
              overwrite=False, createparents=False ):
        """
        Move or rename this node.

        This method has the behavior described in `Node._f_move()`.
        """
        self._f_move(newparent, newname, overwrite, createparents)

    def copy( self, newparent=None, newname=None,
              overwrite=False, createparents=False, **kwargs ):
        """
        Copy this node and return the new one.

        This method has the behavior described in `Node._f_copy()`.
        Please note that there is no ``recursive`` flag since leaves do
        not have child nodes.  In addition, this method recognises the
        following keyword arguments:

        `title`
            The new title for the destination.  If omitted or
            ``None``, the original title is used.
        `filters`
            Specifying this parameter overrides the original filter
            properties in the source node.  If specified, it must be
            an instance of the `Filters` class.  The default is to
            copy the filter properties from the source node.
        `copyuserattrs`
            You can prevent the user attributes from being copied by
            setting this parameter to ``False``.  The default is to
            copy them.
        `start`, `stop`, `step`
            Specify the range of rows to be copied; the default is to
            copy all the rows.
        `stats`
            This argument may be used to collect statistics on the
            copy process.  When used, it should be a dictionary with
            keys ``'groups'``, ``'leaves'`` and ``'bytes'`` having a
            numeric value.  Their values will be incremented to
            reflect the number of groups, leaves and bytes,
            respectively, that have been copied during the operation.
        """
        return self._f_copy(
            newparent, newname, overwrite, createparents, **kwargs )

    def isVisible(self):
        """
        Is this node visible?

        This method has the behavior described in `Node._f_isVisible()`.
        """
        return self._f_isVisible()

    # Attribute handling
    # ``````````````````
    def getAttr(self, name):
        """
        Get a PyTables attribute from this node.

        This method has the behavior described in `Node._f_getAttr()`.
        """
        return self._f_getAttr(name)

    def setAttr(self, name, value):
        """
        Set a PyTables attribute for this node.

        This method has the behavior described in `Node._f_setAttr()`.
        """
        self._f_setAttr(name, value)

    def delAttr(self, name):
        """
        Delete a PyTables attribute from this node.

        This method has the behavior described in `Node._f_delAttr()`.
        """
        self._f_delAttr(name)

    # Data handling
    # `````````````
    def flush(self):
        """
        Flush pending data to disk.

        Saves whatever remaining buffered data to disk.  It also
        releases I/O buffers, so if you are filling many datasets in
        the same PyTables session, please call ``flush()`` extensively
        so as to help PyTables to keep memory requirements low.
        """
        self._g_flush()

    def _f_close(self, flush=True):
        """
        Close this node in the tree.

        This method has the behavior described in `Node._f_close()`.
        Besides that, the optional argument `flush` tells whether to
        flush pending data to disk or not before closing.
        """
        if not self._v_isopen:
            return  # the node is already closed or not initialized

        if flush:
            self.flush()

        # Close the dataset and release resources
        self._g_close()

        # Close myself as a node.
        super(Leaf, self)._f_close()

    def close(self, flush=True):
        """
        Close this node in the tree.

        This method is completely equivalent to `Leaf._f_close()`.
        """
        self._f_close(flush)


## Local Variables:
## mode: python
## py-indent-offset: 4
## tab-width: 4
## fill-column: 72
## End: