""" Data structures for sparse float data. Life is made simpler by dealing only with float64 data """ # pylint: disable=E1101,E1103,W0231 import warnings from pandas.compat import lrange, zip from pandas import compat import numpy as np from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.frame import DataFrame from pandas.core.panel import Panel from pandas.sparse.frame import SparseDataFrame from pandas.util.decorators import deprecate import pandas.core.common as com import pandas.core.ops as ops import pandas.lib as lib class SparsePanelAxis(object): def __init__(self, cache_field, frame_attr): self.cache_field = cache_field self.frame_attr = frame_attr def __get__(self, obj, type=None): return getattr(obj, self.cache_field, None) def __set__(self, obj, value): value = _ensure_index(value) if isinstance(value, MultiIndex): raise NotImplementedError("value cannot be a MultiIndex") for v in compat.itervalues(obj._frames): setattr(v, self.frame_attr, value) setattr(obj, self.cache_field, value) class SparsePanel(Panel): """ Sparse version of Panel Parameters ---------- frames : dict of DataFrame objects items : array-like major_axis : array-like minor_axis : array-like default_kind : {'block', 'integer'}, default 'block' Default sparse kind for converting Series to SparseSeries. Will not override SparseSeries passed into constructor default_fill_value : float Default fill_value for converting Series to SparseSeries. Will not override SparseSeries passed in Notes ----- """ ndim = 3 _typ = 'panel' _subtyp = 'sparse_panel' def __init__(self, frames=None, items=None, major_axis=None, minor_axis=None, default_fill_value=np.nan, default_kind='block', copy=False): # deprecation #11157 warnings.warn("SparsePanel is deprecated and will be removed in a " "future version", FutureWarning, stacklevel=2) if frames is None: frames = {} if isinstance(frames, np.ndarray): new_frames = {} for item, vals in zip(items, frames): new_frames[item] = SparseDataFrame( vals, index=major_axis, columns=minor_axis, default_fill_value=default_fill_value, default_kind=default_kind) frames = new_frames if not isinstance(frames, dict): raise TypeError('input must be a dict, a %r was passed' % type(frames).__name__) self.default_fill_value = fill_value = default_fill_value self.default_kind = kind = default_kind # pre-filter, if necessary if items is None: items = Index(sorted(frames.keys())) items = _ensure_index(items) (clean_frames, major_axis, minor_axis) = _convert_frames(frames, major_axis, minor_axis, kind=kind, fill_value=fill_value) self._frames = clean_frames # do we want to fill missing ones? for item in items: if item not in clean_frames: raise ValueError('column %r not found in data' % item) self._items = items self.major_axis = major_axis self.minor_axis = minor_axis def _consolidate_inplace(self): # pragma: no cover # do nothing when DataFrame calls this method pass def __array_wrap__(self, result): return SparsePanel(result, items=self.items, major_axis=self.major_axis, minor_axis=self.minor_axis, default_kind=self.default_kind, default_fill_value=self.default_fill_value) @classmethod def from_dict(cls, data): """ Analogous to Panel.from_dict """ return SparsePanel(data) def to_dense(self): """ Convert SparsePanel to (dense) Panel Returns ------- dense : Panel """ return Panel(self.values, self.items, self.major_axis, self.minor_axis) def as_matrix(self): return self.values @property def values(self): # return dense values return np.array([self._frames[item].values for item in self.items]) # need a special property for items to make the field assignable _items = None def _get_items(self): return self._items def _set_items(self, new_items): new_items = _ensure_index(new_items) if isinstance(new_items, MultiIndex): raise NotImplementedError("itemps cannot be a MultiIndex") # need to create new frames dict old_frame_dict = self._frames old_items = self._items self._frames = dict((new_k, old_frame_dict[old_k]) for new_k, old_k in zip(new_items, old_items)) self._items = new_items items = property(fget=_get_items, fset=_set_items) # DataFrame's index major_axis = SparsePanelAxis('_major_axis', 'index') # DataFrame's columns / "items" minor_axis = SparsePanelAxis('_minor_axis', 'columns') def _ixs(self, i, axis=0): """ for compat as we don't support Block Manager here i : int, slice, or sequence of integers axis : int """ key = self._get_axis(axis)[i] # xs cannot handle a non-scalar key, so just reindex here if com.is_list_like(key): return self.reindex(**{self._get_axis_name(axis): key}) return self.xs(key, axis=axis) def _slice(self, slobj, axis=0, kind=None): """ for compat as we don't support Block Manager here """ axis = self._get_axis_name(axis) index = self._get_axis(axis) return self.reindex(**{axis: index[slobj]}) def _get_item_cache(self, key): return self._frames[key] def __setitem__(self, key, value): if isinstance(value, DataFrame): value = value.reindex(index=self.major_axis, columns=self.minor_axis) if not isinstance(value, SparseDataFrame): value = value.to_sparse(fill_value=self.default_fill_value, kind=self.default_kind) else: raise ValueError('only DataFrame objects can be set currently') self._frames[key] = value if key not in self.items: self._items = Index(list(self.items) + [key]) def set_value(self, item, major, minor, value): """ Quickly set single value at (item, major, minor) location Parameters ---------- item : item label (panel item) major : major axis label (panel item row) minor : minor axis label (panel item column) value : scalar Notes ----- This method *always* returns a new object. It is not particularly efficient but is provided for API compatibility with Panel Returns ------- panel : SparsePanel """ dense = self.to_dense().set_value(item, major, minor, value) return dense.to_sparse(kind=self.default_kind, fill_value=self.default_fill_value) def __delitem__(self, key): loc = self.items.get_loc(key) indices = lrange(loc) + lrange(loc + 1, len(self.items)) del self._frames[key] self._items = self._items.take(indices) def __getstate__(self): # pickling from pandas.io.pickle import _pickle_array return (self._frames, _pickle_array(self.items), _pickle_array(self.major_axis), _pickle_array(self.minor_axis), self.default_fill_value, self.default_kind) def __setstate__(self, state): frames, items, major, minor, fv, kind = state from pandas.io.pickle import _unpickle_array self.default_fill_value = fv self.default_kind = kind self._items = _ensure_index(_unpickle_array(items)) self._major_axis = _ensure_index(_unpickle_array(major)) self._minor_axis = _ensure_index(_unpickle_array(minor)) self._frames = frames def copy(self, deep=True): """ Make a copy of the sparse panel Returns ------- copy : SparsePanel """ d = self._construct_axes_dict() if deep: new_data = dict((k, v.copy(deep=True)) for k, v in compat.iteritems(self._frames)) d = dict((k, v.copy(deep=True)) for k, v in compat.iteritems(d)) else: new_data = self._frames.copy() d['default_fill_value'] = self.default_fill_value d['default_kind'] = self.default_kind return SparsePanel(new_data, **d) def to_frame(self, filter_observations=True): """ Convert SparsePanel to (dense) DataFrame Returns ------- frame : DataFrame """ if not filter_observations: raise TypeError('filter_observations=False not supported for ' 'SparsePanel.to_long') I, N, K = self.shape counts = np.zeros(N * K, dtype=int) d_values = {} d_indexer = {} for item in self.items: frame = self[item] values, major, minor = _stack_sparse_info(frame) # values are stacked column-major indexer = minor * N + major counts.put(indexer, counts.take(indexer) + 1) # cuteness d_values[item] = values d_indexer[item] = indexer # have full set of observations for each item mask = counts == I # for each item, take mask values at index locations for those sparse # values, and use that to select values values = np.column_stack([d_values[item][mask.take(d_indexer[item])] for item in self.items]) inds, = mask.nonzero() # still column major major_labels = inds % N minor_labels = inds // N index = MultiIndex(levels=[self.major_axis, self.minor_axis], labels=[major_labels, minor_labels], verify_integrity=False) df = DataFrame(values, index=index, columns=self.items) return df.sortlevel(level=0) to_long = deprecate('to_long', to_frame) toLong = deprecate('toLong', to_frame) def reindex(self, major=None, items=None, minor=None, major_axis=None, minor_axis=None, copy=False): """ Conform / reshape panel axis labels to new input labels Parameters ---------- major : array-like, default None items : array-like, default None minor : array-like, default None copy : boolean, default False Copy underlying SparseDataFrame objects Returns ------- reindexed : SparsePanel """ major = com._mut_exclusive(major=major, major_axis=major_axis) minor = com._mut_exclusive(minor=minor, minor_axis=minor_axis) if com._all_none(items, major, minor): raise ValueError('Must specify at least one axis') major = self.major_axis if major is None else major minor = self.minor_axis if minor is None else minor if items is not None: new_frames = {} for item in items: if item in self._frames: new_frames[item] = self._frames[item] else: raise NotImplementedError('Reindexing with new items not ' 'yet supported') else: new_frames = self._frames if copy: new_frames = dict((k, v.copy()) for k, v in compat.iteritems(new_frames)) return SparsePanel(new_frames, items=items, major_axis=major, minor_axis=minor, default_fill_value=self.default_fill_value, default_kind=self.default_kind) def _combine(self, other, func, axis=0): if isinstance(other, DataFrame): return self._combineFrame(other, func, axis=axis) elif isinstance(other, Panel): return self._combinePanel(other, func) elif lib.isscalar(other): new_frames = dict((k, func(v, other)) for k, v in self.iteritems()) return self._new_like(new_frames) def _combineFrame(self, other, func, axis=0): index, columns = self._get_plane_axes(axis) axis = self._get_axis_number(axis) other = other.reindex(index=index, columns=columns) if axis == 0: new_values = func(self.values, other.values) elif axis == 1: new_values = func(self.values.swapaxes(0, 1), other.values.T) new_values = new_values.swapaxes(0, 1) elif axis == 2: new_values = func(self.values.swapaxes(0, 2), other.values) new_values = new_values.swapaxes(0, 2) # TODO: make faster! new_frames = {} for item, item_slice in zip(self.items, new_values): old_frame = self[item] ofv = old_frame.default_fill_value ok = old_frame.default_kind new_frames[item] = SparseDataFrame(item_slice, index=self.major_axis, columns=self.minor_axis, default_fill_value=ofv, default_kind=ok) return self._new_like(new_frames) def _new_like(self, new_frames): return SparsePanel(new_frames, self.items, self.major_axis, self.minor_axis, default_fill_value=self.default_fill_value, default_kind=self.default_kind) def _combinePanel(self, other, func): items = self.items.union(other.items) major = self.major_axis.union(other.major_axis) minor = self.minor_axis.union(other.minor_axis) # could check that everything's the same size, but forget it this = self.reindex(items=items, major=major, minor=minor) other = other.reindex(items=items, major=major, minor=minor) new_frames = {} for item in items: new_frames[item] = func(this[item], other[item]) if not isinstance(other, SparsePanel): new_default_fill = self.default_fill_value else: # maybe unnecessary new_default_fill = func(self.default_fill_value, other.default_fill_value) return SparsePanel(new_frames, items, major, minor, default_fill_value=new_default_fill, default_kind=self.default_kind) def major_xs(self, key): """ Return slice of panel along major axis Parameters ---------- key : object Major axis label Returns ------- y : DataFrame index -> minor axis, columns -> items """ slices = dict((k, v.xs(key)) for k, v in self.iteritems()) return DataFrame(slices, index=self.minor_axis, columns=self.items) def minor_xs(self, key): """ Return slice of panel along minor axis Parameters ---------- key : object Minor axis label Returns ------- y : SparseDataFrame index -> major axis, columns -> items """ slices = dict((k, v[key]) for k, v in self.iteritems()) return SparseDataFrame(slices, index=self.major_axis, columns=self.items, default_fill_value=self.default_fill_value, default_kind=self.default_kind) # TODO: allow SparsePanel to work with flex arithmetic. # pow and mod only work for scalars for now def pow(self, val, *args, **kwargs): """wrapper around `__pow__` (only works for scalar values)""" return self.__pow__(val) def mod(self, val, *args, **kwargs): """wrapper around `__mod__` (only works for scalar values""" return self.__mod__(val) # Sparse objects opt out of numexpr SparsePanel._add_aggregate_operations(use_numexpr=False) ops.add_special_arithmetic_methods(SparsePanel, use_numexpr=False, ** ops.panel_special_funcs) SparseWidePanel = SparsePanel def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'): from pandas.core.panel import _get_combined_index output = {} for item, df in compat.iteritems(frames): if not isinstance(df, SparseDataFrame): df = SparseDataFrame(df, default_kind=kind, default_fill_value=fill_value) output[item] = df if index is None: all_indexes = [x.index for x in output.values()] index = _get_combined_index(all_indexes) if columns is None: all_columns = [x.columns for x in output.values()] columns = _get_combined_index(all_columns) index = _ensure_index(index) columns = _ensure_index(columns) for item, df in compat.iteritems(output): if not (df.index.equals(index) and df.columns.equals(columns)): output[item] = df.reindex(index=index, columns=columns) return output, index, columns def _stack_sparse_info(frame): lengths = [s.sp_index.npoints for _, s in compat.iteritems(frame)] # this is pretty fast minor_labels = np.repeat(np.arange(len(frame.columns)), lengths) inds_to_concat = [] vals_to_concat = [] for col in frame.columns: series = frame[col] if not np.isnan(series.fill_value): raise TypeError('This routine assumes NaN fill value') int_index = series.sp_index.to_int_index() inds_to_concat.append(int_index.indices) vals_to_concat.append(series.sp_values) major_labels = np.concatenate(inds_to_concat) sparse_values = np.concatenate(vals_to_concat) return sparse_values, major_labels, minor_labels