Source code for fdi.dataset.tabledataset

# -*- coding: utf-8 -*-

from .indexed import Indexed
from .ndprint import ndprint
from .odict import ODict
from ..utils.common import wls
from .dataset import make_title_meta_l0, CompositeDataset
from .shaped import Shaped
from .metadata import MetaData

try:
    from .tabledataset_datamodel import Model
except ImportError:
    Model = {'metadata': {}}


import sys
from collections.abc import Sequence
from collections import OrderedDict
import itertools
from copy import copy

# Compare version tuples directly: encoding the version as a float
# (major + 0.1 * minor) carries a minor >= 10 into the major digit and
# depends on floating-point rounding at the 3.3 boundary.
if sys.version_info >= (3, 3):
    PY33 = True
    from collections.abc import Container, Sequence, Mapping
    # Types accepted as "sequence-like" / "mapping-like" input by setData().
    seqlist = Sequence
    maplist = Mapping
else:
    # Interpreters older than 3.3 are not supported; the mock-up fallback
    # below is only reached when assertions are disabled.
    assert 0, 'python 3'
    PY33 = False
    from .collectionsMockUp import ContainerMockUp as Container
    from .collectionsMockUp import SequenceMockUp as Sequence
    from .collectionsMockUp import MappingMockUp as Mapping
    seqlist = (tuple, list, Sequence, str)
    # ,types.XRangeType, types.BufferType)
    maplist = (dict, Mapping)

import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# logger.debug('level %d' %  (logger.getEffectiveLevel()))


class TableModel():
    """Accessors for interrogating a column-oriented table.

    The methods here rely on ``getData()`` (a mapping of column name to
    column) and ``getColumn()``, which this class does not define itself;
    the class it is mixed into has to provide them.
    """

    def __init__(self, **kwds):
        """Forward all keyword arguments to the next class in the MRO."""
        super().__init__(**kwds)

    def getColumnClass(self, columnIndex):
        """Return the class of the first cell value of the given column."""
        first_cell = self.getColumn(columnIndex)[0]
        return first_cell.__class__

    def getColumnCount(self):
        """Return how many columns the model holds."""
        columns = self.getData()
        return len(columns)

    def getColumnName(self, columnIndex):
        """Return the column name at ``columnIndex``.

        ``columnIndex`` is used to index the list of names, so a slice
        yields a list of names.
        """
        names = self.getColumnNames()
        return names[columnIndex]

    def getColumnNames(self):
        """Return the column names as a list."""
        return [*self.getData().keys()]

    def getRowCount(self):
        """Return the number of rows, taken as the length of column 0."""
        first_column = self.getColumn(0)
        return len(first_column)

    def getValueAt(self, rowIndex, columnIndex):
        """Return the cell value at ``rowIndex`` of column ``columnIndex``."""
        cells = self.getColumn(columnIndex).data
        return cells[rowIndex]

    def isCellEditable(self, rowIndex, columnIndex):
        """Report whether a cell may be edited; every cell is editable."""
        return True

    def setValueAt(self, value, rowIndex, columnIndex):
        """Store ``value`` at ``rowIndex`` of column ``columnIndex``."""
        cells = self.getColumn(columnIndex).data
        cells[rowIndex] = value
def maybe2rows(header_names, units, col_width, sep='.', one_row=False,
               linebreak='\n'):
    """Make one-row or two-row column headers.

    Each header name is split at the right-most occurrence of a separator
    character into a group and a sub-name. When two consecutive headers
    share the same non-empty group, a two-row header is produced.

    :header_names: iterable of column names.
    :units: iterable of unit strings, paired with the names in order.
    :col_width: width passed to ``wls`` for the name part; ``None`` is
        replaced by -1.
    :sep: a string of separator characters to split a header into two
        fragments. The characters are tried in order and the first one
        that splits the header is used. Grouping does not distinguish
        which sep-char was used. Example: '.' (default), '._/'
    :one_row: force one row but add line breaks at '.'.
    :linebreak: string placed between the name and the ``(unit)`` part.
    :return: a list of ``name + linebreak + '(unit)'`` strings, or, when a
        repeated group was found, a list of ``(group, header)`` tuples.
    """
    if col_width is None:
        col_width = -1
    # The names are walked twice below; materialize them so a one-shot
    # iterator is not exhausted by the first pass.
    header_names = list(header_names)
    found_repeat = False
    hd, hd2 = [], []
    last = None
    for x in header_names:
        r = str(x)
        # A header that contains a separator but parses as a number
        # (e.g. '1.5') is kept whole instead of being split.
        is_number = False
        if any(s in r for s in sep):
            try:
                float(r)
            except ValueError:
                pass
            else:
                is_number = True
        if is_number:
            hd.append(x)
            hd2.append('')
            if not found_repeat:
                last = ''
            continue

        # 'a.b', 'a.c' -> ('a','b'), ('a','c'); 'bar' -> ('', 'bar').
        # Defaults cover the no-split case, including an empty ``sep``.
        p0, p1 = '', r
        for s in sep:
            p = r.rsplit(s, 1)
            if len(p) > 1:
                # 'foo.'.rsplit('.', 1) == ['foo', '']
                p0, p1 = p
                break
        hd.append(p1)
        hd2.append(p0)
        # repeat is never found if one row
        if not one_row and not found_repeat:
            if p0 != '' and p0 == last:
                found_repeat = True
            last = p0

    # Name part of each header; its width is limited by ``wls``.
    hdr1 = [wls(str(x).replace('.', '.\n') if one_row else x,
                width=col_width, linebreak=linebreak)
            for x in (hd if found_repeat else header_names)]
    # name and unit rows.
    hdr = ['%s%s(%s)' % (name, linebreak, unit)
           for name, unit in zip(hdr1, units)]
    if found_repeat:
        # a repeated group was found: use a 2-row header
        return list(zip(hd2, hdr))
    return hdr
# The 'metadata' section of the data model; an empty dict when the
# ``tabledataset_datamodel`` import above fell back to the stub ``Model``.
MdpInfo = Model['metadata']
class TableDataset(CompositeDataset, TableModel, Shaped):
    """ Special dataset that contains a single Array Data object.

    A TableDataset is a tabular collection of Columns. It is optimized
    to work on array data.

    The column-wise approach is convenient in many cases. For example,
    one has an event list, and each algorithm is adding a new field to
    the events (i.e. a new column, for example a quality mask).

    Although mechanisms are provided to grow the table row-wise, one
    should use these with care especially in performance driven
    environments as this orthogonal approach (adding rows rather than
    adding columns) is expensive.

    General Note:

    For reasons of flexibility, memory consumption and performance, this
    class is not checking whether all columns are of the same length:
    this is the responsibility of the user/developer. See also the
    library documentation for more information about this.

    Note on column names:

    If a column is added without specifying a name, the name ColumnX is
    created, where X denotes the index of that column.
    Column name duplicity is not allowed.

    Developers:

    See "Writing special datasets or products" at the developer's
    documentation also.

    Please see also this selection example.
    """

    def __init__(self, data=None,
                 description=None,
                 typ_=None,
                 version=None,
                 zInfo=None,
                 alwaysMeta=True,
                 **kwds):
        """ Create a table and hand the arguments to the parent classes.

        All named arguments except ``zInfo`` are forwarded to the parent
        initializer as keyword arguments, together with ``kwds``.
        ``zInfo`` defaults to the module-level ``Model``.
        """
        # Column data arrays, kept in column order. Must exist before the
        # parent initializer runs.
        self._list = []

        # collect MDPs from args-turned-local-variables.
        # NOTE: locals() must be taken before any other local is bound.
        metasToBeInstalled = copy(locals())
        metasToBeInstalled.pop('__class__', None)
        metasToBeInstalled.pop('kwds', None)
        metasToBeInstalled.pop('self', None)
        metasToBeInstalled.pop('zInfo', None)

        global Model
        if zInfo is None:
            zInfo = Model

        # initialize data, meta, unit
        super().__init__(zInfo=zInfo, **metasToBeInstalled, **kwds)
        self.updateShape()

    # def getData(self):
    #     """ Optimized for _data being an ``ODict/dict`` implemented with ``DataContaier``.
    #     """
    #     return self._data

    def setData(self, data):
        """ Sets name-column pairs from data.

        Valid forms include: {str:Column, ...} or [(str, [num, ...], str)]
        or [(str, Column), ...] or [[num ...], [num ...], ...]

        [{'name':str,'column':Column}] form is deprecated.

        Existing data will be discarded except when the provided data is
        a list of lists, where existing column names and units will
        remain but data replaced, and extra data items will form new
        columns named 'column'+index (index counting from 1) with unit
        None.

        Raises ``DeprecationWarning`` for the deprecated form,
        ``ValueError`` for a list member that cannot be interpreted, and
        ``TypeError`` when ``data`` is neither a Sequence nor a Mapping.
        """
        # logging.debug(data.__class__)

        if data is None:
            super(TableDataset, self).setData(ODict())
            self._list = []
            self.updateShape()
            return

        current_data = self.getData()
        # list of keys of current data
        curdk = list(current_data.keys()) if current_data else []
        super(TableDataset, self).setData(ODict())
        # The columns are rebuilt from scratch below, so the parallel list
        # of column arrays has to start empty too; otherwise entries of
        # the discarded columns would linger and shift every index.
        self._list = []

        if issubclass(data.__class__, seqlist):
            from .arraydataset import Column
            for ind, x in enumerate(data):
                if issubclass(x.__class__, maplist) \
                   and 'name' in x and 'column' in x:
                    raise DeprecationWarning(
                        'Do not use [{"name":name, "column":column}...]. Use {name:column, ...} instead.')
                if not issubclass(x.__class__, (list, tuple)):
                    raise ValueError(
                        'Cannot extract name and column from list member ' + str(x))
                # string-started members carry a column name
                # (a 2-string member is not treated as one)
                if len(x) > 1 and issubclass(x[0].__class__, str) \
                   and not issubclass(x[1].__class__, str):
                    if issubclass(x[1].__class__, (list, tuple)):
                        u = x[2] if len(x) > 2 else ''
                        self.setColumn(x[0], Column(data=x[1], unit=u))
                    elif issubclass(x[1].__class__, Column):
                        self.setColumn(x[0], x[1])
                    else:
                        raise ValueError(
                            '[[str, [], str]...], [[str, []]...], [[str, Column]...] needed.')
                else:
                    # x is not string-started
                    # e.g. [[1, 2, 3], [4, 5, 6]]
                    if current_data is None or len(current_data) <= ind:
                        # no previous ind-th column: make an auto-named one
                        self.setColumn('', Column(data=x, unit=None))
                    else:
                        # reuse the previous ind-th column, replacing its data
                        colname = curdk[ind]
                        current_data[colname].data = x
                        self.setColumn(colname, current_data[colname])
        elif issubclass(data.__class__, maplist):
            # {name: Column, ...}
            for k, v in data.items():
                self.setColumn(k, v)
        else:
            raise TypeError('must be a Sequence or a Mapping. ' +
                            data.__class__.__name__ + ' found.')
        self.updateShape()

    def addColumn(self, name, column, col_des=True):
        """ Adds the specified column to this table, and attaches a name
        to it.

        If the name is null, a dummy name "column"+column_count+1 is
        created, such that it can be accessed by getColumn(str).

        If column name exists the corresponding column is substituted.

        Parameters:
        name - column name.
        column - column to be added.
        col_des - if True (default) and if column description is
        'UNKNOWN' or `None`, set to column name.
        """
        d = self.getData()
        if d is None:
            # NOTE(review): this fresh dict is never stored back on the
            # dataset -- confirm whether getData() can be None here.
            d = ODict()
        if name == '' or name is None:
            idx = self.getColumnCount()
            name = 'column' + str(idx+1)
            self._list.append(column.getData())
        else:
            try:
                # existing name: substitute its data array in place
                self._list[self.indexOf(name)] = column.getData()
            except ValueError:
                # indexOf() did not find the name: this is a new column
                self._list.append(column.getData())
        des = column.getDescription()
        # Parenthesized so that ``col_des=False`` really leaves the
        # description untouched, also when it is None.
        if col_des and (des == 'UNKNOWN' or des is None):
            column.setDescription(name)
        d[name] = column
        self.updateShape()

    def removeColumn(self, key):
        """ Removes the columns specified by ``key``.

        ref. ``getColumnMap`` on ``key`` usage.
        """
        # Snapshot the names: with ``key=None`` getColumnMap() returns the
        # live data mapping, which must not change size while iterated.
        for name in list(self.getColumnMap(key).keys()):
            self._list.pop(self.indexOf(name))
            del(self.data[name])
        self.updateShape()

    def indexOf(self, key):
        """ Returns the index of specified column.

        If the key is a Column, it looks for equal references (same
        column objects), not for equal values.
        If the key is a string, returns the index of the specified
        Column name.

        Raises ``ValueError`` when the column is not found and
        ``TypeError`` when ``key`` is neither a string nor a Column.
        """
        from .arraydataset import Column
        if issubclass(key.__class__, str):
            ks = list(self.getData().keys())
            k = key
        elif issubclass(key.__class__, Column):
            ks = [id(v) for v in self.getData().values()]
            k = id(key)
        else:
            raise TypeError("key must be string or Column, not %s." %
                            type(key).__name__)
        self.updateShape()
        return ks.index(k)

    def addRow(self, row, rows=False):
        """ Adds the specified map as a new row to this table.

        row: row is a dict with names as keys and row data as values.
        rows: append each element in row if the row data is a list.

        Raises ``ValueError`` when ``row`` has fewer entries than the
        table has columns.
        """
        d = self.getData()
        if len(row) < len(d):
            msg = 'row width %d should be %d.' % (len(row), len(d))
            raise ValueError(msg)
        for c in d.keys():
            if rows:
                d[c].data.extend(row[c])
            else:
                d[c].data.append(row[c])
            if hasattr(d[c], 'updateShape'):
                d[c].updateShape()
        self.updateShape()

    def getRowMap(self, rowIndex):
        """ Returns a dict of column-names as the keys and the objects
        located at a particular row(s) as the values.

        rowIndex: return the following as the value for each key-value
        pair:

        * int: the int-th row's elements;
        * ``Slice`` object, a list of rows from slicing the column.
          Example ``a.getRow(Slice(3,,))``;
        * list of integers: they are used as the row index to select
          the rows. An empty list selects no row.
        * list of booleans: rows where the corresponding boolean is
          True are chosen.

        Raises ``ValueError`` for any other kind of ``rowIndex``.
        """
        cl = rowIndex.__class__
        d = self.getData()
        if issubclass(cl, (int, slice)):
            return {n: c.getData()[rowIndex] for n, c in d.items()}
        if issubclass(cl, list):
            if not rowIndex:
                return {n: [] for n in d}
            # the first element decides how the list is interpreted
            kind = type(rowIndex[0])
            if kind is int:
                return {n: [c.getData()[i] for i in rowIndex]
                        for n, c in d.items()}
            if kind is bool:
                # if len(rowIndex) != len(n):
                #     logger.info('%s Selection length %d should be %d.' %
                #                 (name, len(rowIndex), len(n)))
                return {n: [x for x, s in zip(c.getData(), rowIndex) if s]
                        for n, c in d.items()}
        raise ValueError(
            'RowIndex must be an int, a slice, or a list of ints or bools.')

    def getRow(self, rowIndex):
        """ Returns a list containing the objects located in a particular
        row, or a list of rows.

        rowIndex: ref ``getRowMap()``

        * int: return the int-th row in a list of elements;
        * ``Slice`` object, list of integers, list of booleans: return a
          list of rows each represented by a tuple.
          Example ``a.getRow(Slice(3,,))``, ``[2,4]``,
          ``[True, False...]``.
        """
        it = self.getRowMap(rowIndex).values()
        if issubclass(rowIndex.__class__, int):
            # return a list of row elements
            return list(it)
        # return transposed in a list
        return list(zip(*it))

    def select(self, selection):
        """ Select a number of rows from this table dataset and return a
        new TableDataset object containing only the selected rows.

        selection: to form a new Tabledataset with ref ``getRowMap()``
        """
        # Imported before first use; binding it later in the function
        # makes ``Column`` an unbound local on the int path.
        from .arraydataset import Column

        # a single row comes back as bare cell values: wrap each in a list
        single = issubclass(selection.__class__, int)
        d = ODict()
        for name, data in self.getRowMap(selection).items():
            d[name] = Column(data=[data] if single else data,
                             unit=self.getColumn(name).getUnit())
        return TableDataset(data=d)

    def removeRow(self, rowIndex):
        """ Removes a row with specified index from this table.

        rowIndex: int or a ``Slice`` object.
        Example ``a.removeRow(Slice(3,,))``.
        return: removed row data, one entry per column.
        """
        sliced = issubclass(rowIndex.__class__, slice)
        removed = []
        for x in self.getData().values():
            if sliced:
                removed.append(x.data[rowIndex])
                del x.data[rowIndex]
            else:
                removed.append(x.data.pop(rowIndex))
            x.updateShape()
        self.updateShape()
        return removed

    @property
    def rowCount(self):
        """ Number of rows; see ``getRowCount()``. """
        return self.getRowCount()

    @rowCount.setter
    def rowCount(self, newRowCount):
        self.setRowCount(newRowCount)

    def setRowCount(self, rowCount):
        """ cannot do this. Always raises ``ValueError``. """
        raise ValueError('Cannot set row count.')

    @property
    def columnCount(self):
        """ Number of columns; see ``getColumnCount()``. """
        return self.getColumnCount()

    @columnCount.setter
    def columnCount(self, newColumnCount):
        self.setColumnCount(newColumnCount)

    def setColumnCount(self, columnCount):
        """ cannot do this. Always raises ``ValueError``. """
        raise ValueError('Cannot set column count.')

    @property
    def list(self):
        """ The list of column data arrays, in column order. """
        return self._list

    @list.setter
    def list(self, l):
        # ``NotImplemented`` is a comparison sentinel, not an exception.
        raise NotImplementedError('Cannot set list.')

    def getColumnMap(self, key=None):
        """ Returns a dict of column-names as the keys and the column(s)
        as the values.

        key: return the following as the value for each key-value pair:

        * int: name-value where value is the int-th column.
        * ``Slice`` object, a list of name-columns from slicing the
          column index. Example ``a.getColumn(Slice(3,,))``;
        * str: name-value of the named column.
        * Sequence of integers/strings: they are used as the column
          index/name to select the columns. An empty sequence selects
          no column.
        * Sequence of booleans: columns where the corresponding boolean
          is True are chosen.

        Default is to return all columns; in that case, and when the
        table is empty, the data mapping itself is returned, not a copy.

        Raises ``ValueError`` for any other kind of ``key``.
        """
        d = self.getData()
        try:
            if d is None or key is None or len(d) == 0:
                return d
        except TypeError:
            pass
        cl = key.__class__
        if issubclass(cl, int):
            t = list(d.items())[key]
            return {t[0]: t[1]}  # {str:Column}
        if issubclass(cl, slice):
            return ODict(list(d.items())[key])  # {str:Column, ...}
        if issubclass(cl, str):
            return {key: d[key]}  # {str:Column}
        if issubclass(cl, Sequence):
            if len(key) == 0:
                return ODict()
            # the first element decides how the sequence is interpreted
            kind = type(key[0])
            if kind is int:
                # build the item list once, not once per index
                pairs = list(d.items())
                return ODict(pairs[i] for i in key)  # {str:Column, ...}
            if kind is str:
                return {n: d[n] for n in key}  # {str:Column, ...}
            if kind is bool:
                # {str:Column, ...}
                return ODict(x for x, s in zip(d.items(), key) if s)
        raise ValueError(
            '``key`` must be an int, a string, a slice, or a list of ints, strings, or bools.')

    def getColumn(self, key):
        """ Returns the particular column, or a list of columns.

        key: ref ``getColumnMap()``

        * int/str: return the int-th/named column;
        * ``Slice`` object, list of columns of sliced column indices;
        * list of integers/strings: return a list of columns
          corresponding to the given column index/name.
          Example ``a.getColumn(Slice(3,,))``, ``[2, 4]``,
          ``['time', 'energy']``.
        * list of booleans: return a list of columns where key is True.
          Example ``[True, False...]``.

        An empty list is returned when nothing is selected.
        """
        it = self.getColumnMap(key).values()
        if len(it) == 0:
            return []
        if issubclass(key.__class__, (int, str)):
            # a single column
            return list(it)[0]
        # a list of columns
        return list(it)

    def setColumn(self, key, value):
        """ Replaces a column in this table with specified name to
        specified column if key is a string and exists, or if the key is
        an integer in 0 to the number of columns, replace the column at
        column-index=key, else add a new column with an auto-generated
        name.
        """
        if self.getData() is None:
            key = ''
        elif type(key) == int:
            nms = self.getColumnNames()
            # an in-range index is translated to the existing name;
            # anything else requests an auto-named new column
            key = nms[key] if 0 <= key < len(nms) else ''
        self.addColumn(name=key, column=value)
        self.updateShape()

    def __iter__(self):
        """ Iterate over the column names. """
        for x in self.getData():
            yield x

    def items(self):
        """ for k,v in tabledataset.items() """
        return self.getData().items()

    def __getitem__(self, key):
        """ return column of given key.

        ref. ``getColumn()``.
        """
        return self.getColumn(key)

    def __setitem__(self, key, value):
        """ set column of given key. ref. ``setColumn()``. """
        self.setColumn(key, value)

    def __delitem__(self, key):
        """ delete column of given key.

        ref. ``removeColumn()``.
        """
        return self.removeColumn(key)

    keys = TableModel.getColumnNames

    # __len__ = TableModel.getColumnCount

    # def __contains__(self, name):
    #     """ if 'name` is found in column names.
    #     """
    #     return name in self.getData()

    def __repr__(self):
        return self.toString(level=2)

    def toString(self, level=0, extra=False, param_widths=None,
                 tablefmt='grid', tablefmt1='simple', tablefmt2='plain',
                 width=0, matprint=None, trans=True, heavy=True,
                 center=-1, **kwds):
        """ Return a text (or HTML) rendering of the table.

        level: above 1 gives a compact one-line form; 1 limits the
            listing to 20 rows; 0 lists all rows under a DATA title.
        matprint: function used to render the cell matrix; defaults to
            ``ndprint``.
        tablefmt2: format of 2D data, others see `MetaData.toString`.
        HTML output is produced when 'html' occurs in ``tablefmt`` or
        ``tablefmt2``.
        """
        if matprint is None:
            matprint = ndprint

        cn = self.__class__.__name__
        if level > 1:
            s = cn + '('
            s += self.meta.toString(
                level=level, extra=extra, param_widths=param_widths,
                tablefmt=tablefmt, tablefmt1=tablefmt1, tablefmt2=tablefmt2,
                width=width, **kwds)
            # NOTE(review): ``tablefmt2=tablefmt`` below is kept from the
            # original; confirm it is not meant to be ``tablefmt2``.
            return s + 'data= {' + \
                ', '.join('"%s": %s' % (k, v.toString(
                    level=level, extra=extra, param_widths=param_widths,
                    tablefmt=tablefmt, tablefmt1=tablefmt1,
                    tablefmt2=tablefmt,
                    width=width, heavy=heavy, **kwds))
                    for k, v in self.getColumnMap().items()) + \
                '})'

        html = 'html' in tablefmt.lower() or 'html' in tablefmt2.lower()
        br = '<br>' if html else '\n'
        if html:
            tablefmt = tablefmt2 = 'unsafehtml'
        s, last = make_title_meta_l0(self, level=level, extra=extra,
                                     param_widths=param_widths,
                                     tablefmt=tablefmt, tablefmt1=tablefmt1,
                                     tablefmt2=tablefmt2, center=center,
                                     width=width, heavy=heavy, html=html,
                                     excpt=['description'], **kwds)
        # the title is centered on the width of the closing line
        width = len(last)-1
        if level == 0:
            if html:
                d = '<center><u>%s</u></center>\n' % 'DATA'
            else:
                d = 'DATA'.center(width) + '\n' + \
                    '----'.center(width) + '\n'
        else:
            d = ''
        rowlimit = 2 if level > 1 else 20 if level == 1 else None
        cols = self.getData().values()

        # This is obsolete for tabulate >= 0.8.10:
        # widest width in all of default and in param_widths
        # w = MetaData.MaxDefWidth if param_widths == -1 or \
        #     param_widths is None else \
        #     max(MetaData.MaxDefWidth, max(param_widths.values()))
        # if html:
        #     w = w
        w = MetaData.MaxDefWidth

        coldata = [list(itertools.islice(x.data, rowlimit)) for x in cols]
        # table headers
        hdr = maybe2rows(self.getData().keys(),
                         (str(x.unit) for x in cols),
                         col_width=w, one_row=html, linebreak=br)
        d += matprint(coldata, trans=trans, headers=hdr,
                      tablefmt=tablefmt, tablefmt1=tablefmt1,
                      tablefmt2=tablefmt2,
                      center=center, mdim=2, param_widths=param_widths,
                      maxElem=sys.maxsize, **kwds)
        collen = self.getRowCount()
        if level and rowlimit is not None and rowlimit < collen:
            d += '(Only display %d rows of %d for level=%d.)' % (
                rowlimit, collen, level)
        return f'{s}\n{d}{last}\n'

    string = toString
    txt = toString

    def __getstate__(self):
        """ Can be encoded with serializableEncoder """
        return OrderedDict(
            _ATTR_meta=getattr(self, '_meta', None),
            **self.getData())
class IndexedTableDataset(Indexed, TableDataset):
    """ TableDataset with an index table for efficient row look-up. """

    def __init__(self, **kwds):
        """ Set up the index bookkeeping, then initialize the parents. """
        self._indexCols = [0]
        self._rowIndexTable = {}
        super().__init__(**kwds)  # initialize data, meta, unit

    def getColumnsToLookup(self):
        """ Returns the data arrays of the columns selected by the index
        pattern, i.e. the sequences to look up over.
        """
        # list of Column's arrays
        return [x.data for x in self.getColumn(self._indexPattern)]

    def setData(self, data):
        """ Sets name-column pairs from data and updates index if needed.

        The index is rebuilt when the table was empty before the call,
        or when ``data`` is a sequence that replaces an existing column
        whose position is in the index pattern.
        """
        d = self.getData()
        if d:
            lcd = len(d)
            # NOTE(review): a Mapping ``data`` replaces every column but
            # does not trigger a re-index here -- confirm this is intended.
            reindex = issubclass(data.__class__, seqlist) and any(
                ind < lcd and ind in self._indexPattern
                for ind, _ in enumerate(data))
        else:
            reindex = True
        super().setData(data)
        if reindex:
            self.updateToc()

    def vLookUp(self, key, return_index=True, multiple=False):
        """ Similar to Excel VLOOKUP, return all records (rows) that
        match the key.

        key: taken as a dictionary key unless ``multiple`` is True.
        return_index: if True (default) return index in the array of
            columns; otherwise return the row values.
        multiple: if True (default is False) loop through key as a
            sequence of keys and return a sequence.
        """
        toc = self._tableOfContent
        if multiple:
            if return_index:
                return [toc[k] for k in key]
            # one tuple of cell values per key
            return list(zip(*((c[toc[k]] for k in key)
                              for c in self._list)))
        if return_index:
            return toc[key]
        rec_ind = toc[key]
        return [c[rec_ind] for c in self._list]

    def hashx(self):
        """ Returns the parent's ``hash()`` over the column values. """
        return super().hash(hash_list=self.data.values())

    def __getstate__(self):
        """ Can be encoded with serializableEncoder """
        # try:
        #     description = self.description
        # except (AttributeError, KeyError):
        #     description = None
        # ``update()`` works in place and returns None, so the mapping
        # itself has to be returned.
        state = Indexed.__getstate__(self)
        state.update(
            _ATTR_meta=getattr(self, '_meta', None),
            **self.getData())
        return state