""" Load and store data of the type :mod:`pySPACE.resources.data_types.time_series` """
import itertools
import copy
import os
import cPickle
import sys
import scipy
import yaml
import csv
import numpy
import logging
import warnings
import glob
from pySPACE.missions.support.WindowerInterface import AbstractStreamReader
from pySPACE.missions.support.windower import MarkerWindower
from pySPACE.tools.filesystem import get_author
from pySPACE.resources.dataset_defs.base import BaseDataset
class TimeSeriesDataset(BaseDataset):
""" Loading and storing a time series dataset

    This class encapsulates the most relevant code for dealing with time
    series datasets, most importantly for loading them from and storing them
    to the file system.
These datasets consist of
:mod:`~pySPACE.resources.data_types.time_series` objects.
They can be loaded with a
:mod:`~pySPACE.missions.nodes.source.time_series_source` and saved with a
:mod:`~pySPACE.missions.nodes.sink.time_series_sink` node
in a :class:`~pySPACE.missions.operations.node_chain.NodeChainOperation`.
    The standard *storage_format* is 'pickle', but it is also possible to
    load Matlab files ('mat') or Brain-Computer Interface (BCI) competition
    data. For the latter, ``storage_format`` has to be set to
    **bci_comp_[competition number]_[dataset number]** in the metadata.yaml
    file (see the example below). For example, **bci_comp_2_4** loads the
    time series of BCI Competition II (2003), dataset IV.
Currently, the following datasets can be loaded:
- BCI Competition II, dataset IV: self-paced key typing (left vs. right)
- BCI Competition III, dataset II: P300 speller paradigm, training data
See http://www.bbci.de/competition/ for further information.
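
    **Example**

    A minimal, illustrative metadata.yaml excerpt for the BCI competition
    example above; the file name and sampling frequency are optional hints
    (if they are omitted, the loader falls back to the mat files found in
    the dataset directory)::

        type: time_series
        storage_format: bci_comp_2_4
        file_name: sp1s_aa_1000Hz.mat
        sampling_frequency: 1000
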
    Further storage formats are currently supported for saving the data,
    but not yet for loading it.
    This can be worked around by processing the data with a node chain
    operation that transforms the data into feature vectors and then using
    the respective storing and loading functionality, e.g., with csv and
    arff files.
    There is also a node for transforming feature vectors back to
    TimeSeries objects.
**Parameters**
:dataset_md:
A dictionary with all the meta data.
(*optional, default: None*)
    :sort_string:
        A string containing a lambda expression that is evaluated with
        ``eval`` and used as the ``key`` function for sorting the time
        series objects of each run/split before they are stored.

        (*optional, default: None*)
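
        A hedged example (the ``start_time`` attribute of the stored
        TimeSeries objects is an illustrative assumption)::

            sort_string="lambda x: x[0].start_time"
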
**Known issues**
    The BCI Competition III dataset II should actually be loaded as a
    streaming dataset to enable different windowing possibilities.
    Segment ends (i.e., where a new letter starts) could be coded as markers.
"""
    def __init__(self, dataset_md=None, sort_string=None, **kwargs):
super(TimeSeriesDataset, self).__init__(dataset_md=dataset_md)
self.stream_mode = False
if dataset_md is not None:
dataset_dir = self.meta_data["dataset_directory"]
s_format = self.meta_data["storage_format"]
if type(s_format) == list:
f_format = s_format[0]
else:
f_format = s_format
# Loading depends on whether data is split into
# training and test data, whether different splits exist and whether
# several runs have been conducted.
if f_format in ["pickle", "mat"] and not self.meta_data["train_test"] \
and self.meta_data["splits"] == 1 \
and self.meta_data["runs"] == 1:
# The dataset consists only of a single set of data, for
# one run, one splitting, and only test data
data = dataset_md["data_pattern"].replace("_run", "_run0") \
.replace("_sp", "_sp0") \
.replace("_tt", "_test")
# File that contains the time series objects
ts_file = os.path.join(dataset_dir, data)
# Current data will be loaded lazily
self.data[(0, 0, "test")] = ts_file
elif f_format in ["pickle", "mat"]:
for run_nr in range(self.meta_data["runs"]):
for split_nr in range(self.meta_data["splits"]):
for train_test in ["train", "test"]:
                            # Construct the file name for this run/split/
                            # train-test combination from the data pattern
data = dataset_md["data_pattern"]\
.replace("_run", "_run%s" % run_nr) \
.replace("_sp", "_sp%s" % split_nr) \
.replace("_tt", "_%s" % train_test)
# File that contains the time series objects
ts_file = os.path.join(dataset_dir,data)
# Actual data will be loaded lazily
self.data[(run_nr, split_nr, train_test)] = ts_file
elif f_format.startswith("bci_comp"):
                # get BCI competition and dataset number
try:
self.comp_number, self.comp_set = f_format.split('_')[2:]
except Exception:
raise Exception, "%s --- Could not extract BCI competition"\
" and dataset number!" % f_format
if self.comp_number == "2":
if self.comp_set == "4":
def _update_sf(self, file_name):
if '1000' in file_name:
self.sf = 1000
else:
self.sf = 100
self.meta_data["sampling_frequency"] = self.sf
                        # structure: two mat files with the data in different
                        # sampling frequencies; a txt file with the test labels
if "sampling_frequency" in self.meta_data.keys() and \
"file_name" in self.meta_data.keys():
# are they inconsistent?
self.sf = self.meta_data["sampling_frequency"]
if (self.sf == 100 and '1000' in \
self.meta_data["file_name"]) or \
(self.sf == 1000 and '1000' not in \
self.meta_data["file_name"]):
warnings.warn("File name does not match "
"sampling frequency or vice versa. %s "
"is loaded." % self.meta_data["file_name"])
                                _update_sf(self, self.meta_data["file_name"])
ts_file = os.path.join(dataset_dir,
self.meta_data["file_name"])
elif "file_name" in self.meta_data.keys():
                            _update_sf(self, self.meta_data["file_name"])
ts_file = os.path.join(dataset_dir,
self.meta_data["file_name"])
elif "sampling_frequency" in self.meta_data.keys():
self.sf = self.meta_data["sampling_frequency"]
if self.sf == 1000:
ts_file = os.path.join(dataset_dir,
"sp1s_aa_1000Hz.mat")
else:
ts_file = os.path.join(dataset_dir,
"sp1s_aa.mat")
else:
ts_file = glob.glob(os.path.join(dataset_dir,
"*.mat"))[0]
warnings.warn("Either file name nor sampling "
"frequency is given. %s is loaded." % ts_file)
                            _update_sf(self, ts_file)
self.data[(0, 0, "test")] = ts_file
self.data[(0, 0, "train")] = ts_file
else:
raise NotImplementedError("Loading of BCI competition" \
" %s, dataset %s not supported " \
% (self.comp_number, self.comp_set))
elif self.comp_number == "3":
if self.comp_set == "2":
                        # structure: one mat file each for train and test data
                        # TODO: loading the test labels is not possible at the
                        #       moment, so only the *Train.mat file is loaded
                        #       (under the "test" key) below!
                        # glob.glob(os.path.join(dataset_dir, "*Test.mat"))[0]
                        # self.data[(0, 0, "train")] = \
self.data[(0, 0, "test")] = \
glob.glob(os.path.join(dataset_dir,"*Train.mat"))[0]
else:
raise NotImplementedError("Loading of BCI competition" \
" %s, dataset %s not supported " \
% (self.comp_number, self.comp_set))
else:
raise NotImplementedError("Loading of BCI competition %s," \
" dataset %s not supported " \
% (self.comp_number, self.comp_set))
else: # s_format == "csv":
if "file_name" in self.meta_data.keys():
ts_file = os.path.join(dataset_dir,
self.meta_data["file_name"])
elif "data_pattern" in self.meta_data.keys():
# The collection consists only of a single set of data, for
# one run, one splitting, and only test data
data = dataset_md["data_pattern"].replace("_run", "_run0") \
.replace("_sp","_sp0") \
.replace("_tt","_test")
ts_file = os.path.join(dataset_dir,data)
elif os.path.isfile(os.path.join(dataset_dir,"data.csv")):
ts_file = os.path.join(dataset_dir,"data.csv")
else:
pathlist = glob.glob(os.path.join(dataset_dir,"*.csv"))
if len(pathlist)>1:
warnings.warn(
"To many given data sets:%s. Taking first entry."
% str(pathlist))
ts_file = pathlist[0]
elif len(pathlist) == 0:
warnings.warn("No csv file found. Trying any file.")
pathlist = glob.glob(os.path.join(dataset_dir, "*"))
ts_file = pathlist[0]
if "metadata.yaml" in ts_file:
ts_file = pathlist[1]
self.data[(0, 0, "test")] = ts_file
        self.sort_string = sort_string
    def get_data(self, run_nr, split_nr, train_test):
""" Return the train or test data for the given split in the given run.
**Parameters**
:run_nr: The number of the run whose data should be loaded.
:split_nr: The number of the split whose data should be loaded.
:train_test: "train" if the training data should be loaded.
"test" if the test data should be loaded.
"""
# Do lazy loading of the time series objects.
if isinstance(self.data[(run_nr, split_nr, train_test)], basestring):
self._log("Lazy loading of %s time series windows from input "
"collection for run %s, split %s." % (train_test, run_nr,
split_nr))
s_format = self.meta_data["storage_format"]
if type(s_format) == list:
f_format = s_format[0]
else:
f_format = s_format
if f_format == "pickle":
# Load the time series from a pickled file
                f = open(self.data[(run_nr, split_nr, train_test)], 'rb')
try:
self.data[(run_nr, split_nr, train_test)] = cPickle.load(f)
except ImportError:
# code for backward compatibility
# redirection of old path
f.seek(0)
self._log("Loading deprecated data. Please transfer it " +
"to new format.",level=logging.WARNING)
from pySPACE.resources.data_types import time_series
sys.modules['abri_dp.types.time_series'] = time_series
self.data[(run_nr, split_nr, train_test)] = cPickle.load(f)
del sys.modules['abri_dp.types.time_series']
f.close()
elif f_format in ["mat", "matlab", "MATLAB"]:
from scipy.io import loadmat
from pySPACE.resources.data_types.time_series import TimeSeries
ts_fname = self.data[(run_nr, split_nr, train_test)]
dataset = loadmat(ts_fname)
channel_names = [name.strip() for name in dataset['channel_names']]
sf = dataset["sampling_frequency"][0][0]
self.data[(run_nr, split_nr, train_test)] = []
# assume third axis to be trial axis
if "channelXtime" in s_format:
for i in range(dataset["data"].shape[2]):
self.data[(run_nr, split_nr, train_test)].append(\
(TimeSeries(dataset["data"][:,:,i].T, channel_names,
sf), dataset["labels"][i].strip()))
else:
for i in range(dataset["data"].shape[2]):
self.data[(run_nr, split_nr, train_test)].append(\
(TimeSeries(dataset["data"][:,:,i], channel_names,
sf), dataset["labels"][i].strip()))
elif f_format.startswith("bci_comp"):
from scipy.io import loadmat
from pySPACE.resources.data_types.time_series import TimeSeries
if self.comp_number == "2":
if self.comp_set == "4":
ts_fname = self.data[(run_nr, split_nr, train_test)]
d = loadmat(ts_fname)
channel_names = [name[0].astype('|S3') for name in \
d["clab"][0]]
if train_test == "train":
self.data[(run_nr, split_nr, train_test)] = []
input_d = d["x_train"]
input_l = d["y_train"][0]
for i in range(input_d.shape[2]):
self.data[(run_nr, split_nr,
train_test)].append(\
(TimeSeries(input_d[:,:,i],
channel_names, float(self.sf)),
"Left" if input_l[i] == 0 else "Right"))
else:
label_fname = glob.glob(os.path.join(
os.path.dirname(ts_fname),"*.txt"))[0]
input_d = d["x_test"]
input_l = open(label_fname,'r')
self.data[(run_nr, split_nr, train_test)] = []
for i in range(input_d.shape[2]):
label = int(input_l.readline())
self.data[(run_nr, split_nr,
train_test)].append(\
(TimeSeries(input_d[:,:,i],
channel_names, float(self.sf)),
"Left" if label == 0 else "Right"))
elif self.comp_number == "3":
if self.comp_set == "2":
data = loadmat(self.data[(run_nr, split_nr, train_test)])
signal = data['Signal']
flashing = data['Flashing']
stimulus_code = data['StimulusCode']
stimulus_type = data['StimulusType']
window = 240
Fs = 240
channels = 64
epochs = signal.shape[0]
self.data[(run_nr, split_nr, train_test)] = []
self.start_offset_ms = 1000.0
self.end_offset_ms = 1000.0
                        # integer offsets (in samples) before and after each window
                        start_offset = int(self.start_offset_ms * Fs / 1000.0)
                        end_offset = int(self.end_offset_ms * Fs / 1000.0)
                        whole_len = start_offset + end_offset + window
responses = numpy.zeros((12, 15, whole_len, channels))
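                        # For every epoch: detect the end of each flash
                        # (``flashing`` drops from 1 to 0), cut a window of
                        # whole_len samples around the flash onset (the
                        # constant 24 corresponds to the 100 ms flash
                        # duration at 240 Hz), zero-pad at the segment
                        # borders and collect the 15 repetitions of each of
                        # the 12 row/column stimuli in ``responses`` for
                        # averaging below.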
for epoch in range(epochs):
                            rowcolcnt = numpy.ones(12, dtype=int)
                            for n in range(1, signal.shape[1]):
                                if flashing[epoch, n] == 0 and flashing[epoch, n-1] == 1:
                                    rowcol = int(stimulus_code[epoch, n-1])
                                    if n - 24 - start_offset < 0:
                                        temp = signal[epoch, 0:n + window + end_offset - 24, :]
                                        temp = numpy.vstack((numpy.zeros(
                                            (whole_len - temp.shape[0], temp.shape[1])), temp))
                                    elif n + window + end_offset - 24 > signal.shape[1]:
                                        temp = signal[epoch, n - 24 - start_offset:signal.shape[1], :]
                                        temp = numpy.vstack((temp, numpy.zeros(
                                            (whole_len - temp.shape[0], temp.shape[1]))))
                                    else:
                                        temp = signal[epoch, n - 24 - start_offset:
                                                      n + window + end_offset - 24, :]
                                    responses[rowcol - 1, rowcolcnt[rowcol - 1] - 1, :, :] = temp
                                    rowcolcnt[rowcol - 1] += 1
avgresp=numpy.mean(responses,1)
targets = stimulus_code[epoch,:]*stimulus_type[epoch,:]
target_rowcol = []
for value in targets:
if value not in target_rowcol:
target_rowcol.append(value)
target_rowcol.sort()
for i in range(avgresp.shape[0]):
temp = avgresp[i,:,:]
                                data = TimeSeries(input_array=temp,
                                                  channel_names=range(channels),
                                                  sampling_frequency=Fs)
if i == target_rowcol[1]-1 or i == target_rowcol[2]-1:
self.data[(run_nr, split_nr, train_test)].append((data,"Target"))
else:
self.data[(run_nr, split_nr, train_test)].append((data,"Standard"))
if self.stream_mode and not self.data[(run_nr, split_nr, train_test)] == []:
# Create a connection to the TimeSeriesClient and return an iterator
# that passes all received data through the windower.
self.reader = TimeSeriesClient(self.data[(run_nr, split_nr, train_test)], blocksize=100)
# Creates a windower that splits the training data into windows
            # based on the window definitions provided
# and assigns correct labels to these windows
self.reader.set_window_defs(self.window_definition)
self.reader.connect()
self.marker_windower = MarkerWindower(
self.reader, self.window_definition,
nullmarker_stride_ms=self.nullmarker_stride_ms,
no_overlap=self.no_overlap,
data_consistency_check=self.data_consistency_check)
return self.marker_windower
else:
return self.data[(run_nr, split_nr, train_test)]
    def store(self, result_dir, s_format="pickle"):
""" Stores this collection in the directory *result_dir*.
In contrast to *dump* this method stores the collection
not in a single file but as a whole directory structure with meta
information etc. The data sets are stored separately for each run,
split, train/test combination.
**Parameters**
:result_dir:
The directory in which the collection will be stored.
        :name:
            The prefix of the file names in which the individual data sets
            are stored. The actual file names are determined by appending
            suffixes that encode run, split, train/test information.
            Currently this prefix is fixed to "time_series".
        :s_format:
            The format in which the actual data sets should be stored.
            Possible formats are 'pickle', 'text', 'csv', 'mat' (Matlab) and
            'bp_eeg' (BrainVision) format. If s_format is a list, the second
            element further specifies additional options for storing.

            - pickle:
                Standard Python format
            - text:
                In the text format, all time series objects are concatenated
                to a single large table containing only integer values.
            - csv:
                For the csv format, comma separated values are taken as
                default or a specified Python format string.
            - mat:
                Scipy's savemat function is used for storing. The data is
                stored as a 3-dimensional array; meta data such as the
                sampling frequency and the channel names is saved as well.
                As an additional parameter the orientation of the data
                arrays can be given as 'channelXtime' or 'timeXchannel'.
.. note:: For the text and MATLAB format, markers could be added
by using a Marker_To_Mux node before
(*optional, default: "pickle"*)
.. todo:: Put marker to the right time point and also write marker channel.
        .. todo:: Shouldn't the 'text' and 'csv' formats be part of the
                  stream dataset?
"""
name = "time_series"
# for some storage procedures we need further specifications
s_type = None
if type(s_format) == list:
# file format is first position
f_format = s_format[0]
if len(s_format) > 1:
s_type = s_format[1]
else:
f_format = s_format
if f_format == "text" and s_type is None:
s_type = "%i"
elif f_format == "csv" and s_type == "real":
s_type = "%.18e"
# Update the meta data
author = get_author()
self.update_meta_data({"type": "time_series",
"storage_format": s_format,
"author": author,
"data_pattern": "data_run" + os.sep
+ name + "_sp_tt." + f_format})
# Iterate through splits and runs in this dataset
for key, time_series in self.data.iteritems():
# load data, if necessary
# (due to the lazy loading, the data might be not loaded already)
if isinstance(time_series, basestring):
time_series = self.get_data(key[0], key[1], key[2])
if self.sort_string is not None:
time_series.sort(key=eval(self.sort_string))
# Construct result directory
result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
if not os.path.exists(result_path):
os.mkdir(result_path)
key_str = "_sp%s_%s" % key[1:]
# Store data depending on the desired format
if f_format in ["pickle", "cpickle", "cPickle"]:
                result_file = open(os.path.join(result_path,
                                                name + key_str + ".pickle"),
                                   "wb")
cPickle.dump(time_series, result_file, cPickle.HIGHEST_PROTOCOL)
result_file.close()
elif f_format in ["text","csv"]:
self.update_meta_data({
"type": "stream",
"marker_column": "marker"})
result_file = open(os.path.join(result_path,
name + key_str + ".csv"), "w")
csvwriter = csv.writer(result_file)
channel_names = copy.deepcopy(time_series[0][0].channel_names)
if f_format == "csv":
channel_names.append("marker")
csvwriter.writerow(channel_names)
for (data, key) in time_series:
if f_format == "text":
numpy.savetxt(result_file, data, delimiter=",", fmt=s_type)
                        if key is not None:
result_file.write(str(key))
result_file.flush()
elif data.marker_name is not None \
and len(data.marker_name) > 0:
result_file.write(str(data.marker_name))
result_file.flush()
else:
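                        # csv: write one row per sample; the window's label
                        # (or its marker_name dictionary, if no label is set)
                        # is written into the marker column of the first row
                        # only, all following rows get an empty marker entry.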
first_line = True
marker = ""
                        if key is not None:
marker = str(key)
elif data.marker_name is not None \
and len(data.marker_name) > 0:
marker = str(data.marker_name)
                        for line in data:
                            l = list(line)
                            l.append(marker)
                            csvwriter.writerow(l)
if first_line:
first_line = False
marker = ""
result_file.flush()
result_file.close()
elif f_format in ["matlab", "mat", "MATLAB"]:
# todo: handle all the other attributes of ts objects!
import scipy.io
result_file_name = os.path.join(result_path,
name + key_str + ".mat")
# extract a first time series object to get meta data
ts1 = time_series[0][0]
# collect all important information in the collection_object
dataset_dict = {
"sampling_frequency": ts1.sampling_frequency,
"channel_names": ts1.channel_names}
                # we have to extract the data and labels separately
if 'channelXtime' in s_format:
dataset_dict["data"] = [data.T for data, _ in time_series]
else:
dataset_dict["data"] = [data for data, _ in time_series]
dataset_dict["labels"] = [label for _, label in time_series]
# construct numpy 3d array (e.g., channelXtimeXtrials)
dataset_dict["data"] = numpy.rollaxis(numpy.array(
dataset_dict["data"]), 0, 3)
scipy.io.savemat(result_file_name, mdict=dataset_dict)
elif f_format in ["bp_eeg"]:
result_file = open(os.path.join(result_path,
name + key_str + ".eeg"),"a+")
result_file_mrk = open(os.path.join(result_path,
name + key_str + ".vmrk"),"w")
result_file_mrk.write("Brain Vision Data Exchange Marker File, "
"Version 1.0\n")
result_file_mrk.write("; Data stored by pySPACE\n")
result_file_mrk.write("[Common Infos]\n")
result_file_mrk.write("Codepage=UTF-8\n")
result_file_mrk.write("DataFile=%s\n" %
str(name + key_str + ".eeg"))
result_file_mrk.write("\n[Marker Infos]\n")
markerno = 1
datapoint = 1
sf = None
channel_names = None
for t in time_series:
if sf is None:
sf = t[0].sampling_frequency
if channel_names is None:
channel_names = t[0].get_channel_names()
for mrk in t[0].marker_name.keys():
for tm in t[0].marker_name[mrk]:
result_file_mrk.write(str("Mk%d=Stimulus,%s,%d,1,0\n" %
(markerno, mrk, datapoint+(tm*sf/1000.0))))
markerno += 1
data_ = t[0].astype(numpy.int16)
data_.tofile(result_file)
datapoint += data_.shape[0]
result_hdr = open(os.path.join(result_path,
name + key_str + ".vhdr"),"w")
result_hdr.write("Brain Vision Data Exchange Header "
"File Version 1.0\n")
result_hdr.write("; Data stored by pySPACE\n\n")
result_hdr.write("[Common Infos]\n")
result_hdr.write("Codepage=UTF-8\n")
result_hdr.write("DataFile=%s\n" %
str(name + key_str + ".eeg"))
result_hdr.write("MarkerFile=%s\n" %
str(name + key_str + ".vmrk"))
result_hdr.write("DataFormat=BINARY\n")
result_hdr.write("DataOrientation=MULTIPLEXED\n")
result_hdr.write("NumberOfChannels=%d\n" % len(channel_names))
result_hdr.write("SamplingInterval=%d\n\n" % (1000000/sf))
result_hdr.write("[Binary Infos]\n")
result_hdr.write("BinaryFormat=INT_16\n\n")
result_hdr.write("[Channel Infos]\n")
# TODO: Add Resolutions to time_series
# 0 = 0.1 [micro]V,
# 1 = 0.5 [micro]V,
# 2 = 10 [micro]V,
# 3 = 152.6 [micro]V (seems to be unused!)
resolutions_str = [unicode("0.1,%sV" % unicode(u"\u03BC")),
unicode("0.5,%sV" % unicode(u"\u03BC")),
unicode("10,%sV" % unicode(u"\u03BC")),
unicode("152.6,%sV" % unicode(u"\u03BC"))]
for i in range(len(channel_names)):
result_hdr.write(unicode("Ch%d=%s,,%s\n" %
(i+1,channel_names[i],
unicode(resolutions_str[0]))).encode('utf-8'))
                result_file.close()
                result_file_mrk.close()
                result_hdr.close()
else:
NotImplementedError("Using unavailable storage format:%s!"
% f_format)
self.update_meta_data({
"channel_names": copy.deepcopy(time_series[0][0].channel_names),
"sampling_frequency": time_series[0][0].sampling_frequency
})
#Store meta data
BaseDataset.store_meta_data(result_dir, self.meta_data)
    def set_window_defs(self, window_definition, nullmarker_stride_ms=1000,
no_overlap=False, data_consistency_check=False):
"""Code copied from StreamDataset for rewindowing data"""
self.window_definition = window_definition
self.nullmarker_stride_ms = nullmarker_stride_ms
self.no_overlap = no_overlap
self.data_consistency_check = data_consistency_check
self.stream_mode = True
class TimeSeriesClient(AbstractStreamReader):
"""TimeSeries stream client for TimeSeries"""
    def __init__(self, ts_stream, **kwargs):
self.callbacks = list()
self._markerids= {"null":0} # default marker
self._markerNames = {0:"null"} # dictionary with marker names
self.nmarkertypes = len(self.markerids) # number of different markers
        self._stdblocksize = None
        self._dSamplingInterval = None
        self._channelNames = None
        self.nChannels = None
self.ts_stream = ts_stream
self.blockcounter = 0
# create two different iterators,
# one for data reading, the other for
# peeking etc
(self.ts_stream_iter,self.backup_iter) = itertools.tee(iter(ts_stream))
@property
def dSamplingInterval(self):
return self._dSamplingInterval
@property
def stdblocksize(self):
return self._stdblocksize
@property
def markerids(self):
return self._markerids
@property
def channelNames(self):
return self._channelNames
@property
def markerNames(self):
return self._markerNames
    def regcallback(self, func):
"""Register callback function"""
self.callbacks.append(func)
    def connect(self):
"""connect and initialize client"""
try:
self._initialize(self.backup_iter.next())
except StopIteration as e:
print("timeseriesclient got no data: %s" % e)
            # if a StopIteration is caught right here, there
            # is no data contained for the current modality (train/test)
            # in this data stream.
pass
    def set_window_defs(self, window_definitions):
""" Set all markers at which the windows are cut"""
# extract start and endmarker
marker_id_index = self.nmarkertypes
self._markerNames[marker_id_index] = window_definitions[0].startmarker
self._markerids[window_definitions[0].startmarker] = marker_id_index
marker_id_index += 1
self._markerNames[marker_id_index] = window_definitions[0].endmarker
self._markerids[window_definitions[0].endmarker] = marker_id_index
marker_id_index += 1
# extract all other markers
for wdef in window_definitions:
if not self.markerids.has_key(wdef.markername):
self._markerNames[marker_id_index] = wdef.markername
self._markerids[wdef.markername] = marker_id_index
marker_id_index += 1
# Exclude definitions marker
for edef in wdef.excludedefs:
if not self.markerids.has_key(edef.markername):
self._markerNames[marker_id_index] = edef.markername
self._markerids[edef.markername] = marker_id_index
marker_id_index += 1
# Include definitions marker
for idef in wdef.includedefs:
if not self.markerids.has_key(idef.markername):
self._markerNames[marker_id_index] = idef.markername
self._markerids[idef.markername] = marker_id_index
marker_id_index += 1
self.nmarkertypes = len(self.markerNames.keys())
    def _initialize(self, item):
# get data part from (data,label) combination
block = item[0]
self.nChannels = block.shape[1]
self._stdblocksize = block.shape[0]
self._dSamplingInterval = block.sampling_frequency
self._channelNames = block.channel_names
    def read(self, nblocks=1, verbose=False):
"""Invoke registered callbacks for each incoming data block
returns number of read _data_ blocks"""
ret = 0
nread = 0
while ret != -1 and \
ret is not None and \
(nblocks == -1 or nread < nblocks):
ret = self._readmsg(verbose=False)
if ret is not None:
for f in self.callbacks:
f(self.ndsamples, self.ndmarkers)
nread += 1
return nread
    def _readmsg(self, msg_type='all', verbose=False):
""" Read time series object from given iterator
"""
        # the iter items are a combination of data and
        # dummy label -> extract the data part
        item = self.ts_stream_iter.next()
        block = item[0]
        self.blockcounter += 1
        if block is None:
            return None
        # no data message read until now
        # -> initialize property values from the first item
        if self.nChannels is None:
            self._initialize(item)
self.ndmarkers = numpy.zeros([self.stdblocksize], int)
self.ndmarkers.fill(-1)
if block.shape[0] < self.stdblocksize:
return 1
for (marker, positions) in block.marker_name.iteritems():
for position_as_ms in positions:
# position_as_samples = numpy.floor(position_as_ms / 1000.0 *
# self.dSamplingInterval)
position_as_samples = numpy.int(position_as_ms / 1000.0 *
self.dSamplingInterval)
# found a new marker, add it to marker name buffer
if marker == -1 or not self.markerids.has_key(marker):
self.nmarkertypes += 1
self.markerNames[self.nmarkertypes] = marker
self.markerids[marker] = self.nmarkertypes
if not self.markerids.has_key(marker):
continue
markerid = self.markerids[marker]
if self.ndmarkers[position_as_samples] == -1:
self.ndmarkers[position_as_samples] = markerid
elif position_as_samples < self.stdblocksize:
self.ndmarkers[position_as_samples] = markerid
else:
self.lostmarker = True
self.lostmarkertypedesc = markerid
self.readSize = (self.nChannels * self.stdblocksize)
self.ndsamples = numpy.array(block)
self.ndsamples.shape = (self.stdblocksize, self.nChannels)
self.ndsamples = scipy.transpose(self.ndsamples)
return 1