Source code for pySPACE.resources.dataset_defs.feature_vector

""" Load and store data sets containing :mod:`Feature Vectors<pySPACE.resources.data_types.feature_vector>`

"""

import os
import cPickle
import yaml
import numpy
import warnings

import logging

from pySPACE.resources.dataset_defs.base import BaseDataset
from pySPACE.resources.data_types.feature_vector import FeatureVector
from pySPACE.tools.filesystem import get_author

[docs]class FeatureVectorDataset(BaseDataset):
    """ Feature vector dataset class
    
    This class is most importantly
    for loading and storing
    :class:`~pySPACE.resources.data_types.feature_vector.FeatureVector`
    to the file system.
    You can load it using a :mod:`~pySPACE.missions.nodes.source.feature_vector_source` node.
    It can be saved, using a :mod:`~pySPACE.missions.nodes.sink.feature_vector_sink` node
    in a :class:`~pySPACE.missions.operations.node_chain.NodeChainOperation`
    
    The constructor expects the argument *dataset_md* that
    contains a dictionary with all the meta data.
    It is normally loaded from the metadata.yaml file.
    
    It is able to load csv-Files, arff-files and pickle files,
    where one file is always responsible for one training or test set.
    The name conventions are the same as described in
    :class:`~pySPACE.resources.dataset_defs.time_series.TimeSeriesDataset`.
    It is important that a metadata.yaml file exists, giving all
    the relevant information of the data set,
    especially the storage format, which can be
    *pickle*, *arff*, *csv* or *csvUnnamed*.
    The last format is only for loading data without heading,
    and with the labels being not in the last column.
    
    **pickle-files**
    
    See :class:`~pySPACE.resources.dataset_defs.time_series.TimeSeriesDataset`
    for name conventions (in the tutorial).
    
    
    **arff-files**
    
    http://weka.wikispaces.com/ARFF
    
    This format was introduced to connect pySPACE with weka.
    So when using weka, you need to choose this file format as the parameter
    storage format in the preprocessing operation's spec file.
    
    
    **CSV (comma separated values)**
    
    These are tables in a simple text format.
    Therefore each column is separated with a comma and each row with a new line.
    Normally the first line gives the feature names and one row is giving
    the class labels. Therefore several parameters need to be specified in
    the :ref:`metadata.yaml <metadata_yaml>` file.
    
    If no collection meta data is available for the input data, 
    the 'metadata.yaml' file can be generated with
    :mod:`~pySPACE.run.scripts.md_creator`.
    Please consider also some important parameters, described in the 
    get_data function.
    
    Preferably the labels are in the last column. This corresponds to
    *label_column* being -1 in the metadata.yaml file.

    **Special CSV Parameters**

        :label_column:
            Column containing the labels

            Normally this column looses its heading.
            when saving the csv file, the default, -1, is used.

            (*recommended, default: -1*)

        :ignored_columns:
            List of numbers containing the numbers of irrelevant columns,
            e.g., `[1,2,8,42]`

            After the data is loaded, this parameter becomes obsolete.

            .. todo:: Enable and document eval syntax

            (*optional, default: []*)

        :ignored_rows:
            Replace row in description of 'ignored_columns'

            (*optional, default: []*)

        :delimiter:
            Symbol which separates the csv entries

            Typically `,` is used or the tabulator `\t`.
            When storing, `,` is used.

            (*recommended, default: ','*)

    **Parameters**
    
        :dataset_md: dictionary containing meta data for the collection
                        to be loaded

        The following  3 Parameters contain standard information
        for a feature vector data set. 
        Normally they are not yet needed (used), because a dataset_md
        is given and real data is loaded,
        and so this information could be loaded from the data.
        Nevertheless these are important entries, which should be found
        in each dataset_md, giving information about the data set.

        :classes_names: list of the used class labels
        
        :feature_names: list of the feature names

            The feature names are either determined during the loading of the
            data, if available in the respective *storage_format*,
            or they are later on set with a default string (e.g.,
            feature_0_0.000sec).
        
        :num_features: number of the given features
    
    .. todo:: Better integration and documentation of the data_pattern variable, 
              e.g., when reading arff files.
    """
[docs]    def __init__(self, dataset_md=None, classes_names=[], feature_names=None,
                 num_features=None, **kwargs):
        """ Read out the data from the given collection
        
        .. todo:: test data pattern usage on old data
        
        .. note:: main loading concept copied from time series collection
                  check needed if code can be sent to upper class
        """
        super(FeatureVectorDataset, self).__init__(dataset_md=dataset_md)
        if not self.meta_data.has_key("feature_names"):
            self.update_meta_data({"feature_names":feature_names})
        if not self.meta_data.has_key("classes_names"):
            self.update_meta_data({"classes_names":classes_names})
        if not self.meta_data.has_key("num_features"):
            self.update_meta_data({"num_features":num_features})
        if not dataset_md is None: #data has to be loaded
            self._log("Loading feature vectors from input collection.")
            dataset_dir = self.meta_data["dataset_directory"]
            s_format = self.meta_data["storage_format"]
            if type(s_format) == list:
                s_format = s_format[0]
            # mainly code copy from time series data set defs
            if dataset_md.has_key("data_pattern") and not self.meta_data["train_test"] \
                   and self.meta_data["splits"] == 1 \
                   and self.meta_data["runs"] == 1 :
                # The collection consists only of a single set of data, for
                # one run, one splitting, and only test data
                data = dataset_md["data_pattern"].replace("_run", "_run0") \
                                                     .replace("_sp","_sp0") \
                                                     .replace("_tt","_test")
                # File that contains the time series objects
                fv_file = os.path.join(dataset_dir,data)
                
                # Actual data will be loaded lazily
                self.data[(0, 0, "test")] = fv_file
            elif dataset_md.has_key("data_pattern"):
                for run_nr in range(self.meta_data["runs"]):
                    for split_nr in range(self.meta_data["splits"]):
                        for train_test in ["train", "test"]:
                            # The collection consists only of a single set of data, for
                            # one run, one splitting, and only test data
                            data = dataset_md["data_pattern"].replace("_run", "_run%s" % run_nr) \
                                                                 .replace("_sp","_sp%s" % split_nr) \
                                                                 .replace("_tt","_%s" % train_test)
                            # File that contains the time series objects
                            fv_file = os.path.join(dataset_dir,data)
                            # Actual data will be loaded lazily
                            self.data[(run_nr, split_nr, train_test)] = fv_file
            elif dataset_md.has_key("file_name"):
                fv_file = os.path.join(dataset_dir,self.meta_data["file_name"])
                self.data[(0, 0, "test")] = fv_file
            else:
                pass
                ##TODO: What should we do? - Raise Error, because data is not defined?
                #raise NotImplementedError()
        else: #dataset_md == None # called when storing or initialize empty collection
            # We create a new, empty collection
            pass

[docs]    def add_sample(self, sample, label, train, split=0, run=0):
        """ Add a sample to this collection
        
        Adds the sample *sample* along with its class label *label*
        to this collection.
        
        **Parameters**
        
            :sample: The respective data sample
            :label:  The label of the data sample
            :train:  If *train*, this sample has already been used for training
            :split:  The number of the split this sample belongs to. \
                       
                     (*optional, default: 0*)
                       
            :run:    The run number this sample belongs to
          
                     (*optional, default: 0*)
        """
        if self.meta_data["num_features"] is None:
            self.update_meta_data({"num_features": sample.size})
        elif not sample.size == self.meta_data["num_features"]:
            self.update_meta_data({"num_features": sample.size})
            warnings.warn("Mismatching feature number: %i given but %i occured."
                          % (self.meta_data["num_features"], sample.size))
        try:
            # Remember all class labels since these will be stored
            # in the arff file meta data
            if label not in self.meta_data["classes_names"]:
                self.meta_data["classes_names"].append(label)
        except KeyError:
            self.update_meta_data({"classes_names": [label]})
        # Delegate to super class
        super(FeatureVectorDataset, self).add_sample(sample, label,
                                                     train, split, run)

[docs]    def dump(self, result_path, name):
        """ Dumps this collection into a file.
        
        Dumps (i.e. pickle) this collection object into a bz2 compressed file.
        In contrast to *store* this method stores the whole collection
        in a file. No meta data are stored in a YAML file etc.
        
        The method expects the following parameters:
         * *result_path* The path to the directory in which the pickle \
                         file will be written.
         * *name* The name of the pickle file
         
        """
        # Remove the feature names from the feature vectors since this leads to
        # unnecessary large sizes on the disk
        for values in self.data.itervalues():
            for (sample, label) in values:
                sample.feature_names = []
        
        # Delegate to super class
        super(FeatureVectorDataset, self).dump(result_path, name)

[docs]    def get_data(self, run_nr, split_nr, train_test): # feature_file, storage_format):
        """ Loads the data from the feature file of the current input collection 
        depending on the storage_format.
        Separates the actual vectors from the names and returns both as lists.
        
        The method expects the following
        
        **Parameters**
        
            :feature_file:      the file of feature vectors to be loaded
            :storage_format:    One of the first components in
                                ['arff', 'real'], ['csv', 'real'], 
                                ['csvUnnamed', 'real'] or .
                                Format in which the feature_file was saved.
                                Information need to be present in meta data.
        
        For arff and pickle files documentation see to the class description
        (docstring). Pickle format files do not need any special 
        loading because they
        already have the perfect format.
        
        **CSV**
        
        If no collection meta data is available for the input data, 
        the 'metadata.yaml' file can be generated with
        :mod:`pySPACE.run.node_chain_scripts.md_creator`.
        
        If you created the csv file with pySPACE, you automatically have the
        standard *csv* format with the feature names in the first row
        and the labels in the last column.
        
        If you have a csv tabular without headings,
        you have the *csvUnnamed* format,
        and in your 'label_column' column, specified in your spec file,
        the labels can be found.
        
        .. note:: main loading concept copied from time series collection
          check needed if code can be sent to upper class
        """
        ## init ##
        classes_names = self.meta_data["classes_names"]
        s_format = self.meta_data["storage_format"]
        if type(s_format) == list:
            s_format = s_format[0]

        # todo: automatical loading of csv?
        delimiter = self.meta_data.get("delimiter", ",")
        if not len(delimiter) == 1:
            self._log("Wrong delimiter ('%s') given. Using default ','." %
                      delimiter, level=logging.CRITICAL)
            delimiter = ","

        # Do lazy loading of the fv objects.
        if isinstance(self.data[(run_nr, split_nr, train_test)], basestring):
            self._log("Lazy loading of %s feature vectors from input "
                      "collection for run %s, split %s." % (train_test, run_nr, 
                                                            split_nr))
            if s_format == "pickle":
                # Load the data from a pickled file
                file = open(self.data[(run_nr, split_nr, train_test)], "r")
                self.data[(run_nr, split_nr, train_test)] = cPickle.load(file)
                file.close()
                sample = self.data[(run_nr, split_nr, train_test)][0][0]
                self.update_meta_data({"feature_names":sample.feature_names,
                                       "len_line":len(sample.feature_names)})
            elif s_format == "arff":
                names = []
                data = []
                # load file
                f = open(self.data[(run_nr, split_nr, train_test)])
                data_set = f.readlines()
                f.close()
                # Read the arff file completely ##
                for line in data_set:
                    if "@attribute class" in line \
                            or "@relation" in line \
                            or "@data" in line:
                        pass
                    elif "@attribute" in line:
                        name_line = line.split()
                        names.append(name_line[1])
                    else: 
                        data.append(line.split(delimiter))
                # the label is expected to be at the end 
                # of each line in the data.
                for line in data:
                    vector = line[0:-1]
                    label = line[-1].rstrip("\n\r ")  # --> label is string
                    if not label in classes_names:
                        classes_names.append(label)
                    sample = FeatureVector(numpy.atleast_2d([vector]).astype(
                        numpy.float64), feature_names=names)
                    self.add_sample(sample=sample, label=label, 
                                    train=train_test, split=split_nr, 
                                    run=run_nr)
                self.update_meta_data({"feature_names": sample.feature_names,
                       "len_line": len(sample.feature_names),
                       "classes_names": classes_names})
            elif "csv" in s_format: # csv or csv unnamed
                # load file
                f = open(self.data[(run_nr, split_nr, train_test)])
                data_set = f.readlines()
                f.close()
                # getting rid of all unwanted rows
                if "ignored_rows" in self.meta_data:
                    ignored_rows = self.meta_data["ignored_rows"]
                    if not type(ignored_rows) == list:
                        warnings.warn("Wrong format: Ignored rows included!")
                        ignored_rows = []
                    ignored_rows.sort()
                    remove_list = []
                    for i in ignored_rows:
                        remove_list.append(data_set[int(i)-1])
                    for j in remove_list:
                        data_set.remove(j)
                # get len_line and delete heading
                feature_names = self.meta_data["feature_names"]
                if s_format == "csv":
                    names = data_set[0].rstrip(",\n").split(delimiter)
                    data_set.pop(0)
                line = data_set[0].split(delimiter)
                line[-1] = line[-1].rstrip("\n\r")
                if line[-1] == '':
                        line.pop(-1)
                len_line = len(line)

                # get and prepare label column numbers (len_line needed)
                try:
                    label_column = self.meta_data["label_column"]
                except KeyError:
                    label_column = -1
                # map column numbers to indices by subtracting -1
                if type(label_column) == int:
                    label_columns = [label_column - 1]
                elif type(label_column) == list:
                    label_columns = [int(l)-1 for l in label_column]
                for i in range(len(label_columns)):
                    # map to positive value and undo previous offset
                    if label_columns[i] < 0:
                        label_columns[i] = label_columns[i] + len_line+1
                    if label_columns[i] < 0:
                        label_columns[i] = -1 + len_line
                # very important sorting for index shifts
                label_columns.sort()
                # calculate unwanted columns
                # note: These indices begin with 1 .
                # They are internally shifted when used.
                if self.meta_data.has_key("ignored_columns"):
                    ignored_columns = self.meta_data["ignored_columns"]
                    if not type(ignored_columns) == list:
                        warnings.warn("Wrong format: Ignored columns included!")
                        ignored_columns = []
                    new_ignored_columns = []
                    for i in ignored_columns:
                        i =  int(i)
                        if i < 0:
                            i += len_line
                        for label_column in label_columns:
                            if i > label_column:
                                i -= 1
                        new_ignored_columns.append(i)
                else:
                    new_ignored_columns = []
                new_ignored_columns.sort()
                # get all relevant feature_names
                if feature_names is None:
                    if s_format == "csv":
                        # delete blanks and inverted commas 
                        for i in range(len(names)):
                            names[i] = names[i].strip(' "')
                            names[i] = names[i].strip(" '")
                        feature_names = names
                    else: #s_format=="csv_unnamed"
                        feature_names = ["feature_%s" % i for i in
                                         range(len_line)]
                    # switch label names to the end
                    i = 0 # reduce index, after previous labels were deleted
                    for label_column in label_columns:
                        try:
                            feature_names.append(feature_names[label_column-i])
                            del feature_names[label_column-i]
                            i += 1
                        except IndexError:
                            feature_names.append("")
                    # create new feature names
                    feature_names = [item for index, item in enumerate(
                        feature_names) if not index+1 in new_ignored_columns]
                    for _ in label_columns:
                        feature_names.pop(-1)

                # read the data line by line
                for line in data_set:
                    if not delimiter in line:
                        warnings.warn("Line without delimiter:\n%s" % str(line))
                        continue
                    line = line.split(delimiter)
                    line[-1] = line[-1].rstrip("\n\r")
                    if line[-1] == '':
                        line.pop(-1)
                    label = []
                    i = 0
                    for label_column in label_columns:
                        label.append(line.pop(label_column-i))
                        i += 1
                    if type(label) == str:
                        label = label.strip(' "')
                        label = label.strip(" '")
                    if label not in classes_names:
                        classes_names.append(label)
                    # create new line without the ignored columns
                    vector = [item for index,item in enumerate(line) if not
                              index+1 in new_ignored_columns]
                    sample = FeatureVector(numpy.atleast_2d([vector]).astype(
                        numpy.float64), feature_names=feature_names)
                    if len(label) == 1:
                        label = label[-1]
                    self.add_sample(sample=sample, label=label, 
                                    train=train_test, split=split_nr, 
                                    run=run_nr)
                self.update_meta_data({"feature_names": sample.feature_names,
                       "num_features": len(sample.feature_names),
                       "classes_names": classes_names})

        return self.data[(run_nr, split_nr, train_test)]

[docs]    def store(self, result_dir, s_format = ["pickle", "real"]):
        """ Stores this collection in the directory *result_dir*.
        
        In contrast to *dump* this method stores the collection
        not in a single file but as a whole directory structure with meta
        information etc. The data sets are stored separately for each run, 
        split, train/test combination.
        
        The method expects the following parameters:
          * *result_dir* The directory in which the collection will be stored
          * *name* The prefix of the file names in which the individual \
                   data sets are stored. The actual file names are determined \
                   by appending suffixes that encode run, split, train/test \
                   information. Defaults to "features".
          * *format* A list with information about the format in which the 
                    actual data sets should be stored. The first entry specifies
                    the file format. If it is "arff" the second entry specifies the
                    attribute format. 
                    
                    Examples: ["arff", "real"], ["arff", "{0,1}"]
                    
                    .. todo:: Someone could implement the format ["fasta"] for sax features
                    
                    To store the data in comma separated values, use ["csv", "real"].
                    
                    (*optional, default: ["pickle", "real"]*)

        .. todo:: Adapt storing of csv file to external library instead of
                  doing it manually.

        """
        name = "features"
        # Update the meta data
        author = get_author()
        self.update_meta_data({"type": "feature_vector",
                               "storage_format": s_format,
                               "author": author,
                               "data_pattern": "data_run" + os.sep 
                                                 + name + "_sp_tt." + s_format[0]})
        
        if type(s_format) == list:
            s_type = s_format[1]
            s_format = s_format[0]
        else:
            s_type = "real"
            
        if not s_format in ["csv", "arff", "pickle"]:
            self._log("Storage format not supported! Using default.", 
                      level=logging.ERROR)
            s_format = "pickle"
        
        # Iterate through splits and runs in this dataset
        for key, feature_vectors in self.data.iteritems():
            # test if dataset has already been loaded.
            # Otherwise replace with iterator to loaded version.
            if isinstance(feature_vectors, basestring):
                feature_vectors = self.get_data(key[0], key[1], key[2])

            # Construct result directory
            result_path = result_dir + os.sep + "data" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)
                
            key_str = "_sp%s_%s" % key[1:]
            # Store data depending on the desired format
            if s_format == "pickle":
                result_file = open(os.path.join(result_path, 
                                                name + key_str + ".pickle"),
                                   "w")
         
                cPickle.dump(feature_vectors, result_file, cPickle.HIGHEST_PROTOCOL)
            elif s_format == "arff": # Write as ARFF
                result_file = open(os.path.join(result_path, 
                                                name + key_str + ".arff"),"w")
                # Create the arff file header
                relation_name = result_dir.split(os.sep)[-1]
                result_file.write('@relation "%s"\n' % relation_name)
                # Write the type of all features
                for feature_name in self.meta_data["feature_names"]:
                    result_file.write("@attribute %s %s\n" % (feature_name,  s_type))
                classString = "" + ",".join(sorted(self.meta_data["classes_names"])) + ""

                result_file.write("@attribute class {%s}\n" % classString)
                
                result_file.write("@data\n")
                # Write all given training data into the ARFF file
                fv = feature_vectors[0][0]
                if numpy.issubdtype(fv.dtype, numpy.string_):
                    feature_format = "%s,"
                elif numpy.issubdtype(fv.dtype, numpy.floating):
                    feature_format = "%f,"
                elif numpy.issubdtype(fv.dtype, numpy.integer):
                    feature_format = "%d,"
                for features, class_name in feature_vectors:
                    for feature in features[0]:
                        result_file.write(feature_format % feature)
                    result_file.write("%s\n" % str(class_name))
            elif s_format == "csv": # Write as Comma Separated Value
                result_file = open(os.path.join(result_path, 
                                                name + key_str + ".csv"),"w")
                for feature_name in self.meta_data["feature_names"]:
                    result_file.write("%s," % (feature_name))
                result_file.write("\n")
                fv = feature_vectors[0][0]
                if numpy.issubdtype(fv.dtype, numpy.floating):
                    feature_format = "%f,"
                elif numpy.issubdtype(fv.dtype, numpy.integer):
                    feature_format = "%d,"
                else:
                    feature_format = "%s,"
                for features, class_name in feature_vectors:
                    f = features.view(numpy.ndarray)
                    for feature in f[0]:
                        result_file.write(feature_format % feature)
                    result_file.write("%s\n" % str(class_name))
            result_file.close()

        #Store meta data
        BaseDataset.store_meta_data(result_dir,self.meta_data)