
""" Load and store data sets containing :mod:`Prediction Vectors<pySPACE.resources.data_types.prediction_vector>` """

import os
import cPickle
import numpy
import logging
import warnings

from pySPACE.resources.dataset_defs.base import BaseDataset
from pySPACE.tools.filesystem import get_author
from pySPACE.resources.data_types.prediction_vector import PredictionVector

class PredictionVectorDataset(BaseDataset):
    """ Prediction Vector dataset class

    The class at hand contains the methods needed to work with datasets
    consisting of
    :class:`~pySPACE.resources.data_types.prediction_vector.PredictionVector`
    objects.

    The following data formats are currently supported:

    - `*.csv` - with or without a header column
    - `*.pickle`

    TODO: Add functionality for the `*.arff` format.

    .. note::

        The implementation of the current dataset is adapted from the
        :class:`~pySPACE.resources.dataset_defs.feature_vector.FeatureVectorDataset`
        and the
        :class:`~pySPACE.resources.dataset_defs.time_series.TimeSeriesDataset`.
        For a more thorough documentation, we refer the reader to the two
        datasets mentioned above.

    **Parameters**

        :dataset_md:
            Dictionary containing meta data for the collection to be loaded.
            The most important of these parameters is the number of
            predictors, since it determines how the Prediction Vectors will
            be generated.

        :num_predictors:
            The number of predictors that each PredictionVector contains.
            This parameter is important for determining the dimensionality
            of the PredictionVector.

    **Special CSV Parameters**

        :delimiter:
            Needed only when dealing with `*.csv` files. Either the default
            `,` or the tabulator `\t` can be used. When storing, `,` is used.

            (*recommended, default: ','*)

        :label_column:
            Column containing the true label of the data point.
            Normally this column loses its heading.
            When saving the csv file, the default, -1, is used.

            (*recommended, default: -1*)

        :ignored_columns:
            List of numbers containing the numbers of irrelevant columns,
            e.g., `[1,2,8,42]`

            After the data is loaded, this parameter becomes obsolete.

            .. todo:: Enable and document eval syntax

            (*optional, default: []*)

        :ignored_rows:
            Replace *column* by *row* in the description of
            'ignored_columns'.

            (*optional, default: []*)

    :Author: Andrei Ignat (andrei_cristian.ignat@dfki.de)
    :Created: 2014/10/15
    """
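    # Hedged illustration (not part of the original source): csv-related
    # entries as they could appear in a collection's meta data and hence in
    # ``dataset_md``.  The key names follow the look-ups in ``__init__`` and
    # ``get_data`` below; the concrete values are invented.
    #
    #     type: prediction_vector
    #     storage_format: [csv, real]
    #     data_pattern: data_run/predictions_sp_tt.csv
    #     dataset_directory: /path/to/collection
    #     num_predictors: 2
    #     delimiter: ","
    #     ignored_columns: []
    #     ignored_rows: []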
    def __init__(self, dataset_md=None, num_predictors=1, **kwargs):
        """ Read out the data from the given collection """
        super(PredictionVectorDataset, self).__init__(dataset_md=dataset_md)
        # we want to remember the number of predictors
        if not dataset_md is None:
            # data has to be loaded
            self._log("Loading prediction vectors from input collection.")
            dataset_dir = self.meta_data["dataset_directory"]
            s_format = self.meta_data["storage_format"]
            if type(s_format) == list:
                s_format = s_format[0]
            # code adapted from the TimeSeriesDataset definitions
            if dataset_md.has_key("data_pattern") \
                    and not self.meta_data["train_test"] \
                    and self.meta_data["splits"] == 1 \
                    and self.meta_data["runs"] == 1:
                # The collection consists only of a single set of data, for
                # one run, one splitting, and only test data
                data = dataset_md["data_pattern"].replace("_run", "_run0") \
                    .replace("_sp", "_sp0") \
                    .replace("_tt", "_test")
                # File that contains the PredictionVector objects
                pv_file = os.path.join(dataset_dir, data)
                # Actual data will be loaded lazily
                self.data[(0, 0, "test")] = pv_file
            elif dataset_md.has_key("data_pattern"):
                for run_nr in range(self.meta_data["runs"]):
                    for split_nr in range(self.meta_data["splits"]):
                        for train_test in ["train", "test"]:
                            # One file per run, split and train/test part
                            data = dataset_md["data_pattern"] \
                                .replace("_run", "_run%s" % run_nr) \
                                .replace("_sp", "_sp%s" % split_nr) \
                                .replace("_tt", "_%s" % train_test)
                            # File that contains the PredictionVector objects
                            pv_file = os.path.join(dataset_dir, data)
                            # Actual data will be loaded lazily
                            self.data[(run_nr, split_nr, train_test)] = pv_file
            elif dataset_md.has_key("file_name"):
                pv_file = os.path.join(dataset_dir,
                                       self.meta_data["file_name"])
                self.data[(0, 0, "test")] = pv_file
            else:
                pass
        else:
            pass
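    # Illustration (not part of the original source): in the default
    # single-run, single-split, test-only case, a ``data_pattern`` such as
    # ``data_run/predictions_sp_tt.pickle`` is resolved by the replacements
    # above to the lazily loaded file
    # ``<dataset_directory>/data_run0/predictions_sp0_test.pickle``.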
    def add_sample(self, sample, label, train, split=0, run=0):
        """ Add a prediction vector to this collection """
        # we count the total number of predictors in the dataset
        # and update it whenever a new sample is added
        if not self.meta_data.has_key("num_predictors"):
            self.update_meta_data(
                {"num_predictors": numpy.size(sample.prediction)})
        elif self.meta_data["num_predictors"] != numpy.size(sample.prediction):
            warnings.warn("Inconsistent number of predictors for sample. "
                          "Expected %d and received %d predictors"
                          % (self.meta_data["num_predictors"],
                             numpy.size(sample.prediction)))
        try:
            # Remember all class labels since these will be stored
            if label not in self.meta_data["classes_names"]:
                self.meta_data["classes_names"].append(label)
        except KeyError:
            self.update_meta_data({"classes_names": [label]})
        super(PredictionVectorDataset, self).add_sample(sample, label, train,
                                                        split, run)
    def dump(self, result_path, name):
        """ Dumps this collection into a file """
        # Remove the predictor from the prediction vectors
        for values in self.data.itervalues():
            for (sample, label) in values:
                sample.predictor = None
        # Delegate to super class
        super(PredictionVectorDataset, self).dump(result_path, name)
    def get_data(self, run_nr, split_nr, train_test):
        """ Load the data from a prediction file """
        classes_names = self.meta_data["classes_names"]
        s_format = self.meta_data["storage_format"]
        if type(s_format) == list:
            s_format = s_format[0]
        # todo: automatic loading of csv?
        delimiter = self.meta_data.get("delimiter", ",")
        if not len(delimiter) == 1:
            self._log("Wrong delimiter ('%s') given. Using default ','."
                      % delimiter, level=logging.CRITICAL)
            delimiter = ","
        # Do lazy loading of the prediction vector objects.
        if isinstance(self.data[(run_nr, split_nr, train_test)], basestring):
            self._log("Lazy loading of %s prediction vectors from input "
                      "collection for run %s, split %s."
                      % (train_test, run_nr, split_nr))
            if s_format == "pickle":
                # Load the data from a pickled file
                pickle_file = open(self.data[(run_nr, split_nr, train_test)],
                                   "rb")
                self.data[(run_nr, split_nr, train_test)] = \
                    cPickle.load(pickle_file)
                pickle_file.close()
                sample = self.data[(run_nr, split_nr, train_test)][0][0]
                self.update_meta_data(
                    {"num_predictors": numpy.size(sample.prediction)})
            elif "csv" in s_format:
                # load file
                f = open(self.data[(run_nr, split_nr, train_test)])
                data_set = f.readlines()
                f.close()
                # getting rid of all unwanted rows
                if "ignored_rows" in self.meta_data:
                    ignored_rows = self.meta_data["ignored_rows"]
                    if not type(ignored_rows) == list:
                        warnings.warn("Wrong format: Ignored rows included!")
                        ignored_rows = []
                    ignored_rows.sort()
                    remove_list = []
                    for i in ignored_rows:
                        remove_list.append(data_set[int(i) - 1])
                    for j in remove_list:
                        data_set.remove(j)
                if s_format == "csv":
                    # the first line contains the column names
                    names = data_set[0].rstrip(",\n").split(delimiter)
                    data_set.pop(0)
                line = data_set[0].split(delimiter)
                line[-1] = line[-1].rstrip("\n\r")
                if line[-1] == '':
                    line.pop(-1)
                len_line = len(line)
                # get and prepare label column numbers (len_line needed)
                try:
                    true_label_column = self.meta_data["true_label_column"]
                except KeyError:
                    true_label_column = -1
                try:
                    num_predictors = self.meta_data["num_predictors"]
                except KeyError:
                    num_predictors = 1
                if self.meta_data.has_key("ignored_columns"):
                    ignored_columns = self.meta_data["ignored_columns"]
                    if not type(ignored_columns) == list:
                        warnings.warn(
                            "Wrong format: Ignored columns included!")
                        ignored_columns = []
                    new_ignored_columns = []
                    for i in ignored_columns:
                        i = int(i)
                        if i < 0:
                            i += len_line
                        if i > true_label_column:
                            i -= 1
                        new_ignored_columns.append(i)
                else:
                    new_ignored_columns = []
                new_ignored_columns.sort()
                # read the data line by line
                for line in data_set:
                    if not delimiter in line:
                        warnings.warn("Line without delimiter:\n%s"
                                      % str(line))
                        continue
                    line = line.split(delimiter)
                    line[-1] = line[-1].rstrip("\n\r")
                    true_label = line.pop(-1)
                    pred_labels = []
                    pred_scores = []
                    for i in range(num_predictors):
                        pred_scores.append(numpy.float64(line.pop()))
                        pred_labels.append(line.pop())
                    if true_label not in classes_names:
                        classes_names.append(true_label)
                    sample = PredictionVector(label=pred_labels,
                                              prediction=pred_scores)
                    self.add_sample(sample=sample, label=true_label,
                                    train=train_test, split=split_nr,
                                    run=run_nr)
                self.update_meta_data(
                    {"num_predictors": len(sample.prediction),
                     "classes_names": classes_names})
        return self.data[(run_nr, split_nr, train_test)]
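    # Illustration (not part of the original source): for two predictors the
    # csv layout written by ``store`` below, and read back column-wise from
    # the right by ``get_data`` above, looks roughly like this (the values
    # are made up):
    #
    #     Predicted 1 Label, Prediction 1 Score, Predicted 2 Label, Prediction 2 Score, True Label
    #     Target, 0.71, Target, 0.64, Target
    #     Standard, -0.32, Target, 0.05, Standard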
    def store(self, result_dir, s_format=["pickle", "real"]):
        """ store the collection in *result_dir* """
        name = "predictions"
        # Update the meta data
        author = get_author()
        self.update_meta_data({"type": "prediction_vector",
                               "storage_format": s_format,
                               "author": author,
                               "data_pattern": "data_run" + os.sep
                                               + name + "_sp_tt."
                                               + s_format[0]})
        # reduce a list specification like ["csv", "real"] to the file format
        if type(s_format) == list:
            s_format = s_format[0]
        if not s_format in ["csv", "arff", "pickle"]:
            self._log("Storage format not supported! Using default.",
                      level=logging.ERROR)
            s_format = "pickle"
        for key, prediction_vectors in self.data.iteritems():
            # Construct result directory
            result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)
            key_str = "_sp%s_%s" % key[1:]
            # Store data depending on the desired format
            if s_format == "pickle":
                result_file = open(os.path.join(result_path,
                                                name + key_str + ".pickle"),
                                   "wb")
                cPickle.dump(prediction_vectors, result_file,
                             cPickle.HIGHEST_PROTOCOL)
                result_file.close()
            elif s_format == "csv":
                # Write as Comma Separated Value
                result_file = open(os.path.join(result_path,
                                                name + key_str + ".csv"), "w")
                if self.meta_data["num_predictors"] == 1:
                    result_file.write(
                        "Predicted Label, Prediction Score, True Label \n")
                    for pv in prediction_vectors:
                        result_file.write("%s, %s, %s\n"
                                          % (pv[0].label[0],
                                             pv[0].prediction[0],
                                             pv[1]))
                else:
                    # we begin by dealing with the header of the csv file
                    base_header = "Predicted %(index)d Label, " \
                                  "Prediction %(index)d Score, "
                    base_result = "%(label)s, %(score)s,"
                    header = ""
                    for i in range(self.meta_data["num_predictors"]):
                        header += base_header % dict(index=i + 1)
                    header += "True Label\n"
                    result_file.write(header)
                    # and now we can write each of the prediction vectors
                    # in turn
                    for pv in prediction_vectors:
                        result = ""
                        for i in range(self.meta_data["num_predictors"]):
                            result += base_result % dict(
                                label=pv[0].label[i],
                                score=pv[0].prediction[i])
                        result += str(pv[1]) + "\n"
                        result_file.write(result)
                result_file.close()
        # Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)
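
if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: it only relies on
    # the methods defined above and assumes the dataset can be created without
    # meta data.  The output directory is a hypothetical path and must already
    # exist, since ``store`` only creates the per-run sub-directories in it.
    example_dir = "/tmp/pv_example"
    dataset = PredictionVectorDataset()
    for score in [0.7, -0.3, 1.2]:
        label = "Target" if score > 0 else "Standard"
        dataset.add_sample(
            sample=PredictionVector(label=[label], prediction=[score]),
            label=label, train=False)
    # store as csv; with one predictor this yields one file per
    # run/split/train-test combination, e.g. data_run0/predictions_sp0_test.csv
    dataset.store(example_dir, s_format=["csv", "real"])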