""" Load and store data sets containing :mod:`Prediction Vectors<pySPACE.resources.data_types.prediction_vector>` """
import os
import cPickle
import numpy
import logging
import warnings
from pySPACE.resources.dataset_defs.base import BaseDataset
from pySPACE.tools.filesystem import get_author
from pySPACE.resources.data_types.prediction_vector import PredictionVector
class PredictionVectorDataset(BaseDataset):
""" Prediction Vector dataset class
The class at hand contains the methods needed to work with datasets
consisting of :class:`~pySPACE.resources.data_types.prediction_vector.PredictionVector` objects.
The following data formats are currently supported:
- `*.csv` - with or without a header row
- `*.pickle`
TODO: Add functionality for the `*.arff` format.
.. note::
The implementation of the current dataset is adapted from the
:class:`~pySPACE.resources.dataset_defs.feature_vector.FeatureVectorDataset` and
:class:`~pySPACE.resources.dataset_defs.time_series.TimeSeriesDataset`.
For a more thorough documentation, we refer the reader to the two
dataset classes mentioned above.
**Parameters**
:dataset_md:
Dictionary containing meta data for the collection to be loaded.
Out of these parameters, the most important one is the number of
predictors, since it determines how the Prediction Vectors will be
generated (see the exemplary usage below).
:num_predictors:
The number of predictors that each PredictionVector contains.
This parameter is important for determining the dimensionality of
the PredictionVector.
**Special CSV Parameters**
:delimiter:
Needed only when dealing with `*.csv` files. By default, `,` is
assumed as delimiter; the tabulator `\t` can be specified as well.
When storing, `,` is always used.
(*recommended, default: ','*)
:label_column:
Column containing the true label of the data point.
Normally this column loses its heading.
When saving the csv file, the default, -1 (i.e., the last column), is used.
(*recommended, default: -1*)
:ignored_columns:
List containing the numbers of the irrelevant columns,
e.g., `[1,2,8,42]`.
After the data is loaded, this parameter becomes obsolete.
.. todo:: Enable and document eval syntax
(*optional, default: []*)
:ignored_rows:
Analogous to 'ignored_columns', but for row numbers instead of
column numbers.
(*optional, default: []*)
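**Exemplary Usage**

A minimal sketch of building and storing a small collection by hand.
The labels, the score and the target directory are purely exemplary,
and the target directory is assumed to exist already::

    from pySPACE.resources.data_types.prediction_vector import PredictionVector

    pv_dataset = PredictionVectorDataset(num_predictors=1)
    pv_dataset.add_sample(
        sample=PredictionVector(label=["Target"], prediction=[0.8]),
        label="Target", train="test")
    pv_dataset.store("/tmp/pv_collection", s_format=["csv"])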
:Author: Andrei Ignat (andrei_cristian.ignat@dfki.de)
:Created: 2014/10/15
"""
def __init__(self, dataset_md=None, num_predictors=1, **kwargs):
""" Read out the data from the given collection """
super(PredictionVectorDataset, self).__init__(dataset_md=dataset_md)
# we want to remember the number of predictions
if dataset_md is not None:  # data has to be loaded
self._log("Loading prediction vectors from input collection.")
dataset_dir = self.meta_data["dataset_directory"]
s_format = self.meta_data["storage_format"]
if type(s_format) == list:
s_format = s_format[0]
# code copied from TimeSeriesDataset defs
if dataset_md.has_key("data_pattern") \
and not self.meta_data["train_test"] \
and self.meta_data["splits"] == 1 \
and self.meta_data["runs"] == 1:
# The collection consists only of a single set of data, for
# one run, one splitting, and only test data
data = dataset_md["data_pattern"].replace("_run", "_run0")\
.replace("_sp", "_sp0")\
.replace("_tt", "_test")
# File that contains the PredictionVector objects
pv_file = os.path.join(dataset_dir,data)
# Actual data will be loaded lazily
self.data[(0, 0, "test")] = pv_file
elif dataset_md.has_key("data_pattern"):
for run_nr in range(self.meta_data["runs"]):
for split_nr in range(self.meta_data["splits"]):
for train_test in ["train", "test"]:
# The collection consists only of a single set of
# data, for one run, one splitting, and only test
# data
data = dataset_md["data_pattern"]\
.replace("_run", "_run%s" % run_nr)\
.replace("_sp","_sp%s" % split_nr)\
.replace("_tt","_%s" % train_test)
# File that contains the PredictionVector objects
pv_file = os.path.join(dataset_dir, data)
# Actual data will be loaded lazily
self.data[(run_nr, split_nr, train_test)] = pv_file
elif dataset_md.has_key("file_name"):
pv_file = os.path.join(dataset_dir, self.meta_data["file_name"])
self.data[(0, 0, "test")] = pv_file
else:
pass
else:
pass
def add_sample(self, sample, label, train, split=0, run=0):
""" Add a prediction vector to this collection """
# we count the total number of predictors in the dataset
# and update it whenever a new sample is added
if not self.meta_data.has_key("num_predictors"):
self.update_meta_data({"num_predictors": numpy.size(sample.prediction)})
elif self.meta_data["num_predictors"]!=numpy.size(sample.prediction):
warnings.warn("Inconsistent number of predictors for sample."
"Expected %d and received %d predictors" %
(self.meta_data["num_predictors"],
numpy.size(sample.prediction)))
try:
# Remember all class labels since these will be stored
if label not in self.meta_data["classes_names"]:
self.meta_data["classes_names"].append(label)
except KeyError:
self.update_meta_data({"classes_names": [label]})
super(PredictionVectorDataset, self).add_sample(sample, label,
train, split, run)
def dump(self, result_path, name):
""" Dumps this collection into a file """
# Remove the predictor from the prediction vectors
for values in self.data.itervalues():
for (sample, label) in values:
sample.predictor = None
# Delegate to super class
super(PredictionVectorDataset, self).dump(result_path, name)
def get_data(self, run_nr, split_nr, train_test):
""" Load the data from a prediction file """
classes_names = self.meta_data["classes_names"]
s_format = self.meta_data["storage_format"]
if type(s_format) == list:
s_format = s_format[0]
# TODO: automatic loading of csv?
delimiter = self.meta_data.get("delimiter", ",")
if not len(delimiter) == 1:
self._log("Wrong delimiter ('%s') given. Using default ','." %
delimiter, level=logging.CRITICAL)
delimiter = ","
# Do lazy loading of the prediction vector objects.
if isinstance(self.data[(run_nr, split_nr, train_test)], basestring):
self._log("Lazy loading of %s prediction vectors from input "
"collection for run %s, split %s." % (train_test, run_nr,
split_nr))
if s_format == "pickle":
# Load the data from a pickled file
pv_file = open(self.data[(run_nr, split_nr, train_test)], "rb")
self.data[(run_nr, split_nr, train_test)] = cPickle.load(pv_file)
pv_file.close()
sample = self.data[(run_nr, split_nr, train_test)][0][0]
self.update_meta_data({"num_predictors":numpy.size(sample.prediction)})
elif "csv" in s_format:
# load file
f = open(self.data[(run_nr, split_nr, train_test)])
data_set = f.readlines()
f.close()
# getting rid of all unwanted rows
if "ignored_rows" in self.meta_data:
ignored_rows = self.meta_data["ignored_rows"]
if not type(ignored_rows) == list:
warnings.warn("Wrong format: Ignored rows included!")
ignored_rows = []
ignored_rows.sort()
remove_list = []
for i in ignored_rows:
remove_list.append(data_set[int(i)-1])
for j in remove_list:
data_set.remove(j)
if s_format == "csv":
names = data_set[0].rstrip(",\n").split(delimiter)
data_set.pop(0)
line = data_set[0].split(delimiter)
line[-1] = line[-1].rstrip("\n\r")
if line[-1] == '':
line.pop(-1)
len_line = len(line)
# get and prepare label column numbers (len_line needed)
try:
true_label_column = self.meta_data["true_label_column"]
except KeyError:
true_label_column = -1
try:
num_predictors = self.meta_data["num_predictors"]
except KeyError:
num_predictors = 1
if self.meta_data.has_key("ignored_columns"):
ignored_columns = self.meta_data["ignored_columns"]
if not type(ignored_columns) == list:
warnings.warn("Wrong format: Ignored columns included!")
ignored_columns = []
new_ignored_columns = []
for i in ignored_columns:
i = int(i)
if i < 0:
i += len_line
if i > true_label_column:
i -= 1
new_ignored_columns.append(i)
else:
new_ignored_columns = []
new_ignored_columns.sort()
# read the data line by line
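# Each remaining line is expected to follow the layout written by
# ``store`` in csv mode: one "<predicted label>, <score>" pair per
# predictor, followed by the true label in the last column.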
for line in data_set:
if delimiter not in line:
warnings.warn("Line without delimiter:\n%s" % str(line))
continue
line = line.split(delimiter)
line[-1] = line[-1].rstrip("\n\r")
true_label = line.pop(-1).strip()
pred_labels = []
pred_scores = []
for i in range(num_predictors):
pred_scores.append(numpy.float64(line.pop()))
pred_labels.append(line.pop().strip())
if true_label not in classes_names:
classes_names.append(true_label)
sample = PredictionVector(label=pred_labels,
prediction=pred_scores)
self.add_sample(sample=sample, label=true_label,
train=train_test, split=split_nr,
run=run_nr)
self.update_meta_data({"num_predictors": len(sample.prediction),
"classes_names": classes_names})
return self.data[(run_nr, split_nr, train_test)]
def store(self, result_dir, s_format=["pickle", "real"]):
""" store the collection in *result_dir*"""
name = "predictions"
# Update the meta data
author = get_author()
self.update_meta_data({"type": "prediction_vector",
"storage_format": s_format,
"author": author,
"data_pattern": "data_run" + os.sep
+ name + "_sp_tt." + s_format[0]})
if not s_format in ["csv", "arff", "pickle"]:
self._log("Storage format not supported! Using default.",
level=logging.ERROR)
s_format = "pickle"
for key, prediction_vectors in self.data.iteritems():
# Construct result directory
result_path = result_dir + os.sep + "data" \
+ "_run%s" % key[0]
if not os.path.exists(result_path):
os.mkdir(result_path)
key_str = "_sp%s_%s" % key[1:]
# Store data depending on the desired format
if s_format == "pickle":
result_file = open(os.path.join(result_path,
name + key_str + ".pickle"),
"w")
cPickle.dump(prediction_vectors, result_file, cPickle.HIGHEST_PROTOCOL)
elif s_format == "csv": # Write as Comma Separated Value
result_file = open(os.path.join(result_path,
name + key_str + ".csv"),"w")
if self.meta_data["num_predictors"] == 1:
result_file.write("Predicted Label, Prediction Score, True Label \n")
for pv in prediction_vectors:
result_file.write("%s, %s, %s\n" % (pv[0].label[0], pv[0].prediction[0], pv[1]))
else:
# we begin by dealing with the header of the csv file
base_header = "Predicted %(index)d Label, Prediction %(index)d Score, "
base_result = "%(label)s, %(score)s,"
header = ""
for i in range(self.meta_data["num_predictors"]):
header+= base_header % dict(index=i+1)
header += "True Label\n"
result_file.write(header)
# and now we can write each of the prediction vectors in turn
for pv in prediction_vectors:
result = ""
for i in range(self.meta_data["num_predictors"]):
result += base_result % dict(label=pv[0].label[i],
score=pv[0].prediction[i])
result += str(pv[1]) + "\n"
result_file.write(result)
#Store meta data
BaseDataset.store_meta_data(result_dir,self.meta_data)