""" Load and store data sets containing :mod:`Feature Vectors<pySPACE.resources.data_types.feature_vector>`
"""
import os
import cPickle
import yaml
import numpy
import warnings
import logging
from pySPACE.resources.dataset_defs.base import BaseDataset
from pySPACE.resources.data_types.feature_vector import FeatureVector
from pySPACE.tools.filesystem import get_author
class FeatureVectorDataset(BaseDataset):
""" Feature vector dataset class
This class is mainly responsible for loading and storing
:class:`~pySPACE.resources.data_types.feature_vector.FeatureVector`
objects to and from the file system.
You can load such a dataset using a :mod:`~pySPACE.missions.nodes.source.feature_vector_source` node.
It can be saved using a :mod:`~pySPACE.missions.nodes.sink.feature_vector_sink` node
in a :class:`~pySPACE.missions.operations.node_chain.NodeChainOperation`.
The constructor expects the argument *dataset_md*, which
contains a dictionary with all the meta data.
It is normally loaded from the metadata.yaml file.
The class is able to load csv, arff, and pickle files,
where one file always holds one training or test set.
The naming conventions are the same as described in
:class:`~pySPACE.resources.dataset_defs.time_series.TimeSeriesDataset`.
It is important that a metadata.yaml file exists, which provides all
relevant information on the data set,
especially the storage format, which can be
*pickle*, *arff*, *csv* or *csvUnnamed*.
The last format is only for loading data without a header row
and with labels that are not in the last column.
**pickle-files**
See :class:`~pySPACE.resources.dataset_defs.time_series.TimeSeriesDataset`
for name conventions (in the tutorial).
**arff-files**
http://weka.wikispaces.com/ARFF
This format was introduced to connect pySPACE with weka.
So when using weka, you need to choose this file format via the
*storage_format* parameter in the preprocessing operation's spec file.
**CSV (comma separated values)**
These are tables in a simple text format:
each column is separated by a comma and each row by a new line.
Normally the first line gives the feature names and one column contains
the class labels. Hence, several parameters need to be specified in
the :ref:`metadata.yaml <metadata_yaml>` file.
If no collection meta data is available for the input data,
the 'metadata.yaml' file can be generated with
:mod:`~pySPACE.run.scripts.md_creator`.
Please also consider the important parameters described in the
get_data method.
Preferably the labels are in the last column, which corresponds to
*label_column* being -1 in the metadata.yaml file.
**Special CSV Parameters**
:label_column:
Column containing the labels.
Normally this column loses its heading.
When saving the csv file, the default, -1, is used.
(*recommended, default: -1*)
:ignored_columns:
List of the numbers of irrelevant columns,
e.g., `[1,2,8,42]`.
After the data is loaded, this parameter becomes obsolete.
.. todo:: Enable and document eval syntax
(*optional, default: []*)
:ignored_rows:
Same as 'ignored_columns', but for row numbers.
(*optional, default: []*)
:delimiter:
Symbol which separates the csv entries.
Typically `,` or the tabulator `\t` is used.
When storing, `,` is used.
(*recommended, default: ','*)
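For illustration, a hedged sketch of how the CSV-related entries could
be written to a 'metadata.yaml' file from Python; the *type* and
*storage_format* entries are assumptions about a typical CSV
collection, and the concrete values are arbitrary::

    import yaml

    meta = {"type": "feature_vector",
            "storage_format": ["csv", "real"],
            "label_column": -1,
            "ignored_columns": [1, 2],
            "ignored_rows": [],
            "delimiter": ","}
    with open("metadata.yaml", "w") as md_file:
        # write the entries as a readable block-style YAML file
        yaml.dump(meta, md_file, default_flow_style=False)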
**Parameters**
:dataset_md: dictionary containing meta data for the collection
to be loaded
The following three parameters contain standard information
for a feature vector data set.
Normally they are not needed, because a *dataset_md*
is given and real data is loaded,
so this information can be derived from the data.
Nevertheless, these are important entries that should be found
in each *dataset_md*, since they describe the data set.
:classes_names: list of the used class labels
:feature_names: list of the feature names
The feature names are either determined during the loading of the
data, if available in the respective *storage_format*,
or they are set later on to a default string (e.g.,
feature_0_0.000sec).
:num_features: number of the given features
.. todo:: Better integration and documentation of the data_pattern variable,
e.g., when reading arff files.
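A hedged usage sketch for loading an existing collection; it assumes
that *dataset_md* already contains the usual entries such as
*dataset_directory*, *storage_format*, and a *file_name* or
*data_pattern*, and that the module path of this class is analogous
to the time series dataset::

    import yaml
    from pySPACE.resources.dataset_defs.feature_vector import \
        FeatureVectorDataset

    with open("metadata.yaml", "r") as md_file:
        dataset_md = yaml.load(md_file)
    dataset = FeatureVectorDataset(dataset_md=dataset_md)
    # data files are only registered here; loading happens in get_data
    samples = dataset.get_data(run_nr=0, split_nr=0, train_test="test")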
"""
def __init__(self, dataset_md=None, classes_names=None, feature_names=None,
num_features=None, **kwargs):
""" Read out the data from the given collection
.. todo:: test data pattern usage on old data
.. note:: main loading concept copied from time series collection
check needed if code can be sent to upper class
"""
super(FeatureVectorDataset, self).__init__(dataset_md=dataset_md)
if not self.meta_data.has_key("feature_names"):
self.update_meta_data({"feature_names":feature_names})
if not self.meta_data.has_key("classes_names"):
self.update_meta_data({"classes_names":classes_names})
if not self.meta_data.has_key("num_features"):
self.update_meta_data({"num_features":num_features})
if dataset_md is not None:  # data has to be loaded
self._log("Loading feature vectors from input collection.")
dataset_dir = self.meta_data["dataset_directory"]
s_format = self.meta_data["storage_format"]
if type(s_format) == list:
s_format = s_format[0]
# mainly code copy from time series data set defs
if dataset_md.has_key("data_pattern") and not self.meta_data["train_test"] \
and self.meta_data["splits"] == 1 \
and self.meta_data["runs"] == 1 :
# The collection consists only of a single set of data, for
# one run, one splitting, and only test data
data = dataset_md["data_pattern"].replace("_run", "_run0") \
.replace("_sp","_sp0") \
.replace("_tt","_test")
# File that contains the time series objects
fv_file = os.path.join(dataset_dir,data)
# Actual data will be loaded lazily
self.data[(0, 0, "test")] = fv_file
elif dataset_md.has_key("data_pattern"):
for run_nr in range(self.meta_data["runs"]):
for split_nr in range(self.meta_data["splits"]):
for train_test in ["train", "test"]:
# The collection contains data for multiple runs and/or
# splits, for both training and test data
data = dataset_md["data_pattern"].replace("_run", "_run%s" % run_nr) \
.replace("_sp","_sp%s" % split_nr) \
.replace("_tt","_%s" % train_test)
# File that contains the time series objects
fv_file = os.path.join(dataset_dir,data)
# Actual data will be loaded lazily
self.data[(run_nr, split_nr, train_test)] = fv_file
elif dataset_md.has_key("file_name"):
fv_file = os.path.join(dataset_dir,self.meta_data["file_name"])
self.data[(0, 0, "test")] = fv_file
else:
pass
##TODO: What should we do? - Raise Error, because data is not defined?
#raise NotImplementedError()
else:  # dataset_md is None: called when storing or initializing an empty collection
# We create a new, empty collection
pass
def add_sample(self, sample, label, train, split=0, run=0):
""" Add a sample to this collection
Adds the sample *sample* along with its class label *label*
to this collection.
**Parameters**
:sample: The respective data sample
:label: The label of the data sample
:train: If *train* is True, this sample has already been used for training
:split: The number of the split this sample belongs to. \
(*optional, default: 0*)
:run: The run number this sample belongs to
(*optional, default: 0*)
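A hedged sketch of adding one training sample to an existing
dataset instance; the feature values and the label string are
arbitrary::

    import numpy
    from pySPACE.resources.data_types.feature_vector import FeatureVector

    fv = FeatureVector(numpy.array([[0.5, 1.5]]),
                       feature_names=["f_1", "f_2"])
    dataset.add_sample(sample=fv, label="Target", train=True)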
"""
if self.meta_data["num_features"] is None:
self.update_meta_data({"num_features": sample.size})
elif not sample.size == self.meta_data["num_features"]:
self.update_meta_data({"num_features": sample.size})
warnings.warn("Mismatching feature number: %i given but %i occured."
% (self.meta_data["num_features"], sample.size))
try:
# Remember all class labels since these will be stored
# in the arff file meta data
if label not in self.meta_data["classes_names"]:
self.meta_data["classes_names"].append(label)
except KeyError:
self.update_meta_data({"classes_names": [label]})
# Delegate to super class
super(FeatureVectorDataset, self).add_sample(sample, label,
train, split, run)
def dump(self, result_path, name):
""" Dumps this collection into a file.
Dumps (i.e. pickle) this collection object into a bz2 compressed file.
In contrast to *store* this method stores the whole collection
in a file. No meta data are stored in a YAML file etc.
The method expects the following parameters:
* *result_path* The path to the directory in which the pickle \
file will be written.
* *name* The name of the pickle file
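A hedged example; the path and the file name are arbitrary::

    dataset.dump(result_path="/tmp/results", name="my_features")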
"""
# Remove the feature names from the feature vectors since keeping them
# leads to unnecessarily large files on disk
for values in self.data.itervalues():
for (sample, label) in values:
sample.feature_names = []
# Delegate to super class
super(FeatureVectorDataset, self).dump(result_path, name)
def get_data(self, run_nr, split_nr, train_test): # feature_file, storage_format):
""" Loads the data from the feature file of the current input collection
depending on the storage_format.
Separates the actual vectors from the names and returns both as lists.
The method expects the following
**Parameters**
:feature_file: the file of feature vectors to be loaded
:storage_format: One of the first components in
['arff', 'real'], ['csv', 'real'],
['csvUnnamed', 'real'] or .
Format in which the feature_file was saved.
Information need to be present in meta data.
For arff and pickle files documentation see to the class description
(docstring). Pickle format files do not need any special
loading because they
already have the perfect format.
**CSV**
If no collection meta data is available for the input data,
the 'metadata.yaml' file can be generated with
:mod:`pySPACE.run.node_chain_scripts.md_creator`.
If you created the csv file with pySPACE, you automatically have the
standard *csv* format with the feature names in the first row
and the labels in the last column.
If you have a csv table without headings,
you have the *csvUnnamed* format,
and the labels are found in the column specified by
'label_column' in your spec file.
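A hedged usage sketch, where *dataset* is an already constructed
instance of this class::

    samples = dataset.get_data(run_nr=0, split_nr=0, train_test="test")
    for feature_vector, label in samples:
        print label, feature_vector.feature_names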
.. note:: main loading concept copied from time series collection
check needed if code can be sent to upper class
"""
## init ##
classes_names = self.meta_data["classes_names"]
s_format = self.meta_data["storage_format"]
if type(s_format) == list:
s_format = s_format[0]
# todo: automatic loading of csv?
delimiter = self.meta_data.get("delimiter", ",")
if not len(delimiter) == 1:
self._log("Wrong delimiter ('%s') given. Using default ','." %
delimiter, level=logging.CRITICAL)
delimiter = ","
# Do lazy loading of the fv objects.
if isinstance(self.data[(run_nr, split_nr, train_test)], basestring):
self._log("Lazy loading of %s feature vectors from input "
"collection for run %s, split %s." % (train_test, run_nr,
split_nr))
if s_format == "pickle":
# Load the data from a pickled file
pickle_file = open(self.data[(run_nr, split_nr, train_test)], "rb")
self.data[(run_nr, split_nr, train_test)] = cPickle.load(pickle_file)
pickle_file.close()
sample = self.data[(run_nr, split_nr, train_test)][0][0]
self.update_meta_data({"feature_names":sample.feature_names,
"len_line":len(sample.feature_names)})
elif s_format == "arff":
names = []
data = []
# load file
f = open(self.data[(run_nr, split_nr, train_test)])
data_set = f.readlines()
f.close()
# Read the arff file completely ##
for line in data_set:
if "@attribute class" in line \
or "@relation" in line \
or "@data" in line:
pass
elif "@attribute" in line:
name_line = line.split()
names.append(name_line[1])
else:
data.append(line.split(delimiter))
# the label is expected to be at the end
# of each line in the data.
for line in data:
vector = line[0:-1]
label = line[-1].rstrip("\n\r ") # --> label is string
if not label in classes_names:
classes_names.append(label)
sample = FeatureVector(numpy.atleast_2d([vector]).astype(
numpy.float64), feature_names=names)
self.add_sample(sample=sample, label=label,
train=train_test, split=split_nr,
run=run_nr)
self.update_meta_data({"feature_names": sample.feature_names,
"len_line": len(sample.feature_names),
"classes_names": classes_names})
elif "csv" in s_format: # csv or csv unnamed
# load file
f = open(self.data[(run_nr, split_nr, train_test)])
data_set = f.readlines()
f.close()
# getting rid of all unwanted rows
if "ignored_rows" in self.meta_data:
ignored_rows = self.meta_data["ignored_rows"]
if not type(ignored_rows) == list:
warnings.warn("Wrong format of 'ignored_rows' parameter. "
"No rows will be ignored!")
ignored_rows = []
ignored_rows.sort()
remove_list = []
for i in ignored_rows:
remove_list.append(data_set[int(i)-1])
for j in remove_list:
data_set.remove(j)
# get len_line and delete heading
feature_names = self.meta_data["feature_names"]
if s_format == "csv":
names = data_set[0].rstrip(",\n").split(delimiter)
data_set.pop(0)
line = data_set[0].split(delimiter)
line[-1] = line[-1].rstrip("\n\r")
if line[-1] == '':
line.pop(-1)
len_line = len(line)
# get and prepare label column numbers (len_line needed)
try:
label_column = self.meta_data["label_column"]
except KeyError:
label_column = -1
# map column numbers to indices by subtracting 1
if type(label_column) == int:
label_columns = [label_column - 1]
elif type(label_column) == list:
label_columns = [int(l)-1 for l in label_column]
for i in range(len(label_columns)):
# map to positive value and undo previous offset
if label_columns[i] < 0:
label_columns[i] = label_columns[i] + len_line+1
if label_columns[i] < 0:
label_columns[i] = -1 + len_line
# very important sorting for index shifts
label_columns.sort()
# calculate unwanted columns
# note: These indices begin with 1 .
# They are internally shifted when used.
if self.meta_data.has_key("ignored_columns"):
ignored_columns = self.meta_data["ignored_columns"]
if not type(ignored_columns) == list:
warnings.warn("Wrong format of 'ignored_columns' parameter. "
"No columns will be ignored!")
ignored_columns = []
new_ignored_columns = []
for i in ignored_columns:
i = int(i)
if i < 0:
i += len_line
for label_column in label_columns:
if i > label_column:
i -= 1
new_ignored_columns.append(i)
else:
new_ignored_columns = []
new_ignored_columns.sort()
# get all relevant feature_names
if feature_names is None:
if s_format == "csv":
# delete blanks and inverted commas
for i in range(len(names)):
names[i] = names[i].strip(' "')
names[i] = names[i].strip(" '")
feature_names = names
else:  # s_format == "csvUnnamed"
feature_names = ["feature_%s" % i for i in
range(len_line)]
# switch label names to the end
i = 0 # reduce index, after previous labels were deleted
for label_column in label_columns:
try:
feature_names.append(feature_names[label_column-i])
del feature_names[label_column-i]
i += 1
except IndexError:
feature_names.append("")
# create new feature names
feature_names = [item for index, item in enumerate(
feature_names) if not index+1 in new_ignored_columns]
for _ in label_columns:
feature_names.pop(-1)
# read the data line by line
for line in data_set:
if not delimiter in line:
warnings.warn("Line without delimiter:\n%s" % str(line))
continue
line = line.split(delimiter)
line[-1] = line[-1].rstrip("\n\r")
if line[-1] == '':
line.pop(-1)
label = []
i = 0
for label_column in label_columns:
label.append(line.pop(label_column-i))
i += 1
# strip blanks and quotation marks from the label strings
label = [single_label.strip(' "').strip(" '")
for single_label in label]
# reduce to a plain string if there is only one label column
if len(label) == 1:
label = label[-1]
if label not in classes_names:
classes_names.append(label)
# create new line without the ignored columns
vector = [item for index, item in enumerate(line) if not
index+1 in new_ignored_columns]
sample = FeatureVector(numpy.atleast_2d([vector]).astype(
numpy.float64), feature_names=feature_names)
self.add_sample(sample=sample, label=label,
train=train_test, split=split_nr,
run=run_nr)
self.update_meta_data({"feature_names": sample.feature_names,
"num_features": len(sample.feature_names),
"classes_names": classes_names})
return self.data[(run_nr, split_nr, train_test)]
def store(self, result_dir, s_format=["pickle", "real"]):
""" Stores this collection in the directory *result_dir*.
In contrast to *dump* this method stores the collection
not in a single file but as a whole directory structure with meta
information etc. The data sets are stored separately for each run,
split, train/test combination.
The method expects the following parameters:
* *result_dir* The directory in which the collection will be stored
* *s_format* A list with information about the format in which the
actual data sets should be stored. The first entry specifies
the file format ("pickle", "arff" or "csv"). If it is "arff",
the second entry specifies the attribute format.
Examples: ["arff", "real"], ["arff", "{0,1}"]
To store the data in comma separated values, use ["csv", "real"].
(*optional, default: ["pickle", "real"]*)
.. todo:: Someone could implement the format ["fasta"] for sax features
The file name prefix is fixed to "features"; the actual file names
are determined by appending suffixes that encode run, split, and
train/test information.
.. todo:: Adapt storing of csv file to external library instead of
doing it manually.
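A hedged example; the result directory is arbitrary and assumed to
exist::

    dataset.store(result_dir="/tmp/my_collection",
                  s_format=["csv", "real"])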
"""
name = "features"
# Update the meta data
author = get_author()
self.update_meta_data({"type": "feature_vector",
"storage_format": s_format,
"author": author,
"data_pattern": "data_run" + os.sep
+ name + "_sp_tt." + s_format[0]})
if type(s_format) == list:
s_type = s_format[1]
s_format = s_format[0]
else:
s_type = "real"
if not s_format in ["csv", "arff", "pickle"]:
self._log("Storage format not supported! Using default.",
level=logging.ERROR)
s_format = "pickle"
# Iterate through splits and runs in this dataset
for key, feature_vectors in self.data.iteritems():
# test if dataset has already been loaded.
# Otherwise replace with iterator to loaded version.
if isinstance(feature_vectors, basestring):
feature_vectors = self.get_data(key[0], key[1], key[2])
# Construct result directory
result_path = result_dir + os.sep + "data" \
+ "_run%s" % key[0]
if not os.path.exists(result_path):
os.mkdir(result_path)
key_str = "_sp%s_%s" % key[1:]
# Store data depending on the desired format
if s_format == "pickle":
result_file = open(os.path.join(result_path,
name + key_str + ".pickle"),
"wb")
cPickle.dump(feature_vectors, result_file, cPickle.HIGHEST_PROTOCOL)
elif s_format == "arff": # Write as ARFF
result_file = open(os.path.join(result_path,
name + key_str + ".arff"),"w")
# Create the arff file header
relation_name = result_dir.split(os.sep)[-1]
result_file.write('@relation "%s"\n' % relation_name)
# Write the type of all features
for feature_name in self.meta_data["feature_names"]:
result_file.write("@attribute %s %s\n" % (feature_name, s_type))
classString = ",".join(sorted(self.meta_data["classes_names"]))
result_file.write("@attribute class {%s}\n" % classString)
result_file.write("@data\n")
# Write all given training data into the ARFF file
fv = feature_vectors[0][0]
if numpy.issubdtype(fv.dtype, numpy.string_):
feature_format = "%s,"
elif numpy.issubdtype(fv.dtype, numpy.floating):
feature_format = "%f,"
elif numpy.issubdtype(fv.dtype, numpy.integer):
feature_format = "%d,"
else:  # fall back to a generic string format
feature_format = "%s,"
for features, class_name in feature_vectors:
for feature in features[0]:
result_file.write(feature_format % feature)
result_file.write("%s\n" % str(class_name))
elif s_format == "csv": # Write as Comma Separated Value
result_file = open(os.path.join(result_path,
name + key_str + ".csv"),"w")
for feature_name in self.meta_data["feature_names"]:
result_file.write("%s," % (feature_name))
result_file.write("\n")
fv = feature_vectors[0][0]
if numpy.issubdtype(fv.dtype, numpy.floating):
feature_format = "%f,"
elif numpy.issubdtype(fv.dtype, numpy.integer):
feature_format = "%d,"
else:
feature_format = "%s,"
for features, class_name in feature_vectors:
f = features.view(numpy.ndarray)
for feature in f[0]:
result_file.write(feature_format % feature)
result_file.write("%s\n" % str(class_name))
result_file.close()
#Store meta data
BaseDataset.store_meta_data(result_dir,self.meta_data)