""" Split data into one training and one test data set with restriction like randomization or fixed percentages """
import random
from pySPACE.missions.nodes.base_node import BaseNode
from pySPACE.tools.memoize_generator import MemoizeGenerator
class TrainTestSplitterNode(BaseNode):
""" Split data into one training and one test data set with a fixed ratio
The relative
size of the two sets is controlled via the parameter train_ratio.
.. warning:: the class ratio is not retained
.. todo::
introduce stratified parameter as in CV_Splitter
**Parameters**
:train_ratio:
The ratio of the overall available data that is assigned to the
training set. The remaining data (1-train_ratio) is used for testing.
(*optional, default: 0.5*)
        :num_train_instances:
            Instead of specifying a train_ratio, this option allows specifying
            the absolute number of training instances of class *class_label*
            that should be in the training set. All instances that occur
            before *num_train_instances* instances of this class have been
            found are used for training. The remaining data are used for
            testing.

            (*optional, default: None*)

        :class_label:
            If the *num_train_instances* option is used, this string
            determines the class of which training examples are counted.

            (*optional, default: 'Target'*)
        :random:
            If *False*, the order of the data is retained, i.e. the first
            train_ratio portion of the instances is used for training and
            the remaining instances as test data. If *True*, the two sets
            are sampled randomly from the data without taking the data's
            order into consideration.

            (*optional, default: True*)
    **Exemplary Call**

    .. code-block:: yaml

        -
            node : TrainTestSplitter
            parameters :
                train_ratio : 0.7
                random : False
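
    A call that selects a fixed number of training instances of one class
    instead of a ratio might look like this (the parameter values below are
    only illustrative):

    .. code-block:: yaml

        -
            node : TrainTestSplitter
            parameters :
                num_train_instances : 50
                class_label : "Target"
                random : False
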
:Author: Jan Hendrik Metzen (jhm@informatik.uni-bremen.de)
:Created: 2010/03/08 (Documentation, old node)
:LastChange: 2011/11/14 (Documentation) Anett Seeland
"""
    def __init__(self, train_ratio=0.5, random=True,
                 num_train_instances=None, class_label='Target',
                 reverse=False, **kwargs):
super(TrainTestSplitterNode, self).__init__(**kwargs)
        assert not (random and reverse), \
            "Reverse ordering makes no sense when randomization is active!"
self.set_permanent_attributes(train_ratio=train_ratio,
random=random,
num_train_instances=num_train_instances,
class_label=class_label,
reverse=reverse,
train_data=None,
test_data=None)
    def is_split_node(self):
""" Returns whether this is a split node. """
return True
    def use_next_split(self):
        """ Use the next split of the data into training and test data

        Returns True if more splits are available, otherwise False.
        This method is useful for benchmarking. Since this node provides
        only a single train/test split, it always returns False.
        """
        return False
    def train_sweep(self, use_test_data):
        """ Performs the actual training of the node

        .. note:: Split nodes cannot be trained
        """
        raise Exception("Split nodes cannot be trained")
    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        The train/test split is created lazily when the data is first
        requested.
        """
        # Create split lazily when required
        if self.train_data is None:
            self._create_split()
        # Create training data generator
        self.data_for_training = \
            MemoizeGenerator(instance for instance in self.train_data)
        return self.data_for_training.fresh()
    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        The train/test split is created lazily when the data is first
        requested.
        """
        # Create split lazily when required
        if self.test_data is None:
            self._create_split()
        # Create test data generator
        self.data_for_testing = \
            MemoizeGenerator(instance for instance in self.test_data)
        return self.data_for_testing.fresh()
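
    # Note on the two request methods above: the split data is wrapped in a
    # MemoizeGenerator, and fresh() hands out a new iterator over the cached
    # instances, so that several subsequent nodes can each consume the same
    # train or test split independently.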
    def _create_split(self):
""" Create the split of the data into training and test data. """
self._log("Splitting data into train and test data")
train_data = list(self.input_node.request_data_for_training(use_test_data=False))
# If there is already a non-empty training set,
# it means that we are not the first split node in the node chain.
if len(train_data) > 0:
raise Exception("No iterated splitting of data sets allowed\n "
"(Calling a splitter on a data set that is already "
"split)")
        # Create a generator instead of loading all data: if a fixed number
        # of training instances is requested and the order should be
        # retained, consume exactly that many instances from the input
        # generator for training and keep the rest as a generator for
        # testing.
        if self.num_train_instances and not self.random:
            self.train_data = []
            input_generator = self.input_node.request_data_for_testing()
            for i in range(self.num_train_instances):
                self.train_data.append(next(input_generator))
            self.test_data = input_generator
            return
# Gather all test data
test_data = list(self.input_node.request_data_for_testing())
# Remember all the data and store it in memory
        # TODO: This might cause problems for large datasets
data = train_data + test_data
data_size = len(data)
        # Randomize the order unless randomization is switched off
if self.random:
r = random.Random(self.run_number)
r.shuffle(data)
        if self.num_train_instances is not None:
            if self.reverse:
                data = data[::-1]
            # If exactly num_train_instances examples of the requested class
            # are available, all data is used for training
            if len([i for i in range(len(data))
                    if data[i][1] == self.class_label]) == self.num_train_instances:
                train_end = data_size
            else:
                counter = 0
                for (index, (window, label)) in enumerate(data):
                    # print "Label: ", label, "time point: ", window.start_time
                    if label == self.class_label:
                        counter += 1
                    if counter == self.num_train_instances:
                        train_end = index + 1
                        break
                assert(self.num_train_instances == counter), \
                    "Too many instances to select."
        else:
            # Split data into train and test data according to train_ratio
            train_end = int(round(data_size * self.train_ratio))
        self.train_data = data[0:train_end]
        self.test_data = data[train_end:]
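

if __name__ == "__main__":
    # Minimal standalone sketch of the two splitting strategies used in
    # _create_split, applied to hypothetical (data, label) pairs. This is
    # only an illustration and does not run a pySPACE node chain.
    import random as _random

    samples = [("sample_%02d" % i, "Target" if i % 2 == 0 else "Standard")
               for i in range(10)]

    # Ratio-based split (random=True): shuffle with a fixed seed, then cut
    # off the first train_ratio portion for training.
    shuffled = list(samples)
    _random.Random(0).shuffle(shuffled)
    cut = int(round(len(shuffled) * 0.7))
    train, test = shuffled[:cut], shuffled[cut:]
    print("%d train / %d test" % (len(train), len(test)))  # -> 7 train / 3 test

    # Fixed-number split (num_train_instances=3, class_label="Target",
    # random=False): everything up to and including the third "Target"
    # instance goes to training, the rest is used for testing.
    wanted, counter, cut = 3, 0, len(samples)
    for index, (_, label) in enumerate(samples):
        if label == "Target":
            counter += 1
        if counter == wanted:
            cut = index + 1
            break
    train, test = samples[:cut], samples[cut:]
    print("%d train / %d test" % (len(train), len(test)))  # -> 5 train / 5 test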