
""" Select only a part of the instances

.. todo:: group instance selectors
"""

import random
import logging
from collections import defaultdict

from pySPACE.missions.nodes.base_node import BaseNode
from pySPACE.tools.memoize_generator import MemoizeGenerator


class InstanceSelectionNode(BaseNode):
    """ Retain only a certain percentage of the instances

    The node InstanceSelectionNode forwards only *train_percentage_selected*
    percent of the training instances passed to it to the successor node,
    and only *test_percentage_selected* percent of the test instances.
    The forwarded instances are selected randomly but such that the class
    ratio is kept.

    If *reduce_class* is used, only the chosen class is reduced and the
    class ratio is not kept. Consequently, the total amount of reduced data
    does not match the percentage values.

    **Parameters**

        :train_percentage_selected:
            The percentage of training instances which is forwarded to the
            successor node.

            (*optional, default: 100*)

        :test_percentage_selected:
            The percentage of test instances which is forwarded to the
            successor node.

            (*optional, default: 100*)

        :reduce_class:
            If you want to reduce only one class, set this parameter to the
            respective class label; otherwise, both classes are reduced in
            a balanced fashion.

            (*optional, default: False*)

        :num_train_instances:
            Instead of specifying *train_percentage_selected*, this option
            allows one to specify the absolute number of training instances
            of class *class_label* that should be in the training set.
            All instances that occur until *num_train_instances* such
            instances are found are used for training.

            (*optional, default: None*)

        :class_label:
            If the *num_train_instances* option is used, this string
            determines the class of which the training examples are counted.

            (*optional, default: 'Target'*)

        :random:
            If *False*, the order of the data is retained, i.e., the first
            X percent or the first *num_train_instances* instances are used
            for training. If *True*, the training data is sampled randomly
            without taking the data's order into consideration.

            (*optional, default: True*)

    **Exemplary call**

    .. code-block:: yaml

        - node : InstanceSelection
          parameters :
              train_percentage_selected : 80
              test_percentage_selected : 100
              reduce_class : Standard

    :Author: Jan Hendrik Metzen (jhm@informatik.uni-bremen.de)
    :Created: 2010/03/31
    """
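    # Illustrative note (not from the original docs): selecting a fixed
    # number of training instances instead of a percentage would be
    # configured roughly like this (the values below are hypothetical):
    #
    #     - node : InstanceSelection
    #       parameters :
    #           num_train_instances : 50
    #           class_label : "Target"
    #           random : False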
    def __init__(self, train_percentage_selected=100,
                 test_percentage_selected=100, reduce_class=False,
                 num_train_instances=None, class_label='Target',
                 random=True, **kwargs):
        super(InstanceSelectionNode, self).__init__(**kwargs)

        self.set_permanent_attributes(
            train_percentage_selected=train_percentage_selected,
            test_percentage_selected=test_percentage_selected,
            reduce_class=reduce_class,
            num_train_instances=num_train_instances,
            class_label=class_label,
            random=random)
    def get_num_data(self, iterator):
        """ Collect instances until enough instances of *class_label* occur

        Return a list of instances that contains *num_train_instances*
        instances of class *class_label* and all other instances that
        occur up to this point.
        """
        counter = 0
        retained_instances = []
        while counter < self.num_train_instances:
            try:
                instance, label = next(iterator)
            except StopIteration:
                self._log("Only %d instances of class %s found, but %d were "
                          "requested." % (counter, self.class_label,
                                          self.num_train_instances),
                          level=logging.WARNING)
                break
            if label == self.class_label:
                counter += 1
            retained_instances.append((instance, label))
        return retained_instances
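    # Worked example (illustrative, not from the original code): with
    # num_train_instances=2 and class_label='Target', the input sequence
    #     (x1, 'Target'), (x2, 'Standard'), (x3, 'Target'), (x4, 'Standard')
    # yields [(x1, 'Target'), (x2, 'Standard'), (x3, 'Target')], i.e.,
    # everything up to and including the second 'Target' instance.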
    def request_data_for_training(self, use_test_data):
        """ Return the reduced training data for subsequent nodes

        .. note:: This method works differently in InstanceSelectionNode
                  than in other nodes: Only *train_percentage_selected*
                  percent of the available data is returned.
        """
        assert(self.input_node is not None)

        if self.train_percentage_selected > 100:
            self._log("Train percentage of %f reduced to 100."
                      % self.train_percentage_selected, level=logging.ERROR)
            self.train_percentage_selected = 100

        self._log("Data for training is requested.", level=logging.DEBUG)

        if self.train_percentage_selected == 100 and \
                self.num_train_instances is None:
            return super(InstanceSelectionNode,
                         self).request_data_for_training(use_test_data)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)

            if self.num_train_instances is not None and not self.random:
                # take the first instances in the given order
                retained_instances = self.get_num_data(
                    self.input_node.request_data_for_training(use_test_data))
            else:
                # Store all data
                if self.num_train_instances is None:
                    all_instances = defaultdict(list)
                    for instance, label in \
                            self.input_node.request_data_for_training(
                                use_test_data):
                        all_instances[label].append(instance)
                else:
                    all_instances = list(
                        self.input_node.request_data_for_training(
                            use_test_data))
                r = random.Random(self.run_number)
                if self.num_train_instances is not None and self.random:
                    r.shuffle(all_instances)
                    retained_instances = self.get_num_data(
                        iter(all_instances))
                else:
                    retained_instances = []
                    self._log("Keeping only %s percent of training data"
                              % self.train_percentage_selected,
                              level=logging.DEBUG)
                    # Retain only *train_percentage_selected* percent of
                    # the data
                    for label, instances in all_instances.iteritems():
                        if self.random:
                            # enable random choice of samples
                            r.shuffle(instances)
                        if self.reduce_class and self.reduce_class != label:
                            # only *reduce_class* is reduced; all other
                            # classes are forwarded completely
                            end_index = len(instances)
                        else:
                            end_index = int(round(
                                len(instances) *
                                self.train_percentage_selected / 100.0))
                        retained_instances.extend(
                            zip(instances[0:end_index], [label] * end_index))
                    if self.random:
                        # mix up samples between the different labels
                        r.shuffle(retained_instances)

            # Compute a generator that yields the training data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that
            # will yield the same sequence
            train_data_generator = ((self.execute(data), label)
                                    for (data, label) in retained_instances)
            self.data_for_training = MemoizeGenerator(
                train_data_generator, caching=self.caching)
            self._log("Data for training finished", level=logging.DEBUG)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
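    # Worked example (illustrative, not from the original code): with
    # train_percentage_selected=80, reduce_class='Standard', and 10
    # instances per class, end_index is int(round(10 * 80 / 100.0)) = 8
    # for 'Standard' and len(instances) = 10 for every other label, so
    # 8 'Standard' and 10 'Target' instances are forwarded.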
    def request_data_for_testing(self):
        """ Return the reduced test data for subsequent nodes """
        assert(self.input_node is not None)

        if self.test_percentage_selected > 100:
            self._log("Test percentage of %f reduced to 100."
                      % self.test_percentage_selected, level=logging.ERROR)
            self.test_percentage_selected = 100

        self._log("Data for testing is requested.", level=logging.DEBUG)

        if self.test_percentage_selected == 100:
            return super(InstanceSelectionNode,
                         self).request_data_for_testing()

        # If we haven't computed the data for testing yet
        if self.data_for_testing is None:
            # Assert that this node has already been trained
            assert(not self.is_trainable()
                   or self.get_remaining_train_phase() == 0)

            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in self.input_node.request_data_for_testing():
                all_instances[label].append(instance)

            self._log("Keeping only %s percent of test data"
                      % self.test_percentage_selected, level=logging.DEBUG)

            r = random.Random(self.run_number)
            # Retain only *test_percentage_selected* percent of the data
            retained_instances = []
            for label, instances in all_instances.iteritems():
                # enable random choice of samples
                r.shuffle(instances)
                if self.reduce_class and self.reduce_class != label:
                    # only *reduce_class* is reduced; all other classes
                    # are forwarded completely
                    end_index = len(instances)
                else:
                    end_index = int(round(
                        len(instances) *
                        self.test_percentage_selected / 100.0))
                retained_instances.extend(
                    zip(instances[0:end_index], [label] * end_index))
            # mix up samples between the different labels
            r.shuffle(retained_instances)

            # Compute a generator that yields the test data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that
            # will yield the same sequence
            self._log("Producing data for testing.", level=logging.DEBUG)
            test_data_generator = ((self.execute(data), label)
                                   for (data, label) in retained_instances)
            self.data_for_testing = MemoizeGenerator(
                test_data_generator, caching=self.caching)
            self._log("Data for testing finished", level=logging.DEBUG)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
    def _execute(self, time_series):
        # The kept instances are forwarded unchanged
        return time_series
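# A minimal, self-contained sketch (not part of the pySPACE API) of the
# stratified percentage selection implemented above, assuming plain
# (instance, label) pairs instead of pySPACE data types. The helper name
# _stratified_selection_sketch is hypothetical.
def _stratified_selection_sketch(samples, percentage, run_number=0):
    """ Keep *percentage* percent of *samples* per label, shuffled """
    grouped = defaultdict(list)
    for instance, label in samples:
        grouped[label].append(instance)
    r = random.Random(run_number)
    retained = []
    for label, instances in grouped.items():
        # shuffle within each class, then cut off at the requested fraction
        r.shuffle(instances)
        end_index = int(round(len(instances) * percentage / 100.0))
        retained.extend(zip(instances[0:end_index], [label] * end_index))
    # mix up samples between the different labels
    r.shuffle(retained)
    return retained
# Example: for 10 'A' samples and 5 'B' samples with percentage=80, the
# sketch returns 8 'A' and 4 'B' samples, preserving the 2:1 class ratio.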
class ReduceOverrepresentedClassNode(BaseNode):
    """ Reject instances to balance categories for classification

    The node forwards only a reduced number of the training and test
    instances of the bigger class to get a balanced ratio of the classes.
    The forwarded instances are selected randomly. All data of the
    underrepresented class is forwarded.

    **Parameters**

        This node has no additional parameters.

    **Exemplary call**

    .. code-block:: yaml

        - node : Reduce_Overrepresented_Class

    :Author: Hendrik Woehrle (hendrik.woehrle@dfki.de)
    :Created: 2010/09/22
    """
    def __init__(self, **kwargs):
        super(ReduceOverrepresentedClassNode, self).__init__(**kwargs)
    def request_data_for_training(self, use_test_data):
        """ Return the balanced training data for subsequent nodes """
        assert(self.input_node is not None)

        self._log("Data for training is requested.", level=logging.DEBUG)

        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)

            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in \
                    self.input_node.request_data_for_training(use_test_data):
                all_instances[label].append(instance)

            retained_instances = self.balance_instances(all_instances)

            # Compute a generator that yields the training data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that
            # will yield the same sequence
            train_data_generator = ((self.execute(data), label)
                                    for (data, label) in retained_instances)
            self.data_for_training = MemoizeGenerator(
                train_data_generator, caching=self.caching)
            self._log("Data for training finished", level=logging.DEBUG)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
    def request_data_for_testing(self):
        """ Return the balanced test data for subsequent nodes """
        assert(self.input_node is not None)

        self._log("Data for testing is requested.", level=logging.DEBUG)

        # If we haven't computed the data for testing yet
        if self.data_for_testing is None:
            # Assert that this node has already been trained
            assert(not self.is_trainable()
                   or self.get_remaining_train_phase() == 0)

            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in self.input_node.request_data_for_testing():
                all_instances[label].append(instance)

            retained_instances = self.balance_instances(all_instances)

            # Compute a generator that yields the test data and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that returns a new generator that
            # will yield the same sequence
            self._log("Producing data for testing.", level=logging.DEBUG)
            test_data_generator = ((self.execute(data), label)
                                   for (data, label) in retained_instances)
            self.data_for_testing = MemoizeGenerator(
                test_data_generator, caching=self.caching)
            self._log("Data for testing finished", level=logging.DEBUG)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
    def _execute(self, time_series):
        # The kept instances are forwarded unchanged
        return time_series
    def balance_instances(self, all_instances):
        """ Reject instances from the bigger class to balance the class sizes

        The node was designed for binary classification, i.e., exactly two
        classes, but the balancing works for any number of classes.
        """
        retained_instances = []

        # determine the size of the smallest class
        min_num_instances_per_class = float("inf")
        for label, instances in all_instances.iteritems():
            min_num_instances_per_class = min(min_num_instances_per_class,
                                              len(instances))

        r = random.Random(self.run_number)

        # retain only the number of instances that corresponds to the
        # size of the smallest class
        for label, instances in all_instances.iteritems():
            r.shuffle(instances)
            retained_instances.extend(
                zip(instances[0:min_num_instances_per_class],
                    [label] * min_num_instances_per_class))
        r.shuffle(retained_instances)
        return retained_instances
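    # Worked example (illustrative, not from the original code): with 120
    # 'Standard' and 80 'Target' instances, balance_instances retains 80
    # randomly chosen 'Standard' instances and all 80 'Target' instances,
    # i.e., 160 instances with a 1:1 class ratio.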
_NODE_MAPPING = {
    "RandomInstanceSelection": InstanceSelectionNode,
    "Reduce_Overrepresented_Class": ReduceOverrepresentedClassNode}
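# A short, hypothetical self-test of the sketch above; it does not touch
# any pySPACE infrastructure and only illustrates the selection behaviour.
if __name__ == "__main__":
    samples = [(i, 'A') for i in range(10)] + [(i, 'B') for i in range(5)]
    retained = _stratified_selection_sketch(samples, 80)
    counts = defaultdict(int)
    for _, label in retained:
        counts[label] += 1
    # expected: 8 samples of 'A' and 4 of 'B' (the 2:1 ratio is kept)
    print(dict(counts))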