# Source code for pySPACE.missions.nodes.templates

""" Tell the developer about general coding and documentation approaches for nodes

A very useful tutorial can be found under :ref:`t_new_node`.
"""
from pySPACE.missions.nodes.base_node import BaseNode
import warnings
import logging
import numpy
from pySPACE.resources.data_types.feature_vector import FeatureVector
from pySPACE.tools.memoize_generator import MemoizeGenerator

class SimpleDataTransformationTemplateNode(BaseNode):
    """ Parametrized algorithm, transforming the data without training

    Describe your algorithm in detail.

    In the simplest case, an algorithm only implements its initialization
    and execution function like this node.

    The list of parameters should always be complete and correct to avoid
    hidden functionality.

    **References**

        If this node is using code from other implementations or
        is described in detail in a publication,
        mention the reference here.

    **Parameters**
        :Parameter1: Describe effect and specialties

            (*recommended, default: 42*)

        :Parameter2: Describe the effect, and if something special happens by
            default. It is also important to mention, which entries are possible
            (e.g. only True and False are accepted values).

            (*optional, default: False*)

    **Exemplary Call**

    .. code-block:: yaml

        -
            node : SimpleDataTransformationTemplate
            parameters:
                Parameter1 : 77
                Parameter2 : False


    :input:    Type1 (e.g. FeatureVector)
    :output:   Type2 (e.g. FeatureVector)
    :Author: Mario Muster (muster@informatik.exelent-university.de)
    :Created: 2013/02/25
    """
    def __init__(self, Parameter1=42, Parameter2=False, **kwargs):
        """ Set the basic parameters special for this algorithm

        If your init is not doing anything special, it does not need any
        documentation. The relevant class documentation
        is expected to be in the class docstring.

        .. note::
            The mapping from the call of the function with a YAML file
            and this init is totally straightforward.
            Every parameter in the dictionary description in the
            YAML file is directly used at the init call.
            The value of the parameter is transformed with the help
            of the YAML syntax (see: :ref:`yaml`).

        It is important to also use `**kwargs`, because they have to be
        forwarded to the base class, using:

        .. code-block:: python

            super(SimpleDataTransformationTemplateNode, self).__init__(**kwargs)

        .. warning::
            With the call of `super` comes some hidden functionality.
            Every self parameter in the init is made permanent via the
            function:
            :func:`~pySPACE.missions.nodes.base_node.BaseNode.set_permanent_attributes`
            from the base node.
            Normally all self parameters are instantiated after this call
            and have to be made permanent on their own.
            Permanent means, that these parameters are reset to the
            defined value, when the
            :func:`~pySPACE.missions.nodes.base_node.BaseNode.reset`
            method is called.
            This is for example done during k-fold cross validation, when
            the training fold is changed.
            For special variable types you may run into trouble,
            because set_permanent_attributes needs to copy them.

        .. warning::
            The init function is called before the distribution of node_chains
            in the parallel execution. So the node parameters need to be
            able to be stored into the pickle format.
            If you need parameters, which do not have this functionality,
            just initialize them with the first call of the training or execute
            method.

        Attributes are made permanent with keyword arguments:

        .. code-block:: python

            self.set_permanent_attributes(P1=Parameter1,
                                          P2=Parameter2,
                                          P3="Hello")

        Here `self.P3` will be an internal parameter.

        If *Parameter1* is not an integer, a warning is issued and the
        default value 42 is used instead.
        """
        super(SimpleDataTransformationTemplateNode, self).__init__(**kwargs)
        # isinstance is the idiomatic type check and also accepts int
        # subclasses; bool is an int subclass itself, so it is excluded
        # explicitly to keep rejecting True/False as before.
        is_integer = (isinstance(Parameter1, int)
                      and not isinstance(Parameter1, bool))
        if not is_integer:
            warnings.warn("Parameter 1 is having wrong type %s." %
                          str(type(Parameter1)))
            Parameter1 = 42
        self.set_permanent_attributes(P1=Parameter1,
                                      P2=Parameter2,
                                      P3="Hello")

    def _execute(self, x):
        """ General description of algorithm maybe followed by further details

        E.g. log "Hello" during first call and if P2 is set to True,
        always multiply data with P1 and in the other case forward the data.

        Logging is done using
        :func:`~pySPACE.missions.nodes.base_node.BaseNode._log`:

        .. code-block:: python

            self._log(self.P3, level=logging.DEBUG)

        To access only the data array and not the attached meta data, use
        `data = x.view(numpy.ndarray)` for preparation.

        :param x: the data sample to process; it has to provide a `view`
            method (as numpy arrays and their subclasses do)
        :returns: *x* itself if P2 is false, otherwise the result of
            `FeatureVector.replace_data` with the data scaled by P1
        """
        # P3 doubles as a "first call" flag: it is logged once and then
        # switched off so that later calls stay silent.
        if self.P3:
            self._log(self.P3, level=logging.DEBUG)
            self.P3 = False
        data = x.view(numpy.ndarray)
        if self.P2:
            data = self.P1 * data
            x = FeatureVector.replace_data(x, data)
        return x


class TrainableAlgorithmTemplateNode(SimpleDataTransformationTemplateNode):
    """ Template for trainable algorithms

    This node derives from :class:`SimpleDataTransformationTemplateNode`
    and therefore inherits its _execute and __init__ functions.
    In real nodes these methods usually have to be implemented as well;
    they are left out here only to keep the example short.

    A trainable method has to implement at least two functions:
    :func:`is_trainable` and :func:`_train`.
    Four further functions can optionally be overwritten:
    :func:`is_supervised`, :func:`_stop_training`, :func:`_inc_train`
    and :func:`start_retraining`.
    By default the first one returns `False` and the others do nothing.

    .. note:: The execute function is applied on all data,
              even the training data, but the true label remains unknown.

    **Parameters**
        Please refer to :class:`SimpleDataTransformationTemplateNode`

        .. note:: Parameter1 is determined, by counting the training examples.

    **Exemplary Call**

    .. code-block:: yaml

        -
            node : TrainableAlgorithmTemplateNode
            parameters:
                Parameter1 : 77
                Parameter2 : False

    :input:    Type1 (e.g. FeatureVector)
    :output:   Type2 (e.g. FeatureVector)
    :Author: Mario Muster (muster@informatik.exelent-university.de)
    :Created: 2013/02/25
    """
    def is_trainable(self):
        """ Return True to mark this node as trainable """
        return True

    def is_supervised(self):
        """ Return True so that the training functions receive the labels """
        return True

    def _train(self, data, class_label):
        """ Process one element of the training data

        Incremental algorithms directly use the example to update their
        parameters, whereas batch algorithms preprocess the data and only
        store it.

        If :func:`is_supervised` were not overwritten or returned `False`,
        this function would be defined without the parameter *class_label*.

        In this example *self.P1* counts the training examples and
        *self.P3* concatenates their labels.
        """
        # "Hello" is the start marker: the first example of a training
        # run resets the counter and the label string.
        starting_fresh = (self.P3 == "Hello")
        if starting_fresh:
            self.P1 = 0
            self.P3 = ""
        self.P1 += 1
        self.P3 += class_label

    def _stop_training(self):
        """ Called once all training examples have been processed

        To keep things simple, the default (doing nothing) is
        reimplemented here.
        """
        return None

    def _inc_train(self, data, class_label):
        """ Train on new examples in testing phase

        While testing in the application phase, new labeled examples
        may show up. This function is used to improve the already
        trained algorithm with these examples.

        .. note:: This method should always be as fast as possible.

        To keep things simple, everything is forwarded to :func:`_train`.

        For more details on retraining (how to turn it on, and how it works),
        have a look at the documentation of the *retrain* parameter
        in the :class:`~pySPACE.missions.nodes.base_node.BaseNode`.
        """
        self._train(data, class_label)

    def start_retraining(self):
        """ Prepare retraining

        Usually this method is not needed and does nothing.
        It exists for the case that some parameters have to be changed
        before the first retraining with the _inc_train method.

        In this example the start marker *self.P3* is simply reset.
        """
        self.P3 = "Hello"


class SpecialPurposeFunctionsTemplate(BaseNode):
    """ Introduce additional available functions

    Additional to the aforementioned methods,
    some algorithms have to overwrite the default behavior of nodes,
    directly change the normal data flow,
    manipulate data or labels,
    or communicate information to other nodes.

    Some of these methods will be introduced in the following and some use cases
    will be given.

    .. warning::
        Every method in the :class:`~pySPACE.missions.nodes.base_node.BaseNode`
        could be overwritten but this should be done very carefully to avoid
        bad side effects.
    """
    def store_state(self, result_dir, index=None):
        """ Store some additional results or information of this node

        Here the parameter *self.store* should be used to switch on the saving,
        since this method is called in every case,
        but should only store information, if this parameter is set true.

        This method is automatically called during benchmarking
        for every node.
        It is for example used
        to store visualization of algorithms or data.

        Additionally to the result_dir, the node name should be used.
        If you expect this node to occur multiple times in a node chain,
        also use the index. This can be done for example like:

        .. code-block:: python

            import os
            from pySPACE.tools.filesystem import create_directory
            if self.store:
                # set the specific directory for this particular node
                node_dir = os.path.join(result_dir, self.__class__.__name__)
                # do we have an index-number?
                if index is not None:
                    # add the index-number...
                    node_dir += "_%d" % int(index)
                create_directory(node_dir)

        Furthermore, it is very important to integrate the split number into
        the file name when storing, because otherwise your results will be
        overwritten. The convention in pySPACE is, to have a meaningful name
        of the part of the node you store followed by an underscore and 'sp'
        and the split number as done in

        .. code-block:: python

            file_name = "%s_sp%s.pickle" % ("patterns", self.current_split)
        """
        pass

    def reset(self):
        """ Resets the node to a clean state

        Every parameter set with
        :func:`~pySPACE.missions.nodes.base_node.BaseNode.set_permanent_attributes`
        is by default reset here to its specified value or deleted,
        if no value is specified.

        Since this method copies every parameter or some variables escape
        from the normal class variables scope, some nodes need to overwrite
        this method.

        When you really need to overwrite this method some points have to be
        considered. For the normal functionality of the node, the super method
        needs to be called. To avoid deleting of the special variables,
        they have to be made local variables beforehand and afterwards again
        cast to class variables. This is depicted in the following example code,
        taken from the
        :class:`~pySPACE.missions.nodes.meta.same_input_layer.SameInputLayerNode`.

        .. code-block:: python

            def reset(self):
                ''' Also reset internal nodes '''
                nodes = self.nodes
                for node in nodes:
                    node.reset()
                super(SameInputLayerNode, self).reset()
                self.nodes = nodes
        """
        pass

    def get_result_dataset(self):
        """ Implementing this function, makes a node a :mod:`~pySPACE.missions.nodes.sink` """
        pass

    def request_data_for_training(self, use_test_data):
        """ Returns generator for training data for subsequent nodes of the node chain

        If *use_test_data* is true, all available data is used for
        training, otherwise only the data that is explicitly for training.

        These methods normally use the
        :class:`~pySPACE.tools.memoize_generator.MemoizeGenerator`
        to define their generator.
        When implementing such a method, one should always try not to double
        data but only redirect it, without extra storing it.

        The definition or redefinition of training data is done
        by :mod:`~pySPACE.missions.nodes.source` and
        :mod:`~pySPACE.missions.nodes.splitter` nodes.
        """
        pass

    def request_data_for_testing(self):
        """ Returns data for testing of subsequent nodes of the node chain

        When defining :func:`request_data_for_training` this method
        normally has to be
        implemented/overwritten, too and vice versa.
        """
        pass

    def process_current_split(self):
        """ Main processing part on test and training data of current split

        This method is called in the usage with benchmark node chains
        and defines the gathering of the result data of the node chain
        for a :mod:`~pySPACE.missions.nodes.sink` node.

        Hereby it gets the data by calling
        :func:`request_data_for_training` and :func:`request_data_for_testing`.

        In the case of using the
        :class:`~pySPACE.missions.nodes.cv_splitter.CrossValidationSplitterNode`,
        this method is called multiple times
        for each split and stores every time the result in the result dataset
        separately.

        Though this approach seems very complicated on first sight,
        it gives three very strong advantages.

        * The cross validation can be done exactly before the first trainable
          node in the node chain and circumvents unnecessary double processing.

        * By handling indices instead of real data,
          the data for training and testing is not copied and memory is saved.

        * The cross validation is very easy to use.
          Moving this functionality to the
          :mod:`~pySPACE.resources.dataset_types`
          would make the usage much more complicated and inefficient.
          Especially for nodes, which internally use node chains,
          like the :mod:`~pySPACE.missions.nodes.meta.parameter_optimization`
          nodes, this easy access pays off.
        """
        pass

    def get_sensor_ranking(self):
        """ Return sensor ranking fitting to the algorithm

        For usage with the ranking variant in the
        :class:`~pySPACE.missions.nodes.spatial_filtering.sensor_selection.SensorSelectionRankingNode`
        this method of the node is called to get the ranking to reduce sensors.

        The ranking is a sorted list of tuple (sensor name, weight).
        The first element has to correspond to the
        sensor with the lowest weight, meaning it is the most unimportant.

        .. note:: The code here is a copy from
            :class:`~pySPACE.missions.nodes.classification.base`
            which takes the classification vector `self.features`
            and sums up the absolute values fitting to one channel.
            It is only used as an example.

        :returns: list of (channel name, summed absolute weight) tuples,
            sorted by ascending weight
        """
        from collections import defaultdict
        # the channel name is the token between the first and the second
        # underscore of each feature name
        feat_channel_names = [chnames.split('_')[1]
                              for chnames in self.features.feature_names]
        ranking_dict = defaultdict(float)
        # The names are looked up by index (instead of zipping both
        # sequences), so a missing feature name still raises an IndexError
        # rather than silently dropping weights.
        for i, weight in enumerate(self.features[0]):
            ranking_dict[feat_channel_names[i]] += abs(weight)
        ranking = sorted(ranking_dict.items(), key=lambda t: t[1])
        return ranking


class SimpleSourceTemplateNode(BaseNode):
    """ A simple template that illustrates the basic principles of a source node

    In `pySPACE`, source nodes are used at the beginning of the node chain.
    The source nodes are responsible for the input of data, be it from a
    static source or from a live stream.

    It is very important to note that these nodes just serve the purpose of
    providing the node chain with an input dataset and do not perform any
    changes on the data itself. That being said, these nodes **do not**
    have an **input node** and are **not trainable**!

    In the following we will discuss the general strategy for building a new
    source node for a static input data set which has been saved to disk.
    In the case of more complicated inputs, please consult the documentation of
    :mod:`~pySPACE.missions.nodes.source.external_generator_source.ExternalGeneratorSourceNode`
    and :mod:`~pySPACE.missions.nodes.source.time_series_source.Stream2TimeSeriesSourceNode`
    """
    def __init__(self, **kwargs):
        """ Initialize some values to 0 or `None`

        The initialization routine of the source node is basically completely
        empty. Should you feel the need to do something in this part of the
        code, you can initialize the ``input_dataset`` to ``None``. This
        attribute will then later be changed when the ``set_input_dataset``
        method is called.

        If the user wants to generate the dataset inside the SourceNode,
        this should be done in the ``__init__`` method though. A good example
        of this practice can be found in the
        :mod:`~pySPACE.missions.nodes.source.random_time_series_source.RandomTimeSeriesSourceNode`
        """
        super(SimpleSourceTemplateNode, self).__init__(**kwargs)

        self.set_permanent_attributes(dataset=None)

    def set_input_dataset(self, dataset):
        """ Sets the dataset from which this node reads the data

        This method is the beginning of the node. Put simply, this method
        starts the feeding process of your node chain by telling the node chain
        where to get the data from.
        """
        self.set_permanent_attributes(dataset=dataset)

    def _dataset_key(self, data_kind):
        """ Return the (run, split, kind) key used to look up data in the dataset

        If the input dataset consists only of one single run,
        this run is used as input for all runs to be conducted (i.e. we
        rely on later randomization of the order). Otherwise
        the data for the current run number is used.

        *data_kind* is either "train" or "test".
        """
        if self.dataset.meta_data["runs"] > 1:
            return (self.run_number, self.current_split, data_kind)
        return (0, self.current_split, data_kind)

    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        This method streams training data and sends it to the subsequent nodes.
        If one looks at the tutorial related to building new nodes (available in
        the tutorial section), one can see exactly where the ``request_data``
        methods are put to use.

        The following example is one that was extracted from the
        :mod:`~pySPACE.missions.nodes.source.feature_vector_source.FeatureVectorSourceNode`
        which should (in theory at least) be implementable for all types of data.

        If *use_test_data* is true, the result of
        :func:`request_data_for_testing` is returned instead.
        """
        if use_test_data:
            # Return the test data as there is no additional data that
            # was dedicated for training
            return self.request_data_for_testing()

        key = self._dataset_key("train")
        # Check if there is training data for the current split and run
        if key in self.dataset.data:
            self._log("Accessing input dataset's training feature vector windows.")
            training_iterator = iter(self.dataset.get_data(*key))
        else:
            # Use a generator that is immediately exhausted, since
            # this node does not provide any data that is explicitly
            # dedicated for training
            self._log("No training data available.")
            training_iterator = (x for x in [])
        self.data_for_training = MemoizeGenerator(training_iterator,
                                                  caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()

    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        The principles of obtaining the testing data are the same as the
        principles used in obtaining the training data set. The test
        generator is created on the first call only and reused afterwards.
        """
        # If we haven't read the data for testing yet
        # NOTE(review): data_for_testing is not set in this class; presumably
        # the base node initializes it to None -- verify against BaseNode.
        if self.data_for_testing is None:
            self._log("Accessing input dataset's test feature vector windows.")
            key = self._dataset_key("test")
            test_data_generator = iter(self.dataset.get_data(*key))

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()

    def getMetadata(self, key):
        """ Return the value corresponding to the given key from the dataset meta data of this source node

        At some point in time, you might need to know the metadata of some
        specific input in your input and this is when you would use this method.
        """
        return self.dataset.meta_data.get(key)

    def use_next_split(self):
        """ Return False

        The method will always return `False` since the SourceNode
        should (in the case of more than 1 split) execute the splits in
        parallel and not in series.
        """
        return False


class SimpleSinkTemplateNode(BaseNode):
    """ A simple template that illustrates the basic principles of a sink node

    The sink node is always placed at the end of the node chain. You can think
    of a sink node as a place in which you can throw all your data and it will
    do something with this data e.g. saving it to disk.

    Of course, this is not the only possibility for a Sink node but it is the
    most basic one. One example of a more complex process happening inside the
    Sink node is that of the
    :mod:`~pySPACE.missions.nodes.sink.classification_performance_sink.PerformanceSinkNode`
    whereby the classification results are collected into a complex structure
    that reflects the performance of the entire node chain.

    That being said, this template addresses the very simple case of just
    collecting the results of the node chain and doing something with them.

    For a complete list of the available nodes, please consult
    :mod:`~pySPACE.missions.nodes.sink`
    """

    def __init__(self, selection_criterion=None, data=None, **kwargs):
        """ Initialize some criterion of selection for the data

        In the initialization stage, the node is expected to just save some
        permanent attributes that it might use at a later point in time.
        In the case of :class:`~pySPACE.resources.data_types.feature_vector.FeatureVector`
        data, this criterion might represent selected channel names (as
        implemented in
        :mod:`~pySPACE.missions.nodes.sink.feature_vector_sink.FeatureVectorSinkNode`)
        while for :mod:`~pySPACE.resources.data_types.time_series.TimeSeries`
        it might represent a sorting criterion, as implemented in
        :mod:`~pySPACE.missions.nodes.sink.time_series_sink.TimeSeriesSinkNode`.
        Since this is only a mere template, we will call our selection criterion
        `selection_criterion` and leave it up to the user to implement specific
        selection criteria.

        The criterion is stored as *self.selection_crit* and the data
        as *self.data*.
        """
        super(SimpleSinkTemplateNode, self).__init__(**kwargs)

        self.set_permanent_attributes(selection_crit=selection_criterion,
                                      data=data)

    def is_trainable(self):
        """ Return True if the node is trainable

        While the sink nodes do not need to be trained, they do need access to
        the training data that is sent through the node chain. In order to
        achieve this, the :func:`~pySPACE.missions.nodes.base_node.BaseNode.is_trainable`
        function from the `BaseNode` is overwritten such that it
        always returns `True` when access to the training data is required.
        """
        return True

    def is_supervised(self):
        """ Returns True if the node requires supervised training

        The function will almost always return True. If the node requires access
        to the training data i.e. if the node `is_trainable` it will almost
        surely also be supervised.
        """
        return True

    def _train(self, data, label):
        """ Tell the node what to do with specific data inputs

        In the case of Sink nodes, the `_train` function is usually overwritten
        with a dummy function that either returns the input data e.g.
        :mod:`~pySPACE.missions.nodes.sink.analyzer_sink.AnalyzerSinkNode`
        or just does nothing (as we will implement it here).
        """
        pass

    def reset(self):
        """ Reset the permanent parameters of the node chain

        When used inside a node chain, the Sink node should also be
        responsible for saving the permanent state parameters. These
        parameters get reinitialized whenever the node chain reaches its
        end. Nevertheless, the parameters should be saved such that they
        can be inspected after the entire procedure has finished.

        The following piece of code was adapted from
        :mod:`~pySPACE.missions.nodes.sink.feature_vector_sink.FeatureVectorSinkNode`
        with the `FeatureVector` specific parameters changed to dummy
        variables.

        The collected data survives the reset: it is written into the
        permanent state before the instance attributes are replaced by it.
        """
        import copy
        tmp = self.permanent_state
        # Keep the result under the attribute name that __init__ and
        # get_result_dataset use; otherwise `self.data` would fall back
        # to its initial value and the collected result would be lost.
        tmp["data"] = self.data
        # Former key, kept so that code reading `self.dataset` after a
        # reset continues to work.
        tmp["dataset"] = self.data
        # A shallow copy keeps the instance attributes and the stored
        # permanent state in two separate dictionaries.
        self.__dict__ = copy.copy(tmp)
        # Replacing __dict__ dropped every other attribute, so the permanent
        # state itself has to be attached again.
        self.permanent_state = tmp

    def process_current_split(self):
        """  The final processing step for the current split

        This function should contain the last activities that need to be run in
        the current split. You should include any method that combines, selects
        or transforms the result data set in any way into this function.
        """
        pass

    def get_result_dataset(self):
        """ Return the result dataset

        This function should be built such that it returns the result dataset.
        Here this is simply *self.data*.
        """
        return self.data