Source code for pySPACE.environments.chains.node_chain

# coding=utf-8
""" NodeChains are sequential orders of :mod:`~pySPACE.missions.nodes`

.. image:: ../../graphics/node_chain.png
   :width: 500

There are two main use cases:

    * the application for :mod:`~pySPACE.run.launch_live` and the
        :mod:`~pySPACE.environments.live` using the default
        :class:`NodeChain` and
    * the benchmarking with :mod:`~pySPACE.run.launch` using
        the :class:`BenchmarkNodeChain` with the
        :mod:`~pySPACE.missions.operations.node_chain` operation.

.. seealso::

    - :mod:`~pySPACE.missions.nodes`
    - :ref:`node_list`
    - :mod:`~pySPACE.missions.operations.node_chain` operation

.. image:: ../../graphics/launch_live.png
   :width: 500

.. todo:: Documentation

This module extends/reimplements the original MDP flow class and
adds some methods like reset(), save(), etc.

Furthermore, it supports the construction of NodeChains and
also running them in parallel inside nodes (see :class:`SubflowHandler`).

MDP is distributed under the following BSD license::

    This file is part of Modular toolkit for Data Processing (MDP).
    All the code in this package is distributed under the following conditions:

    Copyright (c) 2003-2012, MDP Developers <mdp-toolkit-devel@lists.sourceforge.net>

    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:

        * Redistributions of source code must retain the above copyright
          notice, this list of conditions and the following disclaimer.
        * Redistributions in binary form must reproduce the above copyright
          notice, this list of conditions and the following disclaimer in the
          documentation and/or other materials provided with the distribution.
        * Neither the name of the Modular toolkit for Data Processing (MDP)
          nor the names of its contributors may be used to endorse or promote
          products derived from this software without specific prior written
          permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import sys
import os

if __name__ == '__main__':
    # add root of the code to system path
    file_path = os.path.dirname(os.path.abspath(__file__))
    pyspace_path = file_path[:file_path.rfind('pySPACE')-1]
    if not pyspace_path in sys.path:
        sys.path.append(pyspace_path)

import cPickle
import gc
import logging
import logging.handlers  # used for the SocketHandler in prepare_logging
import multiprocessing
import shutil
import socket
import time
import uuid
import yaml
import pySPACE
from pySPACE.tools.filesystem import create_directory
from pySPACE.tools.socket_utils import talk, inform
from pySPACE.tools.conversion import python2yaml, replace_parameters_and_convert, replace_parameters
import copy

import warnings
import traceback
import numpy

class CrashRecoveryException(Exception):
    """ Class to handle crash recovery """

    def __init__(self, *args):
        """ Allow crash recovery.

        Arguments: (error_string, crashing_obj, parent_exception)

        The crashing object is kept in self.crashing_obj.
        The triggering parent exception is kept in ``self.parent_exception``.
        """
        errstr = args[0]
        self.crashing_obj = args[1]
        self.parent_exception = args[2]
        super(CrashRecoveryException, self).__init__(errstr)

    def dump(self, filename=None):
        """ Save a pickle dump of the crashing object on *filename*

        If *filename* is None, the crash dump is saved on a file created by
        the tempfile module. Return the filename.
        """
        import cPickle
        import tempfile
        if filename is None:
            (fd, filename) = tempfile.mkstemp(suffix=".pic",
                                              prefix="NodeChainCrash_")
            fl = os.fdopen(fd, 'w+b', -1)
        else:
            fl = open(filename, 'w+b', -1)
        cPickle.dump(self.crashing_obj, fl)
        fl.close()
        return filename
class NodeChainException(Exception):
    """ Base class for exceptions in node chains """
    pass


class NodeChainExceptionCR(CrashRecoveryException, NodeChainException):
    """ Class to handle crash recovery """

    def __init__(self, *args):
        """ Allow crash recovery.

        Arguments: (error_string, flow_instance, parent_exception)

        The triggering parent exception is kept in self.parent_exception.
        If ``flow_instance._crash_recovery`` is set, save a crash dump of
        flow_instance on the file self.filename.
        """
        CrashRecoveryException.__init__(self, *args)
        rec = self.crashing_obj._crash_recovery
        errstr = args[0]
        if rec:
            if isinstance(rec, str):
                name = rec
            else:
                name = None
            name = CrashRecoveryException.dump(self, name)
            dumpinfo = '\nA crash dump is available on: "%s"' % name
            self.filename = name
            errstr = errstr + dumpinfo

        Exception.__init__(self, errstr)
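
# Illustrative sketch (not part of the original module): enabling crash
# recovery and inspecting the dump written when a NodeChainExceptionCR is
# raised. *node_chain* is assumed to be a NodeChain whose first node is a
# source node, so that train() can be called without arguments.
def _example_crash_recovery(node_chain):
    """ Train a chain with crash recovery enabled and inspect a crash dump """
    import cPickle
    node_chain.set_crash_recovery(True)  # dump the flow instance on crash
    try:
        node_chain.train()
    except NodeChainExceptionCR, exception:
        # the pickled flow can be reloaded for post-mortem inspection
        crashed_flow = cPickle.load(open(exception.filename, 'rb'))
        print "Original error:", exception.parent_exception
        return crashed_flow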
class NodeChain(object):
    """ Reimplement/overwrite mdp.Flow methods, e.g., for supervised learning """

    def __init__(self, node_sequence, crash_recovery=False, verbose=False):
        """ Creates the NodeChain based on the *node_sequence*

        .. note:: The NodeChain cannot be executed before all trainable
                  nodes have been trained, i.e. self.trained() == True.
        """
        self._check_nodes_consistency(node_sequence)
        self.flow = node_sequence
        self.verbose = verbose
        self.set_crash_recovery(crash_recovery)
        # Register the direct predecessor of a node as its input
        # (i.e. we assume linear flows)
        for i in range(len(node_sequence) - 1):
            node_sequence[i + 1].register_input_node(node_sequence[i])

        self.use_test_data = False
        # set a default run number
        self[-1].set_run_number(0)
        # give this flow a unique identifier
        self.id = str(uuid.uuid4())
        self.handler = None
        self.store_intermediate_results = True
    def train(self, data_iterators=None):
        """ Train NodeChain with data from iterator or source node

        The method can proceed in two different ways:

        *   If no data is provided, it is checked that the first node of
            the flow is a source node. If that is the case, the data provided
            by this node is passed forward through the flow. During this
            forward propagation, the flow is trained.
            The request of the data is done in the last node.

        *   If a list of data iterators is provided, it is checked that no
            source and split nodes are contained in the NodeChain.
            These nodes already come with their own data handling and should
            not be used when the training data is supplied externally.
            Furthermore, split nodes are only relevant for benchmarking.

            One iterator for each node has to be given. If only one iterator
            is given, or no list, it is mapped to a list with the same
            iterator for each node.

        .. note:: The iterator approach is normally not used in pySPACE,
                  because pySPACE supplies the data with special source nodes
                  and does the training automatically without explicit calls
                  on data samples. The approach came with MDP.

        .. todo:: The iterator approach needs some use cases and testing,
                  especially because it is not used in the normal setting.
        """
        if data_iterators is not None:
            # Check that no source and split nodes are contained in the node chain
            assert(not self[0].is_source_node()), \
                "Node chains with source nodes cannot be trained " \
                "with external data_iterators!"
            for node in self:
                assert(not node.is_split_node()), \
                    "Node chains with split nodes cannot be trained " \
                    "with external data_iterators!"
            # prepare iterables
            if not type(data_iterators) == list:
                data_iterators = [data_iterators] * len(self.flow)
            elif not len(data_iterators) == len(self.flow):
                data_iterators = [data_iterators] * len(self.flow)
            # Delegate to iterative training
            self.iter_train(data_iterators)
        else:  # Use the pySPACE train semantic and not the MDP type
            # Check if the first node of the node chain is a source node
            assert(self[0].is_source_node()), \
                "Training of a node chain without source node requires a " \
                "data_iterator argument!"
            # Training is accomplished by requesting the iterator
            # of the last node of the chain. This node will recursively call
            # the train method of all its predecessor nodes.
            # As soon as the first element is yielded the node has been trained.
            for _ in self[-1].request_data_for_training(
                    use_test_data=self.use_test_data):
                return
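
    # Illustrative sketch (not part of the original module) of the two ways
    # train() can be called; 'source_chain' and 'plain_chain' are assumed to
    # be NodeChain instances, the first one starting with a source node:
    #
    #   source_chain.train()                 # data is pulled from the source node
    #   plain_chain.train([iter_a, iter_b])  # one external iterator per node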
    def iter_train(self, data_iterables):
        """ Train all trainable nodes in the NodeChain with data from iterator

        *data_iterables* is a list of iterables, one for each node in the
        chain. The iterators returned by the iterables must return data
        arrays that are then used for the node training (so the data arrays
        are the *data* for the nodes).

        Note that the data arrays are processed by the nodes
        which are in front of the node that gets trained,
        so the data dimension must match the input dimension of the first node.

        If a node has only a single training phase then instead of an iterable
        you can alternatively provide an iterator (including generator-type
        iterators). For nodes with multiple training phases this is not
        possible, since the iterator cannot be restarted after the first
        iteration. For more information on iterators and iterables see
        http://docs.python.org/library/stdtypes.html#iterator-types .

        In the special case that *data_iterables* is one single array,
        it is used as the data array *x* for all nodes and training phases.

        Instead of a data array *x* the iterators can also return a list or
        tuple, where the first entry is *x* and the following are args for
        the training of the node (e.g., for supervised training).
        """
        data_iterables = self._train_check_iterables(data_iterables)

        # train each node successively
        for i in range(len(self.flow)):
            if self.verbose:
                print "Training node #%d (%s)" % (i, str(self.flow[i]))
            self._train_node(data_iterables[i], i)
            if self.verbose:
                print "Training finished"

        self._close_last_node()

    def trained(self):
        """ Returns whether the complete training is finished,
        i.e. if all nodes have been trained.
        """
        return self[-1].get_remaining_train_phase() == 0
    def execute(self, data_iterators=None):
        """ Process the data through all nodes """
        if data_iterators is not None:
            # Delegate to iterative execution
            return self.iter_execute(data_iterators)
        else:  # Use the evaluate semantic
            # Check if the first node of the flow is a source node
            assert (self[0].is_source_node()), \
                "Evaluation of a node chain without source node requires a " \
                "data_iterator argument!"
            # This is accomplished by calling the request_data_for_testing
            # method of the last node of the chain. This node will recursively
            # call the request_data_for_testing method of all its predecessor
            # nodes
            return self[-1].process()
    def iter_execute(self, iterable, nodenr=None):
        """ Process the data through all nodes in the chain up to *nodenr*

        *iterable* is an iterable or iterator (note that a list is also an
        iterable), which returns data arrays that are used as input.
        Alternatively, one can specify one data array as input.

        If *nodenr* is specified, the flow is executed only up to node
        number *nodenr*. This is equivalent to 'flow[:nodenr+1](iterable)'.

        .. note:: In contrast to MDP, results are not concatenated to one big
                  object. Each data object remains separate.
        """
        if isinstance(iterable, numpy.ndarray):
            return self._execute_seq(iterable, nodenr)
        res = []
        empty_iterator = True
        for x in iterable:
            empty_iterator = False
            res.append(self._execute_seq(x, nodenr))
        if empty_iterator:
            errstr = ("The execute data iterator is empty.")
            raise NodeChainException(errstr)
        return res

    def _inc_train(self, data, class_label=None):
        """ Iterate through the nodes to train them """
        for node in self:
            if node.is_retrainable() and not node.buffering and \
                    hasattr(node, "_inc_train"):
                if not node.retraining_phase:
                    node.retraining_phase = True
                    node.start_retraining()
                node._inc_train(data, class_label)
            if not (hasattr(self, "buffering") and self.buffering):
                data = node.execute(data)
            else:  # workaround to inherit meta data
                self.buffering = False
                data = node.execute(data)
                self.buffering = True
    def save(self, filename, protocol=-1):
        """ Save a pickled representation to *filename*

        If *filename* is None, return a string.

        .. note:: The pickled NodeChain is not guaranteed to be upward or
                  backward compatible.

        .. note:: Having C code in a node might cause problems with saving.
                  Therefore, the code has special handling for the
                  LibSVMClassifierNode.

        .. todo:: Intrinsic node methods for storing should be used.

        .. seealso:: :func:`store_node_chain`
        """
        if self[-1].__class__.__name__ in ["LibSVMClassifierNode"] \
                and self[-1].multinomial:
            indx = filename.find(".pickle")
            if indx != -1:
                self[-1].save_model(filename[0:indx] + '.model')
            else:
                self[-1].save_model(filename + '.model')
        import cPickle

        odict = self.__dict__.copy()  # copy the dict since we change it
        # Remove other non-picklable stuff
        remove_keys = []
        for key, value in odict.iteritems():
            if key == "input_node" or key == "flow":
                continue
            try:
                cPickle.dumps(value)
            except (ValueError, TypeError, cPickle.PicklingError):
                remove_keys.append(key)
        for key in remove_keys:
            odict.pop(key)
        self.__dict__ = odict

        if filename is None:
            return cPickle.dumps(self, protocol)
        else:
            # if protocol != 0 open the file in binary mode
            if protocol != 0:
                mode = 'wb'
            else:
                mode = 'w'
            flh = open(filename, mode)
            cPickle.dump(self, flh, protocol)
            flh.close()
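
    # Illustrative sketch (not part of the original module): persisting a
    # trained chain and loading it again; the file name is a hypothetical
    # placeholder and loading is shown here with plain cPickle.
    #
    #   trained_chain.save("trained_chain.pickle")
    #   import cPickle
    #   restored = cPickle.load(open("trained_chain.pickle", 'rb'))
    #   results = restored.execute(data_iterator)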
    def get_output_type(self, input_type, as_string=True):
        """ Returns the output type of the entire node chain

        Recursively iterate over nodes in flow
        """
        output = input_type
        for i in range(len(self.flow)):
            if i == 0:
                output = self.flow[i].get_output_type(
                    input_type, as_string=True)
            else:
                output = self.flow[i].get_output_type(output, as_string=True)

        if as_string:
            return output
        else:
            return self.string_to_class(output)

    @staticmethod
    def string_to_class(string_encoding):
        """ Given a string variable, outputs a class instance

        e.g. obtaining a TimeSeries
        """
        from pySPACE.resources.data_types.time_series import TimeSeries
        from pySPACE.resources.data_types.feature_vector import FeatureVector
        from pySPACE.resources.data_types.prediction_vector \
            import PredictionVector

        if "TimeSeries" in string_encoding:
            return TimeSeries
        elif "PredictionVector" in string_encoding:
            return PredictionVector
        elif "FeatureVector" in string_encoding:
            return FeatureVector
        else:
            raise NotImplementedError
    #################
    # MDP Code copy #
    #################

    def _propagate_exception(self, exception, nodenr):
        # capture exception. the traceback of the error is printed and a
        # new exception, containing the identity of the node in the NodeChain
        # is raised. Allow crash recovery.
        (etype, val, tb) = sys.exc_info()
        prev = ''.join(traceback.format_exception(exception.__class__,
                                                  exception, tb))
        act = "\n! Exception in node #%d (%s):\n" % (nodenr,
                                                     str(self.flow[nodenr]))
        errstr = ''.join(('\n', 40*'-', act, 'Node Traceback:\n', prev,
                          40*'-'))
        raise NodeChainExceptionCR(errstr, self, exception)
    def _train_node(self, data_iterable, nodenr):
        """ Train a single node in the flow

        nodenr -- index of the node in the flow
        """
        node = self.flow[nodenr]
        if (data_iterable is not None) and (not node.is_trainable()):
            # attempted to train a node although it is not trainable.
            # raise a warning and continue with the next node.
            # wrnstr = "\n! Node %d is not trainable" % nodenr + \
            #          "\nYou probably need a 'None' iterable for" + \
            #          " this node. Continuing anyway."
            # warnings.warn(wrnstr, UserWarning)
            return
        elif (data_iterable is None) and node.is_training():
            # None instead of iterable is passed to a training node
            err_str = ("\n! Node %d is training"
                       " but instead of iterable received 'None'." % nodenr)
            raise NodeChainException(err_str)
        elif (data_iterable is None) and (not node.is_trainable()):
            # skip training if node is not trainable
            return

        try:
            train_arg_keys = self._get_required_train_args(node)
            train_args_needed = bool(len(train_arg_keys))
            ## We leave the last training phase open for the
            ## CheckpointFlow class.
            ## Checkpoint functions must close it explicitly if needed!
            ## Note that the last training_phase is closed
            ## automatically when the node is executed.
            while True:
                empty_iterator = True
                for x in data_iterable:
                    empty_iterator = False
                    # the arguments following the first are passed only to the
                    # currently trained node, allowing the implementation of
                    # supervised nodes
                    if (type(x) is tuple) or (type(x) is list):
                        arg = x[1:]
                        x = x[0]
                    else:
                        arg = ()
                    # check if the required number of arguments was given
                    if train_args_needed:
                        if len(train_arg_keys) != len(arg):
                            err = ("Wrong number of arguments provided by " +
                                   "the iterable for node #%d " % nodenr +
                                   "(%d needed, %d given).\n" %
                                   (len(train_arg_keys), len(arg)) +
                                   "List of required argument keys: " +
                                   str(train_arg_keys))
                            raise NodeChainException(err)
                    # filter x through the previous nodes
                    if nodenr > 0:
                        x = self._execute_seq(x, nodenr - 1)
                    # train current node
                    node.train(x, *arg)
                if empty_iterator:
                    if node.get_current_train_phase() == 1:
                        err_str = ("The training data iteration for node "
                                   "no. %d could not be repeated for the "
                                   "second training phase, you probably "
                                   "provided an iterator instead of an "
                                   "iterable." % (nodenr + 1))
                        raise NodeChainException(err_str)
                    else:
                        err_str = ("The training data iterator for node "
                                   "no. %d is empty." % (nodenr + 1))
                        raise NodeChainException(err_str)
                self._stop_training_hook()
                # close the previous training phase
                node.stop_training()
                if node.get_remaining_train_phase() > 0:
                    continue
                else:
                    break
        except self.flow[-1].TrainingFinishedException, e:
            # attempted to train a node although its training phase is already
            # finished. raise a warning and continue with the next node.
            wrnstr = ("\n! Node %d training phase already finished"
                      " Continuing anyway." % nodenr)
            warnings.warn(wrnstr, UserWarning)
        except NodeChainExceptionCR, e:
            # this exception was already propagated,
            # probably during the execution of a node upstream in the flow
            (exc_type, val) = sys.exc_info()[:2]
            prev = ''.join(traceback.format_exception_only(e.__class__, e))
            prev = prev[prev.find('\n') + 1:]
            act = "\nWhile training node #%d (%s):\n" % (nodenr,
                                                         str(self.flow[nodenr]))
            err_str = ''.join(('\n', 40*'=', act, prev, 40*'='))
            raise NodeChainException(err_str)
        except Exception, e:
            # capture any other exception occurred during training.
            self._propagate_exception(e, nodenr)

    def _stop_training_hook(self):
        """ Hook method that is called before stop_training is called """
        pass
    @staticmethod
    def _get_required_train_args(node):
        """ Return arguments in addition to self and x for node.train

        Arguments that have a default value are ignored.
        """
        import inspect
        train_arg_spec = inspect.getargspec(node.train)
        train_arg_keys = train_arg_spec[0][2:]  # ignore self, x
        if train_arg_spec[3]:
            # subtract arguments with a default value
            train_arg_keys = train_arg_keys[:-len(train_arg_spec[3])]
        return train_arg_keys

    def _train_check_iterables(self, data_iterables):
        """ Return the data iterables after some checks and sanitizing

        Note that this method does not distinguish between iterables and
        iterators, so this must be taken care of later.
        """
        # verifies that the number of iterables matches that of
        # the signal nodes and multiplies them if needed.
        flow = self.flow

        # # if a single array is given wrap it in a list of lists,
        # # note that a list of 2d arrays is not valid
        # if isinstance(data_iterables, numpy.ndarray):
        #     data_iterables = [[data_iterables]] * len(flow)

        if not isinstance(data_iterables, list):
            err_str = ("'data_iterables' must be either a list of "
                       "iterables or an array, but got %s" %
                       str(type(data_iterables)))
            raise NodeChainException(err_str)

        # check that all elements are iterable
        for i, iterable in enumerate(data_iterables):
            if (iterable is not None) and (not hasattr(iterable, '__iter__')):
                err = ("Element number %d in the data_iterables"
                       " list is not an iterable." % i)
                raise NodeChainException(err)

        # check that the number of data_iterables is correct
        if len(data_iterables) != len(flow):
            err_str = ("%d data iterables specified,"
                       " %d needed" % (len(data_iterables), len(flow)))
            raise NodeChainException(err_str)

        return data_iterables
    def _close_last_node(self):
        if self.verbose:
            print "Close the training phase of the last node"
        try:
            self.flow[-1].stop_training()
        except self.flow[-1].TrainingFinishedException:
            pass
        except Exception, e:
            self._propagate_exception(e, len(self.flow) - 1)

    def set_crash_recovery(self, state=True):
        """ Set crash recovery capabilities

        When a node raises an Exception during training, execution, or
        inverse execution that the flow is unable to handle, a
        NodeChainExceptionCR is raised. If crash recovery is set, a crash
        dump of the flow instance is saved for later inspection. The original
        exception can be found as the 'parent_exception' attribute of the
        NodeChainExceptionCR instance.

        - If 'state' = False, disable crash recovery.
        - If 'state' is a string, the crash dump is saved on a file
          with that name.
        - If 'state' = True, the crash dump is saved on a file created by
          the tempfile module.
        """
        self._crash_recovery = state

    def _execute_seq(self, x, nodenr=None):
        """ Execute input data 'x' through the nodes 0..'nodenr' included

        If no *nodenr* is specified, the complete node chain is used for
        processing.
        """
        flow = self.flow
        if nodenr is None:
            nodenr = len(flow) - 1
        for node_index in range(nodenr + 1):
            try:
                x = flow[node_index].execute(x)
            except Exception, e:
                self._propagate_exception(e, node_index)
        return x

    def copy(self, protocol=None):
        """ Return a deep copy of the flow

        The protocol parameter should not be used.
        """
        import copy
        if protocol is not None:
            warnings.warn("protocol parameter to copy() is ignored",
                          DeprecationWarning, stacklevel=2)
        return copy.deepcopy(self)

    def __call__(self, iterable, nodenr=None):
        """ Calling an instance is equivalent to calling its 'execute' method """
        return self.iter_execute(iterable, nodenr=nodenr)
    ###### string representation

    def __str__(self):
        nodes = ', '.join([str(x) for x in self.flow])
        return '[' + nodes + ']'

    def __repr__(self):
        # this should look like a valid Python expression that
        # could be used to recreate an object with the same value
        # eval(repr(object)) == object
        name = type(self).__name__
        pad = len(name) + 2
        sep = ',\n' + ' ' * pad
        nodes = sep.join([repr(x) for x in self.flow])
        return '%s([%s])' % (name, nodes)

    ###### private container methods

    def __len__(self):
        return len(self.flow)

    def _check_dimension_consistency(self, out, inp):
        """ Raise ValueError when both dimensions are set and different """
        if (out is not None) and (inp is not None) and out != inp:
            errstr = "dimensions mismatch: %s != %s" % (str(out), str(inp))
            raise ValueError(errstr)

    def _check_nodes_consistency(self, flow=None):
        """ Check the dimension consistency of a list of nodes """
        if flow is None:
            flow = self.flow
        len_flow = len(flow)
        for i in range(1, len_flow):
            out = flow[i - 1].output_dim
            inp = flow[i].input_dim
            self._check_dimension_consistency(out, inp)

    def _check_value_type_isnode(self, value):
        if not isinstance(value, pySPACE.missions.nodes.base_node.BaseNode):
            raise TypeError("flow item must be a Node instance")

    def __getitem__(self, key):
        if isinstance(key, slice):
            flow_slice = self.flow[key]
            self._check_nodes_consistency(flow_slice)
            return self.__class__(flow_slice)
        else:
            return self.flow[key]

    def __setitem__(self, key, value):
        if isinstance(key, slice):
            [self._check_value_type_isnode(item) for item in value]
        else:
            self._check_value_type_isnode(value)
        # make a copy of the list
        flow_copy = list(self.flow)
        flow_copy[key] = value
        # check dimension consistency
        self._check_nodes_consistency(flow_copy)
        # if no exception was raised, accept the new sequence
        self.flow = flow_copy

    def __delitem__(self, key):
        # make a copy of the list
        flow_copy = list(self.flow)
        del flow_copy[key]
        # check dimension consistency
        self._check_nodes_consistency(flow_copy)
        # if no exception was raised, accept the new sequence
        self.flow = flow_copy

    def __contains__(self, item):
        return self.flow.__contains__(item)

    def __iter__(self):
        return self.flow.__iter__()

    def __add__(self, other):
        # append other to self
        if isinstance(other, NodeChain):
            flow_copy = list(self.flow).__add__(other.flow)
            # check dimension consistency
            self._check_nodes_consistency(flow_copy)
            # if no exception was raised, accept the new sequence
            return self.__class__(flow_copy)
        elif isinstance(other, pySPACE.missions.nodes.base_node.BaseNode):
            flow_copy = list(self.flow)
            flow_copy.append(other)
            # check dimension consistency
            self._check_nodes_consistency(flow_copy)
            # if no exception was raised, accept the new sequence
            return self.__class__(flow_copy)
        else:
            err_str = ('can only concatenate flow or node'
                       ' (not \'%s\') to flow' % (type(other).__name__))
            raise TypeError(err_str)

    def __iadd__(self, other):
        # append other to self
        if isinstance(other, NodeChain):
            self.flow += other.flow
        elif isinstance(other, pySPACE.missions.nodes.base_node.BaseNode):
            self.flow.append(other)
        else:
            err_str = ('can only concatenate flow or node'
                       ' (not \'%s\') to flow' % (type(other).__name__))
            raise TypeError(err_str)
        self._check_nodes_consistency(self.flow)
        return self
    ###### public container methods

    def append(self, x):
        """flow.append(node) -- append node to flow end"""
        self[len(self):len(self)] = [x]

    def extend(self, x):
        """flow.extend(iterable) -- extend flow by appending
        elements from the iterable"""
        if not isinstance(x, NodeChain):
            err_str = ('can only concatenate flow'
                       ' (not \'%s\') to flow' % (type(x).__name__))
            raise TypeError(err_str)
        self[len(self):len(self)] = x

    def insert(self, i, x):
        """flow.insert(index, node) -- insert node before index"""
        self[i:i] = [x]

    def pop(self, i=-1):
        """flow.pop([index]) -> node -- remove and return node at index
        (default last)"""
        x = self[i]
        del self[i]
        return x

    def reset(self):
        """ Reset the flow and obey permanent_attributes where available

        Method was moved to the end of the class code, due to program
        environment problems which needed the __getitem__ method beforehand.
        """
        for i in range(len(self)):
            self[i].reset()
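
# Illustrative sketch (not part of the original module): constructing a plain
# NodeChain and training/executing it with external data. The node objects
# are hypothetical placeholders; in a real setup they are pySPACE nodes,
# usually created via NodeChainFactory below.
def _example_node_chain_usage(preprocessing_node, classifier_node,
                              training_samples, test_samples):
    """ Train a two-node chain with labeled data and execute it afterwards

    *training_samples* is assumed to be a list of (data_array, label) tuples,
    *test_samples* a list of data arrays, and *classifier_node* a supervised
    node whose train method accepts the label as an additional argument.
    """
    chain = NodeChain([preprocessing_node, classifier_node])
    # one iterable per node; the supervised classifier receives the labels
    # as additional training argument, the preprocessing node only the data
    chain.iter_train([[x for x, _ in training_samples], training_samples])
    # after training (chain.trained() is True) the chain can process new data
    return chain.iter_execute(test_samples)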
class BenchmarkNodeChain(NodeChain):
    """ This subclass overwrites the train method in order to provide a more
    convenient way of doing supervised learning.
    Furthermore, it contains a benchmark method that takes care of logging,
    setting of run numbers, delivering the result collection, and handling of
    source and sink nodes.

    :Author: Jan Hendrik Metzen (jhm@informatik.uni-bremen.de)
    :Created: 2008/08/18
    """

    def __init__(self, node_sequence):
        """ Creates the BenchmarkNodeChain based on the *node_sequence* """
        super(BenchmarkNodeChain, self).__init__(node_sequence)
        # Each BenchmarkNodeChain must start with a source node
        # and end with a sink node
        assert(self[0].is_source_node()), \
            "A benchmark flow must start with a source node"
        assert(self[-1].is_sink_node()), \
            "A benchmark flow must end with a sink node"
    def use_next_split(self):
        """ Use the next split of the data into training and test data

        This method is useful for pySPACE benchmarking.
        """
        # This is handled by calling use_next_split() of the last node of
        # the flow which will recursively call predecessor nodes in the flow
        # until a node is found that handles the splitting
        return self[-1].use_next_split()

    def benchmark(self, input_collection, run=0,
                  persistency_directory=None, store_node_chain=False):
        """ Perform the benchmarking of this data flow with the given collection

        Benchmarking is accomplished by iterating through all splits of the
        data into training and test data.

        **Parameters**:

            :input_collection:
                A sequence of data/label-tuples that serves as a generator or
                a BaseDataset which contains the data to be processed.

            :run:
                The current run which defines all random seeds within the
                flow.

            :persistency_directory:
                Optional information of the nodes as well as the trained
                node chain (if *store_node_chain* is not False) are stored to
                the given *persistency_directory*.

            :store_node_chain:
                If True, the trained flow is stored to
                *persistency_directory*. If *store_node_chain* is a tuple of
                length 2 --- let's say (i1, i2) --- only the subflow starting
                at the i1-th node and ending at the (i2-1)-th node is stored.
                This may be useful when the stored flow should be used in an
                ensemble.
        """
        # Inform the first node of this flow about the input collection
        if hasattr(input_collection, '__iter__'):
            # assume a generator is given
            self[0].set_generator(input_collection)
        else:  # assume a BaseDataset
            self[0].set_input_dataset(input_collection)

        # Inform all nodes recursively about the number of the current run
        self[-1].set_run_number(int(run))
        # set temp file folder
        if persistency_directory != None:
            self[-1].set_temp_dir(persistency_directory + os.sep + "temp_dir")

        split_counter = 0

        # For every split of the dataset
        while True:  # As long as more splits are available
            # Compute the results for the current split
            # by calling the method on its last node
            self[-1].process_current_split()

            if persistency_directory != None:
                if store_node_chain:
                    self.store_node_chain(
                        persistency_directory + os.sep +
                        "node_chain_sp%s.pickle" % split_counter,
                        store_node_chain)

                # Store nodes that should be persistent
                self.store_persistent_nodes(persistency_directory)

            # If no more splits are available
            if not self.use_next_split():
                break

            split_counter += 1

        # print "Input benchmark"
        # print gc.get_referrers(self[0].collection)
        # During the flow numerous pointers to the flow are created but not
        # deleted, so memory is not freed (this can be seen with the commented
        # lines above). Therefore we now free the input collection; only then
        # can the garbage collector free the memory. Otherwise, for reasons
        # not yet found, the pointers to the input collection remain even
        # between processes.
        if hasattr(input_collection, '__iter__'):
            self[0].set_generator(None)
        else:
            self[0].set_input_dataset(None)
        gc.collect()
        # Return the result collection of this flow
        return self[-1].get_result_dataset()
    def __call__(self, iterable=None, train_instances=None, runs=[]):
        """ Call *execute* or *benchmark* and return (id, PerformanceResultSummary)

        If *iterable* is given, calling an instance is equivalent to calling
        its 'execute' method.
        If *train_instances* and *runs* are given, 'benchmark' is called for
        every run number specified and the results are merged. This is useful
        for e.g. parallel execution of subflows with the multiprocessing
        module, since instance methods cannot be serialized in Python but
        whole objects can.
        """
        if iterable != None:
            return self.execute(iterable)
        elif train_instances != None and runs != []:  # parallelization case
            # we have to reinitialize logging cause otherwise deadlocks occur
            # when parallelization is done via multiprocessing.Pool
            self.prepare_logging()
            for ind, run in enumerate(runs):
                result = self.benchmark(train_instances, run=run)
                if ind == 0:
                    result_collection = result
                else:
                    result_collection.data.update(result.data)
                # reset node chain for new training if another call of
                # :func:`benchmark` is expected.
                if not ind == len(runs) - 1:
                    self.reset()
            self.clean_logging()
            return (self.id, result_collection)
        else:
            import warnings
            warnings.warn("__call__ method needs at least one parameter (data)")
            return None

    def store_node_chain(self, result_dir, store_node_chain):
        """ Pickle this flow into *result_dir* for later usage """
        if isinstance(store_node_chain, basestring):
            store_node_chain = eval(store_node_chain)
        if isinstance(store_node_chain, tuple):
            assert(len(store_node_chain) == 2)
            # Keep only the subflow starting at the i1-th node and ending at
            # the (i2-1)-th node.
            flow = NodeChain(self.flow[store_node_chain[0]:store_node_chain[1]])
        elif isinstance(store_node_chain, list):
            # Keep only nodes with indices contained in the list;
            # nodes have to be copied, otherwise input_node-refs of the
            # current flow are overwritten
            from copy import copy
            store_node_list = [copy(node) for ind, node in enumerate(self.flow)
                               if ind in store_node_chain]
            flow = NodeChain(store_node_list)
        else:
            # Per default, get rid of source and sink nodes
            flow = NodeChain(self.flow[1:-1])
        input_node = flow[0].input_node
        flow[0].input_node = None
        flow.save(result_dir)
    def prepare_logging(self):
        """ Set up logging

        This method is only needed if one forks subflows, i.e. to execute
        them via multiprocessing.Pool.
        """
        # Prepare remote logging
        root_logger = logging.getLogger("%s-%s" % (socket.gethostname(),
                                                   os.getpid()))
        root_logger.setLevel(logging.DEBUG)
        root_logger.propagate = False

        if len(root_logger.handlers) == 0:
            self.handler = logging.handlers.SocketHandler(
                socket.gethostname(),
                logging.handlers.DEFAULT_TCP_LOGGING_PORT)
            root_logger.addHandler(self.handler)

    def clean_logging(self):
        """ Remove logging handlers if existing

        Call this method only if you have called *prepare_logging* before.
        """
        # Remove potential logging handlers
        if self.handler is not None:
            self.handler.close()
            root_logger = logging.getLogger("%s-%s" % (socket.gethostname(),
                                                       os.getpid()))
            root_logger.removeHandler(self.handler)

    def store_persistent_nodes(self, result_dir):
        """ Store all nodes that should be persistent """
        # For all nodes
        for index, node in enumerate(self):
            # Store them in the result dir if they enabled storing
            node.store_state(result_dir, index)
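
# Illustrative sketch (not part of the original module): benchmarking a flow
# over several runs via __call__, as it is done for parallel subflow
# execution. The dataset object and the node chain are assumed to be provided
# by the caller (e.g. created with NodeChainFactory.flow_from_yaml below).
def _example_benchmarking(benchmark_node_chain, dataset):
    """ Run the given BenchmarkNodeChain on *dataset* for two runs """
    # every run uses different random seeds inside the flow;
    # the per-run result datasets are merged into one collection
    flow_id, result_collection = benchmark_node_chain(
        train_instances=dataset, runs=[0, 1])
    return result_collection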
class NodeChainFactory(object):
    """ Provide static methods to create and instantiate data flows

    :Author: Jan Hendrik Metzen (jhm@informatik.uni-bremen.de)
    :Created: 2009/01/26
    """

    @staticmethod
    def flow_from_yaml(Flow_Class, flow_spec):
        """ Creates a Flow object

        Reads from the given *flow_spec*, which should be a valid YAML
        specification of a NodeChain object, and returns this dataflow
        object.

        **Parameters**

            :Flow_Class:
                The class name of the node chain to create. Valid are
                'NodeChain' and 'BenchmarkNodeChain'.

            :flow_spec:
                A valid YAML specification stream; this could be a file
                object, a string representation of the YAML file or the
                Python representation of the YAML file (list of dicts).
        """
        from pySPACE.missions.nodes.base_node import BaseNode
        # Reads and parses the YAML file if necessary
        if type(flow_spec) != list:
            dataflow_spec = yaml.load(flow_spec)
        else:
            dataflow_spec = flow_spec
        node_sequence = []
        # For all nodes of the flow
        for node_spec in dataflow_spec:
            # Use the factory method to create the node
            node_obj = BaseNode.node_from_yaml(node_spec)
            # Append this node to the sequence of nodes
            node_sequence.append(node_obj)
        # Check if the nodes have to cache their outputs
        for index, node in enumerate(node_sequence):
            # If a node is trainable, it uses the outputs of its input node
            # at least twice, so we have to cache.
            if node.is_trainable():
                node_sequence[index - 1].set_permanent_attributes(caching=True)
            # Split nodes might also request the data from their input nodes
            # (once for each split), depending on their implementation. We
            # assume the worst case and activate caching.
            if node.is_split_node():
                node_sequence[index - 1].set_permanent_attributes(caching=True)

        # Create the flow based on the node sequence and the given flow class
        # and return it
        return Flow_Class(node_sequence)
    @staticmethod
    def instantiate(template, parametrization):
        """ Instantiate a template recursively for the given parameterization

        Instantiate means to replace the parameter in the template by the
        chosen value.

        **Parameters**

            :template:
                A dictionary with key-value pairs, where values might contain
                parameter keys which have to be replaced. A typical example
                of a template would be a Python representation of a node read
                from YAML.

            :parametrization:
                A dictionary with parameter names as keys and exactly one
                value for each parameter as value.
        """
        instance = {}
        for key, value in template.iteritems():
            if value in parametrization.keys():  # Replacement
                instance[key] = parametrization[value]
            elif isinstance(value, dict):  # Recursive call
                instance[key] = NodeChainFactory.instantiate(value,
                                                             parametrization)
            elif isinstance(value, basestring):  # String replacement
                for param_key, param_value in parametrization.iteritems():
                    try:
                        value = value.replace(param_key, repr(param_value))
                    except:
                        value = value.replace(param_key,
                                              python2yaml(param_value))
                instance[key] = value
            elif hasattr(value, "__iter__"):
                # Iterate over all items in the sequence
                instance[key] = []
                for iter_item in value:
                    if iter_item in parametrization.keys():  # Replacement
                        instance[key].append(parametrization[iter_item])
                    elif isinstance(iter_item, dict):
                        instance[key].append(NodeChainFactory.instantiate(
                            iter_item, parametrization))
                    elif isinstance(iter_item, basestring):
                        # String replacement
                        for param_key, param_value in \
                                parametrization.iteritems():
                            try:
                                iter_item = iter_item.replace(
                                    param_key, repr(param_value))
                            except:
                                iter_item = iter_item.replace(
                                    param_key, python2yaml(param_value))
                        instance[key].append(iter_item)
                    else:
                        instance[key].append(iter_item)
            else:  # Not parameterized
                instance[key] = value
        return instance
    @staticmethod
    def replace_parameters_in_node_chain(node_chain_template, parametrization):
        node_chain_template = copy.copy(node_chain_template)
        if parametrization == {}:
            return node_chain_template
        elif type(node_chain_template) == list:
            return [NodeChainFactory.instantiate(
                template=node, parametrization=parametrization)
                for node in node_chain_template]
        elif isinstance(node_chain_template, basestring):
            node_chain_template = \
                replace_parameters(node_chain_template, parametrization)
        return node_chain_template
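
# Illustrative sketch (not part of the original module): building a
# BenchmarkNodeChain from a YAML node chain specification and filling in a
# template parameter beforehand. The node names and the parameter
# '__TRAIN_RATIO__' are hypothetical placeholders; valid node names are
# listed in the pySPACE node documentation.
def _example_flow_from_yaml():
    """ Create a flow from a parameterized YAML string """
    flow_template = """
-   node : FeatureVectorSourceNode
-   node : TrainTestSplitterNode
    parameters :
        train_ratio : __TRAIN_RATIO__
-   node : LibSVMClassifierNode
-   node : PerformanceSinkNode
"""
    # replace the template parameter in the YAML string ...
    flow_spec = NodeChainFactory.replace_parameters_in_node_chain(
        flow_template, {"__TRAIN_RATIO__": 0.5})
    # ... and construct the benchmark flow from the resulting specification
    return NodeChainFactory.flow_from_yaml(BenchmarkNodeChain, flow_spec)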
class SubflowHandler(object):
    """ Interface for nodes to generate and execute subflows (subnode-chains)

    A subflow means a node chain used inside a node for processing data.

    This class provides functions that can be used by nodes to generate and
    execute subflows. It thereby serves as a communication daemon to the
    backend (if one is used).

    Most important when inheriting from this class is that the subclass MUST
    be a node. The reason is that this class uses node functionality, e.g.
    logging, the *temp_dir*-variable and so on.

    **Parameters**

        :processing_modality:
            One of the valid strings: 'backend', 'serial', 'local'.

                :backend:
                    The current backend's modality is used. This is
                    implemented at the moment only for the
                    'LoadlevelerBackend' and the 'LocalBackend'.

                :serial:
                    All subflows are executed sequentially, i.e. one after
                    the other.

                :local:
                    Subflows are executed in a Pool using *pool_size* CPUs.
                    This may also be needed when no backend is used.

            (*optional, default: 'serial'*)

        :pool_size:
            If parallelization is based on using several processes on a local
            system in parallel, e.g. option 'backend' with the
            :class:`pySPACEMulticoreBackend` or option 'local', the number of
            worker processes for subflow evaluation has to be specified.

            .. note:: When using the LocalBackend, there is also the
                      possibility to specify the pool size of parallel
                      executed processes, e.g. data sets. Your total number
                      of CPUs should be pool size (pySPACE) +
                      pool size (subflows).

            (*optional, default: 2*)

        :batch_size:
            If parallelization of subflow execution is done together with the
            :class:`~pySPACE.environments.backends.ll_backend.LoadLevelerBackend`,
            *batch_size* determines how many subflows are executed in one
            serial LoadLeveler job. This option is useful if the execution of
            a single subflow is really short (in the range of seconds) since
            there is significant overhead in creating new jobs.

            (*optional, default: 1*)

    :Author: Anett Seeland (anett.seeland@dfki.de)
    :Created: 2012/09/04
    :LastChange: 2012/11/06 batch_size option added
    """

    def __init__(self, processing_modality='serial', pool_size=2,
                 batch_size=1, **kwargs):
        self.modality = processing_modality
        self.pool_size = int(pool_size)
        self.batch_size = int(batch_size)
        # a flag to send pool_size / batch_size only once to the backend
        self.already_send = False
        self.backend_com = None
        self.backend_name = None
        # to indicate the end of a message received over a socket
        self.end_token = '!END!'

        if processing_modality not in ["serial", "local", "backend"]:
            import warnings
            warnings.warn("Processing modality not found! Serial mode is used!")
            self.modality = 'serial'
    @staticmethod
    def generate_subflow(flow_template, parametrization=None, flow_class=None):
        """ Return a *flow_class* object of the given *flow_template*

        This method wraps two function calls (NodeChainFactory.instantiate
        and NodeChainFactory.flow_from_yaml).

        **Parameters**

            :flow_template:
                List of dicts - a valid representation of a node chain.
                Alternatively, a YAML-string representation could be used,
                which simplifies parameter replacement.

            :parametrization:
                A dictionary with parameter names as keys and exactly one
                value for each parameter as value. Passed to
                NodeChainFactory.instantiate.

                (*optional, default: None*)

            :flow_class:
                The flow class name of which an object should be returned.

                (*optional, default: BenchmarkNodeChain*)
        """
        if flow_class is None:
            flow_class = BenchmarkNodeChain
        flow_spec = NodeChainFactory.replace_parameters_in_node_chain(
            flow_template, parametrization)
        # create a new benchmark flow
        flow = NodeChainFactory.flow_from_yaml(flow_class, flow_spec)
        return flow
    def execute_subflows(self, train_instances, subflows, run_numbers=None):
        """ Execute subflows and return the result collection.

        **Parameters**

            :train_instances:
                List of training instances which should be used to execute
                *subflows*.

            :subflows:
                List of BenchmarkNodeChain objects.

                .. note:: Note that every subflow object is stored in memory!

            :run_numbers:
                All subflows will be executed with every run number specified
                in this list. If None, the current self.run_number (from the
                node class) is used.

                (*optional, default: None*)
        """
        if run_numbers == None:
            run_numbers = [self.run_number]
        # in case of a serial backend, modality is mapped to serial;
        # in the other cases communication must be set up and
        # jobs need to be submitted to the backend
        if self.modality == 'backend':
            self.backend_com = pySPACE.configuration.backend_com
            if not self.backend_com is None:
                # ask for backend_name
                # create a socket and keep it alive as long as possible since
                # handshaking really costs time
                client_socket = socket.socket(socket.AF_INET,
                                              socket.SOCK_STREAM)
                client_socket.connect(self.backend_com)
                client_socket, self.backend_name = talk(
                    'name' + self.end_token, client_socket, self.backend_com)
            else:
                import warnings  # necessary for serial backend!
                warnings.warn("Seems that no backend is used! Modality of "
                              "subflow execution has to be specified! "
                              "Assuming serial backend.")
                self.backend_name = 'serial'
            self._log("Preparing subflows for backend execution.")
            if self.backend_name in ['loadl', 'mcore']:
                # we have to pickle the training instances and store them on disk
                store_path = os.path.join(self.temp_dir,
                                          "sp%d" % self.current_split)
                create_directory(store_path)
                filename = os.path.join(store_path, "subflow_data.pickle")
                if not os.path.isfile(filename):
                    cPickle.dump(train_instances, open(filename, 'wb'),
                                 protocol=cPickle.HIGHEST_PROTOCOL)
                subflows_to_compute = [subflows[ind].id
                                       for ind in range(len(subflows))]
                if self.backend_name == 'loadl':
                    # send batch_size to backend if not already done
                    if not self.already_send:
                        client_socket = inform(
                            "subflow_batchsize;%d%s" % (self.batch_size,
                                                        self.end_token),
                            client_socket, self.backend_com)
                        self.already_send = True
                    for subflow in subflows:
                        cPickle.dump(subflow,
                                     open(os.path.join(store_path,
                                                       subflow.id + ".pickle"),
                                          "wb"),
                                     protocol=cPickle.HIGHEST_PROTOCOL)
                    send_flows = subflows_to_compute
                else:  # backend_name == mcore
                    # send pool_size to backend if not already done
                    if not self.already_send:
                        client_socket = inform(
                            "subflow_poolsize;%d%s" % (self.pool_size,
                                                       self.end_token),
                            client_socket, self.backend_com)
                        self.already_send = True
                    # send flow objects via socket
                    send_flows = [cPickle.dumps(subflow,
                                                cPickle.HIGHEST_PROTOCOL)
                                  for subflow in subflows]
                # inform backend
                client_socket, msg = talk(
                    'execute_subflows;%s;%d;%s;%s%s' % (store_path,
                                                        len(subflows),
                                                        str(send_flows),
                                                        str(run_numbers),
                                                        self.end_token),
                    client_socket, self.backend_com)
                time.sleep(10)
                not_finished_subflows = set(subflows_to_compute)
                while len(not_finished_subflows) != 0:
                    # ask backend for finished jobs
                    client_socket, msg = talk(
                        'is_ready;%d;%s%s' % (len(not_finished_subflows),
                                              str(not_finished_subflows),
                                              self.end_token),
                        client_socket, self.backend_com)
                    # parse message
                    finished_subflows = eval(msg)  # should be a set
                    # set difference
                    not_finished_subflows -= finished_subflows
                    time.sleep(10)

                if self.backend_name == 'loadl':
                    # read results and delete store_dir
                    result_pattern = os.path.join(store_path,
                                                  '%s_result.pickle')
                    result_collections = [
                        cPickle.load(open(result_pattern % subflows[ind].id,
                                          'rb'))
                        for ind in range(len(subflows))]
                    # ..todo:: check if errors have occurred and if so do not delete!
                    shutil.rmtree(store_path)
                else:  # backend_name == mcore
                    # ask backend to send results
                    client_socket, msg = talk(
                        "send_results;%s!END!" % subflows_to_compute,
                        client_socket, self.backend_com)
                    # should be a list of collections
                    results = eval(msg)
                    result_collections = [cPickle.loads(result)
                                          for result in results]
                self._log("Finished subflow execution.")
                client_socket.shutdown(socket.SHUT_RDWR)
                client_socket.close()
                return result_collections
            elif self.backend_name == 'serial':
                # do the same as for modality == 'serial'
                self.modality = 'serial'
            else:  # e.g. mpi backend
                import warnings
                warnings.warn("Subflow handling with %s backend not supported,"
                              " serial-modality is used!" % self.backend_name)
                self.modality = 'serial'
        if self.modality == 'serial':
            # serial execution
            # .. note:: the flows executed here can not store anything;
            #           meta data of the result collection is NOT updated!
            results = [subflow(train_instances=train_instances,
                               runs=run_numbers) for subflow in subflows]
            result_collections = [result[1] for result in results]
            return result_collections
        else:  # modality 'local', e.g. usage without backend in the application case
            self._log("Subflow Handler starts processes in pool.")
            pool = multiprocessing.Pool(processes=self.pool_size)
            results = [pool.apply_async(func=subflow,
                                        kwds={"train_instances": train_instances,
                                              "runs": run_numbers})
                       for subflow in subflows]
            pool.close()
            self._log("Waiting for parallel processes to finish.")
            pool.join()
            result_collections = [result.get()[1] for result in results]
            del pool
            return result_collections
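
# Illustrative sketch (not part of the original module): how a node that
# inherits from SubflowHandler might generate and execute subflows. The node
# names, the 'complexity' parameter and the '__C__' placeholder are
# hypothetical; only generate_subflow and execute_subflows are part of the
# interface above.
def _example_subflow_evaluation(handler_node, train_instances):
    """ Evaluate two parameter settings of a hypothetical subflow template

    *handler_node* is assumed to be a node inheriting from SubflowHandler,
    *train_instances* a list of (data, label) training samples.
    """
    flow_template = [
        {"node": "ExternalGeneratorSourceNode"},
        {"node": "CrossValidationSplitterNode"},
        {"node": "LibSVMClassifierNode",
         "parameters": {"complexity": "__C__"}},
        {"node": "PerformanceSinkNode"},
    ]
    subflows = [SubflowHandler.generate_subflow(flow_template, {"__C__": c})
                for c in [0.1, 1.0]]
    # execution honors the handler's processing_modality ('serial', 'local'
    # or 'backend') and returns one result collection per subflow
    return handler_node.execute_subflows(train_instances, subflows,
                                         run_numbers=[0])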