Source code for pySPACE.tools.csv_analysis

""" Deal with csv files in general, and in particular after classification

The functions provided here focus on two issues:

    1) Manipulation of csv files (load, save, change)
    2) Repair csv files after unsuccessful classification,
       e.g. to be able to perform an analysis operation

**Examples**

    1) Loading csv file, extracting relevant data, saving new csv file:

        *Problem*:

        A csv file exists, but it is huge and you only need certain
        values, namely all entries with

            - Parameter __Range__=500
            - Parameter __Start__=100
            - Parameter __SamplingFreq__=25

        *Solution*:

        .. code-block:: python

            import csv_analysis
            data=csv_analysis.csv2dict('results.csv')
            conditions=csv_analysis.empty_dict(data)
            conditions['__Range__'].append('500')
            conditions['__Start__'].append('100')
            conditions['__SamplingFreq__'].append('25')
            new_dict=csv_analysis.strip_dict(data, conditions)
            csv_analysis.dict2csv('new_results.csv', new_dict)

    2) Build results.csv after classification failure and complement with reconstructed conditions:

        *Problem*:

        A classification procedure failed or has been aborted. What is needed is a
        procedure that

            (i)     builds a results.csv from conditions that were ready
            (ii)    identifies the conditions which were not ready
            (iii)   reconstructs missing conditions according to parameters
                    inferable from the path and user-defined default values
                    (e.g. AUC=0.5 and F_measure=0)
            (iv)    merges the reconstructed conditions into the existing
                    results and saves.

        *Short solution*:

        .. code-block:: python

            from pySPACE.tools import csv_analysis
            from pySPACE.resources.dataset_defs.performance_result import PerformanceResultSummary
            mydefaults=dict()
            mydefaults['AUC']=0.5
            mydefaults['F_measure']=0
            PerformanceResultSummary.repair_csv(datapath, default_dict=mydefaults)

        *Long solution*:

        .. code-block:: python

            import csv_analysis
            from pySPACE.resources.dataset_defs.performance_result import PerformanceResultSummary
            num_splits=52
            PerformanceResultSummary.merge_performance_results(datapath)
            csv_dict = csv_analysis.csv2dict(datapath + '/results.csv')
            oplist=csv_analysis.check_op_libSVM(datapath)
            failures = csv_analysis.report_failures(oplist, num_splits)
            mydefaults=dict()
            mydefaults['AUC']=0.5
            mydefaults['F_measure']=0
            final_dict=csv_analysis.reconstruct_failures(csv_dict, failures,
                                                num_splits, default_dict=mydefaults)
            csv_analysis.dict2csv(datapath + '/repaired_results.csv', final_dict)

:Author: Sirko Straube (sirko.straube@dfki.de), Mario Krell,
         Anett Seeland, David Feess
:Created: 2010/11/09
"""

def csv2dict(filename, filter_keys=None, delimiter=',', **kwargs):
    """ Load a csv file and return content in a dictionary

    The dictionary has n list elements, with n being equal to the number of
    columns in the csv file. Additional keyword arguments are passed to the
    reader instance, e.g. a different delimiter than ','
    (see csv.DictReader).

    **Parameters**

        :filename:
            Contains the filename as a string.

        :filter_keys:
            If a list of filter keys is specified, only the specified keys
            are kept and all others are discarded.

        :delimiter:
            The delimiter between columns in the csv. Defaults to ',', as
            csv actually stands for comma separated values, but sometimes
            different symbols are used.

    :Author: Sirko Straube (sirko.straube@dfki.de)
    :Created: 2010/11/09
    """
    import csv
    from collections import defaultdict

    csv_file = open(filename)
    csvDictReader = csv.DictReader(csv_file, delimiter=delimiter, **kwargs)

    data_dict = defaultdict(list)
    for line_dict in csvDictReader:
        for key, value in line_dict.iteritems():
            if not filter_keys:
                data_dict[key].append(value)
            elif key in filter_keys or key.startswith("__"):
                data_dict[key].append(value)

    csv_file.close()
    return data_dict

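# Example usage (an illustrative sketch; 'results.csv' and the column name
# 'Metric' are hypothetical):
#
#     data = csv2dict('results.csv', filter_keys=['Metric'])
#     # columns whose names start with '__' are always kept, so both
#     # data['Metric'] and e.g. data['__Dataset__'] are lists of strings
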
def dict2csv(filename, data_dict, delimiter=','):
    """ Write a dictionary to a csv file in a sorted way

    The function converts the dictionary into a list of dictionaries, with
    each entry representing one row in the final csv file. The dictionary
    can be of the form returned by csv2dict. The sorting is in alphabetic
    (ASCII) order, i.e. uppercase characters come first, and variables
    starting with '__' are put first.

    **Parameters**

        :filename:
            Contains the filename as a string.

        :data_dict:
            Dictionary containing data as a dictionary of lists (one list
            for each column identified by the key).

        :delimiter:
            The delimiter between columns in the csv. Defaults to ',', as
            csv actually stands for comma separated values, but sometimes
            different symbols are used.

    :Author: Sirko Straube, Mario Krell
    :Created: 2010/11/09
    """
    # init
    import csv
    import copy

    csv_file = open(filename, 'w')
    final_list = []

    # sorting of keys: '__' parameters first, then the rest alphabetically
    temp_keys = sorted(copy.deepcopy(data_dict.keys()))
    keys = [key for key in temp_keys if key.startswith("__")]
    for key in keys:
        temp_keys.remove(key)
    for key in temp_keys:
        keys.append(key)
    del temp_keys

    # check for consistency (delete columns with wrong length)
    l = len(data_dict[keys[0]])
    remove_keys = []
    for key in keys:
        if not len(data_dict[key]) == l:
            import warnings
            warnings.warn("Different length of columns with names %s "
                          "(deleted) and %s (reference)." % (key, keys[0]))
            data_dict.pop(key)
            remove_keys.append(key)
    for key in remove_keys:
        keys.remove(key)

    # make a list of dictionaries, one for each row
    for current_line in range(l):
        ldict = {}
        for key in keys:
            ldict[key] = data_dict[key][current_line]
        final_list.append(ldict)

    # save it
    csvDictWriter = csv.DictWriter(csv_file, quoting=csv.QUOTE_ALL,
                                   fieldnames=keys, delimiter=delimiter,
                                   lineterminator='\n')
    csvDictWriter.writerow(dict(zip(keys, keys)))
    csvDictWriter.writerows(final_list)
    csv_file.close()

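# Example usage (a minimal round trip; the filenames are hypothetical):
#
#     data = csv2dict('results.csv')
#     dict2csv('results_sorted.csv', data)   # same rows, sorted columns
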
def empty_dict(old_dict):
    """ Return a dictionary of empty lists with exactly the same keys as old_dict

    **Parameters**

        :old_dict:
            Dictionary of lists (identified by the key).

    :Author: Sirko Straube
    :Created: 2010/11/09
    """
    from collections import defaultdict

    new_dict = defaultdict(list)
    # touching each key creates the corresponding empty list
    [new_dict[i] for i in old_dict.keys()]
    return new_dict

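# Example usage (sketch; assumes data was loaded with csv2dict, key and
# value are hypothetical):
#
#     conditions = empty_dict(data)          # same keys, all lists empty
#     conditions['__Range__'].append('500')  # fill in constraint values
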
def strip_dict(data_dict, cond_dict, invert_mask=False, limit2keys=None):
    """ Return a stripped dictionary according to the conditions specified with cond_dict and invert_mask

    This function is useful if only some parameter combinations are
    interesting. Then the values of interest can be stored in cond_dict,
    and after the execution of mynewdict=strip_dict(data_dict, cond_dict)
    all unnecessary information is eliminated in mynewdict.

    **Parameters**

        :data_dict:
            Dictionary of lists (identified by the key). E.g. as returned
            by csv2dict.

        :cond_dict:
            Dictionary containing all keys and values that should be used
            to strip data_dict. E.g. constructed by empty_dict(data_dict)
            and subsequent modifications.

        :invert_mask:
            optional: If set to False, cond_dict is interpreted as a
            positive list, i.e. only values are kept that are specified in
            cond_dict. If set to True, cond_dict is interpreted as a
            negative list, i.e. only values are kept that are NOT
            specified in cond_dict.

            default=False

        :limit2keys:
            optional: Contains a list of key names (strings) that should
            be included in the returned dictionary. All other keys (i.e.
            columns) are skipped.

            default=None

    :Author: Sirko Straube
    :Created: 2010/11/09
    """
    constr_not_valid = False  # in the beginning all indices are valid...

    # take first key to determine length of csv table
    first_key = data_dict.keys()[0]
    valid_indices = range(len(data_dict[first_key]))

    # check if condition actually appears in the data_dict
    for key in cond_dict.keys():
        if key not in data_dict.keys():
            constr_not_valid = True
            import warnings
            warnings.warn("The condition key (column heading) %s is not "
                          "present in the dictionary you want to strip!"
                          % key)

    for current_param in data_dict:
        if current_param in cond_dict:
            if cond_dict[current_param]:  # if != []
                old_indices = valid_indices
                valid_indices = []  # reset indices to add new constraint
                constraint = cond_dict[current_param]
                for index, value in enumerate(data_dict[current_param]):
                    if not invert_mask:
                        # keep index only if new AND old constraints hold
                        if value in constraint and index in old_indices:
                            valid_indices.append(index)
                    else:  # i.e., invert_mask == True
                        # keep index only if new AND old constraints hold
                        if value not in constraint and index in old_indices:
                            valid_indices.append(index)
                if valid_indices == []:
                    constr_not_valid = True
                    import warnings
                    warnings.warn("Constraint values %s of key %s were not"
                                  " found in the dictionary you want to"
                                  " strip! Returning empty dict!"
                                  % (str(cond_dict[current_param]),
                                     current_param))

    result_dict = dict()
    if not constr_not_valid:
        # wrapping up
        for current_param in data_dict:
            current_list = data_dict[current_param]
            new_list = [current_list[i] for i in valid_indices]
            result_dict[current_param] = new_list

        # limit2keys restriction
        if limit2keys:
            all_results = result_dict
            result_dict = dict()
            for key in limit2keys:
                result_dict[key] = all_results[key]

    return result_dict

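# Example usage (sketch; keys and values are hypothetical): keep only the
# rows whose __Range__ is NOT '500' and return just two columns:
#
#     conditions = empty_dict(data)
#     conditions['__Range__'].append('500')
#     stripped = strip_dict(data, conditions, invert_mask=True,
#                           limit2keys=['__Dataset__', 'Metric'])
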
def merge_dicts(dict1, dict2):
    """ Merge two dictionaries into a new one

    Ideally both have the same keys and lengths. The merge procedure is
    performed even if the keys are not identical, but a warning is
    elicited.

    **Parameters**

        :dict1: the one dictionary
        :dict2: the other dictionary

    :Author: Mario Michael Krell
    :Created: 2010/11/09
    """
    import copy, warnings

    result_dict = dict()
    if not len(dict1.keys()) == len(dict2.keys()) or \
            not all([key1 in dict2.keys() for key1 in dict1.keys()]):
        warnings.warn('Inconsistency while merging: '
                      'The two dictionaries have different keys!')
        bad_keys = True
    else:
        bad_keys = False

    for key in dict1.keys():
        if dict2.has_key(key):
            result_dict[key] = copy.deepcopy(dict1[key])
            result_dict[key].extend(copy.deepcopy(dict2[key]))
        else:
            warnings.warn('Inconsistency while merging: Key ' + key +
                          ' exists in only one dictionary!')
    if bad_keys:
        for key in dict2.keys():
            if not dict1.has_key(key):
                warnings.warn('Inconsistency while merging: Key ' + key +
                              ' exists in only one dictionary!')
    return result_dict

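# Example (sketch): merging two tables with identical columns simply
# concatenates the rows:
#
#     d1 = {'A': ['1', '2'], 'B': ['3', '4']}
#     d2 = {'A': ['5'], 'B': ['6']}
#     merged = merge_dicts(d1, d2)
#     # merged == {'A': ['1', '2', '5'], 'B': ['3', '4', '6']}
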
def merge_multiple_dicts(dictlist):
    """ Merge multiple dictionaries into a single one

    This function merges every dictionary into a single one. The merge
    procedure is performed even if the keys are not identical (or of
    identical length), but a warning is elicited once.

    **Parameters**

        :dictlist: a list of dictionaries to merge

    :Author: Sirko Straube
    :Created: 2011/04/20
    """
    n = len(dictlist)

    if n == 0:
        raise ValueError('List of dictionaries is empty!')
    elif n == 1:
        return dictlist[0]
    elif n == 2:
        return merge_dicts(dictlist[0], dictlist[1])
    else:
        data = dictlist[0]
        for pos in range(1, n):
            data = merge_dicts(data, dictlist[pos])
        return data

def add_key(orig_dict, key_str, key_list):
    """ Add a key to the dictionary with as many elements (rows) as other entries

    When called, this function adds one key to the dictionary, which is
    equal to adding one column to the csv table. The name of the key is
    specified in key_str, and the elements are specified in key_list. Note
    that the latter has to be a list. If key_list has only one element, it
    is expanded according to the number of rows in the table. If the key
    already exists, the original dictionary is returned without any
    modification.

    **Parameters**

        :orig_dict: the dictionary to modify
        :key_str:   string containing the name of the dict key
        :key_list:  either a list containing all elements or a list with
                    one element which is appended n times

    :Author: Sirko Straube
    :Created: 2011/04/20
    """
    import copy, warnings

    if orig_dict.has_key(key_str):
        warnings.warn('Key to be added already exists: Key ' + key_str +
                      '. Adding canceled!')
        return orig_dict

    # determine number of entries per column
    n = len(orig_dict[orig_dict.keys()[0]])
    n_newlist = len(key_list)

    if n_newlist == 1:
        # create a list of n entries with the same content
        key_list = key_list * n
    elif n_newlist != n:
        warnings.warn('Length of new entry (n=' + str(n_newlist) +
                      ') does not match length of other entries (n=' +
                      str(n) + ')!')

    orig_dict[key_str] = copy.deepcopy(key_list)
    return orig_dict

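# Example (sketch): add a constant column to every row of a three-row
# table (the key name '__Subject__' is hypothetical):
#
#     data = add_key(data, '__Subject__', ['S1'])
#     # a one-element key_list is repeated for every row, so here
#     # data['__Subject__'] == ['S1', 'S1', 'S1']
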
def extend_dict(orig_dict, extension_dict, retain_unique_items=True):
    """ Extend one dictionary with another

    .. note:: This function returns a modified dictionary, even if the
              extension dictionary is completely different (i.e. there is
              no check if the extension makes sense, to guarantee maximal
              functionality).

    **Parameters**

        :orig_dict:      the dictionary to be extended and returned
        :extension_dict: the dictionary defining the extension

        :retain_unique_items:
            optional: If True, keys existing in only one of the two
            dictionaries are kept and padded with None; if False, they
            are dismissed.

            default=True

    :Author: Sirko Straube, Mario Michael Krell
    :Created: 2010/11/09
    """
    import copy, warnings

    if not len(orig_dict.keys()) == len(extension_dict.keys()) or \
            not all([key1 in extension_dict.keys()
                     for key1 in orig_dict.keys()]):
        warnings.warn('Inconsistency while merging: '
                      'The two dictionaries have different keys!')

    current_num_entries = len(orig_dict[orig_dict.keys()[0]])

    for key in extension_dict.keys():
        if orig_dict.has_key(key):
            orig_dict[key].extend(copy.deepcopy(extension_dict[key]))
        elif retain_unique_items:
            # pad the new column with None for all existing rows
            orig_dict[key] = current_num_entries * [None]
            orig_dict[key].extend(copy.deepcopy(extension_dict[key]))
            warnings.warn('Key ' + key + ' retained during dictionary'
                          ' extension: Does not exist in all files!')
        else:
            warnings.warn('Key ' + key + ' dismissed during dictionary'
                          ' extension: Does not exist in all files!')

    num_new_entries = len(extension_dict[extension_dict.keys()[0]])

    # iterate over a copy of the keys since the loop may delete keys
    for key in orig_dict.keys():
        if key in extension_dict:
            pass
        elif retain_unique_items:
            # pad the old column with None for all new rows
            orig_dict[key].extend(num_new_entries * [None])
            warnings.warn('Key ' + key + ' retained during dictionary'
                          ' extension: Does not exist in all files!')
        else:
            warnings.warn('Key ' + key + ' dismissed during dictionary'
                          ' extension: Does not exist in all files!')
            orig_dict.pop(key)

    return orig_dict

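# Example (sketch): unlike merge_dicts, extend_dict modifies orig_dict in
# place and pads columns that exist in only one of the two tables:
#
#     d1 = {'A': ['1', '2']}
#     d2 = {'A': ['3'], 'B': ['4']}
#     extend_dict(d1, d2)
#     # d1 == {'A': ['1', '2', '3'], 'B': [None, None, '4']}
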
def average_rows(data_dict, key_list, n=None, new_n=None):
    """ Average across all values of the specified columns

    Reduces the number of rows, i.e., the number of values in the lists,
    by averaging all values of a specific key, e.g., across all splits or
    subjects.

    .. note:: It is assumed that for two parameters A and B which have a
              and b different values, the number of rows to average is
              a*b. If you have certain constraints such that the number of
              rows to average is not a*b, you have to specify n (and
              possibly new_n) explicitly.

    **Parameters**

        :data_dict:
            Dictionary as returned by csv2dict.

        :key_list:
            List of keys (equals column names in a csv table) over which
            the average is computed.

        :n:
            Number of rows that are averaged. If None, it is determined
            automatically.

            default=None

        :new_n:
            Number of rows after averaging. If None, it is determined
            automatically.

            default=None
    """
    import warnings
    import numpy

    # check some special keys
    ignore_cols = []
    if "__Split__" in key_list:
        ignore_cols.append('__Key_Fold__')
    elif "__Key_Fold__" in key_list:
        ignore_cols.append('__Split__')
    if "__Run__" in key_list:
        ignore_cols.append('__Key_Run__')
    elif "__Key_Run__" in key_list:
        ignore_cols.append('__Run__')

    # determine dim of rows to average and result table
    if n is None:
        n = 1
        for key in key_list:
            n *= len(set(data_dict[key]))
    if new_n is None:
        new_n = len(data_dict[key_list[0]]) / n

    # averaging over *key* means all other parameter columns have to be
    # the same
    indices = [[] for _ in range(new_n)]
    patterns = [[] for _ in range(new_n)]
    values = [data_dict[key] for key in data_dict.keys()
              if (key.startswith('__') and not (key in key_list or
                                                key in ignore_cols))]

    # determine indices of rows that are averaged
    i = 0
    for pattern in zip(*values):
        inserted = False
        for j in range(new_n):
            if pattern in patterns[j]:
                indices[j].append(i)
                inserted = True
                break
            if patterns[j] == []:
                patterns[j].append(pattern)
                indices[j].append(i)
                inserted = True
                break
        if not inserted:
            warnings.warn("Line %d not included in average! "
                          "Check dimensions." % i)
        i += 1

    # average the data
    data_dict = parse_data(data_dict)
    result_dict = empty_dict(data_dict)
    for key in result_dict.keys():
        for avg_inds in indices:
            a = numpy.array(data_dict[key])[avg_inds]
            # we can only average numbers
            if isinstance(data_dict[key][0], (float, int)):
                # since int would be converted to float by averaging, we
                # try to prevent that if possible
                if (a == a[0]).all():
                    result_dict[key].append(a[0])
                else:
                    result_dict[key].append(numpy.mean(a))
            elif key in key_list or key in ignore_cols:
                result_dict[key].append("averaged")
            else:
                result_dict[key].append(a[0])
                # check if not equal!
                if not (a == a[0]).all():
                    warnings.warn("Averaged across different "
                                  "conditions... %s" % str(a))
    return result_dict

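# Example usage (sketch; assumes a results table in which every parameter
# setting occurs once per split):
#
#     data = csv2dict('results.csv')
#     averaged = average_rows(data, ['__Key_Fold__'])
#     # returns one row per remaining parameter combination, with numeric
#     # columns averaged across splits
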
def parse_data(data_dict):
    """ Parse the data of type string to int and float values where possible

    **Parameters**

        :data_dict:
            Dictionary as returned by csv2dict.
    """
    result_dict = empty_dict(data_dict)
    for key in data_dict.keys():
        for s in data_dict[key]:
            try:
                result_dict[key].append(int(s))
            except ValueError:
                try:
                    result_dict[key].append(float(s))
                except ValueError:
                    result_dict[key].append(s)
    return result_dict

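# Example (sketch): csv2dict returns every cell as a string; parse_data
# recovers int and float values where possible:
#
#     parse_data({'A': ['1', '2.5', 'x']})   # -> {'A': [1, 2.5, 'x']}
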
def check_for_failures(data, num_splits, conditions, remove_count=False):
    """ Compute a list of conditions for which the classification failed

    Given a possibly incomplete results.csv and a set of parameters as
    defined in an operation.yaml, this function compares all the expected
    combinations of parameters with what has actually been evaluated
    according to results.csv. It returns a list of failures, i.e., a list
    of dictionaries, each representing one combination of parameters for
    which results are missing.

    Besides the actual parameters, the dictionaries in failures have one
    additional key 'count'. The value of 'count' is the number of times
    this particular parameter setting occurred in the results file. The
    expected number of occurrences is the number of splits, num_splits. If
    the failures list is to be further used, it might be necessary to
    remove the count key again - if remove_count=True, this will be done
    automatically.

    .. note:: Even though __Dataset__ is not explicitly stated in the
              operation.yaml, this function requires you to always specify
              the collections as a parameter. See the following example.

    .. note:: This implementation is highly inefficient, as it just loops
              through the results list and the list of expected parameter
              settings instead of making use of any sophisticated search
              algorithms. Large problems might thus take some time.

    **Parameters**

        :data:
            Dictionary as returned by csv2dict. Usually this dictionary
            should contain the (incomplete) analysis results, hence it
            will in most cases be the product of something like
            csv2dict('results.csv').

        :num_splits:
            Number of splits. The decision if the condition is interpreted
            as failure depends on this parameter.

        :conditions:
            A dictionary containing the parameter ranges as specified in
            the operation.yaml. Additionally, __Dataset__ has to be
            specified. See the following example.

        :remove_count:
            optional: controls if the count variable will be removed from
            the entries in the failures list.

            default=False

    **Exemplary Workflow**

    .. code-block:: python

        import csv_analysis
        data=csv_analysis.csv2dict('results.csv')
        conditions={}
        conditions['__CLASSIFIER__']=['1RMM', '2RMM']
        conditions['__C__']=[0.01, 0.1, 1.0, 10.0]
        conditions['__Dataset__']=['Set1','Set2','Set3']
        nsplits = 10
        failures=csv_analysis.check_for_failures(data,nsplits,conditions,True)

    :Author: David Feess
    :Created: 2011/04/05
    """
    # This is used to generate crossproducts of arbitrary many parameters
    # and stolen as is from missions/operations.base._get_parameter_space()
    crossproduct = lambda ss, row=[], level=0: len(ss) > 1 \
        and reduce(lambda x, y: x + y,
                   [crossproduct(ss[1:], row + [i], level + 1)
                    for i in ss[0]]) \
        or [row + [i] for i in ss[0]]

    parameter_ranges = [eval(range_expression)
                        if isinstance(range_expression, basestring)
                        else range_expression
                        for range_expression in conditions.values()]

    # parameter_settings will contain a list with dict entries, each dict
    # representing one particular combination of parameters
    parameter_settings = map(lambda x: dict(zip(conditions.keys(), x)),
                             crossproduct(parameter_ranges))

    # Add a counter variable to each of the expected conditions. This will
    # later be compared to num_splits
    for x in parameter_settings:
        x['count'] = 0

    # Iterate through entire data object
    for i in range(len(data['__Dataset__'])):
        # Iterate through expected parameter settings:
        for expected in parameter_settings:
            skip = False  # skip this setting if any parameter mismatches
            # iterate through all parameters in this parameter setting
            for expected_key, expected_val in expected.iteritems():
                if expected_key == 'count':
                    # forget about the count parameter
                    continue
                try:  # convert strings to numbers if possible
                    x = eval(data[expected_key][i])
                except:
                    x = data[expected_key][i]
                if expected_val == x:  # if we have a match continue
                    continue
                else:  # else skip this parameter
                    skip = True  # ... and the whole param. setting
                    break
            if skip:  # go for next parameter setting
                continue
            # if not skip: found a match: ...
            expected['count'] += 1  # ... increase count
            break  # and go for next entry in data

    failures = []
    # Failures are all entries in the expected parameter_settings where
    # count does not equal the number of splits
    for x in parameter_settings:
        if x['count'] != num_splits:
            if remove_count:
                x.pop('count')
            failures.append(x)
    return failures

def check_op_libSVM(input_dir='.', delete_file=True):
    """ Perform terminal operation to identify possible classification failures on the basis of number of files

    This works only for libSVM classification with stored results, as it
    relies on files stored in the persistency directories. This function
    navigates to input_dir (which is the result directory of the
    classification) and checks the number of files starting with
    'features' in 'persistency_run0/LibSVMClassifierNode/' in each
    subdirectory. In case the classification was successfully performed,
    the number of files here should equal the number of splits used. If
    not, this is a hint that something went wrong!

    The list returned by this function contains alternating (i) the name
    of the 'root directory' for the respective condition and (ii) the
    number of files ...

    .. note:: This function only works if the feature*.pickle files are
              explicitly saved in your NodeChain!

    **Parameters**

        :input_dir:
            optional: string with the path where csv files are stored.

            default='.'

        :delete_file:
            optional: controls if the file 'temp_check_op.txt' will be
            removed.

            default=True

    :Author: Sirko Straube, Anett Seeland
    :Created: 2010/11/09
    """
    import os

    # navigating to operation dir
    current_path = os.getcwd()
    os.chdir(input_dir)

    # analyzing directories and writing results in temp_check_op.txt;
    # 'echo find ...' expands the glob and 'wc -w' counts the words, so
    # the reported number is the file count plus one (the word 'find')
    rcode = os.system('for f in *; do if [ -d $f ]; then echo $f; '
        + 'echo find $f/persistency_run0/LibSVMClassifierNode/feature*.pickle '
        + '| wc -w; fi; done > temp_check_op.txt')

    # transferring data to Python list
    f = open('temp_check_op.txt')
    oplist = []
    for line in f:
        oplist.append(line)
    f.close()

    # possibly deleting the temp file and navigating back
    # (note: os.chdir is required here; os.system('cd ...') would only
    # change the directory of a subshell, not of this process)
    if delete_file:
        rcode = os.system('rm temp_check_op.txt')
    os.chdir(current_path)

    return oplist

def report_failures(oplist, num_splits):
    """ Sort output of terminal operation (e.g. performed by check_op_libSVM)

    This function returns a list where each element contains the
    parameters of a condition where the classification probably failed.
    This judgment is made according to the number of files that is
    expected given the number of splits used.

    See also: check_op_libSVM

    **Parameters**

        :oplist:
            An iterable that has to contain alternating (i) the name of
            the 'root directory' for the respective condition and (ii)
            the number of files ... This parameter can either be the list
            returned by check_op_libSVM or a file type object (pointing
            to a manually constructed file).

        :num_splits:
            Number of splits. The decision if the condition is interpreted
            as failure depends on this parameter.

    :Author: Mario Krell, Sirko Straube
    :Created: 2010/11/09
    """
    import warnings

    dirstats = False
    dirline = None
    failures = []

    for line in oplist:
        if dirstats:
            # the current line should contain the number of files;
            # remove possible whitespaces and endl
            line = line.strip().strip('\n')
            if line.isdigit():
                # -1 because of batch command (see check_op_libSVM)
                num_files = int(line) - 1
                if num_files < num_splits:
                    result = dict()
                    current_params = \
                        dirline.strip().strip('{').strip("}").split("}{")
                    result['__Dataset__'] = current_params[0]
                    # include number of splits
                    result['count'] = num_files
                    for param in current_params[1:]:
                        # TODO: if anything other than the template has no
                        # '#', this will fail; delete as soon as no more
                        # data with templates in folder names circulates
                        if '#' not in param:
                            result["__Template__"] = param
                            continue
                        entry = param.split('#')
                        result[entry[0]] = entry[1]
                    failures.append(result)
            else:
                warnings.warn("Inconsistency while analyzing "
                              "check_op_libSVM data: Line " + line +
                              " is not a digit reporting the number of"
                              " feature pickles.")
            dirstats = False
        else:
            dirstats = True
            dirline = line

    return failures

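# Example (sketch of the expected oplist format: pairs of lines, first the
# directory name encoding the parameters, then the word count reported by
# check_op_libSVM; names and values are hypothetical):
#
#     oplist = ['{Set1}{__C__#0.1}\n', '3\n']
#     failures = report_failures(oplist, num_splits=10)
#     # -> [{'__Dataset__': 'Set1', 'count': 2, '__C__': '0.1'}]
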
def reconstruct_failures(csv_dict, missing_conds, num_splits,
                         default_dict=None):
    """ Reconstruct classification failures in csv dictionary according to known parameters and default values

    This function takes the csv dictionary (probably constructed using
    merge_performance_results from PerformanceResultSummary) and
    reconstructs the classification failures defined in missing_conds
    (probably constructed using report_failures) according to known
    parameters (given in missing_conds) and some default values that may
    be specified in default_dict (probably constructed with the help of
    empty_dict and a subsequent modification). All other keys are
    specified with the 'unknown' value. Finally, the reconstructed
    dictionary is merged with the original csv dictionary and returned.

    **Parameters**

        :csv_dict:
            The data dictionary. Has the form returned by csv2dict.

        :missing_conds:
            A list of dictionaries specifying the missing conditions. Has
            the form returned by report_failures.

        :num_splits:
            Number of splits used for classification.

        :default_dict:
            optional: A dictionary specifying default values for missing
            conditions. This dictionary can e.g. be constructed using
            empty_dict(csv_dict) and subsequent modification, e.g.
            default_dict['Metric'].append(0).

            (*optional, default: None*)

    :Author: Mario Krell, Sirko Straube
    :Created: 2010/11/09
    """
    reconstruct_dict = None

    for line in missing_conds:
        missing_dict = empty_dict(csv_dict)
        count = line.pop('count')
        # transfer known variables to missing_dict
        for key in line.keys():
            missing_dict[key].append(line[key])
        if default_dict:
            # transfer user specified default values to missing_dict...
            for key in default_dict.keys():
                # ...only if there is an entry in default_dict
                # AND the key exists in missing_dict
                if default_dict[key] and key in missing_dict.keys():
                    missing_dict[key].append(default_dict[key])
        # set all other keys to 'unknown'
        for key in missing_dict.keys():
            if not missing_dict[key]:  # entry of key is empty list
                missing_dict[key].append('unknown')
        # reconstruct a line for every missing split
        for each_missing_split in range(num_splits - count):
            if not reconstruct_dict:  # only true once
                reconstruct_dict = missing_dict
            else:
                reconstruct_dict = extend_dict(reconstruct_dict,
                                               missing_dict)

    # finally, merge the original and the reconstruction
    return merge_dicts(csv_dict, reconstruct_dict)