""" Calculate a two-tailed paired t-test on a result collection for a certain parameter

Furthermore, some related statistical values such as mean, standard deviation, standard error, and correlation are calculated.

Specification file Parameters
+++++++++++++++++++++++++++++

type
----

Should be *statistic*

(*obligatory, statistic*)

metric
------

name of the metric (performance measure) on which the test shall be performed

(*optional, default: 'Balanced_accuracy'*)

parameter
---------

name of the varying parameter that shall be analyzed

(*optional, default: '__Dataset__'*)

filter
------

dictionary specifying which subset of the csv tabular shall be analyzed; it maps column names to lists of accepted values

(*optional, default: None*)

average
-------

parameter over which the metric values are averaged before the test is performed

(*optional, default: None*)

input_path
----------

Path to the input collection of type 'result'

(*obligatory*)

related_parameters
------------------

list of parameters that define how the samples are paired for the related (paired) t-test

(*optional, default: ["__Dataset__", "Key_Run", "Key_Fold"]*)

Exemplary Call
++++++++++++++
    
.. code-block:: yaml

        type : statistic
        input_path : "result_col_example"
        metric : "Balanced_accuracy"
        parameter : '__metric__'
        related_parameters : ["__Dataset__", "Key_Run", "Key_Fold"]
        average : "Key_Run"
        filter : {"__metric__":["Balanced_accuracy","k_Balanced_accuracy","soft_Balanced_accuracy"]}
        
.. todo:: Anett says a unit test should check if the statistical calculations work as expected
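
A minimal sketch of the core calculations, which could serve as a starting
point for such a test, is given below. The accuracy values are made up and
only illustrate the two-tailed paired t-test, the correlation of the paired
samples, the normality check, and the Bonferroni and Sidak corrected
significance levels used by this operation.

.. code-block:: python

    import numpy as np
    from scipy.stats import ttest_rel, kstest

    # made-up balanced accuracies of two parameter settings, paired over
    # the same datasets, runs and folds
    data1 = np.array([0.81, 0.79, 0.84, 0.80, 0.83])
    data2 = np.array([0.76, 0.78, 0.80, 0.77, 0.79])

    # Kolmogorov-Smirnov check of the standardized sample against a
    # standard normal distribution (normality assumption of the t-test)
    d, p_norm = kstest((data1 - data1.mean()) / data1.std(), 'norm')

    # two-tailed paired t-test and correlation of the paired values
    t, p = ttest_rel(data1, data2)
    corr = np.corrcoef(data1, data2)[0][1]

    # corrected significance levels; n is the number of compared
    # parameter settings
    alpha, n = 0.05, 2
    bonferroni_significant = p < alpha / n
    sidak_significant = p < 1.0 - (1.0 - alpha) ** (1.0 / n)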

"""
import sys
if sys.version_info[0] == 2 and sys.version_info[1] < 6:
    import processing
else:
    import multiprocessing as processing
from scipy.stats import ttest_rel as ttest_related
from scipy.stats import kstest
import numpy as np
import os

import logging
import pySPACE
from pySPACE.missions.operations.base import Operation, Process
from pySPACE.resources.dataset_defs.base import BaseDataset

import pySPACE.tools.csv_analysis as csv_analysis

import warnings


class StatisticOperation(Operation):
    """ Start only the one StatisticProcess after reading the specification file and reducing the performance tabular to the relevant entries

    For further calculations, the performance tabular and its metadata.yaml
    file are copied into the new collection such that other operations can
    follow, as for example visualization operations.
    """

    def __init__(self, processes, operation_spec, result_directory,
                 number_processes, create_process=None):
        super(StatisticOperation, self).__init__(processes, operation_spec,
                                                 result_directory)
        self.number_processes = number_processes
        self.create_process = create_process
        warnings.warn(
            "You are using the statistic operation to calculate p-values "
            "for the paired t-test. Check if the paired t-test is the "
            "correct method in your model! This operation shall help to "
            "find important parameters but does not replace a good "
            "statistical model.")

    @classmethod
    def create(cls, operation_spec, result_directory, debug=False,
               input_paths=[]):
        """ A factory method that creates a statistic operation based on the
        information given in the operation specification operation_spec.

        If debug is TRUE the creation of the statistic processes will not
        be in a separate thread.
        """
        assert(operation_spec["type"] == "statistic")
        input_path = operation_spec["input_path"]
        tabular = BaseDataset.load(os.path.join(
            pySPACE.configuration.storage, input_path)).data

        # restrict the tabular to the rows specified by the *filter* parameter
        if "filter" in operation_spec:
            conditions = csv_analysis.empty_dict(tabular)
            for key, l in operation_spec["filter"].items():
                conditions[key].extend(l)
            tabular = csv_analysis.strip_dict(tabular, conditions)
        metric = operation_spec.get("metric", "Balanced_accuracy")
        parameter = operation_spec.get("parameter", "__Dataset__")
        rel_par = operation_spec.get("related_parameters",
                                     ["__Dataset__", "Key_Run", "Key_Fold"])
        average = operation_spec.get("average", None)

        # the averaged, tested and analyzed variables must not be used
        # for pairing the samples
        if average in rel_par:
            rel_par.remove(average)
        if metric in rel_par:
            rel_par.remove(metric)
        if parameter in rel_par:
            rel_par.remove(parameter)

        reduced_tabular = cls.reduce_tabular(tabular, rel_par, metric,
                                             parameter, average)
        number_processes = 1
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, reduced_tabular)

        # copy the input tabular and its meta data into the result collection
        # so that other operations (e.g., visualization) can follow
        import shutil
        shutil.copy2(os.path.join(pySPACE.configuration.storage, input_path,
                                  "results.csv"),
                     os.path.join(result_directory, "results.csv"))
        shutil.copy2(os.path.join(pySPACE.configuration.storage, input_path,
                                  "metadata.yaml"),
                     os.path.join(result_directory, "metadata.yaml"))
        # create and return the statistic operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes)

    @classmethod
    def reduce_tabular(cls, tabular, rel_par, metric, parameter, average):
        """ Reduce the tabular to the metric values and the analyzed parameter

        The rows are sorted by parameter value and by the related parameters
        such that corresponding samples of different parameter values are
        paired. If *average* is given, the metric is first averaged over all
        values of this parameter.
        """
        keylist = []
        if average is None:
            for i in range(len(tabular[metric])):
                keylist.append((tabular[parameter][i],
                                tuple([tabular[par][i] for par in rel_par]),
                                tabular[metric][i]))
            values = []
            par_list = []
            keylist.sort()
            # filter parameter and metric from sorted tabular
            for par, key, value in keylist:
                values.append(float(value))
                par_list.append(par)
        else:
            unique_average = sorted(list(set(tabular[average])))
            l = len(unique_average)
            for i in range(len(tabular[metric])):
                keylist.append((tabular[parameter][i],
                                tuple([tabular[par][i] for par in rel_par]),
                                tabular[average][i],
                                tabular[metric][i]))
            values = []
            par_list = []
            keylist.sort()
            # filter parameter and metric from sorted tabular and
            # average the metric over the *average* parameter
            for par, key, p_average, value in keylist:
                if p_average == unique_average[0]:
                    # first value of a new averaging group
                    v = float(value)
                    p = par
                    k = key
                    i = 1
                else:
                    v += float(value)
                    assert(p == par), "Wrong sorting in list."
                    assert(k == key), "Wrong sorting in list."
                    i += 1
                if p_average == unique_average[-1]:
                    # group complete: store the averaged value
                    values.append(v / l)
                    par_list.append(p)
                    assert(i == l), "Wrong sorting in list."
        return {"values": values, "parameters": par_list}

    @classmethod
    def _createProcesses(cls, processes, result_directory, data):
        """ Create the single StatisticProcess and put it into the queue

        The computation is not distributed over several processes.
        """
        # Create the process and put it in the execution queue
        processes.put(StatisticProcess(result_directory, data))
        # give executing process the sign that creation is now finished
        processes.put(False)

    def consolidate(self):
        """ Consolidation of the operation's results """
        # Just do nothing
        pass


class StatisticProcess(Process):
    """ Calculate several statistic metrics on the specified metric and parameter

    At the moment mean, correlation, difference of means, standard deviation,
    standard error, p-value, t-value and some basic significance tests are
    calculated and written to a tabular.
    """

    def __init__(self, result_directory, data):
        super(StatisticProcess, self).__init__()

        self.result_directory = result_directory
        self.data = data
        # significance level used for the Bonferroni and Sidak corrected tests
        self.alpha = 0.05

    def __call__(self):
        """ Executes this process on the respective modality """
        ############## Prepare benchmarking ##############
        super(StatisticProcess, self).pre_benchmarking()

        unique_parameters = list(set(self.data["parameters"]))
        assert(len(unique_parameters) > 1), "No different parameter given!"
        if len(unique_parameters) > 2:
            self._log("No correction yet implemented for multiple t test.",
                      logging.CRITICAL)
        n = len(unique_parameters)
        l = len(self.data["parameters"])
        # number of paired samples per parameter value
        k = l / n
        p_values = {"parameter_1": [], "parameter_2": [], "p_value": [],
                    "t_value": [], "correlation": [],
                    "mean_1": [], "mean_2": [], "mean_1-mean_2": [],
                    "std_1": [], "std_2": [], "se_1": [], "se_2": [],
                    "Bonferroni_significant": [], "Sidac_significant": []}
        for i in range(n):
            assert(self.data["parameters"][i*k:(i+1)*k] ==
                   [self.data["parameters"][i*k]]*k), \
                "Error in list sorting! Parameter not consistent. " \
                "Problematic parameter: %s" % self.data["parameters"][i*k]
            # conversion to array is needed for the standardization below
            data1 = np.array(self.data["values"][i*k:(i+1)*k])
            mean1 = np.mean(data1)
            std1 = np.std(data1)
            if not self.kstest((data1-mean1)/std1):
                self._log("Data is probably not normally distributed "
                          "according to parameter %s and you do not use the "
                          "proper statistic test. The parameter is ignored!"
                          % self.data["parameters"][i*k], logging.CRITICAL)
                continue
            for j in range(i):
                # check if parameters match in one part
                assert(len(self.data["parameters"][i*k:(i+1)*k]) ==
                       len(self.data["parameters"][j*k:(j+1)*k])), \
                    "Error in list sorting! Parameters are not equal."
                data2 = np.array(self.data["values"][j*k:(j+1)*k])
                mean2 = np.mean(data2)
                std2 = np.std(data2)
                if not self.kstest((data2-mean2)/std2):
                    self._log("Data is probably not normally distributed "
                              "according to parameter %s and you do not use "
                              "the proper statistic test. The parameter is "
                              "ignored!" % self.data["parameters"][j*k],
                              logging.CRITICAL)
                    continue
                t, p = self.p_value(data1, data2)
                corr = np.corrcoef(data1, data2)[0][1]
                p_values["correlation"].append(corr)
                p_values["p_value"].append(p)
                # Bonferroni corrected significance level
                test_sig = self.alpha/n
                if p < test_sig:
                    p_values["Bonferroni_significant"].append("True")
                else:
                    p_values["Bonferroni_significant"].append("False")
                # Sidak corrected significance level
                test_sig = 1.0-(1.0-self.alpha)**(1.0/n)
                if p < test_sig:
                    p_values["Sidac_significant"].append("True")
                else:
                    p_values["Sidac_significant"].append("False")
                # store the result such that parameter_1 has the larger mean
                if mean1 > mean2:
                    p_values["t_value"].append(t)
                    p_values["parameter_1"].append(self.data["parameters"][i*k])
                    p_values["parameter_2"].append(self.data["parameters"][j*k])
                    p_values["mean_1"].append(mean1)
                    p_values["mean_2"].append(mean2)
                    p_values["mean_1-mean_2"].append(mean1-mean2)
                    p_values["std_1"].append(std1)
                    p_values["std_2"].append(std2)
                    p_values["se_1"].append(std1/np.sqrt(k))
                    p_values["se_2"].append(std2/np.sqrt(k))
                else:
                    # construct symmetric result
                    p_values["t_value"].append(-t)
                    p_values["parameter_2"].append(self.data["parameters"][i*k])
                    p_values["parameter_1"].append(self.data["parameters"][j*k])
                    p_values["mean_2"].append(mean1)
                    p_values["mean_1"].append(mean2)
                    p_values["mean_1-mean_2"].append(mean2-mean1)
                    p_values["std_2"].append(std1)
                    p_values["std_1"].append(std2)
                    p_values["se_2"].append(std1/np.sqrt(k))
                    p_values["se_1"].append(std2/np.sqrt(k))
        csv_analysis.dict2csv(os.path.join(self.result_directory,
                                           'p_values.csv'), p_values)

        ############## Clean up after benchmarking ##############
        super(StatisticProcess, self).post_benchmarking()

    def p_value(self, p1_list, p2_list):
        """ Return t-value and two-tailed p-value of the paired t-test """
        t, p = ttest_related(p1_list, p2_list)
        return t, p

    def kstest(self, data):
        """ Kolmogorov-Smirnov test of *data* against the standard normal distribution

        Returns True if the hypothesis of normality is not rejected at the
        5% level.
        """
        d, p = kstest(data, 'norm')
        return p > 0.05