# Calculating statistics for RSR and RSCC values

The RSR and RSCC values were generated using code (dcc) provided by Huanwang Yang at the RCSB, Rutgers University.

This notebook will generate tables comparing

    RSR
        AFITT-CIF, Phenix-AFITT
    RSCC
        AFITT-CIF, Phenix-AFITT
        
Statistical significance was determined using

    dependent paired t-test -> stats.ttest_rel
    two-sample distribution Kolmogorov-Smirnov test -> stats.ks_2samp
    Wilcoxon (Wilcoxon results not reported) -> stats.wilcoxon

Substantive significance was determined using
   
    Cohen's d -> CohensEffectSize


In [1]:
# Importing all the external python packages
import numpy as np
import scipy as sp
import math as math
import pylab as pl
import pandas as pd
from IPython.display import display, HTML


In [2]:
# For Future Compatibility
from __future__ import division

from __future__ import print_function


In [3]:
# Effect size estimation using Cohen's d. Larger d = more substantial effect/difference between two methods.
# THIS IS NOT IN ANY WAY RELATED TO STATISTICAL SIGNIFICANCE.
# d < 0.2 - trivial effect  (there is no difference worth worrying about)
# 0.2 <= d < 0.5 - small effect.
# 0.5 <= d <= 0.8 - medium effect.
# d > 0.8 - large effect.
def CohensEffectSize(lis1,lis2):
    """Return Cohen's d effect size between two samples, rounded to 3 decimals.

    The value computed is

        d = |mean(lis1) - mean(lis2)| / sqrt((sd1**2 + sd2**2) / 2)

    i.e. the absolute difference of the means divided by the root-mean-square
    of the two standard deviations.

    Parameters
    ----------
    lis1, lis2 : sequence of numbers
        The two samples to compare (anything numpy.mean / numpy.std accept,
        e.g. a list, a numpy array or a pandas Series).

    Returns
    -------
    float
        The non-negative effect size rounded to 3 decimal places. When both
        samples have zero spread the ratio is undefined; in that case 0.0 is
        returned if the means are equal and inf if they differ.

    Notes
    -----
    The standard deviations are taken with numpy's default ddof=0
    (population form), which is what the original implementation used.
    """
    # Call numpy directly: sp.mean / sp.std were only deprecated re-exports
    # of the numpy functions in the root scipy namespace.
    mean_gap = math.fabs(np.mean(lis1) - np.mean(lis2))
    pooled_sd = math.sqrt((np.std(lis1)**2 + np.std(lis2)**2)/2)

    # Guard the division: two constant samples give a pooled spread of zero.
    if pooled_sd == 0:
        return 0.0 if mean_gap == 0 else float('inf')

    return round(mean_gap/pooled_sd,3)

In [4]:
# Load the csv file holding the RSR and RSCC data into a pandas DataFrame
# To run this you will need to change the path below
SF_VALID_CSV = "/Users/warreng3/Desktop/sf_valid_out_clean.csv"
d = pd.read_csv(SF_VALID_CSV)


In [5]:
# Split the RSR and RSCC columns by refinement mode:
#   mode == 'afitt'    -> Phenix-AFITT series
#   mode == 'no_afitt' -> AFITT-CIF series
afitt_rows = d['mode']=='afitt'
no_afitt_rows = d['mode']=='no_afitt'
rscc_column = d['RSCC']
rsr_column = d['RSR']

RSCC_Phenix_AFITT = rscc_column[afitt_rows]
RSCC_AFITT_CIF = rscc_column[no_afitt_rows]
RSR_Phenix_AFITT = rsr_column[afitt_rows]
RSR_AFITT_CIF = rsr_column[no_afitt_rows]

In [6]:
from scipy import stats

In [7]:
# Mean of each RSR / RSCC series, one value per method
(mean_rsr_Phenix_AFITT,
 mean_rsr_AFITT_CIF,
 mean_rscc_Phenix_AFITT,
 mean_rscc_AFITT_CIF) = [
    np.mean(series_values)
    for series_values in (RSR_Phenix_AFITT,
                          RSR_AFITT_CIF,
                          RSCC_Phenix_AFITT,
                          RSCC_AFITT_CIF)
]


In [8]:
# Standard deviation of each RSR / RSCC series (np.std with its default arguments)
(std_dev_rsr_Phenix_AFITT,
 std_dev_rsr_AFITT_CIF,
 std_dev_rscc_Phenix_AFITT,
 std_dev_rscc_AFITT_CIF) = [
    np.std(series_values)
    for series_values in (RSR_Phenix_AFITT,
                          RSR_AFITT_CIF,
                          RSCC_Phenix_AFITT,
                          RSCC_AFITT_CIF)
]


In [9]:
# Significance tests for RSCC, Ho => populations are the same.
# NOTE(review): the paired tests (ttest_rel, wilcoxon) presumably rely on both
# series listing the same entries in the same order - confirm against the csv.
rscc_samples = (RSCC_AFITT_CIF, RSCC_Phenix_AFITT)
paired_depend_t_rscc = stats.ttest_rel(*rscc_samples)
ks_rscc = stats.ks_2samp(*rscc_samples)
wilcox_rscc = stats.wilcoxon(*rscc_samples)

In [10]:
# Significance tests for RSR, Ho => populations are the same.
# NOTE(review): the paired tests (ttest_rel, wilcoxon) presumably rely on both
# series listing the same entries in the same order - confirm against the csv.
rsr_samples = (RSR_Phenix_AFITT, RSR_AFITT_CIF)
paired_depend_t_rsr = stats.ttest_rel(*rsr_samples)
ks_rsr = stats.ks_2samp(*rsr_samples)
wilcox_rsr = stats.wilcoxon(*rsr_samples)

In [11]:
# Substantive significance: how large is the difference between the two means?
cohen_d_rscc = CohensEffectSize(lis1=RSCC_AFITT_CIF, lis2=RSCC_Phenix_AFITT)
cohen_d_rsr = CohensEffectSize(lis1=RSR_Phenix_AFITT, lis2=RSR_AFITT_CIF)


In [15]:
# Make a rscc dictionary: one pandas Series per table column, indexed by method.
# Per-method columns (Mean, Std Dev) fill both rows; comparison columns leave
# the AFITT-CIF row blank and put the value on the Phenix-AFITT row.
rscc_rows = ['AFITT-CIF', 'Phenix-AFITT']
rscc_fmt = '{:.3G}'.format  # 3 significant digits

rscc = {
    'Mean': pd.Series([rscc_fmt(float(mean_rscc_AFITT_CIF)),
                       rscc_fmt(float(mean_rscc_Phenix_AFITT))],
                      index=rscc_rows),
    'Std Dev': pd.Series([rscc_fmt(float(std_dev_rscc_AFITT_CIF)),
                          rscc_fmt(float(std_dev_rscc_Phenix_AFITT))],
                         index=rscc_rows),
}

# Each test result is indexed as [0] = statistic, [1] = p-value.
for rscc_test_label, rscc_test_result in (('dep t-test', paired_depend_t_rscc),
                                          ('ks 2-sample', ks_rscc),
                                          ('Wilcoxon', wilcox_rscc)):
    rscc[rscc_test_label] = pd.Series(['', rscc_fmt(float(rscc_test_result[0]))],
                                      index=rscc_rows)
    rscc[rscc_test_label + ' (p)'] = pd.Series(['', rscc_fmt(float(rscc_test_result[1]))],
                                               index=rscc_rows)

# Cohen's d is stored as returned (not passed through the formatter).
rscc['Cohen d'] = pd.Series(['', cohen_d_rscc], index=rscc_rows)

In [16]:
# Make a rsr dictionary: one pandas Series per table column, indexed by method.
# Per-method columns (Mean, Std Dev) fill both rows; comparison columns leave
# the AFITT-CIF row blank and put the value on the Phenix-AFITT row.
rsr_rows = ['AFITT-CIF', 'Phenix-AFITT']
rsr_fmt = '{:.3G}'.format  # 3 significant digits

rsr = {
    'Mean': pd.Series([rsr_fmt(float(mean_rsr_AFITT_CIF)),
                       rsr_fmt(float(mean_rsr_Phenix_AFITT))],
                      index=rsr_rows),
    'Std Dev': pd.Series([rsr_fmt(float(std_dev_rsr_AFITT_CIF)),
                          rsr_fmt(float(std_dev_rsr_Phenix_AFITT))],
                         index=rsr_rows),
}

# Each test result is indexed as [0] = statistic, [1] = p-value.
for rsr_test_label, rsr_test_result in (('dep t-test', paired_depend_t_rsr),
                                        ('ks 2-sample', ks_rsr),
                                        ('Wilcoxon', wilcox_rsr)):
    rsr[rsr_test_label] = pd.Series(['', rsr_fmt(float(rsr_test_result[0]))],
                                    index=rsr_rows)
    rsr[rsr_test_label + ' (p)'] = pd.Series(['', rsr_fmt(float(rsr_test_result[1]))],
                                             index=rsr_rows)

# Cohen's d is stored as returned (not passed through the formatter).
rsr['Cohen d'] = pd.Series(['', cohen_d_rsr], index=rsr_rows)

In [17]:
# Show the RSCC table. Only the p-values of the tests are listed; the test
# statistics stored in the rscc dictionary are left out of the column selection.
# the statistical test is (Ho | Ha=Hb)
rscc_table_columns = ['Mean', 'Std Dev', 'dep t-test (p)', 'ks 2-sample (p)', 'Wilcoxon (p)', 'Cohen d']
df_rscc = pd.DataFrame(rscc, columns=rscc_table_columns)
df_rscc.columns.name = 'RSCC'  # label shown in the table's top-left corner
df_rscc

RSCC,Mean,Std Dev,dep t-test (p),ks 2-sample (p),Wilcoxon (p),Cohen d
AFITT-CIF,0.945,0.0403,,,,
Phenix-AFITT,0.939,0.0469,3.87e-08,0.531,1.21e-11,0.119


In [18]:
# Show the RSR table. Only the p-values of the tests are listed; the test
# statistics stored in the rsr dictionary are left out of the column selection.
# the statistical test is (Ho | Ha=Hb)
rsr_table_columns = ['Mean', 'Std Dev', 'dep t-test (p)', 'ks 2-sample (p)', 'Wilcoxon (p)', 'Cohen d']
df_rsr = pd.DataFrame(rsr, columns=rsr_table_columns)
df_rsr.columns.name = 'RSR'  # label shown in the table's top-left corner
df_rsr

RSR,Mean,Std Dev,dep t-test (p),ks 2-sample (p),Wilcoxon (p),Cohen d
AFITT-CIF,0.125,0.05,,,,
Phenix-AFITT,0.132,0.0546,9.17e-18,0.106,2.31e-24,0.139
