Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
import scipy.stats as stats
import numpy as np
In [3]:
# define ANOVA function to calculate adjusted effect sizes
def anova_table(aov):
    """Add mean squares and eta-/omega-squared effect sizes to a statsmodels ANOVA table."""
    aov['mean_sq'] = aov['sum_sq'] / aov['df']
    # eta_sq = SS_effect / SS_total
    aov['eta_sq'] = aov[:-1]['sum_sq'] / sum(aov['sum_sq'])
    # omega_sq = (SS_effect - df_effect * MS_error) / (SS_total + MS_error),
    # where the error (residual) term is the last row of the table
    ms_error = aov['mean_sq'].iloc[-1]
    aov['omega_sq'] = (aov[:-1]['sum_sq'] - aov[:-1]['df'] * ms_error) / (sum(aov['sum_sq']) + ms_error)
    cols = ['sum_sq', 'df', 'mean_sq', 'F', 'PR(>F)', 'eta_sq', 'omega_sq']
    return aov[cols]

Loading the data

In [114]:
# load the raw cortisol data
cort_data = pd.read_excel('cortisol_data.xlsx')
cort_data.head()
Out[114]:
Subject Number Sample Cortisol (ug/dL) Cortisol (nmol/L) Notes
0 2707 D1S1 0.182 5.02138 NaN
1 2707 D1S2 0.550 15.17450 NaN
2 2707 D1S3 0.552 15.22968 NaN
3 2707 D1S4 0.444 12.24996 NaN
4 2707 D1S5 0.088 2.42792 NaN
In [115]:
# load the study data
df = pd.read_excel('data.xlsx')

Wrangling the data into shape

As seen above, the raw cortisol data is output from an ELISA plate reader, so it is in long form, with each row representing a single cortisol sample. It needs to be pivoted into wide form (one row per subject, one column per sample) so it can be merged into the existing study database by subject.

In [116]:
wide_data = cort_data.pivot(index='Subject Number', columns='Sample', values='Cortisol (nmol/L)')
wide_data.head()
Out[116]:
Sample D1S1 D1S2 D1S3 D1S4 D1S5 D1S6 D2S1 D2S2 D2S3 D2S4 D2S5 cTBS
Subject Number
1411 3.94537 3.50393 3.39357 2.81418 0.55180 0.33100 7.55966 11.53262 11.14636 7.53207 0.82770 3.44875
1833 3.58670 9.90481 14.70547 13.71223 1.82094 0.96565 2.70382 8.63567 14.01572 15.56076 1.60022 1.54504
2045 1.60022 0.88288 7.09063 8.35977 6.12498 2.04166 6.12498 6.84232 7.83556 7.20099 NaN 3.11767
2060 NaN 5.07656 7.69761 6.48365 0.85529 0.33100 4.60753 4.27645 3.44875 NaN 0.46903 1.04842
2152 21.87887 11.36708 7.06304 4.41440 2.97972 0.88288 8.24941 2.42792 14.04331 15.17450 2.78659 2.97972
In [117]:
# assigning cTBS values to D2S5 where appropriate (these samples are equivalent in some subjects)
wide_data['D2S5'] = np.where(wide_data['D2S5'].isna(), wide_data['cTBS'], wide_data['D2S5'])

That's better. Before merging the data, we'll first impute the missing values. This is a small sample, so dropping subjects with missing samples isn't an option.

In [118]:
# imputing missing data
from sklearn.impute import SimpleImputer

# median-impute the ten sampling columns (D1S1 through D2S4)
imputer = SimpleImputer(strategy='median')
wide_data.iloc[:, 0:10] = imputer.fit_transform(wide_data.iloc[:, 0:10])
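
A quick optional check that the sampling columns are now complete:

# verify that no missing values remain in the imputed columns
print(wide_data.iloc[:, 0:10].isna().sum())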

Merging datasets

In [119]:
df = pd.merge(df, wide_data, left_on='study_no', right_on='Subject Number')
df.head()
Out[119]:
study_no Gest_age_del age_s2 RMTS2 AMTpa AMTap AMTlm AMTap_pa_prop SI1mVS2 RMTcTBS_S2 ... D1S3 D1S4 D1S5 D1S6 D2S1 D2S2 D2S3 D2S4 D2S5 cTBS
0 1411 274.0 13.583333 40.0 34.0 48.0 37.0 1.411765 61.0 53.0 ... 3.39357 2.81418 0.55180 0.33100 7.55966 11.53262 11.14636 7.532070 0.82770 3.44875
1 1833 265.0 12.083333 48.0 35.0 53.0 46.0 1.514286 81.0 65.0 ... 14.70547 13.71223 1.82094 0.96565 2.70382 8.63567 14.01572 15.560760 1.60022 1.54504
2 2045 287.0 13.833333 47.0 38.0 44.0 39.0 1.157895 53.0 55.0 ... 7.09063 8.35977 6.12498 2.04166 6.12498 6.84232 7.83556 7.200990 3.11767 3.11767
3 2060 278.0 12.750000 46.0 35.0 42.0 40.0 1.200000 56.0 55.0 ... 7.69761 6.48365 0.85529 0.33100 4.60753 4.27645 3.44875 8.097665 0.46903 1.04842
4 2152 278.0 14.083333 50.0 43.0 60.0 52.0 1.395349 63.0 68.0 ... 7.06304 4.41440 2.97972 0.88288 8.24941 2.42792 14.04331 15.174500 2.78659 2.97972

5 rows × 104 columns

Now each subject in the study database has their cortisol samples as columns.

Dealing with time data

Now we need to create some variables. Some of the data represent times of day but are stored as integers (e.g. 930 for 09:30 am). We'll convert these to datetime and then calculate some variables we'll need later in the analysis.
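
One caveat: %H%M parsing expects four digits, so if any of the integer times lack a leading zero (e.g. 930), they should be zero-padded first. A minimal sketch of that pattern (illustrative values, not the study data):

# zero-pad integer clock times before parsing, e.g. 930 -> '0930' -> 09:30
t = pd.Series([930, 1345, 800])
pd.to_datetime(t.astype(str).str.zfill(4), format='%H%M').dt.strftime('%H:%M').tolist()
# ['09:30', '13:45', '08:00']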

Sampling duration

In [120]:
# calculating the duration of each subject's day (first to last sample)
for t in range(1,7):
    df['D1T{}dt'.format(t)] = pd.to_datetime(df['D1T{}'.format(t)], format='%H%M')
In [121]:
df['duration'] = (df['D1T6dt'] - df['D1T1dt'])
In [122]:
df['duration_hours'] = df['duration'].dt.total_seconds() / 3600

Sampling latency

In [123]:
# calculating the time taken between awakening and first sample
var_list = ['WakeTimeD1','WakeTimeD2','D1T1','D2T1']
for var in var_list:
    df[var+'dt'] = pd.to_datetime(df[var], format='%H%M')
    
df['sample_latencyD1'] = df['D1T1dt'] - df['WakeTimeD1dt']
df['sample_latencyD1_mins'] = df['sample_latencyD1'].dt.total_seconds() / 60

df['sample_latencyD2'] = df['D2T1dt'] - df['WakeTimeD2dt']
df['sample_latencyD2_mins'] = df['sample_latencyD2'].dt.total_seconds() / 60

df['sample_latency_mean_mins'] = (df['sample_latencyD1_mins'] + df['sample_latencyD2_mins'])/2

Inter-sample intervals

In [124]:
# time taken between each sample
times = ['D1T1dt','D1T2dt','D1T3dt','D1T4dt','D1T5dt','D1T6dt']

for i in range(1,6):
    df['interval{}'.format(i)] = (df[times[i]] - df[times[i-1]]).dt.total_seconds() / 60

Calculating AUC

Perhaps the most important step in this analysis is calculating the area under the curve (AUC) of the cortisol awakening response, as this is the primary index of the CAR. There are two types: AUC with respect to ground (AUCg) and AUC with respect to increase (AUCi). The former represents total hormonal output, while the latter reflects responsivity, i.e. the rate of change across the samples. We'll calculate each of these for each day and then take the mean.
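
Concretely, with the four post-awakening samples S1-S4 taken 15 minutes apart, the trapezoidal rule used in the next cell works out to:

AUCg = 15*(S1 + S2)/2 + 15*(S2 + S3)/2 + 15*(S3 + S4)/2
AUCi = AUCg - S1*(15*4)

i.e. AUCi is AUCg with the baseline (first-sample) level subtracted out.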

In [125]:
# AUC formulas
aucg_d1_formula = lambda x: (((x.D1S2 + x.D1S1)*15)/2) + (((x.D1S3 + x.D1S2)*15)/2) + (((x.D1S4 + x.D1S3)*15)/2)
auci_d1_formula = lambda x: ((((x.D1S2 + x.D1S1)*15)/2) + (((x.D1S3 + x.D1S2)*15)/2) + (((x.D1S4 + x.D1S3)*15)/2)) - x.D1S1*(15*4)

aucg_d2_formula = lambda x: ((((x.D2S2 + x.D2S1)*15)/2) + (((x.D2S3 + x.D2S2)*15)/2) + (((x.D2S4 + x.D2S3)*15)/2))
auci_d2_formula = lambda x: ((((x.D2S2 + x.D2S1)*15)/2) + (((x.D2S3 + x.D2S2)*15)/2) + (((x.D2S4 + x.D2S3)*15)/2)) - x.D2S1*(15*4)
In [126]:
# Computing AUCs
# Day 1
df['AUCgD1'] = df.apply(aucg_d1_formula, axis=1)
df['AUCiD1'] = df.apply(auci_d1_formula, axis=1)
# Day 2
df['AUCgD2'] = df.apply(aucg_d2_formula, axis=1)
df['AUCiD2'] = df.apply(auci_d2_formula, axis=1)
# Mean
df['AUCimean'] = (df['AUCiD1'] + df['AUCiD2'])/2
df['AUCgmean'] = (df['AUCgD1'] + df['AUCgD2'])/2
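
As an optional sanity check (a sketch, assuming the nominal 0/15/30/45-minute sample times), the day 1 AUCg values can be recovered with NumPy's built-in trapezoidal rule:

# recompute day-1 AUCg with np.trapz and compare against the formula above
check = df.apply(lambda x: np.trapz(x[['D1S1', 'D1S2', 'D1S3', 'D1S4']].astype(float), [0, 15, 30, 45]), axis=1)
assert np.allclose(check, df['AUCgD1'])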

Calculating the diurnal slope

In [127]:
subjects = list(df['study_no'].unique())
values = {'D1S1':0, 'D1S2':15, 'D1S3':30, 'D1S4':45, 'D1S5':60, 'D1S6':75}
cort_data_slope = cort_data.copy()  # copy so the raw data isn't mutated
cort_data_slope['Sample'] = cort_data_slope['Sample'].replace(values)
# selecting data from day 1 (only the day 1 sample codes were mapped to numbers)
cort_data_slope = cort_data_slope[cort_data_slope['Sample'].isin([0,15,30,45,60,75])]

# slope function - fits a regression line through each subject's awakening,
# afternoon, and evening samples (codes 0, 60, and 75) and returns its slope
def slope(subject):
    data = cort_data_slope[cort_data_slope['Subject Number']==subject]
    data = data[~data['Sample'].isin([15,30,45])]
    X = sm.add_constant(data['Sample'].values)
    y = data['Cortisol (nmol/L)']
    reg = sm.OLS(y.astype(float), X.astype(float)).fit()
    return reg.params[1]  # the slope coefficient
In [128]:
# compute slopes and add to the dataframe
df['diurnal_slope'] = np.nan

for subject in subjects:
    b = slope(subject)
    df.loc[df['study_no']==subject, 'diurnal_slope'] = b
In [87]:
# now that the data is ready, we split into groups to aid in later comparisons
gdm = df[df['GDM']==1]
control = df[df['GDM']==0]

Converting back to long

We'll need a long dataset to easily plot the CAR later.

In [138]:
d1_long = pd.melt(df, id_vars=['study_no','GDM'], value_vars=df.loc[:, 'D1S1':'D1S6'].columns,
                  var_name='sample', value_name='cortisol')
d1_long['day'] = 1

d2_long = pd.melt(df, id_vars=['study_no','GDM'], value_vars=df.loc[:, 'D2S1':'cTBS'].columns,
                  var_name='sample', value_name='cortisol')
d2_long['day'] = 2
In [139]:
# changing the sample names for easier plotting
sample_names_d1 = {'D1S1':0, 'D1S2':15, 'D1S3':30, 'D1S4':45, 'D1S5':60, 'D1S6':75}
sample_names_d2 = {'D2S1':0, 'D2S2':15, 'D2S3':30, 'D2S4':45, 'D2S5':60, 'cTBS':75}

d1_long = d1_long.replace({'sample':sample_names_d1})
d2_long = d2_long.replace({'sample':sample_names_d2})
In [140]:
# concatenate into one dataframe
df_long = pd.concat([d1_long, d2_long])

Statistics

Now that's done, we can start analysing the data!

Descriptives

In [109]:
round(df[['AUCimean','AUCgmean','auc_diurnal_d1','diurnal_slope']].describe(), 2)
Out[109]:
AUCimean AUCgmean auc_diurnal_d1 diurnal_slope
count 32.00 32.00 31.00 27.00
mean 8.87 416.70 4667.15 -0.08
std 159.66 158.25 1866.87 0.07
min -455.65 122.91 957.69 -0.29
25% -74.86 307.57 2968.29 -0.11
50% -2.28 447.48 4721.62 -0.05
75% 101.37 518.50 6114.94 -0.03
max 318.46 763.45 8072.60 0.03

Normality tests

We'll be running linear models and ANOVAs, so first we'll check that our data are normally distributed. (stats.shapiro returns the test statistic W and its p-value, in that order; p > .05 indicates no significant departure from normality.)

In [51]:
print(stats.shapiro(df['AUCimean']))
print(stats.shapiro(df['AUCgmean']))
(0.9635738134384155, 0.34294548630714417)
(0.9709528684616089, 0.5261117219924927)
In [52]:
print(stats.shapiro(gdm['AUCimean']))
print(stats.shapiro(gdm['AUCgmean']))
(0.9537766575813293, 0.37444818019866943)
(0.9686221480369568, 0.6791777014732361)
In [53]:
print(stats.shapiro(control['AUCimean']))
print(stats.shapiro(control['AUCgmean']))
(0.9695404767990112, 0.8865522146224976)
(0.9240884184837341, 0.3923186957836151)

Difference in AUCs

In [26]:
model = smf.ols('AUCgmean ~ C(GDM)', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
print(df.groupby('GDM')['AUCgmean'].agg(['mean','std']))
print('Normality:   ', stats.shapiro(model.resid))
print(stats.levene(gdm['AUCgmean'], control['AUCgmean']))
                 sum_sq    df        mean_sq         F    PR(>F)    eta_sq  \
C(GDM)    120904.429150   1.0  120904.429150  5.533686  0.025409  0.155731   
Residual  655464.185743  30.0   21848.806191       NaN       NaN       NaN   

          omega_sq  
C(GDM)    0.124096  
Residual       NaN  
           mean         std
GDM                        
0    507.866374  127.002616
1    375.253785  155.884146
Normality:    (0.9776697754859924, 0.7295458912849426)
LeveneResult(statistic=1.150873739559068, pvalue=0.29192041912223843)
In [113]:
model = smf.ols('AUCimean ~ C(GDM)', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
print(df.groupby('GDM')['AUCimean'].agg(['mean','std']))
print('Normality:   ', stats.shapiro(model.resid))
print(stats.levene(gdm['AUCimean'], control['AUCimean']))
                 sum_sq    df       mean_sq         F    PR(>F)    eta_sq  \
C(GDM)      5103.760568   1.0   5103.760568  0.195026  0.661933  0.006459   
Residual  785091.007695  30.0  26169.700256       NaN       NaN       NaN   

          omega_sq  
C(GDM)   -0.025805  
Residual       NaN  
          mean         std
GDM                       
0    -9.859976  101.642931
1    17.386403  181.542261
Normality:    (0.9619717001914978, 0.3107261657714844)
LeveneResult(statistic=2.686413664192418, pvalue=0.11165406436510628)
In [29]:
model = smf.ols('aucg_diurnal_d1 ~ C(GDM) + duration_hours + interval4', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
print(df.groupby('GDM')['auc_diurnal_d1'].agg(['mean','std']))
                      sum_sq    df       mean_sq         F    PR(>F)  \
C(GDM)          3.675610e+06   1.0  3.675610e+06  4.943904  0.034742   
duration_hours  1.900929e+04   1.0  1.900929e+04  0.025569  0.874149   
interval4       5.664103e+06   1.0  5.664103e+06  7.618540  0.010253   
Residual        2.007350e+07  27.0  7.434630e+05       NaN       NaN   

                  eta_sq  omega_sq  
C(GDM)          0.124884  0.097169  
duration_hours  0.000646 -0.024008  
interval4       0.192446  0.163066  
Residual             NaN       NaN  
            mean          std
GDM                          
0    5665.100191  1626.178084
1    4258.899540  1835.759955
In [141]:
#fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
sns.set(style='ticks', font_scale=1.3) #rc={'figure.figsize':(4,4)}, 

g = sns.catplot(data=df_long, x='sample', y='cortisol', hue='GDM', kind='point', 
                capsize=0.1, legend=True, aspect=1.3, ci=95)

g.set_xticklabels(labels=['0 min','15 min','30 min','45 min','Afternoon','Evening'])
g.set_xlabels('Time (post-awakening)')
g.set_ylabels('Cortisol (nmol/L)')
#plt.legend(title='Group', loc='upper right', labels=['Control', 'GDM'])
g._legend.texts[0].set_text("Control")
g._legend.texts[1].set_text("GDM")
g._legend.set_title("")
g._legend.set_bbox_to_anchor((0.9, 0.75))

plt.tight_layout()

Difference in diurnal slopes

In [143]:
model = smf.ols('diurnal_slope ~ C(GDM)', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
print(df.groupby('GDM')['diurnal_slope'].agg(['mean','std']))
print('Normality:   ', stats.shapiro(model.resid))
            sum_sq    df   mean_sq         F    PR(>F)    eta_sq  omega_sq
C(GDM)    0.004985   1.0  0.004985  1.133969  0.297108  0.043391  0.004937
Residual  0.109895  25.0  0.004396       NaN       NaN       NaN       NaN
         mean       std
GDM                    
0   -0.102127  0.031100
1   -0.069444  0.072477
Normality:    (0.8813735246658325, 0.00514401588588953)
In [111]:
model = smf.ols('diurnal_slope ~ C(GDM) + duration_hours', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
                  sum_sq    df   mean_sq         F    PR(>F)    eta_sq  \
C(GDM)          0.005733   1.0  0.005733  1.333511  0.259550  0.049578   
duration_hours  0.006722   1.0  0.006722  1.563584  0.223196  0.058132   
Residual        0.103174  24.0  0.004299       NaN       NaN       NaN   

                omega_sq  
C(GDM)          0.011955  
duration_hours  0.020202  
Residual             NaN  
In [112]:
model = smf.ols('diurnal_slope ~ S1mean', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
            sum_sq    df   mean_sq          F        PR(>F)   eta_sq  omega_sq
S1mean    0.080678   1.0  0.080678  58.971586  4.913379e-08  0.70228  0.682247
Residual  0.034202  25.0  0.001368        NaN           NaN      NaN       NaN
In [142]:
sns.set(style='ticks', font_scale=1.3) #rc={'figure.figsize':(4,4)}, 

diurnal_long = df_long[~df_long['sample'].isin([15,30,45])]

g = sns.lmplot(x='sample', y='cortisol', data=diurnal_long, hue='GDM', x_estimator=np.mean, legend=True, aspect=1, ci=95)
ax = g.axes[0,0]

g.set_xticklabels(labels=['','Awakening','','','Afternoon','Evening'])
g.set_xlabels('')
g.set_ylabels('Cortisol (nmol/L)')
#plt.legend(title='Group', loc='upper right', labels=['Control', 'GDM'])
g._legend.texts[0].set_text("Control")
g._legend.texts[1].set_text("GDM")
g._legend.set_title("")
g._legend.set_bbox_to_anchor((0.9, 0.75))
#ax.xaxis.set_tick_params(length=0)
#ax.xaxis.set_major_locator(plt.NullLocator())
xticks = ax.xaxis.get_major_ticks()
xticks[2].set_visible(False)
xticks[3].set_visible(False)

plt.tight_layout()

Association between S1 and AUC

In [80]:
model = smf.ols('AUCimean ~ S1mean', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
sns.lmplot(x='S1mean', y='AUCimean', data=df, hue='GDM')
                 sum_sq    df        mean_sq          F    PR(>F)   eta_sq  \
S1mean    342225.716266   1.0  342225.716266  22.918484  0.000042  0.43309   
Residual  447969.051997  30.0   14932.301733        NaN       NaN      NaN   

          omega_sq  
S1mean    0.406512  
Residual       NaN  
Out[80]:
<seaborn.axisgrid.FacetGrid at 0x1698d3d7be0>

Effect of sampling latency

In [46]:
# Sampling latency
model = smf.ols('AUCgmean ~ sample_latency_mean_mins', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
                                 sum_sq    df       mean_sq         F  \
sample_latency_mean_mins    5728.304733   1.0   5728.304733  0.160328   
Residual                  607387.285654  17.0  35728.663862       NaN   

                            PR(>F)    eta_sq  omega_sq  
sample_latency_mean_mins  0.693842  0.009343 -0.046237  
Residual                       NaN       NaN       NaN  
In [47]:
# Sampling latency
model = smf.ols('AUCimean ~ sample_latency_mean_mins', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
                                 sum_sq    df       mean_sq         F  \
sample_latency_mean_mins    1188.403691   1.0   1188.403691  0.059162   
Residual                  341484.751258  17.0  20087.338309       NaN   

                            PR(>F)    eta_sq  omega_sq  
sample_latency_mean_mins  0.810736  0.003468 -0.052098  
Residual                       NaN       NaN       NaN  

Difference from baseline

In [50]:
samples = ['S2mean','S3mean','S4mean']
for s in samples:
    print(stats.ttest_rel(df['S1mean'], df[s]))
Ttest_relResult(statistic=-3.8007236551469226, pvalue=0.0006335539550446315)
Ttest_relResult(statistic=-5.384080173617944, pvalue=7.138616851783919e-06)
Ttest_relResult(statistic=-3.399751510609136, pvalue=0.0018724461895478228)