import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
import scipy.stats as stats
import numpy as np
# define ANOVA function to calculate adjusted effect sizes
def anova_table(aov):
aov['mean_sq'] = aov[:]['sum_sq']/aov[:]['df']
aov['eta_sq'] = aov[:-1]['sum_sq']/sum(aov['sum_sq'])
aov['omega_sq'] = (aov[:-1]['sum_sq']-(aov[:-1]['df']*aov['mean_sq'][-1]))/(sum(aov['sum_sq'])+aov['mean_sq'][-1])
cols = ['sum_sq', 'df', 'mean_sq', 'F', 'PR(>F)', 'eta_sq', 'omega_sq']
aov = aov[cols]
return aov
# load the raw cortisol data
cort_data = pd.read_excel('cortisol_data.xlsx')
cort_data.head()
# load the study data
df = pd.read_excel('data.xlsx')
As seen above, the raw cortisol data is output from an ELISA plate reader, so is long form, with each row representing a single cortisol sample. This needs to be converted into wide data so it can be merged into the existing study database (by subject).
wide_data = cort_data.pivot(index='Subject Number', columns='Sample', values='Cortisol (nmol/L)')
wide_data.head()
# assigning cTBS values to D2S5 where appropriate (these samples are equivalent in some subjects)
wide_data['D2S5'] = np.where(wide_data['D2S5'].isna(), wide_data['cTBS'], wide_data['D2S5'])
That's better. Before merging the data, first we will impute some missing data. This is a small sample, so dropping subjects with missing samples is not possible.
# imputing missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'median')
imputer = imputer.fit(wide_data.iloc[:, 0:10])
wide_data.iloc[:, 0:10] = imputer.transform(wide_data.iloc[:, 0:10])
df = pd.merge(df, wide_data, left_on='study_no', right_on='Subject Number')
df.head()
Now each subject in the study database has their cortisol samples as columns.
Now we need to create some variables. Some of the data represents time, but is stored as int (e.g. 930 for 09:30 am). We'll convert these to datetime and then calculate some variables we'll need later in analysis.
# calculating the duration of each subject's day (first to last sample)
for t in range(1,7):
df['D1T{}dt'.format(t)] = pd.to_datetime(df['D1T{}'.format(t)], format='%H%M')
df['duration'] = (df['D1T6dt'] - df['D1T1dt'])
df['duration_hours'] = df.apply(lambda x: (x['D1T6dt']-x['D1T1dt']).seconds/60/60, axis=1)
# calculating the time taken between awakening and first sample
var_list = ['WakeTimeD1','WakeTimeD2','D1T1','D2T1']
for var in var_list:
df[var+'dt'] = pd.to_datetime(df[var], format='%H%M')
df['sample_latencyD1'] = df['D1T1dt'] - df['WakeTimeD1dt']
df['sample_latencyD1_mins'] = df.apply(lambda x: x['sample_latencyD1'].seconds/60, axis=1)
df['sample_latencyD2'] = df['D2T1dt'] - df['WakeTimeD2dt']
df['sample_latencyD2_mins'] = df.apply(lambda x: x['sample_latencyD2'].seconds/60, axis=1)
df['sample_latency_mean_mins'] = (df['sample_latencyD1_mins'] + df['sample_latencyD2_mins'])/2
# time taken between each sample
times = ['D1T1dt','D1T2dt','D1T3dt','D1T4dt','D1T5dt','D1T6dt']
for i in range(1,6):
df['interval{}'.format(i)] = df.apply(lambda x: (x[times[i]]-x[times[i-1]]).seconds/60, axis=1)
Perhaps the most important step in this analysis is calculating the area under the curve (AUC) of the cortisol awakening response, as this is the primary index of the CAR. There are two types: AUC with respect to ground (AUCg) and AUC with respect to increase (AUCi). The former represents total hormonal output, while the latter reflects responsitivity, or rate of change over the samples. We'll calculate each of these for each day and then take the mean.
# AUC formulas
aucg_d1_formula = lambda x: (((x.D1S2 + x.D1S1)*15)/2) + (((x.D1S3 + x.D1S2)*15)/2) + (((x.D1S4 + x.D1S3)*15)/2)
auci_d1_formula = lambda x: ((((x.D1S2 + x.D1S1)*15)/2) + (((x.D1S3 + x.D1S2)*15)/2) + (((x.D1S4 + x.D1S3)*15)/2)) - x.D1S1*(15*4)
aucg_d2_formula = lambda x: ((((x.D2S2 + x.D2S1)*15)/2) + (((x.D2S3 + x.D2S2)*15)/2) + (((x.D2S4 + x.D2S3)*15)/2))
auci_d2_formula = lambda x: ((((x.D2S2 + x.D2S1)*15)/2) + (((x.D2S3 + x.D2S2)*15)/2) + (((x.D2S4 + x.D2S3)*15)/2)) - x.D2S1*(15*4)
# Computing AUCs
# Day 1
df['AUCgD1'] = df.apply(aucg_d1_formula, axis=1)
df['AUCiD1'] = df.apply(auci_d1_formula, axis=1)
# Day 2
df['AUCgD2'] = df.apply(aucg_d2_formula, axis=1)
df['AUCiD2'] = df.apply(auci_d2_formula, axis=1)
# Mean
df['AUCimean'] = (df['AUCiD1'] + df['AUCiD2'])/2
df['AUCgmean'] = (df['AUCgD1'] + df['AUCgD2'])/2
subjects = list(df['study_no'].unique())
values = {'D1S1':0, 'D1S2':15, 'D1S3':30, 'D1S4':45, 'D1S5':60, 'D1S6':75}
cort_data_slope = cort_data
cort_data_slope['Sample'].replace(values, inplace=True)
# selecting data from day 1
cort_data_slope = cort_data_slope[cort_data_slope['Sample'].isin([0,15,30,45,60,75])]
# slope function - plots a regression line through each subjects morning, afternoon, and night samples
def slope(subject):
data = cort_data_slope[cort_data_slope['Subject Number']==subject]
data = data[~data['Sample'].isin([15,30,45])]
X = data['Sample']
X = sm.add_constant(X.values)
y = data['Cortisol (nmol/L)']
reg = sm.OLS(y.astype(float), X.astype(float)).fit()
b = reg.params[1]
return b
# compute slopes and add to the dataframe
df['diurnal_slope'] = np.nan
for subject in subjects:
b = slope(subject)
df.loc[df['study_no']==subject, 'diurnal_slope'] = b
# now that the data is ready, we split into groups to aid in later comparisons
gdm = df[df['GDM']==1]
control = df[df['GDM']==0]
We'll need a long dataset to easily plot the CAR later.
d1_long = pd.melt(id_vars=['study_no','GDM'], value_vars=df.loc[:,'D1S1':'D1S6'], var_name='sample', value_name='cortisol', frame=df)
d1_long['day'] = 1
d2_long = pd.melt(id_vars=['study_no','GDM'], value_vars=df.loc[:,'D2S1':'cTBS'], var_name='sample', value_name='cortisol', frame=df)
d2_long['day'] = 2
# changing the sample names for easier plotting
sample_names_d1 = {'D1S1':0, 'D1S2':15, 'D1S3':30, 'D1S4':45, 'D1S5':60, 'D1S6':75}
sample_names_d2 = {'D2S1':0, 'D2S2':15, 'D2S3':30, 'D2S4':45, 'D2S5':60, 'cTBS':75}
d1_long = d1_long.replace({'sample':sample_names_d1})
d2_long = d2_long.replace({'sample':sample_names_d2})
# concatenate into one dataframe
df_long = pd.concat([d1_long, d2_long])
Now that's done, we can start analysing the data!
round(df[['AUCimean','AUCgmean','auc_diurnal_d1','diurnal_slope']].describe(), 2)
We'll be running linear models and ANOVAs, so first we'll check that our data is normally distributed.
print(stats.shapiro(df['AUCimean']))
print(stats.shapiro(df['AUCgmean']))
print(stats.shapiro(gdm['AUCimean']))
print(stats.shapiro(gdm['AUCgmean']))
print(stats.shapiro(control['AUCimean']))
print(stats.shapiro(control['AUCgmean']))
model = smf.ols('AUCgmean ~ C(GDM)', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
print(df.groupby('GDM')['AUCgmean'].agg(['mean','std']))
print('Normality: ', stats.shapiro(model.resid))
print(stats.levene(gdm['AUCgmean'], control['AUCgmean']))
model = smf.ols('AUCimean ~ C(GDM)', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
print(df.groupby('GDM')['AUCimean'].agg(['mean','std']))
print('Normality: ', stats.shapiro(model.resid))
print(stats.levene(gdm['AUCimean'], control['AUCimean']))
model = smf.ols('aucg_diurnal_d1 ~ C(GDM) + duration_hours + interval4', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
print(df.groupby('GDM')['auc_diurnal_d1'].agg(['mean','std']))
#fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
sns.set(style='ticks', font_scale=1.3) #rc={'figure.figsize':(4,4)},
g = sns.catplot(data=df_long, x='sample', y='cortisol', hue='GDM', kind='point',
capsize=0.1, legend=True, aspect=1.3, ci=95)
g.set_xticklabels(labels=['0 min','15 min','30 min','45 min','Afternoon','Evening'])
g.set_xlabels('Time (post-awakening)')
g.set_ylabels('Cortisol (nmol/L)')
#plt.legend(title='Group', loc='upper right', labels=['Control', 'GDM'])
g._legend.texts[0].set_text("Control")
g._legend.texts[1].set_text("GDM")
g._legend.set_title("")
g._legend.set_bbox_to_anchor((0.9, 0.75))
plt.tight_layout()
model = smf.ols('diurnal_slope ~ C(GDM)', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
print(df.groupby('GDM')['diurnal_slope'].agg(['mean','std']))
print('Normality: ', stats.shapiro(model.resid))
model = smf.ols('diurnal_slope ~ C(GDM) + duration_hours', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
model = smf.ols('diurnal_slope ~ S1mean', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
sns.set(style='ticks', font_scale=1.3) #rc={'figure.figsize':(4,4)},
diurnal_long = df_long[~df_long['sample'].isin([15,30,45])]
g = sns.lmplot(x='sample', y='cortisol', data=diurnal_long, hue='GDM', x_estimator=np.mean, legend=True, aspect=1, ci=95)
ax = g.axes[0,0]
g.set_xticklabels(labels=['','Awakening','','','Afternoon','Evening'])
g.set_xlabels('')
g.set_ylabels('Cortisol (nmol/L)')
#plt.legend(title='Group', loc='upper right', labels=['Control', 'GDM'])
g._legend.texts[0].set_text("Control")
g._legend.texts[1].set_text("GDM")
g._legend.set_title("")
g._legend.set_bbox_to_anchor((0.9, 0.75))
#ax.xaxis.set_tick_params(length=0)
#ax.xaxis.set_major_locator(plt.NullLocator())
xticks = ax.xaxis.get_major_ticks()
xticks[2].set_visible(False)
xticks[3].set_visible(False)
plt.tight_layout()
model = smf.ols('AUCimean ~ S1mean', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
sns.lmplot(x='S1mean', y='AUCimean', data=df, hue='GDM')
# Sampling latency
model = smf.ols('AUCgmean ~ sample_latency_mean_mins', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
# Sampling latency
model = smf.ols('AUCimean ~ sample_latency_mean_mins', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
samples = ['S2mean','S3mean','S4mean']
for s in samples:
print(stats.ttest_rel(df['S1mean'], df['{}'.format(s)]))