# define ANOVA function to calculate adjusted effect sizes
def anova_table(aov):
    aov['mean_sq'] = aov[:]['sum_sq']/aov[:]['df']
    aov['eta_sq'] = aov[:-1]['sum_sq']/sum(aov['sum_sq'])
    aov['omega_sq'] = (aov[:-1]['sum_sq']-(aov[:-1]['df']*aov['mean_sq'][-1]))/(sum(aov['sum_sq'])+aov['mean_sq'][-1])
    cols = ['sum_sq', 'df', 'mean_sq', 'F', 'PR(>F)', 'eta_sq', 'omega_sq']
    aov = aov[cols]
    return aov

Loading the data¶

# load the raw cortisol data
cort_data = pd.read_excel('cortisol_data.xlsx')
cort_data.head()

# load the study data
df = pd.read_excel('data.xlsx')

Wrangling the data into shape¶

As seen above, the raw cortisol data is output from an ELISA plate reader, so is long form, with each row representing a single cortisol sample. This needs to be converted into wide data so it can be merged into the existing study database (by subject).

wide_data = cort_data.pivot(index='Subject Number', columns='Sample', values='Cortisol (nmol/L)')
wide_data.head()

# assigning cTBS values to D2S5 where appropriate (these samples are equivalent in some subjects)
wide_data['D2S5'] = np.where(wide_data['D2S5'].isna(), wide_data['cTBS'], wide_data['D2S5'])

That's better. Before merging the data, first we will impute some missing data. This is a small sample, so dropping subjects with missing samples is not possible.

# imputing missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'median')
imputer = imputer.fit(wide_data.iloc[:, 0:10])
wide_data.iloc[:, 0:10] = imputer.transform(wide_data.iloc[:, 0:10])

Merging datasets¶

df = pd.merge(df, wide_data, left_on='study_no', right_on='Subject Number')
df.head()

Now each subject in the study database has their cortisol samples as columns.

Dealing with time data¶

Now we need to create some variables. Some of the data represents time, but is stored as int (e.g. 930 for 09:30 am). We'll convert these to datetime and then calculate some variables we'll need later in analysis.

Sampling duration¶

# calculating the duration of each subject's day (first to last sample)
for t in range(1,7):
    df['D1T{}dt'.format(t)] = pd.to_datetime(df['D1T{}'.format(t)], format='%H%M')

df['duration'] = (df['D1T6dt'] - df['D1T1dt'])

df['duration_hours'] = df.apply(lambda x: (x['D1T6dt']-x['D1T1dt']).seconds/60/60, axis=1)

Sampling latency¶

# calculating the time taken between awakening and first sample
var_list = ['WakeTimeD1','WakeTimeD2','D1T1','D2T1']
for var in var_list:
    df[var+'dt'] = pd.to_datetime(df[var], format='%H%M')
    
df['sample_latencyD1'] = df['D1T1dt'] - df['WakeTimeD1dt']
df['sample_latencyD1_mins'] = df.apply(lambda x: x['sample_latencyD1'].seconds/60, axis=1)

df['sample_latencyD2'] = df['D2T1dt'] - df['WakeTimeD2dt']
df['sample_latencyD2_mins'] = df.apply(lambda x: x['sample_latencyD2'].seconds/60, axis=1)

df['sample_latency_mean_mins'] = (df['sample_latencyD1_mins'] + df['sample_latencyD2_mins'])/2

Inter-sample intervals¶

# time taken between each sample
times = ['D1T1dt','D1T2dt','D1T3dt','D1T4dt','D1T5dt','D1T6dt']

for i in range(1,6):
    df['interval{}'.format(i)] = df.apply(lambda x: (x[times[i]]-x[times[i-1]]).seconds/60, axis=1)

Calculating AUC¶

Perhaps the most important step in this analysis is calculating the area under the curve (AUC) of the cortisol awakening response, as this is the primary index of the CAR. There are two types: AUC with respect to ground (AUCg) and AUC with respect to increase (AUCi). The former represents total hormonal output, while the latter reflects responsitivity, or rate of change over the samples. We'll calculate each of these for each day and then take the mean.

# AUC formulas
aucg_d1_formula = lambda x: (((x.D1S2 + x.D1S1)*15)/2) + (((x.D1S3 + x.D1S2)*15)/2) + (((x.D1S4 + x.D1S3)*15)/2)
auci_d1_formula = lambda x: ((((x.D1S2 + x.D1S1)*15)/2) + (((x.D1S3 + x.D1S2)*15)/2) + (((x.D1S4 + x.D1S3)*15)/2)) - x.D1S1*(15*4)

aucg_d2_formula = lambda x: ((((x.D2S2 + x.D2S1)*15)/2) + (((x.D2S3 + x.D2S2)*15)/2) + (((x.D2S4 + x.D2S3)*15)/2))
auci_d2_formula = lambda x: ((((x.D2S2 + x.D2S1)*15)/2) + (((x.D2S3 + x.D2S2)*15)/2) + (((x.D2S4 + x.D2S3)*15)/2)) - x.D2S1*(15*4)

# Computing AUCs
# Day 1
df['AUCgD1'] = df.apply(aucg_d1_formula, axis=1)
df['AUCiD1'] = df.apply(auci_d1_formula, axis=1)
# Day 2
df['AUCgD2'] = df.apply(aucg_d2_formula, axis=1)
df['AUCiD2'] = df.apply(auci_d2_formula, axis=1)
# Mean
df['AUCimean'] = (df['AUCiD1'] + df['AUCiD2'])/2
df['AUCgmean'] = (df['AUCgD1'] + df['AUCgD2'])/2

Calculating the diurnal slope¶

subjects = list(df['study_no'].unique())
values = {'D1S1':0, 'D1S2':15, 'D1S3':30, 'D1S4':45, 'D1S5':60, 'D1S6':75}
cort_data_slope = cort_data
cort_data_slope['Sample'].replace(values, inplace=True)
# selecting data from day 1
cort_data_slope = cort_data_slope[cort_data_slope['Sample'].isin([0,15,30,45,60,75])]

# slope function - plots a regression line through each subjects morning, afternoon, and night samples
def slope(subject):
    data = cort_data_slope[cort_data_slope['Subject Number']==subject]
    data = data[~data['Sample'].isin([15,30,45])]
    X = data['Sample']
    X = sm.add_constant(X.values)
    y = data['Cortisol (nmol/L)']
    reg = sm.OLS(y.astype(float), X.astype(float)).fit()
    b = reg.params[1]
    return b

# compute slopes and add to the dataframe
df['diurnal_slope'] = np.nan

for subject in subjects:
    b = slope(subject)
    df.loc[df['study_no']==subject, 'diurnal_slope'] = b

# now that the data is ready, we split into groups to aid in later comparisons
gdm = df[df['GDM']==1]
control = df[df['GDM']==0]

Converting back to long¶

We'll need a long dataset to easily plot the CAR later.

d1_long = pd.melt(id_vars=['study_no','GDM'], value_vars=df.loc[:,'D1S1':'D1S6'], var_name='sample', value_name='cortisol', frame=df)
d1_long['day'] = 1

d2_long = pd.melt(id_vars=['study_no','GDM'], value_vars=df.loc[:,'D2S1':'cTBS'], var_name='sample', value_name='cortisol', frame=df)
d2_long['day'] = 2

# changing the sample names for easier plotting
sample_names_d1 = {'D1S1':0, 'D1S2':15, 'D1S3':30, 'D1S4':45, 'D1S5':60, 'D1S6':75}
sample_names_d2 = {'D2S1':0, 'D2S2':15, 'D2S3':30, 'D2S4':45, 'D2S5':60, 'cTBS':75}

d1_long = d1_long.replace({'sample':sample_names_d1})
d2_long = d2_long.replace({'sample':sample_names_d2})

# concatenate into one dataframe
df_long = pd.concat([d1_long, d2_long])

Statistics¶

Now that's done, we can start analysing the data!

Descriptives¶

round(df[['AUCimean','AUCgmean','auc_diurnal_d1','diurnal_slope']].describe(), 2)

Normality tests¶

We'll be running linear models and ANOVAs, so first we'll check that our data is normally distributed.

print(stats.shapiro(df['AUCimean']))
print(stats.shapiro(df['AUCgmean']))

(0.9635738134384155, 0.34294548630714417)
(0.9709528684616089, 0.5261117219924927)

print(stats.shapiro(gdm['AUCimean']))
print(stats.shapiro(gdm['AUCgmean']))

(0.9537766575813293, 0.37444818019866943)
(0.9686221480369568, 0.6791777014732361)

print(stats.shapiro(control['AUCimean']))
print(stats.shapiro(control['AUCgmean']))

(0.9695404767990112, 0.8865522146224976)
(0.9240884184837341, 0.3923186957836151)

Difference in AUCs¶

model = smf.ols('AUCgmean ~ C(GDM)', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
print(df.groupby('GDM')['AUCgmean'].agg(['mean','std']))
print('Normality:   ', stats.shapiro(model.resid))
print(stats.levene(gdm['AUCgmean'], control['AUCgmean']))

                 sum_sq    df        mean_sq         F    PR(>F)    eta_sq  \
C(GDM)    120904.429150   1.0  120904.429150  5.533686  0.025409  0.155731   
Residual  655464.185743  30.0   21848.806191       NaN       NaN       NaN   

          omega_sq  
C(GDM)    0.124096  
Residual       NaN  
           mean         std
GDM                        
0    507.866374  127.002616
1    375.253785  155.884146
Normality:    (0.9776697754859924, 0.7295458912849426)
LeveneResult(statistic=1.150873739559068, pvalue=0.29192041912223843)

model = smf.ols('AUCimean ~ C(GDM)', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
print(df.groupby('GDM')['AUCimean'].agg(['mean','std']))
print('Normality:   ', stats.shapiro(model.resid))
print(stats.levene(gdm['AUCimean'], control['AUCimean']))

                 sum_sq    df       mean_sq         F    PR(>F)    eta_sq  \
C(GDM)      5103.760568   1.0   5103.760568  0.195026  0.661933  0.006459   
Residual  785091.007695  30.0  26169.700256       NaN       NaN       NaN   

          omega_sq  
C(GDM)   -0.025805  
Residual       NaN  
          mean         std
GDM                       
0    -9.859976  101.642931
1    17.386403  181.542261
Normality:    (0.9619717001914978, 0.3107261657714844)
LeveneResult(statistic=2.686413664192418, pvalue=0.11165406436510628)

model = smf.ols('aucg_diurnal_d1 ~ C(GDM) + duration_hours + interval4', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
print(df.groupby('GDM')['auc_diurnal_d1'].agg(['mean','std']))

                      sum_sq    df       mean_sq         F    PR(>F)  \
C(GDM)          3.675610e+06   1.0  3.675610e+06  4.943904  0.034742   
duration_hours  1.900929e+04   1.0  1.900929e+04  0.025569  0.874149   
interval4       5.664103e+06   1.0  5.664103e+06  7.618540  0.010253   
Residual        2.007350e+07  27.0  7.434630e+05       NaN       NaN   

                  eta_sq  omega_sq  
C(GDM)          0.124884  0.097169  
duration_hours  0.000646 -0.024008  
interval4       0.192446  0.163066  
Residual             NaN       NaN  
            mean          std
GDM                          
0    5665.100191  1626.178084
1    4258.899540  1835.759955

#fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
sns.set(style='ticks', font_scale=1.3) #rc={'figure.figsize':(4,4)}, 

g = sns.catplot(data=df_long, x='sample', y='cortisol', hue='GDM', kind='point', 
                capsize=0.1, legend=True, aspect=1.3, ci=95)

g.set_xticklabels(labels=['0 min','15 min','30 min','45 min','Afternoon','Evening'])
g.set_xlabels('Time (post-awakening)')
g.set_ylabels('Cortisol (nmol/L)')
#plt.legend(title='Group', loc='upper right', labels=['Control', 'GDM'])
g._legend.texts[0].set_text("Control")
g._legend.texts[1].set_text("GDM")
g._legend.set_title("")
g._legend.set_bbox_to_anchor((0.9, 0.75))

plt.tight_layout()

Difference in diurnal slopes¶

model = smf.ols('diurnal_slope ~ C(GDM)', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
print(df.groupby('GDM')['diurnal_slope'].agg(['mean','std']))
print('Normality:   ', stats.shapiro(model.resid))

            sum_sq    df   mean_sq         F    PR(>F)    eta_sq  omega_sq
C(GDM)    0.004985   1.0  0.004985  1.133969  0.297108  0.043391  0.004937
Residual  0.109895  25.0  0.004396       NaN       NaN       NaN       NaN
         mean       std
GDM                    
0   -0.102127  0.031100
1   -0.069444  0.072477
Normality:    (0.8813735246658325, 0.00514401588588953)

model = smf.ols('diurnal_slope ~ C(GDM) + duration_hours', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))

                  sum_sq    df   mean_sq         F    PR(>F)    eta_sq  \
C(GDM)          0.005733   1.0  0.005733  1.333511  0.259550  0.049578   
duration_hours  0.006722   1.0  0.006722  1.563584  0.223196  0.058132   
Residual        0.103174  24.0  0.004299       NaN       NaN       NaN   

                omega_sq  
C(GDM)          0.011955  
duration_hours  0.020202  
Residual             NaN

model = smf.ols('diurnal_slope ~ S1mean', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))

            sum_sq    df   mean_sq          F        PR(>F)   eta_sq  omega_sq
S1mean    0.080678   1.0  0.080678  58.971586  4.913379e-08  0.70228  0.682247
Residual  0.034202  25.0  0.001368        NaN           NaN      NaN       NaN

sns.set(style='ticks', font_scale=1.3) #rc={'figure.figsize':(4,4)}, 

diurnal_long = df_long[~df_long['sample'].isin([15,30,45])]

g = sns.lmplot(x='sample', y='cortisol', data=diurnal_long, hue='GDM', x_estimator=np.mean, legend=True, aspect=1, ci=95)
ax = g.axes[0,0]

g.set_xticklabels(labels=['','Awakening','','','Afternoon','Evening'])
g.set_xlabels('')
g.set_ylabels('Cortisol (nmol/L)')
#plt.legend(title='Group', loc='upper right', labels=['Control', 'GDM'])
g._legend.texts[0].set_text("Control")
g._legend.texts[1].set_text("GDM")
g._legend.set_title("")
g._legend.set_bbox_to_anchor((0.9, 0.75))
#ax.xaxis.set_tick_params(length=0)
#ax.xaxis.set_major_locator(plt.NullLocator())
xticks = ax.xaxis.get_major_ticks()
xticks[2].set_visible(False)
xticks[3].set_visible(False)

plt.tight_layout()

Association between S1 and AUC¶

model = smf.ols('AUCimean ~ S1mean', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))
sns.lmplot(x='S1mean', y='AUCimean', data=df, hue='GDM')

                 sum_sq    df        mean_sq          F    PR(>F)   eta_sq  \
S1mean    342225.716266   1.0  342225.716266  22.918484  0.000042  0.43309   
Residual  447969.051997  30.0   14932.301733        NaN       NaN      NaN   

          omega_sq  
S1mean    0.406512  
Residual       NaN

<seaborn.axisgrid.FacetGrid at 0x1698d3d7be0>

Effect of sampling latency¶

# Sampling latency
model = smf.ols('AUCgmean ~ sample_latency_mean_mins', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))

                                 sum_sq    df       mean_sq         F  \
sample_latency_mean_mins    5728.304733   1.0   5728.304733  0.160328   
Residual                  607387.285654  17.0  35728.663862       NaN   

                            PR(>F)    eta_sq  omega_sq  
sample_latency_mean_mins  0.693842  0.009343 -0.046237  
Residual                       NaN       NaN       NaN

# Sampling latency
model = smf.ols('AUCimean ~ sample_latency_mean_mins', data=df).fit()
#print(model.summary())
aov_table = sm.stats.anova_lm(model, typ=2)
print(anova_table(aov_table))

                                 sum_sq    df       mean_sq         F  \
sample_latency_mean_mins    1188.403691   1.0   1188.403691  0.059162   
Residual                  341484.751258  17.0  20087.338309       NaN   

                            PR(>F)    eta_sq  omega_sq  
sample_latency_mean_mins  0.810736  0.003468 -0.052098  
Residual                       NaN       NaN       NaN

Difference from baseline¶

samples = ['S2mean','S3mean','S4mean']
for s in samples:
    print(stats.ttest_rel(df['S1mean'], df['{}'.format(s)]))

Ttest_relResult(statistic=-3.8007236551469226, pvalue=0.0006335539550446315)
Ttest_relResult(statistic=-5.384080173617944, pvalue=7.138616851783919e-06)
Ttest_relResult(statistic=-3.399751510609136, pvalue=0.0018724461895478228)

	Subject Number	Sample	Cortisol (ug/dL)	Cortisol (nmol/L)	Notes
0	2707	D1S1	0.182	5.02138	NaN
1	2707	D1S2	0.550	15.17450	NaN
2	2707	D1S3	0.552	15.22968	NaN
3	2707	D1S4	0.444	12.24996	NaN
4	2707	D1S5	0.088	2.42792	NaN

Sample	D1S1	D1S2	D1S3	D1S4	D1S5	D1S6	D2S1	D2S2	D2S3	D2S4	D2S5	cTBS
Subject Number
1411	3.94537	3.50393	3.39357	2.81418	0.55180	0.33100	7.55966	11.53262	11.14636	7.53207	0.82770	3.44875
1833	3.58670	9.90481	14.70547	13.71223	1.82094	0.96565	2.70382	8.63567	14.01572	15.56076	1.60022	1.54504
2045	1.60022	0.88288	7.09063	8.35977	6.12498	2.04166	6.12498	6.84232	7.83556	7.20099	NaN	3.11767
2060	NaN	5.07656	7.69761	6.48365	0.85529	0.33100	4.60753	4.27645	3.44875	NaN	0.46903	1.04842
2152	21.87887	11.36708	7.06304	4.41440	2.97972	0.88288	8.24941	2.42792	14.04331	15.17450	2.78659	2.97972

	study_no	Gest_age_del	age_s2	RMTS2	AMTpa	AMTap	AMTlm	AMTap_pa_prop	SI1mVS2	RMTcTBS_S2	...	D1S3	D1S4	D1S5	D1S6	D2S1	D2S2	D2S3	D2S4	D2S5	cTBS
0	1411	274.0	13.583333	40.0	34.0	48.0	37.0	1.411765	61.0	53.0	...	3.39357	2.81418	0.55180	0.33100	7.55966	11.53262	11.14636	7.532070	0.82770	3.44875
1	1833	265.0	12.083333	48.0	35.0	53.0	46.0	1.514286	81.0	65.0	...	14.70547	13.71223	1.82094	0.96565	2.70382	8.63567	14.01572	15.560760	1.60022	1.54504
2	2045	287.0	13.833333	47.0	38.0	44.0	39.0	1.157895	53.0	55.0	...	7.09063	8.35977	6.12498	2.04166	6.12498	6.84232	7.83556	7.200990	3.11767	3.11767
3	2060	278.0	12.750000	46.0	35.0	42.0	40.0	1.200000	56.0	55.0	...	7.69761	6.48365	0.85529	0.33100	4.60753	4.27645	3.44875	8.097665	0.46903	1.04842
4	2152	278.0	14.083333	50.0	43.0	60.0	52.0	1.395349	63.0	68.0	...	7.06304	4.41440	2.97972	0.88288	8.24941	2.42792	14.04331	15.174500	2.78659	2.97972

	AUCimean	AUCgmean	auc_diurnal_d1	diurnal_slope
count	32.00	32.00	31.00	27.00
mean	8.87	416.70	4667.15	-0.08
std	159.66	158.25	1866.87	0.07
min	-455.65	122.91	957.69	-0.29
25%	-74.86	307.57	2968.29	-0.11
50%	-2.28	447.48	4721.62	-0.05
75%	101.37	518.50	6114.94	-0.03
max	318.46	763.45	8072.60	0.03

Table of Contents

Imports¶