#### Simulation model to explore the biasing effects of controlling for baseline TL in analyses of TL change. ##### # August 2019 # Version incorporating addtional analyses for responses to RSOS referees # written by Melissa Bateson, Newcastle University, UK, melissa.bateson@ncl.ac.uk rm(list=ls()) library(psych) library(plyr) library(ggplot2) library(lme4) library(gridExtra) #### Verhulst function #### # Verhulst function for calculating Verhulst's D telomere attrition measure # # Call: verhulst(name.of.first.measurement, name.of.second.measurement, name.of.data.frame) # First and second must be same length # Returns a vector of same length as data # If you want it the way around used in Bateson et al. 2014 [more attrition equals more negative number], call -verhulst() verhulst<-function(first,second,data){ attach(data) rho = (2*cor(first,second,use="complete.obs")*sd(first, na.rm=TRUE)*sd(second, na.rm=TRUE))/(var(first, na.rm=TRUE) + var(second, na.rm=TRUE)) d = rho*(first-mean(first, na.rm=TRUE)) - (second - mean(second, na.rm=TRUE)) detach(data) return(d) } #### Telomere data function #### # Function to generate longitudinal telomere data with measurement error generate.TL.data = function( years, # the follow up interval baselineTL.mean = 7000, # the mean of the TL distribution at baseline baselineTL.sd = 777, # the sd of the TL distribution at baseline attrition.mean = 30, # mean annual attrition in bp attrition.sd = 46, # attrition sd CV = 10, # Coefficient of variation of measurement error distribution as a percentage proportional.error = TRUE, # toggle to determine whether measurement error is proportional to TL (CVs used) or not n = 1000 # the number of individuals in the simulation ){ CV = CV/100 # convert % to a proportion baseline.true.TL = NULL followup.true.TL = NULL baseline.measured.TL = NULL followup.measured.TL = NULL # generate true baseline TL using mean and sd (assume normally distributed) baseline.true.TL = rnorm(n,baselineTL.mean,baselineTL.sd) # calculate follow-up TL # note that to do this we choose a rate of annual attrition from a normal distribution and apply this for the follow-up interval (10 years here) followup.true.TL = baseline.true.TL-(years*rnorm(n,attrition.mean,attrition.sd)) # calculated 'measured' TL by adding normally distributed measurement error to true values if (proportional.error==TRUE) { baseline.measured.TL = rnorm(n,baseline.true.TL,baseline.true.TL*CV) followup.measured.TL = rnorm(n,followup.true.TL,followup.true.TL*CV) } if (proportional.error==FALSE) { baseline.measured.TL = rnorm(n,baseline.true.TL,7000*CV) # generates measurement error independent of TL followup.measured.TL = rnorm(n,followup.true.TL,7000*CV) } d = data.frame(baseline.true.TL,baseline.measured.TL,followup.true.TL,followup.measured.TL) d$true.attrition.rate = (d$baseline.true.TL-d$followup.true.TL)/years d$measured.attrition.rate = (d$baseline.measured.TL-d$followup.measured.TL)/years d$verhulst = verhulst(baseline.measured.TL,followup.measured.TL,d) d$verhulst.scaled = (d$verhulst + (mean(d$baseline.measured.TL) - mean(d$followup.measured.TL)))/years # this scales the RTM-corrected attrition into the same units as attrition return(d) } #### Main simulation #### # Simulate data for smokers and non-smokers under four possible scenarios. # Values from Aviv et al 2009: # baseline TL (all) mean = 7451 bp # baseline TL (all) sd = 777 bp # baseline TL for non-smokers (adjusted for age and BMI) = 7500 # baseline TL for smokers (adjusted for age and BMI) = 7359 # this represents a difference of 141 bp # overall attrition = 40.7 bp/year # sd attrition for all subjects = 46.0 bp/year # attrition for non-smokers = 40 bp/year # attrition for smokers = 42 bp/year n = 1000 # individuals per group years = 10 replicates = 1000 prop.error = TRUE # make empty vectors for the output scenario = NULL baseline.diff = NULL attrition.diff = NULL coeff.var = NULL st.dev = NULL model.type = NULL coeff.smoke = NULL p.value = NULL row=1 for (k in c("A: No diff. in baseline/ No diff. in attrition", "B: No diff. in baseline/ Diff. in attrition", "C: Diff. in baseline/ No diff. in attrition", "D: Diff. in baseline/ Diff. in attrition")){print(k) # These are the four different scenarios to simulate if (k=="A: No diff. in baseline/ No diff. in attrition") { baselineTL.mean.NS = 7429.5 attrition.mean.NS = 40.7 baselineTL.mean.S = 7429.5 attrition.mean.S = 40.7 } if (k=="B: No diff. in baseline/ Diff. in attrition") { baselineTL.mean.NS = 7429.5 attrition.mean.NS = 40 baselineTL.mean.S = 7429.5 attrition.mean.S = 42 } if (k=="C: Diff. in baseline/ No diff. in attrition") { baselineTL.mean.NS = 7500 attrition.mean.NS = 40.7 baselineTL.mean.S = 7359 attrition.mean.S = 40.7 } if (k=="D: Diff. in baseline/ Diff. in attrition") { baselineTL.mean.NS = 7500 attrition.mean.NS = 40 baselineTL.mean.S = 7359 attrition.mean.S = 42 } for (i in c(0,1,2,4,8,16)){ print(i) # values of CV for (j in 1:replicates){ # replicates of each CV value # generate data for non-smokers non.smokers = generate.TL.data(n = n, years = years, CV = i, proportional.error = prop.error, baselineTL.mean = baselineTL.mean.NS, attrition.mean = attrition.mean.NS) non.smokers = data.frame(non.smokers) non.smokers$smoker = 0 # generate data for smokers smokers = generate.TL.data(n = n, years = years, CV = i, proportional.error = prop.error, baselineTL.mean = baselineTL.mean.S, attrition.mean = attrition.mean.S) smokers = data.frame(smokers) smokers$smoker = 1 d = rbind(non.smokers, smokers) d$id = c(1:(n*2)) # model the effect of smoking on the rate of attrition in different ways # model 1: basic regression model with no control for baseline TL m1 = lm(measured.attrition.rate~as.factor(smoker), data=d) # simple model scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 1" coeff.smoke[row] = summary(m1)$coefficients[2] p.value[row] = anova(m1)$"Pr(>F)"[1] row = row + 1 # model 2: ANCOVA controlling for baseline TL m2 = lm(measured.attrition.rate~baseline.measured.TL+as.factor(smoker), data=d) # model controlling for baseline TL scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 2" coeff.smoke[row] = summary(m2)$coefficients[3] p.value[row] = anova(m2)$"Pr(>F)"[2] row = row + 1 # model 3: follow up TL as response with control for baseline TL m3 = lm(followup.measured.TL~baseline.measured.TL + as.factor(smoker), data=d) # model using fu LTL as resoponse variable scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 3" coeff.smoke[row] = -(summary(m3)$coefficients[3])/10 # this coeff needs to be divided by the number of years to get attrition per year p.value[row] = anova(m3)$"Pr(>F)"[2] row = row + 1 # model 3a: follow up TL as response with baseline TL as an offset (no parameter estimated) # this model should give the same parameter estimates and p-values as model 1 m3a = lm(followup.measured.TL ~ offset(baseline.measured.TL) + as.factor(smoker), data=d) # model using fu LTL as resoponse variable scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 3a" coeff.smoke[row] = -(summary(m3a)$coefficients[2])/10 # this coeff needs to be divided by the number of years to get attrition per year p.value[row] = anova(m3a)$"Pr(>F)"[1] row = row + 1 # model 4: LTL as response (repeated measures model) d1 = data.frame(d$id,d$baseline.measured.TL,d$followup.measured.TL) # first need to reshape data to long thin #d2 = melt(d1,id.vars = "d.id") d2=reshape(d1, idvar = "d.id", v.names="TL", direction = "long", varying=c("d.baseline.measured.TL", "d.followup.measured.TL"), timevar = "time.point") d3=merge(d2, d, all.x=TRUE, by.x="d.id", by.y="id") m4 = lmer(TL~as.factor(time.point) + as.factor(smoker) + as.factor(time.point)*as.factor(smoker) + (1|d.id), data=d3, REML=FALSE) # model using repeated measures approach m4a = lmer(TL~as.factor(time.point) + as.factor(smoker) + (1|d.id), data=d3, REML=FALSE) scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 4" coeff.smoke[row] = -(m4@beta[4])/10 # this is for the interaction term which gives difference in slope between smokers and non-smokers p.value[row] = anova(m4, m4a, test="Chisq")$'Pr(>Chisq)'[2] # this is the p-value for the interaction term row = row + 1 # model 5: Verhulst-adusted version of model 1 m5 = lm(verhulst.scaled~as.factor(smoker), data=d) # model using measure of attrition correct for RTM scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 5" coeff.smoke[row] = summary(m5)$coefficients[2] p.value[row] = anova(m5)$"Pr(>F)"[1] row = row + 1 # model 6: Verhulst-adusted version of model 2 m6 = lm(verhulst.scaled~baseline.measured.TL+as.factor(smoker), data=d) # model controlling for baseline TL scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 6" coeff.smoke[row] = summary(m6)$coefficients[3] p.value[row] = anova(m6)$"Pr(>F)"[2] row = row + 1 } } } output = data.frame(scenario, baseline.diff, attrition.diff, coeff.var, model.type, coeff.smoke, p.value) output$significant[output$p.value<0.05]=1 sum = ddply(output, .(scenario, coeff.var, model.type), summarise, effect=mean(coeff.smoke), sd=sd(coeff.smoke), significant=sum(significant, na.rm=TRUE)) sum$significant=sum$significant/replicates sum$CI = qnorm(0.975)*sum$sd/sqrt(replicates) write.csv(sum, "Supplementary_dataset.CSV") #### Draw the figures presented in the paper #### sum = read.csv("Supplementary_dataset.CSV", header = TRUE) # Figures in the main text: # Figure 2: Effect size estimates # Note that the models fall into two groups: those with control for baseline and those without # models 1, 4, 3a, and 5 are identical. # models 2 and 3 are identical and biased. # Therefore just plot data for models 1 and 2. sum.plot = sum[(sum$model.type=="Model 1"| sum$model.type=="Model 2") ,] fig.2 = ggplot(sum.plot, aes(x=coeff.var, y=-effect))+ geom_hline(yintercept = 0, linetype="dashed", colour="black") + facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_errorbar(aes(ymin=-effect-CI, ymax=-effect+CI), width=.3) + geom_point(size=2, shape = 23, aes(fill = model.type)) + scale_colour_manual(name = "Models", values = c("lightgray", "black"),labels=c("1 & 4 (no control for baseline)", "2 & 3 (control for baseline)")) + scale_fill_manual(name = "Models", values = c("lightgray", "black"),labels=c("1 & 4 (no control for baseline)", "2 & 3 (control for baseline)")) + scale_x_continuous(breaks=c(0,2,4,6,8,10,12,14,16)) + xlab("CV of LTL measurement error (%)") + ylab(bquote(''*beta*' smoking (bp/year)')) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.25,0.65)) fig.2 png("Figure 2.png", res=600, units="in", width=6, height=6) print(fig.2) dev.off() # Type 1 errors sum.ac=sum[sum$scenario=="A: No diff. in baseline/ No diff. in attrition"|sum$scenario=="C: Diff. in baseline/ No diff. in attrition",] # use just the scenarios where attrition is not different sum.plot.ac = sum.ac[(sum.ac$model.type=="Model 1"| sum.ac$model.type=="Model 2") ,] fig.3 = ggplot(sum.plot.ac, aes(x=coeff.var, y=significant, colour=as.factor(model.type)) )+ geom_hline(yintercept = 0.05, linetype="dashed", colour="black") + facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_point(size=2, shape = 23, colour = "black", aes(fill = model.type)) + scale_colour_manual(name = "Models", values = c("lightgray", "black"),labels=c("1 & 4 (no control for baseline)", "2 & 3 (control for baseline)")) + scale_fill_manual(name = "Models", values = c("lightgray", "black"),labels=c("1 & 4 (no control for baseline)", "2 & 3 (control for baseline)")) + scale_x_continuous(breaks=c(0,2,4,6,8,10,12,14,16)) + xlab("CV of LTL measurement error (%)") + ylab("Probability of type 1 error") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.22,0.78)) fig.3 png("Figure 3.png", res=600, units="in", width=5.5, height=3) print(fig.3) dev.off() # Power to reject null hypothesis (include this in supplement) sum.bd = sum[sum$scenario=="B: No diff. in baseline/ Diff. in attrition"|sum$scenario=="D: Diff. in baseline/ Diff. in attrition",] # use just the scenarios where attrition is trully different sum.plot.bd = sum.bd[(sum.bd$model.type=="Model 1"| sum.bd$model.type=="Model 2") ,] fig.S1 = ggplot(sum.plot.bd, aes(x=coeff.var, y=significant))+ facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_point(size=2, shape = 23, colour = "black", aes(fill = model.type)) + scale_colour_manual(name = "Models", values = c("lightgray", "black"),labels=c("1 & 4 (no control for baseline)", "2 & 3 (control for baseline)")) + scale_fill_manual(name = "Models", values = c("lightgray", "black"),labels=c("1 & 4 (no control for baseline)", "2 & 3 (control for baseline)")) + scale_x_continuous(breaks=c(0,2,4,6,8,10,12,14,16)) + xlab("CV of LTL measurement error (%)") + ylab(bquote('Power to reject '*H[0]*'')) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.25,0.78)) fig.S1 png("Figure S1.png", res=600, units="in", width=5.5, height=3) print(fig.S1) dev.off() # How do the sds compare at bl and fu? Use scenario A baselineTL.mean.NS = 7429.5 attrition.mean.NS = 40.7 baselineTL.mean.S = 7429.5 attrition.mean.S = 40.7 example.data = generate.TL.data(n = 1000000, years = 10, CV = 20, proportional.error = TRUE, baselineTL.mean = baselineTL.mean.NS, attrition.mean = attrition.mean.NS) describe(example.data) #### What is the effect of correcting the outcome variable for regression to the mean using Verhulst's approach? #### # Model 5 is the same as model 1 but uses Verhulst's D as the outcome variable (attrition scores corrected for regression to the mean) # Model 6 is the same as model 2 but uses Verhulst's D as the outcome variable # Figure S11: Effect size estimates # Model 5 produces identical parameter estimates to model 1 # Model 6 produces biased parameter estimates but with a different pattern to model 2 sum.plot = sum[(sum$model.type=="Model 1"| sum$model.type=="Model 2" | sum$model.type=="Model 6") ,] fig.S11 = ggplot(sum.plot, aes(x=coeff.var, y=-effect))+ geom_hline(yintercept = 0, linetype="dashed", colour="black") + facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_errorbar(aes(ymin=-effect-CI, ymax=-effect+CI), width=.3) + geom_point(size=2, shape = 23, aes(fill = model.type)) + scale_colour_manual(name = "Models", values = c("lightgray", "black", "red"),labels=c("1", "2","2 on Verhulst-corrected data")) + scale_fill_manual(name = "Models", values = c("lightgray", "black", "red"),labels=c("1", "2", "2 on Verhulst-corrected data")) + scale_x_continuous(breaks=c(0,2,4,6,8,10,12,14,16)) + xlab("CV of LTL measurement error (%)") + ylab(bquote(''*beta*' smoking (bp/year)')) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.25,0.65)) fig.S11 png("Figure S11.png", res=600, units="in", width=6, height=6) print(fig.S11) dev.off() ##### Sensitivity analysis 1 - effect of baseline difference and CV #### # What is the effect of the size of the baseline difference between smokers and non-smokers? # Repeat the simulation above with 6 different values of difference in baseline TL # Choose numbers that reflect the possible range of variation: 0-1600 bp # 1600 is chosen to reflect the difference between 20 and 60 year olds assuming attrition of 40 bp/year # Just simulate scenario C and vary CV and baseline diff. years = 10 replicates = 1000 prop.error = TRUE # make empty vectors for the output baseline.diff = NULL attrition.diff = NULL coeff.var = NULL model.type = NULL coeff.smoke = NULL p.value = NULL attrition.mean.NS = 40.7 attrition.mean.S = 40.7 row=1 for (k in c(0,1,2,4,8,16)){ print(k) # values of CV for (i in c(0,100,200,400,800,1600)){ print(i) # values of baseline.diff baselineTL.mean.NS = 7429.5+(i/2) baselineTL.mean.S = 7429.5-(i/2) for (j in 1:replicates){ # replicates # generate data for non-smokers non.smokers = generate.TL.data(n = 1000, years = years, CV = k, proportional.error = prop.error, baselineTL.mean = baselineTL.mean.NS, attrition.mean = attrition.mean.NS) non.smokers = data.frame(non.smokers) non.smokers$smoker = 0 # generate data for smokers smokers = generate.TL.data(n = 1000, years = years, CV = k, proportional.error = prop.error, baselineTL.mean = baselineTL.mean.S, attrition.mean = attrition.mean.S) smokers = data.frame(smokers) smokers$smoker = 1 d = rbind(non.smokers, smokers) # model the effect of smoking on the rate of attrition in different ways # only compare models 1 and 2 for this sensitivity analysis # model 1: basic regression model with no control for baseline TL m1 = lm(measured.attrition.rate~as.factor(smoker), data=d) # simple model baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS coeff.var[row] = k model.type[row] = "Model 1" coeff.smoke[row] = summary(m1)$coefficients[2] p.value[row] = anova(m1)$"Pr(>F)"[1] row = row + 1 # model 2: ANCOVA controlling for baseline TL m2 = lm(measured.attrition.rate~baseline.measured.TL+as.factor(smoker), data=d) # model controlling for baseline TL baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS coeff.var[row] = k model.type[row] = "Model 2" coeff.smoke[row] = summary(m2)$coefficients[3] p.value[row] = anova(m2)$"Pr(>F)"[2] row = row + 1 } } } output = data.frame(baseline.diff, attrition.diff, coeff.var, model.type, coeff.smoke, p.value) output$significant[output$p.value<0.05]=1 sum = ddply(output, .(baseline.diff, coeff.var, model.type), summarise, effect=mean(coeff.smoke), sd=sd(coeff.smoke), significant=sum(significant, na.rm=TRUE)) sum$significant=sum$significant/replicates sum$CI = qnorm(0.975)*sum$sd/sqrt(replicates) write.csv(sum, "Sensitivity_dataset4.CSV") # Note that this simulation is just for scenario C (diff in baseline, no diff in attrition) s4 = read.csv("Sensitivity_dataset4.CSV") s4$CV[s4$coeff.var==0] = "CV = 0%" s4$CV[s4$coeff.var==1] = "CV = 1%" s4$CV[s4$coeff.var==2] = "CV = 2%" s4$CV[s4$coeff.var==4] = "CV = 4%" s4$CV[s4$coeff.var==8] = "CV = 8%" s4$CV[s4$coeff.var==16] = "CV = 16%" s4$CV = factor(s4$CV, levels = c("CV = 0%", "CV = 1%", "CV = 2%", "CV = 4%", "CV = 8%", "CV = 16%")) fig.4A = ggplot(s4, aes(x=baseline.diff, y=-effect))+ facet_wrap(~CV,ncol=6) + geom_line(aes(colour = model.type)) + geom_errorbar(aes(ymin=-effect-CI, ymax=-effect+CI), width=.3) + geom_point(size=2, shape = 23, aes(fill = model.type)) + scale_colour_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_fill_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + xlab(bquote('Difference in '*LTL[b]*' between smokers and non-smokers (bp)')) + ylab(bquote(''*beta*' smoking (bp/year)')) + labs(title = "A") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.17,0.27)) fig.4A # Run a model to understand these effects of CV and baseline diff m4 = lm(effect ~ baseline.diff*coeff.var*model.type, data = s4) summary(m4) hist(resid(m4)) # Shows nicely the three-way interaction between model type, CV and baseline diff. # The effect of a given change in CV depends on the baseline diff. # The bias is non-additively exaggerated when CV and baseline diff are both larger. # Type 1 errors fig.4B = ggplot(s4, aes(x=baseline.diff, y=significant))+ facet_wrap(~CV,ncol=6) + geom_line(aes(colour = model.type)) + geom_point(size=2, shape = 23, colour = "black", aes(fill = model.type)) + scale_colour_manual(name = "Model:", values = c("lightgray", "black"),labels=c("1", "2")) + scale_fill_manual(name = "Model:", values = c("lightgray", "black"),labels=c("1", "2")) + xlab(bquote('Difference in '*LTL[b]*' between smokers and non-smokers (bp)')) + ylab("Probability of type 1 error") + labs(title = "B") + guides(fill=FALSE, colour=FALSE) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) fig.4B png("Figure 4.png", res=600, units="in", width=7, height=6) grid.arrange(fig.4A,fig.4B, ncol = 1) dev.off() ##### Sensitivity analysis 2 - effect of number of participants (n) #### # 1. Does it make any difference if the sample size is larger (i.e. more participants)? # Repeat the simulation above with different n's; keep CV contstant at 8 years = 10 replicates = 1000 prop.error = TRUE # make empty vectors for the output participants = NULL scenario = NULL baseline.diff = NULL attrition.diff = NULL coeff.var = NULL model.type = NULL coeff.smoke = NULL p.value = NULL row=1 for (k in c("A: No diff. in baseline/ No diff. in attrition", "B: No diff. in baseline/ Diff. in attrition", "C: Diff. in baseline/ No diff. in attrition", "D: Diff. in baseline/ Diff. in attrition")){print(k) # These are the four different scenarios to simulate if (k=="A: No diff. in baseline/ No diff. in attrition") { baselineTL.mean.NS = 7429.5 attrition.mean.NS = 40.7 baselineTL.mean.S = 7429.5 attrition.mean.S = 40.7 } if (k=="B: No diff. in baseline/ Diff. in attrition") { baselineTL.mean.NS = 7429.5 attrition.mean.NS = 40 baselineTL.mean.S = 7429.5 attrition.mean.S = 42 } if (k=="C: Diff. in baseline/ No diff. in attrition") { baselineTL.mean.NS = 7500 attrition.mean.NS = 40.7 baselineTL.mean.S = 7359 attrition.mean.S = 40.7 } if (k=="D: Diff. in baseline/ Diff. in attrition") { baselineTL.mean.NS = 7500 attrition.mean.NS = 40 baselineTL.mean.S = 7359 attrition.mean.S = 42 } for (i in c(100,200,400,800,1600,3200)){ print(i) # values of n for (j in 1:replicates){ # replicates of each sample size # generate data for non-smokers non.smokers = generate.TL.data(n = i, years = years, CV = 8, proportional.error = prop.error, baselineTL.mean = baselineTL.mean.NS, attrition.mean = attrition.mean.NS) non.smokers = data.frame(non.smokers) non.smokers$smoker = 0 # generate data for smokers smokers = generate.TL.data(n = i, years = years, CV = 8, proportional.error = prop.error, baselineTL.mean = baselineTL.mean.S, attrition.mean = attrition.mean.S) smokers = data.frame(smokers) smokers$smoker = 1 d = rbind(non.smokers, smokers) d$id = c(1:(i*2)) # model the effect of smoking on the rate of attrition in different ways # model 1: basic regression model with no control for baseline TL m1 = lm(measured.attrition.rate~as.factor(smoker), data=d) # simple model participants[row] = i*2 scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS coeff.var[row] = 6 model.type[row] = "Model 1" coeff.smoke[row] = summary(m1)$coefficients[2] p.value[row] = anova(m1)$"Pr(>F)"[1] row = row + 1 # model 2: ANCOVA controlling for baseline TL m2 = lm(measured.attrition.rate~baseline.measured.TL+as.factor(smoker), data=d) # model controlling for baseline TL participants[row] = i*2 scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS coeff.var[row] = 6 model.type[row] = "Model 2" coeff.smoke[row] = summary(m2)$coefficients[3] p.value[row] = anova(m2)$"Pr(>F)"[2] row = row + 1 } } } if (prop.error==TRUE) { output = data.frame(scenario, baseline.diff, attrition.diff, coeff.var, model.type, coeff.smoke, p.value) } if (prop.error==FALSE) { output = data.frame(scenario, baseline.diff, attrition.diff, st.dev, model.type, coeff.smoke, p.value) } output$significant[output$p.value<0.05]=1 if (prop.error==TRUE) { sum = ddply(output, .(scenario, participants, coeff.var, model.type), summarise, effect=mean(coeff.smoke), sd=sd(coeff.smoke), significant=sum(significant, na.rm=TRUE)) } if (prop.error==FALSE) { sum = ddply(output, .(scenario, participants, st.dev, model.type), summarise, effect=mean(coeff.smoke), sd=sd(coeff.smoke), significant=sum(significant, na.rm=TRUE)) } sum$significant=sum$significant/replicates sum$CI = qnorm(0.975)*sum$sd/sqrt(replicates) write.csv(sum, "Sensitivity_dataset1.CSV") # Note that in this analysis CV is constant at 8% s1 = read.csv("Sensitivity_dataset1.CSV") sum.plot = s1[(s1$model.type=="Model 1"| s1$model.type=="Model 2") ,] fig.S2 = ggplot(sum.plot, aes(x=participants, y=-effect))+ geom_hline(yintercept = 0, linetype="dashed", colour="black") + facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_errorbar(aes(ymin=-effect-CI, ymax=-effect+CI), width=100) + geom_point(size=2, shape = 23, aes(fill = model.type)) + scale_colour_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_fill_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + xlab("Total number of participants (n)") + ylab(bquote(''*beta*' smoking (bp/year)')) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.20,0.65)) fig.S2 png("Figure S2.png", res=600, units="in", width=6, height=6) print(fig.S2) dev.off() # Type 1 errors sum.ac=s1[s1$scenario=="A: No diff. in baseline/ No diff. in attrition"|s1$scenario=="C: Diff. in baseline/ No diff. in attrition",] # use just the scenarios where attrition is not different sum.plot.ac = sum.ac[(sum.ac$model.type=="Model 1"| sum.ac$model.type=="Model 2") ,] fig.S3 = ggplot(sum.plot.ac, aes(x=participants, y=significant))+ geom_hline(yintercept = 0.05, linetype="dashed", colour="black") + facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_point(size=2, shape = 23, colour = "black", aes(fill = model.type)) + scale_colour_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_fill_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + xlab("Total number of participants") + ylab("Probability of type 1 error") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.2,0.78)) fig.S3 png("Figure S3.png", res=600, units="in", width=5.5, height=3) print(fig.S3) dev.off() # Power to reject null hypothesis sum.bd = s1[s1$scenario=="B: No diff. in baseline/ Diff. in attrition"|s1$scenario=="D: Diff. in baseline/ Diff. in attrition",] # use just the scenarios where attrition is trully different sum.plot.bd = sum.bd[(sum.bd$model.type=="Model 1"| sum.bd$model.type=="Model 2") ,] fig.S4 = ggplot(sum.plot.bd, aes(x=participants, y=significant, colour=as.factor(model.type)) )+ facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_point(size=2, shape = 23, colour = "black", aes(fill = model.type)) + scale_colour_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_fill_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + xlab("Total number of participants (n)") + ylab(bquote('Power to reject '*H[0]*'')) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.2,0.8)) fig.S4 # Conclusion: with realistic values (CV = 8%) the probability of a false positive result rises to over 20% with a total sample size of 6000 # and power to detect real effects also increases due to the exaggeration of the true effect size. png("Figure S4.png", res=600, units="in", width=5.5, height=3) print(fig.S4) dev.off() #### Sensitivity analysis 3: The effect of the difference in attrition between smokers and non-smokers #### # Does it make any difference if the true difference in attrition between smokers and non-smokers is larger than 2bp? # Repeat the first simulation but increase the true diff. in attrition to 20bp/year # overall attrition = 40.7 bp/year # sd attrition for all subjects = 46.0 bp/year # attrition for non-smokers = 30 bp/year # attrition for smokers = 50 bp/year n = 1000 # individuals per group years = 10 replicates = 1000 prop.error = TRUE # make empty vectors for the output scenario = NULL baseline.diff = NULL attrition.diff = NULL coeff.var = NULL st.dev = NULL model.type = NULL coeff.smoke = NULL p.value = NULL row=1 for (k in c("A: No diff. in baseline/ No diff. in attrition", "B: No diff. in baseline/ Diff. in attrition", "C: Diff. in baseline/ No diff. in attrition", "D: Diff. in baseline/ Diff. in attrition")){print(k) # These are the four different scenarios to simulate if (k=="A: No diff. in baseline/ No diff. in attrition") { baselineTL.mean.NS = 7429.5 attrition.mean.NS = 40.7 baselineTL.mean.S = 7429.5 attrition.mean.S = 40.7 } if (k=="B: No diff. in baseline/ Diff. in attrition") { baselineTL.mean.NS = 7429.5 attrition.mean.NS = 30 baselineTL.mean.S = 7429.5 attrition.mean.S = 50 } if (k=="C: Diff. in baseline/ No diff. in attrition") { baselineTL.mean.NS = 7500 attrition.mean.NS = 40.7 baselineTL.mean.S = 7359 attrition.mean.S = 40.7 } if (k=="D: Diff. in baseline/ Diff. in attrition") { baselineTL.mean.NS = 7500 attrition.mean.NS = 30 baselineTL.mean.S = 7359 attrition.mean.S = 50 } for (i in c(0,1,2,4,8,16)){ print(i) # values of CV for (j in 1:replicates){ # replicates of each CV value # generate data for non-smokers non.smokers = generate.TL.data(n = n, years = years, CV = i, proportional.error = prop.error, baselineTL.mean = baselineTL.mean.NS, attrition.mean = attrition.mean.NS) non.smokers = data.frame(non.smokers) non.smokers$smoker = 0 # generate data for smokers smokers = generate.TL.data(n = n, years = years, CV = i, proportional.error = prop.error, baselineTL.mean = baselineTL.mean.S, attrition.mean = attrition.mean.S) smokers = data.frame(smokers) smokers$smoker = 1 d = rbind(non.smokers, smokers) d$id = c(1:(n*2)) # model the effect of smoking on the rate of attrition in different ways # model 1: basic regression model with no control for baseline TL m1 = lm(measured.attrition.rate~as.factor(smoker), data=d) # simple model scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 1" coeff.smoke[row] = summary(m1)$coefficients[2] p.value[row] = anova(m1)$"Pr(>F)"[1] row = row + 1 # model 2: ANCOVA controlling for baseline TL m2 = lm(measured.attrition.rate~baseline.measured.TL+as.factor(smoker), data=d) # model controlling for baseline TL scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 2" coeff.smoke[row] = summary(m2)$coefficients[3] p.value[row] = anova(m2)$"Pr(>F)"[2] row = row + 1 } } } if (prop.error==TRUE) { output = data.frame(scenario, baseline.diff, attrition.diff, coeff.var, model.type, coeff.smoke, p.value) } if (prop.error==FALSE) { output = data.frame(scenario, baseline.diff, attrition.diff, st.dev, model.type, coeff.smoke, p.value) } output$significant[output$p.value<0.05]=1 if (prop.error==TRUE) { sum = ddply(output, .(scenario, coeff.var, model.type), summarise, effect=mean(coeff.smoke), sd=sd(coeff.smoke), significant=sum(significant, na.rm=TRUE)) } if (prop.error==FALSE) { sum = ddply(output, .(scenario, st.dev, model.type), summarise, effect=mean(coeff.smoke), sd=sd(coeff.smoke), significant=sum(significant, na.rm=TRUE)) } sum$significant=sum$significant/replicates sum$CI = qnorm(0.975)*sum$sd/sqrt(replicates) write.csv(sum, "Sensitivity_dataset2.CSV") s2 = read.csv("Sensitivity_dataset2.CSV") # Effect size estimates sum.plot = s2[(s2$model.type=="Model 1"| s2$model.type=="Model 2") ,] fig.S5 = ggplot(sum.plot, aes(x=coeff.var, y=-effect))+ geom_hline(yintercept = 0, linetype="dashed", colour="black") + facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_errorbar(aes(ymin=-effect-CI, ymax=-effect+CI), width=.3) + geom_point(size=2, shape = 23, aes(fill = model.type)) + scale_colour_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_fill_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_x_continuous(breaks=c(0,2,4,6,8,10,12,14,16)) + xlab("CV of LTL measurement error (%)") + ylab(bquote(''*beta*' smoking (bp/year)')) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.2,0.65)) fig.S5 png("Figure S5.png", res=600, units="in", width=6, height=6) print(fig.S5) dev.off() # Type 1 errors sum.ac=s2[s2$scenario=="A: No diff. in baseline/ No diff. in attrition"|s2$scenario=="C: Diff. in baseline/ No diff. in attrition",] # use just the scenarios where attrition is not different sum.plot.ac = sum.ac[(sum.ac$model.type=="Model 1"| sum.ac$model.type=="Model 2") ,] fig.S6 = ggplot(sum.plot.ac, aes(x=coeff.var, y=significant))+ geom_hline(yintercept = 0.05, linetype="dashed", colour="black") + facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_point(size=2, shape = 23, colour = "black", aes(fill = model.type)) + scale_colour_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_fill_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_x_continuous(breaks=c(0,2,4,6,8,10,12,14,16)) + xlab("CV of LTL measurement error (%)") + ylab("Probability of type 1 error") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.2,0.8)) fig.S6 png("Figure S6.png", res=600, units="in", width=5.5, height=3) print(fig.S6) dev.off() # Power to reject null hypothesis (include this in supplement) sum.bd = s2[s2$scenario=="B: No diff. in baseline/ Diff. in attrition"|s2$scenario=="D: Diff. in baseline/ Diff. in attrition",] # use just the scenarios where attrition is trully different sum.plot.bd = sum.bd[(sum.bd$model.type=="Model 1"| sum.bd$model.type=="Model 2") ,] fig.S7 = ggplot(sum.plot.bd, aes(x=coeff.var, y=significant))+ facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_point(size=2, shape = 23, colour = "black", aes(fill = model.type)) + scale_colour_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_fill_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_x_continuous(breaks=c(0,2,4,6,8,10,12,14,16)) + xlab("CV of LTL measurement error (%)") + ylab(bquote('Power to reject '*H[0]*'')) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.2,0.22)) fig.S7 png("Figure S7.png", res=600, units="in", width=5.5, height=3) print(fig.S7) dev.off() #### Sensitivity analysis 4 - effect of proportionality of measurement error #### # Does it make any difference if the measurement error is proportional to TL? # Repeat the simulation but with measurement error that is not proportional to TL n = 1000 # individuals per group years = 10 replicates = 1000 prop.error = FALSE # make empty vectors for the output scenario = NULL baseline.diff = NULL attrition.diff = NULL coeff.var = NULL st.dev = NULL model.type = NULL coeff.smoke = NULL p.value = NULL row=1 for (k in c("A: No diff. in baseline/ No diff. in attrition", "B: No diff. in baseline/ Diff. in attrition", "C: Diff. in baseline/ No diff. in attrition", "D: Diff. in baseline/ Diff. in attrition")){print(k) # These are the four different scenarios to simulate if (k=="A: No diff. in baseline/ No diff. in attrition") { baselineTL.mean.NS = 7429.5 attrition.mean.NS = 40.7 baselineTL.mean.S = 7429.5 attrition.mean.S = 40.7 } if (k=="B: No diff. in baseline/ Diff. in attrition") { baselineTL.mean.NS = 7429.5 attrition.mean.NS = 40 baselineTL.mean.S = 7429.5 attrition.mean.S = 42 } if (k=="C: Diff. in baseline/ No diff. in attrition") { baselineTL.mean.NS = 7500 attrition.mean.NS = 40.7 baselineTL.mean.S = 7359 attrition.mean.S = 40.7 } if (k=="D: Diff. in baseline/ Diff. in attrition") { baselineTL.mean.NS = 7500 attrition.mean.NS = 40 baselineTL.mean.S = 7359 attrition.mean.S = 42 } for (i in c(0,1,2,4,8,16)){ print(i) # values of CV for (j in 1:replicates){ # replicates of each CV value # generate data for non-smokers non.smokers = generate.TL.data(n = n, years = years, CV = i, proportional.error = prop.error, baselineTL.mean = baselineTL.mean.NS, attrition.mean = attrition.mean.NS) non.smokers = data.frame(non.smokers) non.smokers$smoker = 0 # generate data for smokers smokers = generate.TL.data(n = n, years = years, CV = i, proportional.error = prop.error, baselineTL.mean = baselineTL.mean.S, attrition.mean = attrition.mean.S) smokers = data.frame(smokers) smokers$smoker = 1 d = rbind(non.smokers, smokers) d$id = c(1:(n*2)) # model the effect of smoking on the rate of attrition in different ways # model 1: basic regression model with no control for baseline TL m1 = lm(measured.attrition.rate~as.factor(smoker), data=d) # simple model scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 1" coeff.smoke[row] = summary(m1)$coefficients[2] p.value[row] = anova(m1)$"Pr(>F)"[1] row = row + 1 # model 2: ANCOVA controlling for baseline TL m2 = lm(measured.attrition.rate~baseline.measured.TL+as.factor(smoker), data=d) # model controlling for baseline TL scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 2" coeff.smoke[row] = summary(m2)$coefficients[3] p.value[row] = anova(m2)$"Pr(>F)"[2] row = row + 1 } } } if (prop.error==TRUE) { output = data.frame(scenario, baseline.diff, attrition.diff, coeff.var, model.type, coeff.smoke, p.value) } if (prop.error==FALSE) { output = data.frame(scenario, baseline.diff, attrition.diff, st.dev, model.type, coeff.smoke, p.value) } output$significant[output$p.value<0.05]=1 if (prop.error==TRUE) { sum = ddply(output, .(scenario, coeff.var, model.type), summarise, effect=mean(coeff.smoke), sd=sd(coeff.smoke), significant=sum(significant, na.rm=TRUE)) } if (prop.error==FALSE) { sum = ddply(output, .(scenario, st.dev, model.type), summarise, effect=mean(coeff.smoke), sd=sd(coeff.smoke), significant=sum(significant, na.rm=TRUE)) } sum$significant=sum$significant/replicates sum$CI = qnorm(0.975)*sum$sd/sqrt(replicates) write.csv(sum, "Sensitivity_dataset3.CSV") s3 = read.csv("Sensitivity_dataset3.CSV") # Effect size estimates fig.S8 = ggplot(s3, aes(x=st.dev, y=-effect))+ geom_hline(yintercept = 0, linetype="dashed", colour="black") + facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_errorbar(aes(ymin=-effect-CI, ymax=-effect+CI), width=20) + geom_point(size=2, shape = 23, aes(fill = model.type)) + scale_colour_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_fill_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + xlab("Standard deviation of LTL measurement error (bp)") + ylab(bquote(''*beta*' smoking (bp/year)')) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.2,0.65)) fig.S8 png("Figure S8.png", res=600, units="in", width=6, height=6) print(fig.S8) dev.off() # Type 1 errors sum.ac=s3[s3$scenario=="A: No diff. in baseline/ No diff. in attrition"|s3$scenario=="C: Diff. in baseline/ No diff. in attrition",] # use just the scenarios where attrition is not different fig.S9 = ggplot(sum.ac, aes(x=st.dev, y=significant))+ geom_hline(yintercept = 0.05, linetype="dashed", colour="black") + facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_point(size=2, shape = 23, colour = "black", aes(fill = model.type)) + scale_colour_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_fill_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + xlab("Standard deviation of LTL measurement error (bp)") + ylab("Probability of type 1 error") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.2,0.8)) fig.S9 png("Figure S9.png", res=600, units="in", width=5.5, height=3) print(fig.S9) dev.off() # Power to reject null hypothesis (include this in supplement) sum.bd = s3[s3$scenario=="B: No diff. in baseline/ Diff. in attrition"|s3$scenario=="D: Diff. in baseline/ Diff. in attrition",] # use just the scenarios where attrition is trully different fig.S10 = ggplot(sum.bd, aes(x=st.dev, y=significant))+ facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_point(size=2, shape = 23, colour = "black", aes(fill = model.type)) + scale_colour_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + scale_fill_manual(name = "Model", values = c("lightgray", "black"),labels=c("1 (no control for baseline)", "2 (control for baseline)")) + xlab("Standard deviation of LTL measurement error (bp)") + ylab(bquote('Power to reject '*H[0]*'')) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.2,0.8)) fig.S10 png("Figure S10.png", res=600, units="in", width=5.5, height=3) print(fig.S10) dev.off() #### Simulation showing that the bias can reverse true effects #### # In this simulation we reverse the true effect of smoking on attrition such that smokers have slower attrition than non-smokers # Everything else is identical to the main simulation n = 1000 # individuals per group years = 10 replicates = 10 prop.error = TRUE # make empty vectors for the output scenario = NULL baseline.diff = NULL attrition.diff = NULL coeff.var = NULL st.dev = NULL model.type = NULL coeff.smoke = NULL p.value = NULL row=1 for (k in c("A: No diff. in baseline/ No diff. in attrition", "B: No diff. in baseline/ Diff. in attrition", "C: Diff. in baseline/ No diff. in attrition", "D: Diff. in baseline/ Diff. in attrition")){print(k) # These are the four different scenarios to simulate if (k=="A: No diff. in baseline/ No diff. in attrition") { baselineTL.mean.NS = 7429.5 attrition.mean.NS = 40.7 baselineTL.mean.S = 7429.5 attrition.mean.S = 40.7 } if (k=="B: No diff. in baseline/ Diff. in attrition") { baselineTL.mean.NS = 7429.5 attrition.mean.NS = 42 baselineTL.mean.S = 7429.5 attrition.mean.S = 40 } if (k=="C: Diff. in baseline/ No diff. in attrition") { baselineTL.mean.NS = 7500 attrition.mean.NS = 40.7 baselineTL.mean.S = 7359 attrition.mean.S = 40.7 } if (k=="D: Diff. in baseline/ Diff. in attrition") { baselineTL.mean.NS = 7500 attrition.mean.NS = 42 baselineTL.mean.S = 7359 attrition.mean.S = 40 } for (i in c(0,1,2,4,8,16)){ print(i) # values of CV for (j in 1:replicates){ # replicates of each CV value # generate data for non-smokers non.smokers = generate.TL.data(n = n, years = years, CV = i, proportional.error = prop.error, baselineTL.mean = baselineTL.mean.NS, attrition.mean = attrition.mean.NS) non.smokers = data.frame(non.smokers) non.smokers$smoker = 0 # generate data for smokers smokers = generate.TL.data(n = n, years = years, CV = i, proportional.error = prop.error, baselineTL.mean = baselineTL.mean.S, attrition.mean = attrition.mean.S) smokers = data.frame(smokers) smokers$smoker = 1 d = rbind(non.smokers, smokers) d$id = c(1:(n*2)) # model the effect of smoking on the rate of attrition in different ways # model 1: basic regression model with no control for baseline TL m1 = lm(measured.attrition.rate~as.factor(smoker), data=d) # simple model scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 1" coeff.smoke[row] = summary(m1)$coefficients[2] p.value[row] = anova(m1)$"Pr(>F)"[1] row = row + 1 # model 2: ANCOVA controlling for baseline TL m2 = lm(measured.attrition.rate~baseline.measured.TL+as.factor(smoker), data=d) # model controlling for baseline TL scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 2" coeff.smoke[row] = summary(m2)$coefficients[3] p.value[row] = anova(m2)$"Pr(>F)"[2] row = row + 1 # model 3: follow up TL as response with control for baseline TL m3 = lm(followup.measured.TL~baseline.measured.TL + as.factor(smoker), data=d) # model using fu LTL as resoponse variable scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 3" coeff.smoke[row] = -(summary(m3)$coefficients[3])/10 # this coeff needs to be divided by the number of years to get attrition per year p.value[row] = anova(m3)$"Pr(>F)"[2] row = row + 1 # model 3a: follow up TL as response with baseline TL as an offset (no parameter estimated) # this model should give the same parameter estimates and p-values as model 1 m3a = lm(followup.measured.TL ~ offset(baseline.measured.TL) + as.factor(smoker), data=d) # model using fu LTL as resoponse variable scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 3a" coeff.smoke[row] = -(summary(m3a)$coefficients[2])/10 # this coeff needs to be divided by the number of years to get attrition per year p.value[row] = anova(m3a)$"Pr(>F)"[1] row = row + 1 # model 4: LTL as response (repeated measures model) d1 = data.frame(d$id,d$baseline.measured.TL,d$followup.measured.TL) # first need to reshape data to long thin #d2 = melt(d1,id.vars = "d.id") d2=reshape(d1, idvar = "d.id", v.names="TL", direction = "long", varying=c("d.baseline.measured.TL", "d.followup.measured.TL"), timevar = "time.point") d3=merge(d2, d, all.x=TRUE, by.x="d.id", by.y="id") m4 = lmer(TL~as.factor(time.point) + as.factor(smoker) + as.factor(time.point)*as.factor(smoker) + (1|d.id), data=d3, REML=FALSE) # model using repeated measures approach m4a = lmer(TL~as.factor(time.point) + as.factor(smoker) + (1|d.id), data=d3, REML=FALSE) scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 4" coeff.smoke[row] = -(m4@beta[4])/10 # this is for the interaction term which gives difference in slope between smokers and non-smokers p.value[row] = anova(m4, m4a, test="Chisq")$'Pr(>Chisq)'[2] # this is the p-value for the interaction term row = row + 1 # model 5: Verhulst-adusted version of model 1 m5 = lm(verhulst.scaled~as.factor(smoker), data=d) # model using measure of attrition correct for RTM scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 5" coeff.smoke[row] = summary(m5)$coefficients[2] p.value[row] = anova(m5)$"Pr(>F)"[1] row = row + 1 # model 6: Verhulst-adusted version of model 2 m6 = lm(verhulst.scaled~baseline.measured.TL+as.factor(smoker), data=d) # model controlling for baseline TL scenario[row] = k baseline.diff[row] = baselineTL.mean.NS-baselineTL.mean.S attrition.diff[row] = attrition.mean.S-attrition.mean.NS if (prop.error==TRUE) {coeff.var[row] = i} if (prop.error==FALSE) {st.dev[row] = ((i/100)*7000)} model.type[row] = "Model 6" coeff.smoke[row] = summary(m6)$coefficients[3] p.value[row] = anova(m6)$"Pr(>F)"[2] row = row + 1 } } } output = data.frame(scenario, baseline.diff, attrition.diff, coeff.var, model.type, coeff.smoke, p.value) output$significant[output$p.value<0.05]=1 sum = ddply(output, .(scenario, coeff.var, model.type), summarise, effect=mean(coeff.smoke), sd=sd(coeff.smoke), significant=sum(significant, na.rm=TRUE)) sum$significant=sum$significant/replicates sum$CI = qnorm(0.975)*sum$sd/sqrt(replicates) write.csv(sum, "Supplementary_dataset_reversal.CSV") sum = read.csv("Supplementary_dataset_reversal.CSV", header = TRUE) # Effect size estimates # Note that the models fall into two groups: those with control for baseline and those without # Therefore just plot data for models 1 and 2. sum.plot = sum[(sum$model.type=="Model 1"| sum$model.type=="Model 2") ,] fig.S12 = ggplot(sum.plot, aes(x=coeff.var, y=-effect))+ geom_hline(yintercept = 0, linetype="dashed", colour="black") + facet_wrap(~scenario,ncol=2) + geom_line(aes(colour = model.type)) + geom_errorbar(aes(ymin=-effect-CI, ymax=-effect+CI), width=.3) + geom_point(size=2, shape = 23, aes(fill = model.type)) + scale_colour_manual(name = "Models", values = c("lightgray", "black"),labels=c("1 & 4 (no control for baseline)", "2 & 3 (control for baseline)")) + scale_fill_manual(name = "Models", values = c("lightgray", "black"),labels=c("1 & 4 (no control for baseline)", "2 & 3 (control for baseline)")) + scale_x_continuous(breaks=c(0,2,4,6,8,10,12,14,16)) + xlab("CV of LTL measurement error (%)") + ylab(bquote(''*beta*' smoking (bp/year)')) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(legend.background = element_rect(colour = 'black', fill = 'white', linetype='solid')) + theme(legend.position=c(0.25,0.65)) fig.S12 png("Figure S12.png", res=600, units="in", width=6, height=6) print(fig.S12) dev.off()