Code
library(data.table)
# this only works for the correct naming of the variable names to diff, ratio and resid
descriptives = function(samples = c("HCP", "Share", "both")){
# define statistics to include
stats = c("N", "TBV: Mean (SD)", "ICV: Mean (SD)", "cor(ICV,TBV)",
"*Difference score*", "Mean (SD)", "Median", "Range", "Variance", "Cut off",
"*Ratio score*", "Mean (SD)", "Median", "Range", "Variance", "Cut off",
"*Residual score*", "Mean (SD)", "Median", "Range", "Variance", "Cut off")
# object to hold results
res = as.data.frame(matrix(ncol = length(samples)+1, nrow = length(stats)))
names(res) = c("Statistic", samples)
res$Statistic = stats
for(i in samples){
# pull sample
dat = as.data.frame(get(i))
# N
N = sum(!is.na(dat$diff))
res[which(res$Statistic == "N"), which(names(res) == i)] = N
# TBV: Mean (SD)
mean = round(mean(dat$TBV, na.rm = T), digits = 2)
SD = signif(sd(dat$TBV, na.rm = T), digits = 2)
res[which(res$Statistic == "TBV: Mean (SD)"), which(names(res) == i)] = paste0(mean, " (", SD,")")
# ICV: Mean (SD)
mean = round(mean(dat$ICV, na.rm = T), digits = 2)
SD = signif(sd(dat$ICV, na.rm = T), digits = 2)
res[which(res$Statistic == "ICV: Mean (SD)"), which(names(res) == i)] = paste0(mean, " (", SD,")")
# ICV TBV correlation
cor = round(cor.test(dat$ICV, dat$TBV)$estimate, digits = 2)
res[which(res$Statistic == "cor(ICV,TBV)"), which(names(res) == i)] = cor
# Cycle through different scores
for(j in c("Difference", "Ratio", "Resid")){
# determine variable that matches the right score
if(j == "Difference"){
VarName = "diff"
}else if(j == "Ratio"){
VarName = "ratio"
}else if(j == "Resid"){
VarName = "resid"
}
dat$var = dat[,VarName]
### Calculate mean and SD
mean = round(mean(dat$var, na.rm=T), digits = 2)
sd = round(sd(dat$var, na.rm=T), digits = 2)
# find correct position in res to store result
index = grep(j, res$Statistic)
Cand = grep("Mean", res$Statistic)
pos = Cand[which(Cand > index)][1]
# store mean result
res[pos, which(names(res) == i)] = paste0(mean, " (", sd, ")")
### Calculate median
median = round(median(dat$var, na.rm=T), digits = 2)
#store median result
Cand = grep("Median", res$Statistic)
pos = Cand[which(Cand > index)][1]
res[pos, which(names(res) == i)] = median
### Calculate range
min = round(min(dat$var, na.rm = T), digits = 2)
max = round(max(dat$var, na.rm = T), digits = 2)
# store results
Cand = grep("Range", res$Statistic)
pos = Cand[which(Cand > index)][1]
res[pos, which(names(res) == i)] = paste0(min, " to ", max)
## Calculate variance
variance = round(var(dat$var, na.rm = T))
# store variance result
Cand = grep("Variance", res$Statistic)
pos = Cand[which(Cand > index)][1]
res[pos, which(names(res) == i)] = variance
### calculate cut-off
if(j == "Difference"){
cutOff = mean(dat$var, na.rm = T)+(2*sd(dat$var, na.rm = T))
}else{
cutOff = mean(dat$var, na.rm = T)-(2*sd(dat$var, na.rm = T))
}
# store results
Cand = grep("Cut", res$Statistic)
pos = Cand[which(Cand > index)][1]
res[pos, which(names(res) == i)] = round(cutOff, digit = 1)
}
}
return(res)
}
# add in STRADL
STRADL = fread(paste0(STRADLdir, "/", list.files(path = STRADLdir, pattern = "STRADL")))
names(STRADL) = c("ID", "Age", "Sex", "TBV", "ICV")
# convert mm3 estimates to more intuitive cm3 estimates
STRADL$ICV = STRADL$ICV/1000
STRADL$TBV = STRADL$TBV/1000
# estimate brain atrophy from single MRI scan
STRADL$diff = STRADL$ICV - STRADL$TBV
STRADL$ratio = STRADL$TBV / STRADL$ICV
# remove participants with zero estimates for TBV and ICV (11 participants)
STRADL = STRADL[STRADL$TBV != 0,]
STRADL = STRADL[STRADL$ICV != 0,]
# remove participants where ICV is smaller than TBV (excluding 45 participants)
STRADL = STRADL[which(STRADL$diff > 0),]
model <- lm(TBV ~ ICV, data = STRADL)
STRADL$resid = resid(model)
# standardise variables
STRADL$diff_stand = as.vector(scale(STRADL$diff))
STRADL$ratio_stand = as.vector(scale(STRADL$ratio))
STRADL$resid_stand = as.vector(scale(STRADL$resid))
# read in MRi-Share
Share = fread(paste0(out, "/MRiShare_global_IDPs_BSAF2021.csv"))
Share$TBV = Share$SPM_GM_Volume + Share$SPM_WM_Volume
Share = Share[,c("ID", "Age", "Sex", "eTIV", "TBV")]
names(Share) = c("ID", "Age", "Sex", "ICV", "TBV")
# convert mm3 estimates to more intuitive cm3 estimates
Share$ICV = Share$ICV/1000
Share$TBV = Share$TBV/1000
# estimate brain atrophy from single MRI scan
Share$diff = Share$ICV - Share$TBV
Share$ratio = Share$TBV / Share$ICV
model <- lm(TBV ~ ICV, data = Share)
Share$resid = resid(model)
# save intercept value from the regression
Shareintercept = summary(model)$coefficients[1,1]
# standardise variables
Share$diff_stand = as.vector(scale(Share$diff))
Share$ratio_stand = as.vector(scale(Share$ratio))
Share$resid_stand = as.vector(scale(Share$resid))
# sanity check
#sum((Share$diff < 0))
# read in HCP data
HCP = fread(paste0(out,"/unrestricted_hcp_freesurfer.csv"))
HCP = HCP[,c("Subject", "Gender", "FS_InterCranial_Vol", "FS_BrainSeg_Vol_No_Vent")]
names(HCP) = c("ID", "Sex", "ICV", "TBV")
# add age information
HCPage = fread(paste0(out, "/RESTRICTED_annafurtjes_12_14_2023_4_18_2.csv"))
names(HCPage)[which(names(HCPage) == "Subject")] = "ID"
names(HCPage)[which(names(HCPage) == "Age_in_Yrs")] = "Age"
HCP = merge(HCP, HCPage[,c("ID","Age")], by = "ID")
# as outlined elsewhere, empirical investigations warrant to use an age cut-off of 31 years in this sample
HCP = HCP[which(HCP$Age <= 31),]
# convert mm3 estimates to more intuitive cm3 estimates
HCP$ICV = HCP$ICV/1000
HCP$TBV = HCP$TBV/1000
# estimate brain atrophy from single MRI scan
HCP$diff = HCP$ICV - HCP$TBV
HCP$ratio = HCP$TBV / HCP$ICV
# Quality control:
#print(paste0("Some participants have negative difference scores and ratio scores > 1, which means that their ICV estimate is smaller than their TBV estimate. This must be an error as the skull always surrounds the brain. Those ", sum((HCP$diff < 0))," HCP participants were excluded from the data set."))
deletedHCP = sum(HCP$diff < 0)
# delete those from data
if(sum(HCP$diff < 0) != 0){HCP=HCP[-which(HCP$diff < 0),]}
# estimate residual model
model <- lm(TBV ~ ICV, data = HCP)
HCP$resid = as.vector(resid(model, na.rm=T))
# standardise variables
HCP$diff_stand = as.vector(scale(HCP$diff))
HCP$ratio_stand = as.vector(scale(HCP$ratio))
HCP$resid_stand = as.vector(scale(HCP$resid))
First and last neuroimaging visit as cleaned in data preparation.
First and last neuroimaging visit as cleaned in data preparation. Due to additional exclusions, this step re-calculates LBA residual to ensure zero correlation with ICV.
# read in UKB neuro data
UKB = fread(paste0(out, "/UKB_neuroNoLongProcess.txt"))
# restrict to first neuroimaging visit (i.e., second visit altogether)
UKB2 = UKB[UKB$wave == 2,]
# cleaning extreme outliers (looks messy because only realised later that there were a few extreme outliers)
UKB2 <- UKB2[which(UKB2$TBVdiff_2to3_stand < 10),]
UKB2 <- UKB2[which(UKB2$TBVdiff_2to3_stand > (-10)),]
UKB2 <- UKB2[which(UKB2$TBVratio_3to2_stand < 10),]
UKB2 <- UKB2[which(UKB2$TBVratio_3to2_stand > (-10)),]
UKB2 <- UKB2[which(UKB2$TBVresid_2to3_stand < 10),]
UKB2 <- UKB2[which(UKB2$TBVresid_2to3_stand > (-10)),]
# now that more participants were excluded, need to re-calculate the residual score
model <- lm(TBV ~ ICV, data = UKB2)
UKB2$resid = resid(model)
UKB2$resid_stand <- as.vector(scale(UKB2$resid))
# for some reason the mean for resid here is not zero ... don't know why - recalculate
model <- lm(TBV ~ ICV, data = UKB2)
UKB2$resid = resid(model)
UKB2$resid_stand <- as.vector(scale(UKB2$resid))
# restrict to second neuroimaging visit (i.e., third visit altogether)
UKB3 = UKB[UKB$wave == 3,]
# cleaning extreme outliers (looks messy because only realised later that there were a few extreme outliers)
UKB3 <- UKB3[which(UKB3$TBVdiff_2to3_stand < 10),]
UKB3 <- UKB3[which(UKB3$TBVdiff_2to3_stand > (-10)),]
UKB3 <- UKB3[which(UKB3$TBVratio_3to2_stand < 10),]
UKB3 <- UKB3[which(UKB3$TBVratio_3to2_stand > (-10)),]
UKB3 <- UKB3[which(UKB3$TBVresid_2to3_stand < 10),]
UKB3 <- UKB3[which(UKB3$TBVresid_2to3_stand > (-10)),]
# now that more participants were excluded, need to re-calculate the residual score
model <- lm(TBV ~ ICV, data = UKB3)
UKB3$resid = resid(model)
UKB3$resid_stand <- as.vector(scale(UKB3$resid))
des = descriptives(samples = c("HCP", "Share", "UKB2", "UKB3","LBC1","LBC4","STRADL"))
# remove cut-off values (artifact)
des = des[!grepl("Cut off", des$Statistic),]
# add ages
HCPage = paste0(round(mean(HCP$Age)), " (",round(min(HCP$Age)), "-", round(max(HCP$Age)), ")")
Shareage = paste0(round(mean(Share$Age)), " (",round(min(Share$Age)), "-", round(max(Share$Age)), ")")
# UKB
UKB2$ageY = UKB2$age / 12
UKB2age = paste0(round(mean(UKB2$ageY, na.rm=T)), " (",round(min(UKB2$ageY, na.rm=T)), "-", round(max(UKB2$ageY, na.rm=T)), ")")
UKB3$ageY = UKB3$age / 12
UKB3age = paste0(round(mean(UKB3$ageY, na.rm=T)), " (",round(min(UKB3$ageY, na.rm=T)), "-", round(max(UKB3$ageY, na.rm=T)), ")")
# LBC
age = fread(paste0(out, "/LBC1936_allPheno.txt"), select = c("lbc36no","ageMRI_w2"))
LBC1 = merge(LBC1, age, by = "lbc36no")
LBC1$ageMRI_w2 = LBC1$ageMRI_w2/365
LBC1age = paste0(round(mean(LBC1$ageMRI_w2, na.rm=T)), " (",round(min(LBC1$ageMRI_w2, na.rm=T)), "-", round(max(LBC1$ageMRI_w2, na.rm=T)), ")")
LBC4age = "82 (81-83)"
# STRADL
STRADLage = paste0(round(mean(STRADL$Age, na.rm=T)), " (",round(min(STRADL$Age, na.rm=T)), "-", round(max(STRADL$Age, na.rm=T)), ")")
# merge ages into data.frame
res=rbind(des[1,],
c("Age in years", HCPage, Shareage, UKB2age, UKB3age, LBC1age, LBC4age, STRADLage),
des[2:nrow(des),])
knitr::kable(res, col.names = c("Statistic","HCP","MRi-Share","UKB (first visit)","UKB (second visit)", "LBC (first visit)", "LBC (fourth visit)", "STRADL"))
Statistic | HCP | MRi-Share | UKB (first visit) | UKB (second visit) | LBC (first visit) | LBC (fourth visit) | STRADL | |
---|---|---|---|---|---|---|---|---|
1 | N | 800 | 1831 | 4674 | 4674 | 634 | 286 | 987 |
2 | Age in years | 27 (22-31) | 22 (18-35) | 62 (46-81) | 65 (49-82) | 73 (71-74) | 82 (81-83) | 60 (26-84) |
22 | TBV: Mean (SD) | 1173.81 (120) | 1131.88 (100) | 1186.03 (110) | 1172.09 (110) | 1011.23 (97) | 935.48 (92) | 1101.77 (110) |
3 | ICV: Mean (SD) | 1606.3 (170) | 1568.07 (140) | 1555.14 (150) | 1547.39 (150) | 1396.53 (140) | 1382.16 (150) | 1414.44 (210) |
4 | cor(ICV,TBV) | 0.92 | 0.96 | 0.9 | 0.89 | 0.81 | 0.75 | 0.77 |
5 | Difference score | |||||||
6 | Mean (SD) | 432.49 (79.34) | 436.18 (48.29) | 369.11 (69.41) | 375.3 (72.23) | 385.31 (86.88) | 446.68 (101.51) | 312.67 (144.35) |
7 | Median | 437.57 | 431.42 | 363.11 | 369.36 | 382.43 | 444.7 | 353.16 |
8 | Range | 7.9 to 644.09 | 298.44 to 630.19 | 6.01 to 834.2 | 120.02 to 739.69 | 53.15 to 664.4 | 100.58 to 882.81 | 2.41 to 660.4 |
9 | Variance | 6295 | 2332 | 4818 | 5217 | 7549 | 10304 | 20836 |
11 | Ratio score | |||||||
12 | Mean (SD) | 0.73 (0.04) | 0.72 (0.02) | 0.76 (0.03) | 0.76 (0.03) | 0.73 (0.05) | 0.68 (0.05) | 0.79 (0.09) |
13 | Median | 0.73 | 0.72 | 0.76 | 0.76 | 0.73 | 0.68 | 0.76 |
14 | Range | 0.65 to 0.99 | 0.64 to 0.78 | 0.6 to 1 | 0.59 to 0.9 | 0.6 to 0.95 | 0.48 to 0.9 | 0.62 to 1 |
15 | Variance | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
17 | Residual score | |||||||
18 | Mean (SD) | 0 (48.09) | 0 (30.06) | 0 (49.21) | 0 (50.64) | 0 (57.37) | 0 (61.08) | 0 (71.7) |
19 | Median | -7.2 | 2.29 | 0.99 | 0.69 | 0.46 | 3.26 | -6.29 |
20 | Range | -128.64 to 238.91 | -145.08 to 90.54 | -301.01 to 317.8 | -274.31 to 157.87 | -175.03 to 178.61 | -259.78 to 151.3 | -192.39 to 252.4 |
21 | Variance | 2313 | 904 | 2421 | 2564 | 3291 | 3731 | 5141 |