The scripts below follow a number of steps using OSCA which are necessary to fir mixed linear models and obtain morphometricity estimates. Because data structures between atlas data and vertex-wise measures differ, some steps also differ for the analysis of either atlases or vertex-wise measures. The first example here is for atlases, and the second example below is for vertex-wise measures that require more computation due to large file sizes.
#!/bin/bash -l
#SBATCH --job-name=make_bod
#SBATCH --output=~/morphometricity/output/make_bod_%A_%a.out
#SBATCH -D /mnt/lustre/datasets/ukbiobank/ukb40933/imaging/output
#SBATCH --mem-per-cpu=3G
#SBATCH --array=1-7 # 7 atlases
wd="~/morphometricity"
pheno_dir="/mnt/lustre/datasets/ukbiobank/ukb40933/phenotypes"
results_dir="~/morphometricity/results/morph_results"
scripts_dir="~/morphometricity/scripts"
refList=${scripts_dir}/ref_atlases.txt
atlasName=$(awk "FNR==$SLURM_ARRAY_TASK_ID" $refList)
##################################################
### OSCA analyses for brain atlases
### The processing works with the formatting from combine_atlas_output.R
##################################################
for mod in ThickAvg SurfArea GrayVol
do
############################################################################
# MAKE BOD FILES
# Make binary input files for LMM analyses
# delete all ROIs/vertices with a SD of zero
############################################################################
/users/k1894405/osca_Linux \
--efile ${wd}/data/atlases/${atlasName}_${mod}_all_* \
--sd-min 0.000001 \
--no-fid \
--make-bod \
--out ${wd}/data/bod_files/Bod_${atlasName}_${mod}
#--keep ${wd}/data/indi.list.fid \
#
echo "Created bod files for $atlasName $mod"
###########################################################################
## OSCA analyses:
## manage bod files to include subset of individuals that all atlases & vertices have in common
## Saved in indi.list
############################################################################
/users/k1894405/osca_Linux \
--befile ${wd}/data/bod_files/Bod_${atlasName}_${mod} \
--keep ${wd}/data/indi.list.fid \
--make-bod \
--out ${wd}/data/bod_files/Bod_${atlasName}_${mod}
echo "Created bod files for $atlasName $mod"
done
##### sbatch -p shared,brc ~/morphometricity/scripts/make_bod_files.sh
### This is the first script needed for morphometricity estimates (second is calculate BRM)
#!/bin/bash -l
#SBATCH --job-name=DK_morph_est
#SBATCH --output=~/morphometricity/output/upd_morph_est_DK_%A_%a.out
#SBATCH --mem=80G
#SBATCH --cpus-per-task=10
#SBATCH --time=2-00:00
#SBATCH --array=1-7 # parallel analysis for 7 phenos
module load apps/R/3.6.0
wd="~/morphometricity"
pheno_dir="/mnt/lustre/datasets/ukbiobank/ukb40933/phenotypes"
results_dir="~/morphometricity/results/morp_results_update"
scripts_dir="~/morphometricity/scripts"
atlasName="DK"
echo "Starting morph estimates for " $atlasName
refList=${scripts_dir}/ref_pheno.txt
pheno=$(awk "FNR==$SLURM_ARRAY_TASK_ID" $refList)
for mod in ThickAvg SurfArea GrayVol
do
# exclude analyses that were already performed
if [ -f morph_est_${atlasName}_${mod}_${pheno}.rsq ]
then
echo "Now executing " $mod $pheno
/users/k1894405/osca_Linux \
--reml \
--orm ${wd}/data/BRM/BRM_${atlasName}_${mod} \
--pheno ${pheno_dir}/${pheno}_nofid.txt \
--covar ${pheno_dir}/bcovariates_nofid.txt \
--qcovar ${pheno_dir}/qcovariates_nofid.txt \
--out ${results_dir}/morph_est_${atlasName}_${mod}_${pheno} \
--reml-est-fix \
--reml-maxit 10000
echo "Calculated morph estimate for $atlasName $mod $pheno"
else
echo "This already exists. Not running analysis." $mod $pheno
continue
fi
done
##### sbatch -p brc,shared ~/morphometricity/scripts/DK_calculate_morph_est_up.sh
#!/bin/bash -l
#SBATCH --job-name=vertex_bod
#SBATCH --output=~/morphometricity/output/vertices_make_bod_%A_%a.out
#SBATCH -D /mnt/lustre/datasets/ukbiobank/ukb40933/imaging/output
#SBATCH --mem-per-cpu=70G
#SBATCH --array=1-3 # 3 modalities
wd="~/morphometricity"
pheno_dir="/mnt/lustre/datasets/ukbiobank/ukb40933/phenotypes"
results_dir="~/morphometricity/results/morph_results"
scripts_dir="~/morphometricity/scripts"
refList=${scripts_dir}/ref_meas.txt
meas=$(awk "FNR==$SLURM_ARRAY_TASK_ID" $refList)
##################################################
### OSCA analyses for brain vertices
### Step 1: get bod files
### Transposed format --tefile
##################################################
for hemi in rh lh
do
/users/k1894405/osca_Linux \
--tefile ${wd}/data/vertices/${hemi}.${meas}_all_nodup_42999 \
--make-bod \
--no-fid \
--sd-min 0.000001 \
--out ${wd}/data/bod_files/Bod_vertices_${hemi}.${meas}
###########################################################################
## OSCA analyses:
## manage bod files to include subset of individuals that all atlases & vertices have in common
## Saved in indi.list
############################################################################
/users/k1894405/osca_Linux \
--befile ${wd}/data/bod_files/Bod_vertices_${hemi}.${meas} \
--keep ${wd}/data/indi.list.fid \
--make-bod \
--out ${wd}/data/bod_files/Bod_vertices_${hemi}.${meas}
done
### sbatch -p shared,brc ~/morphometricity/scripts/vertices_make_bod.sh
#!/bin/bash -l
#SBATCH --job-name=vertex_morph
#SBATCH --output=~/morphometricity/output/vertex_morph_%A_%a.out
#SBATCH -D ~/morphometricity
#####SBATCH --mem-per-cpu=300G
#SBATCH --mem=300G
#SBATCH --exclusive
#SBATCH --array=1-7 # 7 phenotypes
wd="~/morphometricity"
pheno_dir="/mnt/lustre/datasets/ukbiobank/ukb40933/phenotypes"
results_dir="~/morphometricity/results/morp_results_update"
scripts_dir="~/morphometricity/scripts"
refList=${scripts_dir}/ref_pheno.txt
pheno=$(awk "FNR==$SLURM_ARRAY_TASK_ID" $refList)
for meas in area volume thickness
do
echo "Starting morph estimate $meas $pheno"
if [ ! -f ${results_dir}/morph_est_vertices_${meas}_${pheno}.rsq ]
then
#########################################
### Merge hemispheres for vertex BRM
#########################################
/users/k1894405/osca_Linux \
--reml \
--orm ${wd}/data/BRM/BRM_vertices_bothhemi_${meas} \
--pheno ${pheno_dir}/${pheno}_nofid.txt \
--covar ${pheno_dir}/bcovariates_nofid.txt \
--qcovar ${pheno_dir}/qcovariates_nofid.txt \
--out ${results_dir}/morph_est_vertices_${meas}_${pheno} \
--reml-est-fix \
--reml-maxit 10000
fi
done
##### sbatch -p brc,shared ~/morphometricity/scripts/vertices_get_morph_est.sh
Launch this Rscript (and specify the brain measurement type to be cleaned) in order to read in BRMs and detect outliers of participants that are too similar or dissimilar to the rest of the sample (BRM cut-off 8SDs). Based on this script, we excluded 4 participants altogether. If any participants get excluded based on this, you will have to re-run the step making bod files, BRMs and morphometricity estimation.
args <- commandArgs(trailingOnly = TRUE)
meas=args[1]
print(paste("Looking for outliers in ", meas))
## define functions
## taken from Baptistes Github
## https://github.com/baptisteCD/Brain-LMM/blob/master/RR_4.0_BRM_QC_functions.R
## this may require ~220GB RAM
ReadORMBin=function(prefix, AllN=F, size=4){
sum_i=function(i){
return(sum(1:i))
}
BinFileName=paste(prefix,".orm.bin",sep="")
NFileName=paste(prefix,".orm.N.bin",sep="")
IDFileName=paste(prefix,".orm.id",sep="")
id = read.table(IDFileName)
n=dim(id)[1]
BinFile=file(BinFileName, "rb");
grm=readBin(BinFile, n=n*(n+1)/2, what=numeric(0), size=size)
NFile=file(NFileName, "rb");
if(AllN==T){
N=readBin(NFile, n=n*(n+1)/2, what=numeric(0), size=size)
}
else N=readBin(NFile, n=1, what=numeric(0), size=size)
i=sapply(1:n, sum_i)
return(list(diag=grm[i], off=grm[-i], id=id, N=N))
}
# Format matrix
asBRM<-function(BRMbin){
mat<-matrix(0, nrow = length(BRMbin$diag), ncol = length(BRMbin$diag))
mat[upper.tri(mat)]<-BRMbin$off
mat<-mat+t(mat)
diag(mat)<-BRMbin$diag
colnames(mat)<-BRMbin$id[,2]
rownames(mat)<-BRMbin$id[,2]
return(mat)
}
################################################################################################
setwd("/mnt/lustre/users/k1894405/morphometricity/data/BRM")
hemi = c("rh","lh")
for(i in meas){
for(j in hemi){
# read in BRM from binary file
bin_mat <- ReadORMBin(prefix=paste0("BRM_vertices_",j,".",i))
# format as BRM
mat <- asBRM(BRMbin = bin_mat)
# clean up
rm("bin_mat")
# calculate rowMeans to get average assoc of each participants with the rest of the sample
rs <- rowMeans(abs(mat))
# clean up
rm("mat")
# extract Ids of participants whos rowMeans are outside of 8SDs
outliers <- names(rs[which(rs > (mean(rs)+8*sd(rs)) ) ])
# if outliers is not empty, save IDs to exclude from analysis
if(length(outliers) > 0){
print(paste0("Outliers in ",j,".",i," are printed in ","IDs_BRM_outliers_",j,".",i,".txt")); write.table(outliers, paste0("IDs_BRM_outliers_",j,".",i,".txt"), col.names = F, row.names = F, quote = F )
}else{print(paste0(j,".",i," did not have any outliers"))}
}
}
print("Done identifying outliers for all atlases")
When all analyses ran, you will have seperate output files from each model. The script below reads all these seperate files in and summarises them in a table. It also calculates BIC.
##### Get morphometricity estimates from OSCA output
#setwd("~/morphometricity/results/morp_results_update")
#setwd("~/morphometricity/results/morph_results_null")
setwd("~/morphometricity/results/morph_est_nulldistr")
library(stringr)
files <- list.files(pattern=".rsq")
## formatting for null distributions
#files=files[grep(pattern="68randomROIs", files)]
# for files with 34: files = files[!grepl("334", files)]
## format empty output file: 7 atlases x 3 measurements x 7 phenotypes = 147 rows
## information we want: 1. morphometricity estimate, 2. SE, 3. LRT, 4. BIC, 5. N, 6. pheno, 7. atlas, 8. measurements
output <- as.data.frame(matrix(ncol = 9, nrow = length(files)))
names(output) <- c("atlas", "meas", "pheno", "morph", "SE", "pval","LRT", "BIC", "N")
for(i in 1:length(files)){
# read in raw file
raw <-read.table(files[i], fill=T, stringsAsFactors = F, header=T)
# extract info from file name about atlas, pheno & meas
info <- str_remove(files[i], pattern = "morph_est_")
info <- str_remove(info, pattern = "_pheno.rsq")
info <- str_remove(info, pattern = ".rsq")
# split all characters by _
info <- strsplit(info, "_")[[1]][1:3]
# calculate BIC
n = as.numeric(raw[10,2])
p = 1 # same for all atlases because they estimate only one variance component
logL = as.numeric(raw[5,2])
# (log(n)*p) for model complexity, (2*logL) for model performance
# the smaller BIC, the better
BIC <- (log(n)*p) - (2*logL)
# col bind info, morph, SE, LRT, BIC, N
output[i,] <- c(unlist(info), (as.numeric(raw[4,2])*100), raw[4,3], raw[9,2], raw[7,2], BIC, n)
}
# convert numeric columns
output[,4:9] <- apply(output[,4:9], 2, as.numeric)
# calculate confidence interval
output$ci_l <- output$morph - 1.96*output$SE*100
output$ci_u <- output$morph + 1.96*output$SE*100
# order output by pheno
output <- output[order(output$meas, output$pheno, output$atlas),]
# Document how many more are missing
## expecting 100*3*7=2100 for each nulldistrib
print(paste("Processed", nrow(output), "out of", (7*3*7),"estimates"))
# inspect available results
table(output$meas, output$pheno)
#write.table(output, "morph_results_QCed.table", sep = "\t", col.names =T, row.names = F, quote = F)
#write.table(output, "morph_results_68randomROIs.table", sep = "\t", col.names =T, row.names = F, quote = F)
# Notes:
# Can Bayesian Information Criterion negative?
# Generally, the aim is to minimize BIC, so if you are in a negative territory, a negative number that has the largest modulus (deepest down in the negative territory) indicates the preferred model.
# https://www.sidmartinbio.org/how-do-you-calculate-bayesian-information-criterion-in-r/#:~:text=Can%20Bayesian%20Information%20Criterion%20negative%3F%20Generally%2C%20the%20aim,be%20%E2%80%9C2%E2%80%9D.%20Should%20AIC%20be%20high%20or%20low%3F
By Anna Elisabeth Fürtjes
anna.furtjes@kcl.ac.uk