We created random parcellations for two purposes:
1. Obtain parcellations comprising 1,000, 5,000, 10,000, and 50,000 ROIs with random boundaries. Morphometricity estimates from these random parcellations are reported here, alongside morphometricity estimates from empirical brain atlases.
2. Obtain parcellations with random regional boundaries, matched to the number of ROIs in the empirical atlases. Morphometricity estimates from these parcellations can be found here.
First, we feed the fsaverage 3-dimensional coordinates of the ~300,000 vertices into a nearest-neighbour search (vcgKDtree() in the Rvcg package), which we use to grow contiguous regions around randomly selected seed vertices. We repeat this for the right and left hemisphere.
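To make this building block concrete, here is a minimal sketch (toy coordinates, not the fsaverage data) of what vcgKDtree() returns: for each query (seed) point it gives the indices and distances of its k nearest target points, and these indices are what the region-growing loop below uses to expand each ROI.
# Toy illustration of the nearest-neighbour lookup driving the region growing.
# Coordinates are made up; the real script feeds in the fsaverage vertex coordinates.
library(Rvcg)
set.seed(1)
target <- matrix(rnorm(30), ncol = 3)   # 10 "vertices" with X, Y, Z coordinates
query  <- target[c(2, 7), ]             # 2 "seed" vertices
nn <- vcgKDtree(target = target, query = query, k = 3)
nn$index     # row indices (into target) of each seed's 3 nearest vertices
nn$distance  # corresponding Euclidean distances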
####### create random vertices on Rosalind
# We pre-registered atlases with 1,000, 5,000, 10,000 and 50,000 random ROIs
# Here we create the base files that contain the vertex IDs assigned to their random ROI and their allocated colors, so we can plot the random ROIs
# for now we don't worry about the random distributions
########################################
# get input from shell
args = commandArgs(trailingOnly=TRUE)
nseeds=as.numeric(args[1])
array=args[2]
# select 500 seed vertices per hemisphere for a parcellation of 1,000 ROIs across both hemispheres
#nseeds=500 #nseeds=2500 # nseeds=5000
#####################################
### Right hemisphere
#####################################
## READ IN fsaverage data with vertex ID and X,Y,Z
library(Rvcg)
library(data.table)
# read in vertex coordinates and name columns appropriately
vertex_coord <-fread("~/datasets/ukbiobank/ukb40933/imaging/output/rh.fsaverage_vertex_coordinates",header=F)
names(vertex_coord) <- c("vertex_ID","X","Y","Z")
vertex_coord$vertex_ID=paste0("rh_", vertex_coord$vertex_ID)
# number of nearest neighbours per growth step; tested, k = 6 gives the most contiguous regions
k=6
set.seed(as.numeric(array)*9876/30*22)
select_vertex <- sample(vertex_coord$vertex_ID, size=nseeds, replace=F)
#select_vertex
vertex_coord$ROI=NA
vertex_coord$ROI[which(vertex_coord$vertex_ID %in% select_vertex)]=1:nseeds
table(vertex_coord$ROI) # unassigned vertices are NA; otherwise the value is the ROI number/name
# First plot (seeds)
roisCols=viridis::viridis(n = nseeds, alpha = 1, option = "H")
#roisCols=rainbow(nseeds)
#barplot(1:nseeds, col = roisCols) # visualise colors
vertex_coord$col <- roisCols[vertex_coord$ROI]
vertex_coord$col[which(is.na(vertex_coord$ROI))]="#D3D3D3" # a lighter grey
table(vertex_coord$col)
vertex_coord$radius=1
vertex_coord$radius[which(!is.na(vertex_coord$ROI))]=2
# Start looping
jjj=1
restot=NULL
## keep looping while more than one vertex remains unassigned to a region (at most one vertex may be left without a ROI)
while (sum(is.na(vertex_coord$ROI))>1){
gc()
# Start growing the ROIs from seeds
for (roi in 1:nseeds){
# query = all vertices currently assigned to this ROI (only the seed on the first pass)
query = as.matrix(vertex_coord[which(vertex_coord$ROI==roi), c("X", "Y", "Z")])
# target = all vertices
target <- as.matrix(vertex_coord[,c("X", "Y","Z")], rownames = vertex_coord$vertex_ID)
# k = nbNearestNeighbors
cluster <- vcgKDtree(target = target, query = query, k=k)
newVertNames=vertex_coord$vertex_ID[cluster$index]
# attribute ROI name and colors
vertex_coord$col[which(vertex_coord$vertex_ID %in% newVertNames & is.na(vertex_coord$ROI) )]=roisCols[roi]
vertex_coord$ROI[which(vertex_coord$vertex_ID %in% newVertNames & is.na(vertex_coord$ROI) )]=roi
# print(table(vertex_coord$col))
}
# Store progress
res=c(table(vertex_coord$ROI), sum(is.na(vertex_coord$ROI)), jjj )
restot=rbind(restot,res)
# Plot every time all the ROIs have grown
#plotCorticalRois(vertex_coordDF = vertex_coord, iter = jjj)
print(paste0(jjj, " iteration. ", sum(is.na(vertex_coord$ROI)), " vertices not yet allocated."))
jjj=jjj+1
#print(c(jjj, sum(is.na(vertex_coord$ROI))))
}
# Visualise the vertex without a ROI
#vertex_coord$radius[which(is.na(vertex_coord$ROI))]=3
# save vertex_coord
setwd(paste0("~/morphometricity/data/null_distributions/",array))
fwrite(vertex_coord, file=paste0("rh_vertex_coord_seed12345_ROIs",nseeds,"_k",k), quote=F, col.names=T, sep="\t")
# save info on iteration
fwrite(as.data.frame(restot), file=paste0("rh_restot_seed12345_ROIs",nseeds,"_k",k,".log"), quote=F, col.names=T)
rm(list = setdiff(ls(), c("nseeds","k","array")))
#####################################
### Left hemisphere
#####################################
# read in vertex coordinates and name columns appropriately
vertex_coord <-fread("~/datasets/ukbiobank/ukb40933/imaging/output/lh.fsaverage_vertex_coordinates",header=F)
names(vertex_coord) <- c("vertex_ID","X","Y","Z")
vertex_coord$vertex_ID=paste0("lh_", vertex_coord$vertex_ID)
set.seed(as.numeric(array)*9876/30*22)
select_vertex <- sample(vertex_coord$vertex_ID, size=nseeds, replace=F)
#select_vertex
vertex_coord$ROI=NA
vertex_coord$ROI[which(vertex_coord$vertex_ID %in% select_vertex)]=1:nseeds
table(vertex_coord$ROI) # unassigned vertices are NA; otherwise the value is the ROI number/name
# First plot (seeds)
roisCols=viridis::viridis(n = nseeds, alpha = 1, option = "H")
#roisCols=rainbow(nseeds)
#barplot(1:nseeds, col = roisCols) # visualise colors
vertex_coord$col <- roisCols[vertex_coord$ROI]
vertex_coord$col[which(is.na(vertex_coord$ROI))]="#D3D3D3" # a lighter grey
table(vertex_coord$col)
vertex_coord$radius=1
vertex_coord$radius[which(!is.na(vertex_coord$ROI))]=2
# Start looping
jjj=1
restot=NULL
## keep looping while more than one vertex remains unassigned to a region (at most one vertex may be left without a ROI)
while (sum(is.na(vertex_coord$ROI))>1){
gc()
# Start growing the ROIs from seeds
for (roi in 1:nseeds){
# query = all vertices currently assigned to this ROI (only the seed on the first pass)
query = as.matrix(vertex_coord[which(vertex_coord$ROI==roi), c("X", "Y", "Z")])
# target = all vertices
target <- as.matrix(vertex_coord[,c("X", "Y","Z")], rownames = vertex_coord$vertex_ID)
# k = nbNearestNeighbors
cluster <- vcgKDtree(target = target, query = query, k=k)
newVertNames=vertex_coord$vertex_ID[cluster$index]
# attribute ROI name and colors
vertex_coord$col[which(vertex_coord$vertex_ID %in% newVertNames & is.na(vertex_coord$ROI) )]=roisCols[roi]
vertex_coord$ROI[which(vertex_coord$vertex_ID %in% newVertNames & is.na(vertex_coord$ROI) )]=roi
# print(table(vertex_coord$col))
}
# Store progress
res=c(table(vertex_coord$ROI), sum(is.na(vertex_coord$ROI)), jjj )
restot=rbind(restot,res)
print(paste0(jjj, " iteration. ", sum(is.na(vertex_coord$ROI)), " vertices not yet allocated."))
jjj=jjj+1
#print(c(jjj, sum(is.na(vertex_coord$ROI))))
}
# Visualise the vertex without a ROI
#vertex_coord$radius[which(is.na(vertex_coord$ROI))]=3
setwd(paste0("~/morphometricity/data/null_distributions/",as.numeric(array)))
fwrite(vertex_coord, file=paste0("lh_vertex_coord_seed12345_ROIs",nseeds,"_k",k), quote=F, col.names=T, sep="\t")
fwrite(as.data.frame(restot), file=paste0("lh_restot_seed12345_ROIs",nseeds,"_k",k,".log"), quote=F, col.names=T)
Second, we apply the random parcellation to the vertex-wise data to obtain a random-atlas representation for each individual. The script below reads in chunks of individuals successively and aggregates (sums) the vertex-wise measures within each random ROI, which keeps RAM requirements manageable.
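For intuition, the minimal sketch below (toy data; vertex IDs and subject names are made up) shows the core aggregation step each chunk performs: vertex-wise values are joined to their random ROI labels and then summed within ROI for every participant column.
# Minimal sketch of the within-ROI aggregation (toy data; subject names are made up)
library(data.table)
vertex_coord <- data.table(vertex_ID = paste0("lh_", 0:5),
                           ROI       = c(1, 1, 2, 2, 3, 3))
vertex_wise  <- data.table(vertex_ID = paste0("lh_", 0:5),
                           subj_A    = rnorm(6),
                           subj_B    = rnorm(6))
merged <- merge(vertex_coord, vertex_wise, by = "vertex_ID")
# one row per ROI, one column per participant, values summed across the ROI's vertices
roi_sums <- merged[, lapply(.SD, sum, na.rm = TRUE), by = ROI,
                   .SDcols = c("subj_A", "subj_B")]
roi_sums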
###### calculate morph for random atlases
## Analysis steps:
## 1. Read in data (not whole but in chunks)
## 2. Read in random atlas allocation
## 3. Get colSums for each participant across random vertices
## 4. Format output so that it is not transposed (participants in rows, ROIs in columns)
##### Run this shell snippet beforehand to get a reference list of subject IDs (the header file)
## line=$(head -n 1 lh.volume_all_vertices_47602)
## echo $line | tr " " "\n" >> header_${mod}
# Load arguments
args = commandArgs(trailingOnly=TRUE)
mod=args[1]
nchunks=args[2] # e.g., for ~43,000 participants and 40 chunks, ~1,075 participants are handled per iteration
nchunks=as.numeric(nchunks)
randomROIs=as.numeric(args[3])
headerName=paste0("header_",mod)
print(mod)
print(nchunks)
print(randomROIs)
print(headerName)
###########################################################################
# PART 1: Create tempfiles
###########################################################################
# dependencies
library(data.table)
library(stringr)
library(dplyr)
setwd("~/morphometricity/")
data_dir <- "~/morphometricity/data/vertices"
random_dir <- "~/morphometricity/data/Random_parcellation"
# read in header to figure out unique columns, e.g., header_lh.volume
headerNames<-fread(paste0(random_dir,"/",headerName), header=T)
# note: the first subject ID becomes the column name when reading with header=T, so append it back
headerNames<-as.vector(unlist(append(names(headerNames), headerNames)))
# identify unique subjectNames
## because subjNames are numbers, the input files HAVE to be cleaned beforehand and only contain unique subjNames
#subjNames <- as.vector(unique(headerNames))
# work out if we're in lh or rh
hemi <- substring(mod, 1,2)
# read in coordinates for random atlas
vertex_coord <- fread(paste0(random_dir, "/", hemi, "_vertex_coord_seed12345_ROIs", randomROIs, "_k6"))
# only need the vertex_IDs and allocated ROI
vertex_coord <- vertex_coord[,c("vertex_ID", "ROI")]
# loop through nchunks to use less RAM
# steps tells us how many participants are loaded in each iteration
steps=ceiling(length(headerNames)/nchunks)
print(paste("Processing vertex-wise files in steps of", steps,"."))
for(n in 0:nchunks){
# check whether tempfiles already exist (the 40 below matches the nchunks=40 used in practice)
out_files <- list.files(path=random_dir, pattern=paste0("temp_",mod,"_",randomROIs))
if(length(out_files) == 40){print("Tempfiles already exist"); break}
print(paste("Starting iteration no. ",n))
# create structure of desired output
output <- vector(mode = "list", length = randomROIs)
# work out which columns we want depending on n (blocks of size steps)
counter1=2+(n*steps)
counter2=counter1+(steps-1)
# if the last iteration exceeds the dimensions of the file, choose the last headerName as the last column to read in
if(counter2 > length(headerNames)){counter2=length(headerNames)}
if(counter1 > length(headerNames)){next}
# find appropriate file
file <- list.files(path=data_dir, pattern=paste0(mod,"_all_"))
#file <- list.files(path=data_dir, pattern=paste0(mod,"_output_NaN_remains"))
# read in all vertices for one chunk of participants (steps columns) at a time
vertex_wise <- fread(paste0(data_dir,"/",file), header=T, select = c(1,counter1:counter2))
# merge vertex_wise values with vertex_coord
names(vertex_coord)[1] <- names(vertex_wise)[1]
vertex_wise <- plyr::join(vertex_coord, vertex_wise, by = names(vertex_wise)[1])
## get summed measurement per ROI
# loop through each ROI
for(j in 1:randomROIs){
output[[j]]<-colSums(vertex_wise[which(vertex_wise$ROI == j),3:length(vertex_wise)], na.rm=TRUE)
}
# format output
output <- as.data.frame(output)
output$IID <- row.names(as.data.frame(output))
# get ID column as first column
output <- output[,c("IID", names(output)[1:(length(names(output))-1)])]
names(output)<-c("IID",paste0(hemi,".ROI",1:randomROIs))
# save this in a tempfile
fwrite(output, file=paste0(random_dir,"/temp_",mod, "_", randomROIs,"randomROIs_iter",n), sep="\t", col.names=T, row.names=F, quote=F)
# clean up
rm(vertex_wise)
}
print("Created and saved all temp files")
###########################################################################
# PART 2: Combine tempfiles
###########################################################################
# save file names from all iterations
out_files <- list.files(path=random_dir, pattern=paste0("temp_",mod,"_",randomROIs))
output <- NULL
setwd(random_dir)
for(i in 1:length(out_files)){
# read in tempfile
randomAtlas <- fread(paste0(random_dir,"/",out_files[i]), header=T)
# merge with outputfile
output <- rbind(output, randomAtlas)
}
### remove any duplicated IIDs
print(paste(sum(duplicated(output$IID)),"duplicated IIDs."))
output<-output[!duplicated(output$IID),]
print(paste("After removing duplicates, we have ", nrow(output), "participants left."))
fwrite(output, file=paste0(random_dir,"/",mod, "_", randomROIs,"randomROIs"), sep="\t", col.names=T, row.names=F, quote=F)
print(paste("Merged and saved final output file", mod, randomROIs, "random ROIs"))
# clean up
rm(list = setdiff(ls(), c("random_dir","mod", "randomROIs","out_files")))
###########################################################################
# PART 3: Delete tempfiles
###########################################################################
# clean up temp files, do it at very end in case job gets killed
for(k in out_files){
file.remove(paste0(random_dir,"/",k))
}
print("Cleaned tempfiles")
###########################################################################
# PART 4: Combine with other hemisphere if exists
###########################################################################
current <- paste0(random_dir,"/",mod, "_", randomROIs,"randomROIs")
if(grepl("lh.",current,fixed = TRUE)){
alternative <- str_replace(current, pattern="lh.", replacement="rh.")
}else if(grepl("rh.",current,fixed = TRUE)){
alternative <- str_replace(current, pattern="rh.", replacement="lh.")
}
# check if counterpart in other hemisphere exists
if(file.exists(alternative)){
print("Counterpart already exists. Merging two hemispheres.")
# read in both files
hemi1<-fread(current, header=T)
hemi2<-fread(alternative, header=T)
# check that the column names differ between hemispheres (lh vs rh prefixes)
if(names(hemi1)[2] == names(hemi2)[2]){stop("Lh and Rh have the same labelling.")}
# merge both hemispheres by maintaining order
both_hemi <- plyr::join(hemi1, hemi2, by = "IID")
# clean up
rm("hemi1")
rm("hemi2")
# save
meas<-substring(mod, 4)
fwrite(both_hemi, file=paste0(random_dir,"/both_hemi_",meas, "_", (2*randomROIs),"randomROIs"), sep="\t", col.names=T, row.names=F, quote=F)
# clean up & remove indiv hemi files
file.remove(current)
file.remove(alternative)
print(paste0("Done creating random atlas with ", (2*randomROIs)," randomROIs for ", meas))
}else{print(paste("Counterpart for ", mod, "does not yet exist."))}
The chunk below shows an example of launching this last script on the SLURM cluster.
#!/bin/bash -l
#SBATCH --output=~/morphometricity/output/get_random_atlases_%A_%a.out
#SBATCH --job-name=random_atlases
#SBATCH --time=2-00:00
#SBATCH --mem-per-cpu=80G
#SBATCH --array=1-3 ## one task per measure (area/ thickness/ volume); rh/ lh are looped over within the script
scripts_dir="~/morphometricity/scripts"
data_dir="~/morphometricity/data/vertices"
out_dir="~/morphometricity/data/Random_parcellation"
refList=${scripts_dir}/ref_meas.txt
meas=$(awk "FNR==$SLURM_ARRAY_TASK_ID" $refList)
# number of ROIs per hemisphere to be included in the random atlas (2*nroi ROIs across both hemispheres)
nroi=25000
module load apps/R/3.6.0
for hemi in rh lh
do
# create reference list for subjIDs
line=$(head -n 1 ${data_dir}/${hemi}.${meas}_all_*)
echo $line | tr " " "\n" >> ${out_dir}/header_${hemi}.${meas}
# args = mod & nchunks & randomROIs
Rscript ~/morphometricity/scripts/get_random_atlases.R ${hemi}.${meas} 40 $nroi
rm ${out_dir}/header_${hemi}.${meas}
done
#/usr/bin/sacct --format="CPUTime,MaxRSS"
#### sbatch -p shared,brc ~/morphometricity/scripts/Run_get_random_atlases.sh
The following script generates morphometricity estimates for the random atlases, using OSCA to build a brain-relatedness matrix and fit a REML model for each phenotype.
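For intuition only, and not the OSCA implementation itself: a brain-relatedness matrix (the ORM in OSCA) is commonly defined by standardising each ROI column and taking the scaled cross-product, so that entry (i, j) reflects how similar the regional brain measures of individuals i and j are; the REML step then estimates the proportion of phenotypic variance attributable to this matrix. A minimal sketch, assuming this standard definition and toy data:
# Hedged sketch of a brain-relatedness matrix: standardise ROIs, then scaled cross-product.
# Toy data only; OSCA computes its ORM from the binary bod files created in the script below.
set.seed(1)
n <- 5; m <- 10                        # 5 individuals, 10 ROIs (made-up values)
roi_measures <- matrix(rnorm(n * m), nrow = n)
Z <- scale(roi_measures)               # column-wise standardisation (mean 0, SD 1)
BRM <- tcrossprod(Z) / m               # entry (i, j) = mean over ROIs of z_ik * z_jk
round(BRM, 2)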
#!/bin/bash -l
#SBATCH --job-name=eff_null_morph
#SBATCH --output=~/morphometricity/output/area_null_morph_%A_%a.out
#SBATCH -D ~/morphometricity
#SBATCH --mem-per-cpu=80G
#SBATCH --time=2-00:00
#SBATCH --array=1-100 # 100 distributions
pheno_dir="~/datasets/ukbiobank/ukb40933/phenotypes"
wd="~/morphometricity/data/null_distributions/$SLURM_ARRAY_TASK_ID"
ref_dir="~/morphometricity/data"
scripts_dir="~/morphometricity/scripts"
results_dir="~/morphometricity/results/morph_est_nulldistr"
nroi=68
mod="area"
cd $wd
for pheno in sex_pheno bmi_pheno edu_pheno alc_pheno cig_pheno gpheno age_pheno
do
# only calculate the BRM if it doesn't already exist AND no morphometricity estimate exists yet (if a previous job ran out of time, it won't have deleted the BRM, so we don't have to re-calculate it)
if [ ! -f ${wd}/BRM/BRM_${mod}_${nroi}randomROIs_${SLURM_ARRAY_TASK_ID}.orm.bin ] && [ ! -f ${results_dir}/morph_est_${nroi}randomROIs_${mod}_${pheno}_${SLURM_ARRAY_TASK_ID}.rsq ]
then
mkdir -p ${wd}/bod_files
############################################################################
# MAKE BOD FILES
# Make binary input files for LMM analyses
# exclude input variables (here, ROIs) with a near-zero SD
############################################################################
osca_Linux \
--efile ${wd}/both_hemi_${mod}_${nroi}randomROIs \
--make-bod \
--sd-min 0.000001 \
--no-fid \
--out ${wd}/bod_files/Bod_${mod}_${nroi}randomROIs_${SLURM_ARRAY_TASK_ID}
###########################################################################
## OSCA analyses:
## restrict the bod files to the subset of individuals that all atlases & vertex-wise data have in common
## Saved in indi.list
############################################################################
osca_Linux \
--befile ${wd}/bod_files/Bod_${mod}_${nroi}randomROIs_${SLURM_ARRAY_TASK_ID} \
--keep ${ref_dir}/indi.list.fid \
--make-bod \
--out ${wd}/bod_files/Bod_${mod}_${nroi}randomROIs_${SLURM_ARRAY_TASK_ID}
echo "Created bod files for $mod $nroi random areas"
echo "Calculating BRM for $nroi random ROIs $mod"
mkdir -p ${wd}/BRM
############################################################################
# CALCULATE BRAIN-RELATEDNESS-MATRIX
############################################################################
osca_Linux \
--befile ${wd}/bod_files/Bod_${mod}_${nroi}randomROIs_${SLURM_ARRAY_TASK_ID} \
--make-orm \
--out ${wd}/BRM/BRM_${mod}_${nroi}randomROIs_${SLURM_ARRAY_TASK_ID}
echo "Calculated BRMs for ${mod} ${nroi} randomROIs"
#####################################
## Clean up bod files to make space
#####################################
rm ${wd}/bod_files/Bod_${mod}_${nroi}randomROIs_*
echo "Cleaned up bod files"
else
echo "BRM already exists"
fi
# only calculate morph est if it doesn't already exist
if [ ! -f ${results_dir}/morph_est_${nroi}randomROIs_${mod}_${pheno}_${SLURM_ARRAY_TASK_ID}.rsq ]
then
#####################################
## Get morph estimate
#####################################
osca_Linux \
--reml \
--orm ${wd}/BRM/BRM_${mod}_${nroi}randomROIs_${SLURM_ARRAY_TASK_ID} \
--pheno ${pheno_dir}/${pheno}_nofid.txt \
--covar ${pheno_dir}/bcovariates_nofid.txt \
--qcovar ${pheno_dir}/qcovariates_nofid.txt \
--out ${results_dir}/morph_est_${nroi}randomROIs_${mod}_${pheno}_${SLURM_ARRAY_TASK_ID} \
--reml-est-fix \
--reml-maxit 10000
echo "Calculated morph estimate for $nroi $mod $pheno ${SLURM_ARRAY_TASK_ID}"
else
echo "Morph results already exist for $nroi $mod $pheno ${SLURM_ARRAY_TASK_ID}. Skipping"
fi
done
#####################################
## Clean up BRM files to make space
#####################################
rm ${wd}/BRM/BRM_${mod}_${nroi}randomROIs_*
################ sbatch -p brc,shared ~/morphometricity/scripts/Efficient_null_distrib_morph_area.sh
By Anna Elisabeth Fürtjes
anna.furtjes@kcl.ac.uk