# Initial sample QC (R code)## 1. Do 4 means clustering to get europeans only## 2. Get variable with het & miss info## 3. Generate a keep list # load packageslibrary(data.table)library(ggplot2)# get all participant IDs that have neuroimaging data (this is the phenotypic data prepared as GWAS input)neuro =fread("UKB_CrossNeuroIDP.txt", select =1:2)# get PC1:40 data using ukbtools packageukb_sqc=fread("UKBB_500K/Decrypted/ukb_sqc_v2.txt")# https://kenhanscombe.github.io/ukbtools/articles/explore-ukb-data.html# With ukb_sqc_v2.txt read into the dataframe my_sqc_dataukb_sqc <-ukb_gen_sqc_names(ukb_sqc)# genetic file is organised in the same order as application-specific ref fam fileref =fread("UKBB_500K/Decrypted/ukb1027_cal_chr1_v2_s488374.fam")# merge ref file to get IDsukb_sqc =cbind(ref[,c("V1")], ukb_sqc)# re-name ID column that comes from the fam filenames(ukb_sqc)[which(names(ukb_sqc) =="V1")] ="IID"################################ Get variable with het & miss info###############################table(ukb_sqc$het_missing_outliers)print(paste("We lose ",nrow(neuro)-sum(neuro$IID %in% ukb_sqc$IID), " participants from the neuroimaging data set because these IDs are not available in the genetic data. (N = ", sum(neuro$IID %in% ukb_sqc$IID),")"))#[1] "We lose 1220 participants from the neuroimaging data set because these IDs are not available in the genetic data. (N = 45598 )"neuro =merge(neuro, ukb_sqc[,c("IID","het_missing_outliers")], by ="IID")print(paste("We lose ", sum(neuro$het_missing_outliers ==1)," participants because they were outliers in heterozygosity and missinggness. N = ", sum(neuro$het_missing_outliers ==0), ""))#[1] "We lose 83 participants because they were outliers in heterozygosity and missinggness. N = 45515 "# exclude particpants that were labeled as outliers by UKB core teamneuro = neuro[which(neuro$het_missing_outlier ==0),]################################# do 4 means clustering##################################Read in PCs, remove NAs, rename#PCs<-fread(args[1], data.table=F)colNames =c("V1", paste0("pc",1:40))PCs <- ukb_sqc[, ..colNames]PCs<-na.omit(PCs)names(PCs)<-c("ID","PC.1","PC.2","PC.3","PC.4","PC.5","PC.6","PC.7","PC.8","PC.9","PC.10","PC.11","PC.12","PC.13","PC.14","PC.15","PC.16","PC.17","PC.18","PC.19","PC.20","PC.21","PC.22","PC.23","PC.24","PC.25","PC.26","PC.27","PC.28","PC.29","PC.30","PC.31","PC.32","PC.33","PC.34","PC.35","PC.36","PC.37","PC.38","PC.39","PC.40")##Set seedset.seed(1204688)##K means clustering on each PCK_MEAN <-4PC1_K<-kmeans(PCs$PC.1, K_MEAN)PC2_K<-kmeans(PCs$PC.2, K_MEAN)##Add clusters to PC dataframePCs$PC1.Cluster<-PC1_K$clusterPCs$PC2.Cluster<-PC2_K$clusterPCs$Clusters<-as.factor(paste(PC1_K$cluster,PC2_K$cluster,sep="."))##WWE group is the majorityMAX_PC1<-ifelse(match(max(table(PCs$PC1.Cluster, PCs$PC2.Cluster)), table(PCs$PC1.Cluster, PCs$PC2.Cluster)) %% K_MEAN ==0, K_MEAN, match(max(table(PCs$PC1.Cluster, PCs$PC2.Cluster)), table(PCs$PC1.Cluster, PCs$PC2.Cluster)) %% K_MEAN)MAX_PC2<-ceiling(match(max(table(PCs$PC1.Cluster, PCs$PC2.Cluster)), table(PCs$PC1.Cluster, PCs$PC2.Cluster))/K_MEAN)##Make lists of WWE IDsWWE<-as.data.frame(PCs[PCs$PC1.Cluster == MAX_PC1 & PCs$PC2.Cluster == MAX_PC2,1])names(WWE)<-"ID"## Get overlap with participants of interestWWE = WWE[WWE$ID %in% neuro$FID,]print(paste("We excluded ", nrow(neuro)-length(WWE),"participants based on 4-means clustering. N = ", length(WWE), ""))#[1] "We excluded 1332 participants based on 4-means clustering. N = 44183 "WWE_PLINK<-as.data.frame(cbind(WWE,WWE))names(WWE_PLINK)<-c("FID","IID")##Write to filewrite.table(WWE_PLINK, file=paste0(out,"/ukb_neuroimaging_4MeansClustering_excl_het_miss.txt"), row.names=F, col.names=T, quote = F)##Plotpdf(paste0(out,"/ukb_neuroimaging_4MeansClustering", ".pdf"))with(PCs, print(qplot(PC.1, PC.2, colour=Clusters)))dev.off()################################ check how many of those participants now have self-reported different ancestry############################## this function identifies the path to the most recent download file on our servergetFieldLoc =function(path = path, fileName = fileName, fieldID = fieldID){library(stringr)# read all the field.ukb files files_to_read =list.files(path = path,pattern = fileName,recursive = T,full.names = T )# read all files dat =lapply(files_to_read, fread)names(dat) = files_to_read# search for field ID of interest candidates =names(dat)[grep(fieldID, dat)]# figure out which one is from the most recent file (i.e., highest number) candidates =str_remove(candidates, paste0(path, "/")) candidates =str_remove(candidates, paste0("/", fileName)) candidates =unique(as.numeric(sapply(str_extract_all(candidates, "\\d+"), tail , 1))) most_recent =max(candidates, na.rm =T)return(paste0(path, most_recent))}path1 =getFieldLoc(path = path, fileName ="fields.ukb", fieldID =21000)fileID =list.files(pat=path1,pattern="csv")# read in filefile =fread(paste0(path1, "/", fileID))# file doesnt like column names that start with number and it doesnt like -names(file) =paste0("f.",names(file))names(file) =gsub("-", "_", names(file), fixed = T)# list columns of interestid=which(names(file) =="f.eid")Cols =grep("f.21000", names(file))file = file[, c(..id, ..Cols)]names(file)[which(names(file) =="f.eid")] ="IID"# merge file with neuro but only keep the IDs available in neuroneuro1 =merge(neuro, file, by ="IID", all.x=T)# make a table for the remaining particpants to see what ethnicity they reportedtable(neuro1$f.21000_0.0)# make variable that encodes European ancestryneuro1$European <-ifelse(neuro1$f.21000_0.0==1| neuro1$f.21000_0.0==1001| neuro1$f.21000_0.0==1002| neuro1$f.21000_0.0==1003,1,0)print(paste("We're losing ", sum(neuro1$European ==0, na.rm=T)," participants because they self-reported to be non-European.",sum(is.na(neuro1$European)),"are missing this info and will also be excluded. N remaining = ", sum(neuro1$European ==1, na.rm=T)))#[1] "We're losing 1470 participants because they self-reported to be non-European. 9 are missing this info and will also be excluded. N remaining = 44036"# delete non-Europeanneuro1 = neuro1[which(neuro1$European ==1),]write.table(neuro1, file=paste0(out,"/ukb_neuroimaging_4MeansClustering_excl_het_missEUR.txt"), row.names=F, col.names=T, quote = F)##### At this point, we now have a list of IDs saved in ukb_neuroimaging_4MeansClustering_excl_het_miss.txt that are:# People with neuroimaging data# Have genetic data# Have not been labeled unusual by the UKB core team for missingness and heterozygosity# Have been included based on 4 means clustering# Have self-reported that they are of European descent
Format genetic data
Code
########################################################################################################### Step 0: Only include participants with neuroimaging data ## this set of individuals will already exclude non-Europeans (PCs and self-report)## extreme scores on hetezygosity and missingness (see above)# copy bim bed and fam files to get consistent naming across bim and bed files## bed filesfor CHR in{1..22}docp${sourceBED}/ukb_cal_chr${CHR}_v2.bed ${target}/ukb_chr${CHR}_v2.beddone## bim filesfor CHR in{1..22}docp${sourceBIM}/ukb_snp_chr${CHR}_v2.bim ${target}/ukb_chr${CHR}_v2.bimdone## fam filecp /GWAS_Source/UB_BB/UKBB_500K/Decrypted/ukb1027_cal_chr1_v2_s488374.fam ${target}/ukb_v2.fam# do fam file for each chromosomefor CHR in{1..22}docp${target}/ukb_v2.fam ${target}/ukb_chr${CHR}_v2.famdone## put data from each chromosome together into one bim, bed and fam file## and filter participants of interestcd$targetphenoIDs="/BrainAtrophy/data/geneticQC/ukb_neuroimaging_4MeansClustering_excl_het_missEUR.txt"plink19\--merge-list $target/allGenoFiles.txt \--make-bed \--keep $phenoIDs\--out ukb_neuroimaging_reQC## starting out with 784256 variants and 488377 participants, that are getting cut down to the 44036 participants that survived cleaning above################################################## I realised later that this is not including the sex-chromosomes which are needed for sex-check later (this merging back and forth could have been done in one step, but I am saving resources to not to it from scratch again)# Copy X, Y and XY chromosome info first## bed filesfor CHR in X XY Ydocp${sourceBED}/ukb_cal_chr${CHR}_v2.bed ${target}/ukb_chr${CHR}_v2.beddone## bim filesfor CHR in X XY Ydocp${sourceBIM}/ukb_snp_chr${CHR}_v2.bim ${target}/ukb_chr${CHR}_v2.bimdone## fam file for each chromosomefor CHR in X XY Ydocp${target}/ukb_v2.fam ${target}/ukb_chr${CHR}_v2.famdone## merge X, Y, XY files togetherplink19\--merge-list ${target}/allXYfiles.txt \--make-bed \--keep $phenoIDs\--out ukb_neuroimaging_sexchrom# now merge sex chromosomes with other dataplink19\--merge-list ${target}/allAuto_and_Sex.txt \--make-bed \--keep $phenoIDs\--out ukb_neuroimaging_autosomal_sex_preQC## now altogether, we have 44036 partcicipants left, and 805161 variants
Filter for missing genotype data and minor allele frequency
Code
# Step 1: Get SNP-list based on geno filterplink19\--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \--geno 0.02 \--write-snplist \--out ${target}/ukb_neuroimaging_GENO0.02# removed 104462 variants, 700699 remaining# no participants removed (44036 remaining)## this creates a snplist that feeds into next step# Step 2: apply various cleaning steps to get ID list (.fam) to feed into relatedness analysisplink19\--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \--geno 0.02 \--extract ukb_neuroimaging_GENO0.02.snplist \--write-snplist \--make-just-fam \--freq --maf 0.01 \--out ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR# this fam file still contains 44036 participants# MAF filter removed 103158 variants, 597541 remaining
Remove related individuals
Our server was incompatible with the greedyRelated software, which is why I use R package here instead that runs the same analysis.
Code
library(ukbtools)library(data.table)# read in ukb relatedness filerel =fread("/UKBB_500K/Decrypted/ukb1027_rel_s488374.dat")neuro =fread("ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR.fam")sum(neuro$V1 %in% rel$ID1)# 7509sum(neuro$V1 %in% rel$ID2)# 7748# identify degree of relatedness ukb_gen_rel_count(rel)ukb_gen_rel_count(rel, plot =TRUE)# generate a list of participants to exclude based on relatedness# default cut-off > 0.0884 King coefficient corresponding to 3rd degree relatednessIDtoRemove =ukb_gen_samples_to_remove(rel, ukb_with_data = neuro$V1)# get PLINK formatIDtoRemove<-as.data.frame(cbind(IDtoRemove,IDtoRemove))names(IDtoRemove)<-c("FID","IID")# write table write.table(IDtoRemove, file=paste0(out,"/IDstoRemove_related.txt"), row.names=F, col.names=T, quote = F)# identified 622 IDs to remove due to relatedness
This next step removes IDs of related individuals just identified in the previous step.
Code
# Step 5: Make fam ID list excluding related individualsrelatedIDs="/BrainAtrophy/data/geneticQC/sourceCopy/IDstoRemove_related.txt"plink19\--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \--geno 0.02 \--extract ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR.snplist \--keep ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR.fam \--write-snplist \--make-just-fam \--remove $relatedIDs\--hardy --hwe 0.00000001 \--out ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001# after removing 622 IDs due to relatedness, we have 43414 participants remaining# 84 variants removed due to missing genotype data# 11398 variants removed due to Hardy-Weinberg exact test (running this analysis throws a warning that "--hwe observation counts vary by more than 10%, due to the X chromosome. You may want to use a less stringent --hwe p-value threshold for X chromosome variants" - however, without the X chromsome, this filter removes 11169 variants which is about the same so will leave this# 586058 variants remaining (which is about the same as in my previous study in another application)# the resulting .fam and .snplist files could now be fed into regenie, but first we'll also do sex-check
Perform sex-check
Code
# Step 6: Prune SNPs for sex-check# to do sex-check, we prune SNP data to be independent and exclude high LD regions# this is because patterns of LD will impair chromosome-specific tests of homozygosityawk-f${target}/highLDregions4bim_b37.awk ${target}/ukb_neuroimaging_reQC.bim >${target}/ukb_neuroimaging_High_LD_Regions_To_Exclude.txt# this file contains 28923 SNPs to exclude from sex-check analysisplink19\--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \--maf 0.05 \--hwe 0.001 \--geno 0.02 \--thin-indiv-count 300 \--indep-pairphase 200 100 0.2 \--seed 1204688 \--exclude ${target}/ukb_neuroimaging_High_LD_Regions_To_Exclude.txt \--extract ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001.snplist \--keep ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001.fam \--out ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_LD1_ALL# Step 7: Perform sex-check# F statistic used here is a function of teh deviation of the observed number of heterozygote variants from that expected under Hardy-Weinberg equilibrium# males should have F ~1 because all X chromosome variants are hemizygous and no heterozygotes can be observed# females should have lower values of F, distributed around 0 (but females with very high F stats have been observed)# this script shouldn't need a seperate --split-x analysis because the X chromosome's pseudoautosomal region is already presented as XY chromosomeplink19\--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \--check-sex \--keep ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001.fam \--extract ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_LD1_ALL.prune.in \--out ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_PRUNE_SEX
Remove individuals that did not pass sex-check
Code
### Extract participants that have been labeled as PROBLEMRlibrary(data.table)dat=fread("ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_PRUNE_SEX.sexcheck")IDstoRemove=dat[which(dat$STATUS =="PROBLEM"),c("FID","IID")]# 22 write.table(IDstoRemove, file=paste0(out,"/sexCheck_toRemove.txt"), row.names=F, col.names=T, quote = F)
Generate final .fam and .snplist files
Code
## Step 8: Generate final .fam and .snplist to feed into regenie (excluding failed sex checks)plink19\--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \--extract ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001.snplist \--keep ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001.fam \--write-snplist \--make-just-fam \--remove ${target}/sexCheck_toRemove.txt \--out ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_sexcheck
Generate final bed file
Code
# Step 9: Generate final bed files that will be used as inputplink19\--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \--extract ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_sexcheck.snplist \--keep ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_sexcheck.fam \--chr 1-22 \--make-bed \--out ${final}/ukb_neuroimaging_brainAtrophy_GWASinput