UK Biobank: Genetic data cleaning

Author

Anna Elisabeth Furtjes

Published

June 6, 2025

Doi

Initial sample-level quality control (QC)

Code
# Initial sample QC (R code)
## 1. Do 4 means clustering to get europeans only
## 2. Get variable with het & miss info
## 3. Generate a keep list 

# load packages
library(data.table)
library(ggplot2)

# get all participant IDs that have neuroimaging data (this is the phenotypic data prepared as GWAS input)
neuro = fread("UKB_CrossNeuroIDP.txt", select = 1:2)

# get PC1:40 data using ukbtools package
ukb_sqc=fread("UKBB_500K/Decrypted/ukb_sqc_v2.txt")

# https://kenhanscombe.github.io/ukbtools/articles/explore-ukb-data.html
# With ukb_sqc_v2.txt read into the dataframe my_sqc_data
ukb_sqc <- ukb_gen_sqc_names(ukb_sqc)

# genetic file is organised in the same order as application-specific ref fam file
ref = fread("UKBB_500K/Decrypted/ukb1027_cal_chr1_v2_s488374.fam")

# merge ref file to get IDs
ukb_sqc = cbind(ref[,c("V1")], ukb_sqc)
# re-name ID column that comes from the fam file
names(ukb_sqc)[which(names(ukb_sqc) == "V1")] = "IID"

###############################
# Get variable with het & miss info
###############################
table(ukb_sqc$het_missing_outliers)

print(paste("We lose ",nrow(neuro)-sum(neuro$IID %in% ukb_sqc$IID), " participants from the neuroimaging data set because these IDs are not available in the genetic data. (N = ", sum(neuro$IID %in% ukb_sqc$IID),")"))
#[1] "We lose  1220  participants from the neuroimaging data set because these IDs are not available in the genetic data. (N =  45598 )"

neuro = merge(neuro, ukb_sqc[,c("IID","het_missing_outliers")], by = "IID")

print(paste("We lose ", sum(neuro$het_missing_outliers == 1)," participants because they were outliers in heterozygosity and missinggness. N = ", sum(neuro$het_missing_outliers == 0), ""))
#[1] "We lose  83  participants because they were outliers in heterozygosity and missinggness. N =  45515 "

# exclude particpants that were labeled as outliers by UKB core team
neuro = neuro[which(neuro$het_missing_outlier == 0),]

################################
# do 4 means clustering
################################
##Read in PCs, remove NAs, rename
#PCs<-fread(args[1], data.table=F)
colNames = c("V1", paste0("pc",1:40))
PCs <- ukb_sqc[, ..colNames]
PCs<-na.omit(PCs)
names(PCs)<-c("ID","PC.1","PC.2","PC.3","PC.4","PC.5","PC.6","PC.7","PC.8","PC.9","PC.10","PC.11","PC.12","PC.13","PC.14","PC.15","PC.16","PC.17","PC.18","PC.19","PC.20","PC.21","PC.22","PC.23","PC.24","PC.25","PC.26","PC.27","PC.28","PC.29","PC.30","PC.31","PC.32","PC.33","PC.34","PC.35","PC.36","PC.37","PC.38","PC.39","PC.40")

##Set seed
set.seed(1204688)

##K means clustering on each PC
K_MEAN <- 4

PC1_K<-kmeans(PCs$PC.1, K_MEAN)
PC2_K<-kmeans(PCs$PC.2, K_MEAN)

##Add clusters to PC dataframe
PCs$PC1.Cluster<-PC1_K$cluster
PCs$PC2.Cluster<-PC2_K$cluster
PCs$Clusters<-as.factor(paste(PC1_K$cluster,PC2_K$cluster,sep="."))

##WWE group is the majority

MAX_PC1<-ifelse(match(max(table(PCs$PC1.Cluster, PCs$PC2.Cluster)), table(PCs$PC1.Cluster, PCs$PC2.Cluster)) %% K_MEAN == 0, K_MEAN, match(max(table(PCs$PC1.Cluster, PCs$PC2.Cluster)), table(PCs$PC1.Cluster, PCs$PC2.Cluster)) %% K_MEAN)

MAX_PC2<-ceiling(match(max(table(PCs$PC1.Cluster, PCs$PC2.Cluster)), table(PCs$PC1.Cluster, PCs$PC2.Cluster))/K_MEAN)

##Make lists of WWE IDs
WWE<-as.data.frame(PCs[PCs$PC1.Cluster == MAX_PC1 & PCs$PC2.Cluster == MAX_PC2,1])
names(WWE)<-"ID"

## Get overlap with participants of interest
WWE = WWE[WWE$ID %in% neuro$FID,]
print(paste("We excluded ", nrow(neuro)-length(WWE),"participants based on 4-means clustering. N = ", length(WWE), ""))
#[1] "We excluded  1332 participants based on 4-means clustering. N =  44183 "

WWE_PLINK<-as.data.frame(cbind(WWE,WWE))
names(WWE_PLINK)<-c("FID","IID")

##Write to file
write.table(WWE_PLINK, file=paste0(out,"/ukb_neuroimaging_4MeansClustering_excl_het_miss.txt"), row.names=F, col.names=T, quote = F)

##Plot
pdf(paste0(out,"/ukb_neuroimaging_4MeansClustering", ".pdf"))
with(PCs, print(qplot(PC.1, PC.2, colour=Clusters)))
dev.off()


#############################
### check how many of those participants now have self-reported different ancestry
#############################
# this function identifies the path to the most recent download file on our server
getFieldLoc = function(path = path, fileName = fileName, fieldID = fieldID){
  library(stringr)
  # read all the field.ukb files
  files_to_read = list.files(
    path = path,
    pattern = fileName,
    recursive = T,
    full.names = T
  )
  # read all files
  dat = lapply(files_to_read, fread)
  names(dat) = files_to_read
  
  # search for field ID of interest 
  candidates = names(dat)[grep(fieldID, dat)]
  # figure out which one is from the most recent file (i.e., highest number)
  candidates = str_remove(candidates, paste0(path, "/"))
  candidates = str_remove(candidates, paste0("/", fileName))
  candidates = unique(as.numeric(sapply(str_extract_all(candidates, "\\d+"), tail , 1)))
  most_recent = max(candidates, na.rm =T)
  
  return(paste0(path, most_recent))
}

path1 = getFieldLoc(path = path, 
            fileName = "fields.ukb", 
            fieldID = 21000)

fileID = list.files(pat=path1,pattern="csv")
# read in file
file = fread(paste0(path1, "/", fileID))
# file doesnt like column names that start with number and it doesnt like -
names(file) = paste0("f.",names(file))
names(file) = gsub("-", "_", names(file), fixed = T)

# list columns of interest
id=which(names(file) == "f.eid")
Cols = grep("f.21000", names(file))
file = file[, c(..id, ..Cols)]
names(file)[which(names(file) == "f.eid")] = "IID"

# merge file with neuro but only keep the IDs available in neuro
neuro1 = merge(neuro, file, by = "IID", all.x=T)

# make a table for the remaining particpants to see what ethnicity they reported
table(neuro1$f.21000_0.0)

# make variable that encodes European ancestry
neuro1$European <- ifelse(neuro1$f.21000_0.0 == 1 | neuro1$f.21000_0.0 == 1001 | neuro1$f.21000_0.0 == 1002 | neuro1$f.21000_0.0 == 1003,1,0)

print(paste("We're losing ", sum(neuro1$European == 0, na.rm=T)," participants because they self-reported to be non-European.",sum(is.na(neuro1$European)),"are missing this info and will also be excluded. N remaining = ", sum(neuro1$European ==1, na.rm=T)))
#[1] "We're losing  1470  participants because they self-reported to be non-European. 9 are missing this info and will also be excluded. N remaining =  44036"

# delete non-European
neuro1 = neuro1[which(neuro1$European == 1),]

write.table(neuro1, file=paste0(out,"/ukb_neuroimaging_4MeansClustering_excl_het_missEUR.txt"), row.names=F, col.names=T, quote = F)

##### At this point, we now have a list of IDs saved in ukb_neuroimaging_4MeansClustering_excl_het_miss.txt that are:
# People with neuroimaging data
# Have genetic data
# Have not been labeled unusual by the UKB core team for missingness and heterozygosity
# Have been included based on 4 means clustering
# Have self-reported that they are of European descent

Format genetic data

Code
##########################################################################################################
# Step 0: Only include participants with neuroimaging data 
## this set of individuals will already exclude non-Europeans (PCs and self-report)
## extreme scores on hetezygosity and missingness (see above)

# copy bim bed and fam files to get consistent naming across bim and bed files
## bed files
for CHR in {1..22}
do
cp ${sourceBED}/ukb_cal_chr${CHR}_v2.bed ${target}/ukb_chr${CHR}_v2.bed
done
## bim files
for CHR in {1..22}
do
cp ${sourceBIM}/ukb_snp_chr${CHR}_v2.bim ${target}/ukb_chr${CHR}_v2.bim
done
## fam file
cp /GWAS_Source/UB_BB/UKBB_500K/Decrypted/ukb1027_cal_chr1_v2_s488374.fam ${target}/ukb_v2.fam

# do fam file for each chromosome
for CHR in {1..22}
do
cp ${target}/ukb_v2.fam ${target}/ukb_chr${CHR}_v2.fam
done

## put data from each chromosome together into one bim, bed and fam file
## and filter participants of interest
cd $target
phenoIDs="/BrainAtrophy/data/geneticQC/ukb_neuroimaging_4MeansClustering_excl_het_missEUR.txt"

plink19 \
--merge-list $target/allGenoFiles.txt \
--make-bed \
--keep $phenoIDs \
--out ukb_neuroimaging_reQC

## starting out with 784256 variants and 488377 participants, that are getting cut down to the 44036 participants that survived cleaning above

#################################################
# I realised later that this is not including the sex-chromosomes which are needed for sex-check later (this merging back and forth could have been done in one step, but I am saving resources to not to it from scratch again)
# Copy X, Y and XY chromosome info first
## bed files
for CHR in X XY Y
do
cp ${sourceBED}/ukb_cal_chr${CHR}_v2.bed ${target}/ukb_chr${CHR}_v2.bed
done
## bim files
for CHR in X XY Y
do
cp ${sourceBIM}/ukb_snp_chr${CHR}_v2.bim ${target}/ukb_chr${CHR}_v2.bim
done
## fam file for each chromosome
for CHR in X XY Y
do
cp ${target}/ukb_v2.fam ${target}/ukb_chr${CHR}_v2.fam
done

## merge X, Y, XY files together
plink19 \
--merge-list ${target}/allXYfiles.txt \
--make-bed \
--keep $phenoIDs \
--out ukb_neuroimaging_sexchrom

# now merge sex chromosomes with other data
plink19 \
--merge-list ${target}/allAuto_and_Sex.txt \
--make-bed \
--keep $phenoIDs \
--out ukb_neuroimaging_autosomal_sex_preQC

## now altogether, we have 44036 partcicipants left, and 805161 variants 

Filter for missing genotype data and minor allele frequency

Code
# Step 1: Get SNP-list based on geno filter
plink19 \
--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \
--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \
--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \
--geno 0.02 \
--write-snplist \
--out ${target}/ukb_neuroimaging_GENO0.02
# removed 104462 variants, 700699 remaining
# no participants removed (44036 remaining)

## this creates a snplist that feeds into next step

# Step 2: apply various cleaning steps to get ID list (.fam) to feed into relatedness analysis
plink19 \
--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \
--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \
--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \
--geno 0.02 \
--extract ukb_neuroimaging_GENO0.02.snplist \
--write-snplist \
--make-just-fam \
--freq --maf 0.01 \
--out ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR

# this fam file still contains 44036 participants
# MAF filter removed 103158 variants, 597541 remaining

Perform sex-check

Code
# Step 6: Prune SNPs for sex-check
# to do sex-check, we prune SNP data to be independent and exclude high LD regions
# this is because patterns of LD will impair chromosome-specific tests of homozygosity
awk -f ${target}/highLDregions4bim_b37.awk ${target}/ukb_neuroimaging_reQC.bim > ${target}/ukb_neuroimaging_High_LD_Regions_To_Exclude.txt
# this file contains 28923 SNPs to exclude from sex-check analysis

plink19 \
--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \
--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \
--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \
--maf 0.05 \
--hwe 0.001 \
--geno 0.02 \
--thin-indiv-count 300 \
--indep-pairphase 200 100 0.2 \
--seed 1204688 \
--exclude ${target}/ukb_neuroimaging_High_LD_Regions_To_Exclude.txt \
--extract ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001.snplist \
--keep ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001.fam \
--out ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_LD1_ALL



# Step 7: Perform sex-check
# F statistic used here is a function of teh deviation of the observed number of heterozygote variants from that expected under Hardy-Weinberg equilibrium
# males should have F ~1 because all X chromosome variants are hemizygous and no heterozygotes can be observed
# females should have lower values of F, distributed around 0 (but females with very high F stats have been observed)
# this script shouldn't need a seperate --split-x analysis because the X chromosome's pseudoautosomal region is already presented as XY chromosome

plink19 \
--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \
--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \
--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \
--check-sex \
--keep ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001.fam \
--extract ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_LD1_ALL.prune.in \
--out ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_PRUNE_SEX

Remove individuals that did not pass sex-check

Code
### Extract participants that have been labeled as PROBLEM
R
library(data.table)
dat=fread("ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_PRUNE_SEX.sexcheck")

IDstoRemove=dat[which(dat$STATUS == "PROBLEM"),c("FID","IID")]
# 22 
write.table(IDstoRemove, file=paste0(out,"/sexCheck_toRemove.txt"), row.names=F, col.names=T, quote = F)

Generate final .fam and .snplist files

Code
## Step 8: Generate final .fam and .snplist to feed into regenie (excluding failed sex checks)
plink19 \
--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \
--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \
--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \
--extract ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001.snplist \
--keep ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001.fam \
--write-snplist \
--make-just-fam \
--remove ${target}/sexCheck_toRemove.txt \
--out ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_sexcheck

Generate final bed file

Code
# Step 9: Generate final bed files that will be used as input
plink19 \
--bed ${target}/ukb_neuroimaging_autosomal_sex_preQC.bed \
--bim ${target}/ukb_neuroimaging_autosomal_sex_preQC.bim \
--fam ${target}/ukb_neuroimaging_autosomal_sex_preQC.fam \
--extract ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_sexcheck.snplist \
--keep ${target}/ukb_neuroimaging_MAF0.01_GENO0.02_QA_EUR_HWE0.00000001_sexcheck.fam \
--chr 1-22 \
--make-bed \
--out ${final}/ukb_neuroimaging_brainAtrophy_GWASinput