This RMarkdown is written as a self-contained codebook for the centralised quality control of the genetic data downloaded from the UK Biobank at the Social, Genetic and Developmental Psychiatry [SGDP] Centre, King’s College London [KCL]. The output of this centralised quality control produces cleaned datasets for use by multiple analysts across KCL. The scripts are constructed to be usable for the different projects run from KCL, and so are generalisable to UKBB projects overall. Developments of this code were used in the R package ukbtools [https://cran.r-project.org/web/packages/ukbtools/index.html] - credit: Ken Hanscombe This document should not be run as a R script itself, but is rather a single document to list all code. All code was run on the King’s College London / National Institute of Health Research Maudsley Biomedical Research Centre high performance computing cluster “Rosalind”, unless otherwise stated.
Scripts have been simplified by removing paths to files and renaming certain files to more widely-understandable names where relevant
No previous scripts UKBB genotype and imputed genetic data PLINK 1.9 (see below) FlashPCA2 (see below) GreedyRelated (see below) bgenix (see below)
Contributions to the logic of this code were made by numerous analysts at the SGDP. The code itself was written by Jonathan Coleman and Sam Choi. Additional programs are the credit of their authors: PLINK[https://www.cog-genomics.org/plink/1.9/] Chris Chang, Shaun Purcell FlashPCA2[https://github.com/gabraham/flashpca] Gad Abraham GreedyRelated[https://github.com/choishingwan/GreedyRelated] Sam Choi bgenix[https://bitbucket.org/gavinband/bgen/wiki/bgenix] Gavin Band
## Note - text in square brackets below should be replaced with suitable file name (don't put the square brackets around the files!)
Rscript Make_UKBB_Caucasians_By_4_Means_Clustering.R \
[File containing the IDs and the 40 UKBB PCs extracted from ...sqc_v2.txt - e.g. ukb1657_40PCs_ID_from_sqc_v2.txt] \
[k for k-means clustering, e.g. 4] \
[Full path to output root, e.g. to/UKBB_Data_Ancestry. Example output will be to/UKBB_Data_Ancestry.txt, to/UKBB_Data_Ancestry.pdf, to/UKBB_Data_Ancestry_PLINK.txt]
bash Genotyped_Data_QC_JRIC_240119.bash \
-b /mnt/lustre/groups/ukbiobank/ukb18177_glanville/genotyped/ukb18177_glanville_binary_pre_qc.bed \
-j /mnt/lustre/groups/ukbiobank/ukb18177_glanville/genotyped/ukb18177_glanville_binary_pre_qc.bim \
-f /mnt/lustre/groups/ukbiobank/ukb18177_glanville/genotyped/ukb18177_glanville_binary_pre_qc.fam \
-m 0.01 \
-v 0.02 \
-s 0.02 \
-a 1 \
-q 1 \
-e /scratch/groups/ukbiobank/KCL_Data/Genotypes/kylie_application/ukb1817_het_miss_excl_rows_v3.txt \
-g 1 \
-x /scratch/groups/ukbiobank/KCL_Data/Genotypes/kylie_application/ukb1817_reported_sex_from_sqc_v2.txt \
-r 0.044 \
-l /scratch/groups/ukbiobank/KCL_Data/Genotypes/kylie_application/ukb1817_rel_pairs.txt \
-t 1 \
-u /mnt/lustre/groups/ukbiobank/Edinburgh_Data/usr/anna/PhD/output/geno_qc/participants_of_interest.txt \ # this file will have header: ID Pheno
-h 0.00000001 \
-d 0 \
-p 1 \
-c /scratch/groups/ukbiobank/KCL_Data/Genotypes/kylie_application/ukb1817_Caucasians_4Means_PLINK.txt \
-o /mnt/lustre/groups/ukbiobank/Edinburgh_Data/usr/anna/PhD/output/geno_qc/genetic_qc_UKB
##Load dependencies
library(data.table)
library(ggplot2)
args<-commandArgs(trailingOnly=TRUE)
if(length(args)==0){
stop("Please provide an input file containing an ID column and all PCs (full path), K for k-means clustering, and an output file path", call.=FALSE)
}
##Read in PCs, remove NAs, rename
PCs<-fread(args[1], data.table=F)
PCs<-na.omit(PCs)
names(PCs)<-c("ID","PC.1","PC.2","PC.3","PC.4","PC.5","PC.6","PC.7","PC.8","PC.9","PC.10","PC.11","PC.12","PC.13","PC.14","PC.15","PC.16","PC.17","PC.18","PC.19","PC.20","PC.21","PC.22","PC.23","PC.24","PC.25","PC.26","PC.27","PC.28","PC.29","PC.30","PC.31","PC.32","PC.33","PC.34","PC.35","PC.36","PC.37","PC.38","PC.39","PC.40")
##Set seed
set.seed(1204688)
##K means clustering on each PC
K_MEAN<-as.numeric(args[2])
PC1_K<-kmeans(PCs$PC.1, K_MEAN)
PC2_K<-kmeans(PCs$PC.2, K_MEAN)
##Add clusters to PC dataframe
PCs$PC1.Cluster<-PC1_K$cluster
PCs$PC2.Cluster<-PC2_K$cluster
PCs$Clusters<-as.factor(paste(PC1_K$cluster,PC2_K$cluster,sep="."))
##WWE group is the majority
MAX_PC1<-ifelse(match(max(table(PCs$PC1.Cluster, PCs$PC2.Cluster)), table(PCs$PC1.Cluster, PCs$PC2.Cluster)) %% K_MEAN == 0, K_MEAN, match(max(table(PCs$PC1.Cluster, PCs$PC2.Cluster)), table(PCs$PC1.Cluster, PCs$PC2.Cluster)) %% K_MEAN)
MAX_PC2<-ceiling(match(max(table(PCs$PC1.Cluster, PCs$PC2.Cluster)), table(PCs$PC1.Cluster, PCs$PC2.Cluster))/K_MEAN)
##Make lists of WWE IDs
WWE<-as.data.frame(PCs[PCs$PC1.Cluster == MAX_PC1 & PCs$PC2.Cluster == MAX_PC2,1])
names(WWE)<-"ID"
WWE_PLINK<-as.data.frame(cbind(WWE,WWE))
names(WWE_PLINK)<-c("FID","IID")
##Write to file
write.table(WWE, file=paste(args[3],".txt",sep=""), row.names=F, col.names=T, quote = F)
write.table(WWE_PLINK, file=paste(args[3],"_PLINK.txt",sep=""), row.names=F, col.names=T, quote = F)
##Plot
pdf(paste(args[3], ".pdf", sep=""))
with(PCs, print(qplot(PC.1, PC.2, colour=Clusters)))
dev.off()
#Check inputs are suitable
while getopts ":b:j:f:m:v:s:a:q:e:g:x:r:l:t:u:h:d:p:c:o:" opt
do
case $opt in
b) INPUT_BED="$OPTARG"
;;
j) INPUT_BIM="$OPTARG"
;;
f) INPUT_FAM="$OPTARG"
;;
m) MAF="$OPTARG"
;;
v) GENO="$OPTARG"
;;
s) MIND="$OPTARG"
;;
a) CAUC="$OPTARG"
;;
q) QA="$OPTARG"
;;
e) QA_FILE="$OPTARG"
;;
g) GENDER="$OPTARG"
;;
x) SEX_FILE="$OPTARG"
;;
r) RELATEDNESS="$OPTARG"
;;
l) RELATE_FILE="$OPTARG"
;;
t) RELATE_PHENO="$OPTARG"
;;
u) RELATE_PHENO_FILE="$OPTARG"
;;
h) HWE="$OPTARG"
;;
p) PRUNE="$OPTARG"
;;
d) SEXSPLIT="$OPTARG"
;;
c) CAUCASIANS_FILE="$OPTARG"
;;
o) OUTPUT="$OPTARG"
;;
\?) echo "Invalid option -$OPTARG" >&2
;;
esac
done
if [ $# -ne 40 ]
then
echo -e "\nPlease provide all arguments. If argument not needed:\n -b [Full path to input bed]\n -j [Full path to input bim]\n -f [Full path to input fam]\n -m [MAF]\n -v [GENO]\n -s [MIND]\n -a [Exclude non-Caucasians? 1 = Yes, 0 = No]\n -q [Exclude QA errors? 1 = Yes, 0 = No]\n -e [Full path to UKBB QA Excludes file]\n -g [Exclude gender mismatches? 1 = Yes, 0 = No]\n -x [Full path to UKBB Phenotypic Sex file]\n -r [Relatedness threshold for greedyRelated]\n -l [Full path to Relatedness Pairs file]\n -t [Use phenotype file to retain cases preferentially? 1 = Yes 0 = No]\n -u [Full path to phenotype file for Relatedness pairs]\n -h [HWE for SNPs]\n -d [Create sex split? 1 = Yes, 0 = No]\n -p [Create pruned file? 1 = Yes, 0 = No]\n -c [Full path to Caucasians File]\n -o [Full path to output root]\nArguments provided = $#"
exit
fi
## MAF
MAF_FLAG=$(echo ${MAF} != 0 | bc -l)
MIND_FLAG=$(echo ${MIND} != 0 | bc -l)
RELATE_FLAG=$(echo ${RELATEDNESS} != 1 | bc -l)
first_plink="plink --bed ${INPUT_BED} --bim ${INPUT_BIM} --fam ${INPUT_FAM} --geno ${GENO} --write-snplist --out ${OUTPUT}_GENO${GENO}"
$first_plink
## Keep {OUTPUT}_GENO${GENO}.fam
plink="plink --bed ${INPUT_BED} --bim ${INPUT_BIM} --fam ${INPUT_FAM} --geno ${GENO} --extract ${OUTPUT}_GENO${GENO}.snplist --write-snplist --make-just-fam"
if [ ${MAF_FLAG} -ne 0 ]
then
plink="${plink} --freq --maf ${MAF}"
fi
## Sample Missingness
if [ ${MIND_FLAG} -eq 1 ]
then
echo -e "\nWARNING: You are applying a missingness filter to samples.\n If your base file is not a whole-genome file, this may produce different sample sets between clumps/chromosomes.\n"
plink="${plink} --mind ${MIND}"
fi
if [ ${CAUC} -eq 1 ]
then
## Keep {OUTPUT}_GENO${GENO}.fam and Caucasians file
plink="${plink} --keep ${CAUCASIANS_FILE}"
fi
if [ ${CAUC} -eq 0 ]
then
echo -e "\nWARNING: You are retaining all individuals - this will produce a transethnic dataset \n"
fi
##Exclude relatives greedily
if [ ${RELATE_FLAG} -eq 0 ]
then
echo -e "\nWARNING: You are not removing related individuals \n"
fi
if [ ${RELATE_FLAG} -ne 0 ]
then
if [ ${QA} -eq 1 ]
then
##Get plink .fam for exclusions to date, including QA
plink_pre_relate="${plink} --remove ${QA_FILE} --out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}"
$plink_pre_relate
fi
if [ ${QA} -eq 0 ]
then
##Get plink .fam for exclusions to date, including QA
plink_pre_relate_no_QA="${plink} --out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}"
$plink_pre_relate_no_QA
fi
##Limit relatedness file to included participants
cat <(head -1 ${RELATE_FILE}) <(LANG=C fgrep -wf <(awk '{print $1}' ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}.fam) ${RELATE_FILE}) > ${RELATE_FILE}_reduced
if [ ${RELATE_PHENO} -eq 0 ]
then
greedRelate/greedyRelated \
-r ${RELATE_FILE}_reduced \
-t ${RELATEDNESS} \
-s 1234 \
> ${OUTPUT}_KCL_UKBB_Relatedness_Exclusions_Greedy_1234_${RELATEDNESS}.txt
fi
if [ ${RELATE_PHENO} -ne 0 ]
then
greedRelate/greedyRelated \
-r ${RELATE_FILE}_reduced \
-p ${RELATE_PHENO_FILE} \
-t ${RELATEDNESS} \
-s 1234 \
> ${OUTPUT}_KCL_UKBB_Relatedness_Exclusions_Greedy_1234_${RELATEDNESS}.txt
fi
cat <(echo -e "FID IID") <(awk '{print $1, $1}' ${OUTPUT}_KCL_UKBB_Relatedness_Exclusions_Greedy_1234_${RELATEDNESS}.txt) > ${OUTPUT}_KCL_UKBB_Relatedness_Exclusions_Greedy_1234_${RELATEDNESS}_FOR_PLINK.txt
fi
## Exclude all UKB exclusions (UKBileve_Affy... + UKBileve_Geno... + Recommended Excl... For details of these exlcusions, see http://biobank.ctsu.ox.ac.uk/showcase/search.cgi?wot=0&srch=quality+&sta0=on&sta1=on&sta2=on&sta3=on&str0=on&str3=on&fit0=on&fit10=on&fit20=on&fit30=on&fvt11=on&fvt21=on&fvt22=on&fvt31=on&fvt41=on&fvt51=on&fvt61=on&fvt101=on)
if [ ${QA} -eq 1 ]
then
if [ ${RELATE_FLAG} -ne 0 ]
then
cat ${QA_FILE} ${OUTPUT}_KCL_UKBB_Relatedness_Exclusions_Greedy_1234_${RELATEDNESS}_FOR_PLINK.txt | sort | uniq > ${OUTPUT}_KCL_UKBB_QA_And_Relatedness_Exclusions_FOR_PLINK.txt
plink="${plink} --remove ${OUTPUT}_KCL_UKBB_QA_And_Relatedness_Exclusions_FOR_PLINK.txt --out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE} --hardy --hwe ${HWE}"
fi
if [ ${RELATE_FLAG} -eq 0 ]
then
plink="${plink} --remove ${QA_FILE} --out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE} --hardy --hwe ${HWE}"
fi
fi
if [ ${QA} -eq 0 ]
then
echo -e "\nWARNING: You are not removing UKBiobank QA exclusions - ARE YOU SURE? \n"
if [ ${RELATE_FLAG} -ne 0 ]
then
plink="${plink} --remove ${OUTPUT}_KCL_UKBB_Relatedness_Exclusions_Greedy_1234_${RELATEDNESS}_FOR_PLINK.txt --out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE} --hardy --hwe ${HWE}"
fi
if [ ${RELATE_FLAG} -eq 0 ]
then
plink="${plink} --out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE} --hardy --hwe ${HWE}"
fi
fi
##Prune
###JRIC - 020617 - This command previously included Split-X. This makes a new file, which we want to avoid. I have moved this to the start of the analysis so that this is run first time (if needed)
#Run the previous QC first
$plink
if [ $PRUNE -eq 1 ]
then
awk -f gwas_scripts/highLDregions4bim_b37.awk ${INPUT_BIM} > ${OUTPUT}_High_LD_Regions_To_Exclude.txt
plink \
--bed ${INPUT_BED} \
--bim ${INPUT_BIM} \
--fam ${INPUT_FAM} \
--maf 0.05 \
--hwe 0.001 \
--geno 0.02 \
--thin-indiv-count 300 \
--indep-pairphase 200 100 0.2 \
--seed 1204688 \
--exclude ${OUTPUT}_High_LD_Regions_To_Exclude.txt \
--extract ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.snplist \
--keep ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.fam \
--out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_LD1_ALL
plink \
--bed ${INPUT_BED} \
--bim ${INPUT_BIM} \
--fam ${INPUT_FAM} \
--autosome \
--maf 0.05 \
--hwe 0.001 \
--geno 0.02 \
--thin-indiv-count 300 \
--indep-pairphase 200 100 0.2 \
--seed 1204688 \
--exclude ${OUTPUT}_High_LD_Regions_To_Exclude.txt \
--extract ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.snplist \
--keep ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.fam \
--out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_LD1_NO_X
if [ ${GENDER} -eq 1 ]
then
sex_plink="plink --bed ${INPUT_BED} --bim ${INPUT_BIM} --fam ${INPUT_FAM} --check-sex --keep ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.fam --extract ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_LD1_ALL.prune.in --out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_PRUNE_SEX"
$sex_plink
##Compare to UKBB_Sex - no males have F <0.9, some females with high Fs, but this is possible, so no exclusions for now
cat <(echo -e "FID SEX IID PEDSEX SNPSEX STATUS F") <(join -1 1 -2 1 <(sort -k1,1 ${SEX_FILE}) <(tail -n+2 ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_PRUNE_SEX.sexcheck | sort -k1,1)) > ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEXCHECK_WITH_UKBB_SEX.sexcheck
##Deviations - exclude phenotypic males with F < 0.9, phenotypic females with F > 0.6
#Reduce file read by 1
cat <(awk '( NR >1 && $2 == "F" && $7 < 0.5)||(NR >1 && $2 == "M" && $7 > 0.9){ print $1, $3;}' ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEXCHECK_WITH_UKBB_SEX.sexcheck) > ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}.samplelist
fi
if [ $GENDER -eq 0 ]
then
echo -e "\nWARNING: You are not removing mismatched genders - ARE YOU SURE? \n"
fi
fi
if [ $PRUNE -ne 1 ]
then
if [ $GENDER -eq 1 ]
then
echo -e "\nWARNING: You are not performing pruning before checking mismatched genders - ARE YOU SURE? \n"
plink \
--bed ${INPUT_BED} \
--bim ${INPUT_BIM} \
--fam ${INPUT_FAM} \
--extract ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.snplist \
--keep ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.fam \
--check-sex \
--out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_PRUNE_SEX
cat <(echo -e "FID SEX IID PEDSEX SNPSEX STATUS F") <(join -1 1 -2 1 <(sort -k1,1 ${SEX_FILE}) <(tail -n+2 ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_PRUNE_SEX.sexcheck)) > ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEXCHECK_WITH_UKBB_SEX.sexcheck
cat <(awk '( NR >1 && $2 == 0 && $7 < 0.6)||(NR >1 && $2 == 1 && $7 > 0.85){ print $1, $3;}' ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEXCHECK_WITH_UKBB_SEX.sexcheck) > ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}.samplelist
fi
if [ $GENDER -eq 0 ]
then
echo -e "\nWARNING: You are not removing mismatched genders - ARE YOU SURE? \n"
fi
fi
##Sex split
if [ $SEXSPLIT -eq 1 ]
then
if [ $GENDER -eq 1 ]
then
plink \
--bed ${INPUT_BED} \
--bim ${INPUT_BIM} \
--fam ${INPUT_FAM} \
--extract ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.snplist \
--keep ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}.samplelist \
--make-just-fam \
--write-snplist \
--out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}
plink \
--bed ${INPUT_BED} \
--bim ${INPUT_BIM} \
--fam ${INPUT_FAM} \
--extract ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.snplist \
--keep ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}.samplelist \
--filter-males \
--make-just-fam \
--out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}_MALES
plink \
--bed ${INPUT_BED} \
--bim ${INPUT_BIM} \
--fam ${INPUT_FAM} \
--extract ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.snplist \
--keep ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}.samplelist \
--filter-females \
--make-just-fam \
--out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}_FEMALES
fi
if [ $GENDER -eq 0 ]
then
plink \
--bed ${INPUT_BED} \
--bim ${INPUT_BIM} \
--fam ${INPUT_FAM} \
--extract ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.snplist \
--keep ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.fam \
--make-just-fam \
--write-snplist \
--out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}
plink \
--bed ${INPUT_BED} \
--bim ${INPUT_BIM} \
--fam ${INPUT_FAM} \
--extract ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.snplist \
--keep ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.fam \
--filter-males \
--make-just-fam \
--out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}_MALES
plink \
--bed ${INPUT_BED} \
--bim ${INPUT_BIM} \
--fam ${INPUT_FAM} \
--extract ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.snplist \
--keep ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.fam \
--filter-females \
--make-just-fam \
--out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}_FEMALES
fi
fi
if [ $SEXSPLIT -eq 0 ]
then
if [ $GENDER -eq 1 ]
then
plink \
--bed ${INPUT_BED} \
--bim ${INPUT_BIM} \
--fam ${INPUT_FAM} \
--extract ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.snplist \
--keep ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}.samplelist \
--make-just-fam \
--write-snplist \
--out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}
fi
if [ $GENDER -eq 0 ]
then
plink \
--bed ${INPUT_BED} \
--bim ${INPUT_BIM} \
--fam ${INPUT_FAM} \
--extract ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.snplist \
--keep ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}.fam \
--make-just-fam \
--write-snplist \
--out ${OUTPUT}_MAF${MAF}_GENO${GENO}_MIND${MIND}_CAUC${CAUC}_UKBQC${QA}_UNREL${RELATEDNESS}_HWE${HWE}_SEX${GENDER}
fi
fi
UKBB data as received is vast and larger than most users will use for common-variant analyses. Thin the UKBB data to MAF >= 0.01 and INFO >= 0.4.
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr1_v3.txt > ukb_mfi_chr1_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr2_v3.txt > ukb_mfi_chr2_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr3_v3.txt > ukb_mfi_chr3_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr4_v3.txt > ukb_mfi_chr4_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr5_v3.txt > ukb_mfi_chr5_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr6_v3.txt > ukb_mfi_chr6_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr7_v3.txt > ukb_mfi_chr7_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr8_v3.txt > ukb_mfi_chr8_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr9_v3.txt > ukb_mfi_chr9_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr10_v3.txt > ukb_mfi_chr10_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr11_v3.txt > ukb_mfi_chr11_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr12_v3.txt > ukb_mfi_chr12_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr13_v3.txt > ukb_mfi_chr13_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr14_v3.txt > ukb_mfi_chr14_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr15_v3.txt > ukb_mfi_chr15_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr16_v3.txt > ukb_mfi_chr16_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr17_v3.txt > ukb_mfi_chr17_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr18_v3.txt > ukb_mfi_chr18_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr19_v3.txt > ukb_mfi_chr19_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr20_v3.txt > ukb_mfi_chr20_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr21_v3.txt > ukb_mfi_chr21_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chr22_v3.txt > ukb_mfi_chr22_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chrX_v3.txt > ukb_mfi_chrX_v3_MAF1_INFO4.txt
awk '$6 >= 0.01 && $8 >= 0.4 {print $2}' ukb_mfi_chrXY_v3.txt > ukb_mfi_chrXY_v3_MAF1_INFO4.txt
for i in {22..1} X XY
do
bgen_tools/bin/bgenix -g ukb_imp_chr${i}_v3.bgen -i ukb_imp_chr${i}_v3.bgen.bgi -incl-rsids ukb_mfi_chr${i}_v3_MAF1_INFO4.txt > ukb_imp_chr${i}_v3_MAF1_INFO4.bgen
bgen_tools/bin/bgenix -g ukb_imp_chr${i}_v3_MAF1_INFO4.bgen -index
done
By Anna Elisabeth Fürtjes
anna.furtjes@kcl.ac.uk