Download the VCF file from: Zenodo Record 15173226
Install vcftools in a new conda environment:
conda create -n vcftools -c bioconda vcftools
conda activate vcftools
variants_renamed.vcf.gz
vcftools --gzvcf variants_renamed.vcf.gz \
--minQ 30 --minGQ 30 \
--out variants_minQ30_minGQ30 --recode
vcftools --vcf variants_minQ30_minGQ30.recode.vcf \
--remove-indels \
--out variants_minQ30_minGQ30_rmvIndels --recode
vcftools --vcf variants_minQ30_minGQ30_rmvIndels.recode.vcf \
--hwe 0.05 \
--out variants_minQ30_minGQ30_rmvIndels_hwe_0.05 --recode
vcftools --vcf variants_minQ30_minGQ30_rmvIndels_hwe_0.05.recode.vcf \
--mac 3 \
--out variants_minQ30_minGQ30_rmvIndels_hwe_0.05_mac3 --recode
vcftools --vcf variants_minQ30_minGQ30_rmvIndels_hwe_0.05_mac3.recode.vcf \
--missing-indv \
--out imiss_stats
Remove individuals with >60% missing data:
vcftools --vcf variants_minQ30_minGQ30_rmvIndels_hwe_0.05_mac3.recode.vcf \
--remove <(awk '$5 > 0.6 {print $1}' imiss_stats.imiss) \
--recode --out variants_minQ30_minGQ30_rmvIndels_hwe_0.05_mac3_imiss_0.6
for miss in {10..90..10}; do
perc=$(echo "scale=2; $miss / 100" | bc)
vcftools --vcf variants_minQ30_minGQ30_rmvIndels_hwe_0.05_mac3_imiss_0.6.recode.vcf \
--max-missing $perc \
--out variants_miss_${miss} --recode
done
for miss in {10..90..10}; do
count=$(grep -vc "^#" variants_miss_${miss}.recode.vcf)
echo "${miss} ${count}" >> variant_counts.txt
done
conda create -n ggplot2 -c conda-forge r-ggplot2
conda activate ggplot2
R -e "library(ggplot2); library(scales); \
data <- read.table('variant_counts.txt', header=FALSE, col.names=c('Missingness', 'Variants')); \
p <- ggplot(data, aes(x=Missingness, y=Variants)) + geom_line(color='blue') + geom_point(color='red') + \
ggtitle('Number of Passed Variants vs. Missingness Filter') + xlab('Max Missingness (%)') + ylab('Number of Variants') + \
scale_y_continuous(labels = comma) + scale_x_continuous(limits = c(10, 90)) + theme_minimal(); \
ggsave('variant_plot.png', plot=p)"
vcftools --vcf variants_miss_60.recode.vcf --site-mean-depth --out depth
conda create -n R_env -c conda-forge r-base
conda activate R_env
R
depth_data <- read.table("depth.ldepth.mean", header = TRUE)
depths <- depth_data$MEAN_DEPTH
lower_bound <- quantile(depths, 0.025, na.rm = TRUE)
upper_bound <- quantile(depths, 0.975, na.rm = TRUE)
cat("Middle 95% range:", lower_bound, "to", upper_bound, "\n")
Example Output: Middle 95% range: 12.8302 to 24.6604
vcftools --vcf variants_miss_60.recode.vcf \
--min-meanDP 12.8302 --max-meanDP 24.6604 --recode \
--out variants_miss_60_mid95percentile