PCA, ADMIXTURE, and Heterozygosity Analysis Workflow

1. PCA: Principal Component Analysis

Convert VCF to PLINK Binary Format

conda activate plink
plink --vcf machali_...vcf \
  --make-bed \
  --double-id \
  --allow-extra-chr \
  --out machali_...

Edit .fam file to add region info

awk '{
  id = $1
  if (id ~ /_CI/)       region = "CenIndia"
  else if (id ~ /_SI/)  region = "SouIndia"
  else if (id ~ /_NE/)  region = "NorEasIndia"
  else if (id ~ /_NW/)  region = "NorWesIndia"
  else if (id ~ /_NOR/) region = "NorIndia"
  else if (id ~ /_SU/)  region = "Sunderban"
  else if (id ~ /^LGS/) region = "CenIndia"
  else                  region = "Unknown"
  $2 = region
  print
}' input.fam > tmp.fam && mv tmp.fam input.fam

Fix chromosome names in .bim

awk '{$1 = 1; print}' input.bim > tmp && mv tmp input.bim

Run PCA

plink --bfile input --pca 5 --allow-extra-chr --out input_pca
conda deactivate

Plot PCA in R

conda activate R_env
R
fam <- read.table("input.fam")
eigenvec <- read.table("input_pca.eigenvec")
colnames(eigenvec) <- c("FID", "IID", paste0("PC", 1:5))
colnames(fam)[2] <- "Region"
eigenvec$Region <- fam$V2
library(ggplot2)
ggplot(eigenvec, aes(x = PC1, y = PC2, color = Region)) +
  geom_point(size = 3, alpha = 0.8) +
  theme_minimal() +
  labs(title = "PCA Plot by Region", x = "PC1", y = "PC2")
ggsave("pca_by_region.png")
ggsave("pca_by_region.pdf")
q()

2. ADMIXTURE Analysis

conda create -n admixture -c bioconda admixture
conda activate admixture

for K in {2..4}
do 
  admixture input.bed $K
done

Plot Admixture Results in R

conda activate R_env
R
library(ggplot2)
library(reshape2)
library(dplyr)
q3 <- read.table("input.3.Q")
fam <- read.table("input.fam")
q3$ID <- fam$V1
q3$Group <- gsub(".*_(.*)", "\\1", q3$ID)
q3_long <- melt(q3, id.vars = c("ID", "Group"))
q3_long <- q3_long %>%
  arrange(Group, ID) %>%
  mutate(ID = factor(ID, levels = unique(ID)))
p <- ggplot(q3_long, aes(x = ID, y = value, fill = variable)) +
  geom_bar(stat = "identity", width = 1) +
  facet_grid(~Group, scales = "free_x", space = "free_x") +
  theme_minimal() +
  labs(x = "Individuals", y = "Ancestry Proportion", title = "ADMIXTURE Plot (K=3)") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.spacing = unit(0.5, "lines"),
        strip.text.x = element_text(angle = 0, face = "bold"),
        legend.position = "right") +
  scale_fill_brewer(palette = "Set1")
ggsave("admixture_K3_grouped.png", p, width = 12, height = 6, dpi = 300)
q()

3. Heterozygosity Estimation

conda create -n rtg-tools -c bioconda rtg-tools
conda activate rtg-tools
rtg vcfstats input.vcf > input.vcfstats
conda deactivate