############################ ## Population genetics : general script ## J. Plantade ## Jan 2021 ############################ # packages library(ade4) library(pegas) library(adegenet) library(PopGenome) library(hierfstat) library(ggplot2) library(poppr) ################ ### setting data # data import into a loci object loci <- read.vcf("data/CD4_panTro3.vcf_fixed.vcf", from = 1, to = 572) nloci <- length(loci) # convert loci object into a genind object genind <- loci2genind(loci) # add population factor pop(genind)<-rep(c("paniscus","troglodytes_ellioti", "troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus"), c(11,10,6,4, 5)) # convert genind object into a hierfstat object hierf <- genind2hierfstat(genind) ################## ### heterozygosity div <- summary(genind) locinfo<-VCFloci("data/CD4_panTro3.vcf_fixed.vcf", what = "all", chunk.size = 1e9, quiet = FALSE) # ggplot2 : Hobs-Hexp VS position g1 <- ggplot() + geom_point(aes(x=locinfo$POS, y=div$Hobs-div$Hexp), size=0.3) + xlab("Position") + ylab("Hobs - Hexp") + ggtitle("Difference between Hobs and Hexp for each position") + theme_light() g1 # Bartlett test (homogeneity of variance) # H0 : Hobs = Hexp bartlett.test(list(div$Hexp, div$Hobs)) # S : population is structured # create one genind object for each population pop <- c("pan", "ellioti", "schwein", "troglo", "verus") npop <- length(pop) for (i in 1:length(pop)){ tmp <- popsub(genind, sublist = c(subspecies[i])) assign(pop[i], tmp) } # extracting heterozygosity for each population list_genind <- c(pan, ellioti, schwein, troglo, verus) for (i in 1:length(list_genind)){ tmp <- summary(list_genind[[i]]) assign(paste0(pop[i], "_div"), tmp) } # -> possibility to to Bartlett test and plot Hobs for each pop ####### ### Fst # Matrix X with colnames and rownames = pop, create the name of pairs npop <- 5 M <- matrix(NA, nrow = npop, ncol = npop) M <- as.data.frame(M) rownames(M) <- pop colnames(M) <- pop for (i in 1:npop){ start <- colnames(M)[i] for (j in i+1:npop-i){ end <- rownames(M)[j] M[j, i] <- paste0(start, "_", end) } } # creates a list of pair names pair_name <- c(rep("NA", (npop^2-npop)/2)) k <- 1 for (i in 1:(npop-1)){ for (j in i+1:(npop-i)){ pair_name[k] <- M[j, i] k <- k+1 } } # genind subset -> couple of population. pair_list <- NULL for (i in 2:(npop)){ # in matrix M we go from col 2 to 5 for (j in (i+1):(npop+1)){ # in matrix M we go from row 3 to 6 tmp <- popsub(genind, sublist = c(pop(genind)[i-1], pop(genind)[j-1])) tmp2 <- assign(M[j, i], tmp) pair_list <- c(pair_list, tmp2) # list of all pop pairs } } # calculation of F statistics and extraction of Fst values fst <- matrix(NA, nrow = nloci, ncol = length(pair_list)) for (i in 1:length(pair_list)){ tmp <- Fst(as.loci(pair_list[[i]]), na.alleles = "") fst[,i] <- tmp[,2] } fst <- as.data.frame(fst) colnames(fst) <- pair_name # Fst values VS loci position plot(locinfo$POS, fst[,1], cex=0.2, xlab="Position", ylab="Fst value", main="Fst value per locus") abline(0.15, 0) ############# ## Clustering with DAPC # Identify the number of cluster grp <- find.clusters(genind, max.n.clust = 10) #PCs to retain = 35; nb of clusters = 4 # Kstat : BIC for each value of k, here k=4 has the lower BIC grp$Kstat # stat : gives the selected nb of clusters (selected k) and its BIC value grp$stat # grp : gives the id of the group to which each individual is assigned head(grp$grp) # size : number of individuals assigned to each group grp$size # Compare inferred groups (inf) with actual groups (ori) # Find actual groups with function pop on the genind object pop(genind) tab <- table(pop(genind), grp$grp) # Display table.value(tab, col.labels = colnames(tab), row.labels = rownames(tab)) # describe clusters using dapc1 dapc1 <- dapc(genind, grp$grp) # PCs=10, Linear discriminant=5 # visualization of clusters myCol <- c("darkblue", "purple", "green", "orange") scatter(dapc1, scree.da = FALSE, bg="white", pch=20, cell=0, cstar=0, col=myCol, solid=0.4, cex=1, clab=0, leg=TRUE, txt.leg = (paste("Cluster", 1:4))) ########## ## Tajimas'D with popGenome # list of populations populations <- list("paniscus","troglodytes_ellioti","troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus") # create a GENOME class object with vcf file genome <- readData(path = "data/", format = "VCF") # function that create a list, which element is the ensemble of individual for each pop listing_ind <- function(npan, nellio, nschwein, ntroglo, nverus){ pan_vec <- c(rep("NA", npan)) ellio_vec <- c(rep("NA", nellio)) schwein_vec <- c(rep("NA", nschwein)) troglo_vec <- c(rep("NA", ntroglo)) verus_vec <- c(rep("NA", nverus)) for (i in 1:npan){ pan_vec[i] <- paste0("pan_", i) } for (i in 1:nellio){ ellio_vec[i] <- paste0("ellioti_", i) } for (i in 1:nschwein){ schwein_vec[i] <- paste0("schwein_", i) } for (i in 1:ntroglo){ troglo_vec[i] <- paste0("troglo_", i) } for (i in 1:nverus){ verus_vec[i] <- paste0("verus_", i) } populations <- list(pan_vec, ellio_vec, schwein_vec, troglo_vec, verus_vec) } # apply listing_ind with the rignt number of individuals for each pop # and save it in the object 'populations' populations <- listing_ind(11, 10, 6, 4, 5) # set populations in the genome object genome <- set.populations(genome, new.populations = populations, diploid = TRUE) # check genome@populations