####################################### ## Population genetics : general script ## J. Plantade ## Jan 2021 ####################################### # packages library(ade4) library(pegas) library(adegenet) library(PopGenome) library(hierfstat) library(ggplot2) library(poppr) library(rmarkdown) library(knitr) library(stringr) library(dplyr) ################ ### setting data # data import into a loci object loci <- read.vcf("data_fixed_999/999_CD4_panTro3.vcf_fixed.vcf") nloci <- length(loci) # convert loci object into a genind object genind <- loci2genind(loci) # add population factor pop(genind)<-rep(c("paniscus","troglodytes_ellioti", "troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus"), c(11,10,6,4, 5)) # convert genind object into a hierfstat object # hierf <- genind2hierfstat(genind) ################## ### heterozygosity div <- summary(genind) locinfo<-VCFloci("data/CD4_panTro3.vcf_fixed.vcf", what = "all", chunk.size = 1e9, quiet = FALSE) # ggplot2 : Hobs-Hexp VS position g1 <- ggplot() + geom_point(aes(x=locinfo$POS, y=div$Hobs-div$Hexp), size=0.3) + xlab("Position") + ylab("Hobs - Hexp") + ggtitle("Difference between Hobs and Hexp for each position") + theme_light() g1 # test Hobs and Hexp normality : Kolmogorov-Smirnov test for samples > 50 hist(div$Hobs) ks.test(x = div$Hobs, y = "pnorm") hist(div$Hexp) ks.test(x = div$Hobs, y = "pnorm") # WARNING : if distributions are not normal we cannot use the Bartlett test (not robust enough) -> use the Levene test # BARTLETT TEST (homogeneity of variance between two samples : obs VS exp) # H0 : Var(Hobs) = Var(Hexp) bartlett.test(list(div$Hexp, div$Hobs)) # S : population is structured # theory : https://eric.univ-lyon2.fr/~ricco/cours/cours/Comp_Pop_Tests_Parametriques.pdf # extract H values and bind them Hobs <- div$Hobs Hexp <- div$Hexp H <- cbind(Hobs, Hexp) H <- as.data.frame(H) # tidy the data : df -> one column with values, one column with the origin of values (observed or expected) library(tidyr) longer_H <- H %>% pivot_longer("Hobs":"Hexp", names_to = "origin", values_to = "H_value") # LEVENE TEST library(lawstat) test <- levene.test(y = longer_H$H_value, group = longer_H$origin) # create one genind object for each population subspecies <- c("paniscus","troglodytes_ellioti","troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus") pop <- c("pan", "ellioti", "schwein", "troglo", "verus") npop <- length(pop) for (i in 1:length(pop)){ tmp <- popsub(genind, sublist = c(subspecies[i])) assign(pop[i], tmp) } # extracting heterozygosity for each population list_genind <- c(pan, ellioti, schwein, troglo, verus) for (i in 1:length(list_genind)){ tmp <- summary(list_genind[[i]]) assign(paste0(pop[i], "_div"), tmp) } # plot Hobs - Hexp for each population (for geom_rect see report_pan.Rmd) gg_pan <- ggplot() + geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') + geom_point(aes(x=locinfo$POS, y=pan_div$Hobs-pan_div$Hexp), size = 0.1) + labs(x = "position", y = "Hobs-Hexp", title = "P. paniscus") + theme_classic() gg_ellioti <- ggplot() + geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') + geom_point(aes(x=locinfo$POS, y=ellioti_div$Hobs-ellioti_div$Hexp), size = 0.1) + labs(x = "position", y = "Hobs-Hexp", title = "P. t. ellioti") + theme_classic() gg_schwein <- ggplot() + geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') + geom_point(aes(x=locinfo$POS, y=schwein_div$Hobs-schwein_div$Hexp), size = 0.1) + labs(x = "position", y = "Hobs-Hexp", title = "P. t. schweinfurthii") + theme_classic() gg_troglo <- ggplot() + geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') + geom_point(aes(x=locinfo$POS, y=troglo_div$Hobs-troglo_div$Hexp), size = 0.1) + labs(x = "position", y = "Hobs-Hexp", title = "P. t. troglodytes") + theme_classic() gg_verus <- ggplot() + geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') + geom_point(aes(x=locinfo$POS, y=verus_div$Hobs-verus_div$Hexp), size = 0.1) + labs(x = "position", y = "Hobs-Hexp", title = "P. t. verus") + theme_classic() ####### ### Fst # calculation of F statistics and extraction of Fst values # setting short names for each population pop <- c("pan", "ellioti", "schwein", "troglo", "verus") # Matrix X with colnames and rownames = pop, create the name of pairs npop <- 5 M <- matrix(NA, nrow = npop, ncol = npop) M <- as.data.frame(M) rownames(M) <- pop colnames(M) <- pop for (i in 1:npop){ start <- colnames(M)[i] for (j in i+1:npop-i){ end <- rownames(M)[j] M[j, i] <- paste0(start, "_", end) } } # creates a list of pair's name pair_name <- c(rep("NA", (npop^2-npop)/2)) k <- 1 for (i in 1:(npop-1)){ for (j in i+1:(npop-i)){ pair_name[k] <- M[j, i] k <- k+1 } } # genind subset -> pair of populations. pair_list <- NULL for (i in 1:(npop-1)){ # in matrix M we go from col 2 to 5 for (j in (i+1):(npop)){ # in matrix M we go from row 3 to 6 tmp <- popsub(genind, sublist = c(subspecies[i], subspecies[j])) tmp2 <- assign(M[j, i], tmp) pair_list <- c(pair_list, tmp2) # list of all pop pairs } } # calculation of F statistics and extraction of Fst values for each pair fst <- matrix(NA, nrow = nloci, ncol = length(pair_list)) for (i in 1:length(pair_list)){ tmp <- Fst(as.loci(pair_list[[i]]), na.alleles = "") fst[,i] <- tmp[,2] } fst <- as.data.frame(fst) colnames(fst) <- pair_name # replace all NaN values by proper NA values in fst data frame fst[fst == "NaN"] <- NA # create a list containing every plot of fst values gg <- lapply(1:length(pair_name), function(i) { pair <- pair_name[i] title <- str_replace(pair, pattern = "_", replacement = "/") ggplot() + geom_point(aes(x=locinfo$POS, y=fst[,i]), size=0.3) + xlab("position") + ylab("Fst value") + ggtitle(title) + theme_light() }) ####################### ## Clustering with DAPC loci <- read.vcf("data_fixed_999/999_CD4_panTro3.vcf_fixed.vcf") genind <- loci2genind(loci) ### Looking for 5 clusters = species/subspecies pop(genind)<-rep(c("paniscus","troglodytes_ellioti", "troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus"), c(11,10,6,4, 5)) # Identify the number of cluster grp <- find.clusters(genind, max.n.clust = 10) #PCs to retain = 35; nb of clusters = 4 # Kstat : BIC for each value of k, here k=4 has the lower BIC grp$Kstat # stat : gives the selected nb of clusters (selected k) and its BIC value grp$stat # grp : gives the id of the group to which each individual is assigned head(grp$grp) # size : number of individuals assigned to each group grp$size # extract the group partition group_df <- as.data.frame(grp$grp) # add the infectious status for each individual inf_status <- rep(c("nonSIV", "SIV","nonSIV"), c(21,10, 5)) size <- rep(0, 36) tab <- cbind(group_df, pop(genind), inf_status, size) colnames(tab) <- c("group", "population", "infectious_status", "size") # set point size in function of the size of the group grp_size <- grp$size for (i in 1:length(tab$group)){ for (j in 1:6){ if (tab[i, 1]==j){ tab[i,4] <- grp_size[j] } } } # plot the groups with population, infectious status and group size gg <- ggplot(data = tab, aes(x = group, y = population, colour = infectious_status, size = size)) + geom_point() gg # decompose the find.cluster function kmean <- kmeans(x = genind@tab, centers = 4) kmeanBIC <- function (clust) { m = ncol(clust$centers) n = length(clust$cluster) k = nrow(clust$centers) D = clust$tot.withinss return(BIC = D + log(n)*m*k) } # save the BIC values in a df k <- c(seq(1,10,1)) BIC <- c(rep("NA", 10)) bic_value <- cbind(k, BIC) for (i in 1:10){ # cluster the data with k=i clust <- kmeans(x = genind@tab, centers = i) # compute the BIC value for k=i bic <- kmeanBIC(clust) bic_value[i,2] <- bic } # return wird values # describe clusters using dapc1 dapc1 <- dapc(genind, grp$grp) # PCs=10, Linear discriminant=5 # visualization of clusters myCol <- c("darkblue", "purple", "green", "orange") scatter(dapc1, scree.da = FALSE, bg="white", pch=20, cell=0, cstar=0, col=myCol, solid=0.4, cex=2, clab=0, leg=TRUE, txt.leg = (paste("Cluster", 1:4))) ### looking for infected/non infected populations # reload the genind object # data import into a loci object # then run the dapc again ########## ## Tajimas'D with popGenome # create a GENOME class object with vcf file genome <- readData(path = "popgenome_data/", format = "VCF") ### Marie's method : setting pop considering that individuals are diploid # extract the names of individuals in the genome object pops <- get.individuals(genome)[[1]] # subset populations # grep : search for matches of a pattern and return the indices that yielded a match # by putting the indices in [] we extract the individuals names paniscus <- pops[grep("Pan_paniscus",pops)] ellioti <- pops[grep("Pan_troglodytes_ellioti",pops)] schwein <- pops[grep("Pan_troglodytes_schweinfurthii",pops)] troglo <- pops[grep("Pan_troglodytes_troglodytes",pops)] verus <- pops[grep("Pan_troglodytes_verus",pops)] # we set the populations with a list genome <- set.populations(genome, list(paniscus, troglo, ellioti, schwein, verus)) # check : two copies for each individual because they are diploid genome@populations ### # set the outgroup (it changes the D values) genome <- set.outgroup(genome, paniscus) # check genome@outgroup # neutrality tests genome <- neutrality.stats(genome) # get the Tajima's D values genome@Tajima.D # pop : # 1 > paniscus # 2 > troglo # 3 > ellioti # 4 > schwein # 5 > verus # figure of D values for one gene library(tidyr) taj <- genome@Tajima.D taj <- as.data.frame(t(taj)) taj <- cbind(taj, pop = c("paniscus", "t. troglodytes", "t. ellioti", "t. schweinwurthii", "t. verus")) colnames(taj)[1] <- c("d_value") taj <- taj %>% drop_na() gg <- ggplot(data = taj, mapping = aes(x = d_value, y = pop)) + geom_point() + labs(title = "Tajima's D value for each P. troglodytes subspecies", x = "D value", y = "populations") gg