############################
## Population genetics : general script
## J. Plantade
## Jan 2021 
############################

# packages
library(ade4)
library(pegas)
library(adegenet)
library(PopGenome)
library(hierfstat)
library(ggplot2)
library(poppr)

################
### setting data

# data import into a loci object
loci <- read.vcf("data/CD4_panTro3.vcf_fixed.vcf", from = 1, to = 572)
nloci <- length(loci)
# convert loci object into a genind object
genind <- loci2genind(loci)
# add population factor
pop(genind)<-rep(c("paniscus","troglodytes_ellioti",
                   "troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus"), 
                 c(11,10,6,4, 5))
# convert genind object into a hierfstat object
hierf <-  genind2hierfstat(genind)

##################
### heterozygosity

div <- summary(genind)
locinfo<-VCFloci("data/CD4_panTro3.vcf_fixed.vcf", what = "all", chunk.size = 1e9, quiet = FALSE)
# ggplot2 : Hobs-Hexp VS position
g1 <- ggplot() + geom_point(aes(x=locinfo$POS, y=div$Hobs-div$Hexp), size=0.3) + xlab("Position") + ylab("Hobs - Hexp") + ggtitle("Difference between Hobs and Hexp for each position") + theme_light()
g1
# Bartlett test (homogeneity of variance)
# H0 : Hobs = Hexp
bartlett.test(list(div$Hexp, div$Hobs)) # S : population is structured
# create one genind object for each population
pop <- c("pan", "ellioti", "schwein", "troglo", "verus")
npop <- length(pop)
for (i in 1:length(pop)){
  tmp <- popsub(genind, sublist = c(subspecies[i]))
  assign(pop[i], tmp)
}
# extracting heterozygosity for each population
list_genind <- c(pan, ellioti, schwein, troglo, verus)
for (i in 1:length(list_genind)){
  tmp <- summary(list_genind[[i]])
  assign(paste0(pop[i], "_div"), tmp)
}
# -> possibility to to Bartlett test and plot Hobs for each pop

#######
### Fst

# Matrix X with colnames and rownames = pop, create the name of pairs
npop <- 5
M <- matrix(NA, nrow = npop, ncol = npop)
M <- as.data.frame(M)            
rownames(M) <- pop            
colnames(M) <- pop
for (i in 1:npop){
  start <- colnames(M)[i]
  for (j in i+1:npop-i){
    end <- rownames(M)[j]
    M[j, i] <- paste0(start, "_", end)
  }
}
# creates a list of pair names
pair_name <- c(rep("NA", (npop^2-npop)/2))
k <- 1
for (i in 1:(npop-1)){
  for (j in i+1:(npop-i)){
    pair_name[k] <- M[j, i]
    k <- k+1
  }
}

# genind subset -> couple of population.
pair_list <- NULL
for (i in 2:(npop)){ # in matrix M we go from col 2 to 5
  for (j in (i+1):(npop+1)){ # in matrix M we go from row 3 to 6
    tmp <- popsub(genind, sublist = c(pop(genind)[i-1], pop(genind)[j-1]))
    tmp2 <- assign(M[j, i], tmp)
    pair_list <- c(pair_list, tmp2) # list of all pop pairs
  }
}

# calculation of F statistics and extraction of Fst values
fst <- matrix(NA, nrow = nloci, ncol = length(pair_list))
for (i in 1:length(pair_list)){
  tmp <- Fst(as.loci(pair_list[[i]]), na.alleles = "")
  fst[,i] <- tmp[,2]
} 
fst <- as.data.frame(fst)
colnames(fst) <- pair_name

# Fst values VS loci position
plot(locinfo$POS, fst[,1], cex=0.2, xlab="Position", ylab="Fst value", main="Fst value per locus")
abline(0.15, 0)

#############
## Clustering with DAPC 

# Identify the number of cluster
grp <- find.clusters(genind, max.n.clust = 10) #PCs to retain = 35; nb of clusters = 4

# Kstat : BIC for each value of k, here k=4 has the lower BIC
grp$Kstat
# stat : gives the selected nb of clusters (selected k) and its BIC value
grp$stat
# grp : gives the id of the group to which each individual is assigned
head(grp$grp)
# size : number of individuals assigned to each group
grp$size

# Compare inferred groups (inf) with actual groups (ori)
# Find actual groups with function pop on the genind object
pop(genind)
tab <- table(pop(genind), grp$grp)
# Display
table.value(tab, col.labels = colnames(tab), row.labels = rownames(tab))

# describe clusters using dapc1
dapc1 <- dapc(genind, grp$grp) # PCs=10, Linear discriminant=5
# visualization of clusters
myCol <- c("darkblue", "purple", "green", "orange")
scatter(dapc1, scree.da = FALSE, bg="white", pch=20, cell=0, cstar=0, col=myCol, solid=0.4, cex=1, clab=0, leg=TRUE, txt.leg = (paste("Cluster", 1:4)))

##########
## Tajimas'D with popGenome

# list of populations
populations <- list("paniscus","troglodytes_ellioti","troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus")
# create a GENOME class object with vcf file
genome <- readData(path = "data/", format = "VCF")
# function that create a list, which element is the ensemble of individual for each pop
listing_ind <- function(npan, nellio, nschwein, ntroglo, nverus){
  pan_vec <- c(rep("NA", npan))
  ellio_vec <- c(rep("NA", nellio))
  schwein_vec <- c(rep("NA", nschwein))
  troglo_vec <- c(rep("NA", ntroglo))
  verus_vec <- c(rep("NA", nverus))
  for (i in 1:npan){
    pan_vec[i] <- paste0("pan_", i)
  }
  for (i in 1:nellio){
    ellio_vec[i] <- paste0("ellioti_", i)
  }
  for (i in 1:nschwein){
    schwein_vec[i] <- paste0("schwein_", i)
  }
  for (i in 1:ntroglo){
    troglo_vec[i] <- paste0("troglo_", i)
  }
  for (i in 1:nverus){
    verus_vec[i] <- paste0("verus_", i)
  }
  populations <- list(pan_vec, ellio_vec, schwein_vec, troglo_vec, verus_vec)
}
# apply listing_ind with the rignt number of individuals for each pop
# and save it in the object 'populations'
populations <- listing_ind(11, 10, 6, 4, 5)
# set populations in the genome object
genome <- set.populations(genome, new.populations = populations, diploid = TRUE)
# check
genome@populations