#######################################
## Population genetics : general script
## J. Plantade
## Jan 2021 
#######################################

# packages
library(ade4)
library(pegas)
library(adegenet)
library(PopGenome)
library(hierfstat)
library(ggplot2)
library(poppr)
library(rmarkdown)
library(knitr)
library(stringr)
library(dplyr)

################
### setting data

# data import into a loci object
loci <- read.vcf("data_fixed_999/999_CD4_panTro3.vcf_fixed.vcf")
nloci <- length(loci)
# convert loci object into a genind object
genind <- loci2genind(loci)
# add population factor
pop(genind)<-rep(c("paniscus","troglodytes_ellioti",
                   "troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus"), 
                 c(11,10,6,4, 5))
# convert genind object into a hierfstat object
# hierf <-  genind2hierfstat(genind)

##################
### heterozygosity

div <- summary(genind)
locinfo<-VCFloci("data/CD4_panTro3.vcf_fixed.vcf", what = "all", chunk.size = 1e9, quiet = FALSE)
# ggplot2 : Hobs-Hexp VS position
g1 <- ggplot() + geom_point(aes(x=locinfo$POS, y=div$Hobs-div$Hexp), size=0.3) + xlab("Position") + ylab("Hobs - Hexp") + ggtitle("Difference between Hobs and Hexp for each position") + theme_light()
g1

# test Hobs and Hexp normality : Kolmogorov-Smirnov test for samples > 50
hist(div$Hobs)
ks.test(x = div$Hobs, y = "pnorm")
hist(div$Hexp)
ks.test(x = div$Hobs, y = "pnorm")
# WARNING : if distributions are not normal we cannot use the Bartlett test (not robust enough) -> use the Levene test

# BARTLETT TEST (homogeneity of variance between two samples : obs VS exp)
# H0 : Var(Hobs) = Var(Hexp)
bartlett.test(list(div$Hexp, div$Hobs)) # S : population is structured
# theory : https://eric.univ-lyon2.fr/~ricco/cours/cours/Comp_Pop_Tests_Parametriques.pdf 

# extract H values and bind them
Hobs <- div$Hobs
Hexp <- div$Hexp
H <- cbind(Hobs, Hexp)
H <- as.data.frame(H)
# tidy the data : df -> one column with values, one column with the origin of values (observed or expected)
library(tidyr)
longer_H <- H %>%
  pivot_longer("Hobs":"Hexp", names_to = "origin", values_to = "H_value")
# LEVENE TEST
library(lawstat)
test <- levene.test(y = longer_H$H_value, group = longer_H$origin)

# create one genind object for each population
subspecies <- c("paniscus","troglodytes_ellioti","troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus")
pop <- c("pan", "ellioti", "schwein", "troglo", "verus")
npop <- length(pop)
for (i in 1:length(pop)){
  tmp <- popsub(genind, sublist = c(subspecies[i]))
  assign(pop[i], tmp)
}
# extracting heterozygosity for each population
list_genind <- c(pan, ellioti, schwein, troglo, verus)
for (i in 1:length(list_genind)){
  tmp <- summary(list_genind[[i]])
  assign(paste0(pop[i], "_div"), tmp)
}
# plot Hobs - Hexp for each population (for geom_rect see report_pan.Rmd)
gg_pan <- ggplot() + 
  geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') + 
  geom_point(aes(x=locinfo$POS, y=pan_div$Hobs-pan_div$Hexp), size = 0.1) + 
  labs(x = "position", y = "Hobs-Hexp", title = "P. paniscus") + 
  theme_classic()
gg_ellioti <- ggplot() + 
  geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') + 
  geom_point(aes(x=locinfo$POS, y=ellioti_div$Hobs-ellioti_div$Hexp), size = 0.1) + 
  labs(x = "position", y = "Hobs-Hexp", title = "P. t. ellioti") + 
  theme_classic()
gg_schwein <- ggplot() + 
  geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') + 
  geom_point(aes(x=locinfo$POS, y=schwein_div$Hobs-schwein_div$Hexp), size = 0.1) + 
  labs(x = "position", y = "Hobs-Hexp", title = "P. t. schweinfurthii") + 
  theme_classic()
gg_troglo <- ggplot() + 
  geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') + 
  geom_point(aes(x=locinfo$POS, y=troglo_div$Hobs-troglo_div$Hexp), size = 0.1) + 
  labs(x = "position", y = "Hobs-Hexp", title = "P. t. troglodytes") + 
  theme_classic()
gg_verus <- ggplot() + 
  geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') + 
  geom_point(aes(x=locinfo$POS, y=verus_div$Hobs-verus_div$Hexp), size = 0.1) + 
  labs(x = "position", y = "Hobs-Hexp", title = "P. t. verus") + 
  theme_classic()

#######
### Fst
# calculation of F statistics and extraction of Fst values
# setting short names for each population
pop <- c("pan", "ellioti", "schwein", "troglo", "verus")
# Matrix X with colnames and rownames = pop, create the name of pairs
npop <- 5
M <- matrix(NA, nrow = npop, ncol = npop)
M <- as.data.frame(M)            
rownames(M) <- pop            
colnames(M) <- pop
for (i in 1:npop){
  start <- colnames(M)[i]
  for (j in i+1:npop-i){
    end <- rownames(M)[j]
    M[j, i] <- paste0(start, "_", end)
  }
}
# creates a list of pair's name
pair_name <- c(rep("NA", (npop^2-npop)/2))
k <- 1
for (i in 1:(npop-1)){
  for (j in i+1:(npop-i)){
    pair_name[k] <- M[j, i]
    k <- k+1
  }
}
# genind subset -> pair of populations.
pair_list <- NULL
for (i in 1:(npop-1)){ # in matrix M we go from col 2 to 5
  for (j in (i+1):(npop)){ # in matrix M we go from row 3 to 6
    tmp <- popsub(genind, sublist = c(subspecies[i], subspecies[j]))
    tmp2 <- assign(M[j, i], tmp)
    pair_list <- c(pair_list, tmp2) # list of all pop pairs
  }
}
# calculation of F statistics and extraction of Fst values for each pair
fst <- matrix(NA, nrow = nloci, ncol = length(pair_list))
for (i in 1:length(pair_list)){
  tmp <- Fst(as.loci(pair_list[[i]]), na.alleles = "")
  fst[,i] <- tmp[,2]
} 
fst <- as.data.frame(fst)
colnames(fst) <- pair_name
# replace all NaN values by proper NA values in fst data frame
fst[fst == "NaN"] <- NA
# create a list containing every plot of fst values
gg <- lapply(1:length(pair_name), function(i) {
  pair <- pair_name[i]
  title <- str_replace(pair, pattern = "_", replacement = "/")
  ggplot() + geom_point(aes(x=locinfo$POS, y=fst[,i]), size=0.3) + xlab("position") + ylab("Fst value") + ggtitle(title) + theme_light()
})
#######################
## Clustering with DAPC 

loci <- read.vcf("data_fixed_999/999_CD4_panTro3.vcf_fixed.vcf")
genind <- loci2genind(loci)
### Looking for 5 clusters = species/subspecies
pop(genind)<-rep(c("paniscus","troglodytes_ellioti",
                   "troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus"), 
                 c(11,10,6,4, 5))

# Identify the number of cluster
grp <- find.clusters(genind, max.n.clust = 10) #PCs to retain = 35; nb of clusters = 4

# Kstat : BIC for each value of k, here k=4 has the lower BIC
grp$Kstat
# stat : gives the selected nb of clusters (selected k) and its BIC value
grp$stat
# grp : gives the id of the group to which each individual is assigned
head(grp$grp)
# size : number of individuals assigned to each group
grp$size

# extract the group partition
group_df <- as.data.frame(grp$grp)
# add the infectious status for each individual
inf_status <- rep(c("nonSIV", "SIV","nonSIV"), c(21,10, 5))
size <- rep(0, 36)
tab <- cbind(group_df, pop(genind), inf_status, size)
colnames(tab) <- c("group", "population", "infectious_status", "size")
# set point size in function of the size of the group
grp_size <- grp$size
for (i in 1:length(tab$group)){
  for (j in 1:6){
    if (tab[i, 1]==j){
      tab[i,4] <- grp_size[j]
    }
  }
}
# plot the groups with population, infectious status and group size
gg <- ggplot(data = tab, aes(x = group, y = population, colour = infectious_status, size = size)) + geom_point()
gg

# decompose the find.cluster function
kmean <- kmeans(x = genind@tab, centers = 4)
kmeanBIC <- function (clust) {
  m = ncol(clust$centers)
  n = length(clust$cluster)
  k = nrow(clust$centers)
  D = clust$tot.withinss
  return(BIC = D + log(n)*m*k)
}
# save the BIC values in a df
k <- c(seq(1,10,1))
BIC <- c(rep("NA", 10))
bic_value <- cbind(k, BIC)
for (i in 1:10){
  # cluster the data with k=i
  clust <- kmeans(x = genind@tab, centers = i)
  # compute the BIC value for k=i
  bic <- kmeanBIC(clust)
  bic_value[i,2] <- bic
} # return wird values

# describe clusters using dapc1
dapc1 <- dapc(genind, grp$grp) # PCs=10, Linear discriminant=5
# visualization of clusters
myCol <- c("darkblue", "purple", "green", "orange")
scatter(dapc1, scree.da = FALSE, bg="white", pch=20, cell=0, cstar=0, col=myCol, solid=0.4, cex=2, clab=0, leg=TRUE, txt.leg = (paste("Cluster", 1:4)))

### looking for infected/non infected populations
# reload the genind object
# data import into a loci object

# then run the dapc again

##########
## Tajimas'D with popGenome

# create a GENOME class object with vcf file
genome <- readData(path = "popgenome_data/", format = "VCF")

### Marie's method : setting pop considering that individuals are diploid
# extract the names of individuals in the genome object
pops <- get.individuals(genome)[[1]]
# subset populations
# grep : search for matches of a pattern and return the indices that yielded a match
# by putting the indices in [] we extract the individuals names
paniscus <- pops[grep("Pan_paniscus",pops)]
ellioti <- pops[grep("Pan_troglodytes_ellioti",pops)]
schwein <- pops[grep("Pan_troglodytes_schweinfurthii",pops)]
troglo <- pops[grep("Pan_troglodytes_troglodytes",pops)]
verus <- pops[grep("Pan_troglodytes_verus",pops)]
# we set the populations with a list
genome  <- set.populations(genome, list(paniscus, troglo, ellioti, schwein, verus))
# check : two copies for each individual because they are diploid
genome@populations
###

# set the outgroup (it changes the D values)
genome <- set.outgroup(genome, paniscus)
# check
genome@outgroup

# neutrality tests
genome <- neutrality.stats(genome)
# get the Tajima's D values
genome@Tajima.D
# pop :
# 1 > paniscus
# 2 > troglo
# 3 > ellioti
# 4 > schwein
# 5 > verus

# figure of D values for one gene
library(tidyr)
taj <- genome@Tajima.D
taj <- as.data.frame(t(taj))
taj <- cbind(taj, pop = c("paniscus", "t. troglodytes", "t. ellioti", "t. schweinwurthii", "t. verus"))
colnames(taj)[1] <- c("d_value")
taj <- taj %>% drop_na()

gg <- ggplot(data = taj, mapping = aes(x = d_value, y = pop)) + geom_point() + labs(title = "Tajima's D value for each P. troglodytes subspecies", x = "D value", y = "populations")
gg