Newer
Older
############################
## Population genetics : general script
## J. Plantade
## Jan 2021
############################
# packages
library(ade4)
library(pegas)
library(adegenet)
library(PopGenome)
library(hierfstat)
library(ggplot2)
library(poppr)
################
### setting data
# data import into a loci object
loci <- read.vcf("data/CD4_panTro3.vcf_fixed.vcf", from = 1, to = 572)
nloci <- length(loci)
# convert loci object into a genind object
genind <- loci2genind(loci)
# add population factor
pop(genind)<-rep(c("paniscus","troglodytes_ellioti",
"troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus"),
c(11,10,6,4, 5))
# convert genind object into a hierfstat object
hierf <- genind2hierfstat(genind)
##################
### heterozygosity
div <- summary(genind)
locinfo<-VCFloci("data/CD4_panTro3.vcf_fixed.vcf", what = "all", chunk.size = 1e9, quiet = FALSE)
# ggplot2 : Hobs-Hexp VS position
g1 <- ggplot() + geom_point(aes(x=locinfo$POS, y=div$Hobs-div$Hexp), size=0.3) + xlab("Position") + ylab("Hobs - Hexp") + ggtitle("Difference between Hobs and Hexp for each position") + theme_light()
g1
# Bartlett test (homogeneity of variance)
# H0 : Hobs = Hexp
bartlett.test(list(div$Hexp, div$Hobs)) # S : population is structured
# create one genind object for each population
pop <- c("pan", "ellioti", "schwein", "troglo", "verus")
npop <- length(pop)
tmp <- popsub(genind, sublist = c(subspecies[i]))
assign(pop[i], tmp)
}
# extracting heterozygosity for each population
list_genind <- c(pan, ellioti, schwein, troglo, verus)
for (i in 1:length(list_genind)){
tmp <- summary(list_genind[[i]])
assign(paste0(pop[i], "_div"), tmp)
}
# -> possibility to to Bartlett test and plot Hobs for each pop
#######
### Fst
# Matrix X with colnames and rownames = pop, create the name of pairs
npop <- 5
M <- matrix(NA, nrow = npop, ncol = npop)
M <- as.data.frame(M)
rownames(M) <- pop
colnames(M) <- pop
for (i in 1:npop){
start <- colnames(M)[i]
for (j in i+1:npop-i){
end <- rownames(M)[j]
M[j, i] <- paste0(start, "_", end)
}
}
# creates a list of pair names
pair_name <- c(rep("NA", (npop^2-npop)/2))
k <- 1
for (i in 1:(npop-1)){
for (j in i+1:(npop-i)){
pair_name[k] <- M[j, i]
k <- k+1
}
}
# genind subset -> couple of population.
pair_list <- NULL
for (i in 2:(npop)){ # in matrix M we go from col 2 to 5
for (j in (i+1):(npop+1)){ # in matrix M we go from row 3 to 6
tmp <- popsub(genind, sublist = c(pop(genind)[i-1], pop(genind)[j-1]))
tmp2 <- assign(M[j, i], tmp)
}
}
# calculation of F statistics and extraction of Fst values
fst <- matrix(NA, nrow = nloci, ncol = length(pair_list))
for (i in 1:length(pair_list)){
tmp <- Fst(as.loci(pair_list[[i]]), na.alleles = "")
fst[,i] <- tmp[,2]
}
fst <- as.data.frame(fst)
# Fst values VS loci position
plot(locinfo$POS, fst[,1], cex=0.2, xlab="Position", ylab="Fst value", main="Fst value per locus")
abline(0.15, 0)
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#############
## Clustering with DAPC
# Identify the number of cluster
grp <- find.clusters(genind, max.n.clust = 10) #PCs to retain = 35; nb of clusters = 4
# Kstat : BIC for each value of k, here k=4 has the lower BIC
grp$Kstat
# stat : gives the selected nb of clusters (selected k) and its BIC value
grp$stat
# grp : gives the id of the group to which each individual is assigned
head(grp$grp)
# size : number of individuals assigned to each group
grp$size
# Compare inferred groups (inf) with actual groups (ori)
# Find actual groups with function pop on the genind object
pop(genind)
tab <- table(pop(genind), grp$grp)
# Display
table.value(tab, col.labels = colnames(tab), row.labels = rownames(tab))
# describe clusters using dapc1
dapc1 <- dapc(genind, grp$grp) # PCs=10, Linear discriminant=5
# visualization of clusters
myCol <- c("darkblue", "purple", "green", "orange")
scatter(dapc1, scree.da = FALSE, bg="white", pch=20, cell=0, cstar=0, col=myCol, solid=0.4, cex=1, clab=0, leg=TRUE, txt.leg = (paste("Cluster", 1:4)))
##########
## Tajimas'D with popGenome
# list of populations
populations <- list("paniscus","troglodytes_ellioti","troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus")
# create a GENOME class object with vcf file
genome <- readData(path = "data/", format = "VCF")
# function that create a list, which element is the ensemble of individual for each pop
listing_ind <- function(npan, nellio, nschwein, ntroglo, nverus){
pan_vec <- c(rep("NA", npan))
ellio_vec <- c(rep("NA", nellio))
schwein_vec <- c(rep("NA", nschwein))
troglo_vec <- c(rep("NA", ntroglo))
verus_vec <- c(rep("NA", nverus))
for (i in 1:npan){
pan_vec[i] <- paste0("pan_", i)
}
for (i in 1:nellio){
ellio_vec[i] <- paste0("ellioti_", i)
}
for (i in 1:nschwein){
schwein_vec[i] <- paste0("schwein_", i)
}
for (i in 1:ntroglo){
troglo_vec[i] <- paste0("troglo_", i)
}
for (i in 1:nverus){
verus_vec[i] <- paste0("verus_", i)
}
populations <- list(pan_vec, ellio_vec, schwein_vec, troglo_vec, verus_vec)
}
# apply listing_ind with the rignt number of individuals for each pop
# and save it in the object 'populations'
populations <- listing_ind(11, 10, 6, 4, 5)
# set populations in the genome object
genome <- set.populations(genome, new.populations = populations, diploid = TRUE)
# check
genome@populations