Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
#######################################
## Population genetics : general script
## J. Plantade
## Jan 2021
#######################################
# packages
library(ade4)
library(pegas)
library(adegenet)
library(PopGenome)
library(hierfstat)
library(ggplot2)
library(poppr)
library(rmarkdown)
library(knitr)
library(stringr)
library(dplyr)
################
### setting data
# data import into a loci object
loci <- read.vcf("data_fixed_999/999_CD4_panTro3.vcf_fixed.vcf")
nloci <- length(loci)
# convert loci object into a genind object
genind <- loci2genind(loci)
# add population factor
pop(genind)<-rep(c("paniscus","troglodytes_ellioti",
"troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus"),
c(11,10,6,4, 5))
# convert genind object into a hierfstat object
# hierf <- genind2hierfstat(genind)
##################
### heterozygosity
div <- summary(genind)
locinfo<-VCFloci("data/CD4_panTro3.vcf_fixed.vcf", what = "all", chunk.size = 1e9, quiet = FALSE)
# ggplot2 : Hobs-Hexp VS position
g1 <- ggplot() + geom_point(aes(x=locinfo$POS, y=div$Hobs-div$Hexp), size=0.3) + xlab("Position") + ylab("Hobs - Hexp") + ggtitle("Difference between Hobs and Hexp for each position") + theme_light()
g1
# test Hobs and Hexp normality : Kolmogorov-Smirnov test for samples > 50
hist(div$Hobs)
ks.test(x = div$Hobs, y = "pnorm")
hist(div$Hexp)
ks.test(x = div$Hobs, y = "pnorm")
# WARNING : if distributions are not normal we cannot use the Bartlett test (not robust enough) -> use the Levene test
# BARTLETT TEST (homogeneity of variance between two samples : obs VS exp)
# H0 : Var(Hobs) = Var(Hexp)
bartlett.test(list(div$Hexp, div$Hobs)) # S : population is structured
# theory : https://eric.univ-lyon2.fr/~ricco/cours/cours/Comp_Pop_Tests_Parametriques.pdf
# extract H values and bind them
Hobs <- div$Hobs
Hexp <- div$Hexp
H <- cbind(Hobs, Hexp)
H <- as.data.frame(H)
# tidy the data : df -> one column with values, one column with the origin of values (observed or expected)
library(tidyr)
longer_H <- H %>%
pivot_longer("Hobs":"Hexp", names_to = "origin", values_to = "H_value")
# LEVENE TEST
library(lawstat)
test <- levene.test(y = longer_H$H_value, group = longer_H$origin)
# create one genind object for each population
subspecies <- c("paniscus","troglodytes_ellioti","troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus")
pop <- c("pan", "ellioti", "schwein", "troglo", "verus")
npop <- length(pop)
for (i in 1:length(pop)){
tmp <- popsub(genind, sublist = c(subspecies[i]))
assign(pop[i], tmp)
}
# extracting heterozygosity for each population
list_genind <- c(pan, ellioti, schwein, troglo, verus)
for (i in 1:length(list_genind)){
tmp <- summary(list_genind[[i]])
assign(paste0(pop[i], "_div"), tmp)
}
# plot Hobs - Hexp for each population (for geom_rect see report_pan.Rmd)
gg_pan <- ggplot() +
geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') +
geom_point(aes(x=locinfo$POS, y=pan_div$Hobs-pan_div$Hexp), size = 0.1) +
labs(x = "position", y = "Hobs-Hexp", title = "P. paniscus") +
theme_classic()
gg_ellioti <- ggplot() +
geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') +
geom_point(aes(x=locinfo$POS, y=ellioti_div$Hobs-ellioti_div$Hexp), size = 0.1) +
labs(x = "position", y = "Hobs-Hexp", title = "P. t. ellioti") +
theme_classic()
gg_schwein <- ggplot() +
geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') +
geom_point(aes(x=locinfo$POS, y=schwein_div$Hobs-schwein_div$Hexp), size = 0.1) +
labs(x = "position", y = "Hobs-Hexp", title = "P. t. schweinfurthii") +
theme_classic()
gg_troglo <- ggplot() +
geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') +
geom_point(aes(x=locinfo$POS, y=troglo_div$Hobs-troglo_div$Hexp), size = 0.1) +
labs(x = "position", y = "Hobs-Hexp", title = "P. t. troglodytes") +
theme_classic()
gg_verus <- ggplot() +
geom_rect(aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') +
geom_point(aes(x=locinfo$POS, y=verus_div$Hobs-verus_div$Hexp), size = 0.1) +
labs(x = "position", y = "Hobs-Hexp", title = "P. t. verus") +
theme_classic()
#######
### Fst
# calculation of F statistics and extraction of Fst values
# setting short names for each population
pop <- c("pan", "ellioti", "schwein", "troglo", "verus")
# Matrix X with colnames and rownames = pop, create the name of pairs
npop <- 5
M <- matrix(NA, nrow = npop, ncol = npop)
M <- as.data.frame(M)
rownames(M) <- pop
colnames(M) <- pop
for (i in 1:npop){
start <- colnames(M)[i]
for (j in i+1:npop-i){
end <- rownames(M)[j]
M[j, i] <- paste0(start, "_", end)
}
}
# creates a list of pair's name
pair_name <- c(rep("NA", (npop^2-npop)/2))
k <- 1
for (i in 1:(npop-1)){
for (j in i+1:(npop-i)){
pair_name[k] <- M[j, i]
k <- k+1
}
}
# genind subset -> pair of populations.
pair_list <- NULL
for (i in 1:(npop-1)){ # in matrix M we go from col 2 to 5
for (j in (i+1):(npop)){ # in matrix M we go from row 3 to 6
tmp <- popsub(genind, sublist = c(subspecies[i], subspecies[j]))
tmp2 <- assign(M[j, i], tmp)
pair_list <- c(pair_list, tmp2) # list of all pop pairs
}
}
# calculation of F statistics and extraction of Fst values for each pair
fst <- matrix(NA, nrow = nloci, ncol = length(pair_list))
for (i in 1:length(pair_list)){
tmp <- Fst(as.loci(pair_list[[i]]), na.alleles = "")
fst[,i] <- tmp[,2]
}
fst <- as.data.frame(fst)
colnames(fst) <- pair_name
# replace all NaN values by proper NA values in fst data frame
fst[fst == "NaN"] <- NA
# create a list containing every plot of fst values
gg <- lapply(1:length(pair_name), function(i) {
pair <- pair_name[i]
title <- str_replace(pair, pattern = "_", replacement = "/")
ggplot() + geom_point(aes(x=locinfo$POS, y=fst[,i]), size=0.3) + xlab("position") + ylab("Fst value") + ggtitle(title) + theme_light()
})
#######################
## Clustering with DAPC
loci <- read.vcf("data_fixed_999/999_CD4_panTro3.vcf_fixed.vcf")
genind <- loci2genind(loci)
### Looking for 5 clusters = species/subspecies
pop(genind)<-rep(c("paniscus","troglodytes_ellioti",
"troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus"),
c(11,10,6,4, 5))
# Identify the number of cluster
grp <- find.clusters(genind, max.n.clust = 10) #PCs to retain = 35; nb of clusters = 4
# Kstat : BIC for each value of k, here k=4 has the lower BIC
grp$Kstat
# stat : gives the selected nb of clusters (selected k) and its BIC value
grp$stat
# grp : gives the id of the group to which each individual is assigned
head(grp$grp)
# size : number of individuals assigned to each group
grp$size
# extract the group partition
group_df <- as.data.frame(grp$grp)
# add the infectious status for each individual
inf_status <- rep(c("nonSIV", "SIV","nonSIV"), c(21,10, 5))
size <- rep(0, 36)
tab <- cbind(group_df, pop(genind), inf_status, size)
colnames(tab) <- c("group", "population", "infectious_status", "size")
# set point size in function of the size of the group
grp_size <- grp$size
for (i in 1:length(tab$group)){
for (j in 1:6){
if (tab[i, 1]==j){
tab[i,4] <- grp_size[j]
}
}
}
# plot the groups with population, infectious status and group size
gg <- ggplot(data = tab, aes(x = group, y = population, colour = infectious_status, size = size)) + geom_point()
gg
# decompose the find.cluster function
kmean <- kmeans(x = genind@tab, centers = 4)
kmeanBIC <- function (clust) {
m = ncol(clust$centers)
n = length(clust$cluster)
k = nrow(clust$centers)
D = clust$tot.withinss
return(BIC = D + log(n)*m*k)
}
# save the BIC values in a df
k <- c(seq(1,10,1))
BIC <- c(rep("NA", 10))
bic_value <- cbind(k, BIC)
for (i in 1:10){
# cluster the data with k=i
clust <- kmeans(x = genind@tab, centers = i)
# compute the BIC value for k=i
bic <- kmeanBIC(clust)
bic_value[i,2] <- bic
} # return wird values
# describe clusters using dapc1
dapc1 <- dapc(genind, grp$grp) # PCs=10, Linear discriminant=5
# visualization of clusters
myCol <- c("darkblue", "purple", "green", "orange")
scatter(dapc1, scree.da = FALSE, bg="white", pch=20, cell=0, cstar=0, col=myCol, solid=0.4, cex=2, clab=0, leg=TRUE, txt.leg = (paste("Cluster", 1:4)))
### looking for infected/non infected populations
# reload the genind object
# data import into a loci object
# then run the dapc again
##########
## Tajimas'D with popGenome
# create a GENOME class object with vcf file
genome <- readData(path = "popgenome_data/", format = "VCF")
### Marie's method : setting pop considering that individuals are diploid
# extract the names of individuals in the genome object
pops <- get.individuals(genome)[[1]]
# subset populations
# grep : search for matches of a pattern and return the indices that yielded a match
# by putting the indices in [] we extract the individuals names
paniscus <- pops[grep("Pan_paniscus",pops)]
ellioti <- pops[grep("Pan_troglodytes_ellioti",pops)]
schwein <- pops[grep("Pan_troglodytes_schweinfurthii",pops)]
troglo <- pops[grep("Pan_troglodytes_troglodytes",pops)]
verus <- pops[grep("Pan_troglodytes_verus",pops)]
# we set the populations with a list
genome <- set.populations(genome, list(paniscus, troglo, ellioti, schwein, verus))
# check : two copies for each individual because they are diploid
genome@populations
###
# set the outgroup (it changes the D values)
genome <- set.outgroup(genome, paniscus)
# check
genome@outgroup
# neutrality tests
genome <- neutrality.stats(genome)
# get the Tajima's D values
genome@Tajima.D
# pop :
# 1 > paniscus
# 2 > troglo
# 3 > ellioti
# 4 > schwein
# 5 > verus
# figure of D values for one gene
library(tidyr)
taj <- genome@Tajima.D
taj <- as.data.frame(t(taj))
taj <- cbind(taj, pop = c("paniscus", "t. troglodytes", "t. ellioti", "t. schweinwurthii", "t. verus"))
colnames(taj)[1] <- c("d_value")
taj <- taj %>% drop_na()
gg <- ggplot(data = taj, mapping = aes(x = d_value, y = pop)) + geom_point() + labs(title = "Tajima's D value for each P. troglodytes subspecies", x = "D value", y = "populations")
gg