Skip to content
Snippets Groups Projects
Commit 33806038 authored by jplantad's avatar jplantad
Browse files

Exploration script

parent f9363a1f
No related branches found
No related tags found
No related merge requests found
......@@ -33,86 +33,60 @@ hierf <- genind2hierfstat(genind)
div <- summary(genind)
locinfo<-VCFloci("data/CD4_panTro3.vcf_fixed.vcf", what = "all", chunk.size = 1e9, quiet = FALSE)
# observed heterozygosity VS loci position
plot(locinfo$POS, div$Hobs, xlab="Loci position", ylab="Observed Heterozygosity",
main="Observed heterozygosity per locus", pch=20)
# observed heterozygosity VS expected heterozygosity
plot(div$Hexp, div$Hobs, xlab="Hexp", ylab="Hobs",
main="Expected heterozygosity as a function of observed heterozygosity per locus")
abline(0, 1, col="blue")
# Hobs - Hexp VS position
plot(locinfo$POS, div$Hobs-div$Hexp, xlab = "loci position", ylab = "Hobs-Hexp", cex=0.3)
# ggplot2 : Hobs-Hexp VS position
g1 <- ggplot() + geom_point(aes(x=locinfo$POS, y=div$Hobs-div$Hexp), size=0.3) + xlab("Position") + ylab("Hobs - Hexp") + ggtitle("Difference between Hobs and Hexp for each position") + theme_light()
g1
# Bartlett test (homogeneity of variance)
# H0 : Hobs = Hexp
bartlett.test(list(div$Hexp, div$Hobs))
# ggplot2 try
g1 <- ggplot() + geom_point(aes(x=locinfo$POS, y=div$Hobs)) + xlab("Position") + ylab("Observed heterozygosity") + ggtitle("Observed heterozygosity per locus") + theme_light()
g1
# splitting genind objet according to population
subspecies <- c("paniscus","troglodytes_ellioti","troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus")
bartlett.test(list(div$Hexp, div$Hobs)) # S : population is structured
# create one genind object for each population
pop <- c("pan", "ellioti", "schwein", "troglo", "verus")
npop <- length(pop)
for (i in 1:length(subspecies)){
for (i in 1:length(pop)){
tmp <- popsub(genind, sublist = c(subspecies[i]))
assign(pop[i], tmp)
}
# Bartlett test on all pop except verus because geographically distant
quatre_pop <- popsub(genind, sublist = c("paniscus","troglodytes_ellioti","troglodytes_schweinfurthii","troglodytes_troglodytes"))
div_quatre <- summary(quatre_pop)
bartlett.test(list(div_quatre$Hexp, div_quatre$Hobs)) # S
# hist of Hobs and Hexp distribution for paniscus
par(mfrow=c(1, 2))
hist(pan_div$Hobs)
hist(pan_div$Hexp)
# Bartlett test on each pop
bartlett.test(list(pan_div$Hexp, pan_div$Hobs)) # S
bartlett.test(list(ellioti_div$Hexp, ellioti_div$Hobs)) # S
bartlett.test(list(schwein_div$Hexp, schwein_div$Hobs)) # S
bartlett.test(list(troglo_div$Hexp, troglo_div$Hobs)) # S
bartlett.test(list(verus_div$Hexp, verus_div$Hobs)) # S
# ? all test are significant ?
# extracting heterozygosity for each population
list_genind <- c(pan, ellioti, schwein, troglo, verus)
for (i in 1:length(list_genind)){
tmp <- summary(list_genind[[i]])
assign(paste0(pop[i], "_div"), tmp)
}
par(mfrow =c(2, 2))
plot(locinfo$POS, pan_div$Hobs, cex=0.3)
plot(locinfo$POS, ellioti_div$Hobs, cex=0.3)
plot(locinfo$POS, schwein_div$Hobs, cex=0.3)
plot(locinfo$POS, troglo_div$Hobs, cex=0.3)
# plot(locinfo$POS, verus_div$Hobs, cex=0.3)
# -> possibility to to Bartlett test and plot Hobs for each pop
#######
### Fst
duo <- c("pan_ellioti", "pan_schwein", "pan_troglo", "pan_verus", "ellioti_schwein", "ellioti_troglo", "ellioti_verus", "schwein_troglo", "schwein_verus", "troglo_verus")
# Matrix X with colnames and rownames = pop, create the name of pairs
M <- matrix(NA, nrow = (npop+1), ncol = (npop+1))
M[1, 2:6] <- pop
M[2:6, 1] <- pop
for (i in 2:(npop+1)){
start <- M[1, i]
for (j in 2:(npop+1)){
end <- M[j, 1]
npop <- 5
M <- matrix(NA, nrow = npop, ncol = npop)
M <- as.data.frame(M)
rownames(M) <- pop
colnames(M) <- pop
for (i in 1:npop){
start <- colnames(M)[i]
for (j in i+1:npop-i){
end <- rownames(M)[j]
M[j, i] <- paste0(start, "_", end)
}
}
# need a loop that creates a list of pair names
# creates a list of pair names
pair_name <- c(rep("NA", (npop^2-npop)/2))
k <- 1
for (i in 1:(npop-1)){
for (j in i+1:(npop-i)){
pair_name[k] <- M[j, i]
k <- k+1
}
}
# genind subset -> couple of population.
pair_list <- NULL
for (i in 2:(npop)){ # in matrix M we go from col 2 to 5
for (j in (i+1):(npop+1)){ # in matrix M we go from row 3 to 6
tmp <- popsub(genind, sublist = c(subspecies[i-1], subspecies[j-1]))
tmp <- popsub(genind, sublist = c(pop(genind)[i-1], pop(genind)[j-1]))
tmp2 <- assign(M[j, i], tmp)
pair_list <- c(pair_list, tmp2)
pair_list <- c(pair_list, tmp2) # list of all pop pairs
}
}
......@@ -123,9 +97,75 @@ for (i in 1:length(pair_list)){
fst[,i] <- tmp[,2]
}
fst <- as.data.frame(fst)
colnames(fst) <- duo
colnames(fst) <- pair_name
# Fst values VS loci position
plot(locinfo$POS, fst[,1], cex=0.2, xlab="Position", ylab="Fst value", main="Fst value per locus")
abline(0.15, 0)
#############
## Clustering with DAPC
# Identify the number of cluster
grp <- find.clusters(genind, max.n.clust = 10) #PCs to retain = 35; nb of clusters = 4
# Kstat : BIC for each value of k, here k=4 has the lower BIC
grp$Kstat
# stat : gives the selected nb of clusters (selected k) and its BIC value
grp$stat
# grp : gives the id of the group to which each individual is assigned
head(grp$grp)
# size : number of individuals assigned to each group
grp$size
# Compare inferred groups (inf) with actual groups (ori)
# Find actual groups with function pop on the genind object
pop(genind)
tab <- table(pop(genind), grp$grp)
# Display
table.value(tab, col.labels = colnames(tab), row.labels = rownames(tab))
# describe clusters using dapc1
dapc1 <- dapc(genind, grp$grp) # PCs=10, Linear discriminant=5
# visualization of clusters
myCol <- c("darkblue", "purple", "green", "orange")
scatter(dapc1, scree.da = FALSE, bg="white", pch=20, cell=0, cstar=0, col=myCol, solid=0.4, cex=1, clab=0, leg=TRUE, txt.leg = (paste("Cluster", 1:4)))
##########
## Tajimas'D with popGenome
# list of populations
populations <- list("paniscus","troglodytes_ellioti","troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus")
# create a GENOME class object with vcf file
genome <- readData(path = "data/", format = "VCF")
# function that create a list, which element is the ensemble of individual for each pop
listing_ind <- function(npan, nellio, nschwein, ntroglo, nverus){
pan_vec <- c(rep("NA", npan))
ellio_vec <- c(rep("NA", nellio))
schwein_vec <- c(rep("NA", nschwein))
troglo_vec <- c(rep("NA", ntroglo))
verus_vec <- c(rep("NA", nverus))
for (i in 1:npan){
pan_vec[i] <- paste0("pan_", i)
}
for (i in 1:nellio){
ellio_vec[i] <- paste0("ellioti_", i)
}
for (i in 1:nschwein){
schwein_vec[i] <- paste0("schwein_", i)
}
for (i in 1:ntroglo){
troglo_vec[i] <- paste0("troglo_", i)
}
for (i in 1:nverus){
verus_vec[i] <- paste0("verus_", i)
}
populations <- list(pan_vec, ellio_vec, schwein_vec, troglo_vec, verus_vec)
}
# apply listing_ind with the rignt number of individuals for each pop
# and save it in the object 'populations'
populations <- listing_ind(11, 10, 6, 4, 5)
# set populations in the genome object
genome <- set.populations(genome, new.populations = populations, diploid = TRUE)
# check
genome@populations
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment