Skip to content
Snippets Groups Projects
Commit 4fb66391 authored by jplantad's avatar jplantad
Browse files

General script that iterates population genomiques analyses on every gene, and...

General script that iterates population genomiques analyses on every gene, and report generating scripts
parent 2218b3c5
No related branches found
No related tags found
No related merge requests found
####################################
## Population genetics : loop script
## J. Plantade
## Feb 2021
####################################
# packages
library(ade4)
library(pegas)
library(adegenet)
library(PopGenome)
library(hierfstat)
library(ggplot2)
library(poppr)
library(rmarkdown)
library(knitr)
library(stringr)
library(dplyr)
########################### PAN ################################################
n_species <- 5
# listing pan files in the data folder
files <- list.files("data_fixed_999/", pattern = "pan")
# extract the gene name for each pan file
genes <- NULL
for (i in 1:length(files)){
tmp <- str_split(files[i], pattern = "_")
genes[i] <- tmp[[1]][2]
}
# create a empty matrix to store the Levene test results
df_levene <- matrix(NA, nrow = length(genes)*n_species, ncol = 4)
colnames(df_levene) <- c("gene", "species", "Fstatistic", "pvalue")
df_levene[,1] <- rep(genes, rep(n_species, length(genes)))
df_levene[,2] <- rep(c("paniscus", "t.troglodytes", "t.ellioti", "t.schweinwurthii", "t.verus"), length(genes))
# set the start row number of each gene in df_levene (5 to 5)
tmp <- seq(0, nrow(df_levene)-n_species, n_species)
df_levene_start <- cbind(genes, tmp)
#for popGenome :
#listing the folders in the popgenome_data folder
popfold <- list.files(path = "./popgenome_data/pan/")
# create an empty data frame to collect all the Tajima's D values
tajima_df <- matrix(nrow = length(genes), ncol = 5)
row.names(tajima_df) <- genes
colnames(tajima_df) <- c("paniscus", "t.troglodytes", "t.ellioti", "t.schweinwurthii", "t.verus")
###############
# gene report #
###############
for (i in 1:length(genes)){
#################### data for pegas and adegenet packages
# get the gene name
gene_name <- genes[i]
# get the pan file name
file_name <- files[i]
# df_levene start row
a <- as.numeric(df_levene_start[i,2])
# load data
loci <- read.vcf(paste0("data_fixed_999/", file_name))
locinfo<-VCFloci(paste0("data_fixed_999/", file_name), what = "all", chunk.size = 1e9, quiet = FALSE)
#################### data for popGenome package
genome <- readData(path = paste0("popgenome_data/pan/", popfold[i]), format = "VCF")
#################### gene report
# launch the Rmd script that compute all calculation and save the report
render(input = "report_pan.Rmd", output_file = paste0("report/", gene_name, "_pan.pdf"))
}
# save the Levene test results
df_levene <- as.data.frame(df_levene)
write.table(df_levene, file ="levene_test/pan_levene.txt", quote = FALSE, na = "NaN", row.names = FALSE, col.names = TRUE)
# save the Tajima's D values
tajima_df <- as.data.frame(tajima_df)
write.table(tajima_df, file = "./tajimaD/pan_tajimaD_999.txt", quote = FALSE, na = "NaN", row.names = TRUE, col.names = TRUE)
################################## GOR #########################################
# listing gor files in the data folder
files <- list.files("data_fixed_999/", pattern = "gor")
# extract the gene name for each pan file
genes <- NULL
for (i in 1:length(files)){
tmp2 <- str_split(files[i], pattern = "_")
genes[i] <- tmp2[[1]][2]
}
# some VCF are empty for gor, we need to remove them
names(files) <- genes
files_suppressed <- c("Apobec3H", "CBX1.4full", "CBX3.5csd", "CBX5.1csd", "CXCR4")
for (i in 1:length(files_suppressed)){
files <- files[names(files)!=files_suppressed[i]]
}
# rerun line 90-91 to update 'genes'
# create a empty matrix to store the Levene test results
df_levene <- matrix(NA, nrow = length(genes), ncol = 4)
colnames(df_levene) <- c("gene", "species", "Fstatistic", "pvalue")
df_levene[,1] <- genes
df_levene[,2] <- rep(c("g.gorilla"), length(genes))
#for popGenome :
#listing the folders in the popgenome_data folder
popfold <- list.files(path = "./popgenome_data/gor/")
# create an empty data frame to collect all the Tajima's D values
tajima_df <- matrix(nrow = length(genes), ncol = 2)
tajima_df <- as.data.frame(tajima_df)
###############
# gene report #
###############
for (i in 1:length(genes)){
#################### data for pegas and adegenet packages
# get the gene name
gene_name <- genes[i]
# get the pan file name
file_name <- files[i]
# df_levene start row
a <- i
# load data
loci <- read.vcf(paste0("data_fixed_999/", file_name))
locinfo<-VCFloci(paste0("data_fixed_999/", file_name), what = "all", chunk.size = 1e9, quiet = FALSE)
#################### data for popGenome package
# load data for popGenome package
genome <- readData(path = paste0("popgenome_data/gor/", popfold[i]), format = "VCF")
#################### gene report
# launch the Rmd script that compute all calculation and save the report
render(input = "report_gor.Rmd", output_file = paste0("report/", gene_name, "_gor.pdf"))
}
# save the Levene test results
df_levene <- as.data.frame(df_levene)
write.table(df_levene, file ="levene_test/gor_levene.txt", quote = FALSE, na = "NaN", row.names = FALSE, col.names = TRUE)
# save the Tajima's D values
tajima_df <- as.data.frame(tajima_df[,1])
row.names(tajima_df) <- genes
colnames(tajima_df) <- c("Gorilla_gorilla_gorilla")
# save the Tajima's D values
write.table(tajima_df, file = "./tajimaD/gor_tajimaD_999.txt", quote = FALSE, na = "NaN", row.names = TRUE, col.names = TRUE)
################################## PONGO #########################################
n_species <- 2
# listing pongo files in the data folder
files <- list.files("data_fixed_999/", pattern = "pon")
# extract the gene name for each pan file
genes <- NULL
for (i in 1:length(files)){
tmp2 <- str_split(files[i], pattern = "_")
genes[i] <- tmp2[[1]][2]
}
# create a empty matrix to store the Levene test results
df_levene <- matrix(NA, nrow = length(genes)*n_species, ncol = 4)
colnames(df_levene) <- c("gene", "species", "Fstatistic", "pvalue")
df_levene[,1] <- rep(genes, rep(n_species, length(genes)))
df_levene[,2] <- rep(c("abelii", "pygmaeus"), length(genes))
# set the start row number of each gene in df_levene (5 to 5)
tmp <- seq(0, nrow(df_levene)-n_species, n_species)
df_levene_start <- cbind(genes, tmp)
#for popGenome :
#listing the folders in the popgenome_data folder
popfold <- list.files(path = "./popgenome_data/pon/")
# create an empty data frame to collect all the Tajima's D values
tajima_df <- matrix(nrow = length(genes), ncol = 2)
tajima_df <- as.data.frame(tajima_df)
###############
# gene report #
###############
for (i in 1:length(genes)){
#################### data for pegas and adegenet packages
# get the gene name
gene_name <- genes[i]
# get the pan file name
file_name <- files[i]
# df_levene start row
a <- as.numeric(df_levene_start[i,2])
# load data
loci <- read.vcf(paste0("data_fixed_999/", file_name))
locinfo<-VCFloci(paste0("data_fixed_999/", file_name), what = "all", chunk.size = 1e9, quiet = FALSE)
#################### data for popGenome package
# load data for popGenome package
genome <- readData(path = paste0("popgenome_data/pon/", popfold[i]), format = "VCF")
#################### gene report
# launch the Rmd script that compute all calculation and save the report
render(input = "report_pon.Rmd", output_file = paste0("report/", gene_name, "_pon.pdf"))
}
# save the Levene test results
df_levene <- as.data.frame(df_levene)
write.table(df_levene, file ="levene_test/pon_levene.txt", quote = FALSE, na = "NaN", row.names = FALSE, col.names = TRUE)
# save the Tajima's D values
row.names(tajima_df) <- genes
colnames(tajima_df) <- c("Pongo_abelii", "Pongo_pygmaeus")
# save the Tajima's D values
write.table(tajima_df, file = "./tajimaD/pon_tajimaD_999.txt", quote = FALSE, na = "NaN", row.names = TRUE, col.names = TRUE)
---
title: "Population genetics analysis of Gorilla genus populations"
author: "J. Plantade"
date: "2021"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(ade4)
library(pegas)
library(adegenet)
library(PopGenome)
library(hierfstat)
library(stringr)
library(ggplot2)
library(poppr)
library(rmarkdown)
library(gridExtra)
library(knitr)
library(tidyr)
library(lawstat)
library(imager)
```
```{r data setting, echo=FALSE, warning=FALSE, message=FALSE, comment=NA}
nloci <- length(loci)
# convert loci object into a genind object
genind <- loci2genind(loci)
# add population factor
subspecies <- c("Gorilla_beringei_graueri", "Gorilla_gorilla_gorilla")
nindiv <- c(1, 25)
pop(genind)<-rep(subspecies, nindiv)
# get the diversity parameters
div <- summary(genind)
```
# Informations
This document presents information about `r gene_name` gene in 2 Gorilla species. `r nloci` loci are analyzed.
```{r populations, echo=FALSE}
tmp <- str_replace(subspecies[], pattern = "_", replacement = " ")
tmp2 <- c("NI", "SIVgor+")
tab <- as.data.frame(cbind(tmp, nindiv, tmp2))
colnames(tab) <- c("Species", "nb of individuals", "SIV infection")
kable(tab, caption = "Gorilla data details with infectious status. NI means non infected. ")
```
# Heterozygosity
## Observed and expected heterozygosity
```{r heterozygosity1, echo=FALSE, out.width="70%", warning=FALSE, message=FALSE, fig.align='center', fig.show='hold', fig.cap="Difference between Hobs and Hexp for each SNP position."}
# create one genind object for each population
pop <- c("graueri", "gorilla")
list_genind <- list()
for (i in 1:length(pop)){
name <- pop[i]
tmp <- popsub(genind, sublist = c(subspecies[i]))
list_genind[[name]] <- tmp
}
# extracting diversity summary for each pop
list_div <- list()
for (i in 1:length(list_genind)){
name <- pop[i]
tmp <- summary(list_genind[[i]])
list_div[[name]] <- tmp
}
# extract the diff between Hobs and Hexp for each pop
tab_Hdiff <- matrix(NA, nrow = nloci, ncol = length(pop)+1)
tab_Hdiff[,1] <- locinfo$POS
colnames(tab_Hdiff) <- c("pos", pop)
for (i in 1:length(list_div)){
H_diff <- list_div[[i]][["Hobs"]]-list_div[[i]][["Hexp"]]
tab_Hdiff[,i+1] <- H_diff
}
tab_Hdiff <- as.data.frame(tab_Hdiff)
# import the coding exons coordinates if they exist and build ggplots
if (file.exists(paste0("./coordinates/", gene_name, "_codingexons_gor.txt"))==TRUE){
# import the coding exons coordinates
coord <- read.table(paste0("coordinates/", gene_name, "_codingexons_gor.txt"), header = FALSE)
n <- length(coord[,1])
# set values for the height of rectangles
ymin <- c(rep(-0.5, n))
ymax <- c(rep(0.5, n))
# set values for the length of rectangles
start <- coord[,2]
end <- coord[,3]
# merge into a data frame
tab_coord <- as.data.frame(cbind(ymin, ymax, start, end))
# create a list containing ggplots with exons
list_gg_H <- list()
for (i in 2:length(tab_Hdiff)){
name <- colnames(tab_Hdiff)[i]
tmp <- ggplot() +
geom_rect(data = tab_coord, aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') +
geom_point(data = tab_Hdiff, aes_string(x = tab_Hdiff[,1], y = name), size = 0.1) +
geom_hline(yintercept=0, linetype="dashed", color = "blue", size = 0.1) +
labs(x = "position", y = "Hobs-Hexp", title = name) +
theme_classic()
list_gg_H[[name]] <- tmp
}
#
message <- "The coding exons coordinates used here come from the UCSC database. They are displayed by the gray zones."
} else {
# create a list containing ggplots without exons
list_gg_H <- list()
for (i in 2:length(tab_Hdiff)){
name <- colnames(tab_Hdiff)[i]
tmp <- ggplot() +
geom_point(data = tab_Hdiff, aes_string(x = tab_Hdiff[,1], y = name), size = 0.1) +
geom_hline(yintercept=0, linetype="dashed", color = "blue", size = 0.1) +
labs(x = "position", y = "Hobs-Hexp", title = name) +
theme_classic()
list_gg_H[[name]] <- tmp
}
#
message <- "The coding exons coordinates were not available on the UCSC database."
}
# diplay plots
list_gg_H[[2]]
```
`r message`
## Homoscedasticity
In the previous section, we see that observed heterozygosity is not always similar to the expected one. But this is a visual conclusion. To test this conclusion we compare the variance of distribution between the observed heterozygosity and the expected one. Because heterozygosity is not always normally distributed we use the Levene test that is more robust than the Bartlett test when normality is not respected.
```{r levene, echo=FALSE, warning=FALSE, message=FALSE, comment=NA}
# create an empty list
list_test <- vector(mode = "list", length = length(list_div))
# loop : extract and test Hobs and Hexp
obs <- list_div[[2]][["Hobs"]]
exp <- list_div[[2]][["Hexp"]]
H <- cbind(obs, exp)
H <- as.data.frame(H)
colnames(H) <- c("Hobs", "Hexp")
H <- H %>%
pivot_longer("Hobs":"Hexp", names_to = "origin", values_to = "H_value")
test <- levene.test(y = H$H_value, group = H$origin)
# build test table
tab_test <- matrix(NA, nrow = 1, ncol = 2)
rownames(tab_test) <- c("Gor gorilla gorilla")
colnames(tab_test) <- c("Fstatistic", "p value")
# fill df_levene for results summary
df_levene[a,3] <- test[["statistic"]]
df_levene[a,4] <- test[["p.value"]]
# fill tab_test and display it
tab_test[1,1] <- test[["statistic"]]
tab_test[1,2] <- test[["p.value"]]
tab_test <- format(tab_test, digits = 3, scientific = TRUE)
knitr::kable(tab_test, caption = "Levene test results")
```
\newpage
# Tajima's D
Tajima's D display the difference between the observed diversity and the diversity expected under neutral evolution. A D value equal to 0 means that the gene is only driven by evolutionary drift. If D value diverts from 0, the gene is evolving under a non-random process (selection).
Here, *Gorilla beringei graueri* is defined as the outgroup. Therefore we obtain D values for *Gorilla gorilla gorilla* only.
```{r tajima1, echo=FALSE, warning=FALSE, message=FALSE, comment=NA, results='hide', fig.keep='all', fig.align='center', fig.cap=paste("Tajima's D values for", gene_name, "gene"), out.width = "60%"}
# set the populations in the genome object
pops <- get.individuals(genome)[[1]]
beringei <- pops[grep("Gorilla_beringei_graueri",pops)]
gorilla <- pops[grep("Gorilla_gorilla_gorilla",pops)]
genome <- set.populations(genome, list(beringei, gorilla))
# set the outgroup
genome <- set.outgroup(genome, beringei)
# neutrality tests
genome <- neutrality.stats(genome)
# save the Tajima's D values in tajima_df
taj <- genome@Tajima.D
tajima_df[i,] <- genome@Tajima.D
# build a data frame with D values and plot it
taj <- as.data.frame(t(taj))
taj <- cbind(taj, pop = c("Gorilla_beringei_graueri", "Gorilla_gorilla_gorilla"))
colnames(taj)[1] <- c("D_value")
taj <- taj %>% drop_na()
gg_D <- ggplot(data = taj, mapping = aes(x = D_value, y = pop)) + geom_point() + labs(x = "D value", y = "populations")
gg_D
```
\ No newline at end of file
---
title: "Population genetics analysis of Pan genus populations"
author: "J. Plantade"
date: "02/2021"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(ade4)
library(pegas)
library(adegenet)
library(PopGenome)
library(hierfstat)
library(stringr)
library(ggplot2)
library(poppr)
library(rmarkdown)
library(gridExtra)
library(knitr)
library(tidyr)
library(lawstat)
library(imager)
```
```{r data setting, echo=FALSE, warning=FALSE, message=FALSE, comment=NA}
nloci <- length(loci)
# convert loci object into a genind object
genind <- loci2genind(loci)
# add population factor
subspecies <- c("paniscus","troglodytes_ellioti","troglodytes_schweinfurthii","troglodytes_troglodytes","troglodytes_verus")
nindiv <- c(11, 10, 6, 4, 5)
pop(genind)<-rep(subspecies, nindiv)
# get the diversity parameters
div <- summary(genind)
```
# Informations
This document presents information about `r gene_name` gene in 5 Pan species and subspecies. `r nloci` loci are analyzed.
```{r populations, echo=FALSE}
tmp <- str_replace(subspecies[], pattern = "_", replacement = " ")
tmp2 <- paste("Pan", tmp[])
tmp3 <- c("NI", "NI", "SIVcpz+", "SIVcpz+", "NI")
tab <- as.data.frame(cbind(tmp2, nindiv, tmp3))
colnames(tab) <- c("Species/subspecies", "nb of individuals", "SIV infection")
kable(tab, caption = "Pan data details with infectious status. NI means non infected.")
```
# Heterozygosity
## Observed and expected heterozygosity
```{r heterozygosity1, echo=FALSE, out.width="60%", warning=FALSE, message=FALSE, fig.align='center', fig.show='hold', fig.cap="Difference between Hobs and Hexp for each SNP position."}
# create one genind object for each population
pop <- c("pan", "ellioti", "schwein", "troglo", "verus")
list_genind <- list()
for (i in 1:length(pop)){
name <- pop[i]
tmp <- popsub(genind, sublist = c(subspecies[i]))
list_genind[[name]] <- tmp
}
# extracting diversity summary for each pop
list_div <- list()
for (i in 1:length(list_genind)){
name <- pop[i]
tmp <- summary(list_genind[[i]])
list_div[[name]] <- tmp
}
# extract the diff between Hobs and Hexp for each pop
tab_Hdiff <- matrix(NA, nrow = nloci, ncol = length(pop)+1)
tab_Hdiff[,1] <- locinfo$POS
colnames(tab_Hdiff) <- c("pos", pop)
for (i in 1:length(list_div)){
H_diff <- list_div[[i]][["Hobs"]]-list_div[[i]][["Hexp"]]
tab_Hdiff[,i+1] <- H_diff
}
tab_Hdiff <- as.data.frame(tab_Hdiff)
# import the coding exons coordinates if they exist and build ggplots
if (file.exists(paste0("./coordinates/", gene_name, "_codingexons_pan.txt"))==TRUE){
# import the coding exons coordinates
coord <- read.table(paste0("coordinates/", gene_name, "_codingexons_pan.txt"), header = FALSE)
n <- length(coord[,1])
# set values for the height of rectangles
ymin <- c(rep(-0.5, n))
ymax <- c(rep(0.5, n))
# set values for the length of rectangles
start <- coord[,2]
end <- coord[,3]
# merge into a data frame
tab_coord <- as.data.frame(cbind(ymin, ymax, start, end))
# create a list containing ggplots with exons
list_gg_H <- list()
for (i in 2:length(tab_Hdiff)){
name <- colnames(tab_Hdiff)[i]
tmp <- ggplot() +
geom_rect(data = tab_coord, aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') +
geom_point(data = tab_Hdiff, aes_string(x = tab_Hdiff[,1], y = name), size = 0.1) +
geom_hline(yintercept=0, linetype="dashed", color = "blue", size = 0.1) +
labs(x = "position", y = "Hobs-Hexp", title = name) +
theme_classic()
list_gg_H[[name]] <- tmp
}
#
message <- "The coding exons coordinates used here come from the UCSC database. They are displayed by the gray zones."
} else {
# create a list containing ggplots without exons
list_gg_H <- list()
for (i in 2:length(tab_Hdiff)){
name <- colnames(tab_Hdiff)[i]
tmp <- ggplot() +
geom_point(data = tab_Hdiff, aes_string(x = tab_Hdiff[,1], y = name), size = 0.1) +
geom_hline(yintercept=0, linetype="dashed", color = "blue", size = 0.1) +
labs(x = "position", y = "Hobs-Hexp", title = name) +
theme_classic()
list_gg_H[[name]] <- tmp
}
#
message <- "The coding exons coordinates were not available on the UCSC database."
}
# diplay plots
list_gg_H[[1]]
list_gg_H[[2]]
list_gg_H[[3]]
```
```{r heterozygosity2, echo=FALSE, out.width="60%", warning=FALSE, message=FALSE, fig.align='center', fig.show='hold', fig.cap="Difference between Hobs and Hexp for each SNP position"}
list_gg_H[[4]]
list_gg_H[[5]]
```
`r message`
\newpage
## Homoscedasticity
In the previous section, we see that observed heterozygosity is not always similar to the expected one. But this is a visual conclusion. To test this conclusion we compare the variance of distribution between the observed heterozygosity and the expected one. Because heterozygosity is not always normally distributed we use the Levene test that is more robust than the Bartlett test when normality is not respected.
```{r levene, echo=FALSE, warning=FALSE, message=FALSE, comment=NA}
# create an empty list
list_test <- vector(mode = "list", length = length(list_div))
# loop : extract and test Hobs and Hexp
for (i in 1:length(list_div)){
obs <- list_div[[i]][["Hobs"]]
exp <- list_div[[i]][["Hexp"]]
H <- cbind(obs, exp)
H <- as.data.frame(H)
colnames(H) <- c("Hobs", "Hexp")
H <- H %>%
pivot_longer("Hobs":"Hexp", names_to = "origin", values_to = "H_value")
tmp <- levene.test(y = H$H_value, group = H$origin)
list_test[[i]] <- tmp
names(list_test[[i]]) <- paste0(pop[i], "_test")
}
# build test table
tab_test <- matrix(NA, nrow = length(list_test), ncol = 2)
rownames(tab_test) <- pop
colnames(tab_test) <- c("Fstatistic", "p value")
for (i in 1:length(list_test)){
# fill little table for the report
tab_test[i,1] <- list_test[[i]][[1]]
tab_test[i,2] <- list_test[[i]][[2]]
# fill df_levene for results summary
df_levene[a+i,3] <- list_test[[i]][[1]]
df_levene[a+i,4] <- list_test[[i]][[2]]
}
tab_test <- format(tab_test, digits = 1, scientific = FALSE)
knitr::kable(tab_test, caption = "Levene test results for each species/subspecies")
```
# Cluster building
In order to detect a particular population structure based on the variants, we build virtual clusters and compare them with taxonomy information and infectious status. The number of cluster (k) is chosen as the value for which the BIC curve shows an elbow (a higher value of k would not provide much more information).
```{r cluster, echo=FALSE, warning=FALSE, message=FALSE, comment=NA}
if (file.exists(paste0("dapc_figures/pan/elbow/", gene_name, "_dapc_figure.png"))==TRUE){
image_path <- paste0("dapc_figures/pan/elbow/", gene_name, "_dapc_figure.png")
}
```
![Figure describing the partition of individuals in the different clusters, with their species/subspecies and their infectious status.](`r image_path`)
# Fixation index (Fst) for pairs of populations
```{r Fst1, echo=FALSE, warning=FALSE, message=FALSE, comment=NA}
# create genind objects with pair of populations
list_pairwise <- list()
for (i in 1:4){
for (j in 1:(5-i)){
# subset genind object (2 pop)
tmp <- popsub(genind, sublist = c(subspecies[i], subspecies[i+j]))
# create the pair name
pairname <- paste0(pop[i], "_", pop[i+j])
# add the new genind object to the list
list_pairwise[[pairname]] <- tmp
}
}
# calcul FST values pairwise
tab_fstpair <- matrix(NA, nrow = nloci, ncol = length(list_pairwise))
colnames(tab_fstpair) <- names(list_pairwise)
for (i in 1:length(list_pairwise)){
tmp <- Fst(as.loci(list_pairwise[[i]]), na.alleles = "")
# extract FST values that are in the second column
tab_fstpair[,i] <- tmp[,2]
}
tab_fstpair <- cbind(tab_fstpair, pos = locinfo$POS)
tab_fstpair <- as.data.frame(tab_fstpair)
```
```{r Fst2, echo=FALSE, warning=FALSE, message=FALSE, comment=NA, out.width='50%', fig.align='center', fig.cap="Fst values in function of SNP position"}
# plot FST values for each pair of pop
# create a function that create ggplots
if (file.exists(paste0("./coordinates/", gene_name, "_codingexons_pan.txt"))==TRUE){
n <- length(coord[,1])
# set values for the height of rectangles
ymin <- c(rep(-0.5, n))
ymax <- c(rep(1.1, n))
# set values for the length of rectangles
start <- coord[,2]
end <- coord[,3]
# merge into a data frame
tab_coord <- as.data.frame(cbind(ymin, ymax, start, end))
list_gg_fstpair <- list()
# fill the list with ggplots with exons
for (i in 1:(length(tab_fstpair)-1)){
name <- colnames(tab_fstpair)[i]
tmp <- ggplot() +
geom_rect(data = tab_coord, aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') +
geom_point(data = tab_fstpair, aes_string(x = tab_fstpair[,11], y = name), size = 0.1) +
geom_hline(yintercept=c(0, 0.05, 0.15, 0.25), linetype="dashed", color = "blue", size = 0.1) +
labs(x = "position", y = "Fst", title = name) +
theme_classic()
list_gg_fstpair[[name]] <- tmp
}
} else {
list_gg_fstpair <- list()
# fill the list with ggplots without exons
for (i in 1:(length(tab_fstpair)-1)){
name <- colnames(tab_fstpair)[i]
tmp <- ggplot() +
geom_point(data = tab_fstpair, aes_string(x = tab_fstpair[,11], y = name), size = 0.1) +
geom_hline(yintercept=c(0, 0.05, 0.15, 0.25), linetype="dashed", color = "blue", size = 0.1) +
labs(x = "position", y = "Fst", title = name) +
theme_classic()
list_gg_fstpair[[name]] <- tmp
}
}
# plot
list_gg_fstpair[[1]]
list_gg_fstpair[[2]]
list_gg_fstpair[[3]]
list_gg_fstpair[[4]]
list_gg_fstpair[[5]]
list_gg_fstpair[[6]]
list_gg_fstpair[[7]]
list_gg_fstpair[[8]]
list_gg_fstpair[[9]]
list_gg_fstpair[[10]]
```
\newpage
# Tajima's D
Tajima's D display the difference between the observed diversity and the diversity expected under neutral evolution. A D value equal to 0 means that the gene is only driven by evolutionary drift. If D value diverts from 0, the gene is evolving under a non-random process (selection).
```{r tajima1, echo=FALSE, warning=FALSE, message=FALSE, comment=NA, results='hide', fig.keep='all', fig.align='center', fig.cap=paste("Tajima's D values for", gene_name, "gene"), out.width = "60%"}
# set the populations in the genome object
pops <- get.individuals(genome)[[1]]
paniscus <- pops[grep("Pan_paniscus",pops)]
ellioti <- pops[grep("Pan_troglodytes_ellioti",pops)]
schwein <- pops[grep("Pan_troglodytes_schweinfurthii",pops)]
troglo <- pops[grep("Pan_troglodytes_troglodytes",pops)]
verus <- pops[grep("Pan_troglodytes_verus",pops)]
genome <- set.populations(genome, list(paniscus, troglo, ellioti, schwein, verus))
# set the outgroup
genome <- set.outgroup(genome, paniscus)
# neutrality tests
genome <- neutrality.stats(genome)
# save the Tajima's D values in big tajima_df
taj <- genome@Tajima.D
tajima_df[i,] <- taj
# build a data frame with D values and plot it
taj <- as.data.frame(t(taj))
taj <- cbind(taj, pop = c("paniscus", "t. troglodytes", "t. ellioti", "t. schweinwurthii", "t. verus"))
colnames(taj)[1] <- c("D_value")
taj <- taj %>% drop_na()
gg_D <- ggplot(data = taj, mapping = aes(x = D_value, y = pop)) + geom_point() + labs(x = "D value", y = "populations")
gg_D
```
\ No newline at end of file
---
title: "Population genetics analysis of Pongo genus populations"
author: "Julie Plantade"
date: "2021"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(ade4)
library(pegas)
library(adegenet)
library(PopGenome)
library(hierfstat)
library(stringr)
library(ggplot2)
library(poppr)
library(rmarkdown)
library(gridExtra)
library(knitr)
library(tidyr)
library(lawstat)
library(imager)
```
```{r data setting, echo=FALSE, warning=FALSE, message=FALSE, comment=NA}
nloci <- length(loci)
# convert loci object into a genind object
genind <- loci2genind(loci)
# add population factor
subspecies <- c("Pongo_abelii", "Pongo_pygmaeus")
nindiv <- c(5, 5)
pop(genind)<-rep(subspecies, nindiv)
# get the diversity parameters
div <- summary(genind)
```
# Informations
This document presents information about `r gene_name` gene in 2 Pongo species. `r nloci` loci are analyzed.
```{r populations, echo=FALSE}
tmp <- str_replace(subspecies[], pattern = "_", replacement = " ")
tmp2 <- c("NI", "NI")
tab <- as.data.frame(cbind(tmp, nindiv, tmp2))
colnames(tab) <- c("Species", "nb of individuals", "SIV infection")
kable(tab, caption = "Pongo data details with infectious status. NI means non infected.")
```
# Heterozygosity
## Observed and expected heterozygosity
```{r heterozygosity1, echo=FALSE, out.width="70%", warning=FALSE, message=FALSE, fig.align='center', fig.show='hold', fig.cap="Difference between Hobs and Hexp for each SNP position."}
# create one genind object for each population
pop <- c("abelii", "pygmaeus")
list_genind <- list()
for (i in 1:length(pop)){
name <- pop[i]
tmp <- popsub(genind, sublist = c(subspecies[i]))
list_genind[[name]] <- tmp
}
# extracting diversity summary for each pop
list_div <- list()
for (i in 1:length(list_genind)){
name <- pop[i]
tmp <- summary(list_genind[[i]])
list_div[[name]] <- tmp
}
# extract the diff between Hobs and Hexp for each pop
tab_Hdiff <- matrix(NA, nrow = nloci, ncol = length(pop)+1)
tab_Hdiff[,1] <- locinfo$POS
colnames(tab_Hdiff) <- c("pos", pop)
for (i in 1:length(list_div)){
H_diff <- list_div[[i]][["Hobs"]]-list_div[[i]][["Hexp"]]
tab_Hdiff[,i+1] <- H_diff
}
tab_Hdiff <- as.data.frame(tab_Hdiff)
# import the coding exons coordinates if they exist and build ggplots
if (file.exists(paste0("./coordinates/", gene_name, "_codingexons_pon.txt"))==TRUE){
# import the coding exons coordinates
coord <- read.table(paste0("coordinates/", gene_name, "_codingexons_pon.txt"), header = FALSE)
n <- length(coord[,1])
# set values for the height of rectangles
ymin <- c(rep(-0.5, n))
ymax <- c(rep(0.5, n))
# set values for the length of rectangles
start <- coord[,2]
end <- coord[,3]
# merge into a data frame
tab_coord <- as.data.frame(cbind(ymin, ymax, start, end))
# create a list containing ggplots with exons
list_gg_H <- list()
for (i in 2:length(tab_Hdiff)){
name <- colnames(tab_Hdiff)[i]
tmp <- ggplot() +
geom_rect(data = tab_coord, aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') +
geom_point(data = tab_Hdiff, aes_string(x = tab_Hdiff[,1], y = name), size = 0.1) +
geom_hline(yintercept=0, linetype="dashed", color = "blue", size = 0.1) +
labs(x = "position", y = "Hobs-Hexp", title = name) +
theme_classic()
list_gg_H[[name]] <- tmp
}
#
message <- "The coding exons coordinates used here come from the UCSC database. They are displayed by the gray zones."
} else {
# create a list containing ggplots without exons
list_gg_H <- list()
for (i in 2:length(tab_Hdiff)){
name <- colnames(tab_Hdiff)[i]
tmp <- ggplot() +
geom_point(data = tab_Hdiff, aes_string(x = tab_Hdiff[,1], y = name), size = 0.1) +
geom_hline(yintercept=0, linetype="dashed", color = "blue", size = 0.1) +
labs(x = "position", y = "Hobs-Hexp", title = name) +
theme_classic()
list_gg_H[[name]] <- tmp
}
#
message <- "The coding exons coordinates were not available on the UCSC database."
}
# diplay plots
list_gg_H[[1]]
list_gg_H[[2]]
```
`r message`
\newpage
## Homoscedasticity
In the previous section, we see that observed heterozygosity is not always similar to the expected one. But this is a visual conclusion. To test this conclusion we compare the variance of distribution between the observed heterozygosity and the expected one. Because heterozygosity is not always normally distributed we use the Levene test that is more robust than the Bartlett test when normality is not respected.
```{r levene, echo=FALSE, warning=FALSE, message=FALSE, comment=NA}
# create an empty list
list_test <- vector(mode = "list", length = length(list_div))
# loop : extract and test Hobs and Hexp
for (i in 1:length(list_div)){
obs <- list_div[[i]][["Hobs"]]
exp <- list_div[[i]][["Hexp"]]
H <- cbind(obs, exp)
H <- as.data.frame(H)
colnames(H) <- c("Hobs", "Hexp")
H <- H %>%
pivot_longer("Hobs":"Hexp", names_to = "origin", values_to = "H_value")
tmp <- levene.test(y = H$H_value, group = H$origin)
list_test[[i]] <- tmp
names(list_test[[i]]) <- paste0(pop[i], "_test")
}
# build test table
tab_test <- matrix(NA, nrow = length(list_test), ncol = 2)
rownames(tab_test) <- pop
colnames(tab_test) <- c("Fstatistic", "p value")
for (i in 1:length(list_test)){
# fill little table for the report
tab_test[i,1] <- list_test[[i]][[1]]
tab_test[i,2] <- list_test[[i]][[2]]
# fill df_levene for results summary
df_levene[a+i,3] <- list_test[[i]][[1]]
df_levene[a+i,4] <- list_test[[i]][[2]]
}
tab_test <- format(tab_test, digits = 1, scientific = FALSE)
knitr::kable(tab_test, caption = "Levene test results for each species/subspecies")
```
# Fixation index (Fst) for pairs of populations
Here we use the Fst value combined with the position to identify nucleotides that seem to be the most structured between the different species.
```{r Fst, echo=FALSE, warning=FALSE, message=FALSE, comment=NA, out.width='50%', fig.align='center', fig.cap="Fst values in function of SNP position"}
tmp <- Fst(as.loci(genind), na.alleles = "")
fst <- as.data.frame(cbind(locinfo$POS, tmp[,2]))
colnames(fst) <- c("pos", "abelii_pygmaeus")
# create a function that create ggplots
if (file.exists(paste0("./coordinates/", gene_name, "_codingexons_pon.txt")) == TRUE){
# import the coding exons coordinates
coord <- read.table(paste0("coordinates/", gene_name, "_codingexons_pon.txt"), header = FALSE)
n <- length(coord[,1])
# set values for the height of rectangles
ymin <- c(rep(-0.5, n))
ymax <- c(rep(1.1, n))
# set values for the length of rectangles
start <- coord[,2]
end <- coord[,3]
# merge into a data frame
tab_coord <- as.data.frame(cbind(ymin, ymax, start, end))
ggplot() +
geom_rect(data = tab_coord, aes(xmin = start, ymin = ymin, xmax = end, ymax = ymax), fill = 'grey') +
geom_point(data = fst, aes_string(x = fst$pos, y = fst$abelii_pygmaeus), size = 0.1) +
geom_hline(yintercept=c(0, 0.05, 0.15, 0.25), linetype="dashed", color = "blue", size = 0.1) +
labs(x = "position", y = "Fst", title = colnames(fst)[2]) +
theme_classic()
} else {
ggplot() +
geom_point(data = fst, aes_string(x = fst$pos, y = fst$abelii_pygmaeus), size = 0.1) +
geom_hline(yintercept=c(0, 0.05, 0.15, 0.25), linetype="dashed", color = "blue", size = 0.1) +
labs(x = "position", y = "Fst", title = colnames(fst)[2]) +
theme_classic()
}
```
\newpage
# Tajima's D
Tajima's D display the difference between the observed diversity and the diversity expected under neutral evolution. A D value equal to 0 means that the gene is only driven by evolutionary drift. If D value diverts from 0, the gene is evolving under a non-random process (selection).
```{r tajima1, echo=FALSE, warning=FALSE, message=FALSE, comment=NA, fig.align='center', fig.cap=paste("Tajima's D values for", gene_name, "gene"), out.width = "60%"}
# set the populations in the genome object
pops <- get.individuals(genome)[[1]]
abelii <- pops[grep("Pongo_abelii",pops)]
pygmaeus <- pops[grep("Pongo_pygmaeus",pops)]
genome <- set.populations(genome, list(abelii, pygmaeus))
# neutrality tests
genome <- neutrality.stats(genome)
# save the Tajima's D values in tajima_df
taj <- genome@Tajima.D
tajima_df[i,] <- genome@Tajima.D
# build a data frame with D values
taj <- as.data.frame(t(taj))
taj <- cbind(taj, pop = c("Pongo_abelii", "Pongo_pygmaeus"))
colnames(taj)[1] <- c("D_value")
taj <- taj %>% drop_na()
gg_D <- ggplot(data = taj, mapping = aes(x = D_value, y = pop)) + geom_point() + labs(x = "D value", y = "populations")
gg_D
```
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment