Skip to content
Snippets Groups Projects
Verified Commit 07c09e21 authored by nfontrod's avatar nfontrod
Browse files

Initial commit

parents
Branches
No related tags found
No related merge requests found
Package: TopGOwrapper
Version: 1.0.0
Title: A Wrapper to perform TopGO enrichment analysis
License: AGPL-3
Authors@R: person("Nicolas", "Fontrodona",
email = "nicolas.fontrodona@ens-lyon.fr",
role = c("aut", "cre"))
Description: The package provides a way to easily perform TopGO enrichment analysis.
Depends:
R (>= 4.1.2),
Imports:
topGO (>= 2.46.0),
org.Hs.eg.db (>= 3.14.0),
argparser (>= 0.7.1),
forcats (>= 0.5.1),
readr (>= 2.1.2),
dplyr (>= 1.0.7),
ggplot2 (>= 3.3.5)
LazyData: true
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.0
Suggests:
testthat
This diff is collapsed.
# Generated by roxygen2: do not edit by hand
export(build_figure)
export(cli_function)
export(cli_run_topgo)
export(get_gene_list)
export(load_input)
export(run_topgo)
export(top_go_analysis)
import(argparser)
import(dplyr)
import(forcats)
import(ggplot2)
import(org.Hs.eg.db)
import(readr)
import(topGO)
#' Produce a figure displaying the top enriched GO terms
#'
#' Keeps the GO terms whose p-value (column `fish`) is at most 0.05, draws
#' them as a dot plot (x axis: -log10(p-value), y axis: GO terms ordered by
#' significance, point size: `Significant` column) and saves it to `outfile`.
#'
#' @param top_go_result dataframe containing top_go results. The columns
#'   `GO.ID`, `Term`, `Significant` and `fish` are used.
#' @param go_term_type The go term type of the figure. "BP" and "MF" are
#'   labelled "Biological Process" and "Molecular Function"; any other value
#'   is labelled "Cellular Component".
#' @param outfile File where the figure will be created
#' @return The value returned by `ggsave()`.
#' @import ggplot2
#' @import dplyr
#' @import forcats
#' @export
build_figure <- function(top_go_result, go_term_type, outfile) {
  go_name <- switch(go_term_type,
    BP = "Biological Process",
    MF = "Molecular Function",
    "Cellular Component"
  )
  # p-values may be written with a leading "< " (e.g. "< 1e-30"); strip it so
  # the column can be converted to numbers.
  top_go_result$fish <- as.numeric(gsub("< ", "", top_go_result$fish))
  plot_data <- top_go_result %>%
    mutate(significance = -log10(fish), ft = paste(GO.ID, ":", Term)) %>%
    filter(fish <= 0.05) %>%
    arrange(desc(significance))
  figure <- ggplot(plot_data, mapping = aes(
    x = significance, y = fct_reorder(ft, significance),
    size = Significant,
    color = Term
  )) +
    geom_point() +
    xlab("-log10(pvalue)") +
    ylab(go_name) +
    theme_bw() +
    theme(
      legend.position = "none"
    )
  # Pass the plot explicitly instead of relying on the global last_plot().
  ggsave(outfile, plot = figure, width = 6, height = 5)
}
#' Load the file containing differentially or not expressed genes
#'
#' Reads a tab-separated file with a header line. When the table has no
#' `gene` column, the row names of the loaded table are used to create it.
#'
#' @param de_file the file containing differentially or not expressed genes
#'
#' @return The loaded file, as a tibble, with a gene column
#'
#' @import dplyr
#' @export
load_input <- function(de_file) {
  # Only the double quote is a quoting character: with the default
  # (quote = "\"'") an apostrophe inside a gene identifier or gene name would
  # open a quoted string and swallow the following fields and rows.
  df <- read.table(de_file, sep = "\t", header = TRUE, quote = "\"")
  if (!"gene" %in% colnames(df)) {
    df$gene <- rownames(df)
  }
  dplyr::as_tibble(df)
}
R/main.R 0 → 100644
#' Perform all go term enrichment analysis
#'
#' Runs `top_go_analysis()` for every combination of GO term type
#' ("BP", "MF", "CC") and regulation ("down", "up", "all"). For each
#' combination, the result table (without its `Expected` column) is written
#' to a tab-separated `.txt` file and a figure is written to a `.pdf` file in
#' `output_folder`.
#'
#' @param de_table tibble containing DESeq2 enrichment score
#' @param output_folder Folder where the results will be created
#' @param id_kind The kind of id to use in the topGO analysis. Must correspond
#' to the ids defined in gene column of de_file
#' @param top The number of most enriched GO term to return
#' @param alpha The padj threshold of a genes to be considered a significant
#' @param log2fc_threshold The threshold above which we want to consider
#' de-genes. It must be positive
#' @return Invisibly, a named list of the tables containing the go enrichment
#' analysis performed by topGO. Names are the output file names without
#' extension.
#' @import dplyr
#' @import readr
#' @export
run_topgo <- function(de_table, output_folder, id_kind, top, alpha,
                      log2fc_threshold) {
  results <- list()
  for (go_term_type in c("BP", "MF", "CC")) {
    for (regulation in c("down", "up", "all")) {
      res <- top_go_analysis(
        de_table, id_kind, go_term_type, regulation, top,
        alpha, log2fc_threshold
      )
      res <- res %>%
        as_tibble() %>%
        select(-Expected)
      if (regulation == "all") {
        tmp_name <- "all_de_genes"
      } else {
        tmp_name <- paste0(regulation, "_regulated_genes")
      }
      stem_name <- paste0(
        tmp_name, "_", go_term_type, "_a", alpha, "_lfct",
        log2fc_threshold, "_top", top
      )
      write_tsv(res, file.path(output_folder, paste0(stem_name, ".txt")))
      build_figure(
        res, go_term_type,
        file.path(output_folder, paste0(stem_name, ".pdf"))
      )
      results[[stem_name]] <- res
    }
  }
  invisible(results)
}
#' Cli TopGO Wrapper
#'
#' @details perform a TopGO analysis using fisher statistics on human go term
#' for:
#' 1. Each go term type: BP, MF, CC
#' 2. Down-regulated, Up-regulated and all differentially expressed genes
#'
#' The output folder is created when it does not exist but its parent does.
#' An error is raised when neither exists, when the folder cannot be created,
#' or when the file given with `--de_file` does not exist.
#'
#' @export
cli_run_topgo <- function() {
  cli <- cli_function()
  if (!dir.exists(cli$output)) {
    if (!dir.exists(dirname(cli$output))) {
      stop(paste0(cli$output, " or its parent must exist !"))
    }
    # Fail here rather than later, when the first result file is written.
    if (!dir.create(cli$output, showWarnings = FALSE)) {
      stop(paste0("Unable to create the folder ", cli$output, " !"))
    }
  }
  if (!file.exists(cli$de_file)) {
    stop("The file --de_file doesn't exists !")
  }
  de_table <- load_input(cli$de_file)
  run_topgo(
    de_table, cli$output, cli$id, cli$top, cli$alpha,
    cli$log2fc_threshold
  )
}
#' Parse the command line arguments
#'
#' Parse the command line arguments and return an object through which all
#' given arguments are stored
#'
#' @return The parsed arguments, with the elements `de_file`, `id`, `output`,
#' `top`, `alpha` and `log2fc_threshold`. An error is raised when `--de_file`
#' is not given.
#' @import argparser
#' @export
cli_function <- function() {
  # Create a parser
  p <- arg_parser(paste0(
    "Wrapper to perform TopGO enrichment analysis ",
    "For now, it only work on human genes, with the fisher enrichment",
    " method. Moreover, all genes in that files are used as the gene universe"
  ))
  # Add command line arguments
  p <- add_argument(p, "--de_file",
    type = "character",
    help = paste0(
      "A file containing deseq2 enrichment analysis.",
      " All genes must be defined in this file even",
      " those not differentially expressed"
    )
  )
  p <- add_argument(p, "--id",
    type = "character",
    help = paste0(
      "The id identifying the genes in de_file. It can take",
      " the following values: 'entrez', 'genbank', 'alias', 'ensembl',",
      " 'symbol', 'genename', 'unigene'. Defaults to 'symbol' "
    ),
    default = "symbol"
  )
  p <- add_argument(p, "--output",
    short = "-o", type = "character",
    help = "folder were the results will be created",
    default = "."
  )
  p <- add_argument(p, "--top",
    short = "-t", type = "numeric",
    help = "The number of top go term to display", default = 20
  )
  # Each option gets its own short flag: "-t" is already taken by --top.
  p <- add_argument(p, "--alpha",
    short = "-a", type = "numeric",
    help = paste0(
      "The padj threshold in de_file below which",
      " genes are considered as differentially expressed",
      " defaults to 0.05"
    ), default = 0.05
  )
  p <- add_argument(p, "--log2fc_threshold",
    short = "-l", type = "numeric",
    help = paste0(
      "The log2fc threshold in de_file above which( in absolute value)",
      " genes are considered as differentially expressed",
      ", defaults to 0"
    ), default = 0
  )
  # Parse the command line arguments
  argv <- argparser::parse_args(p)
  # An argument that was not given may be reported as NA rather than NULL,
  # so both cases are treated as "missing".
  if (is.null(argv$de_file) || is.na(argv$de_file)) {
    msg <- paste0(
      "Argument --de_file is required !,",
      " see --help for more information."
    )
    stop(msg)
  }
  argv
}
#' Get the gene list vector needed for differential expression
#'
#' Builds an integer vector, named after the `gene` column, holding 1 for the
#' genes of interest and 0 for every other gene of `de_table`.
#'
#' @param de_table tibble containing DESeq2 enrichment score. The columns
#' `gene`, `padj` and `log2FoldChange` are used.
#' @param regulation The regulation of the gene of interest 'down', 'up' 'all'
#' where 'down' to perform enrichment for down-regulated genes, 'up'
#' to perform enrichment for up-regulated genes and 'all' to perform
#' enrichment for all differentially expressed genes. Any value other than
#' 'down' or 'up' is handled as 'all'.
#' @param alpha The padj threshold of a genes to be considered a significant
#' @param log2fc_threshold The threshold above which we want to consider
#' de-genes. It must be positive
#' @return the named vector used to build the topGO object. Genes with a
#' missing `padj` or `log2FoldChange` get the value 0.
#'
#' @export
get_gene_list <- function(de_table, regulation, alpha, log2fc_threshold) {
  lfc <- de_table$log2FoldChange
  if (regulation == "down") {
    fold_change_ok <- lfc <= -log2fc_threshold
  } else if (regulation == "up") {
    fold_change_ok <- lfc >= log2fc_threshold
  } else {
    # Both directions: compare the absolute fold change to the threshold so
    # that down-regulated genes are selected too.
    fold_change_ok <- abs(lfc) >= abs(log2fc_threshold)
  }
  selected <- de_table$padj <= alpha & fold_change_ok
  # A missing padj or fold change cannot be called significant.
  selected[is.na(selected)] <- FALSE
  gene_list <- as.integer(selected)
  names(gene_list) <- de_table$gene
  gene_list
}
#' Perform the TopGO analysis
#'
#' Builds a `topGOdata` object from the gene list returned by
#' `get_gene_list()`, runs the "classic" algorithm with the "fisher"
#' statistic and returns the table of the `top` GO terms ordered by the
#' resulting `fish` column.
#'
#' @param de_table tibble containing DESeq2 enrichment score
#' @param id_kind The kind of id to use in the topGO analysis. Must correspond
#' to the ids defined in gene column of de_file
#' @param go_term_type The type of go term to use in the analysis
#' @param regulation The regulation of the gene of interest 'down', 'up' 'all'
#' where 'down' to perform enrichment for down-regulated genes, 'up'
#' to perform enrichment for up-regulated genes and 'all' to perform
#' enrichment for all differentially expressed genes
#' @param top The number of most enriched GO term to return
#' @param alpha The padj threshold of a genes to be considered a significant
#' @param log2fc_threshold The threshold above which we want to consider
#' de-genes. It must be positive
#' @return The table containing the go enrichment analysis performed by topGO
#' @import topGO
#' @import org.Hs.eg.db
#' @export
top_go_analysis <- function(de_table, id_kind, go_term_type, regulation, top,
                            alpha, log2fc_threshold) {
  # Genes of interest are the ones flagged with 1 in the gene list.
  is_gene_of_interest <- function(score) {
    score == 1
  }
  go_data <- new("topGOdata",
    ontology = go_term_type,
    allGenes = get_gene_list(de_table, regulation, alpha, log2fc_threshold),
    nodeSize = 10,
    annot = annFUN.org,
    mapping = "org.Hs.eg.db",
    ID = id_kind,
    geneSel = is_gene_of_interest
  )
  fisher_result <- runTest(
    go_data,
    algorithm = "classic", statistic = "fisher"
  )
  GenTable(go_data, fish = fisher_result, orderBy = "fish", topNodes = top)
}
README.md 0 → 100644
# TopGO_wrapper
# Description
This is an R package dedicated to performing GO enrichment analysis from a DESeq2 differential expression file.
# Dependencies
Depends:
R (>= 4.1.2),
Imports:
topGO (>= 2.46.0),
org.Hs.eg.db (>= 3.14.0),
argparser (>= 0.7.1),
forcats (>= 0.5.1),
readr (>= 2.1.2),
dplyr (>= 1.0.7),
ggplot2 (>= 3.3.5)
# Installation
To install this package, the `devtools` package must be installed.
Run in R the following command to install the package:
```R
> library(devtools)
> install_gitlab("LBMC/regards/topgo-wrapper", host = "https://gitbio.ens-lyon.fr", quiet = FALSE)
```
# Limitations
For now, it only works with human datasets and only performs GO enrichment using the 'classic' topGO algorithm and the 'fisher' statistic.
# Usage
## With a command line interface (CLI) script
First, you must create an R file containing only the following code:
```R
# my_R_file.R
#!/bin/Rscript
#!/bin/Rscript
library('TopGOwrapper')
library('topGO')
library('org.Hs.eg.db')
cli_run_topgo()
```
Then you can type the following commands to see if everything works:
```console
$ Rscript my_R_file.R --help
...
usage: test.R [--] [--help] [--opts OPTS] [--de_file DE_FILE] [--id ID]
[--output OUTPUT] [--top TOP] [--alpha ALPHA]
[--log2fc_threshold LOG2FC_THRESHOLD]
Wrapper to perform TopGO enrichment analysis For now, it only work on
human genes, with the fisher enrichment method. Moreover, all genes in
that files are used as the gene universe
flags:
-h, --help show this help message and exit
optional arguments:
-x, --opts RDS file containing argument values
-d, --de_file A file containing deseq2 enrichment analysis.All
genes must be defined in this file eventthose not
differentially expressed
-i, --id The id identifying the genes in de_file. It can
take the following values: 'entrez', 'genbank',
'alias', 'ensembl', 'symbol', 'genename',
'unigene'. Defaults to 'symbol' [default: symbol]
-o, --output folder were the results will be created [default:
.]
-t, --top The number of top go term to display [default:
20]
--alpha The padj threshold in de_file below which genes
are considered as differentially expressed
defaults to 0.05 [default: 0.05]
--log2fc_threshold The log2fc threshold in de_file above which( in
absolute value) genes are considered as
differentially expressed, defaults to 0 [default:
0]
```
The de_file parameter must correspond to a file with the following structure:
| gene | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
| ------ | ---------------- | ------------------ | ----------------- | ------------------- | ------------------ | ----------------- |
| A1BG | 240.956914340076 | 0.133226932617053 | 0.328043296485508 | 0 | 1 | 1 |
| A2M | 9636.70629697928 | -0.595284877763812 | 0.502037280842549 | -0.0204862829042333 | 0.983655454438162 | 1 |
| A4GALT | 402.374065197262 | -1.0931216879495 | 0.347107578223515 | -1.46387379540962 | 0.143228434481562 | 0.976692858834766 |
| AAAS | 226.084731795302 | 2.97777448404702 | 1.26846531308784 | 1.88635389502473 | 0.0592472810840546 | 0.549734319535217 |
1. The `gene` column must contain the ID of the gene. You can specify the type of ID with the `--id` parameter
2. The `baseMean` column corresponds to the mean gene expression across samples
3. The `log2FoldChange` column corresponds to the log2FoldChange of expression between conditions
4. The `padj` column corresponds to the adjusted p-values
Note that only the `gene`, `log2FoldChange` and `padj` columns are required.
The columns must be `tab-separated`.
Note that the `gene` column can correspond to the row names of the table.
**WARNING: This input file must contain ALL genes whether they are differentially expressed or not. Indeed, all genes defined in this file are used as the gene universe.**
# output
Here is a description of the result files you get by running the package:
```
result_folder
├── [REG]_genes_[GOTYPE]_a[ALPHA]_lfct[LFCT]_top[NUM].pdf
└── [REG]_genes_[GOTYPE]_a[ALPHA]_lfct[LFCT]_top[NUM].txt
```
Where:
- `REG`: Can be equal to down_regulated, up_regulated or all_de. When `REG` is
- `down_regulated`: Only consider the down-regulated (`padj` <= `ALPHA` & `log2FoldChange` <= -`LFCT`) genes...
- `up_regulated`: Only consider the up-regulated (`padj` <= `ALPHA` & `log2FoldChange` >= `LFCT`) genes...
- `all_de`: Consider all differentially expressed (`padj` <= `ALPHA` & abs(`log2FoldChange`) >= `LFCT`) genes...
...to perform the GO enrichment analysis against all genes defined in the input file given with the `--de_file` parameter
- `GOTYPE`: The GO term type considered in the result file:
- `BP`: Biological Process
- `MF`: Molecular Function
- `CC`: Cellular Component
- `ALPHA`: consider genes having a `padj` <= `ALPHA` in the input file as significant
- `LFCT`: consider genes having a
- `log2FoldChange` <= -`LFCT` (for down-regulated genes)...
- `log2FoldChange` >= `LFCT` (for up-regulated genes)...
- abs(`log2FoldChange`) >= `LFCT` (for all differentially expressed genes)...
... in the input file as differentially expressed
- `NUM`: The top number of enriched go term to display
The pdf file corresponds to a figure displaying the top enriched GO terms, and the text file contains the same results in the form of a tab-separated file with this structure:
| GO.ID | Term | Annotated | Significant | fish |
| ---------- | ------------------------------------------- | --------- | ----------- | ------- |
| GO:0051240 | positive regulation of multicellular org... | 986 | 67 | 4.6e-10 |
| GO:0010941 | regulation of cell death | 1206 | 76 | 8.2e-10 |
| GO:0042127 | regulation of cell population proliferat... | 1171 | 74 | 1.3e-09 |
Where:
- `fish`: p-value of the GO term enrichment analysis using the Fisher method
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/figure.R
\name{build_figure}
\alias{build_figure}
\title{Produce figure displaying the top enriched go term}
\usage{
build_figure(top_go_result, go_term_type, outfile)
}
\arguments{
\item{top_go_result}{dataframe containing top_go results}
\item{go_term_type}{The go term type of the figure}
\item{outfile}{File were the figure will be created}
}
\description{
Produce figure displaying the top enriched go term
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/parser.R
\name{cli_function}
\alias{cli_function}
\title{Parse the command line arguments}
\usage{
cli_function()
}
\description{
Parse the command line arguments and return an object through which all
given arguments are stored
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/main.R
\name{cli_run_topgo}
\alias{cli_run_topgo}
\title{Cli TopGO Wrapper}
\usage{
cli_run_topgo()
}
\description{
Cli TopGO Wrapper
}
\details{
perform a TopGO analysis using fisher statistics on human go term
for:
\enumerate{
\item Each go term type: BP, MF, CC
\item Down-regulated, Up-regulated and all differentially expressed genes
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/topGO_analysis.R
\name{get_gene_list}
\alias{get_gene_list}
\title{Get the gene list vector needed for differential expression}
\usage{
get_gene_list(de_table, regulation, alpha, log2fc_threshold)
}
\arguments{
\item{de_table}{tibble containing DESeq2 enrichment score}
\item{regulation}{The regulation of the gene of interest 'down', 'up' 'all'
where 'down to perform enrichment for down-regulated genes, 'up'
to perform enrichement for up-regulated genes and 'all' to perform
enrichment for all differentially expressed genes}
\item{alpha}{The padj threshold of a genes to be considered a significant}
\item{log2fc_threshold}{The threshold above which we want to consider
de-genes. It must be positive}
}
\value{
the named vector used to build the topGO object
}
\description{
Get the gene list vector needed for differential expression
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/load_df.R
\name{load_input}
\alias{load_input}
\title{Load the file containing differentially or not expressed genes}
\usage{
load_input(de_file)
}
\arguments{
\item{de_file}{the file containing differentially or not expressed genes}
}
\value{
The loaded file with a gene column
}
\description{
Load the file containing differentially or not expressed genes
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/main.R
\name{run_topgo}
\alias{run_topgo}
\title{Perform all go term enrchment analysis}
\usage{
run_topgo(de_table, output_folder, id_kind, top, alpha, log2fc_threshold)
}
\arguments{
\item{de_table}{tibble containing DESeq2 enrichment score}
\item{output_folder}{Folder where the results will be created}
\item{id_kind}{The kind of id to use in the topGO analysis. Must correspond
to the ids defined in gene column of de_file}
\item{top}{The number of most enriched GO term to return}
\item{alpha}{The padj threshold of a genes to be considered a significant}
\item{log2fc_threshold}{The threshold above which we want to consider
de-genes. It must be positive}
}
\value{
The table containing the go enrichment analysis performed by topGO
}
\description{
Perform all go term enrchment analysis
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/topGO_analysis.R
\name{top_go_analysis}
\alias{top_go_analysis}
\title{Perform the TopGO analysis}
\usage{
top_go_analysis(
de_table,
id_kind,
go_term_type,
regulation,
top,
alpha,
log2fc_threshold
)
}
\arguments{
\item{de_table}{tibble containing DESeq2 enrichment score}
\item{id_kind}{The kind of id to use in the topGO analysis. Must correspond
to the ids defined in gene column of de_file}
\item{go_term_type}{The type of go term to use in the analysis}
\item{regulation}{The regulation of the gene of interest 'down', 'up' 'all'
where 'down to perform enrichment for down-regulated genes, 'up'
to perform enrichement for up-regulated genes and 'all' to perform
enrichment for all differentially expressed genes}
\item{top}{The number of most enriched GO term to return}
\item{alpha}{The padj threshold of a genes to be considered a significant}
\item{log2fc_threshold}{The threshold above which we want to consider
de-genes. It must be positive}
}
\value{
The table containing the go enrichment analysis performed by topGO
}
\description{
Perform the TopGO analysis
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment