From a70b5adfb8d22bbbcd3bc3ea103657d241f89823 Mon Sep 17 00:00:00 2001 From: aliarifki <aliarifki@outlook.fr> Date: Thu, 25 May 2023 11:17:38 +0200 Subject: [PATCH] Ajout des scripts R --- src/.docker_modules/r-scripts/1.0/Dockerfile | 6 + .../r-scripts/1.0/HBV_RNAs_count_2.R | 290 +++++++++++++ .../r-scripts/1.0/Install_packages.R | 3 + .../r-scripts/1.0/Junctions_NanoSplicer_2.R | 399 ++++++++++++++++++ .../r-scripts/1.0/docker_init.sh | 2 + .../1.0/start_positions_individuals_2.R | 213 ++++++++++ src/nf_modules/junction_nanosplicer/main.nf | 26 ++ src/nf_modules/rna_count/main.nf | 0 src/nf_modules/start_positions/main.nf | 25 ++ 9 files changed, 964 insertions(+) create mode 100644 src/.docker_modules/r-scripts/1.0/Dockerfile create mode 100755 src/.docker_modules/r-scripts/1.0/HBV_RNAs_count_2.R create mode 100644 src/.docker_modules/r-scripts/1.0/Install_packages.R create mode 100644 src/.docker_modules/r-scripts/1.0/Junctions_NanoSplicer_2.R create mode 100755 src/.docker_modules/r-scripts/1.0/docker_init.sh create mode 100755 src/.docker_modules/r-scripts/1.0/start_positions_individuals_2.R create mode 100644 src/nf_modules/junction_nanosplicer/main.nf create mode 100644 src/nf_modules/rna_count/main.nf create mode 100644 src/nf_modules/start_positions/main.nf diff --git a/src/.docker_modules/r-scripts/1.0/Dockerfile b/src/.docker_modules/r-scripts/1.0/Dockerfile new file mode 100644 index 0000000..dfadf74 --- /dev/null +++ b/src/.docker_modules/r-scripts/1.0/Dockerfile @@ -0,0 +1,6 @@ +FROM rocker/r-base:4.2.3 + +## copy Rscript files +COPY ./*.R . + +RUN Rscript Install_packages.R diff --git a/src/.docker_modules/r-scripts/1.0/HBV_RNAs_count_2.R b/src/.docker_modules/r-scripts/1.0/HBV_RNAs_count_2.R new file mode 100755 index 0000000..0010d51 --- /dev/null +++ b/src/.docker_modules/r-scripts/1.0/HBV_RNAs_count_2.R @@ -0,0 +1,290 @@ +#!/bin/Rscript +# Packages installation +library(ggplot2, quietly = TRUE) +library(tidyr, quietly = TRUE) +library(plyr, quietly = TRUE) +library(dplyr, quietly = TRUE) +library(stringr, quietly = TRUE) +library(RColorBrewer) +library(optparse) + +# Load files +option_list = list( + make_option(c("-s", "--SPvariants"), type="character", default=NULL, + help="input identified SP variants table (.csv)", metavar="character"), + make_option(c("-c", "--classification"), type="character", default=NULL, + help="input classification of reads file (.txt)", metavar="character")) +opt_parser = OptionParser(option_list=option_list) +opt = parse_args(opt_parser) + +# vectors of species & Palettes Colors: +SPvariants <- c("SP01", "SP02", "SP03", "SP04", "SP05", "SP06", "SP07", "SP08", + "SP09", "SP10", "SP11", "SP12", "SP13", "SP14", "SP15", "SP16", + "SP17", "SP18", "SP19", "SP20", "SP21", "SP22") +SPvariants <- factor(SPvariants, levels = SPvariants) +colors_SP1_22 <- c("#FADD4E", "#FBE0C3", "#C8FF94", "#9ED7C3", "#CCEDE3", + "#A9E5F0", "#A9C8E6", "#BBC7DF", "#D8C8EB", "#FFBADC", + "#E2C9B5", "#E2B5AB", "#CAC4C5", "#D7D9DF", "#F9B7BA", + "#99DDF9", "#ABA7D1", "#B26572", "#FB9651", "#4F7E7E", + "#4FBCBA", "#608AD2") +palette_SP1_22 <- data.frame(nom = SPvariants, teinte = colors_SP1_22, + stringsAsFactors = FALSE) + +TSS_species <- c("preCore", "pgRNA", "preS1", "preS2/S", "HBx") +TSS_species <- factor(TSS_species, levels = TSS_species) +colors_RNAs_species <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A") +palette_TSS <- data.frame(nom = TSS_species, teinte = colors_RNAs_species, + stringsAsFactors = FALSE) + +new_SP_candidates <- c("new_comb_pg", "new_comb_HBs", "new_comb_HBx", + "new_comb_UnC", "new_SP", "new_SP_HBs", "Undefined") +new_SP_candidates <- factor(new_SP_candidates, levels = new_SP_candidates) +colors_newSP <- c("#AC0E0D", "#CB674F", "#AD565C", "#923222", "#C78A77", + "#A92322", "#964B34") +palette_new_SP <- data.frame(nom = new_SP_candidates, teinte = colors_newSP, + stringsAsFactors = FALSE) + +SPxx <- c("SPvariants") +SPxx <- factor(SPxx, levels = SPxx) +color_SPxx <- c("#33C5FF") +palette_SPxx <- data.frame(nom = SPxx, + teinte = color_SPxx, + stringsAsFactors = FALSE) + +all_species_name <- c(TSS_species, SPvariants, new_SP_candidates, SPxx) + +palette_complete <- rbind.data.frame(palette_TSS, + palette_SP1_22, + palette_new_SP, + palette_SPxx, + stringsAsFactors = FALSE) + +# Load Start_positions_count files: +identified_SP <- read.table(file = opt$SPvariants[1], + header = TRUE) + +clean_SP <- identified_SP[!duplicated(identified_SP$id),] %>% + select(id, mapQ, transcript_strand, JAQ, barcode, promoter, junction, SP_name) + +countSP <- clean_SP %>% count(SP_name) +countSP <- dplyr::inner_join(palette_complete, + countSP, + by = c("nom" = "SP_name")) +#print(names(palette_complete)) +#print(names(countSP)) + +countSP$nom <- factor(countSP$nom, levels = all_species_name) +countSP <- mutate(countSP, + proportion = (as.numeric(n)/sum(as.numeric(n))*100)) +#print(countSP) +ggplot(countSP, aes(x = "percent", + y = proportion, + fill = nom)) + + geom_col() + + coord_polar("y") + + scale_fill_manual(values = countSP$teinte) + + labs(fill = "spliced-variants") + +ggsave(file = "SP_proportion_camembert.png", + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) + +ggplot(countSP, aes(x = nom, y = proportion, fill = nom)) + + geom_col() + + scale_fill_manual(values = countSP$teinte) + + theme(axis.text.x = element_text(angle = 45)) + + labs(fill = "spliced-variants") + + xlab(label = "spliced-variants") + + ylab(label = "percent") + +ggsave(file = "SP_proportion.png", + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) + +# TSS not spliced: +classified_reads <- read.table(file = opt$classification[1], + header = TRUE) + +not_spliced <- classified_reads[!(classified_reads$read_ID %in% clean_SP$id),] +not_spliced <- mutate(not_spliced, + species = not_spliced$promoter) +#print(not_spliced) +not_spliced <- not_spliced %>% select(read_ID, species) +colnames(not_spliced) <- c("id", "species") + +clean_SP_type <- clean_SP %>% select(id, SP_name) +colnames(clean_SP_type) <- c("id", "species") + +df_species <- rbind.data.frame(not_spliced, clean_SP_type, + stringsAsFactors = FALSE) +count_species <- df_species %>% count(species) +count_species <- mutate(count_species, + percent = (as.numeric(n)/sum(as.numeric(n))*100)) +#print(count_species) +write.table(df_species, file = "All_reads_identified.csv", + sep = "\t", quote = FALSE, row.names = FALSE) + +# Null dataset: +species <- c(TSS_species, SPvariants, new_SP_candidates) + +canonical_species <- TSS_species[c(1:3,5)] + +null_count <- data.frame(species = species, + n = rep(0, times = length(species)), + percent = rep(0, times = length(species)), + stringsAsFactors = FALSE) + +null_count <- transform(null_count, + n = as.integer(n), + percent = as.numeric(percent)) + +# Join: +count_species <- rbind.data.frame(count_species, + subset(null_count, + !(species %in% count_species$species)), + stringsAsFactors = FALSE) + +# Merge all SP variants: +count_species_SPxx <- data.frame(species = "SPvariants", + n = sum(count_species[count_species$species + %in% SPvariants,]$n)) +count_species_SPxx <- rbind.data.frame(count_species_SPxx, + count_species[count_species$species + %in% c(canonical_species, + SPvariants), 1:2], + stringsAsFactors = FALSE) +count_species_SPxx <- count_species_SPxx[count_species_SPxx$species %in% + all_species_name[c(1:3,5,35)],] +count_species_SPxx <- mutate(count_species_SPxx, + percent=(as.numeric(n)/sum(as.numeric(n))*100)) +#print(count_species_SPxx) +# save the tab: +write.csv(count_species_SPxx, file = "Count_canonical_species_SPxx.csv") + +# prepare to plot: +count_species_SPxx <- dplyr::inner_join(palette_complete, + count_species_SPxx, + by = c("nom" = "species")) +#names(palette_complete) +#names(count_species_SPxx) +count_species_SPxx$nom <- factor(count_species_SPxx$nom, levels = all_species_name) + +# Save: +write.csv(count_species, file = "Count_species.csv") + +# RNA species composition all species: +count_species <- dplyr::inner_join(palette_complete, count_species, + by = c("nom" = "species"), + suffix = c("","")) +count_species$nom <- factor(count_species$nom, levels = all_species_name) + +ggplot(count_species, + aes(x = nom, + y = percent, + fill = nom)) + + geom_col() + + scale_fill_manual(values = count_species$teinte) + + theme(axis.text.x = element_text(angle = 45)) + + labs(fill = "RNA species & spliced-variants") + + xlab(label = "RNA species & spliced-variants") + +ggsave(file = "Count_RNAs_species.png", + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) + +# Filter RNA species canonical + SPvariants: +count_species_clear <- count_species[count_species$species %in% c(canonical_species, SPvariants),] + +# RNA species composition all species: +count_species_clear <- dplyr::inner_join(palette_complete, count_species_clear, + by = c("nom" = "nom"), + suffix = c("","")) +count_species_clear$nom <- factor(count_species_clear$nom, levels = all_species_name) + +ggplot(count_species_clear, + aes(x = nom, + y = percent, + fill = nom)) + + geom_col() + + scale_fill_manual(values = count_species_clear$teinte) + + theme(axis.text.x = element_text(angle = 45)) + + labs(fill = "RNA species & spliced-variants") + + xlab(label = "RNA species & spliced-variants") + +ggsave(file = "Count_RNAs_species_clear.png", + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) + +# SP composition clear: +count_clear <- clean_SP[clean_SP$SP_name %in% SPvariants,] %>% count(SP_name) +count_clear <- mutate(count_clear, + proportion=(as.numeric(n)/sum(as.numeric(n))*100)) +#print(count_clear) +count_clear <- dplyr::inner_join(palette_complete, + count_clear, + by = c("nom" = "SP_name")) +#names(count_clear) +count_clear$nom <- factor(count_clear$nom, levels = all_species_name) + +ggplot(count_clear, aes(x = "percent", + y = proportion, + fill = nom)) + + geom_col() + + coord_polar("y") + + scale_fill_manual(values = count_clear$teinte) + + labs(fill = "spliced-variants") + +ggsave(file = "SP_clear_proportion_camembert.png", + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) + +ggplot(count_clear, aes(x = nom, + y = proportion, + fill = nom)) + + geom_col() + + scale_fill_manual(values = count_clear$teinte) + + theme(axis.text.x = element_text(angle = 45)) + + labs(fill = "spliced-variants") + + xlab(label = "spliced-variants") + + ylab(label = "percent") + +ggsave(file = "SP_clear_proportion.png", + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) + +# Camembert SPxx: +ggplot(count_species_SPxx, aes(x = "species", + y = percent, + fill = nom)) + + geom_col() + + coord_polar("y") + + scale_fill_manual(values = count_species_SPxx$teinte) + + labs(fill = "TSS") + + ylab(label = "TSS usage") + + xlab(label = "percent") + +ggsave(file = "Count_RNAs_species_camembert.png", + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) + diff --git a/src/.docker_modules/r-scripts/1.0/Install_packages.R b/src/.docker_modules/r-scripts/1.0/Install_packages.R new file mode 100644 index 0000000..384435c --- /dev/null +++ b/src/.docker_modules/r-scripts/1.0/Install_packages.R @@ -0,0 +1,3 @@ +list.of.packages <- c("ggplot2", "tidyr", "plyr", "dplyr", "tidyverse", "stringr", "optparse", "RColorBrewer", "conflicted", "BiocManager", "resshape2", "R.utils") +new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] +if(length(new.packages)) install.packages(new.packages, dependencies = T) diff --git a/src/.docker_modules/r-scripts/1.0/Junctions_NanoSplicer_2.R b/src/.docker_modules/r-scripts/1.0/Junctions_NanoSplicer_2.R new file mode 100644 index 0000000..720226d --- /dev/null +++ b/src/.docker_modules/r-scripts/1.0/Junctions_NanoSplicer_2.R @@ -0,0 +1,399 @@ +#!/bin/Rscript + +################################################################################ +### NEED TO ADD A CASE OF NO SPLICED-VARIANTS ARE IDENTIFIED !!!!!!!!!!!!!!! ### +################################################################################ +library(ggplot2, quietly = TRUE) +library(tidyr, quietly = TRUE) +library(plyr, quietly = TRUE) +library(dplyr, quietly = TRUE) +library(stringr, quietly = TRUE) +library(optparse) + +# Load classification per promoter: +option_list = list( + make_option(c("-c", "--classification"), type="character", default=NULL, + help="input classification or reads file (.txt)", metavar="character"), + make_option(c("-j", "--jwr"), type="character", default=NULL, + help="input nanosplicer results table (.csv)", metavar="character")) +opt_parser = OptionParser(option_list=option_list) +opt = parse_args(opt_parser) +reads_pos <- read.table(opt$classification[1], + sep = "\t") +colnames(reads_pos) <- c("id", reads_pos[1,2:length(reads_pos[1,])]) +reads_pos <- reads_pos[2:length(reads_pos$id),] + +# Load Nanosplicer results: +df <- read.csv(opt$jwr[1]) +colnames(df)[1] <- "juncNumber" + +# split donor and acceptor positions: +df <- df %>% + separate(col = "loc", + into = c("donor", "acceptor"), sep = ", ", + extra = "warn") + +df$donor <- str_replace(df$donor, '[(]', '') +df$acceptor <- str_replace(df$acceptor, '[)]', '') + +df <- mutate(df, + pg_donor = as.numeric(donor)-122, + pg_acceptor = as.numeric(acceptor)-122) + +df <- merge(df, reads_pos, by = 'id') + +assignation_donor <- function(pg_donor) { + if (pg_donor >= 620) { + if (pg_donor < 643 & pg_donor >= 620) { + donor_site <- "J2450_" + } + else if (pg_donor <= 675 & pg_donor >= 643) { + donor_site <- "J2474_" + } + else if (pg_donor <= 1182 & pg_donor >= 1162) { + donor_site <- "J2988_" + } + else if (pg_donor <= 1835 & pg_donor >= 1815) { + donor_site <- "J461_" + } + else { + donor_site <- paste0("J", pg_donor, "new_") + } + } + else if (pg_donor <= 284 & pg_donor >= 264) { + donor_site <- "J2090_" + } + else if (pg_donor <= 261 & pg_donor >= 240) { + donor_site <- "J2070_" + } + else { + donor_site <- paste0("J", pg_donor, "new_") + } + return(donor_site) +} + +assignation_acceptor <- function(pg_acceptor) { + if (pg_acceptor > 1868) { + if (pg_acceptor >= 2452 & pg_acceptor <= 2468) { + acceptor_site <- "3170" + } + else if (pg_acceptor >= 2661 & pg_acceptor <= 2685) { + acceptor_site <- "1306" + } + else if (pg_acceptor >= 2743 & pg_acceptor <= 2762) { + acceptor_site <- "1386" + } + else { + acceptor_site <- paste0(pg_acceptor,"new") + } + } + else if (pg_acceptor >= 1843 & pg_acceptor <= 1868) { + acceptor_site <- "490" + } + else if (pg_acceptor >= 1639 & pg_acceptor <= 1668) { + acceptor_site <- "283" + } + else if (pg_acceptor >= 1076 & pg_acceptor <= 1098) { + acceptor_site <- "2903" + } + else if (pg_acceptor >= 525 & pg_acceptor <= 547) { + acceptor_site <- "2351" + } + else if (pg_acceptor >= 410 & pg_acceptor <= 432) { + acceptor_site <- "2237" + } + else { + acceptor_site <- paste0(pg_acceptor,"new") + } + return(acceptor_site) +} + +df$donor_site <- sapply(df$pg_donor, assignation_donor) +df$acceptor_site <- sapply(df$pg_acceptor, assignation_acceptor) +df <- mutate(df, + junction = paste0(donor_site, acceptor_site)) + +write.table(df, file = "JWR_check_parsed.csv", row.names = FALSE, sep = "\t") + +duplicated2 <- function(x){ + if (sum(dup <- duplicated(x))==0) + return(dup) + if (class(x) %in% c("data.frame","matrix")) + duplicated(rbind(x[dup,],x))[-(1:sum(dup))] + else duplicated(c(x[dup],x))[-(1:sum(dup))] +} + +list_read_multiple <- unique(df[duplicated2(df$id),]$id) +multiple_junction <- df[duplicated2(df$id),] +single_junction <- df %>% dplyr::filter(!(id %in% list_read_multiple)) + +SP_assignation_single <- function(site, promoter) { + if (promoter == "pgRNA" | promoter == "preCore") { + if (str_detect(site, "new")) { SP_name <- "new_SP" } + else if (site == "J2450_490") { SP_name <- "SP01" } + else if (site == "J2070_490") { SP_name <- "SP03" } + else if (site == "J2090_490") { SP_name <- "SP05" } + else if (site == "J2474_490") { SP_name <- "SP06" } + else if (site == "J2450_283") { SP_name <- "SP09" } + else if (site == "J2474_283") { SP_name <- "SP11" } + else if (site == "J2988_490") { SP_name <- "SP13" } + else if (site == "J2450_2903") { SP_name <- "SP14" } + else if (site == "J2070_283") { SP_name <- "SP17" } + else if (site == "J2988_3170") { SP_name <- "SP19" } + else if (site == "J2988_283") { SP_name <- "SP20" } + else { SP_name <- "new_comb_pg" } + } + else if (promoter == "preS1" | promoter == "preS2/S") { + if (str_detect(site, "new")) { SP_name <- "new_SP" } + else if (site == "J461_1306") { SP_name <- "SP21" } + else if (site == "J461_1386") { SP_name <- "SP22" } + else { SP_name <- "new_comb_HBs" } + } + else if (promoter == "HBx") { + if (str_detect(site, "new")) { SP_name <- "new_SP" } + else { SP_name <- "new_comb_HBx" } + } + else if (promoter == "Undefined") { + if (str_detect(site, "new")) { SP_name <- "new_SP" } + else { SP_name <- "new_comb_UnC" } + } + else { + if (str_detect(site, "new")) { SP_name <- "new_SP" } + else { SP_name <- "error" } + } +} + +single_junction <- ddply(single_junction, + .(id), + mutate, + SP_name = SP_assignation_single(junction, promoter)) + +SP_assignation_multiple <- function(read_id, combinaison, promoter) { + if (promoter == "pgRNA" | promoter == "preCore") { + + # New junctions detection: + if (str_detect(str_c(combinaison, collapse = "_"), "new")) { + SP_name <- "new_SP" + } + + #SP02, SP08, SP10, SP16: + else if ("J2070_2351" %in% combinaison) { + #SP02: + if ("J2450_490" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP02" + } + + #SP08: + else if ("J2450_2903" %in% combinaison & "J2988_490" %in% combinaison & + length(combinaison) == 3) { + SP_name <- "SP08" + } + + #SP10: + else if ("J2450_283" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP10" + } + + #SP16: + else if ("J2474_490" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP16" + } + + #If there is another combination of already described junction associated to "J2070_2351": + else {SP_name <- "new_comb_pg"} + } + + #SP12, SP15: + else if ("J2070_2237" %in% combinaison) { + #SP12: + if ("J2450_283" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP12" + } + else if ("J2450_490" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP15" + } + #If there is another combination of already described junction associated to "J2070_2237": + else {SP_name <- "new_comb_pg"} + } + + #SP04: + else if ("J2090_2351" %in% combinaison) { + if ("J2450_490" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP04" + } + #If there is another combination of already described junction associated to "J2090_2351": + else {SP_name <- "new_comb_pg"} + } + + #SP07, SP18: + else if ("J2988_490" %in% combinaison) { + #SP07: + if ("J2450_2903" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP07" + } + #SP18: + else if ("J2474_2903" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP18" + } + #If there is another combination of already described junction associated to "J2988_490": + else {SP_name <- "new_comb_pg"} + } + + #New combinaisons of multiple already described junctions: + else {SP_name <- "new_comb_pg"} + } + + #SP from preS1, preS2/2: + else if (promoter == "preS1" | promoter == "preS2/S") { + if (str_detect(str_c(combinaison, collapse = "_"), "new")) { + SP_name <- "new_SP_HBs" + } + else { SP_name <- "new_comb_HBs"} + } + + #SP from HBx: + else if (promoter == "HBx") { + if (str_detect(str_c(combinaison, collapse = "_"), "new")) { + SP_name <- "new_SP_HBx" + } + else { SP_name <- "new_comb_HBx"} + } + + #SP from Undefined: + else if (promoter == "Undefined") { + if (str_detect(str_c(combinaison, collapse = "_"), "new")) { SP_name <- "new_SP" } + else { SP_name <- "new_comb_UnC" } + } + + #Error case: + else { + SP_name <- "error" + } +return(SP_name) +} + +tmp <- multiple_junction %>% select(id, junction, promoter) +df_combinaison <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(df_combinaison) <- c("id", "SP_name") + +for (read_id in list_read_multiple) { + SP_name_computed <- SP_assignation_multiple(read_id, tmp[tmp$id == read_id,]$junction, tmp[tmp$id == read_id,]$promoter[1]) + res_vector <- data.frame(t(c(read_id, SP_name_computed))) + colnames(res_vector) <- colnames(df_combinaison) + df_combinaison <- rbind(df_combinaison, res_vector) +} + +# df_combinaison <- df_combinaison[2:length(df_combinaison$id),] + +multiple_junction <- merge(multiple_junction, df_combinaison, by="id") + +df_SPvariants <- rbind(single_junction, multiple_junction) + +SP_variant_unique <- df_SPvariants %>% select(id, SP_name) +SP_variant_unique <- SP_variant_unique[!duplicated(SP_variant_unique$id),] # distinct(SP_variant_unique, id) + +write.table(df_SPvariants, + "identified_SPvariants.csv", + row.names = FALSE, + sep = "\t", + quote = FALSE) + +ggplot(df, aes(x=pg_donor)) + + geom_histogram(aes(y=after_stat(density)),color="darkblue", fill="lightblue") + + geom_density(alpha=.2, fill="lightblue") + + geom_vline(aes(xintercept=median(pg_donor)), + color="blue", linetype="dashed", linewidth=1) + + geom_vline(aes(xintercept=quantile(pg_donor, 0.025)), + linetype="dashed", linewidth=0.25) + + geom_vline(aes(xintercept=quantile(pg_donor, 0.975)), + linetype="dashed", linewidth=0.25) + + geom_vline(aes(xintercept=quantile(pg_donor, 0.01)), + color="green", linetype="dashed", linewidth=0.25) + + geom_vline(aes(xintercept=quantile(pg_donor, 0.99)), + color="green", linetype="dashed", linewidth=0.25) + + geom_vline(aes(xintercept=(median(pg_donor)+sd(pg_donor))), + color="red", linewidth=0.5) + + geom_vline(aes(xintercept=(median(pg_donor)-sd(pg_donor))), + color="red", linewidth=0.5) + + scale_x_continuous(breaks=c(min(df$pg_donor), + quantile(df$pg_donor, 0.025), + quantile(df$pg_donor, 0.005), + median(df$pg_donor)-sd(df$pg_donor), + median(df$pg_donor), + median(df$pg_donor)+sd(df$pg_donor), + quantile(df$pg_donor, 0.975), + quantile(df$pg_donor, 0.995), + max(df$pg_donor)), + label = c(min(df$pg_donor), + floor(quantile(df$pg_donor, 0.025)), + floor(quantile(df$pg_donor, 0.005)), + round(median(df$pg_donor)-sd(df$pg_donor)), + median(df$pg_donor), + round(median(df$pg_donor)+sd(df$pg_donor)), + floor(quantile(df$pg_donor, 0.975))+1, + round(quantile(df$pg_donor, 0.995))+1, + max(df$pg_donor))) + + theme(axis.text.x = element_text(angle = 45)) + +ggsave(filename = "Donor_curve.png", + device = "png", + scale = 1, + width = 1920, + height = 1080, + units = "px", + dpi = 320) + +ggplot(df, aes(x=pg_acceptor)) + + geom_histogram(aes(y=after_stat(density)),color="red", fill="darksalmon") + + geom_density(alpha=.2, fill="darksalmon") + + geom_vline(aes(xintercept=median(pg_acceptor)), + color="red", linetype="dashed", linewidth=1) + + geom_vline(aes(xintercept=quantile(pg_acceptor, 0.025)), + linetype="dashed", linewidth=0.25) + + geom_vline(aes(xintercept=quantile(pg_acceptor, 0.975)), + linetype="dashed", linewidth=0.25) + + geom_vline(aes(xintercept=quantile(pg_acceptor, 0.005)), + color="green", linetype="dashed", linewidth=0.25) + + geom_vline(aes(xintercept=quantile(pg_acceptor, 0.995)), + color="green", linetype="dashed", linewidth=0.25) + + geom_vline(aes(xintercept=(median(pg_acceptor)+sd(pg_acceptor))), + color="blue", linewidth=0.5) + + geom_vline(aes(xintercept=(median(pg_acceptor)-sd(pg_acceptor))), + color="blue", linewidth=0.5) + + scale_x_continuous(breaks=c(min(df$pg_acceptor), + quantile(df$pg_acceptor, 0.025), + quantile(df$pg_acceptor, 0.005), + median(df$pg_acceptor)-sd(df$pg_acceptor), + median(df$pg_acceptor), + median(df$pg_acceptor)+sd(df$pg_acceptor), + quantile(df$pg_acceptor, 0.975), + quantile(df$pg_acceptor, 0.995), + max(df$pg_acceptor)), + label = c(min(df$pg_acceptor), + floor(quantile(df$pg_acceptor, 0.025)), + floor(quantile(df$pg_acceptor, 0.005)), + round(median(df$pg_acceptor)-sd(df$pg_acceptor)), + median(df$pg_acceptor), + round(median(df$pg_acceptor)+sd(df$pg_acceptor)), + floor(quantile(df$pg_acceptor, 0.975))+1, + floor(quantile(df$pg_acceptor, 0.995))+1, + max(df$pg_acceptor))) + + theme(axis.text.x = element_text(angle = 45)) + +ggsave(filename = "Acceptor_curve.png", + device = "png", + scale = 1, + width = 1920, + height = 1080, + units = "px", + dpi = 320) + +# Graphs and tests: + +# sink("test_shapiro.txt") +# print("Normality test: Shapiro-Wilk") +# print("Donor site:") +# print(shapiro.test(df$pg_donor)) +# print("Acceptor site:") +# print(shapiro.test(df$pg_acceptor)) +# sink() \ No newline at end of file diff --git a/src/.docker_modules/r-scripts/1.0/docker_init.sh b/src/.docker_modules/r-scripts/1.0/docker_init.sh new file mode 100755 index 0000000..b0b0edb --- /dev/null +++ b/src/.docker_modules/r-scripts/1.0/docker_init.sh @@ -0,0 +1,2 @@ +docker build src/.docker_modules/r-scripts/1.0 -t 'xgrand/r-scripts:1.0' +docker push xgrand/r-scripts:1.0 diff --git a/src/.docker_modules/r-scripts/1.0/start_positions_individuals_2.R b/src/.docker_modules/r-scripts/1.0/start_positions_individuals_2.R new file mode 100755 index 0000000..31b4e61 --- /dev/null +++ b/src/.docker_modules/r-scripts/1.0/start_positions_individuals_2.R @@ -0,0 +1,213 @@ +#!/bin/Rscript +# Packages installation +library(dplyr) +library(ggplot2) +library(RColorBrewer) +library(conflicted) +library(optparse) +library(tidyverse) + +# Résolution de conflits entre les bibliothèques dplyr et stats +conflict_prefer("filter", "dplyr") +conflict_prefer("lag", "dplyr") + +# Load Start_positions_count files: +option_list = list( + make_option(c("-i", "--input"), type="character", default=NULL, + help="input start position file (.txt)", metavar="character") +) + +opt_parser = OptionParser(option_list=option_list) +opt = parse_args(opt_parser) +#opt = ("/home/alia/scripts/Start_position/Start_positions_counts.txt") + +#list_file <- list.files(path=".", +# pattern="*.txt", +# all.files=FALSE, +# full.names=FALSE) +file_to_load <- opt$input[1] +splitted <- strsplit(opt$input[1], split = "[/]")[[1]] +filename <- strsplit(splitted[length(splitted)], split = "[.]")[[1]][1] + +sam_bc01 <- read.table(file_to_load, header = F) +sam_bc01[3] <- rep(filename, length(sam_bc01[,1])) + +# Function to parse and arrange data: +parsingData <- function(df) { + binsize <- 10 + pos <- as.data.frame(table(df[,2])) + colnames(pos)[1] <- "Start" + + Start <- as.data.frame(as.factor(seq(0, 3300))) + colnames(Start)[1] = "Start" + + tmp <- dplyr::left_join(Start, pos) + tmp[is.na(tmp)] <- 0 + + tmp$Start <- as.numeric(tmp$Start) + + df2 <- as_tibble(tmp) %>% + mutate(bin = round(Start/binsize)*binsize) %>% + group_by(bin) %>% + summarize(nb_reads = sum(Freq, na.rm = T)) + df2[is.na(df2)] <- 0 + df2[3] <- rep(df[1,3], length(df2$bin)) + colnames(df2) <- c("Start_position", "nb_reads", "Barcode") + df2 +} + +df_parsed <- parsingData(sam_bc01) + +ggplot(df_parsed, aes(Start_position, nb_reads)) + + geom_area(alpha = 0.5, fill = "blue") + + scale_y_sqrt() + + facet_wrap(facets = vars(df_parsed$Barcode)) + + theme_light()+ + scale_x_continuous(breaks = c(0, 127, 1114, 1490, 2554, 2732, 2907, 3421), + label = c("1692", "1819", "2806", "EcoRI", "1065", + "1243", "1418", "1932")) + + theme(axis.text.x = element_text(angle = 45) + ) + +ggsave(paste0(filename,".jpg"), + plot = last_plot(), + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300, +) + +# Classify reads based on start-position: + +# Separate preCore & pg: +classify_reads <- function(read_info) { + if (read_info <= 103) { + promoter <- "preCore" + } + else if (read_info >= 117 & + read_info <= 276) { + promoter <- "pgRNA" + } + else if (read_info >= 1106 & + read_info <= 1221 ) { + promoter <- "preS1" + } + else if (read_info >= 1455 & + read_info <= 1632 ) { + promoter <- "preS2/S" + } + else if (read_info >= 2550 & + read_info <= 2968 ) { + promoter <- "HBx" + } + else promoter <- "Undefined" +} + +colnames(sam_bc01) <- c("read_ID", "start_position", "barcode") +sam_bc01$promoter <- sapply(sam_bc01$start_position, + classify_reads) + +write.table(sam_bc01, + file = "classification_of_reads_per_RNA.txt", + quote = FALSE, + sep = "\t", + row.names = FALSE) + +# Compute Reads number per promoters: +list_name_samples <- list(filename) + +count_promoter_reads <- function(barcode, df) { + tmpdf <- as.data.frame(df) + tmpdf <- tmpdf[tmpdf$Barcode == barcode,] + preCore <- sum(tmpdf$nb_reads[tmpdf$Start_position <= 103]) + pgRNA <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 117 & + tmpdf$Start_position <= 276]) + preS1 <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1106 & + tmpdf$Start_position <= 1221]) + preS2S <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1455 & + tmpdf$Start_position <= 1632]) + HBx <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 2550 & + tmpdf$Start_position <= 2968]) + total <- sum(preCore, pgRNA, preS1, preS2S, HBx) + res <- c(preCore/total*100, pgRNA/total*100, preS1/total*100, + preS2S/total*100, HBx/total*100, total) + return(res) +} + +abscount_promoter_reads <- function(barcode, df) { + tmpdf <- as.data.frame(df) + tmpdf <- tmpdf[tmpdf$Barcode == barcode,] + preCore <- sum(tmpdf$nb_reads[tmpdf$Start_position <= 103]) + pgRNA <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 117 & + tmpdf$Start_position <= 276]) + preS1 <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1106 & + tmpdf$Start_position <= 1221]) + preS2S <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1455 & + tmpdf$Start_position <= 1632]) + HBx <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 2550 & + tmpdf$Start_position <= 2968]) + total <- sum(preCore, pgRNA, preS1, preS2S, HBx) + res <- c(preCore, pgRNA, preS1, preS2S, + HBx, total) + return(res) +} + +promoters <- factor(c("preCore", "pgRNA", "preS1", "preS2/S", "HBx"), + levels = c("preCore", "pgRNA", "preS1", "preS2/S", "HBx")) + +abs_count_reads <- data.frame() +abs_count_reads <- sapply(list_name_samples, + abscount_promoter_reads, + df_parsed) +abs_count_reads <- cbind(c(as.vector(promoters),"total"), abs_count_reads) +colnames(abs_count_reads) <- c("promoter", "read_number") + +write.table(abs_count_reads, + file = "Count_reads_per_promoter.tsv", + quote = FALSE, + sep = "\t", + row.names = FALSE) + +resultats_start_promoters <- lapply(list_name_samples, + count_promoter_reads, + df_parsed) + +resultats_start_promoters <- as.data.frame(do.call(cbind, + resultats_start_promoters)) +totalCountSample <- as.data.frame(resultats_start_promoters[6,]) +colnames(totalCountSample) <- c(filename) +resultats_start_promoters <- as.data.frame(resultats_start_promoters[1:5,]) +colnames(resultats_start_promoters) <- as.vector(list_name_samples) +resultats_start_promoters <- cbind(promoters, resultats_start_promoters) +formated_start_promoters <- pivot_longer(resultats_start_promoters, + cols = c(filename), + names_to = "Barcodes", + values_to = "nb_reads") + +mycolors <- colorRampPalette(brewer.pal(10, "Paired"))(10) +mycolors5 <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A") +mycolors6 <- c("#A6CEE3", "#3362ff", "#33c5ff", "#6A3D9A", "#d60000") + +plot_camembert <- function(barcode, df, tot) { + camembert <- ggplot(df[df$Barcodes == barcode,], aes(x = barcode, + y = nb_reads, + fill=promoters)) + + geom_col() + + coord_polar("y") + + scale_fill_manual(values = mycolors5) + + labs(title = paste0("#reads = ", tot[1,barcode]), x=element_blank(), y=element_blank()) + + theme_light() + + print(camembert) + + ggsave(filename = paste0("./Reads_start_promoters_", barcode, "_camembert.jpg"), + plot = last_plot(), + scale = 1, + width = 1920, + height = 1080, + units = "px", + dpi = 300) +} + +lapply(list_name_samples, plot_camembert, formated_start_promoters, totalCountSample) diff --git a/src/nf_modules/junction_nanosplicer/main.nf b/src/nf_modules/junction_nanosplicer/main.nf new file mode 100644 index 0000000..8598562 --- /dev/null +++ b/src/nf_modules/junction_nanosplicer/main.nf @@ -0,0 +1,26 @@ +version = "1.0" +container_url = "xgrand/r-scripts:${version}" + +params.junctions_out = "" +process junctions_nanosplicer{ + container = "${container_url}" + label "small_mem_mono_cpus" + tag "identification de variants d'épissage" + if (params.junctions_out != "") { + publishDir "results/${params.junctions_out}", mode: 'copy' + } + + input: + path(csv) + path(classification_of_reads_per_RNA) + + output: + path("Rplots.pdf") + path("JWR_check_parsed.csv") + path("identified_SPvariants.csv"), emit: identified_SPvariants + + script: + """ + Rscript /Junctions_NanoSplicer/Junctions_NanoSplicer_2.R + """ +} \ No newline at end of file diff --git a/src/nf_modules/rna_count/main.nf b/src/nf_modules/rna_count/main.nf new file mode 100644 index 0000000..e69de29 diff --git a/src/nf_modules/start_positions/main.nf b/src/nf_modules/start_positions/main.nf new file mode 100644 index 0000000..06e2e4f --- /dev/null +++ b/src/nf_modules/start_positions/main.nf @@ -0,0 +1,25 @@ +version = "1.0" +container_url = "xgrand/r-scripts:${version}" + +params.start_position_counts_out = "" +process start_position_individuals{ + container = "${container_url}" + label "small_mem_mono_cpus" + tag "identification de variants d'épissage" + if (params.start_position_counts_out != "") { + publishDir "results/${params.start_position_counts_out}", mode: 'copy' + } + + input: + path(start_position_counts) + + output: + path("Rplots.pdf") + path("Count_reads_per_promoter.tsv") + path("classification_of_reads_per_RNA.txt"), emit: classification_of_reads + + script: + """ + Rscript start_positions_individuals.R -i start_position_counts + """ +} \ No newline at end of file -- GitLab