From 705089e97ae2d350ace6b45203ed21d111f2e703 Mon Sep 17 00:00:00 2001 From: xgrand <xavier.grand@ens-lyon.fr> Date: Thu, 26 Oct 2023 14:43:59 +0200 Subject: [PATCH] Add r-bolero:1.1 to adapt graphical representation. --- src/.docker_modules/r-bolero/1.1/Dockerfile | 29 ++ .../r-bolero/1.1/HBV_RNAs_count.R | 313 ++++++++++++++++ .../r-bolero/1.1/Install_packages.R | 3 + .../r-bolero/1.1/Junctions_NanoSplicer.R | 351 ++++++++++++++++++ .../r-bolero/1.1/Start_positions.R | 228 ++++++++++++ .../r-bolero/1.1/docker_init.sh | 2 + .../r-bolero/1.1/ggplot_theme_Publication-2.R | 165 ++++++++ 7 files changed, 1091 insertions(+) create mode 100644 src/.docker_modules/r-bolero/1.1/Dockerfile create mode 100644 src/.docker_modules/r-bolero/1.1/HBV_RNAs_count.R create mode 100644 src/.docker_modules/r-bolero/1.1/Install_packages.R create mode 100644 src/.docker_modules/r-bolero/1.1/Junctions_NanoSplicer.R create mode 100644 src/.docker_modules/r-bolero/1.1/Start_positions.R create mode 100755 src/.docker_modules/r-bolero/1.1/docker_init.sh create mode 100644 src/.docker_modules/r-bolero/1.1/ggplot_theme_Publication-2.R diff --git a/src/.docker_modules/r-bolero/1.1/Dockerfile b/src/.docker_modules/r-bolero/1.1/Dockerfile new file mode 100644 index 0000000..dd5a0a3 --- /dev/null +++ b/src/.docker_modules/r-bolero/1.1/Dockerfile @@ -0,0 +1,29 @@ +FROM rocker/r-base:4.2.3 +LABEL AUTHOR="Alia Rifki" +LABEL MAINTAINER="Xavier Grand <xavier.grand@inserm.fr>" +LABEL build_date="2023-05-26" + + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libcurl4-openssl-dev \ + libssl-dev \ + libfontconfig1-dev \ + libxml2-dev \ + libharfbuzz-dev \ + libfribidi-dev \ + libfreetype6-dev \ + libpng-dev \ + libtiff5-dev \ + libjpeg-dev \ + procps + +## copy Rscript files +COPY ./*.R . + +## Install packages +RUN Rscript Install_packages.R + +## Langage +CMD ["bash"] + diff --git a/src/.docker_modules/r-bolero/1.1/HBV_RNAs_count.R b/src/.docker_modules/r-bolero/1.1/HBV_RNAs_count.R new file mode 100644 index 0000000..6ea15b7 --- /dev/null +++ b/src/.docker_modules/r-bolero/1.1/HBV_RNAs_count.R @@ -0,0 +1,313 @@ +#!/bin/Rscript +# Packages loading +library(ggplot2, quietly = TRUE) +library(tidyr, quietly = TRUE) +library(dplyr, quietly = TRUE) +library(stringr, quietly = TRUE) +library(RColorBrewer, quietly = TRUE) +library(optparse, quietly = TRUE) +library(conflicted, quietly = TRUE) + +# Résolution de conflits entre les bibliothèques dplyr et stats +conflict_prefer("filter", "dplyr") +conflict_prefer("lag", "dplyr") + +# source graphical theme: +source(file="src/ggplot_theme_publication-2.R") + +# Load files +option_list = list( + make_option(c("-s", "--SPvariants"), type="character", default=NULL, + help="input identified SP variants table (.csv)", metavar="character"), + make_option(c("-c", "--classification"), type="character", default=NULL, + help="input classification of reads file (.txt)", metavar="character"), + make_option(c("-b", "--barcode"), type="character", default=NULL, + help="input barcode", metavar="character")) +opt_parser = OptionParser(option_list=option_list) +opt = parse_args(opt_parser) + +# vectors of species & Palettes Colors: +SPvariants <- c("SP01", "SP02", "SP03", "SP04", "SP05", "SP06", "SP07", "SP08", + "SP09", "SP10", "SP11", "SP12", "SP13", "SP14", "SP15", "SP16", + "SP17", "SP18", "SP19", "SP20", "SP21", "SP22") +SPvariants <- factor(SPvariants, levels = SPvariants) +colors_SP1_22 <- c("#FADD4E", "#FBE0C3", "#C8FF94", "#9ED7C3", "#CCEDE3", + "#A9E5F0", "#A9C8E6", "#BBC7DF", "#D8C8EB", "#FFBADC", + "#E2C9B5", "#E2B5AB", "#CAC4C5", "#D7D9DF", "#F9B7BA", + "#99DDF9", "#ABA7D1", "#B26572", "#FB9651", "#4F7E7E", + "#4FBCBA", "#608AD2") +palette_SP1_22 <- data.frame(nom = SPvariants, teinte = colors_SP1_22, + stringsAsFactors = FALSE) + +TSS_species <- c("preCore", "pgRNA", "preS1", "preS2/S", "HBx") +TSS_species <- factor(TSS_species, levels = TSS_species) +colors_RNAs_species <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A") +palette_TSS <- data.frame(nom = TSS_species, teinte = colors_RNAs_species, + stringsAsFactors = FALSE) + +new_SP_candidates <- c("new_comb_pg", "new_comb_HBs", "new_comb_HBx", + "new_comb_UnC", "new_SP", "new_SP_HBs", "Undefined") +new_SP_candidates <- factor(new_SP_candidates, levels = new_SP_candidates) +colors_newSP <- c("#AC0E0D", "#CB674F", "#AD565C", "#923222", "#C78A77", + "#A92322", "#964B34") +palette_new_SP <- data.frame(nom = new_SP_candidates, teinte = colors_newSP, + stringsAsFactors = FALSE) + +SPxx <- c("SPvariants") +SPxx <- factor(SPxx, levels = SPxx) +color_SPxx <- c("#33C5FF") +palette_SPxx <- data.frame(nom = SPxx, + teinte = color_SPxx, + stringsAsFactors = FALSE) + +all_species_name <- c(TSS_species, SPvariants, new_SP_candidates, SPxx) + +palette_complete <- rbind.data.frame(palette_TSS, + palette_SP1_22, + palette_new_SP, + palette_SPxx, + stringsAsFactors = FALSE) + +# Load Start_positions_count files: +identified_SP <- read.table(file = opt$SPvariants, + header = TRUE) + +clean_SP <- identified_SP[!duplicated(identified_SP$id),] %>% + select(id, mapQ, transcript_strand, JAQ, barcode, promoter, junction, SP_name) + +countSP <- clean_SP %>% count(SP_name) +countSP$SP_name <- factor(countSP$SP_name) +countSP <- dplyr::inner_join(palette_complete, + countSP, + by = c("nom" = "SP_name")) +#print(names(palette_complete)) +#print(names(countSP)) + +countSP$nom <- factor(countSP$nom, levels = all_species_name) +countSP <- dplyr::mutate(countSP, + proportion = (as.numeric(n)/sum(as.numeric(n))*100)) +#print(countSP) +if (length(countSP$nom) != 0) { + ggplot(countSP, aes(x = "percent", + y = proportion, + fill = nom)) + + geom_col() + + coord_polar("y") + + scale_fill_manual(values = countSP$teinte) + + labs(fill = "spliced-variants") + + xlab(element_blank()) + ylab(element_blank()) + + scale_colour_Publication() + theme_Publication() + + ggsave(file = paste0(opt$barcode, "_SP_proportion_piechart.png"), + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) + + ggplot(countSP, aes(x = nom, y = proportion, fill = nom)) + + geom_col() + + scale_fill_manual(values = countSP$teinte) + + theme(axis.text.x = element_text(angle = 45)) + + xlab("Position on HBV genome.") + ylab(element_blank()) + + scale_colour_Publication() + theme_Publication() + + theme(axis.text.x = element_text(angle = 45, hjust=1)) + + ggsave(file = paste0(opt$barcode, "_SP_proportion.png"), + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) +} + +# TSS not spliced: +classified_reads <- read.table(file = opt$classification, + header = TRUE) + +not_spliced <- classified_reads[!(classified_reads$read_ID %in% clean_SP$id),] +not_spliced <- dplyr::mutate(not_spliced, + species = not_spliced$promoter) +#print(not_spliced) +not_spliced <- not_spliced %>% select(read_ID, species) +colnames(not_spliced) <- c("id", "species") + +clean_SP_type <- clean_SP %>% select(id, SP_name) +colnames(clean_SP_type) <- c("id", "species") + +df_species <- rbind.data.frame(not_spliced, clean_SP_type, + stringsAsFactors = FALSE) +count_species <- df_species %>% count(species) +count_species <- dplyr::mutate(count_species, + percent = (as.numeric(n)/sum(as.numeric(n))*100)) +#print(count_species) +write.table(df_species, file = paste0(opt$barcode, "_all_reads_identified.csv"), + sep = "\t", quote = FALSE, row.names = FALSE) + +# Null dataset: +species <- c(TSS_species, SPvariants, new_SP_candidates) + +# canonical_species <- TSS_species[c(1:3,5)] + +null_count <- data.frame(species = species, + n = rep(0, times = length(species)), + percent = rep(0, times = length(species)), + stringsAsFactors = FALSE) + +null_count <- transform(null_count, + n = as.integer(n), + percent = as.numeric(percent)) + +# Join: +count_species <- rbind.data.frame(count_species, + subset(null_count, + !(species %in% count_species$species)), + stringsAsFactors = FALSE) + +# Merge all SP variants: +count_species_SPxx <- data.frame(species = "SPvariants", + n = sum(count_species[count_species$species + %in% SPvariants,]$n)) +count_species_SPxx <- rbind.data.frame(count_species_SPxx, + count_species[count_species$species + %in% c(TSS_species, + SPvariants), 1:2], + stringsAsFactors = FALSE) +count_species_SPxx <- count_species_SPxx[count_species_SPxx$species %in% + all_species_name[c(1:5,35)],] # c(1:3,5,35) +count_species_SPxx <- dplyr::mutate(count_species_SPxx, + percent=(as.numeric(n)/sum(as.numeric(n))*100)) +#print(count_species_SPxx) +# save the tab: +write.csv(count_species_SPxx, + file = paste0(opt$barcode, "_count_canonical_species_SPxx.csv")) + +# prepare to plot: +count_species_SPxx <- dplyr::inner_join(palette_complete, + count_species_SPxx, + by = c("nom" = "species")) +#names(palette_complete) +#names(count_species_SPxx) +count_species_SPxx$nom <- factor(count_species_SPxx$nom, + levels = all_species_name) + +# Save: +write.csv(count_species, file = paste0(opt$barcode, "_count_species.csv")) + +# RNA species composition all species: +count_species <- dplyr::inner_join(palette_complete, count_species, + by = c("nom" = "species"), + suffix = c("","")) +count_species$nom <- factor(count_species$nom, levels = all_species_name) + +ggplot(count_species, + aes(x = nom, + y = percent, + fill = nom)) + + geom_col() + + scale_fill_manual(values = count_species$teinte) + + labs(fill = element_blank()) + + xlab(label = element_blank())+ + scale_colour_Publication() + theme_Publication() + + theme(axis.text.x = element_text(angle = 45, hjust=1)) + +ggsave(file = paste0(opt$barcode, "_count_RNAs_species.png"), + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) + +# Filter RNA species canonical + SPvariants: +count_species_clear <- count_species[count_species$nom %in% c(TSS_species, SPvariants),] + +# RNA species composition all species: +count_species_clear <- dplyr::inner_join(palette_complete, count_species_clear, + by = c("nom" = "nom"), + suffix = c("","")) +count_species_clear$nom <- factor(count_species_clear$nom, levels = all_species_name) + +ggplot(count_species_clear, + aes(x = nom, + y = percent, + fill = nom)) + + geom_col() + + scale_fill_manual(values = count_species_clear$teinte) + + theme(axis.text.x = element_text(angle = 45)) + + labs(fill = "RNA species & spliced-variants") + + xlab(label = "RNA species & spliced-variants") + + scale_colour_Publication() + theme_Publication() + +ggsave(file = paste0(opt$barcode, "_count_RNAs_species_clear.png"), + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) + +# SP composition clear: +count_clear <- clean_SP[clean_SP$SP_name %in% SPvariants,] %>% count(SP_name) +count_clear <- dplyr::mutate(count_clear, + proportion=(as.numeric(n)/sum(as.numeric(n))*100)) +#print(count_clear) +count_clear$SP_name <- factor(count_clear$SP_name) +count_clear <- dplyr::inner_join(palette_complete, + count_clear, + by = c("nom" = "SP_name")) +#names(count_clear) +count_clear$nom <- factor(count_clear$nom, levels = all_species_name) + +if(nrow(count_clear)!=0){ + ggplot(count_clear, aes(x = "percent", + y = proportion, + fill = nom)) + + geom_col() + + coord_polar("y") + + scale_fill_manual(values = count_clear$teinte) + + labs(fill = "spliced-variants") + + scale_colour_Publication() + theme_Publication() + + ggsave(file = paste0(opt$barcode, "_SP_clear_proportion_piechart.png"), + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300)} + +ggplot(count_clear, aes(x = nom, + y = proportion, + fill = nom)) + + geom_col() + + scale_fill_manual(values = count_clear$teinte) + + theme(axis.text.x = element_text(angle = 45)) + + labs(fill = "spliced-variants") + + xlab(label = "spliced-variants") + + ylab(label = "percent") + + scale_colour_Publication() + theme_Publication() + +ggsave(file = paste0(opt$barcode, "_SP_clear_proportion.png"), + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) + +# Camembert SPxx: +ggplot(count_species_SPxx, aes(x = "species", + y = percent, + fill = nom)) + + geom_col() + + coord_polar("y") + + scale_fill_manual(values = count_species_SPxx$teinte) + + labs(fill = "TSS") + + ylab(label = "TSS usage") + + xlab(label = "percent") + + scale_colour_Publication() + theme_Publication() + +ggsave(file = paste0(opt$barcode, "_count_RNAs_species_piechart.png"), + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300) + diff --git a/src/.docker_modules/r-bolero/1.1/Install_packages.R b/src/.docker_modules/r-bolero/1.1/Install_packages.R new file mode 100644 index 0000000..1716eee --- /dev/null +++ b/src/.docker_modules/r-bolero/1.1/Install_packages.R @@ -0,0 +1,3 @@ +list.of.packages <- c("ggplot2", "tidyr", "dplyr", "tidyverse", "stringr", "optparse", "RColorBrewer", "conflicted", "BiocManager", "resshape2", "R.utils", "conflicted", "future", "future.apply") +new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] +if(length(new.packages)) install.packages(new.packages, dependencies = T) diff --git a/src/.docker_modules/r-bolero/1.1/Junctions_NanoSplicer.R b/src/.docker_modules/r-bolero/1.1/Junctions_NanoSplicer.R new file mode 100644 index 0000000..4e17944 --- /dev/null +++ b/src/.docker_modules/r-bolero/1.1/Junctions_NanoSplicer.R @@ -0,0 +1,351 @@ +#!/bin/Rscript +#library(ggplot2, quietly = TRUE) +library(tidyr, quietly = TRUE) +library(dplyr, quietly = TRUE) +library(stringr, quietly = TRUE) +library(optparse, quietly = TRUE) +library(future, quietly = TRUE) +library(future.apply) +library(conflicted, quietly = TRUE) + +# Set up parallel processing plan +plan("multisession", workers = 18) + +# Résolution de conflits entre les bibliothèques dplyr et stats +conflict_prefer("filter", "dplyr") +conflict_prefer("lag", "dplyr") + +# Load classification per promoter: +option_list = list( + make_option(c("-c", "--classification"), type="character", default="./classification.txt", + help="input classification or reads file (.txt)", metavar="character"), + make_option(c("-j", "--jwr"), type="character", default=NULL, + help="input nanosplicer results table (.csv)", metavar="character"), + make_option(c("-b", "--barcode"), type="character", default=NULL, + help="input barcode", metavar="character")) +opt_parser = OptionParser(option_list=option_list) +opt = parse_args(opt_parser) +reads_pos <- read.table(opt$classification, + sep = "\t") +colnames(reads_pos) <- c("id", reads_pos[1,2:length(reads_pos[1,])]) +reads_pos <- reads_pos[2:length(reads_pos$id),] + +# Load Nanosplicer results: +if (file.exists(opt$jwr)) { + df <- read.csv(opt$jwr) + colnames(df)[1] <- "juncNumber" +} else { # Define column names + column_names <- c("juncNumber", "id", "mapQ", "transcript_strand", "chrID", "loc", "JAQ") + + # Create an empty dataframe + df <- data.frame(matrix(ncol = length(column_names), nrow = 0)) + colnames(df) <- column_names +} + +# split donor and acceptor positions: +df <- df %>% + separate(col = "loc", + into = c("donor", "acceptor"), sep = ", ", + extra = "warn") + +df$donor <- str_replace(df$donor, '[(]', '') +df$acceptor <- str_replace(df$acceptor, '[)]', '') + +df <- dplyr::mutate(df, + pg_donor = as.numeric(donor)-122, + pg_acceptor = as.numeric(acceptor)-122) + +df <- merge(df, reads_pos, by = 'id') + +assignation_donor <- function(pg_donor) { + if (pg_donor >= 620) { + if (pg_donor < 643 & pg_donor >= 620) { + donor_site <- "J2450_" + } + else if (pg_donor <= 675 & pg_donor >= 643) { + donor_site <- "J2474_" + } + else if (pg_donor <= 1182 & pg_donor >= 1162) { + donor_site <- "J2988_" + } + else if (pg_donor <= 1835 & pg_donor >= 1815) { + donor_site <- "J461_" + } + else { + donor_site <- paste0("J", pg_donor, "new_") + } + } + else if (pg_donor <= 284 & pg_donor >= 264) { + donor_site <- "J2090_" + } + else if (pg_donor <= 261 & pg_donor >= 240) { + donor_site <- "J2070_" + } + else { + donor_site <- paste0("J", pg_donor, "new_") + } + return(donor_site) +} + +assignation_acceptor <- function(pg_acceptor) { + if (pg_acceptor > 1868) { + if (pg_acceptor >= 2452 & pg_acceptor <= 2468) { + acceptor_site <- "3170" + } + else if (pg_acceptor >= 2661 & pg_acceptor <= 2685) { + acceptor_site <- "1306" + } + else if (pg_acceptor >= 2743 & pg_acceptor <= 2762) { + acceptor_site <- "1386" + } + else { + acceptor_site <- paste0(pg_acceptor,"new") + } + } + else if (pg_acceptor >= 1843 & pg_acceptor <= 1868) { + acceptor_site <- "490" + } + else if (pg_acceptor >= 1639 & pg_acceptor <= 1668) { + acceptor_site <- "283" + } + else if (pg_acceptor >= 1076 & pg_acceptor <= 1098) { + acceptor_site <- "2903" + } + else if (pg_acceptor >= 525 & pg_acceptor <= 547) { + acceptor_site <- "2351" + } + else if (pg_acceptor >= 410 & pg_acceptor <= 432) { + acceptor_site <- "2237" + } + else { + acceptor_site <- paste0(pg_acceptor,"new") + } + return(acceptor_site) +} + +df$donor_site <- future_sapply(df$pg_donor, assignation_donor) +df$acceptor_site <- future_sapply(df$pg_acceptor, assignation_acceptor) +df <- dplyr::mutate(df, + junction = paste0(donor_site, acceptor_site)) + +write.table(df, file = paste0(opt$barcode, "_JWR_check_parsed.csv"), row.names = FALSE, sep = "\t") + +duplicated2 <- function(x){ + if (sum(dup <- duplicated(x))==0) + return(dup) + if (class(x) %in% c("data.frame","matrix")) + duplicated(rbind(x[dup,],x))[-(1:sum(dup))] + else duplicated(c(x[dup],x))[-(1:sum(dup))] +} + +list_read_multiple <- unique(df[duplicated2(df$id),]$id) +multiple_junction <- df[duplicated2(df$id),] +single_junction <- df %>% dplyr::filter(!(id %in% list_read_multiple)) + +SP_assignation_single <- function(site, promoter) { + if (promoter == "pgRNA" | promoter == "preCore") { + if (str_detect(site, "new")) { SP_name <- "new_SP" } + else if (site == "J2450_490") { SP_name <- "SP01" } + else if (site == "J2070_490") { SP_name <- "SP03" } + else if (site == "J2090_490") { SP_name <- "SP05" } + else if (site == "J2474_490") { SP_name <- "SP06" } + else if (site == "J2450_283") { SP_name <- "SP09" } + else if (site == "J2474_283") { SP_name <- "SP11" } + else if (site == "J2988_490") { SP_name <- "SP13" } + else if (site == "J2450_2903") { SP_name <- "SP14" } + else if (site == "J2070_283") { SP_name <- "SP17" } + else if (site == "J2988_3170") { SP_name <- "SP19" } + else if (site == "J2988_283") { SP_name <- "SP20" } + else { SP_name <- "new_comb_pg" } + } + else if (promoter == "preS1" | promoter == "preS2/S") { + if (str_detect(site, "new")) { SP_name <- "new_SP" } + else if (site == "J461_1306") { SP_name <- "SP21" } + else if (site == "J461_1386") { SP_name <- "SP22" } + else { SP_name <- "new_comb_HBs" } + } + else if (promoter == "HBx") { + if (str_detect(site, "new")) { SP_name <- "new_SP" } + else { SP_name <- "new_comb_HBx" } + } + else if (promoter == "Undefined") { + if (str_detect(site, "new")) { SP_name <- "new_SP" } + else { SP_name <- "new_comb_UnC" } + } + else { + if (str_detect(site, "new")) { SP_name <- "new_SP" } + else { SP_name <- "error" } + } +} + + +#dplyr version: +single_junction <- single_junction %>% dplyr::group_by(id) + +if (length(single_junction$promoter) != 0) { + single_junction <- single_junction %>% + dplyr::mutate(SP_name = SP_assignation_single(junction, promoter)) +} else { single_junction$SP_name = "" } + +SP_assignation_multiple <- function(read_id, combinaison, promoter) { + if (promoter == "pgRNA" | promoter == "preCore") { + + # New junctions detection: + if (str_detect(str_c(combinaison, collapse = "_"), "new")) { + SP_name <- "new_SP" + } + + #SP02, SP08, SP10, SP16: + else if ("J2070_2351" %in% combinaison) { + #SP02: + if ("J2450_490" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP02" + } + + #SP08: + else if ("J2450_2903" %in% combinaison & "J2988_490" %in% combinaison & + length(combinaison) == 3) { + SP_name <- "SP08" + } + + #SP10: + else if ("J2450_283" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP10" + } + + #SP16: + else if ("J2474_490" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP16" + } + + #If there is another combination of already described junction associated to "J2070_2351": + else {SP_name <- "new_comb_pg"} + } + + #SP12, SP15: + else if ("J2070_2237" %in% combinaison) { + #SP12: + if ("J2450_283" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP12" + } + else if ("J2450_490" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP15" + } + #If there is another combination of already described junction associated to "J2070_2237": + else {SP_name <- "new_comb_pg"} + } + + #SP04: + else if ("J2090_2351" %in% combinaison) { + if ("J2450_490" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP04" + } + #If there is another combination of already described junction associated to "J2090_2351": + else {SP_name <- "new_comb_pg"} + } + + #SP07, SP18: + else if ("J2988_490" %in% combinaison) { + #SP07: + if ("J2450_2903" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP07" + } + #SP18: + else if ("J2474_2903" %in% combinaison & length(combinaison) == 2) { + SP_name <- "SP18" + } + #If there is another combination of already described junction associated to "J2988_490": + else {SP_name <- "new_comb_pg"} + } + + #New combinaisons of multiple already described junctions: + else {SP_name <- "new_comb_pg"} + } + + #SP from preS1, preS2/2: + else if (promoter == "preS1" | promoter == "preS2/S") { + if (str_detect(str_c(combinaison, collapse = "_"), "new")) { + SP_name <- "new_SP_HBs" + } + else { SP_name <- "new_comb_HBs"} + } + + #SP from HBx: + else if (promoter == "HBx") { + if (str_detect(str_c(combinaison, collapse = "_"), "new")) { + SP_name <- "new_SP_HBx" + } + else { SP_name <- "new_comb_HBx"} + } + + #SP from Undefined: + else if (promoter == "Undefined") { + if (str_detect(str_c(combinaison, collapse = "_"), "new")) { SP_name <- "new_SP" } + else { SP_name <- "new_comb_UnC" } + } + + #Error case: + else { + SP_name <- "error" + } +return(SP_name) +} + +tmp <- multiple_junction %>% select(all_of(c("id", "junction", "promoter"))) + +# Create an empty data frame with appropriate column names +df_combinaison <- data.frame(id = character(0), SP_name = character(0)) + +# Function to process each read_id +process_read_id <- function(read_id) { + SP_name_computed <- SP_assignation_multiple(read_id, + tmp[tmp$id == read_id,]$junction, + tmp[tmp$id == read_id,]$promoter[1]) + data.frame(id = read_id, SP_name = SP_name_computed) +} + +if (length(multiple_junction$promoter) != 0) { + for (read_id in list_read_multiple) { + SP_name_computed <- SP_assignation_multiple(read_id, + tmp[tmp$id == read_id,]$junction, + tmp[tmp$id == read_id,]$promoter[1]) + res_vector <- data.frame(t(c(read_id, SP_name_computed))) + colnames(res_vector) <- colnames(df_combinaison) + df_combinaison <- rbind(df_combinaison, res_vector) + } + + df_combinaison <- df_combinaison[2:length(df_combinaison$id),] +} + +# Parallel processing using future_apply: +# results <- future_apply(multiple_junction$id, 2, FUN = process_read_id, +# future.seed = FALSE, future.progress = FALSE) +# Combine the results into a single data frame +# df_combinaison <- do.call(rbind, results) + +# Old loop: +# for (read_id in list_read_multiple) { +# SP_name_computed <- SP_assignation_multiple(read_id, +# tmp[tmp$id == read_id,]$junction, +# tmp[tmp$id == read_id,]$promoter[1]) +# res_vector <- data.frame(t(c(read_id, SP_name_computed))) +# colnames(res_vector) <- colnames(df_combinaison) +# df_combinaison <- rbind(df_combinaison, res_vector) +# } + +# df_combinaison <- df_combinaison[2:length(df_combinaison$id),] + +multiple_junction <- merge(multiple_junction, df_combinaison, by="id") + +df_SPvariants <- rbind(as.data.frame(single_junction), multiple_junction) + +SP_variant_unique <- df_SPvariants %>% select(id, SP_name) +SP_variant_unique <- SP_variant_unique[!duplicated(SP_variant_unique$id),] +# distinct(SP_variant_unique, id) + +write.table(df_SPvariants, + paste0(opt$barcode, "_identified_SPvariants.csv"), + row.names = FALSE, + sep = "\t", + quote = FALSE) + diff --git a/src/.docker_modules/r-bolero/1.1/Start_positions.R b/src/.docker_modules/r-bolero/1.1/Start_positions.R new file mode 100644 index 0000000..757ffa3 --- /dev/null +++ b/src/.docker_modules/r-bolero/1.1/Start_positions.R @@ -0,0 +1,228 @@ +#!/bin/Rscript +# Packages loading +library(dplyr, quietly = TRUE) +library(ggplot2, quietly = TRUE) +library(RColorBrewer, quietly = TRUE) +library(conflicted, quietly = TRUE) +library(optparse, quietly = TRUE) +library(tidyverse, quietly = TRUE) +library(future, quietly = TRUE) +library(future.apply) + +# Set up parallel processing plan +plan("multisession", workers = 18) + +# Résolution de conflits entre les bibliothèques dplyr et stats +conflict_prefer("filter", "dplyr") +conflict_prefer("lag", "dplyr") + +# source graphical theme: +source(file="src/ggplot_theme_publication-2.R") + +# Load Start_positions_count files: +option_list = list( + make_option(c("-i", "--input"), type="character", default=NULL, + help="input start position file (.txt)", metavar="character"), + make_option(c("-b", "--barcode"), type="character", default=NULL, + help="input barcode", metavar="character"), + make_option(c("-s", "--start-positions"), type = "character", + default = "103,117,276,1106,1221,1455,1632,2550,2968") +) + +opt_parser = OptionParser(option_list=option_list) +opt = parse_args(opt_parser) + +file_to_load <- opt$input +splitted <- strsplit(opt$input, split = "[/]")[[1]] +filename <- strsplit(splitted[length(splitted)], split = "[.]")[[1]][1] + +sam_bc01 <- read.table(file_to_load, header = F) +sam_bc01[3] <- rep(filename, length(sam_bc01[,1])) + +# Function to parse and arrange data: +parsingData <- function(df) { + binsize <- 10 + pos <- as.data.frame(table(df[,2])) + colnames(pos)[1] <- "Start" + + Start <- as.data.frame(as.factor(seq(0, 3300))) + colnames(Start)[1] = "Start" + + tmp <- dplyr::left_join(Start, pos) + tmp[is.na(tmp)] <- 0 + + tmp$Start <- as.numeric(tmp$Start) + + df2 <- as_tibble(tmp) %>% + dplyr::mutate(bin = round(Start/binsize)*binsize) %>% + group_by(bin) %>% + dplyr::summarize(nb_reads = sum(Freq, na.rm = T)) + df2[is.na(df2)] <- 0 + df2[3] <- rep(df[1,3], length(df2$bin)) + colnames(df2) <- c("Start_position", "nb_reads", "Barcode") + df2 +} + +df_parsed <- parsingData(sam_bc01) + +ggplot(df_parsed, aes(Start_position, nb_reads)) + + geom_area(alpha = 0.5, fill = "blue") + + scale_y_sqrt() + + facet_wrap(facets = vars(df_parsed$Barcode)) + scale_x_continuous(breaks = c(0, 127, 1114, 1490, 2554, 2732, 2907, 3421), + label = c("1692", "1819", "2806", "EcoRI", "1065", + "1243", "1418", "1932")) + + scale_fill_discrete(name="") + + scale_colour_Publication() + theme_Publication()+ + theme(axis.text.x = element_text(angle = 45, hjust=1) + ) + +ggsave(paste0(filename,".png"), + plot = last_plot(), + scale = 2, + width = 1920, + height = 1080, + units = "px", + dpi = 300, +) + +# Classify reads based on start-position: + +# Separate preCore & pg: +classify_reads <- function(read_info) { + if (read_info <= 103) { + promoter <- "preCore" + } + else if (read_info >= 117 & + read_info <= 276) { + promoter <- "pgRNA" + } + else if (read_info >= 1106 & + read_info <= 1221 ) { + promoter <- "preS1" + } + else if (read_info >= 1455 & + read_info <= 1632 ) { + promoter <- "preS2/S" + } + else if (read_info >= 2550 & + read_info <= 2968 ) { + promoter <- "HBx" + } + else promoter <- "Undefined" +} + +colnames(sam_bc01) <- c("read_ID", "start_position", "barcode") +sam_bc01$promoter <- future_sapply(sam_bc01$start_position, + classify_reads) + +write.table(sam_bc01, + file = paste0(opt$barcode, "_classification_of_reads_per_RNA.txt"), + quote = FALSE, + sep = "\t", + row.names = FALSE) + +# Compute Reads number per promoters: +list_name_samples <- list(filename) + +count_promoter_reads <- function(barcode, df) { + tmpdf <- as.data.frame(df) + tmpdf <- tmpdf[tmpdf$Barcode == barcode,] + preCore <- sum(tmpdf$nb_reads[tmpdf$Start_position <= 103]) + pgRNA <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 117 & + tmpdf$Start_position <= 276]) + preS1 <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1106 & + tmpdf$Start_position <= 1221]) + preS2S <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1455 & + tmpdf$Start_position <= 1632]) + HBx <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 2550 & + tmpdf$Start_position <= 2968]) + total <- sum(preCore, pgRNA, preS1, preS2S, HBx) + res <- c(preCore/total*100, pgRNA/total*100, preS1/total*100, + preS2S/total*100, HBx/total*100, total) + return(res) +} + +abscount_promoter_reads <- function(barcode, df) { + tmpdf <- as.data.frame(df) + tmpdf <- tmpdf[tmpdf$Barcode == barcode,] + preCore <- sum(tmpdf$nb_reads[tmpdf$Start_position <= 103]) + pgRNA <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 117 & + tmpdf$Start_position <= 276]) + preS1 <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1106 & + tmpdf$Start_position <= 1221]) + preS2S <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1455 & + tmpdf$Start_position <= 1632]) + HBx <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 2550 & + tmpdf$Start_position <= 2968]) + total <- sum(preCore, pgRNA, preS1, preS2S, HBx) + res <- c(preCore, pgRNA, preS1, preS2S, + HBx, total) + return(res) +} + +promoters <- factor(c("preCore", "pgRNA", "preS1", "preS2/S", "HBx"), + levels = c("preCore", "pgRNA", "preS1", "preS2/S", "HBx")) + +abs_count_reads <- data.frame() +abs_count_reads <- future_sapply(list_name_samples, + abscount_promoter_reads, + df_parsed) +abs_count_reads <- cbind(c(as.vector(promoters),"total"), abs_count_reads) +colnames(abs_count_reads) <- c("promoter", "read_number") + +write.table(abs_count_reads, + file = paste0(opt$barcode, "_count_reads_per_promoter.tsv"), + quote = FALSE, + sep = "\t", + row.names = FALSE) + +resultats_start_promoters <- future_lapply(list_name_samples, + count_promoter_reads, + df_parsed) + +resultats_start_promoters <- as.data.frame(do.call(cbind, + resultats_start_promoters)) +totalCountSample <- as.data.frame(resultats_start_promoters[6,]) +colnames(totalCountSample) <- c(filename) +resultats_start_promoters <- as.data.frame(resultats_start_promoters[1:5,]) +colnames(resultats_start_promoters) <- as.vector(list_name_samples) +resultats_start_promoters <- cbind(promoters, resultats_start_promoters) +formated_start_promoters <- pivot_longer(resultats_start_promoters, + cols = c(filename), + names_to = "Barcodes", + values_to = "nb_reads") + +mycolors <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A") + +plot_camembert <- function(barcode, df, tot) { + camembert <- ggplot(df[df$Barcodes == barcode,], aes(x = barcode, + y = nb_reads, + fill=promoters)) + + geom_col() + + coord_polar("y") + + scale_fill_manual(values = mycolors) + + labs(title = paste0("#reads = ", tot[1,barcode]), + x=element_blank(), + y=element_blank()) + + scale_fill_discrete(name="") + + scale_colour_Publication() + theme_Publication()+ + theme(axis.text.x = element_text(angle = 45, hjust=1) + ) + + print(camembert) + + ggsave(filename = paste0("./", opt$barcode, + "_reads_start_promoters_piechart.png"), + plot = last_plot(), + scale = 1, + width = 1920, + height = 1080, + units = "px", + dpi = 300) +} + +future_lapply(list_name_samples, + plot_camembert, + formated_start_promoters, + totalCountSample) diff --git a/src/.docker_modules/r-bolero/1.1/docker_init.sh b/src/.docker_modules/r-bolero/1.1/docker_init.sh new file mode 100755 index 0000000..caa62b7 --- /dev/null +++ b/src/.docker_modules/r-bolero/1.1/docker_init.sh @@ -0,0 +1,2 @@ +docker build src/.docker_modules/r-bolero/1.1 -t 'xgrand/r-bolero:1.1' +docker push xgrand/r-bolero:1.1 diff --git a/src/.docker_modules/r-bolero/1.1/ggplot_theme_Publication-2.R b/src/.docker_modules/r-bolero/1.1/ggplot_theme_Publication-2.R new file mode 100644 index 0000000..953db35 --- /dev/null +++ b/src/.docker_modules/r-bolero/1.1/ggplot_theme_Publication-2.R @@ -0,0 +1,165 @@ + + +theme_Publication <- function(base_size=14, base_family="sans") { + library(grid) + library(ggthemes) + (theme_foundation(base_size=base_size, base_family=base_family) + + theme(plot.title = element_text(face = "bold", + size = rel(1.2), hjust = 0.5, margin = margin(0,0,20,0)), + text = element_text(), + panel.background = element_rect(colour = NA), + plot.background = element_rect(colour = NA), + panel.border = element_rect(colour = NA), + axis.title = element_text(face = "bold",size = rel(1)), + axis.title.y = element_text(angle=90,vjust =2), + axis.title.x = element_text(vjust = -0.2), + axis.text = element_text(), + axis.line.x = element_line(colour="black"), + axis.line.y = element_line(colour="black"), + axis.ticks = element_line(), + panel.grid.major = element_line(colour="#f0f0f0"), + panel.grid.minor = element_blank(), + legend.key = element_rect(colour = NA), + legend.position = "bottom", + legend.direction = "horizontal", + legend.box = "vetical", + legend.key.size= unit(0.5, "cm"), + #legend.margin = unit(0, "cm"), + legend.title = element_text(face="italic"), + plot.margin=unit(c(10,5,5,5),"mm"), + strip.background=element_rect(colour="#f0f0f0",fill="#f0f0f0"), + strip.text = element_text(face="bold") + )) + +} + +scale_fill_Publication <- function(...){ + library(scales) + discrete_scale("fill","Publication",manual_pal(values = c("#386cb0","#f87f01","#7fc97f","#ef3b2c","#feca01","#a6cee3","#fb9a99","#984ea3","#8C591D")), ...) + +} + +scale_colour_Publication <- function(...){ + library(scales) + discrete_scale("colour","Publication",manual_pal(values = c("#386cb0","#f87f01","#7fc97f","#ef3b2c","#feca01","#a6cee3","#fb9a99","#984ea3","#8C591D")), ...) + +} + + +### Dark theme for ggplot plots + +theme_dark_grey <- function(base_size=14, base_family="sans") { + library(grid) + library(ggthemes) + (theme_foundation(base_size=base_size, base_family=base_family) + + theme(plot.title = element_text(face = "bold", colour = '#ffffb3', + size = rel(1.2), hjust = 0.5, margin = margin(0,0,20,0)), + text = element_text(), + panel.background = element_rect(colour = NA, fill = 'grey20'), + plot.background = element_rect(colour = NA, fill = '#262626'), + panel.border = element_rect(colour = NA), + axis.title = element_text(face = "bold",size = rel(1), colour = 'white'), + axis.title.y = element_text(angle=90,vjust =2), + axis.title.x = element_text(vjust = -0.2), + axis.text = element_text(colour = 'grey70'), + axis.line.x = element_line(colour="grey70"), + axis.line.y = element_line(colour="grey70"), + axis.ticks = element_line(colour="grey70"), + panel.grid.major = element_line(colour="#262626"), + panel.grid.minor = element_blank(), + legend.background = element_rect(fill ='#262626'), + legend.text = element_text(color = 'white'), + legend.key = element_rect(colour = NA, fill = '#262626'), + legend.position = "bottom", + legend.direction = "horizontal", + legend.box = "vetical", + legend.key.size= unit(0.5, "cm"), + #legend.margin = unit(0, "cm"), + legend.title = element_text(face="italic", colour = 'white'), + plot.margin=unit(c(10,5,5,5),"mm"), + strip.background=element_rect(colour="#2D3A4C",fill="#2D3A4C"), + strip.text = element_text(face="bold", colour = 'white') + )) +} + +scale_fill_Publication_dark <- function(...){ + library(scales) + discrete_scale("fill","Publication",manual_pal(values = c("#fbb4ae","#b3cde3","#ccebc5","#decbe4","#fed9a6","#ffffcc","#e5d8bd","#fddaec","#f2f2f2")), ...) + +} + +scale_colour_Publication_dark <- function(...){ + library(scales) + discrete_scale("colour","Publication",manual_pal(values = c("#fbb4ae","#b3cde3","#ccebc5","#decbe4","#fed9a6","#ffffcc","#e5d8bd","#fddaec","#f2f2f2")), ...) + +} + + +# theme_transparent <- function(base_size=14, base_family="sans") { +# library(grid) +# library(ggthemes) +# (theme_foundation(base_size=base_size, base_family=base_family) +# + theme(plot.title = element_text(face = "bold", colour = '#ffffb3', +# size = rel(1.2), hjust = 0.5), +# text = element_text(), +# panel.background = element_rect(colour = NA, fill = 'transparent'), +# plot.background = element_rect(colour = NA, fill = 'transparent'), +# panel.border = element_rect(colour = NA), +# axis.title = element_text(face = "bold",size = rel(1), colour = 'white'), +# axis.title.y = element_text(angle=90,vjust =2), +# axis.title.x = element_text(vjust = -0.2), +# axis.text = element_text(colour = 'grey70'), +# axis.line.x = element_line(colour="grey70"), +# axis.line.y = element_line(colour="grey70"), +# axis.ticks = element_line(colour="grey70"), +# panel.grid.major = element_line(colour="#262626"), +# panel.grid.minor = element_blank(), +# legend.background = element_rect(fill = 'transparent'), +# legend.text = element_text(color = 'white'), +# legend.key = element_rect(colour = NA, fill = 'grey20'), +# legend.position = "bottom", +# legend.direction = "horizontal", +# legend.box = "vetical", +# legend.key.size= unit(0.5, "cm"), +# #legend.margin = unit(0, "cm"), +# legend.title = element_text(face="italic", colour = 'white'), +# plot.margin=unit(c(10,5,5,5),"mm"), +# strip.background=element_rect(colour="#2D3A4C",fill="#2D3A4C"), +# strip.text = element_text(face="bold", colour = 'white') +# )) +# } + +theme_dark_blue <- function(base_size=14, base_family="sans") { + library(grid) + library(ggthemes) + (theme_foundation(base_size=base_size, base_family=base_family) + + theme(plot.title = element_text(face = "bold", colour = '#ffffb3', + size = rel(1.2), hjust = 0.5, margin = margin(0,0,20,0)), + text = element_text(), + panel.background = element_rect(colour = NA, fill = '#282C33'), + plot.background = element_rect(colour = NA, fill = '#282C33'), + panel.border = element_rect(colour = NA), + axis.title = element_text(face = "bold",size = rel(1), colour = 'white'), + axis.title.y = element_text(angle=90,vjust =2), + axis.title.x = element_text(vjust = -0.2), + axis.text = element_text(colour = 'grey70'), + axis.line.x = element_line(colour="grey70"), + axis.line.y = element_line(colour="grey70"), + axis.ticks = element_line(colour="grey70"), + panel.grid.major = element_line(colour="#343840"), + panel.grid.minor = element_blank(), + legend.background = element_rect(fill ='#282C33'), + legend.text = element_text(color = 'white'), + legend.key = element_rect(colour = NA, fill = '#282C33'), + legend.position = "bottom", + legend.direction = "horizontal", + legend.box = "vetical", + legend.key.size= unit(0.5, "cm"), + #legend.margin = unit(0, "cm"), + legend.title = element_text(face="italic", colour = 'white'), + plot.margin=unit(c(10,5,5,5),"mm"), + strip.background=element_rect(colour="#2D3A4C",fill="#2D3A4C"), + strip.text = element_text(face="bold", colour = 'white'), + axis.text.x = element_text(angle = 45) + )) +} \ No newline at end of file -- GitLab