From a70b5adfb8d22bbbcd3bc3ea103657d241f89823 Mon Sep 17 00:00:00 2001
From: aliarifki <aliarifki@outlook.fr>
Date: Thu, 25 May 2023 11:17:38 +0200
Subject: [PATCH] Ajout des scripts R

---
 src/.docker_modules/r-scripts/1.0/Dockerfile  |   6 +
 .../r-scripts/1.0/HBV_RNAs_count_2.R          | 290 +++++++++++++
 .../r-scripts/1.0/Install_packages.R          |   3 +
 .../r-scripts/1.0/Junctions_NanoSplicer_2.R   | 399 ++++++++++++++++++
 .../r-scripts/1.0/docker_init.sh              |   2 +
 .../1.0/start_positions_individuals_2.R       | 213 ++++++++++
 src/nf_modules/junction_nanosplicer/main.nf   |  26 ++
 src/nf_modules/rna_count/main.nf              |   0
 src/nf_modules/start_positions/main.nf        |  25 ++
 9 files changed, 964 insertions(+)
 create mode 100644 src/.docker_modules/r-scripts/1.0/Dockerfile
 create mode 100755 src/.docker_modules/r-scripts/1.0/HBV_RNAs_count_2.R
 create mode 100644 src/.docker_modules/r-scripts/1.0/Install_packages.R
 create mode 100644 src/.docker_modules/r-scripts/1.0/Junctions_NanoSplicer_2.R
 create mode 100755 src/.docker_modules/r-scripts/1.0/docker_init.sh
 create mode 100755 src/.docker_modules/r-scripts/1.0/start_positions_individuals_2.R
 create mode 100644 src/nf_modules/junction_nanosplicer/main.nf
 create mode 100644 src/nf_modules/rna_count/main.nf
 create mode 100644 src/nf_modules/start_positions/main.nf

diff --git a/src/.docker_modules/r-scripts/1.0/Dockerfile b/src/.docker_modules/r-scripts/1.0/Dockerfile
new file mode 100644
index 0000000..dfadf74
--- /dev/null
+++ b/src/.docker_modules/r-scripts/1.0/Dockerfile
@@ -0,0 +1,6 @@
+FROM rocker/r-base:4.2.3
+
+## copy Rscript files
+COPY ./*.R .
+
+RUN Rscript Install_packages.R
diff --git a/src/.docker_modules/r-scripts/1.0/HBV_RNAs_count_2.R b/src/.docker_modules/r-scripts/1.0/HBV_RNAs_count_2.R
new file mode 100755
index 0000000..0010d51
--- /dev/null
+++ b/src/.docker_modules/r-scripts/1.0/HBV_RNAs_count_2.R
@@ -0,0 +1,290 @@
+#!/bin/Rscript
+# Packages installation
+library(ggplot2, quietly = TRUE)
+library(tidyr, quietly = TRUE)
+library(plyr, quietly = TRUE)
+library(dplyr, quietly = TRUE)
+library(stringr, quietly = TRUE)
+library(RColorBrewer)
+library(optparse)
+
+# Load files
+option_list = list(
+  make_option(c("-s", "--SPvariants"), type="character", default=NULL, 
+              help="input identified SP variants table (.csv)", metavar="character"),
+  make_option(c("-c", "--classification"), type="character", default=NULL, 
+              help="input classification of reads file (.txt)", metavar="character"))
+opt_parser = OptionParser(option_list=option_list)
+opt = parse_args(opt_parser)
+
+# vectors of species & Palettes Colors:
+SPvariants <- c("SP01", "SP02", "SP03", "SP04", "SP05", "SP06", "SP07", "SP08",
+                "SP09", "SP10", "SP11", "SP12", "SP13", "SP14", "SP15", "SP16",
+                "SP17", "SP18", "SP19", "SP20", "SP21", "SP22")
+SPvariants <- factor(SPvariants, levels = SPvariants)
+colors_SP1_22 <- c("#FADD4E", "#FBE0C3", "#C8FF94", "#9ED7C3", "#CCEDE3", 
+                   "#A9E5F0", "#A9C8E6", "#BBC7DF", "#D8C8EB", "#FFBADC", 
+                   "#E2C9B5", "#E2B5AB", "#CAC4C5", "#D7D9DF", "#F9B7BA", 
+                   "#99DDF9", "#ABA7D1", "#B26572", "#FB9651", "#4F7E7E", 
+                   "#4FBCBA", "#608AD2")
+palette_SP1_22 <- data.frame(nom = SPvariants, teinte = colors_SP1_22, 
+                             stringsAsFactors = FALSE)
+
+TSS_species <- c("preCore", "pgRNA", "preS1", "preS2/S", "HBx")
+TSS_species <- factor(TSS_species, levels = TSS_species)
+colors_RNAs_species <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A")
+palette_TSS <- data.frame(nom = TSS_species, teinte = colors_RNAs_species, 
+                          stringsAsFactors = FALSE)
+
+new_SP_candidates <- c("new_comb_pg", "new_comb_HBs", "new_comb_HBx", 
+                       "new_comb_UnC", "new_SP", "new_SP_HBs", "Undefined")
+new_SP_candidates <- factor(new_SP_candidates, levels = new_SP_candidates)
+colors_newSP <- c("#AC0E0D", "#CB674F", "#AD565C", "#923222", "#C78A77", 
+                  "#A92322", "#964B34")
+palette_new_SP <- data.frame(nom = new_SP_candidates, teinte = colors_newSP, 
+                             stringsAsFactors = FALSE)
+
+SPxx <- c("SPvariants")
+SPxx <- factor(SPxx, levels = SPxx)
+color_SPxx <- c("#33C5FF")
+palette_SPxx <- data.frame(nom = SPxx, 
+                           teinte = color_SPxx, 
+                           stringsAsFactors = FALSE)
+
+all_species_name <- c(TSS_species, SPvariants, new_SP_candidates, SPxx)
+
+palette_complete <- rbind.data.frame(palette_TSS, 
+                                     palette_SP1_22, 
+                                     palette_new_SP,
+                                     palette_SPxx,
+                                     stringsAsFactors = FALSE)
+
+# Load Start_positions_count files:
+identified_SP <- read.table(file = opt$SPvariants[1],
+                            header = TRUE)
+
+clean_SP <- identified_SP[!duplicated(identified_SP$id),] %>% 
+  select(id, mapQ, transcript_strand, JAQ, barcode, promoter, junction, SP_name)
+
+countSP <- clean_SP %>% count(SP_name)
+countSP <- dplyr::inner_join(palette_complete, 
+                      countSP, 
+                      by = c("nom" = "SP_name"))
+#print(names(palette_complete))
+#print(names(countSP))
+
+countSP$nom <- factor(countSP$nom, levels = all_species_name)
+countSP <- mutate(countSP,
+                  proportion = (as.numeric(n)/sum(as.numeric(n))*100))
+#print(countSP)
+ggplot(countSP, aes(x = "percent", 
+                    y = proportion,
+                    fill = nom)) +
+  geom_col() +
+  coord_polar("y") +
+  scale_fill_manual(values = countSP$teinte) +
+  labs(fill = "spliced-variants")
+
+ggsave(file = "SP_proportion_camembert.png",
+       scale = 2,
+       width = 1920,
+       height = 1080,
+       units = "px",
+       dpi = 300)
+
+ggplot(countSP, aes(x = nom, y = proportion, fill = nom)) +
+  geom_col() +
+  scale_fill_manual(values = countSP$teinte) + 
+  theme(axis.text.x = element_text(angle = 45)) +
+  labs(fill = "spliced-variants") +
+  xlab(label = "spliced-variants") +
+  ylab(label = "percent")
+
+ggsave(file = "SP_proportion.png",
+       scale = 2,
+       width = 1920,
+       height = 1080,
+       units = "px",
+       dpi = 300)
+
+# TSS not spliced:
+classified_reads <- read.table(file = opt$classification[1], 
+                               header = TRUE)
+
+not_spliced <- classified_reads[!(classified_reads$read_ID %in% clean_SP$id),]
+not_spliced <- mutate(not_spliced,
+                      species = not_spliced$promoter)
+#print(not_spliced)
+not_spliced <- not_spliced %>% select(read_ID, species)
+colnames(not_spliced) <- c("id", "species")
+
+clean_SP_type <- clean_SP %>% select(id, SP_name)
+colnames(clean_SP_type) <- c("id", "species")
+
+df_species <- rbind.data.frame(not_spliced, clean_SP_type, 
+                               stringsAsFactors = FALSE)
+count_species <- df_species %>% count(species)
+count_species <- mutate(count_species,
+                        percent = (as.numeric(n)/sum(as.numeric(n))*100))
+#print(count_species)
+write.table(df_species, file = "All_reads_identified.csv", 
+            sep = "\t", quote = FALSE, row.names = FALSE)
+
+# Null dataset:
+species <- c(TSS_species, SPvariants, new_SP_candidates)
+
+canonical_species <- TSS_species[c(1:3,5)]
+
+null_count <- data.frame(species = species, 
+                         n = rep(0, times = length(species)),
+                         percent = rep(0, times = length(species)), 
+                         stringsAsFactors = FALSE)
+
+null_count <- transform(null_count,
+                        n = as.integer(n),
+                        percent = as.numeric(percent))
+
+# Join:
+count_species <- rbind.data.frame(count_species, 
+                                  subset(null_count, 
+                                         !(species %in% count_species$species)),
+                                  stringsAsFactors = FALSE)
+
+# Merge all SP variants:
+count_species_SPxx <- data.frame(species = "SPvariants", 
+                                 n = sum(count_species[count_species$species 
+                                                       %in% SPvariants,]$n))
+count_species_SPxx <- rbind.data.frame(count_species_SPxx,
+                                       count_species[count_species$species 
+                                                     %in% c(canonical_species, 
+                                                            SPvariants), 1:2],
+                                       stringsAsFactors = FALSE)
+count_species_SPxx <- count_species_SPxx[count_species_SPxx$species %in% 
+                                           all_species_name[c(1:3,5,35)],]
+count_species_SPxx <- mutate(count_species_SPxx,
+                             percent=(as.numeric(n)/sum(as.numeric(n))*100))
+#print(count_species_SPxx)
+# save the tab:
+write.csv(count_species_SPxx, file = "Count_canonical_species_SPxx.csv")
+
+# prepare to plot:
+count_species_SPxx <- dplyr::inner_join(palette_complete,
+                                 count_species_SPxx,
+                                 by = c("nom" = "species"))
+#names(palette_complete)
+#names(count_species_SPxx)
+count_species_SPxx$nom <- factor(count_species_SPxx$nom, levels = all_species_name)
+
+# Save:
+write.csv(count_species, file = "Count_species.csv")
+
+# RNA species composition all species:
+count_species <- dplyr::inner_join(palette_complete, count_species,
+                                  by = c("nom" = "species"),
+                                  suffix = c("",""))
+count_species$nom <- factor(count_species$nom, levels = all_species_name)
+
+ggplot(count_species, 
+       aes(x = nom, 
+           y = percent,
+           fill = nom)) +
+  geom_col() +
+  scale_fill_manual(values = count_species$teinte) +
+  theme(axis.text.x = element_text(angle = 45)) +
+  labs(fill = "RNA species & spliced-variants") +
+  xlab(label = "RNA species & spliced-variants")
+
+ggsave(file = "Count_RNAs_species.png",
+       scale = 2,
+       width = 1920,
+       height = 1080,
+       units = "px",
+       dpi = 300)
+
+# Filter RNA species canonical + SPvariants:
+count_species_clear <- count_species[count_species$species %in% c(canonical_species, SPvariants),]
+
+# RNA species composition all species:
+count_species_clear <- dplyr::inner_join(palette_complete, count_species_clear,
+                            by = c("nom" = "nom"),
+                            suffix = c("",""))
+count_species_clear$nom <- factor(count_species_clear$nom, levels = all_species_name)
+
+ggplot(count_species_clear, 
+       aes(x = nom, 
+           y = percent,
+           fill = nom)) +
+  geom_col() +
+  scale_fill_manual(values = count_species_clear$teinte) +
+  theme(axis.text.x = element_text(angle = 45)) +
+  labs(fill = "RNA species & spliced-variants") +
+  xlab(label = "RNA species & spliced-variants")
+
+ggsave(file = "Count_RNAs_species_clear.png",
+       scale = 2,
+       width = 1920,
+       height = 1080,
+       units = "px",
+       dpi = 300)
+
+# SP composition clear:
+count_clear <- clean_SP[clean_SP$SP_name %in% SPvariants,] %>% count(SP_name)
+count_clear <- mutate(count_clear,
+                      proportion=(as.numeric(n)/sum(as.numeric(n))*100))
+#print(count_clear)
+count_clear <- dplyr::inner_join(palette_complete,
+                          count_clear, 
+                          by = c("nom" = "SP_name"))
+#names(count_clear)
+count_clear$nom <- factor(count_clear$nom, levels = all_species_name)
+
+ggplot(count_clear, aes(x = "percent", 
+                        y = proportion,
+                        fill = nom)) +
+  geom_col() +
+  coord_polar("y") +
+  scale_fill_manual(values = count_clear$teinte) +
+  labs(fill = "spliced-variants")
+
+ggsave(file = "SP_clear_proportion_camembert.png",
+       scale = 2,
+       width = 1920,
+       height = 1080,
+       units = "px",
+       dpi = 300)
+
+ggplot(count_clear, aes(x = nom, 
+                        y = proportion,
+                        fill = nom)) +
+  geom_col() +
+  scale_fill_manual(values = count_clear$teinte) + 
+  theme(axis.text.x = element_text(angle = 45)) +
+  labs(fill = "spliced-variants") +
+  xlab(label = "spliced-variants") +
+  ylab(label = "percent")
+
+ggsave(file = "SP_clear_proportion.png",
+       scale = 2,
+       width = 1920,
+       height = 1080,
+       units = "px",
+       dpi = 300)
+
+# Camembert SPxx:
+ggplot(count_species_SPxx, aes(x = "species", 
+                               y = percent,
+                               fill = nom)) +
+  geom_col() +
+  coord_polar("y") +
+  scale_fill_manual(values = count_species_SPxx$teinte) +
+  labs(fill = "TSS") +
+  ylab(label = "TSS usage") +
+  xlab(label = "percent")
+
+ggsave(file = "Count_RNAs_species_camembert.png",
+       scale = 2,
+       width = 1920,
+       height = 1080,
+       units = "px",
+       dpi = 300)
+
diff --git a/src/.docker_modules/r-scripts/1.0/Install_packages.R b/src/.docker_modules/r-scripts/1.0/Install_packages.R
new file mode 100644
index 0000000..384435c
--- /dev/null
+++ b/src/.docker_modules/r-scripts/1.0/Install_packages.R
@@ -0,0 +1,3 @@
+list.of.packages <- c("ggplot2", "tidyr", "plyr", "dplyr", "tidyverse", "stringr", "optparse", "RColorBrewer", "conflicted", "BiocManager", "resshape2", "R.utils")
+new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
+if(length(new.packages)) install.packages(new.packages, dependencies = T)
diff --git a/src/.docker_modules/r-scripts/1.0/Junctions_NanoSplicer_2.R b/src/.docker_modules/r-scripts/1.0/Junctions_NanoSplicer_2.R
new file mode 100644
index 0000000..720226d
--- /dev/null
+++ b/src/.docker_modules/r-scripts/1.0/Junctions_NanoSplicer_2.R
@@ -0,0 +1,399 @@
+#!/bin/Rscript
+
+################################################################################
+### NEED TO ADD A CASE OF NO SPLICED-VARIANTS ARE IDENTIFIED !!!!!!!!!!!!!!! ###
+################################################################################
+library(ggplot2, quietly = TRUE)
+library(tidyr, quietly = TRUE)
+library(plyr, quietly = TRUE)
+library(dplyr, quietly = TRUE)
+library(stringr, quietly = TRUE)
+library(optparse)
+
+# Load classification per promoter:
+option_list = list(
+  make_option(c("-c", "--classification"), type="character", default=NULL, 
+              help="input classification or reads file (.txt)", metavar="character"),
+  make_option(c("-j", "--jwr"), type="character", default=NULL, 
+              help="input nanosplicer results table (.csv)", metavar="character"))
+opt_parser = OptionParser(option_list=option_list)
+opt = parse_args(opt_parser)
+reads_pos <- read.table(opt$classification[1],
+                        sep = "\t")
+colnames(reads_pos) <- c("id", reads_pos[1,2:length(reads_pos[1,])])
+reads_pos <- reads_pos[2:length(reads_pos$id),]
+
+# Load Nanosplicer results:
+df <- read.csv(opt$jwr[1])
+colnames(df)[1] <- "juncNumber"
+
+# split donor and acceptor positions:
+df <- df %>%
+  separate(col = "loc",
+           into = c("donor", "acceptor"), sep = ", ",
+           extra = "warn")
+
+df$donor <- str_replace(df$donor, '[(]', '')
+df$acceptor <- str_replace(df$acceptor, '[)]', '')
+
+df <- mutate(df, 
+             pg_donor = as.numeric(donor)-122,
+             pg_acceptor = as.numeric(acceptor)-122)
+
+df <- merge(df, reads_pos, by = 'id')
+
+assignation_donor <- function(pg_donor) {
+  if (pg_donor >= 620) {
+    if (pg_donor < 643 & pg_donor >= 620) {
+      donor_site <- "J2450_"
+    }
+    else if (pg_donor <= 675 & pg_donor >= 643) {
+      donor_site <- "J2474_"
+    }
+    else if (pg_donor <= 1182 & pg_donor >= 1162) {
+      donor_site <- "J2988_"
+    }
+    else if (pg_donor <= 1835 & pg_donor >= 1815) {
+      donor_site <- "J461_"
+    }
+    else {
+      donor_site <- paste0("J", pg_donor, "new_")
+    }
+  }
+  else if (pg_donor <= 284 & pg_donor >= 264) {
+    donor_site <- "J2090_"
+  }
+  else if (pg_donor <= 261 & pg_donor >= 240) {
+    donor_site <- "J2070_"
+  }
+  else {
+    donor_site <- paste0("J", pg_donor, "new_")
+  }
+  return(donor_site)
+}
+
+assignation_acceptor <- function(pg_acceptor) {
+  if (pg_acceptor > 1868) {
+    if (pg_acceptor >= 2452 & pg_acceptor <= 2468) {
+      acceptor_site <- "3170"
+    }
+    else if (pg_acceptor >= 2661 & pg_acceptor <= 2685) {
+      acceptor_site <- "1306"
+    }
+    else if (pg_acceptor >= 2743 & pg_acceptor <= 2762) {
+      acceptor_site <- "1386"
+    }
+    else {
+      acceptor_site <- paste0(pg_acceptor,"new")
+    }
+  }
+  else if (pg_acceptor >= 1843 & pg_acceptor <= 1868) {
+    acceptor_site <- "490"
+  }
+  else if (pg_acceptor >= 1639 & pg_acceptor <= 1668) {
+    acceptor_site <- "283"
+  }
+  else if (pg_acceptor >= 1076 & pg_acceptor <= 1098) {
+    acceptor_site <- "2903"
+  }
+  else if (pg_acceptor >= 525 & pg_acceptor <= 547) {
+    acceptor_site <- "2351"
+  }
+  else if (pg_acceptor >= 410 & pg_acceptor <= 432) {
+    acceptor_site <- "2237"
+  }
+  else {
+    acceptor_site <- paste0(pg_acceptor,"new")
+  }
+  return(acceptor_site)
+}
+
+df$donor_site <- sapply(df$pg_donor, assignation_donor)
+df$acceptor_site <- sapply(df$pg_acceptor, assignation_acceptor)
+df <- mutate(df,
+             junction = paste0(donor_site, acceptor_site))
+
+write.table(df, file = "JWR_check_parsed.csv", row.names = FALSE, sep = "\t")
+
+duplicated2 <- function(x){
+  if (sum(dup <- duplicated(x))==0)
+    return(dup)
+  if (class(x) %in% c("data.frame","matrix"))
+    duplicated(rbind(x[dup,],x))[-(1:sum(dup))]
+  else duplicated(c(x[dup],x))[-(1:sum(dup))]
+}
+
+list_read_multiple <- unique(df[duplicated2(df$id),]$id)
+multiple_junction <- df[duplicated2(df$id),]
+single_junction <- df %>% dplyr::filter(!(id %in% list_read_multiple))
+
+SP_assignation_single <- function(site, promoter) {
+  if (promoter == "pgRNA" | promoter == "preCore") {
+    if (str_detect(site, "new")) { SP_name <- "new_SP" }
+    else if (site == "J2450_490") { SP_name <- "SP01" }
+    else if (site == "J2070_490") { SP_name <- "SP03" }
+    else if (site == "J2090_490") { SP_name <- "SP05" }
+    else if (site == "J2474_490") { SP_name <- "SP06" }
+    else if (site == "J2450_283") { SP_name <- "SP09" }
+    else if (site == "J2474_283") { SP_name <- "SP11" }
+    else if (site == "J2988_490") { SP_name <- "SP13" }
+    else if (site == "J2450_2903") { SP_name <- "SP14" }
+    else if (site == "J2070_283") { SP_name <- "SP17" }
+    else if (site == "J2988_3170") { SP_name <- "SP19" }
+    else if (site == "J2988_283") { SP_name <- "SP20" }
+    else { SP_name <- "new_comb_pg" }
+  }
+  else if (promoter == "preS1" | promoter == "preS2/S") {
+    if (str_detect(site, "new")) { SP_name <- "new_SP" }
+    else if (site == "J461_1306") { SP_name <- "SP21" }
+    else if (site == "J461_1386") { SP_name <- "SP22" }
+    else { SP_name <- "new_comb_HBs" }
+  }
+  else if (promoter == "HBx") {
+    if (str_detect(site, "new")) { SP_name <- "new_SP" }
+    else { SP_name <- "new_comb_HBx" }
+  }
+  else if (promoter == "Undefined") {
+    if (str_detect(site, "new")) { SP_name <- "new_SP" }
+    else { SP_name <- "new_comb_UnC" }
+  }
+  else { 
+    if (str_detect(site, "new")) { SP_name <- "new_SP" }
+    else { SP_name <- "error" }
+  }
+}
+
+single_junction <- ddply(single_junction,
+                         .(id), 
+                         mutate,
+                         SP_name = SP_assignation_single(junction, promoter))
+
+SP_assignation_multiple <- function(read_id, combinaison, promoter) {
+  if (promoter == "pgRNA" | promoter == "preCore") {
+    
+    # New junctions detection:
+    if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
+      SP_name <- "new_SP"
+    }
+    
+    #SP02, SP08, SP10, SP16:
+    else if ("J2070_2351" %in% combinaison) {
+      #SP02:
+      if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
+        SP_name <- "SP02"
+      }
+      
+      #SP08:
+      else if ("J2450_2903" %in% combinaison & "J2988_490" %in% combinaison &
+               length(combinaison) == 3) {
+        SP_name <- "SP08"
+      }
+      
+      #SP10:
+      else if ("J2450_283" %in% combinaison & length(combinaison) == 2) {
+        SP_name <- "SP10"
+      }
+      
+      #SP16:
+      else if ("J2474_490" %in% combinaison & length(combinaison) == 2) {
+        SP_name <- "SP16"
+      }
+      
+      #If there is another combination of already described junction associated to "J2070_2351":
+      else {SP_name <- "new_comb_pg"}
+    }
+    
+    #SP12, SP15:
+    else if ("J2070_2237" %in% combinaison) {
+      #SP12:
+      if ("J2450_283" %in% combinaison & length(combinaison) == 2) {
+        SP_name <- "SP12"
+      }
+      else if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
+        SP_name <- "SP15"
+      }
+      #If there is another combination of already described junction associated to "J2070_2237":
+      else {SP_name <- "new_comb_pg"}
+    }
+    
+    #SP04:
+    else if ("J2090_2351" %in% combinaison) {
+      if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
+        SP_name <- "SP04"
+      }
+      #If there is another combination of already described junction associated to "J2090_2351":
+      else {SP_name <- "new_comb_pg"}
+    }
+    
+    #SP07, SP18:
+    else if ("J2988_490" %in% combinaison) {
+      #SP07:
+      if ("J2450_2903" %in% combinaison & length(combinaison) == 2) {
+        SP_name <- "SP07"
+      }
+      #SP18:
+      else if ("J2474_2903" %in% combinaison & length(combinaison) == 2) {
+        SP_name <- "SP18"
+      }
+      #If there is another combination of already described junction associated to "J2988_490":
+      else {SP_name <- "new_comb_pg"}
+    }
+    
+    #New combinaisons of multiple already described junctions:
+    else {SP_name <- "new_comb_pg"}
+  }
+  
+  #SP from preS1, preS2/2:
+  else if (promoter == "preS1" | promoter == "preS2/S") {
+    if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
+      SP_name <- "new_SP_HBs"
+    }
+    else { SP_name <- "new_comb_HBs"}
+  }
+  
+  #SP from HBx:
+  else if (promoter == "HBx") {
+    if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
+        SP_name <- "new_SP_HBx"
+      }
+    else { SP_name <- "new_comb_HBx"}
+  }
+  
+  #SP from Undefined:
+  else if (promoter == "Undefined") {
+    if (str_detect(str_c(combinaison, collapse = "_"), "new")) { SP_name <- "new_SP" }
+    else { SP_name <- "new_comb_UnC" }
+  }
+  
+  #Error case:
+  else {
+  SP_name <- "error"
+  }
+return(SP_name)
+}
+
+tmp <- multiple_junction %>% select(id, junction, promoter)
+df_combinaison <- data.frame(matrix(nrow = 0, ncol = 2))
+colnames(df_combinaison) <- c("id", "SP_name")
+
+for (read_id in list_read_multiple) {
+  SP_name_computed <- SP_assignation_multiple(read_id, tmp[tmp$id == read_id,]$junction, tmp[tmp$id == read_id,]$promoter[1])
+  res_vector <- data.frame(t(c(read_id, SP_name_computed)))
+  colnames(res_vector) <- colnames(df_combinaison)
+  df_combinaison <- rbind(df_combinaison, res_vector)
+}
+
+# df_combinaison <- df_combinaison[2:length(df_combinaison$id),]
+
+multiple_junction <- merge(multiple_junction, df_combinaison, by="id")
+
+df_SPvariants <- rbind(single_junction, multiple_junction)
+
+SP_variant_unique <- df_SPvariants %>% select(id, SP_name)
+SP_variant_unique <- SP_variant_unique[!duplicated(SP_variant_unique$id),] # distinct(SP_variant_unique, id)
+
+write.table(df_SPvariants, 
+            "identified_SPvariants.csv", 
+            row.names = FALSE, 
+            sep = "\t", 
+            quote = FALSE)
+
+ggplot(df, aes(x=pg_donor)) +
+  geom_histogram(aes(y=after_stat(density)),color="darkblue", fill="lightblue") +
+  geom_density(alpha=.2, fill="lightblue") +
+  geom_vline(aes(xintercept=median(pg_donor)),
+              color="blue", linetype="dashed", linewidth=1) +
+  geom_vline(aes(xintercept=quantile(pg_donor, 0.025)),
+           linetype="dashed", linewidth=0.25) +
+  geom_vline(aes(xintercept=quantile(pg_donor, 0.975)),
+           linetype="dashed", linewidth=0.25) +
+  geom_vline(aes(xintercept=quantile(pg_donor, 0.01)),
+             color="green", linetype="dashed", linewidth=0.25) +
+  geom_vline(aes(xintercept=quantile(pg_donor, 0.99)),
+             color="green", linetype="dashed", linewidth=0.25) +
+  geom_vline(aes(xintercept=(median(pg_donor)+sd(pg_donor))),
+             color="red", linewidth=0.5) +
+  geom_vline(aes(xintercept=(median(pg_donor)-sd(pg_donor))),
+             color="red", linewidth=0.5) +
+  scale_x_continuous(breaks=c(min(df$pg_donor), 
+                              quantile(df$pg_donor, 0.025),
+                              quantile(df$pg_donor, 0.005),
+                              median(df$pg_donor)-sd(df$pg_donor),
+                              median(df$pg_donor),
+                              median(df$pg_donor)+sd(df$pg_donor),
+                              quantile(df$pg_donor, 0.975),
+                              quantile(df$pg_donor, 0.995),
+                              max(df$pg_donor)),
+                     label = c(min(df$pg_donor),
+                               floor(quantile(df$pg_donor, 0.025)),
+                               floor(quantile(df$pg_donor, 0.005)),
+                               round(median(df$pg_donor)-sd(df$pg_donor)),
+                               median(df$pg_donor),
+                               round(median(df$pg_donor)+sd(df$pg_donor)),
+                               floor(quantile(df$pg_donor, 0.975))+1,
+                               round(quantile(df$pg_donor, 0.995))+1,
+                               max(df$pg_donor))) +
+  theme(axis.text.x = element_text(angle = 45))
+
+ggsave(filename = "Donor_curve.png",
+       device = "png",
+       scale = 1,
+       width = 1920,
+       height = 1080,
+       units = "px",
+       dpi = 320)
+
+ggplot(df, aes(x=pg_acceptor)) +
+  geom_histogram(aes(y=after_stat(density)),color="red", fill="darksalmon") +
+  geom_density(alpha=.2, fill="darksalmon") +
+  geom_vline(aes(xintercept=median(pg_acceptor)),
+             color="red", linetype="dashed", linewidth=1) +
+  geom_vline(aes(xintercept=quantile(pg_acceptor, 0.025)),
+           linetype="dashed", linewidth=0.25) +
+  geom_vline(aes(xintercept=quantile(pg_acceptor, 0.975)),
+           linetype="dashed", linewidth=0.25) +
+  geom_vline(aes(xintercept=quantile(pg_acceptor, 0.005)),
+             color="green", linetype="dashed", linewidth=0.25) +
+  geom_vline(aes(xintercept=quantile(pg_acceptor, 0.995)),
+             color="green", linetype="dashed", linewidth=0.25) +
+  geom_vline(aes(xintercept=(median(pg_acceptor)+sd(pg_acceptor))),
+             color="blue", linewidth=0.5) +
+  geom_vline(aes(xintercept=(median(pg_acceptor)-sd(pg_acceptor))),
+             color="blue", linewidth=0.5) +
+  scale_x_continuous(breaks=c(min(df$pg_acceptor), 
+                              quantile(df$pg_acceptor, 0.025),
+                              quantile(df$pg_acceptor, 0.005),
+                              median(df$pg_acceptor)-sd(df$pg_acceptor),
+                              median(df$pg_acceptor),
+                              median(df$pg_acceptor)+sd(df$pg_acceptor),
+                              quantile(df$pg_acceptor, 0.975),
+                              quantile(df$pg_acceptor, 0.995),
+                              max(df$pg_acceptor)),
+                       label = c(min(df$pg_acceptor),
+                                 floor(quantile(df$pg_acceptor, 0.025)),
+                                 floor(quantile(df$pg_acceptor, 0.005)),
+                                 round(median(df$pg_acceptor)-sd(df$pg_acceptor)),
+                                 median(df$pg_acceptor),
+                                 round(median(df$pg_acceptor)+sd(df$pg_acceptor)),
+                                 floor(quantile(df$pg_acceptor, 0.975))+1,
+                                 floor(quantile(df$pg_acceptor, 0.995))+1,
+                                 max(df$pg_acceptor))) +
+  theme(axis.text.x = element_text(angle = 45))
+
+ggsave(filename = "Acceptor_curve.png",
+       device = "png",
+       scale = 1,
+       width = 1920,
+       height = 1080,
+       units = "px",
+       dpi = 320)
+
+# Graphs and tests:
+
+# sink("test_shapiro.txt")
+# print("Normality test: Shapiro-Wilk")
+# print("Donor site:")
+# print(shapiro.test(df$pg_donor))
+# print("Acceptor site:")
+# print(shapiro.test(df$pg_acceptor))
+# sink()
\ No newline at end of file
diff --git a/src/.docker_modules/r-scripts/1.0/docker_init.sh b/src/.docker_modules/r-scripts/1.0/docker_init.sh
new file mode 100755
index 0000000..b0b0edb
--- /dev/null
+++ b/src/.docker_modules/r-scripts/1.0/docker_init.sh
@@ -0,0 +1,2 @@
+docker build src/.docker_modules/r-scripts/1.0 -t 'xgrand/r-scripts:1.0'
+docker push xgrand/r-scripts:1.0
diff --git a/src/.docker_modules/r-scripts/1.0/start_positions_individuals_2.R b/src/.docker_modules/r-scripts/1.0/start_positions_individuals_2.R
new file mode 100755
index 0000000..31b4e61
--- /dev/null
+++ b/src/.docker_modules/r-scripts/1.0/start_positions_individuals_2.R
@@ -0,0 +1,213 @@
+#!/bin/Rscript
+# Packages installation
+library(dplyr)
+library(ggplot2)
+library(RColorBrewer)
+library(conflicted)
+library(optparse)
+library(tidyverse)
+
+# RÃ©solution de conflits entre les bibliothÃ¨ques dplyr et stats
+conflict_prefer("filter", "dplyr")
+conflict_prefer("lag", "dplyr")
+
+# Load Start_positions_count files:
+option_list = list(
+  make_option(c("-i", "--input"), type="character", default=NULL, 
+              help="input start position file (.txt)", metavar="character")
+)
+
+opt_parser = OptionParser(option_list=option_list)
+opt = parse_args(opt_parser)
+#opt = ("/home/alia/scripts/Start_position/Start_positions_counts.txt")
+
+#list_file <- list.files(path=".", 
+#                        pattern="*.txt", 
+#                        all.files=FALSE, 
+#                        full.names=FALSE)
+file_to_load <- opt$input[1]
+splitted <- strsplit(opt$input[1], split = "[/]")[[1]]
+filename <- strsplit(splitted[length(splitted)], split = "[.]")[[1]][1]
+
+sam_bc01 <- read.table(file_to_load, header = F)
+sam_bc01[3] <- rep(filename, length(sam_bc01[,1]))
+
+# Function to parse and arrange data:
+parsingData <- function(df) {
+  binsize <- 10
+  pos <- as.data.frame(table(df[,2]))
+  colnames(pos)[1] <- "Start"
+  
+  Start <- as.data.frame(as.factor(seq(0, 3300)))
+  colnames(Start)[1] = "Start"
+  
+  tmp <- dplyr::left_join(Start, pos)
+  tmp[is.na(tmp)] <- 0
+  
+  tmp$Start <- as.numeric(tmp$Start)
+  
+  df2 <- as_tibble(tmp) %>% 
+    mutate(bin = round(Start/binsize)*binsize) %>% 
+    group_by(bin) %>% 
+    summarize(nb_reads = sum(Freq, na.rm = T))
+  df2[is.na(df2)] <- 0
+  df2[3] <- rep(df[1,3], length(df2$bin))
+  colnames(df2) <- c("Start_position", "nb_reads", "Barcode")
+  df2
+}
+
+df_parsed <- parsingData(sam_bc01)
+
+ggplot(df_parsed, aes(Start_position, nb_reads)) + 
+  geom_area(alpha = 0.5, fill = "blue") + 
+  scale_y_sqrt() +
+  facet_wrap(facets = vars(df_parsed$Barcode)) +
+  theme_light()+
+  scale_x_continuous(breaks = c(0, 127, 1114, 1490, 2554, 2732, 2907, 3421),
+                     label = c("1692", "1819", "2806", "EcoRI", "1065", 
+                               "1243", "1418", "1932")) + 
+  theme(axis.text.x = element_text(angle = 45)
+  )
+
+ggsave(paste0(filename,".jpg"),
+       plot = last_plot(),
+       scale = 2,
+       width = 1920,
+       height = 1080,
+       units = "px",
+       dpi = 300,
+)
+
+# Classify reads based on start-position:
+
+# Separate preCore & pg:
+classify_reads <- function(read_info) {
+  if (read_info <= 103) {
+    promoter <- "preCore"
+  }
+  else if (read_info >= 117 &
+           read_info <= 276) {
+    promoter <- "pgRNA"
+  }
+  else if (read_info >= 1106 & 
+           read_info <= 1221 ) {
+    promoter <- "preS1"
+  }
+  else if (read_info >= 1455 & 
+           read_info <= 1632 ) {
+    promoter <- "preS2/S"
+  }
+  else if (read_info >= 2550 & 
+           read_info <= 2968 ) {
+    promoter <- "HBx"
+  }
+  else promoter <- "Undefined"
+}
+
+colnames(sam_bc01) <- c("read_ID", "start_position", "barcode")
+sam_bc01$promoter <- sapply(sam_bc01$start_position, 
+                            classify_reads)
+
+write.table(sam_bc01,
+            file = "classification_of_reads_per_RNA.txt",
+            quote = FALSE, 
+            sep = "\t", 
+            row.names = FALSE)
+
+# Compute Reads number per promoters:
+list_name_samples <- list(filename)
+
+count_promoter_reads <- function(barcode, df) {
+  tmpdf <- as.data.frame(df)
+  tmpdf <- tmpdf[tmpdf$Barcode == barcode,]
+  preCore <- sum(tmpdf$nb_reads[tmpdf$Start_position <= 103])
+  pgRNA <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 117 & 
+                                tmpdf$Start_position <= 276])
+  preS1 <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1106 & 
+                                tmpdf$Start_position <= 1221])
+  preS2S <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1455 & 
+                                 tmpdf$Start_position <= 1632])
+  HBx <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 2550 & 
+                              tmpdf$Start_position <= 2968])
+  total <- sum(preCore, pgRNA, preS1, preS2S, HBx)
+  res <- c(preCore/total*100, pgRNA/total*100, preS1/total*100, 
+           preS2S/total*100, HBx/total*100, total)
+  return(res)
+}
+
+abscount_promoter_reads <- function(barcode, df) {
+  tmpdf <- as.data.frame(df)
+  tmpdf <- tmpdf[tmpdf$Barcode == barcode,]
+  preCore <- sum(tmpdf$nb_reads[tmpdf$Start_position <= 103])
+  pgRNA <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 117 & 
+                                tmpdf$Start_position <= 276])
+  preS1 <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1106 & 
+                                tmpdf$Start_position <= 1221])
+  preS2S <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1455 & 
+                                 tmpdf$Start_position <= 1632])
+  HBx <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 2550 & 
+                              tmpdf$Start_position <= 2968])
+  total <- sum(preCore, pgRNA, preS1, preS2S, HBx)
+  res <- c(preCore, pgRNA, preS1, preS2S, 
+           HBx, total)
+  return(res)
+}
+
+promoters <- factor(c("preCore", "pgRNA", "preS1", "preS2/S", "HBx"), 
+                    levels = c("preCore", "pgRNA", "preS1", "preS2/S", "HBx"))
+
+abs_count_reads <- data.frame()
+abs_count_reads <- sapply(list_name_samples, 
+                          abscount_promoter_reads, 
+                          df_parsed)
+abs_count_reads <- cbind(c(as.vector(promoters),"total"), abs_count_reads)
+colnames(abs_count_reads) <- c("promoter", "read_number")
+
+write.table(abs_count_reads,
+            file = "Count_reads_per_promoter.tsv",
+            quote = FALSE, 
+            sep = "\t", 
+            row.names = FALSE)
+
+resultats_start_promoters <- lapply(list_name_samples, 
+                                    count_promoter_reads, 
+                                    df_parsed)
+
+resultats_start_promoters <- as.data.frame(do.call(cbind, 
+                                                   resultats_start_promoters))
+totalCountSample <- as.data.frame(resultats_start_promoters[6,])
+colnames(totalCountSample) <- c(filename)
+resultats_start_promoters <- as.data.frame(resultats_start_promoters[1:5,])
+colnames(resultats_start_promoters) <- as.vector(list_name_samples)
+resultats_start_promoters <- cbind(promoters, resultats_start_promoters)
+formated_start_promoters <- pivot_longer(resultats_start_promoters, 
+                                         cols = c(filename),
+                                         names_to = "Barcodes", 
+                                         values_to = "nb_reads")
+
+mycolors <- colorRampPalette(brewer.pal(10, "Paired"))(10)
+mycolors5 <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A")
+mycolors6 <- c("#A6CEE3", "#3362ff", "#33c5ff", "#6A3D9A", "#d60000")
+
+plot_camembert <- function(barcode, df, tot) {
+  camembert <- ggplot(df[df$Barcodes == barcode,], aes(x = barcode, 
+                                                       y = nb_reads, 
+                                                       fill=promoters)) +
+    geom_col() +
+    coord_polar("y") +
+    scale_fill_manual(values = mycolors5) +
+    labs(title = paste0("#reads = ", tot[1,barcode]), x=element_blank(), y=element_blank()) +
+    theme_light()
+  
+  print(camembert)
+  
+  ggsave(filename = paste0("./Reads_start_promoters_", barcode, "_camembert.jpg"),
+         plot = last_plot(),
+         scale = 1,
+         width = 1920,
+         height = 1080,
+         units = "px",
+         dpi = 300)
+}
+
+lapply(list_name_samples, plot_camembert, formated_start_promoters, totalCountSample)
diff --git a/src/nf_modules/junction_nanosplicer/main.nf b/src/nf_modules/junction_nanosplicer/main.nf
new file mode 100644
index 0000000..8598562
--- /dev/null
+++ b/src/nf_modules/junction_nanosplicer/main.nf
@@ -0,0 +1,26 @@
+version = "1.0"
+container_url = "xgrand/r-scripts:${version}"
+
+params.junctions_out = ""
+process junctions_nanosplicer{
+  container = "${container_url}"
+  label "small_mem_mono_cpus"
+  tag "identification de variants d'Ã©pissage"
+  if (params.junctions_out != "") {
+    publishDir "results/${params.junctions_out}", mode: 'copy'
+  }
+
+  input:
+    path(csv)
+    path(classification_of_reads_per_RNA)
+
+  output:
+    path("Rplots.pdf")
+    path("JWR_check_parsed.csv")
+    path("identified_SPvariants.csv"), emit: identified_SPvariants
+
+  script:
+    """
+    Rscript /Junctions_NanoSplicer/Junctions_NanoSplicer_2.R
+    """
+}
\ No newline at end of file
diff --git a/src/nf_modules/rna_count/main.nf b/src/nf_modules/rna_count/main.nf
new file mode 100644
index 0000000..e69de29
diff --git a/src/nf_modules/start_positions/main.nf b/src/nf_modules/start_positions/main.nf
new file mode 100644
index 0000000..06e2e4f
--- /dev/null
+++ b/src/nf_modules/start_positions/main.nf
@@ -0,0 +1,25 @@
+version = "1.0"
+container_url = "xgrand/r-scripts:${version}"
+
+params.start_position_counts_out = ""
+process start_position_individuals{
+  container = "${container_url}"
+  label "small_mem_mono_cpus"
+  tag "identification de variants d'Ã©pissage"
+  if (params.start_position_counts_out != "") {
+    publishDir "results/${params.start_position_counts_out}", mode: 'copy'
+  }
+
+  input:
+    path(start_position_counts)
+
+  output:
+    path("Rplots.pdf")
+    path("Count_reads_per_promoter.tsv")
+    path("classification_of_reads_per_RNA.txt"), emit: classification_of_reads
+
+  script:
+    """
+    Rscript start_positions_individuals.R -i start_position_counts
+    """
+}
\ No newline at end of file
-- 
GitLab