diff --git a/src/.docker_modules/r-docher-test/1.0/Dockerfile b/src/.docker_modules/r-docher-test/1.0/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..9c3fa5cf773c6b44ed8f61a56acc5924df3edfcd
--- /dev/null
+++ b/src/.docker_modules/r-docher-test/1.0/Dockerfile
@@ -0,0 +1,9 @@
+FROM rocker/r-base:4.2.3
+
+## copy Rscript files
+COPY ./*.R .
+
+RUN Rscript install_pkgs.R
+
+# command to run on container start
+CMD [ "bash" ]
\ No newline at end of file
diff --git a/src/.docker_modules/r-docher-test/1.0/install_pkgs.R b/src/.docker_modules/r-docher-test/1.0/install_pkgs.R
new file mode 100644
index 0000000000000000000000000000000000000000..20def570d4eea82c9488e92f888efacef4a94f6b
--- /dev/null
+++ b/src/.docker_modules/r-docher-test/1.0/install_pkgs.R
@@ -0,0 +1,19 @@
+#!/bin/Rscript
+# Packages installation:
+# start_positions_individuals_2.R attaches tidyverse and conflicted, so both
+# are listed here in addition to the packages it uses directly.
+list.of.packages <- c("BiocManager", "ggplot2", "dplyr", "reshape2",
+                      "RColorBrewer", "R.utils", "tidyverse", "conflicted")
+new.packages <- setdiff(list.of.packages, installed.packages()[, "Package"])
+if (length(new.packages) > 0) {
+  install.packages(new.packages, dependencies = TRUE)
+}
+
+# install.packages() reports a failed installation as a warning only, so check
+# the result and stop with an error to make the Docker build fail.
+missing.packages <- setdiff(list.of.packages,
+                            installed.packages()[, "Package"])
+if (length(missing.packages) > 0) {
+  stop("Failed to install: ", paste(missing.packages, collapse = ", "),
+       call. = FALSE)
+}
\ No newline at end of file
diff --git a/src/.docker_modules/r-docher-test/1.0/start_positions_individuals_2.R b/src/.docker_modules/r-docher-test/1.0/start_positions_individuals_2.R
new file mode 100755
index 0000000000000000000000000000000000000000..0ba0f18208c4c186d152aad9bb2add79a41ace12
--- /dev/null
+++ b/src/.docker_modules/r-docher-test/1.0/start_positions_individuals_2.R
@@ -0,0 +1,213 @@
+#!/bin/Rscript
+library(dplyr)
+library(ggplot2)
+library(tidyverse)
+library(RColorBrewer)
+library(conflicted)
+# Resolve the name conflicts between the dplyr and stats packages
+conflict_prefer("filter", "dplyr")
+conflict_prefer("lag", "dplyr")
+
+# Load Start_positions_count files:
+
+# list.files() takes a regular expression, not a shell glob; "*.txt" is not
+# anchored to the extension, so match names ending in ".txt" explicitly.
+list_file <- list.files(path = ".",
+                        pattern = "\\.txt$",
+                        all.files = FALSE,
+                        full.names = FALSE)
+# Skip the table this script writes itself, so that a second run in the same
+# directory does not take its own output as input.
+list_file <- setdiff(list_file, "classification_of_reads_per_RNA.txt")
+if (length(list_file) == 0) {
+  stop("No .txt start-position file found in the working directory",
+       call. = FALSE)
+}
+# Only the first file found is processed; the sample name is the part of its
+# name before the first dot.
+file_to_load <- paste0("./", list_file[1])
+filename <- strsplit(list_file[1], split = "[.]")[[1]][1]
+
+sam_bc01 <- read.table(file_to_load, header = FALSE)
+# Tag every read with the sample name (stored as the third column).
+sam_bc01[3] <- rep(filename, nrow(sam_bc01))
+
+# Function to parse and arrange data:
+
+# Count the reads starting at each position and sum them in fixed-width bins.
+# df       data frame with start positions in column 2 and the sample name
+#          in column 3 (only the name on its first row is used).
+# binsize  bin width; each position goes to the nearest multiple of it.
+# max_pos  last position kept; reads starting beyond it are dropped.
+# Returns a tibble with columns Start_position, nb_reads and Barcode.
+parsingData <- function(df, binsize = 10, max_pos = 3300) {
+  pos <- as.data.frame(table(df[, 2]), stringsAsFactors = FALSE)
+  colnames(pos)[1] <- "Start"
+  pos$Start <- as.numeric(pos$Start)
+
+  # Join on numeric positions. Joining two factors and then calling
+  # as.numeric() on the key returns the factor level codes (1, 2, ...)
+  # instead of the positions (0, 1, ...) when the join keeps a factor key.
+  Start <- data.frame(Start = as.numeric(seq(0, max_pos)))
+  tmp <- dplyr::left_join(Start, pos, by = "Start")
+  tmp$Freq[is.na(tmp$Freq)] <- 0
+
+  df2 <- as_tibble(tmp) %>%
+    mutate(bin = round(Start / binsize) * binsize) %>%
+    group_by(bin) %>%
+    summarize(nb_reads = sum(Freq, na.rm = TRUE))
+  # Carry the sample name over to the binned table.
+  df2[3] <- rep(df[1, 3], length(df2$bin))
+  colnames(df2) <- c("Start_position", "nb_reads", "Barcode")
+  df2
+}
+
+df_parsed <- parsingData(sam_bc01)
+
+# Read-start profile of the sample (square-root y axis), saved as a JPEG.
+start_plot <- ggplot(df_parsed, aes(Start_position, nb_reads)) +
+  geom_area(alpha = 0.5, fill = "blue") +
+  scale_y_sqrt() +
+  facet_wrap(facets = vars(Barcode)) +
+  theme_light() +
+  scale_x_continuous(breaks = c(0, 127, 1114, 1490, 2554, 2732, 2907, 3421),
+                     labels = c("1692", "1819", "2806", "EcoRI", "1065",
+                                "1243", "1418", "1932")) +
+  theme(axis.text.x = element_text(angle = 45))
+print(start_plot)
+
+# Pass the plot explicitly instead of relying on last_plot().
+ggsave(paste0(filename, ".jpg"),
+       plot = start_plot,
+       scale = 2,
+       width = 1920,
+       height = 1080,
+       units = "px",
+       dpi = 300)
+
+# Classify reads based on start-position:
+
+# Promoter windows (inclusive start positions); preCore has no lower bound.
+promoter_windows <- data.frame(
+  promoter = c("preCore", "pgRNA", "preS1", "preS2/S", "HBx"),
+  first = c(-Inf, 117, 1106, 1455, 2550),
+  last = c(103, 276, 1221, 1632, 2968),
+  stringsAsFactors = FALSE
+)
+
+# Return the promoter whose window contains each start position, or
+# "Undefined" when a position falls in no window.
+# read_info  numeric vector of start positions (a single value also works).
+# windows    data frame with columns promoter, first and last.
+classify_reads <- function(read_info, windows = promoter_windows) {
+  promoter <- rep("Undefined", length(read_info))
+  for (i in seq_len(nrow(windows))) {
+    hit <- which(read_info >= windows$first[i] &
+                   read_info <= windows$last[i])
+    promoter[hit] <- windows$promoter[i]
+  }
+  promoter
+}
+
+colnames(sam_bc01) <- c("read_ID", "start_position", "barcode")
+sam_bc01$promoter <- classify_reads(sam_bc01$start_position)
+
+# Per-read classification table.
+write.table(sam_bc01,
+            file = "classification_of_reads_per_RNA.txt",
+            quote = FALSE,
+            sep = "\t",
+            row.names = FALSE)
+
+# Compute Reads number per promoters:
+list_name_samples <- list(filename)
+
+# Sum the binned read counts of one sample inside each promoter window.
+# NOTE: df holds bin positions (multiples of the bin size), so the windows
+# are applied to the bins here, not to the individual read starts.
+# barcode  sample name to select in the Barcode column of df.
+# df       binned table with columns Start_position, nb_reads and Barcode.
+# windows  data frame with columns promoter, first and last.
+promoter_read_counts <- function(barcode, df, windows = promoter_windows) {
+  tmpdf <- as.data.frame(df)
+  tmpdf <- tmpdf[tmpdf$Barcode == barcode, ]
+  vapply(seq_len(nrow(windows)), function(i) {
+    in_window <- tmpdf$Start_position >= windows$first[i] &
+      tmpdf$Start_position <= windows$last[i]
+    sum(tmpdf$nb_reads[in_window])
+  }, numeric(1))
+}
+
+# Percentage of reads per promoter (5 values) followed by the total count.
+# The percentages are NaN when no read falls inside any window (0 / 0).
+count_promoter_reads <- function(barcode, df) {
+  counts <- promoter_read_counts(barcode, df)
+  total <- sum(counts)
+  c(counts / total * 100, total)
+}
+
+# Absolute read count per promoter (5 values) followed by the total count.
+abscount_promoter_reads <- function(barcode, df) {
+  counts <- promoter_read_counts(barcode, df)
+  c(counts, sum(counts))
+}
+
+promoters <- factor(c("preCore", "pgRNA", "preS1", "preS2/S", "HBx"),
+                    levels = c("preCore", "pgRNA", "preS1", "preS2/S", "HBx"))
+
+# One column of absolute counts per sample, labelled by promoter.
+abs_count_reads <- vapply(list_name_samples,
+                          abscount_promoter_reads,
+                          numeric(6),
+                          df = df_parsed)
+abs_count_reads <- cbind(c(as.vector(promoters), "total"), abs_count_reads)
+colnames(abs_count_reads) <- c("promoter", "read_number")
+
+write.table(abs_count_reads, file = "Count_reads_per_promoter.tsv",
+            quote = FALSE, sep = "\t", row.names = FALSE)
+
+# Percentages per promoter (rows 1-5) and total read count (row 6) per sample.
+resultats_start_promoters <- lapply(list_name_samples,
+                                    count_promoter_reads, df_parsed)
+
+resultats_start_promoters <- as.data.frame(do.call(cbind,
+                                                   resultats_start_promoters))
+colnames(resultats_start_promoters) <- unlist(list_name_samples)
+totalCountSample <- resultats_start_promoters[6, , drop = FALSE]
+resultats_start_promoters <- resultats_start_promoters[1:5, , drop = FALSE]
+resultats_start_promoters <- cbind(promoters, resultats_start_promoters)
+# all_of() selects the column named by the external variable filename.
+formated_start_promoters <- pivot_longer(resultats_start_promoters,
+                                         cols = all_of(filename),
+                                         names_to = "Barcodes",
+                                         values_to = "nb_reads")
+
+mycolors <- colorRampPalette(brewer.pal(10, "Paired"))(10)
+mycolors5 <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A")
+mycolors6 <- c("#A6CEE3", "#3362ff", "#33c5ff", "#6A3D9A", "#d60000")
+
+# Draw the promoter shares of one sample as a pie chart and save it as a JPEG.
+# barcode  sample name; selects the rows of df and the column of tot.
+# df       long table with columns promoters, Barcodes and nb_reads.
+# tot      data frame whose first row holds the total read count per sample.
+# Returns whatever ggsave() returns.
+plot_camembert <- function(barcode, df, tot) {
+  camembert <- ggplot(df[df$Barcodes == barcode, ],
+                      aes(x = barcode, y = nb_reads, fill = promoters)) +
+    geom_col() +
+    coord_polar("y") +
+    scale_fill_manual(values = mycolors5) +
+    # labs() removes an axis title with NULL; element_blank() is for theme().
+    labs(title = paste0("#reads = ", tot[1, barcode]), x = NULL, y = NULL) +
+    theme_light()
+
+  print(camembert)
+
+  ggsave(filename = paste0("./Reads_start_promoters_", barcode,
+                           "_camembert.jpg"),
+         plot = camembert, scale = 1,
+         width = 1920, height = 1080, units = "px", dpi = 300)
+}
+
+invisible(lapply(list_name_samples, plot_camembert,
+                 formated_start_promoters, totalCountSample))