#!/bin/Rscript # Packages loading library(dplyr, quietly = TRUE) library(ggplot2, quietly = TRUE) library(RColorBrewer, quietly = TRUE) library(conflicted, quietly = TRUE) library(optparse, quietly = TRUE) library(tidyverse, quietly = TRUE) library(future, quietly = TRUE) library(future.apply) # Set up parallel processing plan plan("multisession", workers = 18) # Résolution de conflits entre les bibliothèques dplyr et stats conflict_prefer("filter", "dplyr") conflict_prefer("lag", "dplyr") # source graphical theme: source("ggplot_theme_Publication-2.R") # Load Start_positions_count files: option_list = list( make_option(c("-i", "--input"), type="character", default=NULL, help="input start position file (.txt)", metavar="character"), make_option(c("-b", "--barcode"), type="character", default=NULL, help="input barcode", metavar="character"), make_option(c("-s", "--start-positions"), type = "character", default = "103,117,276,1106,1221,1455,1632,2550,2968") ) opt_parser = OptionParser(option_list=option_list) opt = parse_args(opt_parser) file_to_load <- opt$input splitted <- strsplit(opt$input, split = "[/]")[[1]] filename <- strsplit(splitted[length(splitted)], split = "[.]")[[1]][1] sam_bc01 <- read.table(file_to_load, header = F) sam_bc01[3] <- rep(filename, length(sam_bc01[,1])) # Function to parse and arrange data: parsingData <- function(df) { binsize <- 10 pos <- as.data.frame(table(df[,2])) colnames(pos)[1] <- "Start" Start <- as.data.frame(as.factor(seq(0, 3300))) colnames(Start)[1] = "Start" tmp <- dplyr::left_join(Start, pos) tmp[is.na(tmp)] <- 0 tmp$Start <- as.numeric(tmp$Start) df2 <- as_tibble(tmp) %>% dplyr::mutate(bin = round(Start/binsize)*binsize) %>% group_by(bin) %>% dplyr::summarize(nb_reads = sum(Freq, na.rm = T)) df2[is.na(df2)] <- 0 df2[3] <- rep(df[1,3], length(df2$bin)) colnames(df2) <- c("Start_position", "nb_reads", "Barcode") df2 } df_parsed <- parsingData(sam_bc01) ggplot(df_parsed, aes(Start_position, nb_reads)) + geom_area(alpha = 0.5, fill = "blue") + scale_y_sqrt() + facet_wrap(facets = vars(df_parsed$Barcode)) + scale_x_continuous(breaks = c(0, 127, 1114, 1490, 2554, 2732, 2907, 3421), label = c("1692", "1819", "2806", "EcoRI", "1065", "1243", "1418", "1932")) + scale_fill_discrete(name="") + xlab("Position on HBV genome.") + ylab(element_blank()) + scale_colour_Publication() + theme_Publication() + theme(axis.text.x = element_text(angle = 45, hjust=1) ) ggsave(paste0(filename,".png"), plot = last_plot(), scale = 2, width = 1920, height = 1080, units = "px", dpi = 300, ) # Classify reads based on start-position: # Separate preCore & pg: classify_reads <- function(read_info) { if (read_info <= 103) { promoter <- "preCore" } else if (read_info >= 117 & read_info <= 276) { promoter <- "pgRNA" } else if (read_info >= 1106 & read_info <= 1221 ) { promoter <- "preS1" } else if (read_info >= 1455 & read_info <= 1632 ) { promoter <- "preS2/S" } else if (read_info >= 2550 & read_info <= 2968 ) { promoter <- "HBx" } else promoter <- "Undefined" } colnames(sam_bc01) <- c("read_ID", "start_position", "barcode") sam_bc01$promoter <- future_sapply(sam_bc01$start_position, classify_reads) write.table(sam_bc01, file = paste0(opt$barcode, "_classification_of_reads_per_RNA.txt"), quote = FALSE, sep = "\t", row.names = FALSE) # Compute Reads number per promoters: list_name_samples <- list(filename) count_promoter_reads <- function(barcode, df) { tmpdf <- as.data.frame(df) tmpdf <- tmpdf[tmpdf$Barcode == barcode,] preCore <- sum(tmpdf$nb_reads[tmpdf$Start_position <= 103]) pgRNA <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 117 & tmpdf$Start_position <= 276]) preS1 <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1106 & tmpdf$Start_position <= 1221]) preS2S <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1455 & tmpdf$Start_position <= 1632]) HBx <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 2550 & tmpdf$Start_position <= 2968]) total <- sum(preCore, pgRNA, preS1, preS2S, HBx) res <- c(preCore/total*100, pgRNA/total*100, preS1/total*100, preS2S/total*100, HBx/total*100, total) return(res) } abscount_promoter_reads <- function(barcode, df) { tmpdf <- as.data.frame(df) tmpdf <- tmpdf[tmpdf$Barcode == barcode,] preCore <- sum(tmpdf$nb_reads[tmpdf$Start_position <= 103]) pgRNA <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 117 & tmpdf$Start_position <= 276]) preS1 <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1106 & tmpdf$Start_position <= 1221]) preS2S <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1455 & tmpdf$Start_position <= 1632]) HBx <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 2550 & tmpdf$Start_position <= 2968]) total <- sum(preCore, pgRNA, preS1, preS2S, HBx) res <- c(preCore, pgRNA, preS1, preS2S, HBx, total) return(res) } promoters <- factor(c("preCore", "pgRNA", "preS1", "preS2/S", "HBx"), levels = c("preCore", "pgRNA", "preS1", "preS2/S", "HBx")) abs_count_reads <- data.frame() abs_count_reads <- future_sapply(list_name_samples, abscount_promoter_reads, df_parsed) abs_count_reads <- cbind(c(as.vector(promoters),"total"), abs_count_reads) colnames(abs_count_reads) <- c("promoter", "read_number") write.table(abs_count_reads, file = paste0(opt$barcode, "_count_reads_per_promoter.tsv"), quote = FALSE, sep = "\t", row.names = FALSE) resultats_start_promoters <- future_lapply(list_name_samples, count_promoter_reads, df_parsed) resultats_start_promoters <- as.data.frame(do.call(cbind, resultats_start_promoters)) totalCountSample <- as.data.frame(resultats_start_promoters[6,]) colnames(totalCountSample) <- c(filename) resultats_start_promoters <- as.data.frame(resultats_start_promoters[1:5,]) colnames(resultats_start_promoters) <- as.vector(list_name_samples) resultats_start_promoters <- cbind(promoters, resultats_start_promoters) formated_start_promoters <- pivot_longer(resultats_start_promoters, cols = c(filename), names_to = "Barcodes", values_to = "nb_reads") mycolors <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A") plot_camembert <- function(barcode, df, tot) { camembert <- ggplot(df[df$Barcodes == barcode,], aes(x = barcode, y = nb_reads, fill=promoters)) + geom_col() + coord_polar("y") + scale_fill_manual(name="", values = mycolors) + labs(title = paste0("#reads = ", tot[1,barcode]), x=element_blank(), y=element_blank()) + scale_colour_Publication() + theme_Publication() + theme(axis.text.y = element_blank(), axis.line.x = element_line(colour=NA), axis.line.y = element_line(colour=NA), axis.ticks = element_blank()) print(camembert) ggsave(filename = paste0("./", opt$barcode, "_reads_start_promoters_piechart.png"), plot = last_plot(), scale = 1, width = 1920, height = 1080, units = "px", dpi = 300) } future_lapply(list_name_samples, plot_camembert, formated_start_promoters, totalCountSample)