Start_positions.R

#!/bin/Rscript
# Packages loading
library(dplyr, quietly = TRUE)
library(ggplot2, quietly = TRUE)
library(RColorBrewer, quietly = TRUE)
library(conflicted, quietly = TRUE)
library(optparse, quietly = TRUE)
library(tidyverse, quietly = TRUE)
library(future, quietly = TRUE)
library(future.apply)

# Set up parallel processing plan
plan("multisession", workers = 18)

# Résolution de conflits entre les bibliothèques dplyr et stats
conflict_prefer("filter", "dplyr")
conflict_prefer("lag", "dplyr")

# source graphical theme:
source("ggplot_theme_Publication-2.R")

# Load Start_positions_count files:
option_list = list(
  make_option(c("-i", "--input"), type="character", default=NULL, 
              help="input start position file (.txt)", metavar="character"),
  make_option(c("-b", "--barcode"), type="character", default=NULL, 
              help="input barcode", metavar="character"),
  make_option(c("-s", "--start-positions"), type = "character", 
              default = "103,117,276,1106,1221,1455,1632,2550,2968")
)

opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)

file_to_load <- opt$input
splitted <- strsplit(opt$input, split = "[/]")[[1]]
filename <- strsplit(splitted[length(splitted)], split = "[.]")[[1]][1]

sam_bc01 <- read.table(file_to_load, header = F)
sam_bc01[3] <- rep(filename, length(sam_bc01[,1]))

# Function to parse and arrange data:
parsingData <- function(df) {
  binsize <- 10
  pos <- as.data.frame(table(df[,2]))
  colnames(pos)[1] <- "Start"
  
  Start <- as.data.frame(as.factor(seq(0, 3300)))
  colnames(Start)[1] = "Start"
  
  tmp <- dplyr::left_join(Start, pos)
  tmp[is.na(tmp)] <- 0
  
  tmp$Start <- as.numeric(tmp$Start)
  
  df2 <- as_tibble(tmp) %>% 
    dplyr::mutate(bin = round(Start/binsize)*binsize) %>% 
    group_by(bin) %>% 
    dplyr::summarize(nb_reads = sum(Freq, na.rm = T))
  df2[is.na(df2)] <- 0
  df2[3] <- rep(df[1,3], length(df2$bin))
  colnames(df2) <- c("Start_position", "nb_reads", "Barcode")
  df2
}

df_parsed <- parsingData(sam_bc01)

ggplot(df_parsed, aes(Start_position, nb_reads)) + 
  geom_area(alpha = 0.5, fill = "blue") + 
  scale_y_sqrt() +
  facet_wrap(facets = vars(df_parsed$Barcode)) +
  scale_x_continuous(breaks = c(0, 127, 1114, 1490, 2554, 2732, 2907, 3421),
                     label = c("1692", "1819", "2806", "EcoRI", "1065", 
                               "1243", "1418", "1932")) +
  scale_fill_discrete(name="") +
  xlab("Position on HBV genome.") + ylab(element_blank()) +
  scale_colour_Publication() + theme_Publication() +
  theme(axis.text.x = element_text(angle = 45, hjust=1)
  )

ggsave(paste0(filename,".png"),
       plot = last_plot(),
       scale = 2,
       width = 1920,
       height = 1080,
       units = "px",
       dpi = 300,
)

# Classify reads based on start-position:

# Separate preCore & pg:
classify_reads <- function(read_info) {
  if (read_info <= 103) {
    promoter <- "preCore"
  }
  else if (read_info >= 117 &
           read_info <= 276) {
    promoter <- "pgRNA"
  }
  else if (read_info >= 1106 & 
           read_info <= 1221 ) {
    promoter <- "preS1"
  }
  else if (read_info >= 1455 & 
           read_info <= 1632 ) {
    promoter <- "preS2/S"
  }
  else if (read_info >= 2550 & 
           read_info <= 2968 ) {
    promoter <- "HBx"
  }
  else promoter <- "Undefined"
}

colnames(sam_bc01) <- c("read_ID", "start_position", "barcode")
sam_bc01$promoter <- future_sapply(sam_bc01$start_position, 
                            classify_reads)

write.table(sam_bc01,
            file = paste0(opt$barcode, "_classification_of_reads_per_RNA.txt"),
            quote = FALSE, 
            sep = "\t", 
            row.names = FALSE)

# Compute Reads number per promoters:
list_name_samples <- list(filename)

count_promoter_reads <- function(barcode, df) {
  tmpdf <- as.data.frame(df)
  tmpdf <- tmpdf[tmpdf$Barcode == barcode,]
  preCore <- sum(tmpdf$nb_reads[tmpdf$Start_position <= 103])
  pgRNA <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 117 & 
                                tmpdf$Start_position <= 276])
  preS1 <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1106 & 
                                tmpdf$Start_position <= 1221])
  preS2S <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1455 & 
                                 tmpdf$Start_position <= 1632])
  HBx <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 2550 & 
                              tmpdf$Start_position <= 2968])
  total <- sum(preCore, pgRNA, preS1, preS2S, HBx)
  res <- c(preCore/total*100, pgRNA/total*100, preS1/total*100, 
           preS2S/total*100, HBx/total*100, total)
  return(res)
}

abscount_promoter_reads <- function(barcode, df) {
  tmpdf <- as.data.frame(df)
  tmpdf <- tmpdf[tmpdf$Barcode == barcode,]
  preCore <- sum(tmpdf$nb_reads[tmpdf$Start_position <= 103])
  pgRNA <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 117 & 
                                tmpdf$Start_position <= 276])
  preS1 <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1106 & 
                                tmpdf$Start_position <= 1221])
  preS2S <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 1455 & 
                                 tmpdf$Start_position <= 1632])
  HBx <- sum(tmpdf$nb_reads[tmpdf$Start_position >= 2550 & 
                              tmpdf$Start_position <= 2968])
  total <- sum(preCore, pgRNA, preS1, preS2S, HBx)
  res <- c(preCore, pgRNA, preS1, preS2S, 
           HBx, total)
  return(res)
}

promoters <- factor(c("preCore", "pgRNA", "preS1", "preS2/S", "HBx"), 
                    levels = c("preCore", "pgRNA", "preS1", "preS2/S", "HBx"))

abs_count_reads <- data.frame()
abs_count_reads <- future_sapply(list_name_samples, 
                          abscount_promoter_reads, 
                          df_parsed)
abs_count_reads <- cbind(c(as.vector(promoters),"total"), abs_count_reads)
colnames(abs_count_reads) <- c("promoter", "read_number")

write.table(abs_count_reads,
            file = paste0(opt$barcode, "_count_reads_per_promoter.tsv"),
            quote = FALSE, 
            sep = "\t", 
            row.names = FALSE)

resultats_start_promoters <- future_lapply(list_name_samples, 
                                    count_promoter_reads, 
                                    df_parsed)

resultats_start_promoters <- as.data.frame(do.call(cbind, 
                                                   resultats_start_promoters))
totalCountSample <- as.data.frame(resultats_start_promoters[6,])
colnames(totalCountSample) <- c(filename)
resultats_start_promoters <- as.data.frame(resultats_start_promoters[1:5,])
colnames(resultats_start_promoters) <- as.vector(list_name_samples)
resultats_start_promoters <- cbind(promoters, resultats_start_promoters)
formated_start_promoters <- pivot_longer(resultats_start_promoters, 
                                         cols = c(filename),
                                         names_to = "Barcodes", 
                                         values_to = "nb_reads")

mycolors <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A")

plot_camembert <- function(barcode, df, tot) {
  camembert <- ggplot(df[df$Barcodes == barcode,], aes(x = barcode, 
                                                       y = nb_reads, 
                                                       fill=promoters)) +
    geom_col() +
    coord_polar("y") +
    scale_fill_manual(name="", values = mycolors) +
    labs(title = paste0("#reads = ", tot[1,barcode]), 
                        x=element_blank(), 
                        y=element_blank()) +
    scale_colour_Publication() + theme_Publication() +
    theme(axis.text.y = element_blank(),
          axis.line.x = element_line(colour=NA),
          axis.line.y = element_line(colour=NA),
          axis.ticks = element_blank())
    
  
  print(camembert)
  
  ggsave(filename = paste0("./", opt$barcode, 
                           "_reads_start_promoters_piechart.png"),
         plot = last_plot(),
         scale = 1,
         width = 1920,
         height = 1080,
         units = "px",
         dpi = 300)
}

future_lapply(list_name_samples, 
              plot_camembert, 
              formated_start_promoters, 
              totalCountSample)