# start_positions_individuals_2.R

#!/bin/Rscript
library(dplyr)
library(ggplot2)
library(tidyverse)
library(RColorBrewer)
library(conflicted)
# Resolve function-name conflicts between the dplyr and stats packages
conflict_prefer("filter", "dplyr")
conflict_prefer("lag", "dplyr")

# Load Start_positions_count files:
# The first ".txt" file of the working directory is used as input. Its name
# (text before the first dot) becomes the sample label stored in column 3.

# `pattern` is a regular expression, not a shell glob: anchor on the extension.
list_file <- list.files(path = ".",
                        pattern = "\\.txt$",
                        all.files = FALSE,
                        full.names = FALSE)
# This script writes "classification_of_reads_per_RNA.txt" into the same
# directory; never pick that output up as input when the script is re-run.
list_file <- setdiff(list_file, "classification_of_reads_per_RNA.txt")
if (length(list_file) == 0) {
  stop("No input .txt file found in the working directory.", call. = FALSE)
}
file_to_load <- paste0("./", list_file[1])
filename <- strsplit(list_file[1], split = "[.]")[[1]][1]

sam_bc01 <- read.table(file_to_load, header = FALSE)
sam_bc01[3] <- rep(filename, nrow(sam_bc01))

# Function to parse and arrange data:

# Count reads per start position and aggregate the counts into bins.
#
# Arguments:
#   df            data frame whose 2nd column holds the read start positions
#                 and whose 3rd column holds the sample label (only the label
#                 of the first row is used).
#   binsize       bin width; every position is assigned to
#                 round(position / binsize) * binsize (default 10).
#   max_position  highest start position kept (default 3300). Positions
#                 outside 0..max_position are ignored; positions without any
#                 read count as 0.
#
# Returns a tibble with one row per bin and the columns
# Start_position, nb_reads and Barcode.
parsingData <- function(df, binsize = 10, max_position = 3300) {
  positions <- seq(0, max_position)

  # Tabulate against the complete list of positions so that positions without
  # reads get an explicit 0. Working on the numeric positions directly avoids
  # converting a factor back with as.numeric(), which returns the level codes
  # (1, 2, ...) instead of the positions (0, 1, ...) and could shift every
  # start position by one.
  counts <- as.numeric(table(factor(df[[2]], levels = positions)))

  # NOTE: round() rounds halves to the even digit, so positions lying exactly
  # between two bins (5, 15, 25, ... for binsize = 10) alternate between the
  # lower and the upper bin.
  binned <- as_tibble(data.frame(Start = positions, Freq = counts)) %>%
    mutate(bin = round(Start / binsize) * binsize) %>%
    group_by(bin) %>%
    summarize(nb_reads = sum(Freq, na.rm = TRUE))

  # Label every bin with the sample name found in the first row of df.
  binned[3] <- rep(df[[3]][1], nrow(binned))
  colnames(binned) <- c("Start_position", "nb_reads", "Barcode")
  binned
}

df_parsed <- parsingData(sam_bc01)

# Area plot of the binned read counts along the start positions, one facet
# per barcode, with a square-root y axis and custom x-axis tick labels.
start_positions_plot <- ggplot(df_parsed, aes(Start_position, nb_reads)) +
  geom_area(alpha = 0.5, fill = "blue") +
  scale_y_sqrt() +
  # Refer to the column by name; `df$col` inside vars() bypasses the plot data.
  facet_wrap(facets = vars(Barcode)) +
  theme_light() +
  # `labels` spelled out in full (the original relied on partial matching).
  scale_x_continuous(breaks = c(0, 127, 1114, 1490, 2554, 2732, 2907, 3421),
                     labels = c("1692", "1819", "2806", "EcoRI", "1065",
                                "1243", "1418", "1932")) +
  theme(axis.text.x = element_text(angle = 45))

# Explicit print: the plot is now assigned, so it no longer auto-prints.
print(start_positions_plot)

# Save the plot object explicitly instead of relying on last_plot().
ggsave(paste0(filename, ".jpg"),
       plot = start_positions_plot,
       scale = 2,
       width = 1920,
       height = 1080,
       units = "px",
       dpi = 300)

# Classify reads based on start-position:

# Separate preCore & pg:
# Assign a promoter label to read start positions.
#
# Arguments:
#   read_info  numeric vector of start positions (a single value also works,
#              so per-element calls through sapply()/vapply() keep working).
#
# Returns a character vector of the same length as read_info holding
# "preCore", "pgRNA", "preS1", "preS2/S", "HBx" or "Undefined" for positions
# outside every window. Missing positions give NA instead of raising an error.
classify_reads <- function(read_info) {
  promoter <- rep("Undefined", length(read_info))

  # The windows do not overlap, so the order of the assignments is irrelevant.
  # which() drops NA comparisons; they are handled explicitly below.
  promoter[which(read_info <= 103)] <- "preCore"
  promoter[which(read_info >= 117 & read_info <= 276)] <- "pgRNA"
  promoter[which(read_info >= 1106 & read_info <= 1221)] <- "preS1"
  promoter[which(read_info >= 1455 & read_info <= 1632)] <- "preS2/S"
  promoter[which(read_info >= 2550 & read_info <= 2968)] <- "HBx"

  promoter[is.na(read_info)] <- NA_character_
  promoter
}

# Name only the three columns used here, so that any additional input column
# keeps its own name instead of receiving an NA name.
colnames(sam_bc01)[1:3] <- c("read_ID", "start_position", "barcode")
# vapply() always returns a character vector (also for an empty table),
# whereas sapply() would fall back to an empty list.
sam_bc01$promoter <- vapply(sam_bc01$start_position,
                            classify_reads,
                            character(1))

# One line per read: read ID, start position, barcode and promoter label.
write.table(sam_bc01,
            file = "classification_of_reads_per_RNA.txt",
            quote = FALSE,
            sep = "\t",
            row.names = FALSE)

# Compute Reads number per promoters:
# Sample names are kept in a list so the per-sample functions below can be
# applied with lapply()/vapply(); here it holds the single loaded sample.
list_name_samples <- list(filename)

# Shared helper: sum the read counts of one sample inside each promoter window.
#
# Arguments:
#   barcode  sample label to select (compared with the Barcode column).
#   df       table with the columns Start_position, nb_reads and Barcode.
#
# Returns the five sums in the order preCore, pgRNA, preS1, preS2/S, HBx.
# The window limits are the same as those used by classify_reads().
.promoter_window_counts <- function(barcode, df) {
  sample_df <- as.data.frame(df)
  sample_df <- sample_df[sample_df$Barcode == barcode, ]
  position <- sample_df$Start_position
  reads <- sample_df$nb_reads

  # Reads whose start position lies in the closed interval [lower, upper].
  reads_between <- function(lower, upper) {
    sum(reads[position >= lower & position <= upper])
  }

  c(sum(reads[position <= 103]),  # preCore (no lower limit)
    reads_between(117, 276),      # pgRNA
    reads_between(1106, 1221),    # preS1
    reads_between(1455, 1632),    # preS2/S
    reads_between(2550, 2968))    # HBx
}

# Percentage of reads per promoter for one sample.
#
# Returns an unnamed vector of length 6: the percentages for preCore, pgRNA,
# preS1, preS2/S and HBx (relative to the reads falling in any of the five
# windows), followed by that total read number. When no read falls in any
# window the total is 0 and the percentages are NaN.
count_promoter_reads <- function(barcode, df) {
  counts <- .promoter_window_counts(barcode, df)
  total <- sum(counts)
  c(counts / total * 100, total)
}

# Absolute number of reads per promoter for one sample.
#
# Returns an unnamed vector of length 6: the read numbers for preCore, pgRNA,
# preS1, preS2/S and HBx, followed by their sum.
abscount_promoter_reads <- function(barcode, df) {
  counts <- .promoter_window_counts(barcode, df)
  c(counts, sum(counts))
}

promoters <- factor(c("preCore", "pgRNA", "preS1", "preS2/S", "HBx"),
                    levels = c("preCore", "pgRNA", "preS1", "preS2/S", "HBx"))

# Absolute read numbers per promoter (plus total), one column per sample.
# vapply() fixes the result shape: six numbers per sample.
abs_count_reads <- vapply(list_name_samples,
                          abscount_promoter_reads,
                          numeric(6),
                          df = df_parsed)
abs_count_reads <- cbind(c(as.vector(promoters), "total"), abs_count_reads)
colnames(abs_count_reads) <- c("promoter", "read_number")

write.table(abs_count_reads,
            file = "Count_reads_per_promoter.tsv",
            quote = FALSE,
            sep = "\t",
            row.names = FALSE)

# Percentages per promoter (rows 1 to 5) and total read number (row 6).
resultats_start_promoters <- lapply(list_name_samples,
                                    count_promoter_reads,
                                    df_parsed)

resultats_start_promoters <- as.data.frame(do.call(cbind,
                                                   resultats_start_promoters))
# drop = FALSE keeps a data frame even though there is a single column.
totalCountSample <- resultats_start_promoters[6, , drop = FALSE]
rownames(totalCountSample) <- NULL
colnames(totalCountSample) <- c(filename)
resultats_start_promoters <- resultats_start_promoters[1:5, , drop = FALSE]
colnames(resultats_start_promoters) <- unlist(list_name_samples)
resultats_start_promoters <- cbind(promoters, resultats_start_promoters)
# all_of() states explicitly that `filename` is an external character vector
# holding the name of the column to pivot.
formated_start_promoters <- pivot_longer(resultats_start_promoters,
                                         cols = all_of(filename),
                                         names_to = "Barcodes",
                                         values_to = "nb_reads")

# Colour palettes; mycolors5 is the one used for the pie charts.
mycolors <- colorRampPalette(brewer.pal(10, "Paired"))(10)
mycolors5 <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A")
mycolors6 <- c("#A6CEE3", "#3362ff", "#33c5ff", "#6A3D9A", "#d60000")

# Draw and save the pie chart of the read share per promoter for one sample.
#
# Arguments:
#   barcode      sample label; selects the rows of df and the column of tot.
#   df           long table with the columns promoters, Barcodes and nb_reads.
#   tot          one-row table whose column `barcode` holds the total read
#                number shown in the plot title.
#   fill_colors  fill colours, one per promoter (defaults to mycolors5).
#
# The chart is printed and written to
# "./Reads_start_promoters_<barcode>_camembert.jpg"; the value of ggsave()
# is returned.
plot_camembert <- function(barcode, df, tot, fill_colors = mycolors5) {
  camembert <- ggplot(df[df$Barcodes == barcode, ],
                      aes(x = barcode,
                          y = nb_reads,
                          fill = promoters)) +
    geom_col() +
    coord_polar("y") +
    scale_fill_manual(values = fill_colors) +
    # NULL is the documented way to remove a label in labs().
    labs(title = paste0("#reads = ", tot[1, barcode]), x = NULL, y = NULL) +
    theme_light()

  print(camembert)

  # Save this very plot object instead of relying on last_plot().
  ggsave(filename = paste0("./Reads_start_promoters_", barcode, "_camembert.jpg"),
         plot = camembert,
         scale = 1,
         width = 1920,
         height = 1080,
         units = "px",
         dpi = 300)
}

# One pie chart per sample.
lapply(list_name_samples, plot_camembert, formated_start_promoters, totalCountSample)