#!/bin/Rscript
# Packages loading
library(ggplot2, quietly = TRUE)
library(tidyr, quietly = TRUE)
library(dplyr, quietly = TRUE)
library(stringr, quietly = TRUE)
library(RColorBrewer, quietly = TRUE)
library(optparse, quietly = TRUE)
library(conflicted, quietly = TRUE)

# Résolution de conflits entre les bibliothèques dplyr et stats
conflict_prefer("filter", "dplyr")
conflict_prefer("lag", "dplyr")

# source graphical theme:
source("ggplot_theme_Publication-2.R")

# Load files
option_list = list(
  make_option(c("-s", "--SPvariants"), type="character", default=NULL, 
              help="input identified SP variants table (.csv)", metavar="character"),
  make_option(c("-c", "--classification"), type="character", default=NULL, 
              help="input classification of reads file (.txt)", metavar="character"),
  make_option(c("-b", "--barcode"), type="character", default=NULL, 
              help="input barcode", metavar="character"))
opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)

# vectors of species & Palettes Colors:
SPvariants <- c("SP01", "SP02", "SP03", "SP04", "SP05", "SP06", "SP07", "SP08",
                "SP09", "SP10", "SP11", "SP12", "SP13", "SP14", "SP15", "SP16",
                "SP17", "SP18", "SP19", "SP20", "SP21", "SP22")
SPvariants <- factor(SPvariants, levels = SPvariants)
colors_SP1_22 <- c("#FADD4E", "#FBE0C3", "#C8FF94", "#9ED7C3", "#CCEDE3", 
                   "#A9E5F0", "#A9C8E6", "#BBC7DF", "#D8C8EB", "#FFBADC", 
                   "#E2C9B5", "#E2B5AB", "#CAC4C5", "#D7D9DF", "#F9B7BA", 
                   "#99DDF9", "#ABA7D1", "#B26572", "#FB9651", "#4F7E7E", 
                   "#4FBCBA", "#608AD2")
palette_SP1_22 <- data.frame(nom = SPvariants, teinte = colors_SP1_22, 
                             stringsAsFactors = FALSE)

TSS_species <- c("preCore", "pgRNA", "preS1", "preS2/S", "HBx")
TSS_species <- factor(TSS_species, levels = TSS_species)
colors_RNAs_species <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A")
palette_TSS <- data.frame(nom = TSS_species, teinte = colors_RNAs_species, 
                          stringsAsFactors = FALSE)

new_SP_candidates <- c("new_comb_pg", "new_comb_HBs", "new_comb_HBx", 
                       "new_comb_UnC", "new_SP", "new_SP_HBs", "Undefined")
new_SP_candidates <- factor(new_SP_candidates, levels = new_SP_candidates)
colors_newSP <- c("#AC0E0D", "#CB674F", "#AD565C", "#923222", "#C78A77", 
                  "#A92322", "#964B34")
palette_new_SP <- data.frame(nom = new_SP_candidates, teinte = colors_newSP, 
                             stringsAsFactors = FALSE)

SPxx <- c("SPvariants")
SPxx <- factor(SPxx, levels = SPxx)
color_SPxx <- c("#33C5FF")
palette_SPxx <- data.frame(nom = SPxx, 
                           teinte = color_SPxx, 
                           stringsAsFactors = FALSE)

all_species_name <- c(TSS_species, SPvariants, new_SP_candidates, SPxx)

palette_complete <- rbind.data.frame(palette_TSS, 
                                     palette_SP1_22, 
                                     palette_new_SP,
                                     palette_SPxx,
                                     stringsAsFactors = FALSE)

# Load Start_positions_count files:
identified_SP <- read.table(file = opt$SPvariants,
                            header = TRUE)

clean_SP <- identified_SP[!duplicated(identified_SP$id),] %>% 
  select(id, mapQ, transcript_strand, JAQ, barcode, promoter, junction, SP_name)

countSP <- clean_SP %>% count(SP_name)
countSP$SP_name <- factor(countSP$SP_name)
countSP <- dplyr::inner_join(palette_complete, 
                      countSP, 
                      by = c("nom" = "SP_name"))
#print(names(palette_complete))
#print(names(countSP))

countSP$nom <- factor(countSP$nom, levels = all_species_name)
countSP <- dplyr::mutate(countSP,
                  proportion = (as.numeric(n)/sum(as.numeric(n))*100))
#print(countSP)
if (length(countSP$nom) != 0) {
  ggplot(countSP, aes(x = "percent", 
                      y = proportion,
                      fill = nom)) +
    geom_col() +
    coord_polar("y") +
    scale_fill_manual(values = countSP$teinte) +
    labs(fill = "spliced-variants") +
    xlab(element_blank()) + ylab(element_blank()) +
    scale_colour_Publication() + theme_Publication() +
    theme(axis.text.y = element_blank(),
          axis.line.x = element_line(colour=NA),
          axis.line.y = element_line(colour=NA),
          axis.ticks = element_blank())
  
  ggsave(file = paste0(opt$barcode, "_SP_proportion_piechart.png"),
         scale = 2,
         width = 1920,
         height = 1080,
         units = "px",
         dpi = 300)
  
  ggplot(countSP, aes(x = nom, y = proportion, fill = nom)) +
    geom_col() +
    scale_fill_manual(values = countSP$teinte) + 
    theme(axis.text.x = element_text(angle = 45)) +
    xlab("Position on HBV genome.") + ylab(element_blank()) +
    scale_colour_Publication() + theme_Publication() +
    theme(axis.text.x = element_text(angle = 45, hjust=1))
  
  ggsave(file = paste0(opt$barcode, "_SP_proportion.png"),
         scale = 2,
         width = 1920,
         height = 1080,
         units = "px",
         dpi = 300)
}

# TSS not spliced:
classified_reads <- read.table(file = opt$classification, 
                               header = TRUE)

not_spliced <- classified_reads[!(classified_reads$read_ID %in% clean_SP$id),]
not_spliced <- dplyr::mutate(not_spliced,
                      species = not_spliced$promoter)
#print(not_spliced)
not_spliced <- not_spliced %>% select(read_ID, species)
colnames(not_spliced) <- c("id", "species")

clean_SP_type <- clean_SP %>% select(id, SP_name)
colnames(clean_SP_type) <- c("id", "species")

df_species <- rbind.data.frame(not_spliced, clean_SP_type, 
                               stringsAsFactors = FALSE)
count_species <- df_species %>% count(species)
count_species <- dplyr::mutate(count_species,
                        percent = (as.numeric(n)/sum(as.numeric(n))*100))
#print(count_species)
write.table(df_species, file = paste0(opt$barcode, "_all_reads_identified.csv"), 
            sep = "\t", quote = FALSE, row.names = FALSE)

# Null dataset:
species <- c(TSS_species, SPvariants, new_SP_candidates)

# canonical_species <- TSS_species[c(1:3,5)]

null_count <- data.frame(species = species, 
                         n = rep(0, times = length(species)),
                         percent = rep(0, times = length(species)), 
                         stringsAsFactors = FALSE)

null_count <- transform(null_count,
                        n = as.integer(n),
                        percent = as.numeric(percent))

# Join:
count_species <- rbind.data.frame(count_species, 
                                  subset(null_count, 
                                         !(species %in% count_species$species)),
                                  stringsAsFactors = FALSE)

# Merge all SP variants:
count_species_SPxx <- data.frame(species = "SPvariants", 
                                 n = sum(count_species[count_species$species 
                                                       %in% SPvariants,]$n))
count_species_SPxx <- rbind.data.frame(count_species_SPxx,
                                       count_species[count_species$species 
                                                     %in% c(TSS_species, 
                                                            SPvariants), 1:2],
                                       stringsAsFactors = FALSE)
count_species_SPxx <- count_species_SPxx[count_species_SPxx$species %in% 
                                           all_species_name[c(1:5,35)],] # c(1:3,5,35)
count_species_SPxx <- dplyr::mutate(count_species_SPxx,
                             percent=(as.numeric(n)/sum(as.numeric(n))*100))
#print(count_species_SPxx)
# save the tab:
write.csv(count_species_SPxx, 
          file = paste0(opt$barcode, "_count_canonical_species_SPxx.csv"))

# prepare to plot:
count_species_SPxx <- dplyr::inner_join(palette_complete,
                                 count_species_SPxx,
                                 by = c("nom" = "species"))
#names(palette_complete)
#names(count_species_SPxx)
count_species_SPxx$nom <- factor(count_species_SPxx$nom, 
                                 levels = all_species_name)

# Save:
write.csv(count_species, file = paste0(opt$barcode, "_count_species.csv"))

# RNA species composition all species:
count_species <- dplyr::inner_join(palette_complete, count_species,
                                  by = c("nom" = "species"),
                                  suffix = c("",""))
count_species$nom <- factor(count_species$nom, levels = all_species_name)

ggplot(count_species, 
       aes(x = nom, 
           y = percent,
           fill = nom)) +
  geom_col() +
  scale_fill_manual(values = count_species$teinte) +
  labs(fill = element_blank()) +
  xlab(label = element_blank())+
  scale_colour_Publication() + theme_Publication() +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

ggsave(file = paste0(opt$barcode, "_count_RNAs_species.png"),
       scale = 2,
       width = 1920,
       height = 1080,
       units = "px",
       dpi = 300)

# Filter RNA species canonical + SPvariants:
count_species_clear <- count_species[count_species$nom %in% c(TSS_species, SPvariants),]

# RNA species composition all species:
count_species_clear <- dplyr::inner_join(palette_complete, count_species_clear,
                            by = c("nom" = "nom"),
                            suffix = c("",""))
count_species_clear$nom <- factor(count_species_clear$nom, levels = all_species_name)

ggplot(count_species_clear, 
       aes(x = nom, 
           y = percent,
           fill = nom)) +
  geom_col() +
  scale_fill_manual(values = count_species_clear$teinte) +
  labs(fill = element_blank()) +
  xlab(label = element_blank()) +
  scale_colour_Publication() + theme_Publication() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggsave(file = paste0(opt$barcode, "_count_RNAs_species_clear.png"),
       scale = 2,
       width = 1920,
       height = 1080,
       units = "px",
       dpi = 300)

# SP composition clear:
count_clear <- clean_SP[clean_SP$SP_name %in% SPvariants,] %>% count(SP_name)
count_clear <- dplyr::mutate(count_clear,
                      proportion=(as.numeric(n)/sum(as.numeric(n))*100))
#print(count_clear)
count_clear$SP_name <- factor(count_clear$SP_name)
count_clear <- dplyr::inner_join(palette_complete,
                          count_clear, 
                          by = c("nom" = "SP_name"))
#names(count_clear)
count_clear$nom <- factor(count_clear$nom, levels = all_species_name)

if(nrow(count_clear)!=0){
  ggplot(count_clear, aes(x = "percent", 
                          y = proportion,
                          fill = nom)) +
    geom_col() +
    coord_polar("y") +
    scale_fill_manual(values = count_clear$teinte) +
    labs(fill = element_blank()) +
    scale_colour_Publication() + 
    theme_Publication() +
    theme(axis.text.y = element_blank(),
          axis.line.x = element_line(colour=NA),
          axis.line.y = element_line(colour=NA),
          axis.ticks = element_blank())
  
  ggsave(file = paste0(opt$barcode, "_SP_clear_proportion_piechart.png"),
         scale = 2,
         width = 1920,
         height = 1080,
         units = "px",
         dpi = 300)}

ggplot(count_clear, aes(x = nom, 
                        y = proportion,
                        fill = nom)) +
  geom_col() +
  scale_fill_manual(values = count_clear$teinte) + 
  labs(fill = element_blank()) +
  xlab(label = element_blank()) +
  ylab(label = "percent") +
  scale_colour_Publication() + theme_Publication() +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

ggsave(file = paste0(opt$barcode, "_SP_clear_proportion.png"),
       scale = 2,
       width = 1920,
       height = 1080,
       units = "px",
       dpi = 300)

# Camembert SPxx:
ggplot(count_species_SPxx, aes(x = "species", 
                               y = percent,
                               fill = nom)) +
  geom_col() +
  coord_polar("y") +
  scale_fill_manual(values = count_species_SPxx$teinte) +
  labs(fill = element_blank()) +
  ylab(label = element_blank()) +
  xlab(label = element_blank()) +
  scale_colour_Publication() + theme_Publication() +
  theme(axis.text.y = element_blank(),
        axis.line.x = element_line(colour=NA),
        axis.line.y = element_line(colour=NA),
        axis.ticks = element_blank())

ggsave(file = paste0(opt$barcode, "_count_RNAs_species_piechart.png"),
       scale = 2,
       width = 1920,
       height = 1080,
       units = "px",
       dpi = 300)