#!/bin/Rscript # Packages loading library(ggplot2, quietly = TRUE) library(tidyr, quietly = TRUE) library(dplyr, quietly = TRUE) library(stringr, quietly = TRUE) library(RColorBrewer, quietly = TRUE) library(optparse, quietly = TRUE) library(conflicted, quietly = TRUE) # Résolution de conflits entre les bibliothèques dplyr et stats conflict_prefer("filter", "dplyr") conflict_prefer("lag", "dplyr") # source graphical theme: source("ggplot_theme_Publication-2.R") # Load files option_list = list( make_option(c("-s", "--SPvariants"), type="character", default=NULL, help="input identified SP variants table (.csv)", metavar="character"), make_option(c("-c", "--classification"), type="character", default=NULL, help="input classification of reads file (.txt)", metavar="character"), make_option(c("-b", "--barcode"), type="character", default=NULL, help="input barcode", metavar="character")) opt_parser = OptionParser(option_list=option_list) opt = parse_args(opt_parser) # vectors of species & Palettes Colors: SPvariants <- c("SP01", "SP02", "SP03", "SP04", "SP05", "SP06", "SP07", "SP08", "SP09", "SP10", "SP11", "SP12", "SP13", "SP14", "SP15", "SP16", "SP17", "SP18", "SP19", "SP20", "SP21", "SP22") SPvariants <- factor(SPvariants, levels = SPvariants) colors_SP1_22 <- c("#FADD4E", "#FBE0C3", "#C8FF94", "#9ED7C3", "#CCEDE3", "#A9E5F0", "#A9C8E6", "#BBC7DF", "#D8C8EB", "#FFBADC", "#E2C9B5", "#E2B5AB", "#CAC4C5", "#D7D9DF", "#F9B7BA", "#99DDF9", "#ABA7D1", "#B26572", "#FB9651", "#4F7E7E", "#4FBCBA", "#608AD2") palette_SP1_22 <- data.frame(nom = SPvariants, teinte = colors_SP1_22, stringsAsFactors = FALSE) TSS_species <- c("preCore", "pgRNA", "preS1", "preS2/S", "HBx") TSS_species <- factor(TSS_species, levels = TSS_species) colors_RNAs_species <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A") palette_TSS <- data.frame(nom = TSS_species, teinte = colors_RNAs_species, stringsAsFactors = FALSE) new_SP_candidates <- c("new_comb_pg", "new_comb_HBs", "new_comb_HBx", "new_comb_UnC", "new_SP", "new_SP_HBs", "Undefined") new_SP_candidates <- factor(new_SP_candidates, levels = new_SP_candidates) colors_newSP <- c("#AC0E0D", "#CB674F", "#AD565C", "#923222", "#C78A77", "#A92322", "#964B34") palette_new_SP <- data.frame(nom = new_SP_candidates, teinte = colors_newSP, stringsAsFactors = FALSE) SPxx <- c("SPvariants") SPxx <- factor(SPxx, levels = SPxx) color_SPxx <- c("#33C5FF") palette_SPxx <- data.frame(nom = SPxx, teinte = color_SPxx, stringsAsFactors = FALSE) all_species_name <- c(TSS_species, SPvariants, new_SP_candidates, SPxx) palette_complete <- rbind.data.frame(palette_TSS, palette_SP1_22, palette_new_SP, palette_SPxx, stringsAsFactors = FALSE) # Load Start_positions_count files: identified_SP <- read.table(file = opt$SPvariants, header = TRUE) clean_SP <- identified_SP[!duplicated(identified_SP$id),] %>% select(id, mapQ, transcript_strand, JAQ, barcode, promoter, junction, SP_name) countSP <- clean_SP %>% count(SP_name) countSP$SP_name <- factor(countSP$SP_name) countSP <- dplyr::inner_join(palette_complete, countSP, by = c("nom" = "SP_name")) #print(names(palette_complete)) #print(names(countSP)) countSP$nom <- factor(countSP$nom, levels = all_species_name) countSP <- dplyr::mutate(countSP, proportion = (as.numeric(n)/sum(as.numeric(n))*100)) #print(countSP) if (length(countSP$nom) != 0) { ggplot(countSP, aes(x = "percent", y = proportion, fill = nom)) + geom_col() + coord_polar("y") + scale_fill_manual(values = countSP$teinte) + labs(fill = "spliced-variants") + xlab(element_blank()) + ylab(element_blank()) + scale_colour_Publication() + theme_Publication() + theme(axis.text.y = element_blank(), axis.line.x = element_line(colour=NA), axis.line.y = element_line(colour=NA), axis.ticks = element_blank()) ggsave(file = paste0(opt$barcode, "_SP_proportion_piechart.png"), scale = 2, width = 1920, height = 1080, units = "px", dpi = 300) ggplot(countSP, aes(x = nom, y = proportion, fill = nom)) + geom_col() + scale_fill_manual(values = countSP$teinte) + theme(axis.text.x = element_text(angle = 45)) + xlab("Position on HBV genome.") + ylab(element_blank()) + scale_colour_Publication() + theme_Publication() + theme(axis.text.x = element_text(angle = 45, hjust=1)) ggsave(file = paste0(opt$barcode, "_SP_proportion.png"), scale = 2, width = 1920, height = 1080, units = "px", dpi = 300) } # TSS not spliced: classified_reads <- read.table(file = opt$classification, header = TRUE) not_spliced <- classified_reads[!(classified_reads$read_ID %in% clean_SP$id),] not_spliced <- dplyr::mutate(not_spliced, species = not_spliced$promoter) #print(not_spliced) not_spliced <- not_spliced %>% select(read_ID, species) colnames(not_spliced) <- c("id", "species") clean_SP_type <- clean_SP %>% select(id, SP_name) colnames(clean_SP_type) <- c("id", "species") df_species <- rbind.data.frame(not_spliced, clean_SP_type, stringsAsFactors = FALSE) count_species <- df_species %>% count(species) count_species <- dplyr::mutate(count_species, percent = (as.numeric(n)/sum(as.numeric(n))*100)) #print(count_species) write.table(df_species, file = paste0(opt$barcode, "_all_reads_identified.csv"), sep = "\t", quote = FALSE, row.names = FALSE) # Null dataset: species <- c(TSS_species, SPvariants, new_SP_candidates) # canonical_species <- TSS_species[c(1:3,5)] null_count <- data.frame(species = species, n = rep(0, times = length(species)), percent = rep(0, times = length(species)), stringsAsFactors = FALSE) null_count <- transform(null_count, n = as.integer(n), percent = as.numeric(percent)) # Join: count_species <- rbind.data.frame(count_species, subset(null_count, !(species %in% count_species$species)), stringsAsFactors = FALSE) # Merge all SP variants: count_species_SPxx <- data.frame(species = "SPvariants", n = sum(count_species[count_species$species %in% SPvariants,]$n)) count_species_SPxx <- rbind.data.frame(count_species_SPxx, count_species[count_species$species %in% c(TSS_species, SPvariants), 1:2], stringsAsFactors = FALSE) count_species_SPxx <- count_species_SPxx[count_species_SPxx$species %in% all_species_name[c(1:5,35)],] # c(1:3,5,35) count_species_SPxx <- dplyr::mutate(count_species_SPxx, percent=(as.numeric(n)/sum(as.numeric(n))*100)) #print(count_species_SPxx) # save the tab: write.csv(count_species_SPxx, file = paste0(opt$barcode, "_count_canonical_species_SPxx.csv")) # prepare to plot: count_species_SPxx <- dplyr::inner_join(palette_complete, count_species_SPxx, by = c("nom" = "species")) #names(palette_complete) #names(count_species_SPxx) count_species_SPxx$nom <- factor(count_species_SPxx$nom, levels = all_species_name) # Save: write.csv(count_species, file = paste0(opt$barcode, "_count_species.csv")) # RNA species composition all species: count_species <- dplyr::inner_join(palette_complete, count_species, by = c("nom" = "species"), suffix = c("","")) count_species$nom <- factor(count_species$nom, levels = all_species_name) ggplot(count_species, aes(x = nom, y = percent, fill = nom)) + geom_col() + scale_fill_manual(values = count_species$teinte) + labs(fill = element_blank()) + xlab(label = element_blank())+ scale_colour_Publication() + theme_Publication() + theme(axis.text.x = element_text(angle = 45, hjust=1)) ggsave(file = paste0(opt$barcode, "_count_RNAs_species.png"), scale = 2, width = 1920, height = 1080, units = "px", dpi = 300) # Filter RNA species canonical + SPvariants: count_species_clear <- count_species[count_species$nom %in% c(TSS_species, SPvariants),] # RNA species composition all species: count_species_clear <- dplyr::inner_join(palette_complete, count_species_clear, by = c("nom" = "nom"), suffix = c("","")) count_species_clear$nom <- factor(count_species_clear$nom, levels = all_species_name) ggplot(count_species_clear, aes(x = nom, y = percent, fill = nom)) + geom_col() + scale_fill_manual(values = count_species_clear$teinte) + labs(fill = element_blank()) + xlab(label = element_blank()) + scale_colour_Publication() + theme_Publication() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) ggsave(file = paste0(opt$barcode, "_count_RNAs_species_clear.png"), scale = 2, width = 1920, height = 1080, units = "px", dpi = 300) # SP composition clear: count_clear <- clean_SP[clean_SP$SP_name %in% SPvariants,] %>% count(SP_name) count_clear <- dplyr::mutate(count_clear, proportion=(as.numeric(n)/sum(as.numeric(n))*100)) #print(count_clear) count_clear$SP_name <- factor(count_clear$SP_name) count_clear <- dplyr::inner_join(palette_complete, count_clear, by = c("nom" = "SP_name")) #names(count_clear) count_clear$nom <- factor(count_clear$nom, levels = all_species_name) if(nrow(count_clear)!=0){ ggplot(count_clear, aes(x = "percent", y = proportion, fill = nom)) + geom_col() + coord_polar("y") + scale_fill_manual(values = count_clear$teinte) + labs(fill = element_blank()) + scale_colour_Publication() + theme_Publication() + theme(axis.text.y = element_blank(), axis.line.x = element_line(colour=NA), axis.line.y = element_line(colour=NA), axis.ticks = element_blank()) ggsave(file = paste0(opt$barcode, "_SP_clear_proportion_piechart.png"), scale = 2, width = 1920, height = 1080, units = "px", dpi = 300)} ggplot(count_clear, aes(x = nom, y = proportion, fill = nom)) + geom_col() + scale_fill_manual(values = count_clear$teinte) + labs(fill = element_blank()) + xlab(label = element_blank()) + ylab(label = "percent") + scale_colour_Publication() + theme_Publication() + theme(axis.text.x = element_text(angle = 45, hjust=1)) ggsave(file = paste0(opt$barcode, "_SP_clear_proportion.png"), scale = 2, width = 1920, height = 1080, units = "px", dpi = 300) # Camembert SPxx: ggplot(count_species_SPxx, aes(x = "species", y = percent, fill = nom)) + geom_col() + coord_polar("y") + scale_fill_manual(values = count_species_SPxx$teinte) + labs(fill = element_blank()) + ylab(label = element_blank()) + xlab(label = element_blank()) + scale_colour_Publication() + theme_Publication() + theme(axis.text.y = element_blank(), axis.line.x = element_line(colour=NA), axis.line.y = element_line(colour=NA), axis.ticks = element_blank()) ggsave(file = paste0(opt$barcode, "_count_RNAs_species_piechart.png"), scale = 2, width = 1920, height = 1080, units = "px", dpi = 300)