Skip to content
Snippets Groups Projects
Select Git revision
  • a972f993a389ca6cdc9e5b6d17ac1e0fe4d568fd
  • master default protected
  • dev
  • v2.0.0
  • v0.4.0
  • v0.3.0
  • v0.2.9
  • v0.2.8
  • v0.2.7
  • v0.2.6
  • v0.1.0
  • v0.2.5
  • v0.2.4
  • v0.2.3
  • v0.2.2
  • v0.2.1
  • v0.2.0
  • v0.1.2
18 results

build.sh

Blame
  • HBV_RNAs_count.R 10.59 KiB
    #!/bin/Rscript
    # Packages loading
    library(ggplot2, quietly = TRUE)
    library(tidyr, quietly = TRUE)
    library(plyr, quietly = TRUE)
    library(dplyr, quietly = TRUE)
    library(stringr, quietly = TRUE)
    library(RColorBrewer)
    library(optparse)
    
    # Load files
    option_list = list(
      make_option(c("-s", "--SPvariants"), type="character", default=NULL, 
                  help="input identified SP variants table (.csv)", metavar="character"),
      make_option(c("-c", "--classification"), type="character", default=NULL, 
                  help="input classification of reads file (.txt)", metavar="character"))
    opt_parser = OptionParser(option_list=option_list)
    opt = parse_args(opt_parser)
    
    # vectors of species & Palettes Colors:
    SPvariants <- c("SP01", "SP02", "SP03", "SP04", "SP05", "SP06", "SP07", "SP08",
                    "SP09", "SP10", "SP11", "SP12", "SP13", "SP14", "SP15", "SP16",
                    "SP17", "SP18", "SP19", "SP20", "SP21", "SP22")
    SPvariants <- factor(SPvariants, levels = SPvariants)
    colors_SP1_22 <- c("#FADD4E", "#FBE0C3", "#C8FF94", "#9ED7C3", "#CCEDE3", 
                       "#A9E5F0", "#A9C8E6", "#BBC7DF", "#D8C8EB", "#FFBADC", 
                       "#E2C9B5", "#E2B5AB", "#CAC4C5", "#D7D9DF", "#F9B7BA", 
                       "#99DDF9", "#ABA7D1", "#B26572", "#FB9651", "#4F7E7E", 
                       "#4FBCBA", "#608AD2")
    palette_SP1_22 <- data.frame(nom = SPvariants, teinte = colors_SP1_22, 
                                 stringsAsFactors = FALSE)
    
    TSS_species <- c("preCore", "pgRNA", "preS1", "preS2/S", "HBx")
    TSS_species <- factor(TSS_species, levels = TSS_species)
    colors_RNAs_species <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A")
    palette_TSS <- data.frame(nom = TSS_species, teinte = colors_RNAs_species, 
                              stringsAsFactors = FALSE)
    
    new_SP_candidates <- c("new_comb_pg", "new_comb_HBs", "new_comb_HBx", 
                           "new_comb_UnC", "new_SP", "new_SP_HBs", "Undefined")
    new_SP_candidates <- factor(new_SP_candidates, levels = new_SP_candidates)
    colors_newSP <- c("#AC0E0D", "#CB674F", "#AD565C", "#923222", "#C78A77", 
                      "#A92322", "#964B34")
    palette_new_SP <- data.frame(nom = new_SP_candidates, teinte = colors_newSP, 
                                 stringsAsFactors = FALSE)
    
    SPxx <- c("SPvariants")
    SPxx <- factor(SPxx, levels = SPxx)
    color_SPxx <- c("#33C5FF")
    palette_SPxx <- data.frame(nom = SPxx, 
                               teinte = color_SPxx, 
                               stringsAsFactors = FALSE)
    
    all_species_name <- c(TSS_species, SPvariants, new_SP_candidates, SPxx)
    
    palette_complete <- rbind.data.frame(palette_TSS, 
                                         palette_SP1_22, 
                                         palette_new_SP,
                                         palette_SPxx,
                                         stringsAsFactors = FALSE)
    
    # Load Start_positions_count files:
    identified_SP <- read.table(file = opt$SPvariants,
                                header = TRUE)
    
    clean_SP <- identified_SP[!duplicated(identified_SP$id),] %>% 
      select(id, mapQ, transcript_strand, JAQ, barcode, promoter, junction, SP_name)
    
    countSP <- clean_SP %>% count(SP_name)
    countSP <- dplyr::inner_join(palette_complete, 
                          countSP, 
                          by = c("nom" = "SP_name"))
    #print(names(palette_complete))
    #print(names(countSP))
    
    countSP$nom <- factor(countSP$nom, levels = all_species_name)
    countSP <- dplyr::mutate(countSP,
                      proportion = (as.numeric(n)/sum(as.numeric(n))*100))
    #print(countSP)
    ggplot(countSP, aes(x = "percent", 
                        y = proportion,
                        fill = nom)) +
      geom_col() +
      coord_polar("y") +
      scale_fill_manual(values = countSP$teinte) +
      labs(fill = "spliced-variants")
    
    ggsave(file = "SP_proportion_camembert.png",
           scale = 2,
           width = 1920,
           height = 1080,
           units = "px",
           dpi = 300)
    
    ggplot(countSP, aes(x = nom, y = proportion, fill = nom)) +
      geom_col() +
      scale_fill_manual(values = countSP$teinte) + 
      theme(axis.text.x = element_text(angle = 45)) +
      labs(fill = "spliced-variants") +
      xlab(label = "spliced-variants") +
      ylab(label = "percent")
    
    ggsave(file = "SP_proportion.png",
           scale = 2,
           width = 1920,
           height = 1080,
           units = "px",
           dpi = 300)
    
    # TSS not spliced:
    classified_reads <- read.table(file = opt$classification, 
                                   header = TRUE)
    
    not_spliced <- classified_reads[!(classified_reads$read_ID %in% clean_SP$id),]
    not_spliced <- dplyr::mutate(not_spliced,
                          species = not_spliced$promoter)
    #print(not_spliced)
    not_spliced <- not_spliced %>% select(read_ID, species)
    colnames(not_spliced) <- c("id", "species")
    
    clean_SP_type <- clean_SP %>% select(id, SP_name)
    colnames(clean_SP_type) <- c("id", "species")
    
    df_species <- rbind.data.frame(not_spliced, clean_SP_type, 
                                   stringsAsFactors = FALSE)
    count_species <- df_species %>% count(species)
    count_species <- dplyr::mutate(count_species,
                            percent = (as.numeric(n)/sum(as.numeric(n))*100))
    #print(count_species)
    write.table(df_species, file = "All_reads_identified.csv", 
                sep = "\t", quote = FALSE, row.names = FALSE)
    
    # Null dataset:
    species <- c(TSS_species, SPvariants, new_SP_candidates)
    
    canonical_species <- TSS_species[c(1:3,5)]
    
    null_count <- data.frame(species = species, 
                             n = rep(0, times = length(species)),
                             percent = rep(0, times = length(species)), 
                             stringsAsFactors = FALSE)
    
    null_count <- transform(null_count,
                            n = as.integer(n),
                            percent = as.numeric(percent))
    
    # Join:
    count_species <- rbind.data.frame(count_species, 
                                      subset(null_count, 
                                             !(species %in% count_species$species)),
                                      stringsAsFactors = FALSE)
    
    # Merge all SP variants:
    count_species_SPxx <- data.frame(species = "SPvariants", 
                                     n = sum(count_species[count_species$species 
                                                           %in% SPvariants,]$n))
    count_species_SPxx <- rbind.data.frame(count_species_SPxx,
                                           count_species[count_species$species 
                                                         %in% c(canonical_species, 
                                                                SPvariants), 1:2],
                                           stringsAsFactors = FALSE)
    count_species_SPxx <- count_species_SPxx[count_species_SPxx$species %in% 
                                               all_species_name[c(1:3,5,35)],]
    count_species_SPxx <- dplyr::mutate(count_species_SPxx,
                                 percent=(as.numeric(n)/sum(as.numeric(n))*100))
    #print(count_species_SPxx)
    # save the tab:
    write.csv(count_species_SPxx, file = "Count_canonical_species_SPxx.csv")
    
    # prepare to plot:
    count_species_SPxx <- dplyr::inner_join(palette_complete,
                                     count_species_SPxx,
                                     by = c("nom" = "species"))
    #names(palette_complete)
    #names(count_species_SPxx)
    count_species_SPxx$nom <- factor(count_species_SPxx$nom, levels = all_species_name)
    
    # Save:
    write.csv(count_species, file = "Count_species.csv")
    
    # RNA species composition all species:
    count_species <- dplyr::inner_join(palette_complete, count_species,
                                      by = c("nom" = "species"),
                                      suffix = c("",""))
    count_species$nom <- factor(count_species$nom, levels = all_species_name)
    
    ggplot(count_species, 
           aes(x = nom, 
               y = percent,
               fill = nom)) +
      geom_col() +
      scale_fill_manual(values = count_species$teinte) +
      theme(axis.text.x = element_text(angle = 45)) +
      labs(fill = "RNA species & spliced-variants") +
      xlab(label = "RNA species & spliced-variants")
    
    ggsave(file = "Count_RNAs_species.png",
           scale = 2,
           width = 1920,
           height = 1080,
           units = "px",
           dpi = 300)
    
    # Filter RNA species canonical + SPvariants:
    count_species_clear <- count_species[count_species$species %in% c(canonical_species, SPvariants),]
    
    # RNA species composition all species:
    count_species_clear <- dplyr::inner_join(palette_complete, count_species_clear,
                                by = c("nom" = "nom"),
                                suffix = c("",""))
    count_species_clear$nom <- factor(count_species_clear$nom, levels = all_species_name)
    
    ggplot(count_species_clear, 
           aes(x = nom, 
               y = percent,
               fill = nom)) +
      geom_col() +
      scale_fill_manual(values = count_species_clear$teinte) +
      theme(axis.text.x = element_text(angle = 45)) +
      labs(fill = "RNA species & spliced-variants") +
      xlab(label = "RNA species & spliced-variants")
    
    ggsave(file = "Count_RNAs_species_clear.png",
           scale = 2,
           width = 1920,
           height = 1080,
           units = "px",
           dpi = 300)
    
    # SP composition clear:
    count_clear <- clean_SP[clean_SP$SP_name %in% SPvariants,] %>% count(SP_name)
    count_clear <- dplyr::mutate(count_clear,
                          proportion=(as.numeric(n)/sum(as.numeric(n))*100))
    #print(count_clear)
    count_clear <- dplyr::inner_join(palette_complete,
                              count_clear, 
                              by = c("nom" = "SP_name"))
    #names(count_clear)
    count_clear$nom <- factor(count_clear$nom, levels = all_species_name)
    
    ggplot(count_clear, aes(x = "percent", 
                            y = proportion,
                            fill = nom)) +
      geom_col() +
      coord_polar("y") +
      scale_fill_manual(values = count_clear$teinte) +
      labs(fill = "spliced-variants")
    
    ggsave(file = "SP_clear_proportion_camembert.png",
           scale = 2,
           width = 1920,
           height = 1080,
           units = "px",
           dpi = 300)
    
    ggplot(count_clear, aes(x = nom, 
                            y = proportion,
                            fill = nom)) +
      geom_col() +
      scale_fill_manual(values = count_clear$teinte) + 
      theme(axis.text.x = element_text(angle = 45)) +
      labs(fill = "spliced-variants") +
      xlab(label = "spliced-variants") +
      ylab(label = "percent")
    
    ggsave(file = "SP_clear_proportion.png",
           scale = 2,
           width = 1920,
           height = 1080,
           units = "px",
           dpi = 300)
    
    # Camembert SPxx:
    ggplot(count_species_SPxx, aes(x = "species", 
                                   y = percent,
                                   fill = nom)) +
      geom_col() +
      coord_polar("y") +
      scale_fill_manual(values = count_species_SPxx$teinte) +
      labs(fill = "TSS") +
      ylab(label = "TSS usage") +
      xlab(label = "percent")
    
    ggsave(file = "Count_RNAs_species_camembert.png",
           scale = 2,
           width = 1920,
           height = 1080,
           units = "px",
           dpi = 300)