Skip to content
Snippets Groups Projects
Select Git revision
  • e7e81d53b41e34b12679e3b3f3a00d3162a58d31
  • master default protected
  • dev
  • 1.0
4 results

Junctions_NanoSplicer.R

Blame
  • user avatar
    aliarifki authored
    e7e81d53
    History
    Junctions_NanoSplicer.R 14.09 KiB
    #!/bin/Rscript
    
    ################################################################################
    ### NEED TO ADD A CASE OF NO SPLICED-VARIANTS ARE IDENTIFIED !!!!!!!!!!!!!!! ###
    ################################################################################
    library(ggplot2, quietly = TRUE)
    library(tidyr, quietly = TRUE)
    library(plyr, quietly = TRUE)
    library(dplyr, quietly = TRUE)
    library(stringr, quietly = TRUE)
    library(optparse)
    
    # Load classification per promoter:
    option_list = list(
      make_option(c("-c", "--classification"), type="character", default="./classification.txt", 
                  help="input classification or reads file (.txt)", metavar="character"),
      make_option(c("-j", "--jwr"), type="character", default=NULL, 
                  help="input nanosplicer results table (.csv)", metavar="character"))
    opt_parser = OptionParser(option_list=option_list)
    opt = parse_args(opt_parser)
    reads_pos <- read.table(opt$classification,
                            sep = "\t")
    colnames(reads_pos) <- c("id", reads_pos[1,2:length(reads_pos[1,])])
    reads_pos <- reads_pos[2:length(reads_pos$id),]
    
    # Load Nanosplicer results:
    df <- read.csv(opt$jwr)
    colnames(df)[1] <- "juncNumber"
    
    # split donor and acceptor positions:
    df <- df %>%
      separate(col = "loc",
               into = c("donor", "acceptor"), sep = ", ",
               extra = "warn")
    
    df$donor <- str_replace(df$donor, '[(]', '')
    df$acceptor <- str_replace(df$acceptor, '[)]', '')
    
    df <- mutate(df, 
                 pg_donor = as.numeric(donor)-122,
                 pg_acceptor = as.numeric(acceptor)-122)
    
    df <- merge(df, reads_pos, by = 'id')
    
    assignation_donor <- function(pg_donor) {
      if (pg_donor >= 620) {
        if (pg_donor < 643 & pg_donor >= 620) {
          donor_site <- "J2450_"
        }
        else if (pg_donor <= 675 & pg_donor >= 643) {
          donor_site <- "J2474_"
        }
        else if (pg_donor <= 1182 & pg_donor >= 1162) {
          donor_site <- "J2988_"
        }
        else if (pg_donor <= 1835 & pg_donor >= 1815) {
          donor_site <- "J461_"
        }
        else {
          donor_site <- paste0("J", pg_donor, "new_")
        }
      }
      else if (pg_donor <= 284 & pg_donor >= 264) {
        donor_site <- "J2090_"
      }
      else if (pg_donor <= 261 & pg_donor >= 240) {
        donor_site <- "J2070_"
      }
      else {
        donor_site <- paste0("J", pg_donor, "new_")
      }
      return(donor_site)
    }
    
    assignation_acceptor <- function(pg_acceptor) {
      if (pg_acceptor > 1868) {
        if (pg_acceptor >= 2452 & pg_acceptor <= 2468) {
          acceptor_site <- "3170"
        }
        else if (pg_acceptor >= 2661 & pg_acceptor <= 2685) {
          acceptor_site <- "1306"
        }
        else if (pg_acceptor >= 2743 & pg_acceptor <= 2762) {
          acceptor_site <- "1386"
        }
        else {
          acceptor_site <- paste0(pg_acceptor,"new")
        }
      }
      else if (pg_acceptor >= 1843 & pg_acceptor <= 1868) {
        acceptor_site <- "490"
      }
      else if (pg_acceptor >= 1639 & pg_acceptor <= 1668) {
        acceptor_site <- "283"
      }
      else if (pg_acceptor >= 1076 & pg_acceptor <= 1098) {
        acceptor_site <- "2903"
      }
      else if (pg_acceptor >= 525 & pg_acceptor <= 547) {
        acceptor_site <- "2351"
      }
      else if (pg_acceptor >= 410 & pg_acceptor <= 432) {
        acceptor_site <- "2237"
      }
      else {
        acceptor_site <- paste0(pg_acceptor,"new")
      }
      return(acceptor_site)
    }
    
    df$donor_site <- sapply(df$pg_donor, assignation_donor)
    df$acceptor_site <- sapply(df$pg_acceptor, assignation_acceptor)
    df <- mutate(df,
                 junction = paste0(donor_site, acceptor_site))
    
    write.table(df, file = "JWR_check_parsed.csv", row.names = FALSE, sep = "\t")
    
    duplicated2 <- function(x){
      if (sum(dup <- duplicated(x))==0)
        return(dup)
      if (class(x) %in% c("data.frame","matrix"))
        duplicated(rbind(x[dup,],x))[-(1:sum(dup))]
      else duplicated(c(x[dup],x))[-(1:sum(dup))]
    }
    
    list_read_multiple <- unique(df[duplicated2(df$id),]$id)
    multiple_junction <- df[duplicated2(df$id),]
    single_junction <- df %>% dplyr::filter(!(id %in% list_read_multiple))
    
    SP_assignation_single <- function(site, promoter) {
      if (promoter == "pgRNA" | promoter == "preCore") {
        if (str_detect(site, "new")) { SP_name <- "new_SP" }
        else if (site == "J2450_490") { SP_name <- "SP01" }
        else if (site == "J2070_490") { SP_name <- "SP03" }
        else if (site == "J2090_490") { SP_name <- "SP05" }
        else if (site == "J2474_490") { SP_name <- "SP06" }
        else if (site == "J2450_283") { SP_name <- "SP09" }
        else if (site == "J2474_283") { SP_name <- "SP11" }
        else if (site == "J2988_490") { SP_name <- "SP13" }
        else if (site == "J2450_2903") { SP_name <- "SP14" }
        else if (site == "J2070_283") { SP_name <- "SP17" }
        else if (site == "J2988_3170") { SP_name <- "SP19" }
        else if (site == "J2988_283") { SP_name <- "SP20" }
        else { SP_name <- "new_comb_pg" }
      }
      else if (promoter == "preS1" | promoter == "preS2/S") {
        if (str_detect(site, "new")) { SP_name <- "new_SP" }
        else if (site == "J461_1306") { SP_name <- "SP21" }
        else if (site == "J461_1386") { SP_name <- "SP22" }
        else { SP_name <- "new_comb_HBs" }
      }
      else if (promoter == "HBx") {
        if (str_detect(site, "new")) { SP_name <- "new_SP" }
        else { SP_name <- "new_comb_HBx" }
      }
      else if (promoter == "Undefined") {
        if (str_detect(site, "new")) { SP_name <- "new_SP" }
        else { SP_name <- "new_comb_UnC" }
      }
      else { 
        if (str_detect(site, "new")) { SP_name <- "new_SP" }
        else { SP_name <- "error" }
      }
    }
    
    single_junction <- ddply(single_junction,
                             .(id), 
                             mutate,
                             SP_name = SP_assignation_single(junction, promoter))
    
    SP_assignation_multiple <- function(read_id, combinaison, promoter) {
      if (promoter == "pgRNA" | promoter == "preCore") {
        
        # New junctions detection:
        if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
          SP_name <- "new_SP"
        }
        
        #SP02, SP08, SP10, SP16:
        else if ("J2070_2351" %in% combinaison) {
          #SP02:
          if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
            SP_name <- "SP02"
          }
          
          #SP08:
          else if ("J2450_2903" %in% combinaison & "J2988_490" %in% combinaison &
                   length(combinaison) == 3) {
            SP_name <- "SP08"
          }
          
          #SP10:
          else if ("J2450_283" %in% combinaison & length(combinaison) == 2) {
            SP_name <- "SP10"
          }
          
          #SP16:
          else if ("J2474_490" %in% combinaison & length(combinaison) == 2) {
            SP_name <- "SP16"
          }
          
          #If there is another combination of already described junction associated to "J2070_2351":
          else {SP_name <- "new_comb_pg"}
        }
        
        #SP12, SP15:
        else if ("J2070_2237" %in% combinaison) {
          #SP12:
          if ("J2450_283" %in% combinaison & length(combinaison) == 2) {
            SP_name <- "SP12"
          }
          else if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
            SP_name <- "SP15"
          }
          #If there is another combination of already described junction associated to "J2070_2237":
          else {SP_name <- "new_comb_pg"}
        }
        
        #SP04:
        else if ("J2090_2351" %in% combinaison) {
          if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
            SP_name <- "SP04"
          }
          #If there is another combination of already described junction associated to "J2090_2351":
          else {SP_name <- "new_comb_pg"}
        }
        
        #SP07, SP18:
        else if ("J2988_490" %in% combinaison) {
          #SP07:
          if ("J2450_2903" %in% combinaison & length(combinaison) == 2) {
            SP_name <- "SP07"
          }
          #SP18:
          else if ("J2474_2903" %in% combinaison & length(combinaison) == 2) {
            SP_name <- "SP18"
          }
          #If there is another combination of already described junction associated to "J2988_490":
          else {SP_name <- "new_comb_pg"}
        }
        
        #New combinaisons of multiple already described junctions:
        else {SP_name <- "new_comb_pg"}
      }
      
      #SP from preS1, preS2/2:
      else if (promoter == "preS1" | promoter == "preS2/S") {
        if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
          SP_name <- "new_SP_HBs"
        }
        else { SP_name <- "new_comb_HBs"}
      }
      
      #SP from HBx:
      else if (promoter == "HBx") {
        if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
            SP_name <- "new_SP_HBx"
          }
        else { SP_name <- "new_comb_HBx"}
      }
      
      #SP from Undefined:
      else if (promoter == "Undefined") {
        if (str_detect(str_c(combinaison, collapse = "_"), "new")) { SP_name <- "new_SP" }
        else { SP_name <- "new_comb_UnC" }
      }
      
      #Error case:
      else {
      SP_name <- "error"
      }
    return(SP_name)
    }
    
    tmp <- multiple_junction %>% select(id, junction, promoter)
    df_combinaison <- data.frame(matrix(nrow = 0, ncol = 2))
    colnames(df_combinaison) <- c("id", "SP_name")
    
    for (read_id in list_read_multiple) {
      SP_name_computed <- SP_assignation_multiple(read_id, tmp[tmp$id == read_id,]$junction, tmp[tmp$id == read_id,]$promoter[1])
      res_vector <- data.frame(t(c(read_id, SP_name_computed)))
      colnames(res_vector) <- colnames(df_combinaison)
      df_combinaison <- rbind(df_combinaison, res_vector)
    }
    
    # df_combinaison <- df_combinaison[2:length(df_combinaison$id),]
    
    multiple_junction <- merge(multiple_junction, df_combinaison, by="id")
    
    df_SPvariants <- rbind(single_junction, multiple_junction)
    
    SP_variant_unique <- df_SPvariants %>% select(id, SP_name)
    SP_variant_unique <- SP_variant_unique[!duplicated(SP_variant_unique$id),] # distinct(SP_variant_unique, id)
    
    write.table(df_SPvariants, 
                "identified_SPvariants.csv", 
                row.names = FALSE, 
                sep = "\t", 
                quote = FALSE)
    
    ggplot(df, aes(x=pg_donor)) +
      geom_histogram(aes(y=after_stat(density)),color="darkblue", fill="lightblue") +
      geom_density(alpha=.2, fill="lightblue") +
      geom_vline(aes(xintercept=median(pg_donor)),
                  color="blue", linetype="dashed", linewidth=1) +
      geom_vline(aes(xintercept=quantile(pg_donor, 0.025)),
               linetype="dashed", linewidth=0.25) +
      geom_vline(aes(xintercept=quantile(pg_donor, 0.975)),
               linetype="dashed", linewidth=0.25) +
      geom_vline(aes(xintercept=quantile(pg_donor, 0.01)),
                 color="green", linetype="dashed", linewidth=0.25) +
      geom_vline(aes(xintercept=quantile(pg_donor, 0.99)),
                 color="green", linetype="dashed", linewidth=0.25) +
      geom_vline(aes(xintercept=(median(pg_donor)+sd(pg_donor))),
                 color="red", linewidth=0.5) +
      geom_vline(aes(xintercept=(median(pg_donor)-sd(pg_donor))),
                 color="red", linewidth=0.5) +
      scale_x_continuous(breaks=c(min(df$pg_donor), 
                                  quantile(df$pg_donor, 0.025),
                                  quantile(df$pg_donor, 0.005),
                                  median(df$pg_donor)-sd(df$pg_donor),
                                  median(df$pg_donor),
                                  median(df$pg_donor)+sd(df$pg_donor),
                                  quantile(df$pg_donor, 0.975),
                                  quantile(df$pg_donor, 0.995),
                                  max(df$pg_donor)),
                         label = c(min(df$pg_donor),
                                   floor(quantile(df$pg_donor, 0.025)),
                                   floor(quantile(df$pg_donor, 0.005)),
                                   round(median(df$pg_donor)-sd(df$pg_donor)),
                                   median(df$pg_donor),
                                   round(median(df$pg_donor)+sd(df$pg_donor)),
                                   floor(quantile(df$pg_donor, 0.975))+1,
                                   round(quantile(df$pg_donor, 0.995))+1,
                                   max(df$pg_donor))) +
      theme(axis.text.x = element_text(angle = 45))
    
    ggsave(filename = "Donor_curve.png",
           device = "png",
           scale = 1,
           width = 1920,
           height = 1080,
           units = "px",
           dpi = 320)
    
    ggplot(df, aes(x=pg_acceptor)) +
      geom_histogram(aes(y=after_stat(density)),color="red", fill="darksalmon") +
      geom_density(alpha=.2, fill="darksalmon") +
      geom_vline(aes(xintercept=median(pg_acceptor)),
                 color="red", linetype="dashed", linewidth=1) +
      geom_vline(aes(xintercept=quantile(pg_acceptor, 0.025)),
               linetype="dashed", linewidth=0.25) +
      geom_vline(aes(xintercept=quantile(pg_acceptor, 0.975)),
               linetype="dashed", linewidth=0.25) +
      geom_vline(aes(xintercept=quantile(pg_acceptor, 0.005)),
                 color="green", linetype="dashed", linewidth=0.25) +
      geom_vline(aes(xintercept=quantile(pg_acceptor, 0.995)),
                 color="green", linetype="dashed", linewidth=0.25) +
      geom_vline(aes(xintercept=(median(pg_acceptor)+sd(pg_acceptor))),
                 color="blue", linewidth=0.5) +
      geom_vline(aes(xintercept=(median(pg_acceptor)-sd(pg_acceptor))),
                 color="blue", linewidth=0.5) +
      scale_x_continuous(breaks=c(min(df$pg_acceptor), 
                                  quantile(df$pg_acceptor, 0.025),
                                  quantile(df$pg_acceptor, 0.005),
                                  median(df$pg_acceptor)-sd(df$pg_acceptor),
                                  median(df$pg_acceptor),
                                  median(df$pg_acceptor)+sd(df$pg_acceptor),
                                  quantile(df$pg_acceptor, 0.975),
                                  quantile(df$pg_acceptor, 0.995),
                                  max(df$pg_acceptor)),
                           label = c(min(df$pg_acceptor),
                                     floor(quantile(df$pg_acceptor, 0.025)),
                                     floor(quantile(df$pg_acceptor, 0.005)),
                                     round(median(df$pg_acceptor)-sd(df$pg_acceptor)),
                                     median(df$pg_acceptor),
                                     round(median(df$pg_acceptor)+sd(df$pg_acceptor)),
                                     floor(quantile(df$pg_acceptor, 0.975))+1,
                                     floor(quantile(df$pg_acceptor, 0.995))+1,
                                     max(df$pg_acceptor))) +
      theme(axis.text.x = element_text(angle = 45))
    
    ggsave(filename = "Acceptor_curve.png",
           device = "png",
           scale = 1,
           width = 1920,
           height = 1080,
           units = "px",
           dpi = 320)
    
    # Graphs and tests:
    
    # sink("test_shapiro.txt")
    # print("Normality test: Shapiro-Wilk")
    # print("Donor site:")
    # print(shapiro.test(df$pg_donor))
    # print("Acceptor site:")
    # print(shapiro.test(df$pg_acceptor))
    # sink()