Junctions_NanoSplicer.R

#!/bin/Rscript

################################################################################
### NEED TO ADD A CASE OF NO SPLICED-VARIANTS ARE IDENTIFIED !!!!!!!!!!!!!!! ###
################################################################################
library(ggplot2, quietly = TRUE)
library(tidyr, quietly = TRUE)
library(plyr, quietly = TRUE)
library(dplyr, quietly = TRUE)
library(stringr, quietly = TRUE)
library(optparse)

# Load classification per promoter:
option_list = list(
  make_option(c("-c", "--classification"), type="character", default="./classification.txt", 
              help="input classification or reads file (.txt)", metavar="character"),
  make_option(c("-j", "--jwr"), type="character", default=NULL, 
              help="input nanosplicer results table (.csv)", metavar="character"))
opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)
reads_pos <- read.table(opt$classification,
                        sep = "\t")
colnames(reads_pos) <- c("id", reads_pos[1,2:length(reads_pos[1,])])
reads_pos <- reads_pos[2:length(reads_pos$id),]

# Load Nanosplicer results:
df <- read.csv(opt$jwr)
colnames(df)[1] <- "juncNumber"

# split donor and acceptor positions:
df <- df %>%
  separate(col = "loc",
           into = c("donor", "acceptor"), sep = ", ",
           extra = "warn")

df$donor <- str_replace(df$donor, '[(]', '')
df$acceptor <- str_replace(df$acceptor, '[)]', '')

df <- mutate(df, 
             pg_donor = as.numeric(donor)-122,
             pg_acceptor = as.numeric(acceptor)-122)

df <- merge(df, reads_pos, by = 'id')

assignation_donor <- function(pg_donor) {
  if (pg_donor >= 620) {
    if (pg_donor < 643 & pg_donor >= 620) {
      donor_site <- "J2450_"
    }
    else if (pg_donor <= 675 & pg_donor >= 643) {
      donor_site <- "J2474_"
    }
    else if (pg_donor <= 1182 & pg_donor >= 1162) {
      donor_site <- "J2988_"
    }
    else if (pg_donor <= 1835 & pg_donor >= 1815) {
      donor_site <- "J461_"
    }
    else {
      donor_site <- paste0("J", pg_donor, "new_")
    }
  }
  else if (pg_donor <= 284 & pg_donor >= 264) {
    donor_site <- "J2090_"
  }
  else if (pg_donor <= 261 & pg_donor >= 240) {
    donor_site <- "J2070_"
  }
  else {
    donor_site <- paste0("J", pg_donor, "new_")
  }
  return(donor_site)
}

assignation_acceptor <- function(pg_acceptor) {
  if (pg_acceptor > 1868) {
    if (pg_acceptor >= 2452 & pg_acceptor <= 2468) {
      acceptor_site <- "3170"
    }
    else if (pg_acceptor >= 2661 & pg_acceptor <= 2685) {
      acceptor_site <- "1306"
    }
    else if (pg_acceptor >= 2743 & pg_acceptor <= 2762) {
      acceptor_site <- "1386"
    }
    else {
      acceptor_site <- paste0(pg_acceptor,"new")
    }
  }
  else if (pg_acceptor >= 1843 & pg_acceptor <= 1868) {
    acceptor_site <- "490"
  }
  else if (pg_acceptor >= 1639 & pg_acceptor <= 1668) {
    acceptor_site <- "283"
  }
  else if (pg_acceptor >= 1076 & pg_acceptor <= 1098) {
    acceptor_site <- "2903"
  }
  else if (pg_acceptor >= 525 & pg_acceptor <= 547) {
    acceptor_site <- "2351"
  }
  else if (pg_acceptor >= 410 & pg_acceptor <= 432) {
    acceptor_site <- "2237"
  }
  else {
    acceptor_site <- paste0(pg_acceptor,"new")
  }
  return(acceptor_site)
}

df$donor_site <- sapply(df$pg_donor, assignation_donor)
df$acceptor_site <- sapply(df$pg_acceptor, assignation_acceptor)
df <- mutate(df,
             junction = paste0(donor_site, acceptor_site))

write.table(df, file = "JWR_check_parsed.csv", row.names = FALSE, sep = "\t")

duplicated2 <- function(x){
  if (sum(dup <- duplicated(x))==0)
    return(dup)
  if (class(x) %in% c("data.frame","matrix"))
    duplicated(rbind(x[dup,],x))[-(1:sum(dup))]
  else duplicated(c(x[dup],x))[-(1:sum(dup))]
}

list_read_multiple <- unique(df[duplicated2(df$id),]$id)
multiple_junction <- df[duplicated2(df$id),]
single_junction <- df %>% dplyr::filter(!(id %in% list_read_multiple))

SP_assignation_single <- function(site, promoter) {
  if (promoter == "pgRNA" | promoter == "preCore") {
    if (str_detect(site, "new")) { SP_name <- "new_SP" }
    else if (site == "J2450_490") { SP_name <- "SP01" }
    else if (site == "J2070_490") { SP_name <- "SP03" }
    else if (site == "J2090_490") { SP_name <- "SP05" }
    else if (site == "J2474_490") { SP_name <- "SP06" }
    else if (site == "J2450_283") { SP_name <- "SP09" }
    else if (site == "J2474_283") { SP_name <- "SP11" }
    else if (site == "J2988_490") { SP_name <- "SP13" }
    else if (site == "J2450_2903") { SP_name <- "SP14" }
    else if (site == "J2070_283") { SP_name <- "SP17" }
    else if (site == "J2988_3170") { SP_name <- "SP19" }
    else if (site == "J2988_283") { SP_name <- "SP20" }
    else { SP_name <- "new_comb_pg" }
  }
  else if (promoter == "preS1" | promoter == "preS2/S") {
    if (str_detect(site, "new")) { SP_name <- "new_SP" }
    else if (site == "J461_1306") { SP_name <- "SP21" }
    else if (site == "J461_1386") { SP_name <- "SP22" }
    else { SP_name <- "new_comb_HBs" }
  }
  else if (promoter == "HBx") {
    if (str_detect(site, "new")) { SP_name <- "new_SP" }
    else { SP_name <- "new_comb_HBx" }
  }
  else if (promoter == "Undefined") {
    if (str_detect(site, "new")) { SP_name <- "new_SP" }
    else { SP_name <- "new_comb_UnC" }
  }
  else { 
    if (str_detect(site, "new")) { SP_name <- "new_SP" }
    else { SP_name <- "error" }
  }
}

single_junction <- ddply(single_junction,
                         .(id), 
                         mutate,
                         SP_name = SP_assignation_single(junction, promoter))

SP_assignation_multiple <- function(read_id, combinaison, promoter) {
  if (promoter == "pgRNA" | promoter == "preCore") {
    
    # New junctions detection:
    if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
      SP_name <- "new_SP"
    }
    
    #SP02, SP08, SP10, SP16:
    else if ("J2070_2351" %in% combinaison) {
      #SP02:
      if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP02"
      }
      
      #SP08:
      else if ("J2450_2903" %in% combinaison & "J2988_490" %in% combinaison &
               length(combinaison) == 3) {
        SP_name <- "SP08"
      }
      
      #SP10:
      else if ("J2450_283" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP10"
      }
      
      #SP16:
      else if ("J2474_490" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP16"
      }
      
      #If there is another combination of already described junction associated to "J2070_2351":
      else {SP_name <- "new_comb_pg"}
    }
    
    #SP12, SP15:
    else if ("J2070_2237" %in% combinaison) {
      #SP12:
      if ("J2450_283" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP12"
      }
      else if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP15"
      }
      #If there is another combination of already described junction associated to "J2070_2237":
      else {SP_name <- "new_comb_pg"}
    }
    
    #SP04:
    else if ("J2090_2351" %in% combinaison) {
      if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP04"
      }
      #If there is another combination of already described junction associated to "J2090_2351":
      else {SP_name <- "new_comb_pg"}
    }
    
    #SP07, SP18:
    else if ("J2988_490" %in% combinaison) {
      #SP07:
      if ("J2450_2903" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP07"
      }
      #SP18:
      else if ("J2474_2903" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP18"
      }
      #If there is another combination of already described junction associated to "J2988_490":
      else {SP_name <- "new_comb_pg"}
    }
    
    #New combinaisons of multiple already described junctions:
    else {SP_name <- "new_comb_pg"}
  }
  
  #SP from preS1, preS2/2:
  else if (promoter == "preS1" | promoter == "preS2/S") {
    if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
      SP_name <- "new_SP_HBs"
    }
    else { SP_name <- "new_comb_HBs"}
  }
  
  #SP from HBx:
  else if (promoter == "HBx") {
    if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
        SP_name <- "new_SP_HBx"
      }
    else { SP_name <- "new_comb_HBx"}
  }
  
  #SP from Undefined:
  else if (promoter == "Undefined") {
    if (str_detect(str_c(combinaison, collapse = "_"), "new")) { SP_name <- "new_SP" }
    else { SP_name <- "new_comb_UnC" }
  }
  
  #Error case:
  else {
  SP_name <- "error"
  }
return(SP_name)
}

tmp <- multiple_junction %>% select(id, junction, promoter)
df_combinaison <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(df_combinaison) <- c("id", "SP_name")

for (read_id in list_read_multiple) {
  SP_name_computed <- SP_assignation_multiple(read_id, tmp[tmp$id == read_id,]$junction, tmp[tmp$id == read_id,]$promoter[1])
  res_vector <- data.frame(t(c(read_id, SP_name_computed)))
  colnames(res_vector) <- colnames(df_combinaison)
  df_combinaison <- rbind(df_combinaison, res_vector)
}

# df_combinaison <- df_combinaison[2:length(df_combinaison$id),]

multiple_junction <- merge(multiple_junction, df_combinaison, by="id")

df_SPvariants <- rbind(single_junction, multiple_junction)

SP_variant_unique <- df_SPvariants %>% select(id, SP_name)
SP_variant_unique <- SP_variant_unique[!duplicated(SP_variant_unique$id),] # distinct(SP_variant_unique, id)

write.table(df_SPvariants, 
            "identified_SPvariants.csv", 
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE)

ggplot(df, aes(x=pg_donor)) +
  geom_histogram(aes(y=after_stat(density)),color="darkblue", fill="lightblue") +
  geom_density(alpha=.2, fill="lightblue") +
  geom_vline(aes(xintercept=median(pg_donor)),
              color="blue", linetype="dashed", linewidth=1) +
  geom_vline(aes(xintercept=quantile(pg_donor, 0.025)),
           linetype="dashed", linewidth=0.25) +
  geom_vline(aes(xintercept=quantile(pg_donor, 0.975)),
           linetype="dashed", linewidth=0.25) +
  geom_vline(aes(xintercept=quantile(pg_donor, 0.01)),
             color="green", linetype="dashed", linewidth=0.25) +
  geom_vline(aes(xintercept=quantile(pg_donor, 0.99)),
             color="green", linetype="dashed", linewidth=0.25) +
  geom_vline(aes(xintercept=(median(pg_donor)+sd(pg_donor))),
             color="red", linewidth=0.5) +
  geom_vline(aes(xintercept=(median(pg_donor)-sd(pg_donor))),
             color="red", linewidth=0.5) +
  scale_x_continuous(breaks=c(min(df$pg_donor), 
                              quantile(df$pg_donor, 0.025),
                              quantile(df$pg_donor, 0.005),
                              median(df$pg_donor)-sd(df$pg_donor),
                              median(df$pg_donor),
                              median(df$pg_donor)+sd(df$pg_donor),
                              quantile(df$pg_donor, 0.975),
                              quantile(df$pg_donor, 0.995),
                              max(df$pg_donor)),
                     label = c(min(df$pg_donor),
                               floor(quantile(df$pg_donor, 0.025)),
                               floor(quantile(df$pg_donor, 0.005)),
                               round(median(df$pg_donor)-sd(df$pg_donor)),
                               median(df$pg_donor),
                               round(median(df$pg_donor)+sd(df$pg_donor)),
                               floor(quantile(df$pg_donor, 0.975))+1,
                               round(quantile(df$pg_donor, 0.995))+1,
                               max(df$pg_donor))) +
  theme(axis.text.x = element_text(angle = 45))

ggsave(filename = "Donor_curve.png",
       device = "png",
       scale = 1,
       width = 1920,
       height = 1080,
       units = "px",
       dpi = 320)

ggplot(df, aes(x=pg_acceptor)) +
  geom_histogram(aes(y=after_stat(density)),color="red", fill="darksalmon") +
  geom_density(alpha=.2, fill="darksalmon") +
  geom_vline(aes(xintercept=median(pg_acceptor)),
             color="red", linetype="dashed", linewidth=1) +
  geom_vline(aes(xintercept=quantile(pg_acceptor, 0.025)),
           linetype="dashed", linewidth=0.25) +
  geom_vline(aes(xintercept=quantile(pg_acceptor, 0.975)),
           linetype="dashed", linewidth=0.25) +
  geom_vline(aes(xintercept=quantile(pg_acceptor, 0.005)),
             color="green", linetype="dashed", linewidth=0.25) +
  geom_vline(aes(xintercept=quantile(pg_acceptor, 0.995)),
             color="green", linetype="dashed", linewidth=0.25) +
  geom_vline(aes(xintercept=(median(pg_acceptor)+sd(pg_acceptor))),
             color="blue", linewidth=0.5) +
  geom_vline(aes(xintercept=(median(pg_acceptor)-sd(pg_acceptor))),
             color="blue", linewidth=0.5) +
  scale_x_continuous(breaks=c(min(df$pg_acceptor), 
                              quantile(df$pg_acceptor, 0.025),
                              quantile(df$pg_acceptor, 0.005),
                              median(df$pg_acceptor)-sd(df$pg_acceptor),
                              median(df$pg_acceptor),
                              median(df$pg_acceptor)+sd(df$pg_acceptor),
                              quantile(df$pg_acceptor, 0.975),
                              quantile(df$pg_acceptor, 0.995),
                              max(df$pg_acceptor)),
                       label = c(min(df$pg_acceptor),
                                 floor(quantile(df$pg_acceptor, 0.025)),
                                 floor(quantile(df$pg_acceptor, 0.005)),
                                 round(median(df$pg_acceptor)-sd(df$pg_acceptor)),
                                 median(df$pg_acceptor),
                                 round(median(df$pg_acceptor)+sd(df$pg_acceptor)),
                                 floor(quantile(df$pg_acceptor, 0.975))+1,
                                 floor(quantile(df$pg_acceptor, 0.995))+1,
                                 max(df$pg_acceptor))) +
  theme(axis.text.x = element_text(angle = 45))

ggsave(filename = "Acceptor_curve.png",
       device = "png",
       scale = 1,
       width = 1920,
       height = 1080,
       units = "px",
       dpi = 320)

# Graphs and tests:

# sink("test_shapiro.txt")
# print("Normality test: Shapiro-Wilk")
# print("Donor site:")
# print(shapiro.test(df$pg_donor))
# print("Acceptor site:")
# print(shapiro.test(df$pg_acceptor))
# sink()