Skip to content
Snippets Groups Projects
Junctions_NanoSplicer.R 9.73 KiB
Newer Older
aliarifki's avatar
aliarifki committed
#!/bin/Rscript

################################################################################
### NEED TO ADD A CASE OF NO SPLICED-VARIANTS ARE IDENTIFIED !!!!!!!!!!!!!!! ###
################################################################################
library(ggplot2, quietly = TRUE)
library(tidyr, quietly = TRUE)
library(dplyr, quietly = TRUE)
library(stringr, quietly = TRUE)
library(optparse)

# Load classification per promoter:
option_list = list(
aliarifki's avatar
aliarifki committed
  make_option(c("-c", "--classification"), type="character", default="./classification.txt", 
aliarifki's avatar
aliarifki committed
              help="input classification or reads file (.txt)", metavar="character"),
  make_option(c("-j", "--jwr"), type="character", default=NULL, 
              help="input nanosplicer results table (.csv)", metavar="character"),
  make_option(c("-b", "--barcode"), type="character", default=NULL, 
              help="input barcode", metavar="character"))
aliarifki's avatar
aliarifki committed
opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)
aliarifki's avatar
aliarifki committed
reads_pos <- read.table(opt$classification,
aliarifki's avatar
aliarifki committed
                        sep = "\t")
colnames(reads_pos) <- c("id", reads_pos[1,2:length(reads_pos[1,])])
reads_pos <- reads_pos[2:length(reads_pos$id),]

# Load Nanosplicer results:
aliarifki's avatar
aliarifki committed
df <- read.csv(opt$jwr)
aliarifki's avatar
aliarifki committed
colnames(df)[1] <- "juncNumber"

# split donor and acceptor positions:
df <- df %>%
  separate(col = "loc",
           into = c("donor", "acceptor"), sep = ", ",
           extra = "warn")

df$donor <- str_replace(df$donor, '[(]', '')
df$acceptor <- str_replace(df$acceptor, '[)]', '')

aliarifki's avatar
aliarifki committed
df <- dplyr::mutate(df, 
aliarifki's avatar
aliarifki committed
             pg_donor = as.numeric(donor)-122,
             pg_acceptor = as.numeric(acceptor)-122)

df <- merge(df, reads_pos, by = 'id')

assignation_donor <- function(pg_donor) {
  if (pg_donor >= 620) {
    if (pg_donor < 643 & pg_donor >= 620) {
      donor_site <- "J2450_"
    }
    else if (pg_donor <= 675 & pg_donor >= 643) {
      donor_site <- "J2474_"
    }
    else if (pg_donor <= 1182 & pg_donor >= 1162) {
      donor_site <- "J2988_"
    }
    else if (pg_donor <= 1835 & pg_donor >= 1815) {
      donor_site <- "J461_"
    }
    else {
      donor_site <- paste0("J", pg_donor, "new_")
    }
  }
  else if (pg_donor <= 284 & pg_donor >= 264) {
    donor_site <- "J2090_"
  }
  else if (pg_donor <= 261 & pg_donor >= 240) {
    donor_site <- "J2070_"
  }
  else {
    donor_site <- paste0("J", pg_donor, "new_")
  }
  return(donor_site)
}

assignation_acceptor <- function(pg_acceptor) {
  if (pg_acceptor > 1868) {
    if (pg_acceptor >= 2452 & pg_acceptor <= 2468) {
      acceptor_site <- "3170"
    }
    else if (pg_acceptor >= 2661 & pg_acceptor <= 2685) {
      acceptor_site <- "1306"
    }
    else if (pg_acceptor >= 2743 & pg_acceptor <= 2762) {
      acceptor_site <- "1386"
    }
    else {
      acceptor_site <- paste0(pg_acceptor,"new")
    }
  }
  else if (pg_acceptor >= 1843 & pg_acceptor <= 1868) {
    acceptor_site <- "490"
  }
  else if (pg_acceptor >= 1639 & pg_acceptor <= 1668) {
    acceptor_site <- "283"
  }
  else if (pg_acceptor >= 1076 & pg_acceptor <= 1098) {
    acceptor_site <- "2903"
  }
  else if (pg_acceptor >= 525 & pg_acceptor <= 547) {
    acceptor_site <- "2351"
  }
  else if (pg_acceptor >= 410 & pg_acceptor <= 432) {
    acceptor_site <- "2237"
  }
  else {
    acceptor_site <- paste0(pg_acceptor,"new")
  }
  return(acceptor_site)
}

df$donor_site <- sapply(df$pg_donor, assignation_donor)
df$acceptor_site <- sapply(df$pg_acceptor, assignation_acceptor)
aliarifki's avatar
aliarifki committed
df <- dplyr::mutate(df,
aliarifki's avatar
aliarifki committed
             junction = paste0(donor_site, acceptor_site))

write.table(df, file = paste0(opt$barcode, "_JWR_check_parsed.csv"), row.names = FALSE, sep = "\t")
aliarifki's avatar
aliarifki committed

duplicated2 <- function(x){
  if (sum(dup <- duplicated(x))==0)
    return(dup)
  if (class(x) %in% c("data.frame","matrix"))
    duplicated(rbind(x[dup,],x))[-(1:sum(dup))]
  else duplicated(c(x[dup],x))[-(1:sum(dup))]
}

list_read_multiple <- unique(df[duplicated2(df$id),]$id)
multiple_junction <- df[duplicated2(df$id),]
single_junction <- df %>% dplyr::filter(!(id %in% list_read_multiple))

SP_assignation_single <- function(site, promoter) {
  if (promoter == "pgRNA" | promoter == "preCore") {
    if (str_detect(site, "new")) { SP_name <- "new_SP" }
    else if (site == "J2450_490") { SP_name <- "SP01" }
    else if (site == "J2070_490") { SP_name <- "SP03" }
    else if (site == "J2090_490") { SP_name <- "SP05" }
    else if (site == "J2474_490") { SP_name <- "SP06" }
    else if (site == "J2450_283") { SP_name <- "SP09" }
    else if (site == "J2474_283") { SP_name <- "SP11" }
    else if (site == "J2988_490") { SP_name <- "SP13" }
    else if (site == "J2450_2903") { SP_name <- "SP14" }
    else if (site == "J2070_283") { SP_name <- "SP17" }
    else if (site == "J2988_3170") { SP_name <- "SP19" }
    else if (site == "J2988_283") { SP_name <- "SP20" }
    else { SP_name <- "new_comb_pg" }
  }
  else if (promoter == "preS1" | promoter == "preS2/S") {
    if (str_detect(site, "new")) { SP_name <- "new_SP" }
    else if (site == "J461_1306") { SP_name <- "SP21" }
    else if (site == "J461_1386") { SP_name <- "SP22" }
    else { SP_name <- "new_comb_HBs" }
  }
  else if (promoter == "HBx") {
    if (str_detect(site, "new")) { SP_name <- "new_SP" }
    else { SP_name <- "new_comb_HBx" }
  }
  else if (promoter == "Undefined") {
    if (str_detect(site, "new")) { SP_name <- "new_SP" }
    else { SP_name <- "new_comb_UnC" }
  }
  else { 
    if (str_detect(site, "new")) { SP_name <- "new_SP" }
    else { SP_name <- "error" }
  }
}

#dplyr version:
single_junction <- single_junction %>% dplyr::group_by(id) %>%
  dplyr::mutate(SP_name = SP_assignation_single(junction, promoter))

#plyr version
# single_junction <- ddply(single_junction,
#                          .(id), 
#                          mutate,
#                          SP_name = SP_assignation_single(junction, promoter))
aliarifki's avatar
aliarifki committed

SP_assignation_multiple <- function(read_id, combinaison, promoter) {
  if (promoter == "pgRNA" | promoter == "preCore") {
    
    # New junctions detection:
    if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
      SP_name <- "new_SP"
    }
    
    #SP02, SP08, SP10, SP16:
    else if ("J2070_2351" %in% combinaison) {
      #SP02:
      if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP02"
      }
      
      #SP08:
      else if ("J2450_2903" %in% combinaison & "J2988_490" %in% combinaison &
               length(combinaison) == 3) {
        SP_name <- "SP08"
      }
      
      #SP10:
      else if ("J2450_283" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP10"
      }
      
      #SP16:
      else if ("J2474_490" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP16"
      }
      
      #If there is another combination of already described junction associated to "J2070_2351":
      else {SP_name <- "new_comb_pg"}
    }
    
    #SP12, SP15:
    else if ("J2070_2237" %in% combinaison) {
      #SP12:
      if ("J2450_283" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP12"
      }
      else if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP15"
      }
      #If there is another combination of already described junction associated to "J2070_2237":
      else {SP_name <- "new_comb_pg"}
    }
    
    #SP04:
    else if ("J2090_2351" %in% combinaison) {
      if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP04"
      }
      #If there is another combination of already described junction associated to "J2090_2351":
      else {SP_name <- "new_comb_pg"}
    }
    
    #SP07, SP18:
    else if ("J2988_490" %in% combinaison) {
      #SP07:
      if ("J2450_2903" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP07"
      }
      #SP18:
      else if ("J2474_2903" %in% combinaison & length(combinaison) == 2) {
        SP_name <- "SP18"
      }
      #If there is another combination of already described junction associated to "J2988_490":
      else {SP_name <- "new_comb_pg"}
    }
    
    #New combinaisons of multiple already described junctions:
    else {SP_name <- "new_comb_pg"}
  }
  
  #SP from preS1, preS2/2:
  else if (promoter == "preS1" | promoter == "preS2/S") {
    if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
      SP_name <- "new_SP_HBs"
    }
    else { SP_name <- "new_comb_HBs"}
  }
  
  #SP from HBx:
  else if (promoter == "HBx") {
    if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
        SP_name <- "new_SP_HBx"
      }
    else { SP_name <- "new_comb_HBx"}
  }
  
  #SP from Undefined:
  else if (promoter == "Undefined") {
    if (str_detect(str_c(combinaison, collapse = "_"), "new")) { SP_name <- "new_SP" }
    else { SP_name <- "new_comb_UnC" }
  }
  
  #Error case:
  else {
  SP_name <- "error"
  }
return(SP_name)
}

tmp <- multiple_junction %>% select(id, junction, promoter)
df_combinaison <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(df_combinaison) <- c("id", "SP_name")

for (read_id in list_read_multiple) {
  SP_name_computed <- SP_assignation_multiple(read_id, tmp[tmp$id == read_id,]$junction, tmp[tmp$id == read_id,]$promoter[1])
  res_vector <- data.frame(t(c(read_id, SP_name_computed)))
  colnames(res_vector) <- colnames(df_combinaison)
  df_combinaison <- rbind(df_combinaison, res_vector)
}

# df_combinaison <- df_combinaison[2:length(df_combinaison$id),]

multiple_junction <- merge(multiple_junction, df_combinaison, by="id")

df_SPvariants <- rbind(as.data.frame(single_junction), multiple_junction)
aliarifki's avatar
aliarifki committed

SP_variant_unique <- df_SPvariants %>% select(id, SP_name)
SP_variant_unique <- SP_variant_unique[!duplicated(SP_variant_unique$id),] # distinct(SP_variant_unique, id)

write.table(df_SPvariants, 
            paste0(opt$barcode, "_identified_SPvariants.csv"), 
aliarifki's avatar
aliarifki committed
            row.names = FALSE, 
            sep = "\t", 
            quote = FALSE)