Junctions_NanoSplicer.R 14.09 KiB
#!/bin/Rscript
################################################################################
### NEED TO ADD A CASE OF NO SPLICED-VARIANTS ARE IDENTIFIED !!!!!!!!!!!!!!! ###
################################################################################
library(ggplot2, quietly = TRUE)
library(tidyr, quietly = TRUE)
library(plyr, quietly = TRUE)
library(dplyr, quietly = TRUE)
library(stringr, quietly = TRUE)
library(optparse)
# Load classification per promoter:
option_list = list(
make_option(c("-c", "--classification"), type="character", default="./classification.txt",
help="input classification or reads file (.txt)", metavar="character"),
make_option(c("-j", "--jwr"), type="character", default=NULL,
help="input nanosplicer results table (.csv)", metavar="character"))
opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)
reads_pos <- read.table(opt$classification,
sep = "\t")
colnames(reads_pos) <- c("id", reads_pos[1,2:length(reads_pos[1,])])
reads_pos <- reads_pos[2:length(reads_pos$id),]
# Load Nanosplicer results:
df <- read.csv(opt$jwr)
colnames(df)[1] <- "juncNumber"
# split donor and acceptor positions:
df <- df %>%
separate(col = "loc",
into = c("donor", "acceptor"), sep = ", ",
extra = "warn")
df$donor <- str_replace(df$donor, '[(]', '')
df$acceptor <- str_replace(df$acceptor, '[)]', '')
df <- mutate(df,
pg_donor = as.numeric(donor)-122,
pg_acceptor = as.numeric(acceptor)-122)
df <- merge(df, reads_pos, by = 'id')
assignation_donor <- function(pg_donor) {
if (pg_donor >= 620) {
if (pg_donor < 643 & pg_donor >= 620) {
donor_site <- "J2450_"
}
else if (pg_donor <= 675 & pg_donor >= 643) {
donor_site <- "J2474_"
}
else if (pg_donor <= 1182 & pg_donor >= 1162) {
donor_site <- "J2988_"
}
else if (pg_donor <= 1835 & pg_donor >= 1815) {
donor_site <- "J461_"
}
else {
donor_site <- paste0("J", pg_donor, "new_")
}
}
else if (pg_donor <= 284 & pg_donor >= 264) {
donor_site <- "J2090_"
}
else if (pg_donor <= 261 & pg_donor >= 240) {
donor_site <- "J2070_"
}
else {
donor_site <- paste0("J", pg_donor, "new_")
}
return(donor_site)
}
assignation_acceptor <- function(pg_acceptor) {
if (pg_acceptor > 1868) {
if (pg_acceptor >= 2452 & pg_acceptor <= 2468) {
acceptor_site <- "3170"
}
else if (pg_acceptor >= 2661 & pg_acceptor <= 2685) {
acceptor_site <- "1306"
}
else if (pg_acceptor >= 2743 & pg_acceptor <= 2762) {
acceptor_site <- "1386"
}
else {
acceptor_site <- paste0(pg_acceptor,"new")
}
}
else if (pg_acceptor >= 1843 & pg_acceptor <= 1868) {
acceptor_site <- "490"
}
else if (pg_acceptor >= 1639 & pg_acceptor <= 1668) {
acceptor_site <- "283"
}
else if (pg_acceptor >= 1076 & pg_acceptor <= 1098) {
acceptor_site <- "2903"
}
else if (pg_acceptor >= 525 & pg_acceptor <= 547) {
acceptor_site <- "2351"
}
else if (pg_acceptor >= 410 & pg_acceptor <= 432) {
acceptor_site <- "2237"
}
else {
acceptor_site <- paste0(pg_acceptor,"new")
}
return(acceptor_site)
}
df$donor_site <- sapply(df$pg_donor, assignation_donor)
df$acceptor_site <- sapply(df$pg_acceptor, assignation_acceptor)
df <- mutate(df,
junction = paste0(donor_site, acceptor_site))
write.table(df, file = "JWR_check_parsed.csv", row.names = FALSE, sep = "\t")
duplicated2 <- function(x){
if (sum(dup <- duplicated(x))==0)
return(dup)
if (class(x) %in% c("data.frame","matrix"))
duplicated(rbind(x[dup,],x))[-(1:sum(dup))]
else duplicated(c(x[dup],x))[-(1:sum(dup))]
}
list_read_multiple <- unique(df[duplicated2(df$id),]$id)
multiple_junction <- df[duplicated2(df$id),]
single_junction <- df %>% dplyr::filter(!(id %in% list_read_multiple))
SP_assignation_single <- function(site, promoter) {
if (promoter == "pgRNA" | promoter == "preCore") {
if (str_detect(site, "new")) { SP_name <- "new_SP" }
else if (site == "J2450_490") { SP_name <- "SP01" }
else if (site == "J2070_490") { SP_name <- "SP03" }
else if (site == "J2090_490") { SP_name <- "SP05" }
else if (site == "J2474_490") { SP_name <- "SP06" }
else if (site == "J2450_283") { SP_name <- "SP09" }
else if (site == "J2474_283") { SP_name <- "SP11" }
else if (site == "J2988_490") { SP_name <- "SP13" }
else if (site == "J2450_2903") { SP_name <- "SP14" }
else if (site == "J2070_283") { SP_name <- "SP17" }
else if (site == "J2988_3170") { SP_name <- "SP19" }
else if (site == "J2988_283") { SP_name <- "SP20" }
else { SP_name <- "new_comb_pg" }
}
else if (promoter == "preS1" | promoter == "preS2/S") {
if (str_detect(site, "new")) { SP_name <- "new_SP" }
else if (site == "J461_1306") { SP_name <- "SP21" }
else if (site == "J461_1386") { SP_name <- "SP22" }
else { SP_name <- "new_comb_HBs" }
}
else if (promoter == "HBx") {
if (str_detect(site, "new")) { SP_name <- "new_SP" }
else { SP_name <- "new_comb_HBx" }
}
else if (promoter == "Undefined") {
if (str_detect(site, "new")) { SP_name <- "new_SP" }
else { SP_name <- "new_comb_UnC" }
}
else {
if (str_detect(site, "new")) { SP_name <- "new_SP" }
else { SP_name <- "error" }
}
}
single_junction <- ddply(single_junction,
.(id),
mutate,
SP_name = SP_assignation_single(junction, promoter))
SP_assignation_multiple <- function(read_id, combinaison, promoter) {
if (promoter == "pgRNA" | promoter == "preCore") {
# New junctions detection:
if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
SP_name <- "new_SP"
}
#SP02, SP08, SP10, SP16:
else if ("J2070_2351" %in% combinaison) {
#SP02:
if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
SP_name <- "SP02"
}
#SP08:
else if ("J2450_2903" %in% combinaison & "J2988_490" %in% combinaison &
length(combinaison) == 3) {
SP_name <- "SP08"
}
#SP10:
else if ("J2450_283" %in% combinaison & length(combinaison) == 2) {
SP_name <- "SP10"
}
#SP16:
else if ("J2474_490" %in% combinaison & length(combinaison) == 2) {
SP_name <- "SP16"
}
#If there is another combination of already described junction associated to "J2070_2351":
else {SP_name <- "new_comb_pg"}
}
#SP12, SP15:
else if ("J2070_2237" %in% combinaison) {
#SP12:
if ("J2450_283" %in% combinaison & length(combinaison) == 2) {
SP_name <- "SP12"
}
else if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
SP_name <- "SP15"
}
#If there is another combination of already described junction associated to "J2070_2237":
else {SP_name <- "new_comb_pg"}
}
#SP04:
else if ("J2090_2351" %in% combinaison) {
if ("J2450_490" %in% combinaison & length(combinaison) == 2) {
SP_name <- "SP04"
}
#If there is another combination of already described junction associated to "J2090_2351":
else {SP_name <- "new_comb_pg"}
}
#SP07, SP18:
else if ("J2988_490" %in% combinaison) {
#SP07:
if ("J2450_2903" %in% combinaison & length(combinaison) == 2) {
SP_name <- "SP07"
}
#SP18:
else if ("J2474_2903" %in% combinaison & length(combinaison) == 2) {
SP_name <- "SP18"
}
#If there is another combination of already described junction associated to "J2988_490":
else {SP_name <- "new_comb_pg"}
}
#New combinaisons of multiple already described junctions:
else {SP_name <- "new_comb_pg"}
}
#SP from preS1, preS2/2:
else if (promoter == "preS1" | promoter == "preS2/S") {
if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
SP_name <- "new_SP_HBs"
}
else { SP_name <- "new_comb_HBs"}
}
#SP from HBx:
else if (promoter == "HBx") {
if (str_detect(str_c(combinaison, collapse = "_"), "new")) {
SP_name <- "new_SP_HBx"
}
else { SP_name <- "new_comb_HBx"}
}
#SP from Undefined:
else if (promoter == "Undefined") {
if (str_detect(str_c(combinaison, collapse = "_"), "new")) { SP_name <- "new_SP" }
else { SP_name <- "new_comb_UnC" }
}
#Error case:
else {
SP_name <- "error"
}
return(SP_name)
}
tmp <- multiple_junction %>% select(id, junction, promoter)
df_combinaison <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(df_combinaison) <- c("id", "SP_name")
for (read_id in list_read_multiple) {
SP_name_computed <- SP_assignation_multiple(read_id, tmp[tmp$id == read_id,]$junction, tmp[tmp$id == read_id,]$promoter[1])
res_vector <- data.frame(t(c(read_id, SP_name_computed)))
colnames(res_vector) <- colnames(df_combinaison)
df_combinaison <- rbind(df_combinaison, res_vector)
}
# df_combinaison <- df_combinaison[2:length(df_combinaison$id),]
multiple_junction <- merge(multiple_junction, df_combinaison, by="id")
df_SPvariants <- rbind(single_junction, multiple_junction)
SP_variant_unique <- df_SPvariants %>% select(id, SP_name)
SP_variant_unique <- SP_variant_unique[!duplicated(SP_variant_unique$id),] # distinct(SP_variant_unique, id)
write.table(df_SPvariants,
"identified_SPvariants.csv",
row.names = FALSE,
sep = "\t",
quote = FALSE)
ggplot(df, aes(x=pg_donor)) +
geom_histogram(aes(y=after_stat(density)),color="darkblue", fill="lightblue") +
geom_density(alpha=.2, fill="lightblue") +
geom_vline(aes(xintercept=median(pg_donor)),
color="blue", linetype="dashed", linewidth=1) +
geom_vline(aes(xintercept=quantile(pg_donor, 0.025)),
linetype="dashed", linewidth=0.25) +
geom_vline(aes(xintercept=quantile(pg_donor, 0.975)),
linetype="dashed", linewidth=0.25) +
geom_vline(aes(xintercept=quantile(pg_donor, 0.01)),
color="green", linetype="dashed", linewidth=0.25) +
geom_vline(aes(xintercept=quantile(pg_donor, 0.99)),
color="green", linetype="dashed", linewidth=0.25) +
geom_vline(aes(xintercept=(median(pg_donor)+sd(pg_donor))),
color="red", linewidth=0.5) +
geom_vline(aes(xintercept=(median(pg_donor)-sd(pg_donor))),
color="red", linewidth=0.5) +
scale_x_continuous(breaks=c(min(df$pg_donor),
quantile(df$pg_donor, 0.025),
quantile(df$pg_donor, 0.005),
median(df$pg_donor)-sd(df$pg_donor),
median(df$pg_donor),
median(df$pg_donor)+sd(df$pg_donor),
quantile(df$pg_donor, 0.975),
quantile(df$pg_donor, 0.995),
max(df$pg_donor)),
label = c(min(df$pg_donor),
floor(quantile(df$pg_donor, 0.025)),
floor(quantile(df$pg_donor, 0.005)),
round(median(df$pg_donor)-sd(df$pg_donor)),
median(df$pg_donor),
round(median(df$pg_donor)+sd(df$pg_donor)),
floor(quantile(df$pg_donor, 0.975))+1,
round(quantile(df$pg_donor, 0.995))+1,
max(df$pg_donor))) +
theme(axis.text.x = element_text(angle = 45))
ggsave(filename = "Donor_curve.png",
device = "png",
scale = 1,
width = 1920,
height = 1080,
units = "px",
dpi = 320)
ggplot(df, aes(x=pg_acceptor)) +
geom_histogram(aes(y=after_stat(density)),color="red", fill="darksalmon") +
geom_density(alpha=.2, fill="darksalmon") +
geom_vline(aes(xintercept=median(pg_acceptor)),
color="red", linetype="dashed", linewidth=1) +
geom_vline(aes(xintercept=quantile(pg_acceptor, 0.025)),
linetype="dashed", linewidth=0.25) +
geom_vline(aes(xintercept=quantile(pg_acceptor, 0.975)),
linetype="dashed", linewidth=0.25) +
geom_vline(aes(xintercept=quantile(pg_acceptor, 0.005)),
color="green", linetype="dashed", linewidth=0.25) +
geom_vline(aes(xintercept=quantile(pg_acceptor, 0.995)),
color="green", linetype="dashed", linewidth=0.25) +
geom_vline(aes(xintercept=(median(pg_acceptor)+sd(pg_acceptor))),
color="blue", linewidth=0.5) +
geom_vline(aes(xintercept=(median(pg_acceptor)-sd(pg_acceptor))),
color="blue", linewidth=0.5) +
scale_x_continuous(breaks=c(min(df$pg_acceptor),
quantile(df$pg_acceptor, 0.025),
quantile(df$pg_acceptor, 0.005),
median(df$pg_acceptor)-sd(df$pg_acceptor),
median(df$pg_acceptor),
median(df$pg_acceptor)+sd(df$pg_acceptor),
quantile(df$pg_acceptor, 0.975),
quantile(df$pg_acceptor, 0.995),
max(df$pg_acceptor)),
label = c(min(df$pg_acceptor),
floor(quantile(df$pg_acceptor, 0.025)),
floor(quantile(df$pg_acceptor, 0.005)),
round(median(df$pg_acceptor)-sd(df$pg_acceptor)),
median(df$pg_acceptor),
round(median(df$pg_acceptor)+sd(df$pg_acceptor)),
floor(quantile(df$pg_acceptor, 0.975))+1,
floor(quantile(df$pg_acceptor, 0.995))+1,
max(df$pg_acceptor))) +
theme(axis.text.x = element_text(angle = 45))
ggsave(filename = "Acceptor_curve.png",
device = "png",
scale = 1,
width = 1920,
height = 1080,
units = "px",
dpi = 320)
# Graphs and tests:
# sink("test_shapiro.txt")
# print("Normality test: Shapiro-Wilk")
# print("Donor site:")
# print(shapiro.test(df$pg_donor))
# print("Acceptor site:")
# print(shapiro.test(df$pg_acceptor))
# sink()