Newer
Older
library(ggplot2, quietly = TRUE)
library(tidyr, quietly = TRUE)
library(plyr, quietly = TRUE)
library(dplyr, quietly = TRUE)
library(stringr, quietly = TRUE)
library(RColorBrewer)
library(optparse)
# Load files
option_list = list(
make_option(c("-s", "--SPvariants"), type="character", default=NULL,
help="input identified SP variants table (.csv)", metavar="character"),
make_option(c("-c", "--classification"), type="character", default=NULL,
help="input classification of reads file (.txt)", metavar="character"),
make_option(c("-b", "--barcode"), type="character", default=NULL,
help="input barcode", metavar="character"))
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)
# vectors of species & Palettes Colors:
SPvariants <- c("SP01", "SP02", "SP03", "SP04", "SP05", "SP06", "SP07", "SP08",
"SP09", "SP10", "SP11", "SP12", "SP13", "SP14", "SP15", "SP16",
"SP17", "SP18", "SP19", "SP20", "SP21", "SP22")
SPvariants <- factor(SPvariants, levels = SPvariants)
colors_SP1_22 <- c("#FADD4E", "#FBE0C3", "#C8FF94", "#9ED7C3", "#CCEDE3",
"#A9E5F0", "#A9C8E6", "#BBC7DF", "#D8C8EB", "#FFBADC",
"#E2C9B5", "#E2B5AB", "#CAC4C5", "#D7D9DF", "#F9B7BA",
"#99DDF9", "#ABA7D1", "#B26572", "#FB9651", "#4F7E7E",
"#4FBCBA", "#608AD2")
palette_SP1_22 <- data.frame(nom = SPvariants, teinte = colors_SP1_22,
stringsAsFactors = FALSE)
TSS_species <- c("preCore", "pgRNA", "preS1", "preS2/S", "HBx")
TSS_species <- factor(TSS_species, levels = TSS_species)
colors_RNAs_species <- c("#712E80", "#006695", "#3B9746", "#1F4F25", "#F5751A")
palette_TSS <- data.frame(nom = TSS_species, teinte = colors_RNAs_species,
stringsAsFactors = FALSE)
new_SP_candidates <- c("new_comb_pg", "new_comb_HBs", "new_comb_HBx",
"new_comb_UnC", "new_SP", "new_SP_HBs", "Undefined")
new_SP_candidates <- factor(new_SP_candidates, levels = new_SP_candidates)
colors_newSP <- c("#AC0E0D", "#CB674F", "#AD565C", "#923222", "#C78A77",
"#A92322", "#964B34")
palette_new_SP <- data.frame(nom = new_SP_candidates, teinte = colors_newSP,
stringsAsFactors = FALSE)
SPxx <- c("SPvariants")
SPxx <- factor(SPxx, levels = SPxx)
color_SPxx <- c("#33C5FF")
palette_SPxx <- data.frame(nom = SPxx,
teinte = color_SPxx,
stringsAsFactors = FALSE)
all_species_name <- c(TSS_species, SPvariants, new_SP_candidates, SPxx)
palette_complete <- rbind.data.frame(palette_TSS,
palette_SP1_22,
palette_new_SP,
palette_SPxx,
stringsAsFactors = FALSE)
# Load Start_positions_count files:
header = TRUE)
clean_SP <- identified_SP[!duplicated(identified_SP$id),] %>%
select(id, mapQ, transcript_strand, JAQ, barcode, promoter, junction, SP_name)
countSP <- clean_SP %>% count(SP_name)
countSP <- dplyr::inner_join(palette_complete,
countSP,
by = c("nom" = "SP_name"))
#print(names(palette_complete))
#print(names(countSP))
countSP$nom <- factor(countSP$nom, levels = all_species_name)
proportion = (as.numeric(n)/sum(as.numeric(n))*100))
#print(countSP)
ggplot(countSP, aes(x = "percent",
y = proportion,
fill = nom)) +
geom_col() +
coord_polar("y") +
scale_fill_manual(values = countSP$teinte) +
labs(fill = "spliced-variants")
ggsave(file = paste0(opt$barcode, "_SP_proportion_piechart.png"),
scale = 2,
width = 1920,
height = 1080,
units = "px",
dpi = 300)
ggplot(countSP, aes(x = nom, y = proportion, fill = nom)) +
geom_col() +
scale_fill_manual(values = countSP$teinte) +
theme(axis.text.x = element_text(angle = 45)) +
labs(fill = "spliced-variants") +
xlab(label = "spliced-variants") +
ylab(label = "percent")
ggsave(file = paste0(opt$barcode, "_SP_proportion.png"),
scale = 2,
width = 1920,
height = 1080,
units = "px",
dpi = 300)
# TSS not spliced:
classified_reads <- read.table(file = opt$classification,
header = TRUE)
not_spliced <- classified_reads[!(classified_reads$read_ID %in% clean_SP$id),]
species = not_spliced$promoter)
#print(not_spliced)
not_spliced <- not_spliced %>% select(read_ID, species)
colnames(not_spliced) <- c("id", "species")
clean_SP_type <- clean_SP %>% select(id, SP_name)
colnames(clean_SP_type) <- c("id", "species")
df_species <- rbind.data.frame(not_spliced, clean_SP_type,
stringsAsFactors = FALSE)
count_species <- df_species %>% count(species)
percent = (as.numeric(n)/sum(as.numeric(n))*100))
#print(count_species)
write.table(df_species, file = paste0(opt$barcode, "_all_reads_identified.csv"),
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
sep = "\t", quote = FALSE, row.names = FALSE)
# Null dataset:
species <- c(TSS_species, SPvariants, new_SP_candidates)
canonical_species <- TSS_species[c(1:3,5)]
null_count <- data.frame(species = species,
n = rep(0, times = length(species)),
percent = rep(0, times = length(species)),
stringsAsFactors = FALSE)
null_count <- transform(null_count,
n = as.integer(n),
percent = as.numeric(percent))
# Join:
count_species <- rbind.data.frame(count_species,
subset(null_count,
!(species %in% count_species$species)),
stringsAsFactors = FALSE)
# Merge all SP variants:
count_species_SPxx <- data.frame(species = "SPvariants",
n = sum(count_species[count_species$species
%in% SPvariants,]$n))
count_species_SPxx <- rbind.data.frame(count_species_SPxx,
count_species[count_species$species
%in% c(canonical_species,
SPvariants), 1:2],
stringsAsFactors = FALSE)
count_species_SPxx <- count_species_SPxx[count_species_SPxx$species %in%
all_species_name[c(1:3,5,35)],]
count_species_SPxx <- dplyr::mutate(count_species_SPxx,
percent=(as.numeric(n)/sum(as.numeric(n))*100))
#print(count_species_SPxx)
# save the tab:
write.csv(count_species_SPxx, file = paste0(opt$barcode, "_count_canonical_species_SPxx.csv"))
# prepare to plot:
count_species_SPxx <- dplyr::inner_join(palette_complete,
count_species_SPxx,
by = c("nom" = "species"))
#names(palette_complete)
#names(count_species_SPxx)
count_species_SPxx$nom <- factor(count_species_SPxx$nom, levels = all_species_name)
# Save:
write.csv(count_species, file = paste0(opt$barcode, "_count_species.csv"))
# RNA species composition all species:
count_species <- dplyr::inner_join(palette_complete, count_species,
by = c("nom" = "species"),
suffix = c("",""))
count_species$nom <- factor(count_species$nom, levels = all_species_name)
ggplot(count_species,
aes(x = nom,
y = percent,
fill = nom)) +
geom_col() +
scale_fill_manual(values = count_species$teinte) +
theme(axis.text.x = element_text(angle = 45)) +
labs(fill = "RNA species & spliced-variants") +
xlab(label = "RNA species & spliced-variants")
ggsave(file = paste0(opt$barcode, "_count_RNAs_species.png"),
scale = 2,
width = 1920,
height = 1080,
units = "px",
dpi = 300)
# Filter RNA species canonical + SPvariants:
count_species_clear <- count_species[count_species$species %in% c(canonical_species, SPvariants),]
# RNA species composition all species:
count_species_clear <- dplyr::inner_join(palette_complete, count_species_clear,
by = c("nom" = "nom"),
suffix = c("",""))
count_species_clear$nom <- factor(count_species_clear$nom, levels = all_species_name)
ggplot(count_species_clear,
aes(x = nom,
y = percent,
fill = nom)) +
geom_col() +
scale_fill_manual(values = count_species_clear$teinte) +
theme(axis.text.x = element_text(angle = 45)) +
labs(fill = "RNA species & spliced-variants") +
xlab(label = "RNA species & spliced-variants")
ggsave(file = paste0(opt$barcode, "_count_RNAs_species_clear.png"),
scale = 2,
width = 1920,
height = 1080,
units = "px",
dpi = 300)
# SP composition clear:
count_clear <- clean_SP[clean_SP$SP_name %in% SPvariants,] %>% count(SP_name)
proportion=(as.numeric(n)/sum(as.numeric(n))*100))
#print(count_clear)
count_clear <- dplyr::inner_join(palette_complete,
count_clear,
by = c("nom" = "SP_name"))
#names(count_clear)
count_clear$nom <- factor(count_clear$nom, levels = all_species_name)
if(nrow(count_clear)!=0){
ggplot(count_clear, aes(x = "percent",
y = proportion,
fill = nom)) +
geom_col() +
coord_polar("y") +
scale_fill_manual(values = count_clear$teinte) +
labs(fill = "spliced-variants")
ggsave(file = paste0(opt$barcode, "_SP_clear_proportion_piechart.png"),
scale = 2,
width = 1920,
height = 1080,
units = "px",
dpi = 300)}
ggplot(count_clear, aes(x = nom,
y = proportion,
fill = nom)) +
geom_col() +
scale_fill_manual(values = count_clear$teinte) +
theme(axis.text.x = element_text(angle = 45)) +
labs(fill = "spliced-variants") +
xlab(label = "spliced-variants") +
ylab(label = "percent")
ggsave(file = paste0(opt$barcode, "_SP_clear_proportion.png"),
scale = 2,
width = 1920,
height = 1080,
units = "px",
dpi = 300)
# Camembert SPxx:
ggplot(count_species_SPxx, aes(x = "species",
y = percent,
fill = nom)) +
geom_col() +
coord_polar("y") +
scale_fill_manual(values = count_species_SPxx$teinte) +
labs(fill = "TSS") +
ylab(label = "TSS usage") +
xlab(label = "percent")
ggsave(file = paste0(opt$barcode, "_count_RNAs_species_piechart.png"),