Skip to content
Snippets Groups Projects
Verified Commit 0d8c6e56 authored by Mia Croiset's avatar Mia Croiset
Browse files

add cumsum plot + nm contacts plot

parent 810d4268
No related branches found
No related tags found
No related merge requests found
import polars as pl
from pathlib import Path
files = list(Path("../../benchmark/wt_AD281").glob("*/hicstuff/matrix/*.txt"))
files2 = list(Path("../../benchmark/wt_AD281").glob("*/hicpro/matrix/raw/*.matrix"))
files = files + files2
with open("matrices.txt", "w") as outf :
outf.write("\n".join([str(f) for f in files])+ "\n")
install.packages("tidyverse")
install.packages("ggplot2")
install.packages("LaF")
library(tidyverse)
library(ggplot2)
library(dplyr)
library(forcats)
library(LaF)
#################################
# #
# PLOTS ON PROCESS TIME #
# #
#################################
df <- read.csv("/home/mcroiset/HiC/benchmark/recap.txt", sep = "\t", header = TRUE)
rep2 <- read.csv("/home/mcroiset/HiC/benchmark/recap_rep2.txt", sep = "\t", header = TRUE)
......@@ -11,7 +19,6 @@ rep3 <- read.csv("/home/mcroiset/HiC/benchmark/recap_rep3.txt", sep = "\t", head
merge_df <- bind_rows(df, rep2, rep3)
convert_time <- function(x){
parts <- strsplit(x, " ")
list_durations <- character(0)
......@@ -71,11 +78,18 @@ merge_df$workflow <- workflow
processes <- character(0)
for (x in merge_df$name) {
process <- lapply(strsplit(x, split = ":"), tail, n = 1L)
process <- lapply(strsplit(x, split = ":"), tail, n = 1L)
processes <- append(processes, as.character(process))
}
merge_df$name <- processes
short_process <- character(0)
for (x in merge_df$name) {
short <- strsplit(x, split = "\\(")[[1]][1]
short_process <- append(short_process, as.character(short))
}
merge_df$name <- short_process
filenames <- character(0)
for (x in merge_df$file) {
file_short_name <- sapply(strsplit(x, split = "/"), "[", 2)
......@@ -115,7 +129,7 @@ for (x in merge_df$name) {
# categories
merge_df$categorie <- categories
#print(levels(as.factor(merge_df$name)))
print(levels(as.factor(merge_df$name)))
ordered_processes <- tibble(levels(as.factor(merge_df$name)))
order <- c(18, 7, 8, 7, 2, 20, 20, 21, 14, 24, 23, 27, 22, 25, 30, 31, 32, 3, 6, 5, 19, 13, 19, 9, 4, 15, 28, 29, 18, 21, 7, 8, 17, 16, 33, 11, 1, 10, 12, 27, 8)
ordered_processes <- ordered_processes %>% add_column(order = order)
......@@ -123,32 +137,155 @@ ordered_processes <- rename(ordered_processes, name = `levels(as.factor(merge_df
merge_df <- left_join(merge_df, ordered_processes, by= c("name" = "name"))
categories_colors <- c("#F8766D", "#CD9600", "#7CAE00", "#00BE67", "#00BFC4", "#00A9FF", "#aeb6bf", "#C77CFF")
# pdf("process_time.pdf")
h# pdf("process_time.pdf")
ggplot(merge_df, aes(x = fct_reorder(name, order), y = duration_minutes, color = categorie)) +
geom_boxplot() +
scale_shape_manual(values = important_processes) +
scale_color_manual(values=categories_colors) +
facet_wrap(~ file, ncol = 7) +
xlab("Processes") +
ylab("Duration in minutes") +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())+
ggtitle("Duration time of each process") +
scale_y_log10() +
theme_bw()
ggplot(merge_df, aes(x = name, y = duration_minutes)) +
geom_point(aes(color = name, shape = name)) +
ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color = categorie)) +
geom_boxplot() +
scale_shape_manual(values = important_processes) +
facet_wrap(~ file, ncol = 8) +
scale_y_continuous(breaks = seq(0, 490, by = 30)) +
scale_color_manual(values=categories_colors) +
facet_wrap(~ file, ncol = 4) +
xlab("Processes") +
ylab("Duration in minutes") +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
ggtitle("Duration time of each process")
ggtitle("Real execution time of each process") +
scale_y_log10() +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes)) +
geom_boxplot(aes(color = categorie)) +
ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color = categorie)) +
geom_boxplot() +
scale_shape_manual(values = important_processes) +
facet_wrap(~ file, ncol = 8) +
scale_color_manual(values=categories_colors) +
facet_wrap(~ file, ncol = 7) +
xlab("Processes") +
ylab("Duration in minutes") +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())+
ggtitle("Real execution time of each process") +
scale_y_log10() +
theme_bw()
theme_bw() +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
merge_df.ordered <- merge_df %>%
group_by(file) %>%
arrange(order, .by_group = TRUE) %>%
mutate(cum_time = cumsum(realtime_minutes))
ggplot(merge_df.ordered, aes(x = fct_reorder(name, order), y = cum_time, group = file)) +
geom_line(aes(color = file)) +
xlab("Processes") +
ylab("Duration in minutes") +
ggtitle("Duration per conformation") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
geom_point(aes(fill = categorie), shape = 21, size = 2.5) +
scale_fill_manual(values = categories_colors) +
scale_y_continuous(breaks = seq(0, 2300, by = 200))
# dev.off()
merge_df
#######################################
# #
# PLOTS ON NUMBER OF CONTACTS #
# #
#######################################
listFiles <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices.txt", sep = "\n")
listFiles2 <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices_rep2.txt", sep = "\n")
listFiles3 <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices_rep3.txt", sep = "\n")
names(listFiles) <- "File"
names(listFiles2) <- "File"
names(listFiles3) <- "File"
merge_lf <- bind_rows(listFiles, listFiles2, listFiles3)
df.counts <- tibble(File = character(), Counts = numeric())
for (filenames in merge_lf) {
filenames <- lapply(strsplit(filenames, split = "../../"), tail, n = 1L)
filenames <- paste("/home/mcroiset/HiC/", filenames ,sep = "")
for (f in filenames) {
count <- determine_nlines(f)
df.counts <- add_row(df.counts, File = f, Counts = count)
# print(f)
# print(count)
}
}
df.counts$File
df.filtered <- df.counts %>%
filter(grepl('AD281.1000', File) | grepl('AD281_abs',File))
#df.filtered$File
filenames <- character(0)
for (x in df.filtered$File) {
file_short_name <- strsplit(x, split = "/")[[1]][7]
filenames <- append(filenames,file_short_name)
}
#filenames
df.filtered$File <- filenames
#df.filtered
workflow <- character(0)
for (x in df.filtered$File) {
w <- strsplit(x, split = "_")[[1]][1]
workflow <- append(workflow,w)
}
df.filtered$workflow <- workflow
align <- character(0)
for (x in df.filtered$File) {
a <- strsplit(x, split = "_")[[1]][2]
align <- append(align,a)
}
df.filtered$align <- align
filtering <- character(0)
for (x in df.filtered$File) {
f <- strsplit(x, split = "_")[[1]][3]
f2 <- strsplit(x, split = "_")[[1]][4]
f3 <- strsplit(x, split = "_")[[1]][5]
f4 <- strsplit(x, split = "_")[[1]][6]
ff <- paste(f, f2, f3, f4 ,sep = "_")
if (strsplit(ff, split = "_")[[1]][2] == "NA") {
ff <- strsplit(ff, split = "_")[[1]][1]
}
else if (strsplit(ff, split = "_")[[1]][3] == "NA") {
ff1 <- strsplit(ff, split = "_")[[1]][1]
ff2 <- strsplit(ff, split = "_")[[1]][2]
ff <- paste(ff1, ff2 ,sep = "_")
}
filtering <- append(filtering,ff)
}
df.filtered$filtering <- filtering
df.excludePicard <- df.filtered %>%
filter(!grepl('filter_picard', filtering))
ggplot(df.filtered, aes(x = reorder(File,Counts), y = Counts, shape = align, color = filtering)) +
geom_point(size = 5) +
scale_y_log10() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
ggtitle("Number of contacts")
ggplot(df.excludePicard, aes(x = reorder(File,Counts), y = Counts, shape = align, color = filtering)) +
geom_point(size = 5) +
scale_y_log10() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
ggtitle("Number of contacts (without Picard)")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment