diff --git a/benchmark/count_contacts.py b/benchmark/count_contacts.py new file mode 100755 index 0000000000000000000000000000000000000000..193966a99875b6cec499b3bba00b7310120ce478 --- /dev/null +++ b/benchmark/count_contacts.py @@ -0,0 +1,10 @@ +import polars as pl +from pathlib import Path +files = list(Path("../../benchmark/wt_AD281").glob("*/hicstuff/matrix/*.txt")) +files2 = list(Path("../../benchmark/wt_AD281").glob("*/hicpro/matrix/raw/*.matrix")) + +files = files + files2 + +with open("matrices.txt", "w") as outf : + outf.write("\n".join([str(f) for f in files])+ "\n") + diff --git a/benchmark/plots.r b/benchmark/plots.r index c259d4d57aa29dcca3b5a9103fd4af857d8132eb..c54a4e8da0e614ec50dfd065b78c57251cb88fdb 100644 --- a/benchmark/plots.r +++ b/benchmark/plots.r @@ -1,9 +1,17 @@ install.packages("tidyverse") install.packages("ggplot2") +install.packages("LaF") library(tidyverse) library(ggplot2) library(dplyr) library(forcats) +library(LaF) + +################################# +# # +# PLOTS ON PROCESS TIME # +# # +################################# df <- read.csv("/home/mcroiset/HiC/benchmark/recap.txt", sep = "\t", header = TRUE) rep2 <- read.csv("/home/mcroiset/HiC/benchmark/recap_rep2.txt", sep = "\t", header = TRUE) @@ -11,7 +19,6 @@ rep3 <- read.csv("/home/mcroiset/HiC/benchmark/recap_rep3.txt", sep = "\t", head merge_df <- bind_rows(df, rep2, rep3) - convert_time <- function(x){ parts <- strsplit(x, " ") list_durations <- character(0) @@ -71,11 +78,18 @@ merge_df$workflow <- workflow processes <- character(0) for (x in merge_df$name) { - process <- lapply(strsplit(x, split = ":"), tail, n = 1L) + process <- lapply(strsplit(x, split = ":"), tail, n = 1L) processes <- append(processes, as.character(process)) } merge_df$name <- processes +short_process <- character(0) +for (x in merge_df$name) { + short <- strsplit(x, split = "\\(")[[1]][1] + short_process <- append(short_process, as.character(short)) +} +merge_df$name <- short_process + filenames <- character(0) for (x in merge_df$file) { file_short_name <- sapply(strsplit(x, split = "/"), "[", 2) @@ -115,7 +129,7 @@ for (x in merge_df$name) { # categories merge_df$categorie <- categories -#print(levels(as.factor(merge_df$name))) +print(levels(as.factor(merge_df$name))) ordered_processes <- tibble(levels(as.factor(merge_df$name))) order <- c(18, 7, 8, 7, 2, 20, 20, 21, 14, 24, 23, 27, 22, 25, 30, 31, 32, 3, 6, 5, 19, 13, 19, 9, 4, 15, 28, 29, 18, 21, 7, 8, 17, 16, 33, 11, 1, 10, 12, 27, 8) ordered_processes <- ordered_processes %>% add_column(order = order) @@ -123,32 +137,155 @@ ordered_processes <- rename(ordered_processes, name = `levels(as.factor(merge_df merge_df <- left_join(merge_df, ordered_processes, by= c("name" = "name")) +categories_colors <- c("#F8766D", "#CD9600", "#7CAE00", "#00BE67", "#00BFC4", "#00A9FF", "#aeb6bf", "#C77CFF") +# pdf("process_time.pdf") -h# pdf("process_time.pdf") +ggplot(merge_df, aes(x = fct_reorder(name, order), y = duration_minutes, color = categorie)) + + geom_boxplot() + + scale_shape_manual(values = important_processes) + + scale_color_manual(values=categories_colors) + + facet_wrap(~ file, ncol = 7) + + xlab("Processes") + + ylab("Duration in minutes") + + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())+ + ggtitle("Duration time of each process") + + scale_y_log10() + + theme_bw() -ggplot(merge_df, aes(x = name, y = duration_minutes)) + - geom_point(aes(color = name, shape = name)) + +ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color = categorie)) + + geom_boxplot() + scale_shape_manual(values = important_processes) + - facet_wrap(~ file, ncol = 8) + - scale_y_continuous(breaks = seq(0, 490, by = 30)) + + scale_color_manual(values=categories_colors) + + facet_wrap(~ file, ncol = 4) + xlab("Processes") + ylab("Duration in minutes") + - theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + - ggtitle("Duration time of each process") + ggtitle("Real execution time of each process") + + scale_y_log10() + + theme_bw() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) -ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes)) + - geom_boxplot(aes(color = categorie)) + +ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color = categorie)) + + geom_boxplot() + scale_shape_manual(values = important_processes) + - facet_wrap(~ file, ncol = 8) + + scale_color_manual(values=categories_colors) + + facet_wrap(~ file, ncol = 7) + xlab("Processes") + ylab("Duration in minutes") + - theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())+ ggtitle("Real execution time of each process") + scale_y_log10() + - theme_bw() + theme_bw() + + theme(axis.title.x=element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank()) +merge_df.ordered <- merge_df %>% + group_by(file) %>% + arrange(order, .by_group = TRUE) %>% + mutate(cum_time = cumsum(realtime_minutes)) + +ggplot(merge_df.ordered, aes(x = fct_reorder(name, order), y = cum_time, group = file)) + + geom_line(aes(color = file)) + + xlab("Processes") + + ylab("Duration in minutes") + + ggtitle("Duration per conformation") + + theme_bw() + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + + geom_point(aes(fill = categorie), shape = 21, size = 2.5) + + scale_fill_manual(values = categories_colors) + + scale_y_continuous(breaks = seq(0, 2300, by = 200)) + # dev.off() -merge_df +####################################### +# # +# PLOTS ON NUMBER OF CONTACTS # +# # +####################################### + + +listFiles <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices.txt", sep = "\n") +listFiles2 <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices_rep2.txt", sep = "\n") +listFiles3 <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices_rep3.txt", sep = "\n") + +names(listFiles) <- "File" +names(listFiles2) <- "File" +names(listFiles3) <- "File" + +merge_lf <- bind_rows(listFiles, listFiles2, listFiles3) + +df.counts <- tibble(File = character(), Counts = numeric()) +for (filenames in merge_lf) { + filenames <- lapply(strsplit(filenames, split = "../../"), tail, n = 1L) + filenames <- paste("/home/mcroiset/HiC/", filenames ,sep = "") + for (f in filenames) { + count <- determine_nlines(f) + df.counts <- add_row(df.counts, File = f, Counts = count) + # print(f) + # print(count) + } +} + +df.counts$File +df.filtered <- df.counts %>% + filter(grepl('AD281.1000', File) | grepl('AD281_abs',File)) +#df.filtered$File + +filenames <- character(0) +for (x in df.filtered$File) { + file_short_name <- strsplit(x, split = "/")[[1]][7] + filenames <- append(filenames,file_short_name) +} +#filenames +df.filtered$File <- filenames +#df.filtered + +workflow <- character(0) +for (x in df.filtered$File) { + w <- strsplit(x, split = "_")[[1]][1] + workflow <- append(workflow,w) +} +df.filtered$workflow <- workflow + +align <- character(0) +for (x in df.filtered$File) { + a <- strsplit(x, split = "_")[[1]][2] + align <- append(align,a) +} +df.filtered$align <- align + +filtering <- character(0) +for (x in df.filtered$File) { + f <- strsplit(x, split = "_")[[1]][3] + f2 <- strsplit(x, split = "_")[[1]][4] + f3 <- strsplit(x, split = "_")[[1]][5] + f4 <- strsplit(x, split = "_")[[1]][6] + ff <- paste(f, f2, f3, f4 ,sep = "_") + if (strsplit(ff, split = "_")[[1]][2] == "NA") { + ff <- strsplit(ff, split = "_")[[1]][1] + } + else if (strsplit(ff, split = "_")[[1]][3] == "NA") { + ff1 <- strsplit(ff, split = "_")[[1]][1] + ff2 <- strsplit(ff, split = "_")[[1]][2] + ff <- paste(ff1, ff2 ,sep = "_") + } + filtering <- append(filtering,ff) +} +df.filtered$filtering <- filtering + +df.excludePicard <- df.filtered %>% + filter(!grepl('filter_picard', filtering)) + +ggplot(df.filtered, aes(x = reorder(File,Counts), y = Counts, shape = align, color = filtering)) + + geom_point(size = 5) + + scale_y_log10() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ + ggtitle("Number of contacts") + +ggplot(df.excludePicard, aes(x = reorder(File,Counts), y = Counts, shape = align, color = filtering)) + + geom_point(size = 5) + + scale_y_log10() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ + ggtitle("Number of contacts (without Picard)") +