diff --git a/benchmark/plots.r b/benchmark/plots.r index 28a0609311e5ae1fa30b57d65c1c27cacf67542e..c259d4d57aa29dcca3b5a9103fd4af857d8132eb 100644 --- a/benchmark/plots.r +++ b/benchmark/plots.r @@ -6,6 +6,11 @@ library(dplyr) library(forcats) df <- read.csv("/home/mcroiset/HiC/benchmark/recap.txt", sep = "\t", header = TRUE) +rep2 <- read.csv("/home/mcroiset/HiC/benchmark/recap_rep2.txt", sep = "\t", header = TRUE) +rep3 <- read.csv("/home/mcroiset/HiC/benchmark/recap_rep3.txt", sep = "\t", header = TRUE) + +merge_df <- bind_rows(df, rep2, rep3) + convert_time <- function(x){ parts <- strsplit(x, " ") @@ -45,44 +50,44 @@ convert_time <- function(x){ return(list_durations) } -list_dur <- convert_time(df$duration) -list_realtime <- convert_time(df$realtime) +list_dur <- convert_time(merge_df$duration) +list_realtime <- convert_time(merge_df$realtime) -df$duration_minutes <- list_dur -df$duration_minutes <- as.numeric(df$duration_minutes) +merge_df$duration_minutes <- list_dur +merge_df$duration_minutes <- as.numeric(merge_df$duration_minutes) -df$realtime_minutes <- list_realtime -df$realtime_minutes <- as.numeric(df$realtime_minutes) +merge_df$realtime_minutes <- list_realtime +merge_df$realtime_minutes <- as.numeric(merge_df$realtime_minutes) -val <- df %>% group_by(name) %>% summarise(moy = mean(duration_minutes), ect = sd(duration_minutes)) +val <- merge_df %>% group_by(name) %>% summarise(moy = mean(duration_minutes), ect = sd(duration_minutes)) workflow <- character(0) -for (x in df$file) { +for (x in merge_df$file) { full <- sapply(strsplit(x, split = "/"), "[", 2) w <- sapply(strsplit(full, split = "_"), "[", 1) workflow <- append(workflow,w) } -df$workflow <- workflow +merge_df$workflow <- workflow processes <- character(0) -for (x in df$name) { +for (x in merge_df$name) { process <- lapply(strsplit(x, split = ":"), tail, n = 1L) processes <- append(processes, as.character(process)) } -df$name <- processes +merge_df$name <- processes filenames <- 
character(0) -for (x in df$file) { +for (x in merge_df$file) { file_short_name <- sapply(strsplit(x, split = "/"), "[", 2) filenames <- append(filenames,file_short_name) } -df$file <- filenames +merge_df$file <- filenames important_processes <- c(15, 16, 17, 18, 7, 1, 1, 1, 1, 1, 1, 1, 1,1,1,1,1,9,1,2,5,6,1,1,1,1,1,15,11,1,1,1,1,12,1,1,1,1,1) categories <- character(0) -for (x in df$name) { - if (grepl('\\w*_ALIGN\\w*', x) || grepl('\\w*TRIM\\w*', x) || grepl('MERGE_BOWTIE2', x)) { +for (x in merge_df$name) { + if (grepl('\\w*ALIGN\\w*', x) || grepl('\\w*TRIM\\w*', x) || grepl('MERGE_BOWTIE2', x)) { categories <- append(categories, "align") } else if (grepl('\\w*COOLER\\w*', x)) { @@ -91,7 +96,7 @@ for (x in df$name) { else if (grepl('\\w*PAIRS\\w*', x)) { categories<- append(categories, "pairs") } - else if (grepl('\\w*MATRIX\\w*', x)) { + else if (grepl('\\w*MATRIX\\w*', x) || grepl('\\w*BUILD_CONTACT\\w*', x) || grepl('\\w*ICE_NORM\\w*', x)) { categories<- append(categories, "matrix") } else if (grepl('\\w*FILTER\\w*', x) || grepl('\\w*PICARD\\w*', x) || grepl('\\w*SAMTOOLS\\w*', x)) { @@ -100,9 +105,6 @@ for (x in df$name) { else if (grepl('\\w*CUTSITE\\w*', x)) { categories<- append(categories, "cutsite") } - else if (grepl('\\w*ITERALIGN\\w*', x)) { - categories<- append(categories, "iteralign") - } else if (grepl('\\w*SAMPLESHEET\\w*', x) || (grepl('BOWTIE2_BUILD', x) || grepl('\\w*GETCHROM\\w*', x) || grepl('\\w*GET_RESTRIC\\w*', x))) { categories<- append(categories, "data_prep") } @@ -111,21 +113,21 @@ for (x in df$name) { } } # categories -df$categorie <- categories +merge_df$categorie <- categories -print(levels(as.factor(df$name))) -ordered_processes <- tibble(levels(as.factor(df$name))) -order <- c(18, 7, 8, 7, 2, 20, 21, 14, 24, 23, 27, 22, 25, 30, 31, 32, 3, 6, 5, 19, 13, 19, 9, 4, 15, 28, 29, 18, 7, 8, 17, 16, 33, 11, 1, 10, 12, 27, 8) +#print(levels(as.factor(merge_df$name))) +ordered_processes <- tibble(levels(as.factor(merge_df$name))) +order <- 
c(18, 7, 8, 7, 2, 20, 20, 21, 14, 24, 23, 27, 22, 25, 30, 31, 32, 3, 6, 5, 19, 13, 19, 9, 4, 15, 28, 29, 18, 21, 7, 8, 17, 16, 33, 11, 1, 10, 12, 27, 8) ordered_processes <- ordered_processes %>% add_column(order = order) -ordered_processes <- rename(ordered_processes, name = `levels(as.factor(df$name))`) +ordered_processes <- rename(ordered_processes, name = `levels(as.factor(merge_df$name))`) -df <- left_join(df, ordered_processes, by= c("name" = "name")) +merge_df <- left_join(merge_df, ordered_processes, by= c("name" = "name")) -# pdf("process_time.pdf") +# pdf("process_time.pdf") -ggplot(df, aes(x = name, y = duration_minutes)) + +ggplot(merge_df, aes(x = name, y = duration_minutes)) + geom_point(aes(color = name, shape = name)) + scale_shape_manual(values = important_processes) + facet_wrap(~ file, ncol = 8) + @@ -135,15 +137,18 @@ ggplot(df, aes(x = name, y = duration_minutes)) + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + ggtitle("Duration time of each process") -ggplot(df, aes(x = fct_reorder(name, order), y = realtime_minutes)) + - geom_point(aes(color = categorie)) + +ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes)) + + geom_boxplot(aes(color = categorie)) + scale_shape_manual(values = important_processes) + facet_wrap(~ file, ncol = 8) + xlab("Processes") + ylab("Duration in minutes") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())+ ggtitle("Real execution time of each process") + - scale_y_log10() + scale_y_log10() + + theme_bw() # dev.off() + +merge_df