diff --git a/benchmark/plots.r b/benchmark/plots.r index c54a4e8da0e614ec50dfd065b78c57251cb88fdb..07e84b39b395ef5d719ff14f83e8c1083ba64ccf 100644 --- a/benchmark/plots.r +++ b/benchmark/plots.r @@ -13,12 +13,16 @@ library(LaF) # # ################################# +#Get the 3 replicates + df <- read.csv("/home/mcroiset/HiC/benchmark/recap.txt", sep = "\t", header = TRUE) rep2 <- read.csv("/home/mcroiset/HiC/benchmark/recap_rep2.txt", sep = "\t", header = TRUE) rep3 <- read.csv("/home/mcroiset/HiC/benchmark/recap_rep3.txt", sep = "\t", header = TRUE) merge_df <- bind_rows(df, rep2, rep3) + +#Function to convert the time from report file (format Xd XXm XXs) to minutes convert_time <- function(x){ parts <- strsplit(x, " ") list_durations <- character(0) @@ -28,22 +32,26 @@ convert_time <- function(x){ hours <- 0 mili <- 0 for ( single in part ) { - if (grepl('\\d+s', single)) { + if (grepl('\\d+s', single)) #for seconds + { sec <- as.numeric(substr(single,1,nchar(single)-1)) sec <- sec/60 sec <- as.numeric(format(round(sec, 3), nsmall = 3)) } - if (grepl('\\d+m$', single)) { + if (grepl('\\d+m$', single)) #for minutes + { minute <- as.numeric(substr(single,1,nchar(single)-1)) minute <- as.numeric(format(round(minute, 2), nsmall = 2)) } - if (grepl('\\d+h', single)) { + if (grepl('\\d+h', single)) #for days + { hours <- as.numeric(substr(single,1,nchar(single)-1)) hours <- hours*60 hours <- as.numeric(format(round(hours, 2), nsmall = 2)) } - if (grepl('\\d+ms', single)) { + if (grepl('\\d+ms', single)) #for milliseconds, may be equal to 0 already + { mili <- as.numeric(substr(single,1,nchar(single)-2)) if (mili > 0) { mili <- (mili/1000)/60 @@ -56,7 +64,7 @@ convert_time <- function(x){ } return(list_durations) } - +#convert the time for both duration (from submit of the process to end) and realtime (only process time) list_dur <- convert_time(merge_df$duration) list_realtime <- convert_time(merge_df$realtime) @@ -66,8 +74,10 @@ merge_df$duration_minutes <- as.numeric(merge_df$duration_minutes) merge_df$realtime_minutes <- list_realtime merge_df$realtime_minutes <- as.numeric(merge_df$realtime_minutes) +#mean and standard deviation, but in the end not used in this script val <- merge_df %>% group_by(name) %>% summarise(moy = mean(duration_minutes), ect = sd(duration_minutes)) +#add workflow column (either hicpro or hicstuff) workflow <- character(0) for (x in merge_df$file) { full <- sapply(strsplit(x, split = "/"), "[", 2) @@ -76,6 +86,7 @@ for (x in merge_df$file) { } merge_df$workflow <- workflow +#strip process names : only last part, remove the workflow:subworkflow:process structure processes <- character(0) for (x in merge_df$name) { process <- lapply(strsplit(x, split = ":"), tail, n = 1L) @@ -83,6 +94,8 @@ for (x in merge_df$name) { } merge_df$name <- processes +#shorten the process names : removes the parenthesis with the data +#works if you don't care mixing all data for one process ! short_process <- character(0) for (x in merge_df$name) { short <- strsplit(x, split = "\\(")[[1]][1] @@ -90,6 +103,8 @@ for (x in merge_df$name) { } merge_df$name <- short_process +#strip file names, to know which "conformation" it is (previous format : conformation(hic[stuff|pro]_aligntype_filtertype/pipeline_info/myfile)) +#keeps first part before "/" filenames <- character(0) for (x in merge_df$file) { file_short_name <- sapply(strsplit(x, split = "/"), "[", 2) @@ -97,8 +112,10 @@ for (x in merge_df$file) { } merge_df$file <- filenames +#used to add a particular shape for specific processes, not used in the end important_processes <- c(15, 16, 17, 18, 7, 1, 1, 1, 1, 1, 1, 1, 1,1,1,1,1,9,1,2,5,6,1,1,1,1,1,15,11,1,1,1,1,12,1,1,1,1,1) +#define categories for easier plot reading, n is the rest categories <- character(0) for (x in merge_df$name) { if (grepl('\\w*ALIGN\\w*', x) || grepl('\\w*TRIM\\w*', x) || grepl('MERGE_BOWTIE2', x)) { @@ -129,7 +146,9 @@ for (x in merge_df$name) { # categories merge_df$categorie <- categories -print(levels(as.factor(merge_df$name))) +#print(levels(as.factor(merge_df$name))) +#order the processes manually in order of the pipeline +#may fail if there's more options than I definded here, may make the order false because it depends of the alphabetical order of the processes ordered_processes <- tibble(levels(as.factor(merge_df$name))) order <- c(18, 7, 8, 7, 2, 20, 20, 21, 14, 24, 23, 27, 22, 25, 30, 31, 32, 3, 6, 5, 19, 13, 19, 9, 4, 15, 28, 29, 18, 21, 7, 8, 17, 16, 33, 11, 1, 10, 12, 27, 8) ordered_processes <- ordered_processes %>% add_column(order = order) @@ -140,6 +159,8 @@ merge_df <- left_join(merge_df, ordered_processes, by= c("name" = "name")) categories_colors <- c("#F8766D", "#CD9600", "#7CAE00", "#00BE67", "#00BFC4", "#00A9FF", "#aeb6bf", "#C77CFF") # pdf("process_time.pdf") +#2 first plots for both duration time and realtime +#Then realtime whitout x labels for better visibility ggplot(merge_df, aes(x = fct_reorder(name, order), y = duration_minutes, color = categorie)) + geom_boxplot() + @@ -179,6 +200,7 @@ ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color = axis.text.x=element_blank(), axis.ticks.x=element_blank()) +#Cumulative sum of the realtime of the processes for each conformation merge_df.ordered <- merge_df %>% group_by(file) %>% @@ -204,7 +226,7 @@ ggplot(merge_df.ordered, aes(x = fct_reorder(name, order), y = cum_time, group = # # ####################################### - +#get the 3 replicates matrices for all conformation listFiles <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices.txt", sep = "\n") listFiles2 <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices_rep2.txt", sep = "\n") listFiles3 <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices_rep3.txt", sep = "\n") @@ -215,6 +237,7 @@ names(listFiles3) <- "File" merge_lf <- bind_rows(listFiles, listFiles2, listFiles3) +#set the dataframe with the file and the number of contact (= number of line in the raw matrix) df.counts <- tibble(File = character(), Counts = numeric()) for (filenames in merge_lf) { filenames <- lapply(strsplit(filenames, split = "../../"), tail, n = 1L) @@ -227,11 +250,15 @@ for (filenames in merge_lf) { } } -df.counts$File +#df.counts$File + +#keep only AD281 data, because I had previous results in the same folder +#If you have different data, change the code below df.filtered <- df.counts %>% filter(grepl('AD281.1000', File) | grepl('AD281_abs',File)) #df.filtered$File +#strip the file names, same as earlier filenames <- character(0) for (x in df.filtered$File) { file_short_name <- strsplit(x, split = "/")[[1]][7] @@ -241,6 +268,7 @@ for (x in df.filtered$File) { df.filtered$File <- filenames #df.filtered +#get the workflow, same as earlier workflow <- character(0) for (x in df.filtered$File) { w <- strsplit(x, split = "_")[[1]][1] @@ -248,6 +276,7 @@ for (x in df.filtered$File) { } df.filtered$workflow <- workflow +#get the alignment type (normal | iteralign | cutsite) align <- character(0) for (x in df.filtered$File) { a <- strsplit(x, split = "_")[[1]][2] @@ -255,6 +284,7 @@ for (x in df.filtered$File) { } df.filtered$align <- align +#get the filtering options (nofilter | filter_events | filter_pcr | filter_and_filter_pcr | filter_picard) filtering <- character(0) for (x in df.filtered$File) { f <- strsplit(x, split = "_")[[1]][3] @@ -274,9 +304,11 @@ for (x in df.filtered$File) { } df.filtered$filtering <- filtering +#exclude picard filtering because really low df.excludePicard <- df.filtered %>% filter(!grepl('filter_picard', filtering)) +#plot the number of contacts per conformation, with or without Picard filtering ggplot(df.filtered, aes(x = reorder(File,Counts), y = Counts, shape = align, color = filtering)) + geom_point(size = 5) + scale_y_log10() +