diff --git a/benchmark/loadCooler.py b/benchmark/loadCooler.py new file mode 100644 index 0000000000000000000000000000000000000000..5ba267f417808203f27e0fc65ee5527ca8c4e64d --- /dev/null +++ b/benchmark/loadCooler.py @@ -0,0 +1,209 @@ +# import core packages +import warnings +warnings.filterwarnings("ignore") +from itertools import combinations +import os +from pathlib import Path + + +# import semi-core packages +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.backends.backend_pdf import PdfPages +# %matplotlib inline +plt.style.use('seaborn-v0_8-poster') +import numpy as np +import pandas as pd +from multiprocessing import Pool + +# import open2c libraries +import bioframe + +import cooler +import cooltools + +# count cpus +num_cpus = os.getenv('SLURM_CPUS_PER_TASK') +if not num_cpus: + num_cpus = os.cpu_count() +num_cpus = int(num_cpus) + + + +# path to your mcool files +files = list(Path("./rerunParasplitNov24/tests_parasplit_nobalance").glob("*/contact_maps/cool/*.mcool")) +filesBalanced = list(Path("./rerunParasplitNov24/tests_parasplit_balance").glob("*/contact_maps/cool/*.mcool")) + +# print(files) + +allfiles = list() + +for f in files: + # select a resolution from your mcool + c = cooler.Cooler(str(f)+"::resolutions/1000") + name = str(f).split("/")[2] + #quick sort to exclude bowtie2 --very-sensitive-local option results + if 'local' not in name: + #print(name) + grp = (name, c) + allfiles.append(grp) + +# same for balanced files +""" for f in filesBalanced: + c = cooler.Cooler(str(f)+"::resolutions/4000") + name = str(f).split("/")[2] + if 'local' not in name: + print(name) + grp = (name, c) + allfiles.append(grp) """ + +# print(allfiles) +# print(len(allfiles)) + +# combine each case with the other to compare each conditions +comb = list(combinations(allfiles, 2)) +#print(len(comb)) +#print(comb[0][1]) + +# have a matrix to make the list of chromosomes for all +c = comb[0][1][1] +# quick test to check the combinations +""" for x in range(len(comb)): + print(str(x) + " -> "+ str(comb[x])) """ + +### to make a list of chromosome start/ends in bins: +chromstarts = [] +for i in c.chromnames: + print(f'{i} : {c.extent(i)}') + chromstarts.append(c.extent(i)[0]) + +# print(chromstarts) +#list of chromosomes arms for S288c +chr_arms = bioframe.make_viewframe([("chr1",0,151465,"chr1_p"),("chr1",151465,230218,"chr1_q"),("chr2",0,238207,"chr2_p"),("chr2",238207,813184,"chr2_q"),("chr3",0,114385,"chr3_p"),("chr3",114385,316620,"chr3_q"),("chr4",0,449711,"chr4_p"),("chr4",449711,1531933,"chr4_q"),("chr5",0,153771,"chr5_p"),("chr5",153771,578613,"chr5_q"),("chr6",0,148510,"chr6_p"),("chr6",148510,270161,"chr6_q"),("chr7",0,497038,"chr7_p"),("chr7",497038,1090940,"chr7_q"),("chr8",0,105703,"chr8_p"),("chr8",105703,562643,"chr8_q"),("chr9",0,355629,"chr9_p"),("chr9",355629,439888,"chr9_q"),("chr10",0,436425,"chr10_p"),("chr10",436425,745751,"chr10_q"),("chr11",0,440246,"chr11_p"),("chr11",440246,666816,"chr11_q"),("chr12",0,150947,"chr12_p"),("chr12",150947,1078177,"chr12_q"),("chr13",0,268031,"chr13_p"),("chr13",268031,924431,"chr13_q"),("chr14",0,628758,"chr14_p"),("chr14",628758,784333,"chr14_q"),("chr15",0,326702,"chr15_p"),("chr15",326702,1091291,"chr15_q"),("chr16",0,555957,"chr16_p"),("chr16",555957,948066,"chr16_q"),("2_micron",0,6318,"2_micron"),("mitochondrion",0,85779,"mitochondrion")]) +chr_arms = chr_arms[chr_arms.chrom.isin(c.chromnames)].reset_index(drop=True) +# print(chr_arms) + + +def plotGraph(diff,c,n1,n2): + f, ax = plt.subplots( + figsize=(7,6)) + im = ax.matshow((np.log2(diff)), cmap = "bwr", vmin=-1, vmax=1); + plt.colorbar(im ,fraction=0.046, pad=0.04) + plt.title("Diff %s / %s" %(n1, n2), fontsize=14) + ax.set(xticks=chromstarts, xticklabels=c.chromnames) + ax.xaxis.set_label_position('bottom') + ax.tick_params(axis='x', labelsize=8, rotation=45, labelbottom=True, labeltop=False) + return f + +def plotGraphChr3(diff,c,n1,n2): + f, ax = plt.subplots( + figsize=(7,6)) + im = ax.matshow((np.log2(diff)), cmap = "bwr", vmin=-1, vmax=1); + plt.colorbar(im ,fraction=0.046, pad=0.04) + plt.title("Diff (chr3) %s / %s" %(n1, n2), fontsize=14) + ax.xaxis.set_label_position('bottom') + ax.tick_params(axis='x', labelsize=8, rotation=45, labelbottom=True, labeltop=False) + return f + +# plot only given combinations +n1 = comb[62][1][0] +n2 = comb[54][0][0] +c1_tot = np.sum(comb[62][1][1].matrix(balance=False).fetch('chr3')) +c2_tot = np.sum(comb[54][0][1].matrix(balance=False).fetch('chr3')) +# to normalize to 1 +vec1 = np.array([c1_tot]) +vec2 = np.array([c2_tot]) +diff = ((comb[62][1][1].matrix(balance=False).fetch('chr3')/vec1)/(comb[54][0][1].matrix(balance=False).fetch('chr3')/vec2)) +for x in range(0, diff.shape[0]): + for y in range(0, diff.shape[1]): + if x == y: + diff[x,y] = 0 +plot = plotGraphChr3(diff, c, n1, n2) +plot.savefig('chr3_2c.png') +plt.close(plot) + +# plot all combinations +""" n=1 +for x in comb: + # print(x) + print("Figure %i" % n) + c1_tot = np.sum(x[0][1].matrix(balance=False)[:]) + c2_tot = np.sum(x[1][1].matrix(balance=False)[:]) + vec1 = np.array([c1_tot]) + vec2 = np.array([c2_tot]) + diff = ((x[0][1].matrix(balance=False)[:]/vec1)/(x[1][1].matrix(balance=False)[:]/vec2)) + n1 = x[0][0] + n2 = x[1][0] + # pp = PdfPages('%s.pdf' % n) + n = n+1 + for x in range(0, diff.shape[0]): + for y in range(0, diff.shape[1]): + if x == y: + diff[x,y] = 0 + plot = plotGraph(diff, c, n1, n2) + plot.savefig('%s.png' % n) + plt.close(plot) + # pp.savefig(plot) + # pp.close() + # print(diff) """ + +# plot all combinations but just chr3 +""" n=1 +for x in comb: + # print(x) + print("Figure %i" % n) + c1_tot = np.sum(x[0][1].matrix(balance=False).fetch('chr3')) + c2_tot = np.sum(x[1][1].matrix(balance=False).fetch('chr3')) + vec1 = np.array([c1_tot]) + vec2 = np.array([c2_tot]) + diff = ((x[0][1].matrix(balance=False).fetch('chr3'))/(x[1][1].matrix(balance=False).fetch('chr3'))) + n1 = x[0][0] + n2 = x[1][0] + # pp = PdfPages('chr3_%s_noNorm.pdf' % n) + n = n+1 + for x in range(0, diff.shape[0]): + for y in range(0, diff.shape[1]): + if x == y: + diff[x,y] = 0 + plot = plotGraphChr3(diff, c, n1, n2) + plot.savefig('chr3_%s_noNorm.png' % n) + plt.close(plot) + # pp.savefig(plot) + # pp.close() + # print(diff) """ + + +print("\n=========\n") +#print(allfiles[0]) + +# cvd == contacts-vs-distance +cvd = cooltools.expected_cis( + clr=c, + view_df=chr_arms, + smooth=True, + aggregate_smoothed=False, + nproc=1, + clr_weight_name=None + # nproc=num_cpus #if you do not have multiple cores available, set to 1 +) + + +# plot contact vs distance for each chromosomal arm +""" print(cvd.head(4)) + +f, ax = plt.subplots(1,1) + +for region in chr_arms['name']: + pp = PdfPages('dist_normal_nofilter_%s.pdf' % region) + ax.loglog( + cvd['dist_bp'].loc[cvd['region1']==region], + cvd['contact_frequency'].loc[cvd['region1']==region], + ) + ax.set( + xlabel='separation, bp', + ylabel='IC contact frequency') + ax.set_aspect(1.0) + ax.grid(lw=0.5) + pp.savefig(f) + pp.close() +""" diff --git a/benchmark/plotsBenchmarkParasplit.R b/benchmark/plotsBenchmarkParasplit.R new file mode 100644 index 0000000000000000000000000000000000000000..3cedc6f616e796c0842ad4575ec4418f034720fd --- /dev/null +++ b/benchmark/plotsBenchmarkParasplit.R @@ -0,0 +1,388 @@ +install.packages("tidyverse") +install.packages("ggplot2") +install.packages("LaF") +library(tidyverse) +library(ggplot2) +library(dplyr) +library(forcats) +library(LaF) +library(scales) + +################################# +# # +# PLOTS ON PROCESS TIME # +# # +################################# + +#Get the 3 replicates + +df <- read.csv("/home/mcroiset/HiC/test_parasplit/recap_rep0.txt", sep = "\t", header = TRUE) +rep2 <- read.csv("/home/mcroiset/HiC/test_parasplit/recap_rep1.txt", sep = "\t", header = TRUE) +rep3 <- read.csv("/home/mcroiset/HiC/test_parasplit/recap_rep2.txt", sep = "\t", header = TRUE) + +# para_test <- read.csv("/home/mcroiset/HiC/test_parasplit/hicstuff_parasplit/pipeline_info/execution_trace_2024-08-30_11-40-06.txt", sep = "\t", header= TRUE) + +merge_df <- bind_rows(df, rep2, rep3) + + +#Function to convert the time from report file (format Xd XXm XXs) to minutes +convert_time <- function(x){ + parts <- strsplit(x, " ") + list_durations <- character(0) + for( part in parts ) { + sec <- 0 + minute <- 0 + hours <- 0 + mili <- 0 + for ( single in part ) { + if (grepl('\\d+s', single)) #for seconds + { + sec <- as.numeric(substr(single,1,nchar(single)-1)) + sec <- sec/60 + sec <- as.numeric(format(round(sec, 3), nsmall = 3)) + } + if (grepl('\\d+m$', single)) #for minutes + { + minute <- as.numeric(substr(single,1,nchar(single)-1)) + minute <- as.numeric(format(round(minute, 2), nsmall = 2)) + } + if (grepl('\\d+h', single)) #for days + { + hours <- as.numeric(substr(single,1,nchar(single)-1)) + hours <- hours*60 + hours <- as.numeric(format(round(hours, 2), nsmall = 2)) + } + + if (grepl('\\d+ms', single)) #for milliseconds, may be equal to 0 already + { + mili <- as.numeric(substr(single,1,nchar(single)-2)) + if (mili > 0) { + mili <- (mili/1000)/60 + mili <- as.numeric(format(round(mili, 4), nsmall = 4)) + } + } + } + new_duration <- (hours+minute+sec+mili) + list_durations <- append(list_durations,new_duration) + } + return(list_durations) +} +#convert the time for both duration (from submit of the process to end) and realtime (only process time) +list_dur <- convert_time(merge_df$duration) +list_realtime <- convert_time(merge_df$realtime) + +# list_dur_para <- convert_time(para_test$duration) +# list_realtime_para <- convert_time(para_test$realtime) + +merge_df$duration_minutes <- list_dur +merge_df$duration_minutes <- as.numeric(merge_df$duration_minutes) + +# para_test$duration_minutes <- list_dur_para +# para_test$duration_minutes <- as.numeric(para_test$duration_minutes) + +merge_df$realtime_minutes <- list_realtime +merge_df$realtime_minutes <- as.numeric(merge_df$realtime_minutes) + +# para_test$realtime_minutes <- list_realtime_para +# para_test$realtime_minutes <- as.numeric(para_test$realtime_minutes) + +#mean and standard deviation, but in the end not used in this script +val <- merge_df %>% group_by(name) %>% summarise(moy = mean(duration_minutes), ect = sd(duration_minutes)) + +#add workflow column (either hicpro or hicstuff) +workflow <- character(0) +for (x in merge_df$file) { + full <- strsplit(x, split = "/")[[1]][4] + w <- sapply(strsplit(full, split = "_"), "[", 1) + workflow <- append(workflow,w) +} +merge_df$workflow <- workflow + +# workflow <- character(0) +# for (x in para_test$duration) { +# w <- "hicstuff" +# workflow <- append(workflow,w) +# } +# para_test$workflow <- workflow + +#strip process names : only last part, remove the workflow:subworkflow:process structure +processes <- character(0) +for (x in merge_df$name) { + process <- lapply(strsplit(x, split = ":"), tail, n = 1L) + processes <- append(processes, as.character(process)) +} +merge_df$name <- processes + +# processes <- character(0) +# for (x in para_test$name) { +# process <- lapply(strsplit(x, split = ":"), tail, n = 1L) +# processes <- append(processes, as.character(process)) +# } +# para_test$name <- processes + +#shorten the process names : removes the parenthesis with the data +#works if you don't care mixing all data for one process ! +short_process <- character(0) +for (x in merge_df$name) { + short <- strsplit(x, split = "\\(")[[1]][1] + short_process <- append(short_process, as.character(short)) +} +merge_df$name <- short_process + +# short_process <- character(0) +# for (x in para_test$name) { +# short <- strsplit(x, split = "\\(")[[1]][1] +# short_process <- append(short_process, as.character(short)) +# } +# para_test$name <- short_process + +#strip file names, to know which "conformation" it is (previous format : conformation(hic[stuff|pro]_aligntype_filtertype/pipeline_info/myfile)) +#keeps first part before "/" +filenames <- character(0) +for (x in merge_df$file) { + file_short_name <- strsplit(x, split = "/")[[1]][4] + filenames <- append(filenames,file_short_name) +} +merge_df$file <- filenames + +# filenames <- character(0) +# for (x in para_test$duration) { +# file_short_name <- "parasplit" +# filenames <- append(filenames,file_short_name) +# } +# para_test$file <- filenames + +#used to add a particular shape for specific processes, not used in the end +important_processes <- c(15, 16, 17, 18, 7, 1, 1, 1, 1, 1, 1, 1, 1,1,1,1,1,9,1,2,5,6,1,1,1,1,1,15,11,1,1,1,1,12,1,1,1,1,1) + +# merge_df <- bind_rows(merge_df, para_test) + +#define categories for easier plot reading, n is the rest +categories <- character(0) +for (x in merge_df$name) { + if (grepl('\\w*ALIGN\\w*', x) || grepl('\\w*TRIM\\w*', x) || grepl('MERGE_BOWTIE2', x)) { + categories <- append(categories, "align") + } + else if (grepl('\\w*COOLER\\w*', x)) { + categories <- append(categories, "cooler") + } + else if (grepl('\\w*PAIRS\\w*', x)) { + categories<- append(categories, "pairs") + } + else if (grepl('\\w*MATRIX\\w*', x) || grepl('\\w*BUILD_CONTACT\\w*', x) || grepl('\\w*ICE_NORM\\w*', x)) { + categories<- append(categories, "matrix") + } + else if (grepl('\\w*FILTER\\w*', x) || grepl('\\w*PICARD\\w*', x) || grepl('\\w*SAMTOOLS\\w*', x)) { + categories<- append(categories, "filter") + } + else if (grepl('\\w*CUTSITE\\w*', x) || grepl('\\w*PARASPLIT\\w*', x)) { + categories<- append(categories, "parasplit") + } + else if (grepl('\\w*SAMPLESHEET\\w*', x) || (grepl('BOWTIE2_BUILD', x) || grepl('\\w*GETCHROM\\w*', x) || grepl('\\w*GET_RESTRIC\\w*', x))) { + categories<- append(categories, "data_prep") + } + else { + categories <- append(categories, "n") + } +} +# categories +merge_df$categorie <- categories + +print(levels(as.factor(merge_df$name))) +#order the processes manually in order of the pipeline +#may fail if there's more options than I defined here, may make the order false because it depends of the alphabetical order of the processes +ordered_processes <- tibble(levels(as.factor(merge_df$name))) +order <- c(18, 7, 2, 20, 21, 24, 23, 26, 22, 25, 30, 31, 32, 3, 5, 19, 19, 9, 4, 28, 29, 7, 6, 1, 27) +ordered_processes <- ordered_processes %>% add_column(order = order) +ordered_processes <- rename(ordered_processes, name = `levels(as.factor(merge_df$name))`) + +merge_df <- left_join(merge_df, ordered_processes, by= c("name" = "name")) + +categories_colors <- c("#F8766D", "#CD9600", "#7CAE00", "#00BE67", "#00BFC4", "#00A9FF", "#aeb6bf", "#C77CFF") + +# pdf("process_time.pdf") +#2 first plots for both duration time and realtime +#Then realtime whitout x labels for better visibility + +ggplot(merge_df, aes(x = fct_reorder(name, order), y = duration_minutes, color = categorie)) + + geom_boxplot() + + scale_shape_manual(values = important_processes) + + scale_color_manual(values=categories_colors) + + facet_wrap(~ file, ncol = 7) + + xlab("Processes") + + ylab("Duration in minutes") + + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())+ + ggtitle("Duration time of each process") + + scale_y_log10() + + theme_bw() + +ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color = categorie)) + + geom_boxplot() + + scale_shape_manual(values = important_processes) + + scale_color_manual(values=categories_colors) + + facet_wrap(~ file, ncol = 4) + + xlab("Processes") + + ylab("Duration in minutes") + + ggtitle("Real execution time of each process") + + scale_y_log10() + + theme_bw() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + +ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color = categorie)) + + geom_boxplot() + + scale_shape_manual(values = important_processes) + + scale_color_manual(values=categories_colors) + + facet_wrap(~ file, ncol = 7) + + xlab("Processes") + + ylab("Duration in minutes") + + ggtitle("Real execution time of each process") + + scale_y_log10() + + theme_bw() + + theme(axis.title.x=element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank()) + +#Cumulative sum of the realtime of the processes for each conformation + +merge_df.ordered <- merge_df %>% + group_by(file) %>% + arrange(order, .by_group = TRUE) %>% + mutate(cum_time = cumsum(realtime_minutes)) + +ggplot(merge_df.ordered, aes(x = fct_reorder(name, order), y = cum_time, group = file)) + + geom_line(aes(color = file)) + + xlab("Processes") + + ylab("Duration in minutes") + + ggtitle("Duration per conformation") + + theme_bw() + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + + geom_point(aes(fill = categorie), shape = 21, size = 2.5) + + scale_fill_manual(values = categories_colors) + + scale_y_continuous(breaks = seq(0, 2300, by = 200)) + + geom_line(data = ~filter(.x, file == "hicstuff_parasplit_nofilter"), color = "black", linetype="dotted") + + geom_line(data = ~filter(.x, file == "hicstuff_parasplit_nofilter_local"), color = "black", linetype="dotted") + +# dev.off() + +####################################### +# # +# PLOTS ON NUMBER OF CONTACTS # +# # +####################################### + +#get the 3 replicates matrices for all conformation +listFiles <- read.csv("/home/mcroiset/HiC/test_parasplit/matrices_rep0_bis.txt", sep = "\n", header = FALSE) +# listFiles2 <- read.csv("/home/mcroiset/HiC/test_parasplit/matrices_rep1_bis.txt", sep = "\n") +# listFiles3 <- read.csv("/home/mcroiset/HiC/test_parasplit/matrices_rep2_bis.txt", sep = "\n") + +names(listFiles) <- "File" +names(listFiles2) <- "File" +names(listFiles3) <- "File" + +# merge_lf <- bind_rows(listFiles, listFiles2, listFiles3) +merge_lf <- listFiles + +#set the dataframe with the file and the number of contact (= number of line in the raw matrix) +df.counts <- tibble(File = character(), Counts = numeric()) +for (filenames in merge_lf) { + filenames <- lapply(strsplit(filenames, split = "../../"), tail, n = 1L) + filenames <- paste("/home/mcroiset/HiC/", filenames ,sep = "") + for (f in filenames) { + count <- read_log(f, col_names = F)$X5[1] + # count <- determine_nlines(f) + df.counts <- add_row(df.counts, File = f, Counts = count) + # print(f) + # print(count) + } +} + +#df.counts$File + +#keep only AD281 data, because I had previous results in the same folder +#If you have different data, change the code below +df.filtered <- df.counts %>% + filter(grepl('AD281',File)) +#df.filtered$File + +#strip the file names, same as earlier +filenames <- character(0) +for (x in df.filtered$File) { + file_short_name <- strsplit(x, split = "/")[[1]][8] + if (grepl('matrix', file_short_name)) { + file_short_name <- strsplit(x, split = "/")[[1]][6] + file_short_name <- paste(file_short_name,"_",strsplit(x, split = "/")[[1]][5]) + } + filenames <- append(filenames,file_short_name) +} +#filenames +df.filtered$File <- filenames +#df.filtered + +#get the workflow, same as earlier +workflow <- character(0) +for (x in df.filtered$File) { + w <- strsplit(x, split = "_")[[1]][1] + workflow <- append(workflow,w) +} +df.filtered$workflow <- workflow + +#get the alignment type (normal | iteralign | cutsite) +align <- character(0) +for (x in df.filtered$File) { + a <- strsplit(x, split = "_")[[1]][2] + align <- append(align,a) +} +df.filtered$align <- align + +#get the filtering options (nofilter | nofilter_local | filter | filter_local | filter_pcr | filter_pcr_local | both) +filtering <- character(0) +for (x in df.filtered$File) { + f <- strsplit(x, split = "_")[[1]][3] + f2 <- strsplit(x, split = "_")[[1]][4] + f3 <- strsplit(x, split = "_")[[1]][5] + f4 <- strsplit(x, split = "_")[[1]][6] + ff <- paste(f, f2, f3 ,sep = "_") + if (strsplit(ff, split = "_")[[1]][2] == "NA") { + ff <- strsplit(ff, split = "_")[[1]][1] + } + else if (strsplit(ff, split = "_")[[1]][3] == "NA") { + ff1 <- strsplit(ff, split = "_")[[1]][1] + ff2 <- strsplit(ff, split = "_")[[1]][2] + ff <- paste(ff1, ff2 ,sep = "_") + } + filtering <- append(filtering,ff) +} +df.filtered$filtering <- filtering + +#exclude picard filtering because really low +df.excludePicard <- df.filtered %>% + filter(!grepl('filter_picard', filtering)) + +#plot the number of contacts per conformation, with or without Picard filtering +ggplot(df.filtered, aes(x = reorder(File,Counts), y = Counts, shape = align, color = filtering)) + + geom_point(size = 5) + + scale_y_log10() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ + ggtitle("Number of contacts") + +ggplot(df.excludePicard, aes(x = reorder(File,Counts), y = Counts, shape = filtering, color = align)) + + geom_point(size = 5) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ + ggtitle("Number of contacts")+ + scale_shape_manual(values = c(15,16,17,18)) + + xlab("") + + ylab("Counts") + + +ggplot(df.excludePicard, aes(x = reorder(File,Counts) , y = Counts, fill = align)) + + geom_col(size = 2) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ + ggtitle("Number of contacts")+ + xlab("") + + ylab("Counts") + + scale_fill_hue(c=45, l=80) + + facet_wrap(~ factor(filtering, levels = c("nofilter", "filter", "filter_pcr", "filter_filterpcr")), scales = "free")+ + scale_y_continuous(limits = c(0, 8e+07)) + + theme(axis.text.x=element_blank(), + axis.ticks.x=element_blank()) +