Skip to content
Snippets Groups Projects
Verified Commit 6122cdf9 authored by Mia Croiset's avatar Mia Croiset
Browse files

add scripts plots and cooler

parent bcfa6f99
No related branches found
No related tags found
No related merge requests found
# import core packages
import warnings
warnings.filterwarnings("ignore")
from itertools import combinations
import os
from pathlib import Path
# import semi-core packages
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.backends.backend_pdf import PdfPages
# %matplotlib inline
plt.style.use('seaborn-v0_8-poster')
import numpy as np
import pandas as pd
from multiprocessing import Pool
# import open2c libraries
import bioframe
import cooler
import cooltools
# count cpus
num_cpus = os.getenv('SLURM_CPUS_PER_TASK')
if not num_cpus:
num_cpus = os.cpu_count()
num_cpus = int(num_cpus)
# path to your mcool files
files = list(Path("./rerunParasplitNov24/tests_parasplit_nobalance").glob("*/contact_maps/cool/*.mcool"))
filesBalanced = list(Path("./rerunParasplitNov24/tests_parasplit_balance").glob("*/contact_maps/cool/*.mcool"))
# print(files)
allfiles = list()
for f in files:
# select a resolution from your mcool
c = cooler.Cooler(str(f)+"::resolutions/1000")
name = str(f).split("/")[2]
#quick sort to exclude bowtie2 --very-sensitive-local option results
if 'local' not in name:
#print(name)
grp = (name, c)
allfiles.append(grp)
# same for balanced files
""" for f in filesBalanced:
c = cooler.Cooler(str(f)+"::resolutions/4000")
name = str(f).split("/")[2]
if 'local' not in name:
print(name)
grp = (name, c)
allfiles.append(grp) """
# print(allfiles)
# print(len(allfiles))
# combine each case with the other to compare each conditions
comb = list(combinations(allfiles, 2))
#print(len(comb))
#print(comb[0][1])
# have a matrix to make the list of chromosomes for all
c = comb[0][1][1]
# quick test to check the combinations
""" for x in range(len(comb)):
print(str(x) + " -> "+ str(comb[x])) """
### to make a list of chromosome start/ends in bins:
chromstarts = []
for i in c.chromnames:
print(f'{i} : {c.extent(i)}')
chromstarts.append(c.extent(i)[0])
# print(chromstarts)
#list of chromosomes arms for S288c
chr_arms = bioframe.make_viewframe([("chr1",0,151465,"chr1_p"),("chr1",151465,230218,"chr1_q"),("chr2",0,238207,"chr2_p"),("chr2",238207,813184,"chr2_q"),("chr3",0,114385,"chr3_p"),("chr3",114385,316620,"chr3_q"),("chr4",0,449711,"chr4_p"),("chr4",449711,1531933,"chr4_q"),("chr5",0,153771,"chr5_p"),("chr5",153771,578613,"chr5_q"),("chr6",0,148510,"chr6_p"),("chr6",148510,270161,"chr6_q"),("chr7",0,497038,"chr7_p"),("chr7",497038,1090940,"chr7_q"),("chr8",0,105703,"chr8_p"),("chr8",105703,562643,"chr8_q"),("chr9",0,355629,"chr9_p"),("chr9",355629,439888,"chr9_q"),("chr10",0,436425,"chr10_p"),("chr10",436425,745751,"chr10_q"),("chr11",0,440246,"chr11_p"),("chr11",440246,666816,"chr11_q"),("chr12",0,150947,"chr12_p"),("chr12",150947,1078177,"chr12_q"),("chr13",0,268031,"chr13_p"),("chr13",268031,924431,"chr13_q"),("chr14",0,628758,"chr14_p"),("chr14",628758,784333,"chr14_q"),("chr15",0,326702,"chr15_p"),("chr15",326702,1091291,"chr15_q"),("chr16",0,555957,"chr16_p"),("chr16",555957,948066,"chr16_q"),("2_micron",0,6318,"2_micron"),("mitochondrion",0,85779,"mitochondrion")])
chr_arms = chr_arms[chr_arms.chrom.isin(c.chromnames)].reset_index(drop=True)
# print(chr_arms)
def plotGraph(diff,c,n1,n2):
f, ax = plt.subplots(
figsize=(7,6))
im = ax.matshow((np.log2(diff)), cmap = "bwr", vmin=-1, vmax=1);
plt.colorbar(im ,fraction=0.046, pad=0.04)
plt.title("Diff %s / %s" %(n1, n2), fontsize=14)
ax.set(xticks=chromstarts, xticklabels=c.chromnames)
ax.xaxis.set_label_position('bottom')
ax.tick_params(axis='x', labelsize=8, rotation=45, labelbottom=True, labeltop=False)
return f
def plotGraphChr3(diff,c,n1,n2):
f, ax = plt.subplots(
figsize=(7,6))
im = ax.matshow((np.log2(diff)), cmap = "bwr", vmin=-1, vmax=1);
plt.colorbar(im ,fraction=0.046, pad=0.04)
plt.title("Diff (chr3) %s / %s" %(n1, n2), fontsize=14)
ax.xaxis.set_label_position('bottom')
ax.tick_params(axis='x', labelsize=8, rotation=45, labelbottom=True, labeltop=False)
return f
# plot only given combinations
n1 = comb[62][1][0]
n2 = comb[54][0][0]
c1_tot = np.sum(comb[62][1][1].matrix(balance=False).fetch('chr3'))
c2_tot = np.sum(comb[54][0][1].matrix(balance=False).fetch('chr3'))
# to normalize to 1
vec1 = np.array([c1_tot])
vec2 = np.array([c2_tot])
diff = ((comb[62][1][1].matrix(balance=False).fetch('chr3')/vec1)/(comb[54][0][1].matrix(balance=False).fetch('chr3')/vec2))
for x in range(0, diff.shape[0]):
for y in range(0, diff.shape[1]):
if x == y:
diff[x,y] = 0
plot = plotGraphChr3(diff, c, n1, n2)
plot.savefig('chr3_2c.png')
plt.close(plot)
# plot all combinations
""" n=1
for x in comb:
# print(x)
print("Figure %i" % n)
c1_tot = np.sum(x[0][1].matrix(balance=False)[:])
c2_tot = np.sum(x[1][1].matrix(balance=False)[:])
vec1 = np.array([c1_tot])
vec2 = np.array([c2_tot])
diff = ((x[0][1].matrix(balance=False)[:]/vec1)/(x[1][1].matrix(balance=False)[:]/vec2))
n1 = x[0][0]
n2 = x[1][0]
# pp = PdfPages('%s.pdf' % n)
n = n+1
for x in range(0, diff.shape[0]):
for y in range(0, diff.shape[1]):
if x == y:
diff[x,y] = 0
plot = plotGraph(diff, c, n1, n2)
plot.savefig('%s.png' % n)
plt.close(plot)
# pp.savefig(plot)
# pp.close()
# print(diff) """
# plot all combinations but just chr3
""" n=1
for x in comb:
# print(x)
print("Figure %i" % n)
c1_tot = np.sum(x[0][1].matrix(balance=False).fetch('chr3'))
c2_tot = np.sum(x[1][1].matrix(balance=False).fetch('chr3'))
vec1 = np.array([c1_tot])
vec2 = np.array([c2_tot])
diff = ((x[0][1].matrix(balance=False).fetch('chr3'))/(x[1][1].matrix(balance=False).fetch('chr3')))
n1 = x[0][0]
n2 = x[1][0]
# pp = PdfPages('chr3_%s_noNorm.pdf' % n)
n = n+1
for x in range(0, diff.shape[0]):
for y in range(0, diff.shape[1]):
if x == y:
diff[x,y] = 0
plot = plotGraphChr3(diff, c, n1, n2)
plot.savefig('chr3_%s_noNorm.png' % n)
plt.close(plot)
# pp.savefig(plot)
# pp.close()
# print(diff) """
print("\n=========\n")
#print(allfiles[0])
# cvd == contacts-vs-distance
cvd = cooltools.expected_cis(
clr=c,
view_df=chr_arms,
smooth=True,
aggregate_smoothed=False,
nproc=1,
clr_weight_name=None
# nproc=num_cpus #if you do not have multiple cores available, set to 1
)
# plot contact vs distance for each chromosomal arm
""" print(cvd.head(4))
f, ax = plt.subplots(1,1)
for region in chr_arms['name']:
pp = PdfPages('dist_normal_nofilter_%s.pdf' % region)
ax.loglog(
cvd['dist_bp'].loc[cvd['region1']==region],
cvd['contact_frequency'].loc[cvd['region1']==region],
)
ax.set(
xlabel='separation, bp',
ylabel='IC contact frequency')
ax.set_aspect(1.0)
ax.grid(lw=0.5)
pp.savefig(f)
pp.close()
"""
install.packages("tidyverse")
install.packages("ggplot2")
install.packages("LaF")
library(tidyverse)
library(ggplot2)
library(dplyr)
library(forcats)
library(LaF)
library(scales)
#################################
# #
# PLOTS ON PROCESS TIME #
# #
#################################
#Get the 3 replicates
df <- read.csv("/home/mcroiset/HiC/test_parasplit/recap_rep0.txt", sep = "\t", header = TRUE)
rep2 <- read.csv("/home/mcroiset/HiC/test_parasplit/recap_rep1.txt", sep = "\t", header = TRUE)
rep3 <- read.csv("/home/mcroiset/HiC/test_parasplit/recap_rep2.txt", sep = "\t", header = TRUE)
# para_test <- read.csv("/home/mcroiset/HiC/test_parasplit/hicstuff_parasplit/pipeline_info/execution_trace_2024-08-30_11-40-06.txt", sep = "\t", header= TRUE)
merge_df <- bind_rows(df, rep2, rep3)
#Function to convert the time from report file (format Xd XXm XXs) to minutes
convert_time <- function(x){
parts <- strsplit(x, " ")
list_durations <- character(0)
for( part in parts ) {
sec <- 0
minute <- 0
hours <- 0
mili <- 0
for ( single in part ) {
if (grepl('\\d+s', single)) #for seconds
{
sec <- as.numeric(substr(single,1,nchar(single)-1))
sec <- sec/60
sec <- as.numeric(format(round(sec, 3), nsmall = 3))
}
if (grepl('\\d+m$', single)) #for minutes
{
minute <- as.numeric(substr(single,1,nchar(single)-1))
minute <- as.numeric(format(round(minute, 2), nsmall = 2))
}
if (grepl('\\d+h', single)) #for days
{
hours <- as.numeric(substr(single,1,nchar(single)-1))
hours <- hours*60
hours <- as.numeric(format(round(hours, 2), nsmall = 2))
}
if (grepl('\\d+ms', single)) #for milliseconds, may be equal to 0 already
{
mili <- as.numeric(substr(single,1,nchar(single)-2))
if (mili > 0) {
mili <- (mili/1000)/60
mili <- as.numeric(format(round(mili, 4), nsmall = 4))
}
}
}
new_duration <- (hours+minute+sec+mili)
list_durations <- append(list_durations,new_duration)
}
return(list_durations)
}
#convert the time for both duration (from submit of the process to end) and realtime (only process time)
list_dur <- convert_time(merge_df$duration)
list_realtime <- convert_time(merge_df$realtime)
# list_dur_para <- convert_time(para_test$duration)
# list_realtime_para <- convert_time(para_test$realtime)
merge_df$duration_minutes <- list_dur
merge_df$duration_minutes <- as.numeric(merge_df$duration_minutes)
# para_test$duration_minutes <- list_dur_para
# para_test$duration_minutes <- as.numeric(para_test$duration_minutes)
merge_df$realtime_minutes <- list_realtime
merge_df$realtime_minutes <- as.numeric(merge_df$realtime_minutes)
# para_test$realtime_minutes <- list_realtime_para
# para_test$realtime_minutes <- as.numeric(para_test$realtime_minutes)
#mean and standard deviation, but in the end not used in this script
val <- merge_df %>% group_by(name) %>% summarise(moy = mean(duration_minutes), ect = sd(duration_minutes))
#add workflow column (either hicpro or hicstuff)
workflow <- character(0)
for (x in merge_df$file) {
full <- strsplit(x, split = "/")[[1]][4]
w <- sapply(strsplit(full, split = "_"), "[", 1)
workflow <- append(workflow,w)
}
merge_df$workflow <- workflow
# workflow <- character(0)
# for (x in para_test$duration) {
# w <- "hicstuff"
# workflow <- append(workflow,w)
# }
# para_test$workflow <- workflow
#strip process names : only last part, remove the workflow:subworkflow:process structure
processes <- character(0)
for (x in merge_df$name) {
process <- lapply(strsplit(x, split = ":"), tail, n = 1L)
processes <- append(processes, as.character(process))
}
merge_df$name <- processes
# processes <- character(0)
# for (x in para_test$name) {
# process <- lapply(strsplit(x, split = ":"), tail, n = 1L)
# processes <- append(processes, as.character(process))
# }
# para_test$name <- processes
#shorten the process names : removes the parenthesis with the data
#works if you don't care mixing all data for one process !
short_process <- character(0)
for (x in merge_df$name) {
short <- strsplit(x, split = "\\(")[[1]][1]
short_process <- append(short_process, as.character(short))
}
merge_df$name <- short_process
# short_process <- character(0)
# for (x in para_test$name) {
# short <- strsplit(x, split = "\\(")[[1]][1]
# short_process <- append(short_process, as.character(short))
# }
# para_test$name <- short_process
#strip file names, to know which "conformation" it is (previous format : conformation(hic[stuff|pro]_aligntype_filtertype/pipeline_info/myfile))
#keeps first part before "/"
filenames <- character(0)
for (x in merge_df$file) {
file_short_name <- strsplit(x, split = "/")[[1]][4]
filenames <- append(filenames,file_short_name)
}
merge_df$file <- filenames
# filenames <- character(0)
# for (x in para_test$duration) {
# file_short_name <- "parasplit"
# filenames <- append(filenames,file_short_name)
# }
# para_test$file <- filenames
#used to add a particular shape for specific processes, not used in the end
important_processes <- c(15, 16, 17, 18, 7, 1, 1, 1, 1, 1, 1, 1, 1,1,1,1,1,9,1,2,5,6,1,1,1,1,1,15,11,1,1,1,1,12,1,1,1,1,1)
# merge_df <- bind_rows(merge_df, para_test)
#define categories for easier plot reading, n is the rest
categories <- character(0)
for (x in merge_df$name) {
if (grepl('\\w*ALIGN\\w*', x) || grepl('\\w*TRIM\\w*', x) || grepl('MERGE_BOWTIE2', x)) {
categories <- append(categories, "align")
}
else if (grepl('\\w*COOLER\\w*', x)) {
categories <- append(categories, "cooler")
}
else if (grepl('\\w*PAIRS\\w*', x)) {
categories<- append(categories, "pairs")
}
else if (grepl('\\w*MATRIX\\w*', x) || grepl('\\w*BUILD_CONTACT\\w*', x) || grepl('\\w*ICE_NORM\\w*', x)) {
categories<- append(categories, "matrix")
}
else if (grepl('\\w*FILTER\\w*', x) || grepl('\\w*PICARD\\w*', x) || grepl('\\w*SAMTOOLS\\w*', x)) {
categories<- append(categories, "filter")
}
else if (grepl('\\w*CUTSITE\\w*', x) || grepl('\\w*PARASPLIT\\w*', x)) {
categories<- append(categories, "parasplit")
}
else if (grepl('\\w*SAMPLESHEET\\w*', x) || (grepl('BOWTIE2_BUILD', x) || grepl('\\w*GETCHROM\\w*', x) || grepl('\\w*GET_RESTRIC\\w*', x))) {
categories<- append(categories, "data_prep")
}
else {
categories <- append(categories, "n")
}
}
# categories
merge_df$categorie <- categories
print(levels(as.factor(merge_df$name)))
#order the processes manually in order of the pipeline
#may fail if there's more options than I defined here, may make the order false because it depends of the alphabetical order of the processes
ordered_processes <- tibble(levels(as.factor(merge_df$name)))
order <- c(18, 7, 2, 20, 21, 24, 23, 26, 22, 25, 30, 31, 32, 3, 5, 19, 19, 9, 4, 28, 29, 7, 6, 1, 27)
ordered_processes <- ordered_processes %>% add_column(order = order)
ordered_processes <- rename(ordered_processes, name = `levels(as.factor(merge_df$name))`)
merge_df <- left_join(merge_df, ordered_processes, by= c("name" = "name"))
categories_colors <- c("#F8766D", "#CD9600", "#7CAE00", "#00BE67", "#00BFC4", "#00A9FF", "#aeb6bf", "#C77CFF")
# pdf("process_time.pdf")
#2 first plots for both duration time and realtime
#Then realtime whitout x labels for better visibility
ggplot(merge_df, aes(x = fct_reorder(name, order), y = duration_minutes, color = categorie)) +
geom_boxplot() +
scale_shape_manual(values = important_processes) +
scale_color_manual(values=categories_colors) +
facet_wrap(~ file, ncol = 7) +
xlab("Processes") +
ylab("Duration in minutes") +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())+
ggtitle("Duration time of each process") +
scale_y_log10() +
theme_bw()
ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color = categorie)) +
geom_boxplot() +
scale_shape_manual(values = important_processes) +
scale_color_manual(values=categories_colors) +
facet_wrap(~ file, ncol = 4) +
xlab("Processes") +
ylab("Duration in minutes") +
ggtitle("Real execution time of each process") +
scale_y_log10() +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color = categorie)) +
geom_boxplot() +
scale_shape_manual(values = important_processes) +
scale_color_manual(values=categories_colors) +
facet_wrap(~ file, ncol = 7) +
xlab("Processes") +
ylab("Duration in minutes") +
ggtitle("Real execution time of each process") +
scale_y_log10() +
theme_bw() +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
#Cumulative sum of the realtime of the processes for each conformation
merge_df.ordered <- merge_df %>%
group_by(file) %>%
arrange(order, .by_group = TRUE) %>%
mutate(cum_time = cumsum(realtime_minutes))
ggplot(merge_df.ordered, aes(x = fct_reorder(name, order), y = cum_time, group = file)) +
geom_line(aes(color = file)) +
xlab("Processes") +
ylab("Duration in minutes") +
ggtitle("Duration per conformation") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
geom_point(aes(fill = categorie), shape = 21, size = 2.5) +
scale_fill_manual(values = categories_colors) +
scale_y_continuous(breaks = seq(0, 2300, by = 200)) +
geom_line(data = ~filter(.x, file == "hicstuff_parasplit_nofilter"), color = "black", linetype="dotted") +
geom_line(data = ~filter(.x, file == "hicstuff_parasplit_nofilter_local"), color = "black", linetype="dotted")
# dev.off()
#######################################
# #
# PLOTS ON NUMBER OF CONTACTS #
# #
#######################################
#get the 3 replicates matrices for all conformation
listFiles <- read.csv("/home/mcroiset/HiC/test_parasplit/matrices_rep0_bis.txt", sep = "\n", header = FALSE)
# listFiles2 <- read.csv("/home/mcroiset/HiC/test_parasplit/matrices_rep1_bis.txt", sep = "\n")
# listFiles3 <- read.csv("/home/mcroiset/HiC/test_parasplit/matrices_rep2_bis.txt", sep = "\n")
names(listFiles) <- "File"
names(listFiles2) <- "File"
names(listFiles3) <- "File"
# merge_lf <- bind_rows(listFiles, listFiles2, listFiles3)
merge_lf <- listFiles
#set the dataframe with the file and the number of contact (= number of line in the raw matrix)
df.counts <- tibble(File = character(), Counts = numeric())
for (filenames in merge_lf) {
filenames <- lapply(strsplit(filenames, split = "../../"), tail, n = 1L)
filenames <- paste("/home/mcroiset/HiC/", filenames ,sep = "")
for (f in filenames) {
count <- read_log(f, col_names = F)$X5[1]
# count <- determine_nlines(f)
df.counts <- add_row(df.counts, File = f, Counts = count)
# print(f)
# print(count)
}
}
#df.counts$File
#keep only AD281 data, because I had previous results in the same folder
#If you have different data, change the code below
df.filtered <- df.counts %>%
filter(grepl('AD281',File))
#df.filtered$File
#strip the file names, same as earlier
filenames <- character(0)
for (x in df.filtered$File) {
file_short_name <- strsplit(x, split = "/")[[1]][8]
if (grepl('matrix', file_short_name)) {
file_short_name <- strsplit(x, split = "/")[[1]][6]
file_short_name <- paste(file_short_name,"_",strsplit(x, split = "/")[[1]][5])
}
filenames <- append(filenames,file_short_name)
}
#filenames
df.filtered$File <- filenames
#df.filtered
#get the workflow, same as earlier
workflow <- character(0)
for (x in df.filtered$File) {
w <- strsplit(x, split = "_")[[1]][1]
workflow <- append(workflow,w)
}
df.filtered$workflow <- workflow
#get the alignment type (normal | iteralign | cutsite)
align <- character(0)
for (x in df.filtered$File) {
a <- strsplit(x, split = "_")[[1]][2]
align <- append(align,a)
}
df.filtered$align <- align
#get the filtering options (nofilter | nofilter_local | filter | filter_local | filter_pcr | filter_pcr_local | both)
filtering <- character(0)
for (x in df.filtered$File) {
f <- strsplit(x, split = "_")[[1]][3]
f2 <- strsplit(x, split = "_")[[1]][4]
f3 <- strsplit(x, split = "_")[[1]][5]
f4 <- strsplit(x, split = "_")[[1]][6]
ff <- paste(f, f2, f3 ,sep = "_")
if (strsplit(ff, split = "_")[[1]][2] == "NA") {
ff <- strsplit(ff, split = "_")[[1]][1]
}
else if (strsplit(ff, split = "_")[[1]][3] == "NA") {
ff1 <- strsplit(ff, split = "_")[[1]][1]
ff2 <- strsplit(ff, split = "_")[[1]][2]
ff <- paste(ff1, ff2 ,sep = "_")
}
filtering <- append(filtering,ff)
}
df.filtered$filtering <- filtering
#exclude picard filtering because really low
df.excludePicard <- df.filtered %>%
filter(!grepl('filter_picard', filtering))
#plot the number of contacts per conformation, with or without Picard filtering
ggplot(df.filtered, aes(x = reorder(File,Counts), y = Counts, shape = align, color = filtering)) +
geom_point(size = 5) +
scale_y_log10() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
ggtitle("Number of contacts")
ggplot(df.excludePicard, aes(x = reorder(File,Counts), y = Counts, shape = filtering, color = align)) +
geom_point(size = 5) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
ggtitle("Number of contacts")+
scale_shape_manual(values = c(15,16,17,18)) +
xlab("") +
ylab("Counts")
ggplot(df.excludePicard, aes(x = reorder(File,Counts) , y = Counts, fill = align)) +
geom_col(size = 2) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
ggtitle("Number of contacts")+
xlab("") +
ylab("Counts") +
scale_fill_hue(c=45, l=80) +
facet_wrap(~ factor(filtering, levels = c("nofilter", "filter", "filter_pcr", "filter_filterpcr")), scales = "free")+
scale_y_continuous(limits = c(0, 8e+07)) +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment