add scripts plots and cooler

6122cdf9 · Mia Croiset · bcfa6f99 · 6122cdf9 · 6122cdf9
Verified Commit 6122cdf9 authored 4 months ago by Mia Croiset
--- a/benchmark/loadCooler.py
+++ b/benchmark/loadCooler.py
+# import core packages
+import warnings
+warnings.filterwarnings("ignore")
+from itertools import combinations
+import os
+from pathlib import Path
+
+
+# import semi-core packages
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.backends.backend_pdf import PdfPages
+# %matplotlib inline
+plt.style.use('seaborn-v0_8-poster')
+import numpy as np
+import pandas as pd
+from multiprocessing import Pool
+
+# import open2c libraries
+import bioframe
+
+import cooler
+import cooltools
+
+# count cpus
+num_cpus = os.getenv('SLURM_CPUS_PER_TASK')
+if not num_cpus:
+    num_cpus = os.cpu_count()
+num_cpus = int(num_cpus)
+
+
+
+# path to your mcool files
+files = list(Path("./rerunParasplitNov24/tests_parasplit_nobalance").glob("*/contact_maps/cool/*.mcool"))
+filesBalanced = list(Path("./rerunParasplitNov24/tests_parasplit_balance").glob("*/contact_maps/cool/*.mcool"))
+
+# print(files)
+
+allfiles = list()
+
+for f in files:
+    # select a resolution from your mcool
+    c = cooler.Cooler(str(f)+"::resolutions/1000")
+    name = str(f).split("/")[2]
+    #quick sort to exclude bowtie2 --very-sensitive-local option results
+    if 'local' not in name:
+        #print(name)
+        grp = (name, c)
+        allfiles.append(grp)
+
+# same for balanced files
+""" for f in filesBalanced:
+    c = cooler.Cooler(str(f)+"::resolutions/4000")
+    name = str(f).split("/")[2]
+    if 'local' not in name:
+        print(name)
+        grp = (name, c)
+        allfiles.append(grp) """
+
+# print(allfiles)
+# print(len(allfiles))
+
+# combine each case with the other to compare each conditions
+comb = list(combinations(allfiles, 2))
+#print(len(comb))
+#print(comb[0][1])
+
+# have a matrix to make the list of chromosomes for all
+c = comb[0][1][1]
+# quick test to check the combinations
+""" for x in range(len(comb)):
+    print(str(x) + " -> "+ str(comb[x])) """
+
+### to make a list of chromosome start/ends in bins:
+chromstarts = []
+for i in c.chromnames:
+    print(f'{i} : {c.extent(i)}')
+    chromstarts.append(c.extent(i)[0])
+
+# print(chromstarts)
+#list of chromosomes arms for S288c
+chr_arms = bioframe.make_viewframe([("chr1",0,151465,"chr1_p"),("chr1",151465,230218,"chr1_q"),("chr2",0,238207,"chr2_p"),("chr2",238207,813184,"chr2_q"),("chr3",0,114385,"chr3_p"),("chr3",114385,316620,"chr3_q"),("chr4",0,449711,"chr4_p"),("chr4",449711,1531933,"chr4_q"),("chr5",0,153771,"chr5_p"),("chr5",153771,578613,"chr5_q"),("chr6",0,148510,"chr6_p"),("chr6",148510,270161,"chr6_q"),("chr7",0,497038,"chr7_p"),("chr7",497038,1090940,"chr7_q"),("chr8",0,105703,"chr8_p"),("chr8",105703,562643,"chr8_q"),("chr9",0,355629,"chr9_p"),("chr9",355629,439888,"chr9_q"),("chr10",0,436425,"chr10_p"),("chr10",436425,745751,"chr10_q"),("chr11",0,440246,"chr11_p"),("chr11",440246,666816,"chr11_q"),("chr12",0,150947,"chr12_p"),("chr12",150947,1078177,"chr12_q"),("chr13",0,268031,"chr13_p"),("chr13",268031,924431,"chr13_q"),("chr14",0,628758,"chr14_p"),("chr14",628758,784333,"chr14_q"),("chr15",0,326702,"chr15_p"),("chr15",326702,1091291,"chr15_q"),("chr16",0,555957,"chr16_p"),("chr16",555957,948066,"chr16_q"),("2_micron",0,6318,"2_micron"),("mitochondrion",0,85779,"mitochondrion")])
+chr_arms = chr_arms[chr_arms.chrom.isin(c.chromnames)].reset_index(drop=True)
+# print(chr_arms)
+
+
+def plotGraph(diff,c,n1,n2):
+    f, ax = plt.subplots(
+        figsize=(7,6))
+    im = ax.matshow((np.log2(diff)), cmap = "bwr", vmin=-1, vmax=1);
+    plt.colorbar(im ,fraction=0.046, pad=0.04)
+    plt.title("Diff %s / %s" %(n1, n2), fontsize=14)
+    ax.set(xticks=chromstarts, xticklabels=c.chromnames)
+    ax.xaxis.set_label_position('bottom')
+    ax.tick_params(axis='x', labelsize=8, rotation=45, labelbottom=True, labeltop=False)
+    return f
+
+def plotGraphChr3(diff,c,n1,n2):
+    f, ax = plt.subplots(
+        figsize=(7,6))
+    im = ax.matshow((np.log2(diff)), cmap = "bwr", vmin=-1, vmax=1);
+    plt.colorbar(im ,fraction=0.046, pad=0.04)
+    plt.title("Diff (chr3) %s / %s" %(n1, n2), fontsize=14)
+    ax.xaxis.set_label_position('bottom')
+    ax.tick_params(axis='x', labelsize=8, rotation=45, labelbottom=True, labeltop=False)
+    return f
+
+# plot only given combinations
+n1 = comb[62][1][0]
+n2 = comb[54][0][0]
+c1_tot = np.sum(comb[62][1][1].matrix(balance=False).fetch('chr3'))
+c2_tot = np.sum(comb[54][0][1].matrix(balance=False).fetch('chr3'))
+# to normalize to 1
+vec1 = np.array([c1_tot])
+vec2 = np.array([c2_tot])
+diff = ((comb[62][1][1].matrix(balance=False).fetch('chr3')/vec1)/(comb[54][0][1].matrix(balance=False).fetch('chr3')/vec2))
+for x in range(0, diff.shape[0]):
+    for y in range(0, diff.shape[1]):
+        if x == y:
+            diff[x,y] = 0
+plot = plotGraphChr3(diff, c, n1, n2)
+plot.savefig('chr3_2c.png')
+plt.close(plot)
+
+# plot all combinations
+""" n=1
+for x in comb: 
+    # print(x)
+    print("Figure %i" % n)
+    c1_tot = np.sum(x[0][1].matrix(balance=False)[:])
+    c2_tot = np.sum(x[1][1].matrix(balance=False)[:])
+    vec1 = np.array([c1_tot])
+    vec2 = np.array([c2_tot])
+    diff = ((x[0][1].matrix(balance=False)[:]/vec1)/(x[1][1].matrix(balance=False)[:]/vec2))
+    n1 = x[0][0]
+    n2 = x[1][0]
+    # pp = PdfPages('%s.pdf' % n)
+    n = n+1
+    for x in range(0, diff.shape[0]):
+        for y in range(0, diff.shape[1]):
+            if x == y:
+                diff[x,y] = 0
+    plot = plotGraph(diff, c, n1, n2)
+    plot.savefig('%s.png' % n)
+    plt.close(plot)
+    # pp.savefig(plot)
+    # pp.close()
+    # print(diff) """
+
+# plot all combinations but just chr3
+""" n=1
+for x in comb: 
+    # print(x)
+    print("Figure %i" % n)
+    c1_tot = np.sum(x[0][1].matrix(balance=False).fetch('chr3'))
+    c2_tot = np.sum(x[1][1].matrix(balance=False).fetch('chr3'))
+    vec1 = np.array([c1_tot])
+    vec2 = np.array([c2_tot])
+    diff = ((x[0][1].matrix(balance=False).fetch('chr3'))/(x[1][1].matrix(balance=False).fetch('chr3')))
+    n1 = x[0][0]
+    n2 = x[1][0]
+    # pp = PdfPages('chr3_%s_noNorm.pdf' % n)
+    n = n+1
+    for x in range(0, diff.shape[0]):
+        for y in range(0, diff.shape[1]):
+            if x == y:
+                diff[x,y] = 0
+    plot = plotGraphChr3(diff, c, n1, n2)
+    plot.savefig('chr3_%s_noNorm.png' % n)
+    plt.close(plot)
+    # pp.savefig(plot)
+    # pp.close()
+    # print(diff) """
+
+
+print("\n=========\n")
+#print(allfiles[0])
+
+# cvd == contacts-vs-distance
+cvd = cooltools.expected_cis(
+    clr=c,
+    view_df=chr_arms,
+    smooth=True,
+    aggregate_smoothed=False,
+    nproc=1,
+    clr_weight_name=None
+    # nproc=num_cpus #if you do not have multiple cores available, set to 1
+)
+
+
+# plot contact vs distance for each chromosomal arm
+""" print(cvd.head(4))
+
+f, ax = plt.subplots(1,1)
+
+for region in chr_arms['name']:
+    pp = PdfPages('dist_normal_nofilter_%s.pdf' % region)
+    ax.loglog(
+        cvd['dist_bp'].loc[cvd['region1']==region],
+        cvd['contact_frequency'].loc[cvd['region1']==region],
+    )
+    ax.set(
+        xlabel='separation, bp',
+        ylabel='IC contact frequency')
+    ax.set_aspect(1.0)
+    ax.grid(lw=0.5)
+    pp.savefig(f)
+    pp.close()
+"""
--- a/benchmark/plotsBenchmarkParasplit.R
+++ b/benchmark/plotsBenchmarkParasplit.R
+install.packages("tidyverse")
+install.packages("ggplot2")
+install.packages("LaF")
+library(tidyverse)
+library(ggplot2)
+library(dplyr)
+library(forcats)
+library(LaF)
+library(scales)
+
+#################################
+#                               #
+#   PLOTS ON PROCESS TIME       #
+#                               #
+#################################
+
+#Get the 3 replicates
+
+df <- read.csv("/home/mcroiset/HiC/test_parasplit/recap_rep0.txt", sep = "\t", header = TRUE)
+rep2 <- read.csv("/home/mcroiset/HiC/test_parasplit/recap_rep1.txt", sep = "\t", header = TRUE)
+rep3 <- read.csv("/home/mcroiset/HiC/test_parasplit/recap_rep2.txt", sep = "\t", header = TRUE)
+
+# para_test <- read.csv("/home/mcroiset/HiC/test_parasplit/hicstuff_parasplit/pipeline_info/execution_trace_2024-08-30_11-40-06.txt", sep = "\t", header= TRUE)
+
+merge_df <- bind_rows(df, rep2, rep3)
+
+
+#Function to convert the time from report file (format Xd XXm XXs) to minutes
+convert_time <- function(x){
+  parts <- strsplit(x, " ")
+  list_durations <- character(0)
+  for( part in parts ) {
+    sec <- 0
+    minute <- 0
+    hours <- 0
+    mili <- 0
+    for ( single in part ) {
+      if (grepl('\\d+s', single)) #for seconds 
+      {
+        sec <- as.numeric(substr(single,1,nchar(single)-1))
+        sec <- sec/60
+        sec <- as.numeric(format(round(sec, 3), nsmall = 3))
+      }
+      if (grepl('\\d+m$', single)) #for minutes 
+      {
+        minute <- as.numeric(substr(single,1,nchar(single)-1))
+        minute <- as.numeric(format(round(minute, 2), nsmall = 2))
+      }
+      if (grepl('\\d+h', single)) #for days 
+      {
+        hours <- as.numeric(substr(single,1,nchar(single)-1))
+        hours <- hours*60
+        hours <- as.numeric(format(round(hours, 2), nsmall = 2))
+      }
+      
+      if (grepl('\\d+ms', single)) #for milliseconds, may be equal to 0 already 
+      {
+        mili <- as.numeric(substr(single,1,nchar(single)-2))
+        if (mili > 0) {
+          mili <- (mili/1000)/60
+          mili <- as.numeric(format(round(mili, 4), nsmall = 4))
+        }
+      }
+    }
+    new_duration <- (hours+minute+sec+mili)
+    list_durations <- append(list_durations,new_duration)
+  }
+  return(list_durations)
+}
+#convert the time for both duration (from submit of the process to end) and realtime (only process time)
+list_dur <- convert_time(merge_df$duration)
+list_realtime <- convert_time(merge_df$realtime)
+
+# list_dur_para <- convert_time(para_test$duration)
+# list_realtime_para <- convert_time(para_test$realtime)
+
+merge_df$duration_minutes <- list_dur
+merge_df$duration_minutes <- as.numeric(merge_df$duration_minutes)
+
+# para_test$duration_minutes <- list_dur_para
+# para_test$duration_minutes <- as.numeric(para_test$duration_minutes)
+
+merge_df$realtime_minutes <- list_realtime
+merge_df$realtime_minutes <- as.numeric(merge_df$realtime_minutes)
+
+# para_test$realtime_minutes <- list_realtime_para
+# para_test$realtime_minutes <- as.numeric(para_test$realtime_minutes)
+
+#mean and standard deviation, but in the end not used in this script
+val <- merge_df %>% group_by(name) %>% summarise(moy = mean(duration_minutes), ect = sd(duration_minutes))
+
+#add workflow column (either hicpro or hicstuff)
+workflow <- character(0)
+for (x in merge_df$file) {
+  full <- strsplit(x, split = "/")[[1]][4]
+  w <- sapply(strsplit(full, split = "_"), "[", 1)
+  workflow <- append(workflow,w)
+}
+merge_df$workflow <- workflow
+
+# workflow <- character(0)
+# for (x in para_test$duration) {
+#   w <- "hicstuff"
+#   workflow <- append(workflow,w)
+# }
+# para_test$workflow <- workflow
+
+#strip process names : only last part, remove the workflow:subworkflow:process structure
+processes <- character(0)
+for (x in merge_df$name) {
+  process <- lapply(strsplit(x, split = ":"), tail, n = 1L)
+  processes <- append(processes, as.character(process))
+}
+merge_df$name <- processes
+
+# processes <- character(0)
+# for (x in para_test$name) {
+#   process <- lapply(strsplit(x, split = ":"), tail, n = 1L)
+#   processes <- append(processes, as.character(process))
+# }
+# para_test$name <- processes
+
+#shorten the process names : removes the parenthesis with the data
+#works if you don't care mixing all data for one process !
+short_process <- character(0)
+for (x in merge_df$name) {
+  short <- strsplit(x, split = "\\(")[[1]][1]
+  short_process <- append(short_process, as.character(short))
+}
+merge_df$name <- short_process
+
+# short_process <- character(0)
+# for (x in para_test$name) {
+#   short <- strsplit(x, split = "\\(")[[1]][1]
+#   short_process <- append(short_process, as.character(short))
+# }
+# para_test$name <- short_process
+
+#strip file names, to know which "conformation" it is (previous format : conformation(hic[stuff|pro]_aligntype_filtertype/pipeline_info/myfile))
+#keeps first part before "/"
+filenames <- character(0)
+for (x in merge_df$file) {
+  file_short_name <- strsplit(x, split = "/")[[1]][4]
+  filenames <- append(filenames,file_short_name)
+}
+merge_df$file <- filenames
+
+# filenames <- character(0)
+# for (x in para_test$duration) {
+#   file_short_name <- "parasplit"
+#   filenames <- append(filenames,file_short_name)
+# }
+# para_test$file <- filenames
+
+#used to add a particular shape for specific processes, not used in the end
+important_processes <- c(15, 16, 17, 18, 7, 1, 1, 1, 1, 1, 1, 1, 1,1,1,1,1,9,1,2,5,6,1,1,1,1,1,15,11,1,1,1,1,12,1,1,1,1,1)
+
+# merge_df <- bind_rows(merge_df, para_test)
+
+#define categories for easier plot reading, n is the rest
+categories <- character(0)
+for (x in merge_df$name) {
+  if (grepl('\\w*ALIGN\\w*', x) || grepl('\\w*TRIM\\w*', x) || grepl('MERGE_BOWTIE2', x)) {
+    categories <- append(categories, "align")
+  }
+  else if (grepl('\\w*COOLER\\w*', x)) {
+    categories <- append(categories, "cooler")
+  }
+  else if (grepl('\\w*PAIRS\\w*', x)) {
+    categories<- append(categories, "pairs")
+  }
+  else if (grepl('\\w*MATRIX\\w*', x) || grepl('\\w*BUILD_CONTACT\\w*', x) || grepl('\\w*ICE_NORM\\w*', x)) {
+    categories<- append(categories, "matrix")
+  }
+  else if (grepl('\\w*FILTER\\w*', x) || grepl('\\w*PICARD\\w*', x) || grepl('\\w*SAMTOOLS\\w*', x)) {
+    categories<- append(categories, "filter")
+  }
+  else if (grepl('\\w*CUTSITE\\w*', x) || grepl('\\w*PARASPLIT\\w*', x)) {
+    categories<- append(categories, "parasplit")
+  }
+  else if (grepl('\\w*SAMPLESHEET\\w*', x) || (grepl('BOWTIE2_BUILD', x) || grepl('\\w*GETCHROM\\w*', x) || grepl('\\w*GET_RESTRIC\\w*', x))) {
+    categories<- append(categories, "data_prep")
+  }
+  else {
+    categories <- append(categories, "n")
+  }
+}
+# categories
+merge_df$categorie <- categories
+
+print(levels(as.factor(merge_df$name)))
+#order the processes manually in order of the pipeline
+#may fail if there's more options than I defined here, may make the order false because it depends of the alphabetical order of the processes
+ordered_processes <- tibble(levels(as.factor(merge_df$name)))
+order <- c(18, 7, 2, 20, 21, 24, 23, 26, 22, 25, 30, 31, 32, 3, 5, 19, 19, 9, 4, 28, 29, 7, 6, 1, 27)
+ordered_processes <- ordered_processes %>% add_column(order = order)
+ordered_processes <- rename(ordered_processes, name = `levels(as.factor(merge_df$name))`)
+
+merge_df <- left_join(merge_df, ordered_processes, by= c("name" = "name"))
+
+categories_colors <- c("#F8766D", "#CD9600", "#7CAE00", "#00BE67", "#00BFC4", "#00A9FF",  "#aeb6bf", "#C77CFF")
+
+# pdf("process_time.pdf")
+#2 first plots for both duration time and realtime
+#Then realtime whitout x labels for better visibility
+
+ggplot(merge_df, aes(x = fct_reorder(name, order), y = duration_minutes, color = categorie)) +
+  geom_boxplot() +
+  scale_shape_manual(values = important_processes) +
+  scale_color_manual(values=categories_colors) +
+  facet_wrap(~ file, ncol = 7) +
+  xlab("Processes") +
+  ylab("Duration in minutes") +
+  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())+ 
+  ggtitle("Duration time of each process") +
+  scale_y_log10() +
+  theme_bw()
+
+ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color = categorie)) +
+  geom_boxplot() +
+  scale_shape_manual(values = important_processes) +
+  scale_color_manual(values=categories_colors) +
+  facet_wrap(~ file, ncol = 4) +
+  xlab("Processes") +
+  ylab("Duration in minutes") +
+  ggtitle("Real execution time of each process") +
+  scale_y_log10() +
+  theme_bw() + 
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
+
+ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color = categorie)) +
+  geom_boxplot() +
+  scale_shape_manual(values = important_processes) +
+  scale_color_manual(values=categories_colors) +
+  facet_wrap(~ file, ncol = 7) +
+  xlab("Processes") +
+  ylab("Duration in minutes") +
+  ggtitle("Real execution time of each process") +
+  scale_y_log10() +
+  theme_bw() +
+  theme(axis.title.x=element_blank(),
+        axis.text.x=element_blank(),
+        axis.ticks.x=element_blank())
+
+#Cumulative sum of the realtime of the processes for each conformation
+
+merge_df.ordered <- merge_df %>%
+  group_by(file) %>%
+  arrange(order, .by_group = TRUE) %>%
+  mutate(cum_time = cumsum(realtime_minutes))
+
+ggplot(merge_df.ordered, aes(x = fct_reorder(name, order), y = cum_time, group = file)) +
+  geom_line(aes(color = file)) +
+  xlab("Processes") +
+  ylab("Duration in minutes") +
+  ggtitle("Duration per conformation") +
+  theme_bw() + 
+  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
+  geom_point(aes(fill = categorie), shape = 21, size = 2.5) +
+  scale_fill_manual(values = categories_colors) +
+  scale_y_continuous(breaks = seq(0, 2300, by = 200)) +
+  geom_line(data = ~filter(.x, file == "hicstuff_parasplit_nofilter"), color = "black", linetype="dotted") +
+  geom_line(data = ~filter(.x, file == "hicstuff_parasplit_nofilter_local"), color = "black", linetype="dotted")
+
+# dev.off()
+
+#######################################
+#                                     #
+#   PLOTS ON NUMBER OF CONTACTS       #
+#                                     #
+#######################################
+
+#get the 3 replicates matrices for all conformation
+listFiles <- read.csv("/home/mcroiset/HiC/test_parasplit/matrices_rep0_bis.txt", sep = "\n", header = FALSE)
+# listFiles2 <- read.csv("/home/mcroiset/HiC/test_parasplit/matrices_rep1_bis.txt", sep = "\n")
+# listFiles3 <- read.csv("/home/mcroiset/HiC/test_parasplit/matrices_rep2_bis.txt", sep = "\n")
+
+names(listFiles) <- "File"
+names(listFiles2) <- "File"
+names(listFiles3) <- "File"
+
+# merge_lf <- bind_rows(listFiles, listFiles2, listFiles3)
+merge_lf <- listFiles
+
+#set the dataframe with the file and the number of contact (= number of line in the raw matrix)
+df.counts <- tibble(File = character(), Counts = numeric())
+for (filenames in merge_lf) {
+  filenames <- lapply(strsplit(filenames, split = "../../"), tail, n = 1L)
+  filenames <- paste("/home/mcroiset/HiC/", filenames ,sep = "")
+  for (f in filenames) {
+    count <- read_log(f, col_names = F)$X5[1]
+    # count <- determine_nlines(f)
+    df.counts <- add_row(df.counts, File = f, Counts = count)
+    # print(f)
+    # print(count)
+  }
+}
+
+#df.counts$File
+
+#keep only AD281 data, because I had previous results in the same folder
+#If you have different data, change the code below
+df.filtered <- df.counts %>%
+  filter(grepl('AD281',File))
+#df.filtered$File
+
+#strip the file names, same as earlier
+filenames <- character(0)
+for (x in df.filtered$File) {
+  file_short_name <- strsplit(x, split = "/")[[1]][8]
+  if (grepl('matrix', file_short_name)) {
+    file_short_name <- strsplit(x, split = "/")[[1]][6]
+    file_short_name <- paste(file_short_name,"_",strsplit(x, split = "/")[[1]][5])
+  }
+  filenames <- append(filenames,file_short_name)
+}
+#filenames
+df.filtered$File <- filenames
+#df.filtered
+
+#get the workflow, same as earlier
+workflow <- character(0)
+for (x in df.filtered$File) {
+  w <- strsplit(x, split = "_")[[1]][1]
+  workflow <- append(workflow,w)
+}
+df.filtered$workflow <- workflow
+
+#get the alignment type (normal | iteralign | cutsite)
+align <- character(0)
+for (x in df.filtered$File) {
+  a <- strsplit(x, split = "_")[[1]][2]
+  align <- append(align,a)
+}
+df.filtered$align <- align
+
+#get the filtering options (nofilter | nofilter_local | filter | filter_local | filter_pcr | filter_pcr_local | both)
+filtering <- character(0)
+for (x in df.filtered$File) {
+  f <- strsplit(x, split = "_")[[1]][3]
+  f2 <- strsplit(x, split = "_")[[1]][4]
+  f3 <- strsplit(x, split = "_")[[1]][5]
+  f4 <- strsplit(x, split = "_")[[1]][6]
+  ff <- paste(f, f2, f3 ,sep = "_")
+  if (strsplit(ff, split = "_")[[1]][2] == "NA") {
+    ff <- strsplit(ff, split = "_")[[1]][1]
+  }
+  else if (strsplit(ff, split = "_")[[1]][3] == "NA") {
+    ff1 <- strsplit(ff, split = "_")[[1]][1]
+    ff2 <- strsplit(ff, split = "_")[[1]][2]
+    ff <- paste(ff1, ff2 ,sep = "_")
+  }
+  filtering <- append(filtering,ff)
+}
+df.filtered$filtering <- filtering
+
+#exclude picard filtering because really low
+df.excludePicard <- df.filtered %>%
+  filter(!grepl('filter_picard', filtering))
+
+#plot the number of contacts per conformation, with or without Picard filtering
+ggplot(df.filtered, aes(x = reorder(File,Counts), y = Counts, shape = align, color = filtering)) +
+  geom_point(size = 5) +
+  scale_y_log10() + 
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ 
+  ggtitle("Number of contacts")
+
+ggplot(df.excludePicard, aes(x = reorder(File,Counts), y = Counts, shape = filtering, color = align)) +
+  geom_point(size = 5) +
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ 
+  ggtitle("Number of contacts")+
+  scale_shape_manual(values = c(15,16,17,18)) +
+  xlab("") +
+  ylab("Counts")
+  
+
+ggplot(df.excludePicard, aes(x = reorder(File,Counts) , y = Counts, fill = align)) +
+  geom_col(size = 2) + 
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+ 
+  ggtitle("Number of contacts")+
+  xlab("") +
+  ylab("Counts") + 
+  scale_fill_hue(c=45, l=80)  +
+  facet_wrap(~ factor(filtering, levels = c("nofilter", "filter", "filter_pcr", "filter_filterpcr")), scales = "free")+
+  scale_y_continuous(limits = c(0, 8e+07)) +
+  theme(axis.text.x=element_blank(),
+        axis.ticks.x=element_blank())
+