Skip to content
Snippets Groups Projects
Verified Commit 09826a09 authored by Mia Croiset's avatar Mia Croiset
Browse files

comment script

parent 71005133
No related branches found
No related tags found
No related merge requests found
...@@ -13,12 +13,16 @@ library(LaF) ...@@ -13,12 +13,16 @@ library(LaF)
# # # #
################################# #################################
#Get the 3 replicates
df <- read.csv("/home/mcroiset/HiC/benchmark/recap.txt", sep = "\t", header = TRUE) df <- read.csv("/home/mcroiset/HiC/benchmark/recap.txt", sep = "\t", header = TRUE)
rep2 <- read.csv("/home/mcroiset/HiC/benchmark/recap_rep2.txt", sep = "\t", header = TRUE) rep2 <- read.csv("/home/mcroiset/HiC/benchmark/recap_rep2.txt", sep = "\t", header = TRUE)
rep3 <- read.csv("/home/mcroiset/HiC/benchmark/recap_rep3.txt", sep = "\t", header = TRUE) rep3 <- read.csv("/home/mcroiset/HiC/benchmark/recap_rep3.txt", sep = "\t", header = TRUE)
merge_df <- bind_rows(df, rep2, rep3) merge_df <- bind_rows(df, rep2, rep3)
#Function to convert the time from report file (format Xd XXm XXs) to minutes
convert_time <- function(x){ convert_time <- function(x){
parts <- strsplit(x, " ") parts <- strsplit(x, " ")
list_durations <- character(0) list_durations <- character(0)
...@@ -28,22 +32,26 @@ convert_time <- function(x){ ...@@ -28,22 +32,26 @@ convert_time <- function(x){
hours <- 0 hours <- 0
mili <- 0 mili <- 0
for ( single in part ) { for ( single in part ) {
if (grepl('\\d+s', single)) { if (grepl('\\d+s', single)) #for seconds
{
sec <- as.numeric(substr(single,1,nchar(single)-1)) sec <- as.numeric(substr(single,1,nchar(single)-1))
sec <- sec/60 sec <- sec/60
sec <- as.numeric(format(round(sec, 3), nsmall = 3)) sec <- as.numeric(format(round(sec, 3), nsmall = 3))
} }
if (grepl('\\d+m$', single)) { if (grepl('\\d+m$', single)) #for minutes
{
minute <- as.numeric(substr(single,1,nchar(single)-1)) minute <- as.numeric(substr(single,1,nchar(single)-1))
minute <- as.numeric(format(round(minute, 2), nsmall = 2)) minute <- as.numeric(format(round(minute, 2), nsmall = 2))
} }
if (grepl('\\d+h', single)) { if (grepl('\\d+h', single)) #for days
{
hours <- as.numeric(substr(single,1,nchar(single)-1)) hours <- as.numeric(substr(single,1,nchar(single)-1))
hours <- hours*60 hours <- hours*60
hours <- as.numeric(format(round(hours, 2), nsmall = 2)) hours <- as.numeric(format(round(hours, 2), nsmall = 2))
} }
if (grepl('\\d+ms', single)) { if (grepl('\\d+ms', single)) #for milliseconds, may be equal to 0 already
{
mili <- as.numeric(substr(single,1,nchar(single)-2)) mili <- as.numeric(substr(single,1,nchar(single)-2))
if (mili > 0) { if (mili > 0) {
mili <- (mili/1000)/60 mili <- (mili/1000)/60
...@@ -56,7 +64,7 @@ convert_time <- function(x){ ...@@ -56,7 +64,7 @@ convert_time <- function(x){
} }
return(list_durations) return(list_durations)
} }
#convert the time for both duration (from submit of the process to end) and realtime (only process time)
list_dur <- convert_time(merge_df$duration) list_dur <- convert_time(merge_df$duration)
list_realtime <- convert_time(merge_df$realtime) list_realtime <- convert_time(merge_df$realtime)
...@@ -66,8 +74,10 @@ merge_df$duration_minutes <- as.numeric(merge_df$duration_minutes) ...@@ -66,8 +74,10 @@ merge_df$duration_minutes <- as.numeric(merge_df$duration_minutes)
merge_df$realtime_minutes <- list_realtime merge_df$realtime_minutes <- list_realtime
merge_df$realtime_minutes <- as.numeric(merge_df$realtime_minutes) merge_df$realtime_minutes <- as.numeric(merge_df$realtime_minutes)
#mean and standard deviation, but in the end not used in this script
val <- merge_df %>% group_by(name) %>% summarise(moy = mean(duration_minutes), ect = sd(duration_minutes)) val <- merge_df %>% group_by(name) %>% summarise(moy = mean(duration_minutes), ect = sd(duration_minutes))
#add workflow column (either hicpro or hicstuff)
workflow <- character(0) workflow <- character(0)
for (x in merge_df$file) { for (x in merge_df$file) {
full <- sapply(strsplit(x, split = "/"), "[", 2) full <- sapply(strsplit(x, split = "/"), "[", 2)
...@@ -76,6 +86,7 @@ for (x in merge_df$file) { ...@@ -76,6 +86,7 @@ for (x in merge_df$file) {
} }
merge_df$workflow <- workflow merge_df$workflow <- workflow
#strip process names : only last part, remove the workflow:subworkflow:process structure
processes <- character(0) processes <- character(0)
for (x in merge_df$name) { for (x in merge_df$name) {
process <- lapply(strsplit(x, split = ":"), tail, n = 1L) process <- lapply(strsplit(x, split = ":"), tail, n = 1L)
...@@ -83,6 +94,8 @@ for (x in merge_df$name) { ...@@ -83,6 +94,8 @@ for (x in merge_df$name) {
} }
merge_df$name <- processes merge_df$name <- processes
#shorten the process names : removes the parenthesis with the data
#works if you don't care mixing all data for one process !
short_process <- character(0) short_process <- character(0)
for (x in merge_df$name) { for (x in merge_df$name) {
short <- strsplit(x, split = "\\(")[[1]][1] short <- strsplit(x, split = "\\(")[[1]][1]
...@@ -90,6 +103,8 @@ for (x in merge_df$name) { ...@@ -90,6 +103,8 @@ for (x in merge_df$name) {
} }
merge_df$name <- short_process merge_df$name <- short_process
#strip file names, to know which "conformation" it is (previous format : conformation(hic[stuff|pro]_aligntype_filtertype/pipeline_info/myfile))
#keeps first part before "/"
filenames <- character(0) filenames <- character(0)
for (x in merge_df$file) { for (x in merge_df$file) {
file_short_name <- sapply(strsplit(x, split = "/"), "[", 2) file_short_name <- sapply(strsplit(x, split = "/"), "[", 2)
...@@ -97,8 +112,10 @@ for (x in merge_df$file) { ...@@ -97,8 +112,10 @@ for (x in merge_df$file) {
} }
merge_df$file <- filenames merge_df$file <- filenames
#used to add a particular shape for specific processes, not used in the end
important_processes <- c(15, 16, 17, 18, 7, 1, 1, 1, 1, 1, 1, 1, 1,1,1,1,1,9,1,2,5,6,1,1,1,1,1,15,11,1,1,1,1,12,1,1,1,1,1) important_processes <- c(15, 16, 17, 18, 7, 1, 1, 1, 1, 1, 1, 1, 1,1,1,1,1,9,1,2,5,6,1,1,1,1,1,15,11,1,1,1,1,12,1,1,1,1,1)
#define categories for easier plot reading, n is the rest
categories <- character(0) categories <- character(0)
for (x in merge_df$name) { for (x in merge_df$name) {
if (grepl('\\w*ALIGN\\w*', x) || grepl('\\w*TRIM\\w*', x) || grepl('MERGE_BOWTIE2', x)) { if (grepl('\\w*ALIGN\\w*', x) || grepl('\\w*TRIM\\w*', x) || grepl('MERGE_BOWTIE2', x)) {
...@@ -129,7 +146,9 @@ for (x in merge_df$name) { ...@@ -129,7 +146,9 @@ for (x in merge_df$name) {
# categories # categories
merge_df$categorie <- categories merge_df$categorie <- categories
print(levels(as.factor(merge_df$name))) #print(levels(as.factor(merge_df$name)))
#order the processes manually in order of the pipeline
#may fail if there's more options than I definded here, may make the order false because it depends of the alphabetical order of the processes
ordered_processes <- tibble(levels(as.factor(merge_df$name))) ordered_processes <- tibble(levels(as.factor(merge_df$name)))
order <- c(18, 7, 8, 7, 2, 20, 20, 21, 14, 24, 23, 27, 22, 25, 30, 31, 32, 3, 6, 5, 19, 13, 19, 9, 4, 15, 28, 29, 18, 21, 7, 8, 17, 16, 33, 11, 1, 10, 12, 27, 8) order <- c(18, 7, 8, 7, 2, 20, 20, 21, 14, 24, 23, 27, 22, 25, 30, 31, 32, 3, 6, 5, 19, 13, 19, 9, 4, 15, 28, 29, 18, 21, 7, 8, 17, 16, 33, 11, 1, 10, 12, 27, 8)
ordered_processes <- ordered_processes %>% add_column(order = order) ordered_processes <- ordered_processes %>% add_column(order = order)
...@@ -140,6 +159,8 @@ merge_df <- left_join(merge_df, ordered_processes, by= c("name" = "name")) ...@@ -140,6 +159,8 @@ merge_df <- left_join(merge_df, ordered_processes, by= c("name" = "name"))
categories_colors <- c("#F8766D", "#CD9600", "#7CAE00", "#00BE67", "#00BFC4", "#00A9FF", "#aeb6bf", "#C77CFF") categories_colors <- c("#F8766D", "#CD9600", "#7CAE00", "#00BE67", "#00BFC4", "#00A9FF", "#aeb6bf", "#C77CFF")
# pdf("process_time.pdf") # pdf("process_time.pdf")
#2 first plots for both duration time and realtime
#Then realtime whitout x labels for better visibility
ggplot(merge_df, aes(x = fct_reorder(name, order), y = duration_minutes, color = categorie)) + ggplot(merge_df, aes(x = fct_reorder(name, order), y = duration_minutes, color = categorie)) +
geom_boxplot() + geom_boxplot() +
...@@ -179,6 +200,7 @@ ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color = ...@@ -179,6 +200,7 @@ ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color =
axis.text.x=element_blank(), axis.text.x=element_blank(),
axis.ticks.x=element_blank()) axis.ticks.x=element_blank())
#Cumulative sum of the realtime of the processes for each conformation
merge_df.ordered <- merge_df %>% merge_df.ordered <- merge_df %>%
group_by(file) %>% group_by(file) %>%
...@@ -204,7 +226,7 @@ ggplot(merge_df.ordered, aes(x = fct_reorder(name, order), y = cum_time, group = ...@@ -204,7 +226,7 @@ ggplot(merge_df.ordered, aes(x = fct_reorder(name, order), y = cum_time, group =
# # # #
####################################### #######################################
#get the 3 replicates matrices for all conformation
listFiles <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices.txt", sep = "\n") listFiles <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices.txt", sep = "\n")
listFiles2 <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices_rep2.txt", sep = "\n") listFiles2 <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices_rep2.txt", sep = "\n")
listFiles3 <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices_rep3.txt", sep = "\n") listFiles3 <- read.csv("/home/mcroiset/HiC/hic/benchmark/matrices_rep3.txt", sep = "\n")
...@@ -215,6 +237,7 @@ names(listFiles3) <- "File" ...@@ -215,6 +237,7 @@ names(listFiles3) <- "File"
merge_lf <- bind_rows(listFiles, listFiles2, listFiles3) merge_lf <- bind_rows(listFiles, listFiles2, listFiles3)
#set the dataframe with the file and the number of contact (= number of line in the raw matrix)
df.counts <- tibble(File = character(), Counts = numeric()) df.counts <- tibble(File = character(), Counts = numeric())
for (filenames in merge_lf) { for (filenames in merge_lf) {
filenames <- lapply(strsplit(filenames, split = "../../"), tail, n = 1L) filenames <- lapply(strsplit(filenames, split = "../../"), tail, n = 1L)
...@@ -227,11 +250,15 @@ for (filenames in merge_lf) { ...@@ -227,11 +250,15 @@ for (filenames in merge_lf) {
} }
} }
df.counts$File #df.counts$File
#keep only AD281 data, because I had previous results in the same folder
#If you have different data, change the code below
df.filtered <- df.counts %>% df.filtered <- df.counts %>%
filter(grepl('AD281.1000', File) | grepl('AD281_abs',File)) filter(grepl('AD281.1000', File) | grepl('AD281_abs',File))
#df.filtered$File #df.filtered$File
#strip the file names, same as earlier
filenames <- character(0) filenames <- character(0)
for (x in df.filtered$File) { for (x in df.filtered$File) {
file_short_name <- strsplit(x, split = "/")[[1]][7] file_short_name <- strsplit(x, split = "/")[[1]][7]
...@@ -241,6 +268,7 @@ for (x in df.filtered$File) { ...@@ -241,6 +268,7 @@ for (x in df.filtered$File) {
df.filtered$File <- filenames df.filtered$File <- filenames
#df.filtered #df.filtered
#get the workflow, same as earlier
workflow <- character(0) workflow <- character(0)
for (x in df.filtered$File) { for (x in df.filtered$File) {
w <- strsplit(x, split = "_")[[1]][1] w <- strsplit(x, split = "_")[[1]][1]
...@@ -248,6 +276,7 @@ for (x in df.filtered$File) { ...@@ -248,6 +276,7 @@ for (x in df.filtered$File) {
} }
df.filtered$workflow <- workflow df.filtered$workflow <- workflow
#get the alignment type (normal | iteralign | cutsite)
align <- character(0) align <- character(0)
for (x in df.filtered$File) { for (x in df.filtered$File) {
a <- strsplit(x, split = "_")[[1]][2] a <- strsplit(x, split = "_")[[1]][2]
...@@ -255,6 +284,7 @@ for (x in df.filtered$File) { ...@@ -255,6 +284,7 @@ for (x in df.filtered$File) {
} }
df.filtered$align <- align df.filtered$align <- align
#get the filtering options (nofilter | filter_events | filter_pcr | filter_and_filter_pcr | filter_picard)
filtering <- character(0) filtering <- character(0)
for (x in df.filtered$File) { for (x in df.filtered$File) {
f <- strsplit(x, split = "_")[[1]][3] f <- strsplit(x, split = "_")[[1]][3]
...@@ -274,9 +304,11 @@ for (x in df.filtered$File) { ...@@ -274,9 +304,11 @@ for (x in df.filtered$File) {
} }
df.filtered$filtering <- filtering df.filtered$filtering <- filtering
#exclude picard filtering because really low
df.excludePicard <- df.filtered %>% df.excludePicard <- df.filtered %>%
filter(!grepl('filter_picard', filtering)) filter(!grepl('filter_picard', filtering))
#plot the number of contacts per conformation, with or without Picard filtering
ggplot(df.filtered, aes(x = reorder(File,Counts), y = Counts, shape = align, color = filtering)) + ggplot(df.filtered, aes(x = reorder(File,Counts), y = Counts, shape = align, color = filtering)) +
geom_point(size = 5) + geom_point(size = 5) +
scale_y_log10() + scale_y_log10() +
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment