Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
hic
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
LBMC
Hub
hic
Commits
6122cdf9
Verified
Commit
6122cdf9
authored
1 month ago
by
Mia Croiset
Browse files
Options
Downloads
Patches
Plain Diff
add scripts plots and cooler
parent
bcfa6f99
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
benchmark/loadCooler.py
+209
-0
209 additions, 0 deletions
benchmark/loadCooler.py
benchmark/plotsBenchmarkParasplit.R
+388
-0
388 additions, 0 deletions
benchmark/plotsBenchmarkParasplit.R
with
597 additions
and
0 deletions
benchmark/loadCooler.py
0 → 100644
+
209
−
0
View file @
6122cdf9
# import core packages
import
warnings
warnings
.
filterwarnings
(
"
ignore
"
)
from
itertools
import
combinations
import
os
from
pathlib
import
Path
# import semi-core packages
import
matplotlib.pyplot
as
plt
from
matplotlib
import
colors
from
matplotlib.backends.backend_pdf
import
PdfPages
# %matplotlib inline
plt
.
style
.
use
(
'
seaborn-v0_8-poster
'
)
import
numpy
as
np
import
pandas
as
pd
from
multiprocessing
import
Pool
# import open2c libraries
import
bioframe
import
cooler
import
cooltools
# count cpus
num_cpus
=
os
.
getenv
(
'
SLURM_CPUS_PER_TASK
'
)
if
not
num_cpus
:
num_cpus
=
os
.
cpu_count
()
num_cpus
=
int
(
num_cpus
)
# path to your mcool files
files
=
list
(
Path
(
"
./rerunParasplitNov24/tests_parasplit_nobalance
"
).
glob
(
"
*/contact_maps/cool/*.mcool
"
))
filesBalanced
=
list
(
Path
(
"
./rerunParasplitNov24/tests_parasplit_balance
"
).
glob
(
"
*/contact_maps/cool/*.mcool
"
))
# print(files)
allfiles
=
list
()
for
f
in
files
:
# select a resolution from your mcool
c
=
cooler
.
Cooler
(
str
(
f
)
+
"
::resolutions/1000
"
)
name
=
str
(
f
).
split
(
"
/
"
)[
2
]
#quick sort to exclude bowtie2 --very-sensitive-local option results
if
'
local
'
not
in
name
:
#print(name)
grp
=
(
name
,
c
)
allfiles
.
append
(
grp
)
# same for balanced files
"""
for f in filesBalanced:
c = cooler.Cooler(str(f)+
"
::resolutions/4000
"
)
name = str(f).split(
"
/
"
)[2]
if
'
local
'
not in name:
print(name)
grp = (name, c)
allfiles.append(grp)
"""
# print(allfiles)
# print(len(allfiles))
# combine each case with the other to compare each conditions
comb
=
list
(
combinations
(
allfiles
,
2
))
#print(len(comb))
#print(comb[0][1])
# have a matrix to make the list of chromosomes for all
c
=
comb
[
0
][
1
][
1
]
# quick test to check the combinations
"""
for x in range(len(comb)):
print(str(x) +
"
->
"
+ str(comb[x]))
"""
### to make a list of chromosome start/ends in bins:
chromstarts
=
[]
for
i
in
c
.
chromnames
:
print
(
f
'
{
i
}
:
{
c
.
extent
(
i
)
}
'
)
chromstarts
.
append
(
c
.
extent
(
i
)[
0
])
# print(chromstarts)
#list of chromosomes arms for S288c
chr_arms
=
bioframe
.
make_viewframe
([(
"
chr1
"
,
0
,
151465
,
"
chr1_p
"
),(
"
chr1
"
,
151465
,
230218
,
"
chr1_q
"
),(
"
chr2
"
,
0
,
238207
,
"
chr2_p
"
),(
"
chr2
"
,
238207
,
813184
,
"
chr2_q
"
),(
"
chr3
"
,
0
,
114385
,
"
chr3_p
"
),(
"
chr3
"
,
114385
,
316620
,
"
chr3_q
"
),(
"
chr4
"
,
0
,
449711
,
"
chr4_p
"
),(
"
chr4
"
,
449711
,
1531933
,
"
chr4_q
"
),(
"
chr5
"
,
0
,
153771
,
"
chr5_p
"
),(
"
chr5
"
,
153771
,
578613
,
"
chr5_q
"
),(
"
chr6
"
,
0
,
148510
,
"
chr6_p
"
),(
"
chr6
"
,
148510
,
270161
,
"
chr6_q
"
),(
"
chr7
"
,
0
,
497038
,
"
chr7_p
"
),(
"
chr7
"
,
497038
,
1090940
,
"
chr7_q
"
),(
"
chr8
"
,
0
,
105703
,
"
chr8_p
"
),(
"
chr8
"
,
105703
,
562643
,
"
chr8_q
"
),(
"
chr9
"
,
0
,
355629
,
"
chr9_p
"
),(
"
chr9
"
,
355629
,
439888
,
"
chr9_q
"
),(
"
chr10
"
,
0
,
436425
,
"
chr10_p
"
),(
"
chr10
"
,
436425
,
745751
,
"
chr10_q
"
),(
"
chr11
"
,
0
,
440246
,
"
chr11_p
"
),(
"
chr11
"
,
440246
,
666816
,
"
chr11_q
"
),(
"
chr12
"
,
0
,
150947
,
"
chr12_p
"
),(
"
chr12
"
,
150947
,
1078177
,
"
chr12_q
"
),(
"
chr13
"
,
0
,
268031
,
"
chr13_p
"
),(
"
chr13
"
,
268031
,
924431
,
"
chr13_q
"
),(
"
chr14
"
,
0
,
628758
,
"
chr14_p
"
),(
"
chr14
"
,
628758
,
784333
,
"
chr14_q
"
),(
"
chr15
"
,
0
,
326702
,
"
chr15_p
"
),(
"
chr15
"
,
326702
,
1091291
,
"
chr15_q
"
),(
"
chr16
"
,
0
,
555957
,
"
chr16_p
"
),(
"
chr16
"
,
555957
,
948066
,
"
chr16_q
"
),(
"
2_micron
"
,
0
,
6318
,
"
2_micron
"
),(
"
mitochondrion
"
,
0
,
85779
,
"
mitochondrion
"
)])
chr_arms
=
chr_arms
[
chr_arms
.
chrom
.
isin
(
c
.
chromnames
)].
reset_index
(
drop
=
True
)
# print(chr_arms)
def
plotGraph
(
diff
,
c
,
n1
,
n2
):
f
,
ax
=
plt
.
subplots
(
figsize
=
(
7
,
6
))
im
=
ax
.
matshow
((
np
.
log2
(
diff
)),
cmap
=
"
bwr
"
,
vmin
=-
1
,
vmax
=
1
);
plt
.
colorbar
(
im
,
fraction
=
0.046
,
pad
=
0.04
)
plt
.
title
(
"
Diff %s / %s
"
%
(
n1
,
n2
),
fontsize
=
14
)
ax
.
set
(
xticks
=
chromstarts
,
xticklabels
=
c
.
chromnames
)
ax
.
xaxis
.
set_label_position
(
'
bottom
'
)
ax
.
tick_params
(
axis
=
'
x
'
,
labelsize
=
8
,
rotation
=
45
,
labelbottom
=
True
,
labeltop
=
False
)
return
f
def
plotGraphChr3
(
diff
,
c
,
n1
,
n2
):
f
,
ax
=
plt
.
subplots
(
figsize
=
(
7
,
6
))
im
=
ax
.
matshow
((
np
.
log2
(
diff
)),
cmap
=
"
bwr
"
,
vmin
=-
1
,
vmax
=
1
);
plt
.
colorbar
(
im
,
fraction
=
0.046
,
pad
=
0.04
)
plt
.
title
(
"
Diff (chr3) %s / %s
"
%
(
n1
,
n2
),
fontsize
=
14
)
ax
.
xaxis
.
set_label_position
(
'
bottom
'
)
ax
.
tick_params
(
axis
=
'
x
'
,
labelsize
=
8
,
rotation
=
45
,
labelbottom
=
True
,
labeltop
=
False
)
return
f
# plot only given combinations
n1
=
comb
[
62
][
1
][
0
]
n2
=
comb
[
54
][
0
][
0
]
c1_tot
=
np
.
sum
(
comb
[
62
][
1
][
1
].
matrix
(
balance
=
False
).
fetch
(
'
chr3
'
))
c2_tot
=
np
.
sum
(
comb
[
54
][
0
][
1
].
matrix
(
balance
=
False
).
fetch
(
'
chr3
'
))
# to normalize to 1
vec1
=
np
.
array
([
c1_tot
])
vec2
=
np
.
array
([
c2_tot
])
diff
=
((
comb
[
62
][
1
][
1
].
matrix
(
balance
=
False
).
fetch
(
'
chr3
'
)
/
vec1
)
/
(
comb
[
54
][
0
][
1
].
matrix
(
balance
=
False
).
fetch
(
'
chr3
'
)
/
vec2
))
for
x
in
range
(
0
,
diff
.
shape
[
0
]):
for
y
in
range
(
0
,
diff
.
shape
[
1
]):
if
x
==
y
:
diff
[
x
,
y
]
=
0
plot
=
plotGraphChr3
(
diff
,
c
,
n1
,
n2
)
plot
.
savefig
(
'
chr3_2c.png
'
)
plt
.
close
(
plot
)
# plot all combinations
"""
n=1
for x in comb:
# print(x)
print(
"
Figure %i
"
% n)
c1_tot = np.sum(x[0][1].matrix(balance=False)[:])
c2_tot = np.sum(x[1][1].matrix(balance=False)[:])
vec1 = np.array([c1_tot])
vec2 = np.array([c2_tot])
diff = ((x[0][1].matrix(balance=False)[:]/vec1)/(x[1][1].matrix(balance=False)[:]/vec2))
n1 = x[0][0]
n2 = x[1][0]
# pp = PdfPages(
'
%s.pdf
'
% n)
n = n+1
for x in range(0, diff.shape[0]):
for y in range(0, diff.shape[1]):
if x == y:
diff[x,y] = 0
plot = plotGraph(diff, c, n1, n2)
plot.savefig(
'
%s.png
'
% n)
plt.close(plot)
# pp.savefig(plot)
# pp.close()
# print(diff)
"""
# plot all combinations but just chr3
"""
n=1
for x in comb:
# print(x)
print(
"
Figure %i
"
% n)
c1_tot = np.sum(x[0][1].matrix(balance=False).fetch(
'
chr3
'
))
c2_tot = np.sum(x[1][1].matrix(balance=False).fetch(
'
chr3
'
))
vec1 = np.array([c1_tot])
vec2 = np.array([c2_tot])
diff = ((x[0][1].matrix(balance=False).fetch(
'
chr3
'
))/(x[1][1].matrix(balance=False).fetch(
'
chr3
'
)))
n1 = x[0][0]
n2 = x[1][0]
# pp = PdfPages(
'
chr3_%s_noNorm.pdf
'
% n)
n = n+1
for x in range(0, diff.shape[0]):
for y in range(0, diff.shape[1]):
if x == y:
diff[x,y] = 0
plot = plotGraphChr3(diff, c, n1, n2)
plot.savefig(
'
chr3_%s_noNorm.png
'
% n)
plt.close(plot)
# pp.savefig(plot)
# pp.close()
# print(diff)
"""
print
(
"
\n
=========
\n
"
)
#print(allfiles[0])
# cvd == contacts-vs-distance
cvd
=
cooltools
.
expected_cis
(
clr
=
c
,
view_df
=
chr_arms
,
smooth
=
True
,
aggregate_smoothed
=
False
,
nproc
=
1
,
clr_weight_name
=
None
# nproc=num_cpus #if you do not have multiple cores available, set to 1
)
# plot contact vs distance for each chromosomal arm
"""
print(cvd.head(4))
f, ax = plt.subplots(1,1)
for region in chr_arms[
'
name
'
]:
pp = PdfPages(
'
dist_normal_nofilter_%s.pdf
'
% region)
ax.loglog(
cvd[
'
dist_bp
'
].loc[cvd[
'
region1
'
]==region],
cvd[
'
contact_frequency
'
].loc[cvd[
'
region1
'
]==region],
)
ax.set(
xlabel=
'
separation, bp
'
,
ylabel=
'
IC contact frequency
'
)
ax.set_aspect(1.0)
ax.grid(lw=0.5)
pp.savefig(f)
pp.close()
"""
This diff is collapsed.
Click to expand it.
benchmark/plotsBenchmarkParasplit.R
0 → 100644
+
388
−
0
View file @
6122cdf9
install.packages
(
"tidyverse"
)
install.packages
(
"ggplot2"
)
install.packages
(
"LaF"
)
library
(
tidyverse
)
library
(
ggplot2
)
library
(
dplyr
)
library
(
forcats
)
library
(
LaF
)
library
(
scales
)
#################################
# #
# PLOTS ON PROCESS TIME #
# #
#################################
#Get the 3 replicates
df
<-
read.csv
(
"/home/mcroiset/HiC/test_parasplit/recap_rep0.txt"
,
sep
=
"\t"
,
header
=
TRUE
)
rep2
<-
read.csv
(
"/home/mcroiset/HiC/test_parasplit/recap_rep1.txt"
,
sep
=
"\t"
,
header
=
TRUE
)
rep3
<-
read.csv
(
"/home/mcroiset/HiC/test_parasplit/recap_rep2.txt"
,
sep
=
"\t"
,
header
=
TRUE
)
# para_test <- read.csv("/home/mcroiset/HiC/test_parasplit/hicstuff_parasplit/pipeline_info/execution_trace_2024-08-30_11-40-06.txt", sep = "\t", header= TRUE)
merge_df
<-
bind_rows
(
df
,
rep2
,
rep3
)
#Function to convert the time from report file (format Xd XXm XXs) to minutes
convert_time
<-
function
(
x
){
parts
<-
strsplit
(
x
,
" "
)
list_durations
<-
character
(
0
)
for
(
part
in
parts
)
{
sec
<-
0
minute
<-
0
hours
<-
0
mili
<-
0
for
(
single
in
part
)
{
if
(
grepl
(
'\\d+s'
,
single
))
#for seconds
{
sec
<-
as.numeric
(
substr
(
single
,
1
,
nchar
(
single
)
-1
))
sec
<-
sec
/
60
sec
<-
as.numeric
(
format
(
round
(
sec
,
3
),
nsmall
=
3
))
}
if
(
grepl
(
'\\d+m$'
,
single
))
#for minutes
{
minute
<-
as.numeric
(
substr
(
single
,
1
,
nchar
(
single
)
-1
))
minute
<-
as.numeric
(
format
(
round
(
minute
,
2
),
nsmall
=
2
))
}
if
(
grepl
(
'\\d+h'
,
single
))
#for days
{
hours
<-
as.numeric
(
substr
(
single
,
1
,
nchar
(
single
)
-1
))
hours
<-
hours
*
60
hours
<-
as.numeric
(
format
(
round
(
hours
,
2
),
nsmall
=
2
))
}
if
(
grepl
(
'\\d+ms'
,
single
))
#for milliseconds, may be equal to 0 already
{
mili
<-
as.numeric
(
substr
(
single
,
1
,
nchar
(
single
)
-2
))
if
(
mili
>
0
)
{
mili
<-
(
mili
/
1000
)
/
60
mili
<-
as.numeric
(
format
(
round
(
mili
,
4
),
nsmall
=
4
))
}
}
}
new_duration
<-
(
hours
+
minute
+
sec
+
mili
)
list_durations
<-
append
(
list_durations
,
new_duration
)
}
return
(
list_durations
)
}
#convert the time for both duration (from submit of the process to end) and realtime (only process time)
list_dur
<-
convert_time
(
merge_df
$
duration
)
list_realtime
<-
convert_time
(
merge_df
$
realtime
)
# list_dur_para <- convert_time(para_test$duration)
# list_realtime_para <- convert_time(para_test$realtime)
merge_df
$
duration_minutes
<-
list_dur
merge_df
$
duration_minutes
<-
as.numeric
(
merge_df
$
duration_minutes
)
# para_test$duration_minutes <- list_dur_para
# para_test$duration_minutes <- as.numeric(para_test$duration_minutes)
merge_df
$
realtime_minutes
<-
list_realtime
merge_df
$
realtime_minutes
<-
as.numeric
(
merge_df
$
realtime_minutes
)
# para_test$realtime_minutes <- list_realtime_para
# para_test$realtime_minutes <- as.numeric(para_test$realtime_minutes)
#mean and standard deviation, but in the end not used in this script
val
<-
merge_df
%>%
group_by
(
name
)
%>%
summarise
(
moy
=
mean
(
duration_minutes
),
ect
=
sd
(
duration_minutes
))
#add workflow column (either hicpro or hicstuff)
workflow
<-
character
(
0
)
for
(
x
in
merge_df
$
file
)
{
full
<-
strsplit
(
x
,
split
=
"/"
)[[
1
]][
4
]
w
<-
sapply
(
strsplit
(
full
,
split
=
"_"
),
"["
,
1
)
workflow
<-
append
(
workflow
,
w
)
}
merge_df
$
workflow
<-
workflow
# workflow <- character(0)
# for (x in para_test$duration) {
# w <- "hicstuff"
# workflow <- append(workflow,w)
# }
# para_test$workflow <- workflow
#strip process names : only last part, remove the workflow:subworkflow:process structure
processes
<-
character
(
0
)
for
(
x
in
merge_df
$
name
)
{
process
<-
lapply
(
strsplit
(
x
,
split
=
":"
),
tail
,
n
=
1L
)
processes
<-
append
(
processes
,
as.character
(
process
))
}
merge_df
$
name
<-
processes
# processes <- character(0)
# for (x in para_test$name) {
# process <- lapply(strsplit(x, split = ":"), tail, n = 1L)
# processes <- append(processes, as.character(process))
# }
# para_test$name <- processes
#shorten the process names : removes the parenthesis with the data
#works if you don't care mixing all data for one process !
short_process
<-
character
(
0
)
for
(
x
in
merge_df
$
name
)
{
short
<-
strsplit
(
x
,
split
=
"\\("
)[[
1
]][
1
]
short_process
<-
append
(
short_process
,
as.character
(
short
))
}
merge_df
$
name
<-
short_process
# short_process <- character(0)
# for (x in para_test$name) {
# short <- strsplit(x, split = "\\(")[[1]][1]
# short_process <- append(short_process, as.character(short))
# }
# para_test$name <- short_process
#strip file names, to know which "conformation" it is (previous format : conformation(hic[stuff|pro]_aligntype_filtertype/pipeline_info/myfile))
#keeps first part before "/"
filenames
<-
character
(
0
)
for
(
x
in
merge_df
$
file
)
{
file_short_name
<-
strsplit
(
x
,
split
=
"/"
)[[
1
]][
4
]
filenames
<-
append
(
filenames
,
file_short_name
)
}
merge_df
$
file
<-
filenames
# filenames <- character(0)
# for (x in para_test$duration) {
# file_short_name <- "parasplit"
# filenames <- append(filenames,file_short_name)
# }
# para_test$file <- filenames
#used to add a particular shape for specific processes, not used in the end
important_processes
<-
c
(
15
,
16
,
17
,
18
,
7
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
9
,
1
,
2
,
5
,
6
,
1
,
1
,
1
,
1
,
1
,
15
,
11
,
1
,
1
,
1
,
1
,
12
,
1
,
1
,
1
,
1
,
1
)
# merge_df <- bind_rows(merge_df, para_test)
#define categories for easier plot reading, n is the rest
categories
<-
character
(
0
)
for
(
x
in
merge_df
$
name
)
{
if
(
grepl
(
'\\w*ALIGN\\w*'
,
x
)
||
grepl
(
'\\w*TRIM\\w*'
,
x
)
||
grepl
(
'MERGE_BOWTIE2'
,
x
))
{
categories
<-
append
(
categories
,
"align"
)
}
else
if
(
grepl
(
'\\w*COOLER\\w*'
,
x
))
{
categories
<-
append
(
categories
,
"cooler"
)
}
else
if
(
grepl
(
'\\w*PAIRS\\w*'
,
x
))
{
categories
<-
append
(
categories
,
"pairs"
)
}
else
if
(
grepl
(
'\\w*MATRIX\\w*'
,
x
)
||
grepl
(
'\\w*BUILD_CONTACT\\w*'
,
x
)
||
grepl
(
'\\w*ICE_NORM\\w*'
,
x
))
{
categories
<-
append
(
categories
,
"matrix"
)
}
else
if
(
grepl
(
'\\w*FILTER\\w*'
,
x
)
||
grepl
(
'\\w*PICARD\\w*'
,
x
)
||
grepl
(
'\\w*SAMTOOLS\\w*'
,
x
))
{
categories
<-
append
(
categories
,
"filter"
)
}
else
if
(
grepl
(
'\\w*CUTSITE\\w*'
,
x
)
||
grepl
(
'\\w*PARASPLIT\\w*'
,
x
))
{
categories
<-
append
(
categories
,
"parasplit"
)
}
else
if
(
grepl
(
'\\w*SAMPLESHEET\\w*'
,
x
)
||
(
grepl
(
'BOWTIE2_BUILD'
,
x
)
||
grepl
(
'\\w*GETCHROM\\w*'
,
x
)
||
grepl
(
'\\w*GET_RESTRIC\\w*'
,
x
)))
{
categories
<-
append
(
categories
,
"data_prep"
)
}
else
{
categories
<-
append
(
categories
,
"n"
)
}
}
# categories
merge_df
$
categorie
<-
categories
print
(
levels
(
as.factor
(
merge_df
$
name
)))
#order the processes manually in order of the pipeline
#may fail if there's more options than I defined here, may make the order false because it depends of the alphabetical order of the processes
ordered_processes
<-
tibble
(
levels
(
as.factor
(
merge_df
$
name
)))
order
<-
c
(
18
,
7
,
2
,
20
,
21
,
24
,
23
,
26
,
22
,
25
,
30
,
31
,
32
,
3
,
5
,
19
,
19
,
9
,
4
,
28
,
29
,
7
,
6
,
1
,
27
)
ordered_processes
<-
ordered_processes
%>%
add_column
(
order
=
order
)
ordered_processes
<-
rename
(
ordered_processes
,
name
=
`levels(as.factor(merge_df$name))`
)
merge_df
<-
left_join
(
merge_df
,
ordered_processes
,
by
=
c
(
"name"
=
"name"
))
categories_colors
<-
c
(
"#F8766D"
,
"#CD9600"
,
"#7CAE00"
,
"#00BE67"
,
"#00BFC4"
,
"#00A9FF"
,
"#aeb6bf"
,
"#C77CFF"
)
# pdf("process_time.pdf")
#2 first plots for both duration time and realtime
#Then realtime whitout x labels for better visibility
ggplot
(
merge_df
,
aes
(
x
=
fct_reorder
(
name
,
order
),
y
=
duration_minutes
,
color
=
categorie
))
+
geom_boxplot
()
+
scale_shape_manual
(
values
=
important_processes
)
+
scale_color_manual
(
values
=
categories_colors
)
+
facet_wrap
(
~
file
,
ncol
=
7
)
+
xlab
(
"Processes"
)
+
ylab
(
"Duration in minutes"
)
+
theme
(
axis.text.x
=
element_blank
(),
axis.ticks.x
=
element_blank
())
+
ggtitle
(
"Duration time of each process"
)
+
scale_y_log10
()
+
theme_bw
()
ggplot
(
merge_df
,
aes
(
x
=
fct_reorder
(
name
,
order
),
y
=
realtime_minutes
,
color
=
categorie
))
+
geom_boxplot
()
+
scale_shape_manual
(
values
=
important_processes
)
+
scale_color_manual
(
values
=
categories_colors
)
+
facet_wrap
(
~
file
,
ncol
=
4
)
+
xlab
(
"Processes"
)
+
ylab
(
"Duration in minutes"
)
+
ggtitle
(
"Real execution time of each process"
)
+
scale_y_log10
()
+
theme_bw
()
+
theme
(
axis.text.x
=
element_text
(
angle
=
90
,
vjust
=
0.5
,
hjust
=
1
))
ggplot
(
merge_df
,
aes
(
x
=
fct_reorder
(
name
,
order
),
y
=
realtime_minutes
,
color
=
categorie
))
+
geom_boxplot
()
+
scale_shape_manual
(
values
=
important_processes
)
+
scale_color_manual
(
values
=
categories_colors
)
+
facet_wrap
(
~
file
,
ncol
=
7
)
+
xlab
(
"Processes"
)
+
ylab
(
"Duration in minutes"
)
+
ggtitle
(
"Real execution time of each process"
)
+
scale_y_log10
()
+
theme_bw
()
+
theme
(
axis.title.x
=
element_blank
(),
axis.text.x
=
element_blank
(),
axis.ticks.x
=
element_blank
())
#Cumulative sum of the realtime of the processes for each conformation
merge_df.ordered
<-
merge_df
%>%
group_by
(
file
)
%>%
arrange
(
order
,
.by_group
=
TRUE
)
%>%
mutate
(
cum_time
=
cumsum
(
realtime_minutes
))
ggplot
(
merge_df.ordered
,
aes
(
x
=
fct_reorder
(
name
,
order
),
y
=
cum_time
,
group
=
file
))
+
geom_line
(
aes
(
color
=
file
))
+
xlab
(
"Processes"
)
+
ylab
(
"Duration in minutes"
)
+
ggtitle
(
"Duration per conformation"
)
+
theme_bw
()
+
theme
(
axis.text.x
=
element_text
(
angle
=
45
,
vjust
=
1
,
hjust
=
1
))
+
geom_point
(
aes
(
fill
=
categorie
),
shape
=
21
,
size
=
2.5
)
+
scale_fill_manual
(
values
=
categories_colors
)
+
scale_y_continuous
(
breaks
=
seq
(
0
,
2300
,
by
=
200
))
+
geom_line
(
data
=
~
filter
(
.x
,
file
==
"hicstuff_parasplit_nofilter"
),
color
=
"black"
,
linetype
=
"dotted"
)
+
geom_line
(
data
=
~
filter
(
.x
,
file
==
"hicstuff_parasplit_nofilter_local"
),
color
=
"black"
,
linetype
=
"dotted"
)
# dev.off()
#######################################
# #
# PLOTS ON NUMBER OF CONTACTS #
# #
#######################################
#get the 3 replicates matrices for all conformation
listFiles
<-
read.csv
(
"/home/mcroiset/HiC/test_parasplit/matrices_rep0_bis.txt"
,
sep
=
"\n"
,
header
=
FALSE
)
# listFiles2 <- read.csv("/home/mcroiset/HiC/test_parasplit/matrices_rep1_bis.txt", sep = "\n")
# listFiles3 <- read.csv("/home/mcroiset/HiC/test_parasplit/matrices_rep2_bis.txt", sep = "\n")
names
(
listFiles
)
<-
"File"
names
(
listFiles2
)
<-
"File"
names
(
listFiles3
)
<-
"File"
# merge_lf <- bind_rows(listFiles, listFiles2, listFiles3)
merge_lf
<-
listFiles
#set the dataframe with the file and the number of contact (= number of line in the raw matrix)
df.counts
<-
tibble
(
File
=
character
(),
Counts
=
numeric
())
for
(
filenames
in
merge_lf
)
{
filenames
<-
lapply
(
strsplit
(
filenames
,
split
=
"../../"
),
tail
,
n
=
1L
)
filenames
<-
paste
(
"/home/mcroiset/HiC/"
,
filenames
,
sep
=
""
)
for
(
f
in
filenames
)
{
count
<-
read_log
(
f
,
col_names
=
F
)
$
X5
[
1
]
# count <- determine_nlines(f)
df.counts
<-
add_row
(
df.counts
,
File
=
f
,
Counts
=
count
)
# print(f)
# print(count)
}
}
#df.counts$File
#keep only AD281 data, because I had previous results in the same folder
#If you have different data, change the code below
df.filtered
<-
df.counts
%>%
filter
(
grepl
(
'AD281'
,
File
))
#df.filtered$File
#strip the file names, same as earlier
filenames
<-
character
(
0
)
for
(
x
in
df.filtered
$
File
)
{
file_short_name
<-
strsplit
(
x
,
split
=
"/"
)[[
1
]][
8
]
if
(
grepl
(
'matrix'
,
file_short_name
))
{
file_short_name
<-
strsplit
(
x
,
split
=
"/"
)[[
1
]][
6
]
file_short_name
<-
paste
(
file_short_name
,
"_"
,
strsplit
(
x
,
split
=
"/"
)[[
1
]][
5
])
}
filenames
<-
append
(
filenames
,
file_short_name
)
}
#filenames
df.filtered
$
File
<-
filenames
#df.filtered
#get the workflow, same as earlier
workflow
<-
character
(
0
)
for
(
x
in
df.filtered
$
File
)
{
w
<-
strsplit
(
x
,
split
=
"_"
)[[
1
]][
1
]
workflow
<-
append
(
workflow
,
w
)
}
df.filtered
$
workflow
<-
workflow
#get the alignment type (normal | iteralign | cutsite)
align
<-
character
(
0
)
for
(
x
in
df.filtered
$
File
)
{
a
<-
strsplit
(
x
,
split
=
"_"
)[[
1
]][
2
]
align
<-
append
(
align
,
a
)
}
df.filtered
$
align
<-
align
#get the filtering options (nofilter | nofilter_local | filter | filter_local | filter_pcr | filter_pcr_local | both)
filtering
<-
character
(
0
)
for
(
x
in
df.filtered
$
File
)
{
f
<-
strsplit
(
x
,
split
=
"_"
)[[
1
]][
3
]
f2
<-
strsplit
(
x
,
split
=
"_"
)[[
1
]][
4
]
f3
<-
strsplit
(
x
,
split
=
"_"
)[[
1
]][
5
]
f4
<-
strsplit
(
x
,
split
=
"_"
)[[
1
]][
6
]
ff
<-
paste
(
f
,
f2
,
f3
,
sep
=
"_"
)
if
(
strsplit
(
ff
,
split
=
"_"
)[[
1
]][
2
]
==
"NA"
)
{
ff
<-
strsplit
(
ff
,
split
=
"_"
)[[
1
]][
1
]
}
else
if
(
strsplit
(
ff
,
split
=
"_"
)[[
1
]][
3
]
==
"NA"
)
{
ff1
<-
strsplit
(
ff
,
split
=
"_"
)[[
1
]][
1
]
ff2
<-
strsplit
(
ff
,
split
=
"_"
)[[
1
]][
2
]
ff
<-
paste
(
ff1
,
ff2
,
sep
=
"_"
)
}
filtering
<-
append
(
filtering
,
ff
)
}
df.filtered
$
filtering
<-
filtering
#exclude picard filtering because really low
df.excludePicard
<-
df.filtered
%>%
filter
(
!
grepl
(
'filter_picard'
,
filtering
))
#plot the number of contacts per conformation, with or without Picard filtering
ggplot
(
df.filtered
,
aes
(
x
=
reorder
(
File
,
Counts
),
y
=
Counts
,
shape
=
align
,
color
=
filtering
))
+
geom_point
(
size
=
5
)
+
scale_y_log10
()
+
theme
(
axis.text.x
=
element_text
(
angle
=
90
,
vjust
=
0.5
,
hjust
=
1
))
+
ggtitle
(
"Number of contacts"
)
ggplot
(
df.excludePicard
,
aes
(
x
=
reorder
(
File
,
Counts
),
y
=
Counts
,
shape
=
filtering
,
color
=
align
))
+
geom_point
(
size
=
5
)
+
theme
(
axis.text.x
=
element_text
(
angle
=
90
,
vjust
=
0.5
,
hjust
=
1
))
+
ggtitle
(
"Number of contacts"
)
+
scale_shape_manual
(
values
=
c
(
15
,
16
,
17
,
18
))
+
xlab
(
""
)
+
ylab
(
"Counts"
)
ggplot
(
df.excludePicard
,
aes
(
x
=
reorder
(
File
,
Counts
)
,
y
=
Counts
,
fill
=
align
))
+
geom_col
(
size
=
2
)
+
theme
(
axis.text.x
=
element_text
(
angle
=
90
,
vjust
=
0.5
,
hjust
=
1
))
+
ggtitle
(
"Number of contacts"
)
+
xlab
(
""
)
+
ylab
(
"Counts"
)
+
scale_fill_hue
(
c
=
45
,
l
=
80
)
+
facet_wrap
(
~
factor
(
filtering
,
levels
=
c
(
"nofilter"
,
"filter"
,
"filter_pcr"
,
"filter_filterpcr"
)),
scales
=
"free"
)
+
scale_y_continuous
(
limits
=
c
(
0
,
8e+07
))
+
theme
(
axis.text.x
=
element_blank
(),
axis.ticks.x
=
element_blank
())
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment