Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
hic
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
LBMC
Hub
hic
Commits
09826a09
"git@gitbio.ens-lyon.fr:xgrand/ChIPster.git" did not exist on "3490e2f1b557c52ddf09a90262260938dd9debb9"
Verified
Commit
09826a09
authored
10 months ago
by
Mia Croiset
Browse files
Options
Downloads
Patches
Plain Diff
comment script
parent
71005133
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
benchmark/plots.r
+40
-8
40 additions, 8 deletions
benchmark/plots.r
with
40 additions
and
8 deletions
benchmark/plots.r
+
40
−
8
View file @
09826a09
...
@@ -13,12 +13,16 @@ library(LaF)
...
@@ -13,12 +13,16 @@ library(LaF)
# #
# #
#################################
#################################
#Get the 3 replicates
df
<-
read.csv
(
"/home/mcroiset/HiC/benchmark/recap.txt"
,
sep
=
"\t"
,
header
=
TRUE
)
df
<-
read.csv
(
"/home/mcroiset/HiC/benchmark/recap.txt"
,
sep
=
"\t"
,
header
=
TRUE
)
rep2
<-
read.csv
(
"/home/mcroiset/HiC/benchmark/recap_rep2.txt"
,
sep
=
"\t"
,
header
=
TRUE
)
rep2
<-
read.csv
(
"/home/mcroiset/HiC/benchmark/recap_rep2.txt"
,
sep
=
"\t"
,
header
=
TRUE
)
rep3
<-
read.csv
(
"/home/mcroiset/HiC/benchmark/recap_rep3.txt"
,
sep
=
"\t"
,
header
=
TRUE
)
rep3
<-
read.csv
(
"/home/mcroiset/HiC/benchmark/recap_rep3.txt"
,
sep
=
"\t"
,
header
=
TRUE
)
merge_df
<-
bind_rows
(
df
,
rep2
,
rep3
)
merge_df
<-
bind_rows
(
df
,
rep2
,
rep3
)
#Function to convert the time from report file (format Xd XXm XXs) to minutes
convert_time
<-
function
(
x
){
convert_time
<-
function
(
x
){
parts
<-
strsplit
(
x
,
" "
)
parts
<-
strsplit
(
x
,
" "
)
list_durations
<-
character
(
0
)
list_durations
<-
character
(
0
)
...
@@ -28,22 +32,26 @@ convert_time <- function(x){
...
@@ -28,22 +32,26 @@ convert_time <- function(x){
hours
<-
0
hours
<-
0
mili
<-
0
mili
<-
0
for
(
single
in
part
)
{
for
(
single
in
part
)
{
if
(
grepl
(
'\\d+s'
,
single
))
{
if
(
grepl
(
'\\d+s'
,
single
))
#for seconds
{
sec
<-
as.numeric
(
substr
(
single
,
1
,
nchar
(
single
)
-1
))
sec
<-
as.numeric
(
substr
(
single
,
1
,
nchar
(
single
)
-1
))
sec
<-
sec
/
60
sec
<-
sec
/
60
sec
<-
as.numeric
(
format
(
round
(
sec
,
3
),
nsmall
=
3
))
sec
<-
as.numeric
(
format
(
round
(
sec
,
3
),
nsmall
=
3
))
}
}
if
(
grepl
(
'\\d+m$'
,
single
))
{
if
(
grepl
(
'\\d+m$'
,
single
))
#for minutes
{
minute
<-
as.numeric
(
substr
(
single
,
1
,
nchar
(
single
)
-1
))
minute
<-
as.numeric
(
substr
(
single
,
1
,
nchar
(
single
)
-1
))
minute
<-
as.numeric
(
format
(
round
(
minute
,
2
),
nsmall
=
2
))
minute
<-
as.numeric
(
format
(
round
(
minute
,
2
),
nsmall
=
2
))
}
}
if
(
grepl
(
'\\d+h'
,
single
))
{
if
(
grepl
(
'\\d+h'
,
single
))
#for days
{
hours
<-
as.numeric
(
substr
(
single
,
1
,
nchar
(
single
)
-1
))
hours
<-
as.numeric
(
substr
(
single
,
1
,
nchar
(
single
)
-1
))
hours
<-
hours
*
60
hours
<-
hours
*
60
hours
<-
as.numeric
(
format
(
round
(
hours
,
2
),
nsmall
=
2
))
hours
<-
as.numeric
(
format
(
round
(
hours
,
2
),
nsmall
=
2
))
}
}
if
(
grepl
(
'\\d+ms'
,
single
))
{
if
(
grepl
(
'\\d+ms'
,
single
))
#for milliseconds, may be equal to 0 already
{
mili
<-
as.numeric
(
substr
(
single
,
1
,
nchar
(
single
)
-2
))
mili
<-
as.numeric
(
substr
(
single
,
1
,
nchar
(
single
)
-2
))
if
(
mili
>
0
)
{
if
(
mili
>
0
)
{
mili
<-
(
mili
/
1000
)
/
60
mili
<-
(
mili
/
1000
)
/
60
...
@@ -56,7 +64,7 @@ convert_time <- function(x){
...
@@ -56,7 +64,7 @@ convert_time <- function(x){
}
}
return
(
list_durations
)
return
(
list_durations
)
}
}
#convert the time for both duration (from submit of the process to end) and realtime (only process time)
list_dur
<-
convert_time
(
merge_df
$
duration
)
list_dur
<-
convert_time
(
merge_df
$
duration
)
list_realtime
<-
convert_time
(
merge_df
$
realtime
)
list_realtime
<-
convert_time
(
merge_df
$
realtime
)
...
@@ -66,8 +74,10 @@ merge_df$duration_minutes <- as.numeric(merge_df$duration_minutes)
...
@@ -66,8 +74,10 @@ merge_df$duration_minutes <- as.numeric(merge_df$duration_minutes)
merge_df
$
realtime_minutes
<-
list_realtime
merge_df
$
realtime_minutes
<-
list_realtime
merge_df
$
realtime_minutes
<-
as.numeric
(
merge_df
$
realtime_minutes
)
merge_df
$
realtime_minutes
<-
as.numeric
(
merge_df
$
realtime_minutes
)
#mean and standard deviation, but in the end not used in this script
val
<-
merge_df
%>%
group_by
(
name
)
%>%
summarise
(
moy
=
mean
(
duration_minutes
),
ect
=
sd
(
duration_minutes
))
val
<-
merge_df
%>%
group_by
(
name
)
%>%
summarise
(
moy
=
mean
(
duration_minutes
),
ect
=
sd
(
duration_minutes
))
#add workflow column (either hicpro or hicstuff)
workflow
<-
character
(
0
)
workflow
<-
character
(
0
)
for
(
x
in
merge_df
$
file
)
{
for
(
x
in
merge_df
$
file
)
{
full
<-
sapply
(
strsplit
(
x
,
split
=
"/"
),
"["
,
2
)
full
<-
sapply
(
strsplit
(
x
,
split
=
"/"
),
"["
,
2
)
...
@@ -76,6 +86,7 @@ for (x in merge_df$file) {
...
@@ -76,6 +86,7 @@ for (x in merge_df$file) {
}
}
merge_df
$
workflow
<-
workflow
merge_df
$
workflow
<-
workflow
#strip process names : only last part, remove the workflow:subworkflow:process structure
processes
<-
character
(
0
)
processes
<-
character
(
0
)
for
(
x
in
merge_df
$
name
)
{
for
(
x
in
merge_df
$
name
)
{
process
<-
lapply
(
strsplit
(
x
,
split
=
":"
),
tail
,
n
=
1L
)
process
<-
lapply
(
strsplit
(
x
,
split
=
":"
),
tail
,
n
=
1L
)
...
@@ -83,6 +94,8 @@ for (x in merge_df$name) {
...
@@ -83,6 +94,8 @@ for (x in merge_df$name) {
}
}
merge_df
$
name
<-
processes
merge_df
$
name
<-
processes
#shorten the process names : removes the parenthesis with the data
#works if you don't care mixing all data for one process !
short_process
<-
character
(
0
)
short_process
<-
character
(
0
)
for
(
x
in
merge_df
$
name
)
{
for
(
x
in
merge_df
$
name
)
{
short
<-
strsplit
(
x
,
split
=
"\\("
)[[
1
]][
1
]
short
<-
strsplit
(
x
,
split
=
"\\("
)[[
1
]][
1
]
...
@@ -90,6 +103,8 @@ for (x in merge_df$name) {
...
@@ -90,6 +103,8 @@ for (x in merge_df$name) {
}
}
merge_df
$
name
<-
short_process
merge_df
$
name
<-
short_process
#strip file names, to know which "conformation" it is (previous format : conformation(hic[stuff|pro]_aligntype_filtertype/pipeline_info/myfile))
#keeps first part before "/"
filenames
<-
character
(
0
)
filenames
<-
character
(
0
)
for
(
x
in
merge_df
$
file
)
{
for
(
x
in
merge_df
$
file
)
{
file_short_name
<-
sapply
(
strsplit
(
x
,
split
=
"/"
),
"["
,
2
)
file_short_name
<-
sapply
(
strsplit
(
x
,
split
=
"/"
),
"["
,
2
)
...
@@ -97,8 +112,10 @@ for (x in merge_df$file) {
...
@@ -97,8 +112,10 @@ for (x in merge_df$file) {
}
}
merge_df
$
file
<-
filenames
merge_df
$
file
<-
filenames
#used to add a particular shape for specific processes, not used in the end
important_processes
<-
c
(
15
,
16
,
17
,
18
,
7
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
9
,
1
,
2
,
5
,
6
,
1
,
1
,
1
,
1
,
1
,
15
,
11
,
1
,
1
,
1
,
1
,
12
,
1
,
1
,
1
,
1
,
1
)
important_processes
<-
c
(
15
,
16
,
17
,
18
,
7
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
9
,
1
,
2
,
5
,
6
,
1
,
1
,
1
,
1
,
1
,
15
,
11
,
1
,
1
,
1
,
1
,
12
,
1
,
1
,
1
,
1
,
1
)
#define categories for easier plot reading, n is the rest
categories
<-
character
(
0
)
categories
<-
character
(
0
)
for
(
x
in
merge_df
$
name
)
{
for
(
x
in
merge_df
$
name
)
{
if
(
grepl
(
'\\w*ALIGN\\w*'
,
x
)
||
grepl
(
'\\w*TRIM\\w*'
,
x
)
||
grepl
(
'MERGE_BOWTIE2'
,
x
))
{
if
(
grepl
(
'\\w*ALIGN\\w*'
,
x
)
||
grepl
(
'\\w*TRIM\\w*'
,
x
)
||
grepl
(
'MERGE_BOWTIE2'
,
x
))
{
...
@@ -129,7 +146,9 @@ for (x in merge_df$name) {
...
@@ -129,7 +146,9 @@ for (x in merge_df$name) {
# categories
# categories
merge_df
$
categorie
<-
categories
merge_df
$
categorie
<-
categories
print
(
levels
(
as.factor
(
merge_df
$
name
)))
#print(levels(as.factor(merge_df$name)))
#order the processes manually in order of the pipeline
#may fail if there's more options than I definded here, may make the order false because it depends of the alphabetical order of the processes
ordered_processes
<-
tibble
(
levels
(
as.factor
(
merge_df
$
name
)))
ordered_processes
<-
tibble
(
levels
(
as.factor
(
merge_df
$
name
)))
order
<-
c
(
18
,
7
,
8
,
7
,
2
,
20
,
20
,
21
,
14
,
24
,
23
,
27
,
22
,
25
,
30
,
31
,
32
,
3
,
6
,
5
,
19
,
13
,
19
,
9
,
4
,
15
,
28
,
29
,
18
,
21
,
7
,
8
,
17
,
16
,
33
,
11
,
1
,
10
,
12
,
27
,
8
)
order
<-
c
(
18
,
7
,
8
,
7
,
2
,
20
,
20
,
21
,
14
,
24
,
23
,
27
,
22
,
25
,
30
,
31
,
32
,
3
,
6
,
5
,
19
,
13
,
19
,
9
,
4
,
15
,
28
,
29
,
18
,
21
,
7
,
8
,
17
,
16
,
33
,
11
,
1
,
10
,
12
,
27
,
8
)
ordered_processes
<-
ordered_processes
%>%
add_column
(
order
=
order
)
ordered_processes
<-
ordered_processes
%>%
add_column
(
order
=
order
)
...
@@ -140,6 +159,8 @@ merge_df <- left_join(merge_df, ordered_processes, by= c("name" = "name"))
...
@@ -140,6 +159,8 @@ merge_df <- left_join(merge_df, ordered_processes, by= c("name" = "name"))
categories_colors
<-
c
(
"#F8766D"
,
"#CD9600"
,
"#7CAE00"
,
"#00BE67"
,
"#00BFC4"
,
"#00A9FF"
,
"#aeb6bf"
,
"#C77CFF"
)
categories_colors
<-
c
(
"#F8766D"
,
"#CD9600"
,
"#7CAE00"
,
"#00BE67"
,
"#00BFC4"
,
"#00A9FF"
,
"#aeb6bf"
,
"#C77CFF"
)
# pdf("process_time.pdf")
# pdf("process_time.pdf")
#2 first plots for both duration time and realtime
#Then realtime whitout x labels for better visibility
ggplot
(
merge_df
,
aes
(
x
=
fct_reorder
(
name
,
order
),
y
=
duration_minutes
,
color
=
categorie
))
+
ggplot
(
merge_df
,
aes
(
x
=
fct_reorder
(
name
,
order
),
y
=
duration_minutes
,
color
=
categorie
))
+
geom_boxplot
()
+
geom_boxplot
()
+
...
@@ -179,6 +200,7 @@ ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color =
...
@@ -179,6 +200,7 @@ ggplot(merge_df, aes(x = fct_reorder(name, order), y = realtime_minutes, color =
axis.text.x
=
element_blank
(),
axis.text.x
=
element_blank
(),
axis.ticks.x
=
element_blank
())
axis.ticks.x
=
element_blank
())
#Cumulative sum of the realtime of the processes for each conformation
merge_df.ordered
<-
merge_df
%>%
merge_df.ordered
<-
merge_df
%>%
group_by
(
file
)
%>%
group_by
(
file
)
%>%
...
@@ -204,7 +226,7 @@ ggplot(merge_df.ordered, aes(x = fct_reorder(name, order), y = cum_time, group =
...
@@ -204,7 +226,7 @@ ggplot(merge_df.ordered, aes(x = fct_reorder(name, order), y = cum_time, group =
# #
# #
#######################################
#######################################
#get the 3 replicates matrices for all conformation
listFiles
<-
read.csv
(
"/home/mcroiset/HiC/hic/benchmark/matrices.txt"
,
sep
=
"\n"
)
listFiles
<-
read.csv
(
"/home/mcroiset/HiC/hic/benchmark/matrices.txt"
,
sep
=
"\n"
)
listFiles2
<-
read.csv
(
"/home/mcroiset/HiC/hic/benchmark/matrices_rep2.txt"
,
sep
=
"\n"
)
listFiles2
<-
read.csv
(
"/home/mcroiset/HiC/hic/benchmark/matrices_rep2.txt"
,
sep
=
"\n"
)
listFiles3
<-
read.csv
(
"/home/mcroiset/HiC/hic/benchmark/matrices_rep3.txt"
,
sep
=
"\n"
)
listFiles3
<-
read.csv
(
"/home/mcroiset/HiC/hic/benchmark/matrices_rep3.txt"
,
sep
=
"\n"
)
...
@@ -215,6 +237,7 @@ names(listFiles3) <- "File"
...
@@ -215,6 +237,7 @@ names(listFiles3) <- "File"
merge_lf
<-
bind_rows
(
listFiles
,
listFiles2
,
listFiles3
)
merge_lf
<-
bind_rows
(
listFiles
,
listFiles2
,
listFiles3
)
#set the dataframe with the file and the number of contact (= number of line in the raw matrix)
df.counts
<-
tibble
(
File
=
character
(),
Counts
=
numeric
())
df.counts
<-
tibble
(
File
=
character
(),
Counts
=
numeric
())
for
(
filenames
in
merge_lf
)
{
for
(
filenames
in
merge_lf
)
{
filenames
<-
lapply
(
strsplit
(
filenames
,
split
=
"../../"
),
tail
,
n
=
1L
)
filenames
<-
lapply
(
strsplit
(
filenames
,
split
=
"../../"
),
tail
,
n
=
1L
)
...
@@ -227,11 +250,15 @@ for (filenames in merge_lf) {
...
@@ -227,11 +250,15 @@ for (filenames in merge_lf) {
}
}
}
}
df.counts
$
File
#df.counts$File
#keep only AD281 data, because I had previous results in the same folder
#If you have different data, change the code below
df.filtered
<-
df.counts
%>%
df.filtered
<-
df.counts
%>%
filter
(
grepl
(
'AD281.1000'
,
File
)
|
grepl
(
'AD281_abs'
,
File
))
filter
(
grepl
(
'AD281.1000'
,
File
)
|
grepl
(
'AD281_abs'
,
File
))
#df.filtered$File
#df.filtered$File
#strip the file names, same as earlier
filenames
<-
character
(
0
)
filenames
<-
character
(
0
)
for
(
x
in
df.filtered
$
File
)
{
for
(
x
in
df.filtered
$
File
)
{
file_short_name
<-
strsplit
(
x
,
split
=
"/"
)[[
1
]][
7
]
file_short_name
<-
strsplit
(
x
,
split
=
"/"
)[[
1
]][
7
]
...
@@ -241,6 +268,7 @@ for (x in df.filtered$File) {
...
@@ -241,6 +268,7 @@ for (x in df.filtered$File) {
df.filtered
$
File
<-
filenames
df.filtered
$
File
<-
filenames
#df.filtered
#df.filtered
#get the workflow, same as earlier
workflow
<-
character
(
0
)
workflow
<-
character
(
0
)
for
(
x
in
df.filtered
$
File
)
{
for
(
x
in
df.filtered
$
File
)
{
w
<-
strsplit
(
x
,
split
=
"_"
)[[
1
]][
1
]
w
<-
strsplit
(
x
,
split
=
"_"
)[[
1
]][
1
]
...
@@ -248,6 +276,7 @@ for (x in df.filtered$File) {
...
@@ -248,6 +276,7 @@ for (x in df.filtered$File) {
}
}
df.filtered
$
workflow
<-
workflow
df.filtered
$
workflow
<-
workflow
#get the alignment type (normal | iteralign | cutsite)
align
<-
character
(
0
)
align
<-
character
(
0
)
for
(
x
in
df.filtered
$
File
)
{
for
(
x
in
df.filtered
$
File
)
{
a
<-
strsplit
(
x
,
split
=
"_"
)[[
1
]][
2
]
a
<-
strsplit
(
x
,
split
=
"_"
)[[
1
]][
2
]
...
@@ -255,6 +284,7 @@ for (x in df.filtered$File) {
...
@@ -255,6 +284,7 @@ for (x in df.filtered$File) {
}
}
df.filtered
$
align
<-
align
df.filtered
$
align
<-
align
#get the filtering options (nofilter | filter_events | filter_pcr | filter_and_filter_pcr | filter_picard)
filtering
<-
character
(
0
)
filtering
<-
character
(
0
)
for
(
x
in
df.filtered
$
File
)
{
for
(
x
in
df.filtered
$
File
)
{
f
<-
strsplit
(
x
,
split
=
"_"
)[[
1
]][
3
]
f
<-
strsplit
(
x
,
split
=
"_"
)[[
1
]][
3
]
...
@@ -274,9 +304,11 @@ for (x in df.filtered$File) {
...
@@ -274,9 +304,11 @@ for (x in df.filtered$File) {
}
}
df.filtered
$
filtering
<-
filtering
df.filtered
$
filtering
<-
filtering
#exclude picard filtering because really low
df.excludePicard
<-
df.filtered
%>%
df.excludePicard
<-
df.filtered
%>%
filter
(
!
grepl
(
'filter_picard'
,
filtering
))
filter
(
!
grepl
(
'filter_picard'
,
filtering
))
#plot the number of contacts per conformation, with or without Picard filtering
ggplot
(
df.filtered
,
aes
(
x
=
reorder
(
File
,
Counts
),
y
=
Counts
,
shape
=
align
,
color
=
filtering
))
+
ggplot
(
df.filtered
,
aes
(
x
=
reorder
(
File
,
Counts
),
y
=
Counts
,
shape
=
align
,
color
=
filtering
))
+
geom_point
(
size
=
5
)
+
geom_point
(
size
=
5
)
+
scale_y_log10
()
+
scale_y_log10
()
+
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment