Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
bigWig_visu
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
LBMC
ReGArDS
Projects_Analyzes
bigWig_visu
Commits
d742be18
Commit
d742be18
authored
3 years ago
by
nfontrod
Browse files
Options
Downloads
Patches
Plain Diff
src/visu/figure_maker.py: add line at bins different between two conditions
parent
56afa7be
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/visu/figure_maker.py
+118
-30
118 additions, 30 deletions
src/visu/figure_maker.py
with
118 additions
and
30 deletions
src/visu/figure_maker.py
+
118
−
30
View file @
d742be18
...
...
@@ -18,7 +18,9 @@ from tqdm import tqdm
from
loguru
import
logger
import
matplotlib
as
mpl
import
matplotlib.font_manager
from
rpy2.robjects
import
r
,
pandas2ri
from
rpy2.robjects
import
r
,
pandas2ri
,
FloatVector
from
matplotlib
import
collections
as
mc
import
numpy
as
np
def
load_bed
(
bed
:
Path
,
bed_name
:
str
)
->
List
[
List
[
Union
[
int
,
str
]]]:
...
...
@@ -292,11 +294,118 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
return
df_sum
,
condition_col
def
line_maker
(
list_pval
,
up_value
,
position
=
0
):
"""
Create a list of lines, with their colour.
Create a line if the a p-value in list_pval[i] is below 0.05.
If up_mean[i] > down_mean[i] the line will be green, purple else.
:param list_pval: (list of float), list of pvalue get by the comparison
of a propensity scale in a particular sequence position in an
up-regulated and down_regulated set of sequences.
:param up_value: (int) the ordinate coordinates where the line will be
placed.
:param position: (int) the abscissa position to begin to draw the lines
:return: lines - (list of list of 2 tuple), the list of 2 tuple corresponds
to a lines with the coordinates [(x1, y1), (x2, y2)]
"""
lcolor
=
[]
lines
=
[]
for
i
in
range
(
len
(
list_pval
)):
if
list_pval
[
i
]
<=
0.05
:
val
=
i
+
position
lines
.
append
([(
val
-
0.5
,
up_value
),
(
val
+
0.5
,
up_value
)])
lcolor
.
append
(
"
#000000
"
)
# red
return
lines
,
lcolor
def
paired_t_test
(
values1
:
List
[
float
],
values2
:
List
[
float
])
->
float
:
"""
Get the p-value of a paired t-test for each bin
:param values1: A list of values
:param values2: Another list of values
:return: The p-values of the paired t-test
>>>
paired_t_test
([
1
,
2
,
8
],
[
5
,
8
,
15
])
0.02337551764357566
"""
if
len
(
values1
)
!=
len
(
values2
):
raise
IndexError
(
"
values1 and values2 should have the same length
"
)
ttp
=
r
(
"""
function(values1, values2) {
return(t.test(values1, values2, paired=T)$p.value)
}
"""
)
return
ttp
(
FloatVector
(
values1
),
FloatVector
(
values2
))[
0
]
def
compute_stats
(
dff
:
pd
.
DataFrame
,
y_line
:
float
,
group_col
:
str
,
outfile
:
Path
)
->
Tuple
[
List
[
List
[
Tuple
]],
List
]:
"""
:param dff: A dataframe containing the coverage displayed in the figure
:param y_line: The height of the p-value line
:param group_col: A group column
:param outfile: The file to save the stats
:return: A list of lines coordinates in the form of [(x1, y1), (x2, y2)],
\
and the color associated to each line
"""
df
=
dff
.
sort_values
([
"
bin
"
,
group_col
],
ascending
=
True
)
groups
=
df
[
group_col
].
unique
()
if
len
(
groups
)
!=
2
:
raise
NotImplementedError
(
"
Statistical analysis only implemented for
"
"
2 groups of data
"
)
p_values_ttp
=
[]
grp1
=
[]
vgrp1
=
[]
grp2
=
[]
vgrp2
=
[]
for
bin
in
df
[
"
bin
"
].
unique
():
tmp
=
df
[
df
[
"
bin
"
]
==
bin
]
values1
=
tmp
.
loc
[
tmp
[
group_col
]
==
groups
[
0
],
"
coverage
"
].
values
values2
=
tmp
.
loc
[
tmp
[
group_col
]
==
groups
[
1
],
"
coverage
"
].
values
p_values_ttp
.
append
(
paired_t_test
(
values1
,
values2
))
grp1
.
append
(
np
.
mean
(
values1
))
grp2
.
append
(
np
.
mean
(
values2
))
vgrp1
.
append
(
"
;
"
.
join
(
map
(
str
,
[
round
(
v
,
3
)
for
v
in
values1
])))
vgrp2
.
append
(
"
;
"
.
join
(
map
(
str
,
[
round
(
v
,
3
)
for
v
in
values2
])))
stats_df
=
pd
.
DataFrame
({
"
bin
"
:
df
[
"
bin
"
].
unique
(),
"
p_values
"
:
p_values_ttp
,
groups
[
0
]:
grp1
,
groups
[
1
]:
grp2
,
f
"
val-
{
groups
[
0
]
}
"
:
vgrp1
,
f
"
val-
{
groups
[
1
]
}
"
:
vgrp2
,})
stats_df
.
to_csv
(
outfile
,
sep
=
"
\t
"
,
index
=
False
)
return
line_maker
(
p_values_ttp
,
y_line
,
min
(
df
[
"
bin
"
].
unique
()))
def
display_stat
(
g
:
sns
.
FacetGrid
,
dff
:
pd
.
DataFrame
,
y_line
:
float
,
group_col
:
str
,
stat
:
bool
,
outfile
:
Path
)
->
sns
.
FacetGrid
:
"""
Update the graphics by displaying stats.
:param g: A seaborn FacetGrid objects corresponding to the metagene
\
figure
:param dff: A dataframe containing the coverage displayed in the figure
:param y_line: The height of the p-value line
:param group_col: The column containing the groups analyzed
:param outfile: The file where the metagene will be saved
:return: The facetGrid with the stats
"""
if
not
stat
:
return
g
stat_file
=
outfile
.
parent
/
f
"
{
outfile
.
stem
}
.txt
"
lines
,
lcolor
=
compute_stats
(
dff
,
y_line
,
group_col
,
stat_file
)
lc
=
mc
.
LineCollection
(
lines
,
colors
=
lcolor
,
linewidths
=
5
)
g
.
ax
.
add_collection
(
lc
)
return
g
def
figure_metagene
(
df_sum
:
pd
.
DataFrame
,
show_replicate
:
bool
,
border_names
:
List
[
str
],
nb_bin
:
int
,
environment
:
List
[
int
],
bed_name
:
str
,
output
:
Path
,
norm
:
Union
[
int
,
Path
],
condition_col
:
str
,
ylim
:
Optional
[
List
[
float
]])
->
Path
:
condition_col
:
str
,
ylim
:
Optional
[
List
[
float
]],
stat
:
bool
=
False
)
->
Path
:
"""
Create a metagene figure on the region of interest.
...
...
@@ -314,6 +423,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
each samples
:param condition_col: The name of the condition columns
:param ylim: The range of the y axis
:param stat: A boolean indicating wether to perform statistical analysis
:return: The name of the figure
"""
font_files
=
matplotlib
.
font_manager
.
findSystemFonts
(
fontpaths
=
None
,
...
...
@@ -356,6 +466,9 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
outfile_title
+=
"
.pdf
"
if
ylim
[
0
]
is
not
None
:
g
.
ax
.
set_ylim
(
ylim
[
0
],
ylim
[
1
])
ymin
,
ymax
=
g
.
ax
.
get_ylim
()
g
=
display_stat
(
g
,
df_sum
,
ymin
+
((
ymax
-
ymin
)
/
50
),
condition_col
,
stat
,
output
/
outfile_title
)
g
.
ax
.
tick_params
(
left
=
True
,
bottom
=
True
)
g
.
savefig
(
output
/
outfile_title
)
g
.
fig
.
clf
()
...
...
@@ -513,27 +626,6 @@ def get_shapiro_an_lm_pvalue(df: pd.DataFrame, location: str) -> pd.Series:
"
location
"
:
location
})
def
compute_stat_dataframe
(
df_sum
:
pd
.
DataFrame
,
region_name
:
str
)
->
pd
.
DataFrame
:
"""
Create a dataframe containing the statistical analysis of
\
the difference of coverage between two conditions.
:param df_sum: A dataframe containing the coverage values between
\
two conditions
:param region_name: The name of the region of interest
:return: The dataframe containing statistical analysis
"""
df_diff
=
compute_diff
(
df_sum
)
if
'
location
'
not
in
df_diff
.
columns
:
df_diff
[
"
location
"
]
=
[
region_name
]
*
df_diff
.
shape
[
0
]
list_series
=
[
get_shapiro_an_lm_pvalue
(
df_diff
,
region
)
for
region
in
df_diff
[
"
location
"
].
unique
()
]
return
pd
.
DataFrame
(
list_series
)
def
create_figure
(
design
:
Path
,
bw_folder
:
Path
,
region_beds
:
List
[
Path
],
bed_names
:
List
[
str
],
nb_bin
:
int
=
100
,
figure_type
:
str
=
'
metagene
'
,
...
...
@@ -600,13 +692,9 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
environment
,
region_kind
,
ordered_condition
,
bed_names
)
if
figure_type
==
"
metagene
"
:
outfile
=
figure_metagene
(
df_sum
,
show_replicate
,
border_names
,
nb_bin
,
environment
,
region_kind
,
output
,
norm
,
cond_col
,
ylim
)
if
stat
:
df_stat
=
compute_stat_dataframe
(
df_sum
,
region_kind
)
df_stat
.
to_csv
(
outfile
.
parent
/
f
"
{
outfile
.
stem
}
.txt
"
,
sep
=
"
\t
"
,
index
=
False
)
figure_metagene
(
df_sum
,
show_replicate
,
border_names
,
nb_bin
,
environment
,
region_kind
,
output
,
norm
,
cond_col
,
ylim
,
stat
)
else
:
if
'
location
'
in
df_sum
.
columns
:
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment