Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
TDD_MAPKi
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
LBMC
ReGArDS
TDD_MAPKi
Commits
38a6e219
Commit
38a6e219
authored
Aug 24, 2022
by
nfontrod
Browse files
Options
Downloads
Patches
Plain Diff
src/07_ribosome_density.py: build density figure
parent
1e80378d
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/07_ribosome_density.py
+298
-0
298 additions, 0 deletions
src/07_ribosome_density.py
with
298 additions
and
0 deletions
src/07_ribosome_density.py
0 → 100644
+
298
−
0
View file @
38a6e219
#!/usr/bine/env
# -*- condig: UTF-8 -*-
"""
The goal of this script is to compute the ribosomal density for genes in
\
experiment 245-6-7
"""
from
multiprocessing
import
context
from
pathlib
import
Path
from
typing
import
List
import
doctest
import
pandas
as
pd
from
functools
import
reduce
import
seaborn
as
sns
import
numpy
as
np
from
rpy2.robjects
import
r
,
pandas2ri
class
Config_file
:
data
=
Path
(
__file__
).
parents
[
1
]
/
"
data
"
htseq_fold
=
data
/
"
htseq_count_manip_245-6-7_density
"
htseq_rnaseq
=
htseq_fold
/
"
htseq_count_rna
"
htseq_ribo
=
htseq_fold
/
"
htseq_count_ribo
"
result
=
Path
(
__file__
).
parents
[
1
]
/
"
results
"
tdd_braf
=
result
/
"
TDD_analysis
"
/
"
correlation
"
/
"
TDD_BRAF.txt
"
tdd_dmso
=
result
/
"
TDD_analysis
"
/
"
correlation
"
/
"
TDD_DMSO.txt
"
output
=
result
/
"
TDD_analysis
"
/
"
density_figures
"
def
load_htseqfiles
(
mfolder
:
Path
)
->
List
[
Path
]:
"""
Load the list of htseq files located in that folder
:param mfolder: A folder containing htseq count file
:return: The list of files in `mfolder`
>>>
res
=
load_htseqfiles
(
Config_file
.
htseq_ribo
)
>>>
len
(
res
)
6
>>>
[
x
.
stem
for
x
in
res
[
0
:
2
]]
[
'
DMSO_245_ribo
'
,
'
DMSO_247_ribo
'
]
"""
return
list
(
Path
(
mfolder
).
glob
(
"
*.tsv
"
))
def
load_df
(
list_files
:
List
[
Path
])
->
pd
.
DataFrame
:
"""
load all count files
:param list_files: A list of htseq files
:return: A dataframe contaning counts for every files
>>>
mfiles
=
[
Config_file
.
htseq_ribo
/
"
DMSO_245_ribo.tsv
"
,
...
Config_file
.
htseq_ribo
/
"
DMSO_247_ribo.tsv
"
]
>>>
load_df
(
mfiles
).
head
()
gene
DMSO_245_ribo
DMSO_247_ribo
0
A1BG
44
84
1
A1CF
0
0
2
A2M
4376
5682
3
A2ML1
0
0
4
A3GALT2
0
0
"""
list_df
=
[
pd
.
read_csv
(
x
,
sep
=
"
\t
"
,
names
=
[
"
gene
"
,
x
.
stem
])
for
x
in
list_files
]
return
reduce
(
lambda
x
,
y
:
x
.
merge
(
y
,
how
=
"
inner
"
),
list_df
)
def
compute_cpm
(
df
:
pd
.
DataFrame
)
->
pd
.
DataFrame
:
"""
Compute the cpm of the dataframe df
:param df: A dataframe containing raw read count
:return: The dataframe with the cpm read counts
>>>
dsdf
=
pd
.
DataFrame
({
"
gene
"
:
[
f
"
gn_
{
i
}
"
for
i
in
range
(
1
,
6
)],
...
"
x1
"
:
[
10000
,
20000
,
50000
,
12000
,
8000
],
...
"
x2
"
:
[
200000
,
400000
,
1000000
,
240000
,
160000
]})
>>>
compute_cpm
(
dsdf
)
gene
x1
x2
0
gn_1
100000.0
100000.0
1
gn_2
200000.0
200000.0
2
gn_3
500000.0
500000.0
3
gn_4
120000.0
120000.0
4
gn_5
80000.0
80000.0
"""
col
=
[
c
for
c
in
df
.
columns
if
c
!=
"
gene
"
]
for
cc
in
col
:
fctr
=
1e6
/
df
[
cc
].
sum
()
df
[
cc
]
=
df
[
cc
]
*
fctr
return
df
def
compute_ribosome_density
(
df_cpm
:
pd
.
DataFrame
):
"""
compute the density of ribosomes
:param df_cpm: A dataframe containing CPM
:return: The ribosomes density for every replicates
>>>
dsdf
=
pd
.
DataFrame
({
"
gene
"
:
list
(
"
ABCDE
"
),
...
"
DMSO_245_ribo
"
:
[
5
,
10
,
15
,
20
,
25
],
...
"
DMSO_245_rna
"
:
[
5
,
5
,
5
,
5
,
5
],
...
"
DMSO_246_ribo
"
:
[
5
,
5
,
5
,
5
,
5
],
...
"
DMSO_246_rna
"
:
[
5
,
10
,
15
,
20
,
0
]})
>>>
compute_ribosome_density
(
dsdf
)
gene
treatment
replicate
density
0
A
DMSO
245
1.000000
1
A
DMSO
246
1.000000
2
B
DMSO
245
2.000000
3
B
DMSO
246
0.500000
4
C
DMSO
245
3.000000
5
C
DMSO
246
0.333333
6
D
DMSO
245
4.000000
7
D
DMSO
246
0.250000
8
E
DMSO
245
5.000000
"""
df
=
df_cpm
.
melt
(
id_vars
=
[
"
gene
"
],
value_name
=
"
cpm
"
,
var_name
=
"
sample
"
)
df
[
"
condition
"
]
=
df
[
"
sample
"
].
str
.
replace
(
"
_(ribo|rna)
"
,
""
,
regex
=
True
)
df
[
"
kind
"
]
=
df
[
"
sample
"
].
str
.
extract
(
r
'
(ribo|rna)
'
)
df
=
df
.
pivot
(
index
=
[
"
gene
"
,
"
condition
"
],
columns
=
"
kind
"
,
values
=
"
cpm
"
).
reset_index
()
df
.
columns
.
name
=
None
df
[
"
density
"
]
=
df
[
"
ribo
"
]
/
df
[
"
rna
"
]
df
[
"
treatment
"
]
=
df
[
"
condition
"
].
str
.
extract
(
r
"
(DMSO|BRAF)_
"
)
df
[
"
replicate
"
]
=
df
[
"
condition
"
].
str
.
extract
(
r
"
_(24.)
"
)
df
.
drop
([
"
ribo
"
,
"
rna
"
,
"
condition
"
],
inplace
=
True
,
axis
=
1
)
df
=
df
[(
-
df
[
"
density
"
].
isna
()
&
-
np
.
isinf
(
df
[
"
density
"
]))]
return
df
[[
"
gene
"
,
"
treatment
"
,
"
replicate
"
,
"
density
"
]]
def
create_density_df
()
->
pd
.
DataFrame
:
"""
Compute the density dataframe
"""
file_ribo
=
load_htseqfiles
(
Config_file
.
htseq_ribo
)
file_rna
=
load_htseqfiles
(
Config_file
.
htseq_rnaseq
)
mfiles
=
file_ribo
+
file_rna
df
=
load_df
(
mfiles
)
df
=
compute_cpm
(
df
)
df
=
compute_ribosome_density
(
df
)
return
df
def
add_group_column
(
df_density
:
pd
.
DataFrame
,
gene_file
:
Path
)
->
pd
.
DataFrame
:
"""
Add a column containing gene having a TDD increasing/desreasing in BRAF
condition
:param df_density: A dataframe containing densities
:param gene_file: A file containing gene whose TDD changes in BRAF
condition
>>>
genes_l
=
list
(
"
ABC
"
)
>>>
temp_file
=
Path
(
"
/tmp/tmp.genes
"
)
>>>
tf
=
temp_file
.
open
(
"
w
"
)
>>>
x
=
tf
.
write
(
"
\\
n
"
.
join
(
genes_l
))
>>>
tf
.
close
()
>>>
dsdd
=
pd
.
DataFrame
({
...
'
gene
'
:
[
'
A
'
,
'
A
'
,
'
B
'
,
'
B
'
,
'
C
'
,
'
C
'
,
'
D
'
,
'
D
'
,
'
E
'
,
'
E
'
],
...
'
treatment
'
:
[
'
DMSO
'
]
*
10
,
...
'
replicate
'
:
[
245
,
246
,
245
,
246
,
245
,
246
,
245
,
246
,
245
,
246
],
...
'
density
'
:
[
1.0
,
1.0
,
2.0
,
0.5
,
3.0
,
0.333333
,
4.0
,
0.25
,
5.0
,
0.2
]})
>>>
add_group_column
(
dsdd
,
temp_file
)
gene
treatment
replicate
density
group
0
A
DMSO
245
1.000000
tmp
1
A
DMSO
246
1.000000
tmp
2
B
DMSO
245
2.000000
tmp
3
B
DMSO
246
0.500000
tmp
4
C
DMSO
245
3.000000
tmp
5
C
DMSO
246
0.333333
tmp
6
D
DMSO
245
4.000000
CTRL
7
D
DMSO
246
0.250000
CTRL
8
E
DMSO
245
5.000000
CTRL
9
E
DMSO
246
0.200000
CTRL
>>>
temp_file
.
unlink
()
"""
genes
=
gene_file
.
open
(
"
r
"
).
read
().
splitlines
()
df_density
[
"
group
"
]
=
[
"
CTRL
"
]
*
df_density
.
shape
[
0
]
df_density
.
loc
[
df_density
[
"
gene
"
].
isin
(
genes
),
"
group
"
]
=
gene_file
.
stem
return
df_density
def
avg_replicate
(
df_density
:
pd
.
DataFrame
)
->
pd
.
DataFrame
:
"""
Averages the density by gene and drop replicate column
:param df_density: A dataframe containing ribosomes densities
>>>
dsdd
=
pd
.
DataFrame
(
...
{
'
gene
'
:
[
'
A
'
,
'
A
'
,
'
B
'
,
'
B
'
,
'
C
'
,
'
C
'
,
'
D
'
,
'
D
'
,
'
E
'
,
'
E
'
],
...
'
treatment
'
:
[
'
DMSO
'
]
*
10
,
...
'
replicate
'
:
[
245
,
246
]
*
5
,
...
'
density
'
:
[
1.0
,
1.0
,
2.0
,
0.5
,
3.0
,
0.333333
,
4.0
,
0.25
,
5.0
,
0.2
],
...
'
group
'
:
[
'
tmp
'
]
*
6
+
[
'
CTRL
'
]
*
4
})
>>>
avg_replicate
(
dsdd
)
gene
treatment
group
density
0
A
DMSO
tmp
1.000000
1
B
DMSO
tmp
1.250000
2
C
DMSO
tmp
1.666667
3
D
DMSO
CTRL
2.125000
4
E
DMSO
CTRL
2.600000
"""
df
=
df_density
.
drop
(
"
replicate
"
,
axis
=
1
)
\
.
groupby
([
"
gene
"
,
"
treatment
"
,
"
group
"
])
\
.
mean
().
reset_index
()
return
df
def
statistical_analysis
(
df_density
:
pd
.
DataFrame
,
output
:
Path
)
->
float
:
"""
Get the density pvalue
:param df_density: A density dataframe
:type df_density: pd.DataFrame
:param outout: The folder where the diagnostics will be produced
:return: The pvalue needed
"""
pandas2ri
.
activate
()
output
=
output
/
"
diag
"
output
.
mkdir
(
exist_ok
=
True
)
treat
=
df_density
[
"
treatment
"
].
unique
()[
0
]
outfig
=
output
/
f
"
diag_
{
treat
}
.pdf
"
outpval
=
output
/
f
"
diag_pval_
{
treat
}
.txt
"
stat_s
=
r
(
f
"""
require(
"
DHARMa
"
)
require(
"
glmmTMB
"
)
function(data, outfig, outpval){{
mod <- glmmTMB(log1p(density) ~ group + replicate, ziformula = ~1,
family=
"
ziGamma
"
, data=data)
simulationOutput <- simulateResiduals(fittedModel = mod, n = 200)
pdf(outfig)
plot(simulationOutput)
dev.off()
res <- summary(mod)
tmp <- as.data.frame(res$coefficients$cond)
tmp[
'
col
'
] <- rownames(tmp)
write.table(tmp, outpval, sep=
"
\t
"
)
return(res$coefficients$cond[2, 4])
}}
"""
)
res
=
stat_s
(
df_density
,
str
(
outfig
),
str
(
outpval
))
return
float
(
res
)
def
create_density_figure
(
df_density
:
pd
.
DataFrame
,
gene_file
:
Path
,
output_file
:
Path
)
->
None
:
"""
Create a violinplot indicating the ribosome densities between the groups
of genes studied between each treatment
:param df_density: A DataFrame of density
:param gene_file: The gene_file used to build the group column of the
\
df_density dataframe
:param output_file: File where the figure will be created
"""
sns
.
set
(
context
=
"
talk
"
)
df_mean
=
avg_replicate
(
df_density
)
df_mean
[
"
log10_density
"
]
=
np
.
log
(
df_mean
[
"
density
"
]
+
1
)
g
=
sns
.
catplot
(
x
=
"
treatment
"
,
y
=
"
log10_density
"
,
hue
=
"
group
"
,
data
=
df_mean
,
kind
=
"
violin
"
,
height
=
10
,
aspect
=
1.7
,
palette
=
{
"
TDD_BRAF
"
:
"
#a600ff
"
,
"
CTRL
"
:
"
dimgrey
"
,
"
TDD_DMSO
"
:
"
#ff8400
"
},
cut
=
True
)
pdmso
=
statistical_analysis
(
df_density
[
df_density
[
"
treatment
"
]
==
"
DMSO
"
].
copy
(),
output_file
.
parent
)
pbraf
=
statistical_analysis
(
df_density
[
df_density
[
"
treatment
"
]
==
"
BRAF
"
].
copy
(),
output_file
.
parent
)
pdmso
=
"
%.2e
"
%
pdmso
pbraf
=
"
%.2e
"
%
pbraf
g
.
fig
.
suptitle
(
f
"
Ribosome density between
{
gene_file
.
stem
}
and CTRL
"
f
"
genes in BRAF and DMSO condition
\n
"
f
"
(p_treatment_BRAF =
{
pbraf
}
& p_treatment_BRAF =
{
pdmso
}
"
)
g
.
set
(
ylabel
=
"
log10(density +1)
"
)
g
.
savefig
(
output_file
)
def
density_figure_maker
():
df
=
create_density_df
()
list_files
=
[
Config_file
.
tdd_braf
,
Config_file
.
tdd_dmso
]
for
mfile
in
list_files
:
dfc
=
add_group_column
(
df
.
copy
(),
mfile
)
Config_file
.
output
.
mkdir
(
exist_ok
=
True
)
output_fig
=
Config_file
.
output
/
\
f
"
density_figure_gene_
{
mfile
.
stem
}
.pdf
"
dfc
.
to_csv
(
output_fig
.
parent
/
f
"
{
output_fig
.
stem
}
.txt
"
,
sep
=
"
\t
"
,
index
=
False
)
create_density_figure
(
dfc
,
mfile
,
output_fig
)
if
__name__
==
"
__main__
"
:
import
sys
if
len
(
sys
.
argv
)
>=
2
and
sys
.
argv
[
1
]
==
"
test
"
:
doctest
.
testmod
()
else
:
density_figure_maker
()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment