Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
ChIA-PET_network
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
LBMC
ReGArDS
ChIA-PET_network
Commits
53140828
Verified
Commit
53140828
authored
3 months ago
by
nfontrod
Browse files
Options
Downloads
Patches
Plain Diff
update creation of figure
parent
3e0a8de9
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/find_interaction_cluster/community_file_tools/perform_topgo_analysis.py
+164
-6
164 additions, 6 deletions
...on_cluster/community_file_tools/perform_topgo_analysis.py
with
164 additions
and
6 deletions
src/find_interaction_cluster/community_file_tools/perform_topgo_analysis.py
+
164
−
6
View file @
53140828
...
...
@@ -7,10 +7,17 @@ Description: Perform a topGo analysis for each list of genes in the file \
produced by the script community_file_2_gene_list in results/clusters_gene_list
"""
import
math
from
pathlib
import
Path
from
typing
import
List
import
numpy
as
np
import
pandas
as
pd
import
plotly.express
as
px
import
polars
as
pl
from
src.figures_utils.component_variance_figure
import
update_layout_fig
from
src.figures_utils.stacked_barplot_figures
import
get_color_pallette
from
..config
import
Config
...
...
@@ -30,14 +37,12 @@ def write_input_file(mfile: Path, folder: Path) -> List[Path]:
"""
input_files
=
[]
df
=
pd
.
read_csv
(
mfile
,
sep
=
"
\t
"
)
df
=
df
[
-
df
[
"
hg38_
symbol
"
].
isna
()].
copy
()
df
=
df
[
-
df
[
"
symbol
"
].
isna
()].
copy
()
for
cluster
in
df
[
"
cluster
"
].
unique
():
tmp
=
df
[
df
[
"
cluster
"
]
==
cluster
]
cl
=
cluster
.
replace
(
"
.
"
,
"
_
"
)
outfile
=
folder
/
f
"
{
cl
}
.txt
"
tmp
[[
"
hg38_symbol
"
]].
to_csv
(
outfile
,
sep
=
"
\t
"
,
index
=
False
,
header
=
False
)
tmp
[[
"
symbol
"
]].
to_csv
(
outfile
,
sep
=
"
\t
"
,
index
=
False
,
header
=
False
)
input_files
.
append
(
outfile
)
return
input_files
...
...
@@ -53,9 +58,9 @@ def write_background(mfile: Path, folder: Path) -> Path:
outf
=
folder
/
"
input_topgo
"
outf
.
mkdir
(
exist_ok
=
True
)
df
=
pd
.
read_csv
(
mfile
,
sep
=
"
\t
"
)
df
=
df
[
-
df
[
"
hg38_
symbol
"
].
isna
()].
copy
()
df
=
df
[
-
df
[
"
symbol
"
].
isna
()].
copy
()
outfile
=
folder
/
"
background.txt
"
ndf
=
pd
.
DataFrame
({
"
hg38_
symbol
"
:
df
[
"
hg38_
symbol
"
].
unique
()})
ndf
=
pd
.
DataFrame
({
"
symbol
"
:
df
[
"
symbol
"
].
unique
()})
ndf
.
to_csv
(
outfile
,
sep
=
"
\t
"
,
index
=
False
,
header
=
False
)
return
outfile
...
...
@@ -80,6 +85,158 @@ def execute_topgo(
sp
.
check_call
(
cmd
,
shell
=
True
)
def get_enrichment_files(output_folder: Path) -> list[Path]:
    """
    Collect the topGO enrichment tables stored under a folder.

    The search is recursive: any file located in ``output_folder`` or in one
    of its sub-folders whose name matches ``*_genes_CC_*.txt`` is kept.

    :param output_folder: The folder in which topGO results were written
    :return: The matching files, in the order yielded by ``Path.glob``
    """
    pattern = "**/*_genes_CC_*.txt"
    matches = output_folder.glob(pattern)
    return [found for found in matches]
def open_enrichment_file(outfile: Path) -> pl.DataFrame:
    """
    Read one tab-separated topGO enrichment table.

    The columns ``fish``, ``pvalue`` and ``padj`` are first read as text,
    then the first ``<`` found in each value is removed before the value is
    cast to a float. A ``community`` column holding the name of the folder
    that contains ``outfile`` is appended.

    :param outfile: The enrichment file to read
    :return: The enrichment table with numeric p-value columns and the \
    ``community`` column
    """
    # Columns that may carry a "<" prefix are loaded as strings on purpose.
    schema = {
        "GO.ID": pl.Utf8(),
        "Term": pl.Utf8(),
        "Annotated": pl.Int64(),
        "Significant": pl.Int64(),
        "Expected": pl.Float64(),
        "fish": pl.Utf8(),
        "pvalue": pl.Utf8(),
        "padj": pl.Utf8(),
    }
    table = pl.read_csv(outfile, separator="\t", dtypes=schema)
    # Same clean-up for the three p-value like columns: drop "<", cast.
    as_float = {
        name: pl.col(name).str.replace("<", "").cast(pl.Float64())
        for name in ("fish", "pvalue", "padj")
    }
    return table.with_columns(
        community=pl.lit(outfile.parent.name),
        **as_float,
    )
def load_many_files(outfiles: list[Path]) -> pl.DataFrame:
    """
    Read several topGO enrichment tables and stack them.

    Every file is read with ``open_enrichment_file`` and the resulting
    tables are concatenated in the order of ``outfiles``.

    :param outfiles: The enrichment files to read
    :return: A single DataFrame holding the rows of every file
    """
    tables = []
    for path in outfiles:
        tables.append(open_enrichment_file(path))
    return pl.concat(tables)
def generate_figure(
    df: pd.DataFrame,
    output: Path,
    go_term: str,
    term: str,
    cummunities_name: str,
) -> None:
    """
    Write an interactive scatter plot (HTML) for a given GO term.

    Each row of ``df`` is drawn as one point at x = ``mlog10padj`` and
    y = ``log2fc``, sized by ``Significant``. Points are colored by the
    ``SPIN`` column when it exists, by ``community`` otherwise. A dashed
    horizontal line marks y = 0 and a dashed vertical line marks
    x = -log10(0.05).

    :param df: A pandas DataFrame with the enrichment rows of one GO term. \
    It must contain the columns ``mlog10padj``, ``log2fc``, ``Significant``, \
    ``padj``, ``community`` and ``color``
    :param output: The path of the HTML file to write
    :param go_term: The GO term identifier, shown in the title
    :param term: The GO term description, shown in the title
    :param cummunities_name: The name of the clusters studied, shown in \
    the title
    """
    color_col = "SPIN" if "SPIN" in df.columns else "community"
    # Build an explicit group -> color mapping instead of handing plotly one
    # color per row: a per-row sequence is consumed per *distinct group* in
    # order of appearance, so it drifts as soon as a group owns several rows.
    # Rows without a color are ignored; plotly then uses its default palette
    # for groups that have no usable color at all.
    palette = (
        df[[color_col, "color"]]
        .dropna(subset=["color"])
        .drop_duplicates(subset=[color_col])
    )
    color_map = dict(zip(palette[color_col], palette["color"]))
    fig = px.scatter(
        df,
        x="mlog10padj",
        y="log2fc",
        color=color_col,
        size="Significant",
        hover_data=["padj", "log2fc", "community", "Significant"],
        title=f"GO Term: {go_term} - {term} - {cummunities_name}",
        color_discrete_map=color_map,
    )
    fig = update_layout_fig(fig)
    fig.update_layout(font=dict(size=15))
    # Reference lines: no enrichment (log2fc = 0) and padj = 0.05.
    fig.add_hline(
        y=0,
        line_dash="dash",
        line_color="black",
        line_width=2,
        layer="below",
    )
    fig.add_vline(
        x=-math.log10(0.05),
        line_dash="dash",
        line_color="black",
        line_width=2,
        layer="below",
    )
    fig.write_html(output)
def add_color_columns(df: pd.DataFrame, order: list[str]) -> pd.DataFrame:
    """
    Add a ``color`` column (and possibly a ``SPIN`` column) to the dataframe.

    When at least one community name contains ``Speckle``, a ``SPIN`` column
    is created from the last ``_``-separated token of each community name
    and one color is attributed per distinct SPIN value; every community
    sharing a SPIN value then receives the same color. Otherwise one color
    is attributed per community listed in ``order``.

    The dataframe is modified in place and also returned.

    :param df: The dataframe to add color columns to; it must contain a \
    ``community`` column of strings
    :param order: The order of the communities, used to order the colors
    :return: The dataframe with color columns added
    """
    # na=False keeps missing community names from breaking the detection.
    if df["community"].str.contains("Speckle", na=False).any():
        df["SPIN"] = df["community"].str.split("_").str[-1]
        # Keep the first community seen for each SPIN value, in ``order``.
        first_by_spin: dict[str, str] = {}
        for name in order:
            first_by_spin.setdefault(name.split("_")[-1], name)
        representatives = list(first_by_spin.values())
        # NOTE(review): assumes get_color_pallette returns one color per
        # element of its second argument, in the same order - confirm.
        colors = get_color_pallette("turbo", representatives)
        # Key the palette by SPIN value (not by representative community) so
        # communities sharing a SPIN do not end up with a missing color.
        c_dic = dict(zip(first_by_spin, colors))
        df["color"] = df["SPIN"].map(c_dic)
    else:
        c_dic = dict(zip(order, get_color_pallette("turbo", order)))
        df["color"] = df["community"].map(c_dic)
    return df
def create_figures(outfolder: Path, mfile: Path) -> None:
    """
    Generate the enrichment scatter plot for each significant GO term.

    Steps performed:

    1. read every enrichment file found under ``outfolder``;
    2. add ``log2fc`` = log2(Significant / Expected) and ``mlog10padj`` = \
    -log10(padj), then write the full table to \
    ``GO_<mfile stem>_enrichment.csv``;
    3. keep the GO terms having ``padj`` < 0.05 in at least one community \
    and write that subset to ``GO_<mfile stem>_filtered_enrichment.csv``;
    4. write one HTML figure per kept GO term in ``outfolder/scatter_fig``.

    :param outfolder: The output folder where the figures will be saved
    :param mfile: A file containing cluster with hg38 names
    """
    clusters = pd.read_csv(mfile, sep="\t")["cluster"].dropna().unique()
    # Community labels come from the result folder names, which are built
    # from the cluster names with "." replaced by "_" when the topGO input
    # files are written. Apply the same replacement here so the ordering and
    # the colors match those labels (assumes topGO writes its result files
    # directly in that per-cluster folder - TODO confirm). dict.fromkeys
    # removes duplicates while keeping the first-seen order.
    order = list(dict.fromkeys(str(c).replace(".", "_") for c in clusters))
    outf = outfolder / "scatter_fig"
    outf.mkdir(exist_ok=True)
    enri_file = get_enrichment_files(outfolder)
    df = load_many_files(enri_file)
    df = df.with_columns(
        log2fc=(pl.col("Significant") / pl.col("Expected")).log(base=2),
        mlog10padj=pl.col("padj").log(base=10) * -1,
    )
    df.write_csv(
        outfolder / f"GO_{mfile.stem}_enrichment.csv", separator="\t"
    )
    # GO.ID -> Term for every GO term significant in at least one community.
    significant = (
        df.filter(pl.col("padj") < 0.05)
        .select(["GO.ID", "Term"])
        .unique(subset="GO.ID")
    )
    gt = {k: v[0] for k, v in significant.rows_by_key("GO.ID").items()}
    # Keep every community row of those terms, significant or not.
    df = df.filter(pl.col("GO.ID").is_in(list(gt)))
    df.write_csv(
        outfolder / f"GO_{mfile.stem}_filtered_enrichment.csv",
        separator="\t",
    )
    df = df.to_pandas()
    df = add_color_columns(df, order)
    # Sort the rows following the cluster order found in mfile.
    df["community"] = pd.Categorical(
        df["community"], categories=order, ordered=True
    )
    df = df.sort_values("community")
    for go_term, term in gt.items():
        tmp = df[df["GO.ID"] == go_term]
        # Short, path-safe version of the description for the file name.
        nterm = term.replace(" ", "_").replace("/", "-")[:20]
        outfile = outf / f"{go_term}_{nterm}_{mfile.stem}.html"
        generate_figure(tmp, outfile, go_term, term, mfile.stem)
def
execute_cmds
(
mfile
:
Path
,
top
:
int
=
20
)
->
None
:
"""
Execute topgo for each cluster defined in mfile
...
...
@@ -97,6 +254,7 @@ def execute_cmds(mfile: Path, top: int = 20) -> None:
cfolder
=
folder
/
cinput
.
stem
cfolder
.
mkdir
(
exist_ok
=
True
)
execute_topgo
(
cinput
,
background
,
cfolder
,
top
)
create_figures
(
folder
,
mfile
)
@lp.parse
(
gene_list
=
"
file
"
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment