Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
ChIA-PET_network
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
LBMC
ReGArDS
ChIA-PET_network
Commits
aacaf00f
Commit
aacaf00f
authored
4 years ago
by
nfontrod
Browse files
Options
Downloads
Patches
Plain Diff
src/find_interaction_cluster/nt_and_community.py: add permutations tests
parent
10a59086
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/find_interaction_cluster/nt_and_community.py
+242
-49
242 additions, 49 deletions
src/find_interaction_cluster/nt_and_community.py
with
242 additions
and
49 deletions
src/find_interaction_cluster/nt_and_community.py
+
242
−
49
View file @
aacaf00f
...
@@ -7,12 +7,11 @@ Description: The goal of this script is to see if the nucleotide \
...
@@ -7,12 +7,11 @@ Description: The goal of this script is to see if the nucleotide \
frequency is not random between communities
frequency is not random between communities
"""
"""
import
pandas
as
pd
import
pandas
as
pd
import
logging
import
logging
import
sqlite3
import
sqlite3
from
.config
import
ConfigGraph
,
get_communities
from
.config
import
ConfigGraph
,
get_communities
from
typing
import
List
from
typing
import
List
,
Tuple
,
Dict
from
doctest
import
testmod
from
doctest
import
testmod
from
functools
import
reduce
from
functools
import
reduce
from
pathlib
import
Path
from
pathlib
import
Path
...
@@ -22,6 +21,9 @@ from .community_finder import get_projects
...
@@ -22,6 +21,9 @@ from .community_finder import get_projects
from
..logging_conf
import
logging_def
from
..logging_conf
import
logging_def
from
itertools
import
product
from
itertools
import
product
import
multiprocessing
as
mp
import
multiprocessing
as
mp
import
numpy
as
np
from
.radomization_test_ppi
import
get_pvalue
from
tqdm
import
tqdm
def
get_nt_frequency
(
cnx
:
sqlite3
.
Connection
,
list_ft
:
List
[
str
],
def
get_nt_frequency
(
cnx
:
sqlite3
.
Connection
,
list_ft
:
List
[
str
],
...
@@ -82,20 +84,20 @@ def get_community_table(communities: List[List[str]],
...
@@ -82,20 +84,20 @@ def get_community_table(communities: List[List[str]],
>>>
c
=
[[
'
1_1
'
,
'
2_5
'
],
[
'
7_9
'
,
'
4_19
'
,
'
3_3
'
]]
>>>
c
=
[[
'
1_1
'
,
'
2_5
'
],
[
'
7_9
'
,
'
4_19
'
,
'
3_3
'
]]
>>>
get_community_table
(
c
,
3
,
'
exon
'
)
>>>
get_community_table
(
c
,
3
,
'
exon
'
)
community
id_exon
community_size
community
id_exon
community_size
0
C
1
7_9
3
0
C
2
7_9
3
1
C
1
4_19
3
1
C
2
4_19
3
2
C
1
3_3
3
2
C
2
3_3
3
>>>
c
=
[[
'
1
'
,
'
2
'
],
[
'
7
'
,
'
49
'
,
'
3
'
]]
>>>
c
=
[[
'
1
'
,
'
2
'
],
[
'
7
'
,
'
49
'
,
'
3
'
]]
>>>
get_community_table
(
c
,
3
,
'
gene
'
)
>>>
get_community_table
(
c
,
3
,
'
gene
'
)
community
id_gene
community_size
community
id_gene
community_size
0
C
1
7
3
0
C
2
7
3
1
C
1
49
3
1
C
2
49
3
2
C
1
3
3
2
C
2
3
3
"""
"""
dic
=
{
"
community
"
:
[],
f
"
id_
{
feature
}
"
:
[],
"
community_size
"
:
[]}
dic
=
{
"
community
"
:
[],
f
"
id_
{
feature
}
"
:
[],
"
community_size
"
:
[]}
for
k
,
comm
in
enumerate
(
communities
):
for
k
,
comm
in
enumerate
(
communities
):
if
len
(
comm
)
>=
size_threshold
:
if
len
(
comm
)
>=
size_threshold
:
name
=
f
'
C
{
k
}
'
name
=
f
'
C
{
k
+
1
}
'
clen
=
len
(
comm
)
clen
=
len
(
comm
)
for
exon
in
comm
:
for
exon
in
comm
:
dic
[
"
community
"
].
append
(
name
)
dic
[
"
community
"
].
append
(
name
)
...
@@ -173,7 +175,7 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str
...
@@ -173,7 +175,7 @@ def lmm_maker_summary(df: pd.DataFrame, outfile: Path, nt: str
res_df
.
rename
({
'
index
'
:
'
community
'
},
inplace
=
True
,
axis
=
1
)
res_df
.
rename
({
'
index
'
:
'
community
'
},
inplace
=
True
,
axis
=
1
)
res_df
[
'
community
'
]
=
res_df
[
'
community
'
].
str
.
replace
(
'
community
'
,
''
)
res_df
[
'
community
'
]
=
res_df
[
'
community
'
].
str
.
replace
(
'
community
'
,
''
)
res_df
.
loc
[
res_df
[
'
community
'
]
==
"
(Intercept)
"
,
"
community
"
]
=
"
C-CTRL
"
res_df
.
loc
[
res_df
[
'
community
'
]
==
"
(Intercept)
"
,
"
community
"
]
=
"
C-CTRL
"
mean_df
=
df
[[
nt
,
"
community
"
,
"
community_size
"
]].
\
mean_df
=
df
[[
nt
,
"
community
"
,
"
community_size
"
]].
\
groupby
([
"
community
"
,
"
community_size
"
]).
mean
().
reset_index
()
groupby
([
"
community
"
,
"
community_size
"
]).
mean
().
reset_index
()
return
res_df
.
merge
(
mean_df
,
how
=
"
left
"
,
on
=
"
community
"
)
return
res_df
.
merge
(
mean_df
,
how
=
"
left
"
,
on
=
"
community
"
)
...
@@ -193,21 +195,18 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]:
...
@@ -193,21 +195,18 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]:
return
[
str
(
cid
[
0
])
for
cid
in
res
]
return
[
str
(
cid
[
0
])
for
cid
in
res
]
def
create_ctrl_community
(
df
:
pd
.
DataFrame
,
outfile
:
Path
,
def
create_ctrl_community
(
df
:
pd
.
DataFrame
,
feature
:
str
=
'
exon
'
,
region
:
str
=
""
feature
:
str
=
'
exon
'
,
region
:
str
=
""
)
->
pd
.
DataFrame
:
)
->
pd
.
DataFrame
:
"""
"""
:param df: A dataframe containing the frequency of each feature in a
\
:param df: A dataframe containing the frequency of each feature in a
\
community.
community.
:param outfile: The output table containing frequencies
:param feature: The kind of feature to analyse
:param feature: The kind of feature to analyse
:param region: only use if feature is
'
gene
'
. Used to focus on
\
:param region: only use if feature is
'
gene
'
. Used to focus on
\
a given region in genes (can be gene, exon, intron).
a given region in genes (can be gene, exon, intron).
:return: A dataframe containing the frequency of every nucleotides
\
:return: A dataframe containing the frequency of every nucleotides
\
of every exon in a large community
of every exon in a large community
"""
"""
if
outfile
.
is_file
():
return
pd
.
read_csv
(
outfile
,
sep
=
"
\t
"
)
size_threshold
=
10
if
feature
==
"
gene
"
else
50
size_threshold
=
10
if
feature
==
"
gene
"
else
50
cnx
=
sqlite3
.
connect
(
ConfigGraph
.
db_file
)
cnx
=
sqlite3
.
connect
(
ConfigGraph
.
db_file
)
ft_id
=
get_ft_id
(
cnx
,
feature
)
ft_id
=
get_ft_id
(
cnx
,
feature
)
...
@@ -218,15 +217,165 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path,
...
@@ -218,15 +217,165 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path,
df_nt
=
df_nt
.
merge
(
df_com
,
how
=
"
left
"
,
on
=
f
"
id_
{
feature
}
"
)
df_nt
=
df_nt
.
merge
(
df_com
,
how
=
"
left
"
,
on
=
f
"
id_
{
feature
}
"
)
df_nt
[
'
community
'
]
=
[
'
C-CTRL
'
]
*
df_nt
.
shape
[
0
]
df_nt
[
'
community
'
]
=
[
'
C-CTRL
'
]
*
df_nt
.
shape
[
0
]
df
=
pd
.
concat
([
df
,
df_nt
],
axis
=
0
,
ignore_index
=
True
)
df
=
pd
.
concat
([
df
,
df_nt
],
axis
=
0
,
ignore_index
=
True
)
df
.
to_csv
(
outfile
,
sep
=
"
\t
"
,
index
=
False
)
return
df
return
df
def
lmm_with_ctrl
(
df
:
pd
.
DataFrame
,
feature
:
str
,
region
:
str
,
nt
:
str
,
outfile_diag
:
Path
)
->
pd
.
DataFrame
:
"""
:param df: df: A dataframe containing the frequency of each nucleotide
\
in each exons belonging to a community.
:param feature: The kind of feature analysed
:param region: the region of interest to extract from gene
:param nt: The nucleotide of interest
:param outfile_diag: File from which the diagnostics folder will be
\
inferred
:return: The dataframe with the p-value compared to the control
\
list of exons.
"""
ndf
=
create_ctrl_community
(
df
,
feature
,
region
)
return
lmm_maker_summary
(
ndf
,
outfile_diag
,
nt
)
def
get_feature_by_community
(
df
:
pd
.
DataFrame
,
feature
:
str
)
->
Dict
:
"""
Create a dictionary containing the exons contained in each community.
:param df: A dataframe containing the frequency of each nucleotide
\
in each exons belonging to a community.
:param feature: the feature of interest (exon, gene)
:return: A dictionary linking each community to the exons it contains
>>>
dataf
=
pd
.
DataFrame
({
"
id_gene
"
:
[
'
1
'
,
'
2
'
,
'
3
'
,
'
4
'
],
...
'
community
'
:
[
'
C1
'
,
'
C1
'
,
'
C2
'
,
'
C2
'
]})
>>>
get_feature_by_community
(
dataf
,
'
gene
'
)
{
'
C1
'
:
[
'
1
'
,
'
2
'
],
'
C2
'
:
[
'
3
'
,
'
4
'
]}
>>>
dataf
.
rename
({
"
id_gene
"
:
"
id_exon
"
},
axis
=
1
,
inplace
=
True
)
>>>
get_feature_by_community
(
dataf
,
'
exon
'
)
{
'
C1
'
:
[
'
1
'
,
'
2
'
],
'
C2
'
:
[
'
3
'
,
'
4
'
]}
"""
dic
=
{}
for
i
in
range
(
df
.
shape
[
0
]):
com
,
id_ft
=
df
.
iloc
[
i
,
:][[
'
community
'
,
f
'
id_
{
feature
}
'
]]
if
com
in
dic
:
dic
[
com
].
append
(
id_ft
)
else
:
dic
[
com
]
=
[
id_ft
]
return
dic
def
get_permutation_mean
(
df_ctrl
:
pd
.
DataFrame
,
nt
:
str
,
size
:
int
,
iteration
:
int
)
->
List
[
float
]:
"""
Randomly sample `size` `feature` from `df_ctrl` to extract `iteration`
\
of `nt` frequencies from it.
:param df_ctrl: A dataframe containing the frequency of each nucleotide
\
in each exons/gene in fasterdb.
:param nt: A nucleotide of interest
:param size: The size of each sub samples to create
:param iteration: The number of sub samples to create
:return: The list of mean frequencies of `nt` in each subsample
"""
return
[
float
(
np
.
mean
(
df_ctrl
[
nt
].
sample
(
size
,
replace
=
True
).
values
))
for
_
in
range
(
iteration
)
]
def
perm_community_pval
(
row
:
pd
.
Series
,
df_ctrl
:
pd
.
DataFrame
,
nt
:
str
,
iteration
:
int
)
->
Tuple
[
float
,
float
,
str
]:
"""
Randomly sample `size` `feature` from `df_ctrl` to extract `iteration`
\
of `nt` frequencies from it.
:param row: A line of a dataframe containing the frequency of
\
each feature inside a community.
:param df_ctrl: A dataframe containing the frequency of each nucleotide
\
in each exons/gene in fasterdb.
:param nt: A nucleotide of interest
:param iteration: The number of sub samples to create
:return: The ctrl mean frequency value of `nt`
\
the pvalue and the regulation of the enrichment/impoverishment
\
of the community in `row` compared to control exons.
"""
list_values
=
get_permutation_mean
(
df_ctrl
,
nt
,
row
[
"
community_size
"
],
iteration
)
pval
,
reg
=
get_pvalue
(
np
.
array
(
list_values
),
row
[
nt
],
iteration
)
return
float
(
np
.
mean
(
list_values
)),
pval
,
reg
def
perm_pvalues
(
df
:
pd
.
DataFrame
,
df_ctrl
:
pd
.
DataFrame
,
feature
:
str
,
nt
:
str
,
iteration
:
int
,
dic_com
:
Dict
)
->
pd
.
DataFrame
:
"""
Randomly sample `size` `feature` from `df_ctrl` to extract `iteration`
\
of `nt` frequencies from it.
:param df: A dataframe containing the frequency of each nucleotide
\
in each exons belonging to a community.
:param df_ctrl: A dataframe containing the frequency of each nucleotide
\
in each exons/gene in fasterdb.
:param feature: The feature of interest (gene, exon)
:param nt: A nucleotide of interest
:param iteration: The number of sub samples to create
:param dic_com: A dictionary linking each community to the exons
\
it contains.
:return: The dataframe containing p-values and regulation
\
indicating the enrichment of
"""
list_pval
,
list_reg
,
mean_ctrl
=
([]
for
_
in
range
(
3
))
for
i
in
tqdm
(
range
(
df
.
shape
[
0
]),
desc
=
"
performing permutations
"
):
row
=
df
.
iloc
[
i
,
:]
res
=
perm_community_pval
(
row
,
df_ctrl
.
loc
[
-
df_ctrl
[
f
'
id_
{
feature
}
'
].
isin
(
dic_com
[
row
[
'
community
'
]]),
:],
nt
,
iteration
)
[
x
.
append
(
y
)
for
x
,
y
in
zip
([
mean_ctrl
,
list_pval
,
list_reg
],
res
)]
adj_pvals
=
multipletests
(
list_pval
,
alpha
=
0.05
,
method
=
'
fdr_bh
'
,
is_sorted
=
False
,
returnsorted
=
False
)[
1
]
adj_regs
=
[
list_reg
[
i
]
if
adj_pvals
[
i
]
<=
0.05
else
"
.
"
for
i
in
range
(
len
(
list_reg
))]
df
[
f
'
mean_
{
iteration
}
_ctrl
'
]
=
mean_ctrl
df
[
f
'
p-adj
'
]
=
adj_pvals
df
[
f
'
reg-adj
'
]
=
adj_regs
return
df
def
perm_with_ctrl
(
df
:
pd
.
DataFrame
,
feature
:
str
,
nt
:
str
,
df_ctrl
:
pd
.
DataFrame
,
dic_com
:
Dict
,
iteration
:
int
)
->
pd
.
DataFrame
:
"""
:param df: A dataframe containing the frequency of each nucleotide
\
in each exons belonging to a community.
:param feature: The kind of feature analysed
:param nt: The nucleotide of interest
:param df_ctrl: A dataframe containing the frequency of each nucleotide
\
in each exons/gene in fasterdb.
:param dic_com: A dictionary linking each community to the exons
\
it contains.
:param iteration: The number of sub samples to create
:return: The dataframe with the p-value compared to the control
\
list of exons.
"""
mean_df
=
df
[[
nt
,
"
community
"
,
"
community_size
"
]].
\
groupby
([
"
community
"
,
"
community_size
"
]).
mean
().
reset_index
()
return
perm_pvalues
(
mean_df
,
df_ctrl
,
feature
,
nt
,
iteration
,
dic_com
)
def
get_stat_nt_communities
(
df
:
pd
.
DataFrame
,
project
:
str
,
weight
:
int
,
def
get_stat_nt_communities
(
df
:
pd
.
DataFrame
,
project
:
str
,
weight
:
int
,
global_weight
:
int
,
same_gene
:
bool
,
global_weight
:
int
,
same_gene
:
bool
,
nt
:
str
,
feature
:
str
=
"
exon
"
,
nt
:
str
,
df_ctrl
:
pd
.
DataFrame
,
region
:
str
=
""
,
dic_com
:
Dict
,
)
->
pd
.
Series
:
feature
:
str
=
"
exon
"
,
region
:
str
=
""
,
test_type
:
str
=
""
,
iteration
:
int
=
1000
)
->
pd
.
Series
:
"""
"""
Get data (proportion of `reg` regulated exons by a splicing factor
Get data (proportion of `reg` regulated exons by a splicing factor
`sf_name` within a community) for every community.
`sf_name` within a community) for every community.
...
@@ -242,17 +391,22 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
...
@@ -242,17 +391,22 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
:param same_gene: Say if we consider as co-localised, exons within the
\
:param same_gene: Say if we consider as co-localised, exons within the
\
same gene (True) or not (False) (default False)
same gene (True) or not (False) (default False)
:param nt: The nucleotide of interest
:param nt: The nucleotide of interest
:param feature: The king of feature analysed
:param df_ctrl: A dataframe containing the frequency of each nucleotide
\
in each exons/gene in fasterdb.
:param dic_com: A dictionary linking each community to the exons
\
it contains.
:param feature: The kind of feature analysed
:param region: the region of interest to extract from gene
:param region: the region of interest to extract from gene
:param test_type: The type of test to make (permutation or lmm)
:param iteration: The number of sub samples to create
"""
"""
logging
.
debug
(
f
"
lmm
for
{
project
}
, w:
{
weight
}
,
"
logging
.
debug
(
f
"
{
test_type
}
for
{
project
}
, w:
{
weight
}
,
"
f
"
g:
{
global_weight
}
nt:
{
nt
}
"
)
f
"
g:
{
global_weight
}
nt:
{
nt
}
"
)
res
=
{
"
project
"
:
[],
"
nt
"
:
[],
'
pval
'
:
[]}
res
=
{
"
project
"
:
[],
"
nt
"
:
[],
'
pval
'
:
[]}
outfile
=
ConfigGraph
.
get_community_file
(
project
,
weight
,
outfile
=
ConfigGraph
.
get_community_file
(
project
,
weight
,
global_weight
,
global_weight
,
same_gene
,
feature
,
same_gene
,
feature
,
f
"
{
nt
}
_stat.txt
"
,
f
"
{
nt
}
_stat
_
{
test_type
}
.txt
"
,
"
sf_community_enrichment
"
)
"
sf_community_enrichment
"
)
nfolder
=
outfile
.
parent
/
"
nt_analysis
"
nfolder
=
outfile
.
parent
/
"
nt_analysis
"
nfolder
.
mkdir
(
exist_ok
=
True
,
parents
=
True
)
nfolder
.
mkdir
(
exist_ok
=
True
,
parents
=
True
)
...
@@ -261,23 +415,25 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
...
@@ -261,23 +415,25 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
res
[
'
project
'
]
=
project
res
[
'
project
'
]
=
project
res
[
'
nt
'
]
=
nt
res
[
'
nt
'
]
=
nt
res
[
'
pval
'
]
=
pval
res
[
'
pval
'
]
=
pval
nt_ctrl_table
=
noutfile
.
parent
/
noutfile
.
name
.
replace
(
"
_stat.txt
"
,
"
_ctrl.txt
"
)
print
(
df
.
head
())
ndf
=
create_ctrl_community
(
df
,
nt_ctrl_table
,
feature
,
region
)
sum_df
=
lmm_maker_summary
(
ndf
,
outfile
,
nt
)
outfile_ctrl
=
ConfigGraph
.
get_community_file
(
project
,
weight
,
outfile_ctrl
=
ConfigGraph
.
get_community_file
(
project
,
weight
,
global_weight
,
global_weight
,
same_gene
,
feature
,
same_gene
,
feature
,
f
"
{
nt
}
_VS_CTRL_stat.txt
"
,
f
"
{
nt
}
_VS_CTRL_stat_
"
f
"
{
test_type
}
.txt
"
,
"
sf_community_enrichment
"
)
"
sf_community_enrichment
"
)
if
test_type
==
"
lmm
"
:
rdf
=
lmm_with_ctrl
(
df
,
feature
,
region
,
nt
,
outfile
)
else
:
rdf
=
perm_with_ctrl
(
df
,
feature
,
nt
,
df_ctrl
,
dic_com
,
iteration
)
noutfile_ctrl
=
nfolder
/
outfile_ctrl
.
name
noutfile_ctrl
=
nfolder
/
outfile_ctrl
.
name
sum_
df
.
to_csv
(
noutfile_ctrl
,
sep
=
"
\t
"
,
index
=
False
)
r
df
.
to_csv
(
noutfile_ctrl
,
sep
=
"
\t
"
,
index
=
False
)
return
pd
.
Series
(
res
)
return
pd
.
Series
(
res
)
def
create_dataframe
(
project
:
str
,
weight
:
int
,
global_weight
:
int
,
def
create_dataframe
(
project
:
str
,
weight
:
int
,
global_weight
:
int
,
same_gene
:
bool
,
feature
:
str
=
'
exon
'
)
->
pd
.
DataFrame
:
same_gene
:
bool
,
feature
:
str
=
'
exon
'
,
region
:
str
=
""
,
from_communities
:
bool
=
True
)
->
pd
.
DataFrame
:
"""
"""
:param project: The name of the project of interest
:param project: The name of the project of interest
:param weight: The minimum weight of interaction to consider
:param weight: The minimum weight of interaction to consider
...
@@ -288,29 +444,65 @@ def create_dataframe(project: str, weight: int, global_weight: int,
...
@@ -288,29 +444,65 @@ def create_dataframe(project: str, weight: int, global_weight: int,
:param same_gene: Say if we consider as co-localised, exons within the
\
:param same_gene: Say if we consider as co-localised, exons within the
\
same gene (True) or not (False)
same gene (True) or not (False)
:param feature: The kind of feature to analyse
:param feature: The kind of feature to analyse
:param from_communities: True if we only select gene/exons
:param region: the region of interest to extract from gene
:return: A dataframe containing the frequency of every nucleotides
\
:return: A dataframe containing the frequency of every nucleotides
\
of every exon in a large community
of every exon in a large community
"""
"""
size_threshold
=
10
if
feature
==
"
gene
"
else
50
size_threshold
=
10
if
feature
==
"
gene
"
else
50
result
=
ConfigGraph
.
get_community_file
(
project
,
weight
,
global_weight
,
same_gene
,
feature
,
"
_communities.txt
"
)
communities
=
get_communities
(
result
,
0
)
ncommunities
=
[
c
for
c
in
communities
if
len
(
c
)
>=
size_threshold
]
cnx
=
sqlite3
.
connect
(
ConfigGraph
.
db_file
)
cnx
=
sqlite3
.
connect
(
ConfigGraph
.
db_file
)
list_exon
=
reduce
(
lambda
x
,
y
:
x
+
y
,
ncommunities
)
if
from_communities
:
df
=
get_nt_frequency
(
cnx
,
list_exon
,
feature
)
result
=
ConfigGraph
.
get_community_file
(
project
,
weight
,
global_weight
,
df_com
=
get_community_table
(
communities
,
size_threshold
,
feature
)
same_gene
,
feature
,
df
=
df
.
merge
(
df_com
,
how
=
"
left
"
,
on
=
f
"
id_
{
feature
}
"
)
"
_communities.txt
"
)
communities
=
get_communities
(
result
,
0
)
ncommunities
=
[
c
for
c
in
communities
if
len
(
c
)
>=
size_threshold
]
list_exon
=
reduce
(
lambda
x
,
y
:
x
+
y
,
ncommunities
)
df_com
=
get_community_table
(
communities
,
size_threshold
,
feature
)
df
=
get_nt_frequency
(
cnx
,
list_exon
,
feature
,
region
)
df
=
df
.
merge
(
df_com
,
how
=
"
left
"
,
on
=
f
"
id_
{
feature
}
"
)
else
:
list_exon
=
get_ft_id
(
cnx
,
feature
)
df
=
get_nt_frequency
(
cnx
,
list_exon
,
feature
,
region
)
return
df
return
df
def
create_dataframes
(
project
,
weight
,
global_weight
,
same_gene
,
feature
,
region
,
test_type
)
->
Tuple
[
pd
.
DataFrame
,
pd
.
DataFrame
,
Dict
]:
"""
:param weight: The weight of interaction to consider
:param global_weight: The global weighs to consider. if
\
the global weight is equal to 0 then the project `project` is
\
used.
:param project: The project name, used only if global_weight = 0
:param same_gene: Say if we consider as co-localised exon within the
\
same gene
:param feature: The kind of analysed feature
:param region: the region of interest to extract from gene
:param test_type: The type of test to make (permutation or lmm)
"""
df
=
create_dataframe
(
project
,
weight
,
global_weight
,
same_gene
,
feature
,
region
)
if
test_type
==
'
lmm
'
:
df_ctrl
=
pd
.
DataFrame
()
dic_com
=
{}
else
:
df_ctrl
=
create_dataframe
(
project
,
weight
,
global_weight
,
same_gene
,
feature
,
region
,
from_communities
=
False
)
dic_com
=
get_feature_by_community
(
df
,
feature
)
return
df
,
df_ctrl
,
dic_com
def
multiple_nt_lmm_launcher
(
ps
:
int
,
def
multiple_nt_lmm_launcher
(
ps
:
int
,
weight
:
int
,
weight
:
int
,
global_weight
:
int
,
global_weight
:
int
,
project
:
str
,
project
:
str
,
same_gene
:
bool
,
same_gene
:
bool
,
feature
:
str
=
'
exon
'
,
region
:
str
=
''
,
feature
:
str
=
'
exon
'
,
region
:
str
=
''
,
test_type
:
str
=
"
lmm
"
,
iteration
:
int
=
1000
,
logging_level
:
str
=
"
DISABLE
"
):
logging_level
:
str
=
"
DISABLE
"
):
"""
"""
Launch the statistical analysis for every
Launch the statistical analysis for every
...
@@ -325,6 +517,8 @@ def multiple_nt_lmm_launcher(ps: int,
...
@@ -325,6 +517,8 @@ def multiple_nt_lmm_launcher(ps: int,
same gene
same gene
:param feature: The kind of analysed feature
:param feature: The kind of analysed feature
:param region: the region of interest to extract from gene
:param region: the region of interest to extract from gene
:param test_type: The type of test to make (permutation or lmm)
:param iteration: The number of iteration to perform
:param logging_level: Level of information to display
:param logging_level: Level of information to display
"""
"""
ConfigGraph
.
community_folder
.
mkdir
(
exist_ok
=
True
,
parents
=
True
)
ConfigGraph
.
community_folder
.
mkdir
(
exist_ok
=
True
,
parents
=
True
)
...
@@ -332,22 +526,21 @@ def multiple_nt_lmm_launcher(ps: int,
...
@@ -332,22 +526,21 @@ def multiple_nt_lmm_launcher(ps: int,
logging
.
info
(
"
Checking if communities as an effect on nucleotide
"
logging
.
info
(
"
Checking if communities as an effect on nucleotide
"
"
frequency
"
)
"
frequency
"
)
project
=
get_projects
(
global_weight
,
project
)
project
=
get_projects
(
global_weight
,
project
)
nt_list
=
[
"
A
"
,
"
C
"
,
"
G
"
,
"
T
"
,
"
S
"
,
"
W
"
]
nt_list
=
[
"
A
"
]
#
, "C", "G", "T", "S", "W"]
condition
=
list
(
product
([
project
],
[
weight
],
nt_list
))
condition
=
list
(
product
([
project
],
[
weight
],
nt_list
))
processes
=
[]
processes
=
[]
pool
=
mp
.
Pool
(
processes
=
min
(
ps
,
len
(
condition
)))
pool
=
mp
.
Pool
(
processes
=
min
(
ps
,
len
(
condition
)))
logging
.
debug
(
"
Calculating stats...
"
)
logging
.
debug
(
"
Creating tables
"
)
df
,
df_ctrl
,
dic_com
=
create_dataframes
(
project
,
weight
,
global_weight
,
same_gene
,
feature
,
region
,
test_type
)
for
project
,
weight
,
nt
in
condition
:
for
project
,
weight
,
nt
in
condition
:
df
=
create_dataframe
(
project
,
weight
,
global_weight
,
same_gene
,
feature
)
nfile_table
=
ConfigGraph
.
get_community_file
(
nfile_table
=
ConfigGraph
.
get_community_file
(
project
,
weight
,
global_weight
,
same_gene
,
feature
,
project
,
weight
,
global_weight
,
same_gene
,
feature
,
f
"
_nt_table.txt
"
,
"
sf_community_enrichment
"
)
f
"
_nt_table.txt
"
,
"
sf_community_enrichment
"
)
df
.
to_csv
(
nfile_table
,
sep
=
"
\t
"
,
index
=
False
)
df
.
to_csv
(
nfile_table
,
sep
=
"
\t
"
,
index
=
False
)
args
=
[
df
,
project
,
weight
,
global_weight
,
same_gene
,
nt
,
df_ctrl
,
dic_com
,
feature
,
region
,
test_type
,
iteration
]
args
=
[
df
,
project
,
weight
,
global_weight
,
same_gene
,
nt
,
feature
,
region
]
processes
.
append
(
pool
.
apply_async
(
get_stat_nt_communities
,
args
))
processes
.
append
(
pool
.
apply_async
(
get_stat_nt_communities
,
args
))
results
=
[
p
.
get
(
timeout
=
None
)
for
p
in
processes
]
results
=
[
p
.
get
(
timeout
=
None
)
for
p
in
processes
]
pool
.
close
()
pool
.
close
()
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment