Skip to content
Snippets Groups Projects
Commit a08f96a4 authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/create_ppi_files.py: fix...

src/find_interaction_cluster/create_ppi_files.py: fix filter_most_overllaping_ppi to correct community size and nb_com-ppi columns
parent fd031d72
No related branches found
No related tags found
No related merge requests found
......@@ -187,18 +187,20 @@ def add_size_and_common_gene_number(df: pd.DataFrame, dic_gene: Dict,
return df
def filter_most_overllaping_ppi(df: pd.DataFrame) -> pd.DataFrame:
def filter_most_overllaping_ppi(df: pd.DataFrame, size_threshold: int
) -> pd.DataFrame:
"""
Get the DNA community and the ppi community that have the largest \
possible overlap.
:param df: The dataframe containing the number of \
genes that clusters in community at DNA and protein level
:param size_threshold: The minimum size required to keep a gene community at DNA level.
:return: The dataframe with only one line per DNA community
>>> test_df = pd.DataFrame({"id_gene": range(1, 12),
... "DNA_community": ["C1"] * 5 + ["C2"] * 5 + ["C3"],
... "community_size": [10] * 5 + [15] * 5 + [20],
... "community_size": [9] * 5 + [14] * 5 + [19],
... "nb_com-ppi": [0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0],
... "size_com-ppi": [30, 50, 105, 102, 25, 30, 42, 47, 89, 12, 0]})
>>> filter_most_overllaping_ppi(test_df)
......@@ -219,7 +221,8 @@ def filter_most_overllaping_ppi(df: pd.DataFrame) -> pd.DataFrame:
df['nb_com-ppi'] = df.apply(lambda x: x['nb_com-ppi'] + 1
if x["size_com-ppi"] > 0
else x['nb_com-ppi'], axis=1)
return df
df["community_size"] = df["community_size"] + 1
return df[df["community_size"] >= size_threshold]
def create_community_ppi_table(community_file: Path, fasterdb_ppi: Path,
......@@ -306,7 +309,8 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path,
df.to_csv(outfile, sep="\t", index=False)
else:
df = pd.read_csv(outfile, sep="\t")
df_overlap = filter_most_overllaping_ppi(df)
size_threshold = 10
df_overlap = filter_most_overllaping_ppi(df, size_threshold)
ppi_gene = get_ppi_community_gene(pd.read_csv(ppi_outfile, sep="\t"))
dic_dna_gene = get_dna_community_gene(pd.read_csv(community_file,
sep="\t"))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment