From a08f96a471736b5daefe06b3cc8df2d0c41144c7 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Thu, 19 Nov 2020 09:45:23 +0100
Subject: [PATCH] src/find_interaction_cluster/create_ppi_files.py: fix
 filter_most_overllaping_ppi to correct community size and nb_com-ppi columns

---
 src/find_interaction_cluster/create_ppi_files.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/find_interaction_cluster/create_ppi_files.py b/src/find_interaction_cluster/create_ppi_files.py
index 2e96d8e2..0c7a211d 100644
--- a/src/find_interaction_cluster/create_ppi_files.py
+++ b/src/find_interaction_cluster/create_ppi_files.py
@@ -187,18 +187,20 @@ def add_size_and_common_gene_number(df: pd.DataFrame, dic_gene: Dict,
     return df
 
 
-def filter_most_overllaping_ppi(df: pd.DataFrame) -> pd.DataFrame:
+def filter_most_overllaping_ppi(df: pd.DataFrame, size_threshold: int
+                                ) -> pd.DataFrame:
     """
     Get the DNA community and the ppi community have the larger overlap \
     possible.
 
     :param df: The dataframe containing the number of \
     genes that clusters in community at DNA and protein level
+    :param size_threshold: Minimum size a DNA community must have to be kept.
     :return: The dataframe with only one line per DNA community
 
     >>> test_df = pd.DataFrame({"id_gene": range(1, 12),
     ... "DNA_community": ["C1"] * 5 + ["C2"] * 5 + ["C3"],
-    ... "community_size": [10] * 5 + [15] * 5 + [20],
+    ... "community_size": [9] * 5 + [14] * 5 + [19],
     ... "nb_com-ppi": [0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0],
     ... "size_com-ppi": [30, 50, 105, 102, 25, 30, 42, 47, 89, 12, 0]})
     >>> filter_most_overllaping_ppi(test_df)
@@ -219,7 +221,8 @@ def filter_most_overllaping_ppi(df: pd.DataFrame) -> pd.DataFrame:
     df['nb_com-ppi'] = df.apply(lambda x: x['nb_com-ppi'] + 1
                                 if x["size_com-ppi"] > 0
                                 else x['nb_com-ppi'], axis=1)
-    return df
+    df["community_size"] = df["community_size"] + 1
+    return df[df["community_size"] >= size_threshold]
 
 
 def create_community_ppi_table(community_file: Path, fasterdb_ppi: Path,
@@ -306,7 +309,8 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path,
         df.to_csv(outfile, sep="\t", index=False)
     else:
         df = pd.read_csv(outfile, sep="\t")
-    df_overlap = filter_most_overllaping_ppi(df)
+    size_threshold = 10
+    df_overlap = filter_most_overllaping_ppi(df, size_threshold)
     ppi_gene = get_ppi_community_gene(pd.read_csv(ppi_outfile, sep="\t"))
     dic_dna_gene = get_dna_community_gene(pd.read_csv(community_file,
                                                       sep="\t"))
-- 
GitLab