From a08f96a471736b5daefe06b3cc8df2d0c41144c7 Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Thu, 19 Nov 2020 09:45:23 +0100 Subject: [PATCH] src/find_interaction_cluster/create_ppi_files.py: fix filter_most_overllaping_ppi to correct community size and nb_com-ppi columns --- src/find_interaction_cluster/create_ppi_files.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/find_interaction_cluster/create_ppi_files.py b/src/find_interaction_cluster/create_ppi_files.py index 2e96d8e2..0c7a211d 100644 --- a/src/find_interaction_cluster/create_ppi_files.py +++ b/src/find_interaction_cluster/create_ppi_files.py @@ -187,18 +187,20 @@ def add_size_and_common_gene_number(df: pd.DataFrame, dic_gene: Dict, return df -def filter_most_overllaping_ppi(df: pd.DataFrame) -> pd.DataFrame: +def filter_most_overllaping_ppi(df: pd.DataFrame, size_threshold: int + ) -> pd.DataFrame: """ Get the DNA community and the ppi community have the larger overlap \ possible. :param df: The dataframe containing the number of \ genes that clusters in community at DNA and protein level + :param size_threshold: The minimum size required to keep a gene community at DNA level. :return: The dataframe with only one line per DNA community >>> test_df = pd.DataFrame({"id_gene": range(1, 12), ... "DNA_community": ["C1"] * 5 + ["C2"] * 5 + ["C3"], - ... "community_size": [10] * 5 + [15] * 5 + [20], + ... "community_size": [9] * 5 + [14] * 5 + [19], ... "nb_com-ppi": [0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0], ... 
"size_com-ppi": [30, 50, 105, 102, 25, 30, 42, 47, 89, 12, 0]}) >>> filter_most_overllaping_ppi(test_df) @@ -219,7 +221,8 @@ def filter_most_overllaping_ppi(df: pd.DataFrame) -> pd.DataFrame: df['nb_com-ppi'] = df.apply(lambda x: x['nb_com-ppi'] + 1 if x["size_com-ppi"] > 0 else x['nb_com-ppi'], axis=1) - return df + df["community_size"] = df["community_size"] + 1 + return df[df["community_size"] >= size_threshold] def create_community_ppi_table(community_file: Path, fasterdb_ppi: Path, @@ -306,7 +309,8 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, df.to_csv(outfile, sep="\t", index=False) else: df = pd.read_csv(outfile, sep="\t") - df_overlap = filter_most_overllaping_ppi(df) + size_threshold = 10 + df_overlap = filter_most_overllaping_ppi(df, size_threshold) ppi_gene = get_ppi_community_gene(pd.read_csv(ppi_outfile, sep="\t")) dic_dna_gene = get_dna_community_gene(pd.read_csv(community_file, sep="\t")) -- GitLab