diff --git a/src/find_interaction_cluster/create_ppi_files.py b/src/find_interaction_cluster/create_ppi_files.py index 2e96d8e2882a920c7670ef7de50d62b0a8a4b9ab..0c7a211d217c74444db2361d2d4a76bb9c8f602f 100644 --- a/src/find_interaction_cluster/create_ppi_files.py +++ b/src/find_interaction_cluster/create_ppi_files.py @@ -187,18 +187,20 @@ def add_size_and_common_gene_number(df: pd.DataFrame, dic_gene: Dict, return df -def filter_most_overllaping_ppi(df: pd.DataFrame) -> pd.DataFrame: +def filter_most_overllaping_ppi(df: pd.DataFrame, size_threshold: int + ) -> pd.DataFrame: """ Get the DNA community and the ppi community have the larger overlap \ possible. :param df: The dataframe containing the number of \ genes that clusters in community at DNA and protein level + :param size_threshold: Minimum size to keep a DNA-level gene community. :return: The dataframe with only one line per DNA community >>> test_df = pd.DataFrame({"id_gene": range(1, 12), ... "DNA_community": ["C1"] * 5 + ["C2"] * 5 + ["C3"], - ... "community_size": [10] * 5 + [15] * 5 + [20], + ... "community_size": [9] * 5 + [14] * 5 + [19], ... "nb_com-ppi": [0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0], ... 
"size_com-ppi": [30, 50, 105, 102, 25, 30, 42, 47, 89, 12, 0]}) >>> filter_most_overllaping_ppi(test_df) @@ -219,7 +221,8 @@ def filter_most_overllaping_ppi(df: pd.DataFrame) -> pd.DataFrame: df['nb_com-ppi'] = df.apply(lambda x: x['nb_com-ppi'] + 1 if x["size_com-ppi"] > 0 else x['nb_com-ppi'], axis=1) - return df + df["community_size"] = df["community_size"] + 1 + return df[df["community_size"] >= size_threshold] def create_community_ppi_table(community_file: Path, fasterdb_ppi: Path, @@ -306,7 +309,8 @@ def ppi_stats_analysis(community_file: Path, fasterdb_ppi: Path, df.to_csv(outfile, sep="\t", index=False) else: df = pd.read_csv(outfile, sep="\t") - df_overlap = filter_most_overllaping_ppi(df) + size_threshold = 10 + df_overlap = filter_most_overllaping_ppi(df, size_threshold) ppi_gene = get_ppi_community_gene(pd.read_csv(ppi_outfile, sep="\t")) dic_dna_gene = get_dna_community_gene(pd.read_csv(community_file, sep="\t"))