#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Provenance: content of src/location_pp/merge_nr_communities.py as created by
# git patch 9c206182aca2fdd03bbbf164f6ac09680007603a
# From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
# Date: Tue, 11 Mar 2025 16:00:34 +0100
# Subject: [PATCH] script to merge the communities together

"""
Description: The goal of this script is to merge multiple
community files together and remove redundant clusters, i.e. clusters
with the same genes.
"""

from pathlib import Path
from typing import List

import lazyparser as lp
import polars as pl
from loguru import logger

from src.location_pp.config import ConfigLocation


def load_communities(files: list[Path]) -> pl.DataFrame:
    """Load communities from files and drop the duplicated ones.

    Every file is read as a tab-separated table and reduced to the
    ``community``, ``nodes`` and ``genes`` columns before concatenation.
    Two communities are considered duplicates when their ``genes`` values
    are exactly equal; only the first occurrence (in the order of
    ``files``) is kept.

    :param files: tab-separated community files to load
    :return: the concatenated communities without duplicated ``genes``
    :raises ValueError: if ``files`` is empty
    """
    if not files:
        # Fail early with an explicit message instead of letting
        # ``pl.concat`` choke on an empty list.
        raise ValueError("At least one communities file is required")
    columns = ["community", "nodes", "genes"]
    df = pl.concat(
        [pl.read_csv(file, separator="\t").select(columns) for file in files]
    )
    row = df.shape[0]
    # ``maintain_order=True`` keeps the rows in input order so that the
    # result (and the file written from it) is reproducible between runs.
    # NOTE(review): the comparison is done on the raw ``genes`` value, so
    # the same genes listed in a different order are presumably not
    # detected as duplicates - confirm against the input format.
    df = df.unique(subset=["genes"], keep="first", maintain_order=True)
    final = df.shape[0]
    logger.info(
        f" {row - final} / {row} communities were duplicated and removed"
    )
    return df


@lp.parse(files="file")
def main_merge(files: List[str], outname: str) -> pl.DataFrame:
    """Merge communities from files.

    The merged table is written, tab-separated, to
    ``ConfigLocation.output / f"{outname}.txt"``.

    :param files: tab-separated community files to merge
    :param outname: name of the output file, without the .txt extension
    :return: the merged communities without duplicated genes
    """
    df = load_communities([Path(f) for f in files])
    df.write_csv(ConfigLocation.output / f"{outname}.txt", separator="\t")
    # Return the frame so the function honours its declared return type.
    return df


if __name__ == "__main__":
    main_merge()