#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Description: Merge several community files into a single table and
remove redundant clusters, i.e. clusters whose ``genes`` value is
identical to that of a cluster already seen.
"""

from pathlib import Path
from typing import List

import lazyparser as lp
import polars as pl
from loguru import logger

from src.location_pp.config import ConfigLocation


def load_communities(files: list[Path]) -> pl.DataFrame:
    """Load, concatenate and de-duplicate communities from several files.

    Each file is read as a tab-separated table and only the
    ``community``, ``nodes`` and ``genes`` columns are kept. The tables
    are stacked in the order the files are given, then rows sharing the
    same ``genes`` value are collapsed to their first occurrence.

    Duplicates are detected by comparing the ``genes`` column value as
    stored in the files.
    NOTE(review): if ``genes`` is a delimited string, two clusters
    listing the same genes in a different order would presumably not be
    treated as duplicates -- confirm against the file format.

    :param files: Paths of the community files to merge.
    :return: The merged table without redundant communities.
    :raises ValueError: If ``files`` is empty.
    """
    if not files:
        # Fail early with a clear message rather than letting the
        # concatenation of an empty list raise an opaque error.
        raise ValueError("At least one communities file is required")
    tables = [
        pl.read_csv(file, separator="\t").select(
            ["community", "nodes", "genes"]
        )
        for file in files
    ]
    df = pl.concat(tables)
    row = df.height
    # maintain_order=True keeps the rows in input order so the merged
    # file is reproducible from one run to the next.
    df = df.unique(subset=["genes"], keep="first", maintain_order=True)
    final = df.height
    logger.info(
        f" {row - final} / {row} communities were duplicated and removed"
    )
    return df


# NOTE(review): lazyparser presumably builds the command-line help from
# the docstring below -- confirm the ``:param`` style matches the
# lazyparser configuration used in this project.
@lp.parse(files="file")
def main_merge(files: List[str], outname: str) -> pl.DataFrame:
    """Merge communities from files.

    :param files: Community files to merge.
    :param outname: Name (without extension) of the output file, \
    written as ``<outname>.txt`` inside ``ConfigLocation.output``.
    :return: The merged table without redundant communities.
    """
    df = load_communities([Path(f) for f in files])
    df.write_csv(ConfigLocation.output / f"{outname}.txt", separator="\t")
    # Return the frame so the function honours its declared return type.
    return df


if __name__ == "__main__":
    main_merge()