diff --git a/src/find_interaction_cluster/clip_figures/clip_analyser.py b/src/find_interaction_cluster/clip_figures/clip_analyser.py index 6a88d7ea4c830b1d9c945bd0480f34a42d9a7d29..491f4799b4cb76b999081ff51247c6b6c692bf83 100644 --- a/src/find_interaction_cluster/clip_figures/clip_analyser.py +++ b/src/find_interaction_cluster/clip_figures/clip_analyser.py @@ -30,18 +30,18 @@ def bedtools_intersect(gene_bed: Path, clip_bed: Path, >>> bedtools_intersect(ConfigClip.test_gene_bed, ConfigClip.test_clip_bed, ... "gene") - id_gene clip_peak - 0 1 2 - 1 2 1 - 2 3 3 - 3 4 0 - 4 5 0 - 5 6 0 - 6 7 0 - 7 8 0 - 8 9 0 - 9 415 0 - 10 10123 0 + id_gene clip_peak peak_density + 0 1 2 0.054877 + 1 2 1 0.029736 + 2 3 3 0.076251 + 3 4 0 0.000000 + 4 5 0 0.000000 + 5 6 0 0.000000 + 6 7 0 0.000000 + 7 8 0 0.000000 + 8 9 0 0.000000 + 9 415 0 0.000000 + 10 10123 0 0.000000 """ res = sp.check_output(f"bedtools intersect -a {gene_bed} " @@ -49,7 +49,9 @@ def bedtools_intersect(gene_bed: Path, clip_bed: Path, res = StringIO(str(res, "utf-8")) id_col = f"id_{feature}" columns = ["chr", "start", "end", id_col, "name", "strand", "clip_peak"] - return pd.read_csv(res, sep="\t", names=columns)[[id_col, "clip_peak"]] + df = pd.read_csv(res, sep="\t", names=columns) + df['peak_density'] = df['clip_peak'] / ((df['end'] - df['start']) / 1000) + return df[[f'id_{feature}', 'clip_peak', 'peak_density']] def find_or_create_community(project: str, weight: int, global_weight: int, @@ -137,20 +139,21 @@ def merge_dataframes(peak_df: pd.DataFrame, com_df: pd.DataFrame, :return: The dataframe merged >>> d1 = pd.DataFrame({"id_gene": [0, 1, 2, 3, 4, 5, 6, 7], - ... "clip_peak": [2, 1, 3, 0, 0, 0, 0, 0]}) + ... "peak_density": [0.054877, 0.029736, 0.076251, 0, 0, 0, 0, 0]}) >>> d2 = pd.DataFrame({"id_gene": [0, 1, 2, 3, 4], ... "community": ["C1", "C1", "C2", "C2", "C2"], ... "community_size": [2, 2, 3, 3, 3]}) >>> merge_dataframes(d1, d2, "gene") - id_gene clip_peak community community_size - 0 0 2 C1 2.0 - 1 1 1 C1 2.0 - 2 2 3 C2 3.0 - 3 3 0 C2 3.0 - 4 4 0 C2 3.0 - 5 5 0 NaN NaN - 6 6 0 NaN NaN - 7 7 0 NaN NaN + id_gene peak_density community community_size + 0 0 0.054877 C1 2.0 + 1 1 0.029736 C1 2.0 + 2 2 0.076251 C2 3.0 + 3 3 0.000000 C2 3.0 + 4 4 0.000000 C2 3.0 + 5 5 0.000000 NaN NaN + 6 6 0.000000 NaN NaN + 7 7 0.000000 NaN NaN + """ return peak_df.merge(com_df, how="left", on=f"id_{feature}") @@ -171,18 +174,19 @@ def create_table(feature: str, clip_file: Path, >>> cf = find_or_create_community("", 3, 3, True, "gene") >>> create_table("gene", ConfigClip.test_clip_bed, ... ConfigClip.test_gene_bed, cf) - id_gene clip_peak community community_size - 0 1 2 NaN NaN - 1 2 1 NaN NaN - 2 3 3 NaN NaN - 3 4 0 NaN NaN - 4 5 0 NaN NaN - 5 6 0 NaN NaN - 6 7 0 NaN NaN - 7 8 0 NaN NaN - 8 9 0 NaN NaN - 9 415 0 C1 11.0 - 10 10123 0 NaN NaN + id_gene clip_peak peak_density community community_size + 0 1 2 0.054877 NaN NaN + 1 2 1 0.029736 NaN NaN + 2 3 3 0.076251 NaN NaN + 3 4 0 0.000000 NaN NaN + 4 5 0 0.000000 NaN NaN + 5 6 0 0.000000 NaN NaN + 6 7 0 0.000000 NaN NaN + 7 8 0 0.000000 NaN NaN + 8 9 0 0.000000 NaN NaN + 9 415 0 0.000000 C1 11.0 + 10 10123 0 0.000000 NaN NaN + """ threshold = 10 if feature == "gene" else 50 @@ -219,7 +223,7 @@ def create_figure(project: str, weight: int, global_weight: int, output.mkdir(exist_ok=True) outfile = output / f"{clip_file.name.split('.')[0]}.pdf" final_table = create_table(feature, clip_file, feature_bed, com_file) - create_community_fig(final_table, feature, "clip_peak", outfile, + create_community_fig(final_table, feature, "peak_density", outfile, test_type)