From 52d4c43f2f7c62ef93bd4deda29c5f37de609e1b Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Tue, 1 Dec 2020 11:50:24 +0100
Subject: [PATCH] src/bed_handler/get_other_exon_in_same_gene.py: From a list
 of exons get the list of other exons in same gene far from a CTCF site

---
 .../get_other_exon_in_same_gene.py            | 183 ++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 src/bed_handler/get_other_exon_in_same_gene.py

diff --git a/src/bed_handler/get_other_exon_in_same_gene.py b/src/bed_handler/get_other_exon_in_same_gene.py
new file mode 100644
index 0000000..bdb773b
--- /dev/null
+++ b/src/bed_handler/get_other_exon_in_same_gene.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: The goal of this script is, given \
+an bed file containing exon, recover all exons in every \
+gene in the bed file, then remove the exons presents in the input bed \
+and eliminate those near a CTCF site.
+"""
+
+from pathlib import Path
+from .get_gene_regulated_by_ddx import load_sipp_vs_ctcf, format_exon_bed
+from .config import BedConfig, TestConfig, OutputBed
+import pandas as pd
+from typing import List, Tuple
+import lazyparser as lp
+
+
+def get_gene2keep(df: pd.DataFrame) -> List[int]:
+    """
+    Get the genes we want to keep from a dataframe corresponding to a \
+    bed file.
+
+    :param df: A dataframe corresponding to a bed of exons
+    :return: The list of gene to keep
+
+    >>> data = pd.DataFrame({"#ref": [18, 19], "start": [100, 200],
+    ... "end": [200, 300], "id": ["1_1", "2_1"], "score": [0, 0],
+    ... "strand": ["-", "+"]})
+    >>> get_gene2keep(data)
+    [1, 2]
+    """
+    return [int(exon_id.split("_")[0]) for exon_id in df['id'].to_list()]
+
+
+def get_exon2remove(df: pd.DataFrame) -> List[str]:
+    """
+    Get the list of exon that we don't want to keep.
+
+    :param df: A dataframe corresponding to a bed file
+    :return: The list of exon to remove
+
+    >>> data = pd.DataFrame({"#ref": [18, 19], "start": [100, 200],
+    ... "end": [200, 300], "id": ["1_1", "2_1"], "score": [0, 0],
+    ... "strand": ["-", "+"]})
+    >>> get_exon2remove(data)
+    ['1_1', '2_1']
+    """
+    return df['id'].to_list()
+
+
+def get_lists_from_bed(bed: str) -> Tuple[List[int], List[str]]:
+    """
+    Get the List of gene to keep and the list of exon to remove from a \
+    bed file.
+
+    :param bed: A bed file containing exons.
+    :return: The list of gene to keep and the list of exons to remove.
+
+    >>> g2k, e2r = get_lists_from_bed(TestConfig.exon_bed)
+    >>> g2k
+    [1, 1, 1, 1, 1, 1, 1, 1, 1]
+    >>> e2r
+    ['1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9']
+    """
+    df = pd.read_csv(bed, sep="\t")
+    return get_gene2keep(df), get_exon2remove(df)
+
+
+def filter_exon(bed: Path, gene2keep: List[int],
+                exon2remove: List[str]) -> pd.DataFrame:
+    """
+    :param bed: The bed file containing every fasterDb exon.
+    :param gene2keep: The list of gene 2 keep
+    :param exon2remove: The list of exon 2 remove
+    :return: The dataframe of fasterDB exon filtered
+
+    >>> filter_exon(TestConfig.exon_bed, [1], ["1_1", "1_3", "1_5", "1_7"])
+       #ref     start       end   id  score strand
+    1    18  28681183  28681432  1_2      0      -
+    3    18  28672063  28672263  1_4      0      -
+    5    18  28670990  28671110  1_6      0      -
+    7    18  28667631  28667776  1_8      0      -
+    8    18  28666538  28666705  1_9      0      -
+    """
+    df = pd.read_csv(bed, sep="\t")
+    df['gene_id'] = get_gene2keep(df)
+    df = df.loc[(df['gene_id'].isin(gene2keep)) &
+                (-df['id'].isin(exon2remove)), :].copy()
+    df.drop('gene_id', axis=1, inplace=True)
+    return df
+
+
+def get_final_bed(df_dist: pd.DataFrame, df_exon: pd.DataFrame,
+                  threshold: int) -> pd.DataFrame:
+    """
+    Merge de dataframe of exon with the dataframe of distance to ctcf \
+    site and filter those that are far from CTCF.
+
+    :param df_dist: A dataframe containing the distance of every exon \
+    to ctcf.
+    :param df_exon: The dataframe of good exon
+    :param threshold: The minimum distance required to keep the exon.
+    :return: The dataframe of exon that are distant from at \
+    least `threshold` nucleotide than CTCF
+
+    >>> data_ctcf = pd.DataFrame({"id": ["1_1", "1_2"], "dist": [-20, 30]})
+    >>> data = pd.DataFrame({"#ref": [18, 19, 20], "start": [100, 200, 300],
+    ... "end": [200, 300, 400], "id": ["1_1", "1_2", "3_1"],
+    ... "score": [0, 0, 0], "strand": ["-", "+", "-"]})
+    >>> get_final_bed(data_ctcf, data, 10)
+       #ref  start  end   id  score strand
+    0    18    100  200  1_1      0      -
+    1    19    200  300  1_2      0      +
+    >>> get_final_bed(data_ctcf, data, 21)
+       #ref  start  end   id  score strand
+    1    19    200  300  1_2      0      +
+    """
+    df = df_exon.merge(df_dist, how="left", on="id")
+    df = df.loc[abs(df["dist"]) >= threshold, :].copy()
+    df.drop("dist", axis=1, inplace=True)
+    return df
+
+
+def create_gene_bed4norm(gene_bed: Path, df_exon: pd.DataFrame
+                         ) -> pd.DataFrame:
+    """
+    Create a bed containing duplicated gene used to norm the exon bed.
+
+    :param gene_bed: A bed file containing all FasterDB genes
+    :param df_exon: The dataframe of exons of interest
+    :return: The dataframe containing duplicated gene
+
+    >>> data = pd.DataFrame({"#ref": [18, 19, 20, 21, 22],
+    ... "start": [100, 200, 300, 5, 6],
+    ... "end": [200, 300, 400, 5, 6],
+    ... "id": ["1_1", "1_2", "1_3", "3_1", "3_2"],
+    ... "score": [0, 0, 0, 0, 0], "strand": ["-", "+", "-", "-", "+"]})
+    >>> create_gene_bed4norm(TestConfig.gene_bed, data)
+       #ref     start       end  id score strand
+    0    18  28645943  28682388   1  DSC2      -
+    1    18  28645943  28682388   1  DSC2      -
+    2    18  28645943  28682388   1  DSC2      -
+    3    18  28898050  28937394   3  DSG1      +
+    4    18  28898050  28937394   3  DSG1      +
+    """
+    df_gene = pd.read_csv(gene_bed, sep="\t")
+    list_gene = get_gene2keep(df_exon)
+    tmp = pd.DataFrame({"id": list_gene})
+    df_gene = df_gene.loc[df_gene['id'].isin(list_gene), :]
+    return pd.merge(df_gene, tmp)
+
+
+@lp.parse(bed="file", distance_threshold="distance_threshold > 0")
+def get_other_exon_in_same_gene(bed: str,
+                                distance_threshold: int,
+                                outfile: str) -> None:
+    """
+    From a bed of exon, create a bed contaiing other exons in the \
+    same gene far from CTCF sites.
+
+    :param bed: A bed file containing exons
+    :param distance_threshold: The minimum distance required to keep the exon.
+    :param outfile: The name of the output file
+    """
+    exon_table = format_exon_bed(BedConfig.exon_bed, BedConfig.gene_bed)
+    ctcf_dist = load_sipp_vs_ctcf(BedConfig.exons_vs_ctcf, exon_table)
+    ctcf_dist = ctcf_dist[["dist", "id"]]
+    g2k, e2r = get_lists_from_bed(bed)
+    good_exon_df = filter_exon(BedConfig.exon_bed, g2k, e2r)
+    final_df = get_final_bed(ctcf_dist, good_exon_df, distance_threshold)
+    final_df.to_csv(OutputBed.output / outfile, sep="\t", index=False)
+    gene_df = create_gene_bed4norm(BedConfig.gene_bed, final_df)
+    gene_df.to_csv(OutputBed.output / (outfile.replace(".bed", "") +
+                   "-gene-dup.bed"), sep="\t", index=False)
+    gene_df = gene_df.drop_duplicates()
+    gene_df.to_csv(OutputBed.output / (outfile.replace(".bed", "") +
+                   "-gene.bed"), sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    get_other_exon_in_same_gene()
-- 
GitLab