From 52d4c43f2f7c62ef93bd4deda29c5f37de609e1b Mon Sep 17 00:00:00 2001 From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr> Date: Tue, 1 Dec 2020 11:50:24 +0100 Subject: [PATCH] src/bed_handler/get_other_exon_in_same_gene.py: From a list of exons get the list of other exons in same gene far from a CTCF site --- .../get_other_exon_in_same_gene.py | 183 ++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 src/bed_handler/get_other_exon_in_same_gene.py diff --git a/src/bed_handler/get_other_exon_in_same_gene.py b/src/bed_handler/get_other_exon_in_same_gene.py new file mode 100644 index 0000000..bdb773b --- /dev/null +++ b/src/bed_handler/get_other_exon_in_same_gene.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: The goal of this script is, given \ +an bed file containing exon, recover all exons in every \ +gene in the bed file, then remove the exons presents in the input bed \ +and eliminate those near a CTCF site. +""" + +from pathlib import Path +from .get_gene_regulated_by_ddx import load_sipp_vs_ctcf, format_exon_bed +from .config import BedConfig, TestConfig, OutputBed +import pandas as pd +from typing import List, Tuple +import lazyparser as lp + + +def get_gene2keep(df: pd.DataFrame) -> List[int]: + """ + Get the genes we want to keep from a dataframe corresponding to a \ + bed file. + + :param df: A dataframe corresponding to a bed of exons + :return: The list of gene to keep + + >>> data = pd.DataFrame({"#ref": [18, 19], "start": [100, 200], + ... "end": [200, 300], "id": ["1_1", "2_1"], "score": [0, 0], + ... "strand": ["-", "+"]}) + >>> get_gene2keep(data) + [1, 2] + """ + return [int(exon_id.split("_")[0]) for exon_id in df['id'].to_list()] + + +def get_exon2remove(df: pd.DataFrame) -> List[str]: + """ + Get the list of exon that we don't want to keep. + + :param df: A dataframe corresponding to a bed file + :return: The list of exon to remove + + >>> data = pd.DataFrame({"#ref": [18, 19], "start": [100, 200], + ... "end": [200, 300], "id": ["1_1", "2_1"], "score": [0, 0], + ... "strand": ["-", "+"]}) + >>> get_exon2remove(data) + ['1_1', '2_1'] + """ + return df['id'].to_list() + + +def get_lists_from_bed(bed: str) -> Tuple[List[int], List[str]]: + """ + Get the List of gene to keep and the list of exon to remove from a \ + bed file. + + :param bed: A bed file containing exons. + :return: The list of gene to keep and the list of exons to remove. + + >>> g2k, e2r = get_lists_from_bed(TestConfig.exon_bed) + >>> g2k + [1, 1, 1, 1, 1, 1, 1, 1, 1] + >>> e2r + ['1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9'] + """ + df = pd.read_csv(bed, sep="\t") + return get_gene2keep(df), get_exon2remove(df) + + +def filter_exon(bed: Path, gene2keep: List[int], + exon2remove: List[str]) -> pd.DataFrame: + """ + :param bed: The bed file containing every fasterDb exon. + :param gene2keep: The list of gene 2 keep + :param exon2remove: The list of exon 2 remove + :return: The dataframe of fasterDB exon filtered + + >>> filter_exon(TestConfig.exon_bed, [1], ["1_1", "1_3", "1_5", "1_7"]) + #ref start end id score strand + 1 18 28681183 28681432 1_2 0 - + 3 18 28672063 28672263 1_4 0 - + 5 18 28670990 28671110 1_6 0 - + 7 18 28667631 28667776 1_8 0 - + 8 18 28666538 28666705 1_9 0 - + """ + df = pd.read_csv(bed, sep="\t") + df['gene_id'] = get_gene2keep(df) + df = df.loc[(df['gene_id'].isin(gene2keep)) & + (-df['id'].isin(exon2remove)), :].copy() + df.drop('gene_id', axis=1, inplace=True) + return df + + +def get_final_bed(df_dist: pd.DataFrame, df_exon: pd.DataFrame, + threshold: int) -> pd.DataFrame: + """ + Merge de dataframe of exon with the dataframe of distance to ctcf \ + site and filter those that are far from CTCF. + + :param df_dist: A dataframe containing the distance of every exon \ + to ctcf. + :param df_exon: The dataframe of good exon + :param threshold: The minimum distance required to keep the exon. + :return: The dataframe of exon that are distant from at \ + least `threshold` nucleotide than CTCF + + >>> data_ctcf = pd.DataFrame({"id": ["1_1", "1_2"], "dist": [-20, 30]}) + >>> data = pd.DataFrame({"#ref": [18, 19, 20], "start": [100, 200, 300], + ... "end": [200, 300, 400], "id": ["1_1", "1_2", "3_1"], + ... "score": [0, 0, 0], "strand": ["-", "+", "-"]}) + >>> get_final_bed(data_ctcf, data, 10) + #ref start end id score strand + 0 18 100 200 1_1 0 - + 1 19 200 300 1_2 0 + + >>> get_final_bed(data_ctcf, data, 21) + #ref start end id score strand + 1 19 200 300 1_2 0 + + """ + df = df_exon.merge(df_dist, how="left", on="id") + df = df.loc[abs(df["dist"]) >= threshold, :].copy() + df.drop("dist", axis=1, inplace=True) + return df + + +def create_gene_bed4norm(gene_bed: Path, df_exon: pd.DataFrame + ) -> pd.DataFrame: + """ + Create a bed containing duplicated gene used to norm the exon bed. + + :param gene_bed: A bed file containing all FasterDB genes + :param df_exon: The dataframe of exons of interest + :return: The dataframe containing duplicated gene + + >>> data = pd.DataFrame({"#ref": [18, 19, 20, 21, 22], + ... "start": [100, 200, 300, 5, 6], + ... "end": [200, 300, 400, 5, 6], + ... "id": ["1_1", "1_2", "1_3", "3_1", "3_2"], + ... "score": [0, 0, 0, 0, 0], "strand": ["-", "+", "-", "-", "+"]}) + >>> create_gene_bed4norm(TestConfig.gene_bed, data) + #ref start end id score strand + 0 18 28645943 28682388 1 DSC2 - + 1 18 28645943 28682388 1 DSC2 - + 2 18 28645943 28682388 1 DSC2 - + 3 18 28898050 28937394 3 DSG1 + + 4 18 28898050 28937394 3 DSG1 + + """ + df_gene = pd.read_csv(gene_bed, sep="\t") + list_gene = get_gene2keep(df_exon) + tmp = pd.DataFrame({"id": list_gene}) + df_gene = df_gene.loc[df_gene['id'].isin(list_gene), :] + return pd.merge(df_gene, tmp) + + +@lp.parse(bed="file", distance_threshold="distance_threshold > 0") +def get_other_exon_in_same_gene(bed: str, + distance_threshold: int, + outfile: str) -> None: + """ + From a bed of exon, create a bed contaiing other exons in the \ + same gene far from CTCF sites. + + :param bed: A bed file containing exons + :param distance_threshold: The minimum distance required to keep the exon. + :param outfile: The name of the output file + """ + exon_table = format_exon_bed(BedConfig.exon_bed, BedConfig.gene_bed) + ctcf_dist = load_sipp_vs_ctcf(BedConfig.exons_vs_ctcf, exon_table) + ctcf_dist = ctcf_dist[["dist", "id"]] + g2k, e2r = get_lists_from_bed(bed) + good_exon_df = filter_exon(BedConfig.exon_bed, g2k, e2r) + final_df = get_final_bed(ctcf_dist, good_exon_df, distance_threshold) + final_df.to_csv(OutputBed.output / outfile, sep="\t", index=False) + gene_df = create_gene_bed4norm(BedConfig.gene_bed, final_df) + gene_df.to_csv(OutputBed.output / (outfile.replace(".bed", "") + + "-gene-dup.bed"), sep="\t", index=False) + gene_df = gene_df.drop_duplicates() + gene_df.to_csv(OutputBed.output / (outfile.replace(".bed", "") + + "-gene.bed"), sep="\t", index=False) + + +if __name__ == "__main__": + get_other_exon_in_same_gene() -- GitLab