diff --git a/src/bed_handler/__main__.py b/src/bed_handler/__main__.py
index 32ae9c689aa1a446b812bc8d32b60ebbff260690..4f87f0c1e89a50b46615fd487fd9e6d0b09094ad 100644
--- a/src/bed_handler/__main__.py
+++ b/src/bed_handler/__main__.py
@@ -9,12 +9,14 @@ visualisation
 
 from .filter_gene import create_filtered_bed
 from .get_gene_locations import create_region_bed
+from .get_gene_regulated_by_ddx import write_regulated_gene_file
 
 
 def launcher():
     """
     Create the necessary bed file to visualise bigwig file
     """
+    write_regulated_gene_file()
     create_filtered_bed()
     create_region_bed()
 
diff --git a/src/bed_handler/config.py b/src/bed_handler/config.py
index e5316a5801eaebb7593811dd6d23ef167c9283b6..69b49e39a87772870be848f6045e8ec9338ec991 100644
--- a/src/bed_handler/config.py
+++ b/src/bed_handler/config.py
@@ -19,6 +19,7 @@ class OutputBed:
     tss_gene = output / "tss_gene.bed"
     tts_gene = output / "tts_gene.bed"
     after_gene = output / "after_gene.bed"
+    ddx_genes = output / "DDX5_17_genes.txt"
 
 
 class BedConfig:
@@ -26,9 +27,9 @@ class BedConfig:
     A class containing all the variables used in this submodule
     """
     base = Path(__file__).parents[2]
-    ddx_genes = base / "data" / "DDX5_17_genes.txt"
     gene_bed = base / "data" / "bed" / "gene.bed"
     exon_bed = base / "data" / "bed" / "exon.bed"
+    sipp_vs_ctcf = base / "data" / "Sipp_exon_vs_CTCF.csv"
     bed = OutputBed
     size = 5000
 
@@ -39,4 +40,5 @@ class TestConfig:
     list_genes = base / "list_genes.txt"
     gene_bed = base / "genes.bed"
     exon_bed = base / "exons.bed"
-    small_bw = base / "small.bw"
\ No newline at end of file
+    small_bw = base / "small.bw"
+    sipp_vs_ctcf = base / 'Sipp_exon_vs_CTCF.csv'
diff --git a/src/bed_handler/filter_gene.py b/src/bed_handler/filter_gene.py
index 0918807d908807687b898d8999699f8dbd8cc51d..944e72c656dc442a3c7e32acc807e8e5988579e2 100644
--- a/src/bed_handler/filter_gene.py
+++ b/src/bed_handler/filter_gene.py
@@ -51,9 +51,8 @@ def create_filtered_bed() -> None:
     """
     Create a bed file containing only the genes of interest.
     """
-    gene_list = select_gene_of_interest(BedConfig.ddx_genes)
+    gene_list = select_gene_of_interest(BedConfig.bed.ddx_genes)
     df = filter_bed(BedConfig.gene_bed, gene_list)
-    BedConfig.bed.output.mkdir(exist_ok=True, parents=True)
     df.to_csv(BedConfig.bed.filtered_gene, sep="\t", index=False)
 
 
diff --git a/src/bed_handler/get_gene_regulated_by_ddx.py b/src/bed_handler/get_gene_regulated_by_ddx.py
new file mode 100644
index 0000000000000000000000000000000000000000..b48e5b281ca944aa984d4d668d44d576211d9b7f
--- /dev/null
+++ b/src/bed_handler/get_gene_regulated_by_ddx.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: The goal of this script is to get genes regulated by \
+DDX5/17
+"""
+
+from doctest import testmod
+from pathlib import Path
+from typing import List
+
+import pandas as pd
+
+from .config import BedConfig, TestConfig
+
+
+def format_exon_bed(exon_bed: Path, gene_bed: Path) -> pd.DataFrame:
+    """
+    Build a table of the exons found in `exon_bed`, each one annotated \
+    with the name of its gene (the `score` column of `gene_bed`).
+
+    :param exon_bed: A bed file containing fasterDB exons
+    :param gene_bed: A bed file containing fasterDB genes
+    :return: A dataframe of exons with their coordinate id, their id, \
+    their gene id, their gene name and their position within the gene
+
+    >>> format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed)
+                    exon_id   id  gene_id  gene_name  exon_pos  exon_name
+    0  18:28681866-28682388  1_1        1       DSC2         1     DSC2_1
+    1  18:28681184-28681432  1_2        1       DSC2         2     DSC2_2
+    2  18:28673522-28673606  1_3        1       DSC2         3     DSC2_3
+    3  18:28672064-28672263  1_4        1       DSC2         4     DSC2_4
+    4  18:28671490-28671530  1_5        1       DSC2         5     DSC2_5
+    5  18:28670991-28671110  1_6        1       DSC2         6     DSC2_6
+    6  18:28669402-28669557  1_7        1       DSC2         7     DSC2_7
+    7  18:28667632-28667776  1_8        1       DSC2         8     DSC2_8
+    8  18:28666539-28666705  1_9        1       DSC2         9     DSC2_9
+    """
+    exon_bed = pd.read_csv(exon_bed, sep="\t")
+    # exon_id is built as '<ref>:<start + 1>-<end>'
+    exon_bed['exon_id'] = exon_bed['#ref'].astype(str) + ':' + \
+        (exon_bed['start'] + 1).astype(str) + '-' + \
+        exon_bed['end'].astype(str)
+    # id is '<gene_id>_<exon_pos>'. regex=True must be explicit because
+    # pandas >= 2.0 treats the pattern as a literal string by default
+    exon_bed['gene_id'] = exon_bed['id'].str.replace(
+        r'_\d+', '', regex=True).astype(int)
+    exon_bed['exon_pos'] = exon_bed['id'].str.replace(
+        r'\d+_', '', regex=True).astype(int)
+    df_gene = pd.read_csv(gene_bed, sep="\t")[["id", "score"]]
+    df_gene.columns = ['gene_id', 'gene_name']
+    exon_bed = exon_bed.merge(df_gene, how="left", on=['gene_id'])
+    exon_bed['exon_name'] = exon_bed['gene_name'] + "_" + \
+        exon_bed['exon_pos'].astype(str)
+    return exon_bed[['exon_id', 'id', 'gene_id', 'gene_name', 'exon_pos',
+                     'exon_name']]
+
+
+def load_sipp_vs_ctct(sipp_file: Path, exon_table: pd.DataFrame
+                      ) -> pd.DataFrame:
+    """
+    Load the file containing exons regulated by DDX5 17 and containing their \
+    distance to CTCF. Finally merge this table to `exon_table` to get \
+    additional data
+
+    :param sipp_file: File containing exons regulated by DDX5 17 and \
+    containing their distance to CTCF.
+    :param exon_table: A dataframe of exons, as returned by \
+    `format_exon_bed`
+    :return: The content of `sipp_file` left-merged with `exon_table`
+
+    >>> df = load_sipp_vs_ctct(TestConfig.sipp_vs_ctcf,
+    ...     format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
+    >>> df[['exon_name', "exon_id", 'group', 'id', 'gene_id']]
+       exon_name               exon_id      group   id  gene_id
+    0     DSC2_1  18:28681866-28682388  siPP_DOWN  1_1        1
+    1     DSC2_2  18:28681184-28681432  siPP_DOWN  1_2        1
+    2     DSC2_3  18:28673522-28673606  siPP_DOWN  1_3        1
+    3     DSC2_4  18:28672064-28672263    siPP_UP  1_4        1
+    4     DSC2_5  18:28671490-28671530  siPP_DOWN  1_5        1
+    5     DSC2_6  18:28670991-28671110  siPP_DOWN  1_6        1
+    6     DSC2_7  18:28669402-28669557  siPP_DOWN  1_7        1
+    7     DSC2_8  18:28667632-28667776  siPP_DOWN  1_8        1
+    8     DSC2_9  18:28666539-28666705  siPP_DOWN  1_9        1
+    """
+    df = pd.read_csv(sipp_file, sep="\t")
+    return df.merge(exon_table, how="left", on=["exon_id", "exon_name"])
+
+
+def get_ddx_genes(df: pd.DataFrame) -> List[int]:
+    """
+    Return the unique values in gene_id column.
+
+    :param df: A dataframe of DDX regulated exons
+    :return: The unique gene ids, as python integers, in their order \
+    of appearance. Missing values are ignored.
+
+    >>> df = load_sipp_vs_ctct(TestConfig.sipp_vs_ctcf,
+    ...     format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
+    >>> get_ddx_genes(df)
+    [1]
+    """
+    # The left merge of load_sipp_vs_ctct leaves NaN for unmatched exons
+    return df['gene_id'].dropna().astype(int).unique().tolist()
+
+
+def write_regulated_gene_file():
+    """
+    Write the file containing the id of the genes regulated by DDX5/17
+    """
+    BedConfig.bed.output.mkdir(exist_ok=True, parents=True)
+    exon_table = format_exon_bed(BedConfig.exon_bed, BedConfig.gene_bed)
+    final_table = load_sipp_vs_ctct(BedConfig.sipp_vs_ctcf, exon_table)
+    list_gene = get_ddx_genes(final_table)
+    with BedConfig.bed.ddx_genes.open('w') as outfile:
+        for gene in list_gene:
+            outfile.write(f"{gene}\n")
+
+
+if __name__ == "__main__":
+    testmod()
diff --git a/tests/files/Sipp_exon_vs_CTCF.csv b/tests/files/Sipp_exon_vs_CTCF.csv
new file mode 100644
index 0000000000000000000000000000000000000000..d42cf046bd7214f74d0dab106795d571479c8ad4
--- /dev/null
+++ b/tests/files/Sipp_exon_vs_CTCF.csv
@@ -0,0 +1,10 @@
+exon_name	dist	exon_id	CTCF_hit_id	group	strand	deltaPSI
+DSC2_1	1	18:28681866-28682388	18:28681866-28682388	siPP_DOWN	-	-0.33
+DSC2_2	2	18:28681184-28681432	18:28681184-28681432	siPP_DOWN	+	-0.23
+DSC2_3	3	18:28673522-28673606	18:28673522-28673606	siPP_DOWN	+	-0.17
+DSC2_4	4	18:28672064-28672263	18:28672064-28672263	siPP_UP	-	0.20
+DSC2_5	5	18:28671490-28671530	18:28671490-28671530	siPP_DOWN	-	-0.17
+DSC2_6	6	18:28670991-28671110	18:28670991-28671110	siPP_DOWN	-	-0.49
+DSC2_7	7	18:28669402-28669557	18:28669402-28669557	siPP_DOWN	+	-0.32
+DSC2_8	8	18:28667632-28667776	18:28667632-28667776	siPP_DOWN	-	-0.32
+DSC2_9	9	18:28666539-28666705	18:28666539-28666705	siPP_DOWN	-	-0.14