Skip to content
Snippets Groups Projects
Commit 94c02db1 authored by nfontrod's avatar nfontrod
Browse files

creation of get_gene_regulated_by_ddx.py and adaption of other scripts

parent a6614856
No related branches found
No related tags found
No related merge requests found
......@@ -9,12 +9,14 @@ visualisation
from .filter_gene import create_filtered_bed
from .get_gene_locations import create_region_bed
from .get_gene_regulated_by_ddx import write_regulated_gene_file
def launcher():
    """
    Build every bed file needed to visualise bigwig files.

    The three steps are executed one after the other, in a fixed order:
    the regulated gene file is written first, then the filtered bed, then
    the region beds.
    """
    # NOTE(review): the order presumably matters - create_filtered_bed
    # appears to read the gene file produced by write_regulated_gene_file.
    steps = (
        write_regulated_gene_file,
        create_filtered_bed,
        create_region_bed,
    )
    for step in steps:
        step()
......
......@@ -19,6 +19,7 @@ class OutputBed:
tss_gene = output / "tss_gene.bed"
tts_gene = output / "tts_gene.bed"
after_gene = output / "after_gene.bed"
ddx_genes = output / "DDX5_17_genes.txt"
class BedConfig:
......@@ -26,9 +27,9 @@ class BedConfig:
A class containing all the variables used in this submodule
"""
base = Path(__file__).parents[2]
ddx_genes = base / "data" / "DDX5_17_genes.txt"
gene_bed = base / "data" / "bed" / "gene.bed"
exon_bed = base / "data" / "bed" / "exon.bed"
sipp_vs_ctcf = base / "data" / "Sipp_exon_vs_CTCF.csv"
bed = OutputBed
size = 5000
......@@ -39,4 +40,5 @@ class TestConfig:
list_genes = base / "list_genes.txt"
gene_bed = base / "genes.bed"
exon_bed = base / "exons.bed"
small_bw = base / "small.bw"
\ No newline at end of file
small_bw = base / "small.bw"
sipp_vs_ctcf = base / 'Sipp_exon_vs_CTCF.csv'
\ No newline at end of file
......@@ -51,9 +51,8 @@ def create_filtered_bed() -> None:
"""
Create a bed file containing only the genes of interest.
"""
gene_list = select_gene_of_interest(BedConfig.ddx_genes)
gene_list = select_gene_of_interest(BedConfig.bed.ddx_genes)
df = filter_bed(BedConfig.gene_bed, gene_list)
BedConfig.bed.output.mkdir(exist_ok=True, parents=True)
df.to_csv(BedConfig.bed.filtered_gene, sep="\t", index=False)
......
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: The goal of this script is to get genes regulated by \
DDX5/17
"""
from .config import BedConfig, TestConfig
from pathlib import Path
import pandas as pd
from doctest import testmod
from typing import List
def format_exon_bed(exon_bed: Path, gene_bed: Path) -> pd.DataFrame:
    """
    Build a table describing every exon found in an exon bed file.

    :param exon_bed: A bed file containing fasterDB exons
    :param gene_bed: A bed file containing fasterDB genes
    :return: A dataframe of exons with their coordinate id (exon_id),
    their id, the id and the name of their gene, their position within
    the gene and their name (gene name + position)

    >>> format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed)
                    exon_id   id  gene_id  gene_name  exon_pos  exon_name
    0  18:28681866-28682388  1_1        1       DSC2         1     DSC2_1
    1  18:28681184-28681432  1_2        1       DSC2         2     DSC2_2
    2  18:28673522-28673606  1_3        1       DSC2         3     DSC2_3
    3  18:28672064-28672263  1_4        1       DSC2         4     DSC2_4
    4  18:28671490-28671530  1_5        1       DSC2         5     DSC2_5
    5  18:28670991-28671110  1_6        1       DSC2         6     DSC2_6
    6  18:28669402-28669557  1_7        1       DSC2         7     DSC2_7
    7  18:28667632-28667776  1_8        1       DSC2         8     DSC2_8
    8  18:28666539-28666705  1_9        1       DSC2         9     DSC2_9
    """
    exons = pd.read_csv(exon_bed, sep="\t")
    # The start is shifted by one when building the coordinate id
    # (presumably to turn the 0-based bed start into a 1-based position).
    exons['exon_id'] = exons['#ref'].astype(str) + ':' + \
        (exons['start'] + 1).astype(str) + '-' + \
        exons['end'].astype(str)
    # An exon id looks like "<gene_id>_<exon_pos>". regex=True must be
    # explicit: pandas >= 2.0 treats the pattern as a literal string by
    # default, which would leave the id untouched and break astype(int).
    exons['gene_id'] = exons['id'].str.replace(
        r'_\d+', '', regex=True).astype(int)
    exons['exon_pos'] = exons['id'].str.replace(
        r'\d+_', '', regex=True).astype(int)
    # In the gene bed, the 'score' column is used as the gene name.
    genes = pd.read_csv(gene_bed, sep="\t")[["id", "score"]]
    genes.columns = ['gene_id', 'gene_name']
    exons = exons.merge(genes, how="left", on=['gene_id'])
    exons['exon_name'] = exons['gene_name'] + "_" + \
        exons['exon_pos'].astype(str)
    return exons[['exon_id', 'id', 'gene_id', 'gene_name', 'exon_pos',
                  'exon_name']]
def load_sipp_vs_ctct(sipp_file: Path, exon_table: pd.DataFrame
                      ) -> pd.DataFrame:
    """
    Read the table of exons regulated by DDX5/17 (with their distance to
    CTCF) and annotate it with the columns of `exon_table`.

    :param sipp_file: A tab-separated file containing exons regulated by
    DDX5/17 and their distance to CTCF.
    :param exon_table: A table of exons; it must contain the columns
    exon_id and exon_name, which are used to join the two tables.
    :return: The content of `sipp_file` with the extra columns of
    `exon_table`. Every row of `sipp_file` is kept (left join).

    >>> df = load_sipp_vs_ctct(TestConfig.sipp_vs_ctcf,
    ... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
    >>> df[['exon_name', "exon_id", 'group', 'id', 'gene_id']]
       exon_name               exon_id      group   id  gene_id
    0     DSC2_1  18:28681866-28682388  siPP_DOWN  1_1        1
    1     DSC2_2  18:28681184-28681432  siPP_DOWN  1_2        1
    2     DSC2_3  18:28673522-28673606  siPP_DOWN  1_3        1
    3     DSC2_4  18:28672064-28672263    siPP_UP  1_4        1
    4     DSC2_5  18:28671490-28671530  siPP_DOWN  1_5        1
    5     DSC2_6  18:28670991-28671110  siPP_DOWN  1_6        1
    6     DSC2_7  18:28669402-28669557  siPP_DOWN  1_7        1
    7     DSC2_8  18:28667632-28667776  siPP_DOWN  1_8        1
    8     DSC2_9  18:28666539-28666705  siPP_DOWN  1_9        1
    """
    join_keys = ["exon_id", "exon_name"]
    regulated_exons = pd.read_csv(sipp_file, sep="\t")
    return pd.merge(regulated_exons, exon_table, how="left", on=join_keys)
def get_ddx_genes(df: pd.DataFrame) -> List[int]:
    """
    Return the unique values of the gene_id column, in order of first
    appearance.

    Missing gene ids (rows that could not be linked to a gene) are
    ignored, and the values are returned as plain python integers so the
    result really is a list of int.

    :param df: A dataframe of DDX regulated exons, with a gene_id column
    :return: The unique gene ids found in the gene_id column.

    >>> df = load_sipp_vs_ctct(TestConfig.sipp_vs_ctcf,
    ... format_exon_bed(TestConfig.exon_bed, TestConfig.gene_bed))
    >>> get_ddx_genes(df)
    [1]
    """
    # A left merge upstream can leave NaN gene ids, which also turns the
    # column into floats: drop them and cast back to int before returning.
    # tolist() converts numpy scalars into native python ints.
    return df['gene_id'].dropna().astype(int).unique().tolist()
def write_regulated_gene_file():
    """
    Write the ids of the genes having exons regulated by DDX5/17 in the
    file `BedConfig.bed.ddx_genes`, one gene id per line.

    The output folder is created when it does not exist.
    """
    output_folder = BedConfig.bed.output
    output_folder.mkdir(exist_ok=True, parents=True)
    regulated_exons = load_sipp_vs_ctct(
        BedConfig.sipp_vs_ctcf,
        format_exon_bed(BedConfig.exon_bed, BedConfig.gene_bed))
    # The gene list is computed before the file is opened, so a failure
    # here does not truncate an existing output file.
    gene_ids = get_ddx_genes(regulated_exons)
    with BedConfig.bed.ddx_genes.open('w') as outfile:
        outfile.writelines(f"{gene_id}\n" for gene_id in gene_ids)
# Run the doctests of this module when it is executed as a script.
if __name__ == "__main__":
    testmod()
exon_name dist exon_id CTCF_hit_id group strand deltaPSI
DSC2_1 1 18:28681866-28682388 18:28681866-28682388 siPP_DOWN - -0.33
DSC2_2 2 18:28681184-28681432 18:28681184-28681432 siPP_DOWN + -0.23
DSC2_3 3 18:28673522-28673606 18:28673522-28673606 siPP_DOWN + -0.17
DSC2_4 4 18:28672064-28672263 18:28672064-28672263 siPP_UP - 0.20
DSC2_5 5 18:28671490-28671530 18:28671490-28671530 siPP_DOWN - -0.17
DSC2_6 6 18:28670991-28671110 18:28670991-28671110 siPP_DOWN - -0.49
DSC2_7 7 18:28669402-28669557 18:28669402-28669557 siPP_DOWN + -0.32
DSC2_8 8 18:28667632-28667776 18:28667632-28667776 siPP_DOWN - -0.32
DSC2_9 9 18:28666539-28666705 18:28666539-28666705 siPP_DOWN - -0.14
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment