Skip to content
Snippets Groups Projects
Commit b5f1625c authored by nfontrod's avatar nfontrod
Browse files

src/download_encode_eclip/create_gene_bed.py: add a parameter level in...

src/download_encode_eclip/create_gene_bed.py: add a parameter level in create_gene_bed to choose ti create a bed containing genes without their first and last exons or a bed file containing exons (extended by 200 nt at both side) without the first and the last exons
parent a7aa08a1
No related branches found
No related tags found
No related merge requests found
......@@ -4,7 +4,8 @@
"""
Description: The goal of this script is to create a bed gene file \
without the first and the last exons.
without the first and the last exons or a bed file containing internal \
exons with an extra 200 nt around those exons
"""
......@@ -12,6 +13,7 @@ from .config import Config, ConfigEncodeClip
import pandas as pd
from pathlib import Path
import doctest
from typing import List
class BadIDError(Exception):
......@@ -83,6 +85,64 @@ def load_exon_bed(exon_bed: Path) -> pd.DataFrame:
return df
def filter_first_last_exons(exon_df: pd.DataFrame) -> pd.DataFrame:
"""
Remove the first and last exons in `exon_df` dataframe
:param exon_df: A dataframe of exon in a bed format
:return: The same dataframe but without the first and last exons
>>> ed = load_exon_bed(Config.tests_files / "exons.bed")
>>> filter_first_last_exons(ed)
chr start stop id score strand
0 18 28681183 28681432 1_2 0 -
1 18 28673521 28673606 1_3 0 -
2 18 28672063 28672263 1_4 0 -
3 18 28671489 28671530 1_5 0 -
4 18 28670990 28671110 1_6 0 -
5 18 28669401 28669557 1_7 0 -
6 18 28667631 28667776 1_8 0 -
"""
exon_df = exon_df.loc[-exon_df['id'].str.contains(r"\d+_1$"), :]
d = pd.DataFrame(exon_df["id"].str.split("_").to_list(),
columns=["gene", "pos"])
d['pos'] = d['pos'].astype(int)
d['gene'] = d['gene'].astype(int)
d = d.sort_values(["gene", "pos"], ascending=True)\
.drop_duplicates(subset="gene", keep="last")
last_id = (d["gene"].astype(str) + "_" + d["pos"].astype(str)).to_list()
return exon_df[-exon_df["id"].isin(last_id)].reset_index(drop=True)
def add_slop(df_exon: pd.DataFrame, size: int = 200) -> pd.DataFrame:
"""
Add 200 nt before and after the exons.
:param df_exon: A dataframe containing exons.
:param size: The size to add at the beginning and the end of the exon.
:return: The df_exon with and extra size at the left and right ends
>>> ed = load_exon_bed(Config.tests_files / "exons.bed")
>>> ed.loc[ed["id"] == "1_1", "start"] = 1
>>> add_slop(ed, 200)
chr start stop id score strand
0 18 0 28682588 1_1 0 -
1 18 28680983 28681632 1_2 0 -
2 18 28673321 28673806 1_3 0 -
3 18 28671863 28672463 1_4 0 -
4 18 28671289 28671730 1_5 0 -
5 18 28670790 28671310 1_6 0 -
6 18 28669201 28669757 1_7 0 -
7 18 28667431 28667976 1_8 0 -
8 18 28666338 28666905 1_9 0 -
"""
df_exon["start"] = df_exon["start"] - size
df_exon.loc[df_exon["start"] < 0, "start"] = 0
df_exon["stop"] = df_exon["stop"] + size
return df_exon
def get_first_and_last_exons(df_exon: pd.DataFrame) -> pd.DataFrame:
"""
Return a dataframe indicating for each gene, it's first or last exons.
......@@ -176,25 +236,41 @@ def create_new_bed(df_exons: pd.DataFrame) -> pd.DataFrame:
... 'start_last': [0, 900], 'stop_last': [100, 1000], 'gene_id': [1, 1],
... 'strand': ['-', '+']})
>>> create_new_bed(d)
chr start stop id score strand
0 18 100 900 1 0 -
1 18 100 900 1 0 +
chr start stop id score strand
0 chr18 100 900 1 0 -
1 chr18 100 900 1 0 +
"""
df_exons = df_exons.apply(get_gene_row, axis=1)
df_exons["chr"] = "chr" + df_exons["chr"]
df_exons["chr"] = "chr" + df_exons["chr"].astype('str')
return df_exons
def create_gene_bed():
def create_gene_bed(level: str) -> Path:
"""
Create a gene files without first and last exons.
Create a gene files without first and last exons or an exon bed file \
with exon extended from 200nt at both side without first and last exon
:param level: 'gene' to create a bed file without first and last exons \
'exon' to create a bed file of extended exon (200nt at each size) \
without first and last exon
:return: The path where the exon file have been created
"""
df_exon = load_exon_bed(Config.bed_exon)
df_fl = get_first_and_last_exons(df_exon)
bed_gene = create_new_bed(df_fl)
ConfigEncodeClip.output_gene.parent.mkdir(exist_ok=True)
bed_gene.to_csv(ConfigEncodeClip.output_gene, sep="\t", index=False,
header=False)
if level == "gene":
df_fl = get_first_and_last_exons(df_exon)
bed_gene = create_new_bed(df_fl)
ConfigEncodeClip.output_gene.parent.mkdir(exist_ok=True)
bed_gene.to_csv(ConfigEncodeClip.output_gene, sep="\t", index=False,
header=False)
return ConfigEncodeClip.output_gene
else:
df_exon = filter_first_last_exons(df_exon)
df_exon = add_slop(df_exon, 200)
ConfigEncodeClip.output_exon.parent.mkdir(exist_ok=True)
df_exon["chr"] = "chr" + df_exon["chr"].astype('str')
df_exon.to_csv(ConfigEncodeClip.output_exon, sep="\t", index=False,
header=False)
return ConfigEncodeClip.output_exon
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment