Skip to content
Snippets Groups Projects
Commit 53af41fc authored by nfontrod's avatar nfontrod
Browse files

src/download_encode_eclip/create_gene_bed.py: script to create a bed file of...

src/download_encode_eclip/create_gene_bed.py: script to create a bed file of human geneswithout the first and the last exons
parent d0cae3c0
Branches
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: The goal of this script is to create a bed gene file \
without the first and the last exons.
"""
from .config import Config, ConfigEncodeClip
import pandas as pd
from pathlib import Path
import doctest
class BadIDError(Exception):
"""
Class defining a bad exon id error.
"""
pass
def check_id(mserie: pd.Series) -> None:
"""
Check the id column of a row in a dataframe containing exons.
:param mserie: A row of a dataframe containing exon in a bed format
>>> check_id(pd.Series({"chr": 1, "start": 0, "stop": 50, "id": "1_10",
... "score": 0, "strand": '-'}))
>>> check_id(pd.Series({"chr": 1, "start": 0, "stop": 50, "id": "cd_10",
... "score": 0, "strand": '-'}))
Traceback (most recent call last):
...
BadIDError: The exon cd_10 as a bad id format.
It should have the form id_pos where
1. id: Is the fasterDB gene id of the exons
2. pos: Is the position of the exon within the gene
>>> check_id(pd.Series({"chr": 1, "start": 0, "stop": 50, "id": "17",
... "score": 0, "strand": '-'}))
Traceback (most recent call last):
...
BadIDError: The exon 17 as a bad id format.
It should have the form id_pos where
1. id: Is the fasterDB gene id of the exons
2. pos: Is the position of the exon within the gene
"""
res = mserie["id"].split("_")
if len(res) != 2 or not res[0].isdigit() or not res[1].isdigit():
raise BadIDError(f"The exon {mserie['id']} as a bad id format.\n"
f"It should have the form id_pos where\n"
f" 1. id: Is the fasterDB gene id of the exons\n"
f" 2. pos: Is the position of the exon within the "
f"gene")
def load_exon_bed(exon_bed: Path) -> pd.DataFrame:
"""
Load a bed file containing exons. The columns containing the \
names of intervals must have the following structure: id_pos where:
1. id is the fasterDB id of a gene
2. pos: Is the exons position on the gene
:param exon_bed: A bed file containing exons
:return: The loaded exons bed file as a dataframe
>>> load_exon_bed(Config.tests_files / "exons.bed")
chr start stop id score strand
0 18 28681865 28682388 1_1 0 -
1 18 28681183 28681432 1_2 0 -
2 18 28673521 28673606 1_3 0 -
3 18 28672063 28672263 1_4 0 -
4 18 28671489 28671530 1_5 0 -
5 18 28670990 28671110 1_6 0 -
6 18 28669401 28669557 1_7 0 -
7 18 28667631 28667776 1_8 0 -
8 18 28666538 28666705 1_9 0 -
"""
df = pd.read_csv(exon_bed, sep="\t", names=["chr", "start", "stop", "id",
"score", "strand"])
df.apply(check_id, axis=1)
return df
def get_first_and_last_exons(df_exon: pd.DataFrame) -> pd.DataFrame:
"""
Return a dataframe indicating for each gene, it's first or last exons.
:param df_exon: A dataframe containing exons in bed format.
:return: dataframe indicating for each gene, it's first or last exons.
>>> d = pd.DataFrame({
... 'chr': {0: 18, 1: 18, 2: 18, 3: 18, 4: 18, 5: 18, 6: 18, 7: 18, 8: 18},
... 'start': {0: 28681865, 1: 28681183, 2: 28673521, 3: 28672063,
... 4: 28671489, 5: 28670990, 6: 28669401, 7: 28667631,
... 8: 28666538},
... 'stop': {0: 28682388, 1: 28681432, 2: 28673606, 3: 28672263,
... 4: 28671530, 5: 28671110, 6: 28669557, 7: 28667776,
... 8: 28666705},
... 'id': {0: '1_1', 1: '1_2', 2: '1_3', 3: '1_4', 4: '1_5', 5: '1_6',
... 6: '1_7', 7: '1_8', 8: '1_9'},
... 'score': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0},
... 'strand': {0: '-', 1: '-', 2: '-', 3: '-', 4: '-', 5: '-', 6: '-',
... 7: '-', 8: '-'}})
>>> get_first_and_last_exons(d)
chr start_first stop_first start_last stop_last gene_id strand
0 18 28681865 28682388 28666538 28666705 1 -
"""
df_exon["gene_id"] = df_exon["id"].str.replace(r"_\d+", "", regex=True)\
.astype(int)
df_exon["pos"] = df_exon["id"].str.replace(r"\d+_", "", regex=True)\
.astype(int)
df_exon.sort_values(["gene_id", "pos"], ascending=True, inplace=True)
df_first = df_exon.drop_duplicates(subset="gene_id", keep="first")
df_last = df_exon.drop_duplicates(subset="gene_id", keep="last")
good_col = ["chr", "start_first", "stop_first", "start_last", "stop_last",
"gene_id", "strand"]
df = pd.merge(df_first, df_last, how="outer",
on=["gene_id", "chr", "strand", "score"],
suffixes=["_first", "_last"])
return df[df["pos_first"] != df["pos_last"]][good_col]
def get_gene_row(row: pd.Series) -> pd.Series:
"""
From a row containing the position of the first and the last exons of \
a gene, return another row containing the intervals of a gene without \
this first and last exons.
:param row: A row containing the position of the first and the \
last exons of a gene
:return: another row containing the intervals of a gene without \
this first and last exons.
>>> get_gene_row(pd.Series(
... {'chr': 18, 'start_first': 900, 'stop_first': 1000,
... 'start_last': 0, 'stop_last': 100, 'gene_id': 1,
... 'strand': '-'})).to_dict() == {'chr': 18, 'start': 100,
... 'stop': 900, 'id': 1, 'score': 0, 'strand': '-'}
True
>>> get_gene_row(pd.Series(
... {'chr': 18, 'start_first': 0, 'stop_first': 100,
... 'start_last': 900, 'stop_last': 1000, 'gene_id': 1,
... 'strand': '+'})).to_dict() == {'chr': 18, 'start': 100,
... 'stop': 900, 'id': 1, 'score': 0, 'strand': '+'}
True
"""
if row["strand"] == "+":
return pd.Series({"chr": row["chr"],
"start": row["stop_first"],
"stop": row["start_last"],
"id": row["gene_id"],
"score": 0,
"strand": row["strand"]})
return pd.Series({"chr": row["chr"],
"start": row["stop_last"],
"stop": row["start_first"],
"id": row["gene_id"],
"score": 0,
"strand": row["strand"]})
def create_new_bed(df_exons: pd.DataFrame) -> pd.DataFrame:
"""
From a dataframe containing the first and the last exons of \
each gene create dataframe of genes without the first and the last exons.
:param df_exons: a dataframe containing the first and the last exons of \
each gene create
:return: The dataframe in bed format of genes without it's first or last \
exons
>>> d = pd.DataFrame({'chr': [18, 18], 'start_first': [900, 0],
... 'stop_first': [1000, 100],
... 'start_last': [0, 900], 'stop_last': [100, 1000], 'gene_id': [1, 1],
... 'strand': ['-', '+']})
>>> create_new_bed(d)
chr start stop id score strand
0 18 100 900 1 0 -
1 18 100 900 1 0 +
"""
df_exons = df_exons.apply(get_gene_row, axis=1)
df_exons["chr"] = "chr" + df_exons["chr"]
return df_exons
def create_gene_bed():
"""
Create a gene files without first and last exons.
"""
df_exon = load_exon_bed(Config.bed_exon)
df_fl = get_first_and_last_exons(df_exon)
bed_gene = create_new_bed(df_fl)
ConfigEncodeClip.output_gene.parent.mkdir(exist_ok=True)
bed_gene.to_csv(ConfigEncodeClip.output_gene, sep="\t", index=False,
header=False)
if __name__ == "__main__":
doctest.testmod()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment