diff --git a/src/download_encode_eclip/create_gene_bed.py b/src/download_encode_eclip/create_gene_bed.py new file mode 100644 index 0000000000000000000000000000000000000000..f695072dde4694a272714f73ab8c4237d7d85ace --- /dev/null +++ b/src/download_encode_eclip/create_gene_bed.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: The goal of this script is to create a bed gene file \ +without the first and the last exons. +""" + + +from .config import Config, ConfigEncodeClip +import pandas as pd +from pathlib import Path +import doctest + + +class BadIDError(Exception): + """ + Class defining a bad exon id error. + """ + pass + + +def check_id(mserie: pd.Series) -> None: + """ + Check the id column of a row in a dataframe containing exons. + + :param mserie: A row of a dataframe containing exon in a bed format + + >>> check_id(pd.Series({"chr": 1, "start": 0, "stop": 50, "id": "1_10", + ... "score": 0, "strand": '-'})) + >>> check_id(pd.Series({"chr": 1, "start": 0, "stop": 50, "id": "cd_10", + ... "score": 0, "strand": '-'})) + Traceback (most recent call last): + ... + BadIDError: The exon cd_10 as a bad id format. + It should have the form id_pos where + 1. id: Is the fasterDB gene id of the exons + 2. pos: Is the position of the exon within the gene + >>> check_id(pd.Series({"chr": 1, "start": 0, "stop": 50, "id": "17", + ... "score": 0, "strand": '-'})) + Traceback (most recent call last): + ... + BadIDError: The exon 17 as a bad id format. + It should have the form id_pos where + 1. id: Is the fasterDB gene id of the exons + 2. pos: Is the position of the exon within the gene + """ + res = mserie["id"].split("_") + if len(res) != 2 or not res[0].isdigit() or not res[1].isdigit(): + raise BadIDError(f"The exon {mserie['id']} as a bad id format.\n" + f"It should have the form id_pos where\n" + f" 1. id: Is the fasterDB gene id of the exons\n" + f" 2. pos: Is the position of the exon within the " + f"gene") + + +def load_exon_bed(exon_bed: Path) -> pd.DataFrame: + """ + Load a bed file containing exons. The columns containing the \ + names of intervals must have the following structure: id_pos where: + 1. id is the fasterDB id of a gene + 2. pos: Is the exons position on the gene + + :param exon_bed: A bed file containing exons + :return: The loaded exons bed file as a dataframe + + >>> load_exon_bed(Config.tests_files / "exons.bed") + chr start stop id score strand + 0 18 28681865 28682388 1_1 0 - + 1 18 28681183 28681432 1_2 0 - + 2 18 28673521 28673606 1_3 0 - + 3 18 28672063 28672263 1_4 0 - + 4 18 28671489 28671530 1_5 0 - + 5 18 28670990 28671110 1_6 0 - + 6 18 28669401 28669557 1_7 0 - + 7 18 28667631 28667776 1_8 0 - + 8 18 28666538 28666705 1_9 0 - + """ + df = pd.read_csv(exon_bed, sep="\t", names=["chr", "start", "stop", "id", + "score", "strand"]) + df.apply(check_id, axis=1) + return df + + +def get_first_and_last_exons(df_exon: pd.DataFrame) -> pd.DataFrame: + """ + Return a dataframe indicating for each gene, it's first or last exons. + + :param df_exon: A dataframe containing exons in bed format. + :return: dataframe indicating for each gene, it's first or last exons. + + >>> d = pd.DataFrame({ + ... 'chr': {0: 18, 1: 18, 2: 18, 3: 18, 4: 18, 5: 18, 6: 18, 7: 18, 8: 18}, + ... 'start': {0: 28681865, 1: 28681183, 2: 28673521, 3: 28672063, + ... 4: 28671489, 5: 28670990, 6: 28669401, 7: 28667631, + ... 8: 28666538}, + ... 'stop': {0: 28682388, 1: 28681432, 2: 28673606, 3: 28672263, + ... 4: 28671530, 5: 28671110, 6: 28669557, 7: 28667776, + ... 8: 28666705}, + ... 'id': {0: '1_1', 1: '1_2', 2: '1_3', 3: '1_4', 4: '1_5', 5: '1_6', + ... 6: '1_7', 7: '1_8', 8: '1_9'}, + ... 'score': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0}, + ... 'strand': {0: '-', 1: '-', 2: '-', 3: '-', 4: '-', 5: '-', 6: '-', + ... 7: '-', 8: '-'}}) + >>> get_first_and_last_exons(d) + chr start_first stop_first start_last stop_last gene_id strand + 0 18 28681865 28682388 28666538 28666705 1 - + """ + df_exon["gene_id"] = df_exon["id"].str.replace(r"_\d+", "", regex=True)\ + .astype(int) + df_exon["pos"] = df_exon["id"].str.replace(r"\d+_", "", regex=True)\ + .astype(int) + df_exon.sort_values(["gene_id", "pos"], ascending=True, inplace=True) + df_first = df_exon.drop_duplicates(subset="gene_id", keep="first") + df_last = df_exon.drop_duplicates(subset="gene_id", keep="last") + good_col = ["chr", "start_first", "stop_first", "start_last", "stop_last", + "gene_id", "strand"] + df = pd.merge(df_first, df_last, how="outer", + on=["gene_id", "chr", "strand", "score"], + suffixes=["_first", "_last"]) + return df[df["pos_first"] != df["pos_last"]][good_col] + + +def get_gene_row(row: pd.Series) -> pd.Series: + """ + From a row containing the position of the first and the last exons of \ + a gene, return another row containing the intervals of a gene without \ + this first and last exons. + + :param row: A row containing the position of the first and the \ + last exons of a gene + :return: another row containing the intervals of a gene without \ + this first and last exons. + + >>> get_gene_row(pd.Series( + ... {'chr': 18, 'start_first': 900, 'stop_first': 1000, + ... 'start_last': 0, 'stop_last': 100, 'gene_id': 1, + ... 'strand': '-'})).to_dict() == {'chr': 18, 'start': 100, + ... 'stop': 900, 'id': 1, 'score': 0, 'strand': '-'} + True + >>> get_gene_row(pd.Series( + ... {'chr': 18, 'start_first': 0, 'stop_first': 100, + ... 'start_last': 900, 'stop_last': 1000, 'gene_id': 1, + ... 'strand': '+'})).to_dict() == {'chr': 18, 'start': 100, + ... 'stop': 900, 'id': 1, 'score': 0, 'strand': '+'} + True + """ + if row["strand"] == "+": + return pd.Series({"chr": row["chr"], + "start": row["stop_first"], + "stop": row["start_last"], + "id": row["gene_id"], + "score": 0, + "strand": row["strand"]}) + return pd.Series({"chr": row["chr"], + "start": row["stop_last"], + "stop": row["start_first"], + "id": row["gene_id"], + "score": 0, + "strand": row["strand"]}) + + +def create_new_bed(df_exons: pd.DataFrame) -> pd.DataFrame: + """ + From a dataframe containing the first and the last exons of \ + each gene create dataframe of genes without the first and the last exons. + + :param df_exons: a dataframe containing the first and the last exons of \ + each gene create + :return: The dataframe in bed format of genes without it's first or last \ + exons + + >>> d = pd.DataFrame({'chr': [18, 18], 'start_first': [900, 0], + ... 'stop_first': [1000, 100], + ... 'start_last': [0, 900], 'stop_last': [100, 1000], 'gene_id': [1, 1], + ... 'strand': ['-', '+']}) + >>> create_new_bed(d) + chr start stop id score strand + 0 18 100 900 1 0 - + 1 18 100 900 1 0 + + """ + df_exons = df_exons.apply(get_gene_row, axis=1) + df_exons["chr"] = "chr" + df_exons["chr"] + return df_exons + + +def create_gene_bed(): + """ + Create a gene files without first and last exons. + """ + df_exon = load_exon_bed(Config.bed_exon) + df_fl = get_first_and_last_exons(df_exon) + bed_gene = create_new_bed(df_fl) + ConfigEncodeClip.output_gene.parent.mkdir(exist_ok=True) + bed_gene.to_csv(ConfigEncodeClip.output_gene, sep="\t", index=False, + header=False) + + +if __name__ == "__main__": + doctest.testmod()