diff --git a/src/download_encode_eclip/create_gene_bed.py b/src/download_encode_eclip/create_gene_bed.py index f695072dde4694a272714f73ab8c4237d7d85ace..824848f2a0f7d899ede58c5f7fbff09545a1c0d1 100644 --- a/src/download_encode_eclip/create_gene_bed.py +++ b/src/download_encode_eclip/create_gene_bed.py @@ -4,7 +4,8 @@ """ Description: The goal of this script is to create a bed gene file \ -without the first and the last exons. +without the first and the last exons or a bed file containing internal \ +exons with an extra 200 nt around those exons """ @@ -12,6 +13,7 @@ from .config import Config, ConfigEncodeClip import pandas as pd from pathlib import Path import doctest +from typing import List class BadIDError(Exception): @@ -83,6 +85,64 @@ def load_exon_bed(exon_bed: Path) -> pd.DataFrame: return df +def filter_first_last_exons(exon_df: pd.DataFrame) -> pd.DataFrame: + """ + Remove the first and last exons in `exon_df` dataframe + + :param exon_df: A dataframe of exon in a bed format + :return: The same dataframe but without the first and last exons + + >>> ed = load_exon_bed(Config.tests_files / "exons.bed") + >>> filter_first_last_exons(ed) + chr start stop id score strand + 0 18 28681183 28681432 1_2 0 - + 1 18 28673521 28673606 1_3 0 - + 2 18 28672063 28672263 1_4 0 - + 3 18 28671489 28671530 1_5 0 - + 4 18 28670990 28671110 1_6 0 - + 5 18 28669401 28669557 1_7 0 - + 6 18 28667631 28667776 1_8 0 - + """ + exon_df = exon_df.loc[-exon_df['id'].str.contains(r"\d+_1$"), :] + d = pd.DataFrame(exon_df["id"].str.split("_").to_list(), + columns=["gene", "pos"]) + d['pos'] = d['pos'].astype(int) + d['gene'] = d['gene'].astype(int) + d = d.sort_values(["gene", "pos"], ascending=True)\ + .drop_duplicates(subset="gene", keep="last") + last_id = (d["gene"].astype(str) + "_" + d["pos"].astype(str)).to_list() + return exon_df[-exon_df["id"].isin(last_id)].reset_index(drop=True) + + +def add_slop(df_exon: pd.DataFrame, size: int = 200) -> pd.DataFrame: + """ + Add 200 nt before and after the exons. + + :param df_exon: A dataframe containing exons. + :param size: The size to add at the beginning and the end of the exon. + :return: The df_exon with and extra size at the left and right ends + + >>> ed = load_exon_bed(Config.tests_files / "exons.bed") + >>> ed.loc[ed["id"] == "1_1", "start"] = 1 + >>> add_slop(ed, 200) + chr start stop id score strand + 0 18 0 28682588 1_1 0 - + 1 18 28680983 28681632 1_2 0 - + 2 18 28673321 28673806 1_3 0 - + 3 18 28671863 28672463 1_4 0 - + 4 18 28671289 28671730 1_5 0 - + 5 18 28670790 28671310 1_6 0 - + 6 18 28669201 28669757 1_7 0 - + 7 18 28667431 28667976 1_8 0 - + 8 18 28666338 28666905 1_9 0 - + """ + df_exon["start"] = df_exon["start"] - size + df_exon.loc[df_exon["start"] < 0, "start"] = 0 + df_exon["stop"] = df_exon["stop"] + size + return df_exon + + + def get_first_and_last_exons(df_exon: pd.DataFrame) -> pd.DataFrame: """ Return a dataframe indicating for each gene, it's first or last exons. @@ -176,25 +236,41 @@ def create_new_bed(df_exons: pd.DataFrame) -> pd.DataFrame: ... 'start_last': [0, 900], 'stop_last': [100, 1000], 'gene_id': [1, 1], ... 'strand': ['-', '+']}) >>> create_new_bed(d) - chr start stop id score strand - 0 18 100 900 1 0 - - 1 18 100 900 1 0 + + chr start stop id score strand + 0 chr18 100 900 1 0 - + 1 chr18 100 900 1 0 + """ df_exons = df_exons.apply(get_gene_row, axis=1) - df_exons["chr"] = "chr" + df_exons["chr"] + df_exons["chr"] = "chr" + df_exons["chr"].astype('str') return df_exons -def create_gene_bed(): +def create_gene_bed(level: str) -> Path: """ - Create a gene files without first and last exons. + Create a gene files without first and last exons or an exon bed file \ + with exon extended from 200nt at both side without first and last exon + + :param level: 'gene' to create a bed file without first and last exons \ + 'exon' to create a bed file of extended exon (200nt at each size) \ + without first and last exon + :return: The path where the exon file have been created """ df_exon = load_exon_bed(Config.bed_exon) - df_fl = get_first_and_last_exons(df_exon) - bed_gene = create_new_bed(df_fl) - ConfigEncodeClip.output_gene.parent.mkdir(exist_ok=True) - bed_gene.to_csv(ConfigEncodeClip.output_gene, sep="\t", index=False, - header=False) + if level == "gene": + df_fl = get_first_and_last_exons(df_exon) + bed_gene = create_new_bed(df_fl) + ConfigEncodeClip.output_gene.parent.mkdir(exist_ok=True) + bed_gene.to_csv(ConfigEncodeClip.output_gene, sep="\t", index=False, + header=False) + return ConfigEncodeClip.output_gene + else: + df_exon = filter_first_last_exons(df_exon) + df_exon = add_slop(df_exon, 200) + ConfigEncodeClip.output_exon.parent.mkdir(exist_ok=True) + df_exon["chr"] = "chr" + df_exon["chr"].astype('str') + df_exon.to_csv(ConfigEncodeClip.output_exon, sep="\t", index=False, + header=False) + return ConfigEncodeClip.output_exon if __name__ == "__main__":