diff --git a/src/visu/__main__.py b/src/visu/__main__.py index b08467089953ccfcd8110ac7c7724c2c28f92a4d..bc53640546831b2ae1f49bbcb63b7eacf9b6a578 100644 --- a/src/visu/__main__.py +++ b/src/visu/__main__.py @@ -13,11 +13,11 @@ from pathlib import Path from typing import List -@lp.parse(design='file', region_bed='file', +@lp.parse(design='file', region_beds='file', nb_bin="nb_bin > 5", figure_type=['metagene', 'barplot'], show_replicate=['y', 'n', 'Y', 'N']) -def launcher(design: str, bw_folder: str, region_bed: str, - region_name: str, nb_bin: int = 100, +def launcher(design: str, bw_folder: str, region_beds: List[str], + region_names: List[str], nb_bin: int = 100, figure_type: str = 'metagene', norm: str = 'None', show_replicate: str = 'y', environment: List[int] = (0, 0), border_names: List[str] = ('', ''), @@ -31,8 +31,9 @@ def launcher(design: str, bw_folder: str, region_bed: str, the last one contains the replicate of the condition. :param bw_folder: The folder containing the bigwig file mentioned in \ the first column of the 'design' table. - :param region_bed: A bed file containing the regions to visualise - :param region_name: The name of the region analysed + :param region_beds: A list of bed files containing the regions to visualise + :param region_names: A list of names identifying regions inside \ + the given beds. :param nb_bin: The number of bins used to represents the regions of \ 'region_bed'.
:param figure_type: The kind of representation wanted (barplot or metagene) @@ -58,8 +59,9 @@ def launcher(design: str, bw_folder: str, region_bed: str, norm = Path(norm) if not norm.is_file(): raise FileNotFoundError(f"The file {norm} was not found") - create_figure(Path(design), Path(bw_folder), Path(region_bed), - region_name, nb_bin, figure_type, norm, show_rep, + reg_beds = [Path(p) for p in region_beds] + create_figure(Path(design), Path(bw_folder), reg_beds, + region_names, nb_bin, figure_type, norm, show_rep, environment, border_names, Path(output)) diff --git a/src/visu/figure_maker.py b/src/visu/figure_maker.py index 0f6615f7cad287a4117656ab5dc29e543b220887..2da518f21c5df40d4cc753f60b48f14485bbb1f6 100644 --- a/src/visu/figure_maker.py +++ b/src/visu/figure_maker.py @@ -7,7 +7,7 @@ Description: """ from pathlib import Path -from typing import List, Union, Any +from typing import List, Union, Any, Tuple from doctest import testmod from ..bed_handler.config import TestConfig import pandas as pd @@ -17,15 +17,16 @@ import matplotlib.pyplot as plt from tqdm import tqdm -def load_bed(bed: Path) -> List[List[Union[int, str]]]: +def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]: """ Read a bed file and return the lines within it. 
:param bed: A bed file containing the regions of interest + :param bed_name: The name of the regions of interest inside the bed file :return:The list of feature inside the bed - >>> load_bed(TestConfig.gene_bed)[0] - ['18', 28645943, 28682388, 1, 'DSC2', '-'] + >>> load_bed(TestConfig.gene_bed, 'gene_test')[0] + ['18', 28645943, 28682388, 1, 'DSC2', '-', 'gene_test'] """ list_regions = [] with bed.open('r') as inbed: @@ -33,10 +34,34 @@ def load_bed(bed: Path) -> List[List[Union[int, str]]]: if not line.startswith("#"): cline = line.replace("\n", "").split("\t") list_regions.append([cline[0], int(cline[1]), int(cline[2]), - int(cline[3]), cline[4], cline[5]]) + int(cline[3]), cline[4], cline[5], + bed_name]) return list_regions +def load_beds(beds: List[Path], bed_names: List[str] + ) -> List[List[Union[int, str]]]: + """ + Read several bed files and return the lines within them. + + :param beds: A list of bed files containing the regions of interest + :param bed_names: A list of names identifying regions inside the given \ + beds. + :return: The list of features inside the bed files + + >>> load_beds([TestConfig.gene_bed, TestConfig.gene_bed], + ...
['gene1', 'gene2'])[-1] + ['13', 45967450, 45992516, 9, 'SLC25A30', '-', 'gene2'] + """ + regions = [] + for i in range(len(beds)): + regions += load_bed(beds[i], bed_names[i]) + return regions + + def inspect_bigwig_regions(bw: Any, region: List, replicate: str, nb_bin: int, resize: List[int], condition_name: str, @@ -54,30 +79,30 @@ def inspect_bigwig_regions(bw: Any, region: List, :return: a table with the coverage of this region >>> my_bw = pbw.open(str(TestConfig.small_bw)) - >>> region = ['1', 10, 25, 1, 'Test', '+'] - >>> inspect_bigwig_regions(my_bw, region, 'R1', 5, [4, 2], 'cond1') - coverage bin condition replicate - 0 0.000000 -2 cond1 R1 - 1 0.500000 -1 cond1 R1 - 2 75.000000 0 cond1 R1 - 3 20.000000 1 cond1 R1 - 4 10.000000 2 cond1 R1 - 5 4.666667 3 cond1 R1 - 6 2.000000 4 cond1 R1 - 7 1.000000 5 cond1 R1 - 8 0.500000 6 cond1 R1 - >>> region = ['1', 110, 133, 1, 'Test', '-'] - >>> inspect_bigwig_regions(my_bw, region, 'R1', 5, [4, 2], 'cond1') - coverage bin condition replicate - 0 0.00 -2 cond1 R1 - 1 12.50 -1 cond1 R1 - 2 42.00 0 cond1 R1 - 3 8.00 1 cond1 R1 - 4 4.25 2 cond1 R1 - 5 2.00 3 cond1 R1 - 6 2.00 4 cond1 R1 - 7 1.00 5 cond1 R1 - 8 1.00 6 cond1 R1 + >>> mregion = ['1', 10, 25, 1, 'Test', '+', 'exon'] + >>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [4, 2], 'cond1') + coverage bin condition replicate region + 0 0.000000 -2 cond1 R1 exon + 1 0.500000 -1 cond1 R1 exon + 2 75.000000 0 cond1 R1 exon + 3 20.000000 1 cond1 R1 exon + 4 10.000000 2 cond1 R1 exon + 5 4.666667 3 cond1 R1 exon + 6 2.000000 4 cond1 R1 exon + 7 1.000000 5 cond1 R1 exon + 8 0.500000 6 cond1 R1 exon + >>> mregion = ['1', 110, 133, 1, 'Test', '-', 'exon2'] + >>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [4, 2], 'cond1') + coverage bin condition replicate region + 0 0.00 -2 cond1 R1 exon2 + 1 12.50 -1 cond1 R1 exon2 + 2 42.00 0 cond1 R1 exon2 + 3 8.00 1 cond1 R1 exon2 + 4 4.25 2 cond1 R1 exon2 + 5 2.00 3 cond1 R1 exon2 + 6 2.00 4 cond1 R1 exon2 + 7 1.00 5 cond1 R1 
exon2 + 8 1.00 6 cond1 R1 exon2 """ val = bw.stats(region[0], region[1], region[2], nBins=nb_bin, exact=True) bins = list(range(len(val))) @@ -110,6 +135,7 @@ def inspect_bigwig_regions(bw: Any, region: List, df = pd.DataFrame(dic) df['condition'] = [condition_name] * df.shape[0] df['replicate'] = [replicate] * df.shape[0] + df['region'] = [region[6]] * df.shape[0] return df @@ -165,10 +191,31 @@ def create_full_table(df_exp: pd.DataFrame, regions: List[List], return pd.concat(list_df, axis=0, ignore_index=True) +def merge_condition_region_col(df: pd.DataFrame) -> Tuple[pd.DataFrame, str]: + """ + + :param df: A dataframe of mean coverage for each bin. + :return: The dataframe with the region of condition column merged and \ + the name of the merged column + """ + if len(df['region'].unique()) == 1: + condition_col = 'condition' + df.drop('region', axis=1, inplace=True) + elif len(df['condition'].unique()) == 1: + condition_col = 'region' + df.drop('condition', axis=1, inplace=True) + else: + condition_col = 'condition-region' + df[condition_col] = df['condition'] + "-" + df['region'] + df.drop(['condition', 'region'], axis=1, inplace=True) + return df, condition_col + + def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int, environment: List[int], - region_name: str, order_condition: List[str] - ) -> pd.DataFrame: + region_name: str, order_condition: List[str], + order_bed_name: List[str], + ) -> Tuple[pd.DataFrame, str]: """ summarize the data in df_cov. @@ -180,21 +227,24 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int, the number of bin used to represent those surrounding regions. 
:param region_name: the name of the region analysed :param order_condition: The order of conditions - :return: The summarised dataframe + :param order_bed_name: The order of bed name to respect + :return: The summarised dataframe and the condition col """ - df_sum = df_cov.groupby(['bin', 'condition', 'replicate']).mean() \ + df_sum = df_cov.groupby(['bin', 'condition', 'region', 'replicate']) \ + .mean() \ .reset_index() if figure_type == "metagene": - return df_sum + df_sum, condition_col = merge_condition_region_col(df_sum) + return df_sum, condition_col if environment[0] != 0: df_sum['location'] = df_cov['bin'].apply( lambda x: f"before_{region_name}" if x < 0 else f"after_{region_name}" if x >= nb_bin else region_name) df_sum.drop('bin', axis=1, inplace=True) if environment[0] != 0: - col_merge = ['condition', 'replicate', 'location'] + col_merge = ['condition', 'region', 'replicate', 'location'] else: - col_merge = ['condition', 'replicate'] + col_merge = ['condition', 'region', 'replicate'] df_sum = df_sum.groupby(col_merge).mean().reset_index() if 'location' in df_sum.columns: df_sum['location'] = pd.Categorical( @@ -206,15 +256,21 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int, df_sum['condition'], ordered=True, categories=order_condition ) - df_sum.sort_values(['condition', 'location'], ascending=True, + df_sum['region'] = pd.Categorical( + df_sum['region'], ordered=True, + categories=order_bed_name + ) + df_sum.sort_values(['condition', 'region', 'location'], ascending=True, inplace=True) - return df_sum + df_sum, condition_col = merge_condition_region_col(df_sum) + return df_sum, condition_col def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, border_names: List[str], nb_bin: int, - environment: List[int], region_name: str, - output: Path, norm: Union[int, Path]) -> None: + environment: List[int], bed_name: str, + output: Path, norm: Union[int, Path], + condition_col: str) -> None: """ Create a metagene figure 
on the region of interest. @@ -226,18 +282,19 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, nucleotide to represent around the region of interest and the second, the number of bin used to represent those surrounding regions. :param output: Folder where the figure will be created - :param region_name: The region of interest + :param bed_name: The name of considered regions :param norm: an integer corresponding to the bin used to normalise \ the samples or a file containing the normalisations to apply to \ each samples + :param condition_col: The name of the condition columns """ sns.set(context='poster', style='white') if show_replicate: - g = sns.relplot('bin', 'coverage', hue='condition', data=df_sum, + g = sns.relplot('bin', 'coverage', hue=condition_col, data=df_sum, kind='line', style='replicate', ci=None, height=12, aspect=1.7) else: - g = sns.relplot('bin', 'coverage', hue='condition', data=df_sum, + g = sns.relplot('bin', 'coverage', hue=condition_col, data=df_sum, kind='line', ci="sd", height=12, aspect=1.7) y_val = g.ax.get_ylim()[1] * 0.99 if border_names[0] != '': @@ -250,11 +307,13 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, g.set_xlabels('Bins') g.set_ylabels('Coverage') plt.subplots_adjust(top=0.9) - title = f"Average coverage in region '{region_name}'" + tmp_bed_name = bed_name.replace("--", ", ") + title = f"Average coverage in region '{tmp_bed_name}'" if environment[0] != 0: title += f"\nand in their surrounding regions of {environment[0]} nt" g.fig.suptitle(title) - outfile_title = f"metagene_{region_name}_{nb_bin}bin_" \ + tmp_bed_name = bed_name.replace("--", "-") + outfile_title = f"metagene_{tmp_bed_name}_{nb_bin}bin_" \ f"{environment[0]}_nt-around-{environment[1]}-bin" if isinstance(norm, int): outfile_title += f"_b{norm}_norm" @@ -268,7 +327,8 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool, nb_bin: int, environment: 
List[int], region_name: str, - output: Path, norm: Union[int, Path]) -> None: + output: Path, norm: Union[int, Path], + condition_col: str) -> None: """ Create a barplot figure on the region of interest. @@ -283,22 +343,25 @@ def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool, :param norm: an integer corresponding to the bin used to normalise \ the samples or a file containing the normalisations to apply to \ each samples + :param condition_col: The name of the condition columns """ sns.set(context='poster', style='white') if show_replicate: - g = sns.catplot(x="condition", y="coverage", hue="replicate", + g = sns.catplot(x=condition_col, y="coverage", hue="replicate", kind="bar", data=df_sum, height=12, aspect=1.77, ci=None) else: - g = sns.catplot(x="condition", y="coverage", + g = sns.catplot(x=condition_col, y="coverage", kind="bar", data=df_sum, height=12, aspect=1.77, ci='sd') g.set_xlabels('') g.set_ylabels('Coverage') plt.subplots_adjust(top=0.9) - title = f"Average coverage in region '{region_name}'" + rgt = region_name.replace('--', ', ') + title = f"Average coverage in region '{rgt}'" g.fig.suptitle(title) - outfile_title = f"barplot_{region_name}_{nb_bin}bin_" \ + rgt = region_name.replace('--', '-') + outfile_title = f"barplot_{rgt}_{nb_bin}bin_" \ f"{environment[0]}_nt-around-{environment[1]}-bin" if isinstance(norm, int): outfile_title += f"_b{norm}_norm" @@ -326,8 +389,8 @@ def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path], raise ValueError(f"the bin {norm} was not found in the coverage " f"dataframe.") df_val = df.loc[df['bin'] == norm, - ['coverage', 'condition', 'replicate']] \ - .groupby(['condition', 'replicate']).mean().reset_index() + ['coverage', 'condition', 'region', 'replicate']] \ + .groupby(['condition', 'region', 'replicate']).mean().reset_index() df_val.rename({"coverage": "coef"}, axis=1, inplace=True) noutfile = outfile.parent / 'coef_table' / \ (outfile.name.replace(".txt.gz", "") + @@ -336,14 +399,19 @@ def 
bin_normalisation(df: pd.DataFrame, norm: Union[int, Path], df_val.to_csv(noutfile, sep="\t", index=False) else: df_val = pd.read_csv(norm, sep="\t") - df = df.merge(df_val, how="left", on=['condition', 'replicate']) + if len(df_val['region'].unique()) > 1: + df = df.merge(df_val, how="left", on=['condition', 'region', + 'replicate']) + else: + df_val.drop('region', axis=1, inplace=True) + df = df.merge(df_val, how="left", on=['condition', 'replicate']) df['coverage'] = df['coverage'] / df['coef'] df.drop('coef', axis=1, inplace=True) return df -def create_figure(design: Path, bw_folder: Path, region_bed: Path, - region_name: str, nb_bin: int = 100, +def create_figure(design: Path, bw_folder: Path, region_beds: List[Path], + bed_names: List[str], nb_bin: int = 100, figure_type: str = 'metagene', norm: Union[int, Path, None] = None, show_replicate: bool = True, environment: List[int] = (0, 0), @@ -358,8 +426,9 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path, the last one contains the replicate of the condition. :param bw_folder: The folder containing the bigwig file mentioned in \ the first column of the 'design' table. - :param region_bed: A bed file containing the regions to visualise - :param region_name: The name of the region analysed + :param region_beds: A list of bed files containing the regions to visualise + :param bed_names: A list of names identifying regions inside the given \ + beds. :param nb_bin: The number of bins used to represents the regions of \ 'region_bed'.
:param figure_type: The kind of representation wanted (barplot or metagene) @@ -374,10 +443,15 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path, :param border_names: The name of the borders :param output: Folder where the results will be created """ + if len(region_beds) != len(bed_names): + raise IndexError("Parameter region_beds and bed_names should " + "have the same length") df_exp = pd.read_csv(design, sep="\t") - regions = load_bed(region_bed) - region_bed_name = region_bed.name.replace('.bed', '') - outfile = f'tmp_cov_table_{design.name}_{region_bed_name}_{nb_bin}bin_' \ + regions = load_beds(region_beds, bed_names) + region_bed_name = "-".join([b.name.replace('.bed', '') + for b in region_beds]) + outfile = f'tmp_cov_table_{design.name.replace(".txt", "")}' \ + f'_{region_bed_name}_{nb_bin}bin_' \ f'{environment[0]}_nt-around-{environment[1]}-bin' if isinstance(norm, int): outfile += f'_bin{norm}_norm' @@ -397,20 +471,22 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path, for condition in df_exp['condition'].to_list(): if condition not in ordered_condition: ordered_condition.append(condition) - df_sum = create_df_summary(df_cov, figure_type, nb_bin, environment, - region_name, ordered_condition) + region_kind = "--".join(bed_names) + df_sum, cond_col = create_df_summary(df_cov, figure_type, nb_bin, + environment, region_kind, + ordered_condition, bed_names) if figure_type == "metagene": figure_metagene(df_sum, show_replicate, border_names, nb_bin, - environment, region_name, output, norm) + environment, region_kind, output, norm, cond_col) else: if 'location' in df_sum.columns: for cur_region in df_sum['location'].unique(): df_tmp = df_sum.loc[df_sum['location'] == cur_region, :] figure_barplot(df_tmp, show_replicate, nb_bin, environment, - cur_region, output, norm) + cur_region, output, norm, cond_col) else: figure_barplot(df_sum, show_replicate, nb_bin, environment, - region_name, output, norm) + region_kind, output, 
norm, cond_col) if __name__ == "__main__":