diff --git a/src/visu/__main__.py b/src/visu/__main__.py index 49e4339729960f7490858f59d69b4bd868501824..b93c4abe464ace5bbb02c00ffad7cc97d1fcac1c 100644 --- a/src/visu/__main__.py +++ b/src/visu/__main__.py @@ -16,10 +16,10 @@ from typing import List @lp.parse(design='file', region_bed='file', nb_bin="nb_bin > 5", figure_type=['metagene', 'barplot'], - show_replicate=['y', 'n', 'Y', 'N']) + show_replicate=['y', 'n', 'Y', 'N'], norm_bin0=['y', 'n', 'Y', 'N']) def launcher(design: str, bw_folder: str, region_bed: str, region_name: str, nb_bin: int = 100, - figure_type: str = 'metagene', + figure_type: str = 'metagene', norm_bin0: str = 'y', show_replicate: str = 'y', environment: List[int] = (0, 0), border_names: List[str] = ('', ''), output: str = '.') -> None: @@ -37,6 +37,7 @@ def launcher(design: str, bw_folder: str, region_bed: str, :param nb_bin: The number of bins used to represents the regions of \ 'region_bed'. :param figure_type: The kind of representation wanted (barplot or metagene) + :param norm_bin0: True to normalize the figure by the 0bin false else. :param show_replicate: True to create a figure showing the replicate \ false else. :param environment: A list of two int. 
The first contains the number of \ @@ -51,9 +52,10 @@ def launcher(design: str, bw_folder: str, region_bed: str, f"be greater than 0 and the first value must be " f"greater than the second") show_rep = True if show_replicate.lower() == 'y' else False + norm_b0 = True if norm_bin0.lower() == 'y' else False create_figure(Path(design), Path(bw_folder), Path(region_bed), - region_name, nb_bin, figure_type, show_rep, environment, - border_names, Path(output)) + region_name, nb_bin, figure_type, norm_b0, show_rep, + environment, border_names, Path(output)) launcher() diff --git a/src/visu/figure_maker.py b/src/visu/figure_maker.py index 50848136f8c9479adb58909130aa574d7aa70d75..368a940d66ccb0bcf113ae54a4e9f5bebec8d147 100644 --- a/src/visu/figure_maker.py +++ b/src/visu/figure_maker.py @@ -215,7 +215,7 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int, def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, border_names: List[str], nb_bin: int, environment: List[int], region_name: str, - output: Path) -> None: + output: Path, norm_bin0: bool) -> None: """ Create a metagene figure on the region of interest. @@ -228,6 +228,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, the number of bin used to represent those surrounding regions. :param output: Folder where the figure will be created :param region_name: The region of interest + :param norm_bin0: True to normalize the figure by the 0bin false else. 
""" sns.set(context='poster', style='white') if show_replicate: @@ -252,15 +253,19 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, if environment[0] != 0: title += f"\nand in their surrounding regions of {environment[0]} nt" g.fig.suptitle(title) - g.savefig(output / f"metagene_{region_name}_{nb_bin}bin_" \ - f"{environment[0]}_nt-around-{environment[1]}-bin.pdf") + outfile_title = f"metagene_{region_name}_{nb_bin}bin_" \ + f"{environment[0]}_nt-around-{environment[1]}-bin" + if norm_bin0: + outfile_title += "_b0_norm" + outfile_title += ".pdf" + g.savefig(output / outfile_title) g.fig.clf() def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool, nb_bin: int, environment: List[int], region_name: str, - output: Path) -> None: + output: Path, norm_bin0: bool) -> None: """ Create a barplot figure on the region of interest. @@ -272,6 +277,7 @@ def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool, the number of bin used to represent those surrounding regions. :param output: Folder where the figure will be created :param region_name: The region of interest + :param norm_bin0: True to normalize the figure by the 0bin false else. """ sns.set(context='poster', style='white') if show_replicate: @@ -287,14 +293,35 @@ def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool, plt.subplots_adjust(top=0.9) title = f"Average coverage in region '{region_name}'" g.fig.suptitle(title) - g.savefig(output / f"barplot_{region_name}_{nb_bin}bin_" \ - f"{environment[0]}_nt-around-{environment[1]}-bin.pdf") + outfile_title = f"barplot_{region_name}_{nb_bin}bin_" \ + f"{environment[0]}_nt-around-{environment[1]}-bin" + if norm_bin0: + outfile_title += "_b0_norm" + outfile_title += ".pdf" + g.savefig(output / outfile_title) g.fig.clf() +def bin0_normalisation(df: pd.DataFrame) -> pd.DataFrame: + """ + Normalise the bins coverage by the average overage on bin 0. 
+ + :param df: The dataframe of coverage + :return: the dataframe with normalised coverage + """ + df_val = df.loc[df['bin'] == 0, + ['coverage', 'condition', 'replicate']]\ + .groupby(['condition', 'replicate']).mean().reset_index() + df_val.rename({"coverage": "coef"}, axis=1, inplace=True) + df = df.merge(df_val, how="left", on=['condition', 'replicate']) + df['coverage'] = df['coverage'] / df['coef'] + df.drop('coef', axis=1, inplace=True) + return df + + def create_figure(design: Path, bw_folder: Path, region_bed: Path, region_name: str, nb_bin: int = 100, - figure_type: str = 'metagene', + figure_type: str = 'metagene', norm_bin0: bool = False, show_replicate: bool = True, environment: List[int] = (0, 0), border_names: List[str] = ('', ''), output: Path = Path('.')) -> None: @@ -312,6 +339,7 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path, :param nb_bin: The number of bins used to represents the regions of \ 'region_bed'. :param figure_type: The kind of representation wanted (barplot or metagene) + :param norm_bin0: True to normalize the figure by the 0bin false else. :param show_replicate: True to create a figure showing the replicate \ false else. :param environment: A list of two int.
The first contains the number of \ @@ -324,13 +352,18 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path, regions = load_bed(region_bed) region_bed_name = region_bed.name.replace('.bed', '') outfile = f'tmp_cov_table_{region_bed_name}_{nb_bin}bin_' \ - f'{environment[0]}_nt-around-{environment[1]}-bin.txt.gz' + f'{environment[0]}_nt-around-{environment[1]}-bin' + if norm_bin0: + outfile += '_bin0_norm' + outfile += '.txt.gz' cov_file = output / outfile if cov_file.is_file(): df_cov = pd.read_csv(cov_file, sep="\t", compression='gzip') else: df_cov = create_full_table(df_exp, regions, nb_bin, environment, bw_folder) + if norm_bin0: + df_cov = bin0_normalisation(df_cov) df_cov.to_csv(cov_file, sep="\t", index=False, compression='gzip') ordered_condition = [] for condition in df_exp['condition'].to_list(): @@ -340,16 +373,16 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path, region_name, ordered_condition) if figure_type == "metagene": figure_metagene(df_sum, show_replicate, border_names, nb_bin, - environment, region_name, output) + environment, region_name, output, norm_bin0) else: if 'location' in df_sum.columns: for cur_region in df_sum['location'].unique(): df_tmp = df_sum.loc[df_sum['location'] == cur_region, :] figure_barplot(df_tmp, show_replicate, nb_bin, environment, - cur_region, output) + cur_region, output, norm_bin0) else: figure_barplot(df_sum, show_replicate, nb_bin, environment, - region_name, output) + region_name, output, norm_bin0) if __name__ == "__main__":