diff --git a/src/visu/__main__.py b/src/visu/__main__.py index b93c4abe464ace5bbb02c00ffad7cc97d1fcac1c..b08467089953ccfcd8110ac7c7724c2c28f92a4d 100644 --- a/src/visu/__main__.py +++ b/src/visu/__main__.py @@ -7,7 +7,6 @@ Description: Create a figure showing the ChIP-Seq coverage of particular \ gene regions from ChIP-seq experiment. """ - from .figure_maker import create_figure import lazyparser as lp from pathlib import Path @@ -16,10 +15,10 @@ from typing import List @lp.parse(design='file', region_bed='file', nb_bin="nb_bin > 5", figure_type=['metagene', 'barplot'], - show_replicate=['y', 'n', 'Y', 'N'], norm_bin0=['y', 'n', 'Y', 'N']) + show_replicate=['y', 'n', 'Y', 'N']) def launcher(design: str, bw_folder: str, region_bed: str, region_name: str, nb_bin: int = 100, - figure_type: str = 'metagene', norm_bin0: str = 'y', + figure_type: str = 'metagene', norm: str = 'None', show_replicate: str = 'y', environment: List[int] = (0, 0), border_names: List[str] = ('', ''), output: str = '.') -> None: @@ -37,7 +36,9 @@ def launcher(design: str, bw_folder: str, region_bed: str, :param nb_bin: The number of bins used to represents the regions of \ 'region_bed'. :param figure_type: The kind of representation wanted (barplot or metagene) - :param norm_bin0: True to normalize the figure by the 0bin false else. + :param norm: A number corresponding to a bin, or a file with \ + the normalisation value to apply for each replicate. 'None' for no \ + normalisation (default 'None') :param show_replicate: True to create a figure showing the replicate \ false else. :param environment: A list of two int. 
The first contains the number of \ @@ -47,14 +48,18 @@ def launcher(design: str, bw_folder: str, region_bed: str, :param output: Folder where the results will be created """ if environment[0] < 0 or environment[1] < 0 or \ - environment[0] < environment[1]: + environment[0] < environment[1]: raise ValueError(f"The two values given with --environment must " f"be greater than 0 and the first value must be " f"greater than the second") show_rep = True if show_replicate.lower() == 'y' else False - norm_b0 = True if norm_bin0.lower() == 'y' else False + norm = int(norm) if norm.isdigit() else None if norm == 'None' else norm + if isinstance(norm, str): + norm = Path(norm) + if not norm.is_file(): + raise FileNotFoundError(f"The file {norm} was not found") create_figure(Path(design), Path(bw_folder), Path(region_bed), - region_name, nb_bin, figure_type, norm_b0, show_rep, + region_name, nb_bin, figure_type, norm, show_rep, environment, border_names, Path(output)) diff --git a/src/visu/figure_maker.py b/src/visu/figure_maker.py index 097880288b709c5ce9e6084350e2ab08f5eeb91f..0f6615f7cad287a4117656ab5dc29e543b220887 100644 --- a/src/visu/figure_maker.py +++ b/src/visu/figure_maker.py @@ -214,7 +214,7 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int, def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, border_names: List[str], nb_bin: int, environment: List[int], region_name: str, - output: Path, norm_bin0: bool) -> None: + output: Path, norm: Union[int, Path]) -> None: """ Create a metagene figure on the region of interest. @@ -227,7 +227,9 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, the number of bin used to represent those surrounding regions. :param output: Folder where the figure will be created :param region_name: The region of interest - :param norm_bin0: True to normalize the figure by the 0bin false else. 
+ :param norm: an integer corresponding to the bin used to normalise \ + the samples or a file containing the normalisations to apply to \ + each sample """ sns.set(context='poster', style='white') if show_replicate: @@ -254,8 +256,10 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, g.fig.suptitle(title) outfile_title = f"metagene_{region_name}_{nb_bin}bin_" \ f"{environment[0]}_nt-around-{environment[1]}-bin" - if norm_bin0: - outfile_title += "_b0_norm" + if isinstance(norm, int): + outfile_title += f"_b{norm}_norm" + elif isinstance(norm, Path): + outfile_title += f"_file_norm" outfile_title += ".pdf" g.savefig(output / outfile_title) g.fig.clf() @@ -264,7 +268,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool, nb_bin: int, environment: List[int], region_name: str, - output: Path, norm_bin0: bool) -> None: + output: Path, norm: Union[int, Path]) -> None: """ Create a barplot figure on the region of interest. @@ -276,7 +280,9 @@ def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool, the number of bin used to represent those surrounding regions. :param output: Folder where the figure will be created :param region_name: The region of interest - :param norm_bin0: True to normalize the figure by the 0bin false else. 
+ :param norm: an integer corresponding to the bin used to normalise \ + the samples or a file containing the normalisations to apply to \ + each sample """ sns.set(context='poster', style='white') if show_replicate: @@ -294,24 +300,42 @@ def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool, g.fig.suptitle(title) outfile_title = f"barplot_{region_name}_{nb_bin}bin_" \ f"{environment[0]}_nt-around-{environment[1]}-bin" - if norm_bin0: - outfile_title += "_b0_norm" + if isinstance(norm, int): + outfile_title += f"_b{norm}_norm" + elif isinstance(norm, Path): + outfile_title += f"_file_norm" outfile_title += ".pdf" g.savefig(output / outfile_title) g.fig.clf() -def bin0_normalisation(df: pd.DataFrame) -> pd.DataFrame: +def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path], + outfile: Path) -> pd.DataFrame: """ - Normalise the bins coverage by the average overage on bin 0. + Normalise the bins coverage by the average coverage on a particular bin \ + or by a value given in a particular file. :param df: he dataframe of coverage + :param norm: The bin used to normalise the sample or a file containing \ + the value used to normalise the samples. 
+ :param outfile: The table containing coverage values :return: the dataframe with normalised coverage """ - df_val = df.loc[df['bin'] == 0, - ['coverage', 'condition', 'replicate']] \ - .groupby(['condition', 'replicate']).mean().reset_index() - df_val.rename({"coverage": "coef"}, axis=1, inplace=True) + if isinstance(norm, int): + if norm not in list(df['bin'].unique()): + raise ValueError(f"the bin {norm} was not found in the coverage " + f"dataframe.") + df_val = df.loc[df['bin'] == norm, + ['coverage', 'condition', 'replicate']] \ + .groupby(['condition', 'replicate']).mean().reset_index() + df_val.rename({"coverage": "coef"}, axis=1, inplace=True) + noutfile = outfile.parent / 'coef_table' / \ + (outfile.name.replace(".txt.gz", "") + + f".txt") + noutfile.parent.mkdir(exist_ok=True, parents=True) + df_val.to_csv(noutfile, sep="\t", index=False) + else: + df_val = pd.read_csv(norm, sep="\t") df = df.merge(df_val, how="left", on=['condition', 'replicate']) df['coverage'] = df['coverage'] / df['coef'] df.drop('coef', axis=1, inplace=True) @@ -320,7 +344,8 @@ def bin0_normalisation(df: pd.DataFrame) -> pd.DataFrame: def create_figure(design: Path, bw_folder: Path, region_bed: Path, region_name: str, nb_bin: int = 100, - figure_type: str = 'metagene', norm_bin0: bool = False, + figure_type: str = 'metagene', + norm: Union[int, Path, None] = None, show_replicate: bool = True, environment: List[int] = (0, 0), border_names: List[str] = ('', ''), output: Path = Path('.')) -> None: @@ -338,7 +363,9 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path, :param nb_bin: The number of bins used to represents the regions of \ 'region_bed'. :param figure_type: The kind of representation wanted (barplot or metagene) - :param norm_bin0: True to normalize the figure by the 0bin false else. 
+ :param norm: an integer corresponding to the bin used to normalise \ + the samples or a file containing the normalisations to apply to \ + each sample (None for no normalisation) :param show_replicate: True to create a figure showing the replicate \ false else. :param environment: A list of two int. The first contains the number of \ @@ -352,8 +379,10 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path, region_bed_name = region_bed.name.replace('.bed', '') outfile = f'tmp_cov_table_{design.name}_{region_bed_name}_{nb_bin}bin_' \ f'{environment[0]}_nt-around-{environment[1]}-bin' - if norm_bin0: - outfile += '_bin0_norm' + if isinstance(norm, int): + outfile += f'_bin{norm}_norm' + elif isinstance(norm, Path): + outfile += f'_file_norm' outfile += '.txt.gz' cov_file = output / outfile if cov_file.is_file(): @@ -361,8 +390,8 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path, else: df_cov = create_full_table(df_exp, regions, nb_bin, environment, bw_folder) - if norm_bin0: - df_cov = bin0_normalisation(df_cov) + if norm is not None: + df_cov = bin_normalisation(df_cov, norm, cov_file) df_cov.to_csv(cov_file, sep="\t", index=False, compression='gzip') ordered_condition = [] for condition in df_exp['condition'].to_list(): @@ -372,16 +401,16 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path, region_name, ordered_condition) if figure_type == "metagene": figure_metagene(df_sum, show_replicate, border_names, nb_bin, - environment, region_name, output, norm_bin0) + environment, region_name, output, norm) else: if 'location' in df_sum.columns: for cur_region in df_sum['location'].unique(): df_tmp = df_sum.loc[df_sum['location'] == cur_region, :] figure_barplot(df_tmp, show_replicate, nb_bin, environment, - cur_region, output, norm_bin0) + cur_region, output, norm) else: figure_barplot(df_sum, show_replicate, nb_bin, environment, - region_name, output, norm_bin0) + region_name, output, norm) if __name__ == "__main__":