From 8344a444ddee28e920c1dc9cacdf1aed16c64094 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Mon, 26 Oct 2020 11:39:42 +0100
Subject: [PATCH]  src/visu/figure_maker.py src/visu/__main__.py: norm_bin0 ->
 norm parameter that can either be a file or a bin number or None

---
 src/visu/__main__.py     | 19 ++++++----
 src/visu/figure_maker.py | 75 ++++++++++++++++++++++++++++------------
 2 files changed, 64 insertions(+), 30 deletions(-)

diff --git a/src/visu/__main__.py b/src/visu/__main__.py
index b93c4ab..b084670 100644
--- a/src/visu/__main__.py
+++ b/src/visu/__main__.py
@@ -7,7 +7,6 @@ Description:  Create a figure showing the ChIP-Seq coverage of particular \
 gene regions from ChIP-seq experiment.
 """
 
-
 from .figure_maker import create_figure
 import lazyparser as lp
 from pathlib import Path
@@ -16,10 +15,10 @@ from typing import List
 
 @lp.parse(design='file', region_bed='file',
           nb_bin="nb_bin > 5", figure_type=['metagene', 'barplot'],
-          show_replicate=['y', 'n', 'Y', 'N'], norm_bin0=['y', 'n', 'Y', 'N'])
+          show_replicate=['y', 'n', 'Y', 'N'])
 def launcher(design: str, bw_folder: str, region_bed: str,
              region_name: str, nb_bin: int = 100,
-             figure_type: str = 'metagene', norm_bin0: str = 'y',
+             figure_type: str = 'metagene', norm: str = 'None',
              show_replicate: str = 'y', environment: List[int] = (0, 0),
              border_names: List[str] = ('', ''),
              output: str = '.') -> None:
@@ -37,7 +36,9 @@ def launcher(design: str, bw_folder: str, region_bed: str,
     :param nb_bin: The number of bins used to represents the regions of \
     'region_bed'.
     :param figure_type: The kind of representation wanted (barplot or metagene)
-    :param norm_bin0: True to normalize the figure by the 0bin false else.
+    :param norm: A number corresponding to a bin, or a file with \
+    the normalisation value to apply for each replicate. 'None' for no \
+    normalisation (default 'None')
     :param show_replicate: True to create a figure showing the replicate \
     false else.
     :param environment: A list of two int. The first contains the number of \
@@ -47,14 +48,18 @@ def launcher(design: str, bw_folder: str, region_bed: str,
     :param output: Folder where the results will be created
     """
     if environment[0] < 0 or environment[1] < 0 or \
-        environment[0] < environment[1]:
+            environment[0] < environment[1]:
         raise ValueError(f"The two values given with --environment must "
                          f"be greater than 0 and the first value must be "
                          f"greater than the second")
     show_rep = True if show_replicate.lower() == 'y' else False
-    norm_b0 = True if norm_bin0.lower() == 'y' else False
+    norm = int(norm) if norm.isdigit() else None if norm == 'None' else norm
+    if isinstance(norm, str):
+        norm = Path(norm)
+        if not norm.is_file():
+            raise FileNotFoundError(f"The file {norm} was not found")
     create_figure(Path(design), Path(bw_folder), Path(region_bed),
-                  region_name, nb_bin, figure_type, norm_b0, show_rep,
+                  region_name, nb_bin, figure_type, norm, show_rep,
                   environment, border_names, Path(output))
 
 
diff --git a/src/visu/figure_maker.py b/src/visu/figure_maker.py
index 0978802..0f6615f 100644
--- a/src/visu/figure_maker.py
+++ b/src/visu/figure_maker.py
@@ -214,7 +214,7 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
 def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
                     border_names: List[str], nb_bin: int,
                     environment: List[int], region_name: str,
-                    output: Path, norm_bin0: bool) -> None:
+                    output: Path, norm: Union[int, Path]) -> None:
     """
     Create a metagene figure on the region of interest.
 
@@ -227,7 +227,9 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
     the number of bin used to represent those surrounding regions.
     :param output: Folder where the figure will be created
     :param region_name: The region of interest
-    :param norm_bin0: True to normalize the figure by the 0bin false else.
+    :param norm: an integer corresponding to the bin used to normalise \
+    the samples or a file containing the normalisations to apply to \
+    each sample
     """
     sns.set(context='poster', style='white')
     if show_replicate:
@@ -254,8 +256,10 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
     g.fig.suptitle(title)
     outfile_title = f"metagene_{region_name}_{nb_bin}bin_" \
                     f"{environment[0]}_nt-around-{environment[1]}-bin"
-    if norm_bin0:
-        outfile_title += "_b0_norm"
+    if isinstance(norm, int):
+        outfile_title += f"_b{norm}_norm"
+    elif isinstance(norm, Path):
+        outfile_title += f"_file_norm"
     outfile_title += ".pdf"
     g.savefig(output / outfile_title)
     g.fig.clf()
@@ -264,7 +268,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
 def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
                    nb_bin: int,
                    environment: List[int], region_name: str,
-                   output: Path, norm_bin0: bool) -> None:
+                   output: Path, norm: Union[int, Path]) -> None:
     """
     Create a barplot figure on the region of interest.
 
@@ -276,7 +280,9 @@ def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
     the number of bin used to represent those surrounding regions.
     :param output: Folder where the figure will be created
     :param region_name: The region of interest
-    :param norm_bin0: True to normalize the figure by the 0bin false else.
+    :param norm: an integer corresponding to the bin used to normalise \
+    the samples or a file containing the normalisations to apply to \
+    each sample
     """
     sns.set(context='poster', style='white')
     if show_replicate:
@@ -294,24 +300,42 @@ def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
     g.fig.suptitle(title)
     outfile_title = f"barplot_{region_name}_{nb_bin}bin_" \
                     f"{environment[0]}_nt-around-{environment[1]}-bin"
-    if norm_bin0:
-        outfile_title += "_b0_norm"
+    if isinstance(norm, int):
+        outfile_title += f"_b{norm}_norm"
+    elif isinstance(norm, Path):
+        outfile_title += f"_file_norm"
     outfile_title += ".pdf"
     g.savefig(output / outfile_title)
     g.fig.clf()
 
 
-def bin0_normalisation(df: pd.DataFrame) -> pd.DataFrame:
+def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path],
+                      outfile: Path) -> pd.DataFrame:
     """
-    Normalise the bins coverage by the average overage on bin 0.
+    Normalise the bins coverage by the average coverage on a particular bin \
+    or by a value given in a particular file.
 
     :param df: he dataframe of coverage
+    :param norm: The bin used to normalise the sample or a file containing \
+    the value used to normalise the samples.
+    :param outfile: Coverage table path, used to name the coef_table file
     :return: the dataframe with normalised coverage
     """
-    df_val = df.loc[df['bin'] == 0,
-                    ['coverage', 'condition', 'replicate']] \
-        .groupby(['condition', 'replicate']).mean().reset_index()
-    df_val.rename({"coverage": "coef"}, axis=1, inplace=True)
+    if isinstance(norm, int):
+        if norm not in list(df['bin'].unique()):
+            raise ValueError(f"the bin {norm} was not found in the coverage "
+                             f"dataframe.")
+        df_val = df.loc[df['bin'] == norm,
+                        ['coverage', 'condition', 'replicate']] \
+            .groupby(['condition', 'replicate']).mean().reset_index()
+        df_val.rename({"coverage": "coef"}, axis=1, inplace=True)
+        noutfile = outfile.parent / 'coef_table' / \
+                   (outfile.name.replace(".txt.gz", "") +
+                    f".txt")
+        noutfile.parent.mkdir(exist_ok=True, parents=True)
+        df_val.to_csv(noutfile, sep="\t", index=False)
+    else:
+        df_val = pd.read_csv(norm, sep="\t")
     df = df.merge(df_val, how="left", on=['condition', 'replicate'])
     df['coverage'] = df['coverage'] / df['coef']
     df.drop('coef', axis=1, inplace=True)
@@ -320,7 +344,8 @@ def bin0_normalisation(df: pd.DataFrame) -> pd.DataFrame:
 
 def create_figure(design: Path, bw_folder: Path, region_bed: Path,
                   region_name: str, nb_bin: int = 100,
-                  figure_type: str = 'metagene', norm_bin0: bool = False,
+                  figure_type: str = 'metagene',
+                  norm: Union[int, Path, None] = None,
                   show_replicate: bool = True, environment: List[int] = (0, 0),
                   border_names: List[str] = ('', ''),
                   output: Path = Path('.')) -> None:
@@ -338,7 +363,9 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
     :param nb_bin: The number of bins used to represents the regions of \
     'region_bed'.
     :param figure_type: The kind of representation wanted (barplot or metagene)
-    :param norm_bin0: True to normalize the figure by the 0bin false else.
+    :param norm: an integer corresponding to the bin used to normalise \
+    the samples, a file containing the normalisations to apply to \
+    each sample, or None for no normalisation
     :param show_replicate: True to create a figure showing the replicate \
     false else.
     :param environment: A list of two int. The first contains the number of \
@@ -352,8 +379,10 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
     region_bed_name = region_bed.name.replace('.bed', '')
     outfile = f'tmp_cov_table_{design.name}_{region_bed_name}_{nb_bin}bin_' \
               f'{environment[0]}_nt-around-{environment[1]}-bin'
-    if norm_bin0:
-        outfile += '_bin0_norm'
+    if isinstance(norm, int):
+        outfile += f'_bin{norm}_norm'
+    elif isinstance(norm, Path):
+        outfile += f'_file_norm'
     outfile += '.txt.gz'
     cov_file = output / outfile
     if cov_file.is_file():
@@ -361,8 +390,8 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
     else:
         df_cov = create_full_table(df_exp, regions, nb_bin, environment,
                                    bw_folder)
-        if norm_bin0:
-            df_cov = bin0_normalisation(df_cov)
+        if norm is not None:
+            df_cov = bin_normalisation(df_cov, norm, cov_file)
         df_cov.to_csv(cov_file, sep="\t", index=False, compression='gzip')
     ordered_condition = []
     for condition in df_exp['condition'].to_list():
@@ -372,16 +401,16 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
                                region_name, ordered_condition)
     if figure_type == "metagene":
         figure_metagene(df_sum, show_replicate, border_names, nb_bin,
-                        environment, region_name, output, norm_bin0)
+                        environment, region_name, output, norm)
     else:
         if 'location' in df_sum.columns:
             for cur_region in df_sum['location'].unique():
                 df_tmp = df_sum.loc[df_sum['location'] == cur_region, :]
                 figure_barplot(df_tmp, show_replicate, nb_bin, environment,
-                               cur_region, output, norm_bin0)
+                               cur_region, output, norm)
         else:
             figure_barplot(df_sum, show_replicate, nb_bin, environment,
-                           region_name, output, norm_bin0)
+                           region_name, output, norm)
 
 
 if __name__ == "__main__":
-- 
GitLab