From f0558847bc18b2c7a257ccef95b6c88cc36dea20 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Thu, 22 Oct 2020 17:16:17 +0200
Subject: [PATCH] add a parameter norm_bin0

---
 src/visu/__main__.py     | 10 +++++---
 src/visu/figure_maker.py | 55 ++++++++++++++++++++++++++++++++--------
 2 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/src/visu/__main__.py b/src/visu/__main__.py
index 49e4339..b93c4ab 100644
--- a/src/visu/__main__.py
+++ b/src/visu/__main__.py
@@ -16,10 +16,10 @@ from typing import List
 
 @lp.parse(design='file', region_bed='file',
           nb_bin="nb_bin > 5", figure_type=['metagene', 'barplot'],
-          show_replicate=['y', 'n', 'Y', 'N'])
+          show_replicate=['y', 'n', 'Y', 'N'], norm_bin0=['y', 'n', 'Y', 'N'])
 def launcher(design: str, bw_folder: str, region_bed: str,
              region_name: str, nb_bin: int = 100,
-             figure_type: str = 'metagene',
+             figure_type: str = 'metagene', norm_bin0: str = 'y',
              show_replicate: str = 'y', environment: List[int] = (0, 0),
              border_names: List[str] = ('', ''),
              output: str = '.') -> None:
@@ -37,6 +37,7 @@ def launcher(design: str, bw_folder: str, region_bed: str,
     :param nb_bin: The number of bins used to represents the regions of \
     'region_bed'.
     :param figure_type: The kind of representation wanted (barplot or metagene)
+    :param norm_bin0: 'y' to normalize the figure by bin 0, 'n' otherwise.
     :param show_replicate: True to create a figure showing the replicate \
     false else.
     :param environment: A list of two int. The first contains the number of \
@@ -51,9 +52,10 @@ def launcher(design: str, bw_folder: str, region_bed: str,
                          f"be greater than 0 and the first value must be "
                          f"greater than the second")
     show_rep = True if show_replicate.lower() == 'y' else False
+    norm_b0 = True if norm_bin0.lower() == 'y' else False
     create_figure(Path(design), Path(bw_folder), Path(region_bed),
-                  region_name, nb_bin, figure_type, show_rep, environment,
-                  border_names, Path(output))
+                  region_name, nb_bin, figure_type, norm_b0, show_rep,
+                  environment, border_names, Path(output))
 
 
 launcher()
diff --git a/src/visu/figure_maker.py b/src/visu/figure_maker.py
index 5084813..368a940 100644
--- a/src/visu/figure_maker.py
+++ b/src/visu/figure_maker.py
@@ -215,7 +215,7 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
 def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
                     border_names: List[str], nb_bin: int,
                     environment: List[int], region_name: str,
-                    output: Path) -> None:
+                    output: Path, norm_bin0: bool) -> None:
     """
     Create a metagene figure on the region of interest.
 
@@ -228,6 +228,7 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
     the number of bin used to represent those surrounding regions.
     :param output: Folder where the figure will be created
     :param region_name: The region of interest
+    :param norm_bin0: True to normalize the figure by bin 0, False otherwise.
     """
     sns.set(context='poster', style='white')
     if show_replicate:
@@ -252,15 +253,19 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
     if environment[0] != 0:
         title += f"\nand in their surrounding regions of {environment[0]} nt"
     g.fig.suptitle(title)
-    g.savefig(output / f"metagene_{region_name}_{nb_bin}bin_" \
-              f"{environment[0]}_nt-around-{environment[1]}-bin.pdf")
+    outfile_title = f"metagene_{region_name}_{nb_bin}bin_" \
+        f"{environment[0]}_nt-around-{environment[1]}-bin"
+    if norm_bin0:
+        outfile_title += "_b0_norm"
+    outfile_title += ".pdf"
+    g.savefig(output / outfile_title)
     g.fig.clf()
 
 
 def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
                    nb_bin: int,
                    environment: List[int], region_name: str,
-                   output: Path) -> None:
+                   output: Path, norm_bin0: bool) -> None:
     """
     Create a barplot figure on the region of interest.
 
@@ -272,6 +277,7 @@ def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
     the number of bin used to represent those surrounding regions.
     :param output: Folder where the figure will be created
     :param region_name: The region of interest
+    :param norm_bin0: True to normalize the figure by bin 0, False otherwise.
     """
     sns.set(context='poster', style='white')
     if show_replicate:
@@ -287,14 +293,35 @@ def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
     plt.subplots_adjust(top=0.9)
     title = f"Average coverage in region '{region_name}'"
     g.fig.suptitle(title)
-    g.savefig(output / f"barplot_{region_name}_{nb_bin}bin_" \
-                 f"{environment[0]}_nt-around-{environment[1]}-bin.pdf")
+    outfile_title = f"barplot_{region_name}_{nb_bin}bin_" \
+        f"{environment[0]}_nt-around-{environment[1]}-bin"
+    if norm_bin0:
+        outfile_title += "_b0_norm"
+    outfile_title += ".pdf"
+    g.savefig(output / outfile_title)
     g.fig.clf()
 
 
+def bin0_normalisation(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Normalise the bins coverage by the average coverage on bin 0.
+
+    :param df: The dataframe of coverage
+    :return: the dataframe with normalised coverage
+    """
+    df_val = df.loc[df['bin'] == 0,
+                    ['coverage', 'condition', 'replicate']]\
+        .groupby(['condition', 'replicate']).mean().reset_index()
+    df_val.rename({"coverage": "coef"}, axis=1, inplace=True)
+    df = df.merge(df_val, how="left", on=['condition', 'replicate'])
+    df['coverage'] = df['coverage'] / df['coef']
+    df.drop('coef', axis=1, inplace=True)
+    return df
+
+
 def create_figure(design: Path, bw_folder: Path, region_bed: Path,
                   region_name: str, nb_bin: int = 100,
-                  figure_type: str = 'metagene',
+                  figure_type: str = 'metagene', norm_bin0: bool = False,
                   show_replicate: bool = True, environment: List[int] = (0, 0),
                   border_names: List[str] = ('', ''),
                   output: Path = Path('.')) -> None:
@@ -312,6 +339,7 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
     :param nb_bin: The number of bins used to represents the regions of \
     'region_bed'.
     :param figure_type: The kind of representation wanted (barplot or metagene)
+    :param norm_bin0: True to normalize the figure by bin 0, False otherwise.
     :param show_replicate: True to create a figure showing the replicate \
     false else.
     :param environment: A list of two int. The first contains the number of \
@@ -324,13 +352,18 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
     regions = load_bed(region_bed)
     region_bed_name = region_bed.name.replace('.bed', '')
     outfile = f'tmp_cov_table_{region_bed_name}_{nb_bin}bin_' \
-              f'{environment[0]}_nt-around-{environment[1]}-bin.txt.gz'
+              f'{environment[0]}_nt-around-{environment[1]}-bin'
+    if norm_bin0:
+        outfile += '_bin0_norm'
+    outfile += '.txt.gz'
     cov_file = output / outfile
     if cov_file.is_file():
         df_cov = pd.read_csv(cov_file, sep="\t", compression='gzip')
     else:
         df_cov = create_full_table(df_exp, regions, nb_bin, environment,
                                    bw_folder)
+        if norm_bin0:
+            df_cov = bin0_normalisation(df_cov)
         df_cov.to_csv(cov_file, sep="\t", index=False, compression='gzip')
     ordered_condition = []
     for condition in df_exp['condition'].to_list():
@@ -340,16 +373,16 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
                                region_name, ordered_condition)
     if figure_type == "metagene":
         figure_metagene(df_sum, show_replicate, border_names, nb_bin,
-                        environment, region_name, output)
+                        environment, region_name, output, norm_bin0)
     else:
         if 'location' in df_sum.columns:
             for cur_region in df_sum['location'].unique():
                 df_tmp = df_sum.loc[df_sum['location'] == cur_region, :]
                 figure_barplot(df_tmp, show_replicate, nb_bin, environment,
-                               cur_region, output)
+                               cur_region, output, norm_bin0)
         else:
             figure_barplot(df_sum, show_replicate, nb_bin, environment,
-                           region_name, output)
+                           region_name, output, norm_bin0)
 
 
 if __name__ == "__main__":
-- 
GitLab