From ca7c85c719cb24d5dcc569c9841d47bb0c7b184d Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Wed, 4 Nov 2020 10:37:17 +0100
Subject: [PATCH] src/visu/__main__.py src/visu/figure_maker.py: modification
 to create a figure with two different lists of exons

---
 src/visu/__main__.py     |  16 +--
 src/visu/figure_maker.py | 206 +++++++++++++++++++++++++++------------
 2 files changed, 150 insertions(+), 72 deletions(-)

diff --git a/src/visu/__main__.py b/src/visu/__main__.py
index b084670..bc53640 100644
--- a/src/visu/__main__.py
+++ b/src/visu/__main__.py
@@ -13,11 +13,11 @@ from pathlib import Path
 from typing import List
 
 
-@lp.parse(design='file', region_bed='file',
+@lp.parse(design='file', region_beds='file',
           nb_bin="nb_bin > 5", figure_type=['metagene', 'barplot'],
           show_replicate=['y', 'n', 'Y', 'N'])
-def launcher(design: str, bw_folder: str, region_bed: str,
-             region_name: str, nb_bin: int = 100,
+def launcher(design: str, bw_folder: str, region_beds: List[str],
+             region_names: List[str], nb_bin: int = 100,
              figure_type: str = 'metagene', norm: str = 'None',
              show_replicate: str = 'y', environment: List[int] = (0, 0),
              border_names: List[str] = ('', ''),
@@ -31,8 +31,9 @@ def launcher(design: str, bw_folder: str, region_bed: str,
     the last one contains the replicate of the condition.
     :param bw_folder: The folder containing the bigwig file mentioned in \
     the first column of the 'design' table.
-    :param region_bed: A bed file containing the regions to visualise
-    :param region_name: The name of the region analysed
+    :param region_beds: A list of bed files containing the regions to visualise
+    :param region_names: A list of names identifying regions inside \
+    the given beds.
     :param nb_bin: The number of bins used to represents the regions of \
     'region_bed'.
     :param figure_type: The kind of representation wanted (barplot or metagene)
@@ -58,8 +59,9 @@ def launcher(design: str, bw_folder: str, region_bed: str,
         norm = Path(norm)
         if not norm.is_file():
             raise FileNotFoundError(f"The file {norm} was not found")
-    create_figure(Path(design), Path(bw_folder), Path(region_bed),
-                  region_name, nb_bin, figure_type, norm, show_rep,
+    reg_beds = [Path(p) for p in region_beds]
+    create_figure(Path(design), Path(bw_folder), reg_beds,
+                  region_names, nb_bin, figure_type, norm, show_rep,
                   environment, border_names, Path(output))
 
 
diff --git a/src/visu/figure_maker.py b/src/visu/figure_maker.py
index 0f6615f..2da518f 100644
--- a/src/visu/figure_maker.py
+++ b/src/visu/figure_maker.py
@@ -7,7 +7,7 @@ Description:
 """
 
 from pathlib import Path
-from typing import List, Union, Any
+from typing import List, Union, Any, Tuple
 from doctest import testmod
 from ..bed_handler.config import TestConfig
 import pandas as pd
@@ -17,15 +17,16 @@ import matplotlib.pyplot as plt
 from tqdm import tqdm
 
 
-def load_bed(bed: Path) -> List[List[Union[int, str]]]:
+def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]:
     """
     Read a bed file and return the lines within it.
 
     :param bed: A bed file containing the regions of interest
+    :param bed_name: The name of the regions of interest inside the bed file
     :return:The list of feature inside the bed
 
-    >>> load_bed(TestConfig.gene_bed)[0]
-    ['18', 28645943, 28682388, 1, 'DSC2', '-']
+    >>> load_bed(TestConfig.gene_bed, 'gene_test')[0]
+    ['18', 28645943, 28682388, 1, 'DSC2', '-', 'gene_test']
     """
     list_regions = []
     with bed.open('r') as inbed:
@@ -33,10 +34,34 @@ def load_bed(bed: Path) -> List[List[Union[int, str]]]:
             if not line.startswith("#"):
                 cline = line.replace("\n", "").split("\t")
                 list_regions.append([cline[0], int(cline[1]), int(cline[2]),
-                                     int(cline[3]), cline[4], cline[5]])
+                                     int(cline[3]), cline[4], cline[5],
+                                     bed_name])
     return list_regions
 
 
+def load_beds(beds: List[Path], bed_names: List[str]
+              ) -> List[List[Union[int, str]]]:
+    """
+    Read several bed files and return the lines within them.
+
+    :param beds: A list of bed files containing the regions of interest
+    :param bed_names: A list of names identifying regions inside the given \
+    beds.
+    :return: The list of features inside the bed files
+
+    >>> load_beds([TestConfig.gene_bed, TestConfig.gene_bed],
+    ... ['gene1', 'gene2'])[0]
+    ['18', 28645943, 28682388, 1, 'DSC2', '-', 'gene1']
+    >>> load_beds([TestConfig.gene_bed, TestConfig.gene_bed],
+    ... ['gene1', 'gene2'])[-1]
+    ['13', 45967450, 45992516, 9, 'SLC25A30', '-', 'gene2']
+    """
+    regions = []
+    for i in range(len(beds)):
+        regions += load_bed(beds[i], bed_names[i])
+    return regions
+
+
 def inspect_bigwig_regions(bw: Any, region: List,
                            replicate: str, nb_bin: int, resize: List[int],
                            condition_name: str,
@@ -54,30 +79,30 @@ def inspect_bigwig_regions(bw: Any, region: List,
     :return: a table with the coverage of this region
 
     >>> my_bw = pbw.open(str(TestConfig.small_bw))
-    >>> region = ['1', 10, 25, 1, 'Test', '+']
-    >>> inspect_bigwig_regions(my_bw, region, 'R1', 5, [4, 2], 'cond1')
-        coverage  bin condition replicate
-    0   0.000000   -2     cond1        R1
-    1   0.500000   -1     cond1        R1
-    2  75.000000    0     cond1        R1
-    3  20.000000    1     cond1        R1
-    4  10.000000    2     cond1        R1
-    5   4.666667    3     cond1        R1
-    6   2.000000    4     cond1        R1
-    7   1.000000    5     cond1        R1
-    8   0.500000    6     cond1        R1
-    >>> region = ['1', 110, 133, 1, 'Test', '-']
-    >>> inspect_bigwig_regions(my_bw, region, 'R1', 5, [4, 2], 'cond1')
-       coverage  bin condition replicate
-    0      0.00   -2     cond1        R1
-    1     12.50   -1     cond1        R1
-    2     42.00    0     cond1        R1
-    3      8.00    1     cond1        R1
-    4      4.25    2     cond1        R1
-    5      2.00    3     cond1        R1
-    6      2.00    4     cond1        R1
-    7      1.00    5     cond1        R1
-    8      1.00    6     cond1        R1
+    >>> mregion = ['1', 10, 25, 1, 'Test', '+', 'exon']
+    >>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [4, 2], 'cond1')
+        coverage  bin condition replicate region
+    0   0.000000   -2     cond1        R1   exon
+    1   0.500000   -1     cond1        R1   exon
+    2  75.000000    0     cond1        R1   exon
+    3  20.000000    1     cond1        R1   exon
+    4  10.000000    2     cond1        R1   exon
+    5   4.666667    3     cond1        R1   exon
+    6   2.000000    4     cond1        R1   exon
+    7   1.000000    5     cond1        R1   exon
+    8   0.500000    6     cond1        R1   exon
+    >>> mregion = ['1', 110, 133, 1, 'Test', '-', 'exon2']
+    >>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [4, 2], 'cond1')
+       coverage  bin condition replicate region
+    0      0.00   -2     cond1        R1  exon2
+    1     12.50   -1     cond1        R1  exon2
+    2     42.00    0     cond1        R1  exon2
+    3      8.00    1     cond1        R1  exon2
+    4      4.25    2     cond1        R1  exon2
+    5      2.00    3     cond1        R1  exon2
+    6      2.00    4     cond1        R1  exon2
+    7      1.00    5     cond1        R1  exon2
+    8      1.00    6     cond1        R1  exon2
     """
     val = bw.stats(region[0], region[1], region[2], nBins=nb_bin, exact=True)
     bins = list(range(len(val)))
@@ -110,6 +135,7 @@ def inspect_bigwig_regions(bw: Any, region: List,
     df = pd.DataFrame(dic)
     df['condition'] = [condition_name] * df.shape[0]
     df['replicate'] = [replicate] * df.shape[0]
+    df['region'] = [region[6]] * df.shape[0]
     return df
 
 
@@ -165,10 +191,31 @@ def create_full_table(df_exp: pd.DataFrame, regions: List[List],
     return pd.concat(list_df, axis=0, ignore_index=True)
 
 
+def merge_condition_region_col(df: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
+    """
+    Collapse the 'condition' and 'region' columns into one grouping column.
+
+    :param df: A dataframe of mean coverage for each bin.
+    :return: The dataframe with those columns merged and the merged column name
+    """
+    if len(df['region'].unique()) == 1:
+        condition_col = 'condition'
+        df.drop('region', axis=1, inplace=True)
+    elif len(df['condition'].unique()) == 1:
+        condition_col = 'region'
+        df.drop('condition', axis=1, inplace=True)
+    else:
+        condition_col = 'condition-region'
+        df[condition_col] = df['condition'] + "-" + df['region']
+        df.drop(['condition', 'region'], axis=1, inplace=True)
+    return df, condition_col
+
+
 def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
                       environment: List[int],
-                      region_name: str, order_condition: List[str]
-                      ) -> pd.DataFrame:
+                      region_name: str, order_condition: List[str],
+                      order_bed_name: List[str],
+                      ) -> Tuple[pd.DataFrame, str]:
     """
     summarize the data in df_cov.
 
@@ -180,21 +227,24 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
     the number of bin used to represent those surrounding regions.
     :param region_name: the name of the region analysed
     :param order_condition: The order of conditions
-    :return: The summarised dataframe
+    :param order_bed_name: The order of bed names to respect
+    :return: The summarised dataframe and the name of the condition column
     """
-    df_sum = df_cov.groupby(['bin', 'condition', 'replicate']).mean() \
+    df_sum = df_cov.groupby(['bin', 'condition', 'region', 'replicate']) \
+        .mean() \
         .reset_index()
     if figure_type == "metagene":
-        return df_sum
+        df_sum, condition_col = merge_condition_region_col(df_sum)
+        return df_sum, condition_col
     if environment[0] != 0:
         df_sum['location'] = df_cov['bin'].apply(
             lambda x: f"before_{region_name}" if x < 0 else
             f"after_{region_name}" if x >= nb_bin else region_name)
     df_sum.drop('bin', axis=1, inplace=True)
     if environment[0] != 0:
-        col_merge = ['condition', 'replicate', 'location']
+        col_merge = ['condition', 'region', 'replicate', 'location']
     else:
-        col_merge = ['condition', 'replicate']
+        col_merge = ['condition', 'region', 'replicate']
     df_sum = df_sum.groupby(col_merge).mean().reset_index()
     if 'location' in df_sum.columns:
         df_sum['location'] = pd.Categorical(
@@ -206,15 +256,21 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
             df_sum['condition'], ordered=True,
             categories=order_condition
         )
-        df_sum.sort_values(['condition', 'location'], ascending=True,
+        df_sum['region'] = pd.Categorical(
+            df_sum['region'], ordered=True,
+            categories=order_bed_name
+        )
+        df_sum.sort_values(['condition', 'region', 'location'], ascending=True,
                            inplace=True)
-    return df_sum
+    df_sum, condition_col = merge_condition_region_col(df_sum)
+    return df_sum, condition_col
 
 
 def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
                     border_names: List[str], nb_bin: int,
-                    environment: List[int], region_name: str,
-                    output: Path, norm: Union[int, Path]) -> None:
+                    environment: List[int], bed_name: str,
+                    output: Path, norm: Union[int, Path],
+                    condition_col: str) -> None:
     """
     Create a metagene figure on the region of interest.
 
@@ -226,18 +282,19 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
     nucleotide to represent around the region of interest and the second,
     the number of bin used to represent those surrounding regions.
     :param output: Folder where the figure will be created
-    :param region_name: The region of interest
+    :param bed_name: The name of considered regions
     :param norm: an integer corresponding to the bin used to normalise \
     the samples or a file containing the normalisations to apply to \
     each samples
+    :param condition_col: The name of the condition column
     """
     sns.set(context='poster', style='white')
     if show_replicate:
-        g = sns.relplot('bin', 'coverage', hue='condition', data=df_sum,
+        g = sns.relplot('bin', 'coverage', hue=condition_col, data=df_sum,
                         kind='line', style='replicate', ci=None,
                         height=12, aspect=1.7)
     else:
-        g = sns.relplot('bin', 'coverage', hue='condition', data=df_sum,
+        g = sns.relplot('bin', 'coverage', hue=condition_col, data=df_sum,
                         kind='line', ci="sd", height=12, aspect=1.7)
     y_val = g.ax.get_ylim()[1] * 0.99
     if border_names[0] != '':
@@ -250,11 +307,13 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
     g.set_xlabels('Bins')
     g.set_ylabels('Coverage')
     plt.subplots_adjust(top=0.9)
-    title = f"Average coverage in region '{region_name}'"
+    tmp_bed_name = bed_name.replace("--", ", ")
+    title = f"Average coverage in region '{tmp_bed_name}'"
     if environment[0] != 0:
         title += f"\nand in their surrounding regions of {environment[0]} nt"
     g.fig.suptitle(title)
-    outfile_title = f"metagene_{region_name}_{nb_bin}bin_" \
+    tmp_bed_name = bed_name.replace("--", "-")
+    outfile_title = f"metagene_{tmp_bed_name}_{nb_bin}bin_" \
                     f"{environment[0]}_nt-around-{environment[1]}-bin"
     if isinstance(norm, int):
         outfile_title += f"_b{norm}_norm"
@@ -268,7 +327,8 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
 def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
                    nb_bin: int,
                    environment: List[int], region_name: str,
-                   output: Path, norm: Union[int, Path]) -> None:
+                   output: Path, norm: Union[int, Path],
+                   condition_col: str) -> None:
     """
     Create a barplot figure on the region of interest.
 
@@ -283,22 +343,25 @@ def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
     :param norm: an integer corresponding to the bin used to normalise \
     the samples or a file containing the normalisations to apply to \
     each samples
+    :param condition_col: The name of the condition column
     """
     sns.set(context='poster', style='white')
     if show_replicate:
-        g = sns.catplot(x="condition", y="coverage", hue="replicate",
+        g = sns.catplot(x=condition_col, y="coverage", hue="replicate",
                         kind="bar", data=df_sum, height=12, aspect=1.77,
                         ci=None)
     else:
-        g = sns.catplot(x="condition", y="coverage",
+        g = sns.catplot(x=condition_col, y="coverage",
                         kind="bar", data=df_sum, height=12, aspect=1.77,
                         ci='sd')
     g.set_xlabels('')
     g.set_ylabels('Coverage')
     plt.subplots_adjust(top=0.9)
-    title = f"Average coverage in region '{region_name}'"
+    rgt = region_name.replace('--', ', ')
+    title = f"Average coverage in region '{rgt}'"
     g.fig.suptitle(title)
-    outfile_title = f"barplot_{region_name}_{nb_bin}bin_" \
+    rgt = region_name.replace('--', '-')
+    outfile_title = f"barplot_{rgt}_{nb_bin}bin_" \
                     f"{environment[0]}_nt-around-{environment[1]}-bin"
     if isinstance(norm, int):
         outfile_title += f"_b{norm}_norm"
@@ -326,8 +389,8 @@ def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path],
             raise ValueError(f"the bin {norm} was not found in the coverage "
                              f"dataframe.")
         df_val = df.loc[df['bin'] == norm,
-                        ['coverage', 'condition', 'replicate']] \
-            .groupby(['condition', 'replicate']).mean().reset_index()
+                        ['coverage', 'condition', 'region', 'replicate']] \
+            .groupby(['condition', 'region', 'replicate']).mean().reset_index()
         df_val.rename({"coverage": "coef"}, axis=1, inplace=True)
         noutfile = outfile.parent / 'coef_table' / \
                    (outfile.name.replace(".txt.gz", "") +
@@ -336,14 +399,19 @@ def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path],
         df_val.to_csv(noutfile, sep="\t", index=False)
     else:
         df_val = pd.read_csv(norm, sep="\t")
-    df = df.merge(df_val, how="left", on=['condition', 'replicate'])
+    if len(df_val['region'].unique()) > 1:
+        df = df.merge(df_val, how="left", on=['condition', 'region',
+                                              'replicate'])
+    else:
+        df_val.drop('region', axis=1, inplace=True)
+        df = df.merge(df_val, how="left", on=['condition', 'replicate'])
     df['coverage'] = df['coverage'] / df['coef']
     df.drop('coef', axis=1, inplace=True)
     return df
 
 
-def create_figure(design: Path, bw_folder: Path, region_bed: Path,
-                  region_name: str, nb_bin: int = 100,
+def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
+                  bed_names: List[str], nb_bin: int = 100,
                   figure_type: str = 'metagene',
                   norm: Union[int, Path, None] = None,
                   show_replicate: bool = True, environment: List[int] = (0, 0),
@@ -358,8 +426,9 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
     the last one contains the replicate of the condition.
     :param bw_folder: The folder containing the bigwig file mentioned in \
     the first column of the 'design' table.
-    :param region_bed: A bed file containing the regions to visualise
-    :param region_name: The name of the region analysed
+    :param region_beds: A list of bed files containing the regions to visualise
+    :param bed_names: A list of names identifying regions inside the given \
+    beds.
     :param nb_bin: The number of bins used to represents the regions of \
     'region_bed'.
     :param figure_type: The kind of representation wanted (barplot or metagene)
@@ -374,10 +443,15 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
     :param border_names: The name of the borders
     :param output: Folder where the results will be created
     """
+    if len(region_beds) != len(bed_names):
+        raise IndexError("Parameter region_beds and bed_names should "
+                         "have the same length")
     df_exp = pd.read_csv(design, sep="\t")
-    regions = load_bed(region_bed)
-    region_bed_name = region_bed.name.replace('.bed', '')
-    outfile = f'tmp_cov_table_{design.name}_{region_bed_name}_{nb_bin}bin_' \
+    regions = load_beds(region_beds, bed_names)
+    region_bed_name = "-".join([b.name.replace('.bed', '')
+                                for b in region_beds])
+    outfile = f'tmp_cov_table_{design.name.replace(".txt", "")}' \
+              f'_{region_bed_name}_{nb_bin}bin_' \
               f'{environment[0]}_nt-around-{environment[1]}-bin'
     if isinstance(norm, int):
         outfile += f'_bin{norm}_norm'
@@ -397,20 +471,22 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
     for condition in df_exp['condition'].to_list():
         if condition not in ordered_condition:
             ordered_condition.append(condition)
-    df_sum = create_df_summary(df_cov, figure_type, nb_bin, environment,
-                               region_name, ordered_condition)
+    region_kind = "--".join(bed_names)
+    df_sum, cond_col = create_df_summary(df_cov, figure_type, nb_bin,
+                                         environment, region_kind,
+                                         ordered_condition, bed_names)
     if figure_type == "metagene":
         figure_metagene(df_sum, show_replicate, border_names, nb_bin,
-                        environment, region_name, output, norm)
+                        environment, region_kind, output, norm, cond_col)
     else:
         if 'location' in df_sum.columns:
             for cur_region in df_sum['location'].unique():
                 df_tmp = df_sum.loc[df_sum['location'] == cur_region, :]
                 figure_barplot(df_tmp, show_replicate, nb_bin, environment,
-                               cur_region, output, norm)
+                               cur_region, output, norm, cond_col)
         else:
             figure_barplot(df_sum, show_replicate, nb_bin, environment,
-                           region_name, output, norm)
+                           region_kind, output, norm, cond_col)
 
 
 if __name__ == "__main__":
-- 
GitLab