src/visu/__main__.py src/visu/figure_maker.py: modification to create a figure...

src/visu/__main__.py src/visu/figure_maker.py: modification to create a figure with two different lists of exons

src/visu/__main__.py src/visu/figure_maker.py: modification to create a figure...
ca7c85c7 · nfontrod · 759679b3 · ca7c85c7 · ca7c85c7
Commit ca7c85c7 authored 4 years ago by nfontrod
--- a/src/visu/__main__.py
+++ b/src/visu/__main__.py
@@ -13,11 +13,11 @@ from pathlib import Path
 from typing import List


-@lp.parse(design='file', region_bed='file',
+@lp.parse(design='file', region_beds='file',
          nb_bin="nb_bin > 5", figure_type=['metagene', 'barplot'],
          show_replicate=['y', 'n', 'Y', 'N'])
-def launcher(design: str, bw_folder: str, region_bed: str,
-             region_name: str, nb_bin: int = 100,
+def launcher(design: str, bw_folder: str, region_beds: List[str],
+             region_names: List[str], nb_bin: int = 100,
             figure_type: str = 'metagene', norm: str = 'None',
             show_replicate: str = 'y', environment: List[int] = (0, 0),
             border_names: List[str] = ('', ''),
@@ -31,8 +31,9 @@ def launcher(design: str, bw_folder: str, region_bed: str,
    the last one contains the replicate of the condition.
    :param bw_folder: The folder containing the bigwig file mentioned in \
    the first column of the 'design' table.
-    :param region_bed: A bed file containing the regions to visualise
-    :param region_name: The name of the region analysed
+    :param region_beds: A list of bed files containing the regions to visualise
+    :param region_names: A list of names identifying the regions inside \
+    the given beds.
    :param nb_bin: The number of bins used to represents the regions of \
    'region_bed'.
    :param figure_type: The kind of representation wanted (barplot or metagene)
@@ -58,8 +59,9 @@ def launcher(design: str, bw_folder: str, region_bed: str,
        norm = Path(norm)
        if not norm.is_file():
            raise FileNotFoundError(f"The file {norm} was not found")
-    create_figure(Path(design), Path(bw_folder), Path(region_bed),
-                  region_name, nb_bin, figure_type, norm, show_rep,
+    reg_beds = [Path(p) for p in region_beds]
+    create_figure(Path(design), Path(bw_folder), reg_beds,
+                  region_names, nb_bin, figure_type, norm, show_rep,
                  environment, border_names, Path(output))



--- a/src/visu/figure_maker.py
+++ b/src/visu/figure_maker.py
@@ -7,7 +7,7 @@ Description:
 """

 from pathlib import Path
-from typing import List, Union, Any
+from typing import List, Union, Any, Tuple
 from doctest import testmod
 from ..bed_handler.config import TestConfig
 import pandas as pd
@@ -17,15 +17,16 @@ import matplotlib.pyplot as plt
 from tqdm import tqdm


-def load_bed(bed: Path) -> List[List[Union[int, str]]]:
+def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]:
    """
    Read a bed file and return the lines within it.

    :param bed: A bed file containing the regions of interest
+    :param bed_name: The name of the regions of interest inside the bed file
    :return:The list of feature inside the bed

-    >>> load_bed(TestConfig.gene_bed)[0]
-    ['18', 28645943, 28682388, 1, 'DSC2', '-']
+    >>> load_bed(TestConfig.gene_bed, 'gene_test')[0]
+    ['18', 28645943, 28682388, 1, 'DSC2', '-', 'gene_test']
    """
    list_regions = []
    with bed.open('r') as inbed:
@@ -33,10 +34,34 @@ def load_bed(bed: Path) -> List[List[Union[int, str]]]:
            if not line.startswith("#"):
                cline = line.replace("\n", "").split("\t")
                list_regions.append([cline[0], int(cline[1]), int(cline[2]),
-                                     int(cline[3]), cline[4], cline[5]])
+                                     int(cline[3]), cline[4], cline[5],
+                                     bed_name])
    return list_regions


+def load_beds(beds: List[Path], bed_names: List[str]
+              ) -> List[List[Union[int, str]]]:
+    """
+    Read several bed files and return all the lines within them.
+
+    :param beds: A list of bed files containing the regions of interest
+    :param bed_names: A list of names identifying the regions inside the \
+    given beds (one name per bed file, in the same order).
+    :return: The list of features inside the bed files, each one tagged \
+    with the name given to its bed file
+    :raises IndexError: If beds and bed_names do not have the same length
+
+    >>> load_beds([TestConfig.gene_bed, TestConfig.gene_bed],
+    ... ['gene1', 'gene2'])[0]
+    ['18', 28645943, 28682388, 1, 'DSC2', '-', 'gene1']
+    >>> load_beds([TestConfig.gene_bed, TestConfig.gene_bed],
+    ... ['gene1', 'gene2'])[-1]
+    ['13', 45967450, 45992516, 9, 'SLC25A30', '-', 'gene2']
+    """
+    # Fail early instead of silently ignoring extra names or crashing
+    # half-way through the files when a name is missing.
+    if len(beds) != len(bed_names):
+        raise IndexError("Parameter beds and bed_names should "
+                         "have the same length")
+    regions = []
+    for bed, bed_name in zip(beds, bed_names):
+        regions.extend(load_bed(bed, bed_name))
+    return regions
+
+
 def inspect_bigwig_regions(bw: Any, region: List,
                           replicate: str, nb_bin: int, resize: List[int],
                           condition_name: str,
@@ -54,30 +79,30 @@ def inspect_bigwig_regions(bw: Any, region: List,
    :return: a table with the coverage of this region

    >>> my_bw = pbw.open(str(TestConfig.small_bw))
-    >>> region = ['1', 10, 25, 1, 'Test', '+']
-    >>> inspect_bigwig_regions(my_bw, region, 'R1', 5, [4, 2], 'cond1')
-        coverage  bin condition replicate
-    0   0.000000   -2     cond1        R1
-    1   0.500000   -1     cond1        R1
-    2  75.000000    0     cond1        R1
-    3  20.000000    1     cond1        R1
-    4  10.000000    2     cond1        R1
-    5   4.666667    3     cond1        R1
-    6   2.000000    4     cond1        R1
-    7   1.000000    5     cond1        R1
-    8   0.500000    6     cond1        R1
-    >>> region = ['1', 110, 133, 1, 'Test', '-']
-    >>> inspect_bigwig_regions(my_bw, region, 'R1', 5, [4, 2], 'cond1')
-       coverage  bin condition replicate
-    0      0.00   -2     cond1        R1
-    1     12.50   -1     cond1        R1
-    2     42.00    0     cond1        R1
-    3      8.00    1     cond1        R1
-    4      4.25    2     cond1        R1
-    5      2.00    3     cond1        R1
-    6      2.00    4     cond1        R1
-    7      1.00    5     cond1        R1
-    8      1.00    6     cond1        R1
+    >>> mregion = ['1', 10, 25, 1, 'Test', '+', 'exon']
+    >>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [4, 2], 'cond1')
+        coverage  bin condition replicate region
+    0   0.000000   -2     cond1        R1   exon
+    1   0.500000   -1     cond1        R1   exon
+    2  75.000000    0     cond1        R1   exon
+    3  20.000000    1     cond1        R1   exon
+    4  10.000000    2     cond1        R1   exon
+    5   4.666667    3     cond1        R1   exon
+    6   2.000000    4     cond1        R1   exon
+    7   1.000000    5     cond1        R1   exon
+    8   0.500000    6     cond1        R1   exon
+    >>> mregion = ['1', 110, 133, 1, 'Test', '-', 'exon2']
+    >>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [4, 2], 'cond1')
+       coverage  bin condition replicate region
+    0      0.00   -2     cond1        R1  exon2
+    1     12.50   -1     cond1        R1  exon2
+    2     42.00    0     cond1        R1  exon2
+    3      8.00    1     cond1        R1  exon2
+    4      4.25    2     cond1        R1  exon2
+    5      2.00    3     cond1        R1  exon2
+    6      2.00    4     cond1        R1  exon2
+    7      1.00    5     cond1        R1  exon2
+    8      1.00    6     cond1        R1  exon2
    """
    val = bw.stats(region[0], region[1], region[2], nBins=nb_bin, exact=True)
    bins = list(range(len(val)))
@@ -110,6 +135,7 @@ def inspect_bigwig_regions(bw: Any, region: List,
    df = pd.DataFrame(dic)
    df['condition'] = [condition_name] * df.shape[0]
    df['replicate'] = [replicate] * df.shape[0]
+    df['region'] = [region[6]] * df.shape[0]
    return df


@@ -165,10 +191,31 @@ def create_full_table(df_exp: pd.DataFrame, regions: List[List],
    return pd.concat(list_df, axis=0, ignore_index=True)


+def merge_condition_region_col(df: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
+    """
+    Collapse the 'condition' and 'region' columns into one grouping column.
+
+    :param df: A dataframe of mean coverage for each bin (left unmodified).
+    :return: A new dataframe with the region and condition columns merged \
+    and the name of the merged column
+    """
+    if len(df['region'].unique()) == 1:
+        return df.drop('region', axis=1), 'condition'
+    if len(df['condition'].unique()) == 1:
+        return df.drop('condition', axis=1), 'region'
+    condition_col = 'condition-region'
+    # Cast to str first: categorical columns do not support '+'.
+    merged = df['condition'].astype(str) + "-" + df['region'].astype(str)
+    df = df.drop(['condition', 'region'], axis=1)
+    df[condition_col] = merged
+    return df, condition_col
+
+
 def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
                      environment: List[int],
-                      region_name: str, order_condition: List[str]
-                      ) -> pd.DataFrame:
+                      region_name: str, order_condition: List[str],
+                      order_bed_name: List[str],
+                      ) -> Tuple[pd.DataFrame, str]:
    """
    summarize the data in df_cov.

@@ -180,21 +227,24 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
    the number of bin used to represent those surrounding regions.
    :param region_name: the name of the region analysed
    :param order_condition: The order of conditions
-    :return: The summarised dataframe
+    :param order_bed_name: The order of the bed names to respect
+    :return: The summarised dataframe and the condition col
    """
-    df_sum = df_cov.groupby(['bin', 'condition', 'replicate']).mean() \
+    df_sum = df_cov.groupby(['bin', 'condition', 'region', 'replicate']) \
+        .mean() \
        .reset_index()
    if figure_type == "metagene":
-        return df_sum
+        df_sum, condition_col = merge_condition_region_col(df_sum)
+        return df_sum, condition_col
    if environment[0] != 0:
        df_sum['location'] = df_cov['bin'].apply(
            lambda x: f"before_{region_name}" if x < 0 else
            f"after_{region_name}" if x >= nb_bin else region_name)
    df_sum.drop('bin', axis=1, inplace=True)
    if environment[0] != 0:
-        col_merge = ['condition', 'replicate', 'location']
+        col_merge = ['condition', 'region', 'replicate', 'location']
    else:
-        col_merge = ['condition', 'replicate']
+        col_merge = ['condition', 'region', 'replicate']
    df_sum = df_sum.groupby(col_merge).mean().reset_index()
    if 'location' in df_sum.columns:
        df_sum['location'] = pd.Categorical(
@@ -206,15 +256,21 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
            df_sum['condition'], ordered=True,
            categories=order_condition
        )
-        df_sum.sort_values(['condition', 'location'], ascending=True,
+        df_sum['region'] = pd.Categorical(
+            df_sum['region'], ordered=True,
+            categories=order_bed_name
+        )
+        df_sum.sort_values(['condition', 'region', 'location'], ascending=True,
                           inplace=True)
-    return df_sum
+    df_sum, condition_col = merge_condition_region_col(df_sum)
+    return df_sum, condition_col


 def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
                    border_names: List[str], nb_bin: int,
-                    environment: List[int], region_name: str,
-                    output: Path, norm: Union[int, Path]) -> None:
+                    environment: List[int], bed_name: str,
+                    output: Path, norm: Union[int, Path],
+                    condition_col: str) -> None:
    """
    Create a metagene figure on the region of interest.

@@ -226,18 +282,19 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
    nucleotide to represent around the region of interest and the second,
    the number of bin used to represent those surrounding regions.
    :param output: Folder where the figure will be created
-    :param region_name: The region of interest
+    :param bed_name: The name of considered regions
    :param norm: an integer corresponding to the bin used to normalise \
    the samples or a file containing the normalisations to apply to \
    each samples
+    :param condition_col: The name of the condition column
    """
    sns.set(context='poster', style='white')
    if show_replicate:
-        g = sns.relplot('bin', 'coverage', hue='condition', data=df_sum,
+        g = sns.relplot('bin', 'coverage', hue=condition_col, data=df_sum,
                        kind='line', style='replicate', ci=None,
                        height=12, aspect=1.7)
    else:
-        g = sns.relplot('bin', 'coverage', hue='condition', data=df_sum,
+        g = sns.relplot('bin', 'coverage', hue=condition_col, data=df_sum,
                        kind='line', ci="sd", height=12, aspect=1.7)
    y_val = g.ax.get_ylim()[1] * 0.99
    if border_names[0] != '':
@@ -250,11 +307,13 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
    g.set_xlabels('Bins')
    g.set_ylabels('Coverage')
    plt.subplots_adjust(top=0.9)
-    title = f"Average coverage in region '{region_name}'"
+    tmp_bed_name = bed_name.replace("--", ", ")
+    title = f"Average coverage in region '{tmp_bed_name}'"
    if environment[0] != 0:
        title += f"\nand in their surrounding regions of {environment[0]} nt"
    g.fig.suptitle(title)
-    outfile_title = f"metagene_{region_name}_{nb_bin}bin_" \
+    tmp_bed_name = bed_name.replace("--", "-")
+    outfile_title = f"metagene_{tmp_bed_name}_{nb_bin}bin_" \
                    f"{environment[0]}_nt-around-{environment[1]}-bin"
    if isinstance(norm, int):
        outfile_title += f"_b{norm}_norm"
@@ -268,7 +327,8 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
 def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
                   nb_bin: int,
                   environment: List[int], region_name: str,
-                   output: Path, norm: Union[int, Path]) -> None:
+                   output: Path, norm: Union[int, Path],
+                   condition_col: str) -> None:
    """
    Create a barplot figure on the region of interest.

@@ -283,22 +343,25 @@ def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
    :param norm: an integer corresponding to the bin used to normalise \
    the samples or a file containing the normalisations to apply to \
    each samples
+    :param condition_col: The name of the condition column
    """
    sns.set(context='poster', style='white')
    if show_replicate:
-        g = sns.catplot(x="condition", y="coverage", hue="replicate",
+        g = sns.catplot(x=condition_col, y="coverage", hue="replicate",
                        kind="bar", data=df_sum, height=12, aspect=1.77,
                        ci=None)
    else:
-        g = sns.catplot(x="condition", y="coverage",
+        g = sns.catplot(x=condition_col, y="coverage",
                        kind="bar", data=df_sum, height=12, aspect=1.77,
                        ci='sd')
    g.set_xlabels('')
    g.set_ylabels('Coverage')
    plt.subplots_adjust(top=0.9)
-    title = f"Average coverage in region '{region_name}'"
+    rgt = region_name.replace('--', ', ')
+    title = f"Average coverage in region '{rgt}'"
    g.fig.suptitle(title)
-    outfile_title = f"barplot_{region_name}_{nb_bin}bin_" \
+    rgt = region_name.replace('--', '-')
+    outfile_title = f"barplot_{rgt}_{nb_bin}bin_" \
                    f"{environment[0]}_nt-around-{environment[1]}-bin"
    if isinstance(norm, int):
        outfile_title += f"_b{norm}_norm"
@@ -326,8 +389,8 @@ def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path],
            raise ValueError(f"the bin {norm} was not found in the coverage "
                             f"dataframe.")
        df_val = df.loc[df['bin'] == norm,
-                        ['coverage', 'condition', 'replicate']] \
-            .groupby(['condition', 'replicate']).mean().reset_index()
+                        ['coverage', 'condition', 'region', 'replicate']] \
+            .groupby(['condition', 'region', 'replicate']).mean().reset_index()
        df_val.rename({"coverage": "coef"}, axis=1, inplace=True)
        noutfile = outfile.parent / 'coef_table' / \
                   (outfile.name.replace(".txt.gz", "") +
@@ -336,14 +399,19 @@ def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path],
        df_val.to_csv(noutfile, sep="\t", index=False)
    else:
        df_val = pd.read_csv(norm, sep="\t")
-    df = df.merge(df_val, how="left", on=['condition', 'replicate'])
+    if len(df_val['region'].unique()) > 1:
+        df = df.merge(df_val, how="left", on=['condition', 'region',
+                                              'replicate'])
+    else:
+        df_val.drop('region', axis=1, inplace=True)
+        df = df.merge(df_val, how="left", on=['condition', 'replicate'])
    df['coverage'] = df['coverage'] / df['coef']
    df.drop('coef', axis=1, inplace=True)
    return df


-def create_figure(design: Path, bw_folder: Path, region_bed: Path,
-                  region_name: str, nb_bin: int = 100,
+def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
+                  bed_names: List[str], nb_bin: int = 100,
                  figure_type: str = 'metagene',
                  norm: Union[int, Path, None] = None,
                  show_replicate: bool = True, environment: List[int] = (0, 0),
@@ -358,8 +426,9 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
    the last one contains the replicate of the condition.
    :param bw_folder: The folder containing the bigwig file mentioned in \
    the first column of the 'design' table.
-    :param region_bed: A bed file containing the regions to visualise
-    :param region_name: The name of the region analysed
+    :param region_beds: A list of bed files containing the regions to visualise
+    :param bed_names: A list of names identifying the regions inside the given \
+    beds.
    :param nb_bin: The number of bins used to represents the regions of \
    'region_bed'.
    :param figure_type: The kind of representation wanted (barplot or metagene)
@@ -374,10 +443,15 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
    :param border_names: The name of the borders
    :param output: Folder where the results will be created
    """
+    if len(region_beds) != len(bed_names):
+        raise IndexError("Parameter region_beds and bed_names should "
+                         "have the same length")
    df_exp = pd.read_csv(design, sep="\t")
-    regions = load_bed(region_bed)
-    region_bed_name = region_bed.name.replace('.bed', '')
-    outfile = f'tmp_cov_table_{design.name}_{region_bed_name}_{nb_bin}bin_' \
+    regions = load_beds(region_beds, bed_names)
+    region_bed_name = "-".join([b.name.replace('.bed', '')
+                                for b in region_beds])
+    outfile = f'tmp_cov_table_{design.name.replace(".txt", "")}' \
+              f'_{region_bed_name}_{nb_bin}bin_' \
              f'{environment[0]}_nt-around-{environment[1]}-bin'
    if isinstance(norm, int):
        outfile += f'_bin{norm}_norm'
@@ -397,20 +471,22 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
    for condition in df_exp['condition'].to_list():
        if condition not in ordered_condition:
            ordered_condition.append(condition)
-    df_sum = create_df_summary(df_cov, figure_type, nb_bin, environment,
-                               region_name, ordered_condition)
+    region_kind = "--".join(bed_names)
+    df_sum, cond_col = create_df_summary(df_cov, figure_type, nb_bin,
+                                         environment, region_kind,
+                                         ordered_condition, bed_names)
    if figure_type == "metagene":
        figure_metagene(df_sum, show_replicate, border_names, nb_bin,
-                        environment, region_name, output, norm)
+                        environment, region_kind, output, norm, cond_col)
    else:
        if 'location' in df_sum.columns:
            for cur_region in df_sum['location'].unique():
                df_tmp = df_sum.loc[df_sum['location'] == cur_region, :]
                figure_barplot(df_tmp, show_replicate, nb_bin, environment,
-                               cur_region, output, norm)
+                               cur_region, output, norm, cond_col)
        else:
            figure_barplot(df_sum, show_replicate, nb_bin, environment,
-                           region_name, output, norm)
+                           region_kind, output, norm, cond_col)


 if __name__ == "__main__":