Skip to content
Snippets Groups Projects
figure_maker.py 29.3 KiB
Newer Older
nfontrod's avatar
nfontrod committed
#!/usr/bin/env python3

# -*- coding: UTF-8 -*-

"""
Description:
"""

from pathlib import Path
from typing import List, Union, Any, Tuple, Optional
nfontrod's avatar
nfontrod committed
from doctest import testmod
from ..bed_handler.config import TestConfig
import pandas as pd
import pyBigWig as pbw
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import matplotlib as mpl
import matplotlib.font_manager
from rpy2.robjects import r, pandas2ri, FloatVector
from matplotlib import collections as mc
import numpy as np
nfontrod's avatar
nfontrod committed


def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]:
nfontrod's avatar
nfontrod committed
    """
    Read a bed file and return the lines within it.

    :param bed: A bed file containing the regions of interest
    :param bed_name: The name of the regions of interest inside the bed file
nfontrod's avatar
nfontrod committed
    :return:The list of feature inside the bed

    >>> load_bed(TestConfig.gene_bed, 'gene_test')[0]
    ['18', 28645943, 28682388, 1, 'DSC2', '-', 'gene_test']
nfontrod's avatar
nfontrod committed
    """
    list_regions = []
    with bed.open('r') as inbed:
        for line in inbed:
            if not line.startswith("#"):
                cline = line.replace("\n", "").split("\t")
                my_id = int(cline[3]) if cline[3].isdigit() else cline[3]
nfontrod's avatar
nfontrod committed
                list_regions.append([cline[0], int(cline[1]), int(cline[2]),
                                     my_id, cline[4], cline[5],
nfontrod's avatar
nfontrod committed
    return list_regions


def load_beds(beds: List[Path], bed_names: List[str]
              ) -> List[List[Union[int, str]]]:
    """
    Read a bed file and return the lines within it.

    :param beds: A list of bed files containing the regions of interest
    :param bed_names: A list of names indentifying regions insides the given \
    beds.
    :return:The list of feature inside the beds file

    >>> load_beds([TestConfig.gene_bed, TestConfig.gene_bed],
    ... ['gene1', 'gene2'])[0]
    ['18', 28645943, 28682388, 1, 'DSC2', '-', 'gene1']
    >>> load_beds([TestConfig.gene_bed, TestConfig.gene_bed],
    ... ['gene1', 'gene2'])[-1]
    ['13', 45967450, 45992516, 9, 'SLC25A30', '-', 'gene2']
    """
    regions = []
    for i in range(len(beds)):
        regions += load_bed(beds[i], bed_names[i])
    return regions


nfontrod's avatar
nfontrod committed
def inspect_bigwig_regions(bw: Any, region: List,
                           replicate: str, nb_bin: int, resize: List[int],
                           condition_name: str,
                           ) -> pd.DataFrame:
    """
    get the coverage value inside the bigwig region `region`.

    :param bw: A opened bigwig file
    :param region: The region of interest
    :param replicate: The replicate name
    :param nb_bin: The number of bin that will represent the region
    :param resize: The number of nucleotide used to extend the region
    in both side
    :param condition_name: the name of the condition
    :return: a table with the coverage of this region

    >>> my_bw = pbw.open(str(TestConfig.small_bw))
    >>> mregion = ['1', 10, 25, 1, 'Test', '+', 'exon']
    >>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [4, 2], 'cond1')
        coverage  bin condition replicate region
    0   0.000000   -2     cond1        R1   exon
    1   0.500000   -1     cond1        R1   exon
    2  75.000000    0     cond1        R1   exon
    3  20.000000    1     cond1        R1   exon
    4  10.000000    2     cond1        R1   exon
    5   4.666667    3     cond1        R1   exon
    6   2.000000    4     cond1        R1   exon
    7   1.000000    5     cond1        R1   exon
    8   0.500000    6     cond1        R1   exon
    >>> mregion = ['1', 110, 133, 1, 'Test', '-', 'exon2']
    >>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [4, 2], 'cond1')
       coverage  bin condition replicate region
    0      0.00   -2     cond1        R1  exon2
    1     12.50   -1     cond1        R1  exon2
    2     42.00    0     cond1        R1  exon2
    3      8.00    1     cond1        R1  exon2
    4      4.25    2     cond1        R1  exon2
    5      2.00    3     cond1        R1  exon2
    6      2.00    4     cond1        R1  exon2
    7      1.00    5     cond1        R1  exon2
    8      1.00    6     cond1        R1  exon2
    >>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [0, 0], 'cond1')
       coverage  bin condition replicate region
    0     42.00    0     cond1        R1  exon2
    1      8.00    1     cond1        R1  exon2
    2      4.25    2     cond1        R1  exon2
    3      2.00    3     cond1        R1  exon2
    4      2.00    4     cond1        R1  exon2
nfontrod's avatar
nfontrod committed
    """
    r_start = max(region[1], 0)
    r_end = min(region[2], bw.chroms(region[0]))
    if r_end - r_start < nb_bin:
        return pd.DataFrame()
    val = bw.stats(region[0],
                   max(region[1], 0),
                   min(region[2], bw.chroms(region[0])),
                   nBins=nb_bin,
                   exact=True)
nfontrod's avatar
nfontrod committed
    bins = list(range(len(val)))
    if len(bins) != nb_bin:
        raise ValueError("The lenght of bins should be equals to nb_bin")
    max_loc = max(region[1] - resize[0], 0)

    min_loc = min(region[2] + resize[0], bw.chroms(region[0]))
nfontrod's avatar
nfontrod committed
    if resize[0] > 0:
        val_before = bw.stats(region[0], max_loc, region[1], nBins=resize[1],
                              exact=True)
        val_after = bw.stats(region[0], region[2], min_loc, nBins=resize[1],
                             exact=True)
    else:
        val_before = []
        val_after = []
    if None in val_after:
        print(f"Warning ! None values found in {region} - "
              f"{[region[0], region[2], min_loc]}")
    if region[5] == "+":
        bin_before = list(range(-len(val_before), 0))
        bin_after = list(range(bins[-1] + 1,
                               bins[-1] + 1 + len(val_after)))
        val = val_before + val + val_after
    else:
        bin_before = list(range(-len(val_after), 0))
        bin_after = list(range(bins[-1] + 1,
                               bins[-1] + 1 + len(val_before)))
        val = val_after[::-1] + val[::-1] + val_before[::-1]
    bins = bin_before + bins + bin_after
nfontrod's avatar
nfontrod committed
    dic = {"coverage": val, "bin": bins}
    df = pd.DataFrame(dic)
    df['condition'] = [condition_name] * df.shape[0]
    df['replicate'] = [replicate] * df.shape[0]
    df['region'] = [region[6]] * df.shape[0]
nfontrod's avatar
nfontrod committed
    return df


def create_sample_table(bw_file: Path, regions: List[List],
                        replicate: str, nb_bin: int, resize: List[int],
                        condition_name: str,
                        ) -> pd.DataFrame:
    """
    Get the table for all the regions of interest

    :param bw_file: A bigwig file
    :param regions: The regions of interest
    :param replicate: The replicate name
    :param nb_bin: The number of bin that will represent the region
    :param resize: The number of nucleotide used to extend the region
    in both side
    :param condition_name: the name of the condition
    :return: a table with the coverage of this region
    """
    list_df = []
    bw = pbw.open(str(bw_file))
    for region in tqdm(regions, desc="scanning coverage ..."):
        list_df.append(inspect_bigwig_regions(bw, region, replicate, nb_bin,
                                              resize, condition_name))
    r = sum([df.empty for df in list_df])
    if r > 0:
        logger.warning(f"They were {r} bed feature that could not be reported "
                       f"because of their small size")
nfontrod's avatar
nfontrod committed
    return pd.concat(list_df, axis=0, ignore_index=True)


def create_full_table(df_exp: pd.DataFrame, regions: List[List],
                      nb_bin: int, resize: List[int],
                      input_folder: Path) -> pd.DataFrame:
    """
    get the regions for every bigwig files.

    :param df_exp: A dataframe containing the bigwig file that \
    we want to analyse
    :param regions: The regions to visualise
    :param nb_bin: The number of bin used to resize the regions
    :param resize: The number of nucleotides \
    of the regions localised at each sides of the genomic regions inside
    `regions`.
    :param input_folder: Folder where the bigwig file are located
    :return: The full coverage table
    """
    list_df = []
    for i in range(df_exp.shape[0]):
        mline = df_exp.iloc[i, :]
        bw_file = input_folder / mline['bigwig']
        print(f"working on file {bw_file}")
        condition = mline['condition']
        replicate = mline['replicate']
        list_df.append(create_sample_table(bw_file, regions, replicate,
                                           nb_bin, resize, condition))
    return pd.concat(list_df, axis=0, ignore_index=True)


def merge_condition_region_col(df: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
    """

    :param df: A dataframe of mean coverage for each bin.
    :return: The dataframe with the region of condition column merged and \
    the name of the merged column
    """
    if len(df['region'].unique()) == 1:
        condition_col = 'condition'
        df.drop('region', axis=1, inplace=True)
    elif len(df['condition'].unique()) == 1:
        condition_col = 'region'
        df.drop('condition', axis=1, inplace=True)
    else:
        condition_col = 'condition-region'
        df[condition_col] = df['condition'] + "-" + df['region']
        df.drop(['condition', 'region'], axis=1, inplace=True)
    return df, condition_col


nfontrod's avatar
nfontrod committed
def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
                      environment: List[int],
                      region_name: str, order_condition: List[str],
                      order_bed_name: List[str],
                      ) -> Tuple[pd.DataFrame, str]:
nfontrod's avatar
nfontrod committed
    """
    summarize the data in df_cov.

    :param df_cov: A dataframe of coverage for each bin.
    :param figure_type: The kind of figure to make (metagene or barplot)
    :param nb_bin: The number of bin used to represent the region of interest
    :param environment: A list of two int. The first contains the number of \
    nucleotide to represent around the region of interest and the second,
    the number of bin used to represent those surrounding regions.
    :param region_name: the name of the region analysed
    :param order_condition: The order of conditions
    :param order_bed_name: The order of bed name to respect
    :return: The summarised dataframe and the condition col
nfontrod's avatar
nfontrod committed
    """
    df_sum = df_cov.groupby(['bin', 'condition', 'region', 'replicate']) \
        .mean() \
nfontrod's avatar
nfontrod committed
        .reset_index()
    if figure_type == "metagene":
        if environment[0] != 0:
            df_sum['location'] = df_sum['bin'].apply(
                lambda x: f"before_{region_name}" if x < 0 else
                f"after_{region_name}" if x >= nb_bin else region_name)
        df_sum, condition_col = merge_condition_region_col(df_sum)
        return df_sum, condition_col
nfontrod's avatar
nfontrod committed
    df_sum.drop('bin', axis=1, inplace=True)
    if environment[0] != 0:
        col_merge = ['condition', 'region', 'replicate', 'location']
nfontrod's avatar
nfontrod committed
    else:
        col_merge = ['condition', 'region', 'replicate']
nfontrod's avatar
nfontrod committed
    df_sum = df_sum.groupby(col_merge).mean().reset_index()
    if 'location' in df_sum.columns:
        df_sum['location'] = pd.Categorical(
            df_sum['location'], ordered=True,
            categories=[f"before_{region_name}", region_name,
                        f"after_{region_name}"]
        )
        df_sum['condition'] = pd.Categorical(
            df_sum['condition'], ordered=True,
            categories=order_condition
        )
        df_sum['region'] = pd.Categorical(
            df_sum['region'], ordered=True,
            categories=order_bed_name
        )
        df_sum.sort_values(['condition', 'region', 'location'], ascending=True,
nfontrod's avatar
nfontrod committed
                           inplace=True)
    df_sum, condition_col = merge_condition_region_col(df_sum)
    return df_sum, condition_col
nfontrod's avatar
nfontrod committed


def line_maker(list_pval, up_value, position=0):
    """
    Create a list of lines, with their colour.

    Create a line if the a p-value in list_pval[i] is below 0.05.
    If up_mean[i] > down_mean[i] the line will be green, purple else.
    :param list_pval: (list of float), list of pvalue get by the comparison
    of a propensity scale in a particular sequence position in an
    up-regulated and down_regulated set of sequences.
    :param up_value: (int) the ordinate coordinates where the line will be
    placed.
    :param position: (int) the abscissa position to begin to draw the lines
    :return: lines - (list of list of 2 tuple), the list of 2 tuple corresponds
    to a lines with the coordinates [(x1, y1), (x2, y2)]
    """
    lcolor = []
    lines = []
    for i in range(len(list_pval)):
        if list_pval[i] <= 0.05:
            val = i + position
            lines.append([(val - 0.5, up_value), (val + 0.5, up_value)])
            lcolor.append("#000000")  # red
    return lines, lcolor


def paired_t_test(values1: List[float], values2: List[float]) -> float:
    """
    Get the p-value of a paired t-test for each bin

    :param values1: A list of values
    :param values2: Another list of values
    :return: The p-values of the paired t-test
    >>> paired_t_test([1, 2, 8], [5, 8, 15])
    0.02337551764357566
    """
    if len(values1) != len(values2):
        raise IndexError("values1 and values2 should have the same length")
    ttp = r("""
    function(values1, values2) {
        return(t.test(values1, values2, paired=T)$p.value)
    }
    """)
    return ttp(FloatVector(values1), FloatVector(values2))[0]


def compute_stats(dff: pd.DataFrame, y_line: float, group_col: str,
                  outfile: Path) -> Tuple[List[List[Tuple]], List]:
    """

    :param dff: A dataframe containing the coverage displayed in the figure
    :param y_line: The height of the p-value line
    :param group_col: A group column
    :param outfile: The file to save the stats
    :return: A list of lines coordinates in the form of [(x1, y1), (x2, y2)], \
    and the color associated to each line
    """
    df = dff.sort_values(["bin", group_col], ascending=True)
    groups = df[group_col].unique()
    if len(groups) != 2:
        raise NotImplementedError("Statistical analysis only implemented for "
                                  "2 groups of data")
    p_values_ttp = []
    grp1 = []
    vgrp1 = []
    grp2 = []
    vgrp2 = []
    for bin in df["bin"].unique():
        tmp = df[df["bin"] == bin]
        values1 = tmp.loc[tmp[group_col] == groups[0], "coverage"].values
        values2 = tmp.loc[tmp[group_col] == groups[1], "coverage"].values
        p_values_ttp.append(paired_t_test(values1, values2))
        grp1.append(np.mean(values1))
        grp2.append(np.mean(values2))
        vgrp1.append(";".join(map(str, [round(v, 3) for v in values1])))
        vgrp2.append(";".join(map(str, [round(v, 3) for v in values2])))
    stats_df = pd.DataFrame({"bin": df["bin"].unique(),
                             "p_values": p_values_ttp,
                             groups[0]: grp1, groups[1]: grp2,
                             f"val-{groups[0]}": vgrp1,
                             f"val-{groups[1]}": vgrp2,})
    stats_df.to_csv(outfile, sep="\t", index=False)
    return line_maker(p_values_ttp, y_line, min(df["bin"].unique()))


def display_stat(g: sns.FacetGrid, dff: pd.DataFrame, y_line: float,
                 group_col: str, stat: bool, outfile: Path) -> sns.FacetGrid:
    """
    Update the graphics by displaying stats.

    :param g: A seaborn FacetGrid objects corresponding to the metagene \
    figure
    :param dff: A dataframe containing the coverage displayed in the figure
    :param y_line: The height of the p-value line
    :param group_col: The column containing the groups analyzed
    :param outfile: The file where the metagene will be saved
    :return: The facetGrid with the stats
    """
    if not stat:
        return g
    stat_file = outfile.parent / f"{outfile.stem}.txt"
    lines, lcolor = compute_stats(dff, y_line, group_col, stat_file)
    lc = mc.LineCollection(lines, colors=lcolor, linewidths=5)
    g.ax.add_collection(lc)
    return g


nfontrod's avatar
nfontrod committed
def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
                    border_names: List[str], nb_bin: int,
                    environment: List[int], bed_name: str,
                    output: Path, norm: Union[int, Path],
                    condition_col: str, ylim: Optional[List[float]],
                    stat: bool = False) -> Path:
nfontrod's avatar
nfontrod committed
    """
    Create a metagene figure on the region of interest.

    :param df_sum: The summarized coverage table
    :param show_replicate: True to show the replicate, false else
    :param border_names: The name of borders of the region of interest
    :param nb_bin: The number of bins representing the regions of interest
    :param environment:  A list of two int. The first contains the number of \
    nucleotide to represent around the region of interest and the second,
    the number of bin used to represent those surrounding regions.
    :param output: Folder where the figure will be created
    :param bed_name: The name of considered regions
    :param norm: an integer corresponding to the bin used to normalise \
    the samples or a file containing the normalisations to apply to \
    each samples
    :param condition_col: The name of the condition columns
    :param ylim: The range of the y axis
    :param stat: A boolean indicating wether to perform statistical analysis
nfontrod's avatar
nfontrod committed
    """
    font_files = matplotlib.font_manager.findSystemFonts(fontpaths=None,
                                                         fontext='ttf')
    font_list = matplotlib.font_manager.createFontList(font_files)
    matplotlib.font_manager.fontManager.ttflist.extend(font_list)

    sns.set(context='poster', style='white', font_scale=1.4)
    mpl.rcParams['font.family'] = 'Arial'
nfontrod's avatar
nfontrod committed
    if show_replicate:
        g = sns.relplot('bin', 'coverage', hue=condition_col, data=df_sum,
nfontrod's avatar
nfontrod committed
                        kind='line', style='replicate', ci=None,
                        height=12, aspect=1.7)
    else:
        g = sns.relplot('bin', 'coverage', hue=condition_col, data=df_sum,
                        kind='line', ci=None, height=12, aspect=1.7)
nfontrod's avatar
nfontrod committed
    y_val = g.ax.get_ylim()[1] * 0.99
    if border_names[0] != '':
        g.ax.axvline(x=0, color='k', linestyle='--', alpha=0.1)
        g.ax.annotate(border_names[0], [0, y_val], ha="center", va='center')
    if border_names[1] != '':
        g.ax.axvline(x=nb_bin - 1, color='k', linestyle='--', alpha=0.1)
        g.ax.annotate(border_names[1], [nb_bin - 1, y_val], ha="center",
                      va='center')
    g.set_xlabels('Bins')
    g.set_ylabels('Coverage')
    plt.subplots_adjust(top=0.9)
    tmp_bed_name = bed_name.replace("--", ", ")
    title = f"Average coverage in region '{tmp_bed_name}'"
nfontrod's avatar
nfontrod committed
    if environment[0] != 0:
        title += f" and in their surrounding regions of {environment[0]} nt"
    g.fig.suptitle(title, fontsize=30)
    tmp_bed_name = bed_name.replace("--", "-")
    outfile_title = f"metagene_{tmp_bed_name}_{nb_bin}bin_" \
                    f"{environment[0]}_nt-around-{environment[1]}-bin"
    if isinstance(norm, int):
        outfile_title += f"_b{norm}_norm"
    elif isinstance(norm, Path):
        outfile_title += f"_file_norm"
nfontrod's avatar
nfontrod committed
    outfile_title += ".pdf"
    if ylim[0] is not None:
        g.ax.set_ylim(ylim[0], ylim[1])
    ymin, ymax = g.ax.get_ylim()
    g = display_stat(g, df_sum, ymin + ((ymax - ymin) / 50), condition_col,
                     stat, output / outfile_title)
    g.ax.tick_params(left=True, bottom=True)
nfontrod's avatar
nfontrod committed
    g.savefig(output / outfile_title)
nfontrod's avatar
nfontrod committed
    g.fig.clf()
nfontrod's avatar
nfontrod committed


def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
                   nb_bin: int,
                   environment: List[int], region_name: str,
                   output: Path, norm: Union[int, Path],
                   condition_col: str) -> None:
nfontrod's avatar
nfontrod committed
    """
    Create a barplot figure on the region of interest.

    :param df_sum: The summarized coverage table
    :param show_replicate: True to show the replicate, false else
    :param nb_bin: The number of bins representing the regions of interest
    :param environment:  A list of two int. The first contains the number of \
    nucleotide to represent around the region of interest and the second,
    the number of bin used to represent those surrounding regions.
    :param output: Folder where the figure will be created
    :param region_name: The region of interest
    :param norm: an integer corresponding to the bin used to normalise \
    the samples or a file containing the normalisations to apply to \
    each samples
    :param condition_col: The name of the condition columns
nfontrod's avatar
nfontrod committed
    """
    sns.set(context='poster', style='white')
    if show_replicate:
        g = sns.catplot(x=condition_col, y="coverage", hue="replicate",
nfontrod's avatar
nfontrod committed
                        kind="bar", data=df_sum, height=12, aspect=1.77,
                        ci=None)
    else:
        g = sns.catplot(x=condition_col, y="coverage",
nfontrod's avatar
nfontrod committed
                        kind="bar", data=df_sum, height=12, aspect=1.77,
                        ci='sd')
    g.set_xlabels('')
    g.set_ylabels('Coverage')
    plt.subplots_adjust(top=0.9)
    rgt = region_name.replace('--', ', ')
    title = f"Average coverage in region '{rgt}'"
nfontrod's avatar
nfontrod committed
    g.fig.suptitle(title)
    rgt = region_name.replace('--', '-')
    outfile_title = f"barplot_{rgt}_{nb_bin}bin_" \
                    f"{environment[0]}_nt-around-{environment[1]}-bin"
    if isinstance(norm, int):
        outfile_title += f"_b{norm}_norm"
    elif isinstance(norm, Path):
        outfile_title += f"_file_norm"
nfontrod's avatar
nfontrod committed
    outfile_title += ".pdf"
    g.savefig(output / outfile_title)
nfontrod's avatar
nfontrod committed
    g.fig.clf()


def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path],
                      outfile: Path) -> pd.DataFrame:
nfontrod's avatar
nfontrod committed
    """
    Normalise the bins coverage by the average coverage on a particular bin \
    or by a value given in a particular file.
nfontrod's avatar
nfontrod committed

    :param df: he dataframe of coverage
    :param norm: The bin used to normalise the sample or a file containing \
    the value used to normalise the samples.
    :param outfile: The table containing coverage values
nfontrod's avatar
nfontrod committed
    :return: the dataframe with normalised coverage
    """
    if isinstance(norm, int):
        if norm not in list(df['bin'].unique()):
            raise ValueError(f"the bin {norm} was not found in the coverage "
                             f"dataframe.")
        df_val = df.loc[df['bin'] == norm,
                        ['coverage', 'condition', 'region', 'replicate']] \
            .groupby(['condition', 'region', 'replicate']).mean().reset_index()
        df_val.rename({"coverage": "coef"}, axis=1, inplace=True)
        noutfile = outfile.parent / 'coef_table' / \
                   (outfile.name.replace(".txt.gz", "") +
                    f".txt")
        noutfile.parent.mkdir(exist_ok=True, parents=True)
        df_val.to_csv(noutfile, sep="\t", index=False)
    else:
        df_val = pd.read_csv(norm, sep="\t")
    if len(df_val['region'].unique()) > 1:
        df = df.merge(df_val, how="left", on=['condition', 'region',
                                              'replicate'])
    else:
        df_val.drop('region', axis=1, inplace=True)
        df = df.merge(df_val, how="left", on=['condition', 'replicate'])
nfontrod's avatar
nfontrod committed
    df['coverage'] = df['coverage'] / df['coef']
    df.drop('coef', axis=1, inplace=True)
    return df


def compute_diff(df_sum: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the average coverage differences between two conditions.

    :param df_sum: A dataframe containing the coverage values between \
    two conditions
    :return: The dataframe with a coolumn diff

    >>> di = pd.DataFrame({'bin': [-25] * 6,
    ... 'condition': ['Ctrl', 'Ctrl', 'Ctrl', 'siDDX', 'siDDX', 'siDDX'],
    ... 'replicate': ['R1', 'R2', 'R3', 'R1', 'R2', 'R3'],
    ... 'coverage': [0.1,
    ...  0.2,
    ...  0.3,
    ...  0.15,
    ...  0.25,
    ...  0.35],
    ... 'location': ['before_ddx_down'] * 6})
    >>> compute_diff(di)
       bin         location  Ctrl  siDDX  diff
    0  -25  before_ddx_down   0.2   0.25  0.05
    """
    cond = df_sum["condition"].unique()
    df_sum = df_sum.groupby(["bin", "location", "condition"]).mean()
    if len(cond) != 2:
        raise IndexError("Only two different conditions must be given to "
                         "perform stats")
    df = df_sum.pivot_table(index=["bin", "location"],
                            columns="condition",
                            values="coverage").reset_index()
    df.columns.name = None
    df.groupby(["bin", "location"])
    df["diff"] = df[cond[1]] - df[cond[0]]
    return df


def get_shapiro_an_lm_pvalue(df: pd.DataFrame, location: str) -> pd.Series:
    """
    Return a series containing the shapiro and the t-test p-value of \
    the difference in coverage between 2 conditions at a particular location

    :param df: A dataframe containing the difference in coverage \
    between 2 conditions at a particular location
    :param location: The region of interest
    :return: a series containing the shapiro and the t-test p-value of \
    the difference in coverage between 2 conditions at a particular location
    """
    df2 = df[df["location"] == location].copy()
    df2.to_csv(f"{location}.txt", sep="\t", index=False)
    pandas2ri.activate()
    stat_s = r("""
    function(data){
        mod <- lm(diff ~ 1, data=data)
        s <- summary(mod)
        stat <- s$coefficients[1, "Pr(>|t|)"]
        shapiro <- shapiro.test(data$diff)$p.value
        return(c(shapiro, stat))
    }
    """)
    res = stat_s(df2)
    return pd.Series({"shapiro_p-value": res[0],
                      "t-test-p-value": res[1],
                      "location": location})


def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
                  bed_names: List[str], nb_bin: int = 100,
                  figure_type: str = 'metagene',
                  norm: Union[int, Path, None] = None,
nfontrod's avatar
nfontrod committed
                  show_replicate: bool = True, environment: List[int] = (0, 0),
                  border_names: List[str] = ('', ''),
                  ylim: List[float] = (None, None), stat: bool = False
                  ) -> None:
nfontrod's avatar
nfontrod committed
    """
    Create A metagene or a barplot figure from bigwig file on regions defined \
    in the bed file provided with 'region_bed' parameter.

    :param design: A tabulated file containing 3 columns. The first columns \
    contains a bigwig filename, the second contains the condition name and \
    the last one contains the replicate of the condition.
    :param bw_folder: The folder containing the bigwig file mentioned in \
    the first column of the 'design' table.
    :param region_beds: A list of bed files containing the regions to visualise
    :param bed_names: A list of names identifying regions insides the given \
    beds.
nfontrod's avatar
nfontrod committed
    :param nb_bin: The number of bins used to represents the regions of \
    'region_bed'.
    :param figure_type: The kind of representation wanted (barplot or metagene)
    :param norm: an integer corresponding to the bin used to normalise \
    the samples or a file containing the normalisations to apply to \
    each samples
nfontrod's avatar
nfontrod committed
    :param show_replicate: True to create a figure showing the replicate \
    false else.
    :param environment: A list of two int. The first contains the number of \
    nucleotide to represent around the region of interest and the second,
    the number of bin used to represent those surrounding regions.
    :param border_names: The name of the borders
    :param output: Folder where the results will be created
    :param stat: A boolean indicating whether or not to perform \
    statistical analysis
nfontrod's avatar
nfontrod committed
    """
    if len(region_beds) != len(bed_names):
        raise IndexError("Parameter region_beds and bed_names should "
                         "have the same length")
nfontrod's avatar
nfontrod committed
    df_exp = pd.read_csv(design, sep="\t")
    regions = load_beds(region_beds, bed_names)
    region_bed_name = "-".join(b.name.replace('.bed', '') for b in region_beds)
    outfile = f'tmp_cov_table_{design.name.replace(".txt", "")}' \
              f'_{region_bed_name}_{nb_bin}bin_' \
nfontrod's avatar
nfontrod committed
              f'{environment[0]}_nt-around-{environment[1]}-bin'
    if isinstance(norm, int):
        outfile += f'_bin{norm}_norm'
    elif isinstance(norm, Path):
        outfile += f'_file_norm'
nfontrod's avatar
nfontrod committed
    outfile += '.txt.gz'
nfontrod's avatar
nfontrod committed
    cov_file = output / outfile
    df_cov = create_full_table(df_exp, regions, nb_bin, environment,
                               bw_folder)
    if norm is not None:
        df_cov = bin_normalisation(df_cov, norm, cov_file)
nfontrod's avatar
nfontrod committed
    ordered_condition = []
    for condition in df_exp['condition'].to_list():
        if condition not in ordered_condition:
            ordered_condition.append(condition)
    region_kind = "--".join(bed_names)
    df_sum, cond_col = create_df_summary(df_cov, figure_type, nb_bin,
                                         environment, region_kind,
                                         ordered_condition, bed_names)
nfontrod's avatar
nfontrod committed
    if figure_type == "metagene":
        figure_metagene(df_sum, show_replicate, border_names, nb_bin,
                        environment, region_kind, output, norm,
                        cond_col, ylim, stat)
nfontrod's avatar
nfontrod committed
    else:
        if 'location' in df_sum.columns:
            for cur_region in df_sum['location'].unique():
                df_tmp = df_sum.loc[df_sum['location'] == cur_region, :]
                figure_barplot(df_tmp, show_replicate, nb_bin, environment,
nfontrod's avatar
nfontrod committed
        else:
            figure_barplot(df_sum, show_replicate, nb_bin, environment,
nfontrod's avatar
nfontrod committed


if __name__ == "__main__":
    testmod()