From 52a5208aae43bacb1b9fe86363ae5c9f33b3fe04 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Thu, 5 Nov 2020 17:06:39 +0100
Subject: [PATCH] src/gc_content/*.py: script to compute the gc content from
 bed files

---
 src/gc_content/__init__.py   |   7 ++
 src/gc_content/__main__.py   |  37 ++++++
 src/gc_content/config.py     |  17 +++
 src/gc_content/gc_content.py | 191 +++++++++++++++++++++++++++++++
 src/gc_content/gc_stats.py   |  46 ++++++++
 src/gc_content/stat_annot.py | 214 +++++++++++++++++++++++++++++++++++
 6 files changed, 512 insertions(+)
 create mode 100644 src/gc_content/__init__.py
 create mode 100644 src/gc_content/__main__.py
 create mode 100644 src/gc_content/config.py
 create mode 100644 src/gc_content/gc_content.py
 create mode 100644 src/gc_content/gc_stats.py
 create mode 100644 src/gc_content/stat_annot.py

diff --git a/src/gc_content/__init__.py b/src/gc_content/__init__.py
new file mode 100644
index 0000000..60f3a11
--- /dev/null
+++ b/src/gc_content/__init__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description:
+"""
diff --git a/src/gc_content/__main__.py b/src/gc_content/__main__.py
new file mode 100644
index 0000000..143a692
--- /dev/null
+++ b/src/gc_content/__main__.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: Launch the creation of GC barplots
+"""
+
+from .gc_content import create_barplot
+from typing import List
+from pathlib import Path
+import lazyparser as lp
+
+
+@lp.parse(beds="file", genome="file", environment="environment >= 0")
+def launcher(beds: List[str], bed_names: List[str],
+             genome: str, environment: int = 0,
+             ft_name: str = "interval") -> None:
+    """
+
+    :param beds: A list of bed files containing the regions to visualise
+    :param bed_names: A list of names identifying regions insides the given \
+    beds.
+    :param genome: A Fasta file containing a genome.
+    :param environment: Number of nucleotides around the genomic intervals \
+    in the beds files for which we want to know the gc content (default 0)
+    :param ft_name: A name corresponding to the feature of interest \
+    (default 'interval')
+    """
+    if len(bed_names) != len(beds):
+        raise IndexError("--bed_names and --beds should contains the same "
+                         "number of items")
+    beds = [Path(b) for b in beds]
+    create_barplot(beds, bed_names, genome, environment, ft_name)
+
+
+launcher()
diff --git a/src/gc_content/config.py b/src/gc_content/config.py
new file mode 100644
index 0000000..7036789
--- /dev/null
+++ b/src/gc_content/config.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: contains variables used in this modules
+"""
+
+from ..bed_handler.config import TestConfig
+from pathlib import Path
+
+
+class ConfigGC:
+    """
+    contains the folder where the gc barplot will be created
+    """
+    output = Path(__file__).parents[2] / "results" / "gc_barplots"
diff --git a/src/gc_content/gc_content.py b/src/gc_content/gc_content.py
new file mode 100644
index 0000000..0ffe299
--- /dev/null
+++ b/src/gc_content/gc_content.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: The goal of this script is to create a barplot indicating \
+the gc content of list of exons
+"""
+
+from .config import TestConfig, ConfigGC
+from ..visu.figure_maker import load_beds
+from typing import List, Any, Dict
+from pyfaidx import Fasta
+from Bio.Seq import Seq
+from doctest import testmod
+import pandas as pd
+import numpy as np
+from pathlib import Path
+import seaborn as sns
+from .gc_stats import make_stat
+from .stat_annot import add_stat_annotation
+
+
+def get_gc_content(bed_line: List[Any], dic_seq: Fasta) -> float:
+    """
+    Get the gc content of a genomic interval.
+
+    :param bed_line: A genomic interval in a bed format
+    :param dic_seq: A dictionary containing chromosomal sequences
+    :return: The gc content
+
+    >>> get_gc_content(["chr1", 0, 10, "s1", ".", "+"],
+    ... Fasta(str(TestConfig.test_fasta)))
+    100.0
+    >>> get_gc_content(["chr1", 10, 20, "s1", ".", "+"],
+    ... Fasta(str(TestConfig.test_fasta)))
+    50.0
+    >>> get_gc_content(["chr1", 20, 30, "s1", ".", "+"],
+    ... Fasta(str(TestConfig.test_fasta)))
+    0.0
+    >>> get_gc_content(["chr1", 0, 10, "s1", ".", "-"],
+    ... Fasta(str(TestConfig.test_fasta)))
+    100.0
+    >>> get_gc_content(["chr1", 10, 20, "s1", ".", "-"],
+    ... Fasta(str(TestConfig.test_fasta)))
+    50.0
+    >>> get_gc_content(["chr1", 20, 30, "s1", ".", "-"],
+    ... Fasta(str(TestConfig.test_fasta)))
+    0.0
+    """
+    seq = str(dic_seq[bed_line[0]][bed_line[1]:bed_line[2]]).upper()
+    if "N" in seq:
+        return np.nan
+    if bed_line[5] == "-":
+        seq = str(Seq(seq).reverse_complement())
+    return (seq.count("G") + seq.count("C")) / len(seq) * 100
+
+
+def get_many_gc_content(bed_line: List[Any], dic_seq: Fasta,
+                        environment: int) -> Dict:
+    """
+    Get the GC content of a given genomic interval and the GC content \
+    around this interval.
+
+    :param bed_line: A genomic interval in a bed format
+    :param dic_seq: A dictionary containing chromosomal sequences
+    :param environment: The number of nucleotides around the interval for \
+    which we want to get the gc content
+    :return: A dictionary containing the GC content inside the interval \
+    we want to get and around it.
+
+    >>> get_many_gc_content(["chr1", 10, 20, "s1", ".", "+"],
+    ... Fasta(str(TestConfig.test_fasta)), 0)
+    {'interval': 50.0}
+    >>> get_many_gc_content(["chr1", 10, 20, "s1", ".", "+"],
+    ... Fasta(str(TestConfig.test_fasta)), 10)
+    {'before': 100.0, 'interval': 50.0, 'after': 0.0}
+    >>> get_many_gc_content(["chr1", 10, 20, "s1", ".", "-"],
+    ... Fasta(str(TestConfig.test_fasta)), 10)
+    {'before': 0.0, 'interval': 50.0, 'after': 100.0}
+    """
+    gc_interval = get_gc_content(bed_line, dic_seq)
+    if environment == 0:
+        return {"interval": gc_interval}
+    gc_interval = get_gc_content(bed_line, dic_seq)
+    before_val = max(0, bed_line[1] - environment)
+    before_interval = get_gc_content([bed_line[0], before_val,
+                                      bed_line[1]] + bed_line[3:], dic_seq)
+    after_val = min(len(dic_seq[bed_line[0]]), bed_line[2] + environment)
+    after_interval = get_gc_content([bed_line[0], bed_line[2],
+                                     after_val] +
+                                    bed_line[3:], dic_seq)
+    if bed_line[5] == "-":
+        tmp = before_interval
+        before_interval = after_interval
+        after_interval = tmp
+    return {"before": before_interval, "interval": gc_interval,
+            "after": after_interval}
+
+
+def build_gc_dataframe(bed_content: List[List], dic_seq: Fasta,
+                       environment: int, ft_name: str = "interval"
+                       ) -> pd.DataFrame:
+    """
+    Create a dataframe containing the GC content of many bed files and \
+    their environment.
+
+    :param bed_content: A list of genomic intervals of interest
+    :param dic_seq: A dictionary containing genomic sequence
+    :param environment: Number of nucleotide around the genomic interval for \
+    which we want to know the gc content
+    :param ft_name: A name corresponding to the feature of interest
+    :return: a table containing the gc content of the intervals
+    """
+    dic = {"gc_content": [], "region": [], "location": []}
+    for bed_line in bed_content:
+        res = get_many_gc_content(bed_line, dic_seq,
+                                  environment)
+        for k in res:
+            dic["gc_content"].append(res[k])
+            dic["location"].append(k.replace("interval", ft_name))
+            dic["region"].append(bed_line[6])
+    return pd.DataFrame(dic)
+
+
+def make_gc_barplot(df: pd.DataFrame, outfile: Path, environment: int,
+                    region_names: List[str], ft_name: str) -> None:
+    """
+    Create a barplot showing the gc content in the given lists of intervals.
+
+    :param df: A dataframe containing the gc content of genomics intervals
+    :param outfile: The output file of interest
+    :param environment: Number of nucleotide around the genomic interval for \
+    which we want to know the gc content
+    :param region_names: The name of the regions analysed
+    :param ft_name: The name of the feature analyzed
+    """
+    sns.set(context="poster")
+    rgn = ", ".join(region_names[:-1]) + " and " + region_names[-1]
+    title = f"GC content of {rgn} {ft_name}"
+    p_vals = make_stat(df)
+    if environment != 0:
+        g = sns.catplot(x="location", y="gc_content", hue="region", data=df,
+                        aspect=1.77, height=12, kind="violin")
+        add_stat_annotation(g.ax, data=df, x="location", y="gc_content",
+                            hue="region",
+                            loc='inside',
+                            boxPairList=p_vals,
+                            linewidth=2, fontsize="medium",
+                            lineYOffsetAxesCoord=0,
+                            lineYOffsetToBoxAxesCoord=0.1)
+    else:
+        g = sns.catplot(x="region", y="gc_content", data=df,
+                        aspect=1.77, height=12, kind="violin")
+        add_stat_annotation(g.ax, data=df, x="region", y="gc_content",
+                            loc='inside',
+                            boxPairList=p_vals,
+                            linewidth=2, fontsize="medium",
+                            lineYOffsetToBoxAxesCoord=0.1)
+    g.set_ylabels("GC content")
+    g.set_xlabels("")
+    g.fig.subplots_adjust(top=0.9)
+    g.fig.suptitle(title + " and their surrounding regions "
+                           f"of {environment} nucleotides")
+    g.savefig(outfile)
+
+
+def create_barplot(beds: List[Path], bed_names: List[str],
+                   genome: str, environment: int,
+                   ft_name: str = "interval") -> None:
+    """
+
+    :param beds: A list of bed files containing the regions to visualise
+    :param bed_names: A list of names identifying regions insides the given \
+    beds.
+    :param genome: A Fasta file containing a genome.
+    :param environment: Number of nucleotide around the genomic interval for \
+    which we want to know the gc content
+    :param ft_name: A name corresponding to the feature of interest
+    """
+    bed_content = load_beds(beds, bed_names)
+    dic_seq = Fasta(genome)
+    df = build_gc_dataframe(bed_content, dic_seq, environment, ft_name)
+    outfile = ConfigGC.output / \
+              f"gc_content_{'-'.join(bed_names)}_env_{environment}_nt.pdf"
+    outfile.parent.mkdir(exist_ok=True, parents=True)
+    make_gc_barplot(df, outfile, environment, bed_names, ft_name)
+
+
+if __name__ == "__main__":
+    testmod()
diff --git a/src/gc_content/gc_stats.py b/src/gc_content/gc_stats.py
new file mode 100644
index 0000000..778022b
--- /dev/null
+++ b/src/gc_content/gc_stats.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: The goal of this script is make a statistical analysis
+"""
+
+import pandas as pd
+from typing import List, Any
+from scipy.stats import mannwhitneyu
+from itertools import combinations
+
+
+def mann_whitney(df: pd.DataFrame) -> List[Any]:
+    """
+    Return the man withney test comparing the gc content and the different \
+    regions in the dataframe.
+
+    :param df: A dataframe of gc content
+    :return: The name of the groups and their p-value
+    """
+    regions = df['region'].unique()
+    for r1, r2 in combinations(regions, 2):
+        v1 = df.loc[df['region'] == r1, "gc_content"]
+        v2 = df.loc[df['region'] == r2, "gc_content"]
+        return [r1, r2, mannwhitneyu(v1, v2)[-1]]
+
+
+def make_stat(df: pd.DataFrame) -> List[List[Any]]:
+    """
+
+    :param df: A dataframe of gc content
+    :return: The list of pvalues of interest
+    """
+    if len(df['location'].unique()) == 1:
+        return [mann_whitney(df)]
+    else:
+        list_pval = []
+        for loc in df['location'].unique():
+            df_tmp = df.loc[df["location"] == loc, :]
+            res = mann_whitney(df_tmp)
+            res[0] = (loc, res[0])
+            res[1] = (loc, res[1])
+            list_pval.append(res)
+        return list_pval
diff --git a/src/gc_content/stat_annot.py b/src/gc_content/stat_annot.py
new file mode 100644
index 0000000..6223c48
--- /dev/null
+++ b/src/gc_content/stat_annot.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+
+# -*- coding;: utf-8 -*-
+
+"""
+Description:
+    Add annotation on figure.
+    Script adapted from
+    github.com/webermarcolivier/statannot/blob/master/statannot/statannot.py.
+"""
+
+import matplotlib.pyplot as plt
+from matplotlib import lines
+import matplotlib.transforms as mtransforms
+from matplotlib.font_manager import FontProperties
+import numpy as np
+import seaborn as sns
+from seaborn.utils import remove_na
+
+
+def add_stat_annotation(ax, data=None, x=None, y=None, hue=None, order=None,
+                        hue_order=None, boxPairList=None, loc='inside',
+                        useFixedOffset=False, lineYOffsetToBoxAxesCoord=None,
+                        lineYOffsetAxesCoord=None, lineHeightAxesCoord=0.02,
+                        textYOffsetPoints=1, color='0.2', linewidth=1.5,
+                        fontsize='medium', verbose=1):
+    """
+    User should use the same argument for the data, x, y, hue, order, \
+    hue_order as the seaborn boxplot function.
+    boxPairList can be of either form:
+    boxplot: [(cat1, cat2, pval), (cat3, cat4, pval)]
+    """
+
+    def find_x_position_box(boxPlotter, boxName):
+        """
+        boxName can be either a name "cat" or a tuple ("cat", "hue")
+        """
+        if boxPlotter.plot_hues is None:
+            cat = boxName
+            hueOffset = 0
+        else:
+            cat = boxName[0]
+            hue = boxName[1]
+            hueOffset = boxPlotter.hue_offsets[
+                boxPlotter.hue_names.index(hue)]
+
+        groupPos = boxPlotter.group_names.index(cat)
+        boxPos = groupPos + hueOffset
+        return boxPos
+
+    def get_box_data(boxPlotter, boxName):
+        """
+        boxName can be either a name "cat" or a tuple ("cat", "hue")
+        Here we really have to duplicate seaborn code, because there is not
+        direct access to the
+        box_data in the BoxPlotter class.
+        """
+        cat = boxName
+        if boxPlotter.plot_hues is None:
+            box_max_l = []
+            for cat in boxPlotter.group_names:
+                i = boxPlotter.group_names.index(cat)
+                group_data = boxPlotter.plot_data[i]
+                box_data = remove_na(group_data)
+                box_max_l.append(np.max(box_data))
+            return max(box_max_l)
+        else:
+            i = boxPlotter.group_names.index(cat[0])
+            group_data = boxPlotter.plot_data[i]
+            box_data = []
+            for hue in boxPlotter.hue_names:
+                hue_mask = boxPlotter.plot_hues[i] == hue
+                box_data.append(np.max(remove_na(group_data[hue_mask])))
+            return max(box_data)
+
+    fig = plt.gcf()
+
+    validList = ['inside', 'outside']
+    if loc not in validList:
+        raise ValueError(f"loc value should be one of the following: "
+                         f"{', '.join(validList)}.")
+
+    # Create the same BoxPlotter object as seaborn's boxplot
+    boxPlotter = sns.categorical._BoxPlotter(x, y, hue, data, order, hue_order,
+                                             orient=None, width=.8, color=None,
+                                             palette=None, saturation=.75,
+                                             dodge=True, fliersize=5,
+                                             linewidth=None)
+    ylim = ax.get_ylim()
+    yRange = ylim[1] - ylim[0]
+
+    if lineYOffsetAxesCoord is None:
+        if loc == 'inside':
+            lineYOffsetAxesCoord = 0.005
+            if lineYOffsetToBoxAxesCoord is None:
+                lineYOffsetToBoxAxesCoord = 0.1
+        elif loc == 'outside':
+            lineYOffsetAxesCoord = 0.03
+            lineYOffsetToBoxAxesCoord = lineYOffsetAxesCoord
+    else:
+        if loc == 'inside':
+            if lineYOffsetToBoxAxesCoord is None:
+                lineYOffsetToBoxAxesCoord = 0.06
+        elif loc == 'outside':
+            lineYOffsetToBoxAxesCoord = lineYOffsetAxesCoord
+    yOffset = lineYOffsetAxesCoord * yRange
+    yOffsetToBox = lineYOffsetToBoxAxesCoord * yRange
+
+    yStack = []
+    annList = []
+    for box1, box2, pval in boxPairList:
+
+        groupNames = boxPlotter.group_names
+        cat1 = box1
+        cat2 = box2
+        if isinstance(cat1, tuple):
+            hue_names = boxPlotter.hue_names
+            valid = cat1[0] in groupNames and cat2[0] in groupNames and \
+                    cat1[1] in hue_names and cat2[1] in hue_names
+        else:
+            valid = cat1 in groupNames and cat2 in groupNames
+
+        if valid:
+            # Get position of boxes
+            x1 = find_x_position_box(boxPlotter, box1)
+            x2 = find_x_position_box(boxPlotter, box2)
+            box_data1 = get_box_data(boxPlotter, box1)
+            box_data2 = get_box_data(boxPlotter, box2)
+            ymax1 = box_data1
+            ymax2 = box_data2
+
+            if pval > 1e-16:
+                text = "p = {:.2e}".format(pval)
+            else:
+                text = "p < 1e-16"
+
+            if loc == 'inside':
+                yRef = max(ymax1, ymax2)
+            else:
+                yRef = ylim[1]
+            if len(yStack) > 0 and isinstance(box1, str):
+                yRef2 = max(yRef, max(yStack))
+            else:
+                yRef2 = yRef
+
+            if len(yStack) == 0 or not isinstance(box1, str):
+                y = yRef2 + yOffsetToBox
+            else:
+                y = yRef2 + yOffset
+            h = lineHeightAxesCoord * yRange
+            lineX, lineY = [x1, x1, x2, x2], [y, y + h, y + h, y]
+            if loc == 'inside':
+                ax.plot(lineX, lineY, lw=linewidth, c=color)
+            elif loc == 'outside':
+                line = lines.Line2D(lineX, lineY, lw=linewidth, c=color,
+                                    transform=ax.transData)
+                line.set_clip_on(False)
+                ax.add_line(line)
+
+            if text is not None:
+                ann = ax.annotate(text, xy=(np.mean([x1, x2]), y + h),
+                                  xytext=(0, textYOffsetPoints),
+                                  textcoords='offset points',
+                                  xycoords='data', ha='center', va='bottom',
+                                  fontsize=fontsize,
+                                  clip_on=False, annotation_clip=False)
+                annList.append(ann)
+
+            new_max_ylim = 1.1 * (y + h)
+            if new_max_ylim > ylim[1]:
+                ax.set_ylim((ylim[0], 1.1 * (y + h)))
+
+            if text is not None:
+                plt.draw()
+                yTopAnnot = None
+                gotMatplotlibError = False
+                if not useFixedOffset:
+                    try:
+                        bbox = ann.get_window_extent()
+                        bbox_data = bbox.transformed(ax.transData.inverted())
+                        yTopAnnot = bbox_data.ymax
+                    except RuntimeError:
+                        gotMatplotlibError = True
+
+                if useFixedOffset or gotMatplotlibError:
+                    if verbose >= 1:
+                        print("Warning: cannot get the text bounding box. "
+                              "Falling back to a fixed y offset. Layout may "
+                              "be not optimal.")
+                    # We will apply a fixed offset in points, based on the font size of the annotation.
+                    fontsizePoints = FontProperties(
+                        size='medium').get_size_in_points()
+                    offsetTrans = mtransforms.offset_copy(ax.transData,
+                                                          fig=fig,
+                                                          x=0,
+                                                          y=1.0 * fontsizePoints + textYOffsetPoints,
+                                                          units='points')
+                    yTopDisplay = offsetTrans.transform((0, y + h))
+                    yTopAnnot = ax.transData.inverted().transform(yTopDisplay)[
+                        1]
+            else:
+                yTopAnnot = y + h
+
+            yStack.append(yTopAnnot)
+        else:
+            raise ValueError("boxPairList contains an unvalid box pair.")
+
+    yStackMax = max(yStack)
+    if loc == 'inside':
+        if ylim[1] < 1.03 * yStackMax:
+            ax.set_ylim((ylim[0], 1.03 * yStackMax))
+    elif loc == 'outside':
+        ax.set_ylim((ylim[0], ylim[1]))
+    return ax
-- 
GitLab