diff --git a/src/gc_content/__init__.py b/src/gc_content/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..60f3a118d41cf7494b1cd067bf5d649b42ba1e05 --- /dev/null +++ b/src/gc_content/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: +""" diff --git a/src/gc_content/__main__.py b/src/gc_content/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..143a692eaf57a30103d44b3b2a5ab803a941a35d --- /dev/null +++ b/src/gc_content/__main__.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: Launch the creation of GC barplots +""" + +from .gc_content import create_barplot +from typing import List +from pathlib import Path +import lazyparser as lp + + +@lp.parse(beds="file", genome="file", environment="environment >= 0") +def launcher(beds: List[str], bed_names: List[str], + genome: str, environment: int = 0, + ft_name: str = "interval") -> None: + """ + + :param beds: A list of bed files containing the regions to visualise + :param bed_names: A list of names identifying regions insides the given \ + beds. + :param genome: A Fasta file containing a genome. + :param environment: Number of nucleotides around the genomic intervals \ + in the beds files for which we want to know the gc content (default 0) + :param ft_name: A name corresponding to the feature of interest \ + (default 'interval') + """ + if len(bed_names) != len(beds): + raise IndexError("--bed_names and --beds should contains the same " + "number of items") + beds = [Path(b) for b in beds] + create_barplot(beds, bed_names, genome, environment, ft_name) + + +launcher() diff --git a/src/gc_content/config.py b/src/gc_content/config.py new file mode 100644 index 0000000000000000000000000000000000000000..70367895abe1cbfd95f48964bbaf41bdb04dd6f3 --- /dev/null +++ b/src/gc_content/config.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: contains variables used in this modules +""" + +from ..bed_handler.config import TestConfig +from pathlib import Path + + +class ConfigGC: + """ + contains the folder where the gc barplot will be created + """ + output = Path(__file__).parents[2] / "results" / "gc_barplots" diff --git a/src/gc_content/gc_content.py b/src/gc_content/gc_content.py new file mode 100644 index 0000000000000000000000000000000000000000..0ffe299c8a6a7dd9ac2dd73a3833609dc8a46532 --- /dev/null +++ b/src/gc_content/gc_content.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: The goal of this script is to create a barplot indicating \ +the gc content of list of exons +""" + +from .config import TestConfig, ConfigGC +from ..visu.figure_maker import load_beds +from typing import List, Any, Dict +from pyfaidx import Fasta +from Bio.Seq import Seq +from doctest import testmod +import pandas as pd +import numpy as np +from pathlib import Path +import seaborn as sns +from .gc_stats import make_stat +from .stat_annot import add_stat_annotation + + +def get_gc_content(bed_line: List[Any], dic_seq: Fasta) -> float: + """ + Get the gc content of a genomic interval. + + :param bed_line: A genomic interval in a bed format + :param dic_seq: A dictionary containing chromosomal sequences + :return: The gc content + + >>> get_gc_content(["chr1", 0, 10, "s1", ".", "+"], + ... Fasta(str(TestConfig.test_fasta))) + 100.0 + >>> get_gc_content(["chr1", 10, 20, "s1", ".", "+"], + ... Fasta(str(TestConfig.test_fasta))) + 50.0 + >>> get_gc_content(["chr1", 20, 30, "s1", ".", "+"], + ... Fasta(str(TestConfig.test_fasta))) + 0.0 + >>> get_gc_content(["chr1", 0, 10, "s1", ".", "-"], + ... Fasta(str(TestConfig.test_fasta))) + 100.0 + >>> get_gc_content(["chr1", 10, 20, "s1", ".", "-"], + ... Fasta(str(TestConfig.test_fasta))) + 50.0 + >>> get_gc_content(["chr1", 20, 30, "s1", ".", "-"], + ... Fasta(str(TestConfig.test_fasta))) + 0.0 + """ + seq = str(dic_seq[bed_line[0]][bed_line[1]:bed_line[2]]).upper() + if "N" in seq: + return np.nan + if bed_line[5] == "-": + seq = str(Seq(seq).reverse_complement()) + return (seq.count("G") + seq.count("C")) / len(seq) * 100 + + +def get_many_gc_content(bed_line: List[Any], dic_seq: Fasta, + environment: int) -> Dict: + """ + Get the GC content of a given genomic interval and the GC content \ + around this interval. + + :param bed_line: A genomic interval in a bed format + :param dic_seq: A dictionary containing chromosomal sequences + :param environment: The number of nucleotides around the interval for \ + which we want to get the gc content + :return: A dictionary containing the GC content inside the interval \ + we want to get and around it. + + >>> get_many_gc_content(["chr1", 10, 20, "s1", ".", "+"], + ... Fasta(str(TestConfig.test_fasta)), 0) + {'interval': 50.0} + >>> get_many_gc_content(["chr1", 10, 20, "s1", ".", "+"], + ... Fasta(str(TestConfig.test_fasta)), 10) + {'before': 100.0, 'interval': 50.0, 'after': 0.0} + >>> get_many_gc_content(["chr1", 10, 20, "s1", ".", "-"], + ... Fasta(str(TestConfig.test_fasta)), 10) + {'before': 0.0, 'interval': 50.0, 'after': 100.0} + """ + gc_interval = get_gc_content(bed_line, dic_seq) + if environment == 0: + return {"interval": gc_interval} + gc_interval = get_gc_content(bed_line, dic_seq) + before_val = max(0, bed_line[1] - environment) + before_interval = get_gc_content([bed_line[0], before_val, + bed_line[1]] + bed_line[3:], dic_seq) + after_val = min(len(dic_seq[bed_line[0]]), bed_line[2] + environment) + after_interval = get_gc_content([bed_line[0], bed_line[2], + after_val] + + bed_line[3:], dic_seq) + if bed_line[5] == "-": + tmp = before_interval + before_interval = after_interval + after_interval = tmp + return {"before": before_interval, "interval": gc_interval, + "after": after_interval} + + +def build_gc_dataframe(bed_content: List[List], dic_seq: Fasta, + environment: int, ft_name: str = "interval" + ) -> pd.DataFrame: + """ + Create a dataframe containing the GC content of many bed files and \ + their environment. + + :param bed_content: A list of genomic intervals of interest + :param dic_seq: A dictionary containing genomic sequence + :param environment: Number of nucleotide around the genomic interval for \ + which we want to know the gc content + :param ft_name: A name corresponding to the feature of interest + :return: a table containing the gc content of the intervals + """ + dic = {"gc_content": [], "region": [], "location": []} + for bed_line in bed_content: + res = get_many_gc_content(bed_line, dic_seq, + environment) + for k in res: + dic["gc_content"].append(res[k]) + dic["location"].append(k.replace("interval", ft_name)) + dic["region"].append(bed_line[6]) + return pd.DataFrame(dic) + + +def make_gc_barplot(df: pd.DataFrame, outfile: Path, environment: int, + region_names: List[str], ft_name: str) -> None: + """ + Create a barplot showing the gc content in the given lists of intervals. + + :param df: A dataframe containing the gc content of genomics intervals + :param outfile: The output file of interest + :param environment: Number of nucleotide around the genomic interval for \ + which we want to know the gc content + :param region_names: The name of the regions analysed + :param ft_name: The name of the feature analyzed + """ + sns.set(context="poster") + rgn = ", ".join(region_names[:-1]) + " and " + region_names[-1] + title = f"GC content of {rgn} {ft_name}" + p_vals = make_stat(df) + if environment != 0: + g = sns.catplot(x="location", y="gc_content", hue="region", data=df, + aspect=1.77, height=12, kind="violin") + add_stat_annotation(g.ax, data=df, x="location", y="gc_content", + hue="region", + loc='inside', + boxPairList=p_vals, + linewidth=2, fontsize="medium", + lineYOffsetAxesCoord=0, + lineYOffsetToBoxAxesCoord=0.1) + else: + g = sns.catplot(x="region", y="gc_content", data=df, + aspect=1.77, height=12, kind="violin") + add_stat_annotation(g.ax, data=df, x="region", y="gc_content", + loc='inside', + boxPairList=p_vals, + linewidth=2, fontsize="medium", + lineYOffsetToBoxAxesCoord=0.1) + g.set_ylabels("GC content") + g.set_xlabels("") + g.fig.subplots_adjust(top=0.9) + g.fig.suptitle(title + " and their surrounding regions " + f"of {environment} nucleotides") + g.savefig(outfile) + + +def create_barplot(beds: List[Path], bed_names: List[str], + genome: str, environment: int, + ft_name: str = "interval") -> None: + """ + + :param beds: A list of bed files containing the regions to visualise + :param bed_names: A list of names identifying regions insides the given \ + beds. + :param genome: A Fasta file containing a genome. + :param environment: Number of nucleotide around the genomic interval for \ + which we want to know the gc content + :param ft_name: A name corresponding to the feature of interest + """ + bed_content = load_beds(beds, bed_names) + dic_seq = Fasta(genome) + df = build_gc_dataframe(bed_content, dic_seq, environment, ft_name) + outfile = ConfigGC.output / \ + f"gc_content_{'-'.join(bed_names)}_env_{environment}_nt.pdf" + outfile.parent.mkdir(exist_ok=True, parents=True) + make_gc_barplot(df, outfile, environment, bed_names, ft_name) + + +if __name__ == "__main__": + testmod() diff --git a/src/gc_content/gc_stats.py b/src/gc_content/gc_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..778022b76b16b234592b7b247d84fbc3bad24734 --- /dev/null +++ b/src/gc_content/gc_stats.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 + +# -*- coding: UTF-8 -*- + +""" +Description: The goal of this script is make a statistical analysis +""" + +import pandas as pd +from typing import List, Any +from scipy.stats import mannwhitneyu +from itertools import combinations + + +def mann_whitney(df: pd.DataFrame) -> List[Any]: + """ + Return the man withney test comparing the gc content and the different \ + regions in the dataframe. + + :param df: A dataframe of gc content + :return: The name of the groups and their p-value + """ + regions = df['region'].unique() + for r1, r2 in combinations(regions, 2): + v1 = df.loc[df['region'] == r1, "gc_content"] + v2 = df.loc[df['region'] == r2, "gc_content"] + return [r1, r2, mannwhitneyu(v1, v2)[-1]] + + +def make_stat(df: pd.DataFrame) -> List[List[Any]]: + """ + + :param df: A dataframe of gc content + :return: The list of pvalues of interest + """ + if len(df['location'].unique()) == 1: + return [mann_whitney(df)] + else: + list_pval = [] + for loc in df['location'].unique(): + df_tmp = df.loc[df["location"] == loc, :] + res = mann_whitney(df_tmp) + res[0] = (loc, res[0]) + res[1] = (loc, res[1]) + list_pval.append(res) + return list_pval diff --git a/src/gc_content/stat_annot.py b/src/gc_content/stat_annot.py new file mode 100644 index 0000000000000000000000000000000000000000..6223c48fd82a50a10c841facc6a8af9c7807e2c0 --- /dev/null +++ b/src/gc_content/stat_annot.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 + +# -*- coding;: utf-8 -*- + +""" +Description: + Add annotation on figure. + Script adapted from + github.com/webermarcolivier/statannot/blob/master/statannot/statannot.py. +""" + +import matplotlib.pyplot as plt +from matplotlib import lines +import matplotlib.transforms as mtransforms +from matplotlib.font_manager import FontProperties +import numpy as np +import seaborn as sns +from seaborn.utils import remove_na + + +def add_stat_annotation(ax, data=None, x=None, y=None, hue=None, order=None, + hue_order=None, boxPairList=None, loc='inside', + useFixedOffset=False, lineYOffsetToBoxAxesCoord=None, + lineYOffsetAxesCoord=None, lineHeightAxesCoord=0.02, + textYOffsetPoints=1, color='0.2', linewidth=1.5, + fontsize='medium', verbose=1): + """ + User should use the same argument for the data, x, y, hue, order, \ + hue_order as the seaborn boxplot function. + boxPairList can be of either form: + boxplot: [(cat1, cat2, pval), (cat3, cat4, pval)] + """ + + def find_x_position_box(boxPlotter, boxName): + """ + boxName can be either a name "cat" or a tuple ("cat", "hue") + """ + if boxPlotter.plot_hues is None: + cat = boxName + hueOffset = 0 + else: + cat = boxName[0] + hue = boxName[1] + hueOffset = boxPlotter.hue_offsets[ + boxPlotter.hue_names.index(hue)] + + groupPos = boxPlotter.group_names.index(cat) + boxPos = groupPos + hueOffset + return boxPos + + def get_box_data(boxPlotter, boxName): + """ + boxName can be either a name "cat" or a tuple ("cat", "hue") + Here we really have to duplicate seaborn code, because there is not + direct access to the + box_data in the BoxPlotter class. + """ + cat = boxName + if boxPlotter.plot_hues is None: + box_max_l = [] + for cat in boxPlotter.group_names: + i = boxPlotter.group_names.index(cat) + group_data = boxPlotter.plot_data[i] + box_data = remove_na(group_data) + box_max_l.append(np.max(box_data)) + return max(box_max_l) + else: + i = boxPlotter.group_names.index(cat[0]) + group_data = boxPlotter.plot_data[i] + box_data = [] + for hue in boxPlotter.hue_names: + hue_mask = boxPlotter.plot_hues[i] == hue + box_data.append(np.max(remove_na(group_data[hue_mask]))) + return max(box_data) + + fig = plt.gcf() + + validList = ['inside', 'outside'] + if loc not in validList: + raise ValueError(f"loc value should be one of the following: " + f"{', '.join(validList)}.") + + # Create the same BoxPlotter object as seaborn's boxplot + boxPlotter = sns.categorical._BoxPlotter(x, y, hue, data, order, hue_order, + orient=None, width=.8, color=None, + palette=None, saturation=.75, + dodge=True, fliersize=5, + linewidth=None) + ylim = ax.get_ylim() + yRange = ylim[1] - ylim[0] + + if lineYOffsetAxesCoord is None: + if loc == 'inside': + lineYOffsetAxesCoord = 0.005 + if lineYOffsetToBoxAxesCoord is None: + lineYOffsetToBoxAxesCoord = 0.1 + elif loc == 'outside': + lineYOffsetAxesCoord = 0.03 + lineYOffsetToBoxAxesCoord = lineYOffsetAxesCoord + else: + if loc == 'inside': + if lineYOffsetToBoxAxesCoord is None: + lineYOffsetToBoxAxesCoord = 0.06 + elif loc == 'outside': + lineYOffsetToBoxAxesCoord = lineYOffsetAxesCoord + yOffset = lineYOffsetAxesCoord * yRange + yOffsetToBox = lineYOffsetToBoxAxesCoord * yRange + + yStack = [] + annList = [] + for box1, box2, pval in boxPairList: + + groupNames = boxPlotter.group_names + cat1 = box1 + cat2 = box2 + if isinstance(cat1, tuple): + hue_names = boxPlotter.hue_names + valid = cat1[0] in groupNames and cat2[0] in groupNames and \ + cat1[1] in hue_names and cat2[1] in hue_names + else: + valid = cat1 in groupNames and cat2 in groupNames + + if valid: + # Get position of boxes + x1 = find_x_position_box(boxPlotter, box1) + x2 = find_x_position_box(boxPlotter, box2) + box_data1 = get_box_data(boxPlotter, box1) + box_data2 = get_box_data(boxPlotter, box2) + ymax1 = box_data1 + ymax2 = box_data2 + + if pval > 1e-16: + text = "p = {:.2e}".format(pval) + else: + text = "p < 1e-16" + + if loc == 'inside': + yRef = max(ymax1, ymax2) + else: + yRef = ylim[1] + if len(yStack) > 0 and isinstance(box1, str): + yRef2 = max(yRef, max(yStack)) + else: + yRef2 = yRef + + if len(yStack) == 0 or not isinstance(box1, str): + y = yRef2 + yOffsetToBox + else: + y = yRef2 + yOffset + h = lineHeightAxesCoord * yRange + lineX, lineY = [x1, x1, x2, x2], [y, y + h, y + h, y] + if loc == 'inside': + ax.plot(lineX, lineY, lw=linewidth, c=color) + elif loc == 'outside': + line = lines.Line2D(lineX, lineY, lw=linewidth, c=color, + transform=ax.transData) + line.set_clip_on(False) + ax.add_line(line) + + if text is not None: + ann = ax.annotate(text, xy=(np.mean([x1, x2]), y + h), + xytext=(0, textYOffsetPoints), + textcoords='offset points', + xycoords='data', ha='center', va='bottom', + fontsize=fontsize, + clip_on=False, annotation_clip=False) + annList.append(ann) + + new_max_ylim = 1.1 * (y + h) + if new_max_ylim > ylim[1]: + ax.set_ylim((ylim[0], 1.1 * (y + h))) + + if text is not None: + plt.draw() + yTopAnnot = None + gotMatplotlibError = False + if not useFixedOffset: + try: + bbox = ann.get_window_extent() + bbox_data = bbox.transformed(ax.transData.inverted()) + yTopAnnot = bbox_data.ymax + except RuntimeError: + gotMatplotlibError = True + + if useFixedOffset or gotMatplotlibError: + if verbose >= 1: + print("Warning: cannot get the text bounding box. " + "Falling back to a fixed y offset. Layout may " + "be not optimal.") + # We will apply a fixed offset in points, based on the font size of the annotation. + fontsizePoints = FontProperties( + size='medium').get_size_in_points() + offsetTrans = mtransforms.offset_copy(ax.transData, + fig=fig, + x=0, + y=1.0 * fontsizePoints + textYOffsetPoints, + units='points') + yTopDisplay = offsetTrans.transform((0, y + h)) + yTopAnnot = ax.transData.inverted().transform(yTopDisplay)[ + 1] + else: + yTopAnnot = y + h + + yStack.append(yTopAnnot) + else: + raise ValueError("boxPairList contains an unvalid box pair.") + + yStackMax = max(yStack) + if loc == 'inside': + if ylim[1] < 1.03 * yStackMax: + ax.set_ylim((ylim[0], 1.03 * yStackMax)) + elif loc == 'outside': + ax.set_ylim((ylim[0], ylim[1])) + return ax