Skip to content
Snippets Groups Projects
Commit 52a5208a authored by nfontrod's avatar nfontrod
Browse files

src/gc_content/*.py: script to compute the gc content from bed files

parent 4d17cd23
Branches
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description:
"""
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: Launch the creation of GC barplots
"""
from .gc_content import create_barplot
from typing import List
from pathlib import Path
import lazyparser as lp
@lp.parse(beds="file", genome="file", environment="environment >= 0")
def launcher(beds: List[str], bed_names: List[str],
genome: str, environment: int = 0,
ft_name: str = "interval") -> None:
"""
:param beds: A list of bed files containing the regions to visualise
:param bed_names: A list of names identifying regions insides the given \
beds.
:param genome: A Fasta file containing a genome.
:param environment: Number of nucleotides around the genomic intervals \
in the beds files for which we want to know the gc content (default 0)
:param ft_name: A name corresponding to the feature of interest \
(default 'interval')
"""
if len(bed_names) != len(beds):
raise IndexError("--bed_names and --beds should contains the same "
"number of items")
beds = [Path(b) for b in beds]
create_barplot(beds, bed_names, genome, environment, ft_name)
launcher()
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: contains variables used in this modules
"""
from ..bed_handler.config import TestConfig
from pathlib import Path
class ConfigGC:
"""
contains the folder where the gc barplot will be created
"""
output = Path(__file__).parents[2] / "results" / "gc_barplots"
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: The goal of this script is to create a barplot indicating \
the gc content of list of exons
"""
from .config import TestConfig, ConfigGC
from ..visu.figure_maker import load_beds
from typing import List, Any, Dict
from pyfaidx import Fasta
from Bio.Seq import Seq
from doctest import testmod
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns
from .gc_stats import make_stat
from .stat_annot import add_stat_annotation
def get_gc_content(bed_line: List[Any], dic_seq: Fasta) -> float:
"""
Get the gc content of a genomic interval.
:param bed_line: A genomic interval in a bed format
:param dic_seq: A dictionary containing chromosomal sequences
:return: The gc content
>>> get_gc_content(["chr1", 0, 10, "s1", ".", "+"],
... Fasta(str(TestConfig.test_fasta)))
100.0
>>> get_gc_content(["chr1", 10, 20, "s1", ".", "+"],
... Fasta(str(TestConfig.test_fasta)))
50.0
>>> get_gc_content(["chr1", 20, 30, "s1", ".", "+"],
... Fasta(str(TestConfig.test_fasta)))
0.0
>>> get_gc_content(["chr1", 0, 10, "s1", ".", "-"],
... Fasta(str(TestConfig.test_fasta)))
100.0
>>> get_gc_content(["chr1", 10, 20, "s1", ".", "-"],
... Fasta(str(TestConfig.test_fasta)))
50.0
>>> get_gc_content(["chr1", 20, 30, "s1", ".", "-"],
... Fasta(str(TestConfig.test_fasta)))
0.0
"""
seq = str(dic_seq[bed_line[0]][bed_line[1]:bed_line[2]]).upper()
if "N" in seq:
return np.nan
if bed_line[5] == "-":
seq = str(Seq(seq).reverse_complement())
return (seq.count("G") + seq.count("C")) / len(seq) * 100
def get_many_gc_content(bed_line: List[Any], dic_seq: Fasta,
environment: int) -> Dict:
"""
Get the GC content of a given genomic interval and the GC content \
around this interval.
:param bed_line: A genomic interval in a bed format
:param dic_seq: A dictionary containing chromosomal sequences
:param environment: The number of nucleotides around the interval for \
which we want to get the gc content
:return: A dictionary containing the GC content inside the interval \
we want to get and around it.
>>> get_many_gc_content(["chr1", 10, 20, "s1", ".", "+"],
... Fasta(str(TestConfig.test_fasta)), 0)
{'interval': 50.0}
>>> get_many_gc_content(["chr1", 10, 20, "s1", ".", "+"],
... Fasta(str(TestConfig.test_fasta)), 10)
{'before': 100.0, 'interval': 50.0, 'after': 0.0}
>>> get_many_gc_content(["chr1", 10, 20, "s1", ".", "-"],
... Fasta(str(TestConfig.test_fasta)), 10)
{'before': 0.0, 'interval': 50.0, 'after': 100.0}
"""
gc_interval = get_gc_content(bed_line, dic_seq)
if environment == 0:
return {"interval": gc_interval}
gc_interval = get_gc_content(bed_line, dic_seq)
before_val = max(0, bed_line[1] - environment)
before_interval = get_gc_content([bed_line[0], before_val,
bed_line[1]] + bed_line[3:], dic_seq)
after_val = min(len(dic_seq[bed_line[0]]), bed_line[2] + environment)
after_interval = get_gc_content([bed_line[0], bed_line[2],
after_val] +
bed_line[3:], dic_seq)
if bed_line[5] == "-":
tmp = before_interval
before_interval = after_interval
after_interval = tmp
return {"before": before_interval, "interval": gc_interval,
"after": after_interval}
def build_gc_dataframe(bed_content: List[List], dic_seq: Fasta,
environment: int, ft_name: str = "interval"
) -> pd.DataFrame:
"""
Create a dataframe containing the GC content of many bed files and \
their environment.
:param bed_content: A list of genomic intervals of interest
:param dic_seq: A dictionary containing genomic sequence
:param environment: Number of nucleotide around the genomic interval for \
which we want to know the gc content
:param ft_name: A name corresponding to the feature of interest
:return: a table containing the gc content of the intervals
"""
dic = {"gc_content": [], "region": [], "location": []}
for bed_line in bed_content:
res = get_many_gc_content(bed_line, dic_seq,
environment)
for k in res:
dic["gc_content"].append(res[k])
dic["location"].append(k.replace("interval", ft_name))
dic["region"].append(bed_line[6])
return pd.DataFrame(dic)
def make_gc_barplot(df: pd.DataFrame, outfile: Path, environment: int,
region_names: List[str], ft_name: str) -> None:
"""
Create a barplot showing the gc content in the given lists of intervals.
:param df: A dataframe containing the gc content of genomics intervals
:param outfile: The output file of interest
:param environment: Number of nucleotide around the genomic interval for \
which we want to know the gc content
:param region_names: The name of the regions analysed
:param ft_name: The name of the feature analyzed
"""
sns.set(context="poster")
rgn = ", ".join(region_names[:-1]) + " and " + region_names[-1]
title = f"GC content of {rgn} {ft_name}"
p_vals = make_stat(df)
if environment != 0:
g = sns.catplot(x="location", y="gc_content", hue="region", data=df,
aspect=1.77, height=12, kind="violin")
add_stat_annotation(g.ax, data=df, x="location", y="gc_content",
hue="region",
loc='inside',
boxPairList=p_vals,
linewidth=2, fontsize="medium",
lineYOffsetAxesCoord=0,
lineYOffsetToBoxAxesCoord=0.1)
else:
g = sns.catplot(x="region", y="gc_content", data=df,
aspect=1.77, height=12, kind="violin")
add_stat_annotation(g.ax, data=df, x="region", y="gc_content",
loc='inside',
boxPairList=p_vals,
linewidth=2, fontsize="medium",
lineYOffsetToBoxAxesCoord=0.1)
g.set_ylabels("GC content")
g.set_xlabels("")
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle(title + " and their surrounding regions "
f"of {environment} nucleotides")
g.savefig(outfile)
def create_barplot(beds: List[Path], bed_names: List[str],
genome: str, environment: int,
ft_name: str = "interval") -> None:
"""
:param beds: A list of bed files containing the regions to visualise
:param bed_names: A list of names identifying regions insides the given \
beds.
:param genome: A Fasta file containing a genome.
:param environment: Number of nucleotide around the genomic interval for \
which we want to know the gc content
:param ft_name: A name corresponding to the feature of interest
"""
bed_content = load_beds(beds, bed_names)
dic_seq = Fasta(genome)
df = build_gc_dataframe(bed_content, dic_seq, environment, ft_name)
outfile = ConfigGC.output / \
f"gc_content_{'-'.join(bed_names)}_env_{environment}_nt.pdf"
outfile.parent.mkdir(exist_ok=True, parents=True)
make_gc_barplot(df, outfile, environment, bed_names, ft_name)
if __name__ == "__main__":
testmod()
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description: The goal of this script is make a statistical analysis
"""
import pandas as pd
from typing import List, Any
from scipy.stats import mannwhitneyu
from itertools import combinations
def mann_whitney(df: pd.DataFrame) -> List[Any]:
"""
Return the man withney test comparing the gc content and the different \
regions in the dataframe.
:param df: A dataframe of gc content
:return: The name of the groups and their p-value
"""
regions = df['region'].unique()
for r1, r2 in combinations(regions, 2):
v1 = df.loc[df['region'] == r1, "gc_content"]
v2 = df.loc[df['region'] == r2, "gc_content"]
return [r1, r2, mannwhitneyu(v1, v2)[-1]]
def make_stat(df: pd.DataFrame) -> List[List[Any]]:
"""
:param df: A dataframe of gc content
:return: The list of pvalues of interest
"""
if len(df['location'].unique()) == 1:
return [mann_whitney(df)]
else:
list_pval = []
for loc in df['location'].unique():
df_tmp = df.loc[df["location"] == loc, :]
res = mann_whitney(df_tmp)
res[0] = (loc, res[0])
res[1] = (loc, res[1])
list_pval.append(res)
return list_pval
#!/usr/bin/env python3
# -*- coding;: utf-8 -*-
"""
Description:
Add annotation on figure.
Script adapted from
github.com/webermarcolivier/statannot/blob/master/statannot/statannot.py.
"""
import matplotlib.pyplot as plt
from matplotlib import lines
import matplotlib.transforms as mtransforms
from matplotlib.font_manager import FontProperties
import numpy as np
import seaborn as sns
from seaborn.utils import remove_na
def add_stat_annotation(ax, data=None, x=None, y=None, hue=None, order=None,
hue_order=None, boxPairList=None, loc='inside',
useFixedOffset=False, lineYOffsetToBoxAxesCoord=None,
lineYOffsetAxesCoord=None, lineHeightAxesCoord=0.02,
textYOffsetPoints=1, color='0.2', linewidth=1.5,
fontsize='medium', verbose=1):
"""
User should use the same argument for the data, x, y, hue, order, \
hue_order as the seaborn boxplot function.
boxPairList can be of either form:
boxplot: [(cat1, cat2, pval), (cat3, cat4, pval)]
"""
def find_x_position_box(boxPlotter, boxName):
"""
boxName can be either a name "cat" or a tuple ("cat", "hue")
"""
if boxPlotter.plot_hues is None:
cat = boxName
hueOffset = 0
else:
cat = boxName[0]
hue = boxName[1]
hueOffset = boxPlotter.hue_offsets[
boxPlotter.hue_names.index(hue)]
groupPos = boxPlotter.group_names.index(cat)
boxPos = groupPos + hueOffset
return boxPos
def get_box_data(boxPlotter, boxName):
"""
boxName can be either a name "cat" or a tuple ("cat", "hue")
Here we really have to duplicate seaborn code, because there is not
direct access to the
box_data in the BoxPlotter class.
"""
cat = boxName
if boxPlotter.plot_hues is None:
box_max_l = []
for cat in boxPlotter.group_names:
i = boxPlotter.group_names.index(cat)
group_data = boxPlotter.plot_data[i]
box_data = remove_na(group_data)
box_max_l.append(np.max(box_data))
return max(box_max_l)
else:
i = boxPlotter.group_names.index(cat[0])
group_data = boxPlotter.plot_data[i]
box_data = []
for hue in boxPlotter.hue_names:
hue_mask = boxPlotter.plot_hues[i] == hue
box_data.append(np.max(remove_na(group_data[hue_mask])))
return max(box_data)
fig = plt.gcf()
validList = ['inside', 'outside']
if loc not in validList:
raise ValueError(f"loc value should be one of the following: "
f"{', '.join(validList)}.")
# Create the same BoxPlotter object as seaborn's boxplot
boxPlotter = sns.categorical._BoxPlotter(x, y, hue, data, order, hue_order,
orient=None, width=.8, color=None,
palette=None, saturation=.75,
dodge=True, fliersize=5,
linewidth=None)
ylim = ax.get_ylim()
yRange = ylim[1] - ylim[0]
if lineYOffsetAxesCoord is None:
if loc == 'inside':
lineYOffsetAxesCoord = 0.005
if lineYOffsetToBoxAxesCoord is None:
lineYOffsetToBoxAxesCoord = 0.1
elif loc == 'outside':
lineYOffsetAxesCoord = 0.03
lineYOffsetToBoxAxesCoord = lineYOffsetAxesCoord
else:
if loc == 'inside':
if lineYOffsetToBoxAxesCoord is None:
lineYOffsetToBoxAxesCoord = 0.06
elif loc == 'outside':
lineYOffsetToBoxAxesCoord = lineYOffsetAxesCoord
yOffset = lineYOffsetAxesCoord * yRange
yOffsetToBox = lineYOffsetToBoxAxesCoord * yRange
yStack = []
annList = []
for box1, box2, pval in boxPairList:
groupNames = boxPlotter.group_names
cat1 = box1
cat2 = box2
if isinstance(cat1, tuple):
hue_names = boxPlotter.hue_names
valid = cat1[0] in groupNames and cat2[0] in groupNames and \
cat1[1] in hue_names and cat2[1] in hue_names
else:
valid = cat1 in groupNames and cat2 in groupNames
if valid:
# Get position of boxes
x1 = find_x_position_box(boxPlotter, box1)
x2 = find_x_position_box(boxPlotter, box2)
box_data1 = get_box_data(boxPlotter, box1)
box_data2 = get_box_data(boxPlotter, box2)
ymax1 = box_data1
ymax2 = box_data2
if pval > 1e-16:
text = "p = {:.2e}".format(pval)
else:
text = "p < 1e-16"
if loc == 'inside':
yRef = max(ymax1, ymax2)
else:
yRef = ylim[1]
if len(yStack) > 0 and isinstance(box1, str):
yRef2 = max(yRef, max(yStack))
else:
yRef2 = yRef
if len(yStack) == 0 or not isinstance(box1, str):
y = yRef2 + yOffsetToBox
else:
y = yRef2 + yOffset
h = lineHeightAxesCoord * yRange
lineX, lineY = [x1, x1, x2, x2], [y, y + h, y + h, y]
if loc == 'inside':
ax.plot(lineX, lineY, lw=linewidth, c=color)
elif loc == 'outside':
line = lines.Line2D(lineX, lineY, lw=linewidth, c=color,
transform=ax.transData)
line.set_clip_on(False)
ax.add_line(line)
if text is not None:
ann = ax.annotate(text, xy=(np.mean([x1, x2]), y + h),
xytext=(0, textYOffsetPoints),
textcoords='offset points',
xycoords='data', ha='center', va='bottom',
fontsize=fontsize,
clip_on=False, annotation_clip=False)
annList.append(ann)
new_max_ylim = 1.1 * (y + h)
if new_max_ylim > ylim[1]:
ax.set_ylim((ylim[0], 1.1 * (y + h)))
if text is not None:
plt.draw()
yTopAnnot = None
gotMatplotlibError = False
if not useFixedOffset:
try:
bbox = ann.get_window_extent()
bbox_data = bbox.transformed(ax.transData.inverted())
yTopAnnot = bbox_data.ymax
except RuntimeError:
gotMatplotlibError = True
if useFixedOffset or gotMatplotlibError:
if verbose >= 1:
print("Warning: cannot get the text bounding box. "
"Falling back to a fixed y offset. Layout may "
"be not optimal.")
# We will apply a fixed offset in points, based on the font size of the annotation.
fontsizePoints = FontProperties(
size='medium').get_size_in_points()
offsetTrans = mtransforms.offset_copy(ax.transData,
fig=fig,
x=0,
y=1.0 * fontsizePoints + textYOffsetPoints,
units='points')
yTopDisplay = offsetTrans.transform((0, y + h))
yTopAnnot = ax.transData.inverted().transform(yTopDisplay)[
1]
else:
yTopAnnot = y + h
yStack.append(yTopAnnot)
else:
raise ValueError("boxPairList contains an unvalid box pair.")
yStackMax = max(yStack)
if loc == 'inside':
if ylim[1] < 1.03 * yStackMax:
ax.set_ylim((ylim[0], 1.03 * yStackMax))
elif loc == 'outside':
ax.set_ylim((ylim[0], ylim[1]))
return ax
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment