Newer
Older
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Description:
"""
from pathlib import Path
from typing import List, Union, Any, Tuple, Optional
from doctest import testmod
from ..bed_handler.config import TestConfig
import pandas as pd
import pyBigWig as pbw
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

nfontrod
committed
from loguru import logger
import matplotlib as mpl
import matplotlib.font_manager
from rpy2.robjects import r, pandas2ri, FloatVector
from matplotlib import collections as mc
import numpy as np

nfontrod
committed
def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]:
"""
Read a bed file and return the lines within it.
:param bed: A bed file containing the regions of interest

nfontrod
committed
:param bed_name: The name of the regions of interest inside the bed file

nfontrod
committed
>>> load_bed(TestConfig.gene_bed, 'gene_test')[0]
['18', 28645943, 28682388, 1, 'DSC2', '-', 'gene_test']
"""
list_regions = []
with bed.open('r') as inbed:
for line in inbed:
if not line.startswith("#"):
cline = line.replace("\n", "").split("\t")
my_id = int(cline[3]) if cline[3].isdigit() else cline[3]

nfontrod
committed
bed_name])

nfontrod
committed
def load_beds(beds: List[Path], bed_names: List[str]
) -> List[List[Union[int, str]]]:
"""
Read a bed file and return the lines within it.
:param beds: A list of bed files containing the regions of interest
:param bed_names: A list of names indentifying regions insides the given \
beds.
:return:The list of feature inside the beds file
>>> load_beds([TestConfig.gene_bed, TestConfig.gene_bed],
... ['gene1', 'gene2'])[0]
['18', 28645943, 28682388, 1, 'DSC2', '-', 'gene1']
>>> load_beds([TestConfig.gene_bed, TestConfig.gene_bed],
... ['gene1', 'gene2'])[-1]
['13', 45967450, 45992516, 9, 'SLC25A30', '-', 'gene2']
"""
regions = []
for i in range(len(beds)):
regions += load_bed(beds[i], bed_names[i])
return regions
def inspect_bigwig_regions(bw: Any, region: List,
replicate: str, nb_bin: int, resize: List[int],
condition_name: str,
) -> pd.DataFrame:
"""
get the coverage value inside the bigwig region `region`.
:param bw: A opened bigwig file
:param region: The region of interest
:param replicate: The replicate name
:param nb_bin: The number of bin that will represent the region
:param resize: The number of nucleotide used to extend the region
in both side
:param condition_name: the name of the condition
:return: a table with the coverage of this region
>>> my_bw = pbw.open(str(TestConfig.small_bw))

nfontrod
committed
>>> mregion = ['1', 10, 25, 1, 'Test', '+', 'exon']
>>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [4, 2], 'cond1')
coverage bin condition replicate region
0 0.000000 -2 cond1 R1 exon
1 0.500000 -1 cond1 R1 exon
2 75.000000 0 cond1 R1 exon
3 20.000000 1 cond1 R1 exon
4 10.000000 2 cond1 R1 exon
5 4.666667 3 cond1 R1 exon
6 2.000000 4 cond1 R1 exon
7 1.000000 5 cond1 R1 exon
8 0.500000 6 cond1 R1 exon
>>> mregion = ['1', 110, 133, 1, 'Test', '-', 'exon2']
>>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [4, 2], 'cond1')
coverage bin condition replicate region
0 0.00 -2 cond1 R1 exon2
1 12.50 -1 cond1 R1 exon2
2 42.00 0 cond1 R1 exon2
3 8.00 1 cond1 R1 exon2
4 4.25 2 cond1 R1 exon2
5 2.00 3 cond1 R1 exon2
6 2.00 4 cond1 R1 exon2
7 1.00 5 cond1 R1 exon2
8 1.00 6 cond1 R1 exon2

nfontrod
committed
>>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [0, 0], 'cond1')
coverage bin condition replicate region
0 42.00 0 cond1 R1 exon2
1 8.00 1 cond1 R1 exon2
2 4.25 2 cond1 R1 exon2
3 2.00 3 cond1 R1 exon2
4 2.00 4 cond1 R1 exon2

nfontrod
committed
r_start = max(region[1], 0)
r_end = min(region[2], bw.chroms(region[0]))
if r_end - r_start < nb_bin:
return pd.DataFrame()

nfontrod
committed
val = bw.stats(region[0],
max(region[1], 0),
min(region[2], bw.chroms(region[0])),
nBins=nb_bin,
exact=True)
bins = list(range(len(val)))
if len(bins) != nb_bin:
raise ValueError("The lenght of bins should be equals to nb_bin")

nfontrod
committed
max_loc = max(region[1] - resize[0], 0)
min_loc = min(region[2] + resize[0], bw.chroms(region[0]))
if resize[0] > 0:
val_before = bw.stats(region[0], max_loc, region[1], nBins=resize[1],
exact=True)
val_after = bw.stats(region[0], region[2], min_loc, nBins=resize[1],
exact=True)

nfontrod
committed
else:
val_before = []
val_after = []
if None in val_after:
print(f"Warning ! None values found in {region} - "
f"{[region[0], region[2], min_loc]}")
if region[5] == "+":
bin_before = list(range(-len(val_before), 0))
bin_after = list(range(bins[-1] + 1,
bins[-1] + 1 + len(val_after)))
val = val_before + val + val_after
else:
bin_before = list(range(-len(val_after), 0))
bin_after = list(range(bins[-1] + 1,
bins[-1] + 1 + len(val_before)))
val = val_after[::-1] + val[::-1] + val_before[::-1]
bins = bin_before + bins + bin_after
dic = {"coverage": val, "bin": bins}
df = pd.DataFrame(dic)
df['condition'] = [condition_name] * df.shape[0]
df['replicate'] = [replicate] * df.shape[0]

nfontrod
committed
df['region'] = [region[6]] * df.shape[0]
return df
def create_sample_table(bw_file: Path, regions: List[List],
replicate: str, nb_bin: int, resize: List[int],
condition_name: str,
) -> pd.DataFrame:
"""
Get the table for all the regions of interest
:param bw_file: A bigwig file
:param regions: The regions of interest
:param replicate: The replicate name
:param nb_bin: The number of bin that will represent the region
:param resize: The number of nucleotide used to extend the region
in both side
:param condition_name: the name of the condition
:return: a table with the coverage of this region
"""
list_df = []
bw = pbw.open(str(bw_file))
for region in tqdm(regions, desc="scanning coverage ..."):
list_df.append(inspect_bigwig_regions(bw, region, replicate, nb_bin,
resize, condition_name))

nfontrod
committed
r = sum([df.empty for df in list_df])
if r > 0:
logger.warning(f"They were {r} bed feature that could not be reported "
f"because of their small size")
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
return pd.concat(list_df, axis=0, ignore_index=True)
def create_full_table(df_exp: pd.DataFrame, regions: List[List],
nb_bin: int, resize: List[int],
input_folder: Path) -> pd.DataFrame:
"""
get the regions for every bigwig files.
:param df_exp: A dataframe containing the bigwig file that \
we want to analyse
:param regions: The regions to visualise
:param nb_bin: The number of bin used to resize the regions
:param resize: The number of nucleotides \
of the regions localised at each sides of the genomic regions inside
`regions`.
:param input_folder: Folder where the bigwig file are located
:return: The full coverage table
"""
list_df = []
for i in range(df_exp.shape[0]):
mline = df_exp.iloc[i, :]
bw_file = input_folder / mline['bigwig']
print(f"working on file {bw_file}")
condition = mline['condition']
replicate = mline['replicate']
list_df.append(create_sample_table(bw_file, regions, replicate,
nb_bin, resize, condition))
return pd.concat(list_df, axis=0, ignore_index=True)

nfontrod
committed
def merge_condition_region_col(df: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
"""
:param df: A dataframe of mean coverage for each bin.
:return: The dataframe with the region of condition column merged and \
the name of the merged column
"""
if len(df['region'].unique()) == 1:
condition_col = 'condition'
df.drop('region', axis=1, inplace=True)
elif len(df['condition'].unique()) == 1:
condition_col = 'region'
df.drop('condition', axis=1, inplace=True)
else:
condition_col = 'condition-region'
df[condition_col] = df['condition'] + "-" + df['region']
df.drop(['condition', 'region'], axis=1, inplace=True)
return df, condition_col
def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
environment: List[int],

nfontrod
committed
region_name: str, order_condition: List[str],
order_bed_name: List[str],
) -> Tuple[pd.DataFrame, str]:
"""
summarize the data in df_cov.
:param df_cov: A dataframe of coverage for each bin.
:param figure_type: The kind of figure to make (metagene or barplot)
:param nb_bin: The number of bin used to represent the region of interest
:param environment: A list of two int. The first contains the number of \
nucleotide to represent around the region of interest and the second,
the number of bin used to represent those surrounding regions.
:param region_name: the name of the region analysed
:param order_condition: The order of conditions

nfontrod
committed
:param order_bed_name: The order of bed name to respect
:return: The summarised dataframe and the condition col

nfontrod
committed
df_sum = df_cov.groupby(['bin', 'condition', 'region', 'replicate']) \
.mean() \

nfontrod
committed
if environment[0] != 0:
df_sum['location'] = df_sum['bin'].apply(
lambda x: f"before_{region_name}" if x < 0 else
f"after_{region_name}" if x >= nb_bin else region_name)

nfontrod
committed
df_sum, condition_col = merge_condition_region_col(df_sum)
return df_sum, condition_col
df_sum.drop('bin', axis=1, inplace=True)
if environment[0] != 0:

nfontrod
committed
col_merge = ['condition', 'region', 'replicate', 'location']

nfontrod
committed
col_merge = ['condition', 'region', 'replicate']
df_sum = df_sum.groupby(col_merge).mean().reset_index()
if 'location' in df_sum.columns:
df_sum['location'] = pd.Categorical(
df_sum['location'], ordered=True,
categories=[f"before_{region_name}", region_name,
f"after_{region_name}"]
)
df_sum['condition'] = pd.Categorical(
df_sum['condition'], ordered=True,
categories=order_condition
)

nfontrod
committed
df_sum['region'] = pd.Categorical(
df_sum['region'], ordered=True,
categories=order_bed_name
)
df_sum.sort_values(['condition', 'region', 'location'], ascending=True,

nfontrod
committed
df_sum, condition_col = merge_condition_region_col(df_sum)
return df_sum, condition_col
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
def line_maker(list_pval, up_value, position=0):
"""
Create a list of lines, with their colour.
Create a line if the a p-value in list_pval[i] is below 0.05.
If up_mean[i] > down_mean[i] the line will be green, purple else.
:param list_pval: (list of float), list of pvalue get by the comparison
of a propensity scale in a particular sequence position in an
up-regulated and down_regulated set of sequences.
:param up_value: (int) the ordinate coordinates where the line will be
placed.
:param position: (int) the abscissa position to begin to draw the lines
:return: lines - (list of list of 2 tuple), the list of 2 tuple corresponds
to a lines with the coordinates [(x1, y1), (x2, y2)]
"""
lcolor = []
lines = []
for i in range(len(list_pval)):
if list_pval[i] <= 0.05:
val = i + position
lines.append([(val - 0.5, up_value), (val + 0.5, up_value)])
lcolor.append("#000000") # red
return lines, lcolor
def paired_t_test(values1: List[float], values2: List[float]) -> float:
"""
Get the p-value of a paired t-test for each bin
:param values1: A list of values
:param values2: Another list of values
:return: The p-values of the paired t-test
>>> paired_t_test([1, 2, 8], [5, 8, 15])
0.02337551764357566
"""
if len(values1) != len(values2):
raise IndexError("values1 and values2 should have the same length")
ttp = r("""
function(values1, values2) {
return(t.test(values1, values2, paired=T)$p.value)
}
""")
return ttp(FloatVector(values1), FloatVector(values2))[0]
def compute_stats(dff: pd.DataFrame, y_line: float, group_col: str,
outfile: Path) -> Tuple[List[List[Tuple]], List]:
"""
:param dff: A dataframe containing the coverage displayed in the figure
:param y_line: The height of the p-value line
:param group_col: A group column
:param outfile: The file to save the stats
:return: A list of lines coordinates in the form of [(x1, y1), (x2, y2)], \
and the color associated to each line
"""
df = dff.sort_values(["bin", group_col], ascending=True)
groups = df[group_col].unique()
if len(groups) != 2:
raise NotImplementedError("Statistical analysis only implemented for "
"2 groups of data")
p_values_ttp = []
grp1 = []
vgrp1 = []
grp2 = []
vgrp2 = []
for bin in df["bin"].unique():
tmp = df[df["bin"] == bin]
values1 = tmp.loc[tmp[group_col] == groups[0], "coverage"].values
values2 = tmp.loc[tmp[group_col] == groups[1], "coverage"].values
p_values_ttp.append(paired_t_test(values1, values2))
grp1.append(np.mean(values1))
grp2.append(np.mean(values2))
vgrp1.append(";".join(map(str, [round(v, 3) for v in values1])))
vgrp2.append(";".join(map(str, [round(v, 3) for v in values2])))
stats_df = pd.DataFrame({"bin": df["bin"].unique(),
"p_values": p_values_ttp,
groups[0]: grp1, groups[1]: grp2,
f"val-{groups[0]}": vgrp1,
f"val-{groups[1]}": vgrp2,})
stats_df.to_csv(outfile, sep="\t", index=False)
return line_maker(p_values_ttp, y_line, min(df["bin"].unique()))
def display_stat(g: sns.FacetGrid, dff: pd.DataFrame, y_line: float,
group_col: str, stat: bool, outfile: Path) -> sns.FacetGrid:
"""
Update the graphics by displaying stats.
:param g: A seaborn FacetGrid objects corresponding to the metagene \
figure
:param dff: A dataframe containing the coverage displayed in the figure
:param y_line: The height of the p-value line
:param group_col: The column containing the groups analyzed
:param outfile: The file where the metagene will be saved
:return: The facetGrid with the stats
"""
if not stat:
return g
stat_file = outfile.parent / f"{outfile.stem}.txt"
lines, lcolor = compute_stats(dff, y_line, group_col, stat_file)
lc = mc.LineCollection(lines, colors=lcolor, linewidths=5)
g.ax.add_collection(lc)
return g
def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
border_names: List[str], nb_bin: int,

nfontrod
committed
environment: List[int], bed_name: str,
output: Path, norm: Union[int, Path],
condition_col: str, ylim: Optional[List[float]],
stat: bool = False) -> Path:
"""
Create a metagene figure on the region of interest.
:param df_sum: The summarized coverage table
:param show_replicate: True to show the replicate, false else
:param border_names: The name of borders of the region of interest
:param nb_bin: The number of bins representing the regions of interest
:param environment: A list of two int. The first contains the number of \
nucleotide to represent around the region of interest and the second,
the number of bin used to represent those surrounding regions.
:param output: Folder where the figure will be created

nfontrod
committed
:param bed_name: The name of considered regions
:param norm: an integer corresponding to the bin used to normalise \
the samples or a file containing the normalisations to apply to \
each samples

nfontrod
committed
:param condition_col: The name of the condition columns
:param ylim: The range of the y axis
:param stat: A boolean indicating wether to perform statistical analysis

nfontrod
committed
:return: The name of the figure
font_files = matplotlib.font_manager.findSystemFonts(fontpaths=None,
fontext='ttf')
font_list = matplotlib.font_manager.createFontList(font_files)
matplotlib.font_manager.fontManager.ttflist.extend(font_list)
sns.set(context='poster', style='white', font_scale=1.4)
mpl.rcParams['font.family'] = 'Arial'

nfontrod
committed
g = sns.relplot('bin', 'coverage', hue=condition_col, data=df_sum,
kind='line', style='replicate', ci=None,
height=12, aspect=1.7)
else:

nfontrod
committed
g = sns.relplot('bin', 'coverage', hue=condition_col, data=df_sum,
kind='line', ci=None, height=12, aspect=1.7)
y_val = g.ax.get_ylim()[1] * 0.99
if border_names[0] != '':
g.ax.axvline(x=0, color='k', linestyle='--', alpha=0.1)
g.ax.annotate(border_names[0], [0, y_val], ha="center", va='center')
if border_names[1] != '':
g.ax.axvline(x=nb_bin - 1, color='k', linestyle='--', alpha=0.1)
g.ax.annotate(border_names[1], [nb_bin - 1, y_val], ha="center",
va='center')
g.set_xlabels('Bins')
g.set_ylabels('Coverage')
plt.subplots_adjust(top=0.9)

nfontrod
committed
tmp_bed_name = bed_name.replace("--", ", ")
title = f"Average coverage in region '{tmp_bed_name}'"
title += f" and in their surrounding regions of {environment[0]} nt"
g.fig.suptitle(title, fontsize=30)

nfontrod
committed
tmp_bed_name = bed_name.replace("--", "-")
outfile_title = f"metagene_{tmp_bed_name}_{nb_bin}bin_" \
f"{environment[0]}_nt-around-{environment[1]}-bin"
if isinstance(norm, int):
outfile_title += f"_b{norm}_norm"
elif isinstance(norm, Path):
outfile_title += f"_file_norm"
if ylim[0] is not None:
g.ax.set_ylim(ylim[0], ylim[1])
ymin, ymax = g.ax.get_ylim()
g = display_stat(g, df_sum, ymin + ((ymax - ymin) / 50), condition_col,
stat, output / outfile_title)
g.ax.tick_params(left=True, bottom=True)

nfontrod
committed
return output / outfile_title
def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
nb_bin: int,
environment: List[int], region_name: str,

nfontrod
committed
output: Path, norm: Union[int, Path],
condition_col: str) -> None:
"""
Create a barplot figure on the region of interest.
:param df_sum: The summarized coverage table
:param show_replicate: True to show the replicate, false else
:param nb_bin: The number of bins representing the regions of interest
:param environment: A list of two int. The first contains the number of \
nucleotide to represent around the region of interest and the second,
the number of bin used to represent those surrounding regions.
:param output: Folder where the figure will be created
:param region_name: The region of interest
:param norm: an integer corresponding to the bin used to normalise \
the samples or a file containing the normalisations to apply to \
each samples

nfontrod
committed
:param condition_col: The name of the condition columns
"""
sns.set(context='poster', style='white')
if show_replicate:

nfontrod
committed
g = sns.catplot(x=condition_col, y="coverage", hue="replicate",
kind="bar", data=df_sum, height=12, aspect=1.77,
ci=None)
else:

nfontrod
committed
g = sns.catplot(x=condition_col, y="coverage",
kind="bar", data=df_sum, height=12, aspect=1.77,
ci='sd')
g.set_xlabels('')
g.set_ylabels('Coverage')
plt.subplots_adjust(top=0.9)

nfontrod
committed
rgt = region_name.replace('--', ', ')
title = f"Average coverage in region '{rgt}'"

nfontrod
committed
rgt = region_name.replace('--', '-')
outfile_title = f"barplot_{rgt}_{nb_bin}bin_" \
f"{environment[0]}_nt-around-{environment[1]}-bin"
if isinstance(norm, int):
outfile_title += f"_b{norm}_norm"
elif isinstance(norm, Path):
outfile_title += f"_file_norm"
outfile_title += ".pdf"
g.savefig(output / outfile_title)
def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path],
outfile: Path) -> pd.DataFrame:
Normalise the bins coverage by the average coverage on a particular bin \
or by a value given in a particular file.
:param norm: The bin used to normalise the sample or a file containing \
the value used to normalise the samples.
:param outfile: The table containing coverage values
:return: the dataframe with normalised coverage
"""
if isinstance(norm, int):
if norm not in list(df['bin'].unique()):
raise ValueError(f"the bin {norm} was not found in the coverage "
f"dataframe.")
df_val = df.loc[df['bin'] == norm,

nfontrod
committed
['coverage', 'condition', 'region', 'replicate']] \
.groupby(['condition', 'region', 'replicate']).mean().reset_index()
df_val.rename({"coverage": "coef"}, axis=1, inplace=True)
noutfile = outfile.parent / 'coef_table' / \
(outfile.name.replace(".txt.gz", "") +
f".txt")
noutfile.parent.mkdir(exist_ok=True, parents=True)
df_val.to_csv(noutfile, sep="\t", index=False)
else:
df_val = pd.read_csv(norm, sep="\t")

nfontrod
committed
if len(df_val['region'].unique()) > 1:
df = df.merge(df_val, how="left", on=['condition', 'region',
'replicate'])
else:
df_val.drop('region', axis=1, inplace=True)
df = df.merge(df_val, how="left", on=['condition', 'replicate'])
df['coverage'] = df['coverage'] / df['coef']
df.drop('coef', axis=1, inplace=True)
return df

nfontrod
committed
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
def compute_diff(df_sum: pd.DataFrame) -> pd.DataFrame:
"""
Compute the average coverage differences between two conditions.
:param df_sum: A dataframe containing the coverage values between \
two conditions
:return: The dataframe with a coolumn diff
>>> di = pd.DataFrame({'bin': [-25] * 6,
... 'condition': ['Ctrl', 'Ctrl', 'Ctrl', 'siDDX', 'siDDX', 'siDDX'],
... 'replicate': ['R1', 'R2', 'R3', 'R1', 'R2', 'R3'],
... 'coverage': [0.1,
... 0.2,
... 0.3,
... 0.15,
... 0.25,
... 0.35],
... 'location': ['before_ddx_down'] * 6})
>>> compute_diff(di)
bin location Ctrl siDDX diff
0 -25 before_ddx_down 0.2 0.25 0.05
"""
cond = df_sum["condition"].unique()
df_sum = df_sum.groupby(["bin", "location", "condition"]).mean()
if len(cond) != 2:
raise IndexError("Only two different conditions must be given to "
"perform stats")
df = df_sum.pivot_table(index=["bin", "location"],
columns="condition",
values="coverage").reset_index()
df.columns.name = None
df.groupby(["bin", "location"])
df["diff"] = df[cond[1]] - df[cond[0]]
return df
def get_shapiro_an_lm_pvalue(df: pd.DataFrame, location: str) -> pd.Series:
"""
Return a series containing the shapiro and the t-test p-value of \
the difference in coverage between 2 conditions at a particular location
:param df: A dataframe containing the difference in coverage \
between 2 conditions at a particular location
:param location: The region of interest
:return: a series containing the shapiro and the t-test p-value of \
the difference in coverage between 2 conditions at a particular location
"""
df2 = df[df["location"] == location].copy()
df2.to_csv(f"{location}.txt", sep="\t", index=False)
pandas2ri.activate()
stat_s = r("""
function(data){
mod <- lm(diff ~ 1, data=data)
s <- summary(mod)
stat <- s$coefficients[1, "Pr(>|t|)"]
shapiro <- shapiro.test(data$diff)$p.value
return(c(shapiro, stat))
}
""")
res = stat_s(df2)
return pd.Series({"shapiro_p-value": res[0],
"t-test-p-value": res[1],
"location": location})

nfontrod
committed
def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
bed_names: List[str], nb_bin: int = 100,
figure_type: str = 'metagene',
norm: Union[int, Path, None] = None,
show_replicate: bool = True, environment: List[int] = (0, 0),
border_names: List[str] = ('', ''),
output: Path = Path('.'),

nfontrod
committed
ylim: List[float] = (None, None), stat: bool = False
) -> None:
"""
Create A metagene or a barplot figure from bigwig file on regions defined \
in the bed file provided with 'region_bed' parameter.
:param design: A tabulated file containing 3 columns. The first columns \
contains a bigwig filename, the second contains the condition name and \
the last one contains the replicate of the condition.
:param bw_folder: The folder containing the bigwig file mentioned in \
the first column of the 'design' table.

nfontrod
committed
:param region_beds: A list of bed files containing the regions to visualise
:param bed_names: A list of names identifying regions insides the given \
beds.
:param nb_bin: The number of bins used to represents the regions of \
'region_bed'.
:param figure_type: The kind of representation wanted (barplot or metagene)
:param norm: an integer corresponding to the bin used to normalise \
the samples or a file containing the normalisations to apply to \
each samples
:param show_replicate: True to create a figure showing the replicate \
false else.
:param environment: A list of two int. The first contains the number of \
nucleotide to represent around the region of interest and the second,
the number of bin used to represent those surrounding regions.
:param border_names: The name of the borders
:param output: Folder where the results will be created
:param ylim: The range of y-axis

nfontrod
committed
:param stat: A boolean indicating whether or not to perform \
statistical analysis

nfontrod
committed
if len(region_beds) != len(bed_names):
raise IndexError("Parameter region_beds and bed_names should "
"have the same length")

nfontrod
committed
regions = load_beds(region_beds, bed_names)
region_bed_name = "-".join(b.name.replace('.bed', '') for b in region_beds)

nfontrod
committed
outfile = f'tmp_cov_table_{design.name.replace(".txt", "")}' \
f'_{region_bed_name}_{nb_bin}bin_' \
if isinstance(norm, int):
outfile += f'_bin{norm}_norm'
elif isinstance(norm, Path):
outfile += f'_file_norm'
df_cov = create_full_table(df_exp, regions, nb_bin, environment,
bw_folder)
if norm is not None:
df_cov = bin_normalisation(df_cov, norm, cov_file)
ordered_condition = []
for condition in df_exp['condition'].to_list():
if condition not in ordered_condition:
ordered_condition.append(condition)

nfontrod
committed
region_kind = "--".join(bed_names)
df_sum, cond_col = create_df_summary(df_cov, figure_type, nb_bin,
environment, region_kind,
ordered_condition, bed_names)
figure_metagene(df_sum, show_replicate, border_names, nb_bin,
environment, region_kind, output, norm,
cond_col, ylim, stat)

nfontrod
committed
else:
if 'location' in df_sum.columns:
for cur_region in df_sum['location'].unique():
df_tmp = df_sum.loc[df_sum['location'] == cur_region, :]
figure_barplot(df_tmp, show_replicate, nb_bin, environment,

nfontrod
committed
cur_region, output, norm, cond_col)
else:
figure_barplot(df_sum, show_replicate, nb_bin, environment,

nfontrod
committed
region_kind, output, norm, cond_col)