Skip to content
Snippets Groups Projects
Commit ca7c85c7 authored by nfontrod's avatar nfontrod
Browse files

src/visu/__main__.py src/visu/figure_maker.py: modification to create a figure...

src/visu/__main__.py src/visu/figure_maker.py: modification to create a figure with two different lists of exons
parent 759679b3
No related branches found
No related tags found
No related merge requests found
......@@ -13,11 +13,11 @@ from pathlib import Path
from typing import List
@lp.parse(design='file', region_bed='file',
@lp.parse(design='file', region_beds='file',
nb_bin="nb_bin > 5", figure_type=['metagene', 'barplot'],
show_replicate=['y', 'n', 'Y', 'N'])
def launcher(design: str, bw_folder: str, region_bed: str,
region_name: str, nb_bin: int = 100,
def launcher(design: str, bw_folder: str, region_beds: List[str],
region_names: List[str], nb_bin: int = 100,
figure_type: str = 'metagene', norm: str = 'None',
show_replicate: str = 'y', environment: List[int] = (0, 0),
border_names: List[str] = ('', ''),
......@@ -31,8 +31,9 @@ def launcher(design: str, bw_folder: str, region_bed: str,
the last one contains the replicate of the condition.
:param bw_folder: The folder containing the bigwig file mentioned in \
the first column of the 'design' table.
:param region_bed: A bed file containing the regions to visualise
:param region_name: The name of the region analysed
:param region_beds: A list of bed files containing the regions to visualise
:param region_names: A list of names identifying regions insides \
the given beds.
:param nb_bin: The number of bins used to represents the regions of \
'region_bed'.
:param figure_type: The kind of representation wanted (barplot or metagene)
......@@ -58,8 +59,9 @@ def launcher(design: str, bw_folder: str, region_bed: str,
norm = Path(norm)
if not norm.is_file():
raise FileNotFoundError(f"The file {norm} was not found")
create_figure(Path(design), Path(bw_folder), Path(region_bed),
region_name, nb_bin, figure_type, norm, show_rep,
reg_beds = [Path(p) for p in region_beds]
create_figure(Path(design), Path(bw_folder), reg_beds,
region_names, nb_bin, figure_type, norm, show_rep,
environment, border_names, Path(output))
......
......@@ -7,7 +7,7 @@ Description:
"""
from pathlib import Path
from typing import List, Union, Any
from typing import List, Union, Any, Tuple
from doctest import testmod
from ..bed_handler.config import TestConfig
import pandas as pd
......@@ -17,15 +17,16 @@ import matplotlib.pyplot as plt
from tqdm import tqdm
def load_bed(bed: Path) -> List[List[Union[int, str]]]:
def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]:
"""
Read a bed file and return the lines within it.
:param bed: A bed file containing the regions of interest
:param bed_name: The name of the regions of interest inside the bed file
:return:The list of feature inside the bed
>>> load_bed(TestConfig.gene_bed)[0]
['18', 28645943, 28682388, 1, 'DSC2', '-']
>>> load_bed(TestConfig.gene_bed, 'gene_test')[0]
['18', 28645943, 28682388, 1, 'DSC2', '-', 'gene_test']
"""
list_regions = []
with bed.open('r') as inbed:
......@@ -33,10 +34,34 @@ def load_bed(bed: Path) -> List[List[Union[int, str]]]:
if not line.startswith("#"):
cline = line.replace("\n", "").split("\t")
list_regions.append([cline[0], int(cline[1]), int(cline[2]),
int(cline[3]), cline[4], cline[5]])
int(cline[3]), cline[4], cline[5],
bed_name])
return list_regions
def load_beds(beds: List[Path], bed_names: List[str]
              ) -> List[List[Union[int, str]]]:
    """
    Read several bed files and return all the lines within them.

    :param beds: A list of bed files containing the regions of interest
    :param bed_names: A list of names identifying the regions inside the \
    given beds (one name per bed file, given in the same order).
    :return: The list of features inside the bed files, in the order of \
    the given beds. The last item of each feature is the name given to \
    the bed it comes from.
    :raises IndexError: If beds and bed_names don't have the same length.

    >>> load_beds([TestConfig.gene_bed, TestConfig.gene_bed],
    ... ['gene1', 'gene2'])[0]
    ['18', 28645943, 28682388, 1, 'DSC2', '-', 'gene1']
    >>> load_beds([TestConfig.gene_bed, TestConfig.gene_bed],
    ... ['gene1', 'gene2'])[-1]
    ['13', 45967450, 45992516, 9, 'SLC25A30', '-', 'gene2']
    """
    # Check the pairing before reading any file: a missing name would
    # otherwise only fail in the middle of the loop, and an extra name
    # would be silently ignored.
    if len(beds) != len(bed_names):
        raise IndexError("Parameter beds and bed_names should "
                         "have the same length")
    regions = []
    for bed, bed_name in zip(beds, bed_names):
        regions.extend(load_bed(bed, bed_name))
    return regions
def inspect_bigwig_regions(bw: Any, region: List,
replicate: str, nb_bin: int, resize: List[int],
condition_name: str,
......@@ -54,30 +79,30 @@ def inspect_bigwig_regions(bw: Any, region: List,
:return: a table with the coverage of this region
>>> my_bw = pbw.open(str(TestConfig.small_bw))
>>> region = ['1', 10, 25, 1, 'Test', '+']
>>> inspect_bigwig_regions(my_bw, region, 'R1', 5, [4, 2], 'cond1')
coverage bin condition replicate
0 0.000000 -2 cond1 R1
1 0.500000 -1 cond1 R1
2 75.000000 0 cond1 R1
3 20.000000 1 cond1 R1
4 10.000000 2 cond1 R1
5 4.666667 3 cond1 R1
6 2.000000 4 cond1 R1
7 1.000000 5 cond1 R1
8 0.500000 6 cond1 R1
>>> region = ['1', 110, 133, 1, 'Test', '-']
>>> inspect_bigwig_regions(my_bw, region, 'R1', 5, [4, 2], 'cond1')
coverage bin condition replicate
0 0.00 -2 cond1 R1
1 12.50 -1 cond1 R1
2 42.00 0 cond1 R1
3 8.00 1 cond1 R1
4 4.25 2 cond1 R1
5 2.00 3 cond1 R1
6 2.00 4 cond1 R1
7 1.00 5 cond1 R1
8 1.00 6 cond1 R1
>>> mregion = ['1', 10, 25, 1, 'Test', '+', 'exon']
>>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [4, 2], 'cond1')
coverage bin condition replicate region
0 0.000000 -2 cond1 R1 exon
1 0.500000 -1 cond1 R1 exon
2 75.000000 0 cond1 R1 exon
3 20.000000 1 cond1 R1 exon
4 10.000000 2 cond1 R1 exon
5 4.666667 3 cond1 R1 exon
6 2.000000 4 cond1 R1 exon
7 1.000000 5 cond1 R1 exon
8 0.500000 6 cond1 R1 exon
>>> mregion = ['1', 110, 133, 1, 'Test', '-', 'exon2']
>>> inspect_bigwig_regions(my_bw, mregion, 'R1', 5, [4, 2], 'cond1')
coverage bin condition replicate region
0 0.00 -2 cond1 R1 exon2
1 12.50 -1 cond1 R1 exon2
2 42.00 0 cond1 R1 exon2
3 8.00 1 cond1 R1 exon2
4 4.25 2 cond1 R1 exon2
5 2.00 3 cond1 R1 exon2
6 2.00 4 cond1 R1 exon2
7 1.00 5 cond1 R1 exon2
8 1.00 6 cond1 R1 exon2
"""
val = bw.stats(region[0], region[1], region[2], nBins=nb_bin, exact=True)
bins = list(range(len(val)))
......@@ -110,6 +135,7 @@ def inspect_bigwig_regions(bw: Any, region: List,
df = pd.DataFrame(dic)
df['condition'] = [condition_name] * df.shape[0]
df['replicate'] = [replicate] * df.shape[0]
df['region'] = [region[6]] * df.shape[0]
return df
......@@ -165,10 +191,31 @@ def create_full_table(df_exp: pd.DataFrame, regions: List[List],
return pd.concat(list_df, axis=0, ignore_index=True)
def merge_condition_region_col(df: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
    """
    Reduce the 'condition' and 'region' columns of df to a single column.

    If only one region is present, the 'region' column is dropped; \
    otherwise, if only one condition is present, the 'condition' column is \
    dropped; otherwise both columns are replaced by a 'condition-region' \
    column. The dataframe is modified in place and also returned.

    :param df: A dataframe of mean coverage for each bin, containing a \
    'condition' and a 'region' column.
    :return: The dataframe with the region and condition columns merged and \
    the name of the merged column
    """
    if len(df['region'].unique()) == 1:
        # A single region: the condition alone identifies each group.
        condition_col = 'condition'
        df.drop('region', axis=1, inplace=True)
    elif len(df['condition'].unique()) == 1:
        # A single condition: the region alone identifies each group.
        condition_col = 'region'
        df.drop('condition', axis=1, inplace=True)
    else:
        condition_col = 'condition-region'
        # Cast to str before concatenating: the columns may be categorical
        # and categorical data does not support the '+' operator.
        df[condition_col] = (df['condition'].astype(str) + "-"
                             + df['region'].astype(str))
        df.drop(['condition', 'region'], axis=1, inplace=True)
    return df, condition_col
def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
environment: List[int],
region_name: str, order_condition: List[str]
) -> pd.DataFrame:
region_name: str, order_condition: List[str],
order_bed_name: List[str],
) -> Tuple[pd.DataFrame, str]:
"""
summarize the data in df_cov.
......@@ -180,21 +227,24 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
the number of bin used to represent those surrounding regions.
:param region_name: the name of the region analysed
:param order_condition: The order of conditions
:return: The summarised dataframe
:param order_bed_name: The order of bed name to respect
:return: The summarised dataframe and the condition col
"""
df_sum = df_cov.groupby(['bin', 'condition', 'replicate']).mean() \
df_sum = df_cov.groupby(['bin', 'condition', 'region', 'replicate']) \
.mean() \
.reset_index()
if figure_type == "metagene":
return df_sum
df_sum, condition_col = merge_condition_region_col(df_sum)
return df_sum, condition_col
if environment[0] != 0:
df_sum['location'] = df_cov['bin'].apply(
lambda x: f"before_{region_name}" if x < 0 else
f"after_{region_name}" if x >= nb_bin else region_name)
df_sum.drop('bin', axis=1, inplace=True)
if environment[0] != 0:
col_merge = ['condition', 'replicate', 'location']
col_merge = ['condition', 'region', 'replicate', 'location']
else:
col_merge = ['condition', 'replicate']
col_merge = ['condition', 'region', 'replicate']
df_sum = df_sum.groupby(col_merge).mean().reset_index()
if 'location' in df_sum.columns:
df_sum['location'] = pd.Categorical(
......@@ -206,15 +256,21 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
df_sum['condition'], ordered=True,
categories=order_condition
)
df_sum.sort_values(['condition', 'location'], ascending=True,
df_sum['region'] = pd.Categorical(
df_sum['region'], ordered=True,
categories=order_bed_name
)
df_sum.sort_values(['condition', 'region', 'location'], ascending=True,
inplace=True)
return df_sum
df_sum, condition_col = merge_condition_region_col(df_sum)
return df_sum, condition_col
def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
border_names: List[str], nb_bin: int,
environment: List[int], region_name: str,
output: Path, norm: Union[int, Path]) -> None:
environment: List[int], bed_name: str,
output: Path, norm: Union[int, Path],
condition_col: str) -> None:
"""
Create a metagene figure on the region of interest.
......@@ -226,18 +282,19 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
nucleotide to represent around the region of interest and the second,
the number of bin used to represent those surrounding regions.
:param output: Folder where the figure will be created
:param region_name: The region of interest
:param bed_name: The name of considered regions
:param norm: an integer corresponding to the bin used to normalise \
the samples or a file containing the normalisations to apply to \
each samples
:param condition_col: The name of the condition columns
"""
sns.set(context='poster', style='white')
if show_replicate:
g = sns.relplot('bin', 'coverage', hue='condition', data=df_sum,
g = sns.relplot('bin', 'coverage', hue=condition_col, data=df_sum,
kind='line', style='replicate', ci=None,
height=12, aspect=1.7)
else:
g = sns.relplot('bin', 'coverage', hue='condition', data=df_sum,
g = sns.relplot('bin', 'coverage', hue=condition_col, data=df_sum,
kind='line', ci="sd", height=12, aspect=1.7)
y_val = g.ax.get_ylim()[1] * 0.99
if border_names[0] != '':
......@@ -250,11 +307,13 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
g.set_xlabels('Bins')
g.set_ylabels('Coverage')
plt.subplots_adjust(top=0.9)
title = f"Average coverage in region '{region_name}'"
tmp_bed_name = bed_name.replace("--", ", ")
title = f"Average coverage in region '{tmp_bed_name}'"
if environment[0] != 0:
title += f"\nand in their surrounding regions of {environment[0]} nt"
g.fig.suptitle(title)
outfile_title = f"metagene_{region_name}_{nb_bin}bin_" \
tmp_bed_name = bed_name.replace("--", "-")
outfile_title = f"metagene_{tmp_bed_name}_{nb_bin}bin_" \
f"{environment[0]}_nt-around-{environment[1]}-bin"
if isinstance(norm, int):
outfile_title += f"_b{norm}_norm"
......@@ -268,7 +327,8 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
nb_bin: int,
environment: List[int], region_name: str,
output: Path, norm: Union[int, Path]) -> None:
output: Path, norm: Union[int, Path],
condition_col: str) -> None:
"""
Create a barplot figure on the region of interest.
......@@ -283,22 +343,25 @@ def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
:param norm: an integer corresponding to the bin used to normalise \
the samples or a file containing the normalisations to apply to \
each samples
:param condition_col: The name of the condition columns
"""
sns.set(context='poster', style='white')
if show_replicate:
g = sns.catplot(x="condition", y="coverage", hue="replicate",
g = sns.catplot(x=condition_col, y="coverage", hue="replicate",
kind="bar", data=df_sum, height=12, aspect=1.77,
ci=None)
else:
g = sns.catplot(x="condition", y="coverage",
g = sns.catplot(x=condition_col, y="coverage",
kind="bar", data=df_sum, height=12, aspect=1.77,
ci='sd')
g.set_xlabels('')
g.set_ylabels('Coverage')
plt.subplots_adjust(top=0.9)
title = f"Average coverage in region '{region_name}'"
rgt = region_name.replace('--', ', ')
title = f"Average coverage in region '{rgt}'"
g.fig.suptitle(title)
outfile_title = f"barplot_{region_name}_{nb_bin}bin_" \
rgt = region_name.replace('--', '-')
outfile_title = f"barplot_{rgt}_{nb_bin}bin_" \
f"{environment[0]}_nt-around-{environment[1]}-bin"
if isinstance(norm, int):
outfile_title += f"_b{norm}_norm"
......@@ -326,8 +389,8 @@ def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path],
raise ValueError(f"the bin {norm} was not found in the coverage "
f"dataframe.")
df_val = df.loc[df['bin'] == norm,
['coverage', 'condition', 'replicate']] \
.groupby(['condition', 'replicate']).mean().reset_index()
['coverage', 'condition', 'region', 'replicate']] \
.groupby(['condition', 'region', 'replicate']).mean().reset_index()
df_val.rename({"coverage": "coef"}, axis=1, inplace=True)
noutfile = outfile.parent / 'coef_table' / \
(outfile.name.replace(".txt.gz", "") +
......@@ -336,14 +399,19 @@ def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path],
df_val.to_csv(noutfile, sep="\t", index=False)
else:
df_val = pd.read_csv(norm, sep="\t")
df = df.merge(df_val, how="left", on=['condition', 'replicate'])
if len(df_val['region'].unique()) > 1:
df = df.merge(df_val, how="left", on=['condition', 'region',
'replicate'])
else:
df_val.drop('region', axis=1, inplace=True)
df = df.merge(df_val, how="left", on=['condition', 'replicate'])
df['coverage'] = df['coverage'] / df['coef']
df.drop('coef', axis=1, inplace=True)
return df
def create_figure(design: Path, bw_folder: Path, region_bed: Path,
region_name: str, nb_bin: int = 100,
def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
bed_names: List[str], nb_bin: int = 100,
figure_type: str = 'metagene',
norm: Union[int, Path, None] = None,
show_replicate: bool = True, environment: List[int] = (0, 0),
......@@ -358,8 +426,9 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
the last one contains the replicate of the condition.
:param bw_folder: The folder containing the bigwig file mentioned in \
the first column of the 'design' table.
:param region_bed: A bed file containing the regions to visualise
:param region_name: The name of the region analysed
:param region_beds: A list of bed files containing the regions to visualise
:param bed_names: A list of names identifying regions insides the given \
beds.
:param nb_bin: The number of bins used to represents the regions of \
'region_bed'.
:param figure_type: The kind of representation wanted (barplot or metagene)
......@@ -374,10 +443,15 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
:param border_names: The name of the borders
:param output: Folder where the results will be created
"""
if len(region_beds) != len(bed_names):
raise IndexError("Parameter region_beds and bed_names should "
"have the same length")
df_exp = pd.read_csv(design, sep="\t")
regions = load_bed(region_bed)
region_bed_name = region_bed.name.replace('.bed', '')
outfile = f'tmp_cov_table_{design.name}_{region_bed_name}_{nb_bin}bin_' \
regions = load_beds(region_beds, bed_names)
region_bed_name = "-".join([b.name.replace('.bed', '')
for b in region_beds])
outfile = f'tmp_cov_table_{design.name.replace(".txt", "")}' \
f'_{region_bed_name}_{nb_bin}bin_' \
f'{environment[0]}_nt-around-{environment[1]}-bin'
if isinstance(norm, int):
outfile += f'_bin{norm}_norm'
......@@ -397,20 +471,22 @@ def create_figure(design: Path, bw_folder: Path, region_bed: Path,
for condition in df_exp['condition'].to_list():
if condition not in ordered_condition:
ordered_condition.append(condition)
df_sum = create_df_summary(df_cov, figure_type, nb_bin, environment,
region_name, ordered_condition)
region_kind = "--".join(bed_names)
df_sum, cond_col = create_df_summary(df_cov, figure_type, nb_bin,
environment, region_kind,
ordered_condition, bed_names)
if figure_type == "metagene":
figure_metagene(df_sum, show_replicate, border_names, nb_bin,
environment, region_name, output, norm)
environment, region_kind, output, norm, cond_col)
else:
if 'location' in df_sum.columns:
for cur_region in df_sum['location'].unique():
df_tmp = df_sum.loc[df_sum['location'] == cur_region, :]
figure_barplot(df_tmp, show_replicate, nb_bin, environment,
cur_region, output, norm)
cur_region, output, norm, cond_col)
else:
figure_barplot(df_sum, show_replicate, nb_bin, environment,
region_name, output, norm)
region_kind, output, norm, cond_col)
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment