Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • LBMC/regards/Projects_Analyzes/bigwig_visu
1 result
Show changes
Commits on Source (10)
...@@ -9,3 +9,6 @@ tests/files/test_genome.fa.fai ...@@ -9,3 +9,6 @@ tests/files/test_genome.fa.fai
coverage.xml coverage.xml
.scannerwork/ .scannerwork/
doc/build/* doc/build/*
*.txt
*.bed
.vscode/*
...@@ -32,6 +32,7 @@ class BedConfig: ...@@ -32,6 +32,7 @@ class BedConfig:
sipp_vs_ctcf = base / "data" / "Sipp_exon_vs_CTCF.csv" sipp_vs_ctcf = base / "data" / "Sipp_exon_vs_CTCF.csv"
all_vs_ctcf = base / 'data' / 'All_Exons_vs_CTCF.csv' all_vs_ctcf = base / 'data' / 'All_Exons_vs_CTCF.csv'
exons_vs_ctcf = base / "data" / "All_Exons_vs_CTCF.csv" exons_vs_ctcf = base / "data" / "All_Exons_vs_CTCF.csv"
expressed_genes = base / "data" / "5y_expressed_genes_basemean>5.txt"
bed = OutputBed bed = OutputBed
size = 5000 size = 5000
......
...@@ -8,7 +8,7 @@ a file, that can be find in a specific column in the bed file ...@@ -8,7 +8,7 @@ a file, that can be find in a specific column in the bed file
""" """
from pathlib import Path from pathlib import Path
from typing import List from typing import List, Union
from .config import TestConfig, OutputBed from .config import TestConfig, OutputBed
import pandas as pd import pandas as pd
import lazyparser as lp import lazyparser as lp
...@@ -30,7 +30,8 @@ def select_ft_of_interest(gene_file: Path) -> List[int]: ...@@ -30,7 +30,8 @@ def select_ft_of_interest(gene_file: Path) -> List[int]:
gene_id for gene_id in gene_list] gene_id for gene_id in gene_list]
def filter_bed(bed_file: Path, gene_list: List[int], col_name: str, keep: bool def filter_bed(bed_file: Path, gene_list: List[Union[int, float]],
col_name: str, keep: bool
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
load a bed containing FasterDB gene and only recover the gene of \ load a bed containing FasterDB gene and only recover the gene of \
......
...@@ -15,6 +15,7 @@ import warnings ...@@ -15,6 +15,7 @@ import warnings
from .get_other_exon_in_same_gene import create_gene_bed4norm from .get_other_exon_in_same_gene import create_gene_bed4norm
from pathlib import Path from pathlib import Path
import lazyparser as lp import lazyparser as lp
from typing import List
def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int, def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
...@@ -126,6 +127,17 @@ def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int, ...@@ -126,6 +127,17 @@ def filter_ctcf_distance_table(df: pd.DataFrame, reg: str, threshold: int,
return df return df
def filter_expressed(exon_list: List[str]) -> List[str]:
"""
Filter only expressed exons.
:param exon_list: A list of exons
:return: The list of expressed exons
"""
egenes = BedConfig.expressed_genes.open('r').read().splitlines()
return [exon for exon in exon_list if exon.split("_")[0] in egenes]
def create_bed_ctcf_exon(reg: str, threshold: int, def create_bed_ctcf_exon(reg: str, threshold: int,
location: str, include0: bool = False, location: str, include0: bool = False,
near_ctcf: bool = True) -> None: near_ctcf: bool = True) -> None:
...@@ -169,6 +181,7 @@ def create_bed_ctcf_exon(reg: str, threshold: int, ...@@ -169,6 +181,7 @@ def create_bed_ctcf_exon(reg: str, threshold: int,
bad_id = df['id'].to_list() if include0 \ bad_id = df['id'].to_list() if include0 \
else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list() else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list()
list_exons = [e for e in tmp_exons if e not in bad_id] list_exons = [e for e in tmp_exons if e not in bad_id]
list_exons = filter_expressed(list_exons)
list_genes = [int(exon.split('_')[0]) for exon in list_exons] list_genes = [int(exon.split('_')[0]) for exon in list_exons]
df_exon = filter_bed(BedConfig.exon_bed, list_exons) df_exon = filter_bed(BedConfig.exon_bed, list_exons)
df_gene = filter_bed(BedConfig.gene_bed, list_genes) df_gene = filter_bed(BedConfig.gene_bed, list_genes)
...@@ -239,6 +252,7 @@ def get_bed_ctcf_exon(exon_bed: str, threshold: int, ...@@ -239,6 +252,7 @@ def get_bed_ctcf_exon(exon_bed: str, threshold: int,
bad_id = df['id'].to_list() if include0 \ bad_id = df['id'].to_list() if include0 \
else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list() else df['id'].to_list() + df.loc[df['dist'] == 0, 'id'].to_list()
list_exons = [e for e in tmp_exons if e not in bad_id] list_exons = [e for e in tmp_exons if e not in bad_id]
list_exons = filter_expressed(list_exons)
list_genes = [int(exon.split('_')[0]) for exon in list_exons] list_genes = [int(exon.split('_')[0]) for exon in list_exons]
df_exon = filter_bed(BedConfig.exon_bed, list_exons) df_exon = filter_bed(BedConfig.exon_bed, list_exons)
df_gene = filter_bed(BedConfig.gene_bed, list_genes) df_gene = filter_bed(BedConfig.gene_bed, list_genes)
......
...@@ -641,47 +641,58 @@ for exp in ${exps[*]}; do ...@@ -641,47 +641,58 @@ for exp in ${exps[*]}; do
done done
bins=(0 99) # Figure 1C
beds=(readthrough no_readthrough) names=(IP)
designs=(data/designnew_exp_all_replicates_IP.txt)
bins=(99) # 0)
beds=(readthrough) # no_readthrough)
for bed in ${beds[*]}; do for bed in ${beds[*]}; do
for bin in ${bins[*]}; do for bin in ${bins[*]}; do
python3 -m src.visu \ for i in ${!designs[@]}; do
--design data/design_exp_all_replicates.txt \ python3 -m src.visu \
--bw_folder data/bigwig/ \ --design ${designs[i]} \
--region_bed results/bed_file/${bed}_expressed_gene.bed \ --bw_folder data/bigwig_newnorm/ \
--region_name ${bed} \ --region_bed results/bed_file/${bed}_expressed_gene.bed \
--output results/figures/ \ --region_name ${bed} \
--border_name TSS TTS \ --output results/figures/ \
--environment 10000 25 \ --border_name TSS TTS \
--show_replicate n \ --environment 10000 25 \
--figure_type metagene \ --show_replicate n \
--nb_bin 100 \ --figure_type metagene \
--norm ${bin} --nb_bin 100 \
--norm ${bin} \
--stat True
mv results/figures/metagene_${bed}_100bin_10000_nt-around-25-bin_b${bin}_norm.pdf results/figures/all_replicates_${bed}_b${bin}_norm.pdf mv results/figures/metagene_${bed}_100bin_10000_nt-around-25-bin_b${bin}_norm.pdf results/figures/Fig1C_${bed}_b${bin}_norm_${names[$i]}.pdf
done
done done
done done
beds=(readthrough no_readthrough) names=(IP)
bins=(0 99) designs=(data/designnew_exp_all_replicates_IP.txt)
loc=(TSS TTS) beds=(readthrough) # no_readthrough)
bins=(99) # 0)
loc=(TTS) # TSS)
for bed in ${beds[*]}; do for bed in ${beds[*]}; do
for i in ${!bins[@]}; do for i in ${!bins[@]}; do
python3 -m src.visu \ for j in ${!designs[@]}; do
--design data/design_exp_all_replicates.txt \ python3 -m src.visu \
--bw_folder data/bigwig/ \ --design ${designs[i]} \
--region_bed results/bed_file/${bed}_expressed_gene_10kb.bed \ --bw_folder data/bigwig_newnorm/ \
--region_name ${bed} \ --region_bed results/bed_file/${bed}_expressed_gene_10kb.bed \
--output results/figures/ \ --region_name ${bed} \
--border_name TTS '' \ --output results/figures/ \
--environment 0 0 \ --border_name TTS '' \
--show_replicate n \ --environment 0 0 \
--figure_type metagene \ --show_replicate n \
--nb_bin 25 \ --figure_type metagene \
--norm "results/figures/coef_table/tmp_cov_table_design_exp_all_replicates_${bed}_expressed_gene_100bin_10000_nt-around-25-bin_bin${bins[$i]}_norm.txt" --nb_bin 25 \
--norm "results/figures/coef_table/tmp_cov_table_designnew_exp_all_replicates_${names[$j]}_${bed}_expressed_gene_100bin_10000_nt-around-25-bin_bin${bins[$i]}_norm.txt" \
mv results/figures/metagene_${bed}_25bin_0_nt-around-0-bin_file_norm.pdf results/figures/all_replicates_TTS10kb_${bed}_${loc[$i]}-bin_norm.pdf --stat True
mv results/figures/metagene_${bed}_25bin_0_nt-around-0-bin_file_norm.pdf results/figures/Fig_1C_TTS10kb_${bed}_${loc[$i]}-bin_norm_${names[$j]}.pdf
done
done done
done done
...@@ -772,13 +783,15 @@ python3 -m src.visu \ ...@@ -772,13 +783,15 @@ python3 -m src.visu \
--show_replicate n \ --show_replicate n \
--figure_type metagene \ --figure_type metagene \
--nb_bin 100 \ --nb_bin 100 \
--norm "None" --norm "None" \
--stat True
mv results/figures/metagene_all_gene_100bin_0_nt-around-0-bin.pdf results/figures/metagene_5y_rnaseq_TSS-2kb_all_gene.pdf mv results/figures/metagene_all_gene_100bin_0_nt-around-0-bin.pdf results/figures/metagene_5y_rnaseq_TSS-2kb_all_gene.pdf
##################################################### #####################################################
# Metagene figure of last exons in readthrough genes near (<=2000 nt) or far (> 2000nt) from a CTCF file and last exons from non readthrough genes # Metagene figure of last exons in readthrough genes near (<=2000 nt) or far (> 2000nt) from a CTCF file and last exons from non readthrough genes
# figures 5C
##################################################### #####################################################
# Bed file containing the last exons of expressed gene with readthrough # Bed file containing the last exons of expressed gene with readthrough
...@@ -803,8 +816,8 @@ for i in ${!list_names[*]}; do ...@@ -803,8 +816,8 @@ for i in ${!list_names[*]}; do
gbed=${bed/exon\.bed/gene-dup.bed} gbed=${bed/exon\.bed/gene-dup.bed}
nbed=${gbed/\.bed/} nbed=${gbed/\.bed/}
python3 -m src.visu \ python3 -m src.visu \
--design data/design_exp_all_replicates.txt \ --design data/designnew_exp_all_replicates_IP.txt \
--bw_folder data/bigwig/ \ --bw_folder data/bigwig_newnorm \
--region_bed results/bed_file/${gbed} \ --region_bed results/bed_file/${gbed} \
--region_name ${cname} \ --region_name ${cname} \
--output results/figures/ \ --output results/figures/ \
...@@ -818,8 +831,8 @@ for i in ${!list_names[*]}; do ...@@ -818,8 +831,8 @@ for i in ${!list_names[*]}; do
rm results/figures/metagene_${cname}_100bin_10000_nt-around-25-bin_b0_norm.pdf rm results/figures/metagene_${cname}_100bin_10000_nt-around-25-bin_b0_norm.pdf
python3 -m src.visu \ python3 -m src.visu \
--design data/design_exp_all_replicates.txt \ --design data/designnew_exp_all_replicates_IP.txt \
--bw_folder data/bigwig/ \ --bw_folder data/bigwig_newnorm \
--region_bed results/bed_file/${bed} \ --region_bed results/bed_file/${bed} \
--region_name ${cname} \ --region_name ${cname} \
--output results/figures/ \ --output results/figures/ \
...@@ -828,10 +841,61 @@ for i in ${!list_names[*]}; do ...@@ -828,10 +841,61 @@ for i in ${!list_names[*]}; do
--show_replicate n \ --show_replicate n \
--figure_type metagene \ --figure_type metagene \
--nb_bin 30 \ --nb_bin 30 \
--stat True \
-y 0.15 0.4 \ -y 0.15 0.4 \
--norm "results/figures/coef_table/tmp_cov_table_design_exp_all_replicates_${nbed}_100bin_10000_nt-around-25-bin_bin0_norm.txt" --norm "results/figures/coef_table/tmp_cov_table_designnew_exp_all_replicates_IP_${nbed}_100bin_10000_nt-around-25-bin_bin0_norm.txt" \
--stat True
mv results/figures/metagene_${cname}_30bin_10000_nt-around-25-bin_file_norm.pdf results/figures/all_replicates_metaexon_${cname}_30bin_10000_nt-around-25-bin_file_norm.pdf mv results/figures/metagene_${cname}_30bin_10000_nt-around-25-bin_file_norm.pdf results/figures/all_replicates_metaexon_${cname}_30bin_10000_nt-around-25-bin_file_norm.pdf
done done
python3 -m src.gc_content -B results/bed_file/readthrough_last_exon_near_CTCF_2000_both_ddx_with0_exon.bed results/bed_file/readthrough_last_exon_far_CTCF_2000_both_ddx_with0_exon.bed results/bed_file/no_readthrough_expressed_last_exon.bed -b readthrough_ctcf readthrough no_readthrough -g data/Homo_sapiens.GRCh37.dna.primary_assembly.fa -f "exons" -e 2000
\ No newline at end of file ###########################################################
# Figures siPP vs siCTRL pour ddx_down_ctcf,
# , ddx_down figure 5B
###########################################################
list_names=(ddx_down_ctcf ddx_down)
bed_names=(CTCF_2000_both_ddx_down_with0_exon.bed Far_CTCF_2000_both_ddx_down_with0_exon.bed)
for i in ${!list_names[*]}; do
cname=${list_names[i]}
bed=${bed_names[i]}
gbed=${bed/exon\.bed/gene-dup.bed}
nbed=${gbed/\.bed/}
python3 -m src.visu \
--design data/designnew_exp_all_replicates_IP.txt \
--bw_folder data/bigwig_newnorm/ \
--region_bed results/bed_file/${gbed} \
--region_name ${cname} \
--output results/figures/ \
--border_name TSS TTS \
--environment 10000 25 \
--show_replicate n \
--figure_type metagene \
--nb_bin 100 \
--norm '0'
rm results/figures/metagene_${cname}_100bin_10000_nt-around-25-bin_b0_norm.pdf
python3 -m src.visu \
--design data/designnew_exp_all_replicates_IP.txt \
--bw_folder data/bigwig_newnorm/ \
--region_bed results/bed_file/${bed} \
--region_name ${cname} \
--output results/figures/ \
--border_name " " " " \
--environment 10000 25 \
--show_replicate n \
--figure_type metagene \
--nb_bin 30 \
-y 0.15 0.4 \
--stat True \
--norm "results/figures/coef_table/tmp_cov_table_designnew_exp_all_replicates_IP_${nbed}_100bin_10000_nt-around-25-bin_bin0_norm.txt" \
--stat True
mv results/figures/metagene_${cname}_30bin_10000_nt-around-25-bin_file_norm.pdf results/figures/all_replicates_metagene_${cname}_100bin_10000_nt-around-25-bin_file_norm.pdf
done
python3 -m src.gc_content -B results/bed_file/readthrough_last_exon_near_CTCF_2000_both_ddx_with0_exon.bed results/bed_file/readthrough_last_exon_far_CTCF_2000_both_ddx_with0_exon.bed results/bed_file/no_readthrough_expressed_last_exon.bed -b readthrough_ctcf readthrough no_readthrough -g data/Homo_sapiens.GRCh37.dna.primary_assembly.fa -f "exons" -e 2000
...@@ -21,7 +21,8 @@ def launcher(design: str, bw_folder: str, region_beds: List[str], ...@@ -21,7 +21,8 @@ def launcher(design: str, bw_folder: str, region_beds: List[str],
figure_type: str = 'metagene', norm: str = 'None', figure_type: str = 'metagene', norm: str = 'None',
show_replicate: str = 'y', environment: List[int] = (0, 0), show_replicate: str = 'y', environment: List[int] = (0, 0),
border_names: List[str] = ('', ''), border_names: List[str] = ('', ''),
output: str = '.', ylim: List[float] = (None, None)) -> None: output: str = '.', ylim: List[float] = (None, None),
stat: bool = False) -> None:
""" """
Create A metagene or a barplot figure from bigwig file on regions defined \ Create A metagene or a barplot figure from bigwig file on regions defined \
in the bed file provided with 'region_bed' parameter. in the bed file provided with 'region_bed' parameter.
...@@ -48,6 +49,9 @@ def launcher(design: str, bw_folder: str, region_beds: List[str], ...@@ -48,6 +49,9 @@ def launcher(design: str, bw_folder: str, region_beds: List[str],
:param border_names: The name of the borders :param border_names: The name of the borders
:param output: Folder where the results will be created :param output: Folder where the results will be created
:param ylim: The y-axis range (default None) :param ylim: The y-axis range (default None)
:param stat: A boolean indicating whether to perform \
statistical analysis. This parameter can only be set to True \
if figure_type is 'metagene' (default: False)
""" """
if ylim[0] is not None and (len(ylim) != 2 or ylim[0] >= ylim[1]): if ylim[0] is not None and (len(ylim) != 2 or ylim[0] >= ylim[1]):
raise ValueError("The ylim must have two values and the first value " raise ValueError("The ylim must have two values and the first value "
...@@ -59,6 +63,9 @@ def launcher(design: str, bw_folder: str, region_beds: List[str], ...@@ -59,6 +63,9 @@ def launcher(design: str, bw_folder: str, region_beds: List[str],
f"greater than the second") f"greater than the second")
show_rep = show_replicate.lower() == 'y' show_rep = show_replicate.lower() == 'y'
norm = int(norm) if norm.isdigit() else None if norm == 'None' else norm norm = int(norm) if norm.isdigit() else None if norm == 'None' else norm
if stat and figure_type != "metagene":
raise NotImplementedError(f"the stat parameter can only be used if "
f"figure type is metagene")
if isinstance(norm, str): if isinstance(norm, str):
norm = Path(norm) norm = Path(norm)
if not norm.is_file(): if not norm.is_file():
...@@ -66,7 +73,7 @@ def launcher(design: str, bw_folder: str, region_beds: List[str], ...@@ -66,7 +73,7 @@ def launcher(design: str, bw_folder: str, region_beds: List[str],
reg_beds = [Path(p) for p in region_beds] reg_beds = [Path(p) for p in region_beds]
create_figure(Path(design), Path(bw_folder), reg_beds, create_figure(Path(design), Path(bw_folder), reg_beds,
region_names, nb_bin, figure_type, norm, show_rep, region_names, nb_bin, figure_type, norm, show_rep,
environment, border_names, Path(output), ylim) environment, border_names, Path(output), ylim, stat)
launcher() launcher()
...@@ -18,6 +18,9 @@ from tqdm import tqdm ...@@ -18,6 +18,9 @@ from tqdm import tqdm
from loguru import logger from loguru import logger
import matplotlib as mpl import matplotlib as mpl
import matplotlib.font_manager import matplotlib.font_manager
from rpy2.robjects import r, pandas2ri, FloatVector
from matplotlib import collections as mc
import numpy as np
def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]: def load_bed(bed: Path, bed_name: str) -> List[List[Union[int, str]]]:
...@@ -259,12 +262,12 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int, ...@@ -259,12 +262,12 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
.mean() \ .mean() \
.reset_index() .reset_index()
if figure_type == "metagene": if figure_type == "metagene":
if environment[0] != 0:
df_sum['location'] = df_sum['bin'].apply(
lambda x: f"before_{region_name}" if x < 0 else
f"after_{region_name}" if x >= nb_bin else region_name)
df_sum, condition_col = merge_condition_region_col(df_sum) df_sum, condition_col = merge_condition_region_col(df_sum)
return df_sum, condition_col return df_sum, condition_col
if environment[0] != 0:
df_sum['location'] = df_sum['bin'].apply(
lambda x: f"before_{region_name}" if x < 0 else
f"after_{region_name}" if x >= nb_bin else region_name)
df_sum.drop('bin', axis=1, inplace=True) df_sum.drop('bin', axis=1, inplace=True)
if environment[0] != 0: if environment[0] != 0:
col_merge = ['condition', 'region', 'replicate', 'location'] col_merge = ['condition', 'region', 'replicate', 'location']
...@@ -291,11 +294,118 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int, ...@@ -291,11 +294,118 @@ def create_df_summary(df_cov: pd.DataFrame, figure_type: str, nb_bin: int,
return df_sum, condition_col return df_sum, condition_col
def line_maker(list_pval, up_value, position=0):
"""
Create a list of lines, with their colour.
Create a line if the a p-value in list_pval[i] is below 0.05.
If up_mean[i] > down_mean[i] the line will be green, purple else.
:param list_pval: (list of float), list of pvalue get by the comparison
of a propensity scale in a particular sequence position in an
up-regulated and down_regulated set of sequences.
:param up_value: (int) the ordinate coordinates where the line will be
placed.
:param position: (int) the abscissa position to begin to draw the lines
:return: lines - (list of list of 2 tuple), the list of 2 tuple corresponds
to a lines with the coordinates [(x1, y1), (x2, y2)]
"""
lcolor = []
lines = []
for i in range(len(list_pval)):
if list_pval[i] <= 0.05:
val = i + position
lines.append([(val - 0.5, up_value), (val + 0.5, up_value)])
lcolor.append("#000000") # red
return lines, lcolor
def paired_t_test(values1: List[float], values2: List[float]) -> float:
"""
Get the p-value of a paired t-test for each bin
:param values1: A list of values
:param values2: Another list of values
:return: The p-values of the paired t-test
>>> paired_t_test([1, 2, 8], [5, 8, 15])
0.02337551764357566
"""
if len(values1) != len(values2):
raise IndexError("values1 and values2 should have the same length")
ttp = r("""
function(values1, values2) {
return(t.test(values1, values2, paired=T)$p.value)
}
""")
return ttp(FloatVector(values1), FloatVector(values2))[0]
def compute_stats(dff: pd.DataFrame, y_line: float, group_col: str,
outfile: Path) -> Tuple[List[List[Tuple]], List]:
"""
:param dff: A dataframe containing the coverage displayed in the figure
:param y_line: The height of the p-value line
:param group_col: A group column
:param outfile: The file to save the stats
:return: A list of lines coordinates in the form of [(x1, y1), (x2, y2)], \
and the color associated to each line
"""
df = dff.sort_values(["bin", group_col], ascending=True)
groups = df[group_col].unique()
if len(groups) != 2:
raise NotImplementedError("Statistical analysis only implemented for "
"2 groups of data")
p_values_ttp = []
grp1 = []
vgrp1 = []
grp2 = []
vgrp2 = []
for bin in df["bin"].unique():
tmp = df[df["bin"] == bin]
values1 = tmp.loc[tmp[group_col] == groups[0], "coverage"].values
values2 = tmp.loc[tmp[group_col] == groups[1], "coverage"].values
p_values_ttp.append(paired_t_test(values1, values2))
grp1.append(np.mean(values1))
grp2.append(np.mean(values2))
vgrp1.append(";".join(map(str, [round(v, 3) for v in values1])))
vgrp2.append(";".join(map(str, [round(v, 3) for v in values2])))
stats_df = pd.DataFrame({"bin": df["bin"].unique(),
"p_values": p_values_ttp,
groups[0]: grp1, groups[1]: grp2,
f"val-{groups[0]}": vgrp1,
f"val-{groups[1]}": vgrp2,})
stats_df.to_csv(outfile, sep="\t", index=False)
return line_maker(p_values_ttp, y_line, min(df["bin"].unique()))
def display_stat(g: sns.FacetGrid, dff: pd.DataFrame, y_line: float,
group_col: str, stat: bool, outfile: Path) -> sns.FacetGrid:
"""
Update the graphics by displaying stats.
:param g: A seaborn FacetGrid objects corresponding to the metagene \
figure
:param dff: A dataframe containing the coverage displayed in the figure
:param y_line: The height of the p-value line
:param group_col: The column containing the groups analyzed
:param outfile: The file where the metagene will be saved
:return: The facetGrid with the stats
"""
if not stat:
return g
stat_file = outfile.parent / f"{outfile.stem}.txt"
lines, lcolor = compute_stats(dff, y_line, group_col, stat_file)
lc = mc.LineCollection(lines, colors=lcolor, linewidths=5)
g.ax.add_collection(lc)
return g
def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
border_names: List[str], nb_bin: int, border_names: List[str], nb_bin: int,
environment: List[int], bed_name: str, environment: List[int], bed_name: str,
output: Path, norm: Union[int, Path], output: Path, norm: Union[int, Path],
condition_col: str, ylim: Optional[List[float]]) -> None: condition_col: str, ylim: Optional[List[float]],
stat: bool = False) -> Path:
""" """
Create a metagene figure on the region of interest. Create a metagene figure on the region of interest.
...@@ -313,6 +423,8 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, ...@@ -313,6 +423,8 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
each samples each samples
:param condition_col: The name of the condition columns :param condition_col: The name of the condition columns
:param ylim: The range of the y axis :param ylim: The range of the y axis
:param stat: A boolean indicating wether to perform statistical analysis
:return: The name of the figure
""" """
font_files = matplotlib.font_manager.findSystemFonts(fontpaths=None, font_files = matplotlib.font_manager.findSystemFonts(fontpaths=None,
fontext='ttf') fontext='ttf')
...@@ -354,9 +466,13 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool, ...@@ -354,9 +466,13 @@ def figure_metagene(df_sum: pd.DataFrame, show_replicate: bool,
outfile_title += ".pdf" outfile_title += ".pdf"
if ylim[0] is not None: if ylim[0] is not None:
g.ax.set_ylim(ylim[0], ylim[1]) g.ax.set_ylim(ylim[0], ylim[1])
ymin, ymax = g.ax.get_ylim()
g = display_stat(g, df_sum, ymin + ((ymax - ymin) / 50), condition_col,
stat, output / outfile_title)
g.ax.tick_params(left=True, bottom=True) g.ax.tick_params(left=True, bottom=True)
g.savefig(output / outfile_title) g.savefig(output / outfile_title)
g.fig.clf() g.fig.clf()
return output / outfile_title
def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool, def figure_barplot(df_sum: pd.DataFrame, show_replicate: bool,
...@@ -445,6 +561,71 @@ def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path], ...@@ -445,6 +561,71 @@ def bin_normalisation(df: pd.DataFrame, norm: Union[int, Path],
return df return df
def compute_diff(df_sum: pd.DataFrame) -> pd.DataFrame:
"""
Compute the average coverage differences between two conditions.
:param df_sum: A dataframe containing the coverage values between \
two conditions
:return: The dataframe with a coolumn diff
>>> di = pd.DataFrame({'bin': [-25] * 6,
... 'condition': ['Ctrl', 'Ctrl', 'Ctrl', 'siDDX', 'siDDX', 'siDDX'],
... 'replicate': ['R1', 'R2', 'R3', 'R1', 'R2', 'R3'],
... 'coverage': [0.1,
... 0.2,
... 0.3,
... 0.15,
... 0.25,
... 0.35],
... 'location': ['before_ddx_down'] * 6})
>>> compute_diff(di)
bin location Ctrl siDDX diff
0 -25 before_ddx_down 0.2 0.25 0.05
"""
cond = df_sum["condition"].unique()
df_sum = df_sum.groupby(["bin", "location", "condition"]).mean()
if len(cond) != 2:
raise IndexError("Only two different conditions must be given to "
"perform stats")
df = df_sum.pivot_table(index=["bin", "location"],
columns="condition",
values="coverage").reset_index()
df.columns.name = None
df.groupby(["bin", "location"])
df["diff"] = df[cond[1]] - df[cond[0]]
return df
def get_shapiro_an_lm_pvalue(df: pd.DataFrame, location: str) -> pd.Series:
"""
Return a series containing the shapiro and the t-test p-value of \
the difference in coverage between 2 conditions at a particular location
:param df: A dataframe containing the difference in coverage \
between 2 conditions at a particular location
:param location: The region of interest
:return: a series containing the shapiro and the t-test p-value of \
the difference in coverage between 2 conditions at a particular location
"""
df2 = df[df["location"] == location].copy()
df2.to_csv(f"{location}.txt", sep="\t", index=False)
pandas2ri.activate()
stat_s = r("""
function(data){
mod <- lm(diff ~ 1, data=data)
s <- summary(mod)
stat <- s$coefficients[1, "Pr(>|t|)"]
shapiro <- shapiro.test(data$diff)$p.value
return(c(shapiro, stat))
}
""")
res = stat_s(df2)
return pd.Series({"shapiro_p-value": res[0],
"t-test-p-value": res[1],
"location": location})
def create_figure(design: Path, bw_folder: Path, region_beds: List[Path], def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
bed_names: List[str], nb_bin: int = 100, bed_names: List[str], nb_bin: int = 100,
figure_type: str = 'metagene', figure_type: str = 'metagene',
...@@ -452,7 +633,8 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path], ...@@ -452,7 +633,8 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
show_replicate: bool = True, environment: List[int] = (0, 0), show_replicate: bool = True, environment: List[int] = (0, 0),
border_names: List[str] = ('', ''), border_names: List[str] = ('', ''),
output: Path = Path('.'), output: Path = Path('.'),
ylim: List[float] = (None, None)) -> None: ylim: List[float] = (None, None), stat: bool = False
) -> None:
""" """
Create A metagene or a barplot figure from bigwig file on regions defined \ Create A metagene or a barplot figure from bigwig file on regions defined \
in the bed file provided with 'region_bed' parameter. in the bed file provided with 'region_bed' parameter.
...@@ -479,6 +661,8 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path], ...@@ -479,6 +661,8 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
:param border_names: The name of the borders :param border_names: The name of the borders
:param output: Folder where the results will be created :param output: Folder where the results will be created
:param ylim: The range of y-axis :param ylim: The range of y-axis
:param stat: A boolean indicating whether or not to perform \
statistical analysis
""" """
if len(region_beds) != len(bed_names): if len(region_beds) != len(bed_names):
raise IndexError("Parameter region_beds and bed_names should " raise IndexError("Parameter region_beds and bed_names should "
...@@ -509,8 +693,9 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path], ...@@ -509,8 +693,9 @@ def create_figure(design: Path, bw_folder: Path, region_beds: List[Path],
ordered_condition, bed_names) ordered_condition, bed_names)
if figure_type == "metagene": if figure_type == "metagene":
figure_metagene(df_sum, show_replicate, border_names, nb_bin, figure_metagene(df_sum, show_replicate, border_names, nb_bin,
environment, region_kind, output, norm, cond_col, environment, region_kind, output, norm,
ylim) cond_col, ylim, stat)
else: else:
if 'location' in df_sum.columns: if 'location' in df_sum.columns:
for cur_region in df_sum['location'].unique(): for cur_region in df_sum['location'].unique():
......