diff --git a/src/modules/statistical_analysis/script/glm_stat.py b/src/modules/statistical_analysis/script/glm_stat.py
index 03ce374d68b8a23e3213de9413f56b1e52464ddc..e87fa2471a2803c8233cb3ca301f5959461a7649 100644
--- a/src/modules/statistical_analysis/script/glm_stat.py
+++ b/src/modules/statistical_analysis/script/glm_stat.py
@@ -79,13 +79,15 @@ def glm_beta(dataframe: pd.DataFrame, snp: str,
     {diag_str}
     tuk <- emmeans(mlm, ~ condition)
     res <- summary(contrast(tuk, method = "pairwise",
-                            adjust = "fdr", by = NULL))
+                            adjust = "none", by = NULL))
     res <- as.data.frame(res)
+    res[["padj"]] <- p.adjust(res[["p.value"]], method="fdr")
     return(res)
 }}
 """)
     df_stats = stat_s(dataframe, str(output_diag))
     pandas2ri.deactivate()
+    df_stats.insert(1, "snp", [snp] * df_stats.shape[0])
     df_stats.to_csv(f"{output_stat}/{snp}_stat.txt", sep="\t",
                     index=False)
     return df_stats
@@ -125,13 +127,15 @@ def glm_binom(dataframe: pd.DataFrame, snp: str,
     {diag_str}
     tuk <- emmeans(mlm, ~ condition)
     res <- summary(contrast(tuk, method = "pairwise",
-                            adjust = "fdr", by = NULL))
+                            adjust = "none", by = NULL))
     res <- as.data.frame(res)
+    res[["padj"]] <- p.adjust(res[["p.value"]], method="fdr")
     return(res)
 }}
 """)
     df_stats = stat_s(dataframe, str(output_diag))
     pandas2ri.deactivate()
+    df_stats.insert(1, "snp", [snp] * df_stats.shape[0])
     df_stats.to_csv(f"{output_stat}/{snp}_stat.txt", sep="\t",
                     index=False)
     return df_stats
diff --git a/src/modules/statistical_analysis/script/sample_reads.py b/src/modules/statistical_analysis/script/sample_reads.py
index 2e55de2f14d8db9a1717026c244b04cc7bd7baf1..13368b875c5a25be7914dfcebe19068d1865eb72 100644
--- a/src/modules/statistical_analysis/script/sample_reads.py
+++ b/src/modules/statistical_analysis/script/sample_reads.py
@@ -42,6 +42,7 @@ def get_mapped_read(mfile: Path) -> Tuple[str, int]:
                             "unmapped"])
     return mfile.stem.replace(
         "_aligned_sorted_snodup", "").replace(
+        "_aligned_sorted_sorted", "").replace(
         "_aligned_sorted", "").replace("_stats", ""), df["mapped"].sum()
 
 
@@ -58,7 +59,9 @@ def get_stats_table(content: List[Tuple[str, int]]) -> pd.DataFrame:
     df["frac"] = round(min_val / df["mapped_reads"], 8)
     df["to_subsample"] = ["no"] * df.shape[0]
     df.loc[(df["frac"] < 1) & (df["frac"] > 0), "to_subsample"] = "yes"
-    return df
+    df["frac"] = df["frac"].astype(str)
+    df["frac"] = df["frac"].str.replace('0.', '', regex=False)
+    return df.sort_values("samples")
 
 
 def produce_figure(df: pd.DataFrame, output: Path) -> None:
@@ -72,10 +75,11 @@ def produce_figure(df: pd.DataFrame, output: Path) -> None:
     g = sns.catplot(
         data=df, x="samples", y="mapped_reads", height=12, aspect=1.7,
         kind="bar", color="lightgrey", edgecolor="black")
-    g.ax.set_yscale("log")
     g.set_xticklabels(rotation=90)
     g.set_axis_labels("", "# Mapped Read")
     g.savefig(output / "mapped_read.pdf")
+    g.ax.set_yscale("log")
+    g.savefig(output / "mapped_read_log.pdf")
     df.to_csv(output / "mapped_read.txt", sep="\t", index=False)
 
 
diff --git a/src/modules/statistical_analysis/script/variant_counts.py b/src/modules/statistical_analysis/script/variant_counts.py
index 61fd971ebf63cff3d0ae25d1afe8c9ef74306b33..23e71f28abbcc889c5fbbb7aa914569c335388bb 100644
--- a/src/modules/statistical_analysis/script/variant_counts.py
+++ b/src/modules/statistical_analysis/script/variant_counts.py
@@ -11,6 +11,7 @@ from pathlib import Path
 from typing import List
 import pandas as pd
 from variant_frequencies import merge_dataframe
+from statsmodels.stats.multitest import multipletests
 from glm_stat import glm_binom, compute_barplot_count, \
     build_relative_figure
 
@@ -131,12 +132,21 @@ def make_count_analysis(design: pd.DataFrame,
     build_relative_figure(df_sum, output_rel, ctrl, title, False)
     build_relative_figure(df_sum, output_rel, ctrl, title, True)
     list_snp = df_sum["snp"].unique().tolist()
+    df_stat_lists = []
     for snp in list_snp:
         tmp_df = df_sum[df_sum["snp"] == snp].copy()
+        if len(tmp_df["condition"].unique()) < 2:
+            continue
         compute_barplot_count(tmp_df.copy(), output_fig, snp)
         rep = len(tmp_df["replicate"].unique()) > 1
         tmp_df = format_df(tmp_df)
-        glm_binom(tmp_df, snp, output_stat, output_stat, rep, False)
+        df_stat_lists.append(glm_binom(tmp_df, snp, output_stat, output_stat,
+                                       rep, False))
+    df_stats_final = pd.concat(df_stat_lists, axis=0, ignore_index=True)
+    df_stats_final["padj"] = list(multipletests(df_stats_final["p.value"],
+                                                method='fdr_bh')[1])
+    df_stats_final.to_csv(output_stat / "recap_stats.txt", sep="\t",
+                          index=False)
 
 
 if __name__ == "__main__":
diff --git a/src/modules/statistical_analysis/script/variant_frequencies.py b/src/modules/statistical_analysis/script/variant_frequencies.py
index 15bec5171f74df94bae66656a865dab57e0ad64e..c77dd50cf6f65372bd916c2b34f7bf14a8252593 100644
--- a/src/modules/statistical_analysis/script/variant_frequencies.py
+++ b/src/modules/statistical_analysis/script/variant_frequencies.py
@@ -14,6 +14,7 @@ from pathlib import Path
 from typing import List
 import pandas as pd
 import statsmodels.api as sm
+from statsmodels.stats.multitest import multipletests
 import plotly.express as px
 from glm_stat import glm_beta, build_relative_figure
@@ -135,6 +136,11 @@ def flatten(df: pd.DataFrame) -> pd.DataFrame:
                 [counts[0], counts[i + 1]])
             t["count"] = ncount
             rows.append(t)
+    df = pd.DataFrame(rows)
+    if df.empty:
+        return pd.DataFrame(columns=["CHROM", "POS", "REF",
+                                     "ALT", "depth", "freq_alt", "count",
+                                     "sample"])
     return pd.DataFrame(rows).drop("falts", axis=1)
 
 
@@ -196,6 +202,10 @@ def prepare_df(df: pd.DataFrame, design: pd.DataFrame,
     """
     df = filter_on_alternative_allele(df, alt, ref)
     df = flatten(df)
+    if df.empty:
+        return pd.DataFrame(columns=["CHROM", "POS", "REF",
+                                     "ALT", "depth", "freq_alt", "count",
+                                     "sample", "condition", "replicate"])
     df = filter_on_alternative_allele(df, alt, ref)
     return merge_dataframe(df, design)
@@ -425,12 +435,15 @@ def make_frequency_analysis(folder: Path, design: pd.DataFrame,
     out_txt = output_figure / "tables"
     out_txt.mkdir(exist_ok=True)
     snp_order = []
+    df_stat_lists = []
     for ref, alt in product(
             list("ATGC"),
             ["A", "T", "G", "C", "A*", "T*", "G*", "C*", "+", "*"]):
         if ref != alt.strip("*"):
             print(ref, alt)
             df = variant_loading(folder, design, ref, alt)
+            if len(df["condition"].unique()) < 2:
+                continue
             diag = (ref, alt) in [
                 ("A", "T"),
                 ("A", "T*"),
@@ -438,10 +451,15 @@ def make_frequency_analysis(folder: Path, design: pd.DataFrame,
                 ("A", "*"),
             ]
             rep = len(df["replicate"].unique()) > 1
-            glm_beta(df, f"{ref}>{alt}", output_diag,
-                     output_stats, rep, diag)
+            df_stat_lists.append(glm_beta(df, f"{ref}>{alt}", output_diag,
+                                          output_stats, rep, diag))
             create_freqfigures(df, output_figure, out_txt, ref, alt)
             snp_order.append(f"{ref}>{alt}")
+    df_stats_final = pd.concat(df_stat_lists, axis=0, ignore_index=True)
+    df_stats_final["padj"] = list(multipletests(df_stats_final["p.value"],
+                                                method='fdr_bh')[1])
+    df_stats_final.to_csv(output_stats / "recap_stats.txt", sep="\t",
+                          index=False)
     del df
     if ctrl != "":
         df_sum = load_tables(out_txt, snp_order)
diff --git a/src/pipelines/nf_modules/analysis.nf b/src/pipelines/nf_modules/analysis.nf
index b92b9939044f81d3401ed6c89ef6fcbb995e4cb8..2e6925f1898f4b76e8b98a26d29d67fb41f773d7 100644
--- a/src/pipelines/nf_modules/analysis.nf
+++ b/src/pipelines/nf_modules/analysis.nf
@@ -38,7 +38,7 @@ process sample_size {
     path stats
 
   output:
-    path "mapped_read.pdf", emit: figure
+    path "mapped_read*.pdf", emit: figure
     path "mapped_read.txt", emit: report
 
   script:
diff --git a/src/pipelines/nf_modules/picard.nf b/src/pipelines/nf_modules/picard.nf
index e0c530de62b3b69c0b114e3aa3fe819637f8a5ae..f831f21f6f9f4f66dc01a2759a8c0b4117e2b8be 100644
--- a/src/pipelines/nf_modules/picard.nf
+++ b/src/pipelines/nf_modules/picard.nf
@@ -17,6 +17,11 @@ process picard_mark_duplicate {
     path "*_report.dupinfo.txt", emit: report
 
   script:
+  if (bam instanceof List){
+    bam = bam[0]
+  } else {
+    bam = bam
+  }
   """
   PicardCommandLine MarkDuplicates \
     ${params.mark_duplicate} \
diff --git a/src/pipelines/nf_modules/samtools.nf b/src/pipelines/nf_modules/samtools.nf
index 7c22077f5f60a740e47c6365b8174f5b9a5184af..3a41ef0e9202ed02668841e18a34a8867b58f92e 100644
--- a/src/pipelines/nf_modules/samtools.nf
+++ b/src/pipelines/nf_modules/samtools.nf
@@ -96,8 +96,8 @@ process subsampling {
   echo "\$f"
   frac=`grep \$f ${size_report} | grep "yes" | cut -f 3`
   if [ -n "\${frac}" ]; then
-    echo "samtools view --subsample-seed 1 --subsample \${frac} ${bam} -b -o ${file_id}_subsampled.bam"
-    samtools view -s \${frac} ${bam} -b -o ${file_id}_subsampled.bam
+    echo "samtools view -s \${RANDOM}.\${frac} ${bam} -b -o ${file_id}_subsampled.bam"
+    samtools view -s \${RANDOM}.\${frac} ${bam} -b -o ${file_id}_subsampled.bam
   else
     cp ${bam} ${file_id}_subsampled.bam
   fi
diff --git a/src/pipelines/variant_calling.nf b/src/pipelines/variant_calling.nf
index ff9f3f1f079feb36bf06e7ee520ae8c02e03b9cd..613a490466dfabcc2e05a9cc113ca495a5c3a151 100644
--- a/src/pipelines/variant_calling.nf
+++ b/src/pipelines/variant_calling.nf
@@ -14,6 +14,11 @@ nextflow.enable.dsl=2
 params.bam = "" // data/tiny_dataset/map/*.bam
 /*
 List of bam file to analyze
+If the parameter params.rmdup is false, then the input should contain BAM files
+and their index, for example: *_sorted{.bam,.bam.bai}
+
+If params.rmdup is true, then only submit BAM files.
+
 @type: Files
 */
@@ -50,7 +55,7 @@ params.call = "-A -V indels -m -O b"
 */
 
-params.filter = "-i 'QUAL>=10 && DP>=200' -O b"
+params.filter = "-i 'QUAL>=10 && DP>=500' -O b"
 /*
 The parameter used in bcftools view command
 @type: String
 */
@@ -69,7 +74,7 @@ params.folder = "project"
 @Type: String
 */
-params.min_depth = 200
+params.min_depth = 500
 /*
 The minimum depth to consider bases of interest
 @Type: int
 */
@@ -229,7 +234,6 @@ workflow {
   } else {
     bam_file.set { new_bam_file }
   }
-
   idxstats_first(new_bam_file)
   sample_size_first(idxstats_first.out.stats.collect())
   if (params.subsample) {
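
Note on the multiple-testing change in glm_stat.py, variant_counts.py and
variant_frequencies.py: emmeans previously adjusted p-values within each SNP
(adjust = "fdr"); the patch instead keeps raw per-contrast p-values
(adjust = "none"), concatenates the results of every SNP, and applies one
Benjamini-Hochberg correction across the whole family of tests written to
recap_stats.txt. A minimal sketch of that pattern, using a hypothetical toy
frame in place of the concatenated glm_beta/glm_binom outputs:

    # Sketch only: toy p-values stand in for the real per-SNP GLM results.
    import pandas as pd
    from statsmodels.stats.multitest import multipletests

    df_stats_final = pd.DataFrame({
        "snp": ["A>T", "A>G", "C>T"],
        "p.value": [0.001, 0.040, 0.200],
    })
    # One BH correction over all pooled tests, as in recap_stats.txt above.
    df_stats_final["padj"] = multipletests(
        df_stats_final["p.value"], method="fdr_bh")[1]
    print(df_stats_final)

Correcting once over the pooled p-values controls the false discovery rate
across all SNP comparisons, rather than separately within each SNP.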
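
Note on the subsampling change in samtools.nf and sample_reads.py: samtools
view -s takes a single number whose integer part seeds the RNG and whose
fractional digits give the fraction of reads to keep, which is why
get_stats_table strips the leading "0." from frac so the shell can assemble
\${RANDOM}.\${frac}. A short sketch of that packing, assuming the 8-digit
fraction computed above (build_subsample_arg is an illustrative helper, not
part of the pipeline):

    # Sketch only: mirrors round(min_val / mapped, 8) and the '0.' strip above.
    import random

    def build_subsample_arg(frac: float, seed: int) -> str:
        digits = f"{frac:.8f}".replace("0.", "", 1)  # 0.12345678 -> "12345678"
        return f"{seed}.{digits}"                    # -> "<seed>.12345678"

    arg = build_subsample_arg(0.12345678, random.randint(0, 32767))
    print(f"samtools view -s {arg} in.bam -b -o out_subsampled.bam")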