diff --git a/bin/hicstuff_bam2pairs.py b/bin/hicstuff_bam2pairs.py index e626dca8606e18f08e5d8950be705ea6ebe31506..744b5875d390a681cfc2bce8417ab7d0198ea608 100755 --- a/bin/hicstuff_bam2pairs.py +++ b/bin/hicstuff_bam2pairs.py @@ -6,9 +6,13 @@ import pysam as ps import pandas as pd import itertools from hicstuff_log import logger +import hicstuff_log as hcl import hicstuff_io as hio import hicstuff_digest as hcd +import hicstuff_stats as hcs from Bio import SeqIO +import logging +from hicstuff_version import __version__ @@ -135,6 +139,16 @@ def bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual=30): ) ) +def generate_log_header(log_path, input1, input2, genome, enzyme): + hcl.set_file_handler(log_path, formatter=logging.Formatter("")) + logger.info("## hicstuff: v%s log file", __version__) + logger.info("## date: %s", time.strftime("%Y-%m-%d %H:%M:%S")) + logger.info("## enzyme: %s", str(enzyme)) + logger.info("## input1: %s ", input1) + logger.info("## input2: %s", input2) + logger.info("## ref: %s", genome) + logger.info("---") + #hcl.set_file_handler(log_path, formatter=hcl.logfile_formatter) if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -171,6 +185,11 @@ if __name__ == "__main__": elif enzyme == "arima": enzyme = ["DpnII","HinfI"] + log_file = "hicstuff_pairs.log" + sys.stderr = open ("hicstuff_pairs.log", "wt") + hcl.set_file_handler(log_file) + generate_log_header(log_file, bam1, bam2, fasta, enzyme) + bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual) restrict_table = {} diff --git a/bin/hicstuff_build_matrix.py b/bin/hicstuff_build_matrix.py index 4ac69c15e4d42a22b3f69a5ab8b03dd5d6224d24..f99700bbc648ced284c5e64a499d4eecd74e8cba 100755 --- a/bin/hicstuff_build_matrix.py +++ b/bin/hicstuff_build_matrix.py @@ -11,6 +11,7 @@ from hicstuff_log import logger import hicstuff_io as hio import hicstuff_digest as hcd from Bio import SeqIO +import hicstuff_log as hcl def pairs2matrix( @@ -163,6 +164,24 @@ if __name__ == "__main__": mat_format = args.type mat_out = args.output + log_file = "hicstuff_matrix.log" + sys.stderr = open ("hicstuff_matrix.log", "wt") + hcl.set_file_handler(log_file) + + # Log which pairs file is being used and how many pairs are listed + pairs_count = 0 + with open(pairs, "r") as file: + for line in file: + if line.startswith('#'): + continue + else: + pairs_count += 1 + logger.info( + "Generating matrix from pairs file %s (%d pairs in the file) ", + pairs, pairs_count + ) + + if mat_format == "cool": # Name matrix file in .cool cool_file = os.path.splitext(mat_out)[0] + ".cool" diff --git a/bin/hicstuff_filter.py b/bin/hicstuff_filter.py index 23b74245b8440ab3bbca398618f405d629f3d610..62642ac6a605697dfeafea77f99dba0f99a0560b 100755 --- a/bin/hicstuff_filter.py +++ b/bin/hicstuff_filter.py @@ -22,6 +22,8 @@ from collections import OrderedDict import matplotlib.pyplot as plt from hicstuff_log import logger import argparse +import logging +import hicstuff_log as hcl def process_read_pair(line): @@ -553,6 +555,10 @@ if __name__ == "__main__": else: pie_plot = prefix+"_"+pie_plot + log_file = "hicstuff_filter.log" + sys.stdout = open ("hicstuff_filter.log", "wt") + hcl.set_file_handler(log_file) + uncut_thr, loop_thr = get_thresholds( pairs_idx, plot_events=plot, fig_path=dist_plot, prefix=prefix ) diff --git a/bin/hicstuff_stats.py b/bin/hicstuff_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..7fd1976162025e42db6c14c013473a6890716e24 --- /dev/null +++ b/bin/hicstuff_stats.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import re +import os +from os.path import basename +from os.path import splitext +import json + +def get_pipeline_stats(log_file): + """Get stats after pipeline execution. + + Parameters + ---------- + log_file : str + Path to hicstuff log file. + + Returns + ------- + list: + A single list containing stats about hicstuff pipeline execution: + - Number of sequenced pairs from fastqs + - Number (% of total reads) of unmapped reads + - Number (% of total reads) of mapped reads + - Number (% of total reads) of pairs + - Number (% of total pairs) of filtered pairs + - Number (% of total pairs) of loops + - Number (% of total pairs) of uncuts + - Number (% of total pairs) of abnormal (-- and ++) + - Number (% of total pairs) of deduplicated pairs [Number (% of total pairs) of PCR duplicates] + - From all pairs used in contact matrix: + - Number (% of matrix) of cis pairs + - Number (% of matrix) of trans pairs + - Trans ratio + """ + + with open(log_file) as file: + log_lines = [line.rstrip() for line in file] + + # 1. Number of sequenced pairs from fastqs + fastq_pairs = [s for s in log_lines if re.search("reads found in each", s)][0] + fastq_pairs = re.sub(".*INFO :: ", "", fastq_pairs) + fastq_pairs = re.sub(" reads found in each.*", "", fastq_pairs) + fastq_pairs = int(float(fastq_pairs)) + + # 2. Number (% of total) of (un)mapped reads + tot_mapped = [s for s in log_lines if re.search("mapped with Q ", s)][0] + tot_mapped = re.sub(".*Q >= 30 \(", "", tot_mapped) + tot_mapped = re.sub("/.*", "", tot_mapped) + tot_mapped = int(float(tot_mapped)) + tot_unmapped = fastq_pairs*2 - tot_mapped + + # 3. Number (% of total) of pairs + tot_pairs = [s for s in log_lines if re.search("pairs successfully mapped", s)][0] + tot_pairs = re.sub(".*INFO :: ", "", tot_pairs) + tot_pairs = re.sub(" pairs successfully.*", "", tot_pairs) + tot_pairs = int(float(tot_pairs)) + + # 4. Number (% of total) of filtered pairs + filtered_pairs = [s for s in log_lines if re.search("pairs discarded:", s)] + if (len(filtered_pairs) > 0): + filtered_pairs = filtered_pairs[0] + loops_pairs = re.sub(".*Loops: ", "", filtered_pairs) + loops_pairs = re.sub(", Uncuts:.*", "", loops_pairs) + loops_pairs = int(float(loops_pairs)) + uncuts_pairs = re.sub(".*Uncuts: ", "", filtered_pairs) + uncuts_pairs = re.sub(", Weirds:.*", "", uncuts_pairs) + uncuts_pairs = int(float(uncuts_pairs)) + abnormal_pairs = re.sub(".*Weirds: ", "", filtered_pairs) + abnormal_pairs = int(float(abnormal_pairs)) + filtered_pairs = re.sub(".*INFO :: ", "", filtered_pairs) + filtered_pairs = re.sub(" pairs discarded.*", "", filtered_pairs) + filtered_pairs = int(float(filtered_pairs)) + else: + loops_pairs=0 + uncuts_pairs=0 + abnormal_pairs=0 + filtered_pairs=0 + + # 5. Number (% of total) of PCR duplicates pairs + pcr_pairs = [s for s in log_lines if re.search("PCR duplicates have been filtered", s)] + if (len(pcr_pairs) > 0): + pcr_pairs = pcr_pairs[0] + pcr_pairs = re.sub(".*have been filtered out \(", "", pcr_pairs) + pcr_pairs = re.sub(" / .*", "", pcr_pairs) + pcr_pairs = int(float(pcr_pairs)) + else: + pcr_pairs = 0 + + # 6. Number (%) of final pairs + removed_pairs=filtered_pairs+pcr_pairs + final_pairs=tot_pairs-removed_pairs + + # 7. Final stats + stats = { + 'Sample': re.sub(".hicstuff.*", "", basename(log_file)), + 'Total read pairs': fastq_pairs, + 'Mapped reads': tot_mapped, + 'Unmapped reads': tot_unmapped, + 'Recovered contacts': tot_pairs, + 'Final contacts': final_pairs, + 'Removed contacts': removed_pairs, + 'Filtered out': filtered_pairs, + 'Loops': loops_pairs, + 'Uncuts': uncuts_pairs, + 'Weirds': abnormal_pairs, + 'PCR duplicates': pcr_pairs + } + + return(stats) + +def write_pipeline_stats(stats, out_file): + prefix = stats['Sample'] + fastq_pairs=stats['Total read pairs'] + tot_mapped=stats['Mapped reads'] + tot_unmapped=stats['Unmapped reads'] + tot_pairs=stats['Recovered contacts'] + final_pairs=stats['Final contacts'] + removed_pairs=stats['Removed contacts'] + filtered_pairs=stats['Filtered out'] + loops_pairs=stats['Loops'] + uncuts_pairs=stats['Uncuts'] + abnormal_pairs=stats['Weirds'] + pcr_pairs=stats['PCR duplicates'] + pct_pairs = round(tot_pairs/fastq_pairs*100, 2) + pct_mapped = round(tot_mapped/(fastq_pairs*2)*100, 2) + pct_unmapped = round(tot_unmapped/(fastq_pairs*2)*100, 2) + pct_filtered = round(filtered_pairs/tot_pairs*100, 2) + pct_loops_pairs = round(loops_pairs/tot_pairs*100, 2) + pct_uncuts_pairs = round(uncuts_pairs/tot_pairs*100, 2) + pct_abnormal_pairs = round(abnormal_pairs/tot_pairs*100, 2) + pct_pcr = round(pcr_pairs/tot_pairs*100, 2) + pct_removed=round(removed_pairs/tot_pairs*100, 2) + pct_final = round(final_pairs/tot_pairs*100, 2) + + # Open the log file and read its contents + _, file_extension = splitext(out_file) + if (file_extension == '.json'): + with open(out_file, 'w') as json_file: + json.dump(stats, json_file, indent=4) + else: + with open(out_file, 'w') as stats_file: + stats_file.write("Sample: {prefix}\n".format(prefix = prefix)) + stats_file.write("Total read pairs: {reads}\n".format(reads = "{:,}".format(fastq_pairs))) + stats_file.write("Mapped reads: {tot_mapped} ({pct_mapped}% of all reads)\n".format( + tot_mapped = "{:,}".format(tot_mapped), + pct_mapped = pct_mapped + ) + ) + stats_file.write("Unmapped reads: {tot_unmapped} ({pct_unmapped}% of all reads)\n".format( + tot_unmapped = "{:,}".format(tot_unmapped), pct_unmapped = pct_unmapped + )) + stats_file.write("Recovered contacts: {tot_pairs} ({pct_pairs}% of all read pairs)\n".format( + tot_pairs = "{:,}".format(tot_pairs), pct_pairs = pct_pairs + )) + stats_file.write("Removed contacts: {removed_pairs} ({pct_removed}% of all contacts)\n".format( + removed_pairs = "{:,}".format(removed_pairs), pct_removed = pct_removed + )) + stats_file.write(" Filtered out: {filtered_pairs} ({pct_filtered}% of all contacts)\n".format( + filtered_pairs = "{:,}".format(filtered_pairs), pct_filtered = pct_filtered + )) + stats_file.write(" Loops: {loops_pairs} ({pct_loops_pairs}% of all contacts)\n".format( + loops_pairs = "{:,}".format(loops_pairs), pct_loops_pairs = pct_loops_pairs + )) + stats_file.write(" Uncuts: {uncuts_pairs} ({pct_uncuts_pairs}% of all contacts)\n".format( + uncuts_pairs = "{:,}".format(uncuts_pairs), pct_uncuts_pairs = pct_uncuts_pairs + )) + stats_file.write(" Weirds: {abnormal_pairs} ({pct_abnormal_pairs}% of all contacts)\n".format( + abnormal_pairs = "{:,}".format(abnormal_pairs), pct_abnormal_pairs = pct_abnormal_pairs + )) + stats_file.write(" PCR duplicates: {pcr_pairs} ({pct_pcr}% of all contacts)\n".format( + pcr_pairs = "{:,}".format(pcr_pairs), pct_pcr = pct_pcr + )) + stats_file.write("Final contacts: {final_pairs} ({pct_final}% of all contacts)\n".format( + final_pairs = "{:,}".format(final_pairs), pct_final = pct_final + )) + +def print_pipeline_stats(stats): + tmp = '.'+stats['Sample']+'.txt' + write_pipeline_stats(stats, tmp) + with open(tmp, 'r') as file: + lines = [line for line in file] + print(''.join(lines)) + os.unlink(tmp) diff --git a/bin/hicstuff_version.py b/bin/hicstuff_version.py new file mode 100644 index 0000000000000000000000000000000000000000..29e4a9413d67347053bffff8555e4e60ec644686 --- /dev/null +++ b/bin/hicstuff_version.py @@ -0,0 +1 @@ +__version__ = '3.2.2' diff --git a/conf/modules.config b/conf/modules.config index 2001ddda18158c6e6bb4ee4e4e41ec34f9b84bb4..08075e72051b2f8ef3f2a8f5bd3ac20c871c9e17 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -212,9 +212,16 @@ process { " -c ${params.hicstuff_circular}" ].join('').trim() } publishDir = [ - path: { "${params.outdir}/hicstuff/pairs" }, - mode: 'copy', - enabled: params.save_pairs + [ + path: { "${params.outdir}/hicstuff/pairs" }, + mode: 'copy', + enabled: params.save_pairs + ], + [ + path: { "${params.outdir}/hicstuff/log" }, + mode: 'copy', + pattern: '*.log' + ] ] } @@ -226,9 +233,16 @@ process { " -q ${params.hicstuff_pie_plot}" ].join('').trim() } publishDir = [ - path: { "${params.outdir}/hicstuff/pairs" }, - mode: 'copy', - enabled: params.save_pairs_intermediates + [ + path: { "${params.outdir}/hicstuff/pairs" }, + mode: 'copy', + enabled: params.save_pairs_intermediates + ], + [ + path: { "${params.outdir}/hicstuff/log" }, + mode: 'copy', + pattern: '*.log' + ] ] } @@ -263,15 +277,29 @@ process { withName: 'BUILD_MATRIX' { ext.args = params.hicstuff_matrix publishDir = [ - path: { "${params.outdir}/hicstuff/matrix" }, - mode: 'copy' + [ + path: { "${params.outdir}/hicstuff/matrix" }, + mode: 'copy' + ], + [ + path: { "${params.outdir}/hicstuff/log" }, + mode: 'copy', + pattern: '*.log' + ] ] } withName: 'BUILD_MATRIX_COOL' { ext.args = params.hicstuff_matrix publishDir = [ - path: { "${params.outdir}/hicstuff/matrix" }, - mode: 'copy' + [ + path: { "${params.outdir}/hicstuff/matrix" }, + mode: 'copy' + ], + [ + path: { "${params.outdir}/hicstuff/log" }, + mode: 'copy', + pattern: '*.log' + ] ] } diff --git a/modules/local/hicstuff/bam2pairs.nf b/modules/local/hicstuff/bam2pairs.nf index fba720deb96aae3446c3c4fb78ed524b28bea477..d4f279d6030db15ee66b3cb9d57e048794dac3c0 100644 --- a/modules/local/hicstuff/bam2pairs.nf +++ b/modules/local/hicstuff/bam2pairs.nf @@ -14,6 +14,7 @@ process BAM2PAIRS { output: tuple val(meta1), path("*_valid.pairs"), emit: valid_pairs tuple val(meta1), path("*_valid_idx.pairs"), emit: idx_pairs + path "*.log", emit: log_file script: @@ -23,6 +24,8 @@ process BAM2PAIRS { """ hicstuff_bam2pairs.py -b1 ${bam1} -b2 ${bam2} -i ${info_contigs} -e ${digestion} -f ${fasta} -o ${meta1.id}_${pairs} -x ${meta1.id}_${index} ${args} + + mv hicstuff_pairs.log hicstuff_${meta1.id}_pairs.log """ } diff --git a/modules/local/hicstuff/build_matrix.nf b/modules/local/hicstuff/build_matrix.nf index a7abfedefabec3410a1254c91fc597336e2453d0..5351c83d983fd9241b8185dd62d746bce273f453 100644 --- a/modules/local/hicstuff/build_matrix.nf +++ b/modules/local/hicstuff/build_matrix.nf @@ -11,6 +11,7 @@ process BUILD_MATRIX { output: tuple val(meta), path("${meta1.id}_*"), emit: matrix + path "*.log", emit: log_file script: @@ -22,5 +23,6 @@ process BUILD_MATRIX { hicstuff_build_matrix.py -p ${args}.pre.pairs -f ${fragments_list} -t graal -o ${args} mv ${args} ${meta1.id}_${args} + mv hicstuff_matrix.log hicstuff_${meta1.id}_matrix_sparse.log """ } diff --git a/modules/local/hicstuff/build_matrix_cool.nf b/modules/local/hicstuff/build_matrix_cool.nf index b39f66901425ce46e88b64a702790f04acb992cb..b63e931e23cef0bde43f8132df7ecc5de6fe6470 100644 --- a/modules/local/hicstuff/build_matrix_cool.nf +++ b/modules/local/hicstuff/build_matrix_cool.nf @@ -11,6 +11,7 @@ process BUILD_MATRIX_COOL { output: tuple val(meta), path("${meta1.id}_*.cool"), emit: matrix + path "*.log", emit: log_file script: @@ -21,5 +22,6 @@ process BUILD_MATRIX_COOL { hicstuff_build_matrix.py -p ${idx_pairs} -f ${fragments_list} -t cool -o ${args} mv ${base}.cool ${meta1.id}_${base}.cool + mv hicstuff_matrix.log hicstuff_${meta1.id}_matrix_cooler.log """ } diff --git a/modules/local/hicstuff/filter_event.nf b/modules/local/hicstuff/filter_event.nf index 777ad50f6804001a77054fcc748670d95e2830af..f6b0cde27bcf0ffdc1df0f567c18b44675b56c31 100644 --- a/modules/local/hicstuff/filter_event.nf +++ b/modules/local/hicstuff/filter_event.nf @@ -11,6 +11,7 @@ process FILTER_EVENT { output: tuple val(meta1), path("*_valid_idx_filtered.pairs"), emit: idx_pairs_filtered path("*.png"), optional: true + path("*.log"), emit: log_file script: @@ -18,5 +19,7 @@ process FILTER_EVENT { """ hicstuff_filter.py -i ${idx_pairs} -p ${meta1.id} ${args} + + mv hicstuff_filter.log hicstuff_${meta1.id}_filter.log """ } diff --git a/nextflow.config b/nextflow.config index 738613a9224b8cb741db1f03e0c76dd03e94106f..63bb7b1095bf2353f8c109e56807e636d779ba24 100644 --- a/nextflow.config +++ b/nextflow.config @@ -172,7 +172,7 @@ params { //Cutsite cutsite_mode = 'for_vs_rev' - cutsite_seed = 0 + cutsite_seed = 20 save_digested = false cutsite = false