Skip to content
Snippets Groups Projects
Verified Commit a71a5bd8 authored by Mia Croiset's avatar Mia Croiset
Browse files

add hicstuff report stats

parent 7f46b8c2
No related branches found
No related tags found
No related merge requests found
......@@ -6,9 +6,13 @@ import pysam as ps
import pandas as pd
import itertools
from hicstuff_log import logger
import hicstuff_log as hcl
import hicstuff_io as hio
import hicstuff_digest as hcd
import hicstuff_stats as hcs
from Bio import SeqIO
import logging
from hicstuff_version import __version__
......@@ -135,6 +139,16 @@ def bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual=30):
)
)
def generate_log_header(log_path, input1, input2, genome, enzyme):
hcl.set_file_handler(log_path, formatter=logging.Formatter(""))
logger.info("## hicstuff: v%s log file", __version__)
logger.info("## date: %s", time.strftime("%Y-%m-%d %H:%M:%S"))
logger.info("## enzyme: %s", str(enzyme))
logger.info("## input1: %s ", input1)
logger.info("## input2: %s", input2)
logger.info("## ref: %s", genome)
logger.info("---")
#hcl.set_file_handler(log_path, formatter=hcl.logfile_formatter)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
......@@ -171,6 +185,11 @@ if __name__ == "__main__":
elif enzyme == "arima":
enzyme = ["DpnII","HinfI"]
log_file = "hicstuff_pairs.log"
sys.stderr = open ("hicstuff_pairs.log", "wt")
hcl.set_file_handler(log_file)
generate_log_header(log_file, bam1, bam2, fasta, enzyme)
bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual)
restrict_table = {}
......
......@@ -11,6 +11,7 @@ from hicstuff_log import logger
import hicstuff_io as hio
import hicstuff_digest as hcd
from Bio import SeqIO
import hicstuff_log as hcl
def pairs2matrix(
......@@ -163,6 +164,24 @@ if __name__ == "__main__":
mat_format = args.type
mat_out = args.output
log_file = "hicstuff_matrix.log"
sys.stderr = open ("hicstuff_matrix.log", "wt")
hcl.set_file_handler(log_file)
# Log which pairs file is being used and how many pairs are listed
pairs_count = 0
with open(pairs, "r") as file:
for line in file:
if line.startswith('#'):
continue
else:
pairs_count += 1
logger.info(
"Generating matrix from pairs file %s (%d pairs in the file) ",
pairs, pairs_count
)
if mat_format == "cool":
# Name matrix file in .cool
cool_file = os.path.splitext(mat_out)[0] + ".cool"
......
......@@ -22,6 +22,8 @@ from collections import OrderedDict
import matplotlib.pyplot as plt
from hicstuff_log import logger
import argparse
import logging
import hicstuff_log as hcl
def process_read_pair(line):
......@@ -553,6 +555,10 @@ if __name__ == "__main__":
else:
pie_plot = prefix+"_"+pie_plot
log_file = "hicstuff_filter.log"
sys.stdout = open ("hicstuff_filter.log", "wt")
hcl.set_file_handler(log_file)
uncut_thr, loop_thr = get_thresholds(
pairs_idx, plot_events=plot, fig_path=dist_plot, prefix=prefix
)
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import os
from os.path import basename
from os.path import splitext
import json
def get_pipeline_stats(log_file):
"""Get stats after pipeline execution.
Parameters
----------
log_file : str
Path to hicstuff log file.
Returns
-------
list:
A single list containing stats about hicstuff pipeline execution:
- Number of sequenced pairs from fastqs
- Number (% of total reads) of unmapped reads
- Number (% of total reads) of mapped reads
- Number (% of total reads) of pairs
- Number (% of total pairs) of filtered pairs
- Number (% of total pairs) of loops
- Number (% of total pairs) of uncuts
- Number (% of total pairs) of abnormal (-- and ++)
- Number (% of total pairs) of deduplicated pairs [Number (% of total pairs) of PCR duplicates]
- From all pairs used in contact matrix:
- Number (% of matrix) of cis pairs
- Number (% of matrix) of trans pairs
- Trans ratio
"""
with open(log_file) as file:
log_lines = [line.rstrip() for line in file]
# 1. Number of sequenced pairs from fastqs
fastq_pairs = [s for s in log_lines if re.search("reads found in each", s)][0]
fastq_pairs = re.sub(".*INFO :: ", "", fastq_pairs)
fastq_pairs = re.sub(" reads found in each.*", "", fastq_pairs)
fastq_pairs = int(float(fastq_pairs))
# 2. Number (% of total) of (un)mapped reads
tot_mapped = [s for s in log_lines if re.search("mapped with Q ", s)][0]
tot_mapped = re.sub(".*Q >= 30 \(", "", tot_mapped)
tot_mapped = re.sub("/.*", "", tot_mapped)
tot_mapped = int(float(tot_mapped))
tot_unmapped = fastq_pairs*2 - tot_mapped
# 3. Number (% of total) of pairs
tot_pairs = [s for s in log_lines if re.search("pairs successfully mapped", s)][0]
tot_pairs = re.sub(".*INFO :: ", "", tot_pairs)
tot_pairs = re.sub(" pairs successfully.*", "", tot_pairs)
tot_pairs = int(float(tot_pairs))
# 4. Number (% of total) of filtered pairs
filtered_pairs = [s for s in log_lines if re.search("pairs discarded:", s)]
if (len(filtered_pairs) > 0):
filtered_pairs = filtered_pairs[0]
loops_pairs = re.sub(".*Loops: ", "", filtered_pairs)
loops_pairs = re.sub(", Uncuts:.*", "", loops_pairs)
loops_pairs = int(float(loops_pairs))
uncuts_pairs = re.sub(".*Uncuts: ", "", filtered_pairs)
uncuts_pairs = re.sub(", Weirds:.*", "", uncuts_pairs)
uncuts_pairs = int(float(uncuts_pairs))
abnormal_pairs = re.sub(".*Weirds: ", "", filtered_pairs)
abnormal_pairs = int(float(abnormal_pairs))
filtered_pairs = re.sub(".*INFO :: ", "", filtered_pairs)
filtered_pairs = re.sub(" pairs discarded.*", "", filtered_pairs)
filtered_pairs = int(float(filtered_pairs))
else:
loops_pairs=0
uncuts_pairs=0
abnormal_pairs=0
filtered_pairs=0
# 5. Number (% of total) of PCR duplicates pairs
pcr_pairs = [s for s in log_lines if re.search("PCR duplicates have been filtered", s)]
if (len(pcr_pairs) > 0):
pcr_pairs = pcr_pairs[0]
pcr_pairs = re.sub(".*have been filtered out \(", "", pcr_pairs)
pcr_pairs = re.sub(" / .*", "", pcr_pairs)
pcr_pairs = int(float(pcr_pairs))
else:
pcr_pairs = 0
# 6. Number (%) of final pairs
removed_pairs=filtered_pairs+pcr_pairs
final_pairs=tot_pairs-removed_pairs
# 7. Final stats
stats = {
'Sample': re.sub(".hicstuff.*", "", basename(log_file)),
'Total read pairs': fastq_pairs,
'Mapped reads': tot_mapped,
'Unmapped reads': tot_unmapped,
'Recovered contacts': tot_pairs,
'Final contacts': final_pairs,
'Removed contacts': removed_pairs,
'Filtered out': filtered_pairs,
'Loops': loops_pairs,
'Uncuts': uncuts_pairs,
'Weirds': abnormal_pairs,
'PCR duplicates': pcr_pairs
}
return(stats)
def write_pipeline_stats(stats, out_file):
prefix = stats['Sample']
fastq_pairs=stats['Total read pairs']
tot_mapped=stats['Mapped reads']
tot_unmapped=stats['Unmapped reads']
tot_pairs=stats['Recovered contacts']
final_pairs=stats['Final contacts']
removed_pairs=stats['Removed contacts']
filtered_pairs=stats['Filtered out']
loops_pairs=stats['Loops']
uncuts_pairs=stats['Uncuts']
abnormal_pairs=stats['Weirds']
pcr_pairs=stats['PCR duplicates']
pct_pairs = round(tot_pairs/fastq_pairs*100, 2)
pct_mapped = round(tot_mapped/(fastq_pairs*2)*100, 2)
pct_unmapped = round(tot_unmapped/(fastq_pairs*2)*100, 2)
pct_filtered = round(filtered_pairs/tot_pairs*100, 2)
pct_loops_pairs = round(loops_pairs/tot_pairs*100, 2)
pct_uncuts_pairs = round(uncuts_pairs/tot_pairs*100, 2)
pct_abnormal_pairs = round(abnormal_pairs/tot_pairs*100, 2)
pct_pcr = round(pcr_pairs/tot_pairs*100, 2)
pct_removed=round(removed_pairs/tot_pairs*100, 2)
pct_final = round(final_pairs/tot_pairs*100, 2)
# Open the log file and read its contents
_, file_extension = splitext(out_file)
if (file_extension == '.json'):
with open(out_file, 'w') as json_file:
json.dump(stats, json_file, indent=4)
else:
with open(out_file, 'w') as stats_file:
stats_file.write("Sample: {prefix}\n".format(prefix = prefix))
stats_file.write("Total read pairs: {reads}\n".format(reads = "{:,}".format(fastq_pairs)))
stats_file.write("Mapped reads: {tot_mapped} ({pct_mapped}% of all reads)\n".format(
tot_mapped = "{:,}".format(tot_mapped),
pct_mapped = pct_mapped
)
)
stats_file.write("Unmapped reads: {tot_unmapped} ({pct_unmapped}% of all reads)\n".format(
tot_unmapped = "{:,}".format(tot_unmapped), pct_unmapped = pct_unmapped
))
stats_file.write("Recovered contacts: {tot_pairs} ({pct_pairs}% of all read pairs)\n".format(
tot_pairs = "{:,}".format(tot_pairs), pct_pairs = pct_pairs
))
stats_file.write("Removed contacts: {removed_pairs} ({pct_removed}% of all contacts)\n".format(
removed_pairs = "{:,}".format(removed_pairs), pct_removed = pct_removed
))
stats_file.write(" Filtered out: {filtered_pairs} ({pct_filtered}% of all contacts)\n".format(
filtered_pairs = "{:,}".format(filtered_pairs), pct_filtered = pct_filtered
))
stats_file.write(" Loops: {loops_pairs} ({pct_loops_pairs}% of all contacts)\n".format(
loops_pairs = "{:,}".format(loops_pairs), pct_loops_pairs = pct_loops_pairs
))
stats_file.write(" Uncuts: {uncuts_pairs} ({pct_uncuts_pairs}% of all contacts)\n".format(
uncuts_pairs = "{:,}".format(uncuts_pairs), pct_uncuts_pairs = pct_uncuts_pairs
))
stats_file.write(" Weirds: {abnormal_pairs} ({pct_abnormal_pairs}% of all contacts)\n".format(
abnormal_pairs = "{:,}".format(abnormal_pairs), pct_abnormal_pairs = pct_abnormal_pairs
))
stats_file.write(" PCR duplicates: {pcr_pairs} ({pct_pcr}% of all contacts)\n".format(
pcr_pairs = "{:,}".format(pcr_pairs), pct_pcr = pct_pcr
))
stats_file.write("Final contacts: {final_pairs} ({pct_final}% of all contacts)\n".format(
final_pairs = "{:,}".format(final_pairs), pct_final = pct_final
))
def print_pipeline_stats(stats):
tmp = '.'+stats['Sample']+'.txt'
write_pipeline_stats(stats, tmp)
with open(tmp, 'r') as file:
lines = [line for line in file]
print(''.join(lines))
os.unlink(tmp)
__version__ = '3.2.2'
......@@ -212,9 +212,16 @@ process {
" -c ${params.hicstuff_circular}"
].join('').trim() }
publishDir = [
path: { "${params.outdir}/hicstuff/pairs" },
mode: 'copy',
enabled: params.save_pairs
[
path: { "${params.outdir}/hicstuff/pairs" },
mode: 'copy',
enabled: params.save_pairs
],
[
path: { "${params.outdir}/hicstuff/log" },
mode: 'copy',
pattern: '*.log'
]
]
}
......@@ -226,9 +233,16 @@ process {
" -q ${params.hicstuff_pie_plot}"
].join('').trim() }
publishDir = [
path: { "${params.outdir}/hicstuff/pairs" },
mode: 'copy',
enabled: params.save_pairs_intermediates
[
path: { "${params.outdir}/hicstuff/pairs" },
mode: 'copy',
enabled: params.save_pairs_intermediates
],
[
path: { "${params.outdir}/hicstuff/log" },
mode: 'copy',
pattern: '*.log'
]
]
}
......@@ -263,15 +277,29 @@ process {
withName: 'BUILD_MATRIX' {
ext.args = params.hicstuff_matrix
publishDir = [
path: { "${params.outdir}/hicstuff/matrix" },
mode: 'copy'
[
path: { "${params.outdir}/hicstuff/matrix" },
mode: 'copy'
],
[
path: { "${params.outdir}/hicstuff/log" },
mode: 'copy',
pattern: '*.log'
]
]
}
withName: 'BUILD_MATRIX_COOL' {
ext.args = params.hicstuff_matrix
publishDir = [
path: { "${params.outdir}/hicstuff/matrix" },
mode: 'copy'
[
path: { "${params.outdir}/hicstuff/matrix" },
mode: 'copy'
],
[
path: { "${params.outdir}/hicstuff/log" },
mode: 'copy',
pattern: '*.log'
]
]
}
......
......@@ -14,6 +14,7 @@ process BAM2PAIRS {
output:
tuple val(meta1), path("*_valid.pairs"), emit: valid_pairs
tuple val(meta1), path("*_valid_idx.pairs"), emit: idx_pairs
path "*.log", emit: log_file
script:
......@@ -23,6 +24,8 @@ process BAM2PAIRS {
"""
hicstuff_bam2pairs.py -b1 ${bam1} -b2 ${bam2} -i ${info_contigs} -e ${digestion} -f ${fasta} -o ${meta1.id}_${pairs} -x ${meta1.id}_${index} ${args}
mv hicstuff_pairs.log hicstuff_${meta1.id}_pairs.log
"""
}
......@@ -11,6 +11,7 @@ process BUILD_MATRIX {
output:
tuple val(meta), path("${meta1.id}_*"), emit: matrix
path "*.log", emit: log_file
script:
......@@ -22,5 +23,6 @@ process BUILD_MATRIX {
hicstuff_build_matrix.py -p ${args}.pre.pairs -f ${fragments_list} -t graal -o ${args}
mv ${args} ${meta1.id}_${args}
mv hicstuff_matrix.log hicstuff_${meta1.id}_matrix_sparse.log
"""
}
......@@ -11,6 +11,7 @@ process BUILD_MATRIX_COOL {
output:
tuple val(meta), path("${meta1.id}_*.cool"), emit: matrix
path "*.log", emit: log_file
script:
......@@ -21,5 +22,6 @@ process BUILD_MATRIX_COOL {
hicstuff_build_matrix.py -p ${idx_pairs} -f ${fragments_list} -t cool -o ${args}
mv ${base}.cool ${meta1.id}_${base}.cool
mv hicstuff_matrix.log hicstuff_${meta1.id}_matrix_cooler.log
"""
}
......@@ -11,6 +11,7 @@ process FILTER_EVENT {
output:
tuple val(meta1), path("*_valid_idx_filtered.pairs"), emit: idx_pairs_filtered
path("*.png"), optional: true
path("*.log"), emit: log_file
script:
......@@ -18,5 +19,7 @@ process FILTER_EVENT {
"""
hicstuff_filter.py -i ${idx_pairs} -p ${meta1.id} ${args}
mv hicstuff_filter.log hicstuff_${meta1.id}_filter.log
"""
}
......@@ -172,7 +172,7 @@ params {
//Cutsite
cutsite_mode = 'for_vs_rev'
cutsite_seed = 0
cutsite_seed = 20
save_digested = false
cutsite = false
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment