From d520fcd0ddb452a8c376a561a895ce72deaa2eae Mon Sep 17 00:00:00 2001
From: Mia Croiset <mia.croiset@ens-lyon.fr>
Date: Wed, 22 May 2024 17:20:51 +0200
Subject: [PATCH] clean functions to call hicstuff functions

---
 bin/hicstuff_bam2pairs.py    | 147 ++---------------------------------
 bin/hicstuff_build_matrix.py |  10 +--
 bin/hicstuff_cutsite.py      |  28 +------
 bin/hicstuff_distance_law.py |  14 +---
 bin/hicstuff_filter.py       |  19 -----
 bin/hicstuff_filter_pcr.py   |  65 ++--------------
 bin/hicstuff_iteralign.py    |  20 -----
 7 files changed, 19 insertions(+), 284 deletions(-)

diff --git a/bin/hicstuff_bam2pairs.py b/bin/hicstuff_bam2pairs.py
index 83d5c97..8c6afeb 100755
--- a/bin/hicstuff_bam2pairs.py
+++ b/bin/hicstuff_bam2pairs.py
@@ -1,154 +1,19 @@
 #!/usr/bin/env python
-import os, time, csv, sys, re
+"""
+Bridge from nextflow function call to hicstuff's python function for bam2pairs
+"""
+
+import os, sys
 import argparse
-import pysam as ps
-import pandas as pd
-import itertools
 from hicstuff.log import logger
 import hicstuff.log as hcl
 import hicstuff.io as hio
 import hicstuff.digest as hcd
-import hicstuff.stats as hcs
 from Bio import SeqIO
 import logging
 from hicstuff.version import __version__
-
-
-
-def bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual=30):
-    """
-    Make a .pairs file from two Hi-C bam files sorted by read names.
-    The Hi-C mates are matched by read identifier. Pairs where at least one
-    reads maps with MAPQ below min_qual threshold are discarded. Pairs are
-    sorted by readID and stored in upper triangle (first pair higher).
-    Parameters
-    ----------
-    bam1 : str
-        Path to the name-sorted BAM file with aligned Hi-C forward reads.
-    bam2 : str
-        Path to the name-sorted BAM file with aligned Hi-C reverse reads.
-    out_pairs : str
-        Path to the output space-separated .pairs file with columns
-        readID, chr1 pos1 chr2 pos2 strand1 strand2
-    info_contigs : str
-        Path to the info contigs file, to get info on chromosome sizes and order.
-    min_qual : int
-        Minimum mapping quality required to keep a Hi-C pair.
- """ - forward = ps.AlignmentFile(bam1, "rb") - reverse = ps.AlignmentFile(bam2, "rb") - - # Generate header lines - format_version = "## pairs format v1.0\n" - sorting = "#sorted: readID\n" - cols = "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n" - # Chromosome order will be identical in info_contigs and pair files - chroms = pd.read_csv(info_contigs, sep="\t").apply( - lambda x: "#chromsize: %s %d\n" % (x.contig, x.length), axis=1 - ) - with open(out_pairs, "w") as pairs: - pairs.writelines([format_version, sorting, cols] + chroms.tolist()) - pairs_writer = csv.writer(pairs, delimiter="\t") - n_reads = {"total": 0, "mapped": 0} - # Remember if some read IDs were missing from either file - unmatched_reads = 0 - # Remember if all reads in one bam file have been read - exhausted = [False, False] - # Iterate on both BAM simultaneously - end_regex = re.compile(r'/[12]$') - for end1, end2 in itertools.zip_longest(forward, reverse): - # Remove end-specific suffix if any - end1.query_name = re.sub(end_regex, '', end1.query_name) - end2.query_name = re.sub(end_regex, '', end2.query_name) - # Both file still have reads - # Check if reads pass filter - try: - end1_passed = end1.mapping_quality >= min_qual - # Happens if end1 bam file has been exhausted - except AttributeError: - exhausted[0] = True - end1_passed = False - try: - end2_passed = end2.mapping_quality >= min_qual - # Happens if end2 bam file has been exhausted - except AttributeError: - exhausted[1] = True - end2_passed = False - # Skip read if mate is not present until they match or reads - # have been exhausted - while sum(exhausted) == 0 and end1.query_name != end2.query_name: - # Get next read and check filters again - # Count single-read iteration - unmatched_reads += 1 - n_reads["total"] += 1 - if end1.query_name < end2.query_name: - try: - end1 = next(forward) - end1_passed = end1.mapping_quality >= min_qual - # If EOF is reached in BAM 1 - except (StopIteration, AttributeError): - exhausted[0] = True - end1_passed = False - n_reads["mapped"] += end1_passed - elif end1.query_name > end2.query_name: - try: - end2 = next(reverse) - end2_passed = end2.mapping_quality >= min_qual - # If EOF is reached in BAM 2 - except (StopIteration, AttributeError): - exhausted[1] = True - end2_passed = False - n_reads["mapped"] += end2_passed - - # 2 reads processed per iteration, unless one file is exhausted - n_reads["total"] += 2 - sum(exhausted) - n_reads["mapped"] += sum([end1_passed, end2_passed]) - # Keep only pairs where both reads have good quality - if end1_passed and end2_passed: - - # Flipping to get upper triangle - if ( - end1.reference_id == end2.reference_id - and end1.reference_start > end2.reference_start - ) or end1.reference_id > end2.reference_id: - end1, end2 = end2, end1 - pairs_writer.writerow( - [ - end1.query_name, - end1.reference_name, - end1.reference_start + 1, - end2.reference_name, - end2.reference_start + 1, - "-" if end1.is_reverse else "+", - "-" if end2.is_reverse else "+", - ] - ) - pairs.close() - if unmatched_reads > 0: - logger.warning( - "%d reads were only present in one BAM file. 
Make sure you sorted reads by name before running the pipeline.", - unmatched_reads, - ) - logger.info( - "{perc_map}% reads (single ends) mapped with Q >= {qual} ({mapped}/{total})".format( - total=n_reads["total"], - mapped=n_reads["mapped"], - perc_map=round(100 * n_reads["mapped"] / n_reads["total"]), - qual=min_qual, - ) - ) - -def generate_log_header(log_path, input1, input2, genome, enzyme): - hcl.set_file_handler(log_path, formatter=logging.Formatter("")) - logger.info("## hicstuff: v%s log file", __version__) - logger.info("## date: %s", time.strftime("%Y-%m-%d %H:%M:%S")) - logger.info("## enzyme: %s", str(enzyme)) - logger.info("## input1: %s ", input1) - logger.info("## input2: %s", input2) - logger.info("## ref: %s", genome) - logger.info("---") - #hcl.set_file_handler(log_path, formatter=hcl.logfile_formatter) +from hicstuff.pipeline import bam2pairs, generate_log_header if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/bin/hicstuff_build_matrix.py b/bin/hicstuff_build_matrix.py index 906026b..99520b5 100755 --- a/bin/hicstuff_build_matrix.py +++ b/bin/hicstuff_build_matrix.py @@ -1,16 +1,8 @@ #!/usr/bin/env python -import os, time, csv, sys, re +import os, sys import argparse -import pysam as ps -import pandas as pd -import shutil as st -import subprocess as sp -import itertools from hicstuff.log import logger -import hicstuff.io as hio -import hicstuff.digest as hcd -from Bio import SeqIO import hicstuff.log as hcl import hicstuff.pipeline as hp diff --git a/bin/hicstuff_cutsite.py b/bin/hicstuff_cutsite.py index fa3faa2..6822f80 100755 --- a/bin/hicstuff_cutsite.py +++ b/bin/hicstuff_cutsite.py @@ -1,35 +1,11 @@ #!/usr/bin/env python3 # coding: utf-8 -"""Cut the reads at their ligation sites. - -The module will cut at ligation sites of the given enzyme. It will from the -original fastq (gzipped or not) files create new gzipped fastq files with -combinations of the fragments of reads obtains by cutting at the ligation sites -of the reads. - -There are three choices to how combine the fragments. 1. "for_vs_rev": All the -combinations are made between one forward fragment and one reverse fragment. -It's the most releveant one for usual workflow. 2. "all": All 2-combinations are -made. 3. "pile": Only combinations between adjacent fragments in the initial -reads are made. 
-
-This module contains the following functions:
-    - cut_ligation_sites
-    - cutsite_read
-    - write_pair
-    - _writer
+"""
+Bridge from nextflow function call to hicstuff's python function for cutsite
 """
-
-import gzip
-import multiprocessing
-import pyfastx
-import re
-import sys
 import argparse
-import hicstuff.digest as hcd
-from hicstuff.log import logger
 import hicstuff.cutsite as hc

 if __name__ == "__main__":
diff --git a/bin/hicstuff_distance_law.py b/bin/hicstuff_distance_law.py
index 8b49d6c..ad70f32 100755
--- a/bin/hicstuff_distance_law.py
+++ b/bin/hicstuff_distance_law.py
@@ -1,17 +1,11 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
+"""
+Bridge from nextflow function call to hicstuff's python function for distance law
+"""
+
 import numpy as np
-import sys
-import matplotlib.pyplot as plt
-import warnings
-from scipy import ndimage
-from matplotlib import cm
-import pandas as pd
-import os as os
-import csv as csv
-import hicstuff.io as hio
-from hicstuff.log import logger
 import argparse
 import hicstuff.distance_law as hdl

diff --git a/bin/hicstuff_filter.py b/bin/hicstuff_filter.py
index 3b0bedd..ef915c4 100755
--- a/bin/hicstuff_filter.py
+++ b/bin/hicstuff_filter.py
@@ -1,27 +1,8 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""Hi-C event filtering
-Analyse the contents of a 3C library and filters spurious events such as loops
-and uncuts to improve the overall signal. Filtering consists in excluding +-
-and -+ Hi-C pairs if their reads are closer than a threshold in minimum number
-of restriction fragments. This threshold represents the distance at which the
-abundance of these events deviate significantly from the rest of the library.
-It is estimated automatically by default using the median absolute deviation of
-pairs at longer distances, but can also be entered manually.
-The program takes a 2D BED file as input with the following fields:
-chromA startA endA indiceA strandA chromB startB endB indiceB strandB
-Each line of the file represents a Hi-C pair with reads A and B. The indices
-are 0-based and represent the restriction fragment to which reads are
-attributed.
-@author: cmdoret (reimplementation of Axel KournaK's code)
-"""
 import sys
-import numpy as np
-from collections import OrderedDict
-import matplotlib.pyplot as plt
 import argparse
-import logging
 import hicstuff.filter as hf
 import hicstuff_log as hcl

diff --git a/bin/hicstuff_filter_pcr.py b/bin/hicstuff_filter_pcr.py
index 30a7904..b568fe1 100755
--- a/bin/hicstuff_filter_pcr.py
+++ b/bin/hicstuff_filter_pcr.py
@@ -1,68 +1,15 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
+"""
+Bridge from nextflow function call to hicstuff's python function for PCR duplicates filtering
+"""
+
 import hicstuff.io as hio
 from hicstuff.log import logger
+from hicstuff.pipeline import filter_pcr_dup
 import argparse
-import os, time, csv, sys, re
-
-def filter_pcr_dup(pairs_idx_file, filtered_file):
-    """
-    Filter out PCR duplicates from a coordinate-sorted pairs file using
-    overrrepresented exact coordinates. If multiple fragments have two reads
-    with the exact same coordinates, only one of those fragments is kept.
-    Parameters
-    ----------
-    pairs_idx_file : str
-        Path to an indexed pairs file containing the Hi-C reads.
-    filtered_file : str
-        Path to the output pairs file after removing duplicates.
- """ - # Keep count of how many reads are filtered - filter_count = 0 - reads_count = 0 - # Store header lines - header = hio.get_pairs_header(pairs_idx_file) - with open(pairs_idx_file, "r") as pairs, open(filtered_file, "w") as filtered: - # Copy header lines to filtered file - for head_line in header: - filtered.write(head_line + "\n") - next(pairs) - - # Use csv methods to easily access columns - paircols = [ - "readID", - "chr1", - "pos1", - "chr2", - "pos2", - "strand1", - "strand2", - "frag1", - "frag2", - ] - # Columns used for comparison of coordinates - coord_cols = [col for col in paircols if col != "readID"] - pair_reader = csv.DictReader(pairs, delimiter="\t", fieldnames=paircols) - filt_writer = csv.DictWriter(filtered, delimiter="\t", fieldnames=paircols) - - # Initialise a variable to store coordinates of reads in previous pair - prev = {k: 0 for k in paircols} - for pair in pair_reader: - reads_count += 1 - # If coordinates are the same as before, skip pair - if all(pair[pair_var] == prev[pair_var] for pair_var in coord_cols): - filter_count += 1 - continue - # Else write pair and store new coordinates as previous - else: - filt_writer.writerow(pair) - prev = pair - logger.info( - "%d%% PCR duplicates have been filtered out (%d / %d pairs) " - % (100 * round(filter_count / reads_count, 3), filter_count, reads_count) - ) - +import csv if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/bin/hicstuff_iteralign.py b/bin/hicstuff_iteralign.py index ad19ed6..5e446fe 100755 --- a/bin/hicstuff_iteralign.py +++ b/bin/hicstuff_iteralign.py @@ -1,28 +1,8 @@ #!/usr/bin/env python3 # coding: utf-8 -"""Iterative alignment -Aligns iteratively reads from a 3C fastq file: reads -are trimmed with a range-sweeping number of basepairs -and each read set generated this way is mapped onto -the reference genome. This may result in a small -increase of properly mapped reads. - -@author: Remi Montagne & cmdoret -""" - -import os -import sys -import glob -import re -import subprocess as sp -import pysam as ps -import shutil as st import hicstuff.io as hio -import contextlib import argparse -from hicstuff.log import logger -from os.path import join import hicstuff.iteralign as hi if __name__ == "__main__": -- GitLab