From d520fcd0ddb452a8c376a561a895ce72deaa2eae Mon Sep 17 00:00:00 2001
From: Mia Croiset <mia.croiset@ens-lyon.fr>
Date: Wed, 22 May 2024 17:20:51 +0200
Subject: [PATCH] clean functions to call hicstuff functions

---
 bin/hicstuff_bam2pairs.py    | 147 ++---------------------------------
 bin/hicstuff_build_matrix.py |  10 +--
 bin/hicstuff_cutsite.py      |  28 +------
 bin/hicstuff_distance_law.py |  14 +---
 bin/hicstuff_filter.py       |  19 -----
 bin/hicstuff_filter_pcr.py   |  65 ++--------------
 bin/hicstuff_iteralign.py    |  20 -----
 7 files changed, 19 insertions(+), 284 deletions(-)

diff --git a/bin/hicstuff_bam2pairs.py b/bin/hicstuff_bam2pairs.py
index 83d5c97..8c6afeb 100755
--- a/bin/hicstuff_bam2pairs.py
+++ b/bin/hicstuff_bam2pairs.py
@@ -1,154 +1,19 @@
 #!/usr/bin/env python
-import os, time, csv, sys, re
+"""
+Bridge from nextflow function call to hicstuff's python function for bam2pairs
+"""
+
+import os, sys
 import argparse
-import pysam as ps
-import pandas as pd
-import itertools
 from hicstuff.log import logger
 import hicstuff.log as hcl
 import hicstuff.io as hio
 import hicstuff.digest as hcd
-import hicstuff.stats as hcs
 from Bio import SeqIO
 import logging
 from hicstuff.version import __version__
-
-
-
-def bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual=30):
-    """
-    Make a .pairs file from two Hi-C bam files sorted by read names.
-    The Hi-C mates are matched by read identifier. Pairs where at least one
-    reads maps with MAPQ below min_qual threshold are discarded. Pairs are
-    sorted by readID and stored in upper triangle (first pair higher).
-    Parameters
-    ----------
-    bam1 : str
-        Path to the name-sorted BAM file with aligned Hi-C forward reads.
-    bam2 : str
-        Path to the name-sorted BAM file with aligned Hi-C reverse reads.
-    out_pairs : str
-        Path to the output space-separated .pairs file with columns
-        readID, chr1 pos1 chr2 pos2 strand1 strand2
-    info_contigs : str
-        Path to the info contigs file, to get info on chromosome sizes and order.
-    min_qual : int
-        Minimum mapping quality required to keep a Hi-C pair.
- """ - forward = ps.AlignmentFile(bam1, "rb") - reverse = ps.AlignmentFile(bam2, "rb") - - # Generate header lines - format_version = "## pairs format v1.0\n" - sorting = "#sorted: readID\n" - cols = "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n" - # Chromosome order will be identical in info_contigs and pair files - chroms = pd.read_csv(info_contigs, sep="\t").apply( - lambda x: "#chromsize: %s %d\n" % (x.contig, x.length), axis=1 - ) - with open(out_pairs, "w") as pairs: - pairs.writelines([format_version, sorting, cols] + chroms.tolist()) - pairs_writer = csv.writer(pairs, delimiter="\t") - n_reads = {"total": 0, "mapped": 0} - # Remember if some read IDs were missing from either file - unmatched_reads = 0 - # Remember if all reads in one bam file have been read - exhausted = [False, False] - # Iterate on both BAM simultaneously - end_regex = re.compile(r'/[12]$') - for end1, end2 in itertools.zip_longest(forward, reverse): - # Remove end-specific suffix if any - end1.query_name = re.sub(end_regex, '', end1.query_name) - end2.query_name = re.sub(end_regex, '', end2.query_name) - # Both file still have reads - # Check if reads pass filter - try: - end1_passed = end1.mapping_quality >= min_qual - # Happens if end1 bam file has been exhausted - except AttributeError: - exhausted[0] = True - end1_passed = False - try: - end2_passed = end2.mapping_quality >= min_qual - # Happens if end2 bam file has been exhausted - except AttributeError: - exhausted[1] = True - end2_passed = False - # Skip read if mate is not present until they match or reads - # have been exhausted - while sum(exhausted) == 0 and end1.query_name != end2.query_name: - # Get next read and check filters again - # Count single-read iteration - unmatched_reads += 1 - n_reads["total"] += 1 - if end1.query_name < end2.query_name: - try: - end1 = next(forward) - end1_passed = end1.mapping_quality >= min_qual - # If EOF is reached in BAM 1 - except (StopIteration, AttributeError): - exhausted[0] = True - end1_passed = False - n_reads["mapped"] += end1_passed - elif end1.query_name > end2.query_name: - try: - end2 = next(reverse) - end2_passed = end2.mapping_quality >= min_qual - # If EOF is reached in BAM 2 - except (StopIteration, AttributeError): - exhausted[1] = True - end2_passed = False - n_reads["mapped"] += end2_passed - - # 2 reads processed per iteration, unless one file is exhausted - n_reads["total"] += 2 - sum(exhausted) - n_reads["mapped"] += sum([end1_passed, end2_passed]) - # Keep only pairs where both reads have good quality - if end1_passed and end2_passed: - - # Flipping to get upper triangle - if ( - end1.reference_id == end2.reference_id - and end1.reference_start > end2.reference_start - ) or end1.reference_id > end2.reference_id: - end1, end2 = end2, end1 - pairs_writer.writerow( - [ - end1.query_name, - end1.reference_name, - end1.reference_start + 1, - end2.reference_name, - end2.reference_start + 1, - "-" if end1.is_reverse else "+", - "-" if end2.is_reverse else "+", - ] - ) - pairs.close() - if unmatched_reads > 0: - logger.warning( - "%d reads were only present in one BAM file. 
Make sure you sorted reads by name before running the pipeline.", - unmatched_reads, - ) - logger.info( - "{perc_map}% reads (single ends) mapped with Q >= {qual} ({mapped}/{total})".format( - total=n_reads["total"], - mapped=n_reads["mapped"], - perc_map=round(100 * n_reads["mapped"] / n_reads["total"]), - qual=min_qual, - ) - ) - -def generate_log_header(log_path, input1, input2, genome, enzyme): - hcl.set_file_handler(log_path, formatter=logging.Formatter("")) - logger.info("## hicstuff: v%s log file", __version__) - logger.info("## date: %s", time.strftime("%Y-%m-%d %H:%M:%S")) - logger.info("## enzyme: %s", str(enzyme)) - logger.info("## input1: %s ", input1) - logger.info("## input2: %s", input2) - logger.info("## ref: %s", genome) - logger.info("---") - #hcl.set_file_handler(log_path, formatter=hcl.logfile_formatter) +from hicstuff.pipeline import bam2pairs, generate_log_header if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/bin/hicstuff_build_matrix.py b/bin/hicstuff_build_matrix.py index 906026b..99520b5 100755 --- a/bin/hicstuff_build_matrix.py +++ b/bin/hicstuff_build_matrix.py @@ -1,16 +1,8 @@ #!/usr/bin/env python -import os, time, csv, sys, re +import os, sys import argparse -import pysam as ps -import pandas as pd -import shutil as st -import subprocess as sp -import itertools from hicstuff.log import logger -import hicstuff.io as hio -import hicstuff.digest as hcd -from Bio import SeqIO import hicstuff.log as hcl import hicstuff.pipeline as hp diff --git a/bin/hicstuff_cutsite.py b/bin/hicstuff_cutsite.py index fa3faa2..6822f80 100755 --- a/bin/hicstuff_cutsite.py +++ b/bin/hicstuff_cutsite.py @@ -1,35 +1,11 @@ #!/usr/bin/env python3 # coding: utf-8 -"""Cut the reads at their ligation sites. - -The module will cut at ligation sites of the given enzyme. It will from the -original fastq (gzipped or not) files create new gzipped fastq files with -combinations of the fragments of reads obtains by cutting at the ligation sites -of the reads. - -There are three choices to how combine the fragments. 1. "for_vs_rev": All the -combinations are made between one forward fragment and one reverse fragment. -It's the most releveant one for usual workflow. 2. "all": All 2-combinations are -made. 3. "pile": Only combinations between adjacent fragments in the initial -reads are made. 
-
-This module contains the following functions:
-    - cut_ligation_sites
-    - cutsite_read
-    - write_pair
-    - _writer
+"""
+Bridge from nextflow function call to hicstuff's python function for cutsite
 """
-
-import gzip
-import multiprocessing
-import pyfastx
-import re
-import sys
 import argparse
-import hicstuff.digest as hcd
-from hicstuff.log import logger
 import hicstuff.cutsite as hc

 if __name__ == "__main__":
diff --git a/bin/hicstuff_distance_law.py b/bin/hicstuff_distance_law.py
index 8b49d6c..ad70f32 100755
--- a/bin/hicstuff_distance_law.py
+++ b/bin/hicstuff_distance_law.py
@@ -1,17 +1,11 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
+"""
+Bridge from nextflow function call to hicstuff's python function for distance law
+"""
+
 import numpy as np
-import sys
-import matplotlib.pyplot as plt
-import warnings
-from scipy import ndimage
-from matplotlib import cm
-import pandas as pd
-import os as os
-import csv as csv
-import hicstuff.io as hio
-from hicstuff.log import logger
 import argparse
 import hicstuff.distance_law as hdl

diff --git a/bin/hicstuff_filter.py b/bin/hicstuff_filter.py
index 3b0bedd..ef915c4 100755
--- a/bin/hicstuff_filter.py
+++ b/bin/hicstuff_filter.py
@@ -1,27 +1,8 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""Hi-C event filtering
-Analyse the contents of a 3C library and filters spurious events such as loops
-and uncuts to improve the overall signal. Filtering consists in excluding +-
-and -+ Hi-C pairs if their reads are closer than a threshold in minimum number
-of restriction fragments. This threshold represents the distance at which the
-abundance of these events deviate significantly from the rest of the library.
-It is estimated automatically by default using the median absolute deviation of
-pairs at longer distances, but can also be entered manually.
-The program takes a 2D BED file as input with the following fields:
-chromA startA endA indiceA strandA chromB startB endB indiceB strandB
-Each line of the file represents a Hi-C pair with reads A and B. The indices
-are 0-based and represent the restriction fragment to which reads are
-attributed.
-@author: cmdoret (reimplementation of Axel KournaK's code)
-"""
 import sys
-import numpy as np
-from collections import OrderedDict
-import matplotlib.pyplot as plt
 import argparse
-import logging
 import hicstuff.filter as hf
 import hicstuff_log as hcl

diff --git a/bin/hicstuff_filter_pcr.py b/bin/hicstuff_filter_pcr.py
index 30a7904..b568fe1 100755
--- a/bin/hicstuff_filter_pcr.py
+++ b/bin/hicstuff_filter_pcr.py
@@ -1,68 +1,15 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
+"""
+Bridge from nextflow function call to hicstuff's python function for PCR duplicates filtering
+"""
+
 import hicstuff.io as hio
 from hicstuff.log import logger
+from hicstuff.pipeline import filter_pcr_dup
 import argparse
-import os, time, csv, sys, re
-
-def filter_pcr_dup(pairs_idx_file, filtered_file):
-    """
-    Filter out PCR duplicates from a coordinate-sorted pairs file using
-    overrrepresented exact coordinates. If multiple fragments have two reads
-    with the exact same coordinates, only one of those fragments is kept.
-    Parameters
-    ----------
-    pairs_idx_file : str
-        Path to an indexed pairs file containing the Hi-C reads.
-    filtered_file : str
-        Path to the output pairs file after removing duplicates.
- """ - # Keep count of how many reads are filtered - filter_count = 0 - reads_count = 0 - # Store header lines - header = hio.get_pairs_header(pairs_idx_file) - with open(pairs_idx_file, "r") as pairs, open(filtered_file, "w") as filtered: - # Copy header lines to filtered file - for head_line in header: - filtered.write(head_line + "\n") - next(pairs) - - # Use csv methods to easily access columns - paircols = [ - "readID", - "chr1", - "pos1", - "chr2", - "pos2", - "strand1", - "strand2", - "frag1", - "frag2", - ] - # Columns used for comparison of coordinates - coord_cols = [col for col in paircols if col != "readID"] - pair_reader = csv.DictReader(pairs, delimiter="\t", fieldnames=paircols) - filt_writer = csv.DictWriter(filtered, delimiter="\t", fieldnames=paircols) - - # Initialise a variable to store coordinates of reads in previous pair - prev = {k: 0 for k in paircols} - for pair in pair_reader: - reads_count += 1 - # If coordinates are the same as before, skip pair - if all(pair[pair_var] == prev[pair_var] for pair_var in coord_cols): - filter_count += 1 - continue - # Else write pair and store new coordinates as previous - else: - filt_writer.writerow(pair) - prev = pair - logger.info( - "%d%% PCR duplicates have been filtered out (%d / %d pairs) " - % (100 * round(filter_count / reads_count, 3), filter_count, reads_count) - ) - +import csv if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/bin/hicstuff_iteralign.py b/bin/hicstuff_iteralign.py index ad19ed6..5e446fe 100755 --- a/bin/hicstuff_iteralign.py +++ b/bin/hicstuff_iteralign.py @@ -1,28 +1,8 @@ #!/usr/bin/env python3 # coding: utf-8 -"""Iterative alignment -Aligns iteratively reads from a 3C fastq file: reads -are trimmed with a range-sweeping number of basepairs -and each read set generated this way is mapped onto -the reference genome. This may result in a small -increase of properly mapped reads. - -@author: Remi Montagne & cmdoret -""" - -import os -import sys -import glob -import re -import subprocess as sp -import pysam as ps -import shutil as st import hicstuff.io as hio -import contextlib import argparse -from hicstuff.log import logger -from os.path import join import hicstuff.iteralign as hi if __name__ == "__main__": -- GitLab