From 49ad90d5dbfa4dcb2a17f221d07f5ce94e301a26 Mon Sep 17 00:00:00 2001
From: Mia Croiset <mia.croiset@ens-lyon.fr>
Date: Wed, 22 May 2024 17:11:27 +0200
Subject: [PATCH] update build matrices function + delete manual sorting

---
 bin/hicstuff_build_matrix.py           | 180 ++++---------------------
 modules/local/hicstuff/build_matrix.nf |   2 +-
 2 files changed, 24 insertions(+), 158 deletions(-)

diff --git a/bin/hicstuff_build_matrix.py b/bin/hicstuff_build_matrix.py
index fef5f0a..906026b 100755
--- a/bin/hicstuff_build_matrix.py
+++ b/bin/hicstuff_build_matrix.py
@@ -12,144 +12,7 @@ import hicstuff.io as hio
 import hicstuff.digest as hcd
 from Bio import SeqIO
 import hicstuff.log as hcl
-
-
-def pairs2matrix(
-    pairs_file, mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None
-):
-    """Generate the matrix by counting the number of occurences of each
-    combination of restriction fragments in a pairs file.
-    Parameters
-    ----------
-    pairs_file : str
-        Path to a Hi-C pairs file, with frag1 and frag2 columns added.
-    mat_file : str
-        Path where the matrix will be written.
-    fragments_file : str
-        Path to the fragments_list.txt file. Used to know total
-        matrix size in case some observations are not observed at the end.
-    mat_fmt : str
-        The format to use when writing the matrix. Can be graal or bg2 format.
-    threads : int
-        Number of threads to use in parallel.
-    tmp_dir : str
-        Temporary directory for sorting files. If None given, will use the system default.
-    """
-    # Number of fragments is N lines in frag list - 1 for the header
-    n_frags = sum(1 for line in open(fragments_file, "r")) - 1
-    frags = pd.read_csv(fragments_file, delimiter="\t")
-
-    def write_mat_entry(frag1, frag2, contacts):
-        """Write a single sparse matrix entry in either graal or bg2 format"""
-        if mat_fmt == "graal":
-            mat.write("\t".join(map(str, [frag1, frag2, n_occ])) + "\n")
-        elif mat_fmt == "bg2":
-            frag1, frag2 = int(frag1), int(frag2)
-            mat.write(
-                "\t".join(
-                    map(
-                        str,
-                        [
-                            frags.chrom[frag1],
-                            frags.start_pos[frag1],
-                            frags.end_pos[frag1],
-                            frags.chrom[frag2],
-                            frags.start_pos[frag2],
-                            frags.end_pos[frag2],
-                            contacts,
-                        ],
-                    )
-                )
-                + "\n"
-            )
-
-    pre_mat_file = mat_file + ".pre.pairs"
-    # hio.sort_pairs(
-    #     pairs_file,
-    #     pre_mat_file,
-    #     keys=["frag1", "frag2"],
-    #     threads=threads,
-    #     tmp_dir=tmp_dir,
-    # )
-
-
-    header_size = len(hio.get_pairs_header(pre_mat_file))
-    with open(pre_mat_file, "r") as pairs, open(mat_file, "w") as mat:
-        # Skip header lines
-        for _ in range(header_size):
-            next(pairs)
-        prev_pair = ["0", "0"]  # Pairs identified by [frag1, frag2]
-        n_occ = 0  # Number of occurences of each frag combination
-        n_nonzero = 0  # Total number of nonzero matrix entries
-        n_pairs = 0  # Total number of pairs entered in the matrix
-        pairs_reader = csv.reader(pairs, delimiter="\t")
-        # First line contains nrows, ncols and number of nonzero entries.
-        # Number of nonzero entries is unknown for now
-        if mat_fmt == "graal":
-            mat.write("\t".join(map(str, [n_frags, n_frags, "-"])) + "\n")
-        for pair in pairs_reader:
-            # Fragment ids are field 8 and 9
-            curr_pair = [pair[7], pair[8]]
-            # Increment number of occurences for fragment pair
-            if prev_pair == curr_pair:
-                n_occ += 1
-            # Write previous pair and start a new one
-            else:
-                if n_occ > 0:
-                    write_mat_entry(prev_pair[0], prev_pair[1], n_occ)
-                prev_pair = curr_pair
-                n_pairs += n_occ
-                n_occ = 1
-                n_nonzero += 1
-        # Write the last value
-        write_mat_entry(curr_pair[0], curr_pair[1], n_occ)
-        n_nonzero += 1
-        n_pairs += 1
-
-    # Edit header line to fill number of nonzero entries inplace in graal header
-    if mat_fmt == "graal":
-        with open(mat_file) as mat, open(pre_mat_file, "w") as tmp_mat:
-            header = mat.readline()
-            header = header.replace("-", str(n_nonzero))
-            tmp_mat.write(header)
-            st.copyfileobj(mat, tmp_mat)
-        # Replace the matrix file with the one with corrected header
-        os.rename(pre_mat_file, mat_file)
-    else:
-        os.remove(pre_mat_file)
-
-    logger.info(
-        "%d pairs used to build a contact map of %d bins with %d nonzero entries.",
-        n_pairs,
-        n_frags,
-        n_nonzero,
-    )
-
-
-def pairs2cool(pairs_file, cool_file, bins_file):
-    """
-    Make a cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.
-
-    Parameters
-    ----------
-    pairs_file : str
-        Path to the pairs file containing input contact data.
-    cool_file : str
-        Path to the output cool file name to generate.
-    bins_file : str
-        Path to the file containing genomic segmentation information. (fragments_list.txt).
-    """
-
-    # Make bins file compatible with cooler cload
-    bins_tmp = bins_file + ".cooler"
-    bins = pd.read_csv(bins_file, sep="\t", usecols=[1, 2, 3], skiprows=1, header=None)
-    bins.to_csv(bins_tmp, sep="\t", header=False, index=False)
-
-    cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5 {bins} {pairs} {cool}"
-    cool_args = {"bins": bins_tmp, "pairs": pairs_file, "cool": cool_file}
-    sp.call(cooler_cmd.format(**cool_args), shell=True)
-    os.remove(bins_tmp)
-
+import hicstuff.pipeline as hp
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -161,37 +24,40 @@ if __name__ == "__main__":
 
     pairs = args.pairs
     fragments_list = args.fragments
-    mat_format = args.type
+    mat_fmt = args.type
     mat_out = args.output
 
     log_file = "hicstuff_matrix.log"
     sys.stderr = open ("hicstuff_matrix.log", "wt")
     hcl.set_file_handler(log_file)
 
-    # Log which pairs file is being used and how many pairs are listed
-    pairs_count = 0
-    with open(pairs, "r") as file:
-        for line in file:
-            if line.startswith('#'):
-                continue
-            else:
-                pairs_count += 1
-    logger.info(
-        "Generating matrix from pairs file %s (%d pairs in the file) ",
-        pairs, pairs_count
-    )
-
+    # Build matrix from pairs.
+    if mat_fmt == "cool":
+
+        # Log which pairs file is being used and how many pairs are listed
+        pairs_count = 0
+        with open(pairs, "r") as file:
+            for line in file:
+                if line.startswith('#'):
+                    continue
+                else:
+                    pairs_count += 1
+        logger.info(
+            "Generating matrix from pairs file %s (%d pairs in the file) ",
+            pairs, pairs_count
+        )
 
-    if mat_format == "cool":
         # Name matrix file in .cool
-        cool_file = os.path.splitext(mat_out)[0] + ".cool"
-        pairs2cool(pairs, cool_file, fragments_list)
+        mat = os.path.splitext(mat_out)[0] + ".cool"
+
+        hp.pairs2cool(pairs, mat, fragments_list, exclude=None)
+
     else:
-        pairs2matrix(
+        hp.pairs2matrix(
             pairs,
             mat_out,
             fragments_list,
-            mat_fmt=mat_format,
+            mat_fmt=mat_fmt,
             threads=1,
             tmp_dir=None,
         )
diff --git a/modules/local/hicstuff/build_matrix.nf b/modules/local/hicstuff/build_matrix.nf
index 7746449..54e34cd 100644
--- a/modules/local/hicstuff/build_matrix.nf
+++ b/modules/local/hicstuff/build_matrix.nf
@@ -20,7 +20,7 @@ process BUILD_MATRIX {
     """
     grep -e ^# ${idx_pairs} > ${args}.pre.pairs
     grep -v ^# ${idx_pairs} | sort -S 2G -k8,8n -k9,9n --parallel=1 >> ${args}.pre.pairs
-    hicstuff_build_matrix.py -p ${args}.pre.pairs -f ${fragments_list} -t graal -o ${args}
+    hicstuff_build_matrix.py -p ${idx_pairs} -f ${fragments_list} -t graal -o ${args}
 
     pairtools sort ${idx_pairs} --output ${idx_pairs}.gz --c1 chr1 --c2 chr2 --p1 pos1 --p2 pos2 --pt frag1
 
-- 
GitLab