From f54462d4311d54f7c3b96fdb89beb2835d0bebb9 Mon Sep 17 00:00:00 2001
From: Mia Croiset <mia.croiset@ens-lyon.fr>
Date: Thu, 16 Feb 2023 17:26:33 +0100
Subject: [PATCH] TEST module build_matrix

---
 bin/hicstuff_bam2pairs.py              |   2 +-
 bin/hicstuff_build_matrix.py           | 175 +++++++++++++++++++++++++
 conf/hicstuff.config                   |   6 +
 modules/local/hicstuff/build_matrix.nf |  19 +++
 subworkflows/local/hicstuff_sub.nf     |   6 +
 5 files changed, 207 insertions(+), 1 deletion(-)
 create mode 100755 bin/hicstuff_build_matrix.py
 create mode 100644 modules/local/hicstuff/build_matrix.nf

diff --git a/bin/hicstuff_bam2pairs.py b/bin/hicstuff_bam2pairs.py
index 6697197..ea456aa 100755
--- a/bin/hicstuff_bam2pairs.py
+++ b/bin/hicstuff_bam2pairs.py
@@ -155,7 +155,7 @@ if __name__ == "__main__":
     info_contigs = args.info_contigs
     min_qual = int(args.min_qual)
     enzyme = args.enzyme
-    fasta =args.fasta
+    fasta = args.fasta
 
     #hicstuff case sensitive enzymes adaptation
     if enzyme == "hindiii":
diff --git a/bin/hicstuff_build_matrix.py b/bin/hicstuff_build_matrix.py
new file mode 100755
index 0000000..451849e
--- /dev/null
+++ b/bin/hicstuff_build_matrix.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+
+import os, time, csv, sys, re
+import argparse
+import pysam as ps
+import pandas as pd
+import shutil as st
+import itertools
+from hicstuff_log import logger
+import hicstuff_io as hio
+import hicstuff_digest as hcd
+from Bio import SeqIO
+
+
+def pairs2matrix(
+    pairs_file, mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None
+):
+    """Generate the matrix by counting the number of occurences of each
+    combination of restriction fragments in a pairs file.
+    Parameters
+    ----------
+    pairs_file : str
+        Path to a Hi-C pairs file, with frag1 and frag2 columns added.
+    mat_file : str
+        Path where the matrix will be written.
+    fragments_file : str
+        Path to the fragments_list.txt file. Used to know total
+        matrix size in case some observations are not observed at the end.
+    mat_fmt : str
+        The format to use when writing the matrix. Can be graal or bg2 format.
+    threads : int
+        Number of threads to use in parallel.
+    tmp_dir : str
+        Temporary directory for sorting files. If None given, will use the system default.
+    """
+    # Number of fragments is N lines in frag list - 1 for the header
+    n_frags = sum(1 for line in open(fragments_file, "r")) - 1
+    frags = pd.read_csv(fragments_file, delimiter="\t")
+
+    def write_mat_entry(frag1, frag2, contacts):
+        """Write a single sparse matrix entry in either graal or bg2 format"""
+        if mat_fmt == "graal":
+            mat.write("\t".join(map(str, [frag1, frag2, n_occ])) + "\n")
+        elif mat_fmt == "bg2":
+            frag1, frag2 = int(frag1), int(frag2)
+            mat.write(
+                "\t".join(
+                    map(
+                        str,
+                        [
+                            frags.chrom[frag1],
+                            frags.start_pos[frag1],
+                            frags.end_pos[frag1],
+                            frags.chrom[frag2],
+                            frags.start_pos[frag2],
+                            frags.end_pos[frag2],
+                            contacts,
+                        ],
+                    )
+                )
+                + "\n"
+            )
+
+    pre_mat_file = mat_file + ".pre.pairs"
+    hio.sort_pairs(
+        pairs_file,
+        pre_mat_file,
+        keys=["frag1", "frag2"],
+        threads=threads,
+        tmp_dir=tmp_dir,
+    )
+    header_size = len(hio.get_pairs_header(pre_mat_file))
+    with open(pre_mat_file, "r") as pairs, open(mat_file, "w") as mat:
+        # Skip header lines
+        for _ in range(header_size):
+            next(pairs)
+        prev_pair = ["0", "0"]  # Pairs identified by [frag1, frag2]
+        n_occ = 0  # Number of occurences of each frag combination
+        n_nonzero = 0  # Total number of nonzero matrix entries
+        n_pairs = 0  # Total number of pairs entered in the matrix
+        pairs_reader = csv.reader(pairs, delimiter="\t")
+        # First line contains nrows, ncols and number of nonzero entries.
+        # Number of nonzero entries is unknown for now
+        if mat_fmt == "graal":
+            mat.write("\t".join(map(str, [n_frags, n_frags, "-"])) + "\n")
+        for pair in pairs_reader:
+            # Fragment ids are field 8 and 9
+            curr_pair = [pair[7], pair[8]]
+            # Increment number of occurences for fragment pair
+            if prev_pair == curr_pair:
+                n_occ += 1
+            # Write previous pair and start a new one
+            else:
+                if n_occ > 0:
+                    write_mat_entry(prev_pair[0], prev_pair[1], n_occ)
+                prev_pair = curr_pair
+                n_pairs += n_occ
+                n_occ = 1
+                n_nonzero += 1
+        # Write the last value
+        write_mat_entry(curr_pair[0], curr_pair[1], n_occ)
+        n_nonzero += 1
+        n_pairs += 1
+
+    # Edit header line to fill number of nonzero entries inplace in graal header
+    if mat_fmt == "graal":
+        with open(mat_file) as mat, open(pre_mat_file, "w") as tmp_mat:
+            header = mat.readline()
+            header = header.replace("-", str(n_nonzero))
+            tmp_mat.write(header)
+            st.copyfileobj(mat, tmp_mat)
+        # Replace the matrix file with the one with corrected header
+        os.rename(pre_mat_file, mat_file)
+    else:
+        os.remove(pre_mat_file)
+
+    logger.info(
+        "%d pairs used to build a contact map of %d bins with %d nonzero entries.",
+        n_pairs,
+        n_frags,
+        n_nonzero,
+    )
+
+
+def pairs2cool(pairs_file, cool_file, bins_file):
+    """
+    Make a cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.
+
+    Parameters
+    ----------
+    pairs_file : str
+        Path to the pairs file containing input contact data.
+    cool_file : str
+        Path to the output cool file name to generate.
+    bins_file : str
+        Path to the file containing genomic segmentation information. (fragments_list.txt).
+    """
+
+    # Make bins file compatible with cooler cload
+    bins_tmp = bins_file + ".cooler"
+    bins = pd.read_csv(bins_file, sep="\t", usecols=[1, 2, 3], skiprows=1, header=None)
+    bins.to_csv(bins_tmp, sep="\t", header=False, index=False)
+
+    cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5 {bins} {pairs} {cool}"
+    cool_args = {"bins": bins_tmp, "pairs": pairs_file, "cool": cool_file}
+    sp.call(cooler_cmd.format(**cool_args), shell=True)
+    os.remove(bins_tmp)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-p", "--pairs")
+    parser.add_argument("-f", "--fragments")
+    parser.add_argument("-t", "--type")
+    parser.add_argument("-o", "--output")
+    args = parser.parse_args()
+
+    pairs = args.pairs
+    fragments_list = args.fragments
+    mat_format = args.type
+    mat_out = args.output
+
+    if mat_format == "cool":
+        # Name matrix file in .cool
+        cool_file = os.path.splitext(mat_out)[0] + ".cool"
+        pairs2cool(pairs, cool_file, fragments_list)
+    else:
+        pairs2matrix(
+            pairs,
+            mat_out,
+            fragments_list,
+            mat_fmt=mat_format,
+            threads=1,
+            tmp_dir=None,
+        )
diff --git a/conf/hicstuff.config b/conf/hicstuff.config
index 1d23768..6b1178a 100644
--- a/conf/hicstuff.config
+++ b/conf/hicstuff.config
@@ -27,6 +27,12 @@ process {
             mode: 'copy'
         ]
     }
+    withName: 'BUILD_MATRIX' {
+        publishDir = [
+            path: { "${params.outdir}/hicstuff/matrix" },
+            mode: 'copy'
+        ]
+    }
 
     //**********************************************
     // PREPARE_GENOME
diff --git a/modules/local/hicstuff/build_matrix.nf b/modules/local/hicstuff/build_matrix.nf
new file mode 100644
index 0000000..4570159
--- /dev/null
+++ b/modules/local/hicstuff/build_matrix.nf
@@ -0,0 +1,19 @@
+process BUILD_MATRIX {
+    tag '$valid_pairs'
+    label 'process_single'
+
+    conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3"
+    container = "lbmc/hicstuff:3.1.3"
+
+    input:
+    tuple val(meta), path(idx_pairs)
+    tuple val(meta), path(fragments_list)
+
+    output:
+    tuple val(meta), path("abs_fragments_contacts_weighted.txt"), emit: matrix
+
+    script:
+    """
+    hicstuff_build_matrix.py -p ${idx_pairs} -f ${fragments_list} -t graal -o abs_fragments_contacts_weighted.txt
+    """
+}
diff --git a/subworkflows/local/hicstuff_sub.nf b/subworkflows/local/hicstuff_sub.nf
index 25f3118..d007bd3 100644
--- a/subworkflows/local/hicstuff_sub.nf
+++ b/subworkflows/local/hicstuff_sub.nf
@@ -2,6 +2,7 @@
 include { BOWTIE2_ALIGNMENT } from '../../modules/local/hicstuff/align_bowtie2'
 include { FRAGMENT_ENZYME } from '../../modules/local/hicstuff/fragment_enzyme'
 include { BAM2PAIRS } from '../../modules/local/hicstuff/bam2pairs'
+include { BUILD_MATRIX } from '../../modules/local/hicstuff/build_matrix'
 
 // Paired-end to Single-end
 def pairToSingle(row, mates) {
@@ -55,4 +56,9 @@ workflow HICSTUFF_SUB {
         digestion,
         fasta
     )
+
+    BUILD_MATRIX(
+        BAM2PAIRS.out.idx_pairs,
+        FRAGMENT_ENZYME.out.fragments_list
+    )
 }
-- 
GitLab