Skip to content
Snippets Groups Projects
Verified Commit f54462d4 authored by Mia Croiset's avatar Mia Croiset
Browse files

TEST module build_matrix

parent 9f6f0ce6
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
import os, time, csv, sys, re
import argparse
import pysam as ps
import pandas as pd
import shutil as st
import itertools
from hicstuff_log import logger
import hicstuff_io as hio
import hicstuff_digest as hcd
from Bio import SeqIO
def pairs2matrix(
pairs_file, mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None
):
"""Generate the matrix by counting the number of occurences of each
combination of restriction fragments in a pairs file.
Parameters
----------
pairs_file : str
Path to a Hi-C pairs file, with frag1 and frag2 columns added.
mat_file : str
Path where the matrix will be written.
fragments_file : str
Path to the fragments_list.txt file. Used to know total
matrix size in case some observations are not observed at the end.
mat_fmt : str
The format to use when writing the matrix. Can be graal or bg2 format.
threads : int
Number of threads to use in parallel.
tmp_dir : str
Temporary directory for sorting files. If None given, will use the system default.
"""
# Number of fragments is N lines in frag list - 1 for the header
n_frags = sum(1 for line in open(fragments_file, "r")) - 1
frags = pd.read_csv(fragments_file, delimiter="\t")
def write_mat_entry(frag1, frag2, contacts):
"""Write a single sparse matrix entry in either graal or bg2 format"""
if mat_fmt == "graal":
mat.write("\t".join(map(str, [frag1, frag2, n_occ])) + "\n")
elif mat_fmt == "bg2":
frag1, frag2 = int(frag1), int(frag2)
mat.write(
"\t".join(
map(
str,
[
frags.chrom[frag1],
frags.start_pos[frag1],
frags.end_pos[frag1],
frags.chrom[frag2],
frags.start_pos[frag2],
frags.end_pos[frag2],
contacts,
],
)
)
+ "\n"
)
pre_mat_file = mat_file + ".pre.pairs"
hio.sort_pairs(
pairs_file,
pre_mat_file,
keys=["frag1", "frag2"],
threads=threads,
tmp_dir=tmp_dir,
)
header_size = len(hio.get_pairs_header(pre_mat_file))
with open(pre_mat_file, "r") as pairs, open(mat_file, "w") as mat:
# Skip header lines
for _ in range(header_size):
next(pairs)
prev_pair = ["0", "0"] # Pairs identified by [frag1, frag2]
n_occ = 0 # Number of occurences of each frag combination
n_nonzero = 0 # Total number of nonzero matrix entries
n_pairs = 0 # Total number of pairs entered in the matrix
pairs_reader = csv.reader(pairs, delimiter="\t")
# First line contains nrows, ncols and number of nonzero entries.
# Number of nonzero entries is unknown for now
if mat_fmt == "graal":
mat.write("\t".join(map(str, [n_frags, n_frags, "-"])) + "\n")
for pair in pairs_reader:
# Fragment ids are field 8 and 9
curr_pair = [pair[7], pair[8]]
# Increment number of occurences for fragment pair
if prev_pair == curr_pair:
n_occ += 1
# Write previous pair and start a new one
else:
if n_occ > 0:
write_mat_entry(prev_pair[0], prev_pair[1], n_occ)
prev_pair = curr_pair
n_pairs += n_occ
n_occ = 1
n_nonzero += 1
# Write the last value
write_mat_entry(curr_pair[0], curr_pair[1], n_occ)
n_nonzero += 1
n_pairs += 1
# Edit header line to fill number of nonzero entries inplace in graal header
if mat_fmt == "graal":
with open(mat_file) as mat, open(pre_mat_file, "w") as tmp_mat:
header = mat.readline()
header = header.replace("-", str(n_nonzero))
tmp_mat.write(header)
st.copyfileobj(mat, tmp_mat)
# Replace the matrix file with the one with corrected header
os.rename(pre_mat_file, mat_file)
else:
os.remove(pre_mat_file)
logger.info(
"%d pairs used to build a contact map of %d bins with %d nonzero entries.",
n_pairs,
n_frags,
n_nonzero,
)
def pairs2cool(pairs_file, cool_file, bins_file):
"""
Make a cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.
Parameters
----------
pairs_file : str
Path to the pairs file containing input contact data.
cool_file : str
Path to the output cool file name to generate.
bins_file : str
Path to the file containing genomic segmentation information. (fragments_list.txt).
"""
# Make bins file compatible with cooler cload
bins_tmp = bins_file + ".cooler"
bins = pd.read_csv(bins_file, sep="\t", usecols=[1, 2, 3], skiprows=1, header=None)
bins.to_csv(bins_tmp, sep="\t", header=False, index=False)
cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5 {bins} {pairs} {cool}"
cool_args = {"bins": bins_tmp, "pairs": pairs_file, "cool": cool_file}
sp.call(cooler_cmd.format(**cool_args), shell=True)
os.remove(bins_tmp)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--pairs")
parser.add_argument("-f", "--fragments")
parser.add_argument("-t", "--type")
parser.add_argument("-o", "--output")
args = parser.parse_args()
pairs = args.pairs
fragments_list = args.fragments
mat_format = args.type
mat_out = args.output
if mat_format == "cool":
# Name matrix file in .cool
cool_file = os.path.splitext(mat_out)[0] + ".cool"
pairs2cool(pairs, cool_file, fragments_list)
else:
pairs2matrix(
pairs,
mat_out,
fragments_list,
mat_fmt=mat_format,
threads=1,
tmp_dir=None,
)
...@@ -27,6 +27,12 @@ process { ...@@ -27,6 +27,12 @@ process {
mode: 'copy' mode: 'copy'
] ]
} }
withName: 'BUILD_MATRIX' {
publishDir = [
path: { "${params.outdir}/hicstuff/matrix" },
mode: 'copy'
]
}
//********************************************** //**********************************************
// PREPARE_GENOME // PREPARE_GENOME
......
process BUILD_MATRIX {
tag '$valid_pairs'
label 'process_single'
conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3"
container = "lbmc/hicstuff:3.1.3"
input:
tuple val(meta), path(idx_pairs)
tuple val(meta), path(fragments_list)
output:
tuple val(meta), path("abs_fragments_contacts_weighted.txt"), emit: matrix
script:
"""
hicstuff_build_matrix.py -p ${idx_pairs} -f ${fragments_list} -t graal -o abs_fragments_contacts_weighted.txt
"""
}
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
include { BOWTIE2_ALIGNMENT } from '../../modules/local/hicstuff/align_bowtie2' include { BOWTIE2_ALIGNMENT } from '../../modules/local/hicstuff/align_bowtie2'
include { FRAGMENT_ENZYME } from '../../modules/local/hicstuff/fragment_enzyme' include { FRAGMENT_ENZYME } from '../../modules/local/hicstuff/fragment_enzyme'
include { BAM2PAIRS } from '../../modules/local/hicstuff/bam2pairs' include { BAM2PAIRS } from '../../modules/local/hicstuff/bam2pairs'
include { BUILD_MATRIX } from '../../modules/local/hicstuff/build_matrix'
// Paired-end to Single-end // Paired-end to Single-end
def pairToSingle(row, mates) { def pairToSingle(row, mates) {
...@@ -55,4 +56,9 @@ workflow HICSTUFF_SUB { ...@@ -55,4 +56,9 @@ workflow HICSTUFF_SUB {
digestion, digestion,
fasta fasta
) )
BUILD_MATRIX(
BAM2PAIRS.out.idx_pairs,
FRAGMENT_ENZYME.out.fragments_list
)
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment