Skip to content
Snippets Groups Projects
Verified Commit 49ad90d5 authored by Mia Croiset's avatar Mia Croiset
Browse files

update build matrices function + delete manual sorting

parent bf23c27b
Branches
No related tags found
No related merge requests found
...@@ -12,144 +12,7 @@ import hicstuff.io as hio ...@@ -12,144 +12,7 @@ import hicstuff.io as hio
import hicstuff.digest as hcd import hicstuff.digest as hcd
from Bio import SeqIO from Bio import SeqIO
import hicstuff.log as hcl import hicstuff.log as hcl
import hicstuff.pipeline as hp
def pairs2matrix(
pairs_file, mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None
):
"""Generate the matrix by counting the number of occurences of each
combination of restriction fragments in a pairs file.
Parameters
----------
pairs_file : str
Path to a Hi-C pairs file, with frag1 and frag2 columns added.
mat_file : str
Path where the matrix will be written.
fragments_file : str
Path to the fragments_list.txt file. Used to know total
matrix size in case some observations are not observed at the end.
mat_fmt : str
The format to use when writing the matrix. Can be graal or bg2 format.
threads : int
Number of threads to use in parallel.
tmp_dir : str
Temporary directory for sorting files. If None given, will use the system default.
"""
# Number of fragments is N lines in frag list - 1 for the header
n_frags = sum(1 for line in open(fragments_file, "r")) - 1
frags = pd.read_csv(fragments_file, delimiter="\t")
def write_mat_entry(frag1, frag2, contacts):
"""Write a single sparse matrix entry in either graal or bg2 format"""
if mat_fmt == "graal":
mat.write("\t".join(map(str, [frag1, frag2, n_occ])) + "\n")
elif mat_fmt == "bg2":
frag1, frag2 = int(frag1), int(frag2)
mat.write(
"\t".join(
map(
str,
[
frags.chrom[frag1],
frags.start_pos[frag1],
frags.end_pos[frag1],
frags.chrom[frag2],
frags.start_pos[frag2],
frags.end_pos[frag2],
contacts,
],
)
)
+ "\n"
)
pre_mat_file = mat_file + ".pre.pairs"
# hio.sort_pairs(
# pairs_file,
# pre_mat_file,
# keys=["frag1", "frag2"],
# threads=threads,
# tmp_dir=tmp_dir,
# )
header_size = len(hio.get_pairs_header(pre_mat_file))
with open(pre_mat_file, "r") as pairs, open(mat_file, "w") as mat:
# Skip header lines
for _ in range(header_size):
next(pairs)
prev_pair = ["0", "0"] # Pairs identified by [frag1, frag2]
n_occ = 0 # Number of occurences of each frag combination
n_nonzero = 0 # Total number of nonzero matrix entries
n_pairs = 0 # Total number of pairs entered in the matrix
pairs_reader = csv.reader(pairs, delimiter="\t")
# First line contains nrows, ncols and number of nonzero entries.
# Number of nonzero entries is unknown for now
if mat_fmt == "graal":
mat.write("\t".join(map(str, [n_frags, n_frags, "-"])) + "\n")
for pair in pairs_reader:
# Fragment ids are field 8 and 9
curr_pair = [pair[7], pair[8]]
# Increment number of occurences for fragment pair
if prev_pair == curr_pair:
n_occ += 1
# Write previous pair and start a new one
else:
if n_occ > 0:
write_mat_entry(prev_pair[0], prev_pair[1], n_occ)
prev_pair = curr_pair
n_pairs += n_occ
n_occ = 1
n_nonzero += 1
# Write the last value
write_mat_entry(curr_pair[0], curr_pair[1], n_occ)
n_nonzero += 1
n_pairs += 1
# Edit header line to fill number of nonzero entries inplace in graal header
if mat_fmt == "graal":
with open(mat_file) as mat, open(pre_mat_file, "w") as tmp_mat:
header = mat.readline()
header = header.replace("-", str(n_nonzero))
tmp_mat.write(header)
st.copyfileobj(mat, tmp_mat)
# Replace the matrix file with the one with corrected header
os.rename(pre_mat_file, mat_file)
else:
os.remove(pre_mat_file)
logger.info(
"%d pairs used to build a contact map of %d bins with %d nonzero entries.",
n_pairs,
n_frags,
n_nonzero,
)
def pairs2cool(pairs_file, cool_file, bins_file):
"""
Make a cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.
Parameters
----------
pairs_file : str
Path to the pairs file containing input contact data.
cool_file : str
Path to the output cool file name to generate.
bins_file : str
Path to the file containing genomic segmentation information. (fragments_list.txt).
"""
# Make bins file compatible with cooler cload
bins_tmp = bins_file + ".cooler"
bins = pd.read_csv(bins_file, sep="\t", usecols=[1, 2, 3], skiprows=1, header=None)
bins.to_csv(bins_tmp, sep="\t", header=False, index=False)
cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5 {bins} {pairs} {cool}"
cool_args = {"bins": bins_tmp, "pairs": pairs_file, "cool": cool_file}
sp.call(cooler_cmd.format(**cool_args), shell=True)
os.remove(bins_tmp)
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
...@@ -161,13 +24,16 @@ if __name__ == "__main__": ...@@ -161,13 +24,16 @@ if __name__ == "__main__":
pairs = args.pairs pairs = args.pairs
fragments_list = args.fragments fragments_list = args.fragments
mat_format = args.type mat_fmt = args.type
mat_out = args.output mat_out = args.output
log_file = "hicstuff_matrix.log" log_file = "hicstuff_matrix.log"
sys.stderr = open ("hicstuff_matrix.log", "wt") sys.stderr = open ("hicstuff_matrix.log", "wt")
hcl.set_file_handler(log_file) hcl.set_file_handler(log_file)
# Build matrix from pairs.
if mat_fmt == "cool":
# Log which pairs file is being used and how many pairs are listed # Log which pairs file is being used and how many pairs are listed
pairs_count = 0 pairs_count = 0
with open(pairs, "r") as file: with open(pairs, "r") as file:
...@@ -181,17 +47,17 @@ if __name__ == "__main__": ...@@ -181,17 +47,17 @@ if __name__ == "__main__":
pairs, pairs_count pairs, pairs_count
) )
if mat_format == "cool":
# Name matrix file in .cool # Name matrix file in .cool
cool_file = os.path.splitext(mat_out)[0] + ".cool" mat = os.path.splitext(mat_out)[0] + ".cool"
pairs2cool(pairs, cool_file, fragments_list)
hp.pairs2cool(pairs, mat, fragments_list, exclude=None)
else: else:
pairs2matrix( hp.pairs2matrix(
pairs, pairs,
mat_out, mat_out,
fragments_list, fragments_list,
mat_fmt=mat_format, mat_fmt=mat_fmt,
threads=1, threads=1,
tmp_dir=None, tmp_dir=None,
) )
...@@ -20,7 +20,7 @@ process BUILD_MATRIX { ...@@ -20,7 +20,7 @@ process BUILD_MATRIX {
""" """
grep -e ^# ${idx_pairs} > ${args}.pre.pairs grep -e ^# ${idx_pairs} > ${args}.pre.pairs
grep -v ^# ${idx_pairs} | sort -S 2G -k8,8n -k9,9n --parallel=1 >> ${args}.pre.pairs grep -v ^# ${idx_pairs} | sort -S 2G -k8,8n -k9,9n --parallel=1 >> ${args}.pre.pairs
hicstuff_build_matrix.py -p ${args}.pre.pairs -f ${fragments_list} -t graal -o ${args} hicstuff_build_matrix.py -p ${idx_pairs} -f ${fragments_list} -t graal -o ${args}
pairtools sort ${idx_pairs} --output ${idx_pairs}.gz --c1 chr1 --c2 chr2 --p1 pos1 --p2 pos2 --pt frag1 pairtools sort ${idx_pairs} --output ${idx_pairs}.gz --c1 chr1 --c2 chr2 --p1 pos1 --p2 pos2 --pt frag1
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment