Skip to content
Snippets Groups Projects
Verified Commit 49ad90d5 authored by Mia Croiset's avatar Mia Croiset
Browse files

update build matrices function + delete manual sorting

parent bf23c27b
No related branches found
No related tags found
No related merge requests found
......@@ -12,144 +12,7 @@ import hicstuff.io as hio
import hicstuff.digest as hcd
from Bio import SeqIO
import hicstuff.log as hcl
def pairs2matrix(
pairs_file, mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None
):
"""Generate the matrix by counting the number of occurences of each
combination of restriction fragments in a pairs file.
Parameters
----------
pairs_file : str
Path to a Hi-C pairs file, with frag1 and frag2 columns added.
mat_file : str
Path where the matrix will be written.
fragments_file : str
Path to the fragments_list.txt file. Used to know total
matrix size in case some observations are not observed at the end.
mat_fmt : str
The format to use when writing the matrix. Can be graal or bg2 format.
threads : int
Number of threads to use in parallel.
tmp_dir : str
Temporary directory for sorting files. If None given, will use the system default.
"""
# Number of fragments is N lines in frag list - 1 for the header
n_frags = sum(1 for line in open(fragments_file, "r")) - 1
frags = pd.read_csv(fragments_file, delimiter="\t")
def write_mat_entry(frag1, frag2, contacts):
"""Write a single sparse matrix entry in either graal or bg2 format"""
if mat_fmt == "graal":
mat.write("\t".join(map(str, [frag1, frag2, n_occ])) + "\n")
elif mat_fmt == "bg2":
frag1, frag2 = int(frag1), int(frag2)
mat.write(
"\t".join(
map(
str,
[
frags.chrom[frag1],
frags.start_pos[frag1],
frags.end_pos[frag1],
frags.chrom[frag2],
frags.start_pos[frag2],
frags.end_pos[frag2],
contacts,
],
)
)
+ "\n"
)
pre_mat_file = mat_file + ".pre.pairs"
# hio.sort_pairs(
# pairs_file,
# pre_mat_file,
# keys=["frag1", "frag2"],
# threads=threads,
# tmp_dir=tmp_dir,
# )
header_size = len(hio.get_pairs_header(pre_mat_file))
with open(pre_mat_file, "r") as pairs, open(mat_file, "w") as mat:
# Skip header lines
for _ in range(header_size):
next(pairs)
prev_pair = ["0", "0"] # Pairs identified by [frag1, frag2]
n_occ = 0 # Number of occurences of each frag combination
n_nonzero = 0 # Total number of nonzero matrix entries
n_pairs = 0 # Total number of pairs entered in the matrix
pairs_reader = csv.reader(pairs, delimiter="\t")
# First line contains nrows, ncols and number of nonzero entries.
# Number of nonzero entries is unknown for now
if mat_fmt == "graal":
mat.write("\t".join(map(str, [n_frags, n_frags, "-"])) + "\n")
for pair in pairs_reader:
# Fragment ids are field 8 and 9
curr_pair = [pair[7], pair[8]]
# Increment number of occurences for fragment pair
if prev_pair == curr_pair:
n_occ += 1
# Write previous pair and start a new one
else:
if n_occ > 0:
write_mat_entry(prev_pair[0], prev_pair[1], n_occ)
prev_pair = curr_pair
n_pairs += n_occ
n_occ = 1
n_nonzero += 1
# Write the last value
write_mat_entry(curr_pair[0], curr_pair[1], n_occ)
n_nonzero += 1
n_pairs += 1
# Edit header line to fill number of nonzero entries inplace in graal header
if mat_fmt == "graal":
with open(mat_file) as mat, open(pre_mat_file, "w") as tmp_mat:
header = mat.readline()
header = header.replace("-", str(n_nonzero))
tmp_mat.write(header)
st.copyfileobj(mat, tmp_mat)
# Replace the matrix file with the one with corrected header
os.rename(pre_mat_file, mat_file)
else:
os.remove(pre_mat_file)
logger.info(
"%d pairs used to build a contact map of %d bins with %d nonzero entries.",
n_pairs,
n_frags,
n_nonzero,
)
def pairs2cool(pairs_file, cool_file, bins_file):
"""
Make a cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.
Parameters
----------
pairs_file : str
Path to the pairs file containing input contact data.
cool_file : str
Path to the output cool file name to generate.
bins_file : str
Path to the file containing genomic segmentation information. (fragments_list.txt).
"""
# Make bins file compatible with cooler cload
bins_tmp = bins_file + ".cooler"
bins = pd.read_csv(bins_file, sep="\t", usecols=[1, 2, 3], skiprows=1, header=None)
bins.to_csv(bins_tmp, sep="\t", header=False, index=False)
cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5 {bins} {pairs} {cool}"
cool_args = {"bins": bins_tmp, "pairs": pairs_file, "cool": cool_file}
sp.call(cooler_cmd.format(**cool_args), shell=True)
os.remove(bins_tmp)
import hicstuff.pipeline as hp
if __name__ == "__main__":
parser = argparse.ArgumentParser()
......@@ -161,37 +24,40 @@ if __name__ == "__main__":
pairs = args.pairs
fragments_list = args.fragments
mat_format = args.type
mat_fmt = args.type
mat_out = args.output
log_file = "hicstuff_matrix.log"
sys.stderr = open ("hicstuff_matrix.log", "wt")
hcl.set_file_handler(log_file)
# Log which pairs file is being used and how many pairs are listed
pairs_count = 0
with open(pairs, "r") as file:
for line in file:
if line.startswith('#'):
continue
else:
pairs_count += 1
logger.info(
"Generating matrix from pairs file %s (%d pairs in the file) ",
pairs, pairs_count
)
# Build matrix from pairs.
if mat_fmt == "cool":
# Log which pairs file is being used and how many pairs are listed
pairs_count = 0
with open(pairs, "r") as file:
for line in file:
if line.startswith('#'):
continue
else:
pairs_count += 1
logger.info(
"Generating matrix from pairs file %s (%d pairs in the file) ",
pairs, pairs_count
)
if mat_format == "cool":
# Name matrix file in .cool
cool_file = os.path.splitext(mat_out)[0] + ".cool"
pairs2cool(pairs, cool_file, fragments_list)
mat = os.path.splitext(mat_out)[0] + ".cool"
hp.pairs2cool(pairs, mat, fragments_list, exclude=None)
else:
pairs2matrix(
hp.pairs2matrix(
pairs,
mat_out,
fragments_list,
mat_fmt=mat_format,
mat_fmt=mat_fmt,
threads=1,
tmp_dir=None,
)
......@@ -20,7 +20,7 @@ process BUILD_MATRIX {
"""
grep -e ^# ${idx_pairs} > ${args}.pre.pairs
grep -v ^# ${idx_pairs} | sort -S 2G -k8,8n -k9,9n --parallel=1 >> ${args}.pre.pairs
hicstuff_build_matrix.py -p ${args}.pre.pairs -f ${fragments_list} -t graal -o ${args}
hicstuff_build_matrix.py -p ${idx_pairs} -f ${fragments_list} -t graal -o ${args}
pairtools sort ${idx_pairs} --output ${idx_pairs}.gz --c1 chr1 --c2 chr2 --p1 pos1 --p2 pos2 --pt frag1
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment