From f1fe040e026ec002573a16339b80848d38aeb173 Mon Sep 17 00:00:00 2001 From: Mia Croiset <mia.croiset@ens-lyon.fr> Date: Fri, 10 Feb 2023 09:54:32 +0100 Subject: [PATCH] TEST fragment enzyme modules --- bin/__init__.py | 1 + bin/hicstuff_digest.py | 489 ++++++++ bin/hicstuff_fragments.py | 40 + bin/hicstuff_io.py | 1379 +++++++++++++++++++++ bin/hicstuff_log.py | 89 ++ conf/hicstuff.config | 31 + modules/local/hicstuff/fragment_enzyme.nf | 21 + subworkflows/local/hicstuff_sub.nf | 12 + workflows/hicstuff.nf | 8 +- 9 files changed, 2067 insertions(+), 3 deletions(-) create mode 100644 bin/__init__.py create mode 100644 bin/hicstuff_digest.py create mode 100755 bin/hicstuff_fragments.py create mode 100644 bin/hicstuff_io.py create mode 100644 bin/hicstuff_log.py create mode 100644 modules/local/hicstuff/fragment_enzyme.nf diff --git a/bin/__init__.py b/bin/__init__.py new file mode 100644 index 0000000..813219d --- /dev/null +++ b/bin/__init__.py @@ -0,0 +1 @@ +from .bin import * diff --git a/bin/hicstuff_digest.py b/bin/hicstuff_digest.py new file mode 100644 index 0000000..3fba3da --- /dev/null +++ b/bin/hicstuff_digest.py @@ -0,0 +1,489 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +"""Genome digestion +Functions used to write auxiliary instagraal compatible +sparse matrices. +""" + +from Bio import SeqIO, SeqUtils +from Bio.Seq import Seq +from Bio.Restriction import RestrictionBatch, Analysis +import os, sys, csv +import re +import collections +import copy +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from hicstuff_log import logger +import hicstuff_io as hio + +DEFAULT_FRAGMENTS_LIST_FILE_NAME = "fragments_list.txt" +DEFAULT_INFO_CONTIGS_FILE_NAME = "info_contigs.txt" +DEFAULT_SPARSE_MATRIX_FILE_NAME = "abs_fragments_contacts_weighted.txt" +DEFAULT_KB_BINNING = 1 +DEFAULT_THRESHOLD_SIZE = 0 +# Most used enzyme for eukaryotes +DEFAULT_ENZYME = "DpnII" +# If using evenly-sized chunks instead of restriction +# enzymes, they shouldn't be too short +DEFAULT_MIN_CHUNK_SIZE = 50 + + +def write_frag_info( + fasta, + enzyme, + min_size=DEFAULT_THRESHOLD_SIZE, + circular=False, + output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME, + output_frags=DEFAULT_FRAGMENTS_LIST_FILE_NAME, + output_dir=None, +): + """Digest and write fragment information + Write the fragments_list.txt and info_contigs.txt that are necessary for + instagraal to run. + Parameters + ---------- + fasta : pathlib.Path or str + The path to the reference genome + enzyme : str, int or list of str + If a string, must be the name of an enzyme (e.g. DpnII) and the genome + will be cut at the enzyme's restriction sites. If a number, the genome + will be cut uniformly into chunks with length equal to that number. A + list of enzymes can also be specified if using multiple enzymes. + min_size : float, optional + Size below which shorter contigs are discarded. Default is 0, i.e. all + contigs are retained. + circular : bool, optional + Whether the genome is circular. Default is False. + output_contigs : str, optional + The name of the file with contig info. Default is info_contigs.txt + output_frags : str, optional + The name of the file with fragment info. Default is fragments_list.txt + output_dir : [type], optional + The path to the output directory, which will be created if not already + existing. Default is the current directory. 
+ """ + + records = SeqIO.parse(hio.read_compressed(fasta), "fasta") + + try: + info_contigs_path = os.path.join(output_dir, output_contigs) + frag_list_path = os.path.join(output_dir, output_frags) + except TypeError: + info_contigs_path = output_contigs + frag_list_path = output_frags + + with open(info_contigs_path, "w") as info_contigs: + + info_contigs.write("contig\tlength\tn_frags\tcumul_length\n") + + with open(frag_list_path, "w") as fragments_list: + + fragments_list.write( + "id\tchrom\tstart_pos" "\tend_pos\tsize\tgc_content\n" + ) + + total_frags = 0 + + for record in records: + contig_seq = record.seq + contig_name = record.id + contig_length = len(contig_seq) + if contig_length < int(min_size): + continue + + sites = get_restriction_table( + contig_seq, enzyme, circular=circular + ) + fragments = ( + contig_seq[sites[i] : sites[i + 1]] + for i in range(len(sites) - 1) + ) + n_frags = 0 + + current_id = 1 + start_pos = 0 + for frag in fragments: + frag_length = len(frag) + if frag_length > 0: + end_pos = start_pos + frag_length + gc_content = SeqUtils.GC(frag) / 100.0 + + current_fragment_line = "%s\t%s\t%s\t%s\t%s\t%s\n" % ( + current_id, + contig_name, + start_pos, + end_pos, + frag_length, + gc_content, + ) + + fragments_list.write(current_fragment_line) + + try: + assert (current_id == 1 and start_pos == 0) or ( + current_id > 1 and start_pos > 0 + ) + except AssertionError: + logger.error((current_id, start_pos)) + raise + start_pos = end_pos + current_id += 1 + n_frags += 1 + + current_contig_line = "%s\t%s\t%s\t%s\n" % ( + contig_name, + contig_length, + n_frags, + total_frags, + ) + total_frags += n_frags + info_contigs.write(current_contig_line) + + +def attribute_fragments(pairs_file, idx_pairs_file, restriction_table): + """ + Writes the indexed pairs file, which has two more columns than the input + pairs file corresponding to the restriction fragment index of each read. + Note that pairs files have 1bp point positions whereas restriction table + has 0bp point poisitions. + Parameters + ---------- + pairs_file: str + Path the the input pairs file. Consists of 7 tab-separated + columns: readID, chr1, pos1, chr2, pos2, strand1, strand2 + idx_pairs_file: str + Path to the output indexed pairs file. Consists of 9 white space + separated columns: readID, chr1, pos1, chr2, pos2, strand1, strand2, + frag1, frag2. frag1 and frag2 are 0-based restriction fragments based + on whole genome. + restriction_table: dict + Dictionary with chromosome identifiers (str) as keys and list of + positions (int) of restriction sites as values. + """ + + # NOTE: Bottlenecks here are 1. binary search in find_frag and 2. writerow + # 1. could be reduced by searching groups of N frags in parallel and 2. by + # writing N frags simultaneously using a single call of writerows. 
+ + # Parse and update header section + pairs_header = hio.get_pairs_header(pairs_file) + header_size = len(pairs_header) + chrom_order = [] + with open(idx_pairs_file, "w") as idx_pairs: + for line in pairs_header: + # Add new column names to header + if line.startswith("#columns"): + line = line.rstrip() + " frag1 frag2" + if line.startswith("#chromsize"): + chrom_order.append(line.split()[1]) + idx_pairs.write(line + "\n") + + # Get number of fragments per chrom to allow genome-based indices + shift_frags = {} + prev_frags = 0 + for rank, chrom in enumerate(chrom_order): + if rank > 0: + # Note the "-1" because there are nfrags + 1 sites in rest table + prev_frags += len(restriction_table[chrom_order[rank - 1]]) - 1 + # Idx of each chrom's frags will be shifted by n frags in previous chroms + shift_frags[chrom] = prev_frags + + missing_contigs = set() + # Attribute pairs to fragments and append them to output file (after header) + with open(pairs_file, "r") as pairs, open( + idx_pairs_file, "a" + ) as idx_pairs: + # Skip header lines + for _ in range(header_size): + next(pairs) + + # Define input and output fields + pairs_cols = [ + "readID", + "chr1", + "pos1", + "chr2", + "pos2", + "strand1", + "strand2", + ] + idx_cols = pairs_cols + ["frag1", "frag2"] + + # Use csv reader / writer to automatically parse columns into a dict + pairs_reader = csv.DictReader( + pairs, fieldnames=pairs_cols, delimiter="\t" + ) + pairs_writer = csv.DictWriter( + idx_pairs, fieldnames=idx_cols, delimiter="\t" + ) + + for pair in pairs_reader: + # Get the 0-based indices of corresponding restriction fragments + # Deducing 1 from pair position to get it into 0bp point + pair["frag1"] = find_frag( + int(pair["pos1"]) - 1, restriction_table[pair["chr1"]] + ) + pair["frag2"] = find_frag( + int(pair["pos2"]) - 1, restriction_table[pair["chr2"]] + ) + # Shift fragment indices to make them genome-based instead of + # chromosome-based + try: + pair["frag1"] += shift_frags[pair["chr1"]] + except KeyError: + missing_contigs.add(pair["chr1"]) + try: + pair["frag2"] += shift_frags[pair["chr2"]] + except KeyError: + missing_contigs.add(pair["chr2"]) + + # Write indexed pairs in the new file + pairs_writer.writerow(pair) + + if missing_contigs: + logger.warning( + "Pairs on the following contigs were discarded as " + "those contigs are not listed in the paris file header. " + "This is normal if you filtered out small contigs: %s" + % " ".join(list(missing_contigs)) + ) + + +def get_restriction_table(seq, enzyme, circular=False): + """ + Get the restriction table for a single genomic sequence. + Parameters + ---------- + seq : Seq object + A biopython Seq object representing a chromosomes or contig. + enzyme : int, str or list of str + The name of the restriction enzyme used, or a list of restriction + enzyme names. Can also be an integer, to digest by fixed chunk size. + circular : bool + Wether the genome is circular. + Returns + ------- + numpy.array: + List of restriction fragment boundary positions for the input sequence. + >>> from Bio.Seq import Seq + >>> get_restriction_table(Seq("AAGCCGGATCGG"),"HpaII") + array([ 0, 4, 12]) + >>> get_restriction_table(Seq("AA"),["HpaII", "MluCI"]) + array([0, 2]) + >>> get_restriction_table(Seq("AA"),"aeiou1") + Traceback (most recent call last): + ... + ValueError: aeiou1 is not a valid restriction enzyme. + >>> get_restriction_table("AA","HpaII") + Traceback (most recent call last): + ... 
+ TypeError: Expected Seq or MutableSeq instance, got <class 'str'> instead + """ + chrom_len = len(seq) + wrong_enzyme = "{} is not a valid restriction enzyme.".format(enzyme) + # Restriction batch containing the restriction enzyme + try: + enz = [enzyme] if isinstance(enzyme, str) else enzyme + cutter = RestrictionBatch(enz) + except (TypeError, ValueError): + try: + cutter = max(int(enzyme), DEFAULT_MIN_CHUNK_SIZE) + except ValueError: + raise ValueError(wrong_enzyme) + + # Conversion from string type to restriction type + if isinstance(cutter, int): + sites = [i for i in range(0, chrom_len, cutter)] + if sites[-1] < chrom_len: + sites.append(chrom_len) + else: + # Find sites of all restriction enzymes given + ana = Analysis(cutter, seq, linear=not circular) + sites = ana.full() + # Gets all sites into a single flat list with 0-based index + sites = [site - 1 for enz in sites.values() for site in enz] + # Sort by position and allow first add start and end of seq + sites.sort() + sites.insert(0, 0) + sites.append(chrom_len) + + return np.array(sites) + + +def find_frag(pos, r_sites): + """ + Use binary search to find the index of a chromosome restriction fragment + corresponding to an input genomic position. + Parameters + ---------- + pos : int + Genomic position, in base pairs. + r_sites : list + List of genomic positions corresponding to restriction sites. + Returns + ------- + int + The 0-based index of the restriction fragment to which the position belongs. + >>> find_frag(15, [0, 20, 30]) + 0 + >>> find_frag(15, [10, 20, 30]) + Traceback (most recent call last): + ... + ValueError: The first position in the restriction table is not 0. + >>> find_frag(31, [0, 20, 30]) + Traceback (most recent call last): + ... + ValueError: Read position is larger than last entry in restriction table. + """ + if r_sites[0] != 0: + raise ValueError( + "The first position in the restriction table is not 0." + ) + if pos > r_sites[-1]: + raise ValueError( + "Read position is larger than last entry in restriction table." + ) + # binary search for the index of the read + index = max(np.searchsorted(r_sites, pos, side="right") - 1, 0) + # Last site = end of the chrom, index of last fragment is last site - 1 + index = min(len(r_sites) - 2, index) + + return index + + +def frag_len( + frags_file_name=DEFAULT_FRAGMENTS_LIST_FILE_NAME, + output_dir=None, + plot=False, + fig_path=None, +): + """ + logs summary statistics of fragment length distribution based on an + input fragment file. Can optionally show a histogram instead + of text summary. + Parameters + ---------- + frags_file_name : str + Path to the output list of fragments. + output_dir : str + Directory where the list should be saved. + plot : bool + Wether a histogram of fragment length should be shown. + fig_path : str + If a path is given, the figure will be saved instead of shown. 
+ """ + + try: + frag_list_path = os.path.join(output_dir, frags_file_name) + except TypeError: + frag_list_path = frags_file_name + frags = pd.read_csv(frag_list_path, sep="\t") + nfrags = frags.shape[0] + med_len = frags["size"].median() + nbins = 40 + if plot: + fig, ax = plt.subplots() + _, _, _ = ax.hist(frags["size"], bins=nbins) + + ax.set_xlabel("Fragment length [bp]") + ax.set_ylabel("Log10 number of fragments") + ax.set_title("Distribution of restriction fragment length") + ax.set_yscale("log", base=10) + ax.annotate( + "Total fragments: {}".format(nfrags), + xy=(0.95, 0.95), + xycoords="axes fraction", + fontsize=12, + horizontalalignment="right", + verticalalignment="top", + ) + ax.annotate( + "Median length: {}".format(med_len), + xy=(0.95, 0.90), + xycoords="axes fraction", + fontsize=12, + horizontalalignment="right", + verticalalignment="top", + ) + # Tweak spacing to prevent clipping of ylabel + fig.tight_layout() + if fig_path: + plt.savefig(fig_path) + else: + plt.show() + plt.clf() + else: + logger.info( + "Genome digested into {0} fragments with a median " + "length of {1}".format(nfrags, med_len) + ) + + +def gen_enzyme_religation_regex(enzyme): + """Return a regex which corresponds to all possible religation sites given a + set of enzyme. + Parameters: + ----------- + enzyme : str + String that contains the names of the enzyme separated by a comma. + Returns: + -------- + re.Pattern : + Regex that corresponds to all possible ligation sites given a set of + enzyme. + Examples: + --------- + >>> gen_enzyme_religation_regex('HpaII') + re.compile('CCGCGG') + >>> gen_enzyme_religation_regex('HpaII,MluCI') + re.compile('AATTAATT|AATTCGG|CCGAATT|CCGCGG') + """ + + # Split the str on the comma to separate the different enzymes. + enzyme = enzyme.split(",") + + # Check on Biopython dictionnary the enzyme. + rb = RestrictionBatch(enzyme) + + # Initiation: + give_list = [] + accept_list = [] + ligation_list = [] + + # Iterates on the enzymes. + for enz in rb: + + # Extract restriction sites and look for cut sites. + site = enz.elucidate() + fw_cut = site.find("^") + rev_cut = site.find("_") + + # Process "give" site. Remove N on the left (useless). + give_site = site[:rev_cut].replace("^", "") + while give_site[0] == "N": + give_site = give_site[1:] + give_list.append(give_site) + + # Process "accept" site. Remove N on the rigth (useless). + accept_site = site[fw_cut + 1 :].replace("_", "") + while accept_site[-1] == "N": + accept_site = accept_site[:-1] + accept_list.append(accept_site) + + # Iterates on the two list to build all the possible HiC ligation sites. + for give_site in give_list: + for accept_site in accept_list: + # Replace "N" by "." for regex searching of the sites + ligation_list.append((give_site + accept_site).replace("N", ".")) + ligation_list.append( + str(Seq(give_site + accept_site).reverse_complement()).replace( + "N", "." + ) + ) + + # Build the regex for any ligation sites. 
+    pattern = "|".join(sorted(list(set(ligation_list))))
+    return re.compile(pattern)
diff --git a/bin/hicstuff_fragments.py b/bin/hicstuff_fragments.py
new file mode 100755
index 0000000..c49378e
--- /dev/null
+++ b/bin/hicstuff_fragments.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+
+import hicstuff_digest as hcd
+import argparse
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-e", "--enzyme")
+    parser.add_argument("-i", "--index")
+    args = parser.parse_args()
+
+    enzyme = args.enzyme
+    index = args.index
+
+
+    # hicstuff expects case-sensitive enzyme names; map the lowercase CLI values
+    if enzyme == "hindiii":
+        enzyme = "HindIII"
+    elif enzyme == "dpnii":
+        enzyme = "DpnII"
+    elif enzyme == "bglii":
+        enzyme = "BglII"
+    elif enzyme == "mboi":
+        enzyme = "MboI"
+    elif enzyme == "arima":
+        enzyme = "Arima"
+
+
+    # Generate info_contigs and fragments_list output files
+    hcd.write_frag_info(
+        index,
+        enzyme,
+        min_size=0,
+        circular=False,
+        output_contigs="info_contigs.txt",
+        output_frags="fragments_list.txt",
+    )
+
+    # Log fragment size distribution
+    hcd.frag_len(frags_file_name="fragments_list.txt", plot=False, fig_path="frags_hist.pdf")
diff --git a/bin/hicstuff_io.py b/bin/hicstuff_io.py
new file mode 100644
index 0000000..c8c4dde
--- /dev/null
+++ b/bin/hicstuff_io.py
@@ -0,0 +1,1379 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import gzip
+import zipfile
+import bz2
+import io
+import os
+import functools
+import sys
+import numpy as np
+import pandas as pd
+import collections
+import subprocess as sp
+from scipy.sparse import tril, triu
+import pathlib
+import re
+from os.path import join, exists
+from random import getrandbits
+from scipy.sparse import coo_matrix
+from Bio import SeqIO, SeqUtils
+#import hicstuff.hicstuff as hcs  # only needed when load_sparse_matrix() is called with binning != 1
+from hicstuff_log import logger
+#from hicstuff.version import __version__  # only needed by flexible_hic_saver() for cool metadata
+import scipy.stats as ss
+
+DEFAULT_MAX_MATRIX_SHAPE = 10000
+
+DEFAULT_FRAGMENTS_LIST_FILE_NAME = "fragments_list.txt"
+DEFAULT_INFO_CONTIGS_FILE_NAME = "info_contigs.txt"
+DEFAULT_SPARSE_MATRIX_FILE_NAME = "abs_fragments_contacts_weighted.txt"
+
+
+def _cols_to_sparse(sparse_array, shape=None, dtype=np.float64):
+    """
+    Make a coordinate based sparse matrix from columns.
+    Convert (n, 3) shaped arrays to a sparse matrix. The first
+    column acts as the row coordinates, the second one as the
+    column coordinates, the third one as the data points
+    for each pair. If duplicates are found, the data points are
+    added.
+    Parameters
+    ----------
+    sparse_array : array_like
+        An array with exactly three columns representing the sparse
+        matrix data in coordinate format.
+    shape : tuple of int
+        The total number of rows and columns in the matrix. Will be estimated
+        from nonzero values if omitted.
+    dtype : type, optional
+        The type of data being loaded. Default is numpy.float64
+    Example
+    -------
+    >>> import numpy as np
+    >>> row, col = np.array([1, 2, 3]), np.array([3, 2, 1])
+    >>> data = np.array([4, 5, 6])
+    >>> M = np.array([row, col, data]).T
+    >>> S = _cols_to_sparse(M)
+    >>> print(S.todense())
+    [[0. 0. 0. 0.]
+     [0. 0. 0. 4.]
+     [0. 0. 5. 0.]
+     [0. 6. 0. 0.]]
+    """
+
+    row = sparse_array[:, 0]
+    col = sparse_array[:, 1]
+    data = sparse_array[:, 2]
+    if shape is None:
+        n = int(max(np.amax(row), np.amax(col))) + 1
+        shape = (n, n)
+
+    S = coo_matrix((data, (row, col)), shape=shape, dtype=dtype)
+    return S
+
+
+def load_sparse_matrix(mat_path, binning=1, dtype=np.float64):
+    """Load sparse matrix
+    Load a text file matrix into a sparse matrix object.
The expected format is + a 3 column file where columns are row_number, col_number, value. The first + line consists of 3 values representing the total number of rows, columns + and nonzero values. + Parameters + ---------- + mat_path : file, str or pathlib.Path + The input matrix file in instagraal format. + binning : int or "auto" + The binning to perform. If "auto", binning will + be automatically inferred so that the matrix size + will not go beyond (10000, 10000) in shape. That + can be changed by modifying the DEFAULT_MAX_MATRIX_SHAPE + value. Default is 1, i.e. no binning is performed + dtype : type, optional + The type of data being loaded. Default is numpy.float64 + Returns + ------- + sparse_mat : scipy.sparse.coo_matrix + The output (sparse) matrix in COOrdinate format. + Examples + -------- + >>> S = load_sparse_matrix('test_data/mat_5kb.tsv', binning=1) + >>> S.data[:10] + array([84., 2., 3., 2., 1., 1., 50., 1., 66., 1.]) + >>> S.shape + (16, 16) + """ + try: + cols_arr = np.loadtxt(mat_path, delimiter="\t", dtype=dtype) + shape = (int(cols_arr[0, 0]), int(cols_arr[0, 1])) + except ValueError: + cols_arr = np.loadtxt( + mat_path, delimiter="\t", dtype=dtype, skiprows=1 + ) + shape = None + + # Get values into an array without the header. Use the header to give size. + sparse_mat = _cols_to_sparse(cols_arr[1:, :], shape=shape, dtype=dtype) + + if binning == "auto": + num_bins = max(sparse_mat.shape) + 1 + subsampling_factor = num_bins // DEFAULT_MAX_MATRIX_SHAPE + else: + subsampling_factor = binning + sparse_mat = hcs.bin_sparse( + sparse_mat, subsampling_factor=subsampling_factor + ) + return sparse_mat + + +def save_sparse_matrix(s_mat, path): + """Save a sparse matrix + Saves a sparse matrix object into tsv format. + Parameters + ---------- + s_mat : scipy.sparse.coo_matrix + The sparse matrix to save on disk + path : str + File path where the matrix will be stored + """ + if s_mat.format != "coo": + ValueError("Sparse matrix must be in coo format") + dtype = s_mat.dtype + fmt = "%i" if dtype == int else "%.10e" + sparse_arr = np.vstack([s_mat.row, s_mat.col, s_mat.data]).T + + np.savetxt( + path, + sparse_arr, + header="{nrows}\t{ncols}\t{nonzero}".format( + nrows=s_mat.shape[0], ncols=s_mat.shape[1], nonzero=s_mat.nnz + ), + comments="", + fmt=fmt, + delimiter="\t", + ) + + +def load_pos_col(path, colnum, header=1, dtype=np.int64): + """ + Loads a single column of a TSV file with header into a numpy array. + + Parameters + ---------- + path : str + The path of the TSV file to load. + colnum : int + The 0-based index of the column to load. + header : int + Number of line to skip. By default the header is a single line. + Returns + ------- + numpy.array : + A 1D numpy array with the + Examples + -------- + >>> load_pos_col('test_data/mat_5kb.tsv', 0)[:10] + array([0, 0, 0, 0, 0, 0, 1, 1, 2, 2]) + """ + pos_arr = np.genfromtxt( + path, + delimiter="\t", + usecols=(colnum,), + skip_header=header, + dtype=dtype, + ) + return pos_arr + + +def generate_temp_dir(path): + """Temporary directory generation + Generates a temporary file with a random name at the input path. + + Parameters + ---------- + path : str + The path at which the temporary directory will be created. + + Returns + ------- + str + The path of the newly created temporary directory. 
+ """ + exist = True + while exist: + # Keep trying random directory names if they already exist + directory = str(hex(getrandbits(32)))[2:] + full_path = join(path, directory) + if not exists(full_path): + exist = False + try: + os.makedirs(full_path) + except PermissionError: + raise PermissionError( + "The temporary directory cannot be created in {}. " + "Make sure you have write permission.".format(path) + ) + return full_path + + +def _check_cooler(fun): + """Decorates function `fun` to check if cooler is available..""" + + @functools.wraps(fun) + def wrapped(*args, **kwargs): + try: + import cooler + + fun.__globals__["cooler"] = cooler + except ImportError: + logger.error( + "The cooler package is required to use {0}, please install it first".format( + fun.__name__ + ) + ) + raise ImportError("The cooler package is required.") + return fun(*args, **kwargs) + + return wrapped + + +@_check_cooler +def add_cool_column( + clr, column, column_name, table_name="bins", metadata={}, dtype=None +): + """ + Adds a new column to a loaded Cooler store. If the column exists, + it is replaced. This will affect the .cool file. + + Parameters + ---------- + clr : Cooler object + A Cooler store. + column : pandas Series + The column to add to the cooler. + column_name : str + The name of the column to add. + table_name : str + The name of the table to which the column should be added. + Defaults to the "bins" table. + metadata : dict + A dictionary of metadata to associate with the new column. + """ + with clr.open("r+") as c: + if column_name in c[table_name]: + del c[table_name][column_name] + h5opts = dict(compression="gzip", compression_opts=6) + c[table_name].create_dataset( + column_name, data=column, dtype=dtype, **h5opts + ) + c[table_name][column_name].attrs.update(metadata) + + +@_check_cooler +def load_cool(cool): + """ + Reads a cool file into memory and parses it into graal style tables. + + Parameters + ---------- + cool : str + Path to the input .cool file. + Returns + ------- + mat : scipy coo_matrix + Hi-C contact map in COO format. + frags : pandas DataFrame + Table off bins matching the matrix. Corresponds to the content of the fragments_list.txt file. + chroms : pandas DataFrame + Table of chromosome informations. + """ + c = cooler.Cooler(cool) # pylint: disable=undefined-variable + frags = c.bins()[:] + chroms = c.chroms()[:] + mat = c.pixels()[:] + frags.rename( + columns={"start": "start_pos", "end": "end_pos"}, inplace=True + ) + frags["id"] = frags.groupby("chrom", sort=False).cumcount() + 1 + # Try loading hicstuff-specific columns + try: + frags = frags[ + ["id", "chrom", "start_pos", "end_pos", "size", "gc_content"] + ] + # If absent, only load standard columns + except KeyError: + frags = frags[["id", "chrom", "start_pos", "end_pos"]] + + chroms["cumul_length"] = ( + chroms.length.shift(1).fillna(0).cumsum().astype(int) + ) + n_frags = c.bins()[:].groupby("chrom", sort=False).count().start + chroms["n_frags"] = chroms.merge( + n_frags, right_index=True, left_on="name", how="left" + ).start + chroms.rename(columns={"name": "contig"}, inplace=True) + n = int(max(np.amax(mat.bin1_id), np.amax(mat.bin2_id))) + 1 + shape = (n, n) + mat = coo_matrix((mat["count"], (mat.bin1_id, mat.bin2_id)), shape=shape) + + return mat, frags, chroms + + +@_check_cooler +def save_cool(cool_out, mat, frags, metadata={}): + """ + Writes a .cool file from graal style tables. + + Parameters + ---------- + cool_out : str + Path to the output cool file. 
+ mat : scipy coo_matrix + The Hi-C contact matrix in sparse COO format. + frags : pandas DataFrame + The graal style 'fragments_list' table. + metadata : dict + Potential metadata to associate with the cool file. + """ + up_tri = False + # Check if symmetric matrix is symmetric + # (i.e. only upper triangle or full mat) + if (abs(mat - mat.T) > 1e-10).nnz != 0: + up_tri = True + # Drop useless column + try: + bins = frags.drop("id", axis=1) + except KeyError: + bins = frags + # Get column names right + bins.rename( + columns={"seq": "chrom", "start_pos": "start", "end_pos": "end"}, + inplace=True, + ) + mat_dict = {"bin1_id": mat.row, "bin2_id": mat.col, "count": mat.data} + pixels = pd.DataFrame(mat_dict) + cooler.create_cooler( # pylint: disable=undefined-variable + cool_out, + bins, + pixels, + metadata=metadata, + symmetric_upper=up_tri, + triucheck=False, + ) + + +def read_compressed(filename, mode='r'): + """Read compressed file + Opens the file in read mode with appropriate decompression algorithm. + Parameters + ---------- + filename : str + The path to the input file + Returns + ------- + file-like object + The handle to access the input file's content + """ + + # Standard header bytes for diff compression formats + comp_bytes = { + b"\x1f\x8b\x08": "gz", + b"\x42\x5a\x68": "bz2", + b"\x50\x4b\x03\x04": "zip", + } + + max_len = max(len(x) for x in comp_bytes) + + def file_type(filename): + """Guess file type + Compare header bytes with those in the file and return type. + """ + with open(filename, "rb") as f: + file_start = f.read(max_len) + for magic, filetype in comp_bytes.items(): + if file_start.startswith(magic): + return filetype + return "uncompressed" + + # Open file with appropriate function + mode_map = {'r': 'rt', 'rb': 'rb'} + comp = file_type(filename) + if comp == "gz": + return gzip.open(filename, mode_map[mode]) + elif comp == "bz2": + return bz2.open(filename, mode_map[mode]) + elif comp == "zip": + zip_arch = zipfile.ZipFile(filename, mode) + if len(zip_arch.namelist()) > 1: + raise IOError( + "Only a single fastq file must be in the zip archive." + ) + else: + # ZipFile opens as bytes by default, using io to read as text + zip_content = zip_arch.open(zip_arch.namelist()[0], mode) + return io.TextIOWrapper(zip_content, encoding="utf-8") + else: + return open(filename, mode) + + +def is_compressed(filename): + """Check compression status + Check if the input file is compressed from the first bytes. + Parameters + ---------- + filename : str + The path to the input file + Returns + ------- + bool + True if the file is compressed, False otherwise. + """ + + # Standard header bytes for diff compression formats + comp_bytes = { + b"\x1f\x8b\x08": "gz", + b"\x42\x5a\x68": "bz2", + b"\x50\x4b\x03\x04": "zip", + } + max_len = max(len(x) for x in comp_bytes) + with open(filename, "rb") as f: + file_start = f.read(max_len) + for magic, _ in comp_bytes.items(): + if file_start.startswith(magic): + return True + return False + + +def from_dade_matrix(filename, header=False): + """Load a DADE matrix + Load a numpy array from a DADE matrix file, optionally + returning bin information from the header. Header data + processing is delegated downstream. + Parameters + ---------- + filename : str, file or pathlib.Path + The name of the file containing the DADE matrix. + header : bool + Whether to return as well information contained + in the header. Default is False. 
+ Example + ------- + >>> import numpy as np + >>> import tempfile + >>> lines = [['RST', 'chr1~0', 'chr1~10', 'chr2~0', 'chr2~30'], + ... ['chr1~0', '5'], + ... ['chr1~10', '8', '3'], + ... ['chr2~0', '3', '5', '5'], + ... ['chr2~30', '5', '10', '11', '2'] + ... ] + >>> formatted = ["\\t".join(l) + "\\n" for l in lines ] + >>> dade = tempfile.NamedTemporaryFile(mode='w') + >>> for fm in formatted: + ... dade.write(fm) + 34 + 9 + 12 + 13 + 18 + >>> dade.flush() + >>> M, h = from_dade_matrix(dade.name, header=True) + >>> dade.close() + >>> print(M) + [[ 5. 8. 3. 5.] + [ 8. 3. 5. 10.] + [ 3. 5. 5. 11.] + [ 5. 10. 11. 2.]] + >>> print(h) + ['chr1~0', 'chr1~10', 'chr2~0', 'chr2~30'] + See https://github.com/scovit/DADE for more details about Dade. + """ + + A = pd.read_csv(filename, sep="\t", header=None) + A.fillna("0", inplace=True) + M, headers = np.array(A.iloc[1:, 1:], dtype=np.float64), A.iloc[0, :] + matrix = M + M.T - np.diag(np.diag(M)) + if header: + return matrix, headers.tolist()[1:] + else: + return matrix + + +def to_dade_matrix(M, annotations="", filename=None): + """Returns a Dade matrix from input numpy matrix. Any annotations are added + as header. If filename is provided and valid, said matrix is also saved + as text. + """ + + n, m = M.shape + A = np.zeros((n + 1, m + 1)) + A[1:, 1:] = M + if not annotations: + annotations = np.array(["" for _ in n], dtype=str) + A[0, :] = annotations + A[:, 0] = annotations.T + if filename: + try: + np.savetxt(filename, A, fmt="%i") + logger.info( + "I saved input matrix in dade format as {0}".format( + str(filename) + ) + ) + except ValueError as e: + logger.warning("I couldn't save input matrix.") + logger.warning(str(e)) + + return A + + +def load_into_redis(filename): + """Load a file into redis + Load a matrix file and sotre it in memory with redis. + Useful to pass around huge datasets from scripts to + scripts and load them only once. + Inspired from https://gist.github.com/alexland/ce02d6ae5c8b63413843 + Parameters + ---------- + filename : str, file or pathlib.Path + The file of the matrix to load. + Returns + ------- + key : str + The key of the dataset needed to retrieve it from redis. + """ + try: + from redis import StrictRedis as redis + import time + except ImportError: + print( + "Error! Redis does not appear to be installed in your system.", + file=sys.stderr, + ) + exit(1) + + M = np.genfromtxt(filename, dtype=None) + array_dtype = str(M.dtype) + m, n = M.shape + M = M.ravel().tostring() + database = redis(host="localhost", port=6379, db=0) + key = "{0}|{1}#{2}#{3}".format(int(time.time()), array_dtype, m, n) + + database.set(key, M) + + return key + + +def load_from_redis(key): + """Retrieve a dataset from redis + Retrieve a cached dataset that was stored in redis + with the input key. + Parameters + ---------- + key : str + The key of the dataset that was stored in redis. + Returns + ------- + M : numpy.ndarray + The retrieved dataset in array format. + """ + + try: + from redis import StrictRedis as redis + except ImportError: + print( + "Error! Redis does not appear to be installed in your system.", + file=sys.stderr, + ) + exit(1) + + database = redis(host="localhost", port=6379, db=0) + + try: + M = database.get(key) + except KeyError: + print( + "Error! 
No dataset was found with the supplied key.", + file=sys.stderr, + ) + exit(1) + + array_dtype, n, m = key.split("|")[1].split("#") + + M = np.fromstring(M, dtype=array_dtype).reshape(int(n), int(m)) + return M + + +def dade_to_graal( + filename, + output_matrix=DEFAULT_SPARSE_MATRIX_FILE_NAME, + output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME, + output_frags=DEFAULT_SPARSE_MATRIX_FILE_NAME, + output_dir=None, +): + """Convert a matrix from DADE format (https://github.com/scovit/dade) + to a graal-compatible format. Since DADE matrices contain both fragment + and contact information all files are generated at the same time. + """ + + with open(output_matrix, "w") as sparse_file: + sparse_file.write("id_frag_a\tid_frag_b\tn_contact") + with open(filename) as file_handle: + first_line = file_handle.readline() + for row_index, line in enumerate(file_handle): + dense_row = np.array(line.split("\t")[1:], dtype=np.int32) + for col_index in np.nonzero(dense_row)[0]: + line_to_write = "{}\t{}\t{}\n".format( + row_index, col_index, dense_row[col_index] + ) + sparse_file.write(line_to_write) + + header = first_line.split("\t") + bin_type = header[0] + if bin_type == '"RST"': + logger.info("I detected fragment-wise binning") + elif bin_type == '"BIN"': + logger.info("I detected fixed size binning") + else: + logger.warning( + ( + "Sorry, I don't understand this matrix's " + "binning: I read {}".format(str(bin_type)) + ) + ) + + header_data = [ + header_elt.replace("'", "") + .replace('"', "") + .replace("\n", "") + .split("~") + for header_elt in header[1:] + ] + + ( + global_frag_ids, + contig_names, + local_frag_ids, + frag_starts, + frag_ends, + ) = np.array(list(zip(*header_data))) + + frag_starts = frag_starts.astype(np.int32) - 1 + frag_ends = frag_ends.astype(np.int32) - 1 + frag_lengths = frag_ends - frag_starts + + total_length = len(global_frag_ids) + + with open(output_contigs, "w") as info_contigs: + + info_contigs.write("contig\tlength\tn_frags\tcumul_length\n") + + cumul_length = 0 + + for contig in collections.OrderedDict.fromkeys(contig_names): + + length_tig = np.sum(frag_lengths[contig_names == contig]) + n_frags = collections.Counter(contig_names)[contig] + line_to_write = "%s\t%s\t%s\t%s\n" % ( + contig, + length_tig, + n_frags, + cumul_length, + ) + info_contigs.write(line_to_write) + cumul_length += n_frags + + with open(output_frags, "w") as fragments_list: + + fragments_list.write( + "id\tchrom\tstart_pos\tend_pos" "\tsize\tgc_content\n" + ) + bogus_gc = 0.5 + + for i in range(total_length): + line_to_write = "%s\t%s\t%s\t%s\t%s\t%s\n" % ( + int(local_frag_ids[i]) + 1, + contig_names[i], + frag_starts[i], + frag_ends[i], + frag_lengths[i], + bogus_gc, + ) + fragments_list.write(line_to_write) + + +def load_bedgraph2d(filename, bin_size=None, fragments_file=None): + """ + Loads matrix and fragment information from a 2D bedgraph file. Note this + function assumes chromosomes are ordered in alphabetical. order + + Parameters + ---------- + filename : str + Path to the bedgraph2D file. + bin_size : int + The size of bins in the case of fixed bin size. + fragments_file : str + Path to a fragments file to explicitely provide fragments positions. + If the matrix does not have fixed bin size, this prevents errors. + + Returns + ------- + mat : scipy.sparse.coo_matrix + The Hi-C contact map as the upper triangle of a symetric matrix, in + sparse format. + frags : pandas.DataFrame + The list of fragments/bin present in the matrix with their genomic + positions. 
+ """ + bed2d = pd.read_csv(filename, sep="\t", header=None) + chrom_sizes = {} + if bin_size is not None: + # If bin size if provided, retrieve chromosome lengths, this will be + # used when regenerating bin coordinates + chroms_left = bed2d[[3, 5]] + chroms_left.columns = [0, 2] + chroms = ( + pd.concat([bed2d[[0, 2]], chroms_left]) + .groupby([0], sort=False) + .max() + ) + for chrom, size in zip(chroms.index, np.array(chroms)): + chrom_sizes[chrom] = size[0] + elif fragments_file is None: + logger.warning( + "Please be aware that not all information can be restored from a " + "bg2 file without fixed bin size; fragments without any contact " + "will be lost" + ) + # Get all possible fragment chrom-positions into an array + frag_pos = np.vstack( + [np.array(bed2d[[0, 1, 2]]), np.array(bed2d[[3, 4, 5]])] + ) + # Sort by position (least important, col 1) + frag_pos = frag_pos[frag_pos[:, 1].argsort(kind="mergesort")] + # Then by chrom (most important, col 0) + frag_pos = frag_pos[frag_pos[:, 0].argsort(kind="mergesort")] + # Get unique names for fragments (chrom+pos) + ordered_frag_pos = ( + pd.DataFrame(frag_pos).drop_duplicates().reset_index(drop=True) + ) + frag_pos_a = bed2d[[0, 1]].apply(lambda x: tuple(x), axis=1) + frag_pos_b = bed2d[[3, 4]].apply(lambda x: tuple(x), axis=1) + # If fragments file is provided, use fragments positions to indices mapping + if fragments_file is not None: + frags = pd.read_csv(fragments_file, delimiter="\t") + frag_map = frags.apply(lambda x: (str(x.chrom), x.start_pos), axis=1) + frag_map = {f_name: f_idx for f_idx, f_name in enumerate(frag_map)} + # If fixed fragment size available, use it to reconstruct original + # fragments ID (even if they are absent from the bedgraph file). + elif bin_size is not None: + frag_map = {} + chrom_frags = [] + for chrom, size in chrom_sizes.items(): + prev_frags = len(frag_map) + for bin_id, bin_pos in enumerate(range(0, size, bin_size)): + frag_map[(chrom, bin_pos)] = bin_id + prev_frags + n_bins = size // bin_size + chrom_frags.append( + pd.DataFrame( + { + "id": range(1, n_bins + 1), + "chrom": np.repeat(chrom, n_bins), + "start_pos": range(0, size, bin_size), + "end_pos": range(bin_size, size + bin_size, bin_size), + } + ) + ) + frags = pd.concat(chrom_frags, axis=0).reset_index(drop=True) + frags.insert( + loc=3, column="size", value=frags.end_pos - frags.start_pos + ) + # If None available, guess fragments indices from bedgraph (potentially wrong) + else: + frag_map = { + (v[0], v[1]): i + for i, v in ordered_frag_pos.iloc[:, [0, 1]].iterrows() + } + frags = ordered_frag_pos.copy() + frags[3] = frags.iloc[:, 2] - frags.iloc[:, 1] + frags.insert(loc=0, column="id", value=0) + frags.id = frags.groupby([0], sort=False).cumcount() + 1 + frags.columns = ["id", "chrom", "start_pos", "end_pos", "size"] + # Match bin indices to their names + frag_id_a = np.array(list(map(lambda x: frag_map[x], frag_pos_a))) + frag_id_b = np.array(list(map(lambda x: frag_map[x], frag_pos_b))) + contacts = np.array(bed2d.iloc[:, 6].tolist()) + # Use index to build matrix + n_frags = len(frag_map.keys()) + mat = coo_matrix( + (contacts, (frag_id_a, frag_id_b)), shape=(n_frags, n_frags) + ) + + # Get size of each chromosome in basepairs + chromsizes = frags.groupby("chrom", sort=False).apply( + lambda x: np.int64(max(x.end_pos)) + ) + chrom_bins = frags.groupby("chrom", sort=False).size() + # Shift chromsizes by one to get starting bin, first one is zero + # Make chromsize cumulative to get start bin of each chrom + # Get chroms into a 1D 
array of bin starts + chrom_start = chrom_bins.shift(1, fill_value=0).cumsum() + chroms = pd.DataFrame( + { + "contig": chromsizes.index, + "length": chromsizes.values, + "n_frags": chrom_bins, + "cumul_length": chrom_start, + } + ) + return mat, frags, chroms + + +def flexible_hic_loader( + mat, fragments_file=None, chroms_file=None, quiet=False +): + """ + Wrapper function to load COO, bg2 or cool input and return the same output. + COO formats requires fragments_file and chroms_file options. bg2 format can + infer bin_size if fixed. When providing a bg2 matrix with uneven fragments + length, one should provide fragments_file as well or empty bins will be + truncated from the output. + + Parameters + ---------- + mat : str + Path to the matrix in graal, bedgraph2 or cool format. + fragments_file : str or None + Path to the file with fragments information (fragments_list.txt). + Only required if the matrix is in graal format. + chroms_file : str or None + Path to the file with chromosome information (info_contigs.txt). Only required + if the matrix is in graal format. + quiet : bool + If True, will silence warnings for empty outputs. + Returns + ------- + mat : scipy.sparse.coo_matrix + Sparse upper triangle Hi-C matrix. + frags : pandas.DataFrame or None + Table of fragment informations. None if information was not provided. + chroms : pandas.DataFrame or None + Table of chromosomes/contig information. None if information was not provided. + """ + hic_format = get_hic_format(mat) + # Load cool based on file extension + if hic_format == "cool": + mat, frags, chroms = load_cool(mat) + # Use the first line to determine COO / bg2 format + if hic_format == "bg2": + # Use the frags file to define bins if available + if fragments_file is not None: + mat, frags, chroms = load_bedgraph2d( + mat, fragments_file=fragments_file + ) + else: + # Guess if bin size is fixed based on MAD + bg2 = pd.read_csv(mat, sep="\t") + sizes = np.array(bg2.iloc[:, 2] - bg2.iloc[:, 1]) + size_mad = ss.median_abs_deviation(sizes, scale='normal') + # Use only the bg2 + if size_mad > 0: + mat, frags, chroms = load_bedgraph2d(mat) + logger.warning( + "Input is a bedgraph2d file with uneven bin size, " + "but no fragments_file was provided. Empty bins will " + "be missing from the output. To avoid this, provide a " + "fragments file." + ) + # Use fixed bin size + else: + mat, frags, chroms = load_bedgraph2d( + mat, bin_size=int(np.median(sizes)) + ) + + elif hic_format == "graal": + mat = load_sparse_matrix(mat) + try: + frags = pd.read_csv(fragments_file, sep="\t") + except ValueError: + if not quiet: + logger.warning( + "fragments_file was not provided when " + "loading a matrix in COO/graal format. frags will be None." + ) + frags = None + try: + chroms = pd.read_csv(chroms_file, sep="\t") + except ValueError: + if not quiet: + logger.warning( + "chroms_file was not provided when " + "loading a matrix in COO/graal format. chroms will be None." + ) + + chroms = None + + # Ensure the matrix is upper triangle symmetric + if mat.shape[0] == mat.shape[1]: + if (abs(mat - mat.T) > 1e-10).nnz > 0: + mat = mat + tril(mat, k=-1).T + mat = triu(mat, format="coo") + + return mat, frags, chroms + + +def get_hic_format(mat): + """Returns the format of the input Hi-C matrix + + Parameters + ---------- + mat : str + Path to the input Hi-C matrix + Returns + ------- + str : + Hi-C format string. 
One of graal, bg2, cool + """ + if ( + mat.endswith(".cool") + or mat.count(".mcool::/") == 1 + or mat.count(".cool::/") == 1 + ): + hic_format = "cool" + else: + # Use the first line to determine COO / bg2 format + ncols = len(open(mat).readline().split("\t")) + if ncols == 7: + hic_format = "bg2" + elif ncols == 3: + hic_format = "graal" + else: + raise ValueError("Unkown file format") + return hic_format + + +def flexible_hic_saver( + mat, out_prefix, frags=None, chroms=None, hic_fmt="graal", quiet=False, +): + """ + Saves objects to the desired Hi-C file format. + Parameters + ---------- + mat : scipy.sparse.coo_matrix + Hi-C contact map. + out_prefix : str + Output path without extension (the extension is added based on hic_fmt). + frags : pandas.DataFrame or None + Table of fragments informations. + chroms : pandas.DataFrame or None + Table of chromosomes / contigs informations. + hic_fmt : str + Output format. Can be one of graal for graal-compatible COO format, bg2 for + 2D bedgraph format, or cool for cooler compatible format. + """ + if hic_fmt == "graal": + save_sparse_matrix(mat, out_prefix + ".mat.tsv") + try: + frags.to_csv(out_prefix + ".frags.tsv", sep="\t", index=False) + except AttributeError: + if not quiet: + logger.warning( + "Could not create fragments_list.txt from input files" + ) + try: + chroms.to_csv(out_prefix + ".chr.tsv", sep="\t", index=False) + except AttributeError: + if not quiet: + logger.warning( + "Could not create info_contigs.txt from input files" + ) + elif hic_fmt == "cool": + frag_sizes = frags.end_pos - frags.start_pos + size_mad = np.median(frag_sizes - np.median(frag_sizes)) + bin_type = "variable" if size_mad else "fixed" + try: + save_cool( + out_prefix + ".cool", + mat, + frags, + metadata={"hicstuff": __version__, "bin-type": bin_type}, + ) + except NameError: + NameError("frags is required to save a cool file") + elif hic_fmt == "bg2": + try: + save_bedgraph2d(mat, frags, out_prefix + ".bg2") + except NameError: + NameError("frags is required to save a bg2 file") + else: + raise ValueError("Unknown output format: {0}".format(hic_fmt)) + + +def save_bedgraph2d(mat, frags, out_path): + """ + Given a sparse matrix and a corresponding list of fragments, save a file + in 2D bedgraph format. + Parameters + ---------- + mat : scipy.sparse.coo_matrix + The sparse contact map. + frags : pandas.DataFrame + A structure containing the annotations for each matrix bin. Should + correspond to the content of the fragments_list.txt file. + """ + + mat_df = pd.DataFrame( + {"row": mat.row, "col": mat.col, "data": mat.data.astype(int)} + ) + # Merge fragments with matrix based on row indices to annotate rows + merge_mat = mat_df.merge( + frags, left_on="row", right_index=True, how="left", suffixes=("", "") + ) + # Rename annotations to assign to frag1 + merge_mat.rename( + columns={"chrom": "chr1", "start_pos": "start1", "end_pos": "end1"}, + inplace=True, + ) + # Do the same operation for cols (frag2) + merge_mat = merge_mat.merge( + frags, + left_on="col", + right_index=True, + how="left", + suffixes=("_0", "_2"), + ) + merge_mat.rename( + columns={"chrom": "chr2", "start_pos": "start2", "end_pos": "end2"}, + inplace=True, + ) + # Select only relevant columns in correct order + bg2 = merge_mat.loc[ + :, ["chr1", "start1", "end1", "chr2", "start2", "end2", "data"] + ] + bg2.to_csv(out_path, header=None, index=False, sep="\t") + + +def get_pos_cols(df): + """Get column names representing chromosome, start and end column + from a dataframe. 
Allows flexible names. + Parameters + ---------- + df : pd.DataFrame + Returns + ------- + tuple of str: + Tuple containing the names of the chromosome, start and end + columns in the input dataframe. + Examples + -------- + >>> import pandas as pd + >>> d = [1, 2, 3] + >>> df = pd.DataFrame( + ... {'Chromosome': d, 'Start': d, 'End': d, 'species': d} + ... ) + >>> get_pos_cols(df) + ('Chromosome', 'Start', 'End') + >>> df = pd.DataFrame( + ... {'id': d, 'chr': d, 'start_bp': d, 'end_bp': d} + ... ) + >>> get_pos_cols(df) + ('chr', 'start_bp', 'end_bp') + """ + + # Get case insensitive column names + cnames = df.columns + inames = cnames.str.lower() + + def _col_getter(cols, pat): + """Helper to get column index from the start of its name""" + mask = cols.str.startswith(pat) + idx = np.flatnonzero(mask)[0] + return idx + + chrom_col = cnames[_col_getter(inames, "chr")] + start_col = cnames[_col_getter(inames, "start")] + end_col = cnames[_col_getter(inames, "end")] + return chrom_col, start_col, end_col + + +def gc_bins(genome_path, frags): + """Generate GC content annotation for bins using input genome. + Parameters + ---------- + genome_path : str + Path the the genome file in FASTA format. + frags : pandas.DataFrame + Table containing bin segmentation of the genome. + Required columns: chrom, start, end. + Returns + ------- + numpy.ndarray of floats: + GC content per bin, in the range [0.0, 1.0]. + """ + # Grab columns by name (catch start, Start, star_pos, etc) + chrom_col, start_col, end_col = get_pos_cols(frags) + # Fill the gc array by chromosome + gc_bins = np.zeros(frags.shape[0], dtype=float) + for rec in SeqIO.parse(genome_path, "fasta"): + mask = frags[chrom_col] == rec.id + # Define coordinates of each bin + starts = frags.loc[mask, start_col] + ends = frags.loc[mask, end_col] + # Slice chromosome sequence based on bins + seqs = [str(rec.seq)[s:e] for s, e in zip(starts, ends)] + # Fill GC values for bins in the chromosome + idx = np.flatnonzero(mask) + gc_bins[idx] = np.array(list(map(SeqUtils.GC, seqs))) / 100.0 + + return gc_bins + + +def sort_pairs(in_file, out_file, keys, tmp_dir=None, threads=1, buffer="2G"): + """ + Sort a pairs file in batches using UNIX sort. + Parameters + ---------- + in_file : str + Path to the unsorted input file + out_file : str + Path to the sorted output file. + keys : list of str + list of columns to use as sort keys. Each column can be one of readID, + chr1, pos1, chr2, pos2, frag1, frag2. Key priorities are according to + the order in the list. + tmp_dir : str + Path to the directory where temporary files will be created. Defaults + to current directory. + threads : int + Number of parallel sorting threads. + buffer : str + Buffer size used for sorting. Consists of a number and a unit. + """ + # TODO: Write a pure python implementation to drop GNU coreutils depencency, + # could be inspired from: https://stackoverflow.com/q/14465154/8440675 + + # Check if UNIX sort version supports parallelism + parallel_ok = True + sort_ver = sp.Popen(["sort", "--version"], stdout=sp.PIPE) + sort_ver = ( + sort_ver.communicate()[0] + .decode() + .split("\n")[0] + .split(" ")[-1] + .split(".") + ) + # If so, specify threads, otherwise don't mention it in the command line + try: + sort_ver = list(map(int, sort_ver)) + if sort_ver[0] < 8 or (sort_ver[0] == 8 and sort_ver[1] < 23): + logger.warning( + "GNU sort version is {0} but >8.23 is required for parallel " + "sort. 
Sorting on a single thread.".format( + ".".join(map(str, sort_ver)) + ) + ) + parallel_ok = False + # BSD sort has a different format and will throw error upon parsing. It does + # not support parallel processes anyway. + except ValueError: + logger.warning( + "Using BSD sort instead of GNU sort, sorting on a single thread." + ) + parallel_ok = False + + key_map = { + "readID": "-k1,1d", + "chr1": "-k2,2V", + "pos1": "-k3,3n", + "chr2": "-k4,4V", + "pos2": "-k5,5n", + "strand1": "-k6,6d", + "strand2": "-k7,7d", + "frag1": "-k8,8n", + "frag2": "-k9,9n", + } + + # transform column names to corresponding sort keys + try: + sort_keys = map(lambda k: key_map[k], keys) + except KeyError: + print("Unkown column name.") + raise + # Rewrite header with new sorting order + header = get_pairs_header(in_file) + with open(out_file, "w") as output: + for line in header: + if line.startswith("#sorted"): + output.write("#sorted: {0}\n".format("-".join(keys))) + else: + output.write(line + "\n") + + # Sort pairs and append to file. + with open(out_file, "a") as output: + grep_proc = sp.Popen(["grep", "-v", "^#", in_file], stdout=sp.PIPE) + sort_cmd = ["sort", "-S %s" % buffer] + list(sort_keys) + if tmp_dir is not None: + sort_cmd.append("--temporary-directory={0}".format(tmp_dir)) + if parallel_ok: + sort_cmd.append("--parallel={0}".format(threads)) + sort_proc = sp.Popen(sort_cmd, stdin=grep_proc.stdout, stdout=output) + sort_proc.communicate() + + +def get_pairs_header(pairs): + r"""Retrieves the header of a .pairs file and stores lines into a list. + Parameters + ---------- + pairs : str or file object + Path to the pairs file. + Returns + ------- + header : list of str + A list of header lines found, in the same order they appear in pairs. + Examples + -------- + >>> import os + >>> from tempfile import NamedTemporaryFile + >>> p = NamedTemporaryFile('w', delete=False) + >>> p.writelines(["## pairs format v1.0\n", "#sorted: chr1-chr2\n", "abcd\n"]) + >>> p.close() + >>> h = get_pairs_header(p.name) + >>> for line in h: + ... print([line]) + ['## pairs format v1.0'] + ['#sorted: chr1-chr2'] + >>> os.unlink(p.name) + """ + # Open file if needed + with open(pairs, "r") as pairs: + # Store header lines into a list + header = [] + line = pairs.readline() + while line.startswith("#"): + header.append(line.rstrip()) + line = pairs.readline() + + return header + + +def reorder_fasta(genome, output=None, threshold=100000): + """Reorder and trim a fasta file + Sort a fasta file by record lengths, optionally trimming the smallest ones. + Parameters + ---------- + genome : str, file or pathlib.Path + The genome scaffold file (or file handle) + output : str, file or pathlib.Path + The output file to write to + threshold : int, optional + The size below which scaffolds are discarded, by default 100000 + """ + + if output is None: + output = "{}_renamed.fa".format(genome.split(".")[0]) + + handle = SeqIO.parse(genome, "fasta") + handle_to_write = sorted( + (len(u) for u in handle if len(u) > threshold), reverse=True + ) + SeqIO.write(handle_to_write, output, "fasta") + + +def rename_genome(genome, output=None, ambiguous=True): + """Rename genome and slugify headers + Rename genomes according to a simple naming scheme; this + is mainly done to avoid special character weirdness. + Parameters + ---------- + genome : file, str or pathlib.Path + The input genome to be renamed and slugify. + output : file, str or pathlib.Path + The output genome to be written into. 
Default is <base>_renamed.fa, + where <base> is genome_in without its extension. + ambiguous : bool + Whether to retain ambiguous non-N bases, otherwise rename them to Ns. + Default is True. + """ + + if output is None: + output = "{}_renamed.fa".format(genome.split(".")[0]) + + with open(output, "w") as output_handle: + for record in SeqIO.parse(genome, "fasta"): + + # Replace hyphens, tabs and whitespace with underscores + new_record_id = record.id.replace(" ", "_") + new_record_id = new_record_id.replace("-", "_") + new_record_id = new_record_id.replace("\t", "_") + + # Remove anything that's weird, i.e. not alphanumeric + # or an underscore + new_record_id = re.sub("[^_A-Za-z0-9]+", "", new_record_id) + header = ">{}\n".format(new_record_id) + + new_seq = re.sub("[^ATGCatgcNn]", "N", str(record.seq)) + + output_handle.write(header) + output_handle.write("{}\n".format(new_seq)) + + +def check_fasta_index(ref, mode="bowtie2"): + """ + Checks for the existence of a bowtie2 or bwa index based on the reference + file name. + Parameters + ---------- + ref : str + Path to the reference genome. + mode : str + The alignment software used to build the index. bowtie2 or bwa. If any + other value is given, the function returns the reference path. + Returns + ------- + index : str + The bowtie2 or bwa index basename. None if no index was found + """ + ref = pathlib.Path(ref) + if mode == "bowtie2": + # Bowtie2 should have 6 index files + bt2_idx_files = list(ref.parent.glob("{}*bt2*".format(ref.name))) + index = None if len(bt2_idx_files) < 6 else bt2_idx_files + elif mode == "bwa": + refdir = str(ref.parent) + refdir_files = os.listdir(refdir) + bwa_idx_files = [ + join(refdir, f) + for f in refdir_files + if re.search(r".*\.(sa|pac|bwt|ann|amb)$", f) + ] + index = None if len(bwa_idx_files) < 5 else bwa_idx_files + else: + index = [ref] + if index is not None: + # Convert the PosixPath objects to strings and get the longest common prefix to obtain + # index basename (without the dot) + index = os.path.commonprefix(list(map(str, index))).strip(".") + return index + + +def check_is_fasta(in_file): + """ + Checks whether input file is in fasta format. + + Parameters + ---------- + in_file : str + Path to the input file. + Returns + ------- + bool : + True if the input is in fasta format, False otherwise + """ + try: + with read_compressed(in_file) as handle: + fasta = any(SeqIO.parse(handle, "fasta")) + except FileNotFoundError: + fasta = False + + return fasta diff --git a/bin/hicstuff_log.py b/bin/hicstuff_log.py new file mode 100644 index 0000000..171a566 --- /dev/null +++ b/bin/hicstuff_log.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +"""Basic logging +Basic logging setup with three main handlers (stdout, log file and optionally +texting with the right API). By default the log file is disabled, but can be +enabled or changed using set_file_handler. 
+""" + + +import logging +import logging.handlers +import requests +import json + +TEXT_CREDENTIALS_DEFAULT_PATH = "text_credentials.json" + +CURRENT_LOG_LEVEL = logging.INFO + + +logging.captureWarnings(True) + +logger = logging.getLogger() +logger.setLevel(CURRENT_LOG_LEVEL) + +logfile_formatter = logging.Formatter( + "%(asctime)s :: %(levelname)s :: %(message)s", datefmt="%Y-%m-%d,%H:%M:%S" +) +stdout_formatter = logging.Formatter("%(levelname)s :: %(message)s") +text_formatter = logging.Formatter("%(message)s") + +# file_handler = logging.FileHandler("hicstuff.log", "a") + +# file_handler.setLevel(logging.INFO) +# file_handler.setFormatter(logfile_formatter) +# logger.addHandler(file_handler) + +stream_handler = logging.StreamHandler() +stream_handler.setLevel(logging.DEBUG) +stream_handler.setFormatter(stdout_formatter) +logger.addHandler(stream_handler) + + +def set_file_handler(log_path, formatter=logfile_formatter): + """Change the file handler for custom log file location""" + + filehandler = logging.FileHandler(log_path, "a") + filehandler.setLevel(logging.INFO) + filehandler.setFormatter(formatter) + for hdlr in logger.handlers[:]: # remove the existing file handlers + if isinstance(hdlr, logging.FileHandler): + logger.removeHandler(hdlr) + logger.addHandler(filehandler) # set the new handler + + +def setup_text_logging(credentials=TEXT_CREDENTIALS_DEFAULT_PATH): + """Setup text logging + Setup text notifications on errors with a basic API. In order for it to + work, a file named 'text_credentials.json' must be in the directory where + a command is run. The logger performs a GET request to a text notification + API. + Parameters + ---------- + credentials : str or pathlib.Path + A JSON file with necessary credentials for the GET request + """ + + with open(credentials) as cred_handle: + cred_dict = json.load(cred_handle) + + api_service = cred_dict.pop("api_service") + + class TextHandler(logging.Handler): + def emit(self, record): + log_entry = self.format(record) + cred_dict["msg"] = log_entry + return requests.get(api_service, cred_dict) + + sms_handler = TextHandler() + sms_handler.setLevel(logging.ERROR) + sms_handler.setFormatter(text_formatter) + + logger.addHandler(sms_handler) + + +try: + setup_text_logging(TEXT_CREDENTIALS_DEFAULT_PATH) +except FileNotFoundError: + pass diff --git a/conf/hicstuff.config b/conf/hicstuff.config index 6e20d06..f26ae40 100644 --- a/conf/hicstuff.config +++ b/conf/hicstuff.config @@ -9,6 +9,37 @@ process { withName: 'BOWTIE2_ALIGNMENT' { ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}" } } + + withName: 'FRAGMENT_ENZYME' { + publishDir = [ + path: { "${params.outdir}/hicstuff/fragment_enzyme" }, + mode: 'copy' + ] + } + + //********************************************** + // PREPARE_GENOME + withName: 'BOWTIE2_BUILD' { + publishDir = [ + path: { "${params.outdir}/genome/bowtie2" }, + mode: 'copy' + ] + } + + withName: 'CUSTOM_GETCHROMSIZES' { + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: 'copy' + ] + } + + withName: 'GET_RESTRICTION_FRAGMENTS' { + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: 'copy' + ] + } + } params { diff --git a/modules/local/hicstuff/fragment_enzyme.nf b/modules/local/hicstuff/fragment_enzyme.nf new file mode 100644 index 0000000..dafa3cf --- /dev/null +++ b/modules/local/hicstuff/fragment_enzyme.nf @@ -0,0 +1,21 @@ +process FRAGMENT_ENZYME { + tag "$digestion" + label "process_single" + + conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 
conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3"
+    container = "lbmc/hicstuff:3.1.3"
+
+    input:
+    val(digestion)
+    tuple val(meta), path(fasta)
+
+    output:
+    tuple val(meta), path("info_contigs.txt") , emit: info_contigs
+    tuple val(meta), path("fragments_list.txt"), emit: fragments_list
+
+    script:
+    """
+    hicstuff_fragments.py -e ${digestion} -i ${fasta}
+    """
+
+}
diff --git a/subworkflows/local/hicstuff_sub.nf b/subworkflows/local/hicstuff_sub.nf
index dfb160b..99516b3 100644
--- a/subworkflows/local/hicstuff_sub.nf
+++ b/subworkflows/local/hicstuff_sub.nf
@@ -1,5 +1,6 @@
 
 include { BOWTIE2_ALIGNMENT } from '../../modules/local/hicstuff/align_bowtie2'
+include { FRAGMENT_ENZYME } from '../../modules/local/hicstuff/fragment_enzyme'
 
 // Paired-end to Single-end
 def pairToSingle(row, mates) {
@@ -27,6 +28,8 @@ workflow HICSTUFF_SUB {
     reads
     index
     ligation_site
+    digestion
+    fasta
 
     main:
 
@@ -39,4 +42,13 @@
         ch_reads,
         index.collect()
     )
+
+    FRAGMENT_ENZYME(
+        digestion,
+        fasta
+    )
+
+    emit:
+    info_contigs = FRAGMENT_ENZYME.out.info_contigs
+    fragments_list = FRAGMENT_ENZYME.out.fragments_list
 }
diff --git a/workflows/hicstuff.nf b/workflows/hicstuff.nf
index 0a760cf..4a69b68 100644
--- a/workflows/hicstuff.nf
+++ b/workflows/hicstuff.nf
@@ -6,9 +6,9 @@ if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input sample
 // Digestion parameters
 if (params.digestion){
     restriction_site = params.digestion ? params.digest[ params.digestion ].restriction_site ?: false : false
-    ch_restriction_site = Channel.value(restriction_site)
+    ch_restriction_site = Channel.value(restriction_site) // restriction site string, with '^' marking the cut position
     ligation_site = params.digestion ? params.digest[ params.digestion ].ligation_site ?: false : false
-    ch_ligation_site = Channel.value(ligation_site)
+    ch_ligation_site = Channel.value(ligation_site) // ligation site sequence string
 }else if (params.dnase){
     ch_restriction_site = Channel.empty()
     ch_ligation_site = Channel.empty()
@@ -37,6 +37,8 @@ workflow HICSTUFF {
     HICSTUFF_SUB(
         INPUT_CHECK.out.reads,
         PREPARE_GENOME.out.index,
-        ch_ligation_site
+        ch_ligation_site,
+        params.digestion, // enzyme name string
+        ch_fasta
     )
 }
-- 
GitLab
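
Note on the params.digest[ params.digestion ] lookup in workflows/hicstuff.nf above: the digest map itself is not part of this patch, so the snippet below is only a hypothetical sketch of what such entries could look like (the enzyme names and site strings follow the usual nf-core/hic conventions; the actual keys and values in this pipeline's config may differ):

    // Hypothetical params.digest entries consumed by workflows/hicstuff.nf;
    // '^' marks the enzyme cut position in the restriction site.
    params {
        digest {
            'dpnii' {
                restriction_site = '^GATC'
                ligation_site    = 'GATCGATC'
            }
            'hindiii' {
                restriction_site = 'A^AGCTT'
                ligation_site    = 'AAGCTAGCTT'
            }
        }
    }

With such an entry, running the pipeline with --digestion dpnii makes FRAGMENT_ENZYME call hicstuff_fragments.py -e dpnii -i <fasta>, which maps the name to DpnII before digesting the genome into info_contigs.txt and fragments_list.txt.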