diff --git a/bin/__init__.py b/bin/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..813219d380e222f851c6ca9a74da54c87290acff
--- /dev/null
+++ b/bin/__init__.py
@@ -0,0 +1 @@
+from .bin import *
diff --git a/bin/hicstuff_bam2pairs.py b/bin/hicstuff_bam2pairs.py
new file mode 100755
index 0000000000000000000000000000000000000000..6e3cfeb38e2ed81be2f71a14d7da87a08f4f19cd
--- /dev/null
+++ b/bin/hicstuff_bam2pairs.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+
+import os, time, csv, sys, re
+import argparse
+import pysam as ps
+import pandas as pd
+import itertools
+from hicstuff_log import logger
+import hicstuff_io as hio
+import hicstuff_digest as hcd
+from Bio import SeqIO
+
+
+
+def bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual=30):
+    """
+    Make a .pairs file from two Hi-C bam files sorted by read names.
+    The Hi-C mates are matched by read identifier. Pairs where at least one
+    read maps with MAPQ below the min_qual threshold are discarded. Pairs are
+    sorted by readID and stored in upper-triangle orientation (the mate with
+    the lower genomic coordinate comes first).
+    Parameters
+    ----------
+    bam1 : str
+        Path to the name-sorted BAM file with aligned Hi-C forward reads.
+    bam2 : str
+        Path to the name-sorted BAM file with aligned Hi-C reverse reads.
+    out_pairs : str
+        Path to the output tab-separated .pairs file with columns
+        readID, chr1, pos1, chr2, pos2, strand1, strand2.
+    info_contigs : str
+        Path to the info contigs file, to get info on chromosome sizes and order.
+    min_qual : int
+        Minimum mapping quality required to keep a Hi-C pair.
+    """
+    forward = ps.AlignmentFile(bam1, "rb")
+    reverse = ps.AlignmentFile(bam2, "rb")
+
+    # Generate header lines
+    format_version = "## pairs format v1.0\n"
+    sorting = "#sorted: readID\n"
+    cols = "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n"
+    # Chromosome order will be identical in info_contigs and pair files
+    chroms = pd.read_csv(info_contigs, sep="\t").apply(
+        lambda x: "#chromsize: %s %d\n" % (x.contig, x.length), axis=1
+    )
+    with open(out_pairs, "w") as pairs:
+        pairs.writelines([format_version, sorting, cols] + chroms.tolist())
+        pairs_writer = csv.writer(pairs, delimiter="\t")
+        n_reads = {"total": 0, "mapped": 0}
+        # Remember if some read IDs were missing from either file
+        unmatched_reads = 0
+        # Remember if all reads in one bam file have been read
+        exhausted = [False, False]
+        # Iterate on both BAMs simultaneously
+        end_regex = re.compile(r'/[12]$')
+        for end1, end2 in itertools.zip_longest(forward, reverse):
+            # Remove end-specific suffix if any (guard against an exhausted file)
+            if end1 is not None:
+                end1.query_name = re.sub(end_regex, '', end1.query_name)
+            if end2 is not None:
+                end2.query_name = re.sub(end_regex, '', end2.query_name)
+            # Both files still have reads
+            # Check if reads pass filter
+            try:
+                end1_passed = end1.mapping_quality >= min_qual
+            # Happens if end1 bam file has been exhausted
+            except AttributeError:
+                exhausted[0] = True
+                end1_passed = False
+            try:
+                end2_passed = end2.mapping_quality >= min_qual
+            # Happens if end2 bam file has been exhausted
+            except AttributeError:
+                exhausted[1] = True
+                end2_passed = False
+            # Skip reads whose mate is missing until the names match or reads
+            # have been exhausted
+            while sum(exhausted) == 0 and end1.query_name != end2.query_name:
+                # Get next read and check filters again
+                # Count single-read iteration
+                unmatched_reads += 1
+                n_reads["total"] += 1
+                if end1.query_name < end2.query_name:
+                    try:
+                        end1 = next(forward)
+                        end1_passed = end1.mapping_quality >= min_qual
+                    # If EOF is reached in BAM 1
+                    except (StopIteration, AttributeError):
+                        exhausted[0] = True
+                        end1_passed = False
+                    n_reads["mapped"] += end1_passed
+                elif end1.query_name > end2.query_name:
+                    try:
+                        end2 = next(reverse)
+                        end2_passed = end2.mapping_quality >= min_qual
+                    # If EOF is reached in BAM 2
+                    except (StopIteration, AttributeError):
+                        exhausted[1] = True
+                        end2_passed = False
+                    n_reads["mapped"] += end2_passed
+
+            # 2 reads processed per iteration, unless one file is exhausted
+            n_reads["total"] += 2 - sum(exhausted)
+            n_reads["mapped"] += sum([end1_passed, end2_passed])
+            # Keep only pairs where both reads have good quality
+            if end1_passed and end2_passed:
+
+                # Flipping to get upper triangle
+                if (
+                    end1.reference_id == end2.reference_id
+                    and end1.reference_start > end2.reference_start
+                ) or end1.reference_id > end2.reference_id:
+                    end1, end2 = end2, end1
+                pairs_writer.writerow(
+                    [
+                        end1.query_name,
+                        end1.reference_name,
+                        end1.reference_start + 1,
+                        end2.reference_name,
+                        end2.reference_start + 1,
+                        "-" if end1.is_reverse else "+",
+                        "-" if end2.is_reverse else "+",
+                    ]
+                )
+    pairs.close()
+    if unmatched_reads > 0:
+        logger.warning(
+            "%d reads were only present in one BAM file. Make sure you sorted reads by name before running the pipeline.",
+            unmatched_reads,
+        )
+    logger.info(
+        "{perc_map}% reads (single ends) mapped with Q >= {qual} ({mapped}/{total})".format(
+            total=n_reads["total"],
+            mapped=n_reads["mapped"],
+            perc_map=round(100 * n_reads["mapped"] / n_reads["total"]),
+            qual=min_qual,
+        )
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-b1", "--bam1")
+    parser.add_argument("-b2", "--bam2")
+    parser.add_argument("-o", "--out_pairs")
+    parser.add_argument("-x", "--out_idx")
+    parser.add_argument("-i", "--info_contigs")
+    parser.add_argument("-q", "--min_qual")
+    parser.add_argument("-e", "--enzyme")
+    parser.add_argument("-f", "--fasta")
+    parser.add_argument("-c", "--circular")
+    args = parser.parse_args()
+
+    bam1 = args.bam1
+    bam2 = args.bam2
+    out_pairs = args.out_pairs
+    out_idx = args.out_idx
+    info_contigs = args.info_contigs
+    min_qual = int(args.min_qual)
+    enzyme = args.enzyme
+    fasta = args.fasta
+    circular = args.circular
+
+    # hicstuff enzyme names are case-sensitive: restore the expected capitalization
+    if enzyme == "hindiii":
+        enzyme = "HindIII"
+    elif enzyme == "dpnii":
+        enzyme = "DpnII"
+    elif enzyme == "bglii":
+        enzyme = "BglII"
+    elif enzyme == "mboi":
+        enzyme = "MboI"
+    elif enzyme == "arima":
+        enzyme = "Arima"
+
+    bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual)
+
+    restrict_table = {}
+    for record in SeqIO.parse(hio.read_compressed(fasta), "fasta"):
+        # Get chromosome restriction table
+        restrict_table[record.id] = hcd.get_restriction_table(
+            record.seq, enzyme, circular=circular
+        )
+
+    hcd.attribute_fragments(out_pairs, out_idx, restrict_table)
+
+    hio.sort_pairs(
+        out_idx,
+        out_idx + ".sorted",
+        keys=["chr1", "pos1", "chr2", "pos2"],
+        threads=1,
+        tmp_dir=None,
+    )
+    os.rename(out_idx + ".sorted", out_idx)
diff --git a/bin/hicstuff_build_matrix.py b/bin/hicstuff_build_matrix.py
new file mode 100755
index 0000000000000000000000000000000000000000..bd68ee0f5c6a7b839ee4537240019e0147cbb080
--- /dev/null
+++ b/bin/hicstuff_build_matrix.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+
+import os, time, csv, sys, re
+import argparse
+import pysam as ps
+import pandas as pd
+import shutil as st
+import subprocess as sp
+import itertools
+from hicstuff_log import logger
+import hicstuff_io as hio
+import hicstuff_digest as hcd
+from Bio import SeqIO
+
+
+def pairs2matrix(
+    pairs_file,
+    mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None
+):
+    """Generate the matrix by counting the number of occurrences of each
+    combination of restriction fragments in a pairs file.
+    Parameters
+    ----------
+    pairs_file : str
+        Path to a Hi-C pairs file, with frag1 and frag2 columns added.
+    mat_file : str
+        Path where the matrix will be written.
+    fragments_file : str
+        Path to the fragments_list.txt file. Used to know the total matrix
+        size in case fragments at the end of the genome have no observed contacts.
+    mat_fmt : str
+        The format to use when writing the matrix. Can be "graal" or "bg2".
+    threads : int
+        Number of threads to use in parallel.
+    tmp_dir : str
+        Temporary directory for sorting files. If None given, will use the system default.
+    """
+    # Number of fragments is N lines in frag list - 1 for the header
+    n_frags = sum(1 for line in open(fragments_file, "r")) - 1
+    frags = pd.read_csv(fragments_file, delimiter="\t")
+
+    def write_mat_entry(frag1, frag2, contacts):
+        """Write a single sparse matrix entry in either graal or bg2 format"""
+        if mat_fmt == "graal":
+            mat.write("\t".join(map(str, [frag1, frag2, contacts])) + "\n")
+        elif mat_fmt == "bg2":
+            frag1, frag2 = int(frag1), int(frag2)
+            mat.write(
+                "\t".join(
+                    map(
+                        str,
+                        [
+                            frags.chrom[frag1],
+                            frags.start_pos[frag1],
+                            frags.end_pos[frag1],
+                            frags.chrom[frag2],
+                            frags.start_pos[frag2],
+                            frags.end_pos[frag2],
+                            contacts,
+                        ],
+                    )
+                )
+                + "\n"
+            )
+
+    pre_mat_file = mat_file + ".pre.pairs"
+    hio.sort_pairs(
+        pairs_file,
+        pre_mat_file,
+        keys=["frag1", "frag2"],
+        threads=threads,
+        tmp_dir=tmp_dir,
+    )
+    header_size = len(hio.get_pairs_header(pre_mat_file))
+    with open(pre_mat_file, "r") as pairs, open(mat_file, "w") as mat:
+        # Skip header lines
+        for _ in range(header_size):
+            next(pairs)
+        prev_pair = ["0", "0"]  # Pairs identified by [frag1, frag2]
+        n_occ = 0  # Number of occurrences of each frag combination
+        n_nonzero = 0  # Total number of nonzero matrix entries
+        n_pairs = 0  # Total number of pairs entered in the matrix
+        pairs_reader = csv.reader(pairs, delimiter="\t")
+        # First line contains nrows, ncols and number of nonzero entries.
+        # Number of nonzero entries is unknown for now
+        if mat_fmt == "graal":
+            mat.write("\t".join(map(str, [n_frags, n_frags, "-"])) + "\n")
+        for pair in pairs_reader:
+            # Fragment ids are fields 8 and 9
+            curr_pair = [pair[7], pair[8]]
+            # Increment number of occurrences for fragment pair
+            if prev_pair == curr_pair:
+                n_occ += 1
+            # Write previous pair and start a new one
+            else:
+                if n_occ > 0:
+                    write_mat_entry(prev_pair[0], prev_pair[1], n_occ)
+                prev_pair = curr_pair
+                n_pairs += n_occ
+                n_occ = 1
+                n_nonzero += 1
+        # Write the last value
+        write_mat_entry(curr_pair[0], curr_pair[1], n_occ)
+        n_nonzero += 1
+        n_pairs += 1
+
+    # Edit header line to fill in the number of nonzero entries in the graal header
+    if mat_fmt == "graal":
+        with open(mat_file) as mat, open(pre_mat_file, "w") as tmp_mat:
+            header = mat.readline()
+            header = header.replace("-", str(n_nonzero))
+            tmp_mat.write(header)
+            st.copyfileobj(mat, tmp_mat)
+        # Replace the matrix file with the one with corrected header
+        os.rename(pre_mat_file, mat_file)
+    else:
+        os.remove(pre_mat_file)
+
+    logger.info(
+        "%d pairs used to build a contact map of %d bins with %d nonzero entries.",
+        n_pairs,
+        n_frags,
+        n_nonzero,
+    )
+
+
+def pairs2cool(pairs_file, cool_file, bins_file):
+    """
+    Make a cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more information.
+ + Parameters + ---------- + pairs_file : str + Path to the pairs file containing input contact data. + cool_file : str + Path to the output cool file name to generate. + bins_file : str + Path to the file containing genomic segmentation information. (fragments_list.txt). + """ + + # Make bins file compatible with cooler cload + bins_tmp = bins_file + ".cooler" + bins = pd.read_csv(bins_file, sep="\t", usecols=[1, 2, 3], skiprows=1, header=None) + bins.to_csv(bins_tmp, sep="\t", header=False, index=False) + + cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5 {bins} {pairs} {cool}" + cool_args = {"bins": bins_tmp, "pairs": pairs_file, "cool": cool_file} + sp.call(cooler_cmd.format(**cool_args), shell=True) + os.remove(bins_tmp) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-p", "--pairs") + parser.add_argument("-f", "--fragments") + parser.add_argument("-t", "--type") + parser.add_argument("-o", "--output") + args = parser.parse_args() + + pairs = args.pairs + fragments_list = args.fragments + mat_format = args.type + mat_out = args.output + + if mat_format == "cool": + # Name matrix file in .cool + cool_file = os.path.splitext(mat_out)[0] + ".cool" + pairs2cool(pairs, cool_file, fragments_list) + else: + pairs2matrix( + pairs, + mat_out, + fragments_list, + mat_fmt=mat_format, + threads=1, + tmp_dir=None, + ) diff --git a/bin/hicstuff_digest.py b/bin/hicstuff_digest.py new file mode 100644 index 0000000000000000000000000000000000000000..3fba3da20ddfd4d500c00cc5b01141b8f355f410 --- /dev/null +++ b/bin/hicstuff_digest.py @@ -0,0 +1,489 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +"""Genome digestion +Functions used to write auxiliary instagraal compatible +sparse matrices. +""" + +from Bio import SeqIO, SeqUtils +from Bio.Seq import Seq +from Bio.Restriction import RestrictionBatch, Analysis +import os, sys, csv +import re +import collections +import copy +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from hicstuff_log import logger +import hicstuff_io as hio + +DEFAULT_FRAGMENTS_LIST_FILE_NAME = "fragments_list.txt" +DEFAULT_INFO_CONTIGS_FILE_NAME = "info_contigs.txt" +DEFAULT_SPARSE_MATRIX_FILE_NAME = "abs_fragments_contacts_weighted.txt" +DEFAULT_KB_BINNING = 1 +DEFAULT_THRESHOLD_SIZE = 0 +# Most used enzyme for eukaryotes +DEFAULT_ENZYME = "DpnII" +# If using evenly-sized chunks instead of restriction +# enzymes, they shouldn't be too short +DEFAULT_MIN_CHUNK_SIZE = 50 + + +def write_frag_info( + fasta, + enzyme, + min_size=DEFAULT_THRESHOLD_SIZE, + circular=False, + output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME, + output_frags=DEFAULT_FRAGMENTS_LIST_FILE_NAME, + output_dir=None, +): + """Digest and write fragment information + Write the fragments_list.txt and info_contigs.txt that are necessary for + instagraal to run. + Parameters + ---------- + fasta : pathlib.Path or str + The path to the reference genome + enzyme : str, int or list of str + If a string, must be the name of an enzyme (e.g. DpnII) and the genome + will be cut at the enzyme's restriction sites. If a number, the genome + will be cut uniformly into chunks with length equal to that number. A + list of enzymes can also be specified if using multiple enzymes. + min_size : float, optional + Size below which shorter contigs are discarded. Default is 0, i.e. all + contigs are retained. + circular : bool, optional + Whether the genome is circular. Default is False. + output_contigs : str, optional + The name of the file with contig info. 
Default is info_contigs.txt + output_frags : str, optional + The name of the file with fragment info. Default is fragments_list.txt + output_dir : [type], optional + The path to the output directory, which will be created if not already + existing. Default is the current directory. + """ + + records = SeqIO.parse(hio.read_compressed(fasta), "fasta") + + try: + info_contigs_path = os.path.join(output_dir, output_contigs) + frag_list_path = os.path.join(output_dir, output_frags) + except TypeError: + info_contigs_path = output_contigs + frag_list_path = output_frags + + with open(info_contigs_path, "w") as info_contigs: + + info_contigs.write("contig\tlength\tn_frags\tcumul_length\n") + + with open(frag_list_path, "w") as fragments_list: + + fragments_list.write( + "id\tchrom\tstart_pos" "\tend_pos\tsize\tgc_content\n" + ) + + total_frags = 0 + + for record in records: + contig_seq = record.seq + contig_name = record.id + contig_length = len(contig_seq) + if contig_length < int(min_size): + continue + + sites = get_restriction_table( + contig_seq, enzyme, circular=circular + ) + fragments = ( + contig_seq[sites[i] : sites[i + 1]] + for i in range(len(sites) - 1) + ) + n_frags = 0 + + current_id = 1 + start_pos = 0 + for frag in fragments: + frag_length = len(frag) + if frag_length > 0: + end_pos = start_pos + frag_length + gc_content = SeqUtils.GC(frag) / 100.0 + + current_fragment_line = "%s\t%s\t%s\t%s\t%s\t%s\n" % ( + current_id, + contig_name, + start_pos, + end_pos, + frag_length, + gc_content, + ) + + fragments_list.write(current_fragment_line) + + try: + assert (current_id == 1 and start_pos == 0) or ( + current_id > 1 and start_pos > 0 + ) + except AssertionError: + logger.error((current_id, start_pos)) + raise + start_pos = end_pos + current_id += 1 + n_frags += 1 + + current_contig_line = "%s\t%s\t%s\t%s\n" % ( + contig_name, + contig_length, + n_frags, + total_frags, + ) + total_frags += n_frags + info_contigs.write(current_contig_line) + + +def attribute_fragments(pairs_file, idx_pairs_file, restriction_table): + """ + Writes the indexed pairs file, which has two more columns than the input + pairs file corresponding to the restriction fragment index of each read. + Note that pairs files have 1bp point positions whereas restriction table + has 0bp point poisitions. + Parameters + ---------- + pairs_file: str + Path the the input pairs file. Consists of 7 tab-separated + columns: readID, chr1, pos1, chr2, pos2, strand1, strand2 + idx_pairs_file: str + Path to the output indexed pairs file. Consists of 9 white space + separated columns: readID, chr1, pos1, chr2, pos2, strand1, strand2, + frag1, frag2. frag1 and frag2 are 0-based restriction fragments based + on whole genome. + restriction_table: dict + Dictionary with chromosome identifiers (str) as keys and list of + positions (int) of restriction sites as values. + """ + + # NOTE: Bottlenecks here are 1. binary search in find_frag and 2. writerow + # 1. could be reduced by searching groups of N frags in parallel and 2. by + # writing N frags simultaneously using a single call of writerows. 
+ + # Parse and update header section + pairs_header = hio.get_pairs_header(pairs_file) + header_size = len(pairs_header) + chrom_order = [] + with open(idx_pairs_file, "w") as idx_pairs: + for line in pairs_header: + # Add new column names to header + if line.startswith("#columns"): + line = line.rstrip() + " frag1 frag2" + if line.startswith("#chromsize"): + chrom_order.append(line.split()[1]) + idx_pairs.write(line + "\n") + + # Get number of fragments per chrom to allow genome-based indices + shift_frags = {} + prev_frags = 0 + for rank, chrom in enumerate(chrom_order): + if rank > 0: + # Note the "-1" because there are nfrags + 1 sites in rest table + prev_frags += len(restriction_table[chrom_order[rank - 1]]) - 1 + # Idx of each chrom's frags will be shifted by n frags in previous chroms + shift_frags[chrom] = prev_frags + + missing_contigs = set() + # Attribute pairs to fragments and append them to output file (after header) + with open(pairs_file, "r") as pairs, open( + idx_pairs_file, "a" + ) as idx_pairs: + # Skip header lines + for _ in range(header_size): + next(pairs) + + # Define input and output fields + pairs_cols = [ + "readID", + "chr1", + "pos1", + "chr2", + "pos2", + "strand1", + "strand2", + ] + idx_cols = pairs_cols + ["frag1", "frag2"] + + # Use csv reader / writer to automatically parse columns into a dict + pairs_reader = csv.DictReader( + pairs, fieldnames=pairs_cols, delimiter="\t" + ) + pairs_writer = csv.DictWriter( + idx_pairs, fieldnames=idx_cols, delimiter="\t" + ) + + for pair in pairs_reader: + # Get the 0-based indices of corresponding restriction fragments + # Deducing 1 from pair position to get it into 0bp point + pair["frag1"] = find_frag( + int(pair["pos1"]) - 1, restriction_table[pair["chr1"]] + ) + pair["frag2"] = find_frag( + int(pair["pos2"]) - 1, restriction_table[pair["chr2"]] + ) + # Shift fragment indices to make them genome-based instead of + # chromosome-based + try: + pair["frag1"] += shift_frags[pair["chr1"]] + except KeyError: + missing_contigs.add(pair["chr1"]) + try: + pair["frag2"] += shift_frags[pair["chr2"]] + except KeyError: + missing_contigs.add(pair["chr2"]) + + # Write indexed pairs in the new file + pairs_writer.writerow(pair) + + if missing_contigs: + logger.warning( + "Pairs on the following contigs were discarded as " + "those contigs are not listed in the paris file header. " + "This is normal if you filtered out small contigs: %s" + % " ".join(list(missing_contigs)) + ) + + +def get_restriction_table(seq, enzyme, circular=False): + """ + Get the restriction table for a single genomic sequence. + Parameters + ---------- + seq : Seq object + A biopython Seq object representing a chromosomes or contig. + enzyme : int, str or list of str + The name of the restriction enzyme used, or a list of restriction + enzyme names. Can also be an integer, to digest by fixed chunk size. + circular : bool + Wether the genome is circular. + Returns + ------- + numpy.array: + List of restriction fragment boundary positions for the input sequence. + >>> from Bio.Seq import Seq + >>> get_restriction_table(Seq("AAGCCGGATCGG"),"HpaII") + array([ 0, 4, 12]) + >>> get_restriction_table(Seq("AA"),["HpaII", "MluCI"]) + array([0, 2]) + >>> get_restriction_table(Seq("AA"),"aeiou1") + Traceback (most recent call last): + ... + ValueError: aeiou1 is not a valid restriction enzyme. + >>> get_restriction_table("AA","HpaII") + Traceback (most recent call last): + ... 
+ TypeError: Expected Seq or MutableSeq instance, got <class 'str'> instead + """ + chrom_len = len(seq) + wrong_enzyme = "{} is not a valid restriction enzyme.".format(enzyme) + # Restriction batch containing the restriction enzyme + try: + enz = [enzyme] if isinstance(enzyme, str) else enzyme + cutter = RestrictionBatch(enz) + except (TypeError, ValueError): + try: + cutter = max(int(enzyme), DEFAULT_MIN_CHUNK_SIZE) + except ValueError: + raise ValueError(wrong_enzyme) + + # Conversion from string type to restriction type + if isinstance(cutter, int): + sites = [i for i in range(0, chrom_len, cutter)] + if sites[-1] < chrom_len: + sites.append(chrom_len) + else: + # Find sites of all restriction enzymes given + ana = Analysis(cutter, seq, linear=not circular) + sites = ana.full() + # Gets all sites into a single flat list with 0-based index + sites = [site - 1 for enz in sites.values() for site in enz] + # Sort by position and allow first add start and end of seq + sites.sort() + sites.insert(0, 0) + sites.append(chrom_len) + + return np.array(sites) + + +def find_frag(pos, r_sites): + """ + Use binary search to find the index of a chromosome restriction fragment + corresponding to an input genomic position. + Parameters + ---------- + pos : int + Genomic position, in base pairs. + r_sites : list + List of genomic positions corresponding to restriction sites. + Returns + ------- + int + The 0-based index of the restriction fragment to which the position belongs. + >>> find_frag(15, [0, 20, 30]) + 0 + >>> find_frag(15, [10, 20, 30]) + Traceback (most recent call last): + ... + ValueError: The first position in the restriction table is not 0. + >>> find_frag(31, [0, 20, 30]) + Traceback (most recent call last): + ... + ValueError: Read position is larger than last entry in restriction table. + """ + if r_sites[0] != 0: + raise ValueError( + "The first position in the restriction table is not 0." + ) + if pos > r_sites[-1]: + raise ValueError( + "Read position is larger than last entry in restriction table." + ) + # binary search for the index of the read + index = max(np.searchsorted(r_sites, pos, side="right") - 1, 0) + # Last site = end of the chrom, index of last fragment is last site - 1 + index = min(len(r_sites) - 2, index) + + return index + + +def frag_len( + frags_file_name=DEFAULT_FRAGMENTS_LIST_FILE_NAME, + output_dir=None, + plot=False, + fig_path=None, +): + """ + logs summary statistics of fragment length distribution based on an + input fragment file. Can optionally show a histogram instead + of text summary. + Parameters + ---------- + frags_file_name : str + Path to the output list of fragments. + output_dir : str + Directory where the list should be saved. + plot : bool + Wether a histogram of fragment length should be shown. + fig_path : str + If a path is given, the figure will be saved instead of shown. 
+ """ + + try: + frag_list_path = os.path.join(output_dir, frags_file_name) + except TypeError: + frag_list_path = frags_file_name + frags = pd.read_csv(frag_list_path, sep="\t") + nfrags = frags.shape[0] + med_len = frags["size"].median() + nbins = 40 + if plot: + fig, ax = plt.subplots() + _, _, _ = ax.hist(frags["size"], bins=nbins) + + ax.set_xlabel("Fragment length [bp]") + ax.set_ylabel("Log10 number of fragments") + ax.set_title("Distribution of restriction fragment length") + ax.set_yscale("log", base=10) + ax.annotate( + "Total fragments: {}".format(nfrags), + xy=(0.95, 0.95), + xycoords="axes fraction", + fontsize=12, + horizontalalignment="right", + verticalalignment="top", + ) + ax.annotate( + "Median length: {}".format(med_len), + xy=(0.95, 0.90), + xycoords="axes fraction", + fontsize=12, + horizontalalignment="right", + verticalalignment="top", + ) + # Tweak spacing to prevent clipping of ylabel + fig.tight_layout() + if fig_path: + plt.savefig(fig_path) + else: + plt.show() + plt.clf() + else: + logger.info( + "Genome digested into {0} fragments with a median " + "length of {1}".format(nfrags, med_len) + ) + + +def gen_enzyme_religation_regex(enzyme): + """Return a regex which corresponds to all possible religation sites given a + set of enzyme. + Parameters: + ----------- + enzyme : str + String that contains the names of the enzyme separated by a comma. + Returns: + -------- + re.Pattern : + Regex that corresponds to all possible ligation sites given a set of + enzyme. + Examples: + --------- + >>> gen_enzyme_religation_regex('HpaII') + re.compile('CCGCGG') + >>> gen_enzyme_religation_regex('HpaII,MluCI') + re.compile('AATTAATT|AATTCGG|CCGAATT|CCGCGG') + """ + + # Split the str on the comma to separate the different enzymes. + enzyme = enzyme.split(",") + + # Check on Biopython dictionnary the enzyme. + rb = RestrictionBatch(enzyme) + + # Initiation: + give_list = [] + accept_list = [] + ligation_list = [] + + # Iterates on the enzymes. + for enz in rb: + + # Extract restriction sites and look for cut sites. + site = enz.elucidate() + fw_cut = site.find("^") + rev_cut = site.find("_") + + # Process "give" site. Remove N on the left (useless). + give_site = site[:rev_cut].replace("^", "") + while give_site[0] == "N": + give_site = give_site[1:] + give_list.append(give_site) + + # Process "accept" site. Remove N on the rigth (useless). + accept_site = site[fw_cut + 1 :].replace("_", "") + while accept_site[-1] == "N": + accept_site = accept_site[:-1] + accept_list.append(accept_site) + + # Iterates on the two list to build all the possible HiC ligation sites. + for give_site in give_list: + for accept_site in accept_list: + # Replace "N" by "." for regex searching of the sites + ligation_list.append((give_site + accept_site).replace("N", ".")) + ligation_list.append( + str(Seq(give_site + accept_site).reverse_complement()).replace( + "N", "." + ) + ) + + # Build the regex for any ligation sites. 
+ pattern = "|".join(sorted(list(set(ligation_list)))) + return re.compile(pattern) diff --git a/bin/hicstuff_distance_law.py b/bin/hicstuff_distance_law.py new file mode 100755 index 0000000000000000000000000000000000000000..545c3f7e1e0929d070a850ed239aab90d585f8dc --- /dev/null +++ b/bin/hicstuff_distance_law.py @@ -0,0 +1,839 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np +import sys +import matplotlib.pyplot as plt +import warnings +from scipy import ndimage +from matplotlib import cm +import pandas as pd +import os as os +import csv as csv +import hicstuff_io as hio +from hicstuff_log import logger +import argparse + +def export_distance_law(xs, ps, names, out_file=None): + """ Export the x(s) and p(s) from two list of numpy.ndarrays to a table + in txt file with three columns separated by a tabulation. The first column + contains the x(s), the second the p(s) and the third the name of the arm or + chromosome. The file is created in the directory given by outdir or the + current file if no file is given. + Parameters + ---------- + xs : list of numpy.ndarray + The list of the start position of logbins of each p(s) in base pairs. + ps : list of numpy.ndarray + The list of p(s). + names : list of string + List containing the names of the chromosomes/arms/conditions of the p(s) + values given. + out_file : str or None + Path where output file should be written. ./distance_law.txt by default. + Return + ------ + txt file: + File with three columns separated by a tabulation. The first column + contains the x(s), the second the p(s) and the third the name of the arm + or chromosome. The file is creates in the output file given or the + default one if none given. + """ + # ./distance_law.txt as out_file if no out_file is given. + if out_file is None: + out_file = os.getcwd() + "/distance_law.txt" + # Sanity check: as many chromosomes/arms as ps + if len(xs) != len(names): + logger.error("Number of chromosomes/arms and number of p(s) list differ.") + sys.exit(1) + # Create the file and write it + f = open(out_file, "w") + for i in range(len(xs)): + for j in range(len(xs[i])): + line = ( + str(format(xs[i][j], "g")) + + "\t" + + str(format(ps[i][j], "g")) + + "\t" + + names[i] + + "\n" + ) + f.write(line) + f.close() + + +def import_distance_law(distance_law_file): + """ Import the table created by export_distance_law and return the list of + x(s) and p(s) in the order of the chromosomes. + Parameters + ---------- + distance_law_file : string + Path to the file containing three columns : the x(s), the p(s), and the + chromosome/arm name. + Returns + ------- + list of numpy.ndarray : + The start coordinate of each bin one array per chromosome or arm. + list of numpy.ndarray : + The distance law probabilities corresponding of the bins of the + previous list. + list of numpy.ndarray : + The names of the arms/chromosomes corresponding to the previous + list. 
+ """ + file = pd.read_csv( + distance_law_file, + sep="\t", + header=None, + dtype={"a": np.int32, "b": np.float32, "c": str}, + ) + names_idx = np.unique(file.iloc[:, 2], return_index=True)[1] + names = [file.iloc[:, 2][index] for index in sorted(names_idx)] + xs = [None] * len(names) + ps = [None] * len(names) + labels = [None] * len(names) + for i in range(len(names)): + subfile = file[file.iloc[:, 2] == names[i]] + xs[i] = np.array(subfile.iloc[:, 0]) + ps[i] = np.array(subfile.iloc[:, 1]) + labels[i] = np.array(subfile.iloc[:, 2]) + return xs, ps, labels + + +def get_chr_segment_bins_index(fragments, centro_file=None, rm_centro=0): + """Get the index positions of the start and end bins of different + chromosomes, or arms if the centromers position have been given from the + fragments file made by hicstuff. + + Parameters + ---------- + fragments : pandas.DataFrame + Table containing in the first coulum the ID of the fragment, in the + second the names of the chromosome in the third and fourth the start + position and the end position of the fragment. The file have no header. + (File like the 'fragments_list.txt' from hicstuff) + centro_file : None or str + None or path to a file with the genomic positions of the centromers + sorted as the chromosomes separated by a space. The file have only one + line. + rm_centro : int + If a value is given, will remove the contacts close the centromeres. + It will remove as many kb as the argument given. Default is zero. + + Returns + ------- + list of floats : + The start and end indices of chromosomes/arms to compute the distance + law on each chromosome/arm separately. + """ + # Get bins where chromosomes start + chr_start_bins = np.where(fragments == 0)[0] + # Create a list of same length for the end of the bins + chr_end_bins = np.zeros(len(chr_start_bins)) + # Get bins where chromsomes end + for i in range(len(chr_start_bins) - 1): + chr_end_bins[i] = chr_start_bins[i + 1] + chr_end_bins[-1] = len(fragments.iloc[:, 0]) + # Combine start and end of bins in a single array. Values are the id of the + # bins + chr_segment_bins = np.sort(np.concatenate((chr_start_bins, chr_end_bins))) + if centro_file is not None: + # Read the file of the centromers + with open(centro_file, "r", newline="") as centro: + centro = csv.reader(centro, delimiter=" ") + centro_pos = next(centro) + # Sanity check: as many chroms as centromeres + if len(chr_start_bins) != len(centro_pos): + logger.warning( + "Number of chromosomes and centromeres differ, centromeres position are not taking into account." + ) + centro_file = None + if centro_file is not None: + # Get bins of centromeres + centro_bins = np.zeros(2 * len(centro_pos)) + for i in range(len(chr_start_bins)): + if (i + 1) < len(chr_start_bins): + subfrags = fragments[chr_start_bins[i] : chr_start_bins[i + 1]] + else: + subfrags = fragments[chr_start_bins[i] :] + # index of last fragment starting before centro in same chrom + centro_bins[2 * i] = chr_start_bins[i] + max( + np.where( + subfrags["start_pos"][:] // (int(centro_pos[i]) - rm_centro) == 0 + )[0] + ) + centro_bins[2 * i + 1] = chr_start_bins[i] + max( + np.where( + subfrags["start_pos"][:] // (int(centro_pos[i]) + rm_centro) == 0 + )[0] + ) + # Combine centro and chrom bins into a single array. Values are the id + # of the bins started and ending the arms. 
+ chr_segment_bins = np.sort( + np.concatenate((chr_start_bins, chr_end_bins, centro_bins)) + ) + return list(chr_segment_bins) + + +def get_chr_segment_length(fragments, chr_segment_bins): + """Compute a list of the length of the different objects (arm or + chromosome) given by chr_segment_bins. + + Parameters + ---------- + fragments : pandas.DataFrame + Table containing in the first coulum the ID of the fragment, in the + second the names of the chromosome in the third and fourth the start + position and the end position of the fragment. The file have no header. + (File like the 'fragments_list.txt' from hicstuff) + chr_segment_bins : list of floats + The start and end indices of chromosomes/arms to compute the distance + law on each chromosome/arm separately. + + Returns + ------- + list of numpy.ndarray: + The length in base pairs of each chromosome or arm. + """ + chr_segment_length = [None] * int(len(chr_segment_bins) / 2) + # Iterate in chr_segment_bins in order to obtain the size of each chromosome/arm + for i in range(len(chr_segment_length)): + # Obtain the size of the chromosome/arm, the if loop is to avoid the + # case of arms where the end position of the last fragments doesn't + # mean the size of arm. If it's the right arm we have to start to count the + # size from the beginning of the arm. + if fragments["start_pos"].iloc[int(chr_segment_bins[2 * i])] == 0: + n = fragments["end_pos"].iloc[int(chr_segment_bins[2 * i + 1]) - 1] + else: + n = ( + fragments["end_pos"].iloc[int(chr_segment_bins[2 * i + 1]) - 1] + - fragments["start_pos"].iloc[int(chr_segment_bins[2 * i])] + ) + chr_segment_length[i] = n + return chr_segment_length + + +def logbins_xs(fragments, chr_segment_length, base=1.1, circular=False): + """Compute the logbins of each chromosome/arm in order to have theme to + compute distance law. At the end you will have bins of increasing with a + logspace with the base of the value given in base. + + Parameters + ---------- + fragments : pandas.DataFrame + Table containing in the first coulum the ID of the fragment, in the + second the names of the chromosome in the third and fourth the start + position and the end position of the fragment. The file have no header. + (File like the 'fragments_list.txt' from hicstuff) + chr_segment_length: list of floats + List of the size in base pairs of the different arms or chromosomes. + base : float + Base use to construct the logspace of the bins, 1.1 by default. + circular : bool + If True, calculate the distance as the chromosome is circular. Default + value is False. + + Returns + ------- + list of numpy.ndarray : + The start coordinate of each bin one array per chromosome or arm. + """ + # Create the xs array and a list of the length of the chromosomes/arms + xs = [None] * len(chr_segment_length) + # Iterate in chr_segment_bins in order to make the logspace + for i in range(len(chr_segment_length)): + n = chr_segment_length[i] + # if the chromosome is circular the mawimum distance between two reads + # are divided by two + if circular: + n /= 2 + n_bins = int(np.log(n) / np.log(base)) + # For each chromosome/arm compute a logspace to have the logbin + # equivalent to the size of the arms and increasing size of bins + xs[i] = np.unique( + np.logspace(0, n_bins, num=n_bins + 1, base=base, dtype=int) + ) + return xs + + +def circular_distance_law(distance, chr_segment_length, chr_bin): + """Recalculate the distance to return the distance in a circular chromosome + and not the distance between the two genomic positions. 
+ Parameters + ---------- + chr_segment_bins : list of floats + The start and end indices of chromosomes/arms to compute the distance + law on each chromosome/arm separately. + chr_segment_length: list of floats + List of the size in base pairs of the different arms or chromosomes. + distance : int + Distance between two fragments with a contact. + Returns + ------- + int : + The real distance in the chromosome circular and not the distance + between two genomic positions + Examples + -------- + >>> circular_distance_law(7500, [2800, 9000], 1) + 1500 + >>> circular_distance_law(1300, [2800, 9000], 0) + 1300 + >>> circular_distance_law(1400, [2800, 9000], 0) + 1400 + """ + chr_len = chr_segment_length[chr_bin] + if distance > chr_len / 2: + distance = chr_len - distance + return distance + + +def get_pairs_distance( + line, fragments, chr_segment_bins, chr_segment_length, xs, ps, circular=False +): + """From a line of a pair reads file, filter -/+ or +/- reads, keep only the + reads in the same chromosome/arm and compute the distance of the the two + fragments. It modify the input ps in order to count or not the line given. + It will add one in the logbin corresponding to the distance. + Parameters + ---------- + line : OrderedDict + Line of a pair reads file with the these keys readID, chr1, pos1, chr2, + pos2, strand1, strand2, frag1, frag2. The values are in a dictionnary. + fragments : pandas.DataFrame + Table containing in the first coulum the ID of the fragment, in the + second the names of the chromosome in the third and fourth the start + position and the end position of the fragment. The file have no header. + (File like the 'fragments_list.txt' from hicstuff) + chr_segment_bins : list of floats + The start and end indices of chromosomes/arms to compute the distance + law on each chromosome/arm separately. + chr_segment_length: list of floats + List of the size in base pairs of the different arms or chromosomes. + xs : list of lists + The start coordinate of each bin one array per chromosome or arm. + ps : list of lists + The sum of contact already count. xs and ps should have the same + dimensions. + circular : bool + If True, calculate the distance as the chromosome is circular. Default + value is False. + """ + # Check this is a pairs_idx file and not simple pairs + if line['frag1'] is None: + logger.error( + 'Input pairs file must have frag1 and frag2 columns. In hicstuff ' + 'pipeline, this is the "valid_idx.pairs" file.' + ) + # We only keep the event +/+ or -/-. This is done to avoid to have any + # event of uncut which are not possible in these events. We can remove the + # good events of +/- or -/+ because we don't need a lot of reads to compute + # the distance law and if we eliminate these reads we do not create others + # biases as they should have the same distribution. + if line["strand1"] == line["strand2"]: + # Find in which chromosome/arm are the fragment 1 and 2. + chr_bin1 = ( + np.searchsorted(chr_segment_bins, int(line["frag1"]), side="right") - 1 + ) + chr_bin2 = ( + np.searchsorted(chr_segment_bins, int(line["frag2"]), side="right") - 1 + ) + # We only keep the reads with the two fragments in the same chromosome + # or arm. + if chr_bin1 == chr_bin2: + # Remove the contacts in the centromeres if centro_remove + if chr_bin1 % 2 == 0: + chr_bin1 = int(chr_bin1 / 2) + # For the reads -/-, the fragments should be religated with both + # their start position (position in the left on the genomic + # sequence, 5'). For the reads +/+ it's the contrary. 
We compute + # the distance as the distance between the two extremities which + # are religated. + if line["strand1"] == "-": + distance = abs( + np.array(fragments["start_pos"][int(line["frag1"])]) + - np.array(fragments["start_pos"][int(line["frag2"])]) + ) + if line["strand1"] == "+": + distance = abs( + np.array(fragments["end_pos"][int(line["frag1"])]) + - np.array(fragments["end_pos"][int(line["frag2"])]) + ) + if circular: + distance = circular_distance_law( + distance, chr_segment_length, chr_bin1 + ) + xs_temp = xs[chr_bin1][:] + # Find the logbins in which the distance is and add one to the sum + # of contact. + ps_indice = np.searchsorted(xs_temp, distance, side="right") - 1 + ps[chr_bin1][ps_indice] += 1 + + +def get_names(fragments, chr_segment_bins): + """Make a list of the names of the arms or the chromosomes. + Parameters + ---------- + fragments : pandas.DataFrame + Table containing in the first coulum the ID of the fragment, in the + second the names of the chromosome in the third and fourth the start + position and the end position of the fragment. The file have no header. + (File like the 'fragments_list.txt' from hicstuff) + chr_segment_bins : list of floats + The start position of chromosomes/arms to compute the distance law on + each chromosome/arm separately. + Returns + ------- + list of floats : + List of the labels given to the curves. It will be the name of the arms + or chromosomes. + """ + # Get the name of the chromosomes. + chr_names_idx = np.unique(fragments.iloc[:, 1], return_index=True)[1] + chr_names = [fragments.iloc[:, 1][index] for index in sorted(chr_names_idx)] + # Case where they are separate in chromosomes + if len(chr_segment_bins) / 2 != len(chr_names): + names = [] + for chr in chr_names: + names.append(str(chr) + "_left") + names.append(str(chr) + "_rigth") + chr_names = names + return chr_names + + +def get_distance_law( + pairs_reads_file, + fragments_file, + centro_file=None, + base=1.1, + out_file=None, + circular=False, + rm_centro=0, +): + """Compute distance law as a function of the genomic coordinate aka P(s). + Bin length increases exponentially with distance. Works on pairs file + format from 4D Nucleome Omics Data Standards Working Group. If the genome + is composed of several chromosomes and you want to compute the arms + separately, provide a file with the positions of centromers. Create a file + with three coulumns separated by a tabulation. The first column contains + the xs, the second the ps and the third the name of the arm or chromosome. + The file is create in the directory given in outdir or in the current + directory if no directory given. + Parameters + ---------- + pairs_reads_file : string + Path of a pairs file format from 4D Nucleome Omics Data Standards + Working Group with the 8th and 9th coulumns are the ID of the fragments + of the reads 1 and 2. + fragments_file : path + Path of a table containing in the first column the ID of the fragment, + in the second the names of the chromosome in the third and fourth + the start position and the end position of the fragment. The file have + no header. (File like the 'fragments_list.txt' from hicstuff) + centro_file : None or str + None or path to a file with the genomic positions of the centromers + sorted as the chromosomes separated by a space. The file have only one + line. + base : float + Base use to construct the logspace of the bins - 1.1 by default. + out_file : None or str + Path of the output file. If no path given, the output is returned. 
+ circular : bool + If True, calculate the distance as the chromosome is circular. Default + value is False. Cannot be True if centro_file is not None + rm_centro : int + If a value is given, will remove the contacts close the centromeres. + It will remove as many kb as the argument given. Default is None. + Returns + ------- + xs : list of numpy.ndarray + Basepair coordinates of log bins used to compute distance law. + ps : list of numpy.ndarray + Contacts value, in arbitrary units, at increasingly long genomic ranges + given by xs. + names : list of strings + Names of chromosomes that are plotted + """ + # Sanity check : centro_fileition should be None if chromosomes are + # circulars (no centromeres is circular chromosomes). + if circular and centro_file != None: + logger.error("Chromosomes cannot have a centromere and be circular") + sys.exit(1) + # Import third columns of fragments file + fragments = pd.read_csv(fragments_file, sep="\t", header=0, usecols=[0, 1, 2, 3]) + # Calculate the indice of the bins to separate into chromosomes/arms + chr_segment_bins = get_chr_segment_bins_index(fragments, centro_file, rm_centro) + # Calculate the length of each chromosoms/arms + chr_segment_length = get_chr_segment_length(fragments, chr_segment_bins) + xs = logbins_xs(fragments, chr_segment_length, base, circular) + # Create the list of p(s) with one array for each chromosome/arm and each + # array contain as many values as in the logbin + ps = [None] * len(chr_segment_length) + for i in range(len(xs)): + ps[i] = [0] * len(xs[i]) + # Read the pair reads file + with open(pairs_reads_file, "r", newline="") as reads: + # Remove the line of the header + header_length = len(hio.get_pairs_header(pairs_reads_file)) + for i in range(header_length): + next(reads) + # Reads all the others lines and put the values in a dictionnary with + # the keys : 'readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', + # 'strand2', 'frag1', 'frag2' + reader = csv.DictReader( + reads, + fieldnames=[ + "readID", + "chr1", + "pos1", + "chr2", + "pos2", + "strand1", + "strand2", + "frag1", + "frag2", + ], + delimiter="\t", + ) + for line in reader: + # Iterate in each line of the file after the header + get_pairs_distance( + line, fragments, chr_segment_bins, chr_segment_length, xs, ps, circular + ) + # Divide the number of contacts by the area of the logbin + for i in range(len(xs)): + n = chr_segment_length[i] + for j in range(len(xs[i]) - 1): + # Use the area of a trapezium to know the area of the logbin with n + # the size of the matrix. + ps[i][j] /= ((2 * n - xs[i][j + 1] - xs[i][j]) / 2) * ( + (1 / np.sqrt(2)) * (xs[i][j + 1] - xs[i][j]) + ) + # print( + # ((2 * n - xs[i][j + 1] - xs[i][j]) / 2) + # * ((1 / np.sqrt(2)) * (xs[i][j + 1] - xs[i][j])) + # ) + # Case of the last logbin which is an isosceles rectangle triangle + # print(ps[i][-5:-1], ((n - xs[i][-1]) ** 2) / 2) + ps[i][-1] /= ((n - xs[i][-1]) ** 2) / 2 + names = get_names(fragments, chr_segment_bins) + if out_file: + export_distance_law(xs, ps, names, out_file) + return xs, ps, names + + +def normalize_distance_law(xs, ps, inf=3000, sup=None): + """Normalize the distance in order to have the sum of the ps values between + 'inf' (default value is 3kb) until the end of the array equal to one and + limit the effect of coverage between two conditions/chromosomes/arms when + you compare them together. If we have a list of ps, it will normalize until + the length of the shorter object or the value of sup, whichever is smaller. 
+ Parameters + ---------- + xs : list of numpy.ndarray + list of logbins corresponding to the ps. + ps : list of numpy.ndarray + Average ps or list of ps of the chromosomes/arms. xs and ps have to + have the same shape. + inf : integer + Inferior value of the interval on which, the normalization is applied. + sup : integer + Superior value of the interval on which, the normalization is applied. + Returns + ------- + list of numpy.ndarray : + List of ps each normalized separately. + """ + # Sanity check: xs and ps have the same dimension + if np.shape(xs) != np.shape(ps): + logger.error("xs and ps should have the same dimension.") + sys.exit(1) + # Define the length of shortest chromosomes as a lower bound for the sup boundary + min_xs = len(min(xs, key=len)) + normed_ps = [None] * len(ps) + if sup is None: + sup = np.inf + for chrom_id, chrom_ps in enumerate(ps): + # Iterate on the different ps to normalize each of theme separately + chrom_sum = 0 + # Change the last value to have something continuous because the last + # one is much bigger (computed on matrix corner = triangle instead of trapezoid). + chrom_ps[-1] = chrom_ps[-2] + for bin_id, bin_value in enumerate(chrom_ps): + # Compute normalization factor based on values between inf and sup + # Sup will be whatever is smaller between user-provided sup and length of + # the shortest chromosome + if (xs[chrom_id][bin_id] > inf) and (xs[chrom_id][bin_id] < sup) and (bin_id < min_xs): + chrom_sum += bin_value + if chrom_sum == 0: + chrom_sum += 1 + logger.warning("No values of p(s) in one segment") + # Make the normalisation + normed_ps[chrom_id] = np.array(ps[chrom_id]) / chrom_sum + return normed_ps + + +def average_distance_law(xs, ps, sup, big_arm_only=False): + """Compute the average distance law between the file the different distance + law of the chromosomes/arms. + Parameters + ---------- + xs : list of numpy.ndarray + The list of logbins. + ps : list of lists of floats + The list of numpy.ndarray. + sup : int + Value given to set the minimum size of the chromosomes/arms to make the + average. + big_arm_only : bool + By default False. If True, will only take into account the arms/chromosomes + longer than the value of sup. Sup mandatory if set. + Returns + ------- + numpy.ndarray : + List of the xs with the max length. + numpy.ndarray : + List of the average_ps. + """ + # Find longest chromosome / arm and make two arrays of this length for the + # average distance law and remove the last value. + xs = max(xs, key=len) + max_length = len(xs) + ps_values = np.zeros(max_length) + ps_occur = np.zeros(max_length) + for chrom_ps in ps: + # Iterate on ps in order to calculate the number of occurences (all the + # chromossomes/arms are not as long as the longest one) and the sum of + # the values of distance law. + # Change the last value to have something continuous because the last + # one is much bigger. + chrom_ps[-1] = chrom_ps[-2] + # Sanity check : sup strictly inferior to maw length arms. 
+ if big_arm_only: + if sup >= xs[-1]: + logger.error( + "sup have to be inferior to the max length of arms/chromsomes if big arm only set" + ) + sys.exit(1) + if sup <= xs[len(chrom_ps) - 1]: + ps_occur[: len(chrom_ps)] += 1 + ps_values[: len(chrom_ps)] += chrom_ps + else: + ps_occur[: len(chrom_ps)] += 1 + ps_values[: len(chrom_ps)] += chrom_ps + # Make the mean + averaged_ps = ps_values / ps_occur + return xs, averaged_ps + + +def slope_distance_law(xs, ps): + """Compute the slope of the loglog curve of the ps as the + [log(ps(n+1)) - log(ps(n))] / [log(n+1) - log(n)]. + Compute only list of ps, not list of array. + Parameters + ---------- + xs : list of numpy.ndarray + The list of logbins. + ps : list of numpy.ndarray + The list of ps. + Returns + ------- + list of numpy.ndarray : + The slope of the distance law. It will be shorter of one value than the + ps given initially. + """ + slope = [None] * len(ps) + for i in range(len(ps)): + ps[i][ps[i] == 0] = 10 ** (-9) + # Compute the slope + slope_temp = np.log(np.array(ps[i][1:]) / np.array(ps[i][:-1])) / np.log( + np.array(xs[i][1:]) / np.array(xs[i][:-1]) + ) + # The 1.8 is the intensity of the normalisation, it could be adapted. + slope_temp[slope_temp == np.nan] = 10 ** (-15) + slope[i] = ndimage.filters.gaussian_filter1d(slope_temp, 1.8) + return slope + + +def get_ylim(xs, curve, inf, sup): + """Compute the max and the min of the list of list between the inferior + (inf) and superior (sup) bounds. + Parameters + ---------- + xs : list of numpy.ndarray + The list of the logbins starting position in basepair. + curve : list of numpy.ndarray + A list of numpy.ndarray from which you want to extract minimum and + maximum values in a given interval. + inf : int + Inferior limit of the interval in basepair. + sup : int + Superior limit of the interval in basepair. + Returns + ------- + min_tot : float + Minimum value of the list of list in this interval. + max_tot : float + Maximum value of the list of list in this interval. + Examples + -------- + >>> get_ylim([np.array([1, 4, 15]), np.array([1, 4, 15, 40])], + ... [np.array([5.5, 3.2, 17.10]), np.array([24, 32, 1.111, 18.5])], + ... 2, + ... 15 + ... ) + (1.111, 32.0) + """ + # Create a list in order to add all the interesting values + flatten_list = [] + for i, logbins in enumerate(xs): + # Iterate on xs. + # Search for the minimum index corresponding to the smallest bin + # superior or equal to inf (in pair base). + min_value = min(logbins[logbins >= inf], default=0) + min_index = np.where(logbins == min_value)[0] + # Skip chromosome if total size smaller than inf + if len(min_index) == 0: + continue + # Search for the maximum index corresponding to the biggest bin + # inferior or equal to sup (in pair base). + max_value = max(logbins[logbins <= sup]) + max_index = np.where(logbins == max_value)[0] + # Add the values in the interval in the flattened list. + if int(max_index) != len(logbins) - 1: + max_index += 1 + for j in range(int(min_index), int(max_index)): + flatten_list.append(curve[i][j]) + # Caluclate the min and the max of this list. + min_tot = min(flatten_list) + max_tot = max(flatten_list) + return min_tot, max_tot + + +def plot_ps_slope(xs, ps, labels, fig_path=None, inf=3000, sup=None): + """Compute two plots, one with the different distance law of each + arm/chromosome in loglog scale and one with the slope (derivative) of these + curves. Generate a svg file with savefig. + Parameters + ---------- + xs : list of numpy.ndarray + The list of the logbins of each ps. 
+ ps : list of numpy.ndarray + The list of ps. + labels_file : list of strings + Names of the different curves in the order in which they are given. + fig_path : str + Path where the figure will be created. If None (default), the plot is + shown in an interactive window. + inf : int + Value of the mimimum x of the window of the plot. Have to be strictly + positive. By default 3000. + sup : int + Value of the maximum x of the window of the plot. By default None. + + """ + # Give the max value for sup if no value have been attributed + if sup is None: + sup = max(max(xs, key=len)) + + # Compute slopes from the curves + slope = slope_distance_law(xs, ps) + # Make the plot of distance law + # Give a range of color + cols = iter(cm.rainbow(np.linspace(0, 1, len(ps)))) + fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, figsize=(18, 10)) + plt.subplots_adjust(left=0.05, right=0.85, top=0.93, bottom=0.07) + ax1.set_xlabel("Distance (pb)", fontsize="x-large") + ax1.set_ylabel("P(s)", fontsize="x-large") + ax1.set_title("Distance law", fontsize="xx-large") + ylim = get_ylim(xs, ps, inf, sup) + ax1.set_ylim(0.9 * ylim[0], 1.1 * ylim[1]) + for i in range(len(ps)): + # Iterate on the different distance law array and take them by order of + # size in order to have the color scale equivalent to the size scale + col = next(cols) + ax1.loglog(xs[i], ps[i], label=labels[i]) + # Make the same plot with the slope + cols = iter(cm.rainbow(np.linspace(0, 1, len(slope)))) + ax2.set_xlabel("Distance (pb)", fontsize="x-large") + ax2.set_ylabel("Slope", fontsize="x-large") + ax2.set_title("Slope of the distance law", fontsize="xx-large") + ax2.set_xlim([inf, sup]) + ylim = get_ylim(xs, slope, inf, sup) + ax2.set_ylim(1.1 * ylim[0], 0.9 * ylim[1]) + xs2 = [None] * len(xs) + for i in range(len(slope)): + xs2[i] = xs[i][:-1] + col = next(cols) + ax2.semilogx(xs2[i], slope[i], label=labels[i], subs=[2, 3, 4, 5, 6, 7, 8, 9]) + ax2.legend(loc="upper left", bbox_to_anchor=(1.02, 1.00), ncol=1, fontsize="large") + # Save the figure in svg + if fig_path is not None: + plt.savefig(fig_path) + else: + plt.show() + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input") + parser.add_argument("-f", "--fragment") + parser.add_argument("-c", "--centro_file") + parser.add_argument("-b", "--base") + parser.add_argument("-p", "--prefix") + parser.add_argument("-o", "--out_file") + parser.add_argument("-u", "--circular") + parser.add_argument("-r", "--rm_centro") + parser.add_argument("-l", "--plot") + parser.add_argument("-d", '--dist_plot') + args = parser.parse_args() + + input = args.input + fragment = args.fragment + prefix = args.prefix + centro_file = args.centro_file + if centro_file == "None": + centro_file = None + else: + centro_file = prefix+"_"+centro_file + base = float(args.base) + out_file = prefix+"_"+args.out_file + circular = args.circular + if circular.lower() == "false": + circular = False + elif circular.lower() == "true": + circular = True + rm_centro = args.rm_centro + if rm_centro == "None": + rm_centro = 0 + rm_centro = int(rm_centro) + plot = args.plot + if plot.lower() == "false": + plot = False + elif plot.lower() == "true": + plot = True + distance_law_plot = args.dist_plot + if distance_law_plot == "None": + distance_law_plot = None + else: + distance_law_plot = prefix+"_"+distance_law_plot + + x_s, p_s, _ = get_distance_law( + input, + fragment, + centro_file, + base, + out_file, + circular, + rm_centro + ) + + if plot: + # Retrieve chrom labels from 
distance law file + _, _, chr_labels = import_distance_law(out_file) + chr_labels = [lab[0] for lab in chr_labels] + chr_labels_idx = np.unique(chr_labels, return_index=True)[1] + chr_labels = [chr_labels[index] for index in sorted(chr_labels_idx)] + p_s = normalize_distance_law(x_s, p_s) + plot_ps_slope(x_s, p_s, labels=chr_labels, fig_path=distance_law_plot) diff --git a/bin/hicstuff_filter.py b/bin/hicstuff_filter.py new file mode 100755 index 0000000000000000000000000000000000000000..23b74245b8440ab3bbca398618f405d629f3d610 --- /dev/null +++ b/bin/hicstuff_filter.py @@ -0,0 +1,569 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Hi-C event filtering +Analyse the contents of a 3C library and filters spurious events such as loops +and uncuts to improve the overall signal. Filtering consists in excluding +- +and -+ Hi-C pairs if their reads are closer than a threshold in minimum number +of restriction fragments. This threshold represents the distance at which the +abundance of these events deviate significantly from the rest of the library. +It is estimated automatically by default using the median absolute deviation of +pairs at longer distances, but can also be entered manually. +The program takes a 2D BED file as input with the following fields: +chromA startA endA indiceA strandA chromB startB endB indiceB strandB +Each line of the file represents a Hi-C pair with reads A and B. The indices +are 0-based and represent the restriction fragment to which reads are +attributed. +@author: cmdoret (reimplementation of Axel KournaK's code) +""" + +import sys +import numpy as np +from collections import OrderedDict +import matplotlib.pyplot as plt +from hicstuff_log import logger +import argparse + + +def process_read_pair(line): + r"""Process and order read pairs in a .pairs record. + Takes a read pair (line) from a .pairs file as input, reorders the pair + so that read 1 in intrachromosomal pairs always has the smallest genomic + coordinate. + Parameters + ---------- + line : str + Record from a pairs file with columns: readID, chr1, pos1, chr2, + pos2, strand1, strand2, frag1, frag2 + Returns + ------- + dict + Dictionary with reordered pair where each column from the line as an + entry. The number of restriction sites separating reads in the pair and + the type of event are also stored in the dictionary, under keys + "nsites" and "type". + Examples + -------- + >>> d = process_read_pair("readX\ta\t1\tb\t20\t-\t-\t1\t3") + >>> for u in sorted(d.items()): + ... print(u) + ('chr1', 'a') + ('chr2', 'b') + ('frag1', 1) + ('frag2', 3) + ('nsites', 2) + ('pos1', 1) + ('pos2', 20) + ('readID', 'readX') + ('strand1', '-') + ('strand2', '-') + ('type', 'inter') + >>> d = process_read_pair('readY\ta\t20\ta\t10\t-\t+\t2\t1') + >>> [d[x] for x in sorted(d.keys())] + ['a', 'a', 1, 2, 1, 10, 20, 'readY', '+', '-', '+-'] + """ + # Split line by whitespace + p = line.split("\t") + if len(p) != 9: + raise ValueError( + "Your input file does not have 9 columns. Make sure " + "the file has the readID and twice the following 4 fields " + "(once for each read in the pair): chr pos strand frag." 
+ ) + # Saving each column as a dictionary entry with meaningful key + cols = [ + "readID", + "chr1", + "pos1", + "chr2", + "pos2", + "strand1", + "strand2", + "frag1", + "frag2", + ] + p = {cols[i]: p[i] for i in range(len(cols))} + # Transforming numeric columns to int + for col in ["pos1", "pos2", "frag1", "frag2"]: + p[col] = int(p[col]) + + # invert records for intrachromosomal pairs where rec2 comes before + # rec1 in genomic coordinates + if p["chr1"] == p["chr2"] and p["pos2"] < p["pos1"]: + p["strand1"], p["strand2"] = p["strand2"], p["strand1"] + p["pos1"], p["pos2"] = p["pos2"], p["pos1"] + p["frag1"], p["frag2"] = p["frag2"], p["frag1"] + + # Number of restriction sites separating reads in the pair + p["nsites"] = abs(p["frag2"] - p["frag1"]) + # Get event type + if p["chr1"] == p["chr2"]: + p["type"] = "".join([p["strand1"], p["strand2"]]) + else: + p["type"] = "inter" + + return p + + +def get_thresholds( + in_dat, interactive=False, plot_events=False, fig_path=None, prefix=None +): + """Guess distance threshold for event filtering + Analyse the events in the first million of Hi-C pairs in the library, plot + the occurrences of each event type according to number of restriction + fragments, and ask user interactively for the minimum threshold for uncuts + and loops. + Parameters + ---------- + in_dat: str + Path to the .pairs file containing Hi-C pairs. + interactive: bool + If True, plots are diplayed and thresholds are required interactively. + plot_events : bool + Whether to show the plot + fig_path : str + Path where the figure will be saved. If None, the figure will be + diplayed interactively. + prefix : str + If the library has a name, it will be shown on plots. + + Returns + ------- + dictionary + dictionary with keys "uncuts" and "loops" where the values are the + corresponding thresholds entered by the user. + """ + thr_uncut = None + thr_loop = None + max_sites = 50 + # Map of event -> legend name of event for intrachromosomal pairs. + legend = { + "++": "++ (weird)", + "--": "-- (weird)", + "+-": "+- (uncuts)", + "-+": "-+ (loops)", + } + colors = {"++": "#222222", "+-": "r", "--": "#666666", "-+": "tab:orange"} + n_events = {event: np.zeros(max_sites) for event in legend} + i = 0 + # open the file for reading (just the first 1 000 000 lines) + with open(in_dat, "r") as pairs: + for line in pairs: + # Skip header lines + if line.startswith("#"): + continue + i += 1 + # Only use the first million pairs to estimate thresholds + if i == 1000000: + break + # Process Hi-C pair into a dictionary + p = process_read_pair(line) + # Type of event and number of restriction site between reads + etype = p["type"] + nsites = p["nsites"] + # Count number of events for intrachrom pairs + if etype != "inter" and nsites < max_sites: + n_events[etype][nsites] += 1 + + def plot_event(n_events, legend, name): + """Plot the frequency of a given event types over distance.""" + plt.xlim([-0.5, 15]) + plt.plot( + range(n_events[name].shape[0]), + n_events[name], + "o-", + label=legend[name], + linewidth=2.0, + c=colors[name], + ) + + if interactive: + # PLot: + try: + plt.figure(0) + for event in legend: + plot_event(n_events, legend, event) + plt.grid() + plt.xlabel("Number of restriction fragment(s)") + plt.ylabel("Number of events") + plt.yscale("log") + plt.legend() + plt.show(block=False) + + except Exception: + logger.error( + "Unable to show plots, skipping figure generation. Perhaps " + "there is no Xserver running ? (might be due to windows " + "environment). 
Try running without the interactive option." + ) + + # Asks the user for appropriate thresholds + print( + "Please enter the number of restriction fragments separating " + "reads in a Hi-C pair below or at which loops and " + "uncuts events will be excluded\n", + file=sys.stderr, + ) + thr_uncut = int(input("Enter threshold for the uncuts events (+-):")) + thr_loop = int(input("Enter threshold for the loops events (-+):")) + try: + plt.clf() + except Exception: + pass + else: + # Estimate thresholds from data + for event in n_events: + fixed = n_events[event] + fixed[fixed == 0] = 1 + n_events[event] = fixed + + all_events = np.log(np.array(list(n_events.values()))) + # Compute median occurences at each restriction sites + event_med = np.median(all_events, axis=0) + # Compute MAD, to have a robust estimator of the expected deviation + # from median at long distances + mad = np.median(abs(all_events - event_med)) + exp_stdev = mad / 0.67449 + # Iterate over sites, from furthest to frag+2 + for site in range(max_sites)[:1:-1]: + # For uncuts and loops, keep the last (closest) site where the + # deviation from other events <= expected_stdev + if ( + abs(np.log(n_events["+-"][site]) - event_med[site]) + <= exp_stdev + ): + thr_uncut = site + if ( + abs(np.log(n_events["-+"][site]) - event_med[site]) + <= exp_stdev + ): + thr_loop = site + if thr_uncut is None or thr_loop is None: + raise ValueError( + "The threshold for loops or uncut could not be estimated. " + "Please try running with -i to investigate the problem." + ) + logger.info( + "Filtering with thresholds: uncuts={0} loops={1}".format( + thr_uncut, thr_loop + ) + ) + if plot_events: + try: + plt.figure(1) + plt.xlim([-0.5, 15]) + # Draw colored lines for events to discard + plt.plot( + range(0, thr_uncut + 1), + n_events["+-"][: thr_uncut + 1], + "o-", + c=colors["+-"], + label=legend["+-"], + ) + plt.plot( + range(0, thr_loop + 1), + n_events["-+"][: thr_loop + 1], + "o-", + c=colors["-+"], + label=legend["-+"], + ) + plt.plot( + range(0, 2), + n_events["--"][:2], + "o-", + c=colors["--"], + label=legend["--"], + ) + plt.plot( + range(0, 2), + n_events["++"][:2], + "o-", + c=colors["++"], + label=legend["++"], + ) + # Draw black lines for events to keep + plt.plot( + range(thr_uncut, n_events["+-"].shape[0]), + n_events["+-"][thr_uncut:], + "o-", + range(thr_loop, n_events["-+"].shape[0]), + n_events["-+"][thr_loop:], + "o-", + range(1, n_events["--"].shape[0]), + n_events["--"][1:], + "o-", + range(1, n_events["++"].shape[0]), + n_events["++"][1:], + "o-", + label="kept", + linewidth=2.0, + c="g", + ) + plt.grid() + plt.xlabel("Number of restriction site(s)") + plt.ylabel("Number of events") + plt.yscale("log") + # Remove duplicate "kept" entries in legend + handles, labels = plt.gca().get_legend_handles_labels() + by_label = OrderedDict(zip(labels, handles)) + plt.legend(by_label.values(), by_label.keys()) + # Show uncut and loop threshold as vertical lines + plt.axvline(x=thr_loop, color=colors["-+"]) + plt.axvline(x=thr_uncut, color=colors["+-"]) + + if prefix: + plt.title( + "Library events by distance in {}".format(prefix) + ) + plt.tight_layout() + if fig_path: + plt.savefig(fig_path) + else: + plt.show(block=False) + # plt.clf() + + except Exception: + logger.error( + "Unable to show plots, skipping figure generation. Is " + "an X server running? (might be due to windows " + "environment). Try running without the plot option." 
+ ) + return thr_uncut, thr_loop + + +def filter_events( + in_dat, + out_filtered, + thr_uncut, + thr_loop, + plot_events=False, + fig_path=None, + prefix=None, +): + """Filter events (loops, uncuts and weirds) + Filter out spurious intrachromosomal Hi-C pairs from input file. +- pairs + with reads closer or at the uncut threshold and -+ pairs with reads closer + or at the loop thresholds are excluded from the ouput file. -- and ++ pairs + with both mates on the same fragments are also discarded. All others are + written. + Parameters + ---------- + in_dat : file object + File handle in read mode to the 2D BED file containing Hi-C pairs. + out_filtered : file object + File handle in write mode the output filtered 2D BED file. + thr_uncut : int + Minimum number of restriction sites between reads to keep an + intrachromosomal +- pair. + thr_loop : int + Minimum number of restriction sites between reads to keep an + intrachromosomal -+ pair. + plot_events : bool + If True, a plot showing the proportion of each type of event will be + shown after filtering. + fig_path : str + Path where the figure will be saved. If None, figure is displayed + interactively. + prefix : str + If the library has a name, it will be shown on plots. + """ + n_uncuts = 0 + n_loops = 0 + n_weirds = 0 + lrange_intra = 0 + lrange_inter = 0 + + # open the files for reading and writing + with open(in_dat, "r") as pairs, open(out_filtered, "w") as filtered: + for line in pairs: # iterate over each line + # Copy header lines to output + if line.startswith("#"): + filtered.write(line) + continue + + p = process_read_pair(line) + line_to_write = ( + "\t".join( + map( + str, + ( + p["readID"], + p["chr1"], + p["pos1"], + p["chr2"], + p["pos2"], + p["strand1"], + p["strand2"], + p["frag1"], + p["frag2"], + ), + ) + ) + + "\n" + ) + if p["chr1"] == p["chr2"]: + # Do not report ++ and -- pairs on the same fragment (impossible) + if p["frag1"] == p["frag2"] and p["strand1"] == p["strand2"]: + n_weirds += 1 + elif p["nsites"] <= thr_loop and p["type"] == "-+": + n_loops += 1 + elif p["nsites"] <= thr_uncut and p["type"] == "+-": + n_uncuts += 1 + else: + lrange_intra += 1 + filtered.write(line_to_write) + + if p["chr1"] != p["chr2"]: + lrange_inter += 1 + filtered.write(line_to_write) + + if lrange_inter > 0: + ratio_inter = round( + 100 * lrange_inter / float(lrange_intra + lrange_inter), 2 + ) + else: + ratio_inter = 0 + + # Log quick summary of operation results + kept = lrange_intra + lrange_inter + discarded = n_loops + n_uncuts + n_weirds + total = kept + discarded + logger.info( + "Proportion of inter contacts: {0}% (intra: {1}, " + "inter: {2})".format(ratio_inter, lrange_intra, lrange_inter) + ) + logger.info( + "{0} pairs discarded: Loops: {1}, Uncuts: {2}, Weirds: {3}".format( + discarded, n_loops, n_uncuts, n_weirds + ) + ) + logger.info( + "{0} pairs kept ({1}%)".format( + kept, round(100 * kept / (kept + discarded), 2) + ) + ) + + # Visualize summary if requested by user + if plot_events: + try: + # Plot: make a square figure and axes to plot a pieChart: + plt.figure(2, figsize=(6, 6)) + # The slices will be ordered and plotted counter-clockwise. 
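+            # Raw event counts serve directly as the slice sizes; plt.pie scales
+            # them to fractions of their total, so no manual normalization is done here.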
+ fracs = [n_uncuts, n_loops, n_weirds, lrange_intra, lrange_inter] + # Format labels to include event names and proportion + labels = list( + map( + lambda x: (x[0] + ": %.2f%%") % (100 * x[1] / total), + [ + ("Uncuts", n_uncuts), + ("Loops", n_loops), + ("Weirds", n_weirds), + ("3D intra", lrange_intra), + ("3D inter", lrange_inter), + ], + ) + ) + colors = ["salmon", "lightskyblue", "yellow", "palegreen", "plum"] + patches, _ = plt.pie(fracs, colors=colors, startangle=90) + plt.legend( + patches, + labels, + loc='upper left', + bbox_to_anchor=(-0.1, 1.), + ) + if prefix: + plt.title( + "Distribution of library events in {}".format(prefix), + bbox={"facecolor": "1.0", "pad": 5}, + ) + plt.text( + 0.3, + 1.15, + "Threshold Uncuts = " + str(thr_uncut), + fontdict=None, + ) + plt.text( + 0.3, + 1.05, + "Threshold Loops = " + str(thr_loop), + fontdict=None, + ) + + plt.text( + -1.5, + -1.2, + "Total number of reads = " + str(total), + fontdict=None, + ) + plt.text( + -1.5, + -1.3, + "Ratio inter/(intra+inter) = " + str(ratio_inter) + "%", + fontdict=None, + ) + percentage = round( + 100 + * float(lrange_inter + lrange_intra) + / (n_loops + n_uncuts + n_weirds + lrange_inter + lrange_intra) + ) + plt.text( + -1.5, + -1.4, + "Selected reads = {0}%".format(percentage), + fontdict=None, + ) + if fig_path: + plt.savefig(fig_path) + else: + plt.show() + plt.clf() + except Exception: + logger.error( + "Unable to show plots. Perhaps there is no Xserver running ?" + "(might be due to windows environment) skipping figure " + "generation." + ) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input") + parser.add_argument("-o", "--output") + parser.add_argument("-p", "--prefix") + parser.add_argument("-l", "--plot") + parser.add_argument("-d", '--dist_plot') + parser.add_argument("-q", "--pie_plot") + args = parser.parse_args() + + pairs_idx = args.input + pairs_out = args.output + prefix = args.prefix + pairs_out = prefix+"_"+pairs_out + plot = args.plot + if plot.lower() == "false": + plot = False + elif plot.lower() == "true": + plot = True + dist_plot = args.dist_plot + if dist_plot == "None": + dist_plot = None + else: + dist_plot = prefix+"_"+dist_plot + pie_plot = args.pie_plot + if pie_plot == "None": + pie_plot = None + else: + pie_plot = prefix+"_"+pie_plot + + uncut_thr, loop_thr = get_thresholds( + pairs_idx, plot_events=plot, fig_path=dist_plot, prefix=prefix + ) + + filter_events( + pairs_idx, + pairs_out, + uncut_thr, + loop_thr, + plot_events=plot, + fig_path=pie_plot, + prefix=prefix + ) + diff --git a/bin/hicstuff_filter_pcr.py b/bin/hicstuff_filter_pcr.py new file mode 100755 index 0000000000000000000000000000000000000000..40c45687d85de9e2e055776761df9152e2361971 --- /dev/null +++ b/bin/hicstuff_filter_pcr.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import hicstuff_io as hio +from hicstuff_log import logger +import argparse +import os, time, csv, sys, re + +def filter_pcr_dup(pairs_idx_file, filtered_file): + """ + Filter out PCR duplicates from a coordinate-sorted pairs file using + overrrepresented exact coordinates. If multiple fragments have two reads + with the exact same coordinates, only one of those fragments is kept. + Parameters + ---------- + pairs_idx_file : str + Path to an indexed pairs file containing the Hi-C reads. + filtered_file : str + Path to the output pairs file after removing duplicates. 
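+    Examples
+    --------
+    A minimal usage sketch; the file names below are illustrative
+    placeholders, not outputs guaranteed by the pipeline.
+    >>> filter_pcr_dup("sample.valid_idx.pairs", "sample.valid_idx_filtered.pairs")  # doctest: +SKIP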
+ """ + # Keep count of how many reads are filtered + filter_count = 0 + reads_count = 0 + # Store header lines + header = hio.get_pairs_header(pairs_idx_file) + with open(pairs_idx_file, "r") as pairs, open(filtered_file, "w") as filtered: + # Copy header lines to filtered file + for head_line in header: + filtered.write(head_line + "\n") + next(pairs) + + # Use csv methods to easily access columns + paircols = [ + "readID", + "chr1", + "pos1", + "chr2", + "pos2", + "strand1", + "strand2", + "frag1", + "frag2", + ] + # Columns used for comparison of coordinates + coord_cols = [col for col in paircols if col != "readID"] + pair_reader = csv.DictReader(pairs, delimiter="\t", fieldnames=paircols) + filt_writer = csv.DictWriter(filtered, delimiter="\t", fieldnames=paircols) + + # Initialise a variable to store coordinates of reads in previous pair + prev = {k: 0 for k in paircols} + for pair in pair_reader: + reads_count += 1 + # If coordinates are the same as before, skip pair + if all(pair[pair_var] == prev[pair_var] for pair_var in coord_cols): + filter_count += 1 + continue + # Else write pair and store new coordinates as previous + else: + filt_writer.writerow(pair) + prev = pair + logger.info( + "%d%% PCR duplicates have been filtered out (%d / %d pairs) " + % (100 * round(filter_count / reads_count, 3), filter_count, reads_count) + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input") + parser.add_argument("-p", "--prefix") + parser.add_argument("-o" ,"--output_file") + args = parser.parse_args() + + input = args.input + prefix = args.prefix + output_file = prefix+"_"+args.output_file + + filter_pcr_dup(input, output_file) diff --git a/bin/hicstuff_fragments.py b/bin/hicstuff_fragments.py new file mode 100755 index 0000000000000000000000000000000000000000..5594097f4a8177d32ab879b75d9c18c6f814ee2d --- /dev/null +++ b/bin/hicstuff_fragments.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +import hicstuff_digest as hcd +import argparse + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--enzyme") + parser.add_argument("-i", "--index") + parser.add_argument("-m", "--min_size") + parser.add_argument("-c", "--circular") + parser.add_argument("-o", "--output_contigs") + parser.add_argument("-f", "--output_frags") + parser.add_argument("-p", "--plot") + parser.add_argument("-g", "--fig_path") + args = parser.parse_args() + + enzyme = args.enzyme + index = args.index + min_size = args.min_size + circular = args.circular + if circular.lower() == "false": + circular = False + elif circular.lower() == "true": + circular = True + output_contigs = args.output_contigs + output_frags = args.output_frags + plot = args.plot + if plot.lower() == "false": + plot = False + elif plot.lower() == "true": + plot = True + fig_path = args.fig_path + + + #hicstuff case sensitive enzymes adaptation + if enzyme == "hindiii": + enzyme = "HindIII" + elif enzyme == "dpnii": + enzyme = "DpnII" + elif enzyme == "bglii": + enzyme = "BglII" + elif enzyme == "mboi": + enzyme = "MboI" + elif enzyme == "arima": + enzyme = "Arima" + + + # Generate info_contigs and fragments_list output files + hcd.write_frag_info( + index, + enzyme, + min_size, + circular, + output_contigs, + output_frags + ) + + # Log fragment size distribution + hcd.frag_len(output_frags, plot=plot, fig_path=fig_path) diff --git a/bin/hicstuff_io.py b/bin/hicstuff_io.py new file mode 100644 index 
0000000000000000000000000000000000000000..c8c4dde924c52d5e44661a3e528acf1cae53388e --- /dev/null +++ b/bin/hicstuff_io.py @@ -0,0 +1,1379 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import gzip +import zipfile +import bz2 +import io +import os +import functools +import sys +import numpy as np +import pandas as pd +import collections +import subprocess as sp +from scipy.sparse import tril, triu +import pathlib +import re +from os.path import join, exists +from random import getrandbits +from scipy.sparse import coo_matrix +from Bio import SeqIO, SeqUtils +#import hicstuff.hicstuff as hcs +#from hicstuff.log import logger +#from hicstuff.version import __version__ +import scipy.stats as ss + +DEFAULT_MAX_MATRIX_SHAPE = 10000 + +DEFAULT_FRAGMENTS_LIST_FILE_NAME = "fragments_list.txt" +DEFAULT_INFO_CONTIGS_FILE_NAME = "info_contigs.txt" +DEFAULT_SPARSE_MATRIX_FILE_NAME = "abs_fragments_contacts_weighted.txt" + + +def _cols_to_sparse(sparse_array, shape=None, dtype=np.float64): + """ + Make a coordinate based sparse matrix from columns. + Convert (3, n) shaped arrays to a sparse matrix. The first + one acts as the row coordinates, the second one as the + column coordinates, the third one as the data points + for each pair. If duplicates are found, the data points are + added. + Parameters + ---------- + sparse_array : array_like + An array with exactly three columns representing the sparse + matrix data in coordinate format. + shape : tuple of int + The total number of rows and columns in the matrix. Will be estimated + from nonzero values if omitted. + dtype : type, optional + The type of data being loaded. Default is numpy.float64 + Example + ------- + >>> import numpy as np + >>> row, col = np.array([1, 2, 3]), np.array([3, 2, 1]) + >>> data = np.array([4, 5, 6]) + >>> M = np.array([row, col, data]).T + >>> S = _cols_to_sparse(M) + >>> print(S.todense()) + [[0. 0. 0. 0.] + [0. 0. 0. 4.] + [0. 0. 5. 0.] + [0. 6. 0. 0.]] + """ + + row = sparse_array[:, 0] + col = sparse_array[:, 1] + data = sparse_array[:, 2] + if shape is None: + n = int(max(np.amax(row), np.amax(col))) + 1 + shape = (n, n) + + S = coo_matrix((data, (row, col)), shape=shape, dtype=dtype) + return S + + +def load_sparse_matrix(mat_path, binning=1, dtype=np.float64): + """Load sparse matrix + Load a text file matrix into a sparse matrix object. The expected format is + a 3 column file where columns are row_number, col_number, value. The first + line consists of 3 values representing the total number of rows, columns + and nonzero values. + Parameters + ---------- + mat_path : file, str or pathlib.Path + The input matrix file in instagraal format. + binning : int or "auto" + The binning to perform. If "auto", binning will + be automatically inferred so that the matrix size + will not go beyond (10000, 10000) in shape. That + can be changed by modifying the DEFAULT_MAX_MATRIX_SHAPE + value. Default is 1, i.e. no binning is performed + dtype : type, optional + The type of data being loaded. Default is numpy.float64 + Returns + ------- + sparse_mat : scipy.sparse.coo_matrix + The output (sparse) matrix in COOrdinate format. 
+ Examples + -------- + >>> S = load_sparse_matrix('test_data/mat_5kb.tsv', binning=1) + >>> S.data[:10] + array([84., 2., 3., 2., 1., 1., 50., 1., 66., 1.]) + >>> S.shape + (16, 16) + """ + try: + cols_arr = np.loadtxt(mat_path, delimiter="\t", dtype=dtype) + shape = (int(cols_arr[0, 0]), int(cols_arr[0, 1])) + except ValueError: + cols_arr = np.loadtxt( + mat_path, delimiter="\t", dtype=dtype, skiprows=1 + ) + shape = None + + # Get values into an array without the header. Use the header to give size. + sparse_mat = _cols_to_sparse(cols_arr[1:, :], shape=shape, dtype=dtype) + + if binning == "auto": + num_bins = max(sparse_mat.shape) + 1 + subsampling_factor = num_bins // DEFAULT_MAX_MATRIX_SHAPE + else: + subsampling_factor = binning + sparse_mat = hcs.bin_sparse( + sparse_mat, subsampling_factor=subsampling_factor + ) + return sparse_mat + + +def save_sparse_matrix(s_mat, path): + """Save a sparse matrix + Saves a sparse matrix object into tsv format. + Parameters + ---------- + s_mat : scipy.sparse.coo_matrix + The sparse matrix to save on disk + path : str + File path where the matrix will be stored + """ + if s_mat.format != "coo": + ValueError("Sparse matrix must be in coo format") + dtype = s_mat.dtype + fmt = "%i" if dtype == int else "%.10e" + sparse_arr = np.vstack([s_mat.row, s_mat.col, s_mat.data]).T + + np.savetxt( + path, + sparse_arr, + header="{nrows}\t{ncols}\t{nonzero}".format( + nrows=s_mat.shape[0], ncols=s_mat.shape[1], nonzero=s_mat.nnz + ), + comments="", + fmt=fmt, + delimiter="\t", + ) + + +def load_pos_col(path, colnum, header=1, dtype=np.int64): + """ + Loads a single column of a TSV file with header into a numpy array. + + Parameters + ---------- + path : str + The path of the TSV file to load. + colnum : int + The 0-based index of the column to load. + header : int + Number of line to skip. By default the header is a single line. + Returns + ------- + numpy.array : + A 1D numpy array with the + Examples + -------- + >>> load_pos_col('test_data/mat_5kb.tsv', 0)[:10] + array([0, 0, 0, 0, 0, 0, 1, 1, 2, 2]) + """ + pos_arr = np.genfromtxt( + path, + delimiter="\t", + usecols=(colnum,), + skip_header=header, + dtype=dtype, + ) + return pos_arr + + +def generate_temp_dir(path): + """Temporary directory generation + Generates a temporary file with a random name at the input path. + + Parameters + ---------- + path : str + The path at which the temporary directory will be created. + + Returns + ------- + str + The path of the newly created temporary directory. + """ + exist = True + while exist: + # Keep trying random directory names if they already exist + directory = str(hex(getrandbits(32)))[2:] + full_path = join(path, directory) + if not exists(full_path): + exist = False + try: + os.makedirs(full_path) + except PermissionError: + raise PermissionError( + "The temporary directory cannot be created in {}. 
" + "Make sure you have write permission.".format(path) + ) + return full_path + + +def _check_cooler(fun): + """Decorates function `fun` to check if cooler is available..""" + + @functools.wraps(fun) + def wrapped(*args, **kwargs): + try: + import cooler + + fun.__globals__["cooler"] = cooler + except ImportError: + logger.error( + "The cooler package is required to use {0}, please install it first".format( + fun.__name__ + ) + ) + raise ImportError("The cooler package is required.") + return fun(*args, **kwargs) + + return wrapped + + +@_check_cooler +def add_cool_column( + clr, column, column_name, table_name="bins", metadata={}, dtype=None +): + """ + Adds a new column to a loaded Cooler store. If the column exists, + it is replaced. This will affect the .cool file. + + Parameters + ---------- + clr : Cooler object + A Cooler store. + column : pandas Series + The column to add to the cooler. + column_name : str + The name of the column to add. + table_name : str + The name of the table to which the column should be added. + Defaults to the "bins" table. + metadata : dict + A dictionary of metadata to associate with the new column. + """ + with clr.open("r+") as c: + if column_name in c[table_name]: + del c[table_name][column_name] + h5opts = dict(compression="gzip", compression_opts=6) + c[table_name].create_dataset( + column_name, data=column, dtype=dtype, **h5opts + ) + c[table_name][column_name].attrs.update(metadata) + + +@_check_cooler +def load_cool(cool): + """ + Reads a cool file into memory and parses it into graal style tables. + + Parameters + ---------- + cool : str + Path to the input .cool file. + Returns + ------- + mat : scipy coo_matrix + Hi-C contact map in COO format. + frags : pandas DataFrame + Table off bins matching the matrix. Corresponds to the content of the fragments_list.txt file. + chroms : pandas DataFrame + Table of chromosome informations. + """ + c = cooler.Cooler(cool) # pylint: disable=undefined-variable + frags = c.bins()[:] + chroms = c.chroms()[:] + mat = c.pixels()[:] + frags.rename( + columns={"start": "start_pos", "end": "end_pos"}, inplace=True + ) + frags["id"] = frags.groupby("chrom", sort=False).cumcount() + 1 + # Try loading hicstuff-specific columns + try: + frags = frags[ + ["id", "chrom", "start_pos", "end_pos", "size", "gc_content"] + ] + # If absent, only load standard columns + except KeyError: + frags = frags[["id", "chrom", "start_pos", "end_pos"]] + + chroms["cumul_length"] = ( + chroms.length.shift(1).fillna(0).cumsum().astype(int) + ) + n_frags = c.bins()[:].groupby("chrom", sort=False).count().start + chroms["n_frags"] = chroms.merge( + n_frags, right_index=True, left_on="name", how="left" + ).start + chroms.rename(columns={"name": "contig"}, inplace=True) + n = int(max(np.amax(mat.bin1_id), np.amax(mat.bin2_id))) + 1 + shape = (n, n) + mat = coo_matrix((mat["count"], (mat.bin1_id, mat.bin2_id)), shape=shape) + + return mat, frags, chroms + + +@_check_cooler +def save_cool(cool_out, mat, frags, metadata={}): + """ + Writes a .cool file from graal style tables. + + Parameters + ---------- + cool_out : str + Path to the output cool file. + mat : scipy coo_matrix + The Hi-C contact matrix in sparse COO format. + frags : pandas DataFrame + The graal style 'fragments_list' table. + metadata : dict + Potential metadata to associate with the cool file. + """ + up_tri = False + # Check if symmetric matrix is symmetric + # (i.e. 
only upper triangle or full mat) + if (abs(mat - mat.T) > 1e-10).nnz != 0: + up_tri = True + # Drop useless column + try: + bins = frags.drop("id", axis=1) + except KeyError: + bins = frags + # Get column names right + bins.rename( + columns={"seq": "chrom", "start_pos": "start", "end_pos": "end"}, + inplace=True, + ) + mat_dict = {"bin1_id": mat.row, "bin2_id": mat.col, "count": mat.data} + pixels = pd.DataFrame(mat_dict) + cooler.create_cooler( # pylint: disable=undefined-variable + cool_out, + bins, + pixels, + metadata=metadata, + symmetric_upper=up_tri, + triucheck=False, + ) + + +def read_compressed(filename, mode='r'): + """Read compressed file + Opens the file in read mode with appropriate decompression algorithm. + Parameters + ---------- + filename : str + The path to the input file + Returns + ------- + file-like object + The handle to access the input file's content + """ + + # Standard header bytes for diff compression formats + comp_bytes = { + b"\x1f\x8b\x08": "gz", + b"\x42\x5a\x68": "bz2", + b"\x50\x4b\x03\x04": "zip", + } + + max_len = max(len(x) for x in comp_bytes) + + def file_type(filename): + """Guess file type + Compare header bytes with those in the file and return type. + """ + with open(filename, "rb") as f: + file_start = f.read(max_len) + for magic, filetype in comp_bytes.items(): + if file_start.startswith(magic): + return filetype + return "uncompressed" + + # Open file with appropriate function + mode_map = {'r': 'rt', 'rb': 'rb'} + comp = file_type(filename) + if comp == "gz": + return gzip.open(filename, mode_map[mode]) + elif comp == "bz2": + return bz2.open(filename, mode_map[mode]) + elif comp == "zip": + zip_arch = zipfile.ZipFile(filename, mode) + if len(zip_arch.namelist()) > 1: + raise IOError( + "Only a single fastq file must be in the zip archive." + ) + else: + # ZipFile opens as bytes by default, using io to read as text + zip_content = zip_arch.open(zip_arch.namelist()[0], mode) + return io.TextIOWrapper(zip_content, encoding="utf-8") + else: + return open(filename, mode) + + +def is_compressed(filename): + """Check compression status + Check if the input file is compressed from the first bytes. + Parameters + ---------- + filename : str + The path to the input file + Returns + ------- + bool + True if the file is compressed, False otherwise. + """ + + # Standard header bytes for diff compression formats + comp_bytes = { + b"\x1f\x8b\x08": "gz", + b"\x42\x5a\x68": "bz2", + b"\x50\x4b\x03\x04": "zip", + } + max_len = max(len(x) for x in comp_bytes) + with open(filename, "rb") as f: + file_start = f.read(max_len) + for magic, _ in comp_bytes.items(): + if file_start.startswith(magic): + return True + return False + + +def from_dade_matrix(filename, header=False): + """Load a DADE matrix + Load a numpy array from a DADE matrix file, optionally + returning bin information from the header. Header data + processing is delegated downstream. + Parameters + ---------- + filename : str, file or pathlib.Path + The name of the file containing the DADE matrix. + header : bool + Whether to return as well information contained + in the header. Default is False. + Example + ------- + >>> import numpy as np + >>> import tempfile + >>> lines = [['RST', 'chr1~0', 'chr1~10', 'chr2~0', 'chr2~30'], + ... ['chr1~0', '5'], + ... ['chr1~10', '8', '3'], + ... ['chr2~0', '3', '5', '5'], + ... ['chr2~30', '5', '10', '11', '2'] + ... 
] + >>> formatted = ["\\t".join(l) + "\\n" for l in lines ] + >>> dade = tempfile.NamedTemporaryFile(mode='w') + >>> for fm in formatted: + ... dade.write(fm) + 34 + 9 + 12 + 13 + 18 + >>> dade.flush() + >>> M, h = from_dade_matrix(dade.name, header=True) + >>> dade.close() + >>> print(M) + [[ 5. 8. 3. 5.] + [ 8. 3. 5. 10.] + [ 3. 5. 5. 11.] + [ 5. 10. 11. 2.]] + >>> print(h) + ['chr1~0', 'chr1~10', 'chr2~0', 'chr2~30'] + See https://github.com/scovit/DADE for more details about Dade. + """ + + A = pd.read_csv(filename, sep="\t", header=None) + A.fillna("0", inplace=True) + M, headers = np.array(A.iloc[1:, 1:], dtype=np.float64), A.iloc[0, :] + matrix = M + M.T - np.diag(np.diag(M)) + if header: + return matrix, headers.tolist()[1:] + else: + return matrix + + +def to_dade_matrix(M, annotations="", filename=None): + """Returns a Dade matrix from input numpy matrix. Any annotations are added + as header. If filename is provided and valid, said matrix is also saved + as text. + """ + + n, m = M.shape + A = np.zeros((n + 1, m + 1)) + A[1:, 1:] = M + if not annotations: + annotations = np.array(["" for _ in n], dtype=str) + A[0, :] = annotations + A[:, 0] = annotations.T + if filename: + try: + np.savetxt(filename, A, fmt="%i") + logger.info( + "I saved input matrix in dade format as {0}".format( + str(filename) + ) + ) + except ValueError as e: + logger.warning("I couldn't save input matrix.") + logger.warning(str(e)) + + return A + + +def load_into_redis(filename): + """Load a file into redis + Load a matrix file and sotre it in memory with redis. + Useful to pass around huge datasets from scripts to + scripts and load them only once. + Inspired from https://gist.github.com/alexland/ce02d6ae5c8b63413843 + Parameters + ---------- + filename : str, file or pathlib.Path + The file of the matrix to load. + Returns + ------- + key : str + The key of the dataset needed to retrieve it from redis. + """ + try: + from redis import StrictRedis as redis + import time + except ImportError: + print( + "Error! Redis does not appear to be installed in your system.", + file=sys.stderr, + ) + exit(1) + + M = np.genfromtxt(filename, dtype=None) + array_dtype = str(M.dtype) + m, n = M.shape + M = M.ravel().tostring() + database = redis(host="localhost", port=6379, db=0) + key = "{0}|{1}#{2}#{3}".format(int(time.time()), array_dtype, m, n) + + database.set(key, M) + + return key + + +def load_from_redis(key): + """Retrieve a dataset from redis + Retrieve a cached dataset that was stored in redis + with the input key. + Parameters + ---------- + key : str + The key of the dataset that was stored in redis. + Returns + ------- + M : numpy.ndarray + The retrieved dataset in array format. + """ + + try: + from redis import StrictRedis as redis + except ImportError: + print( + "Error! Redis does not appear to be installed in your system.", + file=sys.stderr, + ) + exit(1) + + database = redis(host="localhost", port=6379, db=0) + + try: + M = database.get(key) + except KeyError: + print( + "Error! No dataset was found with the supplied key.", + file=sys.stderr, + ) + exit(1) + + array_dtype, n, m = key.split("|")[1].split("#") + + M = np.fromstring(M, dtype=array_dtype).reshape(int(n), int(m)) + return M + + +def dade_to_graal( + filename, + output_matrix=DEFAULT_SPARSE_MATRIX_FILE_NAME, + output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME, + output_frags=DEFAULT_SPARSE_MATRIX_FILE_NAME, + output_dir=None, +): + """Convert a matrix from DADE format (https://github.com/scovit/dade) + to a graal-compatible format. 
Since DADE matrices contain both fragment + and contact information all files are generated at the same time. + """ + + with open(output_matrix, "w") as sparse_file: + sparse_file.write("id_frag_a\tid_frag_b\tn_contact") + with open(filename) as file_handle: + first_line = file_handle.readline() + for row_index, line in enumerate(file_handle): + dense_row = np.array(line.split("\t")[1:], dtype=np.int32) + for col_index in np.nonzero(dense_row)[0]: + line_to_write = "{}\t{}\t{}\n".format( + row_index, col_index, dense_row[col_index] + ) + sparse_file.write(line_to_write) + + header = first_line.split("\t") + bin_type = header[0] + if bin_type == '"RST"': + logger.info("I detected fragment-wise binning") + elif bin_type == '"BIN"': + logger.info("I detected fixed size binning") + else: + logger.warning( + ( + "Sorry, I don't understand this matrix's " + "binning: I read {}".format(str(bin_type)) + ) + ) + + header_data = [ + header_elt.replace("'", "") + .replace('"', "") + .replace("\n", "") + .split("~") + for header_elt in header[1:] + ] + + ( + global_frag_ids, + contig_names, + local_frag_ids, + frag_starts, + frag_ends, + ) = np.array(list(zip(*header_data))) + + frag_starts = frag_starts.astype(np.int32) - 1 + frag_ends = frag_ends.astype(np.int32) - 1 + frag_lengths = frag_ends - frag_starts + + total_length = len(global_frag_ids) + + with open(output_contigs, "w") as info_contigs: + + info_contigs.write("contig\tlength\tn_frags\tcumul_length\n") + + cumul_length = 0 + + for contig in collections.OrderedDict.fromkeys(contig_names): + + length_tig = np.sum(frag_lengths[contig_names == contig]) + n_frags = collections.Counter(contig_names)[contig] + line_to_write = "%s\t%s\t%s\t%s\n" % ( + contig, + length_tig, + n_frags, + cumul_length, + ) + info_contigs.write(line_to_write) + cumul_length += n_frags + + with open(output_frags, "w") as fragments_list: + + fragments_list.write( + "id\tchrom\tstart_pos\tend_pos" "\tsize\tgc_content\n" + ) + bogus_gc = 0.5 + + for i in range(total_length): + line_to_write = "%s\t%s\t%s\t%s\t%s\t%s\n" % ( + int(local_frag_ids[i]) + 1, + contig_names[i], + frag_starts[i], + frag_ends[i], + frag_lengths[i], + bogus_gc, + ) + fragments_list.write(line_to_write) + + +def load_bedgraph2d(filename, bin_size=None, fragments_file=None): + """ + Loads matrix and fragment information from a 2D bedgraph file. Note this + function assumes chromosomes are ordered in alphabetical. order + + Parameters + ---------- + filename : str + Path to the bedgraph2D file. + bin_size : int + The size of bins in the case of fixed bin size. + fragments_file : str + Path to a fragments file to explicitely provide fragments positions. + If the matrix does not have fixed bin size, this prevents errors. + + Returns + ------- + mat : scipy.sparse.coo_matrix + The Hi-C contact map as the upper triangle of a symetric matrix, in + sparse format. + frags : pandas.DataFrame + The list of fragments/bin present in the matrix with their genomic + positions. 
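+    Examples
+    --------
+    A hypothetical call on a bedgraph2 file with a fixed 5 kb bin size; the
+    path and bin size are placeholders for illustration only.
+    >>> mat, frags, chroms = load_bedgraph2d("contacts.bg2", bin_size=5000)  # doctest: +SKIP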
+ """ + bed2d = pd.read_csv(filename, sep="\t", header=None) + chrom_sizes = {} + if bin_size is not None: + # If bin size if provided, retrieve chromosome lengths, this will be + # used when regenerating bin coordinates + chroms_left = bed2d[[3, 5]] + chroms_left.columns = [0, 2] + chroms = ( + pd.concat([bed2d[[0, 2]], chroms_left]) + .groupby([0], sort=False) + .max() + ) + for chrom, size in zip(chroms.index, np.array(chroms)): + chrom_sizes[chrom] = size[0] + elif fragments_file is None: + logger.warning( + "Please be aware that not all information can be restored from a " + "bg2 file without fixed bin size; fragments without any contact " + "will be lost" + ) + # Get all possible fragment chrom-positions into an array + frag_pos = np.vstack( + [np.array(bed2d[[0, 1, 2]]), np.array(bed2d[[3, 4, 5]])] + ) + # Sort by position (least important, col 1) + frag_pos = frag_pos[frag_pos[:, 1].argsort(kind="mergesort")] + # Then by chrom (most important, col 0) + frag_pos = frag_pos[frag_pos[:, 0].argsort(kind="mergesort")] + # Get unique names for fragments (chrom+pos) + ordered_frag_pos = ( + pd.DataFrame(frag_pos).drop_duplicates().reset_index(drop=True) + ) + frag_pos_a = bed2d[[0, 1]].apply(lambda x: tuple(x), axis=1) + frag_pos_b = bed2d[[3, 4]].apply(lambda x: tuple(x), axis=1) + # If fragments file is provided, use fragments positions to indices mapping + if fragments_file is not None: + frags = pd.read_csv(fragments_file, delimiter="\t") + frag_map = frags.apply(lambda x: (str(x.chrom), x.start_pos), axis=1) + frag_map = {f_name: f_idx for f_idx, f_name in enumerate(frag_map)} + # If fixed fragment size available, use it to reconstruct original + # fragments ID (even if they are absent from the bedgraph file). + elif bin_size is not None: + frag_map = {} + chrom_frags = [] + for chrom, size in chrom_sizes.items(): + prev_frags = len(frag_map) + for bin_id, bin_pos in enumerate(range(0, size, bin_size)): + frag_map[(chrom, bin_pos)] = bin_id + prev_frags + n_bins = size // bin_size + chrom_frags.append( + pd.DataFrame( + { + "id": range(1, n_bins + 1), + "chrom": np.repeat(chrom, n_bins), + "start_pos": range(0, size, bin_size), + "end_pos": range(bin_size, size + bin_size, bin_size), + } + ) + ) + frags = pd.concat(chrom_frags, axis=0).reset_index(drop=True) + frags.insert( + loc=3, column="size", value=frags.end_pos - frags.start_pos + ) + # If None available, guess fragments indices from bedgraph (potentially wrong) + else: + frag_map = { + (v[0], v[1]): i + for i, v in ordered_frag_pos.iloc[:, [0, 1]].iterrows() + } + frags = ordered_frag_pos.copy() + frags[3] = frags.iloc[:, 2] - frags.iloc[:, 1] + frags.insert(loc=0, column="id", value=0) + frags.id = frags.groupby([0], sort=False).cumcount() + 1 + frags.columns = ["id", "chrom", "start_pos", "end_pos", "size"] + # Match bin indices to their names + frag_id_a = np.array(list(map(lambda x: frag_map[x], frag_pos_a))) + frag_id_b = np.array(list(map(lambda x: frag_map[x], frag_pos_b))) + contacts = np.array(bed2d.iloc[:, 6].tolist()) + # Use index to build matrix + n_frags = len(frag_map.keys()) + mat = coo_matrix( + (contacts, (frag_id_a, frag_id_b)), shape=(n_frags, n_frags) + ) + + # Get size of each chromosome in basepairs + chromsizes = frags.groupby("chrom", sort=False).apply( + lambda x: np.int64(max(x.end_pos)) + ) + chrom_bins = frags.groupby("chrom", sort=False).size() + # Shift chromsizes by one to get starting bin, first one is zero + # Make chromsize cumulative to get start bin of each chrom + # Get chroms into a 1D 
array of bin starts + chrom_start = chrom_bins.shift(1, fill_value=0).cumsum() + chroms = pd.DataFrame( + { + "contig": chromsizes.index, + "length": chromsizes.values, + "n_frags": chrom_bins, + "cumul_length": chrom_start, + } + ) + return mat, frags, chroms + + +def flexible_hic_loader( + mat, fragments_file=None, chroms_file=None, quiet=False +): + """ + Wrapper function to load COO, bg2 or cool input and return the same output. + COO formats requires fragments_file and chroms_file options. bg2 format can + infer bin_size if fixed. When providing a bg2 matrix with uneven fragments + length, one should provide fragments_file as well or empty bins will be + truncated from the output. + + Parameters + ---------- + mat : str + Path to the matrix in graal, bedgraph2 or cool format. + fragments_file : str or None + Path to the file with fragments information (fragments_list.txt). + Only required if the matrix is in graal format. + chroms_file : str or None + Path to the file with chromosome information (info_contigs.txt). Only required + if the matrix is in graal format. + quiet : bool + If True, will silence warnings for empty outputs. + Returns + ------- + mat : scipy.sparse.coo_matrix + Sparse upper triangle Hi-C matrix. + frags : pandas.DataFrame or None + Table of fragment informations. None if information was not provided. + chroms : pandas.DataFrame or None + Table of chromosomes/contig information. None if information was not provided. + """ + hic_format = get_hic_format(mat) + # Load cool based on file extension + if hic_format == "cool": + mat, frags, chroms = load_cool(mat) + # Use the first line to determine COO / bg2 format + if hic_format == "bg2": + # Use the frags file to define bins if available + if fragments_file is not None: + mat, frags, chroms = load_bedgraph2d( + mat, fragments_file=fragments_file + ) + else: + # Guess if bin size is fixed based on MAD + bg2 = pd.read_csv(mat, sep="\t") + sizes = np.array(bg2.iloc[:, 2] - bg2.iloc[:, 1]) + size_mad = ss.median_abs_deviation(sizes, scale='normal') + # Use only the bg2 + if size_mad > 0: + mat, frags, chroms = load_bedgraph2d(mat) + logger.warning( + "Input is a bedgraph2d file with uneven bin size, " + "but no fragments_file was provided. Empty bins will " + "be missing from the output. To avoid this, provide a " + "fragments file." + ) + # Use fixed bin size + else: + mat, frags, chroms = load_bedgraph2d( + mat, bin_size=int(np.median(sizes)) + ) + + elif hic_format == "graal": + mat = load_sparse_matrix(mat) + try: + frags = pd.read_csv(fragments_file, sep="\t") + except ValueError: + if not quiet: + logger.warning( + "fragments_file was not provided when " + "loading a matrix in COO/graal format. frags will be None." + ) + frags = None + try: + chroms = pd.read_csv(chroms_file, sep="\t") + except ValueError: + if not quiet: + logger.warning( + "chroms_file was not provided when " + "loading a matrix in COO/graal format. chroms will be None." + ) + + chroms = None + + # Ensure the matrix is upper triangle symmetric + if mat.shape[0] == mat.shape[1]: + if (abs(mat - mat.T) > 1e-10).nnz > 0: + mat = mat + tril(mat, k=-1).T + mat = triu(mat, format="coo") + + return mat, frags, chroms + + +def get_hic_format(mat): + """Returns the format of the input Hi-C matrix + + Parameters + ---------- + mat : str + Path to the input Hi-C matrix + Returns + ------- + str : + Hi-C format string. 
One of graal, bg2, cool + """ + if ( + mat.endswith(".cool") + or mat.count(".mcool::/") == 1 + or mat.count(".cool::/") == 1 + ): + hic_format = "cool" + else: + # Use the first line to determine COO / bg2 format + ncols = len(open(mat).readline().split("\t")) + if ncols == 7: + hic_format = "bg2" + elif ncols == 3: + hic_format = "graal" + else: + raise ValueError("Unkown file format") + return hic_format + + +def flexible_hic_saver( + mat, out_prefix, frags=None, chroms=None, hic_fmt="graal", quiet=False, +): + """ + Saves objects to the desired Hi-C file format. + Parameters + ---------- + mat : scipy.sparse.coo_matrix + Hi-C contact map. + out_prefix : str + Output path without extension (the extension is added based on hic_fmt). + frags : pandas.DataFrame or None + Table of fragments informations. + chroms : pandas.DataFrame or None + Table of chromosomes / contigs informations. + hic_fmt : str + Output format. Can be one of graal for graal-compatible COO format, bg2 for + 2D bedgraph format, or cool for cooler compatible format. + """ + if hic_fmt == "graal": + save_sparse_matrix(mat, out_prefix + ".mat.tsv") + try: + frags.to_csv(out_prefix + ".frags.tsv", sep="\t", index=False) + except AttributeError: + if not quiet: + logger.warning( + "Could not create fragments_list.txt from input files" + ) + try: + chroms.to_csv(out_prefix + ".chr.tsv", sep="\t", index=False) + except AttributeError: + if not quiet: + logger.warning( + "Could not create info_contigs.txt from input files" + ) + elif hic_fmt == "cool": + frag_sizes = frags.end_pos - frags.start_pos + size_mad = np.median(frag_sizes - np.median(frag_sizes)) + bin_type = "variable" if size_mad else "fixed" + try: + save_cool( + out_prefix + ".cool", + mat, + frags, + metadata={"hicstuff": __version__, "bin-type": bin_type}, + ) + except NameError: + NameError("frags is required to save a cool file") + elif hic_fmt == "bg2": + try: + save_bedgraph2d(mat, frags, out_prefix + ".bg2") + except NameError: + NameError("frags is required to save a bg2 file") + else: + raise ValueError("Unknown output format: {0}".format(hic_fmt)) + + +def save_bedgraph2d(mat, frags, out_path): + """ + Given a sparse matrix and a corresponding list of fragments, save a file + in 2D bedgraph format. + Parameters + ---------- + mat : scipy.sparse.coo_matrix + The sparse contact map. + frags : pandas.DataFrame + A structure containing the annotations for each matrix bin. Should + correspond to the content of the fragments_list.txt file. + """ + + mat_df = pd.DataFrame( + {"row": mat.row, "col": mat.col, "data": mat.data.astype(int)} + ) + # Merge fragments with matrix based on row indices to annotate rows + merge_mat = mat_df.merge( + frags, left_on="row", right_index=True, how="left", suffixes=("", "") + ) + # Rename annotations to assign to frag1 + merge_mat.rename( + columns={"chrom": "chr1", "start_pos": "start1", "end_pos": "end1"}, + inplace=True, + ) + # Do the same operation for cols (frag2) + merge_mat = merge_mat.merge( + frags, + left_on="col", + right_index=True, + how="left", + suffixes=("_0", "_2"), + ) + merge_mat.rename( + columns={"chrom": "chr2", "start_pos": "start2", "end_pos": "end2"}, + inplace=True, + ) + # Select only relevant columns in correct order + bg2 = merge_mat.loc[ + :, ["chr1", "start1", "end1", "chr2", "start2", "end2", "data"] + ] + bg2.to_csv(out_path, header=None, index=False, sep="\t") + + +def get_pos_cols(df): + """Get column names representing chromosome, start and end column + from a dataframe. 
Allows flexible names. + Parameters + ---------- + df : pd.DataFrame + Returns + ------- + tuple of str: + Tuple containing the names of the chromosome, start and end + columns in the input dataframe. + Examples + -------- + >>> import pandas as pd + >>> d = [1, 2, 3] + >>> df = pd.DataFrame( + ... {'Chromosome': d, 'Start': d, 'End': d, 'species': d} + ... ) + >>> get_pos_cols(df) + ('Chromosome', 'Start', 'End') + >>> df = pd.DataFrame( + ... {'id': d, 'chr': d, 'start_bp': d, 'end_bp': d} + ... ) + >>> get_pos_cols(df) + ('chr', 'start_bp', 'end_bp') + """ + + # Get case insensitive column names + cnames = df.columns + inames = cnames.str.lower() + + def _col_getter(cols, pat): + """Helper to get column index from the start of its name""" + mask = cols.str.startswith(pat) + idx = np.flatnonzero(mask)[0] + return idx + + chrom_col = cnames[_col_getter(inames, "chr")] + start_col = cnames[_col_getter(inames, "start")] + end_col = cnames[_col_getter(inames, "end")] + return chrom_col, start_col, end_col + + +def gc_bins(genome_path, frags): + """Generate GC content annotation for bins using input genome. + Parameters + ---------- + genome_path : str + Path the the genome file in FASTA format. + frags : pandas.DataFrame + Table containing bin segmentation of the genome. + Required columns: chrom, start, end. + Returns + ------- + numpy.ndarray of floats: + GC content per bin, in the range [0.0, 1.0]. + """ + # Grab columns by name (catch start, Start, star_pos, etc) + chrom_col, start_col, end_col = get_pos_cols(frags) + # Fill the gc array by chromosome + gc_bins = np.zeros(frags.shape[0], dtype=float) + for rec in SeqIO.parse(genome_path, "fasta"): + mask = frags[chrom_col] == rec.id + # Define coordinates of each bin + starts = frags.loc[mask, start_col] + ends = frags.loc[mask, end_col] + # Slice chromosome sequence based on bins + seqs = [str(rec.seq)[s:e] for s, e in zip(starts, ends)] + # Fill GC values for bins in the chromosome + idx = np.flatnonzero(mask) + gc_bins[idx] = np.array(list(map(SeqUtils.GC, seqs))) / 100.0 + + return gc_bins + + +def sort_pairs(in_file, out_file, keys, tmp_dir=None, threads=1, buffer="2G"): + """ + Sort a pairs file in batches using UNIX sort. + Parameters + ---------- + in_file : str + Path to the unsorted input file + out_file : str + Path to the sorted output file. + keys : list of str + list of columns to use as sort keys. Each column can be one of readID, + chr1, pos1, chr2, pos2, frag1, frag2. Key priorities are according to + the order in the list. + tmp_dir : str + Path to the directory where temporary files will be created. Defaults + to current directory. + threads : int + Number of parallel sorting threads. + buffer : str + Buffer size used for sorting. Consists of a number and a unit. + """ + # TODO: Write a pure python implementation to drop GNU coreutils depencency, + # could be inspired from: https://stackoverflow.com/q/14465154/8440675 + + # Check if UNIX sort version supports parallelism + parallel_ok = True + sort_ver = sp.Popen(["sort", "--version"], stdout=sp.PIPE) + sort_ver = ( + sort_ver.communicate()[0] + .decode() + .split("\n")[0] + .split(" ")[-1] + .split(".") + ) + # If so, specify threads, otherwise don't mention it in the command line + try: + sort_ver = list(map(int, sort_ver)) + if sort_ver[0] < 8 or (sort_ver[0] == 8 and sort_ver[1] < 23): + logger.warning( + "GNU sort version is {0} but >8.23 is required for parallel " + "sort. 
Sorting on a single thread.".format( + ".".join(map(str, sort_ver)) + ) + ) + parallel_ok = False + # BSD sort has a different format and will throw error upon parsing. It does + # not support parallel processes anyway. + except ValueError: + logger.warning( + "Using BSD sort instead of GNU sort, sorting on a single thread." + ) + parallel_ok = False + + key_map = { + "readID": "-k1,1d", + "chr1": "-k2,2V", + "pos1": "-k3,3n", + "chr2": "-k4,4V", + "pos2": "-k5,5n", + "strand1": "-k6,6d", + "strand2": "-k7,7d", + "frag1": "-k8,8n", + "frag2": "-k9,9n", + } + + # transform column names to corresponding sort keys + try: + sort_keys = map(lambda k: key_map[k], keys) + except KeyError: + print("Unkown column name.") + raise + # Rewrite header with new sorting order + header = get_pairs_header(in_file) + with open(out_file, "w") as output: + for line in header: + if line.startswith("#sorted"): + output.write("#sorted: {0}\n".format("-".join(keys))) + else: + output.write(line + "\n") + + # Sort pairs and append to file. + with open(out_file, "a") as output: + grep_proc = sp.Popen(["grep", "-v", "^#", in_file], stdout=sp.PIPE) + sort_cmd = ["sort", "-S %s" % buffer] + list(sort_keys) + if tmp_dir is not None: + sort_cmd.append("--temporary-directory={0}".format(tmp_dir)) + if parallel_ok: + sort_cmd.append("--parallel={0}".format(threads)) + sort_proc = sp.Popen(sort_cmd, stdin=grep_proc.stdout, stdout=output) + sort_proc.communicate() + + +def get_pairs_header(pairs): + r"""Retrieves the header of a .pairs file and stores lines into a list. + Parameters + ---------- + pairs : str or file object + Path to the pairs file. + Returns + ------- + header : list of str + A list of header lines found, in the same order they appear in pairs. + Examples + -------- + >>> import os + >>> from tempfile import NamedTemporaryFile + >>> p = NamedTemporaryFile('w', delete=False) + >>> p.writelines(["## pairs format v1.0\n", "#sorted: chr1-chr2\n", "abcd\n"]) + >>> p.close() + >>> h = get_pairs_header(p.name) + >>> for line in h: + ... print([line]) + ['## pairs format v1.0'] + ['#sorted: chr1-chr2'] + >>> os.unlink(p.name) + """ + # Open file if needed + with open(pairs, "r") as pairs: + # Store header lines into a list + header = [] + line = pairs.readline() + while line.startswith("#"): + header.append(line.rstrip()) + line = pairs.readline() + + return header + + +def reorder_fasta(genome, output=None, threshold=100000): + """Reorder and trim a fasta file + Sort a fasta file by record lengths, optionally trimming the smallest ones. + Parameters + ---------- + genome : str, file or pathlib.Path + The genome scaffold file (or file handle) + output : str, file or pathlib.Path + The output file to write to + threshold : int, optional + The size below which scaffolds are discarded, by default 100000 + """ + + if output is None: + output = "{}_renamed.fa".format(genome.split(".")[0]) + + handle = SeqIO.parse(genome, "fasta") + handle_to_write = sorted( + (len(u) for u in handle if len(u) > threshold), reverse=True + ) + SeqIO.write(handle_to_write, output, "fasta") + + +def rename_genome(genome, output=None, ambiguous=True): + """Rename genome and slugify headers + Rename genomes according to a simple naming scheme; this + is mainly done to avoid special character weirdness. + Parameters + ---------- + genome : file, str or pathlib.Path + The input genome to be renamed and slugify. + output : file, str or pathlib.Path + The output genome to be written into. 
Default is <base>_renamed.fa, + where <base> is genome_in without its extension. + ambiguous : bool + Whether to retain ambiguous non-N bases, otherwise rename them to Ns. + Default is True. + """ + + if output is None: + output = "{}_renamed.fa".format(genome.split(".")[0]) + + with open(output, "w") as output_handle: + for record in SeqIO.parse(genome, "fasta"): + + # Replace hyphens, tabs and whitespace with underscores + new_record_id = record.id.replace(" ", "_") + new_record_id = new_record_id.replace("-", "_") + new_record_id = new_record_id.replace("\t", "_") + + # Remove anything that's weird, i.e. not alphanumeric + # or an underscore + new_record_id = re.sub("[^_A-Za-z0-9]+", "", new_record_id) + header = ">{}\n".format(new_record_id) + + new_seq = re.sub("[^ATGCatgcNn]", "N", str(record.seq)) + + output_handle.write(header) + output_handle.write("{}\n".format(new_seq)) + + +def check_fasta_index(ref, mode="bowtie2"): + """ + Checks for the existence of a bowtie2 or bwa index based on the reference + file name. + Parameters + ---------- + ref : str + Path to the reference genome. + mode : str + The alignment software used to build the index. bowtie2 or bwa. If any + other value is given, the function returns the reference path. + Returns + ------- + index : str + The bowtie2 or bwa index basename. None if no index was found + """ + ref = pathlib.Path(ref) + if mode == "bowtie2": + # Bowtie2 should have 6 index files + bt2_idx_files = list(ref.parent.glob("{}*bt2*".format(ref.name))) + index = None if len(bt2_idx_files) < 6 else bt2_idx_files + elif mode == "bwa": + refdir = str(ref.parent) + refdir_files = os.listdir(refdir) + bwa_idx_files = [ + join(refdir, f) + for f in refdir_files + if re.search(r".*\.(sa|pac|bwt|ann|amb)$", f) + ] + index = None if len(bwa_idx_files) < 5 else bwa_idx_files + else: + index = [ref] + if index is not None: + # Convert the PosixPath objects to strings and get the longest common prefix to obtain + # index basename (without the dot) + index = os.path.commonprefix(list(map(str, index))).strip(".") + return index + + +def check_is_fasta(in_file): + """ + Checks whether input file is in fasta format. + + Parameters + ---------- + in_file : str + Path to the input file. + Returns + ------- + bool : + True if the input is in fasta format, False otherwise + """ + try: + with read_compressed(in_file) as handle: + fasta = any(SeqIO.parse(handle, "fasta")) + except FileNotFoundError: + fasta = False + + return fasta diff --git a/bin/hicstuff_log.py b/bin/hicstuff_log.py new file mode 100644 index 0000000000000000000000000000000000000000..171a5665ed1ea81371dcd39b28cc701f6cc7cb86 --- /dev/null +++ b/bin/hicstuff_log.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +"""Basic logging +Basic logging setup with three main handlers (stdout, log file and optionally +texting with the right API). By default the log file is disabled, but can be +enabled or changed using set_file_handler. 
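A minimal usage sketch (the log file name below is only an example; the module is
assumed to be imported as hicstuff_log, as the other scripts in bin/ already do):

    from hicstuff_log import logger, set_file_handler

    # Mirror INFO-and-above records to a run-specific file, in addition to stdout
    set_file_handler("hicstuff_run.log")
    logger.info("step started")
    logger.warning("something to look at")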
+""" + + +import logging +import logging.handlers +import requests +import json + +TEXT_CREDENTIALS_DEFAULT_PATH = "text_credentials.json" + +CURRENT_LOG_LEVEL = logging.INFO + + +logging.captureWarnings(True) + +logger = logging.getLogger() +logger.setLevel(CURRENT_LOG_LEVEL) + +logfile_formatter = logging.Formatter( + "%(asctime)s :: %(levelname)s :: %(message)s", datefmt="%Y-%m-%d,%H:%M:%S" +) +stdout_formatter = logging.Formatter("%(levelname)s :: %(message)s") +text_formatter = logging.Formatter("%(message)s") + +# file_handler = logging.FileHandler("hicstuff.log", "a") + +# file_handler.setLevel(logging.INFO) +# file_handler.setFormatter(logfile_formatter) +# logger.addHandler(file_handler) + +stream_handler = logging.StreamHandler() +stream_handler.setLevel(logging.DEBUG) +stream_handler.setFormatter(stdout_formatter) +logger.addHandler(stream_handler) + + +def set_file_handler(log_path, formatter=logfile_formatter): + """Change the file handler for custom log file location""" + + filehandler = logging.FileHandler(log_path, "a") + filehandler.setLevel(logging.INFO) + filehandler.setFormatter(formatter) + for hdlr in logger.handlers[:]: # remove the existing file handlers + if isinstance(hdlr, logging.FileHandler): + logger.removeHandler(hdlr) + logger.addHandler(filehandler) # set the new handler + + +def setup_text_logging(credentials=TEXT_CREDENTIALS_DEFAULT_PATH): + """Setup text logging + Setup text notifications on errors with a basic API. In order for it to + work, a file named 'text_credentials.json' must be in the directory where + a command is run. The logger performs a GET request to a text notification + API. + Parameters + ---------- + credentials : str or pathlib.Path + A JSON file with necessary credentials for the GET request + """ + + with open(credentials) as cred_handle: + cred_dict = json.load(cred_handle) + + api_service = cred_dict.pop("api_service") + + class TextHandler(logging.Handler): + def emit(self, record): + log_entry = self.format(record) + cred_dict["msg"] = log_entry + return requests.get(api_service, cred_dict) + + sms_handler = TextHandler() + sms_handler.setLevel(logging.ERROR) + sms_handler.setFormatter(text_formatter) + + logger.addHandler(sms_handler) + + +try: + setup_text_logging(TEXT_CREDENTIALS_DEFAULT_PATH) +except FileNotFoundError: + pass diff --git a/conf/base.config b/conf/base.config index 2558cb1be3b55fa76fac6e98671f945093579d44..eff5eba6b3e73735ef10d9263640a1437ad32058 100644 --- a/conf/base.config +++ b/conf/base.config @@ -40,8 +40,8 @@ process { time = { check_max( 8.h * task.attempt, 'time' ) } } withLabel:process_high { - cpus = { check_max( 12 * task.attempt, 'cpus' ) } - memory = { check_max( 64.GB * task.attempt, 'memory' ) } + cpus = { check_max( 8 * task.attempt, 'cpus' ) } //TODO go back to 16 when not local + memory = { check_max( 31.GB * task.attempt, 'memory' ) }//TODO go back to 64 when not local time = { check_max( 16.h * task.attempt, 'time' ) } } withLabel:process_long { @@ -61,3 +61,35 @@ process { cache = false } } +// Function to ensure that resource requirements don't go beyond +// a maximum limit +def check_max(obj, type) { + if (type == 'memory') { + try { + if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) + return params.max_memory as nextflow.util.MemoryUnit + else + return obj + } catch (all) { + println " ### ERROR ### Max memory '${params.max_memory}' is not valid! 
Using default value: $obj" + return obj + } + } else if (type == 'time') { + try { + if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + return params.max_time as nextflow.util.Duration + else + return obj + } catch (all) { + println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'cpus') { + try { + return Math.min( obj, params.max_cpus as int ) + } catch (all) { + println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj" + return obj + } + } +} diff --git a/conf/hicstuff.config b/conf/hicstuff.config new file mode 100644 index 0000000000000000000000000000000000000000..3f5cdf60468e3f2ce4fca93cfdc5340f7be71a4d --- /dev/null +++ b/conf/hicstuff.config @@ -0,0 +1,399 @@ +params { + + // Input options + input = null + + + // References + genome = null + igenomes_base = 's3://ngi-igenomes/igenomes' + igenomes_ignore = false + chromosome_size = null + restriction_fragments = null + save_reference = false + + // Mapping + split_fastq = false + fastq_chunks_size = 20000000 + save_interaction_bam = false + save_aligned_intermediates = false + bwt2_opts_end2end = '--very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder' + bwt2_opts_trimmed = '--very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder' + keep_dups = false + keep_multi = false + min_mapq = 10 + + // Digestion Hi-C + digestion = null + ligation_site = null + restriction_site = null + digest { + 'hindiii'{ + restriction_site='A^AGCTT' + ligation_site='AAGCTAGCTT' + } + 'mboi' { + restriction_site='^GATC' + ligation_site='GATCGATC' + } + 'dpnii' { + restriction_site='^GATC' + ligation_site='GATCGATC' + } + 'arima' { + restriction_site='^GATC,G^ANTC' + ligation_site='GATCGATC,GATCANTC,GANTGATC,GANTANTC' + } + } + + min_restriction_fragment_size = 0 + max_restriction_fragment_size = 0 + min_insert_size = 0 + max_insert_size = 0 + save_pairs_intermediates = false + + // Dnase Hi-C + dnase = false + min_cis_dist = 0 + + // Contact maps + save_raw_maps = false + bin_size = '1000000' + res_zoomify = null + hicpro_maps = false + ice_max_iter = 100 + ice_filter_low_count_perc = 0.02 + ice_filter_high_count_perc = 0 + ice_eps = 0.1 + + // Downstream Analysis + res_dist_decay = '250000' + tads_caller = 'insulation' + res_tads = '40000' + res_compartments = '250000' + + // Workflow + skip_maps = false + skip_balancing = false + skip_mcool = false + skip_dist_decay = false + skip_compartments = false + skip_tads = false + skip_multiqc = false + + // MultiQC options + multiqc_config = null + multiqc_title = null + multiqc_logo = null + max_multiqc_email_size = '25.MB' + multiqc_methods_description = null + + // Boilerplate options + outdir = '${params.outdir}' + tracedir = "${params.outdir}/pipeline_info" + publish_dir_mode = 'copy' + email = null + email_on_fail = null + plaintext_email = false + monochrome_logs = false + hook_url = null + help = false + version = false + validate_params = true + show_hidden_params = false + schema_ignore_params = 'genomes,digest' + + // Config options + custom_config_version = 'master' + custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + config_profile_description = null + config_profile_contact = null + config_profile_url = null + config_profile_name = null + + + // Max resource options + // Defaults only, expecting to be overwritten + max_memory = '128.GB' + max_cpus = 16 + max_time = '240.h' + + // 
Hicstuff + hicstuff_bwt2_align_opts = '--very-sensitive-local' + hicstuff_min_size = 0 + hicstuff_circular = 'false' + hicstuff_output_contigs = 'info_contigs.txt' + hicstuff_output_frags = 'fragments_list.txt' + hicstuff_frags_plot = 'false' + hicstuff_frags_plot_path = 'frags_hist.pdf' + hicstuff_valid_pairs = 'valid.pairs' + hicstuff_valid_idx = 'valid_idx.pairs' + hicstuff_min_qual = 30 + hicstuff_matrix = 'abs_fragments_contacts_weighted.txt' + hicstuff_bin = 10000 + hicstuff_valid_idx_filtered = 'valid_idx_filtered.pairs' + hicstuff_plot_events = 'false' + hicstuff_pie_plot = 'distrib' + hicstuff_dist_plot = 'dist' + hicstuff_centro_file = 'None' //give absolute path or 'None' + hicstuff_base = 1.1 + hicstuff_distance_out_file = 'distance_law.txt' + hicstuff_rm_centro = 'None' //int or 'None' + hicstuff_distance_plot = 'false' + hicstuff_distance_out_plot = 'plot_distance_law.pdf' + hicstuff_filter_pcr_out_file = 'valid_idx_pcrfree.pairs' + + //Hicstuff optional modules + filter_event = false + distance_law = false + filter_pcr = false + filter_pcr_picard = true +} + +process { + //Default + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + withName: 'BOWTIE2_ALIGNMENT' { + ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}" } + ext.args = params.hicstuff_bwt2_align_opts + publishDir = [ + path: { "${params.outdir}/hicstuff/reads"}, + mode: 'copy' + ] + } + + withName: 'FRAGMENT_ENZYME' { + ext.args = { [ + " -m ${params.hicstuff_min_size}", + " -c ${params.hicstuff_circular}", + " -o ${params.hicstuff_output_contigs}", + " -f ${params.hicstuff_output_frags}", + " -p ${params.hicstuff_frags_plot}", + " -g ${params.hicstuff_frags_plot_path}" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/hicstuff/fragment_enzyme" }, + mode: 'copy' + ] + } + + withName: 'BAM2PAIRS' { + ext.pairs = params.hicstuff_valid_pairs + ext.index = params.hicstuff_valid_idx + ext.args = { [ + " -q ${params.hicstuff_min_qual}", + " -c ${params.hicstuff_circular}" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/hicstuff/pairs" }, + mode: 'copy' + ] + } + + withName: 'FILTER_EVENT' { + ext.args = { [ + " -o ${params.hicstuff_valid_idx_filtered}", + " --plot ${params.hicstuff_plot_events}", + " -d ${params.hicstuff_dist_plot}", + " -q ${params.hicstuff_pie_plot}" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/hicstuff/pairs" }, + mode: 'copy' + ] + } + + withName: 'DISTANCE_LAW' { + ext.args = { [ + " -c ${params.hicstuff_centro_file}", + " -b ${params.hicstuff_base}", + " -o ${params.hicstuff_distance_out_file}", + " -u ${params.hicstuff_circular}", + " -r ${params.hicstuff_rm_centro}", + " -l ${params.hicstuff_distance_plot}", + " -d ${params.hicstuff_distance_out_plot}" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/hicstuff/pairs" }, + mode: 'copy' + ] + } + + withName: 'FILTER_PCR' { + ext.args = { [ + "-o ${params.hicstuff_filter_pcr_out_file}" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/hicstuff/pairs" }, + mode: 'copy' + ] + } + + withName: 'GATK4_MARKDUPLICATES' { + ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}.bam" } + ext.args = { [ + "--REMOVE_DUPLICATES true" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/gatk4/bam" }, + mode: 'copy' + ] + } + + withName: 'PICARD_MARKDUPLICATES' { + ext.prefix = { 
"${meta.id}_${meta.chunk}_${meta.mates}" } + ext.args = { [ + "--REMOVE_DUPLICATES true" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/picard/bam" }, + mode: 'copy' + ] + } + + withName: 'SAMTOOLS_SORT' { + ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}_sorted" } + ext.args = { [ + "" + ].join('').trim() } + } + + withName: 'SAMTOOLS_SORT_N' { + ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}_nsorted" } + ext.args = { [ + "-n" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/gatk4/bam" }, + mode: 'copy' + ] + } + + withName: 'SAMTOOLS_INDEX' { + ext.args = { [ + "" + ].join('').trim() } + } + + withName: 'BUILD_MATRIX' { + ext.args = params.hicstuff_matrix + publishDir = [ + path: { "${params.outdir}/hicstuff/matrix" }, + mode: 'copy' + ] + } + withName: 'BUILD_MATRIX_COOL' { + ext.args = params.hicstuff_matrix + publishDir = [ + path: { "${params.outdir}/hicstuff/matrix" }, + mode: 'copy' + ] + } + withName: 'BUILD_MATRIX_COOL_ALT' { + ext.outname = params.hicstuff_matrix + ext.bin = params.hicstuff_bin + ext.args = {"-c1 2 -p1 3 -p2 4 -c2 5"} + publishDir = [ + path: { "${params.outdir}/hicstuff/matrix" }, + mode: 'copy' + ] + } + + //********************************************** + // PREPARE_GENOME + withName: 'BOWTIE2_BUILD' { + publishDir = [ + path: { "${params.outdir}/genome/bowtie2_index" }, + mode: 'copy' + ] + } + + withName: 'CUSTOM_GETCHROMSIZES' { + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: 'copy' + ] + } + + withName: 'GET_RESTRICTION_FRAGMENTS' { + publishDir = [ + path: { "${params.outdir}/genome" }, + mode: 'copy' + ] + } + +} + +profiles { + debug { process.beforeScript = 'echo $HOSTNAME' } + conda { + conda.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + mamba { + conda.enabled = true + conda.useMamba = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + docker { + docker.enabled = true + docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + arm { + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } + gitpod { + executor.name = 'local' + executor.cpus = 16 + executor.memory = 60.GB + } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } +} +includeConfig 'base.config' diff --git a/conf/modules.config b/conf/modules.config index 096a86006168e216e1f68863f65e4dd2d5d96c7d..c289ffbf9f52ebf1230baa4f7fb87b25e81eb9bd 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -286,4 +286,45 @@ process { ext.args = '--correctForMultipleTesting fdr' ext.prefix = { "${cool.baseName}" } } + +// withName: 
'HIC_PLOT_MATRIX' { +// publishDir = [ +// path: { "${params.outdir}/contact_maps/cool/" }, +// mode: 'copy' +// ] +// ext.args = '--log --perChromosome' +// ext.prefix = { "${cool.baseName}" } +// } + + //******************************** + // FILTER_PCR_DUP + + withName: 'PICARD_MARKDUPLICATES' { + ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}" } + ext.args = { [ + "--REMOVE_DUPLICATES true" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/picard/bam" }, + mode: 'copy' + ] + } + + withName: 'SAMTOOLS_SORT' { + ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}_sorted" } + ext.args = { [ + "" + ].join('').trim() } + } + + withName: 'SAMTOOLS_SORT_N' { + ext.prefix = { "${meta.id}_${meta.chunk}_${meta.mates}_nsorted" } + ext.args = { [ + "-n" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/picard/bam" }, + mode: 'copy' + ] + } } diff --git a/modules/local/filterbam/main.nf b/modules/local/filterbam/main.nf new file mode 100644 index 0000000000000000000000000000000000000000..0f63be60330d928711dff04923ea7e3d562e5b76 --- /dev/null +++ b/modules/local/filterbam/main.nf @@ -0,0 +1,25 @@ +process FILTER_PAIR { + tag "$meta1.id" + label 'process_high' //medium + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta1), path(bam1), val(meta2), path(bam2) + + output: + tuple val(meta1), path("*_1paired.bam"), val(meta2), path("*_2paired.bam"), emit: bam + + script: + """ + samtools view ${bam1} | cut -f1 | sort | uniq > names1.txt + samtools view ${bam2} | cut -f1 | sort | uniq > names2.txt + cat names1.txt names2.txt| sort | uniq -c | grep " 2" | sed "s/^ 2 //g" > common_reads.txt + samtools view -h -N common_reads.txt ${bam1} -O BAM > ${meta1.id}_1paired.bam + samtools view -h -N common_reads.txt ${bam2} -O BAM > ${meta2.id}_2paired.bam + + """ +} \ No newline at end of file diff --git a/modules/local/hicexplorer/hicPlotMatrix.nf b/modules/local/hicexplorer/hicPlotMatrix.nf new file mode 100644 index 0000000000000000000000000000000000000000..8041aa1e3afe88acb2cc3ac26c9ce25b10d45d0a --- /dev/null +++ b/modules/local/hicexplorer/hicPlotMatrix.nf @@ -0,0 +1,24 @@ +process HIC_PLOT_MATRIX { + tag "$meta.id" + label 'process_single' + + conda "bioconda::hicexplorer=3.7.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/hicexplorer:3.7.2--pyhdfd78af_1' : + 'quay.io/biocontainers/hicexplorer:3.7.2--pyhdfd78af_1' }" + + + input: + tuple val(meta), path(cool) + + output: + tuple val(meta), path("*.pdf"), emit: pdf + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + hicPlotMatrix -m ${cool} -o ${prefix}_cool_log.pdf ${args} + """ +} diff --git a/modules/local/hicstuff/align_bowtie2.nf b/modules/local/hicstuff/align_bowtie2.nf new file mode 100644 index 0000000000000000000000000000000000000000..188836d9ff6cf3affda23287355eb390896c22ec --- /dev/null +++ b/modules/local/hicstuff/align_bowtie2.nf @@ -0,0 +1,31 @@ +process BOWTIE2_ALIGNMENT { + tag "$meta.id" + label "process_high" + + conda "bioconda::bowtie2=2.4.4 bioconda::samtools=1.16.1 conda-forge::pigz=2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:a0ffedb52808e102887f6ce600d092675bf3528a-0' : + 'quay.io/biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:a0ffedb52808e102887f6ce600d092675bf3528a-0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(index) + + output: + tuple val(meta), path ("*.bam"), emit: bam + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + + """ + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed "s/\\.rev.1.bt2\$//"` + [ -z "\$INDEX" ] && INDEX=`find -L ./ -name "*.rev.1.bt2l" | sed "s/\\.rev.1.bt2l\$//"` + [ -z "\$INDEX" ] && echo "Bowtie2 index files not found" 1>&2 && exit 1 + + bowtie2 $args -p $task.cpus -x \$INDEX -U $reads > ${prefix}.tmp + + samtools view -F 2048 -h -@ $task.cpus ${prefix}.tmp | samtools sort -n -@ $task.cpus -o ${prefix}.bam - + """ + +} diff --git a/modules/local/hicstuff/bam2pairs.nf b/modules/local/hicstuff/bam2pairs.nf new file mode 100644 index 0000000000000000000000000000000000000000..4b2b26ecccceba423ef20270116a433650803ac5 --- /dev/null +++ b/modules/local/hicstuff/bam2pairs.nf @@ -0,0 +1,28 @@ +process BAM2PAIRS { + tag "$meta1.id" + label 'process_high' + + conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" + container = "lbmc/hicstuff:3.1.3" + + input: + tuple val(meta1), path(bam1), val(meta2), path(bam2) + tuple val(meta), path(info_contigs) + val(digestion) + tuple val(meta), path(fasta) + + output: + tuple val(meta1), path("*_valid.pairs"), emit: valid_pairs + tuple val(meta1), path("*_valid_idx.pairs"), emit: idx_pairs + + script: + + def args = task.ext.args ?: '' + def pairs = task.ext.pairs ?: '' + def index = task.ext.index ?: '' + + """ + hicstuff_bam2pairs.py -b1 ${bam1} -b2 ${bam2} -i ${info_contigs} -e ${digestion} -f ${fasta} -o ${meta1.id}_${pairs} -x ${meta1.id}_${index} ${args} + """ + +} diff --git a/modules/local/hicstuff/build_matrix.nf b/modules/local/hicstuff/build_matrix.nf new file mode 100644 index 0000000000000000000000000000000000000000..d8ff6a6a21726028d4d1ff35fb282382f0e7864e --- /dev/null +++ b/modules/local/hicstuff/build_matrix.nf @@ -0,0 +1,24 @@ +process BUILD_MATRIX { + tag "$meta1.id" + label 'process_single' + + conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" + container = "lbmc/hicstuff:3.1.3" + + input: + tuple val(meta1), path(idx_pairs) + tuple val(meta), path(fragments_list) + + output: + tuple val(meta), path("${meta1.id}_*"), emit: matrix + + script: + + def args = task.ext.args ?: '' + + """ + hicstuff_build_matrix.py -p ${idx_pairs} -f ${fragments_list} -t graal -o ${args} + + mv ${args} ${meta1.id}_${args} + """ +} diff --git a/modules/local/hicstuff/build_matrix_cool.nf b/modules/local/hicstuff/build_matrix_cool.nf new file mode 100644 index 0000000000000000000000000000000000000000..36110614ecb9f0e1014cdf19904fe56d5f83ad0e --- /dev/null +++ b/modules/local/hicstuff/build_matrix_cool.nf @@ -0,0 +1,25 @@ +process BUILD_MATRIX_COOL { + tag "$meta1.id" + label 'process_single' + + conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" + container = "lbmc/hicstuff:3.1.3" + + input: + tuple val(meta1), path(idx_pairs) + tuple val(meta), path(fragments_list) + + output: + tuple val(meta), 
path("${meta1.id}_*.cool"), emit: matrix + + script: + + def args = task.ext.args ?: '' + def base = args.replaceFirst(/.txt/,"") + + """ + hicstuff_build_matrix.py -p ${idx_pairs} -f ${fragments_list} -t cool -o ${args} + + mv ${base}.cool ${meta1.id}_${base}.cool + """ +} diff --git a/modules/local/hicstuff/build_matrix_cool_alt.nf b/modules/local/hicstuff/build_matrix_cool_alt.nf new file mode 100644 index 0000000000000000000000000000000000000000..a52465dcbc9f76e6d4249f74d9da44ea2d810de1 --- /dev/null +++ b/modules/local/hicstuff/build_matrix_cool_alt.nf @@ -0,0 +1,28 @@ +process BUILD_MATRIX_COOL_ALT { + tag "$meta1.id" + label 'process_single' + + conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" + container = "lbmc/hicstuff:3.1.3" + + input: + tuple val(meta), path(chromosome_size) + tuple val(meta1), path(idx_pairs) + + + output: + tuple val(meta), path("${meta1.id}_*.cool"), emit: matrix + + script: + + def args = task.ext.args ?: '' + def bin = task.ext.bin.toInteger() ?: '' + def outname = task.ext.outname ?: '' + def base = outname.replaceFirst(/.txt/,"") + + """ + cooler cload pairs ${args} ${chromosome_size}:${bin} ${idx_pairs} ${base}.cool + + mv ${base}.cool ${meta1.id}_${base}_${bin}_alt.cool + """ +} diff --git a/modules/local/hicstuff/distance_law.nf b/modules/local/hicstuff/distance_law.nf new file mode 100644 index 0000000000000000000000000000000000000000..c87112ce6d3f016e319be21660186b451a184110 --- /dev/null +++ b/modules/local/hicstuff/distance_law.nf @@ -0,0 +1,23 @@ +process DISTANCE_LAW { + tag "$meta1.id" + label 'process_high' + + conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" + container = "lbmc/hicstuff:3.1.3" + + input: + tuple val(meta1), path(idx_pairs) + tuple val(meta), path(fragments_list) + + output: + path("*distance_law.txt") + path("*.pdf"), optional: true + + script: + + def args = task.ext.args ?: '' + + """ + hicstuff_distance_law.py -i ${idx_pairs} -f ${fragments_list} -p ${meta1.id} ${args} + """ +} diff --git a/modules/local/hicstuff/filter_event.nf b/modules/local/hicstuff/filter_event.nf new file mode 100644 index 0000000000000000000000000000000000000000..2fa70512ed1a815d50d61a4b8e3ba35d6eb748b6 --- /dev/null +++ b/modules/local/hicstuff/filter_event.nf @@ -0,0 +1,22 @@ +process FILTER_EVENT { + tag "$meta1.id" + label 'process_high' + + conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" + container = "lbmc/hicstuff:3.1.3" + + input: + tuple val(meta1), path(idx_pairs) + + output: + tuple val(meta1), path("*_valid_idx_filtered.pairs"), emit: idx_pairs_filtered + path("*.png"), optional: true + + script: + + def args = task.ext.args ?: '' + + """ + hicstuff_filter.py -i ${idx_pairs} -p ${meta1.id} ${args} + """ +} diff --git a/modules/local/hicstuff/filter_pcr.nf b/modules/local/hicstuff/filter_pcr.nf new file mode 100644 index 0000000000000000000000000000000000000000..57ab0f4312846356f2a5f7ce971ffa8c7511d3c9 --- /dev/null +++ b/modules/local/hicstuff/filter_pcr.nf @@ -0,0 +1,21 @@ +process FILTER_PCR { + tag "$meta1.id" + label 'process_high' + + conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" + container = "lbmc/hicstuff:3.1.3" + + input: + tuple val(meta1), path(idx_pairs) + 
+ output: + tuple val(meta1), path("*_valid_idx_pcrfree.pairs"), emit: idx_pairs_pcrfree + + script: + + def args = task.ext.args ?: '' + + """ + hicstuff_filter_pcr.py -i ${idx_pairs} -p ${meta1.id} ${args} + """ +} diff --git a/modules/local/hicstuff/fragment_enzyme.nf b/modules/local/hicstuff/fragment_enzyme.nf new file mode 100644 index 0000000000000000000000000000000000000000..b770ee4b8af0b3914b7e8541ff5002bf260fcd84 --- /dev/null +++ b/modules/local/hicstuff/fragment_enzyme.nf @@ -0,0 +1,25 @@ +process FRAGMENT_ENZYME { + tag "$digestion" + label "process_single" + + conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" + container = "lbmc/hicstuff:3.1.3" + + input: + val(digestion) + tuple val(meta), path(fasta) + + output: + tuple val(meta), path(params.hicstuff_output_contigs) , emit: info_contigs + tuple val(meta), path(params.hicstuff_output_frags), emit: fragments_list + path("*.pdf"), optional: true + + script: + + def args = task.ext.args ?: '' + + """ + hicstuff_fragments.py -e ${digestion} -i ${fasta} ${args} + """ + +} diff --git a/modules/nf-core/custom/gatk4/markduplicates/main.nf b/modules/nf-core/custom/gatk4/markduplicates/main.nf new file mode 100644 index 0000000000000000000000000000000000000000..223fa7c1a568cbcb5dfb6063acb928a9d6971ce2 --- /dev/null +++ b/modules/nf-core/custom/gatk4/markduplicates/main.nf @@ -0,0 +1,54 @@ +process GATK4_MARKDUPLICATES { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::gatk4=4.4.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'quay.io/biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(bam) + path fasta + path fasta_fai + + output: + tuple val(meta), path("*cram"), emit: cram, optional: true + tuple val(meta), path("*bam"), emit: bam, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.metrics"), emit: metrics + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def input_list = bam.collect{"--INPUT $it"}.join(' ') + def reference = fasta ? "--REFERENCE_SEQUENCE ${fasta}" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M" MarkDuplicates \\ + $input_list \\ + --OUTPUT ${prefix} \\ + --METRICS_FILE ${prefix}.metrics \\ + --TMP_DIR . 
\\ + ${reference} \\ + $args + if [[ ${prefix} == *.cram ]]&&[[ -f ${prefix}.bai ]]; then + mv ${prefix}.bai ${prefix}.crai + fi + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/nf-core/custom/gatk4/markduplicates/meta.yml b/modules/nf-core/custom/gatk4/markduplicates/meta.yml new file mode 100644 index 0000000000000000000000000000000000000000..ae7443dab6e3682b7cfeb7a03b1f3f0252c8f59d --- /dev/null +++ b/modules/nf-core/custom/gatk4/markduplicates/meta.yml @@ -0,0 +1,72 @@ +name: gatk4_markduplicates +description: This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. +keywords: + - markduplicates + - bam + - sort +tools: + - gatk4: + description: + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard- + tool_dev_url: https://github.com/broadinstitute/gatk + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM file + pattern: "*.{bam}" + - fasta: + type: file + description: Fasta file + pattern: "*.{fasta}" + - fasta_fai: + type: file + description: Fasta index file + pattern: "*.{fai}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Marked duplicates BAM file + pattern: "*.{bam}" + - cram: + type: file + description: Marked duplicates CRAM file + pattern: "*.{cram}" + - bai: + type: file + description: BAM index file + pattern: "*.{bam.bai}" + - crai: + type: file + description: CRAM index file + pattern: "*.{cram.crai}" + - metrics: + type: file + description: Duplicate metrics file generated by GATK + pattern: "*.{metrics.txt}" + +authors: + - "@ajodeh-juma" + - "@FriederikeHanssen" + - "@maxulysse" \ No newline at end of file diff --git a/modules/nf-core/custom/picard/markduplicates/main.nf b/modules/nf-core/custom/picard/markduplicates/main.nf new file mode 100644 index 0000000000000000000000000000000000000000..eef8c81601a2414f3b1b8a3d5cc7b5303f3e0b76 --- /dev/null +++ b/modules/nf-core/custom/picard/markduplicates/main.nf @@ -0,0 +1,61 @@ +process PICARD_MARKDUPLICATES { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::picard=3.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : + 'quay.io/biocontainers/picard:3.0.0--hdfd78af_1' }" + + input: + tuple val(meta), path(bam) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + + output: + tuple val(meta), path("*.bam") , emit: bam + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.metrics.txt"), emit: metrics + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def avail_mem = 3072 + if (!task.memory) { + log.info '[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + mv ${fasta} ${fasta.simpleName}.fasta + + picard \\ + -Xmx${avail_mem}M \\ + MarkDuplicates \\ + $args \\ + --INPUT $bam \\ + --OUTPUT ${prefix}.bam \\ + --REFERENCE_SEQUENCE ${fasta.simpleName}.fasta \\ + --METRICS_FILE ${prefix}.MarkDuplicates.metrics.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.bam.bai + touch ${prefix}.MarkDuplicates.metrics.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/nf-core/custom/picard/markduplicates/meta.yml b/modules/nf-core/custom/picard/markduplicates/meta.yml new file mode 100644 index 0000000000000000000000000000000000000000..e463328d9c6e50abcd466a65b9b022b881f7f1e2 --- /dev/null +++ b/modules/nf-core/custom/picard/markduplicates/meta.yml @@ -0,0 +1,71 @@ +name: picard_markduplicates +description: Locate and tag duplicate reads in a BAM file +keywords: + - markduplicates + - pcr + - duplicates + - bam + - sam + - cram +tools: + - picard: + description: | + A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) + data and formats such as SAM/BAM/CRAM and VCF. + homepage: https://broadinstitute.github.io/picard/ + documentation: https://broadinstitute.github.io/picard/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference genome fasta file + pattern: "*.{fasta,fa}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Reference genome fasta index + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file with duplicate reads marked/removed + pattern: "*.{bam}" + - bai: + type: file + description: An optional BAM index file. 
If desired, --CREATE_INDEX must be passed as a flag + pattern: "*.{bai}" + - metrics: + type: file + description: Duplicate metrics file generated by picard + pattern: "*.{metrics.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@projectoriented" + - "@ramprasadn" \ No newline at end of file diff --git a/modules/nf-core/custom/samtools/index/main.nf b/modules/nf-core/custom/samtools/index/main.nf new file mode 100644 index 0000000000000000000000000000000000000000..05fa9758e6b2f9ce5ab3e7fa2404e9b234947169 --- /dev/null +++ b/modules/nf-core/custom/samtools/index/main.nf @@ -0,0 +1,46 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/nf-core/custom/samtools/index/meta.yml b/modules/nf-core/custom/samtools/index/meta.yml new file mode 100644 index 0000000000000000000000000000000000000000..6037b9e658387998214f987017d0bc742156b553 --- /dev/null +++ b/modules/nf-core/custom/samtools/index/meta.yml @@ -0,0 +1,53 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" \ No newline at end of file diff --git a/modules/nf-core/custom/samtools/sort/main.nf b/modules/nf-core/custom/samtools/sort/main.nf new file mode 100644 index 0000000000000000000000000000000000000000..f5692575cb24910127b33e243b7cb29f7535c1c7 --- /dev/null +++ b/modules/nf-core/custom/samtools/sort/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_SORT { + tag "$meta.id" + label 'process_high' //medium + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.csi"), emit: csi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def sort_memory = (task.memory.mega/task.cpus).intValue() + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools sort \\ + $args \\ + -@ $task.cpus \\ + -m ${sort_memory}M \\ + -o ${prefix}.bam \\ + -T $prefix \\ + $bam + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/nf-core/custom/samtools/sort/meta.yml b/modules/nf-core/custom/samtools/sort/meta.yml new file mode 100644 index 0000000000000000000000000000000000000000..158785726d6436b8e5672bfbcc7ff2d948e06e97 --- /dev/null +++ b/modules/nf-core/custom/samtools/sort/meta.yml @@ -0,0 +1,48 @@ +name: samtools_sort +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" +authors: + - "@drpatelh" + - "@ewels" \ No newline at end of file diff --git a/modules/nf-core/custom/samtools_n/sort/main.nf b/modules/nf-core/custom/samtools_n/sort/main.nf new file mode 100644 index 0000000000000000000000000000000000000000..59a6e2a4e0827332cf3742d2d6e696d299fd4282 --- /dev/null +++ b/modules/nf-core/custom/samtools_n/sort/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_SORT_N { + tag "$meta.id" + label 'process_high' //medium + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'quay.io/biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.csi"), emit: csi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def sort_memory = (task.memory.mega/task.cpus).intValue() + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools sort \\ + $args \\ + -@ $task.cpus \\ + -m ${sort_memory}M \\ + -o ${prefix}.bam \\ + -T $prefix \\ + $bam + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/nf-core/custom/samtools_n/sort/meta.yml b/modules/nf-core/custom/samtools_n/sort/meta.yml new file mode 100644 index 0000000000000000000000000000000000000000..158785726d6436b8e5672bfbcc7ff2d948e06e97 --- /dev/null +++ b/modules/nf-core/custom/samtools_n/sort/meta.yml @@ -0,0 +1,48 @@ +name: samtools_sort +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" +authors: + - "@drpatelh" + - "@ewels" \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 24d1bc0e535fa7c8bc48fb93ce22e9875a12c716..8a5267e1091e54798010dcab7b6565e9e46464c8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,8 +28,9 @@ params { save_aligned_intermediates = false bwt2_opts_end2end = '--very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder' bwt2_opts_trimmed = '--very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder' - keep_dups = false + keep_dups = true keep_multi = false + filter_pcr_picard = true min_mapq = 10 // Digestion Hi-C diff --git a/subworkflows/local/filter_pcr_dup.nf b/subworkflows/local/filter_pcr_dup.nf new file mode 100644 index 0000000000000000000000000000000000000000..7fb3fa55f498e66c9ebc6cbc386946ce6e3000e9 --- /dev/null +++ b/subworkflows/local/filter_pcr_dup.nf @@ -0,0 +1,46 @@ +/* + * FILTER_PCR_DUP + * Delete PCR duplicates reads with PICARD MarkDuplicates + */ + +include { SAMTOOLS_SORT } from '../../modules/nf-core/custom/samtools/sort/main' +include { SAMTOOLS_SORT_N } from '../../modules/nf-core/custom/samtools_n/sort/main' +include { FILTER_PAIR } from '../../modules/local/filterbam/main' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/custom/samtools/index/main' +include { PICARD_MARKDUPLICATES } from '../../modules/nf-core/custom/picard/markduplicates/main' + +workflow FILTER_PCR_DUP { + + take: + bam + fasta + index + + main: + + SAMTOOLS_SORT( + bam + ) + + PICARD_MARKDUPLICATES( + SAMTOOLS_SORT.out.bam, + fasta.collect(), + index.collect() + ) + + SAMTOOLS_SORT_N( + PICARD_MARKDUPLICATES.out.bam + ) + SAMTOOLS_SORT_N.out.bam.set{ ch_bam } + + FILTER_PAIR( + ch_bam.combine(ch_bam) + .map { + meta1, bam1, meta2, bam2 -> + meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? 
[ meta1, bam1, meta2, bam2 ] : null + }) + FILTER_PAIR.out.bam.set{ new_ch_bam } + + emit: + bam = new_ch_bam +} diff --git a/subworkflows/local/hicpro.nf b/subworkflows/local/hicpro.nf index 8b106a0820cf808501add6e5ad886cc726626107..0cea0ff0cf49d7aa84a49d9b042e349ca7610d6f 100644 --- a/subworkflows/local/hicpro.nf +++ b/subworkflows/local/hicpro.nf @@ -3,7 +3,7 @@ * MAIN WORKFLOW * From the raw sequencing reads to the list of valid interactions */ - + include { HICPRO_MAPPING } from './hicpro_mapping' include { GET_VALID_INTERACTION } from '../../modules/local/hicpro/get_valid_interaction' include { GET_VALID_INTERACTION_DNASE } from '../../modules/local/hicpro/get_valid_interaction_dnase' @@ -12,121 +12,142 @@ include { MERGE_STATS } from '../../modules/local/hicpro/merge_stats' include { HICPRO2PAIRS } from '../../modules/local/hicpro/hicpro2pairs' include { BUILD_CONTACT_MAPS } from '../../modules/local/hicpro/build_contact_maps' include { ICE_NORMALIZATION } from '../../modules/local/hicpro/run_ice' +include { FILTER_PCR_DUP } from './filter_pcr_dup' // Remove meta.chunks def removeChunks(row){ - meta = row[0].clone() - meta.remove('chunk') - return [meta, row[1]] + meta = row[0].clone() + meta.remove('chunk') + return [meta, row[1]] } workflow HICPRO { - take: - reads // [meta, read1, read2] - index // path - fragments // path - chrsize // path - ligation_site // value - map_res // values - - main: - ch_versions = Channel.empty() - - // Fastq to paired-end bam - HICPRO_MAPPING( - reads, - index, - ligation_site - ) - ch_versions = ch_versions.mix(HICPRO_MAPPING.out.versions) - - //*************************************** - // DIGESTION PROTOCOLS - - if (!params.dnase){ - GET_VALID_INTERACTION ( - HICPRO_MAPPING.out.bam, - fragments.collect() + take: + reads // [meta, read1, read2] + index // path + fragments // path + chrsize // path + ligation_site // value + map_res // values + fasta + + main: + ch_versions = Channel.empty() + + // Fastq to paired-end bam + HICPRO_MAPPING( + reads, + index, + ligation_site ) - ch_versions = ch_versions.mix(GET_VALID_INTERACTION.out.versions) - ch_valid_pairs = GET_VALID_INTERACTION.out.valid_pairs - ch_valid_stats = GET_VALID_INTERACTION.out.stats - - }else{ - - //**************************************** - // DNASE-LIKE PROTOCOLS - - GET_VALID_INTERACTION_DNASE ( - HICPRO_MAPPING.out.bam + ch_versions = ch_versions.mix(HICPRO_MAPPING.out.versions) + + //*************************************** + // FILTER PCR DUPLICATES + + if (params.filter_pcr_picard && !params.keep_dups){ + error "Error: cannot filter PCR duplicates with both methods! 
If filter_pcr_picard is true, keep_dups should be true too" + } + else if (params.filter_pcr_picard){ + FILTER_PCR_DUP( + HICPRO_MAPPING.out.bam, + fasta, + index + ) + FILTER_PCR_DUP.out.bam.set{ ch_bam } + } + else { + HICPRO_MAPPING.out.bam.set{ ch_bam } + } + + + //*************************************** + // DIGESTION PROTOCOLS + + if (!params.dnase){ + GET_VALID_INTERACTION ( + ch_bam, + fragments.collect() + ) + ch_versions = ch_versions.mix(GET_VALID_INTERACTION.out.versions) + ch_valid_pairs = GET_VALID_INTERACTION.out.valid_pairs + ch_valid_stats = GET_VALID_INTERACTION.out.stats + + }else{ + + //**************************************** + // DNASE-LIKE PROTOCOLS + + GET_VALID_INTERACTION_DNASE ( + ch_bam + ) + ch_versions = ch_versions.mix(GET_VALID_INTERACTION_DNASE.out.versions) + ch_valid_pairs = GET_VALID_INTERACTION_DNASE.out.valid_pairs + ch_valid_stats = GET_VALID_INTERACTION_DNASE.out.stats + } + + + //************************************** + // MERGE AND REMOVE DUPLICATES + + //if (params.split_fastq){ + ch_valid_pairs = ch_valid_pairs.map{ it -> removeChunks(it)}.groupTuple() + ch_hicpro_stats = HICPRO_MAPPING.out.mapstats.map{it->removeChunks(it)}.groupTuple() + .concat(HICPRO_MAPPING.out.pairstats.map{it->removeChunks(it)}.groupTuple(), + ch_valid_stats.map{it->removeChunks(it)}.groupTuple()) + //}else{ + // ch_hicpro_stats = HICPRO_MAPPING.out.mapstats.groupTuple() + // .concat(HICPRO_MAPPING.out.pairstats.groupTuple(), + // ch_valid_stats.groupTuple()) + //} + + MERGE_VALID_INTERACTION ( + ch_valid_pairs ) - ch_versions = ch_versions.mix(GET_VALID_INTERACTION_DNASE.out.versions) - ch_valid_pairs = GET_VALID_INTERACTION_DNASE.out.valid_pairs - ch_valid_stats = GET_VALID_INTERACTION_DNASE.out.stats - } - - - //************************************** - // MERGE AND REMOVE DUPLICATES - - //if (params.split_fastq){ - ch_valid_pairs = ch_valid_pairs.map{ it -> removeChunks(it)}.groupTuple() - ch_hicpro_stats = HICPRO_MAPPING.out.mapstats.map{it->removeChunks(it)}.groupTuple() - .concat(HICPRO_MAPPING.out.pairstats.map{it->removeChunks(it)}.groupTuple(), - ch_valid_stats.map{it->removeChunks(it)}.groupTuple()) - //}else{ - // ch_hicpro_stats = HICPRO_MAPPING.out.mapstats.groupTuple() - // .concat(HICPRO_MAPPING.out.pairstats.groupTuple(), - // ch_valid_stats.groupTuple()) - //} - - MERGE_VALID_INTERACTION ( - ch_valid_pairs - ) - ch_versions = ch_versions.mix(MERGE_VALID_INTERACTION.out.versions) - - MERGE_STATS( - ch_hicpro_stats - ) - ch_versions = ch_versions.mix(MERGE_STATS.out.versions) - - //*************************************** - // CONVERTS TO PAIRS - HICPRO2PAIRS ( - MERGE_VALID_INTERACTION.out.valid_pairs, - chrsize.collect() - ) - ch_versions = ch_versions.mix(HICPRO2PAIRS.out.versions) - - //*************************************** - // CONTACT MAPS - - if (params.hicpro_maps){ - - //build_contact_maps - BUILD_CONTACT_MAPS( - MERGE_VALID_INTERACTION.out.valid_pairs.combine(map_res), - chrsize.collect() + ch_versions = ch_versions.mix(MERGE_VALID_INTERACTION.out.versions) + + MERGE_STATS( + ch_hicpro_stats ) - ch_hicpro_raw_maps = BUILD_CONTACT_MAPS.out.maps - - // run_ice - ICE_NORMALIZATION( - BUILD_CONTACT_MAPS.out.maps + ch_versions = ch_versions.mix(MERGE_STATS.out.versions) + + //*************************************** + // CONVERTS TO PAIRS + HICPRO2PAIRS ( + MERGE_VALID_INTERACTION.out.valid_pairs, + chrsize.collect() ) - ch_hicpro_iced_maps = ICE_NORMALIZATION.out.maps - ch_versions = ch_versions.mix(ICE_NORMALIZATION.out.versions) - - }else{ - 
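        // HiC-Pro contact maps were not requested (params.hicpro_maps is false), so
        // emit empty channels to keep the raw_maps/iced_maps workflow outputs defined.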
ch_hicpro_raw_maps = Channel.empty() - ch_hicpro_iced_maps = Channel.empty() - } - - emit: - versions = ch_versions - pairs = HICPRO2PAIRS.out.pairs - mqc = MERGE_VALID_INTERACTION.out.mqc.concat(MERGE_STATS.out.mqc) - raw_maps = ch_hicpro_raw_maps - iced_maps = ch_hicpro_iced_maps + ch_versions = ch_versions.mix(HICPRO2PAIRS.out.versions) + + //*************************************** + // CONTACT MAPS + + if (params.hicpro_maps){ + + //build_contact_maps + BUILD_CONTACT_MAPS( + MERGE_VALID_INTERACTION.out.valid_pairs.combine(map_res), + chrsize.collect() + ) + ch_hicpro_raw_maps = BUILD_CONTACT_MAPS.out.maps + + // run_ice + ICE_NORMALIZATION( + BUILD_CONTACT_MAPS.out.maps + ) + ch_hicpro_iced_maps = ICE_NORMALIZATION.out.maps + ch_versions = ch_versions.mix(ICE_NORMALIZATION.out.versions) + + }else{ + ch_hicpro_raw_maps = Channel.empty() + ch_hicpro_iced_maps = Channel.empty() + } + + emit: + versions = ch_versions + pairs = HICPRO2PAIRS.out.pairs + mqc = MERGE_VALID_INTERACTION.out.mqc.concat(MERGE_STATS.out.mqc) + raw_maps = ch_hicpro_raw_maps + iced_maps = ch_hicpro_iced_maps } diff --git a/subworkflows/local/hicstuff_sub.nf b/subworkflows/local/hicstuff_sub.nf new file mode 100644 index 0000000000000000000000000000000000000000..9bb3406bbc21a298d1919402dd4d6bdcbdd0690d --- /dev/null +++ b/subworkflows/local/hicstuff_sub.nf @@ -0,0 +1,157 @@ + +include { BOWTIE2_ALIGNMENT } from '../../modules/local/hicstuff/align_bowtie2' +include { FRAGMENT_ENZYME } from '../../modules/local/hicstuff/fragment_enzyme' +include { BAM2PAIRS } from '../../modules/local/hicstuff/bam2pairs' +include { BUILD_MATRIX } from '../../modules/local/hicstuff/build_matrix' +include { BUILD_MATRIX_COOL } from '../../modules/local/hicstuff/build_matrix_cool' +include { BUILD_MATRIX_COOL_ALT } from '../../modules/local/hicstuff/build_matrix_cool_alt' +include { FILTER_EVENT } from '../../modules/local/hicstuff/filter_event' +include { DISTANCE_LAW } from '../../modules/local/hicstuff/distance_law' +include { FILTER_PCR } from '../../modules/local/hicstuff/filter_pcr' +include { GATK4_MARKDUPLICATES } from '../../modules/nf-core/custom/gatk4/markduplicates/main' +include { SAMTOOLS_SORT } from '../../modules/nf-core/custom/samtools/sort/main' +include { SAMTOOLS_SORT_N } from '../../modules/nf-core/custom/samtools_n/sort/main' +include { FILTER_PAIR } from '../../modules/local/filterbam/main' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/custom/samtools/index/main' +include { PICARD_MARKDUPLICATES } from '../../modules/nf-core/custom/picard/markduplicates/main' + +// Paired-end to Single-end +def pairToSingle(row, mates) { + def meta = row[0].clone() + meta.single_end = true + meta.mates = mates + if (mates == "R1") { + return [meta, [ row[1][0]] ] + }else if (mates == "R2"){ + return [meta, [ row[1][1]] ] + } +} + +// Single-end to Paired-end +def singleToPair(row){ + def meta = row[0].clone() + meta.remove('mates') + meta.single_end = false + return [ meta, row[1] ] +} + +workflow HICSTUFF_SUB { + + take: + reads + index + ligation_site + digestion + fasta + chromosome_size + + main: + + // Align each mates separetly and add mates information in [meta] + ch_reads_r1 = reads.map{ it -> pairToSingle(it,"R1") } + ch_reads_r2 = reads.map{ it -> pairToSingle(it,"R2") } + ch_reads = ch_reads_r1.concat(ch_reads_r2) + + BOWTIE2_ALIGNMENT( + ch_reads, + index.collect() + ) + + FRAGMENT_ENZYME( + digestion, + fasta + ) + + if (params.filter_pcr && params.filter_pcr_picard ){ + error "Error: filter_pcr and 
filter_pcr_picard can't both be true! Set one of them to false in the config file" + } + else if (params.filter_pcr_picard){ + SAMTOOLS_SORT( + BOWTIE2_ALIGNMENT.out.bam + ) + + + PICARD_MARKDUPLICATES( + SAMTOOLS_SORT.out.bam, + fasta.collect(), + index.collect() + ) + + SAMTOOLS_SORT_N( + PICARD_MARKDUPLICATES.out.bam + ) + + SAMTOOLS_SORT_N.out.bam.set{ ch_bam } + + FILTER_PAIR( + ch_bam.combine(ch_bam) + .map { + meta1, bam1, meta2, bam2 -> + meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? [ meta1, bam1, meta2, bam2 ] : null + }) + + FILTER_PAIR.out.bam.set{ new_ch_bam } + + } + else { + + BOWTIE2_ALIGNMENT.out.bam.set{ ch_bam } + + ch_bam.combine(ch_bam) + .map { + meta1, bam1, meta2, bam2 -> + meta1.id == meta2.id && meta1.chunk == meta2.chunk && meta1.mates == "R1" && meta2.mates == "R2" ? [ meta1, bam1, meta2, bam2 ] : null + }.set{ new_ch_bam } + + } + + BAM2PAIRS( + new_ch_bam, + FRAGMENT_ENZYME.out.info_contigs.collect(), + digestion, + fasta.collect() + ) + BAM2PAIRS.out.idx_pairs.set{ ch_idx_pairs } + + if( params.filter_event ){ + FILTER_EVENT( + ch_idx_pairs + ) + FILTER_EVENT.out.idx_pairs_filtered.set{ ch_idx_pairs } + } + + if ( params.distance_law ){ + DISTANCE_LAW( + ch_idx_pairs, + FRAGMENT_ENZYME.out.fragments_list.collect() + ) + } + + if (params.filter_pcr && params.filter_pcr_picard ){ + error "Error: filter_pcr and filter_pcr_picard can't both be true! Set one of them to false in the config file" + } + else if ( params.filter_pcr ){ + FILTER_PCR( + ch_idx_pairs + ) + FILTER_PCR.out.idx_pairs_pcrfree.set{ ch_idx_pairs } + } + + //TODO add filters + distance law + PCR filters as options + // for the PCR filter, use either hicstuff or PICARD + + BUILD_MATRIX( + ch_idx_pairs, + FRAGMENT_ENZYME.out.fragments_list.collect() + ) + + BUILD_MATRIX_COOL( + ch_idx_pairs, + FRAGMENT_ENZYME.out.fragments_list.collect() + ) + + BUILD_MATRIX_COOL_ALT( + chromosome_size.collect(), + ch_idx_pairs + ) +} diff --git a/test.nf b/test.nf new file mode 100644 index 0000000000000000000000000000000000000000..b960e23168f81ba91048847de68419281d891640 --- /dev/null +++ b/test.nf @@ -0,0 +1,55 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + GENOME PARAMETER VALUES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') +params.bwt2_index = WorkflowMain.getGenomeAttribute(params, 'bowtie2') + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + VALIDATE & PRINT PARAMETER SUMMARY +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +WorkflowMain.initialise(workflow, params, log) + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + NAMED WORKFLOW FOR PIPELINE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { HICSTUFF } from './workflows/hicstuff' + +// +// WORKFLOW: Run the hicstuff analysis workflow +// +workflow TEST_HICSTUFF { + HICSTUFF () +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN ALL WORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// WORKFLOW: Execute a single named workflow for the 
pipeline +// See: https://github.com/nf-core/rnaseq/issues/619 +// +workflow { + TEST_HICSTUFF () +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ diff --git a/workflows/hic.nf b/workflows/hic.nf index 2ffa5b4dc8bee033473cab5b755befe9460ca0a9..1b6b81e1b63dc902004d2bc3f6ed80905360ff89 100644 --- a/workflows/hic.nf +++ b/workflows/hic.nf @@ -54,7 +54,7 @@ if (params.res_tads && !params.skip_tads){ }else{ ch_tads_res=Channel.empty() if (!params.skip_tads){ - log.warn "[nf-core/hic] Hi-C resolution for TADs calling not specified. See --res_tads" + log.warn "[nf-core/hic] Hi-C resolution for TADs calling not specified. See --res_tads" } } @@ -64,7 +64,7 @@ if (params.res_dist_decay && !params.skip_dist_decay){ }else{ ch_ddecay_res = Channel.empty() if (!params.skip_dist_decay){ - log.warn "[nf-core/hic] Hi-C resolution for distance decay not specified. See --res_dist_decay" + log.warn "[nf-core/hic] Hi-C resolution for distance decay not specified. See --res_dist_decay" } } @@ -74,7 +74,7 @@ if (params.res_compartments && !params.skip_compartments){ }else{ ch_comp_res = Channel.empty() if (!params.skip_compartments){ - log.warn "[nf-core/hic] Hi-C resolution for compartment calling not specified. See --res_compartments" + log.warn "[nf-core/hic] Hi-C resolution for compartment calling not specified. See --res_compartments" } } @@ -99,7 +99,7 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // MODULE: Local to the pipeline // -include { HIC_PLOT_DIST_VS_COUNTS } from '../modules/local/hicexplorer/hicPlotDistVsCounts' +include { HIC_PLOT_DIST_VS_COUNTS } from '../modules/local/hicexplorer/hicPlotDistVsCounts' include { MULTIQC } from '../modules/local/multiqc' // @@ -182,7 +182,8 @@ workflow HIC { PREPARE_GENOME.out.res_frag, PREPARE_GENOME.out.chromosome_size, ch_ligation_site, - ch_map_res + ch_map_res, + ch_fasta ) ch_versions = ch_versions.mix(HICPRO.out.versions) @@ -239,7 +240,7 @@ workflow HIC { .filter{ it[0].resolution == it[2] } .map { it -> [it[0], it[1]]} .set{ ch_cool_tads } - + TADS( ch_cool_tads ) diff --git a/workflows/hicstuff.nf b/workflows/hicstuff.nf new file mode 100644 index 0000000000000000000000000000000000000000..8263bd3db82361774f74c84884e373cdc8d51a7c --- /dev/null +++ b/workflows/hicstuff.nf @@ -0,0 +1,45 @@ +def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) +// Validate input parameters +WorkflowHic.initialise(params, log) +// Check mandatory parameters +if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +// Digestion parameters +if (params.digestion){ + restriction_site = params.digestion ? params.digest[ params.digestion ].restriction_site ?: false : false + ch_restriction_site = Channel.value(restriction_site) //str: restriction sequence, with ^ marking the cut site + ligation_site = params.digestion ? params.digest[ params.digestion ].ligation_site ?: false : false + ch_ligation_site = Channel.value(ligation_site) //str: ligation sequence +}else if (params.dnase){ + ch_restriction_site = Channel.empty() + ch_ligation_site = Channel.empty() +}else{ + exit 1, "Ligation motif not found. Please either use the `--digestion` parameter or specify the `--restriction_site` and `--ligation_site`. 
For DNase Hi-C, please use the '--dnase' option" +} + +Channel.fromPath( params.fasta ) + .ifEmpty { exit 1, "Genome fasta file not found: ${params.fasta}" } + .map{it->[[:],it]} + .set { ch_fasta } + + +include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' +include { HICSTUFF_SUB } from '../subworkflows/local/hicstuff_sub' + +workflow HICSTUFF { + INPUT_CHECK ( + ch_input + ) + PREPARE_GENOME( + ch_fasta, + ch_restriction_site + ) + HICSTUFF_SUB( + INPUT_CHECK.out.reads, + PREPARE_GENOME.out.index, + ch_ligation_site, + params.digestion, //str: enzyme name + ch_fasta, + PREPARE_GENOME.out.chromosome_size + ) +}
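Editor's note on the read-channel handling in subworkflows/local/hicstuff_sub.nf: the snippet below is a minimal, standalone Groovy sketch of the meta-map round trip performed by the pairToSingle and singleToPair helpers. The helper bodies mirror the diff above; the sample meta map and FASTQ file names are hypothetical and only illustrate the expected tuple shape.

```groovy
// Standalone sketch of the meta-map round trip used in hicstuff_sub.nf.
// The meta map and FASTQ names below are invented example values.

def pairToSingle(row, mates) {
    def meta = row[0].clone()          // copy so the paired-end meta is not mutated
    meta.single_end = true
    meta.mates = mates
    if (mates == "R1") {
        return [meta, [row[1][0]]]     // keep only the forward reads
    } else if (mates == "R2") {
        return [meta, [row[1][1]]]     // keep only the reverse reads
    }
}

def singleToPair(row) {
    def meta = row[0].clone()
    meta.remove('mates')               // drop the single-end bookkeeping again
    meta.single_end = false
    return [meta, row[1]]
}

// One element as INPUT_CHECK.out.reads might emit it (hypothetical values)
def row = [[id: 'sample1', single_end: false], ['sample1_R1.fastq.gz', 'sample1_R2.fastq.gz']]

assert pairToSingle(row, 'R1') == [[id: 'sample1', single_end: true, mates: 'R1'], ['sample1_R1.fastq.gz']]
assert singleToPair(pairToSingle(row, 'R1')) == [[id: 'sample1', single_end: false], ['sample1_R1.fastq.gz']]
println 'meta round trip behaves as expected'
```

Running this with plain `groovy` shows that splitting a pair into R1/R2 single-end entries and converting one back restores the original meta map, which is what allows the R1 and R2 BAMs to be re-matched later by id, chunk and mates.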