diff --git a/bin/hicstuff b/bin/hicstuff
new file mode 120000
index 0000000000000000000000000000000000000000..156df240f8aab8ee35206570c438a0461cbd76c8
--- /dev/null
+++ b/bin/hicstuff
@@ -0,0 +1 @@
+hicstuff_repo/hicstuff
\ No newline at end of file
diff --git a/bin/hicstuff_bam2pairs.py b/bin/hicstuff_bam2pairs.py
index 744b5875d390a681cfc2bce8417ab7d0198ea608..83d5c9702fe04601322aaf772031cfe98c1227d2 100755
--- a/bin/hicstuff_bam2pairs.py
+++ b/bin/hicstuff_bam2pairs.py
@@ -5,14 +5,14 @@ import argparse
 import pysam as ps
 import pandas as pd
 import itertools
-from hicstuff_log import logger
-import hicstuff_log as hcl
-import hicstuff_io as hio
-import hicstuff_digest as hcd
-import hicstuff_stats as hcs
+from hicstuff.log import logger
+import hicstuff.log as hcl
+import hicstuff.io as hio
+import hicstuff.digest as hcd
+import hicstuff.stats as hcs
 from Bio import SeqIO
 import logging
-from hicstuff_version import __version__
+from hicstuff.version import __version__
 
 
 
diff --git a/bin/hicstuff_build_matrix.py b/bin/hicstuff_build_matrix.py
index f99700bbc648ced284c5e64a499d4eecd74e8cba..fef5f0a4a54cc20ebb7e6fed0216482697ca9f5e 100755
--- a/bin/hicstuff_build_matrix.py
+++ b/bin/hicstuff_build_matrix.py
@@ -7,11 +7,11 @@ import pandas as pd
 import shutil as st
 import subprocess as sp
 import itertools
-from hicstuff_log import logger
-import hicstuff_io as hio
-import hicstuff_digest as hcd
+from hicstuff.log import logger
+import hicstuff.io as hio
+import hicstuff.digest as hcd
 from Bio import SeqIO
-import hicstuff_log as hcl
+import hicstuff.log as hcl
 
 
 def pairs2matrix(
diff --git a/bin/hicstuff_cutsite.py b/bin/hicstuff_cutsite.py
index 058c907d82041b4d67c752841d25d1e422d40dc0..fa3faa2a74ccd718773ebdcc6b10c2a94c574910 100755
--- a/bin/hicstuff_cutsite.py
+++ b/bin/hicstuff_cutsite.py
@@ -28,359 +28,9 @@ import pyfastx
 import re
 import sys
 import argparse
-import hicstuff_digest as hcd
-from hicstuff_log import logger
-
-
-def cut_ligation_sites(
-    fq_for, fq_rev, digest_for, digest_rev, enzyme, mode, seed_size, n_cpu
-):
-    """Create new reads to manage pairs with a digestion and create multiple
-    pairs to take into account all the contact present.
-
-    The function write two files for both the forward and reverse fastq with the
-    new reads. The new reads have at the end of the ID ":[0-9]" added to
-    differentiate the different pairs created from one read.
-
-    The function will look for all the sites present and create new pairs of
-    reads according to the mode given to retreive as much as possible of the HiC
-    signal.
-
-    Parameters
-    ----------
-    fq_for : str
-        Path to the forward fastq file to digest.
-    fq_rev : str
-        Path to the reverse fatsq file to digest.
-    digest_for : str
-        Path to the output digested forward fatsq file to write.
-    digest_rev : str
-        Path to the output digested reverse fatsq file to write.
-    enzyme : str
-        The list of restriction enzyme used to digest the genome separated by a
-        comma. Example: HpaII,MluCI.
-    mode : str
-        Mode to use to make the digestion. Three values possible: "all",
-        "for_vs_rev", "pile".
-    seed_size : int
-        Minimum size of a fragment (i.e. seed size used in mapping as reads
-        smaller won't be mapped.)
-    n_cpu : int
-        Number of CPUs.
- """ - # Process the ligation sites given - ligation_sites = hcd.gen_enzyme_religation_regex(enzyme) - - # Defined stop_token which is used to mark the end of input file - stop_token = "STOP" - # A stack is a string cointaining multiple read pairs - max_stack_size = 1000 - - # Create count to have an idea of the digested pairs repartition. - original_number_of_pairs = 0 - final_number_of_pairs = 0 - new_reads_for = "" - new_reads_rev = "" - current_stack = 0 - - # Start parallel threading to compute the - # ctx = multiprocessing.get_context("spawn") - queue = multiprocessing.Queue(max(1, n_cpu - 1)) - writer_process = multiprocessing.Process( - target=_writer, args=(digest_for, digest_rev, queue, stop_token) - ) - writer_process.start() - - # Iterate on all pairs - for read_for, read_rev in zip( - pyfastx.Fastq(fq_for, build_index=False), - pyfastx.Fastq(fq_rev, build_index=False), - ): - - # Count the numbers of original reads processed. - original_number_of_pairs += 1 - - # Count for stack size. - current_stack += 1 - - # Extract components of the reads. - for_name, for_seq, for_qual = read_for - rev_name, rev_seq, rev_qual = read_rev - - # Sanity check to be sure all reads are with their mate. - if for_name != rev_name: - logger.error( - "The fastq files contains reads not sorted :\n{0}\n{1}".format( - for_name, rev_name - ) - ) - sys.exit(1) - - # Cut the forward and reverse reads at the ligation sites. - for_seq_list, for_qual_list = cutsite_read( - ligation_sites, - for_seq, - for_qual, - seed_size, - ) - rev_seq_list, rev_qual_list = cutsite_read( - ligation_sites, - rev_seq, - rev_qual, - seed_size, - ) - - # Write the new combinations of fragments. - new_reads_for, new_reads_rev, final_number_of_pairs = write_pair( - new_reads_for, - new_reads_rev, - for_name, - for_seq_list, - for_qual_list, - rev_seq_list, - rev_qual_list, - mode, - final_number_of_pairs, - ) - - # If stack full, add it in the queue. - if current_stack == max_stack_size: - - # Add the pair in the queue. - pairs = (new_reads_for.encode(), new_reads_rev.encode()) - queue.put(pairs) - - # Empty the stack - current_stack = 0 - new_reads_for = "" - new_reads_rev = "" - - # End the parallel processing. - pairs = (new_reads_for.encode(), new_reads_rev.encode()) - queue.put(pairs) - queue.put(stop_token) - writer_process.join() - - # Return information on the different pairs created - logger.info(f"Library used: {fq_for} - {fq_rev}") - logger.info(f"Number of pairs before digestion: {original_number_of_pairs}") - logger.info(f"Number of pairs after digestion: {final_number_of_pairs}") - - -def cutsite_read(ligation_sites, seq, qual, seed_size=0): - """Find ligation sites in a given sequence and return list of the fragmented - sequence and quality. - - Parameters: - ----------- - ligation_sites : re.Pattern - Regex of all possible ligations according to the given enzymes. - seq : str - Sequence where to search for ligation_sites. - qual : str - Quality values of the sequence given. - seed_size : int - Minimum size of a fragment (i.e. seed size used in mapping as reads - smaller won't be mapped.) - - Returns: - -------- - list of str - List of cut sequences. The split is made 4 bases after the start of - the ligation site. - list of str - List of string of the qualities. - - Examples: - --------- - >>> cutsite_read(re.compile(r'GA.TA.TC'), "AAGAGTATTC", "FFF--FAFAF") - (['AAGAGT', 'ATTC'], ['FFF--F', 'AFAF']) - """ - - # Find the ligation sites. 
- ligation_sites_list = [] - if ligation_sites.search(seq): - ligation_sites_list = [ - site.start() for site in ligation_sites.finditer(seq) - ] - ligation_sites_list.append(len(seq)) - - # Split the sequences on the ligation sites. - seq_list = [] - qual_list = [] - left_site = 0 - for right_site in ligation_sites_list: - if right_site + 4 - left_site > seed_size: - seq_list.append(seq[left_site : right_site + 4]) - qual_list.append(qual[left_site : right_site + 4]) - left_site = right_site + 4 - - return seq_list, qual_list - - -def write_pair( - new_reads_for, - new_reads_rev, - name, - for_seq_list, - for_qual_list, - rev_seq_list, - rev_qual_list, - mode, - final_number_of_pairs, -): - """Function to write one pair with the combinations of fragment depending on - the chosen mode. - - Parameters: - ----------- - new_reads_for : str - Stack of the new forward reads ready to be written. - new_reads_rev : str - Stack of the new reverse reads ready to be written. - name : str - Name of the fastq read. - for_seq_list : list - List of forward sequences of the fastq read. - for_qual_list : list - List of forward qualities of the fastq read. - rev_seq_list : list - List of reverse sequencs of the fastq read. - rev_qual_list : list - List of reverse qualities of the fastq read. - mode : str - Mode to use to make the digestion. Three values possible: "all", - "for_vs_rev", "pile". - final_numbers_of_pairs : int - Count of pairs after cutting. - - Returns: - -------- - str - Stack of forward reads ready to be written with the last pairs added. - str - Stack of reverse reads ready to be written with the last pairs added. - int - Count of pairs after cutting. - """ - - # Mode "for_vs_rev": Make contacts only between fragments from different - # reads (one fragment from forward and one from reverse). - if mode == "for_vs_rev": - for i in range(len(for_seq_list)): - for j in range(len(rev_seq_list)): - final_number_of_pairs += 1 - new_reads_for += "@%s\n%s\n+\n%s\n" % ( - name + ":" + str(i) + str(j), - for_seq_list[i], - for_qual_list[i], - ) - new_reads_rev += "@%s\n%s\n+\n%s\n" % ( - name + ":" + str(i) + str(j), - rev_seq_list[j], - rev_qual_list[j], - ) - - # Mode "all": Make all the possible contacts between the fragments. - elif mode == "all": - seq_list = for_seq_list + rev_seq_list - qual_list = for_qual_list + rev_qual_list - for i in range(len(seq_list)): - for j in range(i + 1, len(seq_list)): - final_number_of_pairs += 1 - if i < len(for_seq_list): - new_reads_for += "@%s\n%s\n+\n%s\n" % ( - name + ":" + str(i) + str(j), - seq_list[i], - qual_list[i], - ) - # Reverse the forward read if comes from reverse. - else: - new_reads_for += "@%s\n%s\n+\n%s\n" % ( - name + ":" + str(i) + str(j), - seq_list[i][::-1], - qual_list[i][::-1], - ) - # Reverse the reverse read if comes from forward. - if j < len(for_seq_list): - new_reads_rev += "@%s\n%s\n+\n%s\n" % ( - name + ":" + str(i) + str(j), - seq_list[j][::-1], - qual_list[j][::-1], - ) - else: - new_reads_rev += "@%s\n%s\n+\n%s\n" % ( - name + ":" + str(i) + str(j), - seq_list[j], - qual_list[j], - ) - - # Mode "pile": Only make contacts bewteen two adjacent fragments. 
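To make the three modes documented in write_pair concrete, here is a small hypothetical helper (not part of hicstuff) that lists which fragment index pairs each mode would emit for a read pair split into two forward and two reverse fragments.

from itertools import combinations, product

def mode_pairs(n_for, n_rev, mode):
    """Index pairs (i, j) emitted for n_for forward and n_rev reverse fragments."""
    if mode == "for_vs_rev":          # one fragment from each mate
        return list(product(range(n_for), range(n_for, n_for + n_rev)))
    if mode == "all":                 # every combination of two fragments
        return list(combinations(range(n_for + n_rev), 2))
    if mode == "pile":                # only adjacent fragments
        return [(i, i + 1) for i in range(n_for + n_rev - 1)]
    raise ValueError(mode)

# Two forward and two reverse fragments (indices 0-1 and 2-3):
print(mode_pairs(2, 2, "for_vs_rev"))  # [(0, 2), (0, 3), (1, 2), (1, 3)]
print(mode_pairs(2, 2, "all"))         # all 6 combinations of two fragments
print(mode_pairs(2, 2, "pile"))        # [(0, 1), (1, 2), (2, 3)]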
- elif mode == "pile": - seq_list = for_seq_list + rev_seq_list - qual_list = for_qual_list + rev_qual_list - for i in range(len(seq_list) - 1): - final_number_of_pairs += 1 - if i < len(for_seq_list) - 1: - new_reads_for += "@%s\n%s\n+\n%s\n" % ( - name + ":" + str(i) + str(i + 1), - seq_list[i], - qual_list[i], - ) - new_reads_rev += "@%s\n%s\n+\n%s\n" % ( - name + ":" + str(i) + str(i + 1), - seq_list[i + 1][::-1], - qual_list[i + 1][::-1], - ) - elif i == len(for_seq_list) - 1: - new_reads_for += "@%s\n%s\n+\n%s\n" % ( - name + ":" + str(i) + str(i + 1), - seq_list[i], - qual_list[i], - ) - new_reads_rev += "@%s\n%s\n+\n%s\n" % ( - name + ":" + str(i) + str(i + 1), - seq_list[i + 1], - qual_list[i + 1], - ) - else: - new_reads_for += "@%s\n%s\n+\n%s\n" % ( - name + ":" + str(i) + str(i + 1), - seq_list[i][::-1], - qual_list[i][::-1], - ) - new_reads_rev += "@%s\n%s\n+\n%s\n" % ( - name + ":" + str(i) + str(i + 1), - seq_list[i + 1], - qual_list[i + 1], - ) - - return new_reads_for, new_reads_rev, final_number_of_pairs - - -def _writer(output_for, output_rev, queue, stop_token): - """Function to write the pair throw parallel processing. - - Parameters: - ----------- - output_for : str - Path to the output forward compressed fastq file. - output_rev : str - Path to the output reverse compressed fastq file. - queue : multiprocessing.queues.Queue - Queue for the multiprocesing of the whole file. - stop_token : str - Token to signal that the end of the file have been reached. - """ - with gzip.open(output_for, "wb") as for_fq, gzip.open( - output_rev, "wb" - ) as rev_fq: - while True: - line = queue.get() - if line == stop_token: - return 0 - for_fq.write(line[0]) - rev_fq.write(line[1]) +import hicstuff.digest as hcd +from hicstuff.log import logger +import hicstuff.cutsite as hc if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -416,4 +66,4 @@ if __name__ == "__main__": elif enzyme == "arima": enzyme = "DpnII,HinfI" - cut_ligation_sites(reads1, reads2, out1, out2, enzyme, mode, seed_size, n_cpu) + hc.cut_ligation_sites(reads1, reads2, out1, out2, enzyme, mode, seed_size, n_cpu) diff --git a/bin/hicstuff_digest.py b/bin/hicstuff_digest.py deleted file mode 100644 index 3fba3da20ddfd4d500c00cc5b01141b8f355f410..0000000000000000000000000000000000000000 --- a/bin/hicstuff_digest.py +++ /dev/null @@ -1,489 +0,0 @@ -#!/usr/bin/env python3 -# coding: utf-8 - -"""Genome digestion -Functions used to write auxiliary instagraal compatible -sparse matrices. 
-""" - -from Bio import SeqIO, SeqUtils -from Bio.Seq import Seq -from Bio.Restriction import RestrictionBatch, Analysis -import os, sys, csv -import re -import collections -import copy -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from hicstuff_log import logger -import hicstuff_io as hio - -DEFAULT_FRAGMENTS_LIST_FILE_NAME = "fragments_list.txt" -DEFAULT_INFO_CONTIGS_FILE_NAME = "info_contigs.txt" -DEFAULT_SPARSE_MATRIX_FILE_NAME = "abs_fragments_contacts_weighted.txt" -DEFAULT_KB_BINNING = 1 -DEFAULT_THRESHOLD_SIZE = 0 -# Most used enzyme for eukaryotes -DEFAULT_ENZYME = "DpnII" -# If using evenly-sized chunks instead of restriction -# enzymes, they shouldn't be too short -DEFAULT_MIN_CHUNK_SIZE = 50 - - -def write_frag_info( - fasta, - enzyme, - min_size=DEFAULT_THRESHOLD_SIZE, - circular=False, - output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME, - output_frags=DEFAULT_FRAGMENTS_LIST_FILE_NAME, - output_dir=None, -): - """Digest and write fragment information - Write the fragments_list.txt and info_contigs.txt that are necessary for - instagraal to run. - Parameters - ---------- - fasta : pathlib.Path or str - The path to the reference genome - enzyme : str, int or list of str - If a string, must be the name of an enzyme (e.g. DpnII) and the genome - will be cut at the enzyme's restriction sites. If a number, the genome - will be cut uniformly into chunks with length equal to that number. A - list of enzymes can also be specified if using multiple enzymes. - min_size : float, optional - Size below which shorter contigs are discarded. Default is 0, i.e. all - contigs are retained. - circular : bool, optional - Whether the genome is circular. Default is False. - output_contigs : str, optional - The name of the file with contig info. Default is info_contigs.txt - output_frags : str, optional - The name of the file with fragment info. Default is fragments_list.txt - output_dir : [type], optional - The path to the output directory, which will be created if not already - existing. Default is the current directory. 
- """ - - records = SeqIO.parse(hio.read_compressed(fasta), "fasta") - - try: - info_contigs_path = os.path.join(output_dir, output_contigs) - frag_list_path = os.path.join(output_dir, output_frags) - except TypeError: - info_contigs_path = output_contigs - frag_list_path = output_frags - - with open(info_contigs_path, "w") as info_contigs: - - info_contigs.write("contig\tlength\tn_frags\tcumul_length\n") - - with open(frag_list_path, "w") as fragments_list: - - fragments_list.write( - "id\tchrom\tstart_pos" "\tend_pos\tsize\tgc_content\n" - ) - - total_frags = 0 - - for record in records: - contig_seq = record.seq - contig_name = record.id - contig_length = len(contig_seq) - if contig_length < int(min_size): - continue - - sites = get_restriction_table( - contig_seq, enzyme, circular=circular - ) - fragments = ( - contig_seq[sites[i] : sites[i + 1]] - for i in range(len(sites) - 1) - ) - n_frags = 0 - - current_id = 1 - start_pos = 0 - for frag in fragments: - frag_length = len(frag) - if frag_length > 0: - end_pos = start_pos + frag_length - gc_content = SeqUtils.GC(frag) / 100.0 - - current_fragment_line = "%s\t%s\t%s\t%s\t%s\t%s\n" % ( - current_id, - contig_name, - start_pos, - end_pos, - frag_length, - gc_content, - ) - - fragments_list.write(current_fragment_line) - - try: - assert (current_id == 1 and start_pos == 0) or ( - current_id > 1 and start_pos > 0 - ) - except AssertionError: - logger.error((current_id, start_pos)) - raise - start_pos = end_pos - current_id += 1 - n_frags += 1 - - current_contig_line = "%s\t%s\t%s\t%s\n" % ( - contig_name, - contig_length, - n_frags, - total_frags, - ) - total_frags += n_frags - info_contigs.write(current_contig_line) - - -def attribute_fragments(pairs_file, idx_pairs_file, restriction_table): - """ - Writes the indexed pairs file, which has two more columns than the input - pairs file corresponding to the restriction fragment index of each read. - Note that pairs files have 1bp point positions whereas restriction table - has 0bp point poisitions. - Parameters - ---------- - pairs_file: str - Path the the input pairs file. Consists of 7 tab-separated - columns: readID, chr1, pos1, chr2, pos2, strand1, strand2 - idx_pairs_file: str - Path to the output indexed pairs file. Consists of 9 white space - separated columns: readID, chr1, pos1, chr2, pos2, strand1, strand2, - frag1, frag2. frag1 and frag2 are 0-based restriction fragments based - on whole genome. - restriction_table: dict - Dictionary with chromosome identifiers (str) as keys and list of - positions (int) of restriction sites as values. - """ - - # NOTE: Bottlenecks here are 1. binary search in find_frag and 2. writerow - # 1. could be reduced by searching groups of N frags in parallel and 2. by - # writing N frags simultaneously using a single call of writerows. 
- - # Parse and update header section - pairs_header = hio.get_pairs_header(pairs_file) - header_size = len(pairs_header) - chrom_order = [] - with open(idx_pairs_file, "w") as idx_pairs: - for line in pairs_header: - # Add new column names to header - if line.startswith("#columns"): - line = line.rstrip() + " frag1 frag2" - if line.startswith("#chromsize"): - chrom_order.append(line.split()[1]) - idx_pairs.write(line + "\n") - - # Get number of fragments per chrom to allow genome-based indices - shift_frags = {} - prev_frags = 0 - for rank, chrom in enumerate(chrom_order): - if rank > 0: - # Note the "-1" because there are nfrags + 1 sites in rest table - prev_frags += len(restriction_table[chrom_order[rank - 1]]) - 1 - # Idx of each chrom's frags will be shifted by n frags in previous chroms - shift_frags[chrom] = prev_frags - - missing_contigs = set() - # Attribute pairs to fragments and append them to output file (after header) - with open(pairs_file, "r") as pairs, open( - idx_pairs_file, "a" - ) as idx_pairs: - # Skip header lines - for _ in range(header_size): - next(pairs) - - # Define input and output fields - pairs_cols = [ - "readID", - "chr1", - "pos1", - "chr2", - "pos2", - "strand1", - "strand2", - ] - idx_cols = pairs_cols + ["frag1", "frag2"] - - # Use csv reader / writer to automatically parse columns into a dict - pairs_reader = csv.DictReader( - pairs, fieldnames=pairs_cols, delimiter="\t" - ) - pairs_writer = csv.DictWriter( - idx_pairs, fieldnames=idx_cols, delimiter="\t" - ) - - for pair in pairs_reader: - # Get the 0-based indices of corresponding restriction fragments - # Deducing 1 from pair position to get it into 0bp point - pair["frag1"] = find_frag( - int(pair["pos1"]) - 1, restriction_table[pair["chr1"]] - ) - pair["frag2"] = find_frag( - int(pair["pos2"]) - 1, restriction_table[pair["chr2"]] - ) - # Shift fragment indices to make them genome-based instead of - # chromosome-based - try: - pair["frag1"] += shift_frags[pair["chr1"]] - except KeyError: - missing_contigs.add(pair["chr1"]) - try: - pair["frag2"] += shift_frags[pair["chr2"]] - except KeyError: - missing_contigs.add(pair["chr2"]) - - # Write indexed pairs in the new file - pairs_writer.writerow(pair) - - if missing_contigs: - logger.warning( - "Pairs on the following contigs were discarded as " - "those contigs are not listed in the paris file header. " - "This is normal if you filtered out small contigs: %s" - % " ".join(list(missing_contigs)) - ) - - -def get_restriction_table(seq, enzyme, circular=False): - """ - Get the restriction table for a single genomic sequence. - Parameters - ---------- - seq : Seq object - A biopython Seq object representing a chromosomes or contig. - enzyme : int, str or list of str - The name of the restriction enzyme used, or a list of restriction - enzyme names. Can also be an integer, to digest by fixed chunk size. - circular : bool - Wether the genome is circular. - Returns - ------- - numpy.array: - List of restriction fragment boundary positions for the input sequence. - >>> from Bio.Seq import Seq - >>> get_restriction_table(Seq("AAGCCGGATCGG"),"HpaII") - array([ 0, 4, 12]) - >>> get_restriction_table(Seq("AA"),["HpaII", "MluCI"]) - array([0, 2]) - >>> get_restriction_table(Seq("AA"),"aeiou1") - Traceback (most recent call last): - ... - ValueError: aeiou1 is not a valid restriction enzyme. - >>> get_restriction_table("AA","HpaII") - Traceback (most recent call last): - ... 
- TypeError: Expected Seq or MutableSeq instance, got <class 'str'> instead - """ - chrom_len = len(seq) - wrong_enzyme = "{} is not a valid restriction enzyme.".format(enzyme) - # Restriction batch containing the restriction enzyme - try: - enz = [enzyme] if isinstance(enzyme, str) else enzyme - cutter = RestrictionBatch(enz) - except (TypeError, ValueError): - try: - cutter = max(int(enzyme), DEFAULT_MIN_CHUNK_SIZE) - except ValueError: - raise ValueError(wrong_enzyme) - - # Conversion from string type to restriction type - if isinstance(cutter, int): - sites = [i for i in range(0, chrom_len, cutter)] - if sites[-1] < chrom_len: - sites.append(chrom_len) - else: - # Find sites of all restriction enzymes given - ana = Analysis(cutter, seq, linear=not circular) - sites = ana.full() - # Gets all sites into a single flat list with 0-based index - sites = [site - 1 for enz in sites.values() for site in enz] - # Sort by position and allow first add start and end of seq - sites.sort() - sites.insert(0, 0) - sites.append(chrom_len) - - return np.array(sites) - - -def find_frag(pos, r_sites): - """ - Use binary search to find the index of a chromosome restriction fragment - corresponding to an input genomic position. - Parameters - ---------- - pos : int - Genomic position, in base pairs. - r_sites : list - List of genomic positions corresponding to restriction sites. - Returns - ------- - int - The 0-based index of the restriction fragment to which the position belongs. - >>> find_frag(15, [0, 20, 30]) - 0 - >>> find_frag(15, [10, 20, 30]) - Traceback (most recent call last): - ... - ValueError: The first position in the restriction table is not 0. - >>> find_frag(31, [0, 20, 30]) - Traceback (most recent call last): - ... - ValueError: Read position is larger than last entry in restriction table. - """ - if r_sites[0] != 0: - raise ValueError( - "The first position in the restriction table is not 0." - ) - if pos > r_sites[-1]: - raise ValueError( - "Read position is larger than last entry in restriction table." - ) - # binary search for the index of the read - index = max(np.searchsorted(r_sites, pos, side="right") - 1, 0) - # Last site = end of the chrom, index of last fragment is last site - 1 - index = min(len(r_sites) - 2, index) - - return index - - -def frag_len( - frags_file_name=DEFAULT_FRAGMENTS_LIST_FILE_NAME, - output_dir=None, - plot=False, - fig_path=None, -): - """ - logs summary statistics of fragment length distribution based on an - input fragment file. Can optionally show a histogram instead - of text summary. - Parameters - ---------- - frags_file_name : str - Path to the output list of fragments. - output_dir : str - Directory where the list should be saved. - plot : bool - Wether a histogram of fragment length should be shown. - fig_path : str - If a path is given, the figure will be saved instead of shown. 
- """ - - try: - frag_list_path = os.path.join(output_dir, frags_file_name) - except TypeError: - frag_list_path = frags_file_name - frags = pd.read_csv(frag_list_path, sep="\t") - nfrags = frags.shape[0] - med_len = frags["size"].median() - nbins = 40 - if plot: - fig, ax = plt.subplots() - _, _, _ = ax.hist(frags["size"], bins=nbins) - - ax.set_xlabel("Fragment length [bp]") - ax.set_ylabel("Log10 number of fragments") - ax.set_title("Distribution of restriction fragment length") - ax.set_yscale("log", base=10) - ax.annotate( - "Total fragments: {}".format(nfrags), - xy=(0.95, 0.95), - xycoords="axes fraction", - fontsize=12, - horizontalalignment="right", - verticalalignment="top", - ) - ax.annotate( - "Median length: {}".format(med_len), - xy=(0.95, 0.90), - xycoords="axes fraction", - fontsize=12, - horizontalalignment="right", - verticalalignment="top", - ) - # Tweak spacing to prevent clipping of ylabel - fig.tight_layout() - if fig_path: - plt.savefig(fig_path) - else: - plt.show() - plt.clf() - else: - logger.info( - "Genome digested into {0} fragments with a median " - "length of {1}".format(nfrags, med_len) - ) - - -def gen_enzyme_religation_regex(enzyme): - """Return a regex which corresponds to all possible religation sites given a - set of enzyme. - Parameters: - ----------- - enzyme : str - String that contains the names of the enzyme separated by a comma. - Returns: - -------- - re.Pattern : - Regex that corresponds to all possible ligation sites given a set of - enzyme. - Examples: - --------- - >>> gen_enzyme_religation_regex('HpaII') - re.compile('CCGCGG') - >>> gen_enzyme_religation_regex('HpaII,MluCI') - re.compile('AATTAATT|AATTCGG|CCGAATT|CCGCGG') - """ - - # Split the str on the comma to separate the different enzymes. - enzyme = enzyme.split(",") - - # Check on Biopython dictionnary the enzyme. - rb = RestrictionBatch(enzyme) - - # Initiation: - give_list = [] - accept_list = [] - ligation_list = [] - - # Iterates on the enzymes. - for enz in rb: - - # Extract restriction sites and look for cut sites. - site = enz.elucidate() - fw_cut = site.find("^") - rev_cut = site.find("_") - - # Process "give" site. Remove N on the left (useless). - give_site = site[:rev_cut].replace("^", "") - while give_site[0] == "N": - give_site = give_site[1:] - give_list.append(give_site) - - # Process "accept" site. Remove N on the rigth (useless). - accept_site = site[fw_cut + 1 :].replace("_", "") - while accept_site[-1] == "N": - accept_site = accept_site[:-1] - accept_list.append(accept_site) - - # Iterates on the two list to build all the possible HiC ligation sites. - for give_site in give_list: - for accept_site in accept_list: - # Replace "N" by "." for regex searching of the sites - ligation_list.append((give_site + accept_site).replace("N", ".")) - ligation_list.append( - str(Seq(give_site + accept_site).reverse_complement()).replace( - "N", "." - ) - ) - - # Build the regex for any ligation sites. 
- pattern = "|".join(sorted(list(set(ligation_list)))) - return re.compile(pattern) diff --git a/bin/hicstuff_distance_law.py b/bin/hicstuff_distance_law.py index 545c3f7e1e0929d070a850ed239aab90d585f8dc..8b49d6c61921013abcffb8bf6d2384cfc4e0ce8d 100755 --- a/bin/hicstuff_distance_law.py +++ b/bin/hicstuff_distance_law.py @@ -10,770 +10,10 @@ from matplotlib import cm import pandas as pd import os as os import csv as csv -import hicstuff_io as hio -from hicstuff_log import logger +import hicstuff.io as hio +from hicstuff.log import logger import argparse - -def export_distance_law(xs, ps, names, out_file=None): - """ Export the x(s) and p(s) from two list of numpy.ndarrays to a table - in txt file with three columns separated by a tabulation. The first column - contains the x(s), the second the p(s) and the third the name of the arm or - chromosome. The file is created in the directory given by outdir or the - current file if no file is given. - Parameters - ---------- - xs : list of numpy.ndarray - The list of the start position of logbins of each p(s) in base pairs. - ps : list of numpy.ndarray - The list of p(s). - names : list of string - List containing the names of the chromosomes/arms/conditions of the p(s) - values given. - out_file : str or None - Path where output file should be written. ./distance_law.txt by default. - Return - ------ - txt file: - File with three columns separated by a tabulation. The first column - contains the x(s), the second the p(s) and the third the name of the arm - or chromosome. The file is creates in the output file given or the - default one if none given. - """ - # ./distance_law.txt as out_file if no out_file is given. - if out_file is None: - out_file = os.getcwd() + "/distance_law.txt" - # Sanity check: as many chromosomes/arms as ps - if len(xs) != len(names): - logger.error("Number of chromosomes/arms and number of p(s) list differ.") - sys.exit(1) - # Create the file and write it - f = open(out_file, "w") - for i in range(len(xs)): - for j in range(len(xs[i])): - line = ( - str(format(xs[i][j], "g")) - + "\t" - + str(format(ps[i][j], "g")) - + "\t" - + names[i] - + "\n" - ) - f.write(line) - f.close() - - -def import_distance_law(distance_law_file): - """ Import the table created by export_distance_law and return the list of - x(s) and p(s) in the order of the chromosomes. - Parameters - ---------- - distance_law_file : string - Path to the file containing three columns : the x(s), the p(s), and the - chromosome/arm name. - Returns - ------- - list of numpy.ndarray : - The start coordinate of each bin one array per chromosome or arm. - list of numpy.ndarray : - The distance law probabilities corresponding of the bins of the - previous list. - list of numpy.ndarray : - The names of the arms/chromosomes corresponding to the previous - list. 
- """ - file = pd.read_csv( - distance_law_file, - sep="\t", - header=None, - dtype={"a": np.int32, "b": np.float32, "c": str}, - ) - names_idx = np.unique(file.iloc[:, 2], return_index=True)[1] - names = [file.iloc[:, 2][index] for index in sorted(names_idx)] - xs = [None] * len(names) - ps = [None] * len(names) - labels = [None] * len(names) - for i in range(len(names)): - subfile = file[file.iloc[:, 2] == names[i]] - xs[i] = np.array(subfile.iloc[:, 0]) - ps[i] = np.array(subfile.iloc[:, 1]) - labels[i] = np.array(subfile.iloc[:, 2]) - return xs, ps, labels - - -def get_chr_segment_bins_index(fragments, centro_file=None, rm_centro=0): - """Get the index positions of the start and end bins of different - chromosomes, or arms if the centromers position have been given from the - fragments file made by hicstuff. - - Parameters - ---------- - fragments : pandas.DataFrame - Table containing in the first coulum the ID of the fragment, in the - second the names of the chromosome in the third and fourth the start - position and the end position of the fragment. The file have no header. - (File like the 'fragments_list.txt' from hicstuff) - centro_file : None or str - None or path to a file with the genomic positions of the centromers - sorted as the chromosomes separated by a space. The file have only one - line. - rm_centro : int - If a value is given, will remove the contacts close the centromeres. - It will remove as many kb as the argument given. Default is zero. - - Returns - ------- - list of floats : - The start and end indices of chromosomes/arms to compute the distance - law on each chromosome/arm separately. - """ - # Get bins where chromosomes start - chr_start_bins = np.where(fragments == 0)[0] - # Create a list of same length for the end of the bins - chr_end_bins = np.zeros(len(chr_start_bins)) - # Get bins where chromsomes end - for i in range(len(chr_start_bins) - 1): - chr_end_bins[i] = chr_start_bins[i + 1] - chr_end_bins[-1] = len(fragments.iloc[:, 0]) - # Combine start and end of bins in a single array. Values are the id of the - # bins - chr_segment_bins = np.sort(np.concatenate((chr_start_bins, chr_end_bins))) - if centro_file is not None: - # Read the file of the centromers - with open(centro_file, "r", newline="") as centro: - centro = csv.reader(centro, delimiter=" ") - centro_pos = next(centro) - # Sanity check: as many chroms as centromeres - if len(chr_start_bins) != len(centro_pos): - logger.warning( - "Number of chromosomes and centromeres differ, centromeres position are not taking into account." - ) - centro_file = None - if centro_file is not None: - # Get bins of centromeres - centro_bins = np.zeros(2 * len(centro_pos)) - for i in range(len(chr_start_bins)): - if (i + 1) < len(chr_start_bins): - subfrags = fragments[chr_start_bins[i] : chr_start_bins[i + 1]] - else: - subfrags = fragments[chr_start_bins[i] :] - # index of last fragment starting before centro in same chrom - centro_bins[2 * i] = chr_start_bins[i] + max( - np.where( - subfrags["start_pos"][:] // (int(centro_pos[i]) - rm_centro) == 0 - )[0] - ) - centro_bins[2 * i + 1] = chr_start_bins[i] + max( - np.where( - subfrags["start_pos"][:] // (int(centro_pos[i]) + rm_centro) == 0 - )[0] - ) - # Combine centro and chrom bins into a single array. Values are the id - # of the bins started and ending the arms. 
- chr_segment_bins = np.sort( - np.concatenate((chr_start_bins, chr_end_bins, centro_bins)) - ) - return list(chr_segment_bins) - - -def get_chr_segment_length(fragments, chr_segment_bins): - """Compute a list of the length of the different objects (arm or - chromosome) given by chr_segment_bins. - - Parameters - ---------- - fragments : pandas.DataFrame - Table containing in the first coulum the ID of the fragment, in the - second the names of the chromosome in the third and fourth the start - position and the end position of the fragment. The file have no header. - (File like the 'fragments_list.txt' from hicstuff) - chr_segment_bins : list of floats - The start and end indices of chromosomes/arms to compute the distance - law on each chromosome/arm separately. - - Returns - ------- - list of numpy.ndarray: - The length in base pairs of each chromosome or arm. - """ - chr_segment_length = [None] * int(len(chr_segment_bins) / 2) - # Iterate in chr_segment_bins in order to obtain the size of each chromosome/arm - for i in range(len(chr_segment_length)): - # Obtain the size of the chromosome/arm, the if loop is to avoid the - # case of arms where the end position of the last fragments doesn't - # mean the size of arm. If it's the right arm we have to start to count the - # size from the beginning of the arm. - if fragments["start_pos"].iloc[int(chr_segment_bins[2 * i])] == 0: - n = fragments["end_pos"].iloc[int(chr_segment_bins[2 * i + 1]) - 1] - else: - n = ( - fragments["end_pos"].iloc[int(chr_segment_bins[2 * i + 1]) - 1] - - fragments["start_pos"].iloc[int(chr_segment_bins[2 * i])] - ) - chr_segment_length[i] = n - return chr_segment_length - - -def logbins_xs(fragments, chr_segment_length, base=1.1, circular=False): - """Compute the logbins of each chromosome/arm in order to have theme to - compute distance law. At the end you will have bins of increasing with a - logspace with the base of the value given in base. - - Parameters - ---------- - fragments : pandas.DataFrame - Table containing in the first coulum the ID of the fragment, in the - second the names of the chromosome in the third and fourth the start - position and the end position of the fragment. The file have no header. - (File like the 'fragments_list.txt' from hicstuff) - chr_segment_length: list of floats - List of the size in base pairs of the different arms or chromosomes. - base : float - Base use to construct the logspace of the bins, 1.1 by default. - circular : bool - If True, calculate the distance as the chromosome is circular. Default - value is False. - - Returns - ------- - list of numpy.ndarray : - The start coordinate of each bin one array per chromosome or arm. - """ - # Create the xs array and a list of the length of the chromosomes/arms - xs = [None] * len(chr_segment_length) - # Iterate in chr_segment_bins in order to make the logspace - for i in range(len(chr_segment_length)): - n = chr_segment_length[i] - # if the chromosome is circular the mawimum distance between two reads - # are divided by two - if circular: - n /= 2 - n_bins = int(np.log(n) / np.log(base)) - # For each chromosome/arm compute a logspace to have the logbin - # equivalent to the size of the arms and increasing size of bins - xs[i] = np.unique( - np.logspace(0, n_bins, num=n_bins + 1, base=base, dtype=int) - ) - return xs - - -def circular_distance_law(distance, chr_segment_length, chr_bin): - """Recalculate the distance to return the distance in a circular chromosome - and not the distance between the two genomic positions. 
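The geometric binning performed by logbins_xs above can be reduced to a few lines; logbins below is a hypothetical condensation with the same base-1.1 default, not the hicstuff function itself.

import numpy as np

def logbins(segment_length_bp, base=1.1, circular=False):
    # Bin edges grow geometrically; a circular segment only needs half its length.
    n = segment_length_bp / 2 if circular else segment_length_bp
    n_bins = int(np.log(n) / np.log(base))
    return np.unique(np.logspace(0, n_bins, num=n_bins + 1, base=base, dtype=int))

print(logbins(10000)[:8])   # [1 2 3 4 5 6 7 8]: early bins are 1 bp wide
print(logbins(10000)[-3:])  # the last bins approach the segment length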
- Parameters - ---------- - chr_segment_bins : list of floats - The start and end indices of chromosomes/arms to compute the distance - law on each chromosome/arm separately. - chr_segment_length: list of floats - List of the size in base pairs of the different arms or chromosomes. - distance : int - Distance between two fragments with a contact. - Returns - ------- - int : - The real distance in the chromosome circular and not the distance - between two genomic positions - Examples - -------- - >>> circular_distance_law(7500, [2800, 9000], 1) - 1500 - >>> circular_distance_law(1300, [2800, 9000], 0) - 1300 - >>> circular_distance_law(1400, [2800, 9000], 0) - 1400 - """ - chr_len = chr_segment_length[chr_bin] - if distance > chr_len / 2: - distance = chr_len - distance - return distance - - -def get_pairs_distance( - line, fragments, chr_segment_bins, chr_segment_length, xs, ps, circular=False -): - """From a line of a pair reads file, filter -/+ or +/- reads, keep only the - reads in the same chromosome/arm and compute the distance of the the two - fragments. It modify the input ps in order to count or not the line given. - It will add one in the logbin corresponding to the distance. - Parameters - ---------- - line : OrderedDict - Line of a pair reads file with the these keys readID, chr1, pos1, chr2, - pos2, strand1, strand2, frag1, frag2. The values are in a dictionnary. - fragments : pandas.DataFrame - Table containing in the first coulum the ID of the fragment, in the - second the names of the chromosome in the third and fourth the start - position and the end position of the fragment. The file have no header. - (File like the 'fragments_list.txt' from hicstuff) - chr_segment_bins : list of floats - The start and end indices of chromosomes/arms to compute the distance - law on each chromosome/arm separately. - chr_segment_length: list of floats - List of the size in base pairs of the different arms or chromosomes. - xs : list of lists - The start coordinate of each bin one array per chromosome or arm. - ps : list of lists - The sum of contact already count. xs and ps should have the same - dimensions. - circular : bool - If True, calculate the distance as the chromosome is circular. Default - value is False. - """ - # Check this is a pairs_idx file and not simple pairs - if line['frag1'] is None: - logger.error( - 'Input pairs file must have frag1 and frag2 columns. In hicstuff ' - 'pipeline, this is the "valid_idx.pairs" file.' - ) - # We only keep the event +/+ or -/-. This is done to avoid to have any - # event of uncut which are not possible in these events. We can remove the - # good events of +/- or -/+ because we don't need a lot of reads to compute - # the distance law and if we eliminate these reads we do not create others - # biases as they should have the same distribution. - if line["strand1"] == line["strand2"]: - # Find in which chromosome/arm are the fragment 1 and 2. - chr_bin1 = ( - np.searchsorted(chr_segment_bins, int(line["frag1"]), side="right") - 1 - ) - chr_bin2 = ( - np.searchsorted(chr_segment_bins, int(line["frag2"]), side="right") - 1 - ) - # We only keep the reads with the two fragments in the same chromosome - # or arm. - if chr_bin1 == chr_bin2: - # Remove the contacts in the centromeres if centro_remove - if chr_bin1 % 2 == 0: - chr_bin1 = int(chr_bin1 / 2) - # For the reads -/-, the fragments should be religated with both - # their start position (position in the left on the genomic - # sequence, 5'). For the reads +/+ it's the contrary. 
We compute - # the distance as the distance between the two extremities which - # are religated. - if line["strand1"] == "-": - distance = abs( - np.array(fragments["start_pos"][int(line["frag1"])]) - - np.array(fragments["start_pos"][int(line["frag2"])]) - ) - if line["strand1"] == "+": - distance = abs( - np.array(fragments["end_pos"][int(line["frag1"])]) - - np.array(fragments["end_pos"][int(line["frag2"])]) - ) - if circular: - distance = circular_distance_law( - distance, chr_segment_length, chr_bin1 - ) - xs_temp = xs[chr_bin1][:] - # Find the logbins in which the distance is and add one to the sum - # of contact. - ps_indice = np.searchsorted(xs_temp, distance, side="right") - 1 - ps[chr_bin1][ps_indice] += 1 - - -def get_names(fragments, chr_segment_bins): - """Make a list of the names of the arms or the chromosomes. - Parameters - ---------- - fragments : pandas.DataFrame - Table containing in the first coulum the ID of the fragment, in the - second the names of the chromosome in the third and fourth the start - position and the end position of the fragment. The file have no header. - (File like the 'fragments_list.txt' from hicstuff) - chr_segment_bins : list of floats - The start position of chromosomes/arms to compute the distance law on - each chromosome/arm separately. - Returns - ------- - list of floats : - List of the labels given to the curves. It will be the name of the arms - or chromosomes. - """ - # Get the name of the chromosomes. - chr_names_idx = np.unique(fragments.iloc[:, 1], return_index=True)[1] - chr_names = [fragments.iloc[:, 1][index] for index in sorted(chr_names_idx)] - # Case where they are separate in chromosomes - if len(chr_segment_bins) / 2 != len(chr_names): - names = [] - for chr in chr_names: - names.append(str(chr) + "_left") - names.append(str(chr) + "_rigth") - chr_names = names - return chr_names - - -def get_distance_law( - pairs_reads_file, - fragments_file, - centro_file=None, - base=1.1, - out_file=None, - circular=False, - rm_centro=0, -): - """Compute distance law as a function of the genomic coordinate aka P(s). - Bin length increases exponentially with distance. Works on pairs file - format from 4D Nucleome Omics Data Standards Working Group. If the genome - is composed of several chromosomes and you want to compute the arms - separately, provide a file with the positions of centromers. Create a file - with three coulumns separated by a tabulation. The first column contains - the xs, the second the ps and the third the name of the arm or chromosome. - The file is create in the directory given in outdir or in the current - directory if no directory given. - Parameters - ---------- - pairs_reads_file : string - Path of a pairs file format from 4D Nucleome Omics Data Standards - Working Group with the 8th and 9th coulumns are the ID of the fragments - of the reads 1 and 2. - fragments_file : path - Path of a table containing in the first column the ID of the fragment, - in the second the names of the chromosome in the third and fourth - the start position and the end position of the fragment. The file have - no header. (File like the 'fragments_list.txt' from hicstuff) - centro_file : None or str - None or path to a file with the genomic positions of the centromers - sorted as the chromosomes separated by a space. The file have only one - line. - base : float - Base use to construct the logspace of the bins - 1.1 by default. - out_file : None or str - Path of the output file. If no path given, the output is returned. 
- circular : bool - If True, calculate the distance as the chromosome is circular. Default - value is False. Cannot be True if centro_file is not None - rm_centro : int - If a value is given, will remove the contacts close the centromeres. - It will remove as many kb as the argument given. Default is None. - Returns - ------- - xs : list of numpy.ndarray - Basepair coordinates of log bins used to compute distance law. - ps : list of numpy.ndarray - Contacts value, in arbitrary units, at increasingly long genomic ranges - given by xs. - names : list of strings - Names of chromosomes that are plotted - """ - # Sanity check : centro_fileition should be None if chromosomes are - # circulars (no centromeres is circular chromosomes). - if circular and centro_file != None: - logger.error("Chromosomes cannot have a centromere and be circular") - sys.exit(1) - # Import third columns of fragments file - fragments = pd.read_csv(fragments_file, sep="\t", header=0, usecols=[0, 1, 2, 3]) - # Calculate the indice of the bins to separate into chromosomes/arms - chr_segment_bins = get_chr_segment_bins_index(fragments, centro_file, rm_centro) - # Calculate the length of each chromosoms/arms - chr_segment_length = get_chr_segment_length(fragments, chr_segment_bins) - xs = logbins_xs(fragments, chr_segment_length, base, circular) - # Create the list of p(s) with one array for each chromosome/arm and each - # array contain as many values as in the logbin - ps = [None] * len(chr_segment_length) - for i in range(len(xs)): - ps[i] = [0] * len(xs[i]) - # Read the pair reads file - with open(pairs_reads_file, "r", newline="") as reads: - # Remove the line of the header - header_length = len(hio.get_pairs_header(pairs_reads_file)) - for i in range(header_length): - next(reads) - # Reads all the others lines and put the values in a dictionnary with - # the keys : 'readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', - # 'strand2', 'frag1', 'frag2' - reader = csv.DictReader( - reads, - fieldnames=[ - "readID", - "chr1", - "pos1", - "chr2", - "pos2", - "strand1", - "strand2", - "frag1", - "frag2", - ], - delimiter="\t", - ) - for line in reader: - # Iterate in each line of the file after the header - get_pairs_distance( - line, fragments, chr_segment_bins, chr_segment_length, xs, ps, circular - ) - # Divide the number of contacts by the area of the logbin - for i in range(len(xs)): - n = chr_segment_length[i] - for j in range(len(xs[i]) - 1): - # Use the area of a trapezium to know the area of the logbin with n - # the size of the matrix. - ps[i][j] /= ((2 * n - xs[i][j + 1] - xs[i][j]) / 2) * ( - (1 / np.sqrt(2)) * (xs[i][j + 1] - xs[i][j]) - ) - # print( - # ((2 * n - xs[i][j + 1] - xs[i][j]) / 2) - # * ((1 / np.sqrt(2)) * (xs[i][j + 1] - xs[i][j])) - # ) - # Case of the last logbin which is an isosceles rectangle triangle - # print(ps[i][-5:-1], ((n - xs[i][-1]) ** 2) / 2) - ps[i][-1] /= ((n - xs[i][-1]) ** 2) / 2 - names = get_names(fragments, chr_segment_bins) - if out_file: - export_distance_law(xs, ps, names, out_file) - return xs, ps, names - - -def normalize_distance_law(xs, ps, inf=3000, sup=None): - """Normalize the distance in order to have the sum of the ps values between - 'inf' (default value is 3kb) until the end of the array equal to one and - limit the effect of coverage between two conditions/chromosomes/arms when - you compare them together. If we have a list of ps, it will normalize until - the length of the shorter object or the value of sup, whichever is smaller. 
- Parameters - ---------- - xs : list of numpy.ndarray - list of logbins corresponding to the ps. - ps : list of numpy.ndarray - Average ps or list of ps of the chromosomes/arms. xs and ps have to - have the same shape. - inf : integer - Inferior value of the interval on which, the normalization is applied. - sup : integer - Superior value of the interval on which, the normalization is applied. - Returns - ------- - list of numpy.ndarray : - List of ps each normalized separately. - """ - # Sanity check: xs and ps have the same dimension - if np.shape(xs) != np.shape(ps): - logger.error("xs and ps should have the same dimension.") - sys.exit(1) - # Define the length of shortest chromosomes as a lower bound for the sup boundary - min_xs = len(min(xs, key=len)) - normed_ps = [None] * len(ps) - if sup is None: - sup = np.inf - for chrom_id, chrom_ps in enumerate(ps): - # Iterate on the different ps to normalize each of theme separately - chrom_sum = 0 - # Change the last value to have something continuous because the last - # one is much bigger (computed on matrix corner = triangle instead of trapezoid). - chrom_ps[-1] = chrom_ps[-2] - for bin_id, bin_value in enumerate(chrom_ps): - # Compute normalization factor based on values between inf and sup - # Sup will be whatever is smaller between user-provided sup and length of - # the shortest chromosome - if (xs[chrom_id][bin_id] > inf) and (xs[chrom_id][bin_id] < sup) and (bin_id < min_xs): - chrom_sum += bin_value - if chrom_sum == 0: - chrom_sum += 1 - logger.warning("No values of p(s) in one segment") - # Make the normalisation - normed_ps[chrom_id] = np.array(ps[chrom_id]) / chrom_sum - return normed_ps - - -def average_distance_law(xs, ps, sup, big_arm_only=False): - """Compute the average distance law between the file the different distance - law of the chromosomes/arms. - Parameters - ---------- - xs : list of numpy.ndarray - The list of logbins. - ps : list of lists of floats - The list of numpy.ndarray. - sup : int - Value given to set the minimum size of the chromosomes/arms to make the - average. - big_arm_only : bool - By default False. If True, will only take into account the arms/chromosomes - longer than the value of sup. Sup mandatory if set. - Returns - ------- - numpy.ndarray : - List of the xs with the max length. - numpy.ndarray : - List of the average_ps. - """ - # Find longest chromosome / arm and make two arrays of this length for the - # average distance law and remove the last value. - xs = max(xs, key=len) - max_length = len(xs) - ps_values = np.zeros(max_length) - ps_occur = np.zeros(max_length) - for chrom_ps in ps: - # Iterate on ps in order to calculate the number of occurences (all the - # chromossomes/arms are not as long as the longest one) and the sum of - # the values of distance law. - # Change the last value to have something continuous because the last - # one is much bigger. - chrom_ps[-1] = chrom_ps[-2] - # Sanity check : sup strictly inferior to maw length arms. 
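The normalization described in normalize_distance_law above amounts to rescaling each curve by its summed signal inside the [inf, sup] window; normalize_ps below is a simplified, hypothetical version that skips the shortest-chromosome bound and the last-bin correction.

import numpy as np

def normalize_ps(xs, ps, inf=3000, sup=np.inf):
    # Scale each curve so that the summed signal between inf and sup is 1.
    normed = []
    for x, p in zip(xs, ps):
        p = np.asarray(p, dtype=float)
        mask = (x > inf) & (x < sup)
        total = p[mask].sum() or 1.0
        normed.append(p / total)
    return normed

xs = [np.array([1000, 2000, 4000, 8000])]
ps = [np.array([8.0, 4.0, 2.0, 1.0])]
print(normalize_ps(xs, ps))  # [array([2.667, 1.333, 0.667, 0.333])] (approximately)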
- if big_arm_only: - if sup >= xs[-1]: - logger.error( - "sup have to be inferior to the max length of arms/chromsomes if big arm only set" - ) - sys.exit(1) - if sup <= xs[len(chrom_ps) - 1]: - ps_occur[: len(chrom_ps)] += 1 - ps_values[: len(chrom_ps)] += chrom_ps - else: - ps_occur[: len(chrom_ps)] += 1 - ps_values[: len(chrom_ps)] += chrom_ps - # Make the mean - averaged_ps = ps_values / ps_occur - return xs, averaged_ps - - -def slope_distance_law(xs, ps): - """Compute the slope of the loglog curve of the ps as the - [log(ps(n+1)) - log(ps(n))] / [log(n+1) - log(n)]. - Compute only list of ps, not list of array. - Parameters - ---------- - xs : list of numpy.ndarray - The list of logbins. - ps : list of numpy.ndarray - The list of ps. - Returns - ------- - list of numpy.ndarray : - The slope of the distance law. It will be shorter of one value than the - ps given initially. - """ - slope = [None] * len(ps) - for i in range(len(ps)): - ps[i][ps[i] == 0] = 10 ** (-9) - # Compute the slope - slope_temp = np.log(np.array(ps[i][1:]) / np.array(ps[i][:-1])) / np.log( - np.array(xs[i][1:]) / np.array(xs[i][:-1]) - ) - # The 1.8 is the intensity of the normalisation, it could be adapted. - slope_temp[slope_temp == np.nan] = 10 ** (-15) - slope[i] = ndimage.filters.gaussian_filter1d(slope_temp, 1.8) - return slope - - -def get_ylim(xs, curve, inf, sup): - """Compute the max and the min of the list of list between the inferior - (inf) and superior (sup) bounds. - Parameters - ---------- - xs : list of numpy.ndarray - The list of the logbins starting position in basepair. - curve : list of numpy.ndarray - A list of numpy.ndarray from which you want to extract minimum and - maximum values in a given interval. - inf : int - Inferior limit of the interval in basepair. - sup : int - Superior limit of the interval in basepair. - Returns - ------- - min_tot : float - Minimum value of the list of list in this interval. - max_tot : float - Maximum value of the list of list in this interval. - Examples - -------- - >>> get_ylim([np.array([1, 4, 15]), np.array([1, 4, 15, 40])], - ... [np.array([5.5, 3.2, 17.10]), np.array([24, 32, 1.111, 18.5])], - ... 2, - ... 15 - ... ) - (1.111, 32.0) - """ - # Create a list in order to add all the interesting values - flatten_list = [] - for i, logbins in enumerate(xs): - # Iterate on xs. - # Search for the minimum index corresponding to the smallest bin - # superior or equal to inf (in pair base). - min_value = min(logbins[logbins >= inf], default=0) - min_index = np.where(logbins == min_value)[0] - # Skip chromosome if total size smaller than inf - if len(min_index) == 0: - continue - # Search for the maximum index corresponding to the biggest bin - # inferior or equal to sup (in pair base). - max_value = max(logbins[logbins <= sup]) - max_index = np.where(logbins == max_value)[0] - # Add the values in the interval in the flattened list. - if int(max_index) != len(logbins) - 1: - max_index += 1 - for j in range(int(min_index), int(max_index)): - flatten_list.append(curve[i][j]) - # Caluclate the min and the max of this list. - min_tot = min(flatten_list) - max_tot = max(flatten_list) - return min_tot, max_tot - - -def plot_ps_slope(xs, ps, labels, fig_path=None, inf=3000, sup=None): - """Compute two plots, one with the different distance law of each - arm/chromosome in loglog scale and one with the slope (derivative) of these - curves. Generate a svg file with savefig. - Parameters - ---------- - xs : list of numpy.ndarray - The list of the logbins of each ps. 
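The slope computed by slope_distance_law above is the discrete derivative of log P(s) against log s, lightly smoothed. A minimal sketch with a synthetic power-law curve of exponent -1, so the expected output is a slope close to -1 everywhere:

import numpy as np
from scipy import ndimage

def ps_slope(x, p, sigma=1.8):
    """Discrete log-log derivative of one P(s) curve, lightly smoothed."""
    p = np.asarray(p, dtype=float)
    p[p == 0] = 1e-9                       # avoid log(0)
    raw = np.log(p[1:] / p[:-1]) / np.log(np.asarray(x[1:]) / np.asarray(x[:-1]))
    return ndimage.gaussian_filter1d(raw, sigma)

x = np.array([1e3, 1e4, 1e5, 1e6])
p = np.array([1e-1, 1e-2, 1e-3, 1e-4])    # a perfect power law with exponent -1
print(ps_slope(x, p))                      # values close to -1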
- ps : list of numpy.ndarray - The list of ps. - labels_file : list of strings - Names of the different curves in the order in which they are given. - fig_path : str - Path where the figure will be created. If None (default), the plot is - shown in an interactive window. - inf : int - Value of the mimimum x of the window of the plot. Have to be strictly - positive. By default 3000. - sup : int - Value of the maximum x of the window of the plot. By default None. - - """ - # Give the max value for sup if no value have been attributed - if sup is None: - sup = max(max(xs, key=len)) - - # Compute slopes from the curves - slope = slope_distance_law(xs, ps) - # Make the plot of distance law - # Give a range of color - cols = iter(cm.rainbow(np.linspace(0, 1, len(ps)))) - fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, figsize=(18, 10)) - plt.subplots_adjust(left=0.05, right=0.85, top=0.93, bottom=0.07) - ax1.set_xlabel("Distance (pb)", fontsize="x-large") - ax1.set_ylabel("P(s)", fontsize="x-large") - ax1.set_title("Distance law", fontsize="xx-large") - ylim = get_ylim(xs, ps, inf, sup) - ax1.set_ylim(0.9 * ylim[0], 1.1 * ylim[1]) - for i in range(len(ps)): - # Iterate on the different distance law array and take them by order of - # size in order to have the color scale equivalent to the size scale - col = next(cols) - ax1.loglog(xs[i], ps[i], label=labels[i]) - # Make the same plot with the slope - cols = iter(cm.rainbow(np.linspace(0, 1, len(slope)))) - ax2.set_xlabel("Distance (pb)", fontsize="x-large") - ax2.set_ylabel("Slope", fontsize="x-large") - ax2.set_title("Slope of the distance law", fontsize="xx-large") - ax2.set_xlim([inf, sup]) - ylim = get_ylim(xs, slope, inf, sup) - ax2.set_ylim(1.1 * ylim[0], 0.9 * ylim[1]) - xs2 = [None] * len(xs) - for i in range(len(slope)): - xs2[i] = xs[i][:-1] - col = next(cols) - ax2.semilogx(xs2[i], slope[i], label=labels[i], subs=[2, 3, 4, 5, 6, 7, 8, 9]) - ax2.legend(loc="upper left", bbox_to_anchor=(1.02, 1.00), ncol=1, fontsize="large") - # Save the figure in svg - if fig_path is not None: - plt.savefig(fig_path) - else: - plt.show() - - +import hicstuff.distance_law as hdl if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -819,7 +59,7 @@ if __name__ == "__main__": else: distance_law_plot = prefix+"_"+distance_law_plot - x_s, p_s, _ = get_distance_law( + x_s, p_s, _ = hdl.get_distance_law( input, fragment, centro_file, @@ -831,9 +71,9 @@ if __name__ == "__main__": if plot: # Retrieve chrom labels from distance law file - _, _, chr_labels = import_distance_law(out_file) + _, _, chr_labels = hdl.import_distance_law(out_file) chr_labels = [lab[0] for lab in chr_labels] chr_labels_idx = np.unique(chr_labels, return_index=True)[1] chr_labels = [chr_labels[index] for index in sorted(chr_labels_idx)] - p_s = normalize_distance_law(x_s, p_s) + p_s = hdl.normalize_distance_law(x_s, p_s) plot_ps_slope(x_s, p_s, labels=chr_labels, fig_path=distance_law_plot) diff --git a/bin/hicstuff_filter.py b/bin/hicstuff_filter.py index 62642ac6a605697dfeafea77f99dba0f99a0560b..3b0bedde4d48d5aa3a1aabb06b66416f98eaf3fb 100755 --- a/bin/hicstuff_filter.py +++ b/bin/hicstuff_filter.py @@ -20,511 +20,11 @@ import sys import numpy as np from collections import OrderedDict import matplotlib.pyplot as plt -from hicstuff_log import logger import argparse import logging +import hicstuff.filter as hf import hicstuff_log as hcl - -def process_read_pair(line): - r"""Process and order read pairs in a .pairs record. 
- Takes a read pair (line) from a .pairs file as input, reorders the pair - so that read 1 in intrachromosomal pairs always has the smallest genomic - coordinate. - Parameters - ---------- - line : str - Record from a pairs file with columns: readID, chr1, pos1, chr2, - pos2, strand1, strand2, frag1, frag2 - Returns - ------- - dict - Dictionary with reordered pair where each column from the line as an - entry. The number of restriction sites separating reads in the pair and - the type of event are also stored in the dictionary, under keys - "nsites" and "type". - Examples - -------- - >>> d = process_read_pair("readX\ta\t1\tb\t20\t-\t-\t1\t3") - >>> for u in sorted(d.items()): - ... print(u) - ('chr1', 'a') - ('chr2', 'b') - ('frag1', 1) - ('frag2', 3) - ('nsites', 2) - ('pos1', 1) - ('pos2', 20) - ('readID', 'readX') - ('strand1', '-') - ('strand2', '-') - ('type', 'inter') - >>> d = process_read_pair('readY\ta\t20\ta\t10\t-\t+\t2\t1') - >>> [d[x] for x in sorted(d.keys())] - ['a', 'a', 1, 2, 1, 10, 20, 'readY', '+', '-', '+-'] - """ - # Split line by whitespace - p = line.split("\t") - if len(p) != 9: - raise ValueError( - "Your input file does not have 9 columns. Make sure " - "the file has the readID and twice the following 4 fields " - "(once for each read in the pair): chr pos strand frag." - ) - # Saving each column as a dictionary entry with meaningful key - cols = [ - "readID", - "chr1", - "pos1", - "chr2", - "pos2", - "strand1", - "strand2", - "frag1", - "frag2", - ] - p = {cols[i]: p[i] for i in range(len(cols))} - # Transforming numeric columns to int - for col in ["pos1", "pos2", "frag1", "frag2"]: - p[col] = int(p[col]) - - # invert records for intrachromosomal pairs where rec2 comes before - # rec1 in genomic coordinates - if p["chr1"] == p["chr2"] and p["pos2"] < p["pos1"]: - p["strand1"], p["strand2"] = p["strand2"], p["strand1"] - p["pos1"], p["pos2"] = p["pos2"], p["pos1"] - p["frag1"], p["frag2"] = p["frag2"], p["frag1"] - - # Number of restriction sites separating reads in the pair - p["nsites"] = abs(p["frag2"] - p["frag1"]) - # Get event type - if p["chr1"] == p["chr2"]: - p["type"] = "".join([p["strand1"], p["strand2"]]) - else: - p["type"] = "inter" - - return p - - -def get_thresholds( - in_dat, interactive=False, plot_events=False, fig_path=None, prefix=None -): - """Guess distance threshold for event filtering - Analyse the events in the first million of Hi-C pairs in the library, plot - the occurrences of each event type according to number of restriction - fragments, and ask user interactively for the minimum threshold for uncuts - and loops. - Parameters - ---------- - in_dat: str - Path to the .pairs file containing Hi-C pairs. - interactive: bool - If True, plots are diplayed and thresholds are required interactively. - plot_events : bool - Whether to show the plot - fig_path : str - Path where the figure will be saved. If None, the figure will be - diplayed interactively. - prefix : str - If the library has a name, it will be shown on plots. - - Returns - ------- - dictionary - dictionary with keys "uncuts" and "loops" where the values are the - corresponding thresholds entered by the user. - """ - thr_uncut = None - thr_loop = None - max_sites = 50 - # Map of event -> legend name of event for intrachromosomal pairs. 
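For readers new to the event vocabulary used below, a minimal restatement of how `process_read_pair` labels a pair (illustrative only; the real function also reorders intrachromosomal pairs by coordinate and counts restriction sites):

```python
def pair_type(chr1, chr2, strand1, strand2):
    """Label a Hi-C pair the same way process_read_pair() does."""
    # Interchromosomal pairs are all lumped together as "inter";
    # intrachromosomal pairs keep their strand combination (++, --, +-, -+).
    return "inter" if chr1 != chr2 else strand1 + strand2

print(pair_type("a", "b", "-", "-"))  # inter
print(pair_type("a", "a", "+", "-"))  # +- (uncut-like)
print(pair_type("a", "a", "-", "+"))  # -+ (loop-like)
```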
- legend = { - "++": "++ (weird)", - "--": "-- (weird)", - "+-": "+- (uncuts)", - "-+": "-+ (loops)", - } - colors = {"++": "#222222", "+-": "r", "--": "#666666", "-+": "tab:orange"} - n_events = {event: np.zeros(max_sites) for event in legend} - i = 0 - # open the file for reading (just the first 1 000 000 lines) - with open(in_dat, "r") as pairs: - for line in pairs: - # Skip header lines - if line.startswith("#"): - continue - i += 1 - # Only use the first million pairs to estimate thresholds - if i == 1000000: - break - # Process Hi-C pair into a dictionary - p = process_read_pair(line) - # Type of event and number of restriction site between reads - etype = p["type"] - nsites = p["nsites"] - # Count number of events for intrachrom pairs - if etype != "inter" and nsites < max_sites: - n_events[etype][nsites] += 1 - - def plot_event(n_events, legend, name): - """Plot the frequency of a given event types over distance.""" - plt.xlim([-0.5, 15]) - plt.plot( - range(n_events[name].shape[0]), - n_events[name], - "o-", - label=legend[name], - linewidth=2.0, - c=colors[name], - ) - - if interactive: - # PLot: - try: - plt.figure(0) - for event in legend: - plot_event(n_events, legend, event) - plt.grid() - plt.xlabel("Number of restriction fragment(s)") - plt.ylabel("Number of events") - plt.yscale("log") - plt.legend() - plt.show(block=False) - - except Exception: - logger.error( - "Unable to show plots, skipping figure generation. Perhaps " - "there is no Xserver running ? (might be due to windows " - "environment). Try running without the interactive option." - ) - - # Asks the user for appropriate thresholds - print( - "Please enter the number of restriction fragments separating " - "reads in a Hi-C pair below or at which loops and " - "uncuts events will be excluded\n", - file=sys.stderr, - ) - thr_uncut = int(input("Enter threshold for the uncuts events (+-):")) - thr_loop = int(input("Enter threshold for the loops events (-+):")) - try: - plt.clf() - except Exception: - pass - else: - # Estimate thresholds from data - for event in n_events: - fixed = n_events[event] - fixed[fixed == 0] = 1 - n_events[event] = fixed - - all_events = np.log(np.array(list(n_events.values()))) - # Compute median occurences at each restriction sites - event_med = np.median(all_events, axis=0) - # Compute MAD, to have a robust estimator of the expected deviation - # from median at long distances - mad = np.median(abs(all_events - event_med)) - exp_stdev = mad / 0.67449 - # Iterate over sites, from furthest to frag+2 - for site in range(max_sites)[:1:-1]: - # For uncuts and loops, keep the last (closest) site where the - # deviation from other events <= expected_stdev - if ( - abs(np.log(n_events["+-"][site]) - event_med[site]) - <= exp_stdev - ): - thr_uncut = site - if ( - abs(np.log(n_events["-+"][site]) - event_med[site]) - <= exp_stdev - ): - thr_loop = site - if thr_uncut is None or thr_loop is None: - raise ValueError( - "The threshold for loops or uncut could not be estimated. " - "Please try running with -i to investigate the problem." 
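One constant in the estimation branch above deserves a note: `mad / 0.67449`. A short standalone check of why that rescaling turns a median absolute deviation into a robust standard-deviation estimate (illustrative only; random toy data, not Hi-C counts):

```python
# For a normal distribution, median(|x - median(x)|) ≈ 0.67449 * sigma,
# so MAD / 0.67449 recovers sigma while staying insensitive to the
# short-distance uncut/loop excess the heuristic is trying to detect.
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(loc=5.0, scale=2.0, size=100_000)
mad = np.median(np.abs(x - np.median(x)))
print(round(mad / 0.67449, 2))   # close to 2.0, the true sigma
```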
- ) - logger.info( - "Filtering with thresholds: uncuts={0} loops={1}".format( - thr_uncut, thr_loop - ) - ) - if plot_events: - try: - plt.figure(1) - plt.xlim([-0.5, 15]) - # Draw colored lines for events to discard - plt.plot( - range(0, thr_uncut + 1), - n_events["+-"][: thr_uncut + 1], - "o-", - c=colors["+-"], - label=legend["+-"], - ) - plt.plot( - range(0, thr_loop + 1), - n_events["-+"][: thr_loop + 1], - "o-", - c=colors["-+"], - label=legend["-+"], - ) - plt.plot( - range(0, 2), - n_events["--"][:2], - "o-", - c=colors["--"], - label=legend["--"], - ) - plt.plot( - range(0, 2), - n_events["++"][:2], - "o-", - c=colors["++"], - label=legend["++"], - ) - # Draw black lines for events to keep - plt.plot( - range(thr_uncut, n_events["+-"].shape[0]), - n_events["+-"][thr_uncut:], - "o-", - range(thr_loop, n_events["-+"].shape[0]), - n_events["-+"][thr_loop:], - "o-", - range(1, n_events["--"].shape[0]), - n_events["--"][1:], - "o-", - range(1, n_events["++"].shape[0]), - n_events["++"][1:], - "o-", - label="kept", - linewidth=2.0, - c="g", - ) - plt.grid() - plt.xlabel("Number of restriction site(s)") - plt.ylabel("Number of events") - plt.yscale("log") - # Remove duplicate "kept" entries in legend - handles, labels = plt.gca().get_legend_handles_labels() - by_label = OrderedDict(zip(labels, handles)) - plt.legend(by_label.values(), by_label.keys()) - # Show uncut and loop threshold as vertical lines - plt.axvline(x=thr_loop, color=colors["-+"]) - plt.axvline(x=thr_uncut, color=colors["+-"]) - - if prefix: - plt.title( - "Library events by distance in {}".format(prefix) - ) - plt.tight_layout() - if fig_path: - plt.savefig(fig_path) - else: - plt.show(block=False) - # plt.clf() - - except Exception: - logger.error( - "Unable to show plots, skipping figure generation. Is " - "an X server running? (might be due to windows " - "environment). Try running without the plot option." - ) - return thr_uncut, thr_loop - - -def filter_events( - in_dat, - out_filtered, - thr_uncut, - thr_loop, - plot_events=False, - fig_path=None, - prefix=None, -): - """Filter events (loops, uncuts and weirds) - Filter out spurious intrachromosomal Hi-C pairs from input file. +- pairs - with reads closer or at the uncut threshold and -+ pairs with reads closer - or at the loop thresholds are excluded from the ouput file. -- and ++ pairs - with both mates on the same fragments are also discarded. All others are - written. - Parameters - ---------- - in_dat : file object - File handle in read mode to the 2D BED file containing Hi-C pairs. - out_filtered : file object - File handle in write mode the output filtered 2D BED file. - thr_uncut : int - Minimum number of restriction sites between reads to keep an - intrachromosomal +- pair. - thr_loop : int - Minimum number of restriction sites between reads to keep an - intrachromosomal -+ pair. - plot_events : bool - If True, a plot showing the proportion of each type of event will be - shown after filtering. - fig_path : str - Path where the figure will be saved. If None, figure is displayed - interactively. - prefix : str - If the library has a name, it will be shown on plots. 
- """ - n_uncuts = 0 - n_loops = 0 - n_weirds = 0 - lrange_intra = 0 - lrange_inter = 0 - - # open the files for reading and writing - with open(in_dat, "r") as pairs, open(out_filtered, "w") as filtered: - for line in pairs: # iterate over each line - # Copy header lines to output - if line.startswith("#"): - filtered.write(line) - continue - - p = process_read_pair(line) - line_to_write = ( - "\t".join( - map( - str, - ( - p["readID"], - p["chr1"], - p["pos1"], - p["chr2"], - p["pos2"], - p["strand1"], - p["strand2"], - p["frag1"], - p["frag2"], - ), - ) - ) - + "\n" - ) - if p["chr1"] == p["chr2"]: - # Do not report ++ and -- pairs on the same fragment (impossible) - if p["frag1"] == p["frag2"] and p["strand1"] == p["strand2"]: - n_weirds += 1 - elif p["nsites"] <= thr_loop and p["type"] == "-+": - n_loops += 1 - elif p["nsites"] <= thr_uncut and p["type"] == "+-": - n_uncuts += 1 - else: - lrange_intra += 1 - filtered.write(line_to_write) - - if p["chr1"] != p["chr2"]: - lrange_inter += 1 - filtered.write(line_to_write) - - if lrange_inter > 0: - ratio_inter = round( - 100 * lrange_inter / float(lrange_intra + lrange_inter), 2 - ) - else: - ratio_inter = 0 - - # Log quick summary of operation results - kept = lrange_intra + lrange_inter - discarded = n_loops + n_uncuts + n_weirds - total = kept + discarded - logger.info( - "Proportion of inter contacts: {0}% (intra: {1}, " - "inter: {2})".format(ratio_inter, lrange_intra, lrange_inter) - ) - logger.info( - "{0} pairs discarded: Loops: {1}, Uncuts: {2}, Weirds: {3}".format( - discarded, n_loops, n_uncuts, n_weirds - ) - ) - logger.info( - "{0} pairs kept ({1}%)".format( - kept, round(100 * kept / (kept + discarded), 2) - ) - ) - - # Visualize summary if requested by user - if plot_events: - try: - # Plot: make a square figure and axes to plot a pieChart: - plt.figure(2, figsize=(6, 6)) - # The slices will be ordered and plotted counter-clockwise. - fracs = [n_uncuts, n_loops, n_weirds, lrange_intra, lrange_inter] - # Format labels to include event names and proportion - labels = list( - map( - lambda x: (x[0] + ": %.2f%%") % (100 * x[1] / total), - [ - ("Uncuts", n_uncuts), - ("Loops", n_loops), - ("Weirds", n_weirds), - ("3D intra", lrange_intra), - ("3D inter", lrange_inter), - ], - ) - ) - colors = ["salmon", "lightskyblue", "yellow", "palegreen", "plum"] - patches, _ = plt.pie(fracs, colors=colors, startangle=90) - plt.legend( - patches, - labels, - loc='upper left', - bbox_to_anchor=(-0.1, 1.), - ) - if prefix: - plt.title( - "Distribution of library events in {}".format(prefix), - bbox={"facecolor": "1.0", "pad": 5}, - ) - plt.text( - 0.3, - 1.15, - "Threshold Uncuts = " + str(thr_uncut), - fontdict=None, - ) - plt.text( - 0.3, - 1.05, - "Threshold Loops = " + str(thr_loop), - fontdict=None, - ) - - plt.text( - -1.5, - -1.2, - "Total number of reads = " + str(total), - fontdict=None, - ) - plt.text( - -1.5, - -1.3, - "Ratio inter/(intra+inter) = " + str(ratio_inter) + "%", - fontdict=None, - ) - percentage = round( - 100 - * float(lrange_inter + lrange_intra) - / (n_loops + n_uncuts + n_weirds + lrange_inter + lrange_intra) - ) - plt.text( - -1.5, - -1.4, - "Selected reads = {0}%".format(percentage), - fontdict=None, - ) - if fig_path: - plt.savefig(fig_path) - else: - plt.show() - plt.clf() - except Exception: - logger.error( - "Unable to show plots. Perhaps there is no Xserver running ?" - "(might be due to windows environment) skipping figure " - "generation." 
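To summarise the decision tree in `filter_events` above, a minimal predicate applying the same rules to one processed pair (illustrative only; the pair dict and thresholds below are made-up toy values):

```python
def keep_pair(p, thr_uncut, thr_loop):
    """Return (keep?, label) for one pair dict, mirroring filter_events()."""
    if p["chr1"] != p["chr2"]:
        return True, "inter"
    if p["frag1"] == p["frag2"] and p["strand1"] == p["strand2"]:
        return False, "weird"          # ++ or -- on the same fragment
    if p["type"] == "-+" and p["nsites"] <= thr_loop:
        return False, "loop"
    if p["type"] == "+-" and p["nsites"] <= thr_uncut:
        return False, "uncut"
    return True, "intra"

pair = {"chr1": "a", "chr2": "a", "frag1": 1, "frag2": 3,
        "strand1": "+", "strand2": "-", "type": "+-", "nsites": 2}
print(keep_pair(pair, thr_uncut=5, thr_loop=5))  # (False, 'uncut')
```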
- ) - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-i", "--input") @@ -559,11 +59,11 @@ if __name__ == "__main__": sys.stdout = open ("hicstuff_filter.log", "wt") hcl.set_file_handler(log_file) - uncut_thr, loop_thr = get_thresholds( + uncut_thr, loop_thr = hf.get_thresholds( pairs_idx, plot_events=plot, fig_path=dist_plot, prefix=prefix ) - filter_events( + hf.filter_events( pairs_idx, pairs_out, uncut_thr, diff --git a/bin/hicstuff_filter_pcr.py b/bin/hicstuff_filter_pcr.py index 40c45687d85de9e2e055776761df9152e2361971..30a7904d9ca516a374fdad8a7189a175343ea63c 100755 --- a/bin/hicstuff_filter_pcr.py +++ b/bin/hicstuff_filter_pcr.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import hicstuff_io as hio -from hicstuff_log import logger +import hicstuff.io as hio +from hicstuff.log import logger import argparse import os, time, csv, sys, re diff --git a/bin/hicstuff_fragments.py b/bin/hicstuff_fragments.py index 95fb0dcc02fe99b6ab03bb7eca37a9abd951c502..72182f3b347e96051eaddd6d02185f92937459a7 100755 --- a/bin/hicstuff_fragments.py +++ b/bin/hicstuff_fragments.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -import hicstuff_digest as hcd +import hicstuff.digest as hcd import argparse if __name__ == "__main__": diff --git a/bin/hicstuff_io.py b/bin/hicstuff_io.py deleted file mode 100644 index 2aac719b31fbab18dee08538a3e1a322bb778f30..0000000000000000000000000000000000000000 --- a/bin/hicstuff_io.py +++ /dev/null @@ -1,1381 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import gzip -import zipfile -import bz2 -import io -import os -import functools -import sys -import numpy as np -import pandas as pd -import collections -import subprocess as sp -from scipy.sparse import tril, triu -import pathlib -import re -from os.path import join, exists -from random import getrandbits -from scipy.sparse import coo_matrix -from Bio import SeqIO, SeqUtils -#import hicstuff.hicstuff as hcs -#from hicstuff.log import logger -#from hicstuff.version import __version__ -import scipy.stats as ss - -DEFAULT_MAX_MATRIX_SHAPE = 10000 - -DEFAULT_FRAGMENTS_LIST_FILE_NAME = "fragments_list.txt" -DEFAULT_INFO_CONTIGS_FILE_NAME = "info_contigs.txt" -DEFAULT_SPARSE_MATRIX_FILE_NAME = "abs_fragments_contacts_weighted.txt" - - -def _cols_to_sparse(sparse_array, shape=None, dtype=np.float64): - """ - Make a coordinate based sparse matrix from columns. - Convert (3, n) shaped arrays to a sparse matrix. The first - one acts as the row coordinates, the second one as the - column coordinates, the third one as the data points - for each pair. If duplicates are found, the data points are - added. - Parameters - ---------- - sparse_array : array_like - An array with exactly three columns representing the sparse - matrix data in coordinate format. - shape : tuple of int - The total number of rows and columns in the matrix. Will be estimated - from nonzero values if omitted. - dtype : type, optional - The type of data being loaded. Default is numpy.float64 - Example - ------- - >>> import numpy as np - >>> row, col = np.array([1, 2, 3]), np.array([3, 2, 1]) - >>> data = np.array([4, 5, 6]) - >>> M = np.array([row, col, data]).T - >>> S = _cols_to_sparse(M) - >>> print(S.todense()) - [[0. 0. 0. 0.] - [0. 0. 0. 4.] - [0. 0. 5. 0.] - [0. 6. 0. 
0.]] - """ - - row = sparse_array[:, 0] - col = sparse_array[:, 1] - data = sparse_array[:, 2] - if shape is None: - n = int(max(np.amax(row), np.amax(col))) + 1 - shape = (n, n) - - S = coo_matrix((data, (row, col)), shape=shape, dtype=dtype) - return S - - -def load_sparse_matrix(mat_path, binning=1, dtype=np.float64): - """Load sparse matrix - Load a text file matrix into a sparse matrix object. The expected format is - a 3 column file where columns are row_number, col_number, value. The first - line consists of 3 values representing the total number of rows, columns - and nonzero values. - Parameters - ---------- - mat_path : file, str or pathlib.Path - The input matrix file in instagraal format. - binning : int or "auto" - The binning to perform. If "auto", binning will - be automatically inferred so that the matrix size - will not go beyond (10000, 10000) in shape. That - can be changed by modifying the DEFAULT_MAX_MATRIX_SHAPE - value. Default is 1, i.e. no binning is performed - dtype : type, optional - The type of data being loaded. Default is numpy.float64 - Returns - ------- - sparse_mat : scipy.sparse.coo_matrix - The output (sparse) matrix in COOrdinate format. - Examples - -------- - >>> S = load_sparse_matrix('test_data/mat_5kb.tsv', binning=1) - >>> S.data[:10] - array([84., 2., 3., 2., 1., 1., 50., 1., 66., 1.]) - >>> S.shape - (16, 16) - """ - try: - cols_arr = np.loadtxt(mat_path, delimiter="\t", dtype=dtype) - shape = (int(cols_arr[0, 0]), int(cols_arr[0, 1])) - except ValueError: - cols_arr = np.loadtxt( - mat_path, delimiter="\t", dtype=dtype, skiprows=1 - ) - shape = None - - # Get values into an array without the header. Use the header to give size. - sparse_mat = _cols_to_sparse(cols_arr[1:, :], shape=shape, dtype=dtype) - - if binning == "auto": - num_bins = max(sparse_mat.shape) + 1 - subsampling_factor = num_bins // DEFAULT_MAX_MATRIX_SHAPE - else: - subsampling_factor = binning - sparse_mat = hcs.bin_sparse( - sparse_mat, subsampling_factor=subsampling_factor - ) - return sparse_mat - - -def save_sparse_matrix(s_mat, path): - """Save a sparse matrix - Saves a sparse matrix object into tsv format. - Parameters - ---------- - s_mat : scipy.sparse.coo_matrix - The sparse matrix to save on disk - path : str - File path where the matrix will be stored - """ - if s_mat.format != "coo": - ValueError("Sparse matrix must be in coo format") - dtype = s_mat.dtype - fmt = "%i" if dtype == int else "%.10e" - sparse_arr = np.vstack([s_mat.row, s_mat.col, s_mat.data]).T - - np.savetxt( - path, - sparse_arr, - header="{nrows}\t{ncols}\t{nonzero}".format( - nrows=s_mat.shape[0], ncols=s_mat.shape[1], nonzero=s_mat.nnz - ), - comments="", - fmt=fmt, - delimiter="\t", - ) - - -def load_pos_col(path, colnum, header=1, dtype=np.int64): - """ - Loads a single column of a TSV file with header into a numpy array. - - Parameters - ---------- - path : str - The path of the TSV file to load. - colnum : int - The 0-based index of the column to load. - header : int - Number of line to skip. By default the header is a single line. - Returns - ------- - numpy.array : - A 1D numpy array with the - Examples - -------- - >>> load_pos_col('test_data/mat_5kb.tsv', 0)[:10] - array([0, 0, 0, 0, 0, 0, 1, 1, 2, 2]) - """ - pos_arr = np.genfromtxt( - path, - delimiter="\t", - usecols=(colnum,), - skip_header=header, - dtype=dtype, - ) - return pos_arr - - -def generate_temp_dir(path): - """Temporary directory generation - Generates a temporary file with a random name at the input path. 
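For context on the matrix I/O being removed here, a minimal round-trip through the graal text format handled by `save_sparse_matrix`/`load_sparse_matrix`: a "nrows ncols nnz" header line followed by row, col, value triplets. Illustrative only; `toy_mat.tsv` is a hypothetical throwaway file written to the working directory.

```python
import numpy as np
from scipy.sparse import coo_matrix

S = coo_matrix(([4.0, 5.0], ([1, 2], [3, 2])), shape=(4, 4))
arr = np.vstack([S.row, S.col, S.data]).T
np.savetxt("toy_mat.tsv", arr, delimiter="\t", comments="",
           header=f"{S.shape[0]}\t{S.shape[1]}\t{S.nnz}", fmt="%.10e")

cols = np.loadtxt("toy_mat.tsv", delimiter="\t")    # first row is the header
shape = (int(cols[0, 0]), int(cols[0, 1]))
S2 = coo_matrix(
    (cols[1:, 2], (cols[1:, 0].astype(int), cols[1:, 1].astype(int))),
    shape=shape,
)
print(np.array_equal(S.toarray(), S2.toarray()))    # True: round trip preserved
```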
- - Parameters - ---------- - path : str - The path at which the temporary directory will be created. - - Returns - ------- - str - The path of the newly created temporary directory. - """ - exist = True - while exist: - # Keep trying random directory names if they already exist - directory = str(hex(getrandbits(32)))[2:] - full_path = join(path, directory) - if not exists(full_path): - exist = False - try: - os.makedirs(full_path) - except PermissionError: - raise PermissionError( - "The temporary directory cannot be created in {}. " - "Make sure you have write permission.".format(path) - ) - return full_path - - -def _check_cooler(fun): - """Decorates function `fun` to check if cooler is available..""" - - @functools.wraps(fun) - def wrapped(*args, **kwargs): - try: - import cooler - - fun.__globals__["cooler"] = cooler - except ImportError: - logger.error( - "The cooler package is required to use {0}, please install it first".format( - fun.__name__ - ) - ) - raise ImportError("The cooler package is required.") - return fun(*args, **kwargs) - - return wrapped - - -@_check_cooler -def add_cool_column( - clr, column, column_name, table_name="bins", metadata={}, dtype=None -): - """ - Adds a new column to a loaded Cooler store. If the column exists, - it is replaced. This will affect the .cool file. - - Parameters - ---------- - clr : Cooler object - A Cooler store. - column : pandas Series - The column to add to the cooler. - column_name : str - The name of the column to add. - table_name : str - The name of the table to which the column should be added. - Defaults to the "bins" table. - metadata : dict - A dictionary of metadata to associate with the new column. - """ - with clr.open("r+") as c: - if column_name in c[table_name]: - del c[table_name][column_name] - h5opts = dict(compression="gzip", compression_opts=6) - c[table_name].create_dataset( - column_name, data=column, dtype=dtype, **h5opts - ) - c[table_name][column_name].attrs.update(metadata) - - -@_check_cooler -def load_cool(cool): - """ - Reads a cool file into memory and parses it into graal style tables. - - Parameters - ---------- - cool : str - Path to the input .cool file. - Returns - ------- - mat : scipy coo_matrix - Hi-C contact map in COO format. - frags : pandas DataFrame - Table off bins matching the matrix. Corresponds to the content of the fragments_list.txt file. - chroms : pandas DataFrame - Table of chromosome informations. 
- """ - c = cooler.Cooler(cool) # pylint: disable=undefined-variable - frags = c.bins()[:] - chroms = c.chroms()[:] - mat = c.pixels()[:] - frags.rename( - columns={"start": "start_pos", "end": "end_pos"}, inplace=True - ) - frags["id"] = frags.groupby("chrom", sort=False).cumcount() + 1 - # Try loading hicstuff-specific columns - try: - frags = frags[ - ["id", "chrom", "start_pos", "end_pos", "size", "gc_content"] - ] - # If absent, only load standard columns - except KeyError: - frags = frags[["id", "chrom", "start_pos", "end_pos"]] - - chroms["cumul_length"] = ( - chroms.length.shift(1).fillna(0).cumsum().astype(int) - ) - n_frags = c.bins()[:].groupby("chrom", sort=False).count().start - chroms["n_frags"] = chroms.merge( - n_frags, right_index=True, left_on="name", how="left" - ).start - chroms.rename(columns={"name": "contig"}, inplace=True) - n = int(max(np.amax(mat.bin1_id), np.amax(mat.bin2_id))) + 1 - shape = (n, n) - mat = coo_matrix((mat["count"], (mat.bin1_id, mat.bin2_id)), shape=shape) - - return mat, frags, chroms - - -@_check_cooler -def save_cool(cool_out, mat, frags, metadata={}): - """ - Writes a .cool file from graal style tables. - - Parameters - ---------- - cool_out : str - Path to the output cool file. - mat : scipy coo_matrix - The Hi-C contact matrix in sparse COO format. - frags : pandas DataFrame - The graal style 'fragments_list' table. - metadata : dict - Potential metadata to associate with the cool file. - """ - up_tri = False - # Check if symmetric matrix is symmetric - # (i.e. only upper triangle or full mat) - if (abs(mat - mat.T) > 1e-10).nnz != 0: - up_tri = True - # Drop useless column - try: - bins = frags.drop("id", axis=1) - except KeyError: - bins = frags - # Get column names right - bins.rename( - columns={"seq": "chrom", "start_pos": "start", "end_pos": "end"}, - inplace=True, - ) - mat_dict = {"bin1_id": mat.row, "bin2_id": mat.col, "count": mat.data} - pixels = pd.DataFrame(mat_dict) - cooler.create_cooler( # pylint: disable=undefined-variable - cool_out, - bins, - pixels, - metadata=metadata, - symmetric_upper=up_tri, - triucheck=False, - ) - - -def read_compressed(filename, mode='r'): - """Read compressed file - Opens the file in read mode with appropriate decompression algorithm. - Parameters - ---------- - filename : str - The path to the input file - Returns - ------- - file-like object - The handle to access the input file's content - """ - - # Standard header bytes for diff compression formats - comp_bytes = { - b"\x1f\x8b\x08": "gz", - b"\x42\x5a\x68": "bz2", - b"\x50\x4b\x03\x04": "zip", - } - - max_len = max(len(x) for x in comp_bytes) - - def file_type(filename): - """Guess file type - Compare header bytes with those in the file and return type. - """ - with open(filename, "rb") as f: - file_start = f.read(max_len) - for magic, filetype in comp_bytes.items(): - if file_start.startswith(magic): - return filetype - return "uncompressed" - - # Open file with appropriate function - mode_map = {'r': 'rt', 'rb': 'rb'} - comp = file_type(filename) - if comp == "gz": - return gzip.open(filename, mode_map[mode]) - elif comp == "bz2": - return bz2.open(filename, mode_map[mode]) - elif comp == "zip": - zip_arch = zipfile.ZipFile(filename, mode) - if len(zip_arch.namelist()) > 1: - raise IOError( - "Only a single fastq file must be in the zip archive." 
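A compact sketch of the magic-byte sniffing used by `read_compressed`/`is_compressed` above: the first bytes of the file identify gzip, bzip2 or zip content. Illustrative only; `reads.fq.gz` is a hypothetical throwaway file created for the demonstration.

```python
import gzip

COMP_BYTES = {b"\x1f\x8b\x08": "gz", b"\x42\x5a\x68": "bz2", b"\x50\x4b\x03\x04": "zip"}

def sniff(path):
    """Guess the compression format from the file header bytes."""
    with open(path, "rb") as f:
        start = f.read(max(len(m) for m in COMP_BYTES))
    for magic, fmt in COMP_BYTES.items():
        if start.startswith(magic):
            return fmt
    return "uncompressed"

with gzip.open("reads.fq.gz", "wt") as f:   # hypothetical throwaway file
    f.write("@r1\nACGT\n+\nFFFF\n")
print(sniff("reads.fq.gz"))                 # gz
```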
- ) - else: - # ZipFile opens as bytes by default, using io to read as text - zip_content = zip_arch.open(zip_arch.namelist()[0], mode) - return io.TextIOWrapper(zip_content, encoding="utf-8") - else: - return open(filename, mode) - - -def is_compressed(filename): - """Check compression status - Check if the input file is compressed from the first bytes. - Parameters - ---------- - filename : str - The path to the input file - Returns - ------- - bool - True if the file is compressed, False otherwise. - """ - - # Standard header bytes for diff compression formats - comp_bytes = { - b"\x1f\x8b\x08": "gz", - b"\x42\x5a\x68": "bz2", - b"\x50\x4b\x03\x04": "zip", - } - max_len = max(len(x) for x in comp_bytes) - with open(filename, "rb") as f: - file_start = f.read(max_len) - for magic, _ in comp_bytes.items(): - if file_start.startswith(magic): - return True - return False - - -def from_dade_matrix(filename, header=False): - """Load a DADE matrix - Load a numpy array from a DADE matrix file, optionally - returning bin information from the header. Header data - processing is delegated downstream. - Parameters - ---------- - filename : str, file or pathlib.Path - The name of the file containing the DADE matrix. - header : bool - Whether to return as well information contained - in the header. Default is False. - Example - ------- - >>> import numpy as np - >>> import tempfile - >>> lines = [['RST', 'chr1~0', 'chr1~10', 'chr2~0', 'chr2~30'], - ... ['chr1~0', '5'], - ... ['chr1~10', '8', '3'], - ... ['chr2~0', '3', '5', '5'], - ... ['chr2~30', '5', '10', '11', '2'] - ... ] - >>> formatted = ["\\t".join(l) + "\\n" for l in lines ] - >>> dade = tempfile.NamedTemporaryFile(mode='w') - >>> for fm in formatted: - ... dade.write(fm) - 34 - 9 - 12 - 13 - 18 - >>> dade.flush() - >>> M, h = from_dade_matrix(dade.name, header=True) - >>> dade.close() - >>> print(M) - [[ 5. 8. 3. 5.] - [ 8. 3. 5. 10.] - [ 3. 5. 5. 11.] - [ 5. 10. 11. 2.]] - >>> print(h) - ['chr1~0', 'chr1~10', 'chr2~0', 'chr2~30'] - See https://github.com/scovit/DADE for more details about Dade. - """ - - A = pd.read_csv(filename, sep="\t", header=None) - A.fillna("0", inplace=True) - M, headers = np.array(A.iloc[1:, 1:], dtype=np.float64), A.iloc[0, :] - matrix = M + M.T - np.diag(np.diag(M)) - if header: - return matrix, headers.tolist()[1:] - else: - return matrix - - -def to_dade_matrix(M, annotations="", filename=None): - """Returns a Dade matrix from input numpy matrix. Any annotations are added - as header. If filename is provided and valid, said matrix is also saved - as text. - """ - - n, m = M.shape - A = np.zeros((n + 1, m + 1)) - A[1:, 1:] = M - if not annotations: - annotations = np.array(["" for _ in n], dtype=str) - A[0, :] = annotations - A[:, 0] = annotations.T - if filename: - try: - np.savetxt(filename, A, fmt="%i") - logger.info( - "I saved input matrix in dade format as {0}".format( - str(filename) - ) - ) - except ValueError as e: - logger.warning("I couldn't save input matrix.") - logger.warning(str(e)) - - return A - - -def load_into_redis(filename): - """Load a file into redis - Load a matrix file and sotre it in memory with redis. - Useful to pass around huge datasets from scripts to - scripts and load them only once. - Inspired from https://gist.github.com/alexland/ce02d6ae5c8b63413843 - Parameters - ---------- - filename : str, file or pathlib.Path - The file of the matrix to load. - Returns - ------- - key : str - The key of the dataset needed to retrieve it from redis. 
- """ - try: - from redis import StrictRedis as redis - import time - except ImportError: - print( - "Error! Redis does not appear to be installed in your system.", - file=sys.stderr, - ) - exit(1) - - M = np.genfromtxt(filename, dtype=None) - array_dtype = str(M.dtype) - m, n = M.shape - M = M.ravel().tostring() - database = redis(host="localhost", port=6379, db=0) - key = "{0}|{1}#{2}#{3}".format(int(time.time()), array_dtype, m, n) - - database.set(key, M) - - return key - - -def load_from_redis(key): - """Retrieve a dataset from redis - Retrieve a cached dataset that was stored in redis - with the input key. - Parameters - ---------- - key : str - The key of the dataset that was stored in redis. - Returns - ------- - M : numpy.ndarray - The retrieved dataset in array format. - """ - - try: - from redis import StrictRedis as redis - except ImportError: - print( - "Error! Redis does not appear to be installed in your system.", - file=sys.stderr, - ) - exit(1) - - database = redis(host="localhost", port=6379, db=0) - - try: - M = database.get(key) - except KeyError: - print( - "Error! No dataset was found with the supplied key.", - file=sys.stderr, - ) - exit(1) - - array_dtype, n, m = key.split("|")[1].split("#") - - M = np.fromstring(M, dtype=array_dtype).reshape(int(n), int(m)) - return M - - -def dade_to_graal( - filename, - output_matrix=DEFAULT_SPARSE_MATRIX_FILE_NAME, - output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME, - output_frags=DEFAULT_SPARSE_MATRIX_FILE_NAME, - output_dir=None, -): - """Convert a matrix from DADE format (https://github.com/scovit/dade) - to a graal-compatible format. Since DADE matrices contain both fragment - and contact information all files are generated at the same time. - """ - - with open(output_matrix, "w") as sparse_file: - sparse_file.write("id_frag_a\tid_frag_b\tn_contact") - with open(filename) as file_handle: - first_line = file_handle.readline() - for row_index, line in enumerate(file_handle): - dense_row = np.array(line.split("\t")[1:], dtype=np.int32) - for col_index in np.nonzero(dense_row)[0]: - line_to_write = "{}\t{}\t{}\n".format( - row_index, col_index, dense_row[col_index] - ) - sparse_file.write(line_to_write) - - header = first_line.split("\t") - bin_type = header[0] - if bin_type == '"RST"': - logger.info("I detected fragment-wise binning") - elif bin_type == '"BIN"': - logger.info("I detected fixed size binning") - else: - logger.warning( - ( - "Sorry, I don't understand this matrix's " - "binning: I read {}".format(str(bin_type)) - ) - ) - - header_data = [ - header_elt.replace("'", "") - .replace('"', "") - .replace("\n", "") - .split("~") - for header_elt in header[1:] - ] - - ( - global_frag_ids, - contig_names, - local_frag_ids, - frag_starts, - frag_ends, - ) = np.array(list(zip(*header_data))) - - frag_starts = frag_starts.astype(np.int32) - 1 - frag_ends = frag_ends.astype(np.int32) - 1 - frag_lengths = frag_ends - frag_starts - - total_length = len(global_frag_ids) - - with open(output_contigs, "w") as info_contigs: - - info_contigs.write("contig\tlength\tn_frags\tcumul_length\n") - - cumul_length = 0 - - for contig in collections.OrderedDict.fromkeys(contig_names): - - length_tig = np.sum(frag_lengths[contig_names == contig]) - n_frags = collections.Counter(contig_names)[contig] - line_to_write = "%s\t%s\t%s\t%s\n" % ( - contig, - length_tig, - n_frags, - cumul_length, - ) - info_contigs.write(line_to_write) - cumul_length += n_frags - - with open(output_frags, "w") as fragments_list: - - fragments_list.write( - 
"id\tchrom\tstart_pos\tend_pos" "\tsize\tgc_content\n" - ) - bogus_gc = 0.5 - - for i in range(total_length): - line_to_write = "%s\t%s\t%s\t%s\t%s\t%s\n" % ( - int(local_frag_ids[i]) + 1, - contig_names[i], - frag_starts[i], - frag_ends[i], - frag_lengths[i], - bogus_gc, - ) - fragments_list.write(line_to_write) - - -def load_bedgraph2d(filename, bin_size=None, fragments_file=None): - """ - Loads matrix and fragment information from a 2D bedgraph file. Note this - function assumes chromosomes are ordered in alphabetical. order - - Parameters - ---------- - filename : str - Path to the bedgraph2D file. - bin_size : int - The size of bins in the case of fixed bin size. - fragments_file : str - Path to a fragments file to explicitely provide fragments positions. - If the matrix does not have fixed bin size, this prevents errors. - - Returns - ------- - mat : scipy.sparse.coo_matrix - The Hi-C contact map as the upper triangle of a symetric matrix, in - sparse format. - frags : pandas.DataFrame - The list of fragments/bin present in the matrix with their genomic - positions. - """ - bed2d = pd.read_csv(filename, sep="\t", header=None) - chrom_sizes = {} - if bin_size is not None: - # If bin size if provided, retrieve chromosome lengths, this will be - # used when regenerating bin coordinates - chroms_left = bed2d[[3, 5]] - chroms_left.columns = [0, 2] - chroms = ( - pd.concat([bed2d[[0, 2]], chroms_left]) - .groupby([0], sort=False) - .max() - ) - for chrom, size in zip(chroms.index, np.array(chroms)): - chrom_sizes[chrom] = size[0] - elif fragments_file is None: - logger.warning( - "Please be aware that not all information can be restored from a " - "bg2 file without fixed bin size; fragments without any contact " - "will be lost" - ) - # Get all possible fragment chrom-positions into an array - frag_pos = np.vstack( - [np.array(bed2d[[0, 1, 2]]), np.array(bed2d[[3, 4, 5]])] - ) - # Sort by position (least important, col 1) - frag_pos = frag_pos[frag_pos[:, 1].argsort(kind="mergesort")] - # Then by chrom (most important, col 0) - frag_pos = frag_pos[frag_pos[:, 0].argsort(kind="mergesort")] - # Get unique names for fragments (chrom+pos) - ordered_frag_pos = ( - pd.DataFrame(frag_pos).drop_duplicates().reset_index(drop=True) - ) - frag_pos_a = bed2d[[0, 1]].apply(lambda x: tuple(x), axis=1) - frag_pos_b = bed2d[[3, 4]].apply(lambda x: tuple(x), axis=1) - # If fragments file is provided, use fragments positions to indices mapping - if fragments_file is not None: - frags = pd.read_csv(fragments_file, delimiter="\t") - frag_map = frags.apply(lambda x: (str(x.chrom), x.start_pos), axis=1) - frag_map = {f_name: f_idx for f_idx, f_name in enumerate(frag_map)} - # If fixed fragment size available, use it to reconstruct original - # fragments ID (even if they are absent from the bedgraph file). 
- elif bin_size is not None: - frag_map = {} - chrom_frags = [] - for chrom, size in chrom_sizes.items(): - prev_frags = len(frag_map) - for bin_id, bin_pos in enumerate(range(0, size, bin_size)): - frag_map[(chrom, bin_pos)] = bin_id + prev_frags - n_bins = size // bin_size - chrom_frags.append( - pd.DataFrame( - { - "id": range(1, n_bins + 1), - "chrom": np.repeat(chrom, n_bins), - "start_pos": range(0, size, bin_size), - "end_pos": range(bin_size, size + bin_size, bin_size), - } - ) - ) - frags = pd.concat(chrom_frags, axis=0).reset_index(drop=True) - frags.insert( - loc=3, column="size", value=frags.end_pos - frags.start_pos - ) - # If None available, guess fragments indices from bedgraph (potentially wrong) - else: - frag_map = { - (v[0], v[1]): i - for i, v in ordered_frag_pos.iloc[:, [0, 1]].iterrows() - } - frags = ordered_frag_pos.copy() - frags[3] = frags.iloc[:, 2] - frags.iloc[:, 1] - frags.insert(loc=0, column="id", value=0) - frags.id = frags.groupby([0], sort=False).cumcount() + 1 - frags.columns = ["id", "chrom", "start_pos", "end_pos", "size"] - # Match bin indices to their names - frag_id_a = np.array(list(map(lambda x: frag_map[x], frag_pos_a))) - frag_id_b = np.array(list(map(lambda x: frag_map[x], frag_pos_b))) - contacts = np.array(bed2d.iloc[:, 6].tolist()) - # Use index to build matrix - n_frags = len(frag_map.keys()) - mat = coo_matrix( - (contacts, (frag_id_a, frag_id_b)), shape=(n_frags, n_frags) - ) - - # Get size of each chromosome in basepairs - chromsizes = frags.groupby("chrom", sort=False).apply( - lambda x: np.int64(max(x.end_pos)) - ) - chrom_bins = frags.groupby("chrom", sort=False).size() - # Shift chromsizes by one to get starting bin, first one is zero - # Make chromsize cumulative to get start bin of each chrom - # Get chroms into a 1D array of bin starts - chrom_start = chrom_bins.shift(1, fill_value=0).cumsum() - chroms = pd.DataFrame( - { - "contig": chromsizes.index, - "length": chromsizes.values, - "n_frags": chrom_bins, - "cumul_length": chrom_start, - } - ) - return mat, frags, chroms - - -def flexible_hic_loader( - mat, fragments_file=None, chroms_file=None, quiet=False -): - """ - Wrapper function to load COO, bg2 or cool input and return the same output. - COO formats requires fragments_file and chroms_file options. bg2 format can - infer bin_size if fixed. When providing a bg2 matrix with uneven fragments - length, one should provide fragments_file as well or empty bins will be - truncated from the output. - - Parameters - ---------- - mat : str - Path to the matrix in graal, bedgraph2 or cool format. - fragments_file : str or None - Path to the file with fragments information (fragments_list.txt). - Only required if the matrix is in graal format. - chroms_file : str or None - Path to the file with chromosome information (info_contigs.txt). Only required - if the matrix is in graal format. - quiet : bool - If True, will silence warnings for empty outputs. - Returns - ------- - mat : scipy.sparse.coo_matrix - Sparse upper triangle Hi-C matrix. - frags : pandas.DataFrame or None - Table of fragment informations. None if information was not provided. - chroms : pandas.DataFrame or None - Table of chromosomes/contig information. None if information was not provided. 
- """ - hic_format = get_hic_format(mat) - # Load cool based on file extension - if hic_format == "cool": - mat, frags, chroms = load_cool(mat) - # Use the first line to determine COO / bg2 format - if hic_format == "bg2": - # Use the frags file to define bins if available - if fragments_file is not None: - mat, frags, chroms = load_bedgraph2d( - mat, fragments_file=fragments_file - ) - else: - # Guess if bin size is fixed based on MAD - bg2 = pd.read_csv(mat, sep="\t") - sizes = np.array(bg2.iloc[:, 2] - bg2.iloc[:, 1]) - size_mad = ss.median_abs_deviation(sizes, scale='normal') - # Use only the bg2 - if size_mad > 0: - mat, frags, chroms = load_bedgraph2d(mat) - logger.warning( - "Input is a bedgraph2d file with uneven bin size, " - "but no fragments_file was provided. Empty bins will " - "be missing from the output. To avoid this, provide a " - "fragments file." - ) - # Use fixed bin size - else: - mat, frags, chroms = load_bedgraph2d( - mat, bin_size=int(np.median(sizes)) - ) - - elif hic_format == "graal": - mat = load_sparse_matrix(mat) - try: - frags = pd.read_csv(fragments_file, sep="\t") - except ValueError: - if not quiet: - logger.warning( - "fragments_file was not provided when " - "loading a matrix in COO/graal format. frags will be None." - ) - frags = None - try: - chroms = pd.read_csv(chroms_file, sep="\t") - except ValueError: - if not quiet: - logger.warning( - "chroms_file was not provided when " - "loading a matrix in COO/graal format. chroms will be None." - ) - - chroms = None - - # Ensure the matrix is upper triangle symmetric - if mat.shape[0] == mat.shape[1]: - if (abs(mat - mat.T) > 1e-10).nnz > 0: - mat = mat + tril(mat, k=-1).T - mat = triu(mat, format="coo") - - return mat, frags, chroms - - -def get_hic_format(mat): - """Returns the format of the input Hi-C matrix - - Parameters - ---------- - mat : str - Path to the input Hi-C matrix - Returns - ------- - str : - Hi-C format string. One of graal, bg2, cool - """ - if ( - mat.endswith(".cool") - or mat.count(".mcool::/") == 1 - or mat.count(".cool::/") == 1 - ): - hic_format = "cool" - else: - # Use the first line to determine COO / bg2 format - ncols = len(open(mat).readline().split("\t")) - if ncols == 7: - hic_format = "bg2" - elif ncols == 3: - hic_format = "graal" - else: - raise ValueError("Unkown file format") - return hic_format - - -def flexible_hic_saver( - mat, out_prefix, frags=None, chroms=None, hic_fmt="graal", quiet=False, -): - """ - Saves objects to the desired Hi-C file format. - Parameters - ---------- - mat : scipy.sparse.coo_matrix - Hi-C contact map. - out_prefix : str - Output path without extension (the extension is added based on hic_fmt). - frags : pandas.DataFrame or None - Table of fragments informations. - chroms : pandas.DataFrame or None - Table of chromosomes / contigs informations. - hic_fmt : str - Output format. Can be one of graal for graal-compatible COO format, bg2 for - 2D bedgraph format, or cool for cooler compatible format. 
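A minimal sketch of the upper-triangle normalisation at the end of `flexible_hic_loader` above: any counts stored below the diagonal are folded onto the upper triangle. Illustrative only, on a 3x3 toy matrix.

```python
import numpy as np
from scipy.sparse import coo_matrix, tril, triu

full = coo_matrix(np.array([[0, 2, 0],
                            [5, 0, 1],
                            [0, 0, 0]]))
up = full + tril(full, k=-1).T        # fold the strict lower triangle
up = triu(up, format="coo")           # keep only the upper triangle
print(up.toarray())
# [[0 7 0]
#  [0 0 1]
#  [0 0 0]]
```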
- """ - if hic_fmt == "graal": - save_sparse_matrix(mat, out_prefix + ".mat.tsv") - try: - frags.to_csv(out_prefix + ".frags.tsv", sep="\t", index=False) - except AttributeError: - if not quiet: - logger.warning( - "Could not create fragments_list.txt from input files" - ) - try: - chroms.to_csv(out_prefix + ".chr.tsv", sep="\t", index=False) - except AttributeError: - if not quiet: - logger.warning( - "Could not create info_contigs.txt from input files" - ) - elif hic_fmt == "cool": - frag_sizes = frags.end_pos - frags.start_pos - size_mad = np.median(frag_sizes - np.median(frag_sizes)) - bin_type = "variable" if size_mad else "fixed" - try: - save_cool( - out_prefix + ".cool", - mat, - frags, - metadata={"hicstuff": __version__, "bin-type": bin_type}, - ) - except NameError: - NameError("frags is required to save a cool file") - elif hic_fmt == "bg2": - try: - save_bedgraph2d(mat, frags, out_prefix + ".bg2") - except NameError: - NameError("frags is required to save a bg2 file") - else: - raise ValueError("Unknown output format: {0}".format(hic_fmt)) - - -def save_bedgraph2d(mat, frags, out_path): - """ - Given a sparse matrix and a corresponding list of fragments, save a file - in 2D bedgraph format. - Parameters - ---------- - mat : scipy.sparse.coo_matrix - The sparse contact map. - frags : pandas.DataFrame - A structure containing the annotations for each matrix bin. Should - correspond to the content of the fragments_list.txt file. - """ - - mat_df = pd.DataFrame( - {"row": mat.row, "col": mat.col, "data": mat.data.astype(int)} - ) - # Merge fragments with matrix based on row indices to annotate rows - merge_mat = mat_df.merge( - frags, left_on="row", right_index=True, how="left", suffixes=("", "") - ) - # Rename annotations to assign to frag1 - merge_mat.rename( - columns={"chrom": "chr1", "start_pos": "start1", "end_pos": "end1"}, - inplace=True, - ) - # Do the same operation for cols (frag2) - merge_mat = merge_mat.merge( - frags, - left_on="col", - right_index=True, - how="left", - suffixes=("_0", "_2"), - ) - merge_mat.rename( - columns={"chrom": "chr2", "start_pos": "start2", "end_pos": "end2"}, - inplace=True, - ) - # Select only relevant columns in correct order - bg2 = merge_mat.loc[ - :, ["chr1", "start1", "end1", "chr2", "start2", "end2", "data"] - ] - bg2.to_csv(out_path, header=None, index=False, sep="\t") - - -def get_pos_cols(df): - """Get column names representing chromosome, start and end column - from a dataframe. Allows flexible names. - Parameters - ---------- - df : pd.DataFrame - Returns - ------- - tuple of str: - Tuple containing the names of the chromosome, start and end - columns in the input dataframe. - Examples - -------- - >>> import pandas as pd - >>> d = [1, 2, 3] - >>> df = pd.DataFrame( - ... {'Chromosome': d, 'Start': d, 'End': d, 'species': d} - ... ) - >>> get_pos_cols(df) - ('Chromosome', 'Start', 'End') - >>> df = pd.DataFrame( - ... {'id': d, 'chr': d, 'start_bp': d, 'end_bp': d} - ... 
) - >>> get_pos_cols(df) - ('chr', 'start_bp', 'end_bp') - """ - - # Get case insensitive column names - cnames = df.columns - inames = cnames.str.lower() - - def _col_getter(cols, pat): - """Helper to get column index from the start of its name""" - mask = cols.str.startswith(pat) - idx = np.flatnonzero(mask)[0] - return idx - - chrom_col = cnames[_col_getter(inames, "chr")] - start_col = cnames[_col_getter(inames, "start")] - end_col = cnames[_col_getter(inames, "end")] - return chrom_col, start_col, end_col - - -def gc_bins(genome_path, frags): - """Generate GC content annotation for bins using input genome. - Parameters - ---------- - genome_path : str - Path the the genome file in FASTA format. - frags : pandas.DataFrame - Table containing bin segmentation of the genome. - Required columns: chrom, start, end. - Returns - ------- - numpy.ndarray of floats: - GC content per bin, in the range [0.0, 1.0]. - """ - # Grab columns by name (catch start, Start, star_pos, etc) - chrom_col, start_col, end_col = get_pos_cols(frags) - # Fill the gc array by chromosome - gc_bins = np.zeros(frags.shape[0], dtype=float) - for rec in SeqIO.parse(genome_path, "fasta"): - mask = frags[chrom_col] == rec.id - # Define coordinates of each bin - starts = frags.loc[mask, start_col] - ends = frags.loc[mask, end_col] - # Slice chromosome sequence based on bins - seqs = [str(rec.seq)[s:e] for s, e in zip(starts, ends)] - # Fill GC values for bins in the chromosome - idx = np.flatnonzero(mask) - gc_bins[idx] = np.array(list(map(SeqUtils.GC, seqs))) / 100.0 - - return gc_bins - - -def sort_pairs(in_file, out_file, keys, tmp_dir=None, threads=1, buffer="2G"): - """ - Sort a pairs file in batches using UNIX sort. - Parameters - ---------- - in_file : str - Path to the unsorted input file - out_file : str - Path to the sorted output file. - keys : list of str - list of columns to use as sort keys. Each column can be one of readID, - chr1, pos1, chr2, pos2, frag1, frag2. Key priorities are according to - the order in the list. - tmp_dir : str - Path to the directory where temporary files will be created. Defaults - to current directory. - threads : int - Number of parallel sorting threads. - buffer : str - Buffer size used for sorting. Consists of a number and a unit. - """ - # TODO: Write a pure python implementation to drop GNU coreutils depencency, - # could be inspired from: https://stackoverflow.com/q/14465154/8440675 - - # Check if UNIX sort version supports parallelism - parallel_ok = True - sort_ver = sp.Popen(["sort", "--version"], stdout=sp.PIPE) - sort_ver = ( - sort_ver.communicate()[0] - .decode() - .split("\n")[0] - .split(" ")[-1] - .split(".") - ) - # If so, specify threads, otherwise don't mention it in the command line - try: - sort_ver = list(map(int, sort_ver)) - if sort_ver[0] < 8 or (sort_ver[0] == 8 and sort_ver[1] < 23): - logger.warning( - "GNU sort version is {0} but >8.23 is required for parallel " - "sort. Sorting on a single thread.".format( - ".".join(map(str, sort_ver)) - ) - ) - parallel_ok = False - # BSD sort has a different format and will throw error upon parsing. It does - # not support parallel processes anyway. - except ValueError: - logger.warning( - "Using BSD sort instead of GNU sort, sorting on a single thread." 
- ) - parallel_ok = False - - key_map = { - "readID": "-k1,1d", - "chr1": "-k2,2V", - "pos1": "-k3,3n", - "chr2": "-k4,4V", - "pos2": "-k5,5n", - "strand1": "-k6,6d", - "strand2": "-k7,7d", - "frag1": "-k8,8n", - "frag2": "-k9,9n", - } - - # transform column names to corresponding sort keys - try: - sort_keys = map(lambda k: key_map[k], keys) - except KeyError: - print("Unkown column name.") - raise - # Rewrite header with new sorting order - header = get_pairs_header(in_file) - with open(out_file, "w") as output: - for line in header: - if line.startswith("#sorted"): - output.write("#sorted: {0}\n".format("-".join(keys))) - else: - output.write(line + "\n") - - # Sort pairs and append to file. - with open(out_file, "a") as output: - grep_proc = sp.Popen(["grep", "-v", "^#", in_file], stdout=sp.PIPE) - sort_cmd = ["sort", "-S %s" % buffer] + list(sort_keys) - if tmp_dir is not None: - sort_cmd.append("--temporary-directory={0}".format(tmp_dir)) - if parallel_ok: - sort_cmd.append("--parallel={0}".format(threads)) - print(sort_cmd) - print(grep_proc) - sort_proc = sp.Popen(sort_cmd, stdin=grep_proc.stdout, stdout=output) - sort_proc.communicate() - - -def get_pairs_header(pairs): - r"""Retrieves the header of a .pairs file and stores lines into a list. - Parameters - ---------- - pairs : str or file object - Path to the pairs file. - Returns - ------- - header : list of str - A list of header lines found, in the same order they appear in pairs. - Examples - -------- - >>> import os - >>> from tempfile import NamedTemporaryFile - >>> p = NamedTemporaryFile('w', delete=False) - >>> p.writelines(["## pairs format v1.0\n", "#sorted: chr1-chr2\n", "abcd\n"]) - >>> p.close() - >>> h = get_pairs_header(p.name) - >>> for line in h: - ... print([line]) - ['## pairs format v1.0'] - ['#sorted: chr1-chr2'] - >>> os.unlink(p.name) - """ - # Open file if needed - with open(pairs, "r") as pairs: - # Store header lines into a list - header = [] - line = pairs.readline() - while line.startswith("#"): - header.append(line.rstrip()) - line = pairs.readline() - - return header - - -def reorder_fasta(genome, output=None, threshold=100000): - """Reorder and trim a fasta file - Sort a fasta file by record lengths, optionally trimming the smallest ones. - Parameters - ---------- - genome : str, file or pathlib.Path - The genome scaffold file (or file handle) - output : str, file or pathlib.Path - The output file to write to - threshold : int, optional - The size below which scaffolds are discarded, by default 100000 - """ - - if output is None: - output = "{}_renamed.fa".format(genome.split(".")[0]) - - handle = SeqIO.parse(genome, "fasta") - handle_to_write = sorted( - (len(u) for u in handle if len(u) > threshold), reverse=True - ) - SeqIO.write(handle_to_write, output, "fasta") - - -def rename_genome(genome, output=None, ambiguous=True): - """Rename genome and slugify headers - Rename genomes according to a simple naming scheme; this - is mainly done to avoid special character weirdness. - Parameters - ---------- - genome : file, str or pathlib.Path - The input genome to be renamed and slugify. - output : file, str or pathlib.Path - The output genome to be written into. Default is <base>_renamed.fa, - where <base> is genome_in without its extension. - ambiguous : bool - Whether to retain ambiguous non-N bases, otherwise rename them to Ns. - Default is True. 
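To make the key mapping in `sort_pairs` above easier to follow, a small sketch that assembles (but does not run) the GNU sort command for a chr1/pos1/chr2/pos2 sort. Illustrative only; strand keys are omitted for brevity and the buffer size is a toy value.

```python
# Map .pairs column names to GNU sort key flags, as in sort_pairs() above.
key_map = {
    "readID": "-k1,1d", "chr1": "-k2,2V", "pos1": "-k3,3n",
    "chr2": "-k4,4V",   "pos2": "-k5,5n",
    "frag1": "-k8,8n",  "frag2": "-k9,9n",
}
keys = ["chr1", "pos1", "chr2", "pos2"]
sort_cmd = ["sort", "-S 2G"] + [key_map[k] for k in keys]
print(" ".join(sort_cmd))
# sort -S 2G -k2,2V -k3,3n -k4,4V -k5,5n
```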
- """ - - if output is None: - output = "{}_renamed.fa".format(genome.split(".")[0]) - - with open(output, "w") as output_handle: - for record in SeqIO.parse(genome, "fasta"): - - # Replace hyphens, tabs and whitespace with underscores - new_record_id = record.id.replace(" ", "_") - new_record_id = new_record_id.replace("-", "_") - new_record_id = new_record_id.replace("\t", "_") - - # Remove anything that's weird, i.e. not alphanumeric - # or an underscore - new_record_id = re.sub("[^_A-Za-z0-9]+", "", new_record_id) - header = ">{}\n".format(new_record_id) - - new_seq = re.sub("[^ATGCatgcNn]", "N", str(record.seq)) - - output_handle.write(header) - output_handle.write("{}\n".format(new_seq)) - - -def check_fasta_index(ref, mode="bowtie2"): - """ - Checks for the existence of a bowtie2 or bwa index based on the reference - file name. - Parameters - ---------- - ref : str - Path to the reference genome. - mode : str - The alignment software used to build the index. bowtie2 or bwa. If any - other value is given, the function returns the reference path. - Returns - ------- - index : str - The bowtie2 or bwa index basename. None if no index was found - """ - ref = pathlib.Path(ref) - if mode == "bowtie2": - # Bowtie2 should have 6 index files - bt2_idx_files = list(ref.parent.glob("{}*bt2*".format(ref.name))) - index = None if len(bt2_idx_files) < 6 else bt2_idx_files - elif mode == "bwa": - refdir = str(ref.parent) - refdir_files = os.listdir(refdir) - bwa_idx_files = [ - join(refdir, f) - for f in refdir_files - if re.search(r".*\.(sa|pac|bwt|ann|amb)$", f) - ] - index = None if len(bwa_idx_files) < 5 else bwa_idx_files - else: - index = [ref] - if index is not None: - # Convert the PosixPath objects to strings and get the longest common prefix to obtain - # index basename (without the dot) - index = os.path.commonprefix(list(map(str, index))).strip(".") - return index - - -def check_is_fasta(in_file): - """ - Checks whether input file is in fasta format. - - Parameters - ---------- - in_file : str - Path to the input file. - Returns - ------- - bool : - True if the input is in fasta format, False otherwise - """ - try: - with read_compressed(in_file) as handle: - fasta = any(SeqIO.parse(handle, "fasta")) - except FileNotFoundError: - fasta = False - - return fasta diff --git a/bin/hicstuff_iteralign.py b/bin/hicstuff_iteralign.py index a16b3c84872fedd8d4bcfee5e6e66e99e47174b2..ad19ed696b95acbe966e8b5304663b2f3413d918 100755 --- a/bin/hicstuff_iteralign.py +++ b/bin/hicstuff_iteralign.py @@ -18,310 +18,12 @@ import re import subprocess as sp import pysam as ps import shutil as st -import hicstuff_io as hio +import hicstuff.io as hio import contextlib import argparse -from hicstuff_log import logger +from hicstuff.log import logger from os.path import join - - -def iterative_align( - fq_in, - tmp_dir, - ref, - n_cpu, - bam_out, - aligner="bowtie2", - min_len=20, - min_qual=30, - read_len=None, -): - """Iterative alignment - - Aligns reads iteratively reads of fq_in with bowtie2, minimap2 or bwa. Reads are - truncated to the 20 first nucleotides and unmapped reads are extended by 20 - nucleotides and realigned on each iteration. - - Parameters - ---------- - - fq_in : str - Path to input fastq file to align iteratively. - tmp_dir : str - Path where temporary files should be written. - ref : str - Path to the reference genome if Minimap2 is used for alignment. - Path to the index genome if Bowtie2/bwa is used for alignment. - n_cpu : int - The number of CPUs to use for the iterative alignment. 
- bam_out : str - Path where the final alignment should be written in BAM format. - aligner : str - Choose between minimap2, bwa or bowtie2 for the alignment. - min_len : int - The initial length of the fragments to align. - min_qual : int - Minimum mapping quality required to keep Hi-C pairs. - read_len : int - Read length in the fasta file. If set to None, the length of the first read - is used. Set this value to the longest read length in the file if you have - different read lengths. - - Examples - -------- - - iterative_align(fq_in='example_for.fastq', ref='example_bt2_index', bam_out='example_for.bam', aligner="bowtie2") - iterative_align(fq_in='example_for.fastq', ref='example_genome.fa', bam_out='example_for.bam', aligner="minimap2") - """ - # set with the name of the unaligned reads : - remaining_reads = set() - total_reads = 0 - # Store path of SAM containing aligned reads at each iteration. - iter_out = [] - - # If there is already a file with the same name as the output file, - # remove it. Otherwise, ignore. - with contextlib.suppress(FileNotFoundError): - try: - os.remove(bam_out) - except IsADirectoryError: - logger.error("You need to give the BAM output file, not a folder.") - raise - - # Bowtie only accepts uncompressed fastq: uncompress it into a temp file - if aligner == "bowtie2" and hio.is_compressed(fq_in): - uncomp_path = join(tmp_dir, os.path.basename(fq_in) + ".tmp") - with hio.read_compressed(fq_in) as inf: - with open(uncomp_path, "w") as uncomp: - st.copyfileobj(inf, uncomp) - else: - uncomp_path = fq_in - - # throw error if index does not exist - index = hio.check_fasta_index(ref, mode=aligner) - if index is None: - logger.error( - f"Reference index is missing, please build the {aligner} index first." - ) - sys.exit(1) - # Counting reads - with hio.read_compressed(uncomp_path) as inf: - for _ in inf: - total_reads += 1 - total_reads /= 4 - - # Use first read to guess read length if not provided. - if read_len is None: - with hio.read_compressed(uncomp_path) as inf: - # Skip first line (read header) - size = inf.readline() - # Stripping newline from sequence line. - read_len = len(inf.readline().rstrip()) - - # initial length of the fragments to align - # In case reads are shorter than provided min_len - if read_len > min_len: - n = min_len - else: - logger.warning( - "min_len is longer than the reads. Iterative mapping will have no effect." - ) - n = read_len - logger.info("{0} reads to parse".format(int(total_reads))) - - first_round = True - # iterative alignment per se - while n <= read_len: - logger.info( - "Truncating unaligned reads to {size}bp and mapping{again}.".format( - size=int(n), again="" if first_round else " again" - ) - ) - iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))] - # Generate a temporary input fastq file with the n first nucleotids - # of the reads. 
- truncated_reads = truncate_reads( - tmp_dir, uncomp_path, remaining_reads, n, first_round - ) - - # Align the truncated reads on reference genome - temp_alignment = join(tmp_dir, "temp_alignment.bam") - map_args = { - "fa": ref, - "cpus": n_cpu, - "fq": truncated_reads, - "idx": index, - "bam": temp_alignment, - } - if re.match(r"^(minimap[2]?|mm[2]?)$", aligner, flags=re.IGNORECASE): - cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(**map_args) - elif re.match(r"^(bwa)$", aligner, flags=re.IGNORECASE): - cmd = "bwa mem -t {cpus} -v 1 {idx} {fq}".format(**map_args) - elif re.match(r"^(bowtie[2]?|bt[2]?)$", aligner, flags=re.IGNORECASE): - cmd = ( - "bowtie2 -x {idx} -p {cpus} --quiet --very-sensitive-local -U {fq}" - ).format(**map_args) - else: - raise ValueError( - "Unknown aligner. Select bowtie2, minimap2 or bwa." - ) - - map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE) - sort_process = sp.Popen( - "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(**map_args), - shell=True, - stdin=map_process.stdout, - ) - out, err = sort_process.communicate() - - # filter the reads: the reads whose truncated end was aligned are written - # to the output file. - # The reads whose truncated end was not aligned are kept for the next round. - remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual) - - n += 20 - first_round = False - - # one last round without trimming - logger.info( - "Trying to map unaligned reads at full length ({0}bp).".format( - int(read_len) - ) - ) - - truncated_reads = truncate_reads( - tmp_dir, - infile=uncomp_path, - unaligned_set=remaining_reads, - trunc_len=n, - first_round=first_round, - ) - if aligner == "minimap2" or aligner == "Minimap2": - cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format( - fa=ref, cpus=n_cpu, fq=truncated_reads - ) - elif aligner == "bwa" or aligner == "Bwa" or aligner == "BWA": - cmd = "bwa mem -v 1 -t {cpus} {idx} {fq}".format( - idx=index, cpus=n_cpu, fq=truncated_reads - ) - else: - cmd = ( - "bowtie2 -x {idx} -p {cpus} --quiet " "--very-sensitive {fq}" - ).format(idx=index, cpus=n_cpu, fq=truncated_reads) - map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE) - # Keep reads sorted by name - sort_process = sp.Popen( - "samtools sort -n -@ {cpus} -O BAM -o {bam}".format( - cpus=n_cpu, bam=temp_alignment - ), - shell=True, - stdin=map_process.stdout, - ) - out, err = sort_process.communicate() - iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))] - remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual) - - # Report unaligned reads as well - iter_out += [join(tmp_dir, "unaligned.bam")] - temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False) - unmapped = ps.AlignmentFile(iter_out[-1], "wb", template=temp_bam) - for r in temp_bam: - # Do not write supplementary alignments (keeping 1 alignment/read) - if r.query_name in remaining_reads and not r.is_supplementary: - unmapped.write(r) - unmapped.close() - temp_bam.close() - - # Merge all aligned reads and unmapped reads into a single bam - ps.merge("-n", "-O", "BAM", "-@", str(n_cpu), bam_out, *iter_out) - logger.info( - "{0} reads aligned / {1} total reads.".format( - int(total_reads - len(remaining_reads)), int(total_reads) - ) - ) - - return 0 - - -def truncate_reads(tmp_dir, infile, unaligned_set, trunc_len, first_round): - """Trim read ends - - Writes the n first nucleotids of each sequence in infile to an auxiliary. - file in the temporary folder. - - Parameters - ---------- - - tmp_dir : str - Path to the temporary folder. 
- infile : str - Path to the fastq file to truncate. - unaligned_set : set - Contains the names of all reads that did not map unambiguously in - previous rounds. - trunc_len : int - The number of basepairs to keep in each truncated sequence. - first_round : bool - If this is the first round, truncate all reads without checking mapping. - - Returns - ------- - - str : - Path to the output fastq file containing truncated reads. - """ - - outfile = "{0}/truncated.fastq".format(tmp_dir) - with ps.FastxFile(infile, "r") as inf, open(outfile, "w") as outf: - for entry in inf: - # If the read did not align in previous round or this is the first round - if (entry.name in unaligned_set) or first_round: - entry.sequence = entry.sequence[:trunc_len] - entry.quality = entry.quality[:trunc_len] - outf.write(str(entry) + "\n") - return outfile - - -def filter_bamfile(temp_alignment, filtered_out, min_qual=30): - """Filter alignment BAM files - - Reads all the reads in the input BAM alignment file. - Write reads to the output file if they are aligned with a good - quality, otherwise add their name in a set to stage them for the next round - of alignment. - - Parameters - ---------- - temp_alignment : str - Path to the input temporary alignment. - outfile : str - Path to the output filtered temporary alignment. - min_qual : int - Minimum mapping quality required to keep a Hi-C pair. - - Returns - ------- - set: - Contains the names reads that did not align. - """ - # Check the quality and status of each aligned fragment. - # Write the ones with good quality in the final output file. - # Keep those that do not map unambiguously for the next round. - - unaligned = set() - temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False) - outf = ps.AlignmentFile(filtered_out, "wb", template=temp_bam) - for r in temp_bam: - if r.flag in [0, 16] and r.mapping_quality >= min_qual: - outf.write(r) - else: - unaligned.add(r.query_name) - - logger.info("{0} reads left to map.".format(len(unaligned))) - temp_bam.close() - outf.close() - - return unaligned +import hicstuff.iteralign as hi if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -349,4 +51,4 @@ if __name__ == "__main__": else: read_len = int(read_len) - iterative_align(fq_in=reads, tmp_dir=tmp_dir, ref=ref, n_cpu=n_cpu, bam_out=bam_out, min_qual=min_qual, read_len=read_len) + hi.iterative_align(fq_in=reads, tmp_dir=tmp_dir, ref=ref, n_cpu=n_cpu, bam_out=bam_out, min_qual=min_qual, read_len=read_len) diff --git a/bin/hicstuff_log.py b/bin/hicstuff_log.py deleted file mode 100644 index 171a5665ed1ea81371dcd39b28cc701f6cc7cb86..0000000000000000000000000000000000000000 --- a/bin/hicstuff_log.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python3 -# coding: utf-8 - -"""Basic logging -Basic logging setup with three main handlers (stdout, log file and optionally -texting with the right API). By default the log file is disabled, but can be -enabled or changed using set_file_handler. 
-""" - - -import logging -import logging.handlers -import requests -import json - -TEXT_CREDENTIALS_DEFAULT_PATH = "text_credentials.json" - -CURRENT_LOG_LEVEL = logging.INFO - - -logging.captureWarnings(True) - -logger = logging.getLogger() -logger.setLevel(CURRENT_LOG_LEVEL) - -logfile_formatter = logging.Formatter( - "%(asctime)s :: %(levelname)s :: %(message)s", datefmt="%Y-%m-%d,%H:%M:%S" -) -stdout_formatter = logging.Formatter("%(levelname)s :: %(message)s") -text_formatter = logging.Formatter("%(message)s") - -# file_handler = logging.FileHandler("hicstuff.log", "a") - -# file_handler.setLevel(logging.INFO) -# file_handler.setFormatter(logfile_formatter) -# logger.addHandler(file_handler) - -stream_handler = logging.StreamHandler() -stream_handler.setLevel(logging.DEBUG) -stream_handler.setFormatter(stdout_formatter) -logger.addHandler(stream_handler) - - -def set_file_handler(log_path, formatter=logfile_formatter): - """Change the file handler for custom log file location""" - - filehandler = logging.FileHandler(log_path, "a") - filehandler.setLevel(logging.INFO) - filehandler.setFormatter(formatter) - for hdlr in logger.handlers[:]: # remove the existing file handlers - if isinstance(hdlr, logging.FileHandler): - logger.removeHandler(hdlr) - logger.addHandler(filehandler) # set the new handler - - -def setup_text_logging(credentials=TEXT_CREDENTIALS_DEFAULT_PATH): - """Setup text logging - Setup text notifications on errors with a basic API. In order for it to - work, a file named 'text_credentials.json' must be in the directory where - a command is run. The logger performs a GET request to a text notification - API. - Parameters - ---------- - credentials : str or pathlib.Path - A JSON file with necessary credentials for the GET request - """ - - with open(credentials) as cred_handle: - cred_dict = json.load(cred_handle) - - api_service = cred_dict.pop("api_service") - - class TextHandler(logging.Handler): - def emit(self, record): - log_entry = self.format(record) - cred_dict["msg"] = log_entry - return requests.get(api_service, cred_dict) - - sms_handler = TextHandler() - sms_handler.setLevel(logging.ERROR) - sms_handler.setFormatter(text_formatter) - - logger.addHandler(sms_handler) - - -try: - setup_text_logging(TEXT_CREDENTIALS_DEFAULT_PATH) -except FileNotFoundError: - pass diff --git a/bin/hicstuff_stats.py b/bin/hicstuff_stats.py deleted file mode 100644 index 7fd1976162025e42db6c14c013473a6890716e24..0000000000000000000000000000000000000000 --- a/bin/hicstuff_stats.py +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import re -import os -from os.path import basename -from os.path import splitext -import json - -def get_pipeline_stats(log_file): - """Get stats after pipeline execution. - - Parameters - ---------- - log_file : str - Path to hicstuff log file. 
- - Returns - ------- - list: - A single list containing stats about hicstuff pipeline execution: - - Number of sequenced pairs from fastqs - - Number (% of total reads) of unmapped reads - - Number (% of total reads) of mapped reads - - Number (% of total reads) of pairs - - Number (% of total pairs) of filtered pairs - - Number (% of total pairs) of loops - - Number (% of total pairs) of uncuts - - Number (% of total pairs) of abnormal (-- and ++) - - Number (% of total pairs) of deduplicated pairs [Number (% of total pairs) of PCR duplicates] - - From all pairs used in contact matrix: - - Number (% of matrix) of cis pairs - - Number (% of matrix) of trans pairs - - Trans ratio - """ - - with open(log_file) as file: - log_lines = [line.rstrip() for line in file] - - # 1. Number of sequenced pairs from fastqs - fastq_pairs = [s for s in log_lines if re.search("reads found in each", s)][0] - fastq_pairs = re.sub(".*INFO :: ", "", fastq_pairs) - fastq_pairs = re.sub(" reads found in each.*", "", fastq_pairs) - fastq_pairs = int(float(fastq_pairs)) - - # 2. Number (% of total) of (un)mapped reads - tot_mapped = [s for s in log_lines if re.search("mapped with Q ", s)][0] - tot_mapped = re.sub(".*Q >= 30 \(", "", tot_mapped) - tot_mapped = re.sub("/.*", "", tot_mapped) - tot_mapped = int(float(tot_mapped)) - tot_unmapped = fastq_pairs*2 - tot_mapped - - # 3. Number (% of total) of pairs - tot_pairs = [s for s in log_lines if re.search("pairs successfully mapped", s)][0] - tot_pairs = re.sub(".*INFO :: ", "", tot_pairs) - tot_pairs = re.sub(" pairs successfully.*", "", tot_pairs) - tot_pairs = int(float(tot_pairs)) - - # 4. Number (% of total) of filtered pairs - filtered_pairs = [s for s in log_lines if re.search("pairs discarded:", s)] - if (len(filtered_pairs) > 0): - filtered_pairs = filtered_pairs[0] - loops_pairs = re.sub(".*Loops: ", "", filtered_pairs) - loops_pairs = re.sub(", Uncuts:.*", "", loops_pairs) - loops_pairs = int(float(loops_pairs)) - uncuts_pairs = re.sub(".*Uncuts: ", "", filtered_pairs) - uncuts_pairs = re.sub(", Weirds:.*", "", uncuts_pairs) - uncuts_pairs = int(float(uncuts_pairs)) - abnormal_pairs = re.sub(".*Weirds: ", "", filtered_pairs) - abnormal_pairs = int(float(abnormal_pairs)) - filtered_pairs = re.sub(".*INFO :: ", "", filtered_pairs) - filtered_pairs = re.sub(" pairs discarded.*", "", filtered_pairs) - filtered_pairs = int(float(filtered_pairs)) - else: - loops_pairs=0 - uncuts_pairs=0 - abnormal_pairs=0 - filtered_pairs=0 - - # 5. Number (% of total) of PCR duplicates pairs - pcr_pairs = [s for s in log_lines if re.search("PCR duplicates have been filtered", s)] - if (len(pcr_pairs) > 0): - pcr_pairs = pcr_pairs[0] - pcr_pairs = re.sub(".*have been filtered out \(", "", pcr_pairs) - pcr_pairs = re.sub(" / .*", "", pcr_pairs) - pcr_pairs = int(float(pcr_pairs)) - else: - pcr_pairs = 0 - - # 6. Number (%) of final pairs - removed_pairs=filtered_pairs+pcr_pairs - final_pairs=tot_pairs-removed_pairs - - # 7. 
Final stats - stats = { - 'Sample': re.sub(".hicstuff.*", "", basename(log_file)), - 'Total read pairs': fastq_pairs, - 'Mapped reads': tot_mapped, - 'Unmapped reads': tot_unmapped, - 'Recovered contacts': tot_pairs, - 'Final contacts': final_pairs, - 'Removed contacts': removed_pairs, - 'Filtered out': filtered_pairs, - 'Loops': loops_pairs, - 'Uncuts': uncuts_pairs, - 'Weirds': abnormal_pairs, - 'PCR duplicates': pcr_pairs - } - - return(stats) - -def write_pipeline_stats(stats, out_file): - prefix = stats['Sample'] - fastq_pairs=stats['Total read pairs'] - tot_mapped=stats['Mapped reads'] - tot_unmapped=stats['Unmapped reads'] - tot_pairs=stats['Recovered contacts'] - final_pairs=stats['Final contacts'] - removed_pairs=stats['Removed contacts'] - filtered_pairs=stats['Filtered out'] - loops_pairs=stats['Loops'] - uncuts_pairs=stats['Uncuts'] - abnormal_pairs=stats['Weirds'] - pcr_pairs=stats['PCR duplicates'] - pct_pairs = round(tot_pairs/fastq_pairs*100, 2) - pct_mapped = round(tot_mapped/(fastq_pairs*2)*100, 2) - pct_unmapped = round(tot_unmapped/(fastq_pairs*2)*100, 2) - pct_filtered = round(filtered_pairs/tot_pairs*100, 2) - pct_loops_pairs = round(loops_pairs/tot_pairs*100, 2) - pct_uncuts_pairs = round(uncuts_pairs/tot_pairs*100, 2) - pct_abnormal_pairs = round(abnormal_pairs/tot_pairs*100, 2) - pct_pcr = round(pcr_pairs/tot_pairs*100, 2) - pct_removed=round(removed_pairs/tot_pairs*100, 2) - pct_final = round(final_pairs/tot_pairs*100, 2) - - # Open the log file and read its contents - _, file_extension = splitext(out_file) - if (file_extension == '.json'): - with open(out_file, 'w') as json_file: - json.dump(stats, json_file, indent=4) - else: - with open(out_file, 'w') as stats_file: - stats_file.write("Sample: {prefix}\n".format(prefix = prefix)) - stats_file.write("Total read pairs: {reads}\n".format(reads = "{:,}".format(fastq_pairs))) - stats_file.write("Mapped reads: {tot_mapped} ({pct_mapped}% of all reads)\n".format( - tot_mapped = "{:,}".format(tot_mapped), - pct_mapped = pct_mapped - ) - ) - stats_file.write("Unmapped reads: {tot_unmapped} ({pct_unmapped}% of all reads)\n".format( - tot_unmapped = "{:,}".format(tot_unmapped), pct_unmapped = pct_unmapped - )) - stats_file.write("Recovered contacts: {tot_pairs} ({pct_pairs}% of all read pairs)\n".format( - tot_pairs = "{:,}".format(tot_pairs), pct_pairs = pct_pairs - )) - stats_file.write("Removed contacts: {removed_pairs} ({pct_removed}% of all contacts)\n".format( - removed_pairs = "{:,}".format(removed_pairs), pct_removed = pct_removed - )) - stats_file.write(" Filtered out: {filtered_pairs} ({pct_filtered}% of all contacts)\n".format( - filtered_pairs = "{:,}".format(filtered_pairs), pct_filtered = pct_filtered - )) - stats_file.write(" Loops: {loops_pairs} ({pct_loops_pairs}% of all contacts)\n".format( - loops_pairs = "{:,}".format(loops_pairs), pct_loops_pairs = pct_loops_pairs - )) - stats_file.write(" Uncuts: {uncuts_pairs} ({pct_uncuts_pairs}% of all contacts)\n".format( - uncuts_pairs = "{:,}".format(uncuts_pairs), pct_uncuts_pairs = pct_uncuts_pairs - )) - stats_file.write(" Weirds: {abnormal_pairs} ({pct_abnormal_pairs}% of all contacts)\n".format( - abnormal_pairs = "{:,}".format(abnormal_pairs), pct_abnormal_pairs = pct_abnormal_pairs - )) - stats_file.write(" PCR duplicates: {pcr_pairs} ({pct_pcr}% of all contacts)\n".format( - pcr_pairs = "{:,}".format(pcr_pairs), pct_pcr = pct_pcr - )) - stats_file.write("Final contacts: {final_pairs} ({pct_final}% of all contacts)\n".format( - final_pairs = 
"{:,}".format(final_pairs), pct_final = pct_final - )) - -def print_pipeline_stats(stats): - tmp = '.'+stats['Sample']+'.txt' - write_pipeline_stats(stats, tmp) - with open(tmp, 'r') as file: - lines = [line for line in file] - print(''.join(lines)) - os.unlink(tmp) diff --git a/bin/hicstuff_version.py b/bin/hicstuff_version.py deleted file mode 100644 index 29e4a9413d67347053bffff8555e4e60ec644686..0000000000000000000000000000000000000000 --- a/bin/hicstuff_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = '3.2.2' diff --git a/modules/local/hicstuff/bam2pairs.nf b/modules/local/hicstuff/bam2pairs.nf index d4f279d6030db15ee66b3cb9d57e048794dac3c0..a12346f0da90086bd43a2caafd634e2292ea52f2 100644 --- a/modules/local/hicstuff/bam2pairs.nf +++ b/modules/local/hicstuff/bam2pairs.nf @@ -2,8 +2,7 @@ process BAM2PAIRS { tag "$meta1.id" label 'process_high' - conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" - container = "docker.io/lbmc/hicstuff:3.1.3" + container = "docker.io/lbmc/hicstuff:3.2.3" input: tuple val(meta1), path(bam1), val(meta2), path(bam2) diff --git a/modules/local/hicstuff/build_matrix.nf b/modules/local/hicstuff/build_matrix.nf index 5351c83d983fd9241b8185dd62d746bce273f453..0f9384dcb2f7a80cdade94c0cad6742c4990d10d 100644 --- a/modules/local/hicstuff/build_matrix.nf +++ b/modules/local/hicstuff/build_matrix.nf @@ -2,8 +2,7 @@ process BUILD_MATRIX { tag "$meta1.id" label 'process_high' - conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" - container = "docker.io/lbmc/hicstuff:3.1.3" + container = "docker.io/lbmc/hicstuff:3.2.3" input: tuple val(meta1), path(idx_pairs) diff --git a/modules/local/hicstuff/build_matrix_cool.nf b/modules/local/hicstuff/build_matrix_cool.nf index b63e931e23cef0bde43f8132df7ecc5de6fe6470..2b8ced046660aa44615d507272426d6b808d985c 100644 --- a/modules/local/hicstuff/build_matrix_cool.nf +++ b/modules/local/hicstuff/build_matrix_cool.nf @@ -2,8 +2,7 @@ process BUILD_MATRIX_COOL { tag "$meta1.id" label 'process_single' - conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" - container = "docker.io/lbmc/hicstuff:3.1.3" + container = "docker.io/lbmc/hicstuff:3.2.3" input: tuple val(meta1), path(idx_pairs) diff --git a/modules/local/hicstuff/cutsite.nf b/modules/local/hicstuff/cutsite.nf index 161eed1a8e7039ed2e416f5d80277125a6feefb9..c17dff7c009b2ac5b9ca6fad8794122551123d28 100644 --- a/modules/local/hicstuff/cutsite.nf +++ b/modules/local/hicstuff/cutsite.nf @@ -2,8 +2,7 @@ process CUTSITE { tag "$meta1.id" label 'process_high' - conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" - container = "docker.io/lbmc/hicstuff:3.1.3" + container = "docker.io/lbmc/hicstuff:3.2.3" input: tuple val(meta1), path(reads1), val(meta2), path(reads2) diff --git a/modules/local/hicstuff/distance_law.nf b/modules/local/hicstuff/distance_law.nf index 9a6ad8e85297314df593eec10f888e3ff1f89e2c..0035be64aedeea1d0b498b25ffcb50532cd86b92 100644 --- a/modules/local/hicstuff/distance_law.nf +++ b/modules/local/hicstuff/distance_law.nf @@ -2,8 +2,7 @@ process DISTANCE_LAW { tag "$meta1.id" label 'process_high' - conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 
conda-forge::pandas=1.5.3" - container = "docker.io/lbmc/hicstuff:3.1.3" + container = "docker.io/lbmc/hicstuff:3.2.3" input: tuple val(meta1), path(idx_pairs) diff --git a/modules/local/hicstuff/filter_event.nf b/modules/local/hicstuff/filter_event.nf index f6b0cde27bcf0ffdc1df0f567c18b44675b56c31..1e9ea836eeffb4218c9cbd36877dd381ef027350 100644 --- a/modules/local/hicstuff/filter_event.nf +++ b/modules/local/hicstuff/filter_event.nf @@ -2,8 +2,7 @@ process FILTER_EVENT { tag "$meta1.id" label 'process_high' - conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" - container = "docker.io/lbmc/hicstuff:3.1.3" + container = "docker.io/lbmc/hicstuff:3.2.3" input: tuple val(meta1), path(idx_pairs) diff --git a/modules/local/hicstuff/filter_pcr.nf b/modules/local/hicstuff/filter_pcr.nf index 0b6800ffa0f28f3b5c9a532f10e1ab23129749d3..0824c79be6f17f78a515bda74d3f5122e87c3f70 100644 --- a/modules/local/hicstuff/filter_pcr.nf +++ b/modules/local/hicstuff/filter_pcr.nf @@ -2,8 +2,7 @@ process FILTER_PCR { tag "$meta1.id" label 'process_high' - conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" - container = "docker.io/lbmc/hicstuff:3.1.3" + container = "docker.io/lbmc/hicstuff:3.2.3" input: tuple val(meta1), path(idx_pairs) diff --git a/modules/local/hicstuff/fragment_enzyme.nf b/modules/local/hicstuff/fragment_enzyme.nf index b6d044a2fda16e73b433e2512f09ef69070b8db8..d8c5a09618405d44ebd81897818320d64b9e74b7 100644 --- a/modules/local/hicstuff/fragment_enzyme.nf +++ b/modules/local/hicstuff/fragment_enzyme.nf @@ -2,8 +2,7 @@ process FRAGMENT_ENZYME { tag "$digestion" label "process_single" - conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" - container = "docker.io/lbmc/hicstuff:3.1.3" + container = "docker.io/lbmc/hicstuff:3.2.3" input: val(digestion) diff --git a/modules/local/hicstuff/iteralign.nf b/modules/local/hicstuff/iteralign.nf index d455a7c1bc940dd0ba20be384e9e54c93a0d45c4..91f1d2930e9ff01fb89257d82e1bb79f0ae229cf 100644 --- a/modules/local/hicstuff/iteralign.nf +++ b/modules/local/hicstuff/iteralign.nf @@ -2,8 +2,7 @@ process ITERALIGN { tag "$meta.id" label "process_high" - conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" - container = "docker.io/lbmc/hicstuff:3.1.3" + container = "docker.io/lbmc/hicstuff:3.2.3" input: tuple val(meta), path(reads)
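# [Illustrative aside] The module updates above drop the per-process conda
# environment and pin every hicstuff step to the docker.io/lbmc/hicstuff:3.2.3
# image, which is expected to ship hicstuff as an installed Python package. The
# bin/ scripts can therefore remain thin argparse shims that delegate to the
# library, as hicstuff_iteralign.py now does via hicstuff.iteralign. A minimal
# sketch of that wrapper pattern (the option names below are assumed for
# illustration, not copied from the actual script):
import argparse

import hicstuff.iteralign as hi

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Thin wrapper delegating to hicstuff.iteralign.iterative_align"
    )
    parser.add_argument("--reads", required=True, help="FASTQ file to align iteratively")
    parser.add_argument("--ref", required=True, help="Reference genome or index basename")
    parser.add_argument("--tmp_dir", default=".", help="Directory for temporary files")
    parser.add_argument("--bam_out", required=True, help="Path of the output BAM file")
    parser.add_argument("--n_cpu", type=int, default=1, help="Number of CPUs")
    parser.add_argument("--min_qual", type=int, default=30, help="Minimum mapping quality")
    args = parser.parse_args()

    # All alignment logic lives in the hicstuff package inside the container;
    # the wrapper only forwards command-line arguments to the library call.
    hi.iterative_align(
        fq_in=args.reads,
        tmp_dir=args.tmp_dir,
        ref=args.ref,
        n_cpu=args.n_cpu,
        bam_out=args.bam_out,
        min_qual=args.min_qual,
    )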