From fae56c14c08fedea0d81b3526d1377f56c794227 Mon Sep 17 00:00:00 2001 From: Mia Croiset <mia.croiset@ens-lyon.fr> Date: Thu, 11 May 2023 15:24:54 +0200 Subject: [PATCH] hicstuff distance law option --- bin/hicstuff_distance_law.py | 839 +++++++++++++++++++++++++ conf/hicstuff.config | 27 +- modules/local/hicstuff/distance_law.nf | 23 + subworkflows/local/hicstuff_sub.nf | 11 + 4 files changed, 899 insertions(+), 1 deletion(-) create mode 100755 bin/hicstuff_distance_law.py create mode 100644 modules/local/hicstuff/distance_law.nf diff --git a/bin/hicstuff_distance_law.py b/bin/hicstuff_distance_law.py new file mode 100755 index 0000000..545c3f7 --- /dev/null +++ b/bin/hicstuff_distance_law.py @@ -0,0 +1,839 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np +import sys +import matplotlib.pyplot as plt +import warnings +from scipy import ndimage +from matplotlib import cm +import pandas as pd +import os as os +import csv as csv +import hicstuff_io as hio +from hicstuff_log import logger +import argparse + +def export_distance_law(xs, ps, names, out_file=None): + """ Export the x(s) and p(s) from two list of numpy.ndarrays to a table + in txt file with three columns separated by a tabulation. The first column + contains the x(s), the second the p(s) and the third the name of the arm or + chromosome. The file is created in the directory given by outdir or the + current file if no file is given. + Parameters + ---------- + xs : list of numpy.ndarray + The list of the start position of logbins of each p(s) in base pairs. + ps : list of numpy.ndarray + The list of p(s). + names : list of string + List containing the names of the chromosomes/arms/conditions of the p(s) + values given. + out_file : str or None + Path where output file should be written. ./distance_law.txt by default. + Return + ------ + txt file: + File with three columns separated by a tabulation. The first column + contains the x(s), the second the p(s) and the third the name of the arm + or chromosome. The file is creates in the output file given or the + default one if none given. + """ + # ./distance_law.txt as out_file if no out_file is given. + if out_file is None: + out_file = os.getcwd() + "/distance_law.txt" + # Sanity check: as many chromosomes/arms as ps + if len(xs) != len(names): + logger.error("Number of chromosomes/arms and number of p(s) list differ.") + sys.exit(1) + # Create the file and write it + f = open(out_file, "w") + for i in range(len(xs)): + for j in range(len(xs[i])): + line = ( + str(format(xs[i][j], "g")) + + "\t" + + str(format(ps[i][j], "g")) + + "\t" + + names[i] + + "\n" + ) + f.write(line) + f.close() + + +def import_distance_law(distance_law_file): + """ Import the table created by export_distance_law and return the list of + x(s) and p(s) in the order of the chromosomes. + Parameters + ---------- + distance_law_file : string + Path to the file containing three columns : the x(s), the p(s), and the + chromosome/arm name. + Returns + ------- + list of numpy.ndarray : + The start coordinate of each bin one array per chromosome or arm. + list of numpy.ndarray : + The distance law probabilities corresponding of the bins of the + previous list. + list of numpy.ndarray : + The names of the arms/chromosomes corresponding to the previous + list. + """ + file = pd.read_csv( + distance_law_file, + sep="\t", + header=None, + dtype={"a": np.int32, "b": np.float32, "c": str}, + ) + names_idx = np.unique(file.iloc[:, 2], return_index=True)[1] + names = [file.iloc[:, 2][index] for index in sorted(names_idx)] + xs = [None] * len(names) + ps = [None] * len(names) + labels = [None] * len(names) + for i in range(len(names)): + subfile = file[file.iloc[:, 2] == names[i]] + xs[i] = np.array(subfile.iloc[:, 0]) + ps[i] = np.array(subfile.iloc[:, 1]) + labels[i] = np.array(subfile.iloc[:, 2]) + return xs, ps, labels + + +def get_chr_segment_bins_index(fragments, centro_file=None, rm_centro=0): + """Get the index positions of the start and end bins of different + chromosomes, or arms if the centromers position have been given from the + fragments file made by hicstuff. + + Parameters + ---------- + fragments : pandas.DataFrame + Table containing in the first coulum the ID of the fragment, in the + second the names of the chromosome in the third and fourth the start + position and the end position of the fragment. The file have no header. + (File like the 'fragments_list.txt' from hicstuff) + centro_file : None or str + None or path to a file with the genomic positions of the centromers + sorted as the chromosomes separated by a space. The file have only one + line. + rm_centro : int + If a value is given, will remove the contacts close the centromeres. + It will remove as many kb as the argument given. Default is zero. + + Returns + ------- + list of floats : + The start and end indices of chromosomes/arms to compute the distance + law on each chromosome/arm separately. + """ + # Get bins where chromosomes start + chr_start_bins = np.where(fragments == 0)[0] + # Create a list of same length for the end of the bins + chr_end_bins = np.zeros(len(chr_start_bins)) + # Get bins where chromsomes end + for i in range(len(chr_start_bins) - 1): + chr_end_bins[i] = chr_start_bins[i + 1] + chr_end_bins[-1] = len(fragments.iloc[:, 0]) + # Combine start and end of bins in a single array. Values are the id of the + # bins + chr_segment_bins = np.sort(np.concatenate((chr_start_bins, chr_end_bins))) + if centro_file is not None: + # Read the file of the centromers + with open(centro_file, "r", newline="") as centro: + centro = csv.reader(centro, delimiter=" ") + centro_pos = next(centro) + # Sanity check: as many chroms as centromeres + if len(chr_start_bins) != len(centro_pos): + logger.warning( + "Number of chromosomes and centromeres differ, centromeres position are not taking into account." + ) + centro_file = None + if centro_file is not None: + # Get bins of centromeres + centro_bins = np.zeros(2 * len(centro_pos)) + for i in range(len(chr_start_bins)): + if (i + 1) < len(chr_start_bins): + subfrags = fragments[chr_start_bins[i] : chr_start_bins[i + 1]] + else: + subfrags = fragments[chr_start_bins[i] :] + # index of last fragment starting before centro in same chrom + centro_bins[2 * i] = chr_start_bins[i] + max( + np.where( + subfrags["start_pos"][:] // (int(centro_pos[i]) - rm_centro) == 0 + )[0] + ) + centro_bins[2 * i + 1] = chr_start_bins[i] + max( + np.where( + subfrags["start_pos"][:] // (int(centro_pos[i]) + rm_centro) == 0 + )[0] + ) + # Combine centro and chrom bins into a single array. Values are the id + # of the bins started and ending the arms. + chr_segment_bins = np.sort( + np.concatenate((chr_start_bins, chr_end_bins, centro_bins)) + ) + return list(chr_segment_bins) + + +def get_chr_segment_length(fragments, chr_segment_bins): + """Compute a list of the length of the different objects (arm or + chromosome) given by chr_segment_bins. + + Parameters + ---------- + fragments : pandas.DataFrame + Table containing in the first coulum the ID of the fragment, in the + second the names of the chromosome in the third and fourth the start + position and the end position of the fragment. The file have no header. + (File like the 'fragments_list.txt' from hicstuff) + chr_segment_bins : list of floats + The start and end indices of chromosomes/arms to compute the distance + law on each chromosome/arm separately. + + Returns + ------- + list of numpy.ndarray: + The length in base pairs of each chromosome or arm. + """ + chr_segment_length = [None] * int(len(chr_segment_bins) / 2) + # Iterate in chr_segment_bins in order to obtain the size of each chromosome/arm + for i in range(len(chr_segment_length)): + # Obtain the size of the chromosome/arm, the if loop is to avoid the + # case of arms where the end position of the last fragments doesn't + # mean the size of arm. If it's the right arm we have to start to count the + # size from the beginning of the arm. + if fragments["start_pos"].iloc[int(chr_segment_bins[2 * i])] == 0: + n = fragments["end_pos"].iloc[int(chr_segment_bins[2 * i + 1]) - 1] + else: + n = ( + fragments["end_pos"].iloc[int(chr_segment_bins[2 * i + 1]) - 1] + - fragments["start_pos"].iloc[int(chr_segment_bins[2 * i])] + ) + chr_segment_length[i] = n + return chr_segment_length + + +def logbins_xs(fragments, chr_segment_length, base=1.1, circular=False): + """Compute the logbins of each chromosome/arm in order to have theme to + compute distance law. At the end you will have bins of increasing with a + logspace with the base of the value given in base. + + Parameters + ---------- + fragments : pandas.DataFrame + Table containing in the first coulum the ID of the fragment, in the + second the names of the chromosome in the third and fourth the start + position and the end position of the fragment. The file have no header. + (File like the 'fragments_list.txt' from hicstuff) + chr_segment_length: list of floats + List of the size in base pairs of the different arms or chromosomes. + base : float + Base use to construct the logspace of the bins, 1.1 by default. + circular : bool + If True, calculate the distance as the chromosome is circular. Default + value is False. + + Returns + ------- + list of numpy.ndarray : + The start coordinate of each bin one array per chromosome or arm. + """ + # Create the xs array and a list of the length of the chromosomes/arms + xs = [None] * len(chr_segment_length) + # Iterate in chr_segment_bins in order to make the logspace + for i in range(len(chr_segment_length)): + n = chr_segment_length[i] + # if the chromosome is circular the mawimum distance between two reads + # are divided by two + if circular: + n /= 2 + n_bins = int(np.log(n) / np.log(base)) + # For each chromosome/arm compute a logspace to have the logbin + # equivalent to the size of the arms and increasing size of bins + xs[i] = np.unique( + np.logspace(0, n_bins, num=n_bins + 1, base=base, dtype=int) + ) + return xs + + +def circular_distance_law(distance, chr_segment_length, chr_bin): + """Recalculate the distance to return the distance in a circular chromosome + and not the distance between the two genomic positions. + Parameters + ---------- + chr_segment_bins : list of floats + The start and end indices of chromosomes/arms to compute the distance + law on each chromosome/arm separately. + chr_segment_length: list of floats + List of the size in base pairs of the different arms or chromosomes. + distance : int + Distance between two fragments with a contact. + Returns + ------- + int : + The real distance in the chromosome circular and not the distance + between two genomic positions + Examples + -------- + >>> circular_distance_law(7500, [2800, 9000], 1) + 1500 + >>> circular_distance_law(1300, [2800, 9000], 0) + 1300 + >>> circular_distance_law(1400, [2800, 9000], 0) + 1400 + """ + chr_len = chr_segment_length[chr_bin] + if distance > chr_len / 2: + distance = chr_len - distance + return distance + + +def get_pairs_distance( + line, fragments, chr_segment_bins, chr_segment_length, xs, ps, circular=False +): + """From a line of a pair reads file, filter -/+ or +/- reads, keep only the + reads in the same chromosome/arm and compute the distance of the the two + fragments. It modify the input ps in order to count or not the line given. + It will add one in the logbin corresponding to the distance. + Parameters + ---------- + line : OrderedDict + Line of a pair reads file with the these keys readID, chr1, pos1, chr2, + pos2, strand1, strand2, frag1, frag2. The values are in a dictionnary. + fragments : pandas.DataFrame + Table containing in the first coulum the ID of the fragment, in the + second the names of the chromosome in the third and fourth the start + position and the end position of the fragment. The file have no header. + (File like the 'fragments_list.txt' from hicstuff) + chr_segment_bins : list of floats + The start and end indices of chromosomes/arms to compute the distance + law on each chromosome/arm separately. + chr_segment_length: list of floats + List of the size in base pairs of the different arms or chromosomes. + xs : list of lists + The start coordinate of each bin one array per chromosome or arm. + ps : list of lists + The sum of contact already count. xs and ps should have the same + dimensions. + circular : bool + If True, calculate the distance as the chromosome is circular. Default + value is False. + """ + # Check this is a pairs_idx file and not simple pairs + if line['frag1'] is None: + logger.error( + 'Input pairs file must have frag1 and frag2 columns. In hicstuff ' + 'pipeline, this is the "valid_idx.pairs" file.' + ) + # We only keep the event +/+ or -/-. This is done to avoid to have any + # event of uncut which are not possible in these events. We can remove the + # good events of +/- or -/+ because we don't need a lot of reads to compute + # the distance law and if we eliminate these reads we do not create others + # biases as they should have the same distribution. + if line["strand1"] == line["strand2"]: + # Find in which chromosome/arm are the fragment 1 and 2. + chr_bin1 = ( + np.searchsorted(chr_segment_bins, int(line["frag1"]), side="right") - 1 + ) + chr_bin2 = ( + np.searchsorted(chr_segment_bins, int(line["frag2"]), side="right") - 1 + ) + # We only keep the reads with the two fragments in the same chromosome + # or arm. + if chr_bin1 == chr_bin2: + # Remove the contacts in the centromeres if centro_remove + if chr_bin1 % 2 == 0: + chr_bin1 = int(chr_bin1 / 2) + # For the reads -/-, the fragments should be religated with both + # their start position (position in the left on the genomic + # sequence, 5'). For the reads +/+ it's the contrary. We compute + # the distance as the distance between the two extremities which + # are religated. + if line["strand1"] == "-": + distance = abs( + np.array(fragments["start_pos"][int(line["frag1"])]) + - np.array(fragments["start_pos"][int(line["frag2"])]) + ) + if line["strand1"] == "+": + distance = abs( + np.array(fragments["end_pos"][int(line["frag1"])]) + - np.array(fragments["end_pos"][int(line["frag2"])]) + ) + if circular: + distance = circular_distance_law( + distance, chr_segment_length, chr_bin1 + ) + xs_temp = xs[chr_bin1][:] + # Find the logbins in which the distance is and add one to the sum + # of contact. + ps_indice = np.searchsorted(xs_temp, distance, side="right") - 1 + ps[chr_bin1][ps_indice] += 1 + + +def get_names(fragments, chr_segment_bins): + """Make a list of the names of the arms or the chromosomes. + Parameters + ---------- + fragments : pandas.DataFrame + Table containing in the first coulum the ID of the fragment, in the + second the names of the chromosome in the third and fourth the start + position and the end position of the fragment. The file have no header. + (File like the 'fragments_list.txt' from hicstuff) + chr_segment_bins : list of floats + The start position of chromosomes/arms to compute the distance law on + each chromosome/arm separately. + Returns + ------- + list of floats : + List of the labels given to the curves. It will be the name of the arms + or chromosomes. + """ + # Get the name of the chromosomes. + chr_names_idx = np.unique(fragments.iloc[:, 1], return_index=True)[1] + chr_names = [fragments.iloc[:, 1][index] for index in sorted(chr_names_idx)] + # Case where they are separate in chromosomes + if len(chr_segment_bins) / 2 != len(chr_names): + names = [] + for chr in chr_names: + names.append(str(chr) + "_left") + names.append(str(chr) + "_rigth") + chr_names = names + return chr_names + + +def get_distance_law( + pairs_reads_file, + fragments_file, + centro_file=None, + base=1.1, + out_file=None, + circular=False, + rm_centro=0, +): + """Compute distance law as a function of the genomic coordinate aka P(s). + Bin length increases exponentially with distance. Works on pairs file + format from 4D Nucleome Omics Data Standards Working Group. If the genome + is composed of several chromosomes and you want to compute the arms + separately, provide a file with the positions of centromers. Create a file + with three coulumns separated by a tabulation. The first column contains + the xs, the second the ps and the third the name of the arm or chromosome. + The file is create in the directory given in outdir or in the current + directory if no directory given. + Parameters + ---------- + pairs_reads_file : string + Path of a pairs file format from 4D Nucleome Omics Data Standards + Working Group with the 8th and 9th coulumns are the ID of the fragments + of the reads 1 and 2. + fragments_file : path + Path of a table containing in the first column the ID of the fragment, + in the second the names of the chromosome in the third and fourth + the start position and the end position of the fragment. The file have + no header. (File like the 'fragments_list.txt' from hicstuff) + centro_file : None or str + None or path to a file with the genomic positions of the centromers + sorted as the chromosomes separated by a space. The file have only one + line. + base : float + Base use to construct the logspace of the bins - 1.1 by default. + out_file : None or str + Path of the output file. If no path given, the output is returned. + circular : bool + If True, calculate the distance as the chromosome is circular. Default + value is False. Cannot be True if centro_file is not None + rm_centro : int + If a value is given, will remove the contacts close the centromeres. + It will remove as many kb as the argument given. Default is None. + Returns + ------- + xs : list of numpy.ndarray + Basepair coordinates of log bins used to compute distance law. + ps : list of numpy.ndarray + Contacts value, in arbitrary units, at increasingly long genomic ranges + given by xs. + names : list of strings + Names of chromosomes that are plotted + """ + # Sanity check : centro_fileition should be None if chromosomes are + # circulars (no centromeres is circular chromosomes). + if circular and centro_file != None: + logger.error("Chromosomes cannot have a centromere and be circular") + sys.exit(1) + # Import third columns of fragments file + fragments = pd.read_csv(fragments_file, sep="\t", header=0, usecols=[0, 1, 2, 3]) + # Calculate the indice of the bins to separate into chromosomes/arms + chr_segment_bins = get_chr_segment_bins_index(fragments, centro_file, rm_centro) + # Calculate the length of each chromosoms/arms + chr_segment_length = get_chr_segment_length(fragments, chr_segment_bins) + xs = logbins_xs(fragments, chr_segment_length, base, circular) + # Create the list of p(s) with one array for each chromosome/arm and each + # array contain as many values as in the logbin + ps = [None] * len(chr_segment_length) + for i in range(len(xs)): + ps[i] = [0] * len(xs[i]) + # Read the pair reads file + with open(pairs_reads_file, "r", newline="") as reads: + # Remove the line of the header + header_length = len(hio.get_pairs_header(pairs_reads_file)) + for i in range(header_length): + next(reads) + # Reads all the others lines and put the values in a dictionnary with + # the keys : 'readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', + # 'strand2', 'frag1', 'frag2' + reader = csv.DictReader( + reads, + fieldnames=[ + "readID", + "chr1", + "pos1", + "chr2", + "pos2", + "strand1", + "strand2", + "frag1", + "frag2", + ], + delimiter="\t", + ) + for line in reader: + # Iterate in each line of the file after the header + get_pairs_distance( + line, fragments, chr_segment_bins, chr_segment_length, xs, ps, circular + ) + # Divide the number of contacts by the area of the logbin + for i in range(len(xs)): + n = chr_segment_length[i] + for j in range(len(xs[i]) - 1): + # Use the area of a trapezium to know the area of the logbin with n + # the size of the matrix. + ps[i][j] /= ((2 * n - xs[i][j + 1] - xs[i][j]) / 2) * ( + (1 / np.sqrt(2)) * (xs[i][j + 1] - xs[i][j]) + ) + # print( + # ((2 * n - xs[i][j + 1] - xs[i][j]) / 2) + # * ((1 / np.sqrt(2)) * (xs[i][j + 1] - xs[i][j])) + # ) + # Case of the last logbin which is an isosceles rectangle triangle + # print(ps[i][-5:-1], ((n - xs[i][-1]) ** 2) / 2) + ps[i][-1] /= ((n - xs[i][-1]) ** 2) / 2 + names = get_names(fragments, chr_segment_bins) + if out_file: + export_distance_law(xs, ps, names, out_file) + return xs, ps, names + + +def normalize_distance_law(xs, ps, inf=3000, sup=None): + """Normalize the distance in order to have the sum of the ps values between + 'inf' (default value is 3kb) until the end of the array equal to one and + limit the effect of coverage between two conditions/chromosomes/arms when + you compare them together. If we have a list of ps, it will normalize until + the length of the shorter object or the value of sup, whichever is smaller. + Parameters + ---------- + xs : list of numpy.ndarray + list of logbins corresponding to the ps. + ps : list of numpy.ndarray + Average ps or list of ps of the chromosomes/arms. xs and ps have to + have the same shape. + inf : integer + Inferior value of the interval on which, the normalization is applied. + sup : integer + Superior value of the interval on which, the normalization is applied. + Returns + ------- + list of numpy.ndarray : + List of ps each normalized separately. + """ + # Sanity check: xs and ps have the same dimension + if np.shape(xs) != np.shape(ps): + logger.error("xs and ps should have the same dimension.") + sys.exit(1) + # Define the length of shortest chromosomes as a lower bound for the sup boundary + min_xs = len(min(xs, key=len)) + normed_ps = [None] * len(ps) + if sup is None: + sup = np.inf + for chrom_id, chrom_ps in enumerate(ps): + # Iterate on the different ps to normalize each of theme separately + chrom_sum = 0 + # Change the last value to have something continuous because the last + # one is much bigger (computed on matrix corner = triangle instead of trapezoid). + chrom_ps[-1] = chrom_ps[-2] + for bin_id, bin_value in enumerate(chrom_ps): + # Compute normalization factor based on values between inf and sup + # Sup will be whatever is smaller between user-provided sup and length of + # the shortest chromosome + if (xs[chrom_id][bin_id] > inf) and (xs[chrom_id][bin_id] < sup) and (bin_id < min_xs): + chrom_sum += bin_value + if chrom_sum == 0: + chrom_sum += 1 + logger.warning("No values of p(s) in one segment") + # Make the normalisation + normed_ps[chrom_id] = np.array(ps[chrom_id]) / chrom_sum + return normed_ps + + +def average_distance_law(xs, ps, sup, big_arm_only=False): + """Compute the average distance law between the file the different distance + law of the chromosomes/arms. + Parameters + ---------- + xs : list of numpy.ndarray + The list of logbins. + ps : list of lists of floats + The list of numpy.ndarray. + sup : int + Value given to set the minimum size of the chromosomes/arms to make the + average. + big_arm_only : bool + By default False. If True, will only take into account the arms/chromosomes + longer than the value of sup. Sup mandatory if set. + Returns + ------- + numpy.ndarray : + List of the xs with the max length. + numpy.ndarray : + List of the average_ps. + """ + # Find longest chromosome / arm and make two arrays of this length for the + # average distance law and remove the last value. + xs = max(xs, key=len) + max_length = len(xs) + ps_values = np.zeros(max_length) + ps_occur = np.zeros(max_length) + for chrom_ps in ps: + # Iterate on ps in order to calculate the number of occurences (all the + # chromossomes/arms are not as long as the longest one) and the sum of + # the values of distance law. + # Change the last value to have something continuous because the last + # one is much bigger. + chrom_ps[-1] = chrom_ps[-2] + # Sanity check : sup strictly inferior to maw length arms. + if big_arm_only: + if sup >= xs[-1]: + logger.error( + "sup have to be inferior to the max length of arms/chromsomes if big arm only set" + ) + sys.exit(1) + if sup <= xs[len(chrom_ps) - 1]: + ps_occur[: len(chrom_ps)] += 1 + ps_values[: len(chrom_ps)] += chrom_ps + else: + ps_occur[: len(chrom_ps)] += 1 + ps_values[: len(chrom_ps)] += chrom_ps + # Make the mean + averaged_ps = ps_values / ps_occur + return xs, averaged_ps + + +def slope_distance_law(xs, ps): + """Compute the slope of the loglog curve of the ps as the + [log(ps(n+1)) - log(ps(n))] / [log(n+1) - log(n)]. + Compute only list of ps, not list of array. + Parameters + ---------- + xs : list of numpy.ndarray + The list of logbins. + ps : list of numpy.ndarray + The list of ps. + Returns + ------- + list of numpy.ndarray : + The slope of the distance law. It will be shorter of one value than the + ps given initially. + """ + slope = [None] * len(ps) + for i in range(len(ps)): + ps[i][ps[i] == 0] = 10 ** (-9) + # Compute the slope + slope_temp = np.log(np.array(ps[i][1:]) / np.array(ps[i][:-1])) / np.log( + np.array(xs[i][1:]) / np.array(xs[i][:-1]) + ) + # The 1.8 is the intensity of the normalisation, it could be adapted. + slope_temp[slope_temp == np.nan] = 10 ** (-15) + slope[i] = ndimage.filters.gaussian_filter1d(slope_temp, 1.8) + return slope + + +def get_ylim(xs, curve, inf, sup): + """Compute the max and the min of the list of list between the inferior + (inf) and superior (sup) bounds. + Parameters + ---------- + xs : list of numpy.ndarray + The list of the logbins starting position in basepair. + curve : list of numpy.ndarray + A list of numpy.ndarray from which you want to extract minimum and + maximum values in a given interval. + inf : int + Inferior limit of the interval in basepair. + sup : int + Superior limit of the interval in basepair. + Returns + ------- + min_tot : float + Minimum value of the list of list in this interval. + max_tot : float + Maximum value of the list of list in this interval. + Examples + -------- + >>> get_ylim([np.array([1, 4, 15]), np.array([1, 4, 15, 40])], + ... [np.array([5.5, 3.2, 17.10]), np.array([24, 32, 1.111, 18.5])], + ... 2, + ... 15 + ... ) + (1.111, 32.0) + """ + # Create a list in order to add all the interesting values + flatten_list = [] + for i, logbins in enumerate(xs): + # Iterate on xs. + # Search for the minimum index corresponding to the smallest bin + # superior or equal to inf (in pair base). + min_value = min(logbins[logbins >= inf], default=0) + min_index = np.where(logbins == min_value)[0] + # Skip chromosome if total size smaller than inf + if len(min_index) == 0: + continue + # Search for the maximum index corresponding to the biggest bin + # inferior or equal to sup (in pair base). + max_value = max(logbins[logbins <= sup]) + max_index = np.where(logbins == max_value)[0] + # Add the values in the interval in the flattened list. + if int(max_index) != len(logbins) - 1: + max_index += 1 + for j in range(int(min_index), int(max_index)): + flatten_list.append(curve[i][j]) + # Caluclate the min and the max of this list. + min_tot = min(flatten_list) + max_tot = max(flatten_list) + return min_tot, max_tot + + +def plot_ps_slope(xs, ps, labels, fig_path=None, inf=3000, sup=None): + """Compute two plots, one with the different distance law of each + arm/chromosome in loglog scale and one with the slope (derivative) of these + curves. Generate a svg file with savefig. + Parameters + ---------- + xs : list of numpy.ndarray + The list of the logbins of each ps. + ps : list of numpy.ndarray + The list of ps. + labels_file : list of strings + Names of the different curves in the order in which they are given. + fig_path : str + Path where the figure will be created. If None (default), the plot is + shown in an interactive window. + inf : int + Value of the mimimum x of the window of the plot. Have to be strictly + positive. By default 3000. + sup : int + Value of the maximum x of the window of the plot. By default None. + + """ + # Give the max value for sup if no value have been attributed + if sup is None: + sup = max(max(xs, key=len)) + + # Compute slopes from the curves + slope = slope_distance_law(xs, ps) + # Make the plot of distance law + # Give a range of color + cols = iter(cm.rainbow(np.linspace(0, 1, len(ps)))) + fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, figsize=(18, 10)) + plt.subplots_adjust(left=0.05, right=0.85, top=0.93, bottom=0.07) + ax1.set_xlabel("Distance (pb)", fontsize="x-large") + ax1.set_ylabel("P(s)", fontsize="x-large") + ax1.set_title("Distance law", fontsize="xx-large") + ylim = get_ylim(xs, ps, inf, sup) + ax1.set_ylim(0.9 * ylim[0], 1.1 * ylim[1]) + for i in range(len(ps)): + # Iterate on the different distance law array and take them by order of + # size in order to have the color scale equivalent to the size scale + col = next(cols) + ax1.loglog(xs[i], ps[i], label=labels[i]) + # Make the same plot with the slope + cols = iter(cm.rainbow(np.linspace(0, 1, len(slope)))) + ax2.set_xlabel("Distance (pb)", fontsize="x-large") + ax2.set_ylabel("Slope", fontsize="x-large") + ax2.set_title("Slope of the distance law", fontsize="xx-large") + ax2.set_xlim([inf, sup]) + ylim = get_ylim(xs, slope, inf, sup) + ax2.set_ylim(1.1 * ylim[0], 0.9 * ylim[1]) + xs2 = [None] * len(xs) + for i in range(len(slope)): + xs2[i] = xs[i][:-1] + col = next(cols) + ax2.semilogx(xs2[i], slope[i], label=labels[i], subs=[2, 3, 4, 5, 6, 7, 8, 9]) + ax2.legend(loc="upper left", bbox_to_anchor=(1.02, 1.00), ncol=1, fontsize="large") + # Save the figure in svg + if fig_path is not None: + plt.savefig(fig_path) + else: + plt.show() + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input") + parser.add_argument("-f", "--fragment") + parser.add_argument("-c", "--centro_file") + parser.add_argument("-b", "--base") + parser.add_argument("-p", "--prefix") + parser.add_argument("-o", "--out_file") + parser.add_argument("-u", "--circular") + parser.add_argument("-r", "--rm_centro") + parser.add_argument("-l", "--plot") + parser.add_argument("-d", '--dist_plot') + args = parser.parse_args() + + input = args.input + fragment = args.fragment + prefix = args.prefix + centro_file = args.centro_file + if centro_file == "None": + centro_file = None + else: + centro_file = prefix+"_"+centro_file + base = float(args.base) + out_file = prefix+"_"+args.out_file + circular = args.circular + if circular.lower() == "false": + circular = False + elif circular.lower() == "true": + circular = True + rm_centro = args.rm_centro + if rm_centro == "None": + rm_centro = 0 + rm_centro = int(rm_centro) + plot = args.plot + if plot.lower() == "false": + plot = False + elif plot.lower() == "true": + plot = True + distance_law_plot = args.dist_plot + if distance_law_plot == "None": + distance_law_plot = None + else: + distance_law_plot = prefix+"_"+distance_law_plot + + x_s, p_s, _ = get_distance_law( + input, + fragment, + centro_file, + base, + out_file, + circular, + rm_centro + ) + + if plot: + # Retrieve chrom labels from distance law file + _, _, chr_labels = import_distance_law(out_file) + chr_labels = [lab[0] for lab in chr_labels] + chr_labels_idx = np.unique(chr_labels, return_index=True)[1] + chr_labels = [chr_labels[index] for index in sorted(chr_labels_idx)] + p_s = normalize_distance_law(x_s, p_s) + plot_ps_slope(x_s, p_s, labels=chr_labels, fig_path=distance_law_plot) diff --git a/conf/hicstuff.config b/conf/hicstuff.config index 44f2782..d7cae49 100644 --- a/conf/hicstuff.config +++ b/conf/hicstuff.config @@ -121,7 +121,7 @@ params { // Hicstuff hicstuff_bwt2_align_opts = '--very-sensitive-local' hicstuff_min_size = 0 - hicstuff_circular = 'frue' + hicstuff_circular = 'false' hicstuff_output_contigs = 'info_contigs.txt' hicstuff_output_frags = 'fragments_list.txt' hicstuff_frags_plot = 'false' @@ -135,9 +135,18 @@ params { hicstuff_plot_events = 'false' hicstuff_pie_plot = 'distrib' hicstuff_dist_plot = 'dist' + hicstuff_centro_file = 'None' //give absolute path or 'None' + hicstuff_base = 1.1 + hicstuff_distance_out_file = 'distance_law.txt' + hicstuff_rm_centro = 'None' //int or 'None' + hicstuff_distance_plot = 'false' + hicstuff_distance_out_plot = 'plot_distance_law.pdf' //Hicstuff optional modules filter_event = true + distance_law = true + filter_pcr = true + filter_pcr_picard = false } process { @@ -198,6 +207,22 @@ process { ] } + withName: 'DISTANCE_LAW' { + ext.args = { [ + " -c ${params.hicstuff_centro_file}", + " -b ${params.hicstuff_base}", + " -o ${params.hicstuff_distance_out_file}", + " -u ${params.hicstuff_circular}", + " -r ${params.hicstuff_rm_centro}", + " -l ${params.hicstuff_distance_plot}", + " -d ${params.hicstuff_distance_out_plot}" + ].join('').trim() } + publishDir = [ + path: { "${params.outdir}/hicstuff/pairs" }, + mode: 'copy' + ] + } + withName: 'BUILD_MATRIX' { ext.args = params.hicstuff_matrix publishDir = [ diff --git a/modules/local/hicstuff/distance_law.nf b/modules/local/hicstuff/distance_law.nf new file mode 100644 index 0000000..c87112c --- /dev/null +++ b/modules/local/hicstuff/distance_law.nf @@ -0,0 +1,23 @@ +process DISTANCE_LAW { + tag "$meta1.id" + label 'process_high' + + conda "conda-forge::python=3.9 conda-forge::biopython=1.80 conda-forge::numpy=1.22.3 conda-forge::matplotlib=3.6.3 conda-forge::pandas=1.5.3" + container = "lbmc/hicstuff:3.1.3" + + input: + tuple val(meta1), path(idx_pairs) + tuple val(meta), path(fragments_list) + + output: + path("*distance_law.txt") + path("*.pdf"), optional: true + + script: + + def args = task.ext.args ?: '' + + """ + hicstuff_distance_law.py -i ${idx_pairs} -f ${fragments_list} -p ${meta1.id} ${args} + """ +} diff --git a/subworkflows/local/hicstuff_sub.nf b/subworkflows/local/hicstuff_sub.nf index e74e682..048c084 100644 --- a/subworkflows/local/hicstuff_sub.nf +++ b/subworkflows/local/hicstuff_sub.nf @@ -6,6 +6,8 @@ include { BUILD_MATRIX } from '../../modules/local/hicstuff/build_matrix' include { BUILD_MATRIX_COOL } from '../../modules/local/hicstuff/build_matrix_cool' include { BUILD_MATRIX_COOL_ALT } from '../../modules/local/hicstuff/build_matrix_cool_alt' include { FILTER_EVENT } from '../../modules/local/hicstuff/filter_event' +include { DISTANCE_LAW } from '../../modules/local/hicstuff/distance_law' +include { FILTER_PCR } from '../../modules/local/hicstuff/filter_pcr' // Paired-end to Single-end def pairToSingle(row, mates) { @@ -73,6 +75,15 @@ workflow HICSTUFF_SUB { FILTER_EVENT.out.idx_pairs_filtered.set{ ch_idx_pairs } } + if ( params.distance_law ){ + DISTANCE_LAW( + ch_idx_pairs, + FRAGMENT_ENZYME.out.fragments_list.collect() + ) + } + + + //TODO rajouter filtres + distance law + filtres PCR en options // pour les PCR filter, soit le hicstuff soit PICARD -- GitLab