diff --git a/README.md b/README.md index ce231bc2d9517a8db52b47206db25ea3a71370f9..cf702bf337b100b48ab98405e67cb3b1e7c855d9 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,9 @@ https://img.shields.io/badge/singularity-available-7E4C74.svg) ### Introduction This pipeline is based on the [HiC-Pro workflow](https://github.com/nservant/HiC-Pro). -It was designed to process Hi-C data from raw fastq files (paired-end Illumina data) to normalized contact maps. The current version supports digestion protocols. -Support for other protocols is ongoing. +It was designed to process Hi-C data from raw fastq files (paired-end Illumina data) to normalized contact maps. +The current version supports most protocols, including digestion protocols as well as protocols that do not require restriction enzymes such as DNase Hi-C. +In practice, this workflow was successfully applied to many data-sets including dilution Hi-C, in situ Hi-C, DNase Hi-C, Micro-C, capture-C, capture Hi-C or HiChip data. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker / singularity containers making installation trivial and results highly reproducible. diff --git a/bin/mapped_2hic_dnase.py b/bin/mapped_2hic_dnase.py new file mode 100755 index 0000000000000000000000000000000000000000..36c5a605d0001de3775bb70e7934d06be7145797 --- /dev/null +++ b/bin/mapped_2hic_dnase.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python + +# HiC-Pro +# Copyleft 2015 Institut Curie +# Author(s): Nicolas Servant, Eric Viara +# Contact: nicolas.servant@curie.fr +# This software is distributed without any guarantee under the terms of the +# GNU General +# Public License, either Version 2, June 1991 or Version 3, June 2007. + +""" +Script to keep only valid pairs when no restriction enzyme are used (i.e. 
DNAse or Micro-HiC) +""" + +import getopt +import sys +import os +import re +import pysam + + +def usage(): + """Print the command line help""" + print "Usage : python mapped_2hic_dnase.py" + print "-r/--mappedReadsFile <BAM/SAM file of mapped reads>" + print "[-o/--outputDir] <Output directory. Default is current directory>" + print "[-d/--minCisDist] <Minimum distance between intrachromosomal contact to consider>" + print "[-g/--gtag] <Genotype tag. If specified, this tag will be reported in the valid pairs output for allele specific classification>" + print "[-a/--all] <Write all additional output files, with information about the discarded reads (self-circle, dangling end, etc.)>" + print "[-v/--verbose] <Verbose>" + print "[-h/--help] <Help>" + return + + +def get_args(): + """Parse the command line; return the list of (option, value) pairs, or print the usage and exit on an invalid option""" + try: + opts, args = getopt.getopt( + sys.argv[1:], + "r:o:d:g:avh", + ["mappedReadsFile=", + "outputDir=", "minCisDist=", "gtag=", "all", "verbose", "help"]) + except getopt.GetoptError: + usage() + sys.exit(-1) + return opts + + +def get_read_strand(read): + """ + Return the strand of a read: "+" (forward) or "-" (reverse) + + Parameters + ---------- + read : AlignedRead + aligned read; only its is_reverse flag is used + """ + strand = "+" + if read.is_reverse: + strand = "-" + return strand + + +def get_read_pos(read, st="start"): + """ + Return the read position (zero-based) used for the intersection with + the restriction fragment + + The 5' end is not a good choice for the reverse reads (which contain part + of the restriction site, and thus overlap the next restriction fragment) + Using the left-most position (5' for forward, 3' for reverse) or the + middle of the read should work but the middle of the reads might be more + safe + + Parameters + ----------- + read : AlignedRead + st : "start" (5' end, the default), "middle" or "left" (left-most position) + """ + if st == "middle": + pos = read.pos + int(read.alen/2) + elif st =="start": + pos = get_read_start(read) + elif st == "left": + pos = read.pos + + return pos + + +def get_read_start(read): + """ + Return the 5' end 
of the read + """ + if read.is_reverse: + pos = read.pos + read.alen -1 + else: + pos = read.pos + return pos + + +def get_ordered_reads(read1, read2): + """ + Reorient reads + + The sequencing is usually not oriented. Reorient the reads so that r1 is + always before r2 + + read1 = [AlignedRead] + read2 = [AlignedRead] + """ + if read1.tid == read2.tid: + if get_read_pos(read1) < get_read_pos(read2): + r1 = read1 + r2 = read2 + else: + r1 = read2 + r2 = read1 + else: + if read1.tid < read2.tid: + r1 = read1 + r2 = read2 + else: + r1 = read2 + r2 = read1 + + return r1, r2 + + +def isIntraChrom(read1, read2): + """ + Return True if both reads are on the same reference (same tid) + + read1 : [AlignedRead] + read2 : [AlignedRead] + + """ + if read1.tid == read2.tid: + return True + else: + return False + + +def get_valid_orientation(read1, read2): + """ + Return the orientation of the ordered read pair: "FF", "RR", "FR" or "RF" + + Check the orientation of reads ->-> / <-<- / -><- / <--> + + read1 : [AlignedRead] + read2 : [AlignedRead] + + """ + # Get oriented reads + r1, r2 = get_ordered_reads(read1, read2) + + direction = None + if get_read_strand(r1) == "+" and get_read_strand(r2) == "+": + direction = "FF" + elif get_read_strand(r1) == "-" and get_read_strand(r2) == "-": + direction = "RR" + elif get_read_strand(r1) == "+" and get_read_strand(r2) == "-": + direction = "FR" + elif get_read_strand(r1) == "-" and get_read_strand(r2) == "+": + direction = "RF" + + return direction + + +def get_cis_dist(read1, read2): + """ + Return the distance between the positions of two reads mapped on the same chromosome, None otherwise + + read1 : [AlignedRead] + read2 : [AlignedRead] + + """ + # abs() makes the distance symmetric, so the reads do not need to be ordered + # Use the function's own arguments: unmapped reads have no position to measure from + dist = None + if not read1.is_unmapped and not read2.is_unmapped: + ## Contact distances can be calculated for intrachromosomal reads only + if isIntraChrom(read1, read2): + r1pos = get_read_pos(read1) + r2pos = get_read_pos(read2) + dist = abs(r1pos - r2pos) + return dist + + +def get_read_tag(read, 
tag): + for t in read.tags: + if t[0] == tag: + return t[1] + return None + + +if __name__ == "__main__": + # Read command line arguments + opts = get_args() + verbose = False + allOutput = False + mappedReadsFile = None + minDist = None + outputDir = "." + gtag = None + + for opt, arg in opts: + if opt in ("-h", "--help"): + usage() + sys.exit() + elif opt in ("-r", "--mappedReadsFile"): + mappedReadsFile = arg + elif opt in ("-o", "--outputDir"): + outputDir = arg + elif opt in ("-d", "--minCisDist"): + minDist = arg + elif opt in ("-g", "--gtag"): + gtag = arg + elif opt in ("-a", "--all"): + allOutput = True + elif opt in ("-v", "--verbose"): + verbose = True + else: + assert False, "unhandled option" + + # -r/--mappedReadsFile is mandatory; an empty command line still exits with status 0 + if mappedReadsFile is None: + usage() + sys.exit(0 if len(opts) == 0 else -1) + + # Verbose mode + if verbose: + print "## mapped_2hic_dnase.py" + print "## mappedReadsFile=", mappedReadsFile + print "## minCisDist=", minDist + print "## allOutput=", allOutput + print "## verbose=", verbose, "\n" + + # Initialize variables + reads_counter = 0 + valid_counter = 0 + valid_counter_FF = 0 + valid_counter_RR = 0 + valid_counter_FR = 0 + valid_counter_RF = 0 + single_counter = 0 + dump_counter = 0 + filt_counter = 0 + + # AS counter + G1G1_ascounter = 0 + G2G2_ascounter = 0 + G1U_ascounter = 0 + UG1_ascounter = 0 + G2U_ascounter = 0 + UG2_ascounter = 0 + G1G2_ascounter = 0 + G2G1_ascounter = 0 + UU_ascounter = 0 + CF_ascounter = 0 + + baseReadsFile = os.path.basename(mappedReadsFile) + baseReadsFile = re.sub(r'\.bam$|\.sam$', '', baseReadsFile) + + # Open handlers for output files + handle_valid = open(outputDir + '/' + baseReadsFile + '.validPairs', 'w') + + if allOutput: + handle_dump = open(outputDir + '/' + baseReadsFile + '.DumpPairs', 'w') + handle_single = open(outputDir + '/' + baseReadsFile + '.SinglePairs','w') + handle_filt = open(outputDir + '/' + baseReadsFile + '.FiltPairs','w') + + # Read the SAM/BAM file + if verbose: + print "## Opening SAM/BAM file '", 
mappedReadsFile, "'..." + samfile = pysam.Samfile(mappedReadsFile, "rb") + + # Reads are 0-based too (for both SAM and BAM format) + # Loop on all reads + for read in samfile.fetch(until_eof=True): + reads_counter += 1 + cur_handler = None + interactionType = None + htag = "" + + # First mate + if read.is_read1: + r1 = read + if not r1.is_unmapped: + r1_chrom = samfile.getrname(r1.tid) + else: + r1_chrom = None + + # Second mate + elif read.is_read2: + r2 = read + if not r2.is_unmapped: + r2_chrom = samfile.getrname(r2.tid) + else: + r2_chrom = None + + if isIntraChrom(r1,r2): + dist = get_cis_dist(r1, r2) + else: + dist = None + + # Check singleton + if r1.is_unmapped or r2.is_unmapped: + interactionType = "SI" + single_counter += 1 + cur_handler = handle_single if allOutput else None + + # Check Distance criteria - Filter + if (minDist is not None and dist is not None and dist < int(minDist)): + interactionType = "FILT" + filt_counter += 1 + cur_handler = handle_filt if allOutput else None + + # By default pair is valid + if interactionType == None: + interactionType = "VI" + valid_counter += 1 + cur_handler = handle_valid + validType = get_valid_orientation(r1, r2) + if validType == "RR": + valid_counter_RR += 1 + elif validType == "FF": + valid_counter_FF += 1 + elif validType == "FR": + valid_counter_FR += 1 + elif validType == "RF": + valid_counter_RF += 1 + else: + interactionType = "DUMP" + dump_counter += 1 + cur_handler = handle_dump if allOutput else None + + + + # Split valid pairs based on XA tag + if gtag is not None: + r1as = get_read_tag(r1, gtag) + r2as = get_read_tag(r2, gtag) + + if r1as == 1 and r2as == 1: + G1G1_ascounter += 1 + elif r1as == 2 and r2as == 2: + G2G2_ascounter += 1 + elif r1as == 1 and r2as == 0: + G1U_ascounter += 1 + elif r1as == 0 and r2as == 1: + UG1_ascounter += 1 + elif r1as == 2 and r2as == 0: + G2U_ascounter += 1 + elif r1as == 0 and r2as == 2: + UG2_ascounter += 1 + elif r1as == 1 and r2as == 2: + G1G2_ascounter += 1 + 
elif r1as == 2 and r2as == 1: + G2G1_ascounter += 1 + elif r1as == 3 or r2as == 3: + CF_ascounter += 1 + else: + UU_ascounter += 1 + + + if cur_handler is not None: + if not r1.is_unmapped and not r2.is_unmapped: + + ##reorient reads to ease duplicates removal + or1, or2 = get_ordered_reads(r1, r2) + or1_chrom = samfile.getrname(or1.tid) + or2_chrom = samfile.getrname(or2.tid) + + ##reset as tag now that the reads are oriented + r1as = get_read_tag(or1, gtag) + r2as = get_read_tag(or2, gtag) + if gtag is not None: + htag = str(r1as)+"-"+str(r2as) + + cur_handler.write( + or1.qname + "\t" + + or1_chrom + "\t" + + str(get_read_pos(or1)+1) + "\t" + + str(get_read_strand(or1)) + "\t" + + or2_chrom + "\t" + + str(get_read_pos(or2)+1) + "\t" + + str(get_read_strand(or2)) + "\t" + + "NA" + "\t" + ##dist + "NA" + "\t" + ##resfrag1 + "NA" + "\t" + ##resfrag2 + str(or1.mapping_quality) + "\t" + + str(or2.mapping_quality) + "\t" + + str(htag) + "\n") + + elif r2.is_unmapped and not r1.is_unmapped: + cur_handler.write( + r1.qname + "\t" + + r1_chrom + "\t" + + str(get_read_pos(r1)+1) + "\t" + + str(get_read_strand(r1)) + "\t" + + "*" + "\t" + + "*" + "\t" + + "*" + "\t" + + "*" + "\t" + + "*" + "\t" + + "*" + "\t" + + str(r1.mapping_quality) + "\t" + + "*" + "\n") + elif r1.is_unmapped and not r2.is_unmapped: + cur_handler.write( + r2.qname + "\t" + + "*" + "\t" + + "*" + "\t" + + "*" + "\t" + + r2_chrom + "\t" + + str(get_read_pos(r2)+1) + "\t" + + str(get_read_strand(r2)) + "\t" + + "*" + "\t" + + "*" + "\t" + + "*" + "\t" + + "*" + "\t" + + str(r2.mapping_quality) + "\n") + + if (reads_counter % 100000 == 0 and verbose): + print "##", reads_counter + + # Close handler + handle_valid.close() + if allOutput: + handle_dump.close() + handle_single.close() + handle_filt.close() + + # Write stats file + handle_stat = open(outputDir + '/' + baseReadsFile + '.RSstat', 'w') + handle_stat.write("## Hi-C processing - no restriction fragments\n") + 
handle_stat.write("Valid_interaction_pairs\t" + str(valid_counter) + "\n") + handle_stat.write( + "Valid_interaction_pairs_FF\t" + str(valid_counter_FF) + "\n") + handle_stat.write( + "Valid_interaction_pairs_RR\t" + str(valid_counter_RR) + "\n") + handle_stat.write( + "Valid_interaction_pairs_RF\t" + str(valid_counter_RF) + "\n") + handle_stat.write( + "Valid_interaction_pairs_FR\t" + str(valid_counter_FR) + "\n") + handle_stat.write("Single-end_pairs\t" + str(single_counter) + "\n") + handle_stat.write("Filtered_pairs\t" + str(filt_counter) + "\n") + handle_stat.write("Dumped_pairs\t" + str(dump_counter) + "\n") + + ## Write AS report + if gtag is not None: + handle_stat.write("## ======================================\n") + handle_stat.write("## Allele specific information\n") + handle_stat.write("Valid_pairs_from_ref_genome_(1-1)\t" + str(G1G1_ascounter) + "\n") + handle_stat.write("Valid_pairs_from_ref_genome_with_one_unassigned_mate_(0-1/1-0)\t" + str(UG1_ascounter+G1U_ascounter) + "\n") + handle_stat.write("Valid_pairs_from_alt_genome_(2-2)\t" + str(G2G2_ascounter) + "\n") + handle_stat.write("Valid_pairs_from_alt_genome_with_one_unassigned_mate_(0-2/2-0)\t" + str(UG2_ascounter+G2U_ascounter) + "\n") + handle_stat.write("Valid_pairs_from_alt_and_ref_genome_(1-2/2-1)\t" + str(G1G2_ascounter+G2G1_ascounter) + "\n") + handle_stat.write("Valid_pairs_with_both_unassigned_mated_(0-0)\t" + str(UU_ascounter) + "\n") + handle_stat.write("Valid_pairs_with_at_least_one_conflicting_mate_(3-)\t" + str(CF_ascounter) + "\n") + + handle_stat.close() + + diff --git a/conf/hicpro.config b/conf/hicpro.config index b4eac51dacad0bb55f4695d9bfcc0a29551d4211..0a2c9b9e0db09f4f9861ba353b84a534820aba38 100644 --- a/conf/hicpro.config +++ b/conf/hicpro.config @@ -21,7 +21,7 @@ params { min_restriction_fragment_size = max_restriction_fragment_size = min_insert_size = - max_insert_size = + max_insert_size = // Hi-C Processing min_cis_dist = @@ -29,7 +29,7 @@ params { rm_multi = true 
rm_dup = true - bins_size = '1000000,500000' + bin_size = '1000000,500000' ice_max_iter = 100 ice_filer_low_count_perc = 0.02 diff --git a/docs/output.md b/docs/output.md index 9de7067b987fe10ef7435c135bbd58545f13ecd2..518ac60f545d4f87051dfec5ace6f972cd93d65b 100644 --- a/docs/output.md +++ b/docs/output.md @@ -2,31 +2,117 @@ This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. -<!-- TODO nf-core: Write this documentation describing your workflow's output --> - ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -* [FastQC](#fastqc) - read quality control -* [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline +* [Reads alignment](#reads-alignment) +* [Valid pairs detection](#valid-pairs-detection) +* [Duplicates removal](#duplicates-removal) +* [Contact maps](#contact-maps) +* [MultiQC](#multiqc) - aggregate report and quality controls, describing results of the whole pipeline +* [Export](#export) - additional export for compatibility with downstream analysis and visualization tools + +The current version is mainly based on the [HiC-Pro](https://github.com/nservant/HiC-Pro) pipeline. +For details about the workflow, see [Servant et al. 2015](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0831-x) + +## Reads alignment + +Using Hi-C data, each read mate has to be independently aligned on the reference genome. +The current workflow implements a two-step mapping strategy. First, the reads are aligned using an end-to-end aligner. +Second, reads spanning the ligation junction are trimmed from their 3' end, and aligned back on the genome. +Aligned reads for both fragment mates are then paired in a single paired-end BAM file. 
+Singletons are discarded, and multi-hits are filtered according to the configuration parameters (`--rm_multi`). +Note that if the `--dnase` mode is activated, HiC-Pro will skip the second mapping step. + +**Output directory: `results/mapping`** + +* `*bwt2pairs.bam` - final BAM file with aligned paired data +* `*.pairstat` - mapping statistics + +If `--saveAlignedIntermediates` is specified, additional mapping file results are available: + +* `*.bam` - Aligned reads (R1 and R2) from end-to-end alignment +* `*_unmap.fastq` - Unmapped reads after end-to-end alignment +* `*_trimmed.fastq` - Trimmed reads after end-to-end alignment +* `*_trimmed.bam` - Alignment of trimmed reads +* `*bwt2merged.bam` - merged BAM file after the two-step alignment +* `*.mapstat` - mapping statistics per read mate + +Usually, a high fraction of reads is expected to be aligned on the genome (80-90%). Among them, we usually observe a few percent (around 10%) of step 2 aligned reads. Those reads are chimeric fragments for which we detect a ligation junction. An abnormal level of chimeric reads can reflect a ligation issue during the library preparation. +The fraction of singletons or multi-hits depends on the genome complexity and the fraction of unmapped reads. The fraction of singletons is usually close to the sum of unmapped R1 and R2 reads, as it is unlikely that both mates from the same pair were unmapped. + +## Valid pairs detection + +Each aligned read can be assigned to one restriction fragment according to the reference genome and the digestion protocol. + +Invalid pairs are classified as follows: +* Dangling end, i.e. unligated fragments (both reads mapped on the same restriction fragment) +* Self circles, i.e. fragments ligated on themselves (both reads mapped on the same restriction fragment in inverted orientation) +* Religation, i.e. ligation of juxtaposed fragments +* Filtered pairs, i.e. 
any pairs that do not match the filtering criteria on insert size or restriction fragment size +* Dumped pairs, i.e. any pairs for which we were not able to reconstruct the ligation product. + +Only valid pairs involving two different restriction fragments are used to build the contact maps. +Duplicated valid pairs associated to PCR artefacts are discarded (see `--rm_dup`). -## FastQC -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads. It provides information about the quality score distribution across your reads, the per base sequence content (%T/A/G/C). You get information about adapter contamination and other overrepresented sequences. +In case of Hi-C protocols that do not require a restriction enzyme such as DNase Hi-C or micro Hi-C, the assignment to a restriction fragment is not possible (see `--dnase`). +Short range interactions that are likely to be spurious ligation products can thus be discarded using the `--min_cis_dist` parameter. -For further reading and documentation see the [FastQC help](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +* `*.validPairs` - List of valid ligation products +* `*RSstat` - Statistics of the number of read pairs falling in each category -> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the `trim_galore` directory. 
+The validPairs are stored using a simple tab-delimited text format: -**Output directory: `results/fastqc`** +``` +read name / chr_reads1 / pos_reads1 / strand_reads1 / chr_reads2 / pos_reads2 / strand_reads2 / fragment_size / res frag name R1 / res frag R2 / mapping qual R1 / mapping qual R2 [/ allele_specific_tag] +``` -* `sample_fastqc.html` - * FastQC report, containing quality metrics for your untrimmed raw fastq files -* `zips/sample_fastqc.zip` - * zip file containing the FastQC report, tab-delimited data file and plot images +The ligation efficiency can be assessed using the filtering of valid and invalid pairs. As the ligation is a random process, 25% of each valid ligation class is expected. In the same way, a high level of dangling-end or self-circle read pairs is associated with a low quality experiment, and reveals a problem during the digestion, fill-in or ligation steps. +In the context of Hi-C protocols without restriction enzyme, this analysis step is skipped. The aligned pairs are therefore directly used to generate the contact maps. Filtering out short range contacts (typically <1kb) is recommended, as these pairs are likely to be self-ligation products. + +## Duplicates removal + +Note that validPairs files are generated per read chunk. +These files are then merged in the allValidPairs file, and duplicates are removed if the `--rm_dup` parameter is used. + +* `*allValidPairs` - combined valid pairs from all read chunks +* `*mergestat` - statistics about duplicates removal and valid pairs information + +Additional quality controls such as fragment size distribution can be extracted from the list of valid interaction products. +We usually expect to see a distribution centered around 300 bp, which corresponds to the paired-end insert size commonly used. +The fraction of duplicates is also presented. A high level of duplication indicates a poor molecular complexity and a potential PCR bias. 
+Finally, an important metric is to look at the fraction of intra and inter-chromosomal interactions, as well as long range (>20kb) versus short range (<20kb) intra-chromosomal interactions. + +## Contact maps + +Intra- and inter-chromosomal contact maps are built for all specified resolutions. +The genome is split into bins of equal size. Each valid interaction is associated with the genomic bins to generate the raw maps. +In addition, Hi-C data can contain several sources of biases which have to be corrected. +The current workflow uses the [iced](https://github.com/hiclib/iced) python package ([Varoquaux and Servant, 2018](http://joss.theoj.org/papers/10.21105/joss.01286)), which provides a fast implementation of the original ICE normalization algorithm (Imakaev et al. 2012), making the assumption of equal visibility of each fragment. + +* `*.matrix` - genome-wide contact maps +* `*_iced.matrix` - genome-wide iced contact maps + +The contact maps are generated for all specified resolutions (see the `--bin_size` argument). +A contact map is defined by: +* A list of genomic intervals related to the specified resolution (BED format). +* A matrix, stored as standard triplet sparse format (i.e. list format). + +Based on the observation that a contact map is symmetric and usually sparse, only non-zero values are stored for half of the matrix. The user can specify whether the 'upper', 'lower' or 'complete' matrix has to be stored. The 'asis' option stores the contacts as they are observed from the valid pairs files. + +``` + A B 10 + A C 23 + B C 24 + (...) +``` + +This format is memory efficient, and is compatible with several software tools for downstream analysis. ## MultiQC + [MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in within the report data directory. 
The pipeline has special steps which allow the software versions used to be reported in the MultiQC output for future traceability. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 927d1b712a15d979f05fc3d74f1c16ddfd9d457c..e6772eb34bd66f12b8477547a1c3cc250d34f33d 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -1,7 +1,5 @@ # nf-core/hic: Troubleshooting -<!-- TODO nf-core: Change this documentation if these parameters/errors are not relevant for your workflow --> - ## Input files not found If only no file, only one input file , or only read one and not read two is picked up then something is wrong with your input file declaration diff --git a/docs/usage.md b/docs/usage.md index 15405acf307a67d179bad24da88ea1dc1a7aa600..853c38414b6e53090c3d9b0f19e849a0972b1243 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -34,7 +34,9 @@ * [`--max_restriction_fragment_size`](#--max_restriction_fragment_size) * [`--min_insert_size`](#--min_insert_size) * [`--max_insert_size`](#--max_insert_size) - * [Hi-C Processing](#hi-c-processing) + * [DNase Hi-C](#dnase-hi-c) + * [`--dnase`](#--dnase) + * [Hi-C Processing](#hi-c-processing) * [`--min_cis_dist`](#--min_cis_dist) * [`--rm_singleton`](#--rm_singleton) * [`--rm_dup`](#--rm_dup) @@ -149,7 +151,7 @@ Please note the following requirements: If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz` -## Reference genomes +## Reference genomes and annotation files The pipeline config files come bundled with paths to the illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. @@ -223,7 +225,7 @@ If not specified, this file will be automatically created by the pipeline. 
In th ``` ```bash ---bwt2_index '[path to chromosome size file]' +--chromosome_size '[path to chromosome size file]' ``` ### `--restriction_fragments` @@ -251,7 +253,7 @@ Note that the `--restriction_site` parameter is mandatory to create this file. The following options are defined in the `hicpro.config` file, and can be updated either using a custom configuration file (see `-c` option) or using command line parameter. -## Reads mapping +### Reads mapping The reads mapping is currently based on the two-steps strategy implemented in the HiC-pro pipeline. The idea is to first align reads from end-to-end. Reads that do not aligned are then trimmed at the ligation site, and their 5' end is re-aligned to the reference genome. @@ -281,7 +283,7 @@ Minimum mapping quality. Reads with lower quality are discarded. Default: 10 --min_mapq '[Minimum quality value]' ``` -## Digestion Hi-C +### Digestion Hi-C #### `--restriction_site` @@ -340,7 +342,18 @@ Maximum reads insert size. Longer 3C products are discarded. Default: '' --max_insert_size '[numeric]' ``` -## Hi-C processing +### DNase Hi-C + +#### `--dnase` + +In DNase Hi-C mode, all options related to digestion Hi-C (see previous section) are ignored. +In this case, it is highly recommended to use the `--min_cis_dist` parameter to remove spurious ligation products. + +```bash +--dnase +``` + +### Hi-C processing #### `--min_cis_dist` @@ -376,7 +389,7 @@ If specified, reads that aligned multiple times on the genome are discarded. Not ## Genome-wide contact maps -#### `--bins_size` +#### `--bin_size` Resolution of contact maps to generate (space separated). 
Default:'1000000,500000' diff --git a/environment.yml b/environment.yml index 810f4af056251c9616a4619cfc596d3df6e94e8b..afddba2ef81dd47d399b156c582c4641a6e28dc3 100644 --- a/environment.yml +++ b/environment.yml @@ -16,4 +16,4 @@ dependencies: - samtools=1.9 - multiqc=1.6 - pip: - - iced=0.4.2 \ No newline at end of file + - iced==0.4.2 \ No newline at end of file diff --git a/main.nf b/main.nf index 8ac8bd57b2e2bb1b2674d2e0fb457a7ae935f877..5550fb1d156943fb8cdf46f6ae09a64b347a463a 100644 --- a/main.nf +++ b/main.nf @@ -36,7 +36,7 @@ def helpMessage() { -profile Configuration profile to use. Can use multiple (comma separated) Available: conda, docker, singularity, awsbatch, test and more. - References If not specified in the configuration file or you wish to overwrite any of the references. + References: If not specified in the configuration file or you wish to overwrite any of the references. --genome Name of iGenomes reference --bwt2_index Path to Bowtie2 index --fasta Path to Fasta reference @@ -50,11 +50,13 @@ def helpMessage() { --restriction_site Cutting motif(s) of restriction enzyme(s) (comma separated) --ligation_site Ligation motifs to trim (comma separated) - --min_restriction_fragment_size Minimum size of restriction fragments to consider --max_restriction_framgnet_size Maximum size of restriction fragmants to consider --min_insert_size Minimum insert size of mapped reads to consider --max_insert_size Maximum insert size of mapped reads to consider + + --dnase Run DNase Hi-C mode. All options related to restriction fragments are not considered + --min_cis_dist Minimum intra-chromosomal distance to consider --rm_singleton Remove singleton reads --rm_multi Remove multi-mapped reads @@ -97,6 +99,11 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. 
Currently the available genomes are ${params.genomes.keySet().join(", ")}" } +// Check Digestion or DNase Hi-C mode +if (!params.dnase && !params.ligation_site) { + exit 1, "Ligation motif not found. For DNase Hi-C, please use '--dnase' option" +} + // Reference index path configuration params.bwt2_index = params.genome ? params.genomes[ params.genome ].bowtie2 ?: false : false params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false @@ -221,7 +228,7 @@ else { } // Resolutions for contact maps -map_res = Channel.from( params.bins_size.tokenize(',') ) +map_res = Channel.from( params.bin_size.tokenize(',') ) // Stage config files ch_multiqc_config = Channel.fromPath(params.multiqc_config) @@ -242,31 +249,36 @@ log.info """======================================================= nf-core/hic v${workflow.manifest.version}" =======================================================""" def summary = [:] -summary['Pipeline Name'] = 'nf-core/hic' +summary['Pipeline Name'] = 'nf-core/hic' summary['Pipeline Version'] = workflow.manifest.version -summary['Run Name'] = custom_runName ?: workflow.runName - -summary['Reads'] = params.reads -summary['Fasta Ref'] = params.fasta - - -summary['Max Memory'] = params.max_memory -summary['Max CPUs'] = params.max_cpus -summary['Max Time'] = params.max_time -summary['Output dir'] = params.outdir -summary['Working dir'] = workflow.workDir +summary['Run Name'] = custom_runName ?: workflow.runName + +summary['Reads'] = params.reads +summary['splitFastq'] = params.splitFastq +summary['Fasta Ref'] = params.fasta +summary['Ligation Motif'] = params.ligation_site +summary['DNase Mode'] = params.dnase +summary['Remove Dup'] = params.rm_dup +summary['Maps resolution'] = params.bin_size + +summary['Max Memory'] = params.max_memory +summary['Max CPUs'] = params.max_cpus +summary['Max Time'] = params.max_time +summary['Output dir'] = params.outdir +summary['Working dir'] = workflow.workDir summary['Container Engine'] = 
workflow.containerEngine -if(workflow.containerEngine) summary['Container'] = workflow.container -summary['Current home'] = "$HOME" -summary['Current user'] = "$USER" -summary['Current path'] = "$PWD" -summary['Working dir'] = workflow.workDir -summary['Output dir'] = params.outdir -summary['Script dir'] = workflow.projectDir -summary['Config Profile'] = workflow.profile +if(workflow.containerEngine) + summary['Container'] = workflow.container +summary['Current home'] = "$HOME" +summary['Current user'] = "$USER" +summary['Current path'] = "$PWD" +summary['Working dir'] = workflow.workDir +summary['Output dir'] = params.outdir +summary['Script dir'] = workflow.projectDir +summary['Config Profile'] = workflow.profile if(workflow.profile == 'awsbatch'){ - summary['AWS Region'] = params.awsregion - summary['AWS Queue'] = params.awsqueue + summary['AWS Region'] = params.awsregion + summary['AWS Queue'] = params.awsqueue } if(params.email) summary['E-mail Address'] = params.email log.info summary.collect { k,v -> "${k.padRight(15)}: $v" }.join("\n") @@ -358,7 +370,7 @@ if(!params.chromosome_size && params.fasta){ } } -if(!params.restriction_fragments && params.fasta){ +if(!params.restriction_fragments && params.fasta && !params.dnase){ process getRestrictionFragments { tag "$fasta [${params.restriction_site}]" publishDir path: { params.saveReference ? 
"${params.outdir}/reference_genome" : params.outdir }, @@ -401,14 +413,26 @@ process bowtie2_end_to_end { script: prefix = reads.toString() - ~/(\.fq)?(\.fastq)?(\.gz)?$/ def bwt2_opts = params.bwt2_opts_end2end - """ - bowtie2 --rg-id BMG --rg SM:${prefix} \\ + + if (!params.dnase){ + """ + bowtie2 --rg-id BMG --rg SM:${prefix} \\ ${bwt2_opts} \\ -p ${task.cpus} \\ -x ${index}/${bwt2_base} \\ --un ${prefix}_unmap.fastq \\ -U ${reads} | samtools view -F 4 -bS - > ${prefix}.bam - """ + """ + }else{ + """ + bowtie2 --rg-id BMG --rg SM:${prefix} \\ + ${bwt2_opts} \\ + -p ${task.cpus} \\ + -x ${index}/${bwt2_base} \\ + --un ${prefix}_unmap.fastq \\ + -U ${reads} > ${prefix}.bam + """ + } } process trim_reads { @@ -416,6 +440,9 @@ process trim_reads { publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir }, saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy' + when: + !params.dnase + input: set val(prefix), file(reads) from unmapped_end_to_end @@ -435,6 +462,9 @@ process bowtie2_on_trimmed_reads { publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir }, saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy' + when: + !params.dnase + input: set val(prefix), file(reads) from trimmed_reads file index from bwt2_index_trim.collect() @@ -453,47 +483,80 @@ process bowtie2_on_trimmed_reads { """ } -process merge_mapping_steps{ - tag "$sample = $bam1 + $bam2" - publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir }, +if (!params.dnase){ + process merge_mapping_steps{ + tag "$sample = $bam1 + $bam2" + publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir }, saveAs: { params.saveAlignedIntermediates ? 
it : null }, mode: 'copy' - input: - set val(prefix), file(bam1), file(bam2) from end_to_end_bam.join( trimmed_bam ) + input: + set val(prefix), file(bam1), file(bam2) from end_to_end_bam.join( trimmed_bam ) - output: - set val(sample), file("${prefix}_bwt2merged.bam") into bwt2_merged_bam - set val(oname), file("${prefix}.mapstat") into all_mapstat + output: + set val(sample), file("${prefix}_bwt2merged.bam") into bwt2_merged_bam + set val(oname), file("${prefix}.mapstat") into all_mapstat - script: - sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2)/ - tag = prefix.toString() =~/_R1|_val_1/ ? "R1" : "R2" - oname = prefix.toString() - ~/(\.[0-9]+)$/ + script: + sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2)/ + tag = prefix.toString() =~/_R1|_val_1/ ? "R1" : "R2" + oname = prefix.toString() - ~/(\.[0-9]+)$/ - """ - samtools merge -@ ${task.cpus} \\ + """ + samtools merge -@ ${task.cpus} \\ -f ${prefix}_bwt2merged.bam \\ ${bam1} ${bam2} - samtools sort -@ ${task.cpus} -m 800M \\ + samtools sort -@ ${task.cpus} -m 800M \\ -n -T /tmp/ \\ -o ${prefix}_bwt2merged.sorted.bam \\ ${prefix}_bwt2merged.bam - mv ${prefix}_bwt2merged.sorted.bam ${prefix}_bwt2merged.bam - - echo "## ${prefix}" > ${prefix}.mapstat - echo -n "total_${tag}\t" >> ${prefix}.mapstat - samtools view -c ${prefix}_bwt2merged.bam >> ${prefix}.mapstat - echo -n "mapped_${tag}\t" >> ${prefix}.mapstat - samtools view -c -F 4 ${prefix}_bwt2merged.bam >> ${prefix}.mapstat - echo -n "global_${tag}\t" >> ${prefix}.mapstat - samtools view -c -F 4 ${bam1} >> ${prefix}.mapstat - echo -n "local_${tag}\t" >> ${prefix}.mapstat - samtools view -c -F 4 ${bam2} >> ${prefix}.mapstat - """ + mv ${prefix}_bwt2merged.sorted.bam ${prefix}_bwt2merged.bam + + echo "## ${prefix}" > ${prefix}.mapstat + echo -n "total_${tag}\t" >> ${prefix}.mapstat + samtools view -c ${prefix}_bwt2merged.bam >> ${prefix}.mapstat + echo -n "mapped_${tag}\t" >> ${prefix}.mapstat + samtools view -c -F 4 ${prefix}_bwt2merged.bam >> 
${prefix}.mapstat + echo -n "global_${tag}\t" >> ${prefix}.mapstat + samtools view -c -F 4 ${bam1} >> ${prefix}.mapstat + echo -n "local_${tag}\t" >> ${prefix}.mapstat + samtools view -c -F 4 ${bam2} >> ${prefix}.mapstat + """ + } +}else{ + process dnase_mapping_stats{ + tag "$sample = $bam1" + publishDir path: { params.saveAlignedIntermediates ? "${params.outdir}/mapping" : params.outdir }, + saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy' + + input: + set val(prefix), file(bam1) from end_to_end_bam + + output: + set val(sample), file(bam1) into bwt2_merged_bam + set val(oname), file("${prefix}.mapstat") into all_mapstat + + script: + sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2)/ + tag = prefix.toString() =~/_R1|_val_1/ ? "R1" : "R2" + oname = prefix.toString() - ~/(\.[0-9]+)$/ + + """ + echo "## ${prefix}" > ${prefix}.mapstat + echo -n "total_${tag}\t" >> ${prefix}.mapstat + samtools view -c ${bam1} >> ${prefix}.mapstat + echo -n "mapped_${tag}\t" >> ${prefix}.mapstat + samtools view -c -F 4 ${bam1} >> ${prefix}.mapstat + echo -n "global_${tag}\t" >> ${prefix}.mapstat + samtools view -c -F 4 ${bam1} >> ${prefix}.mapstat + echo -n "local_${tag}\t0" >> ${prefix}.mapstat + """ + } } + + process combine_mapped_files{ tag "$sample = $r1_prefix + $r2_prefix" publishDir "${params.outdir}/mapping", mode: 'copy', @@ -527,39 +590,66 @@ process combine_mapped_files{ * STEP2 - DETECT VALID PAIRS */ - -process get_valid_interaction{ - tag "$sample" - publishDir "${params.outdir}/hic_results/data", mode: 'copy', +if (!params.dnase){ + process get_valid_interaction{ + tag "$sample" + publishDir "${params.outdir}/hic_results/data", mode: 'copy', saveAs: {filename -> filename.indexOf("*stat") > 0 ? 
"stats/$filename" : "$filename"} - input: - set val(sample), file(pe_bam) from paired_bam - file frag_file from res_frag_file.collect() - - output: - set val(sample), file("*.validPairs") into valid_pairs - set val(sample), file("*.validPairs") into valid_pairs_4cool - set val(sample), file("*RSstat") into all_rsstat - - script: - - if (params.splitFastq){ - sample = sample.toString() - ~/(\.[0-9]+)$/ - } - - def opts = "" - if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}" - if ("$params.min_insert_size".isInteger()) opts="${opts} -s ${params.min_insert_size}" - if ("$params.max_insert_size".isInteger()) opts="${opts} -l ${params.max_insert_size}" - if ("$params.min_restriction_fragment_size".isInteger()) opts="${opts} -t ${params.min_restriction_fragment_size}" - if ("$params.max_restriction_fragment_size".isInteger()) opts="${opts} -m ${params.max_restriction_fragment_size}" + input: + set val(sample), file(pe_bam) from paired_bam + file frag_file from res_frag_file.collect() + + output: + set val(sample), file("*.validPairs") into valid_pairs + set val(sample), file("*.validPairs") into valid_pairs_4cool + set val(sample), file("*RSstat") into all_rsstat + + script: + if (params.splitFastq){ + sample = sample.toString() - ~/(\.[0-9]+)$/ + } + + def opts = "" + if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}" + if ("$params.min_insert_size".isInteger()) opts="${opts} -s ${params.min_insert_size}" + if ("$params.max_insert_size".isInteger()) opts="${opts} -l ${params.max_insert_size}" + if ("$params.min_restriction_fragment_size".isInteger()) opts="${opts} -t ${params.min_restriction_fragment_size}" + if ("$params.max_restriction_fragment_size".isInteger()) opts="${opts} -m ${params.max_restriction_fragment_size}" + + """ + mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} ${opts} + """ + } +} +else{ + process get_valid_interaction_dnase{ + tag "$sample" + publishDir 
"${params.outdir}/hic_results/data", mode: 'copy', + saveAs: {filename -> filename.indexOf("*stat") > 0 ? "stats/$filename" : "$filename"} - """ - mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} ${opts} - """ + input: + set val(sample), file(pe_bam) from paired_bam + + output: + set val(sample), file("*.validPairs") into valid_pairs + set val(sample), file("*.validPairs") into valid_pairs_4cool + set val(sample), file("*RSstat") into all_rsstat + + script: + if (params.splitFastq){ + sample = sample.toString() - ~/(\.[0-9]+)$/ + } + + def opts = "" + if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}" + """ + mapped_2hic_dnase.py -r ${pe_bam} ${opts} + """ + } } + /* * STEP3 - BUILD MATRIX */ diff --git a/nextflow.config b/nextflow.config index 5cd6dfec6888a0727fdfc7c46862313328d60c3b..32486aab168ccff7647220cd75a61f72f695fbc5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,6 +21,7 @@ params { restriction_fragments = false skip_cool = false skip_multiqc = false + dnase = false // Boilerplate options name = false