#!/usr/bin/env nextflow
/*
========================================================================================
nf-core/hic
========================================================================================
nf-core/hic Analysis Pipeline.
#### Homepage / Documentation
https://github.com/nf-core/hic
----------------------------------------------------------------------------------------
*/
def helpMessage() {
log.info"""
Usage:
The typical command for running the pipeline is as follows:
nextflow run nf-core/hic --input '*_R{1,2}.fastq.gz' -profile docker
Mandatory arguments:
--input [file] Path to input data (must be surrounded with quotes)
-profile [str] Configuration profile to use. Can use multiple (comma separated)
Available: conda, docker, singularity, awsbatch, test and more.
References If not specified in the configuration file or you wish to overwrite any of the references.
--bwt2_index [file] Path to Bowtie2 index
--fasta [file] Path to Fasta reference
Digestion Hi-C If not specified in the configuration file or you wish to set up specific digestion protocol
--digestion [str] Digestion Hi-C. Name of restriction enzyme used for digestion pre-configuration. Default: 'hindiii'
--ligation_site [str] Ligation motifs to trim (comma separated) if not available in --digestion. Default: false
--restriction_site [str] Cutting motif(s) of restriction enzyme(s) (comma separated) if not available in --digestion. Default: false
--chromosome_size [file] Path to chromosome size file
--restriction_fragments [file] Path to restriction fragment file (bed)
--save_reference [bool] Save reference genome to output folder. Default: False
DNase Hi-C
--dnase [bool] Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False
--min_cis_dist [int] Minimum intra-chromosomal distance to consider. Default: None
--bwt2_opts_end2end [str] Options for bowtie2 end-to-end mapping (first mapping step). See hic.config for default.
--bwt2_opts_trimmed [str] Options for bowtie2 mapping after ligation site trimming. See hic.config for default.
--min_mapq [int] Minimum mapping quality values to consider. Default: 10
--keep_multi [bool] Keep multi-mapped reads (--min_mapq is ignored). Default: false
--keep_dups [bool] Keep duplicates. Default: false
--save_aligned_intermediates [bool] Save intermediate alignment files. Default: False
--split_fastq [bool] Split fastq files into read chunks to speed up computation. Default: false
--fastq_chunks_size [int] Size of read chunks if split_fastq is true. Default: 20000000
Valid Pairs Detection
--min_restriction_fragment_size [int] Minimum size of restriction fragments to consider. Default: None
--max_restriction_fragment_size [int] Maximum size of restriction fragments to consider. Default: None
--min_insert_size [int] Minimum insert size of mapped reads to consider. Default: None
--max_insert_size [int] Maximum insert size of mapped reads to consider. Default: None
--save_interaction_bam [bool] Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False
--bin_size [str] Bin size for contact maps (comma separated). Default: '1000000,500000'
--ice_max_iter [int] Maximum number of iterations for ICE normalization. Default: 100
--ice_filter_low_count_perc [float] Percentage of low counts columns/rows to filter before ICE normalization. Default: 0.02
--ice_filter_high_count_perc [float] Percentage of high counts columns/rows to filter before ICE normalization. Default: 0
--ice_eps [float] Convergence criteria for ICE normalization. Default: 0.1
--skip_maps [bool] Skip generation of contact maps. Useful for capture-C. Default: False
--skip_ice [bool] Skip ICE normalization. Default: False
--skip_cool [bool] Skip generation of cool files. Default: False
--skip_multiqc [bool] Skip MultiQC. Default: False
--outdir [file] The output directory where the results will be saved
--publish_dir_mode [str] Mode for publishing results in the output directory. Available: symlink, rellink, link, copy, copyNoFollow, move (Default: copy)
--email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. Default: None
--email_on_fail [email] Same as --email, except only send mail if the workflow is not successful
--max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If the file generated by the pipeline exceeds the threshold, it will not be attached (Default: 25MB)
-name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. Default: None
--awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch
--awsregion [str] The AWS Region for your AWS Batch job to run on
""".stripIndent()
}
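// Example invocation (illustrative; 'GRCh37' stands in for any iGenomes key):
// nextflow run nf-core/hic --input '*_R{1,2}.fastq.gz' --genome GRCh37 --digestion 'hindiii' -profile docker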
/**********************************************************
* SET UP CONFIGURATION VARIABLES
*/

// Show help message
if (params.help){
helpMessage()
exit 0
}
// Check if genome exists in the config file
if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) {
exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
if (params.digest && params.digestion && !params.digest.containsKey(params.digestion)) {
exit 1, "Unknown digestion protocol. Currently, the available digestion options are ${params.digest.keySet().join(", ")}. Please set manually the '--restriction_site' and '--ligation_site' parameters."
}
params.restriction_site = params.digestion ? params.digest[ params.digestion ].restriction_site ?: false : false
params.ligation_site = params.digestion ? params.digest[ params.digestion ].ligation_site ?: false : false
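// For reference, the `params.digest` map resolved above is expected to look like
// the sketch below (the HindIII values shown are assumptions for illustration;
// the actual entries live in the pipeline configuration files):
// digest {
//   'hindiii' {
//      restriction_site = 'A^AGCTT'
//      ligation_site = 'AAGCTAGCTT'
//   }
// }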
// Check Digestion or DNase Hi-C mode
if (!params.dnase && !params.ligation_site) {
exit 1, "Ligation motif not found. Please either use the `--digestion` parameters or specify the `--restriction_site` and `--ligation_site`. For DNase Hi-C, please use '--dnase' option"
params.bwt2_index = params.genome ? params.genomes[ params.genome ].bowtie2 ?: false : false
params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
// Has the run name been specified by the user?
// this has the bonus effect of catching both -name and --name
custom_runName = params.name
if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) {
custom_runName = workflow.runName
}
// Check AWS batch settings
if (workflow.profile.contains('awsbatch')) {
// AWSBatch sanity checking
if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!"
// Check outdir paths to be S3 buckets if running on AWSBatch
// related: https://github.com/nextflow-io/nextflow/issues/813
if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!"
// Prevent trace files from being stored on S3, since S3 does not support rolling files.
if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles."
}
// Stage config files
ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true)
ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty()
ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true)
ch_output_docs_images = file("$projectDir/docs/images/", checkIfExists: true)
/**********************************************************
* SET UP CHANNELS
*/
if (params.input_paths){
raw_reads = Channel.create()
raw_reads_2 = Channel.create()
Channel
.from( params.input_paths )
.map { row -> [ row[0], [file(row[1][0]), file(row[1][1])]] }
.separate( raw_reads, raw_reads_2 ) { a -> [tuple(a[0] + "_R1", a[1][0]), tuple(a[0] + "_R2", a[1][1])] }
}else{
raw_reads = Channel.create()
raw_reads_2 = Channel.create()
if ( params.split_fastq ){
Channel
.fromFilePairs( params.input, flat:true )
.splitFastq( by: params.fastq_chunks_size, pe:true, file: true, compress:true)
.separate( raw_reads, raw_reads_2 ) { a -> [tuple(a[0] + "_R1", a[1]), tuple(a[0] + "_R2", a[2])] }
}else{
Channel
.fromFilePairs( params.input )
.separate( raw_reads, raw_reads_2 ) { a -> [tuple(a[0] + "_R1", a[1][0]), tuple(a[0] + "_R2", a[1][1])] }
}
}
// Update sample name if splitFastq is used
def updateSampleName(x) {
def res = ""
if ((matcher = x[1] =~ /\s*(\.[\d]+).fastq.gz/)) {
res = matcher[0][1]
}
return [x[0] + res, x[1]]
}
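// Illustrative example: with --split_fastq, a chunk tuple such as
// [ "sample1_R1", file("sample1_R1.1.fastq.gz") ] is renamed to
// [ "sample1_R1.1", ... ], so each chunk stays distinct through mapping.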
if (params.split_fastq ){
raw_reads = raw_reads.concat( raw_reads_2 ).map{it -> updateSampleName(it)}.dump(tag:'input')
}else{
raw_reads = raw_reads.concat( raw_reads_2 ).dump(tag:'input')
}
// Reference genome
if ( params.bwt2_index ){
lastPath = params.bwt2_index.lastIndexOf(File.separator)
bwt2_dir = params.bwt2_index.substring(0,lastPath+1)
bwt2_base = params.bwt2_index.substring(lastPath+1)
Channel.fromPath( bwt2_dir , checkIfExists: true)
.ifEmpty { exit 1, "Genome index: Provided index not found: ${params.bwt2_index}" }
.into { bwt2_index_end2end; bwt2_index_trim }
}
else if ( params.fasta ) {
lastPath = params.fasta.lastIndexOf(File.separator)
fasta_base = params.fasta.substring(lastPath+1)
bwt2_base = fasta_base.toString() - ~/(\.fa)?(\.fasta)?(\.fas)?(\.fsa)?$/
Channel.fromPath( params.fasta )
.ifEmpty { exit 1, "Genome index: Fasta file not found: ${params.fasta}" }
.set { fasta_for_index }
}
else {
exit 1, "No reference genome specified!"
}
// Chromosome size
if ( params.chromosome_size ){
Channel.fromPath( params.chromosome_size , checkIfExists: true)
.into { chromosome_size; chromosome_size_cool }
}
else if ( params.fasta ){
Channel.fromPath( params.fasta )
.ifEmpty { exit 1, "Chromosome sizes: Fasta file not found: ${params.fasta}" }
.set { fasta_for_chromsize }
}
else {
exit 1, "No chromosome size specified!"
}
// Restriction fragments
if ( params.restriction_fragments ){
Channel.fromPath( params.restriction_fragments, checkIfExists: true )
.set { res_frag_file }
}
else if ( params.fasta && params.restriction_site ){
Channel.fromPath( params.fasta )
.ifEmpty { exit 1, "Restriction fragments: Fasta file not found: ${params.fasta}" }
.set { fasta_for_resfrag }
}
else if (!params.dnase) {
exit 1, "No restriction fragments file specified!"
}
if (params.res_tads){
Channel.from( "${params.res_tads}" )
.splitCsv()
.flatten()
.into {tads_res_hicexplorer; tads_res_insulation; tads_bin }
}else{
tads_res_hicexplorer = Channel.create()
tads_res_insulation = Channel.create()
tads_bin = Channel.create()
if (!params.skip_tads){
log.warn "[nf-core/hic] Hi-C resolution for TADs calling not specified. See --res_tads"
}
}
if (params.res_dist_decay){
Channel.from( "${params.res_dist_decay}" )
.splitCsv()
.flatten()
.into {ddecay_res; ddecay_bin }
}else{
ddecay_res = Channel.create()
ddecay_bin = Channel.create()
if (!params.skip_dist_decay){
log.warn "[nf-core/hic] Hi-C resolution for distance decay not specified. See --res_dist_decay"
}
}
map_res = Channel.from( params.bin_size ).splitCsv().flatten()
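// Illustrative example: the default --bin_size '1000000,500000' makes map_res
// emit the two values '1000000' and '500000', so every downstream matrix step
// runs once per (sample, resolution) combination.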
/**********************************************************
* SET UP LOGS
*/
// Header log info
def summary = [:]
if (workflow.revision) summary['Pipeline Release'] = workflow.revision
summary['Run Name'] = custom_runName ?: workflow.runName
summary['Input'] = params.input
if (params.split_fastq)
summary['Read chunks Size'] = params.fastq_chunks_size
summary['Fasta Ref'] = params.fasta
if (!params.dnase){
summary['Restriction Motif']= params.restriction_site
summary['Ligation Motif'] = params.ligation_site
summary['Min Fragment Size']= params.min_restriction_fragment_size
summary['Max Fragment Size']= params.max_restriction_fragment_size
summary['Min Insert Size'] = params.min_insert_size
summary['Max Insert Size'] = params.max_insert_size
}else{
summary['DNase Mode'] = params.dnase
summary['Min CIS dist'] = params.min_cis_dist
}
summary['Keep Duplicates'] = params.keep_dups ? 'Yes' : 'No'
summary['Keep Multihits'] = params.keep_multi ? 'Yes' : 'No'
summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
summary['Output dir'] = params.outdir
summary['Launch dir'] = workflow.launchDir
summary['Working dir'] = workflow.workDir
summary['Script dir'] = workflow.projectDir
summary['User'] = workflow.userName
if (workflow.profile.contains('awsbatch')) {
summary['AWS Region'] = params.awsregion
summary['AWS Queue'] = params.awsqueue
summary['AWS CLI'] = params.awscli
}
summary['Config Profile'] = workflow.profile
if (params.config_profile_description) summary['Config Profile Description'] = params.config_profile_description
if (params.config_profile_contact) summary['Config Profile Contact'] = params.config_profile_contact
if (params.config_profile_url) summary['Config Profile URL'] = params.config_profile_url
summary['Config Files'] = workflow.configFiles.join(', ')
if (params.email || params.email_on_fail) {
summary['E-mail Address'] = params.email
summary['E-mail on failure'] = params.email_on_fail
summary['MultiQC maxsize'] = params.max_multiqc_email_size
}
log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n")
log.info "-\033[2m--------------------------------------------------\033[0m-"
// Check the hostnames against configured profiles
checkHostname()
Channel.from(summary.collect{ [it.key, it.value] })
.map { k,v -> "<dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" }
.reduce { a, b -> return [a, b].join("\n ") }
.map { x -> """
id: 'nf-core-hic-summary'
description: " - this information is collected when the pipeline is started."
section_name: 'nf-core/hic Workflow Summary'
section_href: 'https://github.com/nf-core/hic'
plot_type: 'html'
data: |
<dl class=\"dl-horizontal\">
$x
</dl>
""".stripIndent() }
.set { ch_workflow_summary }
/*
* Parse software version numbers
*/
process get_software_versions {
publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode,
saveAs: { filename ->
if (filename.indexOf(".csv") > 0) filename
else null
}
output:
file 'software_versions_mqc.yaml' into software_versions_yaml
file "software_versions.csv"
script:
"""
echo $workflow.manifest.version > v_pipeline.txt
echo $workflow.nextflow.version > v_nextflow.txt
bowtie2 --version > v_bowtie2.txt
samtools --version > v_samtools.txt
scrape_software_versions.py &> software_versions_mqc.yaml
"""
def create_workflow_summary(summary) {
def yaml_file = workDir.resolve('workflow_summary_mqc.yaml')
yaml_file.text = """
id: 'nf-core-hic-summary'
description: " - this information is collected when the pipeline is started."
section_name: 'nf-core/hic Workflow Summary'
section_href: 'https://github.com/nf-core/hic'
plot_type: 'html'
data: |
<dl class=\"dl-horizontal\">
${summary.collect { k,v -> " <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" }.join("\n")}
</dl>
""".stripIndent()
return yaml_file
}
/****************************************************
* PRE-PROCESSING
*/
if(!params.bwt2_index && params.fasta){
process makeBowtie2Index {
tag "$bwt2_base"
label 'process_highmem'
publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir },
saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode
file "bowtie2_index" into bwt2_index_end2end
file "bowtie2_index" into bwt2_index_trim
mkdir bowtie2_index
bowtie2-build ${fasta} bowtie2_index/${bwt2_base}
"""
}
}
if(!params.chromosome_size && params.fasta){
process makeChromSize {
tag "$fasta"
publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir },
saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode
input:
file fasta from fasta_for_chromsize
output:
file "*.size" into chromosome_size, chromosome_size_cool
script:
"""
samtools faidx ${fasta}
cut -f1,2 ${fasta}.fai > chrom.size
"""
}
}

if(!params.restriction_fragments && params.fasta && !params.dnase){
process getRestrictionFragments {
tag "$fasta ${params.restriction_site}"
label 'process_low'
publishDir path: { params.save_reference ? "${params.outdir}/reference_genome" : params.outdir },
saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode
input:
file fasta from fasta_for_resfrag
output:
file "*.bed" into res_frag_file

script:
"""
digest_genome.py -r ${params.restriction_site} -o restriction_fragments.bed ${fasta}
"""
}
}
/****************************************************
* MAIN WORKFLOW
*/
/*
* STEP 1 - Two-steps Reads Mapping
*/
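// The two-step strategy below: reads are first aligned end-to-end; reads that
// fail this pass (typically those spanning a ligation junction) are trimmed at
// the ligation site, their 5' fragment is re-mapped, and both BAMs are merged.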
process bowtie2_end_to_end {
tag "$sample"
label 'process_medium'
publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir },
saveAs: { params.save_aligned_intermediates ? it : null }, mode: params.publish_dir_mode

input:
set val(sample), file(reads) from raw_reads
file index from bwt2_index_end2end.collect()

output:
set val(sample), file("${prefix}_unmap.fastq") into unmapped_end_to_end
set val(sample), file("${prefix}.bam") into end_to_end_bam

script:
prefix = reads.toString() - ~/(\.fq)?(\.fastq)?(\.gz)?$/
def bwt2_opts = params.bwt2_opts_end2end
if (!params.dnase){
"""
bowtie2 --rg-id BMG --rg SM:${prefix} \\
${bwt2_opts} \\
-p ${task.cpus} \\
-x ${index}/${bwt2_base} \\
--un ${prefix}_unmap.fastq \\
-U ${reads} | samtools view -F 4 -bS - > ${prefix}.bam
"""
}else{
"""
bowtie2 --rg-id BMG --rg SM:${prefix} \\
${bwt2_opts} \\
-p ${task.cpus} \\
-x ${index}/${bwt2_base} \\
--un ${prefix}_unmap.fastq \\
-U ${reads} | samtools view -bS - > ${prefix}.bam
"""
}
}

process trim_reads {
tag "$sample"
label 'process_low'
publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir },
saveAs: { params.save_aligned_intermediates ? it : null }, mode: params.publish_dir_mode

when:
!params.dnase

input:
set val(sample), file(reads) from unmapped_end_to_end

output:
set val(sample), file("${prefix}_trimmed.fastq") into trimmed_reads

script:
prefix = reads.toString() - ~/(\.fq)?(\.fastq)?(\.gz)?$/
"""
cutsite_trimming --fastq $reads \\
--cutsite ${params.ligation_site} \\
--out ${prefix}_trimmed.fastq
"""
}

process bowtie2_on_trimmed_reads {
tag "$sample"
label 'process_medium'
publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir },
saveAs: { params.save_aligned_intermediates ? it : null }, mode: params.publish_dir_mode

when:
!params.dnase

input:
set val(sample), file(reads) from trimmed_reads
file index from bwt2_index_trim.collect()

output:
set val(sample), file("${prefix}_trimmed.bam") into trimmed_bam

script:
prefix = reads.toString() - ~/(_trimmed)?(\.fq)?(\.fastq)?(\.gz)?$/
"""
bowtie2 --rg-id BMG --rg SM:${prefix} \\
${params.bwt2_opts_trimmed} \\
-p ${task.cpus} \\
-x ${index}/${bwt2_base} \\
-U ${reads} | samtools view -bS - > ${prefix}_trimmed.bam
"""
process bowtie2_merge_mapping_steps{
tag "$prefix = $bam1 + $bam2"
label 'process_medium'
publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir },
saveAs: { params.save_aligned_intermediates ? it : null }, mode: params.publish_dir_mode
set val(prefix), file(bam1), file(bam2) from end_to_end_bam.join( trimmed_bam ).dump(tag:'merge')
set val(sample), file("${prefix}_bwt2merged.bam") into bwt2_merged_bam
set val(oname), file("${prefix}.mapstat") into all_mapstat
//sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2|_1|_2)/
sample = prefix.toString() - ~/(_R1|_R2)/
//tag = prefix.toString() =~/_R1|_val_1|_1/ ? "R1" : "R2"
tag = prefix.toString() =~/_R1/ ? "R1" : "R2"
oname = prefix.toString() - ~/(\.[0-9]+)$/
"""
samtools merge -@ ${task.cpus} \\
-f ${prefix}_bwt2merged.bam \\
${bam1} ${bam2}
samtools sort -@ ${task.cpus} -n -T /tmp/ \\
-o ${prefix}_bwt2merged.sorted.bam \\
${prefix}_bwt2merged.bam
mv ${prefix}_bwt2merged.sorted.bam ${prefix}_bwt2merged.bam
echo "## ${prefix}" > ${prefix}.mapstat
echo -n "total_${tag}\t" >> ${prefix}.mapstat
samtools view -c ${prefix}_bwt2merged.bam >> ${prefix}.mapstat
echo -n "mapped_${tag}\t" >> ${prefix}.mapstat
samtools view -c -F 4 ${prefix}_bwt2merged.bam >> ${prefix}.mapstat
echo -n "global_${tag}\t" >> ${prefix}.mapstat
samtools view -c -F 4 ${bam1} >> ${prefix}.mapstat
echo -n "local_${tag}\t" >> ${prefix}.mapstat
samtools view -c -F 4 ${bam2} >> ${prefix}.mapstat
"""
}

process dnase_mapping_stats{
tag "$sample = $bam"
label 'process_medium'
publishDir path: { params.save_aligned_intermediates ? "${params.outdir}/mapping" : params.outdir },
saveAs: { params.save_aligned_intermediates ? it : null }, mode: params.publish_dir_mode

when:
params.dnase

input:
set val(prefix), file(bam) from end_to_end_bam

output:
set val(sample), file(bam) into bwt2_merged_bam
set val(oname), file("${prefix}.mapstat") into all_mapstat

script:
//sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2|_1|_2)/
sample = prefix.toString() - ~/(_R1|_R2)/
//tag = prefix.toString() =~/_R1|_val_1|_1/ ? "R1" : "R2"
tag = prefix.toString() =~/_R1/ ? "R1" : "R2"
oname = prefix.toString() - ~/(\.[0-9]+)$/
"""
echo "## ${prefix}" > ${prefix}.mapstat
echo -n "total_${tag}\t" >> ${prefix}.mapstat
samtools view -c ${bam} >> ${prefix}.mapstat
echo -n "mapped_${tag}\t" >> ${prefix}.mapstat
samtools view -c -F 4 ${bam} >> ${prefix}.mapstat
echo -n "global_${tag}\t" >> ${prefix}.mapstat
samtools view -c -F 4 ${bam} >> ${prefix}.mapstat
echo -n "local_${tag}\t0" >> ${prefix}.mapstat
"""
}
tag "$sample = $r1_prefix + $r2_prefix"
publishDir "${params.outdir}/mapping", mode: params.publish_dir_mode,
saveAs: {filename -> filename.indexOf(".pairstat") > 0 ? "stats/$filename" : "$filename"}
set val(sample), file(aligned_bam) from bwt2_merged_bam.groupTuple().dump(tag:'mates')
set val(oname), file("${sample}_bwt2pairs.bam") into paired_bam
r1_bam = aligned_bam[0]
r1_prefix = r1_bam.toString() - ~/_bwt2merged.bam$/
r2_bam = aligned_bam[1]
r2_prefix = r2_bam.toString() - ~/_bwt2merged.bam$/
oname = sample.toString() - ~/(\.[0-9]+)$/
def opts = "-t"
if (params.keep_multi) {
opts="${opts} --multi"
}else if (params.min_mapq){
opts="${opts} -q ${params.min_mapq}"
}
"""
mergeSAM.py -f ${r1_bam} -r ${r2_bam} -o ${sample}_bwt2pairs.bam ${opts}
"""
if (!params.dnase){
process get_valid_interaction{
tag "$sample"
publishDir "${params.outdir}/hic_results/data", mode: params.publish_dir_mode,
saveAs: {filename -> filename.indexOf("*stat") > 0 ? "stats/$filename" : "$filename"}
input:
set val(sample), file(pe_bam) from paired_bam
file frag_file from res_frag_file.collect()

output:
set val(sample), file("*.validPairs") into valid_pairs
set val(sample), file("*.validPairs") into valid_pairs_4cool
set val(sample), file("*.DEPairs") into de_pairs
set val(sample), file("*.SCPairs") into sc_pairs
set val(sample), file("*.REPairs") into re_pairs
set val(sample), file("*.FiltPairs") into filt_pairs
set val(sample), file("*RSstat") into all_rsstat

script:
if (params.split_fastq){
sample = sample.toString() - ~/(\.[0-9]+)$/
}
def opts = ""
opts += params.min_cis_dist > 0 ? " -d ${params.min_cis_dist}" : ''
opts += params.min_insert_size > 0 ? " -s ${params.min_insert_size}" : ''
opts += params.max_insert_size > 0 ? " -l ${params.max_insert_size}" : ''
opts += params.min_restriction_fragment_size > 0 ? " -t ${params.min_restriction_fragment_size}" : ''
opts += params.max_restriction_fragment_size > 0 ? " -m ${params.max_restriction_fragment_size}" : ''
opts += params.save_interaction_bam ? " --sam" : ''
"""
mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} --all ${opts}
sort -T /tmp/ -k2,2V -k3,3n -k5,5V -k6,6n -o ${prefix}.validPairs ${prefix}.validPairs
}
}
else{
process get_valid_interaction_dnase{
tag "$sample"
publishDir "${params.outdir}/hic_results/data", mode: params.publish_dir_mode,
saveAs: {filename -> filename.indexOf("*stat") > 0 ? "stats/$filename" : "$filename"}
set val(sample), file("*.validPairs") into valid_pairs
set val(sample), file("*.validPairs") into valid_pairs_4cool
set val(sample), file("*RSstat") into all_rsstat
opts = params.min_cis_dist > 0 ? " -d ${params.min_cis_dist}" : ''
sort -T /tmp/ -k2,2V -k3,3n -k5,5V -k6,6n -o ${prefix}.validPairs ${prefix}.validPairs
publishDir "${params.outdir}/hic_results/data", mode: params.publish_dir_mode,
saveAs: {filename -> filename.indexOf("*stat") > 0 ? "stats/$sample/$filename" : "$filename"}
set val(sample), file(vpairs) from valid_pairs.groupTuple().dump(tag:'final')
set val(sample), file("*.allValidPairs") into ch_vpairs, ch_vpairs_4cool
## Sort valid pairs and remove read pairs with same starts (i.e duplicated read pairs)
sort -T /tmp/ -S 50% -k2,2V -k3,3n -k5,5V -k6,6n -m ${vpairs} | \
awk -F"\\t" 'BEGIN{c1=0;c2=0;s1=0;s2=0}(c1!=\$2 || c2!=\$5 || s1!=\$3 || s2!=\$6){print;c1=\$2;c2=\$5;s1=\$3;s2=\$6}' > ${sample}.allValidPairs
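## (The sort above makes identical pairs adjacent on fields 2,3,5,6, i.e.
## chr1/pos1/chr2/pos2; the awk one-liner then keeps only the first line of
## each identical-coordinate run, which is what removes the duplicates.)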
echo -n "valid_interaction\t" > stats/${sample}/${sample}_allValidPairs.mergestat
cat ${vpairs} | wc -l >> stats/${sample}/${sample}_allValidPairs.mergestat
echo -n "valid_interaction_rmdup\t" >> stats/${sample}/${sample}_allValidPairs.mergestat
cat ${sample}.allValidPairs | wc -l >> stats/${sample}/${sample}_allValidPairs.mergestat
## Count short range (<20000) vs long range contacts
awk 'BEGIN{cis=0;trans=0;sr=0;lr=0} \$2 == \$5{cis=cis+1; d=\$6>\$3?\$6-\$3:\$3-\$6; if (d<=20000){sr=sr+1}else{lr=lr+1}} \$2!=\$5{trans=trans+1}END{print "trans_interaction\\t"trans"\\ncis_interaction\\t"cis"\\ncis_shortRange\\t"sr"\\ncis_longRange\\t"lr}' ${sample}.allValidPairs >> stats/${sample}/${sample}_allValidPairs.mergestat
"""
}else{
"""
mkdir -p stats/${sample}
cat ${vpairs} > ${sample}.allValidPairs
echo -n "valid_interaction\t" > stats/${sample}/${sample}_allValidPairs.mergestat
cat ${vpairs} | wc -l >> stats/${sample}/${sample}_allValidPairs.mergestat
echo -n "valid_interaction_rmdup\t" >> stats/${sample}/${sample}_allValidPairs.mergestat
cat ${sample}.allValidPairs | wc -l >> stats/${sample}/${sample}_allValidPairs.mergestat
## Count short range (<20000) vs long range contacts
awk 'BEGIN{cis=0;trans=0;sr=0;lr=0} \$2 == \$5{cis=cis+1; d=\$6>\$3?\$6-\$3:\$3-\$6; if (d<=20000){sr=sr+1}else{lr=lr+1}} \$2!=\$5{trans=trans+1}END{print "trans_interaction\\t"trans"\\ncis_interaction\\t"cis"\\ncis_shortRange\\t"sr"\\ncis_longRange\\t"lr}' ${sample}.allValidPairs >> stats/${sample}/${sample}_allValidPairs.mergestat
"""
publishDir "${params.outdir}/hic_results/stats/${sample}", mode: params.publish_dir_mode
set val(prefix), file(fstat) from all_mapstat.groupTuple().concat(all_pairstat.groupTuple(), all_rsstat.groupTuple())
sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2|_1|_2)/
if ( (fstat =~ /.mapstat/) ){ ext = "mmapstat" }
if ( (fstat =~ /.pairstat/) ){ ext = "mpairstat" }
if ( (fstat =~ /.RSstat/) ){ ext = "mRSstat" }
"""
mkdir -p mstats/${sample}
merge_statfiles.py -f ${fstat} > mstats/${sample}/${prefix}.${ext}
"""
publishDir "${params.outdir}/hic_results/matrix/raw", mode: params.publish_dir_mode
set val(sample), file(vpairs), val(mres) from ch_vpairs.combine(map_res)
set val(sample), val(mres), file("*.matrix"), file("*.bed") into raw_maps, raw_maps_4cool
build_matrix --matrix-format upper --binsize ${mres} --chrsizes ${chrsize} --ifile ${vpairs} --oprefix ${sample}_${mres}
"""
}
/*
* STEP 4 - NORMALIZE MATRIX
*/
publishDir "${params.outdir}/hic_results/matrix/iced", mode: params.publish_dir_mode
set val(sample), val(res), file(rmaps), file(bed) from raw_maps
set val(sample), val(res), file("*iced.matrix"), file(bed) into iced_maps_4h5, iced_maps_4cool
file ("*.biases") into iced_bias
script:
prefix = rmaps.toString() - ~/(\.matrix)?$/
"""
ice --filter_low_counts_perc ${params.ice_filter_low_count_perc} \
--results_filename ${prefix}_iced.matrix \
--filter_high_counts_perc ${params.ice_filter_high_count_perc} \
--max_iter ${params.ice_max_iter} --eps ${params.ice_eps} --remove-all-zeros-loci --output-bias 1 --verbose 1 ${rmaps}
"""
}
publishDir "${params.outdir}/contact_maps/cool", mode: 'copy'
set val(sample), val(res), file(mat), file(bed) from iced_maps_4cool
cooler load --count-as-float -f coo --one-based ${bed} ${mat} ${sample}_${res}.cool
cooler zoomify --nproc ${task.cpus} ${sample}_${res}.cool
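// Note: `cooler zoomify` writes its multi-resolution output as
// ${sample}_${res}.mcool alongside the single-resolution .cool file;
// both are matched by the "*cool" output glob above.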
/*
* Create h5 file
*/
process convert_to_h5 {
tag "$sample"
label 'process_medium'
publishDir "${params.outdir}/contact_maps/h5", mode: 'copy'
input:
set val(sample), val(res), file(maps), file(bed) from iced_maps_4h5
output:
set val(sample), val(res), file("*.h5") into h5maps_ddecay, h5maps_ccomp, h5maps_tads
script:
prefix = maps.toString() - ~/(\.matrix)?$/
"""
hicConvertFormat --matrices ${maps} \
--outFileName ${prefix}.h5 \
--resolution ${res} \
--inputFormat hicpro \
--outputFormat h5 \
-bf ${bed}
"""
}
/****************************************************
* DOWNSTREAM ANALYSIS
*/
/*
* Counts vs distance QC
*/
chddecay = h5maps_ddecay.combine(ddecay_res).filter{ it[1] == it[3] }.dump(tag: "ddecay")
process dist_decay {
tag "$sample"
label 'process_medium'
publishDir "${params.outdir}/hic_results/dist", mode: 'copy'
when:
!params.skip_dist_decay
input:
set val(sample), val(res), file(h5mat), val(r) from chddecay
output:
file("*_distcount.txt")
file("*.png")
script:
prefix = h5mat.toString() - ~/(\.h5)?$/
"""
hicPlotDistVsCounts --matrices ${h5mat} \
--plotFile ${prefix}_distcount.png \
--outFileData ${prefix}_distcount.txt
"""
}
/*
* TADs calling
*/
chtads = h5maps_tads.combine(tads_res_hicexplorer).filter{ it[1] == it[3] }.dump(tag: "hicexp")
process tads_hicexplorer {
tag "$sample - $res"
label 'process_medium'
publishDir "${params.outdir}/tads", mode: 'copy'
when:
!params.skip_tads && params.tads_caller =~ 'hicexplorer'
input:
set val(sample), val(res), file(h5mat), val(r) from chtads
output:
file("*.{bed,bedgraph,gff}") into hicexplorer_tads
script:
"""
hicFindTADs --matrix ${h5mat} \
--outPrefix tad \
--correctForMultipleTesting fdr \
--numberOfProcessors ${task.cpus}
"""
}
chIS = cool_maps.combine(tads_res_insulation).filter{ it[1] == it[3] }.dump(tag : "ins")