Commit f48326c5 authored by Laurent Modolo

Merge branch 'bwamem'

parents 25be4607 40a19885
@@ -11,12 +11,14 @@ cd ~/projects/JU28_59vs17_SNP/
# training set analysis
./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC-output2.fasta" --fastq "data/samples/*_{1,2}.fastq.gz" -resume -w ~/data/work_s/ --tumor "[\"s_NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"s_MR_550_clean\", \"s_MR_350_clean\"]"
mkdir tests
cd tests
../nextflow ../src/SNP_calling.nf -c ../src/SNP_calling.config -profile docker --fasta "../data/fasta/DBG2OLC_output2.fasta" --fastq "../data/samples/*_{1,2}.fastq.gz" -resume -w ~/data/work_s/ --tumor "[\"s_NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"s_MR_550_clean\", \"s_MR_350_clean\"]" --seq_length 800000
~/scripts/sms.sh "SNP done"
# real set analysis
./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC-output2.fasta" --fastq "data/fastq/*_{1,2}.fastq.gz" -resume -w ~/data/work/ --tumor "[\"NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"MR_550_clean\", \"MR_350_clean\"]"
./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC_output2.fasta" --fastq "data/fastq/*_{1,2}.fastq.gz" -resume -w ~/data/work/ --tumor "[\"NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"MR_550_clean\", \"MR_350_clean\"]"
~/scripts/sms.sh "SNP done"
./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC-output2.fasta" --fastq "data/fastq/*_{1,2}.fastq.gz" --sam "results/mapping/sam/*.sam" -resume -w ~/data/work/
......
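The --normal and --tumor options above have to reach Nextflow as Groovy list literals, because SNP_calling.nf parses them with Eval.me(). A minimal quoting check from the shell (same sample names as above):

echo "[\"MR_550_clean\", \"MR_350_clean\"]"
# prints: ["MR_550_clean", "MR_350_clean"] -- the list literal that Eval.me() expects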
@@ -9,6 +9,9 @@ profiles {
withName: trimming {
container = "urqt:d62c1f8"
}
withName: filter_fasta {
container = "bioawk:1.0"
}
withName: index_fasta {
container = "bowtie2:2.3.4.1"
}
@@ -33,6 +36,18 @@ profiles {
withName: HaplotypeCaller {
container = "gatk:4.0.8.1"
}
withName: GetPileupSummaries {
container = "gatk:4.0.8.1"
}
withName: CalculateContamination {
container = "gatk:4.0.8.1"
}
withName: CollectSequencingArtifactMetrics {
container = "gatk:4.0.8.1"
}
withName: filter_SNP {
container = "gatk:4.0.8.1"
}
}
}
sge {
......
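The profile above only maps process names to image tags; it assumes those images have already been built locally. A hedged sanity check, assuming each image exposes its tool on PATH:

docker images | grep -E 'gatk|bioawk|bowtie2|urqt'
docker run --rm gatk:4.0.8.1 gatk --list     # should list the GATK4 tools
docker run --rm bioawk:1.0 bioawk -c help    # should print the supported record formats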
params.fastq = "$baseDir/data/*.fastq"
params.fasta = "$baseDir/data/*.fasta"
params.seq_length = 800000
log.info "fastq files : ${params.fastq}"
log.info "fasta files : ${params.fasta}"
log.info "fasta length to retain : ${params.seq_length}"
def normal_sample = Eval.me(params.normal)
def tumor_sample = Eval.me(params.tumor)
log.info "normal : ${normal_sample}"
@@ -11,11 +13,7 @@ Channel
.fromPath( params.fasta )
.ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" }
.map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]}
.into { fasta_file;
indel_fasta_file;
recalibration_fasta_file;
haplotypecaller_fasta_file
}
.set { fasta_file }
Channel
.fromFilePairs( params.fastq )
.ifEmpty { error "Cannot find any fastq files matching: ${params.fastq}" }
@@ -61,13 +59,39 @@ UrQt --t 20 --m ${task.cpus} --gz \
"""
}
process filter_fasta {
tag "$fasta_id"
cpus 4
publishDir "results/fasta/", mode: 'copy'
input:
set fasta_id, file(fasta) from fasta_file
output:
set fasta_idf, "*_filtered.fasta" into filter_fasta_files
script:
fasta_idf = "${fasta_id}_filtered"
"""
bioawk -c fastx '{ if(length(\$seq) > $params.seq_length) { print ">"\$name; print \$seq }}' ${fasta} > \
${fasta_id}_filtered.fasta
"""
}
filter_fasta_files.into{
filtered_fasta_files;
indel_fasta_file;
recalibration_fasta_file;
haplotypecaller_fasta_file
}
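filter_fasta keeps only contigs longer than params.seq_length before indexing and mapping. A standalone sketch of the same bioawk filter, assuming a local assembly.fasta (placeholder path):

bioawk -c fastx '{ if (length($seq) > 800000) { print ">"$name; print $seq } }' \
    assembly.fasta > assembly_filtered.fasta
grep -c '^>' assembly_filtered.fasta    # number of contigs retained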
process index_fasta {
tag "$file_id"
cpus 4
publishDir "results/mapping/index/", mode: 'copy'
input:
set file_id, file(fasta) from fasta_file
set fasta_id, file(fasta) from filtered_fasta_files
output:
file "*.index*" into index_files
@@ -133,19 +157,18 @@ sambamba sort -t ${task.cpus} --tmpdir=./tmp -o ${file_id}_sorted.bam ${bam}
sorted_bam_files.into {
sorted_bam_file_norm;
sorted_bam_file_tumor
sorted_bam_file_tumor;
}
collect_sorted_bam_file_norm = sorted_bam_file_norm
.filter{ normal_sample.contains(it[0]) }
.map { it -> it[1]}
.collect()
.buffer( size: normal_sample.size())
.map { it -> ["normal_sample", it]}
collect_sorted_bam_file_tumor = sorted_bam_file_tumor
.filter{ tumor_sample.contains(it[0]) }
.map { it -> it[1]}
.collect()
.buffer( size: tumor_sample.size())
.map { it -> ["tumor_sample", it]}
collect_sorted_bam_file = Channel.create()
@@ -154,6 +177,7 @@ collect_sorted_bam_file = Channel.create()
process merge_bam {
tag "$file_id"
cpus 4
publishDir "results/mapping/bam/", mode: 'copy'
input:
set file_id, file(bam) from collect_sorted_bam_file
@@ -200,7 +224,7 @@ index_bam_files.into{
}
haplotypecaller_fasta_file.into{
haplo_fasta_file;
final_fasta_file;
index2_fasta_file
index3_fasta_file
}
@@ -237,117 +261,191 @@ samtools faidx ${fasta}
"""
}
haplotypecaller_bam_files_norm = haplo_bam_files_norm
final_bam_files_norm = haplo_bam_files_norm
.filter{ "normal_sample" == it[0] }
haplotypecaller_bam_files_tumor = haplo_bam_files_tumor
.filter{ "tumor_sample" == it[0] }
final_bam_files_tumor = haplo_bam_files_tumor
.filter{ "tumor_sample" == it[0] }
indexed_bam_files.into {
index_bam_files_norm;
index_bam_files_tumor
}
indexed_bam_files_norm = index_bam_files_norm
final_indexed_bam_files_norm = index_bam_files_norm
.filter{ "normal_sample" == it[0] }
indexed_bam_files_tumor = index_bam_files_tumor
final_indexed_bam_files_tumor = index_bam_files_tumor
.filter{ "tumor_sample" == it[0] }
final_bam_files_norm.set{
haplotypecaller_bam_files_norm
}
final_bam_files_tumor.into{
haplotypecaller_bam_files_tumor;
artifact_bam_files_tumor;
pileup_bam_files_tumor
}
final_indexed_bam_files_norm.set{
haplo_index_bam_files_norm
}
final_indexed_bam_files_tumor.into{
haplo_index_bam_files_tumor;
artifact_index_bam_files_tumor;
pileup_index_bam_files_tumor
}
final_fasta_file.into{
haplo_fasta_file;
artifact_fasta_file;
filter_fasta_file
}
indexed2_fasta_file.into{
haplo_indexed2_fasta_file;
artifact_indexed2_fasta_file;
filter_indexed2_fasta_file
}
indexed3_fasta_file.into{
haplo_indexed3_fasta_file;
artifact_indexed3_fasta_file;
filter_indexed3_fasta_file
}
process HaplotypeCaller {
tag "$file_id"
cpus 4
tag "$file_id_norm"
cpus 10
publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_norm, file(bam_norm) from haplotypecaller_bam_files_norm.collect()
set file_ididx_norm, file(bamidx_norm) from indexed_bam_files_norm.collect()
set file_id_tumor, file(bam_tumor) from haplotypecaller_bam_files_tumor.collect()
set file_ididx_tumor, file(bamidx_tumor) from indexed_bam_files_tumor.collect()
set genome_id, file(fasta) from haplo_fasta_file.collect()
set genome2_idx, file(fasta2idx) from indexed2_fasta_file.collect()
set genome3_idx, file(fasta3idx) from indexed3_fasta_file.collect()
set file_id_norm, file(bam_norm) from haplotypecaller_bam_files_norm
set file_ididx_norm, file(bamidx_norm) from haplo_index_bam_files_norm
set file_id_tumor, file(bam_tumor) from haplotypecaller_bam_files_tumor
set file_ididx_tumor, file(bamidx_tumor) from haplo_index_bam_files_tumor
set genome_id, file(fasta) from haplo_fasta_file
set genome2_idx, file(fasta2idx) from haplo_indexed2_fasta_file
set genome3_idx, file(fasta3idx) from haplo_indexed3_fasta_file
output:
set file_id, "*.vcf" into vcf_files
set file_id, "*.bam" into realigned_bams_files
set file_id_norm, "*.vcf" into vcf_files
set file_id_norm, "*.vcf.idx" into index_vcf_files
set file_id_norm, "*.bam" into realigned_bams_files
file "*_mutect2_report.txt" into mutect2_report
script:
"""
gatk Mutect2 --native-pair-hmm-threads ${task.cpus} -R ${fasta} \
gatk --java-options "-Xmx32G" Mutect2 --native-pair-hmm-threads ${task.cpus} -R ${fasta} \
-I ${bam_tumor} -tumor ${file_id_tumor} \
-I ${bam_norm} -normal ${file_id_norm} \
-O ${file_id}_raw_calls.g.vcf \
-bamout ${file_id}_realigned.bam
-O ${file_id_norm}_raw_calls.g.vcf \
-bamout ${file_id_norm}_realigned.bam 2> ${file_id_norm}_mutect2_report.txt
"""
}
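Mutect2's -tumor and -normal arguments are sample names, which GATK reads from the SM field of each BAM's @RG header line, so the merged BAMs need matching read groups. A quick way to inspect them (BAM names are placeholders):

samtools view -H normal_sample.bam | grep '^@RG'
samtools view -H tumor_sample.bam | grep '^@RG'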
vcf_files.into{
pileup_vcf_files;
filter_vcf_files
}
index_vcf_files.into{
pileup_index_vcf_files;
filter_index_vcf_files
}
/*
process filter_SNP {
tag "$file_id"
cpus 4
process GetPileupSummaries {
tag "$file_id_norm"
cpus 1
publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_norm, file(vcf) from pileup_vcf_files
set fileidx_id_norm, file(vcfidx) from pileup_index_vcf_files
set file_id_tumor, file(bam_tumor) from pileup_bam_files_tumor
output:
set file_id, "*.vcf" into vcf_files_filtered
set file_id_tumor, "*.table" into pileup_files
file "*_pileup_report.txt" into pileup_report
script:
"""
gatk --java-options "-Xmx2g" Mutect2 \
-R hg38/Homo_sapiens_assembly38.fasta \
-I tumor.bam \
-I normal.bam \
-tumor HCC1143_tumor \
-normal HCC1143_normal \
-pon resources/chr17_pon.vcf.gz \
--germline-resource resources/chr17_af-only-gnomad_grch38.vcf.gz \
--af-of-alleles-not-in-resource 0.0000025 \
--disable-read-filter MateOnSameContigOrNoMappedMateReadFilter \
-L chr17plus.interval_list \
-O 1_somatic_m2.vcf.gz \
-bamout 2_tumor_normal_m2.bam
gatk Mutect2 \
-R ~/Documents/ref/hg38/Homo_sapiens_assembly38.fasta \
-I HG00190.bam \
-tumor HG00190 \
--disable-read-filter MateOnSameContigOrNoMappedMateReadFilter \
-L chr17plus.interval_list \
-O 3_HG00190.vcf.gz
gatk CreateSomaticPanelOfNormals \
-vcfs 3_HG00190.vcf.gz \
-vcfs 4_NA19771.vcf.gz \
-vcfs 5_HG02759.vcf.gz \
-O 6_threesamplepon.vcf.gz
gatk GetPileupSummaries \
-I tumor.bam \
-V resources/chr17_small_exac_common_3_grch38.vcf.gz \
-O 7_tumor_getpileupsummaries.table
gatk CalculateContamination \
-I 7_tumor_getpileupsummaries.table \
-O 8_tumor_calculatecontamination.table
gatk --java-options "-Xmx32G" GetPileupSummaries \
-I ${bam_tumor} \
-V ${vcf} \
-O ${file_id_tumor}_pileup.table \
2> ${file_id_tumor}_pileup_report.txt
"""
}
gatk FilterMutectCalls \
-V somatic_m2.vcf.gz \
--contamination-table tumor_calculatecontamination.table \
-O 9_somatic_oncefiltered.vcf.gz
process CalculateContamination {
tag "$file_id_tumor"
cpus 1
publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_tumor, file(pileup_table) from pileup_files
output:
set file_id_tumor, "*.table" into contamination_files
file "*_contamination_report.txt" into contamination_report
script:
"""
gatk --java-options "-Xmx32G" CalculateContamination \
-I ${pileup_table} \
-O ${file_id_tumor}_contamination.table \
2> ${file_id_tumor}_contamination_report.txt
"""
}
*/
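GetPileupSummaries and CalculateContamination above implement GATK's two-step cross-sample contamination estimate. A minimal standalone sketch with placeholder file names:

gatk GetPileupSummaries -I tumor.bam -V raw_calls.vcf -O tumor_pileup.table
gatk CalculateContamination -I tumor_pileup.table -O tumor_contamination.table
# the resulting table can optionally be passed to FilterMutectCalls via --contamination-table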
process CollectSequencingArtifactMetrics {
tag "$file_id_tumor"
cpus 1
publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_tumor, file(bam_tumor) from artifact_bam_files_tumor
set genome_id, file(fasta) from artifact_fasta_file
set genome2_idx, file(fasta2idx) from artifact_indexed2_fasta_file
set genome3_idx, file(fasta3idx) from artifact_indexed3_fasta_file
output:
set file_id_tumor, "*_artifact.*" into artifact_files
file "*_artifact_report.txt" into artifact_report
script:
"""
gatk CollectSequencingArtifactMetrics \
-I tumor.bam \
-O 10_tumor_artifact \
--FILE_EXTENSION ".txt" \
-R ~/Documents/ref/hg38/Homo_sapiens_assembly38.fasta
gatk FilterByOrientationBias \
-A G/T \
-A C/T \
-V 9_somatic_oncefiltered.vcf.gz \
-P tumor_artifact.pre_adapter_detail_metrics.txt \
-O 11_somatic_twicefiltered.vcf.gz
-I ${bam_tumor} \
-O ${file_id_tumor}_artifact \
-R ${fasta} \
2> ${file_id_tumor}_artifact_report.txt
"""
}
process filter_SNP {
tag "$file_id_norm"
cpus 1
publishDir "results/SNP/vcf/", mode: 'copy'
input:
set file_id_norm, file(vcf) from filter_vcf_files
set fileidx_id_norm, file(vcfid) from filter_index_vcf_files
set genome_id, file(fasta) from filter_fasta_file
set genome2_idx, file(fasta2idx) from filter_indexed2_fasta_file
set genome3_idx, file(fasta3idx) from filter_indexed3_fasta_file
output:
set file_id_norm, "*.vcf" into vcf_files_filtered
file "*_filter_report.txt" into filter_report
script:
"""
gatk FilterMutectCalls \
-V ${vcf} \
-O ${file_id_norm}_filtered.vcf \
2> ${file_id_norm}_filter_report.txt
gatk SelectVariants \
-R ${fasta} \
--variant ${file_id_norm}_filtered.vcf \
--exclude-filtered \
-O ${file_id_norm}_filtered_pass.vcf \
2>> ${file_id_norm}_filter_report.txt
"""
}
*/
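filter_SNP chains FilterMutectCalls with SelectVariants --exclude-filtered, so *_filtered_pass.vcf should contain only PASS calls. A quick count, assuming the normal_sample label used in this pipeline:

grep -vc '^#' normal_sample_filtered_pass.vcf    # variants that passed filtering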
FROM ubuntu:18.04
MAINTAINER Laurent Modolo
ENV BIOAWK_VERSION=1.0
ENV PACKAGES git=1:2.17* \
build-essential=12.4* \
ca-certificates=20180409 \
zlib1g-dev=1:1.2.11* \
byacc
RUN apt-get update && \
apt-get install -y --no-install-recommends ${PACKAGES} && \
apt-get clean
RUN git clone https://github.com/lh3/bioawk.git && \
cd bioawk && \
git checkout tags/v${BIOAWK_VERSION} && \
make && \
cd .. && \
mv bioawk/bioawk /usr/bin/ && \
rm -Rf bioawk
#!/bin/sh
docker build src/docker_modules/bioawk/1.0 -t 'bioawk:1.0'
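A hedged smoke test of the freshly built image, mounting the current directory and printing contig lengths (the FASTA path is a placeholder):

docker run --rm -v "$PWD:/data" bioawk:1.0 \
    bioawk -c fastx '{ print $name, length($seq) }' /data/assembly.fasta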