From 315a6d3a0b9a2bc363c5912cd7fa9d5e903abe80 Mon Sep 17 00:00:00 2001 From: Laurent Modolo <laurent@modolo.fr> Date: Tue, 16 Oct 2018 15:19:06 +0200 Subject: [PATCH] rm SNP_calling scripts --- src/1_JU28_59vs17_SNP_calling.sh | 58 ----- src/SNP_calling.config | 187 -------------- src/SNP_calling.nf | 417 ------------------------------- src/intersect_SNP.R | 100 -------- 4 files changed, 762 deletions(-) delete mode 100644 src/1_JU28_59vs17_SNP_calling.sh delete mode 100644 src/SNP_calling.config delete mode 100644 src/SNP_calling.nf delete mode 100755 src/intersect_SNP.R diff --git a/src/1_JU28_59vs17_SNP_calling.sh b/src/1_JU28_59vs17_SNP_calling.sh deleted file mode 100644 index b31f6c51..00000000 --- a/src/1_JU28_59vs17_SNP_calling.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/sh - -# generate training set -ll *.gz | sed 's/.gz//g' | awk '{system("gunzip -c "$9".gz > ~/data/JU28_59vs17_SNP/data/samples/"$9)}' -cd ~/data/JU28_59vs17_SNP/data/samples/ -~/scripts/fastq_sampler/fastq_sampler.py -i 100000 -f MR_350_clean_1.fastq -g MR_350_clean_2.fastq -~/scripts/fastq_sampler/fastq_sampler.py -i 100000 -f MR_550_clean_1.fastq -g MR_550_clean_2.fastq -~/scripts/fastq_sampler/fastq_sampler.py -i 100000 -f NG-10944_JU2859_bis_lib169352_5217_1_1.fastq -g NG-10944_JU2859_bis_lib169352_5217_1_2.fastq -ll s_*.fastq | awk '{system("gzip < "$9" > "$9".gz")}' -cd ~/projects/JU28_59vs17_SNP/ - -# training set analysis - -mkdir tests -cd tests -../nextflow ../src/SNP_calling.nf -c ../src/SNP_calling.config -profile docker --fasta "../data/fasta/DBG2OLC_output2.fasta" --fastq "../data/samples/*_{1,2}.fastq.gz" -resume -w ~/data/work_s/ --tumor "[\"s_NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"s_MR_550_clean\", \"s_MR_350_clean\"]" --seq_number 800000 -~/scripts/sms.sh "SNP done" - -# real set analysis - -./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/DBG2OLC_output2.fasta" --fastq "data/fastq/*_{1,2}.fastq.gz" -resume -w ~/data/work/ --tumor "[\"NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"MR_550_clean\", \"MR_350_clean\"]" -~/scripts/sms.sh "SNP done" -src/intersect_SNP.R \ - results/SNP/vcf_samtools/normal_sample_filtered.csv \ - results/SNP/vcf_samtools/tumor_sample_filtered.csv \ - results/fasta/DBG2OLC_output2_filtered.fasta \ - data/list_of_enzymes.csv -~/scripts/sms.sh "SNP analysis done" - -./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile docker --fasta "data/fasta/final_assembly.fasta" --fastq "data/fastq/*_{1,2}.fastq.gz" -resume -w ~/data/work/ --tumor "[\"NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"MR_550_clean\", \"MR_350_clean\"]" -~/scripts/sms.sh "SNP done" -src/intersect_SNP.R \ - results/SNP/vcf_samtools/normal_sample_filtered.csv \ - results/SNP/vcf_samtools/tumor_sample_filtered.csv \ - results/fasta/final_assembly_filtered.fasta \ - data/list_of_enzymes.csv -~/scripts/sms.sh "SNP analysis done" - - -# on the PSMN -find ~/data/ -name "MR_350_clean*" -find ~/data/ -name "MR_550_clean*" -find ~/data/ -name "10944_JU2859_bis_lib169352_5217_1*" - -./nextflow src/SNP_calling.nf -c src/SNP_calling.config -profile sge --fasta "data/fasta/final_assembly.fasta" --fastq "/Xnfs/lbmcdb/Delattre_team/Request/Clean_data_for_assembly/*_{1,2}.fastq.gz" -resume -w /scratch/lmodolo/work/ --tumor "[\"NG-10944_JU2859_bis_lib169352_5217_1\"]" --normal "[\"MR_550_clean\", \"MR_350_clean\"]" - -mkdir -p results/blastall/ -makeblastdb -in data/fasta/DBG2OLC_output2.fasta -parse_seqids -dbtype nucl -blastn -query data/RNA5S_belari.fasta -db data/fasta/DBG2OLC_output2.fasta -out results/blastall/RNA5S_2.out -less results/blastall/RNA5S_2.out - -makeblastdb -in data/fasta/DBG2OLC_output1.fasta -parse_seqids -dbtype nucl -blastn -query data/RNA5S_belari.fasta -db data/fasta/DBG2OLC_output1.fasta -out results/blastall/RNA5S_1.out -less results/blastall/RNA5S_1.out - -makeblastdb -in data/fasta/nanoport_denovo.fasta -parse_seqids -dbtype nucl -blastn -query data/RNA5S_belari.fasta -db data/fasta/nanoport_denovo.fasta -out results/blastall/RNA5S_nanoport.out -less results/blastall/RNA5S_nanoport.out diff --git a/src/SNP_calling.config b/src/SNP_calling.config deleted file mode 100644 index 1ca9ac2c..00000000 --- a/src/SNP_calling.config +++ /dev/null @@ -1,187 +0,0 @@ -profiles { - docker { - docker.temp = 'auto' - docker.enabled = true - process { - withName: adaptor_removal { - container = "cutadapt:1.14" - } - withName: trimming { - container = "urqt:d62c1f8" - } - withName: filter_fasta { - container = "bioawk:1.0" - } - withName: index_fasta { - container = "bowtie2:2.3.4.1" - } - withName: mapping_fastq { - container = "bowtie2:2.3.4.1" - } - withName: merge_bam { - container = "sambamba:0.6.7" - } - withName: sort_bam { - container = "sambamba:0.6.7" - } - withName: index_bam { - container = "sambamba:0.6.7" - } - withName: index2_fasta { - container = "gatk:4.0.8.1" - } - withName: index3_fasta { - container = "samtools:1.7" - } - withName: samtools_SNP_tumor { - container = "bcftools:1.7" - } - withName: samtools_SNP_norm { - container = "bcftools:1.7" - } - withName: vcf_to_csv_tumor { - container = "gatk:4.0.8.1" - } - withName: vcf_to_csv_norm { - container = "gatk:4.0.8.1" - } - } - } - sge { - process{ - queueSize = 1000 - pollInterval = '60sec' - withName: adaptor_removal { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "cutadapt/1.14" - executor = "sge" - cpus = 1 - memory = "20GB" - time = "12h" - queue = 'monointeldeb128' - } - withName: trimming { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "UrQt/d62c1f8" - executor = "sge" - cpus = 16 - memory = "5GB" - time = "12h" - queue = 'E5-2670deb128A,E5-2670deb128B,E5-2670deb128C,E5-2670deb128D,E5-2670deb128E,E5-2670deb128F' - penv = 'openmp16' - } - withName: filter_fasta { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "bioawk/1.0" - executor = "sge" - cpus = 1 - memory = "20GB" - time = "12h" - queue = 'monointeldeb128' - } - withName: index_fasta { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "Bowtie2/2.3.4.1:sambamba/0.6.7:samblaster/0.1.24" - executor = "sge" - cpus = 1 - memory = "30GB" - time = "6h" - queue = 'monointeldeb128' - } - withName: mapping_fastq { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "Bowtie2/2.3.4.1:sambamba/0.6.7:samblaster/0.1.24" - executor = "sge" - cpus = 16 - memory = "30GB" - time = "12h" - queue = 'E5-2670deb128A,E5-2670deb128B,E5-2670deb128C,E5-2670deb128D,E5-2670deb128E,E5-2670deb128F' - penv = 'openmp16' - } - withName: merge_bam { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "sambamba/0.6.7" - executor = "sge" - cpus = 16 - memory = "30GB" - time = "12h" - queue = 'E5-2670deb128A,E5-2670deb128B,E5-2670deb128C,E5-2670deb128D,E5-2670deb128E,E5-2670deb128F' - penv = 'openmp16' - } - withName: sort_bam { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "sambamba/0.6.7" - executor = "sge" - cpus = 16 - memory = "30GB" - time = "12h" - queue = 'E5-2670deb128A,E5-2670deb128B,E5-2670deb128C,E5-2670deb128D,E5-2670deb128E,E5-2670deb128F' - penv = 'openmp16' - } - withName: index_bam { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "sambamba/0.6.7" - executor = "sge" - cpus = 16 - memory = "30GB" - time = "12h" - queue = 'E5-2670deb128A,E5-2670deb128B,E5-2670deb128C,E5-2670deb128D,E5-2670deb128E,E5-2670deb128F' - penv = 'openmp16' - } - withName: index2_fasta { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "GATK/4.0.10.1" - executor = "sge" - cpus = 1 - memory = "20GB" - time = "12h" - queue = 'monointeldeb128' - } - withName: index3_fasta { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "SAMtools/1.7" - executor = "sge" - cpus = 1 - memory = "20GB" - time = "12h" - queue = 'monointeldeb128' - } - withName: samtools_SNP_tumor { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "BCFtools/1.7" - executor = "sge" - cpus = 1 - memory = "20GB" - time = "12h" - queue = 'monointeldeb128' - } - withName: samtools_SNP_norm { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "BCFtools/1.7" - executor = "sge" - cpus = 1 - memory = "20GB" - time = "12h" - queue = 'monointeldeb128' - } - withName: vcf_to_csv_tumor { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "GATK/4.0.10.1" - executor = "sge" - cpus = 1 - memory = "20GB" - time = "12h" - queue = 'monointeldeb128' - } - withName: vcf_to_csv_norm { - beforeScript = "source /usr/share/lmod/lmod/init/bash; module use ~/privatemodules" - module = "GATK/4.0.10.1" - executor = "sge" - cpus = 1 - memory = "20GB" - time = "12h" - queue = 'monointeldeb128' - } - } - } -} diff --git a/src/SNP_calling.nf b/src/SNP_calling.nf deleted file mode 100644 index 096afb9f..00000000 --- a/src/SNP_calling.nf +++ /dev/null @@ -1,417 +0,0 @@ -params.fastq = "$baseDir/data/*.fastq" -params.fasta = "$baseDir/data/*.fasta" -params.seq_length = 800000 -log.info "fastq files : ${params.fastq}" -log.info "fasta files : ${params.fasta}" -log.info "fasta length to retain : ${params.seq_length}" -def normal_sample = Eval.me(params.normal) -def tumor_sample = Eval.me(params.tumor) -log.info "normal : ${normal_sample}" -log.info "tumor : ${tumor_sample}" - -Channel - .fromPath( params.fasta ) - .ifEmpty { error "Cannot find any fasta files matching: ${params.fasta}" } - .map { it -> [(it.baseName =~ /([^\.]*)/)[0][1], it]} - .set { fasta_file } -Channel - .fromFilePairs( params.fastq ) - .ifEmpty { error "Cannot find any fastq files matching: ${params.fastq}" } - .set { fastq_files } - -process adaptor_removal { - tag "$pair_id" - publishDir "results/fastq/adaptor_removal/", mode: 'copy' - - input: - set pair_id, file(reads) from fastq_files - - output: - set pair_id, file("*.fastq.gz") into fastq_files_cut - file "*_cutadapt_report.txt" into cut_files_report - - script: -""" -cutadapt -a AGATCGGAAGAG -g CTCTTCCGATCT -A AGATCGGAAGAG -G CTCTTCCGATCT \ --o ${pair_id}_cut_R1.fastq.gz -p ${pair_id}_cut_R2.fastq.gz \ -${reads[0]} ${reads[1]} > ${pair_id}_cutadapt_report.txt -""" -} - -process trimming { - tag "${pair_id}" - cpus 4 - publishDir "results/fastq/trimming/", mode: 'copy' - - input: - set pair_id, file(reads) from fastq_files_cut - - output: - set pair_id, file("*.fastq.gz") into fastq_files_trim - file "*_trimming_report.txt" into trimming_files_report - - script: -""" -UrQt --t 20 --m ${task.cpus} --gz \ ---in ${reads[0]} --inpair ${reads[1]} \ ---out ${pair_id}_trim_R1.fastq.gz --outpair ${pair_id}_trim_R2.fastq.gz \ -> ${pair_id}_trimming_report.txt -""" -} - -process filter_fasta { - tag "$fasta_id" - cpus 4 - publishDir "results/fasta/", mode: 'copy' - - input: - set fasta_id, file(fasta) from fasta_file - - output: - set fasta_idf, "*_filtered.fasta" into filter_fasta_files - - script: - fasta_idf = "${fasta_id}_filtered" -""" -bioawk -c fastx '{ if(length(\$seq) > $params.seq_length) { print ">"\$name; print \$seq }}' ${fasta} > \ -${fasta_id}_filtered.fasta -""" -} - -filter_fasta_files.into{ - filtered_fasta_files; - indel_fasta_file; - recalibration_fasta_file; - haplotypecaller_fasta_file -} - -process index_fasta { - tag "$file_id" - cpus 12 - publishDir "results/mapping/index/", mode: 'copy' - - input: - set file_id, file(fasta) from filtered_fasta_files - - output: - file "*.index*" into index_files - file "*_report.txt" into indexing_report - - script: -""" -bowtie2-build --threads ${task.cpus} ${fasta} ${file_id}.index &> ${file_id}_bowtie2_report.txt - -if grep -q "Error" ${file_id}_bowtie2_report.txt; then - exit 1 -fi -""" -} - -fastq_files_trim.into{ - fastq_files_trim_norm; - fastq_files_trim_tumor -} - -collect_fastq_files_trim_norm = fastq_files_trim_norm - .filter{ normal_sample.contains(it[0]) } - .map { it -> ["normal_sample", it[0], it[1]]} - -collect_fastq_files_trim_tumor = fastq_files_trim_tumor - .filter{ tumor_sample.contains(it[0]) } - .map { it -> ["tumor_sample", it[0], it[1]]} - -collect_fastq_files_trim = Channel.create() - .mix(collect_fastq_files_trim_norm, collect_fastq_files_trim_tumor) - -process mapping_fastq { - tag "$pair_id" - cpus 12 - publishDir "results/mapping/bam/", mode: 'copy' - - input: - set sample_name, pair_id, file(reads) from collect_fastq_files_trim - file index from index_files.collect() - - output: - set pair_id, "*.bam" into bam_files - file "*_report.txt" into mapping_report - - script: - index_id = index[0] - for (index_file in index) { - if (index_file =~ /.*\.1\.bt2/ && !(index_file =~ /.*\.rev\.1\.bt2/)) { - index_id = ( index_file =~ /(.*)\.1\.bt2/)[0][1] - } - } -""" -bowtie2 --very-sensitive -p ${task.cpus} -x ${index_id} \ ---rg-id ${sample_name} \ ---rg PL:Illumina \ ---rg SM:${sample_name} \ --1 ${reads[0]} -2 ${reads[1]} 2> \ -${pair_id}_bowtie2_report.txt | \ -samblaster --addMateTags -M -i /dev/stdin | \ -sambamba view -t ${task.cpus} --valid -S -f bam -l 0 /dev/stdin \ --o ${pair_id}.bam - -if grep -q "Error" ${pair_id}_bowtie2_report.txt; then - exit 1 -fi -""" -} - -process sort_bam { - tag "$file_id" - cpus 4 - - input: - set file_id, file(bam) from bam_files - - output: - set file_id, "*_sorted.bam" into sorted_bam_files - - script: -""" -sambamba sort -t ${task.cpus} --tmpdir=./tmp -o ${file_id}_sorted.bam ${bam} -""" -} - -sorted_bam_files.into { - sorted_bam_file_norm; - sorted_bam_file_tumor; -} - -collect_sorted_bam_file_norm = sorted_bam_file_norm - .filter{ normal_sample.contains(it[0]) } - .map { it -> it[1]} - .buffer( size: normal_sample.size()) - .map { it -> ["normal_sample", it]} -collect_sorted_bam_file_tumor = sorted_bam_file_tumor - .filter{ tumor_sample.contains(it[0]) } - .map { it -> it[1]} - .buffer( size: tumor_sample.size()) - .map { it -> ["tumor_sample", it]} - -collect_sorted_bam_file = Channel.create() - .mix(collect_sorted_bam_file_norm, collect_sorted_bam_file_tumor) - -process merge_bam { - tag "$file_id" - cpus 4 - publishDir "results/mapping/bam/", mode: 'copy' - - input: - set file_id, file(bam) from collect_sorted_bam_file - - output: - set file_id, "*.bam" into merged_bam_files - - script: -""" -if ((\$(ls -l *.bam | wc -l) > 1)); then -sambamba merge -t ${task.cpus} ${file_id}.bam ${bam} -else -cp ${bam} ${file_id}.bam -fi -""" -} - -merged_bam_files.into{ - index_merged_bam_files; - haplo_bam_files_norm; - haplo_bam_files_tumor -} - -process index_bam { - tag "$file_id" - cpus 4 - publishDir "results/mapping/bam/", mode: 'copy' - - input: - set file_id, file(bam) from index_merged_bam_files - - output: - set file_id, "*.bam.bai" into index_bam_files - - script: -""" -sambamba index -t ${task.cpus} ${bam} -""" -} - -index_bam_files.into{ - named_index_bam_files; - indexed_bam_files -} - -haplotypecaller_fasta_file.into{ - final_fasta_file; - index2_fasta_file - index3_fasta_file - } - -process index2_fasta { - tag "$genome_id" - publishDir "results/fasta/", mode: 'copy' - - input: - set genome_id, file(fasta) from index2_fasta_file - - output: - set genome_id, "*.dict" into indexed2_fasta_file - - script: -""" -gatk CreateSequenceDictionary -R ${fasta} &> gatk_output.txt -""" -} - -process index3_fasta { - tag "$genome_id" - publishDir "results/fasta/", mode: 'copy' - - input: - set genome_id, file(fasta) from index3_fasta_file - - output: - set genome_id, "*.fai" into indexed3_fasta_file - - script: -""" -samtools faidx ${fasta} -""" -} - -final_bam_files_norm = haplo_bam_files_norm - .filter{ "normal_sample" == it[0] } -final_bam_files_tumor = haplo_bam_files_tumor - .filter{ "tumor_sample" == it[0] } - -indexed_bam_files.into { - index_bam_files_norm; - index_bam_files_tumor -} -final_indexed_bam_files_norm = index_bam_files_norm - .filter{ "normal_sample" == it[0] } -final_indexed_bam_files_tumor = index_bam_files_tumor - .filter{ "tumor_sample" == it[0] } - -final_bam_files_norm.set{ - samtools_SNP_bam_files_norm -} -final_bam_files_tumor.set{ - samtools_SNP_bam_files_tumor; -} -final_indexed_bam_files_norm.set{ - samtools_SNP_index_bam_files_norm -} -final_indexed_bam_files_tumor.set{ - samtools_SNP_index_bam_files_tumor; -} -final_fasta_file.into{ - samtools_SNP_fasta_file_tumor; - samtools_SNP_fasta_file_norm; -} -indexed2_fasta_file.into{ - samtools_SNP_indexed2_fasta_file_tumor; - samtools_SNP_indexed2_fasta_file_norm; -} -indexed3_fasta_file.into{ - samtools_SNP_indexed3_fasta_file_tumor; - samtools_SNP_indexed3_fasta_file_norm; -} - -process samtools_SNP_tumor { - tag "$file_id_tumor" - cpus 1 - publishDir "results/SNP/vcf_samtools/", mode: 'copy' - - input: - set file_id_tumor, file(bam_tumor) from samtools_SNP_bam_files_tumor - set file_ididx_tumor, file(bamidx_tumor) from samtools_SNP_index_bam_files_tumor - set genome_id, file(fasta) from samtools_SNP_fasta_file_tumor - set genome2_idx, file(fasta2idx) from samtools_SNP_indexed2_fasta_file_tumor - set genome3_idx, file(fasta3idx) from samtools_SNP_indexed3_fasta_file_tumor - - output: - set file_id_tumor, "*.vcf" into vcf_files_tumor - - script: -""" -bcftools mpileup -AE -f ${fasta} ${bam_tumor} --output-type v \ --a FORMAT/AD,FORMAT/ADF,FORMAT/ADR,FORMAT/DP,FORMAT/SP,INFO/AD,INFO/ADF,INFO/ADR | \ -bcftools call -mv --output-type v > ${file_id_tumor}_raw.vcf -bcftools filter -s LowQual -e '%QUAL<20 || DP>100' ${file_id_tumor}_raw.vcf \ -> ${file_id_tumor}_filtered.vcf -""" -} - -process samtools_SNP_norm { - tag "$file_id_norm" - cpus 1 - publishDir "results/SNP/vcf_samtools/", mode: 'copy' - - input: - set file_id_norm, file(bam_norm) from samtools_SNP_bam_files_norm - set file_ididx_norm, file(bamidx_norm) from samtools_SNP_index_bam_files_norm - set genome_id, file(fasta) from samtools_SNP_fasta_file_norm - set genome2_idx, file(fasta2idx) from samtools_SNP_indexed2_fasta_file_norm - set genome3_idx, file(fasta3idx) from samtools_SNP_indexed3_fasta_file_norm - - output: - set file_id_norm, "*.vcf" into vcf_files_norm - - script: -""" -bcftools mpileup -AE -f ${fasta} ${bam_norm} --output-type v \ --a FORMAT/AD,FORMAT/ADF,FORMAT/ADR,FORMAT/DP,FORMAT/SP,INFO/AD,INFO/ADF,INFO/ADR | \ -bcftools call -mv --output-type v > ${file_id_norm}_raw.vcf -bcftools filter -s LowQual -e '%QUAL<20 || DP>100' ${file_id_norm}_raw.vcf \ -> ${file_id_norm}_filtered.vcf -""" -} - -process vcf_to_csv_tumor { - tag "$file_id_tumor" - cpus 1 - publishDir "results/SNP/vcf_samtools/", mode: 'copy' - - input: - set file_id_tumor, file(vcf) from vcf_files_tumor - - output: - set file_id_tumor, "*.csv" into csv_files_tumor - - script: -""" -gatk VariantsToTable -V ${file_id_tumor}_raw.vcf \ --F CHROM -F POS -F TYPE -GF GT -GF AD -F AD -F DP \ --O ${file_id_tumor}_raw.csv -gatk VariantsToTable -V ${file_id_tumor}_filtered.vcf \ --F CHROM -F POS -F TYPE -GF GT -GF AD -F AD -F DP \ --O ${file_id_tumor}_filtered.csv -""" -} - -process vcf_to_csv_norm { - tag "$file_id_norm" - cpus 1 - publishDir "results/SNP/vcf_samtools/", mode: 'copy' - - input: - set file_id_norm, file(vcf) from vcf_files_norm - - output: - set file_id_norm, "*.csv" into csv_files_norm - - script: -""" -gatk VariantsToTable -V ${file_id_norm}_raw.vcf \ --F CHROM -F POS -F TYPE -GF GT -GF AD -F AD -F DP \ --O ${file_id_norm}_raw.csv -gatk VariantsToTable -V ${file_id_norm}_filtered.vcf \ --F CHROM -F POS -F TYPE -GF GT -GF AD -F AD -F DP \ --O ${file_id_norm}_filtered.csv -""" -} - diff --git a/src/intersect_SNP.R b/src/intersect_SNP.R deleted file mode 100755 index 50fa2e8c..00000000 --- a/src/intersect_SNP.R +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/Rscript -rm(list = ls()) -library("tidyverse") -library("seqinr") - -args <- c( - "results/SNP/vcf_samtools/normal_sample_filtered.csv", - "results/SNP/vcf_samtools/tumor_sample_filtered.csv", - "results/fasta/DBG2OLC_output2_filtered.fasta", - "data/list_of_enzymes.csv" - ) -seq_restric_size <- 21 - -args <- commandArgs(trailingOnly = TRUE) -snp_a <- read_delim(args[1], delim = "\t") %>% - mutate(cords = paste0(CHROM, POS)) -snp_b <- read_delim(args[2], delim = "\t") %>% - mutate(cords = paste0(CHROM, POS)) - -only_b <- snp_b %>% - select(cords) %>% - setdiff(snp_a %>% select(cords)) %>% - pull(cords) - -snp <- snp_b %>% - filter(cords %in% only_b) %>% - filter(tumor_sample.GT %in% c("A/A", "T/T", "G/G", "C/C")) %>% - mutate(REF = do.call(rbind, strsplit(AD, split = ",", fixed = TRUE))[,1], - VAR = do.call(rbind, strsplit(AD, split = ",", fixed = TRUE))[,2], - REF = as.integer(REF), - VAR = as.integer(VAR), - tumor_sample.AD = NULL, - cords = NULL - ) %>% - filter(REF == 0) %>% - filter(VAR >= 10) %>% - arrange(CHROM, desc(VAR)) - -fastafile <- read.fasta(file = args[3], - as.string = TRUE) - -snp$seq_list <- snp %>% - apply(1, FUN = function(x, fastafile, POS, CHROM, seq_restric_size){ - begin <- as.integer(x[ POS ]) - ((seq_restric_size - 1) / 2) - end <- as.integer(x[ POS ]) + ((seq_restric_size - 1) / 2) - chrom <- x[ CHROM ] - seq_restric <- fastafile[[ chrom ]] %>% - c2s() %>% - substr(begin, end) %>% - s2c() - seq_restric[((seq_restric_size - 1) / 2) + 1] <- - toupper(seq_restric[((seq_restric_size - 1) / 2) + 1]) - print(paste0(chrom, ":", begin, "-", end, " ", seq_restric %>% c2s())) - return(seq_restric %>% c2s()) - }, - fastafile = fastafile, - POS = which(colnames(snp) %in% "POS"), - CHROM = which(colnames(snp) %in% "CHROM"), - seq_restric_size = seq_restric_size - ) - -snp %>% - write_csv(paste0(args[2], "only.csv" )) - - -snp <- read_csv(paste0(args[2], "only.csv" )) -enzyme_list <- read_csv(args[4]) %>% - mutate(size = nchar(seq)) - -snp <- snp %>% - mutate(enzyme = NA, - enzyme_pos = NA, - enzyme_seq = NA) -for (i in seq_len(nrow(enzyme_list))) { - enzyme <- enzyme_list[i, ] - enzyme_search <- gregexpr(toupper(enzyme$seq), - toupper(snp$seq_list), - fixed = TRUE) %>% - lapply(FUN = function(x, enzyme, seq_restric_size) { - if (x[1] < 0) { - return(c(NA, NA, NA)) - } else { - if (x[1] <= (seq_restric_size - 1) / 2 + 1 & - x[1] + enzyme$size >= (seq_restric_size - 1) / 2 + 1) { - return(c(enzyme$enzyme, x[1], enzyme$seq)) - } - return(c(NA, NA, NA)) - } - }, - enzyme = enzyme, - seq_restric_size = seq_restric_size) - enzyme_search <- do.call(rbind, enzyme_search) - snp[is.na(snp$enzyme), c("enzyme", "enzyme_pos", "enzyme_seq")] <- - enzyme_search[is.na(snp$enzyme), ] -} - -snp %>% - filter(!is.na(enzyme)) %>% - write_csv(paste0(args[2], "only_enzyme.csv" )) - -- GitLab