From 6355ecd1cc1d80dfeeaf364666149efa13fbf7bf Mon Sep 17 00:00:00 2001 From: Nathan Lecouvreur <nathan.lecouvreur@ens-lyon.fr> Date: Mon, 14 Feb 2022 10:48:45 +0100 Subject: [PATCH] beginning of the implementation --- src/GATC_analysis.py => GATC_analysis.py | 0 bin_GATC.py | 17 +++++++++ src/.docker_modules/GATC_finder/Dockerfile | 15 ++++++++ .../GATC_finder/script/GATC_finder.py | 32 +++++++++++++++++ src/Dam_ID_analysis.nf | 19 +++++----- src/GATC_finder.py | 35 ------------------- src/nf_modules/GATC_finder/main.nf | 21 +++++++++++ 7 files changed, 94 insertions(+), 45 deletions(-) rename src/GATC_analysis.py => GATC_analysis.py (100%) create mode 100644 bin_GATC.py create mode 100644 src/.docker_modules/GATC_finder/Dockerfile create mode 100644 src/.docker_modules/GATC_finder/script/GATC_finder.py delete mode 100644 src/GATC_finder.py create mode 100644 src/nf_modules/GATC_finder/main.nf diff --git a/src/GATC_analysis.py b/GATC_analysis.py similarity index 100% rename from src/GATC_analysis.py rename to GATC_analysis.py diff --git a/bin_GATC.py b/bin_GATC.py new file mode 100644 index 00000000..c7356eef --- /dev/null +++ b/bin_GATC.py @@ -0,0 +1,17 @@ +import pybedtools +import pysam +from Bio import SeqIO + +sites = pybedtools.BedTool("/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/GATC/sites_yeast.bed") + + + +samfile = pysam.AlignmentFile("/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/mapping/data.bam","rb") + +print(samfile) + +print(samfile) +for read in samfile.fetch("chr1", 100, 120): + print(read) + + diff --git a/src/.docker_modules/GATC_finder/Dockerfile b/src/.docker_modules/GATC_finder/Dockerfile new file mode 100644 index 00000000..a0265610 --- /dev/null +++ b/src/.docker_modules/GATC_finder/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.8-alpine + +RUN apk update \ + && apk add make automake gcc g++ subversion python3-dev + +RUN pip install numpy\ + && pip install biopython + +COPY script/ /script + +RUN adduser -D -u 
1000 finder +RUN chown -R finder /script +USER finder + +ENTRYPOINT [ "python","/script/GATC_finder.py" ] \ No newline at end of file diff --git a/src/.docker_modules/GATC_finder/script/GATC_finder.py b/src/.docker_modules/GATC_finder/script/GATC_finder.py new file mode 100644 index 00000000..b3e4e037 --- /dev/null +++ b/src/.docker_modules/GATC_finder/script/GATC_finder.py @@ -0,0 +1,32 @@ +import sys +import re +from Bio import SeqIO + +if len(sys.argv) < 3: + raise IndexError("Please enter 2 arguments") + +# Gets the arguments in the command line +out_file_path = str(sys.argv[2]) +genome_file = str(sys.argv[1]) + +# Motif we are looking for +motif = "GATC" + +# Open the output file; the with-block closes (and flushes) it even on error +with open(out_file_path, "w") as f: + + # Cycles through the parsed chromosomes from the fasta file + for seq_record in SeqIO.parse(genome_file, "fasta"): + + # Gets the id of the chromosome in the file + chrom = seq_record.id + + # Cycle through all the motifs that are found in the chromosome + for match in re.finditer(motif, str(seq_record.seq)): + # BED intervals are 0-based and half-open, exactly like match.start()/match.end() + start_pos = match.start() + end_pos = match.end() + + # Writes the position in the .bed file (chrom/start/end) + line = f"{chrom}\t{start_pos}\t{end_pos}\n" + f.write(line) \ No newline at end of file diff --git a/src/Dam_ID_analysis.nf b/src/Dam_ID_analysis.nf index d55f649c..2dd38bf8 100644 --- a/src/Dam_ID_analysis.nf +++ b/src/Dam_ID_analysis.nf @@ -4,21 +4,16 @@ nextflow.enable.dsl=2 */ +include { fastp } from "./nf_modules/fastp/main.nf" +include { index_fasta ; mapping_fastq } from "./nf_modules/bowtie2/main.nf" addParams(mapping_fastq_out: "mapping/") - - -/*========================= modules import ================================*/ - -include { fastp } from "./nf_modules/fastp/main.nf" - -include { index_fasta; mapping_fastq } from "./nf_modules/bowtie2/main.nf" addParams(mapping_fastq_out: "mapping/") - +include { index_bam ; sort_bam} from "./nf_modules/samtools/main.nf" params.fasta = 
"data/genome/*_G.fasta" params.fastq = "data/reads/*_R.fastq" - +params.bam = "results/mapping/*.bam" channel @@ -32,13 +27,17 @@ channel .set {fastq_files} +channel + .fromPath(params.bam) + .set{bam_file} /*================================ workflow ================================*/ workflow { fastp(fastq_files) - //mapping index_fasta(fasta_files) mapping_fastq(index_fasta.out.index.collect(), fastp.out.fastq) + sort_bam(bam_file) + index_bam(sort_bam.out.bam) } diff --git a/src/GATC_finder.py b/src/GATC_finder.py deleted file mode 100644 index a14cbf65..00000000 --- a/src/GATC_finder.py +++ /dev/null @@ -1,35 +0,0 @@ -import re - -from Bio import SeqIO - - -def main(genome_file, out_file_path): - """[Gets all the GATC file from the given genome or sequence and puts them in a .bed file] - - Args: - genome_file ([string]): [full path to the fasta file] - out_file_path ([string]): [full path to the output file] - """ - # Opening the file to write the positions in - f = open(out_file_path, "w") - - motif = "GATC" - - # Cycles through the parsed chromosomes from the fasta file - for seq_record in SeqIO.parse(genome_file, "fasta"): - - # Gets the id of the chormosome in the file - chrom = seq_record.id - - # Cycle throught all the motif that are found in the chromosome - for match in re.finditer(motif, str(seq_record.seq)): - - start_pos = match.start() +1 - end_pos = match.end() + 1 - - # Writes the position in the .bed file (chro/start/end) - line = f"{chrom}\t{start_pos}\t{end_pos}\n" - f.write(line) - -if __name__ == "__main__": - main() diff --git a/src/nf_modules/GATC_finder/main.nf b/src/nf_modules/GATC_finder/main.nf new file mode 100644 index 00000000..d9c32669 --- /dev/null +++ b/src/nf_modules/GATC_finder/main.nf @@ -0,0 +1,21 @@ +container + +params.genome = "" +params.out_file = "" + +process GATC_finder { + container = "/home/nathan/projects/vscode_nextflow/nextflow-nathan/src/.docker_modules/GATC_finder" + label "?" + tag "?" 
+} + + input: + val params.genome + val params.out_file + + output: + file "sites.bed" + +""" +gatc_finder ${params.genome} ${params.out_file} +""" \ No newline at end of file -- GitLab