diff --git a/src/GATC_analysis.py b/GATC_analysis.py similarity index 100% rename from src/GATC_analysis.py rename to GATC_analysis.py diff --git a/bin_GATC.py b/bin_GATC.py new file mode 100644 index 0000000000000000000000000000000000000000..c7356eef19f0560cfe09ff8e3ac0742840de79a2 --- /dev/null +++ b/bin_GATC.py @@ -0,0 +1,17 @@ +import pybedtools +import pysam +from Bio import SeqIO + +sites = pybedtools.BedTool("/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/GATC/sites_yeast.bed") + + + +samfile = pysam.AlignmentFile("/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/mapping/data.bam","rb") + +print(samfile) + +print(samfile) +for read in samfile.fetch("chr1", 100, 120): + print(read) + + diff --git a/src/.docker_modules/GATC_finder/Dockerfile b/src/.docker_modules/GATC_finder/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..a026561016f9a673d03af8efc85697d4affd8bdd --- /dev/null +++ b/src/.docker_modules/GATC_finder/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.8-alpine + +RUN apk update \ + && apk add make automake gcc g++ subversion python3-dev + +RUN pip install numpy\ + && pip install biopython + +COPY script/ /script + +RUN adduser -D -u 1000 finder +RUN chown -R finder /script +USER finder + +ENTRYPOINT [ "python","script/GATC_finder.py" ] \ No newline at end of file diff --git a/src/.docker_modules/GATC_finder/script/GATC_finder.py b/src/.docker_modules/GATC_finder/script/GATC_finder.py new file mode 100644 index 0000000000000000000000000000000000000000..b3e4e037a9b25cd313d9fff86abc6aca155e8e06 --- /dev/null +++ b/src/.docker_modules/GATC_finder/script/GATC_finder.py @@ -0,0 +1,32 @@ +import sys +import re +from Bio import SeqIO + +if len(sys.argv) < 3: + raise IndexError("Please enter 2 arguments") + +# Gets the arguments in the command line +out_file_path = str(sys.argv[2]) +genome_file = str(sys.argv[1]) + +# Opening the file to write the positions in +f = open(out_file_path, "w") + +# Motif we 
are looking for +motif = "GATC" + +# Cycles through the parsed chromosomes from the fasta file +for seq_record in SeqIO.parse(genome_file, "fasta"): + + # Gets the id of the chromosome in the file + chrom = seq_record.id + + # Cycle through all the motifs that are found in the chromosome + for match in re.finditer(motif, str(seq_record.seq)): + + start_pos = match.start() +1 + end_pos = match.end() + 1 + + # Writes the position in the .bed file (chrom/start/end) + line = f"{chrom}\t{start_pos}\t{end_pos}\n" + f.write(line) \ No newline at end of file diff --git a/src/Dam_ID_analysis.nf b/src/Dam_ID_analysis.nf index d55f649c317c72bf578a0b92d85bb35f49326a71..2dd38bf857c3540f7bfb5cf964b3f8c364fafccd 100644 --- a/src/Dam_ID_analysis.nf +++ b/src/Dam_ID_analysis.nf @@ -4,21 +4,16 @@ nextflow.enable.dsl=2 */ +include { fastp } from "./nf_modules/fastp/main.nf" +include { index_fasta ; mapping_fastq } from "./nf_modules/bowtie2/main.nf" addParams(mapping_fastq_out: "mapping/") - - -/*========================= modules import ================================*/ - -include { fastp } from "./nf_modules/fastp/main.nf" - -include { index_fasta; mapping_fastq } from "./nf_modules/bowtie2/main.nf" addParams(mapping_fastq_out: "mapping/") - +include { index_bam ; sort_bam} from "./nf_modules/samtools/main.nf" params.fasta = "data/genome/*_G.fasta" params.fastq = "data/reads/*_R.fastq" - +params.bam = "results/mapping/*.bam" channel @@ -32,13 +27,17 @@ channel .set {fastq_files} +channel + .fromPath(params.bam) + .set{bam_file} /*================================ workflow ================================*/ workflow { fastp(fastq_files) - //mapping index_fasta(fasta_files) mapping_fastq(index_fasta.out.index.collect(), fastp.out.fastq) + sort_bam(bam_file) + index_bam(sort_bam.out.bam) } diff --git a/src/GATC_finder.py b/src/GATC_finder.py deleted file mode 100644 index a14cbf651b36dbd05ecd9ee9412fa8b4f89497d5..0000000000000000000000000000000000000000 --- a/src/GATC_finder.py +++ 
/dev/null @@ -1,35 +0,0 @@ -import re - -from Bio import SeqIO - - -def main(genome_file, out_file_path): - """[Gets all the GATC file from the given genome or sequence and puts them in a .bed file] - - Args: - genome_file ([string]): [full path to the fasta file] - out_file_path ([string]): [full path to the output file] - """ - # Opening the file to write the positions in - f = open(out_file_path, "w") - - motif = "GATC" - - # Cycles through the parsed chromosomes from the fasta file - for seq_record in SeqIO.parse(genome_file, "fasta"): - - # Gets the id of the chormosome in the file - chrom = seq_record.id - - # Cycle throught all the motif that are found in the chromosome - for match in re.finditer(motif, str(seq_record.seq)): - - start_pos = match.start() +1 - end_pos = match.end() + 1 - - # Writes the position in the .bed file (chro/start/end) - line = f"{chrom}\t{start_pos}\t{end_pos}\n" - f.write(line) - -if __name__ == "__main__": - main() diff --git a/src/nf_modules/GATC_finder/main.nf b/src/nf_modules/GATC_finder/main.nf new file mode 100644 index 0000000000000000000000000000000000000000..d9c32669d5d40a27959da9fa47bd8623de3cbcc1 --- /dev/null +++ b/src/nf_modules/GATC_finder/main.nf @@ -0,0 +1,21 @@ +container + +params.genome = "" +params.out_file = "" + +process GATC_finder { + container = "/home/nathan/projects/vscode_nextflow/nextflow-nathan/src/.docker_modules/GATC_finder" + label "?" + tag "?" +} + + input: + val params.genome + val params.out_file + + output: + file "sites.bed" + +""" +gatc_finder ${params.genome} ${params.out_file} +""" \ No newline at end of file