From 6355ecd1cc1d80dfeeaf364666149efa13fbf7bf Mon Sep 17 00:00:00 2001
From: Nathan Lecouvreur <nathan.lecouvreur@ens-lyon.fr>
Date: Mon, 14 Feb 2022 10:48:45 +0100
Subject: [PATCH]  beginning of the implementation

---
 src/GATC_analysis.py => GATC_analysis.py      |  0
 bin_GATC.py                                   | 17 +++++++++
 src/.docker_modules/GATC_finder/Dockerfile    | 15 ++++++++
 .../GATC_finder/script/GATC_finder.py         | 32 +++++++++++++++++
 src/Dam_ID_analysis.nf                        | 19 +++++-----
 src/GATC_finder.py                            | 35 -------------------
 src/nf_modules/GATC_finder/main.nf            | 21 +++++++++++
 7 files changed, 94 insertions(+), 45 deletions(-)
 rename src/GATC_analysis.py => GATC_analysis.py (100%)
 create mode 100644 bin_GATC.py
 create mode 100644 src/.docker_modules/GATC_finder/Dockerfile
 create mode 100644 src/.docker_modules/GATC_finder/script/GATC_finder.py
 delete mode 100644 src/GATC_finder.py
 create mode 100644 src/nf_modules/GATC_finder/main.nf

diff --git a/src/GATC_analysis.py b/GATC_analysis.py
similarity index 100%
rename from src/GATC_analysis.py
rename to GATC_analysis.py
diff --git a/bin_GATC.py b/bin_GATC.py
new file mode 100644
index 00000000..c7356eef
--- /dev/null
+++ b/bin_GATC.py
@@ -0,0 +1,17 @@
+import pybedtools
+import pysam
+from Bio import SeqIO
+# Load the sites BED file with pybedtools (hard-coded absolute path).
+sites = pybedtools.BedTool("/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/GATC/sites_yeast.bed")
+# NOTE(review): `sites` is not used again in this script.
+
+# Open the BAM file in binary read mode ("rb"); this path is hard-coded too.
+samfile = pysam.AlignmentFile("/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/mapping/data.bam","rb")
+
+print(samfile)
+# NOTE(review): identical print repeated; then print what fetch("chr1", 100, 120) yields.
+print(samfile)
+for read in samfile.fetch("chr1", 100, 120):
+    print(read)
+
+
diff --git a/src/.docker_modules/GATC_finder/Dockerfile b/src/.docker_modules/GATC_finder/Dockerfile
new file mode 100644
index 00000000..a0265610
--- /dev/null
+++ b/src/.docker_modules/GATC_finder/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:3.8-alpine
+# Build tools (presumably needed to compile numpy/biopython on Alpine - confirm)
+RUN apk add --no-cache \
+    make automake gcc g++ subversion python3-dev
+# --no-cache-dir keeps pip's download cache out of the image layers
+RUN pip install --no-cache-dir numpy \
+    && pip install --no-cache-dir biopython
+
+COPY script/ /script
+# Run as an unprivileged user that owns the script directory
+RUN adduser -D -u 1000 finder
+RUN chown -R finder /script
+USER finder
+# Absolute path: the relative form only resolves when the working directory is /
+ENTRYPOINT [ "python","/script/GATC_finder.py" ]
\ No newline at end of file
diff --git a/src/.docker_modules/GATC_finder/script/GATC_finder.py b/src/.docker_modules/GATC_finder/script/GATC_finder.py
new file mode 100644
index 00000000..b3e4e037
--- /dev/null
+++ b/src/.docker_modules/GATC_finder/script/GATC_finder.py
@@ -0,0 +1,32 @@
+import sys
+import re
+from Bio import SeqIO
+
+if len(sys.argv) < 3:
+    raise IndexError("Please enter 2 arguments")
+
+# Command line: <genome fasta file> <output bed file>
+genome_file = str(sys.argv[1])
+out_file_path = str(sys.argv[2])
+
+# Motif we are looking for; the search is a plain, case-sensitive
+# scan of the sequence exactly as it is stored in the fasta file
+motif = "GATC"
+
+# The context manager guarantees the .bed file is flushed and closed
+with open(out_file_path, "w") as f:
+
+    # Cycles through the parsed chromosomes from the fasta file
+    for seq_record in SeqIO.parse(genome_file, "fasta"):
+
+        # Gets the id of the chromosome in the file
+        chrom = seq_record.id
+
+        # Cycles through all the motifs found in the chromosome
+        for match in re.finditer(motif, str(seq_record.seq)):
+
+            # BED is 0-based, half-open: match.start()/match.end()
+            # already follow that convention, so no +1 offset
+            start_pos = match.start()
+            end_pos = match.end()
+            f.write(f"{chrom}\t{start_pos}\t{end_pos}\n")
\ No newline at end of file
diff --git a/src/Dam_ID_analysis.nf b/src/Dam_ID_analysis.nf
index d55f649c..2dd38bf8 100644
--- a/src/Dam_ID_analysis.nf
+++ b/src/Dam_ID_analysis.nf
@@ -4,21 +4,16 @@ nextflow.enable.dsl=2
 */
 
 
+include { fastp } from "./nf_modules/fastp/main.nf" 
+include { index_fasta ; mapping_fastq } from "./nf_modules/bowtie2/main.nf" addParams(mapping_fastq_out: "mapping/")
 
-
-
-/*========================= modules import ================================*/
-
-include { fastp } from "./nf_modules/fastp/main.nf"
-
-include { index_fasta; mapping_fastq } from "./nf_modules/bowtie2/main.nf" addParams(mapping_fastq_out: "mapping/")
-
+include { index_bam ; sort_bam} from "./nf_modules/samtools/main.nf" 
 
 
 
 params.fasta = "data/genome/*_G.fasta"
 params.fastq = "data/reads/*_R.fastq"
-
+params.bam = "results/mapping/*.bam"
 
 
 channel
@@ -32,13 +27,17 @@ channel
     .set {fastq_files}
 
 
+channel
+    .fromPath(params.bam)
+    .set{bam_file}
 /*================================ workflow ================================*/
 
 workflow {
     fastp(fastq_files)
-    //mapping
     index_fasta(fasta_files)
     mapping_fastq(index_fasta.out.index.collect(), 
                   fastp.out.fastq)
+    sort_bam(bam_file)
+    index_bam(sort_bam.out.bam)
 }
 
diff --git a/src/GATC_finder.py b/src/GATC_finder.py
deleted file mode 100644
index a14cbf65..00000000
--- a/src/GATC_finder.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import re
-
-from Bio import SeqIO
-
-
-def main(genome_file, out_file_path):
-    """[Gets all the GATC file from the given genome or sequence and puts them in a .bed file]
-
-    Args:
-        genome_file ([string]): [full path to the fasta file]
-        out_file_path ([string]): [full path to the output file]
-    """
-    # Opening the file to write the positions in
-    f = open(out_file_path, "w")
-    
-    motif = "GATC"
-    
-    # Cycles through the parsed chromosomes from the fasta file
-    for seq_record in SeqIO.parse(genome_file, "fasta"):
-        
-        # Gets the id of the chormosome in the file
-        chrom = seq_record.id
-
-        # Cycle throught all the motif that are found in the chromosome
-        for match in re.finditer(motif, str(seq_record.seq)):
-            
-            start_pos = match.start() +1
-            end_pos = match.end() + 1
-
-            # Writes the position in the .bed file (chro/start/end)
-            line = f"{chrom}\t{start_pos}\t{end_pos}\n"
-            f.write(line)   
-
-if __name__ == "__main__":
-    main()
diff --git a/src/nf_modules/GATC_finder/main.nf b/src/nf_modules/GATC_finder/main.nf
new file mode 100644
index 00000000..d9c32669
--- /dev/null
+++ b/src/nf_modules/GATC_finder/main.nf
@@ -0,0 +1,21 @@
+container
+// NOTE(review): the bare `container` word above is not attached to any process - confirm intent.
+params.genome = ""
+params.out_file = ""
+
+process GATC_finder {
+    container = "/home/nathan/projects/vscode_nextflow/nextflow-nathan/src/.docker_modules/GATC_finder"
+    label "?"
+    tag "?"
+}
+// NOTE(review): the `}` above closes the process, so input/output and the script below sit outside it.
+    input:
+        val params.genome
+        val params.out_file
+
+    output:
+        file "sites.bed"
+// NOTE(review): the command receives params.out_file while the declared output is "sites.bed" - confirm they agree.
+"""
+gatc_finder ${params.genome} ${params.out_file}
+"""
\ No newline at end of file
-- 
GitLab