beginning of the implementation

6355ecd1 · nlecouvr · e8997bda · 6355ecd1 · 6355ecd1 · 6355ecd1
Commit 6355ecd1 authored Feb 14, 2022 by nlecouvr
--- a/src/GATC_analysis.py
+++ b/src/GATC_analysis.py
--- a/bin_GATC.py
+++ b/bin_GATC.py
+import pybedtools
+import pysam
+from Bio import SeqIO
+
+# BED intervals loaded from the results/GATC output directory.
+# NOTE(review): presumably the GATC sites to bin reads against; `sites` is
+# not used yet in this script.
+sites = pybedtools.BedTool("/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/GATC/sites_yeast.bed")
+
+# Open the BAM through a context manager so the handle is always released,
+# even if fetch() raises.
+# NOTE(review): fetch() on a region needs an indexed BAM - confirm that
+# data.bam has its .bai next to it.
+with pysam.AlignmentFile("/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/mapping/data.bam", "rb") as samfile:
+    print(samfile)
+
+    # Dump every read returned for the chr1 100-120 window.
+    for read in samfile.fetch("chr1", 100, 120):
+        print(read)
+
+
--- a/src/.docker_modules/GATC_finder/Dockerfile
+++ b/src/.docker_modules/GATC_finder/Dockerfile
+FROM python:3.8-alpine
+
+# Build toolchain (presumably needed to compile the Python packages below
+# on Alpine). --no-cache keeps the apk index out of the image layer.
+RUN apk add --no-cache make automake gcc g++ subversion python3-dev
+
+# --no-cache-dir keeps pip's download cache out of the image layer.
+RUN pip install --no-cache-dir numpy biopython
+
+COPY script/ /script
+
+# Run as an unprivileged user that owns the script directory.
+RUN adduser -D -u 1000 finder \
+    && chown -R finder /script
+USER finder
+
+# Absolute path: the relative form only resolved when the container's
+# working directory happened to be /.
+ENTRYPOINT [ "python", "/script/GATC_finder.py" ]
\ No newline at end of file
--- a/src/.docker_modules/GATC_finder/script/GATC_finder.py
+++ b/src/.docker_modules/GATC_finder/script/GATC_finder.py
+import sys
+import re
+from Bio import SeqIO
+
+if len(sys.argv) < 3:
+    raise IndexError("Please enter 2 arguments")
+
+# Gets the arguments in the command line: <genome fasta> <output bed>
+genome_file = sys.argv[1]
+out_file_path = sys.argv[2]
+
+# Motif we are looking for, compiled once for all the chromosomes
+motif = "GATC"
+motif_pattern = re.compile(motif)
+
+# The context manager guarantees the output is flushed and closed, even if
+# parsing fails part-way through the genome.
+with open(out_file_path, "w") as f:
+
+    # Cycles through the parsed chromosomes from the fasta file
+    for seq_record in SeqIO.parse(genome_file, "fasta"):
+
+        # Gets the id of the chromosome in the file
+        chrom = seq_record.id
+
+        # Cycles through all the motifs that are found in the chromosome
+        # NOTE(review): the search is case-sensitive, so lowercase
+        # (soft-masked) "gatc" sites are skipped - confirm this is intended.
+        for match in motif_pattern.finditer(str(seq_record.seq)):
+
+            # BED intervals are 0-based and half-open, which is exactly what
+            # match.start() / match.end() return, so no +1 shift is applied.
+            start_pos = match.start()
+            end_pos = match.end()
+
+            # Writes the position in the .bed file (chrom/start/end)
+            f.write(f"{chrom}\t{start_pos}\t{end_pos}\n")
\ No newline at end of file
--- a/src/Dam_ID_analysis.nf
+++ b/src/Dam_ID_analysis.nf
@@ -4,21 +4,16 @@ nextflow.enable.dsl=2
 */


-
-
-
-/*========================= modules import ================================*/
-
 include { fastp } from "./nf_modules/fastp/main.nf" 
-
 include { index_fasta ; mapping_fastq } from "./nf_modules/bowtie2/main.nf" addParams(mapping_fastq_out: "mapping/")

+include { index_bam ; sort_bam} from "./nf_modules/samtools/main.nf" // processes called by the workflow to sort, then index, the BAM files



 params.fasta = "data/genome/*_G.fasta"
 params.fastq = "data/reads/*_R.fastq"
-
+params.bam = "results/mapping/*.bam" // glob of the BAM files handed to sort_bam; NOTE(review): presumably the files published by mapping_fastq - confirm


 channel
@@ -32,13 +27,17 @@ channel
    .set {fastq_files}


+channel
+    .fromPath(params.bam)
+    .set{bam_file} // bam_file: channel of the paths matching params.bam
 /*================================ workflow ================================*/

 workflow {
    fastp(fastq_files)
-    //mapping
    index_fasta(fasta_files)
    mapping_fastq(index_fasta.out.index.collect(), 
                  fastp.out.fastq)
+    sort_bam(bam_file) // NOTE(review): bam_file is globbed from disk, not wired to mapping_fastq's output, so it may be empty on a fresh run - confirm intent
+    index_bam(sort_bam.out.bam) // indexes the bam output of sort_bam
 }

--- a/src/GATC_finder.py
+++ b/src/GATC_finder.py
-import re
-
-from Bio import SeqIO
-
-
-def main(genome_file, out_file_path):
-    """[Gets all the GATC file from the given genome or sequence and puts them in a .bed file]
-
-    Args:
-        genome_file ([string]): [full path to the fasta file]
-        out_file_path ([string]): [full path to the output file]
-    """
-    # Opening the file to write the positions in
-    f = open(out_file_path, "w")
-    
-    motif = "GATC"
-    
-    # Cycles through the parsed chromosomes from the fasta file
-    for seq_record in SeqIO.parse(genome_file, "fasta"):
-        
-        # Gets the id of the chormosome in the file
-        chrom = seq_record.id
-
-        # Cycle throught all the motif that are found in the chromosome
-        for match in re.finditer(motif, str(seq_record.seq)):
-            
-            start_pos = match.start() +1
-            end_pos = match.end() + 1
-
-            # Writes the position in the .bed file (chro/start/end)
-            line = f"{chrom}\t{start_pos}\t{end_pos}\n"
-            f.write(line)   
-
-if __name__ == "__main__":
-    main()
--- a/src/nf_modules/GATC_finder/main.nf
+++ b/src/nf_modules/GATC_finder/main.nf
+params.genome = ""
+params.out_file = ""
+
+// Runs gatc_finder on a genome and collects the resulting BED file.
+// Expected call: GATC_finder(params.genome, params.out_file)
+process GATC_finder {
+    // NOTE(review): this is the path of the Docker build context, not an
+    // image name - replace it with the tag of the image built from it.
+    container = "/home/nathan/projects/vscode_nextflow/nextflow-nathan/src/.docker_modules/GATC_finder"
+    // NOTE(review): placeholder label and tag, to be filled in.
+    label "?"
+    tag "?"
+
+    input:
+        // Path of the genome fasta file and of the BED file to write.
+        val genome
+        val out_file
+
+    output:
+        // NOTE(review): only found if out_file is "sites.bed" - confirm.
+        file "sites.bed"
+
+    script:
+    // NOTE(review): gatc_finder must exist on the container's PATH; the
+    // image in this commit only ships script/GATC_finder.py - confirm.
+"""
+gatc_finder ${genome} ${out_file}
+"""
+}
\ No newline at end of file