Commit 6355ecd1 authored by nlecouvr's avatar nlecouvr
Browse files

beginning of the implementation

parent e8997bda
import pybedtools
import pysam
from Bio import SeqIO
# Exploratory script: load the GATC site intervals and print the reads of the
# mapped BAM file that fall in a small window of chr1.
sites = pybedtools.BedTool(
    "/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/GATC/sites_yeast.bed"
)

# Open the BAM through a context manager so the file handle is released even
# if fetch() raises.
# NOTE(review): fetch() on a region presumably requires the BAM index (.bai)
# next to the file -- confirm it is produced before this script runs.
with pysam.AlignmentFile(
    "/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/mapping/data.bam",
    "rb",
) as samfile:
    print(samfile)
    # Reads overlapping chr1:100-120
    for read in samfile.fetch("chr1", 100, 120):
        print(read)
# Image that runs script/GATC_finder.py as an unprivileged user.
FROM python:3.8-alpine

# Compiler toolchain, presumably needed to build numpy/biopython from source
# on Alpine. --no-cache fetches a fresh package index without storing it in
# the layer, which replaces the separate `apk update`.
RUN apk add --no-cache make automake gcc g++ subversion python3-dev

# numpy is installed first, then biopython; --no-cache-dir keeps pip's
# download cache out of the image layer.
RUN pip install --no-cache-dir numpy \
 && pip install --no-cache-dir biopython

COPY script/ /script

# Dedicated non-root user (uid 1000) that owns the script directory.
RUN adduser -D -u 1000 finder
RUN chown -R finder /script
USER finder

# Absolute path so the entrypoint still resolves when the container is started
# with a working directory other than / (e.g. `docker run -w ...`).
ENTRYPOINT [ "python", "/script/GATC_finder.py" ]
\ No newline at end of file
import sys
import re
from Bio import SeqIO
# Command-line usage: GATC_finder.py <genome.fasta> <output.bed>
if len(sys.argv) < 3:
    raise IndexError("Please enter 2 arguments")

# Gets the arguments in the command line (sys.argv entries are already str)
genome_file = sys.argv[1]
out_file_path = sys.argv[2]

# Motif we are looking for (matching is case-sensitive)
motif = "GATC"

# Opening the file to write the positions in; the context manager flushes and
# closes it even if parsing the FASTA file fails part-way through.
with open(out_file_path, "w") as f:
    # Cycles through the parsed chromosomes from the fasta file
    for seq_record in SeqIO.parse(genome_file, "fasta"):
        # Gets the id of the chromosome in the file
        chrom = seq_record.id
        # Cycles through all the motifs that are found in the chromosome
        for match in re.finditer(motif, str(seq_record.seq)):
            # BED intervals are 0-based with an exclusive end, which is exactly
            # what match.start()/match.end() return, so no +1 shift is applied.
            start_pos = match.start()
            end_pos = match.end()
            # Writes the position in the .bed file (chrom/start/end)
            f.write(f"{chrom}\t{start_pos}\t{end_pos}\n")
\ No newline at end of file
......@@ -4,21 +4,16 @@ nextflow.enable.dsl=2
*/
include { fastp } from "./nf_modules/fastp/main.nf"
include { index_fasta ; mapping_fastq } from "./nf_modules/bowtie2/main.nf" addParams(mapping_fastq_out: "mapping/")
/*========================= modules import ================================*/
include { fastp } from "./nf_modules/fastp/main.nf"
include { index_fasta; mapping_fastq } from "./nf_modules/bowtie2/main.nf" addParams(mapping_fastq_out: "mapping/")
include { index_bam ; sort_bam} from "./nf_modules/samtools/main.nf"
params.fasta = "data/genome/*_G.fasta"
params.fastq = "data/reads/*_R.fastq"
params.bam = "results/mapping/*.bam"
channel
......@@ -32,13 +27,17 @@ channel
.set {fastq_files}
channel
.fromPath(params.bam)
.set{bam_file}
/*================================ workflow ================================*/
// Entry workflow: runs fastp on the reads, index_fasta on the genome,
// mapping_fastq on both results, then sort_bam / index_bam on bam_file.
workflow {
// read preprocessing
fastp(fastq_files)
// mapping: the collected index is paired with the fastp output
index_fasta(fasta_files)
mapping_fastq(index_fasta.out.index.collect(),
fastp.out.fastq)
// NOTE(review): bam_file is built from params.bam (files matched on disk),
// not from the output of mapping_fastq -- confirm the BAM files already exist
// when the run starts, or wire the mapping output in here instead.
sort_bam(bam_file)
index_bam(sort_bam.out.bam)
}
import re
from Bio import SeqIO
def main(genome_file: str, out_file_path: str, motif: str = "GATC") -> None:
    """Write every occurrence of a motif in a FASTA file to a BED file.

    One tab-separated line (record id, start, end) is written per match.
    Coordinates follow the BED convention: 0-based start, exclusive end.
    Matching is case-sensitive and matches do not overlap.

    Args:
        genome_file (str): path to the FASTA file to scan.
        out_file_path (str): path to the output file; it is opened in "w"
            mode, so existing content is overwritten.
        motif (str): regular expression to look for. Defaults to "GATC".
    """
    # Compiled once, outside the per-record loop
    pattern = re.compile(motif)

    # The context manager flushes and closes the output file even if parsing
    # the FASTA file fails part-way through.
    with open(out_file_path, "w") as bed_file:
        # Cycles through the parsed records (chromosomes) of the fasta file
        for seq_record in SeqIO.parse(genome_file, "fasta"):
            chrom = seq_record.id
            for match in pattern.finditer(str(seq_record.seq)):
                # match.start() is 0-based and match.end() is exclusive, which
                # is already the BED convention, so no +1 shift is applied.
                bed_file.write(f"{chrom}\t{match.start()}\t{match.end()}\n")
if __name__ == "__main__":
    # Imported here because sys is only needed when run as a script.
    import sys

    # Usage: GATC_finder.py <genome.fasta> <output.bed>
    if len(sys.argv) < 3:
        raise IndexError("Please enter 2 arguments")
    # main() requires both paths; calling it without arguments raises TypeError.
    main(sys.argv[1], sys.argv[2])
container
// Default (empty) values for the parameters this module reads.
params.genome = ""
params.out_file = ""

// Runs gatc_finder on a genome file and declares sites.bed as its output.
// Expected call shape: GATC_finder(params.genome, params.out_file)
process GATC_finder {
  // NOTE(review): this is a local directory path rather than an image name --
  // confirm what the configured container engine expects here.
  container = "/home/nathan/projects/vscode_nextflow/nextflow-nathan/src/.docker_modules/GATC_finder"
  // NOTE(review): "?" is a placeholder for both directives; presumably label
  // must be a valid identifier -- TODO replace before running.
  label "?"
  tag "?"

  // input/output/script must live inside the process body, and each input
  // needs a variable name that the script can reference.
  input:
    val genome
    val out_file

  output:
    // NOTE(review): the command writes to out_file; confirm it is "sites.bed",
    // otherwise this declared output will not be found.
    file "sites.bed"

  script:
  // NOTE(review): assumes a `gatc_finder` executable is on PATH inside the
  // container -- the script file visible in this commit is GATC_finder.py.
  """
  gatc_finder ${genome} ${out_file}
  """
}
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment