Skip to content
Snippets Groups Projects
Commit 6355ecd1 authored by nlecouvr's avatar nlecouvr
Browse files

beginning of the implementation

parent e8997bda
Branches
No related tags found
No related merge requests found
File moved
import pybedtools
import pysam
from Bio import SeqIO
# Exploratory script: loads the GATC sites BED file and prints the reads
# overlapping a small window of the mapped BAM file.
# NOTE(review): both paths are hard-coded to a developer's home directory.
sites = pybedtools.BedTool("/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/GATC/sites_yeast.bed")

# Open the BAM through a context manager so the underlying file handle is
# released even if fetch() raises (e.g. unknown contig).
with pysam.AlignmentFile("/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/mapping/data.bam", "rb") as samfile:
    # NOTE(review): the handle is printed twice in the original; kept as-is,
    # but the second print looks accidental.
    print(samfile)
    print(samfile)
    # Print every read returned for chr1 in the requested 100-120 window.
    for read in samfile.fetch("chr1", 100, 120):
        print(read)
# Image that runs script/GATC_finder.py as an unprivileged user.
FROM python:3.8-alpine

# Build tools; presumably required to compile numpy/biopython on Alpine.
# --no-cache fetches the package index on the fly and leaves none in the layer.
RUN apk add --no-cache make automake gcc g++ subversion python3-dev

# numpy is installed first, as in the original; --no-cache-dir keeps the pip
# download cache out of the image.
RUN pip install --no-cache-dir numpy \
 && pip install --no-cache-dir biopython

COPY script/ /script

# Dedicated non-root user (uid 1000) that owns the script directory.
RUN adduser -D -u 1000 finder
RUN chown -R finder /script
USER finder

# Absolute path: a relative "script/..." only resolves when the working
# directory is "/", which is not guaranteed when the container is started with
# a different working directory (e.g. `docker run -w ...`).
ENTRYPOINT [ "python", "/script/GATC_finder.py" ]
\ No newline at end of file
import sys
import re
from Bio import SeqIO
# Usage: GATC_finder.py <genome.fasta> <output file>
# Scans every record of the FASTA file for the GATC motif and writes one
# tab-separated line (record id, start, end) per occurrence.
if len(sys.argv) < 3:
    raise IndexError("Please enter 2 arguments")

# Gets the arguments from the command line: input FASTA first, output second
genome_file = str(sys.argv[1])
out_file_path = str(sys.argv[2])

# Motif we are looking for
motif = "GATC"
# Compiled once instead of on every record
motif_pattern = re.compile(motif)

# The context manager guarantees the output file is flushed and closed, even
# if parsing the FASTA file fails halfway through.
with open(out_file_path, "w") as f:
    # Cycles through the parsed chromosomes from the fasta file
    for seq_record in SeqIO.parse(genome_file, "fasta"):
        # Gets the id of the chromosome in the file
        chrom = seq_record.id
        # Cycles through all the motifs that are found in the chromosome
        for match in motif_pattern.finditer(str(seq_record.seq)):
            # NOTE(review): both bounds are shifted by +1 relative to the
            # 0-based match offsets. BED is 0-based with an exclusive end, so
            # match.start()/match.end() would already be valid BED bounds --
            # confirm which convention downstream tools expect.
            start_pos = match.start() + 1
            end_pos = match.end() + 1
            # Writes the position in the .bed file (chrom/start/end)
            line = f"{chrom}\t{start_pos}\t{end_pos}\n"
            f.write(line)
\ No newline at end of file
......@@ -4,21 +4,16 @@ nextflow.enable.dsl=2
*/
/*========================= modules import ================================*/
include { fastp } from "./nf_modules/fastp/main.nf"
include { index_fasta ; mapping_fastq } from "./nf_modules/bowtie2/main.nf" addParams(mapping_fastq_out: "mapping/")
include { index_bam ; sort_bam} from "./nf_modules/samtools/main.nf"
// Default glob patterns for the pipeline inputs: genome FASTA files, FASTQ
// reads, and the BAM files found under results/mapping/.
params.fasta = "data/genome/*_G.fasta"
params.fastq = "data/reads/*_R.fastq"
params.bam = "results/mapping/*.bam"
channel
......@@ -32,13 +27,17 @@ channel
.set {fastq_files}
// Channel of the files matching params.bam, bound to the name bam_file
channel
.fromPath(params.bam)
.set{bam_file}
/*================================ workflow ================================*/
workflow {
// fastp step on the FASTQ channel; its fastq output feeds the mapping below
fastp(fastq_files)
//mapping
// Index the genome, then map the fastp reads against the collected index
index_fasta(fasta_files)
mapping_fastq(index_fasta.out.index.collect(),
fastp.out.fastq)
// NOTE(review): sort_bam consumes bam_file, which is globbed from params.bam
// on disk, not an output of mapping_fastq above. If those BAM files are meant
// to be the ones produced by this run, nothing here orders the two steps --
// confirm whether sort_bam should take the mapping_fastq output instead.
sort_bam(bam_file)
// Index the sorted BAM emitted by sort_bam
index_bam(sort_bam.out.bam)
}
import re
from Bio import SeqIO
def main(genome_file, out_file_path):
    """Write the position of every GATC motif found in a FASTA file.

    Each record of the FASTA file is scanned for non-overlapping occurrences
    of the motif; one tab-separated line (record id, start, end) is written
    per occurrence.

    Args:
        genome_file (str): path to the FASTA file to scan.
        out_file_path (str): path to the output file; it is created or
            truncated.
    """
    # Compiled once instead of on every record
    motif = re.compile("GATC")

    # The context manager guarantees the output file is flushed and closed,
    # even if parsing the FASTA file fails halfway through.
    with open(out_file_path, "w") as out_file:
        # Cycles through the parsed chromosomes from the fasta file
        for seq_record in SeqIO.parse(genome_file, "fasta"):
            # Gets the id of the chromosome in the file
            chrom = seq_record.id
            # Cycles through all the motifs that are found in the chromosome
            for match in motif.finditer(str(seq_record.seq)):
                # NOTE(review): both bounds are shifted by +1 relative to the
                # 0-based match offsets. BED is 0-based with an exclusive end,
                # so match.start()/match.end() would already be valid BED
                # bounds -- confirm which convention downstream tools expect.
                start_pos = match.start() + 1
                end_pos = match.end() + 1
                # Writes the position in the .bed file (chrom/start/end)
                out_file.write(f"{chrom}\t{start_pos}\t{end_pos}\n")
if __name__ == "__main__":
    # Imported locally: sys is only needed when the module is run as a script.
    import sys

    # main() requires both paths; calling it without arguments raises a
    # TypeError, so read them from the command line (FASTA first, output
    # second).
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} <genome.fasta> <output.bed>")
    main(sys.argv[1], sys.argv[2])
container
// Parameters consumed by the GATC_finder process; both default to empty.
params.genome = ""
params.out_file = ""

/*
 * Runs the gatc_finder command on params.genome / params.out_file and
 * declares sites.bed as the process output.
 *
 * The closing brace now comes after the script block: previously it closed
 * the process right after the directives, leaving input/output/script
 * outside of any process definition.
 */
process GATC_finder {
  // NOTE(review): hard-coded local path used as the container reference --
  // confirm this is what the executor expects.
  container = "/home/nathan/projects/vscode_nextflow/nextflow-nathan/src/.docker_modules/GATC_finder"
  // NOTE(review): placeholder label/tag values, still to be filled in.
  label "?"
  tag "?"

  input:
    // NOTE(review): inputs are declared directly from params rather than as
    // named process inputs -- confirm this is intended.
    val params.genome
    val params.out_file

  output:
    // NOTE(review): the declared output name is fixed while the command
    // writes to params.out_file; the two must match for the output to be
    // found.
    file "sites.bed"

  script:
  """
  gatc_finder ${params.genome} ${params.out_file}
  """
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment