From e8997bdaca4abdd92169baeb151ca9fcd67f9e5f Mon Sep 17 00:00:00 2001 From: Nathan Lecouvreur <nathan.lecouvreur@ens-lyon.fr> Date: Wed, 2 Feb 2022 10:49:30 +0100 Subject: [PATCH] made finder runnable --- GATC_finder.py | 35 +++++++++++++++++++++++++++++++++++ src/GATC_analysis.py | 8 +++++++- src/GATC_finder.py | 39 ++++++++++++++++++++++----------------- 3 files changed, 64 insertions(+), 18 deletions(-) create mode 100644 GATC_finder.py diff --git a/GATC_finder.py b/GATC_finder.py new file mode 100644 index 0000000..a14cbf6 --- /dev/null +++ b/GATC_finder.py @@ -0,0 +1,35 @@ +import re + +from Bio import SeqIO + + +def main(genome_file, out_file_path): + """[Gets all the GATC file from the given genome or sequence and puts them in a .bed file] + + Args: + genome_file ([string]): [full path to the fasta file] + out_file_path ([string]): [full path to the output file] + """ + # Opening the file to write the positions in + f = open(out_file_path, "w") + + motif = "GATC" + + # Cycles through the parsed chromosomes from the fasta file + for seq_record in SeqIO.parse(genome_file, "fasta"): + + # Gets the id of the chormosome in the file + chrom = seq_record.id + + # Cycle throught all the motif that are found in the chromosome + for match in re.finditer(motif, str(seq_record.seq)): + + start_pos = match.start() +1 + end_pos = match.end() + 1 + + # Writes the position in the .bed file (chro/start/end) + line = f"{chrom}\t{start_pos}\t{end_pos}\n" + f.write(line) + +if __name__ == "__main__": + main() diff --git a/src/GATC_analysis.py b/src/GATC_analysis.py index 5d837c2..1a75638 100644 --- a/src/GATC_analysis.py +++ b/src/GATC_analysis.py @@ -69,9 +69,15 @@ for chrom, regions, name in zip(chromosomes, chrom_regions, id_list): if j >= 5: j = 0 i += 1 - + + pos = np.arange(1, int(max(regions)), 1 ) + y = np.full(len(pos), 39) + print(len(pos)) + print(len(y)) + axes[i, j].set_title(name) axes[i, j].set_ylabel("site number / bin") + axes[i, j].plot(pos, y, color = "black") axes[i, j].plot(regions, chrom) j += 1 diff --git a/src/GATC_finder.py b/src/GATC_finder.py index 12816e8..a14cbf6 100644 --- a/src/GATC_finder.py +++ b/src/GATC_finder.py @@ -1,30 +1,35 @@ import re -import matplotlib.pyplot as plt -import numpy as np -import pandas -from Bio import SeqIO, motifs -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord +from Bio import SeqIO -def main(): +def main(genome_file, out_file_path): + """[Gets all the GATC file from the given genome or sequence and puts them in a .bed file] + + Args: + genome_file ([string]): [full path to the fasta file] + out_file_path ([string]): [full path to the output file] + """ + # Opening the file to write the positions in + f = open(out_file_path, "w") - f = open("/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/GATC/sites.bed", "w") motif = "GATC" - pos_list = list() - - for seq_record in SeqIO.parse("/home/nathan/projects/vscode_nextflow/nextflow-nathan/data/genome/data_G.fasta", "fasta"): - chrom = seq_record.id + + # Cycles through the parsed chromosomes from the fasta file + for seq_record in SeqIO.parse(genome_file, "fasta"): + # Gets the id of the chormosome in the file + chrom = seq_record.id + + # Cycle throught all the motif that are found in the chromosome for match in re.finditer(motif, str(seq_record.seq)): + start_pos = match.start() +1 end_pos = match.end() + 1 - - line = f"{chrom}\t{start_pos}\t{end_pos}\n" - - f.write(line) + # Writes the position in the .bed file (chro/start/end) + line = f"{chrom}\t{start_pos}\t{end_pos}\n" + f.write(line) if __name__ == "__main__": - main() \ No newline at end of file + main() -- GitLab