"""Locate GATC motif sites in a FASTA file and write them to a tab-separated file."""
import argparse
import re


def find_motif_sites(sequence, motif="GATC"):
    """Yield the reported (start, end) position of every motif hit in a sequence.

    Args:
        sequence (str): Sequence text to scan. Matching is case-sensitive.
        motif (str): Pattern handed to ``re.finditer``; it is interpreted as a
            regular expression, not as a plain literal.

    Yields:
        tuple[int, int]: ``(match.start() + 1, match.end() + 1)`` for each
        non-overlapping match, in left-to-right order.
    """
    # Compile once so the pattern object is reused for the whole scan.
    pattern = re.compile(motif)
    for match in pattern.finditer(sequence):
        # NOTE(review): both bounds are shifted by +1 relative to Python's
        # 0-based, end-exclusive match span. The BED convention is presumably
        # 0-based start / exclusive end -- confirm what downstream tools
        # expect before changing these offsets.
        yield match.start() + 1, match.end() + 1


def main(genome_file, out_file_path):
    """Write the position of every GATC site in a FASTA file to an output file.

    One line is written per site, formatted as ``<record id>\\t<start>\\t<end>``.

    Args:
        genome_file (str): Full path to the FASTA file to scan.
        out_file_path (str): Full path to the output file; it is opened in
            write mode, so existing content is overwritten.
    """
    # Imported here so that find_motif_sites stays importable on its own,
    # without Biopython being installed.
    from Bio import SeqIO

    # The context manager guarantees the output file is flushed and closed,
    # even if parsing raises part-way through.
    with open(out_file_path, "w") as out_file:
        # Each parsed record is one chromosome / sequence of the FASTA file.
        for seq_record in SeqIO.parse(genome_file, "fasta"):
            chrom = seq_record.id
            out_file.writelines(
                f"{chrom}\t{start_pos}\t{end_pos}\n"
                for start_pos, end_pos in find_motif_sites(str(seq_record.seq))
            )


if __name__ == "__main__":
    # main() requires both paths, so read them from the command line.
    parser = argparse.ArgumentParser(
        description="Write the positions of all GATC sites of a FASTA file."
    )
    parser.add_argument("genome_file", help="full path to the fasta file")
    parser.add_argument("out_file_path", help="full path to the output file")
    cli_args = parser.parse_args()
    main(cli_args.genome_file, cli_args.out_file_path)