made finder runnable

e8997bda · nlecouvr · 0b77a330 · e8997bda · e8997bda · e8997bda
Commit e8997bda authored 3 years ago by nlecouvr
--- a/GATC_finder.py
+++ b/GATC_finder.py
+import re
+
+from Bio import SeqIO
+
+
+def main(genome_file, out_file_path):
+    """Write the position of every GATC site in a FASTA file to a .bed file.
+
+    One tab-separated line (record id, start, end) is written per match,
+    in the order the records appear in the input file.
+
+    Args:
+        genome_file (str): Full path to the FASTA file to scan.
+        out_file_path (str): Full path to the output file; it is opened in
+            "w" mode, so any existing content is overwritten.
+    """
+    motif = "GATC"
+
+    # The context manager guarantees the output is flushed and closed,
+    # even if parsing the FASTA file raises part-way through.
+    with open(out_file_path, "w") as bed_file:
+        # Cycles through the parsed records (chromosomes) of the FASTA file
+        for seq_record in SeqIO.parse(genome_file, "fasta"):
+            chrom = seq_record.id
+            sequence = str(seq_record.seq)
+
+            # Cycles through every occurrence of the motif in this record.
+            # The pattern is matched case-sensitively.
+            for match in re.finditer(motif, sequence):
+                # NOTE(review): both bounds are shifted by one relative to
+                # the 0-based match span. BED is 0-based, half-open
+                # (start=match.start(), end=match.end()) - confirm what the
+                # downstream analysis expects before changing this.
+                start_pos = match.start() + 1
+                end_pos = match.end() + 1
+
+                # Writes the position in the .bed file (chrom/start/end)
+                bed_file.write(f"{chrom}\t{start_pos}\t{end_pos}\n")
+
+if __name__ == "__main__":
+    # main() needs both paths; a bare main() call raises TypeError, so the
+    # two arguments are read from the command line instead.
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Write the positions of all GATC sites of a FASTA file to a .bed file."
+    )
+    parser.add_argument("genome_file", help="full path to the fasta file")
+    parser.add_argument("out_file_path", help="full path to the output file")
+    args = parser.parse_args()
+    main(args.genome_file, args.out_file_path)
--- a/src/GATC_analysis.py
+++ b/src/GATC_analysis.py
@@ -69,9 +69,15 @@ for chrom, regions, name in zip(chromosomes, chrom_regions, id_list):
    if j >= 5:
        j = 0
        i += 1
-        
+    
+    pos = np.arange(1, int(max(regions)), 1 )
+    y = np.full(len(pos), 39)
+    print(len(pos))
+    print(len(y))
+    
    axes[i, j].set_title(name)
    axes[i, j].set_ylabel("site number / bin")
+    axes[i, j].plot(pos, y, color = "black")
    axes[i, j].plot(regions, chrom)
    j += 1
    

--- a/src/GATC_finder.py
+++ b/src/GATC_finder.py
 import re

-import matplotlib.pyplot as plt
-import numpy as np
-import pandas
-from Bio import SeqIO, motifs
-from Bio.Seq import Seq
-from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO


-def main():
+def main(genome_file, out_file_path):
+    """[Gets all the GATC sites from the given genome or sequence and writes their positions to a .bed file]
+
+    Args:
+        genome_file ([string]): [full path to the fasta file]
+        out_file_path ([string]): [full path to the output file]
+    """
+    # Opening the file to write the positions in
+    f = open(out_file_path, "w")
    
-    f = open("/home/nathan/projects/vscode_nextflow/nextflow-nathan/results/GATC/sites.bed", "w")
    motif = "GATC"
-    pos_list = list()
-
-    for seq_record in SeqIO.parse("/home/nathan/projects/vscode_nextflow/nextflow-nathan/data/genome/data_G.fasta", "fasta"):
-        chrom = seq_record.id
+    
+    # Cycles through the parsed chromosomes from the fasta file
+    for seq_record in SeqIO.parse(genome_file, "fasta"):
        
+        # Gets the id of the chromosome in the file
+        chrom = seq_record.id
+
+        # Cycles through all the occurrences of the motif found in the chromosome
        for match in re.finditer(motif, str(seq_record.seq)):
+            
            start_pos = match.start() +1
            end_pos = match.end() + 1
-            
-            line = f"{chrom}\t{start_pos}\t{end_pos}\n"
-            
-            f.write(line)

+            # Writes the position in the .bed file (chrom/start/end)
+            line = f"{chrom}\t{start_pos}\t{end_pos}\n"
+            f.write(line)   

 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    # main() needs both paths; a bare main() call raises TypeError, so the
+    # two arguments are read from the command line instead.
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Write the positions of all GATC sites of a FASTA file to a .bed file."
+    )
+    parser.add_argument("genome_file", help="full path to the fasta file")
+    parser.add_argument("out_file_path", help="full path to the output file")
+    args = parser.parse_args()
+    main(args.genome_file, args.out_file_path)