From 57bdbc3181274f2ddd12b77d04daafbac6b181d8 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Thu, 15 Jul 2021 10:32:06 +0200
Subject: [PATCH] kb: remove gffutils buggy dependency for t2g.txt construction

---
 src/.docker_modules/kb/0.26.0/Dockerfile |  2 +-
 src/.docker_modules/kb/0.26.0/t2g.py     | 69 +++++++++++++++---------
 src/nf_modules/kb/main.nf                |  1 +
 3 files changed, 47 insertions(+), 25 deletions(-)

diff --git a/src/.docker_modules/kb/0.26.0/Dockerfile b/src/.docker_modules/kb/0.26.0/Dockerfile
index 1da76dcd..f8f649a1 100644
--- a/src/.docker_modules/kb/0.26.0/Dockerfile
+++ b/src/.docker_modules/kb/0.26.0/Dockerfile
@@ -2,7 +2,7 @@ FROM python:3.9-slim
 
 ENV KB_VERSION="0.26.0"
 
-RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION} gffutils==0.10.1
+RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION}
 
 COPY t2g.py /usr/bin/
 
diff --git a/src/.docker_modules/kb/0.26.0/t2g.py b/src/.docker_modules/kb/0.26.0/t2g.py
index f9f0b45d..06332ade 100755
--- a/src/.docker_modules/kb/0.26.0/t2g.py
+++ b/src/.docker_modules/kb/0.26.0/t2g.py
@@ -1,6 +1,7 @@
 #!/usr/local/bin/python
 import os
-import gffutils
+import re
+import gzip
 import argparse
 
 
@@ -12,6 +13,43 @@ def validate_file(f):
     return f
 
 
+def t2g_line(transcript, gene):
+    return str(transcript + "\t" + str(gene) + "\n")
+
+
+def build_gene_re():
+    return re.compile(".*gene_id\s+\"(\S+)\";.*")
+
+
+def build_transcript_re():
+    return re.compile(".*transcript_id\s+\"(\S+)\";.*")
+
+
+def get_gene(line, gene_re):
+    return gene_re.match(line)
+
+
+def get_transcript(line, transcript_re):
+    return transcript_re.match(line)
+
+
+def gtf_line(line, transcript_re, gene_re):
+    transcript_id = get_transcript(line, transcript_re)
+    gene_id = get_gene(line, gene_re)
+    return {'transcript_id': transcript_id, 'gene_id': gene_id}
+
+
+def write_t2g_line(t2g, line, transcript_re, gene_re):
+    results = gtf_line(line, transcript_re, gene_re)
+    if results['transcript_id']:
+        t2g.write(
+            t2g_line(
+                results['transcript_id'].group(1),
+                results['gene_id'].group(1)
+            )
+        )
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="create transcript to genes file from a gtf file."
@@ -21,27 +59,10 @@ if __name__ == "__main__":
         help="gtf file", metavar="FILE"
     )
     args = parser.parse_args()
+    gene_re = build_gene_re()
+    transcript_re = build_transcript_re()
 
-    db = gffutils.create_db(
-        args.gtf,
-        dbfn=":memory:",
-        force=True,
-        merge_strategy="merge",
-        disable_infer_transcripts=False,
-        disable_infer_genes=False
-    )
-    with open("t2g.txt", "w") as t2g:
-        for gene in db.all_features():
-            for transcript in db.children(
-              gene, featuretype='transcript', order_by='start'
-            ):
-                t2g_line = str(transcript["transcript_id"][0]) + \
-                    "\t" + \
-                    str(gene["gene_id"][0])
-                t2g_line = t2g_line.split("\t")
-                t2g.write(
-                    str(t2g_line[0].split(".")[0]) +
-                    "\t" +
-                    str(t2g_line[1].split(".")[0]) +
-                    "\n"
-                )
+    with gzip.open(args.gtf, "rb") as gtf:
+        with open("t2g_dup.txt", "w") as t2g:
+            for line in gtf:
+                write_t2g_line(t2g, str(line), transcript_re, gene_re)
\ No newline at end of file
diff --git a/src/nf_modules/kb/main.nf b/src/nf_modules/kb/main.nf
index a6cbfa75..2df29caa 100644
--- a/src/nf_modules/kb/main.nf
+++ b/src/nf_modules/kb/main.nf
@@ -37,6 +37,7 @@ process tr2g {
   script:
   """
   t2g.py --gtf ${gtf}
+  uniq t2g_dup.txt > t2g.txt
   """
 }
 
-- 
GitLab