From 6efd55d6462be7580288b885f67400de9f7dcdd7 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Thu, 29 Apr 2021 09:06:07 +0200
Subject: [PATCH] kb: add transcript to gene script to Docker

---
 src/.docker_modules/kb/0.26.0/Dockerfile |  6 +++++-
 src/.docker_modules/kb/0.26.0/t2g.py     | 23 +++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100755 src/.docker_modules/kb/0.26.0/t2g.py

diff --git a/src/.docker_modules/kb/0.26.0/Dockerfile b/src/.docker_modules/kb/0.26.0/Dockerfile
index f0d156b5..1da76dcd 100644
--- a/src/.docker_modules/kb/0.26.0/Dockerfile
+++ b/src/.docker_modules/kb/0.26.0/Dockerfile
@@ -2,6 +2,10 @@ FROM python:3.9-slim
 
 ENV KB_VERSION="0.26.0"
 
-RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION}
+# apt-get has a stable CLI for scripts (apt does not); drop apt lists and the pip cache to keep the layer small
+RUN apt-get update && apt-get install -y --no-install-recommends procps && rm -rf /var/lib/apt/lists/* && pip3 install --no-cache-dir kb-python==${KB_VERSION} gffutils==0.10.1
+# t2g.py (builds a transcript-to-gene file from a GTF) imports gffutils, installed above; ship it executable
+COPY t2g.py /usr/bin/
+RUN chmod +x /usr/bin/t2g.py
 
 CMD [ "bash" ]
diff --git a/src/.docker_modules/kb/0.26.0/t2g.py b/src/.docker_modules/kb/0.26.0/t2g.py
new file mode 100755
index 00000000..e40bb063
--- /dev/null
+++ b/src/.docker_modules/kb/0.26.0/t2g.py
@@ -0,0 +1,23 @@
+#!/usr/local/bin/python
+import os
+import gffutils
+import argparse
+
+def validate_file(f):
+    """argparse ``type=`` callback: return path *f* unchanged when it names an existing file."""
+    if not os.path.isfile(f):  # isfile rather than exists: a directory cannot be parsed as a GTF
+        # Raising ArgumentTypeError lets argparse reject the argument with this message.
+        raise argparse.ArgumentTypeError("{0} does not exist or is not a file".format(f))
+    return f
+
+if __name__ == "__main__":
+    # Writes ./t2g.txt, one "transcript_id<TAB>gene_id" line per transcript (transcript first, as kb/bustools expect).
+    parser = argparse.ArgumentParser(description="create transcript to genes file from a gtf file.")
+    parser.add_argument("-g", "--gtf", dest="gtf", required=True, type=validate_file,
+                        help="gtf file", metavar="FILE")
+    args = parser.parse_args()
+    db = gffutils.create_db(args.gtf, dbfn=":memory:", force=True, merge_strategy="merge", disable_infer_transcripts=False, disable_infer_genes=False)
+    with open("t2g.txt", "w") as t2g:
+        for gene in db.features_of_type("gene"):  # gene features only, instead of probing every feature for transcripts
+            for transcript in db.children(gene, featuretype="transcript", order_by="start"):
+                t2g.write("{0}\t{1}\n".format(transcript["transcript_id"][0], gene["gene_id"][0]))
-- 
GitLab