From 5dc7c7f88ffed23e3c71e293c1129fe03876fb43 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Wed, 21 Jul 2021 17:03:26 +0200
Subject: [PATCH] kb: create a correction step for the t2g.txt messed with by
 kb

---
 src/.docker_modules/kb/0.26.0/Dockerfile |  2 +
 src/.docker_modules/kb/0.26.0/fix_t2g.py | 57 ++++++++++++++++++++++++
 src/.docker_modules/kb/0.26.0/t2g.py     |  2 +-
 src/nf_modules/kb/main.nf                | 37 ++++++---------
 4 files changed, 73 insertions(+), 25 deletions(-)
 create mode 100644 src/.docker_modules/kb/0.26.0/fix_t2g.py

diff --git a/src/.docker_modules/kb/0.26.0/Dockerfile b/src/.docker_modules/kb/0.26.0/Dockerfile
index f8f649a1..37464635 100644
--- a/src/.docker_modules/kb/0.26.0/Dockerfile
+++ b/src/.docker_modules/kb/0.26.0/Dockerfile
@@ -5,7 +5,9 @@ ENV KB_VERSION="0.26.0"
 RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION}
 
 COPY t2g.py /usr/bin/
+COPY fix_t2g.py /usr/bin/
 
 RUN chmod +x /usr/bin/t2g.py
+RUN chmod +x /usr/bin/fix_t2g.py
 
 CMD [ "bash" ]
diff --git a/src/.docker_modules/kb/0.26.0/fix_t2g.py b/src/.docker_modules/kb/0.26.0/fix_t2g.py
new file mode 100644
index 00000000..6535758b
--- /dev/null
+++ b/src/.docker_modules/kb/0.26.0/fix_t2g.py
@@ -0,0 +1,57 @@
+#!/usr/local/bin/python
+import os
+import re
+import argparse
+
+
+def validate_file(f):  # argparse type= check: returns f when the path exists
+    if not os.path.exists(f):
+        # Argparse uses the ArgumentTypeError to give a rejection message like:
+        # error: argument input: x does not exist
+        raise argparse.ArgumentTypeError("{0} does not exist".format(f))
+    return f
+
+
+def t2g_line(transcript, gene):  # one "transcript<TAB>gene<NEWLINE>" row
+    return str(transcript) + "\t" + str(gene) + "\n"
+
+
+def build_t2g_re():  # captures the two ids, dropping each ".<suffix>"
+    return re.compile(r"([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+")
+
+
+def get_t2g(line, t2g_re):  # match anchored at the start of line, else None
+    return t2g_re.match(line)
+
+
+def get_t2g_line(line, t2g_re):  # both keys hold the same match (or None)
+    t2g_id = get_t2g(line, t2g_re)
+    return {'transcript_id': t2g_id, 'gene_id': t2g_id}
+
+
+def write_t2g_line(t2g, line, t2g_re):  # writes a row only when line matches
+    results = get_t2g_line(line, t2g_re)  # dict of match objects, or None
+    if results['transcript_id']:
+        t2g.write(
+            t2g_line(
+                results['transcript_id'].group(1),
+                results['gene_id'].group(2)
+            )
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="strip version suffixes from the ids of a kb t2g file."
+    )
+    parser.add_argument(
+        "-f", "--t2g", dest="t2g", required=True, type=validate_file,
+        help="t2g file", metavar="FILE"
+    )
+    args = parser.parse_args()
+    t2g_re = build_t2g_re()
+
+    with open(args.t2g, "r") as raw_t2g:  # input is the t2g file, not a gtf
+        with open("fix_t2g.txt", "w") as t2g:  # name that main.nf copies
+            for line in raw_t2g:
+                write_t2g_line(t2g, str(line), t2g_re)
diff --git a/src/.docker_modules/kb/0.26.0/t2g.py b/src/.docker_modules/kb/0.26.0/t2g.py
index 06332ade..02a4c30d 100755
--- a/src/.docker_modules/kb/0.26.0/t2g.py
+++ b/src/.docker_modules/kb/0.26.0/t2g.py
@@ -14,7 +14,7 @@ def validate_file(f):
 
 
 def t2g_line(transcript, gene):
-    return str(transcript + "\t" + str(gene) + "\n")
+    return str(transcript) + "\t" + str(gene) + "\n"
 
 
 def build_gene_re():
diff --git a/src/nf_modules/kb/main.nf b/src/nf_modules/kb/main.nf
index 73de7b0b..d27e84fb 100644
--- a/src/nf_modules/kb/main.nf
+++ b/src/nf_modules/kb/main.nf
@@ -150,11 +150,10 @@ process kb_default {
     --h5ad \
     ${params.count} \
     ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
-    clean_${transcript_to_gene}
+  
+  fix_t2g.py --t2g ${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
   cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
   """
 }
 
@@ -205,11 +204,9 @@ process kb_marseq {
     --h5ad \
     -x 1,0,6:1,6,14:0,0,0 \
     ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
-    clean_${transcript_to_gene}
+  fix_t2g.py --t2g ${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
   cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
   """
   else
   """
@@ -224,11 +221,9 @@ process kb_marseq {
     -x 1,0,6:1,6,14:0,0,0 \
     --h5ad \
     ${reads} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
-    clean_${transcript_to_gene}
+  fix_t2g.py --t2g ${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
   cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
   """
 }
 
@@ -357,11 +352,9 @@ process velocity_default {
     --h5ad \
     ${params.count} \
     ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
-    clean_${transcript_to_gene}
+  fix_t2g.py --t2g ${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
   cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
   cp ${cdna_t2g} ${file_prefix}/
   cp ${intron_t2g} ${file_prefix}/
   """
@@ -417,11 +410,9 @@ process velocity_marseq {
     ${params.count} \
     -x 1,0,6:1,6,14:0,0,0 \
     ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
-    clean_${transcript_to_gene}
+  fix_t2g.py --t2g ${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
   cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
   cp ${cdna_t2g} ${file_prefix}/
   cp ${intron_t2g} ${file_prefix}/
   """
@@ -440,11 +431,9 @@ process velocity_marseq {
     ${params.count} \
     -x 1,0,6:1,6,14:0,0,0 \
     ${reads} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
-    clean_${transcript_to_gene}
+  fix_t2g.py --t2g ${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
   cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
   cp ${cdna_t2g} ${file_prefix}/
   cp ${intron_t2g} ${file_prefix}/
   """
-- 
GitLab