kb: add a correction step for the t2g.txt file mangled by kb

5dc7c7f8 · Laurent Modolo · c3a4c54e · 5dc7c7f8 · 5dc7c7f8 · 5dc7c7f8
Verified Commit 5dc7c7f8 authored 3 years ago by Laurent Modolo
--- a/src/.docker_modules/kb/0.26.0/Dockerfile
+++ b/src/.docker_modules/kb/0.26.0/Dockerfile
@@ -5,7 +5,9 @@ ENV KB_VERSION="0.26.0"
 RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION}
 COPY t2g.py /usr/bin/
+COPY fix_t2g.py /usr/bin/
 RUN chmod +x /usr/bin/t2g.py
+RUN chmod +x /usr/bin/fix_t2g.py
 CMD [ "bash" ]
--- a/src/.docker_modules/kb/0.26.0/fix_t2g.py
+++ b/src/.docker_modules/kb/0.26.0/fix_t2g.py
+#!/usr/local/bin/python
+import os
+import re
+import argparse
+def validate_file(f):
+    if not os.path.exists(f):
+        # Argparse uses the ArgumentTypeError to give a rejection message like:
+        # error: argument input: x does not exist
+        raise argparse.ArgumentTypeError("{0} does not exist".format(f))
+    return f
+def t2g_line(transcript, gene):
+    return str(transcript) + "\t" + str(gene) + "\n"
+def build_t2g_re():
+    return re.compile("([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+")
+def get_t2g(line, t2g_re):
+    return t2g_re.match(line)
+def get_t2g_line(line, t2g_re):
+    t2g_id = get_t2g(line, t2g_re)
+    return {'transcript_id': t2g_id, 'gene_id': t2g_id}
+def write_t2g_line(t2g, line, t2g_re):
+    results = t2g_line(line, t2g_re)
+    if results['transcript_id']:
+        t2g.write(
+            t2g_line(
+                results['transcript_id'].group(1),
+                results['gene_id'].group(2)
+            )
+        )
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="create transcript to genes file from a gtf file."
+    )
+    parser.add_argument(
+        "-f", "--t2g", dest="t2g", required=True, type=validate_file,
+        help="t2g file", metavar="FILE"
+    )
+    args = parser.parse_args()
+    t2g_re = build_t2g_re()
+    with open(args.t2g, "r") as gtf:
+        with open("fix_t2g.txt", "w") as t2g:
+            for line in gtf:
+                write_t2g_line(t2g, str(line), t2g_re)
--- a/src/.docker_modules/kb/0.26.0/t2g.py
+++ b/src/.docker_modules/kb/0.26.0/t2g.py
@@ -14,7 +14,7 @@ def validate_file(f):
 def t2g_line(transcript, gene):
-    return str(transcript + "\t" + str(gene) + "\n")
+    return str(transcript) + "\t" + str(gene) + "\n"
 def build_gene_re():

--- a/src/nf_modules/kb/main.nf
+++ b/src/nf_modules/kb/main.nf
@@ -150,11 +150,10 @@ process kb_default {
    --h5ad \
    ${params.count} \
    ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
+  fix_t2g.py --t2g ${transcript_to_gene}
-    clean_${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
  cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
  """
 }
@@ -205,11 +204,9 @@ process kb_marseq {
    --h5ad \
    -x 1,0,6:1,6,14:0,0,0 \
    ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
+  fix_t2g.py --t2g ${transcript_to_gene}
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
+  cp fix_t2g.txt ${file_prefix}/
-    clean_${transcript_to_gene}
  cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
  """
  else
  """
@@ -224,11 +221,9 @@ process kb_marseq {
    -x 1,0,6:1,6,14:0,0,0 \
    --h5ad \
    ${reads} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
+  fix_t2g.py --t2g ${transcript_to_gene}
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
+  cp fix_t2g.txt ${file_prefix}/
-    clean_${transcript_to_gene}
  cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
  """
 }
@@ -357,11 +352,9 @@ process velocity_default {
    --h5ad \
    ${params.count} \
    ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
+  fix_t2g.py --t2g ${transcript_to_gene}
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
+  cp fix_t2g.txt ${file_prefix}/
-    clean_${transcript_to_gene}
  cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
  cp ${cdna_t2g} ${file_prefix}/
  cp ${intron_t2g} ${file_prefix}/
  """
@@ -417,11 +410,9 @@ process velocity_marseq {
    ${params.count} \
    -x 1,0,6:1,6,14:0,0,0 \
    ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
+  fix_t2g.py --t2g ${transcript_to_gene}
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
+  cp fix_t2g.txt ${file_prefix}/
-    clean_${transcript_to_gene}
  cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
  cp ${cdna_t2g} ${file_prefix}/
  cp ${intron_t2g} ${file_prefix}/
  """
@@ -440,11 +431,9 @@ process velocity_marseq {
    ${params.count} \
    -x 1,0,6:1,6,14:0,0,0 \
    ${reads} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
+  fix_t2g.py --t2g ${transcript_to_gene}
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
+  cp fix_t2g.txt ${file_prefix}/
-    clean_${transcript_to_gene}
  cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
  cp ${cdna_t2g} ${file_prefix}/
  cp ${intron_t2g} ${file_prefix}/
  """