From 6efd55d6462be7580288b885f67400de9f7dcdd7 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Thu, 29 Apr 2021 09:06:07 +0200
Subject: [PATCH] kb: add transcript to gene script to Docker

---
 src/.docker_modules/kb/0.26.0/Dockerfile |  6 +++++-
 src/.docker_modules/kb/0.26.0/t2g.py     | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100755 src/.docker_modules/kb/0.26.0/t2g.py

diff --git a/src/.docker_modules/kb/0.26.0/Dockerfile b/src/.docker_modules/kb/0.26.0/Dockerfile
index f0d156b5..1da76dcd 100644
--- a/src/.docker_modules/kb/0.26.0/Dockerfile
+++ b/src/.docker_modules/kb/0.26.0/Dockerfile
@@ -2,6 +2,10 @@ FROM python:3.9-slim
 
 ENV KB_VERSION="0.26.0"
 
-RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION}
+RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION} gffutils==0.10.1
+
+COPY t2g.py /usr/bin/
+
+RUN chmod +x /usr/bin/t2g.py
 
 CMD [ "bash" ]
diff --git a/src/.docker_modules/kb/0.26.0/t2g.py b/src/.docker_modules/kb/0.26.0/t2g.py
new file mode 100755
index 00000000..e40bb063
--- /dev/null
+++ b/src/.docker_modules/kb/0.26.0/t2g.py
@@ -0,0 +1,56 @@
+#!/usr/local/bin/python
+"""Create a transcript-to-gene (t2g) mapping file from a GTF annotation."""
+import argparse
+import os
+
+import gffutils
+
+
+def validate_file(f):
+    """Return ``f`` unchanged if the path exists (argparse ``type`` callback).
+
+    Raises argparse.ArgumentTypeError otherwise, which argparse turns into a
+    rejection message like: "error: argument -g/--gtf: x does not exist".
+    """
+    if not os.path.exists(f):
+        raise argparse.ArgumentTypeError("{0} does not exist".format(f))
+    return f
+
+
+def main():
+    """Build a gffutils database from the GTF and write ``t2g.txt``."""
+    parser = argparse.ArgumentParser(
+        description="create transcript to genes file from a gtf file."
+    )
+    parser.add_argument(
+        "-g", "--gtf", dest="gtf", required=True, type=validate_file,
+        help="gtf file", metavar="FILE",
+    )
+    args = parser.parse_args()
+
+    db = gffutils.create_db(
+        args.gtf,
+        dbfn=":memory:",
+        force=True,
+        merge_strategy="merge",
+        disable_infer_transcripts=False,
+        disable_infer_genes=False,
+    )
+    with open("t2g.txt", "w") as t2g:
+        # Only gene features have transcript children: query them directly
+        # instead of running one children() lookup per feature of the GTF.
+        for gene in db.features_of_type("gene"):
+            for transcript in db.children(
+                gene, featuretype="transcript", order_by="start"
+            ):
+                # Transcript ID first, gene ID second: the column order
+                # expected from a transcript-to-gene map by kb / bustools.
+                t2g.write(
+                    "{0}\t{1}\n".format(
+                        transcript["transcript_id"][0], gene["gene_id"][0]
+                    )
+                )
+
+
+if __name__ == "__main__":
+    main()
-- 
GitLab