From 57bdbc3181274f2ddd12b77d04daafbac6b181d8 Mon Sep 17 00:00:00 2001 From: Laurent Modolo <laurent.modolo@ens-lyon.fr> Date: Thu, 15 Jul 2021 10:32:06 +0200 Subject: [PATCH] kb: remove gffutils buggy dependency for t2g.txt construction --- src/.docker_modules/kb/0.26.0/Dockerfile | 2 +- src/.docker_modules/kb/0.26.0/t2g.py | 69 +++++++++++++++--------- src/nf_modules/kb/main.nf | 1 + 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/src/.docker_modules/kb/0.26.0/Dockerfile b/src/.docker_modules/kb/0.26.0/Dockerfile index 1da76dcd..f8f649a1 100644 --- a/src/.docker_modules/kb/0.26.0/Dockerfile +++ b/src/.docker_modules/kb/0.26.0/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.9-slim ENV KB_VERSION="0.26.0" -RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION} gffutils==0.10.1 +RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION} COPY t2g.py /usr/bin/ diff --git a/src/.docker_modules/kb/0.26.0/t2g.py b/src/.docker_modules/kb/0.26.0/t2g.py index f9f0b45d..06332ade 100755 --- a/src/.docker_modules/kb/0.26.0/t2g.py +++ b/src/.docker_modules/kb/0.26.0/t2g.py @@ -1,6 +1,7 @@ #!/usr/local/bin/python import os -import gffutils +import re +import gzip import argparse @@ -12,6 +13,43 @@ def validate_file(f): return f +def t2g_line(transcript, gene): + return str(transcript + "\t" + str(gene) + "\n") + + +def build_gene_re(): + return re.compile(".*gene_id\s+\"(\S+)\";.*") + + +def build_transcript_re(): + return re.compile(".*transcript_id\s+\"(\S+)\";.*") + + +def get_gene(line, gene_re): + return gene_re.match(line) + + +def get_transcript(line, transcript_re): + return transcript_re.match(line) + + +def gtf_line(line, transcript_re, gene_re): + transcript_id = get_transcript(line, transcript_re) + gene_id = get_gene(line, gene_re) + return {'transcript_id': transcript_id, 'gene_id': gene_id} + + +def write_t2g_line(t2g, line, transcript_re, gene_re): + results = gtf_line(line, transcript_re, gene_re) + if results['transcript_id']: + t2g.write( + t2g_line( + results['transcript_id'].group(1), + results['gene_id'].group(1) + ) + ) + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="create transcript to genes file from a gtf file." @@ -21,27 +59,10 @@ if __name__ == "__main__": help="gtf file", metavar="FILE" ) args = parser.parse_args() + gene_re = build_gene_re() + transcript_re = build_transcript_re() - db = gffutils.create_db( - args.gtf, - dbfn=":memory:", - force=True, - merge_strategy="merge", - disable_infer_transcripts=False, - disable_infer_genes=False - ) - with open("t2g.txt", "w") as t2g: - for gene in db.all_features(): - for transcript in db.children( - gene, featuretype='transcript', order_by='start' - ): - t2g_line = str(transcript["transcript_id"][0]) + \ - "\t" + \ - str(gene["gene_id"][0]) - t2g_line = t2g_line.split("\t") - t2g.write( - str(t2g_line[0].split(".")[0]) + - "\t" + - str(t2g_line[1].split(".")[0]) + - "\n" - ) + with gzip.open(args.gtf, "rb") as gtf: + with open("t2g_dup.txt", "w") as t2g: + for line in gtf: + write_t2g_line(t2g, str(line), transcript_re, gene_re) \ No newline at end of file diff --git a/src/nf_modules/kb/main.nf b/src/nf_modules/kb/main.nf index a6cbfa75..2df29caa 100644 --- a/src/nf_modules/kb/main.nf +++ b/src/nf_modules/kb/main.nf @@ -37,6 +37,7 @@ process tr2g { script: """ t2g.py --gtf ${gtf} + uniq t2g_dup.txt > t2g.txt """ } -- GitLab