Skip to content
Snippets Groups Projects
t2g.py 2.14 KiB
#!/usr/local/bin/python

# SPDX-FileCopyrightText: 2022 Laurent Modolo <laurent.modolo@ens-lyon.fr>
#
# SPDX-License-Identifier: AGPL-3.0-or-later

import os
import re
import gzip
import argparse


def validate_file(f):
    """Argparse ``type`` callback that only accepts paths which exist.

    Args:
        f: Path given on the command line.

    Returns:
        ``f`` unchanged when ``os.path.exists(f)`` is true.

    Raises:
        argparse.ArgumentTypeError: When ``f`` does not exist.
    """
    if os.path.exists(f):
        return f
    # argparse turns an ArgumentTypeError into a rejection message such as:
    #   error: argument input: x does not exist
    raise argparse.ArgumentTypeError("{0} does not exist".format(f))


def t2g_line(transcript, gene):
    """Format one transcript-to-gene record.

    Both arguments are converted with ``str`` and joined by a tab; the
    returned line ends with a newline.
    """
    columns = (str(transcript), str(gene))
    return "\t".join(columns) + "\n"


def build_gene_re():
    """Compile the regex that captures the value of a ``gene_id`` attribute.

    Returns:
        A compiled pattern whose group 1 is the non-whitespace text found
        between the double quotes following ``gene_id``.
    """
    # Raw string: "\s" and "\S" are regex escapes, not valid string escapes,
    # and trigger a warning when written in a plain string literal.
    return re.compile(r'.*gene_id\s+"(\S+)";.*')


def build_transcript_re():
    """Compile the regex that captures the value of a ``transcript_id`` attribute.

    Returns:
        A compiled pattern whose group 1 is the non-whitespace text found
        between the double quotes following ``transcript_id``.
    """
    # Raw string: "\s" and "\S" are regex escapes, not valid string escapes,
    # and trigger a warning when written in a plain string literal.
    return re.compile(r'.*transcript_id\s+"(\S+)";.*')


def get_gene(line, gene_re):
    """Apply ``gene_re.match`` to ``line`` and return the result.

    The result is whatever ``match`` returns: a match object, or ``None``
    when the pattern does not match at the start of ``line``.
    """
    gene_match = gene_re.match(line)
    return gene_match


def get_transcript(line, transcript_re):
    """Apply ``transcript_re.match`` to ``line`` and return the result.

    The result is whatever ``match`` returns: a match object, or ``None``
    when the pattern does not match at the start of ``line``.
    """
    transcript_match = transcript_re.match(line)
    return transcript_match


def gtf_line(line, transcript_re, gene_re):
    """Run both attribute patterns against one line.

    Returns:
        A dict with keys ``'transcript_id'`` and ``'gene_id'`` holding the
        results of ``get_transcript`` and ``get_gene`` respectively (each a
        match object or ``None``).
    """
    return dict(
        transcript_id=get_transcript(line, transcript_re),
        gene_id=get_gene(line, gene_re),
    )


def write_t2g_line(t2g, line, transcript_re, gene_re):
    """Write the transcript/gene pair found in ``line`` to ``t2g``.

    Args:
        t2g: Writable text file object receiving the formatted record.
        line: One line of the GTF input.
        transcript_re: Compiled pattern capturing the transcript id in group 1.
        gene_re: Compiled pattern capturing the gene id in group 1.

    Nothing is written unless both patterns match the line.
    """
    results = gtf_line(line, transcript_re, gene_re)
    transcript_match = results['transcript_id']
    gene_match = results['gene_id']
    # A line with a transcript_id but no parsable gene_id used to crash with
    # AttributeError on None; such lines cannot yield a pair, so skip them.
    if transcript_match is None or gene_match is None:
        return
    t2g.write(t2g_line(transcript_match.group(1), gene_match.group(1)))


def open_gtf(path):
    """Open a GTF file for text reading, decompressing gzip transparently.

    The first two bytes of the file are compared with the gzip magic number
    (``1f 8b``) to choose between ``gzip.open`` and the builtin ``open``.
    Both branches return a file object that yields ``str`` lines.

    Args:
        path: Path to a plain or gzip-compressed GTF file.

    Returns:
        A text-mode file object usable as a context manager.
    """
    with open(path, "rb") as probe:
        is_gzip = probe.read(2) == b"\x1f\x8b"
    if is_gzip:
        # "rt" decodes to str so the regexes see the real text rather than
        # the repr of a bytes object.
        return gzip.open(path, "rt")
    return open(path, "r")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="create transcript to genes file from a gtf file."
    )
    parser.add_argument(
        "-g", "--gtf", dest="gtf", required=True, type=validate_file,
        help="gtf file", metavar="FILE"
    )
    args = parser.parse_args()
    gene_re = build_gene_re()
    transcript_re = build_transcript_re()

    # Single read/write loop for both plain and gzip input; the output file
    # name is fixed and written to the current working directory.
    with open_gtf(args.gtf) as gtf, open("t2g_dup.txt", "w") as t2g:
        for line in gtf:
            write_t2g_line(t2g, line, transcript_re, gene_re)