-
Laurent Modolo authoredLaurent Modolo authored
t2g.py 2.14 KiB
#!/usr/local/bin/python
# SPDX-FileCopyrightText: 2022 Laurent Modolo <laurent.modolo@ens-lyon.fr>
#
# SPDX-License-Identifier: AGPL-3.0-or-later
import os
import re
import gzip
import argparse
def validate_file(f):
if not os.path.exists(f):
# Argparse uses the ArgumentTypeError to give a rejection message like:
# error: argument input: x does not exist
raise argparse.ArgumentTypeError("{0} does not exist".format(f))
return f
def t2g_line(transcript, gene):
return str(transcript) + "\t" + str(gene) + "\n"
def build_gene_re():
return re.compile(".*gene_id\s+\"(\S+)\";.*")
def build_transcript_re():
return re.compile(".*transcript_id\s+\"(\S+)\";.*")
def get_gene(line, gene_re):
return gene_re.match(line)
def get_transcript(line, transcript_re):
return transcript_re.match(line)
def gtf_line(line, transcript_re, gene_re):
transcript_id = get_transcript(line, transcript_re)
gene_id = get_gene(line, gene_re)
return {'transcript_id': transcript_id, 'gene_id': gene_id}
def write_t2g_line(t2g, line, transcript_re, gene_re):
results = gtf_line(line, transcript_re, gene_re)
if results['transcript_id']:
t2g.write(
t2g_line(
results['transcript_id'].group(1),
results['gene_id'].group(1)
)
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="create transcript to genes file from a gtf file."
)
parser.add_argument(
"-g", "--gtf", dest="gtf", required=True, type=validate_file,
help="gtf file", metavar="FILE"
)
args = parser.parse_args()
gene_re = build_gene_re()
transcript_re = build_transcript_re()
try:
with gzip.open(args.gtf, "rb") as gtf:
with open("t2g_dup.txt", "w") as t2g:
for line in gtf:
write_t2g_line(t2g, str(line), transcript_re, gene_re)
except gzip.BadGzipFile:
with open(args.gtf, "r") as gtf:
with open("t2g_dup.txt", "w") as t2g:
for line in gtf:
write_t2g_line(t2g, str(line), transcript_re, gene_re)