From 5dc7c7f88ffed23e3c71e293c1129fe03876fb43 Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Wed, 21 Jul 2021 17:03:26 +0200
Subject: [PATCH] kb: create a correction step for the t2g.txt messed with by kb

---
Review note: fix_t2g.py below was amended during review (write_t2g_line now
calls get_t2g_line, the regex is a raw string, docstrings were added), so the
commit id and the fix_t2g.py blob id in this mail no longer identify this
exact content.

 src/.docker_modules/kb/0.26.0/Dockerfile |  2 +
 src/.docker_modules/kb/0.26.0/fix_t2g.py | 70 ++++++++++++++++++++++++
 src/.docker_modules/kb/0.26.0/t2g.py     |  2 +-
 src/nf_modules/kb/main.nf                | 37 ++++++---------
 4 files changed, 86 insertions(+), 25 deletions(-)
 create mode 100644 src/.docker_modules/kb/0.26.0/fix_t2g.py

diff --git a/src/.docker_modules/kb/0.26.0/Dockerfile b/src/.docker_modules/kb/0.26.0/Dockerfile
index f8f649a1..37464635 100644
--- a/src/.docker_modules/kb/0.26.0/Dockerfile
+++ b/src/.docker_modules/kb/0.26.0/Dockerfile
@@ -5,7 +5,9 @@ ENV KB_VERSION="0.26.0"
 RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION}
 
 COPY t2g.py /usr/bin/
+COPY fix_t2g.py /usr/bin/
 
 RUN chmod +x /usr/bin/t2g.py
+RUN chmod +x /usr/bin/fix_t2g.py
 
 CMD [ "bash" ]
diff --git a/src/.docker_modules/kb/0.26.0/fix_t2g.py b/src/.docker_modules/kb/0.26.0/fix_t2g.py
new file mode 100644
index 00000000..6535758b
--- /dev/null
+++ b/src/.docker_modules/kb/0.26.0/fix_t2g.py
@@ -0,0 +1,70 @@
+#!/usr/local/bin/python
+"""Rewrite a kb t2g file, keeping only the id prefixes before the first dot.
+
+Reads the file given with --t2g and writes the cleaned transcript/gene pairs
+to ``fix_t2g.txt`` in the current working directory.
+"""
+import os
+import re
+import argparse
+
+
+def validate_file(f):
+    """Argparse ``type`` callback: return *f* if the path exists."""
+    if not os.path.exists(f):
+        # Argparse uses the ArgumentTypeError to give a rejection message like:
+        # error: argument input: x does not exist
+        raise argparse.ArgumentTypeError("{0} does not exist".format(f))
+    return f
+
+
+def t2g_line(transcript, gene):
+    """Format one tab-separated, newline-terminated output line."""
+    return str(transcript) + "\t" + str(gene) + "\n"
+
+
+def build_t2g_re():
+    """Compile the pattern capturing both ids without their dot suffix."""
+    # Raw string: "\." and "\S" are invalid escapes in a normal literal.
+    return re.compile(r"([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+")
+
+
+def get_t2g(line, t2g_re):
+    """Match *line* from its start; return the match object or None."""
+    return t2g_re.match(line)
+
+
+def get_t2g_line(line, t2g_re):
+    """Return the match under both keys (group 1: transcript, 2: gene)."""
+    t2g_id = get_t2g(line, t2g_re)
+    return {'transcript_id': t2g_id, 'gene_id': t2g_id}
+
+
+def write_t2g_line(t2g, line, t2g_re):
+    """Write the cleaned pair for *line* to *t2g*; skip non-matching lines."""
+    # Parse with get_t2g_line(); t2g_line() only formats and returns a str.
+    results = get_t2g_line(line, t2g_re)
+    if results['transcript_id']:
+        t2g.write(
+            t2g_line(
+                results['transcript_id'].group(1),
+                results['gene_id'].group(2)
+            )
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="create transcript to genes file from a gtf file."
+    )
+    parser.add_argument(
+        "-f", "--t2g", dest="t2g", required=True, type=validate_file,
+        help="t2g file", metavar="FILE"
+    )
+    args = parser.parse_args()
+    t2g_re = build_t2g_re()
+
+    with open(args.t2g, "r") as gtf:
+        with open("fix_t2g.txt", "w") as t2g:
+            for line in gtf:
+                write_t2g_line(t2g, str(line), t2g_re)
diff --git a/src/.docker_modules/kb/0.26.0/t2g.py b/src/.docker_modules/kb/0.26.0/t2g.py
index 06332ade..02a4c30d 100755
--- a/src/.docker_modules/kb/0.26.0/t2g.py
+++ b/src/.docker_modules/kb/0.26.0/t2g.py
@@ -14,7 +14,7 @@ def validate_file(f):
 
 
 def t2g_line(transcript, gene):
-    return str(transcript + "\t" + str(gene) + "\n")
+    return str(transcript) + "\t" + str(gene) + "\n"
 
 
 def build_gene_re():
diff --git a/src/nf_modules/kb/main.nf b/src/nf_modules/kb/main.nf
index 73de7b0b..d27e84fb 100644
--- a/src/nf_modules/kb/main.nf
+++ b/src/nf_modules/kb/main.nf
@@ -150,11 +150,10 @@ process kb_default {
     --h5ad \
     ${params.count} \
     ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
-    clean_${transcript_to_gene}
+
+  fix_t2g.py --t2g ${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
   cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
   """
 }
 
@@ -205,11 +204,9 @@ process kb_marseq {
     --h5ad \
     -x 1,0,6:1,6,14:0,0,0 \
     ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
-    clean_${transcript_to_gene}
+  fix_t2g.py --t2g ${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
   cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
   """
   else
   """
@@ -224,11 +221,9 @@ process kb_marseq {
     -x 1,0,6:1,6,14:0,0,0 \
     --h5ad \
     ${reads} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
-    clean_${transcript_to_gene}
+  fix_t2g.py --t2g ${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
   cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
   """
 }
 
@@ -357,11 +352,9 @@ process velocity_default {
     --h5ad \
     ${params.count} \
     ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
-    clean_${transcript_to_gene}
+  fix_t2g.py --t2g ${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
   cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
   cp ${cdna_t2g} ${file_prefix}/
   cp ${intron_t2g} ${file_prefix}/
   """
@@ -417,11 +410,9 @@ process velocity_marseq {
     ${params.count} \
     -x 1,0,6:1,6,14:0,0,0 \
     ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
-    clean_${transcript_to_gene}
+  fix_t2g.py --t2g ${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
   cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
   cp ${cdna_t2g} ${file_prefix}/
   cp ${intron_t2g} ${file_prefix}/
   """
@@ -440,11 +431,9 @@ process velocity_marseq {
     ${params.count} \
     -x 1,0,6:1,6,14:0,0,0 \
     ${reads} > ${file_prefix}_kb_mapping_report.txt
-  awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
-    sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
-    clean_${transcript_to_gene}
+  fix_t2g.py --t2g ${transcript_to_gene}
+  cp fix_t2g.txt ${file_prefix}/
   cp ${transcript_to_gene} ${file_prefix}/
-  cp clean_${transcript_to_gene} ${file_prefix}/
   cp ${cdna_t2g} ${file_prefix}/
   cp ${intron_t2g} ${file_prefix}/
   """
-- 
GitLab