#!/usr/local/bin/python
"""Strip version suffixes from the identifiers of a transcript-to-gene file.

Each input line is expected to start with two dotted identifiers separated by
one whitespace character (``<ID>.<suffix> <ID>.<suffix>``).  For every line
that matches, ``<ID>\t<ID>`` is written to ``fix_t2g.txt`` in the current
working directory.  Lines that do not match are skipped.
"""
import os
import re
import argparse


def validate_file(f):
    """Return *f* unchanged if the path exists.

    Used as an argparse ``type=`` callback.

    Raises:
        argparse.ArgumentTypeError: if *f* does not exist.
    """
    if not os.path.exists(f):
        # Argparse uses the ArgumentTypeError to give a rejection message like:
        # error: argument input: x does not exist
        raise argparse.ArgumentTypeError("{0} does not exist".format(f))
    return f


def t2g_line(transcript, gene):
    """Format one output record: ``transcript<TAB>gene<NEWLINE>``."""
    return str(transcript) + "\t" + str(gene) + "\n"


def build_t2g_re():
    """Compile the pattern that recognises a versioned transcript/gene pair.

    Group 1 and group 2 capture the two identifiers (uppercase letters
    followed by digits) without the ``.suffix`` that follows each of them.
    """
    # Raw string: "\.", "\S" and "\s" are regex escapes, not string escapes.
    return re.compile(r"([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+")


def get_t2g(line, t2g_re):
    """Match *line* against *t2g_re* from its start; return the match or None."""
    return t2g_re.match(line)


def get_t2g_line(line, t2g_re):
    """Return the match for *line* under both the transcript and gene keys.

    Both values are the same match object (or ``None`` when the line does
    not match); the transcript id is its group 1 and the gene id its group 2.
    """
    t2g_id = get_t2g(line, t2g_re)
    return {'transcript_id': t2g_id, 'gene_id': t2g_id}


def write_t2g_line(t2g, line, t2g_re):
    """Write the cleaned record for *line* to the open file *t2g*.

    Nothing is written when *line* does not match *t2g_re*.
    """
    # Parse the line with get_t2g_line(); calling the formatter t2g_line()
    # here would return a plain string and break the key lookups below.
    results = get_t2g_line(line, t2g_re)
    if results['transcript_id']:
        t2g.write(
            t2g_line(
                results['transcript_id'].group(1),
                results['gene_id'].group(2)
            )
        )


def main():
    """Parse the command line and write ``fix_t2g.txt``."""
    parser = argparse.ArgumentParser(
        description="create transcript to genes file from a gtf file."
    )
    parser.add_argument(
        "-f", "--t2g", dest="t2g", required=True, type=validate_file,
        help="t2g file", metavar="FILE"
    )
    args = parser.parse_args()
    t2g_re = build_t2g_re()

    # The output name is fixed; the calling pipeline copies "fix_t2g.txt".
    with open(args.t2g, "r") as gtf, open("fix_t2g.txt", "w") as t2g:
        for line in gtf:
            write_t2g_line(t2g, line, t2g_re)


if __name__ == "__main__":
    main()
1,0,6:1,6,14:0,0,0 \ --h5ad \ ${reads} > ${file_prefix}_kb_mapping_report.txt - awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \ - sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \ - clean_${transcript_to_gene} + fix_t2g.py --t2g ${transcript_to_gene} + cp fix_t2g.txt ${file_prefix}/ cp ${transcript_to_gene} ${file_prefix}/ - cp clean_${transcript_to_gene} ${file_prefix}/ """ } @@ -357,11 +352,9 @@ process velocity_default { --h5ad \ ${params.count} \ ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt - awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \ - sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \ - clean_${transcript_to_gene} + fix_t2g.py --t2g ${transcript_to_gene} + cp fix_t2g.txt ${file_prefix}/ cp ${transcript_to_gene} ${file_prefix}/ - cp clean_${transcript_to_gene} ${file_prefix}/ cp ${cdna_t2g} ${file_prefix}/ cp ${intron_t2g} ${file_prefix}/ """ @@ -417,11 +410,9 @@ process velocity_marseq { ${params.count} \ -x 1,0,6:1,6,14:0,0,0 \ ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt - awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \ - sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \ - clean_${transcript_to_gene} + fix_t2g.py --t2g ${transcript_to_gene} + cp fix_t2g.txt ${file_prefix}/ cp ${transcript_to_gene} ${file_prefix}/ - cp clean_${transcript_to_gene} ${file_prefix}/ cp ${cdna_t2g} ${file_prefix}/ cp ${intron_t2g} ${file_prefix}/ """ @@ -440,11 +431,9 @@ process velocity_marseq { ${params.count} \ -x 1,0,6:1,6,14:0,0,0 \ ${reads} > ${file_prefix}_kb_mapping_report.txt - awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \ - sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \ - clean_${transcript_to_gene} + fix_t2g.py --t2g ${transcript_to_gene} + cp fix_t2g.txt ${file_prefix}/ cp ${transcript_to_gene} ${file_prefix}/ - cp clean_${transcript_to_gene} ${file_prefix}/ cp ${cdna_t2g} ${file_prefix}/ cp ${intron_t2g} 
${file_prefix}/ """