Skip to content
Snippets Groups Projects
Verified Commit 5dc7c7f8 authored by Laurent Modolo
Browse files

kb: create a correction step for the t2g.txt messed with by kb

parent c3a4c54e
No related branches found
No related tags found
No related merge requests found
......@@ -5,7 +5,9 @@ ENV KB_VERSION="0.26.0"
RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION}
COPY t2g.py /usr/bin/
COPY fix_t2g.py /usr/bin/
RUN chmod +x /usr/bin/t2g.py
RUN chmod +x /usr/bin/fix_t2g.py
CMD [ "bash" ]
#!/usr/local/bin/python
import os
import re
import argparse
def validate_file(f):
    """Argparse ``type=`` callback that only accepts paths that exist.

    Returns ``f`` unchanged when ``os.path.exists(f)`` is true; otherwise
    raises ``argparse.ArgumentTypeError``.
    """
    if os.path.exists(f):
        return f
    # Argparse turns an ArgumentTypeError into a rejection message such as:
    #   error: argument input: x does not exist
    message = "{0} does not exist".format(f)
    raise argparse.ArgumentTypeError(message)
def t2g_line(transcript, gene):
    """Format a transcript/gene pair as one tab-separated, newline-ended row.

    Both arguments are converted with ``str`` before being joined.
    """
    fields = (str(transcript), str(gene))
    return "\t".join(fields) + "\n"
def build_t2g_re():
    """Compile the pattern used to pull the two ids out of a t2g line.

    The pattern expects two whitespace-separated fields, each made of
    upper-case letters followed by digits, a dot, and a non-blank suffix.
    Group 1 captures the letters+digits part of the first field and group 2
    the same part of the second field; the dotted suffixes are not captured.

    Returns:
        The compiled regular expression object.
    """
    # Raw string: "\.", "\S" and "\s" are regex escapes, not string escapes.
    # In a normal string literal they trigger invalid-escape warnings.
    return re.compile(r"([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+")
def get_t2g(line, t2g_re):
    """Apply ``t2g_re.match`` to ``line``.

    Returns whatever ``match`` returns: a match object when the pattern
    matches at the start of ``line``, otherwise ``None``.
    """
    found = t2g_re.match(line)
    return found
def get_t2g_line(line, t2g_re):
    """Look up the ids on one t2g line.

    Returns a dict whose ``'transcript_id'`` and ``'gene_id'`` entries both
    hold the same result of ``get_t2g(line, t2g_re)``.
    """
    match = get_t2g(line, t2g_re)
    return dict.fromkeys(('transcript_id', 'gene_id'), match)
def write_t2g_line(t2g, line, t2g_re):
    """Write the cleaned transcript/gene row for ``line`` to ``t2g``.

    Args:
        t2g: writable text file object receiving the corrected rows.
        line: one line of the input t2g file.
        t2g_re: compiled pattern passed through to ``get_t2g_line``.

    Lines for which ``get_t2g_line`` yields no match are skipped, so nothing
    is written for them.
    """
    # Parse the line with get_t2g_line. The previous code called the
    # formatter t2g_line here, which returns a string, so the key lookups
    # below raised TypeError.
    results = get_t2g_line(line, t2g_re)
    if results['transcript_id']:
        t2g.write(
            t2g_line(
                # Group 1 is the first captured id, group 2 the second.
                results['transcript_id'].group(1),
                results['gene_id'].group(2)
            )
        )
if __name__ == "__main__":
    # Command-line entry point: read the file given with -f/--t2g and write
    # the rows produced by write_t2g_line to "fix_t2g.txt" in the current
    # working directory.
    parser = argparse.ArgumentParser(
        description="create transcript to genes file from a gtf file."
    )
    parser.add_argument(
        "-f",
        "--t2g",
        dest="t2g",
        required=True,
        type=validate_file,
        help="t2g file",
        metavar="FILE"
    )
    args = parser.parse_args()
    t2g_re = build_t2g_re()
    # Input is opened before output, as in the nested form.
    with open(args.t2g, "r") as gtf, open("fix_t2g.txt", "w") as t2g:
        for line in gtf:
            write_t2g_line(t2g, str(line), t2g_re)
......@@ -14,7 +14,7 @@ def validate_file(f):
def t2g_line(transcript, gene):
    """Format a transcript/gene pair as one tab-separated, newline-ended row.

    Each argument is converted with ``str`` on its own before concatenation,
    so non-string values are accepted.
    """
    return str(transcript) + "\t" + str(gene) + "\n"
def build_gene_re():
......
......@@ -150,11 +150,10 @@ process kb_default {
--h5ad \
${params.count} \
${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
clean_${transcript_to_gene}
fix_t2g.py --t2g ${transcript_to_gene}
cp fix_t2g.txt ${file_prefix}/
cp ${transcript_to_gene} ${file_prefix}/
cp clean_${transcript_to_gene} ${file_prefix}/
"""
}
......@@ -205,11 +204,9 @@ process kb_marseq {
--h5ad \
-x 1,0,6:1,6,14:0,0,0 \
${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
clean_${transcript_to_gene}
fix_t2g.py --t2g ${transcript_to_gene}
cp fix_t2g.txt ${file_prefix}/
cp ${transcript_to_gene} ${file_prefix}/
cp clean_${transcript_to_gene} ${file_prefix}/
"""
else
"""
......@@ -224,11 +221,9 @@ process kb_marseq {
-x 1,0,6:1,6,14:0,0,0 \
--h5ad \
${reads} > ${file_prefix}_kb_mapping_report.txt
awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
clean_${transcript_to_gene}
fix_t2g.py --t2g ${transcript_to_gene}
cp fix_t2g.txt ${file_prefix}/
cp ${transcript_to_gene} ${file_prefix}/
cp clean_${transcript_to_gene} ${file_prefix}/
"""
}
......@@ -357,11 +352,9 @@ process velocity_default {
--h5ad \
${params.count} \
${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
clean_${transcript_to_gene}
fix_t2g.py --t2g ${transcript_to_gene}
cp fix_t2g.txt ${file_prefix}/
cp ${transcript_to_gene} ${file_prefix}/
cp clean_${transcript_to_gene} ${file_prefix}/
cp ${cdna_t2g} ${file_prefix}/
cp ${intron_t2g} ${file_prefix}/
"""
......@@ -417,11 +410,9 @@ process velocity_marseq {
${params.count} \
-x 1,0,6:1,6,14:0,0,0 \
${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
clean_${transcript_to_gene}
fix_t2g.py --t2g ${transcript_to_gene}
cp fix_t2g.txt ${file_prefix}/
cp ${transcript_to_gene} ${file_prefix}/
cp clean_${transcript_to_gene} ${file_prefix}/
cp ${cdna_t2g} ${file_prefix}/
cp ${intron_t2g} ${file_prefix}/
"""
......@@ -440,11 +431,9 @@ process velocity_marseq {
${params.count} \
-x 1,0,6:1,6,14:0,0,0 \
${reads} > ${file_prefix}_kb_mapping_report.txt
awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \
clean_${transcript_to_gene}
fix_t2g.py --t2g ${transcript_to_gene}
cp fix_t2g.txt ${file_prefix}/
cp ${transcript_to_gene} ${file_prefix}/
cp clean_${transcript_to_gene} ${file_prefix}/
cp ${cdna_t2g} ${file_prefix}/
cp ${intron_t2g} ${file_prefix}/
"""
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment