Skip to content
Snippets Groups Projects
Verified Commit 5dc7c7f8 authored by Laurent Modolo's avatar Laurent Modolo
Browse files

kb: add a correction step for the t2g.txt file altered by kb

parent c3a4c54e
No related branches found
No related tags found
No related merge requests found
...@@ -5,7 +5,9 @@ ENV KB_VERSION="0.26.0" ...@@ -5,7 +5,9 @@ ENV KB_VERSION="0.26.0"
RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION} RUN apt update && apt install -y procps && pip3 install kb-python==${KB_VERSION}
COPY t2g.py /usr/bin/ COPY t2g.py /usr/bin/
COPY fix_t2g.py /usr/bin/
RUN chmod +x /usr/bin/t2g.py RUN chmod +x /usr/bin/t2g.py
RUN chmod +x /usr/bin/fix_t2g.py
CMD [ "bash" ] CMD [ "bash" ]
#!/usr/local/bin/python
import os
import re
import argparse
def validate_file(f):
    """Argparse ``type=`` callback that accepts only existing paths.

    Args:
        f: Path string received from the command line.

    Returns:
        ``f`` unchanged when ``os.path.exists(f)`` is true.

    Raises:
        argparse.ArgumentTypeError: If ``f`` does not exist.
    """
    if os.path.exists(f):
        return f
    # argparse reports an ArgumentTypeError as a usage error such as:
    #   error: argument input: x does not exist
    message = "{0} does not exist".format(f)
    raise argparse.ArgumentTypeError(message)
def t2g_line(transcript, gene):
    """Format one output row: transcript, a tab, gene, then a newline.

    Both arguments are converted with ``str`` before being joined.
    """
    fields = (str(transcript), str(gene))
    return "\t".join(fields) + "\n"
def build_t2g_re():
    """Compile the pattern used to parse one line of the t2g file.

    The pattern expects, from the start of the line, two identifiers
    separated by a single whitespace character. Each identifier is made of
    uppercase letters followed by digits, then a dot and a non-whitespace
    suffix. Group 1 captures the first identifier without its dotted
    suffix, group 2 captures the second one.

    Returns:
        The compiled ``re.Pattern``.
    """
    # Raw string: the regex escapes (dot, non-space, space classes) are not
    # valid *string* escapes, so a plain literal triggers an "invalid escape
    # sequence" warning. The compiled pattern is unchanged.
    return re.compile(r"([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+")
def get_t2g(line, t2g_re):
    """Apply ``t2g_re.match`` to ``line`` and return its result.

    Args:
        line: Text to test.
        t2g_re: Object exposing a ``match`` method (a compiled pattern).

    Returns:
        Whatever ``t2g_re.match(line)`` returns: a match object, or
        ``None`` when a compiled pattern does not match at the start.
    """
    found = t2g_re.match(line)
    return found
def get_t2g_line(line, t2g_re):
    """Parse ``line`` and return the result under two dictionary keys.

    Args:
        line: One line of the t2g file.
        t2g_re: Compiled pattern passed through to ``get_t2g``.

    Returns:
        A dict with keys ``'transcript_id'`` and ``'gene_id'``. Both hold
        the same object returned by ``get_t2g`` (``None`` when the line
        does not match).
    """
    match = get_t2g(line, t2g_re)
    # A single match carries both capture groups, so it is stored twice.
    return dict(transcript_id=match, gene_id=match)
def write_t2g_line(t2g, line, t2g_re):
    """Write the cleaned transcript/gene pair found in ``line`` to ``t2g``.

    Lines that do not match ``t2g_re`` are skipped and nothing is written.

    Args:
        t2g: Writable text file object receiving the corrected rows.
        line: One line read from the input t2g file.
        t2g_re: Compiled pattern whose group 1 is the transcript id and
            group 2 the gene id.
    """
    # Parse with get_t2g_line (which returns the dict indexed below), not
    # with the formatter t2g_line: the formatter returns a str, and
    # indexing a str by key raises TypeError.
    results = get_t2g_line(line, t2g_re)
    if results['transcript_id']:
        t2g.write(
            t2g_line(
                results['transcript_id'].group(1),
                results['gene_id'].group(2)
            )
        )
if __name__ == "__main__":
    # Command-line entry point: read the t2g file given with -f/--t2g and
    # write the corrected rows to "fix_t2g.txt" in the current directory.
    parser = argparse.ArgumentParser(
        # The input is a t2g file, not a gtf file.
        description=(
            "create a corrected transcript to genes file from a t2g file."
        )
    )
    parser.add_argument(
        "-f", "--t2g", dest="t2g", required=True, type=validate_file,
        help="t2g file", metavar="FILE"
    )
    args = parser.parse_args()
    # Compile the pattern once, outside the per-line loop.
    t2g_re = build_t2g_re()
    with open(args.t2g, "r") as raw_t2g, open("fix_t2g.txt", "w") as t2g:
        for line in raw_t2g:
            # Lines read from a text-mode file are already str.
            write_t2g_line(t2g, line, t2g_re)
...@@ -14,7 +14,7 @@ def validate_file(f): ...@@ -14,7 +14,7 @@ def validate_file(f):
def t2g_line(transcript, gene): def t2g_line(transcript, gene):
return str(transcript + "\t" + str(gene) + "\n") return str(transcript) + "\t" + str(gene) + "\n"
def build_gene_re(): def build_gene_re():
......
...@@ -150,11 +150,10 @@ process kb_default { ...@@ -150,11 +150,10 @@ process kb_default {
--h5ad \ --h5ad \
${params.count} \ ${params.count} \
${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \
sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \ fix_t2g.py --t2g ${transcript_to_gene}
clean_${transcript_to_gene} cp fix_t2g.txt ${file_prefix}/
cp ${transcript_to_gene} ${file_prefix}/ cp ${transcript_to_gene} ${file_prefix}/
cp clean_${transcript_to_gene} ${file_prefix}/
""" """
} }
...@@ -205,11 +204,9 @@ process kb_marseq { ...@@ -205,11 +204,9 @@ process kb_marseq {
--h5ad \ --h5ad \
-x 1,0,6:1,6,14:0,0,0 \ -x 1,0,6:1,6,14:0,0,0 \
${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \ fix_t2g.py --t2g ${transcript_to_gene}
sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \ cp fix_t2g.txt ${file_prefix}/
clean_${transcript_to_gene}
cp ${transcript_to_gene} ${file_prefix}/ cp ${transcript_to_gene} ${file_prefix}/
cp clean_${transcript_to_gene} ${file_prefix}/
""" """
else else
""" """
...@@ -224,11 +221,9 @@ process kb_marseq { ...@@ -224,11 +221,9 @@ process kb_marseq {
-x 1,0,6:1,6,14:0,0,0 \ -x 1,0,6:1,6,14:0,0,0 \
--h5ad \ --h5ad \
${reads} > ${file_prefix}_kb_mapping_report.txt ${reads} > ${file_prefix}_kb_mapping_report.txt
awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \ fix_t2g.py --t2g ${transcript_to_gene}
sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \ cp fix_t2g.txt ${file_prefix}/
clean_${transcript_to_gene}
cp ${transcript_to_gene} ${file_prefix}/ cp ${transcript_to_gene} ${file_prefix}/
cp clean_${transcript_to_gene} ${file_prefix}/
""" """
} }
...@@ -357,11 +352,9 @@ process velocity_default { ...@@ -357,11 +352,9 @@ process velocity_default {
--h5ad \ --h5ad \
${params.count} \ ${params.count} \
${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \ fix_t2g.py --t2g ${transcript_to_gene}
sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \ cp fix_t2g.txt ${file_prefix}/
clean_${transcript_to_gene}
cp ${transcript_to_gene} ${file_prefix}/ cp ${transcript_to_gene} ${file_prefix}/
cp clean_${transcript_to_gene} ${file_prefix}/
cp ${cdna_t2g} ${file_prefix}/ cp ${cdna_t2g} ${file_prefix}/
cp ${intron_t2g} ${file_prefix}/ cp ${intron_t2g} ${file_prefix}/
""" """
...@@ -417,11 +410,9 @@ process velocity_marseq { ...@@ -417,11 +410,9 @@ process velocity_marseq {
${params.count} \ ${params.count} \
-x 1,0,6:1,6,14:0,0,0 \ -x 1,0,6:1,6,14:0,0,0 \
${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt ${reads[0]} ${reads[1]} > ${file_prefix}_kb_mapping_report.txt
awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \ fix_t2g.py --t2g ${transcript_to_gene}
sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \ cp fix_t2g.txt ${file_prefix}/
clean_${transcript_to_gene}
cp ${transcript_to_gene} ${file_prefix}/ cp ${transcript_to_gene} ${file_prefix}/
cp clean_${transcript_to_gene} ${file_prefix}/
cp ${cdna_t2g} ${file_prefix}/ cp ${cdna_t2g} ${file_prefix}/
cp ${intron_t2g} ${file_prefix}/ cp ${intron_t2g} ${file_prefix}/
""" """
...@@ -440,11 +431,9 @@ process velocity_marseq { ...@@ -440,11 +431,9 @@ process velocity_marseq {
${params.count} \ ${params.count} \
-x 1,0,6:1,6,14:0,0,0 \ -x 1,0,6:1,6,14:0,0,0 \
${reads} > ${file_prefix}_kb_mapping_report.txt ${reads} > ${file_prefix}_kb_mapping_report.txt
awk -v OFS='\t' '{print(\$1, \$2)}' ${transcript_to_gene} | \ fix_t2g.py --t2g ${transcript_to_gene}
sed -E "s|([A-Z]+[0-9]+)\.\S+\s([A-Z]+[0-9]+)\.\S+|\1\t\2|" > \ cp fix_t2g.txt ${file_prefix}/
clean_${transcript_to_gene}
cp ${transcript_to_gene} ${file_prefix}/ cp ${transcript_to_gene} ${file_prefix}/
cp clean_${transcript_to_gene} ${file_prefix}/
cp ${cdna_t2g} ${file_prefix}/ cp ${cdna_t2g} ${file_prefix}/
cp ${intron_t2g} ${file_prefix}/ cp ${intron_t2g} ${file_prefix}/
""" """
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment