diff --git a/src/.docker_modules/nanosplicer/1.1/Dockerfile b/src/.docker_modules/nanosplicer/1.1/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..5a0e18470c19468b33980443ffff98107805e736 --- /dev/null +++ b/src/.docker_modules/nanosplicer/1.1/Dockerfile @@ -0,0 +1,10 @@ +FROM xgrand/nanosplicer:1.0 + +RUN mv /NanoSplicer/bin/JWR_checker.py /NanoSplicer/bin/JWR_checker_original.py +COPY JWR_checker_modified.py /NanoSplicer/bin/ +RUN mv /NanoSplicer/bin/JWR_checker_modified.py /NanoSplicer/bin/JWR_checker.py + +WORKDIR /NanoSplicer/bin + +# command to run on container start +CMD [ "bash" ] diff --git a/src/.docker_modules/nanosplicer/1.1/JWR_checker_modified.py b/src/.docker_modules/nanosplicer/1.1/JWR_checker_modified.py new file mode 100755 index 0000000000000000000000000000000000000000..7159558c96d8dda28569b77df8f1b1faef0cd156 --- /dev/null +++ b/src/.docker_modules/nanosplicer/1.1/JWR_checker_modified.py @@ -0,0 +1,329 @@ +''' + Finding junctions within reads (JWRs) from a bam file + Input: + bam_filename <str>: + filename of the BAM + output_name <str> <default: "JWR_full_list.hd5">: + the filename of the output table. + Output: + A HDF5 with given output_name will be generated with the following + columns: + 1. read_id: <str> the id of original read of the JWR + 2. mapQ: <Int> the mapping quality in the input BAM + 3. transcript_strand: '+'/'-' + 4. chrID: mapped chromosome name + 5. <tuple> location of the splice junciton in the reference genome + 6. JAQ: <float> junction alignment quality +''' +import h5py +import warnings +import pandas as pd +import textwrap +import pysam +import os +import re +import numpy as np +import sys +import getopt +from tqdm import tqdm +import concurrent.futures +from concurrent.futures import ThreadPoolExecutor, as_completed +import multiprocessing as mp +import helper + +# suppress pd performace warning +warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning) + +class JWR_to_hdf(h5py.File): + ''' + modify the output HDF5 fil + ''' + def __init__(self, Filename): + super().__init__(Filename, 'r+') + def add_setting_info(self, name, value): + name = 'setting/' + name + self[name] = value + +class JWR_from_reads: + ''' + Get information from bam + ''' + def __init__(self, read_iter): + ''' + read_iter: read iterator from pysam. e.g. from .fetch function + ''' + self.read_iter = read_iter + def get_JWR(self): + """ + Input: + read: AlignedSegment object from pysam + Return: + list of introns [(start, stop),...] + Listing the intronic sites in the reads (identified by + 'N' in the cigar strings). + """ + for read in self.read_iter: + match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position + ref_skip = 3 + base_position = read.pos + for op, nt in read.cigartuples: + if op in match_or_deletion: + base_position += nt + elif op == ref_skip: + junc_start = base_position + base_position += nt + yield JWR_class(read, (junc_start, base_position),read.reference_name) + +class JWR_class: + def __init__(self, read, loc, chrID): + ''' + Define a junction within read (JWR) + Input: + read: pysam AlignedSegment + loc: tuple of intron location corresponding to the JWR + ''' + self.qname = read.qname + self.cigarstring = read.cigarstring + self.reference_start = read.reference_start + self.reference_end =read.reference_end + self.loc = loc + self.chrID = chrID + self.mapq = read.mapping_quality + try: + self.ts = read.get_tag("ts") + if read.is_reverse: + if self.ts == '+': + self.ts = '-' + elif self.ts == '-': + self.ts = '+' + except: + self.ts = None + def get_JAQ(self, half_motif_size=25): + ''' + Report the junction alignment quality + ''' + junction_cigar = \ + self.get_junction_cigar(half_motif_size=25) + junction_alignment_quality =\ + self.get_junc_map_quality(junction_cigar) + return junction_alignment_quality + def get_junction_cigar(self, half_motif_size): + ''' + Return the cigar letter of around the JWR + ''' + cigar_long = [] + for count, op in re.findall('(\d+)([A-Za-z=])', self.cigarstring): + cigar_long += int(count) * [op] + junc1_rel_read_start = self.loc[0] - self.reference_start + junc2_rel_read_start = self.loc[1] - self.reference_start + junction_cigar1 = '' + junction_cigar2 = '' + ref_index = -1 + for i in cigar_long: + if i in 'MND=X': + ref_index += 1 + if ref_index >= max(0, junc1_rel_read_start - half_motif_size) \ + and ref_index < junc1_rel_read_start \ + and i != 'N': + junction_cigar1 += i + if ref_index >= junc1_rel_read_start \ + and ref_index < min(junc2_rel_read_start + half_motif_size, + self.reference_end - self.reference_start) \ + and i != 'N': + junction_cigar2 += i + if ref_index >= min(junc2_rel_read_start + half_motif_size, + self.reference_end - self.reference_start): + break + return junction_cigar1[-25:] + junction_cigar2[:25] + def get_junc_map_quality(self, cigar): + ''' + The junc map quality is simply as the proportion of 'M's within the cigar string + ''' + if not cigar: + return np.nan + elif 'M' in cigar: + return cigar.count('M')/len(cigar) + elif '=' in cigar: + return cigar.count('=')/len(cigar) + else: + return 0 + +class JWR_checker_param: + def __init__(self, arg = sys.argv): + self.arg = arg + try: + opts, args = getopt.getopt(arg[1:],"hw:", + ["help", + "window=", + "chrID=", + "genome-loc=", + "threads=", + "output_csv"]) + except getopt.GetoptError: + helper.err_msg("ERROR:Invalid input.") + self.print_help() + sys.exit(1) + + # DEFAULT VALUE + self.junc_cigar_win = 25 + self.chrID, self.g_loc = None, (None,None) + self.threads = 32 + self.output_csv = False + + for opt, arg in opts: + if opt in ("-h", "--help"): + self.print_help() + sys.exit(0) + elif opt in ("-w", "--window"): + self.junc_cigar_win = int(arg) + elif opt == "--chrID": + self.chrID = arg + elif opt == "--genome-loc": + self.g_loc =\ + tuple([int(i) for i in arg.split('-')]) + if len(self.g_loc) != 2: + helper.err_msg("ERROR:Invalid genome-loc.") + sys.exit(1) + elif opt == "--threads": + self.threads = int(arg) + elif opt == "--output_csv": + self.output_csv = True + + if len(args) <2: + helper.err_msg("ERROR:Invalid input.") + self.print_help() + sys.exit(1) + self.bamfile, self.outfile = args + + def print_help(self): + help_message =\ + ''' + Finding junctions within reads (JWRs) from a spliced mapping result (BAM). + + Usage: python3 {} [OPTIONS] <BAM file> <output hdf5 file> + + Options: + -h/--help Print this help text + -w/--window Candidate search window size (nt) <default: 25> + --chrID Target a specific chromosome, chrID should match + the chromosome name in the BAM. All chromosomes + in the BAM will be searched if not specified + --genome-loc Target a specific genomic region, e.g. --genome-loc=0-10000 + Use in conjunction with --chrID option. Entire + chromosome will be searched if location not specified + --threads Number of threads used <default: 32>. + --output_csv With this option, a csv file will be output along with the hdf5 file + '''.format(sys.argv[0]) + + print(textwrap.dedent(help_message)) + +def tqdm_parallel_map(executor, fn, *iterables, **kwargs): + """ + Equivalent to executor.map(fn, *iterables), + but displays a tqdm-based progress bar. + + Does not support timeout or chunksize as executor.submit is used internally + + **kwargs is passed to tqdm. + """ + futures_list = [] + for iterable in iterables: + futures_list += [executor.submit(fn, i) for i in iterable] + for f in tqdm(concurrent.futures.as_completed(futures_list), total=len(futures_list), **kwargs): + yield f.result() + +def get_row(jwr_class_list, junc_cigar_win): + d = pd.DataFrame( + {'id': [jwr.qname for jwr in jwr_class_list], + 'mapQ': [jwr.mapq for jwr in jwr_class_list], + 'transcript_strand': [jwr.ts for jwr in jwr_class_list], + 'chrID': [jwr.chrID for jwr in jwr_class_list], + 'loc': [jwr.loc for jwr in jwr_class_list], + 'JAQ': [jwr.get_JAQ(junc_cigar_win) for jwr in jwr_class_list] + }) + return d + +def chunks(lst, n): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i:i + n] + +def main(): + + # read command line arguments + param = JWR_checker_param() + + # check mismatch recorded in CIGAR + algn_file = pysam.AlignmentFile(param.bamfile) + for read in algn_file.fetch(): + if 'M' in read.cigarstring: + warning_text =\ + ''' + Warning: Mismatches are not explicitly labeled in the CIGAR from the input BAM file. + The JAQs can still be calculated but mismatched bases will be treated as matched bases. + It is recommended to update the mapping setting (e.g., use '--eqx' option in minimap2) + to label the mismatches in CIGAR. + ''' + helper.warning_msg(textwrap.dedent(warning_text)) + + break + if '=' in read.cigarstring or 'X' in read.cigarstring: + break + + print("Searching for JWRs ...\n\n") + + # get splice junctions from BAM + algn_file = pysam.AlignmentFile(param.bamfile) + reads_fetch = algn_file.fetch(param.chrID, + param.g_loc[0], + param.g_loc[1]) + JWR_fetch = JWR_from_reads(reads_fetch) + + JWRs = list(JWR_fetch.get_JWR()) + # JWRs = [x for x in JWRs if x.loc[0] > param.g_loc[0] - 50 and + # x.loc[1] < param.g_loc[1] + 50 ] + + print("Calculating JAQ for {} JWRs found".format(len(JWRs))) + executor = concurrent.futures.ProcessPoolExecutor(param.threads) + futures = [executor.submit(get_row, jwr, param.junc_cigar_win) for jwr in chunks(JWRs, 500)] + + pbar = tqdm(total = len(JWRs)) + for future in as_completed(futures): + pbar.update(500) + + d_list = [x.result() for x in futures if x.result() is not None] + if d_list: + d = pd.concat(d_list) + d.to_hdf(param.outfile, 'data') + pd.DataFrame([]).to_hdf(param.outfile, 'skipped') + + # output csv file if required + if param.output_csv: + d.to_csv(param.outfile + ".csv") + + else: + empty_data = pd.DataFrame({ + 'id': [], + 'mapQ': [], + 'transcript_strand': [], + 'chrID': [], + 'loc': [], + 'JAQ': [] + }) + empty_data.to_hdf(param.outfile, 'data') + pd.DataFrame([]).to_hdf(param.outfile, 'skipped') + + if param.output_csv: + empty_data.to_csv(param.outfile + ".csv") + + # d = pd.concat([x.result() for x in futures]) + # + # d.to_hdf(param.outfile, 'data') + # pd.DataFrame([]).to_hdf(param.outfile, 'skipped') + # + # # output csv file if required + # if param.output_csv: + # d.to_csv(param.outfile + ".csv") + +if __name__ == "__main__": + main() diff --git a/src/.docker_modules/nanosplicer/1.1/docker_init.sh b/src/.docker_modules/nanosplicer/1.1/docker_init.sh new file mode 100755 index 0000000000000000000000000000000000000000..903b9d2f53fb1fd9fdf049908ea0438bf94942dd --- /dev/null +++ b/src/.docker_modules/nanosplicer/1.1/docker_init.sh @@ -0,0 +1,10 @@ +#!/bin/sh +# docker pull lbmc/nanosplicer:1.1 +# docker build src/.docker_modules/nanosplicer/1.1 -t 'lbmc/nanosplicer:1.1' +# docker push lbmc/nanosplicer:1.1 +# docker buildx build --platform linux/amd64,linux/arm64 -t "lbmc/nanosplicer:1.1" --push src/.docker_modules/nanosplicer/1.1 + +# docker pull xgrand/nanosplicer:1.1 +docker build src/.docker_modules/nanosplicer/1.1 -t 'xgrand/nanosplicer:1.1' +docker push xgrand/nanosplicer:1.1 +# docker buildx build --platform linux/amd64,linux/arm64 -t "xgrand/nanosplicer:1.1" --push src/.docker_modules/nanosplicer/1.1 diff --git a/src/nf_modules/nanosplicer/main.nf b/src/nf_modules/nanosplicer/main.nf index b49e0639b63820fffe5f39ad23550a4844ff97ea..94b6c19e13bddfe56e20f2ffd800b3c2d2db3400 100644 --- a/src/nf_modules/nanosplicer/main.nf +++ b/src/nf_modules/nanosplicer/main.nf @@ -1,4 +1,4 @@ -version = "1.0" +version = "1.1" container_url = "xgrand/nanosplicer:${version}" params.nanosplicer_out = ""