Verified Commit d520fcd0 authored by Mia Croiset

clean functions to call hicstuff functions

parent 49ad90d5
#!/usr/bin/env python
import os, time, csv, sys, re
"""
Bridge from nextflow function call to hicstuff's python function for bam2pairs
"""
import os, sys
import argparse
import pysam as ps
import pandas as pd
import itertools
from hicstuff.log import logger
import hicstuff.log as hcl
import hicstuff.io as hio
import hicstuff.digest as hcd
import hicstuff.stats as hcs
from Bio import SeqIO
import logging
from hicstuff.version import __version__
from hicstuff.pipeline import bam2pairs, generate_log_header
def bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual=30):
"""
Make a .pairs file from two Hi-C bam files sorted by read names.
The Hi-C mates are matched by read identifier. Pairs where at least one
reads maps with MAPQ below min_qual threshold are discarded. Pairs are
sorted by readID and stored in upper triangle (first pair higher).
Parameters
----------
bam1 : str
Path to the name-sorted BAM file with aligned Hi-C forward reads.
bam2 : str
Path to the name-sorted BAM file with aligned Hi-C reverse reads.
out_pairs : str
Path to the output space-separated .pairs file with columns
readID, chr1 pos1 chr2 pos2 strand1 strand2
info_contigs : str
Path to the info contigs file, to get info on chromosome sizes and order.
min_qual : int
Minimum mapping quality required to keep a Hi-C pair.
"""
    forward = ps.AlignmentFile(bam1, "rb")
    reverse = ps.AlignmentFile(bam2, "rb")
    # Generate header lines
    format_version = "## pairs format v1.0\n"
    sorting = "#sorted: readID\n"
    cols = "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n"
    # Chromosome order will be identical in info_contigs and pair files
    chroms = pd.read_csv(info_contigs, sep="\t").apply(
        lambda x: "#chromsize: %s %d\n" % (x.contig, x.length), axis=1
    )
    with open(out_pairs, "w") as pairs:
        pairs.writelines([format_version, sorting, cols] + chroms.tolist())
        pairs_writer = csv.writer(pairs, delimiter="\t")
        n_reads = {"total": 0, "mapped": 0}
        # Remember if some read IDs were missing from either file
        unmatched_reads = 0
        # Remember if all reads in one bam file have been read
        exhausted = [False, False]
        # Iterate on both BAM simultaneously
        end_regex = re.compile(r'/[12]$')
        for end1, end2 in itertools.zip_longest(forward, reverse):
            # Remove end-specific suffix if any (an end can be None once the
            # shorter BAM file is exhausted, so guard before touching it)
            if end1 is not None:
                end1.query_name = re.sub(end_regex, '', end1.query_name)
            if end2 is not None:
                end2.query_name = re.sub(end_regex, '', end2.query_name)
            # Both files still have reads
            # Check if reads pass filter
            try:
                end1_passed = end1.mapping_quality >= min_qual
            # Happens if end1 bam file has been exhausted
            except AttributeError:
                exhausted[0] = True
                end1_passed = False
            try:
                end2_passed = end2.mapping_quality >= min_qual
            # Happens if end2 bam file has been exhausted
            except AttributeError:
                exhausted[1] = True
                end2_passed = False
            # Skip read if mate is not present until they match or reads
            # have been exhausted
            while sum(exhausted) == 0 and end1.query_name != end2.query_name:
                # Get next read and check filters again
                # Count single-read iteration
                unmatched_reads += 1
                n_reads["total"] += 1
                if end1.query_name < end2.query_name:
                    try:
                        end1 = next(forward)
                        end1_passed = end1.mapping_quality >= min_qual
                    # If EOF is reached in BAM 1
                    except (StopIteration, AttributeError):
                        exhausted[0] = True
                        end1_passed = False
                    n_reads["mapped"] += end1_passed
                elif end1.query_name > end2.query_name:
                    try:
                        end2 = next(reverse)
                        end2_passed = end2.mapping_quality >= min_qual
                    # If EOF is reached in BAM 2
                    except (StopIteration, AttributeError):
                        exhausted[1] = True
                        end2_passed = False
                    n_reads["mapped"] += end2_passed
            # 2 reads processed per iteration, unless one file is exhausted
            n_reads["total"] += 2 - sum(exhausted)
            n_reads["mapped"] += sum([end1_passed, end2_passed])
            # Keep only pairs where both reads have good quality
            if end1_passed and end2_passed:
                # Flipping to get upper triangle
                if (
                    end1.reference_id == end2.reference_id
                    and end1.reference_start > end2.reference_start
                ) or end1.reference_id > end2.reference_id:
                    end1, end2 = end2, end1
                pairs_writer.writerow(
                    [
                        end1.query_name,
                        end1.reference_name,
                        end1.reference_start + 1,
                        end2.reference_name,
                        end2.reference_start + 1,
                        "-" if end1.is_reverse else "+",
                        "-" if end2.is_reverse else "+",
                    ]
                )
    pairs.close()
    if unmatched_reads > 0:
        logger.warning(
            "%d reads were only present in one BAM file. Make sure you sorted reads by name before running the pipeline.",
            unmatched_reads,
        )
    logger.info(
        "{perc_map}% reads (single ends) mapped with Q >= {qual} ({mapped}/{total})".format(
            total=n_reads["total"],
            mapped=n_reads["mapped"],
            perc_map=round(100 * n_reads["mapped"] / n_reads["total"]),
            qual=min_qual,
        )
    )
def generate_log_header(log_path, input1, input2, genome, enzyme):
    hcl.set_file_handler(log_path, formatter=logging.Formatter(""))
    logger.info("## hicstuff: v%s log file", __version__)
    logger.info("## date: %s", time.strftime("%Y-%m-%d %H:%M:%S"))
    logger.info("## enzyme: %s", str(enzyme))
    logger.info("## input1: %s ", input1)
    logger.info("## input2: %s", input2)
    logger.info("## ref: %s", genome)
    logger.info("---")
    #hcl.set_file_handler(log_path, formatter=hcl.logfile_formatter)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
......
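The local definitions above are superseded by the import of bam2pairs and generate_log_header from hicstuff.pipeline. A minimal sketch of how such a bridge script could wire command-line arguments to the imported function is shown below; the option names are illustrative assumptions, not the script's actual CLI (which is collapsed in this diff).

# Illustrative sketch only: the option names below are assumptions, not the
# bridge script's real CLI.
import argparse
from hicstuff.pipeline import bam2pairs

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Build a .pairs file from two name-sorted Hi-C BAM files"
    )
    parser.add_argument("--bam1", required=True, help="Name-sorted BAM with forward reads")
    parser.add_argument("--bam2", required=True, help="Name-sorted BAM with reverse reads")
    parser.add_argument("--out_pairs", required=True, help="Output .pairs file")
    parser.add_argument("--info_contigs", required=True, help="Contig info file (sizes and order)")
    parser.add_argument("--min_qual", type=int, default=30, help="Minimum MAPQ to keep a pair")
    args = parser.parse_args()
    # Same signature as the bam2pairs definition shown above
    bam2pairs(args.bam1, args.bam2, args.out_pairs, args.info_contigs, min_qual=args.min_qual)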
#!/usr/bin/env python
import os, time, csv, sys, re
import os, sys
import argparse
import pysam as ps
import pandas as pd
import shutil as st
import subprocess as sp
import itertools
from hicstuff.log import logger
import hicstuff.io as hio
import hicstuff.digest as hcd
from Bio import SeqIO
import hicstuff.log as hcl
import hicstuff.pipeline as hp
......
#!/usr/bin/env python3
# coding: utf-8
"""Cut the reads at their ligation sites.
Bridge from nextflow function call to hicstuff's python function for cutsite
The module will cut at ligation sites of the given enzyme. From the original
fastq files (gzipped or not) it will create new gzipped fastq files containing
combinations of the read fragments obtained by cutting the reads at their
ligation sites.
There are three choices for how to combine the fragments. 1. "for_vs_rev": all
combinations are made between one forward fragment and one reverse fragment.
It's the most relevant one for the usual workflow. 2. "all": all 2-combinations
are made. 3. "pile": only combinations between adjacent fragments in the
initial reads are made.
This module contains the following functions:
    - cut_ligation_sites
    - cutsite_read
    - write_pair
    - _writer
""" """
import gzip
import multiprocessing
import pyfastx
import re
import sys
import argparse
import hicstuff.digest as hcd
from hicstuff.log import logger
import hicstuff.cutsite as hc
if __name__ == "__main__":
......
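The docstring above lists three ways of combining read fragments after cutting at ligation sites. Below is a small self-contained sketch of that pairing logic; it is not hicstuff.cutsite's implementation, and the within-read adjacency used for "pile" is an assumption.

# Illustrative sketch of the three combination modes described in the docstring above.
from itertools import combinations, product

def combine(frags_for, frags_rev, mode):
    if mode == "for_vs_rev":
        # Every forward fragment paired with every reverse fragment
        return list(product(frags_for, frags_rev))
    if mode == "all":
        # All 2-combinations over the pooled fragments
        return list(combinations(frags_for + frags_rev, 2))
    if mode == "pile":
        # Only adjacent fragments within each original read (assumed reading of "pile")
        adjacent = lambda frags: list(zip(frags, frags[1:]))
        return adjacent(frags_for) + adjacent(frags_rev)
    raise ValueError(f"unknown mode: {mode}")

# A forward read cut into two fragments and a reverse read cut into two fragments
print(combine(["F1", "F2"], ["R1", "R2"], "for_vs_rev"))  # 4 pairs
print(combine(["F1", "F2"], ["R1", "R2"], "all"))         # 6 pairs
print(combine(["F1", "F2"], ["R1", "R2"], "pile"))        # 2 pairs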
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Bridge from nextflow function call to hicstuff's python function for distance law
"""
import numpy as np
import sys
import matplotlib.pyplot as plt
import warnings
from scipy import ndimage
from matplotlib import cm
import pandas as pd
import os as os
import csv as csv
import hicstuff.io as hio
from hicstuff.log import logger
import argparse
import hicstuff.distance_law as hdl
......
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Hi-C event filtering
Analyse the contents of a 3C library and filter spurious events such as loops
and uncuts to improve the overall signal. Filtering consists of excluding +-
and -+ Hi-C pairs whose reads are closer than a threshold number of
restriction fragments. This threshold represents the distance at which the
abundance of these events deviates significantly from the rest of the library.
It is estimated automatically by default using the median absolute deviation of
pairs at longer distances, but can also be entered manually.
The program takes a 2D BED file as input with the following fields:
chromA startA endA indiceA strandA chromB startB endB indiceB strandB
Each line of the file represents a Hi-C pair with reads A and B. The indices
are 0-based and represent the restriction fragment to which reads are
attributed.
@author: cmdoret (reimplementation of Axel KournaK's code)
"""
import sys
import numpy as np
from collections import OrderedDict
import matplotlib.pyplot as plt
import argparse
import logging
import hicstuff.filter as hf
import hicstuff_log as hcl
......
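The filtering described above discards +- (uncut-like) and -+ (loop-like) pairs whose reads lie within a threshold number of restriction fragments of each other. A toy sketch of that rule follows; it is not hicstuff.filter itself, and the thresholds are placeholders, whereas hicstuff estimates them from the library by default.

# Toy sketch of the filtering rule described above (not hicstuff.filter itself).
def keep_pair(strand_a, strand_b, frag_a, frag_b, thr_uncut=10, thr_loop=10):
    """Return False for +- (uncut-like) or -+ (loop-like) pairs whose reads
    lie within the threshold number of restriction fragments."""
    frag_dist = abs(frag_b - frag_a)
    if (strand_a, strand_b) == ("+", "-") and frag_dist <= thr_uncut:
        return False  # likely an uncut event
    if (strand_a, strand_b) == ("-", "+") and frag_dist <= thr_loop:
        return False  # likely a loop event
    return True

print(keep_pair("+", "-", 120, 123))  # False: +- pair only 3 fragments apart
print(keep_pair("-", "+", 120, 500))  # True: far enough apart to be kept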
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Bridge from nextflow function call to hicstuff's python function for PCR duplicates filtering
"""
import hicstuff.io as hio
from hicstuff.log import logger
from hicstuff.pipeline import filter_pcr_dup
import argparse
import os, time, csv, sys, re
import csv
def filter_pcr_dup(pairs_idx_file, filtered_file):
"""
Filter out PCR duplicates from a coordinate-sorted pairs file using
overrrepresented exact coordinates. If multiple fragments have two reads
with the exact same coordinates, only one of those fragments is kept.
Parameters
----------
pairs_idx_file : str
Path to an indexed pairs file containing the Hi-C reads.
filtered_file : str
Path to the output pairs file after removing duplicates.
"""
    # Keep count of how many reads are filtered
    filter_count = 0
    reads_count = 0
    # Store header lines
    header = hio.get_pairs_header(pairs_idx_file)
    with open(pairs_idx_file, "r") as pairs, open(filtered_file, "w") as filtered:
        # Copy header lines to filtered file
        for head_line in header:
            filtered.write(head_line + "\n")
            next(pairs)
        # Use csv methods to easily access columns
        paircols = [
            "readID",
            "chr1",
            "pos1",
            "chr2",
            "pos2",
            "strand1",
            "strand2",
            "frag1",
            "frag2",
        ]
        # Columns used for comparison of coordinates
        coord_cols = [col for col in paircols if col != "readID"]
        pair_reader = csv.DictReader(pairs, delimiter="\t", fieldnames=paircols)
        filt_writer = csv.DictWriter(filtered, delimiter="\t", fieldnames=paircols)
        # Initialise a variable to store coordinates of reads in previous pair
        prev = {k: 0 for k in paircols}
        for pair in pair_reader:
            reads_count += 1
            # If coordinates are the same as before, skip pair
            if all(pair[pair_var] == prev[pair_var] for pair_var in coord_cols):
                filter_count += 1
                continue
            # Else write pair and store new coordinates as previous
            else:
                filt_writer.writerow(pair)
                prev = pair
    logger.info(
        "%d%% PCR duplicates have been filtered out (%d / %d pairs) "
        % (100 * round(filter_count / reads_count, 3), filter_count, reads_count)
    )
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
......
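filter_pcr_dup keeps only the first pair of any run sharing all coordinate columns, which is why the input pairs file must be coordinate-sorted. A minimal illustration of that comparison on hypothetical rows:

# Minimal illustration of the duplicate test used above, on hypothetical rows.
coord_cols = ["chr1", "pos1", "chr2", "pos2", "strand1", "strand2", "frag1", "frag2"]
rows = [
    {"readID": "a", "chr1": "chr1", "pos1": "100", "chr2": "chr2", "pos2": "5000",
     "strand1": "+", "strand2": "-", "frag1": "3", "frag2": "71"},
    {"readID": "b", "chr1": "chr1", "pos1": "100", "chr2": "chr2", "pos2": "5000",
     "strand1": "+", "strand2": "-", "frag1": "3", "frag2": "71"},  # PCR duplicate of "a"
    {"readID": "c", "chr1": "chr1", "pos1": "250", "chr2": "chr2", "pos2": "5000",
     "strand1": "+", "strand2": "-", "frag1": "4", "frag2": "71"},
]
prev = {k: 0 for k in coord_cols}
kept = []
for row in rows:
    if all(row[c] == prev[c] for c in coord_cols):
        continue  # same coordinates as the previous pair: drop as duplicate
    kept.append(row["readID"])
    prev = row
print(kept)  # ['a', 'c']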
#!/usr/bin/env python3
# coding: utf-8
"""Iterative alignment
Iteratively aligns reads from a 3C fastq file: reads
are trimmed to a range-sweeping number of basepairs
and each read set generated this way is mapped onto
the reference genome. This may result in a small
increase in the number of properly mapped reads.
@author: Remi Montagne & cmdoret
"""
import os
import sys
import glob
import re
import subprocess as sp
import pysam as ps
import shutil as st
import hicstuff.io as hio
import contextlib
import argparse
from hicstuff.log import logger
from os.path import join
import hicstuff.iteralign as hi
if __name__ == "__main__":
......
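The iteralign docstring describes mapping progressively longer read truncations so that reads failing to map at a short length get another chance once extended. A schematic, self-contained sketch of that loop follows; the toy exact-match "aligner" and the length sweep parameters are stand-ins, not hicstuff.iteralign's actual implementation.

# Schematic sketch of the iterative alignment idea described above.
def toy_map(truncated, genome):
    """Map a truncated read if it occurs exactly once in the toy genome."""
    return {
        rid: genome.find(seq)
        for rid, seq in truncated.items()
        if genome.count(seq) == 1
    }

def iterative_align(reads, genome, min_len=4, step=2):
    mapped, remaining = {}, dict(reads)
    trunc_len = min_len
    max_len = max(len(seq) for seq in reads.values())
    while remaining and trunc_len <= max_len:
        truncated = {rid: seq[:trunc_len] for rid, seq in remaining.items()}
        hits = toy_map(truncated, genome)
        mapped.update(hits)
        # Only reads that are still unmapped are extended and tried again
        remaining = {rid: seq for rid, seq in remaining.items() if rid not in hits}
        trunc_len += step
    return mapped

genome = "ACGTACGTTTGACCAGTTAGCCGATCGATCGGGTACCAGT"
reads = {"r1": "TTGACCAGTTAG", "r2": "ACGTTTGACC"}
# r1 maps on the first pass; r2 is ambiguous at 4 bp and maps once extended to 6 bp
print(iterative_align(reads, genome))  # {'r1': 8, 'r2': 4}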