Skip to content
Snippets Groups Projects
Commit a5c58fa2 authored by nfontrod's avatar nfontrod
Browse files

src/download_encode_eclip/merge_eclip.py: modification of merge_files function...

src/download_encode_eclip/merge_eclip.py: modification of merge_files function to filter onily the peaks with a score of 1000 and inside a gene (first and last exons not included)"
parent f63dde59
No related branches found
No related tags found
No related merge requests found
......@@ -12,6 +12,7 @@ from pathlib import Path
from typing import List, Tuple, Dict
import subprocess as sp
import lazyparser as lp
from .create_gene_bed import create_gene_bed
def load_clip_files(assay_name: List[str] = "eclip") -> List[Path]:
......@@ -136,18 +137,25 @@ def merge_files(dfiles: Dict[str, List[Path]], target: str) -> None:
"""
outfolder = ConfigEncodeClip.merged_output / target
outfolder.mkdir(parents=True, exist_ok=True)
gene_bed = ConfigEncodeClip.output_gene
for k in dfiles:
print(f"Working on {k}")
tmp_out = outfolder / f"{k}_tmp.bed"
tmp_out2 = outfolder / f"{k}_tmp2.bed"
res_out = outfolder / f"{k}_merged.bed.gz"
list_file = " ".join(map(str, dfiles[k]))
cmd = f"zcat {list_file} | sort -k1,1 -k2,2n > {tmp_out}"
cmd2 = f"bedtools merge -i {tmp_out} -s -c 4,5,6 " \
cmd = "zcat %s | awk 'BEGIN{FS=\"\\t\"}{if($5==1000){print $0}}' " \
"> %s" % (list_file, tmp_out)
cmd2 = f"bedtools intersect -a {tmp_out} -b {gene_bed} -s " \
f"| sort -k1,1 -k2,2n > {tmp_out2}"
cmd3 = f"bedtools merge -i {tmp_out2} -d -1 -s -c 4,5,6 " \
f"-o collapse,mean,distinct | cut -f1-3,5-7 | sed 's/^chr//g' " \
f"| gzip -c > {res_out}"
sp.check_output(cmd, shell=True)
sp.check_output(cmd2, shell=True)
sp.check_output(cmd3, shell=True)
tmp_out.unlink()
tmp_out2.unlink()
@lp.parse(assay=["eCLIP", "iCLIP", "RIP-chip", "RIP-seq"],
......@@ -160,6 +168,8 @@ def create_merged_files(assay: List[str] = ("eCLIP", "iCLIP"),
:param target: The level used to group the files. (factor, replicate, \
cell) (default 'factor').
"""
if not ConfigEncodeClip.output_gene.is_file():
create_gene_bed()
mfiles = load_clip_files(assay)
dfiles = group_files(mfiles, target)
merge_files(dfiles, target)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment