Skip to content
Snippets Groups Projects
Commit db1d1d1e authored by nfontrod's avatar nfontrod
Browse files

src/download_encode_eclip/merge_eclip.py: add a parameter level in...

src/download_encode_eclip/merge_eclip.py: add a parameter level in create_merged_files function to filter within genes (without first and last exons) or within exons (extended by 200 nt at each side)
parent b5f1625c
No related branches found
No related tags found
No related merge requests found
......@@ -127,17 +127,18 @@ def group_files(mfiles: List[Path], target: str) -> Dict[str, List[Path]]:
return d
def merge_files(dfiles: Dict[str, List[Path]], target: str) -> None:
def merge_files(dfiles: Dict[str, List[Path]], target: str, filter_bed: Path
) -> None:
"""
Merge the file together.
:param dfiles: A dictionary that links each target to the files of interest
:param target: The level used to group the files. (factor, replicate, cell)
:param filter_bed: bed used to filter the clip peaks
:return: The merged bed file
"""
outfolder = ConfigEncodeClip.merged_output / target
outfolder.mkdir(parents=True, exist_ok=True)
gene_bed = ConfigEncodeClip.output_gene
for k in dfiles:
print(f"Working on {k}")
tmp_out = outfolder / f"{k}_tmp.bed"
......@@ -146,11 +147,11 @@ def merge_files(dfiles: Dict[str, List[Path]], target: str) -> None:
list_file = " ".join(map(str, dfiles[k]))
cmd = "zcat %s | awk 'BEGIN{FS=\"\\t\"}{if($5==1000){print $0}}' " \
"> %s" % (list_file, tmp_out)
cmd2 = f"bedtools intersect -a {tmp_out} -b {gene_bed} -s " \
cmd2 = f"bedtools intersect -a {tmp_out} -b {filter_bed} -s " \
f"| sort -k1,1 -k2,2n > {tmp_out2}"
cmd3 = f"bedtools merge -i {tmp_out2} -d -1 -s -c 4,5,6 " \
f"-o collapse,mean,distinct | cut -f1-3,5-7 | sed 's/^chr//g' " \
f"| gzip -c > {res_out}"
f"-o collapse,mean,distinct | cut -f1-3,5-7 | " \
f"sed 's/^chr//g' | gzip -c > {res_out}"
sp.check_output(cmd, shell=True)
sp.check_output(cmd2, shell=True)
sp.check_output(cmd3, shell=True)
......@@ -161,18 +162,20 @@ def merge_files(dfiles: Dict[str, List[Path]], target: str) -> None:
@lp.parse(assay=["eCLIP", "iCLIP", "RIP-chip", "RIP-seq"],
target=["factor", "replicate", "cell"])
def create_merged_files(assay: List[str] = ("eCLIP", "iCLIP"),
target: str = "factor"):
target: str = "factor", level: str = "exon"):
"""
:param assay: The kind of assay of interest(default ['eCLIP', 'iCLIP']
:param target: The level used to group the files. (factor, replicate, \
cell) (default 'factor').
:param level: 'gene' to create a bed file without first and last exons \
'exon' to create a bed file of extended exon (200nt at each size) \
without first and last exon (default exon)
"""
if not ConfigEncodeClip.output_gene.is_file():
create_gene_bed()
filter_bed = create_gene_bed(level)
mfiles = load_clip_files(assay)
dfiles = group_files(mfiles, target)
merge_files(dfiles, target)
merge_files(dfiles, target, filter_bed)
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment