diff --git a/src/download_encode_eclip/merge_eclip.py b/src/download_encode_eclip/merge_eclip.py index ff716595859e4d1faabc5c55d628a83ace7c98ef..99e646dcd107812d352d676908264abacfc0ffd5 100644 --- a/src/download_encode_eclip/merge_eclip.py +++ b/src/download_encode_eclip/merge_eclip.py @@ -127,17 +127,18 @@ def group_files(mfiles: List[Path], target: str) -> Dict[str, List[Path]]: return d -def merge_files(dfiles: Dict[str, List[Path]], target: str) -> None: +def merge_files(dfiles: Dict[str, List[Path]], target: str, filter_bed: Path + ) -> None: """ Merge the file together. :param dfiles: A dictionary that links each target to the files of interest :param target: The level used to group the files. (factor, replicate, cell) + :param filter_bed: bed used to filter the clip peaks :return: The merged bed file """ outfolder = ConfigEncodeClip.merged_output / target outfolder.mkdir(parents=True, exist_ok=True) - gene_bed = ConfigEncodeClip.output_gene for k in dfiles: print(f"Working on {k}") tmp_out = outfolder / f"{k}_tmp.bed" @@ -146,11 +147,11 @@ def merge_files(dfiles: Dict[str, List[Path]], target: str) -> None: list_file = " ".join(map(str, dfiles[k])) cmd = "zcat %s | awk 'BEGIN{FS=\"\\t\"}{if($5==1000){print $0}}' " \ "> %s" % (list_file, tmp_out) - cmd2 = f"bedtools intersect -a {tmp_out} -b {gene_bed} -s " \ + cmd2 = f"bedtools intersect -a {tmp_out} -b {filter_bed} -s " \ f"| sort -k1,1 -k2,2n > {tmp_out2}" cmd3 = f"bedtools merge -i {tmp_out2} -d -1 -s -c 4,5,6 " \ - f"-o collapse,mean,distinct | cut -f1-3,5-7 | sed 's/^chr//g' " \ - f"| gzip -c > {res_out}" + f"-o collapse,mean,distinct | cut -f1-3,5-7 | " \ + f"sed 's/^chr//g' | gzip -c > {res_out}" sp.check_output(cmd, shell=True) sp.check_output(cmd2, shell=True) sp.check_output(cmd3, shell=True) @@ -161,18 +162,20 @@ def merge_files(dfiles: Dict[str, List[Path]], target: str) -> None: @lp.parse(assay=["eCLIP", "iCLIP", "RIP-chip", "RIP-seq"], target=["factor", "replicate", "cell"]) def create_merged_files(assay: List[str] = ("eCLIP", "iCLIP"), - target: str = "factor"): + target: str = "factor", level: str = "exon"): """ :param assay: The kind of assay of interest(default ['eCLIP', 'iCLIP'] :param target: The level used to group the files. (factor, replicate, \ cell) (default 'factor'). + :param level: 'gene' to create a bed file without first and last exons \ + 'exon' to create a bed file of extended exon (200nt at each size) \ + without first and last exon (default exon) """ - if not ConfigEncodeClip.output_gene.is_file(): - create_gene_bed() + filter_bed = create_gene_bed(level) mfiles = load_clip_files(assay) dfiles = group_files(mfiles, target) - merge_files(dfiles, target) + merge_files(dfiles, target, filter_bed) if __name__ == "__main__":