diff --git a/src/db_utils/interactions/features_interactions.py b/src/db_utils/interactions/features_interactions.py index bc0f819d522f49fdc14f101e1d7b9dbd50e9a98d..6369dbb85172f6815b964c1184abad4d5ea01a5f 100644 --- a/src/db_utils/interactions/features_interactions.py +++ b/src/db_utils/interactions/features_interactions.py @@ -28,6 +28,28 @@ from tqdm import tqdm from typing import Dict, List import re import numpy as np +import os.path +import sys +from ..populate_database import populate_df + + +MY_ID = sys.argv[1] + +PET = Config.chia_pet / f"{MY_ID}.bed" +FILE_1 = os.path.split(PET)[1].split(".")[0] + +if sys.argv[2] == "--exon": + OPT_GENE = "off" + OPT_EXON = "on" + INTER_FILE = Config.pet_vs_exon_output / f"exon_w200_vs_{MY_ID}.bed" + FILE_2 = ((os.path.split(INTER_FILE)[1].split(".")[0]).split("vs_")[1]) + MY_OUT_FILE = Config.couple_exon / f"{MY_ID}_couple_exon.txt" +elif sys.argv[2] == "--gene": + OPT_GENE = "on" + OPT_EXON = "off" + INTER_FILE = Config.pet_vs_gene_output / f"gene_w200_vs_{MY_ID}.bed" + FILE_2 = ((os.path.split(INTER_FILE)[1].split(".")[0]).split("vs_")[1]) + MY_OUT_FILE = Config.couple_gene / f"{MY_ID}_couple_gene.txt" def work_on_pet(): @@ -42,8 +64,7 @@ def work_on_pet(): :return: Pet in this format: chr1:start1..end1 chr2:start2..end2 weight1-2 """ - pet = pd.read_csv(Config.chia_pet / "GSM1517080.bed", sep="\t", - header=None) + pet = pd.read_csv(PET, sep="\t", header=None) pet.drop_duplicates(subset=3, inplace=True) pet = pet.iloc[:, 3].str.split(r"-|,", expand=True) pet.columns = ["anchor1", "anchor2", "weight"] @@ -99,9 +120,8 @@ def work_on_intersection(): :return: A dictionary which link the id of the pet with the region (exon/gene) it contains. """ - inter_file = Config.pet_vs_exon_output / "exon_w200_vs_GSM1517080.bed" dic = {} - with inter_file.open("r") as infile: + with INTER_FILE.open("r") as infile: for line in infile: line = line.strip("\n").split("\t") id_anchor = f"{line[6]}:{line[7]}..{line[8]}" @@ -219,8 +239,21 @@ def filtering_2(df_filter_2: pd.DataFrame): df_filter_2[["id_region_1", "id_region_2"]] = df_filter_2.id.str.\ split("$", expand=True) del df_filter_2["id"] - df_filter_2 = df_filter_2.reindex(columns=["id_region_1", "id_region_2", - "weight", "level"]) + if OPT_EXON == "on": + df_filter_2.columns = ["level", "weight", "exon1", "exon2"] + df_filter_2 = df_filter_2.reindex(columns=["exon1", "exon2", "weight", + "level"]) + elif OPT_GENE == "on": + df_filter_2.columns = ["level", "weight", "gene1", "gene2"] + df_filter_2 = df_filter_2.reindex(columns=["gene1", "gene2", "weight", + "level"]) + df_filter_2 = df_filter_2.reset_index().rename(columns={"index": "id"}) + if FILE_1 == FILE_2: + df_filter_2["id_project"] = FILE_1 + df_filter_2["id2"] = df_filter_2["id"].astype(str) + "_" + \ + df_filter_2["id_project"].astype(str) + del df_filter_2["id"] + df_filter_2.rename(columns={"id2": "id"}, inplace=True) return df_filter_2 @@ -231,6 +264,8 @@ def create_interaction_table(logging_level: str = "DISABLE"): :return: The table of interaction """ logging_def(Config.chia_pet_interaction, __file__, logging_level) + Config.couple_exon.mkdir(exist_ok=True, parents=True) + Config.couple_gene.mkdir(exist_ok=True, parents=True) logging.debug("Reading of intersection between genomic regions and an " "anchor") anchor_dic = work_on_intersection() @@ -246,7 +281,13 @@ def create_interaction_table(logging_level: str = "DISABLE"): logging.debug("Sum weight of identical interaction") df = filtering_2(df) logging.debug(df.head()) - df.to_csv('test3.txt', index=False, sep="\t") + df.to_csv(MY_OUT_FILE, index=False, sep="\t", header=True) + if OPT_EXON == "on": + logging.debug("Filling cin_exon_interaction") + populate_df(table="cin_exon_interaction", df=df, clean="n") + elif OPT_GENE == "on": + logging.debug("Filling cin_gene_interaction") + populate_df(table="cin_gene_interaction", df=df, clean="n") return df