Skip to content
Snippets Groups Projects
Commit c2cfe070 authored by nfontrod's avatar nfontrod
Browse files

src/find_interaction_cluster/nt_and_community.py: add a region parameter

parent 011bb626
No related branches found
No related tags found
No related merge requests found
...@@ -27,13 +27,14 @@ from .sf_and_communities import get_key ...@@ -27,13 +27,14 @@ from .sf_and_communities import get_key
def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
feature: str) -> pd.DataFrame: feature: str, region: str = "") -> pd.DataFrame:
""" """
Get the frequency of every nucleotides for features in list_ft. Get the frequency of every nucleotides for features in list_ft.
:param cnx: Connection to chia-pet database :param cnx: Connection to chia-pet database
:param list_ft: The list of exons for which we want to get :param list_ft: The list of exons for which we want to get
:param feature: the kind of feature analysed :param feature: the kind of feature analysed
:param region: The region of gene analysed if feature is gene
:return: the frequency of nucleotides for the list of exons. :return: the frequency of nucleotides for the list of exons.
>>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file), >>> d = get_nt_frequency(sqlite3.connect(ConfigGraph.db_file),
...@@ -49,13 +50,18 @@ def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str], ...@@ -49,13 +50,18 @@ def get_nt_frequency(cnx: sqlite3.Connection, list_ft: List[str],
0 1 29.49376 18.34271 18.43874 33.72479 0 1 29.49376 18.34271 18.43874 33.72479
1 2 31.90401 16.40251 18.79033 32.90315 1 2 31.90401 16.40251 18.79033 32.90315
""" """
query_region = ""
if feature == "gene": if feature == "gene":
list_ft = [int(ft) for ft in list_ft] list_ft = [int(ft) for ft in list_ft]
if region == "":
region = "gene"
query_region = f"AND region = '{region}'"
query = f""" query = f"""
SELECT ft, id_{feature}, frequency SELECT ft, id_{feature}, frequency
FROM cin_{feature}_frequency FROM cin_{feature}_frequency
WHERE id_{feature} IN {tuple(list_ft)} WHERE id_{feature} IN {tuple(list_ft)}
AND ft_type="nt" AND ft_type="nt"
{query_region}
""" """
df = pd.read_sql_query(query, cnx) df = pd.read_sql_query(query, cnx)
df = df.pivot_table(index=f"id_{feature}", columns="ft", values="frequency")\ df = df.pivot_table(index=f"id_{feature}", columns="ft", values="frequency")\
...@@ -190,7 +196,8 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]: ...@@ -190,7 +196,8 @@ def get_ft_id(cnx: sqlite3.Connection, feature: str = "exon") -> List[str]:
def create_ctrl_community(df: pd.DataFrame, outfile: Path, def create_ctrl_community(df: pd.DataFrame, outfile: Path,
feature: str = 'exon') -> pd.DataFrame: feature: str = 'exon', region: str = ""
) -> pd.DataFrame:
""" """
:param df: A dataframe containing the frequency of each feature in a \ :param df: A dataframe containing the frequency of each feature in a \
community. community.
...@@ -206,7 +213,7 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path, ...@@ -206,7 +213,7 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path,
ft_id = get_ft_id(cnx, feature) ft_id = get_ft_id(cnx, feature)
list_ft = [cid for cid in ft_id list_ft = [cid for cid in ft_id
if cid not in df[f"id_{feature}"].to_list()] if cid not in df[f"id_{feature}"].to_list()]
df_nt = get_nt_frequency(cnx, list_ft, feature) df_nt = get_nt_frequency(cnx, list_ft, feature, region)
df_com = get_community_table([list_ft], size_threshold, feature) df_com = get_community_table([list_ft], size_threshold, feature)
df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}") df_nt = df_nt.merge(df_com, how="left", on=f"id_{feature}")
df_nt['community'] = ['C-CTRL'] * df_nt.shape[0] df_nt['community'] = ['C-CTRL'] * df_nt.shape[0]
...@@ -218,6 +225,7 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path, ...@@ -218,6 +225,7 @@ def create_ctrl_community(df: pd.DataFrame, outfile: Path,
def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
global_weight: int, same_gene: bool, global_weight: int, same_gene: bool,
nt: str, feature: str = "exon", nt: str, feature: str = "exon",
region: str = "",
) -> pd.Series: ) -> pd.Series:
""" """
Get data (proportion of `reg` regulated exons by a splicing factor Get data (proportion of `reg` regulated exons by a splicing factor
...@@ -235,6 +243,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, ...@@ -235,6 +243,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
same gene (True) or not (False) (default False) same gene (True) or not (False) (default False)
:param nt: The nucleotide of interest :param nt: The nucleotide of interest
:param feature: The king of feature analysed :param feature: The king of feature analysed
:param region: the region of interest to extract from gene
""" """
logging.debug(f"lmm for {project}, w:{weight}, " logging.debug(f"lmm for {project}, w:{weight}, "
f"g:{global_weight} nt: {nt}") f"g:{global_weight} nt: {nt}")
...@@ -254,7 +263,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int, ...@@ -254,7 +263,7 @@ def get_stat_nt_communities(df: pd.DataFrame, project: str, weight: int,
res['pval'] = pval res['pval'] = pval
nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt", nt_ctrl_table = noutfile.parent / noutfile.name.replace("_stat.txt",
"_ctrl.txt") "_ctrl.txt")
ndf = create_ctrl_community(df, nt_ctrl_table, feature) ndf = create_ctrl_community(df, nt_ctrl_table, feature, region)
sum_df = lmm_maker_summary(ndf, outfile, nt) sum_df = lmm_maker_summary(ndf, outfile, nt)
outfile_ctrl = ConfigGraph.get_community_file(project, weight, outfile_ctrl = ConfigGraph.get_community_file(project, weight,
global_weight, global_weight,
...@@ -298,7 +307,7 @@ def create_dataframe(project: str, weight: int, global_weight: int, ...@@ -298,7 +307,7 @@ def create_dataframe(project: str, weight: int, global_weight: int,
def multiple_nt_lmm_launcher(ps: int, weights: List[int], def multiple_nt_lmm_launcher(ps: int, weights: List[int],
global_weights: List[int], global_weights: List[int],
same_gene: bool, same_gene: bool,
feature: str = 'exon', feature: str = 'exon', region: str = '',
logging_level: str = "DISABLE"): logging_level: str = "DISABLE"):
""" """
Launch the statistical analysis for every Launch the statistical analysis for every
...@@ -312,6 +321,7 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int], ...@@ -312,6 +321,7 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
:param same_gene: Say if we consider as co-localised exon within the \ :param same_gene: Say if we consider as co-localised exon within the \
same gene same gene
:param feature: The kind of analysed feature :param feature: The kind of analysed feature
:param region: the region of interest to extract from gene
:param logging_level: Level of information to display :param logging_level: Level of information to display
""" """
ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True) ConfigGraph.community_folder.mkdir(exist_ok=True, parents=True)
...@@ -342,7 +352,8 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int], ...@@ -342,7 +352,8 @@ def multiple_nt_lmm_launcher(ps: int, weights: List[int],
True) True)
df.to_csv(nfile_table, sep="\t", index=False) df.to_csv(nfile_table, sep="\t", index=False)
dic_df[ckey] = df dic_df[ckey] = df
args = [df, project, weight, global_weight, same_gene, nt, feature] args = [df, project, weight, global_weight, same_gene, nt, feature,
region]
if ckey not in processes.keys(): if ckey not in processes.keys():
processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)] processes[ckey] = [pool.apply_async(get_stat_nt_communities, args)]
else: else:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment