From d1232a66cd7019cf2bcb4ab3f64db35d0c98eb37 Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Fri, 19 Jun 2020 14:38:02 +0200
Subject: [PATCH] src/nt_composition/get_projects_interaction.py: add a new
 parameter, same_gene, to choose whether or not to keep the co-localised exons
 within the same gene, and modify the get_interaction_by_project function to
 use this new parameter

---
 .../get_projects_interaction.py               | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/nt_composition/get_projects_interaction.py b/src/nt_composition/get_projects_interaction.py
index 8a485638..f9dbc9df 100644
--- a/src/nt_composition/get_projects_interaction.py
+++ b/src/nt_composition/get_projects_interaction.py
@@ -21,20 +21,31 @@ from ..logging_conf import logging_def
 import logging
 
 
-def get_interaction_by_project(cnx: sqlite3.Connection, weight: int
-                               ) -> pd.DataFrame:
+def get_interaction_by_project(cnx: sqlite3.Connection, weight: int,
+                               same_gene: bool) -> pd.DataFrame:
     """
     Get the number of interactions by projects.
 
     :param cnx: Connection to chia-pet database
     :param weight: A weight threshold
+    :param same_gene: If True, interactions between exons of the same gene \
+    are counted too; if False, only those between different genes are kept
     :return: The table containing the number of interaction by projects
     """
     logging.debug('Getting interaction from database')
-    query = f"SELECT id_project, COUNT(*) " \
-            f"FROM cin_exon_interaction " \
-            f"WHERE weight >= {weight} " \
-            f"GROUP BY id_project"
+    if same_gene:
+        query = f"SELECT id_project, COUNT(*) " \
+                f"FROM cin_exon_interaction " \
+                f"WHERE weight >= {weight} " \
+                f"GROUP BY id_project"
+    else:
+        query = f"""SELECT id_project, COUNT(*)
+                    FROM cin_exon_interaction t1, cin_exon t2, cin_exon t3
+                    WHERE t1.weight >= {weight}
+                    AND t1.exon1 = t2.id
+                    AND t1.exon2 = t3.id
+                    AND t2.id_gene != t3.id_gene
+                    GROUP BY id_project"""
     df = pd.read_sql_query(query, cnx)
     df.columns = ['projects', 'interaction_count']
     df.sort_values('interaction_count', ascending=True, inplace=True)
@@ -79,16 +90,19 @@ def select_projects(df: pd.DataFrame):
         outf.write("\n".join(sp) + "\n")
 
 
-def get_interactions_number(weight: int = 1, logging_level: str = "DISABLE"):
+def get_interactions_number(weight: int = 1, same_gene: bool = False,
+                            logging_level: str = "DISABLE"):
     """
     Get the number of interaction by projects
 
     :param weight: The minimum weight of correlation to consider them
+    :param same_gene: If True, interactions between exons of the same gene \
+    are counted too; if False, only those between different genes are kept
     """
     logging_def(ConfigNt.interaction, __file__, logging_level)
     logging.info(f'Recovering interaction count with a weight of {weight}')
     cnx = sqlite3.connect(ConfigNt.db_file)
-    df = get_interaction_by_project(cnx, weight)
+    df = get_interaction_by_project(cnx, weight, same_gene)
     make_barplot(df, weight)
     df.to_csv(ConfigNt.get_interaction_file(weight),
               sep="\t", index=False)
-- 
GitLab