From 1f8d5beff192539ee4d4052212276537df1a3bac Mon Sep 17 00:00:00 2001
From: Fontrodona Nicolas <nicolas.fontrodona@ens-lyon.fr>
Date: Wed, 10 Jun 2020 14:29:00 +0200
Subject: [PATCH] src/nt_composition/get_projects_interaction.py: This script
 creates a figure that shows the number of co-localisations in ChIA-PET
 projects, and selects some projects with different numbers of interactions

---
 .../get_projects_interaction.py               | 93 +++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 src/nt_composition/get_projects_interaction.py

diff --git a/src/nt_composition/get_projects_interaction.py b/src/nt_composition/get_projects_interaction.py
new file mode 100644
index 00000000..d2aa5cde
--- /dev/null
+++ b/src/nt_composition/get_projects_interaction.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+
+# -*- coding: UTF-8 -*-
+
+"""
+Description: The goal of this script is to get the total \
+number of interactions by project and to select 9 projects with the \
+following requirements:
+
+* 3 projects must be those with the fewest interactions.
+* 3 projects must be those with the greatest number of interactions.
+* 3 projects must contain an average number of interactions.
+"""
+
+import sqlite3
+import pandas as pd
+from .config import ConfigNt
+import seaborn as sns
+import matplotlib.pyplot as plt
+from ..logging_conf import logging_def
+import logging
+
+
+def get_interaction_by_project(cnx: sqlite3.Connection) -> pd.DataFrame:
+    """
+    Count the rows of ``cin_exon_interaction`` for each ``id_project``.
+
+    :param cnx: Connection to chia-pet database
+    :return: ``projects``/``interaction_count`` table, ascending by count
+    """
+    logging.debug('Getting interaction from database')
+    sql = ("SELECT id_project, COUNT(*) "
+           "FROM cin_exon_interaction "
+           "GROUP BY id_project")
+    counts = pd.read_sql_query(sql, cnx)
+    counts.columns = ['projects', 'interaction_count']
+    counts = counts.sort_values('interaction_count', ascending=True)
+    logging.debug(counts.head())
+    return counts
+
+
+def make_barplot(df: pd.DataFrame):
+    """
+    Save a barplot of the interaction count of every project as a pdf \
+    named after ``ConfigNt.interaction_file``.
+
+    :param df: Dataframe with columns ``projects`` and ``interaction_count``
+    """
+    logging.debug("Creating barplot figure")
+    ConfigNt.interaction.mkdir(parents=True, exist_ok=True)
+    # with_suffix swaps only the extension, never a 'txt' inside the stem.
+    pdf_file = ConfigNt.interaction_file.with_suffix('.pdf')
+    sns.set()
+    sns.set_context('talk')
+    plt.figure(figsize=(20, 12))
+    sns.barplot(x="projects", y="interaction_count", data=df)
+    plt.xticks(rotation=90)
+    plt.savefig(pdf_file, bbox_inches='tight')
+    plt.close()
+
+
+def select_projects(df: pd.DataFrame):
+    """
+    Write to ``ConfigNt.selected_project`` up to 2 new projects per threshold.
+
+    :param df: ``projects`` table, expected sorted by ``interaction_count``
+    """
+    logging.debug("Selecting projects")
+    sp = []
+    for mini in (2000, 30000, 100000, 400000):
+        found = df.loc[df['interaction_count'] > mini, 'projects'].astype(str)
+        # Keep the first two projects not already taken by a lower cut-off.
+        sp += [p for p in found if p not in sp][:2]
+    with ConfigNt.selected_project.open('w') as outf:
+        outf.write("\n".join(sp) + "\n")
+
+
+def get_interactions_number(logging_level: str = "DISABLE"):
+    """Count, plot, save and select the interactions of every project."""
+    # logging_level is forwarded as-is to logging_def.
+    logging_def(ConfigNt.interaction, __file__, logging_level)
+    cnx = sqlite3.connect(ConfigNt.db_file)
+    try:
+        df = get_interaction_by_project(cnx)
+    finally:
+        cnx.close()
+    make_barplot(df)
+    df.to_csv(ConfigNt.interaction_file, sep="\t", index=False)
+    select_projects(df)
+
+
+if __name__ == "__main__":  # needs ``python -m``: the imports are relative
+    get_interactions_number()
-- 
GitLab