From 29b46a8726fdd5ce09b8a7a29f15e5a2308037ea Mon Sep 17 00:00:00 2001
From: Bertache <samir.bertache@ens-lyon.fr>
Date: Fri, 25 Apr 2025 16:35:27 +0200
Subject: [PATCH] Add clean_parasplit_label

Do not take dangling ends into account for input reads that were split by Parasplit.
---
 src/hiclassifier/processing.py | 104 ++++++++++++++++++++++++++++-----
 1 file changed, 91 insertions(+), 13 deletions(-)

diff --git a/src/hiclassifier/processing.py b/src/hiclassifier/processing.py
index f3b9d35..90fca09 100644
--- a/src/hiclassifier/processing.py
+++ b/src/hiclassifier/processing.py
@@ -1,3 +1,11 @@
+# import logging
+# import multiprocessing
+
+# logger = multiprocessing.get_logger()
+# logger.setLevel(logging.DEBUG)
+from typing import Pattern
+
+
 def get_label(current_pair, dict_digested, tolerance):
     """
     Annotation spécifique Intra et Interchromosomique
@@ -88,6 +96,53 @@ def qualitad(current_pair, mapq_score_max):
     return False
 
 
+def clean_parasplit_label(
+    label: str,
+    query_name: str,
+    pair_suffix_re: Pattern[str],
+) -> str:
+    """
+    Remplace « DL » par « None » dans *label* lorsque *query_name* se termine
+    par un suffixe « :ij ».
+
+    En effet, Parasplit, comme Cutsite, est un outil qui coupe les lectures selon
+    la présence de sites de LIGATION ! Il est donc normal de trouver un nombre
+    accru de Dangling ends. Cette fonction a pour but de corriger cela (en partie).
+
+    Examples
+    --------
+    >>> clean_parasplit_label("DL-border", "read:01")
+    'None-border'
+    >>> clean_parasplit_label("read-DL", "read:03")
+    'read-None'
+    >>> clean_parasplit_label("read-border", "read:03")
+    'read-border'
+    >>> clean_parasplit_label("DL-border", "read")
+    'DL-border'
+    """
+    # 1) Vérifier le suffixe « :ij » dans le query_name
+    if pair_suffix_re.search(query_name) is None:
+        return label
+
+    # 2) Vérifier que label est une str contenant « DL »
+    if not (isinstance(label, str) and "DL" in label):
+        return label
+
+    # 3) Séparer le label
+    try:
+        read_part, border_part = label.split("-", maxsplit=1)
+    except ValueError:  # format inattendu
+        return label
+
+    # 4) Remplacer « DL » par « None » dans chaque partie
+    if "DL" in read_part:
+        read_part = "None"
+    if "DL" in border_part:
+        border_part = "None"
+
+    return f"{read_part}-{border_part}"
+
+
 def process_items(
     input_queue,
     output_valide_queue,
@@ -112,6 +167,8 @@ def process_items(
         output_queue (Queue): Queue to put processed read pairs.
     """
 
+    import re
+
     from .auxiliary import check_data, pair_complete, terminaison
     from .event_collector import (
         update_restriction_counts,
@@ -119,6 +176,10 @@ def process_items(
     )
     from .readmanager import PairedRead
 
+    _DEFAULT_PAIR_SUFFIX_RE = re.compile(
+        r":(\d{2})$"
+    )  # cf Parasplit (gitbio.ens-lyon.fr)
+
     loop_count = 0
 
     # Création des dictionnaire de comptage du processus courant :
@@ -135,6 +196,17 @@ def process_items(
         while True:
             loop_count += 1
             data = input_queue.get()
+
+            # if loop_count % 10 == 0:
+            #     logger.warning(
+            #         "loop=%d\tin=%s\tval=%s\toth=%s\tdel=%s",
+            #         loop_count,
+            #         safe_qsize(input_queue),
+            #         safe_qsize(output_valide_queue),
+            #         safe_qsize(output_other_queue),
+            #         safe_qsize(output_del_queue),
+            #     )
+
             label = False
 
             if data is None:
@@ -179,21 +251,26 @@ def process_items(
                     undigested, Counts_Undigested_Site
                 )
 
-            if label and ("Other" not in label):
-                (
-                    Counts_Biotin_Intra,
-                    Counts_Dangling_Intra,
-                    Counts_Biotin_Inter,
-                    Counts_Dangling_Inter,
-                ) = update_restriction_counts(
-                    paired_read=current_pair,
-                    label=label,
-                    counts_vl_intra=Counts_Biotin_Intra,
-                    counts_dl_intra=Counts_Dangling_Intra,
-                    counts_vl_inter=Counts_Biotin_Inter,
-                    counts_dl_inter=Counts_Dangling_Inter,
+            if label:
+                label = clean_parasplit_label(
+                    label, current_pair.query_name_1, _DEFAULT_PAIR_SUFFIX_RE
                 )
 
+                if isinstance(label, str) and ("Other" not in label):
+                    (
+                        Counts_Biotin_Intra,
+                        Counts_Dangling_Intra,
+                        Counts_Biotin_Inter,
+                        Counts_Dangling_Inter,
+                    ) = update_restriction_counts(
+                        paired_read=current_pair,
+                        label=label,
+                        counts_vl_intra=Counts_Biotin_Intra,
+                        counts_dl_intra=Counts_Dangling_Intra,
+                        counts_vl_inter=Counts_Biotin_Inter,
+                        counts_dl_inter=Counts_Dangling_Inter,
+                    )
+
             # # Send in queue
             sending_to_queue(
                 data,
@@ -202,6 +279,7 @@ def process_items(
                 output_other_queue=output_other_queue,
                 output_del_queue=output_del_queue,
             )
+
     finally:
         terminaison(
             output_valide_queue=output_valide_queue,
-- 
GitLab