Skip to content
Snippets Groups Projects
Commit 29b46a87 authored by Bertache's avatar Bertache
Browse files

Add clean_parasplit_label

Don't take dangling ends into account for reads that were split by Parasplit!
parent 011c5bb3
No related branches found
No related tags found
No related merge requests found
# import logging
# import multiprocessing
# logger = multiprocessing.get_logger()
# logger.setLevel(logging.DEBUG)
import re
from typing import Optional, Pattern
def get_label(current_pair, dict_digested, tolerance):
"""
Annotation spécifique Intra et Interchromosomique
......@@ -88,6 +96,53 @@ def qualitad(current_pair, mapq_score_max):
return False
# Default suffix looked for at the end of a read name: a colon followed by
# exactly two digits (e.g. "read:01"), the form used for Parasplit fragments.
_PARASPLIT_SUFFIX_RE = re.compile(r":(\d{2})$")


def clean_parasplit_label(
    label: str,
    query_name: str,
    pair_suffix_re: Optional[Pattern[str]] = None,
) -> str:
    """Neutralise dangling-end ("DL") tags for reads that were split upstream.

    Parasplit, like Cutsite, splits reads according to the presence of
    ligation sites, so an inflated number of dangling ends is expected for
    its fragments. This function (partially) corrects for that: when
    *query_name* ends with a ``:ij`` suffix, every half of *label* that
    contains ``"DL"`` is replaced by the string ``"None"``.

    Parameters
    ----------
    label
        Pair label of the form ``"<read>-<border>"``. Anything else (a
        non-string value, a string without ``"DL"``, or a string without
        ``"-"``) is returned unchanged.
    query_name
        Name of the read; only its trailing suffix is inspected.
    pair_suffix_re
        Compiled pattern identifying a split read name. Defaults to a colon
        followed by two digits at the end of the name.

    Returns
    -------
    str
        The cleaned label, or *label* itself when nothing had to change.

    Examples
    --------
    >>> clean_parasplit_label("DL-border", "read:01")
    'None-border'
    >>> clean_parasplit_label("read-DL", "read:03")
    'read-None'
    >>> clean_parasplit_label("read-border", "read:03")
    'read-border'
    >>> clean_parasplit_label("DL-border", "read")
    'DL-border'
    """
    if pair_suffix_re is None:
        pair_suffix_re = _PARASPLIT_SUFFIX_RE

    # 1) Only reads carrying the split suffix are corrected.
    if pair_suffix_re.search(query_name) is None:
        return label

    # 2) Nothing to do unless the label is a string mentioning "DL"
    #    (non-string labels are passed through untouched).
    if not isinstance(label, str) or "DL" not in label:
        return label

    # 3) Split on the first "-" only; a label without separator has an
    #    unexpected format and is left as is.
    read_part, separator, border_part = label.partition("-")
    if not separator:
        return label

    # 4) A half that mentions "DL" is replaced entirely by "None".
    if "DL" in read_part:
        read_part = "None"
    if "DL" in border_part:
        border_part = "None"
    return f"{read_part}-{border_part}"
def process_items(
input_queue,
output_valide_queue,
......@@ -112,6 +167,8 @@ def process_items(
output_queue (Queue): Queue to put processed read pairs.
"""
import re
from .auxiliary import check_data, pair_complete, terminaison
from .event_collector import (
update_restriction_counts,
......@@ -119,6 +176,10 @@ def process_items(
)
from .readmanager import PairedRead
_DEFAULT_PAIR_SUFFIX_RE = re.compile(
r":(\d{2})$"
) # cf Parasplit (gitbio.ens-lyon.fr)
loop_count = 0
# Création des dictionnaire de comptage du processus courant :
......@@ -135,6 +196,17 @@ def process_items(
while True:
loop_count += 1
data = input_queue.get()
# if loop_count % 10 == 0:
# logger.warning(
# "loop=%d\tin=%s\tval=%s\toth=%s\tdel=%s",
# loop_count,
# safe_qsize(input_queue),
# safe_qsize(output_valide_queue),
# safe_qsize(output_other_queue),
# safe_qsize(output_del_queue),
# )
label = False
if data is None:
......@@ -179,7 +251,12 @@ def process_items(
undigested, Counts_Undigested_Site
)
if label and ("Other" not in label):
if label:
label = clean_parasplit_label(
label, current_pair.query_name_1, _DEFAULT_PAIR_SUFFIX_RE
)
if isinstance(label, str) and ("Other" not in label):
(
Counts_Biotin_Intra,
Counts_Dangling_Intra,
......@@ -202,6 +279,7 @@ def process_items(
output_other_queue=output_other_queue,
output_del_queue=output_del_queue,
)
finally:
terminaison(
output_valide_queue=output_valide_queue,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment