Select Git revision
SyntheticDataset.py
Forked from
LBMC / Hub / Microsplit
This fork has diverged from the upstream repository.
Bertache authored
Change name for testing new options : (Try to adapt dangling end counts with parasplit output which increase fakely the number of dangling !)
SyntheticDataset.py 5.27 KiB
#!/usr/bin/env python3
import random
import pysam
def generate_fake_genome(chrom, num_cycles):
"""Génère une séquence pour un chromosome sous forme d'alternance de 50 A et 'GATC'."""
cycle = "A" * 150 + "GATC" # cycle de 54 pb
return cycle * num_cycles
def write_fake_genome_fasta():
"""Écrit un fichier FASTA avec deux chromosomes synthétiques."""
# Définition des cycles pour obtenir des longueurs proches de celles désirées
chr1_cycles = 125 # 125 * 154 = 19250 pb
chr2_cycles = 111 # 111 * 154 = 17094 pb
chr1_seq = generate_fake_genome("chr1", chr1_cycles)
chr2_seq = generate_fake_genome("chr2", chr2_cycles)
fasta_filename = "fake_genome.fasta"
with open(fasta_filename, "w") as f:
f.write(">chr1\n")
f.write(chr1_seq + "\n")
f.write(">chr2\n")
f.write(chr2_seq + "\n")
# Retourne le nom du fichier et un dictionnaire des longueurs
return fasta_filename, {"chr1": len(chr1_seq), "chr2": len(chr2_seq)}
def create_synthetic_hic_bams(
genome_fasta, chrom_lengths, num_pairs=30, read_length=25
):
"""
Crée deux fichiers BAM (façon Hi‑C) : R1.bam pour les lectures forward
et R2.bam pour les lectures reverse. On simule un certain nombre de paires.
"""
# 1) Charger le génome
genome = {}
with open(genome_fasta, "r") as f:
current_chrom = None
seq_lines = []
for line in f:
line = line.strip()
if line.startswith(">"):
if current_chrom is not None:
genome[current_chrom] = "".join(seq_lines)
current_chrom = line[1:]
seq_lines = []
else:
seq_lines.append(line)
if current_chrom is not None:
genome[current_chrom] = "".join(seq_lines)
# 2) Construire un objet AlignmentHeader
header = {
"HD": {"VN": "1.0"},
"SQ": [{"LN": 6750, "SN": "chr1"}, {"LN": 5994, "SN": "chr2"}],
}
header_obj = pysam.AlignmentHeader.from_dict(header)
# 3) Ouvrir deux fichiers BAM (un pour R1, un pour R2)
bam_r1 = "synthetic_R1.bam"
bam_r2 = "synthetic_R2.bam"
with (
pysam.AlignmentFile(bam_r1, "wb", header=header) as outf1,
pysam.AlignmentFile(bam_r2, "wb", header=header) as outf2,
):
for i in range(num_pairs):
# Décider aléatoirement intrachromosomique ou interchromosomique
if random.random() < 0.5:
# Intrachromosomique
chrom = random.choice(list(chrom_lengths.keys()))
pos1 = random.randint(
0, chrom_lengths[chrom] - read_length - 1
)
pos2 = random.randint(
0, chrom_lengths[chrom] - read_length - 1
)
chrom_mate = chrom
else:
# Interchromosomique
chrom1, chrom2 = random.sample(list(chrom_lengths.keys()), 2)
pos1 = random.randint(
0, chrom_lengths[chrom1] - read_length - 1
)
pos2 = random.randint(
0, chrom_lengths[chrom2] - read_length - 1
)
chrom = chrom1
chrom_mate = chrom2
# Extraire la séquence pour read1
read1_seq = genome[chrom][pos1 : pos1 + read_length]
# Extraire la séquence pour read2
read2_seq = genome[chrom_mate][pos2 : pos2 + read_length]
# Créer deux segments AlignedSegment
segment1 = pysam.AlignedSegment(header_obj)
segment2 = pysam.AlignedSegment(header_obj)
# Nom de lecture identique
segment1.query_name = f"read{i}:01"
segment2.query_name = f"read{i}:01"
if random.random() < 0.5:
segment1.flag = 0
else:
segment1.flag = 0x16
if random.random() < 0.5:
segment2.flag = 0
else:
segment2.flag = 0x16
# Attributs pour read1
segment1.reference_name = chrom
segment1.reference_start = pos1
segment1.mapping_quality = 5
segment1.cigarstring = f"{read_length}M"
segment1.query_sequence = read1_seq
segment1.query_qualities = pysam.qualitystring_to_array(
"I" * read_length
)
# Attributs pour read2
segment2.reference_name = chrom_mate
segment2.reference_start = pos2
segment2.mapping_quality = 5
segment2.cigarstring = f"{read_length}M"
segment2.query_sequence = read2_seq
segment2.query_qualities = pysam.qualitystring_to_array(
"I" * read_length
)
# Ecrire R1 dans le fichier R1, R2 dans le fichier R2
outf1.write(segment1)
outf2.write(segment2)
return bam_r1, bam_r2
if __name__ == "__main__":
# Générer un faux génome
chr_fasta, chr_lengths = write_fake_genome_fasta()
# Créer deux fichiers R1.bam et R2.bam
r1_file, r2_file = create_synthetic_hic_bams(
chr_fasta, chr_lengths, num_pairs=30000000, read_length=25
)
print("Synthetic R1 file:", r1_file)
print("Synthetic R2 file:", r2_file)