Skip to content
Snippets Groups Projects
Select Git revision
  • 231fd9e6a361a8095f07f32ce8c42b69f9409f8f
  • master default protected
  • yjia01-master-patch-35664
  • dev
  • v0.4.0
  • v0.3.0
  • v0.2.9
  • v0.2.8
  • v0.2.7
  • v0.2.6
  • v0.1.0
  • v0.2.5
  • v0.2.4
  • v0.2.3
  • v0.2.2
  • v0.2.1
  • v0.2.0
  • v0.1.2
18 results

docker_init.sh

Blame
  • Forked from LBMC / nextflow
    Source project has a limited visibility.
    SyntheticDataset.py 5.27 KiB
    #!/usr/bin/env python3
    import random
    
    import pysam
    
    
    def generate_fake_genome(chrom, num_cycles):
        """Génère une séquence pour un chromosome sous forme d'alternance de 50 A et 'GATC'."""
        cycle = "A" * 150 + "GATC"  # cycle de 54 pb
        return cycle * num_cycles
    
    
    def write_fake_genome_fasta():
        """Écrit un fichier FASTA avec deux chromosomes synthétiques."""
        # Définition des cycles pour obtenir des longueurs proches de celles désirées
        chr1_cycles = 125  # 125 * 154 = 19250 pb
        chr2_cycles = 111  # 111 * 154 = 17094 pb
        chr1_seq = generate_fake_genome("chr1", chr1_cycles)
        chr2_seq = generate_fake_genome("chr2", chr2_cycles)
        fasta_filename = "fake_genome.fasta"
        with open(fasta_filename, "w") as f:
            f.write(">chr1\n")
            f.write(chr1_seq + "\n")
            f.write(">chr2\n")
            f.write(chr2_seq + "\n")
        # Retourne le nom du fichier et un dictionnaire des longueurs
        return fasta_filename, {"chr1": len(chr1_seq), "chr2": len(chr2_seq)}
    
    
    def create_synthetic_hic_bams(
        genome_fasta, chrom_lengths, num_pairs=30, read_length=25
    ):
        """
        Crée deux fichiers BAM (façon Hi‑C) : R1.bam pour les lectures forward
        et R2.bam pour les lectures reverse. On simule un certain nombre de paires.
        """
        # 1) Charger le génome
        genome = {}
        with open(genome_fasta, "r") as f:
            current_chrom = None
            seq_lines = []
            for line in f:
                line = line.strip()
                if line.startswith(">"):
                    if current_chrom is not None:
                        genome[current_chrom] = "".join(seq_lines)
                    current_chrom = line[1:]
                    seq_lines = []
                else:
                    seq_lines.append(line)
            if current_chrom is not None:
                genome[current_chrom] = "".join(seq_lines)
    
        # 2) Construire un objet AlignmentHeader
        header = {
            "HD": {"VN": "1.0"},
            "SQ": [{"LN": 6750, "SN": "chr1"}, {"LN": 5994, "SN": "chr2"}],
        }
    
        header_obj = pysam.AlignmentHeader.from_dict(header)
    
        # 3) Ouvrir deux fichiers BAM (un pour R1, un pour R2)
        bam_r1 = "synthetic_R1.bam"
        bam_r2 = "synthetic_R2.bam"
        with (
            pysam.AlignmentFile(bam_r1, "wb", header=header) as outf1,
            pysam.AlignmentFile(bam_r2, "wb", header=header) as outf2,
        ):
            for i in range(num_pairs):
                # Décider aléatoirement intrachromosomique ou interchromosomique
                if random.random() < 0.5:
                    # Intrachromosomique
                    chrom = random.choice(list(chrom_lengths.keys()))
                    pos1 = random.randint(
                        0, chrom_lengths[chrom] - read_length - 1
                    )
                    pos2 = random.randint(
                        0, chrom_lengths[chrom] - read_length - 1
                    )
                    chrom_mate = chrom
                else:
                    # Interchromosomique
                    chrom1, chrom2 = random.sample(list(chrom_lengths.keys()), 2)
                    pos1 = random.randint(
                        0, chrom_lengths[chrom1] - read_length - 1
                    )
                    pos2 = random.randint(
                        0, chrom_lengths[chrom2] - read_length - 1
                    )
                    chrom = chrom1
                    chrom_mate = chrom2
    
                # Extraire la séquence pour read1
                read1_seq = genome[chrom][pos1 : pos1 + read_length]
                # Extraire la séquence pour read2
                read2_seq = genome[chrom_mate][pos2 : pos2 + read_length]
    
                # Créer deux segments AlignedSegment
                segment1 = pysam.AlignedSegment(header_obj)
                segment2 = pysam.AlignedSegment(header_obj)
    
                # Nom de lecture identique
                segment1.query_name = f"read{i}:01"
                segment2.query_name = f"read{i}:01"
    
                if random.random() < 0.5:
                    segment1.flag = 0
                else:
                    segment1.flag = 0x16
    
                if random.random() < 0.5:
                    segment2.flag = 0
                else:
                    segment2.flag = 0x16
    
                # Attributs pour read1
                segment1.reference_name = chrom
                segment1.reference_start = pos1
                segment1.mapping_quality = 5
                segment1.cigarstring = f"{read_length}M"
    
                segment1.query_sequence = read1_seq
                segment1.query_qualities = pysam.qualitystring_to_array(
                    "I" * read_length
                )
    
                # Attributs pour read2
                segment2.reference_name = chrom_mate
                segment2.reference_start = pos2
                segment2.mapping_quality = 5
                segment2.cigarstring = f"{read_length}M"
    
                segment2.query_sequence = read2_seq
                segment2.query_qualities = pysam.qualitystring_to_array(
                    "I" * read_length
                )
    
                # Ecrire R1 dans le fichier R1, R2 dans le fichier R2
                outf1.write(segment1)
                outf2.write(segment2)
    
        return bam_r1, bam_r2
    
    
    if __name__ == "__main__":
        # Générer un faux génome
        chr_fasta, chr_lengths = write_fake_genome_fasta()
        # Créer deux fichiers R1.bam et R2.bam
        r1_file, r2_file = create_synthetic_hic_bams(
            chr_fasta, chr_lengths, num_pairs=30000000, read_length=25
        )
        print("Synthetic R1 file:", r1_file)
        print("Synthetic R2 file:", r2_file)