From c1e020faeee7ee5ef0ae16cb3a4f405dcdb5005e Mon Sep 17 00:00:00 2001
From: Laurent Modolo <laurent.modolo@ens-lyon.fr>
Date: Mon, 13 Mar 2023 10:46:09 +0100
Subject: [PATCH] change OR formula to remove 1.0e9 scaling and replace it with
 custom scaling

---
 README.md            |  9 +++++----
 doc/presentation.qmd | 28 +++++++---------------------
 src/bed.rs           |  6 ++++--
 src/main.rs          |  6 ++++++
 src/pileup.rs        | 13 ++-----------
 src/stats.rs         | 18 +++++++++++-------
 src/utils.rs         | 19 +++++++++++++++----
 7 files changed, 50 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index 843dc31..f89db82 100644
--- a/README.md
+++ b/README.md
@@ -72,12 +72,12 @@ The `bamcalib` expect the following parameters:
 - $WCE_{c}\left(t\right)$ the coverage at position $t$ in the WCE sample on the calibration genome
 
 ```math
-\text{norm}IP\left(t\right) = IP_{x}\left(t\right) \frac{10^9 \sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}{\sum_{t^{'}=1}^{T_c}IP_{c}\left(t^{'}\right)\sum_{t^{'}=1}^{T_x}WCE_{x}\left(t^{'}\right)}
+\text{norm}IP\left(t\right) = IP_{x}\left(t\right) \frac{\frac{1}{T_c}\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}{\frac{1}{T_c}\sum_{t^{'}=1}^{T_c}IP_{c}\left(t^{'}\right)\frac{1}{T_x}\sum_{t^{'}=1}^{T_x}WCE_{x}\left(t^{'}\right)}
 ```
 
 In a reference genome of size $`T_x`$ (ignoring the chromosome segmentation), and in a calibration genome of size $`T_c`$.
 
-To be able to better analyze the coverage information at repetitive regions of the gnome, we propose to extend the previous normalization to normalize the signal nucleotide by nucleotide and work with the following quantity:
+To be able to better analyze the coverage information at repetitive regions of the genome, we propose to extend the previous normalization to normalize the signal nucleotide by nucleotide and work with the following quantity:
 
 ```math
 \text{ratio}IP\left(t\right) = \frac{\text{norm}IP\left(t\right)}{\text{norm}WCE\left(t\right)}
@@ -86,7 +86,7 @@ To be able to better analyze the coverage information at repetitive regions of t
 with:
 
 ```math
-\text{norm}WCE\left(t\right) = WCE_{x}\left(t\right) \frac{10^9}{\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}  \alpha
+\text{norm}WCE\left(t\right) = WCE_{x}\left(t\right) \frac{1}{\frac{1}{T_c}\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}  \alpha
 ```
 
 We then find $`\alpha`$ such that:
@@ -101,7 +101,8 @@ which gives
 \alpha = \frac{\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}{10^9} \frac{\sum_{t=1}^{T_x}\frac{IP_{x}\left(t\right)}{WCE_{x}\left(t\right)}}{\sum_{t=1}^{T_x}IP_{x}\left(t\right)}
 ```
 
-Not that all the computation is made at the nucleotide level and not at the read level as in [Hu et al.](https://doi.org/10.1093/nar/gkv670), this is also why we use $10^9$ as a scaling factor instead of $10^6$.
+Not that all the computation are scaled by the genome size and not at the read number as in [Hu et al.](https://doi.org/10.1093/nar/gkv670), this is also why we add a scaling factor (default to $10^3$).
+This scaling the $\text{ratio}IP\left(t\right)$ is multiplied by this scaling factor.
 
 With this method, we retain the interesting properties of [Hu et al.](https://doi.org/10.1093/nar/gkv670) normalization on the average read density between samples (i.e., we can compare two different samples in a quantitative way) and we account for the local bias of read density observed in the WCE samples (differential chromatin accessibility, repetition, low mappability region, etc.).
 
diff --git a/doc/presentation.qmd b/doc/presentation.qmd
index 6f74fa2..3e1c5ac 100644
--- a/doc/presentation.qmd
+++ b/doc/presentation.qmd
@@ -3,8 +3,10 @@ title: "Calibrated Chip-Seq with `bamcalib`"
 author: "laurent.modolo@ens-lyon.fr"
 format: 
     revealjs:
-        theme: default 
         transition: none
+        format:
+        highlight-style: monokai
+        theme: white
 ---
 
 ## Calibration data
@@ -27,23 +29,7 @@ The coverage differences can be due to:
 
 -   Differences in experiment efficiency
 
--   
-
-    ## Differences in amount of biological material
-
-## Normalization
-
-![](./img/IP_1.png){width="50%" height="50%"}
-
-Scaling the data by the calibration coverage:
-
--   Differences in experiment efficiency
-
-. . .
-
-How do we account for differences in the amount of biological material ?
-
-------------------------------------------------------------------------
+-   Differences in amount of biological material
 
 ## Normalization
 
@@ -87,7 +73,7 @@ Scaling the data by the scaled WCE coverage:
 :::
 
 ::: {.column width="50%"}
-We scale by the total coverage $\sum_{t=1}^{T}IP(t)$
+We scale by the average coverage $\frac{1}{T}\sum_{t=1}^{T}IP(t)$
 :::
 :::
 
@@ -105,7 +91,7 @@ With
 ## Normalization
 
 $$
-\text{norm}IP\left(t\right) = IP_{x}\left(t\right) \frac{10^9 \sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}{\sum_{t^{'}=1}^{T_c}IP_{c}\left(t^{'}\right)\sum_{t^{'}=1}^{T_x}WCE_{x}\left(t^{'}\right)}
+\text{norm}IP\left(t\right) = IP_{x}\left(t\right) \frac{ \frac{1}{T_c}\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}{\frac{1}{T_c}\sum_{t^{'}=1}^{T_c}IP_{c}\left(t^{'}\right)\frac{1}{T_x}\sum_{t^{'}=1}^{T_x}WCE_{x}\left(t^{'}\right)}
 $$
 We scale everything to $10^9$
 
@@ -131,7 +117,7 @@ We can remove the :
 -   Differences in experiment efficiency
 
 $$
-\text{norm}WCE\left(t\right) = WCE_{x}\left(t\right) \frac{10^9}{\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}  \alpha
+\text{norm}WCE\left(t\right) = WCE_{x}\left(t\right) \frac{1}{\frac{1}{T_c}\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}  \alpha
 $$
 
 How do we compute $\alpha$ to remove the :
diff --git a/src/bed.rs b/src/bed.rs
index cefd6ea..fa12b61 100644
--- a/src/bed.rs
+++ b/src/bed.rs
@@ -100,11 +100,12 @@ impl BedGraph {
         bam_ip: &str,
         bam_wce: &str,
         prefix: &str,
+        scaling: f64,
         thread_local: u16,
         no_bits: u16,
     ) {
         self.or_stats
-            .init(bam_ip, bam_wce, prefix, thread_local, no_bits);
+            .init(bam_ip, bam_wce, prefix, scaling, thread_local, no_bits);
 
         println!("OR: {}", self.or_stats.get_or());
         println!("alpha: {}", self.or_stats.get_alpha());
@@ -197,6 +198,7 @@ impl BedGraphWrapper {
         bam_ip: &str,
         bam_wce: &str,
         prefix: &str,
+        scaling: f64,
         thread_local: u16,
         no_bits: u16,
         file_ip: &str,
@@ -207,7 +209,7 @@ impl BedGraphWrapper {
             .create()
             .expect("Unable to create thread pool.");
         let mut bed = BedGraph::new();
-        bed.init(&bam_ip, &bam_wce, &prefix, thread_local, no_bits);
+        bed.init(&bam_ip, &bam_wce, &prefix, scaling, thread_local, no_bits);
         let bedwrapper = BedGraphWrapper::new(bed);
 
         let chrom_map = bedwrapper.bed.or_stats.get_chr().to_hashmap_filtered(false);
diff --git a/src/main.rs b/src/main.rs
index d0125f8..7a8ebfd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -36,6 +36,10 @@ struct Args {
     #[arg(short, long, default_value_t = String::from("calib_"))]
     cal_prefix: String,
 
+    /// multiply the OR by this value (must be the same across sample)
+    #[arg(short, long, default_value_t = 1.0e3)]
+    scaling: f64,
+
     /// remove reads which bit code match this code
     #[arg(short, long, default_value_t = 1540)]
     no_bits: u16,
@@ -52,6 +56,7 @@ fn main() {
     let bigwig_ip = matches.bigwig_ip;
     let bigwig_wce = matches.bigwig_wce;
     let cal_prefix = matches.cal_prefix;
+    let scaling = matches.scaling;
     let mut thread: u16 = matches.thread;
     if thread == 0 {
         thread = 1;
@@ -61,6 +66,7 @@ fn main() {
         &bam_ip,
         &bam_wce,
         &cal_prefix,
+        scaling,
         thread,
         no_bits,
         &bigwig_ip,
diff --git a/src/pileup.rs b/src/pileup.rs
index f42f035..31d7d2b 100644
--- a/src/pileup.rs
+++ b/src/pileup.rs
@@ -269,7 +269,7 @@ pub fn compute_depth(
     prefix: &str,
     thread_local: u16,
     no_bits: u16,
-) -> (Depth, f64, f64, Chromosomes) {
+) -> (Depth, u64, u64, Chromosomes) {
     let (multimap, chr, fragment_len) =
         MultiMappingReads::init(file, prefix, thread_local, no_bits);
     let mut reader = open_bam(file, thread_local);
@@ -278,8 +278,6 @@ pub fn compute_depth(
     depth.init(&chr);
     let mut ref_count: u64 = 0;
     let mut calib_count: u64 = 0;
-    let mut ref_size: u64 = 0;
-    let mut calib_size: u64 = 0;
     let mut ref_depth = ColumnDepth::new(fragment_len);
     let mut calib_depth = ColumnDepth::new(fragment_len);
     for column in pileup {
@@ -289,10 +287,8 @@ pub fn compute_depth(
                 column.ref_pos(),
                 ref_depth.depth(&column) as u64,
             );
-            ref_size += 1;
         } else {
             calib_count += calib_depth.depth(&column) as u64;
-            calib_size += 1;
         }
     }
     if ref_count == 0 {
@@ -301,10 +297,5 @@ pub fn compute_depth(
     if calib_count == 0 {
         panic!("no read detected in the calibration genome (maybe the no_bits is filtering too stringent)")
     }
-    return (
-        depth,
-        (ref_count as f64 / ref_size as f64),
-        (calib_count as f64 / calib_size as f64),
-        chr,
-    );
+    return (depth, ref_count, calib_count, chr);
 }
diff --git a/src/stats.rs b/src/stats.rs
index 5d9973e..4b08093 100644
--- a/src/stats.rs
+++ b/src/stats.rs
@@ -10,13 +10,13 @@ use std::thread;
 /// Store genome statistics necessary for normalization
 #[derive(Debug, Copy, Clone)]
 struct GenomeStats {
-    read_count: f64,
+    read_count: u64,
 }
 
 impl GenomeStats {
     /// Create a new GenomeStats with read_count set to 0
     pub fn new() -> GenomeStats {
-        GenomeStats { read_count: 0.0 }
+        GenomeStats { read_count: 0 }
     }
 }
 
@@ -87,6 +87,7 @@ impl ORStats {
         bam_ip: &str,
         bam_wce: &str,
         prefix: &str,
+        scaling: f64,
         thread_local: u16,
         no_bits: u16,
     ) {
@@ -95,7 +96,7 @@ impl ORStats {
             Some(pos) => pos,
             None => panic!("init(): no valid position in the reference genome"),
         };
-        self.compute_or();
+        self.compute_or(scaling);
         self.compute_alpha();
     }
 
@@ -166,9 +167,11 @@ impl ORStats {
     }
 
     /// Function to compute the OR from the results of the compute_depth() function
-    pub fn compute_or(&mut self) {
-        self.or = (self.wce.calibration.read_count as f64)
-            / ((self.ip.calibration.read_count as f64) * (self.wce.reference.read_count as f64));
+    pub fn compute_or(&mut self, scaling: f64) {
+        let ref_size = self.ip.chromosomes.get_genomesize(true) as f64;
+        self.or = scaling * (self.wce.calibration.read_count as f64)
+            / ((self.ip.calibration.read_count as f64)
+                * (self.wce.reference.read_count as f64 / ref_size));
     }
 
     /// Function to compute the alpha from the results of the compute_depth() function
@@ -197,8 +200,9 @@ impl ORStats {
                 self.alpha += (ip_depth as f64) / (wce_depth as f64);
             }
         }
+        let calib_size = self.ip.chromosomes.get_genomesize(false) as f64;
         self.alpha = (self.alpha / (self.ip.reference.read_count as f64))
-            * (self.wce.calibration.read_count as f64);
+            * (self.wce.calibration.read_count as f64 / calib_size);
     }
 
     /// Return the OR value to normalize the IP coverage
diff --git a/src/utils.rs b/src/utils.rs
index 993f391..a2e6df1 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -117,15 +117,26 @@ impl Chromosomes {
     }
 
     /// return the length of a genome
-    /// # Arguments
-    /// - ip: bool the size of the reference genome (if true) or reference genome (if false)
     /// # Return
-    /// usize the sum of the chromosome lengths
-
+    /// &Vec<u32> the chromosome lengths
     pub fn get_lens(&self) -> &Vec<u32> {
         &self.len
     }
 
+    /// return the sizee of a genome
+    /// - calib: bool
+    /// # Return
+    /// u32 the sum of the chromosome lengths
+    pub fn get_genomesize(&self, calib: bool) -> u32 {
+        let mut genomesize = 0;
+        for (chr_check, chr_len) in self.calib.iter().zip(self.len.iter()) {
+            if *chr_check == calib {
+                genomesize += chr_len;
+            }
+        }
+        genomesize
+    }
+
     /// Convert the Chromosomes struct to an HashMap<String, u32>
     /// # Return
     /// HashMap<String, u32>
-- 
GitLab