diff --git a/README.md b/README.md index 843dc3192fee2247ed2db788ef20239aeb38bdb8..f89db8261851fa36fa67a3362fc7c85f208a680e 100644 --- a/README.md +++ b/README.md @@ -72,12 +72,12 @@ The `bamcalib` expect the following parameters: - $WCE_{c}\left(t\right)$ the coverage at position $t$ in the WCE sample on the calibration genome ```math -\text{norm}IP\left(t\right) = IP_{x}\left(t\right) \frac{10^9 \sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}{\sum_{t^{'}=1}^{T_c}IP_{c}\left(t^{'}\right)\sum_{t^{'}=1}^{T_x}WCE_{x}\left(t^{'}\right)} +\text{norm}IP\left(t\right) = IP_{x}\left(t\right) \frac{\frac{1}{T_c}\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}{\frac{1}{T_c}\sum_{t^{'}=1}^{T_c}IP_{c}\left(t^{'}\right)\frac{1}{T_x}\sum_{t^{'}=1}^{T_x}WCE_{x}\left(t^{'}\right)} ``` In a reference genome of size $`T_x`$ (ignoring the chromosome segmentation), and in a calibration genome of size $`T_c`$. -To be able to better analyze the coverage information at repetitive regions of the gnome, we propose to extend the previous normalization to normalize the signal nucleotide by nucleotide and work with the following quantity: +To be able to better analyze the coverage information at repetitive regions of the genome, we propose to extend the previous normalization to normalize the signal nucleotide by nucleotide and work with the following quantity: ```math \text{ratio}IP\left(t\right) = \frac{\text{norm}IP\left(t\right)}{\text{norm}WCE\left(t\right)} @@ -86,7 +86,7 @@ To be able to better analyze the coverage information at repetitive regions of t with: ```math -\text{norm}WCE\left(t\right) = WCE_{x}\left(t\right) \frac{10^9}{\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)} \alpha +\text{norm}WCE\left(t\right) = WCE_{x}\left(t\right) \frac{1}{\frac{1}{T_c}\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)} \alpha ``` We then find $`\alpha`$ such that: @@ -101,7 +101,8 @@ which gives \alpha = \frac{\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}{10^9} 
\frac{\sum_{t=1}^{T_x}\frac{IP_{x}\left(t\right)}{WCE_{x}\left(t\right)}}{\sum_{t=1}^{T_x}IP_{x}\left(t\right)} ``` -Not that all the computation is made at the nucleotide level and not at the read level as in [Hu et al.](https://doi.org/10.1093/nar/gkv670), this is also why we use $10^9$ as a scaling factor instead of $10^6$. +Note that all the computations are scaled by the genome size and not by the read number as in [Hu et al.](https://doi.org/10.1093/nar/gkv670), this is also why we add a scaling factor (default to $10^3$). +The $\text{ratio}IP\left(t\right)$ is multiplied by this scaling factor. With this method, we retain the interesting properties of [Hu et al.](https://doi.org/10.1093/nar/gkv670) normalization on the average read density between samples (i.e., we can compare two different samples in a quantitative way) and we account for the local bias of read density observed in the WCE samples (differential chromatin accessibility, repetition, low mappability region, etc.). diff --git a/doc/presentation.qmd b/doc/presentation.qmd index 6f74fa2337d6524c653daed9878551a980fcec05..3e1c5acb9997e92cec35ad84c66c0254d32f72b3 100644 --- a/doc/presentation.qmd +++ b/doc/presentation.qmd @@ -3,8 +3,10 @@ title: "Calibrated Chip-Seq with `bamcalib`" author: "laurent.modolo@ens-lyon.fr" format: revealjs: - theme: default transition: none + format: + highlight-style: monokai + theme: white --- ## Calibration data @@ -27,23 +29,7 @@ The coverage differences can be due to: - Differences in experiment efficiency -- - - ## Differences in amount of biological material - -## Normalization - -{width="50%" height="50%"} - -Scaling the data by the calibration coverage: - -- Differences in experiment efficiency - -. . . - -How do we account for differences in the amount of biological material ?
- ------------------------------------------------------------------------- +- Differences in amount of biological material ## Normalization @@ -87,7 +73,7 @@ Scaling the data by the scaled WCE coverage: ::: ::: {.column width="50%"} -We scale by the total coverage $\sum_{t=1}^{T}IP(t)$ +We scale by the average coverage $\frac{1}{T}\sum_{t=1}^{T}IP(t)$ ::: ::: @@ -105,7 +91,7 @@ With ## Normalization $$ -\text{norm}IP\left(t\right) = IP_{x}\left(t\right) \frac{10^9 \sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}{\sum_{t^{'}=1}^{T_c}IP_{c}\left(t^{'}\right)\sum_{t^{'}=1}^{T_x}WCE_{x}\left(t^{'}\right)} +\text{norm}IP\left(t\right) = IP_{x}\left(t\right) \frac{ \frac{1}{T_c}\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)}{\frac{1}{T_c}\sum_{t^{'}=1}^{T_c}IP_{c}\left(t^{'}\right)\frac{1}{T_x}\sum_{t^{'}=1}^{T_x}WCE_{x}\left(t^{'}\right)} $$ We scale everything to $10^9$ @@ -131,7 +117,7 @@ We can remove the : - Differences in experiment efficiency $$ -\text{norm}WCE\left(t\right) = WCE_{x}\left(t\right) \frac{10^9}{\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)} \alpha +\text{norm}WCE\left(t\right) = WCE_{x}\left(t\right) \frac{1}{\frac{1}{T_c}\sum_{t^{'}=1}^{T_c}WCE_{c}\left(t^{'}\right)} \alpha $$ How do we compute $\alpha$ to remove the : diff --git a/src/bed.rs b/src/bed.rs index cefd6ea9973f161f43ddb92a0e6fc20beda8b186..fa12b61dc292d10f0efd986580926e42bea17392 100644 --- a/src/bed.rs +++ b/src/bed.rs @@ -100,11 +100,12 @@ impl BedGraph { bam_ip: &str, bam_wce: &str, prefix: &str, + scaling: f64, thread_local: u16, no_bits: u16, ) { self.or_stats - .init(bam_ip, bam_wce, prefix, thread_local, no_bits); + .init(bam_ip, bam_wce, prefix, scaling, thread_local, no_bits); println!("OR: {}", self.or_stats.get_or()); println!("alpha: {}", self.or_stats.get_alpha()); @@ -197,6 +198,7 @@ impl BedGraphWrapper { bam_ip: &str, bam_wce: &str, prefix: &str, + scaling: f64, thread_local: u16, no_bits: u16, file_ip: &str, @@ -207,7 +209,7 @@ impl BedGraphWrapper { .create() 
.expect("Unable to create thread pool."); let mut bed = BedGraph::new(); - bed.init(&bam_ip, &bam_wce, &prefix, thread_local, no_bits); + bed.init(&bam_ip, &bam_wce, &prefix, scaling, thread_local, no_bits); let bedwrapper = BedGraphWrapper::new(bed); let chrom_map = bedwrapper.bed.or_stats.get_chr().to_hashmap_filtered(false); diff --git a/src/main.rs b/src/main.rs index d0125f8e365a2dd86b6e0e07f121d8a074c54bb3..7a8ebfd67052c55f6ab09804a3e6e62f2ebc745e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -36,6 +36,10 @@ struct Args { #[arg(short, long, default_value_t = String::from("calib_"))] cal_prefix: String, + /// multiply the OR by this value (must be the same across samples) + #[arg(short, long, default_value_t = 1.0e3)] + scaling: f64, + /// remove reads which bit code match this code #[arg(short, long, default_value_t = 1540)] no_bits: u16, @@ -52,6 +56,7 @@ fn main() { let bigwig_ip = matches.bigwig_ip; let bigwig_wce = matches.bigwig_wce; let cal_prefix = matches.cal_prefix; + let scaling = matches.scaling; let mut thread: u16 = matches.thread; if thread == 0 { thread = 1; } @@ -61,6 +66,7 @@ &bam_ip, &bam_wce, &cal_prefix, + scaling, thread, no_bits, &bigwig_ip, diff --git a/src/pileup.rs b/src/pileup.rs index f42f035f12aa2a27bb8b2ad7e86ffb1276721eef..31d7d2b9fc2618dd6e80072f1a5e3ffdfe5394c6 100644 --- a/src/pileup.rs +++ b/src/pileup.rs @@ -269,7 +269,7 @@ pub fn compute_depth( prefix: &str, thread_local: u16, no_bits: u16, -) -> (Depth, f64, f64, Chromosomes) { +) -> (Depth, u64, u64, Chromosomes) { let (multimap, chr, fragment_len) = MultiMappingReads::init(file, prefix, thread_local, no_bits); let mut reader = open_bam(file, thread_local); @@ -278,8 +278,6 @@ pub fn compute_depth( depth.init(&chr); let mut ref_count: u64 = 0; let mut calib_count: u64 = 0; - let mut ref_size: u64 = 0; - let mut calib_size: u64 = 0; let mut ref_depth = ColumnDepth::new(fragment_len); let mut calib_depth = ColumnDepth::new(fragment_len); for column in pileup { @@
-289,10 +287,8 @@ pub fn compute_depth( column.ref_pos(), ref_depth.depth(&column) as u64, ); - ref_size += 1; } else { calib_count += calib_depth.depth(&column) as u64; - calib_size += 1; } } if ref_count == 0 { @@ -301,10 +297,5 @@ pub fn compute_depth( if calib_count == 0 { panic!("no read detected in the calibration genome (maybe the no_bits is filtering too stringent)") } - return ( - depth, - (ref_count as f64 / ref_size as f64), - (calib_count as f64 / calib_size as f64), - chr, - ); + return (depth, ref_count, calib_count, chr); } diff --git a/src/stats.rs b/src/stats.rs index 5d9973ea67681cc6ba0a332b5f0fc73de3383700..4b08093c0c778eb223702500c6b9e24de973de80 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -10,13 +10,13 @@ use std::thread; /// Store genome statistics necessary for normalization #[derive(Debug, Copy, Clone)] struct GenomeStats { - read_count: f64, + read_count: u64, } impl GenomeStats { /// Create a new GenomeStats with read_count set to 0 pub fn new() -> GenomeStats { - GenomeStats { read_count: 0.0 } + GenomeStats { read_count: 0 } } } @@ -87,6 +87,7 @@ impl ORStats { bam_ip: &str, bam_wce: &str, prefix: &str, + scaling: f64, thread_local: u16, no_bits: u16, ) { @@ -95,7 +96,7 @@ impl ORStats { Some(pos) => pos, None => panic!("init(): no valid position in the reference genome"), }; - self.compute_or(); + self.compute_or(scaling); self.compute_alpha(); } @@ -166,9 +167,11 @@ impl ORStats { } /// Function to compute the OR from the results of the compute_depth() function - pub fn compute_or(&mut self) { - self.or = (self.wce.calibration.read_count as f64) - / ((self.ip.calibration.read_count as f64) * (self.wce.reference.read_count as f64)); + pub fn compute_or(&mut self, scaling: f64) { + let ref_size = self.ip.chromosomes.get_genomesize(true) as f64; + self.or = scaling * (self.wce.calibration.read_count as f64) + / ((self.ip.calibration.read_count as f64) + * (self.wce.reference.read_count as f64 / ref_size)); } /// Function to compute 
the alpha from the results of the compute_depth() function @@ -197,8 +200,9 @@ impl ORStats { self.alpha += (ip_depth as f64) / (wce_depth as f64); } } + let calib_size = self.ip.chromosomes.get_genomesize(false) as f64; self.alpha = (self.alpha / (self.ip.reference.read_count as f64)) - * (self.wce.calibration.read_count as f64); + * (self.wce.calibration.read_count as f64 / calib_size); } /// Return the OR value to normalize the IP coverage diff --git a/src/utils.rs b/src/utils.rs index 993f391df333470b909c8238ca18ac31764c51c7..a2e6df1a583e352498ab953f32dcd6d89301e89e 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -117,15 +117,26 @@ impl Chromosomes { } /// return the length of a genome - /// # Arguments - /// - ip: bool the size of the reference genome (if true) or reference genome (if false) /// # Return - /// usize the sum of the chromosome lengths - + /// &Vec<u32> the chromosome lengths pub fn get_lens(&self) -> &Vec<u32> { &self.len } + /// return the size of a genome + /// # Arguments + /// - calib: bool sum only the chromosomes whose calibration flag equals this value + /// # Return + /// u32 the sum of the chromosome lengths + pub fn get_genomesize(&self, calib: bool) -> u32 { + let mut genomesize = 0; + for (chr_check, chr_len) in self.calib.iter().zip(self.len.iter()) { + if *chr_check == calib { + genomesize += chr_len; + } + } + genomesize + } + /// Convert the Chromosomes struct to an HashMap<String, u32> /// # Return /// HashMap<String, u32>