Skip to content
Snippets Groups Projects
Commit cbccbab1 authored by GD's avatar GD
Browse files

course on linear model and multiple testing

parent af2c50b0
No related branches found
No related tags found
No related merge requests found
Showing
with 451 additions and 0 deletions
% define color
\definecolor{InvisibleRed}{rgb}{0.97, 0.92, 0.92}
\definecolor{InvisibleGreen}{rgb}{0.92, 0.97, 0.92}
\definecolor{InvisibleBlue}{rgb}{0.92, 0.92, 0.97}
\definecolor{MediumRed}{rgb}{0.925, 0.345, 0.345}
\definecolor{MediumGreen}{rgb}{0.37, 0.7, 0.66}
\definecolor{MediumBlue}{rgb}{0.015, 0.315, 0.45}
\definecolor{DarkBlue}{rgb}{0.05, 0.15, 0.35}
% set color
\usecolortheme[named=DarkBlue]{structure}
\setbeamercolor{titlelike}{parent=structure}
\setbeamercolor{block title}{bg=MediumBlue}
\setbeamercolor{block body}{bg=InvisibleBlue}
\setbeamercolor{block title example}{bg=MediumGreen}
\setbeamercolor{block body example}{bg=InvisibleGreen}
\setbeamercolor{block title alerted}{bg=MediumRed}
\setbeamercolor{block body alerted}{bg=InvisibleRed}
\ No newline at end of file
\mode<presentation>
%%%%%%%%%%%% fonts
\setbeamerfont{structure}{family=\sffamily,series=\mdseries}
\setbeamerfont{title}{size=\huge,series=\bfseries,parent=structure}
\setbeamerfont{subtitle}{size=\normalsize,parent=title}
\setbeamerfont{date}{size=\scriptsize,series=\mdseries,parent=structure}
\setbeamerfont{author}{size=\large,series=\mdseries,parent=structure}
\setbeamerfont{institute}{size=\scriptsize,series=\mdseries,parent=structure}
\setbeamerfont{section in toc}{size=\Large,series=\bfseries,parent=structure}
\setbeamerfont{section in head/foot}{size=\tiny,parent=structure}
\setbeamerfont{subsection in toc}{size=\small,series=\mdseries,parent={section in toc}}
\setbeamerfont{frametitle}{size=\LARGE,series=\bfseries,parent=structure}
\setbeamerfont{framesubtitle}{parent=frametitle,size=\Large}
\setbeamerfont{caption}{size=\footnotesize}
\setbeamerfont{item}{parent=structure,series=\mdseries}
\setbeamerfont{block title}{size=\large,series=\mdseries,parent={structure,block body}}
\mode
<all>
\pgfdeclareverticalshading[lower.bg,upper.bg]{bmb@transition}{200cm}{%
color(0pt)=(lower.bg); color(2pt)=(lower.bg); color(4pt)=(lower.bg)}
\setbeamersize{text margin left=2em,text margin right=2em}
% table of contents (overview)
\setbeamertemplate{section in toc}[sections numbered]
\setbeamertemplate{subsection in toc}{\leavevmode\leftskip=3.2em\rlap{\hskip-2em\inserttocsectionnumber.\inserttocsubsectionnumber}\inserttocsubsection\par}
\setbeamertemplate{footline}[page number]
\setbeamertemplate{navigation symbols}{}
\setbeamertemplate{blocks}[rounded][shadow=false]
\setbeamertemplate{enumerate items}[default]
\setbeamertemplate{frametitle}{\vspace*{0.5em}\bfseries\insertframetitle\par\vskip-6pt\hrulefill\vspace{-0.1em}}
\setbeamertemplate{title page}{
\vspace{7em}
\begingroup
\centering
% ------------------------
\begin{beamercolorbox}[sep=8pt,center]{title}
\usebeamerfont{title}\inserttitle\par%
\ifx\insertsubtitle\@empty%
\else%
\vskip0.25em%
{\usebeamerfont{subtitle}\usebeamercolor[fg]{subtitle}\insertsubtitle\par}%
\fi%
\end{beamercolorbox}%
\vskip0.5em\par
% ------------------------
\begin{beamercolorbox}[sep=8pt,center]{author}
\usebeamerfont{author}\insertauthor
\end{beamercolorbox}
\vskip-1em
% ------------------------
\begin{beamercolorbox}[sep=8pt,center]{institute}
\usebeamerfont{institute}\insertinstitute
\end{beamercolorbox}
% ------------------------
\begin{beamercolorbox}[sep=8pt,center]{date}
\usebeamerfont{date}\insertdate
\end{beamercolorbox}\vskip0.5em
% ------------------------
{\usebeamercolor[fg]{titlegraphic}\inserttitlegraphic\par}
\endgroup
\vfill
}
\ No newline at end of file
\DeclareOptionBeamer{compress}{\beamer@compresstrue}
\ProcessOptionsBeamer
\mode<presentation>
\useoutertheme[footline=authortitle]{miniframes}
\useinnertheme{circles}
\usecolortheme{whale}
\usecolortheme{orchid}
\definecolor{beamer@blendedblue}{rgb}{0.137,0.466,0.741}
\setbeamercolor{structure}{fg=beamer@blendedblue}
\setbeamercolor{titlelike}{parent=structure}
\setbeamercolor{frametitle}{fg=black}
\setbeamercolor{title}{fg=black}
\setbeamercolor{item}{fg=black}
\mode
<all>
\mode<presentation>
% Settings
\usetheme{Madrid}
\useinnertheme{circles}
\usefonttheme{SimplePlus}
\usecolortheme{SimplePlus}
\useinnertheme{SimplePlus}
\mode<all>
\ No newline at end of file
% 25086649
@Article{pmid25086649,
Author="Pollen, A. A. and Nowakowski, T. J. and Shuga, J. and Wang, X. and Leyrat, A. A. and Lui, J. H. and Li, N. and Szpankowski, L. and Fowler, B. and Chen, P. and Ramalingam, N. and Sun, G. and Thu, M. and Norris, M. and Lebofsky, R. and Toppani, D. and Kemp, D. W. and Wong, M. and Clerkson, B. and Jones, B. N. and Wu, S. and Knutsson, L. and Alvarado, B. and Wang, J. and Weaver, L. S. and May, A. P. and Jones, R. C. and Unger, M. A. and Kriegstein, A. R. and West, J. A. ",
Title="{{L}ow-coverage single-cell m{R}{N}{A} sequencing reveals cellular heterogeneity and activated signaling pathways in developing cerebral cortex}",
Journal="Nat Biotechnol",
Year="2014",
Volume="32",
Number="10",
Pages="1053--1058",
Month="Oct"
}
\section[DEA-Seq]{Differential Expression Analysis for sequencing data}
\begin{frame}
\frametitle{Basic Principles}
\begin{columns}[c]
\begin{column}{.45\textwidth}
\begin{itemize}
\item Many studies in Genomics can be restated as a comparison problem
\end{itemize}
\end{column}
\begin{column}{.45\textwidth}
\begin{center}
\includegraphics[scale=0.37]{./figures/gene_dea.jpg}
\end{center}
\end{column}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Let's adopt the ANOVA framework}
\begin{itemize}
\item $Y_{ijr}$ : expression (continuous) for gene $i$ in condition $j$ at replicate $r$
\item Perform DE between conditions using model $$
\textcolor{cerisepink}{Y_{ijr} \sim \mathcal{N}\left(\mathbb{E}(Y_{ijr}),\sigma^2\right)}$$
$$
\textcolor{cerisepink}{\mathbb{E}(Y_{ikr})= \mu_{ij}=\mu + \alpha_i +\beta_j + \left( \alpha \beta\right)_{ij}}
$$
\item The parameters of the model are interpreted as :
\begin{itemize}
\item $\alpha_i$ : mean expression of gene $i$ (across conditions),
\item $\beta_j$ : mean expression in condition $j$ (across genes),
\item $\left( \alpha \beta\right)_{ij}$ : interaction effect gene x condition
\end{itemize}
\item Allows to integrate normalization while testing
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Testing framework}
\begin{itemize}
\item \textbf{Hypothesis} : no expression difference between conditions
$$\textcolor{cerisepink}{\mathcal{H}_0^i : \{\left( \alpha \beta\right)_{i1}=\left( \alpha \beta\right)_{i2}\}}$$
\item The classical statistic for gene $i$ is the Student statistic
$$T_i =\frac{|\widehat{\alpha \beta}_{i1}-\widehat{\alpha \beta}_{i2}|}{\widehat{\sigma}} \times \sqrt{2R-2} \underset{\mathcal{H}_0}{\sim} \mathcal{T}(2R-2)$$
\item Estimation of mean fixed effects is done by \textbf{Maximum Likelihood}
\item Multiple testing issues are assessed using the FDR control
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{What about the estimation of the dispersion parameter ?}
\begin{itemize}
\item Refinements / difficulties concern the \textbf{estimation of $\sigma$}, the dispersion parameter
\item A \textbf{common variance} to all genes $\sigma^2$ : robust but lacks of power
\item A \textbf{specific variance} to every gene $\sigma_i^2$ : powerful but sensitive to outliers,
\begin{itemize}
\item[-] Large sampling variance
\item[-] To be stabilized empirically
\end{itemize}
\item \textbf{Groups of variances} (combination of both)
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{The Generalized Linear Model framework}
\begin{itemize}
\item $Y_{ijr}$ : the \textbf{read count} (positive integer), for gene $i$ in condition $j$
\item Define the \textbf{Generalized Linear Model} (GLM) by setting
\textcolor{cerisepink}{
\begin{eqnarray*}
Y_{ijr} &\sim& \mathcal{P}(\mu_{ij}) \\
\log \mathbb{E}(Y_{ijr})=\log(\mu_{ij}) &=& \mu + \alpha_i +\beta_j + \left( \alpha \beta\right)_{ij}
\end{eqnarray*}}
\item Parameters have the same interpretation
\item Testing hypotheses are similar : $\mathcal{H}_0^i : \{\left( \alpha \beta\right)_{i1}=\left( \alpha \beta\right)_{i2}\}$
\item Dispersion parameter ? Test statistics ?
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{The Exponential family of distributions}
\begin{itemize}
\item Family of distributions that share common mathematical properties
\item The \textbf{Exponential Family} is one of the most widely used
\item Consider two types of parameters :
\begin{itemize}
\item $\theta$ the \textbf{natural parameter}, related to the location parameter
\item $\phi$ the \textbf{dispersion parameter}
\end{itemize}
\item If $Y$ belongs to the exponential family, its density is of the form
\end{itemize}
\begin{center}
\begin{tabular}{cc}
$p(y;\theta,\phi) \propto \exp \left(\frac{y \theta-b(\theta)}{a(\phi)}\right)$ &
\begin{tabular}{ccc}
& $\theta$ & $\phi$ \\
\hline
Gaussian & $\mu$ & $\sigma^2$ \\
Poisson & $\log(\mu)$ & $1$ \\
Binomial & $\text{logit}(\mu)$ & $1$
\end{tabular}
\end{tabular}
\end{center}
\end{frame}
\begin{frame}
\frametitle{The Generalized Linear Model (GLM)}
\begin{itemize}
\item Suppose that $\mathbb{E}(Y)=\mu$ linearly depends on some covariates $X$ $$g(\mathbb{E}(Y))=g(\mu)=X\beta$$
\item $\eta = g(\mu)$ is often called the \textbf{linear predictor}
\item Most of the time the \textbf{canonical link} is used $g(\mu)=g(b'(\theta))=\theta$
\begin{center}
\begin{tabular}{ccc}
& $g(\mu)$ & $V(\mu)$ \\
\hline
Gaussian & $\mu$ & 1 \\
Poisson & $\log(\mu)$ & $\mu$ \\
Binomial & $\text{logit}(\mu)$ & $\mu(1-\mu)$
\end{tabular}
\end{center}
\vspace{0.5cm}
\item In GLMs, overdispersion $\phi$ is not used (exponential dispersion family)
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Location / Dispersion relations}
\begin{itemize}
\item The first two moments of the exponential family are :
\begin{eqnarray*}
\mathbb{E}[Y] &=& b'(\theta) = \mu \\
\mathbb{V}[Y] &=& b''(\theta) \times a(\phi)
\end{eqnarray*}
\item The \textbf{expectation $\mu$ is a function of $\theta$ only} (location)
\item The variance is a function of \textbf{both $(\theta,\phi)$} (location and dispersion)
\item $b''(\bullet)$ is called the \textbf{variance function} (also denoted by $V(\mu)$), and describes \textcolor{blue}{how the variance relates to the mean}
\item Gaussian : (exception!) $a(\phi)=\sigma^2$ and $b''(\theta)=1$ (cst curvature)
\item Poisson, Binomial : $a(\phi)=1$ (no freedom)
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{A matter of vocabulary}
\begin{itemize}
\item Recall that $\mathbb{V}[Y] = a(\phi) \times V(\mu)$ where $\phi$ is the dispersion parameter
\item The \textbf{Poisson distribution has no dispersion parameter}
\item The only possible Discrete Exponential Dispersion model with a disperson parameter are additive models such as \textbf{Negative Binomial} or \textbf{Poisson-Tweedie}
\item Parameter $\alpha$ may be called dispersion parameter
\end{itemize}
\begin{center}
\begin{tabular}{lcc}
& $a(\phi)$ & $V(\mu)$ \\
\hline
Poisson & 1 & $\mu$\\
QuasiPoisson & $\phi$ & $\mu$\\
Negative Binomial & 1 & $\mu + \kappa \mu^2$ \\
Tweedie Poisson & 1 & $\mu^p$\\
\end{tabular}
\end{center}
\end{frame}
\begin{frame}
\frametitle{Dealing with dispersion estimates}
\begin{itemize}
\item If we choose the NB model, $V(\mu) = \mu + \kappa \mu^2$
\item Then we can follow the same procedure compared with the Gaussian case:
\item Use genes as replicates to uncover the mean/variance relationship (one gene=one point)
\item perform a regression $V(\mu) = \mu + \kappa_{Global} \mu^2$ to estimate $\kappa_{Global}$ that would be \textbf{common to every gene}
\item In Anders et al., the final ``dispersion'' estimate is for each gene : $\max(\widehat{\kappa}_{Global},\widehat{\kappa}_i)$ (little loss in power)
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Testing Strategies based on LRT}
\begin{itemize}
\item Compare different models, for instance
\begin{eqnarray*}
\log(\mu_{ij}) &=& \mu + \alpha_i +\beta_j \\
\log(\mu_{ij}) &=& \mu + \alpha_i +\beta_j + \left( \alpha \beta\right)_{ij}
\end{eqnarray*}
\item Use the Ratio of log likelihoods as a Statistics, which incorporates all infos:
$$
LRT = -2 \log \left( \frac{\mathcal{L}(\widehat{\mu},\widehat{\alpha},\widehat{\beta},\widehat{\alpha \beta})}
{\mathcal{L}(\widehat{\mu},\widehat{\alpha},\widehat{\beta})}\right) \underset{\mathcal{H}_0}{\sim}{\chi^2(\Delta df)}
$$
\item This has been shown to be the best strategy on Sequencing data
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Conclusion: don't think Normal !}
\begin{itemize}
\item Use \textbf{Generalized Linear Models} to perform Count regression, and not Gaussian regression on the log-counts
\item Incorporate effects in the model to perform a global analysis that \textbf{accounts for distributional characteristics}
\item Do not perform tests that imply Poisson distribution when data are over-dispersed
\item Use \textbf{Likelihood Ratio Tests} to compare models
\item Overdispersion leads to estimation issues due to \textbf{numerical problems}
\end{itemize}
\end{frame}
\section[scDEA]{Differential Expression Analysis for single cell data}
\begin{frame}
\frametitle{How bad is the situation in single cell data ?}
\begin{center}
\includegraphics[scale=0.23]{./figures/dropout_kharchenko.pdf}
\end{center}
Overdispersion is mainly biological because diversity is high between cells \cite{Kharchenko2014Bayesian}
\end{frame}
\begin{frame}
\frametitle{Expression is a stochastic bursty process: biological zeros}
\begin{center}
\includegraphics[scale=0.23]{./figures/burst.pdf}
\end{center}
\end{frame}
\begin{frame}
\frametitle{The curse of Dropouts}
\begin{itemize}
\item Low starting amount of RNAs: transcripts will be missed during RT
\item Amplification is needed ($\times 10^6$), which creates distortions
\item Stochasticity of gene expression (bursty process) sparsity of the data, high proportion of zeros
\item Dropout depends on cells (different in different wells),
\item Lowly expressed genes : sampling / amplification issues
\item Highly expressed genes: is more likely to indicate a burst
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Consequences for the analysis of multiple single cells}
\begin{itemize}
\item The data consist of a \textbf{snapshot} : all cells are not synchronized
\item No technical replicate per cell (invasive experiment)
\item A lot of zeros in the data : zero inflated count distributions
\item For cell $r$, gene $i$, condition $j$, the expression value is modelled by
$$
Y_{ijr} \sim \pi_i \delta_0 + (1-\pi_i) \mathsf{NB}(\mu_{ijr})
$$
\item Difficulty to discriminate between low expression / no expression
$$
\pi_i = f(\mathbb{E}(Y_{ijr}))
$$
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Two-component Generalized Linear Model \cite{Risso2017ZINB}}
\begin{center}
\includegraphics[scale=0.22]{./figures/ZINB_wave_Risso.pdf}
\end{center}
\end{frame}
\begin{frame}
\frametitle{Inference for ZINB models}
\begin{itemize}
\item Optimization can be based on IRWLS (iterative) combined with the EM algorithm
\item EM is used to retrieve the ZI compartment
\item Then IRWLS is used to estimate the parameters in the NB compartement
\item Quite challenging from the numerical point of view
\item Use Bayesian strategies thanks to the Poisson-Gamma representation of NB distributions
$$
\mu \sim \Gamma(a,b) , \,\, Y \vert \mu \sim \mathcal{P}(\mu), \,\, Y \sim NB(a,\frac{b}{a+1})
$$
\item Model the sampling process of genes
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Normalizing single cell expression data}
\begin{itemize}
\item Adjusts for effects related to distributional differences in read counts between cells (sequencing depth)
\item Scaling factor for cell $r$ to rescale all cell specific measures on a common scale
$$
\mathbb{E}(Y_{ijr}) = s_r \times \mu_{ij}
$$
\item How to estimate the scaling factor ? RPKM ?
\item Library size normalization can be dominated by a handful of highly expressed genes, which can bias downstream analysis .
\item Quantile matching ? but difficult to apply with zero Inflation
\item Litterature suggests to use the trimmed means proposed by DESeq \cite{Vallejos_normalization_2017}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Normalizing single cell expression data \cite{Vallejos_normalization_2017}}
\begin{center}
\fbox{\includegraphics[scale=0.2]{./figures/normalization_vallejos.pdf}}
\end{center}
\end{frame}
\begin{frame}
\frametitle{Hypothesis testing becomes more intricate}
\begin{itemize}
\item A Compartment model defined by $(\pi,\mu)$
\item Testing hypotheses are similar : $\mathcal{H}_0^i : \{\left( \pi_j,\mu_j\right) =\left( \pi_{j'},\mu_{j'}\right)\}$
\item What is differential expression ? Differential dropout ?
\item Difficulty to define $\mathcal{H}_1$
\end{itemize}
\end{frame}
M1_biosciences_regression_multiple_testing/figures/Learning+Types.jpg

254 KiB

File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment