diff --git a/M1_biosciences_clustering/biblio.bib b/M1_biosciences_clustering/biblio.bib
index 78ae157e3a755c763fa8e9570b4dca7f4b86bdbd..cc6eef167c4606f398582d38c922a5fd5400dfd0 100644
--- a/M1_biosciences_clustering/biblio.bib
+++ b/M1_biosciences_clustering/biblio.bib
@@ -20,3 +20,15 @@
 Pages="641--642"
 }
 
+% 30617341
+@Article{pmid30617341,
+  Author="Kiselev, V. Y. and Andrews, T. S. and Hemberg, M.",
+  Title="{{C}hallenges in unsupervised clustering of single-cell {R}{N}{A}-seq data}",
+  Journal="Nat Rev Genet",
+  Year="2019",
+  Volume="20",
+  Number="5",
+  Pages="273--282",
+  Month="05"
+}
+
diff --git a/M1_biosciences_clustering/figures/BIC_clustering.png b/M1_biosciences_clustering/figures/BIC_clustering.png
new file mode 100644
index 0000000000000000000000000000000000000000..69b0784713fdb7860c34e6aab93a7f7d1ae35641
Binary files /dev/null and b/M1_biosciences_clustering/figures/BIC_clustering.png differ
diff --git a/M1_biosciences_clustering/figures/clustering_singlecell.pdf b/M1_biosciences_clustering/figures/clustering_singlecell.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..17af3dd819a2bc0c279dc4983fdca22af40d90e4
Binary files /dev/null and b/M1_biosciences_clustering/figures/clustering_singlecell.pdf differ
diff --git a/M1_biosciences_clustering/figures/distances_highdim.pdf b/M1_biosciences_clustering/figures/distances_highdim.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..ee459d870ee5f743ee00a56b4498a10c12d81f61
Binary files /dev/null and b/M1_biosciences_clustering/figures/distances_highdim.pdf differ
diff --git a/M1_biosciences_clustering/figures/elbow_plot.png b/M1_biosciences_clustering/figures/elbow_plot.png
new file mode 100644
index 0000000000000000000000000000000000000000..dbfae1d2ff057a12734381a29c9192dd8432d67b
Binary files /dev/null and b/M1_biosciences_clustering/figures/elbow_plot.png differ
diff --git a/M1_biosciences_clustering/figures/graph_cut.pdf b/M1_biosciences_clustering/figures/graph_cut.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..180940e7c0b8f6c355c815a4bb5ff16df6340fda
Binary files /dev/null and b/M1_biosciences_clustering/figures/graph_cut.pdf differ
diff --git a/M1_biosciences_clustering/figures/graph_towardsdatascience.png b/M1_biosciences_clustering/figures/graph_towardsdatascience.png
new file mode 100644
index 0000000000000000000000000000000000000000..efd4df9a7ff4818635c56fbceacf2ced423407ef
Binary files /dev/null and b/M1_biosciences_clustering/figures/graph_towardsdatascience.png differ
diff --git a/M1_biosciences_clustering/figures/knn_graph.pdf b/M1_biosciences_clustering/figures/knn_graph.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..b00eb84d06bfddba7c4997770baee2d7cf5f8060
Binary files /dev/null and b/M1_biosciences_clustering/figures/knn_graph.pdf differ
diff --git a/M1_biosciences_clustering/figures/knn_graph_result.pdf b/M1_biosciences_clustering/figures/knn_graph_result.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e7749319e16c412ea3277c7d24947f401b18a215
Binary files /dev/null and b/M1_biosciences_clustering/figures/knn_graph_result.pdf differ
diff --git a/M1_biosciences_clustering/figures/knn_singlecell.pdf b/M1_biosciences_clustering/figures/knn_singlecell.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..ab6831d80f6c83fb292996d99371480256ec1ae5
Binary files /dev/null and b/M1_biosciences_clustering/figures/knn_singlecell.pdf differ
diff --git a/M1_biosciences_clustering/figures/louvain_singlecell.jpg b/M1_biosciences_clustering/figures/louvain_singlecell.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6282658cae6a55e9f8cf114515beded075a0545c
Binary files /dev/null and b/M1_biosciences_clustering/figures/louvain_singlecell.jpg differ
diff --git a/M1_biosciences_clustering/figures/netwizz.png b/M1_biosciences_clustering/figures/netwizz.png
new file mode 100644
index 0000000000000000000000000000000000000000..c1e8c876881c7a681d7125c770141ac6b4319ed0
Binary files /dev/null and b/M1_biosciences_clustering/figures/netwizz.png differ
diff --git a/M1_biosciences_clustering/figures/sphere.pdf b/M1_biosciences_clustering/figures/sphere.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..8e40b6857e876bd6c45ad9c2f8b775fbae51dd17
Binary files /dev/null and b/M1_biosciences_clustering/figures/sphere.pdf differ
diff --git a/M1_biosciences_clustering/graph_clustering.tex b/M1_biosciences_clustering/graph_clustering.tex
index 834f96371dca77b756addf1cdaec1b06bed8705f..eb9fa41072fc16b01b58225dc6361ed5eb49ef8f 100644
--- a/M1_biosciences_clustering/graph_clustering.tex
+++ b/M1_biosciences_clustering/graph_clustering.tex
@@ -8,6 +8,7 @@
 \begin{itemize}
 \item Linear methods provide clusters that can be separated by planes
 \item Recent developments propose to generalize clustering beyond linear methods
+ \item Popular methods construct a proximity graph between points to represent their interactions
 \end{itemize}
 \column{.45\textwidth}
@@ -17,17 +18,39 @@
 \end{columns}
 \end{frame}
 
+\begin{frame}
+\frametitle{Graph-based clustering in single cell genomics}
+\begin{columns}[c]
+  \column{.5\textwidth}
+  \begin{itemize}
+    \item Single-cell transcriptomic data: given each cell's expression profile, assign cells to cell types
+    \item Group cells according to their transcriptomic proximity
+    \item The graph represents distances between cells
+  \end{itemize}
+
+  \column{.45\textwidth}
+  \begin{center}
+    \includegraphics[scale=0.25]{./figures/clustering_singlecell.pdf}
+  \end{center}
+\end{columns}
+\end{frame}
+
+
 \begin{frame}
 \frametitle{Definition of a graph}
+A graph $\mathcal{G}=\big(\mathcal{V}, \mathcal{E}\big)$ is defined by a set of vertices $\mathcal{V} = \{1,\hdots,n\}$ and a set of edges $\mathcal{E} = \{(i,j) \in \mathcal{V}^2, \quad i \sim j\}$; the graph can be directed or undirected \\
+
+  \begin{center}
+    \includegraphics[scale=0.4]{./figures/graph_towardsdatascience.png}
+  \end{center}
+\end{frame}
+
+\begin{frame}
+\frametitle{Basic Features of graphs}
 \begin{columns}[c]
 \column{.5\textwidth}
 \begin{itemize}
- \item A graph $\mathcal{G}=\big(\mathcal{V}, \mathcal{E}\big)$ is defined by :
- \begin{itemize}
- \item[$\rightarrow$] a set of vertices $\mathcal{V} = \{1,\hdots,n\}$
- \item[$\rightarrow$] a set of edges $\mathcal{E} = \{(i,j) \in \mathcal{V}^2, \quad i \sim j\}$
- \end{itemize}
 \item Define the adjacency matrix $A$ of a graph
 $$
 A_{ij} = \begin{cases} w_{ij} \in \mathbb{R}, & \text{if } i \sim j \\ 0, & \text{otherwise}\end{cases}
 $$
@@ -35,24 +58,26 @@
 \item A graph can be binary: $w_{ij} \in \{0,1\}$
 \item Or weighted: $w_{ij} \in \mathbb{R}$
- \item Directed or non directed
 \end{itemize}
 \column{.45\textwidth}
 \begin{center}
+ \includegraphics[scale=0.45]{./figures/adjacency.pdf}
 \end{center}
 \end{columns}
 \end{frame}
+
+
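As a small aside, the adjacency-matrix definition above is easy to materialize in code. A minimal sketch, not part of the original deck: the toy graph is invented for illustration, and only numpy is assumed.

```python
import numpy as np

# Toy undirected weighted graph on 4 vertices: edges (0,1), (0,2), (2,3)
n = 4
edges = [(0, 1, 0.5), (0, 2, 1.0), (2, 3, 2.0)]  # (i, j, w_ij)

A = np.zeros((n, n))
for i, j, w in edges:
    A[i, j] = w
    A[j, i] = w  # undirected graph: the adjacency matrix is symmetric

# a binary graph is recovered by thresholding the weights
A_binary = (A > 0).astype(int)

assert np.allclose(A, A.T)  # symmetry holds because the graph is undirected
print(A)
```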
 \begin{frame}
 \frametitle{From a dissimilarity matrix to a graph}
 \begin{columns}[c]
 \column{.5\textwidth}
 \begin{itemize}
- \item How to construct the graph ?
- \item From any dissimilarity matrix
- \item Other strategies (kNN graph)
+ \item How to construct the graph from the data $\Xbf$?
+ \item From any dissimilarity matrix (e.g. one derived from a Gram matrix)
+ \item Most popular method: the neighborhood graph (kNN graph)
 \item Clustering can be restated as finding clusters of vertices
 \end{itemize}
 \column{.45\textwidth}
@@ -63,6 +88,28 @@
 \end{frame}
 
+\begin{frame}
+\frametitle{kNN graphs}
+\begin{columns}[c]
+  \column{.5\textwidth}
+  \begin{itemize}
+    \item Consider a dissimilarity matrix
+    \item Choose a number of neighbors $k$ (resolution parameter)
+    \item For a given vertex, keep an edge to each of its $k$ nearest neighbors
+    \item Construct the proximity graph iteratively
+    \item Can also consider shared neighborhoods
+    \item Sparsification of the original dense graph
+    \item Efficient methods are needed on large datasets
+  \end{itemize}
+  \column{.45\textwidth}
+  \begin{center}
+    \includegraphics[scale=0.35]{./figures/knn_graph.pdf}
+  \end{center}
+\end{columns}
+\end{frame}
+
+
+
 \begin{frame}
 \frametitle{Clustering strategy based on modules/community}
 \begin{columns}[c]
@@ -75,23 +122,63 @@
 \end{itemize}
 \column{.45\textwidth}
 \begin{center}
- \includegraphics[scale=0.45]{./figures/modularity.pdf}
+ \includegraphics[scale=0.38]{./figures/knn_singlecell.pdf}
 \end{center}
 \end{columns}
 \end{frame}
 
 \begin{frame}
-\frametitle{Modularity optimization}
+\frametitle{The Graph-Cut problem}
 \begin{columns}[c]
 \column{.5\textwidth}
 \begin{itemize}
+ \item How to partition a graph into subgraphs with a given objective?
+ \item The \emph{size} of a cut is the number of cut edges
+ \item Clustering by graph-cuts: the smallest cut that makes homogeneous subgraphs
+ \end{itemize}
+ \column{.45\textwidth}
+ \begin{center}
+ \includegraphics[scale=0.45]{./figures/graph_cut.pdf}
+ \end{center}
+\end{columns}
+\end{frame}
+
+
+\begin{frame}
+\frametitle{Finding the best cut}
+\begin{columns}[c]
+ \column{.5\textwidth}
+ \begin{itemize}
+ \item $\text{Vol}(S)$: volume of subgraph $S$ (sum of the degrees of its nodes)
+ \item $\text{Cut}(S,S')$: number of edges that link the two subgraphs $S$ and $S'$
+ \item The normalized cut value:
+ $$
+ \text{NormCut}(S,S')= \frac{\text{Cut}(S,S')}{\text{Vol}(S)} + \frac{\text{Cut}(S,S')}{\text{Vol}(S')}
+ $$
+ \item Avoids cuts that generate too-small subgraphs
+ \item The combinatorial complexity is too high: heuristics are needed
+ \end{itemize}
+ \column{.45\textwidth}
+ \begin{center}
+ \includegraphics[scale=0.45]{./figures/graph_cut.pdf}
+ \end{center}
+\end{columns}
+\end{frame}
+
+
+\begin{frame}
+\frametitle{Modularity optimization using the Louvain Algorithm}
+\begin{columns}[c]
+ \column{.5\textwidth}
+ \begin{itemize}
+ \item Approximation of a graph-cut problem
 \item A cluster is equivalent to a module
 \item With $K$ clusters (modules) and indicator variables $z_{ik}$, the modularity is
 $$
- M_K(\zbf) = \frac{1}{2K}\sum_{k=1}^K \sum_{\ell=1}^K z_{ik} z_{j\ell} \Big( A_{ij} - p_{ij} \Big)
+ M_K(\zbf) = \frac{1}{2m}\sum_{i,j} \sum_{k=1}^K z_{ik} z_{jk} \Big( A_{ij} - \frac{d_id_j}{2m} \Big)
 $$
+ \item $d_i = \sum_{j} A_{ij}$ (degree of vertex $i$), $2m=\sum_{ij}A_{ij}$
 \item Find $\zbf$ such that $M_K(\zbf)$ is maximal
- \item Example : Louvain algorithm
 \end{itemize}
 \column{.45\textwidth}
 \begin{center}
@@ -99,3 +186,22 @@
 \end{center}
 \end{columns}
 \end{frame}
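To make the pipeline of the last frames concrete (kNN-graph construction followed by Louvain modularity maximization), here is a minimal Python sketch. It is not part of the deck: the toy two-blob dataset is hypothetical, and it assumes numpy plus networkx >= 2.8 (the version that introduced `louvain_communities`).

```python
import numpy as np
import networkx as nx

def knn_graph(D, k):
    """Undirected kNN graph from an (n, n) dissimilarity matrix D."""
    n = D.shape[0]
    G = nx.Graph()
    G.add_nodes_from(range(n))
    for i in range(n):
        # k nearest neighbours of i, excluding i itself (D[i, i] == 0)
        for j in np.argsort(D[i])[1:k + 1]:
            G.add_edge(i, int(j))
    return G

# toy data: two Gaussian blobs standing in for two cell types
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))])
D = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)

G = knn_graph(D, k=10)
# Louvain modularity maximisation (networkx >= 2.8)
communities = nx.community.louvain_communities(G, seed=0)
print([len(c) for c in communities])  # expected: two communities of ~50 points
```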
+\begin{frame}
+\frametitle{Extensions and generalizations}
+\begin{columns}[c]
+ \column{.5\textwidth}
+ \begin{itemize}
+ \item The Louvain algorithm is one example of graph-based clustering methods
+ \item Widely used in single cell data analysis
+ \item Many hyperparameters to tune
+ \item Nonlinear clustering is a very active field of research
+ \end{itemize}
+ \column{.45\textwidth}
+ \begin{center}
+ \includegraphics[scale=0.45]{./figures/louvain_singlecell.jpg}
+ \end{center}
+\end{columns}
+\end{frame}
+
+
diff --git a/M1_biosciences_clustering/hclust.tex b/M1_biosciences_clustering/hclust.tex
index 7c2b670c8df135375c5b73888933f64b647d5a3f..4f9704e5a55604242a67abefd58f3309d5a40263 100644
--- a/M1_biosciences_clustering/hclust.tex
+++ b/M1_biosciences_clustering/hclust.tex
@@ -60,7 +60,7 @@ $$
 \item Using this distance minimizes the increase in the within group inertia at each step of the hierarchy
 \item This distance accounts for imbalance in cluster sizes
- \item This is the default betweend-group distance implemented in softwares
+ \item This is the default between-group distance implemented in software
 \end{itemize}
 \end{frame}
@@ -75,7 +75,7 @@ A, i^\prime \in B}{\min}d(\xbf_i,\xbf_{i^\prime})$$
 \item Maximal link:
 $$d(A,B) = \underset{i \in A, i^\prime \in B}{\max}d(\xbf_i,\xbf_{i^\prime})$$
- \item Average Link
+ \item Average Link
 $$
 d(A,B) = \frac{1}{n_A \times n_B}\sum_{i \in A, i^\prime \in B} d(\xbf_i,\xbf_{i^\prime})
 $$
diff --git a/M1_biosciences_clustering/highdim_clustering.tex b/M1_biosciences_clustering/highdim_clustering.tex
new file mode 100644
index 0000000000000000000000000000000000000000..139597f9cb07c5d48bed18984ec4747f4b4f3438
--- /dev/null
+++ b/M1_biosciences_clustering/highdim_clustering.tex
@@ -0,0 +1,2 @@
+
+
diff --git a/M1_biosciences_clustering/introduction.tex b/M1_biosciences_clustering/introduction.tex
index 36988290f7c0a3a156bc53ca723fe463031fd576..0f72d4695866e0b68f22d5356a53aad23bfe1c32 100644
--- a/M1_biosciences_clustering/introduction.tex
+++ b/M1_biosciences_clustering/introduction.tex
@@ -85,10 +85,9 @@ x_{n}^1 &\hdots& \hdots &x_{n}^p
 \item[$\rightarrow$] The number of partitions of $E$ into $K$ groups (Stirling number) $$p(n,K) \sim K^n/K!$$
 \item[$\rightarrow$] The total number of partitions of $E$ (Bell number) $$B_n = \sum_{k=1}^n p(n,k) = \frac{1}{e} \sum_{k \geq 1}\frac{k^n}{k!}$$
+\end{itemize}
 \item The exploration of all partitions is not possible
 \item Algorithms will be iterative and approximate
-\end{itemize}
-
 \end{itemize}
 \end{frame}
@@ -127,7 +126,7 @@ Probabilistic & Mixture Models\\
 \end{column}
 \end{columns}
 \begin{itemize}
-\item \textbf{High dimension:} $n$ grows but $\ll p$
+\item \textbf{High dimension:} $p$ grows and $\gg n$
 \item \textbf{Big Data:} $n$ and $p$ grow
 \end{itemize}
 \end{frame}
@@ -281,9 +280,13 @@ $$
 d: E \times E &\rightarrow& \mathbb{R}^+ \\
 (i,i^\prime) &\rightarrow & d(i,i^\prime)
 \end{eqnarray*}
-  \item Properties: $d(i,i^\prime)=d(i^\prime,i)$, $\forall i^\prime, \,\,
-  d(i,i) \leq d(i,i^\prime)$,$d(i,i)=0$
-  \item Distance : additional triangular inequality $$d(i,i^\prime)
+  \item Properties:
+  \begin{itemize}
+     \item Non-negativity for distinct elements: $d(i,i^\prime) > 0$ if $i \neq i^\prime$
+     \item Symmetry: $\forall (i,i^\prime), \,\, d(i,i^\prime) = d(i^\prime,i)$
+     \item $d(i,i^\prime)=0$ iff $i=i^\prime$
+  \end{itemize}
+  \item Distance: additional triangle inequality
+  $$d(i,i^{\prime\prime}) \leq d(i,i^{\prime}) + d(i^{\prime},i^{\prime\prime})$$
 \end{itemize}
 \end{frame}
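The three axioms and the triangle inequality can be checked numerically for the Euclidean distance on a random data matrix. A small sketch, not from the slides, assuming numpy and scipy:

```python
import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(1)
X = rng.normal(size=(20, 5))            # 20 points in dimension 5
D = squareform(pdist(X))                # Euclidean distance matrix

assert np.allclose(D, D.T)              # symmetry: d(i, i') = d(i', i)
assert np.allclose(np.diag(D), 0)       # d(i, i) = 0
# triangle inequality: d(i, i'') <= d(i, i') + d(i', i'') for all triples
lhs = D[:, None, :]                     # lhs[i, j, k] = d(i, k)
rhs = D[:, :, None] + D[None, :, :]     # rhs[i, j, k] = d(i, j) + d(j, k)
assert np.all(lhs <= rhs + 1e-12)
print("all metric axioms hold")
```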
@@ -341,11 +344,12 @@ I_T(\widetilde{\Xbf}_c) & = & \frac{1}{n} \sum_{i=1}^n d_2^2(\widetilde{\xbf}_{
 \begin{columns}[c]
 \column{.5\textwidth}
 \begin{itemize}
- \item Introduce indicator variables $z_{ik}$, such that
+ \item Suppose there exist $K$ clusters
+ \item Introduce indicator variables $z_{ik}$:
 $$
 z_{ik}=
 \begin{cases}
- 1, \text{ if } i \in k \\
+ 1, \text{ if } i \in \text{cluster } k \\
 0, \text{otherwise}
 \end{cases}
 $$
@@ -378,7 +382,7 @@ I_T(\widetilde{\Xbf}_c) & = & \frac{1}{n} \sum_{i=1}^n d_2^2(\widetilde{\xbf}_{
 $$
 \Ibf_B = \sum_{k=1}^K n_k d^2(\overline{\xbf}_k,\overline{\xbf})
 $$
- \item Between-class variance: distance of points to their cluster center
+ \item Within-class variance: sum of squared distances of points to their cluster center
 $$
 \Ibf_W = \sum_{k=1}^K \sum_{i=1}^n z_{ik} d^2(\mathbf{x}_i,\overline{\xbf}_k)
 $$
diff --git a/M1_biosciences_clustering/kmeans.tex b/M1_biosciences_clustering/kmeans.tex
index 5636e045411a99858e18bcf470bd55d596ab5611..455d9fcd44b535efd8fb7b9444b050e346aef6ad 100644
--- a/M1_biosciences_clustering/kmeans.tex
+++ b/M1_biosciences_clustering/kmeans.tex
@@ -18,7 +18,7 @@
 \begin{itemize}
 \item Consider the indicator variable $z_{ik}$ that equals 1 if individual $i$ is in cluster $k$
 $$n_k = \sum_{i=1}^n z_{ik}$$
- \item Inertia boil down to
+ \item Inertia boils down to
 \begin{eqnarray*}
 \Ibf_W &=& \sum_{i=1}^n \sum_{k=1}^K z_{ik} d^2(\xbf_i,\overline{\xbf}_k) \\
 \Ibf_B &=& \sum_{k=1}^K n_k d^2(\overline{\xbf}_k,\overline{\xbf})
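The decomposition behind these formulas (Huygens: total inertia = within + between) can be verified numerically after a k-means fit. A minimal sketch assuming scikit-learn; the two-blob dataset is a hypothetical toy example:

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(2)
X = np.vstack([rng.normal(0, 1, (100, 2)), rng.normal(4, 1, (100, 2))])

km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
center = X.mean(axis=0)                           # global centroid

I_T = np.sum((X - center) ** 2)                   # total inertia
I_W = km.inertia_                                 # within-cluster inertia
n_k = np.bincount(km.labels_)                     # cluster sizes n_k
I_B = np.sum(n_k[:, None] * (km.cluster_centers_ - center) ** 2)

print(I_T, I_W + I_B)                             # equal up to numerical error
```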
diff --git a/M1_biosciences_clustering/main.tex b/M1_biosciences_clustering/main.tex
index 44674dd36d2e1b4578edcb1335afe8f2db9ec305..1bac996feb6639b03c0faf1ea34111bdf2744e84 100644
--- a/M1_biosciences_clustering/main.tex
+++ b/M1_biosciences_clustering/main.tex
@@ -199,16 +199,22 @@
 \include{hclust}
 \include{kmeans}
 \include{graph_clustering}
+\include{postclustering}
+
+
 \begin{frame}{References}
 \begin{itemize}
 \item \url{https://towardsdatascience.com/}
- \item Introduction to Machine Learning \url{http://cazencott.info/dotclear/public/lectures/IntroML_Azencott.pdf}
+ \item \href{http://cazencott.info/dotclear/public/lectures/IntroML_Azencott.pdf}{Introduction to Machine Learning (C. Azencott)}
+ \item \href{https://www.imo.universite-paris-saclay.fr/~giraud/Orsay/Bookv3.pdf}{Introduction to High Dimensional Statistics (C. Giraud)}
+
 \end{itemize}
 \begin{small}
 \bibliographystyle{plain}
 \bibliography{biblio}
 \nocite{PCANatMeth}
+\nocite{pmid30617341}
 \end{small}
 \end{frame}
 \end{document}
diff --git a/M1_biosciences_clustering/postclustering.tex b/M1_biosciences_clustering/postclustering.tex
new file mode 100644
index 0000000000000000000000000000000000000000..b6038fd7bc40076fbb77538bd48edd995e00e74b
--- /dev/null
+++ b/M1_biosciences_clustering/postclustering.tex
@@ -0,0 +1,186 @@
+\section{Post Clustering Analysis}
+
+
+\begin{frame}
+\frametitle{Clustering analysis in a nutshell}
+\begin{itemize}
+  \item Choose a distance
+  \item Choose an algorithm
+  \item Repeat the analysis for $K = 1$ to $K_{\max}$ clusters
+  \item Choose the number of clusters
+  \item Check the stability of the clusters
+  \item Interpret the clusters
+  \item Clustering is unsupervised: part of the analysis is subjective, so guidelines are needed
+\end{itemize}
+\end{frame}
+
+
+\begin{frame}
+\frametitle{Elbow plot and model selection}
+\begin{columns}[c]
+  \column{.5\textwidth}
+\begin{itemize}
+  \item Choosing the number of clusters is a model selection task
+  \item To choose a model we need a measure of quality of fit
+  $$
+  \widehat{\Ibf}_W(K) = \sum_{i=1}^n \sum_{k=1}^K \widehat{z}_{ik} d^2(\xbf_i,\overline{\xbf}_k)
+  $$
+  \item When $K$ increases, $\widehat{\Ibf}_W(K)$ decreases because clusters become more and more homogeneous
+  \item The elbow plot amounts to finding the best trade-off between quality of fit and a reasonable number of clusters
+\end{itemize}
+\column{.45\textwidth}
+  \begin{center}
+    \includegraphics[scale=0.3]{./figures/elbow_plot.png}
+  \end{center}
+\end{columns}
+\end{frame}
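In practice the elbow plot is produced by fitting the algorithm for each candidate $K$ and recording $\widehat{\Ibf}_W(K)$. A minimal sketch, not part of the slides, assuming scikit-learn and matplotlib (the three-blob toy data is illustrative):

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
# toy data: three Gaussian blobs in 2D
X = np.vstack([rng.normal(mu, 0.5, (50, 2)) for mu in (0, 3, 6)])

Ks = range(1, 11)
inertias = [KMeans(n_clusters=K, n_init=10, random_state=0).fit(X).inertia_
            for K in Ks]

plt.plot(list(Ks), inertias, marker="o")
plt.xlabel("number of clusters K")
plt.ylabel("within-cluster inertia")
plt.show()   # look for the 'elbow' where the decrease levels off
```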
+\begin{frame}
+\frametitle{Intuitions for model selection}
+\begin{itemize}
+  \item Model selection is based on the bias-variance trade-off
+  \item Bias: a model with more parameters approximates the data more closely
+  \item Variance: a model with more parameters has a larger estimation error
+  \item How to find the best trade-off between both trends?
+  \item Model-selection criteria are based on penalization:
+  $$
+  \operatorname{C}_K + \lambda \operatorname{pen}(K)
+  $$
+  \item $C_K$ is a contrast that decreases with the dimension of the model
+  \item $\operatorname{pen}(K)$ is a penalty that increases with the dimension of the model
+  \item $\lambda$ is a penalty constant that tunes the trade-off
+\end{itemize}
+\end{frame}
+
+
+\begin{frame}
+\frametitle{Examples of model selection criteria (not exhaustive)}
+\begin{columns}[c]
+  \column{.5\textwidth}
+\begin{itemize}
+  \item The Akaike Information Criterion
+  $$
+  \operatorname{AIC}_K = n \log \widehat{\Ibf}_W(K) + 2 K
+  $$
+  \item The Bayesian Information Criterion
+  $$
+  \operatorname{BIC}_K = n \log \widehat{\Ibf}_W(K) + K \log(n)
+  $$
+  \item The Integrated Classification Likelihood
+  $$
+  \operatorname{ICL}_K = n \log \widehat{\Ibf}_W(K) + K \log(n) + \sum_{k=1}^K n_k \log n_k
+  $$
+\end{itemize}
+\column{.45\textwidth}
+  \begin{center}
+    \includegraphics[scale=0.25]{./figures/BIC_clustering.png}
+  \end{center}
+\end{columns}
+\end{frame}
+
+\begin{frame}
+\frametitle{Assessing cluster separation with the Silhouette score}
+\begin{itemize}
+  \item Consider clustering results into $K$ clusters with inferred labels $(\widehat{\zbf}_1, \hdots, \widehat{\zbf}_K)$
+  \item For point $i$ assigned to cluster $k$, compute the average distance to the points of the same cluster
+  $$
+  a_i = \frac{1}{n_k-1} \sum_{j \neq i} \widehat{z}_{ik} \widehat{z}_{jk} d^2(\xbf_i,\xbf_j)
+  $$
+  \item Compute the smallest average distance to the points of another cluster
+  $$
+  b_i= \min_{\ell \neq k} \left\{ \frac{1}{n_{\ell}} \sum_{j} \widehat{z}_{ik} \widehat{z}_{j \ell} d^2(\xbf_i,\xbf_j) \right\}
+  $$
+  \item Compute the silhouette score of each point
+  $$
+  s_i = \frac{b_i-a_i}{\max \left\{ a_i,b_i \right\}} \in [-1,1]
+  $$
+\end{itemize}
+\end{frame}
+
+\begin{frame}
+\frametitle{Assessing cluster stability}
+\begin{itemize}
+  \item Consider two clustering results $\widehat{\mathcal{P}}_n^K,\widehat{\mathcal{P}}'^{K'}_{n}$
+  \item The (in)stability of clustering results is defined as the expected distance between partitions (e.g. based on the adjusted Rand Index)
+  $$
+  \mathbb{E} \left\{ d \left( \widehat{\mathcal{P}}_n^K,\widehat{\mathcal{P}}'^{K'}_{n} \right) \right\}
+  $$
+  \item Use sub-sampling to perturb the data and estimate it by
+  $$
+  \frac{1}{B^2} \sum_{b,b'} d \left( \widehat{\mathcal{P}}_{(b)}^K,\widehat{\mathcal{P}}'^{K'}_{(b')} \right)
+  $$
+  \item Possible perturbations: resampling, adding noise, using different dimension reduction methods
+\end{itemize}
+\end{frame}
+
+
+\begin{frame}
+\frametitle{High dimensional setting}
+\begin{columns}[c]
+  \column{.5\textwidth}
+\begin{itemize}
+  \item Distance-based methods are sensitive to increases in dimension
+  \item The geometry of data is modified in high dimension
+  \item Consider a sphere $S(\xbf,R)$ and a cube $C(\xbf,R)$ centered on $\xbf \in \mathbb{R}^p$ with radius $R$
+  $$
+  \frac{\operatorname{Vol} \left[ S(\xbf,R)\right]}{\operatorname{Vol} \left[ C(\xbf,R)\right]} = \frac{2 R^p \pi^{p/2}/ p\Gamma(p/2)}{2^p R^p}
+  $$
+  $$
+  \frac{\operatorname{Vol} \left[ S(\xbf,R)\right]}{\operatorname{Vol} \left[ C(\xbf,R)\right]} \underset{p \to \infty}{\longrightarrow} 0
+  $$
+  \item The sphere occupies a vanishing fraction of the cube
+\end{itemize}
+\column{.45\textwidth}
+  \begin{center}
+    \includegraphics[scale=0.7]{./figures/sphere.pdf} \\
+    From C. Azencott
+  \end{center}
+\end{columns}
+\end{frame}
+
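The volume ratio on the previous frame is easy to evaluate numerically; working in log-space avoids overflow of $\Gamma(p/2+1)$. A short sketch, not from the deck, assuming numpy and scipy:

```python
import numpy as np
from scipy.special import gammaln

def log_volume_ratio(p, R=1.0):
    """log( Vol[S(x, R)] / Vol[C(x, R)] ) in dimension p."""
    # Vol[S] = pi^(p/2) R^p / Gamma(p/2 + 1),  Vol[C] = (2R)^p
    log_sphere = (p / 2) * np.log(np.pi) + p * np.log(R) - gammaln(p / 2 + 1)
    log_cube = p * np.log(2 * R)
    return log_sphere - log_cube

for p in (2, 10, 50, 100):
    print(p, np.exp(log_volume_ratio(p)))   # ~0.785 for p=2, then -> 0 fast
```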
+\begin{frame}
+\frametitle{Many points are needed to fill the space}
+\begin{columns}[c]
+  \column{.5\textwidth}
+\begin{itemize}
+  \item Number of points $(\xbf_1,\hdots, \xbf_n)$ needed to cover the cube $[0,1]^p$ with the balls $S(\xbf_1,1),\hdots,S(\xbf_n,1)$ \\
+  \vspace{0.5cm}
+  \begin{tabular}{lllll}
+    $p$ & 20 & 30 & 50 & 100 \\
+    \hline
+    $n$ & 39 & 45,630 & $5.7 \times 10^{12}$ & $42 \times 10^{39}$
+  \end{tabular}
+  \vspace{0.5cm}
+  \item High dimensional spaces are empty!
+  \item Points are far apart
+\end{itemize}
+\column{.45\textwidth}
+  \begin{center}
+    \includegraphics[scale=0.25]{./figures/distances_highdim.pdf} \\
+    From C. Giraud
+  \end{center}
+\end{columns}
+\end{frame}
+
+
+\begin{frame}
+\frametitle{Clustering in high dimension}
+\begin{columns}[c]
+  \column{.5\textwidth}
+\begin{itemize}
+  \item Dimension reduction is mandatory for clustering in high dimension
+  \item Combine dimension reduction (DR) and clustering
+  \item Use feature selection
+  \item Try different DR methods
+  \item Try different clustering methods
+  \item Interpret clusters in the input space!
+\end{itemize}
+\column{.45\textwidth}
+  \begin{center}
+    \includegraphics[scale=0.18]{./figures/netwizz.png}
+  \end{center}
+\end{columns}
+\end{frame}
\ No newline at end of file
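To close, a minimal end-to-end sketch of the "DR + clustering" recipe from the last frame, assuming scikit-learn; the simulated high-dimensional data, the number of PCA components, and $K$ are hypothetical choices that would need to be tuned in practice:

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(3)
# two groups that differ only on the first 5 of 1000 features
n, p, informative = 200, 1000, 5
X = rng.normal(size=(n, p))
X[: n // 2, :informative] += 3.0

Z = PCA(n_components=10).fit_transform(X)    # dimension reduction first
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(Z)

print(silhouette_score(Z, labels))           # assess cluster separation
```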