diff --git a/4_clustering/clustering.Rmd b/4_clustering/clustering.Rmd index f3cc1a8a1b0787a848af6d086325d3590e4d52c8..a0aef1ec1fa3c69b3589f0988903af26ffc41a9d 100644 --- a/4_clustering/clustering.Rmd +++ b/4_clustering/clustering.Rmd @@ -19,16 +19,16 @@ classoption: aspectratio=169 # Introduction -## Programme +## Program 1. Single-cell RNASeq data from 10X Sequencing (Friday 3 June 2022 - 14:00) 2. Normalization and spurious effects (Wednesday 8 June 2022 - 14:00) 3. Dimension reduction and data visualization (Monday 13 June 2022 - 15:00) 4. Clustering and annotation (Thursday 30 June 2022 2022 - 14:00) 5. Pseudo-time and velocity inference (Friday 8 July 2022 - 14:00) -6. Differental expression analysis (Monday 11 July 2022 - 15:30) +6. Differential expression analysis (Monday 11 July 2022 - 15:30) -## Programme +## Program 1. Single-cell RNASeq data from 10X Sequencing (Friday 3 June 2022 - 14:00) 2. Normalization and spurious effects (Wednesday 8 June 2022 - 14:00) @@ -38,9 +38,9 @@ classoption: aspectratio=169 - Clustering - Classification 5. Pseudo-time and velocity inference (Friday 8 July 2022 - 14:00) -6. Differental expression analysis (Monday 11 July 2022 - 15:30) +6. 
Differential expression analysis (Monday 11 July 2022 - 15:30) -## Different kind of clustering +## Different kinds of clustering \includegraphics[width=\textwidth]{img/learning_type.png} @@ -103,7 +103,7 @@ We have up to $10^6$ rows and $10^6$ columns (cells) ### Hermann-Minkowski ${\left(\sum_{i=1}^n (x_i-y_i)^p\right)^{\frac{1}{p}}}$ \begin{center} -\includegraphics[width=0.7\textwidth]{img/hermann-minkowski.png} +\includegraphics[width=0.6\textwidth]{img/hermann-minkowski.png} \end{center} ## Statistical divergence @@ -273,8 +273,8 @@ ggsave("img/cor_kendall.pdf", width = 4, height = 4) Where: \begin{itemize} -\item $A$ is number of concordant pairs -\item $B$ is number of discordant pairs +\item $A$ is the number of concordant pairs +\item $B$ is the number of discordant pairs \item ${n \choose 2} = {n (n-1) \over 2}$ is the Binomial coefficient for the number of ways to choose $2$ items from $n$ items. \end{itemize} \end{columns} @@ -403,7 +403,7 @@ The algorithm assigns each point to the closest centroid to get initial clusters \column{0.5\textwidth} For every cluster, the algorithm recomputes the centroid by taking the average of all points in the cluster. -Since the centroids change, the algorithm then re-assigns the points to the closest centroid. +Since the centroids change, the algorithm then reassigns the points to the closest centroid. 
\end{columns} \end{center} @@ -439,6 +439,7 @@ When clustering large datasets, you stop the algorithm before reaching convergen } \column{0.5\textwidth} Compute the Within-Cluster-Sum of Squared Errors (WSS) (each point vs the centroid) for different values of $k$ +\[\sum_{i=1}^n (x_i - c_i)^2\] \end{columns} \end{center} @@ -461,8 +462,8 @@ The silhouette value measures how similar a point is to its own cluster (cohesio with \begin{itemize} - \item $a(i)$ the mean distance between $i$ and aussi cells in the same cluster - \item $b(i)$ the mean distance between $i$ and aussi cells in different clusters + \item $a(i)$ the mean distance between $i$ and cells in the same cluster + \item $b(i)$ the mean distance between $i$ and cells in different clusters \end{itemize} We plot $\frac{1}{n}\sum_{i=1}^n s(i)$ @@ -572,16 +573,15 @@ At each step we merge clusters with their closest neighbor } \end{center} -# Neighberhood graph +# Neighborhood graph ## $k$-NN graph -### Instead of considering every the relations between every cells we can focus on $k$ neighbors +### Instead of considering all the relations between every pair of cells, we can focus on the $k$ nearest neighbors \begin{center} \href{http://www.biomedcentral.com/1471-2164/13/S7/S27}{ \includegraphics[width=\textwidth]{img/knn_k2.png} } -k = 2 \end{center} ## SNN graph @@ -592,12 +592,27 @@ k = 2 \href{http://www.biomedcentral.com/1471-2164/13/S7/S27}{ \includegraphics[width=\textwidth]{img/knn_k2.png} } -k = 2 +\end{center} +The weight of an edge is the number of neighbors shared by the kNN neighborhoods of its two points + +## Louvain algorithm + +### Move nodes to optimize the modularity +\[Q = \frac{1}{2m}\sum\limits_{ij}\bigg[A_{ij} - \frac{k_i k_j}{2m}\bigg]\delta (c_i,c_j)\] + +where: +\begin{itemize} + \item $A_{ij}$ represents the edge weight between nodes $i$ and $j$; + \item $k_i$ and $k_j$ are the sums of the weights of the edges attached to nodes $i$ and $j$, respectively; + \item $m$ is the sum of all of the edge weights in the 
graph; + \item $c_i$ and $c_j$ are the communities of the nodes; and + \item $\delta (x,y)= 1$ if $x=y$, $0$ otherwise. +\end{itemize} ## Louvain algorithm -### Move nodes to optimized the modularity +### Move nodes to optimize the modularity \begin{columns} \column{0.5\textwidth} @@ -639,7 +654,7 @@ Measures the density of links inside communities compared to links between commu ## Louvain algorithm -### Move nodes to optimized the modularity +### Move nodes to optimize the modularity \begin{columns} \column{0.5\textwidth} @@ -659,6 +674,14 @@ Measures the density of links inside communities compared to links between commu Measures the density of links inside communities compared to links between communities +## Louvain algorithm + +\begin{center} +\href{https://www.nature.com/articles/s41467-021-27464-5}{ + \includegraphics[width=0.7\textwidth]{img/paper_cluster_louvain.png} +} +\end{center} + ## Validation methods \begin{center} @@ -671,14 +694,14 @@ Partition data in 2 and compare the two clustering ### Adjusted mutual information (AMI) -**AMI** takes a value of $1$ when the two clusterings are identical and $0$ when the **MI** between two partitions equals the value expected due to chance alone. +**AMI** takes a value of $1$ when the two clusterings are identical and $0$ when the **MI** between two partitions equals the value expected due to chance alone. 
### V-measure -Geometric mean between the **homogeneity** (how much the sample in a cluster are similar) and the **Completeness** (how much similar samples are put together by the clustering algorithm) +Harmonic mean between the **homogeneity** (how similar the samples in a cluster are) and the **completeness** (how much similar samples are put together by the clustering algorithm) \begin{center} -Cell which flip assignement must be labeled as ambiguous +Cells which flip assignment must be labeled as ambiguous \end{center} ## SC3 @@ -691,7 +714,7 @@ Cell which flip assignement must be labeled as ambiguous ## Mixture model -### We model the probability of belonging to one groupe +### We model the probability of belonging to one group \begin{columns} \column{0.5\textwidth} @@ -706,7 +729,7 @@ We can fit with \begin{itemize} - \item $K$ the number of cluster + \item $K$ the number of clusters \item $\alpha_i$ the proportion of cluster $i$ \item $\theta$ the model parameters \end{itemize} @@ -715,7 +738,7 @@ with ## Mixture model -### We cannot direcly cluster the multidimentional distribution of gene expression +### We cannot directly cluster the multidimensional distribution of gene expression \begin{center} \href{https://doi.org/10.1093/bioinformatics/btac136}{ @@ -725,7 +748,7 @@ with ## Mixture model -### We cannot direcly cluster the multidimentional distribution of gene expression +### We cannot directly cluster the multidimensional distribution of gene expression \begin{center} \href{https://doi.org/10.1093/bioinformatics/btac136}{ @@ -749,7 +772,7 @@ with \begin{center} \href{https://www.nature.com/articles/s42256-019-0037-0/}{ - \includegraphics[width=\textwidth]{img/scDeepCluster.png} + \includegraphics[width=0.8\textwidth]{img/scDeepCluster.png} } \end{center} @@ -771,6 +794,14 @@ with } \end{center} +## clusterProfiler + +\begin{center} +\href{https://www.nature.com/articles/s41467-021-27464-5}{ + 
\includegraphics[width=0.7\textwidth]{img/paper_cluster_id.png} +} +\end{center} + ## Harmony \begin{center} @@ -787,6 +818,15 @@ with } \end{center} +## Harmony + +\begin{center} +\href{https://www.nature.com/articles/s41467-021-27464-5}{ + \includegraphics[width=0.7\textwidth]{img/paper_harmony.png} +} +\end{center} + + ## GNN \begin{center} diff --git a/4_clustering/img/paper_cluster.png b/4_clustering/img/paper_cluster.png new file mode 100644 index 0000000000000000000000000000000000000000..273005a225231b5056108e3c73119dd9a1d9b335 Binary files /dev/null and b/4_clustering/img/paper_cluster.png differ diff --git a/4_clustering/img/paper_cluster_id.png b/4_clustering/img/paper_cluster_id.png new file mode 100644 index 0000000000000000000000000000000000000000..44ad45ccf38e7cbf6e4796daedaeb5c2134a3656 Binary files /dev/null and b/4_clustering/img/paper_cluster_id.png differ diff --git a/4_clustering/img/paper_cluster_louvain.png b/4_clustering/img/paper_cluster_louvain.png new file mode 100644 index 0000000000000000000000000000000000000000..c2a6ec7c605763821fa6524a4c778948d4209df1 Binary files /dev/null and b/4_clustering/img/paper_cluster_louvain.png differ diff --git a/4_clustering/img/paper_harmony.png b/4_clustering/img/paper_harmony.png new file mode 100644 index 0000000000000000000000000000000000000000..b06d2328c673370e220940be00a770eddf8a3c84 Binary files /dev/null and b/4_clustering/img/paper_harmony.png differ