Skip to content
Snippets Groups Projects
Verified Commit b156ab32 authored by Laurent Modolo's avatar Laurent Modolo
Browse files

version v0.0.8

parent 85dab5be
Branches
Tags
No related merge requests found
Showing
with 303 additions and 51 deletions
Package: kmerclust
Title: Kmerclust A Small Package To Test If The Kmer Count Correspond To A
XY Or XO Genome
Version: 0.0.7
Version: 0.0.8
Authors@R:
person("Laurent", "Modolo", , "laurent.modolo@ens-lyon.fr", role = c("aut", "cre"),
comment = c(ORCID = "0000-0002-7606-4110"))
......@@ -31,7 +31,7 @@ Suggests:
VignetteBuilder:
knitr
Config/fusen/version: 0.5.2
Date/Publication: 2023/10/06
Date/Publication: 2023/10/20
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
......@@ -9,8 +9,10 @@ export(compare_models_constaint)
export(compute_tpm)
export(e_bipoiss)
export(e_bipoiss_clust)
export(e_bipoiss_clust_batch)
export(em_bipoiss)
export(em_bipoiss_clust)
export(get_cluster_poiss)
export(loglik_bipoiss)
export(loglik_bipoiss_clust)
export(m_bipoiss)
......
......@@ -11,10 +11,10 @@
bic_bipoiss_clust <- function(x, params, loglik) {
k <- 3
if (length(params) >= 2) {
k <- k + 3
k <- k + 5
}
if (length(params) >= 3) {
k <- k + 3
k <- k + 4
}
return(k * log(nrow(x)) - 2 * loglik)
}
......@@ -12,13 +12,18 @@ e_bipoiss_clust <- function(x, params) {
s <- matrix(0, nrow(x), length(params))
h <- s
for (i in 1:length(params)) {
suppressWarnings({
h[, i] <- extraDistr::dbvpois(x, a = params[[i]]$l1, b = params[[i]]$l2, c = params[[i]]$l3)
s[id, i] <- params[[i]]$l3 *
extraDistr::dbvpois(x[id, ] - 1, a = params[[i]]$l1, b = params[[i]]$l2, c = params[[i]]$l3) /
h[id, i]
h[, i] <- params[[i]]$kappa * h[, i]
})
}
s[is.na(s)] <- 0
h <- h / rowSums(h)
h[is.na(h)] <- 0
denum <- rowSums(h)
h[denum > 0, ] <- h[denum > 0, ] / denum[denum > 0]
h[denum == 0, ] <- 1 / ncol(h)
return(list(s = s, h = h))
}
# WARNING - Generated by {fusen} from dev/flat_full_poisson.Rmd: do not edit by hand
#' run the expetation step for each patch iteratively
#'
#' @param x the data matrix (2 columns)
#' @param params the list of 3 lambda
#' @param hidden the list of hidden variable
#' @param nbatch the number of batch
#' @return a list of hidden variable
#' @export
e_bipoiss_clust_batch <- function(x, params, hidden = NULL, nbatch = 10) {
if (is.null(hidden)) {
hidden <- list()
hidden$current <- 1
hidden$batch <- rep(1:nbatch, round(nrow(x) / nbatch) + 1)[1:nrow(x)]
hidden$s <- matrix(0, nrow(x), length(params))
hidden$h <- hidden$s
for (i in 1:length(params)) {
hidden$s[, i] <- rpois(nrow(x), params[[1]]$l3)
hidden$h[, i] <- runif(nrow(x))
}
hidden$h <- hidden$h / rowSums(hidden$h)
}
res <- e_bipoiss_clust(x[hidden$batch == hidden$current, ], params)
hidden$s[hidden$batch == hidden$current, ] = res$s
hidden$h[hidden$batch == hidden$current, ] = res$h
hidden$current <- hidden$current + 1
if (hidden$current > nbatch) {
hidden$current <- 1
}
if(any(is.na(hidden$h))) {
stop("error in e_bipoiss_clust_batch h")
}
if(any(is.na(hidden$s))) {
stop("error in e_bipoiss_clust_batch s")
}
return(hidden)
}
......@@ -4,7 +4,8 @@
#'
#' @param x the data matrix (2 columns)
#' @param params the list of 3 lambda (initial value)
#' @param frac (default: 1.) fraction of the data to use at each step
#' @param gamma a vector of kappa prior
#' @param nbatch (default: 10) the number of batch
#' @param threshold (default: c(0.1, 5)) maximum difference between kappa and lambda
#' @param max_iter (default: 100) maximum number of iteration
#' @return a list of 3 lambda
......@@ -21,41 +22,31 @@ em_bipoiss_clust <- function(x, params = list(
A = list(kappa = 0.4, l1 = mean(x), l2 = mean(x), l3 = mean(x)),
X = list(kappa = 0.3, l1 = 1, l2 = mean(x), l3 = mean(x)),
Y = list(kappa = 0.3, l1 = mean(x), l2 = 1, l3 = mean(x))
), frac = 1., threshold = c(0.1, 1), max_iter = 100) {
size <- round(nrow(x) * frac)
),
gamma = c(round(nrow(x) * .8), round(nrow(x) * .15), round(nrow(x) * .05)),
nbatch = 10, threshold = c(0.1, 1), max_iter = 100) {
old_params <- params
old_params[[1]]$kappa <- Inf
hidden <- NULL
iter <- 0
while (param_diff_poiss(params, old_params, threshold = threshold) & iter < max_iter) {
old_params <- params
if (size < nrow(x)) {
id <- sample(1:nrow(x), size = size)
} else {
id <- 1:nrow(x)
}
hidden <- e_bipoiss_clust(x[id, ], params)
params <- m_bipoiss_clust(x[id, ], s = hidden$s, h = hidden$h)
hidden <- e_bipoiss_clust_batch(x, params = params, hidden = hidden, nbatch = nbatch)
params <- m_bipoiss_clust(x, s = hidden$s, h = round(hidden$h), gamma = gamma)
iter <- iter + 1
}
res <- params
if (length(res) == 1) {
names(res) <- "A"
print(paste(res$A$kappa, res$A$l1, res$A$l2, res$A$l3))
}
if (length(res) == 2) {
names(res) <- c("A", "X")
print(paste(res$A$kappa, res$A$l1, res$A$l2, res$A$l3))
print(paste(res$X$kappa, res$X$l1, res$X$l2, res$X$l3))
}
if (length(res) == 3) {
names(res) <- c("A", "X", "Y")
print(paste(res$A$kappa, res$A$l1, res$A$l2, res$A$l3))
print(paste(res$X$kappa, res$X$l1, res$X$l2, res$X$l3))
print(paste(res$Y$kappa, res$Y$l1, res$Y$l2, res$Y$l3))
}
res$loglik <- loglik_bipoiss_clust(x, params)
res$BIC <- bic_bipoiss_clust(x, params, loglik = res$loglik)
print(res$loglik)
print(res$BIC)
res$loglik <- loglik_bipoiss_clust(x, params = params, gamma = gamma)
res$BIC <- bic_bipoiss_clust(x, params = params, loglik = res$loglik)
res <- get_cluster_poiss(x, res = res, params = params, hidden = hidden)
return(res)
}
# WARNING - Generated by {fusen} from dev/flat_full_poisson.Rmd: do not edit by hand
#' add the cluster assigment and proba to the results
#'
#' @param x the data matrix (2 columns)
#' @param res the list of the results
#' @param params the list of 3 lambda
#' @param hidden the list of hidden variable
#' @return a bool
#' @export
get_cluster_poiss <- function(x, res, params, hidden) {
chr_name <- c("A", "X", "Y")
hidden <- e_bipoiss_clust(x, params)
res$h <- hidden$h
res$chromosome <- chr_name[
apply(res$h, 1, function(x){which(x == max(x))[1]})
]
return(res)
}
......@@ -4,21 +4,22 @@
#'
#' @param x the data matrix (2 columns)
#' @param params the list of 3 lambda
#' @param gamma a vector of kappa prior
#' @return logliklihood
#' @importFrom extraDistr dbvpois
#' @importFrom Boom ddirichlet
#' @export
loglik_bipoiss_clust <- function(x, params) {
loglik_bipoiss_clust <- function(x, params, gamma) {
loglik <- matrix(0, nrow(x), length(params))
kappa <- c()
for (i in 1:length(params)) {
loglik[, i] <- params[[i]]$kappa *
extraDistr::dbvpois(x = x, a = params[[i]]$l1, b = params[[i]]$l2, c = params[[i]]$l3)
kappa <- c(kappa, params[[i]]$kappa)
}
loglik <- sum(log(rowSums(loglik)))
if (is.infinite(loglik)) {
print("Inf loglik")
}
if (is.na(loglik)) {
print("NA loglik")
if (length(kappa) > 1) {
loglik <- loglik + Boom::ddirichlet(probabilities = kappa, nu = gamma, logscale = T)
}
return(loglik)
}
......@@ -4,15 +4,25 @@
#'
#' @param x the data matrix (2 columns)
#' @param s of size 3 time nrow(x)
#' @param h of size 3 time nrow(x)
#' @param gamma a vector of kappa prior
#' @param m_chr the maximization function for each type of chromosome
#' @return a list of 3 lambda
#' @importFrom extraDistr dbvpois
#' @export
m_bipoiss_clust <- function(x, s, h,
m_bipoiss_clust <- function(x, s, h, gamma,
m_chr = list(m_bipoiss_clust_a_chr, m_bipoiss_clust_x_chr, m_bipoiss_clust_y_chr)) {
params <- list()
gamma_sum <- sum(gamma)
kappa <- c()
for (i in 1:ncol(h)) {
params[[i]] <- m_chr[[i]](x, s[, i], h[, i])
params[[i]]$kappa <- sum(h[, i]) / nrow(h)
params[[i]]$kappa <- (sum(h[, i]) + gamma[i] - 1) / (nrow(h) + gamma_sum - ncol(h))
kappa <- c(kappa, params[[i]]$kappa)
}
params[[1]]$kappa <- 1
if (length(kappa) > 1) {
params[[1]]$kappa <- 1 - sum(kappa[2:length(kappa)])
}
return(params)
}
......@@ -53,11 +63,15 @@ m_bipoiss_clust_a_chr <- function(x, s, h) {
sum((x[, 1] - s) * h) / denum +
sum((x[, 2] - s) * h) / denum
) / 2
return(list(
res <- list(
l1 = lambda_global,
l2 = lambda_global,
l3 = sum(s * h) / denum
))
)
for (i in which(is.na(res))) {
res[[i]] <- 1
}
return(res)
}
#' maximisation step for bi-Poisson EM
......@@ -73,11 +87,15 @@ m_bipoiss_clust_a_chr <- function(x, s, h) {
#' em_bipoiss(m_bipoiss = m_bipoiss_x_chr)
m_bipoiss_clust_x_chr <- function(x, s, h) {
denum <- sum(h)
return(list(
res <- list(
l1 = 1,
l2 = sum((x[, 2] - s) * h) / denum,
l3 = sum(s * h) / denum
))
)
for (i in which(is.na(res))) {
res[[i]] <- 1
}
return(res)
}
......@@ -94,9 +112,13 @@ m_bipoiss_clust_x_chr <- function(x, s, h) {
#' em_bipoiss(m_bipoiss = m_bipoiss_y_chr)
m_bipoiss_clust_y_chr <- function(x, s, h) {
denum <- sum(h)
return(list(
res <- list(
l1 = sum((x[, 1] - s) * h) / denum,
l2 = 1,
l3 = sum(s * h) / denum
))
)
for (i in which(is.na(res))) {
res[[i]] <- 1
}
return(res)
}
......@@ -35,6 +35,10 @@ param_diff_poiss <- function(
} else {
j <- 2
}
if (is.na(params[[i]][[param]])) {
warning(paste("param_diff_poiss: parameter value: ", i, ":", params, "\n"))
return(T)
}
if (abs(params[[i]][[param]] - old_params[[i]][[param]]) > threshold[j])
return(T)
}
......
# WARNING - Generated by {fusen} from dev/flat_full_poisson.Rmd: do not edit by hand
#' Function to compile statistics bootstrap samples of the data
#'
#' @param x a matrix of male female kmers counts
#' @param params a list of init parameters
#' @param gamma a vector of kappa prior
#' @param threshold theshold to stop the EM algorithm
#' @param nboot number of boostrap sample
#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
#' @param core number of cpus to use for the computation
#' @param max_iter (default: 100) maximum number of iteration
#' @param max_error (default: 5) maximum number of try
#' @return tibble of the statistics
#' @importFrom dplyr sample_n sample_frac
#' @importFrom parallel mclapply
#' @examples
#' rbind(
#' extraDistr::rbvpois(1000, 10, 10, 20), # A
#' extraDistr::rbvpois(500, 0, 20, 20), # X
#' extraDistr::rbvpois(100, 20, 0, 5) # Y
#' ) %>% pois_boostrap_EM(nboot = 10)
#' @noRd
poiss_boostrap_EM_sub <- function(iter, x, bootsize, params, gamma, max_iter, max_error) {
res <- list()
res$loglik <- c(NA)
iter <- 0
while (is.na(res$loglik) | iter < max_error) {
res <- x %>%
as_tibble() %>%
sample_frac(bootsize, replace = T) %>%
as.matrix() %>%
em_bipoiss_clust(params = params, gamma = gamma, max_iter = max_iter)
iter <- iter + 1
}
return(tibble(loglik = res$loglik, BIC = res$BIC))
}
#' Function to compile statistics bootstrap samples of the data
#'
#' @param x a matrix of male female kmers counts
#' @param params a list of init parameters
#' @param gamma a vector of kappa prior
#' @param threshold theshold to stop the EM algorithm
#' @param nboot number of boostrap sample
#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
#' @param core number of cpus to use for the computation
#' @param max_iter (default: 100) maximum number of iteration
#' @param max_error (default: 5) maximum number of try
#' @return tibble of the statistics
#' @importFrom dplyr sample_n sample_frac
#' @importFrom parallel mclapply
#' @examples
#' rbind(
#' extraDistr::rbvpois(1000, 10, 10, 20), # A
#' extraDistr::rbvpois(500, 0, 20, 20), # X
#' extraDistr::rbvpois(100, 20, 0, 5) # Y
#' ) %>% pois_boostrap_EM(nboot = 10)
#' @noRd
poiss_boostrap_EM <- function(x, sex = "XY", threshold = 1, nboot = 100, bootsize = 1, core = 1, max_iter = 100, max_error = 5) {
params = list(
A = list(kappa = 0.4, l1 = mean(x), l2 = mean(x), l3 = mean(x))
)
gamma <- c(round(nrow(x) * bootsize))
if (sex %in% c("XO", "XY")) {
params$X <- list(kappa = 0.3, l1 = 1, l2 = mean(x), l3 = mean(x))
gamma <- c(round(nrow(x) * bootsize) * .8, round(nrow(x) * bootsize) * .2)
}
if (sex == "XY") {
params$Y <- list(kappa = 0.3, l1 = mean(x), l2 = 1, l3 = mean(x))
gamma <- c(round(nrow(x) * bootsize) * .8, round(nrow(x) * bootsize) * .15, round(nrow(x) * bootsize) * .05)
}
if (core == 1) {
res <- lapply(as.list(1:nboot), FUN = poiss_boostrap_EM_sub, x = x, bootsize = bootsize, params = params, gamma = gamma, max_iter = max_iter, max_error = max_error)
} else {
res <- parallel::mclapply(as.list(1:nboot), FUN = poiss_boostrap_EM_sub, x = x, bootsize = bootsize, params = params, gamma = gamma, max_iter = max_iter, max_error = max_error, mc.cores = core)
}
return(res)
}
#' Function to compile statistics differences between model XY and XO on boostrap
#' samples of the data
#'
#' @param x a matrix of male female kmers counts
#' @param threshold theshold to stop the EM algorithm
#' @param nboot number of boostrap sample
#' @param bootsize size of the boostrap sample (if < 0 we take a percentage of x)
#' @param core number of cpus to use for the computation
#' @param max_iter (default: 100) maximum number of iteration
#' @param max_error (default: 5) maximum number of try
#' @return tibble with the statistics
#' @importFrom tidyr pivot_longer
#' @export
#' @examples
#' res <- rbind(
#' extraDistr::rbvpois(1000, 10, 10, 20), # A
#' extraDistr::rbvpois(1000, 0, 20, 20), # X
#' extraDistr::rbvpois(1000, 20, 0, 5) # Y
#' ) %>%
#' poiss_compare_models(nboot = 3, core = 1)
poiss_compare_models <- function(x, threshold = 1, nboot = 100, bootsize = 1, core = 1, max_iter = 100, max_error = 5) {
tibble(
model_XY = poiss_boostrap_EM(x, sex = "XY", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core, max_iter = max_iter, max_error = max_error),
model_XO = poiss_boostrap_EM(x, sex = "XO", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core, max_iter = max_iter, max_error = max_error),
model_OO = poiss_boostrap_EM(x, sex = "OO", threshold = threshold, nboot = nboot, bootsize = bootsize, core = core, max_iter = max_iter, max_error = max_error)
) %>%
pivot_longer(cols = c(model_XY, model_XO, model_OO)) %>%
unnest(c(value))
}
......@@ -21,7 +21,7 @@ fusen::fill_description(
`Authors@R` = c(
person("Laurent", "Modolo", email = "laurent.modolo@ens-lyon.fr", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-7606-4110"))
),
Version = "0.0.7",
Version = "0.0.8",
"Date/Publication" = format(Sys.time(), "%Y/%m/%d")
),
overwrite = T
......
"type","path","origin"
"R","R/e_bipoiss_clust_batch.R","Possibly deprecated file. Please check its link with detected flat source: dev/flat_full_poisson.Rmd"
"R","R/get_cluster_poiss.R","Possibly deprecated file. Please check its link with detected flat source: dev/flat_full_poisson.Rmd"
"R","R/kmerclust-package.R","No existing source path found."
"R","R/poiss_boostrap_em_sub.R","Possibly deprecated file. Please check its link with detected flat source: dev/flat_full_poisson.Rmd"
"R","R/sim_kmer.R","Possibly deprecated file. Please check its link with detected flat source: dev/flat_full.Rmd"
"R","R/utils-pipe.R","No existing source path found."
---
title: "flat_full_bayesian.Rmd for working package"
title: "Getting started bayesian version"
output: html_document
editor_options:
chunk_output_type: inline
......@@ -722,7 +722,7 @@ test_model_XY <- function(data) {
)
)
}
model_XO <- test_model_XO(data)
model_XO <- test_model_XY(data)
plot_model(model_XO)
```
......
---
title: "flat_full_poisson.Rmd for working package"
title: "Getting started Poisson version"
output: html_document
editor_options:
chunk_output_type: inline
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/e_bipoiss_clust_batch.R
\name{e_bipoiss_clust_batch}
\alias{e_bipoiss_clust_batch}
\title{run the expetation step for each patch iteratively}
\usage{
e_bipoiss_clust_batch(x, params, hidden = NULL, nbatch = 10)
}
\arguments{
\item{x}{the data matrix (2 columns)}
\item{params}{the list of 3 lambda}
\item{hidden}{the list of hidden variable}
\item{nbatch}{the number of batch}
}
\value{
a list of hidden variable
}
\description{
run the expetation step for each patch iteratively
}
......@@ -9,7 +9,8 @@ em_bipoiss_clust(
params = list(A = list(kappa = 0.4, l1 = mean(x), l2 = mean(x), l3 = mean(x)), X =
list(kappa = 0.3, l1 = 1, l2 = mean(x), l3 = mean(x)), Y = list(kappa = 0.3, l1 =
mean(x), l2 = 1, l3 = mean(x))),
frac = 1,
gamma = c(round(nrow(x) * 0.8), round(nrow(x) * 0.15), round(nrow(x) * 0.05)),
nbatch = 10,
threshold = c(0.1, 1),
max_iter = 100
)
......@@ -19,7 +20,9 @@ em_bipoiss_clust(
\item{params}{the list of 3 lambda (initial value)}
\item{frac}{(default: 1.) fraction of the data to use at each step}
\item{gamma}{a vector of kappa prior}
\item{nbatch}{(default: 10) the number of batch}
\item{threshold}{(default: c(0.1, 5)) maximum difference between kappa and lambda}
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_cluster_poiss.R
\name{get_cluster_poiss}
\alias{get_cluster_poiss}
\title{add the cluster assigment and proba to the results}
\usage{
get_cluster_poiss(x, res, params, hidden)
}
\arguments{
\item{x}{the data matrix (2 columns)}
\item{res}{the list of the results}
\item{params}{the list of 3 lambda}
\item{hidden}{the list of hidden variable}
}
\value{
a bool
}
\description{
add the cluster assigment and proba to the results
}
......@@ -4,12 +4,14 @@
\alias{loglik_bipoiss_clust}
\title{logliklihood for bi-Poisson EM}
\usage{
loglik_bipoiss_clust(x, params)
loglik_bipoiss_clust(x, params, gamma)
}
\arguments{
\item{x}{the data matrix (2 columns)}
\item{params}{the list of 3 lambda}
\item{gamma}{a vector of kappa prior}
}
\value{
logliklihood
......
......@@ -8,6 +8,7 @@ m_bipoiss_clust(
x,
s,
h,
gamma,
m_chr = list(m_bipoiss_clust_a_chr, m_bipoiss_clust_x_chr, m_bipoiss_clust_y_chr)
)
}
......@@ -15,6 +16,12 @@ m_bipoiss_clust(
\item{x}{the data matrix (2 columns)}
\item{s}{of size 3 time nrow(x)}
\item{h}{of size 3 time nrow(x)}
\item{gamma}{a vector of kappa prior}
\item{m_chr}{the maximization function for each type of chromosome}
}
\value{
a list of 3 lambda
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment