Select Git revision
docker_init.sh
Forked from
LBMC / nextflow
Source project has a limited visibility.
tdd_analysis_boostedtree.R 10.79 KiB
################################################################################
################################ boosted tree ##################################
################################################################################
source("src/tdd_analysis_load.R")
tuneplot <- function(x, probs = .90) {
ggplot(x) +
coord_cartesian(
ylim = c(quantile(x$results$RMSE, probs = probs), min(x$results$RMSE))
) +
theme_bw()
}
fitControl <-
trainControl(
method = "cv",
number = 3,
allowParallel = T
)
# fit model with base parameter
grid_default <- expand.grid(
nrounds = 100,
max_depth = 6,
eta = 0.3,
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
bt_train_base <- tdd_index %>%
dplyr::select(-transcript_id) %>%
train(
tdd_index ~ .,
data = .,
method = "xgbTree",
trControl = fitControl,
tuneGrid = grid_default,
verbose = T
)
save(
bt_train_base,
file = "results/bt_train_base.Rdata")
load(file = "results/bt_train_base.Rdata")
# compute model rmse
rmse <- function(error) {
sqrt(mean(error^2))
}
tb_pred <- predict(
bt_train_base,
tdd_index %>%
dplyr::select(-transcript_id) %>%
dplyr::select(-tdd_index)
)
rmse(tdd_index %>% dplyr::pull(tdd_index) - tb_pred)
nrounds <- 1000
# grid search for eta and depth
tune_grid_r1 <- expand.grid(
nrounds = seq(from = 200, to = nrounds, by = 50),
eta = c(0.025, 0.05, 0.1, 0.3, 0.4, 0.5, 0.6),
max_depth = 1:20,
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
save(
tune_grid_r1,
file = "results/bt_grid_r1.Rdata")
bt_train_r1 <- tdd_index %>%
dplyr::select(-transcript_id) %>%
train(
tdd_index ~ .,
data = .,
method = "xgbTree",
trControl = fitControl,
tuneGrid = tune_grid_r1,
verbose = T
)
save(
bt_train_r1,
file = "results/bt_train_r1.Rdata")
load(file = "results/bt_train_r1.Rdata")
tuneplot(bt_train_r1)
bt_train_r1$bestTune
# grid search for Maximum Depth and Minimum Child Weight
nrounds <- 3000
tune_grid_r2 <- expand.grid(
nrounds = seq(from = 1000, to = nrounds, by = 100),
eta = bt_train_r1$bestTune$eta,
max_depth = bt_train_r1$bestTune$max_depth,
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1:10,
subsample = 1
)
save(
tune_grid_r2,
file = "results/bt_grid_r2.Rdata")
bt_train_r2 <- tdd_index %>%
dplyr::select(-transcript_id) %>%
train(
tdd_index ~ .,
data = .,
method = "xgbTree",
trControl = fitControl,
tuneGrid = tune_grid_r2,
verbose = T
)
save(
bt_train_r2,
file = "results/bt_train_r2.Rdata")
load(file = "results/bt_train_r2.Rdata")
tuneplot(bt_train_r2)
bt_train_r2$bestTune
# grid search for Column and Row Sampling
nrounds <- 2000
tune_grid_r3 <- expand.grid(
nrounds = seq(from = 50, to = nrounds, by = 50),
eta = bt_train_r1$bestTune$eta,
max_depth = bt_train_r2$bestTune$max_depth,
gamma = 0,
colsample_bytree = c(0.4, 0.6, 0.8, 1.0),
min_child_weight = bt_train_r2$bestTune$min_child_weight,
subsample = c(0.5, 0.75, 1.0)
)
save(
tune_grid_r3,
file = "results/bt_grid_r3.Rdata")
bt_train_r3 <- tdd_index %>%
dplyr::select(-transcript_id) %>%
train(
tdd_index ~ .,
data = .,
method = "xgbTree",
trControl = fitControl,
tuneGrid = tune_grid_r3,
verbose = T
)
save(
bt_train_r3,
file = "results/bt_train_r3.Rdata")
load(file = "results/bt_train_r3.Rdata")
tuneplot(bt_train_r3)
bt_train_r3$bestTune
# grid search Gamma
tune_grid_r4 <- expand.grid(
nrounds = seq(from = 50, to = nrounds, by = 50),
eta = bt_train_r1$bestTune$eta,
max_depth = bt_train_r2$bestTune$max_depth,
gamma = c(0, 0.05, 0.1, 0.5, 0.7, 0.9, 1.0),
colsample_bytree = bt_train_r3$bestTune$colsample_bytree,
min_child_weight = bt_train_r2$bestTune$min_child_weight,
subsample = bt_train_r3$bestTune$subsample
)
save(
tune_grid_r4,
file = "results/bt_grid_r4.Rdata")
bt_train_r4 <- tdd_index %>%
dplyr::select(-transcript_id) %>%
train(
tdd_index ~ .,
data = .,
method = "xgbTree",
trControl = fitControl,
tuneGrid = tune_grid_r4,
verbose = T
)
save(
bt_train_r4,
file = "results/bt_train_r4.Rdata")
load(file = "results/bt_train_r4.Rdata")
tuneplot(bt_train_r4, 0.95)
bt_train_r4$bestTune
# Reducing the Learning Rate
tune_grid_r5 <- expand.grid(
nrounds = seq(from = 100, to = 5000, by = 100),
eta = seq(from = 0.01, to = 0.2, by = 0.01),
max_depth = bt_train_r2$bestTune$max_depth,
gamma = bt_train_r4$bestTune$gamma,
colsample_bytree = bt_train_r3$bestTune$colsample_bytree,
min_child_weight = bt_train_r2$bestTune$min_child_weight,
subsample = bt_train_r3$bestTune$subsample
)
save(
tune_grid_r5,
file = "results/bt_grid_r5.Rdata")
bt_train_r5 <- tdd_index %>%
dplyr::select(-transcript_id) %>%
train(
tdd_index ~ .,
data = .,
method = "xgbTree",
trControl = fitControl,
tuneGrid = tune_grid_r5,
verbose = T
)
save(
bt_train_r5,
file = "results/bt_train_r5.Rdata")
load(file = "results/bt_train_r5.Rdata")
tuneplot(bt_train_r5)
bt_train_r5$bestTune
# model fitting
tune_grid_final <- expand.grid(
nrounds = bt_train_r5$bestTune$nrounds,
eta = bt_train_r5$bestTune$eta,
max_depth = bt_train_r5$bestTune$max_depth,
gamma = bt_train_r5$bestTune$gamma,
colsample_bytree = bt_train_r5$bestTune$colsample_bytree,
min_child_weight = bt_train_r5$bestTune$min_child_weight,
subsample = bt_train_r5$bestTune$subsample
)
bt_fit <- tdd_index %>%
dplyr::select(-transcript_id) %>%
train(
tdd_index ~ .,
data = .,
method = "xgbTree",
trControl = fitControl,
tuneGrid = tune_grid_final,
verbose = T
)
save(
bt_fit,
file = "results/bt_fit.Rdata")
load(file = "results/bt_fit.Rdata")
# compute model rmse
rmse <- function(error) {
sqrt(mean(error^2))
}
tb_pred <- predict(
bt_fit,
tdd_index %>%
dplyr::select(-transcript_id) %>%
dplyr::select(-tdd_index)
)
rmse(tdd_index %>% dplyr::pull(tdd_index) - tb_pred)
################################################################################
bt_predictor = Predictor$new(
bt_fit,
data = tdd_index %>%
dplyr::select(-transcript_id) %>%
dplyr::select(-tdd_index),
y = tdd_index %>%
dplyr::pull(-tdd_index)
)
save(bt_predictor,
file = "results/bt_predictor.Rdata")
save(tdd_index, bt_fit, bt_predictor,
file = "results/bt_model.Rdata")
# compute Feature importance
# Permutation feature importance measures the increase in the prediction error
# of the model after we permuted the feature’s values, which breaks the
# relationship between the feature and the true outcome.
cl <- makePSOCKcluster(12)
registerDoParallel(cl)
load(file = "results/bt_model.Rdata")
bt_imp_mse = FeatureImp$new(bt_predictor, loss = "mse", parallel = T)
bt_imp_mae = FeatureImp$new(bt_predictor, loss = "mae", parallel = T)
save(bt_imp_mse, bt_imp_mae,
file = "results/bt_imp.Rdata")
load(file = "results/bt_imp.Rdata")
plot(bt_imp_mae)
plot(bt_imp_mse)
# compute Feature effects
# The partial dependence function at a particular feature value represents the
# average prediction if we force all data points to assume that feature value.
load(file = "results/bt_model.Rdata")
bt_ale <- list()
for (feature in bt_imp_mse$results %>%
dplyr::pull(feature) %>% as_vector()) {
if(is.null(bt_ale[[feature]])){
bt_ale[[feature]] <- tryCatch({
FeatureEffect$new(
bt_predictor, feature = feature, method = "ale"
)
}, error = {
FeatureEffect$new(
bt_predictor, feature = feature, method = "pdp"
)
}, finally = {
NULL
})
}
}
save(bt_ale,
file = "results/bt_ale.Rdata")
load(file = "results/bt_ale.Rdata")
for (feature in names(bt_ale)) {
if (is.null(bt_ale[[feature]])) {
print(feature)
}
}
for (feature in names(bt_ale)) {
print(feature)
print(plot(bt_ale[[feature]]))
}
bt_ale[["tdd_transcription_drug"]] <- FeatureEffect$new(
bt_predictor, feature = "tdd_transcription_drug", method = "ale"
)
plot(bt_ale[["tdd_transcription_drug"]])
bt_ale_numeric <- tibble(ale = NA, type = NA, value = NA, feature = NA)
for (feature in tdd_index %>%
keep(is.numeric) %>% dplyr::select(-tdd_index) %>% names()) {
tmp <- bt_ale[[feature]]$results
names(tmp) <- c("ale", "type", "value")
tmp$feature <- feature
bt_ale_numeric <- rbind(bt_ale_numeric, tmp)
}
bt_ale_numeric %>%
drop_na() %>%
ggplot(aes(x = value, y = ale)) +
geom_line() +
geom_rug() +
facet_wrap(~feature, scales = "free", ncol = 4)
bt_ale_fct <- tibble(ale = NA, type = NA, value = NA, feature = NA)
for (feature in tdd_index %>%
keep(is.factor) %>% dplyr::select(-transcript_id) %>% names()) {
tmp <- bt_ale[[feature]]$results
names(tmp) <- c("ale", "type", "value")
if (tmp$value[1] == "pdp") {
names(tmp) <- c("type", "ale", "value")
}
tmp$feature <- feature
bt_ale_fct <- bt_ale_fct %>% bind_rows(tmp %>% mutate(value = as.character(value)))
}
bt_ale_fct %>%
drop_na() %>%
mutate(type = factor(type)) %>%
ggplot(aes(x = type, y = ale)) +
geom_boxplot() +
facet_wrap(~feature, scales = "free", ncol = 3)
# ICE plot
# Individual Conditional Expectation (ICE) plots display one line per instance
# that shows how the instance’s prediction changes when a feature changes.
load(file = "results/bt_model.Rdata")
bt_ice <- list()
for (feature in bt_imp_mse$results %>%
dplyr::pull(feature) %>% as_vector()) {
bt_ice[[feature]] <- tryCatch({
FeatureEffect$new(
bt_predictor, feature = feature, method = "pdp+ice"
)
}, warning = function(w) {
NULL
}, error = function(e) {
NULL
}, finally = {
NULL
})
}
save(bt_ice,
file = "results/bt_ice.Rdata")
load(file = "results/bt_ice.Rdata")
for (feature in names(bt_ice)) {
if (is.null(bt_ice[[feature]])) {
print(feature)
}
}
for (feature in names(bt_ale)) {
print(feature)
print(plot(bt_ice[[feature]]))
}
# compute interaction of any feature with others
load(file = "results/bt_model.Rdata")
bt_interact = Interaction$new(bt_predictor, parallel = T)
save(bt_interact,
file = "results/bt_interact.Rdata")
bt_interact_2w <- list()
for (feature in tdd_index %>%
dplyr::select(-transcript_id) %>%
dplyr::select(-tdd_index) %>%
names()) {
if (is.null(bt_interact_2w[[feature]])) {
bt_interact_2w[[feature]] <- tryCatch({
Interaction$new(bt_predictor, feature = feature)
}, error = function(e) {
NULL
}, finally = {
NULL
})
}
save(bt_interact_2w,
file = "results/bt_interact_2w.Rdata")
}
save(bt_interact_2w,
file = "results/bt_interact_2w.Rdata")
load(file = "results/bt_interact.Rdata")
load(file = "results/bt_interact_2w.Rdata")
plot(bt_interact)
for (feature in names(bt_interact_2w)) {
if (is.null(bt_interact_2w[[feature]])) {
print(feature)
}
}
for (feature in names(bt_interact_2w)) {
print(feature)
print(plot(bt_interact_2w[[feature]]))
}