tdd_analysis_boostedtree.R

################################################################################
################################ boosted tree ##################################
################################################################################
source("src/tdd_analysis_load.R")

tuneplot <- function(x, probs = .90) {
  ggplot(x) +
    coord_cartesian(
      ylim = c(quantile(x$results$RMSE, probs = probs), min(x$results$RMSE))
    ) +
    theme_bw()
}

fitControl <-
  trainControl(
    method = "cv",
    number = 3,
    allowParallel = T
  )

# fit model with base parameter
 
grid_default <- expand.grid(
  nrounds = 100,
  max_depth = 6,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)

bt_train_base <- tdd_index %>% 
  dplyr::select(-transcript_id) %>% 
  train(
    tdd_index ~ .,
    data = .,
    method = "xgbTree",
    trControl = fitControl,
    tuneGrid = grid_default,
    verbose = T
  )
save(
  bt_train_base,
  file = "results/bt_train_base.Rdata")
load(file = "results/bt_train_base.Rdata")

# compute model rmse

rmse <- function(error) {
  sqrt(mean(error^2))
}

tb_pred <- predict(
  bt_train_base,
  tdd_index %>%
  dplyr::select(-transcript_id) %>% 
    dplyr::select(-tdd_index)
  )
rmse(tdd_index %>% dplyr::pull(tdd_index) - tb_pred)

nrounds <- 1000

# grid search for eta and depth

tune_grid_r1 <- expand.grid(
  nrounds = seq(from = 200, to = nrounds, by = 50),
  eta = c(0.025, 0.05, 0.1, 0.3, 0.4, 0.5, 0.6),
  max_depth = 1:20,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)
save(
  tune_grid_r1,
  file = "results/bt_grid_r1.Rdata")

bt_train_r1 <- tdd_index %>% 
  dplyr::select(-transcript_id) %>% 
  train(
    tdd_index ~ .,
    data = .,
    method = "xgbTree",
    trControl = fitControl,
    tuneGrid = tune_grid_r1,
    verbose = T
  )
save(
  bt_train_r1,
  file = "results/bt_train_r1.Rdata")
load(file = "results/bt_train_r1.Rdata")

tuneplot(bt_train_r1)
bt_train_r1$bestTune

# grid search for Maximum Depth and Minimum Child Weight

nrounds <- 3000
tune_grid_r2 <- expand.grid(
  nrounds = seq(from = 1000, to = nrounds, by = 100),
  eta = bt_train_r1$bestTune$eta,
  max_depth = bt_train_r1$bestTune$max_depth,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1:10,
  subsample = 1
)
save(
  tune_grid_r2,
  file = "results/bt_grid_r2.Rdata")

bt_train_r2 <- tdd_index %>% 
  dplyr::select(-transcript_id) %>% 
  train(
    tdd_index ~ .,
    data = .,
    method = "xgbTree",
    trControl = fitControl,
    tuneGrid = tune_grid_r2,
    verbose = T
  )
save(
  bt_train_r2,
  file = "results/bt_train_r2.Rdata")
load(file = "results/bt_train_r2.Rdata")

tuneplot(bt_train_r2)
bt_train_r2$bestTune

# grid search for Column and Row Sampling

nrounds <- 2000
tune_grid_r3 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = bt_train_r1$bestTune$eta,
  max_depth = bt_train_r2$bestTune$max_depth,
  gamma = 0,
  colsample_bytree = c(0.4, 0.6, 0.8, 1.0),
  min_child_weight = bt_train_r2$bestTune$min_child_weight,
  subsample = c(0.5, 0.75, 1.0)
)
save(
  tune_grid_r3,
  file = "results/bt_grid_r3.Rdata")

bt_train_r3 <- tdd_index %>% 
  dplyr::select(-transcript_id) %>% 
  train(
    tdd_index ~ .,
    data = .,
    method = "xgbTree",
    trControl = fitControl,
    tuneGrid = tune_grid_r3,
    verbose = T
  )
save(
  bt_train_r3,
  file = "results/bt_train_r3.Rdata")
load(file = "results/bt_train_r3.Rdata")

tuneplot(bt_train_r3)
bt_train_r3$bestTune

# grid search Gamma

tune_grid_r4 <- expand.grid(
  nrounds = seq(from = 50, to = nrounds, by = 50),
  eta = bt_train_r1$bestTune$eta,
  max_depth = bt_train_r2$bestTune$max_depth,
  gamma = c(0, 0.05, 0.1, 0.5, 0.7, 0.9, 1.0),
  colsample_bytree = bt_train_r3$bestTune$colsample_bytree,
  min_child_weight = bt_train_r2$bestTune$min_child_weight,
  subsample = bt_train_r3$bestTune$subsample
)
save(
  tune_grid_r4,
  file = "results/bt_grid_r4.Rdata")

bt_train_r4 <- tdd_index %>% 
  dplyr::select(-transcript_id) %>% 
  train(
    tdd_index ~ .,
    data = .,
    method = "xgbTree",
    trControl = fitControl,
    tuneGrid = tune_grid_r4,
    verbose = T
  )
save(
  bt_train_r4,
  file = "results/bt_train_r4.Rdata")
load(file = "results/bt_train_r4.Rdata")

tuneplot(bt_train_r4, 0.95)
bt_train_r4$bestTune

# Reducing the Learning Rate

tune_grid_r5 <- expand.grid(
  nrounds = seq(from = 100, to = 5000, by = 100),
  eta = seq(from = 0.01, to = 0.2, by = 0.01),
  max_depth = bt_train_r2$bestTune$max_depth,
  gamma = bt_train_r4$bestTune$gamma,
  colsample_bytree = bt_train_r3$bestTune$colsample_bytree,
  min_child_weight = bt_train_r2$bestTune$min_child_weight,
  subsample = bt_train_r3$bestTune$subsample
)
save(
  tune_grid_r5,
  file = "results/bt_grid_r5.Rdata")

bt_train_r5 <- tdd_index %>% 
  dplyr::select(-transcript_id) %>% 
  train(
    tdd_index ~ .,
    data = .,
    method = "xgbTree",
    trControl = fitControl,
    tuneGrid = tune_grid_r5,
    verbose = T
  )
save(
  bt_train_r5,
  file = "results/bt_train_r5.Rdata")
load(file = "results/bt_train_r5.Rdata")

tuneplot(bt_train_r5)
bt_train_r5$bestTune

# model fitting

tune_grid_final <- expand.grid(
  nrounds = bt_train_r5$bestTune$nrounds,
  eta = bt_train_r5$bestTune$eta,
  max_depth = bt_train_r5$bestTune$max_depth,
  gamma = bt_train_r5$bestTune$gamma,
  colsample_bytree = bt_train_r5$bestTune$colsample_bytree,
  min_child_weight = bt_train_r5$bestTune$min_child_weight,
  subsample = bt_train_r5$bestTune$subsample
)

bt_fit <- tdd_index %>% 
  dplyr::select(-transcript_id) %>% 
  train(
    tdd_index ~ .,
    data = .,
    method = "xgbTree",
    trControl = fitControl,
    tuneGrid = tune_grid_final,
    verbose = T
  )
save(
  bt_fit,
  file = "results/bt_fit.Rdata")
load(file = "results/bt_fit.Rdata")

# compute model rmse

rmse <- function(error) {
  sqrt(mean(error^2))
}

tb_pred <- predict(
  bt_fit,
  tdd_index %>%
  dplyr::select(-transcript_id) %>% 
    dplyr::select(-tdd_index)
  )
rmse(tdd_index %>% dplyr::pull(tdd_index) - tb_pred)

################################################################################
bt_predictor = Predictor$new(
  bt_fit,
  data = tdd_index %>%
    dplyr::select(-transcript_id) %>% 
    dplyr::select(-tdd_index),
  y = tdd_index %>%
    dplyr::pull(-tdd_index)
  )
save(bt_predictor,
     file = "results/bt_predictor.Rdata")
save(tdd_index, bt_fit, bt_predictor,
     file = "results/bt_model.Rdata")


# compute Feature importance
# Permutation feature importance measures the increase in the prediction error 
# of the model after we permuted the feature’s values, which breaks the 
# relationship between the feature and the true outcome.
cl <- makePSOCKcluster(12)
registerDoParallel(cl)

load(file = "results/bt_model.Rdata")
bt_imp_mse = FeatureImp$new(bt_predictor, loss = "mse", parallel = T)
bt_imp_mae = FeatureImp$new(bt_predictor, loss = "mae", parallel = T)
save(bt_imp_mse, bt_imp_mae,
     file = "results/bt_imp.Rdata")
load(file = "results/bt_imp.Rdata")
plot(bt_imp_mae)
plot(bt_imp_mse)

# compute Feature effects
# The partial dependence function at a particular feature value represents the 
# average prediction if we force all data points to assume that feature value.
load(file = "results/bt_model.Rdata")
bt_ale <- list()
for (feature in bt_imp_mse$results %>%
     dplyr::pull(feature) %>% as_vector()) {
  if(is.null(bt_ale[[feature]])){
  bt_ale[[feature]] <- tryCatch({
    FeatureEffect$new(
      bt_predictor, feature = feature, method = "ale"
    )
  }, error = {
    FeatureEffect$new(
      bt_predictor, feature = feature, method = "pdp"
    )
  }, finally = {
    NULL
  })
  }
}

save(bt_ale,
     file = "results/bt_ale.Rdata")
load(file = "results/bt_ale.Rdata")

for (feature in names(bt_ale)) {
  if (is.null(bt_ale[[feature]])) {
    print(feature)
  }
}
for (feature in names(bt_ale)) {
  print(feature)
  print(plot(bt_ale[[feature]]))
}
bt_ale[["tdd_transcription_drug"]] <- FeatureEffect$new(
      bt_predictor, feature = "tdd_transcription_drug", method = "ale"
    )
plot(bt_ale[["tdd_transcription_drug"]])

bt_ale_numeric <- tibble(ale = NA, type = NA, value = NA, feature = NA)
for (feature in tdd_index %>%
     keep(is.numeric) %>% dplyr::select(-tdd_index) %>% names()) {
  tmp <- bt_ale[[feature]]$results
  names(tmp) <- c("ale", "type", "value")
  tmp$feature <- feature
  bt_ale_numeric <- rbind(bt_ale_numeric, tmp)
}
bt_ale_numeric %>% 
  drop_na() %>% 
  ggplot(aes(x = value, y = ale)) +
  geom_line() +
  geom_rug() +
  facet_wrap(~feature, scales = "free", ncol = 4)

bt_ale_fct <- tibble(ale = NA, type = NA, value = NA, feature = NA)
for (feature in tdd_index %>%
     keep(is.factor) %>% dplyr::select(-transcript_id) %>% names()) {
  tmp <- bt_ale[[feature]]$results
  names(tmp) <- c("ale", "type", "value")
  if (tmp$value[1] == "pdp") {
    names(tmp) <- c("type", "ale", "value")
  }
  tmp$feature <- feature
  bt_ale_fct <- bt_ale_fct %>% bind_rows(tmp %>% mutate(value = as.character(value)))
}
bt_ale_fct %>% 
  drop_na() %>% 
  mutate(type = factor(type)) %>% 
  ggplot(aes(x = type, y = ale)) +
  geom_boxplot() +
  facet_wrap(~feature, scales = "free", ncol = 3)
  

# ICE plot
# Individual Conditional Expectation (ICE) plots display one line per instance 
# that shows how the instance’s prediction changes when a feature changes.

load(file = "results/bt_model.Rdata")
bt_ice <- list()
for (feature in bt_imp_mse$results %>%
     dplyr::pull(feature) %>% as_vector()) {
  bt_ice[[feature]] <- tryCatch({
    FeatureEffect$new(
      bt_predictor, feature = feature, method = "pdp+ice"
    )
  }, warning = function(w) {
    NULL
  }, error = function(e) {
    NULL
  }, finally = {
    NULL
  })
}
save(bt_ice,
     file = "results/bt_ice.Rdata")
load(file = "results/bt_ice.Rdata")

for (feature in names(bt_ice)) {
  if (is.null(bt_ice[[feature]])) {
    print(feature)
  }
}
for (feature in names(bt_ale)) {
  print(feature)
  print(plot(bt_ice[[feature]]))
}

# compute interaction of any feature with others
load(file = "results/bt_model.Rdata")

bt_interact = Interaction$new(bt_predictor, parallel = T)
save(bt_interact,
     file = "results/bt_interact.Rdata")

bt_interact_2w <- list()
for (feature in tdd_index %>%
     dplyr::select(-transcript_id) %>%
     dplyr::select(-tdd_index) %>%
     names()) {
  if (is.null(bt_interact_2w[[feature]])) {
    bt_interact_2w[[feature]] <- tryCatch({
      Interaction$new(bt_predictor, feature = feature)
    }, error = function(e) {
      NULL
    }, finally = {
      NULL
    })
  }
  save(bt_interact_2w,
      file = "results/bt_interact_2w.Rdata")
}
save(bt_interact_2w,
     file = "results/bt_interact_2w.Rdata")

load(file = "results/bt_interact.Rdata")
load(file = "results/bt_interact_2w.Rdata")
plot(bt_interact)

for (feature in names(bt_interact_2w)) {
  if (is.null(bt_interact_2w[[feature]])) {
    print(feature)
  }
}
for (feature in names(bt_interact_2w)) {
  print(feature)
  print(plot(bt_interact_2w[[feature]]))
}