    tdd_analysis_boostedtree.R
    ################################################################################
    ################################ boosted tree ##################################
    ################################################################################
    source("src/tdd_analysis_load.R")
    
    # zoom the caret tuning plot onto the best (lowest) RMSE values
    tuneplot <- function(x, probs = .90) {
      ggplot(x) +
        coord_cartesian(
          ylim = c(quantile(x$results$RMSE, probs = probs), min(x$results$RMSE))
        ) +
        theme_bw()
    }
    
    fitControl <-
      trainControl(
        method = "cv",
        number = 3,
        allowParallel = T
      )
    
    # fit a baseline model with xgboost's default parameters

    grid_default <- expand.grid(
      nrounds = 100,
      max_depth = 6,
      eta = 0.3,
      gamma = 0,
      colsample_bytree = 1,
      min_child_weight = 1,
      subsample = 1
    )
    
    bt_train_base <- tdd_index %>% 
      dplyr::select(-transcript_id) %>% 
      train(
        tdd_index ~ .,
        data = .,
        method = "xgbTree",
        trControl = fitControl,
        tuneGrid = grid_default,
        verbose = T
      )
    save(
      bt_train_base,
      file = "results/bt_train_base.Rdata")
    load(file = "results/bt_train_base.Rdata")
    
    # compute model rmse
    
    rmse <- function(error) {
      sqrt(mean(error^2))
    }
    
    # in-sample (training set) RMSE of the baseline model
    tb_pred <- predict(
      bt_train_base,
      tdd_index %>%
        dplyr::select(-transcript_id) %>%
        dplyr::select(-tdd_index)
    )
    rmse(tdd_index %>% dplyr::pull(tdd_index) - tb_pred)
    
    nrounds <- 1000
    
    # grid search for eta and depth
    
    tune_grid_r1 <- expand.grid(
      nrounds = seq(from = 200, to = nrounds, by = 50),
      eta = c(0.025, 0.05, 0.1, 0.3, 0.4, 0.5, 0.6),
      max_depth = 1:20,
      gamma = 0,
      colsample_bytree = 1,
      min_child_weight = 1,
      subsample = 1
    )
    save(
      tune_grid_r1,
      file = "results/bt_grid_r1.Rdata")
    
    bt_train_r1 <- tdd_index %>% 
      dplyr::select(-transcript_id) %>% 
      train(
        tdd_index ~ .,
        data = .,
        method = "xgbTree",
        trControl = fitControl,
        tuneGrid = tune_grid_r1,
        verbose = T
      )
    save(
      bt_train_r1,
      file = "results/bt_train_r1.Rdata")
    load(file = "results/bt_train_r1.Rdata")
    
    tuneplot(bt_train_r1)
    bt_train_r1$bestTune
    
    # grid search for minimum child weight (max depth fixed from round 1)
    
    nrounds <- 3000
    tune_grid_r2 <- expand.grid(
      nrounds = seq(from = 1000, to = nrounds, by = 100),
      eta = bt_train_r1$bestTune$eta,
      max_depth = bt_train_r1$bestTune$max_depth,
      gamma = 0,
      colsample_bytree = 1,
      min_child_weight = 1:10,
      subsample = 1
    )
    save(
      tune_grid_r2,
      file = "results/bt_grid_r2.Rdata")
    
    bt_train_r2 <- tdd_index %>% 
      dplyr::select(-transcript_id) %>% 
      train(
        tdd_index ~ .,
        data = .,
        method = "xgbTree",
        trControl = fitControl,
        tuneGrid = tune_grid_r2,
        verbose = T
      )
    save(
      bt_train_r2,
      file = "results/bt_train_r2.Rdata")
    load(file = "results/bt_train_r2.Rdata")
    
    tuneplot(bt_train_r2)
    bt_train_r2$bestTune
    
    # grid search for Column and Row Sampling
    
    nrounds <- 2000
    tune_grid_r3 <- expand.grid(
      nrounds = seq(from = 50, to = nrounds, by = 50),
      eta = bt_train_r1$bestTune$eta,
      max_depth = bt_train_r2$bestTune$max_depth,
      gamma = 0,
      colsample_bytree = c(0.4, 0.6, 0.8, 1.0),
      min_child_weight = bt_train_r2$bestTune$min_child_weight,
      subsample = c(0.5, 0.75, 1.0)
    )
    save(
      tune_grid_r3,
      file = "results/bt_grid_r3.Rdata")
    
    bt_train_r3 <- tdd_index %>% 
      dplyr::select(-transcript_id) %>% 
      train(
        tdd_index ~ .,
        data = .,
        method = "xgbTree",
        trControl = fitControl,
        tuneGrid = tune_grid_r3,
        verbose = T
      )
    save(
      bt_train_r3,
      file = "results/bt_train_r3.Rdata")
    load(file = "results/bt_train_r3.Rdata")
    
    tuneplot(bt_train_r3)
    bt_train_r3$bestTune
    
    # grid search for gamma
    
    tune_grid_r4 <- expand.grid(
      nrounds = seq(from = 50, to = nrounds, by = 50),
      eta = bt_train_r1$bestTune$eta,
      max_depth = bt_train_r2$bestTune$max_depth,
      gamma = c(0, 0.05, 0.1, 0.5, 0.7, 0.9, 1.0),
      colsample_bytree = bt_train_r3$bestTune$colsample_bytree,
      min_child_weight = bt_train_r2$bestTune$min_child_weight,
      subsample = bt_train_r3$bestTune$subsample
    )
    save(
      tune_grid_r4,
      file = "results/bt_grid_r4.Rdata")
    
    bt_train_r4 <- tdd_index %>% 
      dplyr::select(-transcript_id) %>% 
      train(
        tdd_index ~ .,
        data = .,
        method = "xgbTree",
        trControl = fitControl,
        tuneGrid = tune_grid_r4,
        verbose = T
      )
    save(
      bt_train_r4,
      file = "results/bt_train_r4.Rdata")
    load(file = "results/bt_train_r4.Rdata")
    
    tuneplot(bt_train_r4, 0.95)
    bt_train_r4$bestTune
    
    # Reducing the Learning Rate
    
    tune_grid_r5 <- expand.grid(
      nrounds = seq(from = 100, to = 5000, by = 100),
      eta = seq(from = 0.01, to = 0.2, by = 0.01),
      max_depth = bt_train_r2$bestTune$max_depth,
      gamma = bt_train_r4$bestTune$gamma,
      colsample_bytree = bt_train_r3$bestTune$colsample_bytree,
      min_child_weight = bt_train_r2$bestTune$min_child_weight,
      subsample = bt_train_r3$bestTune$subsample
    )
    save(
      tune_grid_r5,
      file = "results/bt_grid_r5.Rdata")
    
    bt_train_r5 <- tdd_index %>% 
      dplyr::select(-transcript_id) %>% 
      train(
        tdd_index ~ .,
        data = .,
        method = "xgbTree",
        trControl = fitControl,
        tuneGrid = tune_grid_r5,
        verbose = T
      )
    save(
      bt_train_r5,
      file = "results/bt_train_r5.Rdata")
    load(file = "results/bt_train_r5.Rdata")
    
    tuneplot(bt_train_r5)
    bt_train_r5$bestTune
    
    # final model fit with the selected hyperparameters
    
    tune_grid_final <- expand.grid(
      nrounds = bt_train_r5$bestTune$nrounds,
      eta = bt_train_r5$bestTune$eta,
      max_depth = bt_train_r5$bestTune$max_depth,
      gamma = bt_train_r5$bestTune$gamma,
      colsample_bytree = bt_train_r5$bestTune$colsample_bytree,
      min_child_weight = bt_train_r5$bestTune$min_child_weight,
      subsample = bt_train_r5$bestTune$subsample
    )
    
    bt_fit <- tdd_index %>% 
      dplyr::select(-transcript_id) %>% 
      train(
        tdd_index ~ .,
        data = .,
        method = "xgbTree",
        trControl = fitControl,
        tuneGrid = tune_grid_final,
        verbose = T
      )
    save(
      bt_fit,
      file = "results/bt_fit.Rdata")
    load(file = "results/bt_fit.Rdata")
    
    # compute model rmse
    
    rmse <- function(error) {
      sqrt(mean(error^2))
    }
    
    # in-sample (training set) RMSE of the final model
    tb_pred <- predict(
      bt_fit,
      tdd_index %>%
        dplyr::select(-transcript_id) %>%
        dplyr::select(-tdd_index)
    )
    rmse(tdd_index %>% dplyr::pull(tdd_index) - tb_pred)
    
    ################################################################################
    bt_predictor <- Predictor$new(
      bt_fit,
      data = tdd_index %>%
        dplyr::select(-transcript_id) %>%
        dplyr::select(-tdd_index),
      y = tdd_index %>%
        dplyr::pull(tdd_index)
    )
    save(bt_predictor,
         file = "results/bt_predictor.Rdata")
    save(tdd_index, bt_fit, bt_predictor,
         file = "results/bt_model.Rdata")
    
    
    # compute Feature importance
    # Permutation feature importance measures the increase in the prediction error 
    # of the model after we permuted the feature’s values, which breaks the 
    # relationship between the feature and the true outcome.
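    # A rough, hand-rolled sketch of that idea, separate from the iml call below:
    # shuffle one feature, re-predict, and compare the error to the unpermuted
    # error. Assumes bt_fit, tdd_index and rmse() from above are in memory;
    # manual_perm_imp and the example feature name are hypothetical.
    manual_perm_imp <- function(model, data, outcome, feature) {
      X <- data[setdiff(names(data), outcome)]
      y <- data[[outcome]]
      base_error <- rmse(y - predict(model, X))       # error on intact data
      X_perm <- X
      X_perm[[feature]] <- sample(X_perm[[feature]])  # break the feature/outcome link
      perm_error <- rmse(y - predict(model, X_perm))  # error after shuffling
      perm_error / base_error                         # ratio > 1: the feature matters
    }
    # manual_perm_imp(bt_fit, tdd_index %>% dplyr::select(-transcript_id),
    #                 "tdd_index", "some_feature")
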
    cl <- makePSOCKcluster(12)
    registerDoParallel(cl)
    
    load(file = "results/bt_model.Rdata")
    bt_imp_mse = FeatureImp$new(bt_predictor, loss = "mse", parallel = T)
    bt_imp_mae = FeatureImp$new(bt_predictor, loss = "mae", parallel = T)
    save(bt_imp_mse, bt_imp_mae,
         file = "results/bt_imp.Rdata")
    load(file = "results/bt_imp.Rdata")
    plot(bt_imp_mae)
    plot(bt_imp_mse)
    
    # compute Feature effects
    # The partial dependence function at a particular feature value represents the 
    # average prediction if we force all data points to assume that feature value.
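    # A rough, hand-rolled sketch of that idea for a numeric feature, separate
    # from the iml FeatureEffect calls below. manual_pdp is a hypothetical
    # helper; it assumes bt_fit and tdd_index from above are in memory.
    manual_pdp <- function(model, data, feature, grid_size = 20) {
      X <- data %>% dplyr::select(-transcript_id, -tdd_index)
      grid <- seq(min(X[[feature]]), max(X[[feature]]), length.out = grid_size)
      pd <- sapply(grid, function(v) {
        X_mod <- X
        X_mod[[feature]] <- v           # force every row to this feature value
        mean(predict(model, X_mod))     # average prediction over the whole data
      })
      tibble(value = grid, pdp = pd)
    }
    # manual_pdp(bt_fit, tdd_index, "some_numeric_feature") %>%
    #   ggplot(aes(x = value, y = pdp)) + geom_line()
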
    load(file = "results/bt_model.Rdata")
    bt_ale <- list()
    for (feature in bt_imp_mse$results %>%
         dplyr::pull(feature) %>% as_vector()) {
      if (is.null(bt_ale[[feature]])) {
        bt_ale[[feature]] <- tryCatch({
          FeatureEffect$new(
            bt_predictor, feature = feature, method = "ale"
          )
        }, error = function(e) {
          # fall back to a partial dependence plot when ALE fails
          FeatureEffect$new(
            bt_predictor, feature = feature, method = "pdp"
          )
        })
      }
    }
    
    save(bt_ale,
         file = "results/bt_ale.Rdata")
    load(file = "results/bt_ale.Rdata")
    
    for (feature in names(bt_ale)) {
      if (is.null(bt_ale[[feature]])) {
        print(feature)
      }
    }
    for (feature in names(bt_ale)) {
      print(feature)
      print(plot(bt_ale[[feature]]))
    }
    bt_ale[["tdd_transcription_drug"]] <- FeatureEffect$new(
          bt_predictor, feature = "tdd_transcription_drug", method = "ale"
        )
    plot(bt_ale[["tdd_transcription_drug"]])
    
    bt_ale_numeric <- tibble(ale = NA, type = NA, value = NA, feature = NA)
    for (feature in tdd_index %>%
         keep(is.numeric) %>% dplyr::select(-tdd_index) %>% names()) {
      tmp <- bt_ale[[feature]]$results
      names(tmp) <- c("ale", "type", "value")
      tmp$feature <- feature
      bt_ale_numeric <- rbind(bt_ale_numeric, tmp)
    }
    bt_ale_numeric %>% 
      drop_na() %>% 
      ggplot(aes(x = value, y = ale)) +
      geom_line() +
      geom_rug() +
      facet_wrap(~feature, scales = "free", ncol = 4)
    
    bt_ale_fct <- tibble(ale = NA, type = NA, value = NA, feature = NA)
    for (feature in tdd_index %>%
         keep(is.factor) %>% dplyr::select(-transcript_id) %>% names()) {
      tmp <- bt_ale[[feature]]$results
      names(tmp) <- c("ale", "type", "value")
      if (tmp$value[1] == "pdp") {
        names(tmp) <- c("type", "ale", "value")
      }
      tmp$feature <- feature
      bt_ale_fct <- bt_ale_fct %>% bind_rows(tmp %>% mutate(value = as.character(value)))
    }
    bt_ale_fct %>% 
      drop_na() %>% 
      mutate(type = factor(type)) %>% 
      ggplot(aes(x = type, y = ale)) +
      geom_boxplot() +
      facet_wrap(~feature, scales = "free", ncol = 3)
      
    
    # ICE plot
    # Individual Conditional Expectation (ICE) plots display one line per instance 
    # that shows how the instance’s prediction changes when a feature changes.
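    # A rough, hand-rolled sketch of that idea for a numeric feature, separate
    # from the iml call below. manual_ice is a hypothetical helper; it assumes
    # bt_fit and tdd_index from above are in memory and draws only n_obs rows.
    manual_ice <- function(model, data, feature, grid_size = 20, n_obs = 50) {
      X <- data %>%
        dplyr::select(-transcript_id, -tdd_index) %>%
        dplyr::slice(1:n_obs)
      grid <- seq(min(X[[feature]]), max(X[[feature]]), length.out = grid_size)
      purrr::map_dfr(seq_len(nrow(X)), function(i) {
        X_i <- X[rep(i, grid_size), ]   # one instance, replicated along the grid
        X_i[[feature]] <- grid          # vary only the feature of interest
        tibble(id = i, value = grid, pred = predict(model, X_i))
      })
    }
    # manual_ice(bt_fit, tdd_index, "some_numeric_feature") %>%
    #   ggplot(aes(x = value, y = pred, group = id)) + geom_line(alpha = 0.3)
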
    
    load(file = "results/bt_model.Rdata")
    bt_ice <- list()
    for (feature in bt_imp_mse$results %>%
         dplyr::pull(feature) %>% as_vector()) {
      bt_ice[[feature]] <- tryCatch({
        FeatureEffect$new(
          bt_predictor, feature = feature, method = "pdp+ice"
        )
      }, warning = function(w) {
        NULL
      }, error = function(e) {
        NULL
      }, finally = {
        NULL
      })
    }
    save(bt_ice,
         file = "results/bt_ice.Rdata")
    load(file = "results/bt_ice.Rdata")
    
    for (feature in names(bt_ice)) {
      if (is.null(bt_ice[[feature]])) {
        print(feature)
      }
    }
    for (feature in names(bt_ice)) {
      print(feature)
      print(plot(bt_ice[[feature]]))
    }
    
    # compute the interaction strength (Friedman's H-statistic) of each feature
    # with all other features
    load(file = "results/bt_model.Rdata")
    
    bt_interact = Interaction$new(bt_predictor, parallel = T)
    save(bt_interact,
         file = "results/bt_interact.Rdata")
    
    bt_interact_2w <- list()
    for (feature in tdd_index %>%
         dplyr::select(-transcript_id) %>%
         dplyr::select(-tdd_index) %>%
         names()) {
      if (is.null(bt_interact_2w[[feature]])) {
        bt_interact_2w[[feature]] <- tryCatch({
          Interaction$new(bt_predictor, feature = feature)
        }, error = function(e) {
          NULL
        }, finally = {
          NULL
        })
      }
      save(bt_interact_2w,
          file = "results/bt_interact_2w.Rdata")
    }
    save(bt_interact_2w,
         file = "results/bt_interact_2w.Rdata")
    
    load(file = "results/bt_interact.Rdata")
    load(file = "results/bt_interact_2w.Rdata")
    plot(bt_interact)
    
    for (feature in names(bt_interact_2w)) {
      if (is.null(bt_interact_2w[[feature]])) {
        print(feature)
      }
    }
    for (feature in names(bt_interact_2w)) {
      print(feature)
      print(plot(bt_interact_2w[[feature]]))
    }