Explain machine learning models: Predicting student debt

Application exercise

Modified: April 25, 2024

# packages for wrangling data and the original models
library(tidyverse)
library(tidymodels)
library(rcis)
library(patchwork)

# packages for model interpretation/explanation
library(DALEX)
library(DALEXtra)
library(lime)

# set random number generator seed value for reproducibility
set.seed(123)

theme_set(theme_minimal())

Student debt in the United States has increased substantially over the past twenty years. In this application exercise we will interpret the results of a set of machine learning models predicting the median student debt load for students graduating in 2020-21 at four-year colleges and universities as a function of university-specific factors (e.g. public vs. private school, admissions rate, cost of attendance).

Tip

Review the documentation for rcis::scorecard to identify the variables used in the models.
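
If you have rcis installed, one quick way to do this is to open the help page and skim the available columns. This is a minimal sketch; it assumes the scorecard data frame ships with the package, as it does for this exercise.

# open the help page for the scorecard data
?rcis::scorecard

# preview the variables available as predictors
glimpse(rcis::scorecard)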

Import models

We have estimated three distinct machine learning models to predict the median student debt load for students graduating in 2020-21 at four-year colleges and universities. Each model uses the same set of predictors, but the algorithms differ. Specifically, we have estimated:

  • Random forest
  • Penalized regression
  • 10-nearest neighbors

All models were estimated using tidymodels. We will load the training set, test set, and ML workflows from data/scorecard-models.RData.

# load Rdata file with all the data frames and pre-trained models
load("data/scorecard-models.RData")

Create explainer objects

In order to generate our interpretations, we will use the DALEX package. The first step in any DALEX operation is to create an explainer object. This object contains all the information needed to interpret the model’s predictions. We will create explainer objects for each of the three models.

Your turn: Review the syntax below for creating explainer objects using the explain_tidymodels() function. Then, create explainer objects for the random forest and \(k\) nearest neighbors models.

# use explain_*() to create explainer object
# first step of any DALEX operation
explainer_glmnet <- explain_tidymodels(
  model = glmnet_wf,
  # data should exclude the outcome feature
  data = scorecard_train |> select(-debt),
  # y should be a vector containing the outcome of interest for the training set
  y = scorecard_train$debt,
  # assign a label to clearly identify model in later plots
  label = "penalized regression"
)

# explainer for random forest model
explainer_rf <- explain_tidymodels(
  model = rf_wf,
  data = scorecard_train |> select(-debt),
  y = scorecard_train$debt,
  label = "random forest"
)

# explainer for nearest neighbors model
explainer_kknn <- explain_tidymodels(
  model = kknn_wf,
  data = scorecard_train |> select(-debt),
  y = scorecard_train$debt,
  label = "k nearest neighbors"
)

Choose a couple of observations to explain

cornell <- filter(.data = scorecard, name == "Cornell University") |>
  select(-unitid, -name)
ic <- filter(.data = scorecard, name == "Ithaca College") |>
  select(-unitid, -name)
both <- bind_rows(cornell, ic)

# set row names for LIME later
rownames(both) <- c("Cornell University", "Ithaca College")
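
Before decomposing these predictions, it can help to see what each model actually predicts for the two schools. DALEX explainers have a predict() method, so a quick comparison might look like the sketch below; it assumes the three explainer objects created above are available.

# predicted median student debt for Cornell University and Ithaca College
predict(explainer_rf, both)
predict(explainer_kknn, both)
predict(explainer_glmnet, both)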

Shapley values

Shapley values decompose a single prediction into additive contributions from each feature, averaged across many possible orderings in which the features could be added to the model.

# explain Cornell with rf model
shap_cornell_rf <- predict_parts(
  explainer = explainer_rf,
  new_observation = cornell,
  type = "shap"
)
plot(shap_cornell_rf)

# explain Cornell with kknn model
shap_cornell_kknn <- predict_parts(
  explainer = explainer_kknn,
  new_observation = cornell,
  type = "shap"
)
plot(shap_cornell_kknn)

# increase the number of feature order permutations
shap_cornell_kknn_40 <- predict_parts(
  explainer = explainer_kknn,
  new_observation = cornell,
  type = "shap",
  B = 40
)

plot(shap_cornell_kknn_40)

Pair with ggplot2

# based on example from https://www.tmwr.org/explain.html#local-explanations

shap_cornell_kknn |>
  # convert to pure tibble-formatted data frame
  as_tibble() |>
  # calculate average contribution per variable across permutations
  mutate(mean_val = mean(contribution), .by = variable) |>
  # reorder variable levels in order of absolute value of mean contribution
  mutate(variable = fct_reorder(variable, abs(mean_val))) |>
  # define basic ggplot object for horizontal boxplot
  ggplot(mapping = aes(x = contribution, y = variable, fill = mean_val > 0)) +
  # add a bar plot
  geom_col(
    data = ~ distinct(., variable, mean_val),
    mapping = aes(x = mean_val, y = variable),
    alpha = 0.5
  ) +
  # overlay with boxplot to show distribution
  geom_boxplot(width = 0.5) +
  # outcome variable is measured in dollars - contributions are the same units
  scale_x_continuous(labels = label_dollar()) +
  # use viridis color palette
  scale_fill_viridis_d(guide = "none") +
  labs(y = NULL)

Exercises

Your turn: Explain each model’s prediction for Ithaca College. How do they differ from each other?

# calculate shapley values
shap_ic_rf <- TODO

shap_ic_kknn <- TODO

shap_ic_glmnet <- TODO
# generate plots for each
plot(shap_ic_rf)
plot(shap_ic_kknn)
plot(shap_ic_glmnet)

# view side by side
plot(shap_ic_rf) +
  plot(shap_ic_kknn) +
  plot(shap_ic_glmnet)

# or combine together and reuse ggplot code from above
bind_rows(
  shap_ic_rf,
  shap_ic_kknn,
  shap_ic_glmnet
) |>
  # convert to pure tibble-formatted data frame
  as_tibble() |>
  # calculate average contribution per variable across permutations
  mutate(mean_val = mean(contribution), .by = c(label, variable)) |>
  # reorder variable levels in order of absolute value of mean contribution
  mutate(variable = tidytext::reorder_within(x = variable, by = abs(mean_val), within = label)) |>
  # define basic ggplot object for horizontal boxplot
  ggplot(mapping = aes(x = contribution, y = variable, fill = mean_val > 0)) +
  # add a bar plot
  geom_col(
    data = ~ distinct(., label, variable, mean_val),
    mapping = aes(x = mean_val, y = variable),
    alpha = 0.5
  ) +
  # overlay with boxplot to show distribution
  geom_boxplot(width = 0.5) +
  # facet for each model
  facet_wrap(vars(label), scales = "free_y") +
  tidytext::scale_y_reordered() +
  # outcome variable is measured in dollars - contributions are the same units
  scale_x_continuous(labels = label_dollar(scale_cut = cut_short_scale())) +
  # use viridis color palette
  scale_fill_viridis_d(guide = "none") +
  labs(y = NULL)

Add response here.

LIME

LIME (local interpretable model-agnostic explanations) approximates the model's behavior around a single observation with a simple, interpretable surrogate model. Because extract_fit_parsnip() returns the underlying model fit without the workflow's recipe, we also supply a preprocess function that applies the trained recipe before predictions are made.

# prepare the recipe
prepped_rec_rf <- extract_recipe(rf_wf)

# write a function to bake the observation
bake_rf <- function(x) {
  bake(
    prepped_rec_rf,
    new_data = x
  )
}

# create explainer object
lime_explainer_rf <- lime(
  x = scorecard_train,
  model = extract_fit_parsnip(rf_wf),
  preprocess = bake_rf
)

# top 5 features
explanation_rf <- explain(
  x = cornell,
  explainer = lime_explainer_rf,
  n_features = 5
)

plot_features(explanation_rf)

# top 10 features, increased permutations
explanation_rf <- explain(
  x = cornell,
  explainer = lime_explainer_rf,
  n_features = 10,
  n_permutations = 2000
)

plot_features(explanation_rf)
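
The object returned by explain() is an ordinary data frame, so you can also inspect how well the local surrogate model fits around this observation; the model_r2 column is a rough gauge of how much to trust the explanation. A quick sketch:

# check the fit of the local surrogate model
explanation_rf |>
  as_tibble() |>
  distinct(case, model_r2, model_prediction, prediction)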

A note on the penalized regression model

Because of how the penalized regression model was trained, bake_glmnet() requires an additional composition argument so that bake() returns a sparse matrix. Otherwise the process is the same.

# prepare the recipe
prepped_rec_glmnet <- extract_recipe(glmnet_wf)

# write a function to bake the observation into the sparse matrix format the model expects
bake_glmnet <- function(x) {
  bake(
    prepped_rec_glmnet,
    new_data = x,
    composition = "dgCMatrix"
  )
}

# create explainer object
lime_explainer_glmnet <- lime(
  x = scorecard_train,
  model = extract_fit_parsnip(glmnet_wf),
  preprocess = bake_glmnet
)

# top 10 features
explanation_glmnet <- explain(
  x = cornell,
  explainer = lime_explainer_glmnet,
  n_features = 10
)

plot_features(explanation_glmnet)

Exercises

Your turn: Calculate a LIME explanation for Ithaca College and the \(k\) nearest neighbors model. What are the top 10 features? How well does the local model explain the prediction?

# prepare the recipe
prepped_rec_kknn <- extract_recipe(TODO)

# write a function to bake the observation
bake_kknn <- function(x) {
  TODO
}

# create explainer object
lime_explainer_kknn <- lime(
  x = scorecard_train,
  model = extract_fit_parsnip(TODO),
  preprocess = bake_kknn
)

# top 10 features
explanation_kknn <- explain(
  x = TODO,
  explainer = lime_explainer_kknn,
  n_features = 10
)

plot_features(explanation_kknn)

Add response here.

Your turn: Reproduce the explanation but use a lasso model to select the most important features. How does the explanation change?

# use lasso to select the most important features
explanation_lasso_kknn <- explain(
  x = TODO,
  explainer = lime_explainer_kknn,
  n_features = 10,
  # use a lasso model to select the features instead of ridge regression
  feature_select = "lasso_path"
)

plot_features(explanation_lasso_kknn)

Choose your own adventure

Your turn: Choose at least two other universities in the scorecard dataset. Generate explanations of their predicted median student debt from the random forest model using both SHAP and LIME. Compare the results. What are the most important features for each university? How do the explanations differ?

# add code here
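
As a starting point, you could pull the rows for your chosen schools the same way we did for Cornell University and Ithaca College. The school names below are only examples, so confirm they match entries in scorecard before going further.

# example starting point: swap in the universities you chose
my_schools <- scorecard |>
  filter(name %in% c("University of Rochester", "Syracuse University")) |>
  select(-unitid, -name)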

Add response here.