# packages for wrangling data and the original models
library(tidyverse)
library(tidymodels)
library(rcis)
library(patchwork)
# packages for model interpretation/explanation
library(DALEX)
library(DALEXtra)
library(lime)
# set random number generator seed value for reproducibility
set.seed(123)
theme_set(theme_minimal())
Explain machine learning models: Predicting student debt
Student debt in the United States has increased substantially over the past twenty years. In this application exercise we will interpret the results of a set of machine learning models predicting the median student debt load for students graduating in 2020-21 at four-year colleges and universities as a function of university-specific factors (e.g. public vs. private school, admissions rate, cost of attendance).
Load the documentation for rcis::scorecard to identify the variables used in the models.
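If you want a quick look from the console, the help page and a glimpse of the data are one call away (a minimal sketch; it assumes rcis is installed and loaded as above):
# open the documentation for the scorecard data frame
?rcis::scorecard

# skim the variables and their types
glimpse(scorecard)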
Import models
We have estimated three distinct machine learning models to predict the median student debt load for students graduating in 2020-21 at four-year colleges and universities. Each model uses the same set of predictors, but the algorithms differ. Specifically, we have estimated
- Random forest
- Penalized regression
- 10-nearest neighbors
All models were estimated using tidymodels. We will load the training set, test set, and ML workflows from data/scorecard-models.RData.
# load Rdata file with all the data frames and pre-trained models
load("data/scorecard-models.RData")
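To confirm what the file added to your environment, you can list the objects it loaded (a quick sketch; the object names below are the ones used throughout this exercise):
# list objects now in the global environment
ls()

# peek at the training set and one of the pre-trained workflows
glimpse(scorecard_train)
rf_wf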
Create explainer objects
In order to generate our interpretations, we will use the DALEX package. The first step in any DALEX operation is to create an explainer object. This object contains all the information needed to interpret the model’s predictions. We will create explainer objects for each of the three models.
Your turn: Review the syntax below for creating explainer objects using the explain_tidymodels() function. Then, create explainer objects for the random forest and \(k\) nearest neighbors models.
# use explain_*() to create explainer object
# first step of a DALEX operation
explainer_glmnet <- explain_tidymodels(
  model = glmnet_wf,
  # data should exclude the outcome feature
  data = scorecard_train |> select(-debt),
  # y should be a vector containing the outcome of interest for the training set
  y = scorecard_train$debt,
  # assign a label to clearly identify model in later plots
  label = "penalized regression"
)
# explainer for random forest model
explainer_rf <- explain_tidymodels(
  model = rf_wf,
  data = scorecard_train |> select(-debt),
  y = scorecard_train$debt,
  label = "random forest"
)

# explainer for nearest neighbors model
explainer_kknn <- explain_tidymodels(
  model = kknn_wf,
  data = scorecard_train |> select(-debt),
  y = scorecard_train$debt,
  label = "k nearest neighbors"
)
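As a quick sanity check that an explainer is wired up correctly, you can ask DALEX for overall model performance on the data stored inside it (a minimal sketch using model_performance() on the random forest explainer created above):
# residual-based performance measures computed from the explainer
performance_rf <- model_performance(explainer_rf)
performance_rf

# visualize the residual diagnostics
plot(performance_rf)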
Choose a couple of observations to explain
cornell <- filter(.data = scorecard, name == "Cornell University") |>
  select(-unitid, -name)

ic <- filter(.data = scorecard, name == "Ithaca College") |>
  select(-unitid, -name)

both <- bind_rows(cornell, ic)
# set row names for LIME later
rownames(both) <- c("Cornell University", "Ithaca College")
Shapley values
Shapley values decompose an individual prediction into additive contributions from each feature, averaged over many possible orderings of the features.
# explain Cornell with rf model
shap_cornell_rf <- predict_parts(
  explainer = explainer_rf,
  new_observation = cornell,
  type = "shap"
)
plot(shap_cornell_rf)

# explain Cornell with kknn model
shap_cornell_kknn <- predict_parts(
  explainer = explainer_kknn,
  new_observation = cornell,
  type = "shap"
)
plot(shap_cornell_kknn)
# increase the number of feature order permutations
shap_cornell_kknn_40 <- predict_parts(
  explainer = explainer_kknn,
  new_observation = cornell,
  type = "shap",
  B = 40
)
plot(shap_cornell_kknn_40)
Pair with ggplot2
# based on example from https://www.tmwr.org/explain.html#local-explanations
shap_cornell_kknn |>
  # convert to pure tibble-formatted data frame
  as_tibble() |>
  # calculate average contribution per variable across permutations
  mutate(mean_val = mean(contribution), .by = variable) |>
  # reorder variable levels in order of absolute value of mean contribution
  mutate(variable = fct_reorder(variable, abs(mean_val))) |>
  # define basic ggplot object for horizontal boxplot
  ggplot(mapping = aes(x = contribution, y = variable, fill = mean_val > 0)) +
  # add a bar plot
  geom_col(
    data = ~ distinct(., variable, mean_val),
    mapping = aes(x = mean_val, y = variable),
    alpha = 0.5
  ) +
  # overlay with boxplot to show distribution
  geom_boxplot(width = 0.5) +
  # outcome variable is measured in dollars - contributions are in the same units
  scale_x_continuous(labels = label_dollar()) +
  # use viridis color palette
  scale_fill_viridis_d(guide = "none") +
  labs(y = NULL)
Exercises
Your turn: Explain each model’s prediction for Ithaca College. How do they differ from each other?
# calculate Shapley values
shap_ic_rf <- TODO

shap_ic_kknn <- TODO

shap_ic_glmnet <- TODO
# generate plots for each
plot(shap_ic_rf)
plot(shap_ic_kknn)
plot(shap_ic_glmnet)
# view side by side
plot(shap_ic_rf) +
plot(shap_ic_kknn) +
plot(shap_ic_glmnet)
# or combine together and reuse ggplot code from above
bind_rows(
  shap_ic_rf,
  shap_ic_kknn,
  shap_ic_glmnet
) |>
  # convert to pure tibble-formatted data frame
  as_tibble() |>
  # calculate average contribution per variable across permutations
  mutate(mean_val = mean(contribution), .by = c(label, variable)) |>
  # reorder variable levels in order of absolute value of mean contribution
  mutate(variable = tidytext::reorder_within(x = variable, by = abs(mean_val), within = label)) |>
  # define basic ggplot object for horizontal boxplot
  ggplot(mapping = aes(x = contribution, y = variable, fill = mean_val > 0)) +
  # add a bar plot
  geom_col(
    data = ~ distinct(., label, variable, mean_val),
    mapping = aes(x = mean_val, y = variable),
    alpha = 0.5
  ) +
  # overlay with boxplot to show distribution
  geom_boxplot(width = 0.5) +
  # facet for each model
  facet_wrap(vars(label), scales = "free_y") +
  tidytext::scale_y_reordered() +
  # outcome variable is measured in dollars - contributions are in the same units
  scale_x_continuous(labels = label_dollar(scale_cut = cut_short_scale())) +
  # use viridis color palette
  scale_fill_viridis_d(guide = "none") +
  labs(y = NULL)
Add response here.
LIME
LIME (local interpretable model-agnostic explanations) fits a simple surrogate model around an individual observation to approximate how the black-box model behaves locally.
# prepare the recipe
prepped_rec_rf <- extract_recipe(rf_wf)

# write a function to bake the observation
bake_rf <- function(x) {
  bake(
    prepped_rec_rf,
    new_data = x
  )
}

# create explainer object
lime_explainer_rf <- lime(
  x = scorecard_train,
  model = extract_fit_parsnip(rf_wf),
  preprocess = bake_rf
)
# top 5 features
explanation_rf <- explain(
  x = cornell,
  explainer = lime_explainer_rf,
  n_features = 5
)
plot_features(explanation_rf)
# top 10 features, increased permutations
explanation_rf <- explain(
  x = cornell,
  explainer = lime_explainer_rf,
  n_features = 10,
  n_permutations = 2000
)
plot_features(explanation_rf)
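For a more compact overview of the same explanation, lime also provides plot_explanations(), which arranges the feature weights as a heatmap (a minimal sketch reusing the explanation_rf object from above):
# heatmap-style overview of the LIME explanation
plot_explanations(explanation_rf)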
A note on the penalized regression model
Due to how the model was trained, bake_glmnet() requires an additional composition argument. Everything else is the same.
# prepare the recipe
prepped_rec_glmnet <- extract_recipe(glmnet_wf)

# write a function to bake the observation into a sparse matrix object
bake_glmnet <- function(x) {
  bake(
    prepped_rec_glmnet,
    new_data = x,
    composition = "dgCMatrix"
  )
}

# create explainer object
lime_explainer_glmnet <- lime(
  x = scorecard_train,
  model = extract_fit_parsnip(glmnet_wf),
  preprocess = bake_glmnet
)
# top 10 features
explanation_glmnet <- explain(
  x = cornell,
  explainer = lime_explainer_glmnet,
  n_features = 10
)
plot_features(explanation_glmnet)
Exercises
Your turn: Calculate a LIME explanation for Ithaca College and the \(k\) nearest neighbors model. What are the top 10 features? How well does the local model explain the prediction?
# prepare the recipe
prepped_rec_kknn <- extract_recipe(TODO)

# write a function to bake the observation
bake_kknn <- function(x) {
  TODO
}

# create explainer object
lime_explainer_kknn <- lime(
  x = scorecard_train,
  model = extract_fit_parsnip(TODO),
  preprocess = bake_kknn
)

# top 10 features
explanation_kknn <- explain(
  x = TODO,
  explainer = lime_explainer_kknn,
  n_features = 10
)
plot_features(explanation_kknn)
Add response here.
Your turn: Reproduce the explanation but use a lasso model to select the most important features. How does the explanation change?
# use lasso to select the most important features
explanation_lasso_kknn <- explain(
  x = TODO,
  explainer = lime_explainer_kknn,
  n_features = 10,
  # use a lasso model to select the features instead of ridge regression
  feature_select = "lasso_path"
)
plot_features(explanation_lasso_kknn)
Choose your own adventure
Your turn: Choose at least two other universities in the scorecard dataset. Generate explanations of their predicted median student debt from the random forest model using both SHAP and LIME. Compare the results. What are the most important features for each university? How do the explanations differ?
# add code here
Add response here.