# packages for wrangling data and the original models
library(tidyverse)
library(tidymodels)
library(rcis)
# packages for model interpretation/explanation
library(DALEX)
library(DALEXtra)
library(rattle) # fancy tree plots
# set random number generator seed value for reproducibility
set.seed(123)
theme_set(theme_minimal())
Interpret machine learning models: Predicting student debt
Student debt in the United States has increased substantially over the past twenty years. In this application exercise we will interpret the results of a set of machine learning models predicting the median student debt load for students graduating in 2020-21 at four-year colleges and universities as a function of university-specific factors (e.g. public vs. private school, admissions rate, cost of attendance).
Load the documentation for rcis::scorecard to identify the variables used in the models.
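For example, you can pull up the help page straight from the console (one minimal way to do this, assuming the rcis package is installed):
# open the documentation for the scorecard data frame
?rcis::scorecard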
Import models
We have estimated three distinct machine learning models to predict the median student debt load for students graduating in 2020-21 at four-year colleges and universities. Each model uses the same set of predictors, but the algorithms differ. Specifically, we have estimated
- Random forest
- Penalized regression
- 10-nearest neighbors
All models were estimated using tidymodels. We will load the training set, test set, and ML workflows from data/scorecard-models.RData.
# load Rdata file with all the data frames and pre-trained models
load("data/scorecard-models.RData")
Create explainer objects
In order to generate our interpretations, we will use the DALEX package. The first step in any DALEX operation is to create an explainer object. This object contains all the information needed to interpret the model’s predictions. We will create explainer objects for each of the three models.
Your turn: Review the syntax below for creating explainer objects using the explain_tidymodels()
function. Then, create explainer objects for the random forest and \(k\) nearest neighbors models.
# use explain_*() to create explainer object
# first step of any DALEX operation
explainer_glmnet <- explain_tidymodels(
  model = glmnet_wf,
  # data should exclude the outcome feature
  data = scorecard_train |> select(-debt),
  # y should be a vector containing the outcome of interest for the training set
  y = scorecard_train$debt,
  # assign a label to clearly identify model in later plots
  label = "penalized regression"
)
# explainer for random forest model
explainer_rf <- explain_tidymodels(
  model = TODO,
  data = scorecard_train |> select(-debt),
  y = scorecard_train$debt,
  label = "TODO"
)
# explainer for nearest neighbors model
explainer_kknn <- explain_tidymodels(
  model = TODO,
  data = scorecard_train |> select(-debt),
  y = scorecard_train$debt,
  label = "TODO"
)
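As an optional sanity check (a minimal sketch, not part of the original exercise), an explainer can generate predictions directly, so predicting on a few training rows is a quick way to confirm it is wired up correctly:
# optional: confirm the explainer returns predictions (illustrative check)
predict(explainer_glmnet, scorecard_train |> select(-debt)) |> head()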
Permutation-based feature importance
The DALEX package provides a variety of methods for interpreting machine learning models. One common method is to calculate feature importance, which measures the contribution of each predictor variable to the model’s predictions. Permutation-based importance works by shuffling the values of a single feature and recording how much the model’s loss increases as a result. We will use the model_parts() function to calculate feature importance for the random forest model. The resulting object includes a built-in plot() method using ggplot2 to visualize the results.
# generate feature importance measures
vip_rf <- model_parts(explainer_rf)
vip_rf
# visualize feature importance
plot(vip_rf)
Your turn: Calculate feature importance for the random forest model using 100, 1000, and all observations for permutations. How does the compute time change?
# N = 100
system.time({
model_parts(explainer_rf, N = 100)
})
# default N = 1000
system.time({
model_parts(explainer_rf)
})
# all observations
system.time({
model_parts(explainer_rf, N = NULL)
})
Add response here.
Your turn: Calculate feature importance for the random forest model using the ratio of the post-permutation loss to the full model loss, rather than the raw change in the loss function. How does this differ from the raw change?
# calculate ratio rather than raw change
model_parts(explainer_rf, type = "ratio") |>
plot()
Add response here.
Exercises
Your turn: Calculate feature importance for the penalized regression and \(k\) nearest neighbors models using all observations for the permutations. How do they compare to the random forest model?
# random forest model
vip_rf <- model_parts(explainer_rf, N = TODO)
plot(vip_rf)
# compare to the glmnet model
vip_glmnet <- model_parts(explainer_glmnet, N = TODO)
plot(vip_glmnet)
# compare to the kknn model
vip_kknn <- model_parts(explainer_kknn, N = TODO)
plot(vip_kknn)
Add response here.
Your turn: Calculate feature importance for the random forest model three times using N = 100, changing the random seed value before each calculation. How do the results change? How does that compare to N = 1000?
# calculate random forest feature importance thrice
set.seed(123)
model_parts(explainer_rf, N = TODO) |> plot()
set.seed(234)
model_parts(explainer_rf, N = TODO) |> plot()
set.seed(345)
model_parts(explainer_rf, N = TODO) |> plot()
Add response here.
Partial dependence plots
In order to generate partial dependence plots, we will use model_profile(). This function varies a single variable across its observed range and calculates the model’s average prediction at each value, averaging over the observed values of the other predictors. We will generate partial dependence plots for the random forest model using the netcost variable.
# basic pdp for RF model and netcost variable
pdp_netcost <- model_profile(explainer_rf, variables = "netcost")
pdp_netcost
We can visualize just the PDP, or the PDP together with the ICE curves that underlie it.
# just the PDP
plot(pdp_netcost)
# PDP with ICE curves
plot(pdp_netcost, geom = "profiles")
By default, model_profile() uses only 100 randomly selected observations to generate the profiles. Increasing the sample size may improve the quality of our estimate of the PDP, at the expense of a longer compute time.
# larger sample size
model_profile(explainer_rf, variables = "netcost", N = 500) |>
plot(geom = "profiles")
We can also examine the PDP of a variable for distinct subgroups in the dataset. This is relevant for ML models which (either by design or the type of algorithm used) allow for interactions between predictor variables. In this case, we will examine the PDP of the net cost of attendance independently for public, private non-profit, and private for-profit institutions.
pdp_cost_group <- model_profile(explainer_rf, variables = "netcost", groups = "type")
plot(pdp_cost_group, geom = "profiles")
We also might want to generate a visualization directly from the aggregated profiles, rather than rely on DALEX to generate it for us.
# PDP for state - very hard to read
pdp_state_kknn <- model_profile(explainer_kknn, variables = "state")
plot(pdp_state_kknn)
# examine the data structure
pdp_state_kknn
# extract aggregated profiles
pdp_state_kknn$agr_profiles |>
  # convert to tibble
  as_tibble() |>
  # reorder for plotting
  mutate(`_x_` = fct_reorder(.f = `_x_`, .x = `_yhat_`)) |>
  ggplot(mapping = aes(x = `_yhat_`, y = `_x_`, fill = `_yhat_`)) +
  geom_col() +
  scale_x_continuous(labels = label_dollar()) +
  scale_fill_viridis_c(guide = "none") +
  labs(
    title = "Partial dependence plot for state",
    subtitle = "Created for the k nearest neighbors model",
    x = "Average prediction",
    y = NULL
  )
Exercises
Your turn: Create PDP + ICE curves for average faculty salary using all three models. How does the role of average faculty salary differ across the models?
# create PDP + ICE curves for avgfacsal from all three models
model_profile(explainer_rf, variables = "TODO") |> plot(geom = "TODO")
model_profile(explainer_glmnet, variables = "TODO") |> plot(geom = "TODO")
model_profile(explainer_kknn, variables = "TODO") |> plot(geom = "TODO")
Add response here.
Your turn: Create a PDP for all numeric variables in the penalized regression model. How do the variables compare in terms of their impact on the model?
# create PDP for all numeric variables in glmnet model
model_profile(explainer_glmnet, variables = TODO) |>
plot()
Add response here.
Global surrogate models
In global surrogates, we estimate a simpler model to approximate the predictions of a more complex model. This can help us understand the relationships between the predictors and the outcome in the more complex model.
Demo: Fit a single decision tree to the predictions of the nearest neighbors model.
# get training set predictions from kknn model
kknn_train_preds <- bind_cols(
  scorecard_train,
  predict(kknn_wf, new_data = scorecard_train)
)
# fit a single decision tree to the training set predictions
gs_kknn_fit <- decision_tree(min_n = 40) |>
  set_mode("regression") |>
  fit(
    # exclude debt from the formula - it is the outcome of interest
    formula = .pred ~ . - debt,
    data = kknn_train_preds
  )
# evaluate the performance of the surrogate model
bind_cols(
  kknn_train_preds,
  predict(gs_kknn_fit, new_data = kknn_train_preds)
) |>
  # distinguish each prediction column - which model they came from
  rename(
    .pred_kknn = .pred...13,
    .pred_tree = .pred...14
  ) |>
  # estimate surrogate model performance
  metrics(truth = .pred_kknn, estimate = .pred_tree)
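# visualize the fitted surrogate decision tree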
fancyRpartPlot(gs_kknn_fit$fit,
  sub = NULL,
  palettes = "BuGn",
  tweak = 2
)
Your turn: How well does the surrogate model explain the predictions of the nearest neighbors model? How do you interpret the predictions of the nearest neighbors model?
Add response here.