# Load libraries
library(kableExtra)
library(tidyverse)
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(pdp)
##
## Attaching package: 'pdp'
## The following object is masked from 'package:purrr':
##
## partial
# Load data extracted from the NHL APIs (extraction done in separate file)
goalies = read_csv("../data/nhl_goalies_data.csv")
## Rows: 2256 Columns: 34
## ── Column specification ────────────────────────────────────
## Delimiter: ","
## chr (9): birthCity, birthCountryCode, birthStateProvinceCode, fullName, is...
## dbl (24): draftOverall, draftRound, draftYear, height, playerId, weight, se...
## date (1): birthDate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
prospects = read_csv("../data/nhl_goalie_prospects_data.csv")
## Rows: 869 Columns: 15
## ── Column specification ────────────────────────────────────
## Delimiter: ","
## chr (8): positionCode, catches, lastAmateurClub, lastAmateurLeague, birthCi...
## dbl (5): height, weight, midtermRank, finalRank, draftYear
## lgl (1): international
## date (1): birthDate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert the invalid double columns in the goalies data to integers and create some factors
goalies = goalies |>
mutate(
across(
c(seasonId, playerId, height, weight, draftYear, draftRound, draftOverall, assists,
gamesPlayed, gamesStarted, goals, goalsAgainst, losses, otLosses, penaltyMinutes,
points, saves, shotsAgainst, shutouts, ties, timeOnIce, wins),
as.integer
),
across(
c(birthCountryCode, isInHallOfFameYn, catches),
as.factor
)
)
# Convert the invalid double columns in the prospects data to integers and create some factors
prospects = prospects |>
mutate(
across(
c(draftYear, height, weight, midtermRank, finalRank),
as.integer
),
across(
c(positionCode, catches, birthCountryCode),
as.factor
)
)
# Create column for checking whether a goalie was drafted or not
goalies = goalies |>
mutate(
drafted = !is.na(draftYear)
)
# Create column for goalie age as of the beginning of the season (i.e. September 15th as per NHL Hockey Operations Guidelines)
goalies = goalies |>
separate(seasonId, into = c("seasonStartYear", "seasonEndYear"), sep = 4,
convert = T, remove = F) |>
mutate(
ageAtSeasonStart = as.integer(
interval(birthDate, date(paste0(seasonStartYear, "-09-15"))) / years()
)
)
# Get the average goalie height minus 2 inches for every season
undersize_heights_per_season = goalies |>
group_by(seasonId) |>
summarize(
undersize_threshold = mean(height) - 2
) |>
ungroup()
# Create column classifying a goalie as undersized or not (i.e. <= 2" below average)
goalies = goalies |>
left_join(undersize_heights_per_season, by = "seasonId") |>
mutate(
undersized = height <= undersize_threshold
)
# Create column for prospect age as of the start of the next season (i.e. September 15th as per NHL Hockey Operations Guidelines)
prospects = prospects |>
mutate(
draftYearAge = as.integer(
interval(birthDate, date(paste0(draftYear, "-09-15"))) / years()
)
)
# Find emergency situation goalies
emergency_goalies = goalies |>
filter (fullName %in% c("Shawn Hunwick", "Jorge Alves", "Jett Alexander",
"Thomas Hodges", "Matthew Berlin", "Scott Foster"))
# Remove emergency goalies from data
goalies = goalies |>
anti_join(emergency_goalies, by = "playerId")
# Get established goalies
established_goalies = goalies |>
filter(gamesPlayed >= 25 | (gamesPlayed >= 22 & seasonStartYear == 2024))
# Load NHL team standings data (extracted in separate file)
teams = read_csv("../data/nhl_teams_data.csv")
## Rows: 490 Columns: 6
## ── Column specification ────────────────────────────────────
## Delimiter: ","
## chr (1): teamAbbrev
## dbl (5): seasonId, iceTime, xGoalsAgainst, blockedShotAttemptsAgainst, dZone...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Check for import issues with NHL teams data
dim(teams)
## [1] 490 6
head(teams)
## # A tibble: 6 × 6
## seasonId teamAbbrev iceTime xGoalsAgainst blockedShotAttemptsAgainst
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 20082009 MIN 299195 202. 1033
## 2 20082009 BOS 295613 216. 1050
## 3 20082009 ARI 294089 224. 1187
## 4 20082009 LAK 300139 221. 1040
## 5 20082009 COL 299370 215. 1317
## 6 20082009 DAL 300273 223. 982
## # ℹ 1 more variable: dZoneGiveawaysAgainst <dbl>
tail(teams)
## # A tibble: 6 × 6
## seasonId teamAbbrev iceTime xGoalsAgainst blockedShotAttemptsAgainst
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 20242025 NYI 298682 259. 1459
## 2 20242025 SEA 297525 267. 1441
## 3 20242025 OTT 297335 256. 1391
## 4 20242025 LAK 297801 214. 1302
## 5 20242025 DAL 297658 253. 1262
## 6 20242025 CHI 297870 279. 1354
## # ℹ 1 more variable: dZoneGiveawaysAgainst <dbl>
# Convert the invalid double columns in the teams data to integers and create some factors
teams = teams |>
mutate(
across(
c(seasonId, blockedShotAttemptsAgainst, dZoneGiveawaysAgainst),
as.integer
),
across(
c(teamAbbrev),
as.factor
)
)
# Check the variable types in the teams data
str(teams)
## tibble [490 × 6] (S3: tbl_df/tbl/data.frame)
## $ seasonId : int [1:490] 20082009 20082009 20082009 20082009 20082009 20082009 20082009 20082009 20082009 20082009 ...
## $ teamAbbrev : Factor w/ 34 levels "ANA","ARI","ATL",..: 16 4 2 15 10 11 29 6 22 5 ...
## $ iceTime : num [1:490] 299195 295613 294089 300139 299370 ...
## $ xGoalsAgainst : num [1:490] 202 216 224 221 215 ...
## $ blockedShotAttemptsAgainst: int [1:490] 1033 1050 1187 1040 1317 982 1239 1194 1166 1083 ...
## $ dZoneGiveawaysAgainst : int [1:490] 257 293 228 471 291 467 531 300 529 400 ...
# Summarize the teams data
summary(teams)
## seasonId teamAbbrev iceTime xGoalsAgainst
## Min. :20082009 ANA : 16 Min. :173904 Min. :102.2
## 1st Qu.:20122013 BOS : 16 1st Qu.:297536 1st Qu.:203.0
## Median :20162017 BUF : 16 Median :298796 Median :224.6
## Mean :20159364 CAR : 16 Mean :282769 Mean :218.5
## 3rd Qu.:20202021 CBJ : 16 3rd Qu.:299748 3rd Qu.:242.4
## Max. :20242025 CGY : 16 Max. :308162 Max. :313.4
## (Other):394
## blockedShotAttemptsAgainst dZoneGiveawaysAgainst
## Min. : 511 Min. :141.0
## 1st Qu.:1006 1st Qu.:303.2
## Median :1130 Median :377.0
## Mean :1114 Mean :392.0
## 3rd Qu.:1259 3rd Qu.:472.5
## Max. :1673 Max. :789.0
##
We have 490 rows across 6 variables for the NHL teams data. No unexpected import issues found. It appears all variables in the teams data have reasonable values.
# Limit each goalies team to be the team they finished the season with
goalies = goalies |>
mutate(
teamAbbrev = as.factor(substr(teamAbbrevs, nchar(teamAbbrevs) - 2, nchar(teamAbbrevs))) # take last 3 chars
)
table(goalies$teamAbbrev)
##
## ANA ARI ATL BOS BUF CAR CBJ CGY CHI COL DAL DET EDM FLA LAK MIN MTL NJD NSH NYI
## 71 39 34 73 77 75 74 65 78 77 68 72 66 64 76 71 70 73 66 75
## NYR OTT PHI PHX PIT SEA SJS STL TBL TOR UTA VAN VGK WPG WSH
## 75 87 82 40 73 13 67 76 80 77 3 76 31 36 70
# Normalize team stats to be per 60 minutes played for comparison
teams = teams |>
mutate(
teamXGoalsAgainstPer60 = (xGoalsAgainst / iceTime) * 3600,
teamBlockedShotsPer60 = (blockedShotAttemptsAgainst / iceTime) * 3600,
teamDZoneGiveawaysPer60 = (dZoneGiveawaysAgainst / iceTime) * 3600
) |>
rename(teamIceTime = iceTime) |>
select(!c(xGoalsAgainst, blockedShotAttemptsAgainst, dZoneGiveawaysAgainst))
summary(teams)
## seasonId teamAbbrev teamIceTime teamXGoalsAgainstPer60
## Min. :20082009 ANA : 16 Min. :173904 Min. :2.101
## 1st Qu.:20122013 BOS : 16 1st Qu.:297536 1st Qu.:2.585
## Median :20162017 BUF : 16 Median :298796 Median :2.776
## Mean :20159364 CAR : 16 Mean :282769 Mean :2.775
## 3rd Qu.:20202021 CBJ : 16 3rd Qu.:299748 3rd Qu.:2.946
## Max. :20242025 CGY : 16 Max. :308162 Max. :3.783
## (Other):394
## teamBlockedShotsPer60 teamDZoneGiveawaysPer60
## Min. : 8.876 Min. :2.593
## 1st Qu.:12.831 1st Qu.:3.882
## Median :14.124 Median :4.788
## Mean :14.148 Mean :4.986
## 3rd Qu.:15.340 3rd Qu.:5.866
## Max. :20.144 Max. :9.803
##
# Denormalize the goalies data to include team stats of the last team they played on in each season (starting in 2008)
goalies = goalies |>
inner_join(teams, by = c("seasonId", "teamAbbrev")) |>
relocate(seasonId, playerId, teamAbbrev)
dim(goalies)
## [1] 1489 45
head(goalies)
## # A tibble: 6 × 45
## seasonId playerId teamAbbrev birthCity birthCountryCode birthDate
## <int> <int> <fct> <chr> <fct> <date>
## 1 20082009 8467913 OTT Cold Lake CAN 1981-01-07
## 2 20082009 8467453 COL Belleville CAN 1980-05-04
## 3 20082009 8467511 PHI Turku FIN 1980-06-18
## 4 20082009 8474550 CHI Vantaa FIN 1983-08-29
## 5 20082009 8471750 STL Denver USA 1986-11-21
## 6 20082009 8462161 WSH Farmington USA 1977-03-12
## # ℹ 39 more variables: birthStateProvinceCode <chr>, draftOverall <int>,
## # draftRound <int>, draftYear <int>, fullName <chr>, height <int>,
## # isInHallOfFameYn <fct>, lastName <chr>, nationalityCode <chr>,
## # catches <fct>, weight <int>, seasonStartYear <int>, seasonEndYear <int>,
## # assists <int>, gamesPlayed <int>, gamesStarted <int>, goals <int>,
## # goalsAgainst <int>, goalsAgainstAverage <dbl>, losses <int>,
## # otLosses <int>, penaltyMinutes <int>, points <int>, savePct <dbl>, …
tail(goalies)
## # A tibble: 6 × 45
## seasonId playerId teamAbbrev birthCity birthCountryCode birthDate
## <int> <int> <fct> <chr> <fct> <date>
## 1 20242025 8477465 PIT Surrey CAN 1995-04-29
## 2 20242025 8480045 BUF Espoo FIN 1999-03-09
## 3 20242025 8478024 ANA Helsinki FIN 1995-02-06
## 4 20242025 8477970 FLA Havlickuv Brod CZE 1996-01-09
## 5 20242025 8484293 CAR Dollard-des-Ormeaux CAN 2000-03-04
## 6 20242025 8482137 SJS Omsk RUS 2002-06-16
## # ℹ 39 more variables: birthStateProvinceCode <chr>, draftOverall <int>,
## # draftRound <int>, draftYear <int>, fullName <chr>, height <int>,
## # isInHallOfFameYn <fct>, lastName <chr>, nationalityCode <chr>,
## # catches <fct>, weight <int>, seasonStartYear <int>, seasonEndYear <int>,
## # assists <int>, gamesPlayed <int>, gamesStarted <int>, goals <int>,
## # goalsAgainst <int>, goalsAgainstAverage <dbl>, losses <int>,
## # otLosses <int>, penaltyMinutes <int>, points <int>, savePct <dbl>, …
# Get established goalies again (since goalies has changed)
established_goalies = goalies |>
filter(gamesPlayed >= 25 | (gamesPlayed >= 22 & seasonStartYear == 2024))
dim(established_goalies)
## [1] 765 45
# Explore distribution of team expected goals against per 60 minutes
teams |>
ggplot(aes(teamXGoalsAgainstPer60)) +
geom_histogram(binwidth = 0.05, color = "black", fill = "lightblue") +
labs(
title = "Distribution of Team Expected Goals Against Per 60 Minutes\nfor Every Season Since 2008",
x = "Expected Goals Against Per 60 Minutes",
y = "Frequency"
)

# Explore distribution of team blocked shots per 60 minutes
teams |>
ggplot(aes(teamBlockedShotsPer60)) +
geom_histogram(binwidth = 0.3, color = "black", fill = "lightblue") +
labs(
title = "Distribution of Team Blocked Shots Per 60 Minutes\nfor Every Season Since 2008",
x = "Blocked Shots Per 60 Minutes",
y = "Frequency"
)

# Explore distribution of team dzone giveaways per 60 minutes
teams |>
ggplot(aes(teamDZoneGiveawaysPer60)) +
geom_histogram(binwidth = 0.2, color = "black", fill = "lightblue") +
labs(
title = "Distribution of Team DZone Giveaways Per 60 Minutes\nfor Every Season Since 2008",
x = "DZone Giveaways Per 60 Minutes",
y = "Frequency"
)

Expected goals against per 60 minutes and blocked shots per 60 minutes appear to be normally distributed. Meanwhile, defensive zone giveaways per 60 minutes appears to be right skewed.
# Visualize relationships between performance metrics and goalie characteristics
established_goalies |>
select(height, weight, ageAtSeasonStart, savePct, goalsAgainstAverage) |>
rename(Height = height, Weight = weight, Age = ageAtSeasonStart, `SV%` = savePct, GAA = goalsAgainstAverage) |>
ggpairs()

# Visualize relationships between performance metrics and team stats
established_goalies |>
select(teamXGoalsAgainstPer60, teamBlockedShotsPer60, teamDZoneGiveawaysPer60, savePct, goalsAgainstAverage) |>
rename(xGAA = teamXGoalsAgainstPer60, `Avg Blocks` = teamBlockedShotsPer60, `Avg DZone Giveaways` = teamDZoneGiveawaysPer60, `SV%` = savePct, GAA = goalsAgainstAverage) |>
ggpairs()

# Visualize relationships between performance metrics and time
established_goalies |>
select(seasonStartYear, savePct, goalsAgainstAverage) |>
rename(Season = seasonStartYear, `SV%` = savePct, GAA = goalsAgainstAverage) |>
ggpairs()

SV% and GAA appear to be moderately correlated with team expected goals against average, weakly to moderately correlated with average team blocks per 60 minutes, and very weakly correlated with average team dzone giveaways per 60 minutes. Additionally, SV% appears to have a left skew while GAA has a slight right skew. The beta distribution makes sense for SV% since it is continuous and bounded between 0 and 1. The gamma distribution makes sense for GAA since it is positive and skewed to the right.
# Fit a GAM for SV% using height
savePct_gam = gam(
savePct ~
s(seasonStartYear, bs="cr", k=10)
+ s(height, bs="cr", k=10)
+ s(weight, bs="cr", k=20)
+ s(ageAtSeasonStart, bs="cr", k=20)
+ s(teamXGoalsAgainstPer60, bs="cr", k=20)
+ s(teamBlockedShotsPer60, bs="cr", k=20)
+ s(teamDZoneGiveawaysPer60, bs="cr", k=20)
+ s(birthCountryCode, bs="re")
+ s(teamAbbrev, bs="re")
+ catches,
data = established_goalies,
method = "ML",
family = betar(link = "logit")
)
summary(savePct_gam)
##
## Family: Beta regression(891.2)
## Link function: logit
##
## Formula:
## savePct ~ s(seasonStartYear, bs = "cr", k = 10) + s(height, bs = "cr",
## k = 10) + s(weight, bs = "cr", k = 20) + s(ageAtSeasonStart,
## bs = "cr", k = 20) + s(teamXGoalsAgainstPer60, bs = "cr",
## k = 20) + s(teamBlockedShotsPer60, bs = "cr", k = 20) + s(teamDZoneGiveawaysPer60,
## bs = "cr", k = 20) + s(birthCountryCode, bs = "re") + s(teamAbbrev,
## bs = "re") + catches
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.324510 0.009584 242.54 <2e-16 ***
## catchesR -0.004211 0.018319 -0.23 0.818
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(seasonStartYear) 3.210 3.956 43.443 < 2e-16 ***
## s(height) 1.000 1.000 1.272 0.259367
## s(weight) 2.189 2.774 3.870 0.265172
## s(ageAtSeasonStart) 1.236 1.433 4.037 0.053017 .
## s(teamXGoalsAgainstPer60) 1.001 1.003 29.674 < 2e-16 ***
## s(teamBlockedShotsPer60) 1.001 1.001 0.596 0.440587
## s(teamDZoneGiveawaysPer60) 1.001 1.002 0.035 0.853424
## s(birthCountryCode) 4.824 13.000 23.023 0.000249 ***
## s(teamAbbrev) 16.450 33.000 38.666 0.000132 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.215 Deviance explained = 25%
## -ML = -2455.2 Scale est. = 1 n = 765
plot(savePct_gam)









# Check diagnostics for SV% GAM with height
gam.check(savePct_gam)

##
## Method: ML Optimizer: outer newton
## full convergence after 12 iterations.
## Gradient range [-0.0004850616,0.0005750555]
## (score -2455.25 & scale 1).
## Hessian positive definite, eigenvalue range [1.378693e-05,364.0157].
## Model rank = 163 / 163
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(seasonStartYear) 9.00 3.21 1.02 0.690
## s(height) 9.00 1.00 1.02 0.685
## s(weight) 19.00 2.19 0.96 0.130
## s(ageAtSeasonStart) 19.00 1.24 1.02 0.750
## s(teamXGoalsAgainstPer60) 19.00 1.00 0.97 0.190
## s(teamBlockedShotsPer60) 19.00 1.00 0.94 0.035 *
## s(teamDZoneGiveawaysPer60) 19.00 1.00 0.99 0.385
## s(birthCountryCode) 14.00 4.82 NA NA
## s(teamAbbrev) 34.00 16.45 NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fit a GAM for GAA using height
gaa_gam = gam(
goalsAgainstAverage ~
s(seasonStartYear, bs="cr", k=10)
+ s(height, bs="cr", k=10)
+ s(weight, bs="cr", k=20)
+ s(ageAtSeasonStart, bs="cr", k=20)
+ s(teamXGoalsAgainstPer60, bs="cr", k=20)
+ s(teamBlockedShotsPer60, bs="cr", k=20)
+ s(teamDZoneGiveawaysPer60, bs="cr", k=20)
+ s(birthCountryCode, bs="re")
+ s(teamAbbrev, bs="re")
+ catches,
data = established_goalies,
method = "ML",
family = Gamma(link = "log")
)
summary(gaa_gam)
##
## Family: Gamma
## Link function: log
##
## Formula:
## goalsAgainstAverage ~ s(seasonStartYear, bs = "cr", k = 10) +
## s(height, bs = "cr", k = 10) + s(weight, bs = "cr", k = 20) +
## s(ageAtSeasonStart, bs = "cr", k = 20) + s(teamXGoalsAgainstPer60,
## bs = "cr", k = 20) + s(teamBlockedShotsPer60, bs = "cr",
## k = 20) + s(teamDZoneGiveawaysPer60, bs = "cr", k = 20) +
## s(birthCountryCode, bs = "re") + s(teamAbbrev, bs = "re") +
## catches
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.980219 0.008407 116.596 <2e-16 ***
## catchesR 0.014966 0.016566 0.903 0.367
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(seasonStartYear) 4.819 5.833 4.336 0.000280 ***
## s(height) 1.000 1.000 0.716 0.397835
## s(weight) 1.000 1.000 0.012 0.911550
## s(ageAtSeasonStart) 1.850 2.342 1.631 0.209694
## s(teamXGoalsAgainstPer60) 1.000 1.000 248.322 < 2e-16 ***
## s(teamBlockedShotsPer60) 1.001 1.001 0.343 0.558922
## s(teamDZoneGiveawaysPer60) 1.000 1.000 1.233 0.267223
## s(birthCountryCode) 4.376 13.000 1.375 0.001846 **
## s(teamAbbrev) 17.298 33.000 1.271 0.000113 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.414 Deviance explained = 44.1%
## -ML = 128.83 Scale est. = 0.011177 n = 765
plot(gaa_gam)









# Check diagnostics for GAA GAM using height
gam.check(gaa_gam)

##
## Method: ML Optimizer: outer newton
## full convergence after 14 iterations.
## Gradient range [-3.394565e-05,0.0003006665]
## (score 128.8342 & scale 0.01117685).
## Hessian positive definite, eigenvalue range [5.622321e-06,384.1277].
## Model rank = 163 / 163
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(seasonStartYear) 9.00 4.82 1.04 0.83
## s(height) 9.00 1.00 1.02 0.72
## s(weight) 19.00 1.00 0.99 0.47
## s(ageAtSeasonStart) 19.00 1.85 1.01 0.61
## s(teamXGoalsAgainstPer60) 19.00 1.00 0.98 0.27
## s(teamBlockedShotsPer60) 19.00 1.00 0.97 0.18
## s(teamDZoneGiveawaysPer60) 19.00 1.00 0.99 0.42
## s(birthCountryCode) 14.00 4.38 NA NA
## s(teamAbbrev) 34.00 17.30 NA NA
Notice that height is not statistically significant in either GAM. However, the estimated smooth function for SV% suggests a potentially positive association with height while the estimated smooth function for GAA suggests a potentially negative association with height.
# Fit a GAM for SV% using undersized
savePct_gam2 = gam(
savePct ~
s(seasonStartYear, bs="cr", k=10)
+ undersized
+ s(weight, bs="cr", k=20)
+ s(ageAtSeasonStart, bs="cr", k=20)
+ s(teamXGoalsAgainstPer60, bs="cr", k=20)
+ s(teamBlockedShotsPer60, bs="cr", k=20)
+ s(teamDZoneGiveawaysPer60, bs="cr", k=20)
+ s(birthCountryCode, bs="re")
+ s(teamAbbrev, bs="re")
+ catches,
data = established_goalies,
method = "ML",
family = betar(link = "logit")
)
summary(savePct_gam2)
##
## Family: Beta regression(889.98)
## Link function: logit
##
## Formula:
## savePct ~ s(seasonStartYear, bs = "cr", k = 10) + undersized +
## s(weight, bs = "cr", k = 20) + s(ageAtSeasonStart, bs = "cr",
## k = 20) + s(teamXGoalsAgainstPer60, bs = "cr", k = 20) +
## s(teamBlockedShotsPer60, bs = "cr", k = 20) + s(teamDZoneGiveawaysPer60,
## bs = "cr", k = 20) + s(birthCountryCode, bs = "re") + s(teamAbbrev,
## bs = "re") + catches
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.324890 0.009721 239.165 <2e-16 ***
## undersizedTRUE -0.005642 0.013798 -0.409 0.683
## catchesR -0.003516 0.018314 -0.192 0.848
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(seasonStartYear) 3.221 3.969 42.654 < 2e-16 ***
## s(weight) 2.221 2.814 4.762 0.189527
## s(ageAtSeasonStart) 1.273 1.496 4.987 0.033185 *
## s(teamXGoalsAgainstPer60) 1.001 1.002 30.200 < 2e-16 ***
## s(teamBlockedShotsPer60) 1.000 1.001 0.674 0.412019
## s(teamDZoneGiveawaysPer60) 1.001 1.001 0.031 0.861931
## s(birthCountryCode) 4.747 13.000 22.449 0.000290 ***
## s(teamAbbrev) 16.514 33.000 38.834 0.000127 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.214 Deviance explained = 24.9%
## -ML = -2454.7 Scale est. = 1 n = 765
plot(savePct_gam2)








# Check diagnostics for SV% GAM with undersized
gam.check(savePct_gam2)

##
## Method: ML Optimizer: outer newton
## full convergence after 12 iterations.
## Gradient range [-0.0003896624,0.0004066905]
## (score -2454.699 & scale 1).
## Hessian positive definite, eigenvalue range [0.0002460978,363.9805].
## Model rank = 155 / 155
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(seasonStartYear) 9.00 3.22 1.02 0.66
## s(weight) 19.00 2.22 0.96 0.12
## s(ageAtSeasonStart) 19.00 1.27 1.02 0.70
## s(teamXGoalsAgainstPer60) 19.00 1.00 0.97 0.17
## s(teamBlockedShotsPer60) 19.00 1.00 0.94 0.03 *
## s(teamDZoneGiveawaysPer60) 19.00 1.00 0.99 0.36
## s(birthCountryCode) 14.00 4.75 NA NA
## s(teamAbbrev) 34.00 16.51 NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fit a GAM for GAA using undersized
gaa_gam2 = gam(
goalsAgainstAverage ~
s(seasonStartYear, bs="cr", k=10)
+ undersized
+ s(weight, bs="cr", k=20)
+ s(ageAtSeasonStart, bs="cr", k=20)
+ s(teamXGoalsAgainstPer60, bs="cr", k=20)
+ s(teamBlockedShotsPer60, bs="cr", k=20)
+ s(teamDZoneGiveawaysPer60, bs="cr", k=20)
+ s(birthCountryCode, bs="re")
+ s(teamAbbrev, bs="re")
+ catches,
data = established_goalies,
method = "ML",
family = Gamma(link = "log")
)
summary(gaa_gam2)
##
## Family: Gamma
## Link function: log
##
## Formula:
## goalsAgainstAverage ~ s(seasonStartYear, bs = "cr", k = 10) +
## undersized + s(weight, bs = "cr", k = 20) + s(ageAtSeasonStart,
## bs = "cr", k = 20) + s(teamXGoalsAgainstPer60, bs = "cr",
## k = 20) + s(teamBlockedShotsPer60, bs = "cr", k = 20) + s(teamDZoneGiveawaysPer60,
## bs = "cr", k = 20) + s(birthCountryCode, bs = "re") + s(teamAbbrev,
## bs = "re") + catches
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.979892 0.008499 115.300 <2e-16 ***
## undersizedTRUE 0.004342 0.012484 0.348 0.728
## catchesR 0.014508 0.016556 0.876 0.381
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(seasonStartYear) 4.822 5.837 4.450 0.000211 ***
## s(weight) 1.000 1.000 0.216 0.642230
## s(ageAtSeasonStart) 1.864 2.361 1.860 0.164222
## s(teamXGoalsAgainstPer60) 1.000 1.000 250.190 < 2e-16 ***
## s(teamBlockedShotsPer60) 1.000 1.001 0.305 0.581552
## s(teamDZoneGiveawaysPer60) 1.000 1.000 1.252 0.263627
## s(birthCountryCode) 4.285 13.000 1.332 0.002026 **
## s(teamAbbrev) 17.194 33.000 1.251 0.000131 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.413 Deviance explained = 44%
## -ML = 129.14 Scale est. = 0.011194 n = 765
plot(gaa_gam2)








# Check diagnostics for GAA GAM using undersized
gam.check(gaa_gam2)

##
## Method: ML Optimizer: outer newton
## full convergence after 14 iterations.
## Gradient range [-2.300376e-05,0.0001650268]
## (score 129.1375 & scale 0.01119372).
## Hessian positive definite, eigenvalue range [3.638282e-06,384.126].
## Model rank = 155 / 155
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(seasonStartYear) 9.00 4.82 1.04 0.81
## s(weight) 19.00 1.00 0.99 0.42
## s(ageAtSeasonStart) 19.00 1.86 1.01 0.64
## s(teamXGoalsAgainstPer60) 19.00 1.00 0.98 0.28
## s(teamBlockedShotsPer60) 19.00 1.00 0.97 0.17
## s(teamDZoneGiveawaysPer60) 19.00 1.00 0.99 0.47
## s(birthCountryCode) 14.00 4.29 NA NA
## s(teamAbbrev) 34.00 17.19 NA NA
Notice that undersized status is not statistically significant in either GAM. However, the estimated coefficient of undersized for SV% suggests shorter goalies have lower SV% while the estimated coefficient of undersized for GAA suggests shorter goalies have a higher GAA, on average. The variance of these coefficients show that this is in no way conclusive however.
set.seed(42)
# Fit decision tree model for SV% (using height and undersized)
savePct_dt = rpart(
savePct ~ seasonStartYear + height + undersized + weight + ageAtSeasonStart + teamXGoalsAgainstPer60 + teamBlockedShotsPer60
+ teamDZoneGiveawaysPer60 + birthCountryCode + teamAbbrev + catches,
data = established_goalies,
method = "anova",
control = rpart.control(cp = 0.01)
)
# Plot CP table, CV error vs CP, and final tree
printcp(savePct_dt)
##
## Regression tree:
## rpart(formula = savePct ~ seasonStartYear + height + undersized +
## weight + ageAtSeasonStart + teamXGoalsAgainstPer60 + teamBlockedShotsPer60 +
## teamDZoneGiveawaysPer60 + birthCountryCode + teamAbbrev +
## catches, data = established_goalies, method = "anova", control = rpart.control(cp = 0.01))
##
## Variables actually used in tree construction:
## [1] birthCountryCode teamAbbrev teamXGoalsAgainstPer60
##
## Root node error: 0.090993/765 = 0.00011894
##
## n= 765
##
## CP nsplit rel error xerror xstd
## 1 0.081278 0 1.00000 1.00558 0.053089
## 2 0.037286 1 0.91872 0.97313 0.051423
## 3 0.035963 2 0.88144 1.00947 0.052602
## 4 0.016932 3 0.84547 1.01169 0.052233
## 5 0.016263 4 0.82854 1.00817 0.052519
## 6 0.014110 5 0.81228 1.02341 0.052705
## 7 0.012515 6 0.79817 1.03054 0.053284
## 8 0.010708 8 0.77314 1.01744 0.053806
## 9 0.010000 9 0.76243 1.01791 0.053179
plotcp(savePct_dt)

rpart.plot(savePct_dt, main = "Regression Tree for SV%")

set.seed(42)
# Fit decision tree model for GAA (using height and undersized)
gaa_dt = rpart(
goalsAgainstAverage ~ seasonStartYear + height + undersized + weight + ageAtSeasonStart + teamXGoalsAgainstPer60 + teamBlockedShotsPer60
+ teamDZoneGiveawaysPer60 + birthCountryCode + teamAbbrev + catches,
data = established_goalies,
method = "anova",
control = rpart.control(cp = 0.01)
)
# Plot CP table, CV error vs CP, and final tree
printcp(gaa_dt)
##
## Regression tree:
## rpart(formula = goalsAgainstAverage ~ seasonStartYear + height +
## undersized + weight + ageAtSeasonStart + teamXGoalsAgainstPer60 +
## teamBlockedShotsPer60 + teamDZoneGiveawaysPer60 + birthCountryCode +
## teamAbbrev + catches, data = established_goalies, method = "anova",
## control = rpart.control(cp = 0.01))
##
## Variables actually used in tree construction:
## [1] birthCountryCode teamAbbrev teamXGoalsAgainstPer60
##
## Root node error: 104.28/765 = 0.13632
##
## n= 765
##
## CP nsplit rel error xerror xstd
## 1 0.256476 0 1.00000 1.00466 0.055207
## 2 0.049934 1 0.74352 0.77025 0.045130
## 3 0.031667 2 0.69359 0.73403 0.041404
## 4 0.029068 3 0.66192 0.77549 0.043030
## 5 0.027548 4 0.63285 0.77484 0.041636
## 6 0.012175 5 0.60531 0.75259 0.040641
## 7 0.011680 6 0.59313 0.76951 0.041451
## 8 0.010000 7 0.58145 0.77031 0.041973
plotcp(gaa_dt)

rpart.plot(gaa_dt, main = "Regression Tree for GAA")

Notice that neither height or undersized status gets used in any of the regression trees, indicating that is not a relevant predictor to SV% or GAA.
set.seed(42)
# Fit RF model for SV% using height
savePct_rf = randomForest(
savePct ~ seasonStartYear + height + weight + ageAtSeasonStart + teamXGoalsAgainstPer60 + teamBlockedShotsPer60
+ teamDZoneGiveawaysPer60 + birthCountryCode + teamAbbrev + catches,
data = established_goalies,
importance = TRUE)
savePct_rf
##
## Call:
## randomForest(formula = savePct ~ seasonStartYear + height + weight + ageAtSeasonStart + teamXGoalsAgainstPer60 + teamBlockedShotsPer60 + teamDZoneGiveawaysPer60 + birthCountryCode + teamAbbrev + catches, data = established_goalies, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 9.871289e-05
## % Var explained: 17.01
# Plot variable importance for SV% RF using height
varImpPlot(savePct_rf, main = "Variable Importance Plots of SV% Random Forest Model (w/ Height)")

importance(savePct_rf) |>
as.data.frame() |>
rownames_to_column(var = "Variable") |>
arrange(desc(`%IncMSE`))
## Variable %IncMSE IncNodePurity
## 1 teamXGoalsAgainstPer60 22.240285 0.012598025
## 2 seasonStartYear 21.372579 0.009375545
## 3 teamAbbrev 21.080935 0.021453499
## 4 birthCountryCode 13.264285 0.008270551
## 5 weight 10.839731 0.006114244
## 6 height 7.653012 0.004504520
## 7 teamDZoneGiveawaysPer60 7.376831 0.006919735
## 8 teamBlockedShotsPer60 7.143536 0.007912293
## 9 ageAtSeasonStart 4.754529 0.006419229
## 10 catches 1.650327 0.000611284
# Extract the effect of height in the SV% RF
partial(savePct_rf, pred.var = "height", plot = TRUE)

set.seed(42)
# Fit RF model for GAA using height
gaa_rf = randomForest(
goalsAgainstAverage ~ seasonStartYear + height + weight + ageAtSeasonStart + teamXGoalsAgainstPer60 + teamBlockedShotsPer60
+ teamDZoneGiveawaysPer60 + birthCountryCode + teamAbbrev + catches,
data = established_goalies,
importance = TRUE)
gaa_rf
##
## Call:
## randomForest(formula = goalsAgainstAverage ~ seasonStartYear + height + weight + ageAtSeasonStart + teamXGoalsAgainstPer60 + teamBlockedShotsPer60 + teamDZoneGiveawaysPer60 + birthCountryCode + teamAbbrev + catches, data = established_goalies, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.08798583
## % Var explained: 35.46
# Plot variable importance for GAA RF using height
varImpPlot(gaa_rf, main = "Variable Importance Plots of GAA Random Forest Model (w/ Height)")

importance(gaa_rf) |>
as.data.frame() |>
rownames_to_column(var = "Variable") |>
arrange(desc(`%IncMSE`))
## Variable %IncMSE IncNodePurity
## 1 teamXGoalsAgainstPer60 46.210710 29.1537612
## 2 teamAbbrev 22.729221 23.0659420
## 3 seasonStartYear 20.748142 8.4861192
## 4 birthCountryCode 10.611164 7.3519548
## 5 weight 9.198025 6.1448196
## 6 teamBlockedShotsPer60 8.168180 7.3576914
## 7 height 7.138220 3.9880184
## 8 teamDZoneGiveawaysPer60 4.671174 5.9202798
## 9 ageAtSeasonStart 3.307344 5.8291679
## 10 catches 0.742054 0.6381632
# Extract the effect of height in the GAA RF
partial(gaa_rf, pred.var = "height", plot = TRUE)

Notice that both the SV% and GAA random forest models perform worse than the GAM in terms of variance explained. Similar to the GAMs, height is not an important variable in an already weak model. Also like the gams, however, it appears the shortest of goalies have worse SV% and GAA than taller goalies.
set.seed(42)
# Fit RF model for SV% using undersized
savePct_rf2 = randomForest(
savePct ~ seasonStartYear + undersized + weight + ageAtSeasonStart + teamXGoalsAgainstPer60 + teamBlockedShotsPer60
+ teamDZoneGiveawaysPer60 + birthCountryCode + teamAbbrev + catches,
data = established_goalies,
importance = TRUE)
savePct_rf2
##
## Call:
## randomForest(formula = savePct ~ seasonStartYear + undersized + weight + ageAtSeasonStart + teamXGoalsAgainstPer60 + teamBlockedShotsPer60 + teamDZoneGiveawaysPer60 + birthCountryCode + teamAbbrev + catches, data = established_goalies, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.0001001335
## % Var explained: 15.82
# Plot variable importance for SV% RF using undersized status
varImpPlot(savePct_rf2, main = "Variable Importance Plots of SV% Random Forest Model (w/ Undersized)")

importance(savePct_rf2) |>
as.data.frame() |>
rownames_to_column(var = "Variable") |>
arrange(desc(`%IncMSE`))
## Variable %IncMSE IncNodePurity
## 1 seasonStartYear 19.172006 0.0096166565
## 2 teamAbbrev 17.991901 0.0217328327
## 3 teamXGoalsAgainstPer60 17.878684 0.0125605464
## 4 birthCountryCode 13.231796 0.0083173680
## 5 weight 9.059573 0.0067809201
## 6 teamDZoneGiveawaysPer60 6.709960 0.0073737664
## 7 teamBlockedShotsPer60 5.543850 0.0080385262
## 8 ageAtSeasonStart 4.209160 0.0066991805
## 9 catches 3.754359 0.0006499772
## 10 undersized 2.222845 0.0013033837
# Extract the effect of undersized status in the SV% RF
partial(savePct_rf2, pred.var = "undersized", plot = TRUE)

set.seed(42)
# Fit RF model for GAA using undersized
gaa_rf2 = randomForest(
goalsAgainstAverage ~ seasonStartYear + undersized + weight + ageAtSeasonStart + teamXGoalsAgainstPer60 + teamBlockedShotsPer60
+ teamDZoneGiveawaysPer60 + birthCountryCode + teamAbbrev + catches,
data = established_goalies,
importance = TRUE)
gaa_rf2
##
## Call:
## randomForest(formula = goalsAgainstAverage ~ seasonStartYear + undersized + weight + ageAtSeasonStart + teamXGoalsAgainstPer60 + teamBlockedShotsPer60 + teamDZoneGiveawaysPer60 + birthCountryCode + teamAbbrev + catches, data = established_goalies, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.08803049
## % Var explained: 35.42
# Plot variable importance for GAA RF using undersized status
varImpPlot(gaa_rf2, main = "Variable Importance Plots of GAA Random Forest Model (w/ Undersized)")

importance(gaa_rf2) |>
as.data.frame() |>
rownames_to_column(var = "Variable") |>
arrange(desc(`%IncMSE`))
## Variable %IncMSE IncNodePurity
## 1 teamXGoalsAgainstPer60 48.40677586 29.416470
## 2 teamAbbrev 22.79424450 23.574982
## 3 seasonStartYear 19.14060288 8.365800
## 4 birthCountryCode 9.11168388 7.656296
## 5 weight 8.90114071 6.745816
## 6 teamBlockedShotsPer60 8.17006344 7.576926
## 7 teamDZoneGiveawaysPer60 6.35435283 6.196713
## 8 undersized 2.65241775 0.847283
## 9 ageAtSeasonStart 0.01202912 6.186019
## 10 catches -1.05775466 0.709676
# Extract the effect of undersized status in the GAA RF
partial(gaa_rf2, pred.var = "undersized", plot = TRUE)

Notice that both the SV% and GAA random forest models using undersized status perform worse than the GAMs in terms of variance explained. Similar to the GAMs, undersized status is not an important variable in an already weak model. Also like the GAMs, however, it appears the shortest of goalies have worse SV% and GAA than taller goalies.