Loading of Necessary Libraries

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(corrplot)
## corrplot 0.90 loaded
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(stringr)

Helper Functions

calc_loocv_rmse = function(model) {
sqrt(mean((resid(model) / (1 - hatvalues(model))) ^ 2))
}
plot_fitted_resid = function(model, pointcol = "dodgerblue", linecol = "darkorange") {
  plot(fitted(model), resid(model), 
       col = pointcol, pch = 20, cex = 1.5,
       xlab = "Fitted", ylab = "Residuals")
  abline(h = 0, col = linecol, lwd = 2)
}
plot_qq = function(model, pointcol = "dodgerblue", linecol = "darkorange") {
  qqnorm(resid(model), col = pointcol, pch = 20, cex = 1.5)
  qqline(resid(model), col = linecol, lwd = 2)
}

Introduction

Our group made the decision to study advanced statistics from NBA players since the 3-point line was introduced in 1980. This data is available on basketball-reference.com, but only on a yearly basis. In order for us to extract the whole data set we created a scraper that gathered every year from 1980 to this last NBA season. We then merged it into a single csv file ready to be loaded into R-Studio. Our purpose is to see how advanced statistics help predict a player’s efficiency rating(PER). We will build a variety of models using different combinations of predictors as well as models with only offensive variables as predictors as well as only defensive variables as predictors. While these smaller models might not be the best at predicting we are curious to see if offense or defense is better at predicting a players efficiency.

Data Cleaning

As with most data sets online, we had to clean the data in order for us to more efficiently use the data to build models.

First we start by loading the data:

data = read_csv("advanced_stats.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   Player = col_character(),
##   Pos = col_character(),
##   Tm = col_character(),
##   ` ` = col_logical(),
##   ` _1` = col_logical()
## )
## ℹ Use `spec()` for the full column specifications.
str(data)
## spec_tbl_df [21,786 × 30] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ X1    : num [1:21786] 0 1 2 3 4 5 6 7 8 9 ...
##  $ Player: chr [1:21786] "Kareem Abdul-Jabbar*" "Tom Abernethy" "Alvan Adams" "Tiny Archibald*" ...
##  $ Pos   : chr [1:21786] "C" "PF" "C" "PG" ...
##  $ Age   : num [1:21786] 32 25 25 31 31 28 22 25 28 27 ...
##  $ Tm    : chr [1:21786] "LAL" "GSW" "PHO" "BOS" ...
##  $ G     : num [1:21786] 82 67 75 80 26 20 67 82 77 20 ...
##  $ MP    : num [1:21786] 3143 1222 2168 2864 560 ...
##  $ PER   : num [1:21786] 25.3 11 19.2 15.3 7.4 9.3 12.3 18.1 13.7 8.2 ...
##  $ TS%   : num [1:21786] 0.639 0.511 0.571 0.574 0.524 0.467 0.495 0.532 0.533 0.432 ...
##  $ 3PAr  : num [1:21786] 0.001 0.003 0.002 0.023 0 0.029 0 0.043 0.004 0 ...
##  $ FTr   : num [1:21786] 0.344 0.258 0.27 0.548 0.833 0.371 0.373 0.206 0.275 0.533 ...
##  $ ORB%  : num [1:21786] 7.2 5.4 8.2 2.3 6 3.3 10.2 9.8 8.3 12.4 ...
##  $ DRB%  : num [1:21786] 22.2 12 22.4 5.3 16.9 12.4 18.3 16.6 12.1 16.8 ...
##  $ TRB%  : num [1:21786] 15.4 8.6 15.4 3.8 11.5 7.8 14.3 13.1 10.1 14.5 ...
##  $ AST%  : num [1:21786] 16.5 9.3 21.6 30.2 9 17.8 5.3 9.7 15.9 7.8 ...
##  $ STL%  : num [1:21786] 1.2 1.4 2.3 1.7 1 1.8 1.4 1.7 1.7 0.8 ...
##  $ BLK%  : num [1:21786] 4.6 0.6 1.4 0.2 1.5 1.2 4.1 0.8 1.1 2.3 ...
##  $ TOV%  : num [1:21786] 15.7 9.9 18.2 19.7 24.8 21.3 20 10 18.2 19.5 ...
##  $ USG%  : num [1:21786] 24.1 13.3 21.9 17 7.9 11.3 21.4 21.6 17.3 12.8 ...
##  $       : logi [1:21786] NA NA NA NA NA NA ...
##  $ OWS   : num [1:21786] 9.5 1.2 3.1 5.9 0.1 0 -0.4 4.1 2.1 -0.1 ...
##  $ DWS   : num [1:21786] 5.3 0.8 3.9 2.9 0.5 0.2 1.4 2.8 1.9 0.2 ...
##  $ WS    : num [1:21786] 14.8 2 7 8.9 0.6 0.2 1 6.9 3.9 0.1 ...
##  $ WS/48 : num [1:21786] 0.227 0.08 0.155 0.148 0.053 0.043 0.063 0.136 0.081 0.019 ...
##  $  _1   : logi [1:21786] NA NA NA NA NA NA ...
##  $ OBPM  : num [1:21786] 4.8 -1 1.7 1.4 -2.3 -3 -4.2 1.9 -0.3 -3.8 ...
##  $ DBPM  : num [1:21786] 2.4 -0.2 1.9 -0.3 0.9 1.3 0.9 0.1 -0.5 -0.5 ...
##  $ BPM   : num [1:21786] 7.2 -1.2 3.6 1.1 -1.4 -1.7 -3.3 2 -0.7 -4.3 ...
##  $ VORP  : num [1:21786] 7.3 0.2 3.1 2.3 0.1 0.1 -0.2 2.5 0.7 -0.2 ...
##  $ MVP   : num [1:21786] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   Player = col_character(),
##   ..   Pos = col_character(),
##   ..   Age = col_double(),
##   ..   Tm = col_character(),
##   ..   G = col_double(),
##   ..   MP = col_double(),
##   ..   PER = col_double(),
##   ..   `TS%` = col_double(),
##   ..   `3PAr` = col_double(),
##   ..   FTr = col_double(),
##   ..   `ORB%` = col_double(),
##   ..   `DRB%` = col_double(),
##   ..   `TRB%` = col_double(),
##   ..   `AST%` = col_double(),
##   ..   `STL%` = col_double(),
##   ..   `BLK%` = col_double(),
##   ..   `TOV%` = col_double(),
##   ..   `USG%` = col_double(),
##   ..   ` ` = col_logical(),
##   ..   OWS = col_double(),
##   ..   DWS = col_double(),
##   ..   WS = col_double(),
##   ..   `WS/48` = col_double(),
##   ..   ` _1` = col_logical(),
##   ..   OBPM = col_double(),
##   ..   DBPM = col_double(),
##   ..   BPM = col_double(),
##   ..   VORP = col_double(),
##   ..   MVP = col_double()
##   .. )

We noticed that after scraping the data there are two columns that are empty, this was due to the way the data was shown on basketball-reference.com. We then proceed to remove them as well as a third column named MVP that will not be used.

data = na.omit(dplyr::select(data, -c(20,25,30))) # remove empty columns, MVP column and rows with NA values

We noticed that there are some very serious outliers in the data, an example of this is when a player only plays a single game in his career and has a game where he never missed a shot his PER will be extremely high and not representative of his actual ability. Due to this we decided to establish a cutoff and only include players that played at least 41 out of 82 games in a season and at least 15 out of 48 minutes in a game.

Another issue we encountered in the data is when a player is traded mid-season. This causes the player to show up three times in the same year, one for each team and one for the combined statistics for all teams he played in that season. We decided to only keep the combined row for each player that was traded.

The final thing we did is to change the Pos values to only include the five original positions(PG, SG, SF, PF, C) instead of combinations of two of those for players who sometimes played a second position. We did this because we did not want a factor variable with 16 levels.

qualified_player_cutoff = (48*82*0.3125) / (82/2) # Minutes in a game * games in a season * 
# 31% of total available minutes played divided by games in a season / 2

n_lines = nrow(data)
tempName = ""
for(i in 1:n_lines) {

  has_dash = grepl("-", data$Pos[i], fixed = TRUE)

  if(has_dash == TRUE){
    first_pos = sapply(strsplit(data$Pos[i],"-"), `[`, 1)
    data$Pos[i] = first_pos
  }

  minutes_played_per_game = data$MP[i] / data$G[i]

  if(minutes_played_per_game < qualified_player_cutoff) {
    data$G[i] = NA
  }

  if(data$Tm[i] == "TOT") {
    tempName = data$Player[i]
  }
  else {
    if(tempName == data$Player[i]) {
      data$Player[i] = NA
    }
    else {
      tempName = ""
    }
  }
}

We also converted the Position(POS) column to be a factor

is.factor(data$Pos)
## [1] FALSE
data$Pos = as.factor(data$Pos)
is.factor(data$Pos)
## [1] TRUE
levels(data$Pos)
## [1] "C"  "PF" "PG" "SF" "SG"

After cleaning up the unnecessary columns and duplicated players we then remove all rows that contain NAs.

data = na.omit(data)

Data Partitioning

We decided to do a 70/30 split for training/testing data.

dt = sort(sample(n_lines, n_lines*.7))
train = data[dt,]
test = data[-dt,]

Methodology

For model building we start with a basic additive model using all the qualified predictors. We have a factor variable called Pos which states the player’s playing position. We will test an additive model both including the Pos variable and excluding it to see if in fact it seems to be significant at \[\alpha = 0.05\]

additive_model = lm(PER ~ . - X1 - Player - Pos - Tm, data = train) 
additive_model_POS = lm(PER ~ . - X1 - Player - Tm, data = train) 

anova(additive_model, additive_model_POS)[2,"Pr(>F)"] < 0.05
## [1] TRUE

As we can see the P-Value is lower than \(\alpha = 0.05\) so we reject the null hypothesis and prefer the larger model with the player positions.

With that additive model and a null model with no predictors as a starting point we built 4 additional models using the step function:

selected_backward_AIC = step(additive_model_POS, direction = "backward", trace = 0)
selected_backward_BIC = step(additive_model_POS, direction = "backward", trace = 0, k = log(n_lines))
null_model = lm(PER ~ 1, data = train)
biggest = formula(additive_model_POS) #Scope
selected_forward_AIC = step(null_model, scope = biggest, direction = "forward", trace = 0)
selected_forward_BIC = step(null_model, scope = biggest, direction = "forward", trace = 0, k = log(n_lines))


table = data.frame(Method = c("Backward AIC","Backward BIC",
"Forward AIC","Forward BIC"),
         RMSE = c(sqrt(mean(selected_backward_AIC$residuals^2)),sqrt(mean(selected_backward_BIC$residuals^2)),
          sqrt(mean(selected_forward_AIC$residuals^2)),sqrt(mean(selected_forward_BIC$residuals^2))),
         R_Squared = c(summary(selected_backward_AIC)$r.squared,summary(selected_backward_BIC)$r.squared,
         summary(selected_forward_AIC)$r.squared,summary(selected_forward_BIC)$r.squared))
kbl(table) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Method RMSE R_Squared
Backward AIC 0.4077065 0.9892502
Backward BIC 0.4085298 0.9892067
Forward AIC 0.4078867 0.9892407
Forward BIC 0.4080366 0.9892328
par(mfrow=c(2,2))
plot_fitted_resid(selected_backward_AIC)
plot_fitted_resid(selected_forward_AIC)
plot_fitted_resid(selected_backward_BIC)
plot_fitted_resid(selected_forward_AIC)

After building these four models we noticed that every method led to virtually the same predictors and RMSE. We now started looking at the predictors and using a correlation matrix and the VIF function looked for any variables that we might remove without affecting the model.

corrplot(cor(subset(data, select = -c(Player,Pos,Tm))), type = "upper")

car::vif(selected_forward_AIC)
##                GVIF Df GVIF^(1/(2*Df))
## OBPM      28.911280  1        5.376921
## `TRB%`  1435.023382  1       37.881702
## `USG%`     4.352006  1        2.086146
## `WS/48`   17.930354  1        4.234425
## `3PAr`     2.374778  1        1.541031
## `BLK%`     3.458829  1        1.859793
## `STL%`     3.194140  1        1.787216
## DWS        9.206501  1        3.034222
## Pos       14.967513  4        1.402470
## MP        12.367702  1        3.516774
## `DRB%`   666.040901  1       25.807768
## `TS%`      4.821238  1        2.195732
## `AST%`     9.529492  1        3.086987
## `TOV%`     3.961071  1        1.990244
## DBPM      11.284035  1        3.359172
## G          9.457546  1        3.075312
## FTr        2.154126  1        1.467694
## VORP      19.582002  1        4.425156
## `ORB%`   199.012103  1       14.107165
## Age        1.197910  1        1.094491

First thing we notice is that the Age column barely has a relationship with the rest of the variables so we will remove it in future models. We can see that variables such as TRB%, WS and BPM are strongly correlated to their offensive and defensive variations, DRB%, ORB%, OBPM, DBPM DWS, and OWS. We also see strong multi-colinearity using the VIF function. Knowing this we removed TRB% and BPM from the model to see what happens.

additive_model_reduced = lm(PER ~ Pos + G + MP + `TS%` + `3PAr` + FTr + `ORB%` + `DRB%` + `AST%` + `STL%` + `BLK%` + `TOV%` + `USG%` + OWS + DWS + `WS/48` + OBPM + DBPM, data = train) 
summary(additive_model_reduced)$r.squared
## [1] 0.9891921
sqrt(mean(additive_model_reduced$residuals^2))
## [1] 0.4088063

We managed to remove some variables that were likely overkill and still managed to maintain the already good values for R-squared and RMSE. We then moved on to try some interactions between variables to see if we can get better results. While it was partly trial and error we had a hunch that USG% would be a very important variable given its positive relationship to PER. That was confirmed when comparing various combinations of interactions(These were not included for the sake of brevity) which showed that indeed the interactions of variables with USG% were the most significant and reduced the RMSE even more.

interaction_model_USG = lm(PER ~ (G + MP + `TS%` + `3PAr` + FTr + `ORB%` + `DRB%` + `AST%` + `STL%` + `BLK%` + `TOV%` + `USG%` + OWS + DWS + `WS/48` + OBPM + DBPM) * `USG%`, data = train) 
summary(interaction_model_USG)$r.squared
## [1] 0.9914518
sqrt(mean(interaction_model_USG$residuals^2))
## [1] 0.3635672

We then decided to try one last method and see if by running a stepwise AIC on the interaction model we could get better results or at least reduce some interactions and maintain the already good metrics achieved. We would also compare the model with all interactions of USG% and the selected model from the stepwise AIC process and make a statistical decision at \[ \alpha = 0.05 \]

interaction_model_AIC = step(interaction_model_USG, trace = FALSE)
summary(interaction_model_AIC)$r.squared
## [1] 0.9914443
sqrt(mean(interaction_model_AIC$residuals^2))
## [1] 0.3637271
anova(interaction_model_AIC, interaction_model_USG)[2, "Pr(>F)"] < 0.05
## [1] FALSE

As we can see the P-Value was larger than \(\alpha = 0.05\) which means we fail to reject the null-hypothesis and prefer the smaller model.

Results

We now move on to test the models using the testing data we created at the beginning. We will now plot two models, first we have the best model we found which was the interaction model found through stepwise AIC, then we have the reduced additive model found through AIC where we removed additional variables that we felt were not needed.

actual = test$PER

predicted_best = predict(interaction_model_AIC, newdata = test)
predicted_additive_model_reduced = predict(additive_model_reduced, newdata = test)

par(mfrow=c(1,2))
plot(predicted_best, actual,
     xlab = "Predicted Values",
     ylab = "Actual Values",
     main = "USG% Interaction AIC Model")
abline(a = 0,                                       
       b = 1,
       col = "red",
       lwd = 2)

plot(predicted_additive_model_reduced, actual,
     xlab = "Predicted Values",
     ylab = "Actual Values",
     main = "Additive Forward AIC Model")
abline(a = 0,                                       
       b = 1,
       col = "red",
       lwd = 2)

As we can see on the plot, the first two models are almost equally good at predicting PER. We also tested some less effective models(See Appendix).

While both the USG% Interaction AIC and Reduced Additive Model are very good, we would choose the Interaction model since it gives us a lower RMSE value and a higher R-squared value. We can see these in the following table

comparison_table = data.frame(Method = c("USG% Interaction AIC", "Reduced Additive Model"),
         RMSE = c(sqrt(mean(interaction_model_AIC$residuals^2)),sqrt(mean(additive_model_reduced$residuals^2))),
         R_Squared = c(summary(interaction_model_AIC)$r.squared,summary(additive_model_reduced)$r.squared))
kbl(comparison_table) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Method RMSE R_Squared
USG% Interaction AIC 0.3637271 0.9914443
Reduced Additive Model 0.4088063 0.9891921

We added a Fitted vs. Residuals and Q-Q normality plot to show how well the data fits using the USG% Interaction AIC model.

plot_fitted_resid(interaction_model_AIC)

plot_qq(interaction_model_AIC)

Discussion

Before starting this analysis we didn’t really know what to expect regarding the accuracy or how difficult it would be to find a good model. After running the first few tests we realized that even from the starting additive model we got a pretty solid one. It would only make sense that it could get better when applying the other methods learned during the class such as transformations, interactions, AIC, BIC, etc. In the end we were left with an Interaction model that used the interaction of only one variable(`USG%) and ran it through the step function and managed to improve it even more using AIC. This model is a close to perfect as we can get without abusing the number of predictors and adding all possible interactions. While there are still some predictors that have strong correlations with each other we found that removing them made the model worse, we feel confident that this model is the best choice.

Appendix

Before starting the project we had asked the question: Are offensive or defensive statistics better predictors for PER? When we built the models we noticed they were not as good as the additive or interaction models, for this reason we included them in the appendix and limited the plots and results shown.

We now move on to build additional models based on whether a statistic is a defensive or an offensive one. To briefly explain, a variable such as ORB%(Offensive Rebound Rating) is considered an offensive statistic and DRB%(Defensive Rebound Rating) is considered a defensive statistic. This is done mainly for curiosity and to determine if there is an advantage to using one type of variable over the other.

Offensive Statistics Models:

offensive_model = lm(PER ~ `TS%` + `3PAr` + FTr + `ORB%` + `AST%` + OWS + OBPM, data = train)
summary(offensive_model)$r.squared
## [1] 0.8942361
calc_loocv_rmse(offensive_model)
## [1] 1.283157

Defensive Statistics Models:

defensive_model = lm(PER ~ `DRB%` + `STL%` + `BLK%` + DWS + DBPM, data = train)
summary(defensive_model)$r.squared
## [1] 0.2211871
calc_loocv_rmse(defensive_model)
## [1] 3.479125

These results seem interesting, we thought both models would manage about the same but that was not the case. The offensive model performs better than the defensive model. What this tells us is that a player that is better offensively will likely have a higher Efficiency Rating than a player that is primarily a defender.

We also tried to build some other models with additional transformations but these were not any better than the interaction or additive models. Many more models were built and discarded, we decided not to include every single one so that the report was not cluttered with extra code that was not actually helpful.

log_model = lm(log(PER) ~ Pos + I(G^2) + I(MP^2) + I(`TS%`^2) + I(`3PAr`^2) + I(FTr^2) + I(`ORB%`^2) + I(`DRB%`^2) + 
    I(`AST%`^2) + I(`STL%`^2) + I(`BLK%`^2) + I(`TOV%`^2) + I(`USG%`^2) + I(OWS^2) + I(DWS^2) + 
    I(`WS/48`^2) + I(OBPM^2) + I(DBPM^2), data = train)

log_model2 = lm(log(PER) ~ G + MP + `TS%` + `3PAr` + `ORB%` + `DRB%` + `AST%` + `STL%` + 
    `BLK%` + `TOV%` + `USG%` + OWS + DWS + `WS/48` + OBPM + DBPM + 
    G:`USG%` + `TS%`:`USG%` + `3PAr`:`USG%` + `ORB%`:`USG%` + 
    `DRB%`:`USG%` + `AST%`:`USG%` + `STL%`:`USG%` + `TOV%`:`USG%` + 
    `USG%`:OWS + `USG%`:DBPM, data = train)

sqrt(mean(log_model$residuals^2))
## [1] 0.06706434
summary(log_model)$r.squared
## [1] 0.9129188
sqrt(mean(log_model2$residuals^2))
## [1] 0.03430721
summary(log_model2)$r.squared
## [1] 0.9772117

Group Members: