# needed libraries
# might need to install.packages("nbastatR")
library(tidyverse)
library(nbastatR)
library(lubridate)
library(dplyr)
library(ggplot2)
# change connection size to allow for importing large data sets
Sys.setenv("VROOM_CONNECTION_SIZE" = 262144)
# make sure all columns are printed
options(width = 10000)
# nba stats from seasons 2004-2024
if (file.exists("data/nbastats.rds")) {
<- readRDS("data/nbastats.rds")
combined_tibble <- combined_tibble |> filter(source == "nba_2004") |> select(-source)
nba_2004 <- combined_tibble |> filter(source == "nba_2005") |> select(-source)
nba_2005 <- combined_tibble |> filter(source == "nba_2006") |> select(-source)
nba_2006 <- combined_tibble |> filter(source == "nba_2007") |> select(-source)
nba_2007 <- combined_tibble |> filter(source == "nba_2008") |> select(-source)
nba_2008 <- combined_tibble |> filter(source == "nba_2009") |> select(-source)
nba_2009 <- combined_tibble |> filter(source == "nba_2010") |> select(-source)
nba_2010 <- combined_tibble |> filter(source == "nba_2011") |> select(-source)
nba_2011 <- combined_tibble |> filter(source == "nba_2012") |> select(-source)
nba_2012 <- combined_tibble |> filter(source == "nba_2013") |> select(-source)
nba_2013 <- combined_tibble |> filter(source == "nba_2014") |> select(-source)
nba_2014 <- combined_tibble |> filter(source == "nba_2015") |> select(-source)
nba_2015 <- combined_tibble |> filter(source == "nba_2016") |> select(-source)
nba_2016 <- combined_tibble |> filter(source == "nba_2017") |> select(-source)
nba_2017 <- combined_tibble |> filter(source == "nba_2018") |> select(-source)
nba_2018 <- combined_tibble |> filter(source == "nba_2019") |> select(-source)
nba_2019 <- combined_tibble |> filter(source == "nba_2020") |> select(-source)
nba_2020 <- combined_tibble |> filter(source == "nba_2021") |> select(-source)
nba_2021 <- combined_tibble |> filter(source == "nba_2022") |> select(-source)
nba_2022 <- combined_tibble |> filter(source == "nba_2023") |> select(-source)
nba_2023 <- combined_tibble |> filter(source == "nba_2024") |> select(-source)
nba_2024 else {
} <- game_logs(seasons = 2004, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2004 <- game_logs(seasons = 2005, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2005 <- game_logs(seasons = 2006, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2006 <- game_logs(seasons = 2007, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2007 <- game_logs(seasons = 2008, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2008 <- game_logs(seasons = 2009, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2009 <- game_logs(seasons = 2010, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2010 <- game_logs(seasons = 2011, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2011 <- game_logs(seasons = 2012, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2012 <- game_logs(seasons = 2013, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2013 <- game_logs(seasons = 2014, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2014 <- game_logs(seasons = 2015, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2015 <- game_logs(seasons = 2016, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2016 <- game_logs(seasons = 2017, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2017 <- game_logs(seasons = 2018, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2018 <- game_logs(seasons = 2019, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2019 <- game_logs(seasons = 2020, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2020 <- game_logs(seasons = 2021, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2021 <- game_logs(seasons = 2022, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2022 <- game_logs(seasons = 2023, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2023 <- game_logs(seasons = 2024, league = "NBA", result_types = "player", season_types = "Regular Season")
nba_2024 <- bind_rows(
combined_tibble |> mutate(source = "nba_2004"),
nba_2004 |> mutate(source = "nba_2005"),
nba_2005 |> mutate(source = "nba_2006"),
nba_2006 |> mutate(source = "nba_2007"),
nba_2007 |> mutate(source = "nba_2008"),
nba_2008 |> mutate(source = "nba_2009"),
nba_2009 |> mutate(source = "nba_2010"),
nba_2010 |> mutate(source = "nba_2011"),
nba_2011 |> mutate(source = "nba_2012"),
nba_2012 |> mutate(source = "nba_2013"),
nba_2013 |> mutate(source = "nba_2014"),
nba_2014 |> mutate(source = "nba_2015"),
nba_2015 |> mutate(source = "nba_2016"),
nba_2016 |> mutate(source = "nba_2017"),
nba_2017 |> mutate(source = "nba_2018"),
nba_2018 |> mutate(source = "nab_2019"),
nba_2019 |> mutate(source = "nba_2020"),
nba_2020 |> mutate(source = "nba_2021"),
nba_2021 |> mutate(source = "nba_2022"),
nba_2022 |> mutate(source = "nba_2023"),
nba_2023 |> mutate(source = "nba_2024"),
nba_2024
)saveRDS(combined_tibble, "data/nbastats.rds")
}
# function to isolate games for LeBron
<- function(x, nameOfPlayer) {
getyearstats |>
x filter(namePlayer == nameOfPlayer) |>
mutate(games_missed = lead(numberGameTeamSeason) - numberGameTeamSeason - 1) |>
mutate(game_score = pts + (0.4 * fgm) - (0.7 * fga) - (0.4 * (fta - ftm)) + (0.7 * oreb) + (0.3 * dreb) + stl + (0.7 * ast) + (0.7 * blk) - (0.4 * pf) - tov) |>
mutate(game_PER = ((fgm * 85.910) + (stl * 53.897) + (fg3m * 51.757) + (ftm * 46.845) + (blk * 39.190) + (oreb * 39.190) + (ast * 34.677) + (dreb * 14.707) - (pf * 17.174) - ((fta - ftm) * 20.091) - ((fga - fgm) * 39.190) - (tov * 53.897)) * (1 / minutes)) |>
mutate(year = year(dateGame)) |>
group_by(year) |>
summarize(
avg_pts = mean(pts),
avg_mins = mean(minutes),
avg_fg_perc = mean(pctFG),
avg_plus_minus = mean(plusminus),
avg_game_score = mean(game_score),
PER = mean(game_PER),
games_played = n(),
most_games_missed = max(games_missed, na.rm = TRUE)
) }
NBA Player Injury Analysis - Jeremiah Tan
Introduction
In today’s world, professional athletes are paid millions to perform at the highest level. Star athletes like LeBron James, Lionel Messi, and Shohei Ohtani dominate headlines and demand some of the highest salaries ever. However, are such exorbitant costs reasonably proportional to the performance of these players, even through seasons of wear and tear? Or can we find a more accurate measure of their value? This is a constant question in the minds of general managers, especially within the NBA, where salary must carefully be balanced with performance in order to create an optimal team to win championships. Current analytical models, like FiveThirtyEight, are a decent predictor for player value, but have large drawbacks, as they do not factor in certain intangibles like injury history and clutch factors. My research aims to utilize data analytics to analyze performance before and after injuries, and equate that to salary changes. In the future, I hope to develop a more sophisticated model that can later be extended to more sports than basketball.
Summary of Winter 2024 Work
Throughout the course of this quarter, the primary focus of my research has been getting familiar with the R language, as well as finding a way to obtain NBA regular season statistics data for the last 20-25 years. I attempted to web scrape data off of NBA.com and BasketballReference, but experienced various setbacks, as I had problems getting R to recognize what data to collect. After finally collecting the necessary data, I experimented with various ways to filter out useful quantities and statstics that I will need to analyze further next quarter, such as the maximum number of consecutive games missed and a player’s game score. An example is shown below, using the renowned basketball player LeBron James.
LeBron James
Data for the NBA player LeBron James through 2004-2021 (Regular Season only):
# finding lebron stats every year
<- getyearstats(nba_2004, "LeBron James")
lebron_2004 <- getyearstats(nba_2005, "LeBron James")
lebron_2005 <- getyearstats(nba_2006, "LeBron James")
lebron_2006 <- getyearstats(nba_2007, "LeBron James")
lebron_2007 <- getyearstats(nba_2008, "LeBron James")
lebron_2008 <- getyearstats(nba_2009, "LeBron James")
lebron_2009 <- getyearstats(nba_2010, "LeBron James")
lebron_2010 <- getyearstats(nba_2011, "LeBron James")
lebron_2011 <- getyearstats(nba_2012, "LeBron James")
lebron_2012 <- getyearstats(nba_2013, "LeBron James")
lebron_2013 <- getyearstats(nba_2014, "LeBron James")
lebron_2014 <- getyearstats(nba_2015, "LeBron James")
lebron_2015 <- getyearstats(nba_2016, "LeBron James")
lebron_2016 <- getyearstats(nba_2017, "LeBron James")
lebron_2017 <- getyearstats(nba_2018, "LeBron James")
lebron_2018 <- getyearstats(nba_2019, "LeBron James")
lebron_2019 <- getyearstats(nba_2020, "LeBron James")
lebron_2020 <- getyearstats(nba_2021, "LeBron James")
lebron_2021
<- bind_rows(lebron_2004, lebron_2005, lebron_2006, lebron_2007, lebron_2008, lebron_2009, lebron_2010, lebron_2011, lebron_2012, lebron_2013, lebron_2014, lebron_2015, lebron_2016, lebron_2017, lebron_2018, lebron_2019, lebron_2020, lebron_2021) lebron_year_by_year
lebron_year_by_year
# A tibble: 34 × 9
year avg_pts avg_mins avg_fg_perc avg_plus_minus avg_game_score PER games_played most_games_missed
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 2003 20.2 40.1 0.412 -2.09 14.3 16.2 32 0
2 2004 21.4 39.1 0.402 -1.60 14.7 17.1 47 3
3 2004 24.4 40.9 0.474 4.59 21.1 24.6 29 0
4 2005 28.8 43.2 0.473 0.373 22.7 25.0 51 2
5 2005 30.4 41.5 0.497 5.79 23.0 26.6 28 0
6 2006 31.9 43.1 0.471 2.22 24.0 26.3 51 1
7 2006 27.3 40.3 0.485 4 20.5 24.3 29 0
8 2007 27.3 41.3 0.466 5.31 20.2 23.4 49 1
9 2007 28.8 39.5 0.473 2.12 23.2 27.9 26 5
10 2008 30.6 40.9 0.485 1.67 24.7 28.9 49 1
# ℹ 24 more rows
The lebron_year_by_year
tibble returns half-year statistics for LeBron, and the most_games_missed
column should indicate when LeBron suffered an injury; for example, in the first half of the 2007-2008 season, LeBron missed 5 games in a row, his longest absence up to that point, indicating a possible injury was suffered. We can obtain every half-year for which LeBron missed more than 5 games consecutively, providing a baseline for identifying injury:
|>
lebron_year_by_year filter(most_games_missed >= 5)
# A tibble: 3 × 9
year avg_pts avg_mins avg_fg_perc avg_plus_minus avg_game_score PER games_played most_games_missed
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 2007 28.8 39.5 0.473 2.12 23.2 27.9 26 5
2 2014 25.2 37.6 0.481 3.79 19.8 25.2 29 8
3 2021 25.2 33.6 0.515 6.43 20.7 29.7 40 20
Based on this information, we now know what years to focus our study on. We can now plot all of our data using the ggplot2
library (will test between local regression and generative additive models):
ggplot(lebron_year_by_year, aes(x = year, y = avg_game_score)) +
geom_point() +
geom_smooth(method = "gam", se = TRUE) +
labs(title = "LeBron's performance year over year, 2003-2021", x = "years", y = "average game score")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
To highlight certain injury points identified earlier, we will re-plot this graph, labeling above points where LeBron suffered significant missed time:
<- lebron_year_by_year[lebron_year_by_year$most_games_missed >= 5, ]
highlighted
ggplot(lebron_year_by_year, aes(x = year, y = avg_game_score)) +
geom_point() +
geom_smooth(method = "gam", se = TRUE) +
geom_text(data = highlighted, aes(label = "Injury Point"), vjust = -0.5, hjust = 0.5, size = 4, color = "red") +
labs(title = "LeBron's performance year over year, 2003-2021", x = "years", y = "average game score")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
This model is reproducible for almost any player within the NBA, granted they have started their careers post 2000.
Now we will graph this same graph, but replacing our y-axis with PER
instead of avg_game_score
.
ggplot(lebron_year_by_year, aes(x = year, y = PER)) +
geom_point() +
geom_smooth(method = "gam", se = TRUE) +
labs(title = "LeBron's performance year over year, 2003-2021", x = "years", y = "PER - Player Efficiency Rating")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
<- lebron_year_by_year[lebron_year_by_year$most_games_missed >= 5, ]
highlighted
ggplot(lebron_year_by_year, aes(x = year, y = PER)) +
geom_point() +
geom_smooth(method = "gam", se = TRUE) +
geom_text(data = highlighted, aes(label = "Injury Point"), vjust = -0.5, hjust = 0.5, size = 4, color = "red") +
labs(title = "LeBron's performance year over year, 2003-2021", x = "years", y = "PER - Player Efficiency Rating")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Questions:
Some questions/ideas that the rest of my research will focus on will be as follows:
Q1) Is PER (Player Efficiency Rating) a better measure of player success than Game Score?
Q1a) If so, do we use uPER (unadjusted PER)?
Q1b) If so, we should measure the difference between the PER’s of prior season to injury and post season to injury.
Q2) We will need to graph the difference in PER vs maximum numbers of games missed consecutively across numerous players (similar to LeBron graphs above)
Q2a) We could make different graphs depending on category of injury (5-10 games missed, 11-25 games, >25games)
Q3b) We need to figure out a way to consistently return the right number of max consecutive games missed (second half returns NA values, most likely due to the lead function being used)
Q3) Should we continue to use half year intervals for data collection?
Q3a) If so, we need split years into two (oct, nov, dec of one year, jan, feb, mar of second year)
Q3b) add in around 25-50 players within tibble to get preliminary info
Q4) Based on this average difference of PER, can we find a reliable way to predict how a player’s value decreases based on the category of injury sustained?
Q4a) Should we add in player contract value per year?
Q4b) Can we find a reliable formula/equation to predict change in value?
Q4c) Should we add player position info to make data more applicable to certain players?
Q5) To test validity of model, we need to use examples of real injuries to players and compare predicted player performance decrease to actual player performance decrease.
New Developments:
Since Winter quarter, a key focus for this project has been to automate most of the function needed to extract player data from the large tibbles that the nbastatR
package has granted us. As such, most of the beginning of the quarter has been dedicated to refining the data collection process, as well as creating and automating new functions.
Below is a function, get_player_stats
, able to extract data within a specific time period for any player in the NBA within the nbastatR
library, granted you already have manually created tables of the NBA seasons they have participated in, in the form nba_20xx
:
<- function(years, nameOfPlayer) {
get_player_stats
# Initialize a list to store the results
<- list()
player_stats
# Loop through each year
for (year in years) {
# Get the tibble for the current year (assuming you have nba_2004, nba_2005, ..., nba_2024 loaded in your environment)
<- get(paste0("nba_", year))
current_nba
# Get the stats for players for the current year
paste0(tolower(gsub(" ", "_", nameOfPlayer)), "_", year)]] <- getyearstats(current_nba, nameOfPlayer)
player_stats[[
}
for (i in seq_along(player_stats)) {
# Determine the index of the year column
<- which(names(player_stats[[i]]) == "year")
year_col_index
# Determine the year range for the current season
<- min(player_stats[[i]]$year)
season_start_year <- max(player_stats[[i]]$year)
season_end_year
# Determine the mid-year of the season
<- (season_start_year + season_end_year) %/% 2
mid_year
# Add a new column indicating the half-year
$half_year <- ifelse(player_stats[[i]]$year <= mid_year, 1, 2)
player_stats[[i]]
# Move the half_year column next to the year column
<- player_stats[[i]] |> select(year, half_year, everything())
player_stats[[i]]
}
# Combine everything and return a big table
<- do.call(rbind, player_stats)
full_stats_table return(full_stats_table)
}
We can test this using our example player LeBron James, and compare the results of this function to the manual collection we did last quarter:
<- get_player_stats(2004:2021, "LeBron James")
lebron_stats
lebron_stats
# A tibble: 34 × 10
year half_year avg_pts avg_mins avg_fg_perc avg_plus_minus avg_game_score PER games_played most_games_missed
* <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 2003 1 20.2 40.1 0.412 -2.09 14.3 16.2 32 0
2 2004 2 21.4 39.1 0.402 -1.60 14.7 17.1 47 3
3 2004 1 24.4 40.9 0.474 4.59 21.1 24.6 29 0
4 2005 2 28.8 43.2 0.473 0.373 22.7 25.0 51 2
5 2005 1 30.4 41.5 0.497 5.79 23.0 26.6 28 0
6 2006 2 31.9 43.1 0.471 2.22 24.0 26.3 51 1
7 2006 1 27.3 40.3 0.485 4 20.5 24.3 29 0
8 2007 2 27.3 41.3 0.466 5.31 20.2 23.4 49 1
9 2007 1 28.8 39.5 0.473 2.12 23.2 27.9 26 5
10 2008 2 30.6 40.9 0.485 1.67 24.7 28.9 49 1
# ℹ 24 more rows
lebron_year_by_year
# A tibble: 34 × 9
year avg_pts avg_mins avg_fg_perc avg_plus_minus avg_game_score PER games_played most_games_missed
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 2003 20.2 40.1 0.412 -2.09 14.3 16.2 32 0
2 2004 21.4 39.1 0.402 -1.60 14.7 17.1 47 3
3 2004 24.4 40.9 0.474 4.59 21.1 24.6 29 0
4 2005 28.8 43.2 0.473 0.373 22.7 25.0 51 2
5 2005 30.4 41.5 0.497 5.79 23.0 26.6 28 0
6 2006 31.9 43.1 0.471 2.22 24.0 26.3 51 1
7 2006 27.3 40.3 0.485 4 20.5 24.3 29 0
8 2007 27.3 41.3 0.466 5.31 20.2 23.4 49 1
9 2007 28.8 39.5 0.473 2.12 23.2 27.9 26 5
10 2008 30.6 40.9 0.485 1.67 24.7 28.9 49 1
# ℹ 24 more rows
As we can see, the tables are largely identical, meaning we were successful in automating a part of the process of data collection. However, in order to iterate through potentially hundreds of players for data analysis, we need to go further, tackling the next problem of identifying which years these players played in, through two functions, get_min_career_year
and get_max_career_year
:
<- function(nameOfPlayer) {
get_min_career_year
# accesses nbastatR function player_careers, which returns tibbles of all stats of a player, including playoffs
<- players_careers(players = nameOfPlayer, modes = c("Totals"))
player_years
# specifically identify a specific tibble, and then a specific column to extract a year from
<- player_years[5]$dataTable[1]
player_years <- player_years[[1]]
player_years
# identifies the last 2 numbers in the year column, and extracts them
<- sapply(strsplit(player_years$slugSeason, "-"), function(x) as.numeric(x[2]))
years
# if this player has played in 19xx, then display that, if not, then they have to have played 20xx
<- ifelse(years > 24, years + 1900, years + 2000)
adjusted_years
# identify the lowest year to determine the start of their NBA career
<- min(adjusted_years, na.rm = TRUE)
min_year
# return value
return(min_year)
}
# same principles above apply to this get_max function
<- function(nameOfPlayer) {
get_max_career_year
<- players_careers(players = nameOfPlayer, modes = c("Totals"))
player_years
<- player_years[5]$dataTable[1]
player_years <- player_years[[1]]
player_years
<- sapply(strsplit(player_years$slugSeason, "-"), function(x) as.numeric(x[2]))
years
<- ifelse(years > 24, years + 1900, years + 2000)
adjusted_years
<- max(adjusted_years, na.rm = TRUE)
max_year
return(max_year)
}
We can test these two new functions with two players, LeBron James and Kobe Bryant, which should return 2004
and 2016
:
<- get_min_career_year("LeBron James") lebron_min_year_test
Acquiring LeBron James career Totals statistic tables
SeasonTotalsRegularSeason
CareerTotalsRegularSeason
SeasonTotalsPostSeason
CareerTotalsPostSeason
SeasonTotalsAllStarSeason
CareerTotalsAllStarSeason
SeasonRankingsRegularSeason
SeasonRankingsPostSeason
<- get_max_career_year("Kobe Bryant") kobe_max_year_test
Acquiring Kobe Bryant career Totals statistic tables
SeasonTotalsRegularSeason
CareerTotalsRegularSeason
SeasonTotalsPostSeason
CareerTotalsPostSeason
SeasonTotalsAllStarSeason
CareerTotalsAllStarSeason
SeasonRankingsRegularSeason
SeasonRankingsPostSeason
lebron_min_year_test
[1] 2004
kobe_max_year_test
[1] 2016
Now that we have this, we can create another function, get_all_stats
, that combines the three we have made, and can get the career statistic data for any player in the NBA, given the player name:
<- function(nameOfPlayer) {
get_all_stats <- get_min_career_year(nameOfPlayer)
player_min <- get_max_career_year(nameOfPlayer)
player_max
<- get_player_stats(player_min:player_max, nameOfPlayer)
player_stats return(player_stats)
}
Now that we have such a function, we can create a large table of the top 100 players in the NBA within the past 10 years, and start our data analysis by isolating these players’ data and combining them into one large table:
if (file.exists("data/allplayertibble.rds")) {
<- read_rds("data/allplayertibble.rds")
all_player_tibble else {
} <- c("LeBron James", "Kevin Durant", "Stephen Curry", "Giannis Antetokounmpo", "Kawhi Leonard", "James Harden", "Anthony Davis", "Luka Doncic", "Nikola Jokic", "Joel Embiid", "Damian Lillard", "Chris Paul", "Russell Westbrook", "Kyrie Irving", "Jayson Tatum", "Paul George", "Jimmy Butler", "Klay Thompson", "Draymond Green", "Bradley Beal", "Devin Booker", "Donovan Mitchell", "Karl-Anthony Towns", "DeMar DeRozan", "Trae Young", "Pascal Siakam", "Jalen Brunson", "Rudy Gobert", "Jrue Holiday", "Ja Morant", "Bam Adebayo", "Jaylen Brown", "Khris Middleton", "Zach LaVine", "Brandon Ingram", "De'Aaron Fox", "Shai Gilgeous-Alexander", "Darius Garland", "CJ McCollum", "Julius Randle", "Domantas Sabonis", "Fred VanVleet", "Kristaps Porzingis", "Myles Turner", "OG Anunoby", "Andrew Wiggins", "Marcus Smart", "Dejounte Murray", "Lauri Markkanen", "Michael Porter Jr.", "Jerami Grant", "Aaron Gordon", "Jarrett Allen", "Tobias Harris", "Jordan Clarkson", "Deandre Ayton", "Mikal Bridges", "Kyle Lowry", "Gordon Hayward", "Kemba Walker", "Blake Griffin", "Al Horford", "Steven Adams", "Serge Ibaka", "Montrezl Harrell", "Andre Drummond", "Victor Oladipo", "Malcolm Brogdon", "Robert Covington", "Buddy Hield", "Joe Harris", "Clint Capela", "Markieff Morris", "Harrison Barnes", "Jusuf Nurkic", "Eric Bledsoe", "Terry Rozier", "Spencer Dinwiddie", "Norman Powell", "Seth Curry", "Dennis Schroder", "Bogdan Bogdanovic", "Ricky Rubio", "Goran Dragic", "Derrick White", "Kevin Huerter", "Duncan Robinson", "P.J. Tucker", "Reggie Jackson", "Eric Gordon", "Patrick Beverley", "Lou Williams", "Will Barton", "Enes Freedom", "Nicolas Batum", "Kentavious Caldwell-Pope", "Marcus Morris Sr.", "John Collins", "Matisse Thybulle", "Wendell Carter Jr.", "Tyler Herro")
all_players
# adds player name column to data
<- function(player_data, player_name) {
add_player_name <- player_data %>% mutate(player = player_name)
player_data return(player_data)
}
# gets data of all players in the list above
<- list()
all_players_stats for (player in all_players) {
<- get_all_stats(player)
player_stats <- add_player_name(player_stats, player)
player_stats_with_name <- player_stats_with_name
all_players_stats[[player]]
}
# binds data into one large tibble
<- bind_rows(all_players_stats)
all_player_tibble
# Arrange the tibble by player name and year
<- all_player_tibble %>% arrange(player, year)
all_player_tibble
# View the combined tibble
all_player_tibble
# Create a new column 'season' based on 'year' and 'half_year', to make values easier to look at
<- all_player_tibble |>
all_player_tibble mutate(season = if_else(half_year == 1, paste0(year, "-", substr(year + 1, 3, 4)), paste0(year - 1, "-", substr(year, 3, 4)))) |>
select(season, player, everything())
# Create a new column that has delta PER to eventually plot it
<- all_player_tibble |>
all_player_tibble group_by(player) |>
arrange(year, half_year) |>
mutate(del_PER = PER - lag(PER)) |>
ungroup()
<- all_player_tibble |>
all_player_tibble arrange(player)
saveRDS(all_player_tibble, "allplayertibble.rds")
}
# view the whole tibble
all_player_tibble
# A tibble: 1,874 × 13
season player year half_year avg_pts avg_mins avg_fg_perc avg_plus_minus avg_game_score PER games_played most_games_missed del_PER
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 2014-15 Aaron Gordon 2014 1 5.82 15 0.534 -3.73 4.55 12.4 11 32 NA
2 2015-16 Aaron Gordon 2015 1 6.84 18.4 0.456 -0.0968 6.35 17.4 31 1 4.94
3 2014-15 Aaron Gordon 2015 2 4.97 17.6 NA -3.39 3.75 10.3 36 2 -7.06
4 2016-17 Aaron Gordon 2016 1 10.6 26.0 0.419 -2.06 7.37 13.1 34 0 2.79
5 2015-16 Aaron Gordon 2016 2 10.8 27.5 NA -1.49 10.1 17.2 47 3 4.15
6 2017-18 Aaron Gordon 2017 1 18.9 33.3 0.487 -2.39 15.3 21.9 28 5 4.68
7 2016-17 Aaron Gordon 2017 2 14.3 30.8 0.454 -1.98 11.0 16.9 46 2 -5.06
8 2017-18 Aaron Gordon 2018 2 16.5 32.7 0.385 -0.833 10.5 15.3 30 9 -1.55
9 2019-20 Aaron Gordon 2019 1 13.2 31.1 0.388 -2.79 9.86 14.6 29 3 -0.723
10 2020-21 Aaron Gordon 2020 1 11.4 23.8 0.420 -0.200 8.56 16.1 5 0 1.54
# ℹ 1,864 more rows
The resulting table we get is relatively large, and to visualize our results, we can use the ggplot
library, and graph our table, using most_games_missed
as our x-axis and del_PER
as our y-axis:
ggplot(all_player_tibble, aes(x = most_games_missed, y = del_PER)) +
geom_point() +
geom_smooth(method = "gam", se = TRUE) +
labs(title = "Data from Top 100 Players in the NBA (>= 3 seasons played)", x = "Max Consec. Games Missed", y = "Change in PER")
At first glance, this data looks quite jumbled and somewhat hard to draw any conclusions from, as the trendline seems to hold even rather than slope upwards or downwards. In order to better analyze the trend line, we can first try to take log(del_PER)
, which could help offset the effects of outliers on the graph itself, as iit mproves the fit of the model by transforming the distribution of the features to a more normally-shaped bell curve.
ggplot(all_player_tibble,
aes(x = most_games_missed,
y = sign(del_PER)*log(abs(del_PER)))) +
geom_point() +
geom_smooth(method = "gam", se = TRUE) +
labs(title = "Data from Top 100 Players in the NBA (>= 3 seasons played)", x = "Max Consec. Games Missed", y = "Change in PER")
While the data becomes somewhat easier to interpret, we can go one step further and eliminate the points altogether, viewing only the curve of best fit. This way, the scale of the y-axis reduces, and the trendline should become more pronounced.
ggplot(all_player_tibble, aes(x = most_games_missed, y = del_PER)) +
geom_smooth(method = "gam", se = TRUE) +
labs(title = "Data from Top 100 Players in the NBA (>= 3 seasons played)", x = "Max Consec. Games Missed", y = "Change in PER")
Within this new graph, we definitely see a more pronounced trendline, which seems to indicate a negative correlation between missing games due to injury and change in a player’s performance, meaning the graph supports the conclusion that more severe injuries lead to decreased performance, which is very reasonable. We can confirm this by running a simple linear regression test, denoted with the function lm()
:
(While linear regression does not tell us about the curve of best fit, the program will create a line of best fit, which is still useful.)
# linear regression test
<- all_player_tibble |>
all_player_tibble mutate(across(everything(), ~ ifelse(is.infinite(.), NA, .)))
lm(del_PER ~ 1 + most_games_missed,
data = all_player_tibble, na.action = na.exclude) |> summary()
Call:
lm(formula = del_PER ~ 1 + most_games_missed, data = all_player_tibble,
na.action = na.exclude)
Residuals:
Min 1Q Median 3Q Max
-33.858 -2.766 -0.173 2.781 40.287
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.76904 0.13408 5.736 1.15e-08 ***
most_games_missed -0.07992 0.01884 -4.242 2.33e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.789 on 1716 degrees of freedom
(156 observations deleted due to missingness)
Multiple R-squared: 0.01038, Adjusted R-squared: 0.009802
F-statistic: 18 on 1 and 1716 DF, p-value: 2.33e-05
From the results, we see that the program’s line of best fit has an estimate slope of -0.079
, meaning on average, a missed game contributes to a loss of -0.079
to a player’s PER. For context, the league wide average PER is 15.00
, so a loss of ~.08
per game does impact playing to a non-trivial extent, agreeing with the conclusion our curve of best fit created earlier.
In order to clean up the data further, toward the 0-20
game mark, it seems that there are an abundance of data points, and we can make a histogram graph to see how many instances there are of players missing a certain range of games, like 0-5
, 6-10
, etc.:
# baseline of the "explanatory variable"
ggplot(all_player_tibble, aes(x = most_games_missed)) +
geom_histogram()
As we can see, there is quite a lot of data in the 0-5
category, but for the intents and purposes of this research, we will disregard players missing less than 5 consecutive games, as our baseline for identifying injury is >=5
games missed. We can isolate players suffering an injury and regraph to see the effects on our new trendline.
<- all_player_tibble |>
filtered_tibble filter(most_games_missed >= 5)
# filtered tibble mapped with points
ggplot(filtered_tibble, aes(x = most_games_missed, y = del_PER)) +
geom_point() +
geom_smooth(method = "gam", se = TRUE) +
labs(title = "Data from Top 100 Players in the NBA (>= 3 seasons played, >=5 games missed)", x = "Max Consec. Games Missed", y = "Change in PER")
# filtered tibble mapped only with curve
ggplot(filtered_tibble, aes(x = most_games_missed, y = del_PER)) +
geom_smooth(method = "gam", se = TRUE) +
labs(title = "Data from Top 100 Players in the NBA (>= 3 seasons played, >=5 games missed)", x = "Max Consec. Games Missed", y = "Change in PER")
# new linear regression once again
<- filtered_tibble |>
filtered_tibble mutate(across(everything(), ~ ifelse(is.infinite(.), NA, .)))
lm(del_PER ~ 1 + most_games_missed,
data = filtered_tibble, na.action = na.exclude) |> summary()
Call:
lm(formula = del_PER ~ 1 + most_games_missed, data = filtered_tibble,
na.action = na.exclude)
Residuals:
Min 1Q Median 3Q Max
-17.735 -2.719 0.120 2.452 40.168
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.88700 0.39704 2.234 0.02602 *
most_games_missed -0.07983 0.02782 -2.870 0.00432 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.857 on 406 degrees of freedom
(49 observations deleted due to missingness)
Multiple R-squared: 0.01988, Adjusted R-squared: 0.01746
F-statistic: 8.235 on 1 and 406 DF, p-value: 0.004325
Now we see a resultant estimate slope of ~0.79
, surprisingly identical to the test we ran earlier. Thus, it is reasonable to conclude that more games missed due to injury impacts a player’s performance by reducing it more drastically the more severe an injury is.
Conclusions and Reflections:
Learning R
Throughout the course of both Winter and Spring quarters, learning R has been both challenging and rewarding. As a powerful statistical programming language, R is renowned for its capabilities in data analysis and visualization, which are essential for any data-driven project. My learning process involved understanding the basics of R syntax, data structures (such as vectors, data frames, and lists), and key functions for data manipulation and analysis. The initial learning curve was steep, but the wealth of online resources and the help of Professor Zhang, I was able to expedite the learning process. I believe that the skills I’ve learned through this TRELS project will help me in my future endeavors, and this experience has definitely made me more open to the field of data science and analytics, a topic I was previously not to keen to learn about.
Future Implications and Possible Connections
In the earlier iterations of my project, I was hoping to be able to connect my data analysis conclusions to a real-world factor in the NBA, specifically the salaries of players. But as the project wore on, it seemed less likely that I would be able to include it. However, for future iterations of this project, linking PER change to salary changes is one of the key things I would like to add. For example, we could create an average of player salary per PER, and calculate projected loss of salary/worth after missing games as well.