index.Rmd

---
title: "NBA Predictions"
description: |
  Comparing my models against FiveThirtyEight
site: distill::distill_website
---

```{r setup, include=FALSE}
source('packages.R')
knitr::opts_chunk$set(echo = FALSE)

# Learn more about creating websites with Distill at:
# https://rstudio.github.io/distill/website.html


# Build the relative path of a data file used by this site.
#
# fn: file name(s) located in the site's `data` folder.
# Returns `data/<fn>`. The function only assembles the path; it neither copies
# the file nor checks that it exists.
# NOTE(review): the data files are presumably copied into `data/` from the
# Python pipeline's output (for git versioning) by a separate step - confirm.
get_path <- function(fn) {
  file.path("data", fn)
}


# Colour palette, taken from ggthemes::palette_pander(8).
light_blue <- "#56B4E9"
dk_blue <- "#0072B2"
green <- "#009E73"
# green2 / green3 are brighter variants of `green`: open Apple Notes, press
# cmd + shift + c for the colour palette, enter `green`, then shift the green
# slider to the right.
green2 <- "#00C073"
green3 <- "#00DD73"
yellow <- "#F0E442"
orange <- "#E69F00"
red <- "#D55E00"
pink <- "#CC79A7"
gray <- "#999999"

# Inline CSS for the group-label rows of the kable accuracy table.
css <- sprintf("background-color: %s; color: #fff;", dk_blue)

# Model columns shown in the prediction tables.
models <- c("elo", "result_v02")

# Lookup table for team display names.
# Every team appears once, with its key duplicated as `team1`/`team2` and its
# HTML-styled name/record duplicated as `name1`/`name2` and `rec1`/`rec2`, so
# the same table can be joined onto either side of a game.
df_name <- read_csv(get_path("records.csv")) %>%
  select(team, name, record_result) %>%
  mutate(
    team1 = team,
    team2 = team,
    # Bold, dark-blue team name
    name1 = paste0('<span style="font-weight:bold;color:', dk_blue, '">', name, '</span>'),
    # Gray current record
    rec1 = paste0('<span style="color:', gray, '">', record_result, '</span>'),
    # The *2 columns carry exactly the same markup as their *1 counterparts
    rec2 = rec1,
    name2 = name1
  ) %>%
  select(-record_result)
           
```

Each morning I challenge [FiveThirtyEight's Raptor](https://projects.fivethirtyeight.com/2020-nba-predictions/games/) predictions[^1] and compare them against some super-simple benchmarks. **Updated `r format(now("EST"), '%a, %b %d %I:%M %p')`**.

[^1]:FiveThirtyEight's predictions are updated daily and can be downloaded here: https://projects.fivethirtyeight.com/nba-model/nba_elo_latest.csv.


## Upcoming Games
The home team's win probability for each upcoming game. Blue represents an expected win for the home team (probability > 50%) and orange/red represents an expected loss. `elo` is the FiveThirtyEight model and `V02` is my model.

```{r readin}

# Load every game's predictions/results and replace the team keys with the
# styled display names from `df_name`.
df <- read_csv(get_path("all_scores.csv")) %>%
  # Ignore the time of day - time zones mess things up anyway
  mutate(date = as_date(datetime)) %>%
  select(-datetime) %>%
  select(date, everything()) %>%
  # Replace team names. dplyr joins take `by`, not `on`; naming the key
  # explicitly avoids depending on an implicit natural join over whatever
  # columns the two tables happen to share.
  left_join(df_name %>% select(team1, name1, rec1), by = "team1") %>%
  mutate(team1 = name1) %>%
  select(-name1) %>%
  left_join(df_name %>% select(team2, name2, rec2), by = "team2") %>%
  mutate(team2 = name2) %>%
  select(-name2)

# Drop the `_prob1` suffix so each model column is named after the model only
# (e.g. `elo_prob1` -> `elo`).
colnames(df) %<>% gsub('[.]*\\_prob1', '', .)
# colnames(df) %<>% gsub('result\\_[.]*', '', .)


# Pick the game days to display: the next 2 dates that have games (today
# included) and the 3 most recent past dates that have games. Working from the
# dates present in the data skips days on which no game is played.
game_dates <- df %>% distinct(date)

next_2_days <- game_dates %>%
  filter(date >= Sys.Date()) %>%
  # Sort explicitly: a bare arrange() has no sort key and leaves the rows in
  # file order, so head(2) was not guaranteed to return the earliest dates.
  arrange(date) %>%
  head(2) %>%
  pull(date)

past_3_days <- game_dates %>%
  filter(date < Sys.Date()) %>%
  arrange(desc(date)) %>%
  head(3) %>%
  pull(date)

# Upcoming games
df_today <- df %>%
  filter(date %in% next_2_days) %>%
  select(-result, -starts_with("pg"), -season_week) %>%
  # Sort while `date` is still a Date; the "mm/dd, day" label created below
  # sorts as text and would put 01/01 before 12/31.
  arrange(date) %>%
  mutate(
    game = paste0(team1, ' vs. ', team2, "<br>", rec1, '&nbsp;&nbsp;&nbsp;', rec2),
    date = format(date, format = '%m/%d, %a')
  ) %>%
  select(date, game, !!models)
 

# DataTables `initComplete` callback reused by the tables on this page.
# It styles the header row (10pt font, dark-blue background, white text).
# Approach taken from https://stackoverflow.com/a/53658138/2138773
head_format <- JS(
  "function(settings, json) {",
  paste0(
    "$(this.api().table().header()).css({'font-size': '10pt', 'background-color': '",
    dk_blue,
    "', 'color': '#fff'});"
  ),
  "}"
)

# Upcoming-games table: one row per game, one percentage column per model.
upcoming_options <- list(
  # Header formatting
  initComplete = head_format,
  pageLength = 100,
  dom = 'ft',
  # Consumed by the RowsGroup plug-in attached below; column 0 is the date
  rowsGroup = list(0),
  columnDefs = list(
    list(className = 'dt-center', targets = 0:3)
  )
)

dt <- datatable(
  df_today,
  escape = FALSE,
  class = 'cell-border stripe',
  rownames = FALSE,
  colnames = str_to_upper(colnames(df_today)),
  options = upcoming_options
) %>%
  formatPercentage(models)

# Colour every model column by its probability:
# < 0.4 red, 0.4-0.5 orange, 0.5-0.6 light blue, > 0.6 dark blue.
prob_colors <- styleInterval(
  cuts = c(0.4, .5, 0.6),
  values = c(red, orange, light_blue, dk_blue)
)
for (mod in models) {
  dt <- formatStyle(
    dt,
    mod,
    color = 'white',
    fontWeight = 'bold',
    backgroundColor = prob_colors
  )
}

# Attach the RowsGroup DataTables plug-in script shipped in `www/`.
rowGroupDep <- htmltools::htmlDependency(
  name = "RowsGroup",
  version = "2.0.0",
  src = 'www',
  script = "dataTables.rowsGroup.js"
)
dt$dependencies <- append(dt$dependencies, list(rowGroupDep))

dt
  
```

## Last Three Days' Predictions vs. Results
The home team's win probability is colored green if the model correctly predicted the game's result, and gray otherwise.

```{r last3}

# Games from the last three game days, with the final score folded into the
# `game` label and a green check mark next to the side that scored more.
check <- paste0('<font color="', green,'"><strong>✓</strong></font>')
df_yesterday <- df %>%
  filter(date %in% past_3_days) %>%
  # Sort newest-first while `date` is still a Date; sorting the "mm/dd, day"
  # label created below would compare text and misorder 12/31 vs. 01/01.
  arrange(desc(date)) %>%
  mutate(
    # The check goes before team1 when team1 scored more, otherwise after team2
    indicator1 = ifelse(pg_score1 > pg_score2, check, ' '),
    indicator2 = ifelse(pg_score1 > pg_score2, ' ', check),
    game = paste0(indicator1, team1,
                  ' (', pg_score1, ') | ', team2, ' (', pg_score2, ')', indicator2,
                  "<br>", rec1, '&nbsp;&nbsp;&nbsp;', rec2),
    date = format(date, format = '%m/%d, %a')
  ) %>%
  select(-team1, -team2, -starts_with('pg'), -season_week, -starts_with('indi')) %>%
  select(date, game, everything())


# Add one `<model>_acc` column per model: 1 when the model called the game
# correctly, 0 otherwise. These columns drive the cell colouring below, see
# https://rstudio.github.io/DT/010-style.html
mod_accs <- paste0(models, '_acc')
for (mod in models) {
  mod_acc <- paste0(mod, '_acc')
  outcome <- df_yesterday$result
  prob <- df_yesterday[[mod]]
  # Correct = probability below 0.5 with result 0, or above 0.5 with result 1.
  # A probability of exactly 0.5 never counts as correct.
  called_loss <- outcome == 0 & prob < 0.5
  called_win <- outcome == 1 & prob > 0.5
  df_yesterday[[mod_acc]] <- as.numeric(called_loss | called_win)
}

# Footer row: number (and share) of games each model called correctly.
n_games <- nrow(df_yesterday)
# Note the space after "of": without it the label rendered as "(out of12)".
total_games <- paste0('Total Correct (out of ', n_games, ')')

# Per-model count of correct calls (the *_acc columns hold 0/1)
total_sums <- df_yesterday %>% select(!!mod_accs) %>% colSums()
total_pct <- paste0(total_sums, ' (', round(total_sums / n_games * 100), '%)')

# Custom table container: fixed header labels plus the totals footer
sketch <- htmltools::withTags(table(
  tableHeader(toupper(c('Date', 'Game', 'Elo', 'V02'))),
  tableFooter(c('', total_games, total_pct))
))

# Results table: model probabilities coloured by whether the call was correct.
tbl_results <- df_yesterday %>%
  select(date, game, !!models, ends_with('_acc'))

# Hide columns with _acc; DataTables targets are zero-indexed (hence -1)
hidden_cols <- which(str_detect(colnames(tbl_results), '_acc$')) - 1

results_options <- list(
  # Header formatting
  initComplete = head_format,
  pageLength = 100,
  dom = 'ft',
  rowsGroup = list(0),
  columnDefs = list(
    list(className = 'dt-center', targets = 0:4),
    list(targets = hidden_cols, visible = FALSE)
  )
)

dt <- datatable(
  tbl_results,
  style = 'default',
  escape = FALSE,
  container = sketch,
  class = 'cell-border stripe',
  rownames = FALSE,
  colnames = str_to_title(colnames(tbl_results)),
  options = results_options
) %>%
  formatPercentage(models)

# Colour each model column from its hidden _acc column: green when the model
# was right (1), gray when it was wrong (0).
acc_colors <- styleEqual(c(0, 1), c(gray, green))
for (mod in models) {
  mod_acc <- paste0(mod, '_acc')
  dt <- formatStyle(
    dt,
    mod,
    valueColumns = mod_acc,
    color = 'white',
    fontWeight = 'bold',
    backgroundColor = acc_colors
  )
}
dt$dependencies <- append(dt$dependencies, list(rowGroupDep))
dt
```

## Projected Record

I estimate the team's final record and compare that to FiveThirtyEight's projections.

```{r records}
# Projected-records table: current record, the 538 projection, and every
# column whose name mentions my model.
df_rec <- read_csv(get_path('records.csv'))

# Keep only the modeled + 538 columns
mod <- 'v02'
rec_cols <- names(df_rec)
keep_names <- rec_cols[str_detect(rec_cols, mod)]
df_rec <- df_rec %>%
  select(name, record_result, record_proj_538_raptor, !!keep_names) %>%
  arrange(desc(record_result))

# Custom container so the header labels are independent of the column names
sketch <- htmltools::withTags(table(
  tableHeader(toupper(c('team', 'Current Record', '538 Proj', 'V02 Proj')))
))

records_options <- list(
  # Header formatting
  initComplete = head_format,
  pageLength = 100,
  dom = 'ft',
  columnDefs = list(
    list(className = 'dt-center', targets = 0:3)
  )
)

datatable(
  df_rec,
  style = 'default',
  escape = FALSE,
  container = sketch,
  class = 'cell-border stripe',
  rownames = FALSE,
  colnames = str_to_upper(colnames(df_rec)),
  options = records_options
)
  
  
```


## Seasonal Model Accuracy 
This section compares the accuracy of my models with that of FiveThirtyEight's.

#### Up to Today
```{r accuracy}
# Todo: save out the index
# Overall accuracy per model/benchmark.
# NOTE(review): row 1 is presumably the season-wide ("all") row - confirm
# against acc_overall.csv.
df <- read_csv(get_path('acc_overall.csv')) %>%
  select(-index)

# First row only, rounded to 3 decimals, with the home-win rate moved first
all <- df[1, ] %>%
  round(3) %>%
  select(home_overall, everything())
# min = .50; max=1
# g_home_freq = gauge(value = all$home_overall, min, max, label='Home')
# g_home_winpct = gauge(value = all$home_winpct, min, max)
# g_v01 = gauge(value = all$v01, min, max)
# g_elo = gauge(value = all$elo, min, max)
# g_home_freq
# fluidRow(
#   column(3,
#       g_home_freq, 
#   ),
#   column(3,
#       g_home_freq, 
#   ),
#   column(3,
#       g_home_freq, 
#   ),
#   column(3,
#       g_home_freq, 
#   )
# )


# TODO: add better coloring, see
# https://haozhu233.github.io/kableExtra/awesome_table_in_html.html#integration_with_formattable
# Display labels, in the column order of `all`
metric_labels <- c(
  "Win Rate", "Home %", "Home % > Away %",
  "v01", 'v02',
  "elo", "carm-elo", "raptor"
)
colnames(all) <- metric_labels

# One row per metric, grouped into benchmarks / my models / FiveThirtyEight
acc_kable <- all %>%
  t() %>%
  kable() %>%
  kable_styling(c('striped', 'hover'), full_width = F)

acc_kable %>%
  pack_rows("Home Team Benchmarks", 1, 3, label_row_css = css) %>%
  pack_rows("My Models", 4, 5, label_row_css = css) %>%
  pack_rows("Five Thirty Eight", 6, ncol(all), label_row_css = css)
  
```

* Win Rate: How often does the home team win?
* Home %: How often does the home team win when its win % before the game was more than 50%? (A home team with a record of 53% is predicted to win, but with a record of 49% is predicted to lose.)
* Home % > Away %: If the home team's win % is greater than away team's win %, the home team is predicted to win. Otherwise, they're predicted to lose. E.g., if home win % = 0.39 and away win% = 0.40, I predict the away team will win.
* v01: Version 1 of my own model.
* V02: Version 2 of my model.

#### Model Calibration

How well do the models' predictions align with true probabilities? Games given a 70% win probability should be won about 70% of the time. You'd expect that if the model probability ranges between 40-50%, the home team should win 40-50% of the time. If the model probability ranges between 90-100%, you'd expect the home team to win 90-100% of the time. The solid gray line represents the true probability (the midpoint of each range); the dashed gray lines mark the edges of the range.

```{r calib}
# Accuracy per probability range for each model. The two benchmark columns
# removed here are not plotted.
df <- read_csv(get_path('acc_overall.csv')) %>%
  rename(score_range = index) %>%
  select(-home_overall, -home_vs_away_winpct)

# TODO: this isn't a true calibration plot.
# This chart should show whether the model point is between 0.80 and 0.90:
# does the interval capture the probability?
calib_long <- df %>%
  filter(score_range != 'all') %>%
  # Row k is labelled with k/10, i.e. the upper edge of its range
  # (subtract 0.05 to get the midpoint instead)
  mutate(prob_range = row_number() / 10) %>%
  select(-score_range) %>%
  gather(model, avg_prob, -prob_range)

ggplot(calib_long, aes(x = prob_range, y = avg_prob, color = model)) +
  geom_line() +
  geom_point() +
  # Reference lines: dashed = edges of each range, solid = its midpoint
  geom_abline(intercept = 0, slope = 1, linetype = 2, color = 'gray') +
  geom_abline(intercept = -0.05, slope = 1, linetype = 1, color = 'gray') +
  geom_abline(intercept = -0.1, slope = 1, linetype = 2, color = 'gray') +
  ggthemes::scale_color_pander() +
  scale_x_continuous(breaks = seq(from = 0, to = 1, by = 0.2)) +
  scale_y_continuous(breaks = seq(from = 0, to = 1, by = 0.2)) +
  theme_minimal() +
  labs(
    title = 'Probability Calibration Plot',
    subtitle = 'How much do the model align with true probabilities?',
    x = 'Probability Score Range (0.1 = from 0.0-0.1)',
    y = 'Frequency of Home Team Win (Within Prob Range)'
  )
```

#### Model Accuracy By Week
```{r accwithin}
# Line chart of accuracy per season week, one line per model.
#
# df:       wide data frame with a `season_week` column, a `home_winpct`
#           column, and one accuracy column per model.
# title:    plot title.
# subtitle: plot subtitle.
# Returns the ggplot object.
plot_acc_by_week <- function(df, title, subtitle) {
  # Long format: one row per (week, model); the home-win benchmark gets a
  # friendlier legend label first.
  long_acc <- gather(
    rename(df, `Home %` = home_winpct),
    model, accuracy, -season_week
  )

  ggplot(long_acc, aes(x = season_week, y = accuracy, color = model)) +
    geom_line() +
    geom_point() +
    # 50% reference line
    geom_hline(yintercept = .5) +
    ggthemes::scale_color_pander() +
    theme_minimal() +
    labs(
      title = title,
      subtitle = subtitle,
      x = 'Season Week',
      y = "Model Accuracy",
      caption = 'Source: FiveThirtyEight.com'
    )
}

# Accuracy for the games played within each single week
df <- read_csv(get_path('acc_within_week.csv'))
# Drop the `acc_pred_` prefix so the legend shows bare model names
colnames(df) <- gsub('acc_pred_[.]*', '', colnames(df))
plot_acc_by_week(
  df,
  title = 'Accuracy of Model Within Season Week',
  subtitle = 'A score of .67 at week 5 is the accuracy of the model for all games within week 5.'
)

```

```{r accupto}
# Cumulative accuracy from week 1 through each week
df <- read_csv(get_path('acc_upto_week.csv'))
# Drop the `acc_pred_` prefix so the legend shows bare model names
colnames(df) <- gsub('acc_pred_[.]*', '', colnames(df))
plot_acc_by_week(
  df,
  title = 'Accuracy of Model Up To Season Week',
  subtitle = 'A score of .67 at week 5 is the accuracy of the model for all games from week 1-5.'
)
```

```{r accsince}
# Accuracy from each week through the current week
df <- read_csv(get_path('acc_since_week.csv'))
# Drop the `acc_pred_` prefix so the legend shows bare model names
colnames(df) <- gsub('acc_pred_[.]*', '', colnames(df))
plot_acc_by_week(
  df,
  title = 'Accuracy of Model Since Season Week',
  subtitle = 'A score of .67 at week 5 is the accuracy of the model for all games from week 5-X, where X is today\'s week.'
)
```