Tutorial for goals saved above expected
Intro
Use the chickenstats
library to scrape play-by-play data and analyze goalies' goals saved above expected.
Parts of this tutorial are optional and will be clearly marked as such. For help, or any questions, please don't hesitate to reach out to chicken@chickenandstats.com or @chickenandstats.com on Blue Sky.
Housekeeping
Import dependencies
Import the dependencies we'll need for the guide
import datetime as dt
import matplotlib.patheffects as mpe
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.lines import Line2D
import chickenstats.utilities
from chickenstats.chicken_nhl import Scraper, Season
from chickenstats.chicken_nhl.info import NHL_COLORS
from chickenstats.chicken_nhl.helpers import charts_directory
Pandas options
Set different pandas options. This cell is optional
Folder structure
Chickenstats matplotlib style
chickenstats.utilities includes a custom style package - this activates it. This cell is also optional
Scrape data
Schedule
Scrape the schedule using the Season
object
Standings and team names
Scrape the standings and create team name dictionaries to use later
team_names = standings.sort_values(by="team_name").team_name.str.upper().tolist()
team_codes = standings.sort_values(by="team_name").team.str.upper().tolist()
team_names_dict = dict(zip(team_codes, team_names, strict=False))
Game IDs
Create a list of game IDs to crape
conds = schedule.game_state == "OFF"
game_ids = schedule.loc[conds].game_id.unique().tolist()
latest_date = schedule.loc[conds].game_date.max()
Play-by-play
Scrape play-by-play data using the Scraper
object
Stats
Aggregate statistics to season and game level
scraper.prep_stats(level="season", disable_progress_bar=True)
season_stats = scraper.stats.copy(deep=True)
scraper.prep_stats(level="game", disable_progress_bar=True)
game_stats = scraper.stats.copy(deep=True)
Goalie dataframes
Filter the dataframes for goalies and add goals saved above expected stats
def prep_goalie_df(data: pd.DataFrame, strengths: bool) -> pd.DataFrame:
"""Prep goalie dataframes for later analysis.
Parameters:
data (pd.DataFrame):
Pandas dataframe of individual statistics scraped using `chickenstats` library
strengths (bool):
If False, aggregates data to all strengths states. If True, data are aggregated
to strength state level
"""
df = data.copy(deep=True)
df = df.loc[df.position == "G"].reset_index(drop=True)
if not strengths:
group_cols = [
"season",
"session",
"game_id",
"game_date",
"player",
"eh_id",
"api_id",
"position",
"team",
"opp_team",
]
group_cols = [x for x in group_cols if x in df.columns]
agg_stats = {
x: "sum"
for x in df.columns
if x not in group_cols and x != "strength_state" and "percent" not in x and "p60" not in x
}
df = df.groupby(group_cols, as_index=False).agg(agg_stats)
df["gsax"] = df.xga - df.ga
df["gsax_p60"] = df.gsax / df.toi * 60
return df
GSaX line charts
Prepping data
Calculating cumlative stats
def calc_cumulative_stats(data: pd.DataFrame) -> pd.DataFrame:
"""Calculate cumulative TOI and xG against.
Parameters:
data (pd.DataFrame):
Pandas dataframe of statistics aggregated from the `chickenstats` library and
prepped for goalie analysis
"""
df = data.copy(deep=True)
group_list = ["season", "session", "player", "eh_id", "api_id", "team"]
df["cum_gp"] = df.groupby(group_list)["game_id"].transform("cumcount") + 1
df["cum_toi"] = df.groupby(group_list)["toi"].transform("cumsum")
df["cum_gsax"] = df.groupby(group_list)["gsax"].transform("cumsum")
return df
Plot line chart function
Function to plot cumulative GSaX and TOI for a given goalie
def plot_line_chart(
data: pd.DataFrame,
goalie: pd.Series,
ax: plt.axes,
ax_title: str | None = None,
legend_label: str | None = None,
x_label: bool = False,
y_label: bool = False,
):
"""Function to plot a seaborn line chart of cumulative time-on-ice and goals scored above expected.
Parameters:
data (pd.DataFrame):
Pandas dataframe of game-level goalie data to plot
goalie (pd.Series):
Row of data from season-level goalie data
ax (plt.axes):
The matplotlib axes to return after plotting the chart
ax_title (str | None):
Customize ax title, or, if None, use the goalie's name
legend_label (str | None):
Customize the legend label, or, if None, list the cumulative GSaX and TOI
x_label (bool):
Whether to print or hide the x-axis label
y_label (bool):
Whether to print or hide the y-axis label
"""
plot_df = data.copy()
color_palette = np.where(
plot_df.api_id == goalie.api_id, NHL_COLORS[goalie.team]["SHOT"], NHL_COLORS[goalie.team]["MISS"]
)
color_palette = dict(zip(plot_df.player, color_palette, strict=False))
NHL_COLORS[goalie.team]["MISS"]
line_width = 3
conds = plot_df.player != goalie.player
sns.lineplot(
x="cum_toi", y="cum_gsax", data=plot_df[conds], hue="player", palette=color_palette, ax=ax, lw=line_width
)
conds = plot_df.player == goalie.player
NHL_COLORS[goalie.team]["SHOT"]
line_width = 6
path_effect_ec = NHL_COLORS[goalie.team]["GOAL"]
path_effect = [mpe.Stroke(foreground=path_effect_ec, alpha=1, linewidth=7), mpe.Normal()]
sns.lineplot(
x="cum_toi",
y="cum_gsax",
data=plot_df[conds],
hue="player",
palette=color_palette,
ax=ax,
zorder=3,
lw=3.5,
path_effects=path_effect,
)
if ax_title == "":
ax_title = ""
elif not ax_title:
ax_title = goalie.player
ax.set_title(ax_title, size=18, weight="heavy", pad=15)
if y_label:
ax.set_ylabel("Cumulative GSaX", size=16, labelpad=15, weight="heavy")
else:
ax.set_ylabel("")
ax.yaxis.set_tick_params(which="both", labelbottom=True)
if x_label:
ax.set_xlabel("Cumulative time-on-ice (minutes)", size=16, labelpad=15, weight="heavy")
else:
ax.set_xlabel("")
ax.xaxis.set_tick_params(which="both", labelbottom=True)
legend_elements = list()
color = NHL_COLORS[goalie.team]["SHOT"]
xG = round(goalie.gsax, 2)
toi_max = round(goalie.toi, 2)
if not legend_label:
legend_label = f"{xG} GSaX in {toi_max} minutes"
element = Line2D([0], [0], lw=3, label=legend_label, color=color, path_effects=path_effect)
legend_elements.append(element)
ax.legend(
handles=legend_elements,
loc="upper left",
ncol=1,
fontsize=14,
title_fontsize=12,
facecolor="white",
framealpha=1,
edgecolor="gray",
).set_zorder(-1)
ax.xaxis.set_tick_params(labelsize=16)
ax.yaxis.set_tick_params(labelsize=16)
return ax
Juuse Saros
Plot single goalie's goals saved above expected and time-on-ice
selected_goalie = "JUUSE.SAROS"
conds = goalies_season_all_sit.eh_id == selected_goalie
fig_size = (8, 5)
fig, ax = plt.subplots(figsize=fig_size, dpi=650)
for idx, goalie in goalies_season_all_sit.loc[conds].iterrows():
plot_df = goalies_game_all_sit.copy()
plot_line_chart(data=plot_df, goalie=goalie, ax=ax, ax_title="", x_label=True, y_label=True)
title = "Saros is having an NHL-average year"
fig.suptitle(title, ha="center", va="center", y=1.027, size=16, weight="heavy")
subtitle = f"Cumulative GSaX & TOI, all situations | 2024-25 season, as of {latest_date}"
fig.text(s=subtitle, ha="center", va="center", x=0.5, y=0.98, size=12)
# Attribution
attribution = "Data & xG model @chickenandstats.com | Viz @chickenandstats.com"
fig.text(s=attribution, x=0.95, y=-0.095, fontsize=8, horizontalalignment="right", style="italic")
fig.savefig("./charts/saros_gsax.png", dpi=650, bbox_inches="tight", facecolor="white")
Top-6 goalies
Create the top goalies dataframe to iterate through for plotting
top_goalies = goalies_season_all_sit.sort_values(by="gsax", ascending=False).head(6).reset_index(drop=True)
Plot cumulative GSaX and TOI
Cumulative time-on-ice and goals saved above expected for top-6 goalies in NHL
## setting figure size
fig_size = (15, 15)
fig, axes = plt.subplots(3, 2, figsize=fig_size, dpi=650, sharex=True, sharey=True)
fig.tight_layout(pad=5)
axes = axes.reshape(-1)
for idx, top_goalie in top_goalies.iterrows():
ax = axes[idx]
x_label = idx >= 4
y_label = idx in [0, 2, 4]
plot_df = goalies_game_all_sit.copy()
plot_line_chart(data=plot_df, goalie=top_goalie, ax=ax, x_label=x_label, y_label=y_label)
title = "Top-6 goaltenders by cumulative goals saved above expected"
fig.suptitle(title, ha="center", va="center", y=1.027, size=24, weight="heavy")
subtitle = f"Cumulative GSaX & cumulative TOI, all situations | 2024-25 season, as of {latest_date}"
fig.text(s=subtitle, ha="center", va="center", x=0.5, y=1.001, size=18)
# Attribution
attribution = "Data & xG model @chickenandstats.com | Viz @chickenandstats.com"
fig.text(s=attribution, x=0.99, y=-0.0125, fontsize=12, horizontalalignment="right", style="italic")
fig.savefig("./charts/top_6_gsax.png", dpi=650, bbox_inches="tight", facecolor="white")
GSaX and time between games
Prepping data
Getting game winners and calculating time between games with the schedule object
def prep_hours_since(data: pd.DataFrame, schedule: pd.DataFrame, strengths: list = None) -> pd.DataFrame:
"""Function to prep dataframe of gsax and hours since for an individual goalie.
Parameters:
data (pd.DataFrame):
Pandas dataframe of goalie stats from `chickenstats` library
strengths (list):
List of strength states to filter the dataframe
"""
if strengths is None:
strengths = ["5v5"]
df = data.copy()
winners = np.where(schedule.home_score > schedule.away_score, schedule.home_team, schedule.away_team)
winners_map = dict(zip(schedule.game_id.astype(str), winners, strict=False))
game_date_dt = pd.to_datetime(schedule.game_date_dt, utc=True)
game_date_map = dict(zip(schedule.game_id.astype(str), game_date_dt, strict=False))
df["game_date_dt"] = df.game_id.map(game_date_map)
df["win"] = df.game_id.map(winners_map)
df.win = np.where(df.team == df.win, 1, 0)
conds = df.strength_state.isin(strengths)
df = df.loc[conds].reset_index(drop=True)
group_list = ["season", "session", "team", "player", "eh_id"]
df["hours_since"] = df.groupby(group_list).game_date_dt.transform(lambda x: x - x.shift(1)).astype(
"timedelta64[s]"
) / pd.Timedelta(hours=1)
conds = np.logical_and.reduce([df.hours_since > 0, df.hours_since <= 175, df.toi >= 10])
df = df.loc[conds].reset_index(drop=True)
return df
Plotting function
Plot individual goalie GSaX / 60 and time since last game
def plot_hours_since(
data: pd.DataFrame,
goalie: pd.Series,
ax: plt.axes,
ax_title: str | None = None,
legend_label: str | None = None,
x_label: bool = False,
y_label: bool = False,
):
"""Function to plot a seaborn line chart of cumulative time-on-ice and goals scored above expected.
Parameters:
data (pd.DataFrame):
Pandas dataframe of game-level goalie data to plot
goalie (pd.Series):
Row of data from season-level goalie data
ax (plt.axes):
The matplotlib axes to return after plotting the chart
ax_title (str | None):
Customize ax title, or, if None, use the goalie's name
x_label (bool):
Whether to print or hide the x-axis label
y_label (bool):
Whether to print or hide the y-axis label
"""
sns.despine(right=False, top=False, ax=ax)
df = data.copy()
min_size = df.fa_p60.min()
max_size = df.fa_p60.max()
df.fa_p60.mean()
size_norm = (min_size, max_size)
sizes = (10, 500)
alpha = 0.65
line_width = 1.3
colors = NHL_COLORS[goalie.team]
conds = df.eh_id != goalie.eh_id
sns.scatterplot(
x="hours_since",
y="gsax_p60",
data=df[conds],
color=colors["MISS"],
size="fa_p60",
size_norm=size_norm,
sizes=sizes,
alpha=alpha,
edgecolor="white",
linewidth=line_width,
legend="full",
ax=ax,
)
color_palette = {0: colors["SHOT"], 1: colors["GOAL"]}
for result, color in color_palette.items():
conds = df.eh_id == goalie.eh_id
edge_color = "white" if result == 0 else colors["SHOT"]
sns.scatterplot(
x="hours_since",
y="gsax_p60",
data=df[conds],
hue="win",
palette=color_palette,
size="fa_p60",
size_norm=size_norm,
sizes=sizes,
alpha=alpha,
edgecolor=edge_color,
linewidth=line_width,
legend=False,
ax=ax,
)
legend_elements = []
legend_element_labels = ["Win", "Loss", "Other goalies"]
for label in legend_element_labels:
if label == "Win":
color = colors["GOAL"]
edge_color = colors["SHOT"]
if label == "Loss":
color = colors["SHOT"]
edge_color = "white"
if label == "Other goalies":
color = colors["MISS"]
edge_color = "white"
element = Line2D(
[0], [0], lw=0, label=label, markersize=14, marker="o", color=color, mec=edge_color, alpha=alpha
)
legend_elements.append(element)
legend = ax.legend(
handles=legend_elements,
loc="upper left",
ncol=1,
fontsize=12,
title_fontsize=16,
facecolor="white",
framealpha=1,
edgecolor="gray",
)
ax.add_artist(legend).set_zorder(-1)
if not ax_title and ax_title != "":
ax_title = goalie.player
if ax_title:
ax.set_title(ax_title, size=18, weight="heavy", pad=15)
if x_label:
ax.set_xlabel("Hours since last game", size=18, labelpad=15, weight="heavy")
else:
ax.set_xlabel("", size=18, weight="heavy")
ax.xaxis.set_tick_params(which="both", labelbottom=True)
if y_label:
ax.set_ylabel("GSaX / 60", size=18, labelpad=15, weight="heavy")
else:
ax.set_ylabel("")
ax.yaxis.set_tick_params(which="both", labelleft=True)
ax.xaxis.set_tick_params(labelsize=14)
ax.yaxis.set_tick_params(labelsize=14)
ax.yaxis.set_major_locator(ticker.MultipleLocator(2))
ax.yaxis.set_minor_locator(ticker.MultipleLocator(1))
return ax
Data
Prepping the data for plotting the next two charts
Plotting Juuse Saros
You can also change to plot to whichever goalie you prefer
goalie = "JUUSE.SAROS"
strengths = ["5v5"]
fig_size = (8, 8)
fig, ax = plt.subplots(figsize=fig_size, dpi=650)
fig.tight_layout()
sns.despine(right=False, top=False)
goalie_df = goalies_season.loc[
np.logical_and(goalies_season.strength_state.isin(strengths), goalies_season.eh_id == goalie)
]
for idx, goalie in goalie_df.iterrows():
ax = plot_hours_since(data=hours_since_data, goalie=goalie, ax=ax, ax_title="", x_label=True, y_label=True)
title = "Saros's worst games after long breaks"
fig.suptitle(title, ha="center", va="center", y=1.045, size=16, weight="heavy")
subtitle = f"GSaX / 60 & hours since last game (bubbles sized for FA / 60) | 2024-25 season, as of {latest_date}"
fig.text(s=subtitle, ha="center", va="center", x=0.5, y=1.015, size=12)
attribution = "Data & xG model @chickenandstats | Viz @chickenandstats"
fig.text(s=attribution, ha="right", va="center", y=-0.1, x=0.95, size=10, style="italic")
fig.savefig("./charts/saros_gsax_hours_since.png", dpi=650, bbox_inches="tight", facecolor="white")
Elite goalies
Plotting performance and hours since last game for top-6 goalies
## setting figure size
fig_size = (15, 15)
fig, axes = plt.subplots(3, 2, figsize=fig_size, dpi=650)
fig.tight_layout(pad=5)
axes = axes.reshape(-1)
for idx, top_goalie in top_goalies.iterrows():
ax = axes[idx]
x_label = idx >= 4
y_label = idx in [0, 2, 4]
ax = plot_hours_since(data=hours_since_data, goalie=top_goalie, ax=ax, x_label=x_label, y_label=y_label)
title = "Top-6 goaltenders by cumulative goals saved above expected"
fig.suptitle(title, ha="center", va="center", y=1.027, size=24, weight="heavy")
subtitle = f"GSaX / 60 and hours since last game (bubbles sized for FA / 60), 5v5 | 2024-25 season, as of {latest_date}"
fig.text(s=subtitle, ha="center", va="center", x=0.5, y=1.001, size=18)
# Attribution
attribution = "Data & xG model @chickenandstats.com | Viz @chickenandstats.com"
fig.text(s=attribution, x=0.99, y=-0.0125, fontsize=12, horizontalalignment="right", style="italic")
fig.savefig("./charts/top_6_gsax_hours_since.png", dpi=650, bbox_inches="tight", facecolor="white")