Exercises
Practice your soccer analytics skills with hands-on exercises. Each exercise includes hints and full solutions in both R and Python.
Exercise 1.1: Load and Explore StatsBomb Data
Using StatsBomb open data, load the 2022 World Cup matches and explore the data structure. Answer: How many matches are in the dataset? What columns are available in the events data?
Hint 1: Use sb.matches() or FreeMatches() to get the matches list.
Hint 2: Filter for competition_id=43 and season_id=106 for World Cup 2022.
from statsbombpy import sb

# World Cup 2022 is competition 43 / season 106 in the StatsBomb open data.
matches = sb.matches(competition_id=43, season_id=106)

# Report the size and schema of the matches table.
match_count = matches.shape[0]
match_columns = list(matches.columns)
print(f'Total matches: {match_count}')
print(f'Columns: {match_columns}')

# Pull the event stream for the first listed match and show its schema.
first_match_id = matches.iloc[0]['match_id']
events = sb.events(match_id=first_match_id)
event_columns = list(events.columns)
print(f'Event columns: {event_columns}')
library(StatsBombR)
library(tidyverse)  # attaches dplyr, which supplies %>% and filter() used below

# Get World Cup 2022 matches (competition 43, season 106)
comp <- FreeCompetitions() %>%
  filter(competition_id == 43, season_id == 106)
matches <- FreeMatches(comp)
print(paste('Total matches:', nrow(matches)))
print(names(matches))

# Get events from one match (the first row of the matches table)
events <- get.matchFree(matches[1, ])
print(names(events))
Exercise 1.2: Calculate Team Shot Statistics
For any World Cup 2022 match, calculate the total shots, shots on target, and goals for each team.
Hint 1: Filter events where type == "Shot" or type.name == "Shot"
Hint 2: Use shot_outcome or shot.outcome.name to check if a shot was a goal.
from statsbombpy import sb

# Get Argentina vs France final
events = sb.events(match_id=3869685)

# Filter shots. Period 5 is the penalty shootout: its kicks are recorded as
# 'Shot' events but are not match shots or match goals, so exclude them.
shots = events[(events['type'] == 'Shot') & (events['period'] < 5)]

# Outcomes that count as on target: goals and goalkeeper saves, including
# saves that were pushed onto the post.
ON_TARGET_OUTCOMES = ['Goal', 'Saved', 'Saved to Post']

# Group by team. Own goals are not 'Shot' events, so `goals` only counts
# goals scored from a team's own shots.
stats = shots.groupby('team').agg(
    total_shots=('type', 'count'),
    on_target=('shot_outcome', lambda x: x.isin(ON_TARGET_OUTCOMES).sum()),
    goals=('shot_outcome', lambda x: (x == 'Goal').sum())
)
print(stats)
library(StatsBombR)
library(tidyverse)

# Get Argentina vs France final
events <- get.matchFree(data.frame(match_id = 3869685))

# Outcomes that count as on target: goals and goalkeeper saves, including
# saves that were pushed onto the post.
on_target_outcomes <- c('Goal', 'Saved', 'Saved to Post')

# Filter shots and calculate stats. Period 5 is the penalty shootout: its
# kicks are recorded as 'Shot' events but are not match shots or goals.
shots <- events %>%
  filter(type.name == 'Shot', period < 5) %>%
  group_by(team.name) %>%
  summarise(
    total_shots = n(),
    on_target = sum(shot.outcome.name %in% on_target_outcomes),
    goals = sum(shot.outcome.name == 'Goal', na.rm = TRUE)
  )
print(shots)
Exercise 6.1: xG Comparison Analysis
Using StatsBomb open data for the 2018 World Cup: 1) Calculate total xG for each team in the tournament 2) Compare xG to actual goals scored 3) Identify teams that overperformed/underperformed xG 4) Create a scatter plot of xG vs Goals.
Hint 1: First get all matches for the 2018 World Cup, then get all events.
Hint 2: Aggregate shots and xG by team across all matches.
from statsbombpy import sb
import pandas as pd
import matplotlib.pyplot as plt

# Get all World Cup 2018 matches
matches = sb.matches(competition_id=43, season_id=3)

# Get all events (one request per match, stacked into a single frame)
all_events = pd.concat(
    [sb.events(match_id=m) for m in matches['match_id']],
    ignore_index=True
)

# Filter shots and calculate xG by team. Period 5 is the penalty shootout;
# shootout kicks are 'Shot' events with an xG value, so leaving them in would
# inflate both xG and goals for every team that went to penalties.
shots = all_events[(all_events['type'] == 'Shot') & (all_events['period'] < 5)]
xg_summary = shots.groupby('team').agg(
    total_xg=('shot_statsbomb_xg', 'sum'),
    goals=('shot_outcome', lambda x: (x == 'Goal').sum())
).reset_index()

# Positive difference = scored more than expected (overperformed),
# negative = scored fewer than expected (underperformed).
xg_summary['difference'] = xg_summary['goals'] - xg_summary['total_xg']
xg_summary = xg_summary.sort_values('difference', ascending=False)
print(xg_summary)

# Plot. The reference line spans the observed data range instead of a
# hard-coded limit, so it stays correct for other tournaments.
axis_max = max(xg_summary['total_xg'].max(), xg_summary['goals'].max()) + 1
plt.figure(figsize=(10, 8))
plt.scatter(xg_summary['total_xg'], xg_summary['goals'])
plt.plot([0, axis_max], [0, axis_max], 'r--', label='xG = Goals')
plt.xlabel('Expected Goals (xG)')
plt.ylabel('Actual Goals')
plt.title('World Cup 2018: xG vs Goals')
plt.legend()
plt.show()
library(StatsBombR)
library(tidyverse)

# Get all World Cup 2018 matches and events
comp <- FreeCompetitions() %>%
  filter(competition_id == 43, season_id == 3)
matches <- FreeMatches(comp)
events <- free_allevents(MatchesDF = matches)

# Calculate xG summary. Period 5 is the penalty shootout; shootout kicks are
# 'Shot' events with an xG value, so leaving them in would inflate both xG
# and goals for every team that went to penalties.
xg_summary <- events %>%
  filter(type.name == 'Shot', period < 5) %>%
  group_by(team.name) %>%
  summarise(
    total_xg = sum(shot.statsbomb_xg, na.rm = TRUE),
    goals = sum(shot.outcome.name == 'Goal', na.rm = TRUE)
  ) %>%
  # Positive difference = overperformed xG, negative = underperformed.
  mutate(difference = goals - total_xg) %>%
  arrange(desc(difference))
print(xg_summary)

# Plot
ggplot(xg_summary, aes(x = total_xg, y = goals)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'red') +
  labs(x = 'Expected Goals (xG)', y = 'Actual Goals',
       title = 'World Cup 2018: xG vs Goals') +
  theme_minimal()
Exercise 4.1: Create a Shot Map
Create a shot map visualization for any World Cup match showing all shots on a football pitch, with size indicating xG and color indicating outcome (goal vs. non-goal).
Hint 1: Use mplsoccer Pitch() or ggsoccer to draw the pitch.
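Hint 2: Shot locations are in the location column as an [x, y] pair (Python). In R, location.x and location.y are only available after running cleanlocations() or allclean() on the events.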
from mplsoccer import Pitch, VerticalPitch
from statsbombpy import sb
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Get match events
events = sb.events(match_id=3869685)

# Keep open-play and set-piece shots only. Period 5 is the penalty shootout;
# those kicks are 'Shot' events and would otherwise pile up on the penalty spot.
shots = events[(events['type'] == 'Shot') & (events['period'] < 5)].copy()

# Extract coordinates from the [x, y] location pair
shots['x'] = shots['location'].apply(lambda loc: loc[0])
shots['y'] = shots['location'].apply(lambda loc: loc[1])
shots['is_goal'] = shots['shot_outcome'] == 'Goal'

# Create pitch (attacking half only, drawn vertically)
pitch = VerticalPitch(half=True, pitch_type='statsbomb')
fig, ax = pitch.draw(figsize=(10, 8))

# Plot shots. Colour encodes the outcome, so teams are told apart by marker
# shape; size encodes xG.
team_markers = ['o', 's']
legend_handles = []
for marker, team in zip(team_markers, shots['team'].unique()):
    team_shots = shots[shots['team'] == team]
    pitch.scatter(
        team_shots['x'], team_shots['y'],
        s=team_shots['shot_statsbomb_xg'] * 500,
        c=['green' if g else 'red' for g in team_shots['is_goal']],
        marker=marker, edgecolors='black',
        alpha=0.6, ax=ax
    )
    # Neutral-coloured proxy so the legend entry shows only the team's marker.
    legend_handles.append(
        Line2D([], [], linestyle='', marker=marker, color='gray',
               markeredgecolor='black', markersize=10, label=team)
    )

# Proxy entries explaining the colour coding.
legend_handles.append(
    Line2D([], [], linestyle='', marker='o', color='green',
           markersize=10, label='Goal')
)
legend_handles.append(
    Line2D([], [], linestyle='', marker='o', color='red',
           markersize=10, label='No Goal')
)

plt.title('Argentina vs France - Shot Map')
ax.legend(handles=legend_handles)
plt.show()
library(StatsBombR)
library(ggsoccer)
library(tidyverse)

# Get match events. Raw events store coordinates in a list column;
# cleanlocations() expands it into numeric location.x / location.y.
events <- get.matchFree(data.frame(match_id = 3869685)) %>%
  cleanlocations()

# Period 5 is the penalty shootout; those kicks are 'Shot' events and would
# otherwise pile up on the penalty spot, so they are excluded.
shots <- events %>%
  filter(type.name == 'Shot', period < 5) %>%
  mutate(
    is_goal = shot.outcome.name == 'Goal',
    xg = shot.statsbomb_xg
  )

# Create shot map
ggplot(shots) +
  # StatsBomb coordinates are 120 x 80; annotate_pitch() defaults to Opta's
  # 100 x 100, so the dimensions must be given explicitly.
  annotate_pitch(dimensions = pitch_statsbomb,
                 colour = 'white', fill = '#538032') +
  geom_point(
    aes(x = location.x, y = location.y,
        size = xg, color = is_goal),
    alpha = 0.7
  ) +
  # Named values and explicit breaks keep colours and labels correct even
  # when only one outcome is present in the data.
  scale_color_manual(values = c('FALSE' = 'red', 'TRUE' = 'green'),
                     breaks = c('FALSE', 'TRUE'),
                     labels = c('No Goal', 'Goal')) +
  # Flip so the attack runs upward and show only the attacking half.
  coord_flip(xlim = c(60, 120)) +
  theme_pitch() +
  labs(title = 'Argentina vs France - Shot Map',
       size = 'xG', color = 'Outcome')
Exercise 9.1: Build a Pass Network
Create a pass network visualization showing the average positions of players and the passing connections between them for a team in a specific match.
Hint 1: Calculate average x,y position for each player from their pass start locations.
Hint 2: Count passes between each pair of players to determine edge weights.
from mplsoccer import Pitch
from statsbombpy import sb
import pandas as pd
import matplotlib.pyplot as plt

# Get events and keep Argentina's completed passes
# (a missing pass_outcome means the pass was completed).
events = sb.events(match_id=3869685)
arg_passes = events[(events['team'] == 'Argentina') &
                    (events['type'] == 'Pass') &
                    (events['pass_outcome'].isna())].copy()

# Calculate average positions from pass start locations
arg_passes['x'] = arg_passes['location'].apply(lambda loc: loc[0])
arg_passes['y'] = arg_passes['location'].apply(lambda loc: loc[1])
avg_pos = arg_passes.groupby('player').agg(
    x=('x', 'mean'),
    y=('y', 'mean'),
    passes=('type', 'count')
).reset_index()

# Count pass combinations
arg_passes['recipient'] = arg_passes['pass_recipient']
pass_pairs = arg_passes.groupby(['player', 'recipient']).size().reset_index(name='count')
pass_pairs = pass_pairs[pass_pairs['count'] >= 3]  # Min 3 passes

# Plot
pitch = Pitch(pitch_type='statsbomb')
fig, ax = pitch.draw(figsize=(12, 8))

# Draw edges. Both endpoints are looked up by player name in avg_pos;
# avg_pos has no 'recipient' column, so the recipient must be matched
# against 'player' as well.
positions = avg_pos.set_index('player')[['x', 'y']]
for _, row in pass_pairs.iterrows():
    passer, receiver = row['player'], row['recipient']
    # A recipient with no completed passes of their own has no average
    # position, so that edge cannot be drawn.
    if passer not in positions.index or receiver not in positions.index:
        continue
    start = positions.loc[passer]
    end = positions.loc[receiver]
    ax.plot([start['x'], end['x']],
            [start['y'], end['y']],
            'gray',
            # matplotlib rejects alpha > 1, which count/20 would produce
            # for pairs with more than 20 passes.
            alpha=min(1.0, row['count'] / 20),
            linewidth=row['count'] / 2)

# Draw nodes, sized by number of completed passes
ax.scatter(avg_pos['x'], avg_pos['y'], s=avg_pos['passes'] * 3,
           c='blue', zorder=10)
plt.title('Argentina Pass Network')
plt.show()
library(StatsBombR)
library(ggsoccer)
library(tidyverse)

# Get events. Raw events store coordinates in a list column;
# cleanlocations() expands it into numeric location.x / location.y.
events <- get.matchFree(data.frame(match_id = 3869685)) %>%
  cleanlocations()

# Argentina's completed passes (a missing pass outcome means completed)
arg_passes <- events %>%
  filter(team.name == 'Argentina', type.name == 'Pass',
         is.na(pass.outcome.name))

# Calculate average positions from pass start locations
avg_pos <- arg_passes %>%
  group_by(player.name) %>%
  summarise(
    x = mean(location.x, na.rm = TRUE),
    y = mean(location.y, na.rm = TRUE),
    passes = n()
  )

# Count pass combinations (minimum 3 passes per pair)
pass_pairs <- arg_passes %>%
  filter(!is.na(pass.recipient.name)) %>%
  group_by(player.name, pass.recipient.name) %>%
  # .groups = 'drop' returns an ungrouped table instead of one still
  # grouped by player.name.
  summarise(count = n(), .groups = 'drop') %>%
  filter(count >= 3)

# Attach start (passer) and end (recipient) coordinates to each edge.
# inner_join drops edges whose recipient has no average position.
edges <- pass_pairs %>%
  inner_join(avg_pos, by = 'player.name') %>%
  inner_join(avg_pos, by = c('pass.recipient.name' = 'player.name'),
             suffix = c('', '_end'))

# Create pass network plot
ggplot() +
  # StatsBomb coordinates are 120 x 80; annotate_pitch() defaults to Opta's
  # 100 x 100, so the dimensions must be given explicitly.
  annotate_pitch(dimensions = pitch_statsbomb) +
  geom_segment(
    data = edges,
    aes(x = x, y = y, xend = x_end, yend = y_end,
        alpha = count, linewidth = count),
    color = 'gray'
  ) +
  geom_point(
    data = avg_pos,
    aes(x = x, y = y, size = passes),
    color = 'blue'
  ) +
  # StatsBomb's origin is the top-left corner, so y must be reversed to
  # avoid drawing the pitch mirrored.
  scale_y_reverse() +
  theme_pitch() +
  labs(title = 'Argentina Pass Network')