Exercises

Practice your soccer analytics skills with hands-on exercises. Each exercise includes hints and full solutions in both R and Python.

Exercise 1.1: Load and Explore StatsBomb Data

Easy

Using StatsBomb open data, load the 2022 World Cup matches and explore the data structure. Answer: How many matches are in the dataset? What columns are available in the events data?

Show Hints

Hint 1: Use sb.matches() (Python, statsbombpy) or FreeMatches() (R, StatsBombR) to get the matches list.

Hint 2: Filter for competition_id=43 and season_id=106 for World Cup 2022.

Show Solution
solution
# Python solution
from statsbombpy import sb

# StatsBomb open-data identifiers used for the 2022 World Cup in this exercise.
WORLD_CUP_ID = 43
SEASON_2022_ID = 106

# Load the match list and report its size and columns.
matches = sb.matches(competition_id=WORLD_CUP_ID, season_id=SEASON_2022_ID)
print(f'Total matches: {len(matches)}')
print(f'Columns: {matches.columns.tolist()}')

# Load the events of the first listed match and report the available columns.
first_match_id = matches['match_id'].iloc[0]
events = sb.events(match_id=first_match_id)
print(f'Event columns: {events.columns.tolist()}')
# R solution
library(StatsBombR)
library(tidyverse)  # provides %>% and filter(), as in the other R solutions

# Get World Cup 2022 matches
comp <- FreeCompetitions() %>%
  filter(competition_id == 43, season_id == 106)
matches <- FreeMatches(comp)
print(paste('Total matches:', nrow(matches)))
print(names(matches))

# Get events from the first listed match and show the available columns
events <- get.matchFree(matches[1, ])
print(names(events))

Exercise 1.2: Calculate Team Shot Statistics

Easy

For any World Cup 2022 match, calculate the total shots, shots on target, and goals for each team.

Show Hints

Hint 1: Filter events where type == "Shot" (Python) or type.name == "Shot" (R).

Hint 2: Use shot_outcome (Python) or shot.outcome.name (R) to check whether a shot was a goal.

Show Solution
solution
# Python solution
from statsbombpy import sb

# Get Argentina vs France final
events = sb.events(match_id=3869685)

# Shot outcomes that count as "on target": goals and keeper saves,
# including saves that end up on the goal frame.
ON_TARGET_OUTCOMES = ['Goal', 'Saved', 'Saved to Post']

# Keep shots from regular and extra time only. Period 5 is the penalty
# shoot-out; its kicks are recorded as shots but are not part of the match
# shot/goal totals, and this final was decided on penalties.
shots = events[(events['type'] == 'Shot') & (events['period'] < 5)]

# One row per team: shot volume, shots on target and goals
stats = shots.groupby('team').agg(
    total_shots=('type', 'count'),
    on_target=('shot_outcome', lambda x: x.isin(ON_TARGET_OUTCOMES).sum()),
    goals=('shot_outcome', lambda x: (x == 'Goal').sum())
)
print(stats)
# R solution
library(StatsBombR)
library(tidyverse)

# Get Argentina vs France final
events <- get.matchFree(data.frame(match_id = 3869685))

# Shot outcomes that count as "on target": goals and keeper saves,
# including saves that end up on the goal frame.
on_target_outcomes <- c('Goal', 'Saved', 'Saved to Post')

# Filter shots and calculate stats per team.
# period < 5 drops the penalty shoot-out (period 5): its kicks are recorded
# as shots but are not part of the match totals, and this final went to
# penalties.
shots <- events %>%
  filter(type.name == 'Shot', period < 5) %>%
  group_by(team.name) %>%
  summarise(
    total_shots = n(),
    on_target = sum(shot.outcome.name %in% on_target_outcomes),
    goals = sum(shot.outcome.name == 'Goal')
  )
print(shots)

Exercise 6.1: xG Comparison Analysis

Medium

Using StatsBomb open data for the 2018 World Cup: 1) Calculate total xG for each team in the tournament 2) Compare xG to actual goals scored 3) Identify teams that overperformed/underperformed xG 4) Create a scatter plot of xG vs Goals.

Show Hints

Hint 1: First get all matches for the 2018 World Cup, then get all events.

Hint 2: Aggregate shots and xG by team across all matches.

Show Solution
solution
# Python solution
from statsbombpy import sb
import pandas as pd
import matplotlib.pyplot as plt

# Get all World Cup 2018 matches
matches = sb.matches(competition_id=43, season_id=3)

# Get all events (one download per match) and stack them into one frame
all_events = pd.concat(
    [sb.events(match_id=m) for m in matches['match_id']],
    ignore_index=True
)

# Filter shots and calculate xG by team.
# Period 5 is the penalty shoot-out; excluding it keeps shoot-out kicks from
# inflating both the goal and the xG totals of teams whose knockout games
# went to penalties.
is_match_shot = (all_events['type'] == 'Shot') & (all_events['period'] < 5)
shots = all_events[is_match_shot]
xg_summary = shots.groupby('team').agg(
    total_xg=('shot_statsbomb_xg', 'sum'),
    goals=('shot_outcome', lambda x: (x == 'Goal').sum())
).reset_index()

# Positive difference = scored more than expected (overperformed xG)
xg_summary['difference'] = xg_summary['goals'] - xg_summary['total_xg']
print(xg_summary.sort_values('difference', ascending=False))

# Plot; the reference line spans the data instead of a fixed 0-15 range
axis_max = max(xg_summary['total_xg'].max(), xg_summary['goals'].max()) + 1
plt.figure(figsize=(10, 8))
plt.scatter(xg_summary['total_xg'], xg_summary['goals'])
plt.plot([0, axis_max], [0, axis_max], 'r--', label='xG = Goals')
plt.xlabel('Expected Goals (xG)')
plt.ylabel('Actual Goals')
plt.title('World Cup 2018: xG vs Goals')
plt.legend()
plt.show()
# R solution
library(StatsBombR)
library(tidyverse)

# Get all World Cup 2018 matches and events
comp <- FreeCompetitions() %>%
  filter(competition_id == 43, season_id == 3)
matches <- FreeMatches(comp)
events <- free_allevents(MatchesDF = matches)

# Calculate xG summary per team.
# period < 5 drops penalty shoot-out kicks (period 5), which would otherwise
# inflate goals and xG for teams whose knockout games went to penalties.
xg_summary <- events %>%
  filter(type.name == 'Shot', period < 5) %>%
  group_by(team.name) %>%
  summarise(
    total_xg = sum(shot.statsbomb_xg, na.rm = TRUE),
    goals = sum(shot.outcome.name == 'Goal')
  ) %>%
  mutate(difference = goals - total_xg) %>%
  arrange(desc(difference))

# Positive difference = scored more than expected (overperformed xG)
print(xg_summary)

# Plot: points above the dashed line overperformed, points below underperformed
ggplot(xg_summary, aes(x = total_xg, y = goals)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'red') +
  labs(x = 'Expected Goals (xG)', y = 'Actual Goals',
       title = 'World Cup 2018: xG vs Goals') +
  theme_minimal()

Exercise 4.1: Create a Shot Map

Medium

Create a shot map visualization for any World Cup match showing all shots on a football pitch, with size indicating xG and color indicating outcome (goal vs. non-goal).

Show Hints

Hint 1: Use mplsoccer Pitch() or ggsoccer to draw the pitch.

Hint 2: Shot locations are in the location column as [x, y] pairs (Python). In R, run cleanlocations() or allclean() on the events first to get the location.x and location.y columns.

Show Solution
solution
# Python solution
from mplsoccer import Pitch, VerticalPitch
from statsbombpy import sb
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Get match events
events = sb.events(match_id=3869685)

# Keep shots from regular and extra time; period 5 is the penalty shoot-out,
# whose kicks would otherwise pile up on the penalty spot.
is_match_shot = (events['type'] == 'Shot') & (events['period'] < 5)
shots = events[is_match_shot].copy()

# Extract coordinates from the [x, y] location lists
shots['x'] = shots['location'].apply(lambda loc: loc[0])
shots['y'] = shots['location'].apply(lambda loc: loc[1])
shots['is_goal'] = shots['shot_outcome'] == 'Goal'

# Create pitch (attacking half only, StatsBomb coordinates)
pitch = VerticalPitch(half=True, pitch_type='statsbomb')
fig, ax = pitch.draw(figsize=(10, 8))

# Plot shots: marker shape = team, colour = outcome, size = xG
team_markers = dict(zip(shots['team'].unique(), ['o', 's']))
for team, marker in team_markers.items():
    team_shots = shots[shots['team'] == team]
    pitch.scatter(
        team_shots['x'], team_shots['y'],
        s=team_shots['shot_statsbomb_xg'] * 500,
        c=['green' if g else 'red' for g in team_shots['is_goal']],
        marker=marker, alpha=0.6, ax=ax
    )

# Explicit legend entries: a per-scatter label cannot describe both the team
# (marker shape) and the outcome (colour) encodings.
legend_handles = [
    Line2D([], [], linestyle='', marker=marker, color='gray', label=team)
    for team, marker in team_markers.items()
] + [
    Line2D([], [], linestyle='', marker='o', color='green', label='Goal'),
    Line2D([], [], linestyle='', marker='o', color='red', label='No goal'),
]
ax.legend(handles=legend_handles, loc='lower left')

ax.set_title('Argentina vs France - Shot Map')
plt.show()
# R solution
library(StatsBombR)
library(ggsoccer)
library(tidyverse)

# Get match events. cleanlocations() unpacks the raw `location` list column
# into the numeric location.x / location.y columns used below.
events <- get.matchFree(data.frame(match_id = 3869685)) %>%
  cleanlocations()

# Shots from regular and extra time only (period 5 is the penalty shoot-out)
shots <- events %>%
  filter(type.name == 'Shot', period < 5) %>%
  mutate(
    is_goal = shot.outcome.name == 'Goal',
    outcome = if_else(is_goal, 'Goal', 'No Goal'),
    xg = shot.statsbomb_xg
  )

# Create shot map.
# dimensions = pitch_statsbomb draws the pitch in StatsBomb's 120 x 80
# coordinates; ggsoccer's default is Opta's 100 x 100 system, which would
# not line up with the shot locations.
ggplot(shots) +
  annotate_pitch(dimensions = pitch_statsbomb,
                 colour = 'white', fill = '#538032') +
  geom_point(
    aes(x = location.x, y = location.y,
        size = xg, color = outcome),
    alpha = 0.7
  ) +
  # Named values keep the colours correct even if one outcome is absent
  scale_color_manual(values = c('No Goal' = 'red', 'Goal' = 'green')) +
  coord_flip(xlim = c(60, 120)) +
  theme_pitch() +
  labs(title = 'Argentina vs France - Shot Map',
       size = 'xG', color = 'Outcome')

Exercise 9.1: Build a Pass Network

Hard

Create a pass network visualization showing the average positions of players and the passing connections between them for a team in a specific match.

Show Hints

Hint 1: Calculate average x,y position for each player from their pass start locations.

Hint 2: Count passes between each pair of players to determine edge weights.

Show Solution
solution
# Python solution
from mplsoccer import Pitch
from statsbombpy import sb
import pandas as pd
import matplotlib.pyplot as plt

# Get events
events = sb.events(match_id=3869685)

# Argentina's passes with no recorded outcome (a missing pass_outcome is how
# the filter below selects completed passes)
arg_passes = events[(events['team'] == 'Argentina') &
                    (events['type'] == 'Pass') &
                    (events['pass_outcome'].isna())].copy()

# Calculate average positions from pass start locations
arg_passes['x'] = arg_passes['location'].apply(lambda loc: loc[0])
arg_passes['y'] = arg_passes['location'].apply(lambda loc: loc[1])

avg_pos = arg_passes.groupby('player').agg(
    x=('x', 'mean'),
    y=('y', 'mean'),
    passes=('type', 'count')
).reset_index()

# Count pass combinations (passer -> recipient)
arg_passes['recipient'] = arg_passes['pass_recipient']
pass_pairs = arg_passes.groupby(['player', 'recipient']).size().reset_index(name='count')
pass_pairs = pass_pairs[pass_pairs['count'] >= 3]  # Min 3 passes

# Plot
pitch = Pitch(pitch_type='statsbomb')
fig, ax = pitch.draw(figsize=(12, 8))

# Draw edges. Both ends are looked up by player name in avg_pos, which has
# no 'recipient' column.
positions = avg_pos.set_index('player')[['x', 'y']]
for _, row in pass_pairs.iterrows():
    passer, recipient = row['player'], row['recipient']
    # A recipient who never completed a pass has no average position
    if passer not in positions.index or recipient not in positions.index:
        continue
    start = positions.loc[passer]
    end = positions.loc[recipient]
    ax.plot([start['x'], end['x']],
            [start['y'], end['y']],
            color='gray',
            alpha=min(row['count'] / 20, 1.0),  # alpha must stay within [0, 1]
            linewidth=row['count'] / 2)

# Draw nodes, sized by number of completed passes
ax.scatter(avg_pos['x'], avg_pos['y'], s=avg_pos['passes']*3,
           c='blue', zorder=10)

plt.title('Argentina Pass Network')
plt.show()
# R solution
library(StatsBombR)
library(ggsoccer)
library(tidyverse)

# Get events. cleanlocations() unpacks the raw `location` list column into
# the numeric location.x / location.y columns used below.
events <- get.matchFree(data.frame(match_id = 3869685)) %>%
  cleanlocations()

# Argentina's passes with no recorded outcome (a missing pass.outcome.name is
# how the filter below selects completed passes)
arg_passes <- events %>%
  filter(team.name == 'Argentina', type.name == 'Pass',
         is.na(pass.outcome.name))

# Calculate average positions from pass start locations
avg_pos <- arg_passes %>%
  group_by(player.name) %>%
  summarise(
    x = mean(location.x),
    y = mean(location.y),
    passes = n()
  )

# Count pass combinations (passer -> recipient), keeping pairs with >= 3 passes
pass_pairs <- arg_passes %>%
  group_by(player.name, pass.recipient.name) %>%
  summarise(count = n(), .groups = 'drop') %>%
  filter(count >= 3)

# Attach start and end coordinates to every edge. inner_join() drops pairs
# whose recipient has no average position (no completed pass of their own),
# which would otherwise produce segments with missing end points.
edges <- pass_pairs %>%
  inner_join(avg_pos, by = c('player.name' = 'player.name')) %>%
  inner_join(avg_pos, by = c('pass.recipient.name' = 'player.name'),
             suffix = c('', '_end'))

# Create pass network plot.
# dimensions = pitch_statsbomb draws the pitch in StatsBomb's 120 x 80
# coordinates (ggsoccer defaults to Opta's 100 x 100); scale_y_reverse()
# accounts for StatsBomb's origin being the top-left corner.
ggplot() +
  annotate_pitch(dimensions = pitch_statsbomb) +
  geom_segment(
    data = edges,
    aes(x = x, y = y, xend = x_end, yend = y_end,
        alpha = count, linewidth = count),
    color = 'gray'
  ) +
  geom_point(
    data = avg_pos,
    aes(x = x, y = y, size = passes),
    color = 'blue'
  ) +
  scale_y_reverse() +
  theme_pitch() +
  labs(title = 'Argentina Pass Network')