Chapter 60

Capstone - Complete Analytics System

Intermediate 30 min read 5 sections 10 code examples
0 of 60 chapters completed (0%)

The Hardest Position to Measure

Defensive analytics is football's greatest challenge. Unlike attacking metrics where we measure creation and conversion, defensive value often comes from what doesn't happen—the shot not taken, the pass not completed, the chance prevented.

A defender with zero tackles might be elite (never out of position) or terrible (ball watching). Context is everything. This chapter explores how to properly evaluate defensive contributions.

The Defensive Analytics Challenge
  • Selection bias: More tackles might mean worse positioning
  • Team context: High-line teams face different challenges
  • Off-ball value: Positioning and communication are invisible in data
  • Attribution: Who prevented the goal—the blocker or the one who forced the bad shot?

Basic Defensive Actions

Start with the fundamentals: tackles, interceptions, blocks, and clearances.

# Calculate basic defensive statistics
from statsbombpy import sb
import pandas as pd

# Load data: every match of competition 43 / season 106
matches = sb.matches(competition_id=43, season_id=106)
all_events = pd.concat([sb.events(mid) for mid in matches["match_id"]])

# Filter defensive events.
# StatsBomb has no "Tackle" event type: a tackle is a "Duel" event whose
# duel_type is "Tackle", so tackles are flagged from the duel rows.
defensive_types = ["Interception", "Clearance", "Block", "Duel"]
defense = all_events[all_events["type"].isin(defensive_types)].copy()
defense["is_tackle"] = (defense["type"] == "Duel") & (
    defense["duel_type"] == "Tackle"
)

# One boolean column per action so the aggregation is a plain sum
action_flags = defense.assign(
    is_interception=defense["type"] == "Interception",
    is_clearance=defense["type"] == "Clearance",
    is_block=defense["type"] == "Block",
    is_duel=defense["type"] == "Duel",
)

# Aggregate by player
def_stats = action_flags.groupby(["player", "team"]).agg(
    matches=("match_id", "nunique"),
    tackles=("is_tackle", "sum"),
    interceptions=("is_interception", "sum"),
    clearances=("is_clearance", "sum"),
    blocks=("is_block", "sum"),
    duels=("is_duel", "sum"),
).reset_index()

# "Per 90" metrics. NOTE: the denominator is matches with at least one
# defensive event, not minutes played / 90, so these are per-appearance rates.
def_stats["tackles_per_90"] = (
    def_stats["tackles"] / def_stats["matches"]).round(2)
def_stats["interceptions_per_90"] = (
    def_stats["interceptions"] / def_stats["matches"]).round(2)
def_stats["ball_winning"] = def_stats["tackles"] + def_stats["interceptions"]
def_stats["ball_winning_per_90"] = (
    def_stats["ball_winning"] / def_stats["matches"]).round(2)

# Drop small samples
def_stats = def_stats[def_stats["matches"] >= 3]

print("Defensive Action Leaders:")
print(def_stats.sort_values("ball_winning_per_90", ascending=False).head(15))
# Calculate basic defensive statistics
library(StatsBombR)
library(dplyr)

# Load data: every match of competition 43 / season 106
comps <- FreeCompetitions() %>%
  filter(competition_id == 43, season_id == 106)
matches <- FreeMatches(comps)
events <- free_allevents(MatchesDF = matches)

# Defensive actions.
# StatsBomb has no "Tackle" event type: a tackle is a "Duel" event whose
# duel.type.name is "Tackle", so tackles are flagged from the duel rows.
defensive_stats <- events %>%
  filter(type.name %in% c("Interception", "Clearance", "Block", "Duel")) %>%
  mutate(
    # %in% never returns NA, so rows without duel details are simply FALSE
    is_tackle = type.name == "Duel" & duel.type.name %in% "Tackle",
    is_won = duel.outcome.name %in% c("Won", "Success")
  ) %>%
  group_by(player.name, team.name) %>%
  summarise(
    matches = n_distinct(match_id),

    # Tackles
    tackles = sum(is_tackle),
    tackles_won = sum(is_tackle & is_won),

    # Interceptions
    interceptions = sum(type.name == "Interception"),

    # Clearances
    clearances = sum(type.name == "Clearance"),

    # Blocks
    blocks = sum(type.name == "Block"),
    # block.offensive is only populated when TRUE; comparing with FALSE
    # would give NA and turn the whole sum into NA
    shot_blocks = sum(type.name == "Block" & !(block.offensive %in% TRUE)),

    # Duels
    duels = sum(type.name == "Duel"),
    duels_won = sum(type.name == "Duel" & is_won),

    .groups = "drop"
  ) %>%
  mutate(
    # Rates (NA rather than NaN when the player has no attempts)
    tackle_success = if_else(tackles > 0,
                             round(tackles_won / tackles * 100, 1),
                             NA_real_),
    duel_success = if_else(duels > 0,
                           round(duels_won / duels * 100, 1),
                           NA_real_),

    # "Per 90": the denominator is matches with at least one defensive
    # event, not minutes played / 90, so these are per-appearance rates
    tackles_per_90 = round(tackles / matches, 2),
    interceptions_per_90 = round(interceptions / matches, 2),
    clearances_per_90 = round(clearances / matches, 2),

    # Ball-winning actions
    ball_winning = tackles_won + interceptions,
    ball_winning_per_90 = round(ball_winning / matches, 2)
  )

print("Defensive Action Leaders:")
print(defensive_stats %>%
        filter(matches >= 3) %>%
        arrange(desc(ball_winning_per_90)) %>%
        select(player.name, team.name, matches, tackles_won,
               interceptions, ball_winning_per_90, tackle_success) %>%
        head(15))
chapter10-basic-defense
Output
Calculating basic defensive statistics

Possession-Adjusted Defense

Raw defensive numbers are misleading without context. A team with 70% possession has fewer defensive opportunities than one with 30%.

Defensive Actions per Defensive Opportunity

# Possession-adjusted defensive metrics
# Pass counts per team per match are the possession proxy
match_poss = (
    all_events[all_events["type"] == "Pass"]
    .groupby(["match_id", "team"])
    .size()
    .unstack(fill_value=0)
)
match_poss["total"] = match_poss.sum(axis=1)

# Defensive actions per player per match.
# Tackles are "Duel" events with duel_type "Tackle"; there is no "Tackle"
# event type, so filtering on it would keep interceptions only.
is_tackle = (all_events["type"] == "Duel") & (all_events["duel_type"] == "Tackle")
is_interception = all_events["type"] == "Interception"
def_actions = (
    all_events[is_tackle | is_interception]
    .groupby(["player", "team", "match_id"])
    .size()
    .reset_index(name="def_actions")
)

# This would need proper possession joining - simplified version
print("Note: Full implementation requires match-level possession data")
print("Concept: Adjust defensive actions by opponent possession %")
print("More opponent possession = more chances to defend")
print("Adjustment: def_actions * (50 / opp_possession_pct)")
# Possession-adjusted defensive metrics
# Pass share per match is used as the possession proxy.

# First, get team possession by match
match_possession <- events %>%
  filter(type.name == "Pass") %>%
  group_by(match_id, team.name) %>%
  summarise(passes = n(), .groups = "drop") %>%
  group_by(match_id) %>%
  mutate(
    total_passes = sum(passes),
    possession_pct = passes / total_passes * 100,
    opp_possession_pct = 100 - possession_pct
  ) %>%
  ungroup()  # do not leak the match_id grouping into later steps

# Join to player defensive stats.
# Tackles are "Duel" events with duel.type.name "Tackle"; there is no
# "Tackle" event type.
player_match_defense <- events %>%
  filter(
    type.name == "Interception" |
      (type.name == "Duel" & duel.type.name %in% "Tackle")
  ) %>%
  group_by(player.name, team.name, match_id) %>%
  summarise(
    defensive_actions = n(),
    .groups = "drop"
  ) %>%
  left_join(match_possession, by = c("match_id", "team.name"))

# Possession-adjusted rate
# More opponent possession = more opportunities to defend
poss_adjusted <- player_match_defense %>%
  group_by(player.name, team.name) %>%
  summarise(
    # One row per player-match, so the row count is the match count
    matches = n(),
    total_def_actions = sum(defensive_actions),
    # na.rm guards against matches with no possession row after the join
    avg_opp_possession = mean(opp_possession_pct, na.rm = TRUE),

    # Raw rate
    def_actions_per_match = total_def_actions / matches,

    # Adjusted: normalize to 50% opponent possession baseline
    adj_def_actions = total_def_actions / (avg_opp_possession / 50),
    adj_def_per_match = adj_def_actions / matches,

    .groups = "drop"
  ) %>%
  filter(matches >= 3) %>%
  mutate(
    adjustment_factor = round(50 / avg_opp_possession, 2)
  )

print("Possession-Adjusted Defensive Actions:")
print(poss_adjusted %>%
        arrange(desc(adj_def_per_match)) %>%
        select(player.name, matches, def_actions_per_match,
               avg_opp_possession, adjustment_factor, adj_def_per_match) %>%
        head(15))
chapter10-poss-adjusted
Output
Possession-adjusted defensive metrics

PPDA - Passes Per Defensive Action

PPDA measures team pressing intensity—fewer passes allowed per defensive action means more aggressive pressing:

# Calculate PPDA
# PPDA = Opponent passes in their defensive 60% / Your defensive actions in that zone


def _x_coord(location):
    """Return the x coordinate of a location, or NaN when it is missing.

    Missing locations may be NaN (a truthy float) rather than None, so the
    value is checked by type instead of truthiness.
    """
    if isinstance(location, (list, tuple)) and len(location) > 0:
        return location[0]
    return float("nan")


def calculate_match_ppda(events, team):
    """Calculate PPDA for a team in a match.

    Args:
        events: event DataFrame for a single match.
        team: name of the pressing team.

    Returns:
        Opponent passes with x <= 72 divided by the team's tackles,
        interceptions and fouls with x >= 48, or None when the team made no
        such defensive action.
    """
    x = events["location"].apply(_x_coord)  # NaN compares False in both zones

    opp_passes = events[
        (events["team"] != team)
        & (events["type"] == "Pass")
        & (x <= 72)
    ]

    # Tackles are "Duel" events with duel_type "Tackle"; there is no
    # "Tackle" event type
    is_tackle = (events["type"] == "Duel") & (events["duel_type"] == "Tackle")
    is_def_action = is_tackle | events["type"].isin(
        ["Interception", "Foul Committed"])
    team_def = events[(events["team"] == team) & is_def_action & (x >= 48)]

    if len(team_def) == 0:
        return None
    return len(opp_passes) / len(team_def)


# Calculate for sample match
sample_match = sb.events(match_id=matches["match_id"].iloc[0])
teams = sample_match["team"].unique()

for team in teams:
    ppda = calculate_match_ppda(sample_match, team)
    # Compare with None explicitly: a PPDA of 0.0 is valid but falsy
    if ppda is not None:
        print(f"{team} PPDA: {ppda:.1f}")

print("\nPPDA Interpretation:")
print("< 8: Very high press (Liverpool, Man City)")
print("8-10: High press")
print("10-12: Medium press")
print("> 12: Low block, counter-attacking")
# Calculate PPDA (Passes Per Defensive Action)
# Lower PPDA = more intense pressing

# PPDA for one team in one match.
#   events_df: event data frame (may contain several matches)
#   team_name: pressing team
#   match_id:  match to evaluate
# Returns opponent passes with x <= 72 divided by the team's tackles,
# interceptions and fouls with x >= 48, or NA when the team has no such action.
calculate_ppda <- function(events_df, team_name, match_id) {
  # !! forces the function argument instead of the match_id column
  match_events <- events_df %>%
    filter(match_id == !!match_id)

  # Opponent passes in their defensive 60%
  opp_passes <- match_events %>%
    filter(team.name != team_name,
           type.name == "Pass",
           location.x <= 72)  # Opponent defensive 60%

  # Team defensive actions in opponent defensive 60%.
  # Tackles are "Duel" events with duel.type.name "Tackle"; there is no
  # "Tackle" event type.
  team_def_actions <- match_events %>%
    filter(team.name == team_name,
           type.name %in% c("Interception", "Foul Committed") |
             (type.name == "Duel" & duel.type.name %in% "Tackle"),
           location.x >= 48)  # Same zone, flipped

  if (nrow(team_def_actions) == 0) return(NA)

  nrow(opp_passes) / nrow(team_def_actions)
}

# Matches played per team across the tournament
team_ppda <- events %>%
  select(match_id, team.name) %>%
  distinct() %>%
  group_by(team.name) %>%
  summarise(
    matches = n_distinct(match_id)
  )

# Per match and team: own build-up passes and defensive actions by zone
match_team_counts <- events %>%
  filter(!is.na(team.name)) %>%
  mutate(is_tackle = type.name == "Duel" & duel.type.name %in% "Tackle") %>%
  group_by(match_id, team.name) %>%
  summarise(
    buildup_passes = sum(type.name == "Pass" & location.x <= 72,
                         na.rm = TRUE),
    ppda_def_actions = sum(
      (is_tackle | type.name %in% c("Interception", "Foul Committed")) &
        location.x >= 48,
      na.rm = TRUE
    ),
    high_press_actions = sum(
      (is_tackle | type.name %in% c("Interception", "Pressure")) &
        location.x >= 60,  # High press zone
      na.rm = TRUE
    ),
    .groups = "drop"
  ) %>%
  group_by(match_id) %>%
  # Passes faced are the build-up passes of the other team in the match
  mutate(opp_passes_faced = sum(buildup_passes) - buildup_passes) %>%
  ungroup()

# Tournament-level pressing summary per team
team_ppda_stats <- match_team_counts %>%
  group_by(team.name) %>%
  summarise(
    matches = n_distinct(match_id),
    opp_passes_faced = sum(opp_passes_faced),
    ppda_def_actions = sum(ppda_def_actions),
    high_press_actions = sum(high_press_actions),
    .groups = "drop"
  ) %>%
  mutate(
    ppda = if_else(ppda_def_actions > 0,
                   round(opp_passes_faced / ppda_def_actions, 2),
                   NA_real_)
  )

print("Team Pressing Intensity:")
print(team_ppda_stats %>%
        mutate(press_actions_per_match = round(high_press_actions / matches, 1)) %>%
        arrange(desc(press_actions_per_match)))
chapter10-ppda
Output
Calculating PPDA (pressing intensity)

Defensive Value Models

Modern analytics attempts to quantify the actual value of defensive actions by measuring their impact on opponent scoring probability.

Expected Threat Prevented (xT Prevented)

# Calculate xT prevented by defensive actions
import numpy as np

# Simple xT grid (probabilities of scoring from each zone).
# Rows are six 20-unit bands along the pitch length (own goal -> opponent
# goal); columns are eight 10-unit bands across the width.
xt_grid = np.array([
    [0.01, 0.01, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01],
    [0.01, 0.02, 0.03, 0.04, 0.04, 0.03, 0.02, 0.01],
    [0.02, 0.03, 0.05, 0.08, 0.08, 0.05, 0.03, 0.02],
    [0.03, 0.05, 0.10, 0.15, 0.15, 0.10, 0.05, 0.03],
    [0.05, 0.10, 0.20, 0.30, 0.30, 0.20, 0.10, 0.05],
    [0.10, 0.20, 0.35, 0.45, 0.45, 0.35, 0.20, 0.10],
])


def get_xt(x, y):
    """Get xT value for coordinates in the attacking team's frame.

    Returns 0 when either coordinate is missing (None or NaN).
    """
    if x is None or y is None or np.isnan(x) or np.isnan(y):
        return 0
    # Convert to grid indices, clamped to the grid edges
    grid_x = min(5, max(0, int(x / 20)))
    grid_y = min(7, max(0, int(y / 10)))
    return xt_grid[grid_x, grid_y]


def _coord(location, axis):
    """Return one coordinate of a location, or NaN when it is missing."""
    if isinstance(location, (list, tuple)) and len(location) > axis:
        return location[axis]
    return np.nan


# Calculate xT prevented.
# Tackles are "Duel" events with duel_type "Tackle"; there is no "Tackle"
# event type.
is_tackle = (all_events["type"] == "Duel") & (all_events["duel_type"] == "Tackle")
def_actions = all_events[
    is_tackle | all_events["type"].isin(["Interception", "Block"])
].copy()
def_actions["x"] = def_actions["location"].apply(lambda l: _coord(l, 0))
def_actions["y"] = def_actions["location"].apply(lambda l: _coord(l, 1))

# Event locations are in the acting (defending) team's frame. The threat
# removed is the attacker's threat at that spot, so mirror the location into
# the attacker's frame before the grid lookup.
def_actions["xt_prevented"] = def_actions.apply(
    lambda r: get_xt(120 - r["x"], 80 - r["y"]), axis=1)

# Aggregate
xt_by_player = def_actions.groupby(["player", "team"]).agg(
    matches=("match_id", "nunique"),
    def_actions=("type", "count"),
    total_xt_prevented=("xt_prevented", "sum"),
).reset_index()
xt_by_player["xt_per_90"] = (
    xt_by_player["total_xt_prevented"] / xt_by_player["matches"]).round(3)

print("xT Prevented Leaders:")
print(xt_by_player[xt_by_player["matches"] >= 3].sort_values(
    "xt_per_90", ascending=False).head(15))
# Calculate xT (Expected Threat) prevented by defensive actions
# xT assigns values to pitch zones based on goal probability

# Simple xT grid (8x8 zones)
# Rows are bands along the pitch length (row 1 = own goal, row 8 = opponent
# goal); columns are bands across the pitch width.
# Values represent probability of scoring from that zone
xt_grid <- matrix(c(
  0.01, 0.01, 0.01, 0.02, 0.02, 0.02, 0.02, 0.01,  # Row 1 (own goal)
  0.01, 0.01, 0.02, 0.02, 0.03, 0.03, 0.02, 0.01,
  0.01, 0.02, 0.03, 0.04, 0.05, 0.04, 0.03, 0.02,
  0.02, 0.03, 0.04, 0.06, 0.08, 0.06, 0.04, 0.03,
  0.03, 0.04, 0.06, 0.10, 0.15, 0.10, 0.06, 0.04,
  0.04, 0.06, 0.10, 0.20, 0.30, 0.20, 0.10, 0.06,
  0.08, 0.15, 0.25, 0.35, 0.40, 0.35, 0.25, 0.15,  # Row 7 (opp box)
  0.15, 0.30, 0.40, 0.50, 0.50, 0.50, 0.40, 0.30   # Row 8 (6-yard box)
), nrow = 8, byrow = TRUE)

# Look up the xT value for pitch coordinates.
#   x: position along the pitch length (0-120), scalar or vector
#   y: position across the pitch width (0-80), scalar or vector
# Returns a numeric vector as long as the longer input. Coordinates outside
# the pitch are clamped to the nearest zone; missing coordinates give 0.
get_xt <- function(x, y) {
  n <- max(length(x), length(y))
  if (n == 0) {
    return(numeric(0))
  }
  x <- rep_len(as.numeric(x), n)
  y <- rep_len(as.numeric(y), n)

  # Band sizes follow the grid dimensions (120 / 8 = 15 long, 80 / 8 = 10 wide)
  row_idx <- pmin(pmax(ceiling(x / (120 / nrow(xt_grid))), 1), nrow(xt_grid))
  col_idx <- pmin(pmax(ceiling(y / (80 / ncol(xt_grid))), 1), ncol(xt_grid))

  known <- !is.na(row_idx) & !is.na(col_idx)
  xt <- numeric(n)
  # Matrix indexing with a two-column matrix picks one cell per (row, col) pair
  xt[known] <- xt_grid[cbind(row_idx[known], col_idx[known])]
  xt
}

# Calculate xT prevented by defensive actions
# Kept actions: won tackles, plus every interception and block.
# Tackles are "Duel" events with duel.type.name "Tackle"; there is no
# "Tackle" event type.
def_xt <- events %>%
  filter(
    (type.name == "Duel" & duel.type.name %in% "Tackle" &
       duel.outcome.name %in% c("Won", "Success")) |
      type.name %in% c("Interception", "Block")
  ) %>%
  mutate(
    # Event locations are in the acting (defending) team's frame. The threat
    # removed is the attacker's threat at that spot, so mirror the location
    # into the attacker's frame before the grid lookup.
    xt_at_action = mapply(get_xt, 120 - location.x, 80 - location.y),
    # Opponent was progressing, so xT prevented is current position value
    xt_prevented = xt_at_action
  )

# Player xT prevented
xt_prevented_by_player <- def_xt %>%
  group_by(player.name, team.name) %>%
  summarise(
    matches = n_distinct(match_id),
    defensive_actions = n(),
    # na.rm keeps one event without a location from blanking the total
    total_xt_prevented = sum(xt_prevented, na.rm = TRUE),
    xt_prevented_per_90 = round(total_xt_prevented / matches, 3),
    .groups = "drop"
  ) %>%
  filter(matches >= 3) %>%
  arrange(desc(xt_prevented_per_90))

print("xT Prevented Leaders:")
print(head(xt_prevented_by_player, 15))
chapter10-xt-prevented
Output
Calculating Expected Threat prevented

The Other Side: Defensive Failures

Defensive stats must include failures—being dribbled past, errors, and penalties conceded:

# Track defensive failures
failures = all_events[
    all_events["type"].isin(["Dribbled Past", "Error", "Foul Committed"])
].copy()

failure_flags = failures.assign(
    is_dribbled_past=failures["type"] == "Dribbled Past",
    is_error=failures["type"] == "Error",
    is_foul=failures["type"] == "Foul Committed",
)
failure_stats = failure_flags.groupby(["player", "team"]).agg(
    matches=("match_id", "nunique"),
    dribbled_past=("is_dribbled_past", "sum"),
    errors=("is_error", "sum"),
    fouls=("is_foul", "sum"),
).reset_index()
# Rate over matches in which the player had at least one failure event
failure_stats["failures_per_90"] = (
    (failure_stats["dribbled_past"] + failure_stats["errors"])
    / failure_stats["matches"]).round(2)

# Net defensive contribution.
# Join on player AND team so a player listed under two teams does not fan
# out, and divide failures by the same match count used for
# ball_winning_per_90 so both rates share a denominator.
net_def = def_stats.merge(
    failure_stats[["player", "team", "dribbled_past", "errors"]],
    on=["player", "team"], how="left")
net_def[["dribbled_past", "errors"]] = (
    net_def[["dribbled_past", "errors"]].fillna(0))
net_def["failures_per_90"] = (
    (net_def["dribbled_past"] + net_def["errors"]) / net_def["matches"]
).round(2)
net_def["net_defensive"] = (
    net_def["ball_winning_per_90"] - net_def["failures_per_90"])

print("Net Defensive Contribution:")
print(net_def[net_def["matches"] >= 3].sort_values(
    "net_defensive", ascending=False).head(15))
# Track defensive failures
defensive_failures <- events %>%
  filter(
    # Dribbled past (from dribble events where defender lost)
    (type.name == "Dribbled Past") |
    # Fouls leading to dangerous free kicks. Locations are in the fouling
    # team's own frame (attacking towards x = 120), so fouls near its own
    # goal are the ones with a small x.
    (type.name == "Foul Committed" & location.x <= 40) |
    # Errors
    (type.name == "Error")
  ) %>%
  group_by(player.name, team.name) %>%
  summarise(
    matches = n_distinct(match_id),
    dribbled_past = sum(type.name == "Dribbled Past"),
    fouls_danger_zone = sum(type.name == "Foul Committed"),
    errors = sum(type.name == "Error"),
    .groups = "drop"
  ) %>%
  mutate(
    # Rate over matches in which the player had at least one failure event
    failures_per_90 = round((dribbled_past + errors) / matches, 2)
  )

# Combine with positive actions for net contribution.
# Join on player AND team so a player listed under two teams does not fan
# out, and divide failures by the same match count used for
# ball_winning_per_90 so both rates share a denominator.
net_defensive <- defensive_stats %>%
  select(player.name, team.name, matches, ball_winning_per_90) %>%
  left_join(
    defensive_failures %>%
      select(player.name, team.name, dribbled_past, errors),
    by = c("player.name", "team.name")
  ) %>%
  mutate(
    # Players with no recorded failure have no joined row: count as zero
    failures_per_90 = round(
      (coalesce(dribbled_past, 0L) + coalesce(errors, 0L)) / matches, 2
    ),
    net_defensive = ball_winning_per_90 - failures_per_90
  ) %>%
  select(-dribbled_past, -errors) %>%
  filter(matches >= 3) %>%
  arrange(desc(net_defensive))

print("Net Defensive Contribution (Wins - Failures):")
print(head(net_defensive, 15))
chapter10-failures
Output
Tracking defensive failures and net contribution

Aerial Duel Analysis

# Aerial duel analysis (simplified)
# Proxy only: every clearance and every duel is counted as aerial activity.
aerial_like_types = ["Clearance", "Duel"]
aerials = all_events[all_events["type"].isin(aerial_like_types)].copy()

aerial_stats = (
    aerials.groupby(["player", "team"])
    .agg(
        matches=("match_id", "nunique"),
        aerial_actions=("type", "count"),
    )
    .reset_index()
)

# Actions per match in which the player had at least one such event
per_match = aerial_stats["aerial_actions"] / aerial_stats["matches"]
aerial_stats["aerials_per_90"] = per_match.round(2)

print("Aerial Activity Leaders:")
qualified = aerial_stats[aerial_stats["matches"] >= 3]
print(qualified.sort_values("aerials_per_90", ascending=False).head(10))
# Aerial duel analysis
# Simplified proxy: counts clearances and lost aerial duels per player.
# A real aerial win rate needs both won and lost aerial events.
aerial_events <- events %>%
  filter(
    type.name %in% c("Aerial Lost", "Clearance") |
      (type.name == "Duel" & duel.type.name == "Aerial Lost")
  )

aerial_counts <- aerial_events %>%
  group_by(player.name, team.name) %>%
  summarise(
    matches = n_distinct(match_id),
    aerials = n(),
    .groups = "drop"
  )

aerial_stats <- aerial_counts %>%
  mutate(aerials_per_90 = round(aerials / matches, 2)) %>%
  filter(matches >= 3)

print("Aerial Activity Leaders:")
aerial_leaders <- aerial_stats %>%
  arrange(desc(aerials_per_90))
print(head(aerial_leaders, 10))
chapter10-aerial
Output
Aerial duel analysis

Chapter Summary

Key Takeaways
  • Raw numbers mislead: Always consider possession and context
  • Possession-adjust: More opponent possession = more defensive opportunities
  • PPDA measures pressing: Lower = more aggressive press
  • Include failures: Dribbled past, errors, fouls matter
  • Net contribution: Successes minus failures gives true picture
  • xT prevented: Value actions by how much threat they stopped

Defensive Metrics Reference

| Metric | Good Value | Interpretation |
|---|---|---|
| Tackle Success % | 65%+ | Wins most challenges |
| Aerial Win % | 60%+ | Dominant in the air |
| PPDA | <10 | High pressing intensity |
| Dribbled Past/90 | <1.0 | Rarely beaten 1v1 |

Practice Exercises

Test your understanding of defensive analytics with these practical exercises.

Exercise 10.1: Possession-Adjusted Defensive Actions

Task: Calculate possession-adjusted defensive statistics for players. Normalize defensive actions based on how much possession the opponent had (more opponent possession = more opportunities to defend).

Formula: Adjusted Actions = Raw Actions × (50 / Opponent Possession %)

# Exercise 10.1: Possession-Adjusted Defense
from statsbombpy import sb
import pandas as pd

# Load data
matches = sb.matches(competition_id=43, season_id=106)
all_events = pd.concat([
    sb.events(mid).assign(match_id=mid)
    for mid in matches["match_id"]
])

# Calculate possession per match (pass share as the possession proxy)
passes = all_events[all_events["type"] == "Pass"]
poss_df = (
    passes.groupby(["match_id", "team"])
    .size()
    .reset_index(name="team_passes")
)
match_total = poss_df.groupby("match_id")["team_passes"].transform("sum")
poss_df["possession"] = poss_df["team_passes"] / match_total * 100
poss_df["opp_possession"] = 100 - poss_df["possession"]

# Defensive actions.
# Tackles are "Duel" events with duel_type "Tackle"; there is no "Tackle"
# event type.
is_tackle = (all_events["type"] == "Duel") & (all_events["duel_type"] == "Tackle")
is_other_def = all_events["type"].isin(["Interception", "Clearance", "Block"])
defense = all_events[is_tackle | is_other_def]

player_def = defense.groupby(["player", "team", "match_id"]).agg(
    actions=("type", "count")
).reset_index()

# Join possession
player_def = player_def.merge(
    poss_df[["match_id", "team", "opp_possession"]],
    on=["match_id", "team"]
)

# Apply adjustment per match: Adjusted = Raw x (50 / opponent possession %)
player_def["adj_actions"] = (
    player_def["actions"] * (50 / player_def["opp_possession"]))

# Aggregate raw and adjusted totals in one pass, keyed by player AND team so
# a player listed under two teams is not mixed together
adjusted = player_def.groupby(["player", "team"]).agg(
    matches=("match_id", "nunique"),
    raw_actions=("actions", "sum"),
    avg_opp_poss=("opp_possession", "mean"),
    adj_actions=("adj_actions", "sum"),
).reset_index()

adjusted["raw_per_90"] = (adjusted["raw_actions"] / adjusted["matches"]).round(2)
adjusted["adj_per_90"] = (adjusted["adj_actions"] / adjusted["matches"]).round(2)

adjusted = adjusted[adjusted["matches"] >= 3]

print("Possession-Adjusted Defensive Actions:")
print(adjusted.sort_values("adj_per_90", ascending=False).head(15))
# Exercise 10.1: Possession-Adjusted Defense
library(StatsBombR)
library(dplyr)

# Load World Cup data
comps <- FreeCompetitions() %>%
  filter(competition_id == 43, season_id == 106)
matches <- FreeMatches(comps)
events <- free_allevents(MatchesDF = matches)

# Calculate possession by team per match (pass share as the proxy)
match_possession <- events %>%
  filter(type.name == "Pass") %>%
  group_by(match_id, team.name) %>%
  summarise(team_passes = n(), .groups = "drop") %>%
  group_by(match_id) %>%
  mutate(
    total_passes = sum(team_passes),
    possession_pct = team_passes / total_passes * 100,
    opp_possession = 100 - possession_pct
  ) %>%
  ungroup()

# Defensive actions per player per match.
# Tackles are "Duel" events with duel.type.name "Tackle"; there is no
# "Tackle" event type.
player_defense <- events %>%
  mutate(is_tackle = type.name == "Duel" & duel.type.name %in% "Tackle") %>%
  filter(is_tackle | type.name %in% c("Interception", "Clearance", "Block")) %>%
  group_by(player.name, team.name, match_id) %>%
  summarise(
    tackles = sum(is_tackle),
    interceptions = sum(type.name == "Interception"),
    clearances = sum(type.name == "Clearance"),
    blocks = sum(type.name == "Block"),
    total_actions = n(),
    .groups = "drop"
  )

# Join possession data
player_defense <- player_defense %>%
  left_join(
    match_possession %>% select(match_id, team.name, opp_possession),
    by = c("match_id", "team.name")
  )

# Aggregate and adjust
adjusted_defense <- player_defense %>%
  group_by(player.name, team.name) %>%
  summarise(
    # One row per player-match, so the row count is the match count
    matches = n(),
    raw_actions = sum(total_actions),
    avg_opp_poss = mean(opp_possession, na.rm = TRUE),

    # Possession-adjusted actions, adjusted match by match. na.rm matches
    # the mean above so one unmatched possession row cannot blank the total.
    adjusted_actions = sum(total_actions * (50 / opp_possession),
                           na.rm = TRUE),

    .groups = "drop"
  ) %>%
  filter(matches >= 3) %>%
  mutate(
    raw_per_90 = round(raw_actions / matches, 2),
    adj_per_90 = round(adjusted_actions / matches, 2),
    adjustment_factor = round(50 / avg_opp_poss, 2),

    # Difference shows impact of adjustment
    adjustment_impact = round(adj_per_90 - raw_per_90, 2)
  ) %>%
  arrange(desc(adj_per_90))

print("Possession-Adjusted Defensive Actions:")
print(head(adjusted_defense, 15))

# Note: the factor is 50 / opponent possession, so players whose opponents
# had less than 50% possession (high-possession teams) are scaled up and
# players whose opponents had more than 50% are scaled down
print("\nBiggest Adjustment Impacts:")
print(adjusted_defense %>%
        arrange(desc(abs(adjustment_impact))) %>%
        select(player.name, avg_opp_poss, raw_per_90, adj_per_90, adjustment_impact) %>%
        head(10))
ex101-solution
Output
Exercise 10.1: Calculate possession-adjusted defensive stats
Exercise 10.2: Team PPDA Analysis

Task: Calculate PPDA (Passes Per Defensive Action) for all teams in a tournament. PPDA measures pressing intensity - lower values indicate more aggressive pressing.

Definition: PPDA = Opponent passes in their defensive 60% / Your defensive actions in that zone

# Exercise 10.2: Team PPDA Analysis
import pandas as pd
import matplotlib.pyplot as plt


def _x_coord(location):
    """Return the x coordinate of a location, or NaN when it is missing.

    Missing locations may be NaN (a truthy float) rather than None, so the
    value is checked by type instead of truthiness.
    """
    if isinstance(location, (list, tuple)) and len(location) > 0:
        return location[0]
    return float("nan")


def calculate_ppda(match_events, team):
    """Calculate PPDA for a team in a match.

    Returns opponent passes with x <= 72 divided by the team's tackles,
    interceptions and fouls with x >= 48, or None when the team made no such
    defensive action.
    """
    other_teams = match_events[match_events["team"] != team]["team"].unique()
    x = match_events["location"].apply(_x_coord)  # NaN compares False

    # Opponent passes in their defensive 60%
    opp_passes = match_events[
        (match_events["team"].isin(other_teams))
        & (match_events["type"] == "Pass")
        & (x <= 72)
    ]

    # Our defensive actions in that zone. Tackles are "Duel" events with
    # duel_type "Tackle"; there is no "Tackle" event type.
    is_tackle = (match_events["type"] == "Duel") & (
        match_events["duel_type"] == "Tackle")
    is_def_action = is_tackle | match_events["type"].isin(
        ["Interception", "Foul Committed"])
    def_actions = match_events[
        (match_events["team"] == team) & is_def_action & (x >= 48)
    ]

    if len(def_actions) == 0:
        return None
    return len(opp_passes) / len(def_actions)


# Calculate for all matches
ppda_results = []
for match_id in all_events["match_id"].unique():
    match_events = all_events[all_events["match_id"] == match_id]
    teams = match_events["team"].unique()

    for team in teams:
        ppda = calculate_ppda(match_events, team)
        # Compare with None explicitly: a PPDA of 0.0 is valid but falsy
        if ppda is not None:
            ppda_results.append({
                "match_id": match_id,
                "team": team,
                "ppda": ppda
            })

ppda_df = pd.DataFrame(ppda_results)

# Aggregate
team_ppda = ppda_df.groupby("team").agg(
    matches=("match_id", "count"),
    avg_ppda=("ppda", "mean"),
    min_ppda=("ppda", "min"),
    max_ppda=("ppda", "max")
).reset_index().round(2)


# Classify
def classify_press(ppda):
    """Map an average PPDA value to a pressing-style label."""
    if ppda < 8:
        return "Very High Press"
    elif ppda < 10:
        return "High Press"
    elif ppda < 12:
        return "Medium Press"
    return "Low Block"


team_ppda["style"] = team_ppda["avg_ppda"].apply(classify_press)

print("Team PPDA Rankings:")
print(team_ppda.sort_values("avg_ppda"))

# Plot
team_ppda_filtered = team_ppda[team_ppda["matches"] >= 3].sort_values("avg_ppda")
colors = {"Very High Press": "#1B5E20", "High Press": "#4CAF50",
          "Medium Press": "#FFC107", "Low Block": "#F44336"}

fig, ax = plt.subplots(figsize=(12, 10))
bars = ax.barh(team_ppda_filtered["team"], team_ppda_filtered["avg_ppda"],
               color=[colors[s] for s in team_ppda_filtered["style"]])
ax.axvline(x=8, color="gray", linestyle="--", alpha=0.5)
ax.axvline(x=10, color="gray", linestyle="--", alpha=0.5)
ax.set_xlabel("PPDA (Lower = More Pressing)")
ax.set_title("Team Pressing Intensity")
plt.tight_layout()
plt.savefig("team_ppda.png", dpi=150)
# Exercise 10.2: Team PPDA Analysis
library(StatsBombR)
library(dplyr)
library(ggplot2)

# PPDA for one team in one match.
#   match_events: events of a single match
#   team:         name of the pressing team
# Returns opponent passes with x <= 72 divided by the team's tackles,
# interceptions and fouls with x >= 48, or NA when the team has no such action.
calculate_ppda <- function(match_events, team) {
  # Opponent passes in their defensive 60% (x <= 72)
  other_teams <- setdiff(unique(match_events$team.name), team)

  opp_passes <- match_events %>%
    filter(team.name %in% other_teams,
           type.name == "Pass",
           location.x <= 72) %>%
    nrow()

  # Our defensive actions in opponent defensive 60% (x >= 48 for us).
  # Tackles are "Duel" events with duel.type.name "Tackle"; there is no
  # "Tackle" event type.
  our_def_actions <- match_events %>%
    filter(team.name == team,
           type.name %in% c("Interception", "Foul Committed") |
             (type.name == "Duel" & duel.type.name %in% "Tackle"),
           location.x >= 48) %>%
    nrow()

  if (our_def_actions == 0) return(NA)
  opp_passes / our_def_actions
}

# Calculate for all teams across all matches
teams <- unique(events$team.name)

# Build one small data frame per match and bind once at the end, instead of
# growing a data frame inside a loop
ppda_results <- bind_rows(lapply(unique(events$match_id), function(mid) {
  match_events <- events %>% filter(match_id == mid)
  match_teams <- unique(match_events$team.name)

  data.frame(
    match_id = rep(mid, length(match_teams)),
    team = match_teams,
    ppda = vapply(
      match_teams,
      function(team) as.numeric(calculate_ppda(match_events, team)),
      numeric(1)
    ),
    row.names = NULL
  )
}))

# Aggregate by team
team_ppda <- ppda_results %>%
  filter(!is.na(ppda)) %>%
  group_by(team) %>%
  summarise(
    matches = n(),
    avg_ppda = round(mean(ppda), 2),
    min_ppda = round(min(ppda), 2),
    max_ppda = round(max(ppda), 2),
    .groups = "drop"
  ) %>%
  mutate(
    press_intensity = case_when(
      avg_ppda < 8 ~ "Very High Press",
      avg_ppda < 10 ~ "High Press",
      avg_ppda < 12 ~ "Medium Press",
      TRUE ~ "Low Block"
    )
  ) %>%
  arrange(avg_ppda)

print("Team PPDA Rankings (Lower = More Pressing):")
print(team_ppda)

# Visualization
ggplot(team_ppda %>% filter(matches >= 3),
       aes(x = reorder(team, avg_ppda), y = avg_ppda, fill = press_intensity)) +
  geom_col() +
  geom_hline(yintercept = c(8, 10, 12), linetype = "dashed", alpha = 0.5) +
  coord_flip() +
  scale_fill_manual(values = c("Very High Press" = "#1B5E20",
                               "High Press" = "#4CAF50",
                               "Medium Press" = "#FFC107",
                               "Low Block" = "#F44336")) +
  labs(title = "Team Pressing Intensity (PPDA)",
       subtitle = "Lower PPDA = More Aggressive Pressing",
       x = "", y = "Passes Per Defensive Action",
       fill = "Press Style") +
  theme_minimal()

ggsave("team_ppda.png", width = 12, height = 10)
ex102-solution
Output
Exercise 10.2: Calculate and visualize team PPDA
Exercise 10.3: Net Defensive Value Index

Task: Create a comprehensive defensive value index that combines successful defensive actions with failures. Calculate a net score that accounts for tackles won, interceptions, and ball recoveries minus times dribbled past, errors, and fouls in dangerous areas.

Formula: Net Defensive Value = (Tackles Won + Interceptions + Recoveries) - (Dribbled Past + Errors + Dangerous Fouls)

# Exercise 10.3: Net Defensive Value Index
import pandas as pd
import matplotlib.pyplot as plt


def _x_coord(location):
    """Return the x coordinate of a location, or NaN when it is missing."""
    if isinstance(location, (list, tuple)) and len(location) > 0:
        return location[0]
    return float("nan")


# Positive actions: tackles won, interceptions, successful recoveries.
# Tackles are "Duel" events with duel_type "Tackle"; there is no "Tackle"
# event type.
is_tackle_won = (
    (all_events["type"] == "Duel")
    & (all_events["duel_type"] == "Tackle")
    & all_events["duel_outcome"].isin(["Won", "Success"])
)
is_interception = all_events["type"] == "Interception"
# The recovery-failure flag is only set on failed recoveries; the column may
# be absent when no recovery failed
if "ball_recovery_recovery_failure" in all_events.columns:
    recovery_failed = all_events["ball_recovery_recovery_failure"].eq(True)
else:
    recovery_failed = pd.Series(False, index=all_events.index)
is_recovery = (all_events["type"] == "Ball Recovery") & ~recovery_failed

pos_events = all_events[is_tackle_won | is_interception | is_recovery].copy()
pos_flags = pos_events.assign(
    is_tackle=pos_events["type"] == "Duel",
    is_interception=pos_events["type"] == "Interception",
    is_recovery=pos_events["type"] == "Ball Recovery",
)
positive = pos_flags.groupby(["player", "team"]).agg(
    matches=("match_id", "nunique"),
    tackles=("is_tackle", "sum"),
    interceptions=("is_interception", "sum"),
    recoveries=("is_recovery", "sum"),
    positive_actions=("type", "count"),
).reset_index()

# Negative actions. Locations are in the fouling team's own frame (attacking
# towards x = 120), so fouls in a dangerous area near its own goal are the
# ones with a small x.
event_x = all_events["location"].apply(_x_coord)  # NaN compares False
neg_events = all_events[
    (all_events["type"] == "Dribbled Past")
    | (all_events["type"] == "Error")
    | ((all_events["type"] == "Foul Committed") & (event_x <= 40))
].copy()
neg_flags = neg_events.assign(
    is_dribbled_past=neg_events["type"] == "Dribbled Past",
    is_error=neg_events["type"] == "Error",
    is_foul=neg_events["type"] == "Foul Committed",
)
negative = neg_flags.groupby(["player", "team"]).agg(
    dribbled_past=("is_dribbled_past", "sum"),
    errors=("is_error", "sum"),
    dangerous_fouls=("is_foul", "sum"),
    negative_actions=("type", "count"),
).reset_index()

# Merge: players without any negative event get zeros
net_def = positive.merge(negative, on=["player", "team"], how="left")
net_def = net_def.fillna(0)
net_def = net_def[net_def["matches"] >= 3].copy()

# Calculate metrics (per match in which the player had a positive action)
net_def["net_value"] = net_def["positive_actions"] - net_def["negative_actions"]
net_def["net_per_90"] = (net_def["net_value"] / net_def["matches"]).round(2)
net_def["pos_per_90"] = (net_def["positive_actions"] / net_def["matches"]).round(2)
net_def["neg_per_90"] = (net_def["negative_actions"] / net_def["matches"]).round(2)


# Rating
def rate(val):
    """Map a net-per-90 value to a rating label."""
    if val > 5:
        return "Elite"
    elif val > 3:
        return "Good"
    elif val > 1:
        return "Average"
    return "Below Average"


net_def["rating"] = net_def["net_per_90"].apply(rate)

print("Net Defensive Value Index:")
print(net_def.sort_values("net_per_90", ascending=False).head(20)[
    ["player", "matches", "pos_per_90", "neg_per_90", "net_per_90", "rating"]])

# Plot
colors = {"Elite": "#1B5E20", "Good": "#4CAF50",
          "Average": "#FFC107", "Below Average": "#F44336"}

fig, ax = plt.subplots(figsize=(12, 10))
for rating in colors:
    subset = net_def[net_def["rating"] == rating]
    ax.scatter(subset["pos_per_90"], subset["neg_per_90"],
               c=colors[rating], label=rating,
               s=subset["matches"] * 20, alpha=0.7)

ax.plot([0, 10], [0, 10], "k--", alpha=0.5, label="Break even")
ax.set_xlabel("Positive Actions per 90")
ax.set_ylabel("Negative Actions per 90")
ax.set_title("Net Defensive Value Analysis")
ax.legend()
plt.savefig("net_defensive_value.png", dpi=150)
plt.show()
# Exercise 10.3: Net Defensive Value Index
library(StatsBombR)
library(dplyr)
library(ggplot2)

# Positive defensive actions: tackles won, interceptions, successful
# recoveries.
positive_defense <- events %>%
  mutate(
    # Tackles are "Duel" events with duel.type.name "Tackle"; there is no
    # "Tackle" event type
    is_tackle_won = type.name == "Duel" &
      duel.type.name %in% "Tackle" &
      duel.outcome.name %in% c("Won", "Success"),
    # recovery_failure is only set on failed recoveries, so a successful
    # recovery is one where the flag is not TRUE
    is_recovery = type.name == "Ball Recovery" &
      !(ball_recovery.recovery_failure %in% TRUE)
  ) %>%
  filter(is_tackle_won | type.name == "Interception" | is_recovery) %>%
  group_by(player.name, team.name) %>%
  summarise(
    matches = n_distinct(match_id),
    tackles_won = sum(is_tackle_won),
    interceptions = sum(type.name == "Interception"),
    recoveries = sum(is_recovery),
    positive_actions = n(),
    .groups = "drop"
  )

# Negative defensive events
negative_defense <- events %>%
  filter(
    (type.name == "Dribbled Past") |
    (type.name == "Error") |
    # Dangerous area: locations are in the fouling team's own frame
    # (attacking towards x = 120), so fouls near its own goal have a small x
    (type.name == "Foul Committed" & location.x <= 40)
  ) %>%
  group_by(player.name, team.name) %>%
  summarise(
    dribbled_past = sum(type.name == "Dribbled Past"),
    errors = sum(type.name == "Error"),
    dangerous_fouls = sum(type.name == "Foul Committed"),
    negative_actions = n(),
    .groups = "drop"
  )

# Combine for net value
net_defensive <- positive_defense %>%
  left_join(negative_defense, by = c("player.name", "team.name")) %>%
  mutate(
    # Players without any negative event have no joined row: count as zero
    across(c(dribbled_past, errors, dangerous_fouls, negative_actions),
           ~ifelse(is.na(.), 0, .))
  ) %>%
  filter(matches >= 3) %>%
  mutate(
    # Net value
    net_value = positive_actions - negative_actions,
    net_per_90 = round(net_value / matches, 2),

    # Component breakdown
    positive_per_90 = round(positive_actions / matches, 2),
    negative_per_90 = round(negative_actions / matches, 2),

    # Rating
    rating = case_when(
      net_per_90 > 5 ~ "Elite",
      net_per_90 > 3 ~ "Good",
      net_per_90 > 1 ~ "Average",
      TRUE ~ "Below Average"
    )
  ) %>%
  arrange(desc(net_per_90))

print("Net Defensive Value Index:")
print(head(net_defensive %>%
             select(player.name, matches, positive_per_90, negative_per_90,
                    net_per_90, rating), 20))

# Visualization: Positive vs Negative scatter
ggplot(net_defensive, aes(x = positive_per_90, y = negative_per_90)) +
  geom_point(aes(color = rating, size = matches), alpha = 0.7) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray") +
  scale_color_manual(values = c("Elite" = "#1B5E20", "Good" = "#4CAF50",
                                "Average" = "#FFC107", "Below Average" = "#F44336")) +
  labs(
    title = "Net Defensive Value Analysis",
    subtitle = "Players below the line have positive net defensive value",
    x = "Positive Actions per 90 (Tackles Won, Interceptions, Recoveries)",
    y = "Negative Actions per 90 (Dribbled Past, Errors, Dangerous Fouls)",
    color = "Rating", size = "Matches"
  ) +
  theme_minimal()

ggsave("net_defensive_value.png", width = 12, height = 10)
ex103-solution
Output
Exercise 10.3: Calculate net defensive value index

Continue Your Journey

You've completed the core analytics modules! Explore positional analytics next to evaluate players by their specific roles.

Continue to Goalkeeper Analytics