Chapter 60

Capstone - Complete Analytics System

Intermediate 30 min read 5 sections 10 code examples
0 of 60 chapters completed (0%)

Formations provide the structural framework for how teams organize in attack and defense. Modern formation analysis goes beyond static shapes to understand dynamic positioning, shape transitions, and how formations adapt to different game states.

Formation Detection

Formation detection involves analyzing player positions to determine the team's structural shape. This can be done using tracking data or by aggregating touch locations from event data.

formation_detection.py
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Load tracking data
# Read the full tracking export, then keep only rows whose 'team' column is
# 'home'; the Python examples below analyse that subset.
tracking_data = pd.read_csv("tracking_data.csv")
home_tracking = tracking_data[tracking_data['team'] == 'home']

def calculate_average_positions(tracking, period='full'):
    """
    Return the mean pitch location of every outfield player.

    When ``period`` is anything other than 'full', only rows whose
    ``game_state`` equals ``period`` are used. Rows are averaged per
    (player_id, player_name, position) and goalkeepers are removed.

    Returns a DataFrame with columns player_id, player_name, position,
    avg_x and avg_y.
    """
    frames = tracking
    if period != 'full':
        frames = frames[frames['game_state'] == period]

    per_player = (
        frames
        .groupby(['player_id', 'player_name', 'position'])
        .agg(avg_x=('x', 'mean'), avg_y=('y', 'mean'))
        .reset_index()
    )

    # The goalkeeper is not part of the formation shape
    is_keeper = per_player['position'] == 'Goalkeeper'
    return per_player[~is_keeper]

def classify_formation(avg_positions):
    """
    Classify team formation by clustering players into lines along x.

    Players are clustered on ``avg_x`` with k-means for k = 2..4 and the k
    with the best silhouette score is kept. Lines are then renumbered from
    the lowest to the highest mean ``avg_x`` and the formation string is the
    player count per line joined with '-'.

    With fewer than 3 players no clustering is possible (the silhouette score
    needs 2 <= k <= n - 1), so all players are placed in a single line.

    Returns a dict with:
        'formation'    - e.g. '4-3-3' ('' for empty input)
        'line_counts'  - list of players per line, ordered by depth
        'player_lines' - copy of avg_positions sorted by avg_x plus a 'line'
                         column
    """
    # Sort by x position
    sorted_players = avg_positions.sort_values('avg_x').copy()
    n_players = len(sorted_players)

    # Get x positions for clustering
    x_positions = sorted_players['avg_x'].values.reshape(-1, 1)

    # Cap the candidate line counts so tiny inputs do not crash k-means
    max_k = min(4, n_players - 1)

    # Default: every player in one line (used when no k can be evaluated)
    best_labels = np.zeros(n_players, dtype=int)
    best_score = -np.inf

    for k in range(2, max_k + 1):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(x_positions)
        # Duplicate x values can collapse clusters; silhouette needs >= 2
        if len(np.unique(labels)) < 2:
            continue
        score = silhouette_score(x_positions, labels)
        # Strict '>' keeps the smallest k on ties, like np.argmax did
        if score > best_score:
            best_score = score
            best_labels = labels

    # Reuse the winning labels instead of refitting the same model
    sorted_players['line'] = best_labels

    # Sort lines by average x position so line 0 is the deepest
    line_order = (
        sorted_players
        .groupby('line')['avg_x']
        .mean()
        .sort_values()
        .index
    )
    line_mapping = {old: new for new, old in enumerate(line_order)}
    sorted_players['line'] = sorted_players['line'].map(line_mapping)

    # Count players per line
    line_counts = (
        sorted_players
        .groupby('line')
        .size()
        .sort_index()
        .tolist()
    )

    formation = '-'.join(map(str, line_counts))

    return {
        'formation': formation,
        'line_counts': line_counts,
        'player_lines': sorted_players
    }

# Detect formation
# Average the home side's positions over the whole match (period='full' by
# default), cluster them into lines and print the resulting label.
avg_positions = calculate_average_positions(home_tracking)
formation_result = classify_formation(avg_positions)
print(f"Detected formation: {formation_result['formation']}")

def visualize_formation(avg_positions, formation_label):
    """
    Draw each player's average position on a 100 x 100 pitch.

    ``avg_positions`` must provide avg_x, avg_y and position columns; every
    player is drawn as a white marker labelled with the first three
    characters of the position. ``formation_label`` goes into the title.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # Pitch canvas
    ax.set_facecolor('#1a5f1a')
    ax.set_xlim(0, 100)
    ax.set_ylim(0, 100)

    # One marker plus one text label per player
    coords = zip(
        avg_positions['avg_x'],
        avg_positions['avg_y'],
        avg_positions['position'],
    )
    for x, y, role in coords:
        ax.scatter(x, y, s=500, c='white', edgecolors='black', zorder=5)
        ax.text(x, y, role[:3],
                ha='center', va='center', fontsize=8, fontweight='bold')

    ax.set_title(f'Formation: {formation_label}', fontsize=14)
    ax.set_xlabel('Pitch Length')
    ax.set_ylabel('Pitch Width')

    plt.tight_layout()
    plt.show()

# Plot the classified players ('player_lines' is the sorted copy of the
# average positions plus a 'line' column) under the detected label
visualize_formation(
    formation_result['player_lines'],
    formation_result['formation']
)
library(tidyverse)
library(cluster)

# Load tracking data
# NOTE: the home-side filter is applied at load time, so `tracking_data`
# holds only rows with team == "home" for the rest of the R examples.
tracking_data <- read_csv("tracking_data.csv") %>%
  filter(team == "home")

# Calculate average positions for formation detection
calculate_average_positions <- function(tracking, period = "full") {
  # Any period other than "full" restricts the data to that game_state value
  frames <- if (period != "full") {
    filter(tracking, game_state == period)
  } else {
    tracking
  }

  # One row per player: mean location and number of rows contributing to it
  per_player <- summarise(
    group_by(frames, player_id, player_name, position),
    avg_x = mean(x, na.rm = TRUE),
    avg_y = mean(y, na.rm = TRUE),
    touches = n(),
    .groups = "drop"
  )

  # Exclude GK for formation
  filter(per_player, position != "Goalkeeper")
}

# Classify formation based on player positions
classify_formation <- function(avg_positions) {
  # Classify a formation by clustering players into lines along avg_x.
  #
  # avg_positions: data frame with one row per outfield player and at least
  #   an `avg_x` column.
  # Returns a list with `formation` (e.g. "4-3-3"), `line_counts` (players
  #   per line, deepest line first) and `player_lines` (input sorted by
  #   avg_x plus a `line` column, 1 = deepest line).

  # Sort by x position to identify lines
  sorted_players <- avg_positions %>%
    arrange(avg_x)

  x <- sorted_players$avg_x

  # silhouette() needs 2 <= k <= n - 1 and kmeans() needs at least k distinct
  # points, so cap the candidate line counts accordingly
  max_k <- min(4L, length(unique(x)) - 1L)

  if (max_k < 2L) {
    # Too few players to split into lines: treat everyone as one line
    sorted_players$line <- rep(1L, nrow(sorted_players))
  } else {
    candidate_k <- 2:max_k

    # kmeans() uses random starts, so keep each fit: the clustering that was
    # scored must be the one used for the final assignment
    fits <- lapply(candidate_k, function(k) {
      kmeans(x, centers = k, nstart = 10)
    })

    x_dist <- dist(x)
    silhouette_scores <- vapply(
      fits,
      function(fit) mean(silhouette(fit$cluster, x_dist)[, 3]),
      numeric(1)
    )

    best_fit <- fits[[which.max(silhouette_scores)]]

    # Cluster ids are arbitrary; renumber them by centre so that line 1 is
    # the deepest line and the formation string reads defence -> attack
    depth_order <- order(best_fit$centers[, 1])
    sorted_players$line <- match(best_fit$cluster, depth_order)
  }

  # Count players in each line
  line_counts <- sorted_players %>%
    group_by(line) %>%
    summarise(n = n(), .groups = "drop") %>%
    arrange(line) %>%
    pull(n)

  # Determine formation string
  formation <- paste(line_counts, collapse = "-")

  list(
    formation = formation,
    line_counts = line_counts,
    player_lines = sorted_players
  )
}

# Detect formation
# Pipe the (home-only) tracking data through position averaging and line
# clustering; the result is a list with formation, line_counts, player_lines.
formation_result <- tracking_data %>%
  calculate_average_positions() %>%
  classify_formation()

cat("Detected formation:", formation_result$formation, "\n")

# Visualize formation
visualize_formation <- function(avg_positions, formation_label) {
  # Builds (and returns) a ggplot of average player positions.
  # avg_positions needs avg_x, avg_y and position columns; formation_label
  # is appended to the plot title.

  # Pitch outline
  pitch_layer <- annotate(
    "rect",
    xmin = 0, xmax = 100, ymin = 0, ymax = 100,
    fill = "darkgreen", alpha = 0.3
  )

  # Players: a marker plus the leading capital letters of the position
  player_layers <- list(
    geom_point(size = 8, color = "white"),
    geom_text(
      aes(label = str_extract(position, "^[A-Z]+")),
      size = 3, fontface = "bold"
    )
  )

  # Formation label
  plot_labels <- labs(
    title = paste("Formation:", formation_label),
    x = "Pitch Length", y = "Pitch Width"
  )

  ggplot(avg_positions, aes(x = avg_x, y = avg_y)) +
    pitch_layer +
    player_layers +
    plot_labels +
    coord_fixed(ratio = 1) +
    theme_minimal()
}

# Draw the detected formation; the plot is shown because the ggplot object
# returned by visualize_formation() is auto-printed at top level
visualize_formation(
  formation_result$player_lines,
  formation_result$formation
)

Formation Shape Metrics

Beyond identifying the formation, we can measure its shape characteristics: compactness, width, length, and how these change during different game states.

| Metric | Description | Interpretation |
|---|---|---|
| Length | Distance between the deepest and the highest outfield players | Lower = more compact vertically |
| Width | Distance between the widest players | Higher = more stretched horizontally |
| Surface Area | Area covered by the convex hull of the players | Lower = more compact overall |
| Defensive Line Height | Average x-position of the defensive line | Higher = more aggressive pressing |
| Centroid | Average position of all players | Team's center of gravity |
shape_metrics.py
from scipy.spatial import ConvexHull
import numpy as np

def calculate_shape_metrics(tracking_frame):
    """
    Calculate formation shape metrics for a single frame.

    ``tracking_frame`` needs 'position', 'x' and 'y' columns. Goalkeepers
    are ignored. Returns None when fewer than 3 outfield players are
    present, otherwise a dict with centroid_x, centroid_y, length, width,
    surface_area, defensive_line_height, midfield_height and
    compactness_ratio (in that key order).
    """
    outfield = tracking_frame[tracking_frame['position'] != 'Goalkeeper']

    if len(outfield) < 3:
        return None

    by_depth = outfield.sort_values('x')

    # Length (vertical compactness) and width (horizontal spread)
    length = outfield['x'].max() - outfield['x'].min()
    width = outfield['y'].max() - outfield['y'].min()

    # Surface area using convex hull; for 2-D input `volume` is the area.
    # Degenerate input (e.g. collinear players) makes Qhull raise, in which
    # case the area is reported as 0. Only Exception is caught so that
    # KeyboardInterrupt/SystemExit still propagate.
    try:
        surface_area = ConvexHull(outfield[['x', 'y']].values).volume
    except Exception:
        surface_area = 0

    return {
        # Team centroid
        'centroid_x': outfield['x'].mean(),
        'centroid_y': outfield['y'].mean(),
        'length': length,
        'width': width,
        'surface_area': surface_area,
        # Defensive line height: mean x of the 4 deepest players
        'defensive_line_height': by_depth['x'].iloc[:4].mean(),
        # Midfield height: players 4-7 by depth.
        # NOTE(review): this slice shares its first player with the
        # defensive line above -- confirm whether iloc[4:8] was intended.
        'midfield_height': by_depth['x'].iloc[3:7].mean(),
        # Compactness ratio (0 when the team has no horizontal spread)
        'compactness_ratio': length / width if width > 0 else 0,
    }

# Calculate metrics for all frames
def calculate_shape_timeline(tracking_data):
    """
    Calculate shape metrics over time.

    Returns a DataFrame with one row per frame for which
    ``calculate_shape_metrics`` produced a result, holding that function's
    metrics plus a 'frame' column. Frames appear in first-seen order.
    """
    results = []

    # A single groupby pass replaces re-masking the whole table per frame
    # (which costs O(frames x rows)); sort=False keeps first-seen order.
    for frame, frame_data in tracking_data.groupby('frame', sort=False):
        metrics = calculate_shape_metrics(frame_data)
        if metrics:
            metrics['frame'] = frame
            results.append(metrics)

    return pd.DataFrame(results)

# Per-frame shape metrics for the home side; plotted further below
shape_timeline = calculate_shape_timeline(home_tracking)

# Analyze by game state
def analyze_shape_by_state(tracking_data):
    """
    Compare shape metrics across game states.

    Shape metrics are computed per (game_state, frame) and averaged per
    game state. Returns a DataFrame with columns game_state, length, width,
    surface_area and defensive_line_height; it is empty (same columns) when
    no frame yields metrics.
    """
    summary_columns = [
        'length', 'width', 'surface_area', 'defensive_line_height'
    ]
    results = []

    # One grouped pass instead of filtering the table for every state and
    # then again for every frame
    grouped = tracking_data.groupby(['game_state', 'frame'], sort=False)
    for (state, _frame), frame_data in grouped:
        metrics = calculate_shape_metrics(frame_data)
        if metrics:
            metrics['game_state'] = state
            results.append(metrics)

    # Without this guard the groupby below raises KeyError on an empty frame
    if not results:
        return pd.DataFrame(columns=['game_state'] + summary_columns)

    df = pd.DataFrame(results)

    summary = df.groupby('game_state')[summary_columns].mean().reset_index()

    return summary

# Average shape per game state for the home side
shape_by_state = analyze_shape_by_state(home_tracking)
print(shape_by_state)

# Visualize shape evolution
# Three per-frame series from shape_timeline on one axis: team length,
# team width and defensive line height.
fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(shape_timeline['frame'], shape_timeline['length'],
        label='Length', alpha=0.7)
ax.plot(shape_timeline['frame'], shape_timeline['width'],
        label='Width', alpha=0.7)
ax.plot(shape_timeline['frame'], shape_timeline['defensive_line_height'],
        label='Defensive Line', alpha=0.7)

ax.set_xlabel('Time (frames)')
# NOTE(review): the label says meters, but the pitch plots in this file use
# a 0-100 coordinate range -- confirm the units of x/y in the tracking data.
ax.set_ylabel('Distance (meters)')
ax.set_title('Team Shape Evolution During Match')
ax.legend()
plt.tight_layout()
plt.show()
library(sf)

# Calculate formation shape metrics
calculate_shape_metrics <- function(tracking_frame) {
  # Shape metrics for a single frame of tracking data.
  #
  # tracking_frame: data frame with `position`, `x` and `y` columns.
  # Returns a one-row tibble (centroid_x, centroid_y, length, width,
  #   surface_area, defensive_line_height, midfield_height,
  #   compactness_ratio), or a zero-row tibble when fewer than 3 outfield
  #   players are present. A data frame is returned in every case because
  #   dplyr::group_modify() rejects NULL results.

  outfield_players <- tracking_frame %>%
    filter(position != "Goalkeeper")

  if (nrow(outfield_players) < 3) return(tibble())

  metrics <- list()

  # Team centroid
  metrics$centroid_x <- mean(outfield_players$x)
  metrics$centroid_y <- mean(outfield_players$y)

  # Length (vertical compactness)
  metrics$length <- max(outfield_players$x) - min(outfield_players$x)

  # Width (horizontal spread)
  metrics$width <- max(outfield_players$y) - min(outfield_players$y)

  # Surface area using convex hull (as.numeric drops any `units` class so
  # the column stays a plain double)
  points_sf <- st_as_sf(outfield_players, coords = c("x", "y"))
  hull <- st_convex_hull(st_union(points_sf))
  metrics$surface_area <- as.numeric(st_area(hull))

  by_depth <- outfield_players %>%
    arrange(x)

  # Defensive line height (average of 4 deepest players)
  metrics$defensive_line_height <- mean(head(by_depth, 4)$x)

  # Midfield line height
  # NOTE(review): rows 4:7 share row 4 with the defensive line above --
  # confirm whether 5:8 was intended.
  metrics$midfield_height <- mean(slice(by_depth, 4:7)$x)

  # Compactness ratio (length / width); 0 when the team has no horizontal
  # spread, instead of Inf/NaN from a division by zero
  metrics$compactness_ratio <- if (isTRUE(metrics$width > 0)) {
    metrics$length / metrics$width
  } else {
    0
  }

  as_tibble(metrics)
}

# Calculate metrics for each frame
# group_modify() calls calculate_shape_metrics() once per frame and binds
# the per-frame results into one table keyed by `frame`.
shape_over_time <- tracking_data %>%
  group_by(frame) %>%
  group_modify(~calculate_shape_metrics(.x)) %>%
  ungroup()

# Analyze shape by game state
# Metrics are computed per (frame, game_state) and then averaged within each
# game state.
shape_by_state <- tracking_data %>%
  group_by(frame, game_state) %>%
  group_modify(~calculate_shape_metrics(.x)) %>%
  ungroup() %>%
  group_by(game_state) %>%
  summarise(
    avg_length = mean(length, na.rm = TRUE),
    avg_width = mean(width, na.rm = TRUE),
    avg_surface_area = mean(surface_area, na.rm = TRUE),
    avg_defensive_line = mean(defensive_line_height, na.rm = TRUE),
    .groups = "drop"
  )

# Visualize
# Reshape to one row per (game_state, metric) so a single layer can dodge
# the bars side by side; two separate bar layers would draw on top of each
# other because dodging only works within one layer.
shape_by_state %>%
  select(game_state, Length = avg_length, Width = avg_width) %>%
  pivot_longer(
    cols = c(Length, Width),
    names_to = "metric", values_to = "value"
  ) %>%
  ggplot(aes(x = game_state, y = value, fill = metric)) +
  geom_col(position = "dodge", alpha = 0.7) +
  labs(
    title = "Team Shape by Game State",
    x = "Game State", y = "Distance (meters)", fill = "Metric"
  ) +
  theme_minimal()

# Plot shape evolution over time
# One line per metric; the colour aesthetic is mapped to a constant string
# per layer so that each line gets a legend entry.
ggplot(shape_over_time, aes(x = frame)) +
  geom_line(aes(y = length, color = "Length")) +
  geom_line(aes(y = width, color = "Width")) +
  geom_line(aes(y = defensive_line_height, color = "Def Line")) +
  labs(
    title = "Team Shape Evolution",
    x = "Time (frames)", y = "Distance (meters)", color = "Metric"
  ) +
  theme_minimal()

In-Possession vs Out-of-Possession Shapes

Teams typically adopt different shapes depending on whether they have the ball or not. Analyzing these differences reveals tactical intent.

possession_shapes.py
def analyze_possession_shapes(tracking, events):
    """
    Compare team shapes in and out of possession.

    Each frame is assigned to 'in_possession' or 'out_possession' using the
    first ``in_possession`` value of that frame (every frame counts as in
    possession when the column is absent). ``events`` is accepted for
    interface compatibility but is not used by this simplified version.

    Returns a dict keyed by state (only states with data) whose values hold
    the mean length, width, surface_area and defensive_line_height.
    """
    # Simplified: assume we have possession state in tracking data
    # In practice, derive from event data

    # The column check does not depend on the frame, so do it once
    has_flag = 'in_possession' in tracking.columns

    results = {'in_possession': [], 'out_possession': []}

    # One grouped pass instead of masking the full table per frame
    for _frame, frame_data in tracking.groupby('frame', sort=False):
        # Determine possession state (simplified)
        in_poss = frame_data['in_possession'].iloc[0] if has_flag else True

        metrics = calculate_shape_metrics(frame_data)

        if metrics:
            state = 'in_possession' if in_poss else 'out_possession'
            results[state].append(metrics)

    # Aggregate
    summary_columns = [
        'length', 'width', 'surface_area', 'defensive_line_height'
    ]
    comparison = {}
    for state, data in results.items():
        if data:
            df = pd.DataFrame(data)
            comparison[state] = {
                column: df[column].mean() for column in summary_columns
            }

    return comparison

# Calculate average positions by possession state
def calculate_positions_by_possession(tracking):
    """
    Average player positions separately for each possession state.

    Returns a tuple (positions_in, positions_out); each is a DataFrame with
    columns player_name, position, avg_x and avg_y built from the rows whose
    ``in_possession`` flag is True / False respectively.
    """
    def mean_positions(flag):
        # Rows for one possession state, averaged per player and position
        rows = tracking[tracking['in_possession'] == flag]
        means = rows.groupby(['player_name', 'position'])[['x', 'y']].mean()
        return means.reset_index().rename(
            columns={'x': 'avg_x', 'y': 'avg_y'}
        )

    return mean_positions(True), mean_positions(False)

def plot_possession_comparison(positions_in, positions_out):
    """
    Plot the in-possession and out-of-possession shapes next to each other.

    Both inputs need avg_x, avg_y and position columns; each player is drawn
    as a white marker labelled with the first three characters of the
    position on a 100 x 100 pitch.
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 8))

    panels = (
        ('In Possession', positions_in),
        ('Out of Possession', positions_out),
    )

    for ax, (title, data) in zip(axes, panels):
        # Pitch canvas
        ax.set_facecolor('#1a5f1a')
        ax.set_xlim(0, 100)
        ax.set_ylim(0, 100)

        # Marker and label per player
        for x, y, role in zip(data['avg_x'], data['avg_y'], data['position']):
            ax.scatter(x, y, s=400, c='white', edgecolors='black', zorder=5)
            ax.text(x, y, role[:3], ha='center', va='center', fontsize=7)

        ax.set_title(title, fontsize=12)
        ax.set_xlabel('Pitch Length')
        ax.set_ylabel('Pitch Width')

    plt.tight_layout()
    plt.show()

# Example usage
# Only runs when the tracking data already carries an 'in_possession'
# column; otherwise the comparison is skipped silently.
if 'in_possession' in home_tracking.columns:
    positions_in, positions_out = calculate_positions_by_possession(home_tracking)
    plot_possession_comparison(positions_in, positions_out)
# Compare in-possession vs out-of-possession shapes
# Label every tracking row with the team in possession at that frame.
# Assumes `events` carries a `frame` column on the same clock as
# `tracking$frame` -- TODO confirm against the data loader.
add_possession_state <- function(tracking, events) {
  # Each on-ball event opens a possession spell that lasts until the next
  # on-ball event (the final spell runs to the end of the data)
  possession_frames <- events %>%
    filter(
      type.name %in% c("Pass", "Carry", "Shot", "Dribble"),
      !is.na(frame)
    ) %>%
    arrange(frame) %>%
    transmute(
      frame_start = frame,
      possession_team = as.character(team.name)
    )

  # For each tracking frame: index of the latest spell starting at or
  # before it, or 0 when the frame precedes every event
  spell <- findInterval(tracking$frame, possession_frames$frame_start)

  # Shift by one so that index 0 maps to NA instead of being dropped
  tracking$possession_team <-
    c(NA_character_, possession_frames$possession_team)[spell + 1L]

  # NOTE(review): this compares tracking `team` with the events' `team.name`;
  # confirm both columns use the same team labels.
  tracking$in_possession <- tracking$team == tracking$possession_team

  tracking
}

# Mean shape metrics over all frames in one possession state (one row).
summarise_possession_shape <- function(tracking_with_possession,
                                       possession_flag, state_label) {
  tracking_with_possession %>%
    filter(in_possession == possession_flag) %>%
    group_by(frame) %>%
    group_modify(~{
      # group_modify() needs a data frame even when no metrics are available
      frame_metrics <- calculate_shape_metrics(.x)
      if (is.null(frame_metrics)) tibble() else frame_metrics
    }) %>%
    # Drop the per-frame grouping, otherwise summarise() would return one
    # row per frame instead of the overall mean
    ungroup() %>%
    summarise(across(-frame, ~mean(.x, na.rm = TRUE))) %>%
    mutate(state = state_label)
}

# Compare team shape in and out of possession.
# Returns a two-row tibble (state = "In Possession" / "Out of Possession")
# with the mean of every shape metric.
analyze_possession_shapes <- function(tracking, events) {
  tracking_with_possession <- add_possession_state(tracking, events)

  bind_rows(
    summarise_possession_shape(
      tracking_with_possession, TRUE, "In Possession"
    ),
    summarise_possession_shape(
      tracking_with_possession, FALSE, "Out of Possession"
    )
  )
}

possession_comparison <- analyze_possession_shapes(tracking_data, events)

# Visualize comparison
possession_comparison %>%
  pivot_longer(cols = c(length, width, surface_area, defensive_line_height),
               names_to = "metric", values_to = "value") %>%
  ggplot(aes(x = metric, y = value, fill = state)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Team Shape: In vs Out of Possession",
    x = "Metric", y = "Value", fill = "State"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Average player positions by possession state
# The labelled tracking table is local to analyze_possession_shapes(), so
# it is rebuilt here before use at top level
tracking_with_possession <- add_possession_state(tracking_data, events)

position_by_possession <- tracking_with_possession %>%
  group_by(player_name, position, in_possession) %>%
  summarise(
    avg_x = mean(x),
    avg_y = mean(y),
    .groups = "drop"
  )

# Plot side by side
library(patchwork)

p1 <- position_by_possession %>%
  filter(in_possession == TRUE) %>%
  visualize_formation("In Possession")

p2 <- position_by_possession %>%
  filter(in_possession == FALSE) %>%
  visualize_formation("Out of Possession")

p1 + p2

Detecting Formation Changes

Teams often change formation during matches in response to game state, substitutions, or tactical adjustments. Detecting these changes helps understand tactical decision-making.

formation_changes.py
def detect_formation_changes(tracking_data, window_minutes=10):
    """
    Detect formation changes throughout the match.

    The match is cut into ``window_minutes``-long windows (by the 'minute'
    column); each window with at least 10 outfield players is classified,
    and a change is reported whenever a window's formation differs from the
    previous classified window.

    Returns a dict with:
        'timeline' - DataFrame: time_window, formation, minute,
                     prev_formation, formation_change
        'changes'  - DataFrame: minute, from_formation, to_formation
    """
    tracking_data = tracking_data.copy()
    tracking_data['time_window'] = tracking_data['minute'] // window_minutes

    formations = []

    # groupby iterates windows in ascending order, so the shift-based
    # comparison below always looks at the chronologically previous window
    for window, window_data in tracking_data.groupby('time_window'):
        avg_positions = calculate_average_positions(window_data)

        if len(avg_positions) >= 10:
            formation_result = classify_formation(avg_positions)
            formations.append({
                'time_window': window,
                'formation': formation_result['formation'],
                'minute': window * window_minutes
            })

    # Explicit columns keep the frame usable when no window qualified
    formations_df = pd.DataFrame(
        formations, columns=['time_window', 'formation', 'minute']
    )

    # Detect changes; the first window has no predecessor and must not be
    # reported as a change (comparing against NaN is always "different")
    formations_df['prev_formation'] = formations_df['formation'].shift(1)
    formations_df['formation_change'] = (
        formations_df['prev_formation'].notna()
        & (formations_df['formation'] != formations_df['prev_formation'])
    )

    changes = formations_df[formations_df['formation_change']].copy()
    changes = changes.rename(columns={
        'prev_formation': 'from_formation',
        'formation': 'to_formation'
    })

    return {
        'timeline': formations_df,
        'changes': changes[['minute', 'from_formation', 'to_formation']]
    }

# Formation timeline for the home side using the default 10-minute windows
formation_changes = detect_formation_changes(home_tracking)

def plot_formation_timeline(formation_data, window_minutes=None):
    """
    Visualize formation changes over time.

    ``formation_data`` is a dict with 'timeline' (needs 'minute' and
    'formation' columns) and 'changes' (needs 'minute', 'from_formation'
    and 'to_formation'). ``window_minutes`` is the width of each bar; when
    omitted it is derived from the timeline's minute / time_window ratio,
    falling back to 10.
    """
    timeline = formation_data['timeline']
    changes = formation_data['changes']

    # The bar width was previously read from an undefined name; derive it
    # from the data unless the caller supplies it
    if window_minutes is None:
        window_minutes = 10
        if 'time_window' in timeline.columns:
            later_windows = timeline[timeline['time_window'] > 0]
            if len(later_windows):
                first = later_windows.iloc[0]
                window_minutes = first['minute'] / first['time_window']

    fig, ax = plt.subplots(figsize=(14, 3))

    # Create color map for formations
    unique_formations = timeline['formation'].unique()
    colors = plt.cm.Set3(np.linspace(0, 1, len(unique_formations)))
    color_map = dict(zip(unique_formations, colors))

    # Plot timeline
    for _, row in timeline.iterrows():
        ax.barh(0, window_minutes, left=row['minute'],
                color=color_map[row['formation']], height=0.5)

    # Mark changes
    for _, change in changes.iterrows():
        ax.axvline(x=change['minute'], color='red',
                   linestyle='--', linewidth=2)
        ax.text(change['minute'], 0.6,
                f"{change['from_formation']} -> {change['to_formation']}",
                rotation=45, fontsize=8)

    ax.set_xlim(0, 95)
    ax.set_ylim(-0.5, 1)
    ax.set_xlabel('Match Minute')
    ax.set_title('Formation Changes During Match')
    ax.set_yticks([])

    # Legend
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=color_map[f], label=f)
                       for f in unique_formations]
    ax.legend(handles=legend_elements, loc='upper right')

    plt.tight_layout()
    plt.show()

# Draw the timeline for the windows computed above
plot_formation_timeline(formation_changes)

# Print detected changes
print("Formation Changes Detected:")
print(formation_changes['changes'])
# Detect formation changes during match
detect_formation_changes <- function(tracking_data, window_minutes = 10) {
  # Detect formation changes between consecutive time windows.
  #
  # tracking_data: tracking rows with a `minute` column (plus the columns
  #   calculate_average_positions() needs).
  # window_minutes: window length in minutes.
  # Returns a list with `timeline` (one row per classified window:
  #   time_window, formation, line_counts, formation_change) and `changes`
  #   (the rows where the formation differs from the previous window, plus
  #   from_formation, to_formation and change_minute).

  # Windows with fewer players than this are skipped: clustering into up
  # to 4 lines and scoring it needs at least 5 points
  min_players <- 5

  # Calculate formation for rolling windows
  tracking_data <- tracking_data %>%
    mutate(time_window = floor(minute / window_minutes))

  formations_over_time <- tracking_data %>%
    group_by(time_window) %>%
    group_modify(~{
      avg_pos <- calculate_average_positions(.x)
      if (nrow(avg_pos) < min_players) {
        tibble(formation = character(), line_counts = list())
      } else {
        formation <- classify_formation(avg_pos)
        tibble(
          formation = formation$formation,
          line_counts = list(formation$line_counts)
        )
      }
    }) %>%
    ungroup() %>%
    arrange(time_window)

  # Detect changes. The previous formation is taken from the full timeline
  # BEFORE filtering; lag() on the filtered rows would return the formation
  # of the previous change instead of the previous window.
  with_previous <- formations_over_time %>%
    mutate(
      prev_formation = lag(formation),
      formation_change = !is.na(prev_formation) & formation != prev_formation
    )

  changes <- with_previous %>%
    filter(formation_change) %>%
    mutate(
      from_formation = prev_formation,
      to_formation = formation,
      change_minute = time_window * window_minutes
    ) %>%
    select(-prev_formation)

  list(
    timeline = select(with_previous, -prev_formation),
    changes = changes
  )
}

# Formation timeline using the default 10-minute windows
formation_changes <- detect_formation_changes(tracking_data)

# Visualize formation timeline
# NOTE: the factor 10 below converts time_window back to minutes and must
# match the window_minutes used in the call above.
ggplot(formation_changes$timeline,
       aes(x = time_window * 10, y = 1, fill = formation)) +
  geom_tile(height = 0.8) +
  geom_vline(data = formation_changes$changes,
             aes(xintercept = change_minute),
             linetype = "dashed", color = "red") +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Formation Changes During Match",
    x = "Match Minute", y = "", fill = "Formation"
  ) +
  theme_minimal() +
  theme(axis.text.y = element_blank())

# Analyze what triggered changes
analyze_change_triggers <- function(changes, events, substitutions) {
  # Annotate each formation change with possible triggers.
  #
  # changes: data frame with a `change_minute` column.
  # events: event data with type.name, shot.outcome.name and minute.
  # substitutions: data frame with a `minute` column.
  # Returns `changes` plus recent_sub, recent_goal (both: anything within
  #   +/- 5 minutes of the change) and score_at_change.
  # NOTE(review): get_score_at_minute() is not defined in this section;
  #   presumably provided elsewhere -- confirm.

  changes %>%
    rowwise() %>%
    mutate(
      # Check for recent substitution; na.rm keeps the flag TRUE/FALSE
      # even when some minutes are missing
      recent_sub = any(
        substitutions$minute >= change_minute - 5 &
          substitutions$minute <= change_minute + 5,
        na.rm = TRUE
      ),
      # Check for recent goal
      recent_goal = any(
        events$type.name == "Shot" &
          events$shot.outcome.name == "Goal" &
          events$minute >= change_minute - 5 &
          events$minute <= change_minute + 5,
        na.rm = TRUE
      ),
      # Check score at time of change
      score_at_change = get_score_at_minute(events, change_minute)
    ) %>%
    # Drop the rowwise grouping so later verbs operate on whole columns
    ungroup()
}

Formation Matchup Analysis

How formations interact against each other creates advantages and disadvantages. Analyzing historical matchup data reveals which formations perform best against specific opponents.

formation_matchups.py
def analyze_formation_matchups(match_data, min_matches=10):
    """
    Analyze how different formations perform against each other.

    ``match_data`` needs match_id, home_formation, away_formation, home_xg,
    away_xg, home_goals and away_goals columns; it is not modified.
    ``min_matches`` is the minimum number of matches a matchup needs to be
    kept (default 10).

    Returns one row per (home_formation, away_formation) with columns
    matches, avg_home_xg, avg_away_xg, avg_xg_diff, home_wins and
    home_win_rate (percent).
    """
    # assign() returns a new frame, so the caller's data stays untouched
    enriched = match_data.assign(
        home_xg_diff=match_data['home_xg'] - match_data['away_xg'],
        home_win=match_data['home_goals'] > match_data['away_goals'],
    )

    # Named aggregation ties every output column to its source explicitly
    # instead of relying on the positional order of the aggregated columns
    matchup_results = (
        enriched
        .groupby(['home_formation', 'away_formation'])
        .agg(
            matches=('match_id', 'count'),
            avg_home_xg=('home_xg', 'mean'),
            avg_away_xg=('away_xg', 'mean'),
            avg_xg_diff=('home_xg_diff', 'mean'),
            home_wins=('home_win', 'sum'),
        )
        .reset_index()
    )

    matchup_results['home_win_rate'] = (
        matchup_results['home_wins'] / matchup_results['matches'] * 100
    )

    # Filter for minimum sample size
    matchup_results = matchup_results[
        matchup_results['matches'] >= min_matches
    ]

    return matchup_results

# NOTE(review): `season_matches` is not defined in this section; presumably
# a per-match DataFrame loaded elsewhere -- confirm.
matchup_analysis = analyze_formation_matchups(season_matches)

def plot_matchup_heatmap(matchup_data):
    """
    Draw a home-vs-away formation heatmap.

    Cell colour encodes ``avg_xg_diff``; each cell with a known
    ``home_win_rate`` is annotated with that rate as a whole percentage.
    """
    def as_grid(value_column):
        # Home formations as rows, away formations as columns
        return matchup_data.pivot(
            index='home_formation',
            columns='away_formation',
            values=value_column
        )

    pivot_xg = as_grid('avg_xg_diff')
    pivot_winrate = as_grid('home_win_rate')

    fig, ax = plt.subplots(figsize=(10, 8))

    im = ax.imshow(pivot_xg.values, cmap='RdYlGn', aspect='auto')

    # Add text annotations (skipping matchups without a win rate)
    for (i, j), rate in np.ndenumerate(pivot_winrate.values):
        if np.isnan(rate):
            continue
        ax.text(j, i, f'{rate:.0f}%',
                ha='center', va='center', fontsize=9)

    n_rows, n_cols = len(pivot_xg.index), len(pivot_xg.columns)
    ax.set_xticks(range(n_cols))
    ax.set_yticks(range(n_rows))
    ax.set_xticklabels(pivot_xg.columns, rotation=45, ha='right')
    ax.set_yticklabels(pivot_xg.index)

    ax.set_xlabel('Away Formation')
    ax.set_ylabel('Home Formation')
    ax.set_title('Formation Matchup Performance\n(Home team xG diff, win rate %)')

    plt.colorbar(im, label='xG Difference')
    plt.tight_layout()
    plt.show()

# Heatmap of every matchup that passed the minimum-sample filter
plot_matchup_heatmap(matchup_analysis)

def best_formations_against(matchup_data, opponent_formation):
    """
    Return up to five home formations with the highest ``avg_xg_diff``
    against ``opponent_formation`` (matched on the away formation).

    The result keeps the home_formation, matches, avg_xg_diff and
    home_win_rate columns.
    """
    report_columns = ['home_formation', 'matches', 'avg_xg_diff', 'home_win_rate']
    facing = matchup_data[matchup_data['away_formation'] == opponent_formation]
    top_five = facing.nlargest(5, 'avg_xg_diff')
    return top_five[report_columns]

# Example: top home formations (by average xG difference) versus a 4-3-3
print("Best formations against 4-3-3:")
print(best_formations_against(matchup_analysis, '4-3-3'))
# Analyze formation matchup performance
analyze_formation_matchups <- function(match_data, min_matches = 10) {
  # Summarise results per (home_formation, away_formation) matchup.
  #
  # match_data: one row per match with home_formation, away_formation,
  #   home_xg, away_xg, home_goals and away_goals.
  # min_matches: minimum number of matches a matchup needs to be kept.
  # Returns one row per matchup with matches, home_wins, draws,
  #   home_losses, avg_home_xg, avg_away_xg, avg_xg_diff and home_win_rate
  #   (percent).

  match_data %>%
    mutate(
      home_xg_diff = home_xg - away_xg,
      home_result = case_when(
        home_goals > away_goals ~ "Win",
        home_goals < away_goals ~ "Loss",
        TRUE ~ "Draw"
      )
    ) %>%
    group_by(home_formation, away_formation) %>%
    summarise(
      matches = n(),
      home_wins = sum(home_result == "Win"),
      draws = sum(home_result == "Draw"),
      home_losses = sum(home_result == "Loss"),
      avg_home_xg = mean(home_xg),
      avg_away_xg = mean(away_xg),
      avg_xg_diff = mean(home_xg_diff),
      home_win_rate = home_wins / matches * 100,
      .groups = "drop"
    ) %>%
    filter(matches >= min_matches)  # Minimum sample size
}

# NOTE(review): `season_matches` is not defined in this section; presumably
# a per-match data frame loaded elsewhere -- confirm.
matchup_analysis <- analyze_formation_matchups(season_matches)

# Heatmap of matchup performance
# Tile colour is the average xG difference (diverging scale centred on 0);
# the text in each tile is the home win rate rounded to a whole percent.
ggplot(matchup_analysis, aes(x = away_formation, y = home_formation,
                             fill = avg_xg_diff)) +
  geom_tile() +
  geom_text(aes(label = paste0(round(home_win_rate, 0), "%")),
            color = "white", size = 3) +
  scale_fill_gradient2(low = "red", mid = "white", high = "green",
                       midpoint = 0, name = "xG Diff") +
  labs(
    title = "Formation Matchup Performance",
    subtitle = "Home team xG difference and win rate",
    x = "Away Formation", y = "Home Formation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Identify best formations against specific systems
best_against <- function(matchup_data, opponent_formation) {
  # Up to five matchups against `opponent_formation` (matched on the away
  # formation), ordered from the highest to the lowest average xG difference
  facing <- filter(matchup_data, away_formation == opponent_formation)
  ranked <- arrange(facing, desc(avg_xg_diff))
  slice_head(ranked, n = 5)
}

# Example: Best formations against 4-3-3
# The result is auto-printed because the call sits at top level
best_against(matchup_analysis, "4-3-3")

Practice Exercises

Exercise 24.1: Complete Formation Detection and Classification System

Task: Build a comprehensive formation detection system that identifies team formations from event data, validates classifications against known formations, and generates visual reports.

Requirements:

  • Calculate average touch positions for each player from event data
  • Implement multiple clustering methods (K-means, hierarchical) for formation detection
  • Create a formation validation system that compares detected vs expected formations
  • Generate confidence scores for formation classifications
  • Visualize formations with player positions and line assignments
  • Handle edge cases (players with few touches, unusual formations)

formation_detection_system
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.spatial import ConvexHull
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from statsbombpy import sb

# ============================================
# COMPLETE FORMATION DETECTION SYSTEM
# ============================================

# Define standard formations
# Keyed by formation label. 'lines' lists the player count per line and
# 'positions' names each of those lines in the same order; both lists have
# one entry per line and every 'lines' list sums to 10 outfield players.
STANDARD_FORMATIONS = {
    "4-4-2": {"lines": [4, 4, 2], "positions": ["DEF", "MID", "FWD"]},
    "4-3-3": {"lines": [4, 3, 3], "positions": ["DEF", "MID", "FWD"]},
    "3-5-2": {"lines": [3, 5, 2], "positions": ["DEF", "MID", "FWD"]},
    "4-2-3-1": {"lines": [4, 2, 3, 1], "positions": ["DEF", "DM", "AM", "FWD"]},
    "5-3-2": {"lines": [5, 3, 2], "positions": ["DEF", "MID", "FWD"]},
    "4-1-4-1": {"lines": [4, 1, 4, 1], "positions": ["DEF", "DM", "MID", "FWD"]},
    "3-4-3": {"lines": [3, 4, 3], "positions": ["DEF", "MID", "FWD"]}
}

def load_statsbomb_events(competition_id=43, season_id=3):
    """
    Fetch the events of the first listed match of a StatsBomb season.

    Returns a tuple (events, match_row) where ``match_row`` is the first row
    of the match list for the given competition and season.
    """
    season_matches = sb.matches(
        competition_id=competition_id, season_id=season_id
    )
    first_match = season_matches.iloc[0]
    match_events = sb.events(match_id=first_match['match_id'])
    return match_events, first_match

def calculate_touch_positions(events, team_name, min_touches=10):
    """
    Summarise where each outfield player of a team touched the ball.

    Args:
        events: event DataFrame with 'team', 'type', 'location', 'player_id',
            'player' and 'position' columns; 'location' holds [x, y] lists.
        team_name: only rows whose 'team' equals this value are used.
        min_touches: players with fewer located touches are dropped.

    Returns:
        DataFrame with one row per (player_id, player, position) group and
        columns player_id, player_name, position, avg_x, x_std, touches,
        avg_y, y_std. Rows whose position contains 'Goalkeeper' are removed.
    """
    on_ball_types = ['Pass', 'Carry', 'Shot', 'Dribble', 'Ball Receipt*', 'Clearance']

    is_team_touch = (
        (events['team'] == team_name)
        & events['type'].isin(on_ball_types)
        & events['location'].notna()
    )
    located = events[is_team_touch].copy()

    def _coordinate(axis):
        # Non-list locations become NaN so they are ignored by the aggregates.
        return lambda loc: loc[axis] if isinstance(loc, list) else np.nan

    located['x'] = located['location'].apply(_coordinate(0))
    located['y'] = located['location'].apply(_coordinate(1))

    # Named aggregation yields the flat column names directly.
    summary = (
        located
        .groupby(['player_id', 'player', 'position'])
        .agg(
            avg_x=('x', 'mean'),
            x_std=('x', 'std'),
            touches=('x', 'count'),
            avg_y=('y', 'mean'),
            y_std=('y', 'std'),
        )
        .reset_index()
        .rename(columns={'player': 'player_name'})
    )

    has_enough_touches = summary['touches'] >= min_touches
    is_goalkeeper = summary['position'].str.contains('Goalkeeper', na=False)
    return summary[has_enough_touches & ~is_goalkeeper]

def classify_formation_multi_method(avg_positions):
    """
    Classify a team's formation by clustering players into lines on avg_x.

    K-means and Ward hierarchical clustering are each run for k = 2..4 and
    the k with the highest silhouette score is chosen per method. Line
    assignment uses the K-means labels; the hierarchical result is only used
    to report whether both methods agree on the number of lines.

    Args:
        avg_positions: DataFrame with one row per outfield player and an
            'avg_x' column.

    Returns:
        dict with keys:
            detected_formation: line sizes joined by '-', e.g. '4-3-3'.
            matched_formation: closest key of STANDARD_FORMATIONS.
            confidence: 1 - (sum of absolute line-size differences) /
                (2 * number of players), rounded to 3 decimals; 0 when no
                standard formation has the same number of lines.
            line_counts: players per line, ordered by increasing mean avg_x.
            player_positions: copy of the input with a 'line' column (1..k).
            silhouette_kmeans, optimal_clusters, method_agreement.

        With fewer than 10 players the same keys are returned with
        placeholder values ('Unknown', 0, [], NaN, line == 0 meaning
        "unassigned") plus the legacy 'formation' and 'message' keys, so
        callers that read the regular keys do not raise KeyError.
    """
    if len(avg_positions) < 10:
        return {
            # Legacy keys kept for backward compatibility.
            'formation': 'Unknown',
            'confidence': 0,
            'message': 'Insufficient players detected',
            # Regular result schema with placeholder values.
            'detected_formation': 'Unknown',
            'matched_formation': 'Unknown',
            'line_counts': [],
            'player_positions': avg_positions.assign(line=0),
            'silhouette_kmeans': float('nan'),
            'optimal_clusters': 0,
            'method_agreement': False,
        }

    # Cluster on the x coordinate only (one feature column).
    x_positions = avg_positions['avg_x'].values.reshape(-1, 1)
    candidate_ks = range(2, 5)

    # Method 1: K-means clustering
    kmeans_results = {}
    for k in candidate_ks:
        km = KMeans(n_clusters=k, random_state=42, n_init=25)
        labels = km.fit_predict(x_positions)
        kmeans_results[k] = {
            'labels': labels,
            'silhouette': silhouette_score(x_positions, labels),
        }

    best_k_km = max(kmeans_results, key=lambda k: kmeans_results[k]['silhouette'])
    km_result = kmeans_results[best_k_km]

    # Method 2: Hierarchical clustering (only its preferred k is needed)
    hc_silhouettes = {}
    for k in candidate_ks:
        hc = AgglomerativeClustering(n_clusters=k, linkage='ward')
        hc_silhouettes[k] = silhouette_score(x_positions, hc.fit_predict(x_positions))

    best_k_hc = max(hc_silhouettes, key=hc_silhouettes.get)

    # Assign lines using K-means result
    avg_positions = avg_positions.copy()
    avg_positions['line'] = km_result['labels']

    # K-means labels are arbitrary; renumber lines 1..k by mean x position.
    line_means = avg_positions.groupby('line')['avg_x'].mean().sort_values()
    line_mapping = {old: new for new, old in enumerate(line_means.index, 1)}
    avg_positions['line'] = avg_positions['line'].map(line_mapping)

    # Count players per line
    line_counts = avg_positions.groupby('line').size().sort_index().tolist()
    formation_string = '-'.join(map(str, line_counts))

    # Match to standard formations
    confidence_scores = {}
    for formation_name, formation_data in STANDARD_FORMATIONS.items():
        expected = formation_data['lines']
        if len(expected) != len(line_counts):
            confidence_scores[formation_name] = 0
        else:
            diff = sum(abs(e - d) for e, d in zip(expected, line_counts))
            confidence_scores[formation_name] = 1 - (diff / (2 * sum(line_counts)))

    best_match = max(confidence_scores, key=confidence_scores.get)
    confidence = max(confidence_scores.values())

    # Exact match: diff is 0 so the score is already 1.0; the floor is kept
    # as a safeguard and the match name is pinned to the detected string.
    if formation_string in STANDARD_FORMATIONS:
        confidence = max(confidence, 0.95)
        best_match = formation_string

    return {
        'detected_formation': formation_string,
        'matched_formation': best_match,
        'confidence': round(confidence, 3),
        'line_counts': line_counts,
        'player_positions': avg_positions,
        'silhouette_kmeans': km_result['silhouette'],
        'optimal_clusters': best_k_km,
        'method_agreement': best_k_km == best_k_hc
    }

def visualize_formation_detection(formation_result, team_name=""):
    """
    Plot the detected formation on a 120 x 80 pitch.

    Every player is drawn at (avg_x, avg_y), coloured by line number and
    labelled with the first three letters of the position name. A dashed
    ellipse is drawn around each line that has at least two players.

    Args:
        formation_result: dict with 'player_positions' (DataFrame holding
            avg_x, avg_y, line and position columns), 'detected_formation',
            'matched_formation', 'confidence' and 'line_counts'.
        team_name: text appended to the figure title.

    Returns:
        None. The figure is displayed with plt.show().
    """
    fig, ax = plt.subplots(figsize=(14, 9))

    # Draw pitch
    ax.set_xlim(0, 120)
    ax.set_ylim(0, 80)
    ax.set_facecolor('#228B22')

    # Pitch markings
    # NOTE(review): the box dimensions (18 x 44) look like yards while the
    # circle radius (9.15) and the axis labels say meters - confirm units.
    ax.axvline(x=60, color='white', linewidth=1)
    ax.add_patch(patches.Circle((60, 40), 9.15, fill=False, color='white', linewidth=1))

    # Penalty areas
    ax.add_patch(patches.Rectangle((0, 18), 18, 44, fill=False, color='white', linewidth=1))
    ax.add_patch(patches.Rectangle((102, 18), 18, 44, fill=False, color='white', linewidth=1))

    # Goals
    ax.add_patch(patches.Rectangle((-2, 30), 2, 20, fill=True, facecolor='white'))
    ax.add_patch(patches.Rectangle((120, 30), 2, 20, fill=True, facecolor='white'))

    # Player data
    player_data = formation_result['player_positions']
    line_colors = {1: '#1f77b4', 2: '#ff7f0e', 3: '#2ca02c', 4: '#d62728'}

    # Plot players
    for _, player in player_data.iterrows():
        color = line_colors.get(player['line'], 'gray')
        ax.scatter(player['avg_x'], player['avg_y'], s=500, c=color,
                   edgecolors='white', linewidths=2, zorder=5)

        # Player position abbreviation
        pos_abbr = player['position'][:3].upper() if pd.notna(player['position']) else '?'
        ax.text(player['avg_x'], player['avg_y'], pos_abbr,
                ha='center', va='center', fontsize=8, fontweight='bold', color='white')

    # Draw line groupings (ellipses)
    for line_num in player_data['line'].unique():
        line_players = player_data[player_data['line'] == line_num]
        if len(line_players) < 2:
            continue

        center_x = line_players['avg_x'].mean()
        center_y = line_players['avg_y'].mean()
        # Ellipse(xy, width, height): width is the extent along the x axis
        # and height the extent along the y axis. A line of players shares a
        # similar x and is spread across y, so the x spread feeds the width
        # and the y spread feeds the height (the original had them swapped).
        x_extent = max((line_players['avg_x'].max() - line_players['avg_x'].min()) + 5, 5)
        y_extent = (line_players['avg_y'].max() - line_players['avg_y'].min()) + 10

        ellipse = patches.Ellipse((center_x, center_y), x_extent, y_extent,
                                  fill=False, color=line_colors.get(line_num, 'gray'),
                                  linestyle='--', linewidth=2)
        ax.add_patch(ellipse)

    # Title and labels
    ax.set_title(
        f"Detected Formation: {formation_result['detected_formation']} "
        f"(Matched: {formation_result['matched_formation']})\n"
        f"Confidence: {formation_result['confidence']*100:.1f}% | {team_name}",
        fontsize=14, fontweight='bold'
    )
    ax.set_xlabel('Pitch Length (meters)')
    ax.set_ylabel('Pitch Width (meters)')

    # Legend: empty scatter artists act as colour swatches, one per line.
    legend_elements = [ax.scatter([], [], c=color, s=100, label=f'Line {num}')
                       for num, color in line_colors.items() if num <= len(formation_result['line_counts'])]
    ax.legend(handles=legend_elements, loc='upper right')

    plt.tight_layout()
    plt.show()

def generate_formation_report(events, team_name):
    """
    Print a formation report for one team and return the classification.

    Runs calculate_touch_positions and classify_formation_multi_method on
    the events, then prints the detected formation, per-line player counts
    and names, and the clustering quality metrics.

    Args:
        events: event DataFrame accepted by calculate_touch_positions.
        team_name: team to analyse.

    Returns:
        The dict returned by classify_formation_multi_method. When no
        formation could be determined, a short notice is printed instead of
        the detailed sections and the result is returned unchanged.
    """
    print("=" * 55)
    print(f"FORMATION ANALYSIS REPORT: {team_name}")
    print("=" * 55)

    # Calculate positions
    positions = calculate_touch_positions(events, team_name)

    # Detect formation
    result = classify_formation_multi_method(positions)

    # The classifier signals "too few players" with an 'Unknown' result that
    # may lack the detailed keys; stop before indexing them.
    if result.get('detected_formation', 'Unknown') == 'Unknown':
        reason = result.get('message', 'no classification available')
        print(f"\nFormation could not be determined: {reason}")
        return result

    print(f"\nDetected Formation: {result['detected_formation']}")
    print(f"Best Match: {result['matched_formation']}")
    print(f"Confidence Score: {result['confidence']*100:.1f}%")

    print("\nLine Structure:")
    line_names = ["Defensive", "Midfield 1", "Midfield 2", "Attacking"]
    for i, count in enumerate(result['line_counts']):
        print(f"  Line {i+1} ({line_names[i]}): {count} players")

    print("\nPlayer Assignments by Line:")
    player_positions = result['player_positions']
    for line_num in sorted(player_positions['line'].unique()):
        line_players = player_positions.loc[
            player_positions['line'] == line_num, 'player_name'
        ].tolist()
        print(f"  Line {line_num}: {', '.join(line_players)}")

    print("\nQuality Metrics:")
    print(f"  Silhouette Score: {result['silhouette_kmeans']:.3f}")
    print(f"  Optimal Clusters: {result['optimal_clusters']}")
    print(f"  Method Agreement: {'Yes' if result['method_agreement'] else 'No'}")

    return result

# Example usage
# Loads the default match, analyses the first team name that appears in the
# event log, prints the report, and then plots the detected formation.
events, match_info = load_statsbomb_events()
team_name = events['team'].unique()[0]
formation_report = generate_formation_report(events, team_name)
visualize_formation_detection(formation_report, team_name)
library(tidyverse)
library(cluster)
library(factoextra)
library(StatsBombR)

# ============================================
# COMPLETE FORMATION DETECTION SYSTEM
# ============================================

# Load StatsBomb data
# Keeps only competition_id 43 from the free competitions, fetches its match
# list, and downloads events for the first five matches.
# NOTE(review): confirm what the Atea argument does in the installed
# StatsBombR version.
comps <- FreeCompetitions()
matches <- FreeMatches(Competitions = comps %>% filter(competition_id == 43))
events <- free_allevents(MatchesDF = matches[1:5,],Atea = TRUE)

# Clean event data
# The functions below read location.x / location.y, team.name, type.name and
# position.name from this cleaned frame (presumably added or normalised by
# allclean() - verify against the package documentation).
events_clean <- allclean(events)

# Catalogue of reference formations: players per line (first entry is the
# "DEF" line) plus a role label for each line. The shared three-line labels
# are defined once inside local() so no extra global name is created.
standard_formations <- local({
  three_lines <- c("DEF", "MID", "FWD")

  list(
    "4-4-2" = list(lines = c(4, 4, 2), positions = three_lines),
    "4-3-3" = list(lines = c(4, 3, 3), positions = three_lines),
    "3-5-2" = list(lines = c(3, 5, 2), positions = three_lines),
    "4-2-3-1" = list(
      lines = c(4, 2, 3, 1),
      positions = c("DEF", "DM", "AM", "FWD")
    ),
    "5-3-2" = list(lines = c(5, 3, 2), positions = three_lines),
    "4-1-4-1" = list(
      lines = c(4, 1, 4, 1),
      positions = c("DEF", "DM", "MID", "FWD")
    ),
    "3-4-3" = list(lines = c(3, 4, 3), positions = three_lines)
  )
})

# Average touch location per outfield player.
#
# events:      event data frame with team.name, type.name, location.x,
#              location.y, player.id, player.name and position.name columns.
# team_name:   only events whose team.name equals this value are used.
# min_touches: players with fewer located touches are dropped (default 10).
#
# Returns one row per player.id / player.name / position.name group with
# avg_x, avg_y, touches, x_sd and y_sd. Rows whose position.name contains
# "Goalkeeper" are removed.
calculate_touch_positions <- function(events, team_name, min_touches = 10) {
  touch_types <- c(
    "Pass", "Carry", "Shot", "Dribble", "Ball Receipt*", "Clearance"
  )

  located_touches <- events %>%
    filter(
      team.name == team_name,
      type.name %in% touch_types,
      !is.na(location.x),
      !is.na(location.y)
    )

  per_player <- located_touches %>%
    group_by(player.id, player.name, position.name) %>%
    summarise(
      avg_x = mean(location.x, na.rm = TRUE),
      avg_y = mean(location.y, na.rm = TRUE),
      touches = n(),
      x_sd = sd(location.x, na.rm = TRUE),
      y_sd = sd(location.y, na.rm = TRUE),
      .groups = "drop"
    )

  per_player %>%
    filter(
      touches >= min_touches,
      !str_detect(position.name, "Goalkeeper")
    )
}

# Multi-method formation classification.
#
# Clusters players on avg_x with k-means and Ward hierarchical clustering for
# k = 2..4, picking the k with the highest mean silhouette width per method.
# Lines are assigned from the k-means result; the hierarchical result is only
# used to report whether both methods prefer the same k.
#
# avg_positions: data frame with one row per outfield player and an avg_x
#                column.
#
# Returns a list with detected_formation, matched_formation, confidence,
# line_counts, player_positions (input plus a `line` column numbered from
# lowest to highest mean avg_x), silhouette_kmeans, optimal_clusters and
# method_agreement. With fewer than 10 players the same elements are returned
# with placeholder values ("Unknown", 0, NA, empty counts) together with the
# legacy `formation` and `message` elements, so callers that read the regular
# elements keep working.
#
# Note: kmeans() uses random starts, so results can vary between runs unless
# the caller sets a seed.
classify_formation_multi_method <- function(avg_positions) {

  if (nrow(avg_positions) < 10) {
    unassigned <- avg_positions
    unassigned$line <- rep(NA_integer_, nrow(unassigned))
    return(list(
      # Legacy elements kept for backward compatibility
      formation = "Unknown",
      confidence = 0,
      message = "Insufficient players detected",
      # Regular result schema with placeholder values
      detected_formation = "Unknown",
      matched_formation = "Unknown",
      line_counts = integer(0),
      player_positions = unassigned,
      silhouette_kmeans = NA_real_,
      optimal_clusters = NA_integer_,
      method_agreement = FALSE
    ))
  }

  # Prepare position matrix (single column: avg_x)
  pos_matrix <- avg_positions %>%
    select(avg_x) %>%
    as.matrix()

  # The distance matrix is identical for every k, so compute it once
  pos_dist <- dist(pos_matrix)

  # Method 1: K-means clustering
  kmeans_results <- list()
  for (k in 2:4) {
    km <- kmeans(pos_matrix, centers = k, nstart = 25)
    sil <- silhouette(km$cluster, pos_dist)
    kmeans_results[[as.character(k)]] <- list(
      clusters = km$cluster,
      silhouette = mean(sil[, 3]),
      centers = sort(km$centers[, 1])
    )
  }

  km_silhouettes <- vapply(kmeans_results, function(x) x$silhouette, numeric(1))
  best_k_kmeans <- names(which.max(km_silhouettes))
  km_result <- kmeans_results[[best_k_kmeans]]

  # Method 2: Hierarchical clustering
  hc <- hclust(pos_dist, method = "ward.D2")
  hc_results <- list()
  for (k in 2:4) {
    hc_cluster <- cutree(hc, k = k)
    sil <- silhouette(hc_cluster, pos_dist)
    hc_results[[as.character(k)]] <- list(
      clusters = hc_cluster,
      silhouette = mean(sil[, 3])
    )
  }

  hc_silhouettes <- vapply(hc_results, function(x) x$silhouette, numeric(1))
  best_k_hc <- names(which.max(hc_silhouettes))

  # Use K-means result for line assignment
  avg_positions$line <- km_result$clusters

  # k-means labels are arbitrary; renumber lines by mean x position
  line_order <- avg_positions %>%
    group_by(line) %>%
    summarise(mean_x = mean(avg_x)) %>%
    arrange(mean_x) %>%
    mutate(new_line = row_number())

  avg_positions <- avg_positions %>%
    left_join(line_order %>% select(line, new_line), by = "line") %>%
    mutate(line = new_line) %>%
    select(-new_line)

  # Count players per line
  line_counts <- avg_positions %>%
    group_by(line) %>%
    summarise(n = n(), .groups = "drop") %>%
    arrange(line) %>%
    pull(n)

  formation_string <- paste(line_counts, collapse = "-")

  # Match to standard formations: 1 minus the share of misplaced players,
  # or 0 when the number of lines differs
  confidence_scores <- vapply(names(standard_formations), function(f) {
    expected <- standard_formations[[f]]$lines
    if (length(expected) != length(line_counts)) {
      return(0)
    }
    1 - (sum(abs(expected - line_counts)) / (2 * sum(line_counts)))
  }, numeric(1))

  best_match <- names(which.max(confidence_scores))
  confidence <- max(confidence_scores)

  # If exact match found
  if (formation_string %in% names(standard_formations)) {
    confidence <- max(confidence, 0.95)
    best_match <- formation_string
  }

  list(
    detected_formation = formation_string,
    matched_formation = best_match,
    confidence = round(confidence, 3),
    line_counts = line_counts,
    player_positions = avg_positions,
    silhouette_kmeans = km_result$silhouette,
    optimal_clusters = as.integer(best_k_kmeans),
    method_agreement = best_k_kmeans == best_k_hc
  )
}

# Visualize detected formation.
#
# formation_result: list with player_positions (data frame holding avg_x,
#                   avg_y, line and position.name), detected_formation,
#                   matched_formation and confidence.
# title_suffix:     text appended to the subtitle.
#
# Returns a ggplot object showing the players on a 120 x 80 pitch, coloured
# by line, with a dashed ellipse per line.
visualize_formation_detection <- function(formation_result, title_suffix = "") {

  player_data <- formation_result$player_positions

  # Create pitch
  # NOTE(review): `size` on line-type layers is deprecated in favour of
  # `linewidth` in recent ggplot2 releases - confirm the installed version
  # before switching.
  pitch <- ggplot() +
    # Pitch background
    annotate("rect", xmin = 0, xmax = 120, ymin = 0, ymax = 80,
             fill = "#228B22", color = "white", size = 1) +
    # Center line
    annotate("segment", x = 60, xend = 60, y = 0, yend = 80,
             color = "white", size = 0.5) +
    # Center circle
    annotate("path",
             x = 60 + 9.15 * cos(seq(0, 2*pi, length.out = 100)),
             y = 40 + 9.15 * sin(seq(0, 2*pi, length.out = 100)),
             color = "white", size = 0.5) +
    # Penalty areas
    annotate("rect", xmin = 0, xmax = 18, ymin = 18, ymax = 62,
             fill = NA, color = "white", size = 0.5) +
    annotate("rect", xmin = 102, xmax = 120, ymin = 18, ymax = 62,
             fill = NA, color = "white", size = 0.5)

  # Color by line
  line_colors <- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728")

  pitch +
    # Player positions
    geom_point(data = player_data,
               aes(x = avg_x, y = avg_y, color = factor(line)),
               size = 10) +
    # Label with the first three letters of the position name, upper-cased.
    # The previous pattern "^[A-Z]{2,3}" needs two leading capitals, so it
    # returns NA for title-case names such as "Right Back".
    geom_text(data = player_data,
              aes(x = avg_x, y = avg_y,
                  label = str_to_upper(str_sub(position.name, 1, 3))),
              color = "white", size = 3, fontface = "bold") +
    # Ellipses for each line
    stat_ellipse(data = player_data,
                 aes(x = avg_x, y = avg_y, color = factor(line)),
                 level = 0.8, linetype = "dashed") +
    scale_color_manual(values = line_colors, name = "Line") +
    labs(
      title = paste0("Detected Formation: ", formation_result$detected_formation,
                     " (Matched: ", formation_result$matched_formation, ")"),
      subtitle = paste0("Confidence: ", round(formation_result$confidence * 100, 1),
                        "% | ", title_suffix),
      x = "Pitch Length (meters)", y = "Pitch Width (meters)"
    ) +
    coord_fixed(ratio = 1) +
    theme_minimal() +
    theme(
      panel.grid = element_blank(),
      plot.title = element_text(hjust = 0.5, face = "bold"),
      plot.subtitle = element_text(hjust = 0.5)
    )
}

# Generate formation report.
#
# Computes average touch positions for `team_name`, classifies the formation
# and prints the detected formation, line structure, player assignments and
# clustering quality metrics.
#
# events:    event data frame accepted by calculate_touch_positions().
# team_name: team to analyse.
#
# Returns the list produced by classify_formation_multi_method(). When no
# formation could be determined, a short notice is printed instead of the
# detailed sections and the result is returned unchanged.
generate_formation_report <- function(events, team_name) {

  cat("=" , rep("=", 50), "\n", sep = "")
  cat("FORMATION ANALYSIS REPORT:", team_name, "\n")
  cat("=" , rep("=", 50), "\n\n", sep = "")

  # Calculate positions
  positions <- calculate_touch_positions(events, team_name)

  # Detect formation
  result <- classify_formation_multi_method(positions)

  # The classifier signals "too few players" with an "Unknown" result that may
  # lack player_positions; stop before the sections that need it.
  if (is.null(result$detected_formation) ||
      identical(result$detected_formation, "Unknown")) {
    reason <- if (is.null(result$message)) {
      "no classification available"
    } else {
      result$message
    }
    cat("Formation could not be determined:", reason, "\n")
    return(result)
  }

  cat("Detected Formation:", result$detected_formation, "\n")
  cat("Best Match:", result$matched_formation, "\n")
  cat("Confidence Score:", round(result$confidence * 100, 1), "%\n\n")

  cat("Line Structure:\n")
  line_names <- c("Defensive", "Midfield 1", "Midfield 2", "Attacking")
  for (i in seq_along(result$line_counts)) {
    cat("  Line", i, "(", line_names[i], "):", result$line_counts[i], "players\n")
  }

  cat("\nPlayer Assignments:\n")
  result$player_positions %>%
    arrange(line, avg_y) %>%
    group_by(line) %>%
    summarise(
      players = paste(player.name, collapse = ", "),
      .groups = "drop"
    ) %>%
    print()

  cat("\nQuality Metrics:\n")
  cat("  Silhouette Score:", round(result$silhouette_kmeans, 3), "\n")
  cat("  Optimal Clusters:", result$optimal_clusters, "\n")
  # Scalar condition: plain if/else instead of vectorised ifelse()
  agreement_label <- if (isTRUE(result$method_agreement)) "Yes" else "No"
  cat("  Method Agreement:", agreement_label, "\n")

  result
}

# Example usage
# Analyses the first team name found in the cleaned events, prints the
# report, then builds the formation plot (shown when run at top level).
team_name <- unique(events_clean$team.name)[1]
formation_report <- generate_formation_report(events_clean, team_name)
visualize_formation_detection(formation_report, team_name)
Exercise 24.2: In-Possession vs Out-of-Possession Shape Analysis

Task: Build a comprehensive shape comparison system that analyzes how a team's formation transforms between attacking and defending phases, including shape metrics, player movement patterns, and tactical visualizations.

Requirements:

  • Determine possession state for each event/frame
  • Calculate shape metrics (length, width, surface area, centroid) for both phases
  • Measure individual player position changes between phases
  • Identify which players move most between attacking/defending
  • Create side-by-side formation visualizations
  • Generate statistical comparison report

shape_comparison_analysis
import pandas as pd
import numpy as np
from scipy.spatial import ConvexHull
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from statsbombpy import sb

# ============================================
# IN vs OUT OF POSSESSION SHAPE ANALYSIS
# ============================================

def load_match_events(competition_id=43, season_id=3):
    """Return the events of the first match listed for a competition season."""
    season_matches = sb.matches(competition_id=competition_id, season_id=season_id)
    first_match = season_matches.iloc[0]
    return sb.events(match_id=first_match['match_id'])

def assign_possession_phase(events, team_name):
    """
    Tag each event with the possession phase it represents for `team_name`.

    Rules (the two event-type lists are disjoint, so at most one applies):
      * team event of an on-ball type      -> 'in_possession'
      * team event of a defensive type     -> 'out_of_possession'
      * opponent event of an on-ball type  -> 'out_of_possession'
    Every other event gets no phase and is dropped.

    The phases are computed with vectorised masks rather than a row-wise
    ``DataFrame.apply``, which is slow on full event logs; this form also
    works on an empty frame.

    Args:
        events: DataFrame with 'team' and 'type' columns. Not modified.
        team_name: team whose point of view defines the phases.

    Returns:
        Copy of the rows that received a phase, with two extra columns:
        'is_team_event' (bool) and 'possession_phase' (str).
    """
    possession_events = ['Pass', 'Carry', 'Shot', 'Dribble', 'Ball Receipt*',
                         'Foul Won', 'Dispossessed']
    defensive_events = ['Pressure', 'Tackle', 'Interception', 'Block',
                        'Clearance', 'Ball Recovery']

    events = events.copy()
    events['is_team_event'] = events['team'] == team_name

    # Plain numpy masks keep everything positional, so duplicate or
    # non-default index labels cannot cause alignment surprises.
    own = events['is_team_event'].to_numpy(dtype=bool)
    on_ball = events['type'].isin(possession_events).to_numpy()
    off_ball = events['type'].isin(defensive_events).to_numpy()

    phase = np.full(len(events), None, dtype=object)
    phase[own & on_ball] = 'in_possession'
    phase[(own & off_ball) | (~own & on_ball)] = 'out_of_possession'

    events['possession_phase'] = phase
    return events[events['possession_phase'].notna()]

def calculate_shape_metrics(positions_df):
    """
    Compute team-shape metrics from average player positions.

    Args:
        positions_df: DataFrame with 'avg_x' and 'avg_y' columns, one row
            per player.

    Returns:
        None when fewer than 3 rows are supplied, otherwise a dict with:
            centroid_x, centroid_y: mean of avg_x / avg_y.
            length, width: range of avg_x / avg_y.
            surface_area: area of the convex hull of the points; 0 when the
                hull cannot be built (e.g. all points on one line).
            compactness_ratio: length / width, or 0 when width is 0.
            defensive_line: mean avg_x of the 4 rows with the smallest avg_x.
            attacking_line: mean avg_x of the 3 rows with the largest avg_x.
            team_stretch: attacking_line - defensive_line.
    """
    if len(positions_df) < 3:
        return None

    xs = positions_df['avg_x']
    ys = positions_df['avg_y']

    metrics = {
        # Centroid
        'centroid_x': xs.mean(),
        'centroid_y': ys.mean(),
        # Length and width
        'length': xs.max() - xs.min(),
        'width': ys.max() - ys.min(),
    }

    # Surface area via convex hull. For 2-D input ConvexHull.volume is the
    # enclosed area. Degenerate input makes Qhull raise; keep the best-effort
    # fallback but no longer swallow KeyboardInterrupt / SystemExit.
    try:
        hull = ConvexHull(positions_df[['avg_x', 'avg_y']].values)
        metrics['surface_area'] = hull.volume
    except Exception:
        metrics['surface_area'] = 0

    # Compactness ratio
    metrics['compactness_ratio'] = (
        metrics['length'] / metrics['width'] if metrics['width'] > 0 else 0
    )

    # Defensive line (4 deepest players)
    metrics['defensive_line'] = positions_df.nsmallest(4, 'avg_x')['avg_x'].mean()

    # Attacking line (3 highest players)
    metrics['attacking_line'] = positions_df.nlargest(3, 'avg_x')['avg_x'].mean()

    # Team stretch
    metrics['team_stretch'] = metrics['attacking_line'] - metrics['defensive_line']

    return metrics

def calculate_positions_by_phase(events, team_name):
    """
    Average outfield player locations for a team, split by possession phase.

    Args:
        events: event DataFrame with 'team', 'type', 'location', 'player_id',
            'player' and 'position' columns.
        team_name: team to analyse; only its own events are averaged.

    Returns:
        dict with keys 'in_possession' and 'out_of_possession'; each value
        is a DataFrame with player_id, player_name, position, avg_x, avg_y.
        Rows whose position contains 'Goalkeeper' are excluded.
    """
    phased = assign_possession_phase(events, team_name)
    team_rows = phased[phased['team'] == team_name]

    def _x_of(loc):
        return loc[0] if isinstance(loc, list) else np.nan

    def _y_of(loc):
        return loc[1] if isinstance(loc, list) else np.nan

    def _mean_positions(phase_label):
        subset = team_rows[team_rows['possession_phase'] == phase_label].copy()
        subset['x'] = subset['location'].apply(_x_of)
        subset['y'] = subset['location'].apply(_y_of)

        has_location = subset['x'].notna()
        is_keeper = subset['position'].str.contains('Goalkeeper', na=False)
        grouped = subset[has_location & ~is_keeper].groupby(
            ['player_id', 'player', 'position']
        )
        return (
            grouped
            .agg({'x': 'mean', 'y': 'mean'})
            .reset_index()
            .rename(columns={'x': 'avg_x', 'y': 'avg_y', 'player': 'player_name'})
        )

    return {
        phase_label: _mean_positions(phase_label)
        for phase_label in ('in_possession', 'out_of_possession')
    }

def calculate_player_movement(positions_in, positions_out):
    """
    Compare each player's average position in and out of possession.

    Args:
        positions_in: DataFrame with 'player_id', 'avg_x', 'avg_y' (plus any
            descriptive columns) for the in-possession phase.
        positions_out: same layout for the out-of-possession phase; only
            'player_id', 'avg_x' and 'avg_y' are used.

    Returns:
        Inner join on 'player_id' with *_in / *_out coordinates plus
        x_movement and y_movement (in minus out), total_movement (Euclidean
        distance) and movement_type, sorted by total_movement descending.
    """
    out_coords = positions_out[['player_id', 'avg_x', 'avg_y']]
    paired = positions_in.merge(out_coords, on='player_id', suffixes=('_in', '_out'))

    dx = paired['avg_x_in'] - paired['avg_x_out']
    dy = paired['avg_y_in'] - paired['avg_y_out']

    paired['x_movement'] = dx
    paired['y_movement'] = dy
    paired['total_movement'] = np.sqrt(dx**2 + dy**2)

    # More than 5 units along x in either direction counts as a shift.
    paired['movement_type'] = dx.apply(
        lambda shift: "Pushes forward in possession" if shift > 5
        else ("Drops back in possession" if shift < -5 else "Maintains position")
    )

    return paired.sort_values('total_movement', ascending=False)

def visualize_shape_comparison(positions_in, positions_out,
                                metrics_in, metrics_out, team_name):
    """
    Draw in-possession and out-of-possession shapes side by side.

    Each panel shows the pitch, the convex hull of the player positions (when
    it can be built), the centroid, the players labelled with the first three
    letters of their position, and a box with length, width and area.

    Args:
        positions_in, positions_out: DataFrames with 'avg_x', 'avg_y' and
            'position' columns for the two phases.
        metrics_in, metrics_out: dicts with 'centroid_x', 'centroid_y',
            'length', 'width' and 'surface_area', or None when no metrics
            are available (the centroid and metrics box are then skipped).
        team_name: used in the figure title.

    Returns:
        None. The figure is displayed with plt.show().
    """
    fig, axes = plt.subplots(1, 2, figsize=(18, 9))

    def draw_pitch(ax, positions, metrics, title):
        # Pitch background
        ax.set_xlim(0, 120)
        ax.set_ylim(0, 80)
        ax.set_facecolor('#228B22')

        # Pitch markings
        ax.axvline(x=60, color='white', linewidth=1)
        ax.add_patch(patches.Rectangle((0, 18), 18, 44,
                                        fill=False, color='white', linewidth=1))
        ax.add_patch(patches.Rectangle((102, 18), 18, 44,
                                        fill=False, color='white', linewidth=1))

        # Convex hull: best effort, skipped when the points are degenerate.
        # Only ordinary exceptions are swallowed so Ctrl-C still works.
        if len(positions) >= 3:
            try:
                points = positions[['avg_x', 'avg_y']].values
                hull = ConvexHull(points)
                # Repeat the first vertex to close the outline.
                hull_points = np.vstack([points[hull.vertices], points[hull.vertices[0]]])
                ax.fill(hull_points[:, 0], hull_points[:, 1],
                        alpha=0.2, color='white', linestyle='--')
                ax.plot(hull_points[:, 0], hull_points[:, 1],
                        'w--', linewidth=2)
            except Exception:
                pass

        # Centroid (only when metrics are available)
        if metrics is not None:
            ax.scatter(metrics['centroid_x'], metrics['centroid_y'],
                       marker='x', s=200, c='yellow', linewidths=3, zorder=10)

        # Players
        for _, player in positions.iterrows():
            ax.scatter(player['avg_x'], player['avg_y'],
                       s=400, c='white', edgecolors='black', linewidths=2, zorder=5)
            pos_abbr = player['position'][:3].upper() if pd.notna(player['position']) else '?'
            ax.text(player['avg_x'], player['avg_y'], pos_abbr,
                    ha='center', va='center', fontsize=8, fontweight='bold')

        # Metrics box (only when metrics are available)
        if metrics is not None:
            metrics_text = (f"Length: {metrics['length']:.1f}m\n"
                            f"Width: {metrics['width']:.1f}m\n"
                            f"Area: {metrics['surface_area']:.0f}m²")
            ax.text(100, 72, metrics_text, fontsize=10,
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Pitch Length (m)')
        ax.set_ylabel('Pitch Width (m)')

    draw_pitch(axes[0], positions_in, metrics_in, 'In Possession')
    draw_pitch(axes[1], positions_out, metrics_out, 'Out of Possession')

    fig.suptitle(f'{team_name} - Shape Comparison', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

def generate_shape_comparison_report(events, team_name):
    """
    Print and plot a comparison of a team's shape in and out of possession.

    Args:
        events: event DataFrame accepted by calculate_positions_by_phase.
        team_name: team to analyse.

    Returns:
        dict with 'metrics_in', 'metrics_out', 'movement' and 'positions'.
        When shape metrics are unavailable for either phase (the metrics
        function returned None), a notice is printed, the table and plot
        are skipped, and the dict is returned with the None value(s).
    """
    print("=" * 60)
    print(f"SHAPE COMPARISON REPORT: {team_name}")
    print("=" * 60)

    # Calculate positions
    positions = calculate_positions_by_phase(events, team_name)

    # Calculate metrics
    metrics_in = calculate_shape_metrics(positions['in_possession'])
    metrics_out = calculate_shape_metrics(positions['out_of_possession'])

    # Calculate movement
    movement = calculate_player_movement(
        positions['in_possession'],
        positions['out_of_possession']
    )

    report = {
        'metrics_in': metrics_in,
        'metrics_out': metrics_out,
        'movement': movement,
        'positions': positions
    }

    # Without metrics for both phases there is nothing to compare; bail out
    # before subscripting None.
    if metrics_in is None or metrics_out is None:
        print("\nShape metrics unavailable: too few players with located "
              "events in at least one phase.")
        return report

    # Print comparison
    print("\nSHAPE METRICS COMPARISON:")
    print("-" * 60)
    print(f"{'Metric':<25} {'In Poss':>12} {'Out Poss':>12} {'Diff':>12}")
    print("-" * 60)

    metric_rows = [
        ('Centroid X', 'centroid_x'),
        ('Centroid Y', 'centroid_y'),
        ('Length', 'length'),
        ('Width', 'width'),
        ('Surface Area', 'surface_area'),
        ('Defensive Line', 'defensive_line'),
        ('Attacking Line', 'attacking_line'),
        ('Team Stretch', 'team_stretch'),
    ]

    for name, key in metric_rows:
        in_val = metrics_in[key]
        out_val = metrics_out[key]
        diff = in_val - out_val
        print(f"{name:<25} {in_val:>12.1f} {out_val:>12.1f} {diff:>+12.1f}")

    print("\n\nPLAYER MOVEMENT ANALYSIS:")
    print("-" * 60)
    print("Top 5 players with most positional change:\n")

    top_movers = movement.head(5)[['player_name', 'position', 'x_movement',
                                    'y_movement', 'total_movement', 'movement_type']]
    print(top_movers.to_string(index=False))

    # Visualize
    visualize_shape_comparison(
        positions['in_possession'],
        positions['out_of_possession'],
        metrics_in,
        metrics_out,
        team_name
    )

    return report

# Run analysis
# Loads the default match and reports on the first team name that appears
# in the event log.
events = load_match_events()
team_name = events['team'].unique()[0]
report = generate_shape_comparison_report(events, team_name)
library(tidyverse)
library(sf)
library(patchwork)
library(StatsBombR)

# ============================================
# IN vs OUT OF POSSESSION SHAPE ANALYSIS
# ============================================

# Load data
# Keeps only competition_id 43 from the free competitions and downloads the
# events of the first match in its match list, then cleans them.
# NOTE(review): confirm what the Atea argument does in the installed
# StatsBombR version.
comps <- FreeCompetitions()
matches <- FreeMatches(Competitions = comps %>% filter(competition_id == 43))
events <- free_allevents(MatchesDF = matches[1,],Atea = TRUE)
events_clean <- allclean(events)

# Tag every event with the possession phase it represents for `team_name`.
#
# events:    event data frame with team.name and type.name columns.
# team_name: team whose point of view defines the phases.
#
# Adds is_team_event and possession_phase ("in_possession" or
# "out_of_possession") and returns only the rows that received a phase.
assign_possession_phase <- function(events, team_name) {
  on_ball_types <- c(
    "Pass", "Carry", "Shot", "Dribble", "Ball Receipt*",
    "Foul Won", "Dispossessed"
  )
  off_ball_types <- c(
    "Pressure", "Tackle", "Interception", "Block",
    "Clearance", "Ball Recovery"
  )

  tagged <- events %>%
    mutate(is_team_event = team.name == team_name) %>%
    mutate(
      possession_phase = case_when(
        # Team is on the ball
        is_team_event & type.name %in% on_ball_types ~ "in_possession",
        # Team is defending, or the opponent is on the ball
        (is_team_event & type.name %in% off_ball_types) |
          (!is_team_event & type.name %in% on_ball_types) ~ "out_of_possession",
        TRUE ~ NA_character_
      )
    )

  tagged %>%
    filter(!is.na(possession_phase))
}

# Calculate shape metrics for a set of positions.
#
# positions_df: data frame with avg_x and avg_y columns, one row per player.
#
# Returns NULL when fewer than 3 rows are supplied, otherwise a one-row
# tibble with centroid_x, centroid_y, length, width, surface_area
# (convex-hull area, NA if it cannot be computed), compactness_ratio
# (length / width), defensive_line (mean avg_x of the 4 lowest), attacking_line
# (mean avg_x of the 3 highest) and team_stretch (their difference).
calculate_shape_metrics <- function(positions_df) {

  if (nrow(positions_df) < 3) return(NULL)

  metrics <- list()

  # Centroid
  metrics$centroid_x <- mean(positions_df$avg_x, na.rm = TRUE)
  metrics$centroid_y <- mean(positions_df$avg_y, na.rm = TRUE)

  # Length (spread along x)
  metrics$length <- max(positions_df$avg_x) - min(positions_df$avg_x)

  # Width (spread along y)
  metrics$width <- max(positions_df$avg_y) - min(positions_df$avg_y)

  # Surface area via convex hull.
  # The value is taken from tryCatch() itself: assigning to `metrics` inside
  # the error handler would only modify the handler's local copy, leaving
  # surface_area missing from the result.
  metrics$surface_area <- tryCatch(
    {
      points_sf <- st_as_sf(positions_df,
                            coords = c("avg_y", "avg_x"),
                            crs = NA)
      hull <- st_convex_hull(st_union(points_sf))
      as.numeric(st_area(hull))
    },
    error = function(e) NA_real_
  )

  # Compactness ratio
  metrics$compactness_ratio <- metrics$length / metrics$width

  # Defensive line height (average of 4 deepest)
  defensive <- positions_df %>% arrange(avg_x) %>% head(4)
  metrics$defensive_line <- mean(defensive$avg_x)

  # Attacking line height (average of 3 highest)
  attacking <- positions_df %>% arrange(desc(avg_x)) %>% head(3)
  metrics$attacking_line <- mean(attacking$avg_x)

  # Team stretch (distance between def and attack lines)
  metrics$team_stretch <- metrics$attacking_line - metrics$defensive_line

  as_tibble(metrics)
}

# Average outfield player locations for a team, split by possession phase.
#
# events:    event data frame accepted by assign_possession_phase(), also
#            holding location.x, location.y, player.id, player.name and
#            position.name.
# team_name: team to analyse; only its own events are averaged.
#
# Returns a list with in_possession and out_of_possession data frames, each
# with avg_x, avg_y, touches and a `phase` label per player/position group.
calculate_positions_by_phase <- function(events, team_name) {
  team_events <- assign_possession_phase(events, team_name) %>%
    filter(team.name == team_name)

  # Same pipeline for both phases; only the filter key and label differ.
  summarise_phase <- function(phase_key, phase_label) {
    team_events %>%
      filter(
        possession_phase == phase_key,
        !is.na(location.x),
        !is.na(location.y),
        !str_detect(position.name, "Goalkeeper")
      ) %>%
      group_by(player.id, player.name, position.name) %>%
      summarise(
        avg_x = mean(location.x, na.rm = TRUE),
        avg_y = mean(location.y, na.rm = TRUE),
        touches = n(),
        .groups = "drop"
      ) %>%
      mutate(phase = phase_label)
  }

  list(
    in_possession = summarise_phase("in_possession", "In Possession"),
    out_of_possession = summarise_phase("out_of_possession", "Out of Possession")
  )
}

# Compare each player's average position in and out of possession.
#
# positions_in / positions_out: per-player tables with player.id, avg_x and
#   avg_y (positions_in also supplies player.name and position.name).
#
# Returns one row per player present in BOTH tables, sorted by the size of
# the positional shift (largest first).
calculate_player_movement <- function(positions_in, positions_out) {

  with_ball <- select(
    positions_in,
    player.id, player.name, position.name,
    x_in = avg_x, y_in = avg_y
  )
  without_ball <- select(
    positions_out,
    player.id, x_out = avg_x, y_out = avg_y
  )

  # Inner join: players seen in only one phase are dropped
  shifts <- inner_join(with_ball, without_ball, by = "player.id")

  shifts <- mutate(
    shifts,
    x_movement = x_in - x_out,  # Positive = higher up when attacking
    y_movement = y_in - y_out,
    total_movement = sqrt(x_movement^2 + y_movement^2),
    # A shift of more than 5 pitch units along x counts as a role change
    movement_type = case_when(
      x_movement > 5 ~ "Pushes forward in possession",
      x_movement < -5 ~ "Drops back in possession",
      TRUE ~ "Maintains position"
    )
  )

  arrange(shifts, desc(total_movement))
}

# Draw the in-possession and out-of-possession team shapes side by side.
#
# positions_in / positions_out: per-player tables with avg_x, avg_y and
#   position.name
# metrics_in / metrics_out: shape metrics providing centroid_x, centroid_y,
#   length, width and surface_area
# team_name: used in the overall title
#
# Returns the combined plot object (two pitches plus a shared title).
visualize_shape_comparison <- function(positions_in, positions_out,
                                       metrics_in, metrics_out, team_name) {

  # Shared title styling
  bold_centred <- function(pt) {
    element_text(hjust = 0.5, face = "bold", size = pt)
  }

  # One pitch panel for a single phase
  draw_phase <- function(positions, metrics, title) {

    # Static pitch: grass, halfway line and both penalty areas (120 x 80)
    pitch_layers <- list(
      annotate("rect", xmin = 0, xmax = 120, ymin = 0, ymax = 80,
               fill = "#228B22", alpha = 0.9),
      annotate("segment", x = 60, xend = 60, y = 0, yend = 80,
               color = "white"),
      annotate("rect", xmin = 0, xmax = 18, ymin = 18, ymax = 62,
               fill = NA, color = "white"),
      annotate("rect", xmin = 102, xmax = 120, ymin = 18, ymax = 62,
               fill = NA, color = "white")
    )

    metrics_text <- paste0(
      "Length: ", round(metrics$length, 1), "m\n",
      "Width: ", round(metrics$width, 1), "m\n",
      "Area: ", round(metrics$surface_area, 0), "m²"
    )

    ggplot(positions, aes(x = avg_x, y = avg_y)) +
      pitch_layers +
      # Team centroid marker
      annotate("point", x = metrics$centroid_x, y = metrics$centroid_y,
               shape = 4, size = 6, color = "yellow", stroke = 2) +
      # Outline of the area covered by the outfield players
      stat_chull(fill = "white", alpha = 0.2, color = "white",
                 linetype = "dashed") +
      # Player markers labelled with the leading capitals of the position
      geom_point(size = 8, color = "white") +
      geom_text(aes(label = str_extract(position.name, "^[A-Z]{2,3}")),
                size = 2.5, fontface = "bold") +
      annotate("label", x = 100, y = 75, label = metrics_text,
               size = 3, fill = "white", alpha = 0.8) +
      labs(title = title) +
      coord_fixed(ratio = 1, xlim = c(0, 120), ylim = c(0, 80)) +
      theme_void() +
      theme(plot.title = bold_centred(12))
  }

  panel_in <- draw_phase(positions_in, metrics_in, "In Possession")
  panel_out <- draw_phase(positions_out, metrics_out, "Out of Possession")

  # Place the panels next to each other under a common heading
  panel_in + panel_out +
    plot_annotation(
      title = paste0(team_name, " - Shape Comparison"),
      theme = theme(plot.title = bold_centred(14))
    )
}

# Print and plot a comparison of a team's shape in vs out of possession.
#
# events:    event data passed on to calculate_positions_by_phase()
# team_name: team to analyse
#
# Side effects: writes the report to the console and prints the plot.
# Returns a list with metrics_in, metrics_out, movement and positions.
generate_shape_comparison_report <- function(events, team_name) {

  heavy_rule <- strrep("=", 60)
  light_rule <- paste0(strrep("-", 41), "\n")

  cat("\n", heavy_rule, "\n", sep = "")
  cat("SHAPE COMPARISON REPORT:", team_name, "\n")
  cat(heavy_rule, "\n\n", sep = "")

  # Average positions, then shape metrics, for each phase
  positions <- calculate_positions_by_phase(events, team_name)
  metrics_in <- calculate_shape_metrics(positions$in_possession)
  metrics_out <- calculate_shape_metrics(positions$out_of_possession)

  # Per-player shift between the two phases
  movement <- calculate_player_movement(
    positions$in_possession,
    positions$out_of_possession
  )

  cat("SHAPE METRICS COMPARISON:\n")
  cat(light_rule)
  cat(sprintf("%-25s %12s %12s %12s\n",
              "Metric", "In Poss", "Out Poss", "Difference"))
  cat(light_rule)

  # Pull the reported metrics out in the same order as the labels below
  collect_metrics <- function(m) {
    c(m$centroid_x, m$centroid_y, m$length, m$width,
      m$surface_area, m$defensive_line, m$attacking_line, m$team_stretch)
  }

  metrics_df <- tibble(
    Metric = c("Centroid X", "Centroid Y", "Length", "Width",
               "Surface Area", "Defensive Line", "Attacking Line",
               "Team Stretch"),
    In_Poss = collect_metrics(metrics_in),
    Out_Poss = collect_metrics(metrics_out),
    Difference = In_Poss - Out_Poss,
    # Change relative to the out-of-possession value
    Change_Pct = round((Difference / Out_Poss) * 100, 1)
  )

  print(metrics_df, n = 8)

  cat("\n\nPLAYER MOVEMENT ANALYSIS:\n")
  cat(light_rule)
  cat("Top 5 players with most positional change:\n\n")

  # movement is already sorted by total_movement, largest first
  top_movers <- select(
    head(movement, 5),
    player.name, position.name, x_movement, y_movement,
    total_movement, movement_type
  )
  print(top_movers)

  shape_plot <- visualize_shape_comparison(
    positions$in_possession,
    positions$out_of_possession,
    metrics_in,
    metrics_out,
    team_name
  )
  print(shape_plot)

  list(
    metrics_in = metrics_in,
    metrics_out = metrics_out,
    movement = movement,
    positions = positions
  )
}

# Run the shape comparison for the first team listed in the event data
team_name <- events_clean$team.name %>%
  unique() %>%
  .[1]
shape_report <- generate_shape_comparison_report(
  events = events_clean,
  team_name = team_name
)
Exercise 24.3: Formation Matchup Recommendation System

Task: Build a data-driven formation recommendation system that analyzes historical matchup data and suggests optimal formations to use against specific opponent systems.

Requirements:

  • Create a matchup database from historical match data
  • Calculate performance metrics (xG diff, win rate, goals) for each formation combination
  • Implement confidence scoring based on sample size and variance
  • Build a recommendation engine that considers opponent formation, team strengths, and historical performance
  • Generate visual matchup heatmaps and recommendation reports
  • Include scenario analysis for different tactical objectives (defensive, balanced, attacking)

formation_recommendation_system
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# ============================================
# FORMATION MATCHUP RECOMMENDATION SYSTEM
# ============================================

def generate_matchup_database(n_matches=500, seed=42):
    """Generate a synthetic database of formation matchups.

    Parameters
    ----------
    n_matches : int
        Number of matches to simulate. ``0`` yields an empty frame with the
        full set of columns.
    seed : int or None
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        *global* random state, exactly as the hard-coded seed did before.

    Returns
    -------
    pandas.DataFrame
        One row per match with columns ``match_id``, ``home_formation``,
        ``away_formation``, ``home_xg``, ``away_xg``, ``home_goals``,
        ``away_goals``, ``home_result`` ('Win'/'Draw'/'Loss') and
        ``home_points`` (3/1/0).
    """
    np.random.seed(seed)

    formations = ["4-4-2", "4-3-3", "4-2-3-1", "3-5-2", "5-3-2",
                  "4-1-4-1", "3-4-3", "4-5-1"]
    formation_probs = [0.15, 0.25, 0.20, 0.08, 0.05, 0.10, 0.07, 0.10]

    # Draw the formations used by each side
    matches = pd.DataFrame({
        'match_id': range(1, n_matches + 1),
        'home_formation': np.random.choice(formations, n_matches, p=formation_probs),
        'away_formation': np.random.choice(formations, n_matches, p=formation_probs)
    })

    # xG shift for the home side in specific (home, away) pairings;
    # every pairing not listed is neutral.
    advantages = {
        ('4-3-3', '4-4-2'): 0.3,
        ('4-2-3-1', '4-3-3'): 0.2,
        ('3-5-2', '4-3-3'): 0.15,
        ('4-4-2', '3-5-2'): -0.1,
        ('5-3-2', '4-3-3'): 0.1
    }

    # Plain dict lookup per pairing: avoids a row-wise DataFrame.apply and
    # stays well-defined when there are no rows at all.
    advantage = np.array(
        [advantages.get(pair, 0.0)
         for pair in zip(matches['home_formation'], matches['away_formation'])],
        dtype=float
    )

    # Expected goals, floored so no side gets an implausibly low value.
    # The away side only loses half of the home side's advantage.
    matches['home_xg'] = np.maximum(0.5, np.random.normal(1.3 + advantage, 0.4))
    matches['away_xg'] = np.maximum(0.3, np.random.normal(1.1 - advantage * 0.5, 0.35))

    # Goals are Poisson draws around each side's xG
    matches['home_goals'] = np.random.poisson(matches['home_xg'].to_numpy())
    matches['away_goals'] = np.random.poisson(matches['away_xg'].to_numpy())

    home_ahead = matches['home_goals'] > matches['away_goals']
    home_behind = matches['home_goals'] < matches['away_goals']
    matches['home_result'] = np.select(
        [home_ahead, home_behind], ['Win', 'Loss'], default='Draw'
    )
    matches['home_points'] = np.select(
        [matches['home_result'] == 'Win', matches['home_result'] == 'Draw'],
        [3, 1],
        default=0
    )

    return matches

def calculate_matchup_stats(matches, min_sample=5):
    """Calculate performance statistics for each formation matchup.

    Parameters
    ----------
    matches : pandas.DataFrame
        Match-level data with columns ``match_id``, ``home_formation``,
        ``away_formation``, ``home_xg``, ``away_xg``, ``home_goals``,
        ``away_goals``, ``home_result`` ('Win'/'Draw'/'Loss') and
        ``home_points``.
    min_sample : int
        Matchups with fewer matches than this are dropped from the result.

    Returns
    -------
    pandas.DataFrame
        One row per (home_formation, away_formation) pair with counts,
        win/draw/loss rates, xG and goal averages and differences, a
        ``confidence`` score in [0, 1] and a ``performance_score``.
    """
    outcome = matches['home_result']
    enriched = matches.assign(
        _xg_margin=matches['home_xg'] - matches['away_xg'],
        _is_win=outcome.eq('Win'),
        _is_draw=outcome.eq('Draw'),
        _is_loss=outcome.eq('Loss'),
    )

    # Named aggregation keeps every output column tied to its definition
    # instead of relying on the positional order of a MultiIndex.
    summary = (
        enriched
        .groupby(['home_formation', 'away_formation'])
        .agg(
            matches_played=('match_id', 'count'),
            wins=('_is_win', 'sum'),
            draws=('_is_draw', 'sum'),
            losses=('_is_loss', 'sum'),
            avg_home_xg=('home_xg', 'mean'),
            # Spread of the per-match xG difference (not of home xG alone)
            xg_diff_sd=('_xg_margin', 'std'),
            avg_away_xg=('away_xg', 'mean'),
            avg_home_goals=('home_goals', 'mean'),
            avg_away_goals=('away_goals', 'mean'),
            avg_points=('home_points', 'mean'),
        )
        .reset_index()
    )

    played = summary['matches_played']
    summary['win_rate'] = summary['wins'] / played
    summary['draw_rate'] = summary['draws'] / played
    summary['loss_rate'] = summary['losses'] / played
    summary['xg_diff'] = summary['avg_home_xg'] - summary['avg_away_xg']
    summary['goal_diff'] = summary['avg_home_goals'] - summary['avg_away_goals']

    # Confidence: 60% sample size (saturating at 30 matches), 40% stability.
    # A single-match matchup has an undefined std; treat it as 1.
    summary['sample_score'] = np.minimum(1, played / 30)
    summary['variance_score'] = 1 / (1 + summary['xg_diff_sd'].fillna(1))
    summary['confidence'] = (
        summary['sample_score'] * 0.6 + summary['variance_score'] * 0.4
    ).round(2)

    # Performance: wins weigh most, draws a little, plus xG and goal margins
    summary['performance_score'] = (
        summary['win_rate'] * 40 +
        summary['draw_rate'] * 10 +
        summary['xg_diff'] * 20 +
        summary['goal_diff'] * 10
    ).round(1)

    return summary[summary['matches_played'] >= min_sample].copy()

def recommend_formation(matchup_stats, opponent_formation, tactical_objective='balanced'):
    """Recommend the best formation against a given opponent formation.

    Parameters
    ----------
    matchup_stats : pandas.DataFrame
        Matchup table with at least ``home_formation``, ``away_formation``,
        ``matches_played``, ``win_rate``, ``xg_diff``, ``avg_home_xg``,
        ``confidence`` and ``performance_score``. ``loss_rate`` is used when
        present; otherwise it is derived from ``avg_points`` if available.
    opponent_formation : str
        Formation to plan against (matched on ``away_formation``).
    tactical_objective : str
        ``'defensive'``, ``'attacking'``; any other value is treated as
        balanced.

    Returns
    -------
    dict
        Keys ``opponent_formation``, ``tactical_objective``,
        ``recommendation``, ``confidence``, ``expected_xg_diff``,
        ``expected_win_rate`` (percent), ``sample_size`` and
        ``alternatives`` (up to three runner-up rows). When no matchup data
        exists, ``recommendation`` is ``'No data available'``,
        ``confidence`` is 0 and the remaining statistics are ``None``/0.
    """
    relevant = matchup_stats[matchup_stats['away_formation'] == opponent_formation].copy()

    if len(relevant) == 0:
        # Same keys as the normal result so callers can read them safely
        return {
            'opponent_formation': opponent_formation,
            'tactical_objective': tactical_objective,
            'recommendation': 'No data available',
            'confidence': 0,
            'expected_xg_diff': None,
            'expected_win_rate': None,
            'sample_size': 0,
            'alternatives': None
        }

    # Adjust scoring
    if tactical_objective == 'defensive':
        # Reward avoiding defeat (wins AND draws), not just winning.
        if 'loss_rate' in relevant.columns:
            loss_rate = relevant['loss_rate']
        elif 'avg_points' in relevant.columns:
            # avg_points = 3 * win_rate + draw_rate, hence
            # loss_rate = 1 - win_rate - draw_rate = 1 + 2 * win_rate - avg_points
            loss_rate = (1 + 2 * relevant['win_rate'] - relevant['avg_points']).clip(0, 1)
        else:
            # No draw information at all: count every non-win as a loss
            loss_rate = 1 - relevant['win_rate']
        relevant['adjusted_score'] = (
            relevant['performance_score'] +
            (1 - loss_rate) * 20 -
            relevant['xg_diff'] * 5
        )
    elif tactical_objective == 'attacking':
        relevant['adjusted_score'] = (
            relevant['performance_score'] +
            relevant['avg_home_xg'] * 15 +
            relevant['win_rate'] * 10
        )
    else:
        relevant['adjusted_score'] = relevant['performance_score']

    relevant = relevant.sort_values('adjusted_score', ascending=False)
    top = relevant.iloc[0]
    alts = relevant.iloc[1:4]

    return {
        'opponent_formation': opponent_formation,
        'tactical_objective': tactical_objective,
        'recommendation': top['home_formation'],
        'confidence': top['confidence'],
        'expected_xg_diff': round(top['xg_diff'], 2),
        'expected_win_rate': round(top['win_rate'] * 100, 1),
        'sample_size': int(top['matches_played']),
        'alternatives': alts[['home_formation', 'win_rate', 'xg_diff', 'confidence']].copy()
    }

def create_matchup_heatmap(matchup_stats):
    """Show a home-vs-away formation grid coloured by xG difference.

    Each populated cell is labelled with the win rate (percent) and, in
    brackets, the xG difference. The figure is displayed via ``plt.show()``;
    nothing is returned.
    """
    def _grid(value_column):
        # Rows are the home formation, columns the opponent's formation
        return matchup_stats.pivot(
            index='home_formation',
            columns='away_formation',
            values=value_column
        )

    xg_grid = _grid('xg_diff')
    win_grid = _grid('win_rate')

    fig, ax = plt.subplots(figsize=(12, 10))

    # Colour only; the cell text is added by hand below
    sns.heatmap(xg_grid, annot=False, cmap='RdYlGn', center=0,
                linewidths=0.5, ax=ax)

    for row_pos, home in enumerate(xg_grid.index):
        for col_pos, away in enumerate(xg_grid.columns):
            xg_val = xg_grid.loc[home, away]
            if pd.isna(xg_val):
                continue  # pairing absent from the stats table
            win_val = win_grid.loc[home, away]
            # +0.5 moves the label to the centre of the cell
            ax.text(col_pos + 0.5, row_pos + 0.5,
                    f"{win_val*100:.0f}%\n({xg_val:.2f})",
                    ha='center', va='center', fontsize=8, color='white')

    ax.set_title('Formation Matchup Performance Matrix\nWin Rate % and (xG Difference)',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Opponent Formation', fontsize=12)
    ax.set_ylabel('Your Formation', fontsize=12)

    plt.tight_layout()
    plt.show()

def generate_recommendation_report(opponent_formation, matchup_stats):
    """Print a formation recommendation report for one opponent formation.

    For each tactical objective (defensive, balanced, attacking) the
    recommended formation and up to three alternatives are printed, followed
    by the full matchup table against ``opponent_formation``. When there is
    no matchup data for the opponent, that is reported instead of failing.

    Parameters
    ----------
    opponent_formation : str
        Formation to plan against.
    matchup_stats : pandas.DataFrame
        Matchup table as consumed by ``recommend_formation``.
    """
    print("\n" + "=" * 65)
    print("FORMATION RECOMMENDATION REPORT")
    print(f"Opponent Formation: {opponent_formation}")
    print("=" * 65 + "\n")

    objectives = ['defensive', 'balanced', 'attacking']

    for obj in objectives:
        rec = recommend_formation(matchup_stats, opponent_formation, obj)

        print(f"{obj.upper()} APPROACH:")
        print("-" * 45)

        # The no-data result carries no statistics; formatting them would
        # raise, so report the situation and move on.
        if rec.get('expected_xg_diff') is None:
            print(f"  {rec['recommendation']}")
            print()
            continue

        print(f"  Recommended Formation: {rec['recommendation']}")
        print(f"  Confidence: {rec['confidence']*100:.0f}%")
        print(f"  Expected xG Difference: {rec['expected_xg_diff']}")
        print(f"  Historical Win Rate: {rec['expected_win_rate']}%")
        print(f"  Sample Size: {rec['sample_size']} matches")

        alternatives = rec.get('alternatives')
        if alternatives is not None and len(alternatives) > 0:
            print("\n  Alternatives:")
            for idx, (_, alt) in enumerate(alternatives.iterrows(), 1):
                print(f"    {idx}. {alt['home_formation']} "
                      f"({alt['win_rate']*100:.1f}% win, {alt['xg_diff']:.2f} xG diff)")
        print()

    # Full analysis
    all_matchups = (
        matchup_stats[matchup_stats['away_formation'] == opponent_formation]
        .sort_values('performance_score', ascending=False)
    )

    print(f"\nCOMPLETE MATCHUP ANALYSIS vs {opponent_formation}:")
    print("-" * 65)
    if all_matchups.empty:
        print("No data available")
        return
    print(all_matchups[['home_formation', 'matches_played', 'win_rate',
                        'xg_diff', 'avg_points', 'confidence']].to_string(index=False))

# Main execution: simulate matches, then summarise every formation pairing
matches = generate_matchup_database(n_matches=800)
matchup_stats = calculate_matchup_stats(matches, min_sample=8)

# Overview heatmap of all pairings
create_matchup_heatmap(matchup_stats)

# One report per opponent formation of interest
for opponent in ("4-3-3", "4-4-2"):
    generate_recommendation_report(opponent, matchup_stats)
library(tidyverse)
library(scales)

# ============================================
# FORMATION MATCHUP RECOMMENDATION SYSTEM
# ============================================

# Simulate a historical formation-matchup database.
#
# n_matches: number of matches to simulate (0 gives an empty tibble)
# seed:      value passed to set.seed(); note this resets the session's
#            global RNG state, as the previously hard-coded seed did
#
# Returns a tibble with one row per match: match_id, home_formation,
# away_formation, home_xg, away_xg, home_goals, away_goals, home_result
# ("Win"/"Draw"/"Loss") and home_points (3/1/0).
generate_matchup_database <- function(n_matches = 500, seed = 42) {

  formations <- c("4-4-2", "4-3-3", "4-2-3-1", "3-5-2", "5-3-2",
                  "4-1-4-1", "3-4-3", "4-5-1")
  # How often each formation is drawn (same weights for both sides)
  formation_probs <- c(0.15, 0.25, 0.20, 0.08, 0.05, 0.10, 0.07, 0.10)

  # Generate match data
  set.seed(seed)
  matches <- tibble(
    # seq_len() rather than 1:n so that n_matches = 0 yields zero rows
    match_id = seq_len(n_matches),
    home_formation = sample(formations, n_matches, replace = TRUE,
                            prob = formation_probs),
    away_formation = sample(formations, n_matches, replace = TRUE,
                            prob = formation_probs)
  )

  # xG shift for the home side in specific pairings; all others are neutral
  matchup_advantages <- tibble(
    home = c("4-3-3", "4-2-3-1", "3-5-2", "4-4-2", "5-3-2"),
    away = c("4-4-2", "4-3-3", "4-3-3", "3-5-2", "4-3-3"),
    advantage = c(0.3, 0.2, 0.15, -0.1, 0.1)
  )

  matches %>%
    left_join(matchup_advantages,
              by = c("home_formation" = "home", "away_formation" = "away")) %>%
    mutate(
      advantage = replace_na(advantage, 0),
      home_base_xg = 1.3 + advantage,
      # The away side only loses half of the home side's advantage
      away_base_xg = 1.1 - advantage * 0.5,
      # Floors keep xG from dropping to implausibly low values
      home_xg = pmax(0.5, rnorm(n(), home_base_xg, 0.4)),
      away_xg = pmax(0.3, rnorm(n(), away_base_xg, 0.35)),
      # Goals are Poisson draws around each side's xG
      home_goals = rpois(n(), home_xg),
      away_goals = rpois(n(), away_xg),
      home_result = case_when(
        home_goals > away_goals ~ "Win",
        home_goals < away_goals ~ "Loss",
        TRUE ~ "Draw"
      ),
      home_points = case_when(
        home_result == "Win" ~ 3,
        home_result == "Draw" ~ 1,
        TRUE ~ 0
      )
    ) %>%
    # Drop the simulation-only helper columns
    select(-advantage, -home_base_xg, -away_base_xg)
}

# Summarise results for every (home formation, away formation) pairing.
#
# matches:    match-level tibble with home/away formations, xG, goals,
#             home_result and home_points
# min_sample: pairings with fewer matches than this are dropped
#
# Returns one row per remaining pairing with counts, rates, xG and goal
# differences, a confidence score and an overall performance score.
calculate_matchup_stats <- function(matches, min_sample = 5) {

  by_pairing <- group_by(matches, home_formation, away_formation)

  pairing_summary <- summarise(
    by_pairing,
    matches_played = n(),
    wins = sum(home_result == "Win"),
    draws = sum(home_result == "Draw"),
    losses = sum(home_result == "Loss"),
    win_rate = wins / matches_played,
    draw_rate = draws / matches_played,
    loss_rate = losses / matches_played,
    avg_home_xg = mean(home_xg),
    avg_away_xg = mean(away_xg),
    xg_diff = avg_home_xg - avg_away_xg,
    xg_diff_sd = sd(home_xg - away_xg),
    avg_home_goals = mean(home_goals),
    avg_away_goals = mean(away_goals),
    goal_diff = avg_home_goals - avg_away_goals,
    avg_points = mean(home_points),
    .groups = "drop"
  )

  # Keep only pairings seen often enough to say anything about
  well_sampled <- filter(pairing_summary, matches_played >= min_sample)

  mutate(
    well_sampled,
    # Sample size saturates at 30 matches
    sample_score = pmin(1, matches_played / 30),
    # sd() is NA for a single match; treat that as a spread of 1
    variance_score = 1 / (1 + coalesce(xg_diff_sd, 1)),
    # 60% sample size, 40% stability of the xG difference
    confidence = round((sample_score * 0.6 + variance_score * 0.4), 2),
    # Wins weigh most, draws a little, plus xG and goal margins
    performance_score = round(
      (win_rate * 40) + (draw_rate * 10) + (xg_diff * 20) + (goal_diff * 10), 1
    )
  )
}

# Pick the best-scoring formation against a given opponent formation.
#
# matchup_stats:      output of calculate_matchup_stats()
# opponent_formation: formation matched against away_formation
# tactical_objective: "defensive" or "attacking"; anything else is balanced
#
# Returns a list with the recommendation, its confidence, expected xG
# difference, win rate (percent), sample size and up to three alternatives.
# With no data for the opponent, only recommendation ("No data available"),
# confidence (0) and alternatives (NULL) are returned.
recommend_formation <- function(matchup_stats, opponent_formation,
                                 tactical_objective = "balanced") {

  candidates <- arrange(
    filter(matchup_stats, away_formation == opponent_formation),
    desc(performance_score)
  )

  # Guard: nothing recorded against this opponent formation
  if (nrow(candidates) == 0) {
    return(list(
      recommendation = "No data available",
      confidence = 0,
      alternatives = NULL
    ))
  }

  # Re-score according to the objective, then rank best first
  ranked <- candidates %>%
    mutate(
      adjusted_score = case_when(
        # Defensive: reward not losing, discount open high-xG games
        tactical_objective == "defensive" ~
          performance_score + (1 - loss_rate) * 20 - xg_diff * 5,
        # Attacking: reward chance creation and outright wins
        tactical_objective == "attacking" ~
          performance_score + avg_home_xg * 15 + win_rate * 10,
        TRUE ~ performance_score  # balanced
      )
    ) %>%
    arrange(desc(adjusted_score))

  best <- slice_head(ranked, n = 1)

  # The next three formations after the winner, formatted for display
  runners_up <- ranked %>%
    slice(-1) %>%
    slice_head(n = 3) %>%
    mutate(
      win_rate = round(win_rate * 100, 1),
      xg_diff = round(xg_diff, 2)
    ) %>%
    select(home_formation, win_rate, xg_diff, confidence)

  list(
    opponent_formation = opponent_formation,
    tactical_objective = tactical_objective,
    recommendation = best$home_formation,
    confidence = best$confidence,
    expected_xg_diff = round(best$xg_diff, 2),
    expected_win_rate = round(best$win_rate * 100, 1),
    sample_size = best$matches_played,
    alternatives = runners_up
  )
}

# Heatmap of formation matchups, coloured by a chosen metric.
#
# matchup_stats: output of calculate_matchup_stats()
# metric:        name of the numeric column used for the fill colour and
#                the bracketed cell value (default "xg_diff")
#
# Returns a ggplot object; each tile shows the win rate (percent) and, in
# brackets, the metric value.
create_matchup_heatmap <- function(matchup_stats, metric = "xg_diff") {

  stopifnot(
    is.character(metric), length(metric) == 1,
    metric %in% names(matchup_stats)
  )

  # Legend and subtitle follow the plotted metric instead of always
  # claiming xG difference.
  is_default <- identical(metric, "xg_diff")
  legend_title <- if (is_default) "xG Difference" else metric
  subtitle_text <- if (is_default) {
    "Win rate % and (xG difference) shown"
  } else {
    paste0("Win rate % and (", metric, ") shown")
  }

  # Copy the metric into `value` first so that choosing win_rate or
  # confidence as the metric does not clash with the columns kept below.
  heatmap_long <- matchup_stats %>%
    mutate(value = .data[[metric]]) %>%
    select(home_formation, away_formation, value, win_rate, confidence)

  ggplot(heatmap_long, aes(x = away_formation, y = home_formation,
                           fill = value)) +
    # `linewidth` replaces `size` for tile borders (ggplot2 >= 3.4.0)
    geom_tile(color = "white", linewidth = 0.5) +
    geom_text(aes(label = paste0(round(win_rate * 100, 0), "%\n",
                                  "(", round(value, 2), ")")),
              size = 2.5, color = "white") +
    scale_fill_gradient2(
      low = "#d73027", mid = "#ffffbf", high = "#1a9850",
      midpoint = 0, name = legend_title
    ) +
    labs(
      title = "Formation Matchup Performance Matrix",
      subtitle = subtitle_text,
      x = "Opponent Formation", y = "Your Formation"
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
      axis.text.y = element_text(size = 9),
      plot.title = element_text(hjust = 0.5, face = "bold"),
      plot.subtitle = element_text(hjust = 0.5),
      panel.grid = element_blank()
    )
}

# Print a formation recommendation report for one opponent formation.
#
# opponent_formation:        formation to plan against
# matchup_stats:             output of calculate_matchup_stats()
# team_available_formations: optional character vector; when supplied, only
#                            these formations are considered as candidates
#
# Side effects: writes the report to the console. Returns (invisibly) the
# matchup table printed at the end.
generate_recommendation_report <- function(opponent_formation,
                                            matchup_stats,
                                            team_available_formations = NULL) {

  # Restrict candidates to what the team can actually play, if specified
  if (!is.null(team_available_formations)) {
    matchup_stats <- matchup_stats %>%
      filter(home_formation %in% team_available_formations)
  }

  cat("\n", rep("=", 65), "\n", sep = "")
  cat("FORMATION RECOMMENDATION REPORT\n")
  cat("Opponent Formation:", opponent_formation, "\n")
  cat(rep("=", 65), "\n\n", sep = "")

  # Generate recommendations for different objectives
  objectives <- c("defensive", "balanced", "attacking")

  for (obj in objectives) {
    rec <- recommend_formation(matchup_stats, opponent_formation, obj)

    cat(toupper(obj), "APPROACH:\n")
    cat("-", rep("-", 40), "\n", sep = "")

    # The no-data result carries no statistics; say so rather than
    # printing a block of empty fields.
    if (is.null(rec$expected_xg_diff)) {
      cat("  ", rec$recommendation, "\n\n", sep = "")
      next
    }

    cat("  Recommended Formation:", rec$recommendation, "\n")
    cat("  Confidence:", rec$confidence * 100, "%\n")
    cat("  Expected xG Difference:", rec$expected_xg_diff, "\n")
    cat("  Historical Win Rate:", rec$expected_win_rate, "%\n")
    cat("  Sample Size:", rec$sample_size, "matches\n\n")

    n_alternatives <- if (is.null(rec$alternatives)) {
      0L
    } else {
      nrow(rec$alternatives)
    }
    if (n_alternatives > 0) {
      cat("  Alternatives:\n")
      for (i in seq_len(n_alternatives)) {
        alt <- rec$alternatives[i, ]
        cat("    ", i, ". ", alt$home_formation,
            " (", alt$win_rate, "% win, ", alt$xg_diff, " xG diff)\n", sep = "")
      }
    }
    cat("\n")
  }

  # Matchup analysis
  all_matchups <- matchup_stats %>%
    filter(away_formation == opponent_formation) %>%
    arrange(desc(performance_score))

  cat("\nCOMPLETE MATCHUP ANALYSIS vs", opponent_formation, ":\n")
  cat("-", rep("-", 60), "\n", sep = "")

  print(
    all_matchups %>%
      select(home_formation, matches_played, win_rate, xg_diff,
             avg_points, confidence) %>%
      mutate(
        win_rate = percent(win_rate, accuracy = 0.1),
        xg_diff = round(xg_diff, 2),
        avg_points = round(avg_points, 2)
      ),
    n = 10
  )
}

# Main execution: simulate matches, then summarise every formation pairing
matches <- generate_matchup_database(n_matches = 800)
matchup_stats <- calculate_matchup_stats(matches, min_sample = 8)

# Overview heatmap of all pairings
matchup_heatmap <- create_matchup_heatmap(matchup_stats)
print(matchup_heatmap)

# One report per opponent formation of interest
for (opponent in c("4-3-3", "4-4-2")) {
  generate_recommendation_report(opponent, matchup_stats)
}

Summary

Key Takeaways
  • Formation detection uses clustering algorithms to identify positional lines and classify team shapes from average positions
  • Shape metrics (length, width, surface area, defensive line height) quantify tactical organization beyond simple formation labels
  • Comparing in-possession and out-of-possession shapes reveals how teams transform between attacking and defending phases
  • Formation changes during matches often correlate with substitutions, goals, or tactical adjustments
  • Matchup analysis identifies which formations perform best against specific opponent systems, informing tactical preparation