Capstone - Complete Analytics System
Formations provide the structural framework for how teams organize in attack and defense. Modern formation analysis goes beyond static shapes to understand dynamic positioning, shape transitions, and how formations adapt to different game states.
Learning Objectives
- Identify formations from tracking and event data
- Analyze formation compactness and shape metrics
- Understand in-possession vs out-of-possession shapes
- Detect formation changes during matches
- Compare formation effectiveness against different systems
Formation Detection
Formation detection involves analyzing player positions to determine the team's structural shape. This can be done using tracking data or by aggregating touch locations from event data.
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
# Read the raw tracking export, then keep only rows tagged as the home team
tracking_data = pd.read_csv("tracking_data.csv")
home_tracking = tracking_data.loc[lambda rows: rows['team'] == 'home']
def calculate_average_positions(tracking, period='full'):
    """
    Average each player's tracked location, goalkeeper excluded.

    Parameters
    ----------
    tracking : pandas.DataFrame
        Rows with ``player_id``, ``player_name``, ``position``, ``x`` and
        ``y`` columns (plus ``game_state`` when ``period`` is not ``'full'``).
    period : str
        ``'full'`` uses every row; any other value keeps only rows whose
        ``game_state`` equals it.

    Returns
    -------
    pandas.DataFrame
        One row per (player_id, player_name, position) with ``avg_x`` and
        ``avg_y``; rows whose position is ``'Goalkeeper'`` are dropped.
    """
    if period == 'full':
        rows = tracking
    else:
        rows = tracking.loc[tracking['game_state'] == period]

    per_player = rows.groupby(['player_id', 'player_name', 'position'])
    means = per_player[['x', 'y']].mean()
    means.columns = ['avg_x', 'avg_y']
    means = means.reset_index()

    # The keeper would otherwise form a spurious deepest "line"
    is_outfield = means['position'] != 'Goalkeeper'
    return means[is_outfield]
def classify_formation(avg_positions):
    """
    Classify a team's formation by clustering players into lines along x.

    K-means is run on the players' ``avg_x`` values for each candidate number
    of lines (2 to 4), the candidate with the highest silhouette score wins,
    and the resulting lines are relabelled 0..k-1 from lowest to highest
    mean ``avg_x``.

    Parameters
    ----------
    avg_positions : pandas.DataFrame
        One row per player with at least an ``avg_x`` column.

    Returns
    -------
    dict
        ``formation`` (e.g. ``'4-3-3'``), ``line_counts`` (list of ints,
        lowest-x line first) and ``player_lines`` (the input sorted by
        ``avg_x`` with an added ``line`` column).

    Raises
    ------
    ValueError
        If there are too few players / distinct ``avg_x`` values to form at
        least two lines.
    """
    sorted_players = avg_positions.sort_values('avg_x').copy()
    x_positions = sorted_players['avg_x'].values.reshape(-1, 1)

    # silhouette_score needs 2 <= n_labels <= n_samples - 1, and k-means cannot
    # form more clusters than there are distinct values, so cap k accordingly
    # instead of letting sklearn fail part-way through the search.
    n_players = len(x_positions)
    n_distinct = len(np.unique(x_positions))
    max_k = min(4, n_distinct, n_players - 1)
    if max_k < 2:
        raise ValueError(
            "classify_formation needs at least 3 players with 2 distinct "
            f"avg_x values; got {n_players} players, {n_distinct} distinct"
        )

    # Keep the labels of the best-scoring fit; refitting with the same
    # random_state would only reproduce them.
    best_score = -np.inf
    best_labels = None
    for k in range(2, max_k + 1):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(x_positions)
        score = silhouette_score(x_positions, labels)
        # Strict '>' keeps the smallest k on ties, like np.argmax did
        if score > best_score:
            best_score = score
            best_labels = labels
    sorted_players['line'] = best_labels

    # Cluster ids are arbitrary: renumber so line 0 has the lowest mean x
    line_order = (
        sorted_players
        .groupby('line')['avg_x']
        .mean()
        .sort_values()
        .index
    )
    line_mapping = {old: new for new, old in enumerate(line_order)}
    sorted_players['line'] = sorted_players['line'].map(line_mapping)

    line_counts = (
        sorted_players
        .groupby('line')
        .size()
        .sort_index()
        .tolist()
    )
    formation = '-'.join(map(str, line_counts))

    return {
        'formation': formation,
        'line_counts': line_counts,
        'player_lines': sorted_players
    }


# Detect formation
avg_positions = calculate_average_positions(home_tracking)
formation_result = classify_formation(avg_positions)
print(f"Detected formation: {formation_result['formation']}")
def visualize_formation(avg_positions, formation_label):
    """
    Draw each player's average position as a labelled marker on a 100x100 pitch.

    Parameters
    ----------
    avg_positions : pandas.DataFrame
        Needs ``avg_x``, ``avg_y`` and ``position`` columns.
    formation_label : str
        Text appended to the plot title after ``'Formation: '``.
    """
    figure, pitch = plt.subplots(figsize=(12, 8))

    # Pitch background
    pitch.set_xlim(0, 100)
    pitch.set_ylim(0, 100)
    pitch.set_facecolor('#1a5f1a')

    # One marker plus a three-letter role tag per player
    players = zip(
        avg_positions['avg_x'],
        avg_positions['avg_y'],
        avg_positions['position'],
    )
    for px, py, role in players:
        pitch.scatter(px, py,
                      s=500, c='white', edgecolors='black', zorder=5)
        pitch.text(px, py, role[:3],
                   ha='center', va='center', fontsize=8, fontweight='bold')

    pitch.set_title(f'Formation: {formation_label}', fontsize=14)
    pitch.set_xlabel('Pitch Length')
    pitch.set_ylabel('Pitch Width')

    plt.tight_layout()
    plt.show()
visualize_formation(
formation_result['player_lines'],
formation_result['formation']
)library(tidyverse)
library(cluster)
# Load tracking data
# Only rows with team == "home" are kept, so every later use of
# `tracking_data` in this script sees the home side only.
tracking_data <- read_csv("tracking_data.csv") %>%
  filter(team == "home")
# Average tracked location per player, used as input for formation detection.
#
# tracking: data frame with player_id, player_name, position, x, y
#           (and game_state when `period` is not "full").
# period:   "full" keeps every row; any other value keeps rows whose
#           game_state equals it.
# Returns one row per player with avg_x, avg_y and touches (row count);
# rows whose position is "Goalkeeper" are removed.
calculate_average_positions <- function(tracking, period = "full") {
  rows <- if (period == "full") {
    tracking
  } else {
    filter(tracking, game_state == period)
  }

  per_player <- rows %>%
    group_by(player_id, player_name, position) %>%
    summarise(
      avg_x = mean(x, na.rm = TRUE),
      avg_y = mean(y, na.rm = TRUE),
      touches = n(),
      .groups = "drop"
    )

  # Exclude GK for formation
  per_player %>%
    filter(position != "Goalkeeper")
}
# Classify formation based on player positions
#
# Clusters the players' avg_x values into 2-4 lines with k-means, keeps the
# fit with the highest mean silhouette width, and numbers the lines from the
# lowest mean x (line 1) to the highest so the formation string reads
# defence-to-attack.
#
# avg_positions: data frame with at least an avg_x column, one row per player.
# Returns a list with `formation` (e.g. "4-3-3"), `line_counts` (integer
# vector, lowest-x line first) and `player_lines` (input sorted by avg_x with
# an added integer `line` column).
classify_formation <- function(avg_positions) {
  sorted_players <- avg_positions %>%
    arrange(avg_x)

  x_values <- sorted_players$avg_x
  x_dist <- dist(x_values)

  # Fit once per candidate k and keep the fits: kmeans() is randomly
  # initialised, so refitting the winning k could give a different
  # partition than the one that was actually scored.
  candidate_k <- 2:4
  fits <- lapply(candidate_k, function(k) {
    kmeans(x_values, centers = k, nstart = 10)
  })
  silhouette_scores <- vapply(fits, function(km) {
    mean(silhouette(km$cluster, x_dist)[, 3])
  }, numeric(1))
  best_fit <- fits[[which.max(silhouette_scores)]]

  # kmeans() cluster ids are arbitrary; rank the centres so that line 1 is
  # the line with the lowest x and the counts come out in pitch order.
  line_rank <- rank(best_fit$centers[, 1], ties.method = "first")
  sorted_players$line <- as.integer(line_rank[best_fit$cluster])

  # Count players in each line
  line_counts <- sorted_players %>%
    group_by(line) %>%
    summarise(n = n()) %>%
    arrange(line) %>%
    pull(n)

  # Determine formation string
  formation <- paste(line_counts, collapse = "-")

  list(
    formation = formation,
    line_counts = line_counts,
    player_lines = sorted_players
  )
}

# Detect formation
formation_result <- tracking_data %>%
  calculate_average_positions() %>%
  classify_formation()
cat("Detected formation:", formation_result$formation, "\n")
# Plot average player positions on a 100 x 100 pitch rectangle.
#
# avg_positions:   data frame with avg_x, avg_y and position columns.
# formation_label: text shown in the title after "Formation:".
# Returns the ggplot object.
visualize_formation <- function(avg_positions, formation_label) {
  pitch_outline <- annotate(
    "rect",
    xmin = 0, xmax = 100, ymin = 0, ymax = 100,
    fill = "darkgreen", alpha = 0.3
  )
  player_markers <- geom_point(size = 8, color = "white")
  # Label each marker with the leading run of capitals in the position name
  role_labels <- geom_text(
    aes(label = str_extract(position, "^[A-Z]+")),
    size = 3, fontface = "bold"
  )
  plot_title <- paste("Formation:", formation_label)

  ggplot(avg_positions, aes(x = avg_x, y = avg_y)) +
    pitch_outline +
    player_markers +
    role_labels +
    labs(title = plot_title, x = "Pitch Length", y = "Pitch Width") +
    coord_fixed(ratio = 1) +
    theme_minimal()
}
visualize_formation(
formation_result$player_lines,
formation_result$formation
)
Formation Shape Metrics
Beyond identifying the formation, we can measure its shape characteristics: compactness, width, length, and how these change during different game states.
| Metric | Description | Interpretation |
|---|---|---|
| Length | Distance between deepest and highest outfield players | Lower = more compact vertically |
| Width | Distance between widest players | Higher = more stretched horizontally |
| Surface Area | Area covered by convex hull of players | Lower = more compact overall |
| Defensive Line Height | Average x-position of defensive line | Higher = more aggressive pressing |
| Centroid | Average position of all players | Team's center of gravity |
from scipy.spatial import ConvexHull
import numpy as np
def calculate_shape_metrics(tracking_frame):
    """
    Calculate formation shape metrics for a single frame.

    Parameters
    ----------
    tracking_frame : pandas.DataFrame
        Rows for one frame with ``position``, ``x`` and ``y`` columns.

    Returns
    -------
    dict or None
        ``None`` when fewer than three non-goalkeeper rows are present.
        Otherwise a dict with ``centroid_x``, ``centroid_y``, ``length``
        (x range), ``width`` (y range), ``surface_area`` (convex hull area,
        0 when no hull can be built), ``defensive_line_height`` (mean x of
        the 4 lowest-x players), ``midfield_height`` (mean x of the next 4
        players by x; NaN when there are none) and ``compactness_ratio``
        (length / width, 0 when width is 0).
    """
    outfield = tracking_frame[tracking_frame['position'] != 'Goalkeeper']
    if len(outfield) < 3:
        return None

    xs = outfield['x']
    ys = outfield['y']

    metrics = {}
    # Team centroid
    metrics['centroid_x'] = xs.mean()
    metrics['centroid_y'] = ys.mean()
    # Length (vertical compactness) and width (horizontal spread)
    metrics['length'] = xs.max() - xs.min()
    metrics['width'] = ys.max() - ys.min()

    # Surface area using convex hull. Qhull raises for degenerate input
    # (e.g. all players on one straight line); report zero area in that case.
    # Catch Exception rather than using a bare except so KeyboardInterrupt
    # and SystemExit still propagate.
    try:
        metrics['surface_area'] = ConvexHull(outfield[['x', 'y']].values).volume  # 2D area
    except Exception:
        metrics['surface_area'] = 0

    # Defensive line: the four deepest (lowest-x) players
    by_depth = xs.sort_values()
    metrics['defensive_line_height'] = by_depth.iloc[:4].mean()
    # Midfield: the next four players. Starting at index 4 keeps this band
    # disjoint from the defensive four (index 3 is the fourth defender).
    metrics['midfield_height'] = by_depth.iloc[4:8].mean()

    # Compactness ratio
    metrics['compactness_ratio'] = (
        metrics['length'] / metrics['width'] if metrics['width'] > 0 else 0
    )
    return metrics
# Calculate metrics for all frames
def calculate_shape_timeline(tracking_data):
    """
    Calculate shape metrics over time.

    Parameters
    ----------
    tracking_data : pandas.DataFrame
        Tracking rows with a ``frame`` column plus whatever
        ``calculate_shape_metrics`` reads.

    Returns
    -------
    pandas.DataFrame
        One row per frame for which ``calculate_shape_metrics`` returned a
        result, with the metric columns and the ``frame`` id. Frames appear
        in order of first occurrence in ``tracking_data``.
    """
    results = []
    # A single groupby pass replaces one full boolean scan of the table per
    # frame; sort=False keeps the first-appearance order of the frames.
    for frame, frame_data in tracking_data.groupby('frame', sort=False):
        metrics = calculate_shape_metrics(frame_data)
        if metrics:
            metrics['frame'] = frame
            results.append(metrics)
    return pd.DataFrame(results)


shape_timeline = calculate_shape_timeline(home_tracking)
# Analyze by game state
def analyze_shape_by_state(tracking_data):
    """
    Compare shape metrics across game states.

    Parameters
    ----------
    tracking_data : pandas.DataFrame
        Tracking rows with ``game_state`` and ``frame`` columns plus whatever
        ``calculate_shape_metrics`` reads.

    Returns
    -------
    pandas.DataFrame
        One row per game state with the mean ``length``, ``width``,
        ``surface_area`` and ``defensive_line_height`` over that state's
        frames. Empty (but with those columns) when no frame produced metrics.
    """
    metric_columns = ['length', 'width', 'surface_area', 'defensive_line_height']

    results = []
    # One grouped pass over (state, frame) instead of re-filtering the whole
    # table for every state and again for every frame.
    grouped = tracking_data.groupby(['game_state', 'frame'], sort=False)
    for (state, _frame), frame_data in grouped:
        metrics = calculate_shape_metrics(frame_data)
        if metrics:
            metrics['game_state'] = state
            results.append(metrics)

    # Without this guard an empty result list gives a column-less frame and
    # the groupby below fails with KeyError: 'game_state'.
    if not results:
        return pd.DataFrame(columns=['game_state'] + metric_columns)

    df = pd.DataFrame(results)
    summary = df.groupby('game_state')[metric_columns].mean().reset_index()
    return summary


shape_by_state = analyze_shape_by_state(home_tracking)
print(shape_by_state)
# Line chart of three shape metrics against frame number
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(shape_timeline['frame'], shape_timeline['length'], label='Length', alpha=0.7)
ax.plot(shape_timeline['frame'], shape_timeline['width'], label='Width', alpha=0.7)
ax.plot(
    shape_timeline['frame'],
    shape_timeline['defensive_line_height'],
    label='Defensive Line',
    alpha=0.7,
)
ax.set(
    xlabel='Time (frames)',
    ylabel='Distance (meters)',
    title='Team Shape Evolution During Match',
)
ax.legend()
plt.tight_layout()
plt.show()library(sf)
# Calculate formation shape metrics for one frame of tracking data.
#
# tracking_frame: data frame with position, x and y columns.
# Returns a one-row tibble with centroid_x, centroid_y, length (x range),
# width (y range), surface_area (convex hull area), defensive_line_height
# (mean x of the 4 lowest-x players), midfield_height (mean x of the next 4
# by x, NA when there are none) and compactness_ratio (length / width, 0 when
# width is 0). Returns an empty tibble when fewer than 3 non-goalkeeper rows
# exist, because dplyr::group_modify() requires a data frame from every
# group and fails on NULL.
calculate_shape_metrics <- function(tracking_frame) {
  outfield_players <- tracking_frame %>%
    filter(position != "Goalkeeper")

  if (nrow(outfield_players) < 3) {
    return(tibble())
  }

  xs <- outfield_players$x
  ys <- outfield_players$y

  team_length <- max(xs) - min(xs)
  team_width <- max(ys) - min(ys)

  # Surface area using convex hull
  points_sf <- st_as_sf(outfield_players, coords = c("y", "x"))
  hull <- st_convex_hull(st_union(points_sf))

  # Defensive line height (average of 4 deepest players)
  by_depth <- sort(xs)
  defensive_line <- head(by_depth, 4)

  # Midfield line: the next four players. Starting at position 5 keeps the
  # band disjoint from the defensive four (position 4 is the last defender).
  midfield_idx <- intersect(5:8, seq_along(by_depth))
  midfield_height <- if (length(midfield_idx) > 0) {
    mean(by_depth[midfield_idx])
  } else {
    NA_real_
  }

  tibble(
    centroid_x = mean(xs),
    centroid_y = mean(ys),
    length = team_length,
    width = team_width,
    surface_area = as.numeric(st_area(hull)),
    defensive_line_height = mean(defensive_line),
    midfield_height = midfield_height,
    # Guard the division so a zero-width frame gives 0 rather than Inf/NaN
    compactness_ratio = if (team_width > 0) team_length / team_width else 0
  )
}
# Shape metrics for every frame of the match
shape_over_time <- tracking_data %>%
  group_by(frame) %>%
  group_modify(function(frame_rows, key) calculate_shape_metrics(frame_rows)) %>%
  ungroup()

# Per-frame metrics averaged within each game state
shape_by_state <- tracking_data %>%
  group_by(frame, game_state) %>%
  group_modify(function(frame_rows, key) calculate_shape_metrics(frame_rows)) %>%
  ungroup() %>%
  group_by(game_state) %>%
  summarise(
    avg_length = mean(length, na.rm = TRUE),
    avg_width = mean(width, na.rm = TRUE),
    avg_surface_area = mean(surface_area, na.rm = TRUE),
    avg_defensive_line = mean(defensive_line_height, na.rm = TRUE),
    .groups = "drop"
  )
# Visualize
# Reshape to one row per (game_state, metric) first: position = "dodge" only
# separates bars within a single layer, so drawing length and width as two
# separate geom_bar layers stacks them on top of each other.
shape_by_state %>%
  pivot_longer(
    cols = c(avg_length, avg_width),
    names_to = "metric",
    values_to = "value"
  ) %>%
  mutate(metric = recode(metric, avg_length = "Length", avg_width = "Width")) %>%
  ggplot(aes(x = game_state, y = value, fill = metric)) +
  geom_col(position = "dodge", alpha = 0.7) +
  labs(
    title = "Team Shape by Game State",
    x = "Game State", y = "Distance (meters)", fill = "Metric"
  ) +
  theme_minimal()
# Plot shape evolution over time
ggplot(shape_over_time, aes(x = frame)) +
geom_line(aes(y = length, color = "Length")) +
geom_line(aes(y = width, color = "Width")) +
geom_line(aes(y = defensive_line_height, color = "Def Line")) +
labs(
title = "Team Shape Evolution",
x = "Time (frames)", y = "Distance (meters)", color = "Metric"
) +
theme_minimal()
In-Possession vs Out-of-Possession Shapes
Teams typically adopt different shapes depending on whether they have the ball or not. Analyzing these differences reveals tactical intent.
def analyze_possession_shapes(tracking, events):
    """
    Compare team shapes in and out of possession.

    Parameters
    ----------
    tracking : pandas.DataFrame
        Tracking rows with a ``frame`` column and, optionally, an
        ``in_possession`` flag. Without that column every frame is treated
        as in possession.
    events : object
        Not used by this simplified implementation; kept so the signature
        stays stable for callers.

    Returns
    -------
    dict
        Maps ``'in_possession'`` / ``'out_possession'`` to a dict of mean
        ``length``, ``width``, ``surface_area`` and
        ``defensive_line_height``. A state with no usable frames is omitted.
    """
    summary_columns = ['length', 'width', 'surface_area', 'defensive_line_height']
    # The column set cannot change between frames, so check it once
    has_flag = 'in_possession' in tracking.columns

    results = {'in_possession': [], 'out_possession': []}
    # groupby yields each frame's rows in one pass (and never an empty group),
    # replacing a full-table filter per frame.
    for _frame, frame_data in tracking.groupby('frame', sort=False):
        # Possession state is read from the frame's first row (simplified)
        in_poss = frame_data['in_possession'].iloc[0] if has_flag else True
        metrics = calculate_shape_metrics(frame_data)
        if metrics:
            key = 'in_possession' if in_poss else 'out_possession'
            results[key].append(metrics)

    # Aggregate
    comparison = {}
    for state, data in results.items():
        if data:
            df = pd.DataFrame(data)
            comparison[state] = {col: df[col].mean() for col in summary_columns}
    return comparison
# Average player positions, split by possession state
def calculate_positions_by_possession(tracking):
    """
    Average each player's location separately for frames with and without
    the ball.

    Parameters
    ----------
    tracking : pandas.DataFrame
        Needs ``in_possession``, ``player_name``, ``position``, ``x``, ``y``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(positions_in, positions_out)``, each with one row per
        (player_name, position) and ``avg_x`` / ``avg_y`` columns.
    """
    def _mean_positions(rows):
        # Shared aggregation for both possession states
        means = rows.groupby(['player_name', 'position'])[['x', 'y']].mean()
        return means.reset_index().rename(columns={'x': 'avg_x', 'y': 'avg_y'})

    with_ball = tracking['in_possession'] == True
    without_ball = tracking['in_possession'] == False
    return _mean_positions(tracking[with_ball]), _mean_positions(tracking[without_ball])
def plot_possession_comparison(positions_in, positions_out):
    """
    Draw the in-possession and out-of-possession shapes next to each other.

    Parameters
    ----------
    positions_in, positions_out : pandas.DataFrame
        Each needs ``avg_x``, ``avg_y`` and ``position`` columns.
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 8))
    panels = (
        ('In Possession', positions_in),
        ('Out of Possession', positions_out),
    )

    for ax, (title, data) in zip(axes, panels):
        ax.set_xlim(0, 100)
        ax.set_ylim(0, 100)
        ax.set_facecolor('#1a5f1a')

        # One marker plus a three-letter role tag per player
        for px, py, role in zip(data['avg_x'], data['avg_y'], data['position']):
            ax.scatter(px, py,
                       s=400, c='white', edgecolors='black', zorder=5)
            ax.text(px, py, role[:3],
                    ha='center', va='center', fontsize=7)

        ax.set_title(title, fontsize=12)
        ax.set_xlabel('Pitch Length')
        ax.set_ylabel('Pitch Width')

    plt.tight_layout()
    plt.show()
# Example usage
if 'in_possession' in home_tracking.columns:
positions_in, positions_out = calculate_positions_by_possession(home_tracking)
plot_possession_comparison(positions_in, positions_out)# Compare in-possession vs out-of-possession shapes
# Compare average team shape for frames in and out of possession.
#
# tracking: tracking rows with frame and team columns plus whatever
#           calculate_shape_metrics() reads.
# events:   event rows with type.name, team.name and frame columns.
# Returns one row per state ("In Possession" / "Out of Possession") holding
# the mean of every shape metric over that state's frames.
#
# NOTE(review): possession is decided by `team == possession_team`, which
# assumes tracking$team and events$team.name use the same labels - confirm.
analyze_possession_shapes <- function(tracking, events) {
  # Each on-ball event starts a possession spell that lasts until the next
  # on-ball event. select() cannot evaluate lead(), and a character `by`
  # cannot express a range join, so the spell lookup uses findInterval()
  # on the sorted start frames.
  possession_frames <- events %>%
    filter(
      type.name %in% c("Pass", "Carry", "Shot", "Dribble"),
      !is.na(frame)
    ) %>%
    arrange(frame) %>%
    transmute(frame_start = frame, possession_team = team.name)

  # Index of the latest spell starting at or before each tracking frame;
  # 0 means the frame precedes the first event, i.e. possession is unknown.
  spell_idx <- findInterval(tracking$frame, possession_frames$frame_start)
  spell_idx[spell_idx == 0] <- NA_integer_

  tracking_with_possession <- tracking %>%
    mutate(
      possession_team = possession_frames$possession_team[spell_idx],
      in_possession = team == possession_team
    )

  # Mean shape over all frames with the given possession flag
  summarise_state <- function(flag, label) {
    tracking_with_possession %>%
      filter(in_possession == flag) %>%
      group_by(frame) %>%
      group_modify(~ {
        frame_metrics <- calculate_shape_metrics(.x)
        # group_modify() needs a data frame from every group
        if (is.null(frame_metrics)) tibble() else frame_metrics
      }) %>%
      # Drop the per-frame grouping, otherwise summarise() returns one row
      # per frame instead of a single average
      ungroup() %>%
      summarise(across(-frame, ~ mean(.x, na.rm = TRUE))) %>%
      mutate(state = label)
  }

  bind_rows(
    summarise_state(TRUE, "In Possession"),
    summarise_state(FALSE, "Out of Possession")
  )
}
# NOTE(review): `events` is not defined anywhere in the visible script; it
# must be loaded before this call.
possession_comparison <- analyze_possession_shapes(tracking_data, events)
# Visualize comparison
# One bar per (metric, state); the four metrics share a single y axis.
possession_comparison %>%
  pivot_longer(cols = c(length, width, surface_area, defensive_line_height),
               names_to = "metric", values_to = "value") %>%
  ggplot(aes(x = metric, y = value, fill = state)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Team Shape: In vs Out of Possession",
    x = "Metric", y = "Value", fill = "State"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Average player positions by possession state
# NOTE(review): `tracking_with_possession` is a local variable created inside
# analyze_possession_shapes() and is not returned, so it does not exist at
# this level; this pipeline needs that table to be built or exposed first.
position_by_possession <- tracking_with_possession %>%
  group_by(player_name, position, in_possession) %>%
  summarise(
    avg_x = mean(x),
    avg_y = mean(y),
    .groups = "drop"
  )
# Plot side by side
library(patchwork)
# Each panel reuses visualize_formation() with the possession state as label
p1 <- position_by_possession %>%
  filter(in_possession == TRUE) %>%
  visualize_formation("In Possession")
p2 <- position_by_possession %>%
  filter(in_possession == FALSE) %>%
  visualize_formation("Out of Possession")
p1 + p2
Detecting Formation Changes
Teams often change formation during matches in response to game state, substitutions, or tactical adjustments. Detecting these changes helps understand tactical decision-making.
def detect_formation_changes(tracking_data, window_minutes=10):
    """
    Detect formation changes throughout the match.

    The match is cut into consecutive windows of ``window_minutes`` minutes;
    a formation is classified for every window that has at least 10
    averaged player positions, and a change is recorded whenever a window's
    formation differs from the previous classified window's.

    Parameters
    ----------
    tracking_data : pandas.DataFrame
        Tracking rows with a ``minute`` column plus whatever
        ``calculate_average_positions`` reads.
    window_minutes : int
        Window length in minutes.

    Returns
    -------
    dict
        ``timeline``: DataFrame with ``time_window``, ``formation``,
        ``minute``, ``prev_formation`` and ``formation_change``.
        ``changes``: DataFrame with ``minute``, ``from_formation`` and
        ``to_formation`` for each detected change.
    """
    tracking_data = tracking_data.copy()
    tracking_data['time_window'] = tracking_data['minute'] // window_minutes

    formations = []
    # Sorted groupby visits the windows in chronological order, which the
    # shift-based comparison below relies on.
    for window, window_data in tracking_data.groupby('time_window', sort=True):
        avg_positions = calculate_average_positions(window_data)
        if len(avg_positions) >= 10:
            formation_result = classify_formation(avg_positions)
            formations.append({
                'time_window': window,
                'formation': formation_result['formation'],
                'minute': window * window_minutes
            })

    # Explicit columns keep the frame usable when no window qualified
    formations_df = pd.DataFrame(
        formations, columns=['time_window', 'formation', 'minute']
    )

    # Detect changes. The first window has no predecessor (NaN after shift);
    # NaN != formation is True, so it must be excluded explicitly or the
    # opening formation is reported as a change.
    formations_df['prev_formation'] = formations_df['formation'].shift(1)
    formations_df['formation_change'] = (
        formations_df['prev_formation'].notna()
        & (formations_df['formation'] != formations_df['prev_formation'])
    )

    changes = formations_df[formations_df['formation_change']].copy()
    changes = changes.rename(columns={
        'prev_formation': 'from_formation',
        'formation': 'to_formation'
    })

    return {
        'timeline': formations_df,
        'changes': changes[['minute', 'from_formation', 'to_formation']]
    }


formation_changes = detect_formation_changes(home_tracking)
def plot_formation_timeline(formation_data, window_minutes=None):
    """
    Visualize formation changes over time.

    Parameters
    ----------
    formation_data : dict
        Needs a ``timeline`` DataFrame (``time_window``, ``minute``,
        ``formation``) and a ``changes`` DataFrame (``minute``,
        ``from_formation``, ``to_formation``).
    window_minutes : number, optional
        Width of each timeline bar in minutes. When omitted it is derived
        from the timeline (``minute / time_window`` of the first window after
        window 0), falling back to 10 if that is not possible.
    """
    timeline = formation_data['timeline']
    changes = formation_data['changes']

    # The bar width was previously read from a name that does not exist in
    # this scope (it is a parameter of detect_formation_changes), which
    # raised NameError on the first bar.
    if window_minutes is None:
        later_windows = timeline[timeline['time_window'] > 0]
        if len(later_windows) > 0:
            first_later = later_windows.iloc[0]
            window_minutes = first_later['minute'] / first_later['time_window']
        else:
            window_minutes = 10

    fig, ax = plt.subplots(figsize=(14, 3))

    # Create color map for formations
    unique_formations = timeline['formation'].unique()
    colors = plt.cm.Set3(np.linspace(0, 1, len(unique_formations)))
    color_map = dict(zip(unique_formations, colors))

    # Plot timeline: one horizontal bar per window, coloured by formation
    for _, row in timeline.iterrows():
        ax.barh(0, window_minutes, left=row['minute'],
                color=color_map[row['formation']], height=0.5)

    # Mark changes
    for _, change in changes.iterrows():
        ax.axvline(x=change['minute'], color='red',
                   linestyle='--', linewidth=2)
        ax.text(change['minute'], 0.6,
                f"{change['from_formation']} -> {change['to_formation']}",
                rotation=45, fontsize=8)

    ax.set_xlim(0, 95)
    ax.set_ylim(-0.5, 1)
    ax.set_xlabel('Match Minute')
    ax.set_title('Formation Changes During Match')
    ax.set_yticks([])

    # Legend
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=color_map[f], label=f)
                       for f in unique_formations]
    ax.legend(handles=legend_elements, loc='upper right')

    plt.tight_layout()
    plt.show()


plot_formation_timeline(formation_changes)
# Print detected changes
print("Formation Changes Detected:")
print(formation_changes['changes'])# Detect formation changes during match
# Detect formation changes during a match.
#
# tracking_data:  tracking rows with a minute column plus whatever
#                 calculate_average_positions() reads.
# window_minutes: length of each analysis window in minutes.
# Returns a list with `timeline` (time_window, formation, line_counts,
# formation_change) and `changes` (the timeline rows where the formation
# differs from the previous window, plus from_formation, to_formation and
# change_minute).
detect_formation_changes <- function(tracking_data, window_minutes = 10) {
  # Assign every row to a fixed-width time window
  tracking_data <- tracking_data %>%
    mutate(time_window = floor(minute / window_minutes))

  formations_over_time <- tracking_data %>%
    group_by(time_window) %>%
    group_modify(~ {
      avg_pos <- calculate_average_positions(.x)
      # Searching up to 4 lines needs at least 5 players; skip windows
      # that cannot be clustered instead of failing the whole timeline.
      if (nrow(avg_pos) < 5) {
        tibble()
      } else {
        formation <- classify_formation(avg_pos)
        tibble(
          formation = formation$formation,
          line_counts = list(formation$line_counts)
        )
      }
    }) %>%
    ungroup()

  # Keep the expected columns even if no window could be classified
  if (!"formation" %in% names(formations_over_time)) {
    formations_over_time <- tibble(
      time_window = numeric(),
      formation = character(),
      line_counts = list()
    )
  }

  # Detect changes. The previous window's formation is taken on the full
  # timeline: applying lag() after filtering to change rows would return
  # the previous *change*, not the formation that was actually replaced.
  with_previous <- formations_over_time %>%
    arrange(time_window) %>%
    mutate(
      prev_formation = lag(formation),
      formation_change = !is.na(prev_formation) & formation != prev_formation
    )

  changes <- with_previous %>%
    filter(formation_change) %>%
    mutate(
      from_formation = prev_formation,
      to_formation = formation,
      change_minute = time_window * window_minutes
    ) %>%
    select(-prev_formation)

  list(
    timeline = select(with_previous, -prev_formation),
    changes = changes
  )
}

formation_changes <- detect_formation_changes(tracking_data)
# Visualize formation timeline
# One tile per time window, coloured by formation; dashed red lines mark the
# minutes stored in formation_changes$changes.
# NOTE(review): `time_window * 10` hard-codes the default window_minutes of
# detect_formation_changes(); it must be changed together with that argument.
ggplot(formation_changes$timeline,
       aes(x = time_window * 10, y = 1, fill = formation)) +
  geom_tile(height = 0.8) +
  geom_vline(data = formation_changes$changes,
             aes(xintercept = change_minute),
             linetype = "dashed", color = "red") +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Formation Changes During Match",
    x = "Match Minute", y = "", fill = "Formation"
  ) +
  theme_minimal() +
  theme(axis.text.y = element_blank())
# Analyze what triggered changes
analyze_change_triggers <- function(changes, events, substitutions) {
change_analysis <- changes %>%
rowwise() %>%
mutate(
# Check for recent substitution
recent_sub = any(
substitutions$minute >= change_minute - 5 &
substitutions$minute <= change_minute + 5
),
# Check for recent goal
recent_goal = any(
events$type.name == "Shot" &
events$shot.outcome.name == "Goal" &
events$minute >= change_minute - 5 &
events$minute <= change_minute + 5
),
# Check score at time of change
score_at_change = get_score_at_minute(events, change_minute)
)
return(change_analysis)
}
Formation Matchup Analysis
How formations interact against each other creates advantages and disadvantages. Analyzing historical matchup data reveals which formations perform best against specific opponents.
def analyze_formation_matchups(match_data):
    """
    Summarise home-team results for every home/away formation pairing.

    Parameters
    ----------
    match_data : pandas.DataFrame
        One row per match with ``match_id``, ``home_formation``,
        ``away_formation``, ``home_xg``, ``away_xg``, ``home_goals`` and
        ``away_goals``.

    Returns
    -------
    pandas.DataFrame
        One row per formation pairing with ``matches``, ``avg_home_xg``,
        ``avg_away_xg``, ``avg_xg_diff``, ``home_wins`` and
        ``home_win_rate`` (percent). Pairings with fewer than 10 matches
        are dropped.
    """
    games = match_data.copy()

    # Per-match derived columns
    games['home_xg_diff'] = games['home_xg'] - games['away_xg']
    home_ahead = games['home_goals'] > games['away_goals']
    home_behind = games['home_goals'] < games['away_goals']
    games['home_result'] = np.where(
        home_ahead, 'Win', np.where(home_behind, 'Loss', 'Draw')
    )

    per_pairing = games.groupby(['home_formation', 'away_formation'])
    table = per_pairing.agg(
        matches=('match_id', 'count'),
        avg_home_xg=('home_xg', 'mean'),
        avg_away_xg=('away_xg', 'mean'),
        avg_xg_diff=('home_xg_diff', 'mean'),
        home_wins=('home_result', lambda outcomes: (outcomes == 'Win').sum()),
    ).reset_index()

    table['home_win_rate'] = table['home_wins'] / table['matches'] * 100

    # Filter for minimum sample size
    has_enough_matches = table['matches'] >= 10
    return table[has_enough_matches]
matchup_analysis = analyze_formation_matchups(season_matches)
def plot_matchup_heatmap(matchup_data):
    """
    Draw a home-vs-away formation grid coloured by mean xG difference and
    annotated with the home win rate.

    Parameters
    ----------
    matchup_data : pandas.DataFrame
        Needs ``home_formation``, ``away_formation``, ``avg_xg_diff`` and
        ``home_win_rate``, with at most one row per formation pairing.
    """
    def _grid(value_column):
        # Home formations as rows, away formations as columns
        return matchup_data.pivot(
            index='home_formation',
            columns='away_formation',
            values=value_column
        )

    pivot_xg = _grid('avg_xg_diff')
    pivot_winrate = _grid('home_win_rate')

    fig, ax = plt.subplots(figsize=(10, 8))
    im = ax.imshow(pivot_xg.values, cmap='RdYlGn', aspect='auto')

    # Write the win rate into every cell that has data
    for (row, col), rate in np.ndenumerate(pivot_winrate.values):
        if not np.isnan(rate):
            ax.text(col, row, f'{rate:.0f}%',
                    ha='center', va='center', fontsize=9)

    n_rows, n_cols = len(pivot_xg.index), len(pivot_xg.columns)
    ax.set_xticks(range(n_cols))
    ax.set_yticks(range(n_rows))
    ax.set_xticklabels(pivot_xg.columns, rotation=45, ha='right')
    ax.set_yticklabels(pivot_xg.index)
    ax.set_xlabel('Away Formation')
    ax.set_ylabel('Home Formation')
    ax.set_title('Formation Matchup Performance\n(Home team xG diff, win rate %)')

    plt.colorbar(im, label='xG Difference')
    plt.tight_layout()
    plt.show()


plot_matchup_heatmap(matchup_analysis)
def best_formations_against(matchup_data, opponent_formation):
    """
    List the five home formations with the highest mean xG difference
    against one away formation.

    Parameters
    ----------
    matchup_data : pandas.DataFrame
        Needs ``away_formation``, ``home_formation``, ``matches``,
        ``avg_xg_diff`` and ``home_win_rate``.
    opponent_formation : str
        Away formation to filter on, e.g. ``'4-3-3'``.

    Returns
    -------
    pandas.DataFrame
        Up to five rows, best ``avg_xg_diff`` first.
    """
    report_columns = ['home_formation', 'matches', 'avg_xg_diff', 'home_win_rate']
    facing_opponent = matchup_data['away_formation'] == opponent_formation
    top_five = matchup_data[facing_opponent].nlargest(5, 'avg_xg_diff')
    return top_five[report_columns]
print("Best formations against 4-3-3:")
print(best_formations_against(matchup_analysis, '4-3-3'))# Analyze formation matchup performance
# Summarise home-team results for every home/away formation pairing.
#
# match_data: one row per match with home_formation, away_formation,
#             home_xg, away_xg, home_goals and away_goals.
# Returns one row per pairing with match counts by result, mean xG figures
# and home_win_rate (percent); pairings with fewer than 10 matches are
# dropped.
analyze_formation_matchups <- function(match_data) {
  scored_matches <- match_data %>%
    mutate(
      home_xg_diff = home_xg - away_xg,
      home_result = case_when(
        home_goals > away_goals ~ "Win",
        home_goals < away_goals ~ "Loss",
        TRUE ~ "Draw"
      )
    )

  per_pairing <- scored_matches %>%
    group_by(home_formation, away_formation) %>%
    summarise(
      matches = n(),
      home_wins = sum(home_result == "Win"),
      draws = sum(home_result == "Draw"),
      home_losses = sum(home_result == "Loss"),
      avg_home_xg = mean(home_xg),
      avg_away_xg = mean(away_xg),
      avg_xg_diff = mean(home_xg_diff),
      .groups = "drop"
    ) %>%
    mutate(home_win_rate = home_wins / matches * 100)

  # Minimum sample size
  per_pairing %>%
    filter(matches >= 10)
}

matchup_analysis <- analyze_formation_matchups(season_matches)
# Heatmap of matchup performance
# Tile colour encodes the mean xG difference (diverging scale centred on 0);
# the text in each tile is the home win rate rounded to a whole percent.
ggplot(matchup_analysis, aes(x = away_formation, y = home_formation,
                             fill = avg_xg_diff)) +
  geom_tile() +
  geom_text(aes(label = paste0(round(home_win_rate, 0), "%")),
            color = "white", size = 3) +
  scale_fill_gradient2(low = "red", mid = "white", high = "green",
                       midpoint = 0, name = "xG Diff") +
  labs(
    title = "Formation Matchup Performance",
    subtitle = "Home team xG difference and win rate",
    x = "Away Formation", y = "Home Formation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Top five home formations by mean xG difference against one away formation.
#
# matchup_data:       data frame with away_formation and avg_xg_diff columns.
# opponent_formation: away formation to filter on, e.g. "4-3-3".
best_against <- function(matchup_data, opponent_formation) {
  versus_opponent <- filter(matchup_data, away_formation == opponent_formation)
  ranked <- arrange(versus_opponent, desc(avg_xg_diff))
  head(ranked, 5)
}
# Example: Best formations against 4-3-3
best_against(matchup_analysis, "4-3-3")
Practice Exercises
Exercise 24.1: Complete Formation Detection and Classification System
Task: Build a comprehensive formation detection system that identifies team formations from event data, validates classifications against known formations, and generates visual reports.
Requirements:
- Calculate average touch positions for each player from event data
- Implement multiple clustering methods (K-means, hierarchical) for formation detection
- Create a formation validation system that compares detected vs expected formations
- Generate confidence scores for formation classifications
- Visualize formations with player positions and line assignments
- Handle edge cases (players with few touches, unusual formations)
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.spatial import ConvexHull
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from statsbombpy import sb
# ============================================
# COMPLETE FORMATION DETECTION SYSTEM
# ============================================
# Define standard formations
# Keyed by formation label. "lines" holds the player count per line and
# "positions" a short tag per line, in the same order; the counts in each
# entry add up to ten players.
STANDARD_FORMATIONS = {
"4-4-2": {"lines": [4, 4, 2], "positions": ["DEF", "MID", "FWD"]},
"4-3-3": {"lines": [4, 3, 3], "positions": ["DEF", "MID", "FWD"]},
"3-5-2": {"lines": [3, 5, 2], "positions": ["DEF", "MID", "FWD"]},
"4-2-3-1": {"lines": [4, 2, 3, 1], "positions": ["DEF", "DM", "AM", "FWD"]},
"5-3-2": {"lines": [5, 3, 2], "positions": ["DEF", "MID", "FWD"]},
"4-1-4-1": {"lines": [4, 1, 4, 1], "positions": ["DEF", "DM", "MID", "FWD"]},
"3-4-3": {"lines": [3, 4, 3], "positions": ["DEF", "MID", "FWD"]}
}
def load_statsbomb_events(competition_id=43, season_id=3):
    """
    Fetch the events of the first match listed for a competition season.

    Parameters
    ----------
    competition_id, season_id : int
        Passed straight to ``sb.matches``.

    Returns
    -------
    tuple
        ``(events, sample_match)``: the result of ``sb.events`` for the
        first row of the match list, and that row itself.
    """
    fixtures = sb.matches(competition_id=competition_id, season_id=season_id)
    first_fixture = fixtures.iloc[0]
    match_events = sb.events(match_id=first_fixture['match_id'])
    return match_events, first_fixture
def calculate_touch_positions(events, team_name, min_touches=10):
    """
    Average on-ball event locations per player for one team.

    Parameters
    ----------
    events : pandas.DataFrame
        Needs ``team``, ``type``, ``location`` (list whose first two items
        are x and y), ``player_id``, ``player`` and ``position``.
    team_name : str
        Value of ``team`` to keep.
    min_touches : int
        Players with fewer located touches than this are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``player_id``, ``player_name``, ``position``, ``avg_x``,
        ``x_std``, ``touches``, ``avg_y``, ``y_std``; players whose position
        contains ``'Goalkeeper'`` are excluded.
    """
    touch_types = ['Pass', 'Carry', 'Shot', 'Dribble', 'Ball Receipt*', 'Clearance']

    is_team_touch = (
        (events['team'] == team_name)
        & events['type'].isin(touch_types)
        & events['location'].notna()
    )
    touch_events = events[is_team_touch].copy()

    def _coordinate(axis):
        # Pull one coordinate out of the location list; NaN for non-lists
        return touch_events['location'].apply(
            lambda loc: loc[axis] if isinstance(loc, list) else np.nan
        )

    touch_events['x'] = _coordinate(0)
    touch_events['y'] = _coordinate(1)

    # Per-player mean, spread and touch count
    avg_positions = (
        touch_events
        .groupby(['player_id', 'player', 'position'])
        .agg(
            avg_x=('x', 'mean'),
            x_std=('x', 'std'),
            touches=('x', 'count'),
            avg_y=('y', 'mean'),
            y_std=('y', 'std'),
        )
        .reset_index()
        .rename(columns={'player': 'player_name'})
    )

    # Filter by minimum touches and exclude goalkeeper
    has_enough_touches = avg_positions['touches'] >= min_touches
    is_goalkeeper = avg_positions['position'].str.contains('Goalkeeper', na=False)
    return avg_positions[has_enough_touches & ~is_goalkeeper]
def classify_formation_multi_method(avg_positions):
    """
    Classify formation using multiple clustering methods.

    K-means and Ward hierarchical clustering are both run on ``avg_x`` for
    2-4 lines and each picks its best k by silhouette score. Line
    assignments come from the k-means result; the hierarchical result is
    only used to report whether both methods agree on k. The detected line
    counts are then compared against ``STANDARD_FORMATIONS``.

    Parameters
    ----------
    avg_positions : pandas.DataFrame
        One row per outfield player with at least an ``avg_x`` column.

    Returns
    -------
    dict
        Always contains ``formation``, ``detected_formation``,
        ``matched_formation``, ``confidence``, ``line_counts``,
        ``player_positions`` (input plus a ``line`` column),
        ``silhouette_kmeans``, ``optimal_clusters``, ``method_agreement``
        and ``message``. With fewer than 10 players the formation fields are
        ``'Unknown'``, ``confidence`` is 0, the clustering fields are
        ``None`` and ``message`` explains why.
    """
    if len(avg_positions) < 10:
        # Same keys as the normal result so that consumers reading
        # 'detected_formation' or 'player_positions' do not hit KeyError;
        # 'formation', 'confidence' and 'message' keep their original values.
        return {
            'formation': 'Unknown',
            'detected_formation': 'Unknown',
            'matched_formation': 'Unknown',
            'confidence': 0,
            'line_counts': [],
            'player_positions': avg_positions.assign(line=np.nan),
            'silhouette_kmeans': None,
            'optimal_clusters': None,
            'method_agreement': None,
            'message': 'Insufficient players detected'
        }

    # Prepare position data
    x_positions = avg_positions['avg_x'].values.reshape(-1, 1)

    # Method 1: K-means clustering
    kmeans_results = {}
    for k in range(2, 5):
        km = KMeans(n_clusters=k, random_state=42, n_init=25)
        labels = km.fit_predict(x_positions)
        kmeans_results[k] = {
            'labels': labels,
            'silhouette': silhouette_score(x_positions, labels)
        }
    best_k_km = max(kmeans_results, key=lambda k: kmeans_results[k]['silhouette'])
    km_result = kmeans_results[best_k_km]

    # Method 2: Hierarchical clustering (only its preferred k is used)
    hc_silhouettes = {}
    for k in range(2, 5):
        hc = AgglomerativeClustering(n_clusters=k, linkage='ward')
        labels = hc.fit_predict(x_positions)
        hc_silhouettes[k] = silhouette_score(x_positions, labels)
    best_k_hc = max(hc_silhouettes, key=hc_silhouettes.get)

    # Assign lines using K-means result
    avg_positions = avg_positions.copy()
    avg_positions['line'] = km_result['labels']

    # Reorder lines by average x position (line 1 = lowest x)
    line_means = avg_positions.groupby('line')['avg_x'].mean().sort_values()
    line_mapping = {old: new for new, old in enumerate(line_means.index, 1)}
    avg_positions['line'] = avg_positions['line'].map(line_mapping)

    # Count players per line
    line_counts = avg_positions.groupby('line').size().sort_index().tolist()
    formation_string = '-'.join(map(str, line_counts))

    # Match to standard formations: 1 minus the share of players that would
    # have to change line; templates with a different number of lines score 0
    confidence_scores = {}
    for formation_name, formation_data in STANDARD_FORMATIONS.items():
        expected = formation_data['lines']
        if len(expected) != len(line_counts):
            confidence_scores[formation_name] = 0
        else:
            diff = sum(abs(e - d) for e, d in zip(expected, line_counts))
            confidence_scores[formation_name] = 1 - (diff / (2 * sum(line_counts)))

    best_match = max(confidence_scores, key=confidence_scores.get)
    confidence = max(confidence_scores.values())

    # Boost confidence for exact match
    if formation_string in STANDARD_FORMATIONS:
        confidence = max(confidence, 0.95)
        best_match = formation_string

    return {
        'formation': formation_string,
        'detected_formation': formation_string,
        'matched_formation': best_match,
        'confidence': round(confidence, 3),
        'line_counts': line_counts,
        'player_positions': avg_positions,
        'silhouette_kmeans': km_result['silhouette'],
        'optimal_clusters': best_k_km,
        'method_agreement': best_k_km == best_k_hc,
        'message': None
    }
def visualize_formation_detection(formation_result, team_name=""):
    """
    Visualize detected formation on a pitch.

    Parameters
    ----------
    formation_result : dict
        Result of the formation classifier. The keys read here are
        'player_positions' (DataFrame with avg_x, avg_y, position, line),
        'detected_formation', 'matched_formation', 'confidence' and
        'line_counts'.
    team_name : str, optional
        Text appended to the plot title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Local import keeps the block self-contained and out of the drawing loop.
    from matplotlib.patches import Ellipse, Rectangle

    fig, ax = plt.subplots(figsize=(14, 9))

    # Draw pitch (120 x 80 coordinate frame)
    ax.set_xlim(0, 120)
    ax.set_ylim(0, 80)
    ax.set_facecolor('#228B22')

    # Pitch markings
    ax.axvline(x=60, color='white', linewidth=1)
    ax.add_patch(plt.Circle((60, 40), 9.15, fill=False, color='white', linewidth=1))

    # Penalty areas
    for left_edge in (0, 102):
        ax.add_patch(Rectangle((left_edge, 18), 18, 44,
                               fill=False, color='white', linewidth=1))

    # Goals
    for left_edge in (-2, 120):
        ax.add_patch(Rectangle((left_edge, 30), 2, 20, fill=True, facecolor='white'))

    # Player data
    player_data = formation_result['player_positions']
    line_colors = {1: '#1f77b4', 2: '#ff7f0e', 3: '#2ca02c', 4: '#d62728'}

    # Plot players
    for _, player in player_data.iterrows():
        color = line_colors.get(player['line'], 'gray')
        ax.scatter(player['avg_x'], player['avg_y'], s=500, c=color,
                   edgecolors='white', linewidths=2, zorder=5)

        # Player position abbreviation
        pos_abbr = player['position'][:3].upper() if pd.notna(player['position']) else '?'
        ax.text(player['avg_x'], player['avg_y'], pos_abbr,
                ha='center', va='center', fontsize=8, fontweight='bold', color='white')

    # Draw line groupings (ellipses)
    for line_num in player_data['line'].unique():
        line_players = player_data[player_data['line'] == line_num]
        if len(line_players) < 2:
            continue

        x_extent = line_players['avg_x'].max() - line_players['avg_x'].min()
        y_extent = line_players['avg_y'].max() - line_players['avg_y'].min()

        # Ellipse `width` is the horizontal diameter and `height` the vertical
        # one. avg_x is plotted horizontally, so the x extent must be the width;
        # otherwise the outline lies across the pitch instead of around the line.
        ax.add_patch(Ellipse(
            (line_players['avg_x'].mean(), line_players['avg_y'].mean()),
            width=x_extent + 5,
            height=y_extent + 10,
            fill=False, color=line_colors.get(line_num, 'gray'),
            linestyle='--', linewidth=2
        ))

    # Title and labels
    ax.set_title(
        f"Detected Formation: {formation_result['detected_formation']} "
        f"(Matched: {formation_result['matched_formation']})\n"
        f"Confidence: {formation_result['confidence']*100:.1f}% | {team_name}",
        fontsize=14, fontweight='bold'
    )
    ax.set_xlabel('Pitch Length (meters)')
    ax.set_ylabel('Pitch Width (meters)')

    # Legend: one empty scatter per line that actually exists in the result
    n_lines = len(formation_result['line_counts'])
    legend_elements = [
        ax.scatter([], [], c=color, s=100, label=f'Line {num}')
        for num, color in line_colors.items() if num <= n_lines
    ]
    ax.legend(handles=legend_elements, loc='upper right')

    plt.tight_layout()
    plt.show()
def generate_formation_report(events, team_name):
    """
    Generate comprehensive formation report.

    Computes touch-based average positions for ``team_name``, classifies the
    formation, prints a text summary and returns the classifier result.

    Parameters
    ----------
    events : DataFrame
        Event data passed straight to ``calculate_touch_positions``.
    team_name : str
        Team to analyse.

    Returns
    -------
    dict
        The value returned by ``classify_formation_multi_method``.
    """
    print("=" * 55)
    print(f"FORMATION ANALYSIS REPORT: {team_name}")
    print("=" * 55)

    # Calculate positions
    positions = calculate_touch_positions(events, team_name)

    # Detect formation
    result = classify_formation_multi_method(positions)

    print(f"\nDetected Formation: {result['detected_formation']}")
    print(f"Best Match: {result['matched_formation']}")
    print(f"Confidence Score: {result['confidence']*100:.1f}%")

    print("\nLine Structure:")
    # Labels depend on how many lines were detected. A fixed four-name list
    # would call the forward line of a three-line shape "Midfield 2".
    labels_by_line_count = {
        2: ["Defensive", "Attacking"],
        3: ["Defensive", "Midfield", "Attacking"],
        4: ["Defensive", "Midfield 1", "Midfield 2", "Attacking"],
    }
    line_counts = result['line_counts']
    line_names = labels_by_line_count.get(len(line_counts))
    for i, count in enumerate(line_counts):
        # Fall back to a neutral label for an unexpected number of lines.
        label = line_names[i] if line_names else "Line"
        print(f"  Line {i+1} ({label}): {count} players")

    print("\nPlayer Assignments by Line:")
    assigned = result['player_positions']
    for line_num in sorted(assigned['line'].unique()):
        line_players = assigned[assigned['line'] == line_num]['player_name'].tolist()
        print(f"  Line {line_num}: {', '.join(line_players)}")

    print("\nQuality Metrics:")
    print(f"  Silhouette Score: {result['silhouette_kmeans']:.3f}")
    print(f"  Optimal Clusters: {result['optimal_clusters']}")
    print(f"  Method Agreement: {'Yes' if result['method_agreement'] else 'No'}")

    return result
# Example usage
# Loads events through load_statsbomb_events() (defined elsewhere in this file),
# analyses whichever team appears first in the 'team' column, and prints the report.
events, match_info = load_statsbomb_events()
team_name = events['team'].unique()[0]
formation_report = generate_formation_report(events, team_name)
visualize_formation_detection(formation_report, team_name)
library(tidyverse)
library(cluster)
library(factoextra)
library(StatsBombR)
# ============================================
# COMPLETE FORMATION DETECTION SYSTEM
# ============================================
# Load StatsBomb data
# Fetches the free competition list, keeps competition_id 43, and downloads
# events for the first five matches returned for it.
comps <- FreeCompetitions()
matches <- FreeMatches(Competitions = comps %>% filter(competition_id == 43))
events <- free_allevents(MatchesDF = matches[1:5,],Atea = TRUE)
# Clean event data
events_clean <- allclean(events)
# Define standard formations with expected line structures
# `lines` holds the number of outfield players per line; the classifier
# compares it element-wise with detected line counts ordered from the
# defensive to the attacking line. `positions` labels the lines in the same
# order (not read by the functions shown in this section).
standard_formations <- list(
"4-4-2" = list(lines = c(4, 4, 2), positions = c("DEF", "MID", "FWD")),
"4-3-3" = list(lines = c(4, 3, 3), positions = c("DEF", "MID", "FWD")),
"3-5-2" = list(lines = c(3, 5, 2), positions = c("DEF", "MID", "FWD")),
"4-2-3-1" = list(lines = c(4, 2, 3, 1), positions = c("DEF", "DM", "AM", "FWD")),
"5-3-2" = list(lines = c(5, 3, 2), positions = c("DEF", "MID", "FWD")),
"4-1-4-1" = list(lines = c(4, 1, 4, 1), positions = c("DEF", "DM", "MID", "FWD")),
"3-4-3" = list(lines = c(3, 4, 3), positions = c("DEF", "MID", "FWD"))
)
# Average on-ball location per outfield player, built from event data.
#
# events      : event data frame with team.name, type.name, location.x,
#               location.y, player.id, player.name and position.name columns.
# team_name   : team whose events are kept.
# min_touches : players with fewer located touches than this are dropped.
#
# Returns one row per (player.id, player.name, position.name) group with
# avg_x, avg_y, touches, x_sd and y_sd; goalkeeper rows are removed.
# NOTE(review): grouping includes position.name, so a player recorded in more
# than one position would appear once per position - confirm this is intended.
calculate_touch_positions <- function(events, team_name, min_touches = 10) {
  on_ball_types <- c(
    "Pass", "Carry", "Shot", "Dribble", "Ball Receipt*", "Clearance"
  )

  located_touches <- events %>%
    filter(
      team.name == team_name,
      type.name %in% on_ball_types,
      !is.na(location.x),
      !is.na(location.y)
    )

  per_player <- located_touches %>%
    group_by(player.id, player.name, position.name) %>%
    summarise(
      avg_x = mean(location.x, na.rm = TRUE),
      avg_y = mean(location.y, na.rm = TRUE),
      touches = n(),
      x_sd = sd(location.x, na.rm = TRUE),
      y_sd = sd(location.y, na.rm = TRUE),
      .groups = "drop"
    )

  per_player %>%
    filter(
      touches >= min_touches,
      !str_detect(position.name, "Goalkeeper")
    )
}
# Multi-method formation classification
#
# Clusters players' average x position into 2-4 lines with k-means (used for
# the final assignment) and Ward hierarchical clustering (used only to check
# whether both methods prefer the same number of lines), then matches the
# resulting line counts against `standard_formations`.
#
# avg_positions : data frame with one row per outfield player and at least an
#                 avg_x column (e.g. the output of calculate_touch_positions).
#
# Returns a list with detected_formation, matched_formation, confidence,
# line_counts, player_positions (input plus a `line` column, 1 = deepest),
# silhouette_kmeans, optimal_clusters and method_agreement. When fewer than
# ten players are supplied the same fields are returned with "Unknown"/NA
# placeholders, together with the legacy `formation` and `message` fields.
# k-means uses random starts, so results can vary between runs unless the
# caller sets a seed.
classify_formation_multi_method <- function(avg_positions) {
  if (nrow(avg_positions) < 10) {
    # Keep the result shape identical to the normal path so callers that read
    # detected_formation / player_positions$line do not fail.
    avg_positions$line <- rep(NA_integer_, nrow(avg_positions))
    return(list(
      formation = "Unknown",
      confidence = 0,
      message = "Insufficient players detected",
      detected_formation = "Unknown",
      matched_formation = "Unknown",
      line_counts = integer(0),
      player_positions = avg_positions,
      silhouette_kmeans = NA_real_,
      optimal_clusters = NA_integer_,
      method_agreement = NA
    ))
  }

  # Prepare position matrix (one column: average x)
  pos_matrix <- avg_positions %>%
    select(avg_x) %>%
    as.matrix()
  pos_dist <- dist(pos_matrix)  # reused by every silhouette computation
  candidate_k <- 2:4

  # Method 1: K-means clustering
  kmeans_results <- list()
  for (k in candidate_k) {
    km <- kmeans(pos_matrix, centers = k, nstart = 25)
    sil <- silhouette(km$cluster, pos_dist)
    kmeans_results[[as.character(k)]] <- list(
      clusters = km$cluster,
      silhouette = mean(sil[, 3]),
      centers = sort(km$centers[, 1])
    )
  }
  kmeans_sil <- vapply(kmeans_results, function(x) x$silhouette, numeric(1))
  best_k_kmeans <- names(which.max(kmeans_sil))
  km_result <- kmeans_results[[best_k_kmeans]]

  # Method 2: Hierarchical clustering
  hc <- hclust(pos_dist, method = "ward.D2")
  hc_results <- list()
  for (k in candidate_k) {
    hc_cluster <- cutree(hc, k = k)
    sil <- silhouette(hc_cluster, pos_dist)
    hc_results[[as.character(k)]] <- list(
      clusters = hc_cluster,
      silhouette = mean(sil[, 3])
    )
  }
  hc_sil <- vapply(hc_results, function(x) x$silhouette, numeric(1))
  best_k_hc <- names(which.max(hc_sil))

  # Use K-means result for line assignment
  avg_positions$line <- km_result$clusters

  # Reorder lines by average x position (defensive to attacking); cluster ids
  # from kmeans are arbitrary.
  line_order <- avg_positions %>%
    group_by(line) %>%
    summarise(mean_x = mean(avg_x), .groups = "drop") %>%
    arrange(mean_x) %>%
    mutate(new_line = row_number())

  avg_positions <- avg_positions %>%
    left_join(line_order %>% select(line, new_line), by = "line") %>%
    mutate(line = new_line) %>%
    select(-new_line)

  # Count players per line
  line_counts <- avg_positions %>%
    group_by(line) %>%
    summarise(n = n(), .groups = "drop") %>%
    arrange(line) %>%
    pull(n)

  formation_string <- paste(line_counts, collapse = "-")

  # Match to standard formations: score is 1 minus the share of players that
  # would have to change line; templates with a different line count score 0.
  confidence_scores <- vapply(
    names(standard_formations),
    function(f) {
      expected <- standard_formations[[f]]$lines
      if (length(expected) != length(line_counts)) {
        return(0)
      }
      1 - (sum(abs(expected - line_counts)) / (2 * sum(line_counts)))
    },
    numeric(1)
  )

  best_match <- names(which.max(confidence_scores))
  confidence <- max(confidence_scores)

  # If exact match found
  if (formation_string %in% names(standard_formations)) {
    confidence <- max(confidence, 0.95)
    best_match <- formation_string
  }

  list(
    detected_formation = formation_string,
    matched_formation = best_match,
    confidence = round(confidence, 3),
    line_counts = line_counts,
    player_positions = avg_positions,
    silhouette_kmeans = km_result$silhouette,
    optimal_clusters = as.integer(best_k_kmeans),
    method_agreement = best_k_kmeans == best_k_hc
  )
}
# Visualize detected formation
#
# formation_result : list returned by classify_formation_multi_method();
#                    player_positions must carry avg_x, avg_y, position.name
#                    and a `line` column.
# title_suffix     : text appended to the subtitle.
#
# Returns a ggplot object drawn on a 120 x 80 pitch.
visualize_formation_detection <- function(formation_result, title_suffix = "") {
  player_data <- formation_result$player_positions

  # Fail early with a clear message instead of a ggplot build error when the
  # classifier could not assign lines.
  if (is.null(player_data) ||
      !"line" %in% names(player_data) ||
      all(is.na(player_data$line))) {
    stop("formation_result contains no line assignments to plot", call. = FALSE)
  }

  # Position names here are full words (e.g. "Goalkeeper"), so the label is
  # the upper-cased first three letters, as in the Python version. The old
  # pattern "^[A-Z]{2,3}" needed two leading capitals and so produced only NA.
  player_data <- player_data %>%
    mutate(pos_label = str_to_upper(str_sub(position.name, 1, 3)))

  # Create pitch
  pitch <- ggplot() +
    # Pitch background
    annotate("rect", xmin = 0, xmax = 120, ymin = 0, ymax = 80,
             fill = "#228B22", color = "white", size = 1) +
    # Center line
    annotate("segment", x = 60, xend = 60, y = 0, yend = 80,
             color = "white", size = 0.5) +
    # Center circle
    annotate("path",
             x = 60 + 9.15 * cos(seq(0, 2 * pi, length.out = 100)),
             y = 40 + 9.15 * sin(seq(0, 2 * pi, length.out = 100)),
             color = "white", size = 0.5) +
    # Penalty areas
    annotate("rect", xmin = 0, xmax = 18, ymin = 18, ymax = 62,
             fill = NA, color = "white", size = 0.5) +
    annotate("rect", xmin = 102, xmax = 120, ymin = 18, ymax = 62,
             fill = NA, color = "white", size = 0.5)

  # Color by line
  line_colors <- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728")

  pitch +
    # Player positions
    geom_point(data = player_data,
               aes(x = avg_x, y = avg_y, color = factor(line)),
               size = 10) +
    geom_text(data = player_data,
              aes(x = avg_x, y = avg_y, label = pos_label),
              color = "white", size = 3, fontface = "bold") +
    # Ellipses for each line
    stat_ellipse(data = player_data,
                 aes(x = avg_x, y = avg_y, color = factor(line)),
                 level = 0.8, linetype = "dashed") +
    scale_color_manual(values = line_colors, name = "Line") +
    labs(
      title = paste0("Detected Formation: ", formation_result$detected_formation,
                     " (Matched: ", formation_result$matched_formation, ")"),
      subtitle = paste0("Confidence: ", round(formation_result$confidence * 100, 1),
                        "% | ", title_suffix),
      x = "Pitch Length (meters)", y = "Pitch Width (meters)"
    ) +
    coord_fixed(ratio = 1) +
    theme_minimal() +
    theme(
      panel.grid = element_blank(),
      plot.title = element_text(hjust = 0.5, face = "bold"),
      plot.subtitle = element_text(hjust = 0.5)
    )
}
# Generate formation report
#
# Computes touch positions for `team_name`, classifies the formation, prints a
# text report to the console and returns the classifier result unchanged.
# If the classifier could not determine a formation (no line counts in the
# result), a short notice is printed and the result is returned as is.
generate_formation_report <- function(events, team_name) {
  banner <- strrep("=", 51)
  cat(banner, "\n", sep = "")
  cat("FORMATION ANALYSIS REPORT:", team_name, "\n")
  cat(banner, "\n\n", sep = "")

  # Calculate positions
  positions <- calculate_touch_positions(events, team_name)

  # Detect formation
  result <- classify_formation_multi_method(positions)

  # The classifier returns a reduced "Unknown" result for small inputs; the
  # sections below would fail on it.
  if (is.null(result$detected_formation) || length(result$line_counts) == 0) {
    reason <- if (is.null(result$message)) "no lines detected" else result$message
    cat("Formation could not be determined:", reason, "\n")
    return(result)
  }

  cat("Detected Formation:", result$detected_formation, "\n")
  cat("Best Match:", result$matched_formation, "\n")
  cat("Confidence Score:", round(result$confidence * 100, 1), "%\n\n")

  cat("Line Structure:\n")
  # Labels depend on the number of detected lines. A fixed four-name vector
  # would call the forward line of a three-line shape "Midfield 2".
  n_lines <- length(result$line_counts)
  line_labels <- switch(
    as.character(n_lines),
    "2" = c("Defensive", "Attacking"),
    "3" = c("Defensive", "Midfield", "Attacking"),
    "4" = c("Defensive", "Midfield 1", "Midfield 2", "Attacking"),
    rep("Line", n_lines)
  )
  for (i in seq_len(n_lines)) {
    cat(" Line", i, "(", line_labels[i], "):", result$line_counts[i], "players\n")
  }

  cat("\nPlayer Assignments:\n")
  result$player_positions %>%
    arrange(line, avg_y) %>%
    group_by(line) %>%
    summarise(
      players = paste(player.name, collapse = ", "),
      .groups = "drop"
    ) %>%
    print()

  cat("\nQuality Metrics:\n")
  cat(" Silhouette Score:", round(result$silhouette_kmeans, 3), "\n")
  cat(" Optimal Clusters:", result$optimal_clusters, "\n")
  cat(" Method Agreement:", ifelse(result$method_agreement, "Yes", "No"), "\n")

  return(result)
}
# Example usage
# Analyses the first team name found in the cleaned events and prints its
# formation report; the returned result is kept for plotting.
team_name <- unique(events_clean$team.name)[1]
formation_report <- generate_formation_report(events_clean, team_name)
visualize_formation_detection(formation_report, team_name)
Exercise 24.2: In-Possession vs Out-of-Possession Shape Analysis
Task: Build a comprehensive shape comparison system that analyzes how a team's formation transforms between attacking and defending phases, including shape metrics, player movement patterns, and tactical visualizations.
Requirements:
- Determine possession state for each event/frame
- Calculate shape metrics (length, width, surface area, centroid) for both phases
- Measure individual player position changes between phases
- Identify which players move most between attacking/defending
- Create side-by-side formation visualizations
- Generate statistical comparison report
import pandas as pd
import numpy as np
from scipy.spatial import ConvexHull
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from statsbombpy import sb
# ============================================
# IN vs OUT OF POSSESSION SHAPE ANALYSIS
# ============================================
def load_match_events(competition_id=43, season_id=3):
    """Load sample match events.

    Looks up the matches for the given competition and season through
    ``sb.matches`` and returns the events of the first match listed.
    """
    match_table = sb.matches(competition_id=competition_id, season_id=season_id)
    first_match_id = match_table.iloc[0]['match_id']
    return sb.events(match_id=first_match_id)
def assign_possession_phase(events, team_name):
    """Determine possession phase for each event.

    Parameters
    ----------
    events : DataFrame
        Must contain 'team' and 'type' columns.
    team_name : str
        Team whose point of view defines the phase.

    Returns
    -------
    DataFrame
        Copy of the classifiable rows with two added columns:
        'is_team_event' (bool) and 'possession_phase'
        ('in_possession' or 'out_of_possession'). Events that match neither
        event list are dropped. The input frame is not modified.
    """
    possession_events = ['Pass', 'Carry', 'Shot', 'Dribble', 'Ball Receipt*',
                         'Foul Won', 'Dispossessed']
    defensive_events = ['Pressure', 'Tackle', 'Interception', 'Block',
                        'Clearance', 'Ball Recovery']

    events = events.copy()
    events['is_team_event'] = events['team'] == team_name

    # Column-wise masks replace a row-by-row apply: same labels, but it also
    # works on an empty frame and avoids a Python call per event.
    own = events['is_team_event']
    on_ball = events['type'].isin(possession_events)
    off_ball = events['type'].isin(defensive_events)

    phase = pd.Series(None, index=events.index, dtype=object)
    # Team on the ball
    phase.loc[own & on_ball] = 'in_possession'
    # Team defending, or opponent on the ball
    phase.loc[(own & off_ball) | (~own & on_ball)] = 'out_of_possession'

    events['possession_phase'] = phase
    return events[events['possession_phase'].notna()]
def calculate_shape_metrics(positions_df):
    """Calculate shape metrics for a set of positions.

    Parameters
    ----------
    positions_df : DataFrame
        One row per player with 'avg_x' and 'avg_y' columns.

    Returns
    -------
    dict or None
        None when fewer than three players are given. Otherwise a dict with
        centroid_x, centroid_y, length (x range), width (y range),
        surface_area (convex hull area, 0 when no hull can be built),
        compactness_ratio (length / width, 0 when width is 0),
        defensive_line (mean x of the 4 lowest-x players),
        attacking_line (mean x of the 3 highest-x players) and team_stretch.
    """
    if len(positions_df) < 3:
        return None

    xs = positions_df['avg_x']
    ys = positions_df['avg_y']

    metrics = {}

    # Centroid
    metrics['centroid_x'] = xs.mean()
    metrics['centroid_y'] = ys.mean()

    # Length and width
    metrics['length'] = xs.max() - xs.min()
    metrics['width'] = ys.max() - ys.min()

    # Surface area via convex hull. Qhull rejects degenerate input such as
    # collinear points; that case is reported as zero area. Only ordinary
    # exceptions are caught so KeyboardInterrupt/SystemExit still propagate.
    try:
        hull = ConvexHull(positions_df[['avg_x', 'avg_y']].values)
        metrics['surface_area'] = hull.volume  # for 2-D input, volume is the area
    except Exception:
        metrics['surface_area'] = 0

    # Compactness ratio
    metrics['compactness_ratio'] = (
        metrics['length'] / metrics['width'] if metrics['width'] > 0 else 0
    )

    # Defensive line (4 deepest players)
    metrics['defensive_line'] = positions_df.nsmallest(4, 'avg_x')['avg_x'].mean()

    # Attacking line (3 highest players)
    metrics['attacking_line'] = positions_df.nlargest(3, 'avg_x')['avg_x'].mean()

    # Team stretch
    metrics['team_stretch'] = metrics['attacking_line'] - metrics['defensive_line']

    return metrics
def calculate_positions_by_phase(events, team_name):
    """Calculate average positions for each possession phase.

    Labels events with ``assign_possession_phase`` and, using only the team's
    own events, averages the event locations per player for each phase.
    Goalkeepers and events without a list-valued 'location' are left out.

    Returns
    -------
    dict
        'in_possession' and 'out_of_possession', each a DataFrame with
        player_id, player_name, position, avg_x and avg_y.
    """
    phased = assign_possession_phase(events, team_name)
    own_events = phased[phased['team'] == team_name]

    def coordinate(index):
        # Picks one coordinate out of a [x, y] list; anything else becomes NaN.
        return lambda loc: loc[index] if isinstance(loc, list) else np.nan

    def mean_locations(phase_label):
        frame = own_events[own_events['possession_phase'] == phase_label].copy()
        frame['x'] = frame['location'].apply(coordinate(0))
        frame['y'] = frame['location'].apply(coordinate(1))

        has_x = frame['x'].notna()
        is_keeper = frame['position'].str.contains('Goalkeeper', na=False)
        outfield = frame[has_x & ~is_keeper]

        averaged = (
            outfield
            .groupby(['player_id', 'player', 'position'])
            .agg({'x': 'mean', 'y': 'mean'})
            .reset_index()
        )
        return averaged.rename(
            columns={'x': 'avg_x', 'y': 'avg_y', 'player': 'player_name'}
        )

    return {
        phase_label: mean_locations(phase_label)
        for phase_label in ('in_possession', 'out_of_possession')
    }
def calculate_player_movement(positions_in, positions_out):
    """Calculate how much each player moves between phases.

    Joins the two per-phase position tables on 'player_id' (players missing
    from either table are dropped) and returns the joined frame with
    x_movement, y_movement (in-possession minus out-of-possession),
    total_movement (Euclidean distance) and movement_type, sorted by
    total_movement in descending order.
    """
    out_xy = positions_out[['player_id', 'avg_x', 'avg_y']]
    joined = positions_in.merge(out_xy, on='player_id', suffixes=('_in', '_out'))

    dx = joined['avg_x_in'] - joined['avg_x_out']
    dy = joined['avg_y_in'] - joined['avg_y_out']
    joined['x_movement'] = dx
    joined['y_movement'] = dy
    joined['total_movement'] = np.sqrt(dx**2 + dy**2)

    # A shift of more than 5 units along x counts as a change of role.
    labels = pd.Series("Maintains position", index=joined.index, dtype=object)
    labels[dx > 5] = "Pushes forward in possession"
    labels[dx < -5] = "Drops back in possession"
    joined['movement_type'] = labels

    return joined.sort_values('total_movement', ascending=False)
def visualize_shape_comparison(positions_in, positions_out,
                               metrics_in, metrics_out, team_name):
    """Create side-by-side shape comparison visualization.

    Parameters
    ----------
    positions_in, positions_out : DataFrame
        Per-player average positions (avg_x, avg_y, position) for each phase.
    metrics_in, metrics_out : dict or None
        Shape metrics for each phase (centroid_x, centroid_y, length, width,
        surface_area). ``None`` is accepted; the centroid marker and metrics
        box are then left out of that panel.
    team_name : str
        Used in the figure title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(18, 9))

    def draw_pitch(ax, positions, metrics, title):
        # Pitch background
        ax.set_xlim(0, 120)
        ax.set_ylim(0, 80)
        ax.set_facecolor('#228B22')

        # Pitch markings
        ax.axvline(x=60, color='white', linewidth=1)
        for left_edge in (0, 102):
            ax.add_patch(patches.Rectangle((left_edge, 18), 18, 44,
                                           fill=False, color='white', linewidth=1))

        # Convex hull
        if len(positions) >= 3:
            points = positions[['avg_x', 'avg_y']].values
            try:
                hull = ConvexHull(points)
            except Exception:
                # Degenerate layouts (e.g. collinear players) have no hull;
                # the outline is skipped and the rest is still drawn.
                hull = None
            if hull is not None:
                # Repeat the first vertex so the outline closes.
                hull_points = np.vstack([points[hull.vertices],
                                         points[hull.vertices[0]]])
                ax.fill(hull_points[:, 0], hull_points[:, 1],
                        alpha=0.2, color='white', linestyle='--')
                ax.plot(hull_points[:, 0], hull_points[:, 1],
                        'w--', linewidth=2)

        # Centroid
        if metrics is not None:
            ax.scatter(metrics['centroid_x'], metrics['centroid_y'],
                       marker='x', s=200, c='yellow', linewidths=3, zorder=10)

        # Players
        for _, player in positions.iterrows():
            ax.scatter(player['avg_x'], player['avg_y'],
                       s=400, c='white', edgecolors='black', linewidths=2, zorder=5)
            pos_abbr = player['position'][:3].upper() if pd.notna(player['position']) else '?'
            ax.text(player['avg_x'], player['avg_y'], pos_abbr,
                    ha='center', va='center', fontsize=8, fontweight='bold')

        # Metrics box
        if metrics is not None:
            metrics_text = (f"Length: {metrics['length']:.1f}m\n"
                            f"Width: {metrics['width']:.1f}m\n"
                            f"Area: {metrics['surface_area']:.0f}m²")
            ax.text(100, 72, metrics_text, fontsize=10,
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Pitch Length (m)')
        ax.set_ylabel('Pitch Width (m)')

    draw_pitch(axes[0], positions_in, metrics_in, 'In Possession')
    draw_pitch(axes[1], positions_out, metrics_out, 'Out of Possession')

    fig.suptitle(f'{team_name} - Shape Comparison', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()
def generate_shape_comparison_report(events, team_name):
    """Generate comprehensive shape comparison report.

    Computes per-phase average positions, shape metrics and player movement
    for ``team_name``, prints a comparison table and the five largest
    positional changes, and shows the side-by-side pitch plot.

    Returns
    -------
    dict
        'metrics_in', 'metrics_out', 'movement' and 'positions'. The metrics
        entries are ``None`` when a phase has fewer than three players; in
        that case the table and plot are skipped and a notice is printed.
    """
    print("=" * 60)
    print(f"SHAPE COMPARISON REPORT: {team_name}")
    print("=" * 60)

    # Calculate positions
    positions = calculate_positions_by_phase(events, team_name)

    # Calculate metrics
    metrics_in = calculate_shape_metrics(positions['in_possession'])
    metrics_out = calculate_shape_metrics(positions['out_of_possession'])

    # Calculate movement
    movement = calculate_player_movement(
        positions['in_possession'],
        positions['out_of_possession']
    )

    report = {
        'metrics_in': metrics_in,
        'metrics_out': metrics_out,
        'movement': movement,
        'positions': positions
    }

    # calculate_shape_metrics returns None for fewer than three players;
    # subscripting it below would raise a TypeError.
    if metrics_in is None or metrics_out is None:
        print("\nNot enough outfield players with located events in one of "
              "the phases to compute shape metrics (at least 3 required).")
        return report

    # Print comparison
    print("\nSHAPE METRICS COMPARISON:")
    print("-" * 60)
    print(f"{'Metric':<25} {'In Poss':>12} {'Out Poss':>12} {'Diff':>12}")
    print("-" * 60)

    metric_rows = [
        ('Centroid X', 'centroid_x'),
        ('Centroid Y', 'centroid_y'),
        ('Length', 'length'),
        ('Width', 'width'),
        ('Surface Area', 'surface_area'),
        ('Defensive Line', 'defensive_line'),
        ('Attacking Line', 'attacking_line'),
        ('Team Stretch', 'team_stretch'),
    ]
    for name, key in metric_rows:
        in_val = metrics_in[key]
        out_val = metrics_out[key]
        diff = in_val - out_val
        print(f"{name:<25} {in_val:>12.1f} {out_val:>12.1f} {diff:>+12.1f}")

    print("\n\nPLAYER MOVEMENT ANALYSIS:")
    print("-" * 60)
    print("Top 5 players with most positional change:\n")

    top_movers = movement.head(5)[['player_name', 'position', 'x_movement',
                                   'y_movement', 'total_movement', 'movement_type']]
    print(top_movers.to_string(index=False))

    # Visualize
    visualize_shape_comparison(
        positions['in_possession'],
        positions['out_of_possession'],
        metrics_in,
        metrics_out,
        team_name
    )

    return report
# Run analysis
# Loads the default sample match and analyses whichever team appears first
# in the 'team' column of its events.
events = load_match_events()
team_name = events['team'].unique()[0]
report = generate_shape_comparison_report(events, team_name)
library(tidyverse)
library(sf)
library(patchwork)
library(StatsBombR)
# ============================================
# IN vs OUT OF POSSESSION SHAPE ANALYSIS
# ============================================
# Load data
# Fetches the free competition list, keeps competition_id 43, downloads the
# events of the first match returned for it, and cleans them with allclean().
comps <- FreeCompetitions()
matches <- FreeMatches(Competitions = comps %>% filter(competition_id == 43))
events <- free_allevents(MatchesDF = matches[1,],Atea = TRUE)
events_clean <- allclean(events)
# Label every event with the possession phase it represents for `team_name`.
#
# Adds is_team_event (team.name == team_name) and possession_phase
# ("in_possession" / "out_of_possession"), then drops events that fit
# neither event list.
assign_possession_phase <- function(events, team_name) {
  on_ball_types <- c(
    "Pass", "Carry", "Shot", "Dribble", "Ball Receipt*",
    "Foul Won", "Dispossessed"
  )
  off_ball_types <- c(
    "Pressure", "Tackle", "Interception", "Block",
    "Clearance", "Ball Recovery"
  )

  labelled <- mutate(
    events,
    is_team_event = team.name == team_name,
    possession_phase = case_when(
      # Team has possession
      is_team_event & type.name %in% on_ball_types ~ "in_possession",
      # Team defending
      is_team_event & type.name %in% off_ball_types ~ "out_of_possession",
      # Opponent events when team doesn't have ball
      !is_team_event & type.name %in% on_ball_types ~ "out_of_possession",
      TRUE ~ NA_character_
    )
  )

  filter(labelled, !is.na(possession_phase))
}
# Calculate shape metrics for a set of positions
#
# positions_df : data frame with avg_x and avg_y columns, one row per player.
#
# Returns NULL for fewer than three rows, otherwise a one-row tibble with
# centroid_x, centroid_y, length (x range), width (y range), surface_area
# (convex hull area, NA if the hull cannot be computed), compactness_ratio
# (length / width, 0 when width is 0), defensive_line (mean x of the 4
# lowest-x players), attacking_line (mean x of the 3 highest-x players) and
# team_stretch.
calculate_shape_metrics <- function(positions_df) {
  if (nrow(positions_df) < 3) return(NULL)

  metrics <- list()

  # Centroid
  metrics$centroid_x <- mean(positions_df$avg_x, na.rm = TRUE)
  metrics$centroid_y <- mean(positions_df$avg_y, na.rm = TRUE)

  # Length (spread along x)
  metrics$length <- max(positions_df$avg_x) - min(positions_df$avg_x)

  # Width (spread along y)
  metrics$width <- max(positions_df$avg_y) - min(positions_df$avg_y)

  # Surface area via convex hull. The value is returned from tryCatch():
  # assigning to `metrics` inside the error handler would only change the
  # handler's local copy and leave surface_area missing from the result.
  metrics$surface_area <- tryCatch(
    {
      points_sf <- st_as_sf(positions_df,
                            coords = c("avg_y", "avg_x"),
                            crs = NA)
      hull <- st_convex_hull(st_union(points_sf))
      as.numeric(st_area(hull))
    },
    error = function(e) NA_real_
  )

  # Compactness ratio; 0 instead of Inf/NaN when all players share one y
  # value, matching the Python implementation.
  metrics$compactness_ratio <- if (metrics$width > 0) {
    metrics$length / metrics$width
  } else {
    0
  }

  # Defensive line height (average of 4 deepest)
  defensive <- positions_df %>% arrange(avg_x) %>% head(4)
  metrics$defensive_line <- mean(defensive$avg_x)

  # Attacking line height (average of 3 highest)
  attacking <- positions_df %>% arrange(desc(avg_x)) %>% head(3)
  metrics$attacking_line <- mean(attacking$avg_x)

  # Team stretch (distance between def and attack lines)
  metrics$team_stretch <- metrics$attacking_line - metrics$defensive_line

  as_tibble(metrics)
}
# Per-player average event location for each possession phase.
#
# Events are labelled with assign_possession_phase(); only the team's own
# events with a location are used and goalkeepers are excluded.
#
# Returns list(in_possession = <tibble>, out_of_possession = <tibble>), each
# with player.id, player.name, position.name, avg_x, avg_y, touches, phase.
calculate_positions_by_phase <- function(events, team_name) {
  phased_events <- assign_possession_phase(events, team_name)

  # Same aggregation for both phases; only the phase key and label differ.
  average_for_phase <- function(wanted_phase, display_label) {
    phased_events %>%
      filter(
        team.name == team_name,
        possession_phase == wanted_phase,
        !is.na(location.x), !is.na(location.y),
        !str_detect(position.name, "Goalkeeper")
      ) %>%
      group_by(player.id, player.name, position.name) %>%
      summarise(
        avg_x = mean(location.x, na.rm = TRUE),
        avg_y = mean(location.y, na.rm = TRUE),
        touches = n(),
        .groups = "drop"
      ) %>%
      mutate(phase = display_label)
  }

  list(
    in_possession = average_for_phase("in_possession", "In Possession"),
    out_of_possession = average_for_phase("out_of_possession", "Out of Possession")
  )
}
# Positional shift of each player between the two phases.
#
# Joins the per-phase tables on player.id (players missing from either phase
# are dropped) and returns x/y movement (in-possession minus
# out-of-possession), the Euclidean total and a movement_type label, sorted
# by total_movement in descending order.
calculate_player_movement <- function(positions_in, positions_out) {
  with_ball <- select(
    positions_in,
    player.id, player.name, position.name,
    x_in = avg_x, y_in = avg_y
  )
  without_ball <- select(
    positions_out,
    player.id, x_out = avg_x, y_out = avg_y
  )

  shifts <- inner_join(with_ball, without_ball, by = "player.id") %>%
    mutate(
      x_movement = x_in - x_out, # Positive = higher up when attacking
      y_movement = y_in - y_out,
      total_movement = sqrt(x_movement^2 + y_movement^2),
      movement_type = case_when(
        x_movement > 5 ~ "Pushes forward in possession",
        x_movement < -5 ~ "Drops back in possession",
        TRUE ~ "Maintains position"
      )
    )

  arrange(shifts, desc(total_movement))
}
# Visualize both phases side by side
#
# positions_in / positions_out : per-player averages (avg_x, avg_y,
#                                position.name) for each phase.
# metrics_in / metrics_out     : shape metrics for each phase; NULL is
#                                accepted, in which case the centroid marker
#                                and metrics label are omitted for that panel.
# team_name                    : used in the overall title.
#
# Returns the combined patchwork plot (not printed here).
visualize_shape_comparison <- function(positions_in, positions_out,
                                       metrics_in, metrics_out, team_name) {
  # Create pitch plot function
  create_pitch_plot <- function(positions, metrics, title) {
    # Convex hull from base grDevices::chull(). stat_chull() belongs to
    # ggpubr, which this script never attaches.
    hull_df <- if (nrow(positions) >= 3) {
      positions[chull(positions$avg_x, positions$avg_y), , drop = FALSE]
    } else {
      positions[0, , drop = FALSE]
    }

    # Position names are full words, so use the upper-cased first three
    # letters (as the Python version does). The previous pattern
    # "^[A-Z]{2,3}" needed two leading capitals and so matched nothing.
    positions <- positions %>%
      mutate(pos_label = str_to_upper(str_sub(position.name, 1, 3)))

    # Layers that need metrics are optional; adding NULL to a ggplot is a no-op.
    centroid_layer <- NULL
    metrics_layer <- NULL
    if (!is.null(metrics)) {
      centroid_layer <- annotate(
        "point", x = metrics$centroid_x, y = metrics$centroid_y,
        shape = 4, size = 6, color = "yellow", stroke = 2
      )
      metrics_layer <- annotate(
        "label", x = 100, y = 75,
        label = paste0("Length: ", round(metrics$length, 1), "m\n",
                       "Width: ", round(metrics$width, 1), "m\n",
                       "Area: ", round(metrics$surface_area, 0), "m²"),
        size = 3, fill = "white", alpha = 0.8
      )
    }

    ggplot(positions, aes(x = avg_x, y = avg_y)) +
      # Pitch background
      annotate("rect", xmin = 0, xmax = 120, ymin = 0, ymax = 80,
               fill = "#228B22", alpha = 0.9) +
      # Pitch markings
      annotate("segment", x = 60, xend = 60, y = 0, yend = 80, color = "white") +
      annotate("rect", xmin = 0, xmax = 18, ymin = 18, ymax = 62,
               fill = NA, color = "white") +
      annotate("rect", xmin = 102, xmax = 120, ymin = 18, ymax = 62,
               fill = NA, color = "white") +
      # Centroid
      centroid_layer +
      # Convex hull
      geom_polygon(data = hull_df, fill = "white", alpha = 0.2,
                   color = "white", linetype = "dashed") +
      # Players
      geom_point(size = 8, color = "white") +
      geom_text(aes(label = pos_label), size = 2.5, fontface = "bold") +
      # Metrics annotation
      metrics_layer +
      labs(title = title) +
      coord_fixed(ratio = 1, xlim = c(0, 120), ylim = c(0, 80)) +
      theme_void() +
      theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12))
  }

  # Create both plots
  p1 <- create_pitch_plot(positions_in, metrics_in, "In Possession")
  p2 <- create_pitch_plot(positions_out, metrics_out, "Out of Possession")

  # Combine
  combined <- p1 + p2 +
    plot_annotation(
      title = paste0(team_name, " - Shape Comparison"),
      theme = theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
    )

  return(combined)
}
# Generate comprehensive comparison report
#
# Computes per-phase positions, shape metrics and player movement for
# `team_name`, prints a metric comparison table and the five largest
# positional changes, prints the side-by-side plot, and returns
# list(metrics_in, metrics_out, movement, positions).
#
# Metrics that are unavailable (NULL metrics for a phase with fewer than three
# players, or a missing field) are shown as NA instead of aborting the
# report; the plot is skipped when either phase has no metrics.
generate_shape_comparison_report <- function(events, team_name) {
  cat("\n", strrep("=", 60), "\n", sep = "")
  cat("SHAPE COMPARISON REPORT:", team_name, "\n")
  cat(strrep("=", 60), "\n\n", sep = "")

  # Calculate positions
  positions <- calculate_positions_by_phase(events, team_name)

  # Calculate metrics for each phase
  metrics_in <- calculate_shape_metrics(positions$in_possession)
  metrics_out <- calculate_shape_metrics(positions$out_of_possession)

  # Calculate player movement
  movement <- calculate_player_movement(
    positions$in_possession,
    positions$out_of_possession
  )

  # Display name -> field name in the metrics object
  metric_fields <- c(
    "Centroid X" = "centroid_x",
    "Centroid Y" = "centroid_y",
    "Length" = "length",
    "Width" = "width",
    "Surface Area" = "surface_area",
    "Defensive Line" = "defensive_line",
    "Attacking Line" = "attacking_line",
    "Team Stretch" = "team_stretch"
  )

  # Always yields one numeric per field so the tibble columns line up even
  # when `m` is NULL or lacks a field.
  extract_metrics <- function(m) {
    vapply(
      unname(metric_fields),
      function(field) {
        value <- m[[field]]
        if (is.null(value) || length(value) == 0) NA_real_ else as.numeric(value[1])
      },
      numeric(1)
    )
  }

  # Print shape comparison
  rule <- strrep("-", 41)
  cat("SHAPE METRICS COMPARISON:\n")
  cat(rule, "\n", sep = "")
  cat(sprintf("%-25s %12s %12s %12s\n",
              "Metric", "In Poss", "Out Poss", "Difference"))
  cat(rule, "\n", sep = "")

  metrics_df <- tibble(
    Metric = names(metric_fields),
    In_Poss = unname(extract_metrics(metrics_in)),
    Out_Poss = unname(extract_metrics(metrics_out))
  ) %>%
    mutate(
      Difference = In_Poss - Out_Poss,
      Change_Pct = round((Difference / Out_Poss) * 100, 1)
    )

  print(metrics_df, n = 8)

  cat("\n\nPLAYER MOVEMENT ANALYSIS:\n")
  cat(rule, "\n", sep = "")
  cat("Top 5 players with most positional change:\n\n")

  movement %>%
    head(5) %>%
    select(player.name, position.name, x_movement, y_movement,
           total_movement, movement_type) %>%
    print()

  # Visualize
  if (is.null(metrics_in) || is.null(metrics_out)) {
    cat("\nShape plot skipped: fewer than 3 outfield players in one phase.\n")
  } else {
    plot <- visualize_shape_comparison(
      positions$in_possession,
      positions$out_of_possession,
      metrics_in,
      metrics_out,
      team_name
    )
    print(plot)
  }

  return(list(
    metrics_in = metrics_in,
    metrics_out = metrics_out,
    movement = movement,
    positions = positions
  ))
}
# Run analysis
# Uses the first team name found in the cleaned events.
team_name <- unique(events_clean$team.name)[1]
shape_report <- generate_shape_comparison_report(events_clean, team_name)
Exercise 24.3: Formation Matchup Recommendation System
Task: Build a data-driven formation recommendation system that analyzes historical matchup data and suggests optimal formations to use against specific opponent systems.
Requirements:
- Create a matchup database from historical match data
- Calculate performance metrics (xG diff, win rate, goals) for each formation combination
- Implement confidence scoring based on sample size and variance
- Build a recommendation engine that considers opponent formation, team strengths, and historical performance
- Generate visual matchup heatmaps and recommendation reports
- Include scenario analysis for different tactical objectives (defensive, balanced, attacking)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# ============================================
# FORMATION MATCHUP RECOMMENDATION SYSTEM
# ============================================
def generate_matchup_database(n_matches=500, seed=42):
    """Generate synthetic matchup database.

    Parameters
    ----------
    n_matches : int
        Number of synthetic matches to create.
    seed : int or None
        Seed passed to ``np.random.seed`` before sampling. The default (42)
        reproduces the original fixed data set. Note that this reseeds
        NumPy's global random state.

    Returns
    -------
    DataFrame
        Columns: match_id, home_formation, away_formation, home_xg, away_xg,
        home_goals, away_goals, home_result ('Win'/'Draw'/'Loss') and
        home_points (3/1/0).
    """
    np.random.seed(seed)

    formations = ["4-4-2", "4-3-3", "4-2-3-1", "3-5-2", "5-3-2",
                  "4-1-4-1", "3-4-3", "4-5-1"]
    formation_probs = [0.15, 0.25, 0.20, 0.08, 0.05, 0.10, 0.07, 0.10]

    # Generate matches
    matches = pd.DataFrame({
        'match_id': range(1, n_matches + 1),
        'home_formation': np.random.choice(formations, n_matches, p=formation_probs),
        'away_formation': np.random.choice(formations, n_matches, p=formation_probs)
    })

    # Matchup advantages, keyed by (home formation, away formation); pairs not
    # listed get no adjustment.
    advantages = {
        ('4-3-3', '4-4-2'): 0.3,
        ('4-2-3-1', '4-3-3'): 0.2,
        ('3-5-2', '4-3-3'): 0.15,
        ('4-4-2', '3-5-2'): -0.1,
        ('5-3-2', '4-3-3'): 0.1
    }
    # A plain lookup per pair (instead of a row-wise DataFrame.apply) also
    # works when n_matches is 0.
    advantage = np.array(
        [advantages.get(pair, 0)
         for pair in zip(matches['home_formation'], matches['away_formation'])],
        dtype=float
    )

    # Generate outcomes (xG is floored so Poisson rates stay positive)
    matches['home_xg'] = np.maximum(0.5, np.random.normal(1.3 + advantage, 0.4))
    matches['away_xg'] = np.maximum(0.3, np.random.normal(1.1 - advantage * 0.5, 0.35))

    matches['home_goals'] = np.random.poisson(matches['home_xg'])
    matches['away_goals'] = np.random.poisson(matches['away_xg'])

    matches['home_result'] = np.where(
        matches['home_goals'] > matches['away_goals'], 'Win',
        np.where(matches['home_goals'] < matches['away_goals'], 'Loss', 'Draw')
    )
    matches['home_points'] = np.where(
        matches['home_result'] == 'Win', 3,
        np.where(matches['home_result'] == 'Draw', 1, 0)
    )

    return matches
def calculate_matchup_stats(matches, min_sample=5):
    """Calculate performance statistics for each matchup.

    Parameters
    ----------
    matches : DataFrame
        One row per match with match_id, home_formation, away_formation,
        home_xg, away_xg, home_goals, away_goals, home_result and home_points.
    min_sample : int
        Matchups with fewer matches than this are removed from the result.

    Returns
    -------
    DataFrame
        One row per (home_formation, away_formation) pair with counts,
        averages, win_rate, draw_rate, xg_diff, goal_diff, a confidence value
        (60% sample size, 40% inverse spread of per-match xG difference) and
        a performance_score.
    """
    # Per-match helper columns so every aggregate below is a plain reduction.
    per_match = matches.assign(
        _xg_diff=matches['home_xg'] - matches['away_xg'],
        _is_win=matches['home_result'] == 'Win',
        _is_draw=matches['home_result'] == 'Draw',
    )

    # Named aggregation ties each output column to its source explicitly,
    # instead of renaming a MultiIndex by position. The result is not called
    # `stats`, so the module-level scipy `stats` import is not shadowed.
    summary = (
        per_match
        .groupby(['home_formation', 'away_formation'])
        .agg(
            matches_played=('match_id', 'count'),
            wins=('_is_win', 'sum'),
            avg_home_xg=('home_xg', 'mean'),
            # SD of the per-match xG difference, as the column name states
            # (previously this held the SD of home xG only).
            xg_diff_sd=('_xg_diff', 'std'),
            avg_away_xg=('away_xg', 'mean'),
            avg_home_goals=('home_goals', 'mean'),
            avg_away_goals=('away_goals', 'mean'),
            avg_points=('home_points', 'mean'),
            draws=('_is_draw', 'sum'),
        )
        .reset_index()
    )

    summary['win_rate'] = summary['wins'] / summary['matches_played']
    summary['draw_rate'] = summary['draws'] / summary['matches_played']
    summary['xg_diff'] = summary['avg_home_xg'] - summary['avg_away_xg']
    summary['goal_diff'] = summary['avg_home_goals'] - summary['avg_away_goals']

    # Calculate confidence: sample size saturates at 30 matches; a single-match
    # group has no SD and is treated as SD = 1.
    summary['sample_score'] = np.minimum(1, summary['matches_played'] / 30)
    summary['variance_score'] = 1 / (1 + summary['xg_diff_sd'].fillna(1))
    summary['confidence'] = (
        summary['sample_score'] * 0.6 + summary['variance_score'] * 0.4
    ).round(2)

    # Performance score. The draw term uses the observed draw share; it used
    # to be derived from the win rate alone and could go negative.
    summary['performance_score'] = (
        summary['win_rate'] * 40 +
        summary['draw_rate'] * 10 +
        summary['xg_diff'] * 20 +
        summary['goal_diff'] * 10
    ).round(1)

    return summary[summary['matches_played'] >= min_sample]
def recommend_formation(matchup_stats, opponent_formation, tactical_objective='balanced'):
    """Recommend the best formation against an opponent formation.

    Args:
        matchup_stats: DataFrame with one row per matchup. Reads
            ``home_formation``, ``away_formation``, ``matches_played``,
            ``win_rate``, ``avg_home_xg``, ``xg_diff``, ``confidence`` and
            ``performance_score``; the defensive objective also reads
            ``loss_rate`` or, if that column is absent, ``avg_points``.
        opponent_formation: Formation the opponent is expected to play.
        tactical_objective: 'defensive', 'attacking', or anything else for
            the unadjusted ('balanced') ranking.

    Returns:
        dict with the top formation, its confidence / expected xG
        difference / win rate (percent) / sample size, and a DataFrame of
        up to three alternatives. When no matchup involves the opponent
        formation, only ``recommendation``, ``confidence`` and
        ``alternatives`` are present.
    """
    relevant = matchup_stats[matchup_stats['away_formation'] == opponent_formation].copy()
    if len(relevant) == 0:
        return {
            'recommendation': 'No data available',
            'confidence': 0,
            'alternatives': None
        }

    # Adjust scoring
    if tactical_objective == 'defensive':
        if 'loss_rate' in relevant.columns:
            loss_rate = relevant['loss_rate']
        else:
            # With 3/1/0 points, avg_points = 3 * win_rate + draw_rate, so
            # loss_rate = 1 - win_rate - draw_rate = 1 + 2 * win_rate - avg_points.
            loss_rate = (
                1 + 2 * relevant['win_rate'] - relevant['avg_points']
            ).clip(lower=0, upper=1)
        # Reward avoiding defeat (draws count), not just winning.
        relevant['adjusted_score'] = (
            relevant['performance_score'] +
            (1 - loss_rate) * 20 -
            relevant['xg_diff'] * 5
        )
    elif tactical_objective == 'attacking':
        relevant['adjusted_score'] = (
            relevant['performance_score'] +
            relevant['avg_home_xg'] * 15 +
            relevant['win_rate'] * 10
        )
    else:
        relevant['adjusted_score'] = relevant['performance_score']

    relevant = relevant.sort_values('adjusted_score', ascending=False)
    top = relevant.iloc[0]
    alts = relevant.iloc[1:4]

    return {
        'opponent_formation': opponent_formation,
        'tactical_objective': tactical_objective,
        'recommendation': top['home_formation'],
        'confidence': top['confidence'],
        'expected_xg_diff': round(top['xg_diff'], 2),
        'expected_win_rate': round(top['win_rate'] * 100, 1),
        'sample_size': int(top['matches_played']),
        'alternatives': alts[['home_formation', 'win_rate', 'xg_diff', 'confidence']].copy()
    }
def create_matchup_heatmap(matchup_stats):
    """Draw the formation-vs-formation matrix coloured by xG difference.

    Rows are home formations and columns are away formations. Every cell
    that has data is labelled with the home win rate (percent) and, in
    parentheses, the xG difference. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    grid_axes = {'index': 'home_formation', 'columns': 'away_formation'}
    xg_grid = matchup_stats.pivot(values='xg_diff', **grid_axes)
    win_grid = matchup_stats.pivot(values='win_rate', **grid_axes)

    fig, ax = plt.subplots(figsize=(12, 10))

    # Colour layer only; the cell labels are written manually below.
    sns.heatmap(
        xg_grid, annot=False, cmap='RdYlGn', center=0, linewidths=0.5, ax=ax
    )

    # Heatmap cells are unit squares, so +0.5 lands on each cell centre.
    for row_pos, home in enumerate(xg_grid.index):
        for col_pos, away in enumerate(xg_grid.columns):
            xg_val = xg_grid.loc[home, away]
            win_val = win_grid.loc[home, away]
            if pd.isna(xg_val):
                continue  # matchup absent from the data
            label = f"{win_val*100:.0f}%\n({xg_val:.2f})"
            ax.text(
                col_pos + 0.5, row_pos + 0.5, label,
                ha='center', va='center', fontsize=8, color='white'
            )

    ax.set_title(
        'Formation Matchup Performance Matrix\nWin Rate % and (xG Difference)',
        fontsize=14, fontweight='bold'
    )
    ax.set_xlabel('Opponent Formation', fontsize=12)
    ax.set_ylabel('Your Formation', fontsize=12)

    plt.tight_layout()
    plt.show()
def generate_recommendation_report(opponent_formation, matchup_stats):
    """Print a recommendation report for facing ``opponent_formation``.

    For each tactical objective (defensive, balanced, attacking) the
    recommended formation from ``recommend_formation`` is printed with its
    metrics and alternatives, followed by a table of every matchup against
    the opponent formation ordered by performance score.

    Args:
        opponent_formation: Formation the opponent is expected to play.
        matchup_stats: Matchup statistics table passed on to
            ``recommend_formation``.
    """
    print("\n" + "=" * 65)
    print("FORMATION RECOMMENDATION REPORT")
    print(f"Opponent Formation: {opponent_formation}")
    print("=" * 65 + "\n")

    objectives = ['defensive', 'balanced', 'attacking']
    for obj in objectives:
        rec = recommend_formation(matchup_stats, opponent_formation, obj)
        print(f"{obj.upper()} APPROACH:")
        print("-" * 45)
        print(f" Recommended Formation: {rec['recommendation']}")

        # The no-data result only carries recommendation / confidence /
        # alternatives, so the metric lines below would raise KeyError.
        if 'expected_xg_diff' not in rec:
            print()
            continue

        print(f" Confidence: {rec['confidence']*100:.0f}%")
        print(f" Expected xG Difference: {rec['expected_xg_diff']}")
        print(f" Historical Win Rate: {rec['expected_win_rate']}%")
        print(f" Sample Size: {rec['sample_size']} matches")

        if rec['alternatives'] is not None and len(rec['alternatives']) > 0:
            print("\n Alternatives:")
            for idx, (_, alt) in enumerate(rec['alternatives'].iterrows(), 1):
                print(f" {idx}. {alt['home_formation']} "
                      f"({alt['win_rate']*100:.1f}% win, {alt['xg_diff']:.2f} xG diff)")
        print()

    # Full analysis
    all_matchups = (
        matchup_stats[matchup_stats['away_formation'] == opponent_formation]
        .sort_values('performance_score', ascending=False)
    )
    print(f"\nCOMPLETE MATCHUP ANALYSIS vs {opponent_formation}:")
    print("-" * 65)
    print(all_matchups[['home_formation', 'matches_played', 'win_rate',
                        'xg_diff', 'avg_points', 'confidence']].to_string(index=False))
# Main execution: simulate the match database, then summarise each matchup
matches = generate_matchup_database(n_matches=800)
matchup_stats = calculate_matchup_stats(matches=matches, min_sample=8)
# Create heatmap
create_matchup_heatmap(matchup_stats=matchup_stats)
# Generate reports
generate_recommendation_report(
    opponent_formation="4-3-3",
    matchup_stats=matchup_stats,
)
generate_recommendation_report("4-4-2", matchup_stats)
library(tidyverse)
library(scales)
# ============================================
# FORMATION MATCHUP RECOMMENDATION SYSTEM
# ============================================
# Simulate a historical formation-matchup database.
#
# Args:
#   n_matches: Number of matches to simulate (0 yields an empty tibble).
#   seed: Value passed to set.seed() so repeated calls return the same
#     data. Use NULL to leave the session's RNG state untouched.
#
# Returns:
#   A tibble with one row per match: match_id, home_formation,
#   away_formation, home_xg, away_xg, home_goals, away_goals,
#   home_result ("Win" / "Draw" / "Loss") and home_points (3 / 1 / 0).
generate_matchup_database <- function(n_matches = 500, seed = 42) {
  formations <- c("4-4-2", "4-3-3", "4-2-3-1", "3-5-2", "5-3-2",
                  "4-1-4-1", "3-4-3", "4-5-1")
  # Sampling weights, aligned with `formations`; shared by home and away.
  formation_probs <- c(0.15, 0.25, 0.20, 0.08, 0.05, 0.10, 0.07, 0.10)

  # Generate match data
  if (!is.null(seed)) {
    set.seed(seed)
  }

  matches <- tibble(
    # seq_len() rather than 1:n so that n_matches = 0 gives zero rows.
    match_id = seq_len(n_matches),
    home_formation = sample(formations, n_matches, replace = TRUE,
                            prob = formation_probs),
    away_formation = sample(formations, n_matches, replace = TRUE,
                            prob = formation_probs)
  )

  # xG shift applied to the home side for specific matchups; pairs not
  # listed here get no shift.
  matchup_advantages <- tibble(
    home = c("4-3-3", "4-2-3-1", "3-5-2", "4-4-2", "5-3-2"),
    away = c("4-4-2", "4-3-3", "4-3-3", "3-5-2", "4-3-3"),
    advantage = c(0.3, 0.2, 0.15, -0.1, 0.1)
  )

  matches <- matches %>%
    left_join(matchup_advantages,
              by = c("home_formation" = "home", "away_formation" = "away")) %>%
    mutate(
      advantage = replace_na(advantage, 0),
      home_base_xg = 1.3 + advantage,
      away_base_xg = 1.1 - advantage * 0.5,
      # Noisy xG with a floor, then Poisson goals drawn from that xG.
      home_xg = pmax(0.5, rnorm(n(), home_base_xg, 0.4)),
      away_xg = pmax(0.3, rnorm(n(), away_base_xg, 0.35)),
      home_goals = rpois(n(), home_xg),
      away_goals = rpois(n(), away_xg),
      home_result = case_when(
        home_goals > away_goals ~ "Win",
        home_goals < away_goals ~ "Loss",
        TRUE ~ "Draw"
      ),
      home_points = case_when(
        home_result == "Win" ~ 3,
        home_result == "Draw" ~ 1,
        TRUE ~ 0
      )
    ) %>%
    select(-advantage, -home_base_xg, -away_base_xg)

  matches
}
# Summarise historical results for every home/away formation pairing.
#
# Args:
#   matches: One row per match with home_formation, away_formation,
#     home_result, home_xg, away_xg, home_goals, away_goals, home_points.
#   min_sample: Pairings with fewer matches than this are dropped.
#
# Returns:
#   One row per remaining pairing with result counts and rates, average
#   xG / goals / points, a confidence value and a performance score.
calculate_matchup_stats <- function(matches, min_sample = 5) {
  by_matchup <- group_by(matches, home_formation, away_formation)

  per_matchup <- summarise(
    by_matchup,
    matches_played = n(),
    wins = sum(home_result == "Win"),
    draws = sum(home_result == "Draw"),
    losses = sum(home_result == "Loss"),
    win_rate = wins / matches_played,
    draw_rate = draws / matches_played,
    loss_rate = losses / matches_played,
    avg_home_xg = mean(home_xg),
    avg_away_xg = mean(away_xg),
    xg_diff = avg_home_xg - avg_away_xg,
    xg_diff_sd = sd(home_xg - away_xg),
    avg_home_goals = mean(home_goals),
    avg_away_goals = mean(away_goals),
    goal_diff = avg_home_goals - avg_away_goals,
    avg_points = mean(home_points),
    .groups = "drop"
  )

  well_sampled <- filter(per_matchup, matches_played >= min_sample)

  mutate(
    well_sampled,
    # Confidence blends sample size (capped at 30 matches) with stability
    # of the xG margin; a missing sd is treated as 1.
    sample_score = pmin(1, matches_played / 30),
    variance_score = 1 / (1 + coalesce(xg_diff_sd, 1)),
    confidence = round(sample_score * 0.6 + variance_score * 0.4, 2),
    # Weighted blend of results and underlying numbers.
    performance_score = round(
      win_rate * 40 + draw_rate * 10 + xg_diff * 20 + goal_diff * 10,
      1
    )
  )
}
# Pick the best-scoring formation to use against an opponent formation.
#
# Args:
#   matchup_stats: Matchup table as produced by calculate_matchup_stats().
#   opponent_formation: Formation the opponent is expected to play.
#   tactical_objective: "defensive", "attacking", or anything else for the
#     unadjusted ("balanced") ranking.
#
# Returns:
#   A list with the recommended formation, its confidence, expected xG
#   difference, win rate (percent), sample size and up to three
#   alternatives. With no matching rows, only recommendation, confidence
#   and alternatives are present.
recommend_formation <- function(matchup_stats, opponent_formation,
                                tactical_objective = "balanced") {
  candidates <- arrange(
    filter(matchup_stats, away_formation == opponent_formation),
    desc(performance_score)
  )

  if (nrow(candidates) == 0) {
    return(list(
      recommendation = "No data available",
      confidence = 0,
      alternatives = NULL
    ))
  }

  # Re-rank according to the tactical objective.
  ranked <- candidates %>%
    mutate(
      adjusted_score = case_when(
        tactical_objective == "defensive" ~
          performance_score + (1 - loss_rate) * 20 - xg_diff * 5,
        tactical_objective == "attacking" ~
          performance_score + avg_home_xg * 15 + win_rate * 10,
        TRUE ~ performance_score # balanced
      )
    ) %>%
    arrange(desc(adjusted_score))

  best <- slice_head(ranked, n = 1)

  # Next three options, with display-friendly rounding.
  runners_up <- ranked %>%
    slice(-1) %>%
    slice_head(n = 3) %>%
    select(home_formation, win_rate, xg_diff, confidence) %>%
    mutate(
      win_rate = round(win_rate * 100, 1),
      xg_diff = round(xg_diff, 2)
    )

  list(
    opponent_formation = opponent_formation,
    tactical_objective = tactical_objective,
    recommendation = best$home_formation,
    confidence = best$confidence,
    expected_xg_diff = round(best$xg_diff, 2),
    expected_win_rate = round(best$win_rate * 100, 1),
    sample_size = best$matches_played,
    alternatives = runners_up
  )
}
# Build the formation-vs-formation heatmap.
#
# Args:
#   matchup_stats: Matchup table with home_formation, away_formation,
#     win_rate and the column named by `metric`.
#   metric: Name of the numeric column used for the tile fill and shown in
#     parentheses in each tile label.
#
# Returns:
#   A ggplot object; print() it to draw the chart.
create_matchup_heatmap <- function(matchup_stats, metric = "xg_diff") {
  if (!is.character(metric) || length(metric) != 1 ||
      !metric %in% names(matchup_stats)) {
    stop("`metric` must name a single column of `matchup_stats`.",
         call. = FALSE)
  }

  # Legend and subtitle follow the chosen metric; the default keeps the
  # original xG wording.
  is_xg_diff <- metric == "xg_diff"
  legend_title <- if (is_xg_diff) "xG Difference" else metric
  subtitle_text <- if (is_xg_diff) {
    "Win rate % and (xG difference) shown"
  } else {
    paste0("Win rate % and (", metric, ") shown")
  }

  # Copy the metric into `value` instead of renaming it, so that
  # metric = "win_rate" still leaves win_rate available for the labels.
  heatmap_long <- matchup_stats %>%
    mutate(value = .data[[metric]]) %>%
    select(home_formation, away_formation, value, win_rate, confidence)

  ggplot(heatmap_long, aes(x = away_formation, y = home_formation,
                           fill = value)) +
    geom_tile(color = "white", size = 0.5) +
    geom_text(aes(label = paste0(round(win_rate * 100, 0), "%\n",
                                 "(", round(value, 2), ")")),
              size = 2.5, color = "white") +
    scale_fill_gradient2(
      low = "#d73027", mid = "#ffffbf", high = "#1a9850",
      midpoint = 0, name = legend_title
    ) +
    labs(
      title = "Formation Matchup Performance Matrix",
      subtitle = subtitle_text,
      x = "Opponent Formation", y = "Your Formation"
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
      axis.text.y = element_text(size = 9),
      plot.title = element_text(hjust = 0.5, face = "bold"),
      plot.subtitle = element_text(hjust = 0.5),
      panel.grid = element_blank()
    )
}
# Print a full recommendation report for facing `opponent_formation`.
#
# Args:
#   opponent_formation: Formation the opponent is expected to play.
#   matchup_stats: Matchup table as produced by calculate_matchup_stats().
#   team_available_formations: Optional character vector; when supplied,
#     only these home formations are considered. NULL considers all.
#
# Returns:
#   Invisibly, the table printed in the final matchup analysis section.
generate_recommendation_report <- function(opponent_formation,
                                           matchup_stats,
                                           team_available_formations = NULL) {
  # Restrict the candidate pool to formations the team can actually play.
  if (!is.null(team_available_formations)) {
    matchup_stats <- matchup_stats %>%
      filter(home_formation %in% team_available_formations)
  }

  cat("\n", rep("=", 65), "\n", sep = "")
  cat("FORMATION RECOMMENDATION REPORT\n")
  cat("Opponent Formation:", opponent_formation, "\n")
  cat(rep("=", 65), "\n\n", sep = "")

  # Generate recommendations for different objectives
  objectives <- c("defensive", "balanced", "attacking")
  for (obj in objectives) {
    rec <- recommend_formation(matchup_stats, opponent_formation, obj)
    cat(toupper(obj), "APPROACH:\n")
    cat("-", rep("-", 40), "\n", sep = "")
    cat(" Recommended Formation:", rec$recommendation, "\n")

    # The no-data result has no metric fields; skip the lines that would
    # otherwise print blank values.
    if (is.null(rec$expected_xg_diff)) {
      cat("\n")
      next
    }

    cat(" Confidence:", rec$confidence * 100, "%\n")
    cat(" Expected xG Difference:", rec$expected_xg_diff, "\n")
    cat(" Historical Win Rate:", rec$expected_win_rate, "%\n")
    cat(" Sample Size:", rec$sample_size, "matches\n\n")

    if (!is.null(rec$alternatives) && nrow(rec$alternatives) > 0) {
      cat(" Alternatives:\n")
      for (i in seq_len(nrow(rec$alternatives))) {
        alt <- rec$alternatives[i, ]
        cat(" ", i, ". ", alt$home_formation,
            " (", alt$win_rate, "% win, ", alt$xg_diff, " xG diff)\n", sep = "")
      }
    }
    cat("\n")
  }

  # Matchup analysis
  all_matchups <- matchup_stats %>%
    filter(away_formation == opponent_formation) %>%
    arrange(desc(performance_score))

  cat("\nCOMPLETE MATCHUP ANALYSIS vs", opponent_formation, ":\n")
  cat("-", rep("-", 60), "\n", sep = "")
  print(
    all_matchups %>%
      select(home_formation, matches_played, win_rate, xg_diff,
             avg_points, confidence) %>%
      mutate(
        win_rate = percent(win_rate, accuracy = 0.1),
        xg_diff = round(xg_diff, 2),
        avg_points = round(avg_points, 2)
      ),
    n = 10
  )
}
# Main execution: simulate the match database, then summarise each matchup
matches <- generate_matchup_database(n_matches = 800)
matchup_stats <- calculate_matchup_stats(matches = matches, min_sample = 8)
# Create heatmap
print(create_matchup_heatmap(matchup_stats = matchup_stats))
# Generate recommendation report
generate_recommendation_report(
  opponent_formation = "4-3-3",
  matchup_stats = matchup_stats
)
generate_recommendation_report("4-4-2", matchup_stats)
Summary
Key Takeaways
- Formation detection uses clustering algorithms to identify positional lines and classify team shapes from average positions
- Shape metrics (length, width, surface area, defensive line height) quantify tactical organization beyond simple formation labels
- In-possession vs out-of-possession shapes reveal how teams transform between attacking and defending phases
- Formation changes during matches often correlate with substitutions, goals, or tactical adjustments
- Matchup analysis identifies which formations perform best against specific opponent systems, informing tactical preparation