Capstone - Complete Analytics System
Capstone Project Overview
This final chapter brings together everything you've learned throughout this textbook into a comprehensive, production-ready football analytics system. You'll build a complete platform that handles data ingestion, analysis, visualization, and reporting.
Capstone Goal
Build "FootballAnalyticsPro" - a modular analytics platform that can support a professional club's scouting, performance analysis, and tactical planning needs.
System Architecture
┌─────────────────────────────────────────────────────────────────────────────┐
│ FootballAnalyticsPro Platform │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │
│ │ Data Layer │ │ Analytics Layer │ │ Presentation │ │
│ │ │ │ │ │ Layer │ │
│ │ - Ingestion │───▶│ - xG Models │───▶│ - Dashboards │ │
│ │ - Validation │ │ - Player Ratings │ │ - Reports │ │
│ │ - Storage │ │ - Tactical │ │ - API │ │
│ │ - Transformation│ │ - Recruitment │ │ - Exports │ │
│ └─────────────────┘ └─────────────────┘ └─────────────────┘ │
│ │ │ │ │
│ └─────────────────────┼──────────────────────┘ │
│ ▼ │
│ ┌─────────────────────┐ │
│ │ Orchestration │ │
│ │ - Scheduling │ │
│ │ - Monitoring │ │
│ │ - Logging │ │
│ └─────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
import os
import logging
from datetime import datetime, date
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any
import json
# FootballAnalyticsPro - Main System Class
class FootballAnalyticsPro:
    """
    Complete football analytics platform bringing together
    data management, analytics, and reporting capabilities.

    The three layers (data, analytics, presentation) are created lazily
    on first property access, so constructing the platform is cheap.
    """

    def __init__(self, config_path: str = "config.yml"):
        """Initialize configuration, logging, and lazy layer slots.

        Args:
            config_path: Path to a YAML configuration file. Currently
                unused -- ``_load_config`` returns built-in defaults.
        """
        print("Initializing FootballAnalyticsPro Platform...")
        # Load configuration
        self.config = self._load_config(config_path)
        # Initialize logging
        self.logger = self._setup_logging()
        # Layers are instantiated on first access (lazy loading)
        self._data_layer = None
        self._analytics_layer = None
        self._presentation_layer = None
        self.log("INFO", "Platform initialized successfully")
        print("FootballAnalyticsPro ready!")

    def _load_config(self, path: str) -> Dict:
        """Load or create default configuration.

        NOTE(review): ``path`` is accepted but never read -- this always
        returns the built-in defaults below. Wire up a real YAML loader
        if file-based configuration is needed.
        """
        return {
            "database": {
                "host": "localhost",
                "port": 5432,
                "name": "football_analytics",
                "user": "analyst"
            },
            "models": {
                "xg_model_path": "models/xg_model.pkl",
                "player_value_path": "models/player_value.pkl"
            },
            "output": {
                "report_dir": "reports/",
                "dashboard_url": "http://localhost:8050"
            },
            "logging": {
                "level": "INFO",
                "file": "logs/app.log"
            }
        }

    def _setup_logging(self) -> logging.Logger:
        """Configure and return the shared platform logger.

        BUGFIX: only attach a StreamHandler when the logger has none.
        ``logging.getLogger`` returns a process-wide singleton, so the
        previous version added a fresh handler on every instantiation,
        duplicating every log line.
        """
        logger = logging.getLogger("FootballAnalyticsPro")
        logger.setLevel(logging.INFO)
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                "[%(asctime)s] %(levelname)s: %(message)s"
            )
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        return logger

    def log(self, level: str, message: str) -> None:
        """Log *message* at *level* ("INFO", "ERROR", ...)."""
        getattr(self.logger, level.lower())(message)

    @property
    def data_layer(self):
        """Lazy initialization of the data layer."""
        if self._data_layer is None:
            self._data_layer = DataLayer(self.config["database"])
        return self._data_layer

    @property
    def analytics_layer(self):
        """Lazy initialization of the analytics layer."""
        if self._analytics_layer is None:
            self._analytics_layer = AnalyticsLayer(self.config["models"])
        return self._analytics_layer

    @property
    def presentation_layer(self):
        """Lazy initialization of the presentation layer."""
        if self._presentation_layer is None:
            self._presentation_layer = PresentationLayer(self.config["output"])
        return self._presentation_layer

    def run_daily_pipeline(self, pipeline_date: Optional[date] = None) -> Dict:
        """Run the daily ingest -> analyze -> report pipeline.

        Args:
            pipeline_date: Date to process; defaults to today.

        Returns:
            ``{"status": "success", "matches_processed": n}`` on success,
            ``{"status": "error", "message": ...}`` on failure.
        """
        if pipeline_date is None:
            pipeline_date = date.today()
        self.log("INFO", f"Starting daily pipeline for {pipeline_date}")
        try:
            # Step 1: Ingest new data
            self.log("INFO", "Step 1: Data ingestion")
            new_matches = self.data_layer.ingest_daily_data(pipeline_date)
            # Step 2: Run analytics
            self.log("INFO", "Step 2: Running analytics")
            analytics_results = self.analytics_layer.process_matches(new_matches)
            # Step 3: Generate outputs
            self.log("INFO", "Step 3: Generating outputs")
            self.presentation_layer.generate_daily_report(
                analytics_results, pipeline_date
            )
            self.log("INFO", "Daily pipeline completed successfully")
            return {
                "status": "success",
                "matches_processed": len(new_matches)
            }
        except Exception as e:
            # Broad catch is deliberate: the pipeline boundary reports
            # failure as data instead of crashing a scheduler.
            self.log("ERROR", f"Pipeline failed: {str(e)}")
            return {"status": "error", "message": str(e)}

    def analyze_match(self, match_id: str) -> Dict:
        """Perform full analysis of a single match."""
        self.log("INFO", f"Analyzing match: {match_id}")
        match_data = self.data_layer.get_match(match_id)
        analysis = self.analytics_layer.full_match_analysis(match_data)
        return analysis

    def scout_players(self, criteria: Dict) -> Dict:
        """Run the search -> evaluate -> report scouting workflow."""
        self.log("INFO", "Running player scouting workflow")
        candidates = self.data_layer.search_players(criteria)
        evaluated = self.analytics_layer.evaluate_players(candidates)
        report = self.presentation_layer.generate_scouting_report(evaluated)
        return report
print("FootballAnalyticsPro system class defined")
print("Usage: platform = FootballAnalyticsPro()")
library(tidyverse)
library(R6)
library(DBI)
library(jsonlite)
# FootballAnalyticsPro - Main System Class
FootballAnalyticsPro <- R6Class("FootballAnalyticsPro",
  public = list(
    config = NULL,
    data_layer = NULL,
    analytics_layer = NULL,
    presentation_layer = NULL,
    logger = NULL,

    # Wire up configuration, logging, and the three layers in order.
    initialize = function(config_path = "config.yml") {
      message("Initializing FootballAnalyticsPro Platform...")
      self$config <- self$load_config(config_path)
      self$logger <- self$setup_logging()
      self$data_layer <- DataLayer$new(self$config$database)
      self$analytics_layer <- AnalyticsLayer$new(self$config$models)
      self$presentation_layer <- PresentationLayer$new(self$config$output)
      self$log("INFO", "Platform initialized successfully")
      message("FootballAnalyticsPro ready!")
    },

    # Built-in default configuration; config_path is currently unused.
    load_config = function(path) {
      database_cfg <- list(
        host = "localhost",
        port = 5432,
        name = "football_analytics",
        user = "analyst"
      )
      models_cfg <- list(
        xg_model_path = "models/xg_model.rds",
        player_value_path = "models/player_value.rds"
      )
      output_cfg <- list(
        report_dir = "reports/",
        dashboard_url = "http://localhost:8050"
      )
      logging_cfg <- list(
        level = "INFO",
        file = "logs/app.log"
      )
      list(
        database = database_cfg,
        models = models_cfg,
        output = output_cfg,
        logging = logging_cfg
      )
    },

    # Minimal console logger: a list exposing one timestamped writer.
    setup_logging = function() {
      list(
        log = function(level, msg) {
          stamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
          cat(sprintf("[%s] %s: %s\n", stamp, level, msg))
        }
      )
    },

    log = function(level, message) {
      self$logger$log(level, message)
    },

    # Main workflow orchestration: ingest -> analyse -> report.
    run_daily_pipeline = function(date = Sys.Date()) {
      self$log("INFO", paste("Starting daily pipeline for", date))
      on_error <- function(e) {
        self$log("ERROR", paste("Pipeline failed:", e$message))
        list(status = "error", message = e$message)
      }
      tryCatch({
        self$log("INFO", "Step 1: Data ingestion")
        new_matches <- self$data_layer$ingest_daily_data(date)
        self$log("INFO", "Step 2: Running analytics")
        analytics_results <- self$analytics_layer$process_matches(new_matches)
        self$log("INFO", "Step 3: Generating outputs")
        self$presentation_layer$generate_daily_report(analytics_results, date)
        self$log("INFO", "Daily pipeline completed successfully")
        list(status = "success", matches_processed = length(new_matches))
      }, error = on_error)
    },

    # Full analysis of a single match.
    analyze_match = function(match_id) {
      self$log("INFO", paste("Analyzing match:", match_id))
      match_data <- self$data_layer$get_match(match_id)
      self$analytics_layer$full_match_analysis(match_data)
    },

    # Search -> evaluate -> report scouting workflow.
    scout_players = function(criteria) {
      self$log("INFO", "Running player scouting workflow")
      candidates <- self$data_layer$search_players(criteria)
      evaluated <- self$analytics_layer$evaluate_players(candidates)
      self$presentation_layer$generate_scouting_report(evaluated)
    }
  )
)
# Example usage (kept commented so sourcing this file stays free of
# side effects beyond defining the class):
# platform <- FootballAnalyticsPro$new()
message("FootballAnalyticsPro system class defined")
message("Usage: platform <- FootballAnalyticsPro$new()")
Data Layer Implementation
The data layer handles all data ingestion, validation, transformation, and storage operations. It provides a clean interface for the analytics layer to access data.
import pandas as pd
import numpy as np
from datetime import date, datetime
from typing import Dict, List, Optional, Any
from dataclasses import dataclass
import sqlite3
# Data Layer Component
class DataLayer:
    """
    Handles all data ingestion, validation, transformation, and storage.

    Uses an in-memory SQLite database for demonstration; a production
    deployment would point ``db_config`` at a real server.
    """

    def __init__(self, db_config: Dict):
        """Store config and register validators; no DB connection yet."""
        self.db_config = db_config
        self.connection: Optional[sqlite3.Connection] = None
        self.validators = self._setup_validators()
        print("DataLayer initialized")

    def connect(self) -> None:
        """Establish the database connection and ensure the schema exists."""
        # Using SQLite for demonstration
        self.connection = sqlite3.connect(":memory:")
        self._initialize_schema()

    def disconnect(self) -> None:
        """Close the database connection (no-op if never connected)."""
        if self.connection:
            self.connection.close()

    def _initialize_schema(self) -> None:
        """Create the matches/events/players tables (idempotent)."""
        cursor = self.connection.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS matches (
                match_id TEXT PRIMARY KEY,
                date TEXT,
                home_team TEXT,
                away_team TEXT,
                home_score INTEGER,
                away_score INTEGER,
                competition TEXT,
                season TEXT
            )
        """)
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS events (
                event_id TEXT PRIMARY KEY,
                match_id TEXT,
                minute INTEGER,
                second INTEGER,
                team TEXT,
                player TEXT,
                event_type TEXT,
                x REAL,
                y REAL,
                end_x REAL,
                end_y REAL,
                outcome TEXT,
                xg REAL,
                FOREIGN KEY (match_id) REFERENCES matches(match_id)
            )
        """)
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS players (
                player_id TEXT PRIMARY KEY,
                name TEXT,
                position TEXT,
                team TEXT,
                birth_date TEXT,
                nationality TEXT,
                market_value REAL
            )
        """)
        self.connection.commit()

    def _setup_validators(self) -> Dict:
        """Map data_type -> validation callable over a DataFrame.

        Each validator returns a single boolean for the whole frame.
        Event coordinates are checked against a 120 x 80 pitch.
        """
        return {
            "match": lambda df: all(
                col in df.columns
                for col in ["match_id", "date", "home_team", "away_team"]
            ),
            "event": lambda df: (
                all(col in df.columns
                    for col in ["event_id", "match_id", "event_type", "x", "y"])
                and df["x"].between(0, 120).all()
                and df["y"].between(0, 80).all()
            ),
            "player": lambda df: all(
                col in df.columns
                for col in ["player_id", "name", "position"]
            )
        }

    def validate(self, data: pd.DataFrame, data_type: str) -> bool:
        """Validate *data* against the registered schema for *data_type*.

        Raises:
            ValueError: if *data_type* has no registered validator.
        """
        validator = self.validators.get(data_type)
        if validator is None:
            raise ValueError(f"Unknown data type: {data_type}")
        return validator(data)

    def ingest_daily_data(self, ingest_date: date) -> List[str]:
        """Ingest and persist matches for *ingest_date*.

        Returns:
            List of ingested match IDs; empty when validation fails.
        """
        print(f"Ingesting data for {ingest_date}")
        # Fetch from API (simulated)
        matches = self._fetch_matches_from_api(ingest_date)
        if not self.validate(matches, "match"):
            # BUGFIX: make the rejection visible instead of silent.
            print(f"Validation failed for {ingest_date}; batch rejected")
            return []
        # BUGFIX: the "store" step was previously a no-op even though
        # the schema exists -- actually persist the validated rows.
        self._store_matches(matches)
        return matches["match_id"].tolist()

    def _store_matches(self, matches: pd.DataFrame) -> None:
        """Upsert validated match rows, connecting on first use."""
        if self.connection is None:
            self.connect()
        columns = ["match_id", "date", "home_team", "away_team",
                   "home_score", "away_score", "competition", "season"]
        frame = matches.reindex(columns=columns)
        # Convert numpy scalars to native Python types for sqlite3.
        rows = [
            tuple(
                None if pd.isna(value)
                else value.item() if hasattr(value, "item")
                else value
                for value in row
            )
            for row in frame.itertuples(index=False, name=None)
        ]
        self.connection.executemany(
            "INSERT OR REPLACE INTO matches VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            rows
        )
        self.connection.commit()

    def _fetch_matches_from_api(self, api_date: date) -> pd.DataFrame:
        """Simulated API fetch: three fixed matches for *api_date*."""
        return pd.DataFrame({
            "match_id": [f"match_{i}" for i in range(1, 4)],
            "date": [str(api_date)] * 3,
            "home_team": ["Team A", "Team B", "Team C"],
            "away_team": ["Team D", "Team E", "Team F"],
            "home_score": [2, 1, 0],
            "away_score": [1, 1, 3],
            "competition": "Premier League",
            "season": "2024-25"
        })

    def get_match(self, match_id: str) -> Dict:
        """Bundle metadata, events, and lineups for one match."""
        return {
            "match_info": self._get_match_info(match_id),
            "events": self._get_match_events(match_id),
            "lineups": self._get_match_lineups(match_id)
        }

    def _get_match_info(self, match_id: str) -> pd.DataFrame:
        """Get match metadata (simulated)."""
        return pd.DataFrame([{
            "match_id": match_id,
            "date": str(date.today()),
            "home_team": "Team A",
            "away_team": "Team B",
            "home_score": 2,
            "away_score": 1
        }])

    def _get_match_events(self, match_id: str) -> pd.DataFrame:
        """Generate 100 sample events with a fixed seed (reproducible)."""
        np.random.seed(42)
        n_events = 100
        return pd.DataFrame({
            "event_id": [f"evt_{i}" for i in range(n_events)],
            "match_id": match_id,
            "minute": sorted(np.random.randint(1, 91, n_events)),
            "team": np.random.choice(["Team A", "Team B"], n_events),
            "event_type": np.random.choice(
                ["pass", "shot", "tackle", "dribble"],
                n_events,
                p=[0.6, 0.15, 0.15, 0.1]
            ),
            "x": np.random.uniform(0, 120, n_events),
            "y": np.random.uniform(0, 80, n_events)
        })

    def _get_match_lineups(self, match_id: str) -> Dict:
        """Get match lineups (simulated 11-a-side)."""
        return {
            "home": [f"Player_H{i}" for i in range(1, 12)],
            "away": [f"Player_A{i}" for i in range(1, 12)]
        }

    def search_players(self, criteria: Dict) -> pd.DataFrame:
        """Search for players matching *criteria* (simulated pool of 20).

        NOTE(review): criteria are echoed but not yet applied as filters.
        """
        print(f"Searching players with criteria: {criteria}")
        np.random.seed(42)
        return pd.DataFrame({
            "player_id": [f"player_{i}" for i in range(1, 21)],
            "name": [f"Player {i}" for i in range(1, 21)],
            "position": np.random.choice(["FW", "MF", "DF", "GK"], 20),
            "team": np.random.choice([f"Club_{c}" for c in "ABCDEFGHIJ"], 20),
            "age": np.random.randint(18, 36, 20),
            "market_value": np.random.uniform(1, 50, 20) * 1e6
        })
# Smoke test: the data layer only stores config at construction time.
data_layer = DataLayer(dict(host="localhost"))
print("DataLayer created successfully")
library(tidyverse)
library(R6)
library(DBI)
library(RSQLite)
# Data Layer Component
DataLayer <- R6Class("DataLayer",
  public = list(
    db_config = NULL,
    connection = NULL,
    validators = list(),

    # Store configuration and register validators; the database
    # connection itself is opened lazily via connect().
    initialize = function(db_config) {
      self$db_config <- db_config
      self$validators <- self$setup_validators()
      message("DataLayer initialized")
    },

    # -- Database connection management ---------------------------------
    connect = function() {
      # Using SQLite for demonstration (production would use PostgreSQL)
      self$connection <- dbConnect(RSQLite::SQLite(), ":memory:")
      self$initialize_schema()
    },

    disconnect = function() {
      if (!is.null(self$connection)) {
        dbDisconnect(self$connection)
      }
    },

    # Create the three core tables (idempotent via IF NOT EXISTS).
    initialize_schema = function() {
      dbExecute(self$connection, "
        CREATE TABLE IF NOT EXISTS matches (
          match_id TEXT PRIMARY KEY,
          date TEXT,
          home_team TEXT,
          away_team TEXT,
          home_score INTEGER,
          away_score INTEGER,
          competition TEXT,
          season TEXT
        )
      ")
      dbExecute(self$connection, "
        CREATE TABLE IF NOT EXISTS events (
          event_id TEXT PRIMARY KEY,
          match_id TEXT,
          minute INTEGER,
          second INTEGER,
          team TEXT,
          player TEXT,
          event_type TEXT,
          x REAL,
          y REAL,
          end_x REAL,
          end_y REAL,
          outcome TEXT,
          xg REAL,
          FOREIGN KEY (match_id) REFERENCES matches(match_id)
        )
      ")
      dbExecute(self$connection, "
        CREATE TABLE IF NOT EXISTS players (
          player_id TEXT PRIMARY KEY,
          name TEXT,
          position TEXT,
          team TEXT,
          birth_date TEXT,
          nationality TEXT,
          market_value REAL
        )
      ")
    },

    # -- Data validation ------------------------------------------------
    # Each validator returns one TRUE/FALSE for the whole data frame.
    # Event coordinates are checked against a 120 x 80 pitch.
    setup_validators = function() {
      list(
        match = function(data) {
          required_cols <- c("match_id", "date", "home_team", "away_team")
          all(required_cols %in% names(data))
        },
        event = function(data) {
          required_cols <- c("event_id", "match_id", "event_type", "x", "y")
          valid_structure <- all(required_cols %in% names(data))
          valid_coords <- all(data$x >= 0 & data$x <= 120 &
                              data$y >= 0 & data$y <= 80)
          valid_structure && valid_coords
        },
        player = function(data) {
          required_cols <- c("player_id", "name", "position")
          all(required_cols %in% names(data))
        }
      )
    },

    validate = function(data, type) {
      validator <- self$validators[[type]]
      if (is.null(validator)) {
        stop(paste("Unknown data type:", type))
      }
      validator(data)
    },

    # -- Data ingestion -------------------------------------------------
    ingest_daily_data = function(date) {
      message(paste("Ingesting data for", date))
      # Simulate fetching from data provider API
      matches <- self$fetch_matches_from_api(date)
      # BUGFIX: the previous version piped the tibble through
      # filter(self$validate(., "match")), which relied on recycling a
      # length-1 logical inside dplyr::filter. Validation is
      # all-or-nothing, so test it explicitly (this also mirrors the
      # Python DataLayer).
      if (!self$validate(matches, "match")) {
        return(character(0))
      }
      # Return match IDs processed
      matches$match_id
    },

    fetch_matches_from_api = function(date) {
      # Simulated API response
      tibble(
        match_id = paste0("match_", 1:3),
        date = as.character(date),
        home_team = c("Team A", "Team B", "Team C"),
        away_team = c("Team D", "Team E", "Team F"),
        home_score = c(2, 1, 0),
        away_score = c(1, 1, 3),
        competition = "Premier League",
        season = "2024-25"
      )
    },

    # -- Data retrieval -------------------------------------------------
    get_match = function(match_id) {
      # Bundle metadata, events, and lineups for one match
      list(
        match_info = self$get_match_info(match_id),
        events = self$get_match_events(match_id),
        lineups = self$get_match_lineups(match_id)
      )
    },

    get_match_info = function(match_id) {
      # Simulated data
      tibble(
        match_id = match_id,
        date = Sys.Date(),
        home_team = "Team A",
        away_team = "Team B",
        home_score = 2,
        away_score = 1
      )
    },

    get_match_events = function(match_id) {
      # Generate sample events
      n_events <- 100
      tibble(
        event_id = paste0("evt_", 1:n_events),
        match_id = match_id,
        minute = sort(sample(1:90, n_events, replace = TRUE)),
        team = sample(c("Team A", "Team B"), n_events, replace = TRUE),
        event_type = sample(c("pass", "shot", "tackle", "dribble"), n_events,
                            replace = TRUE, prob = c(0.6, 0.15, 0.15, 0.1)),
        x = runif(n_events, 0, 120),
        y = runif(n_events, 0, 80)
      )
    },

    get_match_lineups = function(match_id) {
      # Simulated 11-a-side lineups
      list(
        home = paste0("Player_H", 1:11),
        away = paste0("Player_A", 1:11)
      )
    },

    # -- Player search ---------------------------------------------------
    # NOTE(review): criteria are echoed but not yet applied as filters.
    search_players = function(criteria) {
      message("Searching players with criteria:")
      print(criteria)
      # Simulated player search
      tibble(
        player_id = paste0("player_", 1:20),
        name = paste("Player", 1:20),
        position = sample(c("FW", "MF", "DF", "GK"), 20, replace = TRUE),
        team = sample(paste0("Club_", LETTERS[1:10]), 20, replace = TRUE),
        age = sample(18:35, 20, replace = TRUE),
        market_value = runif(20, 1, 50) * 1e6
      )
    }
  )
)
# Smoke test: construct the data layer with a minimal configuration
# (no connection is opened until connect() is called).
data_layer <- DataLayer$new(list(host = "localhost"))
print("DataLayer created successfully")
Analytics Layer Implementation
The analytics layer contains all the models and algorithms for player evaluation, match analysis, and tactical insights.
import pandas as pd
import numpy as np
from typing import Dict, List, Any
from dataclasses import dataclass
import math
# Analytics Layer Component
class AnalyticsLayer:
    """
    Contains all analytical models and algorithms for
    player evaluation, match analysis, and tactical insights.
    """

    def __init__(self, model_config: Dict):
        """Load (simulated) models referenced by *model_config*."""
        self.model_config = model_config
        self.xg_model = None
        self.player_value_model = None
        self._load_models()
        print("AnalyticsLayer initialized")

    def _load_models(self) -> None:
        """Load pre-trained models (simulated with fixed coefficients)."""
        self.xg_model = self._create_xg_model()
        self.player_value_model = self._create_value_model()

    def _create_xg_model(self) -> Dict:
        """Simplified logistic-regression xG coefficients."""
        return {
            "intercept": -3.0,
            "distance_coef": -0.1,
            "angle_coef": 0.02,
            "body_part_coefs": {"foot": 0, "head": -0.5, "other": -1.0},
            "situation_coefs": {
                "open_play": 0, "corner": 0.3,
                "free_kick": 0.2, "penalty": 2.5
            }
        }

    def _create_value_model(self) -> Dict:
        """Player value model weights grouped by phase of play."""
        return {
            "offensive": {"goals": 0.35, "assists": 0.25, "xg": 0.15, "xa": 0.1},
            "defensive": {"tackles": 0.15, "interceptions": 0.12, "blocks": 0.08},
            "possession": {"passes": 0.05, "progressive": 0.1, "key_passes": 0.15}
        }

    def process_matches(self, match_ids: List[str]) -> Dict:
        """Run the quick single-match analysis for each ID."""
        return {
            match_id: self._analyze_single_match(match_id)
            for match_id in match_ids
        }

    def _analyze_single_match(self, match_id: str) -> Dict:
        """Quick (simulated) analysis of a single match.

        BUGFIX: seed with ``zlib.crc32`` instead of builtin ``hash``.
        str hashing has been salted per process since Python 3.3
        (PEP 456), so the old seed changed between runs and the
        simulated analysis was not reproducible.
        """
        import zlib  # local import keeps the block self-contained
        np.random.seed(zlib.crc32(match_id.encode("utf-8")))
        return {
            "match_id": match_id,
            "xg_home": np.random.uniform(0.5, 3),
            "xg_away": np.random.uniform(0.5, 3),
            "possession_home": np.random.uniform(35, 65),
            "shots_home": np.random.randint(8, 21),
            "shots_away": np.random.randint(8, 21)
        }

    def full_match_analysis(self, match_data: Dict) -> Dict:
        """Comprehensive match analysis from an events DataFrame."""
        events = match_data["events"]
        # xG analysis
        shots = events[events["event_type"] == "shot"]
        xg_analysis = self._calculate_xg(shots)
        # Passing analysis
        passes = events[events["event_type"] == "pass"]
        passing_network = self._analyze_passing_network(passes)
        # Territory analysis
        territory = self._analyze_territory(events)
        # Pressing analysis
        pressing = self._analyze_pressing(events)
        return {
            "xg": xg_analysis,
            "passing": passing_network,
            "territory": territory,
            "pressing": pressing,
            "summary": self._generate_match_summary(xg_analysis, territory)
        }

    def _calculate_xg(self, shots: pd.DataFrame) -> pd.DataFrame:
        """Calculate expected goals per shot, aggregated by team.

        Distance/angle are measured to the goal centre at (120, 40).
        """
        if len(shots) == 0:
            return pd.DataFrame(columns=["team", "total_xg", "shots"])
        shots = shots.copy()
        # Shot geometry
        shots["distance"] = np.sqrt(
            (120 - shots["x"])**2 + (40 - shots["y"])**2
        )
        shots["angle"] = np.degrees(np.arctan2(
            np.abs(40 - shots["y"]), 120 - shots["x"]
        ))
        # Logistic xG from the simplified model coefficients
        shots["xg"] = 1 / (1 + np.exp(-(
            self.xg_model["intercept"] +
            self.xg_model["distance_coef"] * shots["distance"] +
            self.xg_model["angle_coef"] * shots["angle"]
        )))
        return shots.groupby("team").agg(
            total_xg=("xg", "sum"),
            shots=("event_id", "count"),
            avg_xg_per_shot=("xg", "mean")
        ).reset_index()

    def _analyze_passing_network(self, passes: pd.DataFrame) -> Dict:
        """Analyze passing patterns (rates are simulated placeholders)."""
        return {
            "total_passes": len(passes),
            "completion_rate": np.random.uniform(0.75, 0.92),
            "progressive_passes": np.random.randint(20, 51),
            "key_passes": np.random.randint(5, 16)
        }

    def _analyze_territory(self, events: pd.DataFrame) -> pd.DataFrame:
        """Count actions per team in each third of the pitch."""
        events = events.copy()

        def classify_third(x):
            # Pitch length 120 split into thirds of 40
            if x <= 40:
                return "defensive"
            elif x <= 80:
                return "middle"
            return "attacking"

        events["third"] = events["x"].apply(classify_third)
        return events.groupby(["team", "third"]).size().unstack(fill_value=0)

    def _analyze_pressing(self, events: pd.DataFrame) -> Dict:
        """Pressing intensity metrics (simulated placeholders)."""
        return {
            "ppda_home": np.random.uniform(6, 15),
            "ppda_away": np.random.uniform(6, 15),
            "high_press_sequences": np.random.randint(5, 21)
        }

    def _generate_match_summary(self, xg_analysis: pd.DataFrame,
                                territory: pd.DataFrame) -> str:
        """Generate a text summary of the match.

        NOTE(review): rows come from a team-name groupby, so row 0/1
        order is alphabetical, not guaranteed home/away -- confirm
        labelling against real team metadata.
        """
        if len(xg_analysis) >= 2:
            return (f"Match Analysis Summary:\n"
                    f"Home xG: {xg_analysis.iloc[0]['total_xg']:.2f}, "
                    f"Away xG: {xg_analysis.iloc[1]['total_xg']:.2f}")
        return "Match Analysis Summary: Insufficient data"

    def evaluate_players(self, players: pd.DataFrame) -> pd.DataFrame:
        """Evaluate and rank players by a composite offensive score."""
        players = players.copy()
        np.random.seed(42)
        n = len(players)
        # Simulated per-90 performance metrics
        players["goals_per_90"] = np.random.uniform(0, 0.8, n)
        players["assists_per_90"] = np.random.uniform(0, 0.5, n)
        players["xg_per_90"] = np.random.uniform(0, 0.6, n)
        players["xa_per_90"] = np.random.uniform(0, 0.4, n)
        # Weighted composite score
        players["offensive_score"] = (
            players["goals_per_90"] * 0.35 +
            players["assists_per_90"] * 0.25 +
            players["xg_per_90"] * 0.2 +
            players["xa_per_90"] * 0.2
        )
        # Percentile ranking within this pool
        players["percentile_rank"] = players["offensive_score"].rank(pct=True) * 100

        def rate_value(pct):
            # Bucket percentile into a coarse value rating
            if pct >= 90:
                return "Premium"
            elif pct >= 70:
                return "Above Average"
            elif pct >= 50:
                return "Average"
            return "Below Average"

        players["value_rating"] = players["percentile_rank"].apply(rate_value)
        return players.sort_values("offensive_score", ascending=False)

    def find_similar_players(self, target_player: Dict,
                             player_pool: pd.DataFrame,
                             n: int = 10) -> pd.DataFrame:
        """Return the *n* pool players closest to *target_player*.

        Similarity is 1 minus the Euclidean distance over per-90
        metrics (higher = more similar).
        """
        pool = player_pool.copy()
        pool["similarity"] = 1 - np.sqrt(
            (pool["goals_per_90"] - target_player["goals_per_90"])**2 +
            (pool["assists_per_90"] - target_player["assists_per_90"])**2 +
            (pool["xg_per_90"] - target_player["xg_per_90"])**2
        )
        return pool.nlargest(n, "similarity")
# Smoke test: build the analytics layer with a stub model config.
analytics_layer = AnalyticsLayer(dict(xg_model_path="models/xg.pkl"))
print("AnalyticsLayer created successfully")
library(tidyverse)
library(R6)
# Analytics Layer Component
AnalyticsLayer <- R6Class("AnalyticsLayer",
  public = list(
    model_config = NULL,
    xg_model = NULL,
    player_value_model = NULL,

    initialize = function(model_config) {
      self$model_config <- model_config
      self$load_models()
      message("AnalyticsLayer initialized")
    },

    load_models = function() {
      # Load pre-trained models (simulated with fixed coefficients)
      self$xg_model <- self$create_xg_model()
      self$player_value_model <- self$create_value_model()
    },

    create_xg_model = function() {
      # Simplified logistic-regression xG coefficients
      list(
        intercept = -3.0,
        distance_coef = -0.1,
        angle_coef = 0.02,
        body_part_coefs = c(foot = 0, head = -0.5, other = -1.0),
        situation_coefs = c(open_play = 0, corner = 0.3, free_kick = 0.2,
                            penalty = 2.5)
      )
    },

    create_value_model = function() {
      # Player value model weights grouped by phase of play
      list(
        offensive = c(goals = 0.35, assists = 0.25, xg = 0.15, xa = 0.1),
        defensive = c(tackles = 0.15, interceptions = 0.12, blocks = 0.08),
        possession = c(passes = 0.05, progressive = 0.1, key_passes = 0.15)
      )
    },

    # Quick analysis for a batch of matches, keyed by match ID
    process_matches = function(match_ids) {
      results <- map(match_ids, ~self$analyze_single_match(.x))
      names(results) <- match_ids
      return(results)
    },

    analyze_single_match = function(match_id) {
      # Simulated match analysis
      list(
        match_id = match_id,
        xg_home = runif(1, 0.5, 3),
        xg_away = runif(1, 0.5, 3),
        possession_home = runif(1, 35, 65),
        shots_home = sample(8:20, 1),
        shots_away = sample(8:20, 1)
      )
    },

    # Full match analysis from an event-level data frame
    full_match_analysis = function(match_data) {
      events <- match_data$events
      # Calculate xG for shots
      shots <- events %>% filter(event_type == "shot")
      xg_analysis <- self$calculate_xg(shots)
      # Passing network
      passes <- events %>% filter(event_type == "pass")
      passing_network <- self$analyze_passing_network(passes)
      # Territorial analysis
      territory <- self$analyze_territory(events)
      # Pressing analysis
      pressing <- self$analyze_pressing(events)
      list(
        xg = xg_analysis,
        passing = passing_network,
        territory = territory,
        pressing = pressing,
        summary = self$generate_match_summary(xg_analysis, territory)
      )
    },

    # Per-shot xG aggregated by team; geometry is measured to the
    # goal centre at (120, 40).
    calculate_xg = function(shots) {
      if (nrow(shots) == 0) {
        return(tibble(team = character(), total_xg = numeric(), shots = integer()))
      }
      shots %>%
        mutate(
          distance = sqrt((120 - x)^2 + (40 - y)^2),
          angle = atan2(abs(40 - y), 120 - x) * 180 / pi,
          xg = 1 / (1 + exp(-(self$xg_model$intercept +
                              self$xg_model$distance_coef * distance +
                              self$xg_model$angle_coef * angle)))
        ) %>%
        group_by(team) %>%
        summarise(
          total_xg = sum(xg),
          shots = n(),
          avg_xg_per_shot = mean(xg),
          .groups = "drop"
        )
    },

    analyze_passing_network = function(passes) {
      # Rates are simulated placeholders
      list(
        total_passes = nrow(passes),
        completion_rate = runif(1, 0.75, 0.92),
        progressive_passes = sample(20:50, 1),
        key_passes = sample(5:15, 1)
      )
    },

    analyze_territory = function(events) {
      # Count actions per team in each third of the 120-long pitch
      events %>%
        mutate(
          third = case_when(
            x <= 40 ~ "defensive",
            x <= 80 ~ "middle",
            TRUE ~ "attacking"
          )
        ) %>%
        group_by(team, third) %>%
        summarise(actions = n(), .groups = "drop") %>%
        pivot_wider(names_from = third, values_from = actions, values_fill = 0)
    },

    analyze_pressing = function(events) {
      # PPDA-style pressing analysis (simulated placeholders)
      list(
        ppda_home = runif(1, 6, 15),
        ppda_away = runif(1, 6, 15),
        high_press_sequences = sample(5:20, 1)
      )
    },

    generate_match_summary = function(xg_analysis, territory) {
      # BUGFIX: guard against fewer than two team rows -- indexing
      # total_xg[2] on a one-row summary produced NA in the report.
      # The Python implementation already guards this case.
      # NOTE(review): rows come from a team-name group_by, so [1]/[2]
      # order is alphabetical, not guaranteed home/away -- confirm.
      if (nrow(xg_analysis) < 2) {
        return("Match Analysis Summary: Insufficient data")
      }
      paste(
        "Match Analysis Summary:",
        sprintf("Home xG: %.2f, Away xG: %.2f",
                xg_analysis$total_xg[1], xg_analysis$total_xg[2]),
        sep = "\n"
      )
    },

    # Player evaluation: composite offensive score + percentile rating
    evaluate_players = function(players) {
      players %>%
        mutate(
          # Generate random performance metrics for demonstration
          goals_per_90 = runif(n(), 0, 0.8),
          assists_per_90 = runif(n(), 0, 0.5),
          xg_per_90 = runif(n(), 0, 0.6),
          xa_per_90 = runif(n(), 0, 0.4),
          # Weighted composite score
          offensive_score = goals_per_90 * 0.35 + assists_per_90 * 0.25 +
            xg_per_90 * 0.2 + xa_per_90 * 0.2,
          # Percentile ranking within this pool
          percentile_rank = percent_rank(offensive_score) * 100,
          # Coarse value bucket
          value_rating = case_when(
            percentile_rank >= 90 ~ "Premium",
            percentile_rank >= 70 ~ "Above Average",
            percentile_rank >= 50 ~ "Average",
            TRUE ~ "Below Average"
          )
        ) %>%
        arrange(desc(offensive_score))
    },

    # Similarity search over per-90 metrics (higher = more similar)
    find_similar_players = function(target_player, player_pool, n = 10) {
      player_pool %>%
        mutate(
          similarity = 1 - sqrt(
            (goals_per_90 - target_player$goals_per_90)^2 +
            (assists_per_90 - target_player$assists_per_90)^2 +
            (xg_per_90 - target_player$xg_per_90)^2
          )
        ) %>%
        top_n(n, similarity) %>%
        arrange(desc(similarity))
    }
  )
)
# Smoke test: construct the analytics layer with a stub model path.
analytics_layer <- AnalyticsLayer$new(list(xg_model_path = "models/xg.rds"))
print("AnalyticsLayer created successfully")
Presentation Layer Implementation
The presentation layer handles all output generation including reports, dashboards, visualizations, and API responses.
import pandas as pd
import numpy as np
from datetime import datetime, date
from typing import Dict, List, Any, Optional
import json
# Presentation Layer Component
class PresentationLayer:
    """
    Handles all output generation including reports,
    dashboards, visualizations, and API responses.
    """

    def __init__(self, output_config: Dict):
        """Store output settings (report directory, dashboard URL)."""
        self.output_config = output_config
        print("PresentationLayer initialized")

    def generate_daily_report(self, analytics_results: Dict,
                              report_date: date) -> Dict:
        """Generate (and best-effort persist) the daily analytics report.

        Args:
            analytics_results: Mapping of match_id -> per-match metrics.
            report_date: Date the report covers.

        Returns:
            The report dict (also written to ``report_dir`` as JSON).
        """
        print(f"Generating daily report for {report_date}")
        report = {
            "title": f"Daily Analytics Report - {report_date}",
            "generated_at": datetime.now().isoformat(),
            "summary": self._create_daily_summary(analytics_results),
            "highlights": self._extract_highlights(analytics_results),
            "visualizations": self._create_daily_visualizations(analytics_results)
        }
        output_path = (
            f"{self.output_config['report_dir']}"
            f"daily_report_{report_date}.json"
        )
        # BUGFIX: the report path was announced but nothing was ever
        # written. Persist the JSON, best-effort so an unwritable
        # directory does not fail report generation.
        try:
            from pathlib import Path
            target = Path(output_path)
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(json.dumps(report, indent=2, default=str))
            print(f"Report saved to: {output_path}")
        except OSError as exc:
            print(f"Could not save report to {output_path}: {exc}")
        return report

    def _create_daily_summary(self, results: Dict) -> Dict:
        """Summary stats; guards against an empty result set."""
        n_matches = len(results)
        total_xg = sum(
            r.get("xg_home", 0) + r.get("xg_away", 0)
            for r in results.values()
        )
        return {
            "matches_analyzed": n_matches,
            "average_xg_per_match": total_xg / n_matches if n_matches > 0 else 0,
            "date": str(date.today())
        }

    def _extract_highlights(self, results: Dict) -> Dict:
        """Pick the highest-total-xG match and the mean total xG."""
        if not results:
            return {"highest_xg_match": None, "avg_xg": 0}
        xg_totals = {
            k: v.get("xg_home", 0) + v.get("xg_away", 0)
            for k, v in results.items()
        }
        return {
            "highest_xg_match": max(xg_totals, key=xg_totals.get),
            "avg_xg": np.mean(list(xg_totals.values()))
        }

    def _create_daily_visualizations(self, results: Dict) -> Dict:
        """Flatten per-match xG into plot-ready records."""
        viz_data = pd.DataFrame([
            {
                "match_id": k,
                "xg_home": v.get("xg_home", 0),
                "xg_away": v.get("xg_away", 0)
            }
            for k, v in results.items()
        ])
        return {
            "xg_comparison": viz_data.to_dict(orient="records"),
            "plot_generated": True
        }

    def generate_scouting_report(self, evaluated_players: pd.DataFrame) -> Dict:
        """Scouting report: top prospects + position/value breakdowns."""
        print("Generating scouting report")
        top_prospects = evaluated_players[
            evaluated_players["value_rating"].isin(["Premium", "Above Average"])
        ].head(10)
        return {
            "title": "Player Scouting Report",
            "generated_at": datetime.now().isoformat(),
            "total_evaluated": len(evaluated_players),
            "top_prospects": top_prospects.to_dict(orient="records"),
            "position_breakdown": self._position_breakdown(evaluated_players),
            "value_analysis": self._value_analysis(evaluated_players)
        }

    def _position_breakdown(self, players: pd.DataFrame) -> List[Dict]:
        """Count players and average offensive score per position."""
        return players.groupby("position").agg(
            count=("player_id", "count"),
            avg_score=("offensive_score", "mean")
        ).reset_index().to_dict(orient="records")

    def _value_analysis(self, players: pd.DataFrame) -> List[Dict]:
        """Count and average market value per value-rating bucket."""
        return players.groupby("value_rating").agg(
            count=("player_id", "count"),
            avg_market_value=("market_value", "mean")
        ).reset_index().to_dict(orient="records")

    def generate_match_report(self, match_analysis: Dict,
                              match_info: pd.DataFrame) -> Dict:
        """Generate a detailed single-match report."""
        # Accept either a one-row DataFrame or a plain dict of match info
        info = match_info.iloc[0] if isinstance(match_info, pd.DataFrame) else match_info
        print(f"Generating match report for {info['match_id']}")
        return {
            "header": self._create_match_header(info),
            "xg_analysis": match_analysis.get("xg"),
            "passing_analysis": match_analysis.get("passing"),
            "territorial_analysis": match_analysis.get("territory"),
            "pressing_analysis": match_analysis.get("pressing"),
            "key_moments": self._identify_key_moments(match_analysis),
            "player_ratings": self._calculate_player_ratings(match_analysis)
        }

    def _create_match_header(self, match_info: Dict) -> Dict:
        """Create the report header from match metadata.

        NOTE(review): venue is a hard-coded placeholder until venue
        data is ingested.
        """
        return {
            "title": f"{match_info['home_team']} vs {match_info['away_team']}",
            "date": str(match_info.get("date", date.today())),
            "score": f"{match_info['home_score']} - {match_info['away_score']}",
            "venue": "Stadium Name"
        }

    def _identify_key_moments(self, analysis: Dict) -> Dict:
        """Key match moments (simulated until event data is wired in)."""
        return {
            "highest_xg_chance": {
                "minute": np.random.randint(1, 91),
                "xg": np.random.uniform(0.3, 0.8)
            },
            "momentum_shifts": [15, 45, 72]
        }

    def _calculate_player_ratings(self, analysis: Dict) -> List[Dict]:
        """Player ratings sorted best-first (simulated values)."""
        ratings = pd.DataFrame({
            "player": [f"Player_{i}" for i in range(1, 12)],
            "rating": np.random.uniform(5.5, 9.0, 11)
        }).sort_values("rating", ascending=False)
        return ratings.to_dict(orient="records")
# Smoke-test: construct the presentation layer with its report output directory.
presentation_layer = PresentationLayer({"report_dir": "reports/"})
print("PresentationLayer created successfully")
library(tidyverse)
library(R6)
library(ggplot2)
library(rmarkdown)
# Presentation Layer Component: turns analytics output into daily reports,
# scouting reports, match reports, and ggplot visualizations.
PresentationLayer <- R6Class("PresentationLayer",
  public = list(
    # Named list of output settings; `report_dir` is the report target folder.
    output_config = NULL,

    # Store the output configuration.
    initialize = function(output_config) {
      self$output_config <- output_config
      message("PresentationLayer initialized")
    },

    # Report generation ------------------------------------------------------

    # Build the daily report for `analytics_results`, a named list keyed by
    # match id where each element carries xg_home / xg_away.
    generate_daily_report = function(analytics_results, date) {
      message(paste("Generating daily report for", date))
      report <- list(
        title = paste("Daily Analytics Report -", date),
        generated_at = Sys.time(),
        summary = self$create_daily_summary(analytics_results),
        highlights = self$extract_highlights(analytics_results),
        visualizations = self$create_daily_visualizations(analytics_results)
      )
      # Compose the target path for the rendered report.
      # NOTE(review): the path is only announced — nothing is written to disk;
      # rendering/saving is not implemented here.
      output_path <- file.path(
        self$output_config$report_dir,
        paste0("daily_report_", date, ".html")
      )
      message(paste("Report saved to:", output_path))
      return(report)
    },

    # Aggregate match-level xG into a one-row daily summary.
    # NOTE(review): `average_xg_per_match` is NaN when `results` is empty.
    create_daily_summary = function(results) {
      n_matches <- length(results)
      total_goals <- sum(sapply(results, function(x) {
        x$xg_home + x$xg_away
      }))
      list(
        matches_analyzed = n_matches,
        average_xg_per_match = total_goals / n_matches,
        date = Sys.Date()
      )
    },

    extract_highlights = function(results) {
      # Find most exciting matches (highest xG totals)
      xg_totals <- sapply(results, function(x) x$xg_home + x$xg_away)
      list(
        highest_xg_match = names(which.max(xg_totals)),
        avg_xg = mean(xg_totals)
      )
    },

    create_daily_visualizations = function(results) {
      # Create summary visualization data (one row per match).
      viz_data <- tibble(
        match_id = names(results),
        xg_home = sapply(results, function(x) x$xg_home),
        xg_away = sapply(results, function(x) x$xg_away)
      )
      list(
        xg_comparison = viz_data,
        plot_generated = TRUE
      )
    },

    # Scouting report generation ---------------------------------------------

    # Shortlist up to ten "Premium" / "Above Average" prospects and attach
    # position and value-tier breakdowns.
    generate_scouting_report = function(evaluated_players) {
      message("Generating scouting report")
      top_prospects <- evaluated_players %>%
        filter(value_rating %in% c("Premium", "Above Average")) %>%
        head(10)
      report <- list(
        title = "Player Scouting Report",
        generated_at = Sys.time(),
        total_evaluated = nrow(evaluated_players),
        top_prospects = top_prospects,
        position_breakdown = self$position_breakdown(evaluated_players),
        value_analysis = self$value_analysis(evaluated_players)
      )
      return(report)
    },

    # Per-position counts, mean offensive score, and best-scoring player.
    position_breakdown = function(players) {
      players %>%
        group_by(position) %>%
        summarise(
          count = n(),
          avg_score = mean(offensive_score, na.rm = TRUE),
          top_player = name[which.max(offensive_score)],
          .groups = "drop"
        )
    },

    # Per-value-tier counts and mean market value.
    value_analysis = function(players) {
      players %>%
        group_by(value_rating) %>%
        summarise(
          count = n(),
          avg_market_value = mean(market_value, na.rm = TRUE),
          .groups = "drop"
        )
    },

    # Match report generation -------------------------------------------------

    # Assemble header, the four analysis sections, key moments, and ratings.
    # `match_analysis` is a list with optional xg/passing/territory/pressing.
    generate_match_report = function(match_analysis, match_info) {
      message(paste("Generating match report for", match_info$match_id))
      report <- list(
        header = self$create_match_header(match_info),
        xg_analysis = match_analysis$xg,
        passing_analysis = match_analysis$passing,
        territorial_analysis = match_analysis$territory,
        pressing_analysis = match_analysis$pressing,
        key_moments = self$identify_key_moments(match_analysis),
        player_ratings = self$calculate_player_ratings(match_analysis)
      )
      return(report)
    },

    # Title/date/score header; venue is a fixed placeholder.
    create_match_header = function(match_info) {
      list(
        title = paste(match_info$home_team, "vs", match_info$away_team),
        date = match_info$date,
        score = paste(match_info$home_score, "-", match_info$away_score),
        venue = "Stadium Name"
      )
    },

    # NOTE(review): placeholder — the best chance is randomly sampled rather
    # than derived from `analysis`, and momentum shifts are hard-coded.
    identify_key_moments = function(analysis) {
      list(
        highest_xg_chance = list(minute = sample(1:90, 1), xg = runif(1, 0.3, 0.8)),
        momentum_shifts = c(15, 45, 72) # Sample minutes
      )
    },

    # NOTE(review): placeholder — ratings drawn uniformly from [5.5, 9.0) for
    # eleven generic players; `analysis` is unused.
    calculate_player_ratings = function(analysis) {
      tibble(
        player = paste0("Player_", 1:11),
        rating = runif(11, 5.5, 9.0)
      ) %>%
        arrange(desc(rating))
    },

    # Visualization creation --------------------------------------------------

    # Cumulative-xG step chart per team over the match clock.
    # Expects `events` with event_type, minute, team, and xg columns.
    create_xg_flow_chart = function(events) {
      # Create cumulative xG chart
      xg_data <- events %>%
        filter(event_type == "shot") %>%
        arrange(minute) %>%
        group_by(team) %>%
        mutate(cumulative_xg = cumsum(xg)) %>%
        ungroup()
      ggplot(xg_data, aes(x = minute, y = cumulative_xg, color = team)) +
        geom_step(linewidth = 1.2) +
        geom_point(size = 2) +
        scale_color_manual(values = c("#1B5E20", "#D32F2F")) +
        labs(
          title = "Expected Goals Flow",
          x = "Minute",
          y = "Cumulative xG",
          color = "Team"
        ) +
        theme_minimal() +
        theme(
          plot.title = element_text(size = 14, face = "bold"),
          legend.position = "bottom"
        )
    },

    # Shot map on a 120x80 pitch; point size encodes xG, colour the outcome.
    # NOTE(review): assumes `goal` is coded 0/1 — confirm against the data layer.
    create_shot_map = function(shots, pitch_color = "#1B5E20") {
      ggplot(shots, aes(x = x, y = y, size = xg, color = as.factor(goal))) +
        annotate("rect", xmin = 0, xmax = 120, ymin = 0, ymax = 80,
                 fill = pitch_color, alpha = 0.3) +
        geom_point(alpha = 0.7) +
        scale_size_continuous(range = c(2, 10), name = "xG") +
        scale_color_manual(values = c("0" = "red", "1" = "green"),
                           labels = c("Missed", "Goal"), name = "Outcome") +
        coord_fixed(ratio = 1) +
        labs(title = "Shot Map with xG") +
        theme_minimal()
    }
  )
)
# Smoke-test: instantiate the layer with its report output directory.
presentation_layer <- PresentationLayer$new(list(report_dir = "reports/"))
print("PresentationLayer created successfully")
Complete System Integration
Now let's bring all the layers together into a fully functional system with example workflows.
import pandas as pd
import numpy as np
from datetime import date
# Complete integrated workflow demonstration
def run_complete_demo():
    """Exercise the three core workflows end to end.

    Runs the daily pipeline, a single-match deep dive, and a player
    scouting pass, then returns the three generated reports.

    Returns:
        Dict with "daily_report", "match_report", and "scout_report".
    """
    banner = "=" * 60
    print(banner)
    print("FootballAnalyticsPro - Complete System Demo")
    print(banner)

    # Wire up the three platform layers.
    data = DataLayer({"host": "localhost"})
    analytics = AnalyticsLayer({"xg_model": "models/xg.pkl"})
    presentation = PresentationLayer({"report_dir": "reports/"})

    # Workflow 1: ingest today's matches, analyse them, publish the report.
    print("\n--- Workflow 1: Daily Pipeline ---")
    run_date = date.today()
    ingested_ids = data.ingest_daily_data(run_date)
    print(f"Ingested {len(ingested_ids)} matches")
    daily_results = analytics.process_matches(ingested_ids)
    print("Analytics processing complete")
    daily_report = presentation.generate_daily_report(daily_results, run_date)
    print(f"Daily report generated: {daily_report['title']}")

    # Workflow 2: deep dive on a single match.
    print("\n--- Workflow 2: Match Analysis ---")
    match_data = data.get_match("match_1")
    print(f"Loaded match with {len(match_data['events'])} events")
    match_analysis = analytics.full_match_analysis(match_data)
    print("Match analysis complete:")
    print(match_analysis["xg"])
    match_report = presentation.generate_match_report(
        match_analysis,
        match_data["match_info"],
    )
    print("Match report generated")

    # Workflow 3: search, evaluate, and shortlist scouting candidates.
    print("\n--- Workflow 3: Player Scouting ---")
    criteria = {
        "position": ["FW", "MF"],
        "max_age": 25,
        "min_market_value": 5e6,
    }
    candidates = data.search_players(criteria)
    print(f"Found {len(candidates)} candidates")
    evaluated = analytics.evaluate_players(candidates)
    print("Player evaluation complete")
    shortlist = evaluated[
        ["name", "position", "value_rating", "percentile_rank"]
    ].head(5)
    print("\nTop 5 Prospects:")
    print(shortlist.to_string(index=False))
    scout_report = presentation.generate_scouting_report(evaluated)
    print(f"\nScouting report generated: {scout_report['total_evaluated']} players evaluated")

    print("\n--- Demo Complete ---")
    return {
        "daily_report": daily_report,
        "match_report": match_report,
        "scout_report": scout_report,
    }
# Run the demo end-to-end; demo_results keeps the three generated reports.
demo_results = run_complete_demo()
library(tidyverse)
library(R6)
# Complete integrated workflow demonstration: runs the daily pipeline, a
# single-match deep dive, and a scouting pass, returning the three reports.
run_complete_demo <- function() {
  rule <- strrep("=", 60)
  message(rule)
  message("FootballAnalyticsPro - Complete System Demo")
  message(rule)

  # Wire up the three platform layers.
  data_layer <- DataLayer$new(list(host = "localhost"))
  analytics_layer <- AnalyticsLayer$new(list(xg_model = "models/xg.rds"))
  presentation_layer <- PresentationLayer$new(list(report_dir = "reports/"))

  message("\n--- Workflow 1: Daily Pipeline ---")
  # Ingest today's data, analyse it, and publish the daily report.
  today <- Sys.Date()
  match_ids <- data_layer$ingest_daily_data(today)
  message(sprintf("Ingested %d matches", length(match_ids)))
  analytics_results <- analytics_layer$process_matches(match_ids)
  message("Analytics processing complete")
  daily_report <- presentation_layer$generate_daily_report(analytics_results, today)
  message(sprintf("Daily report generated: %s", daily_report$title))

  message("\n--- Workflow 2: Match Analysis ---")
  # Deep dive on a single match.
  match_data <- data_layer$get_match("match_1")
  message(sprintf("Loaded match with %d events", nrow(match_data$events)))
  match_analysis <- analytics_layer$full_match_analysis(match_data)
  message("Match analysis complete:")
  print(match_analysis$xg)
  match_report <- presentation_layer$generate_match_report(
    match_analysis,
    match_data$match_info
  )
  message("Match report generated")

  message("\n--- Workflow 3: Player Scouting ---")
  # Search, evaluate, and shortlist candidates matching the criteria.
  criteria <- list(
    position = c("FW", "MF"),
    max_age = 25,
    min_market_value = 5e6
  )
  candidates <- data_layer$search_players(criteria)
  message(sprintf("Found %d candidates", nrow(candidates)))
  evaluated <- analytics_layer$evaluate_players(candidates)
  message("Player evaluation complete")
  top_5 <- select(head(evaluated, 5), name, position, value_rating, percentile_rank)
  print(top_5)
  scout_report <- presentation_layer$generate_scouting_report(evaluated)
  message(sprintf("Scouting report generated: %s players evaluated",
                  scout_report$total_evaluated))

  message("\n--- Demo Complete ---")
  list(
    daily_report = daily_report,
    match_report = match_report,
    scout_report = scout_report
  )
}
# Run the demo end-to-end; demo_results keeps the three generated reports.
demo_results <- run_complete_demo()
Deployment Considerations
- Database: PostgreSQL with read replicas
- Compute: Container orchestration (K8s)
- Storage: S3/GCS for models and reports
- Caching: Redis for frequent queries
- Monitoring: Prometheus + Grafana
- Authentication: OAuth 2.0 / SSO
- Authorization: Role-based access control
- Data Protection: Encryption at rest/transit
- Audit Logging: All data access logged
- GDPR: Data retention policies
CI/CD Pipeline
┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐
│ Code │───▶│ Test │───▶│ Build │───▶│ Deploy │───▶│ Monitor │
│ Push │ │ Suite │ │ Images │ │ Staging │ │ Metrics │
└─────────┘ └─────────┘ └─────────┘ └─────────┘ └─────────┘
│ │
▼ ▼
Unit Tests Integration
Lint/Format Tests
Coverage Smoke Tests
Capstone Exercises
Exercise 60.1: Extend the Data Layer
Add support for tracking data ingestion to the DataLayer. Implement methods for storing player positions at 25Hz and calculating derived metrics like distance covered and sprint counts.
Exercise 60.2: Add a REST API Layer
Create a REST API wrapper for the analytics platform using Flask (Python) or Plumber (R). Implement endpoints for match analysis, player search, and report generation.
Exercise 60.3: Build a Complete Dashboard
Create an interactive dashboard using Shiny (R) or Dash (Python) that displays real-time match analytics, player comparisons, and scouting shortlists. Include filters for competition, date range, and player position.
Course Summary
Congratulations!
You've completed the Soccer Analytics Textbook! Over 60 chapters, you've learned:
- Foundations: Data wrangling, visualization, and the football data ecosystem
- Core Metrics: Expected Goals, player valuation, and performance measurement
- Positional Analysis: Goalkeeper, defender, midfielder, and forward analytics
- Team Analytics: Tactical analysis, pressing metrics, and set pieces
- Advanced Topics: Machine learning, tracking data, and network analysis
- Business Applications: Recruitment, club valuation, and fan engagement
- Professional Skills: Building departments, research publication, and system design
Your Analytics Journey Continues
This textbook has given you the foundation to work in football analytics. The field evolves rapidly, so stay current by:
- Following analytics Twitter/X and reading industry blogs
- Attending conferences (MIT Sloan, StatsBomb, Opta Forum)
- Contributing to open-source projects
- Building your portfolio with public analyses
- Networking with other analysts and researchers
"Data is just the beginning. The art is in the interpretation."