Capstone - Complete Analytics System
Youth development is where analytics can provide the greatest long-term competitive advantage. By applying data-driven methods to talent identification, development tracking, and pathway optimization, clubs can build sustainable pipelines of first-team-ready players.
Learning Objectives
- Build development curves to track player progression
- Create talent identification models for youth players
- Implement percentile rankings with age adjustments
- Analyze loan decisions and pathway optimization
- Design academy performance dashboards
Development Curves
Development curves track how players improve over time. Understanding expected development patterns helps identify who is ahead or behind the curve.
Building Development Curves
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt

# Load youth player data with longitudinal (multi-season) tracking.
youth_data = pd.read_csv("youth_player_seasons.csv")

# Aggregate key development metrics per player-season.
development_metrics = (
    youth_data
    .groupby(['player_id', 'season', 'age'])
    .agg({
        'minutes': 'sum',
        'goals': 'sum',
        'xg': 'sum',
        'progressive_passes': 'sum',
        'successful_dribbles': 'sum'
    })
    .reset_index()
)

# Per-90 rates. Computed before the minutes filter, so a zero-minute
# season would yield inf — those rows are removed by the filter below.
nineties = development_metrics['minutes'] / 90
development_metrics['xg_p90'] = development_metrics['xg'] / nineties
development_metrics['progressive_passes_p90'] = (
    development_metrics['progressive_passes'] / nineties
)

# Keep only seasons with meaningful playing time (>= 5 full matches).
development_metrics = development_metrics[development_metrics['minutes'] >= 450]

# Fit a ridge-regularized cubic polynomial for expected xG p90 by age.
X = development_metrics[['age']].values
y = development_metrics['xg_p90'].values
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
model = Ridge(alpha=1.0)
model.fit(X_poly, y)

# Expected curve over the youth age range.
age_grid = np.linspace(16, 23, 100).reshape(-1, 1)
X_grid_poly = poly.transform(age_grid)
expected_curve = model.predict(X_grid_poly)

# 95% band via bootstrap resampling of player-seasons.
n_bootstrap = 1000
bootstrap_curves = np.zeros((n_bootstrap, len(age_grid)))
for i in range(n_bootstrap):
    idx = np.random.choice(len(X), size=len(X), replace=True)
    model_boot = Ridge(alpha=1.0)
    model_boot.fit(poly.transform(X[idx]), y[idx])
    bootstrap_curves[i] = model_boot.predict(X_grid_poly)
lower = np.percentile(bootstrap_curves, 2.5, axis=0)
upper = np.percentile(bootstrap_curves, 97.5, axis=0)

# Plot individual trajectories (first 50 players) against the expected curve.
fig, ax = plt.subplots(figsize=(10, 6))
for player_id in development_metrics['player_id'].unique()[:50]:
    player_data = development_metrics[development_metrics['player_id'] == player_id]
    ax.plot(player_data['age'], player_data['xg_p90'], alpha=0.2, color='gray')
ax.fill_between(age_grid.flatten(), lower, upper, alpha=0.3, color='steelblue')
ax.plot(age_grid, expected_curve, color='steelblue', linewidth=2, label='Expected')
ax.set_xlabel('Age')
ax.set_ylabel('xG per 90')
ax.set_title('xG Development Curve for Youth Forwards')
ax.legend()
plt.tight_layout()
plt.show()

library(tidyverse)
library(mgcv)

# Load youth player data with longitudinal (multi-season) tracking.
youth_data <- read_csv("youth_player_seasons.csv")

# Aggregate per-90 development metrics per player-season, keeping only
# seasons with meaningful playing time.
development_metrics <- youth_data %>%
  group_by(player_id, season, age) %>%
  summarise(
    minutes_played = sum(minutes),
    goals_p90 = sum(goals) / (sum(minutes) / 90),
    xg_p90 = sum(xg) / (sum(minutes) / 90),
    progressive_passes_p90 = sum(progressive_passes) / (sum(minutes) / 90),
    successful_dribbles_p90 = sum(successful_dribbles) / (sum(minutes) / 90),
    .groups = "drop"
  ) %>%
  filter(minutes_played >= 450) # Minimum 5 full matches

# Smooth expected development by age (GAM; modest basis size to avoid
# overfitting sparse youth data).
xg_development_model <- gam(
  xg_p90 ~ s(age, k = 6),
  data = development_metrics,
  family = gaussian()
)

# Expected curve with a 95% pointwise confidence band.
# Single predict() call with se.fit (the original predicted twice).
age_grid <- tibble(age = seq(16, 23, by = 0.1))
pred <- predict(xg_development_model, newdata = age_grid, se.fit = TRUE)
expected_curve <- age_grid %>%
  mutate(
    expected_xg_p90 = as.numeric(pred$fit),
    se = as.numeric(pred$se.fit),
    lower = expected_xg_p90 - 1.96 * se,
    upper = expected_xg_p90 + 1.96 * se
  )

# Plot individual player trajectories against the expected curve.
# `linewidth` replaces the `size` aesthetic deprecated for lines in
# ggplot2 3.4 (and matches the later plot in this chapter).
ggplot() +
  geom_ribbon(data = expected_curve,
              aes(x = age, ymin = lower, ymax = upper),
              fill = "steelblue", alpha = 0.2) +
  geom_line(data = expected_curve,
            aes(x = age, y = expected_xg_p90),
            color = "steelblue", linewidth = 1.5) +
  geom_line(data = development_metrics,
            aes(x = age, y = xg_p90, group = player_id),
            alpha = 0.2) +
  labs(
    title = "xG Development Curve for Youth Forwards",
    subtitle = "Individual players vs. expected development",
    x = "Age",
    y = "xG per 90"
  ) +
  theme_minimal()

Position-Specific Development
Different positions develop at different rates. Center backs tend to peak later, while wingers may show earlier promise.
# Position-specific development curves
# NOTE(review): assumes development_metrics carries 'position_group' and
# 'composite_score' columns; neither is created by the aggregation
# earlier in this chapter — confirm they are merged in upstream.
position_curves = {}
positions = ['Forward', 'Midfielder', 'Defender', 'Goalkeeper']
for position in positions:
    pos_data = development_metrics[development_metrics['position_group'] == position]
    # Require a minimal sample before fitting a cubic curve per position.
    if len(pos_data) >= 20:
        X = pos_data[['age']].values
        y = pos_data['composite_score'].values
        poly = PolynomialFeatures(degree=3)
        model = Ridge(alpha=1.0)
        model.fit(poly.fit_transform(X), y)
        age_grid = np.linspace(16, 23, 100)
        predictions = model.predict(poly.transform(age_grid.reshape(-1, 1)))
        position_curves[position] = {
            'age': age_grid,
            'predicted': predictions
        }
# Plot position-specific curves (one line per position with a fixed palette).
fig, ax = plt.subplots(figsize=(10, 6))
colors = {'Forward': '#e74c3c', 'Midfielder': '#3498db',
          'Defender': '#2ecc71', 'Goalkeeper': '#f39c12'}
for position, data in position_curves.items():
    ax.plot(data['age'], data['predicted'],
            label=position, color=colors[position], linewidth=2)
ax.set_xlabel('Age')
ax.set_ylabel('Composite Score')
ax.set_title('Development Curves by Position')
ax.legend()
plt.tight_layout()
plt.show()
# Identify early vs late developers
def calculate_development_rate(group):
    """Summarise one player's progression: first/last observed age and
    composite score, plus score change per year of age.

    Always returns a Series with the same five keys so that
    ``groupby(...).apply`` yields aligned columns even for players with
    a single observed season (all-NaN in that case). The original
    returned only 'development_rate' from the short-group branch, which
    misaligned the remaining columns.
    """
    keys = ['start_age', 'end_age', 'start_score', 'end_score',
            'development_rate']
    if len(group) < 2:
        # One season is not enough to measure a rate.
        return pd.Series({k: np.nan for k in keys})
    group = group.sort_values('age')
    start_score = group['composite_score'].iloc[0]
    end_score = group['composite_score'].iloc[-1]
    age_diff = group['age'].iloc[-1] - group['age'].iloc[0]
    return pd.Series({
        'start_age': group['age'].iloc[0],
        'end_age': group['age'].iloc[-1],
        'start_score': start_score,
        'end_score': end_score,
        # Guard against duplicate-age observations (zero age span).
        'development_rate': (end_score - start_score) / age_diff if age_diff > 0 else 0
    })
# Development rate per player; single-season players yield NaN rates.
development_speed = (
    development_metrics
    .groupby('player_id')
    .apply(calculate_development_rate)
    .reset_index()
)
# Categorize developers by rate quartile: top quartile improves fastest.
q25 = development_speed['development_rate'].quantile(0.25)
q75 = development_speed['development_rate'].quantile(0.75)
development_speed['developer_type'] = np.where(
    development_speed['development_rate'] > q75, 'Early Developer',
    np.where(development_speed['development_rate'] < q25, 'Late Bloomer', 'Normal')
)

# Position-specific development curves
# NOTE(review): `player_positions` and `composite_score` are not created
# in this chunk — confirm they come from earlier processing.
position_curves <- development_metrics %>%
  inner_join(player_positions, by = "player_id") %>%
  group_by(position_group) %>%
  nest() %>%
  mutate(
    # One GAM per position (modest basis suits small per-position samples).
    model = map(data, ~gam(
      composite_score ~ s(age, k = 5),
      data = .x
    )),
    predictions = map2(model, data, function(m, d) {
      ages <- seq(16, 23, by = 0.5)
      tibble(
        age = ages,
        predicted = predict(m, newdata = tibble(age = ages))
      )
    })
  ) %>%
  select(position_group, predictions) %>%
  unnest(predictions)

# Compare development rates by position
ggplot(position_curves, aes(x = age, y = predicted, color = position_group)) +
  geom_line(linewidth = 1.2) +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Development Curves by Position",
    subtitle = "Composite performance score by age",
    x = "Age",
    y = "Composite Score",
    color = "Position"
  ) +
  theme_minimal()

# Identify early vs late developers: score change per year of age between
# a player's first and last observed season.
development_speed <- development_metrics %>%
  group_by(player_id) %>%
  filter(n() >= 2) %>%
  arrange(age) %>%
  summarise(
    start_age = min(age),
    end_age = max(age),
    start_score = first(composite_score),
    end_score = last(composite_score),
    development_rate = (end_score - start_score) / (end_age - start_age),
    .groups = "drop"
  ) %>%
  mutate(
    developer_type = case_when(
      development_rate > quantile(development_rate, 0.75) ~ "Early Developer",
      development_rate < quantile(development_rate, 0.25) ~ "Late Bloomer",
      TRUE ~ "Normal Development"
    )
  )

Talent Identification Models
Talent identification uses current performance and physical/technical attributes to predict future success. The challenge is that youth data is noisy and sample sizes are small.
Predictive Features
- Technical skills (relative to age)
- Physical attributes and growth potential
- Decision-making speed
- Performance in high-pressure situations
- Rate of improvement
Challenges
- Relative age effects
- Physical maturation differences
- Small sample sizes
- Context and competition level
- Long prediction horizons
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score
import shap
# Prepare talent identification dataset: observations of players aged
# 15-18 (inclusive) at the time of measurement.
# NOTE(review): `youth_players` is not defined in this chunk —
# presumably loaded earlier.
talent_data = youth_players[
    (youth_players['age_at_observation'] >= 15) &
    (youth_players['age_at_observation'] <= 18)
].copy()
# Create target: reached professional level within 5 years
# (any professional appearance counts as success).
talent_data['reached_pro'] = (
    talent_data['professional_appearances_5yr'] > 0
).astype(int)
# Percentile standing of a metric among same-age (rounded) peers.
def age_adjusted_percentile(df, column, age_column):
    """Rank `column` as a percentile in (0, 1] within rounded-age groups."""
    peer_group = df[age_column].round()
    return df[column].groupby(peer_group).rank(pct=True)
# Create age-adjusted features: percentile among same-age peers.
talent_data['xg_percentile_age'] = age_adjusted_percentile(
    talent_data, 'xg_p90', 'age_at_observation'
)
talent_data['pass_percentile_age'] = age_adjusted_percentile(
    talent_data, 'progressive_passes_p90', 'age_at_observation'
)
talent_data['dribble_percentile_age'] = age_adjusted_percentile(
    talent_data, 'successful_dribbles_p90', 'age_at_observation'
)
# Feature selection
features = [
    'xg_percentile_age', 'pass_percentile_age', 'dribble_percentile_age',
    'height_percentile_age', 'sprint_percentile_age',
    'improvement_rate', 'minutes_played_u17', 'international_u17_caps',
    'birth_quarter'  # Control for relative age effect
]
# Prepare data
X = talent_data[features].fillna(0)
y = talent_data['reached_pro']
# Stratified split preserves the (rare) positive-class share.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# Train Gradient Boosting model
talent_model = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    random_state=42
)
talent_model.fit(X_train, y_train)
# Evaluate on the held-out split.
y_pred_proba = talent_model.predict_proba(X_test)[:, 1]
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.3f}")
# Cross-validation (refits per fold) for a more stable estimate.
cv_scores = cross_val_score(talent_model, X, y, cv=5, scoring='roc_auc')
print(f"CV ROC-AUC: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")
# SHAP analysis for interpretability
explainer = shap.TreeExplainer(talent_model)
shap_values = explainer.shap_values(X_test)
# Summary plot
shap.summary_plot(shap_values, X_test, feature_names=features)
# Score the current crop of youth players.
# NOTE(review): `current_youth` is not defined in this chunk.
current_youth['pro_probability'] = talent_model.predict_proba(
    current_youth[features].fillna(0)
)[:, 1]
# Rank prospects
prospects = current_youth.nlargest(20, 'pro_probability')[
    ['player_name', 'age', 'position', 'pro_probability']
]
print(prospects)

library(randomForest)
library(caret)
# Prepare talent identification dataset
# NOTE(review): `ntile_by_age()` and `improvement_rate_calc()` are not
# defined in this chunk — confirm these helpers exist upstream.
talent_data <- youth_players %>%
  filter(age_at_observation >= 15, age_at_observation <= 18) %>%
  mutate(
    # Create target: reached professional level within 5 years
    reached_pro = as.factor(ifelse(
      professional_appearances_5yr > 0, "Yes", "No"
    )),
    # Age-adjusted percentiles
    xg_percentile_age = ntile_by_age(xg_p90, age_at_observation),
    pass_percentile_age = ntile_by_age(progressive_passes_p90, age_at_observation),
    dribble_percentile_age = ntile_by_age(successful_dribbles_p90, age_at_observation),
    # Physical metrics
    height_percentile_age = ntile_by_age(height_cm, age_at_observation),
    sprint_percentile_age = ntile_by_age(top_speed_kmh, age_at_observation),
    # Calculate improvement rate if multiple seasons available
    improvement_rate = improvement_rate_calc(player_id, composite_score, age)
  )
# Split data (createDataPartition stratifies on the target)
set.seed(42)
train_idx <- createDataPartition(talent_data$reached_pro, p = 0.7, list = FALSE)
train_data <- talent_data[train_idx, ]
test_data <- talent_data[-train_idx, ]
# Feature selection for talent model
features <- c(
  "xg_percentile_age", "pass_percentile_age", "dribble_percentile_age",
  "height_percentile_age", "sprint_percentile_age",
  "improvement_rate", "minutes_played_u17", "international_u17_caps",
  "position_group", "birth_quarter" # Control for relative age
)
# Train Random Forest model
talent_model <- randomForest(
  reached_pro ~ .,
  data = train_data[, c(features, "reached_pro")],
  ntree = 500,
  mtry = 3,
  importance = TRUE
)
# Variable importance, sorted most-important first
importance_df <- importance(talent_model) %>%
  as.data.frame() %>%
  rownames_to_column("feature") %>%
  arrange(desc(MeanDecreaseGini))
ggplot(importance_df, aes(x = reorder(feature, MeanDecreaseGini),
                          y = MeanDecreaseGini)) +
  geom_bar(stat = "identity", fill = "forestgreen") +
  coord_flip() +
  labs(
    title = "Feature Importance for Talent Prediction",
    x = "Feature",
    y = "Importance (Gini)"
  ) +
  theme_minimal()
# Predict probabilities for current youth players
current_youth$pro_probability <- predict(
  talent_model,
  newdata = current_youth,
  type = "prob"
)[, "Yes"]

Age-Adjusted Percentiles
Raw percentiles are misleading for youth players because a 16-year-old shouldn't be compared directly to a 21-year-old. Age-adjusted percentiles show how a player ranks among peers of similar age.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import pi
def calculate_age_adjusted_percentiles(data, metrics, age_col='age'):
    """
    Return a copy of `data` with one `<metric>_pct_age` column per
    metric: the player's percentile (0-100] among peers in the same
    whole-year age group (floor of `age_col`).
    """
    result = data.copy()
    result['age_group'] = np.floor(result[age_col])
    for metric in metrics:
        ranked = result.groupby('age_group')[metric].rank(pct=True)
        result[f"{metric}_pct_age"] = ranked * 100
    return result
# Define metrics to adjust
metrics_to_adjust = [
    'xg_p90', 'xa_p90', 'progressive_passes_p90',
    'successful_dribbles_p90', 'tackles_won_p90', 'aerial_win_pct'
]
# Apply adjustment (adds one `<metric>_pct_age` column per metric).
youth_adjusted = calculate_age_adjusted_percentiles(
    youth_data,
    metrics_to_adjust,
    'age'
)
def create_youth_radar(data, player_name, metrics):
    """
    Create radar chart showing age-adjusted percentiles.

    `data` must contain a 'name' column plus the `<metric>_pct_age`
    columns produced by calculate_age_adjusted_percentiles; `metrics`
    lists the base metric names. Displays the chart via plt.show() and
    returns None.
    """
    # Get player data (first row matching the name).
    player_row = data[data['name'] == player_name].iloc[0]
    # Extract percentile values
    percentile_cols = [f"{m}_pct_age" for m in metrics]
    values = [player_row[col] for col in percentile_cols]
    values += values[:1]  # Close the polygon
    # Set up radar chart: one evenly spaced spoke per metric.
    angles = [n / len(metrics) * 2 * pi for n in range(len(metrics))]
    angles += angles[:1]
    # Create plot
    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    # Plot data
    ax.plot(angles, values, 'o-', linewidth=2, color='forestgreen')
    ax.fill(angles, values, alpha=0.25, color='forestgreen')
    # Set labels
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels([m.replace('_p90', '').replace('_', ' ').title()
                        for m in metrics])
    ax.set_ylim(0, 100)
    # Reference circles at the 25th/50th/75th percentiles.
    for pct in [25, 50, 75]:
        ax.plot(angles, [pct] * len(angles), '--', color='gray', alpha=0.3)
    plt.title(f"{player_name} - Age-Adjusted Profile\nAge: {player_row['age']:.1f}")
    plt.tight_layout()
    plt.show()
# Example usage
create_youth_radar(youth_adjusted, "Jude Bellingham", metrics_to_adjust)
# Compare multiple prospects
def compare_prospects(data, player_names, metrics):
    """
    Compare multiple youth players on age-adjusted metrics, one radar
    panel per player. Displays the figure; returns None.
    """
    fig, axes = plt.subplots(1, len(player_names),
                             figsize=(5*len(player_names), 5),
                             subplot_kw=dict(polar=True))
    # plt.subplots returns a bare Axes (not an array) when only one
    # panel is requested; normalize so the zip below always works.
    axes = np.atleast_1d(axes)
    colors = plt.cm.Set2(np.linspace(0, 1, len(player_names)))
    # Spoke angles are identical for every panel; compute once.
    angles = [n / len(metrics) * 2 * pi for n in range(len(metrics))]
    angles += angles[:1]
    for idx, (ax, name) in enumerate(zip(axes, player_names)):
        player_row = data[data['name'] == name].iloc[0]
        percentile_cols = [f"{m}_pct_age" for m in metrics]
        values = [player_row[col] for col in percentile_cols]
        values += values[:1]  # close the polygon
        ax.plot(angles, values, 'o-', linewidth=2, color=colors[idx])
        ax.fill(angles, values, alpha=0.25, color=colors[idx])
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels([m.replace('_p90', '')[:6] for m in metrics], size=8)
        ax.set_ylim(0, 100)
        ax.set_title(f"{name}\nAge: {player_row['age']:.1f}")
    plt.tight_layout()
    plt.show()

# Calculate age-adjusted percentiles
calculate_age_adjusted_percentiles <- function(data, metrics, age_col = "age") {
  # Percentile-rank each metric among peers in the same whole-year age
  # group; adds one `<metric>_pct_age` column per metric and an
  # `age_group` column, leaving the input otherwise untouched.
  data %>%
    mutate(age_group = floor(.data[[age_col]])) %>%
    group_by(age_group) %>%
    mutate(across(
      all_of(metrics),
      ~ percent_rank(.x) * 100,
      .names = "{.col}_pct_age"
    )) %>%
    ungroup()
}
# Apply to youth player data
metrics_to_adjust <- c(
  "xg_p90", "xa_p90", "progressive_passes_p90",
  "successful_dribbles_p90", "tackles_won_p90", "aerial_win_pct"
)
# Adds one `<metric>_pct_age` percentile column per metric.
youth_adjusted <- calculate_age_adjusted_percentiles(
  youth_data,
  metrics_to_adjust,
  "age"
)
# Create radar chart comparing player to age peers
create_youth_radar <- function(player_data, player_name, metrics) {
  # Pull the player's age-adjusted percentile columns.
  player_row <- player_data %>%
    filter(name == player_name) %>%
    select(ends_with("_pct_age"))
  # Long format: one row per metric for plotting.
  radar_data <- player_row %>%
    pivot_longer(everything(), names_to = "metric", values_to = "percentile") %>%
    mutate(metric = str_remove(metric, "_pct_age"))
  # Radar plot via polar coordinates. `group = 1` is required: with a
  # discrete x axis each metric would otherwise be its own group and
  # geom_polygon would draw nothing.
  ggplot(radar_data, aes(x = metric, y = percentile)) +
    geom_polygon(aes(group = 1), fill = "forestgreen", alpha = 0.3) +
    geom_point(color = "forestgreen", size = 3) +
    coord_polar() +
    scale_y_continuous(limits = c(0, 100)) +
    labs(
      title = paste(player_name, "- Age-Adjusted Profile"),
      subtitle = paste("Age:", player_data$age[player_data$name == player_name])
    ) +
    theme_minimal()
}
# Example usage
create_youth_radar(youth_adjusted, "Jude Bellingham", metrics_to_adjust)

Relative Age Effect
Players born early in the selection year (Q1) are overrepresented in academies due to physical advantages at young ages. Consider birth quarter when evaluating prospects - a Q4 player performing equally to Q1 peers may have higher ceiling.
Loan Decision Analytics
Loan decisions are critical for player development. Analytics can help identify the right loan level, playing time requirements, and development environments.
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
# Analyze historical loan outcomes.
# NOTE(review): `loans_data` is not defined in this chunk — presumably
# loaded earlier.
loan_outcomes = loans_data.copy()
# Development during the loan: post-loan minus pre-loan composite score.
loan_outcomes['development'] = (
    loan_outcomes['composite_score_post'] -
    loan_outcomes['composite_score_pre']
)
# Positive difference = loan club plays in a lower league.
loan_outcomes['league_level_diff'] = (
    loan_outcomes['home_league_level'] -
    loan_outcomes['loan_league_level']
)
loan_outcomes['minutes_per_game'] = (
    loan_outcomes['total_minutes'] /
    loan_outcomes['games_available']
)
loan_outcomes['starting_pct'] = (
    loan_outcomes['starts'] /
    loan_outcomes['appearances']
)
# Model loan success factors with OLS (statsmodels for the summary table).
features = ['league_level_diff', 'minutes_per_game', 'starting_pct',
            'age_at_loan', 'loan_duration_months']
X = loan_outcomes[features].dropna()
y = loan_outcomes.loc[X.index, 'development']
X_with_const = sm.add_constant(X)
loan_model = sm.OLS(y, X_with_const).fit()
print(loan_model.summary())
# Optimal loan level analysis: average development per level bucket.
optimal_level = (
    loan_outcomes
    .groupby('league_level_diff')
    .agg({
        'development': ['mean', 'std', 'count'],
        'minutes_per_game': 'mean'
    })
    .reset_index()
)
optimal_level.columns = ['league_level_diff', 'avg_development',
                         'std_development', 'n_loans', 'avg_minutes']
# Standard error of the mean development per bucket.
optimal_level['se'] = optimal_level['std_development'] / np.sqrt(optimal_level['n_loans'])
# Visualize with error bars.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(optimal_level['league_level_diff'], optimal_level['avg_development'],
       yerr=optimal_level['se'], color='steelblue', capsize=5)
ax.set_xlabel('League Level Difference (positive = lower league)')
ax.set_ylabel('Average Development Score Change')
ax.set_title('Player Development by Loan League Level')
ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
plt.tight_layout()
plt.show()
def recommend_loan(player, available_clubs):
    """
    Recommend optimal loan destinations for a player.

    Scores each candidate club on level match, likely playing time and
    development environment, returning a DataFrame sorted best-first.

    NOTE(review): relies on `estimate_player_level` and
    `predict_playing_time` helpers defined elsewhere — confirm their
    input/output scales match the weights below.
    """
    columns = ['club', 'league', 'level_match', 'playing_time_score',
               'development_score', 'total_score']
    recommendations = []
    for _, club in available_clubs.iterrows():
        # Level match: penalize distance between the club's league level
        # and the level implied by the player's score (5-level pyramid).
        level_match = 1 - abs(
            club['league_level'] - estimate_player_level(player['composite_score'])
        ) / 5
        # Probability of meaningful minutes given squad depth.
        playing_time_score = predict_playing_time(
            player['composite_score'],
            club['avg_player_level'],
            club['position_depth_' + player['position']]
        )
        # Development environment, normalized from 0-100 rating to 0-1.
        development_score = club['youth_development_rating'] / 100
        # Weighted blend; playing time dominates (minutes drive development).
        total_score = (
            0.3 * level_match +
            0.4 * playing_time_score +
            0.3 * development_score
        )
        recommendations.append({
            'club': club['name'],
            'league': club['league'],
            'level_match': level_match,
            'playing_time_score': playing_time_score,
            'development_score': development_score,
            'total_score': total_score
        })
    # An empty club list would otherwise raise KeyError on sort_values.
    if not recommendations:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(recommendations).sort_values('total_score', ascending=False)

# Analyze historical loan outcomes
loan_outcomes <- loans_data %>%
  mutate(
    # Calculate development during loan
    pre_loan_score = composite_score_pre,
    post_loan_score = composite_score_post,
    development = post_loan_score - pre_loan_score,
    # Loan characteristics (positive diff = lower league than parent club)
    league_level_diff = home_league_level - loan_league_level,
    minutes_per_game = total_minutes / games_available,
    starting_pct = starts / appearances
  )
# Model loan success factors
loan_success_model <- lm(
  development ~
    league_level_diff +
    minutes_per_game +
    starting_pct +
    age_at_loan +
    loan_duration_months +
    position,
  data = loan_outcomes
)
summary(loan_success_model)
# Optimal loan level analysis: mean development per level bucket with
# standard errors of the mean.
optimal_level <- loan_outcomes %>%
  group_by(league_level_diff) %>%
  summarise(
    n_loans = n(),
    avg_development = mean(development, na.rm = TRUE),
    se = sd(development, na.rm = TRUE) / sqrt(n()),
    avg_minutes = mean(minutes_per_game, na.rm = TRUE),
    .groups = "drop"
  )
ggplot(optimal_level, aes(x = factor(league_level_diff), y = avg_development)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_errorbar(aes(ymin = avg_development - se, ymax = avg_development + se),
                width = 0.2) +
  labs(
    title = "Player Development by Loan League Level",
    subtitle = "Positive = lower league than parent club",
    x = "League Level Difference",
    y = "Average Development Score Change"
  ) +
  theme_minimal()
# Helper function: Estimate player level in a league context
estimated_player_level <- function(composite_score) {
  # Map a composite score (0-100) onto a five-tier league pyramid:
  # >= 80 -> 1 (top tier), >= 65 -> 2, >= 50 -> 3, >= 35 -> 4, else 5.
  # findInterval counts how many thresholds the score has reached
  # (vectorized base R, equivalent to the original case_when ladder).
  level <- 5 - findInterval(composite_score, c(35, 50, 65, 80))
  # Mirror case_when's catch-all: an NA score fell through to the final
  # `TRUE ~ 5` branch in the original ladder.
  replace(level, is.na(level), 5)
}
# Helper function: Predict playing time likelihood
predict_playing_time <- function(player_level, league_avg_level, position_depth) {
  # Likelihood of regular minutes: grows with the player's edge over the
  # league's average level, shrinks with squad depth at the position.
  raw_score <- ((player_level - league_avg_level) + 20) / 40 *
    (2 / (position_depth + 1))
  # Clamp to [0, 1] (scalar min/max, as in the original).
  min(max(raw_score, 0), 1)
}
# Recommend loan destination for player
recommend_loan <- function(player, available_clubs) {
  # Player needs
  player_level <- player$current_composite_score
  player_position <- player$position
  player_age <- player$age
  # Score each potential destination
  club_scores <- available_clubs %>%
    mutate(
      # Match player level to league; divide by the 5-level pyramid so
      # the term is on a 0-1 scale (consistent with the Python version,
      # which the original omitted).
      level_match = 1 - abs(league_level - estimated_player_level(player_level)) / 5,
      # Playing time likelihood.
      # NOTE(review): predict_playing_time clamps with scalar min/max;
      # inside mutate it receives whole columns — confirm whether
      # pmin/pmax semantics are intended for multi-club inputs.
      playing_time_score = predict_playing_time(
        player_level, league_avg_level, position_depth
      ),
      # Development environment, normalized from a 0-100 rating to 0-1
      # so the weighted blend below is comparable across terms
      # (matches the Python version's /100).
      development_score = youth_development_rating / 100,
      # Combined score
      total_score = 0.3 * level_match +
        0.4 * playing_time_score +
        0.3 * development_score
    ) %>%
    arrange(desc(total_score))
  return(club_scores)
}

Loan Success Criteria
| Factor | Optimal Range | Importance |
|---|---|---|
| Minutes per game | > 70 minutes | Critical |
| Starting percentage | > 60% | High |
| League level difference | 1-2 levels below | High |
| Loan duration | Full season | Medium |
| Club development rating | Top 25% in league | Medium |
Academy Performance Dashboard
A comprehensive academy dashboard tracks player development, pathway success, and overall academy efficiency.
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def calculate_academy_kpis(academy_data, years=5):
    """
    Calculate key performance indicators for the academy over a recent
    lookback window.

    Parameters
    ----------
    academy_data : DataFrame with (at least) entry_date / exit_date
        (datetime-like), first_team_debut, exit_type,
        composite_score_entry / composite_score_exit and transfer_fee.
    years : int, lookback window in years (365-day years; default 5).

    Returns
    -------
    dict with KPI groups: 'graduation_rate', 'pro_contract_rate',
    'development_efficiency', 'sales_revenue'.
    """
    from datetime import datetime, timedelta

    end_date = datetime.now()
    start_date = end_date - timedelta(days=years*365)
    kpis = {}

    # Graduation rate among players who entered inside the window.
    recent_intake = academy_data[academy_data['entry_date'] >= start_date]
    kpis['graduation_rate'] = {
        'total_players': len(recent_intake),
        'reached_first_team': recent_intake['first_team_debut'].sum(),
        'rate': recent_intake['first_team_debut'].mean() * 100
    }

    # Share of recent exits that converted to professional contracts.
    recent_exits = academy_data[academy_data['exit_date'] >= start_date]
    pro_contracts = recent_exits[recent_exits['exit_type'] == 'Professional Contract']
    kpis['pro_contract_rate'] = {
        'total_exits': len(recent_exits),
        'pro_contracts': len(pro_contracts),
        'rate': len(pro_contracts) / len(recent_exits) * 100 if len(recent_exits) > 0 else 0
    }

    # Development efficiency: composite-score gain per academy year.
    # .copy() makes an explicit frame so the derived-column assignments
    # below do not trigger SettingWithCopyWarning on the dropna slice.
    with_scores = academy_data.dropna(
        subset=['composite_score_entry', 'composite_score_exit']
    ).copy()
    with_scores['development'] = (
        with_scores['composite_score_exit'] -
        with_scores['composite_score_entry']
    )
    with_scores['years_in_academy'] = (
        (with_scores['exit_date'] - with_scores['entry_date']).dt.days / 365
    )
    kpis['development_efficiency'] = {
        'avg_development': with_scores['development'].mean(),
        'development_per_year': (
            with_scores['development'] / with_scores['years_in_academy']
        ).mean()
    }

    # Transfer revenue from academy sales inside the window.
    sales = academy_data[
        (academy_data['exit_type'] == 'Sale') &
        (academy_data['exit_date'] >= start_date)
    ]
    kpis['sales_revenue'] = {
        'total_sales': len(sales),
        'total_revenue': sales['transfer_fee'].sum(),
        'avg_fee': sales['transfer_fee'].mean()
    }
    return kpis
def create_academy_dashboard(academy_data):
    """
    Create comprehensive academy dashboard (2x2 plotly figure) and
    return the KPI dict from calculate_academy_kpis.

    NOTE(review): relies on `calculate_pathway_funnel` and
    `calculate_yearly_graduation_rate`, which are not defined in this
    chunk — confirm they exist upstream.
    """
    kpis = calculate_academy_kpis(academy_data)
    # 2x2 grid; subplot `specs` types must match the trace types added
    # below (funnel/bar/pie/scatter).
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Player Development Funnel',
            'Development by Age Group',
            'Position Distribution',
            'Historical Graduation Rate'
        ),
        specs=[[{'type': 'funnel'}, {'type': 'bar'}],
               [{'type': 'pie'}, {'type': 'scatter'}]]
    )
    # Funnel chart: academy stages with stage-over-stage retention labels.
    funnel_data = calculate_pathway_funnel(academy_data)
    fig.add_trace(
        go.Funnel(
            y=funnel_data['stage'],
            x=funnel_data['count'],
            textinfo="value+percent previous"
        ),
        row=1, col=1
    )
    # Average development rate by age group.
    age_development = (
        academy_data
        .groupby('age_group')
        ['development_rate']
        .mean()
        .reset_index()
    )
    fig.add_trace(
        go.Bar(
            x=age_development['age_group'],
            y=age_development['development_rate'],
            marker_color='forestgreen'
        ),
        row=1, col=2
    )
    # Position distribution among currently active players.
    position_dist = (
        academy_data[academy_data['current_status'] == 'Active']
        .groupby('position')
        .size()
        .reset_index(name='count')
    )
    fig.add_trace(
        go.Pie(
            labels=position_dist['position'],
            values=position_dist['count']
        ),
        row=2, col=1
    )
    # Historical graduation rate trend.
    yearly_rates = calculate_yearly_graduation_rate(academy_data)
    fig.add_trace(
        go.Scatter(
            x=yearly_rates['year'],
            y=yearly_rates['graduation_rate'],
            mode='lines+markers'
        ),
        row=2, col=2
    )
    fig.update_layout(height=800, showlegend=False)
    fig.show()
    return kpis
# Example usage
kpis = create_academy_dashboard(academy_data)
print(f"Graduation Rate: {kpis['graduation_rate']['rate']:.1f}%")
print(f"Avg Development/Year: {kpis['development_efficiency']['development_per_year']:.2f}")

library(shiny)
library(plotly)
# Academy KPIs over a recent lookback window (default 5 years).
calculate_academy_kpis <- function(academy_data, years = 5) {
  # 365-day years ignore leap days — acceptable for a KPI window.
  end_date <- Sys.Date()
  start_date <- end_date - years * 365
  kpis <- list(
    # Graduation rate among players who entered inside the window
    graduation_rate = academy_data %>%
      filter(entry_date >= start_date) %>%
      summarise(
        total_players = n(),
        reached_first_team = sum(first_team_debut == TRUE),
        graduation_rate = reached_first_team / total_players * 100
      ),
    # Pro contract rate among recent exits
    pro_contract_rate = academy_data %>%
      filter(exit_date >= start_date) %>%
      summarise(
        total_exits = n(),
        pro_contracts = sum(exit_type == "Professional Contract"),
        pro_rate = pro_contracts / total_exits * 100
      ),
    # Development efficiency: composite-score gain per academy year
    development_efficiency = academy_data %>%
      filter(!is.na(composite_score_entry), !is.na(composite_score_exit)) %>%
      summarise(
        avg_development = mean(composite_score_exit - composite_score_entry),
        development_per_year = mean(
          (composite_score_exit - composite_score_entry) /
            as.numeric(exit_date - entry_date) * 365
        )
      ),
    # Position coverage of the active squad, one column per position
    position_coverage = academy_data %>%
      filter(current_status == "Active") %>%
      group_by(age_group, position) %>%
      summarise(count = n(), .groups = "drop") %>%
      pivot_wider(names_from = position, values_from = count, values_fill = 0),
    # Revenue from sales inside the window
    sales_revenue = academy_data %>%
      filter(exit_type == "Sale", exit_date >= start_date) %>%
      summarise(
        total_sales = n(),
        total_revenue = sum(transfer_fee, na.rm = TRUE),
        avg_fee = mean(transfer_fee, na.rm = TRUE)
      )
  )
  return(kpis)
}
# Generate dashboard data
academy_kpis <- calculate_academy_kpis(academy_data)
# Create summary visualization: one row per headline KPI, formatted as
# display text for dashboard cards.
create_academy_summary <- function(kpis) {
  # KPI cards; `color` values look like Bootstrap status classes —
  # NOTE(review): confirm against the consuming Shiny UI.
  kpi_cards <- data.frame(
    metric = c("Graduation Rate", "Pro Contract Rate",
               "Avg Development/Year", "Total Sales Revenue"),
    value = c(
      paste0(round(kpis$graduation_rate$graduation_rate, 1), "%"),
      paste0(round(kpis$pro_contract_rate$pro_rate, 1), "%"),
      round(kpis$development_efficiency$development_per_year, 2),
      paste0("€", round(kpis$sales_revenue$total_revenue / 1e6, 1), "M")
    ),
    color = c("success", "primary", "warning", "info")
  )
  return(kpi_cards)
}
# Player pathway funnel: intake through first-team debut, with
# stage-over-stage retention percentages.
pathway_funnel <- academy_data %>%
  summarise(
    u12_intake = sum(age_group_entry == "U12"),
    # NOTE(review): these comparisons rely on string ordering of the
    # age-group labels ("U12" <= "U15", etc.) — confirm age_group_entry
    # is an ordered factor or that lexical order matches age order.
    u15_retained = sum(age_group_entry <= "U12" & reached_u15),
    u18_retained = sum(age_group_entry <= "U15" & reached_u18),
    u21_retained = sum(age_group_entry <= "U18" & reached_u21),
    first_team = sum(first_team_debut)
  ) %>%
  pivot_longer(everything(), names_to = "stage", values_to = "count") %>%
  mutate(
    stage = factor(stage, levels = c("u12_intake", "u15_retained",
                                     "u18_retained", "u21_retained", "first_team")),
    # lag() depends on row order; rows arrive in funnel order from
    # pivot_longer (column order of the summarise above).
    retention = count / lag(count) * 100
  )

Practice Exercises
Exercise 21.1: Complete Youth Development Tracking System
Task: Build a comprehensive development tracking system that monitors youth player progression across multiple seasons, creates position-specific development curves, and identifies players who are ahead or behind expected development.
Requirements:
- Track development across technical, physical, and tactical dimensions
- Create GAM-based development curves with confidence intervals
- Calculate development velocity (rate of improvement)
- Flag players significantly above/below expected curve
- Generate development reports for coaches
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import UnivariateSpline
import matplotlib.pyplot as plt
# Simulate comprehensive youth player data
# Each player gets 3-6 seasons; metrics grow with age, scaled by
# position-specific multipliers plus a persistent per-player talent factor.
# NOTE(review): RNG call order is load-bearing for reproducibility with
# seed 42 — do not reorder the draws below.
np.random.seed(42)
n_players = 200
youth_data = []
for p in range(n_players):
n_seasons = np.random.randint(3, 7)
start_age = np.random.uniform(15.5, 17.5)
position = np.random.choice(['Forward', 'Midfielder', 'Defender', 'Goalkeeper'])
talent_factor = np.random.normal(0, 0.3)
for s in range(n_seasons):
age = start_age + s
minutes = max(400, np.random.normal(1200, 400))
# Technical metrics with age-based growth
pos_mult = {'Forward': 1.5, 'Midfielder': 0.7, 'Defender': 0.5, 'Goalkeeper': 0.3}
xg_p90 = max(0, 0.1 + (age - 16) * 0.08 * pos_mult[position] +
talent_factor * 0.1 + np.random.normal(0, 0.05))
mid_mult = {'Forward': 0.7, 'Midfielder': 1.5, 'Defender': 0.6, 'Goalkeeper': 0.3}
xa_p90 = max(0, 0.05 + (age - 16) * 0.04 * mid_mult[position] +
talent_factor * 0.08 + np.random.normal(0, 0.03))
prog_mult = {'Forward': 0.8, 'Midfielder': 1.3, 'Defender': 1.0, 'Goalkeeper': 0.3}
progressive_passes_p90 = max(0, 2 + (age - 16) * 0.5 * prog_mult[position] +
talent_factor * 0.5 + np.random.normal(0, 0.5))
successful_dribbles_p90 = max(0, 1 + (age - 16) * 0.3 + talent_factor * 0.4 +
np.random.normal(0, 0.3))
# Physical metrics
top_speed_kmh = 28 + (age - 16) * 0.8 + talent_factor * 1.5 + np.random.normal(0, 0.5)
# Defensive metrics
def_mult = {'Forward': 0.5, 'Midfielder': 0.8, 'Defender': 1.5, 'Goalkeeper': 0.3}
tackles_won_p90 = max(0, 1.5 + (age - 16) * 0.2 * def_mult[position] +
talent_factor * 0.3 + np.random.normal(0, 0.3))
youth_data.append({
'player_id': f'YP{p:03d}',
'position': position,
'season': s + 1,
'age': age,
'minutes': minutes,
'xg_p90': xg_p90,
'xa_p90': xa_p90,
'progressive_passes_p90': progressive_passes_p90,
'successful_dribbles_p90': successful_dribbles_p90,
'top_speed_kmh': top_speed_kmh,
'tackles_won_p90': tackles_won_p90
})
youth_df = pd.DataFrame(youth_data)
youth_df = youth_df[youth_df['minutes'] >= 450] # Minimum playing time
# Calculate position-specific z-scores and composite score
def calculate_composite_score(df):
    """Add within-position z-score columns and a position-weighted composite.

    Each metric is standardized against peers in the same position
    (mean 0, population std 1), then combined into 'composite_score'
    using position-specific weights. Replaces the previous per-position
    sklearn StandardScaler fit with an equivalent pandas computation
    (same ddof=0 standardization, zero-variance columns left at 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'position' plus the six metric columns listed below.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with '<metric>_z' columns and 'composite_score' added.
    """
    metrics = ['xg_p90', 'xa_p90', 'progressive_passes_p90',
               'successful_dribbles_p90', 'top_speed_kmh', 'tackles_won_p90']
    result_df = df.copy()
    for position in df['position'].unique():
        mask = df['position'] == position
        pos_data = df.loc[mask, metrics]
        # Population (ddof=0) z-scores, matching StandardScaler; a
        # zero-variance column divides by 1 instead of 0 (z stays 0).
        mean = pos_data.mean()
        std = pos_data.std(ddof=0).replace(0.0, 1.0)
        z_scores = (pos_data - mean) / std
        for metric in metrics:
            result_df.loc[mask, f'{metric}_z'] = z_scores[metric]
    # Position-weighted composite score
    weights = {
        'Forward': {'xg_p90_z': 0.4, 'xa_p90_z': 0.2, 'successful_dribbles_p90_z': 0.2, 'top_speed_kmh_z': 0.2},
        'Midfielder': {'xg_p90_z': 0.2, 'xa_p90_z': 0.3, 'progressive_passes_p90_z': 0.3, 'successful_dribbles_p90_z': 0.2},
        'Defender': {'tackles_won_p90_z': 0.3, 'top_speed_kmh_z': 0.2, 'progressive_passes_p90_z': 0.3, 'successful_dribbles_p90_z': 0.2},
        'Goalkeeper': {'xg_p90_z': 0.25, 'xa_p90_z': 0.25, 'progressive_passes_p90_z': 0.25, 'tackles_won_p90_z': 0.25}
    }
    result_df['composite_score'] = 0.0
    for position, pos_weights in weights.items():
        mask = result_df['position'] == position
        for metric, weight in pos_weights.items():
            if metric in result_df.columns:
                result_df.loc[mask, 'composite_score'] += weight * result_df.loc[mask, metric]
    return result_df
# Add z-score and composite columns to the season-level frame
youth_df = calculate_composite_score(youth_df)
# Fit development curves using spline smoothing
def fit_development_curve(data, position, n_points=100):
    """Fit a smoothed expected-development curve for one position.

    Observations are binned by age (0.1-year bins, >= 5 samples per bin),
    a smoothing spline is fit to the bin means, and an approximate 95%
    band is built from the pooled within-bin standard deviation.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'position', 'age' and 'composite_score'.
    position : str
        Position to model.
    n_points : int
        Resolution of the returned age grid (16-23).

    Returns
    -------
    pandas.DataFrame
        One row per grid age with expected_score and 95% bounds.

    Raises
    ------
    ValueError
        If fewer than two age bins have enough samples for a spline.
    """
    # .copy() so the age_bin assignment below cannot raise
    # SettingWithCopyWarning on the filtered view
    pos_data = data[data['position'] == position].sort_values('age').copy()
    # Bin by age and calculate mean/std
    pos_data['age_bin'] = pos_data['age'].round(1)
    age_stats = pos_data.groupby('age_bin')['composite_score'].agg(['mean', 'std', 'count']).reset_index()
    age_stats = age_stats[age_stats['count'] >= 5]  # Minimum samples
    if len(age_stats) < 2:
        raise ValueError(f"Not enough populated age bins to fit a curve for {position!r}")
    # Spline degree cannot exceed number of points - 1 (original assumed
    # at least 4 bins and crashed below that)
    k = min(3, len(age_stats) - 1)
    spline = UnivariateSpline(age_stats['age_bin'], age_stats['mean'], k=k, s=0.5)
    # Generate smooth curve
    age_grid = np.linspace(16, 23, n_points)
    expected = spline(age_grid)
    # Estimate confidence interval using pooled std
    avg_std = age_stats['std'].mean()
    return pd.DataFrame({
        'position': position,
        'age': age_grid,
        'expected_score': expected,
        'lower_95': expected - 1.96 * avg_std,
        'upper_95': expected + 1.96 * avg_std
    })
# Fit curves for all positions
# One expected-development curve per position, stacked into a single frame
development_curves = pd.concat([
fit_development_curve(youth_df, pos)
for pos in youth_df['position'].unique()
])
# Calculate development velocity
def calculate_velocity(data):
    """Per-player rate of change of composite_score per year of age.

    Players with fewer than two qualifying seasons are omitted. Returns a
    DataFrame with one row per remaining player: position, current age and
    score, average and most-recent velocity, and the season count.
    """
    records = []
    # sort=False keeps first-appearance order, matching iteration over
    # data['player_id'].unique()
    for pid, history in data.groupby('player_id', sort=False):
        history = history.sort_values('age')
        if len(history) < 2:
            continue
        # Season-over-season score change divided by elapsed age;
        # the first element is NaN and is skipped by mean()
        per_season = history['composite_score'].diff() / history['age'].diff()
        records.append({
            'player_id': pid,
            'position': history['position'].iloc[0],
            'current_age': history['age'].max(),
            'current_score': history['composite_score'].iloc[-1],
            'avg_velocity': per_season.mean(),
            'recent_velocity': per_season.iloc[-1],
            'n_seasons': len(history)
        })
    return pd.DataFrame(records)
# One velocity summary row per multi-season player
player_velocities = calculate_velocity(youth_df)
# Compare to expected and flag development status
def compare_to_expected(data, curves):
    """Label each player's latest season against the expected curve.

    Bug fix: the curve age grid has a step of ~0.07 years, so rounding it
    to one decimal yields duplicate (position, age) keys; the original
    merge fanned out and duplicated player rows. The lookup is now
    deduplicated so exactly one row per player is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Season-level rows with player_id, position, age, composite_score.
    curves : pandas.DataFrame
        Output of the curve fitting: position, age, expected_score,
        lower_95, upper_95.

    Returns
    -------
    pandas.DataFrame
        Latest observation per player with 'deviation' and a
        'development_status' label.
    """
    # Latest season = row at each player's maximum age (same tie-breaking
    # as the previous groupby/idxmax approach: first occurrence of the max)
    latest = data.loc[data.groupby('player_id')['age'].idxmax()].reset_index(drop=True)
    latest['age_rounded'] = latest['age'].round(1)
    curves_lookup = curves[['position', 'age', 'expected_score', 'lower_95', 'upper_95']].copy()
    curves_lookup['age'] = curves_lookup['age'].round(1)
    # Keep one curve row per (position, rounded age) to avoid merge fan-out
    curves_lookup = curves_lookup.drop_duplicates(subset=['position', 'age'], keep='first')
    merged = latest.merge(curves_lookup, left_on=['position', 'age_rounded'],
                          right_on=['position', 'age'], how='left')
    merged['deviation'] = merged['composite_score'] - merged['expected_score']
    # Ordered conditions: first match wins; unmatched rows (all False/NaN)
    # fall through to the default label
    conditions = [
        merged['composite_score'] > merged['upper_95'],
        merged['composite_score'] > merged['expected_score'],
        merged['composite_score'] >= merged['lower_95']
    ]
    choices = ['Significantly Above', 'Above Average', 'Below Average']
    merged['development_status'] = np.select(conditions, choices, default='Significantly Below')
    return merged
# Status vs expected curve for each player's latest season
latest_status = compare_to_expected(youth_df, development_curves)
# Attach development velocities for the combined report table
latest_status = latest_status.merge(player_velocities[['player_id', 'avg_velocity', 'recent_velocity']],
on='player_id', how='left')
# Generate development report
def generate_report(player_id, data, latest, velocities):
    """Print a console development report for one player (side effects only).

    Pulls the player's season history from `data`, the latest-vs-expected
    row from `latest`, and the velocity summary from `velocities`.
    Returns None.
    """
    history = data[data['player_id'] == player_id].sort_values('age')
    status_row = latest[latest['player_id'] == player_id].iloc[0]
    velocity_row = velocities[velocities['player_id'] == player_id].iloc[0]
    banner = "=" * 55
    print(banner)
    print("YOUTH DEVELOPMENT REPORT")
    print(f"Player: {player_id} | Position: {status_row['position']}")
    print(banner)
    print("\nCurrent Status:")
    print(f" Age: {status_row['current_age']:.1f}")
    print(f" Composite Score: {status_row['current_score']:.2f}")
    print(f" Expected Score: {status_row['expected_score']:.2f}")
    print(f" Status: {status_row['development_status']}")
    print("\nDevelopment Velocity:")
    print(f" Average: {velocity_row['avg_velocity']:.3f} per year")
    print(f" Recent: {velocity_row['recent_velocity']:.3f} per year")
    print("\nKey Metrics (latest season):")
    final_season = history.iloc[-1]
    print(f" xG/90: {final_season['xg_p90']:.3f}")
    print(f" xA/90: {final_season['xa_p90']:.3f}")
    print(f" Prog Passes/90: {final_season['progressive_passes_p90']:.2f}")
# Visualize development curves
# One panel per position: 95% band, expected curve, and a sample of up to
# 15 individual player trajectories for context.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
positions = ['Forward', 'Midfielder', 'Defender', 'Goalkeeper']
for ax, position in zip(axes.flatten(), positions):
pos_curve = development_curves[development_curves['position'] == position]
pos_players = youth_df[youth_df['position'] == position]
# Confidence band
ax.fill_between(pos_curve['age'], pos_curve['lower_95'], pos_curve['upper_95'],
alpha=0.2, label='95% CI')
# Expected curve
ax.plot(pos_curve['age'], pos_curve['expected_score'], linewidth=2, label='Expected')
# Sample player trajectories
for pid in pos_players['player_id'].unique()[:15]:
p_data = pos_players[pos_players['player_id'] == pid]
ax.plot(p_data['age'], p_data['composite_score'], alpha=0.3, linewidth=0.8)
ax.set_title(f'{position} Development Curve')
ax.set_xlabel('Age')
ax.set_ylabel('Composite Score')
ax.legend(loc='upper left')
plt.tight_layout()
# suptitle added after tight_layout, hence y > 1 to sit above the axes
plt.suptitle('Youth Development Curves by Position', y=1.02, fontsize=14)
plt.show()
# Top prospects report
print("\n=== TOP PROSPECTS (Ahead of Expected Development) ===")
# Ten players furthest above their position's expected curve
top_prospects = latest_status[latest_status['development_status'] == 'Significantly Above'].nlargest(10, 'deviation')
print(top_prospects[['player_id', 'position', 'current_age', 'composite_score',
'expected_score', 'deviation', 'avg_velocity']].to_string(index=False))library(tidyverse)
library(mgcv)
# Simulate comprehensive youth player data
# Metrics grow with age, scaled by position multipliers plus a persistent
# per-player talent factor. NOTE(review): RNG call order is load-bearing for
# reproducibility with seed 42 — do not reorder the draws.
set.seed(42)
n_players <- 200
seasons_per_player <- sample(3:6, n_players, replace = TRUE)
youth_tracking <- map_dfr(1:n_players, function(p) {
n_seasons <- seasons_per_player[p]
start_age <- runif(1, 15.5, 17.5)
position <- sample(c("Forward", "Midfielder", "Defender", "Goalkeeper"), 1)
talent_factor <- rnorm(1, 0, 0.3) # Individual variation
tibble(
player_id = paste0("YP", sprintf("%03d", p)),
position = position,
season = 1:n_seasons,
age = start_age + (0:(n_seasons-1)),
minutes = pmax(400, rnorm(n_seasons, 1200, 400)),
# Technical metrics with age-based growth + individual variation
xg_p90 = pmax(0, 0.1 + (age - 16) * 0.08 * ifelse(position == "Forward", 1.5, 0.5) +
talent_factor * 0.1 + rnorm(n_seasons, 0, 0.05)),
xa_p90 = pmax(0, 0.05 + (age - 16) * 0.04 * ifelse(position == "Midfielder", 1.5, 0.7) +
talent_factor * 0.08 + rnorm(n_seasons, 0, 0.03)),
progressive_passes_p90 = pmax(0, 2 + (age - 16) * 0.5 * ifelse(position == "Midfielder", 1.3, 0.8) +
talent_factor * 0.5 + rnorm(n_seasons, 0, 0.5)),
successful_dribbles_p90 = pmax(0, 1 + (age - 16) * 0.3 + talent_factor * 0.4 + rnorm(n_seasons, 0, 0.3)),
# Physical metrics
top_speed_kmh = 28 + (age - 16) * 0.8 + talent_factor * 1.5 + rnorm(n_seasons, 0, 0.5),
distance_p90_km = 9 + (age - 16) * 0.3 + rnorm(n_seasons, 0, 0.4),
# Defensive metrics
tackles_won_p90 = pmax(0, 1.5 + (age - 16) * 0.2 * ifelse(position == "Defender", 1.5, 0.8) +
talent_factor * 0.3 + rnorm(n_seasons, 0, 0.3))
)
}) %>%
filter(minutes >= 450) # Minimum playing time
# Create composite development score by position
# Each metric is standardised within its position group, then combined into
# composite_score using position-specific weights (Goalkeeper falls through
# to an equal-weight blend of four metrics).
youth_tracking <- youth_tracking %>%
  group_by(position) %>%
  mutate(
    # as.numeric(scale(x)) == scale(x)[, 1]: drop the matrix attributes
    xg_z = as.numeric(scale(xg_p90)),
    xa_z = as.numeric(scale(xa_p90)),
    prog_pass_z = as.numeric(scale(progressive_passes_p90)),
    dribble_z = as.numeric(scale(successful_dribbles_p90)),
    speed_z = as.numeric(scale(top_speed_kmh)),
    tackle_z = as.numeric(scale(tackles_won_p90))
  ) %>%
  ungroup() %>%
  mutate(
    composite_score = case_when(
      position == "Forward" ~ 0.4 * xg_z + 0.2 * xa_z + 0.2 * dribble_z + 0.2 * speed_z,
      position == "Midfielder" ~ 0.2 * xg_z + 0.3 * xa_z + 0.3 * prog_pass_z + 0.2 * dribble_z,
      position == "Defender" ~ 0.3 * tackle_z + 0.2 * speed_z + 0.3 * prog_pass_z + 0.2 * dribble_z,
      TRUE ~ 0.25 * (xg_z + xa_z + prog_pass_z + tackle_z)
    )
  )
# Fit position-specific GAM models for expected development
# One smooth-term GAM per position, predicted over a 0.1-year age grid with
# normal-approximation 50% and 95% bands from the prediction standard errors.
# Returns one tibble stacking all positions.
fit_development_curves <- function(data) {
  age_grid <- seq(16, 23, by = 0.1)
  map_dfr(unique(data$position), function(pos) {
    pos_model <- gam(composite_score ~ s(age, k = 5),
                     data = filter(data, position == pos))
    pred <- predict(pos_model, newdata = tibble(age = age_grid), se.fit = TRUE)
    tibble(
      position = pos,
      age = age_grid,
      expected_score = pred$fit,
      se = pred$se.fit,
      lower_95 = expected_score - 1.96 * se,
      upper_95 = expected_score + 1.96 * se,
      lower_50 = expected_score - 0.67 * se,
      upper_50 = expected_score + 0.67 * se
    )
  })
}
# Expected curve + bands for every position in the data
development_curves <- fit_development_curves(youth_tracking)
# Calculate development velocity (rate of improvement)
# Per-player rate of change of composite_score per year of age.
calculate_development_velocity <- function(data) {
data %>%
group_by(player_id) %>%
# arrange() here sorts the whole frame (not per group), but lag() in the
# grouped mutate still sees each group's rows in age order, so velocities
# are computed season-over-season as intended.
arrange(age) %>%
mutate(
score_change = composite_score - lag(composite_score),
age_change = age - lag(age),
development_velocity = score_change / age_change
) %>%
# Each player's first row has no predecessor (NA velocity); dropping it
# also removes single-season players from the summary entirely.
filter(!is.na(development_velocity)) %>%
summarise(
position = first(position),
current_age = max(age),
current_score = last(composite_score),
avg_velocity = mean(development_velocity),
recent_velocity = last(development_velocity),
# n() counts season-to-season transitions, so seasons = n() + 1
n_seasons = n() + 1,
.groups = "drop"
)
}
player_velocities <- calculate_development_velocity(youth_tracking)
# Compare players to expected development
# Bug fix: the original joined on raw floating-point age, which almost never
# matched the 0.1-step curve grid — expected_score came back NA for nearly
# every row and case_when then labelled everyone "Significantly Below".
# Ages are now matched on an integer tenth-of-a-year key, which also avoids
# float-equality pitfalls in the join itself.
compare_to_expected <- function(data, curves) {
  curve_lookup <- curves %>%
    select(position, age, expected_score, lower_95, upper_95) %>%
    mutate(age_key = as.integer(round(age * 10))) %>%
    select(-age) %>%
    # One row per (position, key) so the join cannot fan out
    distinct(position, age_key, .keep_all = TRUE)
  data %>%
    mutate(age_key = as.integer(round(age * 10))) %>%
    left_join(curve_lookup, by = c("position", "age_key")) %>%
    select(-age_key) %>%
    mutate(
      deviation = composite_score - expected_score,
      development_status = case_when(
        composite_score > upper_95 ~ "Significantly Above",
        composite_score > expected_score ~ "Above Average",
        composite_score >= lower_95 ~ "Below Average",
        TRUE ~ "Significantly Below"
      )
    )
}
# Get latest observation for each player
# NOTE(review): this inlines the same comparison logic as compare_to_expected()
# above, but rounds ages first; joining floats by equality can still miss
# matches when round() and seq() disagree in the last bit — consider an
# integer tenth-of-a-year key instead.
latest_status <- youth_tracking %>%
group_by(player_id) %>%
filter(age == max(age)) %>%
ungroup() %>%
mutate(age = round(age, 1)) %>%
left_join(
development_curves %>% mutate(age = round(age, 1)),
by = c("position", "age")
) %>%
mutate(
deviation = composite_score - expected_score,
development_status = case_when(
composite_score > upper_95 ~ "Significantly Above",
composite_score > expected_score ~ "Above Average",
composite_score >= lower_95 ~ "Below Average",
TRUE ~ "Significantly Below"
)
) %>%
left_join(player_velocities %>% select(player_id, avg_velocity, recent_velocity), by = "player_id")
# Generate development report (console output only; returns nothing useful)
# Fixes two defects in the original: (1) it reached into the global
# latest_status instead of using its own arguments, and (2) it looked up the
# expected score with an exact float equality against the 0.1-step age grid,
# which usually matched nothing and printed an empty value.
generate_development_report <- function(player_id, data, curves, velocities) {
  player <- data %>% filter(player_id == !!player_id) %>% arrange(age)
  velocity <- velocities %>% filter(player_id == !!player_id)
  pos_curve <- curves %>% filter(position == player$position[1])
  current_age <- max(player$age)
  current_score <- tail(player$composite_score, 1)
  # Nearest grid point instead of exact floating-point equality
  grid_idx <- which.min(abs(pos_curve$age - current_age))
  expected <- pos_curve$expected_score[grid_idx]
  # Status derived from the supplied curves, not from a global table
  status <- case_when(
    current_score > pos_curve$upper_95[grid_idx] ~ "Significantly Above",
    current_score > expected ~ "Above Average",
    current_score >= pos_curve$lower_95[grid_idx] ~ "Below Average",
    TRUE ~ "Significantly Below"
  )
  cat("=" , rep("=", 50), "\n", sep = "")
  cat("YOUTH DEVELOPMENT REPORT\n")
  cat("Player:", player_id, "| Position:", player$position[1], "\n")
  cat("=" , rep("=", 50), "\n", sep = "")
  cat("\nCurrent Status:\n")
  cat(" Age:", round(current_age, 1), "\n")
  cat(" Composite Score:", round(current_score, 2), "\n")
  cat(" Expected Score:", round(expected, 2), "\n")
  cat(" Status:", status, "\n")
  cat("\nDevelopment Velocity:\n")
  cat(" Average:", round(velocity$avg_velocity, 3), "per year\n")
  cat(" Recent:", round(velocity$recent_velocity, 3), "per year\n")
  cat("\nKey Metrics (latest season):\n")
  cat(" xG/90:", round(tail(player$xg_p90, 1), 3), "\n")
  cat(" xA/90:", round(tail(player$xa_p90, 1), 3), "\n")
  cat(" Prog Passes/90:", round(tail(player$progressive_passes_p90, 1), 2), "\n")
}
# Visualize development curves with players
# Layered plot: ribbon = 95% CI, coloured line = expected curve, thin lines =
# a random sample of 20 individual player trajectories; faceted by position.
ggplot() +
# Confidence bands
geom_ribbon(data = development_curves,
aes(x = age, ymin = lower_95, ymax = upper_95, fill = position),
alpha = 0.2) +
# Expected curve
geom_line(data = development_curves,
aes(x = age, y = expected_score, color = position),
size = 1.2) +
# Individual player trajectories (sample)
geom_line(data = youth_tracking %>% filter(player_id %in% sample(unique(player_id), 20)),
aes(x = age, y = composite_score, group = player_id),
alpha = 0.3, size = 0.5) +
facet_wrap(~position) +
labs(
title = "Youth Development Curves by Position",
subtitle = "Expected development (line) with 95% CI and individual trajectories",
x = "Age", y = "Composite Development Score"
) +
theme_minimal() +
theme(legend.position = "none")
# Top prospects report
cat("\n=== TOP PROSPECTS (Ahead of Expected Development) ===\n")
# Bug fix: latest_status has no current_age column (its `age` column already
# holds each player's latest, rounded age — current_age only exists in
# player_velocities, which was not joined in), so select(age = current_age)
# raised an error. Select the existing age column instead.
latest_status %>%
  filter(development_status == "Significantly Above") %>%
  arrange(desc(deviation)) %>%
  select(player_id, position, age, composite_score,
         expected_score, deviation, avg_velocity) %>%
  head(10) %>%
  print()Exercise 21.2: Talent Prediction Model with Relative Age Adjustment
Task: Build a talent prediction model that accounts for the relative age effect (birth quarter bias) and predicts which youth players will reach professional level within 5 years.
Requirements:
- Include birth quarter as a feature and analyze its impact
- Create age-adjusted percentile rankings for fair comparison
- Train a classification model (Random Forest or Gradient Boosting)
- Evaluate model with ROC-AUC and calibration
- Generate "Pro Potential Score" for current academy players
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
# Simulate historical youth player data with outcomes
# Q1-born players receive a "birth_advantage" (relative age effect) that
# inflates their observed metrics. NOTE(review): RNG call order is
# load-bearing for reproducibility with seed 42.
np.random.seed(42)
n_historical = 500
# Birth quarter distribution (Q1 overrepresented due to RAE)
birth_quarters = np.random.choice([1, 2, 3, 4], n_historical, p=[0.35, 0.28, 0.22, 0.15])
historical_data = []
for i in range(n_historical):
bq = birth_quarters[i]
age = np.random.uniform(16, 18)
position = np.random.choice(['Forward', 'Midfielder', 'Defender'], p=[0.3, 0.4, 0.3])
# Birth quarter advantage (Q1 players appear better due to physical maturity)
birth_advantage = (5 - bq) * 0.1
# Technical metrics
xg_p90 = max(0, np.random.normal(0.15 + birth_advantage * 0.02, 0.08))
xa_p90 = max(0, np.random.normal(0.08 + birth_advantage * 0.015, 0.05))
progressive_passes_p90 = max(0, np.random.normal(3.5 + birth_advantage * 0.3, 1.2))
dribbles_p90 = max(0, np.random.normal(1.5 + birth_advantage * 0.2, 0.6))
# Physical metrics (more affected by RAE)
top_speed = 29 + birth_advantage * 1.5 + np.random.normal(0, 1.5)
height = 175 + birth_advantage * 3 + np.random.normal(0, 5)
# Developmental indicators
improvement_rate = np.random.normal(0.3, 0.15) - (bq - 2.5) * 0.05 # Q4 develop faster
intl_caps = max(0, np.random.poisson(2 + birth_advantage * 0.5))
minutes = max(500, np.random.normal(1500 - (bq - 1) * 100, 400))
historical_data.append({
'player_id': f'H{i:04d}',
'birth_quarter': bq,
'age': age,
'position': position,
'xg_p90': xg_p90,
'xa_p90': xa_p90,
'progressive_passes_p90': progressive_passes_p90,
'dribbles_p90': dribbles_p90,
'top_speed_kmh': top_speed,
'height_cm': height,
'improvement_rate': improvement_rate,
'international_u17_caps': intl_caps,
'minutes_played': minutes
})
df = pd.DataFrame(historical_data)
# Create outcome based on "true talent" (accounting for late bloomer potential)
from scipy.stats import zscore
# Latent talent blends standardized metrics; the (5 - birth_quarter) term
# gives later-born players a "late bloomer" bonus. The top quartile of
# latent talent is labelled as having reached pro level.
df['true_talent'] = (
0.3 * zscore(df['xg_p90']) +
0.25 * zscore(df['improvement_rate']) +
0.2 * zscore(df['progressive_passes_p90']) +
0.15 * zscore(df['top_speed_kmh']) +
0.1 * (5 - df['birth_quarter']) * 0.3 + # Q4 late bloomer bonus
np.random.normal(0, 0.3, n_historical)
)
df['reached_pro'] = (df['true_talent'] > df['true_talent'].quantile(0.75)).astype(int)
# Calculate age-adjusted percentiles
def calculate_age_adjusted_percentiles(data):
    """Rank each metric within single-year age bands (0-100 percentile).

    Adds an 'age_group' column (floor of age) and one '<metric>_pct'
    column per metric, so players are compared only against peers of the
    same age — a fairness correction for the relative age effect.
    Returns a modified copy; the input frame is untouched.
    """
    out = data.copy()
    out['age_group'] = np.floor(out['age'])
    metric_cols = ['xg_p90', 'xa_p90', 'progressive_passes_p90', 'dribbles_p90',
                   'top_speed_kmh', 'height_cm']
    for col in metric_cols:
        out[f'{col}_pct'] = out.groupby('age_group')[col].rank(pct=True) * 100
    return out
df = calculate_age_adjusted_percentiles(df)
# Feature engineering
# Late bloomer flag: born Q3/Q4 but improving faster than the 0.3 threshold
df['late_bloomer_score'] = ((df['birth_quarter'] >= 3) &
(df['improvement_rate'] > 0.3)).astype(int)
# Prepare features
features = ['xg_p90_pct', 'xa_p90_pct', 'progressive_passes_p90_pct',
'dribbles_p90_pct', 'top_speed_kmh_pct', 'improvement_rate',
'international_u17_caps', 'minutes_played', 'birth_quarter',
'late_bloomer_score']
X = df[features]
y = df['reached_pro']
# Train/test split
# stratify=y keeps the ~25% positive rate identical in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
random_state=42, stratify=y)
# Train Gradient Boosting model
model = GradientBoostingClassifier(
n_estimators=200,
max_depth=4,
learning_rate=0.1,
random_state=42
)
model.fit(X_train, y_train)
# Evaluate
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)
print("Model Performance:")
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.3f}")
# Cross-validation
# NOTE(review): cross_val_score runs on the full X, y and therefore re-uses
# rows from the holdout above — the CV score is not independent of the
# test-set evaluation.
cv_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print(f"CV ROC-AUC: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Pro', 'Pro']))
# Feature importance
importance_df = pd.DataFrame({
'feature': features,
'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Feature importance plot
axes[0].barh(importance_df['feature'], importance_df['importance'], color='forestgreen')
axes[0].set_xlabel('Importance')
axes[0].set_title('Feature Importance for Pro Potential Prediction')
axes[0].invert_yaxis()
# Birth quarter analysis
# Pro conversion rate, playing time, and improvement rate per birth quarter
bq_analysis = df.groupby('birth_quarter').agg({
'player_id': 'count',
'reached_pro': 'mean',
'minutes_played': 'mean',
'improvement_rate': 'mean'
}).reset_index()
bq_analysis.columns = ['Birth Quarter', 'N Players', 'Pro Rate', 'Avg Minutes', 'Avg Improvement']
axes[1].bar(bq_analysis['Birth Quarter'], bq_analysis['Pro Rate'] * 100, color='steelblue')
axes[1].set_xlabel('Birth Quarter')
axes[1].set_ylabel('Pro Conversion Rate (%)')
axes[1].set_title('Birth Quarter Effect on Pro Conversion')
axes[1].set_xticks([1, 2, 3, 4])
axes[1].set_xticklabels(['Q1 (Jan-Mar)', 'Q2 (Apr-Jun)', 'Q3 (Jul-Sep)', 'Q4 (Oct-Dec)'])
plt.tight_layout()
plt.show()
print("\nBirth Quarter Analysis:")
print(bq_analysis.to_string(index=False))
# Current academy predictions
# Fresh (unlabelled) intake scored with the trained model; RNG draws happen
# row by row, so the order of the per-row assignments below is load-bearing.
current_academy = pd.DataFrame({
'player_id': [f'CA{i:03d}' for i in range(50)],
'birth_quarter': np.random.choice([1, 2, 3, 4], 50, p=[0.32, 0.28, 0.23, 0.17]),
'age': np.random.uniform(16, 17.5, 50),
'position': np.random.choice(['Forward', 'Midfielder', 'Defender'], 50)
})
# Generate metrics for current academy
for i in range(len(current_academy)):
bq = current_academy.loc[i, 'birth_quarter']
ba = (5 - bq) * 0.1
# NOTE(review): bq/ba are computed but never used below — unlike the
# historical simulation, these metrics are independent of birth quarter.
current_academy.loc[i, 'xg_p90'] = max(0, np.random.normal(0.18, 0.1))
current_academy.loc[i, 'xa_p90'] = max(0, np.random.normal(0.1, 0.06))
current_academy.loc[i, 'progressive_passes_p90'] = max(0, np.random.normal(4, 1.5))
current_academy.loc[i, 'dribbles_p90'] = max(0, np.random.normal(1.8, 0.7))
current_academy.loc[i, 'top_speed_kmh'] = 30 + np.random.normal(0, 1.5)
current_academy.loc[i, 'height_cm'] = 177 + np.random.normal(0, 5)
current_academy.loc[i, 'improvement_rate'] = np.random.normal(0.35, 0.12)
current_academy.loc[i, 'international_u17_caps'] = max(0, np.random.poisson(3))
current_academy.loc[i, 'minutes_played'] = max(800, np.random.normal(1600, 350))
current_academy = calculate_age_adjusted_percentiles(current_academy)
current_academy['late_bloomer_score'] = (
(current_academy['birth_quarter'] >= 3) &
(current_academy['improvement_rate'] > 0.35)
).astype(int)
# Generate Pro Potential Scores
current_academy['pro_potential'] = model.predict_proba(current_academy[features])[:, 1] * 100
print("\n=== CURRENT ACADEMY - TOP PROSPECTS ===")
top_prospects = current_academy.nlargest(10, 'pro_potential')[
['player_id', 'birth_quarter', 'age', 'position', 'pro_potential',
'improvement_rate', 'late_bloomer_score']
]
top_prospects['pro_potential'] = top_prospects['pro_potential'].apply(lambda x: f'{x:.1f}%')
print(top_prospects.to_string(index=False))
print("\n=== HIGH-POTENTIAL LATE BLOOMERS (Q3/Q4 with high improvement) ===")
late_bloomers = current_academy[
(current_academy['birth_quarter'] >= 3) &
(current_academy['improvement_rate'] > current_academy['improvement_rate'].quantile(0.7))
].nlargest(5, 'pro_potential')[['player_id', 'birth_quarter', 'pro_potential', 'improvement_rate']]
print(late_bloomers.to_string(index=False))library(tidyverse)
library(caret)
library(randomForest)
library(pROC)
# Simulate historical youth player data with outcomes
# Q1-born players get a "birth_advantage" (relative age effect) that inflates
# their observed metrics; RNG call order is load-bearing for seed 42.
set.seed(42)
n_historical <- 500
historical_players <- tibble(
player_id = paste0("H", sprintf("%04d", 1:n_historical)),
# Birth quarter (Q1 = Jan-Mar, Q4 = Oct-Dec)
birth_quarter = sample(1:4, n_historical, replace = TRUE,
prob = c(0.35, 0.28, 0.22, 0.15)), # Q1 overrepresented
age_at_observation = runif(n_historical, 16, 18),
position = sample(c("Forward", "Midfielder", "Defender"), n_historical,
replace = TRUE, prob = c(0.3, 0.4, 0.3))
) %>%
mutate(
# Raw metrics with birth quarter advantage for Q1
birth_advantage = (5 - birth_quarter) * 0.1,
# Technical metrics
xg_p90_raw = pmax(0, rnorm(n_historical, 0.15 + birth_advantage * 0.02, 0.08)),
xa_p90_raw = pmax(0, rnorm(n_historical, 0.08 + birth_advantage * 0.015, 0.05)),
progressive_passes_p90_raw = pmax(0, rnorm(n_historical, 3.5 + birth_advantage * 0.3, 1.2)),
dribbles_p90_raw = pmax(0, rnorm(n_historical, 1.5 + birth_advantage * 0.2, 0.6)),
# Physical metrics (more affected by relative age)
top_speed_kmh = 29 + birth_advantage * 1.5 + rnorm(n_historical, 0, 1.5),
height_cm = 175 + birth_advantage * 3 + rnorm(n_historical, 0, 5),
# Developmental indicators
improvement_rate = rnorm(n_historical, 0.3, 0.15) - (birth_quarter - 2.5) * 0.05, # Q4 develop faster
international_u17_caps = pmax(0, rpois(n_historical, 2 + birth_advantage * 0.5)),
minutes_played = pmax(500, rnorm(n_historical, 1500 - (birth_quarter - 1) * 100, 400))
)
# Create outcome: reached professional level within 5 years
# True talent + some luck, with late bloomers having a chance
# Top quartile of latent talent -> reached_pro = 1; note the resulting
# factor has levels "0"/"1" (only the train/test copies are relabelled
# to "No"/"Yes" later for caret).
historical_players <- historical_players %>%
mutate(
true_talent = 0.3 * scale(xg_p90_raw)[,1] +
0.25 * scale(improvement_rate)[,1] +
0.2 * scale(progressive_passes_p90_raw)[,1] +
0.15 * scale(top_speed_kmh)[,1] +
0.1 * (5 - birth_quarter) * 0.3 + # Q4 bonus (late bloomer potential)
rnorm(n_historical, 0, 0.3),
reached_pro = ifelse(true_talent > quantile(true_talent, 0.75), 1, 0) %>% as.factor()
)
# Calculate age-adjusted percentiles
# Rank each raw metric within single-year age bands so that players are
# compared only against peers of the same age (relative-age fairness).
# Adds age_group plus one "<metric>_pct" column (0-100) per metric.
calculate_age_adjusted_percentiles <- function(data) {
  pct_metrics <- c("xg_p90_raw", "xa_p90_raw", "progressive_passes_p90_raw",
                   "dribbles_p90_raw", "top_speed_kmh", "height_cm")
  data %>%
    mutate(age_group = floor(age_at_observation)) %>%
    group_by(age_group) %>%
    mutate(across(all_of(pct_metrics),
                  ~ percent_rank(.x) * 100,
                  .names = "{.col}_pct")) %>%
    ungroup()
}
historical_players <- calculate_age_adjusted_percentiles(historical_players)
# Feature engineering for model
model_data <- historical_players %>%
mutate(
# Birth quarter as factor
# NOTE(review): birth_quarter_f, technical_pct and physical_pct are built
# here but do not appear in the `features` vector below, so the model never
# sees them — confirm whether they were meant to be included.
birth_quarter_f = as.factor(birth_quarter),
# Combined percentile scores
technical_pct = (xg_p90_raw_pct + xa_p90_raw_pct + progressive_passes_p90_raw_pct) / 3,
physical_pct = (top_speed_kmh_pct + height_cm_pct) / 2,
# Late bloomer indicator (Q4 with high improvement rate)
late_bloomer_score = ifelse(birth_quarter >= 3 & improvement_rate > 0.3, 1, 0)
)
# Select features
features <- c("xg_p90_raw_pct", "xa_p90_raw_pct", "progressive_passes_p90_raw_pct",
"dribbles_p90_raw_pct", "top_speed_kmh_pct", "improvement_rate",
"international_u17_caps", "minutes_played", "birth_quarter",
"late_bloomer_score")
# Train/test split
set.seed(42)
train_idx <- createDataPartition(model_data$reached_pro, p = 0.7, list = FALSE)
train_data <- model_data[train_idx, ]
test_data <- model_data[-train_idx, ]
# Train Random Forest with cross-validation
# twoClassSummary reports ROC/Sens/Spec and requires classProbs = TRUE
ctrl <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
# Relevel outcome for caret
# caret needs syntactically valid class labels, hence "No"/"Yes"; note this
# relabels only the train/test copies — model_data keeps levels "0"/"1".
train_data$reached_pro <- factor(train_data$reached_pro, levels = c("0", "1"), labels = c("No", "Yes"))
test_data$reached_pro <- factor(test_data$reached_pro, levels = c("0", "1"), labels = c("No", "Yes"))
rf_model <- train(
reached_pro ~ .,
data = train_data[, c(features, "reached_pro")],
method = "rf",
trControl = ctrl,
metric = "ROC",
ntree = 500
)
# Predictions
# Class probabilities for ROC, hard predictions for the confusion matrix
test_probs <- predict(rf_model, test_data, type = "prob")[, "Yes"]
test_preds <- predict(rf_model, test_data)
# Evaluate model
roc_result <- roc(test_data$reached_pro, test_probs)
cat("Model Performance:\n")
cat("ROC-AUC:", round(auc(roc_result), 3), "\n")
# Confusion matrix
cm <- confusionMatrix(test_preds, test_data$reached_pro, positive = "Yes")
cat("Sensitivity:", round(cm$byClass["Sensitivity"], 3), "\n")
cat("Specificity:", round(cm$byClass["Specificity"], 3), "\n")
# Variable importance
# caret's scaled importance (column "Overall"), largest first
importance_df <- varImp(rf_model)$importance %>%
rownames_to_column("feature") %>%
arrange(desc(Overall))
ggplot(importance_df, aes(x = reorder(feature, Overall), y = Overall)) +
geom_bar(stat = "identity", fill = "forestgreen") +
coord_flip() +
labs(title = "Feature Importance for Pro Potential Prediction",
x = "", y = "Importance") +
theme_minimal()
# Analyze birth quarter effect (relative age effect diagnostics)
# Bug fix: model_data$reached_pro has factor levels "0"/"1" — only the
# train/test copies were relabelled to "No"/"Yes" — so the original
# comparison against "Yes" was always FALSE and reported a 0% pro rate
# for every quarter.
birth_quarter_analysis <- model_data %>%
  group_by(birth_quarter) %>%
  summarise(
    n_players = n(),
    pro_rate = mean(reached_pro == "1") * 100,
    avg_minutes = mean(minutes_played),
    avg_improvement = mean(improvement_rate),
    .groups = "drop"
  )
cat("\nBirth Quarter Analysis:\n")
print(birth_quarter_analysis)
# Current academy predictions
# Fresh (unlabelled) intake; RNG call order is load-bearing. Note that unlike
# the historical simulation, these metrics do not use birth_advantage, and
# predict() below requires the same "<metric>_pct" feature columns that
# calculate_age_adjusted_percentiles() appends.
current_academy <- tibble(
player_id = paste0("CA", sprintf("%03d", 1:50)),
birth_quarter = sample(1:4, 50, replace = TRUE, prob = c(0.32, 0.28, 0.23, 0.17)),
age_at_observation = runif(50, 16, 17.5),
position = sample(c("Forward", "Midfielder", "Defender"), 50, replace = TRUE)
) %>%
mutate(
birth_advantage = (5 - birth_quarter) * 0.1,
xg_p90_raw = pmax(0, rnorm(50, 0.18, 0.1)),
xa_p90_raw = pmax(0, rnorm(50, 0.1, 0.06)),
progressive_passes_p90_raw = pmax(0, rnorm(50, 4, 1.5)),
dribbles_p90_raw = pmax(0, rnorm(50, 1.8, 0.7)),
top_speed_kmh = 30 + rnorm(50, 0, 1.5),
height_cm = 177 + rnorm(50, 0, 5),
improvement_rate = rnorm(50, 0.35, 0.12),
international_u17_caps = pmax(0, rpois(50, 3)),
minutes_played = pmax(800, rnorm(50, 1600, 350)),
late_bloomer_score = ifelse(birth_quarter >= 3 & improvement_rate > 0.35, 1, 0)
) %>%
calculate_age_adjusted_percentiles()
# Generate Pro Potential Scores
current_academy$pro_potential <- predict(rf_model, current_academy, type = "prob")[, "Yes"] * 100
# Top prospects
cat("\n=== CURRENT ACADEMY - TOP PROSPECTS ===\n")
# Ten highest model-scored players, with display-friendly rounding
current_academy %>%
arrange(desc(pro_potential)) %>%
select(player_id, birth_quarter, age_at_observation, position,
pro_potential, improvement_rate, late_bloomer_score) %>%
head(10) %>%
mutate(
pro_potential = paste0(round(pro_potential, 1), "%"),
age_at_observation = round(age_at_observation, 1)
) %>%
print()
# Identify high-potential late bloomers
cat("\n=== HIGH-POTENTIAL LATE BLOOMERS (Q3/Q4 with high improvement) ===\n")
# Q3/Q4-born players in the top 30% of improvement, ranked by model score
current_academy %>%
filter(birth_quarter >= 3, improvement_rate > quantile(improvement_rate, 0.7)) %>%
arrange(desc(pro_potential)) %>%
select(player_id, birth_quarter, pro_potential, improvement_rate) %>%
head(5) %>%
print()Exercise 21.3: Comprehensive Loan Decision Optimizer
Task: Build a loan decision system that analyzes historical loan outcomes, identifies optimal loan characteristics, and recommends the best loan destinations for academy players.
Requirements:
- Analyze historical loan data for success factors
- Model relationship between loan characteristics and player development
- Create scoring system for potential loan destinations
- Generate ranked recommendations with expected development impact
- Include playing time probability estimation
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# --- Simulated historical loan dataset (300 loans) -----------------------
# Seeded RNG so the printed results below are reproducible.
np.random.seed(42)
n_loans = 300

historical_loans = pd.DataFrame({
    'loan_id': [f'L{i:04d}' for i in range(n_loans)],
    # Drawn with replacement from a 200-player pool, so some players
    # appear on more than one loan.
    'player_id': [f'P{np.random.randint(1, 201):04d}' for _ in range(n_loans)],
    'player_age': np.random.uniform(18, 22, n_loans),
    'player_position': np.random.choice(['Forward', 'Midfielder', 'Defender'], n_loans, p=[0.3, 0.4, 0.3]),
    # Composite ability score before the loan (arbitrary 40-75 scale)
    'player_level_pre': np.random.uniform(40, 75, n_loans),
    # 1 = top flight ... 4 = fourth tier; most loans go to tiers 2-3
    'loan_league_level': np.random.choice([1, 2, 3, 4], n_loans, p=[0.1, 0.3, 0.4, 0.2]),
    'parent_club_level': np.random.choice([1, 2], n_loans, p=[0.7, 0.3]),
    'loan_duration_months': np.random.choice([6, 12, 18], n_loans, p=[0.3, 0.5, 0.2]),
    # Club environment quality and match intensity (0-100 style scales)
    'club_development_rating': np.random.uniform(40, 90, n_loans),
    'competition_intensity': np.random.uniform(50, 95, n_loans)
})
# Positive diff = the loan league sits below the parent club's level.
historical_loans['league_level_diff'] = (
    historical_loans['loan_league_level'] - historical_loans['parent_club_level']
)
# Roughly 4 matches per month of loan
historical_loans['games_available'] = (historical_loans['loan_duration_months'] * 4).round()

# --- Simulate playing time -----------------------------------------------
# Appearance share rises when dropping further down the pyramid and with
# pre-loan ability; Gaussian noise added on top, floored at zero.
expected_apps = np.minimum(
    historical_loans['games_available'],
    historical_loans['games_available'] * (
        0.3 + 0.15 * historical_loans['league_level_diff'] +
        (historical_loans['player_level_pre'] - 50) / 100
    )
)
historical_loans['appearances'] = np.maximum(0, expected_apps + np.random.normal(0, 5, n_loans)).round()
# Starts are a random 50-90% of appearances, capped at appearances
historical_loans['starts'] = np.minimum(
    historical_loans['appearances'],
    historical_loans['appearances'] * np.random.uniform(0.5, 0.9, n_loans)
).round()
# Starters play 75-90 minutes, substitutes 15-30
historical_loans['minutes_total'] = (
    historical_loans['starts'] * np.random.uniform(75, 90, n_loans) +
    (historical_loans['appearances'] - historical_loans['starts']) * np.random.uniform(15, 30, n_loans)
)
# np.maximum(1, ...) guards division by zero for unused players
historical_loans['minutes_per_game'] = (
    historical_loans['minutes_total'] / np.maximum(1, historical_loans['appearances'])
)
historical_loans['starting_pct'] = (
    historical_loans['starts'] / np.maximum(1, historical_loans['appearances'])
)

# --- Development outcome -------------------------------------------------
# Weighted blend: playing time dominates; the league-gap term peaks when
# the loan sits ~1.5 tiers below the parent club; plus club environment,
# competition intensity, loan length, and random noise.
historical_loans['development'] = (
    0.3 * (historical_loans['minutes_per_game'] / 90) * 10 +
    0.2 * np.maximum(0, 3 - np.abs(historical_loans['league_level_diff'] - 1.5)) +
    0.15 * (historical_loans['club_development_rating'] / 100) * 5 +
    0.15 * (historical_loans['competition_intensity'] / 100) * 5 +
    0.1 * (historical_loans['loan_duration_months'] / 12) * 5 +
    np.random.normal(0, 2, n_loans)
)
historical_loans['player_level_post'] = historical_loans['player_level_pre'] + historical_loans['development']
# Binary label: above-median development counts as a successful loan
historical_loans['development_success'] = (
    historical_loans['development'] > historical_loans['development'].median()
).astype(int)
# --- Loan success factor analysis ----------------------------------------
print("=== LOAN SUCCESS FACTOR ANALYSIS ===\n")

# Bucket minutes per game into coarse bands for a readable summary.
# The column is attached to historical_loans so later code can reuse it.
historical_loans['minutes_group'] = pd.cut(
    historical_loans['minutes_per_game'],
    bins=[0, 30, 50, 70, 90],
    labels=['<30', '30-50', '50-70', '70+'],
)

# 1. How does development vary with playing time?
playing_time_analysis = (
    historical_loans
    .groupby('minutes_group')
    .agg(**{
        'N Loans': ('loan_id', 'count'),
        'Avg Development': ('development', 'mean'),
        'Success Rate': ('development_success', 'mean'),
    })
    .reset_index()
    .rename(columns={'minutes_group': 'Minutes Group'})
)
playing_time_analysis['Success Rate'] = 100 * playing_time_analysis['Success Rate']
print("1. Development by Playing Time:")
print(playing_time_analysis.to_string(index=False))

# 2. Does the league-level gap to the parent club matter?
level_analysis = (
    historical_loans
    .groupby('league_level_diff')
    .agg(**{
        'N Loans': ('loan_id', 'count'),
        'Avg Dev': ('development', 'mean'),
        'Avg Minutes': ('minutes_per_game', 'mean'),
        'Success Rate': ('development_success', 'mean'),
    })
    .reset_index()
    .rename(columns={'league_level_diff': 'Level Diff'})
)
level_analysis['Success Rate'] = 100 * level_analysis['Success Rate']
print("\n2. Development by League Level Difference:")
print(level_analysis.to_string(index=False))
# --- Predictive model: loan characteristics -> development ---------------
features = [
    'player_age', 'player_level_pre', 'league_level_diff',
    'loan_duration_months', 'club_development_rating', 'competition_intensity',
]
X = historical_loans[features]
y = historical_loans['development']

# Fixed random_state keeps the split and forest reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
loan_model = RandomForestRegressor(n_estimators=200, random_state=42)
loan_model.fit(X_train, y_train)
print(f"\n3. Model R² Score: {loan_model.score(X_test, y_test):.3f}")

# Which loan characteristics drive predicted development?
importance_df = (
    pd.DataFrame({
        'feature': features,
        'importance': loan_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
print("\n4. Feature Importance:")
print(importance_df.to_string(index=False))
def generate_loan_recommendations(player, available_clubs, model):
    """Rank candidate loan clubs for a player.

    Parameters
    ----------
    player : dict
        Must contain 'age', 'player_level', and 'parent_club_level'.
    available_clubs : pd.DataFrame
        One row per club with columns 'league_level', 'avg_player_level',
        'development_rating', 'competition_intensity', 'loan_duration'.
    model : object
        Fitted regressor whose ``predict(X)`` accepts the feature columns
        assembled below (same layout used in training).

    Returns
    -------
    pd.DataFrame
        Copy of ``available_clubs`` augmented with expected playing time,
        predicted development, and a composite ``recommendation_score``,
        sorted best first.
    """

    def _zscore(values):
        # Population (ddof=0) z-score, matching scipy.stats.zscore.
        # Guard zero-variance input: scipy would return NaN for every
        # club and poison the composite score; neutral zeros are safer.
        arr = np.asarray(values, dtype=float)
        std = arr.std()
        if std == 0:
            return np.zeros_like(arr)
        return (arr - arr.mean()) / std

    def estimate_playing_time(player_level, club_avg_level, league_level_diff):
        # Heuristic: more minutes further down the pyramid, adjusted for
        # how the player compares with the club's average squad level.
        base_prob = 0.3 + 0.12 * league_level_diff
        quality_adj = (player_level - club_avg_level) / 50
        return np.clip(base_prob + quality_adj, 0.1, 0.95)

    recommendations = available_clubs.copy()
    # Positive diff = destination sits below the parent club's level
    recommendations['league_level_diff'] = (
        recommendations['league_level'] - player['parent_club_level']
    )

    # Estimate playing-time share (vectorized; every op broadcasts) and
    # convert to minutes -- 85 approximates a full start.
    recommendations['expected_playing_time_pct'] = estimate_playing_time(
        player['player_level'],
        recommendations['avg_player_level'],
        recommendations['league_level_diff'],
    )
    recommendations['expected_minutes_per_game'] = (
        recommendations['expected_playing_time_pct'] * 85
    )

    # Predict development using the training feature layout.
    pred_features = pd.DataFrame({
        'player_age': [player['age']] * len(recommendations),
        'player_level_pre': [player['player_level']] * len(recommendations),
        'league_level_diff': recommendations['league_level_diff'],
        'loan_duration_months': recommendations['loan_duration'],
        'club_development_rating': recommendations['development_rating'],
        'competition_intensity': recommendations['competition_intensity']
    })
    recommendations['predicted_development'] = model.predict(pred_features)

    # Composite score: z-score each criterion so the fixed weights are
    # comparable across different units.
    recommendations['recommendation_score'] = (
        0.4 * _zscore(recommendations['predicted_development']) +
        0.3 * _zscore(recommendations['expected_playing_time_pct']) +
        0.2 * _zscore(recommendations['development_rating']) +
        0.1 * _zscore(recommendations['competition_intensity'])
    )
    return recommendations.sort_values('recommendation_score', ascending=False)
# --- Candidate loan destinations (illustrative data) ---------------------
available_clubs = pd.DataFrame({
    'club_id': [f'C{i:03d}' for i in range(15)],
    'club_name': ['Championship United', 'League One FC', 'Eredivisie Stars',
                  'Belgian Pro Club', 'Serie B Calcio', '2. Bundesliga Team',
                  'Ligue 2 Olympique', 'Segunda Division', 'Scottish Premier',
                  'Championship Town', 'League One City', 'Dutch First Div',
                  'Portuguese Segunda', 'Austrian Bundesliga', 'Swiss Super'],
    # 2 = second tier, 3 = third tier (the parent club is level 1)
    'league_level': [2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 2],
    'avg_player_level': [62, 52, 58, 55, 60, 61, 58, 59, 54, 60, 50, 53, 51, 56, 57],
    'development_rating': [75, 70, 82, 78, 72, 85, 74, 76, 68, 72, 65, 70, 66, 80, 77],
    'competition_intensity': [85, 75, 78, 72, 82, 84, 76, 83, 70, 82, 68, 72, 65, 75, 73],
    'loan_duration': [12, 12, 12, 6, 12, 12, 12, 12, 6, 12, 12, 6, 12, 12, 6]
})

# Example academy player seeking a loan move
player = {
    'player_id': 'YP001',
    'name': 'Marcus Sterling Jr',
    'age': 19,
    'position': 'Forward',
    'player_level': 58,
    'parent_club_level': 1  # parent club plays in the top division
}

# Rank the destinations and show the top 10
recommendations = generate_loan_recommendations(player, available_clubs, loan_model)
print(f"\n=== LOAN RECOMMENDATIONS FOR {player['name']} ===")
print(f"Age: {player['age']} | Position: {player['position']}")
print(f"Current Level: {player['player_level']}\n")
display_cols = ['club_name', 'league_level', 'expected_minutes_per_game',
                'predicted_development', 'recommendation_score']
print(recommendations[display_cols].head(10).round(2).to_string(index=False))
# --- Visualization -------------------------------------------------------
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: playing time vs development, one colour per league-level gap
# (iterating unique() preserves order of first appearance in the data).
for level_diff in historical_loans['league_level_diff'].unique():
    mask = historical_loans['league_level_diff'] == level_diff
    axes[0].scatter(
        historical_loans.loc[mask, 'minutes_per_game'],
        historical_loans.loc[mask, 'development'],
        alpha=0.5, label=f'Level Diff: {level_diff}'
    )
axes[0].set_xlabel('Minutes per Game')
axes[0].set_ylabel('Development Score')
axes[0].set_title('Playing Time vs Development by League Level')
axes[0].legend()

# Right: top-10 recommendation scores, best club at the top
top_10 = recommendations.head(10)
axes[1].barh(top_10['club_name'], top_10['recommendation_score'], color='forestgreen')
axes[1].set_xlabel('Recommendation Score')
axes[1].set_title(f'Top Loan Destinations for {player["name"]}')
axes[1].invert_yaxis()  # barh plots bottom-up; flip so rank 1 is on top

plt.tight_layout()
plt.show()

library(tidyverse)
library(randomForest)
# ---- Simulated historical loan data ----
# Seeded so the printed results below are reproducible.
set.seed(42)
n_loans <- 300

historical_loans <- tibble(
  loan_id = paste0("L", sprintf("%04d", 1:n_loans)),
  # Sampled with replacement: some players appear on multiple loans
  player_id = paste0("P", sprintf("%04d", sample(1:200, n_loans, replace = TRUE))),
  player_age = runif(n_loans, 18, 22),
  player_position = sample(c("Forward", "Midfielder", "Defender"), n_loans,
                           replace = TRUE, prob = c(0.3, 0.4, 0.3)),
  player_level_pre = runif(n_loans, 40, 75), # Composite score before loan
  # Loan characteristics: 1 = top flight ... 4 = fourth tier
  loan_league_level = sample(1:4, n_loans, replace = TRUE, prob = c(0.1, 0.3, 0.4, 0.2)),
  parent_club_level = sample(1:2, n_loans, replace = TRUE, prob = c(0.7, 0.3)),
  # Positive diff = loan club sits below the parent club
  league_level_diff = loan_league_level - parent_club_level,
  loan_duration_months = sample(c(6, 12, 18), n_loans, replace = TRUE, prob = c(0.3, 0.5, 0.2)),
  # Outcome variables (placeholders; filled in by the simulation below)
  games_available = round(loan_duration_months * 4), # ~4 matches/month
  appearances = NA,
  starts = NA,
  minutes_total = NA,
  # Club characteristics (0-100 style scales)
  club_development_rating = runif(n_loans, 40, 90),
  competition_intensity = runif(n_loans, 50, 95)
)
# ---- Simulate playing time ----
# Appearance share rises with the league-level gap and with pre-loan
# ability; Gaussian noise added, floored at zero appearances.
historical_loans <- historical_loans %>%
  mutate(
    # Players get more playing time at lower league levels
    expected_apps = pmin(games_available,
                         games_available * (0.3 + 0.15 * league_level_diff +
                                              (player_level_pre - 50) / 100)),
    appearances = round(pmax(0, expected_apps + rnorm(n_loans, 0, 5))),
    # Starts: a random 50-90% of appearances, never exceeding them
    starts = round(pmin(appearances, appearances * runif(n_loans, 0.5, 0.9))),
    # Starters play 75-90 minutes, substitutes 15-30
    minutes_total = starts * runif(n_loans, 75, 90) + (appearances - starts) * runif(n_loans, 15, 30),
    # pmax(1, ...) guards division by zero for unused players
    minutes_per_game = minutes_total / pmax(1, appearances),
    starting_pct = starts / pmax(1, appearances)
  )

# ---- Development outcome ----
historical_loans <- historical_loans %>%
  mutate(
    # Weighted blend: playing time dominates; the league-gap term peaks
    # when the loan sits about 1.5 tiers below the parent club.
    development =
      0.3 * (minutes_per_game / 90) * 10 +              # Playing time is crucial
      0.2 * pmax(0, 3 - abs(league_level_diff - 1.5)) + # Optimal at 1-2 levels below
      0.15 * (club_development_rating / 100) * 5 +      # Club environment
      0.15 * (competition_intensity / 100) * 5 +        # Competitive matches
      0.1 * (loan_duration_months / 12) * 5 +           # Longer loans better
      rnorm(n_loans, 0, 2),                             # Random variation
    player_level_post = player_level_pre + development,
    # Above-median development counts as a successful loan
    development_success = ifelse(development > median(development), 1, 0)
  )
# ---- Loan success factor analysis ----
cat("=== LOAN SUCCESS FACTOR ANALYSIS ===\n\n")

# Band minutes per game into four readable buckets.
band_minutes <- function(mpg) {
  cut(mpg,
      breaks = c(0, 30, 50, 70, 90),
      labels = c("<30", "30-50", "50-70", "70+"))
}

# 1. Development by playing-time band
playing_time_analysis <- historical_loans %>%
  group_by(minutes_group = band_minutes(minutes_per_game)) %>%
  summarise(
    n_loans = n(),
    avg_development = mean(development),
    success_rate = mean(development_success) * 100,
    .groups = "drop"
  )
cat("1. Development by Playing Time:\n")
print(playing_time_analysis)

# 2. Development by league-level gap to the parent club
level_analysis <- historical_loans %>%
  group_by(league_level_diff) %>%
  summarise(
    n_loans = n(),
    avg_development = mean(development),
    avg_minutes = mean(minutes_per_game),
    success_rate = mean(development_success) * 100,
    .groups = "drop"
  )
cat("\n2. Development by League Level Difference:\n")
print(level_analysis)
# ---- Train prediction model for loan development ----
# Feature set shared with the recommendation function below.
features <- c("player_age", "player_level_pre", "league_level_diff",
              "loan_duration_months", "club_development_rating", "competition_intensity")

# Random forest regression of development on loan characteristics.
# importance = TRUE is required for the %IncMSE column used below.
loan_model <- randomForest(
  development ~ .,
  data = historical_loans[, c(features, "development")],
  ntree = 200,
  importance = TRUE
)

# Variable importance, largest %IncMSE (permutation importance) first
cat("\n3. Feature Importance for Loan Development:\n")
importance(loan_model) %>%
  as.data.frame() %>%
  rownames_to_column("feature") %>%
  arrange(desc(`%IncMSE`)) %>%
  print()
#' Rank candidate loan clubs for an academy player.
#'
#' @param player List with `age`, `player_level`, and `parent_club_level`.
#' @param available_clubs Tibble with one row per club: `league_level`,
#'   `avg_player_level`, `development_rating`, `competition_intensity`,
#'   and `loan_duration`.
#' @param model Fitted model whose `predict()` accepts the training
#'   feature columns assembled below.
#' @param historical_data Currently unused; kept so existing callers keep
#'   working (reserved for destination-specific priors).
#' @return `available_clubs` augmented with expected playing time,
#'   predicted development, and a composite `recommendation_score`,
#'   sorted best first.
generate_loan_recommendations <- function(player, available_clubs, model, historical_data) {
  # Sample-sd z-score (same semantics as scale()), but a zero-variance
  # column gets neutral zeros instead of the NaN that scale() would
  # produce, which would poison every composite score.
  safe_z <- function(x) {
    s <- sd(x)
    if (is.na(s) || s == 0) {
      return(rep(0, length(x)))
    }
    (x - mean(x)) / s
  }

  # Heuristic playing-time share: more minutes further down the pyramid,
  # adjusted for the player's level vs the club's average squad level.
  estimate_playing_time <- function(player_level, club_avg_level, league_level_diff) {
    base_prob <- 0.3 + 0.12 * league_level_diff
    quality_adj <- (player_level - club_avg_level) / 50
    pmin(0.95, pmax(0.1, base_prob + quality_adj))
  }

  available_clubs %>%
    mutate(
      # Positive diff = destination sits below the parent club's level
      league_level_diff = league_level - player$parent_club_level,
      expected_playing_time_pct = estimate_playing_time(
        player$player_level, avg_player_level, league_level_diff
      ),
      # 85 minutes approximates a full start
      expected_minutes_per_game = expected_playing_time_pct * 85,
      # Predict development with the same feature layout used in training
      predicted_development = predict(model, newdata = tibble(
        player_age = player$age,
        player_level_pre = player$player_level,
        league_level_diff = league_level_diff,
        loan_duration_months = loan_duration,
        club_development_rating = development_rating,
        competition_intensity = competition_intensity
      )),
      # Composite score: z-score each criterion so the fixed weights are
      # comparable across different units
      recommendation_score =
        0.4 * safe_z(predicted_development) +
        0.3 * safe_z(expected_playing_time_pct) +
        0.2 * safe_z(development_rating) +
        0.1 * safe_z(competition_intensity)
    ) %>%
    arrange(desc(recommendation_score))
}
# ---- Candidate loan destinations (illustrative data) ----
available_clubs <- tibble(
  club_id = paste0("C", sprintf("%03d", 1:15)),
  club_name = c("Championship United", "League One FC", "Eredivisie Stars",
                "Belgian Pro Club", "Serie B Calcio", "2. Bundesliga Team",
                "Ligue 2 Olympique", "Segunda Division", "Scottish Premier",
                "Championship Town", "League One City", "Dutch First Div",
                "Portuguese Segunda", "Austrian Bundesliga", "Swiss Super"),
  # 2 = second tier, 3 = third tier (the parent club is level 1)
  league_level = c(2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 2),
  avg_player_level = c(62, 52, 58, 55, 60, 61, 58, 59, 54, 60, 50, 53, 51, 56, 57),
  development_rating = c(75, 70, 82, 78, 72, 85, 74, 76, 68, 72, 65, 70, 66, 80, 77),
  competition_intensity = c(85, 75, 78, 72, 82, 84, 76, 83, 70, 82, 68, 72, 65, 75, 73),
  loan_duration = c(12, 12, 12, 6, 12, 12, 12, 12, 6, 12, 12, 6, 12, 12, 6)
)

# Example academy player seeking a loan move
player_seeking_loan <- list(
  player_id = "YP001",
  name = "Marcus Sterling Jr",
  age = 19,
  position = "Forward",
  player_level = 58,
  parent_club_level = 1 # Premier League club
)

# Rank the destinations and show the top 10
recommendations <- generate_loan_recommendations(
  player_seeking_loan,
  available_clubs,
  loan_model,
  historical_loans
)

cat("\n=== LOAN RECOMMENDATIONS FOR", player_seeking_loan$name, "===\n")
cat("Age:", player_seeking_loan$age, "| Position:", player_seeking_loan$position, "\n")
cat("Current Level:", player_seeking_loan$player_level, "\n\n")

recommendations %>%
  select(club_name, league_level, expected_minutes_per_game,
         predicted_development, recommendation_score) %>%
  mutate(
    # Rounding for display only
    expected_minutes_per_game = round(expected_minutes_per_game, 0),
    predicted_development = round(predicted_development, 2),
    recommendation_score = round(recommendation_score, 2)
  ) %>%
  head(10) %>%
  print()
# ---- Visualize optimal loan characteristics ----
# Scatter of minutes vs development, coloured by league-level gap, with
# one least-squares trend line per gap (the pipeline continues on the
# next line with theme_minimal()).
ggplot(historical_loans, aes(x = minutes_per_game, y = development,
                             color = factor(league_level_diff))) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Loan Outcome: Playing Time vs Development",
    subtitle = "By league level difference from parent club",
    x = "Minutes per Game",
    y = "Development Score",
    color = "League Level\nDifference"
  ) +
theme_minimal()

Summary
Key Takeaways
- Development curves reveal expected progression patterns by age and position, helping identify players ahead of or behind schedule
- Age-adjusted percentiles are essential for fair youth player comparison - raw percentiles mislead when comparing different ages
- Talent identification models must account for relative age effects, physical maturation, and rate of improvement, not just current performance
- Loan success depends primarily on playing time; the optimal loan is typically 1-2 leagues below parent club level with >70 minutes per game
- Academy dashboards should track graduation rates, development efficiency, pathway retention, and sales revenue to measure overall academy effectiveness