Create ML-ready datasets from Premier League match statistics
The Premier League library includes powerful features for creating machine learning datasets. The create_dataset() method transforms raw match data into feature-engineered datasets perfect for training predictive models.
Maximum number of rows to include. If None, includes all available data (currently max 17,520 matches). When specified, gets the last n rows after sorting by date.
# Example: Arsenal vs Liverpool on 2024-02-15 with lag=10
# For Arsenal (home team):
# - Find Arsenal's last 10 games before 2024-02-15 in the same season
# - Calculate average stats across those 10 games
# - Prefix with "home_": home_xG, home_shots_total_FW, etc.
# For Liverpool (away team):
# - Find Liverpool's last 10 games before 2024-02-15 in the same season
# - Calculate average stats across those 10 games
# - Prefix with "away_": away_xG, away_shots_total_FW, etc.
If a team hasn’t played enough games in the season (fewer than lag games), that match is excluded from the dataset.
# Game identification
game_id: str # Unique match identifier
date: datetime # Match date and time
season: str # Season (e.g., "2023-2024")
match_week: int # Week number in season

# Teams
home_team_id: str # Home team ID
away_team_id: str # Away team ID
home_team: str # Home team name
away_team: str # Away team name

# Pre-match form
home_points: int # Home team's points before match
away_points: int # Away team's points before match
home_shots_total_FW: int # Total shots by forwards
home_shots_total_MF: int # Total shots by midfielders
home_shots_total_DF: int # Total shots by defenders
home_shots_on_target_FW: int # On-target shots by forwards
home_shots_on_target_MF: int # On-target shots by midfielders
home_shots_on_target_DF: int # On-target shots by defenders
home_shot_creating_chances_FW: int # Shot-creating actions by forwards
home_goal_creating_actions_FW: int # Goal-creating actions by forwards
# ... similar for MF, DF
home_passes_completed_FW: int # Completed passes by forwards
home_pass_completion_percentage_FW: float # Pass accuracy for forwards
home_key_passes: int # Passes leading to shots
home_passes_into_final_third: int # Passes into the final third
home_passes_into_penalty_area: int # Passes into box
home_progressive_passes: int # Forward-moving passes
home_tackles_won_FW: int # Successful tackles by forwards
home_blocks_FW: int # Shot blocks by forwards
home_interceptions_FW: int # Interceptions by forwards
home_clearances_FW: int # Clearances by forwards
home_errors_leading_to_goal: int # Defensive errors
# ... similar for MF, DF
home_possession_rate: int # Ball possession %
home_touches_FW: int # Ball touches by forwards
home_touches_att_pen_area_FW: int # Touches in opponent's box
home_take_ons_FW: int # Dribble attempts
home_successful_take_ons_FW: int # Successful dribbles
home_carries_FW: int # Number of carries
home_total_carrying_distance_FW: int # Distance carried (yards)
home_dispossessed_FW: int # Times dispossessed
home_save_percentage: float # Save success rate
home_saves: int # Total saves
home_PSxG: float # Post-shot xG faced
home_passes_completed_GK: int # Passes completed by the goalkeeper
home_crosses_stopped: int # Crosses intercepted
home_yellow_card: int # Yellow cards
home_red_card: int # Red cards
home_fouls_committed_FW: int # Fouls by forwards
home_fouls_drawn_FW: int # Fouls won by forwards
home_offside_FW: int # Offside calls
# ... similar for MF, DF
import numpy as np
import pandas as pd

df = pd.read_csv("ml/match_prediction.csv")

# Create result labels: 1 = home win, 0 = draw, -1 = away win.
df['result'] = np.where(
    df['home_goals'] > df['away_goals'], 1,
    np.where(df['home_goals'] < df['away_goals'], -1, 0),
)

# Feature columns: every home_/away_ column except metadata and targets.
# The team ID columns are string identifiers rather than statistics, so
# they are excluded together with the team names. The goal columns are
# what the label is derived from and must not be used as inputs.
non_feature_cols = {
    'home_team', 'away_team',
    'home_team_id', 'away_team_id',
    'home_goals', 'away_goals',
}
feature_cols = [
    col for col in df.columns
    if col.startswith(('home_', 'away_')) and col not in non_feature_cols
]

X = df[feature_cols]
y = df['result']
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv("ml/match_prediction.csv")

# Prepare data: binary target, 1 = home win, 0 = draw or away win.
df['result'] = (df['home_goals'] > df['away_goals']).astype(int)

# Keep only the home_/away_ statistics. Team names and team IDs are string
# metadata the classifier cannot fit on, and the goal columns are what the
# target is derived from.
non_feature_cols = {
    'home_team', 'away_team',
    'home_team_id', 'away_team_id',
    'home_goals', 'away_goals',
}
feature_cols = [
    col for col in df.columns
    if col.startswith(('home_', 'away_')) and col not in non_feature_cols
]
X = df[feature_cols]
y = df['result']

# Train model (fixed random_state so the importances are reproducible).
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Get feature importance, most important first.
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importances_,
}).sort_values('importance', ascending=False)

# Plot top 20 features
top_features = importance_df.head(20)
plt.figure(figsize=(10, 8))
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('Feature Importance')
plt.title('Top 20 Most Important Features for Match Prediction')
plt.tight_layout()
plt.savefig('feature_importance.png')

print(top_features)
The dataset excludes matches where teams haven’t played enough games:
import pandas as pd

df = pd.read_csv("training_data.csv")

# Check for any remaining NaN values
print(df.isnull().sum())

# Handle save_percentage, which can be NaN. The filled columns are assigned
# back to df: calling fillna(inplace=True) on a column selection is chained
# assignment, which does not update df when pandas Copy-on-Write is active.
save_cols = ['home_save_percentage', 'away_save_percentage']
df[save_cols] = df[save_cols].fillna(0)
Use temporal splits for validation
Don’t use random splits — they let the model train on matches played after the ones it is tested on. Use time-based splits instead:
import pandas as pd

df = pd.read_csv("training_data.csv")
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# Train on earlier matches, test on later matches. Rows dated exactly on
# split_date go to the test set.
split_date = '2024-01-01'
train = df[df['date'] < split_date]
test = df[df['date'] >= split_date]
Issue: Dataset has fewer rows than expected
Cause: High lag value excludes early-season games
Solution: Reduce lag or use more seasons of data
import pandas as pd

# Check how many games are excluded.
# NOTE(review): `stats` is not defined in this snippet; presumably it is the
# library's statistics object created earlier in this guide — confirm.
print(f"Total games in database: {stats.get_total_game_count()}")

# Generate dataset and check size
stats.create_dataset("test.csv", lag=10)
df = pd.read_csv("test.csv")
print(f"Games in dataset: {len(df)}")
If your weighted averages seem unexpected, verify the weight calculation:
lag = 10
params = 0.9

# Linear weights: the integers lag, lag - 1, ..., 1.
lin_weights = list(range(lag, 0, -1))
print(f"Linear weights: {lin_weights}")
print(f"Sum: {sum(lin_weights)}")

# Exponential weights: params ** k for k = 1, ..., lag.
exp_weights = [params ** k for k in range(1, lag + 1)]
print(f"Exponential weights: {[f'{w:.3f}' for w in exp_weights]}")
print(f"Sum: {sum(exp_weights):.3f}")
You now know how to create ML-ready datasets with the Premier League library!