Skip to main content

Data Analysis Guide

This guide demonstrates how to load and analyze the RaceData Formula 1 dataset using popular Python libraries. We’ll cover common analysis patterns, joining tables, and real-world examples.

Loading Data

With Pandas

Pandas is the most popular data analysis library for Python and works well for most F1 data analysis tasks.
import pandas as pd

# Pull each CSV into its own DataFrame, keyed by table name
tables = {
    name: pd.read_csv(f'data/{name}.csv')
    for name in ('drivers', 'races', 'results', 'lap_times')
}
drivers, races = tables['drivers'], tables['races']
results, lap_times = tables['results'], tables['lap_times']

# Quick sanity check of what was loaded
print(f"Total drivers: {len(drivers)}")
print(f"Total races: {len(races)}")
print(f"Date range: {races['date'].min()} to {races['date'].max()}")

With Polars

Polars is a faster alternative to pandas, especially for large datasets. It’s written in Rust and uses Apache Arrow.
import polars as pl

# Eager reads: parsed immediately, like pandas but typically faster
drivers = pl.read_csv('data/drivers.csv')
races = pl.read_csv('data/races.csv')
results = pl.read_csv('data/results.csv')
lap_times = pl.read_csv('data/lap_times.csv')

# scan_csv builds a lazy query plan instead of reading right away
lazy_lap_times = pl.scan_csv('data/lap_times.csv')

# Nothing touches disk until .collect() executes the plan
fast_laps = lazy_lap_times.filter(pl.col('milliseconds') < 90000).collect()

print(f"Fast laps: {len(fast_laps)}")
For datasets with 600K+ rows like lap_times.csv, polars can be 5-10x faster than pandas. Consider using polars for large-scale analysis or when performance is critical.

Common Analysis Patterns

Driver Performance Analysis

Analyze a driver’s performance over their career:
import pandas as pd
import numpy as np

# Load required tables
drivers = pd.read_csv('data/drivers.csv')
results = pd.read_csv('data/results.csv')
races = pd.read_csv('data/races.csv')

# Look up Lewis Hamilton's row, then pull every result he recorded
hamilton = drivers.loc[drivers['driverRef'] == 'hamilton'].iloc[0]
hamilton_results = results.loc[results['driverId'] == hamilton['driverId']]

# Attach season/race metadata to each result
hamilton_races = hamilton_results.merge(
    races[['raceId', 'year', 'name', 'date']],
    on='raceId'
)

# Career-level summary numbers
race_count = len(hamilton_races)
stats = {
    'total_races': race_count,
    'wins': (hamilton_races['position'] == 1).sum(),
    'podiums': (hamilton_races['position'] <= 3).sum(),
    'total_points': hamilton_races['points'].sum(),
    'avg_finish': hamilton_races['positionOrder'].mean(),
    # 'R' in positionText marks a retirement (DNF)
    'dnf_rate': (hamilton_races['positionText'] == 'R').sum() / race_count
}

print("Lewis Hamilton Career Stats:")
for key, value in stats.items():
    if isinstance(value, float):
        print(f"  {key}: {value:.2f}")
    else:
        print(f"  {key}: {value}")

# Season-by-season totals: points scored, races won, races entered
yearly_performance = hamilton_races.groupby('year').agg(
    points=('points', 'sum'),
    wins=('position', lambda x: (x == 1).sum()),
    races=('raceId', 'count'),
)

print("\nYearly Performance:")
print(yearly_performance.tail())
Expected Output:
Lewis Hamilton Career Stats:
  total_races: 344
  wins: 103
  podiums: 197
  total_points: 4539.50
  avg_finish: 4.12
  dnf_rate: 0.09

Yearly Performance:
      points  wins  races
year                     
2020   347.0    11     17
2021   387.5    10     22
2022   240.0     1     22
2023   234.0     0     22
2024   223.0     2     24
Constructor Performance Evolution

Analyze how constructor performance has evolved:
import pandas as pd
import numpy as np

# Load data
constructors = pd.read_csv('data/constructors.csv')
constructor_standings = pd.read_csv('data/constructor_standings.csv')
races = pd.read_csv('data/races.csv')

# Locate Ferrari and keep only its standings rows
ferrari = constructors.loc[constructors['constructorRef'] == 'ferrari'].iloc[0]
ferrari_id = ferrari['constructorId']
ferrari_standings = constructor_standings.loc[
    constructor_standings['constructorId'] == ferrari_id
]

# Bring in season/round so rows can be ordered within each year
ferrari_history = ferrari_standings.merge(
    races[['raceId', 'year', 'round']],
    on='raceId'
)

# The standings row at each season's final round is the year-end result
ordered = ferrari_history.sort_values(['year', 'round'])
final_standings = ordered.groupby('year').last().reset_index()

# Bucket seasons into decades and average the key metrics
final_standings['decade'] = final_standings['year'] // 10 * 10
decade_performance = (
    final_standings
    .groupby('decade')[['position', 'points', 'wins']]
    .mean()
    .round(2)
)

print("Ferrari Performance by Decade:")
print(decade_performance)

Head-to-Head Teammate Comparisons

Compare teammates across a season:
import pandas as pd

# Load data
drivers = pd.read_csv('data/drivers.csv')
results = pd.read_csv('data/results.csv')
races = pd.read_csv('data/races.csv')

# Restrict to the season being compared
year = 2024
races_2024 = races[races['year'] == year]
results_2024 = results[results['raceId'].isin(races_2024['raceId'])]

# Collect the drivers each constructor entered in each race
teammate_pairs = (
    results_2024
    .groupby(['raceId', 'constructorId'])['driverId']
    .apply(list)
    .reset_index()
)

# Keep only races where the constructor entered exactly two drivers
teammate_pairs = teammate_pairs[teammate_pairs['driverId'].apply(len) == 2]

# For each race, compare the two teammates' finishing order.
# BUG FIX: sort the pair by driverId so driver1/driver2 are assigned
# consistently in every race — the groupby list preserves the results-file
# row order (often finishing order), which would flip the labels from race
# to race and corrupt the head-to-head tally.
comparisons = []
for _, row in teammate_pairs.iterrows():
    d1_id, d2_id = sorted(row['driverId'])
    race_results = results_2024[
        (results_2024['raceId'] == row['raceId']) &
        (results_2024['driverId'].isin([d1_id, d2_id]))
    ].set_index('driverId')

    if len(race_results) == 2:
        d1_pos = race_results.loc[d1_id, 'positionOrder']
        d2_pos = race_results.loc[d2_id, 'positionOrder']
        comparisons.append({
            'raceId': row['raceId'],
            'constructorId': row['constructorId'],
            'driver1_id': d1_id,
            'driver1_pos': d1_pos,
            'driver2_id': d2_id,
            'driver2_pos': d2_pos,
            'driver1_beat_driver2': d1_pos < d2_pos
        })

comparison_df = pd.DataFrame(comparisons)

# Tally head-to-head records per driver pairing. Grouping by the pair
# itself (not just the constructor) keeps mid-season line-up changes as
# separate records instead of mixing different teammates together.
for (d1_id, d2_id), pair_data in comparison_df.groupby(['driver1_id', 'driver2_id']):
    d1_wins = pair_data['driver1_beat_driver2'].sum()
    d2_wins = len(pair_data) - d1_wins

    d1 = drivers[drivers['driverId'] == d1_id].iloc[0]
    d2 = drivers[drivers['driverId'] == d2_id].iloc[0]

    print(f"{d1['forename']} {d1['surname']}: {d1_wins}")
    print(f"{d2['forename']} {d2['surname']}: {d2_wins}")
    print()

Joining Tables for Complex Queries

Multi-Table Join Example

Combine multiple tables to answer: “Which drivers have won at Monaco?”
import pandas as pd

# Load tables
drivers = pd.read_csv('data/drivers.csv')
results = pd.read_csv('data/results.csv')
races = pd.read_csv('data/races.csv')
circuits = pd.read_csv('data/circuits.csv')

# Find Monaco circuit and grab its id for filtering
monaco = circuits[circuits['name'] == 'Circuit de Monaco']
monaco_id = monaco.iloc[0]['circuitId']

# Chain joins.
# BUG FIX: races and circuits both carry a 'name' column, so merging both
# produced suffixed name_x/name_y columns — the original query on 'name'
# failed, and 'name_y' was the circuit name, not the race name. Filtering
# on circuitId (before any circuits merge) sidesteps the collision, and
# 'name' below is unambiguously the race name from the races table.
monaco_winners = (
    results[results['position'] == 1]          # Only winners
    .merge(races, on='raceId')                 # Get race info
    .query('circuitId == @monaco_id')          # Filter Monaco
    .merge(drivers, on='driverId')             # Get driver info
    [['year', 'forename', 'surname', 'name']]  # Select columns
    .rename(columns={'name': 'race_name'})
    .sort_values('year', ascending=False)
)

print("Monaco Grand Prix Winners:")
print(monaco_winners.head(10))

# Count wins per driver
wins_per_driver = (
    monaco_winners
    .groupby(['forename', 'surname'])
    .size()
    .reset_index(name='wins')
    .sort_values('wins', ascending=False)
)

print("\nMost Monaco Wins:")
print(wins_per_driver.head())

Lap Time Analysis with Joins

Analyze fastest laps with driver and race context:
import pandas as pd

# Load data
lap_times = pd.read_csv('data/lap_times.csv')
races = pd.read_csv('data/races.csv')
drivers = pd.read_csv('data/drivers.csv')
circuits = pd.read_csv('data/circuits.csv')

# Fastest lap of each race: order every lap by duration, keep the first
# row per race, then attach race / circuit / driver context.
# NOTE(review): races and circuits both have a 'name' column, so these
# merges emit suffixed name_x/name_y columns; verify that none of the
# columns selected below (especially 'time') also collide across tables.
per_race_best = (
    lap_times
    .sort_values('milliseconds')
    .groupby('raceId')
    .first()
    .reset_index()
)
fastest_laps = (
    per_race_best
    .merge(races, on='raceId')
    .merge(circuits, on='circuitId')
    .merge(drivers, on='driverId')
)

# All-time record per circuit is the fastest of the per-race bests
circuit_records = (
    fastest_laps
    .sort_values('milliseconds')
    .groupby('circuitRef')
    .first()
    .reset_index()
    [['circuitRef', 'forename', 'surname', 'year', 'time', 'milliseconds']]
)

print("Circuit Lap Records:")
print(circuit_records.head(10))
When joining large tables like lap_times.csv, filter the data first before joining to improve performance. For example, filter to a specific year or race before merging with other tables.

Performance Optimization Tips

Memory Management

import pandas as pd

# Baseline: default parsing gives int64 for every integer column
lap_times = pd.read_csv('data/lap_times.csv')
baseline_mb = lap_times.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {baseline_mb:.2f} MB")

# Shrink each integer column to the smallest dtype that fits its range
compact_dtypes = {
    'raceId': 'int16',
    'driverId': 'int16',
    'lap': 'int8',
    'position': 'int8',
    'milliseconds': 'int32',
}
lap_times_optimized = pd.read_csv('data/lap_times.csv', dtype=compact_dtypes)

optimized_mb = lap_times_optimized.memory_usage(deep=True).sum() / 1024**2
print(f"Optimized memory: {optimized_mb:.2f} MB")

# Columns with few distinct strings (like nationality) shrink further
# as categoricals
drivers = pd.read_csv('data/drivers.csv')
drivers['nationality'] = drivers['nationality'].astype('category')

Efficient Filtering

import pandas as pd
import polars as pl

# Bad: Load everything then filter
results = pd.read_csv('data/results.csv')
results_2023 = results[results['raceId'] > 1100]

# Better: Filter during load if possible
races = pd.read_csv('data/races.csv')
race_ids_2023 = races.loc[races['year'] == 2023, 'raceId'].tolist()

# Then load only relevant results
all_results = pd.read_csv('data/results.csv')
results_2023 = all_results[all_results['raceId'].isin(race_ids_2023)]

# Best: Use SQL or polars for complex filtering — the lazy plan pushes
# the filter down so only matching rows are materialized
lazy_results = pl.scan_csv('data/results.csv')
lazy_races = pl.scan_csv('data/races.csv')
results_2023 = (
    lazy_results
    .join(lazy_races, on='raceId')
    .filter(pl.col('year') == 2023)
    .collect()
)

Vectorized Operations

import pandas as pd
import numpy as np

results = pd.read_csv('data/results.csv')

# Bad: Loop through rows
point_differences = []
for _, row in results.iterrows():
    point_differences.append(row['points'] - 25 if row['position'] == 1 else 0)

# Good: Vectorized operation — one np.where over whole columns
winner_mask = results['position'] == 1
results['point_diff'] = np.where(winner_mask, results['points'] - 25, 0)

# Even better: Use pandas methods
results['is_winner'] = winner_mask.astype(int)

Next Steps

Build docs developers (and LLMs) love