Overview

The FeatureEngineer class turns cleaned NBA data into ML-ready features by deriving new columns, computing rolling statistics, removing multicollinear features, and encoding and scaling the result.

Class Definition

class FeatureEngineer:
    def __init__(self)
Source: ~/workspace/source/NBA Data Preprocessing/task/pipeline/feature_engineering/features.py:16

No configuration parameters are required; all behavior is controlled via method arguments.

Methods

init_rolling_state

def init_rolling_state(self, rolling_window: int = 5) -> RollingState
Initializes state for streaming rolling feature computation.
Parameters:
  rolling_window (int, default: 5)
    Size of the rolling window for streaming statistics.

Returns:
  RollingState
    State object containing:
      • rolling_window (int): Window size
      • salary_window (deque): Circular buffer for salary values
Example:
from pipeline.feature_engineering import FeatureEngineer

engineer = FeatureEngineer()
state = engineer.init_rolling_state(rolling_window=10)

print(f"Window size: {state.rolling_window}")
print(f"Buffer maxlen: {state.salary_window.maxlen}")

build_features

def build_features(
    self, 
    df: pd.DataFrame, 
    rolling_window: int = 5
) -> pd.DataFrame
Builds comprehensive features for batch processing mode.
Parameters:
  df (pd.DataFrame, required)
    Cleaned DataFrame (output from Preprocessor.clean()).
  rolling_window (int, default: 5)
    Window size for rolling statistics.

Returns:
  pd.DataFrame
    DataFrame with the original columns plus the following derived features (sketched below):
      • version_year: Year extracted from the version string
      • age: Player age (version_year - birth year)
      • experience: Years of experience (version_year - draft year)
      • bmi: Body Mass Index (weight / height²)
      • salary_roll_mean: Rolling mean of salary
      • salary_roll_std: Rolling standard deviation of salary
      • birth_month: Month extracted from the birth date
      • draft_decade: Draft year rounded to the decade
      • salary_anomaly: Binary flag for salary z-score > 2.5
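The sketch below shows, for intuition only, how these derivations could be computed; the actual logic lives in features.py. The "NBA2k20"-style version format, weight in kilograms, height in meters, and floor-rounding of the draft decade are assumptions here, not documented behavior.

import pandas as pd

def sketch_build_features(df: pd.DataFrame, rolling_window: int = 5) -> pd.DataFrame:
    out = df.copy()
    # Assumed version format like "NBA2k20" -> 2020
    out['version_year'] = out['version'].str.extract(r'(\d{2})$', expand=False).astype(int) + 2000
    b_day = pd.to_datetime(out['b_day'])
    draft_year = pd.to_datetime(out['draft_year']).dt.year
    out['age'] = out['version_year'] - b_day.dt.year
    out['experience'] = out['version_year'] - draft_year
    out['bmi'] = out['weight'] / out['height'] ** 2  # assumes kg and meters
    out['birth_month'] = b_day.dt.month
    out['draft_decade'] = (draft_year // 10) * 10    # floor to decade (assumption)
    # Rolling stats after sorting by version_year, with min_periods=1 (see Notes)
    out = out.sort_values('version_year')
    roll = out['salary'].rolling(rolling_window, min_periods=1)
    out['salary_roll_mean'] = roll.mean()
    out['salary_roll_std'] = roll.std().fillna(0.0)
    # Binary 2.5-sigma z-score anomaly flag
    z = (out['salary'] - out['salary'].mean()) / out['salary'].std()
    out['salary_anomaly'] = (z > 2.5).astype(int)
    return out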
Example:
from pipeline.preprocessing import Preprocessor
from pipeline.feature_engineering import FeatureEngineer
import pandas as pd

preprocessor = Preprocessor()
engineer = FeatureEngineer()

# Clean data first
raw_df = pd.read_csv('nba_data.csv')
cleaned_df = preprocessor.clean(raw_df)

# Build features
featured_df = engineer.build_features(cleaned_df, rolling_window=10)

print("New features:")
print(featured_df[['age', 'experience', 'bmi', 'salary_roll_mean']].head())

print(f"\nFeature count: {len(featured_df.columns)}")
print(f"Anomalies detected: {featured_df['salary_anomaly'].sum()}")

build_features_streaming

def build_features_streaming(
    self, 
    df: pd.DataFrame, 
    state: RollingState
) -> pd.DataFrame
Builds features for streaming mode with stateful rolling statistics.
Parameters:
  df (pd.DataFrame, required)
    Cleaned DataFrame chunk.
  state (RollingState, required)
    Rolling state object from init_rolling_state().

Returns:
  pd.DataFrame
    DataFrame with the same features as build_features(), but with rolling
    statistics computed incrementally using the shared state.
Example:
from pipeline.preprocessing import Preprocessor
from pipeline.feature_engineering import FeatureEngineer
from pipeline.ingestion import DataIngestor

preprocessor = Preprocessor()
engineer = FeatureEngineer()
ingestor = DataIngestor()

# Initialize rolling state
state = engineer.init_rolling_state(rolling_window=5)

# Process chunks
for chunk in ingestor.stream_chunks('nba_data.csv', chunk_size=100):
    cleaned_chunk = preprocessor.clean(chunk)
    featured_chunk = engineer.build_features_streaming(cleaned_chunk, state)
    
    print(f"Processed chunk: {len(featured_chunk)} rows")
    print(f"Rolling mean: {featured_chunk['salary_roll_mean'].mean():.2f}")
    print(f"State buffer size: {len(state.salary_window)}")
    # State persists across chunks for continuous rolling window

drop_multicollinearity

def drop_multicollinearity(
    self, 
    df: pd.DataFrame, 
    threshold: float = 0.5
) -> pd.DataFrame
Removes highly correlated features to reduce multicollinearity.
Parameters:
  df (pd.DataFrame, required)
    DataFrame with features (should contain the 'salary' target column).
  threshold (float, default: 0.5)
    Correlation threshold. Feature pairs with correlation > threshold are
    candidates for removal.

Returns:
  pd.DataFrame
    DataFrame with multicollinear features removed. Among correlated pairs,
    the feature more correlated with 'salary' is kept.

Algorithm (sketched below):
  1. Compute the correlation matrix for all numeric features (excluding 'salary').
  2. Find pairs with correlation > threshold.
  3. For each pair, remove the feature less correlated with the target ('salary').
  4. Repeat until no correlations exceed the threshold.
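For intuition, here is a minimal sketch of the greedy loop described above; it is one straightforward reading of the four steps, not the library's actual implementation.

import numpy as np
import pandas as pd

def sketch_drop_multicollinearity(df: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
    out = df.copy()
    # Correlation of each feature with the target; unchanged by column drops
    target_corr = out.select_dtypes(include='number').corrwith(out['salary']).abs()
    while True:
        features = out.select_dtypes(include='number').drop(columns=['salary'])
        corr = features.corr().abs()
        np.fill_diagonal(corr.values, 0.0)  # ignore self-correlation
        if corr.empty or corr.values.max() <= threshold:
            return out
        # Most correlated remaining pair
        i, j = np.unravel_index(corr.values.argmax(), corr.shape)
        col_i, col_j = corr.columns[i], corr.columns[j]
        # Drop whichever member is less correlated with 'salary'
        out = out.drop(columns=[col_i if target_corr[col_i] < target_corr[col_j] else col_j])

Because correlations with the target do not change as features are dropped, they are computed once up front; only the feature-feature matrix is recomputed each pass.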
Example:
from pipeline.feature_engineering import FeatureEngineer
import pandas as pd
import numpy as np

engineer = FeatureEngineer()

# Create data with multicollinearity
np.random.seed(42)
df = pd.DataFrame({
    'feature_a': np.random.randn(100),
    'salary': np.random.randn(100) * 1000000 + 5000000
})
df['feature_b'] = df['feature_a'] * 0.9 + np.random.randn(100) * 0.1  # Highly correlated
df['feature_c'] = df['feature_a'] * 0.95 + np.random.randn(100) * 0.05  # Very highly correlated

print(f"Before: {len(df.columns)} features")
print(df.corr())

# Remove multicollinearity
filtered_df = engineer.drop_multicollinearity(df, threshold=0.8)

print(f"\nAfter: {len(filtered_df.columns)} features")
print(filtered_df.columns.tolist())
print(filtered_df.corr())

encode_and_scale

def encode_and_scale(
    self, 
    df: pd.DataFrame
) -> tuple[pd.DataFrame, pd.Series]
Prepares features for ML by encoding categoricals and scaling numerics.
Parameters:
  df (pd.DataFrame, required)
    DataFrame with features and the 'salary' target column.

Returns:
  tuple[pd.DataFrame, pd.Series]
    Tuple of (X, y) where:
      • X: Feature matrix with scaled numeric features and one-hot encoded categoricals
      • y: Target variable (salary)

Dropped columns: 'salary', 'version', 'b_day', 'draft_year', 'weight', 'height'

Transformations (sketched below):
  • Numeric features: z-score normalization (mean=0, std=1)
  • Categorical features: one-hot encoding with the naming pattern {column}__{value}
  • High-cardinality categoricals: dropped if ≥50 unique values
  • Missing values: filled with the median for numeric features, with 'Unknown_' for categorical
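The transformations above could be approximated as follows. This is an illustrative sketch, not the actual encode_and_scale() body: the dropped columns, fill values, and naming pattern come from the lists above, but the pandas calls are my own.

import pandas as pd

def sketch_encode_and_scale(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    y = df['salary']
    X = df.drop(columns=['salary', 'version', 'b_day', 'draft_year', 'weight', 'height'],
                errors='ignore')
    # Numeric: median-fill, then z-score normalize
    num = X.select_dtypes(include='number')
    num = num.fillna(num.median())
    num = (num - num.mean()) / num.std()
    # Categorical: fill, drop high-cardinality (>=50 unique), one-hot as {column}__{value}
    cat = X.select_dtypes(exclude='number').fillna('Unknown_')
    cat = cat.loc[:, cat.nunique() < 50]
    dummies = pd.get_dummies(cat, prefix_sep='__', dtype=float)
    return pd.concat([num, dummies], axis=1), y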
Example:
from pipeline.preprocessing import Preprocessor
from pipeline.feature_engineering import FeatureEngineer
import pandas as pd

preprocessor = Preprocessor()
engineer = FeatureEngineer()

# Full pipeline
raw_df = pd.read_csv('nba_data.csv')
cleaned = preprocessor.clean(raw_df)
featured = engineer.build_features(cleaned)
filtered = engineer.drop_multicollinearity(featured)

# Encode and scale
X, y = engineer.encode_and_scale(filtered)

print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature types:")
print(X.dtypes.value_counts())
print(f"\nTarget statistics:")
print(f"Mean: ${y.mean():,.2f}")
print(f"Std: ${y.std():,.2f}")

# Verify scaling
print(f"\nNumeric feature means (should be ~0):")
print(X.select_dtypes(include='number').mean().head())
print(f"\nNumeric feature stds (should be ~1):")
print(X.select_dtypes(include='number').std().head())

RollingState

@dataclass
class RollingState:
    rolling_window: int
    salary_window: deque[float]
Source: ~/workspace/source/NBA Data Preprocessing/task/pipeline/feature_engineering/features.py:11

Mutable state container for streaming rolling window computation.
Fields:
  rolling_window (int)
    Size of the rolling window.
  salary_window (deque[float])
    Circular buffer storing recent salary values (maxlen = rolling_window).
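A toy demonstration of how this state drives incremental rolling statistics; the per-row update shown here is assumed behavior, and the real update lives in build_features_streaming().

from pipeline.feature_engineering import FeatureEngineer
import statistics

state = FeatureEngineer().init_rolling_state(rolling_window=3)

for salary in [1_000_000, 2_500_000, 900_000, 4_200_000]:
    state.salary_window.append(salary)  # deque evicts the oldest value once full
    mean = statistics.fmean(state.salary_window)
    std = statistics.stdev(state.salary_window) if len(state.salary_window) > 1 else 0.0
    print(f"n={len(state.salary_window)}  mean={mean:,.0f}  std={std:,.0f}")

Because salary_window has maxlen = rolling_window, the fourth append silently evicts the first value, which is exactly the circular-buffer behavior described above.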

Usage Patterns

Batch Feature Engineering

from pipeline.preprocessing import Preprocessor
from pipeline.feature_engineering import FeatureEngineer
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

preprocessor = Preprocessor(random_seed=42)
engineer = FeatureEngineer()

# Load and process
df = pd.read_csv('nba_salaries.csv')
cleaned = preprocessor.clean(df)
featured = engineer.build_features(cleaned, rolling_window=10)
filtered = engineer.drop_multicollinearity(featured, threshold=0.7)
X, y = engineer.encode_and_scale(filtered)

# Train model
model = RandomForestRegressor(random_state=42)
model.fit(X, y)
print(f"Model R²: {model.score(X, y):.4f}")

Streaming Feature Engineering

from pipeline.ingestion import DataIngestor
from pipeline.preprocessing import Preprocessor
from pipeline.feature_engineering import FeatureEngineer
from sklearn.linear_model import SGDRegressor

ingestor = DataIngestor()
preprocessor = Preprocessor()
engineer = FeatureEngineer()

# Initialize streaming state
state = engineer.init_rolling_state(rolling_window=20)
model = SGDRegressor(random_state=42)  # updated incrementally via partial_fit
feature_columns = None

# Process stream
for chunk in ingestor.stream_chunks('large_nba_data.csv', chunk_size=500):
    cleaned = preprocessor.clean(chunk)
    featured = engineer.build_features_streaming(cleaned, state)
    filtered = engineer.drop_multicollinearity(featured)
    X, y = engineer.encode_and_scale(filtered)
    
    # Handle schema consistency
    if feature_columns is None:
        feature_columns = list(X.columns)
    else:
        # reindex adds any missing columns (filled with 0.0) and drops unseen ones
        X = X.reindex(columns=feature_columns, fill_value=0.0)
    
    # Online learning
    if len(X) > 0:
        model.partial_fit(X.to_numpy(), y.to_numpy())
    
    print(f"Processed chunk with {len(X)} rows, buffer size: {len(state.salary_window)}")

Feature Inspection

from pipeline.preprocessing import Preprocessor
from pipeline.feature_engineering import FeatureEngineer
import pandas as pd

preprocessor = Preprocessor()
engineer = FeatureEngineer()

df = pd.read_csv('nba_data.csv')
cleaned = preprocessor.clean(df)
featured = engineer.build_features(cleaned)

# Inspect engineered features
print("Engineered features:")
engineered_cols = ['age', 'experience', 'bmi', 'salary_roll_mean', 
                  'salary_roll_std', 'birth_month', 'draft_decade', 'salary_anomaly']
print(featured[engineered_cols].describe())

# Check correlations with target
filtered = engineer.drop_multicollinearity(featured, threshold=0.5)
corr_with_salary = (filtered.select_dtypes(include='number')
                    .corrwith(filtered['salary'])
                    .abs().drop('salary')  # exclude salary's trivial self-correlation
                    .sort_values(ascending=False))
print("\nTop 10 features correlated with salary:")
print(corr_with_salary.head(10))

# Analyze multicollinearity removal
print(f"\nFeatures before multicollinearity removal: {len(featured.columns)}")
print(f"Features after multicollinearity removal: {len(filtered.columns)}")
print(f"Dropped {len(featured.columns) - len(filtered.columns)} features")

Notes

  • build_features() sorts by version_year before computing rolling statistics
  • build_features_streaming() maintains state across chunks for continuous rolling windows
  • Rolling statistics use min_periods=1 to handle small windows at the start
  • Z-score anomaly detection uses threshold of 2.5 standard deviations
  • One-hot encoding creates binary columns named {column}__{category}
  • High-cardinality features (≥50 unique values) are automatically dropped
  • Multicollinearity removal is iterative and preserves features most correlated with target
  • The 'salary' column is always preserved and never dropped during multicollinearity filtering
  • Date columns ('b_day', 'draft_year') are dropped during encoding, but their derived features remain
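A short sanity check that ties several of these notes to observable behavior ('nba_data.csv' is a placeholder path):

from pipeline.preprocessing import Preprocessor
from pipeline.feature_engineering import FeatureEngineer
import pandas as pd

preprocessor = Preprocessor()
engineer = FeatureEngineer()

featured = engineer.build_features(preprocessor.clean(pd.read_csv('nba_data.csv')))
X, y = engineer.encode_and_scale(engineer.drop_multicollinearity(featured))

# One-hot columns follow the {column}__{category} pattern
print([c for c in X.columns if '__' in c][:5])

# The anomaly flag is binary (2.5-sigma rule)
assert set(featured['salary_anomaly'].unique()) <= {0, 1}

# Dropped date columns are gone, but their derived features survive
assert 'b_day' not in X.columns and 'birth_month' in featured.columns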
