Skip to main content

Overview

The Preprocessor class handles data cleaning, type conversions, missing value imputation, and outlier detection for NBA player datasets.

Class Definition

class Preprocessor:
    def __init__(self, random_seed: int = 42, missing_strategy: str = 'median')
Source: ~/workspace/source/NBA Data Preprocessing/task/pipeline/preprocessing/core.py:7

Constructor

random_seed
int
default:"42"
Random seed for reproducibility (reserved for future randomized operations)
missing_strategy
str
default:"'median'"
Strategy for handling missing values in numeric columns:
  • 'median': Fill with column median
  • 'mean': Fill with column mean
  • Any other value: Fill with 0

Methods

clean

def clean(self, df: pd.DataFrame) -> pd.DataFrame
Performs comprehensive data cleaning and type conversions on NBA player data.
df
pd.DataFrame
required
Raw DataFrame containing NBA player data
return
pd.DataFrame
Cleaned DataFrame with:
  • Parsed datetime columns (b_day, draft_year)
  • Converted height/weight to metric (float)
  • Cleaned salary (removed $, converted to float)
  • Standardized country to USA/Not-USA
  • Replaced “Undrafted” with “0” in draft_round
  • Imputed missing values per missing_strategy
Transformations performed:
  • b_day: Parsed as %m/%d/%y format datetime
  • draft_year: Parsed as %Y format datetime
  • team: Missing values filled with “No Team”
  • height: Extracts metric value (cm) from “imperial / metric” format
  • weight: Extracts metric value (kg) from “imperial / metric kg.” format
  • salary: Removes $ symbol and converts to float
  • country: Binarized to “USA” or “Not-USA”
  • draft_round: “Undrafted” replaced with “0”
Example:
from pipeline.preprocessing import Preprocessor
import pandas as pd

preprocessor = Preprocessor(random_seed=42, missing_strategy='median')

# Raw data
raw_df = pd.DataFrame({
    'b_day': ['01/15/95', '03/22/90'],
    'draft_year': ['2017', '2012'],
    'team': ['LAL', None],
    'height': ['6-5 / 196', '6-8 / 203'],
    'weight': ['210 / 95.3 kg.', '225 / 102.1 kg.'],
    'salary': ['$5000000', '$8500000'],
    'country': ['USA', 'France'],
    'draft_round': ['1', 'Undrafted']
})

cleaned_df = preprocessor.clean(raw_df)

print(cleaned_df['height'].tolist())  # [196.0, 203.0]
print(cleaned_df['weight'].tolist())  # [95.3, 102.1]
print(cleaned_df['salary'].tolist())  # [5000000.0, 8500000.0]
print(cleaned_df['country'].tolist())  # ['USA', 'Not-USA']
print(cleaned_df['draft_round'].tolist())  # ['1', '0']
print(cleaned_df['team'].tolist())  # ['LAL', 'No Team']

handle_missing

def handle_missing(self, df: pd.DataFrame) -> pd.DataFrame
Imputes missing values in numeric and categorical columns.
df
pd.DataFrame
required
DataFrame with potential missing values
return
pd.DataFrame
DataFrame with no missing values:
  • Numeric columns: Filled per missing_strategy
  • Categorical columns: Filled with 'Unknown_{column_name}'
Example:
from pipeline.preprocessing import Preprocessor
import pandas as pd
import numpy as np

preprocessor = Preprocessor(missing_strategy='median')

df = pd.DataFrame({
    'age': [25, 30, np.nan, 28, np.nan],
    'salary': [1000000, np.nan, 3000000, np.nan, 2500000],
    'team': ['LAL', None, 'GSW', 'BOS', None],
    'position': ['PG', 'SG', None, 'SF', 'C']
})

print("Before:")
print(df.isna().sum())
# age        2
# salary     2
# team       2
# position   1

cleaned = preprocessor.handle_missing(df)

print("\nAfter:")
print(cleaned.isna().sum())
# age        0
# salary     0
# team       0
# position   0

print("\nFilled values:")
print(cleaned['age'].tolist())  # [25.0, 30.0, 28.0, 28.0, 28.0]  (median; floats because the column contained NaN)
print(cleaned['salary'].tolist())  # [1000000.0, 2500000.0, 3000000.0, 2500000.0, 2500000.0]
print(cleaned['team'].tolist())  # ['LAL', 'Unknown_team', 'GSW', 'BOS', 'Unknown_team']
print(cleaned['position'].tolist())  # ['PG', 'SG', 'Unknown_position', 'SF', 'C']

detect_outliers_iqr

def detect_outliers_iqr(
    self, 
    df: pd.DataFrame, 
    multiplier: float = 1.5
) -> pd.Series
Detects outliers in numeric columns using the Interquartile Range (IQR) method.
df
pd.DataFrame
required
DataFrame with numeric columns to check for outliers
multiplier
float
default:"1.5"
IQR multiplier for the outlier threshold. Higher values widen the bounds, so fewer values are flagged as outliers.
  • 1.5: Standard outlier detection
  • 3.0: Extreme outlier detection only
return
pd.Series
Boolean Series in which True marks a row that contains an outlier value in at least one numeric column
Outlier detection formula:
Lower bound = Q1 - (multiplier × IQR)
Upper bound = Q3 + (multiplier × IQR)
Outlier if: value < lower_bound OR value > upper_bound
Example:
from pipeline.preprocessing import Preprocessor
import pandas as pd

preprocessor = Preprocessor()

df = pd.DataFrame({
    'salary': [1000000, 2000000, 1500000, 50000000, 1800000],  # 50M is outlier
    'age': [25, 28, 30, 27, 26],
    'height': [190, 195, 200, 188, 192]
})

# Standard outlier detection
outlier_mask = preprocessor.detect_outliers_iqr(df, multiplier=1.5)
print(outlier_mask.tolist())  # [False, False, False, True, False]
print(f"Found {outlier_mask.sum()} outlier(s)")

# Only extreme outliers
outlier_mask_extreme = preprocessor.detect_outliers_iqr(df, multiplier=3.0)
print(outlier_mask_extreme.tolist())  # [False, False, False, True, False]

# Filter out outliers
clean_df = df[~outlier_mask]
print(f"Removed {outlier_mask.sum()} rows, {len(clean_df)} remaining")

# Get outlier rows
outlier_df = df[outlier_mask]
print("\nOutlier rows:")
print(outlier_df)

Usage Patterns

Basic Cleaning Pipeline

from pipeline.preprocessing import Preprocessor
import pandas as pd

preprocessor = Preprocessor(
    random_seed=42,
    missing_strategy='median'
)

# Load and clean data
raw_df = pd.read_csv('nba_raw.csv')
cleaned_df = preprocessor.clean(raw_df)

print(f"Cleaned {len(cleaned_df)} rows")
print(f"Columns: {list(cleaned_df.columns)}")
print(f"Missing values: {cleaned_df.isna().sum().sum()}")  # Should be 0

Outlier Analysis

from pipeline.preprocessing import Preprocessor
import pandas as pd

preprocessor = Preprocessor()
df = pd.read_csv('nba_data.csv')

# Clean first
cleaned = preprocessor.clean(df)

# Detect outliers in numeric columns only
numeric_df = cleaned.select_dtypes(include='number')
outlier_mask = preprocessor.detect_outliers_iqr(numeric_df, multiplier=1.5)

print(f"Total rows: {len(cleaned)}")
print(f"Outliers: {outlier_mask.sum()} ({outlier_mask.mean():.1%})")

# Separate analysis
normal_data = cleaned[~outlier_mask]
outlier_data = cleaned[outlier_mask]

print(f"\nNormal salary range: ${normal_data['salary'].min():,.0f} - ${normal_data['salary'].max():,.0f}")
print(f"Outlier salary range: ${outlier_data['salary'].min():,.0f} - ${outlier_data['salary'].max():,.0f}")

Custom Missing Value Strategy

from pipeline.preprocessing import Preprocessor
import pandas as pd

# Load the raw data to clean with each strategy
raw_df = pd.read_csv('nba_raw.csv')

# Mean imputation
preprocessor_mean = Preprocessor(missing_strategy='mean')
df_mean = preprocessor_mean.clean(raw_df)

# Median imputation (default)
preprocessor_median = Preprocessor(missing_strategy='median')
df_median = preprocessor_median.clean(raw_df)

# Zero imputation
preprocessor_zero = Preprocessor(missing_strategy='zero')
df_zero = preprocessor_zero.clean(raw_df)

# Compare strategies
print("Salary imputation comparison:")
print(f"Mean strategy: {df_mean['salary'].mean():.2f}")
print(f"Median strategy: {df_median['salary'].median():.2f}")
print(f"Zero strategy: {df_zero['salary'].mean():.2f}")

Integration with Pipeline

from pipeline.config import PipelineConfig
from pipeline.preprocessing import Preprocessor
from pipeline.ingestion import DataIngestor

config = PipelineConfig(random_seed=42)
ingestor = DataIngestor(config.random_seed)
preprocessor = Preprocessor(
    random_seed=config.random_seed,
    missing_strategy='median'
)

# Load and clean
raw_df = ingestor.load('nba_data.csv')
cleaned_df = preprocessor.clean(raw_df)

# Detect and report outliers
outlier_mask = preprocessor.detect_outliers_iqr(
    cleaned_df.select_dtypes(include='number')
)

print(f"Processed {len(cleaned_df)} rows")
print(f"Outliers detected: {outlier_mask.sum()}")
print(f"Outlier rate: {outlier_mask.mean():.2%}")

Notes

  • The clean() method automatically calls handle_missing() as the final step
  • Height parsing expects the format "imperial / metric" (e.g. "6-5 / 196"); weight parsing expects "imperial / metric kg." (e.g. "210 / 95.3 kg.")
  • Salary parsing strips the $ prefix but does not handle thousands separators: values are assumed to contain no commas (e.g. "$5000000", not "$5,000,000")
  • Country binarization treats only “USA” as USA, all others as “Not-USA”
  • Datetime parsing errors are coerced to NaT (Not a Time)
  • IQR outlier detection computes bounds separately for each numeric column and flags a row if any of its values falls outside the bounds of its column
  • Empty DataFrames return all-False outlier masks
  • Missing value imputation uses pandas’ .fillna() with computed statistics

Build docs developers (and LLMs) love