Overview

Data wrangling is the process of cleaning, structuring, and enriching raw data into a usable format for analysis. Real-world data is rarely clean—it contains missing values, duplicates, inconsistent formats, and outliers.
Why Data Wrangling Matters:
  • An estimated 60-80% of a data scientist’s time is spent preparing data
  • Poor data quality leads to unreliable insights
  • Clean data enables accurate modeling and predictions

Complete Data Wrangling Workflow

Here’s a comprehensive example built around a fintech transaction dataset (a few snippets use related example datasets):

1. Load and Explore Data

import pandas as pd
import numpy as np

def cargar_y_explorar(ruta_csv: str) -> pd.DataFrame:
    print("=== 1. LOAD AND EXPLORE DATA ===")
    df = pd.read_csv(ruta_csv)

    print("\nFirst rows of original dataset:")
    print(df.head())

    print("\nDataFrame information:")
    df.info()

    print("\nDescriptive statistics (numeric):")
    print(df.describe())

    print("\nNull values per column:")
    print(df.isna().sum())

    duplicados = df.duplicated().sum()
    print(f"\nDuplicated rows: {duplicados}")

    return df
Always start with .info(), .describe(), and .head() to understand your data structure before making changes.

Handling Missing Data

Identify Missing Values

# Check null values by column
nulos_por_columna = df.isna().sum()
print("Null values per column:")
print(nulos_por_columna)

# Calculate percentage of missing data
porcentaje_nulos = (df.isna().sum() / len(df) * 100).round(2)
print(f"\nPercentage of missing data:")
print(porcentaje_nulos)

Imputation Strategies

# Impute with mean
if "monto" in df.columns:
    media = df["monto"].mean()
    df["monto"] = df["monto"].fillna(media)
    print(f"Imputed mean for 'monto': {media}")

# Impute with median (more robust to outliers)
if "tasa_interes" in df.columns:
    mediana = df["tasa_interes"].median()
    df["tasa_interes"] = df["tasa_interes"].fillna(mediana)

When to Drop vs Impute

# Drop rows with critical missing data (e.g., dates)
df_limpio = df.dropna(subset=['Fecha_Ingreso'])
print(f"Rows after dropping null dates: {len(df_limpio)}")

# Drop columns with >50% missing values
# (thresh = minimum number of non-null values a column needs to be kept)
threshold = int(len(df) * 0.5)
df = df.dropna(thresh=threshold, axis=1)
Be cautious when dropping data: you may lose valuable information. Always document your decisions.

Removing Duplicates

Identify and Remove Duplicates

def limpiar_y_transformar(df: pd.DataFrame) -> pd.DataFrame:
    print("\n=== 2. CLEANING AND TRANSFORMATION ===")
    df_limpio = df.copy()

    # Check for duplicates on key columns (example columns: name and department)
    print("\nDuplicates based on Nombre and Departamento:")
    print(df_limpio[df_limpio.duplicated(subset=['Nombre', 'Departamento'], keep=False)])

    # Remove duplicates
    antes = len(df_limpio)
    df_limpio = df_limpio.drop_duplicates()
    despues = len(df_limpio)
    
    print(f"\nRows before removing duplicates: {antes}")
    print(f"Rows after removing duplicates: {despues}")
    print(f"Duplicates removed: {antes - despues}")

    return df_limpio

# Keep only the first occurrence of each duplicate (keep='last' keeps the last instead)
df = df.drop_duplicates(keep='first')

Handling Outliers

Detect Outliers with IQR Method

# Identify outliers in Monto_Total using IQR
q1 = df["Monto_Total"].quantile(0.25)
q3 = df["Monto_Total"].quantile(0.75)
iqr = q3 - q1

limite_inferior = q1 - 1.5 * iqr
limite_superior = q3 + 1.5 * iqr

# Find outliers
outliers = df[
    (df["Monto_Total"] < limite_inferior) |
    (df["Monto_Total"] > limite_superior)
]

print(f"Number of outliers: {len(outliers)}")
print(f"Percentage: {len(outliers)/len(df)*100:.2f}%")

Outlier Treatment Options

# Clip values to acceptable range
df["Monto_Total"] = df["Monto_Total"].clip(
    lower=limite_inferior,
    upper=limite_superior
)
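
Clipping keeps every row; two common alternatives are removing outlier rows or flagging them for review. A minimal sketch using the IQR bounds computed above:

# Option 2: drop outlier rows entirely
df_sin_outliers = df[
    df["Monto_Total"].between(limite_inferior, limite_superior)
]

# Option 3: keep all rows but flag outliers for later review
df["es_outlier"] = ~df["Monto_Total"].between(limite_inferior, limite_superior)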

Data Transformation

Encoding Categorical Variables

def codificar_categoricas(df: pd.DataFrame) -> pd.DataFrame:
    print("\n=== ENCODING CATEGORICAL VARIABLES ===")
    df_modelo = df.copy()

    # Manual integer mapping (best suited to ordinal categories; shown here for illustration)
    if "tipo_transaccion" in df_modelo.columns:
        mapa_tipo = {
            "pago": 0,
            "retiro": 1,
            "transferencia": 2,
            "deposito": 3
        }
        df_modelo["tipo_transaccion_cod"] = df_modelo["tipo_transaccion"].map(mapa_tipo)
        print("Created 'tipo_transaccion_cod' via manual mapping.")

    # One-hot encoding for nominal categories
    if "segmento_cliente" in df_modelo.columns:
        dummies = pd.get_dummies(df_modelo["segmento_cliente"], prefix="segmento")
        df_modelo = pd.concat([df_modelo.drop(columns=["segmento_cliente"]), dummies], axis=1)
        print("Created dummy columns for 'segmento_cliente'.")

    return df_modelo

Creating Derived Features

# Calculate ticket average
df["Ticket_Promedio"] = df.apply(
    lambda row: row["Monto_Total"] / row["Total_Compras"] 
        if row["Total_Compras"] > 0 else 0,
    axis=1
)

# Binning continuous variables
bins = [0, 1000, 3000, 10000]
labels = ["Bajo", "Medio", "Alto"]
df["Segmento_Monto"] = pd.cut(
    df["Monto_Total"],
    bins=bins,
    labels=labels,
    include_lowest=True
)

Normalization and Scaling

# Min-max normalization (0 to 1)
min_monto = df["Monto_Total"].min()
max_monto = df["Monto_Total"].max()

if max_monto > min_monto:
    df["Monto_Normalizado"] = (
        df["Monto_Total"] - min_monto
    ) / (max_monto - min_monto)
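
Min-max scaling is sensitive to extreme values; z-score standardization (mean 0, standard deviation 1) is a common alternative. A minimal sketch:

# Z-score standardization: (x - mean) / std
media = df["Monto_Total"].mean()
desviacion = df["Monto_Total"].std()

if desviacion > 0:
    df["Monto_Estandarizado"] = (df["Monto_Total"] - media) / desviacion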

Restructuring Data

Renaming and Reordering Columns

def optimizar_y_estructurar(df: pd.DataFrame) -> pd.DataFrame:
    print("\n=== 3. OPTIMIZATION AND STRUCTURING ===")
    df_final = df.copy()

    # Rename columns for clarity
    df_final = df_final.rename(columns={
        'id_cliente': 'cliente_id',
        'monto': 'monto_transaccion'
    })

    # Reorder columns (only those actually present, to avoid a KeyError)
    columnas_deseadas = [
        "cliente_id", "id_transaccion", "fecha",
        "monto_transaccion", "tipo_transaccion", "tasa_interes", "pais"
    ]
    presentes = [c for c in columnas_deseadas if c in df_final.columns]
    otras_columnas = [c for c in df_final.columns if c not in presentes]
    df_final = df_final[presentes + otras_columnas]

    return df_final

Sorting Data

# Sort by multiple columns
df = df.sort_values(
    by=['Ciudad', 'Fecha', 'Monto'],
    ascending=[True, False, False]
).reset_index(drop=True)

Data Consolidation

Combine data from multiple sources:
# Load multiple data sources
df_csv = pd.read_csv("clientes_desde_numpy.csv")
df_ecom_csv = pd.read_csv("clientes_ecommerce.csv")
df_ecom_xlsx = pd.read_excel("clientes_ecommerce.xlsx")

# Ensure matching types
df_ecom_csv["ID"] = df_ecom_csv["ID"].astype(int)
df_csv["ID"] = df_csv["ID"].astype(int)

# Merge datasets
df_unificado = pd.merge(
    df_csv,
    df_ecom_csv[["ID", "Nombre", "Ciudad"]],
    on="ID",
    how="left"
)

df_unificado.to_csv("dataset_consolidado.csv", index=False)
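
Note that df_ecom_xlsx is loaded above but not merged. If the CSV and Excel exports share the same columns, one option is to stack them row-wise first; a minimal sketch, assuming identical schemas:

# Stack both e-commerce exports, then drop records that appear in both files
df_ecom = pd.concat([df_ecom_csv, df_ecom_xlsx], ignore_index=True)
df_ecom = df_ecom.drop_duplicates(subset="ID")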

Data Validation

Quality Checks

# Validate data ranges
assert df['edad'].between(0, 120).all(), "Invalid age values"
assert df['monto'].ge(0).all(), "Negative amounts found"

# Check for required fields
assert df['cliente_id'].notna().all(), "Missing customer IDs"

# Verify data types (assumes 'fecha' was parsed earlier with pd.to_datetime)
assert df['fecha'].dtype == 'datetime64[ns]', "Date not in datetime format"

print("✓ All validation checks passed")

Export Clean Data

def exportar(df_final: pd.DataFrame,
             csv_salida: str = "transacciones_limpias.csv",
             xlsx_salida: str = "transacciones_limpias.xlsx") -> None:
    print("\n=== 4. EXPORT ===")
    
    # Export to CSV
    df_final.to_csv(csv_salida, index=False)
    
    # Export to Excel (requires an engine such as openpyxl)
    df_final.to_excel(xlsx_salida, index=False)
    
    print(f"Files exported: {csv_salida} and {xlsx_salida}")
Always export without the index (index=False) unless the index contains meaningful information.

Complete Workflow Example

def main():
    ruta_csv = "transacciones_raw.csv"

    # 1. Load and explore
    df = cargar_y_explorar(ruta_csv)

    print("\n--- BEFORE CLEANING ---")
    print(df.head())

    # 2. Clean and transform
    df_limpio = limpiar_y_transformar(df)
    df_modelo = codificar_categoricas(df_limpio)
    df_final = optimizar_y_estructurar(df_modelo)

    print("\n--- AFTER TRANSFORMATION ---")
    print(df_final.head())

    # 3. Export
    exportar(df_final)

if __name__ == "__main__":
    main()

Best Practices

Document Decisions

Keep track of all cleaning and transformation decisions

Preserve Raw Data

Never modify original data files directly

Use .copy()

Create copies before modifying DataFrames
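
For example (df_original and df_trabajo are illustrative names):

df_trabajo = df_original.copy()  # edits to df_trabajo won't touch df_original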

Validate Results

Always verify data quality after transformations (a helper sketch follows this checklist):
  • No missing values in critical columns
  • No duplicate records
  • Data types are correct
  • Outliers handled appropriately
  • Categorical variables encoded
  • Derived features created
  • Data normalized/scaled if needed
  • Column names are clear and consistent
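
A minimal sketch of a helper that automates a few of these checks, using column names from the cleaned dataset above:

def verificar_calidad(df: pd.DataFrame, columnas_criticas: list) -> None:
    # No missing values in critical columns
    assert df[columnas_criticas].notna().all().all(), "Missing critical values"
    # No duplicate records
    assert not df.duplicated().any(), "Duplicate records found"
    print("✓ Quality checks passed")

verificar_calidad(df_final, ["cliente_id", "fecha", "monto_transaccion"])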

Next Steps

NumPy & Pandas

Master the fundamentals first

Exploratory Analysis

Visualize and discover patterns in clean data
