Overview
The optimizers module provides factory functions for creating PyTorch optimizers, learning rate schedulers, and loss functions based on configuration dictionaries. It also includes a custom Focal Loss implementation for handling class imbalance.
Classes
FocalLoss
Focal Loss implementation for imbalanced multi-class classification. Reduces loss contribution from easy examples and focuses on hard examples.
class FocalLoss(nn.Module):
def __init__(
self,
alpha: torch.Tensor | None = None,
gamma: float = 2.0,
reduction: str = "mean"
)
alpha
torch.Tensor | None
default: None
Class weights tensor of shape (num_classes,). Used to handle class imbalance.
gamma
float
default: 2.0
Focusing parameter. Higher values down-weight easy examples more. Typical range: [0, 5]
reduction
str
default: "mean"
Specifies the reduction to apply: “none”, “mean”, or “sum”
Formula:
FL(pt) = -alpha * (1 - pt)^gamma * log(pt)
where pt = exp(-CE_loss)
When to use:
- Highly imbalanced datasets
- When many examples are easy to classify
- When you want to focus training on hard examples
Example
import torch
import torch.nn as nn
from training.optimizers import FocalLoss
from training.dataset import compute_class_weights
# Compute class weights
class_weights = compute_class_weights(train_labels, num_classes=9)
# Create focal loss
criterion = FocalLoss(
alpha=class_weights,
gamma=2.0,
reduction="mean"
)
# Use in training
outputs = model(inputs)
loss = criterion(outputs, targets)
loss.backward()
Functions
create_optimizer
Creates a PyTorch optimizer from training configuration.
def create_optimizer(model: nn.Module, config: dict) -> torch.optim.Optimizer
model
nn.Module
required
PyTorch model whose parameters will be optimized
config
dict
required
Training configuration dictionary with keys:
optimizer: Optimizer name (“Adam”, “AdamW”, “SGD with Momentum”, “RMSprop”)
learning_rate: Learning rate (default: 0.001)
l2_decay: Whether to enable L2 regularization (default: False)
l2_lambda: L2 regularization coefficient (default: 0.0001)
Returns: Configured optimizer instance
Supported Optimizers:
| Optimizer | Description | Best For |
|---|---|---|
| Adam | Adaptive learning rate with momentum | General purpose, default choice |
| AdamW | Adam with decoupled weight decay | Better generalization than Adam |
| SGD with Momentum | Classic SGD with momentum=0.9 | Large batch training, simple models |
| RMSprop | Adaptive learning rate | RNNs, online learning |
Example
from training.optimizers import create_optimizer
# Basic Adam optimizer
config = {
"optimizer": "Adam",
"learning_rate": 0.001,
"l2_decay": False
}
optimizer = create_optimizer(model, config)
# AdamW with weight decay
config = {
"optimizer": "AdamW",
"learning_rate": 0.0001,
"l2_decay": True,
"l2_lambda": 0.01
}
optimizer = create_optimizer(model, config)
# SGD with momentum and L2 regularization
config = {
"optimizer": "SGD with Momentum",
"learning_rate": 0.01,
"l2_decay": True,
"l2_lambda": 0.0001
}
optimizer = create_optimizer(model, config)
create_scheduler
Creates a learning rate scheduler from training configuration.
def create_scheduler(
optimizer: torch.optim.Optimizer,
config: dict,
steps_per_epoch: int,
) -> torch.optim.lr_scheduler.LRScheduler | None
optimizer
torch.optim.Optimizer
required
Optimizer instance to schedule
config
dict
required
Training configuration dictionary with keys:
lr_strategy: Scheduler name (“Constant”, “ReduceLROnPlateau”, “Cosine Annealing”, “Step Decay”, “Exponential Decay”)
epochs: Total number of training epochs
steps_per_epoch
int
required
Number of training steps per epoch (length of train DataLoader)
Returns: Configured scheduler instance or None if using constant learning rate
Supported Schedulers:
| Scheduler | Description | Parameters |
|---|---|---|
| Constant | No scheduling | None |
| ReduceLROnPlateau | Reduce LR when metric plateaus | factor=0.5, patience=5, min_lr=1e-6 |
| Cosine Annealing | Cosine decay to minimum | T_max=epochs, eta_min=1e-6 |
| Step Decay | Reduce LR at fixed intervals | step_size=epochs/3, gamma=0.1 |
| Exponential Decay | Exponential decay | gamma=0.95 |
Example
from training.optimizers import create_optimizer, create_scheduler
optimizer = create_optimizer(model, training_config)
# Constant learning rate
config = {"lr_strategy": "Constant"}
scheduler = create_scheduler(optimizer, config, steps_per_epoch=100)
# Returns None
# ReduceLROnPlateau - reduces LR when validation loss plateaus
config = {
"lr_strategy": "ReduceLROnPlateau",
"epochs": 100
}
scheduler = create_scheduler(optimizer, config, steps_per_epoch=100)
# Call with: scheduler.step(val_loss)
# Cosine Annealing - smooth decay
config = {
"lr_strategy": "Cosine Annealing",
"epochs": 100
}
scheduler = create_scheduler(optimizer, config, steps_per_epoch=100)
# Call with: scheduler.step()
# Step Decay - drops LR at fixed intervals
config = {
"lr_strategy": "Step Decay",
"epochs": 90
}
scheduler = create_scheduler(optimizer, config, steps_per_epoch=100)
# Reduces LR by 0.1x every 30 epochs
create_criterion
Creates a loss function from training configuration.
def create_criterion(
config: dict,
class_weights: torch.Tensor | None = None,
device: torch.device | None = None,
) -> nn.Module
config
dict
required
Training configuration dictionary with keys:
class_weights: Loss type (“None”, “Auto Class Weights”, “Focal Loss”)
class_weights
torch.Tensor
default: None
Optional class weights tensor from compute_class_weights()
device
torch.device
default: None
Device to move class_weights to
Returns: Loss function (nn.CrossEntropyLoss or FocalLoss)
Loss Functions:
| Type | Loss Function | Use Case |
|---|---|---|
| None | CrossEntropyLoss() | Balanced datasets |
| Auto Class Weights | CrossEntropyLoss(weight=class_weights) | Imbalanced datasets |
| Focal Loss | FocalLoss(alpha=class_weights) | Highly imbalanced, hard examples |
Example
from training.optimizers import create_criterion
from training.dataset import compute_class_weights
# Standard cross entropy
config = {"class_weights": "None"}
criterion = create_criterion(config)
# Weighted cross entropy for imbalanced data
class_weights = compute_class_weights(train_labels, num_classes=9)
config = {"class_weights": "Auto Class Weights"}
criterion = create_criterion(config, class_weights, device)
# Focal loss for highly imbalanced data
class_weights = compute_class_weights(train_labels, num_classes=9)
config = {"class_weights": "Focal Loss"}
criterion = create_criterion(config, class_weights, device)
Complete Training Setup
import torch
import torch.nn as nn
from training.dataset import create_dataloaders
from training.optimizers import create_optimizer, create_scheduler, create_criterion
from training.engine import TrainingEngine
# Configuration
training_config = {
"optimizer": "AdamW",
"learning_rate": 0.0001,
"l2_decay": True,
"l2_lambda": 0.01,
"lr_strategy": "Cosine Annealing",
"epochs": 100,
"batch_size": 32,
"class_weights": "Focal Loss"
}
dataset_config = {
"dataset_path": "dataset",
"preprocessing": {"target_size": (224, 224)},
"augmentation": {"preset": "Moderate"}
}
model_config = {
"architecture": "ResNet50",
"num_classes": 9,
"pretrained": True
}
# Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = build_model(model_config).to(device)
# Create dataloaders
dataloaders, class_names, class_weights = create_dataloaders(
dataset_config,
training_config
)
# Create optimizer, scheduler, and loss
optimizer = create_optimizer(model, training_config)
scheduler = create_scheduler(
optimizer,
training_config,
steps_per_epoch=len(dataloaders["train"])
)
criterion = create_criterion(
training_config,
class_weights,
device
)
# Create training engine
engine = TrainingEngine(
model=model,
train_loader=dataloaders["train"],
val_loader=dataloaders["val"],
optimizer=optimizer,
criterion=criterion,
device=device,
scheduler=scheduler,
early_stopping_patience=10
)
# Train
results = engine.fit(epochs=training_config["epochs"])
print(f"Training completed in {results['duration']}")
print(f"Best validation loss: {results['best_val_loss']:.4f}")
Learning Rate Schedule Visualization
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from training.optimizers import create_optimizer, create_scheduler
model = nn.Linear(10, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Test different schedulers
schedulers = [
("Cosine Annealing", "Cosine Annealing"),
("Step Decay", "Step Decay"),
("Exponential Decay", "Exponential Decay")
]
plt.figure(figsize=(12, 4))
for idx, (name, strategy) in enumerate(schedulers):
config = {"lr_strategy": strategy, "epochs": 100}
scheduler = create_scheduler(optimizer, config, steps_per_epoch=100)
lrs = []
for epoch in range(100):
lrs.append(optimizer.param_groups[0]["lr"])
if scheduler:
scheduler.step()
plt.subplot(1, 3, idx + 1)
plt.plot(lrs)
plt.title(name)
plt.xlabel("Epoch")
plt.ylabel("Learning Rate")
plt.grid(True)
# Reset optimizer
for param_group in optimizer.param_groups:
param_group["lr"] = 0.001
plt.tight_layout()
plt.savefig("lr_schedules.png")