Skip to main content

Classic Training: BERT-Based Models

This guide covers training BERT-based models for text classification using HuggingFace Transformers, featuring the SST-2 sentiment analysis task.

Overview

The classic example demonstrates:
  • Fine-tuning pre-trained BERT models
  • Text classification with sequence models
  • Experiment tracking with W&B
  • Model evaluation and metrics
  • Model card generation
Example Task: Binary sentiment classification on SST-2 (Stanford Sentiment Treebank)
Base Model: MobileBERT (google/mobilebert-uncased)
Framework: HuggingFace Transformers + PyTorch

Quick Start

# Navigate to classic example
cd module-3/classic-example

# Build Docker container
make build

# Run development environment
make run_dev

# Inside container:
export PYTHONPATH=.
export WANDB_PROJECT=ml-in-production-practice
export WANDB_API_KEY=your_key

# Load data and train
python classic_example/cli.py load-sst2-data ./data
python classic_example/cli.py train ./conf/example.json

Project Structure

Review the complete package structure:
classic-example/
├── classic_example/
│   ├── __init__.py
│   ├── cli.py               # Command-line interface
│   ├── config.py            # Configuration dataclasses
│   ├── data.py              # Data loading
│   ├── train.py             # Training logic
│   ├── predictor.py         # Inference
│   └── utils.py             # Metrics and helpers
├── conf/
│   ├── example.json         # Full training config
│   └── fast.json            # Quick CI testing
├── tests/
│   ├── test_code.py         # Unit tests
│   ├── test_data.py         # Data validation
│   └── test_model.py        # Model tests
└── requirements.txt

Training Pipeline

Step 1: Data Loading

Load and prepare the SST-2 dataset:
data.py
from datasets import load_dataset
from sklearn.model_selection import train_test_split

def load_sst2_data(
    path_to_save: Path,
    test_size: float = 0.2,
    random_state: int = 42,
) -> None:
    """Download SST-2 (GLUE) and write train/val/test CSV splits.

    Args:
        path_to_save: Directory to write the CSV files into (created,
            including parents, if it does not exist).
        test_size: Fraction of the training data held out as the
            validation split. Defaults to 0.2 (the original behavior).
        random_state: Seed for the train/val split, so the split is
            reproducible. Defaults to 42 (the original behavior).
    """
    path_to_save.mkdir(parents=True, exist_ok=True)

    # Load from HuggingFace datasets
    dataset = load_dataset("glue", "sst2")
    df_all = dataset["train"].to_pandas()
    df_test = dataset["test"].to_pandas()

    # Carve a validation set out of the training portion
    df_train, df_val = train_test_split(
        df_all,
        random_state=random_state,
        test_size=test_size,
    )

    # Save to CSV (index column omitted so files round-trip cleanly)
    df_train.to_csv(path_to_save / "train.csv", index=False)
    df_val.to_csv(path_to_save / "val.csv", index=False)
    df_test.to_csv(path_to_save / "test.csv", index=False)
Dataset Statistics:
  • Training: ~53,000 examples
  • Validation: ~14,000 examples
  • Test: ~1,800 examples
  • Classes: Binary (positive/negative sentiment)

Step 2: Configuration

Define training configuration in conf/example.json:
example.json
{
  "model_name_or_path": "google/mobilebert-uncased",
  "train_file": "./data/train.csv",
  "validation_file": "./data/val.csv",
  "output_dir": "results",
  
  "max_seq_length": 128,
  "per_device_train_batch_size": 32,
  "per_device_eval_batch_size": 32,
  "gradient_accumulation_steps": 1,
  
  "learning_rate": 5e-05,
  "num_train_epochs": 5,
  "warmup_ratio": 0.1,
  "weight_decay": 0.01,
  
  "eval_strategy": "steps",
  "eval_steps": 250,
  "logging_steps": 250,
  "save_steps": 250,
  
  "load_best_model_at_end": true,
  "metric_for_best_model": "eval_f1",
  "greater_is_better": true,
  
  "report_to": ["wandb"],
  "seed": 42
}

Step 3: Model Loading

Load pre-trained BERT model and tokenizer:
train.py
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

def get_models(model_args, num_labels):
    """Instantiate config, tokenizer, and classification head from a checkpoint.

    All three objects are resolved from ``model_args.model_name_or_path``
    and share the same cache directory.

    Returns:
        A ``(config, tokenizer, model)`` tuple.
    """
    checkpoint = model_args.model_name_or_path
    cache_dir = model_args.cache_dir

    config = AutoConfig.from_pretrained(
        checkpoint,
        num_labels=num_labels,
        cache_dir=cache_dir,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint,
        cache_dir=cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        config=config,
        cache_dir=cache_dir,
    )

    return config, tokenizer, model

Step 4: Dataset Processing

Tokenize and prepare datasets:
train.py
from functools import partial

def preprocess_function_examples(
    examples, tokenizer, padding, max_seq_length, label_to_id
):
    """Tokenize a batch of examples and map string labels to integer ids.

    A label of ``-1`` (unlabeled data, e.g. the SST-2 test split) is passed
    through unchanged. When ``label_to_id`` is None or the batch carries no
    "label" column, labels are left untouched.
    """
    encoded = tokenizer(
        examples["sentence"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )

    has_mappable_labels = label_to_id is not None and "label" in examples
    if has_mappable_labels:
        encoded["label"] = [
            -1 if raw == -1 else label_to_id[raw]
            for raw in examples["label"]
        ]

    return encoded

def process_dataset(
    data_args, label_list, model, config, 
    tokenizer, training_args, raw_datasets
):
    """Tokenize the raw datasets and sync label mappings onto the model config.

    Returns:
        The processed ``train`` and ``validation`` splits, in that order.
    """
    padding = "max_length" if data_args.pad_to_max_length else False

    # Build label <-> id mappings and store them on the model so they
    # travel with the saved checkpoint.
    label_to_id = {label: idx for idx, label in enumerate(label_list)}
    model.config.label2id = label_to_id
    model.config.id2label = {idx: label for label, idx in label_to_id.items()}

    # Cap the sequence length at what the tokenizer/model supports.
    effective_max_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    tokenize_batch = partial(
        preprocess_function_examples,
        tokenizer=tokenizer,
        padding=padding,
        max_seq_length=effective_max_length,
        label_to_id=label_to_id,
    )

    processed = raw_datasets.map(
        tokenize_batch,
        batched=True,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Tokenizing dataset",
    )

    return processed["train"], processed["validation"]

Step 5: Metrics

Define evaluation metrics:
utils.py
import numpy as np
from sklearn.metrics import f1_score, fbeta_score
from transformers import EvalPrediction

def compute_metrics(p: EvalPrediction) -> dict[str, float]:
    """Compute binary F1 and F0.5 scores for an evaluation batch.

    ``p.predictions`` holds per-class logits/scores; the argmax over the
    class axis yields hard predictions compared against ``p.label_ids``.

    Note: the original annotated the return as ``Dict[str, float]`` but the
    shown imports never bring in ``typing.Dict``, which raises a NameError
    at definition time; the builtin generic ``dict`` (Python 3.9+) needs
    no import.
    """
    preds = np.argmax(p.predictions, axis=1)

    return {
        "f1": f1_score(y_true=p.label_ids, y_pred=preds),
        "f0.5": fbeta_score(y_true=p.label_ids, y_pred=preds, beta=0.5),
    }

Step 6: Training

Create Trainer and run training:
train.py
from transformers import Trainer, TrainingArguments, set_seed

def train(config_path: Path) -> None:
    """Run the end-to-end fine-tuning pipeline described by a JSON config.

    Pipeline: parse config -> seed RNGs -> load data -> load model and
    tokenizer -> tokenize datasets -> train -> save artifacts -> evaluate
    -> write a model card.

    Args:
        config_path: Path to a JSON file holding model, data, and
            HuggingFace training arguments (see conf/example.json).
    """
    # Load configuration (three argument groups parsed from one JSON file)
    model_args, data_args, training_args = get_config(config_path)
    
    # Set random seed for reproducible runs
    set_seed(training_args.seed)
    
    # Load data: raw splits plus the label vocabulary
    raw_datasets, num_labels, label_list = read_dataset(
        data_args=data_args,
        cache_dir=model_args.cache_dir
    )
    
    # Load model, tokenizer, and config from the pre-trained checkpoint
    config, tokenizer, model = get_models(
        model_args=model_args,
        num_labels=num_labels
    )
    
    # Process datasets: tokenization + label-id mapping
    train_dataset, eval_dataset = process_dataset(
        data_args=data_args,
        label_list=label_list,
        model=model,
        config=config,
        tokenizer=tokenizer,
        training_args=training_args,
        raw_datasets=raw_datasets,
    )
    
    # Create trainer (checkpointing/eval cadence comes from training_args)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )
    
    # Train
    train_result = trainer.train()
    metrics = train_result.metrics
    
    # Save model and training state/metrics
    trainer.save_model()
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    
    # Evaluate — `metrics` is deliberately rebound to the eval results here
    metrics = trainer.evaluate(eval_dataset=eval_dataset)
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)
    
    # Generate model card alongside the saved checkpoint
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "tasks": "text-classification",
        "language": "en",
        "dataset_tags": "sst2",
    }
    trainer.create_model_card(**kwargs)
    # NOTE(review): `logger` is not defined in this snippet — assumes a
    # module-level logger exists in train.py; confirm.
    logger.info(f"Model card saved to {training_args.output_dir}/README.md")

Command-Line Interface

Use the CLI for all operations:
cli.py
import typer
from classic_example.train import train
from classic_example.data import load_sst2_data
from classic_example.utils import upload_to_registry, load_from_registry

app = typer.Typer()

# Register every pipeline entry point as a CLI command, in a fixed order.
for _command in (train, load_sst2_data, upload_to_registry, load_from_registry):
    app.command()(_command)

if __name__ == "__main__":
    app()
Usage:
# Load data
python classic_example/cli.py load-sst2-data ./data

# Train model
python classic_example/cli.py train ./conf/example.json

# Upload to registry
python classic_example/cli.py upload-to-registry \
    my-model ./results

# Download from registry
python classic_example/cli.py load-from-registry \
    my-model:latest ./downloaded

Testing

Run comprehensive tests:
# All tests
make test

# Specific test suites
make test_code    # Unit tests
make test_data    # Data validation
make test_model   # Model tests

# With coverage
make test_all
Key test examples:
test_code.py
def test_compute_metrics(eval_pred: EvalPrediction):
    """A perfect-prediction fixture must yield exactly 1.0 for both metrics."""
    expected = {"f1": 1.0, "f0.5": 1.0}
    assert compute_metrics(eval_pred) == expected

Model Registry

Upload and manage models with W&B:
utils.py
import wandb
from pathlib import Path

def upload_to_registry(model_name: str, model_path: Path):
    """Upload model artifacts from ``model_path`` to W&B as one artifact."""
    # Files added in a fixed order so the artifact manifest is stable.
    artifact_files = (
        "config.json",
        "model.safetensors",
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "README.md",
    )
    with wandb.init() as _:
        art = wandb.Artifact(model_name, type="model")
        for file_name in artifact_files:
            art.add_file(model_path / file_name)
        wandb.log_artifact(art)

def load_from_registry(model_name: str, model_path: Path):
    """Download a model artifact from W&B into ``model_path``.

    Returns:
        The local directory the artifact was downloaded to.
    """
    with wandb.init() as run:
        model_artifact = run.use_artifact(model_name, type="model")
        return model_artifact.download(root=model_path)
Explore pre-trained models on HuggingFace:

BERT Base

110M parameters, uncased

MobileBERT

25M parameters, optimized for mobile

DistilBERT

66M parameters, 40% faster

RoBERTa

125M parameters, improved training
Change the base model by editing "model_name_or_path" in your config (keep the other keys unchanged — note that strict JSON does not allow comments):
{
  "model_name_or_path": "distilbert-base-uncased"
}

Resources

HuggingFace Models

Browse 100,000+ pre-trained models

Transformers Docs

Complete HuggingFace documentation

Text Classification Example

Official HuggingFace example

PyTorch Training Tips

Make PyTorch models train faster

Next Steps

LLM Training

Learn to fine-tune modern LLMs like Phi-3 with LoRA

Build docs developers (and LLMs) love