Classic Training: BERT-Based Models
This guide covers training BERT-based models for text classification using HuggingFace Transformers, featuring the SST-2 sentiment analysis task.
Overview
The classic example demonstrates:
Fine-tuning pre-trained BERT models
Text classification with sequence models
Experiment tracking with W&B
Model evaluation and metrics
Model card generation
Example Task: Binary sentiment classification on SST-2 (Stanford Sentiment Treebank). Base Model: MobileBERT (google/mobilebert-uncased). Framework: HuggingFace Transformers + PyTorch.
Quick Start
# Navigate to classic example
cd module-3/classic-example
# Build Docker container
make build
# Run development environment
make run_dev
# Inside container:
export PYTHONPATH=.
export WANDB_PROJECT=ml-in-production-practice
export WANDB_API_KEY=your_key
# Load data and train
python classic_example/cli.py load-sst2-data ./data
python classic_example/cli.py train ./conf/example.json
Project Structure
Review the complete package structure:
classic-example/
├── classic_example/
│ ├── __init__.py
│ ├── cli.py # Command-line interface
│ ├── config.py # Configuration dataclasses
│ ├── data.py # Data loading
│ ├── train.py # Training logic
│ ├── predictor.py # Inference
│ └── utils.py # Metrics and helpers
├── conf/
│ ├── example.json # Full training config
│ └── fast.json # Quick CI testing
├── tests/
│ ├── test_code.py # Unit tests
│ ├── test_data.py # Data validation
│ └── test_model.py # Model tests
└── requirements.txt
Training Pipeline
Step 1: Data Loading
Load and prepare the SST-2 dataset:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
def load_sst2_data(path_to_save: Path) -> None:
    """Download SST-2 (GLUE) and persist train/val/test splits as CSV.

    Args:
        path_to_save: Directory for the CSV files; created if missing.
    """
    path_to_save.mkdir(parents=True, exist_ok=True)

    # Pull the GLUE SST-2 dataset from the HuggingFace hub.
    dataset = load_dataset("glue", "sst2")
    df_all = dataset["train"].to_pandas()
    df_test = dataset["test"].to_pandas()

    # Carve a validation set out of the official training split.
    # The fixed seed keeps the split reproducible across runs.
    df_train, df_val = train_test_split(df_all, random_state=42, test_size=0.2)

    df_train.to_csv(path_to_save / "train.csv", index=False)
    df_val.to_csv(path_to_save / "val.csv", index=False)
    df_test.to_csv(path_to_save / "test.csv", index=False)
Dataset Statistics:
Training: ~53,000 examples
Validation: ~14,000 examples
Test: ~1,800 examples
Classes: Binary (positive/negative sentiment)
Step 2: Configuration
Define training configuration in conf/example.json:
{
"model_name_or_path" : "google/mobilebert-uncased" ,
"train_file" : "./data/train.csv" ,
"validation_file" : "./data/val.csv" ,
"output_dir" : "results" ,
"max_seq_length" : 128 ,
"per_device_train_batch_size" : 32 ,
"per_device_eval_batch_size" : 32 ,
"gradient_accumulation_steps" : 1 ,
"learning_rate" : 5e-05 ,
"num_train_epochs" : 5 ,
"warmup_ratio" : 0.1 ,
"weight_decay" : 0.01 ,
"eval_strategy" : "steps" ,
"eval_steps" : 250 ,
"logging_steps" : 250 ,
"save_steps" : 250 ,
"load_best_model_at_end" : true ,
"metric_for_best_model" : "eval_f1" ,
"greater_is_better" : true ,
"report_to" : [ "wandb" ],
"seed" : 42
}
Step 3: Model Loading
Load pre-trained BERT model and tokenizer:
from transformers import (
AutoConfig,
AutoModelForSequenceClassification,
AutoTokenizer,
)
def get_models(model_args, num_labels):
    """Load the pre-trained config, tokenizer, and classification model.

    Args:
        model_args: Arguments exposing ``model_name_or_path``, ``cache_dir``,
            and ``use_fast_tokenizer``.
        num_labels: Number of target classes for the classification head.

    Returns:
        Tuple of (config, tokenizer, model).
    """
    # Load the config first: it carries num_labels into the model head.
    config = AutoConfig.from_pretrained(
        model_args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )
    return config, tokenizer, model
Step 4: Dataset Processing
Tokenize and prepare datasets:
from functools import partial
def preprocess_function_examples(
    examples, tokenizer, padding, max_seq_length, label_to_id
):
    """Tokenize a batch of examples and remap labels to contiguous ids.

    Args:
        examples: Batch dict with a "sentence" key and, optionally, "label".
        tokenizer: HuggingFace tokenizer (any callable with the same API).
        padding: Padding strategy ("max_length" or False).
        max_seq_length: Truncation length passed to the tokenizer.
        label_to_id: Mapping from raw label values to model label ids, or
            None to leave labels untouched.

    Returns:
        The tokenizer output dict, with a remapped "label" list when labels
        are present.
    """
    result = tokenizer(
        examples["sentence"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )
    if label_to_id is not None and "label" in examples:
        # -1 is the "no label" sentinel (e.g. GLUE test split): pass it
        # through unchanged instead of looking it up.
        result["label"] = [
            label_to_id[label] if label != -1 else -1
            for label in examples["label"]
        ]
    return result
def process_dataset(
    data_args, label_list, model, config,
    tokenizer, training_args, raw_datasets
):
    """Tokenize the raw datasets and sync label maps onto the model config.

    Args:
        data_args: Arguments exposing ``pad_to_max_length``,
            ``max_seq_length``, and ``overwrite_cache``.
        label_list: Ordered list of raw label values.
        model: Model whose config receives ``label2id``/``id2label``.
        config: Unused here; kept for interface compatibility.
        tokenizer: Tokenizer applied to every example.
        training_args: Unused here; kept for interface compatibility.
        raw_datasets: DatasetDict with "train" and "validation" splits.

    Returns:
        Tuple of (train_dataset, validation_dataset), tokenized.
    """
    # Pad to a fixed length only when requested; otherwise leave padding
    # to be done dynamically per batch.
    padding = "max_length" if data_args.pad_to_max_length else False
    label_to_id = {v: i for i, v in enumerate(label_list)}

    # Store the label maps on the model config so they are serialized with
    # the checkpoint and available at inference time. (Renamed the loop
    # variable: the original shadowed the builtin `id`.)
    model.config.label2id = label_to_id
    model.config.id2label = {idx: label for label, idx in label_to_id.items()}

    # Never ask for more tokens than the tokenizer/model can encode.
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
    preprocess_function = partial(
        preprocess_function_examples,
        tokenizer=tokenizer,
        padding=padding,
        max_seq_length=max_seq_length,
        label_to_id=label_to_id,
    )
    raw_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Tokenizing dataset",
    )
    return raw_datasets["train"], raw_datasets["validation"]
Step 5: Metrics
Define evaluation metrics:
import numpy as np
from sklearn.metrics import f1_score, fbeta_score
from transformers import EvalPrediction
def compute_metrics(p: EvalPrediction) -> dict[str, float]:
    """Compute binary F1 and F0.5 from trainer predictions.

    Uses the builtin ``dict[str, float]`` annotation: the original snippet
    referenced ``Dict`` without importing it from ``typing``.

    Args:
        p: EvalPrediction carrying ``predictions`` (class logits) and
            ``label_ids`` (gold labels).

    Returns:
        Dict with "f1" and "f0.5"; F0.5 weights precision over recall.
    """
    # argmax over the class axis turns logits into hard predictions.
    preds = np.argmax(p.predictions, axis=1)
    return {
        "f1": f1_score(y_true=p.label_ids, y_pred=preds),
        "f0.5": fbeta_score(y_true=p.label_ids, y_pred=preds, beta=0.5),
    }
Step 6: Training
Create Trainer and run training:
from transformers import Trainer, TrainingArguments, set_seed
def train(config_path: Path):
    """Fine-tune a sequence-classification model from a JSON config file.

    Args:
        config_path: Path to a JSON file with model/data/training arguments.

    Side effects:
        Writes the trained model, metrics, trainer state, and a model card
        to ``training_args.output_dir``; reports to W&B when configured.
    """
    # Parse the three HuggingFace argument groups from the JSON config.
    model_args, data_args, training_args = get_config(config_path)

    # Seed python/numpy/torch for reproducible runs.
    set_seed(training_args.seed)

    raw_datasets, num_labels, label_list = read_dataset(
        data_args=data_args, cache_dir=model_args.cache_dir
    )
    config, tokenizer, model = get_models(
        model_args=model_args, num_labels=num_labels
    )
    train_dataset, eval_dataset = process_dataset(
        data_args=data_args,
        label_list=label_list,
        model=model,
        config=config,
        tokenizer=tokenizer,
        training_args=training_args,
        raw_datasets=raw_datasets,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    # Train, then persist everything needed to resume or serve the model.
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.save_model()
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Final evaluation (best checkpoint when load_best_model_at_end is set).
    metrics = trainer.evaluate(eval_dataset=eval_dataset)
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)

    # Metadata for the auto-generated model card (README.md).
    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "tasks": "text-classification",
        "language": "en",
        "dataset_tags": "sst2",
    }
    trainer.create_model_card(**kwargs)
    logger.info(f"Model card saved to {training_args.output_dir}/README.md")
Command-Line Interface
Use the CLI for all operations:
import typer

from classic_example.data import load_sst2_data
from classic_example.train import train
from classic_example.utils import load_from_registry, upload_to_registry

# One Typer app; each registered function becomes a kebab-case subcommand
# (e.g. load_sst2_data -> load-sst2-data).
app = typer.Typer()
app.command()(train)
app.command()(load_sst2_data)
app.command()(upload_to_registry)
app.command()(load_from_registry)

if __name__ == "__main__":
    app()
Usage:
# Load data
python classic_example/cli.py load-sst2-data ./data
# Train model
python classic_example/cli.py train ./conf/example.json
# Upload to registry
python classic_example/cli.py upload-to-registry \
my-model ./results
# Download from registry
python classic_example/cli.py load-from-registry \
my-model:latest ./downloaded
Testing
Run comprehensive tests:
# All tests
make test
# Specific test suites
make test_code # Unit tests
make test_data # Data validation
make test_model # Model tests
# With coverage
make test_all
Key test examples:
Code Tests
Data Tests
Model Tests
def test_compute_metrics(eval_pred: EvalPrediction):
    """Perfect predictions must yield perfect F1 and F0.5 scores."""
    metrics = compute_metrics(eval_pred)
    assert metrics == {"f1": 1.0, "f0.5": 1.0}
def test_data_shape(data):
    """Train+val must add up to the full SST-2 train split; test is fixed."""
    df_train, df_val, df_test = data
    # 67,349 = size of the official GLUE SST-2 training split.
    assert df_train.shape[0] + df_val.shape[0] == 67349
    assert df_test.shape == (1821, 3)
def test_data_content(data):
    """Sentences must be non-null and labels binary (Great Expectations API)."""
    df_train, df_val, df_test = data
    assert df_train.expect_column_values_to_not_be_null(
        column="sentence"
    )["success"]
    assert df_train.expect_column_values_to_be_in_set(
        "label", [0, 1]
    )["success"]
def test_overfit_batch(trainer_with_one_batch):
    """A healthy model/optimizer should drive a single batch's loss to ~0."""
    train_result = trainer_with_one_batch.train()
    metrics = train_result.metrics
    assert metrics["train_loss"] < 0.01
def test_train_to_completion(config_path):
    """End-to-end training must produce model weights and a model card."""
    train(config_path=config_path)
    result_path = Path("/tmp/results")
    assert result_path.exists()
    assert (result_path / "model.safetensors").exists()
    assert (result_path / "README.md").exists()
Model Registry
Upload and manage models with W&B:
import wandb
from pathlib import Path
def upload_to_registry(model_name: str, model_path: Path):
    """Upload a trained model directory to the W&B model registry.

    Args:
        model_name: Artifact name to register under.
        model_path: Directory holding the HuggingFace ``save_model`` output.
    """
    # Everything a HuggingFace checkpoint needs to be reloaded, plus the
    # generated model card.
    artifact_files = (
        "config.json",
        "model.safetensors",
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "README.md",
    )
    with wandb.init() as _:
        art = wandb.Artifact(model_name, type="model")
        for filename in artifact_files:
            art.add_file(model_path / filename)
        wandb.log_artifact(art)
def load_from_registry(model_name: str, model_path: Path):
    """Download a model artifact from the W&B registry.

    Args:
        model_name: Artifact reference, e.g. "my-model:latest".
        model_path: Local directory root for the download.

    Returns:
        The directory the artifact was downloaded to.
    """
    with wandb.init() as run:
        artifact = run.use_artifact(model_name, type="model")
        artifact_dir = artifact.download(root=model_path)
        return artifact_dir
Popular Base Models
Explore pre-trained models on HuggingFace:
BERT Base 110M parameters, uncased
MobileBERT 25M parameters, optimized for mobile
DistilBERT 66M parameters, 40% faster
RoBERTa 125M parameters, improved training
Change base model:
{
"model_name_or_path" : "distilbert-base-uncased" ,
// ... rest of config
}
Resources
HuggingFace Models Browse 100,000+ pre-trained models
Transformers Docs Complete HuggingFace documentation
Text Classification Example Official HuggingFace example
PyTorch Training Tips Make PyTorch models train faster
Next Steps
LLM Training Learn to fine-tune modern LLMs like Phi-3 with LoRA