Skip to main content
The instruction_datasets module provides utilities for loading and formatting instruction-following datasets like Alpaca and OpenAssistant for supervised fine-tuning (SFT).

InstructionDataset

PyTorch Dataset class for instruction-tuning that tokenizes examples with response-only masking.
from modern_llm.data.instruction_datasets import (
    InstructionDataset,
    InstructionDatasetConfig,
)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=2048,
    split="train",
)

dataset = InstructionDataset(config, tokenizer)
config
InstructionDatasetConfig
required
Configuration object specifying dataset parameters
tokenizer
PreTrainedTokenizerBase
required
Tokenizer for processing instruction examples. If the tokenizer's pad_token_id is None, its eos_token is used as the padding token.

Methods

__len__

Returns the number of examples in the dataset.

__getitem__

Returns a single tokenized example as a dictionary with:
  • input_ids: Token IDs for the full instruction + response
  • attention_mask: Attention mask (1 for real tokens, 0 for padding)
  • labels: Target labels with prompt tokens masked as -100

Response-only masking

The dataset automatically masks prompt tokens in the labels so only the response portion contributes to the loss during training. This is achieved by:
  1. Finding the ### Response: marker in the formatted text
  2. Setting all label tokens before the response to -100
  3. Setting all padding tokens to -100

Supported formats

The dataset automatically detects and handles multiple formats:
Alpaca format: instruction, input, output fields
{
  "instruction": "Summarize the following article",
  "input": "Long article text...",
  "output": "Article summary"
}
OpenAssistant format: text field with conversation
{
  "text": "Human: Hello\n\nAssistant: Hi there!"
}
HH-RLHF format: chosen field for SFT
{
  "chosen": "Human: Question\n\nAssistant: Answer"
}

load_instruction_dataset

Factory function for creating an InstructionDataset.
from modern_llm.data.instruction_datasets import (
    load_instruction_dataset,
    InstructionDatasetConfig,
)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=2048,
    num_examples=1000,  # Limit to 1000 examples for debugging
)

dataset = load_instruction_dataset(config, tokenizer)
config
InstructionDatasetConfig
required
Configuration object specifying dataset parameters
tokenizer
PreTrainedTokenizerBase
required
Tokenizer with pad_token defined. If pad_token is not defined, eos_token is used for padding instead.
dataset
InstructionDataset
InstructionDataset instance with tokenized examples ready for training

InstructionDatasetConfig

Configuration dataclass for instruction-tuning datasets.
from modern_llm.data.instruction_datasets import InstructionDatasetConfig

config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=2048,
    split="train",
    num_examples=None,      # Load all examples
    include_input=True,     # Include the input field
)
dataset_name
str
required
Hugging Face dataset name (e.g., “tatsu-lab/alpaca”, “OpenAssistant/oasst1”)
max_length
int
default:"1024"
Maximum sequence length for tokenization. Must be positive.
split
str
default:"train"
Dataset split to load (“train”, “validation”, or “test”)
num_examples
Optional[int]
default:"None"
Limits the number of examples loaded, which is useful for debugging. None loads all examples.
include_input
bool
default:"True"
Whether to include the “input” field in Alpaca-style datasets. Set to False to use only instruction and output.

Validation

The config validates on initialization:
  • dataset_name must be non-empty
  • max_length must be positive

format_instruction

Format an instruction example using the standard template.
from modern_llm.data.instruction_datasets import format_instruction

# With input field
formatted = format_instruction(
    instruction="Translate this to French",
    input_text="Hello, how are you?",
    output="Bonjour, comment allez-vous?",
)

# Without input field (empty or whitespace-only)
formatted = format_instruction(
    instruction="Write a haiku about coding",
    input_text="",
    output="Code flows like water\nBugs emerge from the shadows\nDebug until dawn",
)
instruction
str
required
The instruction text (will be stripped)
input_text
str
required
The input context text. If empty or whitespace-only, the “### Input:” section is omitted from the formatted output.
output
str
required
The expected response (will be stripped)
formatted
str
Formatted instruction string with clear section delimiters

Template format

With input:
### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}
Without input:
### Instruction:
{instruction}

### Response:
{output}

create_instruction_dataloader

Create a DataLoader with proper collation for instruction datasets.
from modern_llm.data.instruction_datasets import (
    load_instruction_dataset,
    create_instruction_dataloader,
    InstructionDatasetConfig,
)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=2048,
)

dataset = load_instruction_dataset(config, tokenizer)

train_loader = create_instruction_dataloader(
    dataset=dataset,
    batch_size=4,
    shuffle=True,
    num_workers=4,
)

for batch in train_loader:
    # batch contains input_ids, attention_mask, labels
    print(batch["input_ids"].shape)  # [batch_size, max_length]
    break
dataset
InstructionDataset
required
The instruction dataset to load from
batch_size
int
required
Number of examples per batch
shuffle
bool
default:"True"
Whether to shuffle the dataset
num_workers
int
default:"0"
Number of worker processes for data loading
dataloader
DataLoader
PyTorch DataLoader with proper collation and pin_memory enabled

Examples

Load Alpaca dataset for SFT

from modern_llm.data.instruction_datasets import (
    load_instruction_dataset,
    InstructionDatasetConfig,
)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer.pad_token = tokenizer.eos_token

config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=2048,
    split="train",
)

train_dataset = load_instruction_dataset(config, tokenizer)
print(f"Loaded {len(train_dataset)} instruction examples")

# Inspect an example
example = train_dataset[0]
print(f"Input shape: {example['input_ids'].shape}")
print(f"Labels shape: {example['labels'].shape}")

Debug with limited examples

# Load only 100 examples for quick iteration
config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=1024,
    num_examples=100,
)

dataset = load_instruction_dataset(config, tokenizer)

Instruction-only format (no input field)

# Exclude the input field from Alpaca examples
config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=2048,
    include_input=False,  # Only use instruction and output
)

dataset = load_instruction_dataset(config, tokenizer)

Load OpenAssistant conversations

config = InstructionDatasetConfig(
    dataset_name="OpenAssistant/oasst1",
    max_length=2048,
    split="train",
)

dataset = load_instruction_dataset(config, tokenizer)

Create DataLoader for training

from modern_llm.data.instruction_datasets import create_instruction_dataloader

train_loader = create_instruction_dataloader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=4,
)

for batch in train_loader:
    input_ids = batch["input_ids"]        # [8, 2048]
    attention_mask = batch["attention_mask"]  # [8, 2048]
    labels = batch["labels"]              # [8, 2048] with prompt masked
    
    # Train your model
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
    )
    loss = outputs.loss
    break

Custom instruction formatting

from modern_llm.data.instruction_datasets import format_instruction

# Format a custom example
formatted = format_instruction(
    instruction="Explain quantum computing",
    input_text="",  # No additional context
    output="Quantum computing uses quantum mechanics...",
)

print(formatted)
# ### Instruction:
# Explain quantum computing
#
# ### Response:
# Quantum computing uses quantum mechanics...

Build docs developers (and LLMs) love