Instruction datasets

The instruction_datasets module provides utilities for loading and formatting instruction-following datasets like Alpaca and OpenAssistant for supervised fine-tuning (SFT).

InstructionDataset

PyTorch Dataset class for instruction-tuning that tokenizes examples with response-only masking.

from modern_llm.data.instruction_datasets import (
    InstructionDataset,
    InstructionDatasetConfig,
)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=2048,
    split="train",
)

dataset = InstructionDataset(config, tokenizer)

config

InstructionDatasetConfig

required

Configuration object specifying dataset parameters

tokenizer

PreTrainedTokenizerBase

required

Tokenizer for processing instruction examples. If pad_token_id is None, eos_token is used as the padding token.

Methods

`__len__`

Returns the number of examples in the dataset.

`__getitem__`

Returns a single tokenized example as a dictionary with:

input_ids: Token IDs for the full instruction + response
attention_mask: Attention mask (1 for real tokens, 0 for padding)
labels: Target labels with prompt tokens masked as -100

Response-only masking

The dataset automatically masks prompt tokens in the labels so only the response portion contributes to the loss during training. This is achieved by:

Finding the ### Response: marker in the formatted text
Setting all label tokens before the response to -100
Setting all padding tokens to -100

Supported formats

The dataset automatically detects and handles multiple formats:

Alpaca format: instruction, input, output fields

{
  "instruction": "Summarize the following article",
  "input": "Long article text...",
  "output": "Article summary"
}

OpenAssistant format: text field with conversation

{
  "text": "Human: Hello\n\nAssistant: Hi there!"
}

HH-RLHF format: chosen field for SFT

{
  "chosen": "Human: Question\n\nAssistant: Answer"
}

load_instruction_dataset

Factory function for creating an InstructionDataset.

from modern_llm.data.instruction_datasets import (
    load_instruction_dataset,
    InstructionDatasetConfig,
)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=2048,
    num_examples=1000,  # Limit to 1000 examples for debugging
)

dataset = load_instruction_dataset(config, tokenizer)

config

InstructionDatasetConfig

required

Configuration object specifying dataset parameters

tokenizer

PreTrainedTokenizerBase

required

Tokenizer with pad_token defined; if it is not defined, eos_token is used for padding

dataset

InstructionDataset

InstructionDataset instance with tokenized examples ready for training

InstructionDatasetConfig

Configuration dataclass for instruction-tuning datasets.

from modern_llm.data.instruction_datasets import InstructionDatasetConfig

config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=2048,
    split="train",
    num_examples=None,      # Load all examples
    include_input=True,     # Include the input field
)

dataset_name

str

required

Hugging Face dataset name (e.g., "tatsu-lab/alpaca", "OpenAssistant/oasst1")

max_length

int

default:"1024"

Maximum sequence length for tokenization. Must be positive.

split

str

default:"train"

Dataset split to load ("train", "validation", or "test")

num_examples

Optional[int]

default:"None"

Limit number of examples for debugging. None loads all examples.

include_input

bool

default:"True"

Whether to include the "input" field in Alpaca-style datasets. Set to False to use only the instruction and output fields.

Validation

The config validates on initialization:

dataset_name must be non-empty
max_length must be positive

format_instruction

Format an instruction example using the standard template.

from modern_llm.data.instruction_datasets import format_instruction

# With input field
formatted = format_instruction(
    instruction="Translate this to French",
    input_text="Hello, how are you?",
    output="Bonjour, comment allez-vous?",
)

# Without input field (empty or whitespace-only)
formatted = format_instruction(
    instruction="Write a haiku about coding",
    input_text="",
    output="Code flows like water\nBugs emerge from the shadows\nDebug until dawn",
)

instruction

str

required

The instruction text (will be stripped)

input_text

str

required

The input context text. If empty or whitespace-only, the "### Input:" section is omitted from the formatted output.

output

str

required

The expected response (will be stripped)

formatted

str

Formatted instruction string with clear section delimiters

Template format

With input:

### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}

Without input:

### Instruction:
{instruction}

### Response:
{output}

create_instruction_dataloader

Create a DataLoader with proper collation for instruction datasets.

from modern_llm.data.instruction_datasets import (
    load_instruction_dataset,
    create_instruction_dataloader,
    InstructionDatasetConfig,
)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=2048,
)

dataset = load_instruction_dataset(config, tokenizer)

train_loader = create_instruction_dataloader(
    dataset=dataset,
    batch_size=4,
    shuffle=True,
    num_workers=4,
)

for batch in train_loader:
    # batch contains input_ids, attention_mask, labels
    print(batch["input_ids"].shape)  # [batch_size, max_length]
    break

dataset

InstructionDataset

required

The instruction dataset to load from

batch_size

int

required

Number of examples per batch

shuffle

bool

default:"True"

Whether to shuffle the dataset

num_workers

int

default:"0"

Number of worker processes for data loading

dataloader

DataLoader

PyTorch DataLoader with proper collation and pin_memory enabled

Examples

Load Alpaca dataset for SFT

from modern_llm.data.instruction_datasets import (
    load_instruction_dataset,
    InstructionDatasetConfig,
)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer.pad_token = tokenizer.eos_token

config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=2048,
    split="train",
)

train_dataset = load_instruction_dataset(config, tokenizer)
print(f"Loaded {len(train_dataset)} instruction examples")

# Inspect an example
example = train_dataset[0]
print(f"Input shape: {example['input_ids'].shape}")
print(f"Labels shape: {example['labels'].shape}")

Debug with limited examples

# Load only 100 examples for quick iteration
config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=1024,
    num_examples=100,
)

dataset = load_instruction_dataset(config, tokenizer)

Instruction-only format (no input field)

# Exclude the input field from Alpaca examples
config = InstructionDatasetConfig(
    dataset_name="tatsu-lab/alpaca",
    max_length=2048,
    include_input=False,  # Only use instruction and output
)

dataset = load_instruction_dataset(config, tokenizer)

Load OpenAssistant conversations

config = InstructionDatasetConfig(
    dataset_name="OpenAssistant/oasst1",
    max_length=2048,
    split="train",
)

dataset = load_instruction_dataset(config, tokenizer)

Create DataLoader for training

from modern_llm.data.instruction_datasets import create_instruction_dataloader

train_loader = create_instruction_dataloader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=4,
)

for batch in train_loader:
    input_ids = batch["input_ids"]        # [8, 2048]
    attention_mask = batch["attention_mask"]  # [8, 2048]
    labels = batch["labels"]              # [8, 2048] with prompt masked
    
    # Train your model
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
    )
    loss = outputs.loss
    break

Custom instruction formatting

from modern_llm.data.instruction_datasets import format_instruction

# Format a custom example
formatted = format_instruction(
    instruction="Explain quantum computing",
    input_text="",  # No additional context
    output="Quantum computing uses quantum mechanics...",
)

print(formatted)
# ### Instruction:
# Explain quantum computing
#
# ### Response:
# Quantum computing uses quantum mechanics...

Models

Configuration

Training

Data

Evaluation

Alignment

Instruction datasets