Reward Functions
Reward functions score model outputs and are the core evaluation mechanism in Verifiers.
Overview
Reward functions come in two flavors:
- Individual: Score single rollouts (most common)
- Group: Score multiple rollouts together (for comparative evaluation)
Both types use flexible signatures that automatically receive relevant data from the state.
Type Definitions
IndividualRewardFunc = Callable[..., float | Awaitable[float]]
GroupRewardFunc = Callable[..., list[float] | Awaitable[list[float]]]
RewardFunc = IndividualRewardFunc | GroupRewardFunc
Individual Reward Functions
Functions that score a single rollout at a time.
Signature
# Template signature — declare only the parameters you need; unneeded data
# is absorbed by **kwargs. (Illustrative: `...` defaults mark optional slots.)
def reward_fn(
    # Any combination of these parameters:
    prompt: Messages | str = ...,
    completion: Messages | str = ...,
    answer: Any = ...,
    task: str = ...,
    state: State = ...,
    info: dict = ...,
    # Plus any class objects registered via rubric.add_class_object()
    parser: vf.Parser = ...,
    custom_obj: Any = ...,
    # Catch-all
    **kwargs
) -> float:
    """Return score between 0.0 and 1.0 (or any float)."""
    ...
Available Parameters
- `prompt`: The input prompt (from state["prompt"]).
- `completion`: The model's final completion (from state["completion"]).
- `answer`: Ground truth answer from dataset (from state["answer"]).
- `task`: Task identifier (from state["task"]).
- `state`: Full state dictionary with trajectory, timing, etc.
- `info`: Additional metadata from dataset (from state["info"]).
- `**kwargs`: Catches class objects and extra fields. Always include for forward compatibility.
Examples
Simple Exact Match
def exact_match(answer: str, completion: str, **kwargs) -> float:
    """Score 1.0 when the answer appears (case-insensitively) in the completion."""
    found = answer.lower() in completion.lower()
    return float(found)
Using Parser
def parsed_match(answer: str, completion: str, parser: vf.Parser, **kwargs) -> float:
    """Parse an answer out of the completion and score 1.0 on an exact match."""
    candidate = parser.parse_answer(completion)
    if candidate == answer:
        return 1.0
    return 0.0
State-based
def efficiency_reward(state: vf.State, **kwargs) -> float:
    """Reward shorter trajectories: 1.0 minus 0.1 per turn, floored at 0.0."""
    turn_penalty = len(state["trajectory"]) / 10
    return max(0.0, 1.0 - turn_penalty)
Async Reward
async def llm_judge(completion: str, answer: str, **kwargs) -> float:
    """Ask an LLM judge to rate the completion; parse its reply as a float.

    NOTE(review): assumes the judge replies with a bare number — TODO confirm.
    """
    judge_prompt = f"Rate this answer (0-1): {completion}\nGround truth: {answer}"
    client = vf.OpenAIClient()
    response = await client.generate(
        model="gpt-4",
        prompt=judge_prompt,
        sampling_args={"temperature": 0},
    )
    return float(response.message.content.strip())
Group Reward Functions
Functions that score multiple rollouts together, enabling comparative evaluation.
Signature
# Template signature — group functions receive the plural (list) form of each
# field, one entry per rollout, and return one score per rollout.
def group_reward_fn(
    # Plural versions of individual parameters:
    prompts: list[Messages | str] = ...,
    completions: list[Messages | str] = ...,
    answers: list[Any] = ...,
    tasks: list[str] = ...,
    states: list[State] = ...,
    infos: list[dict] = ...,
    # Class objects (singular)
    parser: vf.Parser = ...,
    **kwargs
) -> list[float]:
    """Return list of scores, one per rollout."""
    ...
Examples
Relative Ranking
def rank_reward(completions: list[str], **kwargs) -> list[float]:
    """Reward completions whose length is at or above the group median.

    Uses the upper median for even-sized groups (index len // 2 of the
    sorted lengths). Fixes: an empty group now returns [] instead of
    raising IndexError, and the ambiguous local name `l` (PEP 8 / E741)
    is renamed.
    """
    if not completions:
        return []
    lengths = [len(c) for c in completions]
    median = sorted(lengths)[len(lengths) // 2]
    return [1.0 if length >= median else 0.0 for length in lengths]
Best-of-N
def best_of_n(
    completions: list[str],
    answers: list[str],
    **kwargs
) -> list[float]:
    """Give reward only to the rollout(s) tied for the best score.

    Each rollout scores 1.0 if its ground-truth answer is a substring of
    its completion, else 0.0; rollouts matching the group maximum receive
    the reward. Note (preserved original behavior): if no rollout is
    correct, everyone ties at 0.0 and all receive 1.0.
    Fix: an empty group now returns [] instead of raising ValueError
    from max() on an empty sequence.
    """
    if not completions:
        return []
    scores = [
        1.0 if ans in comp else 0.0
        for ans, comp in zip(answers, completions)
    ]
    best = max(scores)
    return [1.0 if s == best else 0.0 for s in scores]
Majority Voting
def majority_vote(
    completions: list[str],
    answer: str,  # Singular - same for all
    parser: vf.Parser,
    **kwargs
) -> list[float]:
    """Reward completions whose parsed answer matches the majority or the truth.

    Each completion scores 1.0 if its parsed answer equals the most common
    parsed answer in the group, or equals the ground-truth answer; else 0.0.
    Fix: an empty group now returns [] instead of raising IndexError from
    Counter.most_common(1)[0] on an empty counter.
    """
    from collections import Counter

    if not completions:
        return []
    # Parse all answers
    parsed = [parser.parse_answer(c) for c in completions]
    # Find majority (ties broken by first-counted element)
    majority_answer = Counter(parsed).most_common(1)[0][0]
    # Reward majority + correct answers
    return [
        1.0 if p == majority_answer or p == answer else 0.0
        for p in parsed
    ]
Metrics vs Rewards
Reward functions can be used as metrics (tracked but not contributing to reward) by setting weight=0:
def response_length(completion: str, **kwargs) -> float:
    """Metric (weight 0): the completion's character count as a float."""
    char_count = len(completion)
    return float(char_count)
# Zero-weight functions are still computed and tracked, but contribute
# nothing to the combined reward (see "Metrics vs Rewards" above... they
# act as metrics only).
rubric = vf.Rubric(
    funcs=[exact_match, response_length],
    weights=[1.0, 0.0],  # length is a metric only
)
# Or use add_metric:
rubric.add_metric(response_length)
Async Support
Both individual and group functions can be async:
async def async_individual(completion: str, **kwargs) -> float:
    """Score one completion via an awaited async operation."""
    # Await the async scorer, then coerce its result to float.
    return float(await some_async_operation(completion))
async def async_group(completions: list[str], **kwargs) -> list[float]:
    """Score all completions concurrently and return one float per rollout."""
    # One scoring coroutine per completion, awaited together.
    pending = [some_async_operation(c) for c in completions]
    raw_scores = await asyncio.gather(*pending)
    return list(map(float, raw_scores))
Class Objects
Register objects that reward functions can access:
class CustomScorer:
    """Toy scorer: one point per 100 characters of input text."""

    def score(self, text: str) -> float:
        """Return len(text) / 100."""
        return len(text) / 100


scorer = CustomScorer()
def use_scorer(completion: str, scorer: CustomScorer, **kwargs) -> float:
    """Delegate scoring to the registered CustomScorer instance."""
    result = scorer.score(completion)
    return result
rubric = vf.Rubric(funcs=[use_scorer])
# Register under "scorer" — presumably matched to use_scorer's `scorer`
# parameter by name; verify against the Rubric docs.
rubric.add_class_object("scorer", scorer)
Debugging Rewards
Print intermediate values:
def debug_reward(answer: str, completion: str, state: vf.State, **kwargs) -> float:
    """Exact-substring reward that prints its inputs and score for debugging."""
    score = 1.0 if answer in completion else 0.0
    print(f"Answer: {answer}")
    print(f"Completion: {completion}")
    print(f"Trajectory length: {len(state['trajectory'])}")
    print(f"Score: {score}")
    return score
Common Patterns
Multi-criteria Scoring
def multi_criteria(
    answer: str,
    completion: str,
    state: vf.State,
    **kwargs
) -> float:
    """Blend correctness (70%), turn efficiency (20%), and length (10%)."""
    # Correctness: exact substring match.
    correctness = float(answer in completion)
    # Efficiency: lose 0.1 per trajectory turn, floored at 0.
    efficiency = max(0.0, 1.0 - len(state["trajectory"]) / 10)
    # Length: saturates at 1000 characters.
    length_score = min(len(completion) / 1000, 1.0)
    return 0.7 * correctness + 0.2 * efficiency + 0.1 * length_score
Partial Credit
def partial_credit(answer: str, completion: str, **kwargs) -> float:
    """Fraction of the answer's (lowercased) words found in the completion."""
    answer_words = set(answer.lower().split())
    if not answer_words:
        # No reference words to match against.
        return 0.0
    completion_words = set(completion.lower().split())
    shared = answer_words & completion_words
    return len(shared) / len(answer_words)
Error Handling
def safe_reward(answer: str, completion: str, **kwargs) -> float:
    """Wrap fragile scoring logic; any failure is logged and scored 0.0."""
    try:
        # Complex scoring logic that might fail
        return float(complex_calculation(answer, completion))
    except Exception as e:
        # Deliberately broad: a reward function should never crash the run.
        print(f"Reward computation failed: {e}")
        return 0.0
Best Practices
- Always include **kwargs for forward compatibility
- Return float (not int, bool, etc.) for rewards
- Handle None values gracefully
- Keep deterministic when possible (for reproducibility)
- Document score range in docstring
- Use async only when necessary (adds overhead)
- Validate inputs at function start
Type Checking
from verifiers.types import RewardFunc, State, Messages
# Example of a fully annotated reward function using the library's types.
def my_reward(
    answer: str,
    completion: Messages,
    state: State,
    **kwargs
) -> float:
    """Type-checked reward function."""
    return 1.0
# Verify it's a valid RewardFunc
func: RewardFunc = my_reward
See Also
- Rubric - Combining multiple reward functions
- State - Full state dictionary
- Parser - Extracting answers from completions