Custom Scorers
Basic Custom Scorer
Create a scorer using the `createScorer` API:
import { createScorer } from '@mastra/core/evals';

// Code-based scorer: deterministic scoring logic, no LLM judge involved.
const customScorer = createScorer({
  id: 'custom-scorer',
  name: 'Custom Scorer',
  description: 'Evaluates response quality',
  type: 'code', // 'code' or 'agent' (LLM-based)
})
  .preprocess(({ run }) => ({
    // Pull the raw input/output off the run so later steps can use them.
    processedInput: run.input,
    processedOutput: run.output,
  }))
  .generateScore(({ results }) => {
    const { processedInput, processedOutput } = results.preprocessStepResult;
    // Apply your own scoring logic; return a value between 0 and 1.
    return calculateScore(processedInput, processedOutput);
  });

// Run the scorer against a recorded interaction.
const result = await customScorer.run({
  input: messages,
  output: response,
});
console.log(result.value); // 0.85
LLM-Based Scorer
Create a scorer that uses a judge model:
import { createScorer } from '@mastra/core/evals';
import { getUserMessageFromRunInput, getAssistantMessageFromRunOutput } from '@mastra/evals/scorers/utils';
const llmScorer = createScorer({
id: 'custom-llm-scorer',
name: 'Custom LLM Scorer',
description: 'Uses a judge model to evaluate responses',
type: 'agent',
judge: {
model: 'gpt-4o-mini',
instructions: 'You are an expert evaluator of AI responses.',
},
})
.preprocess({
description: 'Extract question and answer',
outputSchema: z.object({
question: z.string(),
answer: z.string(),
}),
createPrompt: ({ run }) => {
const question = getUserMessageFromRunInput(run.input);
const answer = getAssistantMessageFromRunOutput(run.output);
return `Extract the question and answer:
Question: ${question}
Answer: ${answer}`;
},
})
.analyze({
description: 'Evaluate response quality',
outputSchema: z.object({
verdict: z.enum(['excellent', 'good', 'poor']),
reasoning: z.string(),
}),
createPrompt: ({ results }) => {
const { question, answer } = results.preprocessStepResult;
return `Evaluate this Q&A:
Question: ${question}
Answer: ${answer}
Provide a verdict (excellent/good/poor) and reasoning.`;
},
})
.generateScore(({ results }) => {
const { verdict } = results.analyzeStepResult;
// Map verdict to score
const scoreMap = {
excellent: 1.0,
good: 0.7,
poor: 0.3,
};
return scoreMap[verdict];
})
.generateReason({
description: 'Provide reasoning for the score',
createPrompt: ({ results, score }) => {
const { reasoning } = results.analyzeStepResult;
return `Explain why the score is ${score}:
${reasoning}`;
},
});
Scorer Methods
preprocess()
Extract and transform data from the run:
const scorer = createScorer({ /* config */ })
.preprocess(({ run }) => {
// Extract user message
const userMessage = run.input?.inputMessages?.find(
m => m.role === 'user'
);
// Extract assistant message
const assistantMessage = run.output?.find(
m => m.role === 'assistant'
);
return {
question: userMessage?.content?.content || '',
answer: assistantMessage?.content?.content || '',
};
});
analyze()
Perform analysis using a judge model:
const scorer = createScorer({ type: 'agent', judge: { model: 'gpt-4o-mini' } })
.analyze({
description: 'Analyze response quality',
outputSchema: z.object({
isAccurate: z.boolean(),
isRelevant: z.boolean(),
issues: z.array(z.string()),
}),
createPrompt: ({ results }) => {
const { question, answer } = results.preprocessStepResult;
return `Analyze this response:
Question: ${question}
Answer: ${answer}
Is it accurate? Is it relevant? List any issues.`;
},
});
generateScore()
Calculate the final score:
const scorer = createScorer({ /* config */ })
.generateScore(({ results }) => {
const { isAccurate, isRelevant } = results.analyzeStepResult;
// Both must be true for full score
if (isAccurate && isRelevant) return 1.0;
if (isAccurate || isRelevant) return 0.5;
return 0.0;
});
generateReason()
Provide a reason for the score:
const scorer = createScorer({ type: 'agent', judge: { model: 'gpt-4o-mini' } })
.generateReason({
description: 'Explain the score',
createPrompt: ({ results, score }) => {
const { issues } = results.analyzeStepResult;
return `Explain why the score is ${score}.
Issues found: ${issues.join(', ')}`;
},
});
Helper Utilities
Use helper functions from `@mastra/evals/scorers/utils`:
Extract Messages
import {
  getUserMessageFromRunInput,
  getAssistantMessageFromRunOutput,
  getSystemMessagesFromRunInput,
} from '@mastra/evals/scorers/utils';

const scorer = createScorer({ /* config */ })
  // The helpers take care of message-format details for you.
  .preprocess(({ run }) => ({
    question: getUserMessageFromRunInput(run.input),
    answer: getAssistantMessageFromRunOutput(run.output),
    systemPrompts: getSystemMessagesFromRunInput(run.input),
  }));
Extract Tool Calls
import { extractToolCalls, extractToolResults } from '@mastra/evals/scorers/utils';

const scorer = createScorer({ /* config */ })
  .preprocess(({ run }) => {
    // extractToolCalls yields both the tool names and detailed call info.
    const { tools, toolCallInfos } = extractToolCalls(run.output);
    return { tools, toolCallInfos, toolResults: extractToolResults(run.output) };
  });
Create Test Data
import {
  createTestMessage,
  createAgentTestRun,
  createToolInvocation,
} from '@mastra/evals/scorers/utils';

// Assemble a synthetic run: one user message in, one assistant message
// out that carries a single tool invocation.
const testRun = createAgentTestRun({
  inputMessages: [createTestMessage({ content: 'Hello', role: 'user' })],
  output: [
    createTestMessage({
      content: 'Hi there!',
      role: 'assistant',
      toolInvocations: [
        createToolInvocation({
          toolCallId: 'call-1',
          toolName: 'greet',
          args: {},
          result: { greeting: 'Hi there!' },
        }),
      ],
    }),
  ],
});
Example: Completeness Scorer
Score based on required elements:
import { createScorer } from '@mastra/core/evals';
import { getAssistantMessageFromRunOutput } from '@mastra/evals/scorers/utils';

/**
 * Build a code-based scorer that checks whether the assistant's answer
 * mentions every required element (case-insensitive substring match).
 *
 * @param requiredElements - strings that must appear in the answer
 * @returns a scorer whose score is presentCount / requiredCount
 */
function createCustomCompletenessScorer({
  requiredElements,
}: {
  requiredElements: string[];
}) {
  // Split requiredElements into those present in / missing from the answer.
  // Shared by generateScore and generateReason so the matching logic lives
  // in exactly one place.
  const partition = (answer: string) => {
    const haystack = answer.toLowerCase();
    const present: string[] = [];
    const missing: string[] = [];
    for (const element of requiredElements) {
      (haystack.includes(element.toLowerCase()) ? present : missing).push(element);
    }
    return { present, missing };
  };

  return createScorer({
    id: 'completeness-scorer',
    name: 'Completeness Scorer',
    description: 'Checks if response includes required elements',
    type: 'code',
  })
    .preprocess(({ run }) => {
      // Fall back to '' so the string operations below never see undefined.
      const answer = getAssistantMessageFromRunOutput(run.output) || '';
      return { answer };
    })
    .generateScore(({ results }) => {
      // With no requirements there is nothing to miss: treat the response
      // as complete (this also avoids a 0/0 = NaN score).
      if (requiredElements.length === 0) return 1;
      const { present } = partition(results.preprocessStepResult.answer);
      return present.length / requiredElements.length;
    })
    .generateReason(({ results, score }) => {
      const { present, missing } = partition(results.preprocessStepResult.answer);
      return `Score: ${score}\n` +
        `Present: ${present.join(', ')}\n` +
        `Missing: ${missing.join(', ')}`;
    });
}

// Use the scorer
const scorer = createCustomCompletenessScorer({
  requiredElements: ['greeting', 'name', 'farewell'],
});
const result = await scorer.run({
  input: messages,
  output: response,
});
Example: Tool Usage Scorer
Evaluate tool usage accuracy:
import { createScorer } from '@mastra/core/evals';
import { extractToolCalls } from '@mastra/evals/scorers/utils';

/**
 * Build a code-based scorer that compares the tools the agent actually
 * called against an expected set.
 *
 * Scoring: 1.0 when the sets match, 0.5 when only one direction holds
 * (all expected tools used, or no unexpected tools), 0.0 otherwise.
 *
 * @param expectedTools - tool names the agent is expected to call
 */
function createToolUsageScorer({
  expectedTools,
}: {
  expectedTools: string[];
}) {
  return createScorer({
    id: 'tool-usage-scorer',
    name: 'Tool Usage Scorer',
    description: 'Evaluates if agent used correct tools',
    type: 'code',
  })
    .preprocess(({ run }) => {
      const { tools } = extractToolCalls(run.output);
      return { actualTools: tools };
    })
    .generateScore(({ results }) => {
      const { actualTools } = results.preprocessStepResult;
      // No tools called at all when some were expected: score 0 instead of
      // letting the vacuously-true "no unexpected tools" check below award
      // half credit.
      if (expectedTools.length > 0 && actualTools.length === 0) return 0.0;
      // Were all expected tools used?
      const usedCorrectTools = expectedTools.every(tool =>
        actualTools.includes(tool)
      );
      // Were any tools used that we did not expect?
      const noUnexpectedTools = actualTools.every(tool =>
        expectedTools.includes(tool)
      );
      if (usedCorrectTools && noUnexpectedTools) return 1.0;
      if (usedCorrectTools || noUnexpectedTools) return 0.5;
      return 0.0;
    })
    .generateReason(({ results, score }) => {
      const { actualTools } = results.preprocessStepResult;
      return `Expected: ${expectedTools.join(', ')}\n` +
        `Actual: ${actualTools.join(', ')}\n` +
        `Score: ${score}`;
    });
}
Testing Scorers
Test your custom scorers:
import { describe, it, expect } from 'vitest';
import { createTestMessage, createAgentTestRun } from '@mastra/evals/scorers/utils';

describe('Custom Scorer', () => {
  it('should score complete responses highly', async () => {
    const scorer = createCustomCompletenessScorer({
      requiredElements: ['hello', 'world'],
    });
    // Both required elements appear in the reply, so the score is perfect.
    const run = createAgentTestRun({
      inputMessages: [],
      output: [createTestMessage({ content: 'Hello world!', role: 'assistant' })],
    });
    const result = await scorer.run(run);
    expect(result.value).toBe(1.0);
  });

  it('should score incomplete responses lower', async () => {
    const scorer = createCustomCompletenessScorer({
      requiredElements: ['hello', 'world', 'goodbye'],
    });
    // Only 2 of the 3 required elements are present: 2/3 ≈ 0.67.
    const run = createAgentTestRun({
      inputMessages: [],
      output: [createTestMessage({ content: 'Hello world!', role: 'assistant' })],
    });
    const result = await scorer.run(run);
    expect(result.value).toBeCloseTo(0.67, 2);
  });
});
Next Steps
Using Scorers
Explore prebuilt scorers
Observability
Integrate with observability