Evaluators assess the quality and performance of AI models and flows.
defineEvaluator()
Defines and registers an evaluator.
import { defineEvaluator } from '@genkit-ai/ai';
import { z } from 'zod';
// Example: faithfulness evaluator with a configurable pass threshold.
// Scores how well the output is grounded in the provided context.
const faithfulnessEvaluator = defineEvaluator(
  registry,
  {
    name: 'faithfulness',
    displayName: 'Faithfulness',
    definition: 'Evaluates if the output is faithful to the context',
    dataPointType: BaseEvalDataPointSchema,
    configSchema: z.object({
      threshold: z.number().default(0.7),
    }),
  },
  async (datapoint, options) => {
    const score = await computeFaithfulness(
      datapoint.output,
      datapoint.context
    );
    // `options` is optional in the runner signature, so fall back to the
    // schema default instead of dereferencing a possibly-undefined object.
    const threshold = options?.threshold ?? 0.7;
    return {
      testCaseId: datapoint.testCaseId,
      evaluation: {
        score,
        status: score >= threshold ? 'PASS' : 'FAIL',
        details: {
          reasoning: `Score: ${score}`,
        },
      },
    };
  }
);
Parameters
The Genkit registry instance
options
EvaluatorOptions<DataPoint, EvaluatorOpts>
required
Unique name for the evaluator
Human-readable display name
Description of what the evaluator measures
Zod schema for the expected data point structure
Zod schema for evaluator configuration
Whether this evaluator incurs costs (default: true)
runner
EvaluatorFn<EvalDataPoint, EvaluatorOpts>
required
Implementation function. Function signature: (datapoint: z.infer<EvalDataPoint>, options?: z.infer<EvaluatorOpts>)
=> Promise<EvalResponse>
Returns
An evaluator action that can be used with evaluate()
evaluate()
Evaluates a dataset using an evaluator.
import { evaluate } from '@genkit-ai/ai';
// Evaluate a one-item dataset with the registered 'faithfulness' evaluator.
const dataset = [
  {
    testCaseId: 'test-1',
    input: 'What is AI?',
    output: 'AI is artificial intelligence.',
    context: ['AI stands for artificial intelligence'],
  },
];
const results = await evaluate(registry, {
  evaluator: 'faithfulness',
  dataset,
  options: { threshold: 0.8 },
});
console.log(results); // EvalResponses
Parameters
The Genkit registry instance
params
EvaluatorParams<DataPoint, CustomOptions>
required
evaluator
EvaluatorArgument<DataPoint, CustomOptions>
required
Evaluator to use (string name, EvaluatorAction, or EvaluatorReference)
dataset
Dataset<DataPoint>
required
Array of data points to evaluate
Unique identifier for this evaluation run (auto-generated if not provided)
Evaluator-specific configuration
Returns
Array of evaluation results. Type: type EvalResponses = EvalResponse[];
evaluatorRef()
Creates a reference to an evaluator.
import { evaluatorRef } from '@genkit-ai/ai';
// Create a typed reference to the 'faithfulness' evaluator.
const faithfulnessInfo = {
  label: 'Faithfulness Evaluator',
  metrics: ['faithfulness_score'],
};
const faithfulnessRef = evaluatorRef({
  name: 'faithfulness',
  info: faithfulnessInfo,
});
Parameters
options
EvaluatorReference<CustomOptionsSchema>
required
Evaluator metadata (see EvaluatorInfo properties below)
List of metric names this evaluator produces
Returns
reference
EvaluatorReference<CustomOptionsSchema>
An evaluator reference
Types
BaseDataPoint
// Loosest data-point shape accepted by evaluators: only `input` is required.
const BaseDataPointSchema = z.object({
input: z.unknown(),
// Model/flow output being evaluated.
output: z.unknown().optional(),
// Grounding passages for context-aware metrics (e.g. faithfulness).
context: z.array(z.unknown()).optional(),
// Ground-truth answer, when available.
reference: z.unknown().optional(),
testCaseId: z.string().optional(),
// Trace IDs linking this datapoint to recorded executions.
traceIds: z.array(z.string()).optional(),
});
type BaseDataPoint = z.infer<typeof BaseDataPointSchema>;
BaseEvalDataPoint
// Same as BaseDataPoint, but `testCaseId` is mandatory at evaluation time.
const BaseEvalDataPointSchema = BaseDataPointSchema.extend({
testCaseId: z.string(),
});
type BaseEvalDataPoint = z.infer<typeof BaseEvalDataPointSchema>;
Score
// A single metric result; score may be numeric, string, or boolean.
const ScoreSchema = z.object({
id: z.string().optional(),
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
status: z.enum(['UNKNOWN', 'PASS', 'FAIL']).optional(),
// Populated when the evaluator itself failed on this datapoint.
error: z.string().optional(),
// `.passthrough()` lets evaluators attach extra detail fields.
details: z.object({
reasoning: z.string().optional(),
}).passthrough().optional(),
});
type Score = z.infer<typeof ScoreSchema>;
EvalResponse
// Result for one test case; `evaluation` may carry one score or several.
const EvalResponseSchema = z.object({
sampleIndex: z.number().optional(),
testCaseId: z.string(),
traceId: z.string().optional(),
spanId: z.string().optional(),
evaluation: z.union([ScoreSchema, z.array(ScoreSchema)]),
});
type EvalResponse = z.infer<typeof EvalResponseSchema>;
EvaluatorAction
// Registered evaluator action; the `__`-prefixed phantom fields carry the
// datapoint/config schemas for type inference only (not set at runtime).
type EvaluatorAction<
DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,
CustomOptions extends z.ZodTypeAny = z.ZodTypeAny
> = Action<typeof EvalRequestSchema, typeof EvalResponsesSchema> & {
__dataPointType?: DataPoint;
__configSchema?: CustomOptions;
};
EvalStatusEnum
// String enum mirroring ScoreSchema's `status` values.
enum EvalStatusEnum {
UNKNOWN = 'UNKNOWN',
PASS = 'PASS',
FAIL = 'FAIL',
}
Example: Custom Evaluator
import { ai } from './genkit';
import { BaseEvalDataPointSchema } from '@genkit-ai/ai';
import { z } from 'zod';
// Define custom data point schema
// Extends the base eval datapoint with the keywords this test expects.
const CustomDataPointSchema = BaseEvalDataPointSchema.extend({
expectedKeywords: z.array(z.string()).optional(),
});
// Define evaluator
// Custom evaluator: scores the output by the fraction of expected keywords
// it contains. Pure string matching, no model calls — hence isBilled: false.
const keywordEvaluator = ai.defineEvaluator(
  {
    name: 'keyword-presence',
    displayName: 'Keyword Presence',
    definition: 'Checks if expected keywords are present in the output',
    dataPointType: CustomDataPointSchema,
    configSchema: z.object({
      caseSensitive: z.boolean().default(false),
    }),
    isBilled: false,
  },
  async (datapoint, options) => {
    const output = String(datapoint.output || '');
    const keywords = datapoint.expectedKeywords || [];
    // `options` is optional in the runner signature; mirror the schema default.
    const caseSensitive = options?.caseSensitive ?? false;
    // The haystack is loop-invariant — normalize it once, not per keyword.
    const searchIn = caseSensitive ? output : output.toLowerCase();
    let foundCount = 0;
    const missingKeywords: string[] = [];
    for (const keyword of keywords) {
      const searchTerm = caseSensitive ? keyword : keyword.toLowerCase();
      if (searchIn.includes(searchTerm)) {
        foundCount++;
      } else {
        missingKeywords.push(keyword);
      }
    }
    // An empty keyword list counts as a trivially perfect score.
    const score = keywords.length > 0 ? foundCount / keywords.length : 1;
    return {
      testCaseId: datapoint.testCaseId,
      evaluation: {
        score,
        // PASS only when every expected keyword was found.
        status: score >= 1 ? 'PASS' : 'FAIL',
        details: {
          reasoning: `Found ${foundCount}/${keywords.length} keywords`,
          missingKeywords,
        },
      },
    };
  }
);
// Run evaluation
// Run the keyword evaluator against a small two-case dataset.
const keywordDataset = [
  {
    testCaseId: 'test-1',
    input: 'Explain quantum computing',
    output: 'Quantum computing uses qubits and superposition',
    expectedKeywords: ['quantum', 'qubits'],
  },
  {
    testCaseId: 'test-2',
    input: 'What is AI?',
    output: 'Artificial intelligence is machine learning',
    expectedKeywords: ['artificial', 'intelligence', 'neural'],
  },
];
const results = await ai.evaluate({
  evaluator: keywordEvaluator,
  dataset: keywordDataset,
  options: { caseSensitive: false },
});
console.log(results);