Overview
REMem provides comprehensive evaluation metrics for both retrieval and question-answering performance. Evaluations are automatically computed when you provide gold standard data to rag_for_qa().
Quick Start
from remem import ReMem
from remem.utils.config_utils import BaseConfig

config = BaseConfig(
    llm_name="gpt-4o-mini",
    embedding_model_name="nvidia/NV-Embed-v2",
    do_eval_retrieval=True,  # Enable retrieval evaluation
    do_eval_qa=True,         # Enable QA evaluation
)

remem = ReMem(global_config=config, working_dir="./remem_data")
remem.index(docs)

# Run with evaluation
query_solutions, answers, metadata, retrieval_results, qa_results = remem.rag_for_qa(
    queries=queries,
    gold_docs=gold_docs,        # Required for retrieval eval
    gold_answers=gold_answers,  # Required for QA eval
    metrics=("qa_em", "qa_f1", "retrieval_recall"),
)

print(f"Retrieval Results: {retrieval_results}")
print(f"QA Results: {qa_results}")
Retrieval Metrics
Recall@k
Measures the fraction of relevant (gold) documents that appear in the top-k retrieved results:
query_solutions, _, _, retrieval_results, _ = remem.rag_for_qa(
    queries=queries,
    gold_docs=gold_docs,
    metrics=("retrieval_recall",),
)
# Output format:
# {
# "Recall@1": 0.45,
# "Recall@3": 0.72,
# "Recall@5": 0.85,
# "Recall@10": 0.92,
# "Recall@15": 0.95,
# "Recall@20": 0.97,
# "Recall@30": 0.98,
# "Recall@50": 0.99
# }
Implementation: src/remem/evaluation/retrieval_eval.py:23-83
class RetrievalRecall(BaseMetric):
    def calculate_metric_scores(
        self,
        gold_docs: List[List[str]],
        retrieved_chunks: List[List[str]],
        k_list: List[int] = [1, 5, 10, 20],
    ) -> Tuple[Dict[str, float], List[Dict[str, float]]]:
        # Calculates recall = |retrieved ∩ gold| / |gold|
        # for each k value
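For intuition, here is a minimal, self-contained sketch of the Recall@k formula for a single query. It is an illustration only, not the library's code, and the helper name recall_at_k is hypothetical.

from typing import List

def recall_at_k(gold: List[str], retrieved: List[str], k: int) -> float:
    # Fraction of gold documents that appear among the top-k retrieved documents.
    if not gold:
        return 0.0
    top_k = set(retrieved[:k])
    hits = sum(1 for doc in gold if doc in top_k)
    return hits / len(gold)

# 1 of 2 gold documents appears in the top 3 -> Recall@3 = 0.5
print(recall_at_k(["doc_a", "doc_b"], ["doc_c", "doc_a", "doc_d"], k=3))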
Recall-All@k
Requires ALL gold documents to be retrieved:
query_solutions, _, _, retrieval_results, _ = remem.rag_for_qa(
    queries=queries,
    gold_docs=gold_docs,
    metrics=("retrieval_recall_all",),
)
# Per-query score: 1.0 if all gold docs are in the top-k, else 0.0
# Reported values are averaged over queries:
# {
#   "Recall_all@5": 0.32,
#   "Recall_all@10": 0.58,
#   "Recall_all@20": 0.78
# }
Implementation: src/remem/evaluation/retrieval_eval.py:86-169
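A correspondingly minimal check, again illustrative rather than the library's implementation, assuming exact string matching between gold and retrieved documents:

def recall_all_at_k(gold, retrieved, k):
    # 1.0 only if every gold document appears in the top-k, otherwise 0.0.
    return 1.0 if set(gold).issubset(retrieved[:k]) else 0.0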
NDCG@k
Normalized Discounted Cumulative Gain measures ranking quality:
query_solutions, _, _, retrieval_results, _ = remem.rag_for_qa(
    queries=queries,
    gold_docs=gold_docs,
    metrics=("retrieval_ndcg_any",),
)
# Output: NDCG score (0-1)
# {
# "NDCG_any@5": 0.68,
# "NDCG_any@10": 0.72,
# "NDCG_any@20": 0.75
# }
Implementation: src/remem/evaluation/retrieval_eval.py:172-287
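As a rough reference, NDCG@k with binary relevance (a retrieved document counts as relevant if it is one of the gold documents) can be sketched as follows; REMem's exact gain and normalization choices live in retrieval_eval.py and may differ:

import math

def ndcg_at_k(gold, retrieved, k):
    gold_set = set(gold)
    # DCG over the top-k ranks with binary gains (1 if the doc is gold).
    dcg = sum(
        1.0 / math.log2(rank + 2)
        for rank, doc in enumerate(retrieved[:k])
        if doc in gold_set
    )
    # Ideal DCG: every relevant document ranked at the top.
    ideal_hits = min(len(gold_set), k)
    idcg = sum(1.0 / math.log2(rank + 2) for rank in range(ideal_hits))
    return dcg / idcg if idcg > 0 else 0.0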
LoCoMo Recall
Specialized recall metric for LoCoMo benchmark:
query_solutions, _, _, retrieval_results, _ = remem.rag_for_qa(
    queries=queries,
    gold_docs=gold_docs,
    metrics=("retrieval_recall_locomo",),
)
QA Metrics
Exact Match (EM)
Binary metric: 1 if the normalized predicted answer exactly matches any gold answer, 0 otherwise:
query_solutions, _, _, _, qa_results = remem.rag_for_qa(
    queries=queries,
    gold_answers=gold_answers,
    metrics=("qa_em",),
)
# Output:
# {
# "ExactMatch": 0.67 # 67% exact matches
# }
Implementation: src/remem/evaluation/qa_eval.py:16-54
class QAExactMatch(BaseMetric):
    def calculate_metric_scores(...):
        # Normalizes answers (lowercase, remove punctuation)
        # Returns 1.0 if exact match, 0.0 otherwise
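The normalization step is typically the standard SQuAD-style cleanup (lowercasing, stripping punctuation and articles, collapsing whitespace). The sketch below shows that general recipe; it is not copied from qa_eval.py.

import re
import string

def normalize_answer(text: str) -> str:
    # Lowercase, drop punctuation, remove articles, collapse whitespace.
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())

def exact_match(predicted: str, gold_answers: list) -> float:
    return float(any(normalize_answer(predicted) == normalize_answer(g) for g in gold_answers))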
F1 Score
Token-level overlap between predicted and gold answers:
query_solutions, _, _, _, qa_results = remem.rag_for_qa(
    queries=queries,
    gold_answers=gold_answers,
    metrics=("qa_f1",),
)
# Output:
# {
# "F1": 0.74 # 74% average F1 score
# }
Implementation: src/remem/evaluation/qa_eval.py:57-109
def compute_f1(gold: str, predicted: str) -> float:
    # Token-level precision and recall
    # F1 = 2 * (precision * recall) / (precision + recall)
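For reference, a standard token-level F1 looks roughly like this (illustrative only, reusing the normalize_answer helper sketched above):

from collections import Counter

def token_f1(gold: str, predicted: str) -> float:
    gold_tokens = normalize_answer(gold).split()
    pred_tokens = normalize_answer(predicted).split()
    common = Counter(gold_tokens) & Counter(pred_tokens)
    overlap = sum(common.values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)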
BLEU Scores
N-gram overlap metrics:
query_solutions, _, _, _, qa_results = remem.rag_for_qa(
    queries=queries,
    gold_answers=gold_answers,
    metrics=("qa_bleu1", "qa_bleu4"),
)
# Output:
# {
# "BLEU-1": 0.71, # Unigram overlap
# "BLEU-4": 0.58 # 4-gram overlap
# }
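If you want to sanity-check BLEU numbers outside REMem, NLTK's sentence_bleu computes comparable unigram and 4-gram scores; REMem's exact tokenization and smoothing may differ, so treat this purely as an external reference.

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = "the eiffel tower is in paris".split()
candidate = "the tower is in paris".split()
smooth = SmoothingFunction().method1

bleu1 = sentence_bleu([reference], candidate, weights=(1, 0, 0, 0), smoothing_function=smooth)
bleu4 = sentence_bleu([reference], candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
print(f"BLEU-1: {bleu1:.2f}, BLEU-4: {bleu4:.2f}")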
LLM Judge Metrics
Use LLMs to evaluate answer quality:
config = BaseConfig(
    llm_name="gpt-4o",  # Used for evaluation
    do_eval_qa=True,
)

query_solutions, _, _, _, qa_results = remem.rag_for_qa(
    queries=queries,
    gold_answers=gold_answers,
    metrics=("qa_longmemeval", "qa_mem0_llm_judge", "qa_evalsuit_llm_judge"),
)
Available LLM Judges:
qa_longmemeval - LongMemEval benchmark judge
qa_mem0_llm_judge - Mem0 evaluation judge
qa_evalsuit_llm_judge - EvalSuit judge
LoCoMo F1
Specialized F1 metric for LoCoMo benchmark:
query_solutions, _, _, _, qa_results = remem.rag_for_qa(
    queries=queries,
    gold_answers=gold_answers,
    metrics=("qa_f1_score_locomo",),
)
Combining Multiple Metrics
query_solutions, _, _, retrieval_results, qa_results = remem.rag_for_qa(
    queries=queries,
    gold_docs=gold_docs,
    gold_answers=gold_answers,
    metrics=(
        # Retrieval metrics
        "retrieval_recall",
        "retrieval_recall_all",
        "retrieval_ndcg_any",
        # QA metrics
        "qa_em",
        "qa_f1",
        "qa_bleu1",
        "qa_bleu4",
    ),
)

print("\n=== Retrieval Results ===")
for metric, value in retrieval_results.items():
    print(f"{metric}: {value:.4f}")

print("\n=== QA Results ===")
for metric, value in qa_results.items():
    print(f"{metric}: {value:.4f}")
Per-Example Metrics
Access metrics for individual examples:
query_solutions, _, _, _, _ = remem.rag_for_qa(
    queries=queries,
    gold_answers=gold_answers,
    gold_docs=gold_docs,
    metrics=("qa_em", "qa_f1", "retrieval_recall"),
)

for i, qs in enumerate(query_solutions):
    print(f"\nExample {i + 1}: {qs.question}")
    print(f"Answer: {qs.answer}")
    if qs.metrics:
        print(f"Metrics: {qs.metrics}")
# Example output:
# {
# "ExactMatch": 1.0,
# "F1": 0.95,
# "Recall@5": 1.0,
# "Recall@10": 1.0
# }
Evaluation Configuration
Enable/Disable Evaluation
config = BaseConfig(
    do_eval_retrieval=True,  # Enable retrieval metrics
    do_eval_qa=True,         # Enable QA metrics
)
Custom k Values
Specify custom k values for Recall@k:
# From remem.py:762
k_list = [1, 3, 5, 10, 15, 20, 30, 50]

# Internally used by retrieval evaluators
overall_metric_result, example_metric_results = retrieval_evaluator.calculate_metric_scores(
    gold_docs=gold_docs,
    retrieved_chunks=[result.docs for result in query_solutions],
    k_list=k_list,
)
Saved Results
Evaluation results are saved automatically when to_save is enabled (the default):
config = BaseConfig(
    do_eval_qa=True,
    dataset="musique",
)

query_solutions, _, _, retrieval_results, qa_results = remem.rag_for_qa(
    queries=queries,
    gold_answers=gold_answers,
    gold_docs=gold_docs,
    to_save=True,  # Default
)
# Results saved to:
# {working_dir}/rag_results_{inference_type}.json
File Format:
{
  "samples": [
    {
      "question": "What is the capital of France?",
      "answer": "Paris",
      "docs": ["..."],
      "gold_answers": ["Paris"],
      "metrics": {
        "ExactMatch": 1.0,
        "F1": 1.0,
        "Recall@5": 1.0
      }
    }
  ],
  "overall_metrics": {
    "ExactMatch": 0.67,
    "F1": 0.74,
    "Recall@5": 0.85
  }
}
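Because the saved file is plain JSON, it is easy to reload for offline analysis. A small sketch, assuming the path pattern shown above (the inference_type value depends on your run):

import json
from pathlib import Path

working_dir = Path("./remem_data")
inference_type = "..."  # fill in the inference type used by your run

results_path = working_dir / f"rag_results_{inference_type}.json"
results = json.loads(results_path.read_text())

print(results["overall_metrics"])
for sample in results["samples"][:3]:
    print(sample["question"], sample["metrics"])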
Complete Example
import json

from remem import ReMem
from remem.utils.config_utils import BaseConfig

# Load data
corpus = json.load(open("reproduce/dataset/musique_corpus.json", "r"))
samples = json.load(open("reproduce/dataset/musique.json", "r"))

docs = [f"{doc['title']}\n{doc['text']}" for doc in corpus]
queries = [s["question"] for s in samples]

# Prepare gold data
gold_answers = [[s["answer"]] for s in samples]
gold_docs = [
    [
        f"{item[0]}\n{''.join(item[1])}"
        for item in s["context"]
        if item[0] in [sf[0] for sf in s["supporting_facts"]]
    ]
    for s in samples
]

# Configure
config = BaseConfig(
    llm_name="gpt-4o-mini",
    embedding_model_name="nvidia/NV-Embed-v2",
    dataset="musique",
    retrieval_top_k=200,
    qa_top_k=5,
    do_eval_retrieval=True,
    do_eval_qa=True,
)

remem = ReMem(global_config=config, working_dir="./outputs/musique")
remem.index(docs)

# Evaluate
query_solutions, answers, metadata, retrieval_eval, qa_eval = remem.rag_for_qa(
    queries=queries,
    gold_docs=gold_docs,
    gold_answers=gold_answers,
    metrics=(
        "retrieval_recall",
        "retrieval_ndcg_any",
        "qa_em",
        "qa_f1",
    ),
)

# Print results
print("\n=== Retrieval Metrics ===")
for k in [1, 5, 10, 20]:
    print(f"Recall@{k}: {retrieval_eval[f'Recall@{k}']:.4f}")
    print(f"NDCG@{k}: {retrieval_eval[f'NDCG_any@{k}']:.4f}")

print("\n=== QA Metrics ===")
print(f"Exact Match: {qa_eval['ExactMatch']:.4f}")
print(f"F1 Score: {qa_eval['F1']:.4f}")

# Print per-example results
print("\n=== Sample Results ===")
for i, qs in enumerate(query_solutions[:3]):
    print(f"\n[Example {i + 1}]")
    print(f"Question: {qs.question}")
    print(f"Predicted: {qs.answer}")
    print(f"Gold: {list(qs.gold_answers)[0]}")
    print(f"Metrics: {qs.metrics}")
Unsupported Metrics Warning
If you specify an unsupported metric, REMem will log a warning but continue processing with supported metrics.
From remem.py:670-688:
supported_metrics = {
    "retrieval_recall",
    "retrieval_recall_all",
    "retrieval_ndcg_any",
    "retrieval_recall_locomo",
    "qa_em",
    "qa_f1",
    "qa_longmemeval",
    "qa_f1_score_locomo",
    "qa_mem0_llm_judge",
    "qa_evalsuit_llm_judge",
    "qa_bleu1",
    "qa_bleu4",
}

unsupported_metrics = [m for m in metrics if m not in supported_metrics]
if unsupported_metrics:
    logger.warning(f"Unsupported metrics found: {unsupported_metrics}")
Next Steps
Question Answering: Learn about the RAG pipeline
Configuration: Explore all configuration options