Create and use custom G-Eval criteria for claim quality assessment
CheckThat AI uses DeepEval’s G-Eval framework for claim quality assessment. While the platform provides default evaluation criteria, you can define custom metrics tailored to your specific fact-checking needs.
# Default evaluation spec CheckThat applies to normalized claims.
# NOTE(review): StaticEvaluation is a project-declared type — confirm its import path.
STATIC_EVAL_SPECS = StaticEvaluation(
    criteria="""Evaluate the normalized claim against the following criteria: Verifiability and Self-Containment, Claim Centrality and Extraction Quality, Conciseness and Clarity, Check-Worthiness Alignment, and Factual Consistency""",
    evaluation_steps=[
        # Verifiability and Self-Containment
        "Check if the claim contains verifiable factual assertions that can be independently checked",
        "Check if the claim is self-contained without requiring additional context from the original post",
        # Claim Centrality and Extraction Quality
        "Check if the normalized claim captures the central assertion from the source text while removing extraneous information",
        "Check if the claim represents the core factual assertion that requires fact-checking, not peripheral details",
        # Conciseness and Clarity
        "Check if the claim is presented in a straightforward, concise manner that fact-checkers can easily process",
        "Check if the claim is significantly shorter than source posts while preserving essential meaning",
        # Check-Worthiness Alignment
        "Check if the normalized claim meets check-worthiness standards for fact-verification",
        "Check if the claim has general public interest, potential for harm, and likelihood of being false",
        # Factual Consistency
        "Check if the normalized claim is factually consistent with the source material without hallucinations or distortions",
        "Check if the claim accurately reflects the original assertion without introducing new information",
    ],
)
Define a custom G-Eval metric with your own criteria:
# Define a custom G-Eval metric and use it to drive CheckThat claim refinement.
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.models import GPTModel
from checkthat import CheckThat

# Create custom evaluation metric
custom_metric = GEval(
    name="Medical Claim Accuracy",
    criteria="""Evaluate medical claims for:
    1. Scientific accuracy
    2. Source attribution
    3. Hedge appropriateness (avoiding absolutes like 'cures' or 'eliminates')
    4. Harm potential
    """,
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    evaluation_steps=[
        "Check if the claim uses scientifically accurate terminology",
        "Check if medical sources or studies are properly attributed",
        "Check if the claim uses appropriate hedging (e.g., 'may help' vs 'cures')",
        "Check if the claim could cause harm if believed to be true",
        "Check if the claim is verifiable through peer-reviewed research",
    ],
    model=GPTModel(model="gpt-5-2025-08-07", _openai_api_key="your-openai-key"),
    threshold=0.7,
)

# Use custom metric with CheckThat
client = CheckThat(api_key="your-checkthat-key")
response = client.chat.completions.create(
    model="gpt-5-2025-08-07",
    messages=[
        {
            "role": "user",
            "content": "Drinking 8 glasses of water daily cures kidney disease",
        }
    ],
    refine_claims=True,
    refine_model="gpt-5-2025-08-07",
    refine_metrics=custom_metric,  # Use custom metric
    refine_threshold=0.7,
    refine_max_iters=3,
)

print(response.choices[0].message.content)
print(response.refinement_metadata.refinement_history)
# Domain-specific metric: quality bar for medical fact-checking claims.
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.models import GPTModel

medical_metric = GEval(
    name="Medical Fact-Check Quality",
    criteria="""Evaluate medical claims for:
    - Scientific accuracy and terminology
    - Proper source attribution (studies, institutions)
    - Appropriate hedging (avoid absolutes)
    - Potential harm if misinformation spreads
    - Verifiability through medical databases
    """,
    evaluation_steps=[
        "Verify medical terminology is used correctly",
        "Check if specific studies, doctors, or institutions are properly named",
        "Ensure claims use 'may', 'can', 'associated with' rather than 'cures', 'eliminates', 'guarantees'",
        "Assess potential harm: Could believing this false claim cause injury or death?",
        "Confirm claim can be verified via PubMed, medical journals, or health authorities",
        "Check if the claim distinguishes between correlation and causation",
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model=GPTModel(model="gpt-5-2025-08-07", _openai_api_key="your-key"),
    threshold=0.75,
)
# Domain-specific metric: verification quality for political statements.
political_metric = GEval(
    name="Political Statement Verification",
    criteria="""Evaluate political claims for:
    - Specific, verifiable facts (not opinions)
    - Clear attribution to named politicians or parties
    - Temporal specificity (dates, timeframes)
    - Neutrality (avoiding loaded language)
    - Public record verifiability
    """,
    evaluation_steps=[
        "Distinguish factual claims from political opinions or rhetoric",
        "Verify full names and official titles are included",
        "Check if specific dates, terms, or sessions are mentioned",
        "Ensure claim uses neutral language (remove 'allegedly', 'claims to', etc.)",
        "Confirm claim can be verified through official government records or voting records",
        "Check that the claim represents a single, discrete assertion",
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model=GPTModel(model="gpt-5-2025-08-07", _openai_api_key="your-key"),
    threshold=0.8,  # Higher bar for political claims
)
# Domain-specific metric: precision requirements for statistical claims.
statistical_metric = GEval(
    name="Statistical Claim Precision",
    criteria="""Evaluate statistical claims for:
    - Exact numbers and percentages
    - Clear population or sample specification
    - Timeframe specification
    - Source attribution
    - Context preservation (denominators, baselines)
    """,
    evaluation_steps=[
        "Verify specific numbers/percentages are preserved from original",
        "Check if the population being measured is clearly stated",
        "Ensure timeframes are specific (not 'recent' but 'in 2024')",
        "Confirm statistical source is named (census, study, poll)",
        "Check that context is preserved (e.g., '30% increase from baseline of X')",
        "Verify claim doesn't conflate absolute numbers with rates/percentages",
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model=GPTModel(model="gpt-5-2025-08-07", _openai_api_key="your-key"),
    threshold=0.85,
)
# Domain-specific metric: claims describing images or videos.
visual_content_metric = GEval(
    name="Visual Content Claim Verification",
    criteria="""Evaluate claims about images/videos for:
    - Explicit mention this is visual content
    - Description of visual elements
    - Attribution (who appears, where, when)
    - Avoidance of unverifiable interpretations
    - Context about content origin
    """,
    evaluation_steps=[
        "Check if claim explicitly states this describes a photo/video/image",
        "Verify visible elements are objectively described (not interpreted)",
        "Ensure people, places, and times are specifically named if identifiable",
        "Confirm claim avoids subjective interpretations ('appears happy', 'seems to show')",
        "Check if source or original context is mentioned when known",
        "Verify claim doesn't assert facts not visible in the content itself",
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model=GPTModel(model="gpt-5-2025-08-07", _openai_api_key="your-key"),
    threshold=0.7,
)
# End-to-end example: a brand-accuracy metric plugged into CheckThat refinement.
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.models import GPTModel
from checkthat import CheckThat

# Define custom metric
my_metric = GEval(
    name="Brand Mention Accuracy",
    criteria="Ensure brand names, products, and company names are accurately extracted and spelled correctly",
    evaluation_steps=[
        "Check if all brand names from the original are included",
        "Verify spelling of company and product names",
        "Confirm brands are not confused with generic terms",
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model=GPTModel(model="gpt-5-2025-08-07", _openai_api_key="your-openai-key"),
    threshold=0.8,
)

# Use with refinement
client = CheckThat(api_key="your-checkthat-key")
response = client.chat.completions.create(
    model="gpt-5-2025-08-07",
    messages=[
        {
            "role": "user",
            "content": "Tesla's new Model S Plaid achieves 0-60 mph in under 2 seconds",
        }
    ],
    refine_claims=True,
    refine_model="gpt-5-2025-08-07",
    refine_metrics=my_metric,
    refine_threshold=0.8,
    refine_max_iters=3,
)

print(f"Refined claim: {response.choices[0].message.content}")

# Check evaluation scores
for history in response.refinement_metadata.refinement_history:
    print(f"{history.claim_type}: {history.score:.2f}")
    print(f"  Claim: {history.claim}")
    print(f"  Feedback: {history.feedback[:150]}...\n")
# Example: applying two metrics sequentially (the API accepts one metric per call).
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.models import GPTModel
from checkthat import CheckThat

# Define multiple metrics
accuracy_metric = GEval(
    name="Factual Accuracy",
    criteria="Claim must be factually accurate and verifiable",
    evaluation_steps=[
        "Check if claim can be verified through reliable sources",
        "Verify no factual errors introduced during extraction",
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model=GPTModel(model="gpt-5-2025-08-07", _openai_api_key="your-key"),
    threshold=0.8,
)

clarity_metric = GEval(
    name="Claim Clarity",
    criteria="Claim must be clear, concise, and unambiguous",
    evaluation_steps=[
        "Check if claim is self-contained",
        "Verify claim has no ambiguous references",
        "Ensure claim is concise (under 25 words)",
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model=GPTModel(model="gpt-5-2025-08-07", _openai_api_key="your-key"),
    threshold=0.75,
)

# Note: Currently, the API accepts a single metric via refine_metrics
# For multiple metrics, you need to combine them or run separately

# Option 1: Sequential evaluation with different metrics
client = CheckThat(api_key="your-checkthat-key")

# The social-media post to refine (example input)
post = "Tesla's new Model S Plaid achieves 0-60 mph in under 2 seconds"

# First pass: accuracy
response1 = client.chat.completions.create(
    model="gpt-5-2025-08-07",
    messages=[{"role": "user", "content": post}],
    refine_claims=True,
    refine_model="gpt-5-2025-08-07",
    refine_metrics=accuracy_metric,
    refine_threshold=0.8,
)

# Second pass: clarity (using first pass result)
response2 = client.chat.completions.create(
    model="gpt-5-2025-08-07",
    messages=[{"role": "user", "content": response1.choices[0].message.content}],
    refine_claims=True,
    refine_model="gpt-5-2025-08-07",
    refine_metrics=clarity_metric,
    refine_threshold=0.75,
)

print(f"Final claim: {response2.choices[0].message.content}")
# Inspect per-iteration refinement feedback and flag common issue categories.
# NOTE(review): assumes `client` (a CheckThat client) and `post` (the text to
# refine) are defined earlier, as in the preceding examples.
response = client.chat.completions.create(
    model="gpt-5-2025-08-07",
    messages=[{"role": "user", "content": post}],
    refine_claims=True,
    refine_model="gpt-5-2025-08-07",
    refine_threshold=0.7,
    refine_max_iters=3,
)

# Parse feedback
for iteration in response.refinement_metadata.refinement_history:
    print(f"\n{'='*60}")
    print(f"Iteration: {iteration.claim_type}")
    print(f"Score: {iteration.score:.2f}")
    print(f"\nClaim:")
    print(f"  {iteration.claim}")
    print(f"\nFeedback:")
    print(f"  {iteration.feedback}")

    # Identify specific issues mentioned
    if "verif" in iteration.feedback.lower():
        print("  ⚠️ Verifiability issue detected")
    if "ambig" in iteration.feedback.lower():
        print("  ⚠️ Ambiguity detected")
    if "context" in iteration.feedback.lower():
        print("  ⚠️ Context issue detected")
Example output:
============================================================
Iteration: original
Score: 0.58

Claim:
  Asif Mumtaz appointed as PMC Chairman

Feedback:
  Verifiability: 7/10 - The appointment is verifiable through official records. Self-Containment: 4/10 - The claim uses the acronym 'PMC' without explanation, making it unclear what organization is involved. Named entities are incomplete - missing full title of the appointee. Suggestions: Spell out 'PMC' as 'Pakistan Medical Commission' and include the full title 'Lieutenant Retired General'.
  ⚠️ Context issue detected
  ⚠️ Verifiability issue detected
# Safety-focused example: evaluate medical misinformation with an Anthropic
# model as the G-Eval judge, then report score improvement per post.
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.models import AnthropicModel  # Using Claude for safety focus
from checkthat import CheckThat

# Define comprehensive medical evaluation metric
medical_safety_metric = GEval(
    name="Medical Claim Safety & Accuracy",
    criteria="""Evaluate medical claims for:
    1. Scientific accuracy and proper terminology
    2. Source attribution (studies, institutions, experts)
    3. Appropriate hedging (avoiding dangerous absolutes)
    4. Harm potential if misinformation spreads
    5. Verifiability through medical literature
    6. Distinction between correlation and causation
    """,
    evaluation_steps=[
        "Verify medical terminology is used correctly and precisely",
        "Check if specific studies, institutions, or medical experts are named",
        "Ensure claims use appropriate hedging: 'may help', 'associated with', 'can reduce risk' rather than 'cures', 'eliminates', 'prevents'",
        "Assess harm potential: Rate from 0-10 how harmful believing this false claim could be",
        "Confirm claim can be verified via PubMed, peer-reviewed journals, or official health organizations (WHO, CDC)",
        "Check if the claim distinguishes correlation from causation (e.g., 'associated with' vs 'causes')",
        "Verify claim doesn't make absolute promises about health outcomes",
        "Ensure any numerical claims (percentages, dosages) are precise and sourced",
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model=AnthropicModel(
        model="claude-opus-4-1-20250805",
        _anthropic_api_key="your-anthropic-key",
    ),
    threshold=0.80,  # High bar for medical claims
)

# Test with medical misinformation
client = CheckThat(api_key="your-checkthat-key")

medical_posts = [
    "Drinking 8 glasses of water daily cures kidney disease and prevents cancer.",
    "Eating vaginal fluids makes you immune to cancer. Scientists at St. Austin University found...",
    "Gargling with warm salt water eliminates coronavirus before it reaches your lungs.",
    "New study shows vitamin D reduces COVID-19 risk by 80%.",
]

for post in medical_posts:
    print(f"\n{'='*80}")
    print(f"Original Post: {post[:60]}...")
    print("=" * 80)

    response = client.chat.completions.create(
        model="gpt-5-2025-08-07",
        messages=[{"role": "user", "content": post}],
        refine_claims=True,
        refine_model="claude-opus-4-1-20250805",  # Use Claude for evaluation
        refine_metrics=medical_safety_metric,
        refine_threshold=0.80,
        refine_max_iters=4,  # Allow more iterations for safety
    )

    # Display results
    metadata = response.refinement_metadata
    history = metadata.refinement_history

    print(f"\nInitial Claim (Score: {history[0].score:.2f}):")
    print(f"  {history[0].claim}")
    print(f"\nFinal Claim (Score: {history[-1].score:.2f}):")
    print(f"  {history[-1].claim}")

    improvement = history[-1].score - history[0].score
    print(f"\nImprovement: {improvement:.2f} ({len(history)-1} iterations)")

    if history[-1].score >= 0.80:
        print("✅ PASSED: Meets safety threshold")
    else:
        # Only surface full feedback when the claim still fails the bar
        # (matches the passing-case example output, which ends at PASSED).
        print("⚠️ WARNING: Below safety threshold")
        print(f"\nFinal Feedback:\n{history[-1].feedback}")
Example output:
================================================================================
Original Post: Gargling with warm salt water eliminates coronavirus befo...
================================================================================

Initial Claim (Score: 0.52):
  Gargling with warm salt water eliminates coronavirus from throat

Final Claim (Score: 0.82):
  Gargling with warm salt water may help reduce coronavirus in the throat

Improvement: 0.30 (3 iterations)
✅ PASSED: Meets safety threshold
Start with the default `STATIC_EVAL_SPECS`, and define custom metrics only when the defaults do not fit your domain
Be Specific
Clear evaluation steps produce better feedback
Match Domain
Medical claims need different criteria than political ones
Test Thresholds
A/B test different thresholds on your data
Monitor Scores
Track score distributions to calibrate metrics
Use Strong Models
Better evaluation models = better refinement
When creating custom metrics, test them on a diverse set of claims to ensure they generalize well. A metric that works perfectly on one type of claim might fail on others.