The simplest way to create a dataset is from lists of inputs and outputs:
# Create a dataset from parallel lists of inputs, outputs, and per-example
# metadata. The three lists are index-aligned: inputs[i]/outputs[i]/metadata[i]
# together form one example.
from phoenix.client import Client

client = Client()

dataset = client.datasets.create_dataset(
    name="customer-support-qa",
    inputs=[
        {"question": "How do I reset my password?"},
        {"question": "What is your return policy?"},
        {"question": "How do I track my order?"},
    ],
    outputs=[
        {"answer": "Click 'Forgot Password' on the login page..."},
        {"answer": "We accept returns within 30 days..."},
        {"answer": "You can track your order in the Orders section..."},
    ],
    metadata=[
        {"category": "account", "priority": "high"},
        {"category": "policy", "priority": "medium"},
        {"category": "order", "priority": "high"},
    ],
    dataset_description="Common customer support questions and answers",
)
print(f"Created dataset {dataset.name} with {len(dataset)} examples")
question,answer,source,verified
"What is AI?","Artificial Intelligence is...","expert_review",true
"What is ML?","Machine Learning is...","documentation",true
One of the most powerful features is creating datasets from real production traces:
1
Query Traces
Retrieve spans from your production application:
# Retrieve production spans as a pandas DataFrame so they can be filtered
# into a dataset. Timestamps are timezone-aware (UTC) to avoid naive/aware
# comparison issues in the backend query.
from datetime import datetime, timedelta, timezone

from phoenix.client import Client

client = Client()

# Get spans from the last 7 days
end_time = datetime.now(timezone.utc)
start_time = end_time - timedelta(days=7)

spans_df = client.spans.get_spans_dataframe(
    project_identifier="production-chatbot",
    start_time=start_time,
    end_time=end_time,
    limit=1000,
)
print(f"Retrieved {len(spans_df)} spans")
2
Filter and Prepare Data
Select relevant spans and prepare the dataset:
import json

# Filter for high-quality interactions
# (e.g., where users gave positive feedback)
quality_spans = spans_df[
    (spans_df['attributes.user_feedback.score'] >= 4.0)
    & (spans_df['name'] == 'llm_inference')
].copy()


def parse_json_value(value):
    """Decode *value* as JSON when it is a string; otherwise return it as-is.

    Malformed JSON strings are returned unchanged rather than raising, since
    span attribute values may be plain text.
    """
    if isinstance(value, str):
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return value
    return value


def _extract_field(raw, key):
    """Pull *key* out of a JSON-encoded dict attribute, falling back to the raw value."""
    # Parse once (the original lambda parsed each value twice) and only
    # index into the result when it is actually a dict.
    parsed = parse_json_value(raw)
    return parsed.get(key, raw) if isinstance(parsed, dict) else raw


# Extract inputs and outputs
quality_spans['question'] = quality_spans['attributes.input.value'].apply(
    lambda x: _extract_field(x, 'question')
)
quality_spans['answer'] = quality_spans['attributes.output.value'].apply(
    lambda x: _extract_field(x, 'answer')
)

# Add metadata from span attributes
quality_spans['model'] = quality_spans['attributes.llm.model_name']
# NOTE(review): self-assignment is a no-op — presumably this was meant to copy
# latency from a span attribute column (e.g. an 'attributes.*' field); confirm
# the intended source column before relying on 'latency_ms' in the dataset.
quality_spans['latency_ms'] = quality_spans['latency_ms']
3
Create Dataset with Span Links
Create the dataset and preserve trace associations:
# Reset index if context.span_id is the index — create_dataset expects
# span_id_key to name a column, not the index.
if quality_spans.index.name == 'context.span_id':
    quality_spans = quality_spans.reset_index()

# Create dataset with span associations
dataset = client.datasets.create_dataset(
    name="high-quality-responses",
    dataframe=quality_spans,
    input_keys=["question"],
    output_keys=["answer"],
    metadata_keys=["model", "latency_ms"],
    span_id_key="context.span_id",  # Links examples to original traces
    dataset_description="High-quality responses from production (feedback >= 4.0)",
)
print(f"Created dataset with {len(dataset)} examples linked to traces")
# Append examples to an existing dataset; this produces a new dataset
# version (prior experiments stay pinned to the version they ran against).
from phoenix.client import Client

client = Client()

# Add more examples to existing dataset
updated_dataset = client.datasets.add_examples_to_dataset(
    dataset="customer-support-qa",  # Can use ID or name
    inputs=[{"question": "How do I change my email?"}],
    outputs=[{"answer": "Go to Account Settings and update your email..."}],
    metadata=[{"category": "account", "priority": "medium"}],
)
print(f"Dataset now has {len(updated_dataset)} examples")
Adding examples creates a new version of the dataset. Previous experiments remain linked to their original versions.