The ScyllaDB Rust Driver can collect detailed metrics about query execution, connection health, and performance. These metrics help monitor application behavior and diagnose issues.

Enabling Metrics

Metrics collection is an optional feature. Enable it in your Cargo.toml:
[dependencies]
scylla = { version = "*", features = ["metrics"] }

Accessing Metrics

Get metrics from the session:
use scylla::Session;

let session: Session = /* ... */;
let metrics = session.get_metrics();
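
For context, here is a minimal end-to-end sketch. It assumes SessionBuilder is exported at the crate root alongside Session, and uses 127.0.0.1:9042 as a placeholder contact point:

use scylla::{Session, SessionBuilder};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Placeholder address; point this at your cluster
    let session: Session = SessionBuilder::new()
        .known_node("127.0.0.1:9042")
        .build()
        .await?;

    // ... run some requests ...

    let metrics = session.get_metrics();
    println!("Queries executed so far: {}", metrics.get_queries_num());
    Ok(())
}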

Query Metrics

Counters

Track query execution counts:
// Non-paged queries
let total_queries = metrics.get_queries_num();
let failed_queries = metrics.get_errors_num();

// Paged queries (each page counted separately)
let total_pages = metrics.get_queries_iter_num();
let failed_pages = metrics.get_errors_iter_num();

// Retry attempts
let retry_count = metrics.get_retries_num();

println!("Success rate: {:.2}%",
    (total_queries - failed_queries) as f64 / total_queries as f64 * 100.0
);

Latency Metrics

Get latency statistics:
// Average latency
let avg_ms = metrics.get_latency_avg_ms()?;
println!("Average latency: {}ms", avg_ms);

// Specific percentiles
let p50 = metrics.get_latency_percentile_ms(50.0)?;
let p95 = metrics.get_latency_percentile_ms(95.0)?;
let p99 = metrics.get_latency_percentile_ms(99.0)?;
let p999 = metrics.get_latency_percentile_ms(99.9)?;

println!("p50: {}ms, p95: {}ms, p99: {}ms, p999: {}ms",
    p50, p95, p99, p999
);

Latency Snapshot

Get comprehensive latency statistics:
use scylla::observability::metrics::Snapshot;

let snapshot: Snapshot = metrics.get_snapshot()?;

println!("Latency Statistics:");
println!("  Min: {}ms", snapshot.min);
println!("  Max: {}ms", snapshot.max);
println!("  Mean: {}ms", snapshot.mean);
println!("  Std Dev: {}ms", snapshot.stddev);
println!("  Median: {}ms", snapshot.median);
println!("  p75: {}ms", snapshot.percentile_75);
println!("  p95: {}ms", snapshot.percentile_95);
println!("  p98: {}ms", snapshot.percentile_98);
println!("  p99: {}ms", snapshot.percentile_99);
println!("  p999: {}ms", snapshot.percentile_99_9);

Request Rate

Track queries per second:
// Mean rate since driver started
let mean_rate = metrics.get_mean_rate();

// Exponentially weighted moving averages
let rate_1m = metrics.get_one_minute_rate();
let rate_5m = metrics.get_five_minute_rate();
let rate_15m = metrics.get_fifteen_minute_rate();

println!("Request Rate:");
println!("  Mean: {:.2} req/s", mean_rate);
println!("  1min: {:.2} req/s", rate_1m);
println!("  5min: {:.2} req/s", rate_5m);
println!("  15min: {:.2} req/s", rate_15m);

Connection Metrics

Monitor connection pool health:
// Active connections
let total_connections = metrics.get_total_connections();

// Connection timeouts
let conn_timeouts = metrics.get_connection_timeouts();

// Request timeouts
let req_timeouts = metrics.get_request_timeouts();

println!("Connections: {} active", total_connections);
println!("Connection timeouts: {}", conn_timeouts);
println!("Request timeouts: {}", req_timeouts);

Error Handling

The latency and snapshot getters return a Result and can fail, for example when no requests have been recorded yet:
use scylla::observability::metrics::MetricsError;

match metrics.get_latency_avg_ms() {
    Ok(avg) => println!("Average latency: {}ms", avg),
    Err(MetricsError::Empty) => {
        println!("No metrics collected yet");
    }
    Err(MetricsError::HistogramError(e)) => {
        eprintln!("Histogram error: {}", e);
    }
}

Histogram Configuration

The driver uses a high-dynamic-range histogram with:
  • Maximum value: 65,535 ms (65.5 seconds)
  • Relative error: 0.0244%
  • Memory usage: ~1.7 MiB
  • 20,480 buckets
Values above 65.5 seconds are clamped to the maximum.

Use Cases

Health Monitoring

use std::time::Duration;
use tokio::time;

let mut interval = time::interval(Duration::from_secs(60));

loop {
    interval.tick().await;
    
    let metrics = session.get_metrics();
    let snapshot = metrics.get_snapshot()?;
    
    // Alert if p99 is too high
    if snapshot.percentile_99 > 100 {
        eprintln!("WARNING: p99 latency is {}ms", snapshot.percentile_99);
    }
    
    // Alert if error rate is too high
    let error_rate = metrics.get_errors_num() as f64 
        / metrics.get_queries_num() as f64;
    if error_rate > 0.01 {
        eprintln!("WARNING: Error rate is {:.2}%", error_rate * 100.0);
    }
}

Performance Tuning

// Determine whether a speculative execution threshold is appropriate
let p95 = metrics.get_latency_percentile_ms(95.0)?;
let p99 = metrics.get_latency_percentile_ms(99.0)?;

if p99 > p95 * 2 {
    println!("Large p99/p95 gap suggests speculative execution might help");
    println!("Consider setting retry_interval to {}ms", p95);
}

Capacity Planning

// Track request rate trends
let rate_1m = metrics.get_one_minute_rate();
let rate_15m = metrics.get_fifteen_minute_rate();

if rate_1m > rate_15m * 1.5 {
    println!("Request rate increasing: {:.2} req/s (was {:.2})",
        rate_1m, rate_15m);
}

Retry Analysis

let queries = metrics.get_queries_num();
let retries = metrics.get_retries_num();
let retry_ratio = retries as f64 / queries as f64;

println!("Retry ratio: {:.2} retries per query", retry_ratio);

if retry_ratio > 0.5 {
    println!("High retry ratio suggests cluster issues");
}

Exporting Metrics

Prometheus Format

use std::fmt::Write;
// Assumes the Metrics type is exported from the same module as Snapshot and MetricsError
use scylla::observability::metrics::Metrics;

fn export_prometheus(metrics: &Metrics) -> Result<String, Box<dyn std::error::Error>> {
    let mut output = String::new();
    
    // Counters
    writeln!(output, "scylla_queries_total {}", metrics.get_queries_num())?;
    writeln!(output, "scylla_errors_total {}", metrics.get_errors_num())?;
    writeln!(output, "scylla_retries_total {}", metrics.get_retries_num())?;
    
    // Gauges
    writeln!(output, "scylla_connections {}", metrics.get_total_connections())?;
    
    // Histograms
    let snapshot = metrics.get_snapshot()?;
    writeln!(output, "scylla_latency_seconds{{quantile=\"0.5\"}} {}",
        snapshot.median as f64 / 1000.0)?;
    writeln!(output, "scylla_latency_seconds{{quantile=\"0.95\"}} {}",
        snapshot.percentile_95 as f64 / 1000.0)?;
    writeln!(output, "scylla_latency_seconds{{quantile=\"0.99\"}} {}",
        snapshot.percentile_99 as f64 / 1000.0)?;
    
    Ok(output)
}
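
One possible way to expose this output to a Prometheus scraper is a minimal blocking sketch using only the standard library (run it on a dedicated thread in an async application; the bind address is a placeholder, and a production setup would normally use a proper HTTP server):

use std::io::{Read, Write as IoWrite};
use std::net::TcpListener;

fn serve_metrics(metrics: &Metrics) -> std::io::Result<()> {
    let listener = TcpListener::bind("127.0.0.1:9464")?; // placeholder port

    for stream in listener.incoming() {
        let mut stream = stream?;

        // Drain the request; every request gets the current metrics page
        let mut buf = [0u8; 1024];
        let _ = stream.read(&mut buf)?;

        let body = export_prometheus(metrics).unwrap_or_default();
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: text/plain; version=0.0.4\r\nContent-Length: {}\r\n\r\n{}",
            body.len(),
            body
        );
        stream.write_all(response.as_bytes())?;
    }
    Ok(())
}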

JSON Format

// Requires serde (with the derive feature) and serde_json in Cargo.toml;
// Metrics is assumed to live in the same module as Snapshot above
use scylla::observability::metrics::Metrics;
use serde::Serialize;

#[derive(Serialize)]
struct MetricsExport {
    queries_total: u64,
    errors_total: u64,
    retries_total: u64,
    connections: u64,
    latency_ms: LatencyStats,
    rate_qps: RateStats,
}

#[derive(Serialize)]
struct LatencyStats {
    min: u64,
    max: u64,
    mean: u64,
    p50: u64,
    p95: u64,
    p99: u64,
    p999: u64,
}

#[derive(Serialize)]
struct RateStats {
    mean: f64,
    one_minute: f64,
    five_minutes: f64,
    fifteen_minutes: f64,
}

fn export_json(metrics: &Metrics) -> Result<String, Box<dyn std::error::Error>> {
    let snapshot = metrics.get_snapshot()?;
    
    let export = MetricsExport {
        queries_total: metrics.get_queries_num(),
        errors_total: metrics.get_errors_num(),
        retries_total: metrics.get_retries_num(),
        connections: metrics.get_total_connections(),
        latency_ms: LatencyStats {
            min: snapshot.min,
            max: snapshot.max,
            mean: snapshot.mean,
            p50: snapshot.median,
            p95: snapshot.percentile_95,
            p99: snapshot.percentile_99,
            p999: snapshot.percentile_99_9,
        },
        rate_qps: RateStats {
            mean: metrics.get_mean_rate(),
            one_minute: metrics.get_one_minute_rate(),
            five_minutes: metrics.get_five_minute_rate(),
            fifteen_minutes: metrics.get_fifteen_minute_rate(),
        },
    };
    
    Ok(serde_json::to_string_pretty(&export)?)
}
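
A simple way to use this exporter is to dump the JSON to a file on an interval so an external agent can pick it up. This sketch assumes tokio with the time and fs features; the path and period are placeholders:

use std::time::Duration;
use scylla::Session;

async fn dump_metrics_periodically(session: &Session) -> Result<(), Box<dyn std::error::Error>> {
    let mut interval = tokio::time::interval(Duration::from_secs(30));
    loop {
        interval.tick().await;

        let metrics = session.get_metrics();
        let json = export_json(&metrics)?;

        // Placeholder path; point it wherever your collector expects the file
        tokio::fs::write("/tmp/scylla_metrics.json", json).await?;
    }
}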

Best Practices

  • Enable metrics in production for observability
  • Monitor p99 and p999 latencies, not just averages
  • Track error rates and retry ratios
  • Set up alerts for connection timeouts
  • Export metrics to monitoring systems (Prometheus, Grafana, etc.)
  • Consider metrics when tuning policies
  • Use request rate to detect traffic anomalies
  • Check metrics before and after configuration changes

Performance Impact

Metrics collection has minimal overhead:
  • Lock-free atomic operations
  • Efficient histogram implementation
  • ~1.7 MiB memory per session
  • Negligible CPU impact

Limitations

  • Histogram maximum value: 65,535ms
  • No per-query or per-statement metrics
  • No per-node breakdown (aggregate only)
  • Counters are cumulative; resetting them requires recreating the session (see the delta sketch below)

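Because the counters are cumulative for the lifetime of the session (see the last limitation above), per-interval statistics have to be derived by differencing two successive readings. A minimal sketch of that technique:

// Assumes Metrics is exported next to Snapshot and MetricsError, as in the examples above
use scylla::observability::metrics::Metrics;

struct CounterSample {
    queries: u64,
    errors: u64,
}

fn take_sample(metrics: &Metrics) -> CounterSample {
    CounterSample {
        queries: metrics.get_queries_num(),
        errors: metrics.get_errors_num(),
    }
}

// Error rate over the window between two samples, rather than since session start
fn window_error_rate(prev: &CounterSample, curr: &CounterSample) -> f64 {
    let queries = curr.queries.saturating_sub(prev.queries);
    let errors = curr.errors.saturating_sub(prev.errors);
    if queries == 0 { 0.0 } else { errors as f64 / queries as f64 }
}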