Skip to main content

zebrad/components/tracing/
otel.rs

1//! OpenTelemetry tracing layer with zero overhead when disabled.
2//!
3//! This module provides OpenTelemetry distributed tracing support that can be
4//! compiled into production builds but only activated at runtime when an
5//! endpoint is configured.
6//!
7//! # Transport
8//!
9//! Uses HTTP transport with a blocking reqwest client. This works without an
10//! async runtime context because:
11//! - The BatchSpanProcessor spawns its own dedicated background thread
12//! - The reqwest-blocking-client handles HTTP exports synchronously
13//!
14//! This is important because Zebra's tracing component initializes before the
15//! Tokio runtime starts.
16
17use opentelemetry::trace::TracerProvider as _;
18use opentelemetry_otlp::{Protocol, WithExportConfig};
19use opentelemetry_sdk::{
20    trace::{RandomIdGenerator, Sampler, SdkTracerProvider},
21    Resource,
22};
23use tracing::Subscriber;
24use tracing_opentelemetry::OpenTelemetryLayer;
25use tracing_subscriber::{registry::LookupSpan, Layer};
26
27/// Error type for OpenTelemetry layer initialization.
28pub type OtelError = Box<dyn std::error::Error + Send + Sync + 'static>;
29
30/// Creates an OpenTelemetry layer if an endpoint is configured.
31///
32/// Returns `(None, None)` with ZERO overhead when endpoint is `None` -
33/// no SDK objects are created, no background tasks are spawned.
34///
35/// # Arguments
36///
37/// * `endpoint` - OTLP HTTP endpoint URL (e.g., "http://localhost:4318")
38/// * `service_name` - Service name for traces (defaults to "zebra")
39/// * `sample_percent` - Sampling percentage between 0 and 100 (defaults to 100)
40///
41/// # Errors
42///
43/// Returns an error if the OTLP exporter fails to initialize.
44pub fn layer<S>(
45    endpoint: Option<&str>,
46    service_name: Option<&str>,
47    sample_percent: Option<u8>,
48) -> Result<(Option<impl Layer<S>>, Option<SdkTracerProvider>), OtelError>
49where
50    S: Subscriber + for<'span> LookupSpan<'span>,
51{
52    // CRITICAL: Check config FIRST - zero-cost path when None
53    let endpoint = match endpoint {
54        Some(ep) => ep,
55        None => return Ok((None, None)), // No SDK objects created
56    };
57
58    // HTTP transport requires the /v1/traces path suffix.
59    // Append it if not already present.
60    let endpoint = if endpoint.ends_with("/v1/traces") {
61        endpoint.to_string()
62    } else {
63        format!("{}/v1/traces", endpoint.trim_end_matches('/'))
64    };
65
66    let service_name = service_name.unwrap_or("zebra");
67    // Convert percentage (0-100) to rate (0.0-1.0), clamped to valid range
68    let sample_rate = f64::from(sample_percent.unwrap_or(100).min(100)) / 100.0;
69
70    // Build the HTTP exporter with blocking client.
71    // This works without an async runtime because:
72    // 1. reqwest-blocking-client doesn't need tokio
73    // 2. BatchSpanProcessor spawns its own background thread for exports
74    let exporter = opentelemetry_otlp::SpanExporter::builder()
75        .with_http()
76        .with_protocol(Protocol::HttpBinary)
77        .with_endpoint(&endpoint)
78        .build()?;
79
80    // Use ratio-based sampling for production flexibility
81    let sampler = if sample_rate >= 1.0 {
82        Sampler::AlwaysOn
83    } else if sample_rate <= 0.0 {
84        Sampler::AlwaysOff
85    } else {
86        Sampler::TraceIdRatioBased(sample_rate)
87    };
88
89    let resource = Resource::builder()
90        .with_service_name(service_name.to_owned())
91        .build();
92
93    // Use batch exporter - it spawns its own dedicated background thread
94    // for collecting and exporting spans, so it doesn't need an external runtime
95    let provider = SdkTracerProvider::builder()
96        .with_batch_exporter(exporter)
97        .with_sampler(sampler)
98        .with_id_generator(RandomIdGenerator::default())
99        .with_resource(resource)
100        .build();
101
102    let tracer = provider.tracer(service_name.to_owned());
103    let layer = OpenTelemetryLayer::new(tracer);
104
105    Ok((Some(layer), Some(provider)))
106}