zebrad/components/tracing/otel.rs
1//! OpenTelemetry tracing layer with zero overhead when disabled.
2//!
3//! This module provides OpenTelemetry distributed tracing support that can be
4//! compiled into production builds but only activated at runtime when an
5//! endpoint is configured.
6//!
7//! # Transport
8//!
9//! Uses HTTP transport with a blocking reqwest client. This works without an
10//! async runtime context because:
11//! - The BatchSpanProcessor spawns its own dedicated background thread
12//! - The reqwest-blocking-client handles HTTP exports synchronously
13//!
14//! This is important because Zebra's tracing component initializes before the
15//! Tokio runtime starts.
16
17use opentelemetry::trace::TracerProvider as _;
18use opentelemetry_otlp::{Protocol, WithExportConfig};
19use opentelemetry_sdk::{
20 trace::{RandomIdGenerator, Sampler, SdkTracerProvider},
21 Resource,
22};
23use tracing::Subscriber;
24use tracing_opentelemetry::OpenTelemetryLayer;
25use tracing_subscriber::{registry::LookupSpan, Layer};
26
27/// Boxed dynamic error (`Send + Sync + 'static`) used as the error type of [`layer`].
28pub type OtelError = Box<dyn std::error::Error + Send + Sync + 'static>;
29
30/// Creates an OpenTelemetry layer if an endpoint is configured.
31///
32/// Returns `(None, None)` with ZERO overhead when endpoint is `None` -
33/// no SDK objects are created, no background tasks are spawned.
34///
35/// # Arguments
36///
37/// * `endpoint` - OTLP HTTP endpoint URL (e.g., "http://localhost:4318")
38/// * `service_name` - Service name for traces (defaults to "zebra")
39/// * `sample_percent` - Sampling percentage between 0 and 100 (defaults to 100)
40///
41/// # Errors
42///
43/// Returns an error if the OTLP exporter fails to initialize.
44pub fn layer<S>(
45 endpoint: Option<&str>,
46 service_name: Option<&str>,
47 sample_percent: Option<u8>,
48) -> Result<(Option<impl Layer<S>>, Option<SdkTracerProvider>), OtelError>
49where
50 S: Subscriber + for<'span> LookupSpan<'span>,
51{
52 // CRITICAL: Check config FIRST - zero-cost path when None
53 let endpoint = match endpoint {
54 Some(ep) => ep,
55 None => return Ok((None, None)), // No SDK objects created
56 };
57
58 // HTTP transport requires the /v1/traces path suffix.
59 // Append it if not already present.
60 let endpoint = if endpoint.ends_with("/v1/traces") {
61 endpoint.to_string()
62 } else {
63 format!("{}/v1/traces", endpoint.trim_end_matches('/'))
64 };
65
66 let service_name = service_name.unwrap_or("zebra");
67 // Convert percentage (0-100) to rate (0.0-1.0), clamped to valid range
68 let sample_rate = f64::from(sample_percent.unwrap_or(100).min(100)) / 100.0;
69
70 // Build the HTTP exporter with blocking client.
71 // This works without an async runtime because:
72 // 1. reqwest-blocking-client doesn't need tokio
73 // 2. BatchSpanProcessor spawns its own background thread for exports
74 let exporter = opentelemetry_otlp::SpanExporter::builder()
75 .with_http()
76 .with_protocol(Protocol::HttpBinary)
77 .with_endpoint(&endpoint)
78 .build()?;
79
80 // Use ratio-based sampling for production flexibility
81 let sampler = if sample_rate >= 1.0 {
82 Sampler::AlwaysOn
83 } else if sample_rate <= 0.0 {
84 Sampler::AlwaysOff
85 } else {
86 Sampler::TraceIdRatioBased(sample_rate)
87 };
88
89 let resource = Resource::builder()
90 .with_service_name(service_name.to_owned())
91 .build();
92
93 // Use batch exporter - it spawns its own dedicated background thread
94 // for collecting and exporting spans, so it doesn't need an external runtime
95 let provider = SdkTracerProvider::builder()
96 .with_batch_exporter(exporter)
97 .with_sampler(sampler)
98 .with_id_generator(RandomIdGenerator::default())
99 .with_resource(resource)
100 .build();
101
102 let tracer = provider.tracer(service_name.to_owned());
103 let layer = OpenTelemetryLayer::new(tracer);
104
105 Ok((Some(layer), Some(provider)))
106}