feat(obs): integrate dial9-tokio-telemetry for runtime tracing (#2285)

Co-authored-by: heihutu <heihutu@gmail.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: houseme <4829346+houseme@users.noreply.github.com>
This commit is contained in:
houseme
2026-03-25 14:23:58 +08:00
committed by GitHub
parent 2681731443
commit fb2ced4d27
22 changed files with 1300 additions and 10 deletions

View File

@@ -89,6 +89,61 @@ export RUSTFS_RUNTIME_THREAD_STACK_SIZE=1024*1024
# RUSTFS_RUNTIME_* values, assigned first and then exported together so that
# processes started from this shell inherit them.
# NOTE(review): units/meaning are defined by the consumer (presumably the Tokio
# runtime builder) — confirm there before changing these numbers.
RUSTFS_RUNTIME_THREAD_KEEP_ALIVE="60"
RUSTFS_RUNTIME_GLOBAL_QUEUE_INTERVAL="31"
export RUSTFS_RUNTIME_THREAD_KEEP_ALIVE RUSTFS_RUNTIME_GLOBAL_QUEUE_INTERVAL
# ============================================================================
# dial9 Tokio Runtime Telemetry Configuration
# ============================================================================
# dial9 provides low-overhead Tokio runtime-level telemetry for performance diagnostics.
# It captures events like PollStart/End, WorkerPark/Unpark, QueueSample, TaskSpawn.
#
# Features:
# - CPU overhead < 5%, even at sampling rate 1.0 (100% sampling)
# - Automatic file rotation (configurable size and count)
# - Graceful degradation if initialization fails
#
# Note: Disabled by default. Enable only when needed for runtime diagnostics.
# Note: Requires build flag --cfg tokio_unstable (set in .cargo/config.toml).
# Enable dial9 telemetry (default: false)
#export RUSTFS_RUNTIME_DIAL9_ENABLED=true
# Output directory for trace files (default: /var/log/rustfs/telemetry)
#export RUSTFS_RUNTIME_DIAL9_OUTPUT_DIR="$current_dir/deploy/telemetry"
# Trace file prefix (default: rustfs-tokio)
#export RUSTFS_RUNTIME_DIAL9_FILE_PREFIX=rustfs-tokio
# Maximum trace file size in bytes (default: 104857600 = 100MB)
#export RUSTFS_RUNTIME_DIAL9_MAX_FILE_SIZE=104857600
# Number of rotated files to keep (default: 10)
#export RUSTFS_RUNTIME_DIAL9_ROTATION_COUNT=10
# Sampling rate: 0.0 to 1.0 (default: 1.0 = 100% sampling)
# Lower values reduce CPU overhead. Recommended: 0.1-0.5 for production.
#export RUSTFS_RUNTIME_DIAL9_SAMPLING_RATE=1.0
# S3 upload settings (not yet implemented; reserved for future use):
#export RUSTFS_RUNTIME_DIAL9_S3_BUCKET=my-trace-bucket
#export RUSTFS_RUNTIME_DIAL9_S3_PREFIX=telemetry/
# --- Scenario 1: Development / Debugging ---
# Full tracing with local storage, high sampling rate
#export RUSTFS_RUNTIME_DIAL9_ENABLED=true
#export RUSTFS_RUNTIME_DIAL9_OUTPUT_DIR="$current_dir/deploy/telemetry"
#export RUSTFS_RUNTIME_DIAL9_SAMPLING_RATE=1.0
# --- Scenario 2: Production Diagnostics ---
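# Reduced sampling rate to minimize overhead; with no output directory set,
# trace files go to the default location (/var/log/rustfs/telemetry).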
#export RUSTFS_RUNTIME_DIAL9_ENABLED=true
#export RUSTFS_RUNTIME_DIAL9_SAMPLING_RATE=0.1
# --- Scenario 3: Performance Investigation ---
# Short-term tracing with high detail; remove the output directory below
# manually once the investigation is finished.
#export RUSTFS_RUNTIME_DIAL9_ENABLED=true
#export RUSTFS_RUNTIME_DIAL9_OUTPUT_DIR=/tmp/rustfs-telemetry-investigation
#export RUSTFS_RUNTIME_DIAL9_SAMPLING_RATE=1.0
#export RUSTFS_RUNTIME_DIAL9_ROTATION_COUNT=3
# OpenTelemetry instrumentation identity: name, version, and schema URL.
# Assigned first, then exported in one statement so child processes inherit them.
OTEL_INSTRUMENTATION_NAME='rustfs'
OTEL_INSTRUMENTATION_VERSION='0.1.1'
OTEL_INSTRUMENTATION_SCHEMA_URL='https://opentelemetry.io/schemas/1.31.0'
export OTEL_INSTRUMENTATION_NAME OTEL_INSTRUMENTATION_VERSION OTEL_INSTRUMENTATION_SCHEMA_URL