fix(obs): Update observability docker compose stack (#2010)

2026-05-06 22:28:16 +08:00 · 2026-02-28 14:03:50 -05:00
parent fe884eabfc
commit fd1b903531
13 changed files with 319 additions and 161 deletions
--- a/.docker/observability/README.md
+++ b/.docker/observability/README.md
@@ -15,28 +15,28 @@ The stack is composed of the following best-in-class open-source components:

 ## Architecture

-1.  **Telemetry Collection**: Applications send OTLP (OpenTelemetry Protocol) data (Metrics, Logs, Traces) to the **OpenTelemetry Collector**.
-2.  **Processing & Exporting**: The Collector processes the data (batching, memory limiting) and exports it to the respective backends:
-    -   **Traces** -> **Tempo** (Primary) & **Jaeger** (Secondary/Optional)
-    -   **Metrics** -> **Prometheus** (via scraping the Collector's exporter)
-    -   **Logs** -> **Loki**
-3.  **Visualization**: **Grafana** connects to all backends (Prometheus, Tempo, Loki, Jaeger) to provide a unified dashboard experience.
+1. **Telemetry Collection**: Applications send OTLP (OpenTelemetry Protocol) data (Metrics, Logs, Traces) to the **OpenTelemetry Collector**.
+2. **Processing & Exporting**: The Collector processes the data (batching, memory limiting) and exports it to the respective backends:
+    - **Traces** -> **Tempo** (Primary) & **Jaeger** (Secondary/Optional)
+    - **Metrics** -> **Prometheus** (via scraping the Collector's exporter)
+    - **Logs** -> **Loki**
+3. **Visualization**: **Grafana** connects to all backends (Prometheus, Tempo, Loki, Jaeger) to provide a unified dashboard experience.

 ## Features

-   **Full Persistence**: All data (Metrics, Logs, Traces) is persisted to Docker volumes, ensuring no data loss on restart.
-   **Correlation**: Seamless navigation between Metrics, Logs, and Traces in Grafana.
-    -   Jump from a Metric spike to relevant Traces.
-    -   Jump from a Trace to relevant Logs.
-   **High Performance**: Optimized configurations for batching, compression, and memory management.
-   **Standardized Protocols**: Built entirely on OpenTelemetry standards.
+- **Full Persistence**: All data (Metrics, Logs, Traces) is persisted to Docker volumes, ensuring no data loss on restart.
+- **Correlation**: Seamless navigation between Metrics, Logs, and Traces in Grafana.
+  - Jump from a Metric spike to relevant Traces.
+  - Jump from a Trace to relevant Logs.
+- **High Performance**: Optimized configurations for batching, compression, and memory management.
+- **Standardized Protocols**: Built entirely on OpenTelemetry standards.

 ## Quick Start

 ### Prerequisites

-   Docker
-   Docker Compose
+- Docker
+- Docker Compose

 ### Deploy

@@ -48,12 +48,12 @@ docker compose up -d

 ### Access Dashboards

-| Service | URL | Credentials | Description |
-| :--- | :--- | :--- | :--- |
-| **Grafana** | [http://localhost:3000](http://localhost:3000) | `admin` / `admin` | Main visualization hub. |
-| **Prometheus** | [http://localhost:9090](http://localhost:9090) | - | Metric queries and status. |
-| **Jaeger UI** | [http://localhost:16686](http://localhost:16686) | - | Secondary trace visualization. |
-| **Tempo** | [http://localhost:3200](http://localhost:3200) | - | Tempo status/metrics. |
+| Service        | URL                                              | Credentials       | Description                    |
+| :------------- | :----------------------------------------------- | :---------------- | :----------------------------- |
+| **Grafana**    | [http://localhost:3000](http://localhost:3000)   | `admin` / `admin` | Main visualization hub.        |
+| **Prometheus** | [http://localhost:9090](http://localhost:9090)   | -                 | Metric queries and status.     |
+| **Jaeger UI**  | [http://localhost:16686](http://localhost:16686) | -                 | Secondary trace visualization. |
+| **Tempo**      | [http://localhost:3200](http://localhost:3200)   | -                 | Tempo status/metrics.          |

 ## Configuration

@@ -61,10 +61,10 @@ docker compose up -d

 Data is stored in the following Docker volumes:

-   `prometheus-data`: Prometheus metrics
-   `tempo-data`: Tempo traces (WAL and Blocks)
-   `loki-data`: Loki logs (Chunks and Rules)
-   `jaeger-data`: Jaeger traces (Badger DB)
+- `prometheus-data`: Prometheus metrics
+- `tempo-data`: Tempo traces (WAL and Blocks)
+- `loki-data`: Loki logs (Chunks and Rules)
+- `jaeger-data`: Jaeger traces (Badger DB)

 To clear all data:

@@ -74,12 +74,12 @@ docker compose down -v

 ### Customization

-   **Prometheus**: Edit `prometheus.yml` to add scrape targets or alerting rules.
-   **Grafana**: Dashboards and datasources are provisioned from the `grafana/` directory.
-   **Collector**: Edit `otel-collector-config.yaml` to modify pipelines, processors, or exporters.
+- **Prometheus**: Edit `prometheus.yml` to add scrape targets or alerting rules.
+- **Grafana**: Dashboards and datasources are provisioned from the `grafana/` directory.
+- **Collector**: Edit `otel-collector-config.yaml` to modify pipelines, processors, or exporters.

 ## Troubleshooting

-   **Service Health**: Check the health of services using `docker compose ps`.
-   **Logs**: View logs for a specific service using `docker compose logs -f <service_name>`.
-   **Otel Collector**: Check `http://localhost:13133` for health status and `http://localhost:1888/debug/pprof/` for profiling.
+- **Service Health**: Check the health of services using `docker compose ps`.
+- **Logs**: View logs for a specific service using `docker compose logs -f <service_name>`.
+- **Otel Collector**: Check `http://localhost:13133` for health status and `http://localhost:1888/debug/pprof/` for profiling.
--- a/.docker/observability/docker-compose.yml
+++ b/.docker/observability/docker-compose.yml
@@ -13,93 +13,149 @@
 # limitations under the License.

 services:
+  rustfs:
+    security_opt:
+      - "no-new-privileges:true"
+    image: rustfs/rustfs:latest
+    container_name: rustfs-server
+    ports:
+      - "9000:9000" # S3 API port
+      - "9001:9001" # Console port
+    environment:
+      - RUSTFS_VOLUMES=/data/rustfs
+      - RUSTFS_ADDRESS=0.0.0.0:9000
+      - RUSTFS_CONSOLE_ADDRESS=0.0.0.0:9001
+      - RUSTFS_CONSOLE_ENABLE=true
+      - RUSTFS_CORS_ALLOWED_ORIGINS=*
+      - RUSTFS_CONSOLE_CORS_ALLOWED_ORIGINS=*
+      - RUSTFS_ACCESS_KEY=rustfsadmin
+      - RUSTFS_SECRET_KEY=rustfsadmin
+      - RUSTFS_OBS_LOGGER_LEVEL=info
+      - RUSTFS_OBS_ENDPOINT=http://otel-collector:4318
+    volumes:
+      - rustfs-data:/data/rustfs
+    networks:
+      - otel-network
+    restart: unless-stopped
+    healthcheck:
+      test:
+        [
+          "CMD",
+          "sh",
+          "-c",
+          "curl -f http://127.0.0.1:9000/health && curl -f http://127.0.0.1:9001/rustfs/console/health",
+        ]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
+    depends_on:
+      otel-collector:
+        condition: service_started
+
+  rustfs-init:
+    image: alpine
+    container_name: rustfs-init
+    volumes:
+      - rustfs-data:/data
+    networks:
+      - otel-network
+    command: >
+      sh -c "
+        chown -R 10001:10001 /data &&
+        echo 'Volume Permissions fixed' &&
+        exit 0
+      "
+    restart: no

  # --- Tracing ---

-  tempo-init:
-    image: busybox:latest
-    command: [ "sh", "-c", "chown -R 10001:10001 /var/tempo" ]
-    volumes:
-      - ./tempo-data:/var/tempo
-    user: root
-    networks:
-      - otel-network
-    restart: "no"
-
  tempo:
    image: grafana/tempo:latest
-    user: "10001"
-    command: [ "-config.file=/etc/tempo.yaml" ]
+    container_name: tempo
+    command: ["-config.file=/etc/tempo.yaml"]
    volumes:
      - ./tempo.yaml:/etc/tempo.yaml:ro
-      - ./tempo-data:/var/tempo
+      - tempo-data:/var/tempo
    ports:
      - "3200:3200" # tempo
      - "4317"      # otlp grpc
      - "4318"      # otlp http
-    restart: unless-stopped
    networks:
      - otel-network
+    restart: unless-stopped
+    depends_on:
+      - redpanda
    healthcheck:
-      test: [ "CMD-SHELL", "wget --spider -q http://localhost:3200/metrics || exit 1" ]
+      test: ["CMD", "wget", "--spider", "-q", "http://localhost:3200/ready"]
      interval: 10s
      timeout: 5s
-      retries: 5
-      start_period: 40s
+      retries: 3
+      start_period: 15s
+
+  redpanda:
+    image: redpandadata/redpanda:latest # for tempo ingest
+    container_name: redpanda
+    ports:
+      - "9092:9092"
+    networks:
+      - otel-network
+    restart: unless-stopped
+    command: >
+      redpanda start --overprovisioned
+      --mode=dev-container
+      --kafka-addr=PLAINTEXT://0.0.0.0:9092
+      --advertise-kafka-addr=PLAINTEXT://redpanda:9092

  jaeger:
    image: jaegertracing/jaeger:latest
+    container_name: jaeger
    environment:
-      - TZ=Asia/Shanghai
      - SPAN_STORAGE_TYPE=badger
      - BADGER_EPHEMERAL=false
      - BADGER_DIRECTORY_VALUE=/badger/data
      - BADGER_DIRECTORY_KEY=/badger/key
      - COLLECTOR_OTLP_ENABLED=true
    volumes:
-      - ./jaeger-data:/badger
+      - ./jaeger.yaml:/etc/jaeger/config.yml
+      - jaeger-data:/badger
    ports:
      - "16686:16686" # Web UI
      - "14269:14269" # Admin/Metrics
-      - "4317"
-      - "4318"
+      - "4317" # otlp grpc
+      - "4318" # otlp http
+    command: ["--config", "/etc/jaeger/config.yml"]
    networks:
      - otel-network
+    restart: unless-stopped
    healthcheck:
-      test: [ "CMD-SHELL", "wget --spider -q http://localhost:14269 || exit 1" ]
+      test: ["CMD", "wget", "--spider", "-q", "http://localhost:14269"]
      interval: 10s
      timeout: 5s
-      retries: 5
-      start_period: 20s
+      retries: 3
+      start_period: 15s

  # --- Metrics ---

  prometheus:
    image: prom/prometheus:latest
-    environment:
-      - TZ=Asia/Shanghai
+    container_name: prometheus
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
-      - ./prometheus-data:/prometheus
+      - prometheus-data:/prometheus
    ports:
      - "9090:9090"
    command:
-      - '--config.file=/etc/prometheus/prometheus.yml'
-      - '--web.enable-otlp-receiver'
-      - '--web.enable-remote-write-receiver'
-      - '--enable-feature=promql-experimental-functions'
-      - '--storage.tsdb.min-block-duration=2h'
-      - '--storage.tsdb.max-block-duration=2h'
-      - '--log.level=info'
-      - '--storage.tsdb.retention.time=30d'
-      - '--storage.tsdb.path=/prometheus'
-      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
-      - '--web.console.templates=/usr/share/prometheus/consoles'
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--web.enable-otlp-receiver" # Enable OTLP
+      - "--web.enable-remote-write-receiver" # Enable remote write
+      - "--enable-feature=promql-experimental-functions" # Enable info()
+      - "--storage.tsdb.retention.time=30d"
    restart: unless-stopped
    networks:
      - otel-network
    healthcheck:
-      test: [ "CMD-SHELL", "wget --spider -q http://localhost:9090/-/healthy || exit 1" ]
+      test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
      interval: 10s
      timeout: 5s
      retries: 3
@@ -108,18 +164,18 @@ services:

  loki:
    image: grafana/loki:latest
-    environment:
-      - TZ=Asia/Shanghai
+    container_name: loki
    volumes:
-      - ./loki-config.yaml:/etc/loki/local-config.yaml:ro
-      - ./loki-data:/loki
+      - ./loki.yaml:/etc/loki/loki.yaml:ro
+      - loki-data:/loki
    ports:
      - "3100:3100"
-    command: -config.file=/etc/loki/local-config.yaml
+    command: -config.file=/etc/loki/loki.yaml
    networks:
      - otel-network
+    restart: unless-stopped
    healthcheck:
-      test: [ "CMD-SHELL", "wget --spider -q http://localhost:3100/metrics || exit 1" ]
+      test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
      interval: 15s
      timeout: 10s
      retries: 5
@@ -129,8 +185,6 @@ services:

  otel-collector:
    image: otel/opentelemetry-collector-contrib:latest
-    environment:
-      - TZ=Asia/Shanghai
    volumes:
      - ./otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml:ro
    ports:
@@ -143,52 +197,51 @@ services:
      - "55679:55679" # zpages
    networks:
      - otel-network
+    restart: unless-stopped
    depends_on:
      - tempo
      - jaeger
      - prometheus
      - loki
    healthcheck:
-      test: [ "CMD-SHELL", "wget --spider -q http://localhost:13133 || exit 1" ]
+      test: ["CMD", "wget", "--spider", "-q", "http://localhost:13133"]
      interval: 10s
      timeout: 5s
      retries: 3
-      start_period: 20s

  # --- Visualization ---

  grafana:
    image: grafana/grafana:latest
+    container_name: grafana
    ports:
      - "3000:3000"
-    volumes:
-      - ./grafana/provisioning:/etc/grafana/provisioning
-      - ./grafana/dashboards:/var/lib/grafana/dashboards
-      - ./grafana-data:/var/lib/grafana
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_SECURITY_ADMIN_USER=admin
-      - TZ=Asia/Shanghai
-      - GF_INSTALL_PLUGINS=grafana-pyroscope-datasource
-      - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/home.json
-    restart: unless-stopped
+    volumes:
+      - ./grafana/provisioning:/etc/grafana/provisioning:ro
+      - ./grafana/dashboards:/etc/grafana/dashboards:ro
+      - grafana-data:/var/lib/grafana
    networks:
      - otel-network
+    restart: unless-stopped
    depends_on:
      - prometheus
      - tempo
      - loki
    healthcheck:
-      test: [ "CMD-SHELL", "wget --spider -q http://localhost:3000/api/health || exit 1" ]
+      test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health"]
      interval: 10s
      timeout: 5s
      retries: 3

 volumes:
-  prometheus-data:
+  rustfs-data:
  tempo-data:
-  loki-data:
  jaeger-data:
+  prometheus-data:
+  loki-data:
  grafana-data:

 networks:
--- a/.docker/observability/grafana-data/.gitignore
+++ b/.docker/observability/grafana-data/.gitignore
@@ -1 +0,0 @@
-*
--- a/.docker/observability/grafana/provisioning/dashboards/dashboard.yml
+++ b/.docker/observability/grafana/provisioning/dashboards/dashboard.yml
@@ -0,0 +1,11 @@
+apiVersion: 1
+
+providers:
+  - name: "default"
+    orgId: 1
+    folder: ""
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 10
+    options:
+      path: /etc/grafana/dashboards
--- a/.docker/observability/grafana/provisioning/datasources/datasources.yaml
+++ b/.docker/observability/grafana/provisioning/datasources/datasources.yaml
@@ -0,0 +1,78 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    uid: prometheus
+    url: http://prometheus:9090
+    access: proxy
+    isDefault: true
+    editable: false
+    jsonData:
+      httpMethod: GET
+      exemplarTraceIdDestinations:
+        - name: trace_id
+          datasourceUid: tempo
+
+  - name: Tempo
+    type: tempo
+    uid: tempo
+    access: proxy
+    url: http://tempo:3200
+    isDefault: false
+    editable: false
+    jsonData:
+      httpMethod: GET
+      serviceMap:
+        datasourceUid: prometheus
+      tracesToLogs:
+        datasourceUid: loki
+        tags: [ 'job', 'instance', 'pod', 'namespace', 'service.name' ]
+        mappedTags: [ { key: 'service.name', value: 'app' } ]
+        spanStartTimeShift: '-1h'
+        spanEndTimeShift: '1h'
+        filterByTraceID: true
+        filterBySpanID: false
+      tracesToMetrics:
+        datasourceUid: prometheus
+        tags: [ { key: 'service.name' }, { key: 'job' } ]
+        queries:
+          - name: 'Service-Level Latency'
+            query: 'sum(rate(traces_spanmetrics_latency_bucket{$$__tags}[5m])) by (le)'
+          - name: 'Service-Level Calls'
+            query: 'sum(rate(traces_spanmetrics_calls_total{$$__tags}[5m]))'
+          - name: 'Service-Level Errors'
+            query: 'sum(rate(traces_spanmetrics_calls_total{status_code="ERROR", $$__tags}[5m]))'
+      nodeGraph:
+        enabled: true
+
+  - name: Loki
+    type: loki
+    uid: loki
+    url: http://loki:3100
+    basicAuth: false
+    isDefault: false
+    editable: false
+    jsonData:
+      derivedFields:
+        - datasourceUid: tempo
+          matcherRegex: 'trace_id=(\w+)'
+          name: 'TraceID'
+          url: '$${__value.raw}'
+
+  - name: Jaeger
+    type: jaeger
+    uid: jaeger
+    url: http://jaeger:16686
+    access: proxy
+    isDefault: false
+    editable: false
+    jsonData:
+      tracesToLogs:
+        datasourceUid: loki
+        tags: [ 'job', 'instance', 'pod', 'namespace', 'service.name' ]
+        mappedTags: [ { key: 'service.name', value: 'app' } ]
+        spanStartTimeShift: '1s'
+        spanEndTimeShift: '-1s'
+        filterByTraceID: true
+        filterBySpanID: false
--- a/.docker/observability/jaeger-data/.gitignore
+++ b/.docker/observability/jaeger-data/.gitignore
@@ -1 +0,0 @@
-*
--- a/.docker/observability/jaeger-config.yaml
+++ b/.docker/observability/jaeger-config.yaml
@@ -13,12 +13,15 @@
 # limitations under the License.

 service:
-  extensions: [ jaeger_storage, jaeger_query, remote_sampling, healthcheckv2 ]
+  extensions: [jaeger_storage, jaeger_query]
  pipelines:
    traces:
-      receivers: [ otlp, jaeger, zipkin ]
-      processors: [ batch, adaptive_sampling ]
-      exporters: [ jaeger_storage_exporter ]
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [jaeger_storage_exporter, spanmetrics]
+    metrics/spanmetrics:
+      receivers: [spanmetrics]
+      exporters: [prometheus]
  telemetry:
    resource:
      service.name: jaeger
@@ -31,60 +34,41 @@ service:
                host: 0.0.0.0
                port: 8888
    logs:
-      level: info
+      level: DEBUG

 extensions:
-  healthcheckv2:
-    use_v2: true
-    http:
-
  jaeger_query:
    storage:
-      traces: badger_store
-    ui:
-      config_file: ./cmd/jaeger/config-ui.json
-      log_access: true
-    max_clock_skew_adjust: 0s
-    grpc:
-      endpoint: 0.0.0.0:16685
-    http:
-      endpoint: 0.0.0.0:16686
-
+      traces: some_storage
+      metrics: some_metrics_storage
  jaeger_storage:
    backends:
-      badger_store:
-        badger:
-          ephemeral: false
-          directory_key: /badger/key
-          directory_value: /badger/data
-          span_store_ttl: 72h
+      some_storage:
+        memory:
+          max_traces: 100000
+    metric_backends:
+      some_metrics_storage:
+        prometheus:
+          endpoint: http://prometheus:9090
+          normalize_calls: true
+          normalize_duration: true

-  remote_sampling:
-    adaptive:
-      sampling_store: badger_store
-      initial_sampling_probability: 0.1
-    http:
-    grpc:
+connectors:
+  spanmetrics:

 receivers:
  otlp:
    protocols:
      grpc:
+        endpoint: "0.0.0.0:4317"
      http:
-
-  jaeger:
-    protocols:
-      grpc:
-      thrift_binary:
-      thrift_compact:
-      thrift_http:
-
-  zipkin:
+        endpoint: "0.0.0.0:4318"

 processors:
  batch:
-  adaptive_sampling:

 exporters:
  jaeger_storage_exporter:
-    trace_storage: badger_store
+    trace_storage: some_storage
+  prometheus:
+    endpoint: "0.0.0.0:8889"
--- a/.docker/observability/loki-data/.gitignore
+++ b/.docker/observability/loki-data/.gitignore
@@ -1 +0,0 @@
-*
--- a/.docker/observability/loki-config.yaml
+++ b/.docker/observability/loki-config.yaml
@@ -11,12 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 auth_enabled: false

 server:
  http_listen_port: 3100
-  grpc_listen_port: 9096
+  grpc_listen_port: 9095
  log_level: info
  grpc_server_max_concurrent_streams: 1000

@@ -39,12 +38,6 @@ query_range:
        enabled: true
        max_size_mb: 100

-limits_config:
-  metric_aggregation_enabled: true
-  max_line_size: 256KB
-  max_line_size_truncate: false
-  allow_structured_metadata: true
-
 schema_config:
  configs:
    - from: 2020-10-24
@@ -54,15 +47,17 @@ schema_config:
      index:
        prefix: index_
        period: 24h
-      row_shards: 16
+
+limits_config:
+  reject_old_samples: true
+  reject_old_samples_max_age: 168h
+  allow_structured_metadata: true
+  max_line_size: 256KB

 pattern_ingester:
  enabled: true
  metric_aggregation:
    loki_address: localhost:3100

-ruler:
-  alertmanager_url: http://localhost:9093
-
 frontend:
  encoding: protobuf
--- a/.docker/observability/prometheus-data/.gitignore
+++ b/.docker/observability/prometheus-data/.gitignore
@@ -1 +0,0 @@
-*
--- a/.docker/observability/tempo-data/.gitignore
+++ b/.docker/observability/tempo-data/.gitignore
@@ -1 +0,0 @@
-*
--- a/.docker/observability/tempo.yaml
+++ b/.docker/observability/tempo.yaml
@@ -12,21 +12,44 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+partition_ring_live_store: true
+stream_over_http_enabled: true
+
 server:
  http_listen_port: 3200
  log_level: info

 distributor:
+  ingester_write_path_enabled: false
+  kafka_write_path_enabled: true
  receivers:
    otlp:
      protocols:
        grpc:
-          endpoint: "0.0.0.0:4317"
+          endpoint: "tempo:4317"
        http:
-          endpoint: "0.0.0.0:4318"
+          endpoint: "tempo:4318"
+  #log_received_spans:
+  #    enabled: true
+  #  log_discarded_spans:
+  #    enabled: true

-ingester:
-  max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally
+backend_scheduler:
+  provider:
+    compaction:
+      compaction:
+        block_retention: 1h
+
+backend_worker:
+  backend_scheduler_addr: localhost:3200
+  compaction:
+    block_retention: 1h
+  ring:
+    kvstore:
+      store: memberlist
+
+querier:
+  query_live_store: true

 metrics_generator:
  registry:
@@ -38,13 +61,34 @@ metrics_generator:
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
-  traces_storage:
-    path: /var/tempo/generator/traces
+
+query_frontend:
+  rf1_after: "1999-01-01T00:00:00Z"
+  mcp_server:
+    enabled: true

 storage:
  trace:
-    backend: local # backend configuration to use
+    backend: local
    wal:
-      path: /var/tempo/wal # where to store the wal locally
+      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks
+
+overrides:
+  defaults:
+    metrics_generator:
+      processors: ["span-metrics", "service-graphs", "local-blocks"]
+      generate_native_histograms: both
+
+ingest:
+  enabled: true
+  kafka:
+    address: redpanda:9092
+    topic: tempo-ingest
+
+block_builder:
+  consume_cycle_duration: 30s
+
+usage_report:
+  reporting_enabled: false
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -195,9 +195,7 @@ services:
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
-      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
-      - "--web.console.templates=/usr/share/prometheus/consoles"
-      - "--storage.tsdb.retention.time=200h"
+      - "--storage.tsdb.retention.time=30d"
      - "--web.enable-lifecycle"
      - "--web.enable-otlp-receiver"
      - "--web.enable-remote-write-receiver"
@@ -236,7 +234,7 @@ services:
    volumes:
      - grafana_data:/var/lib/grafana
      - ./.docker/observability/grafana/provisioning:/etc/grafana/provisioning:ro
-      - ./.docker/observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
+      - ./.docker/observability/grafana/dashboards:/etc/grafana/dashboards:ro
    networks:
      - rustfs-network
    restart: unless-stopped