fix(obs): Update observability docker compose stack (#2010)

This commit is contained in:
evan slack
2026-02-28 14:03:50 -05:00
committed by GitHub
parent fe884eabfc
commit fd1b903531
13 changed files with 319 additions and 161 deletions

View File

@@ -15,28 +15,28 @@ The stack is composed of the following best-in-class open-source components:
## Architecture
1. **Telemetry Collection**: Applications send OTLP (OpenTelemetry Protocol) data (Metrics, Logs, Traces) to the **OpenTelemetry Collector**.
2. **Processing & Exporting**: The Collector processes the data (batching, memory limiting) and exports it to the respective backends:
- **Traces** -> **Tempo** (Primary) & **Jaeger** (Secondary/Optional)
- **Metrics** -> **Prometheus** (via scraping the Collector's exporter)
- **Logs** -> **Loki**
3. **Visualization**: **Grafana** connects to all backends (Prometheus, Tempo, Loki, Jaeger) to provide a unified dashboard experience.
1. **Telemetry Collection**: Applications send OTLP (OpenTelemetry Protocol) data (Metrics, Logs, Traces) to the **OpenTelemetry Collector**.
2. **Processing & Exporting**: The Collector processes the data (batching, memory limiting) and exports it to the respective backends:
- **Traces** -> **Tempo** (Primary) & **Jaeger** (Secondary/Optional)
- **Metrics** -> **Prometheus** (via scraping the Collector's exporter)
- **Logs** -> **Loki**
3. **Visualization**: **Grafana** connects to all backends (Prometheus, Tempo, Loki, Jaeger) to provide a unified dashboard experience.
## Features
- **Full Persistence**: All data (Metrics, Logs, Traces) is persisted to Docker volumes, ensuring no data loss on restart.
- **Correlation**: Seamless navigation between Metrics, Logs, and Traces in Grafana.
- Jump from a Metric spike to relevant Traces.
- Jump from a Trace to relevant Logs.
- **High Performance**: Optimized configurations for batching, compression, and memory management.
- **Standardized Protocols**: Built entirely on OpenTelemetry standards.
- **Full Persistence**: All data (Metrics, Logs, Traces) is persisted to Docker volumes, ensuring no data loss on restart.
- **Correlation**: Seamless navigation between Metrics, Logs, and Traces in Grafana.
- Jump from a Metric spike to relevant Traces.
- Jump from a Trace to relevant Logs.
- **High Performance**: Optimized configurations for batching, compression, and memory management.
- **Standardized Protocols**: Built entirely on OpenTelemetry standards.
## Quick Start
### Prerequisites
- Docker
- Docker Compose
- Docker
- Docker Compose
### Deploy
@@ -48,12 +48,12 @@ docker compose up -d
### Access Dashboards
| Service | URL | Credentials | Description |
| :--- | :--- | :--- | :--- |
| **Grafana** | [http://localhost:3000](http://localhost:3000) | `admin` / `admin` | Main visualization hub. |
| **Prometheus** | [http://localhost:9090](http://localhost:9090) | - | Metric queries and status. |
| **Jaeger UI** | [http://localhost:16686](http://localhost:16686) | - | Secondary trace visualization. |
| **Tempo** | [http://localhost:3200](http://localhost:3200) | - | Tempo status/metrics. |
| Service | URL | Credentials | Description |
| :------------- | :----------------------------------------------- | :---------------- | :----------------------------- |
| **Grafana** | [http://localhost:3000](http://localhost:3000) | `admin` / `admin` | Main visualization hub. |
| **Prometheus** | [http://localhost:9090](http://localhost:9090) | - | Metric queries and status. |
| **Jaeger UI** | [http://localhost:16686](http://localhost:16686) | - | Secondary trace visualization. |
| **Tempo** | [http://localhost:3200](http://localhost:3200) | - | Tempo status/metrics. |
## Configuration
@@ -61,10 +61,10 @@ docker compose up -d
Data is stored in the following Docker volumes:
- `prometheus-data`: Prometheus metrics
- `tempo-data`: Tempo traces (WAL and Blocks)
- `loki-data`: Loki logs (Chunks and Rules)
- `jaeger-data`: Jaeger traces (Badger DB)
- `prometheus-data`: Prometheus metrics
- `tempo-data`: Tempo traces (WAL and Blocks)
- `loki-data`: Loki logs (Chunks and Rules)
- `jaeger-data`: Jaeger traces (Badger DB)
To clear all data:
@@ -74,12 +74,12 @@ docker compose down -v
### Customization
- **Prometheus**: Edit `prometheus.yml` to add scrape targets or alerting rules.
- **Grafana**: Dashboards and datasources are provisioned from the `grafana/` directory.
- **Collector**: Edit `otel-collector-config.yaml` to modify pipelines, processors, or exporters.
- **Prometheus**: Edit `prometheus.yml` to add scrape targets or alerting rules.
- **Grafana**: Dashboards and datasources are provisioned from the `grafana/` directory.
- **Collector**: Edit `otel-collector-config.yaml` to modify pipelines, processors, or exporters.
## Troubleshooting
- **Service Health**: Check the health of services using `docker compose ps`.
- **Logs**: View logs for a specific service using `docker compose logs -f <service_name>`.
- **Otel Collector**: Check `http://localhost:13133` for health status and `http://localhost:1888/debug/pprof/` for profiling.
- **Service Health**: Check the health of services using `docker compose ps`.
- **Logs**: View logs for a specific service using `docker compose logs -f <service_name>`.
- **Otel Collector**: Check `http://localhost:13133` for health status and `http://localhost:1888/debug/pprof/` for profiling.

View File

@@ -13,93 +13,149 @@
# limitations under the License.
services:
# RustFS object-storage server.
# Serves the S3 API on 9000 and the web console on 9001, stores its data on
# the rustfs-data volume, and ships telemetry to http://otel-collector:4318.
# NOTE(review): the access/secret keys and the wildcard CORS origins below are
# development defaults; override them before exposing this stack.
rustfs:
  image: rustfs/rustfs:latest
  container_name: rustfs-server
  security_opt:
    - "no-new-privileges:true"
  ports:
    - "9000:9000" # S3 API port
    - "9001:9001" # Console port
  environment:
    - RUSTFS_VOLUMES=/data/rustfs
    - RUSTFS_ADDRESS=0.0.0.0:9000
    - RUSTFS_CONSOLE_ADDRESS=0.0.0.0:9001
    - RUSTFS_CONSOLE_ENABLE=true
    - RUSTFS_CORS_ALLOWED_ORIGINS=*
    - RUSTFS_CONSOLE_CORS_ALLOWED_ORIGINS=*
    - RUSTFS_ACCESS_KEY=rustfsadmin
    - RUSTFS_SECRET_KEY=rustfsadmin
    - RUSTFS_OBS_LOGGER_LEVEL=info
    - RUSTFS_OBS_ENDPOINT=http://otel-collector:4318
  volumes:
    - rustfs-data:/data/rustfs
  networks:
    - otel-network
  restart: unless-stopped
  healthcheck:
    # Healthy only when both the S3 API and the console health endpoints answer.
    test:
      - "CMD"
      - "sh"
      - "-c"
      - "curl -f http://127.0.0.1:9000/health && curl -f http://127.0.0.1:9001/rustfs/console/health"
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 40s
  depends_on:
    otel-collector:
      condition: service_started
    # Wait for the one-shot chown of the shared rustfs-data volume to finish,
    # so the server never starts against a volume whose ownership has not been
    # fixed yet (presumably the image runs as UID 10001 - confirm).
    rustfs-init:
      condition: service_completed_successfully
# One-shot helper: hands ownership of the rustfs-data volume to UID/GID 10001
# and then exits. It must never be restarted once it has finished.
rustfs-init:
  image: alpine
  container_name: rustfs-init
  volumes:
    - rustfs-data:/data
  networks:
    - otel-network
  command: >
    sh -c "
    chown -R 10001:10001 /data &&
    echo 'Volume Permissions fixed' &&
    exit 0
    "
  # Quoted on purpose: a bare `no` is a YAML 1.1 boolean (false), while Compose
  # expects the string "no" for the restart policy.
  restart: "no"
# --- Tracing ---
tempo-init:
image: busybox:latest
command: [ "sh", "-c", "chown -R 10001:10001 /var/tempo" ]
volumes:
- ./tempo-data:/var/tempo
user: root
networks:
- otel-network
restart: "no"
tempo:
image: grafana/tempo:latest
user: "10001"
command: [ "-config.file=/etc/tempo.yaml" ]
container_name: tempo
command: ["-config.file=/etc/tempo.yaml"]
volumes:
- ./tempo.yaml:/etc/tempo.yaml:ro
- ./tempo-data:/var/tempo
- tempo-data:/var/tempo
ports:
- "3200:3200" # tempo
- "4317" # otlp grpc
- "4318" # otlp http
restart: unless-stopped
networks:
- otel-network
restart: unless-stopped
depends_on:
- redpanda
healthcheck:
test: [ "CMD-SHELL", "wget --spider -q http://localhost:3200/metrics || exit 1" ]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3200/ready"]
interval: 10s
timeout: 5s
retries: 5
start_period: 40s
retries: 3
start_period: 15s
# Redpanda broker used for Tempo ingest.
# Listens for Kafka clients on 9092 and advertises itself to other containers
# on the network as redpanda:9092.
redpanda:
  container_name: redpanda
  image: redpandadata/redpanda:latest
  restart: unless-stopped
  networks:
    - otel-network
  ports:
    - "9092:9092"
  # Same arguments as the single-string form, spelled out one per entry.
  command:
    - "redpanda"
    - "start"
    - "--overprovisioned"
    - "--mode=dev-container"
    - "--kafka-addr=PLAINTEXT://0.0.0.0:9092"
    - "--advertise-kafka-addr=PLAINTEXT://redpanda:9092"
jaeger:
image: jaegertracing/jaeger:latest
container_name: jaeger
environment:
- TZ=Asia/Shanghai
- SPAN_STORAGE_TYPE=badger
- BADGER_EPHEMERAL=false
- BADGER_DIRECTORY_VALUE=/badger/data
- BADGER_DIRECTORY_KEY=/badger/key
- COLLECTOR_OTLP_ENABLED=true
volumes:
- ./jaeger-data:/badger
- ./jaeger.yaml:/etc/jaeger/config.yml
- jaeger-data:/badger
ports:
- "16686:16686" # Web UI
- "14269:14269" # Admin/Metrics
- "4317"
- "4318"
- "4317" # otlp grpc
- "4318" # otlp http
command: ["--config", "/etc/jaeger/config.yml"]
networks:
- otel-network
restart: unless-stopped
healthcheck:
test: [ "CMD-SHELL", "wget --spider -q http://localhost:14269 || exit 1" ]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:14269"]
interval: 10s
timeout: 5s
retries: 5
start_period: 20s
retries: 3
start_period: 15s
# --- Metrics ---
prometheus:
image: prom/prometheus:latest
environment:
- TZ=Asia/Shanghai
container_name: prometheus
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./prometheus-data:/prometheus
- prometheus-data:/prometheus
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--web.enable-otlp-receiver'
- '--web.enable-remote-write-receiver'
- '--enable-feature=promql-experimental-functions'
- '--storage.tsdb.min-block-duration=2h'
- '--storage.tsdb.max-block-duration=2h'
- '--log.level=info'
- '--storage.tsdb.retention.time=30d'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
- "--config.file=/etc/prometheus/prometheus.yml"
- "--web.enable-otlp-receiver" # Enable OTLP
- "--web.enable-remote-write-receiver" # Enable remote write
- "--enable-feature=promql-experimental-functions" # Enable info()
- "--storage.tsdb.retention.time=30d"
restart: unless-stopped
networks:
- otel-network
healthcheck:
test: [ "CMD-SHELL", "wget --spider -q http://localhost:9090/-/healthy || exit 1" ]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
interval: 10s
timeout: 5s
retries: 3
@@ -108,18 +164,18 @@ services:
loki:
image: grafana/loki:latest
environment:
- TZ=Asia/Shanghai
container_name: loki
volumes:
- ./loki-config.yaml:/etc/loki/local-config.yaml:ro
- ./loki-data:/loki
- ./loki.yaml:/etc/loki/loki.yaml:ro
- loki-data:/loki
ports:
- "3100:3100"
command: -config.file=/etc/loki/local-config.yaml
command: -config.file=/etc/loki/loki.yaml
networks:
- otel-network
restart: unless-stopped
healthcheck:
test: [ "CMD-SHELL", "wget --spider -q http://localhost:3100/metrics || exit 1" ]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
interval: 15s
timeout: 10s
retries: 5
@@ -129,8 +185,6 @@ services:
otel-collector:
image: otel/opentelemetry-collector-contrib:latest
environment:
- TZ=Asia/Shanghai
volumes:
- ./otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml:ro
ports:
@@ -143,52 +197,51 @@ services:
- "55679:55679" # zpages
networks:
- otel-network
restart: unless-stopped
depends_on:
- tempo
- jaeger
- prometheus
- loki
healthcheck:
test: [ "CMD-SHELL", "wget --spider -q http://localhost:13133 || exit 1" ]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:13133"]
interval: 10s
timeout: 5s
retries: 3
start_period: 20s
# --- Visualization ---
grafana:
image: grafana/grafana:latest
container_name: grafana
ports:
- "3000:3000"
volumes:
- ./grafana/provisioning:/etc/grafana/provisioning
- ./grafana/dashboards:/var/lib/grafana/dashboards
- ./grafana-data:/var/lib/grafana
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
- GF_SECURITY_ADMIN_USER=admin
- TZ=Asia/Shanghai
- GF_INSTALL_PLUGINS=grafana-pyroscope-datasource
- GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/home.json
restart: unless-stopped
volumes:
- ./grafana/provisioning:/etc/grafana/provisioning:ro
- ./grafana/dashboards:/etc/grafana/dashboards:ro
- grafana-data:/var/lib/grafana
networks:
- otel-network
restart: unless-stopped
depends_on:
- prometheus
- tempo
- loki
healthcheck:
test: [ "CMD-SHELL", "wget --spider -q http://localhost:3000/api/health || exit 1" ]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health"]
interval: 10s
timeout: 5s
retries: 3
volumes:
prometheus-data:
rustfs-data:
tempo-data:
loki-data:
jaeger-data:
prometheus-data:
loki-data:
grafana-data:
networks:

View File

@@ -1 +0,0 @@
*

View File

@@ -0,0 +1,11 @@
# Grafana dashboard provisioning.
# Declares one file-based provider that reads dashboards from
# /etc/grafana/dashboards into the root folder of organisation 1.
apiVersion: 1
providers:
  - name: 'default'
    type: file
    orgId: 1
    folder: ''
    options:
      path: /etc/grafana/dashboards
    updateIntervalSeconds: 10
    disableDeletion: false

View File

@@ -0,0 +1,78 @@
apiVersion: 1
datasources:
# Prometheus: the default datasource, provisioned read-only.
# Exemplars carrying a `trace_id` label link to the datasource with uid `tempo`.
- name: Prometheus
  uid: prometheus
  type: prometheus
  access: proxy
  url: http://prometheus:9090
  isDefault: true
  editable: false
  jsonData:
    httpMethod: GET
    exemplarTraceIdDestinations:
      - datasourceUid: tempo
        name: trace_id
# Tempo: trace datasource, provisioned read-only.
# Links traces to logs (uid `loki`) and to metrics / the service map
# (uid `prometheus`). `$$` is the escaped form of a literal `$` in
# provisioning files.
- name: Tempo
  uid: tempo
  type: tempo
  access: proxy
  url: http://tempo:3200
  isDefault: false
  editable: false
  jsonData:
    httpMethod: GET
    nodeGraph:
      enabled: true
    serviceMap:
      datasourceUid: prometheus
    tracesToLogs:
      datasourceUid: loki
      # Widen the log query to one hour either side of the span.
      spanStartTimeShift: '-1h'
      spanEndTimeShift: '1h'
      filterByTraceID: true
      filterBySpanID: false
      tags:
        - 'job'
        - 'instance'
        - 'pod'
        - 'namespace'
        - 'service.name'
      mappedTags:
        - key: 'service.name'
          value: 'app'
    tracesToMetrics:
      datasourceUid: prometheus
      tags:
        - key: 'service.name'
        - key: 'job'
      queries:
        - name: 'Service-Level Latency'
          query: 'sum(rate(traces_spanmetrics_latency_bucket{$$__tags}[5m])) by (le)'
        - name: 'Service-Level Calls'
          query: 'sum(rate(traces_spanmetrics_calls_total{$$__tags}[5m]))'
        - name: 'Service-Level Errors'
          query: 'sum(rate(traces_spanmetrics_calls_total{status_code="ERROR", $$__tags}[5m]))'
# Loki: log datasource, provisioned read-only.
# A derived field captures the value after `trace_id=` in a log line and
# links it to the datasource with uid `tempo`.
- name: Loki
  uid: loki
  type: loki
  url: http://loki:3100
  basicAuth: false
  isDefault: false
  editable: false
  jsonData:
    derivedFields:
      - name: 'TraceID'
        matcherRegex: 'trace_id=(\w+)'
        datasourceUid: tempo
        # `$$` escapes the `$` so the literal `${__value.raw}` reaches Grafana.
        url: '$${__value.raw}'
# Jaeger: secondary trace datasource, provisioned read-only.
# Links spans to logs in the datasource with uid `loki`.
- name: Jaeger
  uid: jaeger
  type: jaeger
  access: proxy
  url: http://jaeger:16686
  isDefault: false
  editable: false
  jsonData:
    tracesToLogs:
      datasourceUid: loki
      tags:
        - 'job'
        - 'instance'
        - 'pod'
        - 'namespace'
        - 'service.name'
      mappedTags:
        - key: 'service.name'
          value: 'app'
      # Start shifts backwards (negative) and end shifts forwards (positive) so
      # the log query window is padded by one second on each side of the span.
      # The previous '1s' / '-1s' pair shrank the window and inverted it for
      # spans shorter than two seconds.
      spanStartTimeShift: '-1s'
      spanEndTimeShift: '1s'
      filterByTraceID: true
      filterBySpanID: false

View File

@@ -1 +0,0 @@
*

View File

@@ -13,12 +13,15 @@
# limitations under the License.
service:
extensions: [ jaeger_storage, jaeger_query, remote_sampling, healthcheckv2 ]
extensions: [jaeger_storage, jaeger_query]
pipelines:
traces:
receivers: [ otlp, jaeger, zipkin ]
processors: [ batch, adaptive_sampling ]
exporters: [ jaeger_storage_exporter ]
receivers: [otlp]
processors: [batch]
exporters: [jaeger_storage_exporter, spanmetrics]
metrics/spanmetrics:
receivers: [spanmetrics]
exporters: [prometheus]
telemetry:
resource:
service.name: jaeger
@@ -31,60 +34,41 @@ service:
host: 0.0.0.0
port: 8888
logs:
level: info
level: DEBUG
extensions:
healthcheckv2:
use_v2: true
http:
jaeger_query:
storage:
traces: badger_store
ui:
config_file: ./cmd/jaeger/config-ui.json
log_access: true
max_clock_skew_adjust: 0s
grpc:
endpoint: 0.0.0.0:16685
http:
endpoint: 0.0.0.0:16686
traces: some_storage
metrics: some_metrics_storage
jaeger_storage:
backends:
badger_store:
badger:
ephemeral: false
directory_key: /badger/key
directory_value: /badger/data
span_store_ttl: 72h
some_storage:
memory:
max_traces: 100000
metric_backends:
some_metrics_storage:
prometheus:
endpoint: http://prometheus:9090
normalize_calls: true
normalize_duration: true
remote_sampling:
adaptive:
sampling_store: badger_store
initial_sampling_probability: 0.1
http:
grpc:
connectors:
spanmetrics:
receivers:
otlp:
protocols:
grpc:
endpoint: "0.0.0.0:4317"
http:
jaeger:
protocols:
grpc:
thrift_binary:
thrift_compact:
thrift_http:
zipkin:
endpoint: "0.0.0.0:4318"
processors:
batch:
adaptive_sampling:
exporters:
jaeger_storage_exporter:
trace_storage: badger_store
trace_storage: some_storage
prometheus:
endpoint: "0.0.0.0:8889"

View File

@@ -1 +0,0 @@
*

View File

@@ -11,12 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
auth_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096
grpc_listen_port: 9095
log_level: info
grpc_server_max_concurrent_streams: 1000
@@ -39,12 +38,6 @@ query_range:
enabled: true
max_size_mb: 100
limits_config:
metric_aggregation_enabled: true
max_line_size: 256KB
max_line_size_truncate: false
allow_structured_metadata: true
schema_config:
configs:
- from: 2020-10-24
@@ -54,15 +47,17 @@ schema_config:
index:
prefix: index_
period: 24h
row_shards: 16
limits_config:
reject_old_samples: true
reject_old_samples_max_age: 168h
allow_structured_metadata: true
max_line_size: 256KB
pattern_ingester:
enabled: true
metric_aggregation:
loki_address: localhost:3100
ruler:
alertmanager_url: http://localhost:9093
frontend:
encoding: protobuf

View File

@@ -1 +0,0 @@
*

View File

@@ -1 +0,0 @@
*

View File

@@ -12,21 +12,44 @@
# See the License for the specific language governing permissions and
# limitations under the License.
partition_ring_live_store: true
stream_over_http_enabled: true
server:
http_listen_port: 3200
log_level: info
distributor:
ingester_write_path_enabled: false
kafka_write_path_enabled: true
receivers:
otlp:
protocols:
grpc:
endpoint: "0.0.0.0:4317"
endpoint: "tempo:4317"
http:
endpoint: "0.0.0.0:4318"
endpoint: "tempo:4318"
#log_received_spans:
# enabled: true
# log_discarded_spans:
# enabled: true
ingester:
max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally
backend_scheduler:
provider:
compaction:
compaction:
block_retention: 1h
backend_worker:
backend_scheduler_addr: localhost:3200
compaction:
block_retention: 1h
ring:
kvstore:
store: memberlist
querier:
query_live_store: true
metrics_generator:
registry:
@@ -38,13 +61,34 @@ metrics_generator:
remote_write:
- url: http://prometheus:9090/api/v1/write
send_exemplars: true
traces_storage:
path: /var/tempo/generator/traces
query_frontend:
rf1_after: "1999-01-01T00:00:00Z"
mcp_server:
enabled: true
storage:
trace:
backend: local # backend configuration to use
backend: local
wal:
path: /var/tempo/wal # where to store the wal locally
path: /var/tempo/wal
local:
path: /var/tempo/blocks
overrides:
defaults:
metrics_generator:
processors: ["span-metrics", "service-graphs", "local-blocks"]
generate_native_histograms: both
ingest:
enabled: true
kafka:
address: redpanda:9092
topic: tempo-ingest
block_builder:
consume_cycle_duration: 30s
usage_report:
reporting_enabled: false

View File

@@ -195,9 +195,7 @@ services:
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
- "--storage.tsdb.retention.time=200h"
- "--storage.tsdb.retention.time=30d"
- "--web.enable-lifecycle"
- "--web.enable-otlp-receiver"
- "--web.enable-remote-write-receiver"
@@ -236,7 +234,7 @@ services:
volumes:
- grafana_data:/var/lib/grafana
- ./.docker/observability/grafana/provisioning:/etc/grafana/provisioning:ro
- ./.docker/observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
- ./.docker/observability/grafana/dashboards:/etc/grafana/dashboards:ro
networks:
- rustfs-network
restart: unless-stopped