mirror of
https://github.com/rustfs/rustfs.git
synced 2026-07-02 01:34:32 +08:00
506 lines
16 KiB
Bash
Executable File
506 lines
16 KiB
Bash
Executable File
#!/usr/bin/env bash
# Scanner validation harness: strict mode plus the default settings that the
# rest of the script reads.
set -euo pipefail

# Target and credentials. ACCESS_KEY is seeded from RUSTFS_ACCESS_KEY when set;
# SECRET_KEY is filled in later from the variable named by SECRET_KEY_ENV.
ALIAS=''
ENDPOINT=''
ACCESS_KEY="${RUSTFS_ACCESS_KEY:-}"
SECRET_KEY=''
SECRET_KEY_ENV='RUSTFS_SECRET_KEY'
REGION='us-east-1'

# Run description recorded in the metadata and report files.
DEPLOYMENT='single-disk'
WORKLOAD_LABEL='unspecified'
METRICS_ENDPOINTS=''

# Sampling plan and output location (an empty OUT_DIR gets a default later).
SAMPLES=30
INTERVAL_SECS=60
OUT_DIR=''

# External tools.
MC_BIN='mc'
AWSCURL_BIN='awscurl'
JQ_BIN='jq'

# Host telemetry state; TELEMETRY_PIDS holds background collector PIDs.
SKIP_HOST_TELEMETRY=false
RUSTFS_PID=''
TELEMETRY_PIDS=()
|
|
|
|
# Print the command-line help text to stdout.
# The text is a quoted here-document, so nothing inside it is expanded.
# Flags and descriptions are laid out in two columns so that wrapped
# description lines are visibly continuations rather than new options.
usage() {
  cat <<'USAGE'
Usage:
  scripts/run_scanner_validation_harness.sh --alias <admin-alias> \
    --endpoint <url> [options]

Required:
  --alias                Admin client alias used for config snapshots.
  --endpoint             RustFS endpoint, for example http://127.0.0.1:9000.
  RUSTFS_ACCESS_KEY      Admin access key for scanner status requests.
  RUSTFS_SECRET_KEY      Admin secret key for scanner status requests.

Optional:
  --access-key           Override RUSTFS_ACCESS_KEY for scanner status requests.
  --secret-key-env       Environment variable that stores the admin secret key
                         (default: RUSTFS_SECRET_KEY).
  --region               SigV4 region (default: us-east-1).
  --deployment           single-disk | multi-disk | distributed (default: single-disk).
  --workload-label       Free-form workload label written to metadata.
  --metrics-endpoints    Optional comma-separated RustFS endpoints for
                         per-endpoint background-heal status and
                         /metrics?types=1&by-host=true&n=1 capture.
  --samples              Number of scanner status samples (default: 30).
  --interval-secs        Seconds between samples (default: 60).
  --out-dir              Output directory (default: target/bench/scanner-validation-<timestamp>).
  --mc-bin               mc-compatible admin client (default: mc).
  --awscurl-bin          SigV4 HTTP client (default: awscurl).
  --jq-bin               jq-compatible JSON processor (default: jq).
  --rustfs-pid           RustFS process id for pidstat. If omitted, pidof rustfs is used.
  --skip-host-telemetry  Do not run pidstat/iostat/mpstat.
  -h, --help             Show this help.

The harness collects scanner/heal config snapshots, scanner status samples,
background heal status samples, optional distributed by-host admin metrics,
host telemetry when available, a compact scanner-summary.csv file, and a
scanner-validation-report.md file. It does not generate object workload or
modify scanner configuration.
USAGE
}
|
|
|
|
# Abort the script with an error on stderr unless the command named by $1
# can be resolved by `command -v`.
require_cmd() {
  local tool="$1"

  command -v "$tool" >/dev/null 2>&1 && return 0

  echo "ERROR: command not found: $tool" >&2
  exit 1
}
|
|
|
|
# Succeed when $1 is a non-empty string made up only of the digits 0-9.
is_nonnegative_integer() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}
|
|
|
|
# Succeed when $1 is a digit string whose first digit is 1-9
# (so "0" and zero-padded values such as "01" are rejected).
is_positive_integer() {
  case "$1" in
    '' | 0* | *[!0-9]*) return 1 ;;
  esac
  return 0
}
|
|
|
|
# Echo the value that follows a flag.
#   $1 - the flag being parsed (used only in the error message)
#   $2 - the candidate value (may be absent)
# Exits with status 1 when the value is empty or itself starts with "--".
arg_value() {
  local flag="$1"
  local value="${2:-}"

  case "$value" in
    '' | --*)
      echo "ERROR: missing value for $flag" >&2
      exit 1
      ;;
  esac

  printf '%s\n' "$value"
}
|
|
|
|
# Parse command-line flags into the script's global settings.
# Value-taking flags are first mapped to the global they set; the value is
# then fetched through arg_value in a plain assignment so that a failure of
# arg_value still propagates under `set -e`.
parse_args() {
  local target value

  while (($# > 0)); do
    case "$1" in
      --alias) target=ALIAS ;;
      --endpoint) target=ENDPOINT ;;
      --access-key) target=ACCESS_KEY ;;
      --secret-key-env) target=SECRET_KEY_ENV ;;
      --region) target=REGION ;;
      --deployment) target=DEPLOYMENT ;;
      --workload-label) target=WORKLOAD_LABEL ;;
      --metrics-endpoints) target=METRICS_ENDPOINTS ;;
      --samples) target=SAMPLES ;;
      --interval-secs) target=INTERVAL_SECS ;;
      --out-dir) target=OUT_DIR ;;
      --mc-bin) target=MC_BIN ;;
      --awscurl-bin) target=AWSCURL_BIN ;;
      --jq-bin) target=JQ_BIN ;;
      --rustfs-pid) target=RUSTFS_PID ;;
      --skip-host-telemetry)
        # Boolean switch: consumes only the flag itself.
        SKIP_HOST_TELEMETRY=true
        shift
        continue
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "ERROR: unknown arg: $1" >&2
        usage
        exit 1
        ;;
    esac

    value="$(arg_value "$1" "${2:-}")"
    printf -v "$target" '%s' "$value"
    shift 2
  done
}
|
|
|
|
# Check the parsed settings and resolve the secret key.
# Each failed check prints one error line to stderr and exits with status 1.
# On success SECRET_KEY holds the value of the variable named by
# SECRET_KEY_ENV.
validate_args() {
  if ! [[ -n "$ALIAS" && -n "$ENDPOINT" && -n "$ACCESS_KEY" ]]; then
    echo "ERROR: --alias, --endpoint, and RUSTFS_ACCESS_KEY (or --access-key) are required" >&2
    exit 1
  fi

  # The name is used for indirect expansion below, so it must be an identifier.
  [[ "$SECRET_KEY_ENV" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || {
    echo "ERROR: --secret-key-env must be a valid environment variable name" >&2
    exit 1
  }

  SECRET_KEY="${!SECRET_KEY_ENV:-}"
  [[ -n "$SECRET_KEY" ]] || {
    echo "ERROR: $SECRET_KEY_ENV is required for scanner status requests" >&2
    exit 1
  }

  [[ "$DEPLOYMENT" == "single-disk" || "$DEPLOYMENT" == "multi-disk" || "$DEPLOYMENT" == "distributed" ]] || {
    echo "ERROR: --deployment must be single-disk, multi-disk, or distributed" >&2
    exit 1
  }

  is_positive_integer "$SAMPLES" || {
    echo "ERROR: --samples must be a positive integer" >&2
    exit 1
  }

  is_nonnegative_integer "$INTERVAL_SECS" || {
    echo "ERROR: --interval-secs must be a nonnegative integer" >&2
    exit 1
  }
}
|
|
|
|
# Create the output directory tree and start scanner-summary.csv with its
# header row. When OUT_DIR is empty a UTC-timestamped default is chosen.
# Sets the global SUMMARY_CSV.
setup_output() {
  local subdir header
  local columns=(
    timestamp
    primary_pressure
    current_cycle_objects_scanned
    current_cycle_directories_scanned
    last_cycle_result
    last_cycle_partial_reason
    last_cycle_partial_source
    lifecycle_transition_scanner_missed
    source_work_missed_total
    current_cycle_usage_saves
    last_cycle_usage_saves
    usage_dirty_pending_buckets
    usage_last_cycle_dirty_buckets
    usage_last_cycle_cleared_dirty_buckets
    usage_last_save_result
    usage_last_save_unix_secs
    life_time_scan_cycle
    life_time_scan_bucket_drive
    life_time_scan_object
    life_time_save_usage
    heal_queue_length
    heal_active_tasks
    heal_scanner_queued
    heal_admin_queued
    heal_auto_heal_queued
  )

  if [[ -z "$OUT_DIR" ]]; then
    OUT_DIR="target/bench/scanner-validation-$(date -u +%Y%m%dT%H%M%SZ)"
  fi

  for subdir in status heal metrics; do
    mkdir -p "$OUT_DIR/$subdir"
  done

  SUMMARY_CSV="$OUT_DIR/scanner-summary.csv"
  # Join the column names with commas inside a subshell so IFS is untouched here.
  header="$(
    IFS=','
    printf '%s' "${columns[*]}"
  )"
  echo "$header" >"$SUMMARY_CSV"
}
|
|
|
|
# Run git with the given arguments and pass its stdout through.
# Deliberately best-effort: stderr is discarded and a failing git still
# yields exit status 0, so callers simply get empty output.
git_value() {
  git "$@" 2>/dev/null || true
}
|
|
|
|
# Write run-metadata.env into OUT_DIR as key=value lines.
#   $1 - start timestamp recorded as started_at
# git_commit and git_branch come from git_value and are empty when git
# produces no output.
write_metadata() {
  local started_at="$1"
  local commit branch

  commit="$(git_value rev-parse HEAD)"
  branch="$(git_value branch --show-current)"

  # printf repeats the format for every key/value pair in the argument list.
  printf '%s=%s\n' \
    started_at "$started_at" \
    deployment "$DEPLOYMENT" \
    workload_label "$WORKLOAD_LABEL" \
    endpoint "$ENDPOINT" \
    region "$REGION" \
    metrics_endpoints "$METRICS_ENDPOINTS" \
    samples "$SAMPLES" \
    interval_secs "$INTERVAL_SECS" \
    git_commit "$commit" \
    git_branch "$branch" \
    >"$OUT_DIR/run-metadata.env"
}
|
|
|
|
# Save the output of `admin config get` for the scanner and heal subsystems
# to <subsystem>-config.before.txt in OUT_DIR, using the MC_BIN client and
# the ALIAS target.
capture_config_snapshots() {
  local subsystem

  for subsystem in scanner heal; do
    "$MC_BIN" admin config get "$ALIAS" "$subsystem" \
      >"$OUT_DIR/${subsystem}-config.before.txt"
  done
}
|
|
|
|
# Print the process id to monitor: RUSTFS_PID when it is set, otherwise the
# first field of `pidof rustfs`. Prints nothing, and still succeeds, when
# the lookup fails.
first_rustfs_pid() {
  if [[ -z "$RUSTFS_PID" ]]; then
    pidof rustfs 2>/dev/null | awk '{ print $1 }' || true
    return
  fi

  echo "$RUSTFS_PID"
}
|
|
|
|
# Launch the optional background host telemetry collectors.
# Resets TELEMETRY_PIDS, then starts whichever of pidstat, iostat and mpstat
# are installed, each sampling every INTERVAL_SECS for SAMPLES reports and
# writing to a file in OUT_DIR. pidstat additionally needs a process id from
# first_rustfs_pid. Each started collector's PID is appended to
# TELEMETRY_PIDS.
# Nothing is started when SKIP_HOST_TELEMETRY is "true" or when the interval
# is numerically zero.
start_host_telemetry() {
  TELEMETRY_PIDS=()

  if [[ "$SKIP_HOST_TELEMETRY" == "true" ]]; then
    return 0
  fi

  # Compare numerically: a plain string test against "0" would let spellings
  # such as "00" through. The 10# prefix forces base 10, so zero-padded values
  # like "08" are not parsed as (invalid) octal.
  if ((10#$INTERVAL_SECS == 0)); then
    return 0
  fi

  local pid
  pid="$(first_rustfs_pid)"

  if [[ -n "$pid" ]] && command -v pidstat >/dev/null 2>&1; then
    pidstat -p "$pid" "$INTERVAL_SECS" "$SAMPLES" >"$OUT_DIR/pidstat.txt" 2>&1 &
    TELEMETRY_PIDS+=("$!")
  fi

  if command -v iostat >/dev/null 2>&1; then
    iostat -xz "$INTERVAL_SECS" "$SAMPLES" >"$OUT_DIR/iostat.txt" 2>&1 &
    TELEMETRY_PIDS+=("$!")
  fi

  if command -v mpstat >/dev/null 2>&1; then
    mpstat "$INTERVAL_SECS" "$SAMPLES" >"$OUT_DIR/mpstat.txt" 2>&1 &
    TELEMETRY_PIDS+=("$!")
  fi
}
|
|
|
|
# Block until every collector recorded in TELEMETRY_PIDS has exited.
# A collector's own exit status is ignored.
wait_host_telemetry() {
  local job

  # Checked up front so an empty array is never expanded.
  ((${#TELEMETRY_PIDS[@]} > 0)) || return 0

  for job in "${TELEMETRY_PIDS[@]}"; do
    wait "$job" || true
  done
}
|
|
|
|
# Print the scanner status admin URL for ENDPOINT
# (one trailing slash on ENDPOINT is dropped first).
scanner_status_url() {
  local base="${ENDPOINT%/}"

  printf '%s\n' "${base}/rustfs/admin/v3/scanner/status"
}
|
|
|
|
# Print the background-heal status admin URL for the endpoint given as $1
# (one trailing slash on the endpoint is dropped first).
background_heal_status_url() {
  local base="${1%/}"

  printf '%s\n' "${base}/rustfs/admin/v3/background-heal/status"
}
|
|
|
|
# Print the admin metrics URL for the endpoint given as $1, with the fixed
# query string types=1&by-host=true&n=1 (one trailing slash on the endpoint
# is dropped first).
metrics_url() {
  local base="${1%/}"

  printf '%s\n' "${base}/rustfs/admin/v3/metrics?types=1&by-host=true&n=1"
}
|
|
|
|
# Turn an endpoint URL ($1) into a token usable inside a file name:
# drop everything through the first "://", drop everything from the next "/",
# then replace each character outside A-Za-z0-9._- with "_".
# The result is printed without a trailing newline.
endpoint_label() {
  local host_part="${1#*://}"

  host_part="${host_part%%/*}"
  printf '%s' "$host_part" | tr -c 'A-Za-z0-9._-' '_'
}
|
|
|
|
# Print, one per line, the endpoints to query for background heal status.
# Entries come from the comma-separated METRICS_ENDPOINTS list; surrounding
# whitespace is trimmed from each entry and entries that end up empty are
# skipped. When the list yields nothing, ENDPOINT is printed instead, so the
# output always has at least one line.
background_heal_status_endpoints() {
  local endpoint
  local endpoints=()
  local emitted=false

  if [[ -n "$METRICS_ENDPOINTS" ]]; then
    IFS=',' read -r -a endpoints <<<"$METRICS_ENDPOINTS"
    for endpoint in "${endpoints[@]}"; do
      # Trim leading, then trailing, whitespace so that a list written as
      # "a, b" does not yield an endpoint with a leading space.
      endpoint="${endpoint#"${endpoint%%[![:space:]]*}"}"
      endpoint="${endpoint%"${endpoint##*[![:space:]]}"}"
      if [[ -z "$endpoint" ]]; then
        continue
      fi
      printf '%s\n' "$endpoint"
      emitted=true
    done
  fi

  if [[ "$emitted" != "true" ]]; then
    printf '%s\n' "$ENDPOINT"
  fi
}
|
|
|
|
# Combine per-endpoint background-heal status documents into one summary.
#   $1   - output file for the summary JSON
#   $2.. - input JSON files, slurped together by jq
# Every counter in the summary is the sum of that counter across all inputs;
# a counter missing from an input counts as 0. queueLength and activeTasks
# fall back to the top-level healQueueLength / healActiveTasks fields.
write_heal_status_summary() {
  local summary_file="$1"
  shift

  "$JQ_BIN" -s '
    def total(f): map(f // 0) | add // 0;
    {
      healOperations: {
        queueLength: (total(.healOperations.queueLength // .healQueueLength)),
        activeTasks: (total(.healOperations.activeTasks // .healActiveTasks)),
        queuedBySource: {
          scanner: (total(.healOperations.queuedBySource.scanner)),
          admin: (total(.healOperations.queuedBySource.admin)),
          autoHeal: (total(.healOperations.queuedBySource.autoHeal)),
          internal: (total(.healOperations.queuedBySource.internal))
        },
        activeBySource: {
          scanner: (total(.healOperations.activeBySource.scanner)),
          admin: (total(.healOperations.activeBySource.admin)),
          autoHeal: (total(.healOperations.activeBySource.autoHeal)),
          internal: (total(.healOperations.activeBySource.internal))
        },
        queuedByPriority: {
          low: (total(.healOperations.queuedByPriority.low)),
          normal: (total(.healOperations.queuedByPriority.normal)),
          high: (total(.healOperations.queuedByPriority.high)),
          urgent: (total(.healOperations.queuedByPriority.urgent))
        },
        activeByPriority: {
          low: (total(.healOperations.activeByPriority.low)),
          normal: (total(.healOperations.activeByPriority.normal)),
          high: (total(.healOperations.activeByPriority.high)),
          urgent: (total(.healOperations.activeByPriority.urgent))
        }
      }
    }
  ' "$@" >"$summary_file"
}
|
|
|
|
# Capture one background-heal status document per endpoint and summarise them.
#   $1 - sample index (used in file names)
#   $2 - sample timestamp (used in file names)
#   $3 - path for the combined summary written by write_heal_status_summary
# For each endpoint listed by background_heal_status_endpoints, a signed POST
# is sent through AWSCURL_BIN and the response is passed through `jq .` into
# heal/background-heal-status.<label>.<index>.<ts>.json under OUT_DIR.
# Credentials are handed to the client through its environment.
capture_background_heal_status_sample() {
  local index="$1"
  local ts="$2"
  local summary_file="$3"
  local target tag url snapshot
  local snapshots=()

  while IFS= read -r target; do
    tag="$(endpoint_label "$target")"
    url="$(background_heal_status_url "$target")"
    snapshot="$OUT_DIR/heal/background-heal-status.${tag}.${index}.${ts}.json"

    AWS_ACCESS_KEY_ID="$ACCESS_KEY" \
      AWS_SECRET_ACCESS_KEY="$SECRET_KEY" \
      AWS_DEFAULT_REGION="$REGION" \
      "$AWSCURL_BIN" --service s3 --region "$REGION" --request POST "$url" \
      | "$JQ_BIN" . >"$snapshot"

    snapshots+=("$snapshot")
  done < <(background_heal_status_endpoints)

  write_heal_status_summary "$summary_file" "${snapshots[@]}"
}
|
|
|
|
# Capture one admin metrics response per entry of METRICS_ENDPOINTS.
#   $1 - sample index (used in file names)
#   $2 - sample timestamp (used in file names)
# Does nothing when METRICS_ENDPOINTS is empty. Otherwise each entry of the
# comma-separated list is whitespace-trimmed, empty entries are skipped, and a
# signed GET is sent through AWSCURL_BIN with the raw response stored in
# metrics/admin-metrics.<label>.<index>.<ts>.ndjson under OUT_DIR.
# Credentials are handed to the client through its environment.
capture_distributed_metrics_sample() {
  local index="$1"
  local ts="$2"
  local endpoint label metrics_file
  local endpoints=()

  if [[ -z "$METRICS_ENDPOINTS" ]]; then
    return
  fi

  IFS=',' read -r -a endpoints <<<"$METRICS_ENDPOINTS"
  for endpoint in "${endpoints[@]}"; do
    # Trim leading, then trailing, whitespace: a list written as "a, b" would
    # otherwise produce a request URL that starts with a space.
    endpoint="${endpoint#"${endpoint%%[![:space:]]*}"}"
    endpoint="${endpoint%"${endpoint##*[![:space:]]}"}"
    if [[ -z "$endpoint" ]]; then
      continue
    fi

    label="$(endpoint_label "$endpoint")"
    metrics_file="$OUT_DIR/metrics/admin-metrics.${label}.${index}.${ts}.ndjson"
    AWS_ACCESS_KEY_ID="$ACCESS_KEY" \
      AWS_SECRET_ACCESS_KEY="$SECRET_KEY" \
      AWS_DEFAULT_REGION="$REGION" \
      "$AWSCURL_BIN" \
      --service s3 \
      --region "$REGION" \
      --request GET \
      "$(metrics_url "$endpoint")" \
      >"$metrics_file"
  done
}
|
|
|
|
# Capture one complete sample.
#   $1 - sample index (used in file names)
#   $2 - sample timestamp (used in file names and as the first CSV column)
# Steps, in order:
#   1. signed GET of the scanner status URL, passed through `jq .` into
#      status/scanner-status.<index>.<ts>.json under OUT_DIR;
#   2. background heal capture via capture_background_heal_status_sample;
#   3. one CSV row appended to SUMMARY_CSV, built from the scanner status
#      document and the heal summary (absent fields become "" or 0);
#   4. optional per-endpoint metrics via capture_distributed_metrics_sample.
capture_status_sample() {
  local index="$1"
  local ts="$2"
  local status_file heal_summary_file url csv_filter

  status_file="$OUT_DIR/status/scanner-status.${index}.${ts}.json"
  heal_summary_file="$OUT_DIR/heal/background-heal-summary.${index}.${ts}.json"
  url="$(scanner_status_url)"

  # jq program producing the CSV row; the column order matches the header
  # row of the summary CSV.
  csv_filter='
    ($heal_status[0] // {}) as $heal |
    [
      $ts,
      (.metrics.pacing_pressure.primary_pressure // ""),
      (.metrics.current_cycle_objects_scanned // 0),
      (.metrics.current_cycle_directories_scanned // 0),
      (.metrics.last_cycle_result // ""),
      (.metrics.last_cycle_partial_reason // ""),
      (.metrics.last_cycle_partial_source // ""),
      (.metrics.lifecycle_transition.scanner_missed // 0),
      ((.metrics.source_work // []) | map(.missed // 0) | add // 0),
      (.metrics.current_cycle_usage_saves // 0),
      (.metrics.last_cycle_usage_saves // 0),
      (.metrics.usage_freshness.dirty_pending_buckets // 0),
      (.metrics.usage_freshness.last_cycle_dirty_buckets // 0),
      (.metrics.usage_freshness.last_cycle_cleared_dirty_buckets // 0),
      (.metrics.usage_freshness.last_usage_save_result // ""),
      (.metrics.usage_freshness.last_usage_save_unix_secs // 0),
      (.metrics.life_time_ops.scan_cycle // 0),
      (.metrics.life_time_ops.scan_bucket_drive // 0),
      (.metrics.life_time_ops.scan_object // 0),
      (.metrics.life_time_ops.save_usage // 0),
      ($heal.healOperations.queueLength // $heal.healQueueLength // 0),
      ($heal.healOperations.activeTasks // $heal.healActiveTasks // 0),
      ($heal.healOperations.queuedBySource.scanner // 0),
      ($heal.healOperations.queuedBySource.admin // 0),
      ($heal.healOperations.queuedBySource.autoHeal // 0)
    ] | @csv
  '

  AWS_ACCESS_KEY_ID="$ACCESS_KEY" \
    AWS_SECRET_ACCESS_KEY="$SECRET_KEY" \
    AWS_DEFAULT_REGION="$REGION" \
    "$AWSCURL_BIN" --service s3 --region "$REGION" --request GET "$url" \
    | "$JQ_BIN" . >"$status_file"

  capture_background_heal_status_sample "$index" "$ts" "$heal_summary_file"

  "$JQ_BIN" -r --arg ts "$ts" --slurpfile heal_status "$heal_summary_file" \
    "$csv_filter" "$status_file" >>"$SUMMARY_CSV"

  capture_distributed_metrics_sample "$index" "$ts"
}
|
|
|
|
# Take SAMPLES samples, numbered from 1, each stamped with the current UTC
# time. Sleeps INTERVAL_SECS between samples (never after the last one, and
# never when INTERVAL_SECS is "0").
capture_status_series() {
  local index=1
  local ts

  while ((index <= SAMPLES)); do
    ts="$(date -u +%Y%m%dT%H%M%SZ)"
    capture_status_sample "$index" "$ts"

    if ((index < SAMPLES)) && [[ "$INTERVAL_SECS" != "0" ]]; then
      sleep "$INTERVAL_SECS"
    fi

    index=$((index + 1))
  done
}
|
|
|
|
# Print how many regular files below directory $1 have a name matching the
# find(1) -name pattern $2. Spaces that wc may pad its count with are removed.
count_artifacts() {
  local root="$1"
  local name_glob="$2"

  find "$root" -type f -name "$name_glob" 2>/dev/null \
    | wc -l \
    | tr -d ' '
}
|
|
|
|
# Write scanner-validation-report.md into OUT_DIR: the run settings, the
# number of artifacts of each kind found by count_artifacts, and a fixed
# review checklist.
write_report() {
  local report_file="$OUT_DIR/scanner-validation-report.md"
  local n_status n_heal n_metrics

  n_status="$(count_artifacts "$OUT_DIR/status" 'scanner-status.*.json')"
  n_heal="$(count_artifacts "$OUT_DIR/heal" 'background-heal-status.*.json')"
  n_metrics="$(count_artifacts "$OUT_DIR/metrics" 'admin-metrics.*.ndjson')"

  # Unquoted delimiter: the $variables below are expanded into the report.
  cat <<EOF >"$report_file"
## Scanner Validation Report

Deployment: $DEPLOYMENT
Workload label: $WORKLOAD_LABEL
Endpoint: $ENDPOINT
Samples: $SAMPLES
Interval seconds: $INTERVAL_SECS

## Artifact Summary

- Scanner status snapshots: $n_status
- Background heal status snapshots: $n_heal
- Distributed admin metrics snapshots: $n_metrics
- Summary CSV: scanner-summary.csv
- Usage freshness summary: current/last usage saves, dirty bucket state, last usage save result, and scanner life_time_ops.
- Run metadata: run-metadata.env

## Review Checklist

- Compare scanner progress and pressure in scanner-summary.csv.
- Confirm dirty usage marks lead to usage saves when validating post-start bucket metrics freshness.
- Check life_time_ops scan_cycle, scan_bucket_drive, scan_object, and save_usage before accepting bucket metrics freshness results.
- Check source work missed totals before accepting pressure reductions.
- Check healOperations queued and active counts when heal or bitrot pressure is involved.
- Use distributed admin metrics snapshots for by-host investigation when metrics endpoints were provided.
- Attach host telemetry when pidstat, iostat, or mpstat files are present.
EOF
}
|
|
|
|
main() {
|
|
parse_args "$@"
|
|
validate_args
|
|
require_cmd "$MC_BIN"
|
|
require_cmd "$AWSCURL_BIN"
|
|
require_cmd "$JQ_BIN"
|
|
setup_output
|
|
|
|
local started_at
|
|
started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
|
|
|
write_metadata "$started_at"
|
|
capture_config_snapshots
|
|
start_host_telemetry
|
|
capture_status_series
|
|
wait_host_telemetry
|
|
write_report
|
|
|
|
echo "Scanner validation artifacts written to $OUT_DIR"
|
|
}
|
|
|
|
# Script entry point: hand all command-line arguments to main.
main "$@"
|