rustfs/scripts/run_scanner_validation_harness.sh

#!/usr/bin/env bash
set -euo pipefail

# --- Target and credentials -------------------------------------------------
# ACCESS_KEY is seeded from RUSTFS_ACCESS_KEY; SECRET_KEY stays empty until
# validate_args resolves it from the variable named by SECRET_KEY_ENV.
ALIAS=''
ENDPOINT=''
REGION='us-east-1'
ACCESS_KEY="${RUSTFS_ACCESS_KEY:-}"
SECRET_KEY_ENV='RUSTFS_SECRET_KEY'
SECRET_KEY=''

# --- Run description ---------------------------------------------------------
DEPLOYMENT='single-disk'
WORKLOAD_LABEL='unspecified'
METRICS_ENDPOINTS=''

# --- Sampling and output -----------------------------------------------------
SAMPLES=30
INTERVAL_SECS=60
OUT_DIR=''

# --- External tools ----------------------------------------------------------
MC_BIN='mc'
AWSCURL_BIN='awscurl'
JQ_BIN='jq'

# --- Host telemetry ----------------------------------------------------------
# TELEMETRY_PIDS collects the PIDs of background telemetry jobs.
SKIP_HOST_TELEMETRY=false
RUSTFS_PID=''
TELEMETRY_PIDS=()

# Print the command-line help text to stdout.
# The heredoc delimiter is quoted, so the text is emitted literally with no
# variable or command expansion.
usage() {
  cat <<'USAGE'
Usage:
  scripts/run_scanner_validation_harness.sh --alias <admin-alias> \
    --endpoint <url> [options]

Required:
  --alias                 Admin client alias used for config snapshots.
  --endpoint              RustFS endpoint, for example http://127.0.0.1:9000.
  RUSTFS_ACCESS_KEY       Admin access key for scanner status requests.
  RUSTFS_SECRET_KEY       Admin secret key for scanner status requests.

Optional:
  --access-key            Override RUSTFS_ACCESS_KEY for scanner status requests.
  --secret-key-env        Environment variable that stores the admin secret key
                          (default: RUSTFS_SECRET_KEY).
  --region                SigV4 region (default: us-east-1).
  --deployment            single-disk | multi-disk | distributed (default: single-disk).
  --workload-label        Free-form workload label written to metadata.
  --metrics-endpoints     Optional comma-separated RustFS endpoints for
                          per-endpoint background-heal status and
                          /metrics?types=1&by-host=true&n=1 capture.
  --samples               Number of scanner status samples (default: 30).
  --interval-secs         Seconds between samples (default: 60).
  --out-dir               Output directory (default: target/bench/scanner-validation-<timestamp>).
  --mc-bin                mc-compatible admin client (default: mc).
  --awscurl-bin           SigV4 HTTP client (default: awscurl).
  --jq-bin                jq-compatible JSON processor (default: jq).
  --rustfs-pid            RustFS process id for pidstat. If omitted, pidof rustfs is used.
  --skip-host-telemetry   Do not run pidstat/iostat/mpstat.
  -h, --help              Show this help.

The harness collects scanner/heal config snapshots, scanner status samples,
background heal status samples, optional distributed by-host admin metrics,
host telemetry when available, a compact scanner-summary.csv file, and a
scanner-validation-report.md file. It does not generate object workload or
modify scanner configuration.
USAGE
}

# Abort the script unless the given command can be resolved.
# Arguments: $1 - command name or path to look up with `command -v`
# Outputs:   an error line on stderr when the command is missing
# Returns:   0 when found; exits the shell with status 1 otherwise
require_cmd() {
  local tool="$1"

  command -v "$tool" >/dev/null 2>&1 && return 0

  echo "ERROR: command not found: $tool" >&2
  exit 1
}

# Succeed when $1 is made up solely of one or more decimal digits.
# Arguments: $1 - candidate string
# Returns:   0 on match, 1 otherwise
is_nonnegative_integer() {
  local -r digits_only='^[0-9]+$'

  if [[ "$1" =~ $digits_only ]]; then
    return 0
  fi
  return 1
}

# Succeed when $1 is a decimal integer greater than zero written without
# leading zeros (first digit 1-9, followed by any number of digits).
# Arguments: $1 - candidate string
# Returns:   0 on match, 1 otherwise
is_positive_integer() {
  local -r positive='^[1-9][0-9]*$'

  if [[ "$1" =~ $positive ]]; then
    return 0
  fi
  return 1
}

# Validate and echo the value that follows an option flag.
# A value is treated as missing when it is empty or itself starts with "--".
# Arguments: $1 - the flag being parsed (used in the error message)
#            $2 - the candidate value (may be absent)
# Outputs:   the value plus a newline on stdout, or an error line on stderr
# Returns:   0 on success; exits the (sub)shell with status 1 when missing
arg_value() {
  local option="$1"
  local candidate="${2:-}"

  case "$candidate" in
    '' | --*)
      echo "ERROR: missing value for $option" >&2
      exit 1
      ;;
  esac

  printf '%s\n' "$candidate"
}

# Parse command-line options into the script-level configuration variables.
#
# Each value-taking flag maps to one global variable; the value is validated
# by arg_value and then stored. --skip-host-telemetry is a bare switch and
# -h/--help prints usage and exits 0. Unknown arguments print an error plus
# usage and exit 1.
#
# A missing option value terminates the shell explicitly (status 1) instead of
# relying on errexit: without that, an errexit-suppressed caller would see the
# variable silently set to empty and `shift 2` fail forever on the last
# argument.
#
# Globals:   writes ALIAS, ENDPOINT, ACCESS_KEY, SECRET_KEY_ENV, REGION,
#            DEPLOYMENT, WORKLOAD_LABEL, METRICS_ENDPOINTS, SAMPLES,
#            INTERVAL_SECS, OUT_DIR, MC_BIN, AWSCURL_BIN, JQ_BIN, RUSTFS_PID,
#            SKIP_HOST_TELEMETRY
# Arguments: the script's command-line arguments
parse_args() {
  local flag target value

  while [[ $# -gt 0 ]]; do
    flag="$1"
    target=""

    case "$flag" in
      --alias) target="ALIAS" ;;
      --endpoint) target="ENDPOINT" ;;
      --access-key) target="ACCESS_KEY" ;;
      --secret-key-env) target="SECRET_KEY_ENV" ;;
      --region) target="REGION" ;;
      --deployment) target="DEPLOYMENT" ;;
      --workload-label) target="WORKLOAD_LABEL" ;;
      --metrics-endpoints) target="METRICS_ENDPOINTS" ;;
      --samples) target="SAMPLES" ;;
      --interval-secs) target="INTERVAL_SECS" ;;
      --out-dir) target="OUT_DIR" ;;
      --mc-bin) target="MC_BIN" ;;
      --awscurl-bin) target="AWSCURL_BIN" ;;
      --jq-bin) target="JQ_BIN" ;;
      --rustfs-pid) target="RUSTFS_PID" ;;
      --skip-host-telemetry)
        SKIP_HOST_TELEMETRY=true
        shift
        continue
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "ERROR: unknown arg: $flag" >&2
        usage
        exit 1
        ;;
    esac

    # arg_value runs in a command substitution, so its `exit 1` only leaves
    # that subshell; propagate the failure here rather than via errexit.
    value="$(arg_value "$flag" "${2:-}")" || exit 1
    printf -v "$target" '%s' "$value"
    shift 2
  done
}

# Validate the parsed configuration and resolve the admin secret key.
#
# Checks, in order: required alias/endpoint/access key; that SECRET_KEY_ENV is
# a syntactically valid variable name; that the variable it names is set and
# non-empty (stored in SECRET_KEY); the deployment kind; SAMPLES as a positive
# integer; INTERVAL_SECS as a non-negative integer. Each failure prints one
# error line to stderr and exits with status 1.
#
# On success INTERVAL_SECS is rewritten without leading zeros. The rest of the
# script detects a zero interval by string comparison with "0", so an input
# such as "00" would otherwise slip past those guards and reach the telemetry
# tools and sleep as a zero interval.
#
# Globals:   reads ALIAS, ENDPOINT, ACCESS_KEY, SECRET_KEY_ENV, DEPLOYMENT,
#            SAMPLES; writes SECRET_KEY, INTERVAL_SECS
validate_args() {
  if [[ -z "$ALIAS" || -z "$ENDPOINT" || -z "$ACCESS_KEY" ]]; then
    echo "ERROR: --alias, --endpoint, and RUSTFS_ACCESS_KEY (or --access-key) are required" >&2
    exit 1
  fi

  # The name is checked before the indirect expansion below so that only a
  # plain variable name is ever dereferenced.
  if ! [[ "$SECRET_KEY_ENV" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
    echo "ERROR: --secret-key-env must be a valid environment variable name" >&2
    exit 1
  fi

  SECRET_KEY="${!SECRET_KEY_ENV:-}"
  if [[ -z "$SECRET_KEY" ]]; then
    echo "ERROR: $SECRET_KEY_ENV is required for scanner status requests" >&2
    exit 1
  fi

  case "$DEPLOYMENT" in
    single-disk|multi-disk|distributed) ;;
    *)
      echo "ERROR: --deployment must be single-disk, multi-disk, or distributed" >&2
      exit 1
      ;;
  esac

  if ! is_positive_integer "$SAMPLES"; then
    echo "ERROR: --samples must be a positive integer" >&2
    exit 1
  fi

  if ! is_nonnegative_integer "$INTERVAL_SECS"; then
    echo "ERROR: --interval-secs must be a nonnegative integer" >&2
    exit 1
  fi

  # Canonicalise by string trimming rather than arithmetic so very long digit
  # strings are not subject to integer wrap-around.
  while [[ "$INTERVAL_SECS" == 0?* ]]; do
    INTERVAL_SECS="${INTERVAL_SECS#0}"
  done
}

# Prepare the output tree and start the summary CSV.
# When OUT_DIR is empty it defaults to a UTC-timestamped directory under
# target/bench. The status, heal and metrics subdirectories are created and
# scanner-summary.csv is (re)written with only the header row.
# Globals: reads/writes OUT_DIR; writes SUMMARY_CSV
setup_output() {
  local subdir
  local -a columns=(
    timestamp
    primary_pressure
    current_cycle_objects_scanned
    current_cycle_directories_scanned
    last_cycle_result
    last_cycle_partial_reason
    last_cycle_partial_source
    lifecycle_transition_scanner_missed
    source_work_missed_total
    current_cycle_usage_saves
    last_cycle_usage_saves
    usage_dirty_pending_buckets
    usage_last_cycle_dirty_buckets
    usage_last_cycle_cleared_dirty_buckets
    usage_last_save_result
    usage_last_save_unix_secs
    life_time_scan_cycle
    life_time_scan_bucket_drive
    life_time_scan_object
    life_time_save_usage
    heal_queue_length
    heal_active_tasks
    heal_scanner_queued
    heal_admin_queued
    heal_auto_heal_queued
  )

  [[ -n "$OUT_DIR" ]] || OUT_DIR="target/bench/scanner-validation-$(date -u +%Y%m%dT%H%M%SZ)"

  for subdir in status heal metrics; do
    mkdir -p "$OUT_DIR/$subdir"
  done

  SUMMARY_CSV="$OUT_DIR/scanner-summary.csv"

  # Join the column names with commas inside a subshell so the IFS change
  # does not leak into the caller.
  (IFS=','; printf '%s\n' "${columns[*]}") >"$SUMMARY_CSV"
}

# Run git with the given arguments as a best-effort lookup.
# git's stderr is discarded and any failure is swallowed, so the function
# always returns 0 and prints only what git wrote to stdout.
# Arguments: git sub-command and its arguments
git_value() {
  if ! git "$@" 2>/dev/null; then
    return 0
  fi
}

# Write run-metadata.env: one key=value line per run parameter plus the
# current git commit and branch (empty when git_value yields nothing).
# Globals:   reads OUT_DIR, DEPLOYMENT, WORKLOAD_LABEL, ENDPOINT, REGION,
#            METRICS_ENDPOINTS, SAMPLES, INTERVAL_SECS
# Arguments: $1 - run start timestamp, recorded verbatim
write_metadata() {
  local run_started="$1"
  local commit branch

  commit="$(git_value rev-parse HEAD)"
  branch="$(git_value branch --show-current)"

  # printf re-applies the format to each successive key/value pair.
  printf '%s=%s\n' \
    started_at "$run_started" \
    deployment "$DEPLOYMENT" \
    workload_label "$WORKLOAD_LABEL" \
    endpoint "$ENDPOINT" \
    region "$REGION" \
    metrics_endpoints "$METRICS_ENDPOINTS" \
    samples "$SAMPLES" \
    interval_secs "$INTERVAL_SECS" \
    git_commit "$commit" \
    git_branch "$branch" \
    >"$OUT_DIR/run-metadata.env"
}

# Save the scanner and heal admin configuration as "before" snapshots.
# Runs `$MC_BIN admin config get $ALIAS <subsystem>` for each subsystem, in
# that order, writing <subsystem>-config.before.txt under OUT_DIR.
# Globals: reads MC_BIN, ALIAS, OUT_DIR
capture_config_snapshots() {
  local subsystem

  for subsystem in scanner heal; do
    "$MC_BIN" admin config get "$ALIAS" "$subsystem" \
      >"$OUT_DIR/${subsystem}-config.before.txt"
  done
}

# Print the RustFS process id to monitor.
# An explicit RUSTFS_PID wins; otherwise the first PID reported by
# `pidof rustfs` is printed. Prints nothing (and still returns 0) when pidof
# is unavailable or finds no process.
# Globals: reads RUSTFS_PID
first_rustfs_pid() {
  if [[ -z "$RUSTFS_PID" ]]; then
    # `|| true` keeps a failed lookup from tripping errexit/pipefail.
    pidof rustfs 2>/dev/null | awk '{ print $1 }' || true
  else
    echo "$RUSTFS_PID"
  fi
}

# Launch the available host telemetry collectors in the background.
# Nothing is started when telemetry is skipped or the interval is "0".
# Otherwise each of pidstat, iostat and mpstat that resolves via `command -v`
# is started with INTERVAL_SECS and SAMPLES, its output (stdout and stderr)
# going to <tool>.txt under OUT_DIR; pidstat additionally requires a PID from
# first_rustfs_pid.
# Globals: reads SKIP_HOST_TELEMETRY, INTERVAL_SECS, SAMPLES, OUT_DIR;
#          resets and then appends to TELEMETRY_PIDS
start_host_telemetry() {
  local rustfs_pid tool

  TELEMETRY_PIDS=()

  if [[ "$SKIP_HOST_TELEMETRY" != "true" && "$INTERVAL_SECS" != "0" ]]; then
    rustfs_pid="$(first_rustfs_pid)"

    for tool in pidstat iostat mpstat; do
      command -v "$tool" >/dev/null 2>&1 || continue

      case "$tool" in
        pidstat)
          # Per-process stats are only meaningful with a target PID.
          [[ -n "$rustfs_pid" ]] || continue
          pidstat -p "$rustfs_pid" "$INTERVAL_SECS" "$SAMPLES" >"$OUT_DIR/pidstat.txt" 2>&1 &
          ;;
        iostat)
          iostat -xz "$INTERVAL_SECS" "$SAMPLES" >"$OUT_DIR/iostat.txt" 2>&1 &
          ;;
        mpstat)
          mpstat "$INTERVAL_SECS" "$SAMPLES" >"$OUT_DIR/mpstat.txt" 2>&1 &
          ;;
      esac

      TELEMETRY_PIDS+=("$!")
    done
  fi
}

# Block until every recorded telemetry job has finished.
# The exit status of each job is ignored, so this always returns 0.
# Globals: reads TELEMETRY_PIDS
wait_host_telemetry() {
  local slot
  local total="${#TELEMETRY_PIDS[@]}"

  for ((slot = 0; slot < total; slot++)); do
    wait "${TELEMETRY_PIDS[slot]}" || true
  done
}

# Print the scanner status admin URL for ENDPOINT.
# One trailing "/" on ENDPOINT, if present, is dropped before the path is
# appended.
# Globals: reads ENDPOINT
scanner_status_url() {
  local base="${ENDPOINT%/}"

  printf '%s\n' "${base}/rustfs/admin/v3/scanner/status"
}

# Print the background-heal status admin URL for an endpoint.
# Arguments: $1 - endpoint base URL; one trailing "/" is dropped
background_heal_status_url() {
  local base="${1%/}"

  printf '%s\n' "${base}/rustfs/admin/v3/background-heal/status"
}

# Print the by-host admin metrics URL for an endpoint.
# Arguments: $1 - endpoint base URL; one trailing "/" is dropped
metrics_url() {
  local base="${1%/}"

  printf '%s\n' "${base}/rustfs/admin/v3/metrics?types=1&by-host=true&n=1"
}

# Turn an endpoint URL into a file-name-safe label.
# Everything up to and including the first "://" and everything from the
# first remaining "/" is removed; every character outside A-Za-z0-9._- is then
# replaced with "_". No trailing newline is printed.
# Arguments: $1 - endpoint URL
endpoint_label() {
  local authority="${1#*://}"

  authority="${authority%%/*}"
  tr -c 'A-Za-z0-9._-' '_' < <(printf '%s' "$authority")
}

# Print, one per line, the endpoints to query for background heal status.
# These are the non-empty comma-separated entries of METRICS_ENDPOINTS; when
# that yields nothing, the single ENDPOINT is printed instead.
# Globals: reads METRICS_ENDPOINTS, ENDPOINT
background_heal_status_endpoints() {
  local -a candidates=()
  local candidate
  local printed=0

  if [[ -z "$METRICS_ENDPOINTS" ]]; then
    printf '%s\n' "$ENDPOINT"
    return
  fi

  IFS=',' read -r -a candidates <<<"$METRICS_ENDPOINTS"

  for candidate in "${candidates[@]}"; do
    [[ -n "$candidate" ]] || continue
    printf '%s\n' "$candidate"
    printed=$((printed + 1))
  done

  # Only separators were given (for example ","): fall back to ENDPOINT.
  if ((printed == 0)); then
    printf '%s\n' "$ENDPOINT"
  fi
}

# Aggregate per-endpoint background heal status documents into one summary.
#
# All input files are slurped by jq into a single array (-s) and every counter
# is summed across them. Missing or null fields count as 0. For queueLength and
# activeTasks the top-level healQueueLength / healActiveTasks fields are used
# when the nested healOperations field is absent. The result keeps the
# healOperations shape: queueLength, activeTasks, queuedBySource,
# activeBySource, queuedByPriority and activeByPriority.
#
# Globals:   reads JQ_BIN
# Arguments: $1   - path of the summary file to write
#            $2.. - background heal status JSON files to aggregate
write_heal_status_summary() {
  local summary_file="$1"
  # Drop the output path so that "$@" holds only the input files.
  shift

  "$JQ_BIN" -s '
    {
      healOperations: {
        queueLength: (map(.healOperations.queueLength // .healQueueLength // 0) | add // 0),
        activeTasks: (map(.healOperations.activeTasks // .healActiveTasks // 0) | add // 0),
        queuedBySource: {
          scanner: (map(.healOperations.queuedBySource.scanner // 0) | add // 0),
          admin: (map(.healOperations.queuedBySource.admin // 0) | add // 0),
          autoHeal: (map(.healOperations.queuedBySource.autoHeal // 0) | add // 0),
          internal: (map(.healOperations.queuedBySource.internal // 0) | add // 0)
        },
        activeBySource: {
          scanner: (map(.healOperations.activeBySource.scanner // 0) | add // 0),
          admin: (map(.healOperations.activeBySource.admin // 0) | add // 0),
          autoHeal: (map(.healOperations.activeBySource.autoHeal // 0) | add // 0),
          internal: (map(.healOperations.activeBySource.internal // 0) | add // 0)
        },
        queuedByPriority: {
          low: (map(.healOperations.queuedByPriority.low // 0) | add // 0),
          normal: (map(.healOperations.queuedByPriority.normal // 0) | add // 0),
          high: (map(.healOperations.queuedByPriority.high // 0) | add // 0),
          urgent: (map(.healOperations.queuedByPriority.urgent // 0) | add // 0)
        },
        activeByPriority: {
          low: (map(.healOperations.activeByPriority.low // 0) | add // 0),
          normal: (map(.healOperations.activeByPriority.normal // 0) | add // 0),
          high: (map(.healOperations.activeByPriority.high // 0) | add // 0),
          urgent: (map(.healOperations.activeByPriority.urgent // 0) | add // 0)
        }
      }
    }
  ' "$@" >"$summary_file"
}

# Capture one background heal status snapshot per endpoint and summarise them.
#
# For every endpoint reported by background_heal_status_endpoints a signed
# POST is sent to its background-heal status URL; the response is passed
# through jq and stored as
#   $OUT_DIR/heal/background-heal-status.<label>.<index>.<ts>.json
# All captured files are then aggregated by write_heal_status_summary.
#
# The endpoint list is read into an array before any request is made. If the
# requests ran inside the `while read` loop, the HTTP client would inherit the
# loop's stdin (the endpoint list itself), and a client that reads stdin would
# silently swallow the remaining endpoints.
#
# Globals:   reads OUT_DIR, ACCESS_KEY, SECRET_KEY, REGION, AWSCURL_BIN, JQ_BIN
# Arguments: $1 - sample index
#            $2 - sample timestamp used in file names
#            $3 - path of the aggregated summary file to write
# Returns:   non-zero when no endpoint could be resolved or a capture fails
capture_background_heal_status_sample() {
  local index="$1"
  local ts="$2"
  local summary_file="$3"
  local endpoint label heal_status_file
  local endpoints=()
  local heal_status_files=()

  while IFS= read -r endpoint; do
    endpoints+=("$endpoint")
  done < <(background_heal_status_endpoints)

  # With no input files the summary step's jq would block reading stdin.
  if [[ ${#endpoints[@]} -eq 0 ]]; then
    echo "ERROR: no endpoints available for background heal status capture" >&2
    return 1
  fi

  for endpoint in "${endpoints[@]}"; do
    label="$(endpoint_label "$endpoint")"
    heal_status_file="$OUT_DIR/heal/background-heal-status.${label}.${index}.${ts}.json"
    AWS_ACCESS_KEY_ID="$ACCESS_KEY" \
    AWS_SECRET_ACCESS_KEY="$SECRET_KEY" \
    AWS_DEFAULT_REGION="$REGION" \
    "$AWSCURL_BIN" \
      --service s3 \
      --region "$REGION" \
      --request POST \
      "$(background_heal_status_url "$endpoint")" \
      | "$JQ_BIN" . >"$heal_status_file"
    heal_status_files+=("$heal_status_file")
  done

  write_heal_status_summary "$summary_file" "${heal_status_files[@]}"
}

# Capture one by-host admin metrics snapshot per metrics endpoint.
# Does nothing when METRICS_ENDPOINTS is empty. Otherwise each non-empty
# comma-separated entry receives a signed GET on its metrics URL and the raw
# response is stored as
#   $OUT_DIR/metrics/admin-metrics.<label>.<index>.<ts>.ndjson
# Globals:   reads METRICS_ENDPOINTS, OUT_DIR, ACCESS_KEY, SECRET_KEY, REGION,
#            AWSCURL_BIN
# Arguments: $1 - sample index
#            $2 - sample timestamp used in file names
capture_distributed_metrics_sample() {
  local index="$1"
  local ts="$2"
  local -a hosts=()
  local host host_label snapshot_path

  [[ -n "$METRICS_ENDPOINTS" ]] || return 0

  IFS=',' read -r -a hosts <<<"$METRICS_ENDPOINTS"

  for host in "${hosts[@]}"; do
    # Tolerate stray separators such as "a,,b" or a trailing comma.
    [[ -n "$host" ]] || continue

    host_label="$(endpoint_label "$host")"
    snapshot_path="$OUT_DIR/metrics/admin-metrics.${host_label}.${index}.${ts}.ndjson"

    AWS_ACCESS_KEY_ID="$ACCESS_KEY" \
    AWS_SECRET_ACCESS_KEY="$SECRET_KEY" \
    AWS_DEFAULT_REGION="$REGION" \
      "$AWSCURL_BIN" --service s3 --region "$REGION" --request GET \
      "$(metrics_url "$host")" >"$snapshot_path"
  done
}

# Capture one complete sample: scanner status, background heal status, a
# summary CSV row, and (when configured) distributed admin metrics.
#
# Steps, in order:
#   1. Signed GET of the scanner status URL, passed through jq and stored as
#      $OUT_DIR/status/scanner-status.<index>.<ts>.json.
#   2. Background heal status capture, aggregated into
#      $OUT_DIR/heal/background-heal-summary.<index>.<ts>.json.
#   3. One CSV row appended to SUMMARY_CSV, built from the scanner status
#      document and the heal summary.
#   4. Per-endpoint admin metrics capture.
#
# Globals:   reads OUT_DIR, ACCESS_KEY, SECRET_KEY, REGION, AWSCURL_BIN,
#            JQ_BIN, SUMMARY_CSV
# Arguments: $1 - sample index
#            $2 - sample timestamp, used in file names and as the first column
capture_status_sample() {
  local index="$1"
  local ts="$2"
  local status_file="$OUT_DIR/status/scanner-status.${index}.${ts}.json"
  local heal_summary_file="$OUT_DIR/heal/background-heal-summary.${index}.${ts}.json"

  # Credentials are handed to the HTTP client through its environment.
  AWS_ACCESS_KEY_ID="$ACCESS_KEY" \
  AWS_SECRET_ACCESS_KEY="$SECRET_KEY" \
  AWS_DEFAULT_REGION="$REGION" \
  "$AWSCURL_BIN" \
    --service s3 \
    --region "$REGION" \
    --request GET \
    "$(scanner_status_url)" \
    | "$JQ_BIN" . >"$status_file"

  capture_background_heal_status_sample "$index" "$ts" "$heal_summary_file"

  # Emit one @csv row whose fields follow the header order of SUMMARY_CSV.
  # Absent or null values become "" for text fields and 0 for counters;
  # source_work_missed_total is the sum of .missed over .metrics.source_work.
  "$JQ_BIN" -r --arg ts "$ts" --slurpfile heal_status "$heal_summary_file" '
    ($heal_status[0] // {}) as $heal |
    [
      $ts,
      (.metrics.pacing_pressure.primary_pressure // ""),
      (.metrics.current_cycle_objects_scanned // 0),
      (.metrics.current_cycle_directories_scanned // 0),
      (.metrics.last_cycle_result // ""),
      (.metrics.last_cycle_partial_reason // ""),
      (.metrics.last_cycle_partial_source // ""),
      (.metrics.lifecycle_transition.scanner_missed // 0),
      ((.metrics.source_work // []) | map(.missed // 0) | add // 0),
      (.metrics.current_cycle_usage_saves // 0),
      (.metrics.last_cycle_usage_saves // 0),
      (.metrics.usage_freshness.dirty_pending_buckets // 0),
      (.metrics.usage_freshness.last_cycle_dirty_buckets // 0),
      (.metrics.usage_freshness.last_cycle_cleared_dirty_buckets // 0),
      (.metrics.usage_freshness.last_usage_save_result // ""),
      (.metrics.usage_freshness.last_usage_save_unix_secs // 0),
      (.metrics.life_time_ops.scan_cycle // 0),
      (.metrics.life_time_ops.scan_bucket_drive // 0),
      (.metrics.life_time_ops.scan_object // 0),
      (.metrics.life_time_ops.save_usage // 0),
      ($heal.healOperations.queueLength // $heal.healQueueLength // 0),
      ($heal.healOperations.activeTasks // $heal.healActiveTasks // 0),
      ($heal.healOperations.queuedBySource.scanner // 0),
      ($heal.healOperations.queuedBySource.admin // 0),
      ($heal.healOperations.queuedBySource.autoHeal // 0)
    ] | @csv
  ' "$status_file" >>"$SUMMARY_CSV"

  capture_distributed_metrics_sample "$index" "$ts"
}

# Take SAMPLES status samples, pausing INTERVAL_SECS between consecutive ones.
# Each sample gets a 1-based index and a fresh UTC timestamp. There is no
# pause after the last sample, nor any pause when the interval is "0".
# Globals: reads SAMPLES, INTERVAL_SECS
capture_status_series() {
  local sample_no=1
  local stamp

  while ((sample_no <= SAMPLES)); do
    stamp="$(date -u +%Y%m%dT%H%M%SZ)"
    capture_status_sample "$sample_no" "$stamp"

    if ((sample_no < SAMPLES)) && [[ "$INTERVAL_SECS" != "0" ]]; then
      sleep "$INTERVAL_SECS"
    fi

    sample_no=$((sample_no + 1))
  done
}

# Print the number of regular files under a directory whose name matches a
# find(1) -name pattern.
#
# A missing or unreadable directory counts as 0 matches. find's diagnostics
# are discarded and its exit status is neutralised explicitly: with
# `set -o pipefail` a failing find would otherwise fail the whole pipeline
# and, under errexit, abort the caller's `var="$(count_artifacts ...)"`.
#
# Arguments: $1 - directory to search
#            $2 - file name pattern (find -name syntax)
# Outputs:   the count, without padding, followed by a newline
count_artifacts() {
  local dir="$1"
  local pattern="$2"

  { find "$dir" -type f -name "$pattern" 2>/dev/null || true; } | wc -l | tr -d ' '
}

# Write scanner-validation-report.md into OUT_DIR.
# The report records the run parameters, the number of captured scanner
# status, background heal status and admin metrics files (counted on disk by
# file-name pattern), and a fixed review checklist.
# Globals: reads OUT_DIR, DEPLOYMENT, WORKLOAD_LABEL, ENDPOINT, SAMPLES,
#          INTERVAL_SECS
write_report() {
  local status_count heal_status_count metrics_count
  status_count="$(count_artifacts "$OUT_DIR/status" 'scanner-status.*.json')"
  heal_status_count="$(count_artifacts "$OUT_DIR/heal" 'background-heal-status.*.json')"
  metrics_count="$(count_artifacts "$OUT_DIR/metrics" 'admin-metrics.*.ndjson')"

  # Unquoted delimiter: the $VARIABLE references below are expanded.
  cat >"$OUT_DIR/scanner-validation-report.md" <<EOF
## Scanner Validation Report

Deployment: $DEPLOYMENT
Workload label: $WORKLOAD_LABEL
Endpoint: $ENDPOINT
Samples: $SAMPLES
Interval seconds: $INTERVAL_SECS

## Artifact Summary

- Scanner status snapshots: $status_count
- Background heal status snapshots: $heal_status_count
- Distributed admin metrics snapshots: $metrics_count
- Summary CSV: scanner-summary.csv
- Usage freshness summary: current/last usage saves, dirty bucket state, last usage save result, and scanner life_time_ops.
- Run metadata: run-metadata.env

## Review Checklist

- Compare scanner progress and pressure in scanner-summary.csv.
- Confirm dirty usage marks lead to usage saves when validating post-start bucket metrics freshness.
- Check life_time_ops scan_cycle, scan_bucket_drive, scan_object, and save_usage before accepting bucket metrics freshness results.
- Check source work missed totals before accepting pressure reductions.
- Check healOperations queued and active counts when heal or bitrot pressure is involved.
- Use distributed admin metrics snapshots for by-host investigation when metrics endpoints were provided.
- Attach host telemetry when pidstat, iostat, or mpstat files are present.
EOF
}

# Best-effort teardown for background telemetry collectors.
# Sends the default signal to every PID still listed in TELEMETRY_PIDS, reaps
# it, and empties the list. Never fails, so it is safe as an EXIT trap.
_stop_host_telemetry() {
  local pid

  if [[ ${#TELEMETRY_PIDS[@]} -eq 0 ]]; then
    return 0
  fi

  for pid in "${TELEMETRY_PIDS[@]}"; do
    kill "$pid" 2>/dev/null || true
    wait "$pid" 2>/dev/null || true
  done

  TELEMETRY_PIDS=()
}

# Entry point: parse and validate options, check required tools, prepare the
# output directory, then run the capture pipeline and write the report.
#
# pidstat/iostat/mpstat are started as background jobs that run for up to
# SAMPLES * INTERVAL_SECS seconds. An EXIT trap stops them if the shell leaves
# before they were waited for (for example when a sample capture fails under
# errexit); otherwise they would keep running, and writing into OUT_DIR, long
# after the harness has gone.
#
# Arguments: the script's command-line arguments
main() {
  parse_args "$@"
  validate_args
  require_cmd "$MC_BIN"
  require_cmd "$AWSCURL_BIN"
  require_cmd "$JQ_BIN"
  setup_output

  local started_at
  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  write_metadata "$started_at"
  capture_config_snapshots

  # Install the cleanup before anything is launched.
  trap _stop_host_telemetry EXIT
  start_host_telemetry
  capture_status_series
  wait_host_telemetry
  # Every collector has been waited for; leave nothing for the trap to signal.
  TELEMETRY_PIDS=()
  write_report

  echo "Scanner validation artifacts written to $OUT_DIR"
}

# Script entry point: run main with the command-line arguments unchanged.
main "$@"