mirror of
https://github.com/rustfs/rustfs.git
synced 2026-05-06 22:28:16 +08:00
767 lines
21 KiB
Bash
Executable File
767 lines
21 KiB
Bash
Executable File
#!/usr/bin/env bash
# Strict mode: abort on command errors, unset variables and failed pipeline stages.
set -euo pipefail

# Repository root, resolved as the parent of the directory containing this script.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Every setting below keeps a value already present in the environment and only
# falls back to the listed default when the variable is unset or empty.

# --- compose stack selection and phase toggles ---
: "${CLUSTER_COMPOSE:=${PROJECT_ROOT}/.docker/compose/docker-compose.cluster.local-build.yml}"
: "${OBS_COMPOSE:=${PROJECT_ROOT}/.docker/observability/docker-compose.yml}"
: "${PROJECT_NAME:=rustfs-four-node-test}"
: "${IMAGE_TAG:=rustfs/rustfs:local-4node}"
: "${WITH_OBSERVABILITY:=true}"
: "${BUILD_LOCAL_IMAGE:=true}"
: "${RUN_FAILOVER:=true}"
: "${RUN_BENCHMARK:=true}"
: "${KEEP_UP:=false}"
: "${PRECHECK_AUTO_CLEANUP:=true}"
: "${WAIT_PROBE_MODE:=service}"

# --- values exported to the compose environment by main ---
: "${RUSTFS_ACCESS_KEY:=rustfsadmin}"
: "${RUSTFS_SECRET_KEY:=rustfsadmin}"
# Left empty on purpose: an empty endpoint is auto-selected later in main.
: "${RUSTFS_OBS_ENDPOINT:=}"
: "${RUSTFS_UNSAFE_BYPASS_DISK_CHECK:=true}"

# --- readiness and failover timing ---
: "${WAIT_TIMEOUT_SECS:=180}"
: "${BENCH_READY_TIMEOUT_SECS:=180}"
: "${FAILOVER_NODE:=node4}"
: "${FAILOVER_WARMUP_SECS:=5}"
: "${FAILOVER_SAMPLE_SECS:=60}"
: "${FAILOVER_INTERVAL_SECS:=1}"
: "${BENCH_WAIT_MODE:=ready}"

# --- benchmark parameters ---
: "${BENCH_ENDPOINT:=http://127.0.0.1:9000}"
: "${BENCH_BUCKET:=rustfs-four-node-bench}"
# Both concurrency settings default to empty; resolve_bench_concurrency picks the final list.
: "${BENCH_CONCURRENCY:=}"
: "${BENCH_CONCURRENCIES:=}"
: "${BENCH_DURATION:=60s}"
: "${BENCH_SIZES:=1KiB,4KiB,11Mi}"

# --- artifacts: timestamped directory under target/bench ---
: "${OUT_DIR:=${PROJECT_ROOT}/target/bench/four-node-failover-$(date +%Y%m%d-%H%M%S)}"
|
|
|
|
# Print the command-line help text to stdout.
# The heredoc delimiter is quoted, so nothing inside it is expanded.
# The --bench-concurrencies default shown here matches the fallback applied by
# resolve_bench_concurrency (8,16).
usage() {
  cat <<'USAGE'
Usage:
  scripts/run_four_node_cluster_failover_bench.sh [options]

Options:
  --cluster-compose <path>      4-node compose file
  --obs-compose <path>          observability compose file
  --project-name <name>         docker compose project name
  --image-tag <tag>             image tag to build/use
  --with-observability          bring up .docker/observability stack together
  --without-observability       only bring up 4-node cluster
  --skip-build                  skip docker build from Dockerfile.source
  --skip-failover               skip failover recovery validation
  --skip-bench                  skip benchmark phase
  --failover-node <nodeN>       node to stop during failover test (default: node4)
  --obs-endpoint <url>          RUSTFS_OBS_ENDPOINT (default: auto-select by mode)
  --bench-endpoint <url>        benchmark endpoint (default: http://127.0.0.1:9000)
  --bench-sizes <sizes>         comma list (default: 1KiB,4KiB,11Mi)
  --bench-concurrency <n>       benchmark concurrency
  --bench-concurrencies <list>  benchmark concurrency list (default: 8,16)
  --bench-duration <dur>        benchmark duration
  --out-dir <path>              output directory
  --keep-up                     keep compose services running after script exits
  -h, --help                    show help

Environment:
  CLUSTER_COMPOSE OBS_COMPOSE PROJECT_NAME IMAGE_TAG
  WITH_OBSERVABILITY BUILD_LOCAL_IMAGE RUN_FAILOVER RUN_BENCHMARK KEEP_UP
  RUSTFS_ACCESS_KEY RUSTFS_SECRET_KEY RUSTFS_OBS_ENDPOINT
  PRECHECK_AUTO_CLEANUP (true|false, default: true)
  WAIT_PROBE_MODE (service|ready, default: service)
  WAIT_TIMEOUT_SECS FAILOVER_NODE FAILOVER_WARMUP_SECS FAILOVER_SAMPLE_SECS
  FAILOVER_INTERVAL_SECS BENCH_ENDPOINT BENCH_BUCKET BENCH_CONCURRENCY
  BENCH_CONCURRENCIES BENCH_DURATION BENCH_SIZES OUT_DIR
  BENCH_WAIT_MODE (ready|service, default: ready)
  BENCH_READY_TIMEOUT_SECS (default: 180)
USAGE
}
|
|
|
|
# Write an informational message (all arguments joined by spaces) to stdout.
log_info() {
  printf '%s\n' "[INFO] $*"
}
|
|
|
|
# Write a warning message (all arguments joined by spaces) to stdout.
log_warn() {
  printf '%s\n' "[WARN] $*"
}
|
|
|
|
# Write an error message (all arguments joined by spaces) to stderr.
log_error() {
  printf '%s\n' "[ERROR] $*" 1>&2
}
|
|
|
|
# Abort the script with exit status 1 unless the given command is resolvable.
# Arguments: $1 - command name to look up with `command -v`
require_cmd() {
  local tool="$1"

  if command -v "${tool}" >/dev/null 2>&1; then
    return 0
  fi

  log_error "command not found: ${tool}"
  exit 1
}
|
|
|
|
# Run `docker compose` for this project with the right set of compose files.
# When WITH_OBSERVABILITY is "true" the observability file is passed first,
# followed by the cluster file; otherwise only the cluster file is used.
# Arguments: "$@" - forwarded verbatim to docker compose
compose() {
  local -a file_args=()

  if [[ "${WITH_OBSERVABILITY}" == "true" ]]; then
    file_args+=(-f "${OBS_COMPOSE}")
  fi
  file_args+=(-f "${CLUSTER_COMPOSE}")

  docker compose --project-name "${PROJECT_NAME}" "${file_args[@]}" "$@"
}
|
|
|
|
# Validate that a setting holds exactly "true" or "false"; exit 1 otherwise.
# Arguments: $1 - setting name (used only in the error message)
#            $2 - value to validate
resolve_bool() {
  local setting="$1"
  local candidate="$2"

  if [[ "${candidate}" == "true" || "${candidate}" == "false" ]]; then
    return 0
  fi

  log_error "invalid ${setting}: ${candidate} (expected true|false)"
  exit 1
}
|
|
|
|
# Validate WAIT_PROBE_MODE: only "service" and "ready" are accepted; exit 1 otherwise.
resolve_probe_mode() {
  if [[ "${WAIT_PROBE_MODE}" == "service" || "${WAIT_PROBE_MODE}" == "ready" ]]; then
    return 0
  fi

  log_error "invalid WAIT_PROBE_MODE: ${WAIT_PROBE_MODE} (expected service|ready)"
  exit 1
}
|
|
|
|
# Validate BENCH_WAIT_MODE: only "ready" and "service" are accepted; exit 1 otherwise.
resolve_bench_wait_mode() {
  if [[ "${BENCH_WAIT_MODE}" == "ready" || "${BENCH_WAIT_MODE}" == "service" ]]; then
    return 0
  fi

  log_error "invalid BENCH_WAIT_MODE: ${BENCH_WAIT_MODE} (expected ready|service)"
  exit 1
}
|
|
|
|
# Settle the final value of BENCH_CONCURRENCIES.
# Precedence: an explicit BENCH_CONCURRENCIES list wins (with a warning when a
# different BENCH_CONCURRENCY was also given); otherwise the single
# BENCH_CONCURRENCY value is used; otherwise the list falls back to "8,16".
# Globals: BENCH_CONCURRENCY (read), BENCH_CONCURRENCIES (read/written)
resolve_bench_concurrency() {
  if [[ -n "${BENCH_CONCURRENCIES}" ]]; then
    if [[ -n "${BENCH_CONCURRENCY}" && "${BENCH_CONCURRENCY}" != "${BENCH_CONCURRENCIES}" ]]; then
      log_warn "BENCH_CONCURRENCY is ignored because BENCH_CONCURRENCIES is set"
    fi
    return
  fi

  # A wider sweep (8,16,32,64,128) was the fallback at some point; the active
  # fallback is the short "8,16" list.
  BENCH_CONCURRENCIES="${BENCH_CONCURRENCY:-8,16}"
}
|
|
|
|
# Succeed when the cluster compose file contains a YAML list item that is exactly
# "otel-network" (optionally followed by a trailing comment), i.e. some service
# lists that network.
cluster_compose_uses_otel_network() {
  local list_item_re='^[[:space:]]*-[[:space:]]*otel-network([[:space:]]*#.*)?$'

  grep -E -q -- "${list_item_re}" "${CLUSTER_COMPOSE}"
}
|
|
|
|
# Succeed when the observability compose file has a line consisting solely of
# the key "otel-collector:" (surrounding whitespace allowed).
obs_compose_has_otel_collector() {
  local service_key_re='^[[:space:]]*otel-collector:[[:space:]]*$'

  grep -E -q -- "${service_key_re}" "${OBS_COMPOSE}"
}
|
|
|
|
# Choose RUSTFS_OBS_ENDPOINT automatically and log the choice.
#   - observability stack disabled            -> host.docker.internal:4318
#   - cluster joins otel-network and the obs
#     compose defines otel-collector          -> otel-collector:4318
#   - anything else                           -> host.docker.internal:4318
# Globals: WITH_OBSERVABILITY (read), RUSTFS_OBS_ENDPOINT (written)
resolve_default_obs_endpoint() {
  if [[ "${WITH_OBSERVABILITY}" != "true" ]]; then
    RUSTFS_OBS_ENDPOINT="http://host.docker.internal:4318"
    log_info "Auto-selected RUSTFS_OBS_ENDPOINT=${RUSTFS_OBS_ENDPOINT} (observability stack disabled)"
  elif cluster_compose_uses_otel_network && obs_compose_has_otel_collector; then
    RUSTFS_OBS_ENDPOINT="http://otel-collector:4318"
    log_info "Auto-selected RUSTFS_OBS_ENDPOINT=${RUSTFS_OBS_ENDPOINT} (shared docker network detected)"
  else
    RUSTFS_OBS_ENDPOINT="http://host.docker.internal:4318"
    log_info "Auto-selected RUSTFS_OBS_ENDPOINT=${RUSTFS_OBS_ENDPOINT} (cross-network fallback)"
  fi
}
|
|
|
|
# Succeed when `docker info` succeeds; all of its output is discarded and its
# exit status is passed through.
docker_daemon_ready() {
  docker info &>/dev/null
}
|
|
|
|
# Report whether something is listening on a TCP port.
# The first available tool among lsof, ss and netstat is used and its result is
# returned as-is. When none of them is installed the port is reported as free
# (status 1) and a later compose failure is relied upon instead.
# Arguments: $1 - TCP port number
# Returns:   0 when a listener was found, non-zero otherwise
port_is_occupied() {
  local tcp_port="$1"

  if command -v lsof >/dev/null 2>&1; then
    lsof -nP -iTCP:"${tcp_port}" -sTCP:LISTEN >/dev/null 2>&1
  elif command -v ss >/dev/null 2>&1; then
    # The first line printed by ss is the header; any further line is a listener.
    ss -ltn "sport = :${tcp_port}" 2>/dev/null | awk 'NR>1 {found=1} END{exit found?0:1}'
  elif command -v netstat >/dev/null 2>&1; then
    netstat -an 2>/dev/null | grep -E "[\.\:]${tcp_port}[[:space:]].*LISTEN" >/dev/null 2>&1
  else
    return 1
  fi
}
|
|
|
|
# Print the header plus the first listener entry for a TCP port, each line
# prefixed with a space. lsof is preferred, ss is the fallback; with neither
# tool installed nothing is printed.
# Arguments: $1 - TCP port number
print_port_owner() {
  local tcp_port="$1"

  if command -v lsof >/dev/null 2>&1; then
    lsof -nP -iTCP:"${tcp_port}" -sTCP:LISTEN 2>/dev/null | awk 'NR==1 || NR==2 {print " " $0}'
  elif command -v ss >/dev/null 2>&1; then
    ss -ltnp "sport = :${tcp_port}" 2>/dev/null | awk 'NR==1 || NR==2 {print " " $0}'
  fi
}
|
|
|
|
# Deal with containers left over from an earlier run of the same compose project.
# Nothing happens when no container carries the project label. Otherwise the
# leftovers are listed and then either force-removed (PRECHECK_AUTO_CLEANUP=true)
# or the script exits with status 1.
# Globals: PROJECT_NAME, PRECHECK_AUTO_CLEANUP (read)
cleanup_existing_project_containers() {
  local project_filter="label=com.docker.compose.project=${PROJECT_NAME}"
  local leftover_ids

  leftover_ids="$(docker ps -aq --filter "${project_filter}")"
  [[ -n "${leftover_ids}" ]] || return 0

  log_warn "Found existing containers for project ${PROJECT_NAME}."
  docker ps -a --filter "${project_filter}" --format ' - {{.Names}} ({{.Status}})'

  if [[ "${PRECHECK_AUTO_CLEANUP}" != "true" ]]; then
    log_error "existing project containers detected and PRECHECK_AUTO_CLEANUP=false"
    log_error "run docker compose down --remove-orphans first, or set PRECHECK_AUTO_CLEANUP=true"
    exit 1
  fi

  log_info "PRECHECK_AUTO_CLEANUP=true, removing existing project containers."
  # Word splitting is intended here: one container id per argument.
  # shellcheck disable=SC2086
  docker rm -f ${leftover_ids} >/dev/null
}
|
|
|
|
# Verify that every host port the stack publishes is free; exit 1 otherwise.
# The four node ports are always checked; the observability ports are added
# when WITH_OBSERVABILITY is "true". On conflict the occupied ports and (where a
# tool is available) their owners are reported before exiting.
check_required_ports_free() {
  local -a wanted=(9000 9001 9002 9003)
  local -a busy=()
  local candidate

  if [[ "${WITH_OBSERVABILITY}" == "true" ]]; then
    wanted+=(1888 3000 3100 3200 4040 4317 4318 55679 8888 8889 9090 13133 14269 16686)
  fi

  for candidate in "${wanted[@]}"; do
    port_is_occupied "${candidate}" || continue
    busy+=("${candidate}")
  done

  (( ${#busy[@]} > 0 )) || return 0

  log_error "required host ports are occupied: ${busy[*]}"
  for candidate in "${busy[@]}"; do
    # Owner lookup is best-effort; a failing lookup must not mask the real error.
    print_port_owner "${candidate}" || true
  done
  log_error "free these ports or run with a different compose/profile before retrying"
  exit 1
}
|
|
|
|
# Exit with status 1 unless `docker image inspect` succeeds for IMAGE_TAG.
ensure_runtime_image_exists() {
  docker image inspect "${IMAGE_TAG}" >/dev/null 2>&1 && return 0

  log_error "image not found: ${IMAGE_TAG}"
  log_error "build it first or rerun without --skip-build"
  exit 1
}
|
|
|
|
# Check that each already-existing cluster data volume can be written to.
# Volume names follow "<PROJECT_NAME>_node<N>_data_<D>" for N and D in 1..4.
# Missing volumes are only reported, never created here: a volume created by a
# plain `docker run` would make compose warn that it "already exists but was
# not created by Docker Compose". A failed write probe exits with status 1.
check_cluster_volumes_writable() {
  local node disk vol
  local probe_script='set -e; touch /probe/.rwtest; rm -f /probe/.rwtest'

  log_info "Checking cluster data volumes writable"

  for ((node = 1; node <= 4; node++)); do
    for ((disk = 1; disk <= 4; disk++)); do
      vol="${PROJECT_NAME}_node${node}_data_${disk}"

      if ! docker volume inspect "${vol}" >/dev/null 2>&1; then
        log_info "volume not present yet (will be created by compose): ${vol}"
        continue
      fi

      # Mount the volume into a throwaway container and create/remove a marker file.
      docker run --rm --entrypoint sh -v "${vol}:/probe" "${IMAGE_TAG}" -c \
        "${probe_script}" >/dev/null 2>&1 && continue

      log_error "volume write check failed: ${vol}"
      exit 1
    done
  done
}
|
|
|
|
# Pre-build checks: the docker daemon must answer, leftover project containers
# are handled, and the required host ports must be free.
# Exits with status 1 when the docker daemon cannot be reached.
run_precheck_before_build() {
  log_info "Running precheck: docker daemon, residue containers, host ports"

  docker_daemon_ready || {
    log_error "cannot connect to docker daemon (permission or runtime not ready)"
    exit 1
  }

  cleanup_existing_project_containers
  check_required_ports_free
}
|
|
|
|
# Post-build checks, run in order: the runtime image must exist, then the
# existing cluster data volumes must be writable.
run_precheck_after_build() {
  local check

  log_info "Running precheck: image exists, cluster volumes writable"
  for check in ensure_runtime_image_exists check_cluster_volumes_writable; do
    "${check}"
  done
}
|
|
|
|
# Translate a node name into its host port: node1..node4 map to 9000..9003.
# Any other name is reported on stderr and ends the (sub)shell with status 1.
# Arguments: $1 - node name
# Outputs:   the port number on stdout
node_port() {
  local node_name="$1"

  if [[ "${node_name}" =~ ^node([1-4])$ ]]; then
    # node<N> listens on 9000 + (N - 1).
    echo "$((8999 + BASH_REMATCH[1]))"
    return
  fi

  log_error "unknown node name: ${node_name} (expected node1..node4)"
  exit 1
}
|
|
|
|
# Poll a URL every 2 seconds until curl fetches it successfully (curl -f treats
# HTTP error statuses as failures) or WAIT_TIMEOUT_SECS have elapsed.
# Arguments: $1 - URL to poll
# Returns:   0 once the URL answers, 1 on timeout
wait_http_ok() {
  local url="$1"
  local started_at elapsed

  started_at="$(date +%s)"
  until curl -fsS --connect-timeout 2 --max-time 3 "${url}" >/dev/null 2>&1; do
    elapsed=$(( $(date +%s) - started_at ))
    if (( elapsed >= WAIT_TIMEOUT_SECS )); then
      log_error "timed out waiting for ${url}"
      return 1
    fi
    sleep 2
  done

  return 0
}
|
|
|
|
# Single-shot health probe of one node on 127.0.0.1.
# /health must answer 200. In WAIT_PROBE_MODE=ready, /health/ready must also
# answer 200. In service mode the check is deliberately permissive: any HTTP
# status from 100 to 599 on "/" counts as alive, to avoid local false negatives
# (the benchmark phase applies its own stricter gate).
# Arguments: $1 - host port of the node
# Returns:   0 when the node passes the probe, 1 otherwise
probe_node_service_ok() {
  local port="$1"
  local base_url="http://127.0.0.1:${port}"
  # Shared curl invocation: print only the HTTP status code.
  local -a fetch_code=(curl -s -o /dev/null -w '%{http_code}' --connect-timeout 2 --max-time 3)
  local code

  code="$("${fetch_code[@]}" "${base_url}/health" || true)"
  [[ "${code}" == "200" ]] || return 1

  if [[ "${WAIT_PROBE_MODE}" == "ready" ]]; then
    code="$("${fetch_code[@]}" "${base_url}/health/ready" || true)"
    [[ "${code}" == "200" ]]
    return
  fi

  code="$("${fetch_code[@]}" "${base_url}/" || true)"
  [[ "${code}" =~ ^[1-5][0-9][0-9]$ ]]
}
|
|
|
|
# Single-shot readiness probe of BENCH_ENDPOINT (one trailing slash is ignored).
# /health must answer 200. In BENCH_WAIT_MODE=ready, /health/ready must also
# answer 200. Otherwise "/" must answer with a 2xx or 3xx status, or 401, 403
# or 404.
# Returns: 0 when the endpoint passes the probe, 1 otherwise
probe_bench_endpoint_ok() {
  local base="${BENCH_ENDPOINT%/}"
  # Shared curl invocation: print only the HTTP status code.
  local -a fetch_code=(curl -s -o /dev/null -w '%{http_code}' --connect-timeout 2 --max-time 3)
  local code

  code="$("${fetch_code[@]}" "${base}/health" || true)"
  [[ "${code}" == "200" ]] || return 1

  if [[ "${BENCH_WAIT_MODE}" == "ready" ]]; then
    code="$("${fetch_code[@]}" "${base}/health/ready" || true)"
    [[ "${code}" == "200" ]]
    return
  fi

  code="$("${fetch_code[@]}" "${base}/" || true)"
  [[ "${code}" =~ ^([23][0-9][0-9]|401|403|404)$ ]]
}
|
|
|
|
# Poll probe_bench_endpoint_ok every 2 seconds until it succeeds or
# BENCH_READY_TIMEOUT_SECS have elapsed.
# Returns: 0 once the endpoint is ready, 1 on timeout
wait_bench_endpoint_ready() {
  local started_at elapsed

  started_at="$(date +%s)"
  until probe_bench_endpoint_ok; do
    elapsed=$(( $(date +%s) - started_at ))
    if (( elapsed >= BENCH_READY_TIMEOUT_SECS )); then
      log_error "timed out waiting for benchmark endpoint ${BENCH_ENDPOINT} (mode=${BENCH_WAIT_MODE})"
      return 1
    fi
    sleep 2
  done

  return 0
}
|
|
|
|
# Poll probe_node_service_ok for one node every 2 seconds until it succeeds or
# WAIT_TIMEOUT_SECS have elapsed.
# Arguments: $1 - host port of the node
# Returns:   0 once the node passes the probe, 1 on timeout
wait_node_probe_ok() {
  local port="$1"
  local started_at elapsed

  started_at="$(date +%s)"
  until probe_node_service_ok "${port}"; do
    elapsed=$(( $(date +%s) - started_at ))
    if (( elapsed >= WAIT_TIMEOUT_SECS )); then
      log_error "timed out waiting for node probe on 127.0.0.1:${port} (mode=${WAIT_PROBE_MODE})"
      return 1
    fi
    sleep 2
  done

  return 0
}
|
|
|
|
# Wait, one node after another, until the nodes on host ports 9000-9003 pass
# the node probe. A timeout on any node aborts the script through `set -e`
# when this is called outside a conditional.
wait_cluster_ready() {
  local -a node_ports=(9000 9001 9002 9003)
  local node_port_number

  for node_port_number in "${node_ports[@]}"; do
    wait_node_probe_ok "${node_port_number}"
  done
}
|
|
|
|
# Probe every node port except the one taken down for the failover test.
# Arguments: $1 - host port of the stopped node (skipped)
# Returns:   0 when all remaining nodes pass the probe, 1 at the first failure
probe_survivors_ready() {
  local stopped_port="$1"
  local candidate

  for candidate in 9000 9001 9002 9003; do
    [[ "${candidate}" != "${stopped_port}" ]] || continue
    probe_node_service_ok "${candidate}" || return 1
  done

  return 0
}
|
|
|
|
# Stop one node, sample the health of the surviving nodes, and record how long
# it took them to recover.
#
# Flow:
#   1. Wait FAILOVER_WARMUP_SECS, then `compose stop` FAILOVER_NODE.
#   2. For FAILOVER_SAMPLE_SECS, probe the survivors every FAILOVER_INTERVAL_SECS
#      and append "<epoch>,ok|fail" rows to ${OUT_DIR}/failover-probe.csv.
#   3. Summarise into ${OUT_DIR}/failover-summary.txt: no outage, outage with
#      recovery time, or outage not recovered within the sampling window.
#   4. Start the node again and wait for it and the whole cluster to pass probes.
#
# Recovery is located by row order (first "ok" row after the first "fail" row),
# not by comparing timestamps. Timestamps have one-second resolution, so with a
# sub-second sampling interval a recovery logged in the same second as the
# first failure would otherwise be missed and reported as "unrecovered".
#
# Globals: FAILOVER_NODE, FAILOVER_WARMUP_SECS, FAILOVER_SAMPLE_SECS,
#          FAILOVER_INTERVAL_SECS, OUT_DIR (read)
run_failover_validation() {
  local failover_port probe_file summary_file
  local event_epoch end_epoch ts status
  local first_fail first_recover

  failover_port="$(node_port "${FAILOVER_NODE}")"
  probe_file="${OUT_DIR}/failover-probe.csv"
  summary_file="${OUT_DIR}/failover-summary.txt"
  mkdir -p "$(dirname "${probe_file}")"

  log_info "Running failover validation: stopping ${FAILOVER_NODE}"
  sleep "${FAILOVER_WARMUP_SECS}"

  compose stop "${FAILOVER_NODE}" >/dev/null
  event_epoch="$(date +%s)"
  end_epoch="$((event_epoch + FAILOVER_SAMPLE_SECS))"

  echo "timestamp_epoch,status" > "${probe_file}"
  while (( $(date +%s) <= end_epoch )); do
    ts="$(date +%s)"
    if probe_survivors_ready "${failover_port}"; then
      status="ok"
    else
      status="fail"
    fi
    echo "${ts},${status}" >> "${probe_file}"
    sleep "${FAILOVER_INTERVAL_SECS}"
  done

  # NR>1 skips the CSV header row.
  first_fail="$(awk -F',' 'NR>1 && $2=="fail" {print $1; exit}' "${probe_file}")"
  first_recover="$(awk -F',' '
    NR>1 && $2=="fail" {seen_fail=1; next}
    seen_fail && $2=="ok" {print $1; exit}
  ' "${probe_file}")"

  {
    echo "failover_node=${FAILOVER_NODE}"
    if [[ -z "${first_fail}" ]]; then
      echo "outage_observed=false"
      echo "recovery_seconds=0"
      echo "note=no survivor readiness interruption observed in probe window"
    elif [[ -z "${first_recover}" ]]; then
      echo "outage_observed=true"
      echo "recovery_seconds=unrecovered_within_${FAILOVER_SAMPLE_SECS}s"
      echo "first_fail_epoch=${first_fail}"
    else
      echo "outage_observed=true"
      echo "first_fail_epoch=${first_fail}"
      echo "first_recover_epoch=${first_recover}"
      echo "recovery_seconds=$((first_recover - first_fail))"
    fi
  } > "${summary_file}"

  log_info "Restarting ${FAILOVER_NODE}"
  compose start "${FAILOVER_NODE}" >/dev/null
  wait_node_probe_ok "${failover_port}"
  wait_cluster_ready
}
|
|
|
|
# Run the object benchmark once per requested concurrency level, sequentially.
#
# The comma-separated BENCH_CONCURRENCIES list is parsed and validated in full
# BEFORE waiting for the endpoint or starting any run, so a typo in a later
# entry cannot abort the script after earlier long-running benchmarks have
# already completed. Entries are trimmed of surrounding whitespace, empty
# entries are skipped, and values are read as base 10 (so "08" means 8).
# Each run writes to ${OUT_DIR}/benchmark/concurrency-<n>.
#
# Exits with status 1 when warp is missing, an entry is not a positive
# integer, or the list contains no usable entry.
# Globals: OUT_DIR, PROJECT_ROOT, BENCH_* and RUSTFS_ACCESS_KEY/SECRET_KEY (read)
run_benchmark() {
  local bench_out_dir="${OUT_DIR}/benchmark"
  local -a raw_list=()
  local -a conc_list=()
  local conc conc_dir

  mkdir -p "${bench_out_dir}"

  if ! command -v warp >/dev/null 2>&1; then
    log_error "warp is required for benchmark phase. Please install warp or run with --skip-bench."
    exit 1
  fi

  IFS=',' read -r -a raw_list <<< "${BENCH_CONCURRENCIES}"
  # The ${arr[@]+...} form keeps an empty array safe under `set -u` on older bash.
  for conc in ${raw_list[@]+"${raw_list[@]}"}; do
    # Trim leading and trailing whitespace without spawning a process.
    conc="${conc#"${conc%%[![:space:]]*}"}"
    conc="${conc%"${conc##*[![:space:]]}"}"
    [[ -n "${conc}" ]] || continue

    # 10# forces base 10 so a leading zero is not parsed as octal.
    if ! [[ "${conc}" =~ ^[0-9]+$ ]] || (( 10#${conc} <= 0 )); then
      log_error "invalid concurrency in BENCH_CONCURRENCIES: ${conc}"
      exit 1
    fi
    conc_list+=("$((10#${conc}))")
  done

  if (( ${#conc_list[@]} == 0 )); then
    log_error "no usable concurrency value in BENCH_CONCURRENCIES: ${BENCH_CONCURRENCIES}"
    exit 1
  fi

  log_info "Waiting for benchmark endpoint readiness (mode=${BENCH_WAIT_MODE})"
  wait_bench_endpoint_ready

  for conc in "${conc_list[@]}"; do
    conc_dir="${bench_out_dir}/concurrency-${conc}"
    log_info "Running benchmark sequentially with concurrency=${conc}"
    (
      # Subshell: the directory change does not leak into the caller.
      cd "${PROJECT_ROOT}" || exit 1
      ./scripts/run_object_batch_bench.sh \
        --tool warp \
        --endpoint "${BENCH_ENDPOINT}" \
        --access-key "${RUSTFS_ACCESS_KEY}" \
        --secret-key "${RUSTFS_SECRET_KEY}" \
        --bucket "${BENCH_BUCKET}" \
        --concurrency "${conc}" \
        --duration "${BENCH_DURATION}" \
        --sizes "${BENCH_SIZES}" \
        --out-dir "${conc_dir}"
    )
  done
}
|
|
|
|
# Exit-time teardown: bring the compose project down unless KEEP_UP is "true".
# Teardown is best-effort; output is discarded and failures are ignored.
cleanup() {
  if [[ "${KEEP_UP}" != "true" ]]; then
    log_info "Stopping compose services"
    compose down --remove-orphans >/dev/null 2>&1 || true
    return
  fi

  log_info "KEEP_UP=true, leaving containers running"
}
|
|
|
|
# Parse command-line options into the global configuration variables.
#
# Options that take a value are checked for a following argument first. Without
# that check, `set -u` would abort on the missing "$2" with a cryptic "unbound
# variable" message. On a missing value or an unknown option the error and the
# usage text are written to stderr and the script exits with status 1.
# --bench-concurrency sets both BENCH_CONCURRENCY and BENCH_CONCURRENCIES.
#
# Arguments: "$@" - the script's command-line arguments
parse_args() {
  while [[ $# -gt 0 ]]; do
    # Guard: every value-taking option needs a second argument.
    case "$1" in
      --cluster-compose | --obs-compose | --project-name | --image-tag | \
        --failover-node | --obs-endpoint | --bench-endpoint | --bench-sizes | \
        --bench-concurrency | --bench-concurrencies | --bench-duration | --out-dir)
        if [[ $# -lt 2 ]]; then
          log_error "option $1 requires a value"
          usage >&2
          exit 1
        fi
        ;;
    esac

    case "$1" in
      --cluster-compose)
        CLUSTER_COMPOSE="$2"
        shift 2
        ;;
      --obs-compose)
        OBS_COMPOSE="$2"
        shift 2
        ;;
      --project-name)
        PROJECT_NAME="$2"
        shift 2
        ;;
      --image-tag)
        IMAGE_TAG="$2"
        shift 2
        ;;
      --with-observability)
        WITH_OBSERVABILITY=true
        shift
        ;;
      --without-observability)
        WITH_OBSERVABILITY=false
        shift
        ;;
      --skip-build)
        BUILD_LOCAL_IMAGE=false
        shift
        ;;
      --skip-failover)
        RUN_FAILOVER=false
        shift
        ;;
      --skip-bench)
        RUN_BENCHMARK=false
        shift
        ;;
      --keep-up)
        KEEP_UP=true
        shift
        ;;
      --failover-node)
        FAILOVER_NODE="$2"
        shift 2
        ;;
      --obs-endpoint)
        RUSTFS_OBS_ENDPOINT="$2"
        shift 2
        ;;
      --bench-endpoint)
        BENCH_ENDPOINT="$2"
        shift 2
        ;;
      --bench-sizes)
        BENCH_SIZES="$2"
        shift 2
        ;;
      --bench-concurrency)
        # A single value also becomes the list, so it is not reported as
        # conflicting with BENCH_CONCURRENCIES later on.
        BENCH_CONCURRENCY="$2"
        BENCH_CONCURRENCIES="$2"
        shift 2
        ;;
      --bench-concurrencies)
        BENCH_CONCURRENCIES="$2"
        shift 2
        ;;
      --bench-duration)
        BENCH_DURATION="$2"
        shift 2
        ;;
      --out-dir)
        OUT_DIR="$2"
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        log_error "unknown argument: $1"
        usage >&2
        exit 1
        ;;
    esac
  done
}
|
|
|
|
# Script entry point: validate configuration, build the image, start the
# 4-node cluster, optionally run failover validation and the benchmark, and
# report where artifacts were written.
#
# Differences from a naive "trap cleanup EXIT INT TERM" at the top:
#   - The teardown trap is installed only after the prechecks and the image
#     build, just before `compose up`. Installed earlier, a precheck failure
#     with PRECHECK_AUTO_CLEANUP=false would still tear down the existing
#     containers the user asked to keep.
#   - INT and TERM are turned into an exit, so cleanup runs exactly once via
#     the EXIT trap and the script does not keep running after Ctrl-C inside
#     a conditional context.
#   - FAILOVER_NODE is validated up front instead of after the build and
#     cluster start.
#   - Artifact paths are logged only for phases that actually ran.
#
# Arguments: "$@" - the script's command-line arguments
main() {
  parse_args "$@"

  resolve_bool "WITH_OBSERVABILITY" "${WITH_OBSERVABILITY}"
  resolve_bool "BUILD_LOCAL_IMAGE" "${BUILD_LOCAL_IMAGE}"
  resolve_bool "RUN_FAILOVER" "${RUN_FAILOVER}"
  resolve_bool "RUN_BENCHMARK" "${RUN_BENCHMARK}"
  resolve_bool "KEEP_UP" "${KEEP_UP}"
  resolve_bool "PRECHECK_AUTO_CLEANUP" "${PRECHECK_AUTO_CLEANUP}"
  resolve_probe_mode
  resolve_bench_wait_mode
  resolve_bench_concurrency

  # Fail fast on a bad node name; node_port exits for anything but node1..node4.
  if [[ "${RUN_FAILOVER}" == "true" ]]; then
    node_port "${FAILOVER_NODE}" >/dev/null
  fi

  require_cmd docker
  require_cmd curl
  require_cmd awk

  if [[ ! -f "${CLUSTER_COMPOSE}" ]]; then
    log_error "cluster compose file not found: ${CLUSTER_COMPOSE}"
    exit 1
  fi
  if [[ "${WITH_OBSERVABILITY}" == "true" && ! -f "${OBS_COMPOSE}" ]]; then
    log_error "observability compose file not found: ${OBS_COMPOSE}"
    exit 1
  fi

  if [[ -z "${RUSTFS_OBS_ENDPOINT}" ]]; then
    resolve_default_obs_endpoint
  fi

  if [[ "${RUSTFS_OBS_ENDPOINT}" == "http://127.0.0.1:4318" ]]; then
    log_warn "RUSTFS_OBS_ENDPOINT is set to container loopback default (${RUSTFS_OBS_ENDPOINT})."
    log_warn "If you need host collector routing, consider: --obs-endpoint http://host.docker.internal:4318"
  fi

  mkdir -p "${OUT_DIR}"

  # Exported into the environment of the docker compose invocations.
  export RUSTFS_IMAGE="${IMAGE_TAG}"
  export RUSTFS_ACCESS_KEY
  export RUSTFS_SECRET_KEY
  export RUSTFS_OBS_ENDPOINT
  export RUSTFS_UNSAFE_BYPASS_DISK_CHECK

  run_precheck_before_build

  if [[ "${BUILD_LOCAL_IMAGE}" == "true" ]]; then
    log_info "Building local image from Dockerfile.source: ${IMAGE_TAG}"
    docker build -f "${PROJECT_ROOT}/Dockerfile.source" -t "${IMAGE_TAG}" "${PROJECT_ROOT}"
  else
    log_info "Skipping image build"
  fi

  run_precheck_after_build

  # From here on the stack may be (partially) up: guarantee a single teardown.
  trap cleanup EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  log_info "Starting compose stack"
  compose up -d

  log_info "Waiting for 4-node cluster readiness (mode=${WAIT_PROBE_MODE})"
  wait_cluster_ready

  if [[ "${RUN_FAILOVER}" == "true" ]]; then
    run_failover_validation
  else
    log_info "Skipping failover validation"
  fi

  if [[ "${RUN_BENCHMARK}" == "true" ]]; then
    run_benchmark
  else
    log_info "Skipping benchmark"
  fi

  log_info "Validation finished"
  log_info "Artifacts directory: ${OUT_DIR}"
  if [[ "${RUN_FAILOVER}" == "true" ]]; then
    log_info "Failover summary: ${OUT_DIR}/failover-summary.txt"
    log_info "Failover probe: ${OUT_DIR}/failover-probe.csv"
  fi
  if [[ "${RUN_BENCHMARK}" == "true" ]]; then
    # NOTE(review): runs write to benchmark/concurrency-<n>; confirm that the
    # batch bench script really produces a summary.csv at this path.
    log_info "Benchmark summary: ${OUT_DIR}/benchmark/summary.csv"
  fi
}
|
|
|
|
# Entry point: hand every command-line argument to main unchanged.
main "$@"
|