rustfs/scripts/table-catalog/engine_compatibility.py

#!/usr/bin/env python3
"""Machine-readable Iceberg engine compatibility helpers for RustFS S3 Tables."""

from __future__ import annotations

import argparse
import json
from collections import OrderedDict
from io import StringIO
from typing import Any

VENDOR_SPARK_PROFILES: dict[str, dict[str, str]] = {
    "rustfs": {
        "catalog_uri": "{endpoint}/iceberg",
        "warehouse": "{warehouse}",
        "rest_signing_name": "s3",
        "s3_endpoint": "{endpoint}",
        "s3_path_style_access": "true",
    },
    "rustfs-compat": {
        "catalog_uri": "{endpoint}/_iceberg",
        "warehouse": "{warehouse}",
        "rest_signing_name": "s3tables",
        "s3_endpoint": "{endpoint}",
        "s3_path_style_access": "true",
    },
    "aws-s3tables": {
        "catalog_uri": "https://s3tables.{region}.amazonaws.com/iceberg",
        "warehouse": "arn:aws:s3tables:{region}:{account_id}:bucket/{table_bucket}",
        "rest_signing_name": "s3tables",
    },
    "minio-aistor": {
        "catalog_uri": "{endpoint}/_iceberg",
        "warehouse": "{warehouse}",
        "rest_signing_name": "s3tables",
        "s3_endpoint": "{endpoint}",
        "s3_path_style_access": "true",
    },
    "cloudflare-r2-data-catalog": {
        "catalog_uri": "{catalog_uri}",
        "warehouse": "{warehouse_name}",
        "rest_signing_name": "s3",
    },
    "oss-tables": {
        "catalog_uri": "{endpoint}/iceberg",
        "warehouse": "{warehouse}",
        "rest_signing_name": "s3",
    },
}


def scenario(name: str, status: str, evidence: str) -> dict[str, str]:
    return {
        "name": name,
        "status": status,
        "evidence": evidence,
    }


def engine_compatibility_matrix() -> list[dict[str, Any]]:
    return [
        {
            "client": "PyIceberg",
            "status": "automated-smoke",
            "entrypoint": "scripts/table-catalog/pyiceberg_smoke.py",
            "scenarios": [
                scenario("create-namespace", "automated", "PyIceberg catalog.create_namespace_if_not_exists/create_namespace"),
                scenario("create-table", "automated", "PyIceberg catalog.create_table"),
                scenario("append", "automated", "PyIceberg table.append with PyArrow rows"),
                scenario("reload-table", "automated", "PyIceberg catalog.load_table after append"),
                scenario("scan", "automated", "PyIceberg table.scan().to_arrow"),
                scenario("drop-table", "automated-with-cleanup", "PyIceberg catalog.drop_table when --cleanup or --replace is set"),
                scenario("commit-conflict", "direct-rest-probe-required", "catalog commit conflict remains a follow-up live probe"),
            ],
        },
        {
            "client": "Spark Iceberg REST catalog",
            "status": "generated-smoke-harness",
            "entrypoint": "scripts/table-catalog/engine_compatibility.py --print-spark-sql",
            "scenarios": [
                scenario("create-namespace", "generated-spark-sql", "CREATE NAMESPACE IF NOT EXISTS"),
                scenario("create-table", "generated-spark-sql", "CREATE TABLE USING iceberg"),
                scenario("append", "generated-spark-sql", "INSERT INTO"),
                scenario("reload-table", "generated-spark-sql", "REFRESH TABLE and SELECT COUNT"),
                scenario("drop-table", "generated-spark-sql", "DROP TABLE and optional DROP NAMESPACE"),
                scenario("commit-conflict", "manual-validation-required", "requires a two-writer Spark or REST conflict harness"),
            ],
        },
        {
            "client": "Trino Iceberg REST catalog",
            "status": "documented-read-path",
            "entrypoint": "scripts/table-catalog/README.md",
            "scenarios": [
                scenario("catalog-load", "manual-validation-required", "REST catalog configuration reference"),
                scenario("read-table", "manual-validation-required", "SELECT from a table created by PyIceberg or Spark"),
                scenario("write-table", "not-claimed", "Trino write compatibility is not claimed by this harness"),
            ],
        },
        {
            "client": "DuckDB Iceberg",
            "status": "documented-read-path",
            "entrypoint": "scripts/table-catalog/README.md",
            "scenarios": [
                scenario("catalog-load", "manual-validation-required", "REST catalog extension/configuration reference"),
                scenario("read-table", "manual-validation-required", "read-path verification only"),
                scenario("write-table", "not-claimed", "DuckDB write/commit compatibility is not claimed"),
            ],
        },
        {
            "client": "StarRocks Iceberg REST catalog",
            "status": "documented-read-path",
            "entrypoint": "scripts/table-catalog/README.md",
            "scenarios": [
                scenario("catalog-load", "manual-validation-required", "REST catalog configuration reference"),
                scenario("read-table", "manual-validation-required", "external catalog read-path verification only"),
                scenario("write-table", "not-claimed", "StarRocks write/commit compatibility is not claimed"),
            ],
        },
        {
            "client": "Snowflake Open Catalog / Iceberg integrations",
            "status": "reference-only",
            "entrypoint": "scripts/table-catalog/README.md",
            "scenarios": [
                scenario("catalog-load", "not-claimed", "reference only until a repeatable external integration harness exists"),
            ],
        },
        {
            "client": "Databend",
            "status": "s3-data-plane-reference",
            "entrypoint": "scripts/table-catalog/README.md",
            "scenarios": [
                scenario("s3-data-plane-read", "manual-validation-required", "S3 stage/data-plane reference only"),
                scenario("iceberg-rest-catalog", "not-claimed", "Databend REST catalog integration is not claimed"),
            ],
        },
    ]


def normalized_endpoint(endpoint: str) -> str:
    return endpoint.rstrip("/")


def normalized_rest_path(rest_path: str) -> str:
    stripped = rest_path.strip()
    if not stripped:
        raise ValueError("REST catalog path cannot be empty")
    if not stripped.startswith("/"):
        stripped = f"/{stripped}"
    return stripped.rstrip("/")


def vendor_profile_context(
    *,
    endpoint: str,
    warehouse: str,
    region: str,
    account_id: str,
    table_bucket: str,
    catalog_uri: str | None,
    warehouse_name: str | None,
) -> dict[str, str]:
    endpoint = normalized_endpoint(endpoint)
    return {
        "account_id": account_id,
        "catalog_uri": (catalog_uri or f"{endpoint}/iceberg").rstrip("/"),
        "endpoint": endpoint,
        "region": region,
        "table_bucket": table_bucket,
        "warehouse": warehouse,
        "warehouse_name": warehouse_name or warehouse,
    }


def vendor_profile_value(profile: str, key: str, context: dict[str, str]) -> str:
    try:
        template = VENDOR_SPARK_PROFILES[profile][key]
    except KeyError as err:
        raise ValueError(f"unknown vendor profile field: {profile}.{key}") from err
    return template.format(**context).rstrip("/")


def spark_catalog_config(
    *,
    endpoint: str,
    warehouse: str,
    access_key: str,
    secret_key: str,
    region: str,
    catalog_name: str,
    rest_path: str,
    rest_signing_name: str,
) -> OrderedDict[str, str]:
    endpoint = normalized_endpoint(endpoint)
    rest_path = normalized_rest_path(rest_path)
    prefix = f"spark.sql.catalog.{catalog_name}"
    return OrderedDict(
        [
            (prefix, "org.apache.iceberg.spark.SparkCatalog"),
            (f"{prefix}.type", "rest"),
            (f"{prefix}.uri", f"{endpoint}{rest_path}"),
            (f"{prefix}.warehouse", warehouse),
            (f"{prefix}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"),
            (f"{prefix}.s3.endpoint", endpoint),
            (f"{prefix}.s3.path-style-access", "true"),
            (f"{prefix}.rest.sigv4-enabled", "true"),
            (f"{prefix}.rest.signing-name", rest_signing_name),
            (f"{prefix}.rest.signing-region", region),
            (f"{prefix}.s3.access-key-id", access_key),
            (f"{prefix}.s3.secret-access-key", secret_key),
        ]
    )


def spark_vendor_catalog_config(
    *,
    profile: str,
    endpoint: str,
    warehouse: str,
    access_key: str,
    secret_key: str,
    region: str,
    catalog_name: str,
    account_id: str,
    table_bucket: str,
    catalog_uri: str | None,
    warehouse_name: str | None,
    rest_path: str | None = None,
    rest_signing_name: str | None = None,
) -> OrderedDict[str, str]:
    context = vendor_profile_context(
        endpoint=endpoint,
        warehouse=warehouse,
        region=region,
        account_id=account_id,
        table_bucket=table_bucket,
        catalog_uri=catalog_uri,
        warehouse_name=warehouse_name,
    )
    profile_defaults = VENDOR_SPARK_PROFILES.get(profile)
    if profile_defaults is None:
        raise ValueError(f"unknown vendor profile: {profile}")
    configured_catalog_uri = vendor_profile_value(profile, "catalog_uri", context)
    if rest_path is not None:
        configured_catalog_uri = f"{normalized_endpoint(endpoint)}{normalized_rest_path(rest_path)}"
    config = spark_catalog_config(
        endpoint=endpoint,
        warehouse=vendor_profile_value(profile, "warehouse", context),
        access_key=access_key,
        secret_key=secret_key,
        region=region,
        catalog_name=catalog_name,
        rest_path="/iceberg",
        rest_signing_name=rest_signing_name or profile_defaults["rest_signing_name"],
    )
    prefix = f"spark.sql.catalog.{catalog_name}"
    config[f"{prefix}.uri"] = configured_catalog_uri
    if "s3_endpoint" in profile_defaults:
        config[f"{prefix}.s3.endpoint"] = vendor_profile_value(profile, "s3_endpoint", context)
    else:
        config.pop(f"{prefix}.s3.endpoint", None)
        config.pop(f"{prefix}.s3.access-key-id", None)
        config.pop(f"{prefix}.s3.secret-access-key", None)
    if "s3_path_style_access" in profile_defaults:
        config[f"{prefix}.s3.path-style-access"] = profile_defaults["s3_path_style_access"]
    else:
        config.pop(f"{prefix}.s3.path-style-access", None)
    return config


def quote_spark_identifier(identifier: str) -> str:
    if not identifier or "`" in identifier or "\n" in identifier or "\r" in identifier:
        raise ValueError("Spark identifier must be non-empty and must not contain backticks or newlines")
    return f"`{identifier}`"


def spark_table_identifier(catalog_name: str, namespace: str, table: str) -> str:
    return ".".join(
        [
            catalog_name,
            quote_spark_identifier(namespace),
            quote_spark_identifier(table),
        ]
    )


def spark_sql_smoke(
    *,
    catalog_name: str,
    namespace: str,
    table: str,
    cleanup: bool = False,
) -> str:
    namespace_identifier = f"{catalog_name}.{quote_spark_identifier(namespace)}"
    table_identifier = spark_table_identifier(catalog_name, namespace, table)
    statements = [
        f"CREATE NAMESPACE IF NOT EXISTS {namespace_identifier};",
        f"DROP TABLE IF EXISTS {table_identifier};",
        f"CREATE TABLE {table_identifier} (id BIGINT, payload STRING) USING iceberg;",
        f"INSERT INTO {table_identifier} VALUES (1, 'alpha'), (2, 'beta');",
        f"REFRESH TABLE {table_identifier};",
        f"SELECT COUNT(*) AS row_count FROM {table_identifier};",
    ]
    if cleanup:
        statements.extend(
            [
                f"DROP TABLE IF EXISTS {table_identifier};",
                f"DROP NAMESPACE IF EXISTS {namespace_identifier};",
            ]
        )
    return "\n".join(statements) + "\n"


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Print RustFS S3 Tables Iceberg engine compatibility helpers.")
    parser.add_argument("--endpoint", default="http://127.0.0.1:9000")
    parser.add_argument("--access-key", default="rustfsadmin")
    parser.add_argument("--secret-key", default="rustfsadmin")
    parser.add_argument("--region", default="us-east-1")
    parser.add_argument("--warehouse", default="rustfs-s3table-smoke")
    parser.add_argument("--profile", choices=sorted(VENDOR_SPARK_PROFILES), default="rustfs")
    parser.add_argument("--account-id", default="000000000000")
    parser.add_argument("--table-bucket", default="rustfs-s3table-smoke")
    parser.add_argument("--catalog-uri")
    parser.add_argument("--warehouse-name")
    parser.add_argument("--namespace", default="smoke")
    parser.add_argument("--table", default="events")
    parser.add_argument("--catalog-name", default="rustfs")
    parser.add_argument("--rest-path")
    parser.add_argument("--rest-signing-name")
    parser.add_argument("--cleanup", action="store_true")
    parser.add_argument("--print-engine-matrix", action="store_true")
    parser.add_argument("--print-spark-config", action="store_true")
    parser.add_argument("--print-spark-sql", action="store_true")
    return parser.parse_args(argv)


def print_json(document: Any, output: StringIO | None = None) -> None:
    text = json.dumps(document, indent=2, sort_keys=True)
    if output is None:
        print(text)
    else:
        output.write(f"{text}\n")


def cli_json(argv: list[str]) -> str:
    output = StringIO()
    run(parse_args(argv), output)
    return output.getvalue()


def run(args: argparse.Namespace, output: StringIO | None = None) -> None:
    printed = False
    if args.print_engine_matrix:
        print_json({"engine_compatibility": engine_compatibility_matrix()}, output)
        printed = True
    if args.print_spark_config:
        print_json(
            {
                "spark_config": spark_vendor_catalog_config(
                    profile=args.profile,
                    endpoint=args.endpoint,
                    warehouse=args.warehouse,
                    access_key=args.access_key,
                    secret_key=args.secret_key,
                    region=args.region,
                    catalog_name=args.catalog_name,
                    account_id=args.account_id,
                    table_bucket=args.table_bucket,
                    catalog_uri=args.catalog_uri,
                    warehouse_name=args.warehouse_name,
                    rest_path=args.rest_path,
                    rest_signing_name=args.rest_signing_name,
                )
            },
            output,
        )
        printed = True
    if args.print_spark_sql:
        sql = spark_sql_smoke(
            catalog_name=args.catalog_name,
            namespace=args.namespace,
            table=args.table,
            cleanup=args.cleanup,
        )
        if output is None:
            print(sql, end="")
        else:
            output.write(sql)
        printed = True
    if not printed:
        print_json({"engine_compatibility": engine_compatibility_matrix()}, output)


def main() -> None:
    run(parse_args())


if __name__ == "__main__":
    main()