Mirror of https://github.com/yunionio/cloudpods.git, synced 2026-05-06 21:52:54 +08:00
feat(llm): init vllm as llm_container (#24338)
* feat(llm): init vllm as llm_container
* feat(vllm): fix save-instant-model for vllm
* feat(llm): support quick-model for vllm
* feat(vllm): support preferred-model in llm
* fix(llm): add ValidateSpec in llm_container_driver
* feat(vllm): auto-set vllm params
@@ -11,6 +11,7 @@ func init() {
 	cmd.BatchCreate(new(options.LLMCreateOptions))
 	cmd.List(new(options.LLMListOptions))
 	cmd.Show(new(options.LLMShowOptions))
+	cmd.Update(new(options.LLMUpdateOptions))
 	cmd.Delete(new(options.LLMDeleteOptions))
 	// cmd.Perform("change-model", new(options.LLMChangeModelOptions))
 	cmd.Perform("syncstatus", new(options.LLMIdOptions))
@@ -19,6 +19,7 @@ const (
 var (
 	LLM_IMAGE_TYPES = sets.NewString(
 		string(LLM_IMAGE_TYPE_OLLAMA),
+		string(LLM_IMAGE_TYPE_VLLM),
 		string(LLM_IMAGE_TYPE_DIFY),
 		string(LLM_IMAGE_TYPE_COMFYUI),
 		string(LLM_IMAGE_TYPE_OPENCLAW),
@@ -82,6 +82,14 @@ type LLMCreateInput struct {
 	LLMSpec *LLMSpec `json:"llm_spec,omitempty"`
 }
 
+// LLMUpdateInput is the request body for updating an LLM (including llm_spec overrides).
+type LLMUpdateInput struct {
+	apis.VirtualResourceBaseUpdateInput
+
+	InstantModelQuotaGb *int     `json:"instant_model_quota_gb,omitempty"`
+	LLMSpec             *LLMSpec `json:"llm_spec,omitempty"`
+}
+
 type LLMBaseListInput struct {
 	apis.VirtualResourceListInput
 	apis.EnabledResourceBaseListInput
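As a sense-check of the wire format, a minimal standalone sketch of an update body follows; the mirror structs and the vllm/preferred_model JSON tags are assumptions inferred from the fields this commit uses elsewhere, not the package's actual definitions.

package main

import (
	"encoding/json"
	"fmt"
)

// Local mirrors of the API types above, for illustration only (hypothetical tags).
type llmSpecVllm struct {
	PreferredModel string `json:"preferred_model,omitempty"`
}

type llmSpec struct {
	Vllm *llmSpecVllm `json:"vllm,omitempty"`
}

type llmUpdateInput struct {
	InstantModelQuotaGb *int     `json:"instant_model_quota_gb,omitempty"`
	LLMSpec             *llmSpec `json:"llm_spec,omitempty"`
}

func main() {
	quota := 50
	body := llmUpdateInput{
		InstantModelQuotaGb: &quota,
		LLMSpec:             &llmSpec{Vllm: &llmSpecVllm{PreferredModel: "Qwen/Qwen2-7B"}},
	}
	b, _ := json.Marshal(body)
	fmt.Println(string(b))
	// {"instant_model_quota_gb":50,"llm_spec":{"vllm":{"preferred_model":"Qwen/Qwen2-7B"}}}
}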
pkg/apis/llm/vllm_const.go (new file, 88 lines)
@@ -0,0 +1,88 @@
package llm

import "time"

const (
	LLM_VLLM              = "vllm"
	LLM_VLLM_DEFAULT_PORT = 8000
	LLM_VLLM_EXEC_PATH    = "python3 -m vllm.entrypoints.openai.api_server"

	LLM_VLLM_HF_ENDPOINT = "https://hf-mirror.com"

	// Directory constants
	LLM_VLLM_CACHE_DIR   = "/root/.cache/huggingface"
	LLM_VLLM_BASE_PATH   = "/data/models"
	LLM_VLLM_MODELS_PATH = "/data/models/huggingface"

	// Health check
	LLM_VLLM_HEALTH_CHECK_TIMEOUT  = 120 * time.Second // 2 minutes
	LLM_VLLM_HEALTH_CHECK_INTERVAL = 10 * time.Second  // 10 seconds

	// Default vLLM memory params when Python estimation fails (conservative to avoid OOM)
	LLM_VLLM_DEFAULT_GPU_MEMORY_UTIL = 0.9
	LLM_VLLM_DEFAULT_MAX_MODEL_LEN   = 2048
	LLM_VLLM_DEFAULT_MAX_NUM_SEQS    = 1

	// Prefixes for parsing the resolveModelAndParams output line (KEY=value)
	LLM_VLLM_RESOLVE_OUTPUT_PREFIX_GPU_UTIL    = "GPU_MEMORY_UTIL="
	LLM_VLLM_RESOLVE_OUTPUT_PREFIX_MAX_LEN     = "MAX_MODEL_LEN="
	LLM_VLLM_RESOLVE_OUTPUT_PREFIX_MAX_NUM_SEQ = "MAX_NUM_SEQS="
)

const (
	// LLM_VLLM_ESTIMATE_PARAMS_SCRIPT is a Python script run inside the container to estimate
	// --gpu-memory-utilization, --max-model-len, and --max-num-seqs from GPU memory and model config.
	// Args: sys.argv[1]=model path, sys.argv[2]=tensor_parallel_size.
	// Prints one line: GPU_MEMORY_UTIL=0.9 MAX_MODEL_LEN=2624 MAX_NUM_SEQS=1 (eval-safe).
	LLM_VLLM_ESTIMATE_PARAMS_SCRIPT = `
import sys, json, os

model_path = sys.argv[1] if len(sys.argv) > 1 else ""
tp = int(sys.argv[2]) if len(sys.argv) > 2 else 1
if not model_path or not os.path.isdir(model_path):
    sys.exit(1)
config_path = os.path.join(model_path, "config.json")
if not os.path.isfile(config_path):
    sys.exit(1)
with open(config_path) as f:
    config = json.load(f)

def get_nested(d, *keys):
    for k in keys:
        d = d.get(k) if isinstance(d, dict) else None
        if d is None:
            return None
    return d

num_layers = config.get("num_hidden_layers") or config.get("n_layer") or get_nested(config, "text_config", "num_hidden_layers") or 0
num_heads = config.get("num_attention_heads") or config.get("n_head") or get_nested(config, "text_config", "num_attention_heads") or 0
num_kv_heads = config.get("num_key_value_heads") or get_nested(config, "text_config", "num_key_value_heads") or num_heads
hidden_size = config.get("hidden_size") or config.get("n_embd") or get_nested(config, "text_config", "hidden_size") or 0
head_dim = hidden_size // num_heads if num_heads else (hidden_size // 64)
max_pos = config.get("max_position_embeddings") or config.get("n_positions") or get_nested(config, "text_config", "max_position_embeddings") or 4096
if not num_layers or not num_kv_heads or not head_dim:
    sys.exit(1)
try:
    import torch
    total_mem = sum(torch.cuda.get_device_properties(i).total_memory for i in range(torch.cuda.device_count()))
except Exception:
    sys.exit(1)
gpu_util = 0.9
activation_overhead = 2 * (1024**3)
num_params = config.get("num_parameters") or config.get("num_params") or get_nested(config, "text_config", "num_parameters")
if num_params is not None:
    model_bytes = num_params * 2
else:
    model_bytes = 12 * (1024**3)
available_kv = total_mem * gpu_util - model_bytes - activation_overhead
if available_kv <= 0:
    available_kv = total_mem * 0.5
kv_per_token = num_layers * 2 * num_kv_heads * head_dim * 2
max_num_seqs = 4
max_model_len = int(available_kv / (kv_per_token * max_num_seqs))
max_model_len = max(1, min(max_model_len, max_pos))
if max_model_len < 256:
    max_num_seqs = 1
    max_model_len = int(available_kv / (kv_per_token * max_num_seqs))
    max_model_len = max(1, min(max_model_len, max_pos))
print("GPU_MEMORY_UTIL=%s MAX_MODEL_LEN=%d MAX_NUM_SEQS=%d" % (gpu_util, max_model_len, max_num_seqs))
`
)
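The estimator boils down to simple arithmetic: an fp16 KV cache costs num_layers * 2 (K and V) * num_kv_heads * head_dim * 2 bytes per token, and whatever GPU memory remains after weights and a fixed activation reserve is split across max_num_seqs sequences. A standalone Go sketch of the same formula with illustrative Llama-7B-like numbers (the real script reads these from config.json and torch.cuda):

package main

import "fmt"

func main() {
	const GiB = 1 << 30
	totalMem := 24 * GiB // hypothetical single 24 GiB GPU
	gpuUtil := 0.9
	modelBytes := 12 * GiB // the fallback when num_parameters is absent
	activationOverhead := 2 * GiB

	numLayers, numKVHeads, headDim := 32, 32, 128
	maxPos := 4096

	// fp16 KV cache: layers * (K+V) * kv_heads * head_dim * 2 bytes/element
	kvPerToken := numLayers * 2 * numKVHeads * headDim * 2 // = 524288 bytes/token

	availableKV := float64(totalMem)*gpuUtil - float64(modelBytes) - float64(activationOverhead)
	maxNumSeqs := 4
	maxModelLen := int(availableKV / float64(kvPerToken*maxNumSeqs))
	if maxModelLen > maxPos {
		maxModelLen = maxPos
	}
	fmt.Printf("GPU_MEMORY_UTIL=%v MAX_MODEL_LEN=%d MAX_NUM_SEQS=%d\n", gpuUtil, maxModelLen, maxNumSeqs)
	// GPU_MEMORY_UTIL=0.9 MAX_MODEL_LEN=3891 MAX_NUM_SEQS=4
}

Since 3891 is above the 256-token floor, the retry branch that drops max_num_seqs to 1 never triggers in this example.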
@@ -41,7 +41,7 @@ func (b *baseDriver) StartLLM(ctx context.Context, userCred mcclient.TokenCreden
 	return nil
 }
 
-func (b *baseDriver) ValidateCreateData(ctx context.Context, userCred mcclient.TokenCredential, input *api.LLMSkuCreateInput) (*api.LLMSkuCreateInput, error) {
+func (b *baseDriver) ValidateLLMSkuCreateData(ctx context.Context, userCred mcclient.TokenCredential, input *api.LLMSkuCreateInput) (*api.LLMSkuCreateInput, error) {
 	imgObj, err := validators.ValidateModel(ctx, userCred, models.GetLLMImageManager(), &input.LLMImageId)
 	if err != nil {
 		return nil, errors.Wrapf(err, "validate image_id %s", input.LLMImageId)
@@ -67,7 +67,7 @@ func (b *baseDriver) ValidateCreateData(ctx context.Context, userCred mcclient.T
 	return input, nil
 }
 
-func (b *baseDriver) ValidateUpdateData(ctx context.Context, userCred mcclient.TokenCredential, sku *models.SLLMSku, input *api.LLMSkuUpdateInput) (*api.LLMSkuUpdateInput, error) {
+func (b *baseDriver) ValidateLLMSkuUpdateData(ctx context.Context, userCred mcclient.TokenCredential, sku *models.SLLMSku, input *api.LLMSkuUpdateInput) (*api.LLMSkuUpdateInput, error) {
 	llmImageId := input.LLMImageId
 	if llmImageId != "" {
 		imgObj, err := validators.ValidateModel(ctx, userCred, models.GetLLMImageManager(), &llmImageId)
@@ -111,5 +111,16 @@ func MatchContainerToUpdateByName(ctr *computeapi.SContainer, podCtrs []*compute
 }
 
+func (b *baseDriver) MatchContainerToUpdate(ctr *computeapi.SContainer, podCtrs []*computeapi.PodContainerCreateInput) (*computeapi.PodContainerCreateInput, error) {
+	if len(podCtrs) == 1 {
+		return podCtrs[0], nil
+	}
+	return MatchContainerToUpdateByName(ctr, podCtrs)
+}
+
+func (b *baseDriver) ValidateLLMCreateSpec(ctx context.Context, userCred mcclient.TokenCredential, sku *models.SLLMSku, input *api.LLMSpec) (*api.LLMSpec, error) {
+	return input, nil
+}
+
+func (b *baseDriver) ValidateLLMUpdateSpec(ctx context.Context, userCred mcclient.TokenCredential, llm *models.SLLM, input *api.LLMSpec) (*api.LLMSpec, error) {
+	return input, nil
+}
@@ -37,6 +37,36 @@ func (d *dify) GetSpec(sku *models.SLLMSku) interface{} {
 	return sku.LLMSpec.Dify
 }
 
+// mergeDifySpecInto merges src into dst. If fillEmpty is true, only fills dst when dst is empty; otherwise overwrites dst when src is non-empty.
+func mergeDifySpecInto(dst, src *api.LLMSpecDify, fillEmpty bool) {
+	if dst == nil || src == nil {
+		return
+	}
+	mergeStr := func(dstPtr *string, srcVal string) {
+		if fillEmpty {
+			if *dstPtr == "" && srcVal != "" {
+				*dstPtr = srcVal
+			}
+		} else {
+			if srcVal != "" {
+				*dstPtr = srcVal
+			}
+		}
+	}
+	mergeStr(&dst.PostgresImageId, src.PostgresImageId)
+	mergeStr(&dst.RedisImageId, src.RedisImageId)
+	mergeStr(&dst.NginxImageId, src.NginxImageId)
+	mergeStr(&dst.DifyApiImageId, src.DifyApiImageId)
+	mergeStr(&dst.DifyPluginImageId, src.DifyPluginImageId)
+	mergeStr(&dst.DifyWebImageId, src.DifyWebImageId)
+	mergeStr(&dst.DifySandboxImageId, src.DifySandboxImageId)
+	mergeStr(&dst.DifySSRFImageId, src.DifySSRFImageId)
+	mergeStr(&dst.DifyWeaviateImageId, src.DifyWeaviateImageId)
+	if (fillEmpty && len(dst.CustomizedEnvs) == 0 && len(src.CustomizedEnvs) > 0) || (!fillEmpty && len(src.CustomizedEnvs) > 0) {
+		dst.CustomizedEnvs = src.CustomizedEnvs
+	}
+}
+
+// mergeDify merges llm and sku Dify specs; llm takes priority, use sku when llm is nil or zero.
+func mergeDify(llm, sku *api.LLMSpecDify) *api.LLMSpecDify {
+	if llm != nil && !llm.IsZero() {
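To make the two merge modes concrete, a minimal standalone sketch with a two-field stand-in for api.LLMSpecDify: the create path calls mergeDifySpecInto with fillEmpty=true so SKU defaults only fill holes, while the update path uses fillEmpty=false so non-empty input overwrites the stored value.

package main

import "fmt"

// Stand-in for api.LLMSpecDify with just two fields, to show the two modes.
type spec struct{ Api, Web string }

func merge(dst, src *spec, fillEmpty bool) {
	m := func(d *string, s string) {
		if fillEmpty {
			if *d == "" && s != "" {
				*d = s
			}
		} else if s != "" {
			*d = s
		}
	}
	m(&dst.Api, src.Api)
	m(&dst.Web, src.Web)
}

func main() {
	// fillEmpty=true: SKU defaults only fill fields the user left empty.
	user := spec{Api: "user-api"}
	merge(&user, &spec{Api: "sku-api", Web: "sku-web"}, true)
	fmt.Println(user) // {user-api sku-web}

	// fillEmpty=false: a non-empty update overwrites the stored value.
	stored := spec{Api: "old-api", Web: "old-web"}
	merge(&stored, &spec{Api: "new-api"}, false)
	fmt.Println(stored) // {new-api old-web}
}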
@@ -88,14 +118,60 @@ func (d *dify) GetPrimaryContainer(ctx context.Context, llm *models.SLLM, contai
 	return nil, errors.Error("api container not found")
 }
 
-func (d *dify) ValidateCreateData(ctx context.Context, userCred mcclient.TokenCredential, input *api.LLMSkuCreateInput) (*api.LLMSkuCreateInput, error) {
+func (d *dify) ValidateLLMSkuCreateData(ctx context.Context, userCred mcclient.TokenCredential, input *api.LLMSkuCreateInput) (*api.LLMSkuCreateInput, error) {
 	if input.LLMSpec == nil || input.LLMSpec.Dify == nil {
 		return nil, errors.Wrap(httperrors.ErrInputParameter, "dify SKU requires llm_spec with type dify and image ids")
 	}
 	if input.MountedModels != nil {
 		return nil, errors.Wrap(httperrors.ErrInputParameter, "dify SKU does not support mounted models")
 	}
-	difySpec := input.LLMSpec.Dify
+
+	// Reuse ValidateLLMCreateSpec to normalize/validate LLMSpec.
+	spec, err := d.ValidateLLMCreateSpec(ctx, userCred, nil, input.LLMSpec)
+	if err != nil {
+		return nil, err
+	}
+	input.LLMSpec = spec
+	if input.LLMSpec != nil && input.LLMSpec.Dify != nil {
+		input.LLMImageId = input.LLMSpec.Dify.DifyApiImageId
+	}
+	return input, nil
+}
+
+func (d *dify) ValidateLLMSkuUpdateData(ctx context.Context, userCred mcclient.TokenCredential, sku *models.SLLMSku, input *api.LLMSkuUpdateInput) (*api.LLMSkuUpdateInput, error) {
+	if input.MountedModels != nil {
+		return nil, errors.Wrap(httperrors.ErrInputParameter, "dify SKU does not support mounted models")
+	}
+	if input.LLMSpec == nil {
+		return input, nil
+	}
+
+	// Reuse ValidateLLMUpdateSpec by treating the current SKU spec as the "current llm spec".
+	fakeLLM := &models.SLLM{LLMSpec: sku.LLMSpec}
+	spec, err := d.ValidateLLMUpdateSpec(ctx, userCred, fakeLLM, input.LLMSpec)
+	if err != nil {
+		return nil, err
+	}
+	input.LLMSpec = spec
+
+	// if dify_api_image_id is set, use it as the primary image id
+	if input.LLMSpec != nil && input.LLMSpec.Dify != nil && input.LLMSpec.Dify.DifyApiImageId != "" {
+		input.LLMImageId = input.LLMSpec.Dify.DifyApiImageId
+	}
+	return input, nil
+}
+
+// ValidateLLMCreateSpec implements ILLMContainerDriver. Validates image ids and merges empty fields from SKU spec.
+func (d *dify) ValidateLLMCreateSpec(ctx context.Context, userCred mcclient.TokenCredential, sku *models.SLLMSku, input *api.LLMSpec) (*api.LLMSpec, error) {
+	if input == nil || input.Dify == nil {
+		return input, nil
+	}
+	difySpec := input.Dify
+	// Merge empty fields from SKU so the stored spec is complete
+	if sku != nil && sku.LLMSpec != nil && sku.LLMSpec.Dify != nil {
+		mergeDifySpecInto(difySpec, sku.LLMSpec.Dify, true)
+	}
+	// Validate non-empty image ids
+	for _, imgId := range []*string{&difySpec.PostgresImageId, &difySpec.RedisImageId, &difySpec.NginxImageId, &difySpec.DifyApiImageId, &difySpec.DifyPluginImageId, &difySpec.DifyWebImageId, &difySpec.DifySandboxImageId, &difySpec.DifySSRFImageId, &difySpec.DifyWeaviateImageId} {
+		if *imgId == "" {
+			continue
@@ -105,63 +181,68 @@ func (d *dify) ValidateCreateData(ctx context.Context, userCred mcclient.TokenCr
 			return nil, errors.Wrapf(err, "validate image_id %s", *imgId)
 		}
 		img := imgObj.(*models.SLLMImage)
-		if img.LLMType != input.LLMType {
-			return nil, errors.Wrapf(httperrors.ErrInvalidStatus, "image %s is not of type %s", *imgId, input.LLMType)
+		if img.LLMType != string(api.LLM_CONTAINER_DIFY) {
+			return nil, errors.Wrapf(httperrors.ErrInvalidStatus, "image %s is not of type dify", *imgId)
 		}
 		*imgId = img.Id
 	}
-	input.LLMImageId = difySpec.DifyApiImageId
-	return input, nil
+	return &api.LLMSpec{Dify: difySpec}, nil
 }
 
-func (d *dify) ValidateUpdateData(ctx context.Context, userCred mcclient.TokenCredential, sku *models.SLLMSku, input *api.LLMSkuUpdateInput) (*api.LLMSkuUpdateInput, error) {
-	if input.MountedModels != nil {
-		return nil, errors.Wrap(httperrors.ErrInputParameter, "dify SKU does not support mounted models")
+// ValidateLLMUpdateSpec implements ILLMContainerDriver. Merges input with the current LLM spec (non-empty overwrites); validates image ids.
+func (d *dify) ValidateLLMUpdateSpec(ctx context.Context, userCred mcclient.TokenCredential, llm *models.SLLM, input *api.LLMSpec) (*api.LLMSpec, error) {
+	if input == nil || input.Dify == nil {
+		return input, nil
 	}
-	if input.LLMSpec == nil || input.LLMSpec.Dify == nil {
-		return nil, nil
-	}
-	currentSpec := d.GetSpec(sku)
-	if currentSpec == nil {
-		return nil, nil
-	}
-	updated := *currentSpec.(*api.LLMSpecDify)
-	difySpec := input.LLMSpec.Dify
-	mergeStr := func(dst *string, src string) {
-		if src != "" {
-			*dst = src
+	// Start from the current LLM spec, or the SKU spec as base
+	var base *api.LLMSpecDify
+	if llm != nil && llm.LLMSpec != nil && llm.LLMSpec.Dify != nil {
+		b := *llm.LLMSpec.Dify
+		base = &b
+		if llm.LLMSpec.Dify.CustomizedEnvs != nil {
+			base.CustomizedEnvs = make([]*api.DifyCustomizedEnv, len(llm.LLMSpec.Dify.CustomizedEnvs))
+			copy(base.CustomizedEnvs, llm.LLMSpec.Dify.CustomizedEnvs)
+		}
+	} else if llm != nil {
+		sku, err := llm.GetLLMSku(llm.LLMSkuId)
+		if err == nil && sku != nil && sku.LLMSpec != nil && sku.LLMSpec.Dify != nil {
+			b := *sku.LLMSpec.Dify
+			base = &b
+			if sku.LLMSpec.Dify.CustomizedEnvs != nil {
+				base.CustomizedEnvs = make([]*api.DifyCustomizedEnv, len(sku.LLMSpec.Dify.CustomizedEnvs))
+				copy(base.CustomizedEnvs, sku.LLMSpec.Dify.CustomizedEnvs)
+			}
+		}
 	}
-	mergeStr(&updated.PostgresImageId, difySpec.PostgresImageId)
-	mergeStr(&updated.RedisImageId, difySpec.RedisImageId)
-	mergeStr(&updated.NginxImageId, difySpec.NginxImageId)
-	mergeStr(&updated.DifyApiImageId, difySpec.DifyApiImageId)
-	mergeStr(&updated.DifyPluginImageId, difySpec.DifyPluginImageId)
-	mergeStr(&updated.DifyWebImageId, difySpec.DifyWebImageId)
-	mergeStr(&updated.DifySandboxImageId, difySpec.DifySandboxImageId)
-	mergeStr(&updated.DifySSRFImageId, difySpec.DifySSRFImageId)
-	mergeStr(&updated.DifyWeaviateImageId, difySpec.DifyWeaviateImageId)
-	if len(difySpec.CustomizedEnvs) > 0 {
-		updated.CustomizedEnvs = difySpec.CustomizedEnvs
+	if base == nil {
+		base = &api.LLMSpecDify{}
 	}
-	for _, imgId := range []*string{&updated.PostgresImageId, &updated.RedisImageId, &updated.NginxImageId, &updated.DifyApiImageId, &updated.DifyPluginImageId, &updated.DifyWebImageId, &updated.DifySandboxImageId, &updated.DifySSRFImageId, &updated.DifyWeaviateImageId} {
-		if *imgId != "" {
-			imgObj, err := validators.ValidateModel(ctx, userCred, models.GetLLMImageManager(), imgId)
-			if err != nil {
-				return nil, errors.Wrapf(err, "validate image_id %s", *imgId)
-			}
-			img := imgObj.(*models.SLLMImage)
-			if img.LLMType != sku.LLMType {
-				return nil, errors.Wrapf(httperrors.ErrInvalidStatus, "image %s is not of type %s", *imgId, sku.LLMType)
-			}
-			*imgId = img.GetId()
+	mergeDifySpecInto(base, input.Dify, false)
+	// Validate non-empty image ids
+	for _, imgId := range []*string{&base.PostgresImageId, &base.RedisImageId, &base.NginxImageId, &base.DifyApiImageId, &base.DifyPluginImageId, &base.DifyWebImageId, &base.DifySandboxImageId, &base.DifySSRFImageId, &base.DifyWeaviateImageId} {
+		if *imgId == "" {
+			continue
 		}
+		imgObj, err := validators.ValidateModel(ctx, userCred, models.GetLLMImageManager(), imgId)
+		if err != nil {
+			return nil, errors.Wrapf(err, "validate image_id %s", *imgId)
+		}
+		img := imgObj.(*models.SLLMImage)
+		if img.LLMType != string(api.LLM_CONTAINER_DIFY) {
+			return nil, errors.Wrapf(httperrors.ErrInvalidStatus, "image %s is not of type dify", *imgId)
+		}
+		*imgId = img.Id
 	}
-	// if dify_api_image_id is set, use it as the primary image id
-	if input.LLMSpec.Dify.DifyApiImageId != "" {
-		input.LLMImageId = input.LLMSpec.Dify.DifyApiImageId
+	return &api.LLMSpec{Dify: base}, nil
+}
+
+// GetContainerSpec is required by ILLMContainerDriver but not used for Dify; pod creation uses GetContainerSpecs. Return the first container so the interface is satisfied.
+func (d *dify) GetContainerSpec(ctx context.Context, llm *models.SLLM, image *models.SLLMImage, sku *models.SLLMSku, props []string, devices []computeapi.SIsolatedDevice, diskId string) *computeapi.PodContainerCreateInput {
+	specs := d.GetContainerSpecs(ctx, llm, image, sku, props, devices, diskId)
+	if len(specs) == 0 {
+		return nil
 	}
-	return input, nil
+	return specs[0]
 }
 
 // GetContainerSpecs returns all Dify pod containers (postgres, redis, api, worker, nginx, etc.). Uses the effective spec (llm + sku merged by driver).
pkg/llm/drivers/llm_container/vllm.go (new file, 578 lines)
@@ -0,0 +1,578 @@
package llm_container

import (
	"context"
	"fmt"
	"net/http"
	"path"
	"strconv"
	"strings"
	"time"

	"yunion.io/x/log"
	"yunion.io/x/pkg/errors"

	commonapi "yunion.io/x/onecloud/pkg/apis"
	computeapi "yunion.io/x/onecloud/pkg/apis/compute"
	api "yunion.io/x/onecloud/pkg/apis/llm"
	"yunion.io/x/onecloud/pkg/llm/models"
	"yunion.io/x/onecloud/pkg/mcclient"
)

func init() {
	models.RegisterLLMContainerDriver(newVLLM())
}

type vllm struct {
	baseDriver
}

func newVLLM() models.ILLMContainerDriver {
	return &vllm{baseDriver: newBaseDriver(api.LLM_CONTAINER_VLLM)}
}

// escapeShellSingleQuoted escapes s for use inside a single-quoted shell string (each ' becomes '\'').
func escapeShellSingleQuoted(s string) string {
	return strings.ReplaceAll(s, "'", "'\\''")
}
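A quick illustration of why this escape works: a single-quoted shell string cannot contain a quote, so the replacement closes the string, emits a backslash-escaped quote, and reopens the string. A tiny standalone sketch:

package main

import (
	"fmt"
	"strings"
)

func escapeShellSingleQuoted(s string) string {
	return strings.ReplaceAll(s, "'", `'\''`)
}

func main() {
	name := "it's/a model" // hypothetical directory name containing a quote
	fmt.Printf("ls -d '%s'\n", escapeShellSingleQuoted(name))
	// ls -d 'it'\''s/a model'
	// The quoting closes at it', emits \', then reopens 's/a model'.
}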
func (v *vllm) GetSpec(sku *models.SLLMSku) interface{} {
	if sku == nil || sku.LLMType != string(api.LLM_CONTAINER_VLLM) || sku.LLMSpec == nil || sku.LLMSpec.Vllm == nil {
		return nil
	}
	return sku.LLMSpec.Vllm
}

func (v *vllm) GetEffectiveSpec(llm *models.SLLM, sku *models.SLLMSku) interface{} {
	var skuSpec *api.LLMSpecVllm
	if s := v.GetSpec(sku); s != nil {
		skuSpec = s.(*api.LLMSpecVllm)
	}
	if llm != nil && llm.LLMSpec != nil && llm.LLMSpec.Vllm != nil {
		if llm.LLMSpec.Vllm.PreferredModel != "" {
			out := *llm.LLMSpec.Vllm
			return &out
		}
		// llm spec explicitly present but empty -> fall back to the sku default
	}
	if skuSpec != nil {
		out := *skuSpec
		return &out
	}
	return nil
}

func (v *vllm) ValidateLLMSkuCreateData(ctx context.Context, userCred mcclient.TokenCredential, input *api.LLMSkuCreateInput) (*api.LLMSkuCreateInput, error) {
	input, err := v.baseDriver.ValidateLLMSkuCreateData(ctx, userCred, input)
	if err != nil {
		return nil, err
	}

	// Reuse ValidateLLMCreateSpec; ensure llm_spec.vllm always exists for a vLLM SKU.
	spec, err := v.ValidateLLMCreateSpec(ctx, userCred, nil, input.LLMSpec)
	if err != nil {
		return nil, err
	}
	if spec == nil {
		spec = &api.LLMSpec{Vllm: &api.LLMSpecVllm{}}
	} else if spec.Vllm == nil {
		spec.Vllm = &api.LLMSpecVllm{}
	}
	input.LLMSpec = spec
	return input, nil
}

func (v *vllm) ValidateLLMSkuUpdateData(ctx context.Context, userCred mcclient.TokenCredential, sku *models.SLLMSku, input *api.LLMSkuUpdateInput) (*api.LLMSkuUpdateInput, error) {
	input, err := v.baseDriver.ValidateLLMSkuUpdateData(ctx, userCred, sku, input)
	if err != nil {
		return nil, err
	}
	if input.LLMSpec == nil {
		return input, nil
	}

	// Reuse ValidateLLMUpdateSpec by treating the current SKU spec as the "current llm spec".
	fakeLLM := &models.SLLM{LLMSpec: sku.LLMSpec}
	spec, err := v.ValidateLLMUpdateSpec(ctx, userCred, fakeLLM, input.LLMSpec)
	if err != nil {
		return nil, err
	}
	input.LLMSpec = spec
	if input.LLMSpec != nil && input.LLMSpec.Vllm == nil {
		input.LLMSpec.Vllm = &api.LLMSpecVllm{}
	}
	return input, nil
}

// ValidateLLMCreateSpec implements ILLMContainerDriver. Merges preferred_model from the SKU when the input's is empty.
func (v *vllm) ValidateLLMCreateSpec(ctx context.Context, userCred mcclient.TokenCredential, sku *models.SLLMSku, input *api.LLMSpec) (*api.LLMSpec, error) {
	if input == nil {
		return nil, nil
	}
	preferred := ""
	if input.Vllm != nil {
		preferred = input.Vllm.PreferredModel
	}
	if preferred == "" && sku != nil && sku.LLMSpec != nil && sku.LLMSpec.Vllm != nil {
		preferred = sku.LLMSpec.Vllm.PreferredModel
	}
	return &api.LLMSpec{Vllm: &api.LLMSpecVllm{PreferredModel: preferred}}, nil
}

// ValidateLLMUpdateSpec implements ILLMContainerDriver. Merges preferred_model with the current LLM spec; only overwrites when non-empty.
func (v *vllm) ValidateLLMUpdateSpec(ctx context.Context, userCred mcclient.TokenCredential, llm *models.SLLM, input *api.LLMSpec) (*api.LLMSpec, error) {
	if input == nil || input.Vllm == nil {
		return input, nil
	}
	current := ""
	if llm != nil && llm.LLMSpec != nil && llm.LLMSpec.Vllm != nil {
		current = llm.LLMSpec.Vllm.PreferredModel
	}
	preferred := input.Vllm.PreferredModel
	if preferred == "" {
		preferred = current
	}
	return &api.LLMSpec{Vllm: &api.LLMSpecVllm{PreferredModel: preferred}}, nil
}
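The precedence these functions implement is simple: an LLM-level preferred_model wins when non-empty, otherwise the SKU default applies. A stripped-down sketch of that resolution (local stand-in type, not the real api structs):

package main

import "fmt"

type vllmSpec struct{ PreferredModel string }

// effective mirrors GetEffectiveSpec's fallback: llm spec wins only when its
// preferred_model is non-empty; otherwise the SKU default is used.
func effective(llm, sku *vllmSpec) *vllmSpec {
	if llm != nil && llm.PreferredModel != "" {
		out := *llm
		return &out
	}
	if sku != nil {
		out := *sku
		return &out
	}
	return nil
}

func main() {
	sku := &vllmSpec{PreferredModel: "Qwen/Qwen2-7B"} // hypothetical SKU default
	fmt.Println(effective(&vllmSpec{}, sku).PreferredModel)
	// Qwen/Qwen2-7B
	fmt.Println(effective(&vllmSpec{PreferredModel: "facebook/opt-125m"}, sku).PreferredModel)
	// facebook/opt-125m
}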
func (v *vllm) GetContainerSpec(ctx context.Context, llm *models.SLLM, image *models.SLLMImage, sku *models.SLLMSku, props []string, devices []computeapi.SIsolatedDevice, diskId string) *computeapi.PodContainerCreateInput {
	// The container entrypoint only keeps the container alive; vLLM itself is started by StartLLM via exec.
	startScript := `mkdir -p ` + api.LLM_VLLM_MODELS_PATH + ` && exec sleep infinity`
	envs := []*commonapi.ContainerKeyValue{
		{
			Key:   "HUGGING_FACE_HUB_CACHE",
			Value: api.LLM_VLLM_CACHE_DIR,
		},
		{
			Key:   "HF_ENDPOINT",
			Value: api.LLM_VLLM_HF_ENDPOINT,
		},
		// // Fix Error 803
		// {
		// 	Key:   "LD_LIBRARY_PATH",
		// 	Value: "/lib64:/usr/local/cuda/lib64:/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}",
		// },
		// // Fix Error 803
		// {
		// 	Key:   "LD_PRELOAD",
		// 	Value: "/lib/libcuda.so.1 /lib/libnvidia-ptxjitcompiler.so.1 /lib/libnvidia-gpucomp.so",
		// },
	}
	spec := computeapi.ContainerSpec{
		ContainerSpec: commonapi.ContainerSpec{
			Image:             image.ToContainerImage(),
			ImageCredentialId: image.CredentialId,
			Command:           []string{"/bin/sh", "-c"},
			Args:              []string{startScript},
			EnableLxcfs:       true,
			AlwaysRestart:     true,
			Envs:              envs,
		},
	}

	// GPU devices: use explicitly passed devices when given; otherwise reserve by SKU device count.
	if len(devices) == 0 && (sku.Devices != nil && len(*sku.Devices) > 0) {
		for i := range *sku.Devices {
			index := i
			spec.Devices = append(spec.Devices, &computeapi.ContainerDevice{
				Type: commonapi.CONTAINER_DEVICE_TYPE_ISOLATED_DEVICE,
				IsolatedDevice: &computeapi.ContainerIsolatedDevice{
					Index: &index,
				},
			})
		}
	} else if len(devices) > 0 {
		for i := range devices {
			spec.Devices = append(spec.Devices, &computeapi.ContainerDevice{
				Type: commonapi.CONTAINER_DEVICE_TYPE_ISOLATED_DEVICE,
				IsolatedDevice: &computeapi.ContainerIsolatedDevice{
					Id: devices[i].Id,
				},
			})
		}
	}

	// Volume mounts
	diskIndex := 0
	postOverlays, err := llm.GetMountedModelsPostOverlay()
	if err != nil {
		log.Errorf("GetMountedModelsPostOverlay failed %s", err)
	}
	ctrVols := []*commonapi.ContainerVolumeMount{
		{
			Disk: &commonapi.ContainerVolumeMountDisk{
				SubDirectory: api.LLM_VLLM,
				Index:        &diskIndex,
				PostOverlay:  postOverlays,
			},
			Type:        commonapi.CONTAINER_VOLUME_MOUNT_TYPE_DISK,
			MountPath:   api.LLM_VLLM_BASE_PATH,
			ReadOnly:    false,
			Propagation: commonapi.MOUNTPROPAGATION_PROPAGATION_HOST_TO_CONTAINER,
		},
		{
			// Mount the cache dir to persist the HF cache
			Disk: &commonapi.ContainerVolumeMountDisk{
				SubDirectory: "cache",
				Index:        &diskIndex,
			},
			Type:      commonapi.CONTAINER_VOLUME_MOUNT_TYPE_DISK,
			MountPath: "/root/.cache",
			ReadOnly:  false,
		},
	}
	spec.VolumeMounts = append(spec.VolumeMounts, ctrVols...)

	return &computeapi.PodContainerCreateInput{
		ContainerSpec: spec,
	}
}
func (v *vllm) GetContainerSpecs(ctx context.Context, llm *models.SLLM, image *models.SLLMImage, sku *models.SLLMSku, props []string, devices []computeapi.SIsolatedDevice, diskId string) []*computeapi.PodContainerCreateInput {
	return []*computeapi.PodContainerCreateInput{
		v.GetContainerSpec(ctx, llm, image, sku, props, devices, diskId),
	}
}

func (v *vllm) GetLLMUrl(ctx context.Context, userCred mcclient.TokenCredential, llm *models.SLLM) (string, error) {
	// Similar logic to Ollama to determine the URL
	server, err := llm.GetServer(ctx)
	if err != nil {
		return "", errors.Wrap(err, "get server")
	}

	networkType := llm.NetworkType
	if networkType == string(computeapi.NETWORK_TYPE_GUEST) {
		if len(llm.LLMIp) == 0 {
			return "", errors.Error("LLM IP is empty for guest network")
		}
		return fmt.Sprintf("http://%s:%d", llm.LLMIp, api.LLM_VLLM_DEFAULT_PORT), nil
	} else {
		// hostlocal
		if len(server.HostAccessIp) == 0 {
			return "", errors.Error("host access IP is empty")
		}
		// Return the default port on the host access IP, assuming direct access
		// or that per-instance port-mapping logic exists elsewhere.
		// NOTE: ollama.go queries AccessInfo here; this path is simplified.
		return fmt.Sprintf("http://%s:%d", server.HostAccessIp, api.LLM_VLLM_DEFAULT_PORT), nil
	}
}

// StartLLM starts the vLLM server inside the container via exec, then waits for the health endpoint to be ready.
func (v *vllm) StartLLM(ctx context.Context, userCred mcclient.TokenCredential, llm *models.SLLM) error {
	lc, err := llm.GetLLMContainer()
	if err != nil {
		return errors.Wrap(err, "get llm container")
	}
	sku, err := llm.GetLLMSku(llm.LLMSkuId)
	if err != nil {
		return errors.Wrap(err, "get llm sku")
	}
	tensorParallelSize := 1
	if sku.Devices != nil && len(*sku.Devices) > 0 {
		tensorParallelSize = len(*sku.Devices)
	}
	// Half of the SKU memory (MiB) expressed in GiB, with a 1 GiB floor.
	swapSpaceGiB := (sku.Memory * 1) / (2 * 1024)
	if swapSpaceGiB < 1 {
		swapSpaceGiB = 1
	}

	preferredPath := ""
	if eff := v.GetEffectiveSpec(llm, sku); eff != nil {
		if preferred := eff.(*api.LLMSpecVllm).PreferredModel; preferred != "" {
			preferredPath = path.Join(api.LLM_VLLM_MODELS_PATH, preferred)
		}
	}
	resolved, err := v.resolveModelAndParams(ctx, lc.CmpId, preferredPath, tensorParallelSize)
	if err != nil {
		return err
	}
	if resolved == nil {
		return nil // no model
	}

	modelEscaped := escapeShellSingleQuoted(resolved.ModelPath)
	startCmd := fmt.Sprintf(
		`nohup %s --model '%s' --served-model-name "$(basename '%s')" --port %d \
--tensor-parallel-size %d --swap-space %d --enable-prefix-caching \
--gpu-memory-utilization %s --max-model-len %d --max-num-seqs %d \
> /tmp/vllm.log 2>&1 &`,
		api.LLM_VLLM_EXEC_PATH,
		modelEscaped,
		modelEscaped,
		api.LLM_VLLM_DEFAULT_PORT,
		tensorParallelSize,
		swapSpaceGiB,
		resolved.GpuUtil,
		resolved.MaxModelLen,
		resolved.MaxNumSeqs,
	)
	_, err = exec(ctx, lc.CmpId, startCmd, 30)
	if err != nil {
		log.Errorf("vLLM start failed, exec command: %s", startCmd)
		return errors.Wrapf(err, "exec start vLLM, command: %s", startCmd)
	}
	cmd := startCmd

	// Wait for the health endpoint
	baseURL, err := v.GetLLMUrl(ctx, userCred, llm)
	if err != nil {
		return errors.Wrap(err, "get llm url for health check")
	}
	healthURL := strings.TrimSuffix(baseURL, "/") + "/health"
	deadline := time.Now().Add(api.LLM_VLLM_HEALTH_CHECK_TIMEOUT)
	for time.Now().Before(deadline) {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, healthURL, nil)
		if err != nil {
			return errors.Wrap(err, "new health check request")
		}
		resp, err := http.DefaultClient.Do(req)
		if err == nil {
			resp.Body.Close()
			if resp.StatusCode == http.StatusOK {
				return nil
			}
		}
		select {
		case <-ctx.Done():
			return errors.Wrap(ctx.Err(), "context done while waiting for vLLM")
		case <-time.After(api.LLM_VLLM_HEALTH_CHECK_INTERVAL):
			// continue polling
		}
	}
	// Optionally read the last lines of /tmp/vllm.log for a better error message
	logTail, _ := exec(ctx, lc.CmpId, "tail -n 20 /tmp/vllm.log 2>/dev/null || true", 5)
	if logTail != "" {
		return errors.Errorf("vLLM health check timeout after %v, exec command: %s, last log: %s", api.LLM_VLLM_HEALTH_CHECK_TIMEOUT, cmd, strings.TrimSpace(logTail))
	}
	return errors.Errorf("vLLM health check timeout after %v, exec command: %s", api.LLM_VLLM_HEALTH_CHECK_TIMEOUT, cmd)
}
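Rendered with illustrative values (a hypothetical Qwen/Qwen2-7B directory, two GPUs, 8 GiB swap, and estimator output 0.9/3891/4), the exec line looks as below; the sketch simply re-runs the same fmt.Sprintf with sample numbers:

package main

import "fmt"

func main() {
	execPath := "python3 -m vllm.entrypoints.openai.api_server"
	model := "/data/models/huggingface/Qwen/Qwen2-7B" // hypothetical model dir
	startCmd := fmt.Sprintf(
		`nohup %s --model '%s' --served-model-name "$(basename '%s')" --port %d \
--tensor-parallel-size %d --swap-space %d --enable-prefix-caching \
--gpu-memory-utilization %s --max-model-len %d --max-num-seqs %d \
> /tmp/vllm.log 2>&1 &`,
		execPath, model, model, 8000, 2, 8, "0.9", 3891, 4)
	fmt.Println(startCmd)
}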
// ILLMContainerInstantApp implementation

func (v *vllm) GetProbedInstantModelsExt(ctx context.Context, userCred mcclient.TokenCredential, llm *models.SLLM, mdlIds ...string) (map[string]api.LLMInternalInstantMdlInfo, error) {
	lc, err := llm.GetLLMContainer()
	if err != nil {
		return nil, errors.Wrap(err, "get llm container")
	}

	// List directories in the models path with their sizes
	cmd := fmt.Sprintf("du -sk %s/*/", api.LLM_VLLM_MODELS_PATH)
	output, err := exec(ctx, lc.CmpId, cmd, 10)
	if err != nil {
		// If du fails (e.g. no model directory yet), return empty
		return make(map[string]api.LLMInternalInstantMdlInfo), nil
	}

	modelsMap := make(map[string]api.LLMInternalInstantMdlInfo)
	lines := strings.Split(strings.TrimSpace(output), "\n")
	for _, line := range lines {
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}
		// Size is in KB
		sizeKB, _ := strconv.ParseInt(fields[0], 10, 64)
		fullPath := fields[1]
		name := path.Base(fullPath)
		if name == "" {
			continue
		}
		// Treat the directory name as the model name. For vLLM the name usually
		// implies "organization/model" when downloaded from HF, but here we just list local dirs.
		modelsMap[name] = api.LLMInternalInstantMdlInfo{
			Name:    name,
			ModelId: name,
			Tag:     "latest",
			Size:    sizeKB * 1024,
		}
	}
	return modelsMap, nil
}
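To show what the probe consumes, a standalone sketch of the du -sk parsing with two made-up output lines; note that du over */ lists top-level directories, so a namespaced model such as Qwen/Qwen2-7B surfaces under its organization directory:

package main

import (
	"fmt"
	"path"
	"strconv"
	"strings"
)

func main() {
	// Hypothetical `du -sk /data/models/huggingface/*/` output (KB, then path).
	output := "14680064\t/data/models/huggingface/Qwen/\n" +
		"1024\t/data/models/huggingface/facebook/"
	for _, line := range strings.Split(strings.TrimSpace(output), "\n") {
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}
		sizeKB, _ := strconv.ParseInt(fields[0], 10, 64)
		name := path.Base(fields[1]) // path.Base drops the trailing slash
		fmt.Printf("model=%s size=%d bytes\n", name, sizeKB*1024)
	}
	// model=Qwen size=15032385536 bytes
	// model=facebook size=1048576 bytes
}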
func (v *vllm) DetectModelPaths(ctx context.Context, userCred mcclient.TokenCredential, llm *models.SLLM, pkgInfo api.LLMInternalInstantMdlInfo) ([]string, error) {
	lc, err := llm.GetLLMContainer()
	if err != nil {
		return nil, errors.Wrap(err, "get llm container")
	}

	modelPath := path.Join(api.LLM_VLLM_MODELS_PATH, pkgInfo.Name)
	checkCmd := fmt.Sprintf("[ -d '%s' ] && echo 'EXIST' || echo 'MISSING'", modelPath)
	output, err := exec(ctx, lc.CmpId, checkCmd, 10)
	if err != nil {
		return nil, errors.Wrap(err, "failed to check file existence")
	}

	if !strings.Contains(output, "EXIST") {
		return nil, errors.Errorf("model directory %s missing", modelPath)
	}

	return []string{modelPath}, nil
}

func (v *vllm) GetImageInternalPathMounts(sApp *models.SInstantModel) map[string]string {
	// Map host paths to container paths. For vLLM's single volume mount,
	// this is derived from the base path.
	res := make(map[string]string)
	for _, mount := range sApp.Mounts {
		relPath := strings.TrimPrefix(mount, api.LLM_VLLM_BASE_PATH)
		res[relPath] = path.Join(api.LLM_VLLM, relPath)
	}
	return res
}

func (v *vllm) GetSaveDirectories(sApp *models.SInstantModel) (string, []string, error) {
	var filteredMounts []string
	for _, mount := range sApp.Mounts {
		if strings.HasPrefix(mount, api.LLM_VLLM_BASE_PATH) {
			relPath := strings.TrimPrefix(mount, api.LLM_VLLM_BASE_PATH)
			filteredMounts = append(filteredMounts, relPath)
		}
	}
	return "", filteredMounts, nil
}

func (v *vllm) ValidateMounts(mounts []string, mdlName string, mdlTag string) ([]string, error) {
	return mounts, nil
}

func (v *vllm) CheckDuplicateMounts(errStr string, dupIndex int) string {
	return "Duplicate mounts detected"
}

func (v *vllm) GetInstantModelIdByPostOverlay(postOverlay *commonapi.ContainerVolumeMountDiskPostOverlay, mdlNameToId map[string]string) string {
	return ""
}

func (v *vllm) GetDirPostOverlay(dir api.LLMMountDirInfo) *commonapi.ContainerVolumeMountDiskPostOverlay {
	uid := int64(1000)
	gid := int64(1000)
	ov := dir.ToOverlay()
	ov.FsUser = &uid
	ov.FsGroup = &gid
	return &ov
}

func (v *vllm) PreInstallModel(ctx context.Context, userCred mcclient.TokenCredential, llm *models.SLLM, instMdl *models.SLLMInstantModel) error {
	lc, err := llm.GetLLMContainer()
	if err != nil {
		return errors.Wrap(err, "get llm container")
	}
	// Create the base directory
	cmd := fmt.Sprintf("mkdir -p %s", api.LLM_VLLM_MODELS_PATH)
	_, err = exec(ctx, lc.CmpId, cmd, 10)
	return err
}

func (v *vllm) InstallModel(ctx context.Context, userCred mcclient.TokenCredential, llm *models.SLLM, dirs []string, mdlIds []string) error {
	return nil
}

func (v *vllm) UninstallModel(ctx context.Context, userCred mcclient.TokenCredential, llm *models.SLLM, instMdl *models.SLLMInstantModel) error {
	// Optionally remove the model directory; for safety, leave it in place.
	return nil
}

func (v *vllm) DownloadModel(ctx context.Context, userCred mcclient.TokenCredential, llm *models.SLLM, tmpDir string, modelName string, modelTag string) (string, []string, error) {
	lc, err := llm.GetLLMContainer()
	if err != nil {
		return "", nil, errors.Wrap(err, "get llm container")
	}

	// Download the model inside the container; modelName is expected to look like "facebook/opt-125m".
	targetDir := path.Join(api.LLM_VLLM_MODELS_PATH, modelName)

	// Skip the download if the directory already exists
	checkCmd := fmt.Sprintf("[ -d '%s' ] && echo 'EXIST'", targetDir)
	out, _ := exec(ctx, lc.CmpId, checkCmd, 10)
	if strings.Contains(out, "EXIST") {
		log.Infof("Model %s already exists at %s", modelName, targetDir)
		return modelName, []string{targetDir}, nil
	}

	// Use huggingface-cli, assuming the container has internet access; install it via pip if missing.
	downloadCmd := fmt.Sprintf("mkdir -p %s && huggingface-cli download %s --local-dir %s --local-dir-use-symlinks False", targetDir, modelName, targetDir)
	fullCmd := fmt.Sprintf("if ! command -v huggingface-cli &> /dev/null; then pip install -U huggingface_hub; fi; %s", downloadCmd)

	log.Infof("Downloading model %s with cmd: %s", modelName, fullCmd)
	_, err = exec(ctx, lc.CmpId, fullCmd, 3600) // 1 hour timeout for large models
	if err != nil {
		return "", nil, errors.Wrapf(err, "failed to download model %s", modelName)
	}

	return modelName, []string{targetDir}, nil
}
// vllmResolveResult is the result of resolving the model path and estimating vLLM memory params in the container.
type vllmResolveResult struct {
	ModelPath   string
	GpuUtil     string
	MaxModelLen int
	MaxNumSeqs  int
}

// resolveModelAndParams runs one exec in the container to resolve the model path and estimate
// --gpu-memory-utilization, --max-model-len, --max-num-seqs. Returns (nil, nil) when no model is found.
func (v *vllm) resolveModelAndParams(ctx context.Context, containerId string, preferredPath string, tensorParallelSize int) (*vllmResolveResult, error) {
	preferredEscaped := escapeShellSingleQuoted(preferredPath)
	escapedScript := escapeShellSingleQuoted(strings.TrimSpace(api.LLM_VLLM_ESTIMATE_PARAMS_SCRIPT))
	defaultGpuUtil := strconv.FormatFloat(float64(api.LLM_VLLM_DEFAULT_GPU_MEMORY_UTIL), 'f', -1, 64)
	cmd := fmt.Sprintf(
		`mkdir -p %s;
preferred='%s';
if [ -n "$preferred" ] && [ -d "$preferred" ]; then model="$preferred"; else model=$(ls -d %s/* 2>/dev/null | head -n 1); fi;
if [ -z "$model" ]; then echo "NO_MODEL"; exit 0; fi;
tp=%d;
vllm_out=$(python3 -c '%s' "$model" "$tp" 2>/dev/null) || true;
GPU_MEMORY_UTIL=%s; MAX_MODEL_LEN=%d; MAX_NUM_SEQS=%d;
[ -n "$vllm_out" ] && eval "$vllm_out";
printf '%%s\n' "$model";
printf 'GPU_MEMORY_UTIL=%%s MAX_MODEL_LEN=%%s MAX_NUM_SEQS=%%s\n' "$GPU_MEMORY_UTIL" "$MAX_MODEL_LEN" "$MAX_NUM_SEQS"`,
		api.LLM_VLLM_MODELS_PATH,
		preferredEscaped,
		api.LLM_VLLM_MODELS_PATH,
		tensorParallelSize,
		escapedScript,
		defaultGpuUtil,
		api.LLM_VLLM_DEFAULT_MAX_MODEL_LEN,
		api.LLM_VLLM_DEFAULT_MAX_NUM_SEQS,
	)
	out, err := exec(ctx, containerId, cmd, 30)
	if err != nil {
		return nil, errors.Wrapf(err, "exec resolve model and params")
	}
	out = strings.TrimSpace(out)
	if out == "NO_MODEL" {
		return nil, nil
	}
	lines := strings.SplitN(out, "\n", 2)
	if len(lines) < 2 {
		return nil, errors.Errorf("vLLM resolve output missing params line: %s", out)
	}
	res := &vllmResolveResult{
		ModelPath:   strings.TrimSpace(lines[0]),
		GpuUtil:     defaultGpuUtil,
		MaxModelLen: api.LLM_VLLM_DEFAULT_MAX_MODEL_LEN,
		MaxNumSeqs:  api.LLM_VLLM_DEFAULT_MAX_NUM_SEQS,
	}
	for _, f := range strings.Fields(lines[1]) {
		if val, ok := strings.CutPrefix(f, api.LLM_VLLM_RESOLVE_OUTPUT_PREFIX_GPU_UTIL); ok {
			res.GpuUtil = val
		} else if val, ok := strings.CutPrefix(f, api.LLM_VLLM_RESOLVE_OUTPUT_PREFIX_MAX_LEN); ok {
			if n, e := strconv.Atoi(val); e == nil && n > 0 {
				res.MaxModelLen = n
			}
		} else if val, ok := strings.CutPrefix(f, api.LLM_VLLM_RESOLVE_OUTPUT_PREFIX_MAX_NUM_SEQ); ok {
			if n, e := strconv.Atoi(val); e == nil && n > 0 {
				res.MaxNumSeqs = n
			}
		}
	}
	return res, nil
}
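The exec contract is two lines: the resolved model path, then a KEY=value params line. A standalone sketch of the parsing with a sample payload (prefix strings inlined from the constants above):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

func main() {
	// Hypothetical container output: model path, then the params line.
	out := "/data/models/huggingface/Qwen/Qwen2-7B\n" +
		"GPU_MEMORY_UTIL=0.9 MAX_MODEL_LEN=2624 MAX_NUM_SEQS=1"
	lines := strings.SplitN(strings.TrimSpace(out), "\n", 2)
	modelPath := strings.TrimSpace(lines[0])
	gpuUtil, maxLen, maxSeqs := "0.9", 2048, 1 // conservative defaults
	for _, f := range strings.Fields(lines[1]) {
		if v, ok := strings.CutPrefix(f, "GPU_MEMORY_UTIL="); ok {
			gpuUtil = v
		} else if v, ok := strings.CutPrefix(f, "MAX_MODEL_LEN="); ok {
			if n, err := strconv.Atoi(v); err == nil && n > 0 {
				maxLen = n
			}
		} else if v, ok := strings.CutPrefix(f, "MAX_NUM_SEQS="); ok {
			if n, err := strconv.Atoi(v); err == nil && n > 0 {
				maxSeqs = n
			}
		}
	}
	fmt.Println(modelPath, gpuUtil, maxLen, maxSeqs)
	// /data/models/huggingface/Qwen/Qwen2-7B 0.9 2624 1
}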
@@ -92,6 +92,15 @@ func (man *SLLMManager) ValidateCreateData(ctx context.Context, userCred mcclien
 	input.LLMSkuId = lSku.Id
 	input.LLMImageId = lSku.GetLLMImageId()
 
+	if input.LLMSpec != nil {
+		drv := lSku.GetLLMContainerDriver()
+		spec, err := drv.ValidateLLMCreateSpec(ctx, userCred, lSku, input.LLMSpec)
+		if err != nil {
+			return input, errors.Wrap(err, "validate LLM create spec")
+		}
+		input.LLMSpec = spec
+	}
+
 	return input, nil
 }
@@ -380,6 +389,29 @@ func (llm *SLLM) GetLLMSku(skuId string) (*SLLMSku, error) {
 	return sku.(*SLLMSku), nil
 }
 
+func (llm *SLLM) ValidateUpdateData(ctx context.Context, userCred mcclient.TokenCredential, query jsonutils.JSONObject, input api.LLMUpdateInput) (api.LLMUpdateInput, error) {
+	var err error
+	input.VirtualResourceBaseUpdateInput, err = llm.SLLMBase.SVirtualResourceBase.ValidateUpdateData(ctx, userCred, query, input.VirtualResourceBaseUpdateInput)
+	if err != nil {
+		return input, errors.Wrap(err, "validate VirtualResourceBaseUpdateInput")
+	}
+
+	if input.LLMSpec == nil {
+		return input, nil
+	}
+	sku, err := llm.GetLLMSku(llm.LLMSkuId)
+	if err != nil {
+		return input, errors.Wrap(err, "fetch LLMSku")
+	}
+	drv := sku.GetLLMContainerDriver()
+	spec, err := drv.ValidateLLMUpdateSpec(ctx, userCred, llm, input.LLMSpec)
+	if err != nil {
+		return input, errors.Wrap(err, "validate LLM update spec")
+	}
+	input.LLMSpec = spec
+	return input, nil
+}
+
 func (llm *SLLM) GetLargeLanguageModelName(name string) (modelName string, modelTag string, err error) {
 	if name == "" {
 		return "", "", errors.Wrap(errors.ErrInvalidStatus, "model name is empty")
@@ -43,7 +43,7 @@ func GetLLMBasePodCreateInput(
 	}
 
 	data.VcpuCount = skuBase.Cpu
-	data.VmemSize = skuBase.Memory + 1
+	data.VmemSize = skuBase.Memory
 	// data.Name = input.Name + "-" + seclib.RandomPassword(6)
 	data.Name = input.Name
@@ -93,13 +93,16 @@ type ILLMContainerDriver interface {
 	GetEffectiveSpec(llm *SLLM, sku *SLLMSku) interface{}
 	// GetPrimaryImageId returns the primary image id for this SKU type (e.g. LLMImageId for ollama/vllm, DifyApiImageId for dify).
 	GetPrimaryImageId(sku *SLLMSku) string
-	// ValidateCreateData validates create input and returns the LLMSpec to store. Called by SKU manager after base validation.
-	ValidateCreateData(ctx context.Context, userCred mcclient.TokenCredential, input *llm.LLMSkuCreateInput) (*llm.LLMSkuCreateInput, error)
-	// ValidateUpdateData validates update input, merges with current spec, and returns the LLMSpec to store. Called by SKU when LLMSpec is not nil.
-	ValidateUpdateData(ctx context.Context, userCred mcclient.TokenCredential, sku *SLLMSku, input *llm.LLMSkuUpdateInput) (*llm.LLMSkuUpdateInput, error)
+	// ValidateLLMSkuCreateData validates create input and returns the LLMSpec to store. Called by SKU manager after base validation.
+	ValidateLLMSkuCreateData(ctx context.Context, userCred mcclient.TokenCredential, input *llm.LLMSkuCreateInput) (*llm.LLMSkuCreateInput, error)
+	// ValidateLLMSkuUpdateData validates update input, merges with current spec, and returns the LLMSpec to store. Called by SKU when LLMSpec is not nil.
+	ValidateLLMSkuUpdateData(ctx context.Context, userCred mcclient.TokenCredential, sku *SLLMSku, input *llm.LLMSkuUpdateInput) (*llm.LLMSkuUpdateInput, error)
+
+	MatchContainerToUpdate(ctr *computeapi.SContainer, podCtrs []*computeapi.PodContainerCreateInput) (*computeapi.PodContainerCreateInput, error)
+
+	ValidateLLMCreateSpec(ctx context.Context, userCred mcclient.TokenCredential, sku *SLLMSku, input *llm.LLMSpec) (*llm.LLMSpec, error)
+	ValidateLLMUpdateSpec(ctx context.Context, userCred mcclient.TokenCredential, llm *SLLM, input *llm.LLMSpec) (*llm.LLMSpec, error)
 
 	ILLMContainerMCPAgent
 }
@@ -820,13 +820,31 @@ func (llm *SLLM) EnsureInstantModelsInstalled(ctx context.Context, userCred mccl
	if err != nil {
		return errors.Wrap(err, "FetchApps")
	}
	if len(mdlIds) == 0 {
		return nil
	}
	instModels := make(map[string]SInstantModel)
	if err := db.FetchModelObjectsByIds(GetInstantModelManager(), "id", mdlIds, &instModels); err != nil {
		return errors.Wrap(err, "FetchModelObjectsByIds")
	}
	var errs []error
	for _, mdlId := range mdlIds {
		if _, ok := mdlMap[mdlId]; ok {
			// probed
			// errs = append(errs, errors.Wrap(errors.ErrInvalidStatus, pkg))
		} else {
			// not mounted and not probed
			continue
		}
		instModel, ok := instModels[mdlId]
		if !ok {
			errs = append(errs, errors.Wrapf(errors.ErrNotFound, "mdlId %s", mdlId))
			continue
		}
		found := false
		for _, info := range mdlMap {
			if info.ModelId == instModel.ModelId {
				found = true
				break
			}
		}
		if !found {
			errs = append(errs, errors.Wrapf(errors.ErrNotFound, "mdlId %s", mdlId))
		}
	}
@@ -201,7 +201,7 @@ func (man *SLLMSkuManager) ValidateCreateData(ctx context.Context, userCred mccl
 	if err != nil {
 		return input, errors.Wrap(err, "get container driver")
 	}
-	input, err = drv.ValidateCreateData(ctx, userCred, input)
+	input, err = drv.ValidateLLMSkuCreateData(ctx, userCred, input)
 	if err != nil {
 		return input, errors.Wrap(err, "validate create input")
 	}
@@ -238,7 +238,7 @@ func (sku *SLLMSku) ValidateUpdateData(ctx context.Context, userCred mcclient.To
 		return input, nil
 	}
 	drv := sku.GetLLMContainerDriver()
-	updateInput, err := drv.ValidateUpdateData(ctx, userCred, sku, &input)
+	updateInput, err := drv.ValidateLLMSkuUpdateData(ctx, userCred, sku, &input)
 	if err != nil {
 		return input, errors.Wrap(err, "validate update spec")
 	}
@@ -94,6 +94,7 @@ func (task *LLMCreateTask) OnLLMRefreshStatusComplete(ctx context.Context, llm *
	mountedModels, err := llm.FetchMountedModelFullName()
	if err != nil {
		task.taskFailed(ctx, llm, errors.Wrap(err, "FetchMountedModelFullName"))
		return
	}

	// Create the disk
@@ -59,5 +59,10 @@ func (t *LLMStartTask) OnStartedFailed(ctx context.Context, llm *models.SLLM, er
}

func (t *LLMStartTask) OnStarted(ctx context.Context, llm *models.SLLM, reason jsonutils.JSONObject) {
	err := llm.GetLLMContainerDriver().StartLLM(ctx, t.GetUserCred(), llm)
	if err != nil {
		t.taskFailed(ctx, llm, err.Error())
		return
	}
	t.taskComplete(ctx, llm)
}
@@ -18,7 +18,7 @@ func (o *LLMImageShowOptions) Params() (jsonutils.JSONObject, error) {
 type LLMImageListOptions struct {
 	options.BaseListOptions
 
-	LLMType string `json:"llm_type" choices:"ollama|dify|comfyui" help:"filter by llm type"`
+	LLMType string `json:"llm_type" choices:"ollama|dify|comfyui|vllm" help:"filter by llm type"`
 }
 
 func (o *LLMImageListOptions) Params() (jsonutils.JSONObject, error) {
@@ -30,7 +30,7 @@ type LLMImageCreateOptions struct {
 	IMAGE_NAME   string `json:"image_name"`
 	IMAGE_LABEL  string `json:"image_label"`
 	CredentialId string `json:"credential_id"`
-	LLM_TYPE     string `json:"llm_type" choices:"ollama|dify|comfyui" help:"llm type: ollama, comfyui or dify"`
+	LLM_TYPE     string `json:"llm_type" choices:"ollama|dify|comfyui|vllm" help:"llm type: ollama, comfyui or dify"`
 }
 
 func (o *LLMImageCreateOptions) Params() (jsonutils.JSONObject, error) {
@@ -44,7 +44,7 @@ type LLMImageUpdateOptions struct {
 	ImageName    string `json:"image_name"`
 	ImageLabel   string `json:"image_label"`
 	CredentialId string `json:"credential_id"`
-	LlmType      string `json:"llm_type" choices:"ollama|dify|vllm|comfyui" help:"llm type: ollama, comfyui or dify"`
+	LlmType      string `json:"llm_type" choices:"ollama|dify|vllm|comfyui" help:"llm type: ollama, comfyui, vllm or dify"`
 }
 
 func (o *LLMImageUpdateOptions) GetId() string {
@@ -68,11 +68,12 @@ type LLMBaseCreateOptions struct {
 type LLMCreateOptions struct {
 	LLMBaseCreateOptions
 
-	LLM_SKU_ID string `help:"llm sku id or name" json:"llm_sku_id"`
+	LLM_SKU_ID     string `help:"llm sku id or name" json:"llm_sku_id"`
+	PreferredModel string `help:"vLLM preferred model dir name under models path (e.g. Qwen/Qwen2-7B)" json:"-"`
 }
 
 func (o *LLMCreateOptions) Params() (jsonutils.JSONObject, error) {
-	params := jsonutils.Marshal(o)
+	params := jsonutils.Marshal(o).(*jsonutils.JSONDict)
 
 	if len(o.Net) > 0 {
 		nets := make([]*computeapi.NetworkConfig, 0)
@@ -83,7 +84,16 @@ func (o *LLMCreateOptions) Params() (jsonutils.JSONObject, error) {
 			}
 			nets = append(nets, net)
 		}
-		params.(*jsonutils.JSONDict).Add(jsonutils.Marshal(nets), "nets")
+		params.Add(jsonutils.Marshal(nets), "nets")
 	}
 
+	if o.PreferredModel != "" {
+		spec := &api.LLMSpec{
+			Ollama: nil,
+			Vllm:   &api.LLMSpecVllm{PreferredModel: o.PreferredModel},
+			Dify:   nil,
+		}
+		params.Set("llm_spec", jsonutils.Marshal(spec))
+	}
+
 	return params, nil
@@ -93,6 +103,32 @@ func (o *LLMCreateOptions) GetCountParam() int {
 	return o.Count
 }
 
+type LLMUpdateOptions struct {
+	options.BaseIdOptions
+
+	PreferredModel string `help:"vLLM preferred model dir name under models path (e.g. Qwen/Qwen2-7B); takes effect after pod recreate" json:"-"`
+}
+
+func (o *LLMUpdateOptions) GetId() string {
+	return o.ID
+}
+
+func (o *LLMUpdateOptions) Params() (jsonutils.JSONObject, error) {
+	dict, err := options.StructToParams(o)
+	if err != nil {
+		return nil, err
+	}
+	if o.PreferredModel != "" {
+		spec := &api.LLMSpec{
+			Ollama: nil,
+			Vllm:   &api.LLMSpecVllm{PreferredModel: o.PreferredModel},
+			Dify:   nil,
+		}
+		dict.Set("llm_spec", jsonutils.Marshal(spec))
+	}
+	return dict, nil
+}
+
 type LLMDeleteOptions struct {
 	options.BaseIdOptions
 }
@@ -8,7 +8,7 @@ set -euo pipefail
 
 if [ $# -ne 1 ]; then
 	echo "Usage: $0 <targetRegistry>"
-	echo "Example: $0 crpi-nf3abu98o8qf9y2x.cn-beijing.personal.cr.aliyuncs.com/eikoh"
+	echo "Example: $0 registry.cn-beijing.aliyuncs.com/cloudpods"
 	exit 1
 fi