diff --git a/internal/registry/model_definitions.go b/internal/registry/model_definitions.go
index d47e50a5..5ee9faae 100644
--- a/internal/registry/model_definitions.go
+++ b/internal/registry/model_definitions.go
@@ -765,21 +765,23 @@ func GetIFlowModels() []*ModelInfo {
 type AntigravityModelConfig struct {
 	Thinking            *ThinkingSupport
 	MaxCompletionTokens int
-	Name                string
 }
 
 // GetAntigravityModelConfig returns static configuration for antigravity models.
 // Keys use upstream model names returned by the Antigravity models endpoint.
 func GetAntigravityModelConfig() map[string]*AntigravityModelConfig {
 	return map[string]*AntigravityModelConfig{
-		"gemini-2.5-flash":           {Thinking: &ThinkingSupport{Min: 0, Max: 24576, ZeroAllowed: true, DynamicAllowed: true}, Name: "models/gemini-2.5-flash"},
-		"gemini-2.5-flash-lite":      {Thinking: &ThinkingSupport{Min: 0, Max: 24576, ZeroAllowed: true, DynamicAllowed: true}, Name: "models/gemini-2.5-flash-lite"},
-		"rev19-uic3-1p":              {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true}, Name: "models/rev19-uic3-1p"},
-		"gemini-3-pro-high":          {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"low", "high"}}, Name: "models/gemini-3-pro-high"},
-		"gemini-3-pro-image":         {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"low", "high"}}, Name: "models/gemini-3-pro-image"},
-		"gemini-3-flash":             {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"minimal", "low", "medium", "high"}}, Name: "models/gemini-3-flash"},
+		"gemini-2.5-flash":           {Thinking: &ThinkingSupport{Min: 0, Max: 24576, ZeroAllowed: true, DynamicAllowed: true}},
+		"gemini-2.5-flash-lite":      {Thinking: &ThinkingSupport{Min: 0, Max: 24576, ZeroAllowed: true, DynamicAllowed: true}},
+		"rev19-uic3-1p":              {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true}},
+		"gemini-3-pro-high":          {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"low", "high"}}},
+		"gemini-3-pro-image":         {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"low", "high"}}},
+		"gemini-3-flash":             {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"minimal", "low", "medium", "high"}}},
 		"claude-sonnet-4-5-thinking": {Thinking: &ThinkingSupport{Min: 1024, Max: 128000, ZeroAllowed: true, DynamicAllowed: true}, MaxCompletionTokens: 64000},
 		"claude-opus-4-5-thinking":   {Thinking: &ThinkingSupport{Min: 1024, Max: 128000, ZeroAllowed: true, DynamicAllowed: true}, MaxCompletionTokens: 64000},
+		"claude-sonnet-4-5":          {MaxCompletionTokens: 64000},
+		"gpt-oss-120b-medium":        {},
+		"tab_flash_lite_preview":     {},
 	}
 }
 
@@ -809,10 +811,9 @@ func LookupStaticModelInfo(modelID string) *ModelInfo {
 	}
 
 	// Check Antigravity static config
-	if cfg := GetAntigravityModelConfig()[modelID]; cfg != nil && cfg.Thinking != nil {
+	if cfg := GetAntigravityModelConfig()[modelID]; cfg != nil {
 		return &ModelInfo{
 			ID:                  modelID,
-			Name:                cfg.Name,
 			Thinking:            cfg.Thinking,
 			MaxCompletionTokens: cfg.MaxCompletionTokens,
 		}
diff --git a/internal/runtime/executor/antigravity_executor.go b/internal/runtime/executor/antigravity_executor.go
index 99392188..df26e376 100644
--- a/internal/runtime/executor/antigravity_executor.go
+++ b/internal/runtime/executor/antigravity_executor.go
@@ -1005,9 +1005,6 @@ func FetchAntigravityModels(ctx context.Context, auth *cliproxyauth.Auth, cfg *c
 	}
 	modelCfg := modelConfig[modelID]
 	modelName := modelID
-	if modelCfg != nil && modelCfg.Name != "" {
-		modelName = modelCfg.Name
-	}
 	modelInfo := &registry.ModelInfo{
 		ID:   modelID,
 		Name: modelName,
@@ -1410,13 +1407,6 @@ func geminiToAntigravity(modelName string, payload []byte, projectID string) []b
 	template, _ = sjson.Delete(template, "request.safetySettings")
 	template, _ = sjson.Set(template, "request.toolConfig.functionCallingConfig.mode", "VALIDATED")
 
-	if !strings.HasPrefix(modelName, "gemini-3-") {
-		if thinkingLevel := gjson.Get(template, "request.generationConfig.thinkingConfig.thinkingLevel"); thinkingLevel.Exists() {
-			template, _ = sjson.Delete(template, "request.generationConfig.thinkingConfig.thinkingLevel")
-			template, _ = sjson.Set(template, "request.generationConfig.thinkingConfig.thinkingBudget", -1)
-		}
-	}
-
 	if strings.Contains(modelName, "claude") {
 		gjson.Get(template, "request.tools").ForEach(func(key, tool gjson.Result) bool {
 			tool.Get("functionDeclarations").ForEach(func(funKey, funcDecl gjson.Result) bool {
diff --git a/internal/runtime/executor/claude_executor.go b/internal/runtime/executor/claude_executor.go
index 17c5a143..b6d5418a 100644
--- a/internal/runtime/executor/claude_executor.go
+++ b/internal/runtime/executor/claude_executor.go
@@ -17,7 +17,6 @@ import (
 	claudeauth "github.com/router-for-me/CLIProxyAPI/v6/internal/auth/claude"
 	"github.com/router-for-me/CLIProxyAPI/v6/internal/config"
 	"github.com/router-for-me/CLIProxyAPI/v6/internal/misc"
-	"github.com/router-for-me/CLIProxyAPI/v6/internal/registry"
 	"github.com/router-for-me/CLIProxyAPI/v6/internal/thinking"
 	"github.com/router-for-me/CLIProxyAPI/v6/internal/util"
 	cliproxyauth "github.com/router-for-me/CLIProxyAPI/v6/sdk/cliproxy/auth"
@@ -119,9 +118,6 @@ func (e *ClaudeExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, r
 	// Disable thinking if tool_choice forces tool use (Anthropic API constraint)
 	body = disableThinkingIfToolChoiceForced(body)
 
-	// Ensure max_tokens > thinking.budget_tokens when thinking is enabled
-	body = ensureMaxTokensForThinking(baseModel, body)
-
 	// Extract betas from body and convert to header
 	var extraBetas []string
 	extraBetas, body = extractAndRemoveBetas(body)
@@ -250,9 +246,6 @@ func (e *ClaudeExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A
 	// Disable thinking if tool_choice forces tool use (Anthropic API constraint)
 	body = disableThinkingIfToolChoiceForced(body)
 
-	// Ensure max_tokens > thinking.budget_tokens when thinking is enabled
-	body = ensureMaxTokensForThinking(baseModel, body)
-
 	// Extract betas from body and convert to header
 	var extraBetas []string
 	extraBetas, body = extractAndRemoveBetas(body)
@@ -541,81 +534,6 @@ func disableThinkingIfToolChoiceForced(body []byte) []byte {
 	return body
 }
 
-// ensureMaxTokensForThinking ensures max_tokens > thinking.budget_tokens when thinking is enabled.
-// Anthropic API requires this constraint; violating it returns a 400 error.
-// This function should be called after all thinking configuration is finalized.
-// It looks up the model's MaxCompletionTokens from the registry to use as the cap.
-func ensureMaxTokensForThinking(modelName string, body []byte) []byte {
-	thinkingType := gjson.GetBytes(body, "thinking.type").String()
-	if thinkingType != "enabled" {
-		return body
-	}
-
-	budgetTokens := gjson.GetBytes(body, "thinking.budget_tokens").Int()
-	if budgetTokens <= 0 {
-		return body
-	}
-
-	maxTokens := gjson.GetBytes(body, "max_tokens").Int()
-
-	// Look up the model's max completion tokens from the registry
-	maxCompletionTokens := 0
-	if modelInfo := registry.LookupModelInfo(modelName); modelInfo != nil {
-		maxCompletionTokens = modelInfo.MaxCompletionTokens
-	}
-
-	// Fall back to budget + buffer if registry lookup fails or returns 0
-	const fallbackBuffer = 4000
-	requiredMaxTokens := budgetTokens + fallbackBuffer
-	if maxCompletionTokens > 0 {
-		requiredMaxTokens = int64(maxCompletionTokens)
-	}
-
-	if maxTokens < requiredMaxTokens {
-		body, _ = sjson.SetBytes(body, "max_tokens", requiredMaxTokens)
-	}
-	return body
-}
-
-func (e *ClaudeExecutor) resolveClaudeConfig(auth *cliproxyauth.Auth) *config.ClaudeKey {
-	if auth == nil || e.cfg == nil {
-		return nil
-	}
-	var attrKey, attrBase string
-	if auth.Attributes != nil {
-		attrKey = strings.TrimSpace(auth.Attributes["api_key"])
-		attrBase = strings.TrimSpace(auth.Attributes["base_url"])
-	}
-	for i := range e.cfg.ClaudeKey {
-		entry := &e.cfg.ClaudeKey[i]
-		cfgKey := strings.TrimSpace(entry.APIKey)
-		cfgBase := strings.TrimSpace(entry.BaseURL)
-		if attrKey != "" && attrBase != "" {
-			if strings.EqualFold(cfgKey, attrKey) && strings.EqualFold(cfgBase, attrBase) {
-				return entry
-			}
-			continue
-		}
-		if attrKey != "" && strings.EqualFold(cfgKey, attrKey) {
-			if cfgBase == "" || strings.EqualFold(cfgBase, attrBase) {
-				return entry
-			}
-		}
-		if attrKey == "" && attrBase != "" && strings.EqualFold(cfgBase, attrBase) {
-			return entry
-		}
-	}
-	if attrKey != "" {
-		for i := range e.cfg.ClaudeKey {
-			entry := &e.cfg.ClaudeKey[i]
-			if strings.EqualFold(strings.TrimSpace(entry.APIKey), attrKey) {
-				return entry
-			}
-		}
-	}
-	return nil
-}
-
 type compositeReadCloser struct {
 	io.Reader
 	closers []func() error
diff --git a/internal/thinking/provider/claude/apply.go b/internal/thinking/provider/claude/apply.go
index b7833072..3c74d514 100644
--- a/internal/thinking/provider/claude/apply.go
+++ b/internal/thinking/provider/claude/apply.go
@@ -80,9 +80,66 @@ func (a *Applier) Apply(body []byte, config thinking.ThinkingConfig, modelInfo *
 
 	result, _ := sjson.SetBytes(body, "thinking.type", "enabled")
 	result, _ = sjson.SetBytes(result, "thinking.budget_tokens", config.Budget)
+
+	// Ensure max_tokens > thinking.budget_tokens (Anthropic API constraint)
+	result = a.normalizeClaudeBudget(result, config.Budget, modelInfo)
 	return result, nil
 }
 
+// normalizeClaudeBudget applies Claude-specific constraints to ensure max_tokens > budget_tokens.
+// Anthropic API requires this constraint; violating it returns a 400 error.
+func (a *Applier) normalizeClaudeBudget(body []byte, budgetTokens int, modelInfo *registry.ModelInfo) []byte {
+	if budgetTokens <= 0 {
+		return body
+	}
+
+	// Ensure the request satisfies Claude constraints:
+	//  1) Determine the effective max_tokens (a request-provided value overrides the model default).
+	//  2) If max_tokens came from the model default, write it back into the request.
+	//  3) If budget_tokens >= max_tokens, reduce budget_tokens to max_tokens-1.
+	//  4) If the adjusted budget would fall below the model minimum, keep the original budget.
+
+	effectiveMax, setDefaultMax := a.effectiveMaxTokens(body, modelInfo)
+	if setDefaultMax && effectiveMax > 0 {
+		body, _ = sjson.SetBytes(body, "max_tokens", effectiveMax)
+	}
+
+	// Compute the budget we would apply after enforcing budget_tokens < max_tokens.
+	adjustedBudget := budgetTokens
+	if effectiveMax > 0 && adjustedBudget >= effectiveMax {
+		adjustedBudget = effectiveMax - 1
+	}
+
+	minBudget := 0
+	if modelInfo != nil && modelInfo.Thinking != nil {
+		minBudget = modelInfo.Thinking.Min
+	}
+	if minBudget > 0 && adjustedBudget > 0 && adjustedBudget < minBudget {
+		// Enforcing the max_tokens constraint would push the budget below the
+		// model minimum; keep the original budget rather than clamping.
+		return body
+	}
+
+	if adjustedBudget != budgetTokens {
+		body, _ = sjson.SetBytes(body, "thinking.budget_tokens", adjustedBudget)
+	}
+
+	return body
+}
+
+// effectiveMaxTokens returns the max_tokens value used to cap thinking:
+// prefer a request-provided max_tokens; otherwise fall back to the model default.
+// The boolean reports whether the value came from the model default (and thus should be written back).
+func (a *Applier) effectiveMaxTokens(body []byte, modelInfo *registry.ModelInfo) (max int, fromModel bool) {
+	if maxTok := gjson.GetBytes(body, "max_tokens"); maxTok.Exists() && maxTok.Int() > 0 {
+		return int(maxTok.Int()), false
+	}
+	if modelInfo != nil && modelInfo.MaxCompletionTokens > 0 {
+		return modelInfo.MaxCompletionTokens, true
+	}
+	return 0, false
+}
+
 func applyCompatibleClaude(body []byte, config thinking.ThinkingConfig) ([]byte, error) {
 	if config.Mode != thinking.ModeBudget && config.Mode != thinking.ModeNone && config.Mode != thinking.ModeAuto {
 		return body, nil
diff --git a/internal/translator/codex/claude/codex_claude_response.go b/internal/translator/codex/claude/codex_claude_response.go
index c700ef84..5223cd94 100644
--- a/internal/translator/codex/claude/codex_claude_response.go
+++ b/internal/translator/codex/claude/codex_claude_response.go
@@ -117,8 +117,12 @@ func ConvertCodexResponseToClaude(_ context.Context, _ string, originalRequestRa
 	} else {
 		template, _ = sjson.Set(template, "delta.stop_reason", "end_turn")
 	}
-	template, _ = sjson.Set(template, "usage.input_tokens", rootResult.Get("response.usage.input_tokens").Int())
-	template, _ = sjson.Set(template, "usage.output_tokens", rootResult.Get("response.usage.output_tokens").Int())
+	inputTokens, outputTokens, cachedTokens := extractResponsesUsage(rootResult.Get("response.usage"))
+	template, _ = sjson.Set(template, "usage.input_tokens", inputTokens)
+	template, _ = sjson.Set(template, "usage.output_tokens", outputTokens)
+	if cachedTokens > 0 {
+		template, _ = sjson.Set(template, "usage.cache_read_input_tokens", cachedTokens)
+	}
 
 	output = "event: message_delta\n"
 	output += fmt.Sprintf("data: %s\n\n", template)
@@ -204,8 +208,12 @@ func ConvertCodexResponseToClaudeNonStream(_ context.Context, _ string, original
`{"id":"","type":"message","role":"assistant","model":"","content":[],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":0,"output_tokens":0}}` out, _ = sjson.Set(out, "id", responseData.Get("id").String()) out, _ = sjson.Set(out, "model", responseData.Get("model").String()) - out, _ = sjson.Set(out, "usage.input_tokens", responseData.Get("usage.input_tokens").Int()) - out, _ = sjson.Set(out, "usage.output_tokens", responseData.Get("usage.output_tokens").Int()) + inputTokens, outputTokens, cachedTokens := extractResponsesUsage(responseData.Get("usage")) + out, _ = sjson.Set(out, "usage.input_tokens", inputTokens) + out, _ = sjson.Set(out, "usage.output_tokens", outputTokens) + if cachedTokens > 0 { + out, _ = sjson.Set(out, "usage.cache_read_input_tokens", cachedTokens) + } hasToolCall := false @@ -308,12 +316,27 @@ func ConvertCodexResponseToClaudeNonStream(_ context.Context, _ string, original out, _ = sjson.SetRaw(out, "stop_sequence", stopSequence.Raw) } - if responseData.Get("usage.input_tokens").Exists() || responseData.Get("usage.output_tokens").Exists() { - out, _ = sjson.Set(out, "usage.input_tokens", responseData.Get("usage.input_tokens").Int()) - out, _ = sjson.Set(out, "usage.output_tokens", responseData.Get("usage.output_tokens").Int()) + return out +} + +func extractResponsesUsage(usage gjson.Result) (int64, int64, int64) { + if !usage.Exists() || usage.Type == gjson.Null { + return 0, 0, 0 } - return out + inputTokens := usage.Get("input_tokens").Int() + outputTokens := usage.Get("output_tokens").Int() + cachedTokens := usage.Get("input_tokens_details.cached_tokens").Int() + + if cachedTokens > 0 { + if inputTokens >= cachedTokens { + inputTokens -= cachedTokens + } else { + inputTokens = 0 + } + } + + return inputTokens, outputTokens, cachedTokens } // buildReverseMapFromClaudeOriginalShortToOriginal builds a map[short]original from original Claude request tools. 
diff --git a/internal/translator/openai/claude/openai_claude_request.go b/internal/translator/openai/claude/openai_claude_request.go
index 3817b77b..c268ec62 100644
--- a/internal/translator/openai/claude/openai_claude_request.go
+++ b/internal/translator/openai/claude/openai_claude_request.go
@@ -88,7 +88,7 @@ func ConvertClaudeRequestToOpenAI(modelName string, inputRawJSON []byte, stream
 	var messagesJSON = "[]"
 
 	// Handle system message first
-	systemMsgJSON := `{"role":"system","content":[{"type":"text","text":"Use ANY tool, the parameters MUST accord with RFC 8259 (The JavaScript Object Notation (JSON) Data Interchange Format), the keys and value MUST be enclosed in double quotes."}]}`
+	systemMsgJSON := `{"role":"system","content":[]}`
 	if system := root.Get("system"); system.Exists() {
 		if system.Type == gjson.String {
 			if system.String() != "" {
diff --git a/internal/translator/openai/claude/openai_claude_response.go b/internal/translator/openai/claude/openai_claude_response.go
index 1629545d..b6e0d005 100644
--- a/internal/translator/openai/claude/openai_claude_response.go
+++ b/internal/translator/openai/claude/openai_claude_response.go
@@ -289,21 +289,17 @@ func convertOpenAIStreamingChunkToAnthropic(rawJSON []byte, param *ConvertOpenAI
 	// Only process if usage has actual values (not null)
 	if param.FinishReason != "" {
 		usage := root.Get("usage")
-		var inputTokens, outputTokens int64
+		var inputTokens, outputTokens, cachedTokens int64
 		if usage.Exists() && usage.Type != gjson.Null {
-			// Check if usage has actual token counts
-			promptTokens := usage.Get("prompt_tokens")
-			completionTokens := usage.Get("completion_tokens")
-
-			if promptTokens.Exists() && completionTokens.Exists() {
-				inputTokens = promptTokens.Int()
-				outputTokens = completionTokens.Int()
-			}
+			inputTokens, outputTokens, cachedTokens = extractOpenAIUsage(usage)
 		}
 		// Send message_delta with usage
 		messageDeltaJSON := `{"type":"message_delta","delta":{"stop_reason":"","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}`
 		messageDeltaJSON, _ = sjson.Set(messageDeltaJSON, "delta.stop_reason", mapOpenAIFinishReasonToAnthropic(param.FinishReason))
 		messageDeltaJSON, _ = sjson.Set(messageDeltaJSON, "usage.input_tokens", inputTokens)
 		messageDeltaJSON, _ = sjson.Set(messageDeltaJSON, "usage.output_tokens", outputTokens)
+		if cachedTokens > 0 {
+			messageDeltaJSON, _ = sjson.Set(messageDeltaJSON, "usage.cache_read_input_tokens", cachedTokens)
+		}
 		results = append(results, "event: message_delta\ndata: "+messageDeltaJSON+"\n\n")
 		param.MessageDeltaSent = true
@@ -423,13 +419,12 @@ func convertOpenAINonStreamingToAnthropic(rawJSON []byte) []string {
 
 	// Set usage information
 	if usage := root.Get("usage"); usage.Exists() {
-		out, _ = sjson.Set(out, "usage.input_tokens", usage.Get("prompt_tokens").Int())
-		out, _ = sjson.Set(out, "usage.output_tokens", usage.Get("completion_tokens").Int())
-		reasoningTokens := int64(0)
-		if v := usage.Get("completion_tokens_details.reasoning_tokens"); v.Exists() {
-			reasoningTokens = v.Int()
+		inputTokens, outputTokens, cachedTokens := extractOpenAIUsage(usage)
+		out, _ = sjson.Set(out, "usage.input_tokens", inputTokens)
+		out, _ = sjson.Set(out, "usage.output_tokens", outputTokens)
+		if cachedTokens > 0 {
+			out, _ = sjson.Set(out, "usage.cache_read_input_tokens", cachedTokens)
 		}
-		out, _ = sjson.Set(out, "usage.reasoning_tokens", reasoningTokens)
 	}
 
 	return []string{out}
@@ -674,8 +669,12 @@ func ConvertOpenAIResponseToClaudeNonStream(_ context.Context, _ string, origina
 	}
 
root.Get("usage"); respUsage.Exists() { - out, _ = sjson.Set(out, "usage.input_tokens", respUsage.Get("prompt_tokens").Int()) - out, _ = sjson.Set(out, "usage.output_tokens", respUsage.Get("completion_tokens").Int()) + inputTokens, outputTokens, cachedTokens := extractOpenAIUsage(respUsage) + out, _ = sjson.Set(out, "usage.input_tokens", inputTokens) + out, _ = sjson.Set(out, "usage.output_tokens", outputTokens) + if cachedTokens > 0 { + out, _ = sjson.Set(out, "usage.cache_read_input_tokens", cachedTokens) + } } if !stopReasonSet { @@ -692,3 +691,23 @@ func ConvertOpenAIResponseToClaudeNonStream(_ context.Context, _ string, origina func ClaudeTokenCount(ctx context.Context, count int64) string { return fmt.Sprintf(`{"input_tokens":%d}`, count) } + +func extractOpenAIUsage(usage gjson.Result) (int64, int64, int64) { + if !usage.Exists() || usage.Type == gjson.Null { + return 0, 0, 0 + } + + inputTokens := usage.Get("prompt_tokens").Int() + outputTokens := usage.Get("completion_tokens").Int() + cachedTokens := usage.Get("prompt_tokens_details.cached_tokens").Int() + + if cachedTokens > 0 { + if inputTokens >= cachedTokens { + inputTokens -= cachedTokens + } else { + inputTokens = 0 + } + } + + return inputTokens, outputTokens, cachedTokens +} diff --git a/test/thinking_conversion_test.go b/test/thinking_conversion_test.go index 8f527193..4a7df29a 100644 --- a/test/thinking_conversion_test.go +++ b/test/thinking_conversion_test.go @@ -20,6 +20,7 @@ import ( "github.com/router-for-me/CLIProxyAPI/v6/internal/thinking" sdktranslator "github.com/router-for-me/CLIProxyAPI/v6/sdk/translator" "github.com/tidwall/gjson" + "github.com/tidwall/sjson" ) // thinkingTestCase represents a common test case structure for both suffix and body tests. @@ -2707,6 +2708,9 @@ func runThinkingTests(t *testing.T, cases []thinkingTestCase) { []byte(tc.inputJSON), true, ) + if applyTo == "claude" { + body, _ = sjson.SetBytes(body, "max_tokens", 200000) + } body, err := thinking.ApplyThinking(body, tc.model, tc.from, applyTo)