feat(kiro): enhance request translation and fix streaming issues

English:
- Fix <thinking> tag parsing: only parse at response start, avoid misinterpreting discussion text
- Add token counting support using tiktoken for local estimation
- Support top_p parameter in inference config
- Handle max_tokens=-1 as maximum (32000 tokens)
- Add tool_choice and response_format parameter handling via system prompt hints
- Support multiple thinking mode detection formats (Claude API, OpenAI reasoning_effort, AMP/Cursor)
- Shorten MCP tool names exceeding 64 characters
- Fix duplicate [DONE] marker in OpenAI SSE streaming
- Enhance token usage statistics with multiple event format support
- Add code fence markers to constants

中文:
- 修复 <thinking> 标签解析：仅在响应开头解析，避免误解析讨论文本中的标签
- 使用 tiktoken 实现本地 token 计数功能
- 支持 top_p 推理配置参数
- 处理 max_tokens=-1 转换为最大值（32000 tokens）
- 通过系统提示词注入实现 tool_choice 和 response_format 参数支持
- 支持多种思考模式检测格式（Claude API、OpenAI reasoning_effort、AMP/Cursor）
- 截断超过64字符的 MCP 工具名称
- 修复 OpenAI SSE 流中重复的 [DONE] 标记
- 增强 token 使用量统计，支持多种事件格式
- 添加代码围栏标记常量
2026-05-08 04:56:04 +08:00 · 2025-12-14 11:54:57 +08:00
parent 81ae09d0ec
commit 9c04c18c04
6 changed files with 1278 additions and 27 deletions
--- a/internal/runtime/executor/kiro_executor.go
+++ b/internal/runtime/executor/kiro_executor.go
--- a/internal/translator/kiro/claude/kiro_claude_request.go
+++ b/internal/translator/kiro/claude/kiro_claude_request.go
@@ -30,6 +30,7 @@ type KiroPayload struct {
 type KiroInferenceConfig struct {
+	// NOTE(review): every field below uses omitempty, so encoding/json leaves out zero
+	// values; an explicit temperature or top_p of 0 in the request is therefore not
+	// serialized into the payload. Confirm whether Kiro's default is acceptable there.
 	MaxTokens   int     `json:"maxTokens,omitempty"`
 	Temperature float64 `json:"temperature,omitempty"`
+	TopP        float64 `json:"topP,omitempty"`
 }

 // KiroConversationState holds the conversation context
@@ -136,9 +137,15 @@ func ConvertClaudeRequestToKiro(modelName string, inputRawJSON []byte, stream bo
 // Supports thinking mode - when Claude API thinking parameter is present, injects thinkingHint.
 func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool) []byte {
 	// Extract max_tokens for potential use in inferenceConfig
+	// Handle -1 as "use maximum" (Kiro max output is ~32000 tokens)
+	const kiroMaxOutputTokens = 32000
 	var maxTokens int64
 	if mt := gjson.GetBytes(claudeBody, "max_tokens"); mt.Exists() {
 		maxTokens = mt.Int()
+		if maxTokens == -1 {
+			maxTokens = kiroMaxOutputTokens
+			log.Debugf("kiro: max_tokens=-1 converted to %d", kiroMaxOutputTokens)
+		}
 	}

 	// Extract temperature if specified
@@ -149,6 +156,15 @@ func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isA
 		hasTemperature = true
 	}

+	// Extract top_p if specified
+	var topP float64
+	var hasTopP bool
+	if tp := gjson.GetBytes(claudeBody, "top_p"); tp.Exists() {
+		topP = tp.Float()
+		hasTopP = true
+		log.Debugf("kiro: extracted top_p: %.2f", topP)
+	}
+
 	// Normalize origin value for Kiro API compatibility
 	origin = normalizeOrigin(origin)
 	log.Debugf("kiro: normalized origin value: %s", origin)
@@ -164,8 +180,26 @@ func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isA
 	// Extract system prompt
 	systemPrompt := extractSystemPrompt(claudeBody)

-	// Check for thinking mode
-	thinkingEnabled, budgetTokens := checkThinkingMode(claudeBody)
+	// Check for thinking mode using the comprehensive IsThinkingEnabled function
+	// This supports Claude API format, OpenAI reasoning_effort, and AMP/Cursor format
+	thinkingEnabled := IsThinkingEnabled(claudeBody)
+	_, budgetTokens := checkThinkingMode(claudeBody) // Get budget tokens from Claude format if available
+	if budgetTokens <= 0 {
+		// Calculate budgetTokens based on max_tokens if available
+		// Use 50% of max_tokens for thinking, with min 8000 and max 24000
+		if maxTokens > 0 {
+			budgetTokens = maxTokens / 2
+			if budgetTokens < 8000 {
+				budgetTokens = 8000
+			}
+			if budgetTokens > 24000 {
+				budgetTokens = 24000
+			}
+			log.Debugf("kiro: budgetTokens calculated from max_tokens: %d (max_tokens=%d)", budgetTokens, maxTokens)
+		} else {
+			budgetTokens = 16000 // Default budget tokens
+		}
+	}

 	// Inject timestamp context
 	timestamp := time.Now().Format("2006-01-02 15:04:05 MST")
@@ -185,6 +219,17 @@ func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isA
 		systemPrompt += kirocommon.KiroAgenticSystemPrompt
 	}

+	// Handle tool_choice parameter - Kiro doesn't support it natively, so we inject system prompt hints
+	// Claude tool_choice values: {"type": "auto/any/tool", "name": "..."}
+	toolChoiceHint := extractClaudeToolChoiceHint(claudeBody)
+	if toolChoiceHint != "" {
+		if systemPrompt != "" {
+			systemPrompt += "\n"
+		}
+		systemPrompt += toolChoiceHint
+		log.Debugf("kiro: injected tool_choice hint into system prompt")
+	}
+
 	// Inject thinking hint when thinking mode is enabled
 	if thinkingEnabled {
 		if systemPrompt != "" {
@@ -235,7 +280,7 @@ func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isA

 	// Build inferenceConfig if we have any inference parameters
 	var inferenceConfig *KiroInferenceConfig
-	if maxTokens > 0 || hasTemperature {
+	if maxTokens > 0 || hasTemperature || hasTopP {
 		inferenceConfig = &KiroInferenceConfig{}
 		if maxTokens > 0 {
 			inferenceConfig.MaxTokens = int(maxTokens)
@@ -243,6 +288,9 @@ func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isA
 		if hasTemperature {
 			inferenceConfig.Temperature = temperature
 		}
+		if hasTopP {
+			inferenceConfig.TopP = topP
+		}
 	}

 	payload := KiroPayload{
@@ -324,6 +372,93 @@ func checkThinkingMode(claudeBody []byte) (bool, int64) {
 	return thinkingEnabled, budgetTokens
 }

+// IsThinkingEnabled reports whether the request body asks for thinking mode.
+// The executor uses the answer to decide whether <thinking> tags in a response are
+// parsed as thinking blocks; when thinking was not requested, such tags should be
+// treated as ordinary text content.
+//
+// Detection is attempted in this order and stops at the first match:
+//   - Claude API format, as decided by checkThinkingMode
+//   - OpenAI format: a reasoning_effort value other than "" or "none"
+//   - AMP/Cursor format: the first <thinking_mode>...</thinking_mode> tag found in the
+//     raw body holds "interleaved" or "enabled"
+//   - max_completion_tokens is present and the model name contains "thinking" or
+//     "reason" (case-insensitive)
+func IsThinkingEnabled(body []byte) bool {
+	// Claude API format is consulted first.
+	if claudeEnabled, _ := checkThinkingMode(body); claudeEnabled {
+		log.Debugf("kiro: IsThinkingEnabled returning true (Claude API format)")
+		return true
+	}
+
+	// OpenAI format: any reasoning_effort other than empty or "none" turns thinking on.
+	if effortField := gjson.GetBytes(body, "reasoning_effort"); effortField.Exists() {
+		switch level := effortField.String(); level {
+		case "", "none":
+			// Not a request for reasoning; keep checking the remaining formats.
+		default:
+			log.Debugf("kiro: thinking mode enabled via OpenAI reasoning_effort: %s", level)
+			return true
+		}
+	}
+
+	// AMP/Cursor format: the tag is looked up in the raw request text, and only its
+	// first occurrence is inspected.
+	const (
+		openTag  = "<thinking_mode>"
+		closeTag = "</thinking_mode>"
+	)
+	raw := string(body)
+	if tagAt := strings.Index(raw, openTag); tagAt >= 0 {
+		afterTag := raw[tagAt+len(openTag):]
+		if valueLen := strings.Index(afterTag, closeTag); valueLen >= 0 {
+			switch mode := afterTag[:valueLen]; mode {
+			case "interleaved", "enabled":
+				log.Debugf("kiro: thinking mode enabled via AMP/Cursor format: %s", mode)
+				return true
+			}
+		}
+	}
+
+	// o1-style requests: max_completion_tokens combined with a model name that hints
+	// at reasoning.
+	if gjson.GetBytes(body, "max_completion_tokens").Exists() {
+		modelName := gjson.GetBytes(body, "model").String()
+		lowered := strings.ToLower(modelName)
+		if strings.Contains(lowered, "thinking") || strings.Contains(lowered, "reason") {
+			log.Debugf("kiro: thinking mode enabled via model name hint: %s", modelName)
+			return true
+		}
+	}
+
+	log.Debugf("kiro: IsThinkingEnabled returning false (no thinking mode detected)")
+	return false
+}
+
+// shortenToolNameIfNeeded returns name unchanged when it is at most 64 bytes long and
+// a shortened form otherwise.
+//
+// MCP tools often have long names like "mcp__server-name__tool-name". For names with
+// the "mcp__" prefix the result is "mcp__" plus the text after the last "__"; if that
+// last segment is empty (the name ends in "__") the name is truncated instead, so the
+// result never collapses to the bare prefix. Truncation never splits a multi-byte
+// UTF-8 character, so the result may be slightly shorter than 64 bytes.
+//
+// NOTE(review): distinct long names that share a last segment shorten to the same
+// value; nothing in this function guards against such collisions.
+func shortenToolNameIfNeeded(name string) string {
+	const (
+		limit     = 64
+		mcpPrefix = "mcp__"
+		separator = "__"
+	)
+	if len(name) <= limit {
+		return name
+	}
+
+	// truncate cuts s down to at most limit bytes. If the cut would land inside a
+	// multi-byte UTF-8 sequence it backs off (by at most three bytes) to the start of
+	// that sequence.
+	truncate := func(s string) string {
+		if len(s) <= limit {
+			return s
+		}
+		end := limit
+		for end > limit-3 && s[end]&0xC0 == 0x80 {
+			end--
+		}
+		return s[:end]
+	}
+
+	if strings.HasPrefix(name, mcpPrefix) {
+		// Keep the prefix and the last segment, but only when that segment is non-empty.
+		if idx := strings.LastIndex(name, separator); idx > 0 && idx+len(separator) < len(name) {
+			return truncate(mcpPrefix + name[idx+len(separator):])
+		}
+	}
+	return truncate(name)
+}
+
 // convertClaudeToolsToKiro converts Claude tools to Kiro format
 func convertClaudeToolsToKiro(tools gjson.Result) []KiroToolWrapper {
 	var kiroTools []KiroToolWrapper
@@ -336,6 +471,13 @@ func convertClaudeToolsToKiro(tools gjson.Result) []KiroToolWrapper {
 		description := tool.Get("description").String()
 		inputSchema := tool.Get("input_schema").Value()

+		// Shorten tool name if it exceeds 64 characters (common with MCP tools)
+		originalName := name
+		name = shortenToolNameIfNeeded(name)
+		if name != originalName {
+			log.Debugf("kiro: shortened tool name from '%s' to '%s'", originalName, name)
+		}
+
 		// CRITICAL FIX: Kiro API requires non-empty description
 		if strings.TrimSpace(description) == "" {
 			description = fmt.Sprintf("Tool: %s", name)
@@ -467,6 +609,34 @@ func deduplicateToolResults(toolResults []KiroToolResult) []KiroToolResult {
 	return unique
 }

+// extractClaudeToolChoiceHint turns the Claude tool_choice field into a system prompt
+// hint, or returns "" when no hint is needed.
+//
+// Handled tool_choice shapes:
+//   - {"type": "any"}: hint that at least one tool must be called
+//   - {"type": "tool", "name": "..."}: hint that the named tool must be called; a
+//     missing or empty name yields no hint
+//   - {"type": "auto"}, any other type, or no tool_choice at all: no hint
+func extractClaudeToolChoiceHint(claudeBody []byte) string {
+	choice := gjson.GetBytes(claudeBody, "tool_choice")
+	if !choice.Exists() {
+		return ""
+	}
+
+	kind := choice.Get("type").String()
+	if kind == "any" {
+		return "[INSTRUCTION: You MUST use at least one of the available tools to respond. Do not respond with text only - always make a tool call.]"
+	}
+	if kind == "tool" {
+		if forced := choice.Get("name").String(); forced != "" {
+			return fmt.Sprintf("[INSTRUCTION: You MUST use the tool named '%s' to respond. Do not use any other tool or respond with text only.]", forced)
+		}
+	}
+
+	// "auto" is the default behavior; unknown types and a nameless "tool" also add nothing.
+	return ""
+}
+
 // BuildUserMessageStruct builds a user message and extracts tool results
 func BuildUserMessageStruct(msg gjson.Result, modelID, origin string) (KiroUserInputMessage, []KiroToolResult) {
 	content := msg.Get("content")
--- a/internal/translator/kiro/common/constants.go
+++ b/internal/translator/kiro/common/constants.go
@@ -12,6 +12,15 @@ const (
 	// ThinkingEndTag is the end tag for thinking blocks in responses.
 	ThinkingEndTag = "</thinking>"

+	// CodeFenceMarker is the markdown code fence marker.
+	CodeFenceMarker = "```"
+
+	// AltCodeFenceMarker is the alternative markdown code fence marker.
+	AltCodeFenceMarker = "~~~"
+
+	// InlineCodeMarker is the markdown inline code marker (backtick).
+	InlineCodeMarker = "`"
+
 	// KiroAgenticSystemPrompt is injected only for -agentic models to prevent timeouts on large writes.
 	// AWS Kiro API has a 2-3 minute timeout for large file write operations.
 	KiroAgenticSystemPrompt = `
--- a/internal/translator/kiro/openai/kiro_openai.go
+++ b/internal/translator/kiro/openai/kiro_openai.go
@@ -156,8 +156,9 @@ func ConvertKiroStreamToOpenAI(ctx context.Context, model string, originalReques
 		}

 	case "message_stop":
-		// Final event - emit [DONE]
-		results = append(results, BuildOpenAISSEDone())
+		// Final event - do NOT emit [DONE] here
+		// The handler layer (openai_handlers.go) will send [DONE] when the stream closes
+		// Emitting [DONE] here would cause duplicate [DONE] markers

 	case "ping":
 		// Ping event with usage - optionally emit usage chunk
--- a/internal/translator/kiro/openai/kiro_openai_request.go
+++ b/internal/translator/kiro/openai/kiro_openai_request.go
@@ -29,6 +29,7 @@ type KiroPayload struct {
 type KiroInferenceConfig struct {
+	// NOTE(review): every field below uses omitempty, so encoding/json leaves out zero
+	// values; an explicit temperature or top_p of 0 in the request is therefore not
+	// serialized into the payload. Confirm whether Kiro's default is acceptable there.
 	MaxTokens   int     `json:"maxTokens,omitempty"`
 	Temperature float64 `json:"temperature,omitempty"`
+	TopP        float64 `json:"topP,omitempty"`
 }

 // KiroConversationState holds the conversation context
@@ -134,9 +135,15 @@ func ConvertOpenAIRequestToKiro(modelName string, inputRawJSON []byte, stream bo
 // isChatOnly parameter disables tool calling for -chat model variants (pure conversation mode).
 func BuildKiroPayloadFromOpenAI(openaiBody []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool) []byte {
 	// Extract max_tokens for potential use in inferenceConfig
+	// Handle -1 as "use maximum" (Kiro max output is ~32000 tokens)
+	const kiroMaxOutputTokens = 32000
 	var maxTokens int64
 	if mt := gjson.GetBytes(openaiBody, "max_tokens"); mt.Exists() {
 		maxTokens = mt.Int()
+		if maxTokens == -1 {
+			maxTokens = kiroMaxOutputTokens
+			log.Debugf("kiro-openai: max_tokens=-1 converted to %d", kiroMaxOutputTokens)
+		}
 	}

 	// Extract temperature if specified
@@ -147,6 +154,15 @@ func BuildKiroPayloadFromOpenAI(openaiBody []byte, modelID, profileArn, origin s
 		hasTemperature = true
 	}

+	// Extract top_p if specified
+	var topP float64
+	var hasTopP bool
+	if tp := gjson.GetBytes(openaiBody, "top_p"); tp.Exists() {
+		topP = tp.Float()
+		hasTopP = true
+		log.Debugf("kiro-openai: extracted top_p: %.2f", topP)
+	}
+
 	// Normalize origin value for Kiro API compatibility
 	origin = normalizeOrigin(origin)
 	log.Debugf("kiro-openai: normalized origin value: %s", origin)
@@ -180,6 +196,54 @@ func BuildKiroPayloadFromOpenAI(openaiBody []byte, modelID, profileArn, origin s
 		systemPrompt += kirocommon.KiroAgenticSystemPrompt
 	}

+	// Handle tool_choice parameter - Kiro doesn't support it natively, so we inject system prompt hints
+	// OpenAI tool_choice values: "none", "auto", "required", or {"type":"function","function":{"name":"..."}}
+	toolChoiceHint := extractToolChoiceHint(openaiBody)
+	if toolChoiceHint != "" {
+		if systemPrompt != "" {
+			systemPrompt += "\n"
+		}
+		systemPrompt += toolChoiceHint
+		log.Debugf("kiro-openai: injected tool_choice hint into system prompt")
+	}
+
+	// Handle response_format parameter - Kiro doesn't support it natively, so we inject system prompt hints
+	// OpenAI response_format: {"type": "json_object"} or {"type": "json_schema", "json_schema": {...}}
+	responseFormatHint := extractResponseFormatHint(openaiBody)
+	if responseFormatHint != "" {
+		if systemPrompt != "" {
+			systemPrompt += "\n"
+		}
+		systemPrompt += responseFormatHint
+		log.Debugf("kiro-openai: injected response_format hint into system prompt")
+	}
+
+	// Check for thinking mode and inject thinking hint
+	// Supports OpenAI reasoning_effort parameter and model name hints
+	thinkingEnabled, budgetTokens := checkThinkingModeFromOpenAI(openaiBody)
+	if thinkingEnabled {
+		// Adjust budgetTokens based on max_tokens if not explicitly set by reasoning_effort
+		// Use 50% of max_tokens for thinking, with min 8000 and max 24000
+		if maxTokens > 0 && budgetTokens == 16000 { // 16000 is the default, meaning not explicitly set
+			calculatedBudget := maxTokens / 2
+			if calculatedBudget < 8000 {
+				calculatedBudget = 8000
+			}
+			if calculatedBudget > 24000 {
+				calculatedBudget = 24000
+			}
+			budgetTokens = calculatedBudget
+			log.Debugf("kiro-openai: budgetTokens calculated from max_tokens: %d (max_tokens=%d)", budgetTokens, maxTokens)
+		}
+
+		if systemPrompt != "" {
+			systemPrompt += "\n"
+		}
+		dynamicThinkingHint := fmt.Sprintf("<thinking_mode>interleaved</thinking_mode><max_thinking_length>%d</max_thinking_length>", budgetTokens)
+		systemPrompt += dynamicThinkingHint
+		log.Debugf("kiro-openai: injected dynamic thinking hint into system prompt, max_thinking_length: %d", budgetTokens)
+	}
+
 	// Convert OpenAI tools to Kiro format
 	kiroTools := convertOpenAIToolsToKiro(tools)

@@ -220,7 +284,7 @@ func BuildKiroPayloadFromOpenAI(openaiBody []byte, modelID, profileArn, origin s

 	// Build inferenceConfig if we have any inference parameters
 	var inferenceConfig *KiroInferenceConfig
-	if maxTokens > 0 || hasTemperature {
+	if maxTokens > 0 || hasTemperature || hasTopP {
 		inferenceConfig = &KiroInferenceConfig{}
 		if maxTokens > 0 {
 			inferenceConfig.MaxTokens = int(maxTokens)
@@ -228,6 +292,9 @@ func BuildKiroPayloadFromOpenAI(openaiBody []byte, modelID, profileArn, origin s
 		if hasTemperature {
 			inferenceConfig.Temperature = temperature
 		}
+		if hasTopP {
+			inferenceConfig.TopP = topP
+		}
 	}

 	payload := KiroPayload{
@@ -292,6 +359,28 @@ func extractSystemPromptFromOpenAI(messages gjson.Result) string {
 	return strings.Join(systemParts, "\n")
 }

+// shortenToolNameIfNeeded returns name unchanged when it is at most 64 bytes long and
+// a shortened form otherwise.
+//
+// MCP tools often have long names like "mcp__server-name__tool-name". For names with
+// the "mcp__" prefix the result is "mcp__" plus the text after the last "__"; if that
+// last segment is empty (the name ends in "__") the name is truncated instead, so the
+// result never collapses to the bare prefix. Truncation never splits a multi-byte
+// UTF-8 character, so the result may be slightly shorter than 64 bytes.
+//
+// NOTE(review): distinct long names that share a last segment shorten to the same
+// value; nothing in this function guards against such collisions.
+func shortenToolNameIfNeeded(name string) string {
+	const (
+		limit     = 64
+		mcpPrefix = "mcp__"
+		separator = "__"
+	)
+	if len(name) <= limit {
+		return name
+	}
+
+	// truncate cuts s down to at most limit bytes. If the cut would land inside a
+	// multi-byte UTF-8 sequence it backs off (by at most three bytes) to the start of
+	// that sequence.
+	truncate := func(s string) string {
+		if len(s) <= limit {
+			return s
+		}
+		end := limit
+		for end > limit-3 && s[end]&0xC0 == 0x80 {
+			end--
+		}
+		return s[:end]
+	}
+
+	if strings.HasPrefix(name, mcpPrefix) {
+		// Keep the prefix and the last segment, but only when that segment is non-empty.
+		if idx := strings.LastIndex(name, separator); idx > 0 && idx+len(separator) < len(name) {
+			return truncate(mcpPrefix + name[idx+len(separator):])
+		}
+	}
+	return truncate(name)
+}
+
 // convertOpenAIToolsToKiro converts OpenAI tools to Kiro format
 func convertOpenAIToolsToKiro(tools gjson.Result) []KiroToolWrapper {
 	var kiroTools []KiroToolWrapper
@@ -314,6 +403,13 @@ func convertOpenAIToolsToKiro(tools gjson.Result) []KiroToolWrapper {
 		description := fn.Get("description").String()
 		parameters := fn.Get("parameters").Value()

+		// Shorten tool name if it exceeds 64 characters (common with MCP tools)
+		originalName := name
+		name = shortenToolNameIfNeeded(name)
+		if name != originalName {
+			log.Debugf("kiro-openai: shortened tool name from '%s' to '%s'", originalName, name)
+		}
+
 		// CRITICAL FIX: Kiro API requires non-empty description
 		if strings.TrimSpace(description) == "" {
 			description = fmt.Sprintf("Tool: %s", name)
@@ -584,6 +680,153 @@ func buildFinalContent(content, systemPrompt string, toolResults []KiroToolResul
 	return finalContent
 }

+// checkThinkingModeFromOpenAI reports whether an OpenAI-style request asks for
+// thinking mode and which thinking budget to use.
+//
+// Returns (thinkingEnabled, budgetTokens). budgetTokens is 16000 unless one of the
+// sources below supplies another value. Sources are checked in this order and the
+// first match wins:
+//   - reasoning_effort other than "" or "none": low=8000, medium=16000, high=32000,
+//     auto or any other value=16000
+//   - the first <thinking_mode> tag in the raw body holding "interleaved" or "enabled"
+//     (AMP/Cursor format); a positive <max_thinking_length> value, when present,
+//     becomes the budget, while a missing, unparsable, zero or negative value leaves
+//     the default in place
+//   - a model name containing "thinking" or "-reason" (case-insensitive)
+func checkThinkingModeFromOpenAI(openaiBody []byte) (bool, int64) {
+	var budgetTokens int64 = 16000 // Default budget
+
+	// Check OpenAI format: reasoning_effort parameter
+	// Valid values: "low", "medium", "high", "auto" (not "none")
+	reasoningEffort := gjson.GetBytes(openaiBody, "reasoning_effort")
+	if reasoningEffort.Exists() {
+		effort := reasoningEffort.String()
+		if effort != "" && effort != "none" {
+			log.Debugf("kiro-openai: thinking mode enabled via reasoning_effort: %s", effort)
+			// Adjust budget based on effort level; unrecognized levels keep the default.
+			switch effort {
+			case "low":
+				budgetTokens = 8000
+			case "medium":
+				budgetTokens = 16000
+			case "high":
+				budgetTokens = 32000
+			case "auto":
+				budgetTokens = 16000
+			}
+			return true, budgetTokens
+		}
+	}
+
+	// Check AMP/Cursor format: <thinking_mode>interleaved</thinking_mode>, looked up in
+	// the raw request text. Only the first occurrence of the tag is inspected.
+	const (
+		modeStartTag   = "<thinking_mode>"
+		modeEndTag     = "</thinking_mode>"
+		lengthStartTag = "<max_thinking_length>"
+		lengthEndTag   = "</max_thinking_length>"
+	)
+	bodyStr := string(openaiBody)
+	if startIdx := strings.Index(bodyStr, modeStartTag); startIdx >= 0 {
+		startIdx += len(modeStartTag)
+		if endIdx := strings.Index(bodyStr[startIdx:], modeEndTag); endIdx >= 0 {
+			thinkingMode := bodyStr[startIdx : startIdx+endIdx]
+			if thinkingMode == "interleaved" || thinkingMode == "enabled" {
+				log.Debugf("kiro-openai: thinking mode enabled via AMP/Cursor format: %s", thinkingMode)
+				// Try to extract max_thinking_length if present
+				if maxLenStart := strings.Index(bodyStr, lengthStartTag); maxLenStart >= 0 {
+					maxLenStart += len(lengthStartTag)
+					if maxLenEnd := strings.Index(bodyStr[maxLenStart:], lengthEndTag); maxLenEnd >= 0 {
+						maxLenStr := bodyStr[maxLenStart : maxLenStart+maxLenEnd]
+						// Parse into a scratch variable so a bad value cannot clobber the
+						// default, and accept only a positive budget.
+						var requested int64
+						if parsed, err := fmt.Sscanf(maxLenStr, "%d", &requested); err == nil && parsed == 1 && requested > 0 {
+							budgetTokens = requested
+							log.Debugf("kiro-openai: extracted max_thinking_length: %d", budgetTokens)
+						} else {
+							log.Debugf("kiro-openai: ignoring invalid max_thinking_length %q, using default %d", maxLenStr, budgetTokens)
+						}
+					}
+				}
+				return true, budgetTokens
+			}
+		}
+	}
+
+	// Check model name for thinking hints
+	model := gjson.GetBytes(openaiBody, "model").String()
+	modelLower := strings.ToLower(model)
+	if strings.Contains(modelLower, "thinking") || strings.Contains(modelLower, "-reason") {
+		log.Debugf("kiro-openai: thinking mode enabled via model name hint: %s", model)
+		return true, budgetTokens
+	}
+
+	log.Debugf("kiro-openai: no thinking mode detected in OpenAI request")
+	return false, budgetTokens
+}
+
+// extractToolChoiceHint turns the OpenAI tool_choice field into a system prompt hint,
+// or returns "" when no hint is needed.
+//
+// Handled tool_choice shapes:
+//   - "none": hint that no tool may be used
+//   - "required": hint that at least one tool must be called
+//   - {"type":"function","function":{"name":"..."}}: hint that the named tool must be
+//     called; a missing or empty name yields no hint
+//   - "auto", any other value, or no tool_choice at all: no hint
+func extractToolChoiceHint(openaiBody []byte) string {
+	choice := gjson.GetBytes(openaiBody, "tool_choice")
+	if !choice.Exists() {
+		return ""
+	}
+
+	switch {
+	case choice.Type == gjson.String:
+		switch choice.String() {
+		case "none":
+			// Ideally the tools would not be forwarded at all for "none"; that cannot be
+			// changed from here, so a strongly worded hint is used instead.
+			return "[INSTRUCTION: Do NOT use any tools. Respond with text only.]"
+		case "required":
+			return "[INSTRUCTION: You MUST use at least one of the available tools to respond. Do not respond with text only - always make a tool call.]"
+		}
+		// "auto" is the default behavior; other strings add nothing either.
+		return ""
+
+	case choice.IsObject() && choice.Get("type").String() == "function":
+		if forced := choice.Get("function.name").String(); forced != "" {
+			return fmt.Sprintf("[INSTRUCTION: You MUST use the tool named '%s' to respond. Do not use any other tool or respond with text only.]", forced)
+		}
+	}
+
+	return ""
+}
+
+// extractResponseFormatHint turns the OpenAI response_format field into a system
+// prompt hint, or returns "" when no hint is needed.
+//
+// Handled response_format types:
+//   - "json_object": hint that the reply must be raw, valid JSON
+//   - "json_schema": same hint, plus the raw schema from json_schema.schema when it is
+//     present; a schema longer than 500 bytes is cut and suffixed with "...", and the
+//     cut never splits a multi-byte UTF-8 character
+//   - "text", any other type, or no response_format at all: no hint
+//
+// NOTE(review): a truncated schema is no longer complete JSON, so the model only sees
+// the beginning of what it is told to match.
+func extractResponseFormatHint(openaiBody []byte) string {
+	const (
+		maxSchemaBytes = 500
+		jsonOnlyHint   = "[INSTRUCTION: You MUST respond with valid JSON only. Do not include any text before or after the JSON. Do not wrap the JSON in markdown code blocks. Output raw JSON directly.]"
+	)
+
+	responseFormat := gjson.GetBytes(openaiBody, "response_format")
+	if !responseFormat.Exists() {
+		return ""
+	}
+
+	switch responseFormat.Get("type").String() {
+	case "json_object":
+		return jsonOnlyHint
+	case "json_schema":
+		schema := responseFormat.Get("json_schema.schema")
+		if !schema.Exists() {
+			return jsonOnlyHint
+		}
+		schemaStr := schema.Raw
+		if len(schemaStr) > maxSchemaBytes {
+			// Back off (by at most three bytes) when the cut would land inside a
+			// multi-byte UTF-8 sequence, so the hint stays valid UTF-8.
+			cut := maxSchemaBytes
+			for cut > maxSchemaBytes-3 && schemaStr[cut]&0xC0 == 0x80 {
+				cut--
+			}
+			schemaStr = schemaStr[:cut] + "..."
+		}
+		return fmt.Sprintf("[INSTRUCTION: You MUST respond with valid JSON that matches this schema: %s. Do not include any text before or after the JSON. Do not wrap the JSON in markdown code blocks. Output raw JSON directly.]", schemaStr)
+	}
+
+	// "text" is the default behavior; unknown types add nothing either.
+	return ""
+}
+
 // deduplicateToolResults removes duplicate tool results
 func deduplicateToolResults(toolResults []KiroToolResult) []KiroToolResult {
 	if len(toolResults) == 0 {
--- a/internal/translator/kiro/openai/kiro_openai_stream.go
+++ b/internal/translator/kiro/openai/kiro_openai_stream.go
@@ -5,7 +5,6 @@ package openai

 import (
 	"encoding/json"
-	"fmt"
 	"time"

 	"github.com/google/uuid"
@@ -34,9 +33,12 @@ func NewOpenAIStreamState(model string) *OpenAIStreamState {
 	}
 }

-// FormatSSEEvent formats a JSON payload as an SSE event
+// FormatSSEEvent formats a JSON payload for SSE streaming.
+// Note: This returns raw JSON data without "data:" prefix.
+// The SSE "data:" prefix is added by the Handler layer (e.g., openai_handlers.go)
+// to maintain architectural consistency and avoid double-prefix issues.
 func FormatSSEEvent(data []byte) string {
-	return fmt.Sprintf("data: %s", string(data))
+	return string(data)
 }

 // BuildOpenAISSETextDelta creates an SSE event for text content delta
@@ -130,9 +132,12 @@ func BuildOpenAISSEUsage(state *OpenAIStreamState, usageInfo usage.Detail) strin
 	return FormatSSEEvent(result)
 }

-// BuildOpenAISSEDone creates the final [DONE] SSE event
+// BuildOpenAISSEDone creates the final [DONE] SSE event.
+// Note: This returns raw "[DONE]" without "data:" prefix.
+// The SSE "data:" prefix is added by the Handler layer (e.g., openai_handlers.go)
+// to maintain architectural consistency and avoid double-prefix issues.
 func BuildOpenAISSEDone() string {
-	return "data: [DONE]"
+	return "[DONE]"
 }

 // buildBaseChunk creates a base chunk structure for streaming