diff --git a/Dockerfile b/Dockerfile index dfddf1a..add933d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,6 +21,9 @@ WORKDIR /app # 设置为生产环境 ENV NODE_ENV=production +# 增大 Node.js 堆内存上限,防止日志文件过大时加载 OOM(tesseract.js / js-tiktoken 初始化也有一定内存需求) +ENV NODE_OPTIONS="--max-old-space-size=4096" + # 出于安全考虑,避免使用 root 用户运行服务 RUN addgroup --system --gid 1001 nodejs && \ adduser --system --uid 1001 cursor diff --git a/README.md b/README.md index c88fb25..a125b20 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cp config.yaml.example config.yaml | `logging.persist_mode` | 日志落盘模式:`summary` 问答摘要 / `compact` 精简 / `full` 完整 | `summary` | | `max_auto_continue` | 截断自动续写次数 (`0`=禁用,交由客户端续写) | `0` | | `max_history_messages` | 历史消息条数上限,超出时删除最早消息(建议改用 `max_history_tokens`) | `-1`(不限制) | -| `max_history_tokens` | 历史消息 token 数上限(推荐),有助于减少超出 Cursor 上下文的概率;注意 tiktoken 低估约 10~20%,建议参考实际 UI 日志调整,参考值 `120000~140000` | `130000` | +| `max_history_tokens` | 历史消息 token 数上限(推荐),代码自动补偿 Cursor 后端开销(1,300 基础 + 工具 tokenizer 差异),参考值 `130000~170000` | `150000` | | `sanitize_response` | 响应内容清洗开关(替换 Cursor 身份引用为 Claude) | `false` | | `refusal_patterns` | 自定义拒绝检测规则列表(追加到内置规则) | 不配置 | | `tools.passthrough` | 🆕 透传模式:跳过 few-shot 注入,原始 JSON 嵌入(Roo Code/Cline 推荐) | `false` | @@ -243,6 +243,8 @@ AI 按此格式输出 → 我们解析并转换为标准的 Anthropic `tool_use` +> ⚠️ **环境变量优先级高于 `config.yaml`**:若在 docker-compose 等环境中设置了环境变量,该参数的 `config.yaml` 配置会被覆盖,热重载对其**无效**。需要通过 `config.yaml` 动态调整的参数,请勿同时在环境变量中设置。 + | 环境变量 | 说明 | |----------|------| | `PORT` | 服务端口 | | `AUTH_TOKEN` | API 鉴权 token(逗号分隔多个) | | `PROXY` | 全局代理地址 | @@ -254,7 +256,7 @@ AI 按此格式输出 → 我们解析并转换为标准的 Anthropic `tool_use` | `LOG_DIR` | 日志文件目录 | | `MAX_AUTO_CONTINUE` | 截断自动续写次数 (`0`=禁用) | | `MAX_HISTORY_MESSAGES` | 历史消息条数上限(`-1`=不限制) | -| `MAX_HISTORY_TOKENS` | 历史消息 token 数上限(默认 `130000`,`-1`=不限制,参考值 `120000~140000`,tiktoken 低估约 10~20%) | +| `MAX_HISTORY_TOKENS` | 历史消息 token 数上限(默认 `150000`,`-1`=不限制,参考值 `130000~170000`,代码自动补偿 Cursor 后端开销) | | `SANITIZE_RESPONSE` | 响应内容清洗开关 
(`true`/`false`,默认 `false`) | | `TOOLS_PASSTHROUGH` | 🆕 工具透传模式 (`true`/`false`,默认 `false`) | | `TOOLS_DISABLED` | 🆕 工具禁用模式 (`true`/`false`,默认 `false`) | diff --git a/config.yaml.example b/config.yaml.example index 6328c4f..a5fb251 100644 --- a/config.yaml.example +++ b/config.yaml.example @@ -1,5 +1,10 @@ # Cursor2API v2 配置文件 # 复制此文件为 config.yaml 并根据需要修改 +# +# ⚠️ 环境变量优先级高于此文件: +# 若通过环境变量(如 docker-compose 的 environment 块)设置了某个参数, +# 则修改此文件对该参数无效,热重载也不会生效。 +# 需要在 config.yaml 中管理的参数,请勿同时在环境变量中设置。 # 服务端口 port: 3010 @@ -46,25 +51,27 @@ max_history_messages: -1 # 按 js-tiktoken (cl100k_base) 估算 token 数裁剪历史,比按条数更精准 # 能有效防止超出 Cursor API 200k 上下文上限,保障模型输出稳定 # -# ⚠️ 注意:js-tiktoken 使用 OpenAI cl100k_base 词表估算,与 Claude 实际 tokenizer 有差异 -# 实测低估约 10%~20%,中英混合/工具调用场景差异更大 -# 建议开启后观察 UI 日志中的「↑ Cursor 输入 tokens」真实值,再据此调整 +# 说明:此值仅计算我们发送的消息内容 token +# 代码会自动额外补偿 Cursor 后端开销(动态计算): +# - 基础隐藏系统提示:约 1,300 tokens(固定) +# - 工具 tokenizer 差异:compact ~20/工具,full ~240/工具,names_only ~5/工具 +# 输出空间不在此预留,由用户自行通过此值控制(建议留 16,000~32,000 余量) # # 裁剪规则: -# - 系统提示 + 工具定义的 token 优先扣除 +# - 系统提示 + 工具定义的 token 优先扣除(含上述固定开销) # - 剩余额度从最新消息往前累加,超出预算的最早消息整条删除 # - 工具模式的 few-shot 示例(前 2 条)始终保留 # -# 参考值:120000~140000(考虑到估算误差,需预留足够安全余量) -# Cursor API 上下文上限约 200k tokens,实际可用历史额度受系统提示和工具定义影响 +# 参考值:130000~170000,默认 150000 +# Cursor API 上下文上限约 200k tokens,建议 max_history_tokens + 开销 + 预留输出 ≤ 200000 # # 与 max_history_messages 的关系: # 两者独立生效,若同时设置则取更严格的结果 # 推荐:只设置 max_history_tokens,不设置 max_history_messages # # 设为 -1 不限制 -# 环境变量: MAX_HISTORY_TOKENS=130000 -max_history_tokens: 130000 +# 环境变量: MAX_HISTORY_TOKENS=150000 +max_history_tokens: 150000 # ==================== Thinking 开关(最高优先级) ==================== # 控制是否向 Cursor 发送 thinking 请求,优先级高于客户端传入的 thinking 参数 diff --git a/docker-compose.yml b/docker-compose.yml index 4792af2..35ffeb7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -39,7 +39,7 @@ services: # ── 自动续写 & 历史消息限制 ── # - MAX_AUTO_CONTINUE=0 # 截断后自动续写次数,0=禁用(默认) # - MAX_HISTORY_MESSAGES=-1 # 
历史消息条数上限,-1=不限制(建议改用 MAX_HISTORY_TOKENS) - # - MAX_HISTORY_TOKENS=130000 # 历史消息 token 数上限(推荐),默认 130000,参考值 120000~140000(tiktoken 低估约 10~20%,建议观察 UI 日志实际值后调整) + # - MAX_HISTORY_TOKENS=150000 # 历史消息 token 数上限(推荐),默认 150000,参考值 130000~170000(代码自动补偿 Cursor 后端开销) # ── 日志持久化 ── # - LOG_FILE_ENABLED=true diff --git a/src/config.ts b/src/config.ts index 1cffc10..7b34a54 100644 --- a/src/config.ts +++ b/src/config.ts @@ -195,7 +195,7 @@ function defaultConfig(): AppConfig { cursorModel: 'anthropic/claude-sonnet-4.6', maxAutoContinue: 0, maxHistoryMessages: -1, - maxHistoryTokens: 130000, + maxHistoryTokens: 150000, sanitizeEnabled: false, // 默认关闭响应内容清洗 fingerprint: { userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36', diff --git a/src/converter.ts b/src/converter.ts index c18196f..77c4669 100644 --- a/src/converter.ts +++ b/src/converter.ts @@ -682,17 +682,21 @@ I will ALWAYS use this exact \`\`\`json action\`\`\` block format for tool calls if (maxHistoryTokens >= 0) { const fewShotOffset2 = hasTools ? 2 : 0; - // 估算系统提示 token 数 + // 直接对已构建的 few-shot 消息(系统提示+工具定义+few-shot回复)调用 estimateTokens + // 比 tools.length*70+350 更准确,因为实际注入文字已经在 messages[0..fewShotOffset2-1] 中 let overhead = 0; - if (req.system) { - const sysStr = typeof req.system === 'string' ? req.system : JSON.stringify(req.system); - overhead += estimateTokens(sysStr); - } - // 估算工具定义 token 数(压缩后约 70 tokens/工具 + 350 固定开销) - if (req.tools && req.tools.length > 0) { - overhead += req.tools.length * 70; - overhead += 350; + for (let i = 0; i < fewShotOffset2; i++) { + overhead += estimateTokens(messages[i].parts.map(p => p.text ?? 
'').join('')); } + // Cursor 后端额外开销:基础隐藏系统提示(实测约 1300 tokens)+ 工具 tokenizer 差异 + // 注意:工具定义已通过 buildToolInstructions 转为文本注入 messages[0],并已在上方 estimateTokens 中计算 + // Cursor 后端对工具的额外 tokenizer 差异与 schema_mode 强相关: + // compact模式 ~20 tokens/工具,full模式 ~240 tokens/工具,names_only ~5 tokens/工具 + // 输出空间不在此预留,由用户通过 max_history_tokens 自行控制 + const toolCount = req.tools?.length ?? 0; + const schemaMode = getConfig().tools?.schemaMode ?? 'compact'; + const perToolOverhead = schemaMode === 'full' ? 240 : (schemaMode === 'names_only' ? 5 : 20); + overhead += 1300 + toolCount * perToolOverhead; const historyBudget = Math.max(0, maxHistoryTokens - overhead); diff --git a/src/cursor-client.ts b/src/cursor-client.ts index e98d866..affdf73 100644 --- a/src/cursor-client.ts +++ b/src/cursor-client.ts @@ -247,14 +247,18 @@ async function sendCursorRequestInner( } /** - * 发送非流式请求,收集完整响应 + * 发送非流式请求,收集完整响应及 usage 信息 */ -export async function sendCursorRequestFull(req: CursorChatRequest): Promise { +export async function sendCursorRequestFull(req: CursorChatRequest): Promise<{ text: string; usage?: { inputTokens?: number; outputTokens?: number; totalTokens?: number } }> { let fullText = ''; + let usage: { inputTokens?: number; outputTokens?: number; totalTokens?: number } | undefined; await sendCursorRequest(req, (event) => { if (event.type === 'text-delta' && event.delta) { fullText += event.delta; } + if (event.messageMetadata?.usage) { + usage = event.messageMetadata.usage; + } }); - return fullText; + return { text: fullText, usage }; } diff --git a/src/handler.ts b/src/handler.ts index da4562d..f3e4c21 100644 --- a/src/handler.ts +++ b/src/handler.ts @@ -97,6 +97,19 @@ export function listModels(_req: Request, res: Response): void { // ==================== Token 计数 ==================== +/** + * 对实际发往 Cursor 的完整消息内容做 token 估算(用于与 Cursor 返回值对比) + */ +export function estimateCursorReqTokens(cursorReq: CursorChatRequest): number { + let total = 0; + for (const msg of cursorReq.messages) 
{ + for (const part of msg.parts) { + total += estimateTokens(part.text ?? ''); + } + } + return total; +} + export function estimateInputTokens(body: AnthropicRequest): number { let total = 0; @@ -479,6 +492,8 @@ function toolCallNeedsMoreContinuation(toolCall: ParsedToolCall): boolean { */ export function shouldAutoContinueTruncatedToolResponse(text: string, hasTools: boolean): boolean { if (!hasTools || !isTruncated(text)) return false; + // 响应过短(< 200 chars)时不触发续写:上下文不足会导致模型拒绝或错误续写 + if (text.trim().length < 200) return false; if (!hasToolCalls(text)) return true; const { toolCalls } = parseToolCalls(text); @@ -677,7 +692,7 @@ Continue EXACTLY from where you stopped. DO NOT repeat any content already gener ], }; - const continuationResponse = await sendCursorRequestFull(continuationReq); + const { text: continuationResponse } = await sendCursorRequestFull(continuationReq); if (continuationResponse.trim().length === 0) break; const deduped = deduplicateContinuation(fullText, continuationResponse); @@ -1005,9 +1020,12 @@ async function handleDirectTextStream( ? sanitizeResponse(finalVisibleText) : finalTextToSend; log.recordFinalResponse(finalRecordedResponse); + const estimatedInput1 = estimateCursorReqTokens(activeCursorReq); + const actualInput1 = cursorUsage?.inputTokens; + console.log(`[TokenDiff] 流式(无工具) 估算(我们发的)=${estimatedInput1} Cursor实际=${actualInput1 ?? 'N/A'} Cursor隐藏开销=${actualInput1 != null ? (actualInput1 - estimatedInput1) : 'N/A'}`); log.updateSummary({ - inputTokens: cursorUsage?.inputTokens ?? estimateInputTokens(body), - outputTokens: cursorUsage?.outputTokens ?? 
estimateTokens(finalRecordedResponse), + inputTokens: cursorUsage?.inputTokens, + outputTokens: cursorUsage?.outputTokens, }); log.complete(finalRecordedResponse.length, 'end_turn'); @@ -1658,9 +1676,12 @@ Please go ahead and pick the most appropriate tool for the current task and outp // ★ 记录完成 log.recordFinalResponse(fullResponse); + const estimatedInput2 = estimateCursorReqTokens(activeCursorReq); + const actualInput2 = cursorUsage?.inputTokens; + console.log(`[TokenDiff] 流式(有工具) 估算(我们发的)=${estimatedInput2} Cursor实际=${actualInput2 ?? 'N/A'} Cursor隐藏开销=${actualInput2 != null ? (actualInput2 - estimatedInput2) : 'N/A'}`); log.updateSummary({ - inputTokens: cursorUsage?.inputTokens ?? estimateInputTokens(body), - outputTokens: cursorUsage?.outputTokens ?? estimateTokens(fullResponse), + inputTokens: cursorUsage?.inputTokens, + outputTokens: cursorUsage?.outputTokens, }); log.complete(fullResponse.length, stopReason); @@ -1695,7 +1716,7 @@ async function handleNonStream(res: Response, cursorReq: CursorChatRequest, body try { log.startPhase('send', '发送到 Cursor (非流式)'); const apiStart = Date.now(); - let fullText = await sendCursorRequestFull(cursorReq); + let { text: fullText, usage: cursorUsage } = await sendCursorRequestFull(cursorReq); log.recordTTFT(); log.recordCursorApiTime(apiStart); log.recordRawResponse(fullText); @@ -1738,7 +1759,7 @@ async function handleNonStream(res: Response, cursorReq: CursorChatRequest, body log.updateSummary({ retryCount }); const retryBody = buildRetryRequest(body, attempt); activeCursorReq = await convertToCursorRequest(retryBody); - fullText = await sendCursorRequestFull(activeCursorReq); + ({ text: fullText, usage: cursorUsage } = await sendCursorRequestFull(activeCursorReq)); // 重试后也需要剥离 thinking 标签 if (hasLeadingThinking(fullText)) { const { thinkingContent: retryThinking, strippedText: retryStripped } = extractThinking(fullText); @@ -1768,7 +1789,7 @@ async function handleNonStream(res: Response, cursorReq: CursorChatRequest, 
body retryCount++; log.warn('Handler', 'retry', `非流式响应过短 (${fullText.length} chars),重试第${retryCount}次`); activeCursorReq = await convertToCursorRequest(body); - fullText = await sendCursorRequestFull(activeCursorReq); + ({ text: fullText, usage: cursorUsage } = await sendCursorRequestFull(activeCursorReq)); log.info('Handler', 'retry', `非流式重试响应: ${fullText.length} chars`, { preview: fullText.substring(0, 200) }); } @@ -1813,7 +1834,7 @@ Continue EXACTLY from where you stopped. DO NOT repeat any content already gener ], }; - const continuationResponse = await sendCursorRequestFull(continuationReq); + const { text: continuationResponse } = await sendCursorRequestFull(continuationReq); if (continuationResponse.trim().length === 0) { log.warn('Handler', 'continuation', '非流式续写返回空响应,停止续写'); @@ -1919,7 +1940,7 @@ Please go ahead and pick the most appropriate tool for the current task and outp }, ]; activeCursorReq = { ...activeCursorReq, messages: forceMessages }; - fullText = await sendCursorRequestFull(activeCursorReq); + ({ text: fullText, usage: cursorUsage } = await sendCursorRequestFull(activeCursorReq)); ({ toolCalls, cleanText } = parseToolCalls(fullText)); } if (toolChoice?.type === 'any' && toolCalls.length === 0) { @@ -1983,7 +2004,10 @@ Please go ahead and pick the most appropriate tool for the current task and outp // ★ 记录完成 log.recordFinalResponse(fullText); - log.updateSummary({ inputTokens: estimateInputTokens(body), outputTokens: estimateTokens(fullText) }); + const estimatedInput = estimateCursorReqTokens(activeCursorReq); + const actualInput = cursorUsage?.inputTokens; + console.log(`[TokenDiff] 非流式 估算(我们发的)=${estimatedInput} Cursor实际=${actualInput ?? 'N/A'} Cursor隐藏开销=${actualInput != null ? 
(actualInput - estimatedInput) : 'N/A'}`); + log.updateSummary({ inputTokens: cursorUsage?.inputTokens, outputTokens: cursorUsage?.outputTokens }); log.complete(fullText.length, stopReason); } catch (err: unknown) { diff --git a/src/openai-handler.ts b/src/openai-handler.ts index a70ec51..5441d50 100644 --- a/src/openai-handler.ts +++ b/src/openai-handler.ts @@ -1134,7 +1134,7 @@ async function handleOpenAINonStream( log: RequestLogger, ): Promise { let activeCursorReq = cursorReq; - let fullText = await sendCursorRequestFull(activeCursorReq); + let fullText = (await sendCursorRequestFull(activeCursorReq)).text; const hasTools = (body.tools?.length ?? 0) > 0; // 日志记录在详细日志中 @@ -1162,7 +1162,7 @@ async function handleOpenAINonStream( const retryBody = buildRetryRequest(anthropicReq, attempt); const retryCursorReq = await convertToCursorRequest(retryBody); activeCursorReq = retryCursorReq; - fullText = await sendCursorRequestFull(activeCursorReq); + fullText = (await sendCursorRequestFull(activeCursorReq)).text; // 重试响应也需要先剥离 thinking if (hasLeadingThinking(fullText)) { fullText = extractThinking(fullText).strippedText; @@ -1775,7 +1775,7 @@ async function handleResponsesNonStream( log: RequestLogger, ): Promise { let activeCursorReq = cursorReq; - let fullText = await sendCursorRequestFull(activeCursorReq); + let fullText = (await sendCursorRequestFull(activeCursorReq)).text; const hasTools = (anthropicReq.tools?.length ?? 
0) > 0; // Thinking 提取 @@ -1790,7 +1790,7 @@ async function handleResponsesNonStream( const retryBody = buildRetryRequest(anthropicReq, attempt); const retryCursorReq = await convertToCursorRequest(retryBody); activeCursorReq = retryCursorReq; - fullText = await sendCursorRequestFull(activeCursorReq); + fullText = (await sendCursorRequestFull(activeCursorReq)).text; if (hasLeadingThinking(fullText)) { fullText = extractThinking(fullText).strippedText; } diff --git a/src/types.ts b/src/types.ts index 39f7030..77610e0 100644 --- a/src/types.ts +++ b/src/types.ts @@ -116,7 +116,7 @@ export interface AppConfig { authTokens?: string[]; // API 鉴权 token 列表,为空则不鉴权 maxAutoContinue: number; // 自动续写最大次数,默认 3,设 0 禁用 maxHistoryMessages: number; // 历史消息条数硬限制,默认 -1(不限制) - maxHistoryTokens: number; // 历史消息 token 数上限(js-tiktoken 估算),默认 130000,-1 不限制 + maxHistoryTokens: number; // 历史消息 token 数上限(tiktoken 估算我们发出的内容,代码自动加 Cursor 后端开销:1300 基础 + perTool*工具数),默认 150000,-1 不限制 vision?: { enabled: boolean; mode: 'ocr' | 'api'; diff --git a/vue-ui/README.md b/vue-ui/README.md index 9a4c6cc..3e01d96 100644 --- a/vue-ui/README.md +++ b/vue-ui/README.md @@ -136,7 +136,7 @@ open http://localhost:3010/vuelogs | 基础 | `timeout` | 请求超时(秒) | | 基础 | `max_auto_continue` | 自动续写次数 | | 基础 | `max_history_messages` | 历史消息条数上限(建议改用 max_history_tokens) | -| 基础 | `max_history_tokens` | 历史消息 token 数上限(推荐),参考值 120000~140000(tiktoken 与 Claude 实际 tokenizer 有差异,建议观察 UI 日志实际值后调整) | +| 基础 | `max_history_tokens` | 历史消息 token 数上限(推荐),代码自动补偿 Cursor 后端开销(1,300 基础 + 工具 tokenizer 差异,动态计算),参考值 130000~170000,默认 150000 | | 功能 | `thinking.enabled` | Thinking 模式(跟随客户端/强制关闭/强制开启) | | 功能 | `sanitize_response` | 响应内容清洗 | | 历史压缩 | `compression.*` | 压缩开关、级别、保留条数等 | diff --git a/vue-ui/src/components/ConfigDrawer.vue b/vue-ui/src/components/ConfigDrawer.vue index b72d5d7..d8002c5 100644 --- a/vue-ui/src/components/ConfigDrawer.vue +++ b/vue-ui/src/components/ConfigDrawer.vue @@ -28,7 +28,7 @@ - +