diff --git a/crates/loopal-context/src/budget.rs b/crates/loopal-context/src/budget.rs index 6c40340b..c9de2008 100644 --- a/crates/loopal-context/src/budget.rs +++ b/crates/loopal-context/src/budget.rs @@ -14,6 +14,8 @@ pub struct ContextBudget { pub safety_margin: u32, /// Actual token budget available for messages. pub message_budget: u32, + /// True max_output_tokens from model (uncapped, for API constraint validation). + pub max_output_tokens: u32, } impl ContextBudget { @@ -48,6 +50,7 @@ impl ContextBudget { output_reserve, safety_margin, message_budget, + max_output_tokens, } } @@ -77,6 +80,19 @@ impl ContextBudget { pub fn needs_emergency(&self, msg_tokens: u32) -> bool { msg_tokens > self.message_budget * 19 / 20 } + + /// Clamp max_tokens so that `estimated_input + result <= context_window`. + /// + /// Pre-flight check before API call: if input has grown large, dynamically + /// reduce max_tokens to avoid the `input + max_tokens > context_window` rejection. + /// Returns the original max_output_tokens when there is enough headroom. + pub fn clamp_output_tokens(&self, estimated_input: u32) -> u32 { + let headroom = self + .context_window + .saturating_sub(estimated_input) + .saturating_sub(self.safety_margin); + self.max_output_tokens.min(headroom).max(1) + } } #[cfg(test)] @@ -108,6 +124,7 @@ mod tests { output_reserve: 0, safety_margin: 0, message_budget: 100_000, + max_output_tokens: 16_384, }; assert!(!budget.needs_compaction(74_999)); assert!(budget.needs_compaction(75_001)); @@ -122,8 +139,36 @@ mod tests { output_reserve: 0, safety_margin: 0, message_budget: 100_000, + max_output_tokens: 16_384, }; assert!(!budget.needs_emergency(94_999)); assert!(budget.needs_emergency(95_001)); } + + #[test] + fn calculate_preserves_max_output_tokens() { + let budget = ContextBudget::calculate(200_000, "sys", 0, 64_000); + assert_eq!(budget.max_output_tokens, 64_000); + assert_eq!(budget.output_reserve, 16_384); + } + + #[test] + fn clamp_preserves_when_input_small() { + let budget = ContextBudget::calculate(200_000, "", 0, 64_000); + assert_eq!(budget.clamp_output_tokens(50_000), 64_000); + } + + #[test] + fn clamp_reduces_when_input_large() { + let budget = ContextBudget::calculate(200_000, "", 0, 64_000); + let clamped = budget.clamp_output_tokens(180_000); + assert!(clamped < 64_000, "should be reduced: {clamped}"); + assert!(clamped <= 200_000 - 180_000 - budget.safety_margin); + } + + #[test] + fn clamp_saturates_to_one() { + let budget = ContextBudget::calculate(200_000, "", 0, 64_000); + assert_eq!(budget.clamp_output_tokens(300_000), 1); + } } diff --git a/crates/loopal-context/tests/suite/degradation_test.rs b/crates/loopal-context/tests/suite/degradation_test.rs index b75f76a7..a99cdb62 100644 --- a/crates/loopal-context/tests/suite/degradation_test.rs +++ b/crates/loopal-context/tests/suite/degradation_test.rs @@ -10,6 +10,7 @@ fn make_budget(message_budget: u32) -> ContextBudget { output_reserve: 0, safety_margin: 0, message_budget, + max_output_tokens: 16_384, } } diff --git a/crates/loopal-context/tests/suite/store_test.rs b/crates/loopal-context/tests/suite/store_test.rs index f803295a..4be8c5f1 100644 --- a/crates/loopal-context/tests/suite/store_test.rs +++ b/crates/loopal-context/tests/suite/store_test.rs @@ -10,6 +10,7 @@ fn make_budget(message_budget: u32) -> ContextBudget { output_reserve: 0, safety_margin: 0, message_budget, + max_output_tokens: 16_384, } } diff --git a/crates/loopal-error/src/helpers.rs b/crates/loopal-error/src/helpers.rs index 5d41662c..8d985c55 100644 --- a/crates/loopal-error/src/helpers.rs +++ b/crates/loopal-error/src/helpers.rs @@ -18,7 +18,8 @@ impl ProviderError { // non-overflow 400s (prefill rejection, malformed blocks, etc.). if *status == 400 && (message.contains("prompt is too long") - || message.contains("maximum context length")) + || message.contains("maximum context length") + || message.contains("exceed context limit")) { return false; } @@ -34,7 +35,9 @@ impl ProviderError { match self { ProviderError::ContextOverflow { .. } => true, ProviderError::Api { status, message } if *status == 400 => { - message.contains("prompt is too long") || message.contains("maximum context length") + message.contains("prompt is too long") + || message.contains("maximum context length") + || message.contains("exceed context limit") } _ => false, } diff --git a/crates/loopal-error/tests/suite/error_edge_test.rs b/crates/loopal-error/tests/suite/error_edge_test.rs index 75540b7f..b2ee6718 100644 --- a/crates/loopal-error/tests/suite/error_edge_test.rs +++ b/crates/loopal-error/tests/suite/error_edge_test.rs @@ -28,6 +28,20 @@ fn test_api_400_prompt_too_long_is_context_overflow() { assert!(!err.is_retryable()); } +#[test] +fn test_api_400_exceed_context_limit_is_context_overflow() { + let body = concat!( + r#"{"error":{"message":"input length and `max_tokens` exceed context limit:"#, + r#" 140795 + 64000 > 200000","type":"invalid_request_error"},"type":"error"}"#, + ); + let err = ProviderError::Api { + status: 400, + message: body.into(), + }; + assert!(err.is_context_overflow()); + assert!(!err.is_retryable()); +} + #[test] fn test_api_400_max_context_is_context_overflow() { let err = ProviderError::Api { diff --git a/crates/loopal-provider/src/anthropic/mod.rs b/crates/loopal-provider/src/anthropic/mod.rs index 1ef726e8..2735c4e9 100644 --- a/crates/loopal-provider/src/anthropic/mod.rs +++ b/crates/loopal-provider/src/anthropic/mod.rs @@ -185,7 +185,9 @@ impl AnthropicProvider { // many 400 errors (prefill rejection, malformed blocks, etc.) and // must not be conflated with context overflow. if status.as_u16() == 400 - && (text.contains("prompt is too long") || text.contains("maximum context length")) + && (text.contains("prompt is too long") + || text.contains("maximum context length") + || text.contains("exceed context limit")) { return ProviderError::ContextOverflow { message: text }.into(); } diff --git a/crates/loopal-runtime/src/agent_loop/llm_params.rs b/crates/loopal-runtime/src/agent_loop/llm_params.rs index feb17013..e28a9680 100644 --- a/crates/loopal-runtime/src/agent_loop/llm_params.rs +++ b/crates/loopal-runtime/src/agent_loop/llm_params.rs @@ -2,6 +2,7 @@ //! //! Split from llm.rs to keep files under 200 lines. +use loopal_context::{estimate_messages_tokens, estimate_tokens}; use loopal_error::Result; use loopal_message::Message; use loopal_provider::{get_thinking_capability, resolve_thinking_config}; @@ -34,18 +35,27 @@ impl AgentLoopRunner { tool_defs.retain(|t| plan_filter.contains(&t.name)); } + // Pre-flight: estimate input tokens and clamp max_tokens to avoid + // the API's `input + max_tokens > context_window` hard rejection. + let tool_token_count = loopal_context::ContextBudget::estimate_tool_tokens(&tool_defs); + let estimated_input = estimate_tokens(&full_system_prompt) + + tool_token_count + + estimate_messages_tokens(messages); + let safe_max_tokens = self + .params + .store + .budget() + .clamp_output_tokens(estimated_input); + let capability = get_thinking_capability(self.params.config.model()); - let resolved_thinking = resolve_thinking_config( - &self.model_config.thinking, - capability, - self.model_config.max_output_tokens, - ); + let resolved_thinking = + resolve_thinking_config(&self.model_config.thinking, capability, safe_max_tokens); Ok(ChatParams { model: self.params.config.model().to_string(), messages: messages.to_vec(), system_prompt: full_system_prompt, tools: tool_defs, - max_tokens: self.model_config.max_output_tokens, + max_tokens: safe_max_tokens, temperature: None, thinking: resolved_thinking, debug_dump_dir: Some(loopal_config::tmp_dir()), diff --git a/crates/loopal-runtime/src/agent_loop/run.rs b/crates/loopal-runtime/src/agent_loop/run.rs index 06fd92c7..a790387b 100644 --- a/crates/loopal-runtime/src/agent_loop/run.rs +++ b/crates/loopal-runtime/src/agent_loop/run.rs @@ -21,6 +21,7 @@ impl AgentLoopRunner { pub(super) async fn run_loop(&mut self) -> Result { let mut last_output = String::new(); let mut server_block_retry = false; + let mut context_overflow_retry = false; let mut needs_input = self.params.store.is_empty(); loop { @@ -84,6 +85,17 @@ impl AgentLoopRunner { needs_input = false; continue; } + if !context_overflow_retry && e.is_context_overflow() { + context_overflow_retry = true; + info!("context overflow detected, emergency compacting and retrying"); + self.params.store.emergency_compact(5); + self.emit(AgentEventPayload::Error { + message: "Context overflow — compacting and retrying...".into(), + }) + .await?; + needs_input = false; + continue; + } if self.interrupt.take() { self.emit_interrupted().await?; continue; @@ -93,6 +105,7 @@ impl AgentLoopRunner { } } server_block_retry = false; + context_overflow_retry = false; } Ok(AgentOutput { diff --git a/crates/loopal-runtime/tests/agent_loop/drain_pending_test.rs b/crates/loopal-runtime/tests/agent_loop/drain_pending_test.rs index 5e6c90fb..ad2ad1d8 100644 --- a/crates/loopal-runtime/tests/agent_loop/drain_pending_test.rs +++ b/crates/loopal-runtime/tests/agent_loop/drain_pending_test.rs @@ -79,6 +79,7 @@ fn make_test_budget() -> ContextBudget { output_reserve: 16_384, safety_margin: 10_000, message_budget: 173_616, + max_output_tokens: 64_000, } } diff --git a/crates/loopal-runtime/tests/agent_loop/integration_test.rs b/crates/loopal-runtime/tests/agent_loop/integration_test.rs index a401a7ff..781f042a 100644 --- a/crates/loopal-runtime/tests/agent_loop/integration_test.rs +++ b/crates/loopal-runtime/tests/agent_loop/integration_test.rs @@ -24,6 +24,7 @@ fn make_test_budget() -> ContextBudget { output_reserve: 16_384, safety_margin: 10_000, message_budget: 173_616, + max_output_tokens: 64_000, } } diff --git a/crates/loopal-runtime/tests/agent_loop/llm_test.rs b/crates/loopal-runtime/tests/agent_loop/llm_test.rs index bf6d726d..64eaf752 100644 --- a/crates/loopal-runtime/tests/agent_loop/llm_test.rs +++ b/crates/loopal-runtime/tests/agent_loop/llm_test.rs @@ -19,6 +19,7 @@ fn test_prepare_chat_params_act_mode() { !params.system_prompt.is_empty(), "env section should be present" ); + // With empty messages and 200K window, max_tokens should be preserved (headroom is large). assert_eq!(params.max_tokens, runner.model_config.max_output_tokens); assert!(params.messages.is_empty()); // Builtin tools should be present diff --git a/crates/loopal-runtime/tests/agent_loop/mock_provider.rs b/crates/loopal-runtime/tests/agent_loop/mock_provider.rs index df6e7929..dec0d9b6 100644 --- a/crates/loopal-runtime/tests/agent_loop/mock_provider.rs +++ b/crates/loopal-runtime/tests/agent_loop/mock_provider.rs @@ -76,6 +76,7 @@ fn make_test_budget() -> ContextBudget { output_reserve: 16_384, safety_margin: 10_000, message_budget: 173_616, + max_output_tokens: 64_000, } } diff --git a/crates/loopal-runtime/tests/agent_loop/mod.rs b/crates/loopal-runtime/tests/agent_loop/mod.rs index 1de78524..6431b17b 100644 --- a/crates/loopal-runtime/tests/agent_loop/mod.rs +++ b/crates/loopal-runtime/tests/agent_loop/mod.rs @@ -31,6 +31,7 @@ pub fn make_test_budget() -> ContextBudget { output_reserve: 16_384, safety_margin: 10_000, message_budget: 173_616, + max_output_tokens: 64_000, } } diff --git a/crates/loopal-runtime/tests/agent_loop/permission_test_ext.rs b/crates/loopal-runtime/tests/agent_loop/permission_test_ext.rs index e5cabdda..981c62e0 100644 --- a/crates/loopal-runtime/tests/agent_loop/permission_test_ext.rs +++ b/crates/loopal-runtime/tests/agent_loop/permission_test_ext.rs @@ -20,6 +20,7 @@ fn make_test_budget() -> ContextBudget { output_reserve: 16_384, safety_margin: 10_000, message_budget: 173_616, + max_output_tokens: 64_000, } } diff --git a/crates/loopal-runtime/tests/agent_loop/turn_completion_test.rs b/crates/loopal-runtime/tests/agent_loop/turn_completion_test.rs index 5cc7a2b9..2552082a 100644 --- a/crates/loopal-runtime/tests/agent_loop/turn_completion_test.rs +++ b/crates/loopal-runtime/tests/agent_loop/turn_completion_test.rs @@ -60,6 +60,7 @@ fn make_test_budget() -> ContextBudget { output_reserve: 16_384, safety_margin: 10_000, message_budget: 173_616, + max_output_tokens: 64_000, } } diff --git a/crates/loopal-tui/tests/suite/e2e_compact_edge_test.rs b/crates/loopal-tui/tests/suite/e2e_compact_edge_test.rs index 6befebfa..2c905dc7 100644 --- a/crates/loopal-tui/tests/suite/e2e_compact_edge_test.rs +++ b/crates/loopal-tui/tests/suite/e2e_compact_edge_test.rs @@ -19,6 +19,7 @@ fn tiny_budget() -> ContextBudget { output_reserve: 50, safety_margin: 25, message_budget: 425, + max_output_tokens: 50, } } diff --git a/crates/loopal-tui/tests/suite/e2e_compact_test.rs b/crates/loopal-tui/tests/suite/e2e_compact_test.rs index 289e3d99..f0bf2c8f 100644 --- a/crates/loopal-tui/tests/suite/e2e_compact_test.rs +++ b/crates/loopal-tui/tests/suite/e2e_compact_test.rs @@ -28,6 +28,7 @@ fn tiny_budget() -> ContextBudget { output_reserve: 50, safety_margin: 25, message_budget: 425, + max_output_tokens: 50, } }