diff --git a/codex-rs/core/src/client.rs b/codex-rs/core/src/client.rs index 8ab56019c60..3029329c2c9 100644 --- a/codex-rs/core/src/client.rs +++ b/codex-rs/core/src/client.rs @@ -63,6 +63,8 @@ pub struct ModelClient { effort: Option, summary: ReasoningSummaryConfig, session_source: SessionSource, + /// Optional model override for specialized use cases (e.g., reflection judge). + model_override: Option, } #[allow(clippy::too_many_arguments)] @@ -88,9 +90,19 @@ impl ModelClient { effort, summary, session_source, + model_override: None, } } + /// Returns a clone of this client with a different model for the API request. + /// This is useful for specialized tasks like reflection judging that may use + /// a different (often cheaper/faster) model than the main agent. + pub fn with_model_override(&self, model: &str) -> Self { + let mut client = self.clone(); + client.model_override = Some(model.to_string()); + client + } + pub fn get_model_context_window(&self) -> Option { let model_family = self.get_model_family(); let effective_context_window_percent = model_family.effective_context_window_percent; @@ -294,9 +306,13 @@ impl ModelClient { self.session_source.clone() } - /// Returns the currently configured model slug. + /// Returns the currently configured model slug, or the override if set. pub fn get_model(&self) -> String { - self.get_model_family().get_model_slug().to_string() + if let Some(ref override_model) = self.model_override { + override_model.clone() + } else { + self.get_model_family().get_model_slug().to_string() + } } /// Returns the currently configured model family. diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 5f0e322dae9..6e3833cdd62 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -20,6 +20,7 @@ use crate::openai_models::model_family::ModelFamily; use crate::openai_models::models_manager::ModelsManager; use crate::parse_command::parse_command; use crate::parse_turn_item; +use crate::reflection::{ReflectionContext, evaluate_reflection}; use crate::stream_events_utils::HandleOutputCtx; use crate::stream_events_utils::handle_non_tool_response_item; use crate::stream_events_utils::handle_output_item_done; @@ -114,6 +115,7 @@ use crate::protocol::TokenCountEvent; use crate::protocol::TokenUsage; use crate::protocol::TokenUsageInfo; use crate::protocol::TurnDiffEvent; +use crate::protocol::ReflectionVerdictEvent; use crate::protocol::WarningEvent; use crate::rollout::RolloutRecorder; use crate::rollout::RolloutRecorderParams; @@ -371,6 +373,7 @@ pub(crate) struct TurnContext { pub(crate) tool_call_gate: Arc, pub(crate) exec_policy: Arc>, pub(crate) truncation_policy: TruncationPolicy, + pub(crate) reflection: crate::config::types::ReflectionConfig, } impl TurnContext { @@ -536,6 +539,7 @@ impl Session { per_turn_config.as_ref(), model_family.truncation_policy, ), + reflection: per_turn_config.reflection.clone(), } } @@ -2087,6 +2091,7 @@ async fn spawn_review_thread( tool_call_gate: Arc::new(ReadinessFlag::new()), exec_policy: parent_turn_context.exec_policy.clone(), truncation_policy: TruncationPolicy::new(&per_turn_config, model_family.truncation_policy), + reflection: per_turn_config.reflection.clone(), }; // Seed the child task with the review prompt as the initial user message. 
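The `model_override` added to `ModelClient` above is what lets the reflection judge introduced in `reflection.rs` (further down in this diff) run on a cheaper model than the main agent. A minimal sketch of the intended call pattern, assuming an already-constructed `client: ModelClient`; the judge model name is illustrative only:

```rust
// Derive a judge client; the original agent client is left untouched.
let judge_client = client.with_model_override("gpt-4o-mini");

// Only the reported model slug changes; everything else is cloned as-is
// from the agent client.
assert_eq!(judge_client.get_model(), "gpt-4o-mini");
```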
@@ -2175,6 +2180,9 @@ pub(crate) async fn run_task( .await; } + // Extract initial task for reflection BEFORE input is consumed + let initial_task = extract_initial_task_from_input(&input); + let initial_input_for_turn: ResponseInputItem = ResponseInputItem::from(input); let response_item: ResponseItem = initial_input_for_turn.clone().into(); sess.record_response_item_and_emit_turn_item(turn_context.as_ref(), response_item) @@ -2192,6 +2200,11 @@ pub(crate) async fn run_task( // many turns, from the perspective of the user, it is a single turn. let turn_diff_tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::new())); + // Track reflection attempts + let mut reflection_attempt: u32 = 0; + let reflection_config = &turn_context.reflection; + let reflection_enabled = reflection_config.enabled || sess.enabled(Feature::Reflection); + loop { // Note that pending_input would be something like a message the user // submitted through the UI while the model was running. Though the UI @@ -2255,7 +2268,95 @@ pub(crate) async fn run_task( } if !needs_follow_up { - last_agent_message = turn_last_agent_message; + last_agent_message = turn_last_agent_message.clone(); + + // Run reflection if enabled and we haven't exceeded max attempts + let max_attempts = reflection_config.max_attempts; + if reflection_enabled && reflection_attempt < max_attempts { + reflection_attempt += 1; + info!( + "Running reflection evaluation (attempt {}/{})", + reflection_attempt, max_attempts + ); + + // Collect conversation items for reflection + let history_items = sess.clone_history().await.get_history_for_prompt(); + let context = ReflectionContext::from_conversation( + initial_task.clone(), + &history_items, + reflection_attempt, + max_attempts, + ); + + // Evaluate with the judge, optionally using a different model + match evaluate_reflection( + &turn_context.client, + context, + reflection_config.model.as_deref(), + ) + .await + { + Ok(verdict) => { + info!( + "Reflection verdict: completed={}, confidence={:.2}", + verdict.completed, verdict.confidence + ); + + // Emit reflection verdict event for visibility + sess.send_event( + &turn_context, + EventMsg::ReflectionVerdict(ReflectionVerdictEvent { + completed: verdict.completed, + confidence: verdict.confidence, + reasoning: verdict.reasoning.clone(), + feedback: verdict.feedback.clone(), + attempt: reflection_attempt, + max_attempts, + }), + ) + .await; + + if !verdict.completed { + if let Some(feedback) = verdict.feedback { + info!("Task incomplete, injecting feedback: {}", feedback); + + // Inject feedback as a new user message + let feedback_msg = format!( + "[Reflection Judge - Attempt {}/{}] Task verification failed.\n\nReasoning: {}\n\nFeedback: {}\n\nPlease address the above feedback and complete the task.", + reflection_attempt, + max_attempts, + verdict.reasoning, + feedback + ); + + let feedback_item = ResponseItem::Message { + id: None, + role: "user".to_string(), + content: vec![ContentItem::InputText { + text: feedback_msg, + }], + }; + + sess.record_conversation_items( + &turn_context, + &[feedback_item], + ) + .await; + + // Continue the loop to process the feedback + continue; + } + } else { + info!("Reflection: Task completed successfully"); + } + } + Err(e) => { + warn!("Reflection evaluation failed: {}", e); + // Continue without blocking on reflection errors + } + } + } + sess.notifier() .notify(&UserNotification::AgentTurnComplete { thread_id: sess.conversation_id.to_string(), @@ -2292,6 +2393,27 @@ pub(crate) async fn run_task( 
last_agent_message } +/// Extract the initial task/prompt from user input. +fn extract_initial_task_from_input(input: &[UserInput]) -> String { + for item in input { + match item { + UserInput::Text { text } => { + return text.clone(); + } + UserInput::Image { .. } | UserInput::LocalImage { .. } => { + // Skip images, look for text + } + UserInput::Skill { name, .. } => { + // Return skill name as task description + return format!("Run skill: {}", name); + } + // Handle future variants of the non-exhaustive enum + _ => {} + } + } + "(No initial task found)".to_string() +} + #[instrument( skip_all, fields( diff --git a/codex-rs/core/src/config/mod.rs b/codex-rs/core/src/config/mod.rs index a7e11f45eb8..1e59f6ffc73 100644 --- a/codex-rs/core/src/config/mod.rs +++ b/codex-rs/core/src/config/mod.rs @@ -7,6 +7,8 @@ use crate::config::types::Notifications; use crate::config::types::OtelConfig; use crate::config::types::OtelConfigToml; use crate::config::types::OtelExporterKind; +use crate::config::types::ReflectionConfig; +use crate::config::types::ReflectionConfigToml; use crate::config::types::SandboxWorkspaceWrite; use crate::config::types::ShellEnvironmentPolicy; use crate::config::types::ShellEnvironmentPolicyToml; @@ -274,6 +276,9 @@ pub struct Config { /// Centralized feature flags; source of truth for feature gating. pub features: Features, + /// Configuration for the reflection/judge feature. + pub reflection: ReflectionConfig, + /// The active profile name used to derive this `Config` (if any). pub active_profile: Option, @@ -666,6 +671,9 @@ pub struct ConfigToml { /// Settings for ghost snapshots (used for undo). #[serde(default)] pub ghost_snapshot: Option, + /// Configuration for the reflection/judge feature. + #[serde(default)] + pub reflection: Option, /// When `true`, checks for Codex updates on startup and surfaces update prompts. /// Set to `false` only if your Codex updates are centrally managed. 
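For reference, the new `reflection` field on `ConfigToml` corresponds to a user-level table like the following, mirroring the fixtures used by the integration tests later in this diff (the judge model is optional and shown only as an example):

```toml
[features]
reflection = true        # experimental flag; [reflection].enabled also works on its own

[reflection]
enabled = true
model = "gpt-4o-mini"    # optional cheaper/faster judge; omit to reuse the agent's model
max_attempts = 3         # default when omitted
```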
@@ -1221,6 +1229,7 @@ impl Config { use_experimental_use_rmcp_client, ghost_snapshot, features, + reflection: cfg.reflection.map(Into::into).unwrap_or_default(), active_profile: active_profile_name, active_project, windows_wsl_setup_acknowledged: cfg.windows_wsl_setup_acknowledged.unwrap_or(false), @@ -2983,6 +2992,7 @@ model_verbosity = "high" use_experimental_use_rmcp_client: false, ghost_snapshot: GhostSnapshotConfig::default(), features: Features::with_defaults(), + reflection: ReflectionConfig::default(), active_profile: Some("o3".to_string()), active_project: ProjectConfig { trust_level: None }, windows_wsl_setup_acknowledged: false, @@ -3058,6 +3068,7 @@ model_verbosity = "high" use_experimental_use_rmcp_client: false, ghost_snapshot: GhostSnapshotConfig::default(), features: Features::with_defaults(), + reflection: ReflectionConfig::default(), active_profile: Some("gpt3".to_string()), active_project: ProjectConfig { trust_level: None }, windows_wsl_setup_acknowledged: false, @@ -3148,6 +3159,7 @@ model_verbosity = "high" use_experimental_use_rmcp_client: false, ghost_snapshot: GhostSnapshotConfig::default(), features: Features::with_defaults(), + reflection: ReflectionConfig::default(), active_profile: Some("zdr".to_string()), active_project: ProjectConfig { trust_level: None }, windows_wsl_setup_acknowledged: false, @@ -3224,6 +3236,7 @@ model_verbosity = "high" use_experimental_use_rmcp_client: false, ghost_snapshot: GhostSnapshotConfig::default(), features: Features::with_defaults(), + reflection: ReflectionConfig::default(), active_profile: Some("gpt5".to_string()), active_project: ProjectConfig { trust_level: None }, windows_wsl_setup_acknowledged: false, diff --git a/codex-rs/core/src/config/types.rs b/codex-rs/core/src/config/types.rs index 9243e9878aa..76f148ba412 100644 --- a/codex-rs/core/src/config/types.rs +++ b/codex-rs/core/src/config/types.rs @@ -531,6 +531,57 @@ impl From for ShellEnvironmentPolicy { } } +/// Configuration for the reflection/judge feature. +/// This controls how task completion is verified by a separate "judge" model. +#[derive(Deserialize, Debug, Clone, PartialEq, Default)] +pub struct ReflectionConfigToml { + /// Enable or disable reflection. When enabled, a judge model evaluates + /// task completion after each turn. Defaults to false. + #[serde(default)] + pub enabled: Option, + + /// Model to use for reflection/judging. If not set, uses the same model + /// as the main agent. Can be set to a cheaper/faster model like "gpt-4o-mini". + pub model: Option, + + /// Maximum number of reflection attempts before giving up. + /// Defaults to 3. + pub max_attempts: Option, +} + +/// Runtime reflection configuration. +#[derive(Debug, Clone, PartialEq)] +pub struct ReflectionConfig { + /// Whether reflection is enabled. + pub enabled: bool, + + /// Model to use for reflection/judging. + pub model: Option, + + /// Maximum number of reflection attempts. 
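+ /// Defaults to 3 when not set in `[reflection]`.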
+ pub max_attempts: u32, +} + +impl Default for ReflectionConfig { + fn default() -> Self { + Self { + enabled: false, + model: None, + max_attempts: 3, + } + } +} + +impl From for ReflectionConfig { + fn from(toml: ReflectionConfigToml) -> Self { + Self { + enabled: toml.enabled.unwrap_or(false), + model: toml.model, + max_attempts: toml.max_attempts.unwrap_or(3), + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -803,4 +854,92 @@ mod tests { "unexpected error: {err}" ); } + + // ==================== ReflectionConfig Tests ==================== + + #[test] + fn test_reflection_config_defaults() { + let config = ReflectionConfig::default(); + assert!(!config.enabled); + assert!(config.model.is_none()); + assert_eq!(config.max_attempts, 3); + } + + #[test] + fn test_reflection_config_toml_defaults() { + let toml_config = ReflectionConfigToml::default(); + assert!(toml_config.enabled.is_none()); + assert!(toml_config.model.is_none()); + assert!(toml_config.max_attempts.is_none()); + } + + #[test] + fn test_reflection_config_from_toml_defaults() { + let toml_config = ReflectionConfigToml::default(); + let config: ReflectionConfig = toml_config.into(); + + assert!(!config.enabled); + assert!(config.model.is_none()); + assert_eq!(config.max_attempts, 3); + } + + #[test] + fn test_reflection_config_from_toml_full() { + let toml_config = ReflectionConfigToml { + enabled: Some(true), + model: Some("gpt-4o-mini".to_string()), + max_attempts: Some(5), + }; + let config: ReflectionConfig = toml_config.into(); + + assert!(config.enabled); + assert_eq!(config.model, Some("gpt-4o-mini".to_string())); + assert_eq!(config.max_attempts, 5); + } + + #[test] + fn test_reflection_config_deserialize_toml() { + let cfg: ReflectionConfigToml = toml::from_str( + r#" + enabled = true + model = "gpt-4o-mini" + max_attempts = 10 + "#, + ) + .expect("should deserialize reflection config"); + + assert_eq!(cfg.enabled, Some(true)); + assert_eq!(cfg.model, Some("gpt-4o-mini".to_string())); + assert_eq!(cfg.max_attempts, Some(10)); + } + + #[test] + fn test_reflection_config_deserialize_toml_partial() { + let cfg: ReflectionConfigToml = toml::from_str( + r#" + enabled = true + "#, + ) + .expect("should deserialize partial reflection config"); + + assert_eq!(cfg.enabled, Some(true)); + assert!(cfg.model.is_none()); + assert!(cfg.max_attempts.is_none()); + + // Convert to runtime config + let config: ReflectionConfig = cfg.into(); + assert!(config.enabled); + assert!(config.model.is_none()); + assert_eq!(config.max_attempts, 3); // default + } + + #[test] + fn test_reflection_config_deserialize_toml_empty() { + let cfg: ReflectionConfigToml = + toml::from_str("").expect("should deserialize empty config"); + + assert!(cfg.enabled.is_none()); + assert!(cfg.model.is_none()); + assert!(cfg.max_attempts.is_none()); + } } diff --git a/codex-rs/core/src/features.rs b/codex-rs/core/src/features.rs index 431eabe3c61..ca45a6f2ebd 100644 --- a/codex-rs/core/src/features.rs +++ b/codex-rs/core/src/features.rs @@ -64,6 +64,8 @@ pub enum Feature { ShellSnapshot, /// Experimental TUI v2 (viewport) implementation. Tui2, + /// Reflection layer that verifies task completion via a judge model. 
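+ /// Off by default (experimental); note that setting `[reflection].enabled`
+ /// in config.toml turns the judge on independently of this flag.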
+ Reflection, } impl Feature { @@ -365,4 +367,10 @@ pub const FEATURES: &[FeatureSpec] = &[ stage: Stage::Experimental, default_enabled: false, }, + FeatureSpec { + id: Feature::Reflection, + key: "reflection", + stage: Stage::Experimental, + default_enabled: false, + }, ]; diff --git a/codex-rs/core/src/lib.rs b/codex-rs/core/src/lib.rs index 11b49c78c41..fb0bab8697d 100644 --- a/codex-rs/core/src/lib.rs +++ b/codex-rs/core/src/lib.rs @@ -92,6 +92,7 @@ pub use rollout::list::Cursor; pub use rollout::list::parse_cursor; pub use rollout::list::read_head_for_summary; mod function_tool; +pub mod reflection; mod state; mod tasks; mod user_notification; diff --git a/codex-rs/core/src/reflection.rs b/codex-rs/core/src/reflection.rs new file mode 100644 index 00000000000..e0b0ea97706 --- /dev/null +++ b/codex-rs/core/src/reflection.rs @@ -0,0 +1,915 @@ +//! Reflection layer for task verification. +//! +//! This module implements a "judge" that verifies if the agent completed a task +//! correctly. After each turn, if reflection is enabled, it: +//! 1. Collects the initial task, recent tool calls, and final result +//! 2. Sends this context to a judge model for evaluation +//! 3. Returns a verdict indicating if the task was completed or needs more work + +use crate::client::ModelClient; +use crate::client_common::Prompt; +use crate::client_common::ResponseEvent; +use crate::error::Result; +use codex_protocol::models::ContentItem; +use codex_protocol::models::ResponseItem; +use futures::StreamExt; +use serde::{Deserialize, Serialize}; +use serde_json::json; +use tracing::{debug, info, warn}; + +/// Maximum number of recent tool calls to include in reflection context. +const MAX_TOOL_CALLS: usize = 10; + +/// JSON Schema for the reflection verdict output. +/// This ensures the judge model returns structured, parseable JSON. +fn verdict_json_schema() -> serde_json::Value { + json!({ + "type": "object", + "properties": { + "completed": { + "type": "boolean", + "description": "Whether the task was completed successfully" + }, + "confidence": { + "type": "number", + "description": "Confidence score from 0.0 to 1.0" + }, + "reasoning": { + "type": "string", + "description": "Brief explanation of the verdict" + }, + "feedback": { + "type": ["string", "null"], + "description": "If not completed: specific instructions for what still needs to be done. If completed: null" + } + }, + "required": ["completed", "confidence", "reasoning", "feedback"], + "additionalProperties": false + }) +} + +/// Heuristically detect if tool output indicates an error. +/// +/// This function uses more sophisticated pattern matching to reduce false positives +/// from phrases like "error handling", "no errors", etc. +fn output_indicates_error(output: &str) -> bool { + let output_lower = output.to_lowercase(); + + // Patterns that strongly indicate an actual error + let error_indicators = [ + "error:", // "Error: something failed" + "failed:", // "Failed: reason" + "failure:", // "Failure: reason" + "exception:", // "Exception: ..." 
+ "panic:", // Rust panic + "traceback", // Python traceback + "fatal error", // Fatal error + "cannot ", // "cannot find", "cannot open" + "could not ", // "could not connect" + "permission denied", + "access denied", + "not found", // "file not found", "command not found" + "no such file", // Unix errors + "does not exist", + "is not recognized", // Windows command errors + "syntax error", + "compilation error", + "build failed", + "test failed", + "assertion failed", + "segmentation fault", + "stack overflow", + "out of memory", + "timeout", + "timed out", + "connection refused", + "connection reset", + "exit code 1", // Non-zero exit codes + "exit status 1", + "exited with", // "exited with code 1" + "returned error", + "threw an error", + ]; + + // Patterns that suggest false positives - output is actually OK + let false_positive_indicators = [ + "no error", + "no errors", + "0 errors", + "without error", + "error handling", + "error handler", + "error message", + "error-free", + "errorfree", + "on error", // "on error do something" + "if error", // "if error then" + "handle error", + "catch error", + "log error", + "print error", + ]; + + // If output contains a false positive indicator, be more conservative + for fp in &false_positive_indicators { + if output_lower.contains(fp) { + // Still check for strong error indicators (these override false positives) + let strong_indicators = [ + "error:", + "failed:", + "failure:", + "exception:", + "panic:", + "traceback", + "fatal error", + "segmentation fault", + "stack overflow", + ]; + for indicator in &strong_indicators { + if output_lower.contains(indicator) { + return true; + } + } + return false; + } + } + + // Check for error indicators + for indicator in &error_indicators { + if output_lower.contains(indicator) { + return true; + } + } + + false +} + +/// Result of a reflection evaluation. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ReflectionVerdict { + /// Whether the task was completed successfully. + pub completed: bool, + /// Confidence level (0.0 to 1.0). + pub confidence: f32, + /// Feedback for the agent if task is incomplete. + pub feedback: Option, + /// Reasoning behind the verdict. + pub reasoning: String, +} + +/// Context collected for reflection evaluation. +#[derive(Debug, Clone)] +pub struct ReflectionContext { + /// The original user task/prompt. + pub initial_task: String, + /// Recent tool calls with their results. + pub tool_calls: Vec, + /// The agent's final message/response. + pub final_response: Option, + /// Current reflection attempt number. + pub attempt: u32, + /// Maximum number of reflection attempts (from config). + pub max_attempts: u32, +} + +/// Summary of a tool call for reflection context. +#[derive(Debug, Clone, Serialize)] +pub struct ToolCallSummary { + pub tool_name: String, + pub arguments: String, + pub result: String, + pub success: bool, +} + +impl ReflectionContext { + /// Create a new reflection context from conversation items. 
+ pub fn from_conversation( + initial_task: String, + items: &[ResponseItem], + attempt: u32, + max_attempts: u32, + ) -> Self { + let mut tool_calls: Vec = Vec::new(); + let mut final_response = None; + + // Iterate forward through items to properly pair FunctionCall with FunctionCallOutput + for item in items.iter() { + match item { + ResponseItem::FunctionCall { + name, + arguments, + call_id: _, + id: _, + } => { + if tool_calls.len() < MAX_TOOL_CALLS { + tool_calls.push(ToolCallSummary { + tool_name: name.clone(), + arguments: truncate_string(arguments, 200), + result: String::new(), + success: true, // Default, will be updated when we see output + }); + } + } + ResponseItem::FunctionCallOutput { call_id: _, output } => { + // Update the last tool call with its result + if let Some(last_call) = tool_calls.last_mut() { + last_call.result = truncate_string(output, 500); + last_call.success = !output_indicates_error(output); + } + } + ResponseItem::Message { role, content, .. } => { + if role == "assistant" { + // Extract text from content - keep updating to get the last response + for c in content { + if let ContentItem::OutputText { text } = c { + final_response = Some(text.clone()); + } + } + } + } + _ => {} + } + } + + Self { + initial_task, + tool_calls, + final_response, + attempt, + max_attempts, + } + } + + /// Build the judge prompt from this context. + pub fn build_judge_prompt(&self) -> String { + let tool_calls_str = if self.tool_calls.is_empty() { + "No tool calls were made.".to_string() + } else { + self.tool_calls + .iter() + .enumerate() + .map(|(i, tc)| { + format!( + "{}. {} ({})\n Args: {}\n Result: {}", + i + 1, + tc.tool_name, + if tc.success { "success" } else { "failed" }, + tc.arguments, + if tc.result.is_empty() { + "(no output)" + } else { + &tc.result + } + ) + }) + .collect::>() + .join("\n\n") + }; + + let final_response_str = self + .final_response + .as_ref() + .map(|r| truncate_string(r, 1000)) + .unwrap_or_else(|| "(No final response)".to_string()); + + format!( + r#"You are a strict task verification judge. Your job is to evaluate whether an AI coding assistant has FULLY completed a user's task. + +## ORIGINAL TASK +{task} + +## TOOL CALLS MADE (last {max_tools}) +{tools} + +## AGENT'S FINAL RESPONSE +{response} + +## REFLECTION ATTEMPT +This is attempt {attempt} of {max_attempts}. + +## YOUR TASK +Evaluate whether the task was completed correctly and fully. Be strict but fair. + +Consider: +1. Did the agent address ALL parts of the user's request? +2. Were the tool calls appropriate and successful? +3. Is the final response accurate and complete? +4. Are there any obvious errors or missing steps? + +Respond with ONLY a JSON object in this exact format: +{{ + "completed": true/false, + "confidence": 0.0-1.0, + "reasoning": "Brief explanation of your verdict", + "feedback": "If not completed: specific instructions for what still needs to be done. If completed: null" +}} + +Be concise. Do not include any text outside the JSON object."#, + task = self.initial_task, + max_tools = MAX_TOOL_CALLS, + tools = tool_calls_str, + response = final_response_str, + attempt = self.attempt, + max_attempts = self.max_attempts, + ) + } +} + +/// Parse the judge's response into a verdict. 
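+///
+/// Example of the intended behavior (mirrors the unit tests below): given
+/// `Here is my evaluation: {"completed": false, "confidence": 0.8,
+/// "reasoning": "Missing tests", "feedback": "Add unit tests"}`, the
+/// surrounding prose is stripped by `extract_json` and the JSON parses into
+/// a `ReflectionVerdict`. When no valid JSON can be recovered, a permissive
+/// default (`completed: true`, `confidence: 0.5`) is returned so a
+/// misbehaving judge never blocks the turn.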
+pub fn parse_verdict(response: &str) -> Result { + // Try to extract JSON from the response + let json_str = extract_json(response); + + match serde_json::from_str::(&json_str) { + Ok(verdict) => Ok(verdict), + Err(e) => { + warn!("Failed to parse judge verdict: {}", e); + // Return a default "completed" verdict if parsing fails. + // This prevents blocking on judge errors - we trust the agent + // and only block when we have clear evidence of incompletion. + Ok(ReflectionVerdict { + completed: true, + confidence: 0.5, + reasoning: format!("Failed to parse judge response: {}", e), + feedback: None, + }) + } + } +} + +/// Extract JSON object from a string that might contain other text. +fn extract_json(s: &str) -> String { + // Find the first { and last } + if let (Some(start), Some(end)) = (s.find('{'), s.rfind('}')) { + if end > start { + return s[start..=end].to_string(); + } + } + s.to_string() +} + +/// Truncate a string to a maximum length, adding ellipsis if needed. +fn truncate_string(s: &str, max_len: usize) -> String { + if s.len() <= max_len { + s.to_string() + } else { + format!("{}...", &s[..max_len.saturating_sub(3)]) + } +} + +/// Run reflection evaluation using a judge model. +/// +/// This function creates a separate API call to evaluate the agent's work. +/// It uses the same model client infrastructure but with a judge-specific prompt. +/// +/// # Arguments +/// * `client` - The model client to use for API calls +/// * `context` - The reflection context containing task and conversation info +/// * `judge_model` - Optional model to use for judging (overrides client's default) +pub async fn evaluate_reflection( + client: &ModelClient, + context: ReflectionContext, + judge_model: Option<&str>, +) -> Result { + info!( + "Running reflection evaluation (attempt {}), judge_model={:?}", + context.attempt, judge_model + ); + + let judge_prompt = context.build_judge_prompt(); + debug!("Judge prompt length: {} chars", judge_prompt.len()); + + // Create a client with the judge model override if specified + let effective_client = match judge_model { + Some(model) => client.with_model_override(model), + None => client.clone(), + }; + + let prompt = Prompt { + input: vec![ResponseItem::Message { + id: None, + role: "user".to_string(), + content: vec![ContentItem::InputText { text: judge_prompt }], + }], + tools: vec![], // Judge doesn't need tools + parallel_tool_calls: false, + base_instructions_override: Some( + "You are a task verification judge. Respond only with JSON.".to_string(), + ), + output_schema: Some(verdict_json_schema()), + }; + + // Stream the response and collect it + let mut stream = effective_client.stream(&prompt).await?; + let mut response_text = String::new(); + + while let Some(event) = stream.next().await { + match event { + Ok(ResponseEvent::OutputTextDelta(delta)) => { + response_text.push_str(&delta); + } + Ok(ResponseEvent::Completed { .. 
}) => { + break; + } + Err(e) => { + warn!("Error during reflection stream: {}", e); + break; + } + _ => {} + } + } + + debug!("Judge response: {}", response_text); + parse_verdict(&response_text) +} + +#[cfg(test)] +mod tests { + use super::*; + + // ==================== Verdict Parsing Tests ==================== + + #[test] + fn test_parse_verdict_valid_pass() { + let response = r#"{"completed": true, "confidence": 0.95, "reasoning": "Task was done", "feedback": null}"#; + let verdict = parse_verdict(response).unwrap(); + assert!(verdict.completed); + assert!(verdict.confidence > 0.9); + assert!(verdict.feedback.is_none()); + assert_eq!(verdict.reasoning, "Task was done"); + } + + #[test] + fn test_parse_verdict_valid_fail() { + let response = r#"{"completed": false, "confidence": 0.7, "reasoning": "Task incomplete", "feedback": "Please add error handling"}"#; + let verdict = parse_verdict(response).unwrap(); + assert!(!verdict.completed); + assert!(verdict.confidence < 0.8); + assert_eq!( + verdict.feedback, + Some("Please add error handling".to_string()) + ); + } + + #[test] + fn test_parse_verdict_with_surrounding_text() { + let response = r#"Here is my evaluation: +{"completed": false, "confidence": 0.8, "reasoning": "Missing tests", "feedback": "Add unit tests"} +That's my verdict."#; + let verdict = parse_verdict(response).unwrap(); + assert!(!verdict.completed); + assert_eq!(verdict.feedback, Some("Add unit tests".to_string())); + } + + #[test] + fn test_parse_verdict_malformed_json_returns_default() { + let response = "This is not valid JSON at all!"; + let verdict = parse_verdict(response).unwrap(); + // Should return default "completed" verdict when parsing fails + // This prevents blocking on judge parse errors + assert!(verdict.completed); + assert_eq!(verdict.confidence, 0.5); + assert!(verdict.feedback.is_none()); + } + + #[test] + fn test_parse_verdict_partial_json() { + let response = r#"{"completed": true"#; // Incomplete JSON + let verdict = parse_verdict(response).unwrap(); + // Should return default "completed" on parse failure + assert!(verdict.completed); + } + + // ==================== String Utilities Tests ==================== + + #[test] + fn test_truncate_string_no_truncation() { + assert_eq!(truncate_string("hello", 10), "hello"); + assert_eq!(truncate_string("", 10), ""); + } + + #[test] + fn test_truncate_string_with_truncation() { + assert_eq!(truncate_string("hello world", 8), "hello..."); + assert_eq!(truncate_string("abcdefghij", 5), "ab..."); + } + + #[test] + fn test_truncate_string_edge_cases() { + assert_eq!(truncate_string("abc", 3), "abc"); // Exact length + assert_eq!(truncate_string("abcd", 3), "..."); // Just under, adds ellipsis + } + + // ==================== Error Detection Tests ==================== + + #[test] + fn test_output_indicates_error_real_errors() { + // These should all be detected as errors + assert!(output_indicates_error("Error: file not found")); + assert!(output_indicates_error("Failed: connection refused")); + assert!(output_indicates_error("permission denied")); + assert!(output_indicates_error("Command 'foo' not found")); + assert!(output_indicates_error("Build failed with 3 errors")); + assert!(output_indicates_error("Test failed: expected 5, got 3")); + assert!(output_indicates_error("Traceback (most recent call last):")); + assert!(output_indicates_error("panic: runtime error")); + assert!(output_indicates_error("Cannot open file")); + assert!(output_indicates_error("exit code 1")); + } + + #[test] + fn 
test_output_indicates_error_false_positives() { + // These should NOT be detected as errors (false positives in naive approach) + assert!(!output_indicates_error("Implemented error handling for edge cases")); + assert!(!output_indicates_error("Added error message display")); + assert!(!output_indicates_error("No errors found in the codebase")); + assert!(!output_indicates_error("The function handles errors gracefully")); + assert!(!output_indicates_error("0 errors, 0 warnings")); + assert!(!output_indicates_error("Build completed without error")); + } + + #[test] + fn test_output_indicates_error_success_messages() { + // These are clearly successful outputs + assert!(!output_indicates_error("File created successfully")); + assert!(!output_indicates_error("All tests passed")); + assert!(!output_indicates_error("Build completed in 2.3s")); + assert!(!output_indicates_error("hello world")); + assert!(!output_indicates_error("")); + } + + #[test] + fn test_output_indicates_error_mixed_with_strong_indicator() { + // Even if false positive phrases exist, strong indicators should trigger + assert!(output_indicates_error("Error: no error handler found")); + assert!(output_indicates_error("0 errors in file, but Failed: compilation error")); + } + + #[test] + fn test_extract_json_simple() { + let s = "Some text {\"key\": \"value\"} more text"; + assert_eq!(extract_json(s), r#"{"key": "value"}"#); + } + + #[test] + fn test_extract_json_nested() { + let s = r#"Result: {"outer": {"inner": "value"}, "arr": [1,2,3]}"#; + assert_eq!( + extract_json(s), + r#"{"outer": {"inner": "value"}, "arr": [1,2,3]}"# + ); + } + + #[test] + fn test_extract_json_no_json() { + let s = "No JSON here"; + assert_eq!(extract_json(s), "No JSON here"); + } + + #[test] + fn test_extract_json_only_json() { + let s = r#"{"completed": true}"#; + assert_eq!(extract_json(s), r#"{"completed": true}"#); + } + + // ==================== ReflectionContext Tests ==================== + + #[test] + fn test_reflection_context_creation() { + let context = ReflectionContext { + initial_task: "Create a hello world program".to_string(), + tool_calls: vec![], + final_response: Some("Done!".to_string()), + attempt: 1, + max_attempts: 3, + }; + + assert_eq!(context.initial_task, "Create a hello world program"); + assert_eq!(context.attempt, 1); + assert!(context.tool_calls.is_empty()); + } + + #[test] + fn test_reflection_context_with_tool_calls() { + let context = ReflectionContext { + initial_task: "Run a shell command".to_string(), + tool_calls: vec![ToolCallSummary { + tool_name: "shell".to_string(), + arguments: r#"{"command": "echo hello"}"#.to_string(), + result: "hello\n".to_string(), + success: true, + }], + final_response: Some("Executed successfully".to_string()), + attempt: 1, + max_attempts: 3, + }; + + assert_eq!(context.tool_calls.len(), 1); + assert_eq!(context.tool_calls[0].tool_name, "shell"); + assert!(context.tool_calls[0].success); + } + + #[test] + fn test_build_judge_prompt_contains_task() { + let context = ReflectionContext { + initial_task: "Create a Python file".to_string(), + tool_calls: vec![], + final_response: Some("I created the file".to_string()), + attempt: 1, + max_attempts: 3, + }; + + let prompt = context.build_judge_prompt(); + assert!(prompt.contains("Create a Python file")); + assert!(prompt.contains("I created the file")); + assert!(prompt.contains("attempt 1")); + } + + #[test] + fn test_build_judge_prompt_with_tool_calls() { + let context = ReflectionContext { + initial_task: "Run tests".to_string(), + 
tool_calls: vec![ToolCallSummary { + tool_name: "shell".to_string(), + arguments: "npm test".to_string(), + result: "All tests passed".to_string(), + success: true, + }], + final_response: None, + attempt: 2, + max_attempts: 3, + }; + + let prompt = context.build_judge_prompt(); + assert!(prompt.contains("shell")); + assert!(prompt.contains("npm test")); + assert!(prompt.contains("All tests passed")); + assert!(prompt.contains("success")); + } + + #[test] + fn test_build_judge_prompt_no_tool_calls() { + let context = ReflectionContext { + initial_task: "Explain something".to_string(), + tool_calls: vec![], + final_response: Some("Here is the explanation".to_string()), + attempt: 1, + max_attempts: 3, + }; + + let prompt = context.build_judge_prompt(); + assert!(prompt.contains("No tool calls were made")); + } + + // ==================== ToolCallSummary Tests ==================== + + #[test] + fn test_tool_call_summary_success() { + let summary = ToolCallSummary { + tool_name: "write_file".to_string(), + arguments: r#"{"path": "/tmp/test.txt", "content": "hello"}"#.to_string(), + result: "File written successfully".to_string(), + success: true, + }; + + assert!(summary.success); + assert_eq!(summary.tool_name, "write_file"); + } + + #[test] + fn test_tool_call_summary_failure() { + let summary = ToolCallSummary { + tool_name: "shell".to_string(), + arguments: "rm -rf /".to_string(), + result: "Permission denied".to_string(), + success: false, + }; + + assert!(!summary.success); + assert!(summary.result.contains("Permission denied")); + } + + // ==================== ReflectionVerdict Tests ==================== + + #[test] + fn test_reflection_verdict_serialize() { + let verdict = ReflectionVerdict { + completed: true, + confidence: 0.95, + feedback: None, + reasoning: "All good".to_string(), + }; + + let json = serde_json::to_string(&verdict).unwrap(); + assert!(json.contains("\"completed\":true")); + assert!(json.contains("\"confidence\":0.95")); + } + + #[test] + fn test_reflection_verdict_deserialize() { + let json = r#"{"completed": false, "confidence": 0.6, "reasoning": "Needs work", "feedback": "Add tests"}"#; + let verdict: ReflectionVerdict = serde_json::from_str(json).unwrap(); + + assert!(!verdict.completed); + assert_eq!(verdict.confidence, 0.6); + assert_eq!(verdict.feedback, Some("Add tests".to_string())); + } + + // ==================== Integration Tests ==================== + // These tests verify the reflection layer works with real ResponseItem data + + #[test] + fn test_from_conversation_with_function_calls() { + use codex_protocol::models::{ContentItem, FunctionCallOutputPayload, ResponseItem}; + + // Simulate a conversation with tool calls + let items = vec![ + // User message + ResponseItem::Message { + id: Some("msg1".to_string()), + role: "user".to_string(), + content: vec![ContentItem::InputText { + text: "Create a file called test.txt".to_string(), + }], + }, + // Assistant calls a tool + ResponseItem::FunctionCall { + id: Some("fc1".to_string()), + name: "write_file".to_string(), + arguments: r#"{"path": "test.txt", "content": "hello"}"#.to_string(), + call_id: "call_123".to_string(), + }, + // Tool output + ResponseItem::FunctionCallOutput { + call_id: "call_123".to_string(), + output: FunctionCallOutputPayload { + content: "File created successfully".to_string(), + ..Default::default() + }, + }, + // Assistant final response + ResponseItem::Message { + id: Some("msg2".to_string()), + role: "assistant".to_string(), + content: vec![ContentItem::OutputText { + text: 
"I've created the file test.txt with the content 'hello'.".to_string(), + }], + }, + ]; + + let context = ReflectionContext::from_conversation( + "Create a file called test.txt".to_string(), + &items, + 1, + 3, + ); + + // Verify context was built correctly + assert_eq!(context.initial_task, "Create a file called test.txt"); + assert_eq!(context.attempt, 1); + assert!(context.final_response.is_some()); + assert!( + context + .final_response + .as_ref() + .unwrap() + .contains("created the file") + ); + + // Verify tool calls were captured + assert_eq!(context.tool_calls.len(), 1); + assert_eq!(context.tool_calls[0].tool_name, "write_file"); + assert!(context.tool_calls[0].result.contains("successfully")); + assert!(context.tool_calls[0].success); // No "error" in output + } + + #[test] + fn test_from_conversation_with_failed_tool_call() { + use codex_protocol::models::{ContentItem, FunctionCallOutputPayload, ResponseItem}; + + let items = vec![ + ResponseItem::FunctionCall { + id: Some("fc1".to_string()), + name: "shell".to_string(), + arguments: r#"{"command": "rm important_file"}"#.to_string(), + call_id: "call_456".to_string(), + }, + ResponseItem::FunctionCallOutput { + call_id: "call_456".to_string(), + output: FunctionCallOutputPayload { + content: "Error: Permission denied".to_string(), + ..Default::default() + }, + }, + ResponseItem::Message { + id: Some("msg1".to_string()), + role: "assistant".to_string(), + content: vec![ContentItem::OutputText { + text: "The command failed with an error.".to_string(), + }], + }, + ]; + + let context = + ReflectionContext::from_conversation("Delete the file".to_string(), &items, 2, 3); + + // Tool call should be marked as failed due to "Error" in output + assert_eq!(context.tool_calls.len(), 1); + assert!(!context.tool_calls[0].success); + assert!(context.tool_calls[0].result.contains("Permission denied")); + } + + #[test] + fn test_from_conversation_empty_items() { + let items: Vec = vec![]; + let context = + ReflectionContext::from_conversation("Do something".to_string(), &items, 1, 3); + + assert_eq!(context.initial_task, "Do something"); + assert!(context.tool_calls.is_empty()); + assert!(context.final_response.is_none()); + } + + #[test] + fn test_reflection_feedback_message_format() { + // Test that the feedback message format is correct + let verdict = ReflectionVerdict { + completed: false, + confidence: 0.7, + reasoning: "The file was created but tests are missing".to_string(), + feedback: Some("Please add unit tests for the new function".to_string()), + }; + + // Simulate the feedback message construction from codex.rs + let feedback_msg = format!( + "[Reflection Judge - Attempt {}/{}] Task verification failed.\n\nReasoning: {}\n\nFeedback: {}\n\nPlease address the above feedback and complete the task.", + 1, + 3, + verdict.reasoning, + verdict.feedback.as_ref().unwrap() + ); + + assert!(feedback_msg.contains("Attempt 1/3")); + assert!(feedback_msg.contains("tests are missing")); + assert!(feedback_msg.contains("add unit tests")); + } + + #[test] + fn test_judge_prompt_includes_all_context() { + use codex_protocol::models::{ContentItem, FunctionCallOutputPayload, ResponseItem}; + + // Build a realistic context + let items = vec![ + ResponseItem::FunctionCall { + id: None, + name: "read_file".to_string(), + arguments: r#"{"path": "src/main.rs"}"#.to_string(), + call_id: "c1".to_string(), + }, + ResponseItem::FunctionCallOutput { + call_id: "c1".to_string(), + output: FunctionCallOutputPayload { + content: "fn main() { 
println!(\"Hello\"); }".to_string(), + ..Default::default() + }, + }, + ResponseItem::FunctionCall { + id: None, + name: "write_file".to_string(), + arguments: r#"{"path": "src/main.rs", "content": "fn main() { println!(\"Hello, World!\"); }"}"#.to_string(), + call_id: "c2".to_string(), + }, + ResponseItem::FunctionCallOutput { + call_id: "c2".to_string(), + output: FunctionCallOutputPayload { + content: "File written".to_string(), + ..Default::default() + }, + }, + ResponseItem::Message { + id: None, + role: "assistant".to_string(), + content: vec![ContentItem::OutputText { + text: "I've updated the greeting message.".to_string(), + }], + }, + ]; + + let context = ReflectionContext::from_conversation( + "Update the greeting to say Hello, World!".to_string(), + &items, + 1, + 3, + ); + + let prompt = context.build_judge_prompt(); + + // Verify the judge prompt contains all necessary information + assert!(prompt.contains("Update the greeting")); + assert!(prompt.contains("read_file")); + assert!(prompt.contains("write_file")); + assert!(prompt.contains("updated the greeting")); + assert!(prompt.contains("attempt 1")); + assert!(prompt.contains(r#""completed": true/false"#)); // JSON schema hint + } +} diff --git a/codex-rs/core/src/rollout/policy.rs b/codex-rs/core/src/rollout/policy.rs index 2980e768cf8..363557677a3 100644 --- a/codex-rs/core/src/rollout/policy.rs +++ b/codex-rs/core/src/rollout/policy.rs @@ -46,7 +46,8 @@ pub(crate) fn should_persist_event_msg(ev: &EventMsg) -> bool { | EventMsg::EnteredReviewMode(_) | EventMsg::ExitedReviewMode(_) | EventMsg::UndoCompleted(_) - | EventMsg::TurnAborted(_) => true, + | EventMsg::TurnAborted(_) + | EventMsg::ReflectionVerdict(_) => true, EventMsg::Error(_) | EventMsg::Warning(_) | EventMsg::TaskStarted(_) diff --git a/codex-rs/core/tests/suite/eval_swe_bench.rs b/codex-rs/core/tests/suite/eval_swe_bench.rs new file mode 100644 index 00000000000..5247da5810a --- /dev/null +++ b/codex-rs/core/tests/suite/eval_swe_bench.rs @@ -0,0 +1,635 @@ +//! SWE-bench style evaluation tests. +//! +//! These tests evaluate the agent's ability to fix real-world style bugs, +//! inspired by SWE-bench benchmark tasks. Each test provides: +//! - A codebase with a bug +//! - An issue description +//! - Test files that verify the fix +//! +//! Run with reflection enabled to compare results: +//! ```sh +//! AZURE_OPENAI_API_KEY= AZURE_OPENAI_BASE_URL= \ +//! cargo test -p codex-core --test all eval_swe -- --ignored --nocapture +//! ``` + +use assert_cmd::prelude::*; +use std::fs; +use std::io::{Read, Write}; +use std::process::{Command, Stdio}; +use std::thread; +use tempfile::TempDir; + +fn require_azure_credentials() -> (String, String, String) { + let api_key = std::env::var("AZURE_OPENAI_API_KEY") + .expect("AZURE_OPENAI_API_KEY env var not set"); + let base_url = std::env::var("AZURE_OPENAI_BASE_URL") + .expect("AZURE_OPENAI_BASE_URL env var not set"); + let model = std::env::var("AZURE_OPENAI_MODEL").unwrap_or_else(|_| "gpt-5-mini".to_string()); + (api_key, base_url, model) +} + +/// Creates config.toml with optional reflection setting. 
+fn create_config(base_url: &str, model: &str, reflection_enabled: bool) -> String { + let base_url = base_url.trim_end_matches('/'); + let base_url = if base_url.ends_with("/openai") { + base_url.to_string() + } else { + format!("{}/openai", base_url) + }; + + let reflection_config = if reflection_enabled { + r#" +[reflection] +enabled = true +max_attempts = 3 + +[features] +reflection = true +"# + } else { + "" + }; + + format!( + r#" +model = "{model}" +model_provider = "azure-openai" +{reflection_config} +[model_providers.azure-openai] +name = "Azure OpenAI" +base_url = "{base_url}" +env_key = "AZURE_OPENAI_API_KEY" +wire_api = "responses" +request_max_retries = 3 +stream_max_retries = 3 +stream_idle_timeout_ms = 120000 + +[model_providers.azure-openai.query_params] +api-version = "2025-04-01-preview" +"# + ) +} + +/// Result of an eval run. +#[derive(Debug)] +struct EvalResult { + success: bool, + reflection_used: bool, + test_output: String, + reflection_verdicts: Vec, +} + +/// Run an eval task and return the result. +fn run_eval_task( + prompt: &str, + setup_files: &[(&str, &str)], + test_command: &str, + reflection_enabled: bool, +) -> EvalResult { + #![expect(clippy::unwrap_used)] + + let (api_key, base_url, model) = require_azure_credentials(); + + let dir = TempDir::new().unwrap(); + let work_dir = dir.path(); + + // Create setup files (buggy codebase) + for (path, content) in setup_files { + let file_path = work_dir.join(path); + if let Some(parent) = file_path.parent() { + fs::create_dir_all(parent).unwrap(); + } + fs::write(&file_path, content).unwrap(); + } + + // Create .codex directory with config + let codex_home = work_dir.join(".codex"); + fs::create_dir_all(&codex_home).unwrap(); + fs::write( + codex_home.join("config.toml"), + create_config(&base_url, &model, reflection_enabled), + ) + .unwrap(); + + // Run codex + let mut cmd = Command::cargo_bin("codex").unwrap(); + cmd.current_dir(work_dir); + cmd.env("AZURE_OPENAI_API_KEY", api_key); + cmd.env("CODEX_HOME", &codex_home); + cmd.env("RUST_LOG", "codex_core=info"); + + cmd.arg("exec") + .arg("--full-auto") + .arg("--skip-git-repo-check") + .arg("--color") + .arg("never") + .arg(prompt); + + cmd.stdin(Stdio::null()); + cmd.stdout(Stdio::piped()); + cmd.stderr(Stdio::piped()); + + let mut child = cmd.spawn().expect("failed to spawn codex"); + + // Tee helper + fn tee( + mut reader: R, + mut writer: impl Write + Send + 'static, + ) -> thread::JoinHandle> { + thread::spawn(move || { + let mut buf = Vec::new(); + let mut chunk = [0u8; 4096]; + loop { + match reader.read(&mut chunk) { + Ok(0) => break, + Ok(n) => { + writer.write_all(&chunk[..n]).ok(); + writer.flush().ok(); + buf.extend_from_slice(&chunk[..n]); + } + Err(_) => break, + } + } + buf + }) + } + + let stdout_handle = tee( + child.stdout.take().expect("child stdout"), + std::io::stdout(), + ); + let stderr_handle = tee( + child.stderr.take().expect("child stderr"), + std::io::stderr(), + ); + + let status = child.wait().expect("failed to wait on child"); + let stdout = stdout_handle.join().expect("stdout thread panicked"); + let stderr = stderr_handle.join().expect("stderr thread panicked"); + + let combined = format!( + "{}{}", + String::from_utf8_lossy(&stdout), + String::from_utf8_lossy(&stderr) + ); + + // Extract reflection verdicts + let reflection_verdicts: Vec = combined + .lines() + .filter(|line| line.contains("Reflection verdict")) + .map(|s| s.to_string()) + .collect(); + + // Run the verification test + let test_result = Command::new("bash") + 
.arg("-c") + .arg(test_command) + .current_dir(work_dir) + .output() + .expect("failed to run test"); + + let test_output = format!( + "stdout: {}\nstderr: {}", + String::from_utf8_lossy(&test_result.stdout), + String::from_utf8_lossy(&test_result.stderr) + ); + + EvalResult { + success: status.success() && test_result.status.success(), + reflection_used: !reflection_verdicts.is_empty(), + test_output, + reflection_verdicts, + } +} + +// ============================================================================ +// EVAL TASK 1: Off-by-one error in array processing +// ============================================================================ + +const TASK1_BUG_FILE: &str = r#" +def sum_first_n(arr, n): + """Return sum of first n elements of arr.""" + total = 0 + for i in range(n + 1): # BUG: should be range(n) + total += arr[i] + return total + +def get_last_element(arr): + """Return the last element of arr.""" + return arr[len(arr)] # BUG: should be arr[len(arr) - 1] or arr[-1] +"#; + +const TASK1_TEST_FILE: &str = r#" +import pytest +from array_utils import sum_first_n, get_last_element + +def test_sum_first_n(): + arr = [1, 2, 3, 4, 5] + assert sum_first_n(arr, 3) == 6 # 1 + 2 + 3 + assert sum_first_n(arr, 1) == 1 + assert sum_first_n(arr, 5) == 15 + +def test_get_last_element(): + assert get_last_element([1, 2, 3]) == 3 + assert get_last_element([10]) == 10 + assert get_last_element(['a', 'b', 'c']) == 'c' +"#; + +const TASK1_ISSUE: &str = r#"Fix the off-by-one errors in array_utils.py + +The file array_utils.py has two functions with index errors: + +1. `sum_first_n(arr, n)` - Should return sum of first n elements, but it's including one extra element +2. `get_last_element(arr)` - Should return the last element, but it's causing an IndexError + +Please fix these bugs so that test_array_utils.py passes. 
+ +After fixing, run: pytest test_array_utils.py -v +"#; + +#[ignore] +#[test] +fn eval_task1_offbyone_with_reflection() { + if std::env::var("AZURE_OPENAI_API_KEY").is_err() { + eprintln!("skipping eval — Azure credentials not set"); + return; + } + + let result = run_eval_task( + TASK1_ISSUE, + &[ + ("array_utils.py", TASK1_BUG_FILE), + ("test_array_utils.py", TASK1_TEST_FILE), + ], + "pytest test_array_utils.py -v", + true, // reflection enabled + ); + + println!("\n=== EVAL TASK 1 (with reflection) ==="); + println!("Success: {}", result.success); + println!("Reflection used: {}", result.reflection_used); + println!("Reflection verdicts: {:?}", result.reflection_verdicts); + println!("Test output:\n{}", result.test_output); + + assert!(result.success, "Task 1 failed with reflection enabled"); +} + +#[ignore] +#[test] +fn eval_task1_offbyone_without_reflection() { + if std::env::var("AZURE_OPENAI_API_KEY").is_err() { + eprintln!("skipping eval — Azure credentials not set"); + return; + } + + let result = run_eval_task( + TASK1_ISSUE, + &[ + ("array_utils.py", TASK1_BUG_FILE), + ("test_array_utils.py", TASK1_TEST_FILE), + ], + "pytest test_array_utils.py -v", + false, // reflection disabled + ); + + println!("\n=== EVAL TASK 1 (without reflection) ==="); + println!("Success: {}", result.success); + println!("Reflection used: {}", result.reflection_used); + println!("Test output:\n{}", result.test_output); + + // Don't assert - just report for comparison + println!( + "Result: {}", + if result.success { "PASS" } else { "FAIL" } + ); +} + +// ============================================================================ +// EVAL TASK 2: Logic error in string processing +// ============================================================================ + +const TASK2_BUG_FILE: &str = r#" +def is_palindrome(s): + """Check if string is a palindrome (case-insensitive, ignoring spaces).""" + # BUG: doesn't handle case or spaces + return s == s[::-1] + +def count_words(text): + """Count number of words in text.""" + # BUG: doesn't handle multiple spaces or leading/trailing spaces + return len(text.split(' ')) +"#; + +const TASK2_TEST_FILE: &str = r#" +import pytest +from string_utils import is_palindrome, count_words + +def test_is_palindrome(): + assert is_palindrome("racecar") == True + assert is_palindrome("A man a plan a canal Panama") == True + assert is_palindrome("Race Car") == True + assert is_palindrome("hello") == False + +def test_count_words(): + assert count_words("hello world") == 2 + assert count_words(" hello world ") == 2 + assert count_words("one") == 1 + assert count_words("a b c d e") == 5 +"#; + +const TASK2_ISSUE: &str = r#"Fix logic errors in string_utils.py + +The string_utils.py file has two functions with logic bugs: + +1. `is_palindrome(s)` - Should check if string is palindrome, ignoring case and spaces + Currently fails on "Race Car" and "A man a plan a canal Panama" + +2. `count_words(text)` - Should count words, handling multiple spaces correctly + Currently gives wrong count for " hello world " + +Fix these bugs so that test_string_utils.py passes. 
+ +After fixing, run: pytest test_string_utils.py -v +"#; + +#[ignore] +#[test] +fn eval_task2_string_logic_with_reflection() { + if std::env::var("AZURE_OPENAI_API_KEY").is_err() { + eprintln!("skipping eval — Azure credentials not set"); + return; + } + + let result = run_eval_task( + TASK2_ISSUE, + &[ + ("string_utils.py", TASK2_BUG_FILE), + ("test_string_utils.py", TASK2_TEST_FILE), + ], + "pytest test_string_utils.py -v", + true, + ); + + println!("\n=== EVAL TASK 2 (with reflection) ==="); + println!("Success: {}", result.success); + println!("Reflection used: {}", result.reflection_used); + println!("Reflection verdicts: {:?}", result.reflection_verdicts); + println!("Test output:\n{}", result.test_output); + + assert!(result.success, "Task 2 failed with reflection enabled"); +} + +#[ignore] +#[test] +fn eval_task2_string_logic_without_reflection() { + if std::env::var("AZURE_OPENAI_API_KEY").is_err() { + eprintln!("skipping eval — Azure credentials not set"); + return; + } + + let result = run_eval_task( + TASK2_ISSUE, + &[ + ("string_utils.py", TASK2_BUG_FILE), + ("test_string_utils.py", TASK2_TEST_FILE), + ], + "pytest test_string_utils.py -v", + false, + ); + + println!("\n=== EVAL TASK 2 (without reflection) ==="); + println!("Success: {}", result.success); + println!("Reflection used: {}", result.reflection_used); + println!("Test output:\n{}", result.test_output); + + println!( + "Result: {}", + if result.success { "PASS" } else { "FAIL" } + ); +} + +// ============================================================================ +// EVAL TASK 3: Missing edge case handling +// ============================================================================ + +const TASK3_BUG_FILE: &str = r#" +def safe_divide(a, b): + """Safely divide a by b, return None on error.""" + # BUG: doesn't handle division by zero + return a / b + +def find_max(numbers): + """Find maximum value in list.""" + # BUG: doesn't handle empty list + max_val = numbers[0] + for n in numbers[1:]: + if n > max_val: + max_val = n + return max_val + +def get_element_at(arr, index): + """Get element at index, return None if out of bounds.""" + # BUG: doesn't handle negative indices or out of bounds + return arr[index] +"#; + +const TASK3_TEST_FILE: &str = r#" +import pytest +from math_utils import safe_divide, find_max, get_element_at + +def test_safe_divide(): + assert safe_divide(10, 2) == 5.0 + assert safe_divide(0, 5) == 0.0 + assert safe_divide(10, 0) is None # Division by zero + assert safe_divide(10, 3) == pytest.approx(3.333, rel=0.01) + +def test_find_max(): + assert find_max([1, 5, 3, 9, 2]) == 9 + assert find_max([42]) == 42 + assert find_max([-5, -1, -10]) == -1 + assert find_max([]) is None # Empty list + +def test_get_element_at(): + arr = [10, 20, 30] + assert get_element_at(arr, 0) == 10 + assert get_element_at(arr, 2) == 30 + assert get_element_at(arr, 5) is None # Out of bounds + assert get_element_at(arr, -1) is None # Negative index + assert get_element_at([], 0) is None # Empty array +"#; + +const TASK3_ISSUE: &str = r#"Fix missing edge case handling in math_utils.py + +The math_utils.py has three functions that don't handle edge cases properly: + +1. `safe_divide(a, b)` - Should return None when dividing by zero, but crashes instead + +2. `find_max(numbers)` - Should return None for empty list, but crashes with IndexError + +3. 
`get_element_at(arr, index)` - Should return None for:
+   - Out of bounds indices
+   - Negative indices
+   - Empty arrays
+   Currently crashes instead
+
+Fix these edge cases so that test_math_utils.py passes.
+
+After fixing, run: pytest test_math_utils.py -v
+"#;
+
+#[ignore]
+#[test]
+fn eval_task3_edge_cases_with_reflection() {
+    if std::env::var("AZURE_OPENAI_API_KEY").is_err() {
+        eprintln!("skipping eval — Azure credentials not set");
+        return;
+    }
+
+    let result = run_eval_task(
+        TASK3_ISSUE,
+        &[
+            ("math_utils.py", TASK3_BUG_FILE),
+            ("test_math_utils.py", TASK3_TEST_FILE),
+        ],
+        "pytest test_math_utils.py -v",
+        true,
+    );
+
+    println!("\n=== EVAL TASK 3 (with reflection) ===");
+    println!("Success: {}", result.success);
+    println!("Reflection used: {}", result.reflection_used);
+    println!("Reflection verdicts: {:?}", result.reflection_verdicts);
+    println!("Test output:\n{}", result.test_output);
+
+    assert!(result.success, "Task 3 failed with reflection enabled");
+}
+
+#[ignore]
+#[test]
+fn eval_task3_edge_cases_without_reflection() {
+    if std::env::var("AZURE_OPENAI_API_KEY").is_err() {
+        eprintln!("skipping eval — Azure credentials not set");
+        return;
+    }
+
+    let result = run_eval_task(
+        TASK3_ISSUE,
+        &[
+            ("math_utils.py", TASK3_BUG_FILE),
+            ("test_math_utils.py", TASK3_TEST_FILE),
+        ],
+        "pytest test_math_utils.py -v",
+        false,
+    );
+
+    println!("\n=== EVAL TASK 3 (without reflection) ===");
+    println!("Success: {}", result.success);
+    println!("Reflection used: {}", result.reflection_used);
+    println!("Test output:\n{}", result.test_output);
+
+    println!(
+        "Result: {}",
+        if result.success { "PASS" } else { "FAIL" }
+    );
+}
+
+/// Run all eval tasks and summarize results.
+#[ignore]
+#[test]
+fn eval_summary() {
+    if std::env::var("AZURE_OPENAI_API_KEY").is_err() {
+        eprintln!("skipping eval — Azure credentials not set");
+        return;
+    }
+
+    println!("\n========================================");
+    println!("SWE-BENCH STYLE EVALUATION SUMMARY");
+    println!("========================================\n");
+
+    let tasks = [
+        (
+            "Task 1: Off-by-one errors",
+            TASK1_ISSUE,
+            vec![
+                ("array_utils.py", TASK1_BUG_FILE),
+                ("test_array_utils.py", TASK1_TEST_FILE),
+            ],
+            "pytest test_array_utils.py -v",
+        ),
+        (
+            "Task 2: String logic errors",
+            TASK2_ISSUE,
+            vec![
+                ("string_utils.py", TASK2_BUG_FILE),
+                ("test_string_utils.py", TASK2_TEST_FILE),
+            ],
+            "pytest test_string_utils.py -v",
+        ),
+        (
+            "Task 3: Missing edge cases",
+            TASK3_ISSUE,
+            vec![
+                ("math_utils.py", TASK3_BUG_FILE),
+                ("test_math_utils.py", TASK3_TEST_FILE),
+            ],
+            "pytest test_math_utils.py -v",
+        ),
+    ];
+
+    let mut with_reflection_pass = 0;
+    let mut without_reflection_pass = 0;
+
+    for (name, issue, files, test_cmd) in tasks.iter() {
+        println!("--- {} ---", name);
+
+        // With reflection
+        let result_with = run_eval_task(
+            issue,
+            &files.iter().map(|(a, b)| (*a, *b)).collect::<Vec<_>>(),
+            test_cmd,
+            true,
+        );
+        if result_with.success {
+            with_reflection_pass += 1;
+        }
+        println!(
+            " With reflection: {} (verdicts: {})",
+            if result_with.success { "PASS" } else { "FAIL" },
+            result_with.reflection_verdicts.len()
+        );
+
+        // Without reflection
+        let result_without = run_eval_task(
+            issue,
+            &files.iter().map(|(a, b)| (*a, *b)).collect::<Vec<_>>(),
+            test_cmd,
+            false,
+        );
+        if result_without.success {
+            without_reflection_pass += 1;
+        }
+        println!(
+            " Without reflection: {}",
+            if result_without.success { "PASS" } else { "FAIL" }
+        );
+        println!();
+    }
+
+    println!("========================================");
+    println!("RESULTS");
+    println!("========================================");
+    println!(
+        "With reflection: {}/{} tasks passed",
+        with_reflection_pass,
+        tasks.len()
+    );
+    println!(
+        "Without reflection: {}/{} tasks passed",
+        without_reflection_pass,
+        tasks.len()
+    );
+    println!(
+        "Improvement: {:+} tasks",
+        with_reflection_pass as i32 - without_reflection_pass as i32
+    );
+}
diff --git a/codex-rs/core/tests/suite/mod.rs b/codex-rs/core/tests/suite/mod.rs
index e047899d722..b3dddb9d622 100644
--- a/codex-rs/core/tests/suite/mod.rs
+++ b/codex-rs/core/tests/suite/mod.rs
@@ -26,6 +26,7 @@ mod compact;
 mod compact_remote;
 mod compact_resume_fork;
 mod deprecation_notice;
+mod eval_swe_bench;
 mod exec;
 mod exec_policy;
 mod fork_conversation;
@@ -41,6 +42,7 @@ mod otel;
 mod prompt_caching;
 mod quota_exceeded;
 mod read_file;
+mod reflection;
 mod remote_models;
 mod resume;
 mod review;
diff --git a/codex-rs/core/tests/suite/reflection.rs b/codex-rs/core/tests/suite/reflection.rs
new file mode 100644
index 00000000000..aa7051a39c0
--- /dev/null
+++ b/codex-rs/core/tests/suite/reflection.rs
@@ -0,0 +1,264 @@
+//! Integration tests for the reflection layer.
+//!
+//! These tests verify that the reflection layer works correctly with real API calls.
+//! They are `#[ignore]` by default so CI stays deterministic. Run them locally with:
+//!
+//! ```sh
+//! # For Azure OpenAI:
+//! AZURE_OPENAI_API_KEY=<key> AZURE_OPENAI_BASE_URL=<endpoint> \
+//! cargo test -p codex-core --test all reflection -- --ignored --nocapture
+//!
+//! # Optionally specify a model (defaults to gpt-5-mini):
+//! AZURE_OPENAI_API_KEY=<key> AZURE_OPENAI_BASE_URL=<endpoint> AZURE_OPENAI_MODEL=gpt-4o \
+//! cargo test -p codex-core --test all reflection -- --ignored --nocapture
+//! ```
+
+use assert_cmd::prelude::*;
+use std::io::{Read, Write};
+use std::process::{Command, Stdio};
+use std::thread;
+use tempfile::TempDir;
+
+fn require_azure_credentials() -> (String, String, String) {
+    let api_key = std::env::var("AZURE_OPENAI_API_KEY")
+        .expect("AZURE_OPENAI_API_KEY env var not set — skip running Azure tests");
+    let base_url = std::env::var("AZURE_OPENAI_BASE_URL")
+        .expect("AZURE_OPENAI_BASE_URL env var not set — skip running Azure tests");
+    let model = std::env::var("AZURE_OPENAI_MODEL").unwrap_or_else(|_| "gpt-5-mini".to_string());
+    (api_key, base_url, model)
+}
+
+/// Creates a config.toml for Azure OpenAI with reflection enabled.
+fn create_azure_config(base_url: &str, model: &str) -> String {
+    // Ensure base_url ends with /openai
+    let base_url = base_url.trim_end_matches('/');
+    let base_url = if base_url.ends_with("/openai") {
+        base_url.to_string()
+    } else {
+        format!("{}/openai", base_url)
+    };
+
+    format!(
+        r#"
+model = "{model}"
+model_provider = "azure-openai"
+
+[reflection]
+enabled = true
+max_attempts = 3
+
+[features]
+reflection = true
+
+[model_providers.azure-openai]
+name = "Azure OpenAI"
+base_url = "{base_url}"
+env_key = "AZURE_OPENAI_API_KEY"
+wire_api = "responses"
+request_max_retries = 3
+stream_max_retries = 3
+stream_idle_timeout_ms = 120000
+
+[model_providers.azure-openai.query_params]
+api-version = "2025-04-01-preview"
+"#
+    )
+}
+
+/// Helper that spawns codex exec with Azure OpenAI config and reflection enabled.
+/// Returns (Assert, TempDir, stdout, stderr).
+fn run_azure_reflection_test(prompt: &str) -> (assert_cmd::assert::Assert, TempDir, Vec<u8>, Vec<u8>) {
+    #![expect(clippy::unwrap_used)]
+
+    let (api_key, base_url, model) = require_azure_credentials();
+
+    let dir = TempDir::new().unwrap();
+
+    // Create .codex directory with config
+    let codex_home = dir.path().join(".codex");
+    std::fs::create_dir_all(&codex_home).unwrap();
+    std::fs::write(codex_home.join("config.toml"), create_azure_config(&base_url, &model)).unwrap();
+
+    let mut cmd = Command::cargo_bin("codex").unwrap();
+    cmd.current_dir(dir.path());
+    cmd.env("AZURE_OPENAI_API_KEY", api_key);
+    cmd.env("CODEX_HOME", &codex_home);
+    cmd.env("RUST_LOG", "codex_core=info,reflection=debug");
+
+    cmd.arg("exec")
+        .arg("--full-auto")
+        .arg("--skip-git-repo-check")
+        .arg("--color")
+        .arg("never")
+        .arg(prompt);
+
+    cmd.stdin(Stdio::null());
+    cmd.stdout(Stdio::piped());
+    cmd.stderr(Stdio::piped());
+
+    let mut child = cmd.spawn().expect("failed to spawn codex");
+
+    // Tee helper - copies stream to both parent stdio and buffer
+    fn tee<R: Read + Send + 'static>(
+        mut reader: R,
+        mut writer: impl Write + Send + 'static,
+    ) -> thread::JoinHandle<Vec<u8>> {
+        thread::spawn(move || {
+            let mut buf = Vec::new();
+            let mut chunk = [0u8; 4096];
+            loop {
+                match reader.read(&mut chunk) {
+                    Ok(0) => break,
+                    Ok(n) => {
+                        writer.write_all(&chunk[..n]).ok();
+                        writer.flush().ok();
+                        buf.extend_from_slice(&chunk[..n]);
+                    }
+                    Err(_) => break,
+                }
+            }
+            buf
+        })
+    }
+
+    let stdout_handle = tee(
+        child.stdout.take().expect("child stdout"),
+        std::io::stdout(),
+    );
+    let stderr_handle = tee(
+        child.stderr.take().expect("child stderr"),
+        std::io::stderr(),
+    );
+
+    let status = child.wait().expect("failed to wait on child");
+    let stdout = stdout_handle.join().expect("stdout thread panicked");
+    let stderr = stderr_handle.join().expect("stderr thread panicked");
+
+    let output = std::process::Output {
+        status,
+        stdout: stdout.clone(),
+        stderr: stderr.clone(),
+    };
+
+    (output.assert(), dir, stdout, stderr)
+}
+
+/// Integration test for reflection layer with Azure OpenAI.
+///
+/// This test:
+/// 1. Connects to Azure OpenAI using AZURE_OPENAI_API_KEY and AZURE_OPENAI_BASE_URL
+/// 2. Uses gpt-5-mini model with reflection enabled
+/// 3. Asks codex to create a hello world Python app and test it
+/// 4. Verifies the reflection layer was invoked
+#[ignore]
+#[test]
+fn reflection_layer_hello_world_with_azure_openai() {
+    if std::env::var("AZURE_OPENAI_API_KEY").is_err()
+        || std::env::var("AZURE_OPENAI_BASE_URL").is_err()
+    {
+        eprintln!(
+            "skipping reflection test — AZURE_OPENAI_API_KEY or AZURE_OPENAI_BASE_URL not set"
+        );
+        return;
+    }
+
+    let prompt = r#"Write a hello world Python application that prints exactly "hello, world" (with a lowercase h and trailing newline). Name the file hello.py.
+
+Then create a test file called test_hello.py that:
+1. Runs hello.py using subprocess
+2. Captures its output
+3. 
Asserts that the output is exactly "hello, world\n" + +After creating both files, run the test to verify it passes."#; + + let (assert, dir, stdout, stderr) = run_azure_reflection_test(prompt); + + // Test should succeed + assert.success(); + + // Verify hello.py was created + let hello_path = dir.path().join("hello.py"); + assert!( + hello_path.exists(), + "hello.py was not created by the model" + ); + + // Verify test_hello.py was created + let test_path = dir.path().join("test_hello.py"); + assert!( + test_path.exists(), + "test_hello.py was not created by the model" + ); + + // Check that reflection layer was invoked by looking at both stdout and stderr (logs) + let stdout_str = String::from_utf8_lossy(&stdout); + let stderr_str = String::from_utf8_lossy(&stderr); + let combined_output = format!("{}{}", stdout_str, stderr_str); + + let reflection_invoked = combined_output.contains("Running reflection evaluation") + || combined_output.contains("Reflection verdict") + || combined_output.contains("Reflection:"); + + println!("\n=== Reflection Layer Status ==="); + if reflection_invoked { + println!("Reflection layer was invoked during task execution"); + + // Extract and print reflection verdict if present + for line in combined_output.lines() { + if line.contains("Reflection verdict") || line.contains("Reflection:") { + println!(" {}", line); + } + } + } else { + println!("WARNING: No explicit reflection activity detected in output"); + println!("This may indicate the reflection feature was not properly enabled"); + } + + // Verify hello.py content is correct + let hello_content = std::fs::read_to_string(&hello_path).unwrap(); + println!("\n=== hello.py ===\n{}", hello_content); + + // Verify test file content + let test_content = std::fs::read_to_string(&test_path).unwrap(); + println!("\n=== test_hello.py ===\n{}", test_content); + + // The reflection layer should have been invoked + assert!( + reflection_invoked, + "Reflection layer was not invoked - check that reflection feature is enabled" + ); +} + +/// Simple test to verify reflection config is correctly parsed. 
+#[ignore] +#[test] +fn reflection_config_azure_openai() { + if std::env::var("AZURE_OPENAI_API_KEY").is_err() + || std::env::var("AZURE_OPENAI_BASE_URL").is_err() + { + eprintln!("skipping config test — Azure credentials not set"); + return; + } + + // Simple prompt that should complete quickly + let prompt = "Print 'hello' using echo"; + + let (assert, _dir, stdout, stderr) = run_azure_reflection_test(prompt); + + assert.success(); + + // Verify reflection was at least attempted (logs go to stderr) + let stdout_str = String::from_utf8_lossy(&stdout); + let stderr_str = String::from_utf8_lossy(&stderr); + let combined_output = format!("{}{}", stdout_str, stderr_str); + println!("Output:\n{}", combined_output); + + // Should see reflection-related log messages + let has_reflection_logs = combined_output.contains("reflection") + || combined_output.contains("Reflection"); + + println!( + "\nReflection activity detected: {}", + has_reflection_logs + ); +} diff --git a/codex-rs/core/tests/suite/view_image.rs b/codex-rs/core/tests/suite/view_image.rs index 6c0f6dcc80c..0456d6de994 100644 --- a/codex-rs/core/tests/suite/view_image.rs +++ b/codex-rs/core/tests/suite/view_image.rs @@ -15,6 +15,8 @@ use core_test_support::responses::ev_function_call; use core_test_support::responses::ev_response_created; use core_test_support::responses::sse; use core_test_support::responses::start_mock_server; +use wiremock::ResponseTemplate; +use wiremock::matchers::body_string_contains; use core_test_support::skip_if_no_network; use core_test_support::test_codex::TestCodex; use core_test_support::test_codex::test_codex; diff --git a/codex-rs/exec/src/event_processor_with_human_output.rs b/codex-rs/exec/src/event_processor_with_human_output.rs index a833426dc22..032b9bf8a9c 100644 --- a/codex-rs/exec/src/event_processor_with_human_output.rs +++ b/codex-rs/exec/src/event_processor_with_human_output.rs @@ -22,6 +22,7 @@ use codex_core::protocol::TaskCompleteEvent; use codex_core::protocol::TurnAbortReason; use codex_core::protocol::TurnDiffEvent; use codex_core::protocol::WarningEvent; +use codex_core::protocol::ReflectionVerdictEvent; use codex_core::protocol::WebSearchEndEvent; use codex_protocol::num_format::format_with_separators; use owo_colors::OwoColorize; @@ -173,6 +174,34 @@ impl EventProcessor for EventProcessorWithHumanOutput { "warning:".style(self.yellow).style(self.bold) ); } + EventMsg::ReflectionVerdict(ev) => { + if ev.completed { + ts_msg!( + self, + "{} Task completed (confidence: {:.0}%)", + "✓ reflection:".style(self.green).style(self.bold), + ev.confidence * 100.0 + ); + ts_msg!(self, " {}", ev.reasoning.style(self.dimmed)); + } else { + ts_msg!( + self, + "{} Task incomplete - attempt {}/{} (confidence: {:.0}%)", + "⟳ reflection:".style(self.yellow).style(self.bold), + ev.attempt, + ev.max_attempts, + ev.confidence * 100.0 + ); + ts_msg!(self, " Reasoning: {}", ev.reasoning.style(self.dimmed)); + if let Some(feedback) = &ev.feedback { + ts_msg!( + self, + " Feedback: {}", + feedback.style(self.yellow) + ); + } + } + } EventMsg::DeprecationNotice(DeprecationNoticeEvent { summary, details }) => { ts_msg!( self, diff --git a/codex-rs/mcp-server/src/codex_tool_runner.rs b/codex-rs/mcp-server/src/codex_tool_runner.rs index 73e56a605ce..a0ad30c028a 100644 --- a/codex-rs/mcp-server/src/codex_tool_runner.rs +++ b/codex-rs/mcp-server/src/codex_tool_runner.rs @@ -308,7 +308,8 @@ async fn run_codex_tool_session_inner( | EventMsg::UndoCompleted(_) | EventMsg::ExitedReviewMode(_) | 
EventMsg::ContextCompacted(_)
-            | EventMsg::DeprecationNotice(_) => {
+            | EventMsg::DeprecationNotice(_)
+            | EventMsg::ReflectionVerdict(_) => {
                 // For now, we do not do anything extra for these
                 // events. Note that
                 // send(codex_event_to_notification(&event)) above has
diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs
index 201aba07efe..a2b5140f137 100644
--- a/codex-rs/protocol/src/protocol.rs
+++ b/codex-rs/protocol/src/protocol.rs
@@ -630,6 +630,9 @@ pub enum EventMsg {
     AgentMessageContentDelta(AgentMessageContentDeltaEvent),
     ReasoningContentDelta(ReasoningContentDeltaEvent),
     ReasoningRawContentDelta(ReasoningRawContentDeltaEvent),
+
+    /// Reflection layer verdict after evaluating task completion.
+    ReflectionVerdict(ReflectionVerdictEvent),
 }
 
 /// Codex errors that we expose to clients.
@@ -794,6 +797,24 @@ pub struct WarningEvent {
     pub message: String,
 }
 
+/// Event emitted when the reflection layer evaluates task completion.
+#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)]
+pub struct ReflectionVerdictEvent {
+    /// Whether the task was completed successfully.
+    pub completed: bool,
+    /// Confidence score from 0.0 to 1.0.
+    pub confidence: f32,
+    /// The judge's reasoning for the verdict.
+    pub reasoning: String,
+    /// Feedback for incomplete tasks (None if completed).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub feedback: Option<String>,
+    /// Current reflection attempt number.
+    pub attempt: u32,
+    /// Maximum allowed attempts.
+    pub max_attempts: u32,
+}
+
 #[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)]
 pub struct ContextCompactedEvent;
 
diff --git a/codex-rs/tui/src/chatwidget.rs b/codex-rs/tui/src/chatwidget.rs
index 37cd004a159..9d8393f0696 100644
--- a/codex-rs/tui/src/chatwidget.rs
+++ b/codex-rs/tui/src/chatwidget.rs
@@ -58,6 +58,7 @@ use codex_core::protocol::UndoStartedEvent;
 use codex_core::protocol::UserMessageEvent;
 use codex_core::protocol::ViewImageToolCallEvent;
 use codex_core::protocol::WarningEvent;
+use codex_core::protocol::ReflectionVerdictEvent;
 use codex_core::protocol::WebSearchBeginEvent;
 use codex_core::protocol::WebSearchEndEvent;
 use codex_core::skills::model::SkillMetadata;
@@ -690,6 +691,18 @@ impl ChatWidget {
         self.request_redraw();
     }
 
+    fn on_reflection_verdict(&mut self, ev: ReflectionVerdictEvent) {
+        self.add_to_history(history_cell::new_reflection_verdict(
+            ev.completed,
+            ev.confidence,
+            ev.reasoning,
+            ev.feedback,
+            ev.attempt,
+            ev.max_attempts,
+        ));
+        self.request_redraw();
+    }
+
     fn on_mcp_startup_update(&mut self, ev: McpStartupUpdateEvent) {
         let mut status = self.mcp_startup_status.take().unwrap_or_default();
         if let McpStartupStatus::Failed { error } = &ev.status {
@@ -1906,6 +1919,7 @@ impl ChatWidget {
                 self.on_entered_review_mode(review_request)
             }
             EventMsg::ExitedReviewMode(review) => self.on_exited_review_mode(review),
+            EventMsg::ReflectionVerdict(ev) => self.on_reflection_verdict(ev),
             EventMsg::ContextCompacted(_) => self.on_agent_message("Context compacted".to_owned()),
             EventMsg::RawResponseItem(_)
             | EventMsg::ItemStarted(_)
diff --git a/codex-rs/tui/src/history_cell.rs b/codex-rs/tui/src/history_cell.rs
index 5440040f6b5..acf6dbe8ca3 100644
--- a/codex-rs/tui/src/history_cell.rs
+++ b/codex-rs/tui/src/history_cell.rs
@@ -1071,6 +1071,42 @@ pub(crate) fn new_warning_event(message: String) -> PrefixedWrappedHistoryCell {
     PrefixedWrappedHistoryCell::new(message.yellow(), "⚠ ".yellow(), " ")
 }
 
+/// Creates a history cell for reflection verdict display.
+#[allow(clippy::disallowed_methods)]
+pub(crate) fn new_reflection_verdict(
+    completed: bool,
+    confidence: f32,
+    reasoning: String,
+    feedback: Option<String>,
+    attempt: u32,
+    max_attempts: u32,
+) -> PlainHistoryCell {
+    let mut lines: Vec<Line<'static>> = Vec::new();
+
+    if completed {
+        // Success case - green checkmark
+        let header = format!(
+            "✓ Reflection: Task completed (confidence: {:.0}%)",
+            confidence * 100.0
+        );
+        lines.push(vec![header.green()].into());
+        lines.push(vec![format!(" {}", reasoning).dim()].into());
+    } else {
+        // Incomplete case - yellow warning
+        let header = format!(
+            "⟳ Reflection: Task incomplete - attempt {}/{} (confidence: {:.0}%)",
+            attempt, max_attempts, confidence * 100.0
+        );
+        lines.push(vec![header.yellow()].into());
+        lines.push(vec![format!(" Reasoning: {}", reasoning).dim()].into());
+        if let Some(fb) = feedback {
+            lines.push(vec![format!(" Feedback: {}", fb).yellow()].into());
+        }
+    }
+
+    PlainHistoryCell { lines }
+}
+
 #[derive(Debug)]
 pub(crate) struct DeprecationNoticeCell {
     summary: String,
diff --git a/codex-rs/tui2/src/chatwidget.rs b/codex-rs/tui2/src/chatwidget.rs
index 5461bea7296..4c2ec861990 100644
--- a/codex-rs/tui2/src/chatwidget.rs
+++ b/codex-rs/tui2/src/chatwidget.rs
@@ -58,6 +58,7 @@ use codex_core::protocol::UndoStartedEvent;
 use codex_core::protocol::UserMessageEvent;
 use codex_core::protocol::ViewImageToolCallEvent;
 use codex_core::protocol::WarningEvent;
+use codex_core::protocol::ReflectionVerdictEvent;
 use codex_core::protocol::WebSearchBeginEvent;
 use codex_core::protocol::WebSearchEndEvent;
 use codex_core::skills::model::SkillMetadata;
@@ -690,6 +691,18 @@ impl ChatWidget {
         self.request_redraw();
     }
 
+    fn on_reflection_verdict(&mut self, ev: ReflectionVerdictEvent) {
+        self.add_to_history(history_cell::new_reflection_verdict(
+            ev.completed,
+            ev.confidence,
+            ev.reasoning,
+            ev.feedback,
+            ev.attempt,
+            ev.max_attempts,
+        ));
+        self.request_redraw();
+    }
+
     fn on_mcp_startup_update(&mut self, ev: McpStartupUpdateEvent) {
         let mut status = self.mcp_startup_status.take().unwrap_or_default();
         if let McpStartupStatus::Failed { error } = &ev.status {
@@ -1906,6 +1919,7 @@ impl ChatWidget {
                 self.on_entered_review_mode(review_request)
             }
             EventMsg::ExitedReviewMode(review) => self.on_exited_review_mode(review),
+            EventMsg::ReflectionVerdict(ev) => self.on_reflection_verdict(ev),
             EventMsg::ContextCompacted(_) => self.on_agent_message("Context compacted".to_owned()),
             EventMsg::RawResponseItem(_)
             | EventMsg::ItemStarted(_)
diff --git a/codex-rs/tui2/src/history_cell.rs b/codex-rs/tui2/src/history_cell.rs
index 5440040f6b5..acf6dbe8ca3 100644
--- a/codex-rs/tui2/src/history_cell.rs
+++ b/codex-rs/tui2/src/history_cell.rs
@@ -1071,6 +1071,42 @@ pub(crate) fn new_warning_event(message: String) -> PrefixedWrappedHistoryCell {
     PrefixedWrappedHistoryCell::new(message.yellow(), "⚠ ".yellow(), " ")
 }
 
+/// Creates a history cell for reflection verdict display.
+#[allow(clippy::disallowed_methods)]
+pub(crate) fn new_reflection_verdict(
+    completed: bool,
+    confidence: f32,
+    reasoning: String,
+    feedback: Option<String>,
+    attempt: u32,
+    max_attempts: u32,
+) -> PlainHistoryCell {
+    let mut lines: Vec<Line<'static>> = Vec::new();
+
+    if completed {
+        // Success case - green checkmark
+        let header = format!(
+            "✓ Reflection: Task completed (confidence: {:.0}%)",
+            confidence * 100.0
+        );
+        lines.push(vec![header.green()].into());
+        lines.push(vec![format!(" {}", reasoning).dim()].into());
+    } else {
+        // Incomplete case - yellow warning
+        let header = format!(
+            "⟳ Reflection: Task incomplete - attempt {}/{} (confidence: {:.0}%)",
+            attempt, max_attempts, confidence * 100.0
+        );
+        lines.push(vec![header.yellow()].into());
+        lines.push(vec![format!(" Reasoning: {}", reasoning).dim()].into());
+        if let Some(fb) = feedback {
+            lines.push(vec![format!(" Feedback: {}", fb).yellow()].into());
+        }
+    }
+
+    PlainHistoryCell { lines }
+}
+
 #[derive(Debug)]
 pub(crate) struct DeprecationNoticeCell {
     summary: String,
diff --git a/docs/reflection.md b/docs/reflection.md
new file mode 100644
index 00000000000..3afff5837b7
--- /dev/null
+++ b/docs/reflection.md
@@ -0,0 +1,275 @@
+# Reflection Layer
+
+The reflection layer is an experimental feature that verifies whether the agent completed a task correctly by using a "judge" model to evaluate the work.
+
+## How It Works
+
+1. **Task Execution**: The agent executes the user's request using tools (shell, file operations, etc.)
+
+2. **Evaluation**: After completion, the reflection layer:
+   - Collects context: original task, recent tool calls (up to 10), and final response
+   - Sends this to a judge model for evaluation
+   - Receives a verdict with completion status and confidence score
+
+3. **Retry Loop**: If the task is incomplete:
+   - Judge provides feedback on what's missing
+   - Agent receives feedback and tries again
+   - Repeats up to 3 attempts (configurable)
+
+## Verdict Structure
+
+```json
+{
+  "completed": true,
+  "confidence": 0.95,
+  "reasoning": "Task was completed successfully",
+  "feedback": null
+}
+```
+
+- `completed`: Whether the task was done
+- `confidence`: 0.0 to 1.0 confidence score
+- `reasoning`: Explanation of the verdict
+- `feedback`: Instructions for the agent if incomplete
+
+## Configuration
+
+Enable in `~/.codex/config.toml`:
+
+```toml
+[reflection]
+enabled = true
+max_attempts = 3
+
+[features]
+reflection = true
+```
+
+## Running Tests
+
+```shell
+# Required environment variables
+export AZURE_OPENAI_API_KEY="<key>"
+export AZURE_OPENAI_BASE_URL="<endpoint>"
+
+# Optional: specify model (defaults to gpt-5-mini)
+export AZURE_OPENAI_MODEL="gpt-5"
+
+# Run the integration test
+cargo test -p codex-core --test all --release reflection_layer_hello_world -- --ignored --nocapture
+```
+
+The test verifies:
+1. Azure OpenAI integration with reflection enabled
+2. Agent creates requested Python files
+3. Tests pass via pytest
+4. Reflection layer evaluates and returns a verdict
+
+## Evaluation Suite (SWE-bench Style)
+
+The eval suite measures the reflection layer's impact on coding task performance, inspired by [SWE-bench](https://github.com/SWE-bench/SWE-bench).
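+
+Each eval test drives a shared `run_eval_task` helper in `eval_swe_bench.rs`. The sketch below shows roughly the shape the tests rely on; the names mirror how the tests use the result, but the real definitions live in the harness, so treat this as illustrative rather than the actual API:
+
+```rust
+use codex_core::protocol::ReflectionVerdictEvent;
+
+/// Illustrative result shape for one eval run.
+struct EvalResult {
+    /// Did the verification test command pass?
+    success: bool,
+    /// Was the reflection layer enabled for this run?
+    reflection_used: bool,
+    /// Judge verdicts emitted during the run, in order.
+    reflection_verdicts: Vec<ReflectionVerdictEvent>,
+    /// Captured output of the verification command.
+    test_output: String,
+}
+
+/// Writes `files` into a temp workspace, runs the agent against `issue`,
+/// then runs `test_cmd` to check the fix; `with_reflection` toggles the judge.
+fn run_eval_task(
+    issue: &str,
+    files: &[(&str, &str)],
+    test_cmd: &str,
+    with_reflection: bool,
+) -> EvalResult {
+    unimplemented!("see eval_swe_bench.rs for the actual harness")
+}
+```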
+
+### Tasks
+
+| Task | Description | Bug Type |
+|------|-------------|----------|
+| Task 1 | Off-by-one errors | `range(n+1)` → `range(n)`, index errors |
+| Task 2 | String logic | Palindrome detection, word counting |
+| Task 3 | Edge cases | Division by zero, empty list handling |
+
+Each task provides:
+- A buggy Python codebase
+- An issue description (like a GitHub issue)
+- Test files that verify the fix
+
+### Running Evaluations
+
+```shell
+# Run single task with reflection
+cargo test -p codex-core --test all --release eval_task1_offbyone_with_reflection -- --ignored --nocapture
+
+# Run single task without reflection
+cargo test -p codex-core --test all --release eval_task1_offbyone_without_reflection -- --ignored --nocapture
+
+# Run full comparison (all tasks, with and without reflection)
+cargo test -p codex-core --test all --release eval_summary -- --ignored --nocapture
+```
+
+### Sample Output
+
+```
+========================================
+SWE-BENCH STYLE EVALUATION SUMMARY
+========================================
+
+--- Task 1: Off-by-one errors ---
+ With reflection: PASS (verdicts: 1)
+ Without reflection: PASS
+
+--- Task 2: String logic errors ---
+ With reflection: PASS (verdicts: 1)
+ Without reflection: PASS
+
+--- Task 3: Missing edge cases ---
+ With reflection: PASS (verdicts: 2)
+ Without reflection: FAIL
+
+========================================
+RESULTS
+========================================
+With reflection: 3/3 tasks passed
+Without reflection: 2/3 tasks passed
+Improvement: +1 tasks
+```
+
+The reflection layer helps catch incomplete fixes by re-evaluating the agent's work and providing feedback for another attempt.
+
+## Local Debugging
+
+### Prerequisites
+
+- Rust toolchain with `cargo` installed.
+- `just` installed (the repo's formatting/linting recipes below use it).
+- (Optional) `cargo-insta` if you will work with snapshot tests.
+
+> Note: On Windows, prefer WSL for these instructions or adapt the commands to PowerShell.
+
+### Installation
+
+Preferred: install the CLI into your local bin with `cargo install`:
+
+```shell
+# from repo root (the CLI package lives at codex-rs/cli)
+cargo install --path codex-rs/cli --root "$HOME/.local"
+export PATH="$HOME/.local/bin:$PATH"
+```
+
+Alternative: build and copy (guarded):
+
+```shell
+mkdir -p "$HOME/.local/bin"
+(cd codex-rs && cargo build --release)
+BINARY="codex-rs/target/release/codex"
+if [ -f "$BINARY" ]; then
+  install -m 755 "$BINARY" "$HOME/.local/bin/"
+  echo "Installed codex to $HOME/.local/bin"
+else
+  echo "Error: built binary not found at $BINARY" >&2
+  exit 1
+fi
+```
+
+Notes:
+- Using `install -m 755` sets the executable bit and is safer than `cp`.
+- Avoid using `sudo` unless installing to system locations like `/usr/local/bin`.
+
+### Configure Azure OpenAI provider (`$HOME/.codex/config.toml`)
+
+Replace the ALL_CAPS placeholders in the file below with your own values.
+ +```toml +# Example config - replace placeholders +model = "gpt-5-mini" # example model; replace if needed +model_provider = "azure" + +[model_providers.azure] +name = "Azure OpenAI" +base_url = "https://YOUR_AZURE_RESOURCE.openai.azure.com/openai" # replace YOUR_AZURE_RESOURCE +env_key = "AZURE_OPENAI_API_KEY" +wire_api = "responses" +request_max_retries = 3 +stream_max_retries = 3 +stream_idle_timeout_ms = 120000 + +[model_providers.azure.query_params] +api-version = "2025-04-01-preview" + +[reflection] +enabled = true +model = "gpt-5-mini" +max_attempts = 3 +``` + +Important: `base_url` must match your Azure endpoint; e.g. `https://myresource.openai.azure.com/openai`. The `model` value is illustrative — ensure the model is available for your provider/account. + +### Environment variables + +Add to your shell rc (e.g. `$HOME/.zshrc` or `$HOME/.bashrc`): + +```shell +export PATH="$HOME/.local/bin:$PATH" +export AZURE_OPENAI_API_KEY="YOUR_API_KEY" # do not commit this to version control +``` + +After editing, run `source "$HOME/.zshrc"` or open a new shell. + +### JSON vs TOML config + +If you previously used `$HOME/.codex/config.json`, be aware that JSON config may override TOML. To keep a backup and avoid destructive moves: + +```shell +if [ -f "$HOME/.codex/config.json" ]; then + cp "$HOME/.codex/config.json" "$HOME/.codex/config.json.bak" + echo "Backed up existing JSON config to $HOME/.codex/config.json.bak" +fi +``` + +### Sandbox / Network note + +If you run in a restricted (sandboxed) environment, features requiring outgoing network connections may not work. Check environment variables such as `CODEX_SANDBOX_NETWORK_DISABLED` or run with a network-enabled environment if needed. + +### Testing + +Interactive: + +```shell +# run the binary (interactive) +codex +``` + +Non-interactive (example): + +```shell +codex exec --full-auto "Create a Python hello world program" +# Verify reflection: look for "Reflection verdict" (case-insensitive) +codex exec --full-auto "Create test.py that prints 'hello'" 2>&1 | grep -i "Reflection verdict" || true +``` + +### Running Unit Tests + +Run crate-specific tests (preferred): + +```shell +cargo test -p codex-core reflection +``` + +Run lib-only with verbose output: + +```shell +cargo test -p codex-core --lib reflection -- --nocapture +``` + +If you changed shared crates (core, protocol), run the full test suite: + +```shell +# After local crate tests pass +cargo test --all-features +``` + +Snapshot tests (if applicable): + +- If you update UI/text snapshots in `codex-tui`, follow the repo snapshot flow: + - `cargo test -p codex-tui` + - `cargo insta pending-snapshots -p codex-tui` + - `cargo insta accept -p codex-tui` (only if you intend to accept all new snapshots) + +### Formatting / linting for Rust code + +After making Rust changes, run: + +```shell +# format +(cd codex-rs && just fmt) + +# fix lints for the specific project you changed, e.g. codex-core +(cd codex-rs && just fix -p codex-core) +```
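+
+For a change like this one that spans several crates (core, protocol, exec, tui), repeat that step for each crate you touched; a sketch, with the package list adjusted to your own diff:
+
+```shell
+(cd codex-rs && just fix -p codex-protocol)
+(cd codex-rs && just fix -p codex-exec)
+(cd codex-rs && just fix -p codex-tui)
+```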