diff --git a/codex-rs/core/src/client.rs b/codex-rs/core/src/client.rs index 8ab56019c60..3029329c2c9 100644 --- a/codex-rs/core/src/client.rs +++ b/codex-rs/core/src/client.rs @@ -63,6 +63,8 @@ pub struct ModelClient { effort: Option, summary: ReasoningSummaryConfig, session_source: SessionSource, + /// Optional model override for specialized use cases (e.g., reflection judge). + model_override: Option, } #[allow(clippy::too_many_arguments)] @@ -88,9 +90,19 @@ impl ModelClient { effort, summary, session_source, + model_override: None, } } + /// Returns a clone of this client with a different model for the API request. + /// This is useful for specialized tasks like reflection judging that may use + /// a different (often cheaper/faster) model than the main agent. + pub fn with_model_override(&self, model: &str) -> Self { + let mut client = self.clone(); + client.model_override = Some(model.to_string()); + client + } + pub fn get_model_context_window(&self) -> Option { let model_family = self.get_model_family(); let effective_context_window_percent = model_family.effective_context_window_percent; @@ -294,9 +306,13 @@ impl ModelClient { self.session_source.clone() } - /// Returns the currently configured model slug. + /// Returns the currently configured model slug, or the override if set. pub fn get_model(&self) -> String { - self.get_model_family().get_model_slug().to_string() + if let Some(ref override_model) = self.model_override { + override_model.clone() + } else { + self.get_model_family().get_model_slug().to_string() + } } /// Returns the currently configured model family. diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 5f0e322dae9..6e3833cdd62 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -20,6 +20,7 @@ use crate::openai_models::model_family::ModelFamily; use crate::openai_models::models_manager::ModelsManager; use crate::parse_command::parse_command; use crate::parse_turn_item; +use crate::reflection::{ReflectionContext, evaluate_reflection}; use crate::stream_events_utils::HandleOutputCtx; use crate::stream_events_utils::handle_non_tool_response_item; use crate::stream_events_utils::handle_output_item_done; @@ -114,6 +115,7 @@ use crate::protocol::TokenCountEvent; use crate::protocol::TokenUsage; use crate::protocol::TokenUsageInfo; use crate::protocol::TurnDiffEvent; +use crate::protocol::ReflectionVerdictEvent; use crate::protocol::WarningEvent; use crate::rollout::RolloutRecorder; use crate::rollout::RolloutRecorderParams; @@ -371,6 +373,7 @@ pub(crate) struct TurnContext { pub(crate) tool_call_gate: Arc, pub(crate) exec_policy: Arc>, pub(crate) truncation_policy: TruncationPolicy, + pub(crate) reflection: crate::config::types::ReflectionConfig, } impl TurnContext { @@ -536,6 +539,7 @@ impl Session { per_turn_config.as_ref(), model_family.truncation_policy, ), + reflection: per_turn_config.reflection.clone(), } } @@ -2087,6 +2091,7 @@ async fn spawn_review_thread( tool_call_gate: Arc::new(ReadinessFlag::new()), exec_policy: parent_turn_context.exec_policy.clone(), truncation_policy: TruncationPolicy::new(&per_turn_config, model_family.truncation_policy), + reflection: per_turn_config.reflection.clone(), }; // Seed the child task with the review prompt as the initial user message. 
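The `model_override` added to `ModelClient` above is what lets the reflection judge introduced in `reflection.rs` (further down in this diff) run on a cheaper model than the main agent. A minimal sketch of the intended call pattern, assuming an already-constructed `client: ModelClient`; the judge model name is illustrative only:

```rust
// Derive a judge client; the original agent client is left untouched.
let judge_client = client.with_model_override("gpt-4o-mini");

// Only the reported model slug changes; everything else is cloned as-is
// from the agent client.
assert_eq!(judge_client.get_model(), "gpt-4o-mini");
```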
@@ -2175,6 +2180,9 @@ pub(crate) async fn run_task( .await; } + // Extract initial task for reflection BEFORE input is consumed + let initial_task = extract_initial_task_from_input(&input); + let initial_input_for_turn: ResponseInputItem = ResponseInputItem::from(input); let response_item: ResponseItem = initial_input_for_turn.clone().into(); sess.record_response_item_and_emit_turn_item(turn_context.as_ref(), response_item) @@ -2192,6 +2200,11 @@ pub(crate) async fn run_task( // many turns, from the perspective of the user, it is a single turn. let turn_diff_tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::new())); + // Track reflection attempts + let mut reflection_attempt: u32 = 0; + let reflection_config = &turn_context.reflection; + let reflection_enabled = reflection_config.enabled || sess.enabled(Feature::Reflection); + loop { // Note that pending_input would be something like a message the user // submitted through the UI while the model was running. Though the UI @@ -2255,7 +2268,95 @@ pub(crate) async fn run_task( } if !needs_follow_up { - last_agent_message = turn_last_agent_message; + last_agent_message = turn_last_agent_message.clone(); + + // Run reflection if enabled and we haven't exceeded max attempts + let max_attempts = reflection_config.max_attempts; + if reflection_enabled && reflection_attempt < max_attempts { + reflection_attempt += 1; + info!( + "Running reflection evaluation (attempt {}/{})", + reflection_attempt, max_attempts + ); + + // Collect conversation items for reflection + let history_items = sess.clone_history().await.get_history_for_prompt(); + let context = ReflectionContext::from_conversation( + initial_task.clone(), + &history_items, + reflection_attempt, + max_attempts, + ); + + // Evaluate with the judge, optionally using a different model + match evaluate_reflection( + &turn_context.client, + context, + reflection_config.model.as_deref(), + ) + .await + { + Ok(verdict) => { + info!( + "Reflection verdict: completed={}, confidence={:.2}", + verdict.completed, verdict.confidence + ); + + // Emit reflection verdict event for visibility + sess.send_event( + &turn_context, + EventMsg::ReflectionVerdict(ReflectionVerdictEvent { + completed: verdict.completed, + confidence: verdict.confidence, + reasoning: verdict.reasoning.clone(), + feedback: verdict.feedback.clone(), + attempt: reflection_attempt, + max_attempts, + }), + ) + .await; + + if !verdict.completed { + if let Some(feedback) = verdict.feedback { + info!("Task incomplete, injecting feedback: {}", feedback); + + // Inject feedback as a new user message + let feedback_msg = format!( + "[Reflection Judge - Attempt {}/{}] Task verification failed.\n\nReasoning: {}\n\nFeedback: {}\n\nPlease address the above feedback and complete the task.", + reflection_attempt, + max_attempts, + verdict.reasoning, + feedback + ); + + let feedback_item = ResponseItem::Message { + id: None, + role: "user".to_string(), + content: vec![ContentItem::InputText { + text: feedback_msg, + }], + }; + + sess.record_conversation_items( + &turn_context, + &[feedback_item], + ) + .await; + + // Continue the loop to process the feedback + continue; + } + } else { + info!("Reflection: Task completed successfully"); + } + } + Err(e) => { + warn!("Reflection evaluation failed: {}", e); + // Continue without blocking on reflection errors + } + } + } + sess.notifier() .notify(&UserNotification::AgentTurnComplete { thread_id: sess.conversation_id.to_string(), @@ -2292,6 +2393,27 @@ pub(crate) async fn run_task( 
last_agent_message } +/// Extract the initial task/prompt from user input. +fn extract_initial_task_from_input(input: &[UserInput]) -> String { + for item in input { + match item { + UserInput::Text { text } => { + return text.clone(); + } + UserInput::Image { .. } | UserInput::LocalImage { .. } => { + // Skip images, look for text + } + UserInput::Skill { name, .. } => { + // Return skill name as task description + return format!("Run skill: {}", name); + } + // Handle future variants of the non-exhaustive enum + _ => {} + } + } + "(No initial task found)".to_string() +} + #[instrument( skip_all, fields( diff --git a/codex-rs/core/src/config/mod.rs b/codex-rs/core/src/config/mod.rs index a7e11f45eb8..1e59f6ffc73 100644 --- a/codex-rs/core/src/config/mod.rs +++ b/codex-rs/core/src/config/mod.rs @@ -7,6 +7,8 @@ use crate::config::types::Notifications; use crate::config::types::OtelConfig; use crate::config::types::OtelConfigToml; use crate::config::types::OtelExporterKind; +use crate::config::types::ReflectionConfig; +use crate::config::types::ReflectionConfigToml; use crate::config::types::SandboxWorkspaceWrite; use crate::config::types::ShellEnvironmentPolicy; use crate::config::types::ShellEnvironmentPolicyToml; @@ -274,6 +276,9 @@ pub struct Config { /// Centralized feature flags; source of truth for feature gating. pub features: Features, + /// Configuration for the reflection/judge feature. + pub reflection: ReflectionConfig, + /// The active profile name used to derive this `Config` (if any). pub active_profile: Option, @@ -666,6 +671,9 @@ pub struct ConfigToml { /// Settings for ghost snapshots (used for undo). #[serde(default)] pub ghost_snapshot: Option, + /// Configuration for the reflection/judge feature. + #[serde(default)] + pub reflection: Option, /// When `true`, checks for Codex updates on startup and surfaces update prompts. /// Set to `false` only if your Codex updates are centrally managed. 
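For reference, the new `reflection` field on `ConfigToml` corresponds to a user-level table like the following, mirroring the fixtures used by the integration tests later in this diff (the judge model is optional and shown only as an example):

```toml
[features]
reflection = true        # experimental flag; [reflection].enabled also works on its own

[reflection]
enabled = true
model = "gpt-4o-mini"    # optional cheaper/faster judge; omit to reuse the agent's model
max_attempts = 3         # default when omitted
```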
@@ -1221,6 +1229,7 @@ impl Config { use_experimental_use_rmcp_client, ghost_snapshot, features, + reflection: cfg.reflection.map(Into::into).unwrap_or_default(), active_profile: active_profile_name, active_project, windows_wsl_setup_acknowledged: cfg.windows_wsl_setup_acknowledged.unwrap_or(false), @@ -2983,6 +2992,7 @@ model_verbosity = "high" use_experimental_use_rmcp_client: false, ghost_snapshot: GhostSnapshotConfig::default(), features: Features::with_defaults(), + reflection: ReflectionConfig::default(), active_profile: Some("o3".to_string()), active_project: ProjectConfig { trust_level: None }, windows_wsl_setup_acknowledged: false, @@ -3058,6 +3068,7 @@ model_verbosity = "high" use_experimental_use_rmcp_client: false, ghost_snapshot: GhostSnapshotConfig::default(), features: Features::with_defaults(), + reflection: ReflectionConfig::default(), active_profile: Some("gpt3".to_string()), active_project: ProjectConfig { trust_level: None }, windows_wsl_setup_acknowledged: false, @@ -3148,6 +3159,7 @@ model_verbosity = "high" use_experimental_use_rmcp_client: false, ghost_snapshot: GhostSnapshotConfig::default(), features: Features::with_defaults(), + reflection: ReflectionConfig::default(), active_profile: Some("zdr".to_string()), active_project: ProjectConfig { trust_level: None }, windows_wsl_setup_acknowledged: false, @@ -3224,6 +3236,7 @@ model_verbosity = "high" use_experimental_use_rmcp_client: false, ghost_snapshot: GhostSnapshotConfig::default(), features: Features::with_defaults(), + reflection: ReflectionConfig::default(), active_profile: Some("gpt5".to_string()), active_project: ProjectConfig { trust_level: None }, windows_wsl_setup_acknowledged: false, diff --git a/codex-rs/core/src/config/types.rs b/codex-rs/core/src/config/types.rs index 9243e9878aa..76f148ba412 100644 --- a/codex-rs/core/src/config/types.rs +++ b/codex-rs/core/src/config/types.rs @@ -531,6 +531,57 @@ impl From for ShellEnvironmentPolicy { } } +/// Configuration for the reflection/judge feature. +/// This controls how task completion is verified by a separate "judge" model. +#[derive(Deserialize, Debug, Clone, PartialEq, Default)] +pub struct ReflectionConfigToml { + /// Enable or disable reflection. When enabled, a judge model evaluates + /// task completion after each turn. Defaults to false. + #[serde(default)] + pub enabled: Option, + + /// Model to use for reflection/judging. If not set, uses the same model + /// as the main agent. Can be set to a cheaper/faster model like "gpt-4o-mini". + pub model: Option, + + /// Maximum number of reflection attempts before giving up. + /// Defaults to 3. + pub max_attempts: Option, +} + +/// Runtime reflection configuration. +#[derive(Debug, Clone, PartialEq)] +pub struct ReflectionConfig { + /// Whether reflection is enabled. + pub enabled: bool, + + /// Model to use for reflection/judging. + pub model: Option, + + /// Maximum number of reflection attempts. 
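+ /// Defaults to 3 when not set in `[reflection]`.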
+ pub max_attempts: u32, +} + +impl Default for ReflectionConfig { + fn default() -> Self { + Self { + enabled: false, + model: None, + max_attempts: 3, + } + } +} + +impl From for ReflectionConfig { + fn from(toml: ReflectionConfigToml) -> Self { + Self { + enabled: toml.enabled.unwrap_or(false), + model: toml.model, + max_attempts: toml.max_attempts.unwrap_or(3), + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -803,4 +854,92 @@ mod tests { "unexpected error: {err}" ); } + + // ==================== ReflectionConfig Tests ==================== + + #[test] + fn test_reflection_config_defaults() { + let config = ReflectionConfig::default(); + assert!(!config.enabled); + assert!(config.model.is_none()); + assert_eq!(config.max_attempts, 3); + } + + #[test] + fn test_reflection_config_toml_defaults() { + let toml_config = ReflectionConfigToml::default(); + assert!(toml_config.enabled.is_none()); + assert!(toml_config.model.is_none()); + assert!(toml_config.max_attempts.is_none()); + } + + #[test] + fn test_reflection_config_from_toml_defaults() { + let toml_config = ReflectionConfigToml::default(); + let config: ReflectionConfig = toml_config.into(); + + assert!(!config.enabled); + assert!(config.model.is_none()); + assert_eq!(config.max_attempts, 3); + } + + #[test] + fn test_reflection_config_from_toml_full() { + let toml_config = ReflectionConfigToml { + enabled: Some(true), + model: Some("gpt-4o-mini".to_string()), + max_attempts: Some(5), + }; + let config: ReflectionConfig = toml_config.into(); + + assert!(config.enabled); + assert_eq!(config.model, Some("gpt-4o-mini".to_string())); + assert_eq!(config.max_attempts, 5); + } + + #[test] + fn test_reflection_config_deserialize_toml() { + let cfg: ReflectionConfigToml = toml::from_str( + r#" + enabled = true + model = "gpt-4o-mini" + max_attempts = 10 + "#, + ) + .expect("should deserialize reflection config"); + + assert_eq!(cfg.enabled, Some(true)); + assert_eq!(cfg.model, Some("gpt-4o-mini".to_string())); + assert_eq!(cfg.max_attempts, Some(10)); + } + + #[test] + fn test_reflection_config_deserialize_toml_partial() { + let cfg: ReflectionConfigToml = toml::from_str( + r#" + enabled = true + "#, + ) + .expect("should deserialize partial reflection config"); + + assert_eq!(cfg.enabled, Some(true)); + assert!(cfg.model.is_none()); + assert!(cfg.max_attempts.is_none()); + + // Convert to runtime config + let config: ReflectionConfig = cfg.into(); + assert!(config.enabled); + assert!(config.model.is_none()); + assert_eq!(config.max_attempts, 3); // default + } + + #[test] + fn test_reflection_config_deserialize_toml_empty() { + let cfg: ReflectionConfigToml = + toml::from_str("").expect("should deserialize empty config"); + + assert!(cfg.enabled.is_none()); + assert!(cfg.model.is_none()); + assert!(cfg.max_attempts.is_none()); + } } diff --git a/codex-rs/core/src/features.rs b/codex-rs/core/src/features.rs index 431eabe3c61..ca45a6f2ebd 100644 --- a/codex-rs/core/src/features.rs +++ b/codex-rs/core/src/features.rs @@ -64,6 +64,8 @@ pub enum Feature { ShellSnapshot, /// Experimental TUI v2 (viewport) implementation. Tui2, + /// Reflection layer that verifies task completion via a judge model. 
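+ /// Off by default (experimental); note that setting `[reflection].enabled`
+ /// in config.toml turns the judge on independently of this flag.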
+ Reflection, } impl Feature { @@ -365,4 +367,10 @@ pub const FEATURES: &[FeatureSpec] = &[ stage: Stage::Experimental, default_enabled: false, }, + FeatureSpec { + id: Feature::Reflection, + key: "reflection", + stage: Stage::Experimental, + default_enabled: false, + }, ]; diff --git a/codex-rs/core/src/lib.rs b/codex-rs/core/src/lib.rs index 11b49c78c41..fb0bab8697d 100644 --- a/codex-rs/core/src/lib.rs +++ b/codex-rs/core/src/lib.rs @@ -92,6 +92,7 @@ pub use rollout::list::Cursor; pub use rollout::list::parse_cursor; pub use rollout::list::read_head_for_summary; mod function_tool; +pub mod reflection; mod state; mod tasks; mod user_notification; diff --git a/codex-rs/core/src/reflection.rs b/codex-rs/core/src/reflection.rs new file mode 100644 index 00000000000..e0b0ea97706 --- /dev/null +++ b/codex-rs/core/src/reflection.rs @@ -0,0 +1,915 @@ +//! Reflection layer for task verification. +//! +//! This module implements a "judge" that verifies if the agent completed a task +//! correctly. After each turn, if reflection is enabled, it: +//! 1. Collects the initial task, recent tool calls, and final result +//! 2. Sends this context to a judge model for evaluation +//! 3. Returns a verdict indicating if the task was completed or needs more work + +use crate::client::ModelClient; +use crate::client_common::Prompt; +use crate::client_common::ResponseEvent; +use crate::error::Result; +use codex_protocol::models::ContentItem; +use codex_protocol::models::ResponseItem; +use futures::StreamExt; +use serde::{Deserialize, Serialize}; +use serde_json::json; +use tracing::{debug, info, warn}; + +/// Maximum number of recent tool calls to include in reflection context. +const MAX_TOOL_CALLS: usize = 10; + +/// JSON Schema for the reflection verdict output. +/// This ensures the judge model returns structured, parseable JSON. +fn verdict_json_schema() -> serde_json::Value { + json!({ + "type": "object", + "properties": { + "completed": { + "type": "boolean", + "description": "Whether the task was completed successfully" + }, + "confidence": { + "type": "number", + "description": "Confidence score from 0.0 to 1.0" + }, + "reasoning": { + "type": "string", + "description": "Brief explanation of the verdict" + }, + "feedback": { + "type": ["string", "null"], + "description": "If not completed: specific instructions for what still needs to be done. If completed: null" + } + }, + "required": ["completed", "confidence", "reasoning", "feedback"], + "additionalProperties": false + }) +} + +/// Heuristically detect if tool output indicates an error. +/// +/// This function uses more sophisticated pattern matching to reduce false positives +/// from phrases like "error handling", "no errors", etc. +fn output_indicates_error(output: &str) -> bool { + let output_lower = output.to_lowercase(); + + // Patterns that strongly indicate an actual error + let error_indicators = [ + "error:", // "Error: something failed" + "failed:", // "Failed: reason" + "failure:", // "Failure: reason" + "exception:", // "Exception: ..." 
+ "panic:", // Rust panic + "traceback", // Python traceback + "fatal error", // Fatal error + "cannot ", // "cannot find", "cannot open" + "could not ", // "could not connect" + "permission denied", + "access denied", + "not found", // "file not found", "command not found" + "no such file", // Unix errors + "does not exist", + "is not recognized", // Windows command errors + "syntax error", + "compilation error", + "build failed", + "test failed", + "assertion failed", + "segmentation fault", + "stack overflow", + "out of memory", + "timeout", + "timed out", + "connection refused", + "connection reset", + "exit code 1", // Non-zero exit codes + "exit status 1", + "exited with", // "exited with code 1" + "returned error", + "threw an error", + ]; + + // Patterns that suggest false positives - output is actually OK + let false_positive_indicators = [ + "no error", + "no errors", + "0 errors", + "without error", + "error handling", + "error handler", + "error message", + "error-free", + "errorfree", + "on error", // "on error do something" + "if error", // "if error then" + "handle error", + "catch error", + "log error", + "print error", + ]; + + // If output contains a false positive indicator, be more conservative + for fp in &false_positive_indicators { + if output_lower.contains(fp) { + // Still check for strong error indicators (these override false positives) + let strong_indicators = [ + "error:", + "failed:", + "failure:", + "exception:", + "panic:", + "traceback", + "fatal error", + "segmentation fault", + "stack overflow", + ]; + for indicator in &strong_indicators { + if output_lower.contains(indicator) { + return true; + } + } + return false; + } + } + + // Check for error indicators + for indicator in &error_indicators { + if output_lower.contains(indicator) { + return true; + } + } + + false +} + +/// Result of a reflection evaluation. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ReflectionVerdict { + /// Whether the task was completed successfully. + pub completed: bool, + /// Confidence level (0.0 to 1.0). + pub confidence: f32, + /// Feedback for the agent if task is incomplete. + pub feedback: Option, + /// Reasoning behind the verdict. + pub reasoning: String, +} + +/// Context collected for reflection evaluation. +#[derive(Debug, Clone)] +pub struct ReflectionContext { + /// The original user task/prompt. + pub initial_task: String, + /// Recent tool calls with their results. + pub tool_calls: Vec, + /// The agent's final message/response. + pub final_response: Option, + /// Current reflection attempt number. + pub attempt: u32, + /// Maximum number of reflection attempts (from config). + pub max_attempts: u32, +} + +/// Summary of a tool call for reflection context. +#[derive(Debug, Clone, Serialize)] +pub struct ToolCallSummary { + pub tool_name: String, + pub arguments: String, + pub result: String, + pub success: bool, +} + +impl ReflectionContext { + /// Create a new reflection context from conversation items. 
+ pub fn from_conversation( + initial_task: String, + items: &[ResponseItem], + attempt: u32, + max_attempts: u32, + ) -> Self { + let mut tool_calls: Vec = Vec::new(); + let mut final_response = None; + + // Iterate forward through items to properly pair FunctionCall with FunctionCallOutput + for item in items.iter() { + match item { + ResponseItem::FunctionCall { + name, + arguments, + call_id: _, + id: _, + } => { + if tool_calls.len() < MAX_TOOL_CALLS { + tool_calls.push(ToolCallSummary { + tool_name: name.clone(), + arguments: truncate_string(arguments, 200), + result: String::new(), + success: true, // Default, will be updated when we see output + }); + } + } + ResponseItem::FunctionCallOutput { call_id: _, output } => { + // Update the last tool call with its result + if let Some(last_call) = tool_calls.last_mut() { + last_call.result = truncate_string(output, 500); + last_call.success = !output_indicates_error(output); + } + } + ResponseItem::Message { role, content, .. } => { + if role == "assistant" { + // Extract text from content - keep updating to get the last response + for c in content { + if let ContentItem::OutputText { text } = c { + final_response = Some(text.clone()); + } + } + } + } + _ => {} + } + } + + Self { + initial_task, + tool_calls, + final_response, + attempt, + max_attempts, + } + } + + /// Build the judge prompt from this context. + pub fn build_judge_prompt(&self) -> String { + let tool_calls_str = if self.tool_calls.is_empty() { + "No tool calls were made.".to_string() + } else { + self.tool_calls + .iter() + .enumerate() + .map(|(i, tc)| { + format!( + "{}. {} ({})\n Args: {}\n Result: {}", + i + 1, + tc.tool_name, + if tc.success { "success" } else { "failed" }, + tc.arguments, + if tc.result.is_empty() { + "(no output)" + } else { + &tc.result + } + ) + }) + .collect::>() + .join("\n\n") + }; + + let final_response_str = self + .final_response + .as_ref() + .map(|r| truncate_string(r, 1000)) + .unwrap_or_else(|| "(No final response)".to_string()); + + format!( + r#"You are a strict task verification judge. Your job is to evaluate whether an AI coding assistant has FULLY completed a user's task. + +## ORIGINAL TASK +{task} + +## TOOL CALLS MADE (last {max_tools}) +{tools} + +## AGENT'S FINAL RESPONSE +{response} + +## REFLECTION ATTEMPT +This is attempt {attempt} of {max_attempts}. + +## YOUR TASK +Evaluate whether the task was completed correctly and fully. Be strict but fair. + +Consider: +1. Did the agent address ALL parts of the user's request? +2. Were the tool calls appropriate and successful? +3. Is the final response accurate and complete? +4. Are there any obvious errors or missing steps? + +Respond with ONLY a JSON object in this exact format: +{{ + "completed": true/false, + "confidence": 0.0-1.0, + "reasoning": "Brief explanation of your verdict", + "feedback": "If not completed: specific instructions for what still needs to be done. If completed: null" +}} + +Be concise. Do not include any text outside the JSON object."#, + task = self.initial_task, + max_tools = MAX_TOOL_CALLS, + tools = tool_calls_str, + response = final_response_str, + attempt = self.attempt, + max_attempts = self.max_attempts, + ) + } +} + +/// Parse the judge's response into a verdict. 
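+///
+/// Example of the intended behavior (mirrors the unit tests below): given
+/// `Here is my evaluation: {"completed": false, "confidence": 0.8,
+/// "reasoning": "Missing tests", "feedback": "Add unit tests"}`, the
+/// surrounding prose is stripped by `extract_json` and the JSON parses into
+/// a `ReflectionVerdict`. When no valid JSON can be recovered, a permissive
+/// default (`completed: true`, `confidence: 0.5`) is returned so a
+/// misbehaving judge never blocks the turn.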
+pub fn parse_verdict(response: &str) -> Result { + // Try to extract JSON from the response + let json_str = extract_json(response); + + match serde_json::from_str::(&json_str) { + Ok(verdict) => Ok(verdict), + Err(e) => { + warn!("Failed to parse judge verdict: {}", e); + // Return a default "completed" verdict if parsing fails. + // This prevents blocking on judge errors - we trust the agent + // and only block when we have clear evidence of incompletion. + Ok(ReflectionVerdict { + completed: true, + confidence: 0.5, + reasoning: format!("Failed to parse judge response: {}", e), + feedback: None, + }) + } + } +} + +/// Extract JSON object from a string that might contain other text. +fn extract_json(s: &str) -> String { + // Find the first { and last } + if let (Some(start), Some(end)) = (s.find('{'), s.rfind('}')) { + if end > start { + return s[start..=end].to_string(); + } + } + s.to_string() +} + +/// Truncate a string to a maximum length, adding ellipsis if needed. +fn truncate_string(s: &str, max_len: usize) -> String { + if s.len() <= max_len { + s.to_string() + } else { + format!("{}...", &s[..max_len.saturating_sub(3)]) + } +} + +/// Run reflection evaluation using a judge model. +/// +/// This function creates a separate API call to evaluate the agent's work. +/// It uses the same model client infrastructure but with a judge-specific prompt. +/// +/// # Arguments +/// * `client` - The model client to use for API calls +/// * `context` - The reflection context containing task and conversation info +/// * `judge_model` - Optional model to use for judging (overrides client's default) +pub async fn evaluate_reflection( + client: &ModelClient, + context: ReflectionContext, + judge_model: Option<&str>, +) -> Result { + info!( + "Running reflection evaluation (attempt {}), judge_model={:?}", + context.attempt, judge_model + ); + + let judge_prompt = context.build_judge_prompt(); + debug!("Judge prompt length: {} chars", judge_prompt.len()); + + // Create a client with the judge model override if specified + let effective_client = match judge_model { + Some(model) => client.with_model_override(model), + None => client.clone(), + }; + + let prompt = Prompt { + input: vec![ResponseItem::Message { + id: None, + role: "user".to_string(), + content: vec![ContentItem::InputText { text: judge_prompt }], + }], + tools: vec![], // Judge doesn't need tools + parallel_tool_calls: false, + base_instructions_override: Some( + "You are a task verification judge. Respond only with JSON.".to_string(), + ), + output_schema: Some(verdict_json_schema()), + }; + + // Stream the response and collect it + let mut stream = effective_client.stream(&prompt).await?; + let mut response_text = String::new(); + + while let Some(event) = stream.next().await { + match event { + Ok(ResponseEvent::OutputTextDelta(delta)) => { + response_text.push_str(&delta); + } + Ok(ResponseEvent::Completed { .. 
}) => { + break; + } + Err(e) => { + warn!("Error during reflection stream: {}", e); + break; + } + _ => {} + } + } + + debug!("Judge response: {}", response_text); + parse_verdict(&response_text) +} + +#[cfg(test)] +mod tests { + use super::*; + + // ==================== Verdict Parsing Tests ==================== + + #[test] + fn test_parse_verdict_valid_pass() { + let response = r#"{"completed": true, "confidence": 0.95, "reasoning": "Task was done", "feedback": null}"#; + let verdict = parse_verdict(response).unwrap(); + assert!(verdict.completed); + assert!(verdict.confidence > 0.9); + assert!(verdict.feedback.is_none()); + assert_eq!(verdict.reasoning, "Task was done"); + } + + #[test] + fn test_parse_verdict_valid_fail() { + let response = r#"{"completed": false, "confidence": 0.7, "reasoning": "Task incomplete", "feedback": "Please add error handling"}"#; + let verdict = parse_verdict(response).unwrap(); + assert!(!verdict.completed); + assert!(verdict.confidence < 0.8); + assert_eq!( + verdict.feedback, + Some("Please add error handling".to_string()) + ); + } + + #[test] + fn test_parse_verdict_with_surrounding_text() { + let response = r#"Here is my evaluation: +{"completed": false, "confidence": 0.8, "reasoning": "Missing tests", "feedback": "Add unit tests"} +That's my verdict."#; + let verdict = parse_verdict(response).unwrap(); + assert!(!verdict.completed); + assert_eq!(verdict.feedback, Some("Add unit tests".to_string())); + } + + #[test] + fn test_parse_verdict_malformed_json_returns_default() { + let response = "This is not valid JSON at all!"; + let verdict = parse_verdict(response).unwrap(); + // Should return default "completed" verdict when parsing fails + // This prevents blocking on judge parse errors + assert!(verdict.completed); + assert_eq!(verdict.confidence, 0.5); + assert!(verdict.feedback.is_none()); + } + + #[test] + fn test_parse_verdict_partial_json() { + let response = r#"{"completed": true"#; // Incomplete JSON + let verdict = parse_verdict(response).unwrap(); + // Should return default "completed" on parse failure + assert!(verdict.completed); + } + + // ==================== String Utilities Tests ==================== + + #[test] + fn test_truncate_string_no_truncation() { + assert_eq!(truncate_string("hello", 10), "hello"); + assert_eq!(truncate_string("", 10), ""); + } + + #[test] + fn test_truncate_string_with_truncation() { + assert_eq!(truncate_string("hello world", 8), "hello..."); + assert_eq!(truncate_string("abcdefghij", 5), "ab..."); + } + + #[test] + fn test_truncate_string_edge_cases() { + assert_eq!(truncate_string("abc", 3), "abc"); // Exact length + assert_eq!(truncate_string("abcd", 3), "..."); // Just under, adds ellipsis + } + + // ==================== Error Detection Tests ==================== + + #[test] + fn test_output_indicates_error_real_errors() { + // These should all be detected as errors + assert!(output_indicates_error("Error: file not found")); + assert!(output_indicates_error("Failed: connection refused")); + assert!(output_indicates_error("permission denied")); + assert!(output_indicates_error("Command 'foo' not found")); + assert!(output_indicates_error("Build failed with 3 errors")); + assert!(output_indicates_error("Test failed: expected 5, got 3")); + assert!(output_indicates_error("Traceback (most recent call last):")); + assert!(output_indicates_error("panic: runtime error")); + assert!(output_indicates_error("Cannot open file")); + assert!(output_indicates_error("exit code 1")); + } + + #[test] + fn 
test_output_indicates_error_false_positives() { + // These should NOT be detected as errors (false positives in naive approach) + assert!(!output_indicates_error("Implemented error handling for edge cases")); + assert!(!output_indicates_error("Added error message display")); + assert!(!output_indicates_error("No errors found in the codebase")); + assert!(!output_indicates_error("The function handles errors gracefully")); + assert!(!output_indicates_error("0 errors, 0 warnings")); + assert!(!output_indicates_error("Build completed without error")); + } + + #[test] + fn test_output_indicates_error_success_messages() { + // These are clearly successful outputs + assert!(!output_indicates_error("File created successfully")); + assert!(!output_indicates_error("All tests passed")); + assert!(!output_indicates_error("Build completed in 2.3s")); + assert!(!output_indicates_error("hello world")); + assert!(!output_indicates_error("")); + } + + #[test] + fn test_output_indicates_error_mixed_with_strong_indicator() { + // Even if false positive phrases exist, strong indicators should trigger + assert!(output_indicates_error("Error: no error handler found")); + assert!(output_indicates_error("0 errors in file, but Failed: compilation error")); + } + + #[test] + fn test_extract_json_simple() { + let s = "Some text {\"key\": \"value\"} more text"; + assert_eq!(extract_json(s), r#"{"key": "value"}"#); + } + + #[test] + fn test_extract_json_nested() { + let s = r#"Result: {"outer": {"inner": "value"}, "arr": [1,2,3]}"#; + assert_eq!( + extract_json(s), + r#"{"outer": {"inner": "value"}, "arr": [1,2,3]}"# + ); + } + + #[test] + fn test_extract_json_no_json() { + let s = "No JSON here"; + assert_eq!(extract_json(s), "No JSON here"); + } + + #[test] + fn test_extract_json_only_json() { + let s = r#"{"completed": true}"#; + assert_eq!(extract_json(s), r#"{"completed": true}"#); + } + + // ==================== ReflectionContext Tests ==================== + + #[test] + fn test_reflection_context_creation() { + let context = ReflectionContext { + initial_task: "Create a hello world program".to_string(), + tool_calls: vec![], + final_response: Some("Done!".to_string()), + attempt: 1, + max_attempts: 3, + }; + + assert_eq!(context.initial_task, "Create a hello world program"); + assert_eq!(context.attempt, 1); + assert!(context.tool_calls.is_empty()); + } + + #[test] + fn test_reflection_context_with_tool_calls() { + let context = ReflectionContext { + initial_task: "Run a shell command".to_string(), + tool_calls: vec![ToolCallSummary { + tool_name: "shell".to_string(), + arguments: r#"{"command": "echo hello"}"#.to_string(), + result: "hello\n".to_string(), + success: true, + }], + final_response: Some("Executed successfully".to_string()), + attempt: 1, + max_attempts: 3, + }; + + assert_eq!(context.tool_calls.len(), 1); + assert_eq!(context.tool_calls[0].tool_name, "shell"); + assert!(context.tool_calls[0].success); + } + + #[test] + fn test_build_judge_prompt_contains_task() { + let context = ReflectionContext { + initial_task: "Create a Python file".to_string(), + tool_calls: vec![], + final_response: Some("I created the file".to_string()), + attempt: 1, + max_attempts: 3, + }; + + let prompt = context.build_judge_prompt(); + assert!(prompt.contains("Create a Python file")); + assert!(prompt.contains("I created the file")); + assert!(prompt.contains("attempt 1")); + } + + #[test] + fn test_build_judge_prompt_with_tool_calls() { + let context = ReflectionContext { + initial_task: "Run tests".to_string(), + 
tool_calls: vec![ToolCallSummary { + tool_name: "shell".to_string(), + arguments: "npm test".to_string(), + result: "All tests passed".to_string(), + success: true, + }], + final_response: None, + attempt: 2, + max_attempts: 3, + }; + + let prompt = context.build_judge_prompt(); + assert!(prompt.contains("shell")); + assert!(prompt.contains("npm test")); + assert!(prompt.contains("All tests passed")); + assert!(prompt.contains("success")); + } + + #[test] + fn test_build_judge_prompt_no_tool_calls() { + let context = ReflectionContext { + initial_task: "Explain something".to_string(), + tool_calls: vec![], + final_response: Some("Here is the explanation".to_string()), + attempt: 1, + max_attempts: 3, + }; + + let prompt = context.build_judge_prompt(); + assert!(prompt.contains("No tool calls were made")); + } + + // ==================== ToolCallSummary Tests ==================== + + #[test] + fn test_tool_call_summary_success() { + let summary = ToolCallSummary { + tool_name: "write_file".to_string(), + arguments: r#"{"path": "/tmp/test.txt", "content": "hello"}"#.to_string(), + result: "File written successfully".to_string(), + success: true, + }; + + assert!(summary.success); + assert_eq!(summary.tool_name, "write_file"); + } + + #[test] + fn test_tool_call_summary_failure() { + let summary = ToolCallSummary { + tool_name: "shell".to_string(), + arguments: "rm -rf /".to_string(), + result: "Permission denied".to_string(), + success: false, + }; + + assert!(!summary.success); + assert!(summary.result.contains("Permission denied")); + } + + // ==================== ReflectionVerdict Tests ==================== + + #[test] + fn test_reflection_verdict_serialize() { + let verdict = ReflectionVerdict { + completed: true, + confidence: 0.95, + feedback: None, + reasoning: "All good".to_string(), + }; + + let json = serde_json::to_string(&verdict).unwrap(); + assert!(json.contains("\"completed\":true")); + assert!(json.contains("\"confidence\":0.95")); + } + + #[test] + fn test_reflection_verdict_deserialize() { + let json = r#"{"completed": false, "confidence": 0.6, "reasoning": "Needs work", "feedback": "Add tests"}"#; + let verdict: ReflectionVerdict = serde_json::from_str(json).unwrap(); + + assert!(!verdict.completed); + assert_eq!(verdict.confidence, 0.6); + assert_eq!(verdict.feedback, Some("Add tests".to_string())); + } + + // ==================== Integration Tests ==================== + // These tests verify the reflection layer works with real ResponseItem data + + #[test] + fn test_from_conversation_with_function_calls() { + use codex_protocol::models::{ContentItem, FunctionCallOutputPayload, ResponseItem}; + + // Simulate a conversation with tool calls + let items = vec![ + // User message + ResponseItem::Message { + id: Some("msg1".to_string()), + role: "user".to_string(), + content: vec![ContentItem::InputText { + text: "Create a file called test.txt".to_string(), + }], + }, + // Assistant calls a tool + ResponseItem::FunctionCall { + id: Some("fc1".to_string()), + name: "write_file".to_string(), + arguments: r#"{"path": "test.txt", "content": "hello"}"#.to_string(), + call_id: "call_123".to_string(), + }, + // Tool output + ResponseItem::FunctionCallOutput { + call_id: "call_123".to_string(), + output: FunctionCallOutputPayload { + content: "File created successfully".to_string(), + ..Default::default() + }, + }, + // Assistant final response + ResponseItem::Message { + id: Some("msg2".to_string()), + role: "assistant".to_string(), + content: vec![ContentItem::OutputText { + text: 
"I've created the file test.txt with the content 'hello'.".to_string(), + }], + }, + ]; + + let context = ReflectionContext::from_conversation( + "Create a file called test.txt".to_string(), + &items, + 1, + 3, + ); + + // Verify context was built correctly + assert_eq!(context.initial_task, "Create a file called test.txt"); + assert_eq!(context.attempt, 1); + assert!(context.final_response.is_some()); + assert!( + context + .final_response + .as_ref() + .unwrap() + .contains("created the file") + ); + + // Verify tool calls were captured + assert_eq!(context.tool_calls.len(), 1); + assert_eq!(context.tool_calls[0].tool_name, "write_file"); + assert!(context.tool_calls[0].result.contains("successfully")); + assert!(context.tool_calls[0].success); // No "error" in output + } + + #[test] + fn test_from_conversation_with_failed_tool_call() { + use codex_protocol::models::{ContentItem, FunctionCallOutputPayload, ResponseItem}; + + let items = vec![ + ResponseItem::FunctionCall { + id: Some("fc1".to_string()), + name: "shell".to_string(), + arguments: r#"{"command": "rm important_file"}"#.to_string(), + call_id: "call_456".to_string(), + }, + ResponseItem::FunctionCallOutput { + call_id: "call_456".to_string(), + output: FunctionCallOutputPayload { + content: "Error: Permission denied".to_string(), + ..Default::default() + }, + }, + ResponseItem::Message { + id: Some("msg1".to_string()), + role: "assistant".to_string(), + content: vec![ContentItem::OutputText { + text: "The command failed with an error.".to_string(), + }], + }, + ]; + + let context = + ReflectionContext::from_conversation("Delete the file".to_string(), &items, 2, 3); + + // Tool call should be marked as failed due to "Error" in output + assert_eq!(context.tool_calls.len(), 1); + assert!(!context.tool_calls[0].success); + assert!(context.tool_calls[0].result.contains("Permission denied")); + } + + #[test] + fn test_from_conversation_empty_items() { + let items: Vec = vec![]; + let context = + ReflectionContext::from_conversation("Do something".to_string(), &items, 1, 3); + + assert_eq!(context.initial_task, "Do something"); + assert!(context.tool_calls.is_empty()); + assert!(context.final_response.is_none()); + } + + #[test] + fn test_reflection_feedback_message_format() { + // Test that the feedback message format is correct + let verdict = ReflectionVerdict { + completed: false, + confidence: 0.7, + reasoning: "The file was created but tests are missing".to_string(), + feedback: Some("Please add unit tests for the new function".to_string()), + }; + + // Simulate the feedback message construction from codex.rs + let feedback_msg = format!( + "[Reflection Judge - Attempt {}/{}] Task verification failed.\n\nReasoning: {}\n\nFeedback: {}\n\nPlease address the above feedback and complete the task.", + 1, + 3, + verdict.reasoning, + verdict.feedback.as_ref().unwrap() + ); + + assert!(feedback_msg.contains("Attempt 1/3")); + assert!(feedback_msg.contains("tests are missing")); + assert!(feedback_msg.contains("add unit tests")); + } + + #[test] + fn test_judge_prompt_includes_all_context() { + use codex_protocol::models::{ContentItem, FunctionCallOutputPayload, ResponseItem}; + + // Build a realistic context + let items = vec![ + ResponseItem::FunctionCall { + id: None, + name: "read_file".to_string(), + arguments: r#"{"path": "src/main.rs"}"#.to_string(), + call_id: "c1".to_string(), + }, + ResponseItem::FunctionCallOutput { + call_id: "c1".to_string(), + output: FunctionCallOutputPayload { + content: "fn main() { 
println!(\"Hello\"); }".to_string(), + ..Default::default() + }, + }, + ResponseItem::FunctionCall { + id: None, + name: "write_file".to_string(), + arguments: r#"{"path": "src/main.rs", "content": "fn main() { println!(\"Hello, World!\"); }"}"#.to_string(), + call_id: "c2".to_string(), + }, + ResponseItem::FunctionCallOutput { + call_id: "c2".to_string(), + output: FunctionCallOutputPayload { + content: "File written".to_string(), + ..Default::default() + }, + }, + ResponseItem::Message { + id: None, + role: "assistant".to_string(), + content: vec![ContentItem::OutputText { + text: "I've updated the greeting message.".to_string(), + }], + }, + ]; + + let context = ReflectionContext::from_conversation( + "Update the greeting to say Hello, World!".to_string(), + &items, + 1, + 3, + ); + + let prompt = context.build_judge_prompt(); + + // Verify the judge prompt contains all necessary information + assert!(prompt.contains("Update the greeting")); + assert!(prompt.contains("read_file")); + assert!(prompt.contains("write_file")); + assert!(prompt.contains("updated the greeting")); + assert!(prompt.contains("attempt 1")); + assert!(prompt.contains(r#""completed": true/false"#)); // JSON schema hint + } +} diff --git a/codex-rs/core/src/rollout/policy.rs b/codex-rs/core/src/rollout/policy.rs index 2980e768cf8..363557677a3 100644 --- a/codex-rs/core/src/rollout/policy.rs +++ b/codex-rs/core/src/rollout/policy.rs @@ -46,7 +46,8 @@ pub(crate) fn should_persist_event_msg(ev: &EventMsg) -> bool { | EventMsg::EnteredReviewMode(_) | EventMsg::ExitedReviewMode(_) | EventMsg::UndoCompleted(_) - | EventMsg::TurnAborted(_) => true, + | EventMsg::TurnAborted(_) + | EventMsg::ReflectionVerdict(_) => true, EventMsg::Error(_) | EventMsg::Warning(_) | EventMsg::TaskStarted(_) diff --git a/codex-rs/core/tests/suite/eval_swe_bench.rs b/codex-rs/core/tests/suite/eval_swe_bench.rs new file mode 100644 index 00000000000..5247da5810a --- /dev/null +++ b/codex-rs/core/tests/suite/eval_swe_bench.rs @@ -0,0 +1,635 @@ +//! SWE-bench style evaluation tests. +//! +//! These tests evaluate the agent's ability to fix real-world style bugs, +//! inspired by SWE-bench benchmark tasks. Each test provides: +//! - A codebase with a bug +//! - An issue description +//! - Test files that verify the fix +//! +//! Run with reflection enabled to compare results: +//! ```sh +//! AZURE_OPENAI_API_KEY= AZURE_OPENAI_BASE_URL= \ +//! cargo test -p codex-core --test all eval_swe -- --ignored --nocapture +//! ``` + +use assert_cmd::prelude::*; +use std::fs; +use std::io::{Read, Write}; +use std::process::{Command, Stdio}; +use std::thread; +use tempfile::TempDir; + +fn require_azure_credentials() -> (String, String, String) { + let api_key = std::env::var("AZURE_OPENAI_API_KEY") + .expect("AZURE_OPENAI_API_KEY env var not set"); + let base_url = std::env::var("AZURE_OPENAI_BASE_URL") + .expect("AZURE_OPENAI_BASE_URL env var not set"); + let model = std::env::var("AZURE_OPENAI_MODEL").unwrap_or_else(|_| "gpt-5-mini".to_string()); + (api_key, base_url, model) +} + +/// Creates config.toml with optional reflection setting. 
+fn create_config(base_url: &str, model: &str, reflection_enabled: bool) -> String { + let base_url = base_url.trim_end_matches('/'); + let base_url = if base_url.ends_with("/openai") { + base_url.to_string() + } else { + format!("{}/openai", base_url) + }; + + let reflection_config = if reflection_enabled { + r#" +[reflection] +enabled = true +max_attempts = 3 + +[features] +reflection = true +"# + } else { + "" + }; + + format!( + r#" +model = "{model}" +model_provider = "azure-openai" +{reflection_config} +[model_providers.azure-openai] +name = "Azure OpenAI" +base_url = "{base_url}" +env_key = "AZURE_OPENAI_API_KEY" +wire_api = "responses" +request_max_retries = 3 +stream_max_retries = 3 +stream_idle_timeout_ms = 120000 + +[model_providers.azure-openai.query_params] +api-version = "2025-04-01-preview" +"# + ) +} + +/// Result of an eval run. +#[derive(Debug)] +struct EvalResult { + success: bool, + reflection_used: bool, + test_output: String, + reflection_verdicts: Vec, +} + +/// Run an eval task and return the result. +fn run_eval_task( + prompt: &str, + setup_files: &[(&str, &str)], + test_command: &str, + reflection_enabled: bool, +) -> EvalResult { + #![expect(clippy::unwrap_used)] + + let (api_key, base_url, model) = require_azure_credentials(); + + let dir = TempDir::new().unwrap(); + let work_dir = dir.path(); + + // Create setup files (buggy codebase) + for (path, content) in setup_files { + let file_path = work_dir.join(path); + if let Some(parent) = file_path.parent() { + fs::create_dir_all(parent).unwrap(); + } + fs::write(&file_path, content).unwrap(); + } + + // Create .codex directory with config + let codex_home = work_dir.join(".codex"); + fs::create_dir_all(&codex_home).unwrap(); + fs::write( + codex_home.join("config.toml"), + create_config(&base_url, &model, reflection_enabled), + ) + .unwrap(); + + // Run codex + let mut cmd = Command::cargo_bin("codex").unwrap(); + cmd.current_dir(work_dir); + cmd.env("AZURE_OPENAI_API_KEY", api_key); + cmd.env("CODEX_HOME", &codex_home); + cmd.env("RUST_LOG", "codex_core=info"); + + cmd.arg("exec") + .arg("--full-auto") + .arg("--skip-git-repo-check") + .arg("--color") + .arg("never") + .arg(prompt); + + cmd.stdin(Stdio::null()); + cmd.stdout(Stdio::piped()); + cmd.stderr(Stdio::piped()); + + let mut child = cmd.spawn().expect("failed to spawn codex"); + + // Tee helper + fn tee( + mut reader: R, + mut writer: impl Write + Send + 'static, + ) -> thread::JoinHandle> { + thread::spawn(move || { + let mut buf = Vec::new(); + let mut chunk = [0u8; 4096]; + loop { + match reader.read(&mut chunk) { + Ok(0) => break, + Ok(n) => { + writer.write_all(&chunk[..n]).ok(); + writer.flush().ok(); + buf.extend_from_slice(&chunk[..n]); + } + Err(_) => break, + } + } + buf + }) + } + + let stdout_handle = tee( + child.stdout.take().expect("child stdout"), + std::io::stdout(), + ); + let stderr_handle = tee( + child.stderr.take().expect("child stderr"), + std::io::stderr(), + ); + + let status = child.wait().expect("failed to wait on child"); + let stdout = stdout_handle.join().expect("stdout thread panicked"); + let stderr = stderr_handle.join().expect("stderr thread panicked"); + + let combined = format!( + "{}{}", + String::from_utf8_lossy(&stdout), + String::from_utf8_lossy(&stderr) + ); + + // Extract reflection verdicts + let reflection_verdicts: Vec = combined + .lines() + .filter(|line| line.contains("Reflection verdict")) + .map(|s| s.to_string()) + .collect(); + + // Run the verification test + let test_result = Command::new("bash") + 
.arg("-c") + .arg(test_command) + .current_dir(work_dir) + .output() + .expect("failed to run test"); + + let test_output = format!( + "stdout: {}\nstderr: {}", + String::from_utf8_lossy(&test_result.stdout), + String::from_utf8_lossy(&test_result.stderr) + ); + + EvalResult { + success: status.success() && test_result.status.success(), + reflection_used: !reflection_verdicts.is_empty(), + test_output, + reflection_verdicts, + } +} + +// ============================================================================ +// EVAL TASK 1: Off-by-one error in array processing +// ============================================================================ + +const TASK1_BUG_FILE: &str = r#" +def sum_first_n(arr, n): + """Return sum of first n elements of arr.""" + total = 0 + for i in range(n + 1): # BUG: should be range(n) + total += arr[i] + return total + +def get_last_element(arr): + """Return the last element of arr.""" + return arr[len(arr)] # BUG: should be arr[len(arr) - 1] or arr[-1] +"#; + +const TASK1_TEST_FILE: &str = r#" +import pytest +from array_utils import sum_first_n, get_last_element + +def test_sum_first_n(): + arr = [1, 2, 3, 4, 5] + assert sum_first_n(arr, 3) == 6 # 1 + 2 + 3 + assert sum_first_n(arr, 1) == 1 + assert sum_first_n(arr, 5) == 15 + +def test_get_last_element(): + assert get_last_element([1, 2, 3]) == 3 + assert get_last_element([10]) == 10 + assert get_last_element(['a', 'b', 'c']) == 'c' +"#; + +const TASK1_ISSUE: &str = r#"Fix the off-by-one errors in array_utils.py + +The file array_utils.py has two functions with index errors: + +1. `sum_first_n(arr, n)` - Should return sum of first n elements, but it's including one extra element +2. `get_last_element(arr)` - Should return the last element, but it's causing an IndexError + +Please fix these bugs so that test_array_utils.py passes. 
+ +After fixing, run: pytest test_array_utils.py -v +"#; + +#[ignore] +#[test] +fn eval_task1_offbyone_with_reflection() { + if std::env::var("AZURE_OPENAI_API_KEY").is_err() { + eprintln!("skipping eval — Azure credentials not set"); + return; + } + + let result = run_eval_task( + TASK1_ISSUE, + &[ + ("array_utils.py", TASK1_BUG_FILE), + ("test_array_utils.py", TASK1_TEST_FILE), + ], + "pytest test_array_utils.py -v", + true, // reflection enabled + ); + + println!("\n=== EVAL TASK 1 (with reflection) ==="); + println!("Success: {}", result.success); + println!("Reflection used: {}", result.reflection_used); + println!("Reflection verdicts: {:?}", result.reflection_verdicts); + println!("Test output:\n{}", result.test_output); + + assert!(result.success, "Task 1 failed with reflection enabled"); +} + +#[ignore] +#[test] +fn eval_task1_offbyone_without_reflection() { + if std::env::var("AZURE_OPENAI_API_KEY").is_err() { + eprintln!("skipping eval — Azure credentials not set"); + return; + } + + let result = run_eval_task( + TASK1_ISSUE, + &[ + ("array_utils.py", TASK1_BUG_FILE), + ("test_array_utils.py", TASK1_TEST_FILE), + ], + "pytest test_array_utils.py -v", + false, // reflection disabled + ); + + println!("\n=== EVAL TASK 1 (without reflection) ==="); + println!("Success: {}", result.success); + println!("Reflection used: {}", result.reflection_used); + println!("Test output:\n{}", result.test_output); + + // Don't assert - just report for comparison + println!( + "Result: {}", + if result.success { "PASS" } else { "FAIL" } + ); +} + +// ============================================================================ +// EVAL TASK 2: Logic error in string processing +// ============================================================================ + +const TASK2_BUG_FILE: &str = r#" +def is_palindrome(s): + """Check if string is a palindrome (case-insensitive, ignoring spaces).""" + # BUG: doesn't handle case or spaces + return s == s[::-1] + +def count_words(text): + """Count number of words in text.""" + # BUG: doesn't handle multiple spaces or leading/trailing spaces + return len(text.split(' ')) +"#; + +const TASK2_TEST_FILE: &str = r#" +import pytest +from string_utils import is_palindrome, count_words + +def test_is_palindrome(): + assert is_palindrome("racecar") == True + assert is_palindrome("A man a plan a canal Panama") == True + assert is_palindrome("Race Car") == True + assert is_palindrome("hello") == False + +def test_count_words(): + assert count_words("hello world") == 2 + assert count_words(" hello world ") == 2 + assert count_words("one") == 1 + assert count_words("a b c d e") == 5 +"#; + +const TASK2_ISSUE: &str = r#"Fix logic errors in string_utils.py + +The string_utils.py file has two functions with logic bugs: + +1. `is_palindrome(s)` - Should check if string is palindrome, ignoring case and spaces + Currently fails on "Race Car" and "A man a plan a canal Panama" + +2. `count_words(text)` - Should count words, handling multiple spaces correctly + Currently gives wrong count for " hello world " + +Fix these bugs so that test_string_utils.py passes. 
+ +After fixing, run: pytest test_string_utils.py -v +"#; + +#[ignore] +#[test] +fn eval_task2_string_logic_with_reflection() { + if std::env::var("AZURE_OPENAI_API_KEY").is_err() { + eprintln!("skipping eval — Azure credentials not set"); + return; + } + + let result = run_eval_task( + TASK2_ISSUE, + &[ + ("string_utils.py", TASK2_BUG_FILE), + ("test_string_utils.py", TASK2_TEST_FILE), + ], + "pytest test_string_utils.py -v", + true, + ); + + println!("\n=== EVAL TASK 2 (with reflection) ==="); + println!("Success: {}", result.success); + println!("Reflection used: {}", result.reflection_used); + println!("Reflection verdicts: {:?}", result.reflection_verdicts); + println!("Test output:\n{}", result.test_output); + + assert!(result.success, "Task 2 failed with reflection enabled"); +} + +#[ignore] +#[test] +fn eval_task2_string_logic_without_reflection() { + if std::env::var("AZURE_OPENAI_API_KEY").is_err() { + eprintln!("skipping eval — Azure credentials not set"); + return; + } + + let result = run_eval_task( + TASK2_ISSUE, + &[ + ("string_utils.py", TASK2_BUG_FILE), + ("test_string_utils.py", TASK2_TEST_FILE), + ], + "pytest test_string_utils.py -v", + false, + ); + + println!("\n=== EVAL TASK 2 (without reflection) ==="); + println!("Success: {}", result.success); + println!("Reflection used: {}", result.reflection_used); + println!("Test output:\n{}", result.test_output); + + println!( + "Result: {}", + if result.success { "PASS" } else { "FAIL" } + ); +} + +// ============================================================================ +// EVAL TASK 3: Missing edge case handling +// ============================================================================ + +const TASK3_BUG_FILE: &str = r#" +def safe_divide(a, b): + """Safely divide a by b, return None on error.""" + # BUG: doesn't handle division by zero + return a / b + +def find_max(numbers): + """Find maximum value in list.""" + # BUG: doesn't handle empty list + max_val = numbers[0] + for n in numbers[1:]: + if n > max_val: + max_val = n + return max_val + +def get_element_at(arr, index): + """Get element at index, return None if out of bounds.""" + # BUG: doesn't handle negative indices or out of bounds + return arr[index] +"#; + +const TASK3_TEST_FILE: &str = r#" +import pytest +from math_utils import safe_divide, find_max, get_element_at + +def test_safe_divide(): + assert safe_divide(10, 2) == 5.0 + assert safe_divide(0, 5) == 0.0 + assert safe_divide(10, 0) is None # Division by zero + assert safe_divide(10, 3) == pytest.approx(3.333, rel=0.01) + +def test_find_max(): + assert find_max([1, 5, 3, 9, 2]) == 9 + assert find_max([42]) == 42 + assert find_max([-5, -1, -10]) == -1 + assert find_max([]) is None # Empty list + +def test_get_element_at(): + arr = [10, 20, 30] + assert get_element_at(arr, 0) == 10 + assert get_element_at(arr, 2) == 30 + assert get_element_at(arr, 5) is None # Out of bounds + assert get_element_at(arr, -1) is None # Negative index + assert get_element_at([], 0) is None # Empty array +"#; + +const TASK3_ISSUE: &str = r#"Fix missing edge case handling in math_utils.py + +The math_utils.py has three functions that don't handle edge cases properly: + +1. `safe_divide(a, b)` - Should return None when dividing by zero, but crashes instead + +2. `find_max(numbers)` - Should return None for empty list, but crashes with IndexError + +3. 
`get_element_at(arr, index)` - Should return None for:
+   - Out of bounds indices
+   - Negative indices
+   - Empty arrays
+   Currently crashes instead
+
+Fix these edge cases so that test_math_utils.py passes.
+
+After fixing, run: pytest test_math_utils.py -v
+"#;
+
+#[ignore]
+#[test]
+fn eval_task3_edge_cases_with_reflection() {
+    if std::env::var("AZURE_OPENAI_API_KEY").is_err() {
+        eprintln!("skipping eval — Azure credentials not set");
+        return;
+    }
+
+    let result = run_eval_task(
+        TASK3_ISSUE,
+        &[
+            ("math_utils.py", TASK3_BUG_FILE),
+            ("test_math_utils.py", TASK3_TEST_FILE),
+        ],
+        "pytest test_math_utils.py -v",
+        true,
+    );
+
+    println!("\n=== EVAL TASK 3 (with reflection) ===");
+    println!("Success: {}", result.success);
+    println!("Reflection used: {}", result.reflection_used);
+    println!("Reflection verdicts: {:?}", result.reflection_verdicts);
+    println!("Test output:\n{}", result.test_output);
+
+    assert!(result.success, "Task 3 failed with reflection enabled");
+}
+
+#[ignore]
+#[test]
+fn eval_task3_edge_cases_without_reflection() {
+    if std::env::var("AZURE_OPENAI_API_KEY").is_err() {
+        eprintln!("skipping eval — Azure credentials not set");
+        return;
+    }
+
+    let result = run_eval_task(
+        TASK3_ISSUE,
+        &[
+            ("math_utils.py", TASK3_BUG_FILE),
+            ("test_math_utils.py", TASK3_TEST_FILE),
+        ],
+        "pytest test_math_utils.py -v",
+        false,
+    );
+
+    println!("\n=== EVAL TASK 3 (without reflection) ===");
+    println!("Success: {}", result.success);
+    println!("Reflection used: {}", result.reflection_used);
+    println!("Test output:\n{}", result.test_output);
+
+    println!(
+        "Result: {}",
+        if result.success { "PASS" } else { "FAIL" }
+    );
+}
+
+/// Run all eval tasks and summarize results.
+#[ignore]
+#[test]
+fn eval_summary() {
+    if std::env::var("AZURE_OPENAI_API_KEY").is_err() {
+        eprintln!("skipping eval — Azure credentials not set");
+        return;
+    }
+
+    println!("\n========================================");
+    println!("SWE-BENCH STYLE EVALUATION SUMMARY");
+    println!("========================================\n");
+
+    let tasks = [
+        (
+            "Task 1: Off-by-one errors",
+            TASK1_ISSUE,
+            vec![
+                ("array_utils.py", TASK1_BUG_FILE),
+                ("test_array_utils.py", TASK1_TEST_FILE),
+            ],
+            "pytest test_array_utils.py -v",
+        ),
+        (
+            "Task 2: String logic errors",
+            TASK2_ISSUE,
+            vec![
+                ("string_utils.py", TASK2_BUG_FILE),
+                ("test_string_utils.py", TASK2_TEST_FILE),
+            ],
+            "pytest test_string_utils.py -v",
+        ),
+        (
+            "Task 3: Missing edge cases",
+            TASK3_ISSUE,
+            vec![
+                ("math_utils.py", TASK3_BUG_FILE),
+                ("test_math_utils.py", TASK3_TEST_FILE),
+            ],
+            "pytest test_math_utils.py -v",
+        ),
+    ];
+
+    let mut with_reflection_pass = 0;
+    let mut without_reflection_pass = 0;
+
+    for (name, issue, files, test_cmd) in tasks.iter() {
+        println!("--- {} ---", name);
+
+        // With reflection
+        let result_with = run_eval_task(
+            issue,
+            &files.iter().map(|(a, b)| (*a, *b)).collect::<Vec<_>>(),
+            test_cmd,
+            true,
+        );
+        if result_with.success {
+            with_reflection_pass += 1;
+        }
+        println!(
+            " With reflection: {} (verdicts: {})",
+            if result_with.success { "PASS" } else { "FAIL" },
+            result_with.reflection_verdicts.len()
+        );
+
+        // Without reflection
+        let result_without = run_eval_task(
+            issue,
+            &files.iter().map(|(a, b)| (*a, *b)).collect::<Vec<_>>(),
+            test_cmd,
+            false,
+        );
+        if result_without.success {
+            without_reflection_pass += 1;
+        }
+        println!(
+            " Without reflection: {}",
+            if result_without.success { "PASS" } else { "FAIL" }
+        );
+        println!();
+    }
+
+    println!("========================================");
+    println!("RESULTS");
+    println!("========================================");
+    println!(
+        "With reflection: {}/{} tasks passed",
+        with_reflection_pass,
+        tasks.len()
+    );
+    println!(
+        "Without reflection: {}/{} tasks passed",
+        without_reflection_pass,
+        tasks.len()
+    );
+    println!(
+        "Improvement: {:+} tasks",
+        with_reflection_pass as i32 - without_reflection_pass as i32
+    );
+}
diff --git a/codex-rs/core/tests/suite/mod.rs b/codex-rs/core/tests/suite/mod.rs
index e047899d722..b3dddb9d622 100644
--- a/codex-rs/core/tests/suite/mod.rs
+++ b/codex-rs/core/tests/suite/mod.rs
@@ -26,6 +26,7 @@ mod compact;
 mod compact_remote;
 mod compact_resume_fork;
 mod deprecation_notice;
+mod eval_swe_bench;
 mod exec;
 mod exec_policy;
 mod fork_conversation;
@@ -41,6 +42,7 @@ mod otel;
 mod prompt_caching;
 mod quota_exceeded;
 mod read_file;
+mod reflection;
 mod remote_models;
 mod resume;
 mod review;
diff --git a/codex-rs/core/tests/suite/reflection.rs b/codex-rs/core/tests/suite/reflection.rs
new file mode 100644
index 00000000000..aa7051a39c0
--- /dev/null
+++ b/codex-rs/core/tests/suite/reflection.rs
@@ -0,0 +1,264 @@
+//! Integration tests for the reflection layer.
+//!
+//! These tests verify that the reflection layer works correctly with real API calls.
+//! They are `#[ignore]` by default so CI stays deterministic. Run them locally with:
+//!
+//! ```sh
+//! # For Azure OpenAI:
+//! AZURE_OPENAI_API_KEY=<key> AZURE_OPENAI_BASE_URL=<endpoint> \
+//! cargo test -p codex-core --test all reflection -- --ignored --nocapture
+//!
+//! # Optionally specify a model (defaults to gpt-5-mini):
+//! AZURE_OPENAI_API_KEY=<key> AZURE_OPENAI_BASE_URL=<endpoint> AZURE_OPENAI_MODEL=gpt-4o \
+//! cargo test -p codex-core --test all reflection -- --ignored --nocapture
+//! ```
+
+use assert_cmd::prelude::*;
+use std::io::{Read, Write};
+use std::process::{Command, Stdio};
+use std::thread;
+use tempfile::TempDir;
+
+fn require_azure_credentials() -> (String, String, String) {
+    let api_key = std::env::var("AZURE_OPENAI_API_KEY")
+        .expect("AZURE_OPENAI_API_KEY env var not set — skip running Azure tests");
+    let base_url = std::env::var("AZURE_OPENAI_BASE_URL")
+        .expect("AZURE_OPENAI_BASE_URL env var not set — skip running Azure tests");
+    let model = std::env::var("AZURE_OPENAI_MODEL").unwrap_or_else(|_| "gpt-5-mini".to_string());
+    (api_key, base_url, model)
+}
+
+/// Creates a config.toml for Azure OpenAI with reflection enabled.
+fn create_azure_config(base_url: &str, model: &str) -> String {
+    // Ensure base_url ends with /openai
+    let base_url = base_url.trim_end_matches('/');
+    let base_url = if base_url.ends_with("/openai") {
+        base_url.to_string()
+    } else {
+        format!("{}/openai", base_url)
+    };
+
+    format!(
+        r#"
+model = "{model}"
+model_provider = "azure-openai"
+
+[reflection]
+enabled = true
+max_attempts = 3
+
+[features]
+reflection = true
+
+[model_providers.azure-openai]
+name = "Azure OpenAI"
+base_url = "{base_url}"
+env_key = "AZURE_OPENAI_API_KEY"
+wire_api = "responses"
+request_max_retries = 3
+stream_max_retries = 3
+stream_idle_timeout_ms = 120000
+
+[model_providers.azure-openai.query_params]
+api-version = "2025-04-01-preview"
+"#
+    )
+}
+
+/// Helper that spawns codex exec with Azure OpenAI config and reflection enabled.
+/// Returns (Assert, TempDir, stdout, stderr).
+fn run_azure_reflection_test(prompt: &str) -> (assert_cmd::assert::Assert, TempDir, Vec<u8>, Vec<u8>) {
+    #![expect(clippy::unwrap_used)]
+
+    let (api_key, base_url, model) = require_azure_credentials();
+
+    let dir = TempDir::new().unwrap();
+
+    // Create .codex directory with config
+    let codex_home = dir.path().join(".codex");
+    std::fs::create_dir_all(&codex_home).unwrap();
+    std::fs::write(codex_home.join("config.toml"), create_azure_config(&base_url, &model)).unwrap();
+
+    let mut cmd = Command::cargo_bin("codex").unwrap();
+    cmd.current_dir(dir.path());
+    cmd.env("AZURE_OPENAI_API_KEY", api_key);
+    cmd.env("CODEX_HOME", &codex_home);
+    cmd.env("RUST_LOG", "codex_core=info,reflection=debug");
+
+    cmd.arg("exec")
+        .arg("--full-auto")
+        .arg("--skip-git-repo-check")
+        .arg("--color")
+        .arg("never")
+        .arg(prompt);
+
+    cmd.stdin(Stdio::null());
+    cmd.stdout(Stdio::piped());
+    cmd.stderr(Stdio::piped());
+
+    let mut child = cmd.spawn().expect("failed to spawn codex");
+
+    // Tee helper - copies stream to both parent stdio and buffer
+    fn tee<R: Read + Send + 'static>(
+        mut reader: R,
+        mut writer: impl Write + Send + 'static,
+    ) -> thread::JoinHandle<Vec<u8>> {
+        thread::spawn(move || {
+            let mut buf = Vec::new();
+            let mut chunk = [0u8; 4096];
+            loop {
+                match reader.read(&mut chunk) {
+                    Ok(0) => break,
+                    Ok(n) => {
+                        writer.write_all(&chunk[..n]).ok();
+                        writer.flush().ok();
+                        buf.extend_from_slice(&chunk[..n]);
+                    }
+                    Err(_) => break,
+                }
+            }
+            buf
+        })
+    }
+
+    let stdout_handle = tee(
+        child.stdout.take().expect("child stdout"),
+        std::io::stdout(),
+    );
+    let stderr_handle = tee(
+        child.stderr.take().expect("child stderr"),
+        std::io::stderr(),
+    );
+
+    let status = child.wait().expect("failed to wait on child");
+    let stdout = stdout_handle.join().expect("stdout thread panicked");
+    let stderr = stderr_handle.join().expect("stderr thread panicked");
+
+    let output = std::process::Output {
+        status,
+        stdout: stdout.clone(),
+        stderr: stderr.clone(),
+    };
+
+    (output.assert(), dir, stdout, stderr)
+}
+
+/// Integration test for reflection layer with Azure OpenAI.
+///
+/// This test:
+/// 1. Connects to Azure OpenAI using AZURE_OPENAI_API_KEY and AZURE_OPENAI_BASE_URL
+/// 2. Uses gpt-5-mini model with reflection enabled
+/// 3. Asks codex to create a hello world Python app and test it
+/// 4. Verifies the reflection layer was invoked
+#[ignore]
+#[test]
+fn reflection_layer_hello_world_with_azure_openai() {
+    if std::env::var("AZURE_OPENAI_API_KEY").is_err()
+        || std::env::var("AZURE_OPENAI_BASE_URL").is_err()
+    {
+        eprintln!(
+            "skipping reflection test — AZURE_OPENAI_API_KEY or AZURE_OPENAI_BASE_URL not set"
+        );
+        return;
+    }
+
+    let prompt = r#"Write a hello world Python application that prints exactly "hello, world" (with a lowercase h and trailing newline). Name the file hello.py.
+
+Then create a test file called test_hello.py that:
+1. Runs hello.py using subprocess
+2. Captures its output
+3. 
Asserts that the output is exactly "hello, world\n" + +After creating both files, run the test to verify it passes."#; + + let (assert, dir, stdout, stderr) = run_azure_reflection_test(prompt); + + // Test should succeed + assert.success(); + + // Verify hello.py was created + let hello_path = dir.path().join("hello.py"); + assert!( + hello_path.exists(), + "hello.py was not created by the model" + ); + + // Verify test_hello.py was created + let test_path = dir.path().join("test_hello.py"); + assert!( + test_path.exists(), + "test_hello.py was not created by the model" + ); + + // Check that reflection layer was invoked by looking at both stdout and stderr (logs) + let stdout_str = String::from_utf8_lossy(&stdout); + let stderr_str = String::from_utf8_lossy(&stderr); + let combined_output = format!("{}{}", stdout_str, stderr_str); + + let reflection_invoked = combined_output.contains("Running reflection evaluation") + || combined_output.contains("Reflection verdict") + || combined_output.contains("Reflection:"); + + println!("\n=== Reflection Layer Status ==="); + if reflection_invoked { + println!("Reflection layer was invoked during task execution"); + + // Extract and print reflection verdict if present + for line in combined_output.lines() { + if line.contains("Reflection verdict") || line.contains("Reflection:") { + println!(" {}", line); + } + } + } else { + println!("WARNING: No explicit reflection activity detected in output"); + println!("This may indicate the reflection feature was not properly enabled"); + } + + // Verify hello.py content is correct + let hello_content = std::fs::read_to_string(&hello_path).unwrap(); + println!("\n=== hello.py ===\n{}", hello_content); + + // Verify test file content + let test_content = std::fs::read_to_string(&test_path).unwrap(); + println!("\n=== test_hello.py ===\n{}", test_content); + + // The reflection layer should have been invoked + assert!( + reflection_invoked, + "Reflection layer was not invoked - check that reflection feature is enabled" + ); +} + +/// Simple test to verify reflection config is correctly parsed. 
+#[ignore] +#[test] +fn reflection_config_azure_openai() { + if std::env::var("AZURE_OPENAI_API_KEY").is_err() + || std::env::var("AZURE_OPENAI_BASE_URL").is_err() + { + eprintln!("skipping config test — Azure credentials not set"); + return; + } + + // Simple prompt that should complete quickly + let prompt = "Print 'hello' using echo"; + + let (assert, _dir, stdout, stderr) = run_azure_reflection_test(prompt); + + assert.success(); + + // Verify reflection was at least attempted (logs go to stderr) + let stdout_str = String::from_utf8_lossy(&stdout); + let stderr_str = String::from_utf8_lossy(&stderr); + let combined_output = format!("{}{}", stdout_str, stderr_str); + println!("Output:\n{}", combined_output); + + // Should see reflection-related log messages + let has_reflection_logs = combined_output.contains("reflection") + || combined_output.contains("Reflection"); + + println!( + "\nReflection activity detected: {}", + has_reflection_logs + ); +} diff --git a/codex-rs/core/tests/suite/view_image.rs b/codex-rs/core/tests/suite/view_image.rs index 6c0f6dcc80c..0456d6de994 100644 --- a/codex-rs/core/tests/suite/view_image.rs +++ b/codex-rs/core/tests/suite/view_image.rs @@ -15,6 +15,8 @@ use core_test_support::responses::ev_function_call; use core_test_support::responses::ev_response_created; use core_test_support::responses::sse; use core_test_support::responses::start_mock_server; +use wiremock::ResponseTemplate; +use wiremock::matchers::body_string_contains; use core_test_support::skip_if_no_network; use core_test_support::test_codex::TestCodex; use core_test_support::test_codex::test_codex; diff --git a/codex-rs/exec/src/event_processor_with_human_output.rs b/codex-rs/exec/src/event_processor_with_human_output.rs index a833426dc22..032b9bf8a9c 100644 --- a/codex-rs/exec/src/event_processor_with_human_output.rs +++ b/codex-rs/exec/src/event_processor_with_human_output.rs @@ -22,6 +22,7 @@ use codex_core::protocol::TaskCompleteEvent; use codex_core::protocol::TurnAbortReason; use codex_core::protocol::TurnDiffEvent; use codex_core::protocol::WarningEvent; +use codex_core::protocol::ReflectionVerdictEvent; use codex_core::protocol::WebSearchEndEvent; use codex_protocol::num_format::format_with_separators; use owo_colors::OwoColorize; @@ -173,6 +174,34 @@ impl EventProcessor for EventProcessorWithHumanOutput { "warning:".style(self.yellow).style(self.bold) ); } + EventMsg::ReflectionVerdict(ev) => { + if ev.completed { + ts_msg!( + self, + "{} Task completed (confidence: {:.0}%)", + "✓ reflection:".style(self.green).style(self.bold), + ev.confidence * 100.0 + ); + ts_msg!(self, " {}", ev.reasoning.style(self.dimmed)); + } else { + ts_msg!( + self, + "{} Task incomplete - attempt {}/{} (confidence: {:.0}%)", + "⟳ reflection:".style(self.yellow).style(self.bold), + ev.attempt, + ev.max_attempts, + ev.confidence * 100.0 + ); + ts_msg!(self, " Reasoning: {}", ev.reasoning.style(self.dimmed)); + if let Some(feedback) = &ev.feedback { + ts_msg!( + self, + " Feedback: {}", + feedback.style(self.yellow) + ); + } + } + } EventMsg::DeprecationNotice(DeprecationNoticeEvent { summary, details }) => { ts_msg!( self, diff --git a/codex-rs/mcp-server/src/codex_tool_runner.rs b/codex-rs/mcp-server/src/codex_tool_runner.rs index 73e56a605ce..a0ad30c028a 100644 --- a/codex-rs/mcp-server/src/codex_tool_runner.rs +++ b/codex-rs/mcp-server/src/codex_tool_runner.rs @@ -308,7 +308,8 @@ async fn run_codex_tool_session_inner( | EventMsg::UndoCompleted(_) | EventMsg::ExitedReviewMode(_) | 
EventMsg::ContextCompacted(_)
-            | EventMsg::DeprecationNotice(_) => {
+            | EventMsg::DeprecationNotice(_)
+            | EventMsg::ReflectionVerdict(_) => {
                 // For now, we do not do anything extra for these
                 // events. Note that
                 // send(codex_event_to_notification(&event)) above has
diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs
index 201aba07efe..a2b5140f137 100644
--- a/codex-rs/protocol/src/protocol.rs
+++ b/codex-rs/protocol/src/protocol.rs
@@ -630,6 +630,9 @@ pub enum EventMsg {
     AgentMessageContentDelta(AgentMessageContentDeltaEvent),
     ReasoningContentDelta(ReasoningContentDeltaEvent),
     ReasoningRawContentDelta(ReasoningRawContentDeltaEvent),
+
+    /// Reflection layer verdict after evaluating task completion.
+    ReflectionVerdict(ReflectionVerdictEvent),
 }
 
 /// Codex errors that we expose to clients.
@@ -794,6 +797,24 @@ pub struct WarningEvent {
     pub message: String,
 }
 
+/// Event emitted when the reflection layer evaluates task completion.
+#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)]
+pub struct ReflectionVerdictEvent {
+    /// Whether the task was completed successfully.
+    pub completed: bool,
+    /// Confidence score from 0.0 to 1.0.
+    pub confidence: f32,
+    /// The judge's reasoning for the verdict.
+    pub reasoning: String,
+    /// Feedback for incomplete tasks (None if completed).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub feedback: Option<String>,
+    /// Current reflection attempt number.
+    pub attempt: u32,
+    /// Maximum allowed attempts.
+    pub max_attempts: u32,
+}
+
 #[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)]
 pub struct ContextCompactedEvent;
 
diff --git a/codex-rs/tui/src/chatwidget.rs b/codex-rs/tui/src/chatwidget.rs
index 37cd004a159..9d8393f0696 100644
--- a/codex-rs/tui/src/chatwidget.rs
+++ b/codex-rs/tui/src/chatwidget.rs
@@ -58,6 +58,7 @@ use codex_core::protocol::UndoStartedEvent;
 use codex_core::protocol::UserMessageEvent;
 use codex_core::protocol::ViewImageToolCallEvent;
 use codex_core::protocol::WarningEvent;
+use codex_core::protocol::ReflectionVerdictEvent;
 use codex_core::protocol::WebSearchBeginEvent;
 use codex_core::protocol::WebSearchEndEvent;
 use codex_core::skills::model::SkillMetadata;
@@ -690,6 +691,18 @@ impl ChatWidget {
         self.request_redraw();
     }
 
+    fn on_reflection_verdict(&mut self, ev: ReflectionVerdictEvent) {
+        self.add_to_history(history_cell::new_reflection_verdict(
+            ev.completed,
+            ev.confidence,
+            ev.reasoning,
+            ev.feedback,
+            ev.attempt,
+            ev.max_attempts,
+        ));
+        self.request_redraw();
+    }
+
     fn on_mcp_startup_update(&mut self, ev: McpStartupUpdateEvent) {
         let mut status = self.mcp_startup_status.take().unwrap_or_default();
         if let McpStartupStatus::Failed { error } = &ev.status {
@@ -1906,6 +1919,7 @@ impl ChatWidget {
                 self.on_entered_review_mode(review_request)
             }
             EventMsg::ExitedReviewMode(review) => self.on_exited_review_mode(review),
+            EventMsg::ReflectionVerdict(ev) => self.on_reflection_verdict(ev),
             EventMsg::ContextCompacted(_) => self.on_agent_message("Context compacted".to_owned()),
             EventMsg::RawResponseItem(_)
             | EventMsg::ItemStarted(_)
diff --git a/codex-rs/tui/src/history_cell.rs b/codex-rs/tui/src/history_cell.rs
index 5440040f6b5..acf6dbe8ca3 100644
--- a/codex-rs/tui/src/history_cell.rs
+++ b/codex-rs/tui/src/history_cell.rs
@@ -1071,6 +1071,42 @@ pub(crate) fn new_warning_event(message: String) -> PrefixedWrappedHistoryCell {
     PrefixedWrappedHistoryCell::new(message.yellow(), "⚠ ".yellow(), " ")
 }
 
+/// Creates a history cell for reflection verdict display.
+#[allow(clippy::disallowed_methods)]
+pub(crate) fn new_reflection_verdict(
+    completed: bool,
+    confidence: f32,
+    reasoning: String,
+    feedback: Option<String>,
+    attempt: u32,
+    max_attempts: u32,
+) -> PlainHistoryCell {
+    let mut lines: Vec<Line<'static>> = Vec::new();
+
+    if completed {
+        // Success case - green checkmark
+        let header = format!(
+            "✓ Reflection: Task completed (confidence: {:.0}%)",
+            confidence * 100.0
+        );
+        lines.push(vec![header.green()].into());
+        lines.push(vec![format!(" {}", reasoning).dim()].into());
+    } else {
+        // Incomplete case - yellow warning
+        let header = format!(
+            "⟳ Reflection: Task incomplete - attempt {}/{} (confidence: {:.0}%)",
+            attempt, max_attempts, confidence * 100.0
+        );
+        lines.push(vec![header.yellow()].into());
+        lines.push(vec![format!(" Reasoning: {}", reasoning).dim()].into());
+        if let Some(fb) = feedback {
+            lines.push(vec![format!(" Feedback: {}", fb).yellow()].into());
+        }
+    }
+
+    PlainHistoryCell { lines }
+}
+
 #[derive(Debug)]
 pub(crate) struct DeprecationNoticeCell {
     summary: String,
diff --git a/codex-rs/tui2/src/chatwidget.rs b/codex-rs/tui2/src/chatwidget.rs
index 5461bea7296..4c2ec861990 100644
--- a/codex-rs/tui2/src/chatwidget.rs
+++ b/codex-rs/tui2/src/chatwidget.rs
@@ -58,6 +58,7 @@ use codex_core::protocol::UndoStartedEvent;
 use codex_core::protocol::UserMessageEvent;
 use codex_core::protocol::ViewImageToolCallEvent;
 use codex_core::protocol::WarningEvent;
+use codex_core::protocol::ReflectionVerdictEvent;
 use codex_core::protocol::WebSearchBeginEvent;
 use codex_core::protocol::WebSearchEndEvent;
 use codex_core::skills::model::SkillMetadata;
@@ -690,6 +691,18 @@ impl ChatWidget {
         self.request_redraw();
     }
 
+    fn on_reflection_verdict(&mut self, ev: ReflectionVerdictEvent) {
+        self.add_to_history(history_cell::new_reflection_verdict(
+            ev.completed,
+            ev.confidence,
+            ev.reasoning,
+            ev.feedback,
+            ev.attempt,
+            ev.max_attempts,
+        ));
+        self.request_redraw();
+    }
+
     fn on_mcp_startup_update(&mut self, ev: McpStartupUpdateEvent) {
         let mut status = self.mcp_startup_status.take().unwrap_or_default();
         if let McpStartupStatus::Failed { error } = &ev.status {
@@ -1906,6 +1919,7 @@ impl ChatWidget {
                 self.on_entered_review_mode(review_request)
             }
             EventMsg::ExitedReviewMode(review) => self.on_exited_review_mode(review),
+            EventMsg::ReflectionVerdict(ev) => self.on_reflection_verdict(ev),
             EventMsg::ContextCompacted(_) => self.on_agent_message("Context compacted".to_owned()),
             EventMsg::RawResponseItem(_)
             | EventMsg::ItemStarted(_)
diff --git a/codex-rs/tui2/src/history_cell.rs b/codex-rs/tui2/src/history_cell.rs
index 5440040f6b5..acf6dbe8ca3 100644
--- a/codex-rs/tui2/src/history_cell.rs
+++ b/codex-rs/tui2/src/history_cell.rs
@@ -1071,6 +1071,42 @@ pub(crate) fn new_warning_event(message: String) -> PrefixedWrappedHistoryCell {
     PrefixedWrappedHistoryCell::new(message.yellow(), "⚠ ".yellow(), " ")
 }
 
+/// Creates a history cell for reflection verdict display.
+#[allow(clippy::disallowed_methods)]
+pub(crate) fn new_reflection_verdict(
+    completed: bool,
+    confidence: f32,
+    reasoning: String,
+    feedback: Option<String>,
+    attempt: u32,
+    max_attempts: u32,
+) -> PlainHistoryCell {
+    let mut lines: Vec<Line<'static>> = Vec::new();
+
+    if completed {
+        // Success case - green checkmark
+        let header = format!(
+            "✓ Reflection: Task completed (confidence: {:.0}%)",
+            confidence * 100.0
+        );
+        lines.push(vec![header.green()].into());
+        lines.push(vec![format!(" {}", reasoning).dim()].into());
+    } else {
+        // Incomplete case - yellow warning
+        let header = format!(
+            "⟳ Reflection: Task incomplete - attempt {}/{} (confidence: {:.0}%)",
+            attempt, max_attempts, confidence * 100.0
+        );
+        lines.push(vec![header.yellow()].into());
+        lines.push(vec![format!(" Reasoning: {}", reasoning).dim()].into());
+        if let Some(fb) = feedback {
+            lines.push(vec![format!(" Feedback: {}", fb).yellow()].into());
+        }
+    }
+
+    PlainHistoryCell { lines }
+}
+
 #[derive(Debug)]
 pub(crate) struct DeprecationNoticeCell {
     summary: String,
diff --git a/docs/reflection.md b/docs/reflection.md
new file mode 100644
index 00000000000..3afff5837b7
--- /dev/null
+++ b/docs/reflection.md
@@ -0,0 +1,275 @@
+# Reflection Layer
+
+The reflection layer is an experimental feature that verifies whether the agent completed a task correctly by using a "judge" model to evaluate the work.
+
+## How It Works
+
+1. **Task Execution**: The agent executes the user's request using tools (shell, file operations, etc.)
+
+2. **Evaluation**: After completion, the reflection layer:
+   - Collects context: original task, recent tool calls (up to 10), and final response
+   - Sends this to a judge model for evaluation
+   - Receives a verdict with completion status and confidence score
+
+3. **Retry Loop**: If the task is incomplete:
+   - Judge provides feedback on what's missing
+   - Agent receives feedback and tries again
+   - Repeats up to 3 attempts (configurable)
+
+## Verdict Structure
+
+```json
+{
+  "completed": true,
+  "confidence": 0.95,
+  "reasoning": "Task was completed successfully",
+  "feedback": null
+}
+```
+
+- `completed`: Whether the task was done
+- `confidence`: 0.0 to 1.0 confidence score
+- `reasoning`: Explanation of the verdict
+- `feedback`: Instructions for the agent if incomplete
+
+## Configuration
+
+Enable in `~/.codex/config.toml`:
+
+```toml
+[reflection]
+enabled = true
+max_attempts = 3
+
+[features]
+reflection = true
+```
+
+## Running Tests
+
+```shell
+# Required environment variables
+export AZURE_OPENAI_API_KEY="<key>"
+export AZURE_OPENAI_BASE_URL="<endpoint>"
+
+# Optional: specify model (defaults to gpt-5-mini)
+export AZURE_OPENAI_MODEL="gpt-5"
+
+# Run the integration test
+cargo test -p codex-core --test all --release reflection_layer_hello_world -- --ignored --nocapture
+```
+
+The test verifies:
+1. Azure OpenAI integration with reflection enabled
+2. Agent creates requested Python files
+3. Tests pass via pytest
+4. Reflection layer evaluates and returns a verdict
+
+## Evaluation Suite (SWE-bench Style)
+
+The eval suite measures the reflection layer's impact on coding task performance, inspired by [SWE-bench](https://github.com/SWE-bench/SWE-bench).
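+
+Each eval test drives a shared `run_eval_task` helper in `eval_swe_bench.rs`. The sketch below shows roughly the shape the tests rely on; the names mirror how the tests use the result, but the real definitions live in the harness, so treat this as illustrative rather than the actual API:
+
+```rust
+use codex_core::protocol::ReflectionVerdictEvent;
+
+/// Illustrative result shape for one eval run.
+struct EvalResult {
+    /// Did the verification test command pass?
+    success: bool,
+    /// Was the reflection layer enabled for this run?
+    reflection_used: bool,
+    /// Judge verdicts emitted during the run, in order.
+    reflection_verdicts: Vec<ReflectionVerdictEvent>,
+    /// Captured output of the verification command.
+    test_output: String,
+}
+
+/// Writes `files` into a temp workspace, runs the agent against `issue`,
+/// then runs `test_cmd` to check the fix; `with_reflection` toggles the judge.
+fn run_eval_task(
+    issue: &str,
+    files: &[(&str, &str)],
+    test_cmd: &str,
+    with_reflection: bool,
+) -> EvalResult {
+    unimplemented!("see eval_swe_bench.rs for the actual harness")
+}
+```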
+
+### Tasks
+
+| Task | Description | Bug Type |
+|------|-------------|----------|
+| Task 1 | Off-by-one errors | `range(n+1)` → `range(n)`, index errors |
+| Task 2 | String logic | Palindrome detection, word counting |
+| Task 3 | Edge cases | Division by zero, empty list handling |
+
+Each task provides:
+- A buggy Python codebase
+- An issue description (like a GitHub issue)
+- Test files that verify the fix
+
+### Running Evaluations
+
+```shell
+# Run single task with reflection
+cargo test -p codex-core --test all --release eval_task1_offbyone_with_reflection -- --ignored --nocapture
+
+# Run single task without reflection
+cargo test -p codex-core --test all --release eval_task1_offbyone_without_reflection -- --ignored --nocapture
+
+# Run full comparison (all tasks, with and without reflection)
+cargo test -p codex-core --test all --release eval_summary -- --ignored --nocapture
+```
+
+### Sample Output
+
+```
+========================================
+SWE-BENCH STYLE EVALUATION SUMMARY
+========================================
+
+--- Task 1: Off-by-one errors ---
+ With reflection: PASS (verdicts: 1)
+ Without reflection: PASS
+
+--- Task 2: String logic errors ---
+ With reflection: PASS (verdicts: 1)
+ Without reflection: PASS
+
+--- Task 3: Missing edge cases ---
+ With reflection: PASS (verdicts: 2)
+ Without reflection: FAIL
+
+========================================
+RESULTS
+========================================
+With reflection: 3/3 tasks passed
+Without reflection: 2/3 tasks passed
+Improvement: +1 tasks
+```
+
+The reflection layer helps catch incomplete fixes by re-evaluating the agent's work and providing feedback for another attempt.
+
+## Local Debugging
+
+### Prerequisites
+
+- Rust toolchain with `cargo` installed.
+- `just` installed (the repo's formatting/linting recipes below use it).
+- (Optional) `cargo-insta` if you will work with snapshot tests.
+
+> Note: On Windows, prefer WSL for these instructions or adapt the commands to PowerShell.
+
+### Installation
+
+Preferred: install the CLI into your local bin with `cargo install`:
+
+```shell
+# from repo root (the CLI package lives at codex-rs/cli)
+cargo install --path codex-rs/cli --root "$HOME/.local"
+export PATH="$HOME/.local/bin:$PATH"
+```
+
+Alternative: build and copy (guarded):
+
+```shell
+mkdir -p "$HOME/.local/bin"
+(cd codex-rs && cargo build --release)
+BINARY="codex-rs/target/release/codex"
+if [ -f "$BINARY" ]; then
+  install -m 755 "$BINARY" "$HOME/.local/bin/"
+  echo "Installed codex to $HOME/.local/bin"
+else
+  echo "Error: built binary not found at $BINARY" >&2
+  exit 1
+fi
+```
+
+Notes:
+- Using `install -m 755` sets the executable bit and is safer than `cp`.
+- Avoid using `sudo` unless installing to system locations like `/usr/local/bin`.
+
+### Configure Azure OpenAI provider (`$HOME/.codex/config.toml`)
+
+Replace the ALL_CAPS placeholders in the file below with your own values.
+ +```toml +# Example config - replace placeholders +model = "gpt-5-mini" # example model; replace if needed +model_provider = "azure" + +[model_providers.azure] +name = "Azure OpenAI" +base_url = "https://YOUR_AZURE_RESOURCE.openai.azure.com/openai" # replace YOUR_AZURE_RESOURCE +env_key = "AZURE_OPENAI_API_KEY" +wire_api = "responses" +request_max_retries = 3 +stream_max_retries = 3 +stream_idle_timeout_ms = 120000 + +[model_providers.azure.query_params] +api-version = "2025-04-01-preview" + +[reflection] +enabled = true +model = "gpt-5-mini" +max_attempts = 3 +``` + +Important: `base_url` must match your Azure endpoint; e.g. `https://myresource.openai.azure.com/openai`. The `model` value is illustrative — ensure the model is available for your provider/account. + +### Environment variables + +Add to your shell rc (e.g. `$HOME/.zshrc` or `$HOME/.bashrc`): + +```shell +export PATH="$HOME/.local/bin:$PATH" +export AZURE_OPENAI_API_KEY="YOUR_API_KEY" # do not commit this to version control +``` + +After editing, run `source "$HOME/.zshrc"` or open a new shell. + +### JSON vs TOML config + +If you previously used `$HOME/.codex/config.json`, be aware that JSON config may override TOML. To keep a backup and avoid destructive moves: + +```shell +if [ -f "$HOME/.codex/config.json" ]; then + cp "$HOME/.codex/config.json" "$HOME/.codex/config.json.bak" + echo "Backed up existing JSON config to $HOME/.codex/config.json.bak" +fi +``` + +### Sandbox / Network note + +If you run in a restricted (sandboxed) environment, features requiring outgoing network connections may not work. Check environment variables such as `CODEX_SANDBOX_NETWORK_DISABLED` or run with a network-enabled environment if needed. + +### Testing + +Interactive: + +```shell +# run the binary (interactive) +codex +``` + +Non-interactive (example): + +```shell +codex exec --full-auto "Create a Python hello world program" +# Verify reflection: look for "Reflection verdict" (case-insensitive) +codex exec --full-auto "Create test.py that prints 'hello'" 2>&1 | grep -i "Reflection verdict" || true +``` + +### Running Unit Tests + +Run crate-specific tests (preferred): + +```shell +cargo test -p codex-core reflection +``` + +Run lib-only with verbose output: + +```shell +cargo test -p codex-core --lib reflection -- --nocapture +``` + +If you changed shared crates (core, protocol), run the full test suite: + +```shell +# After local crate tests pass +cargo test --all-features +``` + +Snapshot tests (if applicable): + +- If you update UI/text snapshots in `codex-tui`, follow the repo snapshot flow: + - `cargo test -p codex-tui` + - `cargo insta pending-snapshots -p codex-tui` + - `cargo insta accept -p codex-tui` (only if you intend to accept all new snapshots) + +### Formatting / linting for Rust code + +After making Rust changes, run: + +```shell +# format +(cd codex-rs && just fmt) + +# fix lints for the specific project you changed, e.g. codex-core +(cd codex-rs && just fix -p codex-core) +```
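+
+For a change like this one that spans several crates (core, protocol, exec, tui), repeat that step for each crate you touched; a sketch, with the package list adjusted to your own diff:
+
+```shell
+(cd codex-rs && just fix -p codex-protocol)
+(cd codex-rs && just fix -p codex-exec)
+(cd codex-rs && just fix -p codex-tui)
+```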