20 changes: 18 additions & 2 deletions codex-rs/core/src/client.rs
@@ -63,6 +63,8 @@ pub struct ModelClient {
effort: Option<ReasoningEffortConfig>,
summary: ReasoningSummaryConfig,
session_source: SessionSource,
/// Optional model override for specialized use cases (e.g., reflection judge).
model_override: Option<String>,
}

#[allow(clippy::too_many_arguments)]
@@ -88,9 +90,19 @@ impl ModelClient {
effort,
summary,
session_source,
model_override: None,
}
}

/// Returns a clone of this client with a different model for the API request.
/// This is useful for specialized tasks like reflection judging that may use
/// a different (often cheaper/faster) model than the main agent.
pub fn with_model_override(&self, model: &str) -> Self {
let mut client = self.clone();
client.model_override = Some(model.to_string());
client
}

pub fn get_model_context_window(&self) -> Option<i64> {
let model_family = self.get_model_family();
let effective_context_window_percent = model_family.effective_context_window_percent;
@@ -294,9 +306,13 @@ impl ModelClient {
self.session_source.clone()
}

/// Returns the currently configured model slug.
/// Returns the currently configured model slug, or the override if set.
pub fn get_model(&self) -> String {
self.get_model_family().get_model_slug().to_string()
if let Some(ref override_model) = self.model_override {
override_model.clone()
} else {
self.get_model_family().get_model_slug().to_string()
}
}

/// Returns the currently configured model family.
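
The override is a plain clone-and-set, resolved later in `get_model`. A self-contained sketch of the pattern, with the `ModelClient` fields pared down and illustrative model slugs (the real struct also carries provider, auth, and reasoning config):

```rust
// Minimal sketch of the clone-with-override pattern in client.rs.
#[derive(Clone)]
struct Client {
    model: String,
    model_override: Option<String>,
}

impl Client {
    fn with_model_override(&self, model: &str) -> Self {
        let mut client = self.clone();
        client.model_override = Some(model.to_string());
        client
    }

    fn get_model(&self) -> String {
        // The override, if set, wins over the configured model slug.
        self.model_override
            .clone()
            .unwrap_or_else(|| self.model.clone())
    }
}

fn main() {
    let base = Client { model: "main-model".into(), model_override: None };
    let judge = base.with_model_override("judge-model");
    assert_eq!(base.get_model(), "main-model"); // original client untouched
    assert_eq!(judge.get_model(), "judge-model");
}
```

Returning a clone rather than mutating in place means the judge path cannot leak its cheaper model back into the main agent's turns.
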
124 changes: 123 additions & 1 deletion codex-rs/core/src/codex.rs
@@ -20,6 +20,7 @@ use crate::openai_models::model_family::ModelFamily;
use crate::openai_models::models_manager::ModelsManager;
use crate::parse_command::parse_command;
use crate::parse_turn_item;
use crate::reflection::{ReflectionContext, evaluate_reflection};
use crate::stream_events_utils::HandleOutputCtx;
use crate::stream_events_utils::handle_non_tool_response_item;
use crate::stream_events_utils::handle_output_item_done;
@@ -114,6 +115,7 @@ use crate::protocol::TokenCountEvent;
use crate::protocol::TokenUsage;
use crate::protocol::TokenUsageInfo;
use crate::protocol::TurnDiffEvent;
use crate::protocol::ReflectionVerdictEvent;
use crate::protocol::WarningEvent;
use crate::rollout::RolloutRecorder;
use crate::rollout::RolloutRecorderParams;
@@ -371,6 +373,7 @@ pub(crate) struct TurnContext {
pub(crate) tool_call_gate: Arc<ReadinessFlag>,
pub(crate) exec_policy: Arc<RwLock<ExecPolicy>>,
pub(crate) truncation_policy: TruncationPolicy,
pub(crate) reflection: crate::config::types::ReflectionConfig,
}

impl TurnContext {
@@ -536,6 +539,7 @@ impl Session {
per_turn_config.as_ref(),
model_family.truncation_policy,
),
reflection: per_turn_config.reflection.clone(),
}
}

@@ -2087,6 +2091,7 @@ async fn spawn_review_thread(
tool_call_gate: Arc::new(ReadinessFlag::new()),
exec_policy: parent_turn_context.exec_policy.clone(),
truncation_policy: TruncationPolicy::new(&per_turn_config, model_family.truncation_policy),
reflection: per_turn_config.reflection.clone(),
};

// Seed the child task with the review prompt as the initial user message.
@@ -2175,6 +2180,9 @@ pub(crate) async fn run_task(
.await;
}

// Extract initial task for reflection BEFORE input is consumed
let initial_task = extract_initial_task_from_input(&input);

let initial_input_for_turn: ResponseInputItem = ResponseInputItem::from(input);
let response_item: ResponseItem = initial_input_for_turn.clone().into();
sess.record_response_item_and_emit_turn_item(turn_context.as_ref(), response_item)
@@ -2192,6 +2200,11 @@
// many turns, from the perspective of the user, it is a single turn.
let turn_diff_tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::new()));

// Track reflection attempts
let mut reflection_attempt: u32 = 0;
let reflection_config = &turn_context.reflection;
let reflection_enabled = reflection_config.enabled || sess.enabled(Feature::Reflection);

loop {
// Note that pending_input would be something like a message the user
// submitted through the UI while the model was running. Though the UI
@@ -2255,7 +2268,95 @@
}

if !needs_follow_up {
last_agent_message = turn_last_agent_message;
last_agent_message = turn_last_agent_message.clone();

// Run reflection if enabled and we haven't exceeded max attempts
let max_attempts = reflection_config.max_attempts;
if reflection_enabled && reflection_attempt < max_attempts {
reflection_attempt += 1;
info!(
"Running reflection evaluation (attempt {}/{})",
reflection_attempt, max_attempts
);

// Collect conversation items for reflection
let history_items = sess.clone_history().await.get_history_for_prompt();
let context = ReflectionContext::from_conversation(
initial_task.clone(),
&history_items,
reflection_attempt,
max_attempts,
);

// Evaluate with the judge, optionally using a different model
match evaluate_reflection(
&turn_context.client,
context,
reflection_config.model.as_deref(),
)
.await
{
Ok(verdict) => {
info!(
"Reflection verdict: completed={}, confidence={:.2}",
verdict.completed, verdict.confidence
);

// Emit reflection verdict event for visibility
sess.send_event(
&turn_context,
EventMsg::ReflectionVerdict(ReflectionVerdictEvent {
completed: verdict.completed,
confidence: verdict.confidence,
reasoning: verdict.reasoning.clone(),
feedback: verdict.feedback.clone(),
attempt: reflection_attempt,
max_attempts,
}),
)
.await;

if !verdict.completed {
if let Some(feedback) = verdict.feedback {
info!("Task incomplete, injecting feedback: {}", feedback);

// Inject feedback as a new user message
let feedback_msg = format!(
"[Reflection Judge - Attempt {}/{}] Task verification failed.\n\nReasoning: {}\n\nFeedback: {}\n\nPlease address the above feedback and complete the task.",
reflection_attempt,
max_attempts,
verdict.reasoning,
feedback
);

let feedback_item = ResponseItem::Message {
id: None,
role: "user".to_string(),
content: vec![ContentItem::InputText {
text: feedback_msg,
}],
};

sess.record_conversation_items(
&turn_context,
&[feedback_item],
)
.await;

// Continue the loop to process the feedback
continue;
}
} else {
info!("Reflection: Task completed successfully");
}
}
Err(e) => {
warn!("Reflection evaluation failed: {}", e);
// Continue without blocking on reflection errors
}
}
}

sess.notifier()
.notify(&UserNotification::AgentTurnComplete {
thread_id: sess.conversation_id.to_string(),
@@ -2292,6 +2393,27 @@ pub(crate) async fn run_task(
last_agent_message
}

/// Extract the initial task/prompt from user input.
fn extract_initial_task_from_input(input: &[UserInput]) -> String {
for item in input {
match item {
UserInput::Text { text } => {
return text.clone();
}
UserInput::Image { .. } | UserInput::LocalImage { .. } => {
// Skip images, look for text
}
UserInput::Skill { name, .. } => {
// Return skill name as task description
return format!("Run skill: {}", name);
}
// Handle future variants of the non-exhaustive enum
_ => {}
}
}
"(No initial task found)".to_string()
}

#[instrument(
skip_all,
fields(
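
In control-flow terms, the reflection pass sits at the turn loop's would-be exit: when a turn finishes without follow-up, the judge either lets it stand or injects feedback and re-enters the loop, at most `max_attempts` times, and judge errors never block completion. A standalone sketch of that shape, with simplified types and a synchronous stand-in for `evaluate_reflection` (the real code is async and threads `Session`/`TurnContext` through):

```rust
// Standalone sketch of the bounded reflection loop in run_task.
struct Verdict {
    completed: bool,
    feedback: Option<String>,
}

fn judge(_transcript: &str) -> Result<Verdict, String> {
    // Stand-in: the real judge sends the conversation to the (possibly
    // overridden) model and parses a structured verdict.
    Ok(Verdict { completed: true, feedback: None })
}

fn run_task(max_attempts: u32) -> String {
    let mut attempt = 0;
    let mut transcript = String::new();
    loop {
        transcript.push_str("...one agent turn...\n");
        // Only consult the judge when the turn would otherwise finish,
        // and only while the attempt budget lasts.
        if attempt < max_attempts {
            attempt += 1;
            match judge(&transcript) {
                Ok(v) if !v.completed => {
                    if let Some(fb) = v.feedback {
                        // Inject feedback as a user message and go again.
                        transcript.push_str(&format!("[judge] {fb}\n"));
                        continue;
                    }
                    // Incomplete but no actionable feedback: finish anyway.
                }
                Ok(_) => {}  // judge accepted the result
                Err(_) => {} // judge errors never block completion
            }
        }
        return transcript;
    }
}

fn main() {
    println!("{}", run_task(2));
}
```
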
13 changes: 13 additions & 0 deletions codex-rs/core/src/config/mod.rs
@@ -7,6 +7,8 @@ use crate::config::types::Notifications;
use crate::config::types::OtelConfig;
use crate::config::types::OtelConfigToml;
use crate::config::types::OtelExporterKind;
use crate::config::types::ReflectionConfig;
use crate::config::types::ReflectionConfigToml;
use crate::config::types::SandboxWorkspaceWrite;
use crate::config::types::ShellEnvironmentPolicy;
use crate::config::types::ShellEnvironmentPolicyToml;
@@ -274,6 +276,9 @@ pub struct Config {
/// Centralized feature flags; source of truth for feature gating.
pub features: Features,

/// Configuration for the reflection/judge feature.
pub reflection: ReflectionConfig,

/// The active profile name used to derive this `Config` (if any).
pub active_profile: Option<String>,

@@ -666,6 +671,9 @@ pub struct ConfigToml {
/// Settings for ghost snapshots (used for undo).
#[serde(default)]
pub ghost_snapshot: Option<GhostSnapshotToml>,
/// Configuration for the reflection/judge feature.
#[serde(default)]
pub reflection: Option<ReflectionConfigToml>,

/// When `true`, checks for Codex updates on startup and surfaces update prompts.
/// Set to `false` only if your Codex updates are centrally managed.
@@ -1221,6 +1229,7 @@ impl Config {
use_experimental_use_rmcp_client,
ghost_snapshot,
features,
reflection: cfg.reflection.map(Into::into).unwrap_or_default(),
active_profile: active_profile_name,
active_project,
windows_wsl_setup_acknowledged: cfg.windows_wsl_setup_acknowledged.unwrap_or(false),
@@ -2983,6 +2992,7 @@ model_verbosity = "high"
use_experimental_use_rmcp_client: false,
ghost_snapshot: GhostSnapshotConfig::default(),
features: Features::with_defaults(),
reflection: ReflectionConfig::default(),
active_profile: Some("o3".to_string()),
active_project: ProjectConfig { trust_level: None },
windows_wsl_setup_acknowledged: false,
@@ -3058,6 +3068,7 @@ model_verbosity = "high"
use_experimental_use_rmcp_client: false,
ghost_snapshot: GhostSnapshotConfig::default(),
features: Features::with_defaults(),
reflection: ReflectionConfig::default(),
active_profile: Some("gpt3".to_string()),
active_project: ProjectConfig { trust_level: None },
windows_wsl_setup_acknowledged: false,
@@ -3148,6 +3159,7 @@ model_verbosity = "high"
use_experimental_use_rmcp_client: false,
ghost_snapshot: GhostSnapshotConfig::default(),
features: Features::with_defaults(),
reflection: ReflectionConfig::default(),
active_profile: Some("zdr".to_string()),
active_project: ProjectConfig { trust_level: None },
windows_wsl_setup_acknowledged: false,
@@ -3224,6 +3236,7 @@ model_verbosity = "high"
use_experimental_use_rmcp_client: false,
ghost_snapshot: GhostSnapshotConfig::default(),
features: Features::with_defaults(),
reflection: ReflectionConfig::default(),
active_profile: Some("gpt5".to_string()),
active_project: ProjectConfig { trust_level: None },
windows_wsl_setup_acknowledged: false,
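
The wiring `cfg.reflection.map(Into::into).unwrap_or_default()` implies a `From<ReflectionConfigToml>` impl plus a `Default` on `ReflectionConfig`. A sketch of that shape, limited to the three fields the diff actually reads (`enabled`, `max_attempts`, `model`); the defaults below are guesses, not values taken from the PR:

```rust
// Sketch of the Toml -> Config shape implied by
// `cfg.reflection.map(Into::into).unwrap_or_default()`.
#[derive(Clone, Debug)]
struct ReflectionConfig {
    enabled: bool,
    max_attempts: u32,
    model: Option<String>,
}

impl Default for ReflectionConfig {
    fn default() -> Self {
        // Hypothetical defaults: off, with a small retry budget.
        Self { enabled: false, max_attempts: 3, model: None }
    }
}

struct ReflectionConfigToml {
    enabled: Option<bool>,
    max_attempts: Option<u32>,
    model: Option<String>,
}

impl From<ReflectionConfigToml> for ReflectionConfig {
    fn from(t: ReflectionConfigToml) -> Self {
        // Each field falls back to the compiled-in default when the
        // config.toml stanza omits it.
        let d = ReflectionConfig::default();
        Self {
            enabled: t.enabled.unwrap_or(d.enabled),
            max_attempts: t.max_attempts.unwrap_or(d.max_attempts),
            model: t.model.or(d.model),
        }
    }
}

fn main() {
    // Absent [reflection] table -> defaults, mirroring unwrap_or_default().
    let cfg: ReflectionConfig = None::<ReflectionConfigToml>
        .map(Into::into)
        .unwrap_or_default();
    println!("{cfg:?}");
}
```
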