20 changes: 18 additions & 2 deletions codex-rs/core/src/client.rs
@@ -63,6 +63,8 @@ pub struct ModelClient {
effort: Option<ReasoningEffortConfig>,
summary: ReasoningSummaryConfig,
session_source: SessionSource,
/// Optional model override for specialized use cases (e.g., reflection judge).
model_override: Option<String>,
}

#[allow(clippy::too_many_arguments)]
@@ -88,9 +90,19 @@ impl ModelClient {
effort,
summary,
session_source,
model_override: None,
}
}

/// Returns a clone of this client with a different model for the API request.
/// This is useful for specialized tasks like reflection judging that may use
/// a different (often cheaper/faster) model than the main agent.
pub fn with_model_override(&self, model: &str) -> Self {
let mut client = self.clone();
client.model_override = Some(model.to_string());
client
}

pub fn get_model_context_window(&self) -> Option<i64> {
let model_family = self.get_model_family();
let effective_context_window_percent = model_family.effective_context_window_percent;
@@ -294,9 +306,13 @@ impl ModelClient {
self.session_source.clone()
}

/// Returns the currently configured model slug.
/// Returns the currently configured model slug, or the override if set.
pub fn get_model(&self) -> String {
self.get_model_family().get_model_slug().to_string()
if let Some(ref override_model) = self.model_override {
override_model.clone()
} else {
self.get_model_family().get_model_slug().to_string()
}
}

/// Returns the currently configured model family.
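
The override is a plain clone-and-set, resolved later in `get_model`. A self-contained sketch of the pattern, with the `ModelClient` fields pared down and illustrative model slugs (the real struct also carries provider, auth, and reasoning config):

```rust
// Minimal sketch of the clone-with-override pattern in client.rs.
#[derive(Clone)]
struct Client {
    model: String,
    model_override: Option<String>,
}

impl Client {
    fn with_model_override(&self, model: &str) -> Self {
        let mut client = self.clone();
        client.model_override = Some(model.to_string());
        client
    }

    fn get_model(&self) -> String {
        // The override, if set, wins over the configured model slug.
        self.model_override
            .clone()
            .unwrap_or_else(|| self.model.clone())
    }
}

fn main() {
    let base = Client { model: "main-model".into(), model_override: None };
    let judge = base.with_model_override("judge-model");
    assert_eq!(base.get_model(), "main-model"); // original client untouched
    assert_eq!(judge.get_model(), "judge-model");
}
```

Returning a clone rather than mutating in place means the judge path cannot leak its cheaper model back into the main agent's turns.
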
124 changes: 123 additions & 1 deletion codex-rs/core/src/codex.rs
@@ -20,6 +20,7 @@ use crate::openai_models::model_family::ModelFamily;
use crate::openai_models::models_manager::ModelsManager;
use crate::parse_command::parse_command;
use crate::parse_turn_item;
use crate::reflection::{ReflectionContext, evaluate_reflection};
use crate::stream_events_utils::HandleOutputCtx;
use crate::stream_events_utils::handle_non_tool_response_item;
use crate::stream_events_utils::handle_output_item_done;
@@ -114,6 +115,7 @@ use crate::protocol::TokenCountEvent;
use crate::protocol::TokenUsage;
use crate::protocol::TokenUsageInfo;
use crate::protocol::TurnDiffEvent;
use crate::protocol::ReflectionVerdictEvent;
use crate::protocol::WarningEvent;
use crate::rollout::RolloutRecorder;
use crate::rollout::RolloutRecorderParams;
@@ -371,6 +373,7 @@ pub(crate) struct TurnContext {
pub(crate) tool_call_gate: Arc<ReadinessFlag>,
pub(crate) exec_policy: Arc<RwLock<ExecPolicy>>,
pub(crate) truncation_policy: TruncationPolicy,
pub(crate) reflection: crate::config::types::ReflectionConfig,
}

impl TurnContext {
@@ -536,6 +539,7 @@ impl Session {
per_turn_config.as_ref(),
model_family.truncation_policy,
),
reflection: per_turn_config.reflection.clone(),
}
}

@@ -2087,6 +2091,7 @@ async fn spawn_review_thread(
tool_call_gate: Arc::new(ReadinessFlag::new()),
exec_policy: parent_turn_context.exec_policy.clone(),
truncation_policy: TruncationPolicy::new(&per_turn_config, model_family.truncation_policy),
reflection: per_turn_config.reflection.clone(),
};

// Seed the child task with the review prompt as the initial user message.
@@ -2175,6 +2180,9 @@ pub(crate) async fn run_task(
.await;
}

// Extract initial task for reflection BEFORE input is consumed
let initial_task = extract_initial_task_from_input(&input);

let initial_input_for_turn: ResponseInputItem = ResponseInputItem::from(input);
let response_item: ResponseItem = initial_input_for_turn.clone().into();
sess.record_response_item_and_emit_turn_item(turn_context.as_ref(), response_item)
@@ -2192,6 +2200,11 @@
// many turns, from the perspective of the user, it is a single turn.
let turn_diff_tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::new()));

// Track reflection attempts
let mut reflection_attempt: u32 = 0;
let reflection_config = &turn_context.reflection;
let reflection_enabled = reflection_config.enabled || sess.enabled(Feature::Reflection);

loop {
// Note that pending_input would be something like a message the user
// submitted through the UI while the model was running. Though the UI
@@ -2255,7 +2268,95 @@
}

if !needs_follow_up {
last_agent_message = turn_last_agent_message;
last_agent_message = turn_last_agent_message.clone();

// Run reflection if enabled and we haven't exceeded max attempts
let max_attempts = reflection_config.max_attempts;
if reflection_enabled && reflection_attempt < max_attempts {
reflection_attempt += 1;
info!(
"Running reflection evaluation (attempt {}/{})",
reflection_attempt, max_attempts
);

// Collect conversation items for reflection
let history_items = sess.clone_history().await.get_history_for_prompt();
let context = ReflectionContext::from_conversation(
initial_task.clone(),
&history_items,
reflection_attempt,
max_attempts,
);

// Evaluate with the judge, optionally using a different model
match evaluate_reflection(
&turn_context.client,
context,
reflection_config.model.as_deref(),
)
.await
{
Ok(verdict) => {
info!(
"Reflection verdict: completed={}, confidence={:.2}",
verdict.completed, verdict.confidence
);

// Emit reflection verdict event for visibility
sess.send_event(
&turn_context,
EventMsg::ReflectionVerdict(ReflectionVerdictEvent {
completed: verdict.completed,
confidence: verdict.confidence,
reasoning: verdict.reasoning.clone(),
feedback: verdict.feedback.clone(),
attempt: reflection_attempt,
max_attempts,
}),
)
.await;

if !verdict.completed {
if let Some(feedback) = verdict.feedback {
info!("Task incomplete, injecting feedback: {}", feedback);

// Inject feedback as a new user message
let feedback_msg = format!(
"[Reflection Judge - Attempt {}/{}] Task verification failed.\n\nReasoning: {}\n\nFeedback: {}\n\nPlease address the above feedback and complete the task.",
reflection_attempt,
max_attempts,
verdict.reasoning,
feedback
);

let feedback_item = ResponseItem::Message {
id: None,
role: "user".to_string(),
content: vec![ContentItem::InputText {
text: feedback_msg,
}],
};

sess.record_conversation_items(
&turn_context,
&[feedback_item],
)
.await;

// Continue the loop to process the feedback
continue;
}
} else {
info!("Reflection: Task completed successfully");
}
}
Err(e) => {
warn!("Reflection evaluation failed: {}", e);
// Continue without blocking on reflection errors
}
}
}

sess.notifier()
.notify(&UserNotification::AgentTurnComplete {
thread_id: sess.conversation_id.to_string(),
@@ -2292,6 +2393,27 @@ pub(crate) async fn run_task(
last_agent_message
}

/// Extract the initial task/prompt from user input.
fn extract_initial_task_from_input(input: &[UserInput]) -> String {
for item in input {
match item {
UserInput::Text { text } => {
return text.clone();
}
UserInput::Image { .. } | UserInput::LocalImage { .. } => {
// Skip images, look for text
}
UserInput::Skill { name, .. } => {
// Return skill name as task description
return format!("Run skill: {}", name);
}
// Handle future variants of the non-exhaustive enum
_ => {}
}
}
"(No initial task found)".to_string()
}

#[instrument(
skip_all,
fields(
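
In control-flow terms, the reflection pass sits at the turn loop's would-be exit: when a turn finishes without follow-up, the judge either lets it stand or injects feedback and re-enters the loop, at most `max_attempts` times, and judge errors never block completion. A standalone sketch of that shape, with simplified types and a synchronous stand-in for `evaluate_reflection` (the real code is async and threads `Session`/`TurnContext` through):

```rust
// Standalone sketch of the bounded reflection loop in run_task.
struct Verdict {
    completed: bool,
    feedback: Option<String>,
}

fn judge(_transcript: &str) -> Result<Verdict, String> {
    // Stand-in: the real judge sends the conversation to the (possibly
    // overridden) model and parses a structured verdict.
    Ok(Verdict { completed: true, feedback: None })
}

fn run_task(max_attempts: u32) -> String {
    let mut attempt = 0;
    let mut transcript = String::new();
    loop {
        transcript.push_str("...one agent turn...\n");
        // Only consult the judge when the turn would otherwise finish,
        // and only while the attempt budget lasts.
        if attempt < max_attempts {
            attempt += 1;
            match judge(&transcript) {
                Ok(v) if !v.completed => {
                    if let Some(fb) = v.feedback {
                        // Inject feedback as a user message and go again.
                        transcript.push_str(&format!("[judge] {fb}\n"));
                        continue;
                    }
                    // Incomplete but no actionable feedback: finish anyway.
                }
                Ok(_) => {}  // judge accepted the result
                Err(_) => {} // judge errors never block completion
            }
        }
        return transcript;
    }
}

fn main() {
    println!("{}", run_task(2));
}
```
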
13 changes: 13 additions & 0 deletions codex-rs/core/src/config/mod.rs
@@ -7,6 +7,8 @@ use crate::config::types::Notifications;
use crate::config::types::OtelConfig;
use crate::config::types::OtelConfigToml;
use crate::config::types::OtelExporterKind;
use crate::config::types::ReflectionConfig;
use crate::config::types::ReflectionConfigToml;
use crate::config::types::SandboxWorkspaceWrite;
use crate::config::types::ShellEnvironmentPolicy;
use crate::config::types::ShellEnvironmentPolicyToml;
@@ -274,6 +276,9 @@ pub struct Config {
/// Centralized feature flags; source of truth for feature gating.
pub features: Features,

/// Configuration for the reflection/judge feature.
pub reflection: ReflectionConfig,

/// The active profile name used to derive this `Config` (if any).
pub active_profile: Option<String>,

@@ -666,6 +671,9 @@ pub struct ConfigToml {
/// Settings for ghost snapshots (used for undo).
#[serde(default)]
pub ghost_snapshot: Option<GhostSnapshotToml>,
/// Configuration for the reflection/judge feature.
#[serde(default)]
pub reflection: Option<ReflectionConfigToml>,

/// When `true`, checks for Codex updates on startup and surfaces update prompts.
/// Set to `false` only if your Codex updates are centrally managed.
@@ -1221,6 +1229,7 @@ impl Config {
use_experimental_use_rmcp_client,
ghost_snapshot,
features,
reflection: cfg.reflection.map(Into::into).unwrap_or_default(),
active_profile: active_profile_name,
active_project,
windows_wsl_setup_acknowledged: cfg.windows_wsl_setup_acknowledged.unwrap_or(false),
@@ -2983,6 +2992,7 @@ model_verbosity = "high"
use_experimental_use_rmcp_client: false,
ghost_snapshot: GhostSnapshotConfig::default(),
features: Features::with_defaults(),
reflection: ReflectionConfig::default(),
active_profile: Some("o3".to_string()),
active_project: ProjectConfig { trust_level: None },
windows_wsl_setup_acknowledged: false,
@@ -3058,6 +3068,7 @@ model_verbosity = "high"
use_experimental_use_rmcp_client: false,
ghost_snapshot: GhostSnapshotConfig::default(),
features: Features::with_defaults(),
reflection: ReflectionConfig::default(),
active_profile: Some("gpt3".to_string()),
active_project: ProjectConfig { trust_level: None },
windows_wsl_setup_acknowledged: false,
@@ -3148,6 +3159,7 @@ model_verbosity = "high"
use_experimental_use_rmcp_client: false,
ghost_snapshot: GhostSnapshotConfig::default(),
features: Features::with_defaults(),
reflection: ReflectionConfig::default(),
active_profile: Some("zdr".to_string()),
active_project: ProjectConfig { trust_level: None },
windows_wsl_setup_acknowledged: false,
@@ -3224,6 +3236,7 @@ model_verbosity = "high"
use_experimental_use_rmcp_client: false,
ghost_snapshot: GhostSnapshotConfig::default(),
features: Features::with_defaults(),
reflection: ReflectionConfig::default(),
active_profile: Some("gpt5".to_string()),
active_project: ProjectConfig { trust_level: None },
windows_wsl_setup_acknowledged: false,
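
The wiring `cfg.reflection.map(Into::into).unwrap_or_default()` implies a `From<ReflectionConfigToml>` impl plus a `Default` on `ReflectionConfig`. A sketch of that shape, limited to the three fields the diff actually reads (`enabled`, `max_attempts`, `model`); the defaults below are guesses, not values taken from the PR:

```rust
// Sketch of the Toml -> Config shape implied by
// `cfg.reflection.map(Into::into).unwrap_or_default()`.
#[derive(Clone, Debug)]
struct ReflectionConfig {
    enabled: bool,
    max_attempts: u32,
    model: Option<String>,
}

impl Default for ReflectionConfig {
    fn default() -> Self {
        // Hypothetical defaults: off, with a small retry budget.
        Self { enabled: false, max_attempts: 3, model: None }
    }
}

struct ReflectionConfigToml {
    enabled: Option<bool>,
    max_attempts: Option<u32>,
    model: Option<String>,
}

impl From<ReflectionConfigToml> for ReflectionConfig {
    fn from(t: ReflectionConfigToml) -> Self {
        // Each field falls back to the compiled-in default when the
        // config.toml stanza omits it.
        let d = ReflectionConfig::default();
        Self {
            enabled: t.enabled.unwrap_or(d.enabled),
            max_attempts: t.max_attempts.unwrap_or(d.max_attempts),
            model: t.model.or(d.model),
        }
    }
}

fn main() {
    // Absent [reflection] table -> defaults, mirroring unwrap_or_default().
    let cfg: ReflectionConfig = None::<ReflectionConfigToml>
        .map(Into::into)
        .unwrap_or_default();
    println!("{cfg:?}");
}
```
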