livekit · darshankparmar · Jan 2, 2026 · Jan 5, 2026
diff --git a/livekit-agents/livekit/agents/voice/agent_activity.py b/livekit-agents/livekit/agents/voice/agent_activity.py
@@ -4,6 +4,7 @@
 import contextvars
 import heapq
 import json
+import string
 import time
 from collections.abc import AsyncIterable, Coroutine, Sequence
 from dataclasses import dataclass
@@ -156,6 +157,7 @@ def __init__(self, agent: Agent, sess: AgentSession) -> None:
 
         # speeches that audio playout finished but not done because of tool calls
         self._background_speeches: set[SpeechHandle] = set()
+        self._interrupt_timer: asyncio.Task[None] | None = None
 
     def _validate_turn_detection(
         self, turn_detection: TurnDetectionMode | None
@@ -227,6 +229,11 @@ def _validate_turn_detection(
                 "for more responsive interruption handling."
             )
 
+        if self._session.options.interruption_ignore_words and not self.stt:
+            logger.warning(
+                "interruption_ignore_words requires STT to be enabled, the feature will be ignored"
+            )
+
         return mode
 
     @property
@@ -1168,17 +1175,20 @@ def _interrupt_by_audio_activity(self) -> None:
             # ignore if realtime model has turn detection enabled
             return
 
-        if (
-            self.stt is not None
-            and opt.min_interruption_words > 0
-            and self._audio_recognition is not None
-        ):
-            text = self._audio_recognition.current_transcript
+        if self.stt is not None and self._audio_recognition is not None:
+            text = self._audio_recognition.current_transcript or ""
 
-            # TODO(long): better word splitting for multi-language
-            if len(split_words(text, split_character=True)) < opt.min_interruption_words:
+            if (
+                self._session.options.interruption_ignore_words
+                and not self._should_interrupt_from_transcript(text)
+            ):
                 return
 
+            if opt.min_interruption_words > 0:
+                # TODO(long): better word splitting for multi-language
+                if len(split_words(text, split_character=True)) < opt.min_interruption_words:
+                    return
+
         if self._rt_session is not None:
             self._rt_session.start_user_activity()
 
@@ -1203,6 +1213,48 @@ def _interrupt_by_audio_activity(self) -> None:
 
                 self._current_speech.interrupt()
 
+    def _should_interrupt_from_transcript(self, transcript: str) -> bool:
+        """Return whether a user transcript should trigger an interruption.
+
+        When `interruption_ignore_words` is not configured this always returns
+        True, so callers that invoke it unconditionally keep their behavior.
+        Otherwise an empty transcript, or one whose words are all ignorable
+        (e.g. fillers like "um", "uh"), suppresses the interruption.
+
+        Args:
+            transcript: Text recognized for the current user speech.
+
+        Returns:
+            True if the transcript should trigger an interruption.
+            False if it should be ignored.
+        """
+        ignore = self._session.options.interruption_ignore_words
+        if not ignore:
+            # feature disabled: never suppress, even for an empty transcript
+            return True
+
+        def _normalize(word: str) -> str:
+            # case-insensitive match with surrounding ASCII punctuation removed
+            return word.strip(string.punctuation).lower()
+
+        # normalize both sides the same way so entries like "Okay." still match
+        ignore_set = {_normalize(w) for w in ignore}
+        words = [n for n in map(_normalize, (transcript or "").split()) if n]
+
+        if not words:
+            # empty or punctuation-only transcript: nothing to interrupt with
+            return False
+
+        if any(word not in ignore_set for word in words):
+            return True
+
+        logger.debug(
+            "Ignoring interruption due to filler-only transcript",
+            extra={"transcript": transcript, "ignore_words": sorted(ignore_set)},
+        )
+
+        return False
+
     # region recognition hooks
 
     def on_start_of_speech(self, ev: vad.VADEvent | None) -> None:
@@ -1376,13 +1428,19 @@ def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool:
             and self._current_speech is not None
             and self._current_speech.allow_interruptions
             and not self._current_speech.interrupted
-            and self._session.options.min_interruption_words > 0
-            and len(split_words(info.new_transcript, split_character=True))
-            < self._session.options.min_interruption_words
         ):
-            self._cancel_preemptive_generation()
-            # avoid interruption if the new_transcript is too short
-            return False
+            if not self._should_interrupt_from_transcript(info.new_transcript):
+                # Don't cancel preemptive generation if ignore words apply
+                return False
+
+            if (
+                self._session.options.min_interruption_words > 0
+                and len(split_words(info.new_transcript, split_character=True))
+                < self._session.options.min_interruption_words
+            ):
+                self._cancel_preemptive_generation()
+                # avoid interruption if the new_transcript is too short
+                return False
 
         old_task = self._user_turn_completed_atask
         self._user_turn_completed_atask = self._create_speech_task(
@@ -2655,6 +2713,19 @@ def _on_false_interruption() -> None:
                 self._paused_speech = None
                 return
 
+            # Check ignore words before resuming
+            if self._audio_recognition and self._audio_recognition.current_transcript:
+                if not self._should_interrupt_from_transcript(
+                    self._audio_recognition.current_transcript
+                ):
+                    logger.debug(
+                        "Not resuming false interruption due to ignore words",
+                        extra={"transcript": self._audio_recognition.current_transcript},
+                    )
+                    self._paused_speech = None
+                    self._false_interruption_timer = None
+                    return
+
             resumed = False
             if (
                 self._session.options.resume_false_interruption

diff --git a/livekit-agents/livekit/agents/voice/agent_session.py b/livekit-agents/livekit/agents/voice/agent_session.py
@@ -90,6 +90,7 @@ class AgentSessionOptions:
     preemptive_generation: bool
     tts_text_transforms: Sequence[TextTransforms] | None
     ivr_detection: bool
+    interruption_ignore_words: list[str] | None
 
 
 Userdata_T = TypeVar("Userdata_T")
@@ -148,6 +149,7 @@ def __init__(
         discard_audio_if_uninterruptible: bool = True,
         min_interruption_duration: float = 0.5,
         min_interruption_words: int = 0,
+        interruption_ignore_words: list[str] | None = None,
         min_endpointing_delay: float = 0.5,
         max_endpointing_delay: float = 3.0,
         max_tool_steps: int = 3,
@@ -207,6 +209,11 @@ def __init__(
                 register as an interruption. Default ``0.5`` s.
             min_interruption_words (int): Minimum number of words to consider
                 an interruption, only used if stt enabled. Default ``0``.
+            interruption_ignore_words (list[str] | None): List of words
+                that should not trigger interruptions when detected. Useful for
+                filler words ("um", "uh"), acknowledgments ("okay", "right"), or
+                backchannel responses. Words are matched case-insensitively with
+                punctuation stripped. Requires STT to be enabled. Default ``None``.
             min_endpointing_delay (float): Minimum time-in-seconds since the
                 last detected speech before the agent declares the user’s turn
                 complete. In VAD mode this effectively behaves like
@@ -275,6 +282,7 @@ def __init__(
             discard_audio_if_uninterruptible=discard_audio_if_uninterruptible,
             min_interruption_duration=min_interruption_duration,
             min_interruption_words=min_interruption_words,
+            interruption_ignore_words=interruption_ignore_words,
             min_endpointing_delay=min_endpointing_delay,
             max_endpointing_delay=max_endpointing_delay,
             max_tool_steps=max_tool_steps,