Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 85 additions & 14 deletions livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import contextvars
import heapq
import json
import string
import time
from collections.abc import AsyncIterable, Coroutine, Sequence
from dataclasses import dataclass
Expand Down Expand Up @@ -156,6 +157,7 @@ def __init__(self, agent: Agent, sess: AgentSession) -> None:

# speeches that audio playout finished but not done because of tool calls
self._background_speeches: set[SpeechHandle] = set()
self._interrupt_timer: asyncio.Task[None] | None = None

def _validate_turn_detection(
self, turn_detection: TurnDetectionMode | None
Expand Down Expand Up @@ -227,6 +229,11 @@ def _validate_turn_detection(
"for more responsive interruption handling."
)

if self._session.options.interruption_ignore_words and not self.stt:
logger.warning(
"interruption_ignore_words requires STT to be enabled, the feature will be ignored"
)

return mode

@property
Expand Down Expand Up @@ -1168,17 +1175,20 @@ def _interrupt_by_audio_activity(self) -> None:
# ignore if realtime model has turn detection enabled
return

if (
self.stt is not None
and opt.min_interruption_words > 0
and self._audio_recognition is not None
):
text = self._audio_recognition.current_transcript
if self.stt is not None and self._audio_recognition is not None:
text = self._audio_recognition.current_transcript or ""

# TODO(long): better word splitting for multi-language
if len(split_words(text, split_character=True)) < opt.min_interruption_words:
if (
self._session.options.interruption_ignore_words
and not self._should_interrupt_from_transcript(text)
):
return

if opt.min_interruption_words > 0:
# TODO(long): better word splitting for multi-language
if len(split_words(text, split_character=True)) < opt.min_interruption_words:
return

if self._rt_session is not None:
self._rt_session.start_user_activity()

Expand All @@ -1203,6 +1213,48 @@ def _interrupt_by_audio_activity(self) -> None:

self._current_speech.interrupt()

def _should_interrupt_from_transcript(self, transcript: str) -> bool:
    """Decide whether a user transcript should interrupt the agent.

    When ``interruption_ignore_words`` is configured and every recognized
    word of the transcript is in that list (e.g. fillers like "um", "uh"),
    the interruption is suppressed.

    Matching is case-insensitive and ignores leading/trailing ASCII
    punctuation. Ignore-list entries are normalized exactly like the
    transcript (split on whitespace, punctuation stripped, lowercased), so
    entries such as ``"Okay."`` or ``"uh huh"`` match as expected; each word
    of a multi-word entry is treated as individually ignorable.

    Args:
        transcript: The text recognized from the user's speech so far.

    Returns:
        True if the transcript should trigger an interruption.
        False if it should be ignored.
    """
    ignore = self._session.options.interruption_ignore_words

    # Check the feature switch first: when no ignore words are configured this
    # method must never suppress anything, not even for an empty transcript.
    # Some callers invoke it without checking the option themselves.
    if not ignore:
        return True

    # The feature is enabled but there is no text to evaluate yet.
    if not transcript or not transcript.strip():
        return False

    def _tokens(text: str):
        """Yield lowercase words of *text* with surrounding punctuation removed."""
        for raw in text.split():
            token = raw.strip(string.punctuation).lower()
            if token:
                yield token

    words = list(_tokens(transcript))
    if not words:
        # The transcript contained nothing but punctuation.
        return False

    ignore_set = {token for entry in ignore for token in _tokens(entry)}

    # A single non-ignorable word is enough to treat this as a real interruption.
    if not ignore_set.issuperset(words):
        return True

    logger.debug(
        "Ignoring interruption due to filler-only transcript",
        # sorted() keeps the logged list stable across runs
        extra={"transcript": transcript, "ignore_words": sorted(ignore_set)},
    )

    return False

# region recognition hooks

def on_start_of_speech(self, ev: vad.VADEvent | None) -> None:
Expand Down Expand Up @@ -1376,13 +1428,19 @@ def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool:
and self._current_speech is not None
and self._current_speech.allow_interruptions
and not self._current_speech.interrupted
and self._session.options.min_interruption_words > 0
and len(split_words(info.new_transcript, split_character=True))
< self._session.options.min_interruption_words
):
self._cancel_preemptive_generation()
# avoid interruption if the new_transcript is too short
return False
if not self._should_interrupt_from_transcript(info.new_transcript):
# Don't cancel preemptive generation if ignore words apply
return False

if (
self._session.options.min_interruption_words > 0
and len(split_words(info.new_transcript, split_character=True))
< self._session.options.min_interruption_words
):
self._cancel_preemptive_generation()
# avoid interruption if the new_transcript is too short
return False

old_task = self._user_turn_completed_atask
self._user_turn_completed_atask = self._create_speech_task(
Expand Down Expand Up @@ -2655,6 +2713,19 @@ def _on_false_interruption() -> None:
self._paused_speech = None
return

# Check ignore words before resuming
if self._audio_recognition and self._audio_recognition.current_transcript:
if not self._should_interrupt_from_transcript(
self._audio_recognition.current_transcript
):
logger.debug(
"Not resuming false interruption due to ignore words",
extra={"transcript": self._audio_recognition.current_transcript},
)
self._paused_speech = None
self._false_interruption_timer = None
return

resumed = False
if (
self._session.options.resume_false_interruption
Expand Down
8 changes: 8 additions & 0 deletions livekit-agents/livekit/agents/voice/agent_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ class AgentSessionOptions:
preemptive_generation: bool
tts_text_transforms: Sequence[TextTransforms] | None
ivr_detection: bool
interruption_ignore_words: list[str] | None


Userdata_T = TypeVar("Userdata_T")
Expand Down Expand Up @@ -148,6 +149,7 @@ def __init__(
discard_audio_if_uninterruptible: bool = True,
min_interruption_duration: float = 0.5,
min_interruption_words: int = 0,
interruption_ignore_words: list[str] | None = None,
min_endpointing_delay: float = 0.5,
max_endpointing_delay: float = 3.0,
max_tool_steps: int = 3,
Expand Down Expand Up @@ -207,6 +209,11 @@ def __init__(
register as an interruption. Default ``0.5`` s.
min_interruption_words (int): Minimum number of words to consider
an interruption, only used if stt enabled. Default ``0``.
interruption_ignore_words (list[str] | None): List of words
that should not trigger interruptions when detected. Useful for
filler words ("um", "uh"), acknowledgments ("okay", "right"), or
backchannel responses. Words are matched case-insensitively with
punctuation stripped. Requires STT to be enabled. Default ``None``.
min_endpointing_delay (float): Minimum time-in-seconds since the
last detected speech before the agent declares the user’s turn
complete. In VAD mode this effectively behaves like
Expand Down Expand Up @@ -275,6 +282,7 @@ def __init__(
discard_audio_if_uninterruptible=discard_audio_if_uninterruptible,
min_interruption_duration=min_interruption_duration,
min_interruption_words=min_interruption_words,
interruption_ignore_words=interruption_ignore_words,
min_endpointing_delay=min_endpointing_delay,
max_endpointing_delay=max_endpointing_delay,
max_tool_steps=max_tool_steps,
Expand Down