Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion examples/drive-thru/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,13 @@ async def drive_thru_agent(ctx: JobContext) -> None:
voice="Sarah",
extra_kwargs={"delivery_mode": "CREATIVE", "speaking_rate": 1.1},
),
expressive=presets.CUSTOMER_SERVICE,
expressive={
**presets.CUSTOMER_SERVICE,
"backchannel": {
"frequency": 0.8,
"source": ["mm-hmm", "yep", "got it", "uh huh", "gotcha"],
},
},
max_tool_steps=10,
# Flip user_state to "away" after 10s of mutual silence so we can
# check whether they're still there (default is 15s).
Expand Down Expand Up @@ -577,6 +583,13 @@ def _on_user_state_changed(ev: UserStateChangedEvent) -> None:
await session.start(agent=DriveThruAgent(userdata=userdata), room=ctx.room)
await background_audio.start(room=ctx.room, agent_session=session)

session.generate_reply(
instructions=(
"Warmly greet the customer with something like "
"\"Hey, welcome to McDonald's! What can I get for you?\""
)
)


if __name__ == "__main__":
cli.run_app(server)
2 changes: 1 addition & 1 deletion examples/frontdesk/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ async def frontdesk_agent(ctx: JobContext):
voice="Nadia",
extra_kwargs={"delivery_mode": "CREATIVE", "speaking_rate": 1.1},
),
expressive=presets.CUSTOMER_SERVICE,
expressive={**presets.CUSTOMER_SERVICE, "backchannel": True},
max_tool_steps=1,
# Flip user_state to "away" after 10s of mutual silence so we can
# check whether they're still there (default is 15s).
Expand Down
2 changes: 1 addition & 1 deletion examples/healthcare/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -763,7 +763,7 @@ async def entrypoint(ctx: JobContext):
voice="Luna",
extra_kwargs={"delivery_mode": "CREATIVE", "speaking_rate": 1.1},
),
expressive=presets.HEALTHCARE,
expressive={**presets.HEALTHCARE, "backchannel": True},
preemptive_generation=True,
# Flip user_state to "away" after 10s of mutual silence so we can
# check whether they're still there (default is 15s).
Expand Down
6 changes: 1 addition & 5 deletions examples/hotel_receptionist/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,6 @@
tool_use_judge,
)
from livekit.agents.voice import UserStateChangedEvent, presets
from livekit.plugins import silero
from livekit.plugins.turn_detector.multilingual import MultilingualModel

load_dotenv()

Expand Down Expand Up @@ -674,9 +672,7 @@ async def hotel_receptionist_agent(ctx: JobContext) -> None:
voice="Ashley",
extra_kwargs={"delivery_mode": "CREATIVE", "speaking_rate": 1.1},
),
expressive=presets.CUSTOMER_SERVICE,
turn_detection=MultilingualModel(),
vad=silero.VAD.load(),
expressive={**presets.CUSTOMER_SERVICE, "backchannel": True},
max_tool_steps=5,
# Flip user_state to "away" after 10s of mutual silence so we can
# check whether they're still there (default is 15s).
Expand Down
2 changes: 1 addition & 1 deletion examples/inference/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ async def entrypoint(ctx: JobContext) -> None:
voice="Sarah",
extra_kwargs={"delivery_mode": "CREATIVE"},
),
expressive=presets.CONVERSATIONAL,
expressive={**presets.CONVERSATIONAL, "backchannel": True},
# Flip user_state to "away" after 10s of mutual silence so we can
# check whether they're still there (default is 15s).
user_away_timeout=10.0,
Expand Down
2 changes: 1 addition & 1 deletion examples/survey/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ async def entrypoint(ctx: JobContext):
tts=inference.TTS(
"inworld/inworld-tts-2", voice="Nate", extra_kwargs={"delivery_mode": "CREATIVE"}
),
expressive=presets.CONVERSATIONAL,
expressive={**presets.CONVERSATIONAL, "backchannel": True},
preemptive_generation=True,
# Flip user_state to "away" after 10s of mutual silence so we can
# check whether they're still there (default is 15s).
Expand Down
1 change: 1 addition & 0 deletions examples/voice_agents/basic_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ async def entrypoint(ctx: JobContext) -> None:
"filter_markdown",
text_transforms.replace({"LiveKit": "<<ˈ|l|aɪ|v|k|ɪ|t>>"}),
],
expressive={"backchannel": True},
)

@session.on("metrics_collected")
Expand Down
4 changes: 4 additions & 0 deletions livekit-agents/livekit/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@
AgentStateChangedEvent,
AgentTask,
AudioRecognition,
BackchannelConfig,
BackchannelOptions,
CloseEvent,
CloseReason,
ConversationItemAddedEvent,
Expand Down Expand Up @@ -223,6 +225,8 @@ def __getattr__(name: str) -> typing.Any:
"BackgroundAudioPlayer",
"BuiltinAudioClip",
"AudioConfig",
"BackchannelConfig",
"BackchannelOptions",
"PlayHandle",
"FlushSentinel",
"LanguageCode",
Expand Down
3 changes: 3 additions & 0 deletions livekit-agents/livekit/agents/voice/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
VoiceActivityVideoSampler,
)
from .audio_recognition import AudioRecognition
from .backchannel import BackchannelConfig, BackchannelOptions
from .events import (
AgentEvent,
AgentFalseInterruptionEvent,
Expand Down Expand Up @@ -40,6 +41,8 @@
"Agent",
"ModelSettings",
"ExpressiveOptions",
"BackchannelConfig",
"BackchannelOptions",
"presets",
"AgentTask",
"SpeechHandle",
Expand Down
36 changes: 33 additions & 3 deletions livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
if TYPE_CHECKING:
from ..llm import mcp
from .agent_session import AgentSession, ExpressiveOptions
from .backchannel import _BackchannelEmitter


_AgentActivityContextVar = contextvars.ContextVar["AgentActivity"]("agents_activity")
Expand Down Expand Up @@ -251,6 +252,9 @@ def __init__(self, agent: Agent, sess: AgentSession) -> None:
self._interruption_by_audio_activity_enabled
)

# short acknowledgments emitted during the user's pauses (None when disabled)
self._backchannel_emitter = self._resolve_backchannel_emitter()

# speeches that audio playout finished but not done because of tool calls
self._background_speeches: set[SpeechHandle] = set()

Expand Down Expand Up @@ -2073,9 +2077,8 @@ def on_eot_prediction(self, ev: EotPredictionEvent) -> None:
host._on_eot_prediction(ev)

def on_agent_backchannel_opportunity(self, ev: _AgentBackchannelOpportunityEvent) -> None:
    """Forward a backchannel opportunity to this activity's emitter.

    Args:
        ev: The opportunity event reported by the audio recognition hooks.

    Does nothing when ``self._backchannel_emitter`` is ``None``; otherwise the
    event and this activity are handed to the emitter's ``maybe_emit``.
    """
    emitter = self._backchannel_emitter
    if emitter is None:
        return
    emitter.maybe_emit(ev, self)

def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool:
# IMPORTANT: This method is sync to avoid it being cancelled by the AudioRecognition
Expand Down Expand Up @@ -2380,6 +2383,33 @@ def retrieve_chat_ctx(self) -> llm.ChatContext:

# endregion

def _resolve_backchannel_emitter(self) -> _BackchannelEmitter | None:
    """Create the backchannel emitter for this activity.

    Returns:
        A ``_BackchannelEmitter`` built from the resolved options, or ``None``
        when expressive is off, when the backchannel setting resolves to no
        options, or when the active turn detector is not an
        ``inference.TurnDetector``.
    """
    from .backchannel import _BackchannelEmitter, resolve_backchannel_options

    # the agent-level expressive setting takes precedence; the session's
    # setting is only consulted when the agent did not provide one
    expressive = self._agent.expressive
    if not utils.is_given(expressive):
        expressive = self._session.options.expressive
    if not expressive:
        # expressive off → no backchannel
        return None

    # a non-dict expressive value carries no explicit backchannel setting
    requested = NOT_GIVEN
    if isinstance(expressive, dict):
        requested = expressive.get("backchannel", NOT_GIVEN)

    resolved = resolve_backchannel_options(requested)
    if resolved is None:
        return None

    # only the cloud turn detector supplies the backchannel signal; the local
    # mini model never emits it, so the opportunity hook would never fire
    if isinstance(self._turn_detection, inference.TurnDetector):
        return _BackchannelEmitter(resolved)

    logger.warning(
        "backchannel is enabled but the active turn detector does not provide a "
        "backchannel signal (requires the LiveKit cloud turn detector); disabling it"
    )
    return None

def _resolve_expressive_options(self) -> ExpressiveOptions | None:
"""Resolve expressive from agent (overrides session). Returns None if disabled."""
from . import presets
Expand Down
7 changes: 7 additions & 0 deletions livekit-agents/livekit/agents/voice/agent_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@
from ..cli.tcp_console import TcpAudioInput, TcpAudioOutput
from ..inference import LLMModels, STTModels, TTSModels
from ..llm import mcp
from .backchannel import BackchannelConfig, BackchannelOptions
from .background_audio import AudioSource
from .presets import Preset
from .transcription.text_transforms import TextTransforms

Expand Down Expand Up @@ -160,6 +162,11 @@ class ExpressiveOptions(TypedDict, total=False):
tts_instructions_template: Instructions | str
tts_instructions_append: str
audio_recognition_instructions_template: Instructions | str
backchannel: NotGivenOr[bool | list[str | AudioSource | BackchannelConfig] | BackchannelOptions]

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering if we could simplify these types somehow.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Claude suggested:

  # backchannel.py
+ BackchannelSource: TypeAlias = "str | AudioSource | BackchannelConfig"

  class BackchannelOptions(TypedDict, total=False):
      frequency: float
-     source: NotGivenOr[list[str | AudioSource | BackchannelConfig]]
+     source: list[BackchannelSource]

- DEFAULT_BACKCHANNEL_SOURCE: list[str | AudioSource | BackchannelConfig] = [...]
+ DEFAULT_BACKCHANNEL_SOURCE: list[BackchannelSource] = [...]

  def resolve_backchannel_options(
-     backchannel: NotGivenOr[bool | list[str | AudioSource | BackchannelConfig] | BackchannelOptions],
+     backchannel: NotGivenOr[bool | list[BackchannelSource] | BackchannelOptions],
  ) -> BackchannelOptions | None: ...

- def _as_config(entry: str | AudioSource | BackchannelConfig) -> BackchannelConfig: ...
+ def _as_config(entry: BackchannelSource) -> BackchannelConfig: ...
  # agent_session.py
  class ExpressiveOptions(TypedDict, total=False):
      ...
-     backchannel: NotGivenOr[bool | list[str | AudioSource | BackchannelConfig] | BackchannelOptions]
+     backchannel: bool | list[BackchannelSource] | BackchannelOptions

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good point

"""Short acknowledgments ("mm-hmm", "yeah") emitted during the user's pauses.
``NOT_GIVEN``/``True`` → default two-tier set; ``False`` → off; a list → custom
clips; a ``BackchannelOptions`` → full control. Requires the LiveKit cloud turn
detector (it supplies the backchannel signal)."""


DEFAULT_EXPRESSIVE_OPTIONS: ExpressiveOptions = ExpressiveOptions(
Expand Down
35 changes: 26 additions & 9 deletions livekit-agents/livekit/agents/voice/audio_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -1511,17 +1511,34 @@ async def _bounce_eou_task(
if (
backchannel_probability is not None
and backchannel_threshold is not None
and backchannel_probability >= backchannel_threshold
):
self._hooks.on_agent_backchannel_opportunity(
_AgentBackchannelOpportunityEvent(
probability=backchannel_probability,
threshold=backchannel_threshold,
end_of_turn_probability=end_of_turn_probability,
end_of_turn_threshold=unlikely_threshold,
language=self._last_language,
if backchannel_probability >= backchannel_threshold:
logger.debug(
"backchannel opportunity",
extra={
"backchannel_probability": backchannel_probability,
"backchannel_threshold": backchannel_threshold,
"end_of_turn_probability": end_of_turn_probability,
"end_of_turn_threshold": unlikely_threshold,
},
)
self._hooks.on_agent_backchannel_opportunity(
_AgentBackchannelOpportunityEvent(
probability=backchannel_probability,
threshold=backchannel_threshold,
end_of_turn_probability=end_of_turn_probability,
end_of_turn_threshold=unlikely_threshold,
language=self._last_language,
)
)
else:
logger.debug(
"backchannel skipped: below threshold",
extra={
"backchannel_probability": backchannel_probability,
"backchannel_threshold": backchannel_threshold,
},
)
)
if (
prediction_event is not None
and prediction_event.detection_delay is not None
Expand Down
Loading
Loading