lucy1118 committed
Commit 8d7f55c · verified · 1 Parent(s): cef8620

Upload 78 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. pipecat/__init__.py +0 -0
  2. pipecat/frames/__init__.py +0 -0
  3. pipecat/frames/frames.proto +43 -0
  4. pipecat/frames/frames.py +340 -0
  5. pipecat/frames/protobufs/frames_pb2.py +32 -0
  6. pipecat/pipeline/__init__.py +0 -0
  7. pipecat/pipeline/base_pipeline.py +21 -0
  8. pipecat/pipeline/merge_pipeline.py +24 -0
  9. pipecat/pipeline/parallel_pipeline.py +154 -0
  10. pipecat/pipeline/parallel_task.py +119 -0
  11. pipecat/pipeline/pipeline.py +95 -0
  12. pipecat/pipeline/runner.py +58 -0
  13. pipecat/pipeline/task.py +142 -0
  14. pipecat/processors/__init__.py +0 -0
  15. pipecat/processors/aggregators/__init__.py +0 -0
  16. pipecat/processors/aggregators/gated.py +74 -0
  17. pipecat/processors/aggregators/llm_response.py +266 -0
  18. pipecat/processors/aggregators/openai_llm_context.py +114 -0
  19. pipecat/processors/aggregators/sentence.py +54 -0
  20. pipecat/processors/aggregators/user_response.py +156 -0
  21. pipecat/processors/aggregators/vision_image_frame.py +47 -0
  22. pipecat/processors/async_frame_processor.py +63 -0
  23. pipecat/processors/filters/__init__.py +0 -0
  24. pipecat/processors/filters/frame_filter.py +36 -0
  25. pipecat/processors/filters/function_filter.py +30 -0
  26. pipecat/processors/filters/wake_check_filter.py +86 -0
  27. pipecat/processors/frame_processor.py +162 -0
  28. pipecat/processors/frameworks/__init__.py +0 -0
  29. pipecat/processors/frameworks/langchain.py +80 -0
  30. pipecat/processors/logger.py +27 -0
  31. pipecat/processors/text_transformer.py +38 -0
  32. pipecat/serializers/__init__.py +0 -0
  33. pipecat/serializers/base_serializer.py +20 -0
  34. pipecat/serializers/protobuf.py +92 -0
  35. pipecat/serializers/twilio.py +52 -0
  36. pipecat/services/__init__.py +0 -0
  37. pipecat/services/ai_services.py +300 -0
  38. pipecat/services/anthropic.py +145 -0
  39. pipecat/services/azure.py +233 -0
  40. pipecat/services/cartesia.py +65 -0
  41. pipecat/services/deepgram.py +149 -0
  42. pipecat/services/elevenlabs.py +66 -0
  43. pipecat/services/fal.py +83 -0
  44. pipecat/services/fireworks.py +25 -0
  45. pipecat/services/google.py +129 -0
  46. pipecat/services/moondream.py +92 -0
  47. pipecat/services/ollama.py +13 -0
  48. pipecat/services/openai.py +338 -0
  49. pipecat/services/openpipe.py +71 -0
  50. pipecat/services/playht.py +83 -0
pipecat/__init__.py ADDED
File without changes
pipecat/frames/__init__.py ADDED
File without changes
pipecat/frames/frames.proto ADDED
@@ -0,0 +1,43 @@
//
// Copyright (c) 2024, Daily
//
// SPDX-License-Identifier: BSD 2-Clause License
//

// Generate frames_pb2.py with:
//
//   python -m grpc_tools.protoc --proto_path=./ --python_out=./protobufs frames.proto

syntax = "proto3";

package pipecat;

message TextFrame {
  uint64 id = 1;
  string name = 2;
  string text = 3;
}

message AudioRawFrame {
  uint64 id = 1;
  string name = 2;
  bytes audio = 3;
  uint32 sample_rate = 4;
  uint32 num_channels = 5;
}

message TranscriptionFrame {
  uint64 id = 1;
  string name = 2;
  string text = 3;
  string user_id = 4;
  string timestamp = 5;
}

message Frame {
  oneof frame {
    TextFrame text = 1;
    AudioRawFrame audio = 2;
    TranscriptionFrame transcription = 3;
  }
}
pipecat/frames/frames.py ADDED
@@ -0,0 +1,340 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

from typing import Any, List, Mapping, Tuple

from dataclasses import dataclass, field

from pipecat.utils.utils import obj_count, obj_id


@dataclass
class Frame:
    id: int = field(init=False)
    name: str = field(init=False)

    def __post_init__(self):
        self.id: int = obj_id()
        self.name: str = f"{self.__class__.__name__}#{obj_count(self)}"

    def __str__(self):
        return self.name


@dataclass
class DataFrame(Frame):
    pass


@dataclass
class AudioRawFrame(DataFrame):
    """A chunk of audio. Will be played by the transport if the transport's
    microphone has been enabled.

    """
    audio: bytes
    sample_rate: int
    num_channels: int

    def __post_init__(self):
        super().__post_init__()
        self.num_frames = int(len(self.audio) / (self.num_channels * 2))

    def __str__(self):
        return f"{self.name}(size: {len(self.audio)}, frames: {self.num_frames}, sample_rate: {self.sample_rate}, channels: {self.num_channels})"


@dataclass
class ImageRawFrame(DataFrame):
    """An image. Will be shown by the transport if the transport's camera is
    enabled.

    """
    image: bytes
    size: Tuple[int, int]
    format: str | None

    def __str__(self):
        return f"{self.name}(size: {self.size}, format: {self.format})"


@dataclass
class URLImageRawFrame(ImageRawFrame):
    """An image with an associated URL. Will be shown by the transport if the
    transport's camera is enabled.

    """
    url: str | None

    def __str__(self):
        return f"{self.name}(url: {self.url}, size: {self.size}, format: {self.format})"


@dataclass
class VisionImageRawFrame(ImageRawFrame):
    """An image with associated text asking for a description of it. Will be
    shown by the transport if the transport's camera is enabled.

    """
    text: str | None

    def __str__(self):
        return f"{self.name}(text: {self.text}, size: {self.size}, format: {self.format})"


@dataclass
class UserImageRawFrame(ImageRawFrame):
    """An image associated with a user. Will be shown by the transport if the
    transport's camera is enabled.

    """
    user_id: str

    def __str__(self):
        return f"{self.name}(user: {self.user_id}, size: {self.size}, format: {self.format})"


@dataclass
class SpriteFrame(Frame):
    """An animated sprite. Will be shown by the transport if the transport's
    camera is enabled. Will play at the framerate specified in the transport's
    `fps` constructor parameter.

    """
    images: List[ImageRawFrame]

    def __str__(self):
        return f"{self.name}(size: {len(self.images)})"


@dataclass
class TextFrame(DataFrame):
    """A chunk of text. Emitted by LLM services, consumed by TTS services, can
    be used to send text through pipelines.

    """
    text: str

    def __str__(self):
        return f"{self.name}(text: {self.text})"


@dataclass
class TranscriptionFrame(TextFrame):
    """A text frame with transcription-specific data. Will be placed in the
    transport's receive queue when a participant speaks.

    """
    user_id: str
    timestamp: str

    def __str__(self):
        return f"{self.name}(user: {self.user_id}, text: {self.text}, timestamp: {self.timestamp})"


@dataclass
class InterimTranscriptionFrame(TextFrame):
    """A text frame with interim transcription-specific data. Will be placed in
    the transport's receive queue when a participant speaks."""
    user_id: str
    timestamp: str

    def __str__(self):
        return f"{self.name}(user: {self.user_id}, text: {self.text}, timestamp: {self.timestamp})"


@dataclass
class LLMMessagesFrame(DataFrame):
    """A frame containing a list of LLM messages. Used to signal that an LLM
    service should run a chat completion and emit an LLMFullResponseStartFrame,
    TextFrames and an LLMFullResponseEndFrame. Note that the messages property
    of this class is mutable, and will be updated by various ResponseAggregator
    frame processors.

    """
    messages: List[dict]


@dataclass
class TransportMessageFrame(DataFrame):
    message: Any

    def __str__(self):
        return f"{self.name}(message: {self.message})"

#
# App frames. Application user-defined frames.
#


@dataclass
class AppFrame(Frame):
    pass

#
# System frames
#


@dataclass
class SystemFrame(Frame):
    pass


@dataclass
class StartFrame(SystemFrame):
    """This is the first frame that should be pushed down a pipeline."""
    allow_interruptions: bool = False
    enable_metrics: bool = False
    report_only_initial_ttfb: bool = False


@dataclass
class CancelFrame(SystemFrame):
    """Indicates that a pipeline needs to stop right away."""
    pass


@dataclass
class ErrorFrame(SystemFrame):
    """This is used to notify upstream that an error has occurred downstream in
    the pipeline."""
    error: str | None

    def __str__(self):
        return f"{self.name}(error: {self.error})"


@dataclass
class StopTaskFrame(SystemFrame):
    """Indicates that a pipeline task should be stopped. This should inform the
    pipeline processors that they should stop pushing frames but that they
    should be kept in a running state.

    """
    pass


@dataclass
class StartInterruptionFrame(SystemFrame):
    """Emitted by VAD to indicate that a user has started speaking (i.e. an
    interruption). This is similar to UserStartedSpeakingFrame except that it
    should be pushed concurrently with other frames (so the order is not
    guaranteed).

    """
    pass


@dataclass
class StopInterruptionFrame(SystemFrame):
    """Emitted by VAD to indicate that a user has stopped speaking (i.e. no more
    interruptions). This is similar to UserStoppedSpeakingFrame except that it
    should be pushed concurrently with other frames (so the order is not
    guaranteed).

    """
    pass


@dataclass
class MetricsFrame(SystemFrame):
    """Emitted by processors that can compute metrics like latencies.
    """
    ttfb: List[Mapping[str, Any]] | None = None
    processing: List[Mapping[str, Any]] | None = None

#
# Control frames
#


@dataclass
class ControlFrame(Frame):
    pass


@dataclass
class EndFrame(ControlFrame):
    """Indicates that a pipeline has ended and frame processors and pipelines
    should be shut down. If the transport receives this frame, it will stop
    sending frames to its output channel(s) and close all its threads. Note
    that this is a control frame, which means it will be received in the order
    it was sent (unlike system frames).

    """
    pass


@dataclass
class LLMFullResponseStartFrame(ControlFrame):
    """Used to indicate the beginning of a full LLM response. Followed by an
    LLMResponseStartFrame, TextFrames and an LLMResponseEndFrame for each
    sentence until an LLMFullResponseEndFrame."""
    pass


@dataclass
class LLMFullResponseEndFrame(ControlFrame):
    """Indicates the end of a full LLM response."""
    pass


@dataclass
class LLMResponseStartFrame(ControlFrame):
    """Used to indicate the beginning of an LLM response. Following TextFrames
    are part of the LLM response until an LLMResponseEndFrame."""
    pass


@dataclass
class LLMResponseEndFrame(ControlFrame):
    """Indicates the end of an LLM response."""
    pass


@dataclass
class UserStartedSpeakingFrame(ControlFrame):
    """Emitted by VAD to indicate that a user has started speaking. This can be
    used for interruptions or other times when detecting that someone is
    speaking is more important than knowing what they're saying (as you would
    with a TranscriptionFrame).

    """
    pass


@dataclass
class UserStoppedSpeakingFrame(ControlFrame):
    """Emitted by the VAD to indicate that a user stopped speaking."""
    pass


@dataclass
class TTSStartedFrame(ControlFrame):
    """Used to indicate the beginning of a TTS response. Following
    AudioRawFrames are part of the TTS response until a TTSStoppedFrame. These
    frames can be used for aggregating audio frames in a transport to optimize
    the size of frames sent to the session, without needing to control this in
    the TTS service.

    """
    pass


@dataclass
class TTSStoppedFrame(ControlFrame):
    """Indicates the end of a TTS response."""
    pass


@dataclass
class UserImageRequestFrame(ControlFrame):
    """A frame used to request an image from the given user."""
    user_id: str

    def __str__(self):
        return f"{self.name}, user: {self.user_id}"
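Usage sketch (illustrative, not part of the uploaded files): the dataclasses above assign `id` and `name` automatically in `__post_init__`, and AudioRawFrame derives `num_frames` from the payload assuming 16-bit samples. A minimal example, assuming `pipecat.utils.utils` provides `obj_id`/`obj_count` as imported above:

    from pipecat.frames.frames import AudioRawFrame, TextFrame

    # 20 ms of silence at 16 kHz mono, 16-bit: 16000 * 0.02 * 2 bytes = 640 bytes.
    audio = AudioRawFrame(audio=b"\x00" * 640, sample_rate=16000, num_channels=1)
    print(audio)  # e.g. AudioRawFrame#0(size: 640, frames: 320, sample_rate: 16000, channels: 1)

    text = TextFrame("Hello there")
    print(text.id, text.name)  # auto-generated id and a "TextFrame#N"-style name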
pipecat/frames/protobufs/frames_pb2.py ADDED
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: frames.proto
# Protobuf Python Version: 4.25.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()




DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0c\x66rames.proto\x12\x07pipecat\"3\n\tTextFrame\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\"c\n\rAudioRawFrame\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\r\n\x05\x61udio\x18\x03 \x01(\x0c\x12\x13\n\x0bsample_rate\x18\x04 \x01(\r\x12\x14\n\x0cnum_channels\x18\x05 \x01(\r\"`\n\x12TranscriptionFrame\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x0f\n\x07user_id\x18\x04 \x01(\t\x12\x11\n\ttimestamp\x18\x05 \x01(\t\"\x93\x01\n\x05\x46rame\x12\"\n\x04text\x18\x01 \x01(\x0b\x32\x12.pipecat.TextFrameH\x00\x12\'\n\x05\x61udio\x18\x02 \x01(\x0b\x32\x16.pipecat.AudioRawFrameH\x00\x12\x34\n\rtranscription\x18\x03 \x01(\x0b\x32\x1b.pipecat.TranscriptionFrameH\x00\x42\x07\n\x05\x66rameb\x06proto3')

_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'frames_pb2', _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
  DESCRIPTOR._options = None
  _globals['_TEXTFRAME']._serialized_start=25
  _globals['_TEXTFRAME']._serialized_end=76
  _globals['_AUDIORAWFRAME']._serialized_start=78
  _globals['_AUDIORAWFRAME']._serialized_end=177
  _globals['_TRANSCRIPTIONFRAME']._serialized_start=179
  _globals['_TRANSCRIPTIONFRAME']._serialized_end=275
  _globals['_FRAME']._serialized_start=278
  _globals['_FRAME']._serialized_end=425
# @@protoc_insertion_point(module_scope)
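A small, hedged example of how the generated module can be used (not part of the upload; the real consumer is pipecat/serializers/protobuf.py, and this assumes the protobufs package is importable):

    from pipecat.frames.protobufs import frames_pb2

    frame = frames_pb2.Frame()
    frame.text.id = 1
    frame.text.name = "TextFrame#1"
    frame.text.text = "hello"
    data = frame.SerializeToString()

    decoded = frames_pb2.Frame()
    decoded.ParseFromString(data)
    print(decoded.WhichOneof("frame"))  # "text"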
pipecat/pipeline/__init__.py ADDED
File without changes
pipecat/pipeline/base_pipeline.py ADDED
@@ -0,0 +1,21 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

from abc import abstractmethod

from typing import List

from pipecat.processors.frame_processor import FrameProcessor


class BasePipeline(FrameProcessor):

    def __init__(self):
        super().__init__()

    @abstractmethod
    def processors_with_metrics(self) -> List[FrameProcessor]:
        pass
pipecat/pipeline/merge_pipeline.py ADDED
@@ -0,0 +1,24 @@
from typing import List
from pipecat.pipeline.frames import EndFrame, EndPipeFrame
from pipecat.pipeline.pipeline import Pipeline


class SequentialMergePipeline(Pipeline):
    """This class merges the sink queues from a list of pipelines. Frames from
    each pipeline's sink are merged in the order of pipelines in the list."""

    def __init__(self, pipelines: List[Pipeline]):
        super().__init__([])
        self.pipelines = pipelines

    async def run_pipeline(self):
        for idx, pipeline in enumerate(self.pipelines):
            while True:
                frame = await pipeline.sink.get()
                if isinstance(frame, EndFrame) or isinstance(frame, EndPipeFrame):
                    break
                await self.sink.put(frame)

        await self.sink.put(EndFrame())
pipecat/pipeline/parallel_pipeline.py ADDED
@@ -0,0 +1,154 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio

from itertools import chain
from typing import List

from pipecat.pipeline.base_pipeline import BasePipeline
from pipecat.pipeline.pipeline import Pipeline
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.frames.frames import CancelFrame, EndFrame, Frame, StartFrame

from loguru import logger


class Source(FrameProcessor):

    def __init__(self, upstream_queue: asyncio.Queue):
        super().__init__()
        self._up_queue = upstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self._up_queue.put(frame)
            case FrameDirection.DOWNSTREAM:
                await self.push_frame(frame, direction)


class Sink(FrameProcessor):

    def __init__(self, downstream_queue: asyncio.Queue):
        super().__init__()
        self._down_queue = downstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self.push_frame(frame, direction)
            case FrameDirection.DOWNSTREAM:
                await self._down_queue.put(frame)


class ParallelPipeline(BasePipeline):
    def __init__(self, *args):
        super().__init__()

        if len(args) == 0:
            raise Exception("ParallelPipeline needs at least one argument")

        self._sources = []
        self._sinks = []

        self._up_queue = asyncio.Queue()
        self._down_queue = asyncio.Queue()
        self._up_task: asyncio.Task | None = None
        self._down_task: asyncio.Task | None = None

        self._pipelines = []

        logger.debug(f"Creating {self} pipelines")
        for processors in args:
            if not isinstance(processors, list):
                raise TypeError(f"ParallelPipeline argument {processors} is not a list")

            # We will add a source before the pipeline and a sink after.
            source = Source(self._up_queue)
            sink = Sink(self._down_queue)
            self._sources.append(source)
            self._sinks.append(sink)

            # Create pipeline
            pipeline = Pipeline(processors)
            source.link(pipeline)
            pipeline.link(sink)
            self._pipelines.append(pipeline)

        logger.debug(f"Finished creating {self} pipelines")

    #
    # BasePipeline
    #

    def processors_with_metrics(self) -> List[FrameProcessor]:
        return list(chain.from_iterable(p.processors_with_metrics() for p in self._pipelines))

    #
    # Frame processor
    #

    async def cleanup(self):
        await asyncio.gather(*[p.cleanup() for p in self._pipelines])

    async def _start_tasks(self):
        loop = self.get_event_loop()
        self._up_task = loop.create_task(self._process_up_queue())
        self._down_task = loop.create_task(self._process_down_queue())

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if isinstance(frame, StartFrame):
            await self._start_tasks()

        if direction == FrameDirection.UPSTREAM:
            # If we get an upstream frame we process it in each sink.
            await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sinks])
        elif direction == FrameDirection.DOWNSTREAM:
            # If we get a downstream frame we process it in each source.
            # TODO(aleix): We are creating a task for each frame. For real-time
            # video/audio this might be too slow. We should use an already
            # created task instead.
            await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sources])

        # If we get an EndFrame we stop our queue processing tasks and wait on
        # all the pipelines to finish.
        if isinstance(frame, CancelFrame) or isinstance(frame, EndFrame):
            # Use None to indicate when queues should be done processing.
            await self._up_queue.put(None)
            await self._down_queue.put(None)
            if self._up_task:
                await self._up_task
            if self._down_task:
                await self._down_task

    async def _process_up_queue(self):
        running = True
        seen_ids = set()
        while running:
            frame = await self._up_queue.get()
            if frame and frame.id not in seen_ids:
                await self.push_frame(frame, FrameDirection.UPSTREAM)
                seen_ids.add(frame.id)
            running = frame is not None
            self._up_queue.task_done()

    async def _process_down_queue(self):
        running = True
        seen_ids = set()
        while running:
            frame = await self._down_queue.get()
            if frame and frame.id not in seen_ids:
                await self.push_frame(frame, FrameDirection.DOWNSTREAM)
                seen_ids.add(frame.id)
            running = frame is not None
            self._down_queue.task_done()
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio

from itertools import chain
from typing import List

from pipecat.pipeline.base_pipeline import BasePipeline
from pipecat.pipeline.pipeline import Pipeline
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.frames.frames import Frame

from loguru import logger


class Source(FrameProcessor):

    def __init__(self, upstream_queue: asyncio.Queue):
        super().__init__()
        self._up_queue = upstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self._up_queue.put(frame)
            case FrameDirection.DOWNSTREAM:
                await self.push_frame(frame, direction)


class Sink(FrameProcessor):

    def __init__(self, downstream_queue: asyncio.Queue):
        super().__init__()
        self._down_queue = downstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self.push_frame(frame, direction)
            case FrameDirection.DOWNSTREAM:
                await self._down_queue.put(frame)


class ParallelTask(BasePipeline):
    def __init__(self, *args):
        super().__init__()

        if len(args) == 0:
            raise Exception("ParallelTask needs at least one argument")

        self._sinks = []
        self._pipelines = []

        self._up_queue = asyncio.Queue()
        self._down_queue = asyncio.Queue()

        logger.debug(f"Creating {self} pipelines")
        for processors in args:
            if not isinstance(processors, list):
                raise TypeError(f"ParallelTask argument {processors} is not a list")

            # We add a source at the beginning of the pipeline and a sink at the end.
            source = Source(self._up_queue)
            sink = Sink(self._down_queue)
            processors: List[FrameProcessor] = [source] + processors
            processors.append(sink)

            # Keep track of sinks. We access the source through the pipeline.
            self._sinks.append(sink)

            # Create pipeline
            pipeline = Pipeline(processors)
            self._pipelines.append(pipeline)
        logger.debug(f"Finished creating {self} pipelines")

    #
    # BasePipeline
    #

    def processors_with_metrics(self) -> List[FrameProcessor]:
        return list(chain.from_iterable(p.processors_with_metrics() for p in self._pipelines))

    #
    # Frame processor
    #

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if direction == FrameDirection.UPSTREAM:
            # If we get an upstream frame we process it in each sink.
            await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sinks])
        elif direction == FrameDirection.DOWNSTREAM:
            # If we get a downstream frame we process it in each source (using the pipeline).
            await asyncio.gather(*[p.process_frame(frame, direction) for p in self._pipelines])

        seen_ids = set()
        while not self._up_queue.empty():
            frame = await self._up_queue.get()
            if frame and frame.id not in seen_ids:
                await self.push_frame(frame, FrameDirection.UPSTREAM)
                seen_ids.add(frame.id)
            self._up_queue.task_done()

        seen_ids = set()
        while not self._down_queue.empty():
            frame = await self._down_queue.get()
            if frame and frame.id not in seen_ids:
                await self.push_frame(frame, FrameDirection.DOWNSTREAM)
                seen_ids.add(frame.id)
            self._down_queue.task_done()
pipecat/pipeline/pipeline.py ADDED
@@ -0,0 +1,95 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

from typing import Callable, Coroutine, List

from pipecat.frames.frames import Frame
from pipecat.pipeline.base_pipeline import BasePipeline
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class PipelineSource(FrameProcessor):

    def __init__(self, upstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]):
        super().__init__()
        self._upstream_push_frame = upstream_push_frame

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self._upstream_push_frame(frame, direction)
            case FrameDirection.DOWNSTREAM:
                await self.push_frame(frame, direction)


class PipelineSink(FrameProcessor):

    def __init__(self, downstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]):
        super().__init__()
        self._downstream_push_frame = downstream_push_frame

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self.push_frame(frame, direction)
            case FrameDirection.DOWNSTREAM:
                await self._downstream_push_frame(frame, direction)


class Pipeline(BasePipeline):

    def __init__(self, processors: List[FrameProcessor]):
        super().__init__()

        # Add a source and a sink queue so we can forward frames upstream and
        # downstream outside of the pipeline.
        self._source = PipelineSource(self.push_frame)
        self._sink = PipelineSink(self.push_frame)
        self._processors: List[FrameProcessor] = [self._source] + processors + [self._sink]

        self._link_processors()

    #
    # BasePipeline
    #

    def processors_with_metrics(self):
        services = []
        for p in self._processors:
            if isinstance(p, BasePipeline):
                services += p.processors_with_metrics()
            elif p.can_generate_metrics():
                services.append(p)
        return services

    #
    # Frame processor
    #

    async def cleanup(self):
        await self._cleanup_processors()

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if direction == FrameDirection.DOWNSTREAM:
            await self._source.process_frame(frame, FrameDirection.DOWNSTREAM)
        elif direction == FrameDirection.UPSTREAM:
            await self._sink.process_frame(frame, FrameDirection.UPSTREAM)

    async def _cleanup_processors(self):
        for p in self._processors:
            await p.cleanup()

    def _link_processors(self):
        prev = self._processors[0]
        for curr in self._processors[1:]:
            prev.link(curr)
            prev = curr
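A minimal sketch (not part of the upload) of how Pipeline links processors in order; the passthrough processor below is hypothetical and simply forwards every frame:

    from pipecat.frames.frames import Frame
    from pipecat.pipeline.pipeline import Pipeline
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor

    class Passthrough(FrameProcessor):
        async def process_frame(self, frame: Frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            await self.push_frame(frame, direction)  # forward unchanged

    # A PipelineSource and PipelineSink are added automatically around the list.
    pipeline = Pipeline([Passthrough(), Passthrough()])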
pipecat/pipeline/runner.py ADDED
@@ -0,0 +1,58 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import signal

from pipecat.pipeline.task import PipelineTask
from pipecat.utils.utils import obj_count, obj_id

from loguru import logger


class PipelineRunner:

    def __init__(self, *, name: str | None = None, handle_sigint: bool = True):
        self.id: int = obj_id()
        self.name: str = name or f"{self.__class__.__name__}#{obj_count(self)}"

        self._tasks = {}

        if handle_sigint:
            self._setup_sigint()

    async def run(self, task: PipelineTask):
        logger.debug(f"Runner {self} started running {task}")
        self._tasks[task.name] = task
        await task.run()
        del self._tasks[task.name]
        logger.debug(f"Runner {self} finished running {task}")

    async def stop_when_done(self):
        logger.debug(f"Runner {self} scheduled to stop when all tasks are done")
        await asyncio.gather(*[t.stop_when_done() for t in self._tasks.values()])

    async def cancel(self):
        logger.debug(f"Canceling runner {self}")
        await asyncio.gather(*[t.cancel() for t in self._tasks.values()])

    def _setup_sigint(self):
        loop = asyncio.get_running_loop()
        loop.add_signal_handler(
            signal.SIGINT,
            lambda *args: asyncio.create_task(self._sig_handler())
        )
        loop.add_signal_handler(
            signal.SIGTERM,
            lambda *args: asyncio.create_task(self._sig_handler())
        )

    async def _sig_handler(self):
        logger.warning(f"Interruption detected. Canceling runner {self}")
        await self.cancel()

    def __str__(self):
        return self.name
pipecat/pipeline/task.py ADDED
@@ -0,0 +1,142 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio

from typing import AsyncIterable, Iterable

from pydantic import BaseModel

from pipecat.frames.frames import CancelFrame, EndFrame, ErrorFrame, Frame, MetricsFrame, StartFrame, StopTaskFrame
from pipecat.pipeline.base_pipeline import BasePipeline
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.utils import obj_count, obj_id

from loguru import logger


class PipelineParams(BaseModel):
    allow_interruptions: bool = False
    enable_metrics: bool = False
    report_only_initial_ttfb: bool = False


class Source(FrameProcessor):

    def __init__(self, up_queue: asyncio.Queue):
        super().__init__()
        self._up_queue = up_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self._up_queue.put(frame)
            case FrameDirection.DOWNSTREAM:
                await self.push_frame(frame, direction)


class PipelineTask:

    def __init__(self, pipeline: BasePipeline, params: PipelineParams = PipelineParams()):
        self.id: int = obj_id()
        self.name: str = f"{self.__class__.__name__}#{obj_count(self)}"

        self._pipeline = pipeline
        self._params = params
        self._finished = False

        self._down_queue = asyncio.Queue()
        self._up_queue = asyncio.Queue()

        self._source = Source(self._up_queue)
        self._source.link(pipeline)

    def has_finished(self):
        return self._finished

    async def stop_when_done(self):
        logger.debug(f"Task {self} scheduled to stop when done")
        await self.queue_frame(EndFrame())

    async def cancel(self):
        logger.debug(f"Canceling pipeline task {self}")
        # Make sure everything is cleaned up downstream. This is sent
        # out-of-band from the main streaming task which is what we want since
        # we want to cancel right away.
        await self._source.process_frame(CancelFrame(), FrameDirection.DOWNSTREAM)
        self._process_down_task.cancel()
        self._process_up_task.cancel()
        await self._process_down_task
        await self._process_up_task

    async def run(self):
        self._process_up_task = asyncio.create_task(self._process_up_queue())
        self._process_down_task = asyncio.create_task(self._process_down_queue())
        await asyncio.gather(self._process_up_task, self._process_down_task)
        self._finished = True

    async def queue_frame(self, frame: Frame):
        await self._down_queue.put(frame)

    async def queue_frames(self, frames: Iterable[Frame] | AsyncIterable[Frame]):
        if isinstance(frames, AsyncIterable):
            async for frame in frames:
                await self.queue_frame(frame)
        elif isinstance(frames, Iterable):
            for frame in frames:
                await self.queue_frame(frame)
        else:
            raise Exception("Frames must be an iterable or async iterable")

    def _initial_metrics_frame(self) -> MetricsFrame:
        processors = self._pipeline.processors_with_metrics()
        ttfb = [{"name": p.name, "time": 0.0} for p in processors]
        processing = [{"name": p.name, "time": 0.0} for p in processors]
        return MetricsFrame(ttfb=ttfb, processing=processing)

    async def _process_down_queue(self):
        start_frame = StartFrame(
            allow_interruptions=self._params.allow_interruptions,
            enable_metrics=self._params.enable_metrics,
            report_only_initial_ttfb=self._params.report_only_initial_ttfb
        )
        await self._source.process_frame(start_frame, FrameDirection.DOWNSTREAM)
        await self._source.process_frame(self._initial_metrics_frame(), FrameDirection.DOWNSTREAM)

        running = True
        should_cleanup = True
        while running:
            try:
                frame = await self._down_queue.get()
                await self._source.process_frame(frame, FrameDirection.DOWNSTREAM)
                running = not (isinstance(frame, StopTaskFrame) or isinstance(frame, EndFrame))
                should_cleanup = not isinstance(frame, StopTaskFrame)
                self._down_queue.task_done()
            except asyncio.CancelledError:
                break
        # Cleanup only if we need to.
        if should_cleanup:
            await self._source.cleanup()
            await self._pipeline.cleanup()
        # Cancel the up-queue task so it terminates gracefully.
        self._process_up_task.cancel()
        await self._process_up_task

    async def _process_up_queue(self):
        while True:
            try:
                frame = await self._up_queue.get()
                if isinstance(frame, ErrorFrame):
                    logger.error(f"Error running app: {frame.error}")
                    await self.queue_frame(CancelFrame())
                self._up_queue.task_done()
            except asyncio.CancelledError:
                break

    def __str__(self):
        return self.name
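End-to-end sketch (illustrative, assuming the modules above): a PipelineTask wraps a pipeline, pushes a StartFrame and an initial MetricsFrame first, and then streams queued frames; a PipelineRunner awaits the task and wires SIGINT/SIGTERM handling:

    import asyncio

    from pipecat.frames.frames import EndFrame, TextFrame
    from pipecat.pipeline.pipeline import Pipeline
    from pipecat.pipeline.runner import PipelineRunner
    from pipecat.pipeline.task import PipelineParams, PipelineTask

    async def main():
        pipeline = Pipeline([])  # add real processors here
        task = PipelineTask(pipeline, PipelineParams(enable_metrics=True))
        runner = PipelineRunner()  # must be created inside a running event loop
        await task.queue_frames([TextFrame("hello"), EndFrame()])
        await runner.run(task)

    asyncio.run(main())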
pipecat/processors/__init__.py ADDED
File without changes
pipecat/processors/aggregators/__init__.py ADDED
File without changes
pipecat/processors/aggregators/gated.py ADDED
@@ -0,0 +1,74 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

from typing import List

from pipecat.frames.frames import Frame, SystemFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor

from loguru import logger


class GatedAggregator(FrameProcessor):
    """Accumulate frames, with custom functions to start and stop accumulation.
    Yields the gate-opening frame before any accumulated frames, then ensuing
    frames until (but not including) the gate-closing frame.

    >>> from pipecat.pipeline.frames import ImageFrame

    >>> async def print_frames(aggregator, frame):
    ...     async for frame in aggregator.process_frame(frame):
    ...         if isinstance(frame, TextFrame):
    ...             print(frame.text)
    ...         else:
    ...             print(frame.__class__.__name__)

    >>> aggregator = GatedAggregator(
    ...     gate_close_fn=lambda x: isinstance(x, LLMResponseStartFrame),
    ...     gate_open_fn=lambda x: isinstance(x, ImageFrame),
    ...     start_open=False)
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello again.")))
    >>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0))))
    ImageFrame
    Hello
    Hello again.
    >>> asyncio.run(print_frames(aggregator, TextFrame("Goodbye.")))
    Goodbye.
    """

    def __init__(self, gate_open_fn, gate_close_fn, start_open):
        super().__init__()
        self._gate_open_fn = gate_open_fn
        self._gate_close_fn = gate_close_fn
        self._gate_open = start_open
        self._accumulator: List[Frame] = []

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        # We must not block system frames.
        if isinstance(frame, SystemFrame):
            await self.push_frame(frame, direction)
            return

        old_state = self._gate_open
        if self._gate_open:
            self._gate_open = not self._gate_close_fn(frame)
        else:
            self._gate_open = self._gate_open_fn(frame)

        if old_state != self._gate_open:
            state = "open" if self._gate_open else "closed"
            logger.debug(f"Gate is now {state} because of {frame}")

        if self._gate_open:
            await self.push_frame(frame, direction)
            for frame in self._accumulator:
                await self.push_frame(frame, direction)
            self._accumulator = []
        else:
            self._accumulator.append(frame)
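Note that the doctest above still imports ImageFrame and LLMResponseStartFrame from the older `pipecat.pipeline.frames` module. With the frame classes included in this upload, an equivalent gate could be built like this (illustrative sketch only):

    from pipecat.frames.frames import ImageRawFrame, LLMFullResponseStartFrame
    from pipecat.processors.aggregators.gated import GatedAggregator

    gated = GatedAggregator(
        gate_open_fn=lambda frame: isinstance(frame, ImageRawFrame),
        gate_close_fn=lambda frame: isinstance(frame, LLMFullResponseStartFrame),
        start_open=False)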
pipecat/processors/aggregators/llm_response.py ADDED
@@ -0,0 +1,266 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

from typing import List

from pipecat.services.openai import OpenAILLMContextFrame, OpenAILLMContext

from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.frames.frames import (
    Frame,
    InterimTranscriptionFrame,
    LLMFullResponseEndFrame,
    LLMFullResponseStartFrame,
    LLMResponseEndFrame,
    LLMResponseStartFrame,
    LLMMessagesFrame,
    StartInterruptionFrame,
    TranscriptionFrame,
    TextFrame,
    UserStartedSpeakingFrame,
    UserStoppedSpeakingFrame)


class LLMResponseAggregator(FrameProcessor):

    def __init__(
        self,
        *,
        messages: List[dict],
        role: str,
        start_frame,
        end_frame,
        accumulator_frame: TextFrame,
        interim_accumulator_frame: TextFrame | None = None,
        handle_interruptions: bool = False
    ):
        super().__init__()

        self._messages = messages
        self._role = role
        self._start_frame = start_frame
        self._end_frame = end_frame
        self._accumulator_frame = accumulator_frame
        self._interim_accumulator_frame = interim_accumulator_frame
        self._handle_interruptions = handle_interruptions

        # Reset our accumulator state.
        self._reset()

    @property
    def messages(self):
        return self._messages

    @property
    def role(self):
        return self._role

    #
    # Frame processor
    #

    # Use cases implemented:
    #
    # S: Start, E: End, T: Transcription, I: Interim, X: Text
    #
    # S E -> None
    # S T E -> X
    # S I T E -> X
    # S I E T -> X
    # S I E I T -> X
    # S E T -> X
    # S E I T -> X
    #
    # The following case would not be supported:
    #
    # S I E T1 I T2 -> X
    #
    # and T2 would be dropped.

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        send_aggregation = False

        if isinstance(frame, self._start_frame):
            self._aggregation = ""
            self._aggregating = True
            self._seen_start_frame = True
            self._seen_end_frame = False
            self._seen_interim_results = False
            await self.push_frame(frame, direction)
        elif isinstance(frame, self._end_frame):
            self._seen_end_frame = True
            self._seen_start_frame = False

            # We might have received the end frame but we might still be
            # aggregating (i.e. we have seen interim results but not the final
            # text).
            self._aggregating = self._seen_interim_results or len(self._aggregation) == 0

            # Send the aggregation if we are not aggregating anymore (i.e. no
            # more interim results received).
            send_aggregation = not self._aggregating
            await self.push_frame(frame, direction)
        elif isinstance(frame, self._accumulator_frame):
            if self._aggregating:
                self._aggregation += f" {frame.text}"
                # We have received a complete sentence, so if we have seen the
                # end frame and we were still aggregating, it means we should
                # send the aggregation.
                send_aggregation = self._seen_end_frame

            # We just got our final result, so let's reset interim results.
            self._seen_interim_results = False
        elif self._interim_accumulator_frame and isinstance(frame, self._interim_accumulator_frame):
            self._seen_interim_results = True
        elif self._handle_interruptions and isinstance(frame, StartInterruptionFrame):
            await self._push_aggregation()
            # Reset anyway.
            self._reset()
            await self.push_frame(frame, direction)
        else:
            await self.push_frame(frame, direction)

        if send_aggregation:
            await self._push_aggregation()

    async def _push_aggregation(self):
        if len(self._aggregation) > 0:
            self._messages.append({"role": self._role, "content": self._aggregation})

            # Reset the aggregation. Reset it before pushing it down, otherwise
            # if the task gets cancelled we won't be able to clear things up.
            self._aggregation = ""

            frame = LLMMessagesFrame(self._messages)
            await self.push_frame(frame)

    def _reset(self):
        self._aggregation = ""
        self._aggregating = False
        self._seen_start_frame = False
        self._seen_end_frame = False
        self._seen_interim_results = False


class LLMAssistantResponseAggregator(LLMResponseAggregator):
    def __init__(self, messages: List[dict] = []):
        super().__init__(
            messages=messages,
            role="assistant",
            start_frame=LLMFullResponseStartFrame,
            end_frame=LLMFullResponseEndFrame,
            accumulator_frame=TextFrame,
            handle_interruptions=True
        )


class LLMUserResponseAggregator(LLMResponseAggregator):
    def __init__(self, messages: List[dict] = []):
        super().__init__(
            messages=messages,
            role="user",
            start_frame=UserStartedSpeakingFrame,
            end_frame=UserStoppedSpeakingFrame,
            accumulator_frame=TranscriptionFrame,
            interim_accumulator_frame=InterimTranscriptionFrame
        )


class LLMFullResponseAggregator(FrameProcessor):
    """This class aggregates Text frames until it receives an
    LLMFullResponseEndFrame, then emits the concatenated text as
    a single text frame.

    Given the following frames:

        TextFrame("Hello,")
        TextFrame(" world.")
        TextFrame(" I am")
        TextFrame(" an LLM.")
        LLMFullResponseEndFrame()

    this processor will yield nothing for the first 4 frames, then

        TextFrame("Hello, world. I am an LLM.")
        LLMFullResponseEndFrame()

    when passed the last frame.

    >>> async def print_frames(aggregator, frame):
    ...     async for frame in aggregator.process_frame(frame):
    ...         if isinstance(frame, TextFrame):
    ...             print(frame.text)
    ...         else:
    ...             print(frame.__class__.__name__)

    >>> aggregator = LLMFullResponseAggregator()
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello,")))
    >>> asyncio.run(print_frames(aggregator, TextFrame(" world.")))
    >>> asyncio.run(print_frames(aggregator, TextFrame(" I am")))
    >>> asyncio.run(print_frames(aggregator, TextFrame(" an LLM.")))
    >>> asyncio.run(print_frames(aggregator, LLMFullResponseEndFrame()))
    Hello, world. I am an LLM.
    LLMFullResponseEndFrame
    """

    def __init__(self):
        super().__init__()
        self._aggregation = ""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if isinstance(frame, TextFrame):
            self._aggregation += frame.text
        elif isinstance(frame, LLMFullResponseEndFrame):
            await self.push_frame(TextFrame(self._aggregation))
            await self.push_frame(frame)
            self._aggregation = ""
        else:
            await self.push_frame(frame, direction)


class LLMContextAggregator(LLMResponseAggregator):
    def __init__(self, *, context: OpenAILLMContext, **kwargs):
        self._context = context
        super().__init__(**kwargs)

    async def _push_aggregation(self):
        if len(self._aggregation) > 0:
            self._context.add_message({"role": self._role, "content": self._aggregation})
            frame = OpenAILLMContextFrame(self._context)
            await self.push_frame(frame)

            # Reset our accumulator state.
            self._reset()


class LLMAssistantContextAggregator(LLMContextAggregator):
    def __init__(self, context: OpenAILLMContext):
        super().__init__(
            messages=[],
            context=context,
            role="assistant",
            start_frame=LLMResponseStartFrame,
            end_frame=LLMResponseEndFrame,
            accumulator_frame=TextFrame
        )


class LLMUserContextAggregator(LLMContextAggregator):
    def __init__(self, context: OpenAILLMContext):
        super().__init__(
            messages=[],
            context=context,
            role="user",
            start_frame=UserStartedSpeakingFrame,
            end_frame=UserStoppedSpeakingFrame,
            accumulator_frame=TranscriptionFrame,
            interim_accumulator_frame=InterimTranscriptionFrame
        )
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

from dataclasses import dataclass
import io
import json

from typing import List

from PIL import Image

from pipecat.frames.frames import Frame, VisionImageRawFrame

from openai._types import NOT_GIVEN, NotGiven

from openai.types.chat import (
    ChatCompletionToolParam,
    ChatCompletionToolChoiceOptionParam,
    ChatCompletionMessageParam
)

# JSON custom encoder to handle bytes arrays so that we can log contexts
# with images to the console.


class CustomEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, io.BytesIO):
            # Convert the first 8 bytes to an ASCII hex string
            return (f"{obj.getbuffer()[0:8].hex()}...")
        return super().default(obj)


class OpenAILLMContext:

    def __init__(
        self,
        messages: List[ChatCompletionMessageParam] | None = None,
        tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,
        tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN
    ):
        self.messages: List[ChatCompletionMessageParam] = messages if messages else []
        self.tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = tool_choice
        self.tools: List[ChatCompletionToolParam] | NotGiven = tools

    @staticmethod
    def from_messages(messages: List[dict]) -> "OpenAILLMContext":
        context = OpenAILLMContext()
        for message in messages:
            context.add_message({
                "content": message["content"],
                "role": message["role"],
                "name": message["name"] if "name" in message else message["role"]
            })
        return context

    @staticmethod
    def from_image_frame(frame: VisionImageRawFrame) -> "OpenAILLMContext":
        """
        For images, we are deviating from the OpenAI messages shape. OpenAI
        expects images to be base64 encoded, but other vision models may not.
        So we'll store the image as bytes and do the base64 encoding as needed
        in the LLM service.
        """
        context = OpenAILLMContext()
        buffer = io.BytesIO()
        Image.frombytes(
            frame.format,
            frame.size,
            frame.image
        ).save(
            buffer,
            format="JPEG")
        context.add_message({
            "content": frame.text,
            "role": "user",
            "data": buffer,
            "mime_type": "image/jpeg"
        })
        return context

    def add_message(self, message: ChatCompletionMessageParam):
        self.messages.append(message)

    def get_messages(self) -> List[ChatCompletionMessageParam]:
        return self.messages

    def get_messages_json(self) -> str:
        return json.dumps(self.messages, cls=CustomEncoder)

    def set_tool_choice(
        self, tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven
    ):
        self.tool_choice = tool_choice

    def set_tools(self, tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN):
        if tools != NOT_GIVEN and len(tools) == 0:
            tools = NOT_GIVEN

        self.tools = tools


@dataclass
class OpenAILLMContextFrame(Frame):
    """Like an LLMMessagesFrame, but with extra context specific to the OpenAI
    API. The context in this message is also mutable, and will be changed by the
    OpenAIContextAggregator frame processor.

    """
    context: OpenAILLMContext
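A short sketch of the context helpers (illustrative, not part of the upload): contexts can be built from plain message dicts and serialized for logging with the custom encoder, which hex-abbreviates BytesIO image payloads. The example message contents are made up:

    from pipecat.processors.aggregators.openai_llm_context import (
        OpenAILLMContext, OpenAILLMContextFrame)

    context = OpenAILLMContext.from_messages([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is Pipecat?"},
    ])
    context.add_message({"role": "assistant", "content": "A framework for voice pipelines."})
    print(context.get_messages_json())

    frame = OpenAILLMContextFrame(context)  # push this down a pipeline to run the LLM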
pipecat/processors/aggregators/sentence.py ADDED
@@ -0,0 +1,54 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import re

from pipecat.frames.frames import EndFrame, Frame, InterimTranscriptionFrame, TextFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class SentenceAggregator(FrameProcessor):
    """This frame processor aggregates text frames into complete sentences.

    Frame input/output:
        TextFrame("Hello,") -> None
        TextFrame(" world.") -> TextFrame("Hello, world.")

    Doctest:
    >>> async def print_frames(aggregator, frame):
    ...     async for frame in aggregator.process_frame(frame):
    ...         print(frame.text)

    >>> aggregator = SentenceAggregator()
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello,")))
    >>> asyncio.run(print_frames(aggregator, TextFrame(" world.")))
    Hello, world.
    """

    def __init__(self):
        super().__init__()
        self._aggregation = ""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        # We ignore interim transcriptions at this point.
        if isinstance(frame, InterimTranscriptionFrame):
            return

        if isinstance(frame, TextFrame):
            m = re.search("(.*[?.!])(.*)", frame.text)
            if m:
                await self.push_frame(TextFrame(self._aggregation + m.group(1)))
                self._aggregation = m.group(2)
            else:
                self._aggregation += frame.text
        elif isinstance(frame, EndFrame):
            if self._aggregation:
                await self.push_frame(TextFrame(self._aggregation))
            await self.push_frame(frame)
        else:
            await self.push_frame(frame, direction)
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
8
+ from pipecat.frames.frames import (
9
+ Frame,
10
+ InterimTranscriptionFrame,
11
+ StartInterruptionFrame,
12
+ TextFrame,
13
+ TranscriptionFrame,
14
+ UserStartedSpeakingFrame,
15
+ UserStoppedSpeakingFrame)
16
+
17
+
18
+ class ResponseAggregator(FrameProcessor):
19
+ """This frame processor aggregates frames between a start and an end frame
20
+ into complete text frame sentences.
21
+
22
+ For example, frame input/output:
23
+ UserStartedSpeakingFrame() -> None
24
+ TranscriptionFrame("Hello,") -> None
25
+ TranscriptionFrame(" world.") -> None
26
+ UserStoppedSpeakingFrame() -> TextFrame("Hello world.")
27
+
28
+ Doctest:
29
+ >>> async def print_frames(aggregator, frame):
30
+ ... async for frame in aggregator.process_frame(frame):
31
+ ... if isinstance(frame, TextFrame):
32
+ ... print(frame.text)
33
+
34
+ >>> aggregator = ResponseAggregator(start_frame = UserStartedSpeakingFrame,
35
+ ... end_frame=UserStoppedSpeakingFrame,
36
+ ... accumulator_frame=TranscriptionFrame,
37
+ ... interim_accumulator_frame=InterimTranscriptionFrame)
38
+ >>> asyncio.run(print_frames(aggregator, UserStartedSpeakingFrame()))
39
+ >>> asyncio.run(print_frames(aggregator, TranscriptionFrame("Hello,", 1, 1)))
40
+ >>> asyncio.run(print_frames(aggregator, TranscriptionFrame("world.", 1, 2)))
41
+ >>> asyncio.run(print_frames(aggregator, UserStoppedSpeakingFrame()))
42
+ Hello, world.
43
+
44
+ """
45
+
46
+ def __init__(
47
+ self,
48
+ *,
49
+ start_frame,
50
+ end_frame,
51
+ accumulator_frame: TextFrame,
52
+ interim_accumulator_frame: TextFrame | None = None
53
+ ):
54
+ super().__init__()
55
+
56
+ self._start_frame = start_frame
57
+ self._end_frame = end_frame
58
+ self._accumulator_frame = accumulator_frame
59
+ self._interim_accumulator_frame = interim_accumulator_frame
60
+
61
+ # Reset our accumulator state.
62
+ self._reset()
63
+
64
+ #
65
+ # Frame processor
66
+ #
67
+
68
+ # Use cases implemented:
69
+ #
70
+ # S: Start, E: End, T: Transcription, I: Interim, X: Text
71
+ #
72
+ # S E -> None
73
+ # S T E -> X
74
+ # S I T E -> X
75
+ # S I E T -> X
76
+ # S I E I T -> X
77
+ # S E T -> X
78
+ # S E I T -> X
79
+ #
80
+ # The following case would not be supported:
81
+ #
82
+ # S I E T1 I T2 -> X
83
+ #
84
+ # and T2 would be dropped.
85
+
86
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
87
+ await super().process_frame(frame, direction)
88
+
89
+ send_aggregation = False
90
+
91
+ if isinstance(frame, self._start_frame):
92
+ self._aggregating = True
93
+ self._seen_start_frame = True
94
+ self._seen_end_frame = False
95
+ self._seen_interim_results = False
96
+ await self.push_frame(frame, direction)
97
+ elif isinstance(frame, self._end_frame):
98
+ self._seen_end_frame = True
99
+ self._seen_start_frame = False
100
+
101
+ # We might have received the end frame but we might still be
102
+ # aggregating (i.e. we have seen interim results but not the final
103
+ # text).
104
+ self._aggregating = self._seen_interim_results or len(self._aggregation) == 0
105
+
106
+ # Send the aggregation if we are not aggregating anymore (i.e. no
107
+ # more interim results received).
108
+ send_aggregation = not self._aggregating
109
+ await self.push_frame(frame, direction)
110
+ elif isinstance(frame, self._accumulator_frame):
111
+ if self._aggregating:
112
+ self._aggregation += f" {frame.text}"
113
+ # We have received a complete sentence, so if we have seen the
114
+ # end frame and we were still aggregating, it means we should
115
+ # send the aggregation.
116
+ send_aggregation = self._seen_end_frame
117
+
118
+ # We just got our final result, so let's reset interim results.
119
+ self._seen_interim_results = False
120
+ elif self._interim_accumulator_frame and isinstance(frame, self._interim_accumulator_frame):
121
+ self._seen_interim_results = True
122
+ else:
123
+ await self.push_frame(frame, direction)
124
+
125
+ if send_aggregation:
126
+ await self._push_aggregation()
127
+
128
+ async def _push_aggregation(self):
129
+ if len(self._aggregation) > 0:
130
+ frame = TextFrame(self._aggregation.strip())
131
+
132
+ # Reset the aggregation. Reset it before pushing it down, otherwise
133
+ # if the tasks gets cancelled we won't be able to clear things up.
134
+ self._aggregation = ""
135
+
136
+ await self.push_frame(frame)
137
+
138
+ # Reset our accumulator state.
139
+ self._reset()
140
+
141
+ def _reset(self):
142
+ self._aggregation = ""
143
+ self._aggregating = False
144
+ self._seen_start_frame = False
145
+ self._seen_end_frame = False
146
+ self._seen_interim_results = False
147
+
148
+
149
+ class UserResponseAggregator(ResponseAggregator):
150
+ def __init__(self):
151
+ super().__init__(
152
+ start_frame=UserStartedSpeakingFrame,
153
+ end_frame=UserStoppedSpeakingFrame,
154
+ accumulator_frame=TranscriptionFrame,
155
+ interim_accumulator_frame=InterimTranscriptionFrame,
156
+ )
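
A sketch of the aggregation flow described in the docstring and the S/E/T/I cases above, driving the processor directly rather than inside a pipeline (not part of the uploaded file; frame constructors follow the doctest above, and the user id/timestamp values are made up):

    import asyncio

    from pipecat.frames.frames import (TranscriptionFrame, UserStartedSpeakingFrame,
                                       UserStoppedSpeakingFrame)
    from pipecat.processors.aggregators.user_response import UserResponseAggregator
    from pipecat.processors.frame_processor import FrameDirection
    from pipecat.processors.logger import FrameLogger


    async def main():
        aggregator = UserResponseAggregator()
        aggregator.link(FrameLogger("Aggregated"))  # logs every frame it receives

        down = FrameDirection.DOWNSTREAM
        await aggregator.process_frame(UserStartedSpeakingFrame(), down)
        await aggregator.process_frame(TranscriptionFrame("Hello,", "user-1", "t0"), down)
        await aggregator.process_frame(TranscriptionFrame("world.", "user-1", "t1"), down)
        # The accumulated TextFrame("Hello, world.") is pushed once the user stops.
        await aggregator.process_frame(UserStoppedSpeakingFrame(), down)


    asyncio.run(main())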
pipecat/processors/aggregators/vision_image_frame.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from pipecat.frames.frames import Frame, ImageRawFrame, TextFrame, VisionImageRawFrame
8
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
9
+
10
+
11
+ class VisionImageFrameAggregator(FrameProcessor):
12
+ """This aggregator waits for a consecutive TextFrame and an
13
+ ImageFrame. After the ImageFrame arrives it will output a VisionImageFrame.
14
+
15
+ >>> from pipecat.pipeline.frames import ImageFrame
16
+
17
+ >>> async def print_frames(aggregator, frame):
18
+ ... async for frame in aggregator.process_frame(frame):
19
+ ... print(frame)
20
+
21
+ >>> aggregator = VisionImageFrameAggregator()
22
+ >>> asyncio.run(print_frames(aggregator, TextFrame("What do you see?")))
23
+ >>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0))))
24
+ VisionImageFrame, text: What do you see?, image size: 0x0, buffer size: 0 B
25
+
26
+ """
27
+
28
+ def __init__(self):
29
+ super().__init__()
30
+ self._describe_text = None
31
+
32
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
33
+ await super().process_frame(frame, direction)
34
+
35
+ if isinstance(frame, TextFrame):
36
+ self._describe_text = frame.text
37
+ elif isinstance(frame, ImageRawFrame):
38
+ if self._describe_text:
39
+ frame = VisionImageRawFrame(
40
+ text=self._describe_text,
41
+ image=frame.image,
42
+ size=frame.size,
43
+ format=frame.format)
44
+ await self.push_frame(frame)
45
+ self._describe_text = None
46
+ else:
47
+ await self.push_frame(frame, direction)
pipecat/processors/async_frame_processor.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import asyncio
8
+
9
+ from pipecat.frames.frames import EndFrame, Frame, StartInterruptionFrame
10
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
11
+
12
+
13
+ class AsyncFrameProcessor(FrameProcessor):
14
+
15
+ def __init__(
16
+ self,
17
+ *,
18
+ name: str | None = None,
19
+ loop: asyncio.AbstractEventLoop | None = None,
20
+ **kwargs):
21
+ super().__init__(name=name, loop=loop, **kwargs)
22
+
23
+ self._create_push_task()
24
+
25
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
26
+ await super().process_frame(frame, direction)
27
+
28
+ if isinstance(frame, StartInterruptionFrame):
29
+ await self._handle_interruptions(frame)
30
+
31
+ async def queue_frame(
32
+ self,
33
+ frame: Frame,
34
+ direction: FrameDirection = FrameDirection.DOWNSTREAM):
35
+ await self._push_queue.put((frame, direction))
36
+
37
+ async def cleanup(self):
38
+ self._push_frame_task.cancel()
39
+ await self._push_frame_task
40
+
41
+ async def _handle_interruptions(self, frame: Frame):
42
+ # Cancel the task. This will stop pushing frames downstream.
43
+ self._push_frame_task.cancel()
44
+ await self._push_frame_task
45
+ # Push an out-of-band frame (i.e. not using the ordered push
46
+ # frame task).
47
+ await self.push_frame(frame)
48
+ # Create a new queue and task.
49
+ self._create_push_task()
50
+
51
+ def _create_push_task(self):
52
+ self._push_queue = asyncio.Queue()
53
+ self._push_frame_task = self.get_event_loop().create_task(self._push_frame_task_handler())
54
+
55
+ async def _push_frame_task_handler(self):
56
+ running = True
57
+ while running:
58
+ try:
59
+ (frame, direction) = await self._push_queue.get()
60
+ await self.push_frame(frame, direction)
61
+ running = not isinstance(frame, EndFrame)
62
+ except asyncio.CancelledError:
63
+ break
pipecat/processors/filters/__init__.py ADDED
File without changes
pipecat/processors/filters/frame_filter.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from typing import List
8
+
9
+ from pipecat.frames.frames import AppFrame, ControlFrame, Frame, SystemFrame
10
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
11
+
12
+
13
+ class FrameFilter(FrameProcessor):
14
+
15
+ def __init__(self, types: List[type]):
16
+ super().__init__()
17
+ self._types = types
18
+
19
+ #
20
+ # Frame processor
21
+ #
22
+
23
+ def _should_passthrough_frame(self, frame):
24
+ for t in self._types:
25
+ if isinstance(frame, t):
26
+ return True
27
+
28
+ return (isinstance(frame, AppFrame)
29
+ or isinstance(frame, ControlFrame)
30
+ or isinstance(frame, SystemFrame))
31
+
32
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
33
+ await super().process_frame(frame, direction)
34
+
35
+ if self._should_passthrough_frame(frame):
36
+ await self.push_frame(frame, direction)
pipecat/processors/filters/function_filter.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from typing import Awaitable, Callable
8
+
9
+ from pipecat.frames.frames import Frame, SystemFrame
10
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
11
+
12
+
13
+ class FunctionFilter(FrameProcessor):
14
+
15
+ def __init__(self, filter: Callable[[Frame], Awaitable[bool]]):
16
+ super().__init__()
17
+ self._filter = filter
18
+
19
+ #
20
+ # Frame processor
21
+ #
22
+
23
+ def _should_passthrough_frame(self, frame):
24
+ return isinstance(frame, SystemFrame)
25
+
26
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
27
+ passthrough = self._should_passthrough_frame(frame)
28
+ allowed = await self._filter(frame)
29
+ if passthrough or allowed:
30
+ await self.push_frame(frame, direction)
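
The filter predicate is an async callable that receives each frame; SystemFrames always pass regardless of its result. A minimal sketch (not part of the uploaded file):

    import asyncio

    from pipecat.frames.frames import Frame, TextFrame
    from pipecat.processors.filters.function_filter import FunctionFilter
    from pipecat.processors.frame_processor import FrameDirection


    async def non_empty_text(frame: Frame) -> bool:
        # Allow only TextFrames that actually carry some text.
        return isinstance(frame, TextFrame) and len(frame.text.strip()) > 0


    async def main():
        # FrameProcessor picks up the running event loop, so construct it here;
        # in practice the filter would be linked into a pipeline.
        text_filter = FunctionFilter(filter=non_empty_text)
        await text_filter.process_frame(TextFrame("   "), FrameDirection.DOWNSTREAM)    # dropped
        await text_filter.process_frame(TextFrame("hello"), FrameDirection.DOWNSTREAM)  # allowed through


    asyncio.run(main())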
pipecat/processors/filters/wake_check_filter.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import re
8
+ import time
9
+
10
+ from enum import Enum
11
+
12
+ from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
13
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
14
+
15
+ from loguru import logger
16
+
17
+
18
+ class WakeCheckFilter(FrameProcessor):
19
+ """
20
+ This filter looks for wake phrases in the transcription frames and only passes through frames
21
+ after a wake phrase has been detected. It also has a keepalive timeout to allow for a brief
22
+ period of continued conversation after a wake phrase has been detected.
23
+ """
24
+ class WakeState(Enum):
25
+ IDLE = 1
26
+ AWAKE = 2
27
+
28
+ class ParticipantState:
29
+ def __init__(self, participant_id: str):
30
+ self.participant_id = participant_id
31
+ self.state = WakeCheckFilter.WakeState.IDLE
32
+ self.wake_timer = 0.0
33
+ self.accumulator = ""
34
+
35
+ def __init__(self, wake_phrases: list[str], keepalive_timeout: float = 3):
36
+ super().__init__()
37
+ self._participant_states = {}
38
+ self._keepalive_timeout = keepalive_timeout
39
+ self._wake_patterns = []
40
+ for name in wake_phrases:
41
+ pattern = re.compile(r'\b' + r'\s*'.join(re.escape(word)
42
+ for word in name.split()) + r'\b', re.IGNORECASE)
43
+ self._wake_patterns.append(pattern)
44
+
45
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
46
+ await super().process_frame(frame, direction)
47
+
48
+ try:
49
+ if isinstance(frame, TranscriptionFrame):
50
+ p = self._participant_states.get(frame.user_id)
51
+ if p is None:
52
+ p = WakeCheckFilter.ParticipantState(frame.user_id)
53
+ self._participant_states[frame.user_id] = p
54
+
55
+ # If we have been AWAKE within the last keepalive_timeout seconds, pass
56
+ # the frame through
57
+ if p.state == WakeCheckFilter.WakeState.AWAKE:
58
+ if time.time() - p.wake_timer < self._keepalive_timeout:
59
+ logger.debug(
60
+ f"Wake phrase keepalive timeout has not expired. Pushing {frame}")
61
+ p.wake_timer = time.time()
62
+ await self.push_frame(frame)
63
+ return
64
+ else:
65
+ p.state = WakeCheckFilter.WakeState.IDLE
66
+
67
+ p.accumulator += frame.text
68
+ for pattern in self._wake_patterns:
69
+ match = pattern.search(p.accumulator)
70
+ if match:
71
+ logger.debug(f"Wake phrase triggered: {match.group()}")
72
+ # Found the wake word. Discard from the accumulator up to the start of the match
73
+ # and modify the frame in place.
74
+ p.state = WakeCheckFilter.WakeState.AWAKE
75
+ p.wake_timer = time.time()
76
+ frame.text = p.accumulator[match.start():]
77
+ p.accumulator = ""
78
+ await self.push_frame(frame)
79
+ else:
80
+ pass
81
+ else:
82
+ await self.push_frame(frame, direction)
83
+ except Exception as e:
84
+ error_msg = f"Error in wake word filter: {e}"
85
+ logger.exception(error_msg)
86
+ await self.push_error(ErrorFrame(error_msg))
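
A sketch of the wake-word behaviour described in the docstring, again driving the processor directly (not part of the uploaded file; the wake phrase, user id and timestamps are made up):

    import asyncio

    from pipecat.frames.frames import TranscriptionFrame
    from pipecat.processors.filters.wake_check_filter import WakeCheckFilter
    from pipecat.processors.frame_processor import FrameDirection
    from pipecat.processors.logger import FrameLogger


    async def main():
        # Forward transcriptions only after "hey robot" is heard, and keep the
        # conversation awake for 5 seconds after each forwarded frame.
        wake_filter = WakeCheckFilter(["hey robot"], keepalive_timeout=5)
        wake_filter.link(FrameLogger("Awake"))

        down = FrameDirection.DOWNSTREAM
        # Dropped: no wake phrase seen yet.
        await wake_filter.process_frame(TranscriptionFrame("what time is it", "u1", "t0"), down)
        # Forwarded, with the text trimmed to start at the wake phrase match.
        await wake_filter.process_frame(
            TranscriptionFrame("ok hey robot what's the weather", "u1", "t1"), down)


    asyncio.run(main())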
pipecat/processors/frame_processor.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import asyncio
8
+ import time
9
+
10
+ from enum import Enum
11
+
12
+ from pipecat.frames.frames import ErrorFrame, Frame, MetricsFrame, StartFrame, StartInterruptionFrame, UserStoppedSpeakingFrame
13
+ from pipecat.utils.utils import obj_count, obj_id
14
+
15
+ from loguru import logger
16
+
17
+
18
+ class FrameDirection(Enum):
19
+ DOWNSTREAM = 1
20
+ UPSTREAM = 2
21
+
22
+
23
+ class FrameProcessorMetrics:
24
+ def __init__(self, name: str):
25
+ self._name = name
26
+ self._start_ttfb_time = 0
27
+ self._start_processing_time = 0
28
+ self._should_report_ttfb = True
29
+
30
+ async def start_ttfb_metrics(self, report_only_initial_ttfb):
31
+ if self._should_report_ttfb:
32
+ self._start_ttfb_time = time.time()
33
+ self._should_report_ttfb = not report_only_initial_ttfb
34
+
35
+ async def stop_ttfb_metrics(self):
36
+ if self._start_ttfb_time == 0:
37
+ return None
38
+
39
+ value = time.time() - self._start_ttfb_time
40
+ logger.debug(f"{self._name} TTFB: {value}")
41
+ ttfb = {
42
+ "processor": self._name,
43
+ "value": value
44
+ }
45
+ self._start_ttfb_time = 0
46
+ return MetricsFrame(ttfb=[ttfb])
47
+
48
+ async def start_processing_metrics(self):
49
+ self._start_processing_time = time.time()
50
+
51
+ async def stop_processing_metrics(self):
52
+ if self._start_processing_time == 0:
53
+ return None
54
+
55
+ value = time.time() - self._start_processing_time
56
+ logger.debug(f"{self._name} processing time: {value}")
57
+ processing = {
58
+ "processor": self._name,
59
+ "value": value
60
+ }
61
+ self._start_processing_time = 0
62
+ return MetricsFrame(processing=[processing])
63
+
64
+
65
+ class FrameProcessor:
66
+
67
+ def __init__(
68
+ self,
69
+ *,
70
+ name: str | None = None,
71
+ loop: asyncio.AbstractEventLoop | None = None,
72
+ **kwargs):
73
+ self.id: int = obj_id()
74
+ self.name = name or f"{self.__class__.__name__}#{obj_count(self)}"
75
+ self._prev: "FrameProcessor" | None = None
76
+ self._next: "FrameProcessor" | None = None
77
+ self._loop: asyncio.AbstractEventLoop = loop or asyncio.get_running_loop()
78
+
79
+ # Properties
80
+ self._allow_interruptions = False
81
+ self._enable_metrics = False
82
+ self._report_only_initial_ttfb = False
83
+
84
+ # Metrics
85
+ self._metrics = FrameProcessorMetrics(name=self.name)
86
+
87
+ @property
88
+ def interruptions_allowed(self):
89
+ return self._allow_interruptions
90
+
91
+ @property
92
+ def metrics_enabled(self):
93
+ return self._enable_metrics
94
+
95
+ @property
96
+ def report_only_initial_ttfb(self):
97
+ return self._report_only_initial_ttfb
98
+
99
+ def can_generate_metrics(self) -> bool:
100
+ return False
101
+
102
+ async def start_ttfb_metrics(self):
103
+ if self.can_generate_metrics() and self.metrics_enabled:
104
+ await self._metrics.start_ttfb_metrics(self._report_only_initial_ttfb)
105
+
106
+ async def stop_ttfb_metrics(self):
107
+ if self.can_generate_metrics() and self.metrics_enabled:
108
+ frame = await self._metrics.stop_ttfb_metrics()
109
+ if frame:
110
+ await self.push_frame(frame)
111
+
112
+ async def start_processing_metrics(self):
113
+ if self.can_generate_metrics() and self.metrics_enabled:
114
+ await self._metrics.start_processing_metrics()
115
+
116
+ async def stop_processing_metrics(self):
117
+ if self.can_generate_metrics() and self.metrics_enabled:
118
+ frame = await self._metrics.stop_processing_metrics()
119
+ if frame:
120
+ await self.push_frame(frame)
121
+
122
+ async def stop_all_metrics(self):
123
+ await self.stop_ttfb_metrics()
124
+ await self.stop_processing_metrics()
125
+
126
+ async def cleanup(self):
127
+ pass
128
+
129
+ def link(self, processor: 'FrameProcessor'):
130
+ self._next = processor
131
+ processor._prev = self
132
+ logger.debug(f"Linking {self} -> {self._next}")
133
+
134
+ def get_event_loop(self) -> asyncio.AbstractEventLoop:
135
+ return self._loop
136
+
137
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
138
+ if isinstance(frame, StartFrame):
139
+ self._allow_interruptions = frame.allow_interruptions
140
+ self._enable_metrics = frame.enable_metrics
141
+ self._report_only_initial_ttfb = frame.report_only_initial_ttfb
142
+ elif isinstance(frame, StartInterruptionFrame):
143
+ await self.stop_all_metrics()
144
+ elif isinstance(frame, UserStoppedSpeakingFrame):
145
+ self._should_report_ttfb = True
146
+
147
+ async def push_error(self, error: ErrorFrame):
148
+ await self.push_frame(error, FrameDirection.UPSTREAM)
149
+
150
+ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
151
+ try:
152
+ if direction == FrameDirection.DOWNSTREAM and self._next:
153
+ logger.trace(f"Pushing {frame} from {self} to {self._next}")
154
+ await self._next.process_frame(frame, direction)
155
+ elif direction == FrameDirection.UPSTREAM and self._prev:
156
+ logger.trace(f"Pushing {frame} upstream from {self} to {self._prev}")
157
+ await self._prev.process_frame(frame, direction)
158
+ except Exception as e:
159
+ logger.exception(f"Uncaught exception in {self}: {e}")
160
+
161
+ def __str__(self):
162
+ return self.name
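
The contract for subclasses is: call super().process_frame() first (it applies StartFrame properties and the interruption/metrics bookkeeping above), then do your own work and forward frames with push_frame(). A minimal custom processor might look like this (a sketch, not part of the uploaded file):

    import asyncio

    from pipecat.frames.frames import Frame, TextFrame
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


    class FrameCounter(FrameProcessor):
        # Hypothetical example processor: counts frames and forwards them unchanged.
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.count = 0

        async def process_frame(self, frame: Frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            self.count += 1
            # Forward the frame in the direction it was already travelling.
            await self.push_frame(frame, direction)


    async def main():
        counter = FrameCounter()
        for i in range(3):
            await counter.process_frame(TextFrame(f"chunk {i}"), FrameDirection.DOWNSTREAM)
        print(counter.count)  # 3


    asyncio.run(main())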
pipecat/processors/frameworks/__init__.py ADDED
File without changes
pipecat/processors/frameworks/langchain.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from typing import Union
8
+
9
+ from pipecat.frames.frames import (
10
+ Frame,
11
+ LLMFullResponseEndFrame,
12
+ LLMFullResponseStartFrame,
13
+ LLMMessagesFrame,
14
+ LLMResponseEndFrame,
15
+ LLMResponseStartFrame,
16
+ TextFrame)
17
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
18
+
19
+ from loguru import logger
20
+
21
+ try:
22
+ from langchain_core.messages import AIMessageChunk
23
+ from langchain_core.runnables import Runnable
24
+ except ModuleNotFoundError as e:
25
+ logger.exception(
26
+ "In order to use Langchain, you need to `pip install pipecat-ai[langchain]`. "
27
+ )
28
+ raise Exception(f"Missing module: {e}")
29
+
30
+
31
+ class LangchainProcessor(FrameProcessor):
32
+ def __init__(self, chain: Runnable, transcript_key: str = "input"):
33
+ super().__init__()
34
+ self._chain = chain
35
+ self._transcript_key = transcript_key
36
+ self._participant_id: str | None = None
37
+
38
+ def set_participant_id(self, participant_id: str):
39
+ self._participant_id = participant_id
40
+
41
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
42
+ await super().process_frame(frame, direction)
43
+
44
+ if isinstance(frame, LLMMessagesFrame):
45
+ # Messages are accumulated by the `LLMUserResponseAggregator` in a list of messages.
46
+ # The last one by the human is the one we want to send to the LLM.
47
+ logger.debug(f"Got transcription frame {frame}")
48
+ text: str = frame.messages[-1]["content"]
49
+
50
+ await self._ainvoke(text.strip())
51
+ else:
52
+ await self.push_frame(frame, direction)
53
+
54
+ @staticmethod
55
+ def __get_token_value(text: Union[str, AIMessageChunk]) -> str:
56
+ match text:
57
+ case str():
58
+ return text
59
+ case AIMessageChunk():
60
+ return text.content
61
+ case _:
62
+ return ""
63
+
64
+ async def _ainvoke(self, text: str):
65
+ logger.debug(f"Invoking chain with {text}")
66
+ await self.push_frame(LLMFullResponseStartFrame())
67
+ try:
68
+ async for token in self._chain.astream(
69
+ {self._transcript_key: text},
70
+ config={"configurable": {"session_id": self._participant_id}},
71
+ ):
72
+ await self.push_frame(LLMResponseStartFrame())
73
+ await self.push_frame(TextFrame(self.__get_token_value(token)))
74
+ await self.push_frame(LLMResponseEndFrame())
75
+ except GeneratorExit:
76
+ logger.warning(f"{self} generator was closed prematurely")
77
+ except Exception as e:
78
+ logger.exception(f"{self} an unknown error occurred: {e}")
79
+ finally:
80
+ await self.push_frame(LLMFullResponseEndFrame())
pipecat/processors/logger.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from pipecat.frames.frames import Frame
8
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
9
+ from loguru import logger
10
+ from typing import Optional
11
+ logger = logger.opt(ansi=True)
12
+
13
+
14
+ class FrameLogger(FrameProcessor):
15
+ def __init__(self, prefix="Frame", color: Optional[str] = None):
16
+ super().__init__()
17
+ self._prefix = prefix
18
+ self._color = color
19
+
20
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
21
+ dir = "<" if direction is FrameDirection.UPSTREAM else ">"
22
+ msg = f"{dir} {self._prefix}: {frame}"
23
+ if self._color:
24
+ msg = f"<{self._color}>{msg}</>"
25
+ logger.debug(msg)
26
+
27
+ await self.push_frame(frame, direction)
pipecat/processors/text_transformer.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from typing import Coroutine
8
+
9
+ from pipecat.frames.frames import Frame, TextFrame
10
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
11
+
12
+
13
+ class StatelessTextTransformer(FrameProcessor):
14
+ """This processor calls the given function on any text in a text frame.
15
+
16
+ >>> async def print_frames(aggregator, frame):
17
+ ... async for frame in aggregator.process_frame(frame):
18
+ ... print(frame.text)
19
+
20
+ >>> aggregator = StatelessTextTransformer(lambda x: x.upper())
21
+ >>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
22
+ HELLO
23
+ """
24
+
25
+ def __init__(self, transform_fn):
26
+ super().__init__()
27
+ self._transform_fn = transform_fn
28
+
29
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
30
+ await super().process_frame(frame, direction)
31
+
32
+ if isinstance(frame, TextFrame):
33
+ result = self._transform_fn(frame.text)
34
+ if isinstance(result, Coroutine):
35
+ result = await result
36
+ await self.push_frame(TextFrame(result))
37
+ else:
38
+ await self.push_frame(frame, direction)
pipecat/serializers/__init__.py ADDED
File without changes
pipecat/serializers/base_serializer.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from abc import ABC, abstractmethod
8
+
9
+ from pipecat.frames.frames import Frame
10
+
11
+
12
+ class FrameSerializer(ABC):
13
+
14
+ @abstractmethod
15
+ def serialize(self, frame: Frame) -> str | bytes | None:
16
+ pass
17
+
18
+ @abstractmethod
19
+ def deserialize(self, data: str | bytes) -> Frame | None:
20
+ pass
pipecat/serializers/protobuf.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import dataclasses
8
+
9
+ import pipecat.frames.protobufs.frames_pb2 as frame_protos
10
+
11
+ from pipecat.frames.frames import AudioRawFrame, Frame, TextFrame, TranscriptionFrame
12
+ from pipecat.serializers.base_serializer import FrameSerializer
13
+
14
+ from loguru import logger
15
+
16
+
17
+ class ProtobufFrameSerializer(FrameSerializer):
18
+ SERIALIZABLE_TYPES = {
19
+ TextFrame: "text",
20
+ AudioRawFrame: "audio",
21
+ TranscriptionFrame: "transcription"
22
+ }
23
+
24
+ SERIALIZABLE_FIELDS = {v: k for k, v in SERIALIZABLE_TYPES.items()}
25
+
26
+ def __init__(self):
27
+ pass
28
+
29
+ def serialize(self, frame: Frame) -> str | bytes | None:
30
+ proto_frame = frame_protos.Frame()
31
+ if type(frame) not in self.SERIALIZABLE_TYPES:
32
+ raise ValueError(
33
+ f"Frame type {type(frame)} is not serializable. You may need to add it to ProtobufFrameSerializer.SERIALIZABLE_FIELDS.")
34
+
35
+ # ignoring linter errors; we check that type(frame) is in this dict above
36
+ proto_optional_name = self.SERIALIZABLE_TYPES[type(frame)] # type: ignore
37
+ for field in dataclasses.fields(frame): # type: ignore
38
+ setattr(getattr(proto_frame, proto_optional_name), field.name,
39
+ getattr(frame, field.name))
40
+
41
+ result = proto_frame.SerializeToString()
42
+ return result
43
+
44
+ def deserialize(self, data: str | bytes) -> Frame | None:
45
+ """Returns a Frame object from a Frame protobuf. Used to convert frames
46
+ passed over the wire as protobufs to Frame objects used in pipelines
47
+ and frame processors.
48
+
49
+ >>> serializer = ProtobufFrameSerializer()
50
+ >>> serializer.deserialize(
51
+ ... serializer.serialize(AudioFrame(data=b'1234567890')))
52
+ AudioFrame(data=b'1234567890')
53
+
54
+ >>> serializer.deserialize(
55
+ ... serializer.serialize(TextFrame(text='hello world')))
56
+ TextFrame(text='hello world')
57
+
58
+ >>> serializer.deserialize(serializer.serialize(TranscriptionFrame(
59
+ ... text="Hello there!", participantId="123", timestamp="2021-01-01")))
60
+ TranscriptionFrame(text='Hello there!', participantId='123', timestamp='2021-01-01')
61
+ """
62
+
63
+ proto = frame_protos.Frame.FromString(data)
64
+ which = proto.WhichOneof("frame")
65
+ if which not in self.SERIALIZABLE_FIELDS:
66
+ logger.error("Unable to deserialize a valid frame")
67
+ return None
68
+
69
+ class_name = self.SERIALIZABLE_FIELDS[which]
70
+ args = getattr(proto, which)
71
+ args_dict = {}
72
+ for field in proto.DESCRIPTOR.fields_by_name[which].message_type.fields:
73
+ args_dict[field.name] = getattr(args, field.name)
74
+
75
+ # Remove special fields if needed
76
+ id = getattr(args, "id")
77
+ name = getattr(args, "name")
78
+ if not id:
79
+ del args_dict["id"]
80
+ if not name:
81
+ del args_dict["name"]
82
+
83
+ # Create the instance
84
+ instance = class_name(**args_dict)
85
+
86
+ # Set special fields
87
+ if id:
88
+ setattr(instance, "id", getattr(args, "id"))
89
+ if name:
90
+ setattr(instance, "name", getattr(args, "name"))
91
+
92
+ return instance
pipecat/serializers/twilio.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import base64
8
+ import json
9
+
10
+ from pipecat.frames.frames import AudioRawFrame, Frame
11
+ from pipecat.serializers.base_serializer import FrameSerializer
12
+ from pipecat.utils.audio import ulaw_8000_to_pcm_16000, pcm_16000_to_ulaw_8000
13
+
14
+
15
+ class TwilioFrameSerializer(FrameSerializer):
16
+ SERIALIZABLE_TYPES = {
17
+ AudioRawFrame: "audio",
18
+ }
19
+
20
+ def __init__(self, stream_sid: str):
21
+ self._stream_sid = stream_sid
22
+
23
+ def serialize(self, frame: Frame) -> str | bytes | None:
24
+ if not isinstance(frame, AudioRawFrame):
25
+ return None
26
+
27
+ data = frame.audio
28
+
29
+ serialized_data = pcm_16000_to_ulaw_8000(data)
30
+ payload = base64.b64encode(serialized_data).decode("utf-8")
31
+ answer = {
32
+ "event": "media",
33
+ "streamSid": self._stream_sid,
34
+ "media": {
35
+ "payload": payload
36
+ }
37
+ }
38
+
39
+ return json.dumps(answer)
40
+
41
+ def deserialize(self, data: str | bytes) -> Frame | None:
42
+ message = json.loads(data)
43
+
44
+ if message["event"] != "media":
45
+ return None
46
+ else:
47
+ payload_base64 = message["media"]["payload"]
48
+ payload = base64.b64decode(payload_base64)
49
+
50
+ deserialized_data = ulaw_8000_to_pcm_16000(payload)
51
+ audio_frame = AudioRawFrame(audio=deserialized_data, num_channels=1, sample_rate=16000)
52
+ return audio_frame
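
Roughly how this serializer is used on a Twilio media-stream websocket (a sketch, not part of the uploaded file; the stream SID is a placeholder): serialize() wraps 16 kHz mono PCM as an 8 kHz u-law "media" event, and deserialize() does the reverse.

    from pipecat.frames.frames import AudioRawFrame
    from pipecat.serializers.twilio import TwilioFrameSerializer

    serializer = TwilioFrameSerializer(stream_sid="MZ0000000000000000000000000000000000")

    # Outbound: 10 ms of 16 kHz mono PCM silence becomes a JSON "media" event
    # carrying base64-encoded 8 kHz u-law audio, ready to send on the websocket.
    frame = AudioRawFrame(audio=b"\x00\x00" * 160, sample_rate=16000, num_channels=1)
    media_event = serializer.serialize(frame)

    # Inbound: a "media" event from Twilio becomes a 16 kHz mono AudioRawFrame.
    incoming = serializer.deserialize(media_event)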
pipecat/services/__init__.py ADDED
File without changes
pipecat/services/ai_services.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import io
8
+ import wave
9
+
10
+ from abc import abstractmethod
11
+ from typing import AsyncGenerator
12
+
13
+ from pipecat.frames.frames import (
14
+ AudioRawFrame,
15
+ CancelFrame,
16
+ EndFrame,
17
+ ErrorFrame,
18
+ Frame,
19
+ LLMFullResponseEndFrame,
20
+ StartFrame,
21
+ StartInterruptionFrame,
22
+ TTSStartedFrame,
23
+ TTSStoppedFrame,
24
+ TextFrame,
25
+ VisionImageRawFrame,
26
+ )
27
+ from pipecat.processors.async_frame_processor import AsyncFrameProcessor
28
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
29
+ from pipecat.utils.audio import calculate_audio_volume
30
+ from pipecat.utils.utils import exp_smoothing
31
+
32
+
33
+ class AIService(FrameProcessor):
34
+ def __init__(self, **kwargs):
35
+ super().__init__(**kwargs)
36
+
37
+ async def start(self, frame: StartFrame):
38
+ pass
39
+
40
+ async def stop(self, frame: EndFrame):
41
+ pass
42
+
43
+ async def cancel(self, frame: CancelFrame):
44
+ pass
45
+
46
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
47
+ await super().process_frame(frame, direction)
48
+
49
+ if isinstance(frame, StartFrame):
50
+ await self.start(frame)
51
+ elif isinstance(frame, CancelFrame):
52
+ await self.cancel(frame)
53
+ elif isinstance(frame, EndFrame):
54
+ await self.stop(frame)
55
+
56
+ async def process_generator(self, generator: AsyncGenerator[Frame, None]):
57
+ async for f in generator:
58
+ if isinstance(f, ErrorFrame):
59
+ await self.push_error(f)
60
+ else:
61
+ await self.push_frame(f)
62
+
63
+
64
+ class AsyncAIService(AsyncFrameProcessor):
65
+ def __init__(self, **kwargs):
66
+ super().__init__(**kwargs)
67
+
68
+ async def start(self, frame: StartFrame):
69
+ pass
70
+
71
+ async def stop(self, frame: EndFrame):
72
+ pass
73
+
74
+ async def cancel(self, frame: CancelFrame):
75
+ pass
76
+
77
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
78
+ await super().process_frame(frame, direction)
79
+
80
+ if isinstance(frame, StartFrame):
81
+ await self.start(frame)
82
+ elif isinstance(frame, CancelFrame):
83
+ await self.cancel(frame)
84
+ elif isinstance(frame, EndFrame):
85
+ await self.stop(frame)
86
+
87
+
88
+ class LLMService(AIService):
89
+ """This class is a no-op but serves as a base class for LLM services."""
90
+
91
+ def __init__(self, **kwargs):
92
+ super().__init__(**kwargs)
93
+ self._callbacks = {}
94
+ self._start_callbacks = {}
95
+
96
+ # TODO-CB: callback function type
97
+ def register_function(self, function_name: str, callback, start_callback=None):
98
+ self._callbacks[function_name] = callback
99
+ if start_callback:
100
+ self._start_callbacks[function_name] = start_callback
101
+
102
+ def unregister_function(self, function_name: str):
103
+ del self._callbacks[function_name]
104
+ if self._start_callbacks[function_name]:
105
+ del self._start_callbacks[function_name]
106
+
107
+ def has_function(self, function_name: str):
108
+ return function_name in self._callbacks.keys()
109
+
110
+ async def call_function(self, function_name: str, args):
111
+ if function_name in self._callbacks.keys():
112
+ return await self._callbacks[function_name](self, args)
113
+ return None
114
+
115
+ async def call_start_function(self, function_name: str):
116
+ if function_name in self._start_callbacks.keys():
117
+ await self._start_callbacks[function_name](self)
118
+
119
+
120
+ class TTSService(AIService):
121
+ def __init__(self, *, aggregate_sentences: bool = True, **kwargs):
122
+ super().__init__(**kwargs)
123
+ self._aggregate_sentences: bool = aggregate_sentences
124
+ self._current_sentence: str = ""
125
+
126
+ # Converts the text to audio.
127
+ @abstractmethod
128
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
129
+ pass
130
+
131
+ async def say(self, text: str):
132
+ await self.process_frame(TextFrame(text=text), FrameDirection.DOWNSTREAM)
133
+
134
+ async def _process_text_frame(self, frame: TextFrame):
135
+ text: str | None = None
136
+ if not self._aggregate_sentences:
137
+ text = frame.text
138
+ else:
139
+ self._current_sentence += frame.text
140
+ if self._current_sentence.strip().endswith(
141
+ (".", "?", "!")) and not self._current_sentence.strip().endswith(
142
+ ("Mr,", "Mrs.", "Ms.", "Dr.")):
143
+ text = self._current_sentence
144
+ self._current_sentence = ""
145
+
146
+ if text:
147
+ await self._push_tts_frames(text)
148
+
149
+ async def _push_tts_frames(self, text: str):
150
+ text = text.strip()
151
+ if not text:
152
+ return
153
+
154
+ await self.push_frame(TTSStartedFrame())
155
+ await self.start_processing_metrics()
156
+ await self.process_generator(self.run_tts(text))
157
+ await self.stop_processing_metrics()
158
+ await self.push_frame(TTSStoppedFrame())
159
+ # We send the original text after the audio. This way, if we are
160
+ # interrupted, the text is not added to the assistant context.
161
+ await self.push_frame(TextFrame(text))
162
+
163
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
164
+ await super().process_frame(frame, direction)
165
+
166
+ if isinstance(frame, TextFrame):
167
+ await self._process_text_frame(frame)
168
+ elif isinstance(frame, StartInterruptionFrame):
169
+ self._current_sentence = ""
170
+ await self.push_frame(frame, direction)
171
+ elif isinstance(frame, LLMFullResponseEndFrame) or isinstance(frame, EndFrame):
172
+ self._current_sentence = ""
173
+ await self._push_tts_frames(self._current_sentence)
174
+ await self.push_frame(frame)
175
+ else:
176
+ await self.push_frame(frame, direction)
177
+
178
+
179
+ class STTService(AIService):
180
+ """STTService is a base class for speech-to-text services."""
181
+
182
+ def __init__(self,
183
+ *,
184
+ min_volume: float = 0.6,
185
+ max_silence_secs: float = 0.3,
186
+ max_buffer_secs: float = 1.5,
187
+ sample_rate: int = 16000,
188
+ num_channels: int = 1,
189
+ **kwargs):
190
+ super().__init__(**kwargs)
191
+ self._min_volume = min_volume
192
+ self._max_silence_secs = max_silence_secs
193
+ self._max_buffer_secs = max_buffer_secs
194
+ self._sample_rate = sample_rate
195
+ self._num_channels = num_channels
196
+ (self._content, self._wave) = self._new_wave()
197
+ self._silence_num_frames = 0
198
+ # Volume exponential smoothing
199
+ self._smoothing_factor = 0.2
200
+ self._prev_volume = 0
201
+
202
+ @abstractmethod
203
+ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
204
+ """Returns transcript as a string"""
205
+ pass
206
+
207
+ def _new_wave(self):
208
+ content = io.BytesIO()
209
+ ww = wave.open(content, "wb")
210
+ ww.setsampwidth(2)
211
+ ww.setnchannels(self._num_channels)
212
+ ww.setframerate(self._sample_rate)
213
+ return (content, ww)
214
+
215
+ def _get_smoothed_volume(self, frame: AudioRawFrame) -> float:
216
+ volume = calculate_audio_volume(frame.audio, frame.sample_rate)
217
+ return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
218
+
219
+ async def _append_audio(self, frame: AudioRawFrame):
220
+ # Try to filter out empty background noise
221
+ volume = self._get_smoothed_volume(frame)
222
+ if volume >= self._min_volume:
223
+ # If volume is high enough, write new data to wave file
224
+ self._wave.writeframes(frame.audio)
225
+ self._silence_num_frames = 0
226
+ else:
227
+ self._silence_num_frames += frame.num_frames
228
+ self._prev_volume = volume
229
+
230
+ # If buffer is not empty and we have enough data or there's been a long
231
+ # silence, transcribe the audio gathered so far.
232
+ silence_secs = self._silence_num_frames / self._sample_rate
233
+ buffer_secs = self._wave.getnframes() / self._sample_rate
234
+ if self._content.tell() > 0 and (
235
+ buffer_secs > self._max_buffer_secs or silence_secs > self._max_silence_secs):
236
+ self._silence_num_frames = 0
237
+ self._wave.close()
238
+ self._content.seek(0)
239
+ await self.start_processing_metrics()
240
+ await self.process_generator(self.run_stt(self._content.read()))
241
+ await self.stop_processing_metrics()
242
+ (self._content, self._wave) = self._new_wave()
243
+
244
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
245
+ """Processes a frame of audio data, either buffering or transcribing it."""
246
+ await super().process_frame(frame, direction)
247
+
248
+ if isinstance(frame, CancelFrame) or isinstance(frame, EndFrame):
249
+ self._wave.close()
250
+ await self.push_frame(frame, direction)
251
+ elif isinstance(frame, AudioRawFrame):
252
+ # In this service we accumulate audio internally and at the end we
253
+ # push a TextFrame. We don't really want to push audio frames down.
254
+ await self._append_audio(frame)
255
+ else:
256
+ await self.push_frame(frame, direction)
257
+
258
+
259
+ class ImageGenService(AIService):
260
+
261
+ def __init__(self, **kwargs):
262
+ super().__init__(**kwargs)
263
+
264
+ # Renders the image. Returns an Image object.
265
+ @abstractmethod
266
+ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
267
+ pass
268
+
269
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
270
+ await super().process_frame(frame, direction)
271
+
272
+ if isinstance(frame, TextFrame):
273
+ await self.push_frame(frame, direction)
274
+ await self.start_processing_metrics()
275
+ await self.process_generator(self.run_image_gen(frame.text))
276
+ await self.stop_processing_metrics()
277
+ else:
278
+ await self.push_frame(frame, direction)
279
+
280
+
281
+ class VisionService(AIService):
282
+ """VisionService is a base class for vision services."""
283
+
284
+ def __init__(self, **kwargs):
285
+ super().__init__(**kwargs)
286
+ self._describe_text = None
287
+
288
+ @abstractmethod
289
+ async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
290
+ pass
291
+
292
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
293
+ await super().process_frame(frame, direction)
294
+
295
+ if isinstance(frame, VisionImageRawFrame):
296
+ await self.start_processing_metrics()
297
+ await self.process_generator(self.run_vision(frame))
298
+ await self.stop_processing_metrics()
299
+ else:
300
+ await self.push_frame(frame, direction)
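
TTSService above handles sentence aggregation, TTFB/processing metrics and the TTSStarted/TTSStopped frames; a concrete service only has to implement run_tts() as an async generator of audio frames. A toy sketch (not part of the uploaded file; it just emits silence):

    from typing import AsyncGenerator

    from pipecat.frames.frames import AudioRawFrame, Frame
    from pipecat.services.ai_services import TTSService


    class SilenceTTSService(TTSService):
        # Hypothetical toy service: "speaks" 20 ms of silence per character, to
        # show the run_tts() contract used by _push_tts_frames() above.
        def can_generate_metrics(self) -> bool:
            return True

        async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
            await self.start_ttfb_metrics()
            samples_per_char = 320  # 20 ms at 16 kHz, 16-bit mono
            audio = b"\x00\x00" * samples_per_char * len(text)
            await self.stop_ttfb_metrics()
            yield AudioRawFrame(audio=audio, sample_rate=16000, num_channels=1)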
pipecat/services/anthropic.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import base64
8
+
9
+ from pipecat.frames.frames import (
10
+ Frame,
11
+ TextFrame,
12
+ VisionImageRawFrame,
13
+ LLMMessagesFrame,
14
+ LLMFullResponseStartFrame,
15
+ LLMResponseStartFrame,
16
+ LLMResponseEndFrame,
17
+ LLMFullResponseEndFrame
18
+ )
19
+ from pipecat.processors.frame_processor import FrameDirection
20
+ from pipecat.services.ai_services import LLMService
21
+ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame
22
+
23
+ from loguru import logger
24
+
25
+ try:
26
+ from anthropic import AsyncAnthropic
27
+ except ModuleNotFoundError as e:
28
+ logger.error(f"Exception: {e}")
29
+ logger.error(
30
+ "In order to use Anthropic, you need to `pip install pipecat-ai[anthropic]`. Also, set `ANTHROPIC_API_KEY` environment variable.")
31
+ raise Exception(f"Missing module: {e}")
32
+
33
+
34
+ class AnthropicLLMService(LLMService):
35
+ """This class implements inference with Anthropic's AI models
36
+
37
+ This service translates internally from OpenAILLMContext to the messages format
38
+ expected by the Anthropic Python SDK. We are using the OpenAILLMContext as a lingua
39
+ franca for all LLM services, so that it is easy to switch between different LLMs.
40
+ """
41
+
42
+ def __init__(
43
+ self,
44
+ *,
45
+ api_key: str,
46
+ model: str = "claude-3-opus-20240229",
47
+ max_tokens: int = 1024):
48
+ super().__init__()
49
+ self._client = AsyncAnthropic(api_key=api_key)
50
+ self._model = model
51
+ self._max_tokens = max_tokens
52
+
53
+ def can_generate_metrics(self) -> bool:
54
+ return True
55
+
56
+ def _get_messages_from_openai_context(
57
+ self, context: OpenAILLMContext):
58
+ openai_messages = context.get_messages()
59
+ anthropic_messages = []
60
+
61
+ for message in openai_messages:
62
+ role = message["role"]
63
+ text = message["content"]
64
+ if role == "system":
65
+ role = "user"
66
+ if message.get("mime_type") == "image/jpeg":
67
+ # vision frame
68
+ encoded_image = base64.b64encode(message["data"].getvalue()).decode("utf-8")
69
+ anthropic_messages.append({
70
+ "role": role,
71
+ "content": [{
72
+ "type": "image",
73
+ "source": {
74
+ "type": "base64",
75
+ "media_type": message.get("mime_type"),
76
+ "data": encoded_image,
77
+ }
78
+ }, {
79
+ "type": "text",
80
+ "text": text
81
+ }]
82
+ })
83
+ else:
84
+ # Text frame. Anthropic needs the roles to alternate. This will
85
+ # cause an issue with interruptions. So, if we detect we are the
86
+ # ones asking again it probably means we were interrupted.
87
+ if role == "user" and len(anthropic_messages) > 1:
88
+ last_message = anthropic_messages[-1]
89
+ if last_message["role"] == "user":
90
+ anthropic_messages = anthropic_messages[:-1]
91
+ content = last_message["content"]
92
+ anthropic_messages.append(
93
+ {"role": "user", "content": f"Sorry, I just asked you about [{content}] but now I would like to know [{text}]."})
94
+ else:
95
+ anthropic_messages.append({"role": role, "content": text})
96
+ else:
97
+ anthropic_messages.append({"role": role, "content": text})
98
+
99
+ return anthropic_messages
100
+
101
+ async def _process_context(self, context: OpenAILLMContext):
102
+ await self.push_frame(LLMFullResponseStartFrame())
103
+ try:
104
+ logger.debug(f"Generating chat: {context.get_messages_json()}")
105
+
106
+ messages = self._get_messages_from_openai_context(context)
107
+
108
+ await self.start_ttfb_metrics()
109
+
110
+ response = await self._client.messages.create(
111
+ messages=messages,
112
+ model=self._model,
113
+ max_tokens=self._max_tokens,
114
+ stream=True)
115
+
116
+ await self.stop_ttfb_metrics()
117
+
118
+ async for event in response:
119
+ # logger.debug(f"Anthropic LLM event: {event}")
120
+ if (event.type == "content_block_delta"):
121
+ await self.push_frame(LLMResponseStartFrame())
122
+ await self.push_frame(TextFrame(event.delta.text))
123
+ await self.push_frame(LLMResponseEndFrame())
124
+
125
+ except Exception as e:
126
+ logger.exception(f"{self} exception: {e}")
127
+ finally:
128
+ await self.push_frame(LLMFullResponseEndFrame())
129
+
130
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
131
+ await super().process_frame(frame, direction)
132
+
133
+ context = None
134
+
135
+ if isinstance(frame, OpenAILLMContextFrame):
136
+ context: OpenAILLMContext = frame.context
137
+ elif isinstance(frame, LLMMessagesFrame):
138
+ context = OpenAILLMContext.from_messages(frame.messages)
139
+ elif isinstance(frame, VisionImageRawFrame):
140
+ context = OpenAILLMContext.from_image_frame(frame)
141
+ else:
142
+ await self.push_frame(frame, direction)
143
+
144
+ if context:
145
+ await self._process_context(context)
pipecat/services/azure.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import aiohttp
8
+ import asyncio
9
+ import io
10
+ import time
11
+
12
+ from PIL import Image
13
+ from typing import AsyncGenerator
14
+
15
+ from pipecat.frames.frames import (
16
+ AudioRawFrame,
17
+ CancelFrame,
18
+ EndFrame,
19
+ ErrorFrame,
20
+ Frame,
21
+ StartFrame,
22
+ StartInterruptionFrame,
23
+ SystemFrame,
24
+ TranscriptionFrame,
25
+ URLImageRawFrame)
26
+ from pipecat.processors.frame_processor import FrameDirection
27
+ from pipecat.services.ai_services import AIService, AsyncAIService, TTSService, ImageGenService
28
+ from pipecat.services.openai import BaseOpenAILLMService
29
+
30
+ from loguru import logger
31
+
32
+ # See .env.example for Azure configuration needed
33
+ try:
34
+ from openai import AsyncAzureOpenAI
35
+ from azure.cognitiveservices.speech import (
36
+ SpeechConfig,
37
+ SpeechRecognizer,
38
+ SpeechSynthesizer,
39
+ ResultReason,
40
+ CancellationReason,
41
+ )
42
+ from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream
43
+ from azure.cognitiveservices.speech.dialog import AudioConfig
44
+ except ModuleNotFoundError as e:
45
+ logger.error(f"Exception: {e}")
46
+ logger.error(
47
+ "In order to use Azure, you need to `pip install pipecat-ai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.")
48
+ raise Exception(f"Missing module: {e}")
49
+
50
+
51
+ class AzureLLMService(BaseOpenAILLMService):
52
+ def __init__(
53
+ self,
54
+ *,
55
+ api_key: str,
56
+ endpoint: str,
57
+ model: str,
58
+ api_version: str = "2023-12-01-preview"):
59
+ # Initialize variables before calling parent __init__() because that
60
+ # will call create_client() and we need those values there.
61
+ self._endpoint = endpoint
62
+ self._api_version = api_version
63
+ super().__init__(api_key=api_key, model=model)
64
+
65
+ def create_client(self, api_key=None, base_url=None, **kwargs):
66
+ return AsyncAzureOpenAI(
67
+ api_key=api_key,
68
+ azure_endpoint=self._endpoint,
69
+ api_version=self._api_version,
70
+ )
71
+
72
+
73
+ class AzureTTSService(TTSService):
74
+ def __init__(self, *, api_key: str, region: str, voice="en-US-SaraNeural", **kwargs):
75
+ super().__init__(**kwargs)
76
+
77
+ speech_config = SpeechConfig(subscription=api_key, region=region)
78
+ self._speech_synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None)
79
+
80
+ self._voice = voice
81
+
82
+ def can_generate_metrics(self) -> bool:
83
+ return True
84
+
85
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
86
+ logger.debug(f"Generating TTS: {text}")
87
+
88
+ await self.start_ttfb_metrics()
89
+
90
+ ssml = (
91
+ "<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' "
92
+ "xmlns:mstts='http://www.w3.org/2001/mstts'>"
93
+ f"<voice name='{self._voice}'>"
94
+ "<mstts:silence type='Sentenceboundary' value='20ms' />"
95
+ "<mstts:express-as style='lyrical' styledegree='2' role='SeniorFemale'>"
96
+ "<prosody rate='1.05'>"
97
+ f"{text}"
98
+ "</prosody></mstts:express-as></voice></speak> ")
99
+
100
+ result = await asyncio.to_thread(self._speech_synthesizer.speak_ssml, (ssml))
101
+
102
+ if result.reason == ResultReason.SynthesizingAudioCompleted:
103
+ await self.stop_ttfb_metrics()
104
+ # Azure always sends a 44-byte header. Strip it off.
105
+ yield AudioRawFrame(audio=result.audio_data[44:], sample_rate=16000, num_channels=1)
106
+ elif result.reason == ResultReason.Canceled:
107
+ cancellation_details = result.cancellation_details
108
+ logger.warning(f"Speech synthesis canceled: {cancellation_details.reason}")
109
+ if cancellation_details.reason == CancellationReason.Error:
110
+ logger.error(f"{self} error: {cancellation_details.error_details}")
111
+
112
+
113
+ class AzureSTTService(AsyncAIService):
114
+ def __init__(
115
+ self,
116
+ *,
117
+ api_key: str,
118
+ region: str,
119
+ language="en-US",
120
+ sample_rate=16000,
121
+ channels=1,
122
+ **kwargs):
123
+ super().__init__(**kwargs)
124
+
125
+ speech_config = SpeechConfig(subscription=api_key, region=region)
126
+ speech_config.speech_recognition_language = language
127
+
128
+ stream_format = AudioStreamFormat(samples_per_second=sample_rate, channels=channels)
129
+ self._audio_stream = PushAudioInputStream(stream_format)
130
+
131
+ audio_config = AudioConfig(stream=self._audio_stream)
132
+ self._speech_recognizer = SpeechRecognizer(
133
+ speech_config=speech_config, audio_config=audio_config)
134
+ self._speech_recognizer.recognized.connect(self._on_handle_recognized)
135
+
136
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
137
+ await super().process_frame(frame, direction)
138
+
139
+ if isinstance(frame, SystemFrame):
140
+ await self.push_frame(frame, direction)
141
+ elif isinstance(frame, AudioRawFrame):
142
+ self._audio_stream.write(frame.audio)
143
+ else:
144
+ await self._push_queue.put((frame, direction))
145
+
146
+ async def start(self, frame: StartFrame):
147
+ self._speech_recognizer.start_continuous_recognition_async()
148
+
149
+ async def stop(self, frame: EndFrame):
150
+ self._speech_recognizer.stop_continuous_recognition_async()
151
+
152
+ async def cancel(self, frame: CancelFrame):
153
+ self._speech_recognizer.stop_continuous_recognition_async()
154
+
155
+ def _on_handle_recognized(self, event):
156
+ if event.result.reason == ResultReason.RecognizedSpeech and len(event.result.text) > 0:
157
+ frame = TranscriptionFrame(event.result.text, "", int(time.time_ns() / 1000000))
158
+ asyncio.run_coroutine_threadsafe(self.queue_frame(frame), self.get_event_loop())
159
+
160
+
161
+ class AzureImageGenServiceREST(ImageGenService):
162
+
163
+ def __init__(
164
+ self,
165
+ *,
166
+ aiohttp_session: aiohttp.ClientSession,
167
+ image_size: str,
168
+ api_key: str,
169
+ endpoint: str,
170
+ model: str,
171
+ api_version="2023-06-01-preview",
172
+ ):
173
+ super().__init__()
174
+
175
+ self._api_key = api_key
176
+ self._azure_endpoint = endpoint
177
+ self._api_version = api_version
178
+ self._model = model
179
+ self._aiohttp_session = aiohttp_session
180
+ self._image_size = image_size
181
+
182
+ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
183
+ url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
184
+
185
+ headers = {
186
+ "api-key": self._api_key,
187
+ "Content-Type": "application/json"}
188
+
189
+ body = {
190
+ # Enter your prompt text here
191
+ "prompt": prompt,
192
+ "size": self._image_size,
193
+ "n": 1,
194
+ }
195
+
196
+ async with self._aiohttp_session.post(url, headers=headers, json=body) as submission:
197
+ # We never get past this line, because this header isn't
198
+ # defined on a 429 response, but something is eating our
199
+ # exceptions!
200
+ operation_location = submission.headers["operation-location"]
201
+ status = ""
202
+ attempts_left = 120
203
+ json_response = None
204
+ while status != "succeeded":
205
+ attempts_left -= 1
206
+ if attempts_left == 0:
207
+ logger.error(f"{self} error: image generation timed out")
208
+ yield ErrorFrame("Image generation timed out")
209
+ return
210
+
211
+ await asyncio.sleep(1)
212
+
213
+ response = await self._aiohttp_session.get(operation_location, headers=headers)
214
+
215
+ json_response = await response.json()
216
+ status = json_response["status"]
217
+
218
+ image_url = json_response["result"]["data"][0]["url"] if json_response else None
219
+ if not image_url:
220
+ logger.error(f"{self} error: image generation failed")
221
+ yield ErrorFrame("Image generation failed")
222
+ return
223
+
224
+ # Load the image from the url
225
+ async with self._aiohttp_session.get(image_url) as response:
226
+ image_stream = io.BytesIO(await response.content.read())
227
+ image = Image.open(image_stream)
228
+ frame = URLImageRawFrame(
229
+ url=image_url,
230
+ image=image.tobytes(),
231
+ size=image.size,
232
+ format=image.format)
233
+ yield frame
pipecat/services/cartesia.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from cartesia import AsyncCartesia
8
+
9
+ from typing import AsyncGenerator
10
+
11
+ from pipecat.frames.frames import AudioRawFrame, Frame
12
+ from pipecat.services.ai_services import TTSService
13
+
14
+ from loguru import logger
15
+
16
+
17
+ class CartesiaTTSService(TTSService):
18
+
19
+ def __init__(
20
+ self,
21
+ *,
22
+ api_key: str,
23
+ voice_id: str,
24
+ model_id: str = "sonic-english",
25
+ encoding: str = "pcm_s16le",
26
+ sample_rate: int = 16000,
27
+ **kwargs):
28
+ super().__init__(**kwargs)
29
+
30
+ self._api_key = api_key
31
+ self._model_id = model_id
32
+ self._output_format = {
33
+ "container": "raw",
34
+ "encoding": encoding,
35
+ "sample_rate": sample_rate,
36
+ }
37
+
38
+ try:
39
+ self._client = AsyncCartesia(api_key=self._api_key)
40
+ self._voice = self._client.voices.get(id=voice_id)
41
+ except Exception as e:
42
+ logger.exception(f"{self} initialization error: {e}")
43
+
44
+ def can_generate_metrics(self) -> bool:
45
+ return True
46
+
47
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
48
+ logger.debug(f"Generating TTS: [{text}]")
49
+
50
+ try:
51
+ await self.start_ttfb_metrics()
52
+
53
+ chunk_generator = await self._client.tts.sse(
54
+ stream=True,
55
+ transcript=text,
56
+ voice_embedding=self._voice["embedding"],
57
+ model_id=self._model_id,
58
+ output_format=self._output_format,
59
+ )
60
+
61
+ async for chunk in chunk_generator:
62
+ await self.stop_ttfb_metrics()
63
+ yield AudioRawFrame(chunk["audio"], self._output_format["sample_rate"], 1)
64
+ except Exception as e:
65
+ logger.exception(f"{self} exception: {e}")
pipecat/services/deepgram.py ADDED
@@ -0,0 +1,149 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import aiohttp
8
+ import time
9
+
10
+ from typing import AsyncGenerator
11
+
12
+ from pipecat.frames.frames import (
13
+ AudioRawFrame,
14
+ CancelFrame,
15
+ EndFrame,
16
+ ErrorFrame,
17
+ Frame,
18
+ InterimTranscriptionFrame,
19
+ StartFrame,
20
+ SystemFrame,
21
+ TranscriptionFrame)
22
+ from pipecat.processors.frame_processor import FrameDirection
23
+ from pipecat.services.ai_services import AsyncAIService, TTSService
24
+
25
+ from loguru import logger
26
+
27
+ # See .env.example for Deepgram configuration needed
28
+ try:
29
+ from deepgram import (
30
+ DeepgramClient,
31
+ DeepgramClientOptions,
32
+ LiveTranscriptionEvents,
33
+ LiveOptions,
34
+ )
35
+ except ModuleNotFoundError as e:
36
+ logger.error(f"Exception: {e}")
37
+ logger.error(
38
+ "In order to use Deepgram, you need to `pip install pipecat-ai[deepgram]`. Also, set `DEEPGRAM_API_KEY` environment variable.")
39
+ raise Exception(f"Missing module: {e}")
40
+
41
+
42
+ class DeepgramTTSService(TTSService):
43
+
44
+ def __init__(
45
+ self,
46
+ *,
47
+ aiohttp_session: aiohttp.ClientSession,
48
+ api_key: str,
49
+ voice: str = "aura-helios-en",
50
+ base_url: str = "https://api.deepgram.com/v1/speak",
51
+ **kwargs):
52
+ super().__init__(**kwargs)
53
+
54
+ self._voice = voice
55
+ self._api_key = api_key
56
+ self._aiohttp_session = aiohttp_session
57
+ self._base_url = base_url
58
+
59
+ def can_generate_metrics(self) -> bool:
60
+ return True
61
+
62
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
63
+ logger.debug(f"Generating TTS: [{text}]")
64
+
65
+ base_url = self._base_url
66
+ request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate=16000"
67
+ headers = {"authorization": f"token {self._api_key}"}
68
+ body = {"text": text}
69
+
70
+ try:
71
+ await self.start_ttfb_metrics()
72
+ async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r:
73
+ if r.status != 200:
74
+ response_text = await r.text()
75
+ # If we get a a "Bad Request: Input is unutterable", just print out a debug log.
76
+ # All other unsuccesful requests should emit an error frame. If not specifically
77
+ # handled by the running PipelineTask, the ErrorFrame will cancel the task.
78
+ if "unutterable" in response_text:
79
+ logger.debug(f"Unutterable text: [{text}]")
80
+ return
81
+
82
+ logger.error(
83
+ f"{self} error getting audio (status: {r.status}, error: {response_text})")
84
+ yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {response_text})")
85
+ return
86
+
87
+ async for data in r.content:
88
+ await self.stop_ttfb_metrics()
89
+ frame = AudioRawFrame(audio=data, sample_rate=16000, num_channels=1)
90
+ yield frame
91
+ except Exception as e:
92
+ logger.exception(f"{self} exception: {e}")
93
+
94
+
95
+ class DeepgramSTTService(AsyncAIService):
96
+ def __init__(self,
97
+ *,
98
+ api_key: str,
99
+ url: str = "",
100
+ live_options: LiveOptions = LiveOptions(
101
+ encoding="linear16",
102
+ language="en-US",
103
+ model="nova-2-conversationalai",
104
+ sample_rate=16000,
105
+ channels=1,
106
+ interim_results=True,
107
+ smart_format=True,
108
+ ),
109
+ **kwargs):
110
+ super().__init__(**kwargs)
111
+
112
+ self._live_options = live_options
113
+
114
+ self._client = DeepgramClient(
115
+ api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"}))
116
+ self._connection = self._client.listen.asynclive.v("1")
117
+ self._connection.on(LiveTranscriptionEvents.Transcript, self._on_message)
118
+
119
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
120
+ await super().process_frame(frame, direction)
121
+
122
+ if isinstance(frame, SystemFrame):
123
+ await self.push_frame(frame, direction)
124
+ elif isinstance(frame, AudioRawFrame):
125
+ await self._connection.send(frame.audio)
126
+ else:
127
+ await self.queue_frame(frame, direction)
128
+
129
+ async def start(self, frame: StartFrame):
130
+ if await self._connection.start(self._live_options):
131
+ logger.debug(f"{self}: Connected to Deepgram")
132
+ else:
133
+ logger.error(f"{self}: Unable to connect to Deepgram")
134
+
135
+ async def stop(self, frame: EndFrame):
136
+ await self._connection.finish()
137
+
138
+ async def cancel(self, frame: CancelFrame):
139
+ await self._connection.finish()
140
+
141
+ async def _on_message(self, *args, **kwargs):
142
+ result = kwargs["result"]
143
+ is_final = result.is_final
144
+ transcript = result.channel.alternatives[0].transcript
145
+ if len(transcript) > 0:
146
+ if is_final:
147
+ await self.queue_frame(TranscriptionFrame(transcript, "", int(time.time_ns() / 1000000)))
148
+ else:
149
+ await self.queue_frame(InterimTranscriptionFrame(transcript, "", int(time.time_ns() / 1000000)))
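
A construction-only sketch (not part of the upload) showing how the default `LiveOptions` above can be overridden, for example for a different language or model; the key and option values are illustrative.

```python
from deepgram import LiveOptions

from pipecat.services.deepgram import DeepgramSTTService

stt = DeepgramSTTService(
    api_key="YOUR_DEEPGRAM_API_KEY",  # placeholder
    live_options=LiveOptions(
        encoding="linear16",
        language="en-GB",
        model="nova-2-general",
        sample_rate=16000,
        channels=1,
        interim_results=True,
        smart_format=True,
    ),
)
```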
pipecat/services/elevenlabs.py ADDED
@@ -0,0 +1,66 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import aiohttp
8
+
9
+ from typing import AsyncGenerator
10
+
11
+ from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame
12
+ from pipecat.services.ai_services import TTSService
13
+
14
+ from loguru import logger
15
+
16
+
17
+ class ElevenLabsTTSService(TTSService):
18
+
19
+ def __init__(
20
+ self,
21
+ *,
22
+ aiohttp_session: aiohttp.ClientSession,
23
+ api_key: str,
24
+ voice_id: str,
25
+ model: str = "eleven_turbo_v2",
26
+ **kwargs):
27
+ super().__init__(**kwargs)
28
+
29
+ self._api_key = api_key
30
+ self._voice_id = voice_id
31
+ self._aiohttp_session = aiohttp_session
32
+ self._model = model
33
+
34
+ def can_generate_metrics(self) -> bool:
35
+ return True
36
+
37
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
38
+ logger.debug(f"Generating TTS: [{text}]")
39
+
40
+ url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
41
+
42
+ payload = {"text": text, "model_id": self._model}
43
+
44
+ querystring = {
45
+ "output_format": "pcm_16000",
46
+ "optimize_streaming_latency": 2}
47
+
48
+ headers = {
49
+ "xi-api-key": self._api_key,
50
+ "Content-Type": "application/json",
51
+ }
52
+
53
+ await self.start_ttfb_metrics()
54
+
55
+ async with self._aiohttp_session.post(url, json=payload, headers=headers, params=querystring) as r:
56
+ if r.status != 200:
57
+ text = await r.text()
58
+ logger.error(f"{self} error getting audio (status: {r.status}, error: {text})")
59
+ yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {text})")
60
+ return
61
+
62
+ async for chunk in r.content:
63
+ if len(chunk) > 0:
64
+ await self.stop_ttfb_metrics()
65
+ frame = AudioRawFrame(chunk, 16000, 1)
66
+ yield frame
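
The service requests `pcm_16000`, so each yielded `AudioRawFrame` carries 16-bit little-endian mono samples at 16 kHz. A sketch (not part of the upload) that collects the frames and writes them to a playable WAV file using only the standard library; the API key and voice id are placeholders.

```python
import asyncio
import os
import wave

import aiohttp

from pipecat.frames.frames import AudioRawFrame
from pipecat.services.elevenlabs import ElevenLabsTTSService


async def main():
    async with aiohttp.ClientSession() as session:
        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.environ["ELEVENLABS_API_KEY"],
            voice_id="YOUR_VOICE_ID",  # placeholder
        )
        pcm = bytearray()
        async for frame in tts.run_tts("Testing ElevenLabs."):
            if isinstance(frame, AudioRawFrame):
                pcm.extend(frame.audio)

    # pcm_16000 output: 16-bit little-endian, mono, 16 kHz.
    with wave.open("elevenlabs.wav", "wb") as f:
        f.setnchannels(1)
        f.setsampwidth(2)
        f.setframerate(16000)
        f.writeframes(bytes(pcm))


if __name__ == "__main__":
    asyncio.run(main())
```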
pipecat/services/fal.py ADDED
@@ -0,0 +1,83 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import aiohttp
8
+ import io
9
+ import os
10
+
11
+ from PIL import Image
12
+ from pydantic import BaseModel
13
+ from typing import AsyncGenerator, Optional, Union, Dict
14
+
15
+ from pipecat.frames.frames import ErrorFrame, Frame, URLImageRawFrame
16
+ from pipecat.services.ai_services import ImageGenService
17
+
18
+ from loguru import logger
19
+
20
+ try:
21
+ import fal_client
22
+ except ModuleNotFoundError as e:
23
+ logger.error(f"Exception: {e}")
24
+ logger.error(
25
+ "In order to use Fal, you need to `pip install pipecat-ai[fal]`. Also, set `FAL_KEY` environment variable.")
26
+ raise Exception(f"Missing module: {e}")
27
+
28
+
29
+ class FalImageGenService(ImageGenService):
30
+ class InputParams(BaseModel):
31
+ seed: Optional[int] = None
32
+ num_inference_steps: int = 8
33
+ num_images: int = 1
34
+ image_size: Union[str, Dict[str, int]] = "square_hd"
35
+ expand_prompt: bool = False
36
+ enable_safety_checker: bool = True
37
+ format: str = "png"
38
+
39
+ def __init__(
40
+ self,
41
+ *,
42
+ aiohttp_session: aiohttp.ClientSession,
43
+ params: InputParams,
44
+ model: str = "fal-ai/fast-sdxl",
45
+ key: str | None = None,
46
+ ):
47
+ super().__init__()
48
+ self._model = model
49
+ self._params = params
50
+ self._aiohttp_session = aiohttp_session
51
+ if key:
52
+ os.environ["FAL_KEY"] = key
53
+
54
+ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
55
+ logger.debug(f"Generating image from prompt: {prompt}")
56
+
57
+ response = await fal_client.run_async(
58
+ self._model,
59
+ arguments={"prompt": prompt, **self._params.model_dump()}
60
+ )
61
+
62
+ image_url = response["images"][0]["url"] if response else None
63
+
64
+ if not image_url:
65
+ logger.error(f"{self} error: image generation failed")
66
+ yield ErrorFrame("Image generation failed")
67
+ return
68
+
69
+ logger.debug(f"Image generated at: {image_url}")
70
+
71
+ # Load the image from the url
72
+ logger.debug(f"Downloading image {image_url} ...")
73
+ async with self._aiohttp_session.get(image_url) as response:
74
+ logger.debug(f"Downloaded image {image_url}")
75
+ image_stream = io.BytesIO(await response.content.read())
76
+ image = Image.open(image_stream)
77
+
78
+ frame = URLImageRawFrame(
79
+ url=image_url,
80
+ image=image.tobytes(),
81
+ size=image.size,
82
+ format=image.format)
83
+ yield frame
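
A usage sketch (not part of the upload) that builds the nested `InputParams`, runs a single generation, and prints the resulting image metadata; the prompt and environment variable are placeholders.

```python
import asyncio
import os

import aiohttp

from pipecat.frames.frames import URLImageRawFrame
from pipecat.services.fal import FalImageGenService


async def main():
    async with aiohttp.ClientSession() as session:
        imagegen = FalImageGenService(
            aiohttp_session=session,
            params=FalImageGenService.InputParams(image_size="square_hd"),
            key=os.environ["FAL_KEY"],
        )
        async for frame in imagegen.run_image_gen("a watercolor painting of a lighthouse"):
            if isinstance(frame, URLImageRawFrame):
                print(f"{frame.url}: {frame.size} {frame.format}")


if __name__ == "__main__":
    asyncio.run(main())
```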
pipecat/services/fireworks.py ADDED
@@ -0,0 +1,25 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from pipecat.services.openai import BaseOpenAILLMService
8
+
9
+ from loguru import logger
10
+
11
+ try:
12
+ from openai import AsyncOpenAI
13
+ except ModuleNotFoundError as e:
14
+ logger.error(f"Exception: {e}")
15
+ logger.error(
16
+ "In order to use Fireworks, you need to `pip install pipecat-ai[fireworks]`. Also, set the `FIREWORKS_API_KEY` environment variable.")
17
+ raise Exception(f"Missing module: {e}")
18
+
19
+
20
+ class FireworksLLMService(BaseOpenAILLMService):
21
+ def __init__(self,
22
+ *,
23
+ model: str = "accounts/fireworks/models/firefunction-v1",
24
+ base_url: str = "https://api.fireworks.ai/inference/v1"):
25
+ super().__init__(model=model, base_url=base_url)
pipecat/services/google.py ADDED
@@ -0,0 +1,129 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import asyncio
8
+
9
+ from typing import List
10
+
11
+ from pipecat.frames.frames import (
12
+ Frame,
13
+ TextFrame,
14
+ VisionImageRawFrame,
15
+ LLMMessagesFrame,
16
+ LLMFullResponseStartFrame,
17
+ LLMResponseStartFrame,
18
+ LLMResponseEndFrame,
19
+ LLMFullResponseEndFrame
20
+ )
21
+ from pipecat.processors.frame_processor import FrameDirection
22
+ from pipecat.services.ai_services import LLMService
23
+ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame
24
+
25
+ from loguru import logger
26
+
27
+ try:
28
+ import google.generativeai as gai
29
+ import google.ai.generativelanguage as glm
30
+ except ModuleNotFoundError as e:
31
+ logger.error(f"Exception: {e}")
32
+ logger.error(
33
+ "In order to use Google AI, you need to `pip install pipecat-ai[google]`. Also, set `GOOGLE_API_KEY` environment variable.")
34
+ raise Exception(f"Missing module: {e}")
35
+
36
+
37
+ class GoogleLLMService(LLMService):
38
+ """This class implements inference with Google's AI models
39
+
40
+ This service translates internally from OpenAILLMContext to the messages format
41
+ expected by the Google AI model. We are using the OpenAILLMContext as a lingua
42
+ franca for all LLM services, so that it is easy to switch between different LLMs.
43
+ """
44
+
45
+ def __init__(self, *, api_key: str, model: str = "gemini-1.5-flash-latest", **kwargs):
46
+ super().__init__(**kwargs)
47
+ gai.configure(api_key=api_key)
48
+ self._client = gai.GenerativeModel(model)
49
+
50
+ def can_generate_metrics(self) -> bool:
51
+ return True
52
+
53
+ def _get_messages_from_openai_context(
54
+ self, context: OpenAILLMContext) -> List[glm.Content]:
55
+ openai_messages = context.get_messages()
56
+ google_messages = []
57
+
58
+ for message in openai_messages:
59
+ role = message["role"]
60
+ content = message["content"]
61
+ if role == "system":
62
+ role = "user"
63
+ elif role == "assistant":
64
+ role = "model"
65
+
66
+ parts = [glm.Part(text=content)]
67
+ if "mime_type" in message:
68
+ parts.append(
69
+ glm.Part(inline_data=glm.Blob(
70
+ mime_type=message["mime_type"],
71
+ data=message["data"].getvalue()
72
+ )))
73
+ google_messages.append({"role": role, "parts": parts})
74
+
75
+ return google_messages
76
+
77
+ async def _async_generator_wrapper(self, sync_generator):
78
+ for item in sync_generator:
79
+ yield item
80
+ await asyncio.sleep(0)
81
+
82
+ async def _process_context(self, context: OpenAILLMContext):
83
+ await self.push_frame(LLMFullResponseStartFrame())
84
+ try:
85
+ logger.debug(f"Generating chat: {context.get_messages_json()}")
86
+
87
+ messages = self._get_messages_from_openai_context(context)
88
+
89
+ await self.start_ttfb_metrics()
90
+
91
+ response = self._client.generate_content(messages, stream=True)
92
+
93
+ await self.stop_ttfb_metrics()
94
+
95
+ async for chunk in self._async_generator_wrapper(response):
96
+ try:
97
+ text = chunk.text
98
+ await self.push_frame(LLMResponseStartFrame())
99
+ await self.push_frame(TextFrame(text))
100
+ await self.push_frame(LLMResponseEndFrame())
101
+ except Exception as e:
102
+ # Google LLMs seem to flag safety issues a lot!
103
+ if chunk.candidates[0].finish_reason == 3:
104
+ logger.debug(
105
+ f"LLM refused to generate content for safety reasons - {messages}.")
106
+ else:
107
+ logger.exception(f"{self} error: {e}")
108
+
109
+ except Exception as e:
110
+ logger.exception(f"{self} exception: {e}")
111
+ finally:
112
+ await self.push_frame(LLMFullResponseEndFrame())
113
+
114
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
115
+ await super().process_frame(frame, direction)
116
+
117
+ context = None
118
+
119
+ if isinstance(frame, OpenAILLMContextFrame):
120
+ context: OpenAILLMContext = frame.context
121
+ elif isinstance(frame, LLMMessagesFrame):
122
+ context = OpenAILLMContext.from_messages(frame.messages)
123
+ elif isinstance(frame, VisionImageRawFrame):
124
+ context = OpenAILLMContext.from_image_frame(frame)
125
+ else:
126
+ await self.push_frame(frame, direction)
127
+
128
+ if context:
129
+ await self._process_context(context)
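
An illustration (not part of the upload) of the role mapping performed by `_get_messages_from_openai_context()` above: "system" is folded into "user" and "assistant" becomes "model", with each content string wrapped in a `glm.Part`. The message texts are made up for the example.

```python
openai_messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "What is pipecat?"},
    {"role": "assistant", "content": "A framework for real-time AI pipelines."},
]

# After conversion the Google-style list looks like:
# [
#   {"role": "user",  "parts": [glm.Part(text="You are a terse assistant.")]},
#   {"role": "user",  "parts": [glm.Part(text="What is pipecat?")]},
#   {"role": "model", "parts": [glm.Part(text="A framework for real-time AI pipelines.")]},
# ]
```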
pipecat/services/moondream.py ADDED
@@ -0,0 +1,92 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import asyncio
8
+
9
+ from PIL import Image
10
+
11
+ from typing import AsyncGenerator
12
+
13
+ from pipecat.frames.frames import ErrorFrame, Frame, TextFrame, VisionImageRawFrame
14
+ from pipecat.services.ai_services import VisionService
15
+
16
+ from loguru import logger
17
+
18
+ try:
19
+ import torch
20
+
21
+ from transformers import AutoModelForCausalLM, AutoTokenizer
22
+ except ModuleNotFoundError as e:
23
+ logger.error(f"Exception: {e}")
24
+ logger.error("In order to use Moondream, you need to `pip install pipecat-ai[moondream]`.")
25
+ raise Exception(f"Missing module(s): {e}")
26
+
27
+
28
+ def detect_device():
29
+ """
30
+ Detects the appropriate device to run on, and returns the device and dtype.
31
+ """
32
+ try:
33
+ import intel_extension_for_pytorch
34
+ if torch.xpu.is_available():
35
+ return torch.device("xpu"), torch.float32
36
+ except ImportError:
37
+ pass
38
+ if torch.cuda.is_available():
39
+ return torch.device("cuda"), torch.float16
40
+ elif torch.backends.mps.is_available():
41
+ return torch.device("mps"), torch.float16
42
+ else:
43
+ return torch.device("cpu"), torch.float32
44
+
45
+
46
+ class MoondreamService(VisionService):
47
+ def __init__(
48
+ self,
49
+ *,
50
+ model="vikhyatk/moondream2",
51
+ revision="2024-04-02",
52
+ use_cpu=False
53
+ ):
54
+ super().__init__()
55
+
56
+ if not use_cpu:
57
+ device, dtype = detect_device()
58
+ else:
59
+ device = torch.device("cpu")
60
+ dtype = torch.float32
61
+
62
+ self._tokenizer = AutoTokenizer.from_pretrained(model, revision=revision)
63
+
64
+ logger.debug("Loading Moondream model...")
65
+
66
+ self._model = AutoModelForCausalLM.from_pretrained(
67
+ model, trust_remote_code=True, revision=revision
68
+ ).to(device=device, dtype=dtype)
69
+ self._model.eval()
70
+
71
+ logger.debug("Loaded Moondream model")
72
+
73
+ async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
74
+ if not self._model:
75
+ logger.error(f"{self} error: Moondream model not available")
76
+ yield ErrorFrame("Moondream model not available")
77
+ return
78
+
79
+ logger.debug(f"Analyzing image: {frame}")
80
+
81
+ def get_image_description(frame: VisionImageRawFrame):
82
+ image = Image.frombytes(frame.format, frame.size, frame.image)
83
+ image_embeds = self._model.encode_image(image)
84
+ description = self._model.answer_question(
85
+ image_embeds=image_embeds,
86
+ question=frame.text,
87
+ tokenizer=self._tokenizer)
88
+ return description
89
+
90
+ description = await asyncio.to_thread(get_image_description, frame)
91
+
92
+ yield TextFrame(text=description)
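
A usage sketch (not part of the upload) that feeds a local image through `run_vision()`. The `VisionImageRawFrame` constructor is not shown in this diff, so the keyword arguments below are an assumption that mirrors the attributes the method reads; note that `Image.frombytes()` expects a mode string such as "RGB", which is why `image.mode` is passed as the format.

```python
import asyncio

from PIL import Image

from pipecat.frames.frames import TextFrame, VisionImageRawFrame
from pipecat.services.moondream import MoondreamService


async def main():
    service = MoondreamService()  # downloads the model weights on first use

    image = Image.open("photo.jpg").convert("RGB")
    # Assumption: VisionImageRawFrame accepts these fields as keyword arguments.
    frame = VisionImageRawFrame(
        text="What is in this picture?",
        image=image.tobytes(),
        size=image.size,
        format=image.mode,
    )
    async for out in service.run_vision(frame):
        if isinstance(out, TextFrame):
            print(out.text)


if __name__ == "__main__":
    asyncio.run(main())
```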
pipecat/services/ollama.py ADDED
@@ -0,0 +1,13 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from pipecat.services.openai import BaseOpenAILLMService
8
+
9
+
10
+ class OLLamaLLMService(BaseOpenAILLMService):
11
+
12
+ def __init__(self, *, model: str = "llama2", base_url: str = "http://localhost:11434/v1"):
13
+ super().__init__(model=model, base_url=base_url, api_key="ollama")
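
A construction sketch (not part of the upload): the service speaks to a local Ollama server through its OpenAI-compatible endpoint, so any model already pulled with `ollama pull <name>` can be used; the model name here is illustrative.

```python
from pipecat.services.ollama import OLLamaLLMService

llm = OLLamaLLMService(model="llama3", base_url="http://localhost:11434/v1")
```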
pipecat/services/openai.py ADDED
@@ -0,0 +1,338 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import aiohttp
8
+ import base64
9
+ import io
10
+ import json
11
+
12
+ from typing import AsyncGenerator, List, Literal
13
+
14
+ from loguru import logger
15
+ from PIL import Image
16
+
17
+ from pipecat.frames.frames import (
18
+ AudioRawFrame,
19
+ ErrorFrame,
20
+ Frame,
21
+ LLMFullResponseEndFrame,
22
+ LLMFullResponseStartFrame,
23
+ LLMMessagesFrame,
24
+ LLMResponseEndFrame,
25
+ LLMResponseStartFrame,
26
+ TextFrame,
27
+ URLImageRawFrame,
28
+ VisionImageRawFrame
29
+ )
30
+ from pipecat.processors.aggregators.openai_llm_context import (
31
+ OpenAILLMContext,
32
+ OpenAILLMContextFrame
33
+ )
34
+ from pipecat.processors.frame_processor import FrameDirection
35
+ from pipecat.services.ai_services import (
36
+ ImageGenService,
37
+ LLMService,
38
+ TTSService
39
+ )
40
+
41
+ try:
42
+ from openai import AsyncOpenAI, AsyncStream, BadRequestError
43
+ from openai.types.chat import (
44
+ ChatCompletionChunk,
45
+ ChatCompletionFunctionMessageParam,
46
+ ChatCompletionMessageParam,
47
+ ChatCompletionToolParam
48
+ )
49
+ except ModuleNotFoundError as e:
50
+ logger.error(f"Exception: {e}")
51
+ logger.error(
52
+ "In order to use OpenAI, you need to `pip install pipecat-ai[openai]`. Also, set `OPENAI_API_KEY` environment variable.")
53
+ raise Exception(f"Missing module: {e}")
54
+
55
+
56
+ class OpenAIUnhandledFunctionException(Exception):
57
+ pass
58
+
59
+
60
+ class BaseOpenAILLMService(LLMService):
61
+ """This is the base for all services that use the AsyncOpenAI client.
62
+
63
+ This service consumes OpenAILLMContextFrame frames, which contain a reference
64
+ to an OpenAILLMContext frame. The OpenAILLMContext object defines the context
65
+ sent to the LLM for a completion. This includes user, assistant and system messages
66
+ as well as tool choices and the tool, which is used if requesting function
67
+ calls from the LLM.
68
+ """
69
+
70
+ def __init__(self, *, model: str, api_key=None, base_url=None, **kwargs):
71
+ super().__init__(**kwargs)
72
+ self._model: str = model
73
+ self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs)
74
+
75
+ def create_client(self, api_key=None, base_url=None, **kwargs):
76
+ return AsyncOpenAI(api_key=api_key, base_url=base_url)
77
+
78
+ def can_generate_metrics(self) -> bool:
79
+ return True
80
+
81
+ async def get_chat_completions(
82
+ self,
83
+ context: OpenAILLMContext,
84
+ messages: List[ChatCompletionMessageParam]) -> AsyncStream[ChatCompletionChunk]:
85
+ chunks = await self._client.chat.completions.create(
86
+ model=self._model,
87
+ stream=True,
88
+ messages=messages,
89
+ tools=context.tools,
90
+ tool_choice=context.tool_choice,
91
+ )
92
+ return chunks
93
+
94
+ async def _stream_chat_completions(
95
+ self, context: OpenAILLMContext) -> AsyncStream[ChatCompletionChunk]:
96
+ logger.debug(f"Generating chat: {context.get_messages_json()}")
97
+
98
+ messages: List[ChatCompletionMessageParam] = context.get_messages()
99
+
100
+ # base64 encode any images
101
+ for message in messages:
102
+ if message.get("mime_type") == "image/jpeg":
103
+ encoded_image = base64.b64encode(message["data"].getvalue()).decode("utf-8")
104
+ text = message["content"]
105
+ message["content"] = [
106
+ {"type": "text", "text": text},
107
+ {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}
108
+ ]
109
+ del message["data"]
110
+ del message["mime_type"]
111
+
112
+ chunks = await self.get_chat_completions(context, messages)
113
+
114
+ return chunks
115
+
116
+ async def _process_context(self, context: OpenAILLMContext):
117
+ function_name = ""
118
+ arguments = ""
119
+ tool_call_id = ""
120
+
121
+ await self.start_ttfb_metrics()
122
+
123
+ chunk_stream: AsyncStream[ChatCompletionChunk] = (
124
+ await self._stream_chat_completions(context)
125
+ )
126
+
127
+ async for chunk in chunk_stream:
128
+ if len(chunk.choices) == 0:
129
+ continue
130
+
131
+ await self.stop_ttfb_metrics()
132
+
133
+ if chunk.choices[0].delta.tool_calls:
134
+ # We're streaming the LLM response to enable the fastest response times.
135
+ # For text, we just yield each chunk as we receive it and count on consumers
136
+ # to do whatever coalescing they need (eg. to pass full sentences to TTS)
137
+ #
138
+ # If the LLM is a function call, we'll do some coalescing here.
139
+ # If the response contains a function name, we'll yield a frame to tell consumers
140
+ # that they can start preparing to call the function with that name.
141
+ # We accumulate all the arguments for the rest of the streamed response, then when
142
+ # the response is done, we package up all the arguments and the function name and
143
+ # yield a frame containing the function name and the arguments.
144
+
145
+ tool_call = chunk.choices[0].delta.tool_calls[0]
146
+ if tool_call.function and tool_call.function.name:
147
+ function_name += tool_call.function.name
148
+ tool_call_id = tool_call.id
149
+ await self.call_start_function(function_name)
150
+ if tool_call.function and tool_call.function.arguments:
151
+ # Keep iterating through the response to collect all the argument fragments
152
+ arguments += tool_call.function.arguments
153
+ elif chunk.choices[0].delta.content:
154
+ await self.push_frame(LLMResponseStartFrame())
155
+ await self.push_frame(TextFrame(chunk.choices[0].delta.content))
156
+ await self.push_frame(LLMResponseEndFrame())
157
+
158
+ # if we got a function name and arguments, check to see if it's a function with
159
+ # a registered handler. If so, run the registered callback, save the result to
160
+ # the context, and re-prompt to get a chat answer. If we don't have a registered
161
+ # handler, raise an exception.
162
+ if function_name and arguments:
163
+ if self.has_function(function_name):
164
+ await self._handle_function_call(context, tool_call_id, function_name, arguments)
165
+ else:
166
+ raise OpenAIUnhandledFunctionException(
167
+ f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function.")
168
+
169
+ async def _handle_function_call(
170
+ self,
171
+ context,
172
+ tool_call_id,
173
+ function_name,
174
+ arguments
175
+ ):
176
+ arguments = json.loads(arguments)
177
+ result = await self.call_function(function_name, arguments)
178
+ arguments = json.dumps(arguments)
179
+ if isinstance(result, (str, dict)):
180
+ # Handle it in "full magic mode"
181
+ tool_call = ChatCompletionFunctionMessageParam({
182
+ "role": "assistant",
183
+ "tool_calls": [
184
+ {
185
+ "id": tool_call_id,
186
+ "function": {
187
+ "arguments": arguments,
188
+ "name": function_name
189
+ },
190
+ "type": "function"
191
+ }
192
+ ]
193
+
194
+ })
195
+ context.add_message(tool_call)
196
+ if isinstance(result, dict):
197
+ result = json.dumps(result)
198
+ tool_result = ChatCompletionToolParam({
199
+ "tool_call_id": tool_call_id,
200
+ "role": "tool",
201
+ "content": result
202
+ })
203
+ context.add_message(tool_result)
204
+ # re-prompt to get a human answer
205
+ await self._process_context(context)
206
+ elif isinstance(result, list):
207
+ # reduced magic
208
+ for msg in result:
209
+ context.add_message(msg)
210
+ await self._process_context(context)
211
+ elif isinstance(result, type(None)):
212
+ pass
213
+ else:
214
+ raise TypeError(f"Unknown return type from function callback: {type(result)}")
215
+
216
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
217
+ await super().process_frame(frame, direction)
218
+
219
+ context = None
220
+ if isinstance(frame, OpenAILLMContextFrame):
221
+ context: OpenAILLMContext = frame.context
222
+ elif isinstance(frame, LLMMessagesFrame):
223
+ context = OpenAILLMContext.from_messages(frame.messages)
224
+ elif isinstance(frame, VisionImageRawFrame):
225
+ context = OpenAILLMContext.from_image_frame(frame)
226
+ else:
227
+ await self.push_frame(frame, direction)
228
+
229
+ if context:
230
+ await self.push_frame(LLMFullResponseStartFrame())
231
+ await self.start_processing_metrics()
232
+ await self._process_context(context)
233
+ await self.stop_processing_metrics()
234
+ await self.push_frame(LLMFullResponseEndFrame())
235
+
236
+
237
+ class OpenAILLMService(BaseOpenAILLMService):
238
+
239
+ def __init__(self, *, model: str = "gpt-4o", **kwargs):
240
+ super().__init__(model=model, **kwargs)
241
+
242
+
243
+ class OpenAIImageGenService(ImageGenService):
244
+
245
+ def __init__(
246
+ self,
247
+ *,
248
+ image_size: Literal["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"],
249
+ aiohttp_session: aiohttp.ClientSession,
250
+ api_key: str,
251
+ model: str = "dall-e-3",
252
+ ):
253
+ super().__init__()
254
+ self._model = model
255
+ self._image_size = image_size
256
+ self._client = AsyncOpenAI(api_key=api_key)
257
+ self._aiohttp_session = aiohttp_session
258
+
259
+ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
260
+ logger.debug(f"Generating image from prompt: {prompt}")
261
+
262
+ image = await self._client.images.generate(
263
+ prompt=prompt,
264
+ model=self._model,
265
+ n=1,
266
+ size=self._image_size
267
+ )
268
+
269
+ image_url = image.data[0].url
270
+
271
+ if not image_url:
272
+ logger.error(f"{self} No image provided in response: {image}")
273
+ yield ErrorFrame("Image generation failed")
274
+ return
275
+
276
+ # Load the image from the url
277
+ async with self._aiohttp_session.get(image_url) as response:
278
+ image_stream = io.BytesIO(await response.content.read())
279
+ image = Image.open(image_stream)
280
+ frame = URLImageRawFrame(image_url, image.tobytes(), image.size, image.format)
281
+ yield frame
282
+
283
+
284
+ class OpenAITTSService(TTSService):
285
+ """This service uses the OpenAI TTS API to generate audio from text.
286
+ The returned audio is PCM encoded at 24kHz. When using the DailyTransport, set the sample rate in the DailyParams accordingly:
287
+ ```
288
+ DailyParams(
289
+ audio_out_enabled=True,
290
+ audio_out_sample_rate=24_000,
291
+ )
292
+ ```
293
+ """
294
+
295
+ def __init__(
296
+ self,
297
+ *,
298
+ api_key: str | None = None,
299
+ base_url: str | None = None,
300
+ sample_rate: int = 24_000,
301
+ voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"] = "alloy",
302
+ model: Literal["tts-1", "tts-1-hd"] = "tts-1",
303
+ **kwargs):
304
+ super().__init__(**kwargs)
305
+
306
+ self._voice = voice
307
+ self._model = model
308
+ self.sample_rate = sample_rate
309
+ self._client = AsyncOpenAI(api_key=api_key, base_url=base_url)
310
+
311
+ def can_generate_metrics(self) -> bool:
312
+ return True
313
+
314
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
315
+ logger.debug(f"Generating TTS: [{text}]")
316
+
317
+ try:
318
+ await self.start_ttfb_metrics()
319
+
320
+ async with self._client.audio.speech.with_streaming_response.create(
321
+ input=text,
322
+ model=self._model,
323
+ voice=self._voice,
324
+ response_format="pcm",
325
+ ) as r:
326
+ if r.status_code != 200:
327
+ error = await r.text()
328
+ logger.error(
329
+ f"{self} error getting audio (status: {r.status_code}, error: {error})")
330
+ yield ErrorFrame(f"Error getting audio (status: {r.status_code}, error: {error})")
331
+ return
332
+ async for chunk in r.iter_bytes(8192):
333
+ if len(chunk) > 0:
334
+ await self.stop_ttfb_metrics()
335
+ frame = AudioRawFrame(chunk, self.sample_rate, 1)
336
+ yield frame
337
+ except BadRequestError as e:
338
+ logger.exception(f"{self} error generating TTS: {e}")
pipecat/services/openpipe.py ADDED
@@ -0,0 +1,71 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from typing import Dict, List
8
+
9
+ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
10
+ from pipecat.services.openai import BaseOpenAILLMService
11
+
12
+ from loguru import logger
13
+
14
+ try:
15
+ from openpipe import AsyncOpenAI as OpenPipeAI, AsyncStream
16
+ from openai.types.chat import (ChatCompletionMessageParam, ChatCompletionChunk)
17
+ except ModuleNotFoundError as e:
18
+ logger.error(f"Exception: {e}")
19
+ logger.error(
20
+ "In order to use OpenPipe, you need to `pip install pipecat-ai[openpipe]`. Also, set `OPENPIPE_API_KEY` and `OPENAI_API_KEY` environment variables.")
21
+ raise Exception(f"Missing module: {e}")
22
+
23
+
24
+ class OpenPipeLLMService(BaseOpenAILLMService):
25
+
26
+ def __init__(
27
+ self,
28
+ *,
29
+ model: str = "gpt-4o",
30
+ api_key: str | None = None,
31
+ base_url: str | None = None,
32
+ openpipe_api_key: str | None = None,
33
+ openpipe_base_url: str = "https://app.openpipe.ai/api/v1",
34
+ tags: Dict[str, str] | None = None,
35
+ **kwargs):
36
+ super().__init__(
37
+ model=model,
38
+ api_key=api_key,
39
+ base_url=base_url,
40
+ openpipe_api_key=openpipe_api_key,
41
+ openpipe_base_url=openpipe_base_url,
42
+ **kwargs)
43
+ self._tags = tags
44
+
45
+ def create_client(self, api_key=None, base_url=None, **kwargs):
46
+ openpipe_api_key = kwargs.get("openpipe_api_key") or ""
47
+ openpipe_base_url = kwargs.get("openpipe_base_url") or ""
48
+ client = OpenPipeAI(
49
+ api_key=api_key,
50
+ base_url=base_url,
51
+ openpipe={
52
+ "api_key": openpipe_api_key,
53
+ "base_url": openpipe_base_url
54
+ }
55
+ )
56
+ return client
57
+
58
+ async def get_chat_completions(
59
+ self,
60
+ context: OpenAILLMContext,
61
+ messages: List[ChatCompletionMessageParam]) -> AsyncStream[ChatCompletionChunk]:
62
+ chunks = await self._client.chat.completions.create(
63
+ model=self._model,
64
+ stream=True,
65
+ messages=messages,
66
+ openpipe={
67
+ "tags": self._tags,
68
+ "log_request": True
69
+ }
70
+ )
71
+ return chunks
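
A construction sketch (not part of the upload): the `tags` dictionary is attached to every logged request via the `openpipe` argument in `get_chat_completions()` above; the tag keys and values are illustrative.

```python
import os

from pipecat.services.openpipe import OpenPipeLLMService

llm = OpenPipeLLMService(
    model="gpt-4o",
    api_key=os.environ["OPENAI_API_KEY"],
    openpipe_api_key=os.environ["OPENPIPE_API_KEY"],
    tags={"environment": "dev", "feature": "voice-bot"},
)
```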
pipecat/services/playht.py ADDED
@@ -0,0 +1,83 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import io
8
+ import struct
9
+
10
+ from typing import AsyncGenerator
11
+
12
+ from pipecat.frames.frames import AudioRawFrame, Frame
13
+ from pipecat.services.ai_services import TTSService
14
+
15
+ from loguru import logger
16
+
17
+ try:
18
+ from pyht.client import TTSOptions
19
+ from pyht.async_client import AsyncClient
20
+ from pyht.protos.api_pb2 import Format
21
+ except ModuleNotFoundError as e:
22
+ logger.error(f"Exception: {e}")
23
+ logger.error(
24
+ "In order to use PlayHT, you need to `pip install pipecat-ai[playht]`. Also, set `PLAY_HT_USER_ID` and `PLAY_HT_API_KEY` environment variables.")
25
+ raise Exception(f"Missing module: {e}")
26
+
27
+
28
+ class PlayHTTTSService(TTSService):
29
+
30
+ def __init__(self, *, api_key: str, user_id: str, voice_url: str, **kwargs):
31
+ super().__init__(**kwargs)
32
+
33
+ self._user_id = user_id
34
+ self._speech_key = api_key
35
+
36
+ self._client = AsyncClient(
37
+ user_id=self._user_id,
38
+ api_key=self._speech_key,
39
+ )
40
+ self._options = TTSOptions(
41
+ voice=voice_url,
42
+ sample_rate=16000,
43
+ quality="higher",
44
+ format=Format.FORMAT_WAV)
45
+
46
+ def can_generate_metrics(self) -> bool:
47
+ return True
48
+
49
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
50
+ logger.debug(f"Generating TTS: [{text}]")
51
+
52
+ try:
53
+ b = bytearray()
54
+ in_header = True
55
+
56
+ await self.start_ttfb_metrics()
57
+
58
+ playht_gen = self._client.tts(
59
+ text,
60
+ voice_engine="PlayHT2.0-turbo",
61
+ options=self._options)
62
+
63
+ async for chunk in playht_gen:
64
+ # skip the RIFF header.
65
+ if in_header:
66
+ b.extend(chunk)
67
+ if len(b) <= 36:
68
+ continue
69
+ else:
70
+ fh = io.BytesIO(b)
71
+ fh.seek(36)
72
+ (data, size) = struct.unpack('<4sI', fh.read(8))
73
+ while data != b'data':
74
+ fh.read(size)
75
+ (data, size) = struct.unpack('<4sI', fh.read(8))
76
+ in_header = False
77
+ else:
78
+ if len(chunk):
79
+ await self.stop_ttfb_metrics()
80
+ frame = AudioRawFrame(chunk, 16000, 1)
81
+ yield frame
82
+ except Exception as e:
83
+ logger.exception(f"{self} error generating TTS: {e}")
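
The header-skipping loop in `run_tts()` above seeks past the 36-byte RIFF/fmt preamble and then walks chunk headers until it finds the `data` chunk, so only PCM samples are yielded. A self-contained sketch (not part of the upload) of the same walk over a synthetic WAV buffer, using only the standard library.

```python
import io
import struct
import wave


def pcm_payload_offset(wav_bytes: bytes) -> int:
    """Return the byte offset where PCM samples start, mirroring the loop above."""
    fh = io.BytesIO(wav_bytes)
    fh.seek(36)  # skip the RIFF header plus the canonical 16-byte fmt chunk
    (chunk_id, size) = struct.unpack('<4sI', fh.read(8))
    while chunk_id != b'data':
        fh.read(size)  # skip any non-data chunks (e.g. LIST/INFO)
        (chunk_id, size) = struct.unpack('<4sI', fh.read(8))
    return fh.tell()


# Build a tiny valid WAV in memory and locate its data payload.
buf = io.BytesIO()
with wave.open(buf, 'wb') as w:
    w.setnchannels(1)
    w.setsampwidth(2)
    w.setframerate(16000)
    w.writeframes(b'\x00\x00' * 16000)  # one second of silence

offset = pcm_payload_offset(buf.getvalue())
print(f"PCM data starts at byte {offset}")  # typically 44 for a canonical header
```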