lucy1118 committed
Commit 8d7f55c · verified · 1 Parent(s): cef8620

Upload 78 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. pipecat/__init__.py +0 -0
  2. pipecat/frames/__init__.py +0 -0
  3. pipecat/frames/frames.proto +43 -0
  4. pipecat/frames/frames.py +340 -0
  5. pipecat/frames/protobufs/frames_pb2.py +32 -0
  6. pipecat/pipeline/__init__.py +0 -0
  7. pipecat/pipeline/base_pipeline.py +21 -0
  8. pipecat/pipeline/merge_pipeline.py +24 -0
  9. pipecat/pipeline/parallel_pipeline.py +154 -0
  10. pipecat/pipeline/parallel_task.py +119 -0
  11. pipecat/pipeline/pipeline.py +95 -0
  12. pipecat/pipeline/runner.py +58 -0
  13. pipecat/pipeline/task.py +142 -0
  14. pipecat/processors/__init__.py +0 -0
  15. pipecat/processors/aggregators/__init__.py +0 -0
  16. pipecat/processors/aggregators/gated.py +74 -0
  17. pipecat/processors/aggregators/llm_response.py +266 -0
  18. pipecat/processors/aggregators/openai_llm_context.py +114 -0
  19. pipecat/processors/aggregators/sentence.py +54 -0
  20. pipecat/processors/aggregators/user_response.py +156 -0
  21. pipecat/processors/aggregators/vision_image_frame.py +47 -0
  22. pipecat/processors/async_frame_processor.py +63 -0
  23. pipecat/processors/filters/__init__.py +0 -0
  24. pipecat/processors/filters/frame_filter.py +36 -0
  25. pipecat/processors/filters/function_filter.py +30 -0
  26. pipecat/processors/filters/wake_check_filter.py +86 -0
  27. pipecat/processors/frame_processor.py +162 -0
  28. pipecat/processors/frameworks/__init__.py +0 -0
  29. pipecat/processors/frameworks/langchain.py +80 -0
  30. pipecat/processors/logger.py +27 -0
  31. pipecat/processors/text_transformer.py +38 -0
  32. pipecat/serializers/__init__.py +0 -0
  33. pipecat/serializers/base_serializer.py +20 -0
  34. pipecat/serializers/protobuf.py +92 -0
  35. pipecat/serializers/twilio.py +52 -0
  36. pipecat/services/__init__.py +0 -0
  37. pipecat/services/ai_services.py +300 -0
  38. pipecat/services/anthropic.py +145 -0
  39. pipecat/services/azure.py +233 -0
  40. pipecat/services/cartesia.py +65 -0
  41. pipecat/services/deepgram.py +149 -0
  42. pipecat/services/elevenlabs.py +66 -0
  43. pipecat/services/fal.py +83 -0
  44. pipecat/services/fireworks.py +25 -0
  45. pipecat/services/google.py +129 -0
  46. pipecat/services/moondream.py +92 -0
  47. pipecat/services/ollama.py +13 -0
  48. pipecat/services/openai.py +338 -0
  49. pipecat/services/openpipe.py +71 -0
  50. pipecat/services/playht.py +83 -0
pipecat/__init__.py ADDED
File without changes
pipecat/frames/__init__.py ADDED
File without changes
pipecat/frames/frames.proto ADDED
@@ -0,0 +1,43 @@
//
// Copyright (c) 2024, Daily
//
// SPDX-License-Identifier: BSD 2-Clause License
//

// Generate frames_pb2.py with:
//
//   python -m grpc_tools.protoc --proto_path=./ --python_out=./protobufs frames.proto

syntax = "proto3";

package pipecat;

message TextFrame {
  uint64 id = 1;
  string name = 2;
  string text = 3;
}

message AudioRawFrame {
  uint64 id = 1;
  string name = 2;
  bytes audio = 3;
  uint32 sample_rate = 4;
  uint32 num_channels = 5;
}

message TranscriptionFrame {
  uint64 id = 1;
  string name = 2;
  string text = 3;
  string user_id = 4;
  string timestamp = 5;
}

message Frame {
  oneof frame {
    TextFrame text = 1;
    AudioRawFrame audio = 2;
    TranscriptionFrame transcription = 3;
  }
}
pipecat/frames/frames.py ADDED
@@ -0,0 +1,340 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

from typing import Any, List, Mapping, Tuple

from dataclasses import dataclass, field

from pipecat.utils.utils import obj_count, obj_id


@dataclass
class Frame:
    id: int = field(init=False)
    name: str = field(init=False)

    def __post_init__(self):
        self.id: int = obj_id()
        self.name: str = f"{self.__class__.__name__}#{obj_count(self)}"

    def __str__(self):
        return self.name


@dataclass
class DataFrame(Frame):
    pass


@dataclass
class AudioRawFrame(DataFrame):
    """A chunk of audio. Will be played by the transport if the transport's
    microphone has been enabled.

    """
    audio: bytes
    sample_rate: int
    num_channels: int

    def __post_init__(self):
        super().__post_init__()
        self.num_frames = int(len(self.audio) / (self.num_channels * 2))

    def __str__(self):
        return f"{self.name}(size: {len(self.audio)}, frames: {self.num_frames}, sample_rate: {self.sample_rate}, channels: {self.num_channels})"


@dataclass
class ImageRawFrame(DataFrame):
    """An image. Will be shown by the transport if the transport's camera is
    enabled.

    """
    image: bytes
    size: Tuple[int, int]
    format: str | None

    def __str__(self):
        return f"{self.name}(size: {self.size}, format: {self.format})"


@dataclass
class URLImageRawFrame(ImageRawFrame):
    """An image with an associated URL. Will be shown by the transport if the
    transport's camera is enabled.

    """
    url: str | None

    def __str__(self):
        return f"{self.name}(url: {self.url}, size: {self.size}, format: {self.format})"


@dataclass
class VisionImageRawFrame(ImageRawFrame):
    """An image with associated text asking for a description of it. Will be
    shown by the transport if the transport's camera is enabled.

    """
    text: str | None

    def __str__(self):
        return f"{self.name}(text: {self.text}, size: {self.size}, format: {self.format})"


@dataclass
class UserImageRawFrame(ImageRawFrame):
    """An image associated with a user. Will be shown by the transport if the
    transport's camera is enabled.

    """
    user_id: str

    def __str__(self):
        return f"{self.name}(user: {self.user_id}, size: {self.size}, format: {self.format})"


@dataclass
class SpriteFrame(Frame):
    """An animated sprite. Will be shown by the transport if the transport's
    camera is enabled. Will play at the framerate specified in the transport's
    `fps` constructor parameter.

    """
    images: List[ImageRawFrame]

    def __str__(self):
        return f"{self.name}(size: {len(self.images)})"


@dataclass
class TextFrame(DataFrame):
    """A chunk of text. Emitted by LLM services, consumed by TTS services, can
    be used to send text through pipelines.

    """
    text: str

    def __str__(self):
        return f"{self.name}(text: {self.text})"


@dataclass
class TranscriptionFrame(TextFrame):
    """A text frame with transcription-specific data. Will be placed in the
    transport's receive queue when a participant speaks.

    """
    user_id: str
    timestamp: str

    def __str__(self):
        return f"{self.name}(user: {self.user_id}, text: {self.text}, timestamp: {self.timestamp})"


@dataclass
class InterimTranscriptionFrame(TextFrame):
    """A text frame with interim transcription-specific data. Will be placed in
    the transport's receive queue when a participant speaks."""
    user_id: str
    timestamp: str

    def __str__(self):
        return f"{self.name}(user: {self.user_id}, text: {self.text}, timestamp: {self.timestamp})"


@dataclass
class LLMMessagesFrame(DataFrame):
    """A frame containing a list of LLM messages. Used to signal that an LLM
    service should run a chat completion and emit an LLMFullResponseStartFrame,
    TextFrames and an LLMFullResponseEndFrame. Note that the messages property
    of this class is mutable, and will be updated by various ResponseAggregator
    frame processors.

    """
    messages: List[dict]


@dataclass
class TransportMessageFrame(DataFrame):
    message: Any

    def __str__(self):
        return f"{self.name}(message: {self.message})"

#
# App frames. Application user-defined frames.
#


@dataclass
class AppFrame(Frame):
    pass

#
# System frames
#


@dataclass
class SystemFrame(Frame):
    pass


@dataclass
class StartFrame(SystemFrame):
    """This is the first frame that should be pushed down a pipeline."""
    allow_interruptions: bool = False
    enable_metrics: bool = False
    report_only_initial_ttfb: bool = False


@dataclass
class CancelFrame(SystemFrame):
    """Indicates that a pipeline needs to stop right away."""
    pass


@dataclass
class ErrorFrame(SystemFrame):
    """This is used to notify upstream that an error has occurred downstream in
    the pipeline."""
    error: str | None

    def __str__(self):
        return f"{self.name}(error: {self.error})"


@dataclass
class StopTaskFrame(SystemFrame):
    """Indicates that a pipeline task should be stopped. This should inform the
    pipeline processors that they should stop pushing frames but that they
    should be kept in a running state.

    """
    pass


@dataclass
class StartInterruptionFrame(SystemFrame):
    """Emitted by VAD to indicate that a user has started speaking (i.e. an
    interruption). This is similar to UserStartedSpeakingFrame except that it
    should be pushed concurrently with other frames (so the order is not
    guaranteed).

    """
    pass


@dataclass
class StopInterruptionFrame(SystemFrame):
    """Emitted by VAD to indicate that a user has stopped speaking (i.e. no more
    interruptions). This is similar to UserStoppedSpeakingFrame except that it
    should be pushed concurrently with other frames (so the order is not
    guaranteed).

    """
    pass


@dataclass
class MetricsFrame(SystemFrame):
    """Emitted by processors that can compute metrics like latencies.
    """
    ttfb: List[Mapping[str, Any]] | None = None
    processing: List[Mapping[str, Any]] | None = None

#
# Control frames
#


@dataclass
class ControlFrame(Frame):
    pass


@dataclass
class EndFrame(ControlFrame):
    """Indicates that a pipeline has ended and frame processors and pipelines
    should be shut down. If the transport receives this frame, it will stop
    sending frames to its output channel(s) and close all its threads. Note
    that this is a control frame, which means it will be received in the order
    it was sent (unlike system frames).

    """
    pass


@dataclass
class LLMFullResponseStartFrame(ControlFrame):
    """Used to indicate the beginning of a full LLM response. Followed by an
    LLMResponseStartFrame, TextFrames and an LLMResponseEndFrame for each
    sentence until an LLMFullResponseEndFrame."""
    pass


@dataclass
class LLMFullResponseEndFrame(ControlFrame):
    """Indicates the end of a full LLM response."""
    pass


@dataclass
class LLMResponseStartFrame(ControlFrame):
    """Used to indicate the beginning of an LLM response. Following TextFrames
    are part of the LLM response until an LLMResponseEndFrame."""
    pass


@dataclass
class LLMResponseEndFrame(ControlFrame):
    """Indicates the end of an LLM response."""
    pass


@dataclass
class UserStartedSpeakingFrame(ControlFrame):
    """Emitted by VAD to indicate that a user has started speaking. This can be
    used for interruptions or other times when detecting that someone is
    speaking is more important than knowing what they're saying (as you would
    with a TranscriptionFrame).

    """
    pass


@dataclass
class UserStoppedSpeakingFrame(ControlFrame):
    """Emitted by the VAD to indicate that a user stopped speaking."""
    pass


@dataclass
class TTSStartedFrame(ControlFrame):
    """Used to indicate the beginning of a TTS response. Following
    AudioRawFrames are part of the TTS response until a TTSStoppedFrame. These
    frames can be used for aggregating audio frames in a transport to optimize
    the size of frames sent to the session, without needing to control this in
    the TTS service.

    """
    pass


@dataclass
class TTSStoppedFrame(ControlFrame):
    """Indicates the end of a TTS response."""
    pass


@dataclass
class UserImageRequestFrame(ControlFrame):
    """A frame used to request an image from the given user."""
    user_id: str

    def __str__(self):
        return f"{self.name}, user: {self.user_id}"
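Usage sketch (illustrative, not part of the uploaded files): the dataclasses above assign `id` and `name` automatically in `__post_init__`, and AudioRawFrame derives `num_frames` from the payload assuming 16-bit samples. A minimal example, assuming `pipecat.utils.utils` provides `obj_id`/`obj_count` as imported above:

    from pipecat.frames.frames import AudioRawFrame, TextFrame

    # 20 ms of silence at 16 kHz mono, 16-bit: 16000 * 0.02 * 2 bytes = 640 bytes.
    audio = AudioRawFrame(audio=b"\x00" * 640, sample_rate=16000, num_channels=1)
    print(audio)  # e.g. AudioRawFrame#0(size: 640, frames: 320, sample_rate: 16000, channels: 1)

    text = TextFrame("Hello there")
    print(text.id, text.name)  # auto-generated id and a "TextFrame#N"-style name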
pipecat/frames/protobufs/frames_pb2.py ADDED
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: frames.proto
# Protobuf Python Version: 4.25.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()




DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0c\x66rames.proto\x12\x07pipecat\"3\n\tTextFrame\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\"c\n\rAudioRawFrame\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\r\n\x05\x61udio\x18\x03 \x01(\x0c\x12\x13\n\x0bsample_rate\x18\x04 \x01(\r\x12\x14\n\x0cnum_channels\x18\x05 \x01(\r\"`\n\x12TranscriptionFrame\x12\n\n\x02id\x18\x01 \x01(\x04\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x0f\n\x07user_id\x18\x04 \x01(\t\x12\x11\n\ttimestamp\x18\x05 \x01(\t\"\x93\x01\n\x05\x46rame\x12\"\n\x04text\x18\x01 \x01(\x0b\x32\x12.pipecat.TextFrameH\x00\x12\'\n\x05\x61udio\x18\x02 \x01(\x0b\x32\x16.pipecat.AudioRawFrameH\x00\x12\x34\n\rtranscription\x18\x03 \x01(\x0b\x32\x1b.pipecat.TranscriptionFrameH\x00\x42\x07\n\x05\x66rameb\x06proto3')

_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'frames_pb2', _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
  DESCRIPTOR._options = None
  _globals['_TEXTFRAME']._serialized_start=25
  _globals['_TEXTFRAME']._serialized_end=76
  _globals['_AUDIORAWFRAME']._serialized_start=78
  _globals['_AUDIORAWFRAME']._serialized_end=177
  _globals['_TRANSCRIPTIONFRAME']._serialized_start=179
  _globals['_TRANSCRIPTIONFRAME']._serialized_end=275
  _globals['_FRAME']._serialized_start=278
  _globals['_FRAME']._serialized_end=425
# @@protoc_insertion_point(module_scope)
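A small, hedged example of how the generated module can be used (not part of the upload; the real consumer is pipecat/serializers/protobuf.py, and this assumes the protobufs package is importable):

    from pipecat.frames.protobufs import frames_pb2

    frame = frames_pb2.Frame()
    frame.text.id = 1
    frame.text.name = "TextFrame#1"
    frame.text.text = "hello"
    data = frame.SerializeToString()

    decoded = frames_pb2.Frame()
    decoded.ParseFromString(data)
    print(decoded.WhichOneof("frame"))  # "text"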
pipecat/pipeline/__init__.py ADDED
File without changes
pipecat/pipeline/base_pipeline.py ADDED
@@ -0,0 +1,21 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

from abc import abstractmethod

from typing import List

from pipecat.processors.frame_processor import FrameProcessor


class BasePipeline(FrameProcessor):

    def __init__(self):
        super().__init__()

    @abstractmethod
    def processors_with_metrics(self) -> List[FrameProcessor]:
        pass
pipecat/pipeline/merge_pipeline.py ADDED
@@ -0,0 +1,24 @@
from typing import List
from pipecat.pipeline.frames import EndFrame, EndPipeFrame
from pipecat.pipeline.pipeline import Pipeline


class SequentialMergePipeline(Pipeline):
    """This class merges the sink queues from a list of pipelines. Frames from
    each pipeline's sink are merged in the order of pipelines in the list."""

    def __init__(self, pipelines: List[Pipeline]):
        super().__init__([])
        self.pipelines = pipelines

    async def run_pipeline(self):
        for idx, pipeline in enumerate(self.pipelines):
            while True:
                frame = await pipeline.sink.get()
                if isinstance(frame, EndFrame) or isinstance(frame, EndPipeFrame):
                    break
                await self.sink.put(frame)

        await self.sink.put(EndFrame())
pipecat/pipeline/parallel_pipeline.py ADDED
@@ -0,0 +1,154 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio

from itertools import chain
from typing import List

from pipecat.pipeline.base_pipeline import BasePipeline
from pipecat.pipeline.pipeline import Pipeline
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.frames.frames import CancelFrame, EndFrame, Frame, StartFrame

from loguru import logger


class Source(FrameProcessor):

    def __init__(self, upstream_queue: asyncio.Queue):
        super().__init__()
        self._up_queue = upstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self._up_queue.put(frame)
            case FrameDirection.DOWNSTREAM:
                await self.push_frame(frame, direction)


class Sink(FrameProcessor):

    def __init__(self, downstream_queue: asyncio.Queue):
        super().__init__()
        self._down_queue = downstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self.push_frame(frame, direction)
            case FrameDirection.DOWNSTREAM:
                await self._down_queue.put(frame)


class ParallelPipeline(BasePipeline):
    def __init__(self, *args):
        super().__init__()

        if len(args) == 0:
            raise Exception("ParallelPipeline needs at least one argument")

        self._sources = []
        self._sinks = []

        self._up_queue = asyncio.Queue()
        self._down_queue = asyncio.Queue()
        self._up_task: asyncio.Task | None = None
        self._down_task: asyncio.Task | None = None

        self._pipelines = []

        logger.debug(f"Creating {self} pipelines")
        for processors in args:
            if not isinstance(processors, list):
                raise TypeError(f"ParallelPipeline argument {processors} is not a list")

            # We will add a source before the pipeline and a sink after.
            source = Source(self._up_queue)
            sink = Sink(self._down_queue)
            self._sources.append(source)
            self._sinks.append(sink)

            # Create pipeline
            pipeline = Pipeline(processors)
            source.link(pipeline)
            pipeline.link(sink)
            self._pipelines.append(pipeline)

        logger.debug(f"Finished creating {self} pipelines")

    #
    # BasePipeline
    #

    def processors_with_metrics(self) -> List[FrameProcessor]:
        return list(chain.from_iterable(p.processors_with_metrics() for p in self._pipelines))

    #
    # Frame processor
    #

    async def cleanup(self):
        await asyncio.gather(*[p.cleanup() for p in self._pipelines])

    async def _start_tasks(self):
        loop = self.get_event_loop()
        self._up_task = loop.create_task(self._process_up_queue())
        self._down_task = loop.create_task(self._process_down_queue())

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if isinstance(frame, StartFrame):
            await self._start_tasks()

        if direction == FrameDirection.UPSTREAM:
            # If we get an upstream frame we process it in each sink.
            await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sinks])
        elif direction == FrameDirection.DOWNSTREAM:
            # If we get a downstream frame we process it in each source.
            # TODO(aleix): We are creating a task for each frame. For real-time
            # video/audio this might be too slow. We should use an already
            # created task instead.
            await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sources])

        # If we get an EndFrame we stop our queue processing tasks and wait on
        # all the pipelines to finish.
        if isinstance(frame, CancelFrame) or isinstance(frame, EndFrame):
            # Use None to indicate when queues should be done processing.
            await self._up_queue.put(None)
            await self._down_queue.put(None)
            if self._up_task:
                await self._up_task
            if self._down_task:
                await self._down_task

    async def _process_up_queue(self):
        running = True
        seen_ids = set()
        while running:
            frame = await self._up_queue.get()
            if frame and frame.id not in seen_ids:
                await self.push_frame(frame, FrameDirection.UPSTREAM)
                seen_ids.add(frame.id)
            running = frame is not None
            self._up_queue.task_done()

    async def _process_down_queue(self):
        running = True
        seen_ids = set()
        while running:
            frame = await self._down_queue.get()
            if frame and frame.id not in seen_ids:
                await self.push_frame(frame, FrameDirection.DOWNSTREAM)
                seen_ids.add(frame.id)
            running = frame is not None
            self._down_queue.task_done()
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio

from itertools import chain
from typing import List

from pipecat.pipeline.base_pipeline import BasePipeline
from pipecat.pipeline.pipeline import Pipeline
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.frames.frames import Frame

from loguru import logger


class Source(FrameProcessor):

    def __init__(self, upstream_queue: asyncio.Queue):
        super().__init__()
        self._up_queue = upstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self._up_queue.put(frame)
            case FrameDirection.DOWNSTREAM:
                await self.push_frame(frame, direction)


class Sink(FrameProcessor):

    def __init__(self, downstream_queue: asyncio.Queue):
        super().__init__()
        self._down_queue = downstream_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self.push_frame(frame, direction)
            case FrameDirection.DOWNSTREAM:
                await self._down_queue.put(frame)


class ParallelTask(BasePipeline):
    def __init__(self, *args):
        super().__init__()

        if len(args) == 0:
            raise Exception("ParallelTask needs at least one argument")

        self._sinks = []
        self._pipelines = []

        self._up_queue = asyncio.Queue()
        self._down_queue = asyncio.Queue()

        logger.debug(f"Creating {self} pipelines")
        for processors in args:
            if not isinstance(processors, list):
                raise TypeError(f"ParallelTask argument {processors} is not a list")

            # We add a source at the beginning of the pipeline and a sink at the end.
            source = Source(self._up_queue)
            sink = Sink(self._down_queue)
            processors: List[FrameProcessor] = [source] + processors
            processors.append(sink)

            # Keep track of sinks. We access the source through the pipeline.
            self._sinks.append(sink)

            # Create pipeline
            pipeline = Pipeline(processors)
            self._pipelines.append(pipeline)
        logger.debug(f"Finished creating {self} pipelines")

    #
    # BasePipeline
    #

    def processors_with_metrics(self) -> List[FrameProcessor]:
        return list(chain.from_iterable(p.processors_with_metrics() for p in self._pipelines))

    #
    # Frame processor
    #

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if direction == FrameDirection.UPSTREAM:
            # If we get an upstream frame we process it in each sink.
            await asyncio.gather(*[s.process_frame(frame, direction) for s in self._sinks])
        elif direction == FrameDirection.DOWNSTREAM:
            # If we get a downstream frame we process it in each source (using the pipeline).
            await asyncio.gather(*[p.process_frame(frame, direction) for p in self._pipelines])

        seen_ids = set()
        while not self._up_queue.empty():
            frame = await self._up_queue.get()
            if frame and frame.id not in seen_ids:
                await self.push_frame(frame, FrameDirection.UPSTREAM)
                seen_ids.add(frame.id)
            self._up_queue.task_done()

        seen_ids = set()
        while not self._down_queue.empty():
            frame = await self._down_queue.get()
            if frame and frame.id not in seen_ids:
                await self.push_frame(frame, FrameDirection.DOWNSTREAM)
                seen_ids.add(frame.id)
            self._down_queue.task_done()
pipecat/pipeline/pipeline.py ADDED
@@ -0,0 +1,95 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

from typing import Callable, Coroutine, List

from pipecat.frames.frames import Frame
from pipecat.pipeline.base_pipeline import BasePipeline
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class PipelineSource(FrameProcessor):

    def __init__(self, upstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]):
        super().__init__()
        self._upstream_push_frame = upstream_push_frame

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self._upstream_push_frame(frame, direction)
            case FrameDirection.DOWNSTREAM:
                await self.push_frame(frame, direction)


class PipelineSink(FrameProcessor):

    def __init__(self, downstream_push_frame: Callable[[Frame, FrameDirection], Coroutine]):
        super().__init__()
        self._downstream_push_frame = downstream_push_frame

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self.push_frame(frame, direction)
            case FrameDirection.DOWNSTREAM:
                await self._downstream_push_frame(frame, direction)


class Pipeline(BasePipeline):

    def __init__(self, processors: List[FrameProcessor]):
        super().__init__()

        # Add a source and a sink queue so we can forward frames upstream and
        # downstream outside of the pipeline.
        self._source = PipelineSource(self.push_frame)
        self._sink = PipelineSink(self.push_frame)
        self._processors: List[FrameProcessor] = [self._source] + processors + [self._sink]

        self._link_processors()

    #
    # BasePipeline
    #

    def processors_with_metrics(self):
        services = []
        for p in self._processors:
            if isinstance(p, BasePipeline):
                services += p.processors_with_metrics()
            elif p.can_generate_metrics():
                services.append(p)
        return services

    #
    # Frame processor
    #

    async def cleanup(self):
        await self._cleanup_processors()

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if direction == FrameDirection.DOWNSTREAM:
            await self._source.process_frame(frame, FrameDirection.DOWNSTREAM)
        elif direction == FrameDirection.UPSTREAM:
            await self._sink.process_frame(frame, FrameDirection.UPSTREAM)

    async def _cleanup_processors(self):
        for p in self._processors:
            await p.cleanup()

    def _link_processors(self):
        prev = self._processors[0]
        for curr in self._processors[1:]:
            prev.link(curr)
            prev = curr
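A minimal sketch (not part of the upload) of how Pipeline links processors in order; the passthrough processor below is hypothetical and simply forwards every frame:

    from pipecat.frames.frames import Frame
    from pipecat.pipeline.pipeline import Pipeline
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor

    class Passthrough(FrameProcessor):
        async def process_frame(self, frame: Frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            await self.push_frame(frame, direction)  # forward unchanged

    # A PipelineSource and PipelineSink are added automatically around the list.
    pipeline = Pipeline([Passthrough(), Passthrough()])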
pipecat/pipeline/runner.py ADDED
@@ -0,0 +1,58 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import signal

from pipecat.pipeline.task import PipelineTask
from pipecat.utils.utils import obj_count, obj_id

from loguru import logger


class PipelineRunner:

    def __init__(self, *, name: str | None = None, handle_sigint: bool = True):
        self.id: int = obj_id()
        self.name: str = name or f"{self.__class__.__name__}#{obj_count(self)}"

        self._tasks = {}

        if handle_sigint:
            self._setup_sigint()

    async def run(self, task: PipelineTask):
        logger.debug(f"Runner {self} started running {task}")
        self._tasks[task.name] = task
        await task.run()
        del self._tasks[task.name]
        logger.debug(f"Runner {self} finished running {task}")

    async def stop_when_done(self):
        logger.debug(f"Runner {self} scheduled to stop when all tasks are done")
        await asyncio.gather(*[t.stop_when_done() for t in self._tasks.values()])

    async def cancel(self):
        logger.debug(f"Canceling runner {self}")
        await asyncio.gather(*[t.cancel() for t in self._tasks.values()])

    def _setup_sigint(self):
        loop = asyncio.get_running_loop()
        loop.add_signal_handler(
            signal.SIGINT,
            lambda *args: asyncio.create_task(self._sig_handler())
        )
        loop.add_signal_handler(
            signal.SIGTERM,
            lambda *args: asyncio.create_task(self._sig_handler())
        )

    async def _sig_handler(self):
        logger.warning(f"Interruption detected. Canceling runner {self}")
        await self.cancel()

    def __str__(self):
        return self.name
pipecat/pipeline/task.py ADDED
@@ -0,0 +1,142 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio

from typing import AsyncIterable, Iterable

from pydantic import BaseModel

from pipecat.frames.frames import CancelFrame, EndFrame, ErrorFrame, Frame, MetricsFrame, StartFrame, StopTaskFrame
from pipecat.pipeline.base_pipeline import BasePipeline
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.utils import obj_count, obj_id

from loguru import logger


class PipelineParams(BaseModel):
    allow_interruptions: bool = False
    enable_metrics: bool = False
    report_only_initial_ttfb: bool = False


class Source(FrameProcessor):

    def __init__(self, up_queue: asyncio.Queue):
        super().__init__()
        self._up_queue = up_queue

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        match direction:
            case FrameDirection.UPSTREAM:
                await self._up_queue.put(frame)
            case FrameDirection.DOWNSTREAM:
                await self.push_frame(frame, direction)


class PipelineTask:

    def __init__(self, pipeline: BasePipeline, params: PipelineParams = PipelineParams()):
        self.id: int = obj_id()
        self.name: str = f"{self.__class__.__name__}#{obj_count(self)}"

        self._pipeline = pipeline
        self._params = params
        self._finished = False

        self._down_queue = asyncio.Queue()
        self._up_queue = asyncio.Queue()

        self._source = Source(self._up_queue)
        self._source.link(pipeline)

    def has_finished(self):
        return self._finished

    async def stop_when_done(self):
        logger.debug(f"Task {self} scheduled to stop when done")
        await self.queue_frame(EndFrame())

    async def cancel(self):
        logger.debug(f"Canceling pipeline task {self}")
        # Make sure everything is cleaned up downstream. This is sent
        # out-of-band from the main streaming task which is what we want since
        # we want to cancel right away.
        await self._source.process_frame(CancelFrame(), FrameDirection.DOWNSTREAM)
        self._process_down_task.cancel()
        self._process_up_task.cancel()
        await self._process_down_task
        await self._process_up_task

    async def run(self):
        self._process_up_task = asyncio.create_task(self._process_up_queue())
        self._process_down_task = asyncio.create_task(self._process_down_queue())
        await asyncio.gather(self._process_up_task, self._process_down_task)
        self._finished = True

    async def queue_frame(self, frame: Frame):
        await self._down_queue.put(frame)

    async def queue_frames(self, frames: Iterable[Frame] | AsyncIterable[Frame]):
        if isinstance(frames, AsyncIterable):
            async for frame in frames:
                await self.queue_frame(frame)
        elif isinstance(frames, Iterable):
            for frame in frames:
                await self.queue_frame(frame)
        else:
            raise Exception("Frames must be an iterable or async iterable")

    def _initial_metrics_frame(self) -> MetricsFrame:
        processors = self._pipeline.processors_with_metrics()
        ttfb = [{"name": p.name, "time": 0.0} for p in processors]
        processing = [{"name": p.name, "time": 0.0} for p in processors]
        return MetricsFrame(ttfb=ttfb, processing=processing)

    async def _process_down_queue(self):
        start_frame = StartFrame(
            allow_interruptions=self._params.allow_interruptions,
            enable_metrics=self._params.enable_metrics,
            report_only_initial_ttfb=self._params.report_only_initial_ttfb
        )
        await self._source.process_frame(start_frame, FrameDirection.DOWNSTREAM)
        await self._source.process_frame(self._initial_metrics_frame(), FrameDirection.DOWNSTREAM)

        running = True
        should_cleanup = True
        while running:
            try:
                frame = await self._down_queue.get()
                await self._source.process_frame(frame, FrameDirection.DOWNSTREAM)
                running = not (isinstance(frame, StopTaskFrame) or isinstance(frame, EndFrame))
                should_cleanup = not isinstance(frame, StopTaskFrame)
                self._down_queue.task_done()
            except asyncio.CancelledError:
                break
        # Cleanup only if we need to.
        if should_cleanup:
            await self._source.cleanup()
            await self._pipeline.cleanup()
        # Cancel the up-queue task so it terminates gracefully.
        self._process_up_task.cancel()
        await self._process_up_task

    async def _process_up_queue(self):
        while True:
            try:
                frame = await self._up_queue.get()
                if isinstance(frame, ErrorFrame):
                    logger.error(f"Error running app: {frame.error}")
                    await self.queue_frame(CancelFrame())
                self._up_queue.task_done()
            except asyncio.CancelledError:
                break

    def __str__(self):
        return self.name
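End-to-end sketch (illustrative, assuming the modules above): a PipelineTask wraps a pipeline, pushes a StartFrame and an initial MetricsFrame first, and then streams queued frames; a PipelineRunner awaits the task and wires SIGINT/SIGTERM handling:

    import asyncio

    from pipecat.frames.frames import EndFrame, TextFrame
    from pipecat.pipeline.pipeline import Pipeline
    from pipecat.pipeline.runner import PipelineRunner
    from pipecat.pipeline.task import PipelineParams, PipelineTask

    async def main():
        pipeline = Pipeline([])  # add real processors here
        task = PipelineTask(pipeline, PipelineParams(enable_metrics=True))
        runner = PipelineRunner()  # must be created inside a running event loop
        await task.queue_frames([TextFrame("hello"), EndFrame()])
        await runner.run(task)

    asyncio.run(main())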
pipecat/processors/__init__.py ADDED
File without changes
pipecat/processors/aggregators/__init__.py ADDED
File without changes
pipecat/processors/aggregators/gated.py ADDED
@@ -0,0 +1,74 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

from typing import List

from pipecat.frames.frames import Frame, SystemFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor

from loguru import logger


class GatedAggregator(FrameProcessor):
    """Accumulate frames, with custom functions to start and stop accumulation.
    Yields the gate-opening frame before any accumulated frames, then ensuing
    frames until (but not including) the gate-closing frame.

    >>> from pipecat.pipeline.frames import ImageFrame

    >>> async def print_frames(aggregator, frame):
    ...     async for frame in aggregator.process_frame(frame):
    ...         if isinstance(frame, TextFrame):
    ...             print(frame.text)
    ...         else:
    ...             print(frame.__class__.__name__)

    >>> aggregator = GatedAggregator(
    ...     gate_close_fn=lambda x: isinstance(x, LLMResponseStartFrame),
    ...     gate_open_fn=lambda x: isinstance(x, ImageFrame),
    ...     start_open=False)
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello again.")))
    >>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0))))
    ImageFrame
    Hello
    Hello again.
    >>> asyncio.run(print_frames(aggregator, TextFrame("Goodbye.")))
    Goodbye.
    """

    def __init__(self, gate_open_fn, gate_close_fn, start_open):
        super().__init__()
        self._gate_open_fn = gate_open_fn
        self._gate_close_fn = gate_close_fn
        self._gate_open = start_open
        self._accumulator: List[Frame] = []

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        # We must not block system frames.
        if isinstance(frame, SystemFrame):
            await self.push_frame(frame, direction)
            return

        old_state = self._gate_open
        if self._gate_open:
            self._gate_open = not self._gate_close_fn(frame)
        else:
            self._gate_open = self._gate_open_fn(frame)

        if old_state != self._gate_open:
            state = "open" if self._gate_open else "closed"
            logger.debug(f"Gate is now {state} because of {frame}")

        if self._gate_open:
            await self.push_frame(frame, direction)
            for frame in self._accumulator:
                await self.push_frame(frame, direction)
            self._accumulator = []
        else:
            self._accumulator.append(frame)
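Note that the doctest above still imports ImageFrame and LLMResponseStartFrame from the older `pipecat.pipeline.frames` module. With the frame classes included in this upload, an equivalent gate could be built like this (illustrative sketch only):

    from pipecat.frames.frames import ImageRawFrame, LLMFullResponseStartFrame
    from pipecat.processors.aggregators.gated import GatedAggregator

    gated = GatedAggregator(
        gate_open_fn=lambda frame: isinstance(frame, ImageRawFrame),
        gate_close_fn=lambda frame: isinstance(frame, LLMFullResponseStartFrame),
        start_open=False)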
pipecat/processors/aggregators/llm_response.py ADDED
@@ -0,0 +1,266 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

from typing import List

from pipecat.services.openai import OpenAILLMContextFrame, OpenAILLMContext

from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.frames.frames import (
    Frame,
    InterimTranscriptionFrame,
    LLMFullResponseEndFrame,
    LLMFullResponseStartFrame,
    LLMResponseEndFrame,
    LLMResponseStartFrame,
    LLMMessagesFrame,
    StartInterruptionFrame,
    TranscriptionFrame,
    TextFrame,
    UserStartedSpeakingFrame,
    UserStoppedSpeakingFrame)


class LLMResponseAggregator(FrameProcessor):

    def __init__(
        self,
        *,
        messages: List[dict],
        role: str,
        start_frame,
        end_frame,
        accumulator_frame: TextFrame,
        interim_accumulator_frame: TextFrame | None = None,
        handle_interruptions: bool = False
    ):
        super().__init__()

        self._messages = messages
        self._role = role
        self._start_frame = start_frame
        self._end_frame = end_frame
        self._accumulator_frame = accumulator_frame
        self._interim_accumulator_frame = interim_accumulator_frame
        self._handle_interruptions = handle_interruptions

        # Reset our accumulator state.
        self._reset()

    @property
    def messages(self):
        return self._messages

    @property
    def role(self):
        return self._role

    #
    # Frame processor
    #

    # Use cases implemented:
    #
    # S: Start, E: End, T: Transcription, I: Interim, X: Text
    #
    # S E -> None
    # S T E -> X
    # S I T E -> X
    # S I E T -> X
    # S I E I T -> X
    # S E T -> X
    # S E I T -> X
    #
    # The following case would not be supported:
    #
    # S I E T1 I T2 -> X
    #
    # and T2 would be dropped.

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        send_aggregation = False

        if isinstance(frame, self._start_frame):
            self._aggregation = ""
            self._aggregating = True
            self._seen_start_frame = True
            self._seen_end_frame = False
            self._seen_interim_results = False
            await self.push_frame(frame, direction)
        elif isinstance(frame, self._end_frame):
            self._seen_end_frame = True
            self._seen_start_frame = False

            # We might have received the end frame but we might still be
            # aggregating (i.e. we have seen interim results but not the final
            # text).
            self._aggregating = self._seen_interim_results or len(self._aggregation) == 0

            # Send the aggregation if we are not aggregating anymore (i.e. no
            # more interim results received).
            send_aggregation = not self._aggregating
            await self.push_frame(frame, direction)
        elif isinstance(frame, self._accumulator_frame):
            if self._aggregating:
                self._aggregation += f" {frame.text}"
                # We have received a complete sentence, so if we have seen the
                # end frame and we were still aggregating, it means we should
                # send the aggregation.
                send_aggregation = self._seen_end_frame

            # We just got our final result, so let's reset interim results.
            self._seen_interim_results = False
        elif self._interim_accumulator_frame and isinstance(frame, self._interim_accumulator_frame):
            self._seen_interim_results = True
        elif self._handle_interruptions and isinstance(frame, StartInterruptionFrame):
            await self._push_aggregation()
            # Reset anyway.
            self._reset()
            await self.push_frame(frame, direction)
        else:
            await self.push_frame(frame, direction)

        if send_aggregation:
            await self._push_aggregation()

    async def _push_aggregation(self):
        if len(self._aggregation) > 0:
            self._messages.append({"role": self._role, "content": self._aggregation})

            # Reset the aggregation. Reset it before pushing it down, otherwise
            # if the task gets cancelled we won't be able to clear things up.
            self._aggregation = ""

            frame = LLMMessagesFrame(self._messages)
            await self.push_frame(frame)

    def _reset(self):
        self._aggregation = ""
        self._aggregating = False
        self._seen_start_frame = False
        self._seen_end_frame = False
        self._seen_interim_results = False


class LLMAssistantResponseAggregator(LLMResponseAggregator):
    def __init__(self, messages: List[dict] = []):
        super().__init__(
            messages=messages,
            role="assistant",
            start_frame=LLMFullResponseStartFrame,
            end_frame=LLMFullResponseEndFrame,
            accumulator_frame=TextFrame,
            handle_interruptions=True
        )


class LLMUserResponseAggregator(LLMResponseAggregator):
    def __init__(self, messages: List[dict] = []):
        super().__init__(
            messages=messages,
            role="user",
            start_frame=UserStartedSpeakingFrame,
            end_frame=UserStoppedSpeakingFrame,
            accumulator_frame=TranscriptionFrame,
            interim_accumulator_frame=InterimTranscriptionFrame
        )


class LLMFullResponseAggregator(FrameProcessor):
    """This class aggregates Text frames until it receives an
    LLMFullResponseEndFrame, then emits the concatenated text as
    a single text frame.

    Given the following frames:

        TextFrame("Hello,")
        TextFrame(" world.")
        TextFrame(" I am")
        TextFrame(" an LLM.")
        LLMFullResponseEndFrame()

    this processor will yield nothing for the first 4 frames, then

        TextFrame("Hello, world. I am an LLM.")
        LLMFullResponseEndFrame()

    when passed the last frame.

    >>> async def print_frames(aggregator, frame):
    ...     async for frame in aggregator.process_frame(frame):
    ...         if isinstance(frame, TextFrame):
    ...             print(frame.text)
    ...         else:
    ...             print(frame.__class__.__name__)

    >>> aggregator = LLMFullResponseAggregator()
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello,")))
    >>> asyncio.run(print_frames(aggregator, TextFrame(" world.")))
    >>> asyncio.run(print_frames(aggregator, TextFrame(" I am")))
    >>> asyncio.run(print_frames(aggregator, TextFrame(" an LLM.")))
    >>> asyncio.run(print_frames(aggregator, LLMFullResponseEndFrame()))
    Hello, world. I am an LLM.
    LLMFullResponseEndFrame
    """

    def __init__(self):
        super().__init__()
        self._aggregation = ""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if isinstance(frame, TextFrame):
            self._aggregation += frame.text
        elif isinstance(frame, LLMFullResponseEndFrame):
            await self.push_frame(TextFrame(self._aggregation))
            await self.push_frame(frame)
            self._aggregation = ""
        else:
            await self.push_frame(frame, direction)


class LLMContextAggregator(LLMResponseAggregator):
    def __init__(self, *, context: OpenAILLMContext, **kwargs):
        self._context = context
        super().__init__(**kwargs)

    async def _push_aggregation(self):
        if len(self._aggregation) > 0:
            self._context.add_message({"role": self._role, "content": self._aggregation})
            frame = OpenAILLMContextFrame(self._context)
            await self.push_frame(frame)

            # Reset our accumulator state.
            self._reset()


class LLMAssistantContextAggregator(LLMContextAggregator):
    def __init__(self, context: OpenAILLMContext):
        super().__init__(
            messages=[],
            context=context,
            role="assistant",
            start_frame=LLMResponseStartFrame,
            end_frame=LLMResponseEndFrame,
            accumulator_frame=TextFrame
        )


class LLMUserContextAggregator(LLMContextAggregator):
    def __init__(self, context: OpenAILLMContext):
        super().__init__(
            messages=[],
            context=context,
            role="user",
            start_frame=UserStartedSpeakingFrame,
            end_frame=UserStoppedSpeakingFrame,
            accumulator_frame=TranscriptionFrame,
            interim_accumulator_frame=InterimTranscriptionFrame
        )
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

from dataclasses import dataclass
import io
import json

from typing import List

from PIL import Image

from pipecat.frames.frames import Frame, VisionImageRawFrame

from openai._types import NOT_GIVEN, NotGiven

from openai.types.chat import (
    ChatCompletionToolParam,
    ChatCompletionToolChoiceOptionParam,
    ChatCompletionMessageParam
)

# JSON custom encoder to handle bytes arrays so that we can log contexts
# with images to the console.


class CustomEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, io.BytesIO):
            # Convert the first 8 bytes to an ASCII hex string
            return (f"{obj.getbuffer()[0:8].hex()}...")
        return super().default(obj)


class OpenAILLMContext:

    def __init__(
        self,
        messages: List[ChatCompletionMessageParam] | None = None,
        tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN,
        tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = NOT_GIVEN
    ):
        self.messages: List[ChatCompletionMessageParam] = messages if messages else []
        self.tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven = tool_choice
        self.tools: List[ChatCompletionToolParam] | NotGiven = tools

    @staticmethod
    def from_messages(messages: List[dict]) -> "OpenAILLMContext":
        context = OpenAILLMContext()
        for message in messages:
            context.add_message({
                "content": message["content"],
                "role": message["role"],
                "name": message["name"] if "name" in message else message["role"]
            })
        return context

    @staticmethod
    def from_image_frame(frame: VisionImageRawFrame) -> "OpenAILLMContext":
        """
        For images, we are deviating from the OpenAI messages shape. OpenAI
        expects images to be base64 encoded, but other vision models may not.
        So we'll store the image as bytes and do the base64 encoding as needed
        in the LLM service.
        """
        context = OpenAILLMContext()
        buffer = io.BytesIO()
        Image.frombytes(
            frame.format,
            frame.size,
            frame.image
        ).save(
            buffer,
            format="JPEG")
        context.add_message({
            "content": frame.text,
            "role": "user",
            "data": buffer,
            "mime_type": "image/jpeg"
        })
        return context

    def add_message(self, message: ChatCompletionMessageParam):
        self.messages.append(message)

    def get_messages(self) -> List[ChatCompletionMessageParam]:
        return self.messages

    def get_messages_json(self) -> str:
        return json.dumps(self.messages, cls=CustomEncoder)

    def set_tool_choice(
        self, tool_choice: ChatCompletionToolChoiceOptionParam | NotGiven
    ):
        self.tool_choice = tool_choice

    def set_tools(self, tools: List[ChatCompletionToolParam] | NotGiven = NOT_GIVEN):
        if tools != NOT_GIVEN and len(tools) == 0:
            tools = NOT_GIVEN

        self.tools = tools


@dataclass
class OpenAILLMContextFrame(Frame):
    """Like an LLMMessagesFrame, but with extra context specific to the OpenAI
    API. The context in this message is also mutable, and will be changed by the
    OpenAIContextAggregator frame processor.

    """
    context: OpenAILLMContext
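A short sketch of the context helpers (illustrative, not part of the upload): contexts can be built from plain message dicts and serialized for logging with the custom encoder, which hex-abbreviates BytesIO image payloads. The example message contents are made up:

    from pipecat.processors.aggregators.openai_llm_context import (
        OpenAILLMContext, OpenAILLMContextFrame)

    context = OpenAILLMContext.from_messages([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is Pipecat?"},
    ])
    context.add_message({"role": "assistant", "content": "A framework for voice pipelines."})
    print(context.get_messages_json())

    frame = OpenAILLMContextFrame(context)  # push this down a pipeline to run the LLM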
pipecat/processors/aggregators/sentence.py ADDED
@@ -0,0 +1,54 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import re

from pipecat.frames.frames import EndFrame, Frame, InterimTranscriptionFrame, TextFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class SentenceAggregator(FrameProcessor):
    """This frame processor aggregates text frames into complete sentences.

    Frame input/output:
        TextFrame("Hello,") -> None
        TextFrame(" world.") -> TextFrame("Hello, world.")

    Doctest:
    >>> async def print_frames(aggregator, frame):
    ...     async for frame in aggregator.process_frame(frame):
    ...         print(frame.text)

    >>> aggregator = SentenceAggregator()
    >>> asyncio.run(print_frames(aggregator, TextFrame("Hello,")))
    >>> asyncio.run(print_frames(aggregator, TextFrame(" world.")))
    Hello, world.
    """

    def __init__(self):
        super().__init__()
        self._aggregation = ""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        # We ignore interim transcriptions at this point.
        if isinstance(frame, InterimTranscriptionFrame):
            return

        if isinstance(frame, TextFrame):
            m = re.search("(.*[?.!])(.*)", frame.text)
            if m:
                await self.push_frame(TextFrame(self._aggregation + m.group(1)))
                self._aggregation = m.group(2)
            else:
                self._aggregation += frame.text
        elif isinstance(frame, EndFrame):
            if self._aggregation:
                await self.push_frame(TextFrame(self._aggregation))
            await self.push_frame(frame)
        else:
            await self.push_frame(frame, direction)
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
8
+ from pipecat.frames.frames import (
9
+ Frame,
10
+ InterimTranscriptionFrame,
11
+ StartInterruptionFrame,
12
+ TextFrame,
13
+ TranscriptionFrame,
14
+ UserStartedSpeakingFrame,
15
+ UserStoppedSpeakingFrame)
16
+
17
+
18
+ class ResponseAggregator(FrameProcessor):
19
+ """This frame processor aggregates frames between a start and an end frame
20
+ into complete text frame sentences.
21
+
22
+ For example, frame input/output:
23
+ UserStartedSpeakingFrame() -> None
24
+ TranscriptionFrame("Hello,") -> None
25
+ TranscriptionFrame(" world.") -> None
26
+ UserStoppedSpeakingFrame() -> TextFrame("Hello world.")
27
+
28
+ Doctest:
29
+ >>> async def print_frames(aggregator, frame):
30
+ ... async for frame in aggregator.process_frame(frame):
31
+ ... if isinstance(frame, TextFrame):
32
+ ... print(frame.text)
33
+
34
+ >>> aggregator = ResponseAggregator(start_frame = UserStartedSpeakingFrame,
35
+ ... end_frame=UserStoppedSpeakingFrame,
36
+ ... accumulator_frame=TranscriptionFrame,
37
+ ... interim_accumulator_frame=InterimTranscriptionFrame)
38
+ >>> asyncio.run(print_frames(aggregator, UserStartedSpeakingFrame()))
39
+ >>> asyncio.run(print_frames(aggregator, TranscriptionFrame("Hello,", 1, 1)))
40
+ >>> asyncio.run(print_frames(aggregator, TranscriptionFrame("world.", 1, 2)))
41
+ >>> asyncio.run(print_frames(aggregator, UserStoppedSpeakingFrame()))
42
+ Hello, world.
43
+
44
+ """
45
+
46
+ def __init__(
47
+ self,
48
+ *,
49
+ start_frame,
50
+ end_frame,
51
+ accumulator_frame: TextFrame,
52
+ interim_accumulator_frame: TextFrame | None = None
53
+ ):
54
+ super().__init__()
55
+
56
+ self._start_frame = start_frame
57
+ self._end_frame = end_frame
58
+ self._accumulator_frame = accumulator_frame
59
+ self._interim_accumulator_frame = interim_accumulator_frame
60
+
61
+ # Reset our accumulator state.
62
+ self._reset()
63
+
64
+ #
65
+ # Frame processor
66
+ #
67
+
68
+ # Use cases implemented:
69
+ #
70
+ # S: Start, E: End, T: Transcription, I: Interim, X: Text
71
+ #
72
+ # S E -> None
73
+ # S T E -> X
74
+ # S I T E -> X
75
+ # S I E T -> X
76
+ # S I E I T -> X
77
+ # S E T -> X
78
+ # S E I T -> X
79
+ #
80
+ # The following case would not be supported:
81
+ #
82
+ # S I E T1 I T2 -> X
83
+ #
84
+ # and T2 would be dropped.
85
+
86
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
87
+ await super().process_frame(frame, direction)
88
+
89
+ send_aggregation = False
90
+
91
+ if isinstance(frame, self._start_frame):
92
+ self._aggregating = True
93
+ self._seen_start_frame = True
94
+ self._seen_end_frame = False
95
+ self._seen_interim_results = False
96
+ await self.push_frame(frame, direction)
97
+ elif isinstance(frame, self._end_frame):
98
+ self._seen_end_frame = True
99
+ self._seen_start_frame = False
100
+
101
+ # We might have received the end frame but we might still be
102
+ # aggregating (i.e. we have seen interim results but not the final
103
+ # text).
104
+ self._aggregating = self._seen_interim_results or len(self._aggregation) == 0
105
+
106
+ # Send the aggregation if we are not aggregating anymore (i.e. no
107
+ # more interim results received).
108
+ send_aggregation = not self._aggregating
109
+ await self.push_frame(frame, direction)
110
+ elif isinstance(frame, self._accumulator_frame):
111
+ if self._aggregating:
112
+ self._aggregation += f" {frame.text}"
113
+ # We have received a complete sentence, so if we have seen the
114
+ # end frame and we were still aggregating, it means we should
115
+ # send the aggregation.
116
+ send_aggregation = self._seen_end_frame
117
+
118
+ # We just got our final result, so let's reset interim results.
119
+ self._seen_interim_results = False
120
+ elif self._interim_accumulator_frame and isinstance(frame, self._interim_accumulator_frame):
121
+ self._seen_interim_results = True
122
+ else:
123
+ await self.push_frame(frame, direction)
124
+
125
+ if send_aggregation:
126
+ await self._push_aggregation()
127
+
128
+ async def _push_aggregation(self):
129
+ if len(self._aggregation) > 0:
130
+ frame = TextFrame(self._aggregation.strip())
131
+
132
+ # Reset the aggregation. Reset it before pushing it down, otherwise
133
+ # if the tasks gets cancelled we won't be able to clear things up.
134
+ self._aggregation = ""
135
+
136
+ await self.push_frame(frame)
137
+
138
+ # Reset our accumulator state.
139
+ self._reset()
140
+
141
+ def _reset(self):
142
+ self._aggregation = ""
143
+ self._aggregating = False
144
+ self._seen_start_frame = False
145
+ self._seen_end_frame = False
146
+ self._seen_interim_results = False
147
+
148
+
149
+ class UserResponseAggregator(ResponseAggregator):
150
+ def __init__(self):
151
+ super().__init__(
152
+ start_frame=UserStartedSpeakingFrame,
153
+ end_frame=UserStoppedSpeakingFrame,
154
+ accumulator_frame=TranscriptionFrame,
155
+ interim_accumulator_frame=InterimTranscriptionFrame,
156
+ )
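
A sketch of the aggregation flow described in the docstring and the S/E/T/I cases above, driving the processor directly rather than inside a pipeline (not part of the uploaded file; frame constructors follow the doctest above, and the user id/timestamp values are made up):

    import asyncio

    from pipecat.frames.frames import (TranscriptionFrame, UserStartedSpeakingFrame,
                                       UserStoppedSpeakingFrame)
    from pipecat.processors.aggregators.user_response import UserResponseAggregator
    from pipecat.processors.frame_processor import FrameDirection
    from pipecat.processors.logger import FrameLogger


    async def main():
        aggregator = UserResponseAggregator()
        aggregator.link(FrameLogger("Aggregated"))  # logs every frame it receives

        down = FrameDirection.DOWNSTREAM
        await aggregator.process_frame(UserStartedSpeakingFrame(), down)
        await aggregator.process_frame(TranscriptionFrame("Hello,", "user-1", "t0"), down)
        await aggregator.process_frame(TranscriptionFrame("world.", "user-1", "t1"), down)
        # The accumulated TextFrame("Hello, world.") is pushed once the user stops.
        await aggregator.process_frame(UserStoppedSpeakingFrame(), down)


    asyncio.run(main())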
pipecat/processors/aggregators/vision_image_frame.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from pipecat.frames.frames import Frame, ImageRawFrame, TextFrame, VisionImageRawFrame
8
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
9
+
10
+
11
+ class VisionImageFrameAggregator(FrameProcessor):
12
+ """This aggregator waits for a consecutive TextFrame and an
13
+ ImageFrame. After the ImageFrame arrives it will output a VisionImageFrame.
14
+
15
+ >>> from pipecat.pipeline.frames import ImageFrame
16
+
17
+ >>> async def print_frames(aggregator, frame):
18
+ ... async for frame in aggregator.process_frame(frame):
19
+ ... print(frame)
20
+
21
+ >>> aggregator = VisionImageFrameAggregator()
22
+ >>> asyncio.run(print_frames(aggregator, TextFrame("What do you see?")))
23
+ >>> asyncio.run(print_frames(aggregator, ImageFrame(image=bytes([]), size=(0, 0))))
24
+ VisionImageFrame, text: What do you see?, image size: 0x0, buffer size: 0 B
25
+
26
+ """
27
+
28
+ def __init__(self):
29
+ super().__init__()
30
+ self._describe_text = None
31
+
32
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
33
+ await super().process_frame(frame, direction)
34
+
35
+ if isinstance(frame, TextFrame):
36
+ self._describe_text = frame.text
37
+ elif isinstance(frame, ImageRawFrame):
38
+ if self._describe_text:
39
+ frame = VisionImageRawFrame(
40
+ text=self._describe_text,
41
+ image=frame.image,
42
+ size=frame.size,
43
+ format=frame.format)
44
+ await self.push_frame(frame)
45
+ self._describe_text = None
46
+ else:
47
+ await self.push_frame(frame, direction)
pipecat/processors/async_frame_processor.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import asyncio
8
+
9
+ from pipecat.frames.frames import EndFrame, Frame, StartInterruptionFrame
10
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
11
+
12
+
13
+ class AsyncFrameProcessor(FrameProcessor):
14
+
15
+ def __init__(
16
+ self,
17
+ *,
18
+ name: str | None = None,
19
+ loop: asyncio.AbstractEventLoop | None = None,
20
+ **kwargs):
21
+ super().__init__(name=name, loop=loop, **kwargs)
22
+
23
+ self._create_push_task()
24
+
25
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
26
+ await super().process_frame(frame, direction)
27
+
28
+ if isinstance(frame, StartInterruptionFrame):
29
+ await self._handle_interruptions(frame)
30
+
31
+ async def queue_frame(
32
+ self,
33
+ frame: Frame,
34
+ direction: FrameDirection = FrameDirection.DOWNSTREAM):
35
+ await self._push_queue.put((frame, direction))
36
+
37
+ async def cleanup(self):
38
+ self._push_frame_task.cancel()
39
+ await self._push_frame_task
40
+
41
+ async def _handle_interruptions(self, frame: Frame):
42
+ # Cancel the task. This will stop pushing frames downstream.
43
+ self._push_frame_task.cancel()
44
+ await self._push_frame_task
45
+ # Push an out-of-band frame (i.e. not using the ordered push
46
+ # frame task).
47
+ await self.push_frame(frame)
48
+ # Create a new queue and task.
49
+ self._create_push_task()
50
+
51
+ def _create_push_task(self):
52
+ self._push_queue = asyncio.Queue()
53
+ self._push_frame_task = self.get_event_loop().create_task(self._push_frame_task_handler())
54
+
55
+ async def _push_frame_task_handler(self):
56
+ running = True
57
+ while running:
58
+ try:
59
+ (frame, direction) = await self._push_queue.get()
60
+ await self.push_frame(frame, direction)
61
+ running = not isinstance(frame, EndFrame)
62
+ except asyncio.CancelledError:
63
+ break
pipecat/processors/filters/__init__.py ADDED
File without changes
pipecat/processors/filters/frame_filter.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from typing import List
8
+
9
+ from pipecat.frames.frames import AppFrame, ControlFrame, Frame, SystemFrame
10
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
11
+
12
+
13
+ class FrameFilter(FrameProcessor):
14
+
15
+ def __init__(self, types: List[type]):
16
+ super().__init__()
17
+ self._types = types
18
+
19
+ #
20
+ # Frame processor
21
+ #
22
+
23
+ def _should_passthrough_frame(self, frame):
24
+ for t in self._types:
25
+ if isinstance(frame, t):
26
+ return True
27
+
28
+ return (isinstance(frame, AppFrame)
29
+ or isinstance(frame, ControlFrame)
30
+ or isinstance(frame, SystemFrame))
31
+
32
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
33
+ await super().process_frame(frame, direction)
34
+
35
+ if self._should_passthrough_frame(frame):
36
+ await self.push_frame(frame, direction)
pipecat/processors/filters/function_filter.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from typing import Awaitable, Callable
8
+
9
+ from pipecat.frames.frames import Frame, SystemFrame
10
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
11
+
12
+
13
+ class FunctionFilter(FrameProcessor):
14
+
15
+ def __init__(self, filter: Callable[[Frame], Awaitable[bool]]):
16
+ super().__init__()
17
+ self._filter = filter
18
+
19
+ #
20
+ # Frame processor
21
+ #
22
+
23
+ def _should_passthrough_frame(self, frame):
24
+ return isinstance(frame, SystemFrame)
25
+
26
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
27
+ passthrough = self._should_passthrough_frame(frame)
28
+ allowed = await self._filter(frame)
29
+ if passthrough or allowed:
30
+ await self.push_frame(frame, direction)
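
The filter predicate is an async callable that receives each frame; SystemFrames always pass regardless of its result. A minimal sketch (not part of the uploaded file):

    import asyncio

    from pipecat.frames.frames import Frame, TextFrame
    from pipecat.processors.filters.function_filter import FunctionFilter
    from pipecat.processors.frame_processor import FrameDirection


    async def non_empty_text(frame: Frame) -> bool:
        # Allow only TextFrames that actually carry some text.
        return isinstance(frame, TextFrame) and len(frame.text.strip()) > 0


    async def main():
        # FrameProcessor picks up the running event loop, so construct it here;
        # in practice the filter would be linked into a pipeline.
        text_filter = FunctionFilter(filter=non_empty_text)
        await text_filter.process_frame(TextFrame("   "), FrameDirection.DOWNSTREAM)    # dropped
        await text_filter.process_frame(TextFrame("hello"), FrameDirection.DOWNSTREAM)  # allowed through


    asyncio.run(main())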
pipecat/processors/filters/wake_check_filter.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import re
8
+ import time
9
+
10
+ from enum import Enum
11
+
12
+ from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
13
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
14
+
15
+ from loguru import logger
16
+
17
+
18
+ class WakeCheckFilter(FrameProcessor):
19
+ """
20
+ This filter looks for wake phrases in the transcription frames and only passes through frames
21
+ after a wake phrase has been detected. It also has a keepalive timeout to allow for a brief
22
+ period of continued conversation after a wake phrase has been detected.
23
+ """
24
+ class WakeState(Enum):
25
+ IDLE = 1
26
+ AWAKE = 2
27
+
28
+ class ParticipantState:
29
+ def __init__(self, participant_id: str):
30
+ self.participant_id = participant_id
31
+ self.state = WakeCheckFilter.WakeState.IDLE
32
+ self.wake_timer = 0.0
33
+ self.accumulator = ""
34
+
35
+ def __init__(self, wake_phrases: list[str], keepalive_timeout: float = 3):
36
+ super().__init__()
37
+ self._participant_states = {}
38
+ self._keepalive_timeout = keepalive_timeout
39
+ self._wake_patterns = []
40
+ for name in wake_phrases:
41
+ pattern = re.compile(r'\b' + r'\s*'.join(re.escape(word)
42
+ for word in name.split()) + r'\b', re.IGNORECASE)
43
+ self._wake_patterns.append(pattern)
44
+
45
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
46
+ await super().process_frame(frame, direction)
47
+
48
+ try:
49
+ if isinstance(frame, TranscriptionFrame):
50
+ p = self._participant_states.get(frame.user_id)
51
+ if p is None:
52
+ p = WakeCheckFilter.ParticipantState(frame.user_id)
53
+ self._participant_states[frame.user_id] = p
54
+
55
+ # If we have been AWAKE within the last keepalive_timeout seconds, pass
56
+ # the frame through
57
+ if p.state == WakeCheckFilter.WakeState.AWAKE:
58
+ if time.time() - p.wake_timer < self._keepalive_timeout:
59
+ logger.debug(
60
+ f"Wake phrase keepalive timeout has not expired. Pushing {frame}")
61
+ p.wake_timer = time.time()
62
+ await self.push_frame(frame)
63
+ return
64
+ else:
65
+ p.state = WakeCheckFilter.WakeState.IDLE
66
+
67
+ p.accumulator += frame.text
68
+ for pattern in self._wake_patterns:
69
+ match = pattern.search(p.accumulator)
70
+ if match:
71
+ logger.debug(f"Wake phrase triggered: {match.group()}")
72
+ # Found the wake word. Discard from the accumulator up to the start of the match
73
+ # and modify the frame in place.
74
+ p.state = WakeCheckFilter.WakeState.AWAKE
75
+ p.wake_timer = time.time()
76
+ frame.text = p.accumulator[match.start():]
77
+ p.accumulator = ""
78
+ await self.push_frame(frame)
79
+ else:
80
+ pass
81
+ else:
82
+ await self.push_frame(frame, direction)
83
+ except Exception as e:
84
+ error_msg = f"Error in wake word filter: {e}"
85
+ logger.exception(error_msg)
86
+ await self.push_error(ErrorFrame(error_msg))
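
A sketch of the wake-word behaviour described in the docstring, again driving the processor directly (not part of the uploaded file; the wake phrase, user id and timestamps are made up):

    import asyncio

    from pipecat.frames.frames import TranscriptionFrame
    from pipecat.processors.filters.wake_check_filter import WakeCheckFilter
    from pipecat.processors.frame_processor import FrameDirection
    from pipecat.processors.logger import FrameLogger


    async def main():
        # Forward transcriptions only after "hey robot" is heard, and keep the
        # conversation awake for 5 seconds after each forwarded frame.
        wake_filter = WakeCheckFilter(["hey robot"], keepalive_timeout=5)
        wake_filter.link(FrameLogger("Awake"))

        down = FrameDirection.DOWNSTREAM
        # Dropped: no wake phrase seen yet.
        await wake_filter.process_frame(TranscriptionFrame("what time is it", "u1", "t0"), down)
        # Forwarded, with the text trimmed to start at the wake phrase match.
        await wake_filter.process_frame(
            TranscriptionFrame("ok hey robot what's the weather", "u1", "t1"), down)


    asyncio.run(main())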
pipecat/processors/frame_processor.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import asyncio
8
+ import time
9
+
10
+ from enum import Enum
11
+
12
+ from pipecat.frames.frames import ErrorFrame, Frame, MetricsFrame, StartFrame, StartInterruptionFrame, UserStoppedSpeakingFrame
13
+ from pipecat.utils.utils import obj_count, obj_id
14
+
15
+ from loguru import logger
16
+
17
+
18
+ class FrameDirection(Enum):
19
+ DOWNSTREAM = 1
20
+ UPSTREAM = 2
21
+
22
+
23
+ class FrameProcessorMetrics:
24
+ def __init__(self, name: str):
25
+ self._name = name
26
+ self._start_ttfb_time = 0
27
+ self._start_processing_time = 0
28
+ self._should_report_ttfb = True
29
+
30
+ async def start_ttfb_metrics(self, report_only_initial_ttfb):
31
+ if self._should_report_ttfb:
32
+ self._start_ttfb_time = time.time()
33
+ self._should_report_ttfb = not report_only_initial_ttfb
34
+
35
+ async def stop_ttfb_metrics(self):
36
+ if self._start_ttfb_time == 0:
37
+ return None
38
+
39
+ value = time.time() - self._start_ttfb_time
40
+ logger.debug(f"{self._name} TTFB: {value}")
41
+ ttfb = {
42
+ "processor": self._name,
43
+ "value": value
44
+ }
45
+ self._start_ttfb_time = 0
46
+ return MetricsFrame(ttfb=[ttfb])
47
+
48
+ async def start_processing_metrics(self):
49
+ self._start_processing_time = time.time()
50
+
51
+ async def stop_processing_metrics(self):
52
+ if self._start_processing_time == 0:
53
+ return None
54
+
55
+ value = time.time() - self._start_processing_time
56
+ logger.debug(f"{self._name} processing time: {value}")
57
+ processing = {
58
+ "processor": self._name,
59
+ "value": value
60
+ }
61
+ self._start_processing_time = 0
62
+ return MetricsFrame(processing=[processing])
63
+
64
+
65
+ class FrameProcessor:
66
+
67
+ def __init__(
68
+ self,
69
+ *,
70
+ name: str | None = None,
71
+ loop: asyncio.AbstractEventLoop | None = None,
72
+ **kwargs):
73
+ self.id: int = obj_id()
74
+ self.name = name or f"{self.__class__.__name__}#{obj_count(self)}"
75
+ self._prev: "FrameProcessor" | None = None
76
+ self._next: "FrameProcessor" | None = None
77
+ self._loop: asyncio.AbstractEventLoop = loop or asyncio.get_running_loop()
78
+
79
+ # Properties
80
+ self._allow_interruptions = False
81
+ self._enable_metrics = False
82
+ self._report_only_initial_ttfb = False
83
+
84
+ # Metrics
85
+ self._metrics = FrameProcessorMetrics(name=self.name)
86
+
87
+ @property
88
+ def interruptions_allowed(self):
89
+ return self._allow_interruptions
90
+
91
+ @property
92
+ def metrics_enabled(self):
93
+ return self._enable_metrics
94
+
95
+ @property
96
+ def report_only_initial_ttfb(self):
97
+ return self._report_only_initial_ttfb
98
+
99
+ def can_generate_metrics(self) -> bool:
100
+ return False
101
+
102
+ async def start_ttfb_metrics(self):
103
+ if self.can_generate_metrics() and self.metrics_enabled:
104
+ await self._metrics.start_ttfb_metrics(self._report_only_initial_ttfb)
105
+
106
+ async def stop_ttfb_metrics(self):
107
+ if self.can_generate_metrics() and self.metrics_enabled:
108
+ frame = await self._metrics.stop_ttfb_metrics()
109
+ if frame:
110
+ await self.push_frame(frame)
111
+
112
+ async def start_processing_metrics(self):
113
+ if self.can_generate_metrics() and self.metrics_enabled:
114
+ await self._metrics.start_processing_metrics()
115
+
116
+ async def stop_processing_metrics(self):
117
+ if self.can_generate_metrics() and self.metrics_enabled:
118
+ frame = await self._metrics.stop_processing_metrics()
119
+ if frame:
120
+ await self.push_frame(frame)
121
+
122
+ async def stop_all_metrics(self):
123
+ await self.stop_ttfb_metrics()
124
+ await self.stop_processing_metrics()
125
+
126
+ async def cleanup(self):
127
+ pass
128
+
129
+ def link(self, processor: 'FrameProcessor'):
130
+ self._next = processor
131
+ processor._prev = self
132
+ logger.debug(f"Linking {self} -> {self._next}")
133
+
134
+ def get_event_loop(self) -> asyncio.AbstractEventLoop:
135
+ return self._loop
136
+
137
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
138
+ if isinstance(frame, StartFrame):
139
+ self._allow_interruptions = frame.allow_interruptions
140
+ self._enable_metrics = frame.enable_metrics
141
+ self._report_only_initial_ttfb = frame.report_only_initial_ttfb
142
+ elif isinstance(frame, StartInterruptionFrame):
143
+ await self.stop_all_metrics()
144
+ elif isinstance(frame, UserStoppedSpeakingFrame):
145
+ self._should_report_ttfb = True
146
+
147
+ async def push_error(self, error: ErrorFrame):
148
+ await self.push_frame(error, FrameDirection.UPSTREAM)
149
+
150
+ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
151
+ try:
152
+ if direction == FrameDirection.DOWNSTREAM and self._next:
153
+ logger.trace(f"Pushing {frame} from {self} to {self._next}")
154
+ await self._next.process_frame(frame, direction)
155
+ elif direction == FrameDirection.UPSTREAM and self._prev:
156
+ logger.trace(f"Pushing {frame} upstream from {self} to {self._prev}")
157
+ await self._prev.process_frame(frame, direction)
158
+ except Exception as e:
159
+ logger.exception(f"Uncaught exception in {self}: {e}")
160
+
161
+ def __str__(self):
162
+ return self.name
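
The contract for subclasses is: call super().process_frame() first (it applies StartFrame properties and the interruption/metrics bookkeeping above), then do your own work and forward frames with push_frame(). A minimal custom processor might look like this (a sketch, not part of the uploaded file):

    import asyncio

    from pipecat.frames.frames import Frame, TextFrame
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


    class FrameCounter(FrameProcessor):
        # Hypothetical example processor: counts frames and forwards them unchanged.
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.count = 0

        async def process_frame(self, frame: Frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            self.count += 1
            # Forward the frame in the direction it was already travelling.
            await self.push_frame(frame, direction)


    async def main():
        counter = FrameCounter()
        for i in range(3):
            await counter.process_frame(TextFrame(f"chunk {i}"), FrameDirection.DOWNSTREAM)
        print(counter.count)  # 3


    asyncio.run(main())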
pipecat/processors/frameworks/__init__.py ADDED
File without changes
pipecat/processors/frameworks/langchain.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from typing import Union
8
+
9
+ from pipecat.frames.frames import (
10
+ Frame,
11
+ LLMFullResponseEndFrame,
12
+ LLMFullResponseStartFrame,
13
+ LLMMessagesFrame,
14
+ LLMResponseEndFrame,
15
+ LLMResponseStartFrame,
16
+ TextFrame)
17
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
18
+
19
+ from loguru import logger
20
+
21
+ try:
22
+ from langchain_core.messages import AIMessageChunk
23
+ from langchain_core.runnables import Runnable
24
+ except ModuleNotFoundError as e:
25
+ logger.exception(
26
+ "In order to use Langchain, you need to `pip install pipecat-ai[langchain]`. "
27
+ )
28
+ raise Exception(f"Missing module: {e}")
29
+
30
+
31
+ class LangchainProcessor(FrameProcessor):
32
+ def __init__(self, chain: Runnable, transcript_key: str = "input"):
33
+ super().__init__()
34
+ self._chain = chain
35
+ self._transcript_key = transcript_key
36
+ self._participant_id: str | None = None
37
+
38
+ def set_participant_id(self, participant_id: str):
39
+ self._participant_id = participant_id
40
+
41
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
42
+ await super().process_frame(frame, direction)
43
+
44
+ if isinstance(frame, LLMMessagesFrame):
45
+ # Messages are accumulated by the `LLMUserResponseAggregator` in a list of messages.
46
+ # The last one by the human is the one we want to send to the LLM.
47
+ logger.debug(f"Got transcription frame {frame}")
48
+ text: str = frame.messages[-1]["content"]
49
+
50
+ await self._ainvoke(text.strip())
51
+ else:
52
+ await self.push_frame(frame, direction)
53
+
54
+ @staticmethod
55
+ def __get_token_value(text: Union[str, AIMessageChunk]) -> str:
56
+ match text:
57
+ case str():
58
+ return text
59
+ case AIMessageChunk():
60
+ return text.content
61
+ case _:
62
+ return ""
63
+
64
+ async def _ainvoke(self, text: str):
65
+ logger.debug(f"Invoking chain with {text}")
66
+ await self.push_frame(LLMFullResponseStartFrame())
67
+ try:
68
+ async for token in self._chain.astream(
69
+ {self._transcript_key: text},
70
+ config={"configurable": {"session_id": self._participant_id}},
71
+ ):
72
+ await self.push_frame(LLMResponseStartFrame())
73
+ await self.push_frame(TextFrame(self.__get_token_value(token)))
74
+ await self.push_frame(LLMResponseEndFrame())
75
+ except GeneratorExit:
76
+ logger.warning(f"{self} generator was closed prematurely")
77
+ except Exception as e:
78
+ logger.exception(f"{self} an unknown error occurred: {e}")
79
+ finally:
80
+ await self.push_frame(LLMFullResponseEndFrame())
pipecat/processors/logger.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from pipecat.frames.frames import Frame
8
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
9
+ from loguru import logger
10
+ from typing import Optional
11
+ logger = logger.opt(ansi=True)
12
+
13
+
14
+ class FrameLogger(FrameProcessor):
15
+ def __init__(self, prefix="Frame", color: Optional[str] = None):
16
+ super().__init__()
17
+ self._prefix = prefix
18
+ self._color = color
19
+
20
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
21
+ dir = "<" if direction is FrameDirection.UPSTREAM else ">"
22
+ msg = f"{dir} {self._prefix}: {frame}"
23
+ if self._color:
24
+ msg = f"<{self._color}>{msg}</>"
25
+ logger.debug(msg)
26
+
27
+ await self.push_frame(frame, direction)
pipecat/processors/text_transformer.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from typing import Coroutine
8
+
9
+ from pipecat.frames.frames import Frame, TextFrame
10
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
11
+
12
+
13
+ class StatelessTextTransformer(FrameProcessor):
14
+ """This processor calls the given function on any text in a text frame.
15
+
16
+ >>> async def print_frames(aggregator, frame):
17
+ ... async for frame in aggregator.process_frame(frame):
18
+ ... print(frame.text)
19
+
20
+ >>> aggregator = StatelessTextTransformer(lambda x: x.upper())
21
+ >>> asyncio.run(print_frames(aggregator, TextFrame("Hello")))
22
+ HELLO
23
+ """
24
+
25
+ def __init__(self, transform_fn):
26
+ super().__init__()
27
+ self._transform_fn = transform_fn
28
+
29
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
30
+ await super().process_frame(frame, direction)
31
+
32
+ if isinstance(frame, TextFrame):
33
+ result = self._transform_fn(frame.text)
34
+ if isinstance(result, Coroutine):
35
+ result = await result
36
+ await self.push_frame(TextFrame(result))
37
+ else:
38
+ await self.push_frame(frame, direction)
pipecat/serializers/__init__.py ADDED
File without changes
pipecat/serializers/base_serializer.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from abc import ABC, abstractmethod
8
+
9
+ from pipecat.frames.frames import Frame
10
+
11
+
12
+ class FrameSerializer(ABC):
13
+
14
+ @abstractmethod
15
+ def serialize(self, frame: Frame) -> str | bytes | None:
16
+ pass
17
+
18
+ @abstractmethod
19
+ def deserialize(self, data: str | bytes) -> Frame | None:
20
+ pass
pipecat/serializers/protobuf.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import dataclasses
8
+
9
+ import pipecat.frames.protobufs.frames_pb2 as frame_protos
10
+
11
+ from pipecat.frames.frames import AudioRawFrame, Frame, TextFrame, TranscriptionFrame
12
+ from pipecat.serializers.base_serializer import FrameSerializer
13
+
14
+ from loguru import logger
15
+
16
+
17
+ class ProtobufFrameSerializer(FrameSerializer):
18
+ SERIALIZABLE_TYPES = {
19
+ TextFrame: "text",
20
+ AudioRawFrame: "audio",
21
+ TranscriptionFrame: "transcription"
22
+ }
23
+
24
+ SERIALIZABLE_FIELDS = {v: k for k, v in SERIALIZABLE_TYPES.items()}
25
+
26
+ def __init__(self):
27
+ pass
28
+
29
+ def serialize(self, frame: Frame) -> str | bytes | None:
30
+ proto_frame = frame_protos.Frame()
31
+ if type(frame) not in self.SERIALIZABLE_TYPES:
32
+ raise ValueError(
33
+ f"Frame type {type(frame)} is not serializable. You may need to add it to ProtobufFrameSerializer.SERIALIZABLE_FIELDS.")
34
+
35
+ # ignoring linter errors; we check that type(frame) is in this dict above
36
+ proto_optional_name = self.SERIALIZABLE_TYPES[type(frame)] # type: ignore
37
+ for field in dataclasses.fields(frame): # type: ignore
38
+ setattr(getattr(proto_frame, proto_optional_name), field.name,
39
+ getattr(frame, field.name))
40
+
41
+ result = proto_frame.SerializeToString()
42
+ return result
43
+
44
+ def deserialize(self, data: str | bytes) -> Frame | None:
45
+ """Returns a Frame object from a Frame protobuf. Used to convert frames
46
+ passed over the wire as protobufs to Frame objects used in pipelines
47
+ and frame processors.
48
+
49
+ >>> serializer = ProtobufFrameSerializer()
50
+ >>> serializer.deserialize(
51
+ ... serializer.serialize(AudioFrame(data=b'1234567890')))
52
+ AudioFrame(data=b'1234567890')
53
+
54
+ >>> serializer.deserialize(
55
+ ... serializer.serialize(TextFrame(text='hello world')))
56
+ TextFrame(text='hello world')
57
+
58
+ >>> serializer.deserialize(serializer.serialize(TranscriptionFrame(
59
+ ... text="Hello there!", participantId="123", timestamp="2021-01-01")))
60
+ TranscriptionFrame(text='Hello there!', participantId='123', timestamp='2021-01-01')
61
+ """
62
+
63
+ proto = frame_protos.Frame.FromString(data)
64
+ which = proto.WhichOneof("frame")
65
+ if which not in self.SERIALIZABLE_FIELDS:
66
+ logger.error("Unable to deserialize a valid frame")
67
+ return None
68
+
69
+ class_name = self.SERIALIZABLE_FIELDS[which]
70
+ args = getattr(proto, which)
71
+ args_dict = {}
72
+ for field in proto.DESCRIPTOR.fields_by_name[which].message_type.fields:
73
+ args_dict[field.name] = getattr(args, field.name)
74
+
75
+ # Remove special fields if needed
76
+ id = getattr(args, "id")
77
+ name = getattr(args, "name")
78
+ if not id:
79
+ del args_dict["id"]
80
+ if not name:
81
+ del args_dict["name"]
82
+
83
+ # Create the instance
84
+ instance = class_name(**args_dict)
85
+
86
+ # Set special fields
87
+ if id:
88
+ setattr(instance, "id", getattr(args, "id"))
89
+ if name:
90
+ setattr(instance, "name", getattr(args, "name"))
91
+
92
+ return instance
pipecat/serializers/twilio.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import base64
8
+ import json
9
+
10
+ from pipecat.frames.frames import AudioRawFrame, Frame
11
+ from pipecat.serializers.base_serializer import FrameSerializer
12
+ from pipecat.utils.audio import ulaw_8000_to_pcm_16000, pcm_16000_to_ulaw_8000
13
+
14
+
15
+ class TwilioFrameSerializer(FrameSerializer):
16
+ SERIALIZABLE_TYPES = {
17
+ AudioRawFrame: "audio",
18
+ }
19
+
20
+ def __init__(self, stream_sid: str):
21
+ self._stream_sid = stream_sid
22
+
23
+ def serialize(self, frame: Frame) -> str | bytes | None:
24
+ if not isinstance(frame, AudioRawFrame):
25
+ return None
26
+
27
+ data = frame.audio
28
+
29
+ serialized_data = pcm_16000_to_ulaw_8000(data)
30
+ payload = base64.b64encode(serialized_data).decode("utf-8")
31
+ answer = {
32
+ "event": "media",
33
+ "streamSid": self._stream_sid,
34
+ "media": {
35
+ "payload": payload
36
+ }
37
+ }
38
+
39
+ return json.dumps(answer)
40
+
41
+ def deserialize(self, data: str | bytes) -> Frame | None:
42
+ message = json.loads(data)
43
+
44
+ if message["event"] != "media":
45
+ return None
46
+ else:
47
+ payload_base64 = message["media"]["payload"]
48
+ payload = base64.b64decode(payload_base64)
49
+
50
+ deserialized_data = ulaw_8000_to_pcm_16000(payload)
51
+ audio_frame = AudioRawFrame(audio=deserialized_data, num_channels=1, sample_rate=16000)
52
+ return audio_frame
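
Roughly how this serializer is used on a Twilio media-stream websocket (a sketch, not part of the uploaded file; the stream SID is a placeholder): serialize() wraps 16 kHz mono PCM as an 8 kHz u-law "media" event, and deserialize() does the reverse.

    from pipecat.frames.frames import AudioRawFrame
    from pipecat.serializers.twilio import TwilioFrameSerializer

    serializer = TwilioFrameSerializer(stream_sid="MZ0000000000000000000000000000000000")

    # Outbound: 10 ms of 16 kHz mono PCM silence becomes a JSON "media" event
    # carrying base64-encoded 8 kHz u-law audio, ready to send on the websocket.
    frame = AudioRawFrame(audio=b"\x00\x00" * 160, sample_rate=16000, num_channels=1)
    media_event = serializer.serialize(frame)

    # Inbound: a "media" event from Twilio becomes a 16 kHz mono AudioRawFrame.
    incoming = serializer.deserialize(media_event)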
pipecat/services/__init__.py ADDED
File without changes
pipecat/services/ai_services.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import io
8
+ import wave
9
+
10
+ from abc import abstractmethod
11
+ from typing import AsyncGenerator
12
+
13
+ from pipecat.frames.frames import (
14
+ AudioRawFrame,
15
+ CancelFrame,
16
+ EndFrame,
17
+ ErrorFrame,
18
+ Frame,
19
+ LLMFullResponseEndFrame,
20
+ StartFrame,
21
+ StartInterruptionFrame,
22
+ TTSStartedFrame,
23
+ TTSStoppedFrame,
24
+ TextFrame,
25
+ VisionImageRawFrame,
26
+ )
27
+ from pipecat.processors.async_frame_processor import AsyncFrameProcessor
28
+ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
29
+ from pipecat.utils.audio import calculate_audio_volume
30
+ from pipecat.utils.utils import exp_smoothing
31
+
32
+
33
+ class AIService(FrameProcessor):
34
+ def __init__(self, **kwargs):
35
+ super().__init__(**kwargs)
36
+
37
+ async def start(self, frame: StartFrame):
38
+ pass
39
+
40
+ async def stop(self, frame: EndFrame):
41
+ pass
42
+
43
+ async def cancel(self, frame: CancelFrame):
44
+ pass
45
+
46
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
47
+ await super().process_frame(frame, direction)
48
+
49
+ if isinstance(frame, StartFrame):
50
+ await self.start(frame)
51
+ elif isinstance(frame, CancelFrame):
52
+ await self.cancel(frame)
53
+ elif isinstance(frame, EndFrame):
54
+ await self.stop(frame)
55
+
56
+ async def process_generator(self, generator: AsyncGenerator[Frame, None]):
57
+ async for f in generator:
58
+ if isinstance(f, ErrorFrame):
59
+ await self.push_error(f)
60
+ else:
61
+ await self.push_frame(f)
62
+
63
+
64
+ class AsyncAIService(AsyncFrameProcessor):
65
+ def __init__(self, **kwargs):
66
+ super().__init__(**kwargs)
67
+
68
+ async def start(self, frame: StartFrame):
69
+ pass
70
+
71
+ async def stop(self, frame: EndFrame):
72
+ pass
73
+
74
+ async def cancel(self, frame: CancelFrame):
75
+ pass
76
+
77
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
78
+ await super().process_frame(frame, direction)
79
+
80
+ if isinstance(frame, StartFrame):
81
+ await self.start(frame)
82
+ elif isinstance(frame, CancelFrame):
83
+ await self.cancel(frame)
84
+ elif isinstance(frame, EndFrame):
85
+ await self.stop(frame)
86
+
87
+
88
+ class LLMService(AIService):
89
+ """This class is a no-op but serves as a base class for LLM services."""
90
+
91
+ def __init__(self, **kwargs):
92
+ super().__init__(**kwargs)
93
+ self._callbacks = {}
94
+ self._start_callbacks = {}
95
+
96
+ # TODO-CB: callback function type
97
+ def register_function(self, function_name: str, callback, start_callback=None):
98
+ self._callbacks[function_name] = callback
99
+ if start_callback:
100
+ self._start_callbacks[function_name] = start_callback
101
+
102
+ def unregister_function(self, function_name: str):
103
+ del self._callbacks[function_name]
104
+ if self._start_callbacks[function_name]:
105
+ del self._start_callbacks[function_name]
106
+
107
+ def has_function(self, function_name: str):
108
+ return function_name in self._callbacks.keys()
109
+
110
+ async def call_function(self, function_name: str, args):
111
+ if function_name in self._callbacks.keys():
112
+ return await self._callbacks[function_name](self, args)
113
+ return None
114
+
115
+ async def call_start_function(self, function_name: str):
116
+ if function_name in self._start_callbacks.keys():
117
+ await self._start_callbacks[function_name](self)
118
+
119
+
120
+ class TTSService(AIService):
121
+ def __init__(self, *, aggregate_sentences: bool = True, **kwargs):
122
+ super().__init__(**kwargs)
123
+ self._aggregate_sentences: bool = aggregate_sentences
124
+ self._current_sentence: str = ""
125
+
126
+ # Converts the text to audio.
127
+ @abstractmethod
128
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
129
+ pass
130
+
131
+ async def say(self, text: str):
132
+ await self.process_frame(TextFrame(text=text), FrameDirection.DOWNSTREAM)
133
+
134
+ async def _process_text_frame(self, frame: TextFrame):
135
+ text: str | None = None
136
+ if not self._aggregate_sentences:
137
+ text = frame.text
138
+ else:
139
+ self._current_sentence += frame.text
140
+ if self._current_sentence.strip().endswith(
141
+ (".", "?", "!")) and not self._current_sentence.strip().endswith(
142
+ ("Mr,", "Mrs.", "Ms.", "Dr.")):
143
+ text = self._current_sentence
144
+ self._current_sentence = ""
145
+
146
+ if text:
147
+ await self._push_tts_frames(text)
148
+
149
+ async def _push_tts_frames(self, text: str):
150
+ text = text.strip()
151
+ if not text:
152
+ return
153
+
154
+ await self.push_frame(TTSStartedFrame())
155
+ await self.start_processing_metrics()
156
+ await self.process_generator(self.run_tts(text))
157
+ await self.stop_processing_metrics()
158
+ await self.push_frame(TTSStoppedFrame())
159
+ # We send the original text after the audio. This way, if we are
160
+ # interrupted, the text is not added to the assistant context.
161
+ await self.push_frame(TextFrame(text))
162
+
163
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
164
+ await super().process_frame(frame, direction)
165
+
166
+ if isinstance(frame, TextFrame):
167
+ await self._process_text_frame(frame)
168
+ elif isinstance(frame, StartInterruptionFrame):
169
+ self._current_sentence = ""
170
+ await self.push_frame(frame, direction)
171
+ elif isinstance(frame, LLMFullResponseEndFrame) or isinstance(frame, EndFrame):
172
+ self._current_sentence = ""
173
+ await self._push_tts_frames(self._current_sentence)
174
+ await self.push_frame(frame)
175
+ else:
176
+ await self.push_frame(frame, direction)
177
+
178
+
179
+ class STTService(AIService):
180
+ """STTService is a base class for speech-to-text services."""
181
+
182
+ def __init__(self,
183
+ *,
184
+ min_volume: float = 0.6,
185
+ max_silence_secs: float = 0.3,
186
+ max_buffer_secs: float = 1.5,
187
+ sample_rate: int = 16000,
188
+ num_channels: int = 1,
189
+ **kwargs):
190
+ super().__init__(**kwargs)
191
+ self._min_volume = min_volume
192
+ self._max_silence_secs = max_silence_secs
193
+ self._max_buffer_secs = max_buffer_secs
194
+ self._sample_rate = sample_rate
195
+ self._num_channels = num_channels
196
+ (self._content, self._wave) = self._new_wave()
197
+ self._silence_num_frames = 0
198
+ # Volume exponential smoothing
199
+ self._smoothing_factor = 0.2
200
+ self._prev_volume = 0
201
+
202
+ @abstractmethod
203
+ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
204
+ """Returns transcript as a string"""
205
+ pass
206
+
207
+ def _new_wave(self):
208
+ content = io.BytesIO()
209
+ ww = wave.open(content, "wb")
210
+ ww.setsampwidth(2)
211
+ ww.setnchannels(self._num_channels)
212
+ ww.setframerate(self._sample_rate)
213
+ return (content, ww)
214
+
215
+ def _get_smoothed_volume(self, frame: AudioRawFrame) -> float:
216
+ volume = calculate_audio_volume(frame.audio, frame.sample_rate)
217
+ return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
218
+
219
+ async def _append_audio(self, frame: AudioRawFrame):
220
+ # Try to filter out empty background noise
221
+ volume = self._get_smoothed_volume(frame)
222
+ if volume >= self._min_volume:
223
+ # If volume is high enough, write new data to wave file
224
+ self._wave.writeframes(frame.audio)
225
+ self._silence_num_frames = 0
226
+ else:
227
+ self._silence_num_frames += frame.num_frames
228
+ self._prev_volume = volume
229
+
230
+ # If buffer is not empty and we have enough data or there's been a long
231
+ # silence, transcribe the audio gathered so far.
232
+ silence_secs = self._silence_num_frames / self._sample_rate
233
+ buffer_secs = self._wave.getnframes() / self._sample_rate
234
+ if self._content.tell() > 0 and (
235
+ buffer_secs > self._max_buffer_secs or silence_secs > self._max_silence_secs):
236
+ self._silence_num_frames = 0
237
+ self._wave.close()
238
+ self._content.seek(0)
239
+ await self.start_processing_metrics()
240
+ await self.process_generator(self.run_stt(self._content.read()))
241
+ await self.stop_processing_metrics()
242
+ (self._content, self._wave) = self._new_wave()
243
+
244
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
245
+ """Processes a frame of audio data, either buffering or transcribing it."""
246
+ await super().process_frame(frame, direction)
247
+
248
+ if isinstance(frame, CancelFrame) or isinstance(frame, EndFrame):
249
+ self._wave.close()
250
+ await self.push_frame(frame, direction)
251
+ elif isinstance(frame, AudioRawFrame):
252
+ # In this service we accumulate audio internally and at the end we
253
+ # push a TextFrame. We don't really want to push audio frames down.
254
+ await self._append_audio(frame)
255
+ else:
256
+ await self.push_frame(frame, direction)
257
+
258
+
259
+ class ImageGenService(AIService):
260
+
261
+ def __init__(self, **kwargs):
262
+ super().__init__(**kwargs)
263
+
264
+ # Renders the image. Returns an Image object.
265
+ @abstractmethod
266
+ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
267
+ pass
268
+
269
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
270
+ await super().process_frame(frame, direction)
271
+
272
+ if isinstance(frame, TextFrame):
273
+ await self.push_frame(frame, direction)
274
+ await self.start_processing_metrics()
275
+ await self.process_generator(self.run_image_gen(frame.text))
276
+ await self.stop_processing_metrics()
277
+ else:
278
+ await self.push_frame(frame, direction)
279
+
280
+
281
+ class VisionService(AIService):
282
+ """VisionService is a base class for vision services."""
283
+
284
+ def __init__(self, **kwargs):
285
+ super().__init__(**kwargs)
286
+ self._describe_text = None
287
+
288
+ @abstractmethod
289
+ async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
290
+ pass
291
+
292
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
293
+ await super().process_frame(frame, direction)
294
+
295
+ if isinstance(frame, VisionImageRawFrame):
296
+ await self.start_processing_metrics()
297
+ await self.process_generator(self.run_vision(frame))
298
+ await self.stop_processing_metrics()
299
+ else:
300
+ await self.push_frame(frame, direction)
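
TTSService above handles sentence aggregation, TTFB/processing metrics and the TTSStarted/TTSStopped frames; a concrete service only has to implement run_tts() as an async generator of audio frames. A toy sketch (not part of the uploaded file; it just emits silence):

    from typing import AsyncGenerator

    from pipecat.frames.frames import AudioRawFrame, Frame
    from pipecat.services.ai_services import TTSService


    class SilenceTTSService(TTSService):
        # Hypothetical toy service: "speaks" 20 ms of silence per character, to
        # show the run_tts() contract used by _push_tts_frames() above.
        def can_generate_metrics(self) -> bool:
            return True

        async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
            await self.start_ttfb_metrics()
            samples_per_char = 320  # 20 ms at 16 kHz, 16-bit mono
            audio = b"\x00\x00" * samples_per_char * len(text)
            await self.stop_ttfb_metrics()
            yield AudioRawFrame(audio=audio, sample_rate=16000, num_channels=1)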
pipecat/services/anthropic.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import base64
8
+
9
+ from pipecat.frames.frames import (
10
+ Frame,
11
+ TextFrame,
12
+ VisionImageRawFrame,
13
+ LLMMessagesFrame,
14
+ LLMFullResponseStartFrame,
15
+ LLMResponseStartFrame,
16
+ LLMResponseEndFrame,
17
+ LLMFullResponseEndFrame
18
+ )
19
+ from pipecat.processors.frame_processor import FrameDirection
20
+ from pipecat.services.ai_services import LLMService
21
+ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame
22
+
23
+ from loguru import logger
24
+
25
+ try:
26
+ from anthropic import AsyncAnthropic
27
+ except ModuleNotFoundError as e:
28
+ logger.error(f"Exception: {e}")
29
+ logger.error(
30
+ "In order to use Anthropic, you need to `pip install pipecat-ai[anthropic]`. Also, set `ANTHROPIC_API_KEY` environment variable.")
31
+ raise Exception(f"Missing module: {e}")
32
+
33
+
34
+ class AnthropicLLMService(LLMService):
35
+ """This class implements inference with Anthropic's AI models
36
+
37
+ This service translates internally from OpenAILLMContext to the messages format
38
+ expected by the Anthropic Python SDK. We are using the OpenAILLMContext as a lingua
39
+ franca for all LLM services, so that it is easy to switch between different LLMs.
40
+ """
41
+
42
+ def __init__(
43
+ self,
44
+ *,
45
+ api_key: str,
46
+ model: str = "claude-3-opus-20240229",
47
+ max_tokens: int = 1024):
48
+ super().__init__()
49
+ self._client = AsyncAnthropic(api_key=api_key)
50
+ self._model = model
51
+ self._max_tokens = max_tokens
52
+
53
+ def can_generate_metrics(self) -> bool:
54
+ return True
55
+
56
+ def _get_messages_from_openai_context(
57
+ self, context: OpenAILLMContext):
58
+ openai_messages = context.get_messages()
59
+ anthropic_messages = []
60
+
61
+ for message in openai_messages:
62
+ role = message["role"]
63
+ text = message["content"]
64
+ if role == "system":
65
+ role = "user"
66
+ if message.get("mime_type") == "image/jpeg":
67
+ # vision frame
68
+ encoded_image = base64.b64encode(message["data"].getvalue()).decode("utf-8")
69
+ anthropic_messages.append({
70
+ "role": role,
71
+ "content": [{
72
+ "type": "image",
73
+ "source": {
74
+ "type": "base64",
75
+ "media_type": message.get("mime_type"),
76
+ "data": encoded_image,
77
+ }
78
+ }, {
79
+ "type": "text",
80
+ "text": text
81
+ }]
82
+ })
83
+ else:
84
+ # Text frame. Anthropic needs the roles to alternate. This will
85
+ # cause an issue with interruptions. So, if we detect we are the
86
+ # ones asking again it probably means we were interrupted.
87
+ if role == "user" and len(anthropic_messages) > 1:
88
+ last_message = anthropic_messages[-1]
89
+ if last_message["role"] == "user":
90
+ anthropic_messages = anthropic_messages[:-1]
91
+ content = last_message["content"]
92
+ anthropic_messages.append(
93
+ {"role": "user", "content": f"Sorry, I just asked you about [{content}] but now I would like to know [{text}]."})
94
+ else:
95
+ anthropic_messages.append({"role": role, "content": text})
96
+ else:
97
+ anthropic_messages.append({"role": role, "content": text})
98
+
99
+ return anthropic_messages
100
+
101
+ async def _process_context(self, context: OpenAILLMContext):
102
+ await self.push_frame(LLMFullResponseStartFrame())
103
+ try:
104
+ logger.debug(f"Generating chat: {context.get_messages_json()}")
105
+
106
+ messages = self._get_messages_from_openai_context(context)
107
+
108
+ await self.start_ttfb_metrics()
109
+
110
+ response = await self._client.messages.create(
111
+ messages=messages,
112
+ model=self._model,
113
+ max_tokens=self._max_tokens,
114
+ stream=True)
115
+
116
+ await self.stop_ttfb_metrics()
117
+
118
+ async for event in response:
119
+ # logger.debug(f"Anthropic LLM event: {event}")
120
+ if (event.type == "content_block_delta"):
121
+ await self.push_frame(LLMResponseStartFrame())
122
+ await self.push_frame(TextFrame(event.delta.text))
123
+ await self.push_frame(LLMResponseEndFrame())
124
+
125
+ except Exception as e:
126
+ logger.exception(f"{self} exception: {e}")
127
+ finally:
128
+ await self.push_frame(LLMFullResponseEndFrame())
129
+
130
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
131
+ await super().process_frame(frame, direction)
132
+
133
+ context = None
134
+
135
+ if isinstance(frame, OpenAILLMContextFrame):
136
+ context: OpenAILLMContext = frame.context
137
+ elif isinstance(frame, LLMMessagesFrame):
138
+ context = OpenAILLMContext.from_messages(frame.messages)
139
+ elif isinstance(frame, VisionImageRawFrame):
140
+ context = OpenAILLMContext.from_image_frame(frame)
141
+ else:
142
+ await self.push_frame(frame, direction)
143
+
144
+ if context:
145
+ await self._process_context(context)
pipecat/services/azure.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import aiohttp
8
+ import asyncio
9
+ import io
10
+ import time
11
+
12
+ from PIL import Image
13
+ from typing import AsyncGenerator
14
+
15
+ from pipecat.frames.frames import (
16
+ AudioRawFrame,
17
+ CancelFrame,
18
+ EndFrame,
19
+ ErrorFrame,
20
+ Frame,
21
+ StartFrame,
22
+ StartInterruptionFrame,
23
+ SystemFrame,
24
+ TranscriptionFrame,
25
+ URLImageRawFrame)
26
+ from pipecat.processors.frame_processor import FrameDirection
27
+ from pipecat.services.ai_services import AIService, AsyncAIService, TTSService, ImageGenService
28
+ from pipecat.services.openai import BaseOpenAILLMService
29
+
30
+ from loguru import logger
31
+
32
+ # See .env.example for Azure configuration needed
33
+ try:
34
+ from openai import AsyncAzureOpenAI
35
+ from azure.cognitiveservices.speech import (
36
+ SpeechConfig,
37
+ SpeechRecognizer,
38
+ SpeechSynthesizer,
39
+ ResultReason,
40
+ CancellationReason,
41
+ )
42
+ from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream
43
+ from azure.cognitiveservices.speech.dialog import AudioConfig
44
+ except ModuleNotFoundError as e:
45
+ logger.error(f"Exception: {e}")
46
+ logger.error(
47
+ "In order to use Azure, you need to `pip install pipecat-ai[azure]`. Also, set `AZURE_SPEECH_API_KEY` and `AZURE_SPEECH_REGION` environment variables.")
48
+ raise Exception(f"Missing module: {e}")
49
+
50
+
51
+ class AzureLLMService(BaseOpenAILLMService):
52
+ def __init__(
53
+ self,
54
+ *,
55
+ api_key: str,
56
+ endpoint: str,
57
+ model: str,
58
+ api_version: str = "2023-12-01-preview"):
59
+ # Initialize variables before calling parent __init__() because that
60
+ # will call create_client() and we need those values there.
61
+ self._endpoint = endpoint
62
+ self._api_version = api_version
63
+ super().__init__(api_key=api_key, model=model)
64
+
65
+ def create_client(self, api_key=None, base_url=None, **kwargs):
66
+ return AsyncAzureOpenAI(
67
+ api_key=api_key,
68
+ azure_endpoint=self._endpoint,
69
+ api_version=self._api_version,
70
+ )
71
+
72
+
73
+ class AzureTTSService(TTSService):
74
+ def __init__(self, *, api_key: str, region: str, voice="en-US-SaraNeural", **kwargs):
75
+ super().__init__(**kwargs)
76
+
77
+ speech_config = SpeechConfig(subscription=api_key, region=region)
78
+ self._speech_synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None)
79
+
80
+ self._voice = voice
81
+
82
+ def can_generate_metrics(self) -> bool:
83
+ return True
84
+
85
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
86
+ logger.debug(f"Generating TTS: {text}")
87
+
88
+ await self.start_ttfb_metrics()
89
+
90
+ ssml = (
91
+ "<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' "
92
+ "xmlns:mstts='http://www.w3.org/2001/mstts'>"
93
+ f"<voice name='{self._voice}'>"
94
+ "<mstts:silence type='Sentenceboundary' value='20ms' />"
95
+ "<mstts:express-as style='lyrical' styledegree='2' role='SeniorFemale'>"
96
+ "<prosody rate='1.05'>"
97
+ f"{text}"
98
+ "</prosody></mstts:express-as></voice></speak> ")
99
+
100
+ result = await asyncio.to_thread(self._speech_synthesizer.speak_ssml, (ssml))
101
+
102
+ if result.reason == ResultReason.SynthesizingAudioCompleted:
103
+ await self.stop_ttfb_metrics()
104
+ # Azure always sends a 44-byte header. Strip it off.
105
+ yield AudioRawFrame(audio=result.audio_data[44:], sample_rate=16000, num_channels=1)
106
+ elif result.reason == ResultReason.Canceled:
107
+ cancellation_details = result.cancellation_details
108
+ logger.warning(f"Speech synthesis canceled: {cancellation_details.reason}")
109
+ if cancellation_details.reason == CancellationReason.Error:
110
+ logger.error(f"{self} error: {cancellation_details.error_details}")
111
+
112
+
113
+ class AzureSTTService(AsyncAIService):
114
+ def __init__(
115
+ self,
116
+ *,
117
+ api_key: str,
118
+ region: str,
119
+ language="en-US",
120
+ sample_rate=16000,
121
+ channels=1,
122
+ **kwargs):
123
+ super().__init__(**kwargs)
124
+
125
+ speech_config = SpeechConfig(subscription=api_key, region=region)
126
+ speech_config.speech_recognition_language = language
127
+
128
+ stream_format = AudioStreamFormat(samples_per_second=sample_rate, channels=channels)
129
+ self._audio_stream = PushAudioInputStream(stream_format)
130
+
131
+ audio_config = AudioConfig(stream=self._audio_stream)
132
+ self._speech_recognizer = SpeechRecognizer(
133
+ speech_config=speech_config, audio_config=audio_config)
134
+ self._speech_recognizer.recognized.connect(self._on_handle_recognized)
135
+
136
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
137
+ await super().process_frame(frame, direction)
138
+
139
+ if isinstance(frame, SystemFrame):
140
+ await self.push_frame(frame, direction)
141
+ elif isinstance(frame, AudioRawFrame):
142
+ self._audio_stream.write(frame.audio)
143
+ else:
144
+ await self._push_queue.put((frame, direction))
145
+
146
+ async def start(self, frame: StartFrame):
147
+ self._speech_recognizer.start_continuous_recognition_async()
148
+
149
+ async def stop(self, frame: EndFrame):
150
+ self._speech_recognizer.stop_continuous_recognition_async()
151
+
152
+ async def cancel(self, frame: CancelFrame):
153
+ self._speech_recognizer.stop_continuous_recognition_async()
154
+
155
+ def _on_handle_recognized(self, event):
156
+ if event.result.reason == ResultReason.RecognizedSpeech and len(event.result.text) > 0:
157
+ frame = TranscriptionFrame(event.result.text, "", int(time.time_ns() / 1000000))
158
+ asyncio.run_coroutine_threadsafe(self.queue_frame(frame), self.get_event_loop())
159
+
160
+
161
+ class AzureImageGenServiceREST(ImageGenService):
162
+
163
+ def __init__(
164
+ self,
165
+ *,
166
+ aiohttp_session: aiohttp.ClientSession,
167
+ image_size: str,
168
+ api_key: str,
169
+ endpoint: str,
170
+ model: str,
171
+ api_version="2023-06-01-preview",
172
+ ):
173
+ super().__init__()
174
+
175
+ self._api_key = api_key
176
+ self._azure_endpoint = endpoint
177
+ self._api_version = api_version
178
+ self._model = model
179
+ self._aiohttp_session = aiohttp_session
180
+ self._image_size = image_size
181
+
182
+ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
183
+ url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
184
+
185
+ headers = {
186
+ "api-key": self._api_key,
187
+ "Content-Type": "application/json"}
188
+
189
+ body = {
190
+ # Enter your prompt text here
191
+ "prompt": prompt,
192
+ "size": self._image_size,
193
+ "n": 1,
194
+ }
195
+
196
+ async with self._aiohttp_session.post(url, headers=headers, json=body) as submission:
197
+ # We never get past this line, because this header isn't
198
+ # defined on a 429 response, but something is eating our
199
+ # exceptions!
200
+ operation_location = submission.headers["operation-location"]
201
+ status = ""
202
+ attempts_left = 120
203
+ json_response = None
204
+ while status != "succeeded":
205
+ attempts_left -= 1
206
+ if attempts_left == 0:
207
+ logger.error(f"{self} error: image generation timed out")
208
+ yield ErrorFrame("Image generation timed out")
209
+ return
210
+
211
+ await asyncio.sleep(1)
212
+
213
+ response = await self._aiohttp_session.get(operation_location, headers=headers)
214
+
215
+ json_response = await response.json()
216
+ status = json_response["status"]
217
+
218
+ image_url = json_response["result"]["data"][0]["url"] if json_response else None
219
+ if not image_url:
220
+ logger.error(f"{self} error: image generation failed")
221
+ yield ErrorFrame("Image generation failed")
222
+ return
223
+
224
+ # Load the image from the url
225
+ async with self._aiohttp_session.get(image_url) as response:
226
+ image_stream = io.BytesIO(await response.content.read())
227
+ image = Image.open(image_stream)
228
+ frame = URLImageRawFrame(
229
+ url=image_url,
230
+ image=image.tobytes(),
231
+ size=image.size,
232
+ format=image.format)
233
+ yield frame
pipecat/services/cartesia.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from cartesia import AsyncCartesia
8
+
9
+ from typing import AsyncGenerator
10
+
11
+ from pipecat.frames.frames import AudioRawFrame, Frame
12
+ from pipecat.services.ai_services import TTSService
13
+
14
+ from loguru import logger
15
+
16
+
17
+ class CartesiaTTSService(TTSService):
18
+
19
+ def __init__(
20
+ self,
21
+ *,
22
+ api_key: str,
23
+ voice_id: str,
24
+ model_id: str = "sonic-english",
25
+ encoding: str = "pcm_s16le",
26
+ sample_rate: int = 16000,
27
+ **kwargs):
28
+ super().__init__(**kwargs)
29
+
30
+ self._api_key = api_key
31
+ self._model_id = model_id
32
+ self._output_format = {
33
+ "container": "raw",
34
+ "encoding": encoding,
35
+ "sample_rate": sample_rate,
36
+ }
37
+
38
+ try:
39
+ self._client = AsyncCartesia(api_key=self._api_key)
40
+ self._voice = self._client.voices.get(id=voice_id)
41
+ except Exception as e:
42
+ logger.exception(f"{self} initialization error: {e}")
43
+
44
+ def can_generate_metrics(self) -> bool:
45
+ return True
46
+
47
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
48
+ logger.debug(f"Generating TTS: [{text}]")
49
+
50
+ try:
51
+ await self.start_ttfb_metrics()
52
+
53
+ chunk_generator = await self._client.tts.sse(
54
+ stream=True,
55
+ transcript=text,
56
+ voice_embedding=self._voice["embedding"],
57
+ model_id=self._model_id,
58
+ output_format=self._output_format,
59
+ )
60
+
61
+ async for chunk in chunk_generator:
62
+ await self.stop_ttfb_metrics()
63
+ yield AudioRawFrame(chunk["audio"], self._output_format["sample_rate"], 1)
64
+ except Exception as e:
65
+ logger.exception(f"{self} exception: {e}")
pipecat/services/deepgram.py ADDED
@@ -0,0 +1,149 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import aiohttp
8
+ import time
9
+
10
+ from typing import AsyncGenerator
11
+
12
+ from pipecat.frames.frames import (
13
+ AudioRawFrame,
14
+ CancelFrame,
15
+ EndFrame,
16
+ ErrorFrame,
17
+ Frame,
18
+ InterimTranscriptionFrame,
19
+ StartFrame,
20
+ SystemFrame,
21
+ TranscriptionFrame)
22
+ from pipecat.processors.frame_processor import FrameDirection
23
+ from pipecat.services.ai_services import AsyncAIService, TTSService
24
+
25
+ from loguru import logger
26
+
27
+ # See .env.example for Deepgram configuration needed
28
+ try:
29
+ from deepgram import (
30
+ DeepgramClient,
31
+ DeepgramClientOptions,
32
+ LiveTranscriptionEvents,
33
+ LiveOptions,
34
+ )
35
+ except ModuleNotFoundError as e:
36
+ logger.error(f"Exception: {e}")
37
+ logger.error(
38
+ "In order to use Deepgram, you need to `pip install pipecat-ai[deepgram]`. Also, set `DEEPGRAM_API_KEY` environment variable.")
39
+ raise Exception(f"Missing module: {e}")
40
+
41
+
42
+ class DeepgramTTSService(TTSService):
43
+
44
+ def __init__(
45
+ self,
46
+ *,
47
+ aiohttp_session: aiohttp.ClientSession,
48
+ api_key: str,
49
+ voice: str = "aura-helios-en",
50
+ base_url: str = "https://api.deepgram.com/v1/speak",
51
+ **kwargs):
52
+ super().__init__(**kwargs)
53
+
54
+ self._voice = voice
55
+ self._api_key = api_key
56
+ self._aiohttp_session = aiohttp_session
57
+ self._base_url = base_url
58
+
59
+ def can_generate_metrics(self) -> bool:
60
+ return True
61
+
62
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
63
+ logger.debug(f"Generating TTS: [{text}]")
64
+
65
+ base_url = self._base_url
66
+ request_url = f"{base_url}?model={self._voice}&encoding=linear16&container=none&sample_rate=16000"
67
+ headers = {"authorization": f"token {self._api_key}"}
68
+ body = {"text": text}
69
+
70
+ try:
71
+ await self.start_ttfb_metrics()
72
+ async with self._aiohttp_session.post(request_url, headers=headers, json=body) as r:
73
+ if r.status != 200:
74
+ response_text = await r.text()
75
+ # If we get a a "Bad Request: Input is unutterable", just print out a debug log.
76
+ # All other unsuccesful requests should emit an error frame. If not specifically
77
+ # handled by the running PipelineTask, the ErrorFrame will cancel the task.
78
+ if "unutterable" in response_text:
79
+ logger.debug(f"Unutterable text: [{text}]")
80
+ return
81
+
82
+ logger.error(
83
+ f"{self} error getting audio (status: {r.status}, error: {response_text})")
84
+ yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {response_text})")
85
+ return
86
+
87
+ async for data in r.content:
88
+ await self.stop_ttfb_metrics()
89
+ frame = AudioRawFrame(audio=data, sample_rate=16000, num_channels=1)
90
+ yield frame
91
+ except Exception as e:
92
+ logger.exception(f"{self} exception: {e}")
93
+
94
+
95
+ class DeepgramSTTService(AsyncAIService):
96
+ def __init__(self,
97
+ *,
98
+ api_key: str,
99
+ url: str = "",
100
+ live_options: LiveOptions = LiveOptions(
101
+ encoding="linear16",
102
+ language="en-US",
103
+ model="nova-2-conversationalai",
104
+ sample_rate=16000,
105
+ channels=1,
106
+ interim_results=True,
107
+ smart_format=True,
108
+ ),
109
+ **kwargs):
110
+ super().__init__(**kwargs)
111
+
112
+ self._live_options = live_options
113
+
114
+ self._client = DeepgramClient(
115
+ api_key, config=DeepgramClientOptions(url=url, options={"keepalive": "true"}))
116
+ self._connection = self._client.listen.asynclive.v("1")
117
+ self._connection.on(LiveTranscriptionEvents.Transcript, self._on_message)
118
+
119
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
120
+ await super().process_frame(frame, direction)
121
+
122
+ if isinstance(frame, SystemFrame):
123
+ await self.push_frame(frame, direction)
124
+ elif isinstance(frame, AudioRawFrame):
125
+ await self._connection.send(frame.audio)
126
+ else:
127
+ await self.queue_frame(frame, direction)
128
+
129
+ async def start(self, frame: StartFrame):
130
+ if await self._connection.start(self._live_options):
131
+ logger.debug(f"{self}: Connected to Deepgram")
132
+ else:
133
+ logger.error(f"{self}: Unable to connect to Deepgram")
134
+
135
+ async def stop(self, frame: EndFrame):
136
+ await self._connection.finish()
137
+
138
+ async def cancel(self, frame: CancelFrame):
139
+ await self._connection.finish()
140
+
141
+ async def _on_message(self, *args, **kwargs):
142
+ result = kwargs["result"]
143
+ is_final = result.is_final
144
+ transcript = result.channel.alternatives[0].transcript
145
+ if len(transcript) > 0:
146
+ if is_final:
147
+ await self.queue_frame(TranscriptionFrame(transcript, "", int(time.time_ns() / 1000000)))
148
+ else:
149
+ await self.queue_frame(InterimTranscriptionFrame(transcript, "", int(time.time_ns() / 1000000)))
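
A construction-only sketch (not part of the upload) showing how the default `LiveOptions` above can be overridden, for example for a different language or model; the key and option values are illustrative.

```python
from deepgram import LiveOptions

from pipecat.services.deepgram import DeepgramSTTService

stt = DeepgramSTTService(
    api_key="YOUR_DEEPGRAM_API_KEY",  # placeholder
    live_options=LiveOptions(
        encoding="linear16",
        language="en-GB",
        model="nova-2-general",
        sample_rate=16000,
        channels=1,
        interim_results=True,
        smart_format=True,
    ),
)
```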
pipecat/services/elevenlabs.py ADDED
@@ -0,0 +1,66 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import aiohttp
8
+
9
+ from typing import AsyncGenerator
10
+
11
+ from pipecat.frames.frames import AudioRawFrame, ErrorFrame, Frame
12
+ from pipecat.services.ai_services import TTSService
13
+
14
+ from loguru import logger
15
+
16
+
17
+ class ElevenLabsTTSService(TTSService):
18
+
19
+ def __init__(
20
+ self,
21
+ *,
22
+ aiohttp_session: aiohttp.ClientSession,
23
+ api_key: str,
24
+ voice_id: str,
25
+ model: str = "eleven_turbo_v2",
26
+ **kwargs):
27
+ super().__init__(**kwargs)
28
+
29
+ self._api_key = api_key
30
+ self._voice_id = voice_id
31
+ self._aiohttp_session = aiohttp_session
32
+ self._model = model
33
+
34
+ def can_generate_metrics(self) -> bool:
35
+ return True
36
+
37
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
38
+ logger.debug(f"Generating TTS: [{text}]")
39
+
40
+ url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
41
+
42
+ payload = {"text": text, "model_id": self._model}
43
+
44
+ querystring = {
45
+ "output_format": "pcm_16000",
46
+ "optimize_streaming_latency": 2}
47
+
48
+ headers = {
49
+ "xi-api-key": self._api_key,
50
+ "Content-Type": "application/json",
51
+ }
52
+
53
+ await self.start_ttfb_metrics()
54
+
55
+ async with self._aiohttp_session.post(url, json=payload, headers=headers, params=querystring) as r:
56
+ if r.status != 200:
57
+ text = await r.text()
58
+ logger.error(f"{self} error getting audio (status: {r.status}, error: {text})")
59
+ yield ErrorFrame(f"Error getting audio (status: {r.status}, error: {text})")
60
+ return
61
+
62
+ async for chunk in r.content:
63
+ if len(chunk) > 0:
64
+ await self.stop_ttfb_metrics()
65
+ frame = AudioRawFrame(chunk, 16000, 1)
66
+ yield frame
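
The service requests `pcm_16000`, so each yielded `AudioRawFrame` carries 16-bit little-endian mono samples at 16 kHz. A sketch (not part of the upload) that collects the frames and writes them to a playable WAV file using only the standard library; the API key and voice id are placeholders.

```python
import asyncio
import os
import wave

import aiohttp

from pipecat.frames.frames import AudioRawFrame
from pipecat.services.elevenlabs import ElevenLabsTTSService


async def main():
    async with aiohttp.ClientSession() as session:
        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.environ["ELEVENLABS_API_KEY"],
            voice_id="YOUR_VOICE_ID",  # placeholder
        )
        pcm = bytearray()
        async for frame in tts.run_tts("Testing ElevenLabs."):
            if isinstance(frame, AudioRawFrame):
                pcm.extend(frame.audio)

    # pcm_16000 output: 16-bit little-endian, mono, 16 kHz.
    with wave.open("elevenlabs.wav", "wb") as f:
        f.setnchannels(1)
        f.setsampwidth(2)
        f.setframerate(16000)
        f.writeframes(bytes(pcm))


if __name__ == "__main__":
    asyncio.run(main())
```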
pipecat/services/fal.py ADDED
@@ -0,0 +1,83 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import aiohttp
8
+ import io
9
+ import os
10
+
11
+ from PIL import Image
12
+ from pydantic import BaseModel
13
+ from typing import AsyncGenerator, Optional, Union, Dict
14
+
15
+ from pipecat.frames.frames import ErrorFrame, Frame, URLImageRawFrame
16
+ from pipecat.services.ai_services import ImageGenService
17
+
18
+ from loguru import logger
19
+
20
+ try:
21
+ import fal_client
22
+ except ModuleNotFoundError as e:
23
+ logger.error(f"Exception: {e}")
24
+ logger.error(
25
+ "In order to use Fal, you need to `pip install pipecat-ai[fal]`. Also, set `FAL_KEY` environment variable.")
26
+ raise Exception(f"Missing module: {e}")
27
+
28
+
29
+ class FalImageGenService(ImageGenService):
30
+ class InputParams(BaseModel):
31
+ seed: Optional[int] = None
32
+ num_inference_steps: int = 8
33
+ num_images: int = 1
34
+ image_size: Union[str, Dict[str, int]] = "square_hd"
35
+ expand_prompt: bool = False
36
+ enable_safety_checker: bool = True
37
+ format: str = "png"
38
+
39
+ def __init__(
40
+ self,
41
+ *,
42
+ aiohttp_session: aiohttp.ClientSession,
43
+ params: InputParams,
44
+ model: str = "fal-ai/fast-sdxl",
45
+ key: str | None = None,
46
+ ):
47
+ super().__init__()
48
+ self._model = model
49
+ self._params = params
50
+ self._aiohttp_session = aiohttp_session
51
+ if key:
52
+ os.environ["FAL_KEY"] = key
53
+
54
+ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
55
+ logger.debug(f"Generating image from prompt: {prompt}")
56
+
57
+ response = await fal_client.run_async(
58
+ self._model,
59
+ arguments={"prompt": prompt, **self._params.model_dump()}
60
+ )
61
+
62
+ image_url = response["images"][0]["url"] if response else None
63
+
64
+ if not image_url:
65
+ logger.error(f"{self} error: image generation failed")
66
+ yield ErrorFrame("Image generation failed")
67
+ return
68
+
69
+ logger.debug(f"Image generated at: {image_url}")
70
+
71
+ # Load the image from the url
72
+ logger.debug(f"Downloading image {image_url} ...")
73
+ async with self._aiohttp_session.get(image_url) as response:
74
+ logger.debug(f"Downloaded image {image_url}")
75
+ image_stream = io.BytesIO(await response.content.read())
76
+ image = Image.open(image_stream)
77
+
78
+ frame = URLImageRawFrame(
79
+ url=image_url,
80
+ image=image.tobytes(),
81
+ size=image.size,
82
+ format=image.format)
83
+ yield frame
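
A usage sketch (not part of the upload) that builds the nested `InputParams`, runs a single generation, and prints the resulting image metadata; the prompt and environment variable are placeholders.

```python
import asyncio
import os

import aiohttp

from pipecat.frames.frames import URLImageRawFrame
from pipecat.services.fal import FalImageGenService


async def main():
    async with aiohttp.ClientSession() as session:
        imagegen = FalImageGenService(
            aiohttp_session=session,
            params=FalImageGenService.InputParams(image_size="square_hd"),
            key=os.environ["FAL_KEY"],
        )
        async for frame in imagegen.run_image_gen("a watercolor painting of a lighthouse"):
            if isinstance(frame, URLImageRawFrame):
                print(f"{frame.url}: {frame.size} {frame.format}")


if __name__ == "__main__":
    asyncio.run(main())
```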
pipecat/services/fireworks.py ADDED
@@ -0,0 +1,25 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from pipecat.services.openai import BaseOpenAILLMService
8
+
9
+ from loguru import logger
10
+
11
+ try:
12
+ from openai import AsyncOpenAI
13
+ except ModuleNotFoundError as e:
14
+ logger.error(f"Exception: {e}")
15
+ logger.error(
16
+ "In order to use Fireworks, you need to `pip install pipecat-ai[fireworks]`. Also, set the `FIREWORKS_API_KEY` environment variable.")
17
+ raise Exception(f"Missing module: {e}")
18
+
19
+
20
+ class FireworksLLMService(BaseOpenAILLMService):
21
+ def __init__(self,
22
+ *,
23
+ model: str = "accounts/fireworks/models/firefunction-v1",
24
+ base_url: str = "https://api.fireworks.ai/inference/v1"):
25
+ super().__init__(model=model, base_url=base_url)
pipecat/services/google.py ADDED
@@ -0,0 +1,129 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import asyncio
8
+
9
+ from typing import List
10
+
11
+ from pipecat.frames.frames import (
12
+ Frame,
13
+ TextFrame,
14
+ VisionImageRawFrame,
15
+ LLMMessagesFrame,
16
+ LLMFullResponseStartFrame,
17
+ LLMResponseStartFrame,
18
+ LLMResponseEndFrame,
19
+ LLMFullResponseEndFrame
20
+ )
21
+ from pipecat.processors.frame_processor import FrameDirection
22
+ from pipecat.services.ai_services import LLMService
23
+ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext, OpenAILLMContextFrame
24
+
25
+ from loguru import logger
26
+
27
+ try:
28
+ import google.generativeai as gai
29
+ import google.ai.generativelanguage as glm
30
+ except ModuleNotFoundError as e:
31
+ logger.error(f"Exception: {e}")
32
+ logger.error(
33
+ "In order to use Google AI, you need to `pip install pipecat-ai[google]`. Also, set `GOOGLE_API_KEY` environment variable.")
34
+ raise Exception(f"Missing module: {e}")
35
+
36
+
37
+ class GoogleLLMService(LLMService):
38
+ """This class implements inference with Google's AI models
39
+
40
+ This service translates internally from OpenAILLMContext to the messages format
41
+ expected by the Google AI model. We are using the OpenAILLMContext as a lingua
42
+ franca for all LLM services, so that it is easy to switch between different LLMs.
43
+ """
44
+
45
+ def __init__(self, *, api_key: str, model: str = "gemini-1.5-flash-latest", **kwargs):
46
+ super().__init__(**kwargs)
47
+ gai.configure(api_key=api_key)
48
+ self._client = gai.GenerativeModel(model)
49
+
50
+ def can_generate_metrics(self) -> bool:
51
+ return True
52
+
53
+ def _get_messages_from_openai_context(
54
+ self, context: OpenAILLMContext) -> List[glm.Content]:
55
+ openai_messages = context.get_messages()
56
+ google_messages = []
57
+
58
+ for message in openai_messages:
59
+ role = message["role"]
60
+ content = message["content"]
61
+ if role == "system":
62
+ role = "user"
63
+ elif role == "assistant":
64
+ role = "model"
65
+
66
+ parts = [glm.Part(text=content)]
67
+ if "mime_type" in message:
68
+ parts.append(
69
+ glm.Part(inline_data=glm.Blob(
70
+ mime_type=message["mime_type"],
71
+ data=message["data"].getvalue()
72
+ )))
73
+ google_messages.append({"role": role, "parts": parts})
74
+
75
+ return google_messages
76
+
77
+ async def _async_generator_wrapper(self, sync_generator):
78
+ for item in sync_generator:
79
+ yield item
80
+ await asyncio.sleep(0)
81
+
82
+ async def _process_context(self, context: OpenAILLMContext):
83
+ await self.push_frame(LLMFullResponseStartFrame())
84
+ try:
85
+ logger.debug(f"Generating chat: {context.get_messages_json()}")
86
+
87
+ messages = self._get_messages_from_openai_context(context)
88
+
89
+ await self.start_ttfb_metrics()
90
+
91
+ response = self._client.generate_content(messages, stream=True)
92
+
93
+ await self.stop_ttfb_metrics()
94
+
95
+ async for chunk in self._async_generator_wrapper(response):
96
+ try:
97
+ text = chunk.text
98
+ await self.push_frame(LLMResponseStartFrame())
99
+ await self.push_frame(TextFrame(text))
100
+ await self.push_frame(LLMResponseEndFrame())
101
+ except Exception as e:
102
+ # Google LLMs seem to flag safety issues a lot!
103
+ if chunk.candidates[0].finish_reason == 3:
104
+ logger.debug(
105
+ f"LLM refused to generate content for safety reasons - {messages}.")
106
+ else:
107
+ logger.exception(f"{self} error: {e}")
108
+
109
+ except Exception as e:
110
+ logger.exception(f"{self} exception: {e}")
111
+ finally:
112
+ await self.push_frame(LLMFullResponseEndFrame())
113
+
114
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
115
+ await super().process_frame(frame, direction)
116
+
117
+ context = None
118
+
119
+ if isinstance(frame, OpenAILLMContextFrame):
120
+ context: OpenAILLMContext = frame.context
121
+ elif isinstance(frame, LLMMessagesFrame):
122
+ context = OpenAILLMContext.from_messages(frame.messages)
123
+ elif isinstance(frame, VisionImageRawFrame):
124
+ context = OpenAILLMContext.from_image_frame(frame)
125
+ else:
126
+ await self.push_frame(frame, direction)
127
+
128
+ if context:
129
+ await self._process_context(context)
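
An illustration (not part of the upload) of the role mapping performed by `_get_messages_from_openai_context()` above: "system" is folded into "user" and "assistant" becomes "model", with each content string wrapped in a `glm.Part`. The message texts are made up for the example.

```python
openai_messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "What is pipecat?"},
    {"role": "assistant", "content": "A framework for real-time AI pipelines."},
]

# After conversion the Google-style list looks like:
# [
#   {"role": "user",  "parts": [glm.Part(text="You are a terse assistant.")]},
#   {"role": "user",  "parts": [glm.Part(text="What is pipecat?")]},
#   {"role": "model", "parts": [glm.Part(text="A framework for real-time AI pipelines.")]},
# ]
```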
pipecat/services/moondream.py ADDED
@@ -0,0 +1,92 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import asyncio
8
+
9
+ from PIL import Image
10
+
11
+ from typing import AsyncGenerator
12
+
13
+ from pipecat.frames.frames import ErrorFrame, Frame, TextFrame, VisionImageRawFrame
14
+ from pipecat.services.ai_services import VisionService
15
+
16
+ from loguru import logger
17
+
18
+ try:
19
+ import torch
20
+
21
+ from transformers import AutoModelForCausalLM, AutoTokenizer
22
+ except ModuleNotFoundError as e:
23
+ logger.error(f"Exception: {e}")
24
+ logger.error("In order to use Moondream, you need to `pip install pipecat-ai[moondream]`.")
25
+ raise Exception(f"Missing module(s): {e}")
26
+
27
+
28
+ def detect_device():
29
+ """
30
+ Detects the appropriate device to run on, and returns the device and dtype.
31
+ """
32
+ try:
33
+ import intel_extension_for_pytorch
34
+ if torch.xpu.is_available():
35
+ return torch.device("xpu"), torch.float32
36
+ except ImportError:
37
+ pass
38
+ if torch.cuda.is_available():
39
+ return torch.device("cuda"), torch.float16
40
+ elif torch.backends.mps.is_available():
41
+ return torch.device("mps"), torch.float16
42
+ else:
43
+ return torch.device("cpu"), torch.float32
44
+
45
+
46
+ class MoondreamService(VisionService):
47
+ def __init__(
48
+ self,
49
+ *,
50
+ model="vikhyatk/moondream2",
51
+ revision="2024-04-02",
52
+ use_cpu=False
53
+ ):
54
+ super().__init__()
55
+
56
+ if not use_cpu:
57
+ device, dtype = detect_device()
58
+ else:
59
+ device = torch.device("cpu")
60
+ dtype = torch.float32
61
+
62
+ self._tokenizer = AutoTokenizer.from_pretrained(model, revision=revision)
63
+
64
+ logger.debug("Loading Moondream model...")
65
+
66
+ self._model = AutoModelForCausalLM.from_pretrained(
67
+ model, trust_remote_code=True, revision=revision
68
+ ).to(device=device, dtype=dtype)
69
+ self._model.eval()
70
+
71
+ logger.debug("Loaded Moondream model")
72
+
73
+ async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
74
+ if not self._model:
75
+ logger.error(f"{self} error: Moondream model not available")
76
+ yield ErrorFrame("Moondream model not available")
77
+ return
78
+
79
+ logger.debug(f"Analyzing image: {frame}")
80
+
81
+ def get_image_description(frame: VisionImageRawFrame):
82
+ image = Image.frombytes(frame.format, frame.size, frame.image)
83
+ image_embeds = self._model.encode_image(image)
84
+ description = self._model.answer_question(
85
+ image_embeds=image_embeds,
86
+ question=frame.text,
87
+ tokenizer=self._tokenizer)
88
+ return description
89
+
90
+ description = await asyncio.to_thread(get_image_description, frame)
91
+
92
+ yield TextFrame(text=description)
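
A usage sketch (not part of the upload) that feeds a local image through `run_vision()`. The `VisionImageRawFrame` constructor is not shown in this diff, so the keyword arguments below are an assumption that mirrors the attributes the method reads; note that `Image.frombytes()` expects a mode string such as "RGB", which is why `image.mode` is passed as the format.

```python
import asyncio

from PIL import Image

from pipecat.frames.frames import TextFrame, VisionImageRawFrame
from pipecat.services.moondream import MoondreamService


async def main():
    service = MoondreamService()  # downloads the model weights on first use

    image = Image.open("photo.jpg").convert("RGB")
    # Assumption: VisionImageRawFrame accepts these fields as keyword arguments.
    frame = VisionImageRawFrame(
        text="What is in this picture?",
        image=image.tobytes(),
        size=image.size,
        format=image.mode,
    )
    async for out in service.run_vision(frame):
        if isinstance(out, TextFrame):
            print(out.text)


if __name__ == "__main__":
    asyncio.run(main())
```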
pipecat/services/ollama.py ADDED
@@ -0,0 +1,13 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from pipecat.services.openai import BaseOpenAILLMService
8
+
9
+
10
+ class OLLamaLLMService(BaseOpenAILLMService):
11
+
12
+ def __init__(self, *, model: str = "llama2", base_url: str = "http://localhost:11434/v1"):
13
+ super().__init__(model=model, base_url=base_url, api_key="ollama")
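
A construction sketch (not part of the upload): the service speaks to a local Ollama server through its OpenAI-compatible endpoint, so any model already pulled with `ollama pull <name>` can be used; the model name here is illustrative.

```python
from pipecat.services.ollama import OLLamaLLMService

llm = OLLamaLLMService(model="llama3", base_url="http://localhost:11434/v1")
```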
pipecat/services/openai.py ADDED
@@ -0,0 +1,338 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import aiohttp
8
+ import base64
9
+ import io
10
+ import json
11
+
12
+ from typing import AsyncGenerator, List, Literal
13
+
14
+ from loguru import logger
15
+ from PIL import Image
16
+
17
+ from pipecat.frames.frames import (
18
+ AudioRawFrame,
19
+ ErrorFrame,
20
+ Frame,
21
+ LLMFullResponseEndFrame,
22
+ LLMFullResponseStartFrame,
23
+ LLMMessagesFrame,
24
+ LLMResponseEndFrame,
25
+ LLMResponseStartFrame,
26
+ TextFrame,
27
+ URLImageRawFrame,
28
+ VisionImageRawFrame
29
+ )
30
+ from pipecat.processors.aggregators.openai_llm_context import (
31
+ OpenAILLMContext,
32
+ OpenAILLMContextFrame
33
+ )
34
+ from pipecat.processors.frame_processor import FrameDirection
35
+ from pipecat.services.ai_services import (
36
+ ImageGenService,
37
+ LLMService,
38
+ TTSService
39
+ )
40
+
41
+ try:
42
+ from openai import AsyncOpenAI, AsyncStream, BadRequestError
43
+ from openai.types.chat import (
44
+ ChatCompletionChunk,
45
+ ChatCompletionFunctionMessageParam,
46
+ ChatCompletionMessageParam,
47
+ ChatCompletionToolParam
48
+ )
49
+ except ModuleNotFoundError as e:
50
+ logger.error(f"Exception: {e}")
51
+ logger.error(
52
+ "In order to use OpenAI, you need to `pip install pipecat-ai[openai]`. Also, set `OPENAI_API_KEY` environment variable.")
53
+ raise Exception(f"Missing module: {e}")
54
+
55
+
56
+ class OpenAIUnhandledFunctionException(Exception):
57
+ pass
58
+
59
+
60
+ class BaseOpenAILLMService(LLMService):
61
+ """This is the base for all services that use the AsyncOpenAI client.
62
+
63
+ This service consumes OpenAILLMContextFrame frames, which contain a reference
64
+ to an OpenAILLMContext frame. The OpenAILLMContext object defines the context
65
+ sent to the LLM for a completion. This includes user, assistant and system messages
66
+ as well as tool choices and the tool, which is used if requesting function
67
+ calls from the LLM.
68
+ """
69
+
70
+ def __init__(self, *, model: str, api_key=None, base_url=None, **kwargs):
71
+ super().__init__(**kwargs)
72
+ self._model: str = model
73
+ self._client = self.create_client(api_key=api_key, base_url=base_url, **kwargs)
74
+
75
+ def create_client(self, api_key=None, base_url=None, **kwargs):
76
+ return AsyncOpenAI(api_key=api_key, base_url=base_url)
77
+
78
+ def can_generate_metrics(self) -> bool:
79
+ return True
80
+
81
+ async def get_chat_completions(
82
+ self,
83
+ context: OpenAILLMContext,
84
+ messages: List[ChatCompletionMessageParam]) -> AsyncStream[ChatCompletionChunk]:
85
+ chunks = await self._client.chat.completions.create(
86
+ model=self._model,
87
+ stream=True,
88
+ messages=messages,
89
+ tools=context.tools,
90
+ tool_choice=context.tool_choice,
91
+ )
92
+ return chunks
93
+
94
+ async def _stream_chat_completions(
95
+ self, context: OpenAILLMContext) -> AsyncStream[ChatCompletionChunk]:
96
+ logger.debug(f"Generating chat: {context.get_messages_json()}")
97
+
98
+ messages: List[ChatCompletionMessageParam] = context.get_messages()
99
+
100
+ # base64 encode any images
101
+ for message in messages:
102
+ if message.get("mime_type") == "image/jpeg":
103
+ encoded_image = base64.b64encode(message["data"].getvalue()).decode("utf-8")
104
+ text = message["content"]
105
+ message["content"] = [
106
+ {"type": "text", "text": text},
107
+ {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}
108
+ ]
109
+ del message["data"]
110
+ del message["mime_type"]
111
+
112
+ chunks = await self.get_chat_completions(context, messages)
113
+
114
+ return chunks
115
+
116
+ async def _process_context(self, context: OpenAILLMContext):
117
+ function_name = ""
118
+ arguments = ""
119
+ tool_call_id = ""
120
+
121
+ await self.start_ttfb_metrics()
122
+
123
+ chunk_stream: AsyncStream[ChatCompletionChunk] = (
124
+ await self._stream_chat_completions(context)
125
+ )
126
+
127
+ async for chunk in chunk_stream:
128
+ if len(chunk.choices) == 0:
129
+ continue
130
+
131
+ await self.stop_ttfb_metrics()
132
+
133
+ if chunk.choices[0].delta.tool_calls:
134
+ # We're streaming the LLM response to enable the fastest response times.
135
+ # For text, we just yield each chunk as we receive it and count on consumers
136
+ # to do whatever coalescing they need (eg. to pass full sentences to TTS)
137
+ #
138
+ # If the LLM is a function call, we'll do some coalescing here.
139
+ # If the response contains a function name, we'll yield a frame to tell consumers
140
+ # that they can start preparing to call the function with that name.
141
+ # We accumulate all the arguments for the rest of the streamed response, then when
142
+ # the response is done, we package up all the arguments and the function name and
143
+ # yield a frame containing the function name and the arguments.
144
+
145
+ tool_call = chunk.choices[0].delta.tool_calls[0]
146
+ if tool_call.function and tool_call.function.name:
147
+ function_name += tool_call.function.name
148
+ tool_call_id = tool_call.id
149
+ await self.call_start_function(function_name)
150
+ if tool_call.function and tool_call.function.arguments:
151
+ # Keep iterating through the response to collect all the argument fragments
152
+ arguments += tool_call.function.arguments
153
+ elif chunk.choices[0].delta.content:
154
+ await self.push_frame(LLMResponseStartFrame())
155
+ await self.push_frame(TextFrame(chunk.choices[0].delta.content))
156
+ await self.push_frame(LLMResponseEndFrame())
157
+
158
+ # if we got a function name and arguments, check to see if it's a function with
159
+ # a registered handler. If so, run the registered callback, save the result to
160
+ # the context, and re-prompt to get a chat answer. If we don't have a registered
161
+ # handler, raise an exception.
162
+ if function_name and arguments:
163
+ if self.has_function(function_name):
164
+ await self._handle_function_call(context, tool_call_id, function_name, arguments)
165
+ else:
166
+ raise OpenAIUnhandledFunctionException(
167
+ f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function.")
168
+
169
+ async def _handle_function_call(
170
+ self,
171
+ context,
172
+ tool_call_id,
173
+ function_name,
174
+ arguments
175
+ ):
176
+ arguments = json.loads(arguments)
177
+ result = await self.call_function(function_name, arguments)
178
+ arguments = json.dumps(arguments)
179
+ if isinstance(result, (str, dict)):
180
+ # Handle it in "full magic mode"
181
+ tool_call = ChatCompletionFunctionMessageParam({
182
+ "role": "assistant",
183
+ "tool_calls": [
184
+ {
185
+ "id": tool_call_id,
186
+ "function": {
187
+ "arguments": arguments,
188
+ "name": function_name
189
+ },
190
+ "type": "function"
191
+ }
192
+ ]
193
+
194
+ })
195
+ context.add_message(tool_call)
196
+ if isinstance(result, dict):
197
+ result = json.dumps(result)
198
+ tool_result = ChatCompletionToolParam({
199
+ "tool_call_id": tool_call_id,
200
+ "role": "tool",
201
+ "content": result
202
+ })
203
+ context.add_message(tool_result)
204
+ # re-prompt to get a human answer
205
+ await self._process_context(context)
206
+ elif isinstance(result, list):
207
+ # reduced magic
208
+ for msg in result:
209
+ context.add_message(msg)
210
+ await self._process_context(context)
211
+ elif isinstance(result, type(None)):
212
+ pass
213
+ else:
214
+ raise TypeError(f"Unknown return type from function callback: {type(result)}")
215
+
216
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
217
+ await super().process_frame(frame, direction)
218
+
219
+ context = None
220
+ if isinstance(frame, OpenAILLMContextFrame):
221
+ context: OpenAILLMContext = frame.context
222
+ elif isinstance(frame, LLMMessagesFrame):
223
+ context = OpenAILLMContext.from_messages(frame.messages)
224
+ elif isinstance(frame, VisionImageRawFrame):
225
+ context = OpenAILLMContext.from_image_frame(frame)
226
+ else:
227
+ await self.push_frame(frame, direction)
228
+
229
+ if context:
230
+ await self.push_frame(LLMFullResponseStartFrame())
231
+ await self.start_processing_metrics()
232
+ await self._process_context(context)
233
+ await self.stop_processing_metrics()
234
+ await self.push_frame(LLMFullResponseEndFrame())
235
+
236
+
237
+ class OpenAILLMService(BaseOpenAILLMService):
238
+
239
+ def __init__(self, *, model: str = "gpt-4o", **kwargs):
240
+ super().__init__(model=model, **kwargs)
241
+
242
+
243
+ class OpenAIImageGenService(ImageGenService):
244
+
245
+ def __init__(
246
+ self,
247
+ *,
248
+ image_size: Literal["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"],
249
+ aiohttp_session: aiohttp.ClientSession,
250
+ api_key: str,
251
+ model: str = "dall-e-3",
252
+ ):
253
+ super().__init__()
254
+ self._model = model
255
+ self._image_size = image_size
256
+ self._client = AsyncOpenAI(api_key=api_key)
257
+ self._aiohttp_session = aiohttp_session
258
+
259
+ async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
260
+ logger.debug(f"Generating image from prompt: {prompt}")
261
+
262
+ image = await self._client.images.generate(
263
+ prompt=prompt,
264
+ model=self._model,
265
+ n=1,
266
+ size=self._image_size
267
+ )
268
+
269
+ image_url = image.data[0].url
270
+
271
+ if not image_url:
272
+ logger.error(f"{self} No image provided in response: {image}")
273
+ yield ErrorFrame("Image generation failed")
274
+ return
275
+
276
+ # Load the image from the url
277
+ async with self._aiohttp_session.get(image_url) as response:
278
+ image_stream = io.BytesIO(await response.content.read())
279
+ image = Image.open(image_stream)
280
+ frame = URLImageRawFrame(image_url, image.tobytes(), image.size, image.format)
281
+ yield frame
282
+
283
+
284
+ class OpenAITTSService(TTSService):
285
+ """This service uses the OpenAI TTS API to generate audio from text.
286
+ The returned audio is PCM encoded at 24kHz. When using the DailyTransport, set the sample rate in the DailyParams accordingly:
287
+ ```
288
+ DailyParams(
289
+ audio_out_enabled=True,
290
+ audio_out_sample_rate=24_000,
291
+ )
292
+ ```
293
+ """
294
+
295
+ def __init__(
296
+ self,
297
+ *,
298
+ api_key: str | None = None,
299
+ base_url: str | None = None,
300
+ sample_rate: int = 24_000,
301
+ voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"] = "alloy",
302
+ model: Literal["tts-1", "tts-1-hd"] = "tts-1",
303
+ **kwargs):
304
+ super().__init__(**kwargs)
305
+
306
+ self._voice = voice
307
+ self._model = model
308
+ self.sample_rate = sample_rate
309
+ self._client = AsyncOpenAI(api_key=api_key, base_url=base_url)
310
+
311
+ def can_generate_metrics(self) -> bool:
312
+ return True
313
+
314
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
315
+ logger.debug(f"Generating TTS: [{text}]")
316
+
317
+ try:
318
+ await self.start_ttfb_metrics()
319
+
320
+ async with self._client.audio.speech.with_streaming_response.create(
321
+ input=text,
322
+ model=self._model,
323
+ voice=self._voice,
324
+ response_format="pcm",
325
+ ) as r:
326
+ if r.status_code != 200:
327
+ error = await r.text()
328
+ logger.error(
329
+ f"{self} error getting audio (status: {r.status_code}, error: {error})")
330
+ yield ErrorFrame(f"Error getting audio (status: {r.status_code}, error: {error})")
331
+ return
332
+ async for chunk in r.iter_bytes(8192):
333
+ if len(chunk) > 0:
334
+ await self.stop_ttfb_metrics()
335
+ frame = AudioRawFrame(chunk, self.sample_rate, 1)
336
+ yield frame
337
+ except BadRequestError as e:
338
+ logger.exception(f"{self} error generating TTS: {e}")
pipecat/services/openpipe.py ADDED
@@ -0,0 +1,71 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ from typing import Dict, List
8
+
9
+ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
10
+ from pipecat.services.openai import BaseOpenAILLMService
11
+
12
+ from loguru import logger
13
+
14
+ try:
15
+ from openpipe import AsyncOpenAI as OpenPipeAI, AsyncStream
16
+ from openai.types.chat import (ChatCompletionMessageParam, ChatCompletionChunk)
17
+ except ModuleNotFoundError as e:
18
+ logger.error(f"Exception: {e}")
19
+ logger.error(
20
+ "In order to use OpenPipe, you need to `pip install pipecat-ai[openpipe]`. Also, set `OPENPIPE_API_KEY` and `OPENAI_API_KEY` environment variables.")
21
+ raise Exception(f"Missing module: {e}")
22
+
23
+
24
+ class OpenPipeLLMService(BaseOpenAILLMService):
25
+
26
+ def __init__(
27
+ self,
28
+ *,
29
+ model: str = "gpt-4o",
30
+ api_key: str | None = None,
31
+ base_url: str | None = None,
32
+ openpipe_api_key: str | None = None,
33
+ openpipe_base_url: str = "https://app.openpipe.ai/api/v1",
34
+ tags: Dict[str, str] | None = None,
35
+ **kwargs):
36
+ super().__init__(
37
+ model=model,
38
+ api_key=api_key,
39
+ base_url=base_url,
40
+ openpipe_api_key=openpipe_api_key,
41
+ openpipe_base_url=openpipe_base_url,
42
+ **kwargs)
43
+ self._tags = tags
44
+
45
+ def create_client(self, api_key=None, base_url=None, **kwargs):
46
+ openpipe_api_key = kwargs.get("openpipe_api_key") or ""
47
+ openpipe_base_url = kwargs.get("openpipe_base_url") or ""
48
+ client = OpenPipeAI(
49
+ api_key=api_key,
50
+ base_url=base_url,
51
+ openpipe={
52
+ "api_key": openpipe_api_key,
53
+ "base_url": openpipe_base_url
54
+ }
55
+ )
56
+ return client
57
+
58
+ async def get_chat_completions(
59
+ self,
60
+ context: OpenAILLMContext,
61
+ messages: List[ChatCompletionMessageParam]) -> AsyncStream[ChatCompletionChunk]:
62
+ chunks = await self._client.chat.completions.create(
63
+ model=self._model,
64
+ stream=True,
65
+ messages=messages,
66
+ openpipe={
67
+ "tags": self._tags,
68
+ "log_request": True
69
+ }
70
+ )
71
+ return chunks
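
A construction sketch (not part of the upload): the `tags` dictionary is attached to every logged request via the `openpipe` argument in `get_chat_completions()` above; the tag keys and values are illustrative.

```python
import os

from pipecat.services.openpipe import OpenPipeLLMService

llm = OpenPipeLLMService(
    model="gpt-4o",
    api_key=os.environ["OPENAI_API_KEY"],
    openpipe_api_key=os.environ["OPENPIPE_API_KEY"],
    tags={"environment": "dev", "feature": "voice-bot"},
)
```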
pipecat/services/playht.py ADDED
@@ -0,0 +1,83 @@
1
+ #
2
+ # Copyright (c) 2024, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ import io
8
+ import struct
9
+
10
+ from typing import AsyncGenerator
11
+
12
+ from pipecat.frames.frames import AudioRawFrame, Frame
13
+ from pipecat.services.ai_services import TTSService
14
+
15
+ from loguru import logger
16
+
17
+ try:
18
+ from pyht.client import TTSOptions
19
+ from pyht.async_client import AsyncClient
20
+ from pyht.protos.api_pb2 import Format
21
+ except ModuleNotFoundError as e:
22
+ logger.error(f"Exception: {e}")
23
+ logger.error(
24
+ "In order to use PlayHT, you need to `pip install pipecat-ai[playht]`. Also, set `PLAY_HT_USER_ID` and `PLAY_HT_API_KEY` environment variables.")
25
+ raise Exception(f"Missing module: {e}")
26
+
27
+
28
+ class PlayHTTTSService(TTSService):
29
+
30
+ def __init__(self, *, api_key: str, user_id: str, voice_url: str, **kwargs):
31
+ super().__init__(**kwargs)
32
+
33
+ self._user_id = user_id
34
+ self._speech_key = api_key
35
+
36
+ self._client = AsyncClient(
37
+ user_id=self._user_id,
38
+ api_key=self._speech_key,
39
+ )
40
+ self._options = TTSOptions(
41
+ voice=voice_url,
42
+ sample_rate=16000,
43
+ quality="higher",
44
+ format=Format.FORMAT_WAV)
45
+
46
+ def can_generate_metrics(self) -> bool:
47
+ return True
48
+
49
+ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
50
+ logger.debug(f"Generating TTS: [{text}]")
51
+
52
+ try:
53
+ b = bytearray()
54
+ in_header = True
55
+
56
+ await self.start_ttfb_metrics()
57
+
58
+ playht_gen = self._client.tts(
59
+ text,
60
+ voice_engine="PlayHT2.0-turbo",
61
+ options=self._options)
62
+
63
+ async for chunk in playht_gen:
64
+ # skip the RIFF header.
65
+ if in_header:
66
+ b.extend(chunk)
67
+ if len(b) <= 36:
68
+ continue
69
+ else:
70
+ fh = io.BytesIO(b)
71
+ fh.seek(36)
72
+ (data, size) = struct.unpack('<4sI', fh.read(8))
73
+ while data != b'data':
74
+ fh.read(size)
75
+ (data, size) = struct.unpack('<4sI', fh.read(8))
76
+ in_header = False
77
+ else:
78
+ if len(chunk):
79
+ await self.stop_ttfb_metrics()
80
+ frame = AudioRawFrame(chunk, 16000, 1)
81
+ yield frame
82
+ except Exception as e:
83
+ logger.exception(f"{self} error generating TTS: {e}")
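
The header-skipping loop in `run_tts()` above seeks past the 36-byte RIFF/fmt preamble and then walks chunk headers until it finds the `data` chunk, so only PCM samples are yielded. A self-contained sketch (not part of the upload) of the same walk over a synthetic WAV buffer, using only the standard library.

```python
import io
import struct
import wave


def pcm_payload_offset(wav_bytes: bytes) -> int:
    """Return the byte offset where PCM samples start, mirroring the loop above."""
    fh = io.BytesIO(wav_bytes)
    fh.seek(36)  # skip the RIFF header plus the canonical 16-byte fmt chunk
    (chunk_id, size) = struct.unpack('<4sI', fh.read(8))
    while chunk_id != b'data':
        fh.read(size)  # skip any non-data chunks (e.g. LIST/INFO)
        (chunk_id, size) = struct.unpack('<4sI', fh.read(8))
    return fh.tell()


# Build a tiny valid WAV in memory and locate its data payload.
buf = io.BytesIO()
with wave.open(buf, 'wb') as w:
    w.setnchannels(1)
    w.setsampwidth(2)
    w.setframerate(16000)
    w.writeframes(b'\x00\x00' * 16000)  # one second of silence

offset = pcm_payload_offset(buf.getvalue())
print(f"PCM data starts at byte {offset}")  # typically 44 for a canonical header
```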