AdithyaSNair committed (verified)
Commit 35cda3c
1 Parent(s): 94deb23

added base files

Files changed (4)
  1. .env +1 -0
  2. app.py +149 -0
  3. function-calling.txt +322 -0
  4. generate.py +101 -0
.env ADDED
@@ -0,0 +1 @@
ELEVENLABS_API_KEY="sk_d508e1a1195d494e5bc09b7b60fec683000a88e3fedd561a"
app.py ADDED
@@ -0,0 +1,149 @@
import streamlit as st
import os
import json
import shutil
import re
import requests
import pyttsx3
from pydub import AudioSegment
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Streamlit configuration
st.set_page_config(page_title="Podcast Generator", layout="wide")
st.title("🎙️ Podcast Generator")

# System prompt for conversation generation
system_prompt = """You are an experienced podcast host...
- based on text like an article you can create an engaging conversation between two people.
- make the conversation engaging with a lot of emotion.
- in the response, identify speakers as Sascha and Marina.
- Sascha is the writer, and Marina is the one asking questions.
- The podcast is called The Machine Learning Engineer.
- Short sentences that can be easily used with speech synthesis.
- Use natural conversation fillers like "äh" to make it sound real.
"""

# Load Hugging Face's distilgpt2 model and tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Pyttsx3 setup
engine = pyttsx3.init()
engine.setProperty("rate", 150)  # Adjust speech rate as needed
engine.setProperty("voice", "english")  # Set to English voice

# Retrieve ElevenLabs API key from environment
elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
elevenlabs_url = "https://api.elevenlabs.io/v1/text-to-speech/ERL3svWBAQ18ByCZTr4k"
elevenlabs_headers = {
    "Accept": "audio/mpeg",
    "Content-Type": "application/json",
    "xi-api-key": elevenlabs_api_key
}

# ElevenLabs TTS function for Sascha
def synthesize_speech_elevenlabs(text, speaker, index):
    data = {
        "text": text,
        "model_id": "eleven_turbo_v2_5",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75
        }
    }
    response = requests.post(elevenlabs_url, json=data, headers=elevenlabs_headers)
    filename = f"audio-files/{index}_{speaker}.mp3"
    with open(filename, "wb") as out:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                out.write(chunk)

# Pyttsx3 TTS function for Marina
# (pyttsx3 drivers write WAV/AIFF data, so use a .wav extension that pydub can decode)
def synthesize_speech_pyttsx3(text, speaker, index):
    filename = f"audio-files/{index}_{speaker}.wav"
    engine.save_to_file(text, filename)
    engine.runAndWait()

# Function to synthesize speech based on the speaker
def synthesize_speech(text, speaker, index):
    if speaker == "Sascha":
        synthesize_speech_elevenlabs(text, speaker, index)
    else:
        synthesize_speech_pyttsx3(text, speaker, index)

# Function to sort filenames naturally
def natural_sort_key(filename):
    return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)]

# Function to merge audio files
def merge_audios(audio_folder, output_file):
    combined = AudioSegment.empty()
    audio_files = sorted(
        [f for f in os.listdir(audio_folder) if f.endswith(".mp3") or f.endswith(".wav")],
        key=natural_sort_key
    )
    for filename in audio_files:
        audio_path = os.path.join(audio_folder, filename)
        audio = AudioSegment.from_file(audio_path)
        combined += audio
    combined.export(output_file, format="mp3")

# Function to generate the conversation using distilgpt2
def generate_conversation(article):
    prompt = system_prompt + "\n\nArticle:\n" + article + "\n\nSascha: "
    # distilgpt2 has a 1024-token context window, so truncate the prompt and cap the new tokens
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=768)
    output = model.generate(
        input_ids,
        max_new_tokens=256,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id
    )

    # Process output to create a structured conversation
    conversation_text = tokenizer.decode(output[0], skip_special_tokens=True)
    lines = conversation_text.splitlines()
    conversation = []
    speaker = "Sascha"
    for line in lines:
        if line.strip():
            conversation.append({"speaker": speaker, "text": line.strip()})
            speaker = "Marina" if speaker == "Sascha" else "Sascha"
    return conversation

# Function to generate the podcast audio from conversation data
def generate_audio(conversation):
    if os.path.exists('audio-files'):
        shutil.rmtree('audio-files')
    os.makedirs('audio-files', exist_ok=True)

    for index, part in enumerate(conversation):
        speaker = part['speaker']
        text = part['text']
        synthesize_speech(text, speaker, index)

    output_file = "podcast.mp3"
    merge_audios("audio-files", output_file)
    return output_file

# Streamlit inputs and outputs
article = st.text_area("Article Content", "Paste the article text here", height=300)
if st.button("Generate Podcast"):
    if not article:
        st.error("Please enter article content to generate a podcast.")
    else:
        with st.spinner("Generating conversation..."):
            conversation = generate_conversation(article)

        st.success("Conversation generated successfully!")
        st.json(conversation)

        # Generate audio files
        with st.spinner("Synthesizing audio..."):
            podcast_file = generate_audio(conversation)

        st.success("Audio synthesis complete!")
        st.audio(podcast_file, format="audio/mp3")

        with open(podcast_file, "rb") as file:
            st.download_button("Download Podcast", data=file, file_name="podcast.mp3", mime="audio/mp3")
function-calling.txt ADDED
@@ -0,0 +1,322 @@

Vertex AI Function Calling
LLMs are turning into reasoning engines using capabilities like web search and calling external APIs.

Sascha Heyer
Published in Google Cloud - Community · 8 min read · Aug 31, 2024

LLMs are stuck in time. They know everything about the past but lack access to information after their training cutoff, which leads to inaccurate responses. Additionally, LLMs have no way of interacting with the world; they cannot take action on behalf of your users.

This has changed, and LLMs have become more and more capable.

1️⃣ It started with RAG, feeding information that is retrieved in real time into the LLM's context window.

2️⃣ Now we are seeing multimodal capabilities that allow our models to process large videos, images, audio, and text of up to 2 million tokens. This lets us process larger documents without the need for RAG. However, retrieving documents with a RAG approach is still useful if you want a low-latency application.

3️⃣ In addition, LLMs turn into reasoning engines with function calling, also called tooling. This allows us to integrate web search and external API calls into our LLMs.

Let's see how we can call external APIs using Gemini with Function Calling.

This article is part #5 of my Friday livestream series. You can watch all the previous recordings. Join me every Friday from 10–11:30 AM CET / 8–10:30 UTC.

What is Function Calling?
Function calling is a feature that allows large language models to interact with external tools, APIs, and databases.

Function calling enables dynamic data retrieval instead of relying solely on the static knowledge baked into the model during training. This means the LLM can delegate tasks like fetching weather data, querying databases, or executing custom functions, and then use the results to craft a more accurate and relevant response.

Use Case
Imagine a customer reaching out to your support and asking, "What's the status of my order ID #12345?" Instead of giving a generic or outdated response, the model connects, via Function Calling, to your order management system, retrieves the real-time status of the order, and responds with something like, "Your order #12345 is on its way and is expected to arrive tomorrow."

But that's not all. The customer then decides they want to initiate a return. Now fully connected to your external systems, the model can instantly process the return request. It confirms with the customer, "Your return for order #12345 has been initiated. You'll receive a return label shortly."

This example shows how Google Cloud's Gemini Function Calling enables LLMs to retrieve real-time information and interact with external systems.

Want some more ideas?
Smart Home Integration
Use the LLM to control your smart home devices. For instance, you can ask, "Turn on the living room lights" or "What's the temperature in the kitchen?" The model connects with your home automation system, retrieves the temperature data or toggles the lights, and responds accordingly.
Appointment Scheduling
A customer could ask, "Can you book a doctor's appointment for me next Tuesday at 3 PM?" The model interacts with the scheduling system to find available slots, book appointments, and confirm them with the user.

It all starts with a Function Declaration
Before anything else, let us focus on the function declaration. A function declaration describes what the function can do and which parameters it takes. The Gemini model uses this information to decide which function to select and how to pass the parameters. It is therefore extremely important to include as much detail as possible.

from vertexai.generative_models import FunctionDeclaration

get_order_status_func = FunctionDeclaration(
    name="get_order_status",
    description="Retrieve the current status of an order by its order ID.",
    parameters={
        "type": "object",
        "properties": {
            "order_id": {
                "type": "string",
                "description": "The unique identifier of the order."
            }
        },
        "required": ["order_id"]
    },
)

As an alternative, you can define a FunctionDeclaration directly from a function.

def get_order_status(order_id: str):
    # Simulated response
    return {
        "order_id": order_id,
        "expected_delivery": "Tomorrow"
    }

get_order_status_func = FunctionDeclaration.from_func(get_order_status)
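The same from_func shortcut also covers the appointment-scheduling idea above. The following is only a minimal sketch; book_appointment is a hypothetical function invented for illustration and not part of this article's code:

def book_appointment(date: str, time: str):
    """Book a doctor's appointment for the given date and time."""
    # Simulated response; a real implementation would call your scheduling system
    return {"date": date, "time": time, "status": "confirmed"}

book_appointment_func = FunctionDeclaration.from_func(book_appointment)

Because the declaration is inferred from the signature and docstring, keeping both descriptive matters just as much as in the explicit declaration above.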
Depending on your prompt, the model decides whether a function must be called or whether it can answer directly. If a function call is needed, the model returns the matching functions together with their parameters.

Combine it with Gemini as a Generative Model
Let's do a full example and dig deeper into the model's response. Functions are provided as tools to Gemini. A tool can consist of multiple function declarations, and you can pass multiple tools to the Gemini API.

from vertexai.generative_models import (
    FunctionDeclaration,
    GenerationConfig,
    GenerativeModel,
    Tool
)

get_order_status_func = FunctionDeclaration(
    name="get_order_status",
    description="Retrieve the current status of an order by its order ID.",
    parameters={
        "type": "object",
        "properties": {
            "order_id": {
                "type": "string",
                "description": "The unique identifier of the order."
            }
        },
        "required": ["order_id"]
    },
)

order_tool = Tool(
    function_declarations=[
        get_order_status_func,
    ],
)

model = GenerativeModel(
    "gemini-1.5-flash-001",
    generation_config=GenerationConfig(temperature=0),
    tools=[order_tool],
)
chat = model.start_chat()

prompt = "Can you check where my order with ID 12345 is?"

response = chat.send_message(prompt)
print(response.candidates[0].content)

# only get the matching functions
function_calls = response.candidates[0].function_calls
print(function_calls)

Below is the model's response. As you can see, we only get back the function that fits the prompt. As I said, the model is not calling the function for you. It is reasoning, deciding which function to call and with which parameters, like the order ID that was extracted from our prompt: "Can you check where my order with ID 12345 is?"

role: "model"
parts {
  function_call {
    name: "get_order_status"
    args {
      fields {
        key: "order_id"
        value {
          string_value: "12345"
        }
      }
    }
  }
}

If I ask the model about the capital of Berlin and my order in one prompt, we get both back as separate parts of the response. Gemini 1.5 also allows for parallel function calling: if multiple functions match, you get all of them in the response.

parts {
  text: "The capital of Berlin is Berlin. \n\n"
}
parts {
  function_call {
    name: "get_order_status"
    args {
      fields {
        key: "order_id"
        value {
          string_value: "12345"
        }
      }
    }
  }
}

I think this behavior is brilliant, and it makes function calling incredibly flexible for us.

Calling the Function
The model is not calling the function, so we need to handle that ourselves. We will discuss automatic function calling at the end of the article.
Stay with me.

First, we iterate through the function_calls array returned by the model. The model only returns functions that match our query. Each function call includes the function's name and the arguments extracted from the prompt. We check the function name to determine which action to take.

function_calls = response.candidates[0].function_calls

for function_call in response.candidates[0].function_calls:
    print(function_call)
    if function_call.name == "get_order_status":
        # call external API to get the order status
        api_response = {...}
    elif function_call.name == "initiate_return":
        # call external API to initiate the return
        api_response = {...}

Using the Response
Gemini Function Calling's flexibility lies in its ability to identify and delegate tasks, but it relies on our code to complete the task.

Once we receive the function calls from the model, we execute these functions ourselves, retrieve the necessary data, and then pass this information back to the model.

After generating the API response, we pass this data back to the Gemini model, which generates a natural-language response that is ready to be presented to the user. This could be something like, "Your order #12345 is on its way and is expected to arrive tomorrow."

for function_call in response.candidates[0].function_calls:

    if function_call.name == "get_order_status":
        order_id = function_call.args["order_id"]

        # dummy data
        api_response = {
            "order_id": order_id,
            "expected_delivery": "Tomorrow"
        }

    elif function_call.name == "initiate_return":
        order_id = function_call.args["order_id"]
        reason = function_call.args.get("reason", "No reason provided")

        # dummy data
        api_response = {
            "order_id": order_id,
            "return_status": "Return initiated successfully.",
            "return_label": "You will receive a return label shortly."
        }

    # Return the dummy API response to Gemini so it can generate a model response or request another function call
    response = model.generate_content(
        [
            user_prompt_content,  # User prompt
            response.candidates[0].content,  # Function call response
            Content(
                parts=[
                    Part.from_function_response(
                        name=function_call.name,
                        response={"content": api_response},  # Return the dummy API response to Gemini
                    ),
                ],
            ),
        ],
        tools=[support_tool],
    )

# Get the model response and print it
print(response.text)
# response: Your order #12345 is expected to be delivered tomorrow.

I've used if and elif only for demonstration purposes. If you have many more functions, it makes sense to use a dictionary with dynamic function execution.

function_handlers = {
    "get_order_status": get_order_status,
    "initiate_return": initiate_return,
}

for function_call in response.candidates[0].function_calls:
    print(function_call)
    function_name = function_call.name
    args = {key: value for key, value in function_call.args.items()}

    if function_name in function_handlers:
        # unpack args so handlers receive them as keyword arguments
        function_response = function_handlers[function_name](**args)
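For this snippet to run, both handlers must exist. get_order_status was defined earlier; a matching initiate_return could look like the following minimal sketch, which simply returns the same dummy data used above and is not the article's actual implementation:

def initiate_return(order_id: str, reason: str = "No reason provided"):
    # Simulated response reusing the dummy data from the earlier example
    return {
        "order_id": order_id,
        "return_status": "Return initiated successfully.",
        "return_label": "You will receive a return label shortly."
    }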
Security for API invocations
If the model ends up calling functions that reach your backend, your model's users are effectively interacting with your systems. You must ensure the same security standards as for any other user-facing product, and make sure the data sent to your APIs is not malicious.
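As a minimal sketch of what that can mean in practice (the validation rule below is an illustrative assumption, not from the article), you can check the extracted arguments before they ever reach your backend:

import re

def validate_order_id(order_id: str) -> str:
    # Accept only the order ID format the backend expects, e.g. exactly five digits
    if not re.fullmatch(r"\d{5}", order_id):
        raise ValueError(f"Rejected suspicious order_id: {order_id!r}")
    return order_id

for function_call in response.candidates[0].function_calls:
    if function_call.name == "get_order_status":
        order_id = validate_order_id(function_call.args["order_id"])
        # only now call the external API with the validated value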

Google AI vs Vertex AI
As of August 2024, the SDKs for Vertex AI and Google AI differ. The Google AI SDK supports automatic function calling and a few other features, such as tool control; the Vertex AI SDK does not support automatic function calling yet. I hope Google will add this feature to Vertex AI as well.

If you stumble over the following code, you are using Google AI, not Vertex AI.

model.start_chat(enable_automatic_function_calling=True)
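For comparison, this is roughly how automatic function calling looks with the Google AI SDK (a sketch assuming the google-generativeai package and an API key; it is not part of this article's Vertex AI code):

import google.generativeai as genai

genai.configure(api_key="YOUR_GOOGLE_AI_API_KEY")

# Plain Python functions can be passed as tools; the SDK infers their schemas
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    tools=[get_order_status],
)

chat = model.start_chat(enable_automatic_function_calling=True)
response = chat.send_message("What's the status of my order ID #12345?")
print(response.text)  # the SDK has already executed get_order_status for us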
I found the following in the Vertex AI SDK. It indicates upcoming support for automatic function calling, but it has not been released as of Vertex AI 1.64.0. In the sections above, you can see how we can call the functions ourselves in the meantime.

python-aiplatform/vertexai/generative_models/_generative_models.py at… (github.com)

This is probably how it will work using AutomaticFunctionCalling. I will update the article as soon as I have tested it properly.

import vertexai
from vertexai.generative_models import (
    Content,
    FunctionDeclaration,
    GenerationConfig,
    GenerativeModel,
    Tool,
    Part,
    AutomaticFunctionCallingResponder,
)

# Initialize Vertex AI
project_id = "sascha-playground-doit"
vertexai.init(project=project_id, location="us-central1")

# ... functions here

# Infer function schema from the defined functions
get_order_status_func = FunctionDeclaration.from_func(get_order_status)
initiate_return_func = FunctionDeclaration.from_func(initiate_return)

# Tool is a collection of related functions
order_tool = Tool(
    function_declarations=[get_order_status_func, initiate_return_func],
)

# Initialize the model with the tool and set up automatic function calling
model = GenerativeModel(
    model_name="gemini-1.5-flash-001",
    system_instruction="You are a store support API assistant to help with online orders.",
    tools=[order_tool],
)

# Activate automatic function calling
afc_responder = AutomaticFunctionCallingResponder(
    max_automatic_function_calls=5,
)

# Start a chat with the responder
chat = model.start_chat(responder=afc_responder)

response = chat.send_message("What's the status of my order ID #12345?")
print(response.text)

Limitations and Best Practices
Parallel function calling is supported with Gemini 1.5.
The maximum number of function declarations is 128.
Google recommends using a lower temperature, and I can confirm this drastically improves the function-calling reasoning.
Focus on clearly describing your function declarations, including the parameters.
Combine function calling with a good system prompt.

The full code for this article is available on GitHub:
gen-ai-livestream/function-calling at main · SaschaHeyer/gen-ai-livestream (github.com)

Thanks for reading and watching
I appreciate your feedback and questions. You can find me on LinkedIn. Even better, subscribe to my YouTube channel ❤️.
generate.py ADDED
@@ -0,0 +1,101 @@
import os
import re
import json
import shutil
import pyttsx3
from pydub import AudioSegment
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Initialize GPT-2 model and tokenizer
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# System prompt and article content
system_prompt = """Generate a conversation between Sascha and Marina based on the article content provided.
Sascha is the article writer, and Marina is the interviewer. Make it engaging and emotional, with natural pauses (like "uh")
to make it sound conversational. This is for a podcast called "The Machine Learning Engineer"."""

# TTS voice map for Sascha and Marina
speaker_voice_map = {
    "Sascha": "pyttsx3",  # Sascha uses pyttsx3 for offline TTS
    "Marina": "pyttsx3"   # Marina uses pyttsx3 for offline TTS
}

# Initialize pyttsx3 engine for offline TTS
engine = pyttsx3.init()
engine.setProperty('rate', 150)    # Speed of speech
engine.setProperty('volume', 0.9)  # Volume (0.0 to 1.0)

# Pyttsx3 TTS function for offline TTS
# (pyttsx3 drivers write WAV/AIFF data, so use a .wav extension that pydub can decode)
def synthesize_speech_pyttsx3(text, speaker, index):
    filename = f"audio-files/{index}_{speaker}.wav"
    engine.save_to_file(text, filename)
    engine.runAndWait()
    print(f'Audio content written to file "{filename}"')

# Function to synthesize speech based on the speaker
def synthesize_speech(text, speaker, index):
    synthesize_speech_pyttsx3(text, speaker, index)

# Function to sort filenames naturally
def natural_sort_key(filename):
    return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)]

# Function to merge audio files
def merge_audios(audio_folder, output_file):
    combined = AudioSegment.empty()
    audio_files = sorted(
        [f for f in os.listdir(audio_folder) if f.endswith(".mp3") or f.endswith(".wav")],
        key=natural_sort_key
    )
    for filename in audio_files:
        audio_path = os.path.join(audio_folder, filename)
        print(f"Processing: {audio_path}")
        audio = AudioSegment.from_file(audio_path)
        combined += audio
    combined.export(output_file, format="mp3")
    print(f"Merged audio saved as {output_file}")

# Function to generate conversation using distilgpt2
def generate_conversation(article):
    input_text = f"{system_prompt}\n\n{article}\n\nSascha: "
    # distilgpt2 has a 1024-token context window, so truncate the prompt and cap the new tokens
    inputs = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=768)

    outputs = model.generate(
        inputs,
        max_new_tokens=256,
        num_return_sequences=1,
        do_sample=True,
        temperature=1.0,
        pad_token_id=tokenizer.eos_token_id
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Parse conversation into JSON format
    lines = re.split(r'(Sascha:|Marina:)', generated_text)[1:]  # split by speaker names
    conversation = [
        {"speaker": lines[i].strip().rstrip(':'), "text": lines[i + 1].strip()}
        for i in range(0, len(lines) - 1, 2)
    ]

    formatted_json = json.dumps(conversation, indent=4)
    print(formatted_json)
    return conversation

# Function to generate the podcast audio
def generate_audio(conversation):
    if os.path.exists('audio-files'):
        shutil.rmtree('audio-files')
    os.makedirs('audio-files', exist_ok=True)

    for index, part in enumerate(conversation):
        speaker = part['speaker']
        text = part['text']
        synthesize_speech(text, speaker, index)

    output_file = "podcast.mp3"
    merge_audios("audio-files", output_file)
    return output_file

# Read the article from the file
with open('function-calling.txt', 'r') as file:
    article = file.read()

# Generate conversation and audio
conversation = generate_conversation(article)
generate_audio(conversation)