added base files
- .env +1 -0
- app.py +149 -0
- function-calling.txt +322 -0
- generate.py +101 -0
.env
ADDED
@@ -0,0 +1 @@
ELEVENLABS_API_KEY="sk_d508e1a1195d494e5bc09b7b60fec683000a88e3fedd561a"
app.py
ADDED
@@ -0,0 +1,149 @@
import streamlit as st
import os
import json
import shutil
import re
import requests
import pyttsx3
from pydub import AudioSegment
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Streamlit configuration
st.set_page_config(page_title="Podcast Generator", layout="wide")
st.title("🎙️ Podcast Generator")

# System prompt for conversation generation
system_prompt = """you are an experienced podcast host...
- based on text like an article you can create an engaging conversation between two people.
- make the conversation engaging with a lot of emotion.
- in the response, identify speakers as Sascha and Marina.
- Sascha is the writer, and Marina is the one asking questions.
- The podcast is called The Machine Learning Engineer.
- Short sentences that can be easily used with speech synthesis.
- Use natural conversation fillers like "äh" to make it sound real.
"""

# Load Hugging Face's distilgpt2 model and tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Pyttsx3 setup
engine = pyttsx3.init()
engine.setProperty("rate", 150)  # Adjust speech rate as needed
engine.setProperty("voice", "english")  # Set to English voice

# Retrieve ElevenLabs API key from environment
elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
elevenlabs_url = "https://api.elevenlabs.io/v1/text-to-speech/ERL3svWBAQ18ByCZTr4k"
elevenlabs_headers = {
    "Accept": "audio/mpeg",
    "Content-Type": "application/json",
    "xi-api-key": elevenlabs_api_key
}

# ElevenLabs TTS function for Sascha
def synthesize_speech_elevenlabs(text, speaker, index):
    data = {
        "text": text,
        "model_id": "eleven_turbo_v2_5",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75
        }
    }
    response = requests.post(elevenlabs_url, json=data, headers=elevenlabs_headers)
    filename = f"audio-files/{index}_{speaker}.mp3"
    with open(filename, "wb") as out:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                out.write(chunk)

# Pyttsx3 TTS function for Marina
def synthesize_speech_pyttsx3(text, speaker, index):
    filename = f"audio-files/{index}_{speaker}.mp3"
    engine.save_to_file(text, filename)
    engine.runAndWait()

# Function to synthesize speech based on the speaker
def synthesize_speech(text, speaker, index):
    if speaker == "Sascha":
        synthesize_speech_elevenlabs(text, speaker, index)
    else:
        synthesize_speech_pyttsx3(text, speaker, index)

# Function to sort filenames naturally
def natural_sort_key(filename):
    return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)]

# Function to merge audio files
def merge_audios(audio_folder, output_file):
    combined = AudioSegment.empty()
    audio_files = sorted(
        [f for f in os.listdir(audio_folder) if f.endswith(".mp3") or f.endswith(".wav")],
        key=natural_sort_key
    )
    for filename in audio_files:
        audio_path = os.path.join(audio_folder, filename)
        audio = AudioSegment.from_file(audio_path)
        combined += audio
    combined.export(output_file, format="mp3")

# Function to generate the conversation using distilgpt2
def generate_conversation(article):
    prompt = system_prompt + "\n\nArticle:\n" + article + "\n\nSascha: "
    # distilgpt2 has a 1024-token context window, so truncate the prompt and
    # cap the number of newly generated tokens instead of requesting a longer sequence.
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512)
    output = model.generate(input_ids, max_new_tokens=512, num_return_sequences=1, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)

    # Process output to create a structured conversation
    conversation_text = tokenizer.decode(output[0], skip_special_tokens=True)
    lines = conversation_text.splitlines()
    conversation = []
    speaker = "Sascha"
    for line in lines:
        if line.strip():
            conversation.append({"speaker": speaker, "text": line.strip()})
            speaker = "Marina" if speaker == "Sascha" else "Sascha"
    return conversation

# Function to generate the podcast audio from conversation data
def generate_audio(conversation):
    if os.path.exists('audio-files'):
        shutil.rmtree('audio-files')
    os.makedirs('audio-files', exist_ok=True)

    for index, part in enumerate(conversation):
        speaker = part['speaker']
        text = part['text']
        synthesize_speech(text, speaker, index)

    output_file = "podcast.mp3"
    merge_audios("audio-files", output_file)
    return output_file

# Streamlit inputs and outputs
article = st.text_area("Article Content", "Paste the article text here", height=300)
if st.button("Generate Podcast"):
    if not article:
        st.error("Please enter article content to generate a podcast.")
    else:
        with st.spinner("Generating conversation..."):
            conversation = generate_conversation(article)

        st.success("Conversation generated successfully!")
        st.json(conversation)

        # Generate audio files
        with st.spinner("Synthesizing audio..."):
            podcast_file = generate_audio(conversation)

        st.success("Audio synthesis complete!")
        st.audio(podcast_file, format="audio/mp3")

        with open(podcast_file, "rb") as file:
            st.download_button("Download Podcast", data=file, file_name="podcast.mp3", mime="audio/mp3")
function-calling.txt
ADDED
@@ -0,0 +1,322 @@
Vertex AI Function Calling
LLMs are turning into reasoning engines using capabilities like web search and calling external APIs.

Sascha Heyer
Published in Google Cloud - Community · 8 min read · Aug 31, 2024

LLMs are stuck in time. They know the past but lack access to information after their training date, which leads to inaccurate responses. Additionally, LLMs do not have a way of interacting with the world. They cannot take action on behalf of your users.

This has changed, and LLMs have become more and more capable.

1️⃣ It started with RAG, which feeds information retrieved in real time into the LLM's context window.

2️⃣ Now, we are seeing multimodal capabilities allowing our models to process large videos, images, audio, and text of up to 2 million tokens. This allows us to process larger documents without the need for RAG. However, retrieving documents with a RAG approach is still useful if you want a low-latency application.

3️⃣ In addition, LLMs turn into reasoning engines with function calling, also called tooling. This allows us to integrate web search and external API calls into our LLMs.

Let's see how we can call external APIs using Gemini with Function Calling.

This article is part #5 of my Friday livestream series. You can watch all the previous recordings. Join me every Friday from 10–11:30 AM CET / 8–10:30 UTC.

What is Function Calling?
Function calling is a feature that allows large language models to interact with external tools, APIs, and databases.

Function calling enables dynamic data retrieval instead of solely relying on the static knowledge baked into the model during training. This means the LLM can delegate tasks like fetching weather data, querying databases, or executing custom functions, and then use the results to craft a more accurate and relevant response.

Use Case
Imagine a customer reaching out to your support and asking, "What's the status of my order ID #12345?" Instead of giving a generic or outdated response, the model connects, via Function Calling, to your order management system, retrieves the real-time status of the order, and responds with something like, "Your order #12345 is on its way and is expected to arrive tomorrow."

But that's not all. The customer then decides they want to initiate a return. Now fully connected to your external systems, the model can instantly process the return request. It confirms with the customer, "Your return for order #12345 has been initiated. You'll receive a return label shortly."

This example shows how Google Cloud's Gemini Function Calling enables LLMs to retrieve real-time information and interact with external systems.

Want some more ideas?
Smart Home Integration
Use the LLM to control your smart home devices. For instance, you can ask, "Turn on the living room lights" or "What's the temperature in the kitchen?" The model connects with your home automation system, retrieves the temperature data or toggles the lights, and responds accordingly.
Appointment Scheduling
A customer could ask, "Can you book a doctor's appointment for me next Tuesday at 3 PM?" The model interacts with the scheduling system to find an available slot, book the appointment, and confirm it with the user.
It all starts with a Function Declaration
Before anything else, let us focus on the function declaration. A function declaration describes what a function can do and what parameters it takes. The Gemini model uses this information to decide which function to select and how to fill in the parameters. Therefore, it is extremely important to include as much detail as possible.

from vertexai.generative_models import FunctionDeclaration

get_order_status_func = FunctionDeclaration(
    name="get_order_status",
    description="Retrieve the current status of an order by its order ID.",
    parameters={
        "type": "object",
        "properties": {
            "order_id": {
                "type": "string",
                "description": "The unique identifier of the order."
            }
        },
        "required": ["order_id"]
    },
)

As an alternative, you can define a FunctionDeclaration directly from a function.

def get_order_status(order_id: str):
    # Simulated response
    return {
        "order_id": order_id,
        "expected_delivery": "Tomorrow"
    }

get_order_status_func = FunctionDeclaration.from_func(get_order_status)

Depending on your prompt, the model decides whether it can answer directly or whether a function must be called. If a function is needed, the model returns the matching function(s) together with the parameters to pass.

Combine it with Gemini as a Generative Model
Let's do a full example and dig deeper into the model's response. Functions are provided to Gemini as tools. A tool can consist of multiple function declarations, and you can pass multiple tools to the Gemini API.

from vertexai.generative_models import (
    FunctionDeclaration,
    GenerationConfig,
    GenerativeModel,
    Tool
)

get_order_status_func = FunctionDeclaration(
    name="get_order_status",
    description="Retrieve the current status of an order by its order ID.",
    parameters={
        "type": "object",
        "properties": {
            "order_id": {
                "type": "string",
                "description": "The unique identifier of the order."
            }
        },
        "required": ["order_id"]
    },
)

order_tool = Tool(
    function_declarations=[
        get_order_status_func,
    ],
)

model = GenerativeModel(
    "gemini-1.5-flash-001",
    generation_config=GenerationConfig(temperature=0),
    tools=[order_tool],
)
chat = model.start_chat()

prompt = "Can you check where my order with ID 12345 is?"

response = chat.send_message(prompt)
print(response.candidates[0].content)

# only get the matching functions
function_calls = response.candidates[0].function_calls
print(function_calls)

Below is the model's response. As you can see, we just get back the function that fits the prompt. As I said, the model is not calling the function for you. It is reasoning, deciding which function to call with what parameters, like the order ID that was extracted from our prompt: Can you check where my order with ID 12345 is?

role: "model"
parts {
  function_call {
    name: "get_order_status"
    args {
      fields {
        key: "order_id"
        value {
          string_value: "12345"
        }
      }
    }
  }
}

If I ask the model about the capital of Berlin and my order in one prompt, we get both answers back as separate parts. Gemini 1.5 also allows for parallel function calling: if multiple functions match, you get all of them in the response.

parts {
  text: "The capital of Berlin is Berlin. \n\n"
}
parts {
  function_call {
    name: "get_order_status"
    args {
      fields {
        key: "order_id"
        value {
          string_value: "12345"
        }
      }
    }
  }
}

I think this design is brilliant, and it makes function calling incredibly flexible for us.
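As a minimal sketch of how you might separate the two kinds of parts (this helper loop is mine, not part of the repository code, and the exact Part accessors can differ between SDK versions), you can read the text parts and the function calls from the same candidate:

response = chat.send_message(
    "What is the capital of Berlin, and where is my order with ID 12345?"
)
candidate = response.candidates[0]

# Function calls the model wants us to execute (empty if it answered directly)
for function_call in candidate.function_calls:
    args = {key: value for key, value in function_call.args.items()}
    print("function to call:", function_call.name, args)

# Plain text parts the model answered directly; the assumption here is that
# accessing .text on a function-call part raises, so we guard it.
for part in candidate.content.parts:
    try:
        print("text part:", part.text)
    except (AttributeError, ValueError):
        pass  # this part is a function call, handled above
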

Calling the Function
The model is not calling the function, so we need to handle that ourselves. We will discuss automatic function calling at the end of the article.
Stay with me.

First, we iterate through the function_calls array returned by the model. The model only returns functions that match our query. Each function call includes the function's name and the arguments extracted from the prompt. We check the function name to determine which action to take.

function_calls = response.candidates[0].function_calls

for function_call in response.candidates[0].function_calls:
    print(function_call)
    if function_call.name == "get_order_status":
        # call external API to get the order status
        api_response = {...}
    elif function_call.name == "initiate_return":
        # call external API to initiate the return
        api_response = {...}

Using the Response
Gemini Function Calling's flexibility lies in its ability to identify and delegate tasks, but it relies on our code to complete the task.

Once we receive the function calls from the model, we execute these functions ourselves, retrieve the necessary data, and then pass this information back to the model.

After generating the API response, we pass this data back to the Gemini model, which generates a natural language response that is ready to be presented to the user. This could be something like, "Your order #12345 is on its way and is expected to arrive tomorrow."

from vertexai.generative_models import Content, Part  # needed to send the function response back

for function_call in response.candidates[0].function_calls:

    if function_call.name == "get_order_status":
        order_id = function_call.args["order_id"]

        # dummy data
        api_response = {
            "order_id": order_id,
            "expected_delivery": "Tomorrow"
        }

    elif function_call.name == "initiate_return":
        order_id = function_call.args["order_id"]
        reason = function_call.args.get("reason", "No reason provided")

        # dummy data
        api_response = {
            "order_id": order_id,
            "return_status": "Return initiated successfully.",
            "return_label": "You will receive a return label shortly."
        }

    # Return the dummy API response to Gemini so it can generate a model response or request another function call
    response = model.generate_content(
        [
            user_prompt_content,  # User prompt
            response.candidates[0].content,  # Function call response
            Content(
                parts=[
                    Part.from_function_response(
                        name=function_call.name,
                        response={"content": api_response},  # Return the dummy API response to Gemini
                    ),
                ],
            ),
        ],
        tools=[support_tool],  # the tool that declares both functions (see the full code on GitHub)
    )

# Get the model response and print it
print(response.text)
# response: Your order #12345 is expected to be delivered tomorrow.

I've used if and elif only for demonstration purposes. If you have many more functions, it makes sense to use a dictionary with dynamic function execution.

function_handlers = {
    "get_order_status": get_order_status,
    "initiate_return": initiate_return,
}

for function_call in response.candidates[0].function_calls:
    print(function_call)
    function_name = function_call.name
    args = {key: value for key, value in function_call.args.items()}

    if function_name in function_handlers:
        # unpack the extracted arguments as keyword arguments
        function_response = function_handlers[function_name](**args)

Security for API invocations
If the functions your model calls end up reaching your own systems, your model's users are effectively interacting with those systems. You must apply the same security standards as for any other user-facing product, and make sure the data sent to your APIs is not malicious.
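As a minimal sketch (the validation helper below is hypothetical and not part of the article's GitHub code), you can validate the arguments the model extracted before they ever reach your backend:

import re

def safe_get_order_status(order_id: str):
    # Hypothetical guard: only accept order IDs in the format the backend expects
    # before forwarding them to the order management API.
    if not re.fullmatch(r"\d{1,10}", str(order_id)):
        return {"error": "Invalid order ID."}
    # ... call the real order management API here ...
    return {"order_id": order_id, "expected_delivery": "Tomorrow"}
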

Google AI vs Vertex AI
As of August 2024, the SDKs of Vertex AI and Google AI differ. The Google AI SDK supports automatic function calling and a few other features, such as tool control. The Vertex AI SDK does not support automatic function calling. I hope Google will add this feature to Vertex AI as well.

If you stumble over the following code, you are using Google AI, not Vertex AI.

model.start_chat(enable_automatic_function_calling=True)

I found the following in the Vertex AI SDK, which indicates upcoming support for automatic function calling, but it hasn't been released yet as of Vertex AI 1.64.0. In the snippet below, you can see how calling the functions this way will probably look.

python-aiplatform/vertexai/generative_models/_generative_models.py at…
A Python SDK for Vertex AI, a fully managed, end-to-end platform for data science and machine learning. …
github.com

This is probably how it will work using AutomaticFunctionCalling. I will update the article as soon as I have tested it properly.

import vertexai
from vertexai.generative_models import (
    Content,
    FunctionDeclaration,
    GenerationConfig,
    GenerativeModel,
    Tool,
    Part,
    AutomaticFunctionCallingResponder,
)

# Initialize Vertex AI
project_id = "sascha-playground-doit"
vertexai.init(project=project_id, location="us-central1")

# ... functions here

# Infer function schema from the defined functions
get_order_status_func = FunctionDeclaration.from_func(get_order_status)
initiate_return_func = FunctionDeclaration.from_func(initiate_return)

# Tool is a collection of related functions
order_tool = Tool(
    function_declarations=[get_order_status_func, initiate_return_func],
)

# Initialize the model with the tool and set up automatic function calling
model = GenerativeModel(
    model_name="gemini-1.5-flash-001",
    system_instruction="You are a store support API assistant to help with online orders.",
    tools=[order_tool],
)

# Activate automatic function calling
afc_responder = AutomaticFunctionCallingResponder(
    max_automatic_function_calls=5,
)

# Start a chat with the responder
chat = model.start_chat(responder=afc_responder)

response = chat.send_message("What's the status of my order ID #12345?")
print(response.text)

Limitations and Best Practices
Parallel function calling is supported with Gemini 1.5.
The maximum number of function declarations is 128.
Google recommends using a lower temperature, and I can confirm this drastically improves the function calling reasoning.
Focus on clearly describing your function declarations, including the parameters.
Combine the function calling with a good system prompt (see the sketch after this list).
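As a minimal sketch of the last two recommendations applied together (the system instruction text here is illustrative, not taken from the repository):

model = GenerativeModel(
    "gemini-1.5-flash-001",
    system_instruction="You are a store support assistant. Only call a function when the user asks about an order.",
    generation_config=GenerationConfig(temperature=0),  # lower temperature improves function selection
    tools=[order_tool],
)
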
The full code for this article is available on GitHub.

gen-ai-livestream/function-calling at main · SaschaHeyer/gen-ai-livestream
Contribute to SaschaHeyer/gen-ai-livestream development by creating an account on GitHub.
github.com

Thanks for reading and watching
I appreciate your feedback and questions. You can find me on LinkedIn. Even better, subscribe to my YouTube channel ❤️.
generate.py
ADDED
@@ -0,0 +1,101 @@
import os
import re
import json
import shutil
import pyttsx3
from pydub import AudioSegment
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Initialize GPT-2 model and tokenizer
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# System prompt and article content
system_prompt = """Generate a conversation between Sascha and Marina based on the article content provided.
Sascha is the article writer, and Marina is the interviewer. Make it engaging and emotional, with natural pauses (like "uh")
to make it sound conversational. This is for a podcast called "The Machine Learning Engineer"."""

# TTS voice map for Sascha and Marina
speaker_voice_map = {
    "Sascha": "pyttsx3",  # Sascha will use pyttsx3 for offline TTS
    "Marina": "pyttsx3"   # Marina uses pyttsx3 for offline TTS
}

# Initialize pyttsx3 engine for offline TTS
engine = pyttsx3.init()
engine.setProperty('rate', 150)    # Speed of speech
engine.setProperty('volume', 0.9)  # Volume (0.0 to 1.0)

# Pyttsx3 TTS function for offline TTS
def synthesize_speech_pyttsx3(text, speaker, index):
    filename = f"audio-files/{index}_{speaker}.mp3"
    engine.save_to_file(text, filename)
    engine.runAndWait()
    print(f'Audio content written to file "{filename}"')

# Function to synthesize speech based on the speaker
def synthesize_speech(text, speaker, index):
    synthesize_speech_pyttsx3(text, speaker, index)

# Function to sort filenames naturally
def natural_sort_key(filename):
    return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)]

# Function to merge audio files
def merge_audios(audio_folder, output_file):
    combined = AudioSegment.empty()
    audio_files = sorted(
        [f for f in os.listdir(audio_folder) if f.endswith(".mp3") or f.endswith(".wav")],
        key=natural_sort_key
    )
    for filename in audio_files:
        audio_path = os.path.join(audio_folder, filename)
        print(f"Processing: {audio_path}")
        audio = AudioSegment.from_file(audio_path)
        combined += audio
    combined.export(output_file, format="mp3")
    print(f"Merged audio saved as {output_file}")

# Function to generate conversation using distilgpt2
def generate_conversation(article):
    input_text = f"{system_prompt}\n\n{article}\n\nSascha: "
    # distilgpt2 has a 1024-token context window, so truncate the prompt and
    # generate new tokens on top instead of counting the prompt against max_length.
    inputs = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512)

    outputs = model.generate(inputs, max_new_tokens=512, num_return_sequences=1, do_sample=True, temperature=1.0, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Parse conversation into JSON format
    lines = re.split(r'(Sascha:|Marina:)', generated_text)[1:]  # split by speaker names
    # Drop the trailing colon from the speaker label so it can be used in filenames
    conversation = [{"speaker": lines[i].strip().rstrip(":"), "text": lines[i + 1].strip()} for i in range(0, len(lines), 2)]

    formatted_json = json.dumps(conversation, indent=4)
    print(formatted_json)
    return conversation

# Function to generate the podcast audio
def generate_audio(conversation):
    if os.path.exists('audio-files'):
        shutil.rmtree('audio-files')
    os.makedirs('audio-files', exist_ok=True)

    for index, part in enumerate(conversation):
        speaker = part['speaker']
        text = part['text']
        synthesize_speech(text, speaker, index)

    output_file = "podcast.mp3"
    merge_audios("audio-files", output_file)
    return output_file

# Read the article from the file
with open('function-calling.txt', 'r') as file:
    article = file.read()

# Generate conversation and audio
conversation = generate_conversation(article)
generate_audio(conversation)