Update app.py
app.py (CHANGED)
@@ -10,10 +10,16 @@ from api import BaseSpeakerTTS, ToneColorConverter
 import langid
 import traceback
 from dotenv import load_dotenv
+from fastapi import FastAPI, UploadFile, Form
+from fastapi.responses import JSONResponse
+from gradio.routes import mount_gradio_app
 
 # Load environment variables
 load_dotenv()
 
+# Initialize FastAPI app
+app = FastAPI()
+
 # Function to download and extract checkpoints
 def download_and_extract_checkpoints():
     zip_url = "https://huggingface.co/camenduru/OpenVoice/resolve/main/checkpoints_1226.zip"
@@ -40,10 +46,6 @@ openai.api_key = os.getenv("OPENAI_API_KEY")
 if not openai.api_key:
     raise ValueError("Please set the OPENAI_API_KEY environment variable.")
 
-parser = argparse.ArgumentParser()
-parser.add_argument("--share", action='store_true', default=False, help="make link public")
-args = parser.parse_args()
-
 # Define paths to checkpoints
 en_ckpt_base = 'checkpoints/base_speakers/EN'
 zh_ckpt_base = 'checkpoints/base_speakers/ZH'
@@ -82,6 +84,7 @@ except Exception as e:
 # Supported languages
 supported_languages = ['zh', 'en']
 
+# Predict function (shared between FastAPI and Gradio)
 def predict(audio_file_pth, agree):
     text_hint = ''
     synthesized_audio_path = None
@@ -95,7 +98,7 @@ def predict(audio_file_pth, agree):
     if audio_file_pth is not None:
         speaker_wav = audio_file_pth
     else:
-        text_hint += "[ERROR] Please …
+        text_hint += "[ERROR] Please provide an audio file.\n"
         return (text_hint, None)
 
     # Transcribe audio to text using OpenAI Whisper
@@ -121,7 +124,7 @@ def predict(audio_file_pth, agree):
     print(f"Detected language: {language_predicted}")
 
     if language_predicted not in supported_languages:
-        text_hint += f"[ERROR] The detected language '{language_predicted}' is not supported …
+        text_hint += f"[ERROR] The detected language '{language_predicted}' is not supported.\n"
         return (text_hint, None)
 
     # Select TTS model based on language
@@ -134,97 +137,77 @@ def predict(audio_file_pth, agree):
         language = 'English'
         speaker_style = 'default'
 
-    # Generate response using OpenAI GPT-4
     # Generate response using OpenAI GPT-4
     try:
         response = openai.chat.completions.create(
             model="gpt-4o-mini",
             messages=[
-                {"role": "system", "content": "You are Mickey Mouse, a friendly …
+                {"role": "system", "content": "You are Mickey Mouse, a friendly character."},
                 {"role": "user", "content": input_text}
-            ],
-            max_tokens=200,
-            n=1,
-            stop=None,
-            temperature=0.7,
+            ]
         )
-
-        reply_text = response.choices[0].message.content.strip()
+        reply_text = response['choices'][0]['message']['content'].strip()
         print(f"GPT-4 Reply: {reply_text}")
     except Exception as e:
-        text_hint += f"[ERROR] …
+        text_hint += f"[ERROR] GPT-4 response failed: {str(e)}\n"
         return (text_hint, None)
 
     # Synthesize reply text to audio
     try:
         src_path = os.path.join(output_dir, 'tmp_reply.wav')
-
         tts_model.tts(reply_text, src_path, speaker=speaker_style, language=language)
-        print(f"Audio synthesized and saved to {src_path}")
 
         save_path = os.path.join(output_dir, 'output_reply.wav')
-
         tone_color_converter.convert(
             audio_src_path=src_path,
             src_se=en_source_default_se if language == 'English' else zh_source_se,
             tgt_se=target_se,
-            output_path=save_path,
-            message="@MickeyMouse"
+            output_path=save_path
         )
-        print(f"Tone color conversion completed and saved to {save_path}")
 
-        text_hint += "Response generated successfully …
+        text_hint += "Response generated successfully."
         synthesized_audio_path = save_path
 
     except Exception as e:
-        text_hint += f"[ERROR] …
-        traceback.print_exc()
+        text_hint += f"[ERROR] Synthesis failed: {str(e)}\n"
         return (text_hint, None)
 
     return (text_hint, synthesized_audio_path)
 
+
+# FastAPI endpoint for prediction
+@app.post("/predict")
+async def predict_endpoint(file: UploadFile, agree: bool = Form(...)):
+    # Save uploaded file
+    temp_file_path = f"temp_{file.filename}"
+    with open(temp_file_path, "wb") as temp_file:
+        temp_file.write(await file.read())
+
+    # Call predict
+    info, audio_path = predict(temp_file_path, agree)
+    os.remove(temp_file_path)
+
+    if audio_path:
+        return JSONResponse({"info": info, "audio": audio_path})
+    else:
+        return JSONResponse({"info": info}, status_code=400)
+
+
+# Gradio UI
 with gr.Blocks(analytics_enabled=False) as demo:
     gr.Markdown("# Mickey Mouse Voice Assistant")
 
     with gr.Row():
        with gr.Column():
-            audio_input = gr.Audio(
-                source="microphone",
-                type="filepath",
-                label="Record Your Voice",
-                info="Click the microphone button to record your voice."
-            )
-            tos_checkbox = gr.Checkbox(
-                label="Agree to Terms & Conditions",
-                value=False,
-                info="I agree to the terms of service."
-            )
+            audio_input = gr.Audio(source="microphone", type="filepath", label="Record Your Voice")
+            tos_checkbox = gr.Checkbox(label="Agree to Terms & Conditions", value=False)
             submit_button = gr.Button("Send")
 
         with gr.Column():
-            info_output = gr.Textbox(
-                label="Info",
-                interactive=False,
-                lines=4,
-            )
-            audio_output = gr.Audio(
-                label="Mickey's Response",
-                interactive=False,
-                autoplay=True,
-            )
+            info_output = gr.Textbox(label="Info", interactive=False, lines=4)
+            audio_output = gr.Audio(label="Mickey's Response", interactive=False, autoplay=True)
 
-    submit_button.click(
-        predict,
-        inputs=[audio_input, tos_checkbox],
-        outputs=[info_output, audio_output]
-    )
+    submit_button.click(predict, inputs=[audio_input, tos_checkbox], outputs=[info_output, audio_output])
 
-# …
-demo …
-demo.launch(
-    server_name="0.0.0.0",
-    server_port=int(os.environ.get("PORT", 7860)),
-    debug=True,
-    show_api=True,
-    share=False
-)
+# Mount Gradio app to FastAPI
+mount_gradio_app(app, demo, path="/")
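
With demo.launch(...) removed, app.py no longer starts a server on its own; the Gradio UI is now just an ASGI app mounted on FastAPI at "/". A minimal way to serve both the UI and the /predict route, assuming uvicorn as the server (the commit itself does not add a server start):

    # Sketch only: uvicorn and the PORT fallback are assumptions, not part of this commit.
    import os
    import uvicorn

    if __name__ == "__main__":
        # Serves the Gradio UI at "/" and the JSON API at "/predict".
        uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))

Equivalently, `uvicorn app:app --host 0.0.0.0 --port 7860` from the shell.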
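The new endpoint also makes the assistant callable without the UI. A hypothetical client sketch; the host, port, and sample.wav path are assumptions for illustration:

    # Calls the new /predict route with multipart form data.
    import requests

    with open("sample.wav", "rb") as f:
        resp = requests.post(
            "http://localhost:7860/predict",
            files={"file": ("sample.wav", f, "audio/wav")},
            data={"agree": "true"},  # bound to the Form(...) parameter
        )

    print(resp.status_code)  # 400 when predict() produced no audio
    print(resp.json())       # {"info": ..., "audio": ...} on success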
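One caveat on the reply-parsing change: openai.chat.completions.create (the v1-style client this file already uses) returns a typed object, not a dict, so `response['choices'][0]['message']['content']` raises a TypeError at runtime. The attribute form this commit deletes is the one that matches that client:

    # Attribute access matches the openai>=1.0 client API.
    reply_text = response.choices[0].message.content.strip()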