Spaces:
Running
on
T4
Running
on
T4
gabrielchua
committed on
Commit
β’
8fa13bc
1
Parent(s):
112bea7
use meloTTS and suno bark
Browse files
app.py
CHANGED
@@ -37,6 +37,15 @@ LANGUAGE_MAPPING = {
|
|
37 |
"Turkish": "tr"
|
38 |
}
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
class DialogueItem(BaseModel):
|
41 |
"""A single dialogue item."""
|
42 |
|
@@ -67,19 +76,14 @@ def generate_podcast(
|
|
67 |
tone: Optional[str],
|
68 |
length: Optional[str],
|
69 |
language: str,
|
|
|
70 |
) -> Tuple[str, str]:
|
71 |
"""Generate the audio and transcript from the PDFs and/or URL."""
|
72 |
text = ""
|
73 |
|
74 |
-
#
|
75 |
-
|
76 |
-
"
|
77 |
-
"Spanish": "ES",
|
78 |
-
"French": "FR",
|
79 |
-
"Chinese": "ZH",
|
80 |
-
"Japanese": "JP",
|
81 |
-
"Korean": "KR",
|
82 |
-
}
|
83 |
|
84 |
# Check if at least one input is provided
|
85 |
if not files and not url:
|
@@ -154,7 +158,7 @@ def generate_podcast(
|
|
154 |
|
155 |
# Get audio file path
|
156 |
audio_file_path = generate_podcast_audio(
|
157 |
-
line.text, line.speaker, LANGUAGE_MAPPING[language]
|
158 |
)
|
159 |
# Read the audio file into an AudioSegment
|
160 |
audio_segment = AudioSegment.from_file(audio_file_path)
|
@@ -191,7 +195,7 @@ demo = gr.Interface(
|
|
191 |
<table style="border-collapse: collapse; border: none; padding: 20px;">
|
192 |
<tr style="border: none;">
|
193 |
<td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
|
194 |
-
<img src="https://raw.githubusercontent.com/gabrielchua/daily-ai-papers/main/
|
195 |
</td>
|
196 |
<td style="border: none; vertical-align: top; padding: 10px;">
|
197 |
<p style="margin-bottom: 15px;"><strong>Convert</strong> your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS).</p>
|
@@ -225,6 +229,10 @@ demo = gr.Interface(
|
|
225 |
value="English",
|
226 |
label="6. π Choose the language"
|
227 |
),
|
|
|
|
|
|
|
|
|
228 |
],
|
229 |
outputs=[
|
230 |
gr.Audio(label="Podcast", format="mp3"),
|
@@ -242,23 +250,26 @@ demo = gr.Interface(
|
|
242 |
"Fun",
|
243 |
"Short (1-2 min)",
|
244 |
"English",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
],
|
246 |
-
# [
|
247 |
-
# [],
|
248 |
-
# "https://en.wikipedia.org/wiki/Hugging_Face",
|
249 |
-
# "How did Hugging Face become so successful?",
|
250 |
-
# "Fun",
|
251 |
-
# "Short (1-2 min)",
|
252 |
-
# "English",
|
253 |
-
# ],
|
254 |
-
# [
|
255 |
-
# [],
|
256 |
-
# "https://simple.wikipedia.org/wiki/Taylor_Swift",
|
257 |
-
# "Why is Taylor Swift so popular?",
|
258 |
-
# "Fun",
|
259 |
-
# "Short (1-2 min)",
|
260 |
-
# "English",
|
261 |
-
# ],
|
262 |
],
|
263 |
cache_examples=True,
|
264 |
)
|
|
|
37 |
"Turkish": "tr"
|
38 |
}
|
39 |
|
40 |
+
# Languages the hosted MeloTTS Space can synthesize, mapped to its
# two-letter locale codes (used when advanced/Bark audio is disabled).
MELO_TTS_LANGUAGE_MAPPING = {
    "English": "EN",
    "Spanish": "ES",
    "French": "FR",
    "Chinese": "ZH",  # fixed: was "ZJ", which is not a MeloTTS code
    "Japanese": "JP",
    "Korean": "KR",
}
|
48 |
+
|
49 |
class DialogueItem(BaseModel):
|
50 |
"""A single dialogue item."""
|
51 |
|
|
|
76 |
tone: Optional[str],
|
77 |
length: Optional[str],
|
78 |
language: str,
|
79 |
+
use_advanced_audio: bool,
|
80 |
) -> Tuple[str, str]:
|
81 |
"""Generate the audio and transcript from the PDFs and/or URL."""
|
82 |
text = ""
|
83 |
|
84 |
+
# Check if the selected language is supported by MeloTTS when not using advanced audio
|
85 |
+
if not use_advanced_audio and language not in MELO_TTS_LANGUAGE_MAPPING:
|
86 |
+
raise gr.Error(f"The selected language '{language}' is not supported without advanced audio generation. Please enable advanced audio generation or choose a supported language.")
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
# Check if at least one input is provided
|
89 |
if not files and not url:
|
|
|
158 |
|
159 |
# Get audio file path
|
160 |
audio_file_path = generate_podcast_audio(
|
161 |
+
line.text, line.speaker, LANGUAGE_MAPPING[language], use_advanced_audio
|
162 |
)
|
163 |
# Read the audio file into an AudioSegment
|
164 |
audio_segment = AudioSegment.from_file(audio_file_path)
|
|
|
195 |
<table style="border-collapse: collapse; border: none; padding: 20px;">
|
196 |
<tr style="border: none;">
|
197 |
<td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
|
198 |
+
<img src="https://raw.githubusercontent.com/gabrielchua/daily-ai-papers/main/_includes/icon.png" alt="Open NotebookLM" width="120" style="margin-bottom: 10px;">
|
199 |
</td>
|
200 |
<td style="border: none; vertical-align: top; padding: 10px;">
|
201 |
<p style="margin-bottom: 15px;"><strong>Convert</strong> your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS).</p>
|
|
|
229 |
value="English",
|
230 |
label="6. π Choose the language"
|
231 |
),
|
232 |
+
gr.Checkbox(
|
233 |
+
label="7. π Use advanced audio generation? (Experimental)",
|
234 |
+
value=False
|
235 |
+
)
|
236 |
],
|
237 |
outputs=[
|
238 |
gr.Audio(label="Podcast", format="mp3"),
|
|
|
250 |
"Fun",
|
251 |
"Short (1-2 min)",
|
252 |
"English",
|
253 |
+
True
|
254 |
+
],
|
255 |
+
[
|
256 |
+
[],
|
257 |
+
"https://en.wikipedia.org/wiki/Hugging_Face",
|
258 |
+
"How did Hugging Face become so successful?",
|
259 |
+
"Fun",
|
260 |
+
"Short (1-2 min)",
|
261 |
+
"English",
|
262 |
+
False
|
263 |
+
],
|
264 |
+
[
|
265 |
+
[],
|
266 |
+
"https://simple.wikipedia.org/wiki/Taylor_Swift",
|
267 |
+
"Why is Taylor Swift so popular?",
|
268 |
+
"Fun",
|
269 |
+
"Short (1-2 min)",
|
270 |
+
"English",
|
271 |
+
False
|
272 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
],
|
274 |
cache_examples=True,
|
275 |
)
|
utils.py
CHANGED
@@ -24,7 +24,7 @@ client = OpenAI(
|
|
24 |
api_key=os.getenv("FIREWORKS_API_KEY"),
|
25 |
)
|
26 |
|
27 |
-
|
28 |
|
29 |
# download and load all models
|
30 |
preload_models()
|
@@ -78,34 +78,35 @@ def parse_url(url: str) -> str:
|
|
78 |
return response.text
|
79 |
|
80 |
|
81 |
-
def generate_podcast_audio(text: str, speaker: str, language: str) -> str:
|
82 |
|
83 |
-
|
|
|
84 |
|
85 |
-
|
86 |
|
87 |
-
|
88 |
-
|
89 |
|
90 |
-
|
91 |
|
92 |
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
24 |
api_key=os.getenv("FIREWORKS_API_KEY"),
|
25 |
)
|
26 |
|
27 |
+
# Gradio client for the hosted MeloTTS Space — the standard (non-advanced)
# TTS backend used by generate_podcast_audio.
hf_client = Client("mrfakename/MeloTTS")
|
28 |
|
29 |
# download and load all models
|
30 |
preload_models()
|
|
|
78 |
return response.text
|
79 |
|
80 |
|
81 |
+
def generate_podcast_audio(text: str, speaker: str, language: str, use_advanced_audio: bool) -> str:
    """Synthesize one line of dialogue and return the path to the audio file.

    Args:
        text: The dialogue text to speak.
        speaker: Speaker label (e.g. "Host (Jane)" or "Guest"); selects voice
            preset / accent and speed.
        language: Two-letter TTS language code (e.g. "EN", "ES") — presumably
            a MELO_TTS_LANGUAGE_MAPPING value; verify against caller.
        use_advanced_audio: When True, synthesize locally with Bark; otherwise
            call the hosted MeloTTS Space via hf_client.

    Returns:
        Filesystem path of the generated audio file.
    """
    if use_advanced_audio:
        # Bark path: voice preset 1 for the host, 3 for everyone else.
        history_prompt = f"v2/{language}_speaker_{'1' if speaker == 'Host (Jane)' else '3'}"
        audio_array = generate_audio(text, history_prompt=history_prompt)

        # write_wav emits WAV data, so label the file .wav (was .mp3, which
        # mislabeled the container); the caller's AudioSegment.from_file
        # decodes by content, so the corrected extension stays compatible.
        file_path = f"audio_{language}_{speaker}.wav"
        write_wav(file_path, SAMPLE_RATE, audio_array)
        return file_path

    # MeloTTS path: choose accent and speaking speed per speaker.
    if speaker == "Guest":
        accent = "EN-US" if language == "EN" else language
        speed = 0.9
    else:  # host
        accent = "EN-Default" if language == "EN" else language
        speed = 1
    # Non-English host lines are sped up slightly.
    if language != "EN" and speaker != "Guest":
        speed = 1.1

    # The Space's /synthesize endpoint returns the path of the audio it wrote.
    result = hf_client.predict(
        text=text,
        language=language,
        speaker=accent,
        speed=speed,
        api_name="/synthesize",
    )
    return result