gabrielchua committed on
Commit
8fa13bc
•
1 Parent(s): 112bea7

use meloTTS and suno bark

Browse files
Files changed (2) hide show
  1. app.py +38 -27
  2. utils.py +26 -25
app.py CHANGED
@@ -37,6 +37,15 @@ LANGUAGE_MAPPING = {
37
  "Turkish": "tr"
38
  }
39
 
 
 
 
 
 
 
 
 
 
40
  class DialogueItem(BaseModel):
41
  """A single dialogue item."""
42
 
@@ -67,19 +76,14 @@ def generate_podcast(
67
  tone: Optional[str],
68
  length: Optional[str],
69
  language: str,
 
70
  ) -> Tuple[str, str]:
71
  """Generate the audio and transcript from the PDFs and/or URL."""
72
  text = ""
73
 
74
- # Change language to the appropriate code
75
- language_mapping = {
76
- "English": "EN",
77
- "Spanish": "ES",
78
- "French": "FR",
79
- "Chinese": "ZH",
80
- "Japanese": "JP",
81
- "Korean": "KR",
82
- }
83
 
84
  # Check if at least one input is provided
85
  if not files and not url:
@@ -154,7 +158,7 @@ def generate_podcast(
154
 
155
  # Get audio file path
156
  audio_file_path = generate_podcast_audio(
157
- line.text, line.speaker, LANGUAGE_MAPPING[language]
158
  )
159
  # Read the audio file into an AudioSegment
160
  audio_segment = AudioSegment.from_file(audio_file_path)
@@ -191,7 +195,7 @@ demo = gr.Interface(
191
  <table style="border-collapse: collapse; border: none; padding: 20px;">
192
  <tr style="border: none;">
193
  <td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
194
- <img src="https://raw.githubusercontent.com/gabrielchua/daily-ai-papers/main/_include/icon.png" alt="Open NotebookLM" width="120" style="margin-bottom: 10px;">
195
  </td>
196
  <td style="border: none; vertical-align: top; padding: 10px;">
197
  <p style="margin-bottom: 15px;"><strong>Convert</strong> your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS).</p>
@@ -225,6 +229,10 @@ demo = gr.Interface(
225
  value="English",
226
  label="6. 🌐 Choose the language"
227
  ),
 
 
 
 
228
  ],
229
  outputs=[
230
  gr.Audio(label="Podcast", format="mp3"),
@@ -242,23 +250,26 @@ demo = gr.Interface(
242
  "Fun",
243
  "Short (1-2 min)",
244
  "English",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  ],
246
- # [
247
- # [],
248
- # "https://en.wikipedia.org/wiki/Hugging_Face",
249
- # "How did Hugging Face become so successful?",
250
- # "Fun",
251
- # "Short (1-2 min)",
252
- # "English",
253
- # ],
254
- # [
255
- # [],
256
- # "https://simple.wikipedia.org/wiki/Taylor_Swift",
257
- # "Why is Taylor Swift so popular?",
258
- # "Fun",
259
- # "Short (1-2 min)",
260
- # "English",
261
- # ],
262
  ],
263
  cache_examples=True,
264
  )
 
37
  "Turkish": "tr"
38
  }
39
 
40
+ MELO_TTS_LANGUAGE_MAPPING = {
41
+ "English": "EN",
42
+ "Spanish": "ES",
43
+ "French": "FR",
44
+ "Chinese": "ZH",
45
+ "Japanese": "JP",
46
+ "Korean": "KR",
47
+ }
48
+
49
  class DialogueItem(BaseModel):
50
  """A single dialogue item."""
51
 
 
76
  tone: Optional[str],
77
  length: Optional[str],
78
  language: str,
79
+ use_advanced_audio: bool,
80
  ) -> Tuple[str, str]:
81
  """Generate the audio and transcript from the PDFs and/or URL."""
82
  text = ""
83
 
84
+ # Check if the selected language is supported by MeloTTS when not using advanced audio
85
+ if not use_advanced_audio and language not in MELO_TTS_LANGUAGE_MAPPING:
86
+ raise gr.Error(f"The selected language '{language}' is not supported without advanced audio generation. Please enable advanced audio generation or choose a supported language.")
 
 
 
 
 
 
87
 
88
  # Check if at least one input is provided
89
  if not files and not url:
 
158
 
159
  # Get audio file path
160
  audio_file_path = generate_podcast_audio(
161
+ line.text, line.speaker, LANGUAGE_MAPPING[language] if use_advanced_audio else MELO_TTS_LANGUAGE_MAPPING[language], use_advanced_audio
162
  )
163
  # Read the audio file into an AudioSegment
164
  audio_segment = AudioSegment.from_file(audio_file_path)
 
195
  <table style="border-collapse: collapse; border: none; padding: 20px;">
196
  <tr style="border: none;">
197
  <td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
198
+ <img src="https://raw.githubusercontent.com/gabrielchua/daily-ai-papers/main/_includes/icon.png" alt="Open NotebookLM" width="120" style="margin-bottom: 10px;">
199
  </td>
200
  <td style="border: none; vertical-align: top; padding: 10px;">
201
  <p style="margin-bottom: 15px;"><strong>Convert</strong> your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS).</p>
 
229
  value="English",
230
  label="6. 🌐 Choose the language"
231
  ),
232
+ gr.Checkbox(
233
+ label="7. 🔄 Use advanced audio generation? (Experimental)",
234
+ value=False
235
+ )
236
  ],
237
  outputs=[
238
  gr.Audio(label="Podcast", format="mp3"),
 
250
  "Fun",
251
  "Short (1-2 min)",
252
  "English",
253
+ True
254
+ ],
255
+ [
256
+ [],
257
+ "https://en.wikipedia.org/wiki/Hugging_Face",
258
+ "How did Hugging Face become so successful?",
259
+ "Fun",
260
+ "Short (1-2 min)",
261
+ "English",
262
+ False
263
+ ],
264
+ [
265
+ [],
266
+ "https://simple.wikipedia.org/wiki/Taylor_Swift",
267
+ "Why is Taylor Swift so popular?",
268
+ "Fun",
269
+ "Short (1-2 min)",
270
+ "English",
271
+ False
272
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  ],
274
  cache_examples=True,
275
  )
utils.py CHANGED
@@ -24,7 +24,7 @@ client = OpenAI(
24
  api_key=os.getenv("FIREWORKS_API_KEY"),
25
  )
26
 
27
- # hf_client = Client("mrfakename/MeloTTS")
28
 
29
  # download and load all models
30
  preload_models()
@@ -78,34 +78,35 @@ def parse_url(url: str) -> str:
78
  return response.text
79
 
80
 
81
- def generate_podcast_audio(text: str, speaker: str, language: str) -> str:
82
 
83
- audio_array = generate_audio(text, history_prompt=f"v2/{language}_speaker_{'1' if speaker == 'Host (Jane)' else '3'}")
 
84
 
85
- file_path = f"audio_{language}_{speaker}.mp3"
86
 
87
- # save audio to disk
88
- write_wav(file_path, SAMPLE_RATE, audio_array)
89
 
90
- return file_path
91
 
92
 
93
- # """Get the audio from the TTS model from HF Spaces and adjust pitch if necessary."""
94
- # if speaker == "Guest":
95
- # accent = "EN-US" if language == "EN" else language
96
- # speed = 0.9
97
- # else: # host
98
- # accent = "EN-Default" if language == "EN" else language
99
- # speed = 1
100
- # if language != "EN" and speaker != "Guest":
101
- # speed = 1.1
102
 
103
- # # Generate audio
104
- # result = hf_client.predict(
105
- # text=text,
106
- # language=language,
107
- # speaker=accent,
108
- # speed=speed,
109
- # api_name="/synthesize",
110
- # )
111
- # return result
 
24
  api_key=os.getenv("FIREWORKS_API_KEY"),
25
  )
26
 
27
+ hf_client = Client("mrfakename/MeloTTS")
28
 
29
  # download and load all models
30
  preload_models()
 
78
  return response.text
79
 
80
 
81
+ def generate_podcast_audio(text: str, speaker: str, language: str, use_advanced_audio: bool) -> str:
82
 
83
+ if use_advanced_audio:
84
+ audio_array = generate_audio(text, history_prompt=f"v2/{language}_speaker_{'1' if speaker == 'Host (Jane)' else '3'}")
85
 
86
+ file_path = f"audio_{language}_{speaker}.mp3"
87
 
88
+ # save audio to disk
89
+ write_wav(file_path, SAMPLE_RATE, audio_array)
90
 
91
+ return file_path
92
 
93
 
94
+ else:
95
+ if speaker == "Guest":
96
+ accent = "EN-US" if language == "EN" else language
97
+ speed = 0.9
98
+ else: # host
99
+ accent = "EN-Default" if language == "EN" else language
100
+ speed = 1
101
+ if language != "EN" and speaker != "Guest":
102
+ speed = 1.1
103
 
104
+ # Generate audio
105
+ result = hf_client.predict(
106
+ text=text,
107
+ language=language,
108
+ speaker=accent,
109
+ speed=speed,
110
+ api_name="/synthesize",
111
+ )
112
+ return result