gabrielchua committed on
Commit
8fa13bc
•
1 Parent(s): 112bea7

use meloTTS and suno bark

Browse files
Files changed (2) hide show
  1. app.py +38 -27
  2. utils.py +26 -25
app.py CHANGED
@@ -37,6 +37,15 @@ LANGUAGE_MAPPING = {
37
  "Turkish": "tr"
38
  }
39
 
 
 
 
 
 
 
 
 
 
40
  class DialogueItem(BaseModel):
41
  """A single dialogue item."""
42
 
@@ -67,19 +76,14 @@ def generate_podcast(
67
  tone: Optional[str],
68
  length: Optional[str],
69
  language: str,
 
70
  ) -> Tuple[str, str]:
71
  """Generate the audio and transcript from the PDFs and/or URL."""
72
  text = ""
73
 
74
- # Change language to the appropriate code
75
- language_mapping = {
76
- "English": "EN",
77
- "Spanish": "ES",
78
- "French": "FR",
79
- "Chinese": "ZH",
80
- "Japanese": "JP",
81
- "Korean": "KR",
82
- }
83
 
84
  # Check if at least one input is provided
85
  if not files and not url:
@@ -154,7 +158,7 @@ def generate_podcast(
154
 
155
  # Get audio file path
156
  audio_file_path = generate_podcast_audio(
157
- line.text, line.speaker, LANGUAGE_MAPPING[language]
158
  )
159
  # Read the audio file into an AudioSegment
160
  audio_segment = AudioSegment.from_file(audio_file_path)
@@ -191,7 +195,7 @@ demo = gr.Interface(
191
  <table style="border-collapse: collapse; border: none; padding: 20px;">
192
  <tr style="border: none;">
193
  <td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
194
- <img src="https://raw.githubusercontent.com/gabrielchua/daily-ai-papers/main/_include/icon.png" alt="Open NotebookLM" width="120" style="margin-bottom: 10px;">
195
  </td>
196
  <td style="border: none; vertical-align: top; padding: 10px;">
197
  <p style="margin-bottom: 15px;"><strong>Convert</strong> your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS).</p>
@@ -225,6 +229,10 @@ demo = gr.Interface(
225
  value="English",
226
  label="6. 🌐 Choose the language"
227
  ),
 
 
 
 
228
  ],
229
  outputs=[
230
  gr.Audio(label="Podcast", format="mp3"),
@@ -242,23 +250,26 @@ demo = gr.Interface(
242
  "Fun",
243
  "Short (1-2 min)",
244
  "English",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  ],
246
- # [
247
- # [],
248
- # "https://en.wikipedia.org/wiki/Hugging_Face",
249
- # "How did Hugging Face become so successful?",
250
- # "Fun",
251
- # "Short (1-2 min)",
252
- # "English",
253
- # ],
254
- # [
255
- # [],
256
- # "https://simple.wikipedia.org/wiki/Taylor_Swift",
257
- # "Why is Taylor Swift so popular?",
258
- # "Fun",
259
- # "Short (1-2 min)",
260
- # "English",
261
- # ],
262
  ],
263
  cache_examples=True,
264
  )
 
37
  "Turkish": "tr"
38
  }
39
 
40
+ MELO_TTS_LANGUAGE_MAPPING = {
41
+ "English": "EN",
42
+ "Spanish": "ES",
43
+ "French": "FR",
44
+ "Chinese": "ZH",
45
+ "Japanese": "JP",
46
+ "Korean": "KR",
47
+ }
48
+
49
  class DialogueItem(BaseModel):
50
  """A single dialogue item."""
51
 
 
76
  tone: Optional[str],
77
  length: Optional[str],
78
  language: str,
79
+ use_advanced_audio: bool,
80
  ) -> Tuple[str, str]:
81
  """Generate the audio and transcript from the PDFs and/or URL."""
82
  text = ""
83
 
84
+ # Check if the selected language is supported by MeloTTS when not using advanced audio
85
+ if not use_advanced_audio and language not in MELO_TTS_LANGUAGE_MAPPING:
86
+ raise gr.Error(f"The selected language '{language}' is not supported without advanced audio generation. Please enable advanced audio generation or choose a supported language.")
 
 
 
 
 
 
87
 
88
  # Check if at least one input is provided
89
  if not files and not url:
 
158
 
159
  # Get audio file path
160
  audio_file_path = generate_podcast_audio(
161
+ line.text, line.speaker, LANGUAGE_MAPPING[language] if use_advanced_audio else MELO_TTS_LANGUAGE_MAPPING[language], use_advanced_audio
162
  )
163
  # Read the audio file into an AudioSegment
164
  audio_segment = AudioSegment.from_file(audio_file_path)
 
195
  <table style="border-collapse: collapse; border: none; padding: 20px;">
196
  <tr style="border: none;">
197
  <td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
198
+ <img src="https://raw.githubusercontent.com/gabrielchua/daily-ai-papers/main/_includes/icon.png" alt="Open NotebookLM" width="120" style="margin-bottom: 10px;">
199
  </td>
200
  <td style="border: none; vertical-align: top; padding: 10px;">
201
  <p style="margin-bottom: 15px;"><strong>Convert</strong> your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS).</p>
 
229
  value="English",
230
  label="6. 🌐 Choose the language"
231
  ),
232
+ gr.Checkbox(
233
+ label="7. 🔄 Use advanced audio generation? (Experimental)",
234
+ value=False
235
+ )
236
  ],
237
  outputs=[
238
  gr.Audio(label="Podcast", format="mp3"),
 
250
  "Fun",
251
  "Short (1-2 min)",
252
  "English",
253
+ True
254
+ ],
255
+ [
256
+ [],
257
+ "https://en.wikipedia.org/wiki/Hugging_Face",
258
+ "How did Hugging Face become so successful?",
259
+ "Fun",
260
+ "Short (1-2 min)",
261
+ "English",
262
+ False
263
+ ],
264
+ [
265
+ [],
266
+ "https://simple.wikipedia.org/wiki/Taylor_Swift",
267
+ "Why is Taylor Swift so popular?",
268
+ "Fun",
269
+ "Short (1-2 min)",
270
+ "English",
271
+ False
272
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  ],
274
  cache_examples=True,
275
  )
utils.py CHANGED
@@ -24,7 +24,7 @@ client = OpenAI(
24
  api_key=os.getenv("FIREWORKS_API_KEY"),
25
  )
26
 
27
- # hf_client = Client("mrfakename/MeloTTS")
28
 
29
  # download and load all models
30
  preload_models()
@@ -78,34 +78,35 @@ def parse_url(url: str) -> str:
78
  return response.text
79
 
80
 
81
- def generate_podcast_audio(text: str, speaker: str, language: str) -> str:
82
 
83
- audio_array = generate_audio(text, history_prompt=f"v2/{language}_speaker_{'1' if speaker == 'Host (Jane)' else '3'}")
 
84
 
85
- file_path = f"audio_{language}_{speaker}.mp3"
86
 
87
- # save audio to disk
88
- write_wav(file_path, SAMPLE_RATE, audio_array)
89
 
90
- return file_path
91
 
92
 
93
- # """Get the audio from the TTS model from HF Spaces and adjust pitch if necessary."""
94
- # if speaker == "Guest":
95
- # accent = "EN-US" if language == "EN" else language
96
- # speed = 0.9
97
- # else: # host
98
- # accent = "EN-Default" if language == "EN" else language
99
- # speed = 1
100
- # if language != "EN" and speaker != "Guest":
101
- # speed = 1.1
102
 
103
- # # Generate audio
104
- # result = hf_client.predict(
105
- # text=text,
106
- # language=language,
107
- # speaker=accent,
108
- # speed=speed,
109
- # api_name="/synthesize",
110
- # )
111
- # return result
 
24
  api_key=os.getenv("FIREWORKS_API_KEY"),
25
  )
26
 
27
+ hf_client = Client("mrfakename/MeloTTS")
28
 
29
  # download and load all models
30
  preload_models()
 
78
  return response.text
79
 
80
 
81
+ def generate_podcast_audio(text: str, speaker: str, language: str, use_advanced_audio: bool) -> str:
82
 
83
+ if use_advanced_audio:
84
+ audio_array = generate_audio(text, history_prompt=f"v2/{language}_speaker_{'1' if speaker == 'Host (Jane)' else '3'}")
85
 
86
+ file_path = f"audio_{language}_{speaker}.mp3"
87
 
88
+ # save audio to disk
89
+ write_wav(file_path, SAMPLE_RATE, audio_array)
90
 
91
+ return file_path
92
 
93
 
94
+ else:
95
+ if speaker == "Guest":
96
+ accent = "EN-US" if language == "EN" else language
97
+ speed = 0.9
98
+ else: # host
99
+ accent = "EN-Default" if language == "EN" else language
100
+ speed = 1
101
+ if language != "EN" and speaker != "Guest":
102
+ speed = 1.1
103
 
104
+ # Generate audio
105
+ result = hf_client.predict(
106
+ text=text,
107
+ language=language,
108
+ speaker=accent,
109
+ speed=speed,
110
+ api_name="/synthesize",
111
+ )
112
+ return result