Spaces: Running on T4

gabrielchua committed (commit f17c34f, parent: 6141c41)
update app

Files changed:
- app.py (+64 -27)
- prompts.py (+43 -31)
- utils.py (+19 -10)
app.py
CHANGED
@@ -13,7 +13,7 @@ from typing import List, Literal, Tuple, Optional
 # Third-party imports
 import gradio as gr
 from loguru import logger
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from pypdf import PdfReader
 from pydub import AudioSegment
 
@@ -29,20 +29,29 @@ class DialogueItem(BaseModel):
     text: str
 
 
-class
+class ShortDialogue(BaseModel):
     """The dialogue between the host and guest."""
 
     scratchpad: str
     name_of_guest: str
-    dialogue: List[DialogueItem]
+    dialogue: List[DialogueItem] = Field(..., description="A list of dialogue items, typically between 5 to 9 items")
+
+
+class MediumDialogue(BaseModel):
+    """The dialogue between the host and guest."""
+
+    scratchpad: str
+    name_of_guest: str
+    dialogue: List[DialogueItem] = Field(..., description="A list of dialogue items, typically between 8 to 13 items")
 
 
 def generate_podcast(
     files: List[str],
     url: Optional[str],
+    question: Optional[str],
     tone: Optional[str],
     length: Optional[str],
-    language: str
+    language: str,
 ) -> Tuple[str, str]:
     """Generate the audio and transcript from the PDFs and/or URL."""
     text = ""
@@ -64,8 +73,10 @@ def generate_podcast(
     # Process PDFs if any
     if files:
         for file in files:
-            if not file.lower().endswith(
-            raise gr.Error(
+            if not file.lower().endswith(".pdf"):
+                raise gr.Error(
+                    f"File {file} is not a PDF. Please upload only PDF files."
+                )
 
             try:
                 with Path(file).open("rb") as f:
@@ -84,10 +95,14 @@ def generate_podcast(
 
     # Check total character count
     if len(text) > 100000:
-        raise gr.Error(
+        raise gr.Error(
+            "The total content is too long. Please ensure the combined text from PDFs and URL is fewer than ~100,000 characters."
+        )
+
+    # Modify the system prompt based on the user input
     modified_system_prompt = SYSTEM_PROMPT
+    if question:
+        modified_system_prompt += f"\n\PLEASE ANSWER THE FOLLOWING QN: {question}"
     if tone:
         modified_system_prompt += f"\n\nTONE: The tone of the podcast should be {tone}."
     if length:
@@ -97,10 +112,15 @@ def generate_podcast(
         }
         modified_system_prompt += f"\n\nLENGTH: {length_instructions[length]}"
     if language:
-        modified_system_prompt +=
+        modified_system_prompt += (
+            f"\n\nOUTPUT LANGUAGE <IMPORTANT>: The the podcast should be {language}."
+        )
 
     # Call the LLM
+    if length == "Short (1-2 min)":
+        llm_output = generate_script(modified_system_prompt, text, ShortDialogue)
+    else:
+        llm_output = generate_script(modified_system_prompt, text, MediumDialogue)
     logger.info(f"Generated dialogue: {llm_output}")
 
     # Process the dialogue
@@ -118,7 +138,9 @@ def generate_podcast(
         total_characters += len(line.text)
 
         # Get audio file path
-        audio_file_path = generate_audio(
+        audio_file_path = generate_audio(
+            line.text, line.speaker, language_mapping[language]
+        )
         # Read the audio file into an AudioSegment
         audio_segment = AudioSegment.from_file(audio_file_path)
         audio_segments.append(audio_segment)
@@ -149,36 +171,48 @@ def generate_podcast(
 
 demo = gr.Interface(
     title="Open NotebookLM",
-    description="
+    description="""
+
+    <table style="border-collapse: collapse; border: none; padding: 20px;">
+        <tr style="border: none;">
+            <td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
+                <img src="https://raw.githubusercontent.com/gabrielchua/open-notebooklm/main/icon.png" alt="Open NotebookLM" width="120" style="margin-bottom: 10px;">
+            </td>
+            <td style="border: none; vertical-align: top; padding: 10px;">
+                <p style="margin-bottom: 15px;"><strong>Convert</strong> your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS).</p>
+                <p style="margin-top: 15px;">Note: Only the text content of the PDFs will be processed. Images and tables are not included. The total content should be no more than 100,000 characters due to the context length of Llama 3.1 405B.</p>
+            </td>
+        </tr>
+    </table>
+    """,
     fn=generate_podcast,
     inputs=[
         gr.File(
-            label="1. 📄 Upload your PDF(s)",
-            file_types=[".pdf"],
-            file_count="multiple"
+            label="1. 📄 Upload your PDF(s)", file_types=[".pdf"], file_count="multiple"
         ),
         gr.Textbox(
             label="2. 🔗 Paste a URL (optional)",
-            placeholder="Enter a URL to include its content"
+            placeholder="Enter a URL to include its content",
         ),
-        gr.
+        gr.Textbox(label="3. 🤔 Do you have a specific question or topic in mind?"),
+        gr.Dropdown(
            choices=["Fun", "Formal"],
-            label="
+            label="4. 🎭 Choose the tone",
            value="Fun"
        ),
-        gr.
+        gr.Dropdown(
            choices=["Short (1-2 min)", "Medium (3-5 min)"],
-            label="
+            label="5. ⏱️ Choose the length",
            value="Medium (3-5 min)"
        ),
        gr.Dropdown(
            choices=["English", "Spanish", "French", "Chinese", "Japanese", "Korean"],
            value="English",
-            label="
+            label="6. 🌐 Choose the language"
        ),
    ],
    outputs=[
-        gr.Audio(label="
+        gr.Audio(label="Podcast", format="mp3"),
        gr.Markdown(label="Transcript"),
    ],
    allow_flagging="never",
@@ -189,27 +223,30 @@ demo = gr.Interface(
        [
            [str(Path("examples/1310.4546v1.pdf"))],
            "",
+            "Explain this paper to me like I'm 5 years old",
            "Fun",
            "Short (1-2 min)",
-            "English"
+            "English",
        ],
        [
            [],
            "https://en.wikipedia.org/wiki/Hugging_Face",
+            "How did Hugging Face become so successful?",
            "Fun",
            "Short (1-2 min)",
-            "English"
+            "English",
        ],
        [
            [],
            "https://simple.wikipedia.org/wiki/Taylor_Swift",
+            "Why is Taylor Swift so popular?",
            "Fun",
            "Short (1-2 min)",
-            "English"
+            "English",
        ],
    ],
    cache_examples=True,
)

 if __name__ == "__main__":
-    demo.launch(show_api=True)
+    demo.launch(show_api=True)
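Related context for the generate_audio loop above: each dialogue line is synthesized to its own audio file, read back with pydub's AudioSegment, and the segments are then joined into one podcast. A minimal sketch of that stitching step, assuming hypothetical per-line file names and an MP3 export (neither is shown in this diff):

# Sketch only: stitching per-line clips into a single MP3 with pydub.
# The clip paths and the export call are illustrative assumptions, not code from this commit.
from pydub import AudioSegment

clip_paths = ["line_000.mp3", "line_001.mp3"]  # hypothetical outputs of generate_audio()

podcast = AudioSegment.empty()
for path in clip_paths:
    podcast += AudioSegment.from_file(path)  # same call app.py uses for each dialogue line

podcast.export("podcast.mp3", format="mp3")  # matches gr.Audio(label="Podcast", format="mp3")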
prompts.py
CHANGED
@@ -3,41 +3,53 @@ prompts.py
 """
 
 SYSTEM_PROMPT = """
-You are a world-class podcast producer.
-Your task is to transform the provided input text into an engaging and informative podcast script.
-You will receive as input a text that may be unstructured or messy, sourced from places like PDFs or web pages. Ignore irrelevant information or formatting issues.
-Your focus is on extracting the most interesting and insightful content for a podcast discussion.
+You are a world-class podcast producer tasked with transforming the provided input text into an engaging and informative podcast script. The input may be unstructured or messy, sourced from PDFs or web pages. Your goal is to extract the most interesting and insightful content for a compelling podcast discussion.
 
 # Steps to Follow:
 
 1. **Analyze the Input:**
-Carefully
+Carefully examine the text, identifying key topics, points, and interesting facts or anecdotes that could drive an engaging podcast conversation. Disregard irrelevant information or formatting issues.
 
 2. **Brainstorm Ideas:**
-In the `<scratchpad>`, brainstorm
-3. **
+In the `<scratchpad>`, creatively brainstorm ways to present the key points engagingly. Consider:
+- Analogies, storytelling techniques, or hypothetical scenarios to make content relatable
+- Ways to make complex topics accessible to a general audience
+- Thought-provoking questions to explore during the podcast
+- Creative approaches to fill any gaps in the information
+
+3. **Craft the Dialogue:**
+Develop a natural, conversational flow between the host (Jane) and the guest speaker (the author or an expert on the topic). Incorporate:
+- The best ideas from your brainstorming session
+- Clear explanations of complex topics
+- An engaging and lively tone to captivate listeners
+- A balance of information and entertainment
+
+Rules for the dialogue:
+- The host (Jane) always initiates the conversation and interviews the guest
+- Include thoughtful questions from the host to guide the discussion
+- Incorporate natural speech patterns, including occasional verbal fillers (e.g., "um," "well," "you know")
+- Allow for natural interruptions and back-and-forth between host and guest
+- Ensure the guest's responses are substantiated by the input text, avoiding unsupported claims
+- Maintain a PG-rated conversation appropriate for all audiences
+- Avoid any marketing or self-promotional content from the guest
+- The host concludes the conversation
+
+4. **Summarize Key Insights:**
+Naturally weave a summary of key points into the closing part of the dialogue. This should feel like a casual conversation rather than a formal recap, reinforcing the main takeaways before signing off.
+
+5. **Maintain Authenticity:**
+Throughout the script, strive for authenticity in the conversation. Include:
+- Moments of genuine curiosity or surprise from the host
+- Instances where the guest might briefly struggle to articulate a complex idea
+- Light-hearted moments or humor when appropriate
+- Brief personal anecdotes or examples that relate to the topic (within the bounds of the input text)
+
+6. **Consider Pacing and Structure:**
+Ensure the dialogue has a natural ebb and flow:
+- Start with a strong hook to grab the listener's attention
+- Gradually build complexity as the conversation progresses
+- Include brief "breather" moments for listeners to absorb complex information
+- End on a high note, perhaps with a thought-provoking question or a call-to-action for listeners
+
+Remember: Always reply in valid JSON format, without code blocks. Begin directly with the JSON output.
 """
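The prompt's final instruction ("Always reply in valid JSON format") pairs with the ShortDialogue/MediumDialogue Pydantic models added in app.py. A minimal sketch of how such a reply validates, assuming a made-up JSON payload and DialogueItem fields inferred only from how app.py uses line.speaker and line.text:

# Sketch only: validating a JSON reply against the models from app.py.
# The sample payload and the exact DialogueItem fields are assumptions for illustration.
from typing import List
from pydantic import BaseModel, Field

class DialogueItem(BaseModel):
    speaker: str  # app.py reads line.speaker and line.text; the real model may constrain these further
    text: str

class ShortDialogue(BaseModel):
    """The dialogue between the host and guest."""
    scratchpad: str
    name_of_guest: str
    dialogue: List[DialogueItem] = Field(..., description="A list of dialogue items, typically between 5 to 9 items")

sample_reply = """{
  "scratchpad": "Hook with a question, cover three key points, close with a takeaway",
  "name_of_guest": "Dr. Example",
  "dialogue": [
    {"speaker": "Host (Jane)", "text": "Welcome to the show!"},
    {"speaker": "Guest", "text": "Thanks for having me, Jane."}
  ]
}"""

dialogue = ShortDialogue.model_validate_json(sample_reply)  # same validation call used in utils.generate_script
print(dialogue.name_of_guest, len(dialogue.dialogue))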
utils.py
CHANGED
@@ -30,17 +30,22 @@ def generate_script(system_prompt: str, input_text: str, output_model):
     # Load as python object
     try:
         response = call_llm(system_prompt, input_text, output_model)
-        dialogue = output_model.model_validate_json(
-            response.choices[0].message.content
-        )
+        dialogue = output_model.model_validate_json(response.choices[0].message.content)
     except ValidationError as e:
         error_message = f"Failed to parse dialogue JSON: {e}"
         system_prompt_with_error = f"{system_prompt}\n\nPlease return a VALID JSON object. This was the earlier error: {error_message}"
         response = call_llm(system_prompt_with_error, input_text, output_model)
-        dialogue = output_model.model_validate_json(
+        dialogue = output_model.model_validate_json(response.choices[0].message.content)
+
+    # Call the LLM again to improve the dialogue
+    system_prompt_with_dialogue = f"{system_prompt}\n\nHere is the first draft of the dialogue you provided:\n\n{dialogue}."
+    response = call_llm(
+        system_prompt_with_dialogue, "Please improve the dialogue.", output_model
+    )
+    improved_dialogue = output_model.model_validate_json(
+        response.choices[0].message.content
+    )
+    return improved_dialogue
 
 
 def call_llm(system_prompt: str, text: str, dialogue_format):
@@ -78,9 +83,13 @@ def generate_audio(text: str, speaker: str, language: str) -> bytes:
     speed = 1
     if language != "EN" and speaker != "Guest":
         speed = 1.1
 
     # Generate audio
     result = hf_client.predict(
-        text=text,
+        text=text,
+        language=language,
+        speaker=accent,
+        speed=speed,
+        api_name="/synthesize",
     )
-    return result
+    return result
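For orientation, the hf_client used in generate_audio is a Gradio client for a TTS Space; it is created elsewhere in utils.py and not shown in this diff. A sketch of the call pattern with a placeholder Space ID and speaker value; only the predict() keyword arguments mirror the lines above:

# Sketch only: a gradio_client call shaped like generate_audio's predict().
# The Space ID and speaker value are placeholders; the real values live outside this diff.
from gradio_client import Client

hf_client = Client("<melo-tts-space-id>")  # placeholder Space ID

result = hf_client.predict(
    text="Hello and welcome to the show.",
    language="EN",
    speaker="EN-US",      # the diff passes an `accent` variable defined outside the shown hunk
    speed=1,
    api_name="/synthesize",
)
print(result)  # generate_audio returns this value; app.py then loads it with AudioSegment.from_file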