import gradio as gr
from .processor import process_document
# Presets keyed by synthesis mode name. Each preset supplies the text shown
# under the mode selector ("description"), the style choices offered for that
# mode ("styles"), the default chunk-size / temperature slider values, and the
# system prompt passed to the document processor.
SYNTHESIS_MODES = {
    "narration": {
        "description": "Simple document narration with clear voice and natural pacing",
        "styles": ["Technical", "Narrative", "Instructional", "Descriptive"],
        "default_temp": 0.7,
        "default_chunks": 300,
        "system_prompt": "Convert this content into clear narration.",
    },
    "podcast": {
        "description": "Conversational style with engaging tone and dynamic pacing",
        "styles": ["Casual", "Interview", "Educational", "Commentary"],
        "default_temp": 0.8,
        "default_chunks": 400,
        "system_prompt": "Transform this content into engaging podcast-style speech.",
    },
    "presentation": {
        "description": "Professional presentation style with clear structure",
        "styles": ["Business", "Academic", "Sales", "Training"],
        "default_temp": 0.6,
        "default_chunks": 250,
        "system_prompt": "Convert this content into a presentation format.",
    },
    "storytelling": {
        "description": "Narrative style with emotional engagement",
        "styles": ["Dynamic", "Dramatic", "Calm", "Energetic"],
        "default_temp": 0.9,
        "default_chunks": 500,
        "system_prompt": "Transform this content into an engaging story.",
    },
}
def create_interface():
    """Build the Gradio Blocks UI for document-to-audio synthesis.

    Lays out the overview, settings and output widgets, then wires two events:

    * changing the mode radio refreshes the style choices, chunk size,
      temperature and mode description from ``SYNTHESIS_MODES``;
    * clicking the generate button forwards the current widget values to
      ``process_document`` and routes its result to the content table, the
      audio player and the status box.

    NOTE(review): the widget nesting below was reconstructed from the section
    comments and ``scale`` hints; confirm it against the rendered layout
    (in particular whether the token slider shares a row with the other two).

    Returns:
        The assembled ``gr.Blocks`` instance. It is not launched here.
    """
    default_mode = SYNTHESIS_MODES["narration"]

    with gr.Blocks(theme=gr.themes.Base()) as demo:
        # Multi-line UI strings are spelled out as explicit literals so that
        # source indentation never leaks into the rendered text.
        gr.HTML("\n📄 Document to Audio Synthesis 🎧\n")

        # Overview Row
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What does it do?", open=True):
                    gr.Markdown(
                        "\n"
                        "- 📄 Document processing - 🧠 Content transformation\n"
                        "- 🎧 Audio synthesis - ⚙️ Multiple output styles\n"
                    )
            with gr.Column():
                with gr.Accordion("⚡ How does it work?", open=True):
                    gr.Markdown(
                        "\n"
                        "1. 📑 **Processing:** Token-based segmentation\n"
                        "2. 🔍 **Analysis:** LLM optimization & scripts\n"
                        "3. 🎵 **Synthesis:** Multiple voice options\n"
                    )

        # Main Settings Row
        with gr.Row():
            # Core Settings Column
            with gr.Column():
                with gr.Accordion("🔑 Core Settings", open=True):
                    with gr.Row():
                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password",
                            scale=2,
                        )
                        file_input = gr.File(
                            label="PDF Document",
                            file_types=[".pdf"],
                            scale=1,
                        )
            # Mode Selection Column
            with gr.Column():
                with gr.Accordion("🎭 Output Mode", open=True):
                    mode_select = gr.Radio(
                        choices=list(SYNTHESIS_MODES.keys()),
                        value="narration",
                        label="Select Mode",
                        info="Choose output style",
                    )
                    mode_description = gr.Markdown(default_mode["description"])

        # Voice and Processing Settings Row
        with gr.Row():
            # Voice Settings Column
            with gr.Column():
                with gr.Accordion("🎛️ Voice & Style", open=True):
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="🎙️ Voice",
                        interactive=True,
                    )
                    style_select = gr.Radio(
                        choices=default_mode["styles"],
                        value=default_mode["styles"][0],
                        label="💫 Style",
                        interactive=True,
                    )
            # Processing Settings Column
            with gr.Column():
                with gr.Accordion("⚙️ Processing Parameters", open=True):
                    with gr.Row():
                        chunk_size = gr.Slider(
                            minimum=100,
                            maximum=1000,
                            value=default_mode["default_chunks"],
                            step=50,
                            label="📏 Chunk Size",
                        )
                        temperature = gr.Slider(
                            minimum=0,
                            maximum=1,
                            value=default_mode["default_temp"],
                            step=0.1,
                            label="🌡️ Temperature",
                        )
                        max_tokens = gr.Slider(
                            minimum=100,
                            maximum=1000,
                            value=300,
                            step=50,
                            label="📊 Tokens",
                        )

        # Process Button Row
        with gr.Row():
            process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)
            status_output = gr.Textbox(label="📋 Status", scale=1)

        # Output Section
        with gr.Tabs():
            with gr.TabItem("📝 Content"):
                output_table = gr.Dataframe(
                    headers=["🔍 Segment", "📄 Content", "🎭 Script"],
                    wrap=True,
                )
            with gr.TabItem("🎧 Audio"):
                with gr.Row():
                    with gr.Column(scale=2):
                        audio_output = gr.Audio(
                            label="🔊 Synthesized Audio",
                            type="filepath",
                            show_download_button=True,
                        )
                    with gr.Column(scale=1):
                        with gr.Accordion("📚 Quick Tips", open=True):
                            gr.Markdown(
                                "\n"
                                "- 🎯 Lower temperature = more consistent\n"
                                "- 📏 Smaller chunks = more precise\n"
                                "- 🎙️ Try different voices for best fit\n"
                                "- 💫 Match style to content type\n"
                            )

        # Trailing HTML block; its content is a single newline.
        gr.HTML("\n")

        def update_mode(mode_name):
            """Return widget updates that apply the presets of *mode_name*.

            The tuple order matches the ``outputs`` list of the ``change``
            event below: style radio, chunk-size slider, temperature slider,
            mode description text.
            """
            mode = SYNTHESIS_MODES[mode_name]
            return (
                gr.update(choices=mode["styles"], value=mode["styles"][0]),
                gr.update(value=mode["default_chunks"]),
                gr.update(value=mode["default_temp"]),
                mode["description"],
            )

        mode_select.change(
            update_mode,
            inputs=[mode_select],
            outputs=[style_select, chunk_size, temperature, mode_description],
        )

        def update_interface(pdf_file, api_key, mode_name, voice, style, chunk_size, temperature, max_tokens):
            """Run the synthesis pipeline with the current widget values.

            The arguments are the values of the components listed in the
            click event's ``inputs`` (they shadow the component variables of
            the same name on purpose: inside the handler only values exist).
            The result feeds the three ``outputs`` of the click event, so it
            is presumably a (table, audio path, status) triple as returned by
            ``process_document`` -- verify against that function.
            """
            if pdf_file is None:
                # Nothing was uploaded: report it in the status box and leave
                # the table and audio outputs untouched instead of passing
                # None on to the processor.
                return gr.update(), gr.update(), "⚠️ Please upload a PDF document first."

            mode = SYNTHESIS_MODES[mode_name]
            return process_document(
                pdf_file=pdf_file,
                api_key=api_key,
                voice_choice=voice,
                style_choice=style,
                chunk_size=chunk_size,
                temperature=temperature,
                max_tokens=max_tokens,
                system_prompt=mode["system_prompt"],
            )

        process_btn.click(
            update_interface,
            inputs=[
                file_input, api_key, mode_select, voice_select, style_select,
                chunk_size, temperature, max_tokens,
            ],
            outputs=[output_table, audio_output, status_output],
        )

    return demo