"""Gradio front end for the document-to-audio synthesis demo."""

import gradio as gr

from .processor import process_document

# Per-mode presets. Each entry supplies the style choices offered in the UI,
# the slider defaults applied when the mode is selected, and the system prompt
# forwarded to ``process_document``.
SYNTHESIS_MODES = {
    "narration": {
        "description": "Simple document narration with clear voice and natural pacing",
        "styles": ["Technical", "Narrative", "Instructional", "Descriptive"],
        "default_temp": 0.7,
        "default_chunks": 300,
        "system_prompt": """Convert this content into clear narration.""",
    },
    "podcast": {
        "description": "Conversational style with engaging tone and dynamic pacing",
        "styles": ["Casual", "Interview", "Educational", "Commentary"],
        "default_temp": 0.8,
        "default_chunks": 400,
        "system_prompt": """Transform this content into engaging podcast-style speech.""",
    },
    "presentation": {
        "description": "Professional presentation style with clear structure",
        "styles": ["Business", "Academic", "Sales", "Training"],
        "default_temp": 0.6,
        "default_chunks": 250,
        "system_prompt": """Convert this content into a presentation format.""",
    },
    "storytelling": {
        "description": "Narrative style with emotional engagement",
        "styles": ["Dynamic", "Dramatic", "Calm", "Energetic"],
        "default_temp": 0.9,
        "default_chunks": 500,
        "system_prompt": """Transform this content into an engaging story.""",
    },
}

# Mode preselected when the interface first loads.
_DEFAULT_MODE = "narration"

# Page header and footer markup, passed verbatim to ``gr.HTML``.
_HEADER_HTML = """
Pixeltable

📄 Document to Audio Synthesis 🎧

"""

_FOOTER_HTML = """

🚀 Powered by Pixeltable | 📚 Docs | 🤗 HF Space

"""

# Markdown panels. Each list item sits on its own line: Markdown only renders
# a list when items are newline-separated, otherwise the bullets collapse into
# a single paragraph.
_WHAT_IT_DOES_MD = """
- 📄 Document processing
- 🧠 Content transformation
- 🎧 Audio synthesis
- ⚙️ Multiple output styles
"""

_HOW_IT_WORKS_MD = """
1. 📑 **Processing:** Token-based segmentation
2. 🔍 **Analysis:** LLM optimization & scripts
3. 🎵 **Synthesis:** Multiple voice options
"""

_QUICK_TIPS_MD = """
- 🎯 Lower temperature = more consistent
- 📏 Smaller chunks = more precise
- 🎙️ Try different voices for best fit
- 💫 Match style to content type
"""


def _update_mode(mode_name):
    """Return the component updates that apply a mode's presets.

    Args:
        mode_name: Key into ``SYNTHESIS_MODES`` taken from the mode radio.

    Returns:
        A 4-tuple, in the order the ``change`` event lists its outputs:
        style radio update (new choices, first one selected), chunk-size
        update, temperature update, and the mode's description text.

    Raises:
        KeyError: If ``mode_name`` is not a key of ``SYNTHESIS_MODES``.
    """
    mode = SYNTHESIS_MODES[mode_name]
    return (
        gr.update(choices=mode["styles"], value=mode["styles"][0]),
        gr.update(value=mode["default_chunks"]),
        gr.update(value=mode["default_temp"]),
        mode["description"],
    )


def _update_interface(
    pdf_file, api_key, mode_name, voice, style, chunk_size, temperature, max_tokens
):
    """Forward the current UI values to ``process_document``.

    The selected mode is only used to look up its system prompt; every other
    argument is passed through unchanged.

    Returns:
        Whatever ``process_document`` returns. The click handler routes it to
        the content table, the audio player and the status box.

    Raises:
        KeyError: If ``mode_name`` is not a key of ``SYNTHESIS_MODES``.
    """
    mode = SYNTHESIS_MODES[mode_name]
    return process_document(
        pdf_file=pdf_file,
        api_key=api_key,
        voice_choice=voice,
        style_choice=style,
        chunk_size=chunk_size,
        temperature=temperature,
        max_tokens=max_tokens,
        system_prompt=mode["system_prompt"],
    )


def create_interface():
    """Build the Gradio ``Blocks`` app for document-to-audio synthesis.

    Lays out the overview, settings, action and output sections, then wires
    two events: changing the mode refreshes the style choices and slider
    defaults, and clicking the button runs ``process_document``.

    Returns:
        The constructed ``gr.Blocks`` instance. It is not launched here.
    """
    defaults = SYNTHESIS_MODES[_DEFAULT_MODE]

    with gr.Blocks(theme=gr.themes.Base()) as demo:
        gr.HTML(_HEADER_HTML)

        # Overview Row
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What does it do?", open=True):
                    gr.Markdown(_WHAT_IT_DOES_MD)

            with gr.Column():
                with gr.Accordion("⚡ How does it work?", open=True):
                    gr.Markdown(_HOW_IT_WORKS_MD)

        # Not wired to any event; the component is still created so the
        # app's component tree stays the same as before.
        gr.State(defaults)

        # Main Settings Row
        with gr.Row():
            # Core Settings Column
            with gr.Column():
                with gr.Accordion("🔑 Core Settings", open=True):
                    with gr.Row():
                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password",
                            scale=2,
                        )
                        file_input = gr.File(
                            label="PDF Document",
                            file_types=[".pdf"],
                            scale=1,
                        )

            # Mode Selection Column
            with gr.Column():
                with gr.Accordion("🎭 Output Mode", open=True):
                    mode_select = gr.Radio(
                        choices=list(SYNTHESIS_MODES.keys()),
                        value=_DEFAULT_MODE,
                        label="Select Mode",
                        info="Choose output style",
                    )
                    mode_description = gr.Markdown(defaults["description"])

        # Voice and Processing Settings Row
        with gr.Row():
            # Voice Settings Column
            with gr.Column():
                with gr.Accordion("🎛️ Voice & Style", open=True):
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="🎙️ Voice",
                        interactive=True,
                    )
                    style_select = gr.Radio(
                        choices=defaults["styles"],
                        value=defaults["styles"][0],
                        label="💫 Style",
                        interactive=True,
                    )

            # Processing Settings Column
            with gr.Column():
                with gr.Accordion("⚙️ Processing Parameters", open=True):
                    with gr.Row():
                        chunk_size = gr.Slider(
                            minimum=100,
                            maximum=1000,
                            value=defaults["default_chunks"],
                            step=50,
                            label="📏 Chunk Size",
                        )
                        temperature = gr.Slider(
                            minimum=0,
                            maximum=1,
                            value=defaults["default_temp"],
                            step=0.1,
                            label="🌡️ Temperature",
                        )
                        max_tokens = gr.Slider(
                            minimum=100,
                            maximum=1000,
                            value=300,
                            step=50,
                            label="📊 Tokens",
                        )

        # Process Button Row
        with gr.Row():
            process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)
            status_output = gr.Textbox(label="📋 Status", scale=1)

        # Output Section
        with gr.Tabs():
            with gr.TabItem("📝 Content"):
                output_table = gr.Dataframe(
                    headers=["🔍 Segment", "📄 Content", "🎭 Script"],
                    wrap=True,
                )

            with gr.TabItem("🎧 Audio"):
                with gr.Row():
                    with gr.Column(scale=2):
                        audio_output = gr.Audio(
                            label="🔊 Synthesized Audio",
                            type="filepath",
                            show_download_button=True,
                        )
                    with gr.Column(scale=1):
                        with gr.Accordion("📚 Quick Tips", open=True):
                            gr.Markdown(_QUICK_TIPS_MD)

        gr.HTML(_FOOTER_HTML)

        # Selecting a mode resets style, chunk size, temperature and the
        # description to that mode's presets.
        mode_select.change(
            _update_mode,
            inputs=[mode_select],
            outputs=[style_select, chunk_size, temperature, mode_description],
        )

        # Input order must match the positional parameters of
        # ``_update_interface``.
        process_btn.click(
            _update_interface,
            inputs=[
                file_input,
                api_key,
                mode_select,
                voice_select,
                style_select,
                chunk_size,
                temperature,
                max_tokens,
            ],
            outputs=[output_table, audio_output, status_output],
        )

    return demo