"""📄 Document to Audio Synthesis 🎧"""

import gradio as gr
from .processor import process_document

# Presets for each synthesis mode, keyed by mode name. Every preset carries:
#   description    - one-line summary of the mode
#   styles         - style names offered for the mode (first one is the default pick)
#   default_temp   - temperature value associated with the mode
#   default_chunks - chunk-size value associated with the mode
#   system_prompt  - instruction text associated with the mode
SYNTHESIS_MODES = {
    # Plain read-through of the document.
    "narration": {
        "description": "Simple document narration with clear voice and natural pacing",
        "styles": ["Technical", "Narrative", "Instructional", "Descriptive"],
        "default_temp": 0.7,
        "default_chunks": 300,
        "system_prompt": "Convert this content into clear narration.",
    },
    # Conversational delivery.
    "podcast": {
        "description": "Conversational style with engaging tone and dynamic pacing",
        "styles": ["Casual", "Interview", "Educational", "Commentary"],
        "default_temp": 0.8,
        "default_chunks": 400,
        "system_prompt": "Transform this content into engaging podcast-style speech.",
    },
    # Structured, professional delivery.
    "presentation": {
        "description": "Professional presentation style with clear structure",
        "styles": ["Business", "Academic", "Sales", "Training"],
        "default_temp": 0.6,
        "default_chunks": 250,
        "system_prompt": "Convert this content into a presentation format.",
    },
    # Story-driven delivery.
    "storytelling": {
        "description": "Narrative style with emotional engagement",
        "styles": ["Dynamic", "Dramatic", "Calm", "Energetic"],
        "default_temp": 0.9,
        "default_chunks": 500,
        "system_prompt": "Transform this content into an engaging story.",
    },
}

def create_interface():
    """Assemble the Gradio Blocks UI for the document-to-audio app.

    The layout is built top to bottom: a header, two overview accordions,
    core settings (API key and PDF upload) beside the mode picker, voice/style
    radios beside the processing sliders, the generate button with a status
    box, tabbed outputs, and a footer.

    Two events are wired:
      * changing the mode refreshes the style choices, chunk size,
        temperature and mode description from ``SYNTHESIS_MODES``;
      * clicking the button forwards the current settings to
        ``process_document`` and routes its result to the table, audio
        player and status box.

    Returns:
        The constructed ``gr.Blocks`` object; it is not launched here.
    """
    # Every widget starts out showing the "narration" preset.
    initial_mode = SYNTHESIS_MODES["narration"]

    def update_mode(mode_name):
        """Build the updates for style, chunk size, temperature and description."""
        preset = SYNTHESIS_MODES[mode_name]
        available_styles = preset["styles"]
        style_update = gr.update(choices=available_styles, value=available_styles[0])
        chunk_update = gr.update(value=preset["default_chunks"])
        temp_update = gr.update(value=preset["default_temp"])
        return style_update, chunk_update, temp_update, preset["description"]

    def update_interface(pdf_file, api_key, mode_name, voice, style, chunk_size, temperature, max_tokens):
        """Call ``process_document`` using the selected mode's system prompt."""
        prompt = SYNTHESIS_MODES[mode_name]["system_prompt"]
        return process_document(
            pdf_file=pdf_file,
            api_key=api_key,
            voice_choice=voice,
            style_choice=style,
            chunk_size=chunk_size,
            temperature=temperature,
            max_tokens=max_tokens,
            system_prompt=prompt,
        )

    with gr.Blocks(theme=gr.themes.Base()) as app:
        # --- Header: logo and title ---
        gr.HTML(
            """
            <div style="margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" 
                     alt="Pixeltable" style="max-width: 150px;" />
                <h1>📄 Document to Audio Synthesis 🎧</h1>
            </div>
            """
        )

        # --- Overview: what the app does / how it works ---
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What does it do?", open=True):
                    gr.Markdown("""
                        - 📄 Document processing  - 🧠 Content transformation
                        - 🎧 Audio synthesis  - ⚙️ Multiple output styles
                    """)
            with gr.Column():
                with gr.Accordion("⚡ How does it work?", open=True):
                    gr.Markdown("""
                        1. 📑 **Processing:** Token-based segmentation
                        2. 🔍 **Analysis:** LLM optimization & scripts
                        3. 🎵 **Synthesis:** Multiple voice options
                    """)

        # Seeded with the narration preset; no event in this function reads
        # or writes this state.
        synthesis_mode = gr.State(initial_mode)

        # --- Credentials / upload next to the mode picker ---
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🔑 Core Settings", open=True):
                    with gr.Row():
                        api_key_box = gr.Textbox(
                            label="OpenAI API Key",
                            type="password",
                            placeholder="sk-...",
                            scale=2,
                        )
                        pdf_upload = gr.File(
                            label="PDF Document",
                            file_types=[".pdf"],
                            scale=1,
                        )

            with gr.Column():
                with gr.Accordion("🎭 Output Mode", open=True):
                    mode_radio = gr.Radio(
                        label="Select Mode",
                        info="Choose output style",
                        choices=list(SYNTHESIS_MODES),
                        value="narration",
                    )
                    mode_blurb = gr.Markdown(initial_mode["description"])

        # --- Voice/style radios next to the numeric parameters ---
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎛️ Voice & Style", open=True):
                    voice_radio = gr.Radio(
                        label="🎙️ Voice",
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        interactive=True,
                    )
                    style_radio = gr.Radio(
                        label="💫 Style",
                        choices=initial_mode["styles"],
                        value=initial_mode["styles"][0],
                        interactive=True,
                    )

            with gr.Column():
                with gr.Accordion("⚙️ Processing Parameters", open=True):
                    with gr.Row():
                        chunk_slider = gr.Slider(
                            label="📏 Chunk Size",
                            minimum=100,
                            maximum=1000,
                            step=50,
                            value=initial_mode["default_chunks"],
                        )
                        temp_slider = gr.Slider(
                            label="🌡️ Temperature",
                            minimum=0,
                            maximum=1,
                            step=0.1,
                            value=initial_mode["default_temp"],
                        )
                        token_slider = gr.Slider(
                            label="📊 Tokens",
                            minimum=100,
                            maximum=1000,
                            step=50,
                            value=300,
                        )

        # --- Trigger and status ---
        with gr.Row():
            generate_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)
            status_box = gr.Textbox(label="📋 Status", scale=1)

        # --- Results: per-segment table and the audio player ---
        with gr.Tabs():
            with gr.TabItem("📝 Content"):
                segments_table = gr.Dataframe(
                    headers=["🔍 Segment", "📄 Content", "🎭 Script"],
                    wrap=True,
                )
            with gr.TabItem("🎧 Audio"):
                with gr.Row():
                    with gr.Column(scale=2):
                        audio_player = gr.Audio(
                            label="🔊 Synthesized Audio",
                            type="filepath",
                            show_download_button=True,
                        )
                    with gr.Column(scale=1):
                        with gr.Accordion("📚 Quick Tips", open=True):
                            gr.Markdown("""
                                - 🎯 Lower temperature = more consistent
                                - 📏 Smaller chunks = more precise
                                - 🎙️ Try different voices for best fit
                                - 💫 Match style to content type
                            """)

        # --- Footer links ---
        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                <p style="margin: 0; color: #666; font-size: 0.8em;">
                    🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
                    | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Docs</a>
                    | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">HF Space</a>
                </p>
            </div>
            """
        )

        # --- Event wiring (mode change first, then the generate click) ---
        mode_radio.change(
            update_mode,
            inputs=[mode_radio],
            outputs=[style_radio, chunk_slider, temp_slider, mode_blurb],
        )

        generate_btn.click(
            update_interface,
            inputs=[
                pdf_upload,
                api_key_box,
                mode_radio,
                voice_radio,
                style_radio,
                chunk_slider,
                temp_slider,
                token_slider,
            ],
            outputs=[segments_table, audio_player, status_box],
        )

    return app