Spaces:

sagar007
/

Multimodal_App

Running on Zero

App Files Community

sagar007 committed on Aug 25, 2024

Commit

b634609

verified ·

1 Parent(s): dceec72

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -12

app.py CHANGED Viewed

@@ -224,26 +224,27 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
         clear_btn.click(lambda: (None, None), None, [chatbot, audio_output], queue=False)
     with gr.Tab("Vision Model with TTS (Phi-3.5-vision)"):
         with gr.Row():
-        with gr.Column(scale=1):
-            vision_input_img = gr.Image(label="Upload an Image", type="pil")
-            vision_text_input = gr.Textbox(label="Ask a question about the image", placeholder="What do you see in this image?")
-            vision_submit_btn = gr.Button("Analyze Image and Generate Speech", variant="primary")
-        with gr.Column(scale=1):
-            vision_output_text = gr.Textbox(label="AI Analysis", lines=10)
-            vision_output_audio = gr.Audio(label="Generated Speech")
-    vision_submit_btn.click(process_vision_query,
                             inputs=[vision_input_img, vision_text_input],
                             outputs=[vision_output_text, vision_output_audio])
     with gr.Tab("Text-to-Speech (Parler-TTS)"):
         with gr.Row():
             with gr.Column(scale=1):
                 tts_prompt = gr.Textbox(label="Text to Speak", placeholder="Enter the text you want to convert to speech...")
                 tts_description = gr.Textbox(label="Voice Description", value="A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up.", lines=3)

         clear_btn.click(lambda: (None, None), None, [chatbot, audio_output], queue=False)
     with gr.Tab("Vision Model with TTS (Phi-3.5-vision)"):
         with gr.Row():
+            with gr.Column(scale=1):
+                vision_input_img = gr.Image(label="Upload an Image", type="pil")
+                vision_text_input = gr.Textbox(label="Ask a question about the image", placeholder="What do you see in this image?")
+                vision_submit_btn = gr.Button("Analyze Image and Generate Speech", variant="primary")
+            with gr.Column(scale=1):
+                vision_output_text = gr.Textbox(label="AI Analysis", lines=10)
+                vision_output_audio = gr.Audio(label="Generated Speech")
+        vision_submit_btn.click(process_vision_query,
                             inputs=[vision_input_img, vision_text_input],
                             outputs=[vision_output_text, vision_output_audio])
     with gr.Tab("Text-to-Speech (Parler-TTS)"):
         with gr.Row():
             with gr.Column(scale=1):
                 tts_prompt = gr.Textbox(label="Text to Speak", placeholder="Enter the text you want to convert to speech...")
                 tts_description = gr.Textbox(label="Voice Description", value="A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up.", lines=3)