Spaces:

sagar007
/

Multimodal_App

Running on Zero

App Files Files Community

sagar007 committed on Aug 26, 2024

Commit

1f0b302

verified ·

1 Parent(s): 1b825cc

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -18

app.py CHANGED Viewed

@@ -81,7 +81,7 @@ async def generate_speech(text, tts_model, tts_tokenizer):
 # Helper functions
 @spaces.GPU(timeout=300)
-async def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_tokens=1024, top_p=1.0, top_k=20, use_tts=True):
     try:
         conversation = [{"role": "system", "content": system_prompt}]
         for prompt, answer in history:
@@ -112,22 +112,14 @@ async def stream_text_chat(message, history, system_prompt, temperature=0.8, max
         buffer = ""
         audio_buffer = np.array([])
-        tts_future = None
         for new_text in streamer:
             buffer += new_text
-            if use_tts and len(buffer) > 50:  # Start TTS generation when buffer has enough content
-                if tts_future is None:
-                    tts_future = asyncio.get_event_loop().run_in_executor(
-                        executor, generate_speech, buffer, tts_model, tts_tokenizer
-                    )
             yield history + [[message, buffer]], (tts_model.config.sampling_rate, audio_buffer)
-        # Wait for TTS to complete if it's still running
-        if use_tts and tts_future is not None:
-            audio_buffer = await tts_future
         # Final yield with complete text and audio
         yield history + [[message, buffer]], (tts_model.config.sampling_rate, audio_buffer)
@@ -136,6 +128,16 @@ async def stream_text_chat(message, history, system_prompt, temperature=0.8, max
         print(f"An error occurred: {str(e)}")
         yield history + [[message, f"An error occurred: {str(e)}"]], None
 @spaces.GPU(timeout=300)  # Increase timeout to 5 minutes
 def process_vision_query(image, text_input):
     try:
@@ -210,7 +212,7 @@ custom_suggestions = """
 </div>
 """
-# Update the Gradio interface
 with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     body_background_fill="#0b0f19",
     body_text_color="#e2e8f0",
@@ -221,7 +223,6 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     block_label_text_color="#94a3b8",
 )) as demo:
     gr.HTML(custom_header)
-    gr.HTML(custom_suggestions)
     with gr.Tab("Text Model (Phi-3.5-mini)"):
         chatbot = gr.Chatbot(height=400)
@@ -238,8 +239,13 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
         submit_btn = gr.Button("Submit", variant="primary")
         clear_btn = gr.Button("Clear Chat", variant="secondary")
-        submit_btn.click(stream_text_chat, [msg, chatbot, system_prompt, temperature, max_new_tokens, top_p, top_k, use_tts], [chatbot, audio_output])
-        clear_btn.click(lambda: None, None, chatbot, queue=False)
     with gr.Tab("Vision Model (Phi-3.5-vision)"):
         with gr.Row():
@@ -250,9 +256,9 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
             with gr.Column(scale=1):
                 vision_output_text = gr.Textbox(label="AI Analysis", lines=10)
-        vision_submit_btn.click(process_vision_query, [vision_input_img, vision_text_input], [vision_output_text])
     gr.HTML("<footer>Powered by Phi 3.5 Multimodal AI</footer>")
 if __name__ == "__main__":
-    demo.launch()

 # Helper functions
 @spaces.GPU(timeout=300)
+def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_tokens=1024, top_p=1.0, top_k=20, use_tts=True):
     try:
         conversation = [{"role": "system", "content": system_prompt}]
         for prompt, answer in history:
         buffer = ""
         audio_buffer = np.array([])
         for new_text in streamer:
             buffer += new_text
             yield history + [[message, buffer]], (tts_model.config.sampling_rate, audio_buffer)
+        # Generate speech after text generation is complete
+        if use_tts:
+            audio_buffer = generate_speech_sync(buffer, tts_model, tts_tokenizer)
         # Final yield with complete text and audio
         yield history + [[message, buffer]], (tts_model.config.sampling_rate, audio_buffer)
         print(f"An error occurred: {str(e)}")
         yield history + [[message, f"An error occurred: {str(e)}"]], None
+def generate_speech_sync(text, tts_model, tts_tokenizer):  # Plain (non-async) TTS helper: turns `text` into audio and returns it as a NumPy array.
+    tts_input_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(device)  # token ids of the text to speak; `device` is defined outside this view
+    tts_description = "A clear and natural voice reads the text with moderate speed and expression."
+    tts_description_ids = tts_tokenizer(tts_description, return_tensors="pt").input_ids.to(device)  # token ids of the fixed voice description above
+    with torch.no_grad():  # generation only, so gradient tracking is switched off
+        audio_generation = tts_model.generate(input_ids=tts_description_ids, prompt_input_ids=tts_input_ids)  # description is passed as input_ids, the text as prompt_input_ids
+    return audio_generation.cpu().numpy().squeeze()  # move to CPU, convert to NumPy, drop size-1 dimensions
 @spaces.GPU(timeout=300)  # Increase timeout to 5 minutes
 def process_vision_query(image, text_input):
     try:
 </div>
 """
+# Gradio interface
 with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     body_background_fill="#0b0f19",
     body_text_color="#e2e8f0",
     block_label_text_color="#94a3b8",
 )) as demo:
     gr.HTML(custom_header)
     with gr.Tab("Text Model (Phi-3.5-mini)"):
         chatbot = gr.Chatbot(height=400)
         submit_btn = gr.Button("Submit", variant="primary")
         clear_btn = gr.Button("Clear Chat", variant="secondary")
+        def clear_chat():  # zero-argument callback; registered on clear_btn.click with outputs=chatbot
+            return None  # NOTE(review): presumably Gradio treats None as an empty chat history — confirm
+        submit_btn.click(stream_text_chat,
+                         inputs=[msg, chatbot, system_prompt, temperature, max_new_tokens, top_p, top_k, use_tts],
+                         outputs=[chatbot, audio_output])
+        clear_btn.click(clear_chat, outputs=chatbot)
     with gr.Tab("Vision Model (Phi-3.5-vision)"):
         with gr.Row():
             with gr.Column(scale=1):
                 vision_output_text = gr.Textbox(label="AI Analysis", lines=10)
+        vision_submit_btn.click(process_vision_query, inputs=[vision_input_img, vision_text_input], outputs=vision_output_text)
     gr.HTML("<footer>Powered by Phi 3.5 Multimodal AI</footer>")
 if __name__ == "__main__":
+    demo.launch(share=True)