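"""Gradio chat demo for the Navyabhat/Llava-Phi2 multimodal model.

Accepts text, image, and audio input and generates replies through the
MultiModalPhi2 inference wrapper.
"""
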
import gradio as gr
from PIL import Image
from inference.main import MultiModalPhi2

messages = []  # placeholder for chat history (currently unused)

# Load the multimodal model once at startup; CPU inference keeps the demo
# runnable on hardware without a GPU, at the cost of generation speed.
multimodal_phi2 = MultiModalPhi2(
    modelname_or_path="Navyabhat/Llava-Phi2",
    temperature=0.2,
    max_new_tokens=1024,
    device="cpu",
)
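
# A minimal sketch of calling the wrapper directly, mirroring how run() below
# invokes it (text, audio path, PIL image; any argument may be None). The
# file name is hypothetical:
#     reply = multimodal_phi2("Describe this image", None, Image.open("cat.png"))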

def add_content(chatbot, text, image, audio_upload, audio_mic) -> gr.Chatbot:
    """Append the user's text/image/audio inputs to the chat history."""
    textflag, imageflag, audioflag = False, False, False
    if text not in ["", None]:
        chatbot.append((text, None))
        textflag = True
    if image is not None:
        chatbot.append(((image,), None))
        imageflag = True
    # Prefer the microphone recording; fall back to an uploaded file.
    if audio_mic is not None:
        chatbot.append(((audio_mic,), None))
        audioflag = True
    elif audio_upload is not None:
        chatbot.append(((audio_upload,), None))
        audioflag = True
    if not any([textflag, imageflag, audioflag]):
        # Reject the submission if no input of any kind was provided.
        raise gr.Error("Provide text, an image, or audio input")
    return chatbot
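
# Chatbot history entries are (user, bot) pairs: plain strings render as text,
# while one-element tuples like (filepath,) render as inline media, which is
# why images and audio are appended as ((image,), None) above.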

def clear_data():
    """Reset every input widget and empty the chat history."""
    # Returning a dict keyed by components updates each listed output.
    return {prompt: None, image: None, audio_upload: None, audio_mic: None, chatbot: []}

def run(history, text, image, audio_upload, audio_mic):
    """Run multimodal inference and append the model's reply to the history."""
    if text in [None, ""]:
        text = None
    # Prefer the uploaded file over the microphone recording.
    if audio_upload is not None:
        audio = audio_upload
    elif audio_mic is not None:
        audio = audio_mic
    else:
        audio = None

    # Debug logging of the resolved inputs.
    print("text", text)
    print("image", image)
    print("audio", audio)

    if image is not None:
        image = Image.open(image)

    outputs = multimodal_phi2(text, audio, image)
    history.append((None, outputs))
    # Clear every input widget after the model responds.
    return history, None, None, None, None

# Custom styling: CSS-like dicts passed to components via the legacy style=
# kwarg (Gradio 3.x); newer Gradio releases no longer accept this parameter.
interface_style = {
    "box": {
        "backgroundColor": "#f9f9f9",
        "padding": "20px",
        "borderRadius": "10px",
        "boxShadow": "0 0 10px rgba(0, 0, 0, 0.1)",
    },
    "button": {
        "backgroundColor": "#4caf50",
        "color": "#fff",
        "padding": "10px",
        "border": "none",
        "borderRadius": "5px",
        "cursor": "pointer",
    },
    "textbox": {
        "width": "100%",
        "padding": "10px",
        "marginBottom": "10px",
        "boxSizing": "border-box",
    },
    "image": {
        "width": "100%",
        "marginBottom": "10px",
    },
    "audio": {
        "width": "100%",
        "marginBottom": "10px",
    },
    "chatbox": {
        "height": "550px",
        "backgroundColor": "#f0f0f0",
        "borderRadius": "5px",
        "padding": "10px",
        "overflowY": "auto",
    },
}
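
# Note: in current Gradio the supported route for custom styling is CSS on the
# Blocks container rather than per-component style dicts, e.g. (a sketch):
#     demo = gr.Blocks(css=".gradio-container {background-color: #f9f9f9}")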

# Gradio 3.x layout: gr.Box and Audio(source=...) are 3.x APIs (removed in 4.x).
with gr.Blocks() as demo:
    gr.Markdown("## MultiModal Phi2 Model Pretraining and Finetuning from Scratch")
    with gr.Row():
        # Left column: text, image, and audio inputs.
        with gr.Column(scale=4):
            with gr.Box(style=interface_style["box"]):
                with gr.Row():
                    prompt = gr.Textbox(
                        placeholder="Enter Prompt",
                        lines=2,
                        label="Query",
                        value=None,
                        style=interface_style["textbox"],
                    )
                with gr.Row():
                    image = gr.Image(
                        type="filepath", value=None, style=interface_style["image"]
                    )
                with gr.Row():
                    audio_upload = gr.Audio(
                        source="upload", type="filepath", style=interface_style["audio"]
                    )
                    audio_mic = gr.Audio(
                        source="microphone",
                        type="filepath",
                        format="mp3",
                        style=interface_style["audio"],
                    )

        # Right column: chat history plus the submit/clear controls.
        with gr.Column(scale=8):
            with gr.Box(style=interface_style["box"]):
                with gr.Row():
                    chatbot = gr.Chatbot(
                        avatar_images=("🧑", "🤖"),
                        height=550,
                        style=interface_style["chatbox"],
                    )
                with gr.Row():
                    submit = gr.Button(value="Submit", style=interface_style["button"])
                    clear = gr.Button(value="Clear", style=interface_style["button"])

    # First echo the user's inputs into the chat, then (on success) run inference.
    submit.click(
        add_content,
        inputs=[chatbot, prompt, image, audio_upload, audio_mic],
        outputs=[chatbot],
    ).success(
        run,
        inputs=[chatbot, prompt, image, audio_upload, audio_mic],
        outputs=[chatbot, prompt, image, audio_upload, audio_mic],
    )
    clear.click(
        clear_data,
        outputs=[prompt, image, audio_upload, audio_mic, chatbot],
    )

demo.launch()
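
# Local usage: run `python app.py` and open the printed URL (Gradio defaults to
# http://127.0.0.1:7860); on Hugging Face Spaces this file is the entry point.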