Steven-GU-Yu-Di's picture
Update app.py
16ac1d6 verified
raw
history blame
990 Bytes
import os
# Install the runtime dependencies before the imports below need them.
# ``sys.executable -m pip`` targets the interpreter that is running this
# script; a bare ``pip`` found on PATH may belong to another environment.
import subprocess
import sys

for _package in ("torch", "transformers"):
    # check=False keeps the install best-effort, like the original
    # os.system calls: a failed install does not abort start-up here, and a
    # genuinely missing package still surfaces as ImportError further down.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", _package],
        check=False,
    )
from PIL import Image
import io
import streamlit as st
from transformers import pipeline
# Streamlit re-executes this script from top to bottom on every widget
# interaction. Wrapping the model loads in st.cache_resource builds each
# pipeline once per server process and reuses it on later reruns.
@st.cache_resource
def _load_vqa_pipeline():
    """Build the visual-question-answering pipeline (cached across reruns)."""
    return pipeline("visual-question-answering", model="microsoft/git-base-vqav2")


@st.cache_resource
def _load_tts_pipeline():
    """Build the text-to-speech pipeline (cached across reruns)."""
    return pipeline("text-to-speech", "suno/bark")


# Module-level names kept so the rest of the file can use them unchanged.
vqa_pipeline = _load_vqa_pipeline()
tts_pipeline = _load_tts_pipeline()
def main():
    """Render the visual-question-answering and text-to-audio page.

    Shows an image uploader and a question box. Once both are provided, the
    question is answered with ``vqa_pipeline`` and the answer is displayed.
    Pressing the button synthesizes the answer with ``tts_pipeline`` and
    plays the result in the page.
    """
    st.title("Visual Question Answering & Text-to-Audio App")
    uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png"])
    question = st.text_input("Enter your question")

    # Nothing to do until the user has supplied both inputs.
    if not (uploaded_file and question):
        return

    # Kept under its own name rather than rebinding the upload handle.
    image = Image.open(io.BytesIO(uploaded_file.getvalue()))
    vqa_result = vqa_pipeline({"image": image, "question": question})
    answer = vqa_result[0]['answer']
    st.write(f"Answer: {answer}")

    if st.button("Convert Answer to Audio"):
        tts_result = tts_pipeline(answer)
        # The text-to-speech pipeline output is expected to be a dict holding
        # a raw waveform array under "audio" and its rate under
        # "sampling_rate" (per the transformers pipeline docs). st.audio
        # needs the sample rate to encode an array, and the encoded result is
        # not OGG, so the previous "audio/ogg" format hint is dropped.
        st.audio(tts_result['audio'], sample_rate=tts_result['sampling_rate'])
# Run the app only when this file is executed as the entry script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()