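"""Streamlit demo: answer a question about an uploaded image with a VQA model
(microsoft/git-base-vqav2), then optionally read the answer aloud with Bark
text-to-speech (suno/bark). Launch with `streamlit run <this file>`."""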
import os

# Runtime installs keep the hosted demo self-contained; declaring torch and
# transformers in requirements.txt is the more conventional alternative.
os.system('pip install torch')
os.system('pip install transformers')

from PIL import Image
import io
import streamlit as st
from transformers import pipeline


@st.cache_resource
def load_pipelines():
    """Load both models once and reuse them across Streamlit reruns."""
    vqa = pipeline("visual-question-answering", model="microsoft/git-base-vqav2")
    tts = pipeline("text-to-speech", model="suno/bark")
    return vqa, tts


vqa_pipeline, tts_pipeline = load_pipelines()

def main():
    st.title("Visual Question Answering & Text-to-Audio App")

    uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png"])
    question = st.text_input("Enter your question")

    if uploaded_file and question:
        image = Image.open(io.BytesIO(uploaded_file.getvalue()))

        # Answer the question about the uploaded image.
        vqa_result = vqa_pipeline({"image": image, "question": question})
        answer = vqa_result[0]["answer"]
        st.write(f"Answer: {answer}")

        if st.button("Convert Answer to Audio"):
            # Bark returns the raw waveform and its sampling rate; st.audio
            # needs the sampling rate to play a NumPy array correctly.
            tts_result = tts_pipeline(answer)
            st.audio(tts_result["audio"], sample_rate=tts_result["sampling_rate"])

if __name__ == "__main__":
    main()