Steven-GU-Yu-Di committed on
Commit 8e585c6
1 Parent(s): 27133d4

Update app.py

Files changed (1)
  1. app.py +27 -35
app.py CHANGED
@@ -1,50 +1,42 @@
+!pip install streamlit transformers gtts
+
 import streamlit as st
-from transformers import pipeline, Text2SpeechPipeline, VisualQAProcessor
+from transformers import pipeline
 from PIL import Image
+from gtts import gTTS
+import os
 
-# Load the text classification model
-classifier = pipeline("text-classification")
 # Load the Visual Question Answering (VQA) model
-vqa_model = VisualQAProcessor.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-# Load the Text-to-Speech model
-tts_model = Text2SpeechPipeline("facebook/wav2vec2-base-960h")
+vqa_model = pipeline("question-answering")
 
 # Create a Streamlit app
-st.title("Image, Text, and Speech Classification")
+st.title("Visual Question Answering and Text-to-Speech")
 
 # Sidebar for user inputs
-st.sidebar.title("Input")
-uploaded_image = st.sidebar.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
-text_input = st.sidebar.text_input("Enter Text Description")
-question_input = st.sidebar.text_input("Enter Question for Image")
-
-# Function to classify image and text
-def classify(image, text, question):
-    if image is not None and text:
+uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
+question_input = st.text_input("Enter Question")
+
+# Function to perform Visual Question Answering
+def perform_vqa(image, question):
+    if image is not None and question:
         image = Image.open(image)
         st.image(image, caption="Uploaded Image", use_column_width=True)
-        st.write("Text Description:", text)
-        st.write("Question for Image:", question)
-
-        # Text classification
-        text_result = classifier(text)
-        st.write("Text Classification Result:")
-        st.write(text_result)
+        st.write("Question:", question)
 
         # Visual Question Answering
        vqa_input = {
             "question": question,
-            "context": text_result[0]['label'],
+            "context": "This is an image.",
         }
-        vqa_output = vqa_model(vqa_input)
-        st.write("Visual Question Answering Result:")
-        st.write(vqa_output)
-
-        # Text-to-Speech
-        tts_input = vqa_output['answer']
-        tts_output = tts_model(tts_input)
-        st.audio(tts_output[0]['audio'], format='audio/wav')
-
-# Button to trigger classification
-if st.sidebar.button("Classify"):
-    classify(uploaded_image, text_input, question_input)
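
A packaging note on the first added line: `!pip install streamlit transformers gtts` is IPython/notebook shell syntax, not Python, so app.py as committed fails with a SyntaxError before anything runs (the `import os` line is also unused and can be dropped). On Hugging Face Spaces the conventional fix is to list dependencies in a requirements.txt next to app.py, roughly:

```text
streamlit
transformers
torch    # backend for the transformers pipeline (assumption: PyTorch)
gtts
Pillow
```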
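
Separately, even with the install line removed, the new code still cannot answer questions about the image: `pipeline("question-answering")` is transformers' extractive *text* QA pipeline, so any answer would be a substring of the hard-coded context "This is an image.", and the `image=` keyword it is called with is not part of that pipeline's API. A minimal working sketch, assuming the dedicated `"visual-question-answering"` pipeline task (ViLT checkpoint `dandelin/vilt-b32-finetuned-vqa` by default) and Streamlit's `st.cache_resource` are available:

```python
import streamlit as st
from transformers import pipeline
from PIL import Image
from gtts import gTTS

@st.cache_resource  # cache the model across Streamlit reruns
def load_vqa_model():
    # Assumption: the default VQA checkpoint is acceptable;
    # pass model="dandelin/vilt-b32-finetuned-vqa" to pin it explicitly.
    return pipeline("visual-question-answering")

vqa_model = load_vqa_model()

st.title("Visual Question Answering and Text-to-Speech")

uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
question_input = st.text_input("Enter Question")

def perform_vqa(image_file, question):
    if image_file is not None and question:
        image = Image.open(image_file).convert("RGB")  # ViLT expects RGB
        st.image(image, caption="Uploaded Image", use_column_width=True)
        st.write("Question:", question)

        # The VQA pipeline takes the image and the question directly;
        # no text "context" is needed. It returns a ranked list of answers.
        vqa_output = vqa_model(image=image, question=question)
        answer = vqa_output[0]["answer"]
        st.write("Answer:", answer)

        # Text-to-Speech with gTTS, as in the committed version
        tts = gTTS(answer)
        tts.save("output.mp3")
        st.audio("output.mp3", format="audio/mp3")

if st.button("Perform VQA and TTS"):
    perform_vqa(uploaded_image, question_input)
```

The gTTS step is kept exactly as committed; note that it calls Google's TTS endpoint at runtime, so the app needs outbound network access, and output.mp3 is simply overwritten on each request.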