MinxuanQin committed on
Commit
c40a6be
1 Parent(s): 520d399

update img loading

Browse files
Files changed (2) hide show
  1. app.py +7 -4
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import numpy as np
2
  from PIL import Image
3
  from transformers import ViltConfig, ViltProcessor, ViltForQuestionAnswering
4
-
5
  import streamlit as st
6
 
7
  st.title("Live demo of multimodal vqa")
@@ -16,10 +16,13 @@ uploaded_file = st.file_uploader("Please upload one image (jpg)", type="jpg")
16
  question = st.text_input("Type here one question on the image")
17
  if uploaded_file is not None:
18
  file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
19
- img = Image.fromarray(file_bytes)
20
- # st.image(img, caption="Here is the uploaded image", use_column_width=True)
 
 
 
21
 
22
- encoding = processor(images=file_bytes, text=question, return_tensors="pt")
23
 
24
  outputs = model(**encoding)
25
  logits = outputs.logits
 
1
  import numpy as np
2
  from PIL import Image
3
  from transformers import ViltConfig, ViltProcessor, ViltForQuestionAnswering
4
+ import cv2
5
  import streamlit as st
6
 
7
  st.title("Live demo of multimodal vqa")
 
16
  question = st.text_input("Type here one question on the image")
17
  if uploaded_file is not None:
18
  file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
19
+ opencv_img = cv2.imdecode(file_bytes, 1)
20
+ image_cv2 = cv2.cvtColor(opencv_img, cv2.COLOR_BGR2RGB)
21
+ st.image(image_cv2, channels="RGB")
22
+
23
+ img = Image.fromarray(image_cv2)
24
 
25
+ encoding = processor(images=img, text=question, return_tensors="pt")
26
 
27
  outputs = model(**encoding)
28
  logits = outputs.logits
requirements.txt CHANGED
@@ -9,4 +9,5 @@ pandas
9
  pyarrow==10
10
  nltk
11
  torchsummary
12
- matplotlib
 
 
9
  pyarrow==10
10
  nltk
11
  torchsummary
12
+ matplotlib
13
+ opencv-python