Update app.py
app.py CHANGED
@@ -126,15 +126,11 @@ def inference_chat(input_image,input_text):
     gpt3_out1=gpt3_short(input_text,out,cap)
     return out[0][0], gpt3_out,gpt3_out1
 title = """# VQA with VLE and LLM"""
-description = """
+description = """**VLE** (Visual-Language Encoder) is an image-text multimodal understanding model built on the pre-trained text and image encoders. See https://github.com/iflytek/VLE for more details.
+We demonstrate visual question answering systems built with VLE and LLM."""
+description1 = """**VQA**: The image and the question are fed to a VQA model (VLEForVQA) and the model predicts the answer.

-
-
-* VQA + LLM (short answer): The captioning model generates a caption of the image. We feed the caption, the question, and the answer candidates predicted by the VQA model to the LLM, and ask the LLM to select the most reasonable answer from the candidates.
-
-* VQA + LLM (long answer): The pipeline is the same as VQA + LLM (short answer), except that the answer is freely generated by the LLM and not limited to VQA candidates.
-
-For more details about VLE and the VQA pipeline, see [http://vle.hfl-rc.com](http://vle.hfl-rc.com)"""
+**VQA+LLM**: We feed the caption, question, and answers predicted by the VQA model to the LLM and ask the LLM to generate the final answer. The outputs from VQA+LLM may vary due to the decoding strategy of the LLM."""

 with gr.Blocks(
     css="""
@@ -145,7 +141,7 @@ with gr.Blocks(
     state = gr.State([])
     #caption_output = None
     gr.Markdown(title)
-
+    gr.Markdown(description)
     #gr.Markdown(article)

     with gr.Row():
@@ -168,11 +164,11 @@ with gr.Blocks(
        )
        '''
        with gr.Column():
-
+            gr.Markdown(description1)
            caption_output = gr.Textbox(lines=0, label="* VQA + LLM (short answer):\n The captioning model generates a caption \n of the image. We feed the caption")
            caption_output_v1 = gr.Textbox(lines=0, label="VQA + LLM (short answer)")
            gpt3_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (long answer)")
-
+


    # image_input.change(
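For context on what the new description strings summarize: the VQA model (VLEForVQA) proposes candidate answers, and the LLM either selects among them (short answer) or answers freely (long answer) given the image caption and the question. The sketch below illustrates how such prompts could be assembled before calling the LLM; the helper names and prompt wording are assumptions for illustration only, not the Space's actual gpt3_short/gpt3_long code, which lies outside this diff.

# Hypothetical sketch of the VQA+LLM prompting described above; function names
# and prompt wording are assumptions, not code from this commit.
def build_short_answer_prompt(question, candidates, caption):
    # Short answer: ask the LLM to pick the most reasonable VQA candidate.
    return (
        f"Image caption: {caption}\n"
        f"Question: {question}\n"
        f"Candidate answers: {', '.join(candidates)}\n"
        "Select the most reasonable answer from the candidates."
    )

def build_long_answer_prompt(question, candidates, caption):
    # Long answer: the LLM answers freely; VQA candidates serve only as hints.
    return (
        f"Image caption: {caption}\n"
        f"Question: {question}\n"
        f"Hints from the VQA model: {', '.join(candidates)}\n"
        "Answer the question in a complete sentence."
    )

Because the long-answer prompt leaves generation unconstrained, its output can vary from run to run depending on the LLM's decoding strategy, which is what the added description text notes.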