Commit eef074b, committed by xxx1
Parent(s): 1a9d365

Update app.py

Files changed (1): app.py (+7, -11)

app.py CHANGED
@@ -126,15 +126,11 @@ def inference_chat(input_image,input_text):
     gpt3_out1=gpt3_short(input_text,out,cap)
     return out[0][0], gpt3_out,gpt3_out1
 title = """# VQA with VLE and LLM"""
-description = """We demonstrate three visual question answering systems built with VLE and LLM:
+description = """**VLE** (Visual-Language Encoder) is an image-text multimodal understanding model built on the pre-trained text and image encoders. See https://github.com/iflytek/VLE for more details.
+We demonstrate visual question answering systems built with VLE and LLM."""
+description1 = """**VQA**: The image and the question are fed to a VQA model (VLEForVQA) and the model predicts the answer.
 
-
-
-* VQA + LLM (short answer): The captioning model generates a caption of the image. We feed the caption, the question, and the answer candidates predicted by the VQA model to the LLM, and ask the LLM to select the most reasonable answer from the candidates.
-
-* VQA + LLM (long answer): The pipeline is the same as VQA + LLM (short answer), except that the answer is freely generated by the LLM and not limited to VQA candidates.
-
-For more details about VLE and the VQA pipeline, see [http://vle.hfl-rc.com](http://vle.hfl-rc.com)"""
+**VQA+LLM**: We feed the caption, question, and answers predicted by the VQA model to the LLM and ask the LLM to generate the final answer. The outputs from VQA+LLM may vary due to the decoding strategy of the LLM."""
 
 with gr.Blocks(
     css="""
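The description strings changed in this hunk summarize the VQA + LLM pipeline: the image caption, the question, and the candidate answers from the VQA model are passed to the LLM, which either picks a candidate (short answer) or generates freely (long answer). Below is a minimal sketch of how such a short-answer prompt might be assembled; the function name and prompt wording are illustrative assumptions, not the gpt3_short implementation in app.py.

```python
# Illustrative sketch only: assembling an LLM prompt from the caption, the
# question, and the VQA candidates, as described in the text above.
# The function name and prompt wording are assumptions, not app.py's code.
def build_short_answer_prompt(question, candidates, caption):
    options = ", ".join(candidates)
    return (
        f"Image caption: {caption}\n"
        f"Question: {question}\n"
        f"Candidate answers: {options}\n"
        "Answer with the single most reasonable candidate, verbatim."
    )

# Example with made-up values:
print(build_short_answer_prompt("What is the man holding?",
                                ["umbrella", "bat", "cane"],
                                "a man walking in the rain"))
```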
@@ -145,7 +141,7 @@ with gr.Blocks(
     state = gr.State([])
     #caption_output = None
     gr.Markdown(title)
-
+    gr.Markdown(description)
     #gr.Markdown(article)
 
     with gr.Row():
@@ -168,11 +164,11 @@ with gr.Blocks(
             )
             '''
         with gr.Column():
-
+            gr.Markdown(description1)
             caption_output = gr.Textbox(lines=0, label="* VQA + LLM (short answer):\n The captioning model generates a caption \n of the image. We feed the caption")
             caption_output_v1 = gr.Textbox(lines=0, label="VQA + LLM (short answer)")
             gpt3_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (long answer)")
-            gr.Markdown(description)
+
 
 
     # image_input.change(
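Taken together, the hunks move the explanatory text into two Markdown blocks: description directly under the title, and description1 at the top of the output column. Below is a stripped-down, self-contained sketch of that layout plus one plausible way the outputs could be wired; the input column, the Button, the stub inference_chat, and the output ordering are assumptions for illustration, not the actual app.py, whose last context line hints it wires events through image_input.change(...) instead.

```python
import gradio as gr

# Stub standing in for the real inference_chat defined earlier in app.py,
# which returns the VQA answer plus the two LLM-refined answers.
def inference_chat(image, question):
    return "vqa answer", "long llm answer", "short llm answer"

title = """# VQA with VLE and LLM"""
description = ("**VLE** is an image-text multimodal understanding model. "
               "We demonstrate visual question answering systems built with VLE and LLM.")
description1 = ("**VQA**: direct answer from the VQA model (VLEForVQA).\n\n"
                "**VQA+LLM**: answer generated by the LLM from the caption, "
                "question, and VQA candidates.")

with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)           # added by this commit, right under the title
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil")
            text_input = gr.Textbox(label="Question")
            submit = gr.Button("Submit")
        with gr.Column():
            gr.Markdown(description1)  # added by this commit, heads the output column
            caption_output = gr.Textbox(label="VQA")
            caption_output_v1 = gr.Textbox(label="VQA + LLM (short answer)")
            gpt3_output_v1 = gr.Textbox(label="VQA+LLM (long answer)")

    # Hypothetical wiring: route the three return values to the three textboxes.
    submit.click(fn=inference_chat,
                 inputs=[image_input, text_input],
                 outputs=[caption_output, gpt3_output_v1, caption_output_v1])

# demo.launch()
```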