Update app.py
app.py CHANGED
@@ -126,15 +126,11 @@ def inference_chat(input_image,input_text):
     gpt3_out1=gpt3_short(input_text,out,cap)
     return out[0][0], gpt3_out,gpt3_out1
 title = """# VQA with VLE and LLM"""
-description = """
+description = """**VLE** (Visual-Language Encoder) is an image-text multimodal understanding model built on the pre-trained text and image encoders. See https://github.com/iflytek/VLE for more details.
+We demonstrate visual question answering systems built with VLE and LLM."""
+description1 = """**VQA**: The image and the question are fed to a VQA model (VLEForVQA) and the model predicts the answer.

-
-
-* VQA + LLM (short answer): The captioning model generates a caption of the image. We feed the caption, the question, and the answer candidates predicted by the VQA model to the LLM, and ask the LLM to select the most reasonable answer from the candidates.
-
-* VQA + LLM (long answer): The pipeline is the same as VQA + LLM (short answer), except that the answer is freely generated by the LLM and not limited to VQA candidates.
-
-For more details about VLE and the VQA pipeline, see [http://vle.hfl-rc.com](http://vle.hfl-rc.com)"""
+**VQA+LLM**: We feed the caption, question, and answers predicted by the VQA model to the LLM and ask the LLM to generate the final answer. The outputs from VQA+LLM may vary due to the decoding strategy of the LLM."""

 with gr.Blocks(
     css="""
@@ -145,7 +141,7 @@ with gr.Blocks(
     state = gr.State([])
     #caption_output = None
     gr.Markdown(title)
-
+    gr.Markdown(description)
     #gr.Markdown(article)

     with gr.Row():
@@ -168,11 +164,11 @@ with gr.Blocks(
        )
        '''
        with gr.Column():
-
+            gr.Markdown(description1)
            caption_output = gr.Textbox(lines=0, label="* VQA + LLM (short answer):\n The captioning model generates a caption \n of the image. We feed the caption")
            caption_output_v1 = gr.Textbox(lines=0, label="VQA + LLM (short answer)")
            gpt3_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (long answer)")
-
+


    # image_input.change(
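For context on what the new description strings summarize: the VQA model (VLEForVQA) proposes candidate answers, and the LLM either selects among them (short answer) or answers freely (long answer) given the image caption and the question. The sketch below illustrates how such prompts could be assembled before calling the LLM; the helper names and prompt wording are assumptions for illustration only, not the Space's actual gpt3_short/gpt3_long code, which lies outside this diff.

# Hypothetical sketch of the VQA+LLM prompting described above; function names
# and prompt wording are assumptions, not code from this commit.
def build_short_answer_prompt(question, candidates, caption):
    # Short answer: ask the LLM to pick the most reasonable VQA candidate.
    return (
        f"Image caption: {caption}\n"
        f"Question: {question}\n"
        f"Candidate answers: {', '.join(candidates)}\n"
        "Select the most reasonable answer from the candidates."
    )

def build_long_answer_prompt(question, candidates, caption):
    # Long answer: the LLM answers freely; VQA candidates serve only as hints.
    return (
        f"Image caption: {caption}\n"
        f"Question: {question}\n"
        f"Hints from the VQA model: {', '.join(candidates)}\n"
        "Answer the question in a complete sentence."
    )

Because the long-answer prompt leaves generation unconstrained, its output can vary from run to run depending on the LLM's decoding strategy, which is what the added description text notes.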