import string
import gradio as gr
import requests
import torch
from models.VLE import VLEForVQA, VLEProcessor, VLEForVQAPipeline
from PIL import Image
# Load the VLE VQA model; vqa_pipeline is required by vle() below.
model_name = "hfl/vle-base-for-vqa"
model = VLEForVQA.from_pretrained(model_name)
vle_processor = VLEProcessor.from_pretrained(model_name)
vqa_pipeline = VLEForVQAPipeline(model=model, device='cpu', vle_processor=vle_processor)
from transformers import BlipForQuestionAnswering, BlipProcessor, BlipForConditionalGeneration

# Alternative BLIP-based VQA path, kept for reference but unused by the active pipeline.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
# model_vqa = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large").to(device)

# Load the BLIP captioning model; cap_processor and cap_model are required by caption() below.
cap_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
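# caption(): generate a natural-language caption for the input image with the BLIP captioning
# model; the caption is later given to the LLM as a textual description of the image.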
def caption(input_image):
    inputs = cap_processor(input_image, return_tensors="pt")
    # inputs["num_beams"] = 1
    # inputs['num_return_sequences'] = 1
    out = cap_model.generate(**inputs)
    return "\n".join(cap_processor.batch_decode(out, skip_special_tokens=True))
import openai
import os
openai.api_key= os.getenv('openai_appkey')
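# gpt3_short(): ask the LLM to pick one of the four VQA candidates. The prompt contains the
# image caption, the question, and the candidates labelled A-D with their confidence scores;
# the letter the LLM returns is mapped back to the corresponding candidate answer.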
def gpt3_short(question, vqa_answer, caption):
    vqa_answer, vqa_score = vqa_answer
    prompt = "This is the caption of a picture: " + caption + ". Question: " + question + \
        " VQA model predicts: A: " + vqa_answer[0] + ", score: " + str(vqa_score[0]) + \
        "; B: " + vqa_answer[1] + ", score: " + str(vqa_score[1]) + \
        "; C: " + vqa_answer[2] + ", score: " + str(vqa_score[2]) + \
        "; D: " + vqa_answer[3] + ", score: " + str(vqa_score[3]) + \
        ". Choose A if it is not in conflict with the description of the picture and A's score is bigger than 0.8; otherwise choose B, C or D based on the description."
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=10,
        n=1,
        stop=None,
        temperature=0.7,
    )
    answer = response.choices[0].text.strip()
    # Extract the letter the LLM chose; fall back to A if no valid letter is found.
    llm_ans = answer
    choice = {"A", "B", "C", "D"}
    llm_ans = llm_ans.replace("\n", " ").replace(":", " ").replace(".", " ").replace(",", " ")
    for cho in llm_ans.split(" "):
        if cho in choice:
            llm_ans = cho
            break
    if llm_ans not in choice:
        llm_ans = "A"
    # Map the chosen letter back to the corresponding VQA candidate.
    answer = vqa_answer[ord(llm_ans) - ord("A")]
    return answer
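# gpt3_long(): same inputs as gpt3_short(), but the LLM is asked to answer freely with a full
# sentence instead of being restricted to the VQA candidates.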
def gpt3_long(question, vqa_answer, caption):
    vqa_answer, vqa_score = vqa_answer
    prompt = "This is the caption of a picture: " + caption + ". Question: " + question + \
        " VQA model predicts: " + vqa_answer[0] + ", score: " + str(vqa_score[0]) + \
        "; " + vqa_answer[1] + ", score: " + str(vqa_score[1]) + \
        "; " + vqa_answer[2] + ", score: " + str(vqa_score[2]) + \
        "; " + vqa_answer[3] + ", score: " + str(vqa_score[3]) + \
        ". Question: " + question + " Tell me the right answer with a sentence."
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=30,
        n=1,
        stop=None,
        temperature=0.7,
    )
    answer = response.choices[0].text.strip()
    return answer
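# gpt3(): minimal caption + question + answer prompt; only referenced by the commented-out
# "Submit_GPT3" button further below.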
def gpt3(question, vqa_answer, caption):
    prompt = caption + "\n" + question + "\n" + vqa_answer + "\n Tell me the right answer."
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=30,
        n=1,
        stop=None,
        temperature=0.7,
    )
    answer = response.choices[0].text.strip()
    # return "input_text:\n" + prompt + "\n\n output_answer:\n" + answer
    return answer
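# vle(): run the VLE VQA pipeline and return the top-4 candidate answers together with their scores.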
def vle(input_image, input_text):
    vqa_answers = vqa_pipeline({"image": input_image, "question": input_text}, top_k=4)
    return [vqa['answer'] for vqa in vqa_answers], [vqa['score'] for vqa in vqa_answers]
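# inference_chat(): full pipeline for one request - caption the image, get the top-4 VQA
# candidates from VLE, then produce the long (free-form) and short (candidate-restricted)
# LLM answers. Returns (top VQA answer, long answer, short answer).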
def inference_chat(input_image, input_text):
    cap = caption(input_image)
    print(cap)
    # BLIP-based VQA alternative (unused):
    # inputs = processor(images=input_image, text=input_text, return_tensors="pt")
    # inputs["max_length"] = 10
    # inputs["num_beams"] = 5
    # inputs['num_return_sequences'] = 4
    # out = model_vqa.generate(**inputs)
    # out = processor.batch_decode(out, skip_special_tokens=True)
    out = vle(input_image, input_text)
    gpt3_out = gpt3_long(input_text, out, cap)
    gpt3_out1 = gpt3_short(input_text, out, cap)
    return out[0][0], gpt3_out, gpt3_out1
title = """# VQA with VLE and LLM"""
description = """We demonstrate three visual question answering systems built with VLE and LLM:

* VQA: The image and the question are fed to the VQA model (VLE), and the top answer is returned.

* VQA + LLM (short answer): The captioning model generates a caption of the image. We feed the caption, the question, and the answer candidates predicted by the VQA model to the LLM, and ask the LLM to select the most reasonable answer from the candidates.

* VQA + LLM (long answer): The pipeline is the same as VQA + LLM (short answer), except that the answer is freely generated by the LLM and not limited to the VQA candidates.

For more details about VLE and the VQA pipeline, see [http://vle.hfl-rc.com](http://vle.hfl-rc.com)"""
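# Gradio UI: image and question inputs on the left; three read-only output boxes on the right
# (VQA, VQA + LLM short answer, VQA + LLM long answer). Pressing Enter in the question box or
# clicking "Submit" runs inference_chat(); "Clear" resets the inputs and outputs.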
with gr.Blocks(
    css="""
    .message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
    #component-21 > div.wrap.svelte-w6rprc {height: 600px;}
    """
) as iface:
    state = gr.State([])
    # caption_output = None
    gr.Markdown(title)
    # gr.Markdown(article)

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="VQA Image Input")
            with gr.Row():
                with gr.Column(scale=1):
                    chat_input = gr.Textbox(lines=1, label="VQA Question Input")
                    with gr.Row():
                        clear_button = gr.Button(value="Clear", interactive=True, width=30)
                        submit_button = gr.Button(
                            value="Submit", interactive=True, variant="primary"
                        )
                    '''
                    cap_submit_button = gr.Button(
                        value="Submit_CAP", interactive=True, variant="primary"
                    )
                    gpt3_submit_button = gr.Button(
                        value="Submit_GPT3", interactive=True, variant="primary"
                    )
                    '''
        with gr.Column():
            caption_output = gr.Textbox(lines=0, label="VQA")
            caption_output_v1 = gr.Textbox(lines=0, label="VQA + LLM (short answer)")
            gpt3_output_v1 = gr.Textbox(lines=0, label="VQA + LLM (long answer)")

    gr.Markdown(description)
    # image_input.change(
    #     lambda: ("", [], "", "", ""),
    #     [],
    #     [caption_output, state, caption_output, gpt3_output_v1, caption_output_v1],
    #     queue=False,
    # )
    chat_input.submit(
        inference_chat,
        [
            image_input,
            chat_input,
        ],
        [caption_output, gpt3_output_v1, caption_output_v1],
    )
    clear_button.click(
        lambda: ("", [], "", "", ""),
        [],
        [chat_input, state, caption_output, gpt3_output_v1, caption_output_v1],
        queue=False,
    )
    submit_button.click(
        inference_chat,
        [
            image_input,
            chat_input,
        ],
        [caption_output, gpt3_output_v1, caption_output_v1],
    )
    '''
    cap_submit_button.click(
        caption,
        [
            image_input,
        ],
        [caption_output_v1],
    )
    gpt3_submit_button.click(
        gpt3,
        [
            chat_input,
            caption_output,
            caption_output_v1,
        ],
        [gpt3_output_v1],
    )
    '''
    examples = [
        ['bird.jpeg', "How many birds are there in the tree?", "2", "2", "2"],
        ['qa9.jpg', "What type of vehicle is being pulled by the horses ?", 'carriage', 'sled', 'Sled'],
        ['upload4.jpg', "What is this old man doing?", "fishing", "fishing", "Fishing"],
    ]
    examples = gr.Examples(
        examples=examples,
        inputs=[image_input, chat_input, caption_output, caption_output_v1, gpt3_output_v1],
    )
iface.queue(concurrency_count=1, api_open=False, max_size=10)
iface.launch(enable_queue=True)