"""Gradio demo: visual question answering with VLE, refined by an LLM.

Pipeline for one request:

1. A BLIP captioning model describes the image.
2. The VLE VQA pipeline proposes up to four candidate answers with scores.
3. An OpenAI completion model is asked twice, once for a sentence-long answer
   and once to pick one of the candidates, given caption, question and
   candidates.
"""

import functools
import os
import re
import string
from typing import List, Tuple

import gradio as gr
import openai
import requests
import torch
from PIL import Image
from transformers import (
    BlipForConditionalGeneration,
    BlipForQuestionAnswering,
    BlipProcessor,
)

from models.VLE import VLEForVQA, VLEForVQAPipeline, VLEProcessor

# (candidate answers, matching scores), as returned by `vle`.
VQAPrediction = Tuple[List[str], List[float]]

VQA_MODEL_NAME = "hfl/vle-base-for-vqa"
CAPTION_MODEL_NAME = "Salesforce/blip-image-captioning-large"

# NOTE(review): this is the legacy Completions endpoint of the pre-1.0
# `openai` client; confirm the engine is still served for your account.
COMPLETION_ENGINE = "text-davinci-003"

# Letters offered to the LLM in `gpt3_short`; also caps the number of options.
_CHOICE_LETTERS = "ABCD"
# Anything that is not a letter or digit separates tokens in the LLM reply.
_TOKEN_SPLIT_RE = re.compile(r"[^A-Za-z0-9]+")

openai.api_key = os.getenv("openai_appkey")

# Disabled alternative: BLIP VQA instead of VLE.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
# model_vqa = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large").to(device)


@functools.lru_cache(maxsize=1)
def _load_vqa_pipeline():
    """Build the VLE VQA pipeline on first use and reuse it afterwards."""
    model = VLEForVQA.from_pretrained(VQA_MODEL_NAME)
    vle_processor = VLEProcessor.from_pretrained(VQA_MODEL_NAME)
    return VLEForVQAPipeline(model=model, device="cpu", vle_processor=vle_processor)


@functools.lru_cache(maxsize=1)
def _load_captioner():
    """Load the BLIP captioning processor and model on first use."""
    cap_processor = BlipProcessor.from_pretrained(CAPTION_MODEL_NAME)
    cap_model = BlipForConditionalGeneration.from_pretrained(CAPTION_MODEL_NAME)
    return cap_processor, cap_model


def caption(input_image) -> str:
    """Return the generated caption(s) for *input_image*, one per line."""
    cap_processor, cap_model = _load_captioner()
    inputs = cap_processor(input_image, return_tensors="pt")
    # inputs["num_beams"] = 1
    # inputs['num_return_sequences'] =1
    out = cap_model.generate(**inputs)
    return "\n".join(cap_processor.batch_decode(out, skip_special_tokens=True))


def _complete(prompt: str, max_tokens: int) -> str:
    """Send *prompt* to the completion engine and return the stripped text."""
    response = openai.Completion.create(
        engine=COMPLETION_ENGINE,
        prompt=prompt,
        max_tokens=max_tokens,
        n=1,
        stop=None,
        temperature=0.7,
    )
    return response.choices[0].text.strip()


def _pick_choice_index(llm_text: str, n_options: int) -> int:
    """Return the index of the first option letter found in *llm_text*.

    Only stand-alone, upper-case letters among the first *n_options* letters
    of ``ABCD`` count. Falls back to 0 (option A) when none is found.
    """
    index_by_letter = {
        letter: idx for idx, letter in enumerate(_CHOICE_LETTERS[:n_options])
    }
    for token in _TOKEN_SPLIT_RE.split(llm_text):
        if token in index_by_letter:
            return index_by_letter[token]
    return 0


def gpt3_short(question: str, vqa_answer: VQAPrediction, caption: str) -> str:
    """Ask the LLM to pick one of the VQA candidates and return that candidate.

    Args:
        question: The user's question about the image.
        vqa_answer: ``(answers, scores)`` as returned by `vle`.
        caption: Caption of the image.

    Returns:
        The candidate answer text selected by the LLM; the top candidate when
        the reply names no option; ``""`` when there are no candidates.
    """
    answers, scores = vqa_answer
    # Pair answers with scores and cap at the number of available letters so
    # fewer (or more) than four candidates cannot cause an IndexError.
    candidates = list(zip(answers, scores))[: len(_CHOICE_LETTERS)]
    if not candidates:
        return ""

    options = "; ".join(
        f"{letter}: {answer}, score:{score!s}"
        for letter, (answer, score) in zip(_CHOICE_LETTERS, candidates)
    )
    prompt = (
        f"This is the caption of a picture: {caption}. Question: {question}"
        f" VQA model predicts:{options}"
        ". Choose A if it is not in conflict with the description of the picture"
        " and A's score is bigger than 0.8; otherwise choose the B, C or D based"
        " on the description."
    )
    reply = _complete(prompt, max_tokens=10)
    chosen = _pick_choice_index(reply, len(candidates))
    return candidates[chosen][0]


def gpt3_long(question: str, vqa_answer: VQAPrediction, caption: str) -> str:
    """Ask the LLM for a sentence-long answer given caption and VQA candidates.

    Args:
        question: The user's question about the image.
        vqa_answer: ``(answers, scores)`` as returned by `vle`.
        caption: Caption of the image.

    Returns:
        The LLM's reply, stripped of surrounding whitespace.
    """
    answers, scores = vqa_answer
    predictions = "; ".join(
        f"{answer}, score:{score!s}" for answer, score in zip(answers, scores)
    )
    prompt = (
        f"This is the caption of a picture: {caption}. Question: {question}"
        f" VQA model predicts: {predictions}"
        f". Question: {question} Tell me the right answer with a sentence."
    )
    return _complete(prompt, max_tokens=30)


def gpt3(question: str, vqa_answer: str, caption: str) -> str:
    """Ask the LLM for the right answer from plain-text caption/question/answer.

    Unlike `gpt3_short` and `gpt3_long`, *vqa_answer* here is a single string.
    """
    prompt = f"{caption}\n{question}\n{vqa_answer}\n Tell me the right answer."
    return _complete(prompt, max_tokens=30)


def vle(input_image, input_text: str) -> VQAPrediction:
    """Run the VLE VQA pipeline and return ``(answers, scores)`` for the top 4."""
    vqa_pipeline = _load_vqa_pipeline()
    vqa_answers = vqa_pipeline(
        {"image": input_image, "question": input_text}, top_k=4
    )
    answers = [vqa["answer"] for vqa in vqa_answers]
    scores = [vqa["score"] for vqa in vqa_answers]
    return answers, scores


def inference_chat(input_image, input_text: str) -> Tuple[str, str, str]:
    """Answer *input_text* about *input_image*.

    Returns:
        ``(vqa_top_answer, llm_long_answer, llm_short_answer)``, in the order
        expected by the output components wired up below.
    """
    # Nothing to do (and no reason to spend an LLM call) without both inputs.
    if input_image is None or not (input_text or "").strip():
        return "", "", ""

    cap = caption(input_image)
    print(cap)
    # Disabled alternative: BLIP VQA.
    # inputs = processor(images=input_image, text=input_text,return_tensors="pt")
    # inputs["max_length"] = 10
    # inputs["num_beams"] = 5
    # inputs['num_return_sequences'] =4
    # out = model_vqa.generate(**inputs)
    # out=processor.batch_decode(out, skip_special_tokens=True)
    out = vle(input_image, input_text)
    long_answer = gpt3_long(input_text, out, cap)
    short_answer = gpt3_short(input_text, out, cap)
    top_answer = out[0][0] if out[0] else ""
    return top_answer, long_answer, short_answer


title = """# VQA with VLE and LLM"""
description = (
    "**VLE** (Visual-Language Encoder) is an image-text multimodal"
    " understanding model built on the pre-trained text and image encoders."
    " See https://github.com/iflytek/VLE for more details. We demonstrate"
    " visual question answering systems built with VLE and LLM."
)
description1 = (
    "**VQA**: The image and the question are fed to a VQA model (VLEForVQA)"
    " and the model predicts the answer. **VQA+LLM**: We feed the caption,"
    " question, and answers predicted by the VQA model to the LLM and ask the"
    " LLM to generate the final answer. \n"
    "The outputs from VQA+LLM may vary due to the decoding strategy of the LLM."
)

# NOTE(review): the component nesting below was reconstructed from a copy of
# this file whose indentation was lost; confirm it against the intended layout.
# The order in which components are created is kept unchanged because the CSS
# targets an auto-generated id (#component-21).
with gr.Blocks(
    css="""
    .message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
    #component-21 > div.wrap.svelte-w6rprc {height: 600px;}
    """
) as iface:
    state = gr.State([])
    # caption_output = None
    gr.Markdown(title)
    gr.Markdown(description)
    # gr.Markdown(article)

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="VQA Image Input")
            with gr.Row():
                with gr.Column(scale=1):
                    chat_input = gr.Textbox(lines=1, label="VQA Question Input")
                    with gr.Row():
                        clear_button = gr.Button(
                            value="Clear", interactive=True, width=30
                        )
                        submit_button = gr.Button(
                            value="Submit", interactive=True, variant="primary"
                        )
                        # Disabled extra buttons:
                        # cap_submit_button = gr.Button(
                        #     value="Submit_CAP", interactive=True, variant="primary"
                        # )
                        # gpt3_submit_button = gr.Button(
                        #     value="Submit_GPT3", interactive=True, variant="primary"
                        # )

        with gr.Column():
            gr.Markdown(description1)
            # NOTE(review): this box receives the raw top VQA answer (first
            # value returned by inference_chat), yet its label reads like a
            # truncated description of the LLM output — confirm intended text.
            caption_output = gr.Textbox(
                lines=0,
                label=(
                    "* VQA + LLM (short answer):\n The captioning model generates a caption \n"
                    " of the image. We feed the caption"
                ),
            )
            caption_output_v1 = gr.Textbox(lines=0, label="VQA + LLM (short answer)")
            gpt3_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (long answer)")

        # image_input.change(
        #     lambda: ("", [],"","",""),
        #     [],
        #     [ caption_output, state,caption_output,gpt3_output_v1,caption_output_v1],
        #     queue=False,
        # )
        chat_input.submit(
            inference_chat,
            [image_input, chat_input],
            [caption_output, gpt3_output_v1, caption_output_v1],
        )
        clear_button.click(
            lambda: ("", [], "", "", ""),
            [],
            [chat_input, state, caption_output, gpt3_output_v1, caption_output_v1],
            queue=False,
        )
        submit_button.click(
            inference_chat,
            [image_input, chat_input],
            [caption_output, gpt3_output_v1, caption_output_v1],
        )
        # Disabled wiring for the extra buttons:
        # cap_submit_button.click(
        #     caption,
        #     [
        #         image_input,
        #     ],
        #     [caption_output_v1],
        # )
        # gpt3_submit_button.click(
        #     gpt3,
        #     [
        #         chat_input,
        #         caption_output ,
        #         caption_output_v1,
        #     ],
        #     [gpt3_output_v1],
        # )

    examples = [
        ["bird.jpeg", "How many birds are there in the tree?", "2", "2", "2"],
        [
            "qa9.jpg",
            "What type of vehicle is being pulled by the horses ?",
            "carriage",
            "sled",
            "Sled",
        ],
        ["upload4.jpg", "What is this old man doing?", "fishing", "fishing", "Fishing"],
    ]
    examples = gr.Examples(
        examples=examples,
        inputs=[
            image_input,
            chat_input,
            caption_output,
            caption_output_v1,
            gpt3_output_v1,
        ],
    )

iface.queue(concurrency_count=1, api_open=False, max_size=10)
iface.launch(enable_queue=True)