from PIL import Image
import gradio as gr
import requests
import torch

from models.zhclip import ZhCLIPProcessor, ZhCLIPModel  # From https://www.github.com/yue-gang/ZH-CLIP

# Checkpoint identifier passed to both the model and the processor loaders.
version = 'nlpcver/zh-clip-vit-roberta-large-patch14'
model = ZhCLIPModel.from_pretrained(version)
processor = ZhCLIPProcessor.from_pretrained(version)


def get_result(image, text, text1):
    """Score how well each of two captions matches an image.

    Args:
        image: Image supplied by the ``gr.Image(type="pil")`` component.
        text: First candidate caption.
        text1: Second candidate caption.

    Returns:
        The softmax (over the last dimension) of the image-feature /
        text-feature dot products, as returned by the model's tensors.
        NOTE(review): presumably shape (1, 2), one probability per
        caption -- confirm against ZhCLIPModel's output shapes.
    """
    inputs = processor(
        text=[text, text1],
        images=image,
        return_tensors="pt",
        padding=True,
    )
    # Inference only: skip autograd bookkeeping so no graph is retained per
    # request and the displayed tensor carries no grad_fn.
    with torch.no_grad():
        outputs = model(**inputs)
        image_features = outputs.image_features
        text_features = outputs.text_features
        text_probs = (image_features @ text_features.T).softmax(dim=-1)
    return text_probs


with gr.Blocks(
    css="""
    .message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
    #component-21 > div.wrap.svelte-w6rprc {height: 600px;}
    """
) as iface:
    state = gr.State([])

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="Image Input")
            with gr.Row():
                with gr.Column(scale=1):
                    chat_input = gr.Textbox(lines=1, label="Captions0 Input")
                    chat_input1 = gr.Textbox(lines=1, label="Captions1 Input")
                    with gr.Row():
                        clear_button = gr.Button(value="Clear", interactive=True, width=30)
                        submit_button = gr.Button(
                            value="Submit", interactive=True, variant="primary"
                        )
        with gr.Column():
            caption_output = gr.Textbox(lines=0, label="ITM")

    # The callback must return exactly one value per output component. The
    # original returned five values for three outputs; reset every input and
    # the result box so the tuple and the output list line up one-to-one.
    clear_button.click(
        lambda: (None, "", "", [], ""),
        [],
        [image_input, chat_input, chat_input1, state, caption_output],
        queue=False,
    )
    submit_button.click(
        get_result,
        [
            image_input,
            chat_input,
            chat_input1,
        ],
        [caption_output],
    )

iface.queue(concurrency_count=1, api_open=False, max_size=10)
iface.launch(enable_queue=True)