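# Gradio demo app for Velvet, a prompting vision-language model for image captioning
# and visual question answering in English and Vietnamese.
# Assumes the `visual_bloom.torch` checkpoint is available in the working directory
# and that a CUDA device can be used for inference.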
import gradio as gr
import spaces

from standalone_velvet import setup_models

# Load the model, its tokenizer, and the image feature collator from the checkpoint,
# then move the model to the GPU.
models_dict = setup_models("visual_bloom.torch")
visual_bloom = models_dict["visual_bloom"].to('cuda')
tokenizer = models_dict["tokenizer"]
image_feature_collator = models_dict["image_feature_collator"]
# `@spaces.GPU` requests a GPU for the duration of each call; it is assumed here that
# `spaces` is the Hugging Face Spaces (ZeroGPU) helper package. It is a no-op on
# non-ZeroGPU hardware.
@spaces.GPU
def run_inference(text_input, image_input):
    # Turn the PIL image into visual features and the prompt into token ids.
    image_features, image_attentions = image_feature_collator([image_input])
    instruction_inputs = tokenizer([text_input], return_tensors="pt")
    language_output = visual_bloom.generate(
        image_features.to('cuda'),
        image_attentions.to('cuda'),
        instruction_inputs["input_ids"].to('cuda'),
        instruction_inputs["attention_mask"].to('cuda'),
    )
    # Decode the generated ids and keep only the first sentence.
    human_output = tokenizer.decode(language_output[0], skip_special_tokens=True)
    return human_output.split(".")[0]
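
# Minimal sketch of calling `run_inference` directly (outside the Gradio UI),
# assuming a CUDA device and the bundled example image; the image path and the
# question below come from the examples defined further down:
#
#   from PIL import Image
#   image = Image.open("examples/cat.png")
#   print(run_inference("Generate caption in en:", image))
#   print(run_inference("Generate answer in en: what is the color of the cat?", image))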
if __name__ == "__main__":
    markdown = """
# Quick introduction

We have proposed a prompting vision-language model.
The model can caption images and answer questions about images.
It is trained on CC3M, COCO, VQAv2, OK-VQA, TextCaps, and TextVQA.
Thanks to Google Translate,
these datasets collectively contain millions of image-text pairs in both English and Vietnamese.

For further details, please refer to [Velvet](https://github.com/dinhanhx/velvet?tab=readme-ov-file#introduction).

# Usage

## Run with pre-defined examples

1. Scroll to the bottom of the page to see the examples.
2. Click one of them.
3. Click the `Run Inference` button.

## Run with user-defined inputs

### 1. Prepare text input

Image captioning:

- `Generate caption in en:`
- `Generate caption in vi:`

Visual question answering:

- `Generate answer in en: <question>?`
- `Generate answer in vi: <question>?`

Don't forget to replace `<question>` with your own question, in either English or Vietnamese.
To write the prompt, you can refer to the examples at the bottom of the page.

### 2. Prepare image input

Follow the instructions in the Image Input box. A wide range of image formats is supported by PIL.

### 3. Click the `Run Inference` button
"""
    examples = [
        ["Generate caption in en:", "examples/cat.png"],
        ["Generate caption in vi:", "examples/cat.png"],
        ["Generate answer in en: what is the color of the cat?", "examples/cat.png"],
        ["Generate answer in vi: màu sắc của con mèo là gì?", "examples/cat.png"],
    ]
    with gr.Blocks() as demo:
        gr.Markdown(markdown)
        text_input = gr.Textbox(label="Text Input")
        image_input = gr.Image(label="Image Input", type="pil")
        text_output = gr.Textbox(label="Text Output")
        infer_button = gr.Button("Run Inference")
        infer_button.click(
            run_inference, inputs=[text_input, image_input], outputs=text_output
        )
        examples = gr.Examples(
            examples=examples,
            inputs=[text_input, image_input],
        )

    demo.launch()
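    # Optional: when running locally, `demo.launch(share=True)` would also create a
    # temporary public Gradio link (not needed when the app is hosted on Spaces).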