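# Gradio demo app for Velvet, a prompting vision-language model for image captioning
# and visual question answering in English and Vietnamese.
# Assumes the `visual_bloom.torch` checkpoint is available in the working directory
# and that a CUDA device can be used for inference.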
import gradio as gr
import spaces

from standalone_velvet import setup_models

# Load the model, its tokenizer, and the image feature collator from the checkpoint,
# then move the model to the GPU.
models_dict = setup_models("visual_bloom.torch")
visual_bloom = models_dict["visual_bloom"].to('cuda')
tokenizer = models_dict["tokenizer"]
image_feature_collator = models_dict["image_feature_collator"]
# `@spaces.GPU` requests a GPU for the duration of each call; it is assumed here that
# `spaces` is the Hugging Face Spaces (ZeroGPU) helper package. It is a no-op on
# non-ZeroGPU hardware.
@spaces.GPU
def run_inference(text_input, image_input):
    # Turn the PIL image into visual features and the prompt into token ids.
    image_features, image_attentions = image_feature_collator([image_input])
    instruction_inputs = tokenizer([text_input], return_tensors="pt")
    language_output = visual_bloom.generate(
        image_features.to('cuda'),
        image_attentions.to('cuda'),
        instruction_inputs["input_ids"].to('cuda'),
        instruction_inputs["attention_mask"].to('cuda'),
    )
    # Decode the generated ids and keep only the first sentence.
    human_output = tokenizer.decode(language_output[0], skip_special_tokens=True)
    return human_output.split(".")[0]
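
# Minimal sketch of calling `run_inference` directly (outside the Gradio UI),
# assuming a CUDA device and the bundled example image; the image path and the
# question below come from the examples defined further down:
#
#   from PIL import Image
#   image = Image.open("examples/cat.png")
#   print(run_inference("Generate caption in en:", image))
#   print(run_inference("Generate answer in en: what is the color of the cat?", image))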
if __name__ == "__main__":
    markdown = """
# Quick introduction

We have proposed a prompting vision-language model.
The model can caption images and answer questions about images.
It is trained on CC3M, COCO, VQAv2, OK-VQA, TextCaps, and TextVQA.
Thanks to Google Translate,
these datasets collectively contain millions of image-text pairs in both English and Vietnamese.

For further details, please refer to [Velvet](https://github.com/dinhanhx/velvet?tab=readme-ov-file#introduction).

# Usage

## Run with pre-defined examples

1. Scroll to the bottom of the page to see the examples.
2. Click one of them.
3. Click the `Run Inference` button.

## Run with user-defined inputs

### 1. Prepare text input

Image captioning:

- `Generate caption in en:`
- `Generate caption in vi:`

Visual question answering:

- `Generate answer in en: <question>?`
- `Generate answer in vi: <question>?`

Don't forget to replace `<question>` with your own question, in either English or Vietnamese.
To write the prompt, you can refer to the examples at the bottom of the page.

### 2. Prepare image input

Follow the instructions in the Image Input box. A wide range of image formats is supported by PIL.

### 3. Click the `Run Inference` button
"""
    examples = [
        ["Generate caption in en:", "examples/cat.png"],
        ["Generate caption in vi:", "examples/cat.png"],
        ["Generate answer in en: what is the color of the cat?", "examples/cat.png"],
        ["Generate answer in vi: màu sắc của con mèo là gì?", "examples/cat.png"],
    ]
    with gr.Blocks() as demo:
        gr.Markdown(markdown)
        text_input = gr.Textbox(label="Text Input")
        image_input = gr.Image(label="Image Input", type="pil")
        text_output = gr.Textbox(label="Text Output")
        infer_button = gr.Button("Run Inference")
        infer_button.click(
            run_inference, inputs=[text_input, image_input], outputs=text_output
        )
        examples = gr.Examples(
            examples=examples,
            inputs=[text_input, image_input],
        )

    demo.launch()
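    # Optional: when running locally, `demo.launch(share=True)` would also create a
    # temporary public Gradio link (not needed when the app is hosted on Spaces).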