Spaces:

maxiw
/

XGen-MM

Sleeping

App Files Files Community

XGen-MM / app.py

maxiw

added app implementation and reqs

4f199bf 8 months ago

raw

history blame

3.8 kB

	import gradio as gr
	from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor, StoppingCriteria
	import spaces
	import torch
	from PIL import Image

	models = {
	"Salesforce/xgen-mm-phi3-mini-instruct-r-v1": AutoModelForVision2Seq.from_pretrained("Salesforce/xgen-mm-phi3-mini-instruct-r-v1", trust_remote_code=True).to("cuda").eval(),
	}

	processors = {
	"Salesforce/xgen-mm-phi3-mini-instruct-r-v1": AutoImageProcessor.from_pretrained("Salesforce/xgen-mm-phi3-mini-instruct-r-v1", trust_remote_code=True),
	}

	tokenizers = {
	"Salesforce/xgen-mm-phi3-mini-instruct-r-v1": AutoTokenizer.from_pretrained("Salesforce/xgen-mm-phi3-mini-instruct-r-v1", trust_remote_code=True, use_fast=False, legacy=False)
	}


	DESCRIPTION = "# [XGen-MM Demo](https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-r-v1)"


	def apply_prompt_template(prompt):
	s = (
	'<\|system\|>\nA chat between a curious user and an artificial intelligence assistant. '
	"The assistant gives helpful, detailed, and polite answers to the user's questions.<\|end\|>\n"
	f'<\|user\|>\n<image>\n{prompt}<\|end\|>\n<\|assistant\|>\n'
	)
	return s


	class EosListStoppingCriteria(StoppingCriteria):
	def __init__(self, eos_sequence = [32007]):
	self.eos_sequence = eos_sequence

	def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
	last_ids = input_ids[:,-len(self.eos_sequence):].tolist()
	return self.eos_sequence in last_ids


	@spaces.GPU
	def run_example(image, text_input=None, model_id="Salesforce/xgen-mm-phi3-mini-instruct-r-v1"):
	model = models[model_id]
	processor = processors[model_id]
	tokenizer = tokenizers[model_id]
	tokenizer = model.update_special_tokens(tokenizer)

	image = Image.fromarray(image).convert("RGB")
	prompt = apply_prompt_template(text_input)
	language_inputs = tokenizer([prompt], return_tensors="pt")

	inputs = processor([image], return_tensors="pt", image_aspect_ratio='anyres')
	inputs.update(language_inputs)
	inputs = {name: tensor.cuda() for name, tensor in inputs.items()}

	generated_text = model.generate(**inputs, image_size=[image.size],
	pad_token_id=tokenizer.pad_token_id,
	do_sample=False, max_new_tokens=768, top_p=None, num_beams=1,
	stopping_criteria = [EosListStoppingCriteria()],
	)

	prediction = tokenizer.decode(generated_text[0], skip_special_tokens=True).split("<\|end\|>")[0]
	return prediction
	css = """
	#output {
	height: 500px;
	overflow: auto;
	border: 1px solid #ccc;
	}
	"""

	with gr.Blocks(css=css) as demo:
	gr.Markdown(DESCRIPTION)
	with gr.Tab(label="XGen-MM Input"):
	with gr.Row():
	with gr.Column():
	input_img = gr.Image(label="Input Picture")
	model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Salesforce/xgen-mm-phi3-mini-instruct-r-v1")
	text_input = gr.Textbox(label="Question")
	submit_btn = gr.Button(value="Submit")
	with gr.Column():
	output_text = gr.Textbox(label="Output Text")

	gr.Examples(
	examples=[
	["image1.jpg", "ScreenQA", "What is the version of the settings?"],
	["image1.jpg", "ScreenQA", "What is the state of use lower resolution images?"],
	["image2.jpg", "ScreenQA", "How much is the discount for the product?"]
	],
	inputs=[input_img, text_input],
	outputs=[output_text],
	fn=run_example,
	cache_examples=True,
	label="Try examples"
	)

	submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])

	demo.launch(debug=True)