Spaces:

akshit-g
/

SeeForMe-Image

Runtime error

App Files Files Community

SeeForMe-Image / app.py

akshit-g

add : files

8ac4152 3 months ago

raw

history blame contribute delete

3.51 kB

	import spaces
	import torch
	import re
	import gradio as gr
	from threading import Thread
	from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
	from PIL import ImageDraw
	from torchvision.transforms.v2 import Resize

	import os
	import subprocess
	import sys

	# Best-effort runtime install of flash-attn (no exception on failure, like the
	# original call). The environment is *extended* rather than replaced: passing a
	# bare dict as `env` would drop PATH and every other inherited variable, so pip
	# could resolve to the wrong interpreter or not be found at all.
	# `sys.executable -m pip` installs into the interpreter running this app, and
	# the argument list avoids going through a shell.
	subprocess.run(
	    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
	    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
	    check=False,
	)

	# Checkpoint identifier and the pinned revision shared by tokenizer and model.
	model_id = "vikhyatk/moondream2"
	revision = "2024-08-26"
	tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
	# Load the model in bfloat16 on the CUDA device with flash-attention enabled
	# and switch it to inference mode right away (`eval()` returns the module).
	moondream = AutoModelForCausalLM.from_pretrained(
	    model_id,
	    revision=revision,
	    trust_remote_code=True,
	    attn_implementation="flash_attention_2",
	    device_map={"": "cuda"},
	    torch_dtype=torch.bfloat16,
	).eval()


	# NOTE(review): duration=10 presumably bounds the GPU time granted per call
	# (seconds) -- confirm against the `spaces` package documentation.
	@spaces.GPU(duration=10)
	def answer_question(img, prompt):
	    """Stream the model's answer to ``prompt`` about ``img``.

	    Generation runs in a background thread that feeds a
	    ``TextIteratorStreamer``; this generator consumes the streamer and yields
	    the text accumulated so far after every new chunk.

	    Args:
	        img: Image to describe (the UI supplies a PIL image).
	        prompt: Question or instruction for the model.

	    Yields:
	        The cumulative answer so far, with surrounding whitespace stripped.

	    Raises:
	        gr.Error: If no image was provided.
	    """
	    if img is None:
	        # Fail with a readable message instead of an opaque error from
	        # inside the image encoder.
	        raise gr.Error("Please upload an image before asking a question.")
	    image_embeds = moondream.encode_image(img)
	    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
	    thread = Thread(
	        target=moondream.answer_question,
	        kwargs={
	            "image_embeds": image_embeds,
	            "question": prompt,
	            "tokenizer": tokenizer,
	            "streamer": streamer,
	        },
	    )
	    thread.start()
	    buffer = ""
	    for new_text in streamer:
	        buffer += new_text
	        yield buffer.strip()
	    # Wait for the worker so generation has fully finished before this call
	    # returns.
	    thread.join()

	def extract_floats(text):
	    """Find the first bracketed list of four decimal numbers in ``text``.

	    Matches text such as ``[0.1, 0.2, 0.3, 0.4]``; whitespace around the
	    brackets and commas is optional. Each number must contain a decimal
	    point and may carry a leading minus sign.

	    Args:
	        text: String to search. ``None`` or an empty string yields ``None``.

	    Returns:
	        A list of four floats, or ``None`` when no such list is present.
	    """
	    if not text:
	        return None
	    # `\s*` (not a bare `\s`) so both "[0.1, 0.2, ...]" and
	    # "[ 0.1 , 0.2 , ... ]" are accepted.
	    number = r"(-?\d+\.\d+)"
	    pattern = r"\[\s*" + r"\s*,\s*".join([number] * 4) + r"\s*\]"
	    match = re.search(pattern, text)
	    if match is None:
	        return None
	    return [float(num) for num in match.groups()]


	def extract_bbox(text):
	    """Return the bounding box found in ``text`` as an ``(x1, y1, x2, y2)`` tuple.

	    Args:
	        text: String that may contain a bracketed list of four numbers.

	    Returns:
	        A 4-tuple of floats, or ``None`` when ``extract_floats`` finds nothing.
	    """
	    # Parse once and reuse the result instead of running the search twice.
	    coords = extract_floats(text)
	    if coords is None:
	        return None
	    x1, y1, x2, y2 = coords
	    return (x1, y1, x2, y2)

	def process_answer(img, answer):
	    """Draw the bounding box mentioned in ``answer`` onto a resized copy of ``img``.

	    The box coordinates are multiplied by the resized image's width and
	    height, i.e. they are treated as fractions of the image size.

	    Args:
	        img: Source image (the UI supplies a PIL image), or ``None``.
	        answer: Model response text that may contain a bounding box.

	    Returns:
	        A ``gr.update`` that shows the annotated image when a box was found,
	        or hides the annotation component otherwise.
	    """
	    # Parse once; the answer text changes repeatedly while it streams in.
	    bbox = extract_bbox(answer)
	    if img is None or bbox is None:
	        return gr.update(visible=False, value=None)
	    x1, y1, x2, y2 = bbox
	    # NOTE(review): Resize(768) presumably scales the shorter side to 768 px
	    # while keeping the aspect ratio -- confirm against the torchvision docs.
	    draw_image = Resize(768)(img)
	    width, height = draw_image.size
	    # Order the corners so the rectangle is always given as
	    # (left, top, right, bottom); recent Pillow releases appear to reject
	    # reversed corners with ValueError.
	    left, right = sorted((int(x1 * width), int(x2 * width)))
	    top, bottom = sorted((int(y1 * height), int(y2 * height)))
	    ImageDraw.Draw(draw_image).rectangle(
	        (left, top, right, bottom), outline="red", width=3
	    )
	    return gr.update(visible=True, value=draw_image)

	# UI layout: header text, a prompt row, then the input image next to the
	# streamed response and the (initially hidden) annotated image.
	with gr.Blocks() as demo:
	    gr.Markdown(
	        """
	# See For Me : Real-time Video Assistance for the Visually Impaired using DL
	The "See For Me" web application is designed to support visually challenged individuals by enhancing their ability to navigate and interact with their environment. Leveraging advancements in machine learning (ML) and deep learning (DL), the project aims to provide real-time visual assistance, enabling users to access and understand textual information in their surroundings.
	"""
	    )
	    with gr.Row():
	        prompt = gr.Textbox(label="Input", value="Describe this image.", scale=4)
	        submit = gr.Button("Submit")
	    with gr.Row():
	        img = gr.Image(type="pil", label="Upload an Image")
	        with gr.Column():
	            output = gr.Markdown(label="Response")
	            ann = gr.Image(visible=False, label="Annotated Image")
	    # The button click and the textbox submit both start the same streamed answer.
	    submit.click(fn=answer_question, inputs=[img, prompt], outputs=output)
	    prompt.submit(fn=answer_question, inputs=[img, prompt], outputs=output)
	    # Every change of the response text re-evaluates the bounding-box overlay.
	    output.change(
	        fn=process_answer,
	        inputs=[img, output],
	        outputs=ann,
	        show_progress=False,
	    )

	# Enable request queuing and start the app.
	demo.queue().launch()