import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, pipeline
from PIL import Image
import os
import tempfile
import torch
from pathlib import Path
import secrets
# Initialise the Hugging Face vision-language model used for OCR
model_id = "microsoft/Phi-3.5-vision-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
)
# float16 weights effectively assume a GPU; fall back to CPU if none is available
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
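# Per the Phi-3.5-vision model card, num_crops=16 is the recommended setting
# for single-image inputs (4 for multi-frame inputs).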
math_messages = []
# Build the math-reasoning pipeline once at start-up so the model weights are
# not reloaded on every question. Note: DeepSeek-V2.5 is a very large model
# and assumes correspondingly large GPU memory.
math_pipe = pipeline(
    "text-generation",
    model="deepseek-ai/DeepSeek-V2.5-1210",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)
# Function for processing the image
def process_image(image, should_convert=False):
    '''
    Saves the uploaded image or sketch, then extracts math-related
    descriptions from it using the vision model.
    '''
    global math_messages
    math_messages = []
    # Create a temporary directory for saving images
    uploaded_file_dir = os.environ.get("GRADIO_TEMP_DIR") or str(Path(tempfile.gettempdir()) / "gradio")
    os.makedirs(uploaded_file_dir, exist_ok=True)
    # Save the uploaded image as a temporary file
    name = f"tmp{secrets.token_hex(20)}.jpg"
    filename = os.path.join(uploaded_file_dir, name)
    # If the input was a sketch, flatten it onto a white RGB background
    if should_convert:
        new_img = Image.new('RGB', size=(image.width, image.height), color=(255, 255, 255))
        new_img.paste(image, (0, 0), mask=image)
        image = new_img
    # Save the image to the temporary file
    image.save(filename)
    # Build the prompt for the vision model; Phi-3.5-vision expects numbered
    # image placeholders such as <|image_1|> inside plain-string message contents
    messages = [{
        'role': 'user',
        'content': '<|image_1|>\nPlease describe the math-related content in this image, '
                   'ensuring that any LaTeX formulas are correctly transcribed. '
                   'Non-mathematical details do not need to be described.'
    }]
    prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Process the inputs; the processor expects the images as a list
    inputs = processor(prompt, [image], return_tensors="pt").to(model.device)
    # Generate the response
    generation_args = {
        "max_new_tokens": 1000,
        "temperature": 0.2,
        "do_sample": True,
    }
    generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)
    # Decode only the newly generated tokens, skipping the prompt
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return response
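# Example (hypothetical file name): the OCR step can be exercised on its own,
# e.g.
#     desc = process_image(Image.open("equation.png"))
#     print(desc)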
# Function to get the math response for a processed image description
def get_math_response(image_description, user_question):
    global math_messages
    if not math_messages:
        math_messages.append({'role': 'system', 'content': 'You are a helpful math assistant.'})
    # Keep only the system message so each question is answered independently
    math_messages = math_messages[:1]
    if image_description is not None:
        content = f'Image description: {image_description}\n\n'
    else:
        content = ''
    query = f"{content}User question: {user_question}"
    math_messages.append({'role': 'user', 'content': query})
    # Run the math-reasoning pipeline; a chat-format pipeline returns the
    # whole conversation, with the assistant's reply as the final message
    try:
        response = math_pipe(math_messages, max_new_tokens=1024)
        answer = response[0]["generated_text"][-1]["content"]
    except Exception:
        # Drop the failed user turn so the history stays consistent
        math_messages.pop()
        raise
    # Escape backslashes so LaTeX survives Markdown rendering
    yield answer.replace("\\", "\\\\")
    print(f'query: {query}\nanswer: {answer}')
    math_messages.append({'role': 'assistant', 'content': answer})
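# Example (hypothetical inputs): get_math_response is a generator, so the
# answer can be consumed as it becomes available, e.g.
#     for partial in get_math_response("A right triangle with legs 3 and 4",
#                                      "Find the hypotenuse"):
#         print(partial)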
# Creating the chatbot callback
def math_chat_bot(image, sketchpad, question, state):
    current_tab_index = state["tab_index"]
    image_description = None
    # Upload tab
    if current_tab_index == 0:
        if image is not None:
            image_description = process_image(image)
    # Sketch tab
    elif current_tab_index == 1:
        if sketchpad and sketchpad["composite"]:
            image_description = process_image(sketchpad["composite"], True)
    yield from get_math_response(image_description, question)
css = """
#qwen-md .katex-display { display: inline; }
#qwen-md .katex-display>.katex { display: inline; }
#qwen-md .katex-display>.katex>.katex-html { display: inline; }
"""
def tabs_select(e: gr.SelectData, _state):
    _state["tab_index"] = e.index
# Create the Gradio interface
with gr.Blocks(css=css) as demo:
    gr.HTML(
        """\
<center><font size=3>This WebUI uses Phi-3.5-vision-instruct for OCR and DeepSeek-V2.5 for mathematical reasoning. You can input either images or text of mathematical or arithmetic problems.</center>"""
    )
    state = gr.State({"tab_index": 0})
    with gr.Row():
        with gr.Column():
            with gr.Tabs() as input_tabs:
                with gr.Tab("Upload"):
                    input_image = gr.Image(type="pil", label="Upload")
                with gr.Tab("Sketch"):
                    input_sketchpad = gr.Sketchpad(type="pil", label="Sketch", layers=False)
            input_tabs.select(fn=tabs_select, inputs=[state])
            input_text = gr.Textbox(label="Input your question")
            with gr.Row():
                with gr.Column():
                    clear_btn = gr.ClearButton([input_image, input_sketchpad, input_text])
                with gr.Column():
                    submit_btn = gr.Button("Submit", variant="primary")
        with gr.Column():
            output_md = gr.Markdown(
                label="answer",
                latex_delimiters=[
                    {"left": "\\(", "right": "\\)", "display": True},
                    {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True},
                    {"left": "\\begin{align}", "right": "\\end{align}", "display": True},
                    {"left": "\\begin{alignat}", "right": "\\end{alignat}", "display": True},
                    {"left": "\\begin{gather}", "right": "\\end{gather}", "display": True},
                    {"left": "\\begin{CD}", "right": "\\end{CD}", "display": True},
                    {"left": "\\[", "right": "\\]", "display": True},
                ],
                elem_id="qwen-md",
            )
    submit_btn.click(
        fn=math_chat_bot,
        inputs=[input_image, input_sketchpad, input_text, state],
        outputs=output_md,
    )
demo.launch()
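# Note: on a hosted platform such as Spaces, launch() binds to the port the
# platform provides; for local testing, demo.launch(share=True) would expose
# a temporary public URL.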