mathMentor / app.py
Ayush0804's picture
Update app.py
781befc verified
raw
history blame
7.45 kB
import gradio as gr
from transformers import AutoModelForCausalLM,AutoProcessor,pipeline
from PIL import Image
import os
import tempfile
import torch
from pathlib import Path
import secrets
# Load the vision-language model and its processor once, at import time.
model_id = "microsoft/Phi-3.5-vision-instruct"

# The checkpoint ships custom modelling code, hence trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(
    model_id,
    num_crops=16,
    trust_remote_code=True,
)

# Running chat history for the math model; reset by process_image() and
# updated by get_math_response().
math_messages = []
# Function for processing the image
def process_image(image, should_convert=False):
    """Describe the math-related content of an image using the vision model.

    Calling this function also clears the global ``math_messages`` history,
    so a new image starts a new conversation.

    Args:
        image: A PIL image (an upload, or the sketchpad composite).
        should_convert: When True the image is treated as a sketch with an
            alpha channel and is flattened onto a white RGB background
            before being sent to the model.

    Returns:
        The decoded text generated by the model for the image.
    """
    global math_messages
    math_messages = []

    # If the input was a sketch, flatten it onto a white RGB canvas using
    # the image itself as the paste mask.
    if should_convert:
        canvas = Image.new('RGB', size=(image.width, image.height), color=(255, 255, 255))
        canvas.paste(image, (0, 0), mask=image)
        image = canvas

    # NOTE: the image is handed to the processor in memory. The previous
    # version also wrote it to a temporary .jpg that nothing read back and
    # nothing deleted (and saving failed for image modes JPEG cannot store),
    # so that step has been dropped.

    # Phi-3.5-vision expects plain-string message contents in which each
    # image is referenced by an "<|image_N|>" placeholder (see model card).
    instruction = (
        'Please describe the math-related content in this image, ensuring that any LaTeX formulas '
        'are correctly transcribed. Non-mathematical details do not need to be described.'
    )
    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': '<|image_1|>\n' + instruction},
    ]
    prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Tokenise the prompt together with the image and place the tensors on
    # the same device as the model.
    inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

    # Generate the response
    generation_args = {
        "max_new_tokens": 1000,
        "temperature": 0.2,
        "do_sample": True,
    }
    generate_ids = model.generate(
        **inputs,
        eos_token_id=processor.tokenizer.eos_token_id,
        **generation_args,
    )

    # Drop the prompt tokens so only the newly generated text is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response
# Function to get math-response from the processed image

# Lazily created text-generation pipeline, shared by every request so the
# model is not reloaded on each call.
_math_pipeline = None


def _get_math_pipeline():
    """Return the shared math text-generation pipeline, creating it on first use."""
    global _math_pipeline
    if _math_pipeline is None:
        _math_pipeline = pipeline(
            "text-generation",
            model="deepseek-ai/DeepSeek-V2.5-1210",
            trust_remote_code=True,
        )
    return _math_pipeline


def _extract_answer(response):
    """Return the assistant text from a text-generation pipeline result, or None.

    For chat-style input the transformers pipeline is documented to return a
    list of dicts whose ``generated_text`` is the conversation with the new
    assistant message appended; a plain string ``generated_text`` is also
    accepted here.
    """
    for item in response or []:
        generated = item.get('generated_text') if isinstance(item, dict) else None
        if isinstance(generated, str) and generated:
            return generated
        if isinstance(generated, list) and generated:
            last = generated[-1]
            if isinstance(last, dict) and last.get('role') == 'assistant':
                return last.get('content')
    return None


def get_math_response(image_description, user_question):
    """Ask the math model a question, optionally grounded in an image description.

    This is a generator: it yields the answer (with every backslash doubled)
    when the model produced one, and yields nothing otherwise.

    Args:
        image_description: Text describing the image, or None when there is
            no image.
        user_question: The question typed by the user.

    Yields:
        The model's answer with each backslash replaced by two backslashes.
    """
    global math_messages
    if not math_messages:
        math_messages.append({'role': 'system', 'content': 'You are a helpful math assistant.'})
    # Keep only the first (system) message: each question is answered
    # without the earlier turns.
    math_messages = math_messages[:1]

    if image_description is not None:
        content = f'Image description: {image_description}\n\n'
    else:
        content = ''
    query = f"{content}User question: {user_question}"
    math_messages.append({'role': 'user', 'content': query})

    response = _get_math_pipeline()(math_messages)
    print(response)
    answer = _extract_answer(response)
    print(f'query: {query}\nanswer: {answer}')

    # Update the history before yielding so the bookkeeping does not depend
    # on the caller resuming the generator.
    if answer is None:
        # No answer: drop the unanswered user turn again.
        math_messages.pop()
        return
    math_messages.append({'role': 'assistant', 'content': answer})
    yield answer.replace("\\", "\\\\")
# Chatbot entry point wired to the Submit button
def math_chat_bot(image, sketchpad, question, state):
    """Answer a question, using the image from whichever input tab is active.

    Args:
        image: The image from the "Upload" tab, or None.
        sketchpad: The sketchpad value; its "composite" entry is used as the
            drawn image.
        question: The user's question text.
        state: Session dict whose "tab_index" selects the input source
            (0 = upload, 1 = sketch).

    Yields:
        Whatever get_math_response() yields for the question.
    """
    active_tab = state["tab_index"]
    description = None

    if active_tab == 0:
        # Upload tab: describe the uploaded picture if there is one.
        description = process_image(image) if image is not None else None
    elif active_tab == 1:
        # Sketch tab: describe the drawn composite if there is one.
        print(sketchpad)
        drawing = sketchpad["composite"] if sketchpad else None
        if drawing:
            description = process_image(drawing, True)

    yield from get_math_response(description, question)
# Custom stylesheet handed to gr.Blocks(css=...). It targets the element with
# id "qwen-md" (the answer Markdown component) and sets KaTeX display-mode
# containers inside it to `display: inline`.
css = """
#qwen-md .katex-display { display: inline; }
#qwen-md .katex-display>.katex { display: inline; }
#qwen-md .katex-display>.katex>.katex-html { display: inline; }
"""
def tabs_select(e: gr.SelectData, _state):
    """Store the index of the newly selected input tab in the session state.

    Args:
        e: The selection event; its ``index`` is the selected tab's index.
        _state: The session state dict, modified in place.
    """
    _state.update(tab_index=e.index)
# Create the Gradio interface
with gr.Blocks(css=css) as demo:
    # Banner naming the models this app actually loads.
    gr.HTML(
        """\
<center><font size=3>This WebUI is based on Phi-3.5-vision for OCR and DeepSeek-V2.5 for mathematical reasoning. You can input either images or texts of mathematical or arithmetic problems.</font></center>"""
    )
    # Per-session state: remembers which input tab is active.
    state = gr.State({"tab_index": 0})
    with gr.Row():
        # Left column: inputs and buttons.
        with gr.Column():
            with gr.Tabs() as input_tabs:
                with gr.Tab("Upload"):
                    # No trailing comma here: input_image must be the
                    # component itself, not a one-element tuple.
                    input_image = gr.Image(type="pil", label="Upload")
                with gr.Tab("Sketch"):
                    input_sketchpad = gr.Sketchpad(type="pil", label="Sketch", layers=False)
            input_tabs.select(fn=tabs_select, inputs=[state])
            input_text = gr.Textbox(label="input your question")
            with gr.Row():
                with gr.Column():
                    clear_btn = gr.ClearButton(
                        [input_image, input_sketchpad, input_text])
                with gr.Column():
                    submit_btn = gr.Button("Submit", variant="primary")
        # Right column: rendered answer.
        with gr.Column():
            # The delimiter strings have the same runtime values as before;
            # "\\{" / "\\}" replace the invalid escape sequences "\{" / "\}".
            # NOTE(review): the backslash before each brace is preserved from
            # the original; confirm whether the frontend expects
            # "\begin{equation}" without it.
            output_md = gr.Markdown(
                label="answer",
                latex_delimiters=[
                    {"left": "\\(", "right": "\\)", "display": True},
                    {"left": "\\begin\\{equation\\}", "right": "\\end\\{equation\\}", "display": True},
                    {"left": "\\begin\\{align\\}", "right": "\\end\\{align\\}", "display": True},
                    {"left": "\\begin\\{alignat\\}", "right": "\\end\\{alignat\\}", "display": True},
                    {"left": "\\begin\\{gather\\}", "right": "\\end\\{gather\\}", "display": True},
                    {"left": "\\begin\\{CD\\}", "right": "\\end\\{CD\\}", "display": True},
                    {"left": "\\[", "right": "\\]", "display": True},
                ],
                elem_id="qwen-md",
            )
    submit_btn.click(
        fn=math_chat_bot,
        inputs=[input_image, input_sketchpad, input_text, state],
        outputs=output_md)
demo.launch()