import gradio as gr
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import cv2
import numpy as np
import os
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
print(f"CUDA version: {torch.version.cuda}")
print(f"Device count: {torch.cuda.device_count()}")
print(f"Current device: {torch.cuda.current_device()}")
print(f"Device name: {torch.cuda.get_device_name()}")
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Set the default tensor type to CUDA float32 (note: torch.set_default_tensor_type is deprecated
# in newer PyTorch releases in favor of torch.set_default_dtype / torch.set_default_device)
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
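# load_model: fetch Qwen2-VL-2B-Instruct and its processor; float16 with device_map="auto" on GPU, float32 on CPU.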
def load_model():
try:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-2B-Instruct",
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
device_map="auto",
low_cpu_mem_usage=True
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
return model, processor, device
except Exception as e:
print(f"Error loading model: {e}")
return None, None, None
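# Load the model, processor, and device once at import time so every Gradio request reuses them.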
model, processor, device = load_model()
SYSTEM_PROMPT = """You are an expert technical analyst specializing in identifying bugs, fixing errors, and explaining code functions from visual inputs. When presented with an image or video:
1. If you see code, analyze it for potential bugs or errors, and suggest fixes.
2. If you see a function or algorithm, explain its purpose and how it works.
3. If you see a technical diagram or flowchart, interpret its meaning and purpose.
4. For any technical content, provide detailed explanations and insights.
Always maintain a professional and technical tone in your responses."""
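# Route an uploaded file to the image or video analyzer based on its extension.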
def process_content(file, user_prompt):
if file is None:
return "No content provided. Please upload an image or video of technical content."
    # gr.File may return a filepath string or a tempfile-like object depending on the Gradio version
    file_path = file if isinstance(file, str) else file.name
file_extension = os.path.splitext(file_path)[1].lower()
if file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
        image = Image.open(file_path).convert("RGB")  # normalize mode (handles RGBA/paletted images)
        return analyze_image(image, user_prompt)
elif file_extension in ['.mp4', '.avi', '.mov']:
return analyze_video(file_path, user_prompt)
else:
return "Unsupported file type. Please provide an image (jpg, jpeg, png, bmp) or video (mp4, avi, mov) of technical content."
def analyze_image(image, prompt):
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": f"Based on the system instructions, {prompt}"},
],
}
]
return generate_response(messages)
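# Sample up to max_frames frames (every frame_interval-th frame, resized so the longer side is
# max_resolution pixels) and pass them to the model as a video input.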
def analyze_video(video_path, prompt, max_frames=16, frame_interval=30, max_resolution=224):
try:
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
return "Error: Could not open video file."
frames = []
frame_count = 0
while len(frames) < max_frames:
ret, frame = cap.read()
if not ret:
break
if frame_count % frame_interval == 0:
h, w = frame.shape[:2]
if h > w:
new_h, new_w = max_resolution, int(w * max_resolution / h)
else:
new_h, new_w = int(h * max_resolution / w), max_resolution
frame = cv2.resize(frame, (new_w, new_h))
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame = Image.fromarray(frame)
frames.append(frame)
            frame_count += 1
        if not frames:
            return "Error: No frames could be extracted from the video."
return generate_response([
{"role": "system", "content": SYSTEM_PROMPT},
{
"role": "user",
"content": [
{"type": "video", "video": frames},
{"type": "text", "text": f"Based on the system instructions, {prompt}"},
],
}
])
except Exception as e:
return f"Error processing video: {e}"
finally:
if 'cap' in locals():
cap.release()
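# Apply the chat template, extract image/video tensors with qwen_vl_utils.process_vision_info,
# generate up to 512 new tokens with sampling, and decode only the newly generated tokens
# (the prompt tokens are stripped before batch_decode).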
def generate_response(messages):
    if model is None or processor is None:
        return "Error: The model failed to load. Check the startup logs for details."
    try:
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt"
)
        # Move inputs to the model's device (keep the BatchFeature so inputs.input_ids still works below)
        inputs = inputs.to(device)
with torch.no_grad():
generated_ids = model.generate(
**inputs,
max_new_tokens=512,
do_sample=True,
top_k=20,
top_p=0.9,
temperature=0.7
)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)
        # Free cached GPU memory between requests (skipped on CPU-only runs)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
return output_text[0]
except Exception as e:
return f"Error generating response: {e}"
# Gradio interface
iface = gr.Interface(
fn=process_content,
inputs=[
gr.File(label="Upload Image or Video of Technical Content"),
gr.Textbox(label="Enter your technical question", placeholder="e.g., Identify any bugs in this code and suggest fixes", value="Analyze this technical content and provide insights.")
],
outputs="text",
title="Technical Content Analysis",
description="Upload an image or video of code, diagrams, or technical content. Ask questions about bugs, errors, or explanations of functions.",
)
iface.launch(share=True)