import torch
from pathlib import Path
from transformers import CLIPProcessor, CLIPModel
from PIL import Image, ImageDraw
import pytesseract
import requests
import os
from llm import inference, upload_image
import re
cropped_images_dir = "cropped_images"
os.makedirs(cropped_images_dir, exist_ok=True)
# Load YOLO model
class YOLOModel:
    def __init__(self, model_path="yolov5s.pt"):
        """
        Initialize the YOLO model. Downloads the pretrained YOLOv5 weights if they are not available locally.
        """
        # Workaround for torch.hub's forked-repo validation when loading ultralytics/yolov5
        torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
        self.model = torch.hub.load("ultralytics/yolov5", "custom", path=model_path, force_reload=True)
        # self.model2 = YOLOv10.from_pretrained("Ultralytics/Yolov8")
        # print(f'YOLO Model:\n\n{self.model}')
        # self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        # # print(f'CLIP Model:\n\n{self.clip_model}')
        # self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        # self.category_brands = {
        #     "electronics": ["Samsung", "Apple", "Sony", "LG", "Panasonic"],
        #     "furniture": ["Ikea", "Ashley", "La-Z-Boy", "Wayfair", "West Elm"],
        #     "appliances": ["Whirlpool", "GE", "Samsung", "LG", "Bosch"],
        #     "vehicles": ["Tesla", "Toyota", "Ford", "Honda", "Chevrolet"],
        #     "chair": ["Ikea", "Ashley", "Wayfair", "La-Z-Boy", "Herman Miller"],
        #     "microwave": ["Samsung", "Panasonic", "Sharp", "LG", "Whirlpool"],
        #     "table": ["Ikea", "Wayfair", "Ashley", "CB2", "West Elm"],
        #     "oven": ["Whirlpool", "GE", "Samsung", "Bosch", "LG"],
        #     "potted plant": ["The Sill", "PlantVine", "Lowe's", "Home Depot", "UrbanStems"],
        #     "couch": ["Ikea", "Ashley", "Wayfair", "La-Z-Boy", "CushionCo"],
        #     "cow": ["Angus", "Hereford", "Jersey", "Holstein", "Charolais"],
        #     "bed": ["Tempur-Pedic", "Ikea", "Sealy", "Serta", "Sleep Number"],
        #     "tv": ["Samsung", "LG", "Sony", "Vizio", "TCL"],
        #     "bin": ["Rubbermaid", "Sterilite", "Hefty", "Glad", "Simplehuman"],
        #     "refrigerator": ["Whirlpool", "GE", "Samsung", "LG", "Bosch"],
        #     "laptop": ["Dell", "HP", "Apple", "Lenovo", "Asus"],
        #     "smartphone": ["Apple", "Samsung", "Google", "OnePlus", "Huawei"],
        #     "camera": ["Canon", "Nikon", "Sony", "Fujifilm", "Panasonic"],
        #     "toaster": ["Breville", "Cuisinart", "Black+Decker", "Hamilton Beach", "Oster"],
        #     "fan": ["Dyson", "Honeywell", "Lasko", "Vornado", "Bionaire"],
        #     "vacuum cleaner": ["Dyson", "Shark", "Roomba", "Hoover", "Bissell"]
        # }
    def predict_clip(self, image, brand_names):
        """
        Predict the most probable brand for a cropped image using CLIP.
        Note: this relies on self.clip_model and self.clip_processor, which are
        currently commented out in __init__ and must be re-enabled before use.
        """
        inputs = self.clip_processor(
            text=brand_names,
            images=image,
            return_tensors="pt",
            padding=True
        )
        # print(f'Inputs to clip processor:{inputs}')
        outputs = self.clip_model(**inputs)
        logits_per_image = outputs.logits_per_image
        probs = logits_per_image.softmax(dim=1)  # Convert logits to probabilities
        best_idx = probs.argmax().item()
        return brand_names[best_idx], probs[0, best_idx].item()
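    # Minimal usage sketch for predict_clip (assumes the CLIP model/processor lines in
    # __init__ have been uncommented; the crop path and brand list below are only
    # illustrative placeholders, not values used by the pipeline):
    #
    #   crop = Image.open("cropped_images/crop_0.jpg").convert("RGB")
    #   brand, confidence = model.predict_clip(crop, ["Samsung", "LG", "Sony"])
    #   print(brand, confidence)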
    def predict_text(self, image):
        """
        Extract any visible text from a cropped image using Tesseract OCR.
        Requires the tesseract binary to be installed on the system.
        """
        try:
            # Convert image to grayscale
            grayscale = image.convert('L')
            # Perform OCR using pytesseract
            text = pytesseract.image_to_string(grayscale)
            # Return the stripped text if successful
            return text.strip()
        except Exception as e:
            # Log the error for debugging purposes
            print(f"Error during text prediction: {e}")
            # Return an empty string if OCR fails
            return ""
    def predict(self, image_path):
        """
        Run YOLO inference on an image, crop each detection, upload the crop to get a URL,
        and query the LLM for brand, model, price, and description.
        :param image_path: Path to the input image
        :return: List of predictions with labels, bounding boxes, and LLM results
        """
        results = self.model(image_path)
        image = Image.open(image_path).convert("RGB")
        draw = ImageDraw.Draw(image)
        predictions = results.pandas().xyxy[0]  # Get predictions as a pandas DataFrame
        print(f'YOLO predictions:\n\n{predictions}')
        output = []
        for idx, row in predictions.iterrows():
            category = row['name']
            confidence = row['confidence']
            bbox = [row["xmin"], row["ymin"], row["xmax"], row["ymax"]]
            # Crop the detected region and save it to disk
            cropped_image = image.crop((bbox[0], bbox[1], bbox[2], bbox[3]))
            cropped_image_path = os.path.join(cropped_images_dir, f"crop_{idx}.jpg")
            cropped_image.save(cropped_image_path, "JPEG")
            # Upload the crop to the cloud to get a URL to pass to the LLM
            print('Uploading cropped image to get an image URL')
            image_url = upload_image.upload_image_to_imgbb(cropped_image_path)
            print(f'Image URL received as {image_url}')
            # Query the LLM for the likely brand, model, price, and description
            result_llms = inference.get_name(image_url, category)
            # possible_brands_llm = re.findall(r"-\s*(.+)", possible_brands_mixed)
            # if len(possible_brands_llm) > 0:
            #     predicted_brand, clip_confidence = self.predict_clip(cropped_image, possible_brands_llm)
            # else:
            #     predicted_brand, clip_confidence = "Unknown", 0.0
            '''
            # Match category to possible brands
            if category in self.category_brands:
                possible_brands = self.category_brands[category]
                print(f'Predicting with CLIP:\n\n')
                predicted_brand, clip_confidence = self.predict_clip(cropped_image, possible_brands)
            else:
                predicted_brand, clip_confidence = "Unknown", 0.0
            '''
            detected_text = self.predict_text(cropped_image)
            print(f'Details: {detected_text}')
            print(f'Predicted brand: {result_llms["model"]}')
            # Draw bounding box and label on the image
            draw.rectangle(bbox, outline="red", width=3)
            draw.text(
                (bbox[0], bbox[1] - 10),
                f'{result_llms["brand"]}',
                fill="red"
            )
            # Append result
            output.append({
                "category": category,
                "bbox": bbox,
                "confidence": confidence,
                "category_llm": result_llms["brand"],
                "predicted_brand": result_llms["model"],
                # "clip_confidence": clip_confidence,
                "price": result_llms["price"],
                "details": result_llms["description"],
                "detected_text": detected_text,
                "image_url": image_url
            })
        # Remove stale crop files from previous runs that do not belong to this prediction
        valid_indices = set(range(len(predictions)))
        # Iterate over all files in the directory
        for filename in os.listdir(cropped_images_dir):
            # Check if the filename matches the pattern for cropped images
            if filename.startswith("crop_") and filename.endswith(".jpg"):
                # Extract the index from the filename
                try:
                    file_idx = int(filename.split("_")[1].split(".")[0])
                    if file_idx not in valid_indices:
                        # Delete the file if its index is not valid
                        file_path = os.path.join(cropped_images_dir, filename)
                        os.remove(file_path)
                        print(f"Deleted excess file: {filename}")
                except ValueError:
                    # Skip files that don't match the pattern
                    continue
        return output
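

# Minimal usage sketch, not part of the original pipeline: assumes the YOLOv5 weights can be
# fetched by torch.hub, that upload_image/inference are configured with valid API credentials,
# and that "test.jpg" (an illustrative placeholder name) exists locally.
if __name__ == "__main__":
    yolo = YOLOModel()
    detections = yolo.predict("test.jpg")
    for det in detections:
        print(det["category"], det["predicted_brand"], det["price"], det["image_url"])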