import logging
import gradio as gr
import os
from roboflow import Roboflow
from dotenv import load_dotenv
from openai import OpenAI
import tempfile
import numpy as np
from PIL import Image, ImageDraw
import base64
# Load environment variables
load_dotenv()
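# Expects a .env file (or the environment) to provide ROBOFLOW_API_KEY and OPENAI_API_KEY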
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Initialize API Keys
roboflow_key = os.getenv("ROBOFLOW_API_KEY")
if not roboflow_key:
raise ValueError("ROBOFLOW_API_KEY is missing. Please add it to the .env file.")
openai_key = os.getenv("OPENAI_API_KEY")
if not openai_key:
raise ValueError("OPENAI_API_KEY is missing. Please add it to the .env file.")
# Initialize Roboflow and OpenAI clients
rf = Roboflow(api_key=roboflow_key)
project = rf.workspace("alat-pelindung-diri").project("nescafe-4base")
model = project.version(16).model  # Version 16 of the YOLO model hosted on Roboflow
client_openai = OpenAI(api_key=openai_key)
# Function to detect objects and estimate occluded objects
def detect_and_estimate_objects(image):
try:
# Save image to temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
image.save(temp_file, format="JPEG")
temp_file_path = temp_file.name
logger.info("Image saved successfully for processing.")
# Step 1: YOLO detection
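        # In the Roboflow Python SDK, confidence and overlap are percentages (0-100):
        # keep detections with at least 70% confidence, using an 80% NMS overlap threshold.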
predictions = model.predict(temp_file_path, confidence=70, overlap=80).json()
class_count = {}
object_positions = []
# Draw bounding boxes
draw = ImageDraw.Draw(image)
for prediction in predictions['predictions']:
class_name = prediction['class']
x, y, width, height = prediction['x'], prediction['y'], prediction['width'], prediction['height']
# Calculate bounding box coordinates
left = int(x - width / 2)
top = int(y - height / 2)
right = int(x + width / 2)
bottom = int(y + height / 2)
# Draw bounding box
draw.rectangle([left, top, right, bottom], outline="red", width=4)
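            # Note: the boxes are drawn onto the same PIL image that is later
            # Base64-encoded, so GPT-4 receives the annotated image.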
# Count occurrences of detected classes
class_count[class_name] = class_count.get(class_name, 0) + 1
object_positions.append((left, top, right, bottom))
logger.info(f"YOLO detected objects: {class_count}")
        # Step 2: Encode the annotated image to Base64 for GPT-4
        # Remove the first temp file before reusing the variable so it is not leaked
        os.remove(temp_file_path)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
            image.save(temp_file, format="JPEG")
            temp_file_path = temp_file.name
with open(temp_file_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode("utf-8")
logger.info(f"Base64 encoding successful. Length: {len(base64_image)}")
# Step 3: Use GPT-4 to estimate occluded objects
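        # The image is passed inline as a data URL ("data:image/jpeg;base64,...")
        # under the image_url content type, which gpt-4o accepts for vision inputs.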
response = client_openai.chat.completions.create(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": """Please count the number of cans of the following Nestlé products in the image, including those that are partially obstructed or hidden.
For partially visible or obstructed cans, please estimate their number based on visible clues and assume that they belong to the same product in front of them.
Please count accurately the number of cans of the following Nestlé products in the image:
- Nescafe Mocha
- Nescafe Latte
- Nescafe Original
- Bear Brand
- Nescafe Cappuccino
- Nescafe Ice Black
- Nescafe Coconut Latte
- Nescafe Caramel
Count occluded cans as well as fully visible ones, estimating partially hidden cans from the visible portion.
Provide your response in the format:
Nescafé Mocha: [number]
Nescafé Latte: [number]
Nescafé Original: [number]
Bear Brand: [number]
Nescafé Cappuccino: [number]
Nescafé Ice Black: [number]
Nescafé Coconut Latte: [number]
Nescafé Caramel: [number]
Total Nestlé Products: [Total number of Nestlé products]""",
},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
},
],
}
],
)
        gpt_estimation = response.choices[0].message.content.strip()
        logger.info(f"GPT-4 estimation: {gpt_estimation}")
        # Step 4: Build the result text from the GPT estimation (YOLO counts are only logged)
result_text = f"Results from GPT-4:\n{gpt_estimation}"
        # Step 5: Save the annotated image and return it with the result text
output_path = "/tmp/prediction_result.jpg"
image.save(output_path)
logger.info("Processed image saved successfully.")
# Cleanup
os.remove(temp_file_path)
return output_path, result_text
except Exception as e:
logger.error(f"Error during processing: {e}")
return None, f"Error: {e}"
# Create Gradio interface
with gr.Blocks() as iface:
gr.Markdown("### Object Detection and Counting with YOLO and GPT-4 Assistance")
with gr.Row():
input_image = gr.Image(type="pil", label="Upload Image")
output_image = gr.Image(label="Processed Image")
output_text = gr.Textbox(label="Results", interactive=False)
detect_button = gr.Button("Process Image")
detect_button.click(
fn=detect_and_estimate_objects,
inputs=[input_image],
outputs=[output_image, output_text]
)
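# debug=True keeps the server in the foreground and prints full tracebacks;
# consider disabling it when deploying.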
iface.launch(debug=True)