# maker-space/vision_model.py
# Generation and processing of images and 3D assets.
import base64
import os
from typing import Any, Dict, List

from dotenv import load_dotenv
from openai import OpenAI

# Load OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()
# Source: https://platform.openai.com/docs/guides/vision?lang=python
def analyze_images(
    images: List[str],
    prompt: str,
    # api_key: str,
    model: str = "gpt-4-vision-preview",
    max_tokens: int = 300,
) -> Dict[str, Any]:
"""
Analyze multiple images using OpenAI's vision model.
Args:
images (List[str]): List of URLs and/or local paths to the image files.
prompt (str): Prompt message for the AI model.
api_key (str): Your OpenAI API key.
model (str, optional): Name of the vision model to use. Defaults to "gpt-4-vision-preview".
max_tokens (int, optional): Maximum number of tokens for the response. Defaults to 300.
Returns:
dict: JSON response from the API.
"""
    client = OpenAI()

    # Start with the text prompt as the first user message.
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }]
    for image in images:
        if image.startswith("http://") or image.startswith("https://"):
            # Image is a URL
            messages.append({
                "role": "user",
                "content": [{"type": "image_url", "image_url": {"url": image}}],
            })
        else:
            # Image is a local path
            with open(image, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode('utf-8')
                messages.append({
                    "role": "user",
                    "content": [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}],
                })
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
    )
    return response.choices[0]
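
# A minimal usage sketch (not part of the original file): pulling the
# assistant's text out of the choice returned by analyze_images().
# `extract_text` is a hypothetical helper name introduced here for illustration.
def extract_text(choice) -> str:
    """Return the assistant's message text from a chat completion choice."""
    return choice.message.content or ""
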
def main():
    # The key is not passed to analyze_images(); OpenAI() reads
    # OPENAI_API_KEY from the environment loaded by dotenv.
    api_key = os.getenv("OPENAI_API_KEY")
    images = [
        "/workspaces/Maker-Tech-Tree/mesh_1.png",
        "/workspaces/Maker-Tech-Tree/mesh_2.png",
        "/workspaces/Maker-Tech-Tree/mesh_3.png",
    ]
prompt = "I am creating an 3d model of a Glass lenses for refracting light,\
using a text-to-3d model\
Do these images look correct?\
If not please make a suggesttion on how to improve the text input\
As this response will be used in a pipeline please only output a new \
potential prompt or output nothing, \
Please keep the prompt to 5 25 words to not confuse the model"
    response = analyze_images(
        images,
        prompt,
        # api_key,
    )
    print(response)
if __name__ == "__main__":
    main()