# visionB / neovision / utils.py
# Simba
# WIP
# 2874322
import cv2
import base64
import numpy as np
def encode_image_to_base64(image: np.ndarray) -> str:
    """
    Serialize an image array as JPEG and return it as base64 text.

    Parameters:
        image (np.ndarray): Image data handed to ``cv2.imencode`` for JPEG
            compression.

    Returns:
        str: The JPEG-encoded bytes, base64-encoded and decoded to a string.

    Raises:
        ValueError: If ``cv2.imencode`` reports that encoding did not succeed.
    """
    ok, jpeg_buffer = cv2.imencode(".jpg", image)
    if ok:
        # b64encode yields bytes; decode so callers receive plain text.
        return base64.b64encode(jpeg_buffer).decode("utf-8")
    raise ValueError("Could not encode image to JPEG format.")
def compose_payload(
    image: np.ndarray,
    prompt: str,
    *,
    model: str = "gpt-4-vision-preview",
    max_tokens: int = 300,
) -> dict:
    """
    Compose a chat payload holding a text prompt and a base64-encoded image.

    The image is converted with ``encode_image_to_base64`` and embedded as a
    ``data:image/jpeg;base64,...`` URL next to the prompt in a single user
    message.

    Args:
        image (np.ndarray): The image, as a NumPy array, to encode and send.
        prompt (str): The prompt text to accompany the image in the payload.
        model (str): Value placed under the ``"model"`` key. Keyword-only;
            defaults to ``"gpt-4-vision-preview"``, the value this function
            previously hard-coded.
        max_tokens (int): Value placed under the ``"max_tokens"`` key.
            Keyword-only; defaults to ``300``, the value this function
            previously hard-coded.

    Returns:
        dict: A dictionary with the keys ``"model"``, ``"messages"`` and
        ``"max_tokens"``. ``"messages"`` holds one ``"user"`` message whose
        content is a text part followed by an ``image_url`` part.

    Raises:
        ValueError: Propagated from ``encode_image_to_base64`` when the image
            cannot be encoded to JPEG format.
    """
    base64_image = encode_image_to_base64(image)

    # Content parts are ordered text first, then image, as in the original
    # payload layout.
    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
    }

    return {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [text_part, image_part],
            },
        ],
        "max_tokens": max_tokens,
    }