VoucherVision / vouchervision /OCR_GPT4oMini.py
phyloforfun's picture
fixing pdfs
d0291ae
raw
history blame
3.43 kB
import os, base64, requests, yaml
from PIL import Image
from openai import OpenAI
from general_utils import calculate_cost
# Instruction text sent to the model as the text part of the user message,
# alongside the base64-encoded image (see GPT4oMiniOCR.ocr_gpt4o).
# Earlier variant, kept for reference: it asked for printed and handwritten
# text to be returned as two separate fields of a JSON dictionary.
# PROMPT = """Please perform OCR on this scientific image and extract the printed and handwritten text verbatim. Do not explain your answer, only return the verbatim text in this JSON dictionary format: {'printed_text': '', 'handwritten_text': ''}"""
PROMPT = """Please perform OCR on this scientific image and extract all of the words and text verbatim. Do not explain your answer, only return the verbatim text:"""
class GPT4oMiniOCR:
    """OCR client that sends a single image to OpenAI's ``gpt-4o-mini`` model.

    The image is base64-encoded and posted, together with the module-level
    ``PROMPT``, to the chat completions endpoint. Token usage reported by the
    API is converted to a cost via ``calculate_cost``.
    """

    def __init__(self, api_key):
        """Store the API key and locate the API cost table.

        Args:
            api_key: OpenAI API key, sent as a bearer token with each request.
        """
        self.api_key = api_key
        # <parent of this file's directory>/api_cost/api_cost.yaml
        self.path_api_cost = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'api_cost', 'api_cost.yaml'
        )

    def encode_image(self, image_path):
        """Return the contents of ``image_path`` as a base64 ASCII string.

        Args:
            image_path: Path to the image file; it is read in binary mode.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    @staticmethod
    def _parse_response(response_json):
        """Extract the answer text and token counts from a decoded response.

        Error responses carry neither ``"choices"`` nor ``"usage"``; in that
        case the answer is ``None`` and both token counts are 0, so callers
        get a well-formed result instead of a ``KeyError``.

        Args:
            response_json: The JSON-decoded response body (a dict).

        Returns:
            Tuple ``(parsed_answer, tokens_in, tokens_out)``.
        """
        if "choices" in response_json:
            parsed_answer = response_json["choices"][0]["message"]["content"]
        else:
            parsed_answer = None

        # ``or {}`` also covers an explicit ``"usage": null`` in the body.
        usage_report = response_json.get('usage') or {}
        tokens_in = usage_report.get("prompt_tokens", 0)
        tokens_out = usage_report.get("completion_tokens", 0)
        return parsed_answer, tokens_in, tokens_out

    def ocr_gpt4o(self, image_path, resolution="low", max_tokens=512):
        """Run OCR on one image and report the text together with its cost.

        Args:
            image_path: Path to the image to transcribe.
            resolution: Value passed as the image ``detail`` field of the
                request (the demo in this file uses ``"low"`` and ``"high"``).
            max_tokens: Upper bound on completion tokens for the request.

        Returns:
            Tuple ``(parsed_answer, cost_in, cost_out, total_cost, rates_in,
            rates_out, tokens_in, tokens_out)``. ``parsed_answer`` is ``None``
            when the response contains no ``"choices"`` (e.g. an API error);
            token counts are 0 when the response carries no usage report.

        Raises:
            OSError: If the image cannot be read.
            requests.RequestException: If the HTTP request itself fails.
        """
        base64_image = self.encode_image(image_path)

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        payload = {
            "model": "gpt-4o-mini",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": PROMPT,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # The data URL always declares JPEG,
                                # whatever the file's actual format is.
                                "url": f"data:image/jpeg;base64,{base64_image}",
                                "detail": resolution,
                            },
                        },
                    ],
                }
            ],
            "max_tokens": max_tokens,
        }

        # NOTE(review): no timeout is set, so a stalled connection blocks
        # indefinitely; consider passing one once callers agree on a value.
        response = requests.post(
            "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
        )
        parsed_answer, tokens_in, tokens_out = self._parse_response(response.json())

        # calculate_cost is unpacked as a 5-tuple; presumably it prices the
        # token counts using the YAML rate table - confirm in general_utils.
        cost_report = calculate_cost(
            'GPT_4o_mini_2024_07_18', self.path_api_cost, tokens_in, tokens_out
        )
        cost_in, cost_out, total_cost, rates_in, rates_out = cost_report

        return parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out
def main():
    """Demo: OCR one hard-coded image at "low" and then "high" detail.

    The API key is read from the ``OPENAI_API_KEY`` environment variable
    (empty string when unset). For each detail level the transcribed text
    and the total cost are printed.
    """
    # img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg'
    img_path = 'D:/D_Desktop/BR_1839468565_Ochnaceae_Campylospermum_reticulatum_label.jpg'

    # Set the key before running, e.g. in PowerShell:
    # $env:OPENAI_API_KEY="KEY"
    API_KEY = os.environ.get("OPENAI_API_KEY", "")

    ocr = GPT4oMiniOCR(API_KEY)

    # Same image at both detail levels, so text and cost can be compared.
    for resolution in ("low", "high"):
        parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out = ocr.ocr_gpt4o(
            img_path, resolution=resolution, max_tokens=512
        )
        print(f"Parsed Answer: {parsed_answer}")
        print(f"Total Cost: {total_cost}")


# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()