import os, base64, requests, yaml from PIL import Image from openai import OpenAI from general_utils import calculate_cost # PROMPT = """Please perform OCR on this scientific image and extract the printed and handwritten text verbatim. Do not explain your answer, only return the verbatim text in this JSON dictionary format: {'printed_text': '', 'handwritten_text': ''}""" PROMPT = """Please perform OCR on this scientific image and extract all of the words and text verbatim. Do not explain your answer, only return the verbatim text:""" class GPT4oMiniOCR: def __init__(self, api_key): self.api_key = api_key self.path_api_cost = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'api_cost', 'api_cost.yaml') def encode_image(self, image_path): with open(image_path, "rb") as image_file: return base64.b64encode(image_file.read()).decode('utf-8') def ocr_gpt4o(self, image_path, resolution="low", max_tokens=512): # Getting the base64 string base64_image = self.encode_image(image_path) headers = { "Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}" } payload = { "model": "gpt-4o-mini", "messages": [ { "role": "user", "content": [ { "type": "text", "text": PROMPT, }, { "type": "image_url", "image_url": { "url": f"data:image/jpeg;base64,{base64_image}", "detail": resolution, } } ] } ], "max_tokens": max_tokens } response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) response_json = response.json() if "choices" in response_json : parsed_answer = response_json["choices"][0]["message"]["content"] else: parsed_answer = None usage_report = response_json.get('usage', {}) tokens_in = usage_report["prompt_tokens"] tokens_out = usage_report["completion_tokens"] total_cost = calculate_cost('GPT_4o_mini_2024_07_18', self.path_api_cost, tokens_in, tokens_out) cost_in, cost_out, total_cost, rates_in, rates_out = total_cost return parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out def main(): # img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg' img_path = 'D:/D_Desktop/BR_1839468565_Ochnaceae_Campylospermum_reticulatum_label.jpg' # $env:OPENAI_API_KEY="KEY" API_KEY = "" ocr = GPT4oMiniOCR(API_KEY) parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out = ocr.ocr_gpt4o(img_path, resolution="low", max_tokens=512) print(f"Parsed Answer: {parsed_answer}") print(f"Total Cost: {total_cost}") parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out = ocr.ocr_gpt4o(img_path, resolution="high", max_tokens=512) print(f"Parsed Answer: {parsed_answer}") print(f"Total Cost: {total_cost}") if __name__ == '__main__': main()