Spaces:

phyloforfun
/

VoucherVision

Running

App Files Files Community

VoucherVision / vouchervision /OCR_GPT4oMini.py

phyloforfun

fixing pdfs

d0291ae 3 months ago

raw

history blame

3.43 kB

	import os, base64, requests, yaml
	from PIL import Image
	from openai import OpenAI

	from general_utils import calculate_cost

	# Instruction text sent as the "text" part of the user message that accompanies each image.
	# The commented-out variant below asked for a JSON dict separating printed from handwritten text.
	# PROMPT = """Please perform OCR on this scientific image and extract the printed and handwritten text verbatim. Do not explain your answer, only return the verbatim text in this JSON dictionary format: {'printed_text': '', 'handwritten_text': ''}"""
	PROMPT = """Please perform OCR on this scientific image and extract all of the words and text verbatim. Do not explain your answer, only return the verbatim text:"""

	class GPT4oMiniOCR:
	    """Send a single image to the OpenAI chat-completions endpoint for OCR.

	    Attributes:
	        api_key: Bearer token placed in the ``Authorization`` header.
	        path_api_cost: Path to ``api_cost/api_cost.yaml`` under the parent of
	            this module's directory; passed to ``calculate_cost``.
	    """

	    def __init__(self, api_key):
	        """Store the API key and resolve the location of the cost table."""
	        self.api_key = api_key
	        self.path_api_cost = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'api_cost', 'api_cost.yaml')

	    def encode_image(self, image_path):
	        """Return the contents of ``image_path`` as a base64-encoded ``str``."""
	        with open(image_path, "rb") as image_file:
	            return base64.b64encode(image_file.read()).decode('utf-8')

	    @staticmethod
	    def _parse_response(response_json):
	        """Extract ``(parsed_answer, tokens_in, tokens_out)`` from a decoded response.

	        ``parsed_answer`` is ``None`` when the response has no (or an empty)
	        ``choices`` list. Token counts default to 0 when ``usage`` or its
	        fields are absent, e.g. for an API error response, so that callers
	        receive a ``None`` answer instead of a ``KeyError``.
	        """
	        choices = response_json.get("choices")
	        parsed_answer = choices[0]["message"]["content"] if choices else None

	        usage_report = response_json.get("usage") or {}
	        tokens_in = usage_report.get("prompt_tokens", 0)
	        tokens_out = usage_report.get("completion_tokens", 0)
	        return parsed_answer, tokens_in, tokens_out

	    def ocr_gpt4o(self, image_path, resolution="low", max_tokens=512):
	        """Run OCR on one image with the ``gpt-4o-mini`` model.

	        Args:
	            image_path: Path of the image file to upload.
	            resolution: Value sent as the ``detail`` field of the image part.
	            max_tokens: Value sent as ``max_tokens`` in the request.

	        Returns:
	            An 8-tuple ``(parsed_answer, cost_in, cost_out, total_cost,
	            rates_in, rates_out, tokens_in, tokens_out)``. ``parsed_answer``
	            is ``None`` and both token counts are 0 when the response carries
	            no completion or usage data.
	        """
	        base64_image = self.encode_image(image_path)

	        headers = {
	            "Content-Type": "application/json",
	            "Authorization": f"Bearer {self.api_key}",
	        }

	        payload = {
	            "model": "gpt-4o-mini",
	            "messages": [
	                {
	                    "role": "user",
	                    "content": [
	                        {
	                            "type": "text",
	                            "text": PROMPT,
	                        },
	                        {
	                            "type": "image_url",
	                            "image_url": {
	                                "url": f"data:image/jpeg;base64,{base64_image}",
	                                "detail": resolution,
	                            },
	                        },
	                    ],
	                }
	            ],
	            "max_tokens": max_tokens,
	        }

	        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
	        parsed_answer, tokens_in, tokens_out = self._parse_response(response.json())

	        # calculate_cost returns a 5-tuple; unpack it directly rather than
	        # rebinding a name that first held the whole tuple.
	        cost_in, cost_out, total_cost, rates_in, rates_out = calculate_cost(
	            'GPT_4o_mini_2024_07_18', self.path_api_cost, tokens_in, tokens_out
	        )

	        return parsed_answer, cost_in, cost_out, total_cost, rates_in, rates_out, tokens_in, tokens_out




	def main():
	    """Manual smoke test: OCR one label image at low and high detail.

	    The API key is read from the ``OPENAI_API_KEY`` environment variable
	    (empty string when unset) and the parsed text and total cost of each
	    request are printed.
	    """
	    # img_path = '/home/brlab/Downloads/gem_2024_06_26__02-26-02/Cropped_Images/By_Class/label/1.jpg'
	    img_path = 'D:/D_Desktop/BR_1839468565_Ochnaceae_Campylospermum_reticulatum_label.jpg'

	    # Take the key from the environment instead of a hard-coded empty string.
	    # PowerShell: $env:OPENAI_API_KEY="KEY"
	    api_key = os.environ.get("OPENAI_API_KEY", "")

	    ocr = GPT4oMiniOCR(api_key)

	    for resolution in ("low", "high"):
	        result = ocr.ocr_gpt4o(img_path, resolution=resolution, max_tokens=512)
	        # Tuple positions: 0 is the parsed answer, 3 is the total cost.
	        parsed_answer, total_cost = result[0], result[3]
	        print(f"Parsed Answer: {parsed_answer}")
	        print(f"Total Cost: {total_cost}")




	# Run the demo only when this file is executed as a script, not when it is imported.
	if __name__ == '__main__':
	main()