VoucherVision / vouchervision /prompt_catalog.py
phyloforfun's picture
Major update. Support for 15 LLMs, World Flora Online taxonomy validation, geolocation, 2 OCR methods, significant UI changes, stability improvements, consistent JSON parsing
e91ac58
raw
history blame
5.29 kB
from dataclasses import dataclass
from langchain_core.pydantic_v1 import Field, create_model
import yaml, json
@dataclass
class PromptCatalog:
    """Build LLM prompts that refactor unstructured OCR text into a JSON dictionary.

    The prompt content is driven by a YAML rules file that must provide three
    top-level keys: ``instructions``, ``json_formatting_instructions`` and
    ``rules`` (a mapping of output field name -> instruction text for that
    field).
    """

    domain_knowledge_example: str = ""
    similarity: str = ""
    OCR: str = ""
    n_fields: int = 0

    # These are for dynamically creating your own prompts with n-columns
    def prompt_SLTP(self, rules_config_path, OCR=None, is_palm=False):
        """Assemble the parsing prompt from a YAML rules file and OCR text.

        Side effects: stores ``OCR``, ``rules_config_path``, ``rules_config``,
        ``instructions``, ``json_formatting_instructions``, ``rules_list``,
        ``n_fields``, ``rules``, ``structure`` and ``dictionary_structure`` on
        the instance.

        Args:
            rules_config_path: Path of the YAML rules file to load.
            OCR: The unstructured OCR text to embed in the prompt. It is
                interpolated as-is, so ``None`` renders as the text ``None``.
            is_palm: When truthy, the empty JSON structure is written three
                times at the end of the prompt instead of once.

        Returns:
            A ``(prompt, dictionary_structure)`` tuple: the prompt string and
            a dict with one empty-string entry per rule key.

        Raises:
            TypeError: If the rules file could not be parsed (the loader
                returned ``None``).
            KeyError: If one of the three required top-level keys is missing.
        """
        self.OCR = OCR
        self.rules_config_path = rules_config_path
        self.rules_config = self.load_rules_config()

        self.instructions = self.rules_config['instructions']
        self.json_formatting_instructions = self.rules_config['json_formatting_instructions']
        self.rules_list = self.rules_config['rules']
        self.n_fields = len(self.rules_list)

        # Set the rules for processing OCR into JSON format
        self.rules = self.create_rules(is_palm)
        self.structure, self.dictionary_structure = self.create_structure(is_palm)

        # An extra "The unstructured OCR text is: {OCR}" section between
        # instructions and json_formatting_instructions was tried and dropped:
        # it made the prompt too long and performance was better without it.
        sections = [
            "Please help me complete this text parsing task given the following rules and unstructured OCR text. Your task is to refactor the OCR text into a structured JSON dictionary that matches the structure specified in the following rules. Please follow the rules strictly.",
            "The rules are:",
            f"{self.instructions}",
            f"{self.json_formatting_instructions}",
            "This is the JSON template that includes instructions for each key:",
            f"{self.rules}",
            "The unstructured OCR text is:",
            f"{self.OCR}",
            "Please populate the following JSON dictionary based on the rules and the unformatted OCR text:",
        ]

        # The two prompt variants differ only in how often the empty structure
        # is repeated at the end.
        # NOTE(review): the triple repetition for is_palm is presumably
        # deliberate emphasis for that model family - confirm before changing.
        structure_repeats = 3 if is_palm else 1
        sections.extend([f"{self.structure}"] * structure_repeats)

        # Every section, including the last one, is newline-terminated.
        prompt = "\n".join(sections) + "\n"
        return prompt, self.dictionary_structure

    def load_rules_config(self):
        """Load and parse the YAML file at ``self.rules_config_path``.

        The file is decoded as UTF-8 explicitly so that non-ASCII rule text is
        read the same way on every platform instead of depending on the
        locale's default encoding.

        Returns:
            The parsed YAML document, or ``None`` when the file is not valid
            YAML (the parser error is printed).
        """
        with open(self.rules_config_path, 'r', encoding='utf-8') as stream:
            try:
                return yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
                return None

    def create_rules(self, is_palm=False):
        """Serialize ``self.rules_list`` to a compact, single-line JSON string.

        Args:
            is_palm: Unused; kept for interface compatibility.

        Returns:
            The rules mapping as JSON, keys in their original order.
        """
        return json.dumps(dict(self.rules_list), sort_keys=False)

    def create_structure(self, is_palm=False):
        """Build the empty JSON template that the LLM is asked to populate.

        Args:
            is_palm: Unused; kept for interface compatibility.

        Returns:
            A ``(structure_json_str, dictionary_structure)`` tuple: a dict
            mapping every key of ``self.rules_list`` to an empty string, and
            the same dict rendered as JSON with a 4-space indent.
        """
        # One empty-string slot per rule key, in the order the rules define.
        dictionary_structure = {key: '' for key in self.rules_list}
        structure_json_str = json.dumps(dictionary_structure, sort_keys=False, indent=4)
        return structure_json_str, dictionary_structure

    def generate_xlsx_headers(self, is_palm):
        """Return the rule keys as a list of spreadsheet column headers.

        Args:
            is_palm: Unused; the headers are the same for every model.

        Returns:
            A new list with the keys of ``self.rules_list``.
        """
        # Extract headers from the keys in the JSON template rules
        return list(self.rules_list)