# Source: Hugging Face Space "phyloforfun/VoucherVision" (status: Running).
# File size: 5,290 bytes.

from dataclasses import dataclass
from langchain_core.pydantic_v1 import Field, create_model
import yaml, json

@dataclass
class PromptCatalog:
    """Build LLM prompts that refactor unstructured OCR text into a JSON dictionary.

    The instructions, the per-field rules and the field names are all read from
    a YAML rules file, so a prompt with any number of fields can be generated.

    Besides the dataclass fields below, ``prompt_SLTP`` stores its intermediate
    results on the instance (``rules_config_path``, ``rules_config``,
    ``instructions``, ``json_formatting_instructions``, ``rules_list``,
    ``rules``, ``structure`` and ``dictionary_structure``).
    """

    # Not read by any method in this class; kept because they are part of the
    # generated ``__init__`` signature.
    domain_knowledge_example: str = ""
    similarity: str = ""
    # OCR text of the most recent ``prompt_SLTP`` call.
    OCR: str = ""
    # Number of entries under the ``rules`` key of the loaded config.
    n_fields: int = 0

    # These are for dynamically creating your own prompts with n-columns

    def prompt_SLTP(self, rules_config_path, OCR=None, is_palm=False):
        """Assemble the parsing prompt from a YAML rules file and OCR text.

        Args:
            rules_config_path: Path to a YAML file that provides the keys
                ``instructions``, ``json_formatting_instructions`` and ``rules``
                (a mapping of output field name -> rule text).
            OCR: The unstructured OCR text to embed in the prompt. It is
                interpolated as-is, so the default ``None`` renders as "None".
            is_palm: When true, use the prompt variant that repeats the empty
                JSON template three times at the end.

        Returns:
            A ``(prompt, dictionary_structure)`` tuple: the prompt string and a
            dict holding every rule key mapped to an empty string.

        Raises:
            TypeError: If the YAML file could not be parsed, because
                ``load_rules_config`` then returns ``None`` and the key lookups
                below fail.
            KeyError: If one of the required top-level keys is missing.
        """
        self.OCR = OCR

        self.rules_config_path = rules_config_path
        self.rules_config = self.load_rules_config()

        self.instructions = self.rules_config['instructions']
        self.json_formatting_instructions = self.rules_config['json_formatting_instructions']

        self.rules_list = self.rules_config['rules']
        self.n_fields = len(self.rules_list)

        # Set the rules for processing OCR into JSON format
        self.rules = self.create_rules(is_palm)

        self.structure, self.dictionary_structure = self.create_structure(is_palm)

        # NOTE: an extra "The unstructured OCR text is: {self.OCR}" section used
        # to sit between instructions and json_formatting_instructions. It made
        # the prompt too long; performance was better without it.
        if is_palm:
            # The template is repeated three times in this variant; presumably
            # deliberate emphasis for PaLM, so it is preserved verbatim.
            prompt = f"""Please help me complete this text parsing task given the following rules and unstructured OCR text. Your task is to refactor the OCR text into a structured JSON dictionary that matches the structure specified in the following rules. Please follow the rules strictly.
                The rules are:
                {self.instructions}
                {self.json_formatting_instructions}
                This is the JSON template that includes instructions for each key:
                {self.rules}
                The unstructured OCR text is:
                {self.OCR}
                Please populate the following JSON dictionary based on the rules and the unformatted OCR text:
                {self.structure}
                {self.structure}
                {self.structure}
                """
        else:
            prompt = f"""Please help me complete this text parsing task given the following rules and unstructured OCR text. Your task is to refactor the OCR text into a structured JSON dictionary that matches the structure specified in the following rules. Please follow the rules strictly.
                The rules are:
                {self.instructions}
                {self.json_formatting_instructions}
                This is the JSON template that includes instructions for each key:
                {self.rules}
                The unstructured OCR text is:
                {self.OCR}
                Please populate the following JSON dictionary based on the rules and the unformatted OCR text:
                {self.structure}
                """

        return prompt, self.dictionary_structure

    def load_rules_config(self):
        """Parse the YAML file at ``self.rules_config_path``.

        Returns:
            The parsed YAML document, or ``None`` when the file is not valid
            YAML (the parser error is printed in that case).

        Raises:
            OSError: If the file cannot be opened.
        """
        # Decode explicitly as UTF-8 instead of relying on the platform default
        # encoding, which would mis-read non-ASCII rule text on some systems.
        with open(self.rules_config_path, 'r', encoding='utf-8') as stream:
            try:
                return yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                # Best-effort behaviour kept for existing callers: report the
                # problem and signal failure with None.
                print(exc)
                return None

    def create_rules(self, is_palm=False):
        """Serialize ``self.rules_list`` (field name -> rule) to compact JSON.

        Args:
            is_palm: Accepted for interface compatibility; not used.

        Returns:
            A single-line JSON string that keeps the original key order.
        """
        return json.dumps(dict(self.rules_list), sort_keys=False)

    def create_structure(self, is_palm=False):
        """Create the empty JSON template that the model is asked to populate.

        The template is built directly from the rule keys, so every key
        is kept. That keeps the template consistent with ``n_fields``,
        ``create_rules`` and ``generate_xlsx_headers``, which all use the
        complete key set.

        Args:
            is_palm: Accepted for interface compatibility; not used.

        Returns:
            A ``(structure_json_str, dictionary_structure)`` tuple: the
            template as 4-space-indented JSON text, and the same template as a
            dict mapping each rule key to ``''``.
        """
        dictionary_structure = {key: '' for key in self.rules_list}
        structure_json_str = json.dumps(dictionary_structure, sort_keys=False, indent=4)
        return structure_json_str, dictionary_structure

    def generate_xlsx_headers(self, is_palm):
        """Return the rule keys, in order, as a list of spreadsheet headers.

        Args:
            is_palm: Accepted for interface compatibility; the headers are the
                same for both prompt variants.
        """
        return list(self.rules_list)