File size: 5,290 Bytes
87c3140
e91ac58
87c3140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e91ac58
87c3140
 
 
 
 
 
 
 
 
e91ac58
87c3140
 
 
 
e91ac58
87c3140
e91ac58
 
 
 
87c3140
 
 
 
 
 
 
e91ac58
 
87c3140
 
 
 
 
 
 
 
 
 
 
 
e91ac58
 
87c3140
 
 
e91ac58
87c3140
e91ac58
 
87c3140
 
 
 
 
 
 
 
 
 
e91ac58
87c3140
e91ac58
 
 
87c3140
 
e91ac58
 
87c3140
e91ac58
 
 
 
 
 
 
 
 
87c3140
e91ac58
 
 
 
87c3140
 
 
 
 
 
 
 
e91ac58
87c3140
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from dataclasses import dataclass
from langchain_core.pydantic_v1 import Field, create_model
import yaml, json

@dataclass
class PromptCatalog:
    """Build LLM prompts that restructure OCR text into a JSON dictionary.

    The fields to extract are not hard-coded: they are read from a YAML rules
    file whose top-level keys ``instructions``, ``json_formatting_instructions``
    and ``rules`` are consumed by :meth:`prompt_SLTP`.

    Besides the declared dataclass fields, :meth:`prompt_SLTP` sets these
    instance attributes as a side effect: ``rules_config_path``,
    ``rules_config``, ``instructions``, ``json_formatting_instructions``,
    ``rules_list``, ``rules``, ``structure`` and ``dictionary_structure``.
    """

    domain_knowledge_example: str = ""
    similarity: str = ""
    OCR: str = ""
    n_fields: int = 0

    # The methods below dynamically create a prompt with n columns (one per
    # key of the ``rules`` mapping in the YAML rules file).

    def prompt_SLTP(self, rules_config_path, OCR=None, is_palm=False):
        """Assemble the parsing prompt for one OCR transcription.

        Args:
            rules_config_path: Path to the YAML rules file.
            OCR: The unstructured OCR text to embed in the prompt. It is
                interpolated as-is, so ``None`` is rendered as ``"None"``.
            is_palm: When true, the blank JSON template is appended three
                times at the end of the prompt instead of once.

        Returns:
            A ``(prompt, dictionary_structure)`` tuple: the prompt string and
            a dict mapping every rule key to an empty string.

        Raises:
            TypeError: If the rules file is empty or could not be parsed.
            KeyError: If a required top-level key is missing from the rules.
        """
        self.OCR = OCR

        self.rules_config_path = rules_config_path
        self.rules_config = self.load_rules_config()
        if self.rules_config is None:
            # load_rules_config() returns None for an unparsable or empty
            # file. Subscripting None below would raise a TypeError anyway;
            # raise the same exception type with an actionable message.
            raise TypeError(
                f"Rules config {rules_config_path!r} is empty or could not be "
                "parsed as YAML; expected a mapping with 'instructions', "
                "'json_formatting_instructions' and 'rules'."
            )

        self.instructions = self.rules_config['instructions']
        self.json_formatting_instructions = self.rules_config['json_formatting_instructions']

        self.rules_list = self.rules_config['rules']
        self.n_fields = len(self.rules_list)

        # Set the rules for processing OCR into JSON format
        self.rules = self.create_rules(is_palm)

        self.structure, self.dictionary_structure = self.create_structure(is_palm)

        # An earlier version of the prompt also placed the OCR text between
        # the instructions and the JSON formatting instructions. That made the
        # prompt too long; performance was better without it.
        if is_palm:
            # NOTE(review): the template is emitted three times in this
            # branch only; this looks deliberate (emphasis) — confirm.
            prompt = f"""Please help me complete this text parsing task given the following rules and unstructured OCR text. Your task is to refactor the OCR text into a structured JSON dictionary that matches the structure specified in the following rules. Please follow the rules strictly.
                The rules are:
                {self.instructions}
                {self.json_formatting_instructions}
                This is the JSON template that includes instructions for each key:
                {self.rules}
                The unstructured OCR text is:
                {self.OCR}
                Please populate the following JSON dictionary based on the rules and the unformatted OCR text:
                {self.structure}
                {self.structure}
                {self.structure}
                """
        else:
            prompt = f"""Please help me complete this text parsing task given the following rules and unstructured OCR text. Your task is to refactor the OCR text into a structured JSON dictionary that matches the structure specified in the following rules. Please follow the rules strictly.
                The rules are:
                {self.instructions}
                {self.json_formatting_instructions}
                This is the JSON template that includes instructions for each key:
                {self.rules}
                The unstructured OCR text is:
                {self.OCR}
                Please populate the following JSON dictionary based on the rules and the unformatted OCR text:
                {self.structure}
                """
        # Spreadsheet headers are not returned here; callers that need them
        # can call generate_xlsx_headers().
        return prompt, self.dictionary_structure

    def load_rules_config(self):
        """Load the YAML rules file at ``self.rules_config_path``.

        Returns:
            The parsed YAML document, or ``None`` when the file is not valid
            YAML (the parser error is printed).
        """
        # Decode explicitly as UTF-8 instead of the locale's default encoding
        # so non-ASCII rule text loads the same on every platform.
        with open(self.rules_config_path, 'r', encoding='utf-8') as stream:
            try:
                return yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
                return None

    def create_rules(self, is_palm=False):
        """Serialize the rules mapping to a compact JSON string.

        Args:
            is_palm: Unused; kept for interface compatibility.

        Returns:
            ``self.rules_list`` as single-line JSON, keys in their original
            order.
        """
        return json.dumps(dict(self.rules_list), sort_keys=False)

    def create_structure(self, is_palm=False):
        """Build the blank JSON template the model is asked to fill in.

        Args:
            is_palm: Unused; kept for interface compatibility.

        Returns:
            A ``(structure_json_str, dictionary_structure)`` tuple: the
            template as JSON indented by 4 spaces, and the same template as a
            dict mapping every rule key to ``''``.
        """
        # One empty-string field per rule key.
        blank_fields = {
            key: (str, Field(default='', description=""))
            for key in self.rules_list
        }

        # Dynamically create the Pydantic model and dump its defaults.
        PromptJSONModel = create_model('PromptJSONModel', **blank_fields)
        dictionary_structure = PromptJSONModel().dict()

        structure_json_str = json.dumps(dictionary_structure, sort_keys=False, indent=4)
        return structure_json_str, dictionary_structure

    def generate_xlsx_headers(self, is_palm):
        """Return the rule keys, in order, for use as spreadsheet headers.

        Args:
            is_palm: Unused; the headers are the same either way.

        Returns:
            A new list containing the keys of ``self.rules_list``.
        """
        return list(self.rules_list)