Sakshi committed
Commit ddacfa7 · 1 Parent(s): 4980502

created invoice extraction app
.gitignore ADDED
@@ -0,0 +1,4 @@
+ __pycache__/
+ *.env
+ *.pyc
+ .env
.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
+ [theme]
+ base="light"
app.py ADDED
@@ -0,0 +1,133 @@
+ import os
+ import json
+ from streamviz import gauge  # assumed: the gauge() call below matches the streamviz package's API
+
+ import streamlit as st
+ import pandas as pd
+
+ from utils import validate_pdf, displayPDF
+ from styles import apply_custom_styles
+ from invoice_extractor.extraction import Auto
+
+ if 'GPT_KEY' not in os.environ or os.environ.get('GPT_KEY') in [None, '']:
+     os.environ['GPT_KEY'] = st.secrets['GPT_KEY']
+
+ if 'auto_extractor' not in st.session_state:
+     st.session_state.auto_extractor = Auto()
+
+ def markdown_table_to_json(markdown):
+     lines = markdown.strip().split("\n")
+
+     # Extract headers
+     headers = [h.strip() for h in lines[0].split("|") if h.strip()]
+
+     # Extract rows
+     rows = []
+     for line in lines[2:]:  # Skip header and separator line
+         values = [v.strip() for v in line.split("|") if v.strip()]
+         row_dict = dict(zip(headers, values))
+         rows.append(row_dict)
+
+     return rows
+
+ def visualise_pie_chart(analysis):  # NOTE: despite the name, this renders a score gauge
+     verdicts = {}
+     score = 0
+     total = 0
+     for verdict in ['GOOD', 'AVERAGE', 'BAD']:
+         table = analysis.split(f'<{verdict}>')[-1].split(f'</{verdict}>')[0]
+         table = markdown_table_to_json(table)
+         if len(table) > 0:
+             verdicts[verdict] = table
+             if verdict == 'GOOD':
+                 score += 5 * len(table)
+             elif verdict == 'AVERAGE':
+                 score += 3 * len(table)
+             elif verdict == 'BAD':
+                 score += len(table)
+             total += 5 * len(table)
+     gauge(gVal = score, gTitle = '', gMode = 'gauge+number',
+           grLow = total // 3,
+           grMid = 2 * (total // 3))
+
+ def main():
+     # Apply custom styles
+     apply_custom_styles()
+
+     # Header
+     st.markdown("""
+     <div class="header-container">
+         <img src="https://acko-brand.ackoassets.com/brand/vector-svg/gradient/horizontal-reverse.svg" height=50 width=100>
+         <h1>Invoice Extractor</h1>
+         <p>Upload and extract data from invoices</p>
+     </div>
+     """, unsafe_allow_html=True)
+
+     # File upload section
+     st.markdown('<div class="upload-container">', unsafe_allow_html=True)
+     uploaded_files = st.file_uploader("Choose invoice PDF files", type="pdf", accept_multiple_files=True)
+     print(uploaded_files)
+     lob = st.selectbox(
+         'LOB',
+         options = ['Health', 'Life', 'Auto'],
+         index = 2
+     )  # NOTE: only the Auto extractor is wired up, so this selection is not used yet
+     st.markdown('</div>', unsafe_allow_html=True)
+
+     if uploaded_files and st.button('Extract'):
+         # Process each uploaded file
+         for uploaded_file in uploaded_files:
+             # Read PDF content
+             pdf_bytes = uploaded_file.read()
+             # displayPDF(pdf_bytes)
+
+             # Validate PDF
+             if not validate_pdf(pdf_bytes):
+                 st.error(f"Invalid PDF file: {uploaded_file.name}")
+                 continue
+
+             # Show loading state
+             with st.spinner(f"Extracting {uploaded_file.name}..."):
+                 try:
+                     # Make API call
+                     response = st.session_state.auto_extractor(pdf_bytes)
+                     extraction = next(
+                         (item for item in response if item.get("stage") == "POST_PROCESS"), {}
+                     ).get('response', [])  # empty default avoids a TypeError when the stage is missing
+                     with st.expander(f'### Invoice : {uploaded_file.name}'):
+                         displayPDF(pdf_bytes)
+                         for entity in extraction:
+                             # cols = st.columns(2)
+                             # with cols[0]:
+                             if isinstance(entity['entityValue'], list):
+                                 st.markdown(f'{entity["entityName"]}')
+                                 df = pd.DataFrame.from_records(entity['entityValue'])
+                                 st.table(df)
+                             elif isinstance(entity['entityValue'], dict):
+                                 st.markdown(f'{entity["entityName"]}')
+                                 for k, v in entity['entityValue'].items():
+                                     st.markdown(f'{k.upper()}')
+                                     if isinstance(v, list):
+                                         df = pd.DataFrame.from_records(v)
+                                         st.table(df)
+                             else:
+                                 st.text_input(f'{entity["entityName"]}', entity['entityValue'])
+
+                 except Exception as e:
+                     st.error(f"Error extracting {uploaded_file.name}: {str(e)}")
+
+     # Footer
+     st.markdown("""
+     <div style="margin-top: 50px; text-align: center; color: #666;">
+         <p>Upload one or more invoice PDFs to get detailed extraction.</p>
+         <p>We support all major formats.</p>
+     </div>
+     """, unsafe_allow_html=True)
+
+ if __name__ == "__main__":
+     st.set_page_config(
+         page_title="Invoice Extractor",
+         page_icon="📋",
+         layout="wide"
+     )
+     main()
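Note: to try the app locally, the key that app.py reads via st.secrets['GPT_KEY'] has to exist in a Streamlit secrets file; a minimal sketch with a placeholder value:

    # .streamlit/secrets.toml
    GPT_KEY = "<azure-openai-api-key>"

    $ streamlit run app.py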
invoice_extractor/__init__.py ADDED
@@ -0,0 +1,40 @@
+ import os
+ import json
+ from dotenv import load_dotenv
+
+ try:
+     load_dotenv('.env')
+ except Exception:
+     pass
+
+ PACKAGE = 'invoice_extractor'
+ PROJECT_DIR = os.getcwd()
+ PACKAGE_PATH = os.path.join(PROJECT_DIR, PACKAGE)
+ PROMPTS_DIR = os.path.join(PACKAGE_PATH, 'prompts')
+ DATA_DIR = os.path.join(PACKAGE_PATH, 'data')
+
+ CREDENTIALS = {
+     'azure' : {
+         'plain-text' : {
+             'endpoint' : os.environ.get('AZURE_PLAIN_TEXT_ENDPOINT', ''),
+             'key' : os.environ.get('AZURE_PLAIN_TEXT_KEY', '')
+         },
+         'layout' : {
+             'endpoint' : os.environ.get('AZURE_LAYOUT_ENDPOINT', ''),
+             'key' : os.environ.get('AZURE_LAYOUT_KEY', ''),
+             'model' : os.environ.get('AZURE_LAYOUT_MODEL', '')
+         }
+     }
+ }
+
+ GPT_ENGINE = 'o1-mini'
+ GPT_KEY = os.environ.get('GPT_KEY', '')
+ GPT_VERSION = '2024-12-01-preview'
+ GPT_API_BASE = 'https://ackotest.openai.azure.com/'
+
+ # EXTRACTION_PROMPT = open(os.path.join(PROMPTS_DIR, 'extraction.txt')).read()
+ # entities = json.load(open(os.path.join(DATA_DIR, 'policy_analyser_entities.json')))
+ # for entity in entities:
+ #     del entity['entityId']
+ # entities_str = '\n---\n'.join(['\n'.join([f'{k} : {v}' for k, v in entity.items()]) for entity in entities])
+ # EXTRACTION_PROMPT += entities_str
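Note: for local development, the variables this module reads can be supplied via the .env file that load_dotenv() picks up; a sketch with placeholder values:

    GPT_KEY=<azure-openai-api-key>
    AZURE_LAYOUT_ENDPOINT=<form-recognizer-endpoint>
    AZURE_LAYOUT_KEY=<form-recognizer-key>
    AZURE_LAYOUT_MODEL=<layout-model-id, e.g. prebuilt-layout>
    AZURE_PLAIN_TEXT_ENDPOINT=<unused in this commit>
    AZURE_PLAIN_TEXT_KEY=<unused in this commit>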
invoice_extractor/data/__init__.py ADDED
File without changes
invoice_extractor/extraction.py ADDED
@@ -0,0 +1,148 @@
+ """
+ Extraction
+ @author : Sakshi Tantak
+ """
+
+ # Imports
+ import os
+ import re
+ import json
+ from time import time
+ from datetime import datetime
+
+ from invoice_extractor import PROMPTS_DIR
+ from invoice_extractor.llm import call_openai
+ from invoice_extractor.ocr import PyMuPDF4LLMOCR, AzureLayoutOCR
+
+ class LOB:
+     def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
+         if ocr_engine == 'open-source/pymupdf4llm':
+             self.engine = PyMuPDF4LLMOCR()
+         elif ocr_engine == 'azure/layout':
+             self.engine = AzureLayoutOCR()
+         self.file_type = 'pdf'
+         with open(os.path.join(PROMPTS_DIR, 'extraction_system_prompt.txt'), 'r') as f:
+             self.analysis_prompt = f.read()
+
+     def __call__(self, file_bytes):
+         response = [
+             {
+                 'stage' : 'OCR',
+                 'response' : '',
+                 'time' : 0
+             },
+             {
+                 'stage' : 'EXTRACTION',
+                 'response' : '',
+                 'time' : 0
+             },
+             {
+                 'stage' : 'POST_PROCESS',
+                 'response' : '',
+                 'time' : 0
+             }
+         ]
+         try:
+             print('OCR Started ...')
+             ocr_start = time()
+             if isinstance(file_bytes, str):
+                 text = file_bytes
+             elif isinstance(file_bytes, (bytearray, bytes)):
+                 text, _ = self.engine(file_bytes)
+             ocr_end = time()
+             print(f'OCR done [{ocr_end - ocr_start}]')
+
+             if len(text) > 0:
+                 response[0].update({'response' : text, 'time' : ocr_end - ocr_start})
+                 try:
+                     print('Extracting ...')
+                     extraction_start = time()
+                     raw_response = self._extract(text = text)
+                     extraction_end = time()
+                     print('Extraction : ', raw_response)
+                     print(f'Extracted [{extraction_end - extraction_start}]')
+                     if raw_response is not None and len(raw_response) > 0:
+                         response[1].update({'response' : raw_response, 'time' : extraction_end - extraction_start})
+                         try:
+                             print('Post processing extraction ...')
+                             post_process_start = time()
+                             post_processed = self._post_process(response = raw_response)
+                             post_process_end = time()
+                             print(f'Suggested [{post_process_end - post_process_start}]')
+                             if post_processed is not None and len(post_processed) > 0:
+                                 response[2].update({'response' : post_processed, 'time' : post_process_end - post_process_start})
+                         except Exception as pp_e:
+                             print(f'Exception while post processing : {pp_e}')
+                 except Exception as extraction_e:
+                     print(f'Exception while extracting : {extraction_e}')
+         except Exception as ocr_e:
+             print(f'Exception while OCR : {ocr_e}')
+         return response
+
+     def _extract(self, **kwargs):
+         raise NotImplementedError
+     def _post_process(self, **kwargs):
+         raise NotImplementedError
+
+ class Auto(LOB):
+     def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
+         super().__init__(ocr_engine)
+         with open(os.path.join(PROMPTS_DIR, 'extraction_system_prompt.txt'), 'r') as f:
+             self.extraction_prompt = f.read()
+         with open(os.path.join(PROMPTS_DIR, 'auto', 'entities.json'), 'r') as f:
+             self.entities = json.load(f)
+         for entity in self.entities:
+             entity.update({'entityNameRaw' : '', 'entityValueRaw' : ''})
+
+     def _extract(self, **kwargs):
+         text = kwargs.get('text', '')
+         if len(text) > 0:
+             prompt = self.extraction_prompt.replace("{{date}}", f'{datetime.today().day}/{datetime.today().month}/{datetime.today().year}') + str(self.entities)
+             prompt += '\nInvoice : ' + text
+             response = call_openai(prompt)
+             if len(response) > 0:
+                 return response
+         return ''
+
+     def _post_process(self, **kwargs):
+         response = kwargs.get('response', '')
+         if len(response) > 0:
+             jsonified_str_list = [e['entityName'] for e in self.entities if 'json' in e['expectedOutputFormat'].lower()]
+             response = re.sub(r'```(?:json)?', '', response).strip()  # strip markdown code fences without clobbering 'json' inside values
+             if len(response) > 0:
+                 try:
+                     response = json.loads(response)
+                     for entity in response:
+                         if entity['entityName'] in jsonified_str_list:
+                             try:
+                                 entity['entityValue'] = json.loads(entity['entityValue'])
+                             except Exception:
+                                 pass
+                     return response
+                 except Exception as jsonify_exc:
+                     print(f'Error JSONifying {jsonify_exc}')
+         return []
+
+
+ if __name__ == '__main__':
+     import os
+     import json
+     import sys
+     from tqdm import tqdm
+     filepaths = sys.argv[1:]
+     auto = Auto()
+
+     for filepath in tqdm(filepaths):
+         print(filepath)
+         if filepath.endswith('.pdf'):
+             file_bytes = open(filepath, 'rb').read()
+         elif filepath.endswith(('.txt', '.md')):
+             file_bytes = open(filepath).read()
+
+         extraction = auto(file_bytes)
+         print(extraction)
+         basepath = os.path.splitext(filepath)[0]
+         with open(basepath + '.json', 'w') as f:
+             json.dump(extraction, f, indent = 4)
+         with open(basepath + '.entities.json', 'w') as f:
+             json.dump(extraction[-1]['response'], f, indent = 4)
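Note: the __main__ block doubles as a CLI smoke test. Assuming the environment variables above are set and the command is run from the project root:

    $ python -m invoice_extractor.extraction path/to/invoice.pdf

For each input it writes <name>.json (all three stages with timings) and <name>.entities.json (just the post-processed entities) next to the source file.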
invoice_extractor/llm.py ADDED
@@ -0,0 +1,29 @@
+ """
+ Call OpenAI
+ @author : Sakshi Tantak
+ """
+
+ # Imports
+ from openai import AzureOpenAI
+
+ from invoice_extractor import GPT_ENGINE, GPT_API_BASE, GPT_KEY, GPT_VERSION
+
+ CLIENT = AzureOpenAI(
+     azure_endpoint = GPT_API_BASE,
+     api_key = GPT_KEY,
+     api_version = GPT_VERSION
+ )
+
+ def call_openai(system_prompt, seed = 42):  # NOTE: seed is accepted but not forwarded to the API yet
+     print('Calling openai')
+     # messages = [{'role' : 'system', 'content' : system_prompt},
+     #             {'role' : 'user', 'content' : document}]
+     messages = [{'role' : 'user', 'content' : system_prompt}]
+     response = CLIENT.chat.completions.create(
+         model = GPT_ENGINE,
+         messages = messages,
+         # response_format = response_format,
+         # reasoning_effort = 'low'
+     )
+     print('LLM response : ', response)
+     return response.choices[0].message.content
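Note: o1-mini does not accept a system message, which is presumably why the prompt is sent as a single user message and the two-message form is left commented out. A minimal sanity check of the credentials (hypothetical prompt):

    from invoice_extractor.llm import call_openai
    print(call_openai('Reply with OK.'))  # verifies the deployment name, key and API version resolve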
invoice_extractor/ocr.py ADDED
@@ -0,0 +1,140 @@
+ """
+ OCR
+ @author : Sakshi Tantak
+ """
+
+ # Imports
+ import json
+
+ from azure.core.credentials import AzureKeyCredential
+ from azure.ai.formrecognizer import DocumentAnalysisClient
+ import pymupdf4llm, pymupdf
+
+ from invoice_extractor import CREDENTIALS
+
+ def convert_nested_complex_obj_to_json(result):
+     result = json.loads(json.dumps(result, default = lambda o : o.__dict__))
+     return result
+
+ class AzureLayoutOCR:
+     def __init__(self):
+         self.client = self._authenticate()
+         self.engine = 'azure/layout'
+
+     def _authenticate(self):
+         client = DocumentAnalysisClient(
+             endpoint=CREDENTIALS['azure']['layout']['endpoint'],
+             credential=AzureKeyCredential(CREDENTIALS['azure']['layout']['key']),
+             connection_verify=False
+         )
+         return client
+
+     def _table2md(self, table, **kwargs):
+         row_count, column_count = table['row_count'], table['column_count']
+         cells = table['cells']
+
+         markdown_table = []
+         table_offsets = (table['spans'][0]['offset'], table['spans'][-1]['offset'] + table['spans'][-1]['length'])
+
+         for _ in range(row_count + 1):
+             row = [''] * column_count
+             markdown_table.append(row)
+
+         header_row_idx = [0]
+         for cell in cells:
+             row_index = cell['row_index']
+             if cell['kind'] == 'columnHeader':
+                 # Header cells are written one row down; header_row_idx records where separators go
+                 markdown_table[row_index + 1][cell['column_index']] = '**' + cell['content'].replace('|', '') + '**'
+                 header_row_idx.append(row_index + 1)
+             else:
+                 # Content cells are offset by 1 due to headers
+                 markdown_table[row_index + 1][cell['column_index']] = cell['content'].replace('|', '')
+
+         markdown_output = ''
+         for row_idx, row in enumerate(markdown_table):  # enumerate avoids list.index(), which misfires on duplicate rows
+             markdown_output += '| ' + ' | '.join(row) + ' |\n'
+             if row_idx in header_row_idx:
+                 # Add a separator after the header
+                 markdown_output += '| ' + ' | '.join(['---'] * column_count) + ' |\n'
+
+         return markdown_output, table_offsets
+
+     def _paragraphs2md(self, paragraph, element_offsets, **kwargs):
+         paragraph_offsets = (
+             paragraph['spans'][0]['offset'], paragraph['spans'][-1]['offset'] + paragraph['spans'][-1]['length'])
+         for offset in element_offsets:
+             if offset[0] <= paragraph_offsets[0] <= offset[1]:  # paragraph already covered by a table
+                 return None, None
+
+         markdown_text = ''
+
+         if paragraph['role'] == 'title':
+             markdown_text += f'# {paragraph["content"]}'
+         elif paragraph['role'] == 'sectionHeading':  # was `paragraph == "sectionHeading"`, which never matched
+             markdown_text += f'## {paragraph["content"]}'
+         else:
+             markdown_text += f'{paragraph["content"]}'
+         return markdown_text, paragraph_offsets
+
+     def _stitch_paragraphs_elements(self, paragraphs, elements, **kwargs):
+         new_list = paragraphs + elements
+         sorted_new_list = sorted(new_list, key=lambda x: x['offset'][0])
+         return sorted_new_list
+
+     def _convert2md(self, result, **kwargs):
+         paragraphs, tables = result['paragraphs'], result['tables']
+         md_tables = []
+         for table in tables:
+             md, offset = self._table2md(table, requestId=kwargs.get('requestId'))
+             md_tables.append({'content': md, 'offset': offset})
+
+         table_offsets = [element['offset'] for element in md_tables]
+         md_paragraphs = []
+
+         for para in paragraphs:
+             md, offset = self._paragraphs2md(para, table_offsets, requestId=kwargs.get('requestId'))
+             if md is not None:
+                 md_paragraphs.append({'content': md, 'offset': offset})
+
+         all_md_elements = self._stitch_paragraphs_elements(md_paragraphs, md_tables, requestId=kwargs.get('requestId'))
+         full_md = '\n\n'.join([record['content'] for record in all_md_elements])
+         return full_md
+
+     def _call_engine(self, image_reader, **kwargs):
+         poller = self.client.begin_analyze_document(
+             CREDENTIALS['azure']['layout']['model'],
+             image_reader
+         )
+         result = poller.result()
+
+         result = convert_nested_complex_obj_to_json(result)
+         md_text = self._convert2md(result, requestId=kwargs.get('requestId'))
+
+         return md_text, result
+
+     def __call__(self, file_bytes):
+         text, raw_response = self._call_engine(file_bytes)
+         return text, raw_response
+
+ class PyMuPDF4LLMOCR:
+     def __init__(self):
+         self.engine = 'open-source/pymupdf4llm'
+         self.file_type = 'pdf'
+
+     def _create_document(self, file_bytes, file_type = None):
+         return pymupdf.open(stream = file_bytes, filetype = self.file_type if file_type is None else file_type)
+
+     def __call__(self, file_bytes, file_type = None):
+         document = self._create_document(file_bytes, file_type)
+         response = pymupdf4llm.to_markdown(document)
+         return response, None
+
+ if __name__ == '__main__':
+     import sys
+     filepath = sys.argv[1]
+     file_bytes = open(filepath, 'rb').read()
+     ocr = AzureLayoutOCR()
+     text, raw_response = ocr(file_bytes)
+     print(text)
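Note: the __main__ block above exercises the Azure path; the open-source engine needs no credentials and can be tried directly (assuming a local invoice.pdf):

    from invoice_extractor.ocr import PyMuPDF4LLMOCR

    ocr = PyMuPDF4LLMOCR()
    markdown, _ = ocr(open('invoice.pdf', 'rb').read())  # second element is always None for this engine
    print(markdown)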
invoice_extractor/prompts/__init__.py ADDED
File without changes
invoice_extractor/prompts/auto/__init__.py ADDED
File without changes
invoice_extractor/prompts/auto/entities.json ADDED
@@ -0,0 +1,62 @@
+ [
+     {
+         "expectedOutputFormat": "Stringified Json List",
+         "entityName": "labour",
+         "entityId": 1,
+         "entityDesc": "All line items of the labour repairs performed on the vehicle in the following schema: [{\"sr_no\": \"serial number of item in alphanumeric\", \"item_name\": \"Name of line item\", \"labour_code\": \"Labour code\", \"hsn_sac\": \"HSN/SAC code\", \"qty\": \"quantity of the item in float\", \"unit_price\": \"Unit price of item in float\", \"cgst\": \"CGST on the item in float, if any\", \"sgst\": \"SGST on the item in float, if any\", \"igst\": \"IGST on the item in float, if any\", \"net_amount\": \"Final amount for the item in float\", \"discount\": \"discount for the item in float, if any\"}]"
+     },
+     {
+         "expectedOutputFormat": "Stringified Json List",
+         "entityName": "parts",
+         "entityId": 2,
+         "entityDesc": "All line items of the parts repaired or replaced on the vehicle in the following schema: [{\"sr_no\": \"serial number of item in alphanumeric\", \"item_name\": \"Name of line item\", \"part_number\": \"Part number\", \"hsn_sac\": \"HSN/SAC code\", \"qty\": \"quantity of the item in float\", \"unit_price\": \"Unit price of item in float\", \"cgst\": \"CGST on the item in float, if any\", \"sgst\": \"SGST on the item in float, if any\", \"igst\": \"IGST on the item in float, if any\", \"net_amount\": \"Final amount for the item in float\", \"discount\": \"discount for the item in float, if any\"}]"
+     },
+     {
+         "expectedOutputFormat": "String",
+         "entityName": "vendor_gst_number",
+         "entityId": 3,
+         "entityDesc": "Alphanumeric GST Number of the Vendor"
+     },
+     {
+         "expectedOutputFormat": "String",
+         "entityName": "customer_gst_number",
+         "entityId": 4,
+         "entityDesc": "Alphanumeric GST Number of the Customer"
+     },
+     {
+         "expectedOutputFormat": "Stringified Json dictionary",
+         "entityName": "tax_details",
+         "entityId": 5,
+         "entityDesc": "Tax details in the following schema : {\"cgst\": [{\"rate\": \"Rate of CGST levied in float\", \"amount\": \"Amount charged as CGST in float if any\"}], \"sgst\": [{\"rate\": \"Rate of SGST levied in float\", \"amount\": \"Amount charged as SGST in float if any\"}], \"igst\": [{\"rate\": \"Rate of IGST levied in float\", \"amount\": \"Amount charged as IGST in float if any\"}]}"
+     },
+     {
+         "expectedOutputFormat": "dd/mm/yyyy HH:MM:SS",
+         "entityName": "invoice_date",
+         "entityId": 6,
+         "entityDesc": "Date of invoice in dd/mm/yyyy HH:MM:SS format"
+     },
+     {
+         "expectedOutputFormat": "float",
+         "entityName": "total_amount",
+         "entityId": 7,
+         "entityDesc": "Total amount charged"
+     },
+     {
+         "expectedOutputFormat": "String",
+         "entityName": "invoice_number",
+         "entityId": 8,
+         "entityDesc": "Invoice Number"
+     },
+     {
+         "expectedOutputFormat": "String",
+         "entityName": "registration_number",
+         "entityId": 9,
+         "entityDesc": "Alphanumeric registration number of the vehicle"
+     },
+     {
+         "expectedOutputFormat": "Stringified Json dictionary",
+         "entityName": "insurance_claim_details",
+         "entityId": 10,
+         "entityDesc": "Insurance claim details in the following schema: {\"claim_no\": \"Alphanumeric claim number\", \"insured_name\": \"Name of the insured person\"}"
+     }
+ ]
invoice_extractor/prompts/extraction_system_prompt.txt ADDED
@@ -0,0 +1,18 @@
+ You are an intelligent agent in an insurance company called `Acko`.
+ Your job is to extract data from documents.
+ Today's date is : {{date}}
+
+ Given the markdown text of a document, you are supposed to extract the given entities.
+ Entities are given as a list of dictionaries.
+ You are supposed to complete the `entityValue` key of each dictionary based on the `entityDesc` description that explains the entity.
+ If any entity is not found in the document, don't generate it in the response JSON.
+ Format your response strictly as a list of JSON dictionaries.
+
+ In the following entities JSON, the keys mean the following:
+ - "entityId": [Unique ID of the entity. PRESERVE IN RESPONSE.]
+ - "entityName": [Entity to be extracted based on the entity description.]
+ - "entityValue": [Entity value to be generated by you in the expectedOutputFormat mentioned.]
+ - "expectedOutputFormat": [Expected format of the entityValue.]
+ - "entityDesc": [Description of the entity.]
+
+ Entities:
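Note: for illustration, a model response satisfying this contract would look like the following (hypothetical values; IDs taken from entities.json):

    [
        {"entityId": 8, "entityName": "invoice_number", "entityValue": "INV-2024-0042", "expectedOutputFormat": "String", "entityDesc": "Invoice Number"},
        {"entityId": 7, "entityName": "total_amount", "entityValue": "12345.67", "expectedOutputFormat": "float", "entityDesc": "Total amount charged"}
    ]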
invoice_extractor/utils.py ADDED
File without changes
requirements.txt ADDED
File without changes
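Note: requirements.txt is committed empty. Judging by the imports in this commit, a plausible starting point would be:

    streamlit
    pandas
    openai
    azure-ai-formrecognizer
    azure-core
    pymupdf
    pymupdf4llm
    python-dotenv
    tqdm
    streamviz  # assumed source of the gauge() helper used in app.py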
styles.py ADDED
@@ -0,0 +1,123 @@
+ import streamlit as st
+
+ def apply_custom_styles():
+     st.markdown("""
+     <style>
+     .stApp {
+         max-width: 1200px;
+         margin: 0 auto;
+         background: linear-gradient(135deg, #f5f7fa 0%, #e4e9f2 100%);
+         background-attachment: fixed;
+         min-height: 100vh;
+     }
+     .upload-container {
+         border: 2px dashed #0066cc;
+         border-radius: 10px;
+         padding: 20px;
+         text-align: center;
+         margin: 20px 0;
+         background: rgba(255, 255, 255, 0.9);
+         backdrop-filter: blur(5px);
+     }
+     .factor-card {
+         background-color: rgba(255, 255, 255, 0.95);
+         padding: 20px;
+         border-radius: 10px;
+         box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+         margin: 10px 0;
+         backdrop-filter: blur(5px);
+         height: 100%;
+     }
+     .good-factor {
+         border-left: 4px solid #28a745;
+     }
+     .average-factor {
+         border-left: 4px solid #ffc107;
+     }
+     .bad-factor {
+         border-left: 4px solid #dc3545;
+     }
+     .header-container {
+         padding: 2rem 0;
+         margin-bottom: 2rem;
+         background: linear-gradient(90deg, #0066cc 0%, #0099ff 100%);
+         color: white;
+         border-radius: 10px;
+         text-align: center;
+         box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+     }
+     .detailed-factor {
+         padding: 15px;
+         border-radius: 8px;
+         margin: 10px 0;
+         background: rgba(255, 255, 255, 0.9);
+         border-left: 4px solid #666;
+     }
+     .detailed-factor.good {
+         border-left-color: #28a745;
+         background: rgba(40, 167, 69, 0.1);
+     }
+     .detailed-factor.average {
+         border-left-color: #ffc107;
+         background: rgba(255, 193, 7, 0.1);
+     }
+     .detailed-factor.bad {
+         border-left-color: #dc3545;
+         background: rgba(220, 53, 69, 0.1);
+     }
+     .comparison-table {
+         background: white;
+         border-radius: 10px;
+         box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+         margin: 20px 0;
+     }
+     </style>
+     """, unsafe_allow_html=True)
+
+ def show_factor_section(title, factors, color):
+     if factors:
+         st.markdown(f"""
+         <div class="factor-card {color}-factor">
+             <h3 style="color: #333;">{title}</h3>
+             <ul style="list-style-type: none; padding-left: 0;">
+                 {"".join(f'<li style="margin: 10px 0; padding: 10px; background: rgba(248, 249, 250, 0.8); border-radius: 5px;">{factor}</li>' for factor in factors)}
+             </ul>
+         </div>
+         """, unsafe_allow_html=True)
+
+ def show_detailed_factors(good_factors, average_factors, bad_factors):
+     for factor in good_factors:
+         name, explanation = factor.split(':', 1)
+         st.markdown(f"""
+         <div class="detailed-factor good">
+             <strong>{name}</strong>
+             <p style="margin: 5px 0 0 0; color: #666;">{explanation}</p>
+         </div>
+         """, unsafe_allow_html=True)
+
+     for factor in average_factors:
+         name, explanation = factor.split(':', 1)
+         st.markdown(f"""
+         <div class="detailed-factor average">
+             <strong>{name}</strong>
+             <p style="margin: 5px 0 0 0; color: #666;">{explanation}</p>
+         </div>
+         """, unsafe_allow_html=True)
+
+     for factor in bad_factors:
+         name, explanation = factor.split(':', 1)
+         st.markdown(f"""
+         <div class="detailed-factor bad">
+             <strong>{name}</strong>
+             <p style="margin: 5px 0 0 0; color: #666;">{explanation}</p>
+         </div>
+         """, unsafe_allow_html=True)
+
+ def show_factor_summary(summary, verdict, sentiment_title):
+     if len(summary) > 0:
+         st.markdown(f"""
+         <div class="detailed-factor {verdict}">
+             <strong>{sentiment_title}</strong>
+             <p style="margin: 5px 0 0 0; color: #666;">{summary}</p>
+         </div>
+         """, unsafe_allow_html=True)
utils.py ADDED
@@ -0,0 +1,46 @@
+ """
+ Utilities
+ @author : Sakshi Tantak
+ """
+ import base64
+ import streamlit as st
+
+ def markdown_table_to_json(markdown):
+     lines = markdown.strip().split("\n")
+
+     # Extract headers
+     headers = [h.strip() for h in lines[0].split("|") if h.strip()]
+
+     # Extract rows
+     rows = []
+     for line in lines[2:]:  # Skip header and separator line
+         values = [v.strip() for v in line.split("|") if v.strip()]
+         row_dict = dict(zip(headers, values))
+         rows.append(row_dict)
+
+     return rows
+
+ def validate_pdf(pdf_bytes: bytes) -> bool:
+     """
+     Validates the uploaded PDF file.
+     """
+     if not pdf_bytes:
+         return False
+
+     # Check file signature for PDF (%PDF-)
+     return pdf_bytes.startswith(b'%PDF-')
+
+ def displayPDF(file):
+     # Opening file from file path
+     if isinstance(file, str):
+         file_bytes = open(file, 'rb').read()
+     else:
+         file_bytes = file
+     # (bytes passed in directly are used as-is)
+     base64_pdf = base64.b64encode(file_bytes).decode('utf-8')
+
+     # Embedding PDF in HTML
+     pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf">'
+
+     # Displaying File
+     st.markdown(pdf_display, unsafe_allow_html=True)
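Note: validate_pdf is a cheap magic-bytes check and displayPDF inlines the document as a base64 <embed>; minimal usage (assuming a local file):

    pdf_bytes = open('invoice.pdf', 'rb').read()
    if validate_pdf(pdf_bytes):  # True for anything starting with b'%PDF-'
        displayPDF(pdf_bytes)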