import os
import re
import json
import streamlit as st
import pandas as pd
from utils import validate_pdf, displayPDF
from styles import apply_custom_styles
from invoice_extractor.extraction import Auto
# Fall back to the Streamlit secrets store when GPT_KEY is missing from the
# environment or set to an empty string.
if not os.environ.get('GPT_KEY'):
    os.environ['GPT_KEY'] = st.secrets['GPT_KEY']

# Build the extractor only when this session does not hold one yet, so the
# same instance is reused on later script runs of the session.
if 'auto_extractor' not in st.session_state:
    st.session_state.auto_extractor = Auto()
def markdown_table_to_json(markdown):
    """Convert a Markdown pipe table into a list of row dictionaries.

    The first table line supplies the column headers, the second is taken to
    be the ``|---|---|`` separator and is skipped, and every following table
    line becomes one ``{header: cell}`` dictionary.

    Args:
        markdown: Text containing a Markdown pipe table. Lines that contain
            no ``|`` (blank lines, stray text or tag remnants around the
            table) are ignored.

    Returns:
        A list with one dict per data row, keyed by header text. Empty cells
        are kept as ``''`` so the remaining values stay under their own
        headers. Returns an empty list when there are no data rows.
    """
    def split_cells(line):
        # Remove the optional outer pipes first; after that every remaining
        # pipe separates two cells, so interior empty cells are preserved.
        line = line.strip()
        if line.startswith("|"):
            line = line[1:]
        if line.endswith("|"):
            line = line[:-1]
        return [cell.strip() for cell in line.split("|")]

    # Only lines containing a pipe can be table rows.
    table_lines = [line for line in markdown.strip().split("\n") if "|" in line]
    if len(table_lines) < 3:
        # Header + separator only (or nothing at all): no data rows.
        return []

    headers = split_cells(table_lines[0])
    rows = []
    for line in table_lines[2:]:  # Skip header and separator line
        cells = split_cells(line)
        # Columns with an empty header have no usable key and are dropped.
        rows.append({header: cell for header, cell in zip(headers, cells) if header})
    return rows
def visualise_pie_chart(analysis):
    """Tally the verdict tables found in *analysis* and draw a gauge.

    For each of ``GOOD``, ``AVERAGE`` and ``BAD`` the text following the last
    ``<VERDICT>`` tag, up to the next ``VERDICT>`` occurrence, is parsed as a
    Markdown table with ``markdown_table_to_json``. Each row adds 5, 3 or 1
    point(s) to ``score`` respectively and 5 points to ``total``.

    Args:
        analysis: Text containing the tagged verdict tables. The closing tag
            is presumably ``</VERDICT>`` - TODO confirm against the producer
            of this text.

    Returns:
        None. The result is passed to ``gauge``.
    """
    verdict_weights = (('GOOD', 5), ('AVERAGE', 3), ('BAD', 1))
    verdicts = {}
    score = 0
    total = 0
    for verdict, weight in verdict_weights:
        _, opening, tail = analysis.rpartition(f'<{verdict}>')
        if not opening:
            # Tag absent: without this guard the whole analysis text would be
            # parsed as this verdict's table.
            continue
        section = tail.split(f'{verdict}>')[0]
        # Splitting on "VERDICT>" leaves the "</" of a closing tag behind;
        # drop it so it cannot be read as a table row.
        if section.endswith('</'):
            section = section[:-2]
        table = markdown_table_to_json(section)
        if not table:
            continue
        verdicts[verdict] = table
        score += weight * len(table)
        total += 5 * len(table)
    # NOTE(review): `gauge` is not defined or imported in the visible part of
    # this file - confirm where it comes from.
    # NOTE(review): `score` and `verdicts` are computed but never used; the
    # gauge is fed `total` (the maximum), which looks like it was meant to be
    # `score` - confirm before changing.
    gauge(gVal=total, gTitle='', gMode='gauge+number',
          grLow=total // 3,
          grMid=2 * (total // 3))
def _find_post_process_entities(response):
    """Return the ``response`` payload of the POST_PROCESS stage, or None if no such stage exists."""
    for item in response:
        if item.get("stage") == "POST_PROCESS":
            return item['response']
    return None


def _render_entity(entity, widget_key):
    """Render one extracted entity: tables for list values, per-key sections for dicts, a text input otherwise."""
    name = entity["entityName"]
    value = entity['entityValue']
    if isinstance(value, list):
        st.markdown(f'{name}')
        st.table(pd.DataFrame.from_records(value))
    elif isinstance(value, dict):
        st.markdown(f'{name}')
        for k, v in value.items():
            st.markdown(f'{k.upper()}')
            if isinstance(v, list):
                st.table(pd.DataFrame.from_records(v))
            else:
                # Non-list values were previously shown as a heading with no
                # value underneath.
                st.write(v)
    else:
        # An explicit key keeps identical fields from different invoices
        # from colliding on the same auto-generated widget ID.
        st.text_input(f'{name}', value, key=widget_key)


def main():
    """Render the invoice extraction page.

    Shows a PDF uploader and an LOB selector. When files are uploaded and the
    'Extract' button is pressed, each file is validated with ``validate_pdf``,
    passed to the session's extractor, and the entities of the extractor's
    POST_PROCESS stage are rendered inside an expander next to the PDF.
    Failures for one file are reported with ``st.error`` and do not stop the
    remaining files from being processed.
    """
    # Apply custom styles
    apply_custom_styles()

    # Header
    st.markdown("\n", unsafe_allow_html=True)

    # File upload section
    st.markdown('', unsafe_allow_html=True)
    uploaded_files = st.file_uploader(
        "Choose invoice PDF files", type="pdf", accept_multiple_files=True
    )
    # NOTE(review): the selected LOB is not consumed anywhere in this
    # function; the extractor stored in session state is always used.
    st.selectbox(
        'LOB',
        options=['Health', 'Life', 'Auto'],
        index=2,
    )
    st.markdown('\n', unsafe_allow_html=True)

    if uploaded_files and st.button('Extract'):
        # Process each uploaded file
        for file_index, uploaded_file in enumerate(uploaded_files):
            # getvalue() returns the full content regardless of the current
            # stream position, unlike read().
            pdf_bytes = uploaded_file.getvalue()

            # Validate PDF
            if not validate_pdf(pdf_bytes):
                st.error(f"Invalid PDF file: {uploaded_file.name}")
                continue

            # Show loading state
            with st.spinner(f"Extracting {uploaded_file.name}..."):
                try:
                    # Make API call
                    response = st.session_state.auto_extractor(pdf_bytes)
                    extraction = _find_post_process_entities(response)
                    if extraction is None:
                        st.error(
                            f"Error extracting {uploaded_file.name}: "
                            "no POST_PROCESS stage in the extractor response"
                        )
                        continue
                    with st.expander(f'### Invoice : {uploaded_file.name}'):
                        displayPDF(pdf_bytes)
                        for entity_index, entity in enumerate(extraction):
                            _render_entity(
                                entity,
                                f'{file_index}-{uploaded_file.name}-{entity_index}',
                            )
                except Exception as e:
                    # UI boundary: report the failure and move on to the next file.
                    st.error(f"Error extracting {uploaded_file.name}: {str(e)}")

    # Footer
    st.markdown(
        "\n"
        "Upload one or more invoice PDFs to get detailed extraction.\n"
        "We support all major formats.\n",
        unsafe_allow_html=True,
    )
if __name__ == "__main__":
    # Script entry point: configure the page, then render the app.
    page_settings = {
        "page_title": "Invoice Extractor",
        "page_icon": "📋",
        "layout": "wide",
    }
    st.set_page_config(**page_settings)
    main()