# Invoice Extractor

import os
import re
import json

import streamlit as st
import pandas as pd

from utils import validate_pdf, displayPDF
from styles import apply_custom_styles
from invoice_extractor.extraction import Auto

# Fall back to the Streamlit secrets store when the API key is missing from
# the environment or present but empty.
if not os.environ.get('GPT_KEY'):
    os.environ['GPT_KEY'] = st.secrets['GPT_KEY']

# Construct the extractor only when session state does not already hold one.
if 'auto_extractor' not in st.session_state:
    st.session_state['auto_extractor'] = Auto()

def markdown_table_to_json(markdown):
    """Parse a pipe-delimited Markdown table into a list of row dicts.

    The first line is taken as the header row and the second line is assumed
    to be the ``|---|---|`` separator; it is skipped without being inspected.
    Every following non-blank line becomes one dict mapping header text to
    cell text.

    Cells are split positionally, so an empty cell yields ``''`` under its
    own header instead of being dropped (which would shift the remaining
    values one column to the left). Blank lines between rows are ignored
    rather than producing empty dicts.

    Args:
        markdown: Markdown table text; surrounding whitespace is ignored.

    Returns:
        A list of dicts, one per data row. The list is empty for empty input
        or for a table with no data rows. Despite the function name, no JSON
        string is produced. Headers and cells are paired with ``zip``, so
        surplus cells (or surplus headers) on a ragged row are dropped.
    """
    def _split_cells(line):
        # Remove the optional outer pipes first so they do not create phantom
        # empty cells, then split on the inner pipes only.
        line = line.strip()
        if line.startswith("|"):
            line = line[1:]
        if line.endswith("|"):
            line = line[:-1]
        return [cell.strip() for cell in line.split("|")]

    lines = markdown.strip().split("\n")
    headers = _split_cells(lines[0])

    # lines[1] is the separator row, so data starts at index 2.
    return [
        dict(zip(headers, _split_cells(line)))
        for line in lines[2:]
        if line.strip()
    ]

def visualise_pie_chart(analysis):
    """Render a gauge summarising the GOOD / AVERAGE / BAD verdict tables.

    ``analysis`` is searched for Markdown tables wrapped in
    ``<GOOD>...</GOOD>``, ``<AVERAGE>...</AVERAGE>`` and ``<BAD>...</BAD>``
    tags. Each table row scores 5, 3 or 1 point(s) respectively, out of a
    maximum of 5 points per row.

    A verdict whose opening tag is absent is skipped. Previously the whole
    ``analysis`` string was parsed as that verdict's table in that case.

    Args:
        analysis: Text containing the tagged Markdown tables.

    Returns:
        None. The result is passed to ``gauge``.
    """
    weights = {'GOOD': 5, 'AVERAGE': 3, 'BAD': 1}
    score = 0
    total = 0
    for verdict, weight in weights.items():
        # rpartition keeps the existing "last opening tag wins" behaviour
        # while making a missing tag detectable (the separator comes back
        # empty).
        _, opening, tail = analysis.rpartition(f'<{verdict}>')
        if not opening:
            continue
        rows = markdown_table_to_json(tail.split(f'</{verdict}>')[0])
        score += weight * len(rows)
        total += 5 * len(rows)

    # NOTE(review): `gauge` is not defined or imported in the visible part of
    # this file - confirm where it comes from.
    # NOTE(review): the achieved score is now passed as the gauge value; the
    # previous code passed `total`, leaving `score` unused. Confirm against
    # the gauge helper's parameter semantics.
    gauge(gVal = score, gTitle = '', gMode = 'gauge+number',
          grLow = total // 3,
          grMid = 2 * (total // 3))

def _post_process_entities(response):
    """Return the ``'response'`` payload of the POST_PROCESS stage item.

    Raises:
        ValueError: if no item in ``response`` has stage ``"POST_PROCESS"``.
    """
    stage = next(
        (item for item in response if item.get("stage") == "POST_PROCESS"), None
    )
    if stage is None:
        # Explicit message instead of the opaque "'NoneType' object is not
        # subscriptable" the lookup would otherwise raise.
        raise ValueError("extractor response has no POST_PROCESS stage")
    return stage['response']


def _render_entity(entity, widget_key):
    """Render one extracted entity into the current Streamlit container.

    List values are shown as a table, dict values as one heading per key
    (with a table for list-valued keys), and anything else as a text input.
    ``widget_key`` must be unique per rendered entity.
    """
    value = entity['entityValue']
    name = entity['entityName']
    if isinstance(value, list):
        st.markdown(f'{name}')
        st.table(pd.DataFrame.from_records(value))
    elif isinstance(value, dict):
        st.markdown(f'{name}')
        for k, v in value.items():
            st.markdown(f'{k.upper()}')
            if isinstance(v, list):
                # Show the DataFrame, consistent with the list branch above.
                st.table(pd.DataFrame.from_records(v))
            # NOTE(review): non-list values under a dict entity are not
            # rendered (only their key heading is) - confirm this is intended.
    else:
        # An explicit key keeps widgets distinct when several invoices yield
        # the same entity name and value.
        st.text_input(f'{name}', value, key=widget_key)


def main():
    """Render the invoice-extraction page.

    Draws the header, the PDF uploader and the LOB selector. When files are
    uploaded and the "Extract" button is pressed, each file is validated with
    ``validate_pdf``, sent through ``st.session_state.auto_extractor``, and
    the entities of the POST_PROCESS stage are rendered inside an expander
    next to the PDF preview. A failure on one file is reported with
    ``st.error`` and does not stop the remaining files.
    """
    # Apply custom styles
    apply_custom_styles()

    # Header
    st.markdown("""
        <div class="header-container">
            <img src="https://acko-brand.ackoassets.com/brand/vector-svg/gradient/horizontal-reverse.svg" height=50 width=100>
                <h1>Invoice Extractor</h1>
            <p>Upload and extract data from invoices</p>
        </div>
    """, unsafe_allow_html=True)

    # File upload section
    st.markdown('<div class="upload-container">', unsafe_allow_html=True)
    uploaded_files = st.file_uploader("Choose invoice PDF files", type="pdf", accept_multiple_files=True)
    # NOTE(review): the selected LOB is not used below; every file goes
    # through `auto_extractor` regardless of the choice.
    st.selectbox(
        'LOB',
        options = ['Health', 'Life', 'Auto'],
        index = 2
    )
    st.markdown('</div>', unsafe_allow_html=True)

    if uploaded_files and st.button('Extract'):
        # Process each uploaded file
        for file_idx, uploaded_file in enumerate(uploaded_files):
            # getvalue() returns the whole buffer regardless of the current
            # stream position, so a file that was already read is not seen
            # as empty.
            pdf_bytes = uploaded_file.getvalue()

            # Validate PDF
            if not validate_pdf(pdf_bytes):
                st.error(f"Invalid PDF file: {uploaded_file.name}")
                continue

            # Show loading state
            with st.spinner(f"Extracting {uploaded_file.name}..."):
                try:
                    # Make API call
                    response = st.session_state.auto_extractor(pdf_bytes)
                    extraction = _post_process_entities(response)
                    with st.expander(f'### Invoice : {uploaded_file.name}'):
                        displayPDF(pdf_bytes)
                        for entity_idx, entity in enumerate(extraction):
                            _render_entity(
                                entity,
                                f'{file_idx}:{uploaded_file.name}:{entity_idx}',
                            )
                except Exception as e:
                    # UI boundary: surface the failure for this file and keep
                    # processing the remaining uploads.
                    st.error(f"Error extracting {uploaded_file.name}: {str(e)}")

    # Footer
    st.markdown("""
        <div style="margin-top: 50px; text-align: center; color: #666;">
            <p>Upload one or more invoice PDFs to get detailed extraction.</p>
            <p>We support all major formats.</p>
        </div>
    """, unsafe_allow_html=True)

if __name__ == "__main__":
    # Page-level settings are applied first, then the page itself is rendered.
    page_settings = {
        "page_title": "Invoice Extractor",
        "page_icon": "📋",
        "layout": "wide",
    }
    st.set_page_config(**page_settings)
    main()