Spaces:

pikaduck
/

invoice-extraction

Running

App Files Files Community

Sakshi committed 28 days ago

Commit

ddacfa7

1 Parent(s): 4980502

created invoice extraction app

Browse files

Files changed (16) hide show

.gitignore +4 -0
.streamlit/config.toml +2 -0
app.py +133 -0
invoice_extractor/__init__.py +40 -0
invoice_extractor/data/__init__.py +0 -0
invoice_extractor/extraction.py +148 -0
invoice_extractor/llm.py +29 -0
invoice_extractor/ocr.py +140 -0
invoice_extractor/prompts/__init__.py +0 -0
invoice_extractor/prompts/auto/__init__.py +0 -0
invoice_extractor/prompts/auto/entities.json +62 -0
invoice_extractor/prompts/extraction_system_prompt.txt +18 -0
invoice_extractor/utils.py +0 -0
requirements.txt +0 -0
styles.py +123 -0
utils.py +46 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+*.pycache
+*.env
+*.pyc
+*__pycache__

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [theme]
2	+ base="light"

app.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import os
+import re
+import json
+import streamlit as st
+import pandas as pd
+from utils import validate_pdf, displayPDF
+from styles import apply_custom_styles
+from invoice_extractor.extraction import Auto
# Use the Streamlit secret whenever the process environment has no usable
# GPT_KEY (variable missing or set to an empty string).
if not os.environ.get('GPT_KEY'):
    os.environ['GPT_KEY'] = st.secrets['GPT_KEY']

# Keep a single extractor in the session state instead of building a new one
# every time this script body runs.
if 'auto_extractor' not in st.session_state:
    st.session_state['auto_extractor'] = Auto()
def markdown_table_to_json(markdown):
    """Convert a markdown table into a list of row dictionaries.

    The first non-blank line is read as the header row, the second as the
    header separator (ignored), and every following line as a data row.

    Args:
        markdown: Text of a pipe-delimited markdown table.

    Returns:
        One dict per data row, mapping header text to cell text. Empty cells
        are kept as '' so that later cells stay under their own header.
    """
    def _cells(line):
        # Drop only the outer pipes; inner empty cells must survive the split
        # or every following value would shift one column to the left.
        line = line.strip()
        if line.startswith("|"):
            line = line[1:]
        if line.endswith("|"):
            line = line[:-1]
        return [cell.strip() for cell in line.split("|")]

    # Blank lines carry no row data and would otherwise become empty dicts.
    lines = [line for line in markdown.strip().split("\n") if line.strip()]
    if not lines:
        return []
    headers = _cells(lines[0])
    # lines[1] is the |---|---| separator.
    return [dict(zip(headers, _cells(line))) for line in lines[2:]]
def visualise_pie_chart(analysis):
    """Score an analysis text and hand the score to `gauge`.

    The text is searched for <GOOD>, <AVERAGE> and <BAD> sections, each holding
    a markdown table. Every table row earns 5, 3 or 1 points respectively, out
    of a possible 5 per row.

    Args:
        analysis: Text containing the tagged markdown tables.
    """
    # Points earned per table row for each verdict.
    weights = {'GOOD': 5, 'AVERAGE': 3, 'BAD': 1}
    best_per_row = max(weights.values())
    score = 0
    total = 0
    for verdict, weight in weights.items():
        opening = f'<{verdict}>'
        if opening not in analysis:
            # Without the tag, split() would return the whole text and it
            # would be parsed as if it were this verdict's table.
            continue
        table = analysis.split(opening)[-1].split(f'</{verdict}>')[0]
        rows = markdown_table_to_json(table)
        score += weight * len(rows)
        total += best_per_row * len(rows)
    # The gauge shows the earned score; the band limits are thirds of the
    # maximum reachable total.
    # NOTE(review): `gauge` is not defined or imported in the visible part of
    # this module - confirm where it is expected to come from.
    gauge(gVal = score, gTitle = '', gMode = 'gauge+number',
          grLow = total // 3,
          grMid = 2 * (total // 3))
def _render_entity(entity, key_prefix):
    """Render one extracted entity with the widget that fits its value type.

    Args:
        entity: Dict with an 'entityName' and an 'entityValue'.
        key_prefix: Text that is unique per invoice; it is folded into every
            widget key so the same entity in two invoices gets two widgets.
    """
    name = entity.get('entityName', '')
    value = entity.get('entityValue')
    if isinstance(value, list):
        st.markdown(f'{name}')
        st.table(pd.DataFrame.from_records(value))
    elif isinstance(value, dict):
        st.markdown(f'{name}')
        for k, v in value.items():
            if isinstance(v, list):
                st.markdown(f'{k.upper()}')
                st.table(pd.DataFrame.from_records(v))
            else:
                # Scalar members of a dict entity are shown as well, not just
                # their key.
                st.text_input(f'{k.upper()}', v, key = f'{key_prefix}-{name}-{k}')
    else:
        st.text_input(f'{name}', value, key = f'{key_prefix}-{name}')


def main():
    """Draw the page: header, upload form, per-invoice extraction results, footer."""
    # Apply custom styles
    apply_custom_styles()
    # Header
    st.markdown("""
        <div class="header-container">
            <img src="https://acko-brand.ackoassets.com/brand/vector-svg/gradient/horizontal-reverse.svg" height=50 width=100>
                <h1>Invoice Extractor</h1>
            <p>Upload and extract data from invoices</p>
        </div>
    """, unsafe_allow_html=True)
    # File upload section
    st.markdown('<div class="upload-container">', unsafe_allow_html=True)
    uploaded_files = st.file_uploader("Choose invoice PDF files", type="pdf", accept_multiple_files=True)
    # NOTE(review): the selected LOB is not used below; every file goes through
    # the extractor stored in st.session_state.auto_extractor.
    lob = st.selectbox(
        'LOB',
        options = ['Health', 'Life', 'Auto'],
        index = 2
    )
    st.markdown('</div>', unsafe_allow_html=True)
    if uploaded_files and st.button('Extract'):
        # Process each uploaded file
        for file_index, uploaded_file in enumerate(uploaded_files):
            pdf_bytes = uploaded_file.read()
            if not validate_pdf(pdf_bytes):
                st.error(f"Invalid PDF file: {uploaded_file.name}")
                continue
            with st.spinner(f"Extracting {uploaded_file.name}..."):
                try:
                    response = st.session_state.auto_extractor(pdf_bytes)
                    post_process = next(
                        (item for item in response if item.get("stage") == "POST_PROCESS"), None
                    )
                    extraction = post_process.get('response') if post_process else None
                    if not extraction:
                        # The stage keeps an empty response when any earlier
                        # step failed; say so instead of showing nothing.
                        st.warning(f"No data could be extracted from {uploaded_file.name}")
                        continue
                    with st.expander(f'### Invoice : {uploaded_file.name}'):
                        displayPDF(pdf_bytes)
                        # The index keeps keys distinct even when two uploads
                        # share a file name.
                        key_prefix = f'{file_index}-{uploaded_file.name}'
                        for entity in extraction:
                            _render_entity(entity, key_prefix)
                except Exception as e:
                    st.error(f"Error extracting {uploaded_file.name}: {str(e)}")
    # Footer
    st.markdown("""
        <div style="margin-top: 50px; text-align: center; color: #666;">
            <p>Upload one or more invoice PDFs to get detailed extraction.</p>
            <p>We support all major formats.</p>
        </div>
    """, unsafe_allow_html=True)
if __name__ == "__main__":
    # Page-level settings are applied before main() draws anything.
    page_settings = {
        "page_title": "Invoice Extractor",
        "page_icon": "📋",
        "layout": "wide",
    }
    st.set_page_config(**page_settings)
    main()

invoice_extractor/__init__.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import os
+import json
+from dotenv import load_dotenv
# Best effort: pick up variables from a local .env file. A failure here must
# not stop the package from importing, but it is reported instead of hidden.
try:
    load_dotenv('.env')
except Exception as dotenv_exc:
    print(f'Could not load .env : {dotenv_exc}')

PACKAGE = 'invoice_extractor'
# Paths are resolved from this file's location rather than the current working
# directory, so prompts and data are found wherever the process was started.
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
PACKAGE_PATH = os.path.join(PROJECT_DIR, PACKAGE)
PROMPTS_DIR = os.path.join(PACKAGE_PATH, 'prompts')
DATA_DIR = os.path.join(PACKAGE_PATH, 'data')

# Azure settings read from the environment; every entry defaults to ''.
CREDENTIALS = {
    'azure' : {
        'plain-text' : {
            'endpoint' : os.environ.get('AZURE_PLAIN_TEXT_ENDPOINT', ''),
            'key' : os.environ.get('AZURE_PLAIN_TEXT_KEY', '')
        },
        'layout' : {
            'endpoint' : os.environ.get('AZURE_LAYOUT_ENDPOINT', ''),
            'key' : os.environ.get('AZURE_LAYOUT_KEY', ''),
            'model' : os.environ.get('AZURE_LAYOUT_MODEL', '')
        }
    }
}

# Azure OpenAI deployment settings. GPT_KEY is whatever the environment holds
# at the moment this package is first imported.
GPT_ENGINE = 'o1-mini'
GPT_KEY = os.environ.get('GPT_KEY', '')
GPT_VERSION = '2024-12-01-preview'
GPT_API_BASE = 'https://ackotest.openai.azure.com/'
+# EXTRACTION_PROMPT = open(os.path.join(PROMPTS_DIR, 'extraction.txt')).read()
+# entities = json.load(open(os.path.join(DATA_DIR, 'policy_analyser_entities.json')))
+# for entity in entities:
+    # del entity['entityId']
+# entities_str = '\n---\n'.join(['\n'.join([f'{k} : {v}' for k, v in entity.items()]) for entity in entities])
+# EXTRACTION_PROMPT += entities_str

invoice_extractor/data/__init__.py ADDED Viewed

File without changes

invoice_extractor/extraction.py ADDED Viewed

	@@ -0,0 +1,148 @@

+"""
+    Extraction
+    @author : Sakshi Tantak
+"""
+# Imports
+import os
+import re
+import json
+from time import time
+from datetime import datetime
+from invoice_extractor import PROMPTS_DIR
+from invoice_extractor.llm import call_openai
+from invoice_extractor.ocr import PyMuPDF4LLMOCR, AzureLayoutOCR
class LOB:
    """Base OCR -> extraction -> post-processing pipeline.

    Subclasses provide `_extract` and `_post_process`. Calling an instance runs
    the three stages in order and returns one record per stage.
    """

    STAGES = ('OCR', 'EXTRACTION', 'POST_PROCESS')

    def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
        """Select the OCR engine and load the extraction system prompt.

        Args:
            ocr_engine: 'open-source/pymupdf4llm' or 'azure/layout'.

        Raises:
            ValueError: If `ocr_engine` is neither of the supported names.
        """
        if ocr_engine == 'open-source/pymupdf4llm':
            self.engine = PyMuPDF4LLMOCR()
        elif ocr_engine == 'azure/layout':
            self.engine = AzureLayoutOCR()
        else:
            # Failing here beats an AttributeError on self.engine at call time.
            raise ValueError(f'Unknown OCR engine : {ocr_engine}')
        self.file_type = 'pdf'
        with open(os.path.join(PROMPTS_DIR, 'extraction_system_prompt.txt'), 'r') as f:
            self.analysis_prompt = f.read()

    def __call__(self, file_bytes):
        """Run the pipeline on one document.

        Args:
            file_bytes: Raw document bytes (sent through the OCR engine) or a
                string that is used directly as the document text.

        Returns:
            A list of three dicts, one per stage in `STAGES`, each with the keys
            'stage', 'response' and 'time'. A stage that failed, produced
            nothing, or was never reached keeps response '' and time 0.
        """
        response = [{'stage' : stage, 'response' : '', 'time' : 0} for stage in self.STAGES]
        ocr_stage, extraction_stage, post_process_stage = response

        try:
            print('OCR Started ...')
            ocr_start = time()
            text = self._read_text(file_bytes)
            ocr_time = time() - ocr_start
            print(f'OCR done [{ocr_time}]')
        except Exception as ocr_e:
            print(f'Exception while OCR : {ocr_e}')
            return response
        if not text:
            return response
        ocr_stage.update({'response' : text, 'time' : ocr_time})

        try:
            print('Extracting ...')
            extraction_start = time()
            raw_response = self._extract(text = text)
            extraction_time = time() - extraction_start
            print('Extraction : ', raw_response)
            print(f'Extracted [{extraction_time}]')
        except Exception as extraction_e:
            print(f'Exception while extracting : {extraction_e}')
            return response
        if not raw_response:
            return response
        extraction_stage.update({'response' : raw_response, 'time' : extraction_time})

        try:
            print('Post processing extraction ...')
            post_process_start = time()
            post_processed = self._post_process(response = raw_response)
            post_process_time = time() - post_process_start
            print(f'Post processed [{post_process_time}]')
        except Exception as pp_e:
            print(f'Exception while post processing : {pp_e}')
            return response
        if post_processed:
            post_process_stage.update({'response' : post_processed, 'time' : post_process_time})
        return response

    def _read_text(self, file_bytes):
        """Return the document text, running OCR only for byte input."""
        if isinstance(file_bytes, str):
            return file_bytes
        if isinstance(file_bytes, (bytearray, bytes)):
            text, _ = self.engine(file_bytes)
            return text
        raise TypeError(f'Unsupported input type : {type(file_bytes).__name__}')

    def _extract(self, **kwargs):
        """Turn document text into a raw extraction; implemented by subclasses."""
        raise NotImplementedError

    def _post_process(self, **kwargs):
        """Turn a raw extraction into structured output; implemented by subclasses."""
        raise NotImplementedError
class Auto(LOB):
    """Extraction pipeline that uses the entity list in prompts/auto/entities.json."""

    def __init__(self, ocr_engine = 'open-source/pymupdf4llm'):
        """Load the extraction prompt and the entity definitions.

        Args:
            ocr_engine: Passed through to `LOB.__init__`.
        """
        super().__init__(ocr_engine)
        with open(os.path.join(PROMPTS_DIR, 'extraction_system_prompt.txt'), 'r') as f:
            self.extraction_prompt = f.read()
        with open(os.path.join(PROMPTS_DIR, 'auto', 'entities.json'), 'r') as f:
            self.entities = json.load(f)
        for entity in self.entities:
            entity.update({'entityNameRaw' : '', 'entityValueRaw' : ''})

    def _extract(self, **kwargs):
        """Ask the LLM to fill in the entities for the given document text.

        Args:
            text: Document text (keyword argument).

        Returns:
            The LLM reply, or '' when there is no text or no reply.
        """
        text = kwargs.get('text') or ''
        if not text:
            return ''
        today = datetime.today()
        prompt = self.extraction_prompt.replace("{{date}}", f'{today.day}/{today.month}/{today.year}') + str(self.entities)
        prompt += '\nInvoice : ' + text
        return call_openai(prompt) or ''

    @staticmethod
    def _strip_code_fence(text):
        """Return the content of the first ``` fenced block, or the stripped text if there is none."""
        fenced = re.search(r'```(?:json)?\s*(.*?)```', text, flags = re.DOTALL | re.IGNORECASE)
        return fenced.group(1).strip() if fenced else text.strip()

    def _post_process(self, **kwargs):
        """Parse the LLM reply into a list of entity dicts.

        Entities whose expected output format mentions JSON have a string
        `entityValue` decoded into the list/dict it encodes.

        Args:
            response: Raw LLM reply (keyword argument).

        Returns:
            The parsed list of entity dicts, or [] when the reply is empty or
            is not a JSON list.
        """
        response = kwargs.get('response') or ''
        if not response:
            return []
        jsonified_names = {e['entityName'] for e in self.entities if 'json' in e['expectedOutputFormat'].lower()}

        try:
            # Remove only the markdown fence around the payload, so the word
            # "json" or a backtick inside a value is left intact.
            parsed = json.loads(self._strip_code_fence(response))
        except (TypeError, ValueError):
            try:
                # Fallback: the earlier, more aggressive clean-up.
                parsed = json.loads(re.sub(r'`|json', '', response))
            except (TypeError, ValueError) as jsonify_exc:
                print(f'Error JSONifying {jsonify_exc}')
                return []
        if not isinstance(parsed, list):
            print(f'Error JSONifying expected a list, got {type(parsed).__name__}')
            return []

        entities = [entity for entity in parsed if isinstance(entity, dict)]
        for entity in entities:
            value = entity.get('entityValue')
            if entity.get('entityName') in jsonified_names and isinstance(value, str):
                try:
                    entity['entityValue'] = json.loads(value)
                except ValueError:
                    # Not valid JSON after all: keep the raw string.
                    pass
        return entities
if __name__ == '__main__':
    # Command-line use: every argument is a .pdf, .txt or .md path. For each
    # one the full stage report is written to <path>.json and the
    # post-processed entities to <path>.entities.json.
    import os
    import json
    import sys
    from tqdm import tqdm

    filepaths = sys.argv[1:]
    auto = Auto()
    for filepath in tqdm(filepaths):
        print(filepath)
        if filepath.endswith('.pdf'):
            with open(filepath, 'rb') as f:
                file_bytes = f.read()
        elif filepath.endswith(('.txt', '.md')):
            with open(filepath) as f:
                file_bytes = f.read()
        else:
            # Without this branch the previous file's content would be
            # extracted again under this file's name.
            print(f'Skipping unsupported file : {filepath}')
            continue
        extraction = auto(file_bytes)
        print(extraction)
        basepath = os.path.splitext(filepath)[0]
        with open(basepath + '.json', 'w') as f:
            json.dump(extraction, f, indent = 4)
        with open(basepath + '.entities.json', 'w') as f:
            json.dump(extraction[-1]['response'], f, indent = 4)

invoice_extractor/llm.py ADDED Viewed

	@@ -0,0 +1,29 @@

+"""
+    Call OpenAI
+    @author : Sakshi Tantak
+"""
+# Imports
+from openai import AzureOpenAI
+from invoice_extractor import GPT_ENGINE, GPT_API_BASE, GPT_KEY, GPT_VERSION
# Built on the first call_openai() call rather than at import time, so a
# GPT_KEY placed in the environment after this module was imported is still
# picked up.
CLIENT = None


def _get_client():
    """Return the shared AzureOpenAI client, creating it on first use."""
    global CLIENT
    if CLIENT is None:
        import os
        CLIENT = AzureOpenAI(
            azure_endpoint = GPT_API_BASE,
            # Prefer the live environment value; GPT_KEY is the value captured
            # when the package was imported.
            api_key = os.environ.get('GPT_KEY') or GPT_KEY,
            api_version = GPT_VERSION
        )
    return CLIENT


def call_openai(system_prompt, seed = 42):
    """Send one prompt to the configured deployment and return the reply text.

    Args:
        system_prompt: Full prompt text. It is sent as a single 'user' message.
        seed: Accepted for interface compatibility; not forwarded to the API.

    Returns:
        The text of the first choice, or '' when the reply has no content.
    """
    print('Calling openai')
    messages = [{'role' : 'user', 'content' : system_prompt}]
    response = _get_client().chat.completions.create(
        model = GPT_ENGINE,
        messages = messages,
    )
    print('LLM response : ', response)
    # Callers measure the reply with len(), so never hand back None.
    return response.choices[0].message.content or ''

invoice_extractor/ocr.py ADDED Viewed

	@@ -0,0 +1,140 @@

+"""
+    OCR
+    @author : Sakshi Tantak
+"""
+# Imports
+import json
+from azure.core.credentials import AzureKeyCredential
+from azure.ai.formrecognizer import DocumentAnalysisClient
+import pymupdf4llm, pymupdf
+from invoice_extractor import CREDENTIALS
def convert_nested_complex_obj_to_json(result):
    """Flatten an object graph into plain JSON-compatible data.

    The object is serialised with `json.dumps`, using an object's `__dict__`
    for anything JSON cannot encode natively, and then parsed back.
    """
    def _attributes(obj):
        return obj.__dict__

    serialised = json.dumps(result, default = _attributes)
    return json.loads(serialised)
class AzureLayoutOCR:
    """OCR engine that sends a document to Azure and rebuilds the result as markdown."""

    def __init__(self):
        self.client = self._authenticate()
        self.engine = 'azure/layout'

    def _authenticate(self):
        """Create the DocumentAnalysisClient from the 'layout' credentials."""
        layout = CREDENTIALS['azure']['layout']
        # NOTE(review): connection_verify=False is passed through unchanged; it
        # looks like it turns off TLS certificate verification - confirm this
        # is intended before using it outside a trusted network.
        client = DocumentAnalysisClient(
            endpoint=layout['endpoint'],
            credential=AzureKeyCredential(layout['key']),
            connection_verify=False
        )
        return client

    def _table2md(self, table, **kwargs):
        """Render one table dict as a markdown table.

        Args:
            table: Dict with 'row_count', 'column_count', 'cells' and 'spans'.

        Returns:
            (markdown, (start_offset, end_offset)) where the offsets come from
            the first and last span of the table.
        """
        row_count, column_count = table['row_count'], table['column_count']
        table_offsets = (table['spans'][0]['offset'], table['spans'][-1]['offset'] + table['spans'][-1]['length'])
        # Grid row 0 is a blank leading row; source row r is stored in grid
        # row r + 1.
        markdown_table = [[''] * column_count for _ in range(row_count + 1)]
        # Grid rows that are followed by a |---| separator line.
        header_rows = {0}
        for cell in table['cells']:
            grid_row = cell['row_index'] + 1
            # A pipe or a line break inside a cell would break the markdown row.
            content = cell['content'].replace('|', '').replace('\n', ' ')
            if cell['kind'] == 'columnHeader':
                content = '**' + content + '**'
                header_rows.add(grid_row)
            markdown_table[grid_row][cell['column_index']] = content
        separator = '| ' + ' | '.join(['---'] * column_count) + ' |\n'
        lines = []
        # Rows are matched by position; matching by content would confuse two
        # rows that happen to hold the same cells.
        for row_number, row in enumerate(markdown_table):
            lines.append('| ' + ' | '.join(row) + ' |\n')
            if row_number in header_rows:
                lines.append(separator)
        return ''.join(lines), table_offsets

    def _paragraphs2md(self, paragraph, element_offsets, **kwargs):
        """Render one paragraph dict as markdown.

        Args:
            paragraph: Dict with 'content', 'spans' and optionally 'role'.
            element_offsets: (start, end) offsets of already rendered elements.

        Returns:
            (markdown, (start_offset, end_offset)), or (None, None) when the
            paragraph starts inside one of `element_offsets`.
        """
        start = paragraph['spans'][0]['offset']
        paragraph_offsets = (start, paragraph['spans'][-1]['offset'] + paragraph['spans'][-1]['length'])
        for offset in element_offsets:
            if offset[0] <= start <= offset[1]:
                return None, None
        role = paragraph.get('role')
        content = paragraph['content']
        if role == 'title':
            markdown_text = f'# {content}'
        elif role == 'sectionHeading':
            markdown_text = f'## {content}'
        else:
            markdown_text = f'{content}'
        return markdown_text, paragraph_offsets

    def _stitch_paragraphs_elements(self, paragraphs, elements, **kwargs):
        """Merge both lists and order the records by their start offset."""
        return sorted(paragraphs + elements, key=lambda record: record['offset'][0])

    def _convert2md(self, result, **kwargs):
        """Build one markdown document from the 'paragraphs' and 'tables' of a result."""
        request_id = kwargs.get('requestId')
        md_tables = []
        for table in result['tables']:
            md, offset = self._table2md(table, requestId=request_id)
            md_tables.append({'content': md, 'offset': offset})
        table_offsets = [element['offset'] for element in md_tables]
        md_paragraphs = []
        for para in result['paragraphs']:
            md, offset = self._paragraphs2md(para, table_offsets, requestId=request_id)
            if md is not None:
                md_paragraphs.append({'content': md, 'offset': offset})
        all_md_elements = self._stitch_paragraphs_elements(md_paragraphs, md_tables, requestId=request_id)
        return '\n\n'.join(record['content'] for record in all_md_elements)

    def _call_engine(self, image_reader, **kwargs):
        """Analyse the document and return (markdown_text, raw_result_dict)."""
        poller = self.client.begin_analyze_document(
            CREDENTIALS['azure']['layout']['model'],
            image_reader
        )
        result = convert_nested_complex_obj_to_json(poller.result())
        md_text = self._convert2md(result, requestId=kwargs.get('requestId'))
        return md_text, result

    def __call__(self, file_bytes):
        """Run OCR on `file_bytes`; returns (markdown_text, raw_result_dict)."""
        text, raw_response = self._call_engine(file_bytes)
        return text, raw_response
class PyMuPDF4LLMOCR:
    """Text extraction engine that converts a document to markdown with pymupdf4llm."""

    def __init__(self):
        self.engine = 'open-source/pymupdf4llm'
        self.file_type = 'pdf'

    def _create_document(self, file_bytes, file_type = None):
        """Open `file_bytes` as a pymupdf document; `file_type` defaults to self.file_type."""
        return pymupdf.open(stream = file_bytes, filetype = self.file_type if file_type is None else file_type)

    def __call__(self, file_bytes, file_type = None):
        """Convert the document to markdown.

        Returns:
            (markdown_text, None); the second slot mirrors the
            (text, raw_response) pair returned by AzureLayoutOCR.
        """
        document = self._create_document(file_bytes, file_type)
        try:
            response = pymupdf4llm.to_markdown(document)
        finally:
            # Release the document even when the conversion fails.
            document.close()
        return response, None
if __name__ == '__main__':
    # Command-line use: run the Azure layout engine on one file and print the
    # resulting markdown.
    import sys

    if len(sys.argv) < 2:
        sys.exit('Usage: python ocr.py <path-to-document>')
    filepath = sys.argv[1]
    with open(filepath, 'rb') as f:
        file_bytes = f.read()
    ocr = AzureLayoutOCR()
    text, raw_response = ocr(file_bytes)
    print(text)

invoice_extractor/prompts/__init__.py ADDED Viewed

File without changes

invoice_extractor/prompts/auto/__init__.py ADDED Viewed

File without changes

invoice_extractor/prompts/auto/entities.json ADDED Viewed

	@@ -0,0 +1,62 @@

+[
+    {
+        "expectedOutputFormat": "Stringified Json List",
+        "entityName": "labour",
+        "entityId": 1,
+        "entityDesc": "All line items of the labour repairs performed on the vehicle in the following schema: [{\"sr_no\": \"serial number of item in alphanumeric\", \"item_name\": \"Name of line item\", \"labour_code\": \"Labour code\", \"hsn_sac\": \"HSN/SAC code\", \"qty\": \"quantity of the item in float\", \"unit_price\": \"Unit price of item in float\", \"cgst\": \"CGST on the item in float, if any\", \"sgst\": \"SGST on the item in float, if any\", \"igst\": \"IGST on the item in float, if any\", \"net_amount\": \"Final amount for the item in float\", \"discount\": \"discount for the item in float, if any\"}]"
+    },
+    {
+        "expectedOutputFormat": "Stringified Json List",
+        "entityName": "parts",
+        "entityId": 2,
+        "entityDesc": "All line items of the parts repaired or replaced on the vehicle in the following schema: [{\"sr_no\": \"serial number of item in alphanumeric\", \"item_name\": \"Name of line item\", \"part_number\": \"Part number\", \"hsn_sac\": \"HSN/SAC code\", \"qty\": \"quantity of the item in float\", \"unit_price\": \"Unit price of item in float\", \"cgst\": \"CGST on the item in float, if any\", \"sgst\": \"SGST on the item in float, if any\", \"igst\": \"IGST on the item in float, if any\", \"net_amount\": \"Final amount for the item in float\", \"discount\": \"discount for the item in float, if any\"}]"
+    },
+    {
+        "expectedOutputFormat": "String",
+        "entityName": "vendor_gst_number",
+        "entityId": 3,
+        "entityDesc": "Alphanumeric GST Number of the Vendor"
+    },
+    {
+        "expectedOutputFormat": "String",
+        "entityName": "customer_gst_number",
+        "entityId": 4,
+        "entityDesc": "Alphanumeric GST Number of the Customer"
+    },
+    {
+        "expectedOutputFormat": "Stringified Json dictionary",
+        "entityName": "tax_details",
+        "entityId": 5,
+        "entityDesc": "Tax details in the following schema : {\"cgst\": [{\"rate\": \"Rate of CGST levied in float\", \"amount\": \"Amount charged as CGST in float if any\"}], \"sgst\": [{\"rate\": \"Rate of SGST levied in float\", \"amount\": \"Amount charged as SGST in float if any\"}], \"igst\": [{\"rate\": \"Rate of IGST levied in float\", \"amount\": \"Amount charged as IGST in float if any\"}]}"
+    },
+    {
+        "expectedOutputFormat": "dd/mm/yyyy HH:MM:SS",
+        "entityName": "invoice_date",
+        "entityId": 6,
+        "entityDesc": "Date of invoice in dd/mm/yyyy HH:MM:SS format"
+    },
+    {
+        "expectedOutputFormat": "float",
+        "entityName": "total_amount",
+        "entityId": 7,
+        "entityDesc": "Total amount charged"
+    },
+    {
+        "expectedOutputFormat": "String",
+        "entityName": "invoice_number",
+        "entityId": 8,
+        "entityDesc": "Invoice Number"
+    },
+    {
+        "expectedOutputFormat": "String",
+        "entityName": "registration_number",
+        "entityId": 9,
+        "entityDesc": "Alphanumeric registration number of the vehicle"
+    },
+    {
+        "expectedOutputFormat": "Stringified Json dictionary",
+        "entityName": "insurance_claim_details",
+        "entityId": 10,
+        "entityDesc": "Insurance claim details in the following schema: {\"claim_no\": \"Alphanumeric claim number\", \"insured_name\": \"Name of the insured person\"}"
+    }
+]

invoice_extractor/prompts/extraction_system_prompt.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+You are an intelligent agent in an insurance company called `Acko`
+Your job is to extract data from documents.
+Today's date is : {{date}}
+Given the markdown text of a document, you are supposed to extract the given entities.
+Entities are given as a list of dictionaries.
+You are supposed to complete the `entityValue` key of the given dictionaries based on the `entityDesc` given as description that explains the entity.
+If any entity is not found in the document, don't generate it in the response JSON.
+Format your response strictly as a list of JSON dictionaries.
+In the following entities JSON, keys mean the following
+    - "entityId": [Unique ID of the entity. PRESERVE IN RESPONSE.]
+    - "entityName": [Entity to be extracted based on the entity description.]
+    - "entityValue": [Entity value to be generated by you in the expectedOutputFormat mentioned.]
+    - "expectedOutputFormat": [Expected format of the entityValue.]
+    - "entityDesc": [Description of entity]
+Entities:

invoice_extractor/utils.py ADDED Viewed

File without changes

requirements.txt ADDED Viewed

File without changes

styles.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import streamlit as st
# Stylesheet injected by apply_custom_styles(); kept as one literal so the
# function body stays a single call.
_CUSTOM_STYLE_BLOCK = """
        <style>
        .stApp {
            max-width: 1200px;
            margin: 0 auto;
            background: linear-gradient(135deg, #f5f7fa 0%, #e4e9f2 100%);
            background-attachment: fixed;
            min-height: 100vh;
        }
        .upload-container {
            border: 2px dashed #0066cc;
            border-radius: 10px;
            padding: 20px;
            text-align: center;
            margin: 20px 0;
            background: rgba(255, 255, 255, 0.9);
            backdrop-filter: blur(5px);
        }
        .factor-card {
            background-color: rgba(255, 255, 255, 0.95);
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
            margin: 10px 0;
            backdrop-filter: blur(5px);
            height: 100%;
        }
        .good-factor {
            border-left: 4px solid #28a745;
        }
        .average-factor {
            border-left: 4px solid #ffc107;
        }
        .bad-factor {
            border-left: 4px solid #dc3545;
        }
        .header-container {
            padding: 2rem 0;
            margin-bottom: 2rem;
            background: linear-gradient(90deg, #0066cc 0%, #0099ff 100%);
            color: white;
            border-radius: 10px;
            text-align: center;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        }
        .detailed-factor {
            padding: 15px;
            border-radius: 8px;
            margin: 10px 0;
            background: rgba(255, 255, 255, 0.9);
            border-left: 4px solid #666;
        }
        .detailed-factor.good {
            border-left-color: #28a745;
            background: rgba(40, 167, 69, 0.1);
        }
        .detailed-factor.average {
            border-left-color: #ffc107;
            background: rgba(255, 193, 7, 0.1);
        }
        .detailed-factor.bad {
            border-left-color: #dc3545;
            background: rgba(220, 53, 69, 0.1);
        }
        .comparison-table {
            background: white;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
            margin: 20px 0;
        }
        </style>
    """


def apply_custom_styles():
    """Inject the application's stylesheet into the page as raw HTML."""
    st.markdown(_CUSTOM_STYLE_BLOCK, unsafe_allow_html=True)
def show_factor_section(title, factors, color):
    """Render a titled card listing `factors`; nothing is drawn when the list is empty.

    `color` selects the card's CSS class ("<color>-factor").
    """
    if not factors:
        return
    list_items = "".join(
        f'<li style="margin: 10px 0; padding: 10px; background: rgba(248, 249, 250, 0.8); border-radius: 5px;">{factor}</li>'
        for factor in factors
    )
    card_html = f"""
            <div class="factor-card {color}-factor">
                <h3 style="color: #333;">{title}</h3>
                <ul style="list-style-type: none; padding-left: 0;">
                    {list_items}
                </ul>
            </div>
        """
    st.markdown(card_html, unsafe_allow_html=True)
def _show_factor_group(factors, verdict):
    """Render each "name: explanation" factor as a box styled for `verdict`."""
    for factor in factors:
        # partition() never fails: a factor without a colon becomes a name
        # with an empty explanation instead of raising on unpacking.
        name, _, explanation = factor.partition(':')
        st.markdown(f"""
            <div class="detailed-factor {verdict}">
                <strong>{name}</strong>
                <p style="margin: 5px 0 0 0; color: #666;">{explanation}</p>
            </div>
        """, unsafe_allow_html=True)


def show_detailed_factors(good_factors, average_factors, bad_factors):
    """Render the good, average and bad factors, in that order.

    Each factor is a string of the form "name: explanation"; the text before
    the first colon is shown in bold and the rest underneath it.
    """
    _show_factor_group(good_factors, 'good')
    _show_factor_group(average_factors, 'average')
    _show_factor_group(bad_factors, 'bad')
def show_factor_summary(summary, verdict, sentiment_title):
    """Render a summary box styled for `verdict`; an empty summary draws nothing."""
    if len(summary) == 0:
        return
    summary_html = f"""
                <div class="detailed-factor {verdict}">
                    <strong>{sentiment_title}</strong>
                    <p style="margin: 5px 0 0 0; color: #666;">{summary}</p>
                </div>
            """
    st.markdown(summary_html, unsafe_allow_html=True)

utils.py ADDED Viewed

	@@ -0,0 +1,46 @@

+"""
+    Utilities
+    @author : Sakshi Tantak
+"""
+import streamlit as st
+import base64
def markdown_table_to_json(markdown):
    """Convert a markdown table into a list of row dictionaries.

    The first non-blank line is read as the header row, the second as the
    header separator (ignored), and every following line as a data row.

    Args:
        markdown: Text of a pipe-delimited markdown table.

    Returns:
        One dict per data row, mapping header text to cell text. Empty cells
        are kept as '' so that later cells stay under their own header.
    """
    def _cells(line):
        # Drop only the outer pipes; inner empty cells must survive the split
        # or every following value would shift one column to the left.
        line = line.strip()
        if line.startswith("|"):
            line = line[1:]
        if line.endswith("|"):
            line = line[:-1]
        return [cell.strip() for cell in line.split("|")]

    # Blank lines carry no row data and would otherwise become empty dicts.
    lines = [line for line in markdown.strip().split("\n") if line.strip()]
    if not lines:
        return []
    headers = _cells(lines[0])
    # lines[1] is the |---|---| separator.
    return [dict(zip(headers, _cells(line))) for line in lines[2:]]
def validate_pdf(pdf_bytes: bytes) -> bool:
    """Return True when `pdf_bytes` is non-empty and begins with the %PDF- signature."""
    has_content = bool(pdf_bytes)
    return has_content and pdf_bytes.startswith(b'%PDF-')
def displayPDF(file):
    """Embed a PDF in the page.

    Args:
        file: A file path (str), which is read from disk, or the PDF content
            as bytes.
    """
    if isinstance(file, str):
        # Read through a context manager so the handle is always closed.
        with open(file, 'rb') as pdf_file:
            file_bytes = pdf_file.read()
    else:
        file_bytes = file
    base64_pdf = base64.b64encode(file_bytes).decode('utf-8')
    # The PDF travels inline as a base64 data URI inside an <embed> tag.
    pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf">'
    st.markdown(pdf_display, unsafe_allow_html=True)