Spaces:

mlkorra
/

Product-doc-classifier

Sleeping

File size: 11,940 Bytes

dcb2841

import streamlit as st
from utils.util_classifier import TextClassificationPipeline
import time
import requests
import io
import pdfplumber
from urllib.parse import urlparse
import plotly.graph_objects as go
import plotly.express as px

def validate_url(url):
    """Return True if ``url`` looks like a downloadable HTTP(S) URL.

    The URL must parse cleanly, use the ``http`` or ``https`` scheme (the
    only schemes the downloader can fetch), and carry a network location.

    Args:
        url: The user-supplied URL string.

    Returns:
        bool: True when the URL is well-formed and fetchable, else False.
    """
    try:
        result = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # ValueError: malformed URL (e.g. unbalanced IPv6 brackets);
        # AttributeError/TypeError: non-string input.
        return False
    # urlparse lower-cases the scheme, so a plain membership test suffices.
    return result.scheme in ("http", "https") and bool(result.netloc)

def download_pdf(url, timeout=30):
    """Download a PDF from ``url`` and return it as an in-memory file.

    Args:
        url: HTTP(S) address of the PDF.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on an unresponsive host.

    Returns:
        io.BytesIO: The downloaded bytes, or ``None`` if the download failed
        or the response is not a PDF (the error is reported via ``st.error``).
    """
    # Browser-like headers; NOTE(review): the fixed Referer presumably targets
    # one specific host that rejects bare requests -- confirm it is still needed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/pdf,*/*',
        'Referer': 'https://www.inter-lux.com/'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # Accept the response when the server declares a PDF, or when the
        # payload carries the PDF magic header (some servers send PDFs as
        # application/octet-stream).
        content_type = response.headers.get('content-type', '')
        declared_pdf = 'application/pdf' in content_type.lower()
        if not declared_pdf and not response.content.startswith(b'%PDF-'):
            raise ValueError(f"URL does not point to a PDF file. Content-Type: {content_type}")

        return io.BytesIO(response.content)
    except Exception as e:
        # UI boundary: surface any failure to the user instead of crashing.
        st.error(f"Download error: {str(e)}")
        return None

def extract_text(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: A seekable binary file-like object containing the PDF.

    Returns:
        The page texts, each followed by a newline, concatenated and stripped;
        ``None`` if extraction fails or yields no text (the error is shown
        with ``st.error``).
    """
    try:
        # The stream may already have been read; rewind it first.
        pdf_file.seek(0)

        with pdfplumber.open(pdf_file) as document:
            page_texts = (page.extract_text() for page in document.pages)
            # Pages with no extractable text are skipped.
            combined = "".join(chunk + "\n" for chunk in page_texts if chunk).strip()

        if not combined:
            raise ValueError("No text could be extracted from the PDF")

        return combined
    except Exception as error:
        st.error(f"Text extraction error: {str(error)}")
        return None

def _create_gauge_chart(confidence):
    """Build a Plotly gauge showing ``confidence`` (a 0-1 fraction) on a 0-100 scale."""
    fig = go.Figure(go.Indicator(
        mode="gauge+number+delta",
        value=confidence * 100,
        domain={'x': [0, 1], 'y': [0, 1]},
        gauge={
            'axis': {'range': [None, 100], 'tickwidth': 1, 'tickcolor': "darkblue"},
            'bar': {'color': "darkblue"},
            'bgcolor': "white",
            'borderwidth': 2,
            'bordercolor': "gray",
            'steps': [
                {'range': [0, 50], 'color': '#FF9999'},
                {'range': [50, 75], 'color': '#FFCC99'},
                {'range': [75, 100], 'color': '#99FF99'}
            ],
        },
        title={'text': "Confidence Score"}
    ))

    fig.update_layout(
        height=300,
        margin=dict(l=10, r=10, t=50, b=10),
        paper_bgcolor='rgba(0,0,0,0)',
        font={'color': "darkblue", 'family': "Arial"}
    )
    return fig


def _create_probability_chart(probabilities):
    """Build a horizontal bar chart from a ``{label: probability}`` mapping.

    Probabilities are 0-1 fractions and are plotted as percentages.
    """
    labels = list(probabilities.keys())
    percentages = [value * 100 for value in probabilities.values()]

    # Start at the third shade (the lightest ones are hard to see) and wrap
    # around so any number of labels gets a colour instead of an IndexError.
    palette = px.colors.sequential.Blues
    bar_colors = [palette[(i + 2) % len(palette)] for i in range(len(labels))]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=labels,
        x=percentages,
        orientation='h',
        marker=dict(
            color=bar_colors,
            line=dict(color='rgba(0,0,0,0.8)', width=2)
        ),
        text=[f'{pct:.1f}%' for pct in percentages],
        textposition='auto',
    ))

    fig.update_layout(
        title=dict(
            text='Probability Distribution',
            y=0.95,
            x=0.5,
            xanchor='center',
            yanchor='top',
            font=dict(size=20, color='darkblue')
        ),
        xaxis_title="Probability (%)",
        yaxis_title="Categories",
        height=400,
        margin=dict(l=20, r=20, t=70, b=20),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(family="Arial", size=14),
        showlegend=False
    )

    fig.update_xaxes(
        range=[0, 100],
        gridcolor='rgba(0,0,0,0.1)',
        zerolinecolor='rgba(0,0,0,0.2)'
    )
    fig.update_yaxes(
        gridcolor='rgba(0,0,0,0.1)',
        zerolinecolor='rgba(0,0,0,0.2)'
    )

    return fig


def _display_results(result):
    """Render one classification result: label card, gauge, bar chart, details.

    ``result`` is read through the keys ``predicted_label``, ``confidence``,
    ``probabilities``, ``model_type`` and ``text``.
    """
    import html  # local import keeps this block self-contained

    # The label is interpolated into raw HTML below, so escape it first.
    label = html.escape(str(result['predicted_label']))

    # Two columns: label card + gauge on the left, distribution on the right.
    col1, col2 = st.columns([1, 2])

    with col1:
        st.markdown("""
            <div style='
                background-color: white;
                padding: 20px;
                border-radius: 10px;
                box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
                text-align: center;
                margin-bottom: 20px;
            '>
                <h4 style='color: #1f77b4; margin-bottom: 10px;'>Predicted Category</h4>
                <p style='
                    font-size: 24px;
                    font-weight: bold;
                    color: #2c3e50;
                    margin: 0;
                    padding: 10px;
                    background-color: #f8f9fa;
                    border-radius: 5px;
                '>{}</p>
            </div>
        """.format(label), unsafe_allow_html=True)

        st.plotly_chart(_create_gauge_chart(result['confidence']), use_container_width=True)

    with col2:
        st.plotly_chart(_create_probability_chart(result['probabilities']), use_container_width=True)

    with st.expander("📊 Classification Details"):
        st.markdown(f"""
            - **Model Type**: {result['model_type'].title()}
            - **Document Length**: {len(result['text'])} characters
        """)


def _classify_text(classifier, text):
    """Run the classifier on ``text`` and return a single result.

    When ``predict`` returns a list, its first element is used.
    """
    result = classifier.predict(text, return_probability=True)
    if isinstance(result, list):
        result = result[0]
    return result


def _render_url_tab(classifier):
    """Render the URL tab: download the PDF, extract its text, classify it.

    Returning early here only ends this tab's rendering; the caller still
    goes on to render the other tab.
    """
    url = st.text_input("Enter PDF URL")
    process_btn = st.button("Classify Document", key="url_classify")

    if not (process_btn and url):
        return

    if not validate_url(url):
        st.error("Please enter a valid URL")
        return

    progress_container = st.container()

    with progress_container:
        # Step 1: Downloading
        with st.spinner("Downloading PDF..."):
            pdf_file = download_pdf(url)
        if pdf_file is None:
            return
        st.success("PDF downloaded successfully!")

        # Step 2: Extracting Text
        with st.spinner("Extracting text from PDF..."):
            text = extract_text(pdf_file)
        if text is None or not text.strip():
            return
        st.success("Text extracted successfully!")

        with st.expander("View Extracted Text"):
            # Show only the first 500 characters as a preview.
            preview = text[:500] + "..." if len(text) > 500 else text
            st.text(preview)

        # Step 3: Classification
        with st.spinner("Classifying document..."):
            result = _classify_text(classifier, text)

        st.markdown("### 📊 Classification Results")
        _display_results(result)


def _render_upload_tab(classifier):
    """Render the upload tab: extract text from the uploaded PDF and classify it."""
    uploaded_file = st.file_uploader("Upload PDF file", type="pdf")
    process_btn = st.button("Classify Document", key="file_classify")

    if not (process_btn and uploaded_file):
        return

    with st.spinner("Processing uploaded PDF..."):
        text = extract_text(uploaded_file)
        if text is None:
            return
        result = _classify_text(classifier, text)

    # Same presentation as the URL tab.
    st.markdown("### 📊 Classification Results")
    _display_results(result)


def main():
    """Render the Document Classifier app with URL and file-upload inputs.

    Each tab is rendered by its own helper so that an early exit in one tab
    (invalid URL, failed download, no text) never prevents the other tab
    from being drawn.
    """
    st.title("🎯 Document Classifier")

    # Model selection
    method = "bertbased"

    # Initialize classifier
    # NOTE(review): this runs on every Streamlit rerun; if constructing the
    # pipeline is expensive, consider caching it (e.g. st.cache_resource,
    # if the installed Streamlit version provides it).
    classifier = TextClassificationPipeline(method=method)

    # File input tabs
    tab1, tab2 = st.tabs(["🔗 URL Input", "📁 File Upload"])

    with tab1:
        _render_url_tab(classifier)

    with tab2:
        _render_upload_tab(classifier)


# Run the app only when executed as a script (e.g. via `streamlit run`),
# not as a side effect of importing this module.
if __name__ == "__main__":
    main()