import gradio as gr
import os
import re
import torch
import pandas as pd
import plotly.express as px
import plotly.io as pio
import nltk
import tempfile
from io import BytesIO
import base64
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.tokenize import sent_tokenize
from docx.shared import Inches
from docx import Document
import numpy as np
# Needed for HF GPU access
import spaces

from styles import custom_css  # Importing custom CSS

nltk.download('punkt')
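# Newer NLTK releases ship the sentence tokenizer data as 'punkt_tab';
# downloading both keeps sent_tokenize working across NLTK versions.
nltk.download('punkt_tab')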

# Import PyPDFLoader for PDF processing
from langchain_community.document_loaders import PyPDFLoader

# Model checkpoint for SDG BERT
checkpoint = "sadickam/sdgBERT"

# Text cleaning function
def clean_text(text):
    """
    Cleans the extracted text by removing irrelevant characters but retains currency symbols.
    """
    text = text.strip()
    # Define the allowed characters (including currency symbols)
    allowed_chars = r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]'
    text = re.sub(allowed_chars, '', text)
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text
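
# Illustrative example: clean_text("Cost:  $5 (approx.)") -> "Cost: $5 approx."
# (parentheses fall outside the allow-list and are removed; whitespace is collapsed)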

# Preprocessing function for text
def prep_text(text):
    clean_sents = []
    sent_tokens = sent_tokenize(str(text))
    for sent_token in sent_tokens:
        word_tokens = [str(word_token).strip().lower() for word_token in sent_token.split()]
        clean_sents.append(' '.join(word_tokens))
    joined = ' '.join(clean_sents).strip()
    return re.sub(r'`|"', "", joined)
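
# Illustrative example: prep_text('He said "Hello World."') -> 'he said hello world.'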

# Load the tokenizer and model with GPU support
def load_model_and_tokenizer():
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer

# Define device (ensure usage of GPU if available in Hugging Face Spaces)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SDG labels
label_list = [
    'SDG1_No Poverty', 'SDG2_Zero Hunger', 'SDG3_Good Health and Well-being', 'SDG4_Quality Education',
    'SDG5_Gender Equality', 'SDG6_Clean Water and Sanitation', 'SDG7_Affordable and Clean Energy',
    'SDG8_Decent Work and Economic Growth', 'SDG9_Industry, Innovation and Infrastructure',
    'SDG10_Reduced Inequality', 'SDG11_Sustainable Cities and Communities',
    'SDG12_Responsible Consumption and Production', 'SDG13_Climate Action',
    'SDG14_Life Below Water', 'SDG15_Life on Land', 'SDG16_Peace, Justice and Strong Institutions'
]

# Function to predict SDGs for a batch of text inputs
def predict_sdg_labels_batch(texts, model, tokenizer):
    tokenized_texts = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    model.eval()
    with torch.no_grad():
        text_logits = model(**tokenized_texts).logits
    predictions = torch.softmax(text_logits, dim=1).tolist()
    return predictions
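
# Each entry in the returned list corresponds to one input text and holds 16
# softmax probabilities ordered as in label_list (summing to ~1.0 per text).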

# Page-level predictions with batch processing
def predict_pages(page_df, batch_size=32):
    model, tokenizer = load_model_and_tokenizer()
    df_results = page_df.copy()
    num_rows = len(page_df)
    all_predicted_labels = [[] for _ in range(16)]
    all_prediction_scores = [[] for _ in range(16)]

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        df_chunk = page_df.iloc[start:end]
        # Clean text
        texts = df_chunk['Text'].apply(clean_text).apply(prep_text).tolist()
        predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
        for predictions in predictions_batch:
            sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
            for i, (label, score) in enumerate(sorted_preds):
                all_predicted_labels[i].append(label)
                all_prediction_scores[i].append(score)

    # Add columns to the DataFrame in the desired order (pred1, score1, pred2, score2, ...)
    for i in range(16):
        df_results[f'pred{i + 1}'] = all_predicted_labels[i]
        df_results[f'score{i + 1}'] = all_prediction_scores[i]

    # Reorder columns to ensure preds and scores are interleaved in the correct order
    reordered_columns = []
    for i in range(16):
        reordered_columns.append(f'pred{i + 1}')
        reordered_columns.append(f'score{i + 1}')
    other_columns = [col for col in df_results.columns if col not in reordered_columns]
    df_results = df_results[other_columns + reordered_columns]
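    # Final column layout: original columns (Document, Page, Text) followed by
    # pred1, score1, ..., pred16, score16 in descending order of model confidence.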

    return df_results

# Sentence-level predictions with batch processing
def predict_sentences(sentence_df, batch_size=32):
    model, tokenizer = load_model_and_tokenizer()
    df_combined_sentences = sentence_df.copy()

    num_rows = len(sentence_df)
    all_predicted_labels = [[] for _ in range(16)]
    all_prediction_scores = [[] for _ in range(16)]

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        df_chunk = sentence_df.iloc[start:end]
        # Clean text
        texts = df_chunk['Sentence'].apply(clean_text).apply(prep_text).tolist()
        predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
        for predictions in predictions_batch:
            sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
            for i, (label, score) in enumerate(sorted_preds):
                all_predicted_labels[i].append(label)
                all_prediction_scores[i].append(round(score, 3))

    # Add predictions and scores to DataFrame
    for i in range(16):
        df_combined_sentences[f'pred{i + 1}'] = all_predicted_labels[i]
        df_combined_sentences[f'score{i + 1}'] = all_prediction_scores[i]

    # Reorder columns
    reordered_columns = []
    for i in range(16):
        reordered_columns.append(f'pred{i + 1}')
        reordered_columns.append(f'score{i + 1}')
    other_columns = [col for col in df_combined_sentences.columns if col not in reordered_columns]
    df_combined_sentences = df_combined_sentences[other_columns + reordered_columns]

    return df_combined_sentences

# Define unique colors for each SDG (hex codes follow the official UN SDG brand palette)
sdg_colors = {
    "SDG1_No Poverty": "#E5243B",
    "SDG2_Zero Hunger": "#DDA63A",
    "SDG3_Good Health and Well-being": "#4C9F38",
    "SDG4_Quality Education": "#C5192D",
    "SDG5_Gender Equality": "#FF3A21",
    "SDG6_Clean Water and Sanitation": "#26BDE2",
    "SDG7_Affordable and Clean Energy": "#FCC30B",
    "SDG8_Decent Work and Economic Growth": "#A21942",
    "SDG9_Industry, Innovation and Infrastructure": "#FD6925",
    "SDG10_Reduced Inequality": "#DD1367",
    "SDG11_Sustainable Cities and Communities": "#FD9D24",
    "SDG12_Responsible Consumption and Production": "#BF8B2E",
    "SDG13_Climate Action": "#3F7E44",
    "SDG14_Life Below Water": "#0A97D9",
    "SDG15_Life on Land": "#56C02B",
    "SDG16_Peace, Justice and Strong Institutions": "#00689D"
}

# Function to plot SDG dominant bar graphs using Plotly
def plot_sdg(df, title, pred_column, x_axis_title=None, y_axis_title=None, icons_folder='assets/icons/'):
    """
    Plots a horizontal bar graph of SDG predictions and superimposes the icon of the most frequent SDG.
    
    Args:
        df (pd.DataFrame): DataFrame containing SDG predictions.
        title (str): Title of the plot.
        pred_column (str): Column name to use for plotting (e.g., 'pred1').
        x_axis_title (str): Title for the x-axis.
        y_axis_title (str): Title for the y-axis.
        icons_folder (str): Path to the folder containing SDG icons.
    
    Returns:
        plotly.graph_objs._figure.Figure: The Plotly figure object.
    """
    df_filtered = df[df[pred_column].notna()]
    labels = df_filtered[pred_column].value_counts().sort_values(ascending=False)
    total = labels.sum()
    percentages = (labels / total) * 100

    # Create a horizontal bar plot with Plotly
    fig = px.bar(
        percentages.rename_axis('SDG Label').reset_index(name='Percentage'),
        y='SDG Label',
        x='Percentage',
        orientation='h',
        title=title,
        color='SDG Label',
        color_discrete_map=sdg_colors  # Use the defined unique colors for each SDG
    )

    # Update y-axis to show labels
    fig.update_yaxes(showticklabels=True)

    # Add percentage labels to the bars
    fig.update_traces(
        texttemplate='%{x:.2f}%',
        textposition='auto',
        textfont=dict(size=11)
    )

    # Adjust layout for better visibility
    fig.update_layout(
        title=dict(
            text=title, font=dict(size=14)  # Title font size
        ),
        yaxis=dict(
            automargin=True,
            title=y_axis_title,
            tickfont=dict(size=13)
        ),
        margin=dict(l=20, r=100, t=30, b=20),  # Increased right margin for icon
        height=600,
        #width=800,
        showlegend=False,
        template="simple_white",
        xaxis=dict(
            title=x_axis_title,
            tickfont=dict(size=13)  # x-axis tick font size
        ),
    )

    # Identify the most frequent SDG
    if not percentages.empty:
        top_sdg_label = percentages.index[0]  # e.g., 'SDG1_No Poverty'

        # Map SDG label to icon filename
        # Assuming naming convention 'SDG1.png', 'SDG2.png', etc.
        sdg_number = top_sdg_label.split('_')[0]  # Extract 'SDG1'
        icon_filename = f"{sdg_number}.png"  # e.g., 'SDG1.png'
        icon_path = os.path.join(icons_folder, icon_filename)

        # Check if the icon file exists
        if os.path.exists(icon_path):
            # Read and encode the image
            with open(icon_path, 'rb') as image_file:
                encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

            # Add the icon as an image in the Plotly figure
            fig.add_layout_image(
                dict(
                    source='data:image/png;base64,' + encoded_image,
                    xref="paper", yref="paper",
                    x=.98, y=1.0,  # Positioning: slightly to the right and top
                    sizex=0.2, sizey=0.2,  # Size of the icon
                    xanchor="left",
                    yanchor="top",
                    layer="above"  # Ensure the icon is above other plot elements
                )
            )
        else:
            print(f"Icon file '{icon_path}' not found. Skipping icon overlay.")

    return fig

def save_figure_as_jpeg(fig, filename):
    """Saves the Plotly figure as a high-resolution JPEG."""
    pio.write_image(fig, filename, format='jpeg', width=700, height=650, scale=7, engine="kaleido")

# Generate reports (page and sentence levels)
def generate_page_report(df_pages, report_file_name):
    doc = Document()
    doc.add_heading("Page-Level SDG Analysis Report", 0)

    doc.add_heading("πŸ“‹ General Notes", level=2)
    doc.add_paragraph(
        'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This page-level '
        'analysis provides high-level insight into SDG alignment.'
        '\n\n'
        'Given that a page may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each page with a probability score greater than zero.'
    )

    doc.add_heading("Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the Primary SDG, i.e. the SDG the AI model most strongly associates with each page. The bars '
        'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
        'sustainable development theme within the document.'
    )

    doc.add_heading("Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDGs for pages. Although these SDGs are '
        'not the primary focus, the text has some relevance to these goals.'
    )
    
    for doc_name in df_pages['Document'].unique():
        # Sanitize doc_name to use in file names
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        doc.add_heading(f"πŸ“„ Document: {doc_name}", level=2)
        df_doc = df_pages[df_pages['Document'] == doc_name]

        # Generate and save graphs
        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_page.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_page.jpeg"

        plot_sdg(
            df_doc, "", 'pred1',
            x_axis_title="Percentage (%) of aligned pages",
            y_axis_title="Primary SDGs"
        ).write_image(first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
        
        plot_sdg(
            df_doc, "", 'pred2',
            x_axis_title="Percentage (%) of aligned pages",
            y_axis_title="Secondary SDGs"
        ).write_image(second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

        # Add plots to the Word document
        doc.add_picture(first_sdg_plot_path, width=Inches(6))
        doc.add_picture(second_sdg_plot_path, width=Inches(6))

    doc.save(report_file_name)
    return report_file_name

def generate_sentence_report(df_sentences, report_file_name):
    doc = Document()
    doc.add_heading("Sentence-Level SDG Analysis Report", 0)

    doc.add_heading("πŸ“‹ General Notes", level=2)
    doc.add_paragraph(
        'This app splits documents into sentences using a natural language processing algorithm. '
        'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This sentence-level '
        'analysis provides deeper insight into SDG alignment.'
        '\n\n'
        'Given that a sentence may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each sentence with a probability score greater than zero.'
    )

    doc.add_heading("Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the Primary SDG, i.e. the SDG the AI model most strongly associates with each sentence. The bars '
        'represent the percentage of sentences most strongly aligned with each SDG. This offers deeper insight '
        'into the dominant sustainable development theme within the document.'
    )

    doc.add_heading("Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not '
        'the primary focus, the text has some relevance to these goals.'
    )

    for doc_name in df_sentences['Document'].unique():
        # Sanitize doc_name to use in file names
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        doc.add_heading(f"πŸ“„ Document: {doc_name}", level=2)
        df_doc = df_sentences[df_sentences['Document'] == doc_name]

        # Generate and save graphs
        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_sentence.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_sentence.jpeg"

        plot_sdg(
            df_doc, "", 'pred1',
            x_axis_title="Percentage (%) of aligned sentences",
            y_axis_title="Primary SDGs"
        ).write_image(first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

        plot_sdg(
            df_doc, "", 'pred2',
            x_axis_title="Percentage (%) of aligned sentences",
            y_axis_title="Secondary SDGs"
        ).write_image(second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

        # Add plots to the Word document
        doc.add_picture(first_sdg_plot_path, width=Inches(6))
        doc.add_picture(second_sdg_plot_path, width=Inches(6))

    doc.save(report_file_name)
    return report_file_name

# New text extraction functions with text cleaning and line joining
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.
    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): The starting page number for extraction (1-based index).
        end_page (int, optional): The ending page number for extraction (1-based index).
    Returns:
        tuple: 
            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.
    """
    try:
        # Initialize the loader
        loader = PyPDFLoader(pdf_file_path)
        documents = loader.load()  # PyPDFLoader returns one Document per page

        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # Extract document name

        # Validate and adjust page range
        if start_page is not None and end_page is not None:
            # Convert to integers to avoid slicing issues
            start_page = int(start_page)
            end_page = int(end_page)

            # Adjust to valid range
            if start_page < 1:
                start_page = 1
            if end_page > total_pages:
                end_page = total_pages
            if start_page > end_page:
                start_page, end_page = end_page, start_page  # Swap if out of order

            # Select the subset of documents based on user input
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages

        # Initialize lists to store data
        page_data = []
        sentence_data = []

        for idx, doc in enumerate(selected_docs, start=start_page):
            page_num = idx
            text = doc.page_content.strip()

            # Join lines that belong to the same sentence
            lines = text.split('\n')
            joined_text = ' '.join(line.strip() for line in lines if line.strip())

            # Clean text
            cleaned_text = clean_text(joined_text)

            # Append page-wise data
            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": cleaned_text
            })

            # Sentence tokenization
            sentences = sent_tokenize(cleaned_text)
            for sentence in sentences:
                sentence = sentence.strip()
                if sentence and len(sentence) > 70:  # keep only substantive sentences (> 70 characters)
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        # Create DataFrames
        page_df = pd.DataFrame(page_data)
        sentence_df = pd.DataFrame(sentence_data)

        return page_df, sentence_df

    except Exception as e:
        raise RuntimeError(f"Error during PDF extraction: {e}")

def df_to_csv_bytes(df):
    """
    Convert DataFrame to CSV in bytes.
    Args:
        df (pd.DataFrame): The DataFrame to convert.
    Returns:
        bytes: CSV data in bytes.
    """
    try:
        buffer = BytesIO()
        df.to_csv(buffer, index=False)
        csv_data = buffer.getvalue()
        buffer.close()
        return csv_data
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}")

def launch_interface():
    with gr.Blocks(css=custom_css) as demo:

        # Title as a visible heading at the top of the page with an icon
        gr.Markdown(
            """
            # 🌍 SDG Document Analysis App  
            Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
            """
        )

        # Shared PDF file input for both analyses
        gr.Markdown("## Upload PDF File")
        with gr.Row():
            file_input = gr.File(
                label="πŸ“ Upload PDF File for Analysis", file_types=[".pdf"]
            )

        # Extraction mode selection with explanatory text
        gr.Markdown(
            """
            ## PDF Text Extraction Mode  
            Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select 
            "Range of Pages" and specify the start and end pages. 
            """
        )
        with gr.Row():
            extraction_mode = gr.Radio(
                choices=["All Pages", "Range of Pages"],
                value="All Pages",
                label="Extraction Mode"
            )

        with gr.Row():
            start_page = gr.Number(value=1, label="🔒 Start Page", visible=False, info="The cover page is page 1")
            end_page = gr.Number(value=1, label="🔒 End Page", visible=False)

        # Function to update visibility of start_page and end_page
        def update_page_inputs(extraction_mode):
            if extraction_mode == "Range of Pages":
                return gr.update(visible=True), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=False)

        extraction_mode.change(
            update_page_inputs,
            inputs=extraction_mode,
            outputs=[start_page, end_page]
        )

        # Main Tabs for Page-Level and Sentence-Level Analysis
        gr.Markdown("## SDG Analysis Type")
        
        with gr.Tab("πŸ“„ Page-Level Analysis"):
            gr.Markdown(
                """
                ### Page-Level SDG Analysis  
                This section conducts Sustainable Development Goals (SDG) mapping 
                of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT). 
                It provides **high-level SDG mapping** of documents at the page level.
                """
            )

            with gr.Row():
                page_button = gr.Button("🏃‍♂️ Run Page-Level Analysis")
                reset_page_button = gr.Button("🔄 Reset Page-Level Analysis", elem_classes="reset-button")

            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("πŸ“Š Primary SDGs"):
                    with gr.Row():
                        primary_page_plot = gr.Plot(label="πŸ“Š Primary SDGs Graph [Page-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Primary SDGs bar graph on the left will show "+
                            "the percentage of pages that strongly align with each SDG. The icon for the most frequent "+
                            "SDG will be highlighted on the graph. Download the Page Predictions CSV for further details.",
                            label = "Note", container=True
                        )
                        
                    gr.Markdown("##### Download Results")    
                    with gr.Row():
                        page_csv = gr.File(label="πŸ“Š Download Page Predictions CSV")
                        page_docx = gr.File(label="πŸ“„ Download Page Report DOCX")
                        page_jpeg1 = gr.File(label="πŸ–ΌοΈ Download Primary SDGs JPEG")

                with gr.TabItem("πŸ“ˆ Secondary SDGs"):
                    with gr.Row():
                        secondary_page_plot = gr.Plot(label="πŸ“ˆ Secondary SDGs Graph [Page-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Secondary SDGs bar graph on the left will show "+
                            "SDGs that are not the primary focus of the pages analysed. These SDGs are second to the "+
                            "Primary SDGs. Download the Sentence Predictions CSV for further details",
                            label = "Note", container=True
                        )
                    
                    gr.Markdown("##### Download Results") 
                    with gr.Row():
                        page_csv_secondary = gr.File(label="πŸ“Š Download Page Predictions CSV")
                        page_report_file_secondary = gr.File(label="πŸ“„ Download Page Report DOCX")
                        secondary_page_jpeg = gr.File(label="πŸ–ΌοΈ Download Secondary SDGs JPEG")    

        with gr.Tab("✍️ Sentence-Level Analysis"):
            gr.Markdown(
                """
                ### Sentence-Level SDG Analysis  
                This section conducts Sustainable Development Goals (SDG) mapping 
                using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT). 
                It provides **detailed SDG mapping** at the sentence level.
                """
            )

            with gr.Row():
                sentence_button = gr.Button("🏃‍♂️ Run Sentence-Level Analysis")
                reset_sentence_button = gr.Button("🔄 Reset Sentence-Level Analysis", elem_classes="reset-button")

            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("πŸ“Š Primary SDGs"):
                    with gr.Row():
                        primary_sentence_plot = gr.Plot(label="πŸ“Š Primary SDGs Graph [Sentence-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Primary SDGs bar graph on the left will show "+
                            "the percentage of sentences that strongly align with each SDG. The icon for the most frequent "+
                            "SDG will be highlighted on the graph. Download the Sentence Predictions CSV for further details.",
                            label = "Note", container=True
                        )
                        
                    gr.Markdown("##### Download Results")  
                    with gr.Row():
                        sentence_csv = gr.File(label="πŸ“Š Download Sentence Predictions CSV")
                        sentence_docx = gr.File(label="πŸ“„ Download Sentence Report DOCX")
                        sentence_jpeg1 = gr.File(label="πŸ–ΌοΈ Download Primary SDGs JPEG")

                with gr.TabItem("πŸ“ˆ Secondary SDGs"):
                    with gr.Row():
                        secondary_sentence_plot = gr.Plot(label="πŸ“ˆ Secondary SDGs Graph [Sentence-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Secondary SDGs bar graph on the left will show "+
                            "SDGs that are not the primary focus of the sentences analysed. These SDGs are second to the "+
                            "Primary SDGs. Download the Sentence Predictions CSV for further details",
                            label = "Note", container=True
                        )
                        
                    gr.Markdown("##### Download Results")  
                    with gr.Row():
                        sentence_csv_secondary = gr.File(label="πŸ“Š Download Sentence Predictions CSV")
                        sentence_report_file_secondary = gr.File(label="πŸ“„ Download Sentence Report DOCX")
                        secondary_sentence_jpeg = gr.File(label="πŸ–ΌοΈ Download Secondary SDGs JPEG")
            
        # Function to process page-level analysis
        @spaces.GPU
        def process_pages(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]

            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                    original_file_name = 'uploaded_document'

                # Sanitize the file name to use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

                # Determine page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)

                # Extract text and create DataFrames
                page_df, _ = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at page level
                df_page_predictions = predict_pages(page_df)

                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_page_predictions, "",
                    'pred1',
                    x_axis_title="Percentage (%) of aligned pages",
                    y_axis_title="Primary SDGs"
                )
                second_plot = plot_sdg(
                    df_page_predictions, "",
                    'pred2',
                    x_axis_title="Percentage (%) of aligned pages",
                    y_axis_title="Secondary SDGs"
                )

                # Define output file names
                page_csv_file = f"{sanitized_file_name}_SDG-Page_predictions.csv"
                page_report_file = f"{sanitized_file_name}_SDG-Page_report.docx"
                primary_page_jpeg = f"{sanitized_file_name}_SDG-Page_primary_graph.jpeg"

                # The Secondary tab reuses the same predictions CSV and DOCX report; only the JPEG differs
                page_csv_file_secondary = f"{sanitized_file_name}_SDG-Page_predictions.csv"
                page_report_file_secondary = f"{sanitized_file_name}_SDG-Page_report.docx"
                secondary_page_jpeg = f"{sanitized_file_name}_SDG-Page_secondary_graph.jpeg"

                # Save CSV and reports
                df_page_predictions.to_csv(page_csv_file, index=False)
                page_report_primary = generate_page_report(df_page_predictions, page_report_file)

                df_page_predictions.to_csv(page_csv_file_secondary, index=False)
                page_report_secondary = generate_page_report(df_page_predictions, page_report_file_secondary)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_page_jpeg)
                save_figure_as_jpeg(second_plot, secondary_page_jpeg)

                return (
                    first_plot, second_plot,
                    page_csv_file, page_report_file, primary_page_jpeg,
                    page_csv_file_secondary, page_report_file_secondary, secondary_page_jpeg
                )

            except Exception as e:
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]

        # Function to process sentence-level analysis
        @spaces.GPU
        def process_sentences(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]

            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                    original_file_name = 'uploaded_document'

                # Sanitize the file name to use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

                # Determine page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)

                # Extract text and create DataFrames
                _, sentence_df = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at sentence level
                df_sentence_predictions = predict_sentences(sentence_df)

                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_sentence_predictions, "",
                    'pred1',
                    x_axis_title="Percentage (%) of aligned sentences",
                    y_axis_title="Primary SDGs"
                )
                second_plot = plot_sdg(
                    df_sentence_predictions, "",
                    'pred2',
                    x_axis_title="Percentage (%) of aligned sentences",
                    y_axis_title="Secondary SDGs"
                )

                # Define output file names
                sentence_csv_file = f"{sanitized_file_name}_SDG-Sentence_predictions.csv"
                sentence_report_file = f"{sanitized_file_name}_SDG-Sentence_report.docx"
                primary_sentence_jpeg = f"{sanitized_file_name}_SDG-Sentence_primary_graph.jpeg"

                # The Secondary tab reuses the same predictions CSV and DOCX report; only the JPEG differs
                sentence_csv_file_secondary = f"{sanitized_file_name}_SDG-Sentence_predictions.csv"
                sentence_report_file_secondary = f"{sanitized_file_name}_SDG-Sentence_report.docx"
                secondary_sentence_jpeg = f"{sanitized_file_name}_SDG-Sentence_secondary_graph.jpeg"

                # Save CSV and reports
                df_sentence_predictions.to_csv(sentence_csv_file, index=False)
                sentence_report_primary = generate_sentence_report(df_sentence_predictions, sentence_report_file)

                df_sentence_predictions.to_csv(sentence_csv_file_secondary, index=False)
                sentence_report_secondary = generate_sentence_report(df_sentence_predictions, sentence_report_file_secondary)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_sentence_jpeg)
                save_figure_as_jpeg(second_plot, secondary_sentence_jpeg)

                return (
                    first_plot, second_plot,
                    sentence_csv_file, sentence_report_file, primary_sentence_jpeg,
                    sentence_csv_file_secondary, sentence_report_file_secondary, secondary_sentence_jpeg
                )

            except Exception as e:
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]

        # Reset functions to clear the outputs
        def reset_page_outputs():
            return [None, None, None, None, None, None, None, None]

        def reset_sentence_outputs():
            return [None, None, None, None, None, None, None, None]

        # Button actions for Page-Level Analysis
        page_button.click(
            process_pages,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_page_plot,           # 📊 Primary SDGs [Page-Level]
                secondary_page_plot,         # 📈 Secondary SDGs [Page-Level]
                page_csv,                    # 📊 Download Page Predictions CSV
                page_docx,                   # 📄 Download Page Report DOCX
                page_jpeg1,                  # 🖼️ Download Primary SDGs JPEG
                page_csv_secondary,          # 📊 Download Page Predictions CSV
                page_report_file_secondary,  # 📄 Download Page Report DOCX
                secondary_page_jpeg          # 🖼️ Download Secondary SDGs JPEG
            ]
        )

        reset_page_button.click(
            reset_page_outputs,
            outputs=[
                primary_page_plot,
                secondary_page_plot,
                page_csv,
                page_docx,
                page_jpeg1,
                page_csv_secondary,
                page_report_file_secondary,
                secondary_page_jpeg
            ]
        )

        # Button actions for Sentence-Level Analysis
        sentence_button.click(
            process_sentences,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_sentence_plot,           # 📊 Primary SDGs [Sentence-Level]
                secondary_sentence_plot,         # 📈 Secondary SDGs [Sentence-Level]
                sentence_csv,                    # 📊 Download Sentence Predictions CSV
                sentence_docx,                   # 📄 Download Sentence Report DOCX
                sentence_jpeg1,                  # 🖼️ Download Primary SDGs JPEG
                sentence_csv_secondary,          # 📊 Download Sentence Predictions CSV
                sentence_report_file_secondary,  # 📄 Download Sentence Report DOCX
                secondary_sentence_jpeg          # 🖼️ Download Secondary SDGs JPEG
            ]
        )

        reset_sentence_button.click(
            reset_sentence_outputs,
            outputs=[
                primary_sentence_plot,
                secondary_sentence_plot,
                sentence_csv,
                sentence_docx,
                sentence_jpeg1,
                sentence_csv_secondary,
                sentence_report_file_secondary,
                secondary_sentence_jpeg
            ]
        )

    demo.queue().launch()

launch_interface()