import gradio as gr import os import re import torch import pandas as pd import plotly.express as px import plotly.io as pio import nltk import tempfile from io import BytesIO import base64 from transformers import AutoTokenizer, AutoModelForSequenceClassification from nltk.tokenize import sent_tokenize from docx.shared import Inches from docx import Document import numpy as np # Needed for HF GPU access import spaces nltk.download('punkt') # Import PyPDFLoader for PDF processing from langchain_community.document_loaders import PyPDFLoader # Model checkpoint for SDG BERT checkpoint = "sadickam/sdgBERT" # Preprocessing function for text def prep_text(text): clean_sents = [] sent_tokens = sent_tokenize(str(text)) for sent_token in sent_tokens: word_tokens = [str(word_token).strip().lower() for word_token in sent_token.split()] clean_sents.append(' '.join(word_tokens)) joined = ' '.join(clean_sents).strip() return re.sub(r'`|"', "", joined) # Load the tokenizer and model with GPU support def load_model_and_tokenizer(): model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device) tokenizer = AutoTokenizer.from_pretrained(checkpoint) return model, tokenizer # Define device (ensure usage of GPU if available in Hugging Face Spaces) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # SDG labels label_list = [ 'SDG1_No Poverty', 'SDG2_Zero Hunger', 'SDG3_Good Health and Well-being', 'SDG4_Quality Education', 'SDG5_Gender Equality', 'SDG6_Clean Water and Sanitation', 'SDG7_Affordable and Clean Energy', 'SDG8_Decent Work and Economic Growth', 'SDG9_Industry, Innovation and Infrastructure', 'SDG10_Reduced Inequality', 'SDG11_Sustainable Cities and Communities', 'SDG12_Responsible Consumption and Production', 'SDG13_Climate Action', 'SDG14_Life Below Water', 'SDG15_Life on Land', 'SDG16_Peace, Justice and Strong Institutions' ] # Function to predict SDGs for a batch of text inputs def predict_sdg_labels_batch(texts, model, tokenizer): tokenized_texts = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device) model.eval() with torch.no_grad(): text_logits = model(**tokenized_texts).logits predictions = torch.softmax(text_logits, dim=1).tolist() return predictions # Page-level predictions with batch processing def predict_pages(page_df, batch_size=32): model, tokenizer = load_model_and_tokenizer() df_results = page_df.copy() num_rows = len(page_df) all_predicted_labels = [[] for _ in range(16)] all_prediction_scores = [[] for _ in range(16)] for start in range(0, num_rows, batch_size): end = min(start + batch_size, num_rows) df_chunk = page_df.iloc[start:end] texts = df_chunk['Text'].apply(prep_text).tolist() predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer) for predictions in predictions_batch: sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True) for i, (label, score) in enumerate(sorted_preds): all_predicted_labels[i].append(label) all_prediction_scores[i].append(score) # Add columns to the DataFrame in the desired order (pred1, score1, pred2, score2, ...) for i in range(16): df_results[f'pred{i + 1}'] = all_predicted_labels[i] df_results[f'score{i + 1}'] = all_prediction_scores[i] # Reorder columns to ensure preds and scores are interleaved in the correct order reordered_columns = [] for i in range(16): reordered_columns.append(f'pred{i + 1}') reordered_columns.append(f'score{i + 1}') other_columns = [col for col in df_results.columns if col not in reordered_columns] df_results = df_results[other_columns + reordered_columns] return df_results # Sentence-level predictions with batch processing def predict_sentences(sentence_df, batch_size=32): model, tokenizer = load_model_and_tokenizer() df_combined_sentences = sentence_df.copy() num_rows = len(sentence_df) all_predicted_labels = [[] for _ in range(16)] all_prediction_scores = [[] for _ in range(16)] for start in range(0, num_rows, batch_size): end = min(start + batch_size, num_rows) df_chunk = sentence_df.iloc[start:end] texts = df_chunk['Sentence'].apply(prep_text).tolist() predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer) for predictions in predictions_batch: sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True) for i, (label, score) in enumerate(sorted_preds): all_predicted_labels[i].append(label) all_prediction_scores[i].append(round(score, 3)) # Add predictions and scores to DataFrame for i in range(16): df_combined_sentences[f'pred{i + 1}'] = all_predicted_labels[i] df_combined_sentences[f'score{i + 1}'] = all_prediction_scores[i] # Reorder columns reordered_columns = [] for i in range(16): reordered_columns.append(f'pred{i + 1}') reordered_columns.append(f'score{i + 1}') other_columns = [col for col in df_combined_sentences.columns if col not in reordered_columns] df_combined_sentences = df_combined_sentences[other_columns + reordered_columns] return df_combined_sentences # Define unique colors for each SDG sdg_colors = { "SDG1_No Poverty": "#E5243B", "SDG2_Zero Hunger": "#DDA63A", "SDG3_Good Health and Well-being": "#4C9F38", "SDG4_Quality Education": "#C5192D", "SDG5_Gender Equality": "#FF3A21", "SDG6_Clean Water and Sanitation": "#26BDE2", "SDG7_Affordable and Clean Energy": "#FCC30B", "SDG8_Decent Work and Economic Growth": "#A21942", "SDG9_Industry, Innovation and Infrastructure": "#FD6925", "SDG10_Reduced Inequality": "#DD1367", "SDG11_Sustainable Cities and Communities": "#FD9D24", "SDG12_Responsible Consumption and Production": "#BF8B2E", "SDG13_Climate Action": "#3F7E44", "SDG14_Life Below Water": "#0A97D9", "SDG15_Life on Land": "#56C02B", "SDG16_Peace, Justice and Strong Institutions": "#00689D" } # Function to plot SDG dominant bar graphs using Plotly def plot_sdg(df, title, pred_column): """Plots a bar graph for SDG data using Plotly. Args: df: DataFrame containing SDG predictions. title: Title of the plot. pred_column: Column to use for plotting. """ df_filtered = df[df[pred_column].notna()] labels = df_filtered[pred_column].value_counts().sort_values(ascending=False) total = labels.sum() percentages = (labels / total) * 100 # Create a bar plot with Plotly fig = px.bar( percentages.rename_axis('SDG Label').reset_index(name='Percentage'), y='SDG Label', x='Percentage', orientation='h', title=title, color='SDG Label', color_discrete_map=sdg_colors # Use the defined unique colors for each SDG ) # Update y-axis to show labels fig.update_yaxes(showticklabels=True) # Add percentage labels to the bars fig.update_traces( texttemplate='%{x:.2f}%', textposition='auto', textfont=dict(size=10) ) # Adjust layout for better visibility fig.update_layout( title=dict( text=title, font=dict(size=14) # Increase title font size ), yaxis=dict( automargin=True, title=None, tickfont=dict(size=12) ), margin=dict(l=20, r=5, t=30, b=20), height=600, width=700, showlegend=False, template="simple_white", xaxis=dict( tickfont=dict(size=12) # Reduce x-axis font size ), ) return fig def save_figure_as_jpeg(fig, filename): """Saves the Plotly figure as a high-resolution JPEG.""" pio.write_image(fig, filename, format='jpeg', width=1000, height=600, scale=5) # Generate reports (page and sentence levels) def generate_page_report(df_pages): doc = Document() doc.add_heading("Page-Level SDG Analysis Report", 0) doc.add_heading("General Notes", level=2) doc.add_paragraph( 'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 ' 'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores ' 'representing the likelihood that the text is aligned with particular SDGs. This page-level ' 'analysis provides high-level insight into SDG alignment.' '\n\n' 'Given that a page may align with more than one SDG, this app focuses on the top two SDG predictions ' '(Primary and Secondary) for each page with a probability score greater than zero.' ) doc.add_heading("Primary SDGs Bar Graph", level=3) doc.add_paragraph( 'This graph displays the most essential SDG the AI model associates with pages. The bars ' 'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant ' 'sustainable development theme within the document.' ) doc.add_heading("Secondary SDGs Bar Graph", level=3) doc.add_paragraph( 'This graph shows the second most relevant SDGs for pages. Although these SDGs are ' 'not the primary focus, the text has some relevance to these goals.' ) for doc_name in df_pages['Document'].unique(): doc.add_heading(f"Document: {doc_name}", level=2) df_doc = df_pages[df_pages['Document'] == doc_name] # Generate and save graphs first_sdg_plot_path = f"{doc_name}_first_sdg_page.jpeg" second_sdg_plot_path = f"{doc_name}_second_sdg_page.jpeg" plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image( first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido") plot_sdg(df_doc, "Secondary SDGs", 'pred2').write_image( second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido") # Add plots to the Word document doc.add_picture(first_sdg_plot_path, width=Inches(6)) doc.add_picture(second_sdg_plot_path, width=Inches(6)) doc.save("page_report.docx") return "page_report.docx" def generate_sentence_report(df_sentences): doc = Document() doc.add_heading("Sentence-Level SDG Analysis Report", 0) doc.add_heading("General Notes", level=2) doc.add_paragraph( 'This app splits documents into sentences using a natural language processing algorithm. ' 'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 ' 'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores ' 'representing the likelihood that the text is aligned with particular SDGs. This sentence-level ' 'analysis provides deeper insight into SDG alignment.' '\n\n' 'Given that a sentence may align with more than one SDG, this app focuses on the top two SDG predictions ' '(Primary and Secondary) for each sentence with a probability score greater than zero.' ) doc.add_heading("Primary SDGs Bar Graph", level=3) doc.add_paragraph( 'This graph displays the most essential SDG the AI model associates with sentences. The bars ' 'represent the percentage of sentences most strongly aligned with each SDG. This offers more profound insight ' 'into the dominant sustainable development theme within the document.' ) doc.add_heading("Secondary SDGs Bar Graph", level=3) doc.add_paragraph( 'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not ' 'the primary focus, the text has some relevance to these goals.' ) for doc_name in df_sentences['Document'].unique(): doc.add_heading(f"Document: {doc_name}", level=2) df_doc = df_sentences[df_sentences['Document'] == doc_name] # Generate and save graphs first_sdg_plot_path = f"{doc_name}_first_sdg_sentence.jpeg" second_sdg_plot_path = f"{doc_name}_second_sdg_sentence.jpeg" plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image( first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido") plot_sdg(df_doc, "Secondary SDGs", 'pred2').write_image( second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido") # Add plots to the Word document doc.add_picture(first_sdg_plot_path, width=Inches(6)) doc.add_picture(second_sdg_plot_path, width=Inches(6)) doc.save("sentence_report.docx") return "sentence_report.docx" # New text extraction functions def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None): """ Extract text from a PDF page by page using LangChain's PyPDFLoader. Args: pdf_file_path (str): The file path to the uploaded PDF. start_page (int, optional): The starting page number for extraction (1-based index). end_page (int, optional): The ending page number for extraction (1-based index). Returns: tuple: - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text. - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence. """ try: # Initialize the loader loader = PyPDFLoader(pdf_file_path) documents = loader.load_and_split() # Each document corresponds to a single page total_pages = len(documents) doc_name = os.path.basename(pdf_file_path) # Extract document name # Validate and adjust page range if start_page is not None and end_page is not None: # Convert to integers to avoid slicing issues start_page = int(start_page) end_page = int(end_page) # Adjust to valid range if start_page < 1: start_page = 1 if end_page > total_pages: end_page = total_pages if start_page > end_page: start_page, end_page = end_page, start_page # Swap if out of order # Select the subset of documents based on user input selected_docs = documents[start_page - 1:end_page] else: selected_docs = documents start_page = 1 end_page = total_pages # Initialize lists to store data page_data = [] sentence_data = [] for idx, doc in enumerate(selected_docs, start=start_page): page_num = idx text = doc.page_content.strip() # Append page-wise data page_data.append({ "Document": doc_name, "Page": page_num, "Text": text }) # Sentence tokenization sentences = sent_tokenize(text) for sentence in sentences: sentence = sentence.strip() if sentence: sentence_data.append({ "Document": doc_name, "Page": page_num, "Sentence": sentence }) # Create DataFrames page_df = pd.DataFrame(page_data) sentence_df = pd.DataFrame(sentence_data) return page_df, sentence_df except Exception as e: raise RuntimeError(f"Error during PDF extraction: {e}") def df_to_csv_bytes(df): """ Convert DataFrame to CSV in bytes. Args: df (pd.DataFrame): The DataFrame to convert. Returns: bytes: CSV data in bytes. """ try: buffer = BytesIO() df.to_csv(buffer, index=False) csv_data = buffer.getvalue() buffer.close() return csv_data except Exception as e: raise RuntimeError(f"Error during CSV conversion: {e}") def launch_interface(): with gr.Blocks(title="SDG Document Analysis App") as demo: # Title as a visible heading at the top of the page gr.Markdown( """ # SDG Document Analysis App Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels. """ ) # Shared PDF file input for both analyses with gr.Row(): file_input = gr.File( label="Upload PDF File for Analysis", file_types=[".pdf"] ) # Extraction mode selection extraction_mode = gr.Radio( choices=["All Pages", "Range of Pages"], value="All Pages", label="Extraction Mode" ) start_page = gr.Number(value=1, label="Start Page", visible=False) end_page = gr.Number(value=1, label="End Page", visible=False) # Function to update visibility of start_page and end_page def update_page_inputs(extraction_mode): if extraction_mode == "Range of Pages": return gr.update(visible=True), gr.update(visible=True) else: return gr.update(visible=False), gr.update(visible=False) extraction_mode.change( update_page_inputs, inputs=extraction_mode, outputs=[start_page, end_page] ) # Tabs for page-level and sentence-level analysis with gr.Tab("Page-Level Analysis"): gr.Markdown( """ ## Page-Level SDG Analysis This section conducts Sustainable Development Goals (SDG) mapping of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT). It provides **high-level SDG mapping** of documents at the page level. """ ) with gr.Row(): with gr.Column(): primary_page_plot = gr.Plot(label="Primary SDGs [Page-Level]") with gr.Column(): secondary_page_plot = gr.Plot(label="Secondary SDGs [Page-Level]") with gr.Row(): page_csv = gr.File(label="Download Page Predictions CSV") page_docx = gr.File(label="Download Page Report DOCX") page_jpeg1 = gr.File(label="Download Primary SDGs JPEG") page_jpeg2 = gr.File(label="Download Secondary SDGs JPEG") page_button = gr.Button("Run Page-Level Analysis") reset_page_button = gr.Button("Reset Page-Level Analysis") with gr.Tab("Sentence-Level Analysis"): gr.Markdown( """ ## Sentence-Level SDG Analysis This section conducts Sustainable Development Goals (SDG) mapping using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT). It provides **detailed SDG mapping** at the sentence level. """ ) with gr.Row(): with gr.Column(): primary_sentence_plot = gr.Plot(label="Primary SDGs [Sentence-Level]") with gr.Column(): secondary_sentence_plot = gr.Plot(label="Secondary SDGs [Sentence-Level]") with gr.Row(): sentence_csv = gr.File(label="Download Sentence Predictions CSV") sentence_docx = gr.File(label="Download Sentence Report DOCX") sentence_jpeg1 = gr.File(label="Download Primary SDGs JPEG") sentence_jpeg2 = gr.File(label="Download Secondary SDGs JPEG") sentence_button = gr.Button("Run Sentence-Level Analysis") reset_sentence_button = gr.Button("Reset Sentence-Level Analysis") # Function to process page-level analysis @spaces.GPU def process_pages(file, extraction_mode, start_page, end_page): if not file: return None, None, None, None, None, None try: if hasattr(file, 'name'): pdf_file_path = file.name else: # Save the file to a temporary location with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf: temp_pdf.write(file.read()) pdf_file_path = temp_pdf.name # Determine page range based on extraction_mode if extraction_mode == "All Pages": selected_start = None selected_end = None else: selected_start = int(start_page) selected_end = int(end_page) # Extract text and create DataFrames page_df, _ = extract_text_with_py_pdf_loader( pdf_file_path, start_page=selected_start, end_page=selected_end ) # Predict SDGs at page level df_page_predictions = predict_pages(page_df) first_plot = plot_sdg( df_page_predictions, "", 'pred1' ) second_plot = plot_sdg( df_page_predictions, "", 'pred2' ) df_page_predictions.to_csv('page_predictions.csv', index=False) page_report = generate_page_report(df_page_predictions) # Save figures as JPEG save_figure_as_jpeg(first_plot, "primary_page.jpeg") save_figure_as_jpeg(second_plot, "secondary_page.jpeg") return ( first_plot, second_plot, 'page_predictions.csv', page_report, 'primary_page.jpeg', 'secondary_page.jpeg') except Exception as e: print(f"Error: {e}") return None, None, None, None, None, None # Function to process sentence-level analysis @spaces.GPU def process_sentences(file, extraction_mode, start_page, end_page): if not file: return None, None, None, None, None, None try: if hasattr(file, 'name'): pdf_file_path = file.name else: # Save the file to a temporary location with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf: temp_pdf.write(file.read()) pdf_file_path = temp_pdf.name # Determine page range based on extraction_mode if extraction_mode == "All Pages": selected_start = None selected_end = None else: selected_start = int(start_page) selected_end = int(end_page) # Extract text and create DataFrames _, sentence_df = extract_text_with_py_pdf_loader( pdf_file_path, start_page=selected_start, end_page=selected_end ) # Predict SDGs at sentence level df_sentence_predictions = predict_sentences(sentence_df) first_plot = plot_sdg( df_sentence_predictions, "", 'pred1' ) second_plot = plot_sdg( df_sentence_predictions, "", 'pred2' ) df_sentence_predictions.to_csv('sentence_predictions.csv', index=False) sentence_report = generate_sentence_report(df_sentence_predictions) # Save figures as JPEG save_figure_as_jpeg(first_plot, "primary_sentence.jpeg") save_figure_as_jpeg(second_plot, "secondary_sentence.jpeg") return ( first_plot, second_plot, 'sentence_predictions.csv', sentence_report, 'primary_sentence.jpeg', 'secondary_sentence.jpeg') except Exception as e: print(f"Error: {e}") return None, None, None, None, None, None # Reset functions to clear the outputs def reset_page_outputs(): return None, None, None, None, None, None def reset_sentence_outputs(): return None, None, None, None, None, None # Button actions for each tab page_button.click( process_pages, inputs=[file_input, extraction_mode, start_page, end_page], outputs=[primary_page_plot, secondary_page_plot, page_csv, page_docx, page_jpeg1, page_jpeg2] ) sentence_button.click( process_sentences, inputs=[file_input, extraction_mode, start_page, end_page], outputs=[primary_sentence_plot, secondary_sentence_plot, sentence_csv, sentence_docx, sentence_jpeg1, sentence_jpeg2] ) # Reset button actions to clear outputs reset_page_button.click( reset_page_outputs, outputs=[primary_page_plot, secondary_page_plot, page_csv, page_docx, page_jpeg1, page_jpeg2] ) reset_sentence_button.click( reset_sentence_outputs, outputs=[primary_sentence_plot, secondary_sentence_plot, sentence_csv, sentence_docx, sentence_jpeg1, sentence_jpeg2] ) demo.queue().launch() launch_interface()